diff --git a/ChangeLog.md b/ChangeLog.md
new file mode 100644
index 0000000000..2be2d4adbe
--- /dev/null
+++ b/ChangeLog.md
@@ -0,0 +1,21 @@
+Version 1.0.1-dev
+-----------------
+- Preliminary support for a subset of the Vector Extension, v0.7.1.
+- Support S-mode vectored interrupts (i.e. `stvec[0]` is now writable).
+- Added support for dynamic linking of libraries containing MMIO devices.
+- Added `--priv` flag to control which privilege modes are available.
+- When the commit log is enabled at configure time (`--enable-commitlog`),
+  it must also be enabled at runtime with the `--log-commits` option.
+- Several debug-related additions and changes:
+  - Added `hasel` debug feature.
+  - Added `--dm-no-abstract-csr` command-line option.
+  - Added `--dm-no-halt-groups` command-line option.
+  - Renamed `--progsize` to `--dm-progsize`.
+  - Renamed `--debug-sba` to `--dm-sba`.
+  - Renamed `--debug-auth` to `--dm-auth`.
+  - Renamed `--abstract-rti` to `--dm-abstract-rti`.
+  - Renamed `--without-hasel` to `--dm-no-hasel`.
+
+Version 1.0.0 (2019-03-30)
+--------------------------
+- First versioned release.
diff --git a/Makefile.in b/Makefile.in
index c09fc50588..66e8df08c5 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -36,6 +36,9 @@ project_name := @PACKAGE_TARNAME@
 src_dir      := @srcdir@
 scripts_dir  := $(src_dir)/scripts
 
+HAVE_INT128 := @HAVE_INT128@
+HAVE_DLOPEN := @HAVE_DLOPEN@
+
 # If the version information is not in the configure script, then we
 # assume that we are in a working directory. We use the vcs-version.sh
 # script in the scripts directory to generate an appropriate version
@@ -50,17 +53,11 @@ endif
 
 # Installation directories
 
-prefix       := @prefix@
-enable_stow  := @enable_stow@
+prefix       ?= @prefix@
 
-ifeq ($(enable_stow),yes)
-  stow_pkg_dir := $(prefix)/pkgs
-  INSTALLDIR ?= $(DESTDIR)$(stow_pkg_dir)/$(project_name)-$(project_ver)
-else
-  INSTALLDIR ?= $(DESTDIR)$(prefix)
-endif
+INSTALLDIR ?= $(DESTDIR)$(prefix)
 
-install_hdrs_dir := $(INSTALLDIR)/include/$(project_name)
+install_hdrs_dir := $(INSTALLDIR)/include
 install_libs_dir := $(INSTALLDIR)/lib
 install_exes_dir := $(INSTALLDIR)/bin
 
@@ -81,25 +78,52 @@ VPATH := $(addprefix $(src_dir)/, $(sprojs_enabled))
 # C++ compiler
 #  - CPPFLAGS : flags for the preprocessor (eg. -I,-D)
 #  - CXXFLAGS : flags for C++ compiler (eg. -Wall,-g,-O3)
+#
+# To allow a user to specify CFLAGS or similar as part of the Make
+# command, we also have mcppbs-CFLAGS etc. with stuff that shouldn't be
+# lost in such a case.
+#
+# The order of precedence (highest to lowest) is then:
+#
+#    - Specified as part of Make command line
+#    - Specified as part of running configure
+#    - Specified here (default-CFLAGS)
+#
+# These all appear on the command line, from lowest precedence to
+# highest.
+
+default-CFLAGS   := -DPREFIX=\"$(prefix)\" -Wall -Wno-unused -g -O2
+default-CXXFLAGS := $(default-CFLAGS) -std=c++11
+
+mcppbs-CPPFLAGS := @CPPFLAGS@
+mcppbs-CFLAGS   := $(default-CFLAGS) @CFLAGS@
+mcppbs-CXXFLAGS := $(default-CXXFLAGS) @CXXFLAGS@
 
 CC            := @CC@
 CXX           := @CXX@
-CFLAGS        += @CFLAGS@ -DPREFIX=\"$(prefix)\"
-CPPFLAGS      += @CPPFLAGS@
-CXXFLAGS      += @CXXFLAGS@ -DPREFIX=\"$(prefix)\"
-COMPILE       := $(CXX) -fPIC -MMD -MP $(CPPFLAGS) $(CXXFLAGS) \
-                 $(sprojs_include)
-COMPILE_C     := $(CC) -fPIC -MMD -MP $(CPPFLAGS) $(CFLAGS) \
-                 $(sprojs_include)
+
+# These are the flags actually used for a C++ compile or a C compile.
+# The language-specific flags come after the preprocessor flags, but
+# user-supplied flags always take precedence.
+all-cxx-flags := \
+  $(mcppbs-CPPFLAGS) $(mcppbs-CXXFLAGS) $(CPPFLAGS) $(CXXFLAGS)
+all-c-flags := \
+  $(mcppbs-CPPFLAGS) $(mcppbs-CFLAGS) $(CPPFLAGS) $(CFLAGS)
+
+COMPILE       := $(CXX) -MMD -MP $(all-cxx-flags) $(sprojs_include)
+COMPILE_C     := $(CC) -MMD -MP $(all-c-flags) $(sprojs_include)
+
 # Linker
 #  - LDFLAGS : Flags for the linker (eg. -L)
 #  - LIBS    : Library flags (eg. -l)
 
+mcppbs-LDFLAGS := @LDFLAGS@
+all-link-flags := $(mcppbs-LDFLAGS) $(LDFLAGS)
+
 comma := ,
 LD            := $(CXX)
-LDFLAGS       := @LDFLAGS@
 LIBS          := @LIBS@
-LINK          := $(LD) -L. $(LDFLAGS) -Wl,-rpath,$(install_libs_dir) $(patsubst -L%,-Wl$(comma)-rpath$(comma)%,$(filter -L%,$(LDFLAGS)))
+LINK          := $(LD) -L. $(all-link-flags) -Wl,-rpath,$(install_libs_dir) $(patsubst -L%,-Wl$(comma)-rpath$(comma)%,$(filter -L%,$(all-link-flags)))
 
 # Library creation
 
@@ -115,9 +139,9 @@ RUNFLAGS      := @RUNFLAGS@
 
 MKINSTALLDIRS := $(scripts_dir)/mk-install-dirs.sh
 INSTALL       := @INSTALL@
-INSTALL_HDR   := $(INSTALL) -m 444
+INSTALL_HDR   := $(INSTALL) -m 644
 INSTALL_LIB   := $(INSTALL) -m 644
-INSTALL_EXE   := $(INSTALL) -m 555
+INSTALL_EXE   := $(INSTALL) -m 755
 STOW          := @stow@
 
 # Tests
@@ -194,12 +218,12 @@ $(2)_deps := $$(patsubst %.o, %.d, $$($(2)_objs))
 $(2)_deps += $$(patsubst %.o, %.d, $$($(2)_c_objs))
 $(2)_deps += $$(patsubst %.h, %.h.d, $$($(2)_precompiled_hdrs))
 $$($(2)_pch) : %.h.gch : %.h
-	$(COMPILE) -x c++-header $$< -o $$@
+	$(COMPILE) -x c++-header -c $$< -o $$@
 # If using clang, don't depend (and thus don't build) precompiled headers
 $$($(2)_objs) : %.o : %.cc $$($(2)_gen_hdrs) $(if $(filter-out clang,$(CC)),$$($(2)_pch))
-	$(COMPILE) -c $$<
+	$(COMPILE) $$($(2)_CFLAGS) -c $$<
 $$($(2)_c_objs) : %.o : %.c $$($(2)_gen_hdrs)
-	$(COMPILE_C) -c $$<
+	$(COMPILE_C) $$($(2)_CFLAGS) -c $$<
 
 $(2)_junk += $$($(2)_pch) $$($(2)_objs) $$($(2)_c_objs) $$($(2)_deps) \
   $$($(2)_gen_hdrs)
@@ -213,13 +237,17 @@ $(2)_reverse_deps   := $$(call reverse_list,$$($(2)_subproject_deps))
 # Build a library for this subproject
 
 $(2)_lib_libs       := $$($(2)_reverse_deps)
-$(2)_lib_libnames   := $$(patsubst %, lib%.so, $$($(2)_lib_libs))
+$(2)_lib_libnames   := $$(patsubst %, lib%.a, $$($(2)_lib_libs))
 $(2)_lib_libarg     := $$(patsubst %, -l%, $$($(2)_lib_libs))
+$(2)_lib_libnames_shared	:= $$(if $$($(2)_install_shared_lib),lib$(1).so,)
 
-lib$(1).so : $$($(2)_objs) $$($(2)_c_objs) $$($(2)_lib_libnames)
-	$(LINK) -shared -o $$@ $(if $(filter Darwin,$(shell uname -s)),-install_name $(install_libs_dir)/$$@) $$^ $$($(2)_lib_libarg) $(LIBS)
+lib$(1).a : $$($(2)_objs) $$($(2)_c_objs) $$($(2)_lib_libnames)
+	$(AR) rcs $$@ $$^
+lib$(1).so : $$($(2)_objs) $$($(2)_c_objs) $$($(2)_lib_libnames_shared) $$($(2)_lib_libnames)
+	$(LINK) -shared -o $$@ $(if $(filter Darwin,$(shell uname -s)),-install_name $(install_libs_dir)/$$@) $$^ $$($(2)_lib_libnames) $(LIBS)
 
-$(2)_junk += lib$(1).so
+$(2)_junk += lib$(1).a
+$(2)_junk += $$(if $$($(2)_install_shared_lib),lib$(1).so,)
 
 # Build unit tests
 
@@ -228,14 +256,14 @@ $(2)_test_deps      := $$(patsubst %.o, %.d, $$($(2)_test_objs))
 $(2)_test_exes      := $$(patsubst %.t.cc, %-utst, $$($(2)_test_srcs))
 $(2)_test_outs      := $$(patsubst %, %.out, $$($(2)_test_exes))
 $(2)_test_libs      := $(1) $$($(2)_reverse_deps) utst
-$(2)_test_libnames  := $$(patsubst %, lib%.so, $$($(2)_test_libs))
+$(2)_test_libnames  := $$(patsubst %, lib%.a, $$($(2)_test_libs))
 $(2)_test_libarg    := $$(patsubst %, -l%, $$($(2)_test_libs))
 
 $$($(2)_test_objs) : %.o : %.cc
 	$(COMPILE) -c $$<
 
 $$($(2)_test_exes) : %-utst : %.t.o $$($(2)_test_libnames)
-	$(LINK) -o $$@ $$< $$($(2)_test_libarg) $(LIBS)
+	$(LINK) -o $$@ $$< $$($(2)_test_libnames) $(LIBS)
 
 $(2)_deps += $$($(2)_test_deps)
 $(2)_junk += \
@@ -255,14 +283,14 @@ $(2)_prog_objs      := $$(patsubst %.cc, %.o, $$($(2)_prog_srcs))
 $(2)_prog_deps      := $$(patsubst %.o, %.d, $$($(2)_prog_objs))
 $(2)_prog_exes      := $$(patsubst %.cc, %, $$($(2)_prog_srcs))
 $(2)_prog_libs      := $(1) $$($(2)_reverse_deps)
-$(2)_prog_libnames  := $$(patsubst %, lib%.so, $$($(2)_prog_libs))
+$(2)_prog_libnames  := $$(patsubst %, lib%.a, $$($(2)_prog_libs))
 $(2)_prog_libarg    := $$(patsubst %, -l%, $$($(2)_prog_libs))
 
 $$($(2)_prog_objs) : %.o : %.cc
 	$(COMPILE) -c $$<
 
 $$($(2)_prog_exes) : % : %.o $$($(2)_prog_libnames)
-	$(LINK) -o $$@ $$< $$($(2)_prog_libarg) $(LIBS)
+	$(LINK) -o $$@ $$< $$($(2)_prog_libnames) $(LIBS)
 
 $(2)_deps += $$($(2)_prog_deps)
 $(2)_junk += $$($(2)_prog_objs) $$($(2)_prog_deps) $$($(2)_prog_exes)
@@ -277,7 +305,7 @@ $$($(2)_install_prog_objs) : %.o : %.cc $$($(2)_gen_hdrs)
 	$(COMPILE) -c $$<
 
 $$($(2)_install_prog_exes) : % : %.o $$($(2)_prog_libnames)
-	$(LINK) -o $$@ $$< $$($(2)_prog_libarg) $(LIBS)
+	$(LINK) -o $$@ $$< $$($(2)_prog_libnames) $(LIBS)
 
 $(2)_deps += $$($(2)_install_prog_deps)
 $(2)_junk += \
@@ -286,7 +314,7 @@ $(2)_junk += \
 
 # Subproject specific targets
 
-all-$(1) : lib$(1).so $$($(2)_install_prog_exes)
+all-$(1) : lib$(1).a $$($(2)_install_prog_exes)
 
 check-$(1) : $$($(2)_test_outs)
 	echo; grep -h -e'Unit Tests' -e'FAILED' -e'Segementation' $$^; echo
@@ -298,7 +326,7 @@ clean-$(1) :
 
 # Update running variables
 
-libs += lib$(1).so
+libs += lib$(1).a
 objs += $$($(2)_objs)
 srcs += $$(addprefix $(src_dir)/$(1)/, $$($(2)_srcs)) 
 hdrs += $$(addprefix $(src_dir)/$(1)/, $$($(2)_hdrs)) $$($(2)_gen_hdrs)
@@ -307,10 +335,11 @@ deps += $$($(2)_deps)
 
 test_outs += $$($(2)_test_outs)
 
-install_hdrs += $$(addprefix $(src_dir)/$(1)/, $$($(2)_hdrs)) $$($(2)_gen_hdrs)
-install_libs += lib$(1).so
+install_hdrs += $$(addprefix $(src_dir)/$(1)/, $$($(2)_install_hdrs))
+install_libs += $$(if $$($(2)_install_lib),lib$(1).a,)
+install_libs += $$(if $$($(2)_install_shared_lib),lib$(1).so,)
 install_exes += $$($(2)_install_prog_exes)
-install_pcs  += riscv-$(1).pc
+install_pcs  += $$(if $$($(2)_install_lib),riscv-$(1).pc,)
 
 endef
 
@@ -353,11 +382,12 @@ check : check-cpp check-bin
 # Installation
 #-------------------------------------------------------------------------
 
-install-hdrs : $(install_hdrs) config.h
+install-hdrs : $(install_hdrs)
 	$(MKINSTALLDIRS) $(install_hdrs_dir)
-	for file in $^; \
+	for file in $(subst $(src_dir)/,,$^); \
   do \
-    $(INSTALL_HDR) $$file $(install_hdrs_dir); \
+		$(MKINSTALLDIRS) $(install_hdrs_dir)/`dirname $$file`; \
+    $(INSTALL_HDR) $(src_dir)/$$file $(install_hdrs_dir)/`dirname $$file`; \
   done
 
 install-libs : $(install_libs)
@@ -382,12 +412,6 @@ install-pc : $(install_pcs)
   done
 
 install : install-hdrs install-libs install-exes install-pc
-ifeq ($(enable_stow),yes)
-	$(MKINSTALLDIRS) $(stow_pkg_dir)
-	cd $(stow_pkg_dir) && \
-    $(STOW) --delete $(project_name)-* && \
-    $(STOW) $(project_name)-$(project_ver)
-endif
 
 .PHONY : install install-hdrs install-libs install-exes
 
diff --git a/README.md b/README.md
index 018c7d3ead..42f19f806c 100644
--- a/README.md
+++ b/README.md
@@ -5,24 +5,71 @@ About
 -------------
 
 Spike, the RISC-V ISA Simulator, implements a functional model of one or more
-RISC-V processors.
-
-Spike is named after the golden spike used to celebrate the completion of the
-US transcontinental railway.
+RISC-V harts.  It is named after the golden spike used to celebrate the
+completion of the US transcontinental railway.
+
+This fork extends Spike to support custom PULP instructions.
+Together with the repos riscv-opcodes and riscv-tests, it forms a framework that aids in developing extensions, testing implementations and running applications.
+
+Spike supports the following RISC-V ISA features:
+  - RV32I and RV64I base ISAs, v2.1
+  - Zifencei extension, v2.0
+  - Zicsr extension, v2.0
+  - M extension, v2.0
+  - A extension, v2.1
+  - F extension, v2.2
+  - D extension, v2.2
+  - Q extension, v2.2
+  - C extension, v2.0
+  - V extension, v0.9, w/ Zvlsseg/Zvamo/Zvqmac, w/o Zvediv, (_requires a 64-bit host_)
+  - Conformance to both RVWMO and RVTSO (Spike is sequentially consistent)
+  - Machine, Supervisor, and User modes, v1.11
+  - Debug v0.14
+  - All xpulpv3 extension subsets except xpulpelw
+
+Versioning and APIs
+-------------------
+
+Projects are versioned primarily to indicate when the API has been extended or
+rendered incompatible.  In that spirit, Spike aims to follow the
+[SemVer](https://semver.org/spec/v2.0.0.html) versioning scheme, in which
+major version numbers are incremented when backwards-incompatible API changes
+are made; minor version numbers are incremented when new APIs are added; and
+patch version numbers are incremented when bugs are fixed in
+a backwards-compatible manner.
+
+Spike's principal public API is the RISC-V ISA.  _The C++ interface to Spike's
+internals is **not** considered a public API at this time_, and
+backwards-incompatible changes to this interface _will_ be made without
+incrementing the major version number.
 
 Build Steps
 ---------------
 
 We assume that the RISCV environment variable is set to the RISC-V tools
-install path, and that the riscv-fesvr package is installed there.
+install path.
 
     $ apt-get install device-tree-compiler
     $ mkdir build
     $ cd build
-    $ ../configure --prefix=$RISCV --with-fesvr=$RISCV
+    $ ../configure --prefix=$RISCV
     $ make
     $ [sudo] make install
 
+Build Steps on OpenBSD
+----------------------
+
+Install bash, gmake, dtc, and use clang.
+
+    $ pkg_add bash gmake dtc
+    $ exec bash
+    $ export CC=cc; export CXX=c++
+    $ mkdir build
+    $ cd build
+    $ ../configure --prefix=$RISCV
+    $ gmake
+    $ [doas] gmake install
+
 Compiling and Running a Simple C Program
 -------------------------------------------
 
@@ -31,29 +78,49 @@ Install spike (see Build Steps), riscv-gnu-toolchain, and riscv-pk.
 Write a short C program and name it hello.c.  Then, compile it into a RISC-V
 ELF binary named hello:
 
-    $ riscv64-unknown-elf-gcc -o hello hello.c
+    $ riscv32-unknown-elf-gcc -o hello hello.c
 
 Now you can simulate the program atop the proxy kernel:
 
     $ spike pk hello
 
+Or on bare metal:
+
+```
+$ spike hello
+```
+
+[jonesinator/riscv-spike-minimal-assembly](https://github.com/jonesinator/riscv-spike-minimal-assembly) provides a well-documented minimal bare-metal program and also one that uses syscalls to communicate with the host.
+
+For xpulp-specific examples, take a look at the riscv-tests repo; its riscv-tests/isa directory contains functional tests for all supported xpulp instructions.
+
 Simulating a New Instruction
 ------------------------------------
 
-Adding an instruction to the simulator requires two steps:
+Adding an instruction to the simulator requires these steps:
+
+  1. Clone riscv-opcodes, add the opcode to it and generate encoding_out.h
+
+  2. Create a soft-link for riscv/encoding.h to the generated encoding_out.h
+
+     ```
+     $ ln -sfr riscv-opcodes/encoding_out.h riscv-isa-sim/riscv/encoding.h
+     ```
+
+  3. Describe the instruction's functional behavior in the file
+     riscv/insns/<new_instruction_name>.h.  Examine other instructions
+     in that directory as a starting point. Use macros from riscv/decode.h.
 
-  1.  Describe the instruction's functional behavior in the file
-      riscv/insns/<new_instruction_name>.h.  Examine other instructions
-      in that directory as a starting point.
+  4. Add the mnemonic format (disassembly format) of the instruction to disasm/disasm.cc
 
-  2.  Add the opcode and opcode mask to riscv/opcodes.h.  Alternatively,
-      add it to the riscv-opcodes package, and it will do so for you:
+  5. In riscv/riscv.mk.in add the instruction to riscv_insn_list.
+     You can get all instructions from your current encoding.h (encoding_out.h) using:
 
-         $ cd ../riscv-opcodes
-         $ vi opcodes       // add a line for the new instruction
-         $ make install
+     ```
+     $ grep ^DECLARE_INSN encoding.h | sed 's/DECLARE_INSN(\(.*\),.*,.*)/\1/'
+     ```
 
-  3.  Rebuild the simulator.
+  6. Rebuild the simulator.
 
 Interactive Debug Mode
 ---------------------------
@@ -84,7 +151,7 @@ To see the contents of memory with a virtual address (0 for core 0):
 
     : mem 0 2020
 
-You can advance by one instruction by pressing <enter>. You can also
+You can advance by one instruction by pressing the enter key. You can also
 execute until a desired equality is reached:
 
     : until pc 0 2020                   (stop when pc=2020)
@@ -136,6 +203,7 @@ int main()
         i++;
     }
 
+done:
     while (!wait)
         ;
 }
@@ -184,8 +252,8 @@ riscv.cpu: target state: halted
 In yet another shell, start your gdb debug session:
 ```
 tnewsome@compy-vm:~/SiFive/spike-test$ riscv64-unknown-elf-gdb rot13-64
-GNU gdb (GDB) 7.12.50.20170505-git
-Copyright (C) 2016 Free Software Foundation, Inc.
+GNU gdb (GDB) 8.0.50.20170724-git
+Copyright (C) 2017 Free Software Foundation, Inc.
 License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
 This is free software: you are free to change and redistribute it.
 There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
@@ -201,21 +269,22 @@ Type "apropos word" to search for commands related to "word"...
 Reading symbols from rot13-64...done.
 (gdb) target remote localhost:3333
 Remote debugging using localhost:3333
-0x000000001001000a in main () at rot13.c:8
-8           while (wait)
+0x0000000010010004 in main () at rot13.c:8
+8	    while (wait)
 (gdb) print wait
 $1 = 1
 (gdb) print wait=0
 $2 = 0
 (gdb) print text
 $3 = "Vafgehpgvba frgf jnag gb or serr!"
-(gdb) b 23
-Breakpoint 1 at 0x10010064: file rot13.c, line 23.
+(gdb) b done 
+Breakpoint 1 at 0x10010064: file rot13.c, line 22.
 (gdb) c
 Continuing.
+Disabling abstract command writes to CSRs.
 
 Breakpoint 1, main () at rot13.c:23
-23          while (!wait)
+23	    while (!wait)
 (gdb) print wait
 $4 = 0
 (gdb) print text
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000000..3af5f50aff
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+#define SPIKE_VERSION "1.0.1-dev"
diff --git a/aclocal.m4 b/aclocal.m4
index 15353f2c95..def74dbadf 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -59,49 +59,6 @@ AC_DEFUN([MCPPBS_PROG_INSTALL],
   # Check for install script
 
   AC_PROG_INSTALL
-
-  # Deterimine if native build and set prefix appropriately
-  
-  AS_IF([ test ${enable_stow} = "yes" ],
-  [
-    AC_CHECK_PROGS([stow],[stow],[no])  
-    AS_IF([ test ${stow} = "no" ],
-    [
-      AC_MSG_ERROR([Cannot use --enable-stow since stow is not available])
-    ])
-
-    # Check if native or non-native build
-
-    AS_IF([ test "${build}" = "${host}" ],
-    [
-
-      # build == host so this is a native build. Make sure --prefix not
-      # set and $STOW_PREFIX is set, then set prefix=$STOW_PREFIX.
-
-      AS_IF([ test "${prefix}" = "NONE" && test -n "${STOW_PREFIX}" ],
-      [
-        prefix="${STOW_PREFIX}"
-        AC_MSG_NOTICE([Using \$STOW_PREFIX from environment])
-        AC_MSG_NOTICE([prefix=${prefix}])
-      ])
-
-    ],[
-
-      # build != host so this is a non-native build. Make sure --prefix
-      # not set and $STOW_ROOT is set, then set
-      # prefix=$STOW_ROOT/${host_alias}.
-
-      AS_IF([ test "${prefix}" = "NONE" && test -n "${STOW_ROOT}" ],
-      [
-        prefix="${STOW_ROOT}/${host_alias}"
-        AC_MSG_NOTICE([Using \$STOW_ROOT from environment])
-        AC_MSG_NOTICE([prefix=${prefix}])
-      ])
-
-    ])
-      
-  ])
-
 ])
 
 #-------------------------------------------------------------------------
diff --git a/ax_append_flag.m4 b/ax_append_flag.m4
new file mode 100644
index 0000000000..dd6d8b6140
--- /dev/null
+++ b/ax_append_flag.m4
@@ -0,0 +1,50 @@
+# ===========================================================================
+#      https://www.gnu.org/software/autoconf-archive/ax_append_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_APPEND_FLAG(FLAG, [FLAGS-VARIABLE])
+#
+# DESCRIPTION
+#
+#   FLAG is appended to the FLAGS-VARIABLE shell variable, with a space
+#   added in between.
+#
+#   If FLAGS-VARIABLE is not specified, the current language's flags (e.g.
+#   CFLAGS) is used.  FLAGS-VARIABLE is not changed if it already contains
+#   FLAG.  If FLAGS-VARIABLE is unset in the shell, it is set to exactly
+#   FLAG.
+#
+#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+#serial 8
+
+AC_DEFUN([AX_APPEND_FLAG],
+[dnl
+AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_SET_IF
+AS_VAR_PUSHDEF([FLAGS], [m4_default($2,_AC_LANG_PREFIX[FLAGS])])
+AS_VAR_SET_IF(FLAGS,[
+  AS_CASE([" AS_VAR_GET(FLAGS) "],
+    [*" $1 "*], [AC_RUN_LOG([: FLAGS already contains $1])],
+    [
+     AS_VAR_APPEND(FLAGS,[" $1"])
+     AC_RUN_LOG([: FLAGS="$FLAGS"])
+    ])
+  ],
+  [
+  AS_VAR_SET(FLAGS,[$1])
+  AC_RUN_LOG([: FLAGS="$FLAGS"])
+  ])
+AS_VAR_POPDEF([FLAGS])dnl
+])dnl AX_APPEND_FLAG
diff --git a/ax_append_link_flags.m4 b/ax_append_link_flags.m4
new file mode 100644
index 0000000000..99b9fa5b4e
--- /dev/null
+++ b/ax_append_link_flags.m4
@@ -0,0 +1,44 @@
+# ===========================================================================
+#   https://www.gnu.org/software/autoconf-archive/ax_append_link_flags.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_APPEND_LINK_FLAGS([FLAG1 FLAG2 ...], [FLAGS-VARIABLE], [EXTRA-FLAGS], [INPUT])
+#
+# DESCRIPTION
+#
+#   For every FLAG1, FLAG2 it is checked whether the linker works with the
+#   flag.  If it does, the flag is added FLAGS-VARIABLE
+#
+#   If FLAGS-VARIABLE is not specified, the linker's flags (LDFLAGS) is
+#   used. During the check the flag is always added to the linker's flags.
+#
+#   If EXTRA-FLAGS is defined, it is added to the linker's default flags
+#   when the check is done.  The check is thus made with the flags: "LDFLAGS
+#   EXTRA-FLAGS FLAG".  This can for example be used to force the linker to
+#   issue an error when a bad flag is given.
+#
+#   INPUT gives an alternative input source to AC_COMPILE_IFELSE.
+#
+#   NOTE: This macro depends on the AX_APPEND_FLAG and AX_CHECK_LINK_FLAG.
+#   Please keep this macro in sync with AX_APPEND_COMPILE_FLAGS.
+#
+# LICENSE
+#
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+#serial 7
+
+AC_DEFUN([AX_APPEND_LINK_FLAGS],
+[AX_REQUIRE_DEFINED([AX_CHECK_LINK_FLAG])
+AX_REQUIRE_DEFINED([AX_APPEND_FLAG])
+for flag in $1; do
+  AX_CHECK_LINK_FLAG([$flag], [AX_APPEND_FLAG([$flag], [m4_default([$2], [LDFLAGS])])], [], [$3], [$4])
+done
+])dnl AX_APPEND_LINK_FLAGS
diff --git a/ax_check_link_flag.m4 b/ax_check_link_flag.m4
new file mode 100644
index 0000000000..03a30ce4c7
--- /dev/null
+++ b/ax_check_link_flag.m4
@@ -0,0 +1,53 @@
+# ===========================================================================
+#    https://www.gnu.org/software/autoconf-archive/ax_check_link_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_LINK_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT])
+#
+# DESCRIPTION
+#
+#   Check whether the given FLAG works with the linker or gives an error.
+#   (Warnings, however, are ignored)
+#
+#   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+#   success/failure.
+#
+#   If EXTRA-FLAGS is defined, it is added to the linker's default flags
+#   when the check is done.  The check is thus made with the flags: "LDFLAGS
+#   EXTRA-FLAGS FLAG".  This can for example be used to force the linker to
+#   issue an error when a bad flag is given.
+#
+#   INPUT gives an alternative input source to AC_LINK_IFELSE.
+#
+#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+#   macro in sync with AX_CHECK_{PREPROC,COMPILE}_FLAG.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+#serial 6
+
+AC_DEFUN([AX_CHECK_LINK_FLAG],
+[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF
+AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_ldflags_$4_$1])dnl
+AC_CACHE_CHECK([whether the linker accepts $1], CACHEVAR, [
+  ax_check_save_flags=$LDFLAGS
+  LDFLAGS="$LDFLAGS $4 $1"
+  AC_LINK_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
+    [AS_VAR_SET(CACHEVAR,[yes])],
+    [AS_VAR_SET(CACHEVAR,[no])])
+  LDFLAGS=$ax_check_save_flags])
+AS_VAR_IF(CACHEVAR,yes,
+  [m4_default([$2], :)],
+  [m4_default([$3], :)])
+AS_VAR_POPDEF([CACHEVAR])dnl
+])dnl AX_CHECK_LINK_FLAGS
diff --git a/ax_require_defined.m4 b/ax_require_defined.m4
new file mode 100644
index 0000000000..17c3eab7da
--- /dev/null
+++ b/ax_require_defined.m4
@@ -0,0 +1,37 @@
+# ===========================================================================
+#    https://www.gnu.org/software/autoconf-archive/ax_require_defined.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_REQUIRE_DEFINED(MACRO)
+#
+# DESCRIPTION
+#
+#   AX_REQUIRE_DEFINED is a simple helper for making sure other macros have
+#   been defined and thus are available for use.  This avoids random issues
+#   where a macro isn't expanded.  Instead the configure script emits a
+#   non-fatal:
+#
+#     ./configure: line 1673: AX_CFLAGS_WARN_ALL: command not found
+#
+#   It's like AC_REQUIRE except it doesn't expand the required macro.
+#
+#   Here's an example:
+#
+#     AX_REQUIRE_DEFINED([AX_CHECK_LINK_FLAG])
+#
+# LICENSE
+#
+#   Copyright (c) 2014 Mike Frysinger <vapier@gentoo.org>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 2
+
+AC_DEFUN([AX_REQUIRE_DEFINED], [dnl
+  m4_ifndef([$1], [m4_fatal([macro ]$1[ is not defined; is a m4 file missing?])])
+])dnl AX_REQUIRE_DEFINED
diff --git a/ci-tests/test-spike b/ci-tests/test-spike
new file mode 100755
index 0000000000..3d5ed6d79a
--- /dev/null
+++ b/ci-tests/test-spike
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -e
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+mkdir build
+cd build
+mkdir install
+$DIR/../configure --prefix=`pwd`/install
+make -j4
+make install
diff --git a/config.h.in b/config.h.in
index 137f195005..f5bbab1325 100644
--- a/config.h.in
+++ b/config.h.in
@@ -3,21 +3,33 @@
 /* Define if building universal (internal helper macro) */
 #undef AC_APPLE_UNIVERSAL_BUILD
 
+/* Define if subproject MCPPBS_SPROJ_NORM is enabled */
+#undef CUSTOMEXT_ENABLED
+
 /* Default value for --isa switch */
 #undef DEFAULT_ISA
 
-/* Path to the device-tree-compiler */
+/* Default value for --priv switch */
+#undef DEFAULT_PRIV
+
+/* Default value for --varch switch */
+#undef DEFAULT_VARCH
+
+/* Executable name of device-tree-compiler */
 #undef DTC
 
 /* Define if subproject MCPPBS_SPROJ_NORM is enabled */
-#undef DUMMY_ROCC_ENABLED
+#undef FDT_ENABLED
+
+/* Define if subproject MCPPBS_SPROJ_NORM is enabled */
+#undef FESVR_ENABLED
+
+/* Dynamic library loading is supported */
+#undef HAVE_DLOPEN
 
 /* Define to 1 if you have the <inttypes.h> header file. */
 #undef HAVE_INTTYPES_H
 
-/* Define to 1 if you have the `fesvr' library (-lfesvr). */
-#undef HAVE_LIBFESVR
-
 /* Define to 1 if you have the `pthread' library (-lpthread). */
 #undef HAVE_LIBPTHREAD
 
diff --git a/configure b/configure
index 015f63e965..7b9da7e86d 100755
--- a/configure
+++ b/configure
@@ -626,7 +626,8 @@ ac_subst_vars='LTLIBOBJS
 LIBOBJS
 subprojects_enabled
 subprojects
-stow
+HAVE_DLOPEN
+HAVE_INT128
 INSTALL_DATA
 INSTALL_SCRIPT
 INSTALL_PROGRAM
@@ -702,7 +703,8 @@ enable_option_checking
 enable_stow
 enable_optional_subprojects
 with_isa
-with_fesvr
+with_priv
+with_varch
 enable_commitlog
 enable_histogram
 enable_dirty
@@ -1360,8 +1362,9 @@ Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
   --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
   --with-isa=RV64IMAFDC   Sets the default RISC-V ISA
-  --with-fesvr            path to your fesvr installation if not in a standard
-                          location
+  --with-priv=MSU         Sets the default RISC-V privilege modes supported
+  --with-varch=vlen:128,elen:64,slen:128
+                          Sets the default vector config
 
 Some influential environment variables:
   CC          C compiler command
@@ -1643,6 +1646,60 @@ $as_echo "$ac_res" >&6; }
 
 } # ac_fn_cxx_check_header_compile
 
+# ac_fn_cxx_check_type LINENO TYPE VAR INCLUDES
+# ---------------------------------------------
+# Tests whether TYPE exists after having included INCLUDES, setting cache
+# variable VAR accordingly.
+ac_fn_cxx_check_type ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  eval "$3=no"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+if (sizeof ($2))
+	 return 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+if (sizeof (($2)))
+	    return 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+
+else
+  eval "$3=yes"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_cxx_check_type
+
 # ac_fn_cxx_try_link LINENO
 # -------------------------
 # Try to link conftest.$ac_ext, and return whether this succeeded.
@@ -2148,6 +2205,152 @@ case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac
 
 
 
+# ===========================================================================
+#    https://www.gnu.org/software/autoconf-archive/ax_require_defined.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_REQUIRE_DEFINED(MACRO)
+#
+# DESCRIPTION
+#
+#   AX_REQUIRE_DEFINED is a simple helper for making sure other macros have
+#   been defined and thus are available for use.  This avoids random issues
+#   where a macro isn't expanded.  Instead the configure script emits a
+#   non-fatal:
+#
+#     ./configure: line 1673: AX_CFLAGS_WARN_ALL: command not found
+#
+#   It's like AC_REQUIRE except it doesn't expand the required macro.
+#
+#   Here's an example:
+#
+#     AX_REQUIRE_DEFINED([AX_CHECK_LINK_FLAG])
+#
+# LICENSE
+#
+#   Copyright (c) 2014 Mike Frysinger <vapier@gentoo.org>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 2
+
+
+# ===========================================================================
+#      https://www.gnu.org/software/autoconf-archive/ax_append_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_APPEND_FLAG(FLAG, [FLAGS-VARIABLE])
+#
+# DESCRIPTION
+#
+#   FLAG is appended to the FLAGS-VARIABLE shell variable, with a space
+#   added in between.
+#
+#   If FLAGS-VARIABLE is not specified, the current language's flags (e.g.
+#   CFLAGS) is used.  FLAGS-VARIABLE is not changed if it already contains
+#   FLAG.  If FLAGS-VARIABLE is unset in the shell, it is set to exactly
+#   FLAG.
+#
+#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+#serial 8
+
+
+# ===========================================================================
+#    https://www.gnu.org/software/autoconf-archive/ax_check_link_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_LINK_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT])
+#
+# DESCRIPTION
+#
+#   Check whether the given FLAG works with the linker or gives an error.
+#   (Warnings, however, are ignored)
+#
+#   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+#   success/failure.
+#
+#   If EXTRA-FLAGS is defined, it is added to the linker's default flags
+#   when the check is done.  The check is thus made with the flags: "LDFLAGS
+#   EXTRA-FLAGS FLAG".  This can for example be used to force the linker to
+#   issue an error when a bad flag is given.
+#
+#   INPUT gives an alternative input source to AC_LINK_IFELSE.
+#
+#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+#   macro in sync with AX_CHECK_{PREPROC,COMPILE}_FLAG.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+#serial 6
+
+
+# ===========================================================================
+#   https://www.gnu.org/software/autoconf-archive/ax_append_link_flags.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_APPEND_LINK_FLAGS([FLAG1 FLAG2 ...], [FLAGS-VARIABLE], [EXTRA-FLAGS], [INPUT])
+#
+# DESCRIPTION
+#
+#   For every FLAG1, FLAG2 it is checked whether the linker works with the
+#   flag.  If it does, the flag is added FLAGS-VARIABLE
+#
+#   If FLAGS-VARIABLE is not specified, the linker's flags (LDFLAGS) is
+#   used. During the check the flag is always added to the linker's flags.
+#
+#   If EXTRA-FLAGS is defined, it is added to the linker's default flags
+#   when the check is done.  The check is thus made with the flags: "LDFLAGS
+#   EXTRA-FLAGS FLAG".  This can for example be used to force the linker to
+#   issue an error when a bad flag is given.
+#
+#   INPUT gives an alternative input source to AC_COMPILE_IFELSE.
+#
+#   NOTE: This macro depends on the AX_APPEND_FLAG and AX_CHECK_LINK_FLAG.
+#   Please keep this macro in sync with AX_APPEND_COMPILE_FLAGS.
+#
+# LICENSE
+#
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+#serial 7
+
+
+
 #-------------------------------------------------------------------------
 # Checks for programs
 #-------------------------------------------------------------------------
@@ -3428,7 +3631,7 @@ if test x"$DTC" == xno; then :
 fi
 
 cat >>confdefs.h <<_ACEOF
-#define DTC "$DTC"
+#define DTC "dtc"
 _ACEOF
 
 
@@ -4036,7 +4239,8 @@ fi
 $as_echo "$ac_cv_c_bigendian" >&6; }
  case $ac_cv_c_bigendian in #(
    yes)
-     as_fn_error $? "Spike requires a little-endian host" "$LINENO" 5;; #(
+     $as_echo "#define WORDS_BIGENDIAN 1" >>confdefs.h
+;; #(
    no)
       ;; #(
    universal)
@@ -4173,102 +4377,6 @@ fi
 
 
 
-  # Deterimine if native build and set prefix appropriately
-
-  if  test ${enable_stow} = "yes" ; then :
-
-    for ac_prog in stow
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_stow+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$stow"; then
-  ac_cv_prog_stow="$stow" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_stow="$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
-
-fi
-fi
-stow=$ac_cv_prog_stow
-if test -n "$stow"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $stow" >&5
-$as_echo "$stow" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-
-
-  test -n "$stow" && break
-done
-test -n "$stow" || stow="no"
-
-    if  test ${stow} = "no" ; then :
-
-      as_fn_error $? "Cannot use --enable-stow since stow is not available" "$LINENO" 5
-
-fi
-
-    # Check if native or non-native build
-
-    if  test "${build}" = "${host}" ; then :
-
-
-      # build == host so this is a native build. Make sure --prefix not
-      # set and $STOW_PREFIX is set, then set prefix=$STOW_PREFIX.
-
-      if  test "${prefix}" = "NONE" && test -n "${STOW_PREFIX}" ; then :
-
-        prefix="${STOW_PREFIX}"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: Using \$STOW_PREFIX from environment" >&5
-$as_echo "$as_me: Using \$STOW_PREFIX from environment" >&6;}
-        { $as_echo "$as_me:${as_lineno-$LINENO}: prefix=${prefix}" >&5
-$as_echo "$as_me: prefix=${prefix}" >&6;}
-
-fi
-
-
-else
-
-
-      # build != host so this is a non-native build. Make sure --prefix
-      # not set and $STOW_ROOT is set, then set
-      # prefix=$STOW_ROOT/${host_alias}.
-
-      if  test "${prefix}" = "NONE" && test -n "${STOW_ROOT}" ; then :
-
-        prefix="${STOW_ROOT}/${host_alias}"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: Using \$STOW_ROOT from environment" >&5
-$as_echo "$as_me: Using \$STOW_ROOT from environment" >&6;}
-        { $as_echo "$as_me:${as_lineno-$LINENO}: prefix=${prefix}" >&5
-$as_echo "$as_me: prefix=${prefix}" >&6;}
-
-fi
-
-
-fi
-
-
-fi
-
-
 
 #-------------------------------------------------------------------------
 # Checks for header files
@@ -4387,13 +4495,96 @@ $as_echo "#define STDC_HEADERS 1" >>confdefs.h
 fi
 
 
+#-------------------------------------------------------------------------
+# Checks for type
+#-------------------------------------------------------------------------
+
+ac_fn_cxx_check_type "$LINENO" "__int128_t" "ac_cv_type___int128_t" "$ac_includes_default"
+if test "x$ac_cv_type___int128_t" = xyes; then :
+  HAVE_INT128=yes
+
+fi
+
+
 #-------------------------------------------------------------------------
 # Default compiler flags
 #-------------------------------------------------------------------------
 
-CFLAGS="-Wall -Wno-unused -g -O2"
 
-CXXFLAGS="-Wall -Wno-unused -g -O2 -std=c++11"
+
+
+
+for flag in -Wl,--export-dynamic; do
+  as_CACHEVAR=`$as_echo "ax_cv_check_ldflags__$flag" | $as_tr_sh`
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the linker accepts $flag" >&5
+$as_echo_n "checking whether the linker accepts $flag... " >&6; }
+if eval \${$as_CACHEVAR+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+  ax_check_save_flags=$LDFLAGS
+  LDFLAGS="$LDFLAGS  $flag"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
+  eval "$as_CACHEVAR=yes"
+else
+  eval "$as_CACHEVAR=no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+  LDFLAGS=$ax_check_save_flags
+fi
+eval ac_res=\$$as_CACHEVAR
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+if eval test \"x\$"$as_CACHEVAR"\" = x"yes"; then :
+
+if ${LDFLAGS+:} false; then :
+
+  case " $LDFLAGS " in #(
+  *" $flag "*) :
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: : LDFLAGS already contains \$flag"; } >&5
+  (: LDFLAGS already contains $flag) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } ;; #(
+  *) :
+
+     as_fn_append LDFLAGS " $flag"
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: : LDFLAGS=\"\$LDFLAGS\""; } >&5
+  (: LDFLAGS="$LDFLAGS") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+     ;;
+esac
+
+else
+
+  LDFLAGS=$flag
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: : LDFLAGS=\"\$LDFLAGS\""; } >&5
+  (: LDFLAGS="$LDFLAGS") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+
+fi
+
+else
+  :
+fi
+
+done
 
 
 #-------------------------------------------------------------------------
@@ -4438,6 +4629,98 @@ fi
 
 
 
+    # Add subproject to our running list
+
+    subprojects="$subprojects fesvr"
+
+    # Process the subproject appropriately. If enabled add it to the
+    # $enabled_subprojects running shell variable, set a
+    # SUBPROJECT_ENABLED C define, and include the appropriate
+    # 'subproject.ac'.
+
+
+      { $as_echo "$as_me:${as_lineno-$LINENO}: configuring default subproject : fesvr" >&5
+$as_echo "$as_me: configuring default subproject : fesvr" >&6;}
+      ac_config_files="$ac_config_files fesvr.mk:fesvr/fesvr.mk.in"
+
+      enable_fesvr_sproj="yes"
+      subprojects_enabled="$subprojects_enabled fesvr"
+
+$as_echo "#define FESVR_ENABLED /**/" >>confdefs.h
+
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
+$as_echo_n "checking for pthread_create in -lpthread... " >&6; }
+if ${ac_cv_lib_pthread_pthread_create+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lpthread  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char pthread_create ();
+int
+main ()
+{
+return pthread_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
+  ac_cv_lib_pthread_pthread_create=yes
+else
+  ac_cv_lib_pthread_pthread_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5
+$as_echo "$ac_cv_lib_pthread_pthread_create" >&6; }
+if test "x$ac_cv_lib_pthread_pthread_create" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBPTHREAD 1
+_ACEOF
+
+  LIBS="-lpthread $LIBS"
+
+else
+  as_fn_error $? "libpthread is required" "$LINENO" 5
+fi
+
+
+
+
+
+
+    # Determine if this is a required or an optional subproject
+
+
+
+    # Determine if there is a group with the same name
+
+
+
+    # Create variations of the subproject name suitable for use as a CPP
+    # enabled define, a shell enabled variable, and a shell function
+
+
+
+
+
+
+
+
+
+
+
     # Add subproject to our running list
 
     subprojects="$subprojects riscv"
@@ -4481,6 +4764,41 @@ _ACEOF
 fi
 
 
+
+# Check whether --with-priv was given.
+if test "${with_priv+set}" = set; then :
+  withval=$with_priv;
+cat >>confdefs.h <<_ACEOF
+#define DEFAULT_PRIV "$withval"
+_ACEOF
+
+else
+
+cat >>confdefs.h <<_ACEOF
+#define DEFAULT_PRIV "MSU"
+_ACEOF
+
+fi
+
+
+
+# Check whether --with-varch was given.
+if test "${with_varch+set}" = set; then :
+  withval=$with_varch;
+cat >>confdefs.h <<_ACEOF
+#define DEFAULT_VARCH "$withval"
+_ACEOF
+
+else
+
+cat >>confdefs.h <<_ACEOF
+#define DEFAULT_VARCH "vlen:128,elen:64,slen:128"
+_ACEOF
+
+fi
+
+
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing dlopen" >&5
 $as_echo_n "checking for library containing dlopen... " >&6; }
 if ${ac_cv_search_dlopen+:} false; then :
@@ -4535,69 +4853,12 @@ ac_res=$ac_cv_search_dlopen
 if test "$ac_res" != no; then :
   test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
 
-else
-
-  as_fn_error $? "unable to find the dlopen() function" "$LINENO" 5
-
-fi
-
-
 
-# Check whether --with-fesvr was given.
-if test "${with_fesvr+set}" = set; then :
-  withval=$with_fesvr;
-    LDFLAGS="-L$withval/lib $LDFLAGS"
-    CPPFLAGS="-I$withval/include $CPPFLAGS"
-
-
-fi
-
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for libfesvr_is_present in -lfesvr" >&5
-$as_echo_n "checking for libfesvr_is_present in -lfesvr... " >&6; }
-if ${ac_cv_lib_fesvr_libfesvr_is_present+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
-LIBS="-lfesvr -pthread $LIBS"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
-#endif
-char libfesvr_is_present ();
-int
-main ()
-{
-return libfesvr_is_present ();
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_link "$LINENO"; then :
-  ac_cv_lib_fesvr_libfesvr_is_present=yes
-else
-  ac_cv_lib_fesvr_libfesvr_is_present=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_fesvr_libfesvr_is_present" >&5
-$as_echo "$ac_cv_lib_fesvr_libfesvr_is_present" >&6; }
-if test "x$ac_cv_lib_fesvr_libfesvr_is_present" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LIBFESVR 1
-_ACEOF
+$as_echo "#define HAVE_DLOPEN /**/" >>confdefs.h
+  # dlopen() was found: also set the shell variable behind the Makefile's @HAVE_DLOPEN@.
+  HAVE_DLOPEN=yes
 
-  LIBS="-lfesvr $LIBS"
 
-else
-  as_fn_error $? "libfesvr is required" "$LINENO" 5
 fi
 
 
@@ -4728,7 +4989,7 @@ fi
 
     # Add subproject to our running list
 
-    subprojects="$subprojects dummy_rocc"
+    subprojects="$subprojects disasm"
 
     # Process the subproject appropriately. If enabled add it to the
     # $enabled_subprojects running shell variable, set a
@@ -4736,14 +4997,104 @@ fi
     # 'subproject.ac'.
 
 
-      { $as_echo "$as_me:${as_lineno-$LINENO}: configuring default subproject : dummy_rocc" >&5
-$as_echo "$as_me: configuring default subproject : dummy_rocc" >&6;}
-      ac_config_files="$ac_config_files dummy_rocc.mk:dummy_rocc/dummy_rocc.mk.in"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: configuring default subproject : disasm" >&5
+$as_echo "$as_me: configuring default subproject : disasm" >&6;}
+      ac_config_files="$ac_config_files disasm.mk:disasm/disasm.mk.in"
 
-      enable_dummy_rocc_sproj="yes"
-      subprojects_enabled="$subprojects_enabled dummy_rocc"
+      enable_disasm_sproj="yes"
+      subprojects_enabled="$subprojects_enabled disasm"
 
-$as_echo "#define DUMMY_ROCC_ENABLED /**/" >>confdefs.h
+$as_echo "#define DISASM_ENABLED /**/" >>confdefs.h
+
+
+
+
+
+
+    # Determine if this is a required or an optional subproject
+
+
+
+    # Determine if there is a group with the same name
+
+
+
+    # Create variations of the subproject name suitable for use as a CPP
+    # enabled define, a shell enabled variable, and a shell function
+
+
+
+
+
+
+
+
+
+
+
+    # Add subproject to our running list
+
+    subprojects="$subprojects customext"
+
+    # Process the subproject appropriately. If enabled add it to the
+    # $enabled_subprojects running shell variable, set a
+    # SUBPROJECT_ENABLED C define, and include the appropriate
+    # 'subproject.ac'.
+
+
+      { $as_echo "$as_me:${as_lineno-$LINENO}: configuring default subproject : customext" >&5
+$as_echo "$as_me: configuring default subproject : customext" >&6;}
+      ac_config_files="$ac_config_files customext.mk:customext/customext.mk.in"
+
+      enable_customext_sproj="yes"
+      subprojects_enabled="$subprojects_enabled customext"
+
+$as_echo "#define CUSTOMEXT_ENABLED /**/" >>confdefs.h
+
+
+
+
+
+
+    # Determine if this is a required or an optional subproject
+
+
+
+    # Determine if there is a group with the same name
+
+
+
+    # Create variations of the subproject name suitable for use as a CPP
+    # enabled define, a shell enabled variable, and a shell function
+
+
+
+
+
+
+
+
+
+
+
+    # Add subproject to our running list
+
+    subprojects="$subprojects fdt"
+
+    # Process the subproject appropriately. If enabled add it to the
+    # $enabled_subprojects running shell variable, set a
+    # SUBPROJECT_ENABLED C define, and include the appropriate
+    # 'subproject.ac'.
+
+
+      { $as_echo "$as_me:${as_lineno-$LINENO}: configuring default subproject : fdt" >&5
+$as_echo "$as_me: configuring default subproject : fdt" >&6;}
+      ac_config_files="$ac_config_files fdt.mk:fdt/fdt.mk.in"
+
+      enable_fdt_sproj="yes"
+      subprojects_enabled="$subprojects_enabled fdt"
+
+$as_echo "#define FDT_ENABLED /**/" >>confdefs.h
 
 
 
@@ -4840,6 +5191,51 @@ $as_echo "#define SPIKE_MAIN_ENABLED /**/" >>confdefs.h
 
 
 
+    # Determine if this is a required or an optional subproject
+
+
+
+    # Determine if there is a group with the same name
+
+
+
+    # Create variations of the subproject name suitable for use as a CPP
+    # enabled define, a shell enabled variable, and a shell function
+
+
+
+
+
+
+
+
+
+
+
+    # Add subproject to our running list
+
+    subprojects="$subprojects spike_dasm"
+
+    # Process the subproject appropriately. If enabled add it to the
+    # $enabled_subprojects running shell variable, set a
+    # SUBPROJECT_ENABLED C define, and include the appropriate
+    # 'subproject.ac'.
+
+
+      { $as_echo "$as_me:${as_lineno-$LINENO}: configuring default subproject : spike_dasm" >&5
+$as_echo "$as_me: configuring default subproject : spike_dasm" >&6;}
+      ac_config_files="$ac_config_files spike_dasm.mk:spike_dasm/spike_dasm.mk.in"
+
+      enable_spike_dasm_sproj="yes"
+      subprojects_enabled="$subprojects_enabled spike_dasm"
+
+$as_echo "#define SPIKE_DASM_ENABLED /**/" >>confdefs.h
+
+
+
+
+
+
   # Output make variables
 
 
@@ -4866,15 +5262,9 @@ ac_config_headers="$ac_config_headers config.h"
 
 ac_config_files="$ac_config_files Makefile"
 
-ac_config_files="$ac_config_files riscv-spike.pc"
-
-ac_config_files="$ac_config_files riscv-riscv.pc"
-
-ac_config_files="$ac_config_files riscv-softfloat.pc"
-
-ac_config_files="$ac_config_files riscv-dummy_rocc.pc"
+ac_config_files="$ac_config_files riscv-fesvr.pc"
 
-ac_config_files="$ac_config_files riscv-spike_main.pc"
+ac_config_files="$ac_config_files riscv-disasm.pc"
 
 cat >confcache <<\_ACEOF
 # This file is a shell script that caches the results of configure
@@ -5567,17 +5957,18 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 for ac_config_target in $ac_config_targets
 do
   case $ac_config_target in
+    "fesvr.mk") CONFIG_FILES="$CONFIG_FILES fesvr.mk:fesvr/fesvr.mk.in" ;;
     "riscv.mk") CONFIG_FILES="$CONFIG_FILES riscv.mk:riscv/riscv.mk.in" ;;
-    "dummy_rocc.mk") CONFIG_FILES="$CONFIG_FILES dummy_rocc.mk:dummy_rocc/dummy_rocc.mk.in" ;;
+    "disasm.mk") CONFIG_FILES="$CONFIG_FILES disasm.mk:disasm/disasm.mk.in" ;;
+    "customext.mk") CONFIG_FILES="$CONFIG_FILES customext.mk:customext/customext.mk.in" ;;
+    "fdt.mk") CONFIG_FILES="$CONFIG_FILES fdt.mk:fdt/fdt.mk.in" ;;
     "softfloat.mk") CONFIG_FILES="$CONFIG_FILES softfloat.mk:softfloat/softfloat.mk.in" ;;
     "spike_main.mk") CONFIG_FILES="$CONFIG_FILES spike_main.mk:spike_main/spike_main.mk.in" ;;
+    "spike_dasm.mk") CONFIG_FILES="$CONFIG_FILES spike_dasm.mk:spike_dasm/spike_dasm.mk.in" ;;
     "config.h") CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;;
     "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
-    "riscv-spike.pc") CONFIG_FILES="$CONFIG_FILES riscv-spike.pc" ;;
-    "riscv-riscv.pc") CONFIG_FILES="$CONFIG_FILES riscv-riscv.pc" ;;
-    "riscv-softfloat.pc") CONFIG_FILES="$CONFIG_FILES riscv-softfloat.pc" ;;
-    "riscv-dummy_rocc.pc") CONFIG_FILES="$CONFIG_FILES riscv-dummy_rocc.pc" ;;
-    "riscv-spike_main.pc") CONFIG_FILES="$CONFIG_FILES riscv-spike_main.pc" ;;
+    "riscv-fesvr.pc") CONFIG_FILES="$CONFIG_FILES riscv-fesvr.pc" ;;
+    "riscv-disasm.pc") CONFIG_FILES="$CONFIG_FILES riscv-disasm.pc" ;;
 
   *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
   esac
diff --git a/configure.ac b/configure.ac
index e361877da2..b7788b4ea9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -43,6 +43,11 @@ AC_CONFIG_AUX_DIR([scripts])
 AC_CANONICAL_BUILD
 AC_CANONICAL_HOST
 
+m4_include(ax_require_defined.m4)
+m4_include(ax_append_flag.m4)
+m4_include(ax_check_link_flag.m4)
+m4_include(ax_append_link_flags.m4)
+
 #-------------------------------------------------------------------------
 # Checks for programs
 #-------------------------------------------------------------------------
@@ -53,9 +58,9 @@ AC_CHECK_TOOL([AR],[ar])
 AC_CHECK_TOOL([RANLIB],[ranlib])
 AC_PATH_PROG([DTC],[dtc],[no])
 AS_IF([test x"$DTC" == xno],AC_MSG_ERROR([device-tree-compiler not found]))
-AC_DEFINE_UNQUOTED(DTC, ["$DTC"], [Path to the device-tree-compiler])
+AC_DEFINE_UNQUOTED(DTC, ["dtc"], [Executable name of device-tree-compiler])
 
-AC_C_BIGENDIAN(AC_MSG_ERROR([Spike requires a little-endian host]))
+AC_C_BIGENDIAN
 
 #-------------------------------------------------------------------------
 # MCPPBS specific program checks
@@ -72,12 +77,17 @@ MCPPBS_PROG_INSTALL
 
 AC_HEADER_STDC
 
+#-------------------------------------------------------------------------
+# Checks for type
+#-------------------------------------------------------------------------
+
+AC_CHECK_TYPE([__int128_t], AC_SUBST([HAVE_INT128],[yes]))
+
 #-------------------------------------------------------------------------
 # Default compiler flags
 #-------------------------------------------------------------------------
 
-AC_SUBST([CFLAGS],  ["-Wall -Wno-unused -g -O2"])
-AC_SUBST([CXXFLAGS],["-Wall -Wno-unused -g -O2 -std=c++11"])
+AX_APPEND_LINK_FLAGS([-Wl,--export-dynamic])
 
 #-------------------------------------------------------------------------
 # MCPPBS subproject list
@@ -86,7 +96,7 @@ AC_SUBST([CXXFLAGS],["-Wall -Wno-unused -g -O2 -std=c++11"])
 # The '*' suffix indicates an optional subproject. The '**' suffix
 # indicates an optional subproject which is also the name of a group.
 
-MCPPBS_SUBPROJECTS([ riscv, dummy_rocc, softfloat, spike_main ])
+MCPPBS_SUBPROJECTS([ fesvr, riscv, disasm, customext, fdt, softfloat, spike_main, spike_dasm ])
 
 #-------------------------------------------------------------------------
 # MCPPBS subproject groups
@@ -105,9 +115,6 @@ MCPPBS_SUBPROJECTS([ riscv, dummy_rocc, softfloat, spike_main ])
 
 AC_CONFIG_HEADERS([config.h])
 AC_CONFIG_FILES([Makefile])
-AC_CONFIG_FILES([riscv-spike.pc])
-AC_CONFIG_FILES([riscv-riscv.pc])
-AC_CONFIG_FILES([riscv-softfloat.pc])
-AC_CONFIG_FILES([riscv-dummy_rocc.pc])
-AC_CONFIG_FILES([riscv-spike_main.pc])
+AC_CONFIG_FILES([riscv-fesvr.pc])
+AC_CONFIG_FILES([riscv-disasm.pc])
 AC_OUTPUT
diff --git a/customext/cflush.cc b/customext/cflush.cc
new file mode 100644
index 0000000000..dedcc03763
--- /dev/null
+++ b/customext/cflush.cc
@@ -0,0 +1,41 @@
+#include "extension.h"
+#include <cstring>
+
+struct : public arg_t {  // disassembly operand: the xpr_name entry for the instruction's rs1 field
+  std::string to_string(insn_t insn) const {
+    return xpr_name[insn.rs1()];
+  }
+} xrs1;
+
+static reg_t custom_cflush(processor_t* p, insn_t insn, reg_t pc)
+{  // Handler shared by every encoding cflush_t registers; returns the next pc.
+  require_privilege(PRV_M);  // privilege check against PRV_M comes first
+  // Nothing else is modelled here: just step over the 4-byte instruction.
+  return pc + 4;
+}
+
+class cflush_t : public extension_t  // extension providing three custom encodings and their mnemonics
+{
+ public:
+  const char* name() { return "cflush"; }  // identifier string for this extension
+
+  cflush_t() {}
+
+  std::vector<insn_desc_t> get_instructions() {  // one descriptor per encoding; all use custom_cflush
+    std::vector<insn_desc_t> insns;
+    insns.push_back((insn_desc_t){0xFC000073, 0xFFF07FFF, custom_cflush, custom_cflush});  // fields presumably {match, mask, handler, handler} - confirm against insn_desc_t
+    insns.push_back((insn_desc_t){0xFC200073, 0xFFF07FFF, custom_cflush, custom_cflush});  // 0xFFF07FFF has bits 19:15 clear
+    insns.push_back((insn_desc_t){0xFC100073, 0xFFF07FFF, custom_cflush, custom_cflush});
+    return insns;
+  }
+
+  std::vector<disasm_insn_t*> get_disasms() {  // mnemonics for the same three value pairs, each printing xrs1
+    std::vector<disasm_insn_t*> insns;
+    insns.push_back(new disasm_insn_t("cflush.d.l1", 0xFC000073, 0xFFF07FFF, {&xrs1}));  // allocated with new and never freed in this file; presumably the caller owns it (TODO confirm)
+    insns.push_back(new disasm_insn_t("cdiscard.d.l1", 0xFC200073, 0xFFF07FFF, {&xrs1}));
+    insns.push_back(new disasm_insn_t("cflush.i.l1", 0xFC100073, 0xFFF07FFF, {&xrs1}));
+    return insns;
+  }
+};
+
+REGISTER_EXTENSION(cflush, []() { return new cflush_t; })  // presumably registers this factory under the name cflush - confirm against the macro definition
diff --git a/dummy_rocc/dummy_rocc.ac b/customext/customext.ac
similarity index 100%
rename from dummy_rocc/dummy_rocc.ac
rename to customext/customext.ac
diff --git a/customext/customext.mk.in b/customext/customext.mk.in
new file mode 100644
index 0000000000..0dd725ef2c
--- /dev/null
+++ b/customext/customext.mk.in
@@ -0,0 +1,12 @@
+customext_subproject_deps = \
+	spike_main \
+	riscv \
+	softfloat \
+
+customext_srcs = \
+	dummy_rocc.cc \
+	cflush.cc \
+
+customext_CFLAGS = -fPIC
+
+customext_install_shared_lib = yes
diff --git a/dummy_rocc/dummy_rocc.cc b/customext/dummy_rocc.cc
similarity index 100%
rename from dummy_rocc/dummy_rocc.cc
rename to customext/dummy_rocc.cc
diff --git a/dummy_rocc/dummy_rocc_test.c b/customext/dummy_rocc_test.c
similarity index 100%
rename from dummy_rocc/dummy_rocc_test.c
rename to customext/dummy_rocc_test.c
diff --git a/debug_rom/debug_rom.S b/debug_rom/debug_rom.S
index 28c7076fda..8d8e4cd037 100755
--- a/debug_rom/debug_rom.S
+++ b/debug_rom/debug_rom.S
@@ -14,6 +14,7 @@
 entry:
        jal zero, _entry
 resume:
+       // Not used.
        jal zero, _resume
 exception:
        jal zero, _exception
@@ -37,16 +38,22 @@ entry_loop:
         csrr s0, CSR_MHARTID
         lbu  s0, DEBUG_ROM_FLAGS(s0) // multiple harts can resume  here
         andi s0, s0, (1 << DEBUG_ROM_FLAG_RESUME)
-        bnez s0, resume
+        bnez s0, _resume
+        wfi
         jal  zero, entry_loop
 
 _exception:
+        // Restore S0, which we always save to dscratch.
+        // We need this in case the user tried an abstract write to a
+        // non-existent CSR.
+        csrr    s0, CSR_DSCRATCH
         sw      zero, DEBUG_ROM_EXCEPTION(zero) // Let debug module know you got an exception.
         ebreak
 
 going:
+        csrr s0, CSR_MHARTID
+        sw s0, DEBUG_ROM_GOING(zero)     // When debug module sees this write, the GO flag is reset.
         csrr s0, CSR_DSCRATCH            // Restore s0 here
-        sw zero, DEBUG_ROM_GOING(zero)   // When debug module sees this write, the GO flag is reset.
         fence
         fence.i
         jalr zero, zero, %lo(whereto)    // Debug module will put different instructions and data in the RAM, 
diff --git a/debug_rom/debug_rom.h b/debug_rom/debug_rom.h
index d21e1669c7..7edd5f68f9 100644
--- a/debug_rom/debug_rom.h
+++ b/debug_rom/debug_rom.h
@@ -1,12 +1,13 @@
 static const unsigned char debug_rom_raw[] = {
-  0x6f, 0x00, 0xc0, 0x00, 0x6f, 0x00, 0x40, 0x05, 0x6f, 0x00, 0x40, 0x03,
+  0x6f, 0x00, 0xc0, 0x00, 0x6f, 0x00, 0x00, 0x06, 0x6f, 0x00, 0x80, 0x03,
   0x0f, 0x00, 0xf0, 0x0f, 0x73, 0x10, 0x24, 0x7b, 0x73, 0x24, 0x40, 0xf1,
   0x23, 0x20, 0x80, 0x10, 0x03, 0x44, 0x04, 0x40, 0x13, 0x74, 0x14, 0x00,
-  0x63, 0x10, 0x04, 0x02, 0x73, 0x24, 0x40, 0xf1, 0x03, 0x44, 0x04, 0x40,
-  0x13, 0x74, 0x24, 0x00, 0xe3, 0x18, 0x04, 0xfc, 0x6f, 0xf0, 0xdf, 0xfd,
-  0x23, 0x26, 0x00, 0x10, 0x73, 0x00, 0x10, 0x00, 0x73, 0x24, 0x20, 0x7b,
-  0x23, 0x22, 0x00, 0x10, 0x0f, 0x00, 0xf0, 0x0f, 0x0f, 0x10, 0x00, 0x00,
+  0x63, 0x14, 0x04, 0x02, 0x73, 0x24, 0x40, 0xf1, 0x03, 0x44, 0x04, 0x40,
+  0x13, 0x74, 0x24, 0x00, 0x63, 0x18, 0x04, 0x02, 0x73, 0x00, 0x50, 0x10,
+  0x6f, 0xf0, 0x9f, 0xfd, 0x73, 0x24, 0x20, 0x7b, 0x23, 0x26, 0x00, 0x10,
+  0x73, 0x00, 0x10, 0x00, 0x73, 0x24, 0x40, 0xf1, 0x23, 0x22, 0x80, 0x10,
+  0x73, 0x24, 0x20, 0x7b, 0x0f, 0x00, 0xf0, 0x0f, 0x0f, 0x10, 0x00, 0x00,
   0x67, 0x00, 0x00, 0x30, 0x73, 0x24, 0x40, 0xf1, 0x23, 0x24, 0x80, 0x10,
   0x73, 0x24, 0x20, 0x7b, 0x73, 0x00, 0x20, 0x7b
 };
-static const unsigned int debug_rom_raw_len = 104;
+static const unsigned int debug_rom_raw_len = 116;
diff --git a/disasm/disasm.ac b/disasm/disasm.ac
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/disasm/disasm.cc b/disasm/disasm.cc
new file mode 100644
index 0000000000..0b95893299
--- /dev/null
+++ b/disasm/disasm.cc
@@ -0,0 +1,1744 @@
+// See LICENSE for license details.
+
+#include "disasm.h"
+#include <cassert>
+#include <string>
+#include <vector>
+#include <cstdarg>
+#include <sstream>
+#include <stdlib.h>
+
+// Offset/base operand: decimal i_imm(), then xpr_name[rs1()] in parentheses.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.i_imm())) + '(' + xpr_name[insn.rs1()] + ')'; }
+} load_address;
+
+// Offset/base operand: decimal s_imm(), then xpr_name[rs1()] in parentheses.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.s_imm())) + '(' + xpr_name[insn.rs1()] + ')'; }
+} store_address;
+
+// Address operand with no offset: xpr_name[rs1()] wrapped in parentheses.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::string("(") + xpr_name[insn.rs1()] + ')'; }
+} amo_address;
+
+// Register operand: the xpr_name entry selected by the instruction's rd().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return xpr_name[insn.rd()]; }
+} xrd;
+
+// Register operand: the xpr_name entry selected by the instruction's rs1().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return xpr_name[insn.rs1()]; }
+} xrs1;
+
+// Register operand: the xpr_name entry selected by the instruction's rs2().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return xpr_name[insn.rs2()]; }
+} xrs2;
+
+// Register operand: the fpr_name entry selected by the instruction's rd().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return fpr_name[insn.rd()]; }
+} frd;
+
+// Register operand: the fpr_name entry selected by the instruction's rs1().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return fpr_name[insn.rs1()]; }
+} frs1;
+
+// Register operand: the fpr_name entry selected by the instruction's rs2().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return fpr_name[insn.rs2()]; }
+} frs2;
+
+// Register operand: the fpr_name entry selected by the instruction's rs3().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return fpr_name[insn.rs3()]; }
+} frs3;
+
+// CSR operand: name from encoding.h's DECLARE_CSR list, else "unknown_<hex>".
+struct : public arg_t {
+  std::string to_string(insn_t insn) const {
+    const auto csr_num = insn.csr();
+    switch (csr_num) {
+      // Each DECLARE_CSR(name, num) use expands to one returning case label.
+      #define DECLARE_CSR(name, num) case num: return #name;
+      #include "encoding.h"
+      #undef DECLARE_CSR
+      default: break;
+    }
+    char text[16];
+    snprintf(text, sizeof text, "unknown_%03" PRIx64, csr_num);
+    return std::string(text);
+  }
+} csr;
+
+// Immediate operand: i_imm() converted to int and printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.i_imm())); }
+} imm;
+
+// Shift-amount operand: shamt() converted to int and printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.shamt())); }
+} shamt;
+
+struct : public arg_t {  // prints "0x" + hex of ((uint32_t)u_imm() >> 12)
+  std::string to_string(insn_t insn) const {
+    std::stringstream text;
+    text << "0x" << std::hex << (static_cast<uint32_t>(insn.u_imm()) >> 12);
+    return text.str();
+  }
+} bigimm;
+
+// Immediate operand: the rs1() field value itself, printed as a number.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(insn.rs1()); }
+} zimm5;
+
+// Branch operand relative to pc: "pc + N" / "pc - N", N = |sb_imm()| in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const {
+    const int32_t offset = insn.sb_imm();
+    std::stringstream text;
+    text << "pc " << (offset < 0 ? '-' : '+') << ' ' << abs(offset);
+    return text.str();
+  }
+} branch_target;
+
+// Jump operand relative to pc: "pc + 0xN" / "pc - 0xN", N = |uj_imm()| in hex.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const {
+    const int32_t offset = insn.uj_imm();
+    std::stringstream text;
+    text << "pc " << (offset < 0 ? '-' : '+') << " 0x" << std::hex << abs(offset);
+    return text.str();
+  }
+} jump_target;
+
+// Register operand: the xpr_name entry selected by rvc_rs1().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return xpr_name[insn.rvc_rs1()]; }
+} rvc_rs1;
+
+// Register operand: the xpr_name entry selected by rvc_rs2().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return xpr_name[insn.rvc_rs2()]; }
+} rvc_rs2;
+
+// Register operand: the fpr_name entry selected by rvc_rs2().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return fpr_name[insn.rvc_rs2()]; }
+} rvc_fp_rs2;
+
+// Register operand: the xpr_name entry selected by rvc_rs1s().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return xpr_name[insn.rvc_rs1s()]; }
+} rvc_rs1s;
+
+// Register operand: the xpr_name entry selected by rvc_rs2s().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return xpr_name[insn.rvc_rs2s()]; }
+} rvc_rs2s;
+
+// Register operand: the fpr_name entry selected by rvc_rs2s().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return fpr_name[insn.rvc_rs2s()]; }
+} rvc_fp_rs2s;
+
+// Fixed register operand: always xpr_name[X_SP]; the instruction is not read.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return xpr_name[X_SP]; }
+} rvc_sp;
+
+// Immediate operand: rvc_imm() converted to int and printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_imm())); }
+} rvc_imm;
+
+// Immediate operand: rvc_addi4spn_imm() converted to int, printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_addi4spn_imm())); }
+} rvc_addi4spn_imm;
+
+// Immediate operand: rvc_addi16sp_imm() converted to int, printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_addi16sp_imm())); }
+} rvc_addi16sp_imm;
+
+// Immediate operand: rvc_lwsp_imm() converted to int, printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_lwsp_imm())); }
+} rvc_lwsp_imm;
+
+// Shift-amount operand: the low six bits of rvc_imm(), printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_imm() & 0x3f)); }
+} rvc_shamt;
+
+struct : public arg_t {  // prints "0x" + hex of the low 20 bits of rvc_imm()
+  std::string to_string(insn_t insn) const {
+    std::stringstream text;
+    text << "0x" << std::hex << (static_cast<uint32_t>(insn.rvc_imm()) << 12 >> 12);
+    return text.str();
+  }
+} rvc_uimm;
+
+// Offset/base operand: decimal rvc_lwsp_imm(), then xpr_name[X_SP] in parens.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_lwsp_imm())) + '(' + xpr_name[X_SP] + ')'; }
+} rvc_lwsp_address;
+
+// Offset/base operand: decimal rvc_ldsp_imm(), then xpr_name[X_SP] in parens.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_ldsp_imm())) + '(' + xpr_name[X_SP] + ')'; }
+} rvc_ldsp_address;
+
+// Offset/base operand: decimal rvc_swsp_imm(), then xpr_name[X_SP] in parens.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_swsp_imm())) + '(' + xpr_name[X_SP] + ')'; }
+} rvc_swsp_address;
+
+// Offset/base operand: decimal rvc_sdsp_imm(), then xpr_name[X_SP] in parens.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_sdsp_imm())) + '(' + xpr_name[X_SP] + ')'; }
+} rvc_sdsp_address;
+
+// Offset/base operand: decimal rvc_lw_imm(), then xpr_name[rvc_rs1s()] in parens.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_lw_imm())) + '(' + xpr_name[insn.rvc_rs1s()] + ')'; }
+} rvc_lw_address;
+
+// Offset/base operand: decimal rvc_ld_imm(), then xpr_name[rvc_rs1s()] in parens.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.rvc_ld_imm())) + '(' + xpr_name[insn.rvc_rs1s()] + ')'; }
+} rvc_ld_address;
+
+// Branch operand relative to pc: "pc + N" / "pc - N", N = |rvc_b_imm()| in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const {
+    const int32_t offset = insn.rvc_b_imm();
+    std::stringstream text;
+    text << "pc " << (offset < 0 ? '-' : '+') << ' ' << abs(offset);
+    return text.str();
+  }
+} rvc_branch_target;
+
+// Jump operand relative to pc: "pc + N" / "pc - N", N = |rvc_j_imm()| in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const {
+    const int32_t offset = insn.rvc_j_imm();
+    std::stringstream text;
+    text << "pc " << (offset < 0 ? '-' : '+') << ' ' << abs(offset);
+    return text.str();
+  }
+} rvc_jump_target;
+
+// Address operand with no offset: xpr_name[rs1()] wrapped in parentheses.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::string("(") + xpr_name[insn.rs1()] + ')'; }
+} v_address;
+
+// Register operand: the vr_name entry selected by the instruction's rd().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return vr_name[insn.rd()]; }
+} vd;
+
+// Register operand: the vr_name entry selected by the instruction's rs1().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return vr_name[insn.rs1()]; }
+} vs1;
+
+// Register operand: the vr_name entry selected by the instruction's rs2().
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return vr_name[insn.rs2()]; }
+} vs2;
+
+// Register operand: the vr_name entry selected by rd() (same field vd reads).
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return vr_name[insn.rd()]; }
+} vs3;
+
+// Mask operand: "v0.t" when v_vm() is zero, otherwise an empty string.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return insn.v_vm() ? "" : "v0.t"; }
+} vm;
+
+// Fixed operand: always the literal "v0"; the instruction is not read.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return "v0"; }
+} v0;
+
+// Immediate operand: v_simm5() converted to int and printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.v_simm5())); }
+} v_simm5;
+
+// vtype operand for vsetvli, printed as "e<sew>, m<lmul>, <ta|tu>, <ma|mu>".
+//
+// With v_frac_lmul() set, v_lmul() values 3/2/1 print as mf2/mf4/mf8. Any
+// other value is not a supported fractional LMUL and prints as "m<reserved>",
+// so the unsupported encoding is visible in the listing. It is reported in the
+// text rather than asserted on because every vsetvli encoding is routed to
+// this printer, and an assert of the form assert(true && "...") never fires
+// anyway.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const {
+    const int sew = insn.v_sew();
+    const int lmul = insn.v_lmul();
+
+    std::stringstream s;
+    s << "e" << sew << ", m";
+    if (insn.v_frac_lmul()) {
+      switch (lmul) {
+        case 3:  s << "f2"; break;
+        case 2:  s << "f4"; break;
+        case 1:  s << "f8"; break;
+        default: s << "<reserved>"; break;
+      }
+    } else {
+      s << (1 << lmul);  // non-fractional multiplier is printed as 2^lmul
+    }
+
+    s << ", " << (insn.v_vta() == 1 ? "ta" : "tu");
+    s << ", " << (insn.v_vma() == 1 ? "ma" : "mu");
+    return s.str();
+  }
+} v_vtype;
+
+// Fixed operand: always the literal "x0"; the instruction is not read.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return "x0"; }
+} x0;
+
+// Xpulpimg
+
+// Immediate operand: p_uimmL() converted to uint32_t, printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<uint32_t>(insn.p_uimmL())); }
+} p_uimmL;
+
+// Immediate operand: p_uimmS() converted to uint32_t, printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<uint32_t>(insn.p_uimmS())); }
+} p_uimmS;
+
+// Operand: p_loop() converted to uint32_t and printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<uint32_t>(insn.p_loop())); }
+} p_loop;
+
+// Immediate operand: p_Luimm5() converted to uint32_t, printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<uint32_t>(insn.p_Luimm5())); }
+} p_Luimm5;
+
+// Immediate operand: p_zimm5() converted to uint32_t, printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<uint32_t>(insn.p_zimm5())); }
+} p_zimm5;
+
+// Immediate operand: p_simm5() converted to int and printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.p_simm5())); }
+} p_simm5;
+
+// Immediate operand: p_zimm6() converted to uint32_t, printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<uint32_t>(insn.p_zimm6())); }
+} p_zimm6;
+
+// Immediate operand: p_simm6() converted to int and printed in decimal.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.p_simm6())); }
+} p_simm6;
+
+// Operand of the form "<i_imm>(<xpr_name[rs1]>!)": decimal offset, '!' after base.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.i_imm())) + '(' + xpr_name[insn.rs1()] + "!)"; }
+} load_address_irpost;
+
+// Operand of the form "<xpr_name[rs2]>(<xpr_name[rs1]>!)".
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::string(xpr_name[insn.rs2()]) + '(' + xpr_name[insn.rs1()] + "!)"; }
+} load_address_rrpost;
+
+// Operand of the form "<xpr_name[rs2]>(<xpr_name[rs1]>)".
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::string(xpr_name[insn.rs2()]) + '(' + xpr_name[insn.rs1()] + ')'; }
+} load_address_rr;
+
+// Operand of the form "<s_imm>(<xpr_name[rs1]>!)": decimal offset, '!' after base.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::to_string(static_cast<int>(insn.s_imm())) + '(' + xpr_name[insn.rs1()] + "!)"; }
+} store_address_irpost;
+
+// Operand of the form "<xpr_name[p_rs3]>(<xpr_name[rs1]>!)".
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::string(xpr_name[insn.p_rs3()]) + '(' + xpr_name[insn.rs1()] + "!)"; }
+} store_address_rrpost;
+
+// Operand of the form "<xpr_name[p_rs3]>(<xpr_name[rs1]>)".
+struct : public arg_t {
+  std::string to_string(insn_t insn) const
+  { return std::string(xpr_name[insn.p_rs3()]) + '(' + xpr_name[insn.rs1()] + ')'; }
+} store_address_rr;
+
+
+struct custom_fmt_t {              // one row of an instruction-template table
+  reg_t match;                     // base match value handed to disasm_insn_t
+  reg_t mask;                      // base mask value handed to disasm_insn_t
+  const char *fmt;                 // printf-style pattern for the mnemonic
+  std::vector<const arg_t*>& arg;  // operand list; held by reference, not copied
+};
+
+// Renders insn through the entry lookup() returns; "unknown" when it is null.
+std::string disassembler_t::disassemble(insn_t insn) const {
+  if (const disasm_insn_t* entry = lookup(insn)) return entry->to_string(insn);
+  return "unknown";
+}
+
+disassembler_t::disassembler_t(int xlen)
+{
+  const uint32_t mask_rd = 0x1fUL << 7;
+  const uint32_t match_rd_ra = 1UL << 7;
+  const uint32_t mask_rs1 = 0x1fUL << 15;
+  const uint32_t match_rs1_ra = 1UL << 15;
+  const uint32_t mask_rs2 = 0x1fUL << 20;
+  const uint32_t mask_imm = 0xfffUL << 20;
+  const uint32_t match_imm_1 = 1UL << 20;
+  const uint32_t mask_rvc_rs2 = 0x1fUL << 2;
+  const uint32_t mask_rvc_imm = mask_rvc_rs2 | 0x1000UL;
+  const uint32_t mask_nf = 0x7Ul << 29;
+  const uint32_t mask_wd = 0x1Ul << 26;
+  const uint32_t mask_vm = 0x1Ul << 25;
+  const uint32_t mask_vldst = 0x7Ul << 12 | 0x1UL << 28;
+  const uint32_t mask_amoop = 0x1fUl << 27;
+  const uint32_t mask_width = 0x7Ul << 12;
+
+  #define DECLARE_INSN(code, match, mask) \
+   const uint32_t match_##code = match; \
+   const uint32_t mask_##code = mask;
+  #include "encoding.h"
+  #undef DECLARE_INSN
+
+  // explicit per-instruction disassembly
+  #define DISASM_INSN(name, code, extra, ...) \
+    add_insn(new disasm_insn_t(name, match_##code, mask_##code | (extra), __VA_ARGS__));
+  #define DEFINE_NOARG(code) \
+    add_insn(new disasm_insn_t(#code, match_##code, mask_##code, {}));
+  #define DEFINE_RTYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &xrs2})
+  #define DEFINE_R1TYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1})
+  #define DEFINE_ITYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &imm})
+  #define DEFINE_ITYPE_SHIFT(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &shamt})
+  #define DEFINE_I0TYPE(name, code) DISASM_INSN(name, code, mask_rs1, {&xrd, &imm})
+  #define DEFINE_I1TYPE(name, code) DISASM_INSN(name, code, mask_imm, {&xrd, &xrs1})
+  #define DEFINE_I2TYPE(name, code) DISASM_INSN(name, code, mask_rd | mask_imm, {&xrs1})
+  #define DEFINE_LTYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &bigimm})
+  #define DEFINE_BTYPE(code) DISASM_INSN(#code, code, 0, {&xrs1, &xrs2, &branch_target})
+  #define DEFINE_B0TYPE(name, code) DISASM_INSN(name, code, mask_rs1 | mask_rs2, {&branch_target})
+  #define DEFINE_B1TYPE(name, code) DISASM_INSN(name, code, mask_rs2, {&xrs1, &branch_target})
+  #define DEFINE_XLOAD(code) DISASM_INSN(#code, code, 0, {&xrd, &load_address})
+  #define DEFINE_XSTORE(code) DISASM_INSN(#code, code, 0, {&xrs2, &store_address})
+  #define DEFINE_XAMO(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs2, &amo_address})
+  #define DEFINE_XAMO_LR(code) DISASM_INSN(#code, code, 0, {&xrd, &amo_address})
+  #define DEFINE_FLOAD(code) DISASM_INSN(#code, code, 0, {&frd, &load_address})
+  #define DEFINE_FSTORE(code) DISASM_INSN(#code, code, 0, {&frs2, &store_address})
+  #define DEFINE_FRTYPE(code) DISASM_INSN(#code, code, 0, {&frd, &frs1, &frs2})
+  #define DEFINE_FR1TYPE(code) DISASM_INSN(#code, code, 0, {&frd, &frs1})
+  #define DEFINE_FR3TYPE(code) DISASM_INSN(#code, code, 0, {&frd, &frs1, &frs2, &frs3})
+  #define DEFINE_FXTYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &frs1})
+  #define DEFINE_FX2TYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &frs1, &frs2})
+  #define DEFINE_XFTYPE(code) DISASM_INSN(#code, code, 0, {&frd, &xrs1})
+  #define DEFINE_SFENCE_TYPE(code) DISASM_INSN(#code, code, 0, {&xrs1, &xrs2})
+  // Xpulpimg
+  #define DEFINE_PLOAD_IRPOST(code) DISASM_INSN(#code, code, 0, {&xrd, &load_address_irpost})
+  #define DEFINE_PLOAD_RRPOST(code) DISASM_INSN(#code, code, 0, {&xrd, &load_address_rrpost})
+  #define DEFINE_PLOAD_RR(code) DISASM_INSN(#code, code, 0, {&xrd, &load_address_rr})
+  #define DEFINE_PSTORE_IRPOST(code) DISASM_INSN(#code, code, 0, {&xrs2, &store_address_irpost})
+  #define DEFINE_PSTORE_RRPOST(code) DISASM_INSN(#code, code, 0, {&xrs2, &store_address_rrpost})
+  #define DEFINE_PSTORE_RR(code) DISASM_INSN(#code, code, 0, {&xrs2, &store_address_rr})
+  #define DEFINE_PI0TYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &p_zimm5})
+  #define DEFINE_PI1ZTYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &p_zimm6})
+  #define DEFINE_PI1STYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &p_simm6})
+  #define DEFINE_PLUI2TYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &p_Luimm5, &p_zimm5})
+  #define DEFINE_PBTYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &p_simm5, &branch_target})
+  #define DEFINE_PR2LUITYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &xrs2, &p_Luimm5})
+
+  DEFINE_XLOAD(lb)
+  DEFINE_XLOAD(lbu)
+  DEFINE_XLOAD(lh)
+  DEFINE_XLOAD(lhu)
+  DEFINE_XLOAD(lw)
+  DEFINE_XLOAD(lwu)
+  DEFINE_XLOAD(ld)
+
+  DEFINE_XSTORE(sb)
+  DEFINE_XSTORE(sh)
+  DEFINE_XSTORE(sw)
+  DEFINE_XSTORE(sd)
+
+  DEFINE_XAMO(amoadd_w)
+  DEFINE_XAMO(amoswap_w)
+  DEFINE_XAMO(amoand_w)
+  DEFINE_XAMO(amoor_w)
+  DEFINE_XAMO(amoxor_w)
+  DEFINE_XAMO(amomin_w)
+  DEFINE_XAMO(amomax_w)
+  DEFINE_XAMO(amominu_w)
+  DEFINE_XAMO(amomaxu_w)
+  DEFINE_XAMO(amoadd_d)
+  DEFINE_XAMO(amoswap_d)
+  DEFINE_XAMO(amoand_d)
+  DEFINE_XAMO(amoor_d)
+  DEFINE_XAMO(amoxor_d)
+  DEFINE_XAMO(amomin_d)
+  DEFINE_XAMO(amomax_d)
+  DEFINE_XAMO(amominu_d)
+  DEFINE_XAMO(amomaxu_d)
+
+  DEFINE_XAMO_LR(lr_w)
+  DEFINE_XAMO(sc_w)
+  DEFINE_XAMO_LR(lr_d)
+  DEFINE_XAMO(sc_d)
+
+  DEFINE_FLOAD(flw)
+  DEFINE_FLOAD(fld)
+  DEFINE_FLOAD(flh)
+  DEFINE_FLOAD(flq)
+
+  DEFINE_FSTORE(fsw)
+  DEFINE_FSTORE(fsd)
+  DEFINE_FSTORE(fsh)
+  DEFINE_FSTORE(fsq)
+
+  add_insn(new disasm_insn_t("j", match_jal, mask_jal | mask_rd, {&jump_target}));
+  add_insn(new disasm_insn_t("jal", match_jal | match_rd_ra, mask_jal | mask_rd, {&jump_target}));
+  add_insn(new disasm_insn_t("jal", match_jal, mask_jal, {&xrd, &jump_target}));
+
+  DEFINE_B1TYPE("beqz", beq);
+  DEFINE_B1TYPE("bnez", bne);
+  DEFINE_B1TYPE("bltz", blt);
+  DEFINE_B1TYPE("bgez", bge);
+  DEFINE_BTYPE(beq)
+  DEFINE_BTYPE(bne)
+  DEFINE_BTYPE(blt)
+  DEFINE_BTYPE(bge)
+  DEFINE_BTYPE(bltu)
+  DEFINE_BTYPE(bgeu)
+
+  DEFINE_LTYPE(lui);
+  DEFINE_LTYPE(auipc);
+
+  add_insn(new disasm_insn_t("ret", match_jalr | match_rs1_ra, mask_jalr | mask_rd | mask_rs1 | mask_imm, {}));
+  DEFINE_I2TYPE("jr", jalr);
+  add_insn(new disasm_insn_t("jalr", match_jalr | match_rd_ra, mask_jalr | mask_rd | mask_imm, {&xrs1}));
+  DEFINE_ITYPE(jalr);
+
+  add_insn(new disasm_insn_t("nop", match_addi, mask_addi | mask_rd | mask_rs1 | mask_imm, {}));
+  DEFINE_I0TYPE("li", addi);
+  DEFINE_I1TYPE("mv", addi);
+  DEFINE_ITYPE(addi);
+  DEFINE_ITYPE(slti);
+  add_insn(new disasm_insn_t("seqz", match_sltiu | match_imm_1, mask_sltiu | mask_imm, {&xrd, &xrs1}));
+  DEFINE_ITYPE(sltiu);
+  add_insn(new disasm_insn_t("not", match_xori | mask_imm, mask_xori | mask_imm, {&xrd, &xrs1}));
+  DEFINE_ITYPE(xori);
+
+  DEFINE_ITYPE_SHIFT(slli);
+  DEFINE_ITYPE_SHIFT(srli);
+  DEFINE_ITYPE_SHIFT(srai);
+
+  DEFINE_ITYPE(ori);
+  DEFINE_ITYPE(andi);
+  DEFINE_I1TYPE("sext.w", addiw);
+  DEFINE_ITYPE(addiw);
+
+  DEFINE_ITYPE_SHIFT(slliw);
+  DEFINE_ITYPE_SHIFT(srliw);
+  DEFINE_ITYPE_SHIFT(sraiw);
+
+  DEFINE_RTYPE(add);
+  DEFINE_RTYPE(sub);
+  DEFINE_RTYPE(sll);
+  DEFINE_RTYPE(slt);
+  add_insn(new disasm_insn_t("snez", match_sltu, mask_sltu | mask_rs1, {&xrd, &xrs2}));
+  DEFINE_RTYPE(sltu);
+  DEFINE_RTYPE(xor);
+  DEFINE_RTYPE(srl);
+  DEFINE_RTYPE(sra);
+  DEFINE_RTYPE(or);
+  DEFINE_RTYPE(and);
+  DEFINE_RTYPE(mul);
+  DEFINE_RTYPE(mulh);
+  DEFINE_RTYPE(mulhu);
+  DEFINE_RTYPE(mulhsu);
+  DEFINE_RTYPE(div);
+  DEFINE_RTYPE(divu);
+  DEFINE_RTYPE(rem);
+  DEFINE_RTYPE(remu);
+  DEFINE_RTYPE(addw);
+  DEFINE_RTYPE(subw);
+  DEFINE_RTYPE(sllw);
+  DEFINE_RTYPE(srlw);
+  DEFINE_RTYPE(sraw);
+  DEFINE_RTYPE(mulw);
+  DEFINE_RTYPE(divw);
+  DEFINE_RTYPE(divuw);
+  DEFINE_RTYPE(remw);
+  DEFINE_RTYPE(remuw);
+
+  DEFINE_NOARG(ecall);
+  DEFINE_NOARG(ebreak);
+  DEFINE_NOARG(uret);
+  DEFINE_NOARG(sret);
+  DEFINE_NOARG(mret);
+  DEFINE_NOARG(dret);
+  DEFINE_NOARG(wfi);
+  DEFINE_NOARG(fence);
+  DEFINE_NOARG(fence_i);
+  DEFINE_SFENCE_TYPE(sfence_vma);
+
+  add_insn(new disasm_insn_t("csrr", match_csrrs, mask_csrrs | mask_rs1, {&xrd, &csr}));
+  add_insn(new disasm_insn_t("csrw", match_csrrw, mask_csrrw | mask_rd, {&csr, &xrs1}));
+  add_insn(new disasm_insn_t("csrs", match_csrrs, mask_csrrs | mask_rd, {&csr, &xrs1}));
+  add_insn(new disasm_insn_t("csrc", match_csrrc, mask_csrrc | mask_rd, {&csr, &xrs1}));
+  add_insn(new disasm_insn_t("csrwi", match_csrrwi, mask_csrrwi | mask_rd, {&csr, &zimm5}));
+  add_insn(new disasm_insn_t("csrsi", match_csrrsi, mask_csrrsi | mask_rd, {&csr, &zimm5}));
+  add_insn(new disasm_insn_t("csrci", match_csrrci, mask_csrrci | mask_rd, {&csr, &zimm5}));
+  add_insn(new disasm_insn_t("csrrw", match_csrrw, mask_csrrw, {&xrd, &csr, &xrs1}));
+  add_insn(new disasm_insn_t("csrrs", match_csrrs, mask_csrrs, {&xrd, &csr, &xrs1}));
+  add_insn(new disasm_insn_t("csrrc", match_csrrc, mask_csrrc, {&xrd, &csr, &xrs1}));
+  add_insn(new disasm_insn_t("csrrwi", match_csrrwi, mask_csrrwi, {&xrd, &csr, &zimm5}));
+  add_insn(new disasm_insn_t("csrrsi", match_csrrsi, mask_csrrsi, {&xrd, &csr, &zimm5}));
+  add_insn(new disasm_insn_t("csrrci", match_csrrci, mask_csrrci, {&xrd, &csr, &zimm5}));
+
+  DEFINE_FRTYPE(fadd_s);
+  DEFINE_FRTYPE(fsub_s);
+  DEFINE_FRTYPE(fmul_s);
+  DEFINE_FRTYPE(fdiv_s);
+  DEFINE_FR1TYPE(fsqrt_s);
+  DEFINE_FRTYPE(fmin_s);
+  DEFINE_FRTYPE(fmax_s);
+  DEFINE_FR3TYPE(fmadd_s);
+  DEFINE_FR3TYPE(fmsub_s);
+  DEFINE_FR3TYPE(fnmadd_s);
+  DEFINE_FR3TYPE(fnmsub_s);
+  DEFINE_FRTYPE(fsgnj_s);
+  DEFINE_FRTYPE(fsgnjn_s);
+  DEFINE_FRTYPE(fsgnjx_s);
+  DEFINE_FR1TYPE(fcvt_s_d);
+  DEFINE_FR1TYPE(fcvt_s_q);
+  DEFINE_XFTYPE(fcvt_s_l);
+  DEFINE_XFTYPE(fcvt_s_lu);
+  DEFINE_XFTYPE(fcvt_s_w);
+  DEFINE_XFTYPE(fcvt_s_wu);
+  DEFINE_XFTYPE(fcvt_s_wu);
+  DEFINE_XFTYPE(fmv_w_x);
+  DEFINE_FXTYPE(fcvt_l_s);
+  DEFINE_FXTYPE(fcvt_lu_s);
+  DEFINE_FXTYPE(fcvt_w_s);
+  DEFINE_FXTYPE(fcvt_wu_s);
+  DEFINE_FXTYPE(fclass_s);
+  DEFINE_FXTYPE(fmv_x_w);
+  DEFINE_FX2TYPE(feq_s);
+  DEFINE_FX2TYPE(flt_s);
+  DEFINE_FX2TYPE(fle_s);
+
+  DEFINE_FRTYPE(fadd_d);
+  DEFINE_FRTYPE(fsub_d);
+  DEFINE_FRTYPE(fmul_d);
+  DEFINE_FRTYPE(fdiv_d);
+  DEFINE_FR1TYPE(fsqrt_d);
+  DEFINE_FRTYPE(fmin_d);
+  DEFINE_FRTYPE(fmax_d);
+  DEFINE_FR3TYPE(fmadd_d);
+  DEFINE_FR3TYPE(fmsub_d);
+  DEFINE_FR3TYPE(fnmadd_d);
+  DEFINE_FR3TYPE(fnmsub_d);
+  DEFINE_FRTYPE(fsgnj_d);
+  DEFINE_FRTYPE(fsgnjn_d);
+  DEFINE_FRTYPE(fsgnjx_d);
+  DEFINE_FR1TYPE(fcvt_d_s);
+  DEFINE_FR1TYPE(fcvt_d_q);
+  DEFINE_XFTYPE(fcvt_d_l);
+  DEFINE_XFTYPE(fcvt_d_lu);
+  DEFINE_XFTYPE(fcvt_d_w);
+  DEFINE_XFTYPE(fcvt_d_wu);
+  DEFINE_XFTYPE(fcvt_d_wu);
+  DEFINE_XFTYPE(fmv_d_x);
+  DEFINE_FXTYPE(fcvt_l_d);
+  DEFINE_FXTYPE(fcvt_lu_d);
+  DEFINE_FXTYPE(fcvt_w_d);
+  DEFINE_FXTYPE(fcvt_wu_d);
+  DEFINE_FXTYPE(fclass_d);
+  DEFINE_FXTYPE(fmv_x_d);
+  DEFINE_FX2TYPE(feq_d);
+  DEFINE_FX2TYPE(flt_d);
+  DEFINE_FX2TYPE(fle_d);
+
+  DEFINE_FRTYPE(fadd_h);
+  DEFINE_FRTYPE(fsub_h);
+  DEFINE_FRTYPE(fmul_h);
+  DEFINE_FRTYPE(fdiv_h);
+  DEFINE_FR1TYPE(fsqrt_h);
+  DEFINE_FRTYPE(fmin_h);
+  DEFINE_FRTYPE(fmax_h);
+  DEFINE_FR3TYPE(fmadd_h);
+  DEFINE_FR3TYPE(fmsub_h);
+  DEFINE_FR3TYPE(fnmadd_h);
+  DEFINE_FR3TYPE(fnmsub_h);
+  DEFINE_FRTYPE(fsgnj_h);
+  DEFINE_FRTYPE(fsgnjn_h);
+  DEFINE_FRTYPE(fsgnjx_h);
+  DEFINE_FR1TYPE(fcvt_h_s);
+  DEFINE_FR1TYPE(fcvt_h_d);
+  DEFINE_FR1TYPE(fcvt_h_q);
+  DEFINE_FR1TYPE(fcvt_s_h);
+  DEFINE_FR1TYPE(fcvt_d_h);
+  DEFINE_FR1TYPE(fcvt_q_h);
+  DEFINE_XFTYPE(fcvt_h_l);
+  DEFINE_XFTYPE(fcvt_h_lu);
+  DEFINE_XFTYPE(fcvt_h_w);
+  DEFINE_XFTYPE(fcvt_h_wu);
+  DEFINE_XFTYPE(fcvt_h_wu);
+  DEFINE_XFTYPE(fmv_h_x);
+  DEFINE_FXTYPE(fcvt_l_h);
+  DEFINE_FXTYPE(fcvt_lu_h);
+  DEFINE_FXTYPE(fcvt_w_h);
+  DEFINE_FXTYPE(fcvt_wu_h);
+  DEFINE_FXTYPE(fclass_h);
+  DEFINE_FXTYPE(fmv_x_h);
+  DEFINE_FX2TYPE(feq_h);
+  DEFINE_FX2TYPE(flt_h);
+  DEFINE_FX2TYPE(fle_h);
+
+  DEFINE_FRTYPE(fadd_q);
+  DEFINE_FRTYPE(fsub_q);
+  DEFINE_FRTYPE(fmul_q);
+  DEFINE_FRTYPE(fdiv_q);
+  DEFINE_FR1TYPE(fsqrt_q);
+  DEFINE_FRTYPE(fmin_q);
+  DEFINE_FRTYPE(fmax_q);
+  DEFINE_FR3TYPE(fmadd_q);
+  DEFINE_FR3TYPE(fmsub_q);
+  DEFINE_FR3TYPE(fnmadd_q);
+  DEFINE_FR3TYPE(fnmsub_q);
+  DEFINE_FRTYPE(fsgnj_q);
+  DEFINE_FRTYPE(fsgnjn_q);
+  DEFINE_FRTYPE(fsgnjx_q);
+  DEFINE_FR1TYPE(fcvt_q_s);
+  DEFINE_FR1TYPE(fcvt_q_d);
+  DEFINE_XFTYPE(fcvt_q_l);
+  DEFINE_XFTYPE(fcvt_q_lu);
+  DEFINE_XFTYPE(fcvt_q_w);
+  DEFINE_XFTYPE(fcvt_q_wu);
+  DEFINE_XFTYPE(fcvt_q_wu);
+  //DEFINE_XFTYPE(fmv_q_x);
+  DEFINE_FXTYPE(fcvt_l_q);
+  DEFINE_FXTYPE(fcvt_lu_q);
+  DEFINE_FXTYPE(fcvt_w_q);
+  DEFINE_FXTYPE(fcvt_wu_q);
+  DEFINE_FXTYPE(fclass_q);
+  //DEFINE_FXTYPE(fmv_x_q);
+  DEFINE_FX2TYPE(feq_q);
+  DEFINE_FX2TYPE(flt_q);
+  DEFINE_FX2TYPE(fle_q);
+
+  DISASM_INSN("c.ebreak", c_add, mask_rd | mask_rvc_rs2, {});
+  add_insn(new disasm_insn_t("ret", match_c_jr | match_rd_ra, mask_c_jr | mask_rd | mask_rvc_imm, {}));
+  DISASM_INSN("c.jr", c_jr, mask_rvc_imm, {&rvc_rs1});
+  DISASM_INSN("c.jalr", c_jalr, mask_rvc_imm, {&rvc_rs1});
+  DISASM_INSN("c.nop", c_addi, mask_rd | mask_rvc_imm, {});
+  DISASM_INSN("c.addi16sp", c_addi16sp, mask_rd, {&rvc_sp, &rvc_addi16sp_imm});
+  DISASM_INSN("c.addi4spn", c_addi4spn, 0, {&rvc_rs2s, &rvc_sp, &rvc_addi4spn_imm});
+  DISASM_INSN("c.li", c_li, 0, {&xrd, &rvc_imm});
+  DISASM_INSN("c.lui", c_lui, 0, {&xrd, &rvc_uimm});
+  DISASM_INSN("c.addi", c_addi, 0, {&xrd, &rvc_imm});
+  DISASM_INSN("c.slli", c_slli, 0, {&rvc_rs1, &rvc_shamt});
+  DISASM_INSN("c.srli", c_srli, 0, {&rvc_rs1s, &rvc_shamt});
+  DISASM_INSN("c.srai", c_srai, 0, {&rvc_rs1s, &rvc_shamt});
+  DISASM_INSN("c.andi", c_andi, 0, {&rvc_rs1s, &rvc_imm});
+  DISASM_INSN("c.mv", c_mv, 0, {&xrd, &rvc_rs2});
+  DISASM_INSN("c.add", c_add, 0, {&xrd, &rvc_rs2});
+  DISASM_INSN("c.addw", c_addw, 0, {&rvc_rs1s, &rvc_rs2s});
+  DISASM_INSN("c.sub", c_sub, 0, {&rvc_rs1s, &rvc_rs2s});
+  DISASM_INSN("c.subw", c_subw, 0, {&rvc_rs1s, &rvc_rs2s});
+  DISASM_INSN("c.and", c_and, 0, {&rvc_rs1s, &rvc_rs2s});
+  DISASM_INSN("c.or", c_or, 0, {&rvc_rs1s, &rvc_rs2s});
+  DISASM_INSN("c.xor", c_xor, 0, {&rvc_rs1s, &rvc_rs2s});
+  DISASM_INSN("c.lwsp", c_lwsp, 0, {&xrd, &rvc_lwsp_address});
+  DISASM_INSN("c.fld", c_fld, 0, {&rvc_fp_rs2s, &rvc_ld_address});
+  DISASM_INSN("c.swsp", c_swsp, 0, {&rvc_rs2, &rvc_swsp_address});
+  DISASM_INSN("c.lw", c_lw, 0, {&rvc_rs2s, &rvc_lw_address});
+  DISASM_INSN("c.sw", c_sw, 0, {&rvc_rs2s, &rvc_lw_address});
+  DISASM_INSN("c.beqz", c_beqz, 0, {&rvc_rs1s, &rvc_branch_target});
+  DISASM_INSN("c.bnez", c_bnez, 0, {&rvc_rs1s, &rvc_branch_target});
+  DISASM_INSN("c.j", c_j, 0, {&rvc_jump_target});
+  DISASM_INSN("c.fldsp", c_fldsp, 0, {&frd, &rvc_ldsp_address});
+  DISASM_INSN("c.fsd", c_fsd, 0, {&rvc_fp_rs2s, &rvc_ld_address});
+  DISASM_INSN("c.fsdsp", c_fsdsp, 0, {&rvc_fp_rs2, &rvc_sdsp_address});
+
+#ifdef VECTOR_EXT
+
+  DISASM_INSN("vsetvli", vsetvli, 0, {&xrd, &xrs1, &v_vtype});
+  //DISASM_INSN("vsetvl", vsetvl, 0, {&xrd, &xrs1, &xrs2});
+
+  #define DISASM_VMEM_INSN(name, fmt, ff) \
+    add_insn(new disasm_insn_t(#name "8"    #ff ".v",  match_##name##8##ff##_v,     mask_##name##8##ff##_v    | mask_nf, fmt)); \
+    add_insn(new disasm_insn_t(#name "16"   #ff ".v",  match_##name##16##ff##_v,    mask_##name##16##ff##_v   | mask_nf, fmt)); \
+    add_insn(new disasm_insn_t(#name "32"   #ff ".v",  match_##name##32##ff##_v,    mask_##name##32##ff##_v   | mask_nf, fmt)); \
+    add_insn(new disasm_insn_t(#name "64"   #ff ".v",  match_##name##64##ff##_v,    mask_##name##64##ff##_v   | mask_nf, fmt)); \
+    add_insn(new disasm_insn_t(#name "128"  #ff ".v",  match_##name##128##ff##_v,   mask_##name##128##ff##_v  | mask_nf, fmt)); \
+    add_insn(new disasm_insn_t(#name "256"  #ff ".v",  match_##name##256##ff##_v,   mask_##name##256##ff##_v  | mask_nf, fmt)); \
+    add_insn(new disasm_insn_t(#name "512"  #ff ".v",  match_##name##512##ff##_v,   mask_##name##512##ff##_v  | mask_nf, fmt)); \
+    add_insn(new disasm_insn_t(#name "1024" #ff ".v",  match_##name##1024##ff##_v,  mask_##name##1024##ff##_v | mask_nf, fmt)); \
+
+  std::vector<const arg_t *> v_ld_unit = {&vd, &v_address, &opt, &vm};
+  std::vector<const arg_t *> v_st_unit = {&vs3, &v_address, &opt, &vm};
+  std::vector<const arg_t *> v_ld_stride = {&vd, &v_address, &xrs2, &opt, &vm};
+  std::vector<const arg_t *> v_st_stride = {&vs3, &v_address, &xrs2, &opt, &vm};
+  std::vector<const arg_t *> v_ld_index = {&vd, &v_address, &vs2, &opt, &vm};
+  std::vector<const arg_t *> v_st_index = {&vs3, &v_address, &vs2, &opt, &vm};
+
+  DISASM_VMEM_INSN(vle,    v_ld_unit,   );
+  DISASM_VMEM_INSN(vlse,   v_ld_stride, );
+  DISASM_VMEM_INSN(vlxei,  v_ld_index,  );
+  DISASM_VMEM_INSN(vle,    v_ld_unit, ff);
+  DISASM_VMEM_INSN(vse,    v_st_unit,   );
+  DISASM_VMEM_INSN(vsse,   v_st_stride, );
+  DISASM_VMEM_INSN(vsxei,  v_st_index,  );
+  DISASM_VMEM_INSN(vsuxei, v_st_index,  );
+
+  #undef DISASM_VMEM_INSN
+
+  // handle vector segment load/store
+  for (size_t elt = 0; elt <= 7; ++elt) {
+    const custom_fmt_t template_insn[] = {
+      {match_vle8_v,   mask_vle8_v,   "vlseg%de%d.v",   v_ld_unit},
+      {match_vse8_v,   mask_vse8_v,   "vsseg%de%d.v",   v_st_unit},
+
+      {match_vlse8_v,  mask_vlse8_v,  "vlsseg%de%d.v",  v_ld_stride},
+      {match_vsse8_v,  mask_vsse8_v,  "vssseg%de%d.v",  v_st_stride},
+
+      {match_vlxei8_v, mask_vlxei8_v, "vlxseg%dei%d.v", v_ld_index},
+      {match_vsxei8_v, mask_vsxei8_v, "vsxseg%dei%d.v", v_st_index},
+
+      {match_vle8ff_v, mask_vle8ff_v, "vlseg%de%dff.v", v_ld_unit}
+    };
+
+    reg_t elt_map[] = {0x00000000, 0x00005000, 0x00006000, 0x00007000,
+                       0x10000000, 0x10005000, 0x10006000, 0x10007000};
+
+    for (size_t nf = 1; nf <= 7; ++nf) {
+      for (auto item : template_insn) {
+        const reg_t match_nf = nf << 29;
+        char buf[128];
+        sprintf(buf, item.fmt, nf + 1, 8 << elt);
+        add_insn(new disasm_insn_t(
+          buf,
+          ((item.match | match_nf) & ~mask_vldst) | elt_map[elt],
+          item.mask | mask_nf,
+          item.arg
+          ));
+      }
+    }
+
+    //handle whole register load
+    if (elt >= 4)
+        continue;
+
+    const custom_fmt_t template_insn2[] = {
+      {match_vl1re8_v,   mask_vl1re8_v,   "vl%dre%d.v",   v_ld_unit},
+    };
+
+    for (reg_t i = 0, nf = 7; i < 4; i++, nf >>= 1) {
+      for (auto item : template_insn2) {
+        const reg_t match_nf = nf << 29;
+        char buf[128];
+        sprintf(buf, item.fmt, nf + 1, 8 << elt);
+        add_insn(new disasm_insn_t(
+          buf,
+          item.match | match_nf | elt_map[elt],
+          item.mask | mask_nf,
+          item.arg
+        ));
+      }
+    }
+  }
+
+  #define DISASM_ST_WHOLE_INSN(name, nf) \
+    add_insn(new disasm_insn_t(#name, match_vs1r_v | (nf << 29), \
+                                      mask_vs1r_v | mask_nf, \
+                                      {&vs3, &v_address}));
+  DISASM_ST_WHOLE_INSN(vs1r.v, 0);
+  DISASM_ST_WHOLE_INSN(vs2r.v, 1);
+  DISASM_ST_WHOLE_INSN(vs4r.v, 3);
+  DISASM_ST_WHOLE_INSN(vs8r.v, 7);
+
+  #undef DISASM_ST_WHOLE_INSN
+
+  #define DISASM_OPIV_VXI_INSN(name, sign, suf) \
+    add_insn(new disasm_insn_t(#name "." #suf "v", \
+                match_##name##_##suf##v, mask_##name##_##suf##v, \
+                {&vd, &vs2, &vs1, &opt, &vm})); \
+    add_insn(new disasm_insn_t(#name "." #suf "x", \
+                match_##name##_##suf##x, mask_##name##_##suf##x, \
+                {&vd, &vs2, &xrs1, &opt, &vm})); \
+    if (sign) \
+    add_insn(new disasm_insn_t(#name "." #suf "i", \
+                match_##name##_##suf##i, mask_##name##_##suf##i, \
+                {&vd, &vs2, &v_simm5, &opt, &vm})); \
+    else \
+    add_insn(new disasm_insn_t(#name "." #suf "i", \
+                match_##name##_##suf##i, mask_##name##_##suf##i, \
+                {&vd, &vs2, &zimm5, &opt, &vm}));
+
+  #define DISASM_OPIV_VX__INSN(name, sign) \
+    add_insn(new disasm_insn_t(#name ".vv", match_##name##_vv, mask_##name##_vv, \
+                {&vd, &vs2, &vs1, &opt, &vm})); \
+    add_insn(new disasm_insn_t(#name ".vx", match_##name##_vx, mask_##name##_vx, \
+                {&vd, &vs2, &xrs1, &opt, &vm})); \
+
+  #define DISASM_OPIV__XI_INSN(name, sign) \
+    add_insn(new disasm_insn_t(#name ".vx", match_##name##_vx, mask_##name##_vx, \
+                {&vd, &vs2, &xrs1, &opt, &vm})); \
+    if (sign) \
+    add_insn(new disasm_insn_t(#name ".vi", match_##name##_vi, mask_##name##_vi, \
+                {&vd, &vs2, &v_simm5, &opt, &vm})); \
+    else \
+    add_insn(new disasm_insn_t(#name ".vi", match_##name##_vi, mask_##name##_vi, \
+                {&vd, &vs2, &zimm5, &opt, &vm}));
+
+  #define DISASM_OPIV_V___INSN(name, sign) \
+    add_insn(new disasm_insn_t(#name ".vv", match_##name##_vv, mask_##name##_vv, \
+                {&vd, &vs2, &vs1, &opt, &vm}));
+
+  #define DISASM_OPIV_S___INSN(name, sign) \
+    add_insn(new disasm_insn_t(#name ".vs", match_##name##_vs, mask_##name##_vs, \
+                {&vd, &vs2, &vs1, &opt, &vm}));
+
+  #define DISASM_OPIV_W___INSN(name, sign) \
+    add_insn(new disasm_insn_t(#name ".wv", match_##name##_wv, mask_##name##_wv, \
+                {&vd, &vs2, &vs1, &opt, &vm})); \
+    add_insn(new disasm_insn_t(#name ".wx", match_##name##_wx, mask_##name##_wx, \
+                {&vd, &vs2, &xrs1, &opt, &vm}));
+
+  #define DISASM_OPIV_M___INSN(name, sign) \
+    add_insn(new disasm_insn_t(#name ".mm", match_##name##_mm, mask_##name##_mm, \
+                {&vd, &vs2, &vs1}));
+
+  #define DISASM_OPIV__X__INSN(name, sign) \
+    add_insn(new disasm_insn_t(#name ".vx", match_##name##_vx, mask_##name##_vx, \
+                {&vd, &vs2, &xrs1, &opt, &vm}));
+
+  #define DISASM_OPIV_VXIM_INSN(name, sign, has_vm) \
+    add_insn(new disasm_insn_t(#name ".vvm", match_##name##_vvm, \
+                mask_##name##_vvm | mask_vm, \
+                {&vd, &vs2, &vs1, &v0})); \
+    add_insn(new disasm_insn_t(#name ".vxm", match_##name##_vxm, \
+                mask_##name##_vxm | mask_vm, \
+                {&vd, &vs2, &xrs1, &v0})); \
+    add_insn(new disasm_insn_t(#name ".vim", match_##name##_vim, \
+                mask_##name##_vim | mask_vm, \
+                {&vd, &vs2, &v_simm5, &v0})); \
+    if (has_vm) { \
+        add_insn(new disasm_insn_t(#name ".vv", \
+                    match_##name##_vvm | mask_vm, \
+                    mask_##name##_vvm | mask_vm, \
+                    {&vd, &vs2, &vs1})); \
+        add_insn(new disasm_insn_t(#name ".vx", \
+                    match_##name##_vxm | mask_vm, \
+                    mask_##name##_vxm | mask_vm, \
+                    {&vd, &vs2, &xrs1})); \
+        add_insn(new disasm_insn_t(#name ".vi", \
+                    match_##name##_vim | mask_vm, \
+                    mask_##name##_vim | mask_vm, \
+                    {&vd, &vs2, &v_simm5})); \
+    }
+
+  #define DISASM_OPIV_VX_M_INSN(name, sign, has_vm) \
+    add_insn(new disasm_insn_t(#name ".vvm", match_##name##_vvm, \
+                mask_##name##_vvm | mask_vm, \
+                {&vd, &vs2, &vs1, &v0})); \
+    add_insn(new disasm_insn_t(#name ".vxm", match_##name##_vxm, \
+                mask_##name##_vxm | mask_vm, \
+                {&vd, &vs2, &xrs1, &v0})); \
+    if (has_vm) { \
+        add_insn(new disasm_insn_t(#name ".vv", \
+                    match_##name##_vvm | mask_vm, \
+                    mask_##name##_vvm | mask_vm, \
+                    {&vd, &vs2, &vs1})); \
+        add_insn(new disasm_insn_t(#name ".vx", \
+                    match_##name##_vxm | mask_vm, \
+                    mask_##name##_vxm | mask_vm, \
+                    {&vd, &vs2, &xrs1})); \
+    } \
+
+  //OPIVV/OPIVX/OPIVI
+  //0b00_0000
+  DISASM_OPIV_VXI_INSN(vadd,         1, v);
+  DISASM_OPIV_VX__INSN(vsub,         1);
+  DISASM_OPIV__XI_INSN(vrsub,        1);
+  DISASM_OPIV_VX__INSN(vminu,        0);
+  DISASM_OPIV_VX__INSN(vmin,         1);
+  DISASM_OPIV_VX__INSN(vmaxu,        1);
+  DISASM_OPIV_VX__INSN(vmax,         0);
+  DISASM_OPIV_VXI_INSN(vand,         1, v);
+  DISASM_OPIV_VXI_INSN(vor,          1, v);
+  DISASM_OPIV_VXI_INSN(vxor,         1, v);
+  DISASM_OPIV_VXI_INSN(vrgather,     0, v);
+  DISASM_OPIV_V___INSN(vrgatherei16, 0);
+  DISASM_OPIV__XI_INSN(vslideup,     0);
+  DISASM_OPIV__XI_INSN(vslidedown,   0);
+
+  //0b01_0000
+  //DISASM_OPIV_VXIM_INSN(vadc,    1, 0);
+  DISASM_OPIV_VXIM_INSN(vmadc,   1, 1);
+  //DISASM_OPIV_VX_M_INSN(vsbc,    1, 0);
+  DISASM_OPIV_VX_M_INSN(vmsbc,   1, 1);
+  DISASM_OPIV_VXIM_INSN(vmerge,  1, 0);
+  DISASM_INSN("vmv.v.i", vmv_v_i, 0, {&vd, &v_simm5});
+  DISASM_INSN("vmv.v.v", vmv_v_v, 0, {&vd, &vs1});
+  DISASM_INSN("vmv.v.x", vmv_v_x, 0, {&vd, &xrs1});
+  DISASM_OPIV_VXI_INSN(vmseq,     1, v);
+  DISASM_OPIV_VXI_INSN(vmsne,     1, v);
+  DISASM_OPIV_VX__INSN(vmsltu,    0);
+  DISASM_OPIV_VX__INSN(vmslt,     1);
+  DISASM_OPIV_VXI_INSN(vmsleu,    0, v);
+  DISASM_OPIV_VXI_INSN(vmsle,     1, v);
+  DISASM_OPIV__XI_INSN(vmsgtu,    0);
+  DISASM_OPIV__XI_INSN(vmsgt,     1);
+
+  //0b10_0000
+  DISASM_OPIV_VXI_INSN(vsaddu,    0, v);
+  DISASM_OPIV_VXI_INSN(vsadd,     1, v);
+  DISASM_OPIV_VX__INSN(vssubu,    0);
+  DISASM_OPIV_VX__INSN(vssub,     1);
+  DISASM_OPIV_VXI_INSN(vsll,      1, v);
+  DISASM_INSN("vmv1r.v", vmv1r_v, 0, {&vd, &vs2});
+  DISASM_INSN("vmv2r.v", vmv2r_v, 0, {&vd, &vs2});
+  DISASM_INSN("vmv4r.v", vmv4r_v, 0, {&vd, &vs2});
+  DISASM_INSN("vmv8r.v", vmv8r_v, 0, {&vd, &vs2});
+  DISASM_OPIV_VX__INSN(vsmul,     1);
+  DISASM_OPIV_VXI_INSN(vsrl,      0, v);
+  DISASM_OPIV_VXI_INSN(vsra,      0, v);
+  DISASM_OPIV_VXI_INSN(vssrl,     0, v);
+  DISASM_OPIV_VXI_INSN(vssra,     0, v);
+  DISASM_OPIV_VXI_INSN(vnsrl,     0, w);
+  DISASM_OPIV_VXI_INSN(vnsra,     0, w);
+  DISASM_OPIV_VXI_INSN(vnclipu,   0, w);
+  DISASM_OPIV_VXI_INSN(vnclip,    0, w);
+
+  //0b11_0000
+  DISASM_OPIV_S___INSN(vwredsumu, 0);
+  DISASM_OPIV_S___INSN(vwredsum,  1);
+  DISASM_OPIV_V___INSN(vdotu,     0);
+  DISASM_OPIV_V___INSN(vdot,      1);
+
+  //OPMVV/OPMVX
+  //0b00_0000
+  //DISASM_OPIV_VX__INSN(vaaddu,    0);
+  DISASM_OPIV_VX__INSN(vaadd,     0);
+  //DISASM_OPIV_VX__INSN(vasubu,    0);
+  DISASM_OPIV_VX__INSN(vasub,     0);
+
+  DISASM_OPIV_S___INSN(vredsum,   1);
+  DISASM_OPIV_S___INSN(vredand,   1);
+  DISASM_OPIV_S___INSN(vredor,    1);
+  DISASM_OPIV_S___INSN(vredxor,   1);
+  DISASM_OPIV_S___INSN(vredminu,  0);
+  DISASM_OPIV_S___INSN(vredmin,   1);
+  DISASM_OPIV_S___INSN(vredmaxu,  0);
+  DISASM_OPIV_S___INSN(vredmax,   1);
+  //DISASM_OPIV__X__INSN(vslide1up,  1);
+  DISASM_OPIV__X__INSN(vslide1down,1);
+
+  //0b01_0000
+  //VWXUNARY0
+  DISASM_INSN("vmv.x.s", vmv_x_s, 0, {&xrd, &vs2});
+  DISASM_INSN("vpopc.m", vpopc_m, 0, {&xrd, &vs2, &opt, &vm});
+  DISASM_INSN("vfirst.m", vfirst_m, 0, {&xrd, &vs2, &opt, &vm});
+
+  //VRXUNARY0
+  DISASM_INSN("vmv.s.x", vmv_s_x, 0, {&vd, &xrs1});
+
+  //VXUNARY0
+  DISASM_INSN("vzext.vf2", vzext_vf2, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vsext.vf2", vsext_vf2, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vzext.vf4", vzext_vf4, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vsext.vf4", vsext_vf4, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vzext.vf8", vzext_vf8, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vsext.vf8", vsext_vf8, 0, {&vd, &vs2, &opt, &vm});
+
+  //VMUNARY0
+  DISASM_INSN("vmsbf.m", vmsbf_m, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vmsof.m", vmsof_m, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vmsif.m", vmsif_m, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("viota.m", viota_m, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vid.v", vid_v, 0, {&vd, &opt, &vm});
+
+  // vcompress.vm is registered with three register operands only, i.e.
+  // without the trailing &opt/&vm pair used by the entries above.
+  DISASM_INSN("vcompress.vm", vcompress_vm, 0, {&vd, &vs2, &vs1});
+
+  DISASM_OPIV_M___INSN(vmandnot,  1);
+  DISASM_OPIV_M___INSN(vmand,     1);
+  DISASM_OPIV_M___INSN(vmor,      1);
+  DISASM_OPIV_M___INSN(vmxor,     1);
+  DISASM_OPIV_M___INSN(vmornot,   1);
+  DISASM_OPIV_M___INSN(vmnand,    1);
+  DISASM_OPIV_M___INSN(vmnor,     1);
+  DISASM_OPIV_M___INSN(vmxnor,    1);
+
+  //0b10_0000
+  //DISASM_OPIV_VX__INSN(vdivu,     0);
+  DISASM_OPIV_VX__INSN(vdiv,      1);
+  DISASM_OPIV_VX__INSN(vremu,     0);
+  DISASM_OPIV_VX__INSN(vrem,      1);
+  //DISASM_OPIV_VX__INSN(vmulhu,    0);
+  DISASM_OPIV_VX__INSN(vmul,      1);
+  //DISASM_OPIV_VX__INSN(vmulhsu,   0);
+  DISASM_OPIV_VX__INSN(vmulh,     1);
+  DISASM_OPIV_VX__INSN(vmadd,     1);
+  DISASM_OPIV_VX__INSN(vnmsub,    1);
+  DISASM_OPIV_VX__INSN(vmacc,     1);
+  DISASM_OPIV_VX__INSN(vnmsac,    1);
+
+  //0b11_0000
+  DISASM_OPIV_VX__INSN(vwaddu,    0);
+  DISASM_OPIV_VX__INSN(vwadd,     1);
+  DISASM_OPIV_VX__INSN(vwsubu,    0);
+  DISASM_OPIV_VX__INSN(vwsub,     1);
+  DISASM_OPIV_W___INSN(vwaddu,    0);
+  DISASM_OPIV_W___INSN(vwadd,     1);
+  DISASM_OPIV_W___INSN(vwsubu,    0);
+  DISASM_OPIV_W___INSN(vwsub,     1);
+  DISASM_OPIV_VX__INSN(vwmulu,    0);
+  DISASM_OPIV_VX__INSN(vwmulsu,   0);
+  DISASM_OPIV_VX__INSN(vwmul,     1);
+  DISASM_OPIV_VX__INSN(vwmaccu,   0);
+  DISASM_OPIV_VX__INSN(vwmacc,    1);
+  DISASM_OPIV__X__INSN(vwmaccus,  1);
+  DISASM_OPIV_VX__INSN(vwmaccsu,  0);
+
+  #undef DISASM_OPIV_VXI_INSN
+  #undef DISASM_OPIV_VX__INSN
+  #undef DISASM_OPIV__XI_INSN
+  #undef DISASM_OPIV_V___INSN
+  #undef DISASM_OPIV_S___INSN
+  #undef DISASM_OPIV_W___INSN
+  #undef DISASM_OPIV_M___INSN
+  #undef DISASM_OPIV__X__INSN
+  #undef DISASM_OPIV_VXIM_INSN
+  #undef DISASM_OPIV_VX_M_INSN
+
+  #define DISASM_OPIV_VF_INSN(name) \
+      add_insn(new disasm_insn_t(#name ".vv", match_##name##_vv, mask_##name##_vv, \
+                  {&vd, &vs2, &vs1, &opt, &vm})); \
+      add_insn(new disasm_insn_t(#name ".vf", match_##name##_vf, mask_##name##_vf, \
+                  {&vd, &vs2, &frs1, &opt, &vm})); \
+
+  #define DISASM_OPIV_WF_INSN(name) \
+      add_insn(new disasm_insn_t(#name ".wv", match_##name##_wv, mask_##name##_wv, \
+                  {&vd, &vs2, &vs1, &opt, &vm})); \
+      add_insn(new disasm_insn_t(#name ".wf", match_##name##_wf, mask_##name##_wf, \
+                  {&vd, &vs2, &frs1, &opt, &vm})); \
+
+  #define DISASM_OPIV_V__INSN(name) \
+      add_insn(new disasm_insn_t(#name ".vv", match_##name##_vv, mask_##name##_vv, \
+                  {&vd, &vs2, &vs1, &opt, &vm}));
+
+  #define DISASM_OPIV_S__INSN(name) \
+      add_insn(new disasm_insn_t(#name ".vs", match_##name##_vs, mask_##name##_vs, \
+                  {&vd, &vs2, &vs1, &opt, &vm}));
+
+  #define DISASM_OPIV__F_INSN(name) \
+    add_insn(new disasm_insn_t(#name ".vf", match_##name##_vf, mask_##name##_vf, \
+                {&vd, &vs2, &frs1, &opt, &vm})); \
+
+  #define DISASM_VFUNARY0_INSN(name, suf) \
+    add_insn(new disasm_insn_t(#name "cvt.xu.f." #suf, \
+                match_##name##cvt_xu_f_##suf, mask_##name##cvt_xu_f_##suf, \
+                {&vd, &vs2, &opt, &vm})); \
+    add_insn(new disasm_insn_t(#name "cvt.x.f." #suf, \
+                match_##name##cvt_x_f_##suf, mask_##name##cvt_x_f_##suf, \
+                {&vd, &vs2, &opt, &vm})); \
+    add_insn(new disasm_insn_t(#name "cvt.f.xu." #suf, \
+                match_##name##cvt_f_xu_##suf, mask_##name##cvt_f_xu_##suf, \
+                {&vd, &vs2, &opt, &vm})); \
+    add_insn(new disasm_insn_t(#name "cvt.f.x." #suf, \
+                match_##name##cvt_f_x_##suf, mask_##name##cvt_f_x_##suf, \
+                {&vd, &vs2, &opt, &vm})); \
+    add_insn(new disasm_insn_t(#name "cvt.rtz.xu.f." #suf, \
+                match_##name##cvt_rtz_xu_f_##suf, mask_##name##cvt_rtz_xu_f_##suf, \
+                {&vd, &vs2, &opt, &vm})); \
+    add_insn(new disasm_insn_t(#name "cvt.rtz.x.f." #suf, \
+                match_##name##cvt_rtz_x_f_##suf, mask_##name##cvt_rtz_x_f_##suf, \
+                {&vd, &vs2, &opt, &vm})); \
+
+  //OPFVV/OPFVF
+  //0b00_0000
+  DISASM_OPIV_VF_INSN(vfadd);
+  DISASM_OPIV_S__INSN(vfredsum);
+  DISASM_OPIV_VF_INSN(vfsub);
+  DISASM_OPIV_S__INSN(vfredosum);
+  DISASM_OPIV_VF_INSN(vfmin);
+  DISASM_OPIV_S__INSN(vfredmin);
+  DISASM_OPIV_VF_INSN(vfmax);
+  DISASM_OPIV_S__INSN(vfredmax);
+  DISASM_OPIV_VF_INSN(vfsgnj);
+  DISASM_OPIV_VF_INSN(vfsgnjn);
+  DISASM_OPIV_VF_INSN(vfsgnjx);
+  DISASM_INSN("vfmv.f.s", vfmv_f_s, 0, {&frd, &vs2});
+  DISASM_INSN("vfmv.s.f", vfmv_s_f, mask_vfmv_s_f, {&vd, &frs1});
+  DISASM_OPIV__F_INSN(vfslide1up);
+  DISASM_OPIV__F_INSN(vfslide1down);
+
+  //0b01_0000
+  DISASM_INSN("vfmerge.vfm", vfmerge_vfm, 0, {&vd, &vs2, &frs1, &v0});
+  DISASM_INSN("vfmv.v.f", vfmv_v_f, 0, {&vd, &frs1});
+  DISASM_OPIV_VF_INSN(vmfeq);
+  DISASM_OPIV_VF_INSN(vmfle);
+  DISASM_OPIV_VF_INSN(vmflt);
+  DISASM_OPIV_VF_INSN(vmfne);
+  DISASM_OPIV__F_INSN(vmfgt);
+  DISASM_OPIV__F_INSN(vmfge);
+
+  //0b10_0000
+  DISASM_OPIV_VF_INSN(vfdiv);
+  DISASM_OPIV__F_INSN(vfrdiv);
+
+  //vfunary0
+  //DISASM_VFUNARY0_INSN(vf,  v);
+
+  DISASM_VFUNARY0_INSN(vfw, v);
+  DISASM_INSN("vfwcvt.f.f.v", vfwcvt_f_f_v, 0, {&vd, &vs2, &opt, &vm});
+
+  DISASM_VFUNARY0_INSN(vfn, w);
+  DISASM_INSN("vfncvt.f.f.w", vfncvt_f_f_w, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vfncvt.rod.f.f.w", vfncvt_rod_f_f_w, 0, {&vd, &vs2, &opt, &vm});
+
+  //vfunary1
+  DISASM_INSN("vfsqrt.v", vfsqrt_v, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vfrsqrte7.v", vfrsqrte7_v, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vfrece7.v", vfrece7_v, 0, {&vd, &vs2, &opt, &vm});
+  DISASM_INSN("vfclass.v", vfclass_v, 0, {&vd, &vs2, &opt, &vm});
+
+  DISASM_OPIV_VF_INSN(vfmul);
+  DISASM_OPIV__F_INSN(vfrsub);
+  DISASM_OPIV_VF_INSN(vfmadd);
+  DISASM_OPIV_VF_INSN(vfnmadd);
+  DISASM_OPIV_VF_INSN(vfmsub);
+  DISASM_OPIV_VF_INSN(vfnmsub);
+  DISASM_OPIV_VF_INSN(vfmacc);
+  DISASM_OPIV_VF_INSN(vfnmacc);
+  DISASM_OPIV_VF_INSN(vfmsac);
+  DISASM_OPIV_VF_INSN(vfnmsac);
+
+  //0b11_0000
+  DISASM_OPIV_VF_INSN(vfwadd);
+  DISASM_OPIV_S__INSN(vfwredsum);
+  DISASM_OPIV_VF_INSN(vfwsub);
+  DISASM_OPIV_S__INSN(vfwredosum);
+  DISASM_OPIV_WF_INSN(vfwadd);
+  DISASM_OPIV_WF_INSN(vfwsub);
+  DISASM_OPIV_VF_INSN(vfwmul);
+  DISASM_OPIV_V__INSN(vfdot);
+  DISASM_OPIV_VF_INSN(vfwmacc);
+  DISASM_OPIV_VF_INSN(vfwnmacc);
+  DISASM_OPIV_VF_INSN(vfwmsac);
+  DISASM_OPIV_VF_INSN(vfwnmsac);
+
+  #undef DISASM_OPIV_VF_INSN
+  #undef DISASM_OPIV_V__INSN
+  #undef DISASM_OPIV__F_INSN
+  #undef DISASM_OPIV_S__INSN
+  #undef DISASM_OPIV_WF_INSN  // helper that registers the .wv/.wf forms
+  #undef DISASM_VFUNARY0_INSN
+
+  // vector amo: a wd and a non-wd entry per operation and per ei8..ei64 width
+  std::vector<const arg_t *> v_fmt_amo_wd = {&vd, &v_address, &vs2, &vd, &opt, &vm};
+  std::vector<const arg_t *> v_fmt_amo = {&x0, &v_address, &vs2, &vd, &opt, &vm};
+  for (size_t elt = 0; elt <= 3; ++elt) {
+    const custom_fmt_t template_insn[] = {
+      {match_vamoaddei8_v | mask_wd,   mask_vamoaddei8_v | mask_wd,
+         "%sei%d.v", v_fmt_amo_wd},
+      {match_vamoaddei8_v,   mask_vamoaddei8_v | mask_wd,
+         "%sei%d.v", v_fmt_amo},
+    };
+    std::pair<const char*, reg_t> amo_map[] = {
+        {"vamoswap", 0x01ul << 27},
+        {"vamoadd",  0x00ul << 27},
+        {"vamoxor",  0x04ul << 27},
+        {"vamoand",  0x0cul << 27},
+        {"vamoor",   0x08ul << 27},
+        {"vamomin",  0x10ul << 27},
+        {"vamomax",  0x14ul << 27},
+        {"vamominu", 0x18ul << 27},
+        {"vamomaxu", 0x1cul << 27}};
+    const reg_t elt_map[] = {0x0ul << 12,  0x5ul << 12,
+                             0x6ul <<12, 0x7ul << 12};
+
+    for (size_t idx = 0; idx < sizeof(amo_map) / sizeof(amo_map[0]); ++idx) {
+      for (const auto& item : template_insn) {  // by reference: no per-iteration copy
+        char buf[128];
+        snprintf(buf, sizeof(buf), item.fmt, amo_map[idx].first, 8 << elt);  // bounded write
+        add_insn(new disasm_insn_t(buf,
+                  item.match | amo_map[idx].second | elt_map[elt],
+                  item.mask,
+                  item.arg));
+      }
+    }
+  }
+
+#endif
+
+  if (xlen == 32) {
+    DISASM_INSN("c.flw", c_flw, 0, {&rvc_fp_rs2s, &rvc_lw_address});
+    DISASM_INSN("c.flwsp", c_flwsp, 0, {&frd, &rvc_lwsp_address});
+    DISASM_INSN("c.fsw", c_fsw, 0, {&rvc_fp_rs2s, &rvc_lw_address});
+    DISASM_INSN("c.fswsp", c_fswsp, 0, {&rvc_fp_rs2, &rvc_swsp_address});
+    DISASM_INSN("c.jal", c_jal, 0, {&rvc_jump_target});
+  } else {
+    DISASM_INSN("c.ld", c_ld, 0, {&rvc_rs2s, &rvc_ld_address});
+    DISASM_INSN("c.ldsp", c_ldsp, 0, {&xrd, &rvc_ldsp_address});
+    DISASM_INSN("c.sd", c_sd, 0, {&rvc_rs2s, &rvc_ld_address});
+    DISASM_INSN("c.sdsp", c_sdsp, 0, {&rvc_rs2, &rvc_sdsp_address});
+    DISASM_INSN("c.addiw", c_addiw, 0, {&xrd, &rvc_imm});
+  }
+
+  // Xpulpimg extension
+  DEFINE_PLOAD_IRPOST(p_lb_irpost);
+  DEFINE_PLOAD_IRPOST(p_lbu_irpost);
+  DEFINE_PLOAD_IRPOST(p_lh_irpost);
+  DEFINE_PLOAD_IRPOST(p_lhu_irpost);
+  DEFINE_PLOAD_IRPOST(p_lw_irpost);
+  DEFINE_PLOAD_RRPOST(p_lb_rrpost);
+  DEFINE_PLOAD_RRPOST(p_lbu_rrpost);
+  DEFINE_PLOAD_RRPOST(p_lh_rrpost);
+  DEFINE_PLOAD_RRPOST(p_lhu_rrpost);
+  DEFINE_PLOAD_RRPOST(p_lw_rrpost);
+  DEFINE_PLOAD_RR(p_lb_rr);
+  DEFINE_PLOAD_RR(p_lbu_rr);
+  DEFINE_PLOAD_RR(p_lh_rr);
+  DEFINE_PLOAD_RR(p_lhu_rr);
+  DEFINE_PLOAD_RR(p_lw_rr);
+  DEFINE_PSTORE_IRPOST(p_sb_irpost);
+  DEFINE_PSTORE_IRPOST(p_sh_irpost);
+  DEFINE_PSTORE_IRPOST(p_sw_irpost);
+  DEFINE_PSTORE_RRPOST(p_sb_rrpost);
+  DEFINE_PSTORE_RRPOST(p_sh_rrpost);
+  DEFINE_PSTORE_RRPOST(p_sw_rrpost);
+  DEFINE_PSTORE_RR(p_sb_rr);
+  DEFINE_PSTORE_RR(p_sh_rr);
+  DEFINE_PSTORE_RR(p_sw_rr);
+  DEFINE_R1TYPE(p_abs);
+  DEFINE_RTYPE(p_slet);
+  DEFINE_RTYPE(p_sletu);
+  DEFINE_RTYPE(p_min);
+  DEFINE_RTYPE(p_minu);
+  DEFINE_RTYPE(p_max);
+  DEFINE_RTYPE(p_maxu);
+  DEFINE_PI0TYPE(p_clip);
+  DEFINE_PI0TYPE(p_clipu);
+  DEFINE_RTYPE(p_clipr);
+  DEFINE_RTYPE(p_clipur);
+  DEFINE_PBTYPE(p_beqimm);
+  DEFINE_PBTYPE(p_bneimm);
+  DEFINE_RTYPE(p_mac);
+  DEFINE_RTYPE(p_msu);
+
+  // xpulpbitop
+  DEFINE_PLUI2TYPE(p_extract);
+  DEFINE_RTYPE(p_extractr);
+  DEFINE_PLUI2TYPE(p_extractu);
+  DEFINE_RTYPE(p_extractur);
+  DEFINE_PLUI2TYPE(p_insert);
+  DEFINE_RTYPE(p_insertr);
+  DEFINE_PLUI2TYPE(p_bset);
+  DEFINE_RTYPE(p_bsetr);
+  DEFINE_PLUI2TYPE(p_bclr);
+  DEFINE_RTYPE(p_bclrr);
+  // xpulpbitopsmall (subset of xpulpbitop)
+  DEFINE_R1TYPE(p_exths);
+  DEFINE_R1TYPE(p_exthz);
+  DEFINE_R1TYPE(p_extbs);
+  DEFINE_R1TYPE(p_extbz);
+  DEFINE_RTYPE(p_ror);
+  DEFINE_R1TYPE(p_ff1);
+  DEFINE_R1TYPE(p_fl1);
+  DEFINE_R1TYPE(p_clb);
+  DEFINE_R1TYPE(p_cnt);
+
+  // xpulpbitrev
+  DEFINE_PLUI2TYPE(p_bitrev);
+
+  // xpulpmulrnhi
+  DEFINE_PR2LUITYPE(p_muluN);
+  DEFINE_PR2LUITYPE(p_mulhhuN);
+  DEFINE_PR2LUITYPE(p_mulsN);
+  DEFINE_PR2LUITYPE(p_mulhhsN);
+  DEFINE_PR2LUITYPE(p_muluRN);
+  DEFINE_PR2LUITYPE(p_mulhhuRN);
+  DEFINE_PR2LUITYPE(p_mulsRN);
+  DEFINE_PR2LUITYPE(p_mulhhsRN);
+
+  // xpulpmacrnhi
+  DEFINE_PR2LUITYPE(p_macuN);
+  DEFINE_PR2LUITYPE(p_machhuN);
+  DEFINE_PR2LUITYPE(p_macsN);
+  DEFINE_PR2LUITYPE(p_machhsN);
+  DEFINE_PR2LUITYPE(p_macuRN);
+  DEFINE_PR2LUITYPE(p_machhuRN);
+  DEFINE_PR2LUITYPE(p_macsRN);
+  DEFINE_PR2LUITYPE(p_machhsRN);
+
+  // xpulpaddsubrn
+  DEFINE_PR2LUITYPE(p_addN);
+  DEFINE_PR2LUITYPE(p_adduN);
+  DEFINE_PR2LUITYPE(p_addRN);
+  DEFINE_PR2LUITYPE(p_adduRN);
+  DEFINE_PR2LUITYPE(p_subN);
+  DEFINE_PR2LUITYPE(p_subuN);
+  DEFINE_PR2LUITYPE(p_subRN);
+  DEFINE_PR2LUITYPE(p_subuRN);
+  DEFINE_RTYPE(p_addNr);
+  DEFINE_RTYPE(p_adduNr);
+  DEFINE_RTYPE(p_addRNr);
+  DEFINE_RTYPE(p_adduRNr);
+  DEFINE_RTYPE(p_subNr);
+  DEFINE_RTYPE(p_subuNr);
+  DEFINE_RTYPE(p_subRNr);
+  DEFINE_RTYPE(p_subuRNr);
+
+  // xpulphwloop
+  DISASM_INSN("lp_starti", lp_starti, 0, {&p_loop, &p_uimmL});
+  DISASM_INSN("lp_endi", lp_endi, 0, {&p_loop, &p_uimmL});
+  DISASM_INSN("lp_counti", lp_counti, 0, {&p_loop, &p_uimmL});
+  DISASM_INSN("lp_count", lp_count, 0, {&p_loop, &xrs1});
+  DISASM_INSN("lp_setup", lp_setup, 0, {&p_loop, &xrs1, &p_uimmL});
+  DISASM_INSN("lp_setupi", lp_setupi, 0, {&p_loop, &p_uimmL, &p_uimmS});
+
+  // xpulpvect
+  DEFINE_RTYPE(pv_add_h);
+  DEFINE_RTYPE(pv_add_sc_h);
+  DEFINE_PI1STYPE(pv_add_sci_h);
+  DEFINE_RTYPE(pv_add_b);
+  DEFINE_RTYPE(pv_add_sc_b);
+  DEFINE_PI1STYPE(pv_add_sci_b);
+  DEFINE_RTYPE(pv_add_h_div2);
+  DEFINE_RTYPE(pv_add_h_div4);
+  DEFINE_RTYPE(pv_add_h_div8);
+  DEFINE_RTYPE(pv_sub_h);
+  DEFINE_RTYPE(pv_sub_sc_h);
+  DEFINE_PI1STYPE(pv_sub_sci_h);
+  DEFINE_RTYPE(pv_sub_b);
+  DEFINE_RTYPE(pv_sub_sc_b);
+  DEFINE_PI1STYPE(pv_sub_sci_b);
+  DEFINE_RTYPE(pv_sub_h_div2);  // div2/div4/div8, parallel to pv_add_h_div*
+  DEFINE_RTYPE(pv_sub_h_div4);
+  DEFINE_RTYPE(pv_sub_h_div8);
+  DEFINE_RTYPE(pv_avg_h);
+  DEFINE_RTYPE(pv_avg_sc_h);
+  DEFINE_PI1STYPE(pv_avg_sci_h);
+  DEFINE_RTYPE(pv_avg_b);
+  DEFINE_RTYPE(pv_avg_sc_b);
+  DEFINE_PI1STYPE(pv_avg_sci_b);
+  DEFINE_RTYPE(pv_avgu_h);
+  DEFINE_RTYPE(pv_avgu_sc_h);
+  DEFINE_PI1ZTYPE(pv_avgu_sci_h);
+  DEFINE_RTYPE(pv_avgu_b);
+  DEFINE_RTYPE(pv_avgu_sc_b);
+  DEFINE_PI1ZTYPE(pv_avgu_sci_b);
+
+  DEFINE_RTYPE(pv_min_h);
+  DEFINE_RTYPE(pv_min_sc_h);
+  DEFINE_PI1STYPE(pv_min_sci_h);
+  DEFINE_RTYPE(pv_min_b);
+  DEFINE_RTYPE(pv_min_sc_b);
+  DEFINE_PI1STYPE(pv_min_sci_b);
+  DEFINE_RTYPE(pv_minu_h);
+  DEFINE_RTYPE(pv_minu_sc_h);
+  DEFINE_PI1ZTYPE(pv_minu_sci_h);
+  DEFINE_RTYPE(pv_minu_b);
+  DEFINE_RTYPE(pv_minu_sc_b);
+  DEFINE_PI1ZTYPE(pv_minu_sci_b);
+  DEFINE_RTYPE(pv_max_h);
+  DEFINE_RTYPE(pv_max_sc_h);
+  DEFINE_PI1STYPE(pv_max_sci_h);
+  DEFINE_RTYPE(pv_max_b);
+  DEFINE_RTYPE(pv_max_sc_b);
+  DEFINE_PI1STYPE(pv_max_sci_b);
+  DEFINE_RTYPE(pv_maxu_h);
+  DEFINE_RTYPE(pv_maxu_sc_h);
+  DEFINE_PI1ZTYPE(pv_maxu_sci_h);
+  DEFINE_RTYPE(pv_maxu_b);
+  DEFINE_RTYPE(pv_maxu_sc_b);
+  DEFINE_PI1ZTYPE(pv_maxu_sci_b);
+  DEFINE_PI1ZTYPE(pv_and_sci_b);
+  DEFINE_R1TYPE(pv_abs_h);
+  DEFINE_R1TYPE(pv_abs_b);
+
+  DEFINE_RTYPE(pv_srl_h);
+  DEFINE_RTYPE(pv_srl_sc_h);
+  DEFINE_PI1ZTYPE(pv_srl_sci_h);
+  DEFINE_RTYPE(pv_srl_b);
+  DEFINE_RTYPE(pv_srl_sc_b);
+  DEFINE_PI1ZTYPE(pv_srl_sci_b);
+  DEFINE_RTYPE(pv_sra_h);
+  DEFINE_RTYPE(pv_sra_sc_h);
+  DEFINE_PI1ZTYPE(pv_sra_sci_h);
+  DEFINE_RTYPE(pv_sra_b);
+  DEFINE_RTYPE(pv_sra_sc_b);
+  DEFINE_PI1ZTYPE(pv_sra_sci_b);
+  DEFINE_RTYPE(pv_sll_h);
+  DEFINE_RTYPE(pv_sll_sc_h);
+  DEFINE_PI1ZTYPE(pv_sll_sci_h);
+  DEFINE_RTYPE(pv_sll_b);
+  DEFINE_RTYPE(pv_sll_sc_b);
+  DEFINE_PI1ZTYPE(pv_sll_sci_b);
+  DEFINE_RTYPE(pv_or_h);
+  DEFINE_RTYPE(pv_or_sc_h);
+  DEFINE_PI1ZTYPE(pv_or_sci_h);
+  DEFINE_RTYPE(pv_or_b);
+  DEFINE_RTYPE(pv_or_sc_b);
+  DEFINE_PI1ZTYPE(pv_or_sci_b);
+  DEFINE_RTYPE(pv_xor_h);
+  DEFINE_RTYPE(pv_xor_sc_h);
+  DEFINE_PI1ZTYPE(pv_xor_sci_h);
+  DEFINE_RTYPE(pv_xor_b);
+  DEFINE_RTYPE(pv_xor_sc_b);
+  DEFINE_PI1ZTYPE(pv_xor_sci_b);
+  DEFINE_RTYPE(pv_and_h);
+  DEFINE_RTYPE(pv_and_sc_h);
+  DEFINE_PI1ZTYPE(pv_and_sci_h);
+  DEFINE_RTYPE(pv_and_b);
+  DEFINE_RTYPE(pv_and_sc_b);
+
+  DEFINE_PI1ZTYPE(pv_extract_h);
+  DEFINE_PI1ZTYPE(pv_extract_b);
+  DEFINE_PI1ZTYPE(pv_extractu_h);
+  DEFINE_PI1ZTYPE(pv_extractu_b);
+  DEFINE_PI1ZTYPE(pv_insert_h);
+  DEFINE_PI1ZTYPE(pv_insert_b);
+
+  DEFINE_RTYPE(pv_dotup_h);
+  DEFINE_RTYPE(pv_dotup_sc_h);
+  DEFINE_PI1ZTYPE(pv_dotup_sci_h);
+  DEFINE_RTYPE(pv_dotup_b);
+  DEFINE_RTYPE(pv_dotup_sc_b);
+  DEFINE_PI1ZTYPE(pv_dotup_sci_b);
+  DEFINE_RTYPE(pv_dotusp_h);
+  DEFINE_RTYPE(pv_dotusp_sc_h);
+  DEFINE_PI1STYPE(pv_dotusp_sci_h);
+  DEFINE_RTYPE(pv_dotusp_b);
+  DEFINE_RTYPE(pv_dotusp_sc_b);
+  DEFINE_PI1STYPE(pv_dotusp_sci_b);
+  DEFINE_RTYPE(pv_dotsp_h);
+  DEFINE_RTYPE(pv_dotsp_sc_h);
+  DEFINE_PI1STYPE(pv_dotsp_sci_h);
+  DEFINE_RTYPE(pv_dotsp_b);
+  DEFINE_RTYPE(pv_dotsp_sc_b);
+  DEFINE_PI1STYPE(pv_dotsp_sci_b);
+  DEFINE_RTYPE(pv_sdotup_h);
+  DEFINE_RTYPE(pv_sdotup_sc_h);
+  DEFINE_PI1ZTYPE(pv_sdotup_sci_h);
+  DEFINE_RTYPE(pv_sdotup_b);
+  DEFINE_RTYPE(pv_sdotup_sc_b);
+  DEFINE_PI1ZTYPE(pv_sdotup_sci_b);
+  DEFINE_RTYPE(pv_sdotusp_h);
+  DEFINE_RTYPE(pv_sdotusp_sc_h);
+  DEFINE_PI1STYPE(pv_sdotusp_sci_h);
+  DEFINE_RTYPE(pv_sdotusp_b);
+  DEFINE_RTYPE(pv_sdotusp_sc_b);
+  DEFINE_PI1STYPE(pv_sdotusp_sci_b);
+  DEFINE_RTYPE(pv_sdotsp_h);
+  DEFINE_RTYPE(pv_sdotsp_sc_h);
+  DEFINE_PI1STYPE(pv_sdotsp_sci_h);
+  DEFINE_RTYPE(pv_sdotsp_b);
+  DEFINE_RTYPE(pv_sdotsp_sc_b);
+  DEFINE_PI1STYPE(pv_sdotsp_sci_b);
+  
+  DEFINE_RTYPE(pv_cmpeq_h);
+  DEFINE_RTYPE(pv_cmpeq_sc_h);
+  DEFINE_PI1ZTYPE(pv_cmpeq_sci_h);
+  DEFINE_RTYPE(pv_cmpeq_b);
+  DEFINE_RTYPE(pv_cmpeq_sc_b);
+  DEFINE_PI1ZTYPE(pv_cmpeq_sci_b);
+  DEFINE_RTYPE(pv_cmpne_h);
+  DEFINE_RTYPE(pv_cmpne_sc_h);
+  DEFINE_PI1ZTYPE(pv_cmpne_sci_h);
+  DEFINE_RTYPE(pv_cmpne_b);
+  DEFINE_RTYPE(pv_cmpne_sc_b);
+  DEFINE_PI1ZTYPE(pv_cmpne_sci_b);
+  DEFINE_RTYPE(pv_cmpgt_h);
+  DEFINE_RTYPE(pv_cmpgt_sc_h);
+  DEFINE_PI1STYPE(pv_cmpgt_sci_h);
+  DEFINE_RTYPE(pv_cmpgt_b);
+  DEFINE_RTYPE(pv_cmpgt_sc_b);
+  DEFINE_PI1STYPE(pv_cmpgt_sci_b);
+  DEFINE_RTYPE(pv_cmpge_h);
+  DEFINE_RTYPE(pv_cmpge_sc_h);
+  DEFINE_PI1STYPE(pv_cmpge_sci_h);
+  DEFINE_RTYPE(pv_cmpge_b);
+  DEFINE_RTYPE(pv_cmpge_sc_b);
+  DEFINE_PI1STYPE(pv_cmpge_sci_b);
+  DEFINE_RTYPE(pv_cmplt_h);
+  DEFINE_RTYPE(pv_cmplt_sc_h);
+  DEFINE_PI1STYPE(pv_cmplt_sci_h);
+  DEFINE_RTYPE(pv_cmplt_b);
+  DEFINE_RTYPE(pv_cmplt_sc_b);
+  DEFINE_PI1STYPE(pv_cmplt_sci_b);
+  DEFINE_RTYPE(pv_cmple_h);
+  DEFINE_RTYPE(pv_cmple_sc_h);
+  DEFINE_PI1STYPE(pv_cmple_sci_h);
+  DEFINE_RTYPE(pv_cmple_b);
+  DEFINE_RTYPE(pv_cmple_sc_b);
+  DEFINE_PI1STYPE(pv_cmple_sci_b);
+  DEFINE_RTYPE(pv_cmpgtu_h);
+  DEFINE_RTYPE(pv_cmpgtu_sc_h);
+  DEFINE_PI1ZTYPE(pv_cmpgtu_sci_h);
+  DEFINE_RTYPE(pv_cmpgtu_b);
+  DEFINE_RTYPE(pv_cmpgtu_sc_b);
+  DEFINE_PI1ZTYPE(pv_cmpgtu_sci_b);
+  DEFINE_RTYPE(pv_cmpgeu_h);
+  DEFINE_RTYPE(pv_cmpgeu_sc_h);
+  DEFINE_PI1ZTYPE(pv_cmpgeu_sci_h);
+  DEFINE_RTYPE(pv_cmpgeu_b);
+  DEFINE_RTYPE(pv_cmpgeu_sc_b);
+  DEFINE_PI1ZTYPE(pv_cmpgeu_sci_b);
+  DEFINE_RTYPE(pv_cmpltu_h);
+  DEFINE_RTYPE(pv_cmpltu_sc_h);
+  DEFINE_PI1ZTYPE(pv_cmpltu_sci_h);
+  DEFINE_RTYPE(pv_cmpltu_b);
+  DEFINE_RTYPE(pv_cmpltu_sc_b);
+  DEFINE_PI1ZTYPE(pv_cmpltu_sci_b);
+  DEFINE_RTYPE(pv_cmpleu_h);
+  DEFINE_RTYPE(pv_cmpleu_sc_h);
+  DEFINE_PI1ZTYPE(pv_cmpleu_sci_h);
+  DEFINE_RTYPE(pv_cmpleu_b);
+  DEFINE_RTYPE(pv_cmpleu_sc_b);
+  DEFINE_PI1ZTYPE(pv_cmpleu_sci_b);
+
+  // xpulpvectcomplex
+  DEFINE_R1TYPE(pv_cplxconj_h);
+  DEFINE_RTYPE(pv_subrotmj_h);
+  DEFINE_RTYPE(pv_subrotmj_h_div2);
+  DEFINE_RTYPE(pv_subrotmj_h_div4);
+  DEFINE_RTYPE(pv_subrotmj_h_div8);
+  DEFINE_RTYPE(pv_cplxmul_h_r);
+  DEFINE_RTYPE(pv_cplxmul_h_r_div2);
+  DEFINE_RTYPE(pv_cplxmul_h_r_div4);
+  DEFINE_RTYPE(pv_cplxmul_h_r_div8);
+  DEFINE_RTYPE(pv_cplxmul_h_i);
+  DEFINE_RTYPE(pv_cplxmul_h_i_div2);
+  DEFINE_RTYPE(pv_cplxmul_h_i_div4);
+  DEFINE_RTYPE(pv_cplxmul_h_i_div8);
+
+  // xpulpvectshufflepack
+  DEFINE_RTYPE(pv_shuffle_h);
+  DEFINE_PI1ZTYPE(pv_shuffle_sci_h);
+  DEFINE_RTYPE(pv_shuffle_b);
+  DEFINE_PI1ZTYPE(pv_shufflei0_sci_b);
+  DEFINE_PI1ZTYPE(pv_shufflei1_sci_b);
+  DEFINE_PI1ZTYPE(pv_shufflei2_sci_b);
+  DEFINE_PI1ZTYPE(pv_shufflei3_sci_b);
+  DEFINE_RTYPE(pv_shuffle2_h);
+  DEFINE_RTYPE(pv_shuffle2_b);
+  DEFINE_RTYPE(pv_pack);
+  DEFINE_RTYPE(pv_pack_h);
+  DEFINE_RTYPE(pv_packhi_b);
+  DEFINE_RTYPE(pv_packlo_b);
+
+  // provide a default disassembly for all instructions as a fallback
+  #define DECLARE_INSN(code, match, mask) \
+   add_insn(new disasm_insn_t(#code " (args unknown)", match, mask, {}));
+  #include "encoding.h"
+  #undef DECLARE_INSN
+}
+
+const disasm_insn_t* disassembler_t::lookup(insn_t insn) const
+{
+  // Probe the bucket picked by the instruction bits first; if nothing there
+  // matches, fall back to the shared bucket stored at index HASH_SIZE.
+  size_t slot = insn.bits() % HASH_SIZE;
+  for (int pass = 0; pass < 2; ++pass, slot = HASH_SIZE) {
+    for (size_t k = 0; k < chain[slot].size(); ++k) {
+      if (*chain[slot][k] == insn)
+        return chain[slot][k];
+    }
+  }
+
+  return NULL;
+}
+
+void NOINLINE disassembler_t::add_insn(disasm_insn_t* insn)
+{
+  // Entries whose mask % HASH_SIZE != HASH_SIZE - 1 go to the extra bucket.
+  const bool hashed = insn->get_mask() % HASH_SIZE == HASH_SIZE - 1;
+  const size_t slot = hashed ? insn->get_match() % HASH_SIZE : HASH_SIZE;
+  chain[slot].push_back(insn);
+}
+
+disassembler_t::~disassembler_t()
+{
+  // The table owns its entries: free every bucket, index HASH_SIZE included.
+  for (size_t bucket = 0; bucket <= HASH_SIZE; ++bucket)
+    for (auto entry : chain[bucket]) delete entry;
+}
diff --git a/disasm/disasm.mk.in b/disasm/disasm.mk.in
new file mode 100644
index 0000000000..039a717f90
--- /dev/null
+++ b/disasm/disasm.mk.in
@@ -0,0 +1,7 @@
+disasm_CFLAGS = -fPIC
+
+disasm_srcs = \
+  disasm.cc \
+  regnames.cc \
+
+disasm_install_lib = yes
diff --git a/riscv/regnames.cc b/disasm/regnames.cc
similarity index 75%
rename from riscv/regnames.cc
rename to disasm/regnames.cc
index 0bf8d9c6e9..0a7fd4d22c 100644
--- a/riscv/regnames.cc
+++ b/disasm/regnames.cc
@@ -16,6 +16,13 @@ const char* fpr_name[] = {
   "fs8", "fs9", "fs10", "fs11", "ft8", "ft9", "ft10", "ft11"
 };
 
+const char* vr_name[] = {  // vr_name[i] is "v<i>" for i = 0..31
+  "v0",  "v1",  "v2",  "v3",  "v4",  "v5",  "v6",  "v7",
+  "v8",  "v9",  "v10", "v11", "v12", "v13", "v14", "v15",
+  "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+  "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+};
+
 const char* csr_name(int which) {
   switch (which) {
     #define DECLARE_CSR(name, number)  case number: return #name;
diff --git a/dummy_rocc/dummy_rocc.mk.in b/dummy_rocc/dummy_rocc.mk.in
deleted file mode 100644
index 0143ffd1c3..0000000000
--- a/dummy_rocc/dummy_rocc.mk.in
+++ /dev/null
@@ -1,7 +0,0 @@
-dummy_rocc_subproject_deps = \
-	spike_main \
-	riscv \
-	softfloat \
-
-dummy_rocc_srcs = \
-	dummy_rocc.cc \
diff --git a/fdt/fdt.ac b/fdt/fdt.ac
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/fdt/fdt.c b/fdt/fdt.c
new file mode 100644
index 0000000000..d6ce7c052d
--- /dev/null
+++ b/fdt/fdt.c
@@ -0,0 +1,291 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ */
+#include "libfdt_env.h"
+
+#include <fdt.h>
+#include <libfdt.h>
+
+#include "libfdt_internal.h"
+
+/*
+ * Minimal sanity check for a read-only tree. fdt_ro_probe_() checks
+ * that the given buffer contains what appears to be a flattened
+ * device tree with sane information in its header.
+ */
+int32_t fdt_ro_probe_(const void *fdt)
+{
+	const uint32_t magic = fdt_magic(fdt);
+	const uint32_t size = fdt_totalsize(fdt);
+
+	if (magic == FDT_MAGIC) {
+		/* Complete tree: both version fields must be supported */
+		if ((fdt_version(fdt) < FDT_FIRST_SUPPORTED_VERSION)
+		    || (fdt_last_comp_version(fdt) > FDT_LAST_SUPPORTED_VERSION))
+			return -FDT_ERR_BADVERSION;
+	} else if (magic == FDT_SW_MAGIC) {
+		/* Unfinished sequential-write blob: needs a structure block */
+		if (!fdt_size_dt_struct(fdt))
+			return -FDT_ERR_BADSTATE;
+	} else {
+		return -FDT_ERR_BADMAGIC;
+	}
+
+	/* The size is reported through a signed return value, so it must fit */
+	return (size < INT32_MAX) ? (int32_t)size : -FDT_ERR_TRUNCATED;
+}
+
+/* Returns 1 when hdrsize <= off <= totalsize, 0 otherwise. */
+static int check_off_(uint32_t hdrsize, uint32_t totalsize, uint32_t off)
+{
+	if (off < hdrsize)
+		return 0; /* would point into the header */
+
+	return off <= totalsize;
+}
+
+/*
+ * Returns 1 when the block [base, base + size] starts and ends between
+ * hdrsize and totalsize (inclusive) without wrapping, 0 otherwise.
+ */
+static int check_block_(uint32_t hdrsize, uint32_t totalsize,
+			uint32_t base, uint32_t size)
+{
+	const uint32_t end = base + size;
+
+	if (end < base)
+		return 0; /* base + size wrapped around */
+
+	return check_off_(hdrsize, totalsize, base)
+		&& check_off_(hdrsize, totalsize, end);
+}
+
+/* Returns the header size in bytes that corresponds to format @version. */
+size_t fdt_header_size_(uint32_t version)
+{
+	if (version > 16)
+		return FDT_V17_SIZE;
+	if (version > 3)
+		return FDT_V16_SIZE;
+	if (version > 2)
+		return FDT_V3_SIZE;
+	if (version > 1)
+		return FDT_V2_SIZE;
+
+	return FDT_V1_SIZE;
+}
+
+/*
+ * Validates the header of a complete tree: magic, version fields, total
+ * size, and the offsets/sizes of the memory reserve map, structure block
+ * and strings block. Returns 0 when everything is in bounds, otherwise a
+ * negative FDT_ERR_* code.
+ */
+int fdt_check_header(const void *fdt)
+{
+	uint32_t version, last_comp, total;
+	size_t hdrsize;
+
+	if (fdt_magic(fdt) != FDT_MAGIC)
+		return -FDT_ERR_BADMAGIC;
+
+	version = fdt_version(fdt);
+	last_comp = fdt_last_comp_version(fdt);
+	if ((version < FDT_FIRST_SUPPORTED_VERSION)
+	    || (last_comp > FDT_LAST_SUPPORTED_VERSION)
+	    || (version < last_comp))
+		return -FDT_ERR_BADVERSION;
+
+	hdrsize = fdt_header_size(fdt);
+	total = fdt_totalsize(fdt);
+	if ((total < hdrsize) || (total > INT_MAX))
+		return -FDT_ERR_TRUNCATED;
+
+	/* Memory reserve map: only its start offset is known */
+	if (!check_off_(hdrsize, total, fdt_off_mem_rsvmap(fdt)))
+		return -FDT_ERR_TRUNCATED;
+
+	/* Structure block: its size is only checked from version 17 on */
+	if (version >= 17) {
+		if (!check_block_(hdrsize, total,
+				  fdt_off_dt_struct(fdt),
+				  fdt_size_dt_struct(fdt)))
+			return -FDT_ERR_TRUNCATED;
+	} else if (!check_off_(hdrsize, total, fdt_off_dt_struct(fdt))) {
+		return -FDT_ERR_TRUNCATED;
+	}
+
+	/* Strings block: both start and end are checked */
+	if (!check_block_(hdrsize, total,
+			  fdt_off_dt_strings(fdt), fdt_size_dt_strings(fdt)))
+		return -FDT_ERR_TRUNCATED;
+
+	return 0;
+}
+
+/*
+ * fdt_offset_ptr - bounds-checked access to the structure block
+ * @fdt: pointer to the device tree blob
+ * @offset: offset relative to the start of the structure block
+ * @len: number of bytes the caller wants to access at @offset
+ *
+ * Returns whatever fdt_offset_ptr_() yields for @offset once the range
+ * [@offset, @offset + @len) has passed the checks below, or NULL when a
+ * check fails.
+ */
+const void *fdt_offset_ptr(const void *fdt, int offset, unsigned int len)
+{
+	/* offset measured from the start of the blob, in unsigned arithmetic */
+	unsigned absoffset = offset + fdt_off_dt_struct(fdt);
+
+	/*
+	 * The comparisons are unsigned: the first two catch wrap-around of
+	 * the additions, the third rejects ranges ending past totalsize.
+	 */
+	if ((absoffset < offset)
+	    || ((absoffset + len) < absoffset)
+	    || (absoffset + len) > fdt_totalsize(fdt))
+		return NULL;
+
+	/* From version 17 (0x11) the header also records the block size */
+	if (fdt_version(fdt) >= 0x11)
+		if (((offset + len) < offset)
+		    || ((offset + len) > fdt_size_dt_struct(fdt)))
+			return NULL;
+
+	return fdt_offset_ptr_(fdt, offset);
+}
+
+/*
+ * fdt_next_tag - read the tag at @startoffset and locate the following one
+ * @fdt: pointer to the device tree blob
+ * @startoffset: structure-block offset of the tag to read
+ * @nextoffset: receives the tag-aligned offset just past this tag and its
+ *              payload on success, or a negative FDT_ERR_* code on failure
+ *
+ * Returns the tag value. On failure it returns FDT_END with *@nextoffset
+ * set to -FDT_ERR_TRUNCATED (the tag word itself was out of bounds) or
+ * -FDT_ERR_BADSTRUCTURE (bad payload or unknown tag).
+ */
+uint32_t fdt_next_tag(const void *fdt, int startoffset, int *nextoffset)
+{
+	const fdt32_t *tagp, *lenp;
+	uint32_t tag;
+	int offset = startoffset;
+	const char *p;
+
+	*nextoffset = -FDT_ERR_TRUNCATED;
+	tagp = fdt_offset_ptr(fdt, offset, FDT_TAGSIZE);
+	if (!tagp)
+		return FDT_END; /* premature end */
+	tag = fdt32_to_cpu(*tagp);
+	offset += FDT_TAGSIZE;
+
+	/* From here on every failure is reported as a structural error */
+	*nextoffset = -FDT_ERR_BADSTRUCTURE;
+	switch (tag) {
+	case FDT_BEGIN_NODE:
+		/* skip name */
+		do {
+			p = fdt_offset_ptr(fdt, offset++, 1);
+		} while (p && (*p != '\0'));
+		if (!p)
+			return FDT_END; /* premature end */
+		break;
+
+	case FDT_PROP:
+		lenp = fdt_offset_ptr(fdt, offset, sizeof(*lenp));
+		if (!lenp)
+			return FDT_END; /* premature end */
+		/* skip-name offset, length and value */
+		offset += sizeof(struct fdt_property) - FDT_TAGSIZE
+			+ fdt32_to_cpu(*lenp);
+		/*
+		 * Before version 16 (0x10), a value of 8 bytes or more whose
+		 * start is not a multiple of 8 gets 4 extra bytes skipped.
+		 */
+		if (fdt_version(fdt) < 0x10 && fdt32_to_cpu(*lenp) >= 8 &&
+		    ((offset - fdt32_to_cpu(*lenp)) % 8) != 0)
+			offset += 4;
+		break;
+
+	case FDT_END:
+	case FDT_END_NODE:
+	case FDT_NOP:
+		/* these tags carry no payload */
+		break;
+
+	default:
+		return FDT_END;
+	}
+
+	/* Re-check that the whole tag plus payload lies inside the block */
+	if (!fdt_offset_ptr(fdt, startoffset, offset - startoffset))
+		return FDT_END; /* premature end */
+
+	*nextoffset = FDT_TAGALIGN(offset);
+	return tag;
+}
+
+/*
+ * Checks that @offset is a non-negative, tag-aligned offset of an
+ * FDT_BEGIN_NODE tag. Returns the offset following that tag, or
+ * -FDT_ERR_BADOFFSET.
+ */
+int fdt_check_node_offset_(const void *fdt, int offset)
+{
+	int next;
+
+	if (offset < 0)
+		return -FDT_ERR_BADOFFSET;
+	if (offset % FDT_TAGSIZE)
+		return -FDT_ERR_BADOFFSET;
+	if (fdt_next_tag(fdt, offset, &next) != FDT_BEGIN_NODE)
+		return -FDT_ERR_BADOFFSET;
+
+	return next;
+}
+
+/*
+ * Checks that @offset is a non-negative, tag-aligned offset of an FDT_PROP
+ * tag. Returns the offset following that property, or -FDT_ERR_BADOFFSET.
+ */
+int fdt_check_prop_offset_(const void *fdt, int offset)
+{
+	int next;
+
+	if (offset < 0)
+		return -FDT_ERR_BADOFFSET;
+	if (offset % FDT_TAGSIZE)
+		return -FDT_ERR_BADOFFSET;
+	if (fdt_next_tag(fdt, offset, &next) != FDT_PROP)
+		return -FDT_ERR_BADOFFSET;
+
+	return next;
+}
+
+/*
+ * fdt_next_node - find the next FDT_BEGIN_NODE tag after @offset
+ * @fdt: pointer to the device tree blob
+ * @offset: offset of the current node, or a negative value to start the
+ *          scan at structure-block offset 0
+ * @depth: optional counter, incremented on every FDT_BEGIN_NODE and
+ *         decremented on every FDT_END_NODE seen during the scan
+ *
+ * Returns the offset of the next node tag, or a negative FDT_ERR_* code.
+ * When *@depth drops below zero the offset following that FDT_END_NODE is
+ * returned instead.
+ */
+int fdt_next_node(const void *fdt, int offset, int *depth)
+{
+	int nextoffset = 0;
+	uint32_t tag;
+
+	/* A non-negative offset must point at a node; start scanning after it */
+	if (offset >= 0)
+		if ((nextoffset = fdt_check_node_offset_(fdt, offset)) < 0)
+			return nextoffset;
+
+	do {
+		offset = nextoffset;
+		tag = fdt_next_tag(fdt, offset, &nextoffset);
+
+		switch (tag) {
+		case FDT_PROP:
+		case FDT_NOP:
+			break;
+
+		case FDT_BEGIN_NODE:
+			if (depth)
+				(*depth)++;
+			break;
+
+		case FDT_END_NODE:
+			if (depth && ((--(*depth)) < 0))
+				return nextoffset;
+			break;
+
+		case FDT_END:
+			/*
+			 * A clean end of the tree, or a truncation while the
+			 * caller does not track depth, is "not found"; any
+			 * other error code is passed through.
+			 */
+			if ((nextoffset >= 0)
+			    || ((nextoffset == -FDT_ERR_TRUNCATED) && !depth))
+				return -FDT_ERR_NOTFOUND;
+			else
+				return nextoffset;
+		}
+	} while (tag != FDT_BEGIN_NODE);
+
+	return offset;
+}
+
+/*
+ * Returns the offset of the first subnode of the node at @offset, or
+ * -FDT_ERR_NOTFOUND when the next node is not exactly one level deeper.
+ */
+int fdt_first_subnode(const void *fdt, int offset)
+{
+	int depth = 0;
+	int child = fdt_next_node(fdt, offset, &depth);
+
+	return (child < 0 || depth != 1) ? -FDT_ERR_NOTFOUND : child;
+}
+
+/*
+ * Returns the offset of the next sibling of the subnode at @offset, or
+ * -FDT_ERR_NOTFOUND when there is none.
+ */
+int fdt_next_subnode(const void *fdt, int offset)
+{
+	/*
+	 * Relative to the parent, the current subnode sits at depth 1 and so
+	 * does the sibling we are looking for.
+	 */
+	int depth = 1;
+
+	for (;;) {
+		offset = fdt_next_node(fdt, offset, &depth);
+		if (offset < 0 || depth < 1)
+			return -FDT_ERR_NOTFOUND;
+		if (depth == 1)
+			return offset;
+		/* depth > 1: a descendant of the current subnode, skip it */
+	}
+}
+
+/*
+ * fdt_find_string_ - search a string table for a NUL-terminated string
+ * @strtab: start of the table
+ * @tabsize: size of the table in bytes
+ * @s: string to look for
+ *
+ * Compares @s, including its terminating NUL, against every position of
+ * the table. Returns a pointer to the first match, or NULL when there is
+ * none, including when @tabsize is negative or smaller than the string.
+ */
+const char *fdt_find_string_(const char *strtab, int tabsize, const char *s)
+{
+	size_t len = strlen(s) + 1;
+	const char *last;
+	const char *p;
+
+	/*
+	 * Without this guard "strtab + tabsize - len" would form a pointer
+	 * before the start of the table, which is undefined behaviour.
+	 */
+	if (tabsize < 0 || len > (size_t)tabsize)
+		return NULL;
+
+	last = strtab + tabsize - len;
+	for (p = strtab; p <= last; p++)
+		if (memcmp(p, s, len) == 0)
+			return p;
+	return NULL;
+}
+
+/*
+ * fdt_move - copy a device tree blob into another buffer
+ * @fdt: pointer to the source blob
+ * @buf: destination buffer (may overlap the source; memmove is used)
+ * @bufsize: size of the destination buffer in bytes
+ *
+ * Returns 0 on success, -FDT_ERR_NOSPACE when @bufsize is negative or
+ * smaller than the blob's total size, or the error reported by
+ * FDT_RO_PROBE() for an invalid blob.
+ */
+int fdt_move(const void *fdt, void *buf, int bufsize)
+{
+	/*
+	 * Reject negative sizes up front: in the comparison below they would
+	 * be converted to a huge unsigned value and pass the check.
+	 */
+	if (bufsize < 0)
+		return -FDT_ERR_NOSPACE;
+
+	FDT_RO_PROBE(fdt);
+
+	if (fdt_totalsize(fdt) > (unsigned int)bufsize)
+		return -FDT_ERR_NOSPACE;
+
+	memmove(buf, fdt, fdt_totalsize(fdt));
+	return 0;
+}
diff --git a/fdt/fdt.h b/fdt/fdt.h
new file mode 100644
index 0000000000..f2e68807f2
--- /dev/null
+++ b/fdt/fdt.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause) */
+#ifndef FDT_H
+#define FDT_H
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ * Copyright 2012 Kim Phillips, Freescale Semiconductor.
+ */
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Header of a flattened device tree blob. Every field is an fdt32_t; later
+ * format versions append fields, as marked below.
+ */
+struct fdt_header {
+	fdt32_t magic;			 /* magic word FDT_MAGIC */
+	fdt32_t totalsize;		 /* total size of DT block */
+	fdt32_t off_dt_struct;		 /* offset to structure */
+	fdt32_t off_dt_strings;		 /* offset to strings */
+	fdt32_t off_mem_rsvmap;		 /* offset to memory reserve map */
+	fdt32_t version;		 /* format version */
+	fdt32_t last_comp_version;	 /* last compatible version */
+
+	/* version 2 fields below */
+	fdt32_t boot_cpuid_phys;	 /* Which physical CPU id we're
+					    booting on */
+	/* version 3 fields below */
+	fdt32_t size_dt_strings;	 /* size of the strings block */
+
+	/* version 17 fields below */
+	fdt32_t size_dt_struct;		 /* size of the structure block */
+};
+
+/*
+ * An address/size pair of 64-bit values.
+ * NOTE(review): presumably one entry of the memory reserve map located by
+ * fdt_header.off_mem_rsvmap -- confirm against its users.
+ */
+struct fdt_reserve_entry {
+	fdt64_t address;
+	fdt64_t size;
+};
+
+/* Layout of a node start in the structure block: a tag word, then the name. */
+struct fdt_node_header {
+	fdt32_t tag;
+	char name[0];	/* name bytes follow the tag directly */
+};
+
+/*
+ * Layout of a property in the structure block: tag word, value length,
+ * name offset, then the value bytes.
+ */
+struct fdt_property {
+	fdt32_t tag;
+	fdt32_t len;		/* length of data[] in bytes */
+	fdt32_t nameoff;	/* name offset; presumably into the strings block */
+	char data[0];		/* value bytes follow the fixed fields */
+};
+
+#endif /* !__ASSEMBLY__ */
+
+/* Value of fdt_header.magic in a complete tree. */
+#define FDT_MAGIC	0xd00dfeed	/* 4: version, 4: total size */
+/* Size in bytes of one tag word in the structure block. */
+#define FDT_TAGSIZE	sizeof(fdt32_t)
+
+/* Tag values found in the structure block. */
+#define FDT_BEGIN_NODE	0x1		/* Start node: full name */
+#define FDT_END_NODE	0x2		/* End node */
+#define FDT_PROP	0x3		/* Property: name off,
+					   size, content */
+#define FDT_NOP		0x4		/* nop */
+#define FDT_END		0x9
+
+/*
+ * Header size in bytes for each format version; each step adds the fields
+ * that the version introduced to struct fdt_header.
+ */
+#define FDT_V1_SIZE	(7*sizeof(fdt32_t))
+#define FDT_V2_SIZE	(FDT_V1_SIZE + sizeof(fdt32_t))
+#define FDT_V3_SIZE	(FDT_V2_SIZE + sizeof(fdt32_t))
+#define FDT_V16_SIZE	FDT_V3_SIZE
+#define FDT_V17_SIZE	(FDT_V16_SIZE + sizeof(fdt32_t))
+
+#endif /* FDT_H */
diff --git a/fdt/fdt.mk.in b/fdt/fdt.mk.in
new file mode 100644
index 0000000000..273375efb4
--- /dev/null
+++ b/fdt/fdt.mk.in
@@ -0,0 +1,17 @@
+# The fdt subproject lists no dependencies on other subprojects.
+fdt_subproject_deps = \
+
+# Headers belonging to the fdt subproject.
+fdt_hdrs = \
+	fdt.h \
+	libfdt.h \
+	libfdt_env.h \
+
+# C sources that make up the fdt subproject.
+fdt_c_srcs = \
+	fdt.c \
+	fdt_ro.c \
+	fdt_wip.c \
+	fdt_sw.c \
+	fdt_rw.c \
+	fdt_strerror.c \
+	fdt_empty_tree.c \
+	fdt_addresses.c \
+	fdt_overlay.c \
diff --git a/fdt/fdt_addresses.c b/fdt/fdt_addresses.c
new file mode 100644
index 0000000000..9a82cd0ba2
--- /dev/null
+++ b/fdt/fdt_addresses.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2014 David Gibson <david@gibson.dropbear.id.au>
+ * Copyright (C) 2018 embedded brains GmbH
+ */
+#include "libfdt_env.h"
+
+#include <fdt.h>
+#include <libfdt.h>
+
+#include "libfdt_internal.h"
+
+/*
+ * Reads the single-cell property @name of the node at @nodeoffset.
+ * Returns its value, the fdt_getprop() error when the property cannot be
+ * read, or -FDT_ERR_BADNCELLS when it is not exactly one cell long or its
+ * value exceeds FDT_MAX_NCELLS.
+ */
+static int fdt_cells(const void *fdt, int nodeoffset, const char *name)
+{
+	int proplen;
+	uint32_t ncells;
+	const fdt32_t *cell = fdt_getprop(fdt, nodeoffset, name, &proplen);
+
+	if (cell == NULL)
+		return proplen;
+	if (proplen != sizeof(*cell))
+		return -FDT_ERR_BADNCELLS;
+
+	ncells = fdt32_to_cpu(*cell);
+	return (ncells > FDT_MAX_NCELLS) ? -FDT_ERR_BADNCELLS : (int)ncells;
+}
+
+/*
+ * Returns the "#address-cells" value of the node at @nodeoffset: 2 when
+ * the property is absent, -FDT_ERR_BADNCELLS when it is zero, otherwise
+ * the value or error reported by fdt_cells().
+ */
+int fdt_address_cells(const void *fdt, int nodeoffset)
+{
+	const int ncells = fdt_cells(fdt, nodeoffset, "#address-cells");
+
+	switch (ncells) {
+	case 0:
+		return -FDT_ERR_BADNCELLS;
+	case -FDT_ERR_NOTFOUND:
+		return 2;
+	default:
+		return ncells;
+	}
+}
+
+/*
+ * Returns the "#size-cells" value of the node at @nodeoffset: 1 when the
+ * property is absent, otherwise the value or error from fdt_cells().
+ */
+int fdt_size_cells(const void *fdt, int nodeoffset)
+{
+	const int ncells = fdt_cells(fdt, nodeoffset, "#size-cells");
+
+	return (ncells == -FDT_ERR_NOTFOUND) ? 1 : ncells;
+}
+
+/* This function assumes that [address|size]_cells is 1 or 2 */
+/*
+ * fdt_appendprop_addrrange - append an (address, size) pair to a property
+ * @fdt: pointer to the device tree blob
+ * @parent: offset of the node whose #address-cells / #size-cells apply
+ * @nodeoffset: offset of the node holding the property
+ * @name: name of the property to append to
+ * @addr: start address of the range
+ * @size: size of the range
+ *
+ * Encodes @addr and @size with the cell counts read from @parent and
+ * passes the result to fdt_appendprop().
+ *
+ * Returns the fdt_appendprop() result, -FDT_ERR_BADNCELLS when a cell
+ * count is neither 1 nor 2, -FDT_ERR_BADVALUE when @addr or @size does not
+ * fit the cell count, or the error from reading the cell counts.
+ */
+int fdt_appendprop_addrrange(void *fdt, int parent, int nodeoffset,
+			     const char *name, uint64_t addr, uint64_t size)
+{
+	int addr_cells, size_cells, ret;
+	uint8_t data[sizeof(fdt64_t) * 2], *prop;
+
+	ret = fdt_address_cells(fdt, parent);
+	if (ret < 0)
+		return ret;
+	addr_cells = ret;
+
+	ret = fdt_size_cells(fdt, parent);
+	if (ret < 0)
+		return ret;
+	size_cells = ret;
+
+	/* check validity of address */
+	prop = data;
+	if (addr_cells == 1) {
+		/*
+		 * The cast makes "UINT32_MAX + 1" a 64-bit sum; in 32-bit
+		 * unsigned arithmetic it wraps to 0 and the range check
+		 * compares against the wrong limit.
+		 */
+		if ((addr > UINT32_MAX)
+		    || (((uint64_t)UINT32_MAX + 1 - addr) < size))
+			return -FDT_ERR_BADVALUE;
+
+		fdt32_st(prop, (uint32_t)addr);
+	} else if (addr_cells == 2) {
+		fdt64_st(prop, addr);
+	} else {
+		return -FDT_ERR_BADNCELLS;
+	}
+
+	/* check validity of size */
+	prop += addr_cells * sizeof(fdt32_t);
+	if (size_cells == 1) {
+		if (size > UINT32_MAX)
+			return -FDT_ERR_BADVALUE;
+
+		fdt32_st(prop, (uint32_t)size);
+	} else if (size_cells == 2) {
+		fdt64_st(prop, size);
+	} else {
+		return -FDT_ERR_BADNCELLS;
+	}
+
+	return fdt_appendprop(fdt, nodeoffset, name, data,
+			      (addr_cells + size_cells) * sizeof(fdt32_t));
+}
diff --git a/fdt/fdt_empty_tree.c b/fdt/fdt_empty_tree.c
new file mode 100644
index 0000000000..49d54d44b8
--- /dev/null
+++ b/fdt/fdt_empty_tree.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2012 David Gibson, IBM Corporation.
+ */
+#include "libfdt_env.h"
+
+#include <fdt.h>
+#include <libfdt.h>
+
+#include "libfdt_internal.h"
+
+/*
+ * Builds a tree in @buf that holds only an unnamed root node, then hands
+ * the buffer to fdt_open_into(). Returns the first non-zero error from any
+ * step, otherwise the result of fdt_open_into().
+ */
+int fdt_create_empty_tree(void *buf, int bufsize)
+{
+	int err = fdt_create(buf, bufsize);
+
+	/* Each step runs only while all previous ones succeeded */
+	if (!err)
+		err = fdt_finish_reservemap(buf);
+	if (!err)
+		err = fdt_begin_node(buf, "");
+	if (!err)
+		err = fdt_end_node(buf);
+	if (!err)
+		err = fdt_finish(buf);
+	if (err)
+		return err;
+
+	return fdt_open_into(buf, buf, bufsize);
+}
diff --git a/fdt/fdt_overlay.c b/fdt/fdt_overlay.c
new file mode 100644
index 0000000000..be71873366
--- /dev/null
+++ b/fdt/fdt_overlay.c
@@ -0,0 +1,881 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2016 Free Electrons
+ * Copyright (C) 2016 NextThing Co.
+ */
+#include "libfdt_env.h"
+
+#include <fdt.h>
+#include <libfdt.h>
+
+#include "libfdt_internal.h"
+
+/**
+ * overlay_get_target_phandle - retrieves the target phandle of a fragment
+ * @fdto: pointer to the device tree overlay blob
+ * @fragment: node offset of the fragment in the overlay
+ *
+ * overlay_get_target_phandle() retrieves the target phandle of an
+ * overlay fragment when that fragment uses a phandle (target
+ * property) instead of a path (target-path property).
+ *
+ * returns:
+ *      the phandle pointed by the target property
+ *      0, if the phandle was not found
+ *	-1, if the phandle was malformed
+ */
+static uint32_t overlay_get_target_phandle(const void *fdto, int fragment)
+{
+	int len;
+	const fdt32_t *val = fdt_getprop(fdto, fragment, "target", &len);
+
+	/* no "target" property at all */
+	if (val == NULL)
+		return 0;
+
+	/* wrong size: report as malformed */
+	if (len != sizeof(*val))
+		return (uint32_t)-1;
+
+	/* a stored value of -1 is itself the "malformed" marker */
+	return fdt32_to_cpu(*val);
+}
+
+/**
+ * overlay_get_target - retrieves the offset of a fragment's target
+ * @fdt: Base device tree blob
+ * @fdto: Device tree overlay blob
+ * @fragment: node offset of the fragment in the overlay
+ * @pathp: pointer which receives the path of the target (or NULL)
+ *
+ * overlay_get_target() retrieves the target offset in the base
+ * device tree of a fragment, no matter how the actual targeting is
+ * done (through a phandle or a path)
+ *
+ * returns:
+ *      the targeted node offset in the base device tree
+ *      Negative error code on error
+ */
+static int overlay_get_target(const void *fdt, const void *fdto,
+			      int fragment, char const **pathp)
+{
+	const char *path = NULL;
+	int path_len = 0;
+	int ret;
+	uint32_t phandle = overlay_get_target_phandle(fdto, fragment);
+
+	if (phandle == (uint32_t)-1)
+		return -FDT_ERR_BADPHANDLE;
+
+	if (phandle) {
+		/* phandle based lookup */
+		ret = fdt_node_offset_by_phandle(fdt, phandle);
+	} else {
+		/* no phandle: fall back to a path based lookup */
+		path = fdt_getprop(fdto, fragment, "target-path", &path_len);
+		ret = path ? fdt_path_offset(fdt, path) : path_len;
+	}
+
+	if (ret < 0) {
+		/*
+		 * A fragment with an __overlay__ subnode (we wouldn't be
+		 * called otherwise) but with neither a target nor a
+		 * target-path property is an improperly written overlay.
+		 */
+		if (path_len == -FDT_ERR_NOTFOUND)
+			ret = -FDT_ERR_BADOVERLAY;
+		return ret;
+	}
+
+	/* hand back the path when one was used (NULL otherwise) */
+	if (pathp)
+		*pathp = path;
+
+	return ret;
+}
+
+/**
+ * overlay_phandle_add_offset - Increases a phandle by an offset
+ * @fdt: Device tree blob containing the node to modify
+ * @node: Offset of the node whose phandle property is adjusted
+ * @name: Name of the property to modify (phandle or linux,phandle)
+ * @delta: offset to apply
+ *
+ * overlay_phandle_add_offset() increments a node phandle by a given
+ * offset.
+ *
+ * returns:
+ *      0 on success.
+ *      Negative error code on error
+ */
+static int overlay_phandle_add_offset(void *fdt, int node,
+				      const char *name, uint32_t delta)
+{
+	int len;
+	uint32_t old_val, new_val;
+	const fdt32_t *val = fdt_getprop(fdt, node, name, &len);
+
+	if (val == NULL)
+		return len;
+	if (len != sizeof(*val))
+		return -FDT_ERR_BADPHANDLE;
+
+	old_val = fdt32_to_cpu(*val);
+	new_val = old_val + delta;
+
+	/* reject a wrapped sum as well as the reserved value -1 */
+	if (new_val < old_val || new_val == (uint32_t)-1)
+		return -FDT_ERR_NOPHANDLES;
+
+	return fdt_setprop_inplace_u32(fdt, node, name, new_val);
+}
+
+/**
+ * overlay_adjust_node_phandles - Offsets the phandles of a node
+ * @fdto: Device tree overlay blob
+ * @node: Offset of the node we want to adjust
+ * @delta: Offset to shift the phandles of
+ *
+ * overlay_adjust_node_phandles() adds a constant to all the phandles
+ * of a given node and of its subnodes. This is mainly used as part of
+ * the overlay application process, when we want to update all the overlay
+ * phandles so that they do not conflict with those of the base device tree.
+ *
+ * returns:
+ *      0 on success
+ *      Negative error code on failure
+ */
+static int overlay_adjust_node_phandles(void *fdto, int node,
+					uint32_t delta)
+{
+	const char *const props[] = { "phandle", "linux,phandle" };
+	size_t i;
+	int child;
+	int ret;
+
+	/* Shift both phandle spellings; a missing property is not an error */
+	for (i = 0; i < sizeof(props) / sizeof(props[0]); i++) {
+		ret = overlay_phandle_add_offset(fdto, node, props[i], delta);
+		if (ret != 0 && ret != -FDT_ERR_NOTFOUND)
+			return ret;
+	}
+
+	/* Then do the same for every subnode */
+	fdt_for_each_subnode(child, fdto, node) {
+		ret = overlay_adjust_node_phandles(fdto, child, delta);
+		if (ret != 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * overlay_adjust_local_phandles - Adjust the phandles of a whole overlay
+ * @fdto: Device tree overlay blob
+ * @delta: Offset to shift the phandles of
+ *
+ * overlay_adjust_local_phandles() adds a constant to all the
+ * phandles of an overlay. This is mainly used as part of the overlay
+ * application process, when we want to update all the overlay
+ * phandles so that they do not conflict with those of the base device tree.
+ *
+ * returns:
+ *      0 on success
+ *      Negative error code on failure
+ */
+static int overlay_adjust_local_phandles(void *fdto, uint32_t delta)
+{
+	/*
+	 * Start adjusting the phandles from the overlay root (node offset 0);
+	 * overlay_adjust_node_phandles() handles the node it is given and
+	 * then its subnodes.
+	 */
+	return overlay_adjust_node_phandles(fdto, 0, delta);
+}
+
+/**
+ * overlay_update_local_node_references - Adjust the overlay references
+ * @fdto: Device tree overlay blob
+ * @tree_node: Node offset of the node to operate on
+ * @fixup_node: Node offset of the matching local fixups node
+ * @delta: Offset to shift the phandles of
+ *
+ * overlay_update_local_node_references() updates the phandles
+ * pointing to a node within the device tree overlay by adding a
+ * constant delta.
+ *
+ * This is mainly used as part of a device tree application process,
+ * where you want the device tree overlays phandles to not conflict
+ * with the ones from the base device tree before merging them.
+ *
+ * returns:
+ *      0 on success
+ *      Negative error code on failure
+ */
+static int overlay_update_local_node_references(void *fdto,
+						int tree_node,
+						int fixup_node,
+						uint32_t delta)
+{
+	int fixup_prop;
+	int fixup_child;
+	int ret;
+
+	/*
+	 * Each property of the fixup node names a property of the tree node
+	 * and holds a list of 32-bit byte offsets into that property's value.
+	 */
+	fdt_for_each_property_offset(fixup_prop, fdto, fixup_node) {
+		const fdt32_t *fixup_val;
+		const char *tree_val;
+		const char *name;
+		int fixup_len;
+		int tree_len;
+		int i;
+
+		fixup_val = fdt_getprop_by_offset(fdto, fixup_prop,
+						  &name, &fixup_len);
+		if (!fixup_val)
+			return fixup_len;
+
+		/* the fixup list must be a whole number of cells */
+		if (fixup_len % sizeof(uint32_t))
+			return -FDT_ERR_BADOVERLAY;
+
+		tree_val = fdt_getprop(fdto, tree_node, name, &tree_len);
+		if (!tree_val) {
+			if (tree_len == -FDT_ERR_NOTFOUND)
+				return -FDT_ERR_BADOVERLAY;
+
+			return tree_len;
+		}
+
+		for (i = 0; i < (fixup_len / sizeof(uint32_t)); i++) {
+			fdt32_t adj_val;
+			uint32_t poffset;
+
+			poffset = fdt32_to_cpu(fixup_val[i]);
+
+			/*
+			 * The offset comes from the overlay and is untrusted:
+			 * make sure the phandle lies entirely inside the
+			 * property before reading it. The in-place write below
+			 * would reject it too, but only after the read.
+			 */
+			if ((poffset > (uint32_t)tree_len)
+			    || (((uint32_t)tree_len - poffset) < sizeof(adj_val)))
+				return -FDT_ERR_BADOVERLAY;
+
+			/*
+			 * phandles to fixup can be unaligned.
+			 *
+			 * Use a memcpy for the architectures that do
+			 * not support unaligned accesses.
+			 */
+			memcpy(&adj_val, tree_val + poffset, sizeof(adj_val));
+
+			adj_val = cpu_to_fdt32(fdt32_to_cpu(adj_val) + delta);
+
+			ret = fdt_setprop_inplace_namelen_partial(fdto,
+								  tree_node,
+								  name,
+								  strlen(name),
+								  poffset,
+								  &adj_val,
+								  sizeof(adj_val));
+			if (ret == -FDT_ERR_NOSPACE)
+				return -FDT_ERR_BADOVERLAY;
+
+			if (ret)
+				return ret;
+		}
+	}
+
+	/* Recurse into fixup subnodes, pairing each with the same-named tree node */
+	fdt_for_each_subnode(fixup_child, fdto, fixup_node) {
+		const char *fixup_child_name = fdt_get_name(fdto, fixup_child,
+							    NULL);
+		int tree_child;
+
+		tree_child = fdt_subnode_offset(fdto, tree_node,
+						fixup_child_name);
+		if (tree_child == -FDT_ERR_NOTFOUND)
+			return -FDT_ERR_BADOVERLAY;
+		if (tree_child < 0)
+			return tree_child;
+
+		ret = overlay_update_local_node_references(fdto,
+							   tree_child,
+							   fixup_child,
+							   delta);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * overlay_update_local_references - Adjust the overlay references
+ * @fdto: Device tree overlay blob
+ * @delta: Offset to shift the phandles of
+ *
+ * overlay_update_local_references() update all the phandles pointing
+ * to a node within the device tree overlay by adding a constant
+ * delta to not conflict with the base overlay.
+ *
+ * This is mainly used as part of a device tree application process,
+ * where you want the device tree overlays phandles to not conflict
+ * with the ones from the base device tree before merging them.
+ *
+ * returns:
+ *      0 on success
+ *      Negative error code on failure
+ */
+static int overlay_update_local_references(void *fdto, uint32_t delta)
+{
+	const int fixups = fdt_path_offset(fdto, "/__local_fixups__");
+
+	/* No local fixups node means there is nothing to adjust */
+	if (fixups == -FDT_ERR_NOTFOUND)
+		return 0;
+	if (fixups < 0)
+		return fixups;
+
+	/* Walk the fixups tree in parallel with the overlay, from its root */
+	return overlay_update_local_node_references(fdto, 0, fixups, delta);
+}
+
+/**
+ * overlay_fixup_one_phandle - Set an overlay phandle to the base one
+ * @fdt: Base Device Tree blob
+ * @fdto: Device tree overlay blob
+ * @symbols_off: Node offset of the symbols node in the base device tree
+ * @path: Path to a node holding a phandle in the overlay
+ * @path_len: number of path characters to consider
+ * @name: Name of the property holding the phandle reference in the overlay
+ * @name_len: number of name characters to consider
+ * @poffset: Offset within the overlay property where the phandle is stored
+ * @label: Label of the node referenced by the phandle
+ *
+ * overlay_fixup_one_phandle() resolves an overlay phandle pointing to
+ * a node in the base device tree.
+ *
+ * This is part of the device tree overlay application process, when
+ * you want all the phandles in the overlay to point to the actual
+ * base dt nodes.
+ *
+ * returns:
+ *      0 on success
+ *      Negative error code on failure
+ */
+static int overlay_fixup_one_phandle(void *fdt, void *fdto,
+				     int symbols_off,
+				     const char *path, uint32_t path_len,
+				     const char *name, uint32_t name_len,
+				     int poffset, const char *label)
+{
+	const char *symbol_path;
+	uint32_t phandle;
+	fdt32_t phandle_prop;
+	int symbol_off, fixup_off;
+	int prop_len;
+
+	/* a negative symbols offset is an error code from the caller's lookup */
+	if (symbols_off < 0)
+		return symbols_off;
+
+	/* the symbols node maps the label to a path in the base tree */
+	symbol_path = fdt_getprop(fdt, symbols_off, label,
+				  &prop_len);
+	if (!symbol_path)
+		return prop_len;
+
+	symbol_off = fdt_path_offset(fdt, symbol_path);
+	if (symbol_off < 0)
+		return symbol_off;
+
+	phandle = fdt_get_phandle(fdt, symbol_off);
+	if (!phandle)
+		return -FDT_ERR_NOTFOUND;
+
+	/* locate the overlay node whose property holds the reference */
+	fixup_off = fdt_path_offset_namelen(fdto, path, path_len);
+	if (fixup_off == -FDT_ERR_NOTFOUND)
+		return -FDT_ERR_BADOVERLAY;
+	if (fixup_off < 0)
+		return fixup_off;
+
+	/* write the phandle at byte offset poffset of that property */
+	phandle_prop = cpu_to_fdt32(phandle);
+	return fdt_setprop_inplace_namelen_partial(fdto, fixup_off,
+						   name, name_len, poffset,
+						   &phandle_prop,
+						   sizeof(phandle_prop));
+}
+
+/**
+ * overlay_fixup_phandle - Set an overlay phandle to the base one
+ * @fdt: Base Device Tree blob
+ * @fdto: Device tree overlay blob
+ * @symbols_off: Node offset of the symbols node in the base device tree
+ * @property: Property offset in the overlay holding the list of fixups
+ *
+ * overlay_fixup_phandle() resolves all the overlay phandles pointed
+ * to in a __fixups__ property, and updates them to match the phandles
+ * in use in the base device tree.
+ *
+ * This is part of the device tree overlay application process, when
+ * you want all the phandles in the overlay to point to the actual
+ * base dt nodes.
+ *
+ * returns:
+ *      0 on success
+ *      Negative error code on failure
+ */
+static int overlay_fixup_phandle(void *fdt, void *fdto, int symbols_off,
+				 int property)
+{
+	const char *value;
+	const char *label;
+	int len;
+
+	/* the property name is the label, its value the list of fixups */
+	value = fdt_getprop_by_offset(fdto, property,
+				      &label, &len);
+	if (!value) {
+		if (len == -FDT_ERR_NOTFOUND)
+			return -FDT_ERR_INTERNAL;
+
+		return len;
+	}
+
+	/*
+	 * The value is a sequence of NUL-terminated strings, each of the
+	 * form "<path>:<property name>:<decimal offset>".
+	 */
+	do {
+		const char *path, *name, *fixup_end;
+		const char *fixup_str = value;
+		uint32_t path_len, name_len;
+		uint32_t fixup_len;
+		char *sep, *endptr;
+		int poffset, ret;
+
+		/* every entry must be terminated within the property */
+		fixup_end = memchr(value, '\0', len);
+		if (!fixup_end)
+			return -FDT_ERR_BADOVERLAY;
+		fixup_len = fixup_end - fixup_str;
+
+		/* advance past this entry and its terminator */
+		len -= fixup_len + 1;
+		value += fixup_len + 1;
+
+		/* first field: path of the overlay node */
+		path = fixup_str;
+		sep = memchr(fixup_str, ':', fixup_len);
+		if (!sep || *sep != ':')
+			return -FDT_ERR_BADOVERLAY;
+
+		path_len = sep - path;
+		/* nothing after the first ':' */
+		if (path_len == (fixup_len - 1))
+			return -FDT_ERR_BADOVERLAY;
+
+		/* second field: property name, which must not be empty */
+		fixup_len -= path_len + 1;
+		name = sep + 1;
+		sep = memchr(name, ':', fixup_len);
+		if (!sep || *sep != ':')
+			return -FDT_ERR_BADOVERLAY;
+
+		name_len = sep - name;
+		if (!name_len)
+			return -FDT_ERR_BADOVERLAY;
+
+		/* third field: decimal offset, which must fill the rest */
+		poffset = strtoul(sep + 1, &endptr, 10);
+		if ((*endptr != '\0') || (endptr <= (sep + 1)))
+			return -FDT_ERR_BADOVERLAY;
+
+		ret = overlay_fixup_one_phandle(fdt, fdto, symbols_off,
+						path, path_len, name, name_len,
+						poffset, label);
+		if (ret)
+			return ret;
+	} while (len > 0);
+
+	return 0;
+}
+
+/**
+ * overlay_fixup_phandles - Resolve the overlay phandles to the base
+ *                          device tree
+ * @fdt: Base Device Tree blob
+ * @fdto: Device tree overlay blob
+ *
+ * overlay_fixup_phandles() resolves all the overlay phandles pointing
+ * to nodes in the base device tree.
+ *
+ * This is one of the steps of the device tree overlay application
+ * process, when you want all the phandles in the overlay to point to
+ * the actual base dt nodes.
+ *
+ * returns:
+ *      0 on success
+ *      Negative error code on failure
+ */
+static int overlay_fixup_phandles(void *fdt, void *fdto)
+{
+	int fixups_off, symbols_off;
+	int property;
+	int ret;
+
+	/* An overlay without a fixups node has nothing to resolve */
+	fixups_off = fdt_path_offset(fdto, "/__fixups__");
+	if (fixups_off < 0)
+		return (fixups_off == -FDT_ERR_NOTFOUND) ? 0 : fixups_off;
+
+	/*
+	 * A base tree without symbols is tolerated at this point; the
+	 * "not found" code is passed down to the per-property step.
+	 */
+	symbols_off = fdt_path_offset(fdt, "/__symbols__");
+	if (symbols_off < 0 && symbols_off != -FDT_ERR_NOTFOUND)
+		return symbols_off;
+
+	fdt_for_each_property_offset(property, fdto, fixups_off) {
+		ret = overlay_fixup_phandle(fdt, fdto, symbols_off, property);
+		if (ret != 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * overlay_apply_node - Merges a node into the base device tree
+ * @fdt: Base Device Tree blob
+ * @target: Node offset in the base device tree to apply the fragment to
+ * @fdto: Device tree overlay blob
+ * @node: Node offset in the overlay holding the changes to merge
+ *
+ * overlay_apply_node() merges a node into a target base device tree
+ * node pointed.
+ *
+ * This is part of the final step in the device tree overlay
+ * application process, when all the phandles have been adjusted and
+ * resolved and you just have to merge overlay into the base device
+ * tree.
+ *
+ * returns:
+ *      0 on success
+ *      Negative error code on failure
+ */
+static int overlay_apply_node(void *fdt, int target,
+			      void *fdto, int node)
+{
+	int prop_off;
+	int child;
+	int ret;
+
+	/* Copy every property of the overlay node onto the target node */
+	fdt_for_each_property_offset(prop_off, fdto, node) {
+		const char *prop_name;
+		int value_len;
+		const void *value = fdt_getprop_by_offset(fdto, prop_off,
+							  &prop_name,
+							  &value_len);
+
+		if (value_len < 0)
+			return (value_len == -FDT_ERR_NOTFOUND)
+				? -FDT_ERR_INTERNAL : value_len;
+
+		ret = fdt_setprop(fdt, target, prop_name, value, value_len);
+		if (ret != 0)
+			return ret;
+	}
+
+	/* Then merge each subnode, creating it in the target when missing */
+	fdt_for_each_subnode(child, fdto, node) {
+		const char *child_name = fdt_get_name(fdto, child, NULL);
+		int dest = fdt_add_subnode(fdt, target, child_name);
+
+		if (dest == -FDT_ERR_EXISTS) {
+			dest = fdt_subnode_offset(fdt, target, child_name);
+			/* it was just reported as existing */
+			if (dest == -FDT_ERR_NOTFOUND)
+				return -FDT_ERR_INTERNAL;
+		}
+		if (dest < 0)
+			return dest;
+
+		ret = overlay_apply_node(fdt, dest, fdto, child);
+		if (ret != 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * overlay_merge - Merge an overlay into its base device tree
+ * @fdt: Base Device Tree blob
+ * @fdto: Device tree overlay blob
+ *
+ * overlay_merge() merges an overlay into its base device tree.
+ *
+ * This is the next to last step in the device tree overlay application
+ * process, when all the phandles have been adjusted and resolved and
+ * you just have to merge overlay into the base device tree.
+ *
+ * returns:
+ *      0 on success
+ *      Negative error code on failure
+ */
+static int overlay_merge(void *fdt, void *fdto)
+{
+	int frag;
+
+	fdt_for_each_subnode(frag, fdto, 0) {
+		int ret;
+		int target;
+		int content = fdt_subnode_offset(fdto, frag, "__overlay__");
+
+		if (content < 0) {
+			/*
+			 * Each fragment has an __overlay__ node; a root
+			 * subnode without one is not meant to be merged.
+			 */
+			if (content == -FDT_ERR_NOTFOUND)
+				continue;
+			return content;
+		}
+
+		target = overlay_get_target(fdt, fdto, frag, NULL);
+		if (target < 0)
+			return target;
+
+		ret = overlay_apply_node(fdt, target, fdto, content);
+		if (ret != 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Computes the length of the full path of the node at @nodeoffset by
+ * walking up to the root and adding each name length plus one. The root
+ * itself counts as "/", so the result is at least 1. Returns a negative
+ * error code when a name or parent lookup fails.
+ */
+static int get_path_len(const void *fdt, int nodeoffset)
+{
+	int total = 0, namelen;
+	const char *name;
+
+	FDT_RO_PROBE(fdt);
+
+	while ((name = fdt_get_name(fdt, nodeoffset, &namelen)) != NULL) {
+		/* an empty name marks the root: done */
+		if (namelen == 0)
+			return total ? total : 1;
+
+		nodeoffset = fdt_parent_offset(fdt, nodeoffset);
+		if (nodeoffset < 0)
+			return nodeoffset;
+		total += namelen + 1;
+	}
+
+	/* name lookup failed; namelen carries the error code */
+	return namelen;
+}
+
+/**
+ * overlay_symbol_update - Update the symbols of base tree after a merge
+ * @fdt: Base Device Tree blob
+ * @fdto: Device tree overlay blob
+ *
+ * overlay_symbol_update() updates the symbols of the base tree with the
+ * symbols of the applied overlay
+ *
+ * This is the last step in the device tree overlay application
+ * process, allowing the reference of overlay symbols by subsequent
+ * overlay operations.
+ *
+ * returns:
+ *      0 on success
+ *      Negative error code on failure
+ */
+static int overlay_symbol_update(void *fdt, void *fdto)
+{
+	int root_sym, ov_sym, prop, path_len, fragment, target;
+	int len, frag_name_len, ret, rel_path_len;
+	const char *s, *e;
+	const char *path;
+	const char *name;
+	const char *frag_name;
+	const char *rel_path;
+	const char *target_path;
+	char *buf;
+	void *p;
+
+	ov_sym = fdt_subnode_offset(fdto, 0, "__symbols__");
+
+	/* if no overlay symbols exist no problem */
+	if (ov_sym < 0)
+		return 0;
+
+	root_sym = fdt_subnode_offset(fdt, 0, "__symbols__");
+
+	/* if no root symbols exist we should create them */
+	if (root_sym == -FDT_ERR_NOTFOUND)
+		root_sym = fdt_add_subnode(fdt, 0, "__symbols__");
+
+	/* any error is fatal now */
+	if (root_sym < 0)
+		return root_sym;
+
+	/* iterate over each overlay symbol */
+	fdt_for_each_property_offset(prop, fdto, ov_sym) {
+		path = fdt_getprop_by_offset(fdto, prop, &name, &path_len);
+		if (!path)
+			return path_len;
+
+		/* verify it's a string property (terminated by a single \0) */
+		if (path_len < 1 || memchr(path, '\0', path_len) != &path[path_len - 1])
+			return -FDT_ERR_BADVALUE;
+
+		/* keep end marker to avoid strlen() */
+		e = path + path_len;
+
+		/* symbol values must be absolute paths */
+		if (*path != '/')
+			return -FDT_ERR_BADVALUE;
+
+		/* get fragment name first */
+		s = strchr(path + 1, '/');
+		if (!s) {
+			/* Symbol refers to something that won't end
+			 * up in the target tree */
+			continue;
+		}
+
+		frag_name = path + 1;
+		frag_name_len = s - path - 1;
+
+		/* verify format; safe since "s" lies in \0 terminated prop */
+		len = sizeof("/__overlay__/") - 1;
+		if ((e - s) > len && (memcmp(s, "/__overlay__/", len) == 0)) {
+			/* /<fragment-name>/__overlay__/<relative-subnode-path> */
+			rel_path = s + len;
+			rel_path_len = e - rel_path;
+		} else if ((e - s) == len
+			   && (memcmp(s, "/__overlay__", len - 1) == 0)) {
+			/* /<fragment-name>/__overlay__ */
+			rel_path = "";
+			rel_path_len = 0;
+		} else {
+			/* Symbol refers to something that won't end
+			 * up in the target tree */
+			continue;
+		}
+
+		/* find the fragment index in which the symbol lies */
+		ret = fdt_subnode_offset_namelen(fdto, 0, frag_name,
+					       frag_name_len);
+		/* not found? */
+		if (ret < 0)
+			return -FDT_ERR_BADOVERLAY;
+		fragment = ret;
+
+		/* an __overlay__ subnode must exist */
+		ret = fdt_subnode_offset(fdto, fragment, "__overlay__");
+		if (ret < 0)
+			return -FDT_ERR_BADOVERLAY;
+
+		/* get the target of the fragment */
+		ret = overlay_get_target(fdt, fdto, fragment, &target_path);
+		if (ret < 0)
+			return ret;
+		target = ret;
+
+		/*
+		 * without a target path, compute the path length from the
+		 * target node; otherwise use the length of the target path
+		 */
+		if (!target_path) {
+			ret = get_path_len(fdt, target);
+			if (ret < 0)
+				return ret;
+			len = ret;
+		} else {
+			len = strlen(target_path);
+		}
+
+		/*
+		 * reserve room for: target path, a '/' separator unless the
+		 * target is the root (len == 1), the relative path, and '\0'
+		 */
+		ret = fdt_setprop_placeholder(fdt, root_sym, name,
+				len + (len > 1) + rel_path_len + 1, &p);
+		if (ret < 0)
+			return ret;
+
+		if (!target_path) {
+			/* again in case setprop_placeholder changed it */
+			ret = overlay_get_target(fdt, fdto, fragment, &target_path);
+			if (ret < 0)
+				return ret;
+			target = ret;
+		}
+
+		buf = p;
+		if (len > 1) { /* target is not root */
+			if (!target_path) {
+				ret = fdt_get_path(fdt, target, buf, len + 1);
+				if (ret < 0)
+					return ret;
+			} else
+				memcpy(buf, target_path, len + 1);
+
+		} else
+			/* root target: the '/' written below is the whole prefix */
+			len--;
+
+		/* append "/<relative path>" and terminate the string */
+		buf[len] = '/';
+		memcpy(buf + len + 1, rel_path, rel_path_len);
+		buf[len + 1 + rel_path_len] = '\0';
+	}
+
+	return 0;
+}
+
+/*
+ * fdt_overlay_apply - apply the overlay @fdto to the base tree @fdt
+ *
+ * Runs the overlay steps in order and stops at the first failure. The
+ * overlay's magic is always erased afterwards; the base tree's magic is
+ * erased as well when a step failed. Returns 0 on success or the failing
+ * step's error code.
+ */
+int fdt_overlay_apply(void *fdt, void *fdto)
+{
+	uint32_t delta;
+	int ret;
+
+	FDT_RO_PROBE(fdt);
+	FDT_RO_PROBE(fdto);
+
+	/* Each step runs only while all previous ones succeeded */
+	ret = fdt_find_max_phandle(fdt, &delta);
+	if (!ret)
+		ret = overlay_adjust_local_phandles(fdto, delta);
+	if (!ret)
+		ret = overlay_update_local_references(fdto, delta);
+	if (!ret)
+		ret = overlay_fixup_phandles(fdt, fdto);
+	if (!ret)
+		ret = overlay_merge(fdt, fdto);
+	if (!ret)
+		ret = overlay_symbol_update(fdt, fdto);
+
+	/*
+	 * The overlay has been (or might have been) damaged, erase its magic.
+	 */
+	fdt_set_magic(fdto, ~0);
+
+	/*
+	 * On failure the base device tree might have been damaged too, so
+	 * erase its magic as well.
+	 */
+	if (ret)
+		fdt_set_magic(fdt, ~0);
+
+	return ret;
+}
diff --git a/fdt/fdt_ro.c b/fdt/fdt_ro.c
new file mode 100644
index 0000000000..a5c2797cde
--- /dev/null
+++ b/fdt/fdt_ro.c
@@ -0,0 +1,898 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ */
+#include "libfdt_env.h"
+
+#include <fdt.h>
+#include <libfdt.h>
+
+#include "libfdt_internal.h"
+/* Return 1 if the node at offset is named s[0..len), ignoring the node's "@..." suffix when s has no '@'; else 0. */
+static int fdt_nodename_eq_(const void *fdt, int offset,
+			    const char *s, int len)
+{
+	int olen;
+	const char *p = fdt_get_name(fdt, offset, &olen);
+
+	if (!p || olen < len)
+		/* short match */
+		return 0;
+
+	if (memcmp(p, s, len) != 0)
+		return 0;
+
+	if (p[len] == '\0') /* exact match */
+		return 1;
+	else if (!memchr(s, '@', len) && (p[len] == '@')) /* s names the node without its '@' part */
+		return 1;
+	else
+		return 0;
+}
+/* Return the string at stroffset in the strings block, length in *lenp; on failure NULL with a negative error in *lenp. */
+const char *fdt_get_string(const void *fdt, int stroffset, int *lenp)
+{
+	int32_t totalsize = fdt_ro_probe_(fdt);
+	uint32_t absoffset = stroffset + fdt_off_dt_strings(fdt);
+	size_t len;
+	int err;
+	const char *s, *n;
+
+	err = totalsize; /* a negative probe result is the error to report */
+	if (totalsize < 0)
+		goto fail;
+
+	err = -FDT_ERR_BADOFFSET;
+	if (absoffset >= totalsize)
+		goto fail;
+	len = totalsize - absoffset; /* upper bound: bytes left before the end of the blob */
+
+	if (fdt_magic(fdt) == FDT_MAGIC) {
+		if (stroffset < 0)
+			goto fail;
+		if (fdt_version(fdt) >= 17) {
+			if (stroffset >= fdt_size_dt_strings(fdt))
+				goto fail;
+			if ((fdt_size_dt_strings(fdt) - stroffset) < len)
+				len = fdt_size_dt_strings(fdt) - stroffset;
+		}
+	} else if (fdt_magic(fdt) == FDT_SW_MAGIC) { /* here only negative offsets are accepted */
+		if ((stroffset >= 0)
+		    || (stroffset < -fdt_size_dt_strings(fdt)))
+			goto fail;
+		if ((-stroffset) < len)
+			len = -stroffset;
+	} else {
+		err = -FDT_ERR_INTERNAL;
+		goto fail;
+	}
+
+	s = (const char *)fdt + absoffset;
+	n = memchr(s, '\0', len);
+	if (!n) {
+		/* missing terminating NUL */
+		err = -FDT_ERR_TRUNCATED;
+		goto fail;
+	}
+
+	if (lenp)
+		*lenp = n - s;
+	return s;
+
+fail:
+	if (lenp)
+		*lenp = err;
+	return NULL;
+}
+/* Convenience form of fdt_get_string() that discards the length / error code. */
+const char *fdt_string(const void *fdt, int stroffset)
+{
+	return fdt_get_string(fdt, stroffset, NULL);
+}
+/* Return nonzero iff the string at stroffset exists and is exactly the len bytes at s. */
+static int fdt_string_eq_(const void *fdt, int stroffset,
+			  const char *s, int len)
+{
+	int slen;
+	const char *p = fdt_get_string(fdt, stroffset, &slen);
+
+	return p && (slen == len) && (memcmp(p, s, len) == 0);
+}
+/* Store the largest phandle found on any node in *phandle (if non-NULL); returns 0 or a negative traversal error. */
+int fdt_find_max_phandle(const void *fdt, uint32_t *phandle)
+{
+	uint32_t max = 0;
+	int offset = -1;
+
+	while (true) {
+		uint32_t value;
+
+		offset = fdt_next_node(fdt, offset, NULL);
+		if (offset < 0) {
+			if (offset == -FDT_ERR_NOTFOUND) /* no more nodes: normal end of the walk */
+				break;
+
+			return offset;
+		}
+
+		value = fdt_get_phandle(fdt, offset); /* 0 when the node has no usable phandle */
+
+		if (value > max)
+			max = value;
+	}
+
+	if (phandle)
+		*phandle = max;
+
+	return 0;
+}
+/* Store (largest phandle in use) + 1 in *phandle (if non-NULL); -FDT_ERR_NOPHANDLES if the largest is FDT_MAX_PHANDLE. */
+int fdt_generate_phandle(const void *fdt, uint32_t *phandle)
+{
+	uint32_t max;
+	int err;
+
+	err = fdt_find_max_phandle(fdt, &max);
+	if (err < 0)
+		return err;
+
+	if (max == FDT_MAX_PHANDLE)
+		return -FDT_ERR_NOPHANDLES;
+
+	if (phandle)
+		*phandle = max + 1;
+
+	return 0;
+}
+/* Bounds-checked lookup of reserve-map entry n: NULL unless a whole entry fits between the map start and totalsize. */
+static const struct fdt_reserve_entry *fdt_mem_rsv(const void *fdt, int n)
+{
+	int offset = n * sizeof(struct fdt_reserve_entry);
+	int absoffset = fdt_off_mem_rsvmap(fdt) + offset;
+
+	if (absoffset < fdt_off_mem_rsvmap(fdt)) /* entry would start before the map, e.g. negative n */
+		return NULL;
+	if (absoffset > fdt_totalsize(fdt) - sizeof(struct fdt_reserve_entry))
+		return NULL;
+	return fdt_mem_rsv_(fdt, n);
+}
+/* Read reserve-map entry n into *address and *size; -FDT_ERR_BADOFFSET if fdt_mem_rsv() rejects n. */
+int fdt_get_mem_rsv(const void *fdt, int n, uint64_t *address, uint64_t *size)
+{
+	const struct fdt_reserve_entry *re;
+
+	FDT_RO_PROBE(fdt);
+	re = fdt_mem_rsv(fdt, n);
+	if (!re)
+		return -FDT_ERR_BADOFFSET;
+
+	*address = fdt64_ld(&re->address);
+	*size = fdt64_ld(&re->size);
+	return 0;
+}
+/* Return the index of the first reserve-map entry whose size is 0; -FDT_ERR_TRUNCATED if none is found in bounds. */
+int fdt_num_mem_rsv(const void *fdt)
+{
+	int i;
+	const struct fdt_reserve_entry *re;
+
+	for (i = 0; (re = fdt_mem_rsv(fdt, i)) != NULL; i++) {
+		if (fdt64_ld(&re->size) == 0) /* zero-size entry ends the map */
+			return i;
+	}
+	return -FDT_ERR_TRUNCATED;
+}
+/* From offset, skip FDT_NOP tags and return the offset of the first FDT_PROP; negative error if another tag comes first. */
+static int nextprop_(const void *fdt, int offset)
+{
+	uint32_t tag;
+	int nextoffset;
+
+	do {
+		tag = fdt_next_tag(fdt, offset, &nextoffset);
+
+		switch (tag) {
+		case FDT_END:
+			if (nextoffset >= 0)
+				return -FDT_ERR_BADSTRUCTURE;
+			else
+				return nextoffset; /* error code left in nextoffset by fdt_next_tag() */
+
+		case FDT_PROP:
+			return offset;
+		}
+		offset = nextoffset;
+	} while (tag == FDT_NOP);
+
+	return -FDT_ERR_NOTFOUND; /* hit a tag that is neither NOP, PROP nor END */
+}
+/* Return the offset of the depth-1 node below offset whose name matches name[0..namelen), or a negative error. */
+int fdt_subnode_offset_namelen(const void *fdt, int offset,
+			       const char *name, int namelen)
+{
+	int depth;
+
+	FDT_RO_PROBE(fdt);
+
+	for (depth = 0;
+	     (offset >= 0) && (depth >= 0);
+	     offset = fdt_next_node(fdt, offset, &depth))
+		if ((depth == 1)
+		    && fdt_nodename_eq_(fdt, offset, name, namelen))
+			return offset;
+
+	if (depth < 0) /* walked out of the starting node without a match */
+		return -FDT_ERR_NOTFOUND;
+	return offset; /* error */
+}
+/* As fdt_subnode_offset_namelen(), for a NUL-terminated name. */
+int fdt_subnode_offset(const void *fdt, int parentoffset,
+		       const char *name)
+{
+	return fdt_subnode_offset_namelen(fdt, parentoffset, name, strlen(name));
+}
+/* Resolve path[0..namelen) to a node offset; if it does not start with '/', its first component is looked up as an alias. */
+int fdt_path_offset_namelen(const void *fdt, const char *path, int namelen)
+{
+	const char *end = path + namelen;
+	const char *p = path;
+	int offset = 0;
+
+	FDT_RO_PROBE(fdt);
+
+	/* see if we have an alias */
+	if (*path != '/') {
+		const char *q = memchr(path, '/', end - p);
+
+		if (!q)
+			q = end;
+
+		p = fdt_get_alias_namelen(fdt, p, q - p);
+		if (!p)
+			return -FDT_ERR_BADPATH;
+		offset = fdt_path_offset(fdt, p); /* a negative result falls through the loop below and is returned */
+
+		p = q;
+	}
+
+	while (p < end) {
+		const char *q;
+
+		while (*p == '/') { /* skip separators; trailing ones end the walk */
+			p++;
+			if (p == end)
+				return offset;
+		}
+		q = memchr(p, '/', end - p);
+		if (! q)
+			q = end;
+
+		offset = fdt_subnode_offset_namelen(fdt, offset, p, q-p);
+		if (offset < 0)
+			return offset;
+
+		p = q;
+	}
+
+	return offset;
+}
+/* As fdt_path_offset_namelen(), for a NUL-terminated path. */
+int fdt_path_offset(const void *fdt, const char *path)
+{
+	return fdt_path_offset_namelen(fdt, path, strlen(path));
+}
+/* Return the name of the node at nodeoffset, length in *len; on failure NULL with a negative error in *len. */
+const char *fdt_get_name(const void *fdt, int nodeoffset, int *len)
+{
+	const struct fdt_node_header *nh = fdt_offset_ptr_(fdt, nodeoffset); /* only dereferenced after the checks below */
+	const char *nameptr;
+	int err;
+
+	if (((err = fdt_ro_probe_(fdt)) < 0)
+	    || ((err = fdt_check_node_offset_(fdt, nodeoffset)) < 0))
+			goto fail;
+
+	nameptr = nh->name;
+
+	if (fdt_version(fdt) < 0x10) {
+		/*
+		 * For old FDT versions, match the naming conventions of V16:
+		 * give only the leaf name (after all /). The actual tree
+		 * contents are loosely checked.
+		 */
+		const char *leaf;
+		leaf = strrchr(nameptr, '/');
+		if (leaf == NULL) {
+			err = -FDT_ERR_BADSTRUCTURE;
+			goto fail;
+		}
+		nameptr = leaf+1;
+	}
+
+	if (len)
+		*len = strlen(nameptr);
+
+	return nameptr;
+
+ fail:
+	if (len)
+		*len = err;
+	return NULL;
+}
+/* Return the offset of the first property of the node at nodeoffset, or a negative error. */
+int fdt_first_property_offset(const void *fdt, int nodeoffset)
+{
+	int offset;
+
+	if ((offset = fdt_check_node_offset_(fdt, nodeoffset)) < 0)
+		return offset;
+
+	return nextprop_(fdt, offset); /* scan starts at the offset the check returned */
+}
+/* Return the offset of the next property after the one at offset, or a negative error. */
+int fdt_next_property_offset(const void *fdt, int offset)
+{
+	if ((offset = fdt_check_prop_offset_(fdt, offset)) < 0)
+		return offset;
+
+	return nextprop_(fdt, offset);
+}
+/* Return the property at offset with its length in *lenp; NULL and the error in *lenp if the offset check fails. No version check. */
+static const struct fdt_property *fdt_get_property_by_offset_(const void *fdt,
+						              int offset,
+						              int *lenp)
+{
+	int err;
+	const struct fdt_property *prop;
+
+	if ((err = fdt_check_prop_offset_(fdt, offset)) < 0) {
+		if (lenp)
+			*lenp = err;
+		return NULL;
+	}
+
+	prop = fdt_offset_ptr_(fdt, offset);
+
+	if (lenp)
+		*lenp = fdt32_ld(&prop->len);
+
+	return prop;
+}
+/* Public form of fdt_get_property_by_offset_(); fails with -FDT_ERR_BADVERSION in *lenp for versions below 0x10 (16). */
+const struct fdt_property *fdt_get_property_by_offset(const void *fdt,
+						      int offset,
+						      int *lenp)
+{
+	/* Prior to version 16, properties may need realignment
+	 * and this API does not work. fdt_getprop_*() will, however. */
+
+	if (fdt_version(fdt) < 0x10) {
+		if (lenp)
+			*lenp = -FDT_ERR_BADVERSION;
+		return NULL;
+	}
+
+	return fdt_get_property_by_offset_(fdt, offset, lenp);
+}
+/* Scan the node's properties for one named name[0..namelen); on a hit, *lenp is its length and *poffset (if set) its offset. */
+static const struct fdt_property *fdt_get_property_namelen_(const void *fdt,
+						            int offset,
+						            const char *name,
+						            int namelen,
+							    int *lenp,
+							    int *poffset)
+{
+	for (offset = fdt_first_property_offset(fdt, offset);
+	     (offset >= 0);
+	     (offset = fdt_next_property_offset(fdt, offset))) {
+		const struct fdt_property *prop;
+
+		if (!(prop = fdt_get_property_by_offset_(fdt, offset, lenp))) {
+			offset = -FDT_ERR_INTERNAL;
+			break;
+		}
+		if (fdt_string_eq_(fdt, fdt32_ld(&prop->nameoff),
+				   name, namelen)) {
+			if (poffset)
+				*poffset = offset;
+			return prop;
+		}
+	}
+
+	if (lenp)
+		*lenp = offset; /* negative here: the error that ended the scan */
+	return NULL;
+}
+
+/* Public form of fdt_get_property_namelen_(); fails with -FDT_ERR_BADVERSION in *lenp for versions below 0x10 (16). */
+const struct fdt_property *fdt_get_property_namelen(const void *fdt,
+						    int offset,
+						    const char *name,
+						    int namelen, int *lenp)
+{
+	/* Prior to version 16, properties may need realignment
+	 * and this API does not work. fdt_getprop_*() will, however. */
+	if (fdt_version(fdt) < 0x10) {
+		if (lenp)
+			*lenp = -FDT_ERR_BADVERSION;
+		return NULL;
+	}
+
+	return fdt_get_property_namelen_(fdt, offset, name, namelen, lenp,
+					 NULL);
+}
+
+/* As fdt_get_property_namelen(), for a NUL-terminated name. */
+const struct fdt_property *fdt_get_property(const void *fdt,
+					    int nodeoffset,
+					    const char *name, int *lenp)
+{
+	return fdt_get_property_namelen(fdt, nodeoffset, name,
+					strlen(name), lenp);
+}
+/* Return a pointer to the value of property name[0..namelen) of the node, length in *lenp; NULL with an error in *lenp on failure. */
+const void *fdt_getprop_namelen(const void *fdt, int nodeoffset,
+				const char *name, int namelen, int *lenp)
+{
+	int poffset;
+	const struct fdt_property *prop;
+
+	prop = fdt_get_property_namelen_(fdt, nodeoffset, name, namelen, lenp,
+					 &poffset);
+	if (!prop)
+		return NULL;
+
+	/* Handle realignment: below version 0x10, skip 4 bytes when a value of 8+ bytes would start off an 8-byte offset */
+	if (fdt_version(fdt) < 0x10 && (poffset + sizeof(*prop)) % 8 &&
+	    fdt32_ld(&prop->len) >= 8)
+		return prop->data + 4;
+	return prop->data;
+}
+/* Return the value of the property at offset, length in *lenp and, if namep is set, its name in *namep; NULL on failure. */
+const void *fdt_getprop_by_offset(const void *fdt, int offset,
+				  const char **namep, int *lenp)
+{
+	const struct fdt_property *prop;
+
+	prop = fdt_get_property_by_offset_(fdt, offset, lenp);
+	if (!prop)
+		return NULL;
+	if (namep) {
+		const char *name;
+		int namelen;
+		name = fdt_get_string(fdt, fdt32_ld(&prop->nameoff),
+				      &namelen);
+		if (!name) {
+			if (lenp)
+				*lenp = namelen; /* holds the error code from fdt_get_string() */
+			return NULL;
+		}
+		*namep = name;
+	}
+
+	/* Handle realignment: below version 0x10, skip 4 bytes when a value of 8+ bytes would start off an 8-byte offset */
+	if (fdt_version(fdt) < 0x10 && (offset + sizeof(*prop)) % 8 &&
+	    fdt32_ld(&prop->len) >= 8)
+		return prop->data + 4;
+	return prop->data;
+}
+/* As fdt_getprop_namelen(), for a NUL-terminated name. */
+const void *fdt_getprop(const void *fdt, int nodeoffset,
+			const char *name, int *lenp)
+{
+	return fdt_getprop_namelen(fdt, nodeoffset, name, strlen(name), lenp);
+}
+/* Return the node's "phandle" value, falling back to "linux,phandle"; 0 if neither exists with the size of one fdt32_t. */
+uint32_t fdt_get_phandle(const void *fdt, int nodeoffset)
+{
+	const fdt32_t *php;
+	int len;
+
+	/* FIXME: This is a bit sub-optimal, since we potentially scan
+	 * over all the properties twice. */
+	php = fdt_getprop(fdt, nodeoffset, "phandle", &len);
+	if (!php || (len != sizeof(*php))) {
+		php = fdt_getprop(fdt, nodeoffset, "linux,phandle", &len);
+		if (!php || (len != sizeof(*php)))
+			return 0;
+	}
+
+	return fdt32_ld(php);
+}
+/* Return the value of property name[0..namelen) of the "/aliases" node, or NULL if the node or property is missing. */
+const char *fdt_get_alias_namelen(const void *fdt,
+				  const char *name, int namelen)
+{
+	int aliasoffset;
+
+	aliasoffset = fdt_path_offset(fdt, "/aliases");
+	if (aliasoffset < 0)
+		return NULL;
+
+	return fdt_getprop_namelen(fdt, aliasoffset, name, namelen, NULL);
+}
+/* As fdt_get_alias_namelen(), for a NUL-terminated name. */
+const char *fdt_get_alias(const void *fdt, const char *name)
+{
+	return fdt_get_alias_namelen(fdt, name, strlen(name));
+}
+/* Write the full path of the node at nodeoffset into buf[0..buflen); 0 on success, -FDT_ERR_NOSPACE if it does not fit. */
+int fdt_get_path(const void *fdt, int nodeoffset, char *buf, int buflen)
+{
+	int pdepth = 0, p = 0;
+	int offset, depth, namelen;
+	const char *name;
+
+	FDT_RO_PROBE(fdt);
+
+	if (buflen < 2)
+		return -FDT_ERR_NOSPACE;
+
+	for (offset = 0, depth = 0;
+	     (offset >= 0) && (offset <= nodeoffset);
+	     offset = fdt_next_node(fdt, offset, &depth)) {
+		while (pdepth > depth) { /* moved back up: drop trailing components from buf */
+			do {
+				p--;
+			} while (buf[p-1] != '/');
+			pdepth--;
+		}
+
+		if (pdepth >= depth) { /* buf holds every ancestor: append this node's name and a '/' */
+			name = fdt_get_name(fdt, offset, &namelen);
+			if (!name)
+				return namelen;
+			if ((p + namelen + 1) <= buflen) {
+				memcpy(buf + p, name, namelen);
+				p += namelen;
+				buf[p++] = '/';
+				pdepth++;
+			}
+		}
+
+		if (offset == nodeoffset) {
+			if (pdepth < (depth + 1)) /* a component was left out for lack of space */
+				return -FDT_ERR_NOSPACE;
+
+			if (p > 1) /* special case so that root path is "/", not "" */
+				p--;
+			buf[p] = '\0';
+			return 0;
+		}
+	}
+
+	if ((offset == -FDT_ERR_NOTFOUND) || (offset >= 0))
+		return -FDT_ERR_BADOFFSET;
+	else if (offset == -FDT_ERR_BADOFFSET)
+		return -FDT_ERR_BADSTRUCTURE;
+
+	return offset; /* error from fdt_next_node() */
+}
+/* Return the offset of nodeoffset's ancestor at depth supernodedepth (offset 0 is depth 0); *nodedepth, if set, gets the node's depth. */
+int fdt_supernode_atdepth_offset(const void *fdt, int nodeoffset,
+				 int supernodedepth, int *nodedepth)
+{
+	int offset, depth;
+	int supernodeoffset = -FDT_ERR_INTERNAL;
+
+	FDT_RO_PROBE(fdt);
+
+	if (supernodedepth < 0)
+		return -FDT_ERR_NOTFOUND;
+
+	for (offset = 0, depth = 0;
+	     (offset >= 0) && (offset <= nodeoffset);
+	     offset = fdt_next_node(fdt, offset, &depth)) {
+		if (depth == supernodedepth) /* remember the latest node seen at the wanted depth */
+			supernodeoffset = offset;
+
+		if (offset == nodeoffset) {
+			if (nodedepth)
+				*nodedepth = depth;
+
+			if (supernodedepth > depth)
+				return -FDT_ERR_NOTFOUND;
+			else
+				return supernodeoffset;
+		}
+	}
+
+	if ((offset == -FDT_ERR_NOTFOUND) || (offset >= 0))
+		return -FDT_ERR_BADOFFSET;
+	else if (offset == -FDT_ERR_BADOFFSET)
+		return -FDT_ERR_BADSTRUCTURE;
+
+	return offset; /* error from fdt_next_node() */
+}
+/* Return the depth of the node at nodeoffset as reported by fdt_supernode_atdepth_offset(), or a negative error. */
+int fdt_node_depth(const void *fdt, int nodeoffset)
+{
+	int nodedepth;
+	int err;
+
+	err = fdt_supernode_atdepth_offset(fdt, nodeoffset, 0, &nodedepth);
+	if (err) /* the depth-0 ancestor sits at offset 0, so anything nonzero is a failure */
+		return (err < 0) ? err : -FDT_ERR_INTERNAL;
+	return nodedepth;
+}
+/* Return the offset of the ancestor one level above nodeoffset; a depth-0 node yields -FDT_ERR_NOTFOUND (depth -1). */
+int fdt_parent_offset(const void *fdt, int nodeoffset)
+{
+	int nodedepth = fdt_node_depth(fdt, nodeoffset);
+
+	if (nodedepth < 0)
+		return nodedepth;
+	return fdt_supernode_atdepth_offset(fdt, nodeoffset,
+					    nodedepth - 1, NULL);
+}
+/* Return the first node after startoffset whose property propname is exactly the proplen bytes at propval. */
+int fdt_node_offset_by_prop_value(const void *fdt, int startoffset,
+				  const char *propname,
+				  const void *propval, int proplen)
+{
+	int offset;
+	const void *val;
+	int len;
+
+	FDT_RO_PROBE(fdt);
+
+	/* FIXME: The algorithm here is pretty horrible: we scan each
+	 * property of a node in fdt_getprop(), then if that didn't
+	 * find what we want, we scan over them again making our way
+	 * to the next node.  Still it's the easiest to implement
+	 * approach; performance can come later. */
+	for (offset = fdt_next_node(fdt, startoffset, NULL);
+	     offset >= 0;
+	     offset = fdt_next_node(fdt, offset, NULL)) {
+		val = fdt_getprop(fdt, offset, propname, &len);
+		if (val && (len == proplen)
+		    && (memcmp(val, propval, len) == 0))
+			return offset;
+	}
+
+	return offset; /* error from fdt_next_node() */
+}
+/* Return the first node whose fdt_get_phandle() value equals phandle; 0 and -1 are rejected with -FDT_ERR_BADPHANDLE. */
+int fdt_node_offset_by_phandle(const void *fdt, uint32_t phandle)
+{
+	int offset;
+
+	if ((phandle == 0) || (phandle == -1)) /* -1 converts to the all-ones uint32_t value */
+		return -FDT_ERR_BADPHANDLE;
+
+	FDT_RO_PROBE(fdt);
+
+	/* FIXME: The algorithm here is pretty horrible: we
+	 * potentially scan each property of a node in
+	 * fdt_get_phandle(), then if that didn't find what
+	 * we want, we scan over them again making our way to the next
+	 * node.  Still it's the easiest to implement approach;
+	 * performance can come later. */
+	for (offset = fdt_next_node(fdt, -1, NULL);
+	     offset >= 0;
+	     offset = fdt_next_node(fdt, offset, NULL)) {
+		if (fdt_get_phandle(fdt, offset) == phandle)
+			return offset;
+	}
+
+	return offset; /* error from fdt_next_node() */
+}
+/* Return 1 if str is one of the NUL-terminated strings held in strlist[0..listlen), else 0. */
+int fdt_stringlist_contains(const char *strlist, int listlen, const char *str)
+{
+	int len = strlen(str);
+	const char *p;
+
+	while (listlen > len) { /* a match needs len + 1 bytes in bounds: str plus its NUL */
+		if (memcmp(str, strlist, len+1) == 0)
+			return 1;
+		p = memchr(strlist, '\0', listlen);
+		if (!p)
+			return 0; /* malformed strlist.. */
+		listlen -= (p-strlist) + 1; /* step over this entry and its NUL */
+		strlist = p + 1;
+	}
+	return 0;
+}
+/* Count the NUL-terminated strings in the given property; -FDT_ERR_BADVALUE if the last one is unterminated. */
+int fdt_stringlist_count(const void *fdt, int nodeoffset, const char *property)
+{
+	const char *list, *end;
+	int length, count = 0;
+
+	list = fdt_getprop(fdt, nodeoffset, property, &length);
+	if (!list)
+		return length; /* error code from fdt_getprop() */
+
+	end = list + length;
+
+	while (list < end) {
+		length = strnlen(list, end - list) + 1; /* length reused: size of this entry incl. NUL */
+
+		/* Abort if the last string isn't properly NUL-terminated. */
+		if (list + length > end)
+			return -FDT_ERR_BADVALUE;
+
+		list += length;
+		count++;
+	}
+
+	return count;
+}
+/* Return the index of string within the property's string list, -FDT_ERR_NOTFOUND if absent, or another negative error. */
+int fdt_stringlist_search(const void *fdt, int nodeoffset, const char *property,
+			  const char *string)
+{
+	int length, len, idx = 0;
+	const char *list, *end;
+
+	list = fdt_getprop(fdt, nodeoffset, property, &length);
+	if (!list)
+		return length; /* error code from fdt_getprop() */
+
+	len = strlen(string) + 1; /* compare including the terminating NUL */
+	end = list + length;
+
+	while (list < end) {
+		length = strnlen(list, end - list) + 1;
+
+		/* Abort if the last string isn't properly NUL-terminated. */
+		if (list + length > end)
+			return -FDT_ERR_BADVALUE;
+
+		if (length == len && memcmp(list, string, length) == 0)
+			return idx;
+
+		list += length;
+		idx++;
+	}
+
+	return -FDT_ERR_NOTFOUND;
+}
+/* Return the idx-th string of the property's string list, its length in *lenp; NULL with an error in *lenp on failure. */
+const char *fdt_stringlist_get(const void *fdt, int nodeoffset,
+			       const char *property, int idx,
+			       int *lenp)
+{
+	const char *list, *end;
+	int length;
+
+	list = fdt_getprop(fdt, nodeoffset, property, &length);
+	if (!list) {
+		if (lenp)
+			*lenp = length;
+
+		return NULL;
+	}
+
+	end = list + length;
+
+	while (list < end) {
+		length = strnlen(list, end - list) + 1;
+
+		/* Abort if the last string isn't properly NUL-terminated. */
+		if (list + length > end) {
+			if (lenp)
+				*lenp = -FDT_ERR_BADVALUE;
+
+			return NULL;
+		}
+
+		if (idx == 0) { /* counted down to the requested entry */
+			if (lenp)
+				*lenp = length - 1;
+
+			return list;
+		}
+
+		list += length;
+		idx--;
+	}
+
+	if (lenp)
+		*lenp = -FDT_ERR_NOTFOUND;
+
+	return NULL;
+}
+/* Return 0 if compatible is listed in the node's "compatible" property, 1 if not, or the error from reading the property. */
+int fdt_node_check_compatible(const void *fdt, int nodeoffset,
+			      const char *compatible)
+{
+	const void *prop;
+	int len;
+
+	prop = fdt_getprop(fdt, nodeoffset, "compatible", &len);
+	if (!prop)
+		return len;
+
+	return !fdt_stringlist_contains(prop, len, compatible);
+}
+/* Return the first node after startoffset for which fdt_node_check_compatible() returns 0, or a negative error. */
+int fdt_node_offset_by_compatible(const void *fdt, int startoffset,
+				  const char *compatible)
+{
+	int offset, err;
+
+	FDT_RO_PROBE(fdt);
+
+	/* FIXME: The algorithm here is pretty horrible: we scan each
+	 * property of a node in fdt_node_check_compatible(), then if
+	 * that didn't find what we want, we scan over them again
+	 * making our way to the next node.  Still it's the easiest to
+	 * implement approach; performance can come later. */
+	for (offset = fdt_next_node(fdt, startoffset, NULL);
+	     offset >= 0;
+	     offset = fdt_next_node(fdt, offset, NULL)) {
+		err = fdt_node_check_compatible(fdt, offset, compatible);
+		if ((err < 0) && (err != -FDT_ERR_NOTFOUND)) /* a missing property just means "keep looking" */
+			return err;
+		else if (err == 0)
+			return offset;
+	}
+
+	return offset; /* error from fdt_next_node() */
+}
+/* Check the header, the reserve map and every tag of the blob held in a bufsize-byte buffer; 0 if no problem is found. */
+int fdt_check_full(const void *fdt, size_t bufsize)
+{
+	int err;
+	int num_memrsv;
+	int offset, nextoffset = 0;
+	uint32_t tag;
+	unsigned depth = 0;
+	const void *prop;
+	const char *propname;
+
+	if (bufsize < FDT_V1_SIZE)
+		return -FDT_ERR_TRUNCATED;
+	err = fdt_check_header(fdt);
+	if (err != 0)
+		return err;
+	if (bufsize < fdt_totalsize(fdt))
+		return -FDT_ERR_TRUNCATED;
+
+	num_memrsv = fdt_num_mem_rsv(fdt); /* only the error case is used */
+	if (num_memrsv < 0)
+		return num_memrsv;
+
+	while (1) {
+		offset = nextoffset;
+		tag = fdt_next_tag(fdt, offset, &nextoffset);
+
+		if (nextoffset < 0)
+			return nextoffset;
+
+		switch (tag) {
+		case FDT_NOP:
+			break;
+
+		case FDT_END:
+			if (depth != 0) /* END while a node is still open */
+				return -FDT_ERR_BADSTRUCTURE;
+			return 0;
+
+		case FDT_BEGIN_NODE:
+			depth++;
+			if (depth > INT_MAX)
+				return -FDT_ERR_BADSTRUCTURE;
+			break;
+
+		case FDT_END_NODE:
+			if (depth == 0) /* END_NODE without a matching BEGIN_NODE */
+				return -FDT_ERR_BADSTRUCTURE;
+			depth--;
+			break;
+
+		case FDT_PROP:
+			prop = fdt_getprop_by_offset(fdt, offset, &propname,
+						     &err);
+			if (!prop)
+				return err;
+			break;
+
+		default:
+			return -FDT_ERR_INTERNAL;
+		}
+	}
+}
diff --git a/fdt/fdt_rw.c b/fdt/fdt_rw.c
new file mode 100644
index 0000000000..8795947c00
--- /dev/null
+++ b/fdt/fdt_rw.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ */
+#include "libfdt_env.h"
+
+#include <fdt.h>
+#include <libfdt.h>
+
+#include "libfdt_internal.h"
+/* Return nonzero unless header, reserve map, struct block and strings block follow each other in that order within totalsize. */
+static int fdt_blocks_misordered_(const void *fdt,
+				  int mem_rsv_size, int struct_size)
+{
+	return (fdt_off_mem_rsvmap(fdt) < FDT_ALIGN(sizeof(struct fdt_header), 8))
+		|| (fdt_off_dt_struct(fdt) <
+		    (fdt_off_mem_rsvmap(fdt) + mem_rsv_size))
+		|| (fdt_off_dt_strings(fdt) <
+		    (fdt_off_dt_struct(fdt) + struct_size))
+		|| (fdt_totalsize(fdt) <
+		    (fdt_off_dt_strings(fdt) + fdt_size_dt_strings(fdt)));
+}
+/* Check that fdt may be edited in place (version >= 17, blocks in order); a version above 17 is rewritten to 17. */
+static int fdt_rw_probe_(void *fdt)
+{
+	FDT_RO_PROBE(fdt);
+
+	if (fdt_version(fdt) < 17)
+		return -FDT_ERR_BADVERSION;
+	if (fdt_blocks_misordered_(fdt, sizeof(struct fdt_reserve_entry),
+				   fdt_size_dt_struct(fdt)))
+		return -FDT_ERR_BADLAYOUT;
+	if (fdt_version(fdt) > 17)
+		fdt_set_version(fdt, 17);
+
+	return 0;
+}
+/* Make the calling function return fdt_rw_probe_()'s error code when that probe fails. */
+#define FDT_RW_PROBE(fdt) \
+	{ \
+		int err_; \
+		if ((err_ = fdt_rw_probe_(fdt)) != 0) \
+			return err_; \
+	}
+/* Return the offset just past the strings block (off_dt_strings + size_dt_strings). */
+static inline int fdt_data_size_(void *fdt)
+{
+	return fdt_off_dt_strings(fdt) + fdt_size_dt_strings(fdt);
+}
+/* Turn the oldlen bytes at splicepoint into a newlen-byte gap by moving the data behind them; the gap itself is not written. */
+static int fdt_splice_(void *fdt, void *splicepoint, int oldlen, int newlen)
+{
+	char *p = splicepoint;
+	char *end = (char *)fdt + fdt_data_size_(fdt);
+
+	if (((p + oldlen) < p) || ((p + oldlen) > end))
+		return -FDT_ERR_BADOFFSET;
+	if ((p < (char *)fdt) || ((end - oldlen + newlen) < (char *)fdt))
+		return -FDT_ERR_BADOFFSET;
+	if ((end - oldlen + newlen) > ((char *)fdt + fdt_totalsize(fdt))) /* result would exceed totalsize */
+		return -FDT_ERR_NOSPACE;
+	memmove(p + newlen, p + oldlen, end - p - oldlen);
+	return 0;
+}
+/* Resize the run of reserve entries at p from oldn to newn entries and shift off_dt_struct / off_dt_strings to match. */
+static int fdt_splice_mem_rsv_(void *fdt, struct fdt_reserve_entry *p,
+			       int oldn, int newn)
+{
+	int delta = (newn - oldn) * sizeof(*p);
+	int err;
+	err = fdt_splice_(fdt, p, oldn * sizeof(*p), newn * sizeof(*p));
+	if (err)
+		return err;
+	fdt_set_off_dt_struct(fdt, fdt_off_dt_struct(fdt) + delta);
+	fdt_set_off_dt_strings(fdt, fdt_off_dt_strings(fdt) + delta);
+	return 0;
+}
+/* Splice at p (see fdt_splice_()), then adjust size_dt_struct and off_dt_strings by newlen - oldlen. */
+static int fdt_splice_struct_(void *fdt, void *p,
+			      int oldlen, int newlen)
+{
+	int delta = newlen - oldlen;
+	int err;
+
+	if ((err = fdt_splice_(fdt, p, oldlen, newlen)))
+		return err;
+
+	fdt_set_size_dt_struct(fdt, fdt_size_dt_struct(fdt) + delta);
+	fdt_set_off_dt_strings(fdt, fdt_off_dt_strings(fdt) + delta);
+	return 0;
+}
+/* Shrink size_dt_strings by the storage size of s (strlen(s) + 1); no bytes are moved. */
+/* Must only be used to roll back in case of error */
+static void fdt_del_last_string_(void *fdt, const char *s)
+{
+	int newlen = strlen(s) + 1;
+
+	fdt_set_size_dt_strings(fdt, fdt_size_dt_strings(fdt) - newlen);
+}
+/* Grow the strings block by newlen bytes at its end and update size_dt_strings. */
+static int fdt_splice_string_(void *fdt, int newlen)
+{
+	void *p = (char *)fdt
+		+ fdt_off_dt_strings(fdt) + fdt_size_dt_strings(fdt);
+	int err;
+
+	if ((err = fdt_splice_(fdt, p, 0, newlen)))
+		return err;
+
+	fdt_set_size_dt_strings(fdt, fdt_size_dt_strings(fdt) + newlen);
+	return 0;
+}
+/* Return the strings-block offset of s, appending s if fdt_find_string_() does not find it; *allocated = 1 if appended. */
+static int fdt_find_add_string_(void *fdt, const char *s, int *allocated)
+{
+	char *strtab = (char *)fdt + fdt_off_dt_strings(fdt);
+	const char *p;
+	char *new;
+	int len = strlen(s) + 1;
+	int err;
+
+	*allocated = 0;
+
+	p = fdt_find_string_(strtab, fdt_size_dt_strings(fdt), s);
+	if (p)
+		/* found it */
+		return (p - strtab);
+
+	new = strtab + fdt_size_dt_strings(fdt); /* old end of the block = where s will live */
+	err = fdt_splice_string_(fdt, len);
+	if (err)
+		return err;
+
+	*allocated = 1;
+
+	memcpy(new, s, len);
+	return (new - strtab);
+}
+/* Insert reserve entry {address, size} at index fdt_num_mem_rsv(), i.e. in front of the zero-size terminating entry. */
+int fdt_add_mem_rsv(void *fdt, uint64_t address, uint64_t size)
+{
+	struct fdt_reserve_entry *re;
+	int err;
+
+	FDT_RW_PROBE(fdt);
+
+	re = fdt_mem_rsv_w_(fdt, fdt_num_mem_rsv(fdt)); /* NOTE(review): a negative fdt_num_mem_rsv() result is not checked */
+	err = fdt_splice_mem_rsv_(fdt, re, 0, 1);
+	if (err)
+		return err;
+
+	re->address = cpu_to_fdt64(address);
+	re->size = cpu_to_fdt64(size);
+	return 0;
+}
+/* Delete reserve-map entry n; returns -FDT_ERR_NOTFOUND unless 0 <= n < fdt_num_mem_rsv(). */
+int fdt_del_mem_rsv(void *fdt, int n)
+{
+	struct fdt_reserve_entry *re = fdt_mem_rsv_w_(fdt, n); /* not dereferenced before the checks below */
+
+	FDT_RW_PROBE(fdt);
+
+	if (n < 0 || n >= fdt_num_mem_rsv(fdt)) /* a negative n would splice bytes out ahead of the map */
+		return -FDT_ERR_NOTFOUND;
+
+	return fdt_splice_mem_rsv_(fdt, re, 1, 0);
+}
+/* Resize the value area of existing property name to len bytes; *prop gets the property, its value bytes are not written here. */
+static int fdt_resize_property_(void *fdt, int nodeoffset, const char *name,
+				int len, struct fdt_property **prop)
+{
+	int oldlen;
+	int err;
+
+	*prop = fdt_get_property_w(fdt, nodeoffset, name, &oldlen);
+	if (!*prop)
+		return oldlen; /* holds the lookup's error code */
+
+	if ((err = fdt_splice_struct_(fdt, (*prop)->data, FDT_TAGALIGN(oldlen),
+				      FDT_TAGALIGN(len))))
+		return err;
+
+	(*prop)->len = cpu_to_fdt32(len);
+	return 0;
+}
+/* Insert a new property name with room for len value bytes at the offset fdt_check_node_offset_() returns for the node. */
+static int fdt_add_property_(void *fdt, int nodeoffset, const char *name,
+			     int len, struct fdt_property **prop)
+{
+	int proplen;
+	int nextoffset;
+	int namestroff;
+	int err;
+	int allocated;
+
+	if ((nextoffset = fdt_check_node_offset_(fdt, nodeoffset)) < 0)
+		return nextoffset;
+
+	namestroff = fdt_find_add_string_(fdt, name, &allocated);
+	if (namestroff < 0)
+		return namestroff;
+
+	*prop = fdt_offset_ptr_w_(fdt, nextoffset);
+	proplen = sizeof(**prop) + FDT_TAGALIGN(len);
+
+	err = fdt_splice_struct_(fdt, *prop, 0, proplen);
+	if (err) {
+		if (allocated) /* undo the name string appended above */
+			fdt_del_last_string_(fdt, name);
+		return err;
+	}
+
+	(*prop)->tag = cpu_to_fdt32(FDT_PROP);
+	(*prop)->nameoff = cpu_to_fdt32(namestroff);
+	(*prop)->len = cpu_to_fdt32(len);
+	return 0;
+}
+/* Replace the name of the node at nodeoffset with name, resizing the struct block by the tag-aligned difference. */
+int fdt_set_name(void *fdt, int nodeoffset, const char *name)
+{
+	char *namep;
+	int oldlen, newlen;
+	int err;
+
+	FDT_RW_PROBE(fdt);
+
+	namep = (char *)(uintptr_t)fdt_get_name(fdt, nodeoffset, &oldlen); /* cast drops const for in-place editing */
+	if (!namep)
+		return oldlen;
+
+	newlen = strlen(name);
+
+	err = fdt_splice_struct_(fdt, namep, FDT_TAGALIGN(oldlen+1),
+				 FDT_TAGALIGN(newlen+1));
+	if (err)
+		return err;
+
+	memcpy(namep, name, newlen+1);
+	return 0;
+}
+/* Resize property name to len bytes, or create it if not found, and return a pointer to its value area in *prop_data. */
+int fdt_setprop_placeholder(void *fdt, int nodeoffset, const char *name,
+			    int len, void **prop_data)
+{
+	struct fdt_property *prop;
+	int err;
+
+	FDT_RW_PROBE(fdt);
+
+	err = fdt_resize_property_(fdt, nodeoffset, name, len, &prop);
+	if (err == -FDT_ERR_NOTFOUND)
+		err = fdt_add_property_(fdt, nodeoffset, name, len, &prop);
+	if (err)
+		return err;
+
+	*prop_data = prop->data;
+	return 0;
+}
+/* Set property name of the node to the len bytes at val, via fdt_setprop_placeholder(). */
+int fdt_setprop(void *fdt, int nodeoffset, const char *name,
+		const void *val, int len)
+{
+	void *prop_data;
+	int err;
+
+	err = fdt_setprop_placeholder(fdt, nodeoffset, name, len, &prop_data);
+	if (err)
+		return err;
+
+	if (len) /* nothing is copied for a zero-length value */
+		memcpy(prop_data, val, len);
+	return 0;
+}
+/* Append the len bytes at val to property name of the node; if the lookup returns no property, add it with that value. */
+int fdt_appendprop(void *fdt, int nodeoffset, const char *name,
+		   const void *val, int len)
+{
+	struct fdt_property *prop;
+	int err, oldlen, newlen;
+
+	FDT_RW_PROBE(fdt);
+
+	prop = fdt_get_property_w(fdt, nodeoffset, name, &oldlen);
+	if (prop) {
+		newlen = len + oldlen;
+		err = fdt_splice_struct_(fdt, prop->data,
+					 FDT_TAGALIGN(oldlen),
+					 FDT_TAGALIGN(newlen));
+		if (err)
+			return err;
+		prop->len = cpu_to_fdt32(newlen);
+		memcpy(prop->data + oldlen, val, len);
+	} else { /* NOTE(review): taken for any lookup failure, not only -FDT_ERR_NOTFOUND */
+		err = fdt_add_property_(fdt, nodeoffset, name, len, &prop);
+		if (err)
+			return err;
+		memcpy(prop->data, val, len);
+	}
+	return 0;
+}
+/* Remove property name (header plus tag-aligned value) from the node at nodeoffset. */
+int fdt_delprop(void *fdt, int nodeoffset, const char *name)
+{
+	struct fdt_property *prop;
+	int len, proplen;
+
+	FDT_RW_PROBE(fdt);
+
+	prop = fdt_get_property_w(fdt, nodeoffset, name, &len);
+	if (!prop)
+		return len; /* holds the lookup's error code */
+
+	proplen = sizeof(*prop) + FDT_TAGALIGN(len);
+	return fdt_splice_struct_(fdt, prop, proplen, 0);
+}
+/* Create an empty subnode name[0..namelen) under parentoffset and return its offset; -FDT_ERR_EXISTS if one already matches. */
+int fdt_add_subnode_namelen(void *fdt, int parentoffset,
+			    const char *name, int namelen)
+{
+	struct fdt_node_header *nh;
+	int offset, nextoffset;
+	int nodelen;
+	int err;
+	uint32_t tag;
+	fdt32_t *endtag;
+
+	FDT_RW_PROBE(fdt);
+
+	offset = fdt_subnode_offset_namelen(fdt, parentoffset, name, namelen);
+	if (offset >= 0)
+		return -FDT_ERR_EXISTS;
+	else if (offset != -FDT_ERR_NOTFOUND)
+		return offset;
+
+	/* Try to place the new node after the parent's properties */
+	fdt_next_tag(fdt, parentoffset, &nextoffset); /* skip the BEGIN_NODE */
+	do {
+		offset = nextoffset;
+		tag = fdt_next_tag(fdt, offset, &nextoffset);
+	} while ((tag == FDT_PROP) || (tag == FDT_NOP));
+
+	nh = fdt_offset_ptr_w_(fdt, offset);
+	nodelen = sizeof(*nh) + FDT_TAGALIGN(namelen+1) + FDT_TAGSIZE; /* header + padded name + END_NODE tag */
+
+	err = fdt_splice_struct_(fdt, nh, 0, nodelen);
+	if (err)
+		return err;
+
+	nh->tag = cpu_to_fdt32(FDT_BEGIN_NODE);
+	memset(nh->name, 0, FDT_TAGALIGN(namelen+1)); /* zero first so the name is NUL-terminated and padded */
+	memcpy(nh->name, name, namelen);
+	endtag = (fdt32_t *)((char *)nh + nodelen - FDT_TAGSIZE);
+	*endtag = cpu_to_fdt32(FDT_END_NODE);
+
+	return offset;
+}
+/* As fdt_add_subnode_namelen(), for a NUL-terminated name. */
+int fdt_add_subnode(void *fdt, int parentoffset, const char *name)
+{
+	return fdt_add_subnode_namelen(fdt, parentoffset, name, strlen(name));
+}
+/* Remove everything from nodeoffset up to the offset returned by fdt_node_end_offset_() from the struct block. */
+int fdt_del_node(void *fdt, int nodeoffset)
+{
+	int endoffset;
+
+	FDT_RW_PROBE(fdt);
+
+	endoffset = fdt_node_end_offset_(fdt, nodeoffset);
+	if (endoffset < 0)
+		return endoffset;
+
+	return fdt_splice_struct_(fdt, fdt_offset_ptr_w_(fdt, nodeoffset),
+				  endoffset - nodeoffset, 0);
+}
+/* Copy the reserve map, struct and strings blocks of old into new, back to back after the header, and set new's offsets/sizes. */
+static void fdt_packblocks_(const char *old, char *new,
+			    int mem_rsv_size, int struct_size)
+{
+	int mem_rsv_off, struct_off, strings_off;
+
+	mem_rsv_off = FDT_ALIGN(sizeof(struct fdt_header), 8);
+	struct_off = mem_rsv_off + mem_rsv_size;
+	strings_off = struct_off + struct_size;
+
+	memmove(new + mem_rsv_off, old + fdt_off_mem_rsvmap(old), mem_rsv_size); /* memmove: old and new may be the same buffer */
+	fdt_set_off_mem_rsvmap(new, mem_rsv_off);
+
+	memmove(new + struct_off, old + fdt_off_dt_struct(old), struct_size);
+	fdt_set_off_dt_struct(new, struct_off);
+	fdt_set_size_dt_struct(new, struct_size);
+
+	memmove(new + strings_off, old + fdt_off_dt_strings(old),
+		fdt_size_dt_strings(old));
+	fdt_set_off_dt_strings(new, strings_off);
+	fdt_set_size_dt_strings(new, fdt_size_dt_strings(old));
+}
+/* Copy fdt into buf as a version-17 blob with totalsize = bufsize, repacking the blocks first if they are misordered. */
+int fdt_open_into(const void *fdt, void *buf, int bufsize)
+{
+	int err;
+	int mem_rsv_size, struct_size;
+	int newsize;
+	const char *fdtstart = fdt;
+	const char *fdtend = fdtstart + fdt_totalsize(fdt);
+	char *tmp;
+
+	FDT_RO_PROBE(fdt);
+
+	mem_rsv_size = (fdt_num_mem_rsv(fdt)+1) /* +1 for the terminating entry; NOTE(review): negative result not checked */
+		* sizeof(struct fdt_reserve_entry);
+
+	if (fdt_version(fdt) >= 17) {
+		struct_size = fdt_size_dt_struct(fdt);
+	} else { /* no size field to read here: measure the struct block by walking to FDT_END */
+		struct_size = 0;
+		while (fdt_next_tag(fdt, struct_size, &struct_size) != FDT_END)
+			;
+		if (struct_size < 0)
+			return struct_size;
+	}
+
+	if (!fdt_blocks_misordered_(fdt, mem_rsv_size, struct_size)) {
+		/* no further work necessary */
+		err = fdt_move(fdt, buf, bufsize);
+		if (err)
+			return err;
+		fdt_set_version(buf, 17);
+		fdt_set_size_dt_struct(buf, struct_size);
+		fdt_set_totalsize(buf, bufsize);
+		return 0;
+	}
+
+	/* Need to reorder */
+	newsize = FDT_ALIGN(sizeof(struct fdt_header), 8) + mem_rsv_size
+		+ struct_size + fdt_size_dt_strings(fdt);
+
+	if (bufsize < newsize)
+		return -FDT_ERR_NOSPACE;
+
+	/* First attempt to build converted tree at beginning of buffer */
+	tmp = buf;
+	/* But if that overlaps with the old tree... */
+	if (((tmp + newsize) > fdtstart) && (tmp < fdtend)) {
+		/* Try right after the old tree instead */
+		tmp = (char *)(uintptr_t)fdtend;
+		if ((tmp + newsize) > ((char *)buf + bufsize))
+			return -FDT_ERR_NOSPACE;
+	}
+
+	fdt_packblocks_(fdt, tmp, mem_rsv_size, struct_size);
+	memmove(buf, tmp, newsize); /* slide the packed tree to the start of buf */
+
+	fdt_set_magic(buf, FDT_MAGIC);
+	fdt_set_totalsize(buf, bufsize);
+	fdt_set_version(buf, 17);
+	fdt_set_last_comp_version(buf, 16);
+	fdt_set_boot_cpuid_phys(buf, fdt_boot_cpuid_phys(fdt));
+
+	return 0;
+}
+/* Pack the blocks of fdt in place and set totalsize to the end of the strings block. */
+int fdt_pack(void *fdt)
+{
+	int mem_rsv_size;
+
+	FDT_RW_PROBE(fdt);
+
+	mem_rsv_size = (fdt_num_mem_rsv(fdt)+1) /* +1 for the terminating entry */
+		* sizeof(struct fdt_reserve_entry);
+	fdt_packblocks_(fdt, fdt, mem_rsv_size, fdt_size_dt_struct(fdt));
+	fdt_set_totalsize(fdt, fdt_data_size_(fdt));
+
+	return 0;
+}
diff --git a/fdt/fdt_strerror.c b/fdt/fdt_strerror.c
new file mode 100644
index 0000000000..768db66ead
--- /dev/null
+++ b/fdt/fdt_strerror.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ *     EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "libfdt_env.h"
+
+#include <fdt.h>
+#include <libfdt.h>
+
+#include "libfdt_internal.h"
+
+struct fdt_errtabent {
+	const char *str;
+};
+
+#define FDT_ERRTABENT(val) \
+	[(val)] = { .str = #val, }
+
+static struct fdt_errtabent fdt_errtable[] = {
+	FDT_ERRTABENT(FDT_ERR_NOTFOUND),
+	FDT_ERRTABENT(FDT_ERR_EXISTS),
+	FDT_ERRTABENT(FDT_ERR_NOSPACE),
+
+	FDT_ERRTABENT(FDT_ERR_BADOFFSET),
+	FDT_ERRTABENT(FDT_ERR_BADPATH),
+	FDT_ERRTABENT(FDT_ERR_BADPHANDLE),
+	FDT_ERRTABENT(FDT_ERR_BADSTATE),
+
+	FDT_ERRTABENT(FDT_ERR_TRUNCATED),
+	FDT_ERRTABENT(FDT_ERR_BADMAGIC),
+	FDT_ERRTABENT(FDT_ERR_BADVERSION),
+	FDT_ERRTABENT(FDT_ERR_BADSTRUCTURE),
+	FDT_ERRTABENT(FDT_ERR_BADLAYOUT),
+	FDT_ERRTABENT(FDT_ERR_INTERNAL),
+	FDT_ERRTABENT(FDT_ERR_BADNCELLS),
+	FDT_ERRTABENT(FDT_ERR_BADVALUE),
+	FDT_ERRTABENT(FDT_ERR_BADOVERLAY),
+	FDT_ERRTABENT(FDT_ERR_NOPHANDLES),
+	FDT_ERRTABENT(FDT_ERR_BADFLAGS),
+};
+#define FDT_ERRTABSIZE	(sizeof(fdt_errtable) / sizeof(fdt_errtable[0]))
+
+/*
+ * fdt_strerror - map a libfdt return value to a descriptive string
+ * @errval: value returned by a libfdt function
+ *
+ * returns:
+ *	"<valid offset/length>", if errval is positive
+ *	"<no error>", if errval is zero
+ *	the name stored in fdt_errtable for -errval, if there is one
+ *	"<unknown error>", otherwise
+ */
+const char *fdt_strerror(int errval)
+{
+	const char *name = NULL;
+
+	if (errval > 0)
+		return "<valid offset/length>";
+	if (errval == 0)
+		return "<no error>";
+
+	/* errval is negative from here on; -errval is the table index */
+	if (errval > -(int)FDT_ERRTABSIZE)
+		name = fdt_errtable[-errval].str;
+
+	return name ? name : "<unknown error>";
+}
diff --git a/fdt/fdt_sw.c b/fdt/fdt_sw.c
new file mode 100644
index 0000000000..76bea22f73
--- /dev/null
+++ b/fdt/fdt_sw.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ */
+#include "libfdt_env.h"
+
+#include <fdt.h>
+#include <libfdt.h>
+
+#include "libfdt_internal.h"
+
+/*
+ * Check, by its magic number, that fdt is a tree still being built by the
+ * sequential-write functions:
+ *	FDT_MAGIC	-> -FDT_ERR_BADSTATE
+ *	FDT_SW_MAGIC	-> 0
+ *	anything else	-> -FDT_ERR_BADMAGIC
+ */
+static int fdt_sw_probe_(void *fdt)
+{
+	const uint32_t magic = fdt_magic(fdt);
+
+	if (magic == FDT_MAGIC)
+		return -FDT_ERR_BADSTATE;
+
+	return (magic == FDT_SW_MAGIC) ? 0 : -FDT_ERR_BADMAGIC;
+}
+
+/* Leave the calling function with the error when fdt_sw_probe_() fails. */
+#define FDT_SW_PROBE(fdt) \
+	{ \
+		int err = fdt_sw_probe_(fdt); \
+		if (err != 0) \
+			return err; \
+	}
+
+/* 'memrsv' state:	Initial state after fdt_create()
+ *
+ * Allowed functions:
+ *	fdt_add_reservemap_entry()
+ *	fdt_finish_reservemap()		[moves to 'struct' state]
+ *
+ * The state is recognised by off_dt_strings still being 0, the value
+ * written by fdt_create_with_flags().
+ */
+static int fdt_sw_probe_memrsv_(void *fdt)
+{
+	const int probe = fdt_sw_probe_(fdt);
+
+	if (probe != 0)
+		return probe;
+
+	return fdt_off_dt_strings(fdt) ? -FDT_ERR_BADSTATE : 0;
+}
+
+/* Leave the calling function with the error unless in 'memrsv' state. */
+#define FDT_SW_PROBE_MEMRSV(fdt) \
+	{ \
+		int err = fdt_sw_probe_memrsv_(fdt); \
+		if (err != 0) \
+			return err; \
+	}
+
+/* 'struct' state:	Enter this state after fdt_finish_reservemap()
+ *
+ * Allowed functions:
+ *	fdt_begin_node()
+ *	fdt_end_node()
+ *	fdt_property*()
+ *	fdt_finish()			[moves to 'complete' state]
+ *
+ * The state is recognised by off_dt_strings being equal to totalsize,
+ * which is what fdt_finish_reservemap() sets it to.
+ */
+static int fdt_sw_probe_struct_(void *fdt)
+{
+	const int probe = fdt_sw_probe_(fdt);
+
+	if (probe != 0)
+		return probe;
+
+	return (fdt_off_dt_strings(fdt) == fdt_totalsize(fdt))
+		? 0 : -FDT_ERR_BADSTATE;
+}
+
+/* Leave the calling function with the error unless in 'struct' state. */
+#define FDT_SW_PROBE_STRUCT(fdt) \
+	{ \
+		int err = fdt_sw_probe_struct_(fdt); \
+		if (err != 0) \
+			return err; \
+	}
+
+/*
+ * Return the creation flags of a tree under construction.  They live in
+ * the last_comp_version header field, where fdt_create_with_flags() puts
+ * them.
+ */
+static inline uint32_t sw_flags(void *fdt)
+{
+	/* assert: (fdt_magic(fdt) == FDT_SW_MAGIC) */
+	const uint32_t flags = fdt_last_comp_version(fdt);
+
+	return flags;
+}
+
+/* 'complete' state:	Enter this state after fdt_finish()
+ *
+ * Allowed functions: none
+ */
+
+/*
+ * Reserve len bytes at the current end of the structure block.
+ *
+ * On success size_dt_struct grows by len and a writable pointer to the
+ * reserved bytes is returned.  NULL is returned, and nothing is changed,
+ * when the new size wraps around or exceeds the room left once the header
+ * area before off_dt_struct and the strings block are accounted for.
+ */
+static void *fdt_grab_space_(void *fdt, size_t len)
+{
+	int offset = fdt_size_dt_struct(fdt);
+	int spaceleft = fdt_totalsize(fdt) - fdt_off_dt_struct(fdt)
+		- fdt_size_dt_strings(fdt);
+	size_t newsize = offset + len;
+
+	if ((newsize >= offset) && (newsize <= spaceleft)) {
+		fdt_set_size_dt_struct(fdt, newsize);
+		return fdt_offset_ptr_w_(fdt, offset);
+	}
+
+	return NULL;
+}
+
+/*
+ * fdt_create_with_flags - begin sequential-write creation of a tree
+ * @buf: buffer that will hold the tree
+ * @bufsize: size of buf in bytes
+ * @flags: creation flags; only bits in FDT_CREATE_FLAGS_ALL are accepted
+ *
+ * Zeroes the whole buffer and writes an initial header that leaves the
+ * tree in the 'memrsv' state (off_dt_strings == 0).
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, bufsize is negative or too small for the header
+ *	-FDT_ERR_BADFLAGS, flags has bits outside FDT_CREATE_FLAGS_ALL
+ */
+int fdt_create_with_flags(void *buf, int bufsize, uint32_t flags)
+{
+	const size_t hdrsize = FDT_ALIGN(sizeof(struct fdt_header),
+					 sizeof(struct fdt_reserve_entry));
+	void *fdt = buf;
+
+	/*
+	 * Test the sign first: comparing a negative int with a size_t
+	 * converts it to a huge unsigned value, which would pass the size
+	 * check and reach memset() below.
+	 */
+	if ((bufsize < 0) || ((size_t)bufsize < hdrsize))
+		return -FDT_ERR_NOSPACE;
+
+	if (flags & ~FDT_CREATE_FLAGS_ALL)
+		return -FDT_ERR_BADFLAGS;
+
+	memset(buf, 0, bufsize);
+
+	/*
+	 * magic and last_comp_version keep intermediate state during the fdt
+	 * creation process, which is replaced with the proper FDT format by
+	 * fdt_finish().
+	 *
+	 * flags should be accessed with sw_flags().
+	 */
+	fdt_set_magic(fdt, FDT_SW_MAGIC);
+	fdt_set_version(fdt, FDT_LAST_SUPPORTED_VERSION);
+	fdt_set_last_comp_version(fdt, flags);
+
+	fdt_set_totalsize(fdt,  bufsize);
+
+	/* the (still empty) reserve map and structure block start together */
+	fdt_set_off_mem_rsvmap(fdt, hdrsize);
+	fdt_set_off_dt_struct(fdt, fdt_off_mem_rsvmap(fdt));
+	fdt_set_off_dt_strings(fdt, 0);
+
+	return 0;
+}
+
+/*
+ * fdt_create - begin sequential-write creation of a tree with no flags set
+ * @buf: buffer that will hold the tree
+ * @bufsize: size of buf in bytes
+ *
+ * Same as fdt_create_with_flags(buf, bufsize, 0); its return value is
+ * passed through unchanged.
+ */
+int fdt_create(void *buf, int bufsize)
+{
+	return fdt_create_with_flags(buf, bufsize, 0);
+}
+
+/*
+ * fdt_resize - move a tree under construction into another buffer
+ * @fdt: tree being built by the sequential-write functions
+ * @buf: destination buffer; it may overlap the current one
+ * @bufsize: size of buf in bytes
+ *
+ * Everything up to the end of the structure block is copied to the start
+ * of buf, and the strings block is copied so that it ends at the end of
+ * buf.  totalsize is then updated to bufsize.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, bufsize is negative or too small for the data
+ *	-FDT_ERR_INTERNAL, the recorded sizes exceed the current totalsize
+ *	-FDT_ERR_BADSTATE, -FDT_ERR_BADMAGIC, from fdt_sw_probe_()
+ */
+int fdt_resize(void *fdt, void *buf, int bufsize)
+{
+	size_t headsize, tailsize;
+	char *oldtail, *newtail;
+
+	FDT_SW_PROBE(fdt);
+
+	/*
+	 * A negative size would be converted to a huge unsigned value by
+	 * the comparison below and slip through to the memmove() calls.
+	 */
+	if (bufsize < 0)
+		return -FDT_ERR_NOSPACE;
+
+	headsize = fdt_off_dt_struct(fdt) + fdt_size_dt_struct(fdt);
+	tailsize = fdt_size_dt_strings(fdt);
+
+	if ((headsize + tailsize) > fdt_totalsize(fdt))
+		return -FDT_ERR_INTERNAL;
+
+	if ((headsize + tailsize) > (size_t)bufsize)
+		return -FDT_ERR_NOSPACE;
+
+	oldtail = (char *)fdt + fdt_totalsize(fdt) - tailsize;
+	newtail = (char *)buf + bufsize - tailsize;
+
+	/* Two cases to avoid clobbering data if the old and new
+	 * buffers partially overlap */
+	if (buf <= fdt) {
+		memmove(buf, fdt, headsize);
+		memmove(newtail, oldtail, tailsize);
+	} else {
+		memmove(newtail, oldtail, tailsize);
+		memmove(buf, fdt, headsize);
+	}
+
+	fdt_set_totalsize(buf, bufsize);
+	/* in 'struct' state off_dt_strings tracks totalsize; keep it so */
+	if (fdt_off_dt_strings(buf))
+		fdt_set_off_dt_strings(buf, bufsize);
+
+	return 0;
+}
+
+/*
+ * fdt_add_reservemap_entry - append one entry to the memory reserve map
+ * @fdt: tree under construction, in 'memrsv' state
+ * @addr: start address of the reserved range
+ * @size: size of the reserved range
+ *
+ * The entry is written where off_dt_struct currently points, and
+ * off_dt_struct is then advanced past it.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, the entry would end beyond totalsize
+ *	an error from fdt_sw_probe_memrsv_() if the tree is in another state
+ */
+int fdt_add_reservemap_entry(void *fdt, uint64_t addr, uint64_t size)
+{
+	struct fdt_reserve_entry *entry;
+	size_t entry_end;
+	int offset;
+
+	FDT_SW_PROBE_MEMRSV(fdt);
+
+	offset = fdt_off_dt_struct(fdt);
+	entry_end = offset + sizeof(*entry);
+	if (entry_end > fdt_totalsize(fdt))
+		return -FDT_ERR_NOSPACE;
+
+	entry = (struct fdt_reserve_entry *)((char *)fdt + offset);
+	entry->size = cpu_to_fdt64(size);
+	entry->address = cpu_to_fdt64(addr);
+
+	fdt_set_off_dt_struct(fdt, entry_end);
+
+	return 0;
+}
+
+/*
+ * fdt_finish_reservemap - terminate the reserve map
+ * @fdt: tree under construction, in 'memrsv' state
+ *
+ * Appends the all-zero terminating entry and, if that worked, sets
+ * off_dt_strings to totalsize, which is the marker of the 'struct' state.
+ *
+ * returns:
+ *	0, on success
+ *	the error from fdt_add_reservemap_entry() otherwise
+ */
+int fdt_finish_reservemap(void *fdt)
+{
+	const int rc = fdt_add_reservemap_entry(fdt, 0, 0);
+
+	if (rc == 0)
+		fdt_set_off_dt_strings(fdt, fdt_totalsize(fdt));
+
+	return rc;
+}
+
+/*
+ * fdt_begin_node - open a new node in the structure block
+ * @fdt: tree under construction, in 'struct' state
+ * @name: NUL-terminated node name
+ *
+ * Appends an FDT_BEGIN_NODE tag followed by the name, including its
+ * terminating NUL; the space taken is rounded up with FDT_TAGALIGN().
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, no room left for the node header
+ *	an error from fdt_sw_probe_struct_() if the tree is in another state
+ */
+int fdt_begin_node(void *fdt, const char *name)
+{
+	struct fdt_node_header *node;
+	int name_bytes;
+
+	FDT_SW_PROBE_STRUCT(fdt);
+
+	name_bytes = 1 + strlen(name);
+	node = fdt_grab_space_(fdt, FDT_TAGALIGN(name_bytes) + sizeof(*node));
+	if (node == NULL)
+		return -FDT_ERR_NOSPACE;
+
+	memcpy(node->name, name, name_bytes);
+	node->tag = cpu_to_fdt32(FDT_BEGIN_NODE);
+	return 0;
+}
+
+/*
+ * fdt_end_node - close the node most recently opened
+ * @fdt: tree under construction, in 'struct' state
+ *
+ * Appends a single FDT_END_NODE tag to the structure block.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, no room left for the tag
+ *	an error from fdt_sw_probe_struct_() if the tree is in another state
+ */
+int fdt_end_node(void *fdt)
+{
+	fdt32_t *tag;
+
+	FDT_SW_PROBE_STRUCT(fdt);
+
+	tag = fdt_grab_space_(fdt, FDT_TAGSIZE);
+	if (tag != NULL) {
+		*tag = cpu_to_fdt32(FDT_END_NODE);
+		return 0;
+	}
+
+	return -FDT_ERR_NOSPACE;
+}
+
+/*
+ * Add s, with its terminating NUL, to the strings kept at the end of the
+ * buffer.  The table grows downwards from fdt + totalsize.
+ *
+ * Returns the offset of the new string relative to the end of the buffer,
+ * which is always negative, or 0 when the string would run into the
+ * structure block.
+ */
+static int fdt_add_string_(void *fdt, const char *s)
+{
+	char *tab_end = (char *)fdt + fdt_totalsize(fdt);
+	int tab_len = fdt_size_dt_strings(fdt);
+	int s_bytes = strlen(s) + 1;
+	int struct_top = fdt_off_dt_struct(fdt) + fdt_size_dt_struct(fdt);
+	int offset = -tab_len - s_bytes;
+
+	if (fdt_totalsize(fdt) + offset >= struct_top) {
+		memcpy(tab_end + offset, s, s_bytes);
+		fdt_set_size_dt_strings(fdt, tab_len + s_bytes);
+		return offset;
+	}
+
+	return 0; /* no more room :( */
+}
+
+/*
+ * Must only be used to roll back in case of error.
+ *
+ * Shrinks size_dt_strings by the length of s plus its NUL; the caller must
+ * pass the string that was added last.
+ */
+static void fdt_del_last_string_(void *fdt, const char *s)
+{
+	const int s_bytes = strlen(s) + 1;
+
+	fdt_set_size_dt_strings(fdt, fdt_size_dt_strings(fdt) - s_bytes);
+}
+
+/*
+ * Return the offset of s in the strings table, adding it if
+ * fdt_find_string_() does not locate it.
+ *
+ * *allocated is set to 1 when fdt_add_string_() had to be called and to 0
+ * when an existing string was reused.  Offsets are relative to the end of
+ * the buffer; 0 is passed on from fdt_add_string_() when there is no room.
+ */
+static int fdt_find_add_string_(void *fdt, const char *s, int *allocated)
+{
+	char *tab_end = (char *)fdt + fdt_totalsize(fdt);
+	int tab_len = fdt_size_dt_strings(fdt);
+	const char *hit;
+
+	hit = fdt_find_string_(tab_end - tab_len, tab_len, s);
+	*allocated = (hit == NULL);
+
+	return hit ? (int)(hit - tab_end) : fdt_add_string_(fdt, s);
+}
+
+/*
+ * fdt_property_placeholder - add a property whose value is filled in later
+ * @fdt: tree under construction, in 'struct' state
+ * @name: NUL-terminated property name
+ * @len: length of the value in bytes
+ * @valp: receives a pointer to the value area inside the tree
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, no room for the name or for the property
+ *	an error from fdt_sw_probe_struct_() if the tree is in another state
+ */
+int fdt_property_placeholder(void *fdt, const char *name, int len, void **valp)
+{
+	struct fdt_property *prop;
+	int str_off;
+	int fresh;
+
+	FDT_SW_PROBE_STRUCT(fdt);
+
+	/* String de-duplication can be slow, _NO_NAME_DEDUP skips it */
+	if (!(sw_flags(fdt) & FDT_CREATE_FLAG_NO_NAME_DEDUP)) {
+		str_off = fdt_find_add_string_(fdt, name, &fresh);
+	} else {
+		fresh = 1;
+		str_off = fdt_add_string_(fdt, name);
+	}
+	if (!str_off)
+		return -FDT_ERR_NOSPACE;
+
+	prop = fdt_grab_space_(fdt, sizeof(*prop) + FDT_TAGALIGN(len));
+	if (prop != NULL) {
+		prop->tag = cpu_to_fdt32(FDT_PROP);
+		prop->nameoff = cpu_to_fdt32(str_off);
+		prop->len = cpu_to_fdt32(len);
+		*valp = prop->data;
+		return 0;
+	}
+
+	/* undo the name only if this call was the one that added it */
+	if (fresh)
+		fdt_del_last_string_(fdt, name);
+	return -FDT_ERR_NOSPACE;
+}
+
+/*
+ * fdt_property - add a property together with its value
+ * @fdt: tree under construction, in 'struct' state
+ * @name: NUL-terminated property name
+ * @val: value bytes to copy into the tree
+ * @len: number of value bytes
+ *
+ * returns:
+ *	0, on success
+ *	the error from fdt_property_placeholder() otherwise
+ */
+int fdt_property(void *fdt, const char *name, const void *val, int len)
+{
+	void *dst;
+	const int rc = fdt_property_placeholder(fdt, name, len, &dst);
+
+	if (rc == 0)
+		memcpy(dst, val, len);
+
+	return rc;
+}
+
+/*
+ * fdt_finish - turn a tree under construction into a finished blob
+ * @fdt: tree under construction, in 'struct' state
+ *
+ * Appends the FDT_END tag, moves the strings from the end of the buffer to
+ * directly behind the structure block, rewrites every property's name
+ * offset accordingly and finally replaces the intermediate header values
+ * (totalsize, last_comp_version, magic).
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, no room left for the FDT_END tag
+ *	a negative nextoffset left behind by fdt_next_tag(), if any
+ *	an error from fdt_sw_probe_struct_() if the tree is in another state
+ */
+int fdt_finish(void *fdt)
+{
+	char *p = (char *)fdt;
+	fdt32_t *end;
+	int oldstroffset, newstroffset;
+	uint32_t tag;
+	int offset, nextoffset;
+
+	FDT_SW_PROBE_STRUCT(fdt);
+
+	/* Add terminator */
+	end = fdt_grab_space_(fdt, sizeof(*end));
+	if (! end)
+		return -FDT_ERR_NOSPACE;
+	*end = cpu_to_fdt32(FDT_END);
+
+	/* Relocate the string table */
+	oldstroffset = fdt_totalsize(fdt) - fdt_size_dt_strings(fdt);
+	newstroffset = fdt_off_dt_struct(fdt) + fdt_size_dt_struct(fdt);
+	memmove(p + newstroffset, p + oldstroffset, fdt_size_dt_strings(fdt));
+	fdt_set_off_dt_strings(fdt, newstroffset);
+
+	/* Walk the structure, correcting string offsets */
+	offset = 0;
+	while ((tag = fdt_next_tag(fdt, offset, &nextoffset)) != FDT_END) {
+		if (tag == FDT_PROP) {
+			struct fdt_property *prop =
+				fdt_offset_ptr_w_(fdt, offset);
+			int nameoff;
+
+			/*
+			 * Offsets recorded so far are negative, counted back
+			 * from the end of the table; adding the table size
+			 * turns them into offsets from its start.
+			 */
+			nameoff = fdt32_to_cpu(prop->nameoff);
+			nameoff += fdt_size_dt_strings(fdt);
+			prop->nameoff = cpu_to_fdt32(nameoff);
+		}
+		offset = nextoffset;
+	}
+	/* NOTE(review): presumably fdt_next_tag() reports errors this way */
+	if (nextoffset < 0)
+		return nextoffset;
+
+	/* Finally, adjust the header */
+	fdt_set_totalsize(fdt, newstroffset + fdt_size_dt_strings(fdt));
+
+	/* And fix up fields that were keeping intermediate state. */
+	fdt_set_last_comp_version(fdt, FDT_FIRST_SUPPORTED_VERSION);
+	fdt_set_magic(fdt, FDT_MAGIC);
+
+	return 0;
+}
diff --git a/fdt/fdt_wip.c b/fdt/fdt_wip.c
new file mode 100644
index 0000000000..f64139e0b3
--- /dev/null
+++ b/fdt/fdt_wip.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ */
+#include "libfdt_env.h"
+
+#include <fdt.h>
+#include <libfdt.h>
+
+#include "libfdt_internal.h"
+
+/*
+ * fdt_setprop_inplace_namelen_partial - overwrite part of a property value
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node holding the property
+ * @name: name of the property
+ * @namelen: name length, handed to fdt_getprop_namelen_w()
+ * @idx: byte offset inside the value at which writing starts
+ * @val: bytes to copy in
+ * @len: number of bytes to copy
+ *
+ * The property keeps its length; only bytes [idx, idx + len) change.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, len is negative or the range does not lie inside
+ *		the existing value
+ *	the value fdt_getprop_namelen_w() leaves in its length argument
+ *		when it finds no property
+ */
+int fdt_setprop_inplace_namelen_partial(void *fdt, int nodeoffset,
+					const char *name, int namelen,
+					uint32_t idx, const void *val,
+					int len)
+{
+	void *propval;
+	int proplen;
+
+	propval = fdt_getprop_namelen_w(fdt, nodeoffset, name, namelen,
+					&proplen);
+	if (!propval)
+		return proplen;
+
+	/*
+	 * Do the range check in 64 bits: "len + idx" evaluated in 32-bit
+	 * unsigned arithmetic can wrap around and let an out-of-range idx,
+	 * or a negative len, through to memcpy().
+	 */
+	if ((len < 0) || (proplen < 0) ||
+	    ((uint64_t)idx + (uint64_t)len > (uint64_t)proplen))
+		return -FDT_ERR_NOSPACE;
+
+	memcpy((char *)propval + idx, val, len);
+	return 0;
+}
+
+/*
+ * fdt_setprop_inplace - replace a property value by one of the same length
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node holding the property
+ * @name: NUL-terminated name of the property
+ * @val: new value
+ * @len: length of the new value in bytes
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, len differs from the current length of the value
+ *	the value fdt_getprop() leaves in its length argument when it finds
+ *		no property
+ */
+int fdt_setprop_inplace(void *fdt, int nodeoffset, const char *name,
+			const void *val, int len)
+{
+	int cur_len;
+
+	if (fdt_getprop(fdt, nodeoffset, name, &cur_len) == NULL)
+		return cur_len;
+
+	if (cur_len == len)
+		return fdt_setprop_inplace_namelen_partial(fdt, nodeoffset,
+							   name, strlen(name),
+							   0, val, len);
+
+	return -FDT_ERR_NOSPACE;
+}
+
+/*
+ * Fill the region of len bytes starting at start with FDT_NOP tags, one
+ * 32-bit cell at a time.
+ */
+static void fdt_nop_region_(void *start, int len)
+{
+	fdt32_t *cell = start;
+	const char *limit = (char *)start + len;
+
+	while ((const char *)cell < limit) {
+		*cell = cpu_to_fdt32(FDT_NOP);
+		cell++;
+	}
+}
+
+/*
+ * fdt_nop_property - blank out a property with FDT_NOP tags
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node holding the property
+ * @name: NUL-terminated name of the property
+ *
+ * Both the property header and its value are overwritten.
+ *
+ * returns:
+ *	0, on success
+ *	the value fdt_get_property_w() leaves in its length argument when it
+ *		finds no property
+ */
+int fdt_nop_property(void *fdt, int nodeoffset, const char *name)
+{
+	struct fdt_property *victim;
+	int vallen;
+
+	victim = fdt_get_property_w(fdt, nodeoffset, name, &vallen);
+	if (victim != NULL) {
+		fdt_nop_region_(victim, vallen + sizeof(*victim));
+		return 0;
+	}
+
+	return vallen;
+}
+
+/*
+ * Step through the tree with fdt_next_node(), starting at offset with a
+ * depth of 0, until either the depth or the offset becomes negative, and
+ * return the offset reached.  A negative result is an error code.
+ */
+int fdt_node_end_offset_(void *fdt, int offset)
+{
+	int depth;
+
+	for (depth = 0; (depth >= 0) && (offset >= 0); )
+		offset = fdt_next_node(fdt, offset, &depth);
+
+	return offset;
+}
+
+/*
+ * fdt_nop_node - blank out a node with FDT_NOP tags
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node to blank out
+ *
+ * Everything from nodeoffset up to the offset reported by
+ * fdt_node_end_offset_() is overwritten.
+ *
+ * returns:
+ *	0, on success
+ *	the negative value returned by fdt_node_end_offset_() otherwise
+ */
+int fdt_nop_node(void *fdt, int nodeoffset)
+{
+	const int stop = fdt_node_end_offset_(fdt, nodeoffset);
+	void *first;
+
+	if (stop < 0)
+		return stop;
+
+	first = fdt_offset_ptr_w(fdt, nodeoffset, 0);
+	fdt_nop_region_(first, stop - nodeoffset);
+	return 0;
+}
diff --git a/fdt/libfdt.h b/fdt/libfdt.h
new file mode 100644
index 0000000000..d2356cce43
--- /dev/null
+++ b/fdt/libfdt.h
@@ -0,0 +1,2077 @@
+/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause) */
+#ifndef LIBFDT_H
+#define LIBFDT_H
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ */
+
+#include <libfdt_env.h>
+#include <fdt.h>
+
+#define FDT_FIRST_SUPPORTED_VERSION	0x02
+#define FDT_LAST_SUPPORTED_VERSION	0x11
+
+/* Error codes: informative error codes */
+#define FDT_ERR_NOTFOUND	1
+	/* FDT_ERR_NOTFOUND: The requested node or property does not exist */
+#define FDT_ERR_EXISTS		2
+	/* FDT_ERR_EXISTS: Attempted to create a node or property which
+	 * already exists */
+#define FDT_ERR_NOSPACE		3
+	/* FDT_ERR_NOSPACE: Operation needed to expand the device
+	 * tree, but its buffer did not have sufficient space to
+	 * contain the expanded tree. Use fdt_open_into() to move the
+	 * device tree to a buffer with more space. */
+
+/* Error codes: codes for bad parameters */
+#define FDT_ERR_BADOFFSET	4
+	/* FDT_ERR_BADOFFSET: Function was passed a structure block
+	 * offset which is out-of-bounds, or which points to an
+	 * unsuitable part of the structure for the operation. */
+#define FDT_ERR_BADPATH		5
+	/* FDT_ERR_BADPATH: Function was passed a badly formatted path
+	 * (e.g. missing a leading / for a function which requires an
+	 * absolute path) */
+#define FDT_ERR_BADPHANDLE	6
+	/* FDT_ERR_BADPHANDLE: Function was passed an invalid phandle.
+	 * This can be caused either by an invalid phandle property
+	 * length, or the phandle value was either 0 or -1, which are
+	 * not permitted. */
+#define FDT_ERR_BADSTATE	7
+	/* FDT_ERR_BADSTATE: Function was passed an incomplete device
+	 * tree created by the sequential-write functions, which is
+	 * not sufficiently complete for the requested operation. */
+
+/* Error codes: codes for bad device tree blobs */
+#define FDT_ERR_TRUNCATED	8
+	/* FDT_ERR_TRUNCATED: FDT or a sub-block is improperly
+	 * terminated (overflows, goes outside allowed bounds, or
+	 * isn't properly terminated).  */
+#define FDT_ERR_BADMAGIC	9
+	/* FDT_ERR_BADMAGIC: Given "device tree" appears not to be a
+	 * device tree at all - it is missing the flattened device
+	 * tree magic number. */
+#define FDT_ERR_BADVERSION	10
+	/* FDT_ERR_BADVERSION: Given device tree has a version which
+	 * can't be handled by the requested operation.  For
+	 * read-write functions, this may mean that fdt_open_into() is
+	 * required to convert the tree to the expected version. */
+#define FDT_ERR_BADSTRUCTURE	11
+	/* FDT_ERR_BADSTRUCTURE: Given device tree has a corrupt
+	 * structure block or other serious error (e.g. misnested
+	 * nodes, or subnodes preceding properties). */
+#define FDT_ERR_BADLAYOUT	12
+	/* FDT_ERR_BADLAYOUT: For read-write functions, the given
+	 * device tree has its sub-blocks in an order that the
+	 * function can't handle (memory reserve map, then structure,
+	 * then strings).  Use fdt_open_into() to reorganize the tree
+	 * into a form suitable for the read-write operations. */
+
+/* "Can't happen" error indicating a bug in libfdt */
+#define FDT_ERR_INTERNAL	13
+	/* FDT_ERR_INTERNAL: libfdt has failed an internal assertion.
+	 * Should never be returned, if it is, it indicates a bug in
+	 * libfdt itself. */
+
+/* Errors in device tree content */
+#define FDT_ERR_BADNCELLS	14
+	/* FDT_ERR_BADNCELLS: Device tree has a #address-cells, #size-cells
+	 * or similar property with a bad format or value */
+
+#define FDT_ERR_BADVALUE	15
+	/* FDT_ERR_BADVALUE: Device tree has a property with an unexpected
+	 * value. For example: a property expected to contain a string list
+	 * is not NUL-terminated within the length of its value. */
+
+#define FDT_ERR_BADOVERLAY	16
+	/* FDT_ERR_BADOVERLAY: The device tree overlay, while
+	 * correctly structured, cannot be applied due to some
+	 * unexpected or missing value, property or node. */
+
+#define FDT_ERR_NOPHANDLES	17
+	/* FDT_ERR_NOPHANDLES: The device tree doesn't have any
+	 * phandle available anymore without causing an overflow */
+
+#define FDT_ERR_BADFLAGS	18
+	/* FDT_ERR_BADFLAGS: The function was passed a flags field that
+	 * contains invalid flags or an invalid combination of flags. */
+
+#define FDT_ERR_MAX		18
+
+/* constants */
+#define FDT_MAX_PHANDLE 0xfffffffe
+	/* Valid values for phandles range from 1 to 2^32-2. */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**********************************************************************/
+/* Low-level functions (you probably don't need these)                */
+/**********************************************************************/
+
+#ifndef SWIG /* This function is not useful in Python */
+const void *fdt_offset_ptr(const void *fdt, int offset, unsigned int checklen);
+#endif
+/*
+ * Writable variant of fdt_offset_ptr(): same lookup, with the const
+ * qualifier removed from the result.
+ */
+static inline void *fdt_offset_ptr_w(void *fdt, int offset, int checklen)
+{
+	const void *ro = fdt_offset_ptr(fdt, offset, checklen);
+
+	return (void *)(uintptr_t)ro;
+}
+
+uint32_t fdt_next_tag(const void *fdt, int offset, int *nextoffset);
+
+/*
+ * Alignment helpers:
+ *     These helpers access words from a device tree blob.  They're
+ *     built to work even with unaligned pointers on platforms (like
+ *     ARM) that don't like unaligned loads and stores
+ */
+
+/* Load a 32-bit value stored most significant byte first, byte by byte. */
+static inline uint32_t fdt32_ld(const fdt32_t *p)
+{
+	const uint8_t *bp = (const uint8_t *)p;
+	uint32_t val = 0;
+	int i;
+
+	for (i = 0; i < 4; i++)
+		val = (val << 8) | bp[i];
+
+	return val;
+}
+
+/* Store a 32-bit value most significant byte first, byte by byte. */
+static inline void fdt32_st(void *property, uint32_t value)
+{
+	uint8_t *bp = (uint8_t *)property;
+	int i;
+
+	for (i = 0; i < 4; i++)
+		bp[i] = (value >> (24 - 8 * i)) & 0xff;
+}
+
+/* Load a 64-bit value stored most significant byte first, byte by byte. */
+static inline uint64_t fdt64_ld(const fdt64_t *p)
+{
+	const uint8_t *bp = (const uint8_t *)p;
+	uint64_t val = 0;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		val = (val << 8) | bp[i];
+
+	return val;
+}
+
+/* Store a 64-bit value most significant byte first, byte by byte. */
+static inline void fdt64_st(void *property, uint64_t value)
+{
+	uint8_t *bp = (uint8_t *)property;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		bp[i] = (value >> (56 - 8 * i)) & 0xff;
+}
+
+/**********************************************************************/
+/* Traversal functions                                                */
+/**********************************************************************/
+
+int fdt_next_node(const void *fdt, int offset, int *depth);
+
+/**
+ * fdt_first_subnode() - get offset of first direct subnode
+ *
+ * @fdt:	FDT blob
+ * @offset:	Offset of node to check
+ * @return offset of first subnode, or -FDT_ERR_NOTFOUND if there is none
+ */
+int fdt_first_subnode(const void *fdt, int offset);
+
+/**
+ * fdt_next_subnode() - get offset of next direct subnode
+ *
+ * After first calling fdt_first_subnode(), call this function repeatedly to
+ * get direct subnodes of a parent node.
+ *
+ * @fdt:	FDT blob
+ * @offset:	Offset of previous subnode
+ * @return offset of next subnode, or -FDT_ERR_NOTFOUND if there are no more
+ * subnodes
+ */
+int fdt_next_subnode(const void *fdt, int offset);
+
+/**
+ * fdt_for_each_subnode - iterate over all subnodes of a parent
+ *
+ * @node:	child node (int, lvalue)
+ * @fdt:	FDT blob (const void *)
+ * @parent:	parent node (int)
+ *
+ * This is actually a wrapper around a for loop and would be used like so:
+ *
+ *	fdt_for_each_subnode(node, fdt, parent) {
+ *		Use node
+ *		...
+ *	}
+ *
+ *	if ((node < 0) && (node != -FDT_ERR_NOTFOUND)) {
+ *		Error handling
+ *	}
+ *
+ * Note that this is implemented as a macro and @node is used as
+ * iterator in the loop. The parent variable can be constant or even a
+ * literal.
+ *
+ * The loop ends as soon as @node becomes negative.
+ */
+#define fdt_for_each_subnode(node, fdt, parent)		\
+	for (node = fdt_first_subnode(fdt, parent);	\
+	     node >= 0;					\
+	     node = fdt_next_subnode(fdt, node))
+
+/**********************************************************************/
+/* General functions                                                  */
+/**********************************************************************/
+/*
+ * Header accessors: fdt_get_header() reads one field of struct fdt_header
+ * through fdt32_ld(); each macro below names the field it reads.
+ */
+#define fdt_get_header(fdt, field) \
+	(fdt32_ld(&((const struct fdt_header *)(fdt))->field))
+#define fdt_magic(fdt)			(fdt_get_header(fdt, magic))
+#define fdt_totalsize(fdt)		(fdt_get_header(fdt, totalsize))
+#define fdt_off_dt_struct(fdt)		(fdt_get_header(fdt, off_dt_struct))
+#define fdt_off_dt_strings(fdt)		(fdt_get_header(fdt, off_dt_strings))
+#define fdt_off_mem_rsvmap(fdt)		(fdt_get_header(fdt, off_mem_rsvmap))
+#define fdt_version(fdt)		(fdt_get_header(fdt, version))
+#define fdt_last_comp_version(fdt)	(fdt_get_header(fdt, last_comp_version))
+#define fdt_boot_cpuid_phys(fdt)	(fdt_get_header(fdt, boot_cpuid_phys))
+#define fdt_size_dt_strings(fdt)	(fdt_get_header(fdt, size_dt_strings))
+#define fdt_size_dt_struct(fdt)		(fdt_get_header(fdt, size_dt_struct))
+
+/*
+ * fdt_set_hdr_(name) defines fdt_set_<name>(fdt, val), which stores val in
+ * header field <name> after converting it with cpu_to_fdt32().  The
+ * generator macro is undefined again once all setters have been emitted.
+ */
+#define fdt_set_hdr_(name) \
+	static inline void fdt_set_##name(void *fdt, uint32_t val) \
+	{ \
+		struct fdt_header *fdth = (struct fdt_header *)fdt; \
+		fdth->name = cpu_to_fdt32(val); \
+	}
+fdt_set_hdr_(magic);
+fdt_set_hdr_(totalsize);
+fdt_set_hdr_(off_dt_struct);
+fdt_set_hdr_(off_dt_strings);
+fdt_set_hdr_(off_mem_rsvmap);
+fdt_set_hdr_(version);
+fdt_set_hdr_(last_comp_version);
+fdt_set_hdr_(boot_cpuid_phys);
+fdt_set_hdr_(size_dt_strings);
+fdt_set_hdr_(size_dt_struct);
+#undef fdt_set_hdr_
+
+/**
+ * fdt_header_size - return the size of the tree's header
+ * @fdt: pointer to a flattened device tree
+ *
+ * The size is the one fdt_header_size_() gives for the version number
+ * recorded in the tree's header.
+ */
+size_t fdt_header_size_(uint32_t version);
+static inline size_t fdt_header_size(const void *fdt)
+{
+	const uint32_t version = fdt_version(fdt);
+
+	return fdt_header_size_(version);
+}
+
+/**
+ * fdt_check_header - sanity check a device tree header
+ *
+ * @fdt: pointer to data which might be a flattened device tree
+ *
+ * fdt_check_header() checks that the given buffer contains what
+ * appears to be a flattened device tree, and that the header contains
+ * valid information (to the extent that can be determined from the
+ * header alone).
+ *
+ * returns:
+ *     0, if the buffer appears to contain a valid device tree
+ *     -FDT_ERR_BADMAGIC,
+ *     -FDT_ERR_BADVERSION,
+ *     -FDT_ERR_BADSTATE,
+ *     -FDT_ERR_TRUNCATED, standard meanings, as above
+ */
+int fdt_check_header(const void *fdt);
+
+/**
+ * fdt_move - move a device tree around in memory
+ * @fdt: pointer to the device tree to move
+ * @buf: pointer to memory where the device is to be moved
+ * @bufsize: size of the memory space at buf
+ *
+ * fdt_move() relocates, if possible, the device tree blob located at
+ * fdt to the buffer at buf of size bufsize.  The buffer may overlap
+ * with the existing device tree blob at fdt.  Therefore,
+ *     fdt_move(fdt, fdt, fdt_totalsize(fdt))
+ * should always succeed.
+ *
+ * returns:
+ *     0, on success
+ *     -FDT_ERR_NOSPACE, bufsize is insufficient to contain the device tree
+ *     -FDT_ERR_BADMAGIC,
+ *     -FDT_ERR_BADVERSION,
+ *     -FDT_ERR_BADSTATE, standard meanings
+ */
+int fdt_move(const void *fdt, void *buf, int bufsize);
+
+/**********************************************************************/
+/* Read-only functions                                                */
+/**********************************************************************/
+
+int fdt_check_full(const void *fdt, size_t bufsize);
+
+/**
+ * fdt_get_string - retrieve a string from the strings block of a device tree
+ * @fdt: pointer to the device tree blob
+ * @stroffset: offset of the string within the strings block (native endian)
+ * @lenp: optional pointer to return the string's length
+ *
+ * fdt_get_string() retrieves a pointer to a single string from the
+ * strings block of the device tree blob at fdt, and optionally also
+ * returns the string's length in *lenp.
+ *
+ * returns:
+ *     a pointer to the string, on success
+ *     NULL, if stroffset is out of bounds, or doesn't point to a valid string
+ */
+const char *fdt_get_string(const void *fdt, int stroffset, int *lenp);
+
+/**
+ * fdt_string - retrieve a string from the strings block of a device tree
+ * @fdt: pointer to the device tree blob
+ * @stroffset: offset of the string within the strings block (native endian)
+ *
+ * fdt_string() retrieves a pointer to a single string from the
+ * strings block of the device tree blob at fdt.
+ *
+ * returns:
+ *     a pointer to the string, on success
+ *     NULL, if stroffset is out of bounds, or doesn't point to a valid string
+ */
+const char *fdt_string(const void *fdt, int stroffset);
+
+/**
+ * fdt_find_max_phandle - find and return the highest phandle in a tree
+ * @fdt: pointer to the device tree blob
+ * @phandle: return location for the highest phandle value found in the tree
+ *
+ * fdt_find_max_phandle() finds the highest phandle value in the given device
+ * tree. The value returned in @phandle is only valid if the function returns
+ * success.
+ *
+ * returns:
+ *     0 on success or a negative error code on failure
+ */
+int fdt_find_max_phandle(const void *fdt, uint32_t *phandle);
+
+/**
+ * fdt_get_max_phandle - retrieves the highest phandle in a tree
+ * @fdt: pointer to the device tree blob
+ *
+ * fdt_get_max_phandle retrieves the highest phandle in the given
+ * device tree. This will ignore badly formatted phandles, or phandles
+ * with a value of 0 or -1.
+ *
+ * This function is deprecated in favour of fdt_find_max_phandle().
+ *
+ * returns:
+ *      the highest phandle on success
+ *      0, if no phandle was found in the device tree
+ *      -1, if an error occurred
+ */
+static inline uint32_t fdt_get_max_phandle(const void *fdt)
+{
+	uint32_t highest;
+
+	/* every failure of the lookup is reported as (uint32_t)-1 */
+	if (fdt_find_max_phandle(fdt, &highest) < 0)
+		return (uint32_t)-1;
+
+	return highest;
+}
+
+/**
+ * fdt_generate_phandle - return a new, unused phandle for a device tree blob
+ * @fdt: pointer to the device tree blob
+ * @phandle: return location for the new phandle
+ *
+ * Walks the device tree blob and looks for the highest phandle value. On
+ * success, the new, unused phandle value (one higher than the previously
+ * highest phandle value in the device tree blob) will be returned in the
+ * @phandle parameter.
+ *
+ * Returns:
+ *   0 on success or a negative error-code on failure
+ */
+int fdt_generate_phandle(const void *fdt, uint32_t *phandle);
+
+/**
+ * fdt_num_mem_rsv - retrieve the number of memory reserve map entries
+ * @fdt: pointer to the device tree blob
+ *
+ * Returns the number of entries in the device tree blob's memory
+ * reservation map.  This does not include the terminating 0,0 entry
+ * or any other (0,0) entries reserved for expansion.
+ *
+ * returns:
+ *     the number of entries
+ */
+int fdt_num_mem_rsv(const void *fdt);
+
+/**
+ * fdt_get_mem_rsv - retrieve one memory reserve map entry
+ * @fdt: pointer to the device tree blob
+ * @address, @size: pointers to 64-bit variables
+ *
+ * On success, *address and *size will contain the address and size of
+ * the n-th reserve map entry from the device tree blob, in
+ * native-endian format.
+ *
+ * returns:
+ *     0, on success
+ *     -FDT_ERR_BADMAGIC,
+ *     -FDT_ERR_BADVERSION,
+ *     -FDT_ERR_BADSTATE, standard meanings
+ */
+int fdt_get_mem_rsv(const void *fdt, int n, uint64_t *address, uint64_t *size);
+
+/**
+ * fdt_subnode_offset_namelen - find a subnode based on substring
+ * @fdt: pointer to the device tree blob
+ * @parentoffset: structure block offset of a node
+ * @name: name of the subnode to locate
+ * @namelen: number of characters of name to consider
+ *
+ * Identical to fdt_subnode_offset(), but only examine the first
+ * namelen characters of name for matching the subnode name.  This is
+ * useful for finding subnodes based on a portion of a larger string,
+ * such as a full path.
+ */
+#ifndef SWIG /* Not available in Python */
+int fdt_subnode_offset_namelen(const void *fdt, int parentoffset,
+			       const char *name, int namelen);
+#endif
+/**
+ * fdt_subnode_offset - find a subnode of a given node
+ * @fdt: pointer to the device tree blob
+ * @parentoffset: structure block offset of a node
+ * @name: name of the subnode to locate
+ *
+ * fdt_subnode_offset() finds a subnode of the node at structure block
+ * offset parentoffset with the given name.  name may include a unit
+ * address, in which case fdt_subnode_offset() will find the subnode
+ * with that unit address, or the unit address may be omitted, in
+ * which case fdt_subnode_offset() will find an arbitrary subnode
+ * whose name excluding unit address matches the given name.
+ *
+ * returns:
+ *	structure block offset of the requested subnode (>=0), on success
+ *	-FDT_ERR_NOTFOUND, if the requested subnode does not exist
+ *	-FDT_ERR_BADOFFSET, if parentoffset did not point to an FDT_BEGIN_NODE
+ *		tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings.
+ */
+int fdt_subnode_offset(const void *fdt, int parentoffset, const char *name);
+
+/**
+ * fdt_path_offset_namelen - find a tree node by its full path
+ * @fdt: pointer to the device tree blob
+ * @path: full path of the node to locate
+ * @namelen: number of characters of path to consider
+ *
+ * Identical to fdt_path_offset(), but only consider the first namelen
+ * characters of path as the path name.
+ */
+#ifndef SWIG /* Not available in Python */
+int fdt_path_offset_namelen(const void *fdt, const char *path, int namelen);
+#endif
+
+/**
+ * fdt_path_offset - find a tree node by its full path
+ * @fdt: pointer to the device tree blob
+ * @path: full path of the node to locate
+ *
+ * fdt_path_offset() finds a node of a given path in the device tree.
+ * Each path component may omit the unit address portion, but the
+ * results of this are undefined if any such path component is
+ * ambiguous (that is if there are multiple nodes at the relevant
+ * level matching the given component, differentiated only by unit
+ * address).
+ *
+ * returns:
+ *	structure block offset of the node with the requested path (>=0), on
+ *		success
+ *	-FDT_ERR_BADPATH, given path does not begin with '/' or is invalid
+ *	-FDT_ERR_NOTFOUND, if the requested node does not exist
+ *      -FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings.
+ */
+int fdt_path_offset(const void *fdt, const char *path);
+
+/**
+ * fdt_get_name - retrieve the name of a given node
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: structure block offset of the starting node
+ * @lenp: pointer to an integer variable (will be overwritten) or NULL
+ *
+ * fdt_get_name() retrieves the name (including unit address) of the
+ * device tree node at structure block offset nodeoffset.  If lenp is
+ * non-NULL, the length of this name is also returned, in the integer
+ * pointed to by lenp.
+ *
+ * returns:
+ *	pointer to the node's name, on success
+ *		If lenp is non-NULL, *lenp contains the length of that name
+ *			(>=0)
+ *	NULL, on error
+ *		if lenp is non-NULL *lenp contains an error code (<0):
+ *		-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE
+ *			tag
+ *		-FDT_ERR_BADMAGIC,
+ *		-FDT_ERR_BADVERSION,
+ *		-FDT_ERR_BADSTATE, standard meanings
+ */
+const char *fdt_get_name(const void *fdt, int nodeoffset, int *lenp);
+
+/**
+ * fdt_first_property_offset - find the offset of a node's first property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: structure block offset of a node
+ *
+ * fdt_first_property_offset() finds the first property of the node at
+ * the given structure block offset.
+ *
+ * returns:
+ *	structure block offset of the property (>=0), on success
+ *	-FDT_ERR_NOTFOUND, if the requested node has no properties
+ *	-FDT_ERR_BADOFFSET, if nodeoffset did not point to an FDT_BEGIN_NODE tag
+ *      -FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings.
+ */
+int fdt_first_property_offset(const void *fdt, int nodeoffset);
+
+/**
+ * fdt_next_property_offset - step through a node's properties
+ * @fdt: pointer to the device tree blob
+ * @offset: structure block offset of a property
+ *
+ * fdt_next_property_offset() finds the property immediately after the
+ * one at the given structure block offset.  This will be a property
+ * of the same node as the given property.
+ *
+ * returns:
+ *	structure block offset of the next property (>=0), on success
+ *	-FDT_ERR_NOTFOUND, if the given property is the last in its node
+ *	-FDT_ERR_BADOFFSET, if offset did not point to an FDT_PROP tag
+ *      -FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings.
+ */
+int fdt_next_property_offset(const void *fdt, int offset);
+
+/**
+ * fdt_for_each_property_offset - iterate over all properties of a node
+ *
+ * @property:		property offset (int, lvalue)
+ * @fdt:		FDT blob (const void *)
+ * @node:		node offset (int)
+ *
+ * This is actually a wrapper around a for loop and would be used like so:
+ *
+ *	fdt_for_each_property_offset(property, fdt, node) {
+ *		Use property
+ *		...
+ *	}
+ *
+ *	if ((property < 0) && (property != -FDT_ERR_NOTFOUND)) {
+ *		Error handling
+ *	}
+ *
+ * Note that this is implemented as a macro: property is the loop iterator
+ * and, like fdt, is expanded several times, so both should be free of side
+ * effects. node is expanded once and can be constant or even a literal.
+ */
+#define fdt_for_each_property_offset(property, fdt, node)	\
+	for (property = fdt_first_property_offset(fdt, node);	\
+	     property >= 0;					\
+	     property = fdt_next_property_offset(fdt, property))
+
+/**
+ * fdt_get_property_by_offset - retrieve the property at a given offset
+ * @fdt: pointer to the device tree blob
+ * @offset: offset of the property to retrieve
+ * @lenp: pointer to an integer variable (will be overwritten) or NULL
+ *
+ * fdt_get_property_by_offset() retrieves a pointer to the
+ * fdt_property structure within the device tree blob at the given
+ * offset.  If lenp is non-NULL, the length of the property value is
+ * also returned, in the integer pointed to by lenp.
+ *
+ * Note that this code only works on device tree versions >= 16. fdt_getprop()
+ * works on all versions.
+ *
+ * returns:
+ *	pointer to the structure representing the property
+ *		if lenp is non-NULL, *lenp contains the length of the property
+ *		value (>=0)
+ *	NULL, on error
+ *		if lenp is non-NULL, *lenp contains an error code (<0):
+ *		-FDT_ERR_BADOFFSET, offset did not point to FDT_PROP tag
+ *		-FDT_ERR_BADMAGIC,
+ *		-FDT_ERR_BADVERSION,
+ *		-FDT_ERR_BADSTATE,
+ *		-FDT_ERR_BADSTRUCTURE,
+ *		-FDT_ERR_TRUNCATED, standard meanings
+ */
+const struct fdt_property *fdt_get_property_by_offset(const void *fdt,
+						      int offset,
+						      int *lenp);
+
+/**
+ * fdt_get_property_namelen - find a property based on substring
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to find
+ * @name: name of the property to find
+ * @namelen: number of characters of name to consider
+ * @lenp: pointer to an integer variable (will be overwritten) or NULL
+ *
+ * Identical to fdt_get_property(), but only examine the first namelen
+ * characters of name for matching the property name.
+ */
+#ifndef SWIG /* Not available in Python */
+const struct fdt_property *fdt_get_property_namelen(const void *fdt,
+						    int nodeoffset,
+						    const char *name,
+						    int namelen, int *lenp);
+#endif
+
+/**
+ * fdt_get_property - find a given property in a given node
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to find
+ * @name: name of the property to find
+ * @lenp: pointer to an integer variable (will be overwritten) or NULL
+ *
+ * fdt_get_property() retrieves a pointer to the fdt_property structure
+ * within the device tree blob corresponding to the property named 'name'
+ * of the node at offset nodeoffset.  If lenp is non-NULL, the length of
+ * the property value is also returned, in the integer pointed to by lenp.
+ * fdt_get_property_w() is the same, but takes and returns non-const pointers.
+ *
+ * returns:
+ *	pointer to the structure representing the property
+ *		if lenp is non-NULL, *lenp contains the length of the property
+ *		value (>=0)
+ *	NULL, on error
+ *		if lenp is non-NULL, *lenp contains an error code (<0):
+ *		-FDT_ERR_NOTFOUND, node does not have named property
+ *		-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE
+ *			tag
+ *		-FDT_ERR_BADMAGIC,
+ *		-FDT_ERR_BADVERSION,
+ *		-FDT_ERR_BADSTATE,
+ *		-FDT_ERR_BADSTRUCTURE,
+ *		-FDT_ERR_TRUNCATED, standard meanings
+ */
+const struct fdt_property *fdt_get_property(const void *fdt, int nodeoffset,
+					    const char *name, int *lenp);
+static inline struct fdt_property *fdt_get_property_w(void *fdt, int nodeoffset,
+						      const char *name,
+						      int *lenp)
+{
+	return (struct fdt_property *)(uintptr_t)
+		fdt_get_property(fdt, nodeoffset, name, lenp);
+}
+
+/**
+ * fdt_getprop_by_offset - retrieve the value of a property at a given offset
+ * @fdt: pointer to the device tree blob
+ * @offset: offset of the property to read
+ * @namep: pointer to a string variable (will be overwritten) or NULL
+ * @lenp: pointer to an integer variable (will be overwritten) or NULL
+ *
+ * fdt_getprop_by_offset() retrieves a pointer to the value of the
+ * property at structure block offset 'offset' (this will be a pointer
+ * to within the device blob itself, not a copy of the value).  If
+ * lenp is non-NULL, the length of the property value is also
+ * returned, in the integer pointed to by lenp.  If namep is non-NULL,
+ * the property's name will also be returned in the char * pointed to
+ * by namep (this will be a pointer to within the device tree's string
+ * block, not a new copy of the name).
+ *
+ * returns:
+ *	pointer to the property's value
+ *		if lenp is non-NULL, *lenp contains the length of the property
+ *		value (>=0)
+ *		if namep is non-NULL, *namep contains a pointer to the property
+ *		name.
+ *	NULL, on error
+ *		if lenp is non-NULL, *lenp contains an error code (<0):
+ *		-FDT_ERR_BADOFFSET, offset did not point to FDT_PROP tag
+ *		-FDT_ERR_BADMAGIC,
+ *		-FDT_ERR_BADVERSION,
+ *		-FDT_ERR_BADSTATE,
+ *		-FDT_ERR_BADSTRUCTURE,
+ *		-FDT_ERR_TRUNCATED, standard meanings
+ */
+#ifndef SWIG /* This function is not useful in Python */
+const void *fdt_getprop_by_offset(const void *fdt, int offset,
+				  const char **namep, int *lenp);
+#endif
+
+/**
+ * fdt_getprop_namelen - get property value based on substring
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to find
+ * @name: name of the property to find
+ * @namelen: number of characters of name to consider
+ * @lenp: pointer to an integer variable (will be overwritten) or NULL
+ *
+ * Identical to fdt_getprop(), but only the first namelen characters of name
+ * are matched.  fdt_getprop_namelen_w() takes/returns non-const pointers.
+ */
+#ifndef SWIG /* Not available in Python */
+const void *fdt_getprop_namelen(const void *fdt, int nodeoffset,
+				const char *name, int namelen, int *lenp);
+static inline void *fdt_getprop_namelen_w(void *fdt, int nodeoffset,
+					  const char *name, int namelen,
+					  int *lenp)
+{
+	return (void *)(uintptr_t)fdt_getprop_namelen(fdt, nodeoffset, name,
+						      namelen, lenp);
+}
+#endif
+
+/**
+ * fdt_getprop - retrieve the value of a given property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to find
+ * @name: name of the property to find
+ * @lenp: pointer to an integer variable (will be overwritten) or NULL
+ *
+ * fdt_getprop() retrieves a pointer to the value of the property named
+ * 'name' of the node at offset nodeoffset (a pointer to within the device
+ * blob itself, not a copy of the value).  If lenp is non-NULL, the length
+ * of the property value is also returned, in the integer pointed to by lenp.
+ * fdt_getprop_w() is the same, but takes and returns non-const pointers.
+ *
+ * returns:
+ *	pointer to the property's value
+ *		if lenp is non-NULL, *lenp contains the length of the property
+ *		value (>=0)
+ *	NULL, on error
+ *		if lenp is non-NULL, *lenp contains an error code (<0):
+ *		-FDT_ERR_NOTFOUND, node does not have named property
+ *		-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE
+ *			tag
+ *		-FDT_ERR_BADMAGIC,
+ *		-FDT_ERR_BADVERSION,
+ *		-FDT_ERR_BADSTATE,
+ *		-FDT_ERR_BADSTRUCTURE,
+ *		-FDT_ERR_TRUNCATED, standard meanings
+ */
+const void *fdt_getprop(const void *fdt, int nodeoffset,
+			const char *name, int *lenp);
+static inline void *fdt_getprop_w(void *fdt, int nodeoffset,
+				  const char *name, int *lenp)
+{
+	return (void *)(uintptr_t)fdt_getprop(fdt, nodeoffset, name, lenp);
+}
+
+/**
+ * fdt_get_phandle - retrieve the phandle of a given node
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: structure block offset of the node
+ *
+ * fdt_get_phandle() retrieves the phandle of the device tree node at
+ * structure block offset nodeoffset.
+ *
+ * returns:
+ *	the phandle of the node at nodeoffset, on success (!= 0, != -1)
+ *	0, if the node has no phandle, or another error occurs
+ */
+uint32_t fdt_get_phandle(const void *fdt, int nodeoffset);
+
+/**
+ * fdt_get_alias_namelen - get alias based on substring
+ * @fdt: pointer to the device tree blob
+ * @name: name of the alias to look up
+ * @namelen: number of characters of name to consider
+ *
+ * Identical to fdt_get_alias(), but only examine the first namelen
+ * characters of name for matching the alias name.
+ */
+#ifndef SWIG /* Not available in Python */
+const char *fdt_get_alias_namelen(const void *fdt,
+				  const char *name, int namelen);
+#endif
+
+/**
+ * fdt_get_alias - retrieve the path referenced by a given alias
+ * @fdt: pointer to the device tree blob
+ * @name: name of the alias to look up
+ *
+ * fdt_get_alias() retrieves the value of a given alias.  That is, the
+ * value of the property named 'name' in the node /aliases.
+ *
+ * returns:
+ *	a pointer to the expansion of the alias named 'name', if it exists
+ *	NULL, if the given alias or the /aliases node does not exist
+ */
+const char *fdt_get_alias(const void *fdt, const char *name);
+
+/**
+ * fdt_get_path - determine the full path of a node
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose path to find
+ * @buf: character buffer to contain the returned path (will be overwritten)
+ * @buflen: size of the character buffer at buf
+ *
+ * fdt_get_path() computes the full path of the node at offset
+ * nodeoffset, and records that path in the buffer at buf.
+ *
+ * NOTE: This function is expensive, as it must scan the device tree
+ * structure from the start to nodeoffset.
+ *
+ * returns:
+ *	0, on success
+ *		buf contains the absolute path of the node at
+ *		nodeoffset, as a NUL-terminated string.
+ *	-FDT_ERR_BADOFFSET, nodeoffset does not refer to a BEGIN_NODE tag
+ *	-FDT_ERR_NOSPACE, the path of the given node is longer than (buflen-1)
+ *		characters and will not fit in the given buffer.
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE, standard meanings
+ */
+int fdt_get_path(const void *fdt, int nodeoffset, char *buf, int buflen);
+
+/**
+ * fdt_supernode_atdepth_offset - find a specific ancestor of a node
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose ancestor to find
+ * @supernodedepth: depth of the ancestor to find
+ * @nodedepth: pointer to an integer variable (will be overwritten) or NULL
+ *
+ * fdt_supernode_atdepth_offset() finds an ancestor of the given node
+ * at a specific depth from the root (where the root itself has depth
+ * 0, its immediate subnodes depth 1 and so forth).  So
+ *	fdt_supernode_atdepth_offset(fdt, nodeoffset, 0, NULL);
+ * will always return 0, the offset of the root node.  If the node at
+ * nodeoffset has depth D, then:
+ *	fdt_supernode_atdepth_offset(fdt, nodeoffset, D, NULL);
+ * will return nodeoffset itself.
+ *
+ * NOTE: This function is expensive, as it must scan the device tree
+ * structure from the start to nodeoffset.
+ *
+ * returns:
+ *	structure block offset of the node at node offset's ancestor
+ *		of depth supernodedepth (>=0), on success
+ *	-FDT_ERR_BADOFFSET, nodeoffset does not refer to a BEGIN_NODE tag
+ *	-FDT_ERR_NOTFOUND, supernodedepth was greater than the depth of
+ *		nodeoffset
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE, standard meanings
+ */
+int fdt_supernode_atdepth_offset(const void *fdt, int nodeoffset,
+				 int supernodedepth, int *nodedepth);
+
+/**
+ * fdt_node_depth - find the depth of a given node
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose depth to find
+ *
+ * fdt_node_depth() finds the depth of a given node.  The root node
+ * has depth 0, its immediate subnodes depth 1 and so forth.
+ *
+ * NOTE: This function is expensive, as it must scan the device tree
+ * structure from the start to nodeoffset.
+ *
+ * returns:
+ *	depth of the node at nodeoffset (>=0), on success
+ *	-FDT_ERR_BADOFFSET, nodeoffset does not refer to a BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE, standard meanings
+ */
+int fdt_node_depth(const void *fdt, int nodeoffset);
+
+/**
+ * fdt_parent_offset - find the parent of a given node
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose parent to find
+ *
+ * fdt_parent_offset() locates the parent node of a given node (that
+ * is, it finds the offset of the node which contains the node at
+ * nodeoffset as a subnode).
+ *
+ * NOTE: This function is expensive, as it must scan the device tree
+ * structure from the start to nodeoffset, *twice*.
+ *
+ * returns:
+ *	structure block offset of the parent of the node at nodeoffset
+ *		(>=0), on success
+ *	-FDT_ERR_BADOFFSET, nodeoffset does not refer to a BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE, standard meanings
+ */
+int fdt_parent_offset(const void *fdt, int nodeoffset);
+
+/**
+ * fdt_node_offset_by_prop_value - find nodes with a given property value
+ * @fdt: pointer to the device tree blob
+ * @startoffset: only find nodes after this offset
+ * @propname: property name to check
+ * @propval: property value to search for
+ * @proplen: length of the value in propval
+ *
+ * fdt_node_offset_by_prop_value() returns the offset of the first
+ * node after startoffset, which has a property named propname whose
+ * value is of length proplen and has value equal to propval; or if
+ * startoffset is -1, the very first such node in the tree.
+ *
+ * To iterate through all nodes matching the criterion, the following
+ * idiom can be used:
+ *	offset = fdt_node_offset_by_prop_value(fdt, -1, propname,
+ *					       propval, proplen);
+ *	while (offset != -FDT_ERR_NOTFOUND) {
+ *		// other code here
+ *		offset = fdt_node_offset_by_prop_value(fdt, offset, propname,
+ *						       propval, proplen);
+ *	}
+ *
+ * Note the -1 in the first call to the function, if 0 is used here
+ * instead, the function will never locate the root node, even if it
+ * matches the criterion.
+ *
+ * returns:
+ *	structure block offset of the located node (>= 0, >startoffset),
+ *		 on success
+ *	-FDT_ERR_NOTFOUND, no node matching the criterion exists in the
+ *		tree after startoffset
+ *	-FDT_ERR_BADOFFSET, startoffset does not refer to a BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE, standard meanings
+ */
+int fdt_node_offset_by_prop_value(const void *fdt, int startoffset,
+				  const char *propname,
+				  const void *propval, int proplen);
+
+/**
+ * fdt_node_offset_by_phandle - find the node with a given phandle
+ * @fdt: pointer to the device tree blob
+ * @phandle: phandle value
+ *
+ * fdt_node_offset_by_phandle() returns the offset of the node
+ * which has the given phandle value.  If there is more than one node
+ * in the tree with the given phandle (an invalid tree), results are
+ * undefined.
+ *
+ * returns:
+ *	structure block offset of the located node (>= 0), on success
+ *	-FDT_ERR_NOTFOUND, no node with that phandle exists
+ *	-FDT_ERR_BADPHANDLE, given phandle value was invalid (0 or -1)
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE, standard meanings
+ */
+int fdt_node_offset_by_phandle(const void *fdt, uint32_t phandle);
+
+/**
+ * fdt_node_check_compatible - check a node's compatible property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of a tree node
+ * @compatible: string to match against
+ *
+ *
+ * fdt_node_check_compatible() returns 0 if the given node contains a
+ * 'compatible' property with the given string as one of its elements,
+ * it returns non-zero otherwise, or on error.
+ *
+ * returns:
+ *	0, if the node has a 'compatible' property listing the given string
+ *	1, if the node has a 'compatible' property, but it does not list
+ *		the given string
+ *	-FDT_ERR_NOTFOUND, if the given node has no 'compatible' property
+ *	-FDT_ERR_BADOFFSET, if nodeoffset does not refer to a BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE, standard meanings
+ */
+int fdt_node_check_compatible(const void *fdt, int nodeoffset,
+			      const char *compatible);
+
+/**
+ * fdt_node_offset_by_compatible - find nodes with a given 'compatible' value
+ * @fdt: pointer to the device tree blob
+ * @startoffset: only find nodes after this offset
+ * @compatible: 'compatible' string to match against
+ *
+ * fdt_node_offset_by_compatible() returns the offset of the first
+ * node after startoffset, which has a 'compatible' property which
+ * lists the given compatible string; or if startoffset is -1, the
+ * very first such node in the tree.
+ *
+ * To iterate through all nodes matching the criterion, the following
+ * idiom can be used:
+ *	offset = fdt_node_offset_by_compatible(fdt, -1, compatible);
+ *	while (offset != -FDT_ERR_NOTFOUND) {
+ *		// other code here
+ *		offset = fdt_node_offset_by_compatible(fdt, offset, compatible);
+ *	}
+ *
+ * Note the -1 in the first call to the function, if 0 is used here
+ * instead, the function will never locate the root node, even if it
+ * matches the criterion.
+ *
+ * returns:
+ *	structure block offset of the located node (>= 0, >startoffset),
+ *		 on success
+ *	-FDT_ERR_NOTFOUND, no node matching the criterion exists in the
+ *		tree after startoffset
+ *	-FDT_ERR_BADOFFSET, startoffset does not refer to a BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE, standard meanings
+ */
+int fdt_node_offset_by_compatible(const void *fdt, int startoffset,
+				  const char *compatible);
+
+/**
+ * fdt_stringlist_contains - check a string list property for a string
+ * @strlist: Property containing a list of strings to check
+ * @listlen: Length of property
+ * @str: String to search for
+ *
+ * This is a utility function provided for convenience. The list contains
+ * one or more strings, each terminated by \0, as is found in a device tree
+ * "compatible" property.
+ *
+ * @return: 1 if the string is found in the list, 0 not found, or invalid list
+ */
+int fdt_stringlist_contains(const char *strlist, int listlen, const char *str);
+
+/**
+ * fdt_stringlist_count - count the number of strings in a string list
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of a tree node
+ * @property: name of the property containing the string list
+ * @return:
+ *   the number of strings in the given property
+ *   -FDT_ERR_BADVALUE if the property value is not NUL-terminated
+ *   -FDT_ERR_NOTFOUND if the property does not exist
+ */
+int fdt_stringlist_count(const void *fdt, int nodeoffset, const char *property);
+
+/**
+ * fdt_stringlist_search - find a string in a string list and return its index
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of a tree node
+ * @property: name of the property containing the string list
+ * @string: string to look up in the string list
+ *
+ * Note that it is possible for this function to succeed on property values
+ * that are not NUL-terminated. That's because the function will stop after
+ * finding the first occurrence of @string. This can for example happen with
+ * small-valued cell properties, such as #address-cells, when searching for
+ * the empty string.
+ *
+ * @return:
+ *   the index of the string in the list of strings
+ *   -FDT_ERR_BADVALUE if the property value is not NUL-terminated
+ *   -FDT_ERR_NOTFOUND if the property does not exist or does not contain
+ *                     the given string
+ */
+int fdt_stringlist_search(const void *fdt, int nodeoffset, const char *property,
+			  const char *string);
+
+/**
+ * fdt_stringlist_get() - obtain the string at a given index in a string list
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of a tree node
+ * @property: name of the property containing the string list
+ * @index: index of the string to return
+ * @lenp: return location for the string length or an error code on failure
+ *
+ * Note that this will successfully extract strings from properties with
+ * non-NUL-terminated values. For example on small-valued cell properties
+ * this function will return the empty string.
+ *
+ * If non-NULL, the length of the string (on success) or a negative error-code
+ * (on failure) will be stored in the integer pointed to by lenp.
+ *
+ * @return:
+ *   A pointer to the string at the given index in the string list or NULL on
+ *   failure. On success the length of the string will be stored in the memory
+ *   location pointed to by the lenp parameter, if non-NULL. On failure one of
+ *   the following negative error codes will be returned in the lenp parameter
+ *   (if non-NULL):
+ *     -FDT_ERR_BADVALUE if the property value is not NUL-terminated
+ *     -FDT_ERR_NOTFOUND if the property does not exist
+ */
+const char *fdt_stringlist_get(const void *fdt, int nodeoffset,
+			       const char *property, int index,
+			       int *lenp);
+
+/**********************************************************************/
+/* Read-only functions (addressing related)                           */
+/**********************************************************************/
+
+/**
+ * FDT_MAX_NCELLS - maximum value for #address-cells and #size-cells
+ *
+ * This is the maximum value for #address-cells, #size-cells and
+ * similar properties that will be processed by libfdt.  IEEE 1275
+ * requires that OF implementations handle values up to 4.
+ * Implementations may support larger values, but in practice higher
+ * values aren't used.
+ */
+#define FDT_MAX_NCELLS		4
+
+/**
+ * fdt_address_cells - retrieve address size for a bus represented in the tree
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node to find the address size for
+ *
+ * When the node has a valid #address-cells property, returns its value.
+ *
+ * returns:
+ *	0 <= n < FDT_MAX_NCELLS, on success
+ *      2, if the node has no #address-cells property
+ *      -FDT_ERR_BADNCELLS, if the node has a badly formatted or invalid
+ *		#address-cells property
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_address_cells(const void *fdt, int nodeoffset);
+
+/**
+ * fdt_size_cells - retrieve address range size for a bus represented in the
+ *                  tree
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node to find the address range size for
+ *
+ * When the node has a valid #size-cells property, returns its value.
+ *
+ * returns:
+ *	0 <= n < FDT_MAX_NCELLS, on success
+ *      1, if the node has no #size-cells property
+ *      -FDT_ERR_BADNCELLS, if the node has a badly formatted or invalid
+ *		#size-cells property
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_size_cells(const void *fdt, int nodeoffset);
+
+
+/**********************************************************************/
+/* Write-in-place functions                                           */
+/**********************************************************************/
+
+/**
+ * fdt_setprop_inplace_namelen_partial - change a property's value,
+ *                                       but not its size
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @namelen: number of characters of name to consider
+ * @idx: index of the property to change in the array
+ * @val: pointer to data to replace the property value with
+ * @len: length of the property value
+ *
+ * Identical to fdt_setprop_inplace(), but modifies the given property
+ * starting from the given index, and using only the first characters
+ * of the name. It is useful when you want to manipulate only one value of
+ * an array and you have a string that doesn't end with \0.
+ */
+#ifndef SWIG /* Not available in Python */
+int fdt_setprop_inplace_namelen_partial(void *fdt, int nodeoffset,
+					const char *name, int namelen,
+					uint32_t idx, const void *val,
+					int len);
+#endif
+
+/**
+ * fdt_setprop_inplace - change a property's value, but not its size
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @val: pointer to data to replace the property value with
+ * @len: length of the property value
+ *
+ * fdt_setprop_inplace() replaces the value of a given property with
+ * the data in val, of length len.  This function cannot change the
+ * size of a property, and so will only work if len is equal to the
+ * current length of the property.
+ *
+ * This function will alter only the bytes in the blob which contain
+ * the given property value, and will not alter or move any other part
+ * of the tree.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, if len is not equal to the property's current length
+ *	-FDT_ERR_NOTFOUND, node does not have the named property
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+#ifndef SWIG /* Not available in Python */
+int fdt_setprop_inplace(void *fdt, int nodeoffset, const char *name,
+			const void *val, int len);
+#endif
+
+/**
+ * fdt_setprop_inplace_u32 - change the value of a 32-bit integer property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @val: 32-bit integer value to replace the property with
+ *
+ * fdt_setprop_inplace_u32() replaces the value of a given property
+ * with the 32-bit integer value in val, converting val to big-endian
+ * if necessary (via cpu_to_fdt32()) and then calling
+ * fdt_setprop_inplace().  It cannot change the size of a property, so
+ * it only works if the property already exists and has length 4.
+ *
+ * This function will alter only the bytes in the blob which contain
+ * the given property value, and will not alter or move any other part
+ * of the tree.
+ *
+ * returns (as reported by fdt_setprop_inplace()):
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, if the property's length is not equal to 4
+ *	-FDT_ERR_NOTFOUND, node does not have the named property
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+static inline int fdt_setprop_inplace_u32(void *fdt, int nodeoffset,
+					  const char *name, uint32_t val)
+{
+	fdt32_t tmp = cpu_to_fdt32(val);
+	return fdt_setprop_inplace(fdt, nodeoffset, name, &tmp, sizeof(tmp));
+}
+
+/**
+ * fdt_setprop_inplace_u64 - change the value of a 64-bit integer property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @val: 64-bit integer value to replace the property with
+ *
+ * fdt_setprop_inplace_u64() replaces the value of a given property
+ * with the 64-bit integer value in val, converting val to big-endian
+ * if necessary (via cpu_to_fdt64()) and then calling
+ * fdt_setprop_inplace().  It cannot change the size of a property, so
+ * it only works if the property already exists and has length 8.
+ *
+ * This function will alter only the bytes in the blob which contain
+ * the given property value, and will not alter or move any other part
+ * of the tree.
+ *
+ * returns (as reported by fdt_setprop_inplace()):
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, if the property's length is not equal to 8
+ *	-FDT_ERR_NOTFOUND, node does not have the named property
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+static inline int fdt_setprop_inplace_u64(void *fdt, int nodeoffset,
+					  const char *name, uint64_t val)
+{
+	fdt64_t tmp = cpu_to_fdt64(val);
+	return fdt_setprop_inplace(fdt, nodeoffset, name, &tmp, sizeof(tmp));
+}
+
+/**
+ * fdt_setprop_inplace_cell - change the value of a single-cell property
+ *
+ * Alternative name for fdt_setprop_inplace_u32(); same arguments and result.
+ */
+static inline int fdt_setprop_inplace_cell(void *fdt, int nodeoffset,
+					   const char *name, uint32_t val)
+{
+	return fdt_setprop_inplace_u32(fdt, nodeoffset, name, val);
+}
+
+/**
+ * fdt_nop_property - replace a property with nop tags
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to nop
+ * @name: name of the property to nop
+ *
+ * fdt_nop_property() will replace a given property's representation
+ * in the blob with FDT_NOP tags, effectively removing it from the
+ * tree.
+ *
+ * This function will alter only the bytes in the blob which contain
+ * the property, and will not alter or move any other part of the
+ * tree.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOTFOUND, node does not have the named property
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_nop_property(void *fdt, int nodeoffset, const char *name);
+
+/**
+ * fdt_nop_node - replace a node (subtree) with nop tags
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node to nop
+ *
+ * fdt_nop_node() will replace a given node's representation in the
+ * blob, including all its subnodes, if any, with FDT_NOP tags,
+ * effectively removing it from the tree.
+ *
+ * This function will alter only the bytes in the blob which contain
+ * the node and its properties and subnodes, and will not alter or
+ * move any other part of the tree.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_nop_node(void *fdt, int nodeoffset);
+
+/**********************************************************************/
+/* Sequential write functions                                         */
+/**********************************************************************/
+
+/* fdt_create_with_flags flags */
+#define FDT_CREATE_FLAG_NO_NAME_DEDUP 0x1
+	/* FDT_CREATE_FLAG_NO_NAME_DEDUP: Do not try to de-duplicate property
+	 * names in the fdt. This can result in faster creation times, but
+	 * a larger fdt. */
+
+#define FDT_CREATE_FLAGS_ALL	(FDT_CREATE_FLAG_NO_NAME_DEDUP)
+
+/**
+ * fdt_create_with_flags - begin creation of a new fdt
+ * @buf: pointer to memory allocated where fdt will be created
+ * @bufsize: size of the memory space at buf
+ * @flags: a valid combination of FDT_CREATE_FLAG_ flags, or 0.
+ *
+ * fdt_create_with_flags() begins the process of creating a new fdt with
+ * the sequential write interface.
+ *
+ * fdt creation process must end with fdt_finished() to produce a valid fdt.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, bufsize is insufficient for a minimal fdt
+ *	-FDT_ERR_BADFLAGS, flags is not valid
+ */
+int fdt_create_with_flags(void *buf, int bufsize, uint32_t flags);
+
+/**
+ * fdt_create - begin creation of a new fdt
+ * @buf: pointer to memory allocated where fdt will be created
+ * @bufsize: size of the memory space at buf
+ *
+ * fdt_create() is equivalent to fdt_create_with_flags() with flags=0.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, bufsize is insufficient for a minimal fdt
+ */
+int fdt_create(void *buf, int bufsize);
+
+int fdt_resize(void *fdt, void *buf, int bufsize);
+int fdt_add_reservemap_entry(void *fdt, uint64_t addr, uint64_t size);
+int fdt_finish_reservemap(void *fdt);
+int fdt_begin_node(void *fdt, const char *name);
+int fdt_property(void *fdt, const char *name, const void *val, int len);
+static inline int fdt_property_u32(void *fdt, const char *name, uint32_t val)
+{
+	const fdt32_t be_val = cpu_to_fdt32(val);	/* big-endian image of val */
+	return fdt_property(fdt, name, &be_val, sizeof be_val);
+}
+static inline int fdt_property_u64(void *fdt, const char *name, uint64_t val)
+{
+	const fdt64_t be_val = cpu_to_fdt64(val);	/* big-endian image of val */
+	return fdt_property(fdt, name, &be_val, sizeof be_val);
+}
+
+#ifndef SWIG /* Not available in Python */
+static inline int fdt_property_cell(void *fdt, const char *name, uint32_t val)
+{
+	return fdt_property_u32(fdt, name, val);	/* alias: forwards unchanged to fdt_property_u32() */
+}
+#endif
+
+/**
+ * fdt_property_placeholder - add a new property and return a ptr to its value
+ *
+ * @fdt: pointer to the device tree blob
+ * @name: name of property to add
+ * @len: length of property value in bytes
+ * @valp: returns a pointer to where the value should be placed
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_NOSPACE, standard meanings
+ */
+int fdt_property_placeholder(void *fdt, const char *name, int len, void **valp);
+
+#define fdt_property_string(fdt, name, str) /* NUL included; str is evaluated twice */ \
+	fdt_property((fdt), (name), (str), strlen(str) + 1)
+int fdt_end_node(void *fdt);
+int fdt_finish(void *fdt);
+
+/**********************************************************************/
+/* Read-write functions                                               */
+/**********************************************************************/
+
+int fdt_create_empty_tree(void *buf, int bufsize);
+int fdt_open_into(const void *fdt, void *buf, int bufsize);
+int fdt_pack(void *fdt);
+
+/**
+ * fdt_add_mem_rsv - add one memory reserve map entry
+ * @fdt: pointer to the device tree blob
+ * @address, @size: 64-bit values (native endian)
+ *
+ * Adds a reserve map entry to the given blob reserving a region at
+ * address address of length size.
+ *
+ * This function will insert data into the reserve map and will
+ * therefore change the indexes of some entries in the table.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new reservation entry
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_add_mem_rsv(void *fdt, uint64_t address, uint64_t size);
+
+/**
+ * fdt_del_mem_rsv - remove a memory reserve map entry
+ * @fdt: pointer to the device tree blob
+ * @n: entry to remove
+ *
+ * fdt_del_mem_rsv() removes the n-th memory reserve map entry from
+ * the blob.
+ *
+ * This function will delete data from the reservation table and will
+ * therefore change the indexes of some entries in the table.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOTFOUND, there is no entry of the given index (i.e. there
+ *		are less than n+1 reserve map entries)
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_del_mem_rsv(void *fdt, int n);
+
+/**
+ * fdt_set_name - change the name of a given node
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: structure block offset of a node
+ * @name: name to give the node
+ *
+ * fdt_set_name() replaces the name (including unit address, if any)
+ * of the given node with the given string.  NOTE: this function can't
+ * efficiently check if the new name is unique amongst the given
+ * node's siblings; results are undefined if this function is invoked
+ * with a name equal to one of the given node's siblings.
+ *
+ * This function may insert or delete data from the blob, and will
+ * therefore change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob
+ *		to contain the new name
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE, standard meanings
+ */
+int fdt_set_name(void *fdt, int nodeoffset, const char *name);
+
+/**
+ * fdt_setprop - create or change a property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @val: pointer to data to set the property value to
+ * @len: length of the property value
+ *
+ * fdt_setprop() sets the value of the named property in the given
+ * node to the given value and length, creating the property if it
+ * does not already exist.
+ *
+ * This function may insert or delete data from the blob, and will
+ * therefore change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new property value
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_setprop(void *fdt, int nodeoffset, const char *name,
+		const void *val, int len);
+
+/**
+ * fdt_setprop_placeholder - allocate space for a property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @len: length of the property value
+ * @prop_data: return pointer to property data
+ *
+ * fdt_setprop_placeholder() allocates the named property in the given node.
+ * If the property exists it is resized. In either case a pointer to the
+ * property data is returned.
+ *
+ * This function may insert or delete data from the blob, and will
+ * therefore change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new property value
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_setprop_placeholder(void *fdt, int nodeoffset, const char *name,
+			    int len, void **prop_data);
+
+/**
+ * fdt_setprop_u32 - set a property to a 32-bit integer
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @val: 32-bit integer value for the property (native endian)
+ *
+ * fdt_setprop_u32() sets the value of the named property in the given
+ * node to the given 32-bit integer value (converting to big-endian if
+ * necessary), or creates a new property with that value if it does
+ * not already exist.
+ *
+ * This function may insert or delete data from the blob, and will
+ * therefore change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new property value
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+static inline int fdt_setprop_u32(void *fdt, int nodeoffset, const char *name,
+				  uint32_t val)
+{
+	const fdt32_t be_val = cpu_to_fdt32(val);	/* big-endian image of val */
+	return fdt_setprop(fdt, nodeoffset, name, &be_val, sizeof be_val);
+}
+
+/**
+ * fdt_setprop_u64 - set a property to a 64-bit integer
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @val: 64-bit integer value for the property (native endian)
+ *
+ * fdt_setprop_u64() sets the value of the named property in the given
+ * node to the given 64-bit integer value (converting to big-endian if
+ * necessary), or creates a new property with that value if it does
+ * not already exist.
+ *
+ * This function may insert or delete data from the blob, and will
+ * therefore change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new property value
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+static inline int fdt_setprop_u64(void *fdt, int nodeoffset, const char *name,
+				  uint64_t val)
+{
+	const fdt64_t be_val = cpu_to_fdt64(val);	/* big-endian image of val */
+	return fdt_setprop(fdt, nodeoffset, name, &be_val, sizeof be_val);
+}
+
+/**
+ * fdt_setprop_cell - set a property to a single cell value
+ *
+ * This is an alternative name for fdt_setprop_u32()
+ */
+static inline int fdt_setprop_cell(void *fdt, int nodeoffset, const char *name,
+				   uint32_t val)
+{
+	return fdt_setprop_u32(fdt, nodeoffset, name, val);	/* pure forwarder; result passed through unchanged */
+}
+
+/**
+ * fdt_setprop_string - set a property to a string value
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @str: string value for the property
+ *
+ * fdt_setprop_string() sets the value of the named property in the
+ * given node to the given string value (using the length of the
+ * string to determine the new length of the property), or creates a
+ * new property with that value if it does not already exist.
+ *
+ * This function may insert or delete data from the blob, and will
+ * therefore change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new property value
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+#define fdt_setprop_string(fdt, nodeoffset, name, str) \
+	fdt_setprop((fdt), (nodeoffset), (name), (str), strlen(str)+1)
+
+
+/**
+ * fdt_setprop_empty - set a property to an empty value
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ *
+ * fdt_setprop_empty() sets the value of the named property in the
+ * given node to an empty (zero length) value, or creates a new empty
+ * property if it does not already exist.
+ *
+ * This function may insert or delete data from the blob, and will
+ * therefore change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new property value
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+#define fdt_setprop_empty(fdt, nodeoffset, name) \
+	fdt_setprop((fdt), (nodeoffset), (name), NULL, 0)
+
+/**
+ * fdt_appendprop - append to or create a property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to append to
+ * @val: pointer to data to append to the property value
+ * @len: length of the data to append to the property value
+ *
+ * fdt_appendprop() appends the value to the named property in the
+ * given node, creating the property if it does not already exist.
+ *
+ * This function may insert data into the blob, and will therefore
+ * change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new property value
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_appendprop(void *fdt, int nodeoffset, const char *name,
+		   const void *val, int len);
+
+/**
+ * fdt_appendprop_u32 - append a 32-bit integer value to a property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @val: 32-bit integer value to append to the property (native endian)
+ *
+ * fdt_appendprop_u32() appends the given 32-bit integer value
+ * (converting to big-endian if necessary) to the value of the named
+ * property in the given node, or creates a new property with that
+ * value if it does not already exist.
+ *
+ * This function may insert data into the blob, and will therefore
+ * change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new property value
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+static inline int fdt_appendprop_u32(void *fdt, int nodeoffset,
+				     const char *name, uint32_t val)
+{
+	const fdt32_t be_val = cpu_to_fdt32(val);	/* big-endian image of val */
+	return fdt_appendprop(fdt, nodeoffset, name, &be_val, sizeof be_val);
+}
+
+/**
+ * fdt_appendprop_u64 - append a 64-bit integer value to a property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @val: 64-bit integer value to append to the property (native endian)
+ *
+ * fdt_appendprop_u64() appends the given 64-bit integer value
+ * (converting to big-endian if necessary) to the value of the named
+ * property in the given node, or creates a new property with that
+ * value if it does not already exist.
+ *
+ * This function may insert data into the blob, and will therefore
+ * change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new property value
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+static inline int fdt_appendprop_u64(void *fdt, int nodeoffset,
+				     const char *name, uint64_t val)
+{
+	const fdt64_t be_val = cpu_to_fdt64(val);	/* big-endian image of val */
+	return fdt_appendprop(fdt, nodeoffset, name, &be_val, sizeof be_val);
+}
+
+/**
+ * fdt_appendprop_cell - append a single cell value to a property
+ *
+ * This is an alternative name for fdt_appendprop_u32()
+ */
+static inline int fdt_appendprop_cell(void *fdt, int nodeoffset,
+				      const char *name, uint32_t val)
+{
+	return fdt_appendprop_u32(fdt, nodeoffset, name, val);	/* pure forwarder; result passed through unchanged */
+}
+
+/**
+ * fdt_appendprop_string - append a string to a property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to change
+ * @name: name of the property to change
+ * @str: string value to append to the property
+ *
+ * fdt_appendprop_string() appends the given string to the value of
+ * the named property in the given node, or creates a new property
+ * with that value if it does not already exist.
+ *
+ * This function may insert data into the blob, and will therefore
+ * change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain the new property value
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+#define fdt_appendprop_string(fdt, nodeoffset, name, str) \
+	fdt_appendprop((fdt), (nodeoffset), (name), (str), strlen(str)+1)
+
+/**
+ * fdt_appendprop_addrrange - append an address range property
+ * @fdt: pointer to the device tree blob
+ * @parent: offset of the parent node
+ * @nodeoffset: offset of the node to add a property at
+ * @name: name of property
+ * @addr: start address of a given range
+ * @size: size of a given range
+ *
+ * fdt_appendprop_addrrange() appends an address range value (start
+ * address and size) to the value of the named property in the given
+ * node, or creates a new property with that value if it does not
+ * already exist.
+ * If "name" is not specified, a default "reg" is used.
+ * Cell sizes are determined by parent's #address-cells and #size-cells.
+ *
+ * This function may insert data into the blob, and will therefore
+ * change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADNCELLS, if the node has a badly formatted or invalid
+ *		#address-cells property
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADVALUE, addr or size doesn't fit to respective cells size
+ *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
+ *		contain a new property
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_appendprop_addrrange(void *fdt, int parent, int nodeoffset,
+			     const char *name, uint64_t addr, uint64_t size);
+
+/**
+ * fdt_delprop - delete a property
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node whose property to delete
+ * @name: name of the property to delete
+ *
+ * fdt_delprop() will delete the given property.
+ *
+ * This function will delete data from the blob, and will therefore
+ * change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOTFOUND, node does not have the named property
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_delprop(void *fdt, int nodeoffset, const char *name);
+
+/**
+ * fdt_add_subnode_namelen - creates a new node based on substring
+ * @fdt: pointer to the device tree blob
+ * @parentoffset: structure block offset of a node
+ * @name: name of the subnode to create
+ * @namelen: number of characters of name to consider
+ *
+ * Identical to fdt_add_subnode(), but use only the first namelen
+ * characters of name as the name of the new node.  This is useful for
+ * creating subnodes based on a portion of a larger string, such as a
+ * full path.
+ */
+#ifndef SWIG /* Not available in Python */
+int fdt_add_subnode_namelen(void *fdt, int parentoffset,
+			    const char *name, int namelen);
+#endif
+
+/**
+ * fdt_add_subnode - creates a new node
+ * @fdt: pointer to the device tree blob
+ * @parentoffset: structure block offset of a node
+ * @name: name of the subnode to create
+ *
+ * fdt_add_subnode() creates a new node as a subnode of the node at
+ * structure block offset parentoffset, with the given name (which
+ * should include the unit address, if any).
+ *
+ * This function will insert data into the blob, and will therefore
+ * change the offsets of some existing nodes.
+ *
+ * returns:
+ *	structure block offset of the created subnode (>=0), on
+ *		success
+ *	-FDT_ERR_NOTFOUND, if the requested subnode does not exist
+ *	-FDT_ERR_BADOFFSET, if parentoffset did not point to an FDT_BEGIN_NODE
+ *		tag
+ *	-FDT_ERR_EXISTS, if the node at parentoffset already has a subnode of
+ *		the given name
+ *	-FDT_ERR_NOSPACE, if there is insufficient free space in the
+ *		blob to contain the new node
+ *	-FDT_ERR_NOSPACE
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings.
+ */
+int fdt_add_subnode(void *fdt, int parentoffset, const char *name);
+
+/**
+ * fdt_del_node - delete a node (subtree)
+ * @fdt: pointer to the device tree blob
+ * @nodeoffset: offset of the node to delete
+ *
+ * fdt_del_node() will remove the given node, including all its
+ * subnodes if any, from the blob.
+ *
+ * This function will delete data from the blob, and will therefore
+ * change the offsets of some existing nodes.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_del_node(void *fdt, int nodeoffset);
+
+/**
+ * fdt_overlay_apply - Applies a DT overlay on a base DT
+ * @fdt: pointer to the base device tree blob
+ * @fdto: pointer to the device tree overlay blob
+ *
+ * fdt_overlay_apply() will apply the given device tree overlay on the
+ * given base device tree.
+ *
+ * Expect the base device tree to be modified, even if the function
+ * returns an error.
+ *
+ * returns:
+ *	0, on success
+ *	-FDT_ERR_NOSPACE, there's not enough space in the base device tree
+ *	-FDT_ERR_NOTFOUND, the overlay points to some nonexistent nodes or
+ *		properties in the base DT
+ *	-FDT_ERR_BADPHANDLE,
+ *	-FDT_ERR_BADOVERLAY,
+ *	-FDT_ERR_NOPHANDLES,
+ *	-FDT_ERR_INTERNAL,
+ *	-FDT_ERR_BADLAYOUT,
+ *	-FDT_ERR_BADMAGIC,
+ *	-FDT_ERR_BADOFFSET,
+ *	-FDT_ERR_BADPATH,
+ *	-FDT_ERR_BADVERSION,
+ *	-FDT_ERR_BADSTRUCTURE,
+ *	-FDT_ERR_BADSTATE,
+ *	-FDT_ERR_TRUNCATED, standard meanings
+ */
+int fdt_overlay_apply(void *fdt, void *fdto);
+
+/**********************************************************************/
+/* Debugging / informational functions                                */
+/**********************************************************************/
+
+const char *fdt_strerror(int errval);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIBFDT_H */
diff --git a/fdt/libfdt_env.h b/fdt/libfdt_env.h
new file mode 100644
index 0000000000..2363810c17
--- /dev/null
+++ b/fdt/libfdt_env.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause) */
+#ifndef LIBFDT_ENV_H
+#define LIBFDT_ENV_H
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ * Copyright 2012 Kim Phillips, Freescale Semiconductor.
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+/* INT_MAX and UINT_MAX belong to <limits.h>; use the C library's definitions. */
+#include <limits.h>
+
+#ifndef __CHECKER__
+#define FDT_FORCE
+#define FDT_BITWISE
+#else
+#define FDT_FORCE __attribute__((force))
+#define FDT_BITWISE __attribute__((bitwise))
+#endif
+
+typedef uint16_t FDT_BITWISE fdt16_t;
+typedef uint32_t FDT_BITWISE fdt32_t;
+typedef uint64_t FDT_BITWISE fdt64_t;
+
+/* Byte n of x (memory order), shifted into its slot of a w-byte big-endian value */
+#define FDT_PACK_BYTE(x, n, w) \
+	((unsigned long long)((uint8_t *)&(x))[n] << (8 * ((w) - 1 - (n))))
+#define FDT_PACK_BE16(x) (FDT_PACK_BYTE(x, 0, 2) | FDT_PACK_BYTE(x, 1, 2))
+#define FDT_PACK_BE32(x) (FDT_PACK_BYTE(x, 0, 4) | FDT_PACK_BYTE(x, 1, 4) | \
+			  FDT_PACK_BYTE(x, 2, 4) | FDT_PACK_BYTE(x, 3, 4))
+#define FDT_PACK_BE64(x) (FDT_PACK_BYTE(x, 0, 8) | FDT_PACK_BYTE(x, 1, 8) | FDT_PACK_BYTE(x, 2, 8) | FDT_PACK_BYTE(x, 3, 8) | \
+			  FDT_PACK_BYTE(x, 4, 8) | FDT_PACK_BYTE(x, 5, 8) | FDT_PACK_BYTE(x, 6, 8) | FDT_PACK_BYTE(x, 7, 8))
+
+static inline uint16_t fdt16_to_cpu(fdt16_t x)
+{	/* the same byte-wise packing serves both conversion directions */
+	return (FDT_FORCE uint16_t)FDT_PACK_BE16(x);
+}
+static inline fdt16_t cpu_to_fdt16(uint16_t x)
+{
+	return (FDT_FORCE fdt16_t)FDT_PACK_BE16(x);
+}
+
+static inline uint32_t fdt32_to_cpu(fdt32_t x)
+{
+	return (FDT_FORCE uint32_t)FDT_PACK_BE32(x);
+}
+static inline fdt32_t cpu_to_fdt32(uint32_t x)
+{
+	return (FDT_FORCE fdt32_t)FDT_PACK_BE32(x);
+}
+
+static inline uint64_t fdt64_to_cpu(fdt64_t x)
+{
+	return (FDT_FORCE uint64_t)FDT_PACK_BE64(x);
+}
+static inline fdt64_t cpu_to_fdt64(uint64_t x)
+{
+	return (FDT_FORCE fdt64_t)FDT_PACK_BE64(x);
+}
+#undef FDT_PACK_BE64
+#undef FDT_PACK_BE32
+#undef FDT_PACK_BE16
+#undef FDT_PACK_BYTE
+
+#ifdef __APPLE__
+#include <AvailabilityMacros.h>
+
+/* strnlen() is not available on Mac OS < 10.7 */
+# if !defined(MAC_OS_X_VERSION_10_7) || (MAC_OS_X_VERSION_MAX_ALLOWED < \
+                                         MAC_OS_X_VERSION_10_7)
+
+#define strnlen fdt_strnlen
+
+/*
+ * fdt_strnlen: returns the length of a string or max_count - whichever is
+ * smaller.
+ * Input 1 string: the string whose size is to be determined
+ * Input 2 max_count: the maximum value returned by this function
+ * Output: length of the string or max_count (the smaller of the two)
+ */
+static inline size_t fdt_strnlen(const char *string, size_t max_count)
+{
+    const char *p = (const char *)memchr(string, 0, max_count); /* explicit cast: needed when this header is compiled as C++ */
+    return p ? (size_t)(p - string) : max_count; /* no NUL within max_count bytes: report max_count */
+}
+
+#endif /* !defined(MAC_OS_X_VERSION_10_7) || (MAC_OS_X_VERSION_MAX_ALLOWED <
+          MAC_OS_X_VERSION_10_7) */
+
+#endif /* __APPLE__ */
+
+#endif /* LIBFDT_ENV_H */
diff --git a/fdt/libfdt_internal.h b/fdt/libfdt_internal.h
new file mode 100644
index 0000000000..741eeb3150
--- /dev/null
+++ b/fdt/libfdt_internal.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause) */
+#ifndef LIBFDT_INTERNAL_H
+#define LIBFDT_INTERNAL_H
+/*
+ * libfdt - Flat Device Tree manipulation
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ */
+#include <fdt.h>
+
+#define FDT_ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
+#define FDT_TAGALIGN(x)		(FDT_ALIGN((x), FDT_TAGSIZE))
+
+int fdt_ro_probe_(const void *fdt); /* a negative result makes FDT_RO_PROBE return it from the enclosing function */
+#define FDT_RO_PROBE(fdt)					\
+	{							\
+		int totalsize_;					\
+		if ((totalsize_ = fdt_ro_probe_(fdt)) < 0)	\
+			return totalsize_;			\
+	}
+
+int fdt_check_node_offset_(const void *fdt, int offset);
+int fdt_check_prop_offset_(const void *fdt, int offset);
+const char *fdt_find_string_(const char *strtab, int tabsize, const char *s);
+int fdt_node_end_offset_(void *fdt, int nodeoffset);
+
+static inline const void *fdt_offset_ptr_(const void *fdt, int offset)
+{
+	return (const char *)fdt + fdt_off_dt_struct(fdt) + offset;	/* blob base + fdt_off_dt_struct(fdt) + offset; no range check here */
+}
+
+static inline void *fdt_offset_ptr_w_(void *fdt, int offset)
+{
+	return (void *)(uintptr_t)fdt_offset_ptr_(fdt, offset);	/* writable twin of fdt_offset_ptr_(): same address, const removed */
+}
+
+static inline const struct fdt_reserve_entry *fdt_mem_rsv_(const void *fdt, int n)
+{
+	/* The table is located fdt_off_mem_rsvmap(fdt) bytes past the blob start. */
+	const char *table_start = (const char *)fdt + fdt_off_mem_rsvmap(fdt);
+	const struct fdt_reserve_entry *entries =
+		(const struct fdt_reserve_entry *)table_start;
+	return &entries[n];
+}
+static inline struct fdt_reserve_entry *fdt_mem_rsv_w_(void *fdt, int n)
+{
+	return (void *)(uintptr_t)fdt_mem_rsv_(fdt, n);	/* writable twin of fdt_mem_rsv_(): same entry, const removed */
+}
+
+#define FDT_SW_MAGIC		(~FDT_MAGIC)
+
+#endif /* LIBFDT_INTERNAL_H */
diff --git a/fesvr/context.cc b/fesvr/context.cc
new file mode 100644
index 0000000000..ca73813768
--- /dev/null
+++ b/fesvr/context.cc
@@ -0,0 +1,115 @@
+#include "context.h"
+#include <assert.h>
+#include <sched.h>
+#include <stdlib.h>
+
+static __thread context_t* cur; // per-thread current context; stays NULL until current() creates it
+
+context_t::context_t() // starts unbound: no creator, no entry function, no argument
+  : creator(nullptr), func(nullptr), arg(nullptr),
+#ifdef USE_UCONTEXT
+    context(new ucontext_t)
+#else
+    mutex(PTHREAD_MUTEX_INITIALIZER),
+    cond(PTHREAD_COND_INITIALIZER), flag(0)
+#endif
+{
+}
+
+#ifdef USE_UCONTEXT
+#ifdef GLIBC_64BIT_PTR_BUG
+void context_t::wrapper(unsigned int hi, unsigned int lo)
+{
+  // init() passed `this` as two 32-bit halves; put the pointer back together.
+  context_t* ctx = reinterpret_cast<context_t*>((static_cast<unsigned long>(hi) << 32) | static_cast<unsigned long>(lo));
+#else
+void context_t::wrapper(context_t* ctx)
+{
+#endif
+  ctx->creator->switch_to();
+  ctx->func(ctx->arg);
+}
+#else
+void* context_t::wrapper(void* a)
+{
+  context_t* self = static_cast<context_t*>(a);
+  cur = self;
+  self->creator->switch_to();
+  self->func(self->arg);
+  return NULL;
+}
+#endif
+
+void context_t::init(void (*f)(void*), void* a)
+{
+  // Remember who is spawning us and what to run; wrapper() calls func(arg) later.
+  creator = current();
+  func = f;
+  arg = a;
+#ifdef USE_UCONTEXT
+  getcontext(context.get());
+  context->uc_stack.ss_size = 64*1024;
+  context->uc_stack.ss_sp = new void*[context->uc_stack.ss_size/sizeof(void*)];
+  context->uc_link = creator->context.get();
+#ifdef GLIBC_64BIT_PTR_BUG
+  // Pass `this` as two 32-bit halves; wrapper(hi, lo) reassembles the pointer.
+  const unsigned long self = reinterpret_cast<unsigned long>(this);
+  makecontext(context.get(), (void(*)(void))&context_t::wrapper, 2, static_cast<unsigned int>(self >> 32), static_cast<unsigned int>(self));
+#else
+  makecontext(context.get(), (void(*)(void))&context_t::wrapper, 1, this);
+#endif
+  switch_to(); // first run: wrapper() immediately switches back to creator
+#else
+  assert(flag == 0);
+  pthread_mutex_lock(&creator->mutex);
+  creator->flag = 0;
+  const int rc = pthread_create(&thread, NULL, &context_t::wrapper, this);
+  if (rc != 0)
+    abort();
+  pthread_detach(thread);
+  while (creator->flag == 0)
+    pthread_cond_wait(&creator->cond, &creator->mutex);
+  pthread_mutex_unlock(&creator->mutex);
+#endif
+}
+
+context_t::~context_t()
+{
+  assert(cur != this); // must not destroy the calling thread's current context
+}
+
+void context_t::switch_to() // Suspend the calling context and let *this run.
+{
+  assert(this != cur); // switching to the context that is already current is a caller bug
+#ifdef USE_UCONTEXT
+  context_t* prev = cur;
+  cur = this; // update bookkeeping first: swapcontext() only returns once prev is resumed
+  if (swapcontext(prev->context.get(), context.get()) != 0)
+    abort();
+#else
+  cur->flag = 0; // the caller gives up its turn ...
+  this->flag = 1; // ... and hands it to the target
+  pthread_mutex_lock(&this->mutex);
+  pthread_cond_signal(&this->cond); // wake the target, which waits on its own condition variable
+  pthread_mutex_unlock(&this->mutex);
+  pthread_mutex_lock(&cur->mutex);
+  while (!cur->flag) // block until some other context switches back to the caller
+    pthread_cond_wait(&cur->cond, &cur->mutex);
+  pthread_mutex_unlock(&cur->mutex);
+#endif
+}
+
+context_t* context_t::current()
+{
+  // Already set up for this thread: hand back the existing object.
+  if (cur != NULL)
+    return cur;
+  cur = new context_t;
+#ifdef USE_UCONTEXT
+  getcontext(cur->context.get());
+#else
+  cur->flag = 1;
+  cur->thread = pthread_self();
+#endif
+  return cur;
+}
diff --git a/fesvr/context.h b/fesvr/context.h
new file mode 100644
index 0000000000..18bf50ef8d
--- /dev/null
+++ b/fesvr/context.h
@@ -0,0 +1,54 @@
+#ifndef _HTIF_CONTEXT_H
+#define _HTIF_CONTEXT_H
+
+// A replacement for ucontext.h, which is sadly deprecated.
+
+#include <pthread.h>
+
+#if defined(__GLIBC__) // with glibc, context_t is built on <ucontext.h>; otherwise its pthread members are used
+# undef USE_UCONTEXT
+# define USE_UCONTEXT
+# include <ucontext.h>
+# include <memory>
+#include <limits.h>
+
+#if (ULONG_MAX > UINT_MAX) // 64-bit systems only
+#if (100*GLIB_MAJOR_VERSION+GLIB_MINOR_VERSION < 208) // NOTE(review): glibc's own version macros are __GLIBC__/__GLIBC_MINOR__; if GLIB_* are undefined here they evaluate to 0 and this test is always true -- confirm intent
+#define GLIBC_64BIT_PTR_BUG
+static_assert (sizeof(unsigned int)  == 4, "uint size doesn't match expected 32bit"); // context_t::init splits `this` into two unsigned ints under this macro, which relies on these exact widths
+static_assert (sizeof(unsigned long) == 8, "ulong size doesn't match expected 64bit");
+static_assert (sizeof(void*)         == 8, "ptr size doesn't match expected 64bit");
+#endif
+#endif /* ULONG_MAX > UINT_MAX */
+
+#endif
+
+class context_t  // A switchable execution context, implemented with ucontext (USE_UCONTEXT) or with one pthread per context.
+{
+ public:
+  context_t();
+  ~context_t();  // must not run on the currently executing context (asserted in the definition)
+  void init(void (*func)(void*), void* arg);  // bind func(arg) as the entry point and start the context up
+  void switch_to();  // hand control from the running context to this one
+  static context_t* current();  // the running context; created on first use
+ private:
+  context_t* creator;  // context that called init() on this one
+  void (*func)(void*);  // entry point supplied to init()
+  void* arg;  // argument passed to func
+#ifdef USE_UCONTEXT
+  std::unique_ptr<ucontext_t> context;  // saved machine context used by getcontext/makecontext/swapcontext
+#ifndef GLIBC_64BIT_PTR_BUG
+  static void wrapper(context_t*);  // entry trampoline registered with makecontext()
+#else
+  static void wrapper(unsigned int, unsigned int);  // same trampoline, receiving the context_t pointer as (hi, lo) 32-bit halves
+#endif
+#else
+  pthread_t thread;  // thread backing this context
+  pthread_mutex_t mutex;  // used together with cond
+  pthread_cond_t cond;  // signalled by switch_to() when this context is made runnable
+  volatile int flag;  // nonzero while this context may run; waiters loop on it
+  static void* wrapper(void*);  // thread start routine passed to pthread_create()
+#endif
+};
+
+#endif
diff --git a/fesvr/debug_defines.h b/fesvr/debug_defines.h
new file mode 100644
index 0000000000..e5f9291058
--- /dev/null
+++ b/fesvr/debug_defines.h
@@ -0,0 +1,1418 @@
+#define DTM_IDCODE                          0x01
+/*
+* Identifies the release version of this part.
+ */
+#define DTM_IDCODE_VERSION_OFFSET           28
+#define DTM_IDCODE_VERSION_LENGTH           4
+#define DTM_IDCODE_VERSION                  (0xf << DTM_IDCODE_VERSION_OFFSET)
+/*
+* Identifies the designer's part number of this part.
+ */
+#define DTM_IDCODE_PARTNUMBER_OFFSET        12
+#define DTM_IDCODE_PARTNUMBER_LENGTH        16
+#define DTM_IDCODE_PARTNUMBER               (0xffff << DTM_IDCODE_PARTNUMBER_OFFSET)
+/*
+* Identifies the designer/manufacturer of this part. Bits 6:0 must be
+* bits 6:0 of the designer/manufacturer's Identification Code as
+* assigned by JEDEC Standard JEP106. Bits 10:7 contain the modulo-16
+* count of the number of continuation characters (0x7f) in that same
+* Identification Code.
+ */
+#define DTM_IDCODE_MANUFID_OFFSET           1
+#define DTM_IDCODE_MANUFID_LENGTH           11
+#define DTM_IDCODE_MANUFID                  (0x7ff << DTM_IDCODE_MANUFID_OFFSET)
+#define DTM_IDCODE_1_OFFSET                 0
+#define DTM_IDCODE_1_LENGTH                 1
+#define DTM_IDCODE_1                        (0x1 << DTM_IDCODE_1_OFFSET)
+#define DTM_DTMCS                           0x10
+/*
+* Writing 1 to this bit does a hard reset of the DTM,
+* causing the DTM to forget about any outstanding DMI transactions.
+* In general this should only be used when the Debugger has
+* reason to expect that the outstanding DMI transaction will never
+* complete (e.g. a reset condition caused an inflight DMI transaction to
+* be cancelled).
+ */
+#define DTM_DTMCS_DMIHARDRESET_OFFSET       17
+#define DTM_DTMCS_DMIHARDRESET_LENGTH       1
+#define DTM_DTMCS_DMIHARDRESET              (0x1 << DTM_DTMCS_DMIHARDRESET_OFFSET)
+/*
+* Writing 1 to this bit clears the sticky error state
+* and allows the DTM to retry or complete the previous
+* transaction.
+ */
+#define DTM_DTMCS_DMIRESET_OFFSET           16
+#define DTM_DTMCS_DMIRESET_LENGTH           1
+#define DTM_DTMCS_DMIRESET                  (0x1 << DTM_DTMCS_DMIRESET_OFFSET)
+/*
+* This is a hint to the debugger of the minimum number of
+* cycles a debugger should spend in
+* Run-Test/Idle after every DMI scan to avoid a `busy'
+* return code (\Fdmistat of 3). A debugger must still
+* check \Fdmistat when necessary.
+*
+* 0: It is not necessary to enter Run-Test/Idle at all.
+*
+* 1: Enter Run-Test/Idle and leave it immediately.
+*
+* 2: Enter Run-Test/Idle and stay there for 1 cycle before leaving.
+*
+* And so on.
+ */
+#define DTM_DTMCS_IDLE_OFFSET               12
+#define DTM_DTMCS_IDLE_LENGTH               3
+#define DTM_DTMCS_IDLE                      (0x7 << DTM_DTMCS_IDLE_OFFSET)
+/*
+* 0: No error.
+*
+* 1: Reserved. Interpret the same as 2.
+*
+* 2: An operation failed (resulted in \Fop of 2).
+*
+* 3: An operation was attempted while a DMI access was still in
+* progress (resulted in \Fop of 3).
+ */
+#define DTM_DTMCS_DMISTAT_OFFSET            10
+#define DTM_DTMCS_DMISTAT_LENGTH            2
+#define DTM_DTMCS_DMISTAT                   (0x3 << DTM_DTMCS_DMISTAT_OFFSET)
+/*
+* The size of \Faddress in \Rdmi.
+ */
+#define DTM_DTMCS_ABITS_OFFSET              4
+#define DTM_DTMCS_ABITS_LENGTH              6
+#define DTM_DTMCS_ABITS                     (0x3f << DTM_DTMCS_ABITS_OFFSET)
+/*
+* 0: Version described in spec version 0.11.
+*
+* 1: Version described in spec version 0.13 (and later?), which
+* reduces the DMI data width to 32 bits.
+*
+* Other values are reserved for future use.
+ */
+#define DTM_DTMCS_VERSION_OFFSET            0
+#define DTM_DTMCS_VERSION_LENGTH            4
+#define DTM_DTMCS_VERSION                   (0xf << DTM_DTMCS_VERSION_OFFSET)
+#define DTM_DMI                             0x11
+/*
+* Address used for DMI access. In Update-DR this value is used
+* to access the DM over the DMI.
+ */
+#define DTM_DMI_ADDRESS_OFFSET              34
+#define DTM_DMI_ADDRESS_LENGTH              abits
+#define DTM_DMI_ADDRESS                     (((1L<<abits)-1) << DTM_DMI_ADDRESS_OFFSET)
+/*
+* The data to send to the DM over the DMI during Update-DR, and
+* the data returned from the DM as a result of the previous operation.
+ */
+#define DTM_DMI_DATA_OFFSET                 2
+#define DTM_DMI_DATA_LENGTH                 32
+#define DTM_DMI_DATA                        (0xffffffffL << DTM_DMI_DATA_OFFSET)
+/*
+* When the debugger writes this field, it has the following meaning:
+*
+* 0: Ignore \Fdata and \Faddress. (nop)
+*
+* Don't send anything over the DMI during Update-DR.
+* This operation should never result in a busy or error response.
+* The address and data reported in the following Capture-DR
+* are undefined.
+*
+* 1: Read from \Faddress. (read)
+*
+* 2: Write \Fdata to \Faddress. (write)
+*
+* 3: Reserved.
+*
+* When the debugger reads this field, it means the following:
+*
+* 0: The previous operation completed successfully.
+*
+* 1: Reserved.
+*
+* 2: A previous operation failed.  The data scanned into \Rdmi in
+* this access will be ignored.  This status is sticky and can be
+* cleared by writing \Fdmireset in \Rdtmcs.
+*
+* This indicates that the DM itself responded with an error, e.g.
+* in the System Bus and Serial Port overflow/underflow cases.
+*
+* 3: An operation was attempted while a DMI request is still in
+* progress. The data scanned into \Rdmi in this access will be
+* ignored. This status is sticky and can be cleared by writing
+* \Fdmireset in \Rdtmcs. If a debugger sees this status, it
+* needs to give the target more TCK edges between Update-DR and
+* Capture-DR. The simplest way to do that is to add extra transitions
+* in Run-Test/Idle.
+*
+* (The DTM, DM, and/or component may be in different clock domains,
+* so synchronization may be required. Some relatively fixed number of
+* TCK ticks may be needed for the request to reach the DM, complete,
+* and for the response to be synchronized back into the TCK domain.)
+ */
+#define DTM_DMI_OP_OFFSET                   0
+#define DTM_DMI_OP_LENGTH                   2
+#define DTM_DMI_OP                          (0x3L << DTM_DMI_OP_OFFSET)
+#define CSR_DCSR                            0x7b0
+/*
+* 0: There is no external debug support.
+*
+* 4: External debug support exists as it is described in this document.
+ */
+#define CSR_DCSR_XDEBUGVER_OFFSET           28
+#define CSR_DCSR_XDEBUGVER_LENGTH           4
+#define CSR_DCSR_XDEBUGVER                  (0xf << CSR_DCSR_XDEBUGVER_OFFSET)
+/*
+* When 1, {\tt ebreak} instructions in Machine Mode enter Debug Mode.
+ */
+#define CSR_DCSR_EBREAKM_OFFSET             15
+#define CSR_DCSR_EBREAKM_LENGTH             1
+#define CSR_DCSR_EBREAKM                    (0x1 << CSR_DCSR_EBREAKM_OFFSET)
+/*
+* When 1, {\tt ebreak} instructions in Hypervisor Mode enter Debug Mode.
+ */
+#define CSR_DCSR_EBREAKH_OFFSET             14
+#define CSR_DCSR_EBREAKH_LENGTH             1
+#define CSR_DCSR_EBREAKH                    (0x1 << CSR_DCSR_EBREAKH_OFFSET)
+/*
+* When 1, {\tt ebreak} instructions in Supervisor Mode enter Debug Mode.
+ */
+#define CSR_DCSR_EBREAKS_OFFSET             13
+#define CSR_DCSR_EBREAKS_LENGTH             1
+#define CSR_DCSR_EBREAKS                    (0x1 << CSR_DCSR_EBREAKS_OFFSET)
+/*
+* When 1, {\tt ebreak} instructions in User/Application Mode enter
+* Debug Mode.
+ */
+#define CSR_DCSR_EBREAKU_OFFSET             12
+#define CSR_DCSR_EBREAKU_LENGTH             1
+#define CSR_DCSR_EBREAKU                    (0x1 << CSR_DCSR_EBREAKU_OFFSET)
+/*
+* 0: Increment counters as usual.
+*
+* 1: Don't increment any counters while in Debug Mode.  This includes
+* the {\tt cycle} and {\tt instret} CSRs. This is preferred for most
+* debugging scenarios.
+*
+* An implementation may choose not to support writing to this bit.
+* The debugger must read back the value it writes to check whether
+* the feature is supported.
+ */
+#define CSR_DCSR_STOPCOUNT_OFFSET           10
+#define CSR_DCSR_STOPCOUNT_LENGTH           1
+#define CSR_DCSR_STOPCOUNT                  (0x1 << CSR_DCSR_STOPCOUNT_OFFSET)
+/*
+* 0: Increment timers as usual.
+*
+* 1: Don't increment any hart-local timers while in Debug Mode.
+*
+* An implementation may choose not to support writing to this bit.
+* The debugger must read back the value it writes to check whether
+* the feature is supported.
+ */
+#define CSR_DCSR_STOPTIME_OFFSET            9
+#define CSR_DCSR_STOPTIME_LENGTH            1
+#define CSR_DCSR_STOPTIME                   (0x1 << CSR_DCSR_STOPTIME_OFFSET)
+/*
+* Explains why Debug Mode was entered.
+*
+* When there are multiple reasons to enter Debug Mode in a single
+* cycle, the cause with the highest priority is the one written.
+*
+* 1: An {\tt ebreak} instruction was executed. (priority 3)
+*
+* 2: The Trigger Module caused a halt. (priority 4)
+*
+* 3: \Fhaltreq was set. (priority 2)
+*
+* 4: The hart single stepped because \Fstep was set. (priority 1)
+*
+* Other values are reserved for future use.
+ */
+#define CSR_DCSR_CAUSE_OFFSET               6
+#define CSR_DCSR_CAUSE_LENGTH               3
+#define CSR_DCSR_CAUSE                      (0x7 << CSR_DCSR_CAUSE_OFFSET)
+/*
+* When set and not in Debug Mode, the hart will only execute a single
+* instruction and then enter Debug Mode.
+* Interrupts are disabled when this bit is set.
+* If the instruction does not complete due to an exception,
+* the hart will immediately enter Debug Mode before executing
+* the trap handler, with appropriate exception registers set.
+ */
+#define CSR_DCSR_STEP_OFFSET                2
+#define CSR_DCSR_STEP_LENGTH                1
+#define CSR_DCSR_STEP                       (0x1 << CSR_DCSR_STEP_OFFSET)
+/*
+* Contains the privilege level the hart was operating in when Debug
+* Mode was entered. The encoding is described in Table
+* \ref{tab:privlevel}.  A debugger can change this value to change
+* the hart's privilege level when exiting Debug Mode.
+*
+* Not all privilege levels are supported on all harts. If the
+* encoding written is not supported or the debugger is not allowed to
+* change to it, the hart may change to any supported privilege level.
+ */
+#define CSR_DCSR_PRV_OFFSET                 0
+#define CSR_DCSR_PRV_LENGTH                 2
+#define CSR_DCSR_PRV                        (0x3 << CSR_DCSR_PRV_OFFSET)
+#define CSR_DPC                             0x7b1
+#define CSR_DPC_DPC_OFFSET                  0
+#define CSR_DPC_DPC_LENGTH                  XLEN
+#define CSR_DPC_DPC                         (((1L<<XLEN)-1) << CSR_DPC_DPC_OFFSET)
+#define CSR_DSCRATCH0                       0x7b2
+#define CSR_DSCRATCH1                       0x7b3
+#define CSR_TSELECT                         0x7a0
+#define CSR_TSELECT_INDEX_OFFSET            0
+#define CSR_TSELECT_INDEX_LENGTH            XLEN
+#define CSR_TSELECT_INDEX                   (((1L<<XLEN)-1) << CSR_TSELECT_INDEX_OFFSET)
+#define CSR_TDATA1                          0x7a1
+/*
+* 0: There is no trigger at this \Rtselect.
+*
+* 1: The trigger is a legacy SiFive address match trigger. These
+* should not be implemented and aren't further documented here.
+*
+* 2: The trigger is an address/data match trigger. The remaining bits
+* in this register act as described in \Rmcontrol.
+*
+* 3: The trigger is an instruction count trigger. The remaining bits
+* in this register act as described in \Ricount.
+*
+* 15: This trigger exists (so enumeration shouldn't terminate), but
+* is not currently available.
+*
+* Other values are reserved for future use.
+ */
+#define CSR_TDATA1_TYPE_OFFSET              XLEN-4
+#define CSR_TDATA1_TYPE_LENGTH              4
+#define CSR_TDATA1_TYPE                     (0xfL << CSR_TDATA1_TYPE_OFFSET)
+/*
+* 0: Both Debug and M Mode can write the {\tt tdata} registers at the
+* selected \Rtselect.
+*
+* 1: Only Debug Mode can write the {\tt tdata} registers at the
+* selected \Rtselect.  Writes from other modes are ignored.
+*
+* This bit is only writable from Debug Mode.
+ */
+#define CSR_TDATA1_HMODE_OFFSET             XLEN-5
+#define CSR_TDATA1_HMODE_LENGTH             1
+#define CSR_TDATA1_HMODE                    (0x1L << CSR_TDATA1_HMODE_OFFSET)
+/*
+* Trigger-specific data.
+ */
+#define CSR_TDATA1_DATA_OFFSET              0
+#define CSR_TDATA1_DATA_LENGTH              XLEN - 5
+#define CSR_TDATA1_DATA                     (((1L<<XLEN - 5)-1) << CSR_TDATA1_DATA_OFFSET)
+#define CSR_TDATA2                          0x7a2
+#define CSR_TDATA2_DATA_OFFSET              0
+#define CSR_TDATA2_DATA_LENGTH              XLEN
+#define CSR_TDATA2_DATA                     (((1L<<XLEN)-1) << CSR_TDATA2_DATA_OFFSET)
+#define CSR_TDATA3                          0x7a3
+#define CSR_TDATA3_DATA_OFFSET              0
+#define CSR_TDATA3_DATA_LENGTH              XLEN
+#define CSR_TDATA3_DATA                     (((1L<<XLEN)-1) << CSR_TDATA3_DATA_OFFSET)
+#define CSR_MCONTROL                        0x7a1
+#define CSR_MCONTROL_TYPE_OFFSET            XLEN-4
+#define CSR_MCONTROL_TYPE_LENGTH            4
+#define CSR_MCONTROL_TYPE                   (0xfL << CSR_MCONTROL_TYPE_OFFSET)
+#define CSR_MCONTROL_DMODE_OFFSET           XLEN-5
+#define CSR_MCONTROL_DMODE_LENGTH           1
+#define CSR_MCONTROL_DMODE                  (0x1L << CSR_MCONTROL_DMODE_OFFSET)
+/*
+* Specifies the largest naturally aligned powers-of-two (NAPOT) range
+* supported by the hardware. The value is the logarithm base 2 of the
+* number of bytes in that range.  A value of 0 indicates that only
+* exact value matches are supported (one byte range). A value of 63
+* corresponds to the maximum NAPOT range, which is $2^{63}$ bytes in
+* size.
+ */
+#define CSR_MCONTROL_MASKMAX_OFFSET         XLEN-11
+#define CSR_MCONTROL_MASKMAX_LENGTH         6
+#define CSR_MCONTROL_MASKMAX                (0x3fL << CSR_MCONTROL_MASKMAX_OFFSET)
+/*
+* 0: Perform a match on the virtual address.
+*
+* 1: Perform a match on the data value loaded/stored, or the
+* instruction executed.
+ */
+#define CSR_MCONTROL_SELECT_OFFSET          19
+#define CSR_MCONTROL_SELECT_LENGTH          1
+#define CSR_MCONTROL_SELECT                 (0x1L << CSR_MCONTROL_SELECT_OFFSET)
+/*
+* 0: The action for this trigger will be taken just before the
+* instruction that triggered it is executed, but after all preceding
+* instructions are committed.
+*
+* 1: The action for this trigger will be taken after the instruction
+* that triggered it is executed. It should be taken before the next
+* instruction is executed, but it is better to implement triggers and
+* not implement that suggestion than to not implement them at all.
+*
+* Most hardware will only implement one timing or the other, possibly
+* dependent on \Fselect, \Fexecute, \Fload, and \Fstore. This bit
+* primarily exists for the hardware to communicate to the debugger
+* what will happen. Hardware may implement the bit fully writable, in
+* which case the debugger has a little more control.
+*
+* Data load triggers with \Ftiming of 0 will result in the same load
+* happening again when the debugger lets the core run. For data load
+* triggers, debuggers must first attempt to set the breakpoint with
+* \Ftiming of 1.
+*
+* A chain of triggers that don't all have the same \Ftiming value
+* will never fire (unless consecutive instructions match the
+* appropriate triggers).
+ */
+#define CSR_MCONTROL_TIMING_OFFSET          18
+#define CSR_MCONTROL_TIMING_LENGTH          1
+#define CSR_MCONTROL_TIMING                 (0x1L << CSR_MCONTROL_TIMING_OFFSET)
+/*
+* Determines what happens when this trigger matches.
+*
+* 0: Raise a breakpoint exception. (Used when software wants to use
+* the trigger module without an external debugger attached.)
+*
+* 1: Enter Debug Mode. (Only supported when \Fhmode is 1.)
+*
+* 2: Start tracing.
+*
+* 3: Stop tracing.
+*
+* 4: Emit trace data for this match. If it is a data access match,
+* emit appropriate Load/Store Address/Data. If it is an instruction
+* execution, emit its PC.
+*
+* Other values are reserved for future use.
+ */
+#define CSR_MCONTROL_ACTION_OFFSET          12
+#define CSR_MCONTROL_ACTION_LENGTH          6
+#define CSR_MCONTROL_ACTION                 (0x3fL << CSR_MCONTROL_ACTION_OFFSET)
+/*
+* 0: When this trigger matches, the configured action is taken.
+*
+* 1: While this trigger does not match, it prevents the trigger with
+* the next index from matching.
+ */
+#define CSR_MCONTROL_CHAIN_OFFSET           11
+#define CSR_MCONTROL_CHAIN_LENGTH           1
+#define CSR_MCONTROL_CHAIN                  (0x1L << CSR_MCONTROL_CHAIN_OFFSET)
+/*
+* 0: Matches when the value equals \Rtdatatwo.
+*
+* 1: Matches when the top M bits of the value match the top M bits of
+* \Rtdatatwo. M is XLEN-1 minus the index of the least-significant
+* bit containing 0 in \Rtdatatwo.
+*
+* 2: Matches when the value is greater than or equal to \Rtdatatwo.
+*
+* 3: Matches when the value is less than \Rtdatatwo.
+*
+* 4: Matches when the lower half of the value equals the lower half
+* of \Rtdatatwo after the lower half of the value is ANDed with the
+* upper half of \Rtdatatwo.
+*
+* 5: Matches when the upper half of the value equals the lower half
+* of \Rtdatatwo after the upper half of the value is ANDed with the
+* upper half of \Rtdatatwo.
+*
+* Other values are reserved for future use.
+ */
+#define CSR_MCONTROL_MATCH_OFFSET           7
+#define CSR_MCONTROL_MATCH_LENGTH           4
+#define CSR_MCONTROL_MATCH                  (0xfL << CSR_MCONTROL_MATCH_OFFSET)
+/*
+* When set, enable this trigger in M mode.
+ */
+#define CSR_MCONTROL_M_OFFSET               6
+#define CSR_MCONTROL_M_LENGTH               1
+#define CSR_MCONTROL_M                      (0x1L << CSR_MCONTROL_M_OFFSET)
+/*
+* When set, enable this trigger in H mode.
+ */
+#define CSR_MCONTROL_H_OFFSET               5
+#define CSR_MCONTROL_H_LENGTH               1
+#define CSR_MCONTROL_H                      (0x1L << CSR_MCONTROL_H_OFFSET)
+/*
+* When set, enable this trigger in S mode.
+ */
+#define CSR_MCONTROL_S_OFFSET               4
+#define CSR_MCONTROL_S_LENGTH               1
+#define CSR_MCONTROL_S                      (0x1L << CSR_MCONTROL_S_OFFSET)
+/*
+* When set, enable this trigger in U mode.
+ */
+#define CSR_MCONTROL_U_OFFSET               3
+#define CSR_MCONTROL_U_LENGTH               1
+#define CSR_MCONTROL_U                      (0x1L << CSR_MCONTROL_U_OFFSET)
+/*
+* When set, the trigger fires on the virtual address or opcode of an
+* instruction that is executed.
+ */
+#define CSR_MCONTROL_EXECUTE_OFFSET         2
+#define CSR_MCONTROL_EXECUTE_LENGTH         1
+#define CSR_MCONTROL_EXECUTE                (0x1L << CSR_MCONTROL_EXECUTE_OFFSET)
+/*
+* When set, the trigger fires on the virtual address or data of a store.
+ */
+#define CSR_MCONTROL_STORE_OFFSET           1
+#define CSR_MCONTROL_STORE_LENGTH           1
+#define CSR_MCONTROL_STORE                  (0x1L << CSR_MCONTROL_STORE_OFFSET)
+/*
+* When set, the trigger fires on the virtual address or data of a load.
+ */
+#define CSR_MCONTROL_LOAD_OFFSET            0
+#define CSR_MCONTROL_LOAD_LENGTH            1
+#define CSR_MCONTROL_LOAD                   (0x1L << CSR_MCONTROL_LOAD_OFFSET)
+#define CSR_ICOUNT                          0x7a1
+#define CSR_ICOUNT_TYPE_OFFSET              XLEN-4
+#define CSR_ICOUNT_TYPE_LENGTH              4
+#define CSR_ICOUNT_TYPE                     (0xfL << CSR_ICOUNT_TYPE_OFFSET)
+#define CSR_ICOUNT_DMODE_OFFSET             XLEN-5
+#define CSR_ICOUNT_DMODE_LENGTH             1
+#define CSR_ICOUNT_DMODE                    (0x1L << CSR_ICOUNT_DMODE_OFFSET)
+/*
+* When count is decremented to 0, the trigger fires. Instead of
+* changing \Fcount from 1 to 0, it is also acceptable for hardware to
+* clear \Fm, \Fh, \Fs, and \Fu. This allows \Fcount to be hard-wired
+* to 1 if this register just exists for single step.
+ */
+#define CSR_ICOUNT_COUNT_OFFSET             10
+#define CSR_ICOUNT_COUNT_LENGTH             14
+#define CSR_ICOUNT_COUNT                    (0x3fffL << CSR_ICOUNT_COUNT_OFFSET)
+/*
+* When set, every instruction completed or exception taken in M mode decrements \Fcount
+* by 1.
+ */
+#define CSR_ICOUNT_M_OFFSET                 9
+#define CSR_ICOUNT_M_LENGTH                 1
+#define CSR_ICOUNT_M                        (0x1L << CSR_ICOUNT_M_OFFSET)
+/*
+* When set, every instruction completed or exception taken in H mode decrements \Fcount
+* by 1.
+ */
+#define CSR_ICOUNT_H_OFFSET                 8
+#define CSR_ICOUNT_H_LENGTH                 1
+#define CSR_ICOUNT_H                        (0x1L << CSR_ICOUNT_H_OFFSET)
+/*
+* When set, every instruction completed or exception taken in S mode decrements \Fcount
+* by 1.
+ */
+#define CSR_ICOUNT_S_OFFSET                 7
+#define CSR_ICOUNT_S_LENGTH                 1
+#define CSR_ICOUNT_S                        (0x1L << CSR_ICOUNT_S_OFFSET)
+/*
+* When set, every instruction completed or exception taken in U mode decrements \Fcount
+* by 1.
+ */
+#define CSR_ICOUNT_U_OFFSET                 6
+#define CSR_ICOUNT_U_LENGTH                 1
+#define CSR_ICOUNT_U                        (0x1L << CSR_ICOUNT_U_OFFSET)
+/*
+* Determines what happens when this trigger matches.
+*
+* 0: Raise a breakpoint exception. (Used when software wants to use the
+* trigger module without an external debugger attached.)
+*
+* 1: Enter Debug Mode. (Only supported when \Fhmode is 1.)
+*
+* 2: Start tracing.
+*
+* 3: Stop tracing.
+*
+* 4: Emit trace data for this match. If it is a data access match,
+* emit appropriate Load/Store Address/Data. If it is an instruction
+* execution, emit its PC.
+*
+* Other values are reserved for future use.
+ */
+#define CSR_ICOUNT_ACTION_OFFSET            0
+#define CSR_ICOUNT_ACTION_LENGTH            6
+#define CSR_ICOUNT_ACTION                   (0x3fL << CSR_ICOUNT_ACTION_OFFSET)
+#define DMI_DMSTATUS                        0x11
+/*
+* This field is 1 when all currently selected harts have acknowledged the previous \Fresumereq.
+ */
+#define DMI_DMSTATUS_ALLRESUMEACK_OFFSET    17
+#define DMI_DMSTATUS_ALLRESUMEACK_LENGTH    1
+#define DMI_DMSTATUS_ALLRESUMEACK           (0x1 << DMI_DMSTATUS_ALLRESUMEACK_OFFSET)
+/*
+* This field is 1 when any currently selected hart has acknowledged the previous \Fresumereq.
+ */
+#define DMI_DMSTATUS_ANYRESUMEACK_OFFSET    16
+#define DMI_DMSTATUS_ANYRESUMEACK_LENGTH    1
+#define DMI_DMSTATUS_ANYRESUMEACK           (0x1 << DMI_DMSTATUS_ANYRESUMEACK_OFFSET)
+/*
+* This field is 1 when all currently selected harts do not exist in this system.
+ */
+#define DMI_DMSTATUS_ALLNONEXISTENT_OFFSET  15
+#define DMI_DMSTATUS_ALLNONEXISTENT_LENGTH  1
+#define DMI_DMSTATUS_ALLNONEXISTENT         (0x1 << DMI_DMSTATUS_ALLNONEXISTENT_OFFSET)
+/*
+* This field is 1 when any currently selected hart does not exist in this system.
+ */
+#define DMI_DMSTATUS_ANYNONEXISTENT_OFFSET  14
+#define DMI_DMSTATUS_ANYNONEXISTENT_LENGTH  1
+#define DMI_DMSTATUS_ANYNONEXISTENT         (0x1 << DMI_DMSTATUS_ANYNONEXISTENT_OFFSET)
+/*
+* This field is 1 when all currently selected harts are unavailable.
+ */
+#define DMI_DMSTATUS_ALLUNAVAIL_OFFSET      13
+#define DMI_DMSTATUS_ALLUNAVAIL_LENGTH      1
+#define DMI_DMSTATUS_ALLUNAVAIL             (0x1 << DMI_DMSTATUS_ALLUNAVAIL_OFFSET)
+/*
+* This field is 1 when any currently selected hart is unavailable.
+ */
+#define DMI_DMSTATUS_ANYUNAVAIL_OFFSET      12
+#define DMI_DMSTATUS_ANYUNAVAIL_LENGTH      1
+#define DMI_DMSTATUS_ANYUNAVAIL             (0x1 << DMI_DMSTATUS_ANYUNAVAIL_OFFSET)
+/*
+* This field is 1 when all currently selected harts are running.
+ */
+#define DMI_DMSTATUS_ALLRUNNING_OFFSET      11
+#define DMI_DMSTATUS_ALLRUNNING_LENGTH      1
+#define DMI_DMSTATUS_ALLRUNNING             (0x1 << DMI_DMSTATUS_ALLRUNNING_OFFSET)
+/*
+* This field is 1 when any currently selected hart is running.
+ */
+#define DMI_DMSTATUS_ANYRUNNING_OFFSET      10
+#define DMI_DMSTATUS_ANYRUNNING_LENGTH      1
+#define DMI_DMSTATUS_ANYRUNNING             (0x1 << DMI_DMSTATUS_ANYRUNNING_OFFSET)
+/*
+* This field is 1 when all currently selected harts are halted.
+ */
+#define DMI_DMSTATUS_ALLHALTED_OFFSET       9
+#define DMI_DMSTATUS_ALLHALTED_LENGTH       1
+#define DMI_DMSTATUS_ALLHALTED              (0x1 << DMI_DMSTATUS_ALLHALTED_OFFSET)
+/*
+* This field is 1 when any currently selected hart is halted.
+ */
+#define DMI_DMSTATUS_ANYHALTED_OFFSET       8
+#define DMI_DMSTATUS_ANYHALTED_LENGTH       1
+#define DMI_DMSTATUS_ANYHALTED              (0x1 << DMI_DMSTATUS_ANYHALTED_OFFSET)
+/*
+* 0 when authentication is required before using the DM.  1 when the
+* authentication check has passed. On components that don't implement
+* authentication, this bit must be preset as 1.
+ */
+#define DMI_DMSTATUS_AUTHENTICATED_OFFSET   7
+#define DMI_DMSTATUS_AUTHENTICATED_LENGTH   1
+#define DMI_DMSTATUS_AUTHENTICATED          (0x1 << DMI_DMSTATUS_AUTHENTICATED_OFFSET)
+/*
+* 0: The authentication module is ready to process the next
+* read/write to \Rauthdata.
+*
+* 1: The authentication module is busy. Accessing \Rauthdata results
+* in unspecified behavior.
+*
+* \Fauthbusy only becomes set in immediate response to an access to
+* \Rauthdata.
+ */
+#define DMI_DMSTATUS_AUTHBUSY_OFFSET        6
+#define DMI_DMSTATUS_AUTHBUSY_LENGTH        1
+#define DMI_DMSTATUS_AUTHBUSY               (0x1 << DMI_DMSTATUS_AUTHBUSY_OFFSET)
+#define DMI_DMSTATUS_CFGSTRVALID_OFFSET     4
+#define DMI_DMSTATUS_CFGSTRVALID_LENGTH     1
+#define DMI_DMSTATUS_CFGSTRVALID            (0x1 << DMI_DMSTATUS_CFGSTRVALID_OFFSET)
+/*
+* 0: There is no Debug Module present.
+*
+* 1: There is a Debug Module and it conforms to version 0.11 of this
+* specification.
+*
+* 2: There is a Debug Module and it conforms to version 0.13 of this
+* specification.
+ */
+#define DMI_DMSTATUS_VERSION_OFFSET         0
+#define DMI_DMSTATUS_VERSION_LENGTH         4
+#define DMI_DMSTATUS_VERSION                (0xf << DMI_DMSTATUS_VERSION_OFFSET)
+#define DMI_DMCONTROL                       0x10
+/*
+* Halt request signal for all currently selected harts. When set to
+* 1, each selected hart will halt if it is not currently halted.
+*
+* Writing 1 or 0 has no effect on a hart which is already halted, but
+* the bit should be cleared to 0 before the hart is resumed.
+* Setting both \Fhaltreq and \Fresumereq leads to undefined behavior.
+*
+* Writes apply to the new value of \Fhartsel and \Fhasel.
+ */
+#define DMI_DMCONTROL_HALTREQ_OFFSET        31
+#define DMI_DMCONTROL_HALTREQ_LENGTH        1
+#define DMI_DMCONTROL_HALTREQ               (0x1 << DMI_DMCONTROL_HALTREQ_OFFSET)
+/*
+* Resume request signal for all currently selected harts. When set to 1,
+* each selected hart will resume if it is currently halted.
+* Setting both \Fhaltreq and \Fresumereq leads to undefined behavior.
+*
+* Writes apply to the new value of \Fhartsel and \Fhasel.
+ */
+#define DMI_DMCONTROL_RESUMEREQ_OFFSET      30
+#define DMI_DMCONTROL_RESUMEREQ_LENGTH      1
+#define DMI_DMCONTROL_RESUMEREQ             (0x1 << DMI_DMCONTROL_RESUMEREQ_OFFSET)
+/*
+* This optional bit controls reset to all the currently selected harts.
+* To perform a reset the debugger writes 1, and then writes 0 to
+* deassert the reset signal.
+*
+* If this feature is not implemented, the bit always stays 0, so
+* after writing 1 the debugger can read the register back to see if
+* the feature is supported.
+*
+* Writes apply to the new value of \Fhartsel and \Fhasel.
+ */
+#define DMI_DMCONTROL_HARTRESET_OFFSET      29
+#define DMI_DMCONTROL_HARTRESET_LENGTH      1
+#define DMI_DMCONTROL_HARTRESET             (0x1 << DMI_DMCONTROL_HARTRESET_OFFSET)
+/*
+* Selects the definition of currently selected harts.
+*
+* 0: There is a single currently selected hart, that selected by \Fhartsel.
+*
+* 1: There may be multiple currently selected harts -- that selected by \Fhartsel,
+* plus those selected by the hart array mask register.
+*
+* An implementation which does not implement the hart array mask register
+* should tie this field to 0. A debugger which wishes to use the hart array
+* mask register feature should set this bit and read back to see if the functionality
+* is supported.
+ */
+#define DMI_DMCONTROL_HASEL_OFFSET          26
+#define DMI_DMCONTROL_HASEL_LENGTH          1
+#define DMI_DMCONTROL_HASEL                 (0x1 << DMI_DMCONTROL_HASEL_OFFSET)
+/*
+* The DM-specific index of the hart to select. This hart is always part of the
+* currently selected harts.
+ */
+#define DMI_DMCONTROL_HARTSEL_OFFSET        16
+#define DMI_DMCONTROL_HARTSEL_LENGTH        10
+#define DMI_DMCONTROL_HARTSEL               (0x3ff << DMI_DMCONTROL_HARTSEL_OFFSET)
+/*
+* This bit controls the reset signal from the DM to the rest of the
+* system. To perform a system reset the debugger writes 1,
+* and then writes 0
+* to deassert the reset. This bit must not reset the Debug Module
+* registers. What it does reset is platform-specific (it may
+* reset nothing).
+ */
+#define DMI_DMCONTROL_NDMRESET_OFFSET       1
+#define DMI_DMCONTROL_NDMRESET_LENGTH       1
+#define DMI_DMCONTROL_NDMRESET              (0x1 << DMI_DMCONTROL_NDMRESET_OFFSET)
+/*
+* This bit serves as a reset signal for the Debug Module itself.
+*
+* 0: The module's state, including authentication mechanism,
+* takes its reset values (the \Fdmactive bit is the only bit which can
+* be written to something other than its reset value).
+*
+* 1: The module functions normally.
+*
+* No other mechanism should exist that may result in resetting the
+* Debug Module after power up, including the platform's system reset
+* or Debug Transport reset signals.
+*
+* A debugger may pulse this bit low to get the debug module into a
+* known state.
+*
+* Implementations may use this bit to aid debugging, for example by
+* preventing the Debug Module from being power gated while debugging
+* is active.
+ */
+#define DMI_DMCONTROL_DMACTIVE_OFFSET       0
+#define DMI_DMCONTROL_DMACTIVE_LENGTH       1
+#define DMI_DMCONTROL_DMACTIVE              (0x1 << DMI_DMCONTROL_DMACTIVE_OFFSET)
+#define DMI_HARTINFO                        0x12
+/*
+* Number of {\tt dscratch} registers available for the debugger
+* to use during program buffer execution, starting from \Rdscratchzero.
+* The debugger can make no assumptions about the contents of these
+* registers between commands.
+ */
+#define DMI_HARTINFO_NSCRATCH_OFFSET        20
+#define DMI_HARTINFO_NSCRATCH_LENGTH        4
+#define DMI_HARTINFO_NSCRATCH               (0xf << DMI_HARTINFO_NSCRATCH_OFFSET)
+/*
+* 0: The {\tt data} registers are shadowed in the hart by CSR
+* registers. Each CSR register is XLEN bits in size, and corresponds
+* to a single argument, per Table~\ref{tab:datareg}.
+*
+* 1: The {\tt data} registers are shadowed in the hart's memory map.
+* Each register takes up 4 bytes in the memory map.
+ */
+#define DMI_HARTINFO_DATAACCESS_OFFSET      16
+#define DMI_HARTINFO_DATAACCESS_LENGTH      1
+#define DMI_HARTINFO_DATAACCESS             (0x1 << DMI_HARTINFO_DATAACCESS_OFFSET)
+/*
+* If \Fdataaccess is 0: Number of CSR registers dedicated to
+* shadowing the {\tt data} registers.
+*
+* If \Fdataaccess is 1: Number of 32-bit words in the memory map
+* dedicated to shadowing the {\tt data} registers.
+ */
+#define DMI_HARTINFO_DATASIZE_OFFSET        12
+#define DMI_HARTINFO_DATASIZE_LENGTH        4
+#define DMI_HARTINFO_DATASIZE               (0xf << DMI_HARTINFO_DATASIZE_OFFSET)
+/*
+* If \Fdataaccess is 0: The number of the first CSR dedicated to
+* shadowing the {\tt data} registers.
+*
+* If \Fdataaccess is 1: Signed address of RAM where the {\tt data}
+* registers are shadowed.
+ */
+#define DMI_HARTINFO_DATAADDR_OFFSET        0
+#define DMI_HARTINFO_DATAADDR_LENGTH        12
+#define DMI_HARTINFO_DATAADDR               (0xfff << DMI_HARTINFO_DATAADDR_OFFSET)
+#define DMI_HALTSUM                         0x13
+#define DMI_HALTSUM_HALT1023_992_OFFSET     31
+#define DMI_HALTSUM_HALT1023_992_LENGTH     1
+#define DMI_HALTSUM_HALT1023_992            (0x1U << DMI_HALTSUM_HALT1023_992_OFFSET) /* unsigned: the field is bit 31, a signed 1 << 31 would yield a negative, sign-extending mask */
+#define DMI_HALTSUM_HALT991_960_OFFSET      30
+#define DMI_HALTSUM_HALT991_960_LENGTH      1
+#define DMI_HALTSUM_HALT991_960             (0x1 << DMI_HALTSUM_HALT991_960_OFFSET)
+#define DMI_HALTSUM_HALT959_928_OFFSET      29
+#define DMI_HALTSUM_HALT959_928_LENGTH      1
+#define DMI_HALTSUM_HALT959_928             (0x1 << DMI_HALTSUM_HALT959_928_OFFSET)
+#define DMI_HALTSUM_HALT927_896_OFFSET      28
+#define DMI_HALTSUM_HALT927_896_LENGTH      1
+#define DMI_HALTSUM_HALT927_896             (0x1 << DMI_HALTSUM_HALT927_896_OFFSET)
+#define DMI_HALTSUM_HALT895_864_OFFSET      27
+#define DMI_HALTSUM_HALT895_864_LENGTH      1
+#define DMI_HALTSUM_HALT895_864             (0x1 << DMI_HALTSUM_HALT895_864_OFFSET)
+#define DMI_HALTSUM_HALT863_832_OFFSET      26
+#define DMI_HALTSUM_HALT863_832_LENGTH      1
+#define DMI_HALTSUM_HALT863_832             (0x1 << DMI_HALTSUM_HALT863_832_OFFSET)
+#define DMI_HALTSUM_HALT831_800_OFFSET      25
+#define DMI_HALTSUM_HALT831_800_LENGTH      1
+#define DMI_HALTSUM_HALT831_800             (0x1 << DMI_HALTSUM_HALT831_800_OFFSET)
+#define DMI_HALTSUM_HALT799_768_OFFSET      24
+#define DMI_HALTSUM_HALT799_768_LENGTH      1
+#define DMI_HALTSUM_HALT799_768             (0x1 << DMI_HALTSUM_HALT799_768_OFFSET)
+#define DMI_HALTSUM_HALT767_736_OFFSET      23
+#define DMI_HALTSUM_HALT767_736_LENGTH      1
+#define DMI_HALTSUM_HALT767_736             (0x1 << DMI_HALTSUM_HALT767_736_OFFSET)
+#define DMI_HALTSUM_HALT735_704_OFFSET      22
+#define DMI_HALTSUM_HALT735_704_LENGTH      1
+#define DMI_HALTSUM_HALT735_704             (0x1 << DMI_HALTSUM_HALT735_704_OFFSET)
+#define DMI_HALTSUM_HALT703_672_OFFSET      21
+#define DMI_HALTSUM_HALT703_672_LENGTH      1
+#define DMI_HALTSUM_HALT703_672             (0x1 << DMI_HALTSUM_HALT703_672_OFFSET)
+#define DMI_HALTSUM_HALT671_640_OFFSET      20
+#define DMI_HALTSUM_HALT671_640_LENGTH      1
+#define DMI_HALTSUM_HALT671_640             (0x1 << DMI_HALTSUM_HALT671_640_OFFSET)
+#define DMI_HALTSUM_HALT639_608_OFFSET      19
+#define DMI_HALTSUM_HALT639_608_LENGTH      1
+#define DMI_HALTSUM_HALT639_608             (0x1 << DMI_HALTSUM_HALT639_608_OFFSET)
+#define DMI_HALTSUM_HALT607_576_OFFSET      18
+#define DMI_HALTSUM_HALT607_576_LENGTH      1
+#define DMI_HALTSUM_HALT607_576             (0x1 << DMI_HALTSUM_HALT607_576_OFFSET)
+#define DMI_HALTSUM_HALT575_544_OFFSET      17
+#define DMI_HALTSUM_HALT575_544_LENGTH      1
+#define DMI_HALTSUM_HALT575_544             (0x1 << DMI_HALTSUM_HALT575_544_OFFSET)
+#define DMI_HALTSUM_HALT543_512_OFFSET      16
+#define DMI_HALTSUM_HALT543_512_LENGTH      1
+#define DMI_HALTSUM_HALT543_512             (0x1 << DMI_HALTSUM_HALT543_512_OFFSET)
+#define DMI_HALTSUM_HALT511_480_OFFSET      15
+#define DMI_HALTSUM_HALT511_480_LENGTH      1
+#define DMI_HALTSUM_HALT511_480             (0x1 << DMI_HALTSUM_HALT511_480_OFFSET)
+#define DMI_HALTSUM_HALT479_448_OFFSET      14
+#define DMI_HALTSUM_HALT479_448_LENGTH      1
+#define DMI_HALTSUM_HALT479_448             (0x1 << DMI_HALTSUM_HALT479_448_OFFSET)
+#define DMI_HALTSUM_HALT447_416_OFFSET      13
+#define DMI_HALTSUM_HALT447_416_LENGTH      1
+#define DMI_HALTSUM_HALT447_416             (0x1 << DMI_HALTSUM_HALT447_416_OFFSET)
+#define DMI_HALTSUM_HALT415_384_OFFSET      12
+#define DMI_HALTSUM_HALT415_384_LENGTH      1
+#define DMI_HALTSUM_HALT415_384             (0x1 << DMI_HALTSUM_HALT415_384_OFFSET)
+#define DMI_HALTSUM_HALT383_352_OFFSET      11
+#define DMI_HALTSUM_HALT383_352_LENGTH      1
+#define DMI_HALTSUM_HALT383_352             (0x1 << DMI_HALTSUM_HALT383_352_OFFSET)
+#define DMI_HALTSUM_HALT351_320_OFFSET      10
+#define DMI_HALTSUM_HALT351_320_LENGTH      1
+#define DMI_HALTSUM_HALT351_320             (0x1 << DMI_HALTSUM_HALT351_320_OFFSET)
+#define DMI_HALTSUM_HALT319_288_OFFSET      9
+#define DMI_HALTSUM_HALT319_288_LENGTH      1
+#define DMI_HALTSUM_HALT319_288             (0x1 << DMI_HALTSUM_HALT319_288_OFFSET)
+#define DMI_HALTSUM_HALT287_256_OFFSET      8
+#define DMI_HALTSUM_HALT287_256_LENGTH      1
+#define DMI_HALTSUM_HALT287_256             (0x1 << DMI_HALTSUM_HALT287_256_OFFSET)
+#define DMI_HALTSUM_HALT255_224_OFFSET      7
+#define DMI_HALTSUM_HALT255_224_LENGTH      1
+#define DMI_HALTSUM_HALT255_224             (0x1 << DMI_HALTSUM_HALT255_224_OFFSET)
+#define DMI_HALTSUM_HALT223_192_OFFSET      6
+#define DMI_HALTSUM_HALT223_192_LENGTH      1
+#define DMI_HALTSUM_HALT223_192             (0x1 << DMI_HALTSUM_HALT223_192_OFFSET)
+#define DMI_HALTSUM_HALT191_160_OFFSET      5
+#define DMI_HALTSUM_HALT191_160_LENGTH      1
+#define DMI_HALTSUM_HALT191_160             (0x1 << DMI_HALTSUM_HALT191_160_OFFSET)
+#define DMI_HALTSUM_HALT159_128_OFFSET      4
+#define DMI_HALTSUM_HALT159_128_LENGTH      1
+#define DMI_HALTSUM_HALT159_128             (0x1 << DMI_HALTSUM_HALT159_128_OFFSET)
+#define DMI_HALTSUM_HALT127_96_OFFSET       3
+#define DMI_HALTSUM_HALT127_96_LENGTH       1
+#define DMI_HALTSUM_HALT127_96              (0x1 << DMI_HALTSUM_HALT127_96_OFFSET)
+#define DMI_HALTSUM_HALT95_64_OFFSET        2
+#define DMI_HALTSUM_HALT95_64_LENGTH        1
+#define DMI_HALTSUM_HALT95_64               (0x1 << DMI_HALTSUM_HALT95_64_OFFSET)
+#define DMI_HALTSUM_HALT63_32_OFFSET        1
+#define DMI_HALTSUM_HALT63_32_LENGTH        1
+#define DMI_HALTSUM_HALT63_32               (0x1 << DMI_HALTSUM_HALT63_32_OFFSET)
+#define DMI_HALTSUM_HALT31_0_OFFSET         0
+#define DMI_HALTSUM_HALT31_0_LENGTH         1
+#define DMI_HALTSUM_HALT31_0                (0x1 << DMI_HALTSUM_HALT31_0_OFFSET)
+#define DMI_HAWINDOWSEL                     0x14
+#define DMI_HAWINDOWSEL_HAWINDOWSEL_OFFSET  0
+#define DMI_HAWINDOWSEL_HAWINDOWSEL_LENGTH  5
+#define DMI_HAWINDOWSEL_HAWINDOWSEL         (0x1f << DMI_HAWINDOWSEL_HAWINDOWSEL_OFFSET)
+#define DMI_HAWINDOW                        0x15
+#define DMI_HAWINDOW_MASKDATA_OFFSET        0
+#define DMI_HAWINDOW_MASKDATA_LENGTH        32
+#define DMI_HAWINDOW_MASKDATA               (0xffffffff << DMI_HAWINDOW_MASKDATA_OFFSET)
+#define DMI_ABSTRACTCS                      0x16
+/*
+* Size of the Program Buffer, in 32-bit words. Valid sizes are 0 - 16.
+*
+* TODO: Explain what can be done with each size of the buffer, to suggest
+* why you would want more or fewer words.
+ */
+#define DMI_ABSTRACTCS_PROGSIZE_OFFSET      24
+#define DMI_ABSTRACTCS_PROGSIZE_LENGTH      5
+#define DMI_ABSTRACTCS_PROGSIZE             (0x1f << DMI_ABSTRACTCS_PROGSIZE_OFFSET)
+/*
+* 1: An abstract command is currently being executed.
+*
+* This bit is set as soon as \Rcommand is written, and is
+* not cleared until that command has completed.
+ */
+#define DMI_ABSTRACTCS_BUSY_OFFSET          12
+#define DMI_ABSTRACTCS_BUSY_LENGTH          1
+#define DMI_ABSTRACTCS_BUSY                 (0x1 << DMI_ABSTRACTCS_BUSY_OFFSET)
+/*
+* Gets set if an abstract command fails. The bits in this field remain set until
+* they are cleared by writing 1 to them. No abstract command is
+* started until the value is reset to 0.
+*
+* 0 (none): No error.
+*
+* 1 (busy): An abstract command was executing while \Rcommand,
+* \Rabstractcs, or \Rabstractauto was written, or when one
+* of the {\tt data} or {\tt progbuf} registers was read or written.
+*
+* 2 (not supported): The requested command is not supported. A
+* command that is not supported while the hart is running may be
+* supported when it is halted.
+*
+* 3 (exception): An exception occurred while executing the command
+* (eg. while executing the Program Buffer).
+*
+* 4 (halt/resume): An abstract command couldn't execute because the
+* hart wasn't in the expected state (running/halted).
+*
+* 7 (other): The command failed for another reason.
+ */
+#define DMI_ABSTRACTCS_CMDERR_OFFSET        8
+#define DMI_ABSTRACTCS_CMDERR_LENGTH        3
+#define DMI_ABSTRACTCS_CMDERR               (0x7 << DMI_ABSTRACTCS_CMDERR_OFFSET)
+/*
+* Number of {\tt data} registers that are implemented as part of the
+* abstract command interface. Valid sizes are 0 - 12.
+ */
+#define DMI_ABSTRACTCS_DATACOUNT_OFFSET     0
+#define DMI_ABSTRACTCS_DATACOUNT_LENGTH     5
+#define DMI_ABSTRACTCS_DATACOUNT            (0x1f << DMI_ABSTRACTCS_DATACOUNT_OFFSET)
+#define DMI_COMMAND                         0x17
+/*
+* The type determines the overall functionality of this
+* abstract command.
+ */
+#define DMI_COMMAND_CMDTYPE_OFFSET          24
+#define DMI_COMMAND_CMDTYPE_LENGTH          8
+#define DMI_COMMAND_CMDTYPE                 (0xffU << DMI_COMMAND_CMDTYPE_OFFSET) /* unsigned: the field covers bits 31:24, a signed shift would yield a negative, sign-extending mask */
+/*
+* This field is interpreted in a command-specific manner,
+* described for each abstract command.
+ */
+#define DMI_COMMAND_CONTROL_OFFSET          0
+#define DMI_COMMAND_CONTROL_LENGTH          24
+#define DMI_COMMAND_CONTROL                 (0xffffff << DMI_COMMAND_CONTROL_OFFSET)
+#define DMI_ABSTRACTAUTO                    0x18
+/*
+* When a bit in this field is 1, read or write accesses to the corresponding {\tt progbuf} word
+* cause the command in \Rcommand to be executed again.
+ */
+#define DMI_ABSTRACTAUTO_AUTOEXECPROGBUF_OFFSET 16
+#define DMI_ABSTRACTAUTO_AUTOEXECPROGBUF_LENGTH 16
+#define DMI_ABSTRACTAUTO_AUTOEXECPROGBUF    (0xffffU << DMI_ABSTRACTAUTO_AUTOEXECPROGBUF_OFFSET) /* unsigned: the field covers bits 31:16, a signed shift would yield a negative, sign-extending mask */
+/*
+* When a bit in this field is 1, read or write accesses to the corresponding {\tt data} word
+* cause the command in \Rcommand to be executed again.
+ */
+#define DMI_ABSTRACTAUTO_AUTOEXECDATA_OFFSET 0
+#define DMI_ABSTRACTAUTO_AUTOEXECDATA_LENGTH 12
+#define DMI_ABSTRACTAUTO_AUTOEXECDATA       (0xfff << DMI_ABSTRACTAUTO_AUTOEXECDATA_OFFSET)
+#define DMI_CFGSTRADDR0                     0x19
+#define DMI_CFGSTRADDR0_ADDR_OFFSET         0
+#define DMI_CFGSTRADDR0_ADDR_LENGTH         32
+#define DMI_CFGSTRADDR0_ADDR                (0xffffffff << DMI_CFGSTRADDR0_ADDR_OFFSET)
+#define DMI_CFGSTRADDR1                     0x1a
+#define DMI_CFGSTRADDR2                     0x1b
+#define DMI_CFGSTRADDR3                     0x1c
+#define DMI_DATA0                           0x04
+#define DMI_DATA0_DATA_OFFSET               0
+#define DMI_DATA0_DATA_LENGTH               32
+#define DMI_DATA0_DATA                      (0xffffffff << DMI_DATA0_DATA_OFFSET)
+#define DMI_DATA11                          0x0f
+#define DMI_PROGBUF0                        0x20
+#define DMI_PROGBUF0_DATA_OFFSET            0
+#define DMI_PROGBUF0_DATA_LENGTH            32
+#define DMI_PROGBUF0_DATA                   (0xffffffff << DMI_PROGBUF0_DATA_OFFSET)
+#define DMI_PROGBUF15                       0x2f
+#define DMI_AUTHDATA                        0x30
+#define DMI_AUTHDATA_DATA_OFFSET            0
+#define DMI_AUTHDATA_DATA_LENGTH            32
+#define DMI_AUTHDATA_DATA                   (0xffffffff << DMI_AUTHDATA_DATA_OFFSET)
+#define DMI_SERCS                           0x34
+/*
+* Number of supported serial ports.
+ */
+#define DMI_SERCS_SERIALCOUNT_OFFSET        28
+#define DMI_SERCS_SERIALCOUNT_LENGTH        4
+#define DMI_SERCS_SERIALCOUNT               (0xfU << DMI_SERCS_SERIALCOUNT_OFFSET) /* unsigned: the field covers bits 31:28, a signed shift would yield a negative, sign-extending mask */
+/*
+* Select which serial port is accessed by \Rserrx and \Rsertx.
+ */
+#define DMI_SERCS_SERIAL_OFFSET             24
+#define DMI_SERCS_SERIAL_LENGTH             3
+#define DMI_SERCS_SERIAL                    (0x7 << DMI_SERCS_SERIAL_OFFSET)
+#define DMI_SERCS_ERROR7_OFFSET             23
+#define DMI_SERCS_ERROR7_LENGTH             1
+#define DMI_SERCS_ERROR7                    (0x1 << DMI_SERCS_ERROR7_OFFSET)
+#define DMI_SERCS_VALID7_OFFSET             22
+#define DMI_SERCS_VALID7_LENGTH             1
+#define DMI_SERCS_VALID7                    (0x1 << DMI_SERCS_VALID7_OFFSET)
+#define DMI_SERCS_FULL7_OFFSET              21
+#define DMI_SERCS_FULL7_LENGTH              1
+#define DMI_SERCS_FULL7                     (0x1 << DMI_SERCS_FULL7_OFFSET)
+#define DMI_SERCS_ERROR6_OFFSET             20
+#define DMI_SERCS_ERROR6_LENGTH             1
+#define DMI_SERCS_ERROR6                    (0x1 << DMI_SERCS_ERROR6_OFFSET)
+#define DMI_SERCS_VALID6_OFFSET             19
+#define DMI_SERCS_VALID6_LENGTH             1
+#define DMI_SERCS_VALID6                    (0x1 << DMI_SERCS_VALID6_OFFSET)
+#define DMI_SERCS_FULL6_OFFSET              18
+#define DMI_SERCS_FULL6_LENGTH              1
+#define DMI_SERCS_FULL6                     (0x1 << DMI_SERCS_FULL6_OFFSET)
+#define DMI_SERCS_ERROR5_OFFSET             17
+#define DMI_SERCS_ERROR5_LENGTH             1
+#define DMI_SERCS_ERROR5                    (0x1 << DMI_SERCS_ERROR5_OFFSET)
+#define DMI_SERCS_VALID5_OFFSET             16
+#define DMI_SERCS_VALID5_LENGTH             1
+#define DMI_SERCS_VALID5                    (0x1 << DMI_SERCS_VALID5_OFFSET)
+#define DMI_SERCS_FULL5_OFFSET              15
+#define DMI_SERCS_FULL5_LENGTH              1
+#define DMI_SERCS_FULL5                     (0x1 << DMI_SERCS_FULL5_OFFSET)
+#define DMI_SERCS_ERROR4_OFFSET             14
+#define DMI_SERCS_ERROR4_LENGTH             1
+#define DMI_SERCS_ERROR4                    (0x1 << DMI_SERCS_ERROR4_OFFSET)
+#define DMI_SERCS_VALID4_OFFSET             13
+#define DMI_SERCS_VALID4_LENGTH             1
+#define DMI_SERCS_VALID4                    (0x1 << DMI_SERCS_VALID4_OFFSET)
+#define DMI_SERCS_FULL4_OFFSET              12
+#define DMI_SERCS_FULL4_LENGTH              1
+#define DMI_SERCS_FULL4                     (0x1 << DMI_SERCS_FULL4_OFFSET)
+#define DMI_SERCS_ERROR3_OFFSET             11
+#define DMI_SERCS_ERROR3_LENGTH             1
+#define DMI_SERCS_ERROR3                    (0x1 << DMI_SERCS_ERROR3_OFFSET)
+#define DMI_SERCS_VALID3_OFFSET             10
+#define DMI_SERCS_VALID3_LENGTH             1
+#define DMI_SERCS_VALID3                    (0x1 << DMI_SERCS_VALID3_OFFSET)
+#define DMI_SERCS_FULL3_OFFSET              9
+#define DMI_SERCS_FULL3_LENGTH              1
+#define DMI_SERCS_FULL3                     (0x1 << DMI_SERCS_FULL3_OFFSET)
+#define DMI_SERCS_ERROR2_OFFSET             8
+#define DMI_SERCS_ERROR2_LENGTH             1
+#define DMI_SERCS_ERROR2                    (0x1 << DMI_SERCS_ERROR2_OFFSET)
+#define DMI_SERCS_VALID2_OFFSET             7
+#define DMI_SERCS_VALID2_LENGTH             1
+#define DMI_SERCS_VALID2                    (0x1 << DMI_SERCS_VALID2_OFFSET)
+#define DMI_SERCS_FULL2_OFFSET              6
+#define DMI_SERCS_FULL2_LENGTH              1
+#define DMI_SERCS_FULL2                     (0x1 << DMI_SERCS_FULL2_OFFSET)
+#define DMI_SERCS_ERROR1_OFFSET             5
+#define DMI_SERCS_ERROR1_LENGTH             1
+#define DMI_SERCS_ERROR1                    (0x1 << DMI_SERCS_ERROR1_OFFSET)
+#define DMI_SERCS_VALID1_OFFSET             4
+#define DMI_SERCS_VALID1_LENGTH             1
+#define DMI_SERCS_VALID1                    (0x1 << DMI_SERCS_VALID1_OFFSET)
+#define DMI_SERCS_FULL1_OFFSET              3
+#define DMI_SERCS_FULL1_LENGTH              1
+#define DMI_SERCS_FULL1                     (0x1 << DMI_SERCS_FULL1_OFFSET)
+/*
+* 1 when the debugger-to-core queue for serial port 0 has
+* overflowed or underflowed. This bit will remain set until it is reset by
+* writing 1 to this bit.
+ */
+#define DMI_SERCS_ERROR0_OFFSET             2
+#define DMI_SERCS_ERROR0_LENGTH             1
+#define DMI_SERCS_ERROR0                    (0x1 << DMI_SERCS_ERROR0_OFFSET)
+/*
+* 1 when the core-to-debugger queue for serial port 0 is not empty.
+ */
+#define DMI_SERCS_VALID0_OFFSET             1
+#define DMI_SERCS_VALID0_LENGTH             1
+#define DMI_SERCS_VALID0                    (0x1 << DMI_SERCS_VALID0_OFFSET)
+/*
+* 1 when the debugger-to-core queue for serial port 0 is full.
+ */
+#define DMI_SERCS_FULL0_OFFSET              0
+#define DMI_SERCS_FULL0_LENGTH              1
+#define DMI_SERCS_FULL0                     (0x1 << DMI_SERCS_FULL0_OFFSET)
+#define DMI_SERTX                           0x35
+#define DMI_SERTX_DATA_OFFSET               0
+#define DMI_SERTX_DATA_LENGTH               32
+#define DMI_SERTX_DATA                      (0xffffffff << DMI_SERTX_DATA_OFFSET)
+#define DMI_SERRX                           0x36
+#define DMI_SERRX_DATA_OFFSET               0
+#define DMI_SERRX_DATA_LENGTH               32
+#define DMI_SERRX_DATA                      (0xffffffff << DMI_SERRX_DATA_OFFSET)
+#define DMI_SBCS                            0x38
+/*
+* When a 1 is written here, triggers a read at the address in {\tt
+* sbaddress} using the access size set by \Fsbaccess.
+ */
+#define DMI_SBCS_SBSINGLEREAD_OFFSET        20
+#define DMI_SBCS_SBSINGLEREAD_LENGTH        1
+#define DMI_SBCS_SBSINGLEREAD               (0x1 << DMI_SBCS_SBSINGLEREAD_OFFSET)
+/*
+* Select the access size to use for system bus accesses triggered by
+* writes to the {\tt sbaddress} registers or \Rsbdatazero.
+*
+* 0: 8-bit
+*
+* 1: 16-bit
+*
+* 2: 32-bit
+*
+* 3: 64-bit
+*
+* 4: 128-bit
+*
+* If an unsupported system bus access size is written here,
+* the DM may not perform the access, or may perform the access
+* with any access size.
+ */
+#define DMI_SBCS_SBACCESS_OFFSET            17
+#define DMI_SBCS_SBACCESS_LENGTH            3
+#define DMI_SBCS_SBACCESS                   (0x7 << DMI_SBCS_SBACCESS_OFFSET)
+/*
+* When 1, the internal address value (used by the system bus master)
+* is incremented by the access size (in bytes) selected in \Fsbaccess
+* after every system bus access.
+ */
+#define DMI_SBCS_SBAUTOINCREMENT_OFFSET     16
+#define DMI_SBCS_SBAUTOINCREMENT_LENGTH     1
+#define DMI_SBCS_SBAUTOINCREMENT            (0x1 << DMI_SBCS_SBAUTOINCREMENT_OFFSET)
+/*
+* When 1, every read from \Rsbdatazero automatically triggers a system
+* bus read at the new address.
+ */
+#define DMI_SBCS_SBAUTOREAD_OFFSET          15
+#define DMI_SBCS_SBAUTOREAD_LENGTH          1
+#define DMI_SBCS_SBAUTOREAD                 (0x1 << DMI_SBCS_SBAUTOREAD_OFFSET)
+/*
+* When the debug module's system bus
+* master causes a bus error, this field gets set. The bits in this
+* field remain set until they are cleared by writing 1 to them.
+* While this field is non-zero, no more system bus accesses can be
+* initiated by the debug module.
+*
+* 0: There was no bus error.
+*
+* 1: There was a timeout.
+*
+* 2: A bad address was accessed.
+*
+* 3: There was some other error (eg. alignment).
+*
+* 4: The system bus master was busy when one of the
+* {\tt sbaddress} or {\tt sbdata} registers was written,
+* or the {\tt sbdata} register was read when it had
+* stale data.
+ */
+#define DMI_SBCS_SBERROR_OFFSET             12
+#define DMI_SBCS_SBERROR_LENGTH             3
+#define DMI_SBCS_SBERROR                    (0x7 << DMI_SBCS_SBERROR_OFFSET)
+/*
+* Width of system bus addresses in bits. (0 indicates there is no bus
+* access support.)
+ */
+#define DMI_SBCS_SBASIZE_OFFSET             5
+#define DMI_SBCS_SBASIZE_LENGTH             7
+#define DMI_SBCS_SBASIZE                    (0x7f << DMI_SBCS_SBASIZE_OFFSET)
+/*
+* 1 when 128-bit system bus accesses are supported.
+ */
+#define DMI_SBCS_SBACCESS128_OFFSET         4
+#define DMI_SBCS_SBACCESS128_LENGTH         1
+#define DMI_SBCS_SBACCESS128                (0x1 << DMI_SBCS_SBACCESS128_OFFSET)
+/*
+* 1 when 64-bit system bus accesses are supported.
+ */
+#define DMI_SBCS_SBACCESS64_OFFSET          3
+#define DMI_SBCS_SBACCESS64_LENGTH          1
+#define DMI_SBCS_SBACCESS64                 (0x1 << DMI_SBCS_SBACCESS64_OFFSET)
+/*
+* 1 when 32-bit system bus accesses are supported.
+ */
+#define DMI_SBCS_SBACCESS32_OFFSET          2
+#define DMI_SBCS_SBACCESS32_LENGTH          1
+#define DMI_SBCS_SBACCESS32                 (0x1 << DMI_SBCS_SBACCESS32_OFFSET)
+/*
+* 1 when 16-bit system bus accesses are supported.
+ */
+#define DMI_SBCS_SBACCESS16_OFFSET          1
+#define DMI_SBCS_SBACCESS16_LENGTH          1
+#define DMI_SBCS_SBACCESS16                 (0x1 << DMI_SBCS_SBACCESS16_OFFSET)
+/*
+* 1 when 8-bit system bus accesses are supported.
+ */
+#define DMI_SBCS_SBACCESS8_OFFSET           0
+#define DMI_SBCS_SBACCESS8_LENGTH           1
+#define DMI_SBCS_SBACCESS8                  (0x1 << DMI_SBCS_SBACCESS8_OFFSET)
+#define DMI_SBADDRESS0                      0x39
+/*
+* Accesses bits 31:0 of the internal address.
+ */
+#define DMI_SBADDRESS0_ADDRESS_OFFSET       0
+#define DMI_SBADDRESS0_ADDRESS_LENGTH       32
+#define DMI_SBADDRESS0_ADDRESS              (0xffffffff << DMI_SBADDRESS0_ADDRESS_OFFSET)
+#define DMI_SBADDRESS1                      0x3a
+/*
+* Accesses bits 63:32 of the internal address (if the system address
+* bus is that wide).
+ */
+#define DMI_SBADDRESS1_ADDRESS_OFFSET       0
+#define DMI_SBADDRESS1_ADDRESS_LENGTH       32
+#define DMI_SBADDRESS1_ADDRESS              (0xffffffff << DMI_SBADDRESS1_ADDRESS_OFFSET)
+#define DMI_SBADDRESS2                      0x3b
+/*
+* Accesses bits 95:64 of the internal address (if the system address
+* bus is that wide).
+ */
+#define DMI_SBADDRESS2_ADDRESS_OFFSET       0
+#define DMI_SBADDRESS2_ADDRESS_LENGTH       32
+#define DMI_SBADDRESS2_ADDRESS              (0xffffffff << DMI_SBADDRESS2_ADDRESS_OFFSET)
+#define DMI_SBDATA0                         0x3c
+/*
+* Accesses bits 31:0 of the internal data.
+ */
+#define DMI_SBDATA0_DATA_OFFSET             0
+#define DMI_SBDATA0_DATA_LENGTH             32
+#define DMI_SBDATA0_DATA                    (0xffffffff << DMI_SBDATA0_DATA_OFFSET)
+#define DMI_SBDATA1                         0x3d
+/*
+* Accesses bits 63:32 of the internal data (if the system bus is
+* that wide).
+ */
+#define DMI_SBDATA1_DATA_OFFSET             0
+#define DMI_SBDATA1_DATA_LENGTH             32
+#define DMI_SBDATA1_DATA                    (0xffffffff << DMI_SBDATA1_DATA_OFFSET)
+#define DMI_SBDATA2                         0x3e
+/*
+* Accesses bits 95:64 of the internal data (if the system bus is
+* that wide).
+ */
+#define DMI_SBDATA2_DATA_OFFSET             0
+#define DMI_SBDATA2_DATA_LENGTH             32
+#define DMI_SBDATA2_DATA                    (0xffffffff << DMI_SBDATA2_DATA_OFFSET)
+#define DMI_SBDATA3                         0x3f
+/*
+* Accesses bits 127:96 of the internal data (if the system bus is
+* that wide).
+ */
+#define DMI_SBDATA3_DATA_OFFSET             0
+#define DMI_SBDATA3_DATA_LENGTH             32
+#define DMI_SBDATA3_DATA                    (0xffffffff << DMI_SBDATA3_DATA_OFFSET)
+#define TRACE                               0x728
+/*
+* 1 if the trace buffer has wrapped since the last time \Fdiscard was
+* written. 0 otherwise.
+ */
+#define TRACE_WRAPPED_OFFSET                24
+#define TRACE_WRAPPED_LENGTH                1
+#define TRACE_WRAPPED                       (0x1 << TRACE_WRAPPED_OFFSET)
+/*
+* Emit Timestamp trace sequences.
+ */
+#define TRACE_EMITTIMESTAMP_OFFSET          23
+#define TRACE_EMITTIMESTAMP_LENGTH          1
+#define TRACE_EMITTIMESTAMP                 (0x1 << TRACE_EMITTIMESTAMP_OFFSET)
+/*
+* Emit Store Data trace sequences.
+ */
+#define TRACE_EMITSTOREDATA_OFFSET          22
+#define TRACE_EMITSTOREDATA_LENGTH          1
+#define TRACE_EMITSTOREDATA                 (0x1 << TRACE_EMITSTOREDATA_OFFSET)
+/*
+* Emit Load Data trace sequences.
+ */
+#define TRACE_EMITLOADDATA_OFFSET           21
+#define TRACE_EMITLOADDATA_LENGTH           1
+#define TRACE_EMITLOADDATA                  (0x1 << TRACE_EMITLOADDATA_OFFSET)
+/*
+* Emit Store Address trace sequences.
+ */
+#define TRACE_EMITSTOREADDR_OFFSET          20
+#define TRACE_EMITSTOREADDR_LENGTH          1
+#define TRACE_EMITSTOREADDR                 (0x1 << TRACE_EMITSTOREADDR_OFFSET)
+/*
+* Emit Load Address trace sequences.
+ */
+#define TRACE_EMITLOADADDR_OFFSET           19
+#define TRACE_EMITLOADADDR_LENGTH           1
+#define TRACE_EMITLOADADDR                  (0x1 << TRACE_EMITLOADADDR_OFFSET)
+/*
+* Emit Privilege Level trace sequences.
+ */
+#define TRACE_EMITPRIV_OFFSET               18
+#define TRACE_EMITPRIV_LENGTH               1
+#define TRACE_EMITPRIV                      (0x1 << TRACE_EMITPRIV_OFFSET)
+/*
+* Emit Branch Taken and Branch Not Taken trace sequences.
+ */
+#define TRACE_EMITBRANCH_OFFSET             17
+#define TRACE_EMITBRANCH_LENGTH             1
+#define TRACE_EMITBRANCH                    (0x1 << TRACE_EMITBRANCH_OFFSET)
+/*
+* Emit PC trace sequences.
+ */
+#define TRACE_EMITPC_OFFSET                 16
+#define TRACE_EMITPC_LENGTH                 1
+#define TRACE_EMITPC                        (0x1 << TRACE_EMITPC_OFFSET)
+/*
+* Determine what happens when the trace buffer is full.  0 means wrap
+* and overwrite. 1 means turn off trace until \Fdiscard is written as 1.
+* 2 means cause a trace full exception. 3 is reserved for future use.
+ */
+#define TRACE_FULLACTION_OFFSET             8
+#define TRACE_FULLACTION_LENGTH             2
+#define TRACE_FULLACTION                    (0x3 << TRACE_FULLACTION_OFFSET)
+/*
+* 0: Trace to a dedicated on-core RAM (which is not further defined in
+* this spec).
+*
+* 1: Trace to RAM on the system bus.
+*
+* 2: Send trace data to a dedicated off-chip interface (which is not
+* defined in this spec). This does not affect execution speed.
+*
+* 3: Reserved for future use.
+*
+* Options 0 and 1 slow down execution (eg. because of system bus
+* contention).
+ */
+#define TRACE_DESTINATION_OFFSET            4
+#define TRACE_DESTINATION_LENGTH            2
+#define TRACE_DESTINATION                   (0x3 << TRACE_DESTINATION_OFFSET)
+/*
+* When 1, the trace logic may stall processor execution to ensure it
+* can emit all the trace sequences required. When 0 individual trace
+* sequences may be dropped.
+ */
+#define TRACE_STALL_OFFSET                  2
+#define TRACE_STALL_LENGTH                  1
+#define TRACE_STALL                         (0x1 << TRACE_STALL_OFFSET)
+/*
+* Writing 1 to this bit tells the trace logic that any trace
+* collected is no longer required. When tracing to RAM, it resets the
+* trace write pointer to the start of the memory, as well as
+* \Fwrapped.
+ */
+#define TRACE_DISCARD_OFFSET                1
+#define TRACE_DISCARD_LENGTH                1
+#define TRACE_DISCARD                       (0x1 << TRACE_DISCARD_OFFSET)
+#define TRACE_SUPPORTED_OFFSET              0
+#define TRACE_SUPPORTED_LENGTH              1
+#define TRACE_SUPPORTED                     (0x1 << TRACE_SUPPORTED_OFFSET)
+#define TBUFSTART                           0x729
+#define TBUFEND                             0x72a
+#define TBUFWRITE                           0x72b
+#define SHORTNAME                           0x123
+/*
+* Description of what this field is used for.
+ */
+#define SHORTNAME_FIELD_OFFSET              0
+#define SHORTNAME_FIELD_LENGTH              8
+#define SHORTNAME_FIELD                     (0xff << SHORTNAME_FIELD_OFFSET)
+#define AC_ACCESS_REGISTER                  None /* NOTE(review): not a numeric value; expands to the bare identifier None, so any use needs that name declared -- looks like a generator placeholder, confirm */
+/*
+* This is 0 to indicate Access Register Command.
+ */
+#define AC_ACCESS_REGISTER_CMDTYPE_OFFSET   24
+#define AC_ACCESS_REGISTER_CMDTYPE_LENGTH   8
+#define AC_ACCESS_REGISTER_CMDTYPE          (0xffU << AC_ACCESS_REGISTER_CMDTYPE_OFFSET) /* unsigned: the field covers bits 31:24, a signed shift would yield a negative, sign-extending mask */
+/*
+* 2: Access the lowest 32 bits of the register.
+*
+* 3: Access the lowest 64 bits of the register.
+*
+* 4: Access the lowest 128 bits of the register.
+*
+* If \Fsize specifies a size larger than the register's actual size,
+* then the access must fail. If a register is accessible, then reads of \Fsize
+* less than or equal to the register's actual size must be supported.
+ */
+#define AC_ACCESS_REGISTER_SIZE_OFFSET      20
+#define AC_ACCESS_REGISTER_SIZE_LENGTH      3
+#define AC_ACCESS_REGISTER_SIZE             (0x7 << AC_ACCESS_REGISTER_SIZE_OFFSET)
+/*
+* When 1, execute the program in the Program Buffer exactly once
+* after performing the transfer, if any.
+ */
+#define AC_ACCESS_REGISTER_POSTEXEC_OFFSET  18
+#define AC_ACCESS_REGISTER_POSTEXEC_LENGTH  1
+#define AC_ACCESS_REGISTER_POSTEXEC         (0x1 << AC_ACCESS_REGISTER_POSTEXEC_OFFSET)
+/*
+* 0: Don't do the operation specified by \Fwrite.
+*
+* 1: Do the operation specified by \Fwrite.
+ */
+#define AC_ACCESS_REGISTER_TRANSFER_OFFSET  17
+#define AC_ACCESS_REGISTER_TRANSFER_LENGTH  1
+#define AC_ACCESS_REGISTER_TRANSFER         (0x1 << AC_ACCESS_REGISTER_TRANSFER_OFFSET)
+/*
+* When \Ftransfer is set:
+* 0: Copy data from the specified register into {\tt arg0} portion
+* of {\tt data}.
+*
+* 1: Copy data from {\tt arg0} portion of {\tt data} into the
+* specified register.
+ */
+#define AC_ACCESS_REGISTER_WRITE_OFFSET     16
+#define AC_ACCESS_REGISTER_WRITE_LENGTH     1
+#define AC_ACCESS_REGISTER_WRITE            (0x1 << AC_ACCESS_REGISTER_WRITE_OFFSET)
+/*
+* Number of the register to access, as described in
+* Table~\ref{tab:regno}.
+* \Rdpc may be used as an alias for PC if this command is
+* supported on a non-halted hart.
+ */
+#define AC_ACCESS_REGISTER_REGNO_OFFSET     0
+#define AC_ACCESS_REGISTER_REGNO_LENGTH     16
+#define AC_ACCESS_REGISTER_REGNO            (0xffff << AC_ACCESS_REGISTER_REGNO_OFFSET)
+#define AC_QUICK_ACCESS                     None /* NOTE(review): not a numeric value; expands to the bare identifier None, so any use needs that name declared -- looks like a generator placeholder, confirm */
+/*
+* This is 1 to indicate Quick Access command.
+ */
+#define AC_QUICK_ACCESS_CMDTYPE_OFFSET      24
+#define AC_QUICK_ACCESS_CMDTYPE_LENGTH      8
+#define AC_QUICK_ACCESS_CMDTYPE             (0xffU << AC_QUICK_ACCESS_CMDTYPE_OFFSET) /* unsigned: the field covers bits 31:24, a signed shift would yield a negative, sign-extending mask */
+#define VIRT_PRIV                           virtual /* NOTE(review): expands to the C++ keyword `virtual`, not a register number -- looks like a generator placeholder, confirm */
+/*
+* Contains the privilege level the hart was operating in when Debug
+* Mode was entered. The encoding is described in Table
+* \ref{tab:privlevel}. A user can write this value to change the
+* hart's privilege level when exiting Debug Mode.
+ */
+#define VIRT_PRIV_PRV_OFFSET                0
+#define VIRT_PRIV_PRV_LENGTH                2
+#define VIRT_PRIV_PRV                       (0x3 << VIRT_PRIV_PRV_OFFSET)
diff --git a/fesvr/device.cc b/fesvr/device.cc
new file mode 100644
index 0000000000..3a4cc95a3b
--- /dev/null
+++ b/fesvr/device.cc
@@ -0,0 +1,155 @@
+#include "device.h"
+#include "term.h"
+#include "memif.h"
+#include <cassert>
+#include <algorithm>
+#include <climits>
+#include <iostream>
+#include <thread>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+using namespace std::placeholders;
+
+// Builds the command table: every slot starts out bound to the no-op
+// handler with an empty name, then the highest-numbered slot is rebound to
+// the built-in identify command.
+device_t::device_t()
+  : command_handlers(command_t::MAX_COMMANDS),
+    command_names(command_t::MAX_COMMANDS)
+{
+  const size_t identify_slot = command_t::MAX_COMMANDS-1;
+
+  size_t slot = 0;
+  while (slot < command_t::MAX_COMMANDS)
+  {
+    register_command(slot, std::bind(&device_t::handle_null_command, this, _1), "");
+    ++slot;
+  }
+
+  register_command(identify_slot, std::bind(&device_t::handle_identify, this, _1), "identity");
+}
+
+// Binds `handler` to command number `cmd` and remembers `name`, the string
+// that handle_identify() reports for that command.  The slot number and the
+// name length (which must fit an IDENTITY_SIZE buffer with its terminator)
+// are checked with assert().
+void device_t::register_command(size_t cmd, command_func_t handler, const char* name)
+{
+  assert(cmd < command_t::MAX_COMMANDS);
+  assert(strlen(name) < IDENTITY_SIZE);
+
+  command_names[cmd] = name;
+  command_handlers[cmd] = handler;
+}
+
+// Dispatches `cmd` to the handler registered for its command number.
+void device_t::handle_command(command_t cmd)
+{
+  command_func_t& handler = command_handlers[cmd.cmd()];
+  handler(cmd);
+}
+
+// Handler installed in every slot by the constructor: it ignores the
+// command and sends no response.
+void device_t::handle_null_command(command_t cmd)
+{
+  (void)cmd; // deliberately unused
+}
+
+// Built-in identify command.  The payload packs two values:
+//   payload % MAX_COMMANDS  -> which name is requested (the last command
+//                              number selects the device's own identity(),
+//                              any other value the name of that command)
+//   payload / MAX_COMMANDS  -> target address that receives the name
+// An IDENTITY_SIZE-byte, zero-padded buffer is written to the target and the
+// command is answered with 1.
+void device_t::handle_identify(command_t cmd)
+{
+  const uint64_t payload = cmd.payload();
+  size_t what = payload % command_t::MAX_COMMANDS;
+  uint64_t addr = payload / command_t::MAX_COMMANDS;
+  assert(addr % IDENTITY_SIZE == 0);
+
+  const char* name = (what == command_t::MAX_COMMANDS-1)
+    ? identity()
+    : command_names[what].c_str();
+  assert(strlen(name) < IDENTITY_SIZE);
+
+  // Bounded copy: asserts vanish under NDEBUG, so never rely on them alone
+  // to keep an over-long identity() from overflowing the stack buffer.  The
+  // buffer is zero-initialised, so the result is always NUL-terminated.
+  char id[IDENTITY_SIZE] = {0};
+  strncpy(id, name, IDENTITY_SIZE - 1);
+
+  cmd.memif().write(addr, IDENTITY_SIZE, id);
+  cmd.respond(1);
+}
+
+// Registers the two commands this device answers: 0 = "read", 1 = "write".
+bcd_t::bcd_t()
+{
+  command_func_t on_read = std::bind(&bcd_t::handle_read, this, _1);
+  command_func_t on_write = std::bind(&bcd_t::handle_write, this, _1);
+
+  register_command(0, on_read, "read");
+  register_command(1, on_write, "write");
+}
+
+// Queues the read request; tick() answers it once a character is available.
+void bcd_t::handle_read(command_t cmd)
+{
+  pending_reads.emplace(cmd);
+}
+
+// Forwards the command payload to the terminal.  No response is sent.
+void bcd_t::handle_write(command_t cmd)
+{
+  const uint64_t value = cmd.payload();
+  canonical_terminal_t::write(value);
+}
+
+// Services at most one queued read per call.  The terminal is only polled
+// when a read is pending; a result of -1 is treated as "nothing to deliver".
+// The response is the character with bit 8 (0x100) set.
+void bcd_t::tick()
+{
+  if (pending_reads.empty())
+    return;
+
+  const int ch = canonical_terminal_t::read();
+  if (ch == -1)
+    return;
+
+  pending_reads.front().respond(0x100 | ch);
+  pending_reads.pop();
+}
+
+// Opens `fn` read/write as the backing file of the disk, records its size
+// and builds the identity string "disk size=<bytes>".
+// Throws std::runtime_error if the file cannot be opened or stat'ed.
+disk_t::disk_t(const char* fn)
+{
+  fd = ::open(fn, O_RDWR);
+  if (fd < 0)
+    throw std::runtime_error("could not open " + std::string(fn));
+
+  struct stat st;
+  if (fstat(fd, &st) < 0)
+  {
+    // ~disk_t() does not run when the constructor throws, so the descriptor
+    // has to be released here or it would leak.
+    ::close(fd);
+    throw std::runtime_error("could not stat " + std::string(fn));
+  }
+
+  register_command(0, std::bind(&disk_t::handle_read, this, _1), "read");
+  register_command(1, std::bind(&disk_t::handle_write, this, _1), "write");
+
+  size = st.st_size;
+  id = "disk size=" + std::to_string(size);
+}
+
+// Releases the backing file descriptor opened by the constructor.
+disk_t::~disk_t()
+{
+  ::close(fd);
+}
+
+// Disk read command.  The payload is the target address of a request_t;
+// `size` bytes are read from the backing file at `offset`, copied to target
+// memory at `addr`, and the command is answered with `tag`.
+// Throws std::runtime_error on an error or short read.
+void disk_t::handle_read(command_t cmd)
+{
+  request_t req;
+  cmd.memif().read(cmd.payload(), sizeof(req), &req);
+
+  // data() is valid even for a zero-length request, unlike &buf[0].
+  std::vector<uint8_t> buf(req.size);
+  const ssize_t got = ::pread(fd, buf.data(), buf.size(), req.offset);
+  if (got < 0 || (uint64_t)got != req.size)
+    throw std::runtime_error("could not read " + id + " @ " + std::to_string(req.offset));
+
+  cmd.memif().write(req.addr, buf.size(), buf.data());
+  cmd.respond(req.tag);
+}
+
+// Disk write command.  The payload is the target address of a request_t;
+// `size` bytes are fetched from target memory at `addr`, written to the
+// backing file at `offset`, and the command is answered with `tag`.
+// Throws std::runtime_error on an error or short write.
+void disk_t::handle_write(command_t cmd)
+{
+  request_t req;
+  cmd.memif().read(cmd.payload(), sizeof(req), &req);
+
+  // data() is valid even for a zero-length request, unlike &buf[0].
+  std::vector<uint8_t> buf(req.size);
+  cmd.memif().read(req.addr, buf.size(), buf.data());
+
+  const ssize_t put = ::pwrite(fd, buf.data(), buf.size(), req.offset);
+  if (put < 0 || (uint64_t)put != req.size)
+    throw std::runtime_error("could not write " + id + " @ " + std::to_string(req.offset));
+
+  cmd.respond(req.tag);
+}
+
+// Creates a table with one entry per possible device number, all pointing at
+// the built-in null device until register_device() replaces them.
+// The table is indexed by command_t::device(), so it is sized by
+// MAX_DEVICES (not MAX_COMMANDS; the two constants happen to be equal).
+// Only the address of null_device is stored here, so it does not matter
+// that the member is constructed after `devices`.
+device_list_t::device_list_t()
+  : devices(command_t::MAX_DEVICES, &null_device), num_devices(0)
+{
+}
+
+// Assigns `dev` the next free device number.  The pointer is stored as-is;
+// nothing in this class deletes it.
+void device_list_t::register_device(device_t* dev)
+{
+  const size_t slot = num_devices;
+  num_devices = slot + 1;
+  assert(num_devices < command_t::MAX_DEVICES);
+  devices[slot] = dev;
+}
+
+// Routes `cmd` to the device selected by its device number.  Unregistered
+// numbers resolve to the null device.
+void device_list_t::handle_command(command_t cmd)
+{
+  device_t* target_device = devices[cmd.device()];
+  target_device->handle_command(cmd);
+}
+
+// Gives every registered device a chance to do periodic work.
+void device_list_t::tick()
+{
+  size_t index = 0;
+  while (index < num_devices)
+  {
+    devices[index]->tick();
+    ++index;
+  }
+}
diff --git a/fesvr/device.h b/fesvr/device.h
new file mode 100644
index 0000000000..1387b745ff
--- /dev/null
+++ b/fesvr/device.h
@@ -0,0 +1,118 @@
+#ifndef _DEVICE_H
+#define _DEVICE_H
+
+#include <vector>
+#include <queue>
+#include <cstring>
+#include <string>
+#include <functional>
+
+class memif_t;
+
+// One request word received from the target, bundled with what is needed to
+// service it: the memory interface and a callback that delivers the reply.
+// The 64-bit `tohost` word is decoded as
+//   bits 63:56  device number
+//   bits 55:48  command number
+//   bits 47:0   payload
+class command_t
+{
+ public:
+  typedef std::function<void(uint64_t)> callback_t;
+  command_t(memif_t& memif, uint64_t tohost, callback_t cb)
+    : _memif(memif), tohost(tohost), cb(cb) {}
+
+  memif_t& memif() { return _memif; }
+  uint8_t device() { return tohost >> 56; }
+  uint8_t cmd() { return tohost >> 48; } // truncation to 8 bits drops the device byte
+  uint64_t payload() { return tohost << 16 >> 16; }
+  // Replies with the request's device/command bytes followed by the low
+  // 48 bits of `resp`.
+  void respond(uint64_t resp) { cb((tohost >> 48 << 48) | (resp << 16 >> 16)); }
+
+  // Sizes of the 8-bit command and device number spaces.
+  static const size_t MAX_COMMANDS = 256;
+  static const size_t MAX_DEVICES = 256;
+
+ private:
+  memif_t& _memif;
+  uint64_t tohost;
+  callback_t cb;
+};
+
+// Base class for a device: a table of MAX_COMMANDS command handlers, each
+// with a name.  Subclasses register their handlers from their constructor
+// and supply identity().
+class device_t
+{
+ public:
+  device_t();
+  virtual ~device_t() {}
+  // Name reported by the identify command; must be shorter than
+  // IDENTITY_SIZE characters.
+  virtual const char* identity() = 0;
+  // Periodic hook called by device_list_t::tick(); no-op by default.
+  virtual void tick() {}
+
+  // Runs the handler registered for cmd.cmd().
+  void handle_command(command_t cmd);
+
+ protected:
+  typedef std::function<void(command_t)> command_func_t;
+  // register_command(number, handler, name)
+  void register_command(size_t, command_func_t, const char*);
+
+ private:
+  device_t& operator = (const device_t&); // disallow
+  device_t(const device_t&); // disallow
+
+  // Size in bytes of the buffer written to the target by handle_identify().
+  static const size_t IDENTITY_SIZE = 64;
+  void handle_null_command(command_t cmd);
+  void handle_identify(command_t cmd);
+
+  // Both tables are indexed by command number.
+  std::vector<command_func_t> command_handlers;
+  std::vector<std::string> command_names;
+};
+
+// Character device identified as "bcd": writes go straight to the terminal,
+// reads are queued and answered from tick().
+class bcd_t : public device_t
+{
+ public:
+  bcd_t();
+  const char* identity() { return "bcd"; }
+  void tick();
+
+ private:
+  void handle_read(command_t cmd);
+  void handle_write(command_t cmd);
+
+  // Read requests that have not been answered yet, oldest first.
+  std::queue<command_t> pending_reads;
+};
+
+// Block device backed by a host file that is opened read/write by the
+// constructor and closed by the destructor.
+class disk_t : public device_t
+{
+ public:
+  disk_t(const char* fn);
+  ~disk_t();
+  const char* identity() { return id.c_str(); }
+
+ private:
+  // Request descriptor that the read/write handlers fetch from target
+  // memory at the address given in the command payload.
+  struct request_t
+  {
+    uint64_t addr;   // target memory address of the data buffer
+    uint64_t offset; // byte offset within the backing file
+    uint64_t size;   // number of bytes to transfer
+    uint64_t tag;    // value sent back with respond() on completion
+  };
+
+  void handle_read(command_t cmd);
+  void handle_write(command_t cmd);
+
+  std::string id; // identity string, "disk size=<bytes>"
+  size_t size;    // size of the backing file when it was opened
+  int fd;         // descriptor of the backing file
+};
+
+// Placeholder device with an empty identity and no commands of its own;
+// device_list_t uses one instance for every unregistered device number.
+class null_device_t : public device_t
+{
+ public:
+  const char* identity() { return ""; }
+};
+
+// Table of devices indexed by device number.  Commands are routed by
+// command_t::device(); tick() is forwarded to the registered devices only.
+class device_list_t
+{
+ public:
+  device_list_t();
+  // Stores `dev` under the next free device number (the pointer is not
+  // deleted by this class).
+  void register_device(device_t* dev);
+  void handle_command(command_t cmd);
+  void tick();
+
+ private:
+  std::vector<device_t*> devices; // unregistered slots point at null_device
+  null_device_t null_device;
+  size_t num_devices;             // number of devices registered so far
+};
+
+#endif
diff --git a/fesvr/dtm.cc b/fesvr/dtm.cc
new file mode 100644
index 0000000000..418ac63abd
--- /dev/null
+++ b/fesvr/dtm.cc
@@ -0,0 +1,645 @@
+#include "dtm.h"
+#include "debug_defines.h"
+#include "encoding.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
+#include <stdexcept>
+
+// Helpers that assemble the handful of RISC-V instructions placed in the
+// debug program buffer.
+
+// Extracts the n-bit field of x that starts at bit s.
+#define RV_X(x, s, n) \
+  (((x) >> (s)) & ((1 << (n)) - 1))
+// I-type immediate: x[11:0] -> instruction bits 31:20.
+#define ENCODE_ITYPE_IMM(x) \
+  (RV_X(x, 0, 12) << 20)
+// S-type immediate: x[4:0] -> bits 11:7, x[11:5] -> bits 31:25.
+#define ENCODE_STYPE_IMM(x) \
+  ((RV_X(x, 0, 5) << 7) | (RV_X(x, 5, 7) << 25))
+// SB-type (branch) immediate: x[4:1] -> bits 11:8, x[10:5] -> bits 30:25,
+// x[11] -> bit 7, x[12] -> bit 31.
+#define ENCODE_SBTYPE_IMM(x) \
+  ((RV_X(x, 1, 4) << 8) | (RV_X(x, 5, 6) << 25) | (RV_X(x, 11, 1) << 7) | (RV_X(x, 12, 1) << 31))
+// U-type immediate: x[31:12] stays in bits 31:12.
+#define ENCODE_UTYPE_IMM(x) \
+  (RV_X(x, 12, 20) << 12)
+// UJ-type (jump) immediate: x[10:1] -> bits 30:21, x[11] -> bit 20,
+// x[19:12] -> bits 19:12, x[20] -> bit 31.
+#define ENCODE_UJTYPE_IMM(x) \
+  ((RV_X(x, 1, 10) << 21) | (RV_X(x, 11, 1) << 20) | (RV_X(x, 12, 8) << 12) | (RV_X(x, 20, 1) << 31))
+
+// Load of xlen bits: dst = mem[base + imm].  Base encoding 0x3003 is used
+// when xlen is 64 and 0x2003 otherwise (ld / lw in the RISC-V ISA).
+#define LOAD(xlen, dst, base, imm) \
+  (((xlen) == 64 ? 0x00003003 : 0x00002003) \
+   | ((dst) << 7) | ((base) << 15) | (uint32_t)ENCODE_ITYPE_IMM(imm))
+// Store of xlen bits: mem[base + imm] = src (sd / sw in the RISC-V ISA).
+#define STORE(xlen, src, base, imm) \
+  (((xlen) == 64 ? 0x00003023 : 0x00002023) \
+   | ((src) << 20) | ((base) << 15) | (uint32_t)ENCODE_STYPE_IMM(imm))
+// PC-relative jump from byte offset `here` to `there`; the destination
+// register field is left at 0.
+#define JUMP(there, here) (0x6f | (uint32_t)ENCODE_UJTYPE_IMM((there) - (here)))
+// Branch from `here` to `there` if r1 != r2.
+#define BNE(r1, r2, there, here) (0x1063 | ((r1) << 15) | ((r2) << 20) | (uint32_t)ENCODE_SBTYPE_IMM((there) - (here)))
+// dst = src + imm.
+#define ADDI(dst, src, imm) (0x13 | ((dst) << 7) | ((src) << 15) | (uint32_t)ENCODE_ITYPE_IMM(imm))
+// dst = src >> sh, where `sh` is a register number placed in the rs2 field.
+#define SRL(dst, src, sh) (0x5033 | ((dst) << 7) | ((src) << 15) | ((sh) << 20))
+#define FENCE_I 0x100f
+#define EBREAK  0x00100073
+// Integer register numbers used by the generated programs.
+#define X0 0
+#define S0 8
+#define S1 9
+// regno field of an Access Register command for integer register x
+// (0x1000 is OR-ed in to select the GPR range).
+#define AC_AR_REGNO(x) ((0x1000 | (x)) << AC_ACCESS_REGISTER_REGNO_OFFSET)
+// size field of an Access Register command for a register width of x bits:
+// 4 for 128, 3 for 64, 2 for anything else.
+#define AC_AR_SIZE(x)  ((((x) == 128)? 4 : ((x) == 64 ? 3 : 2)) << AC_ACCESS_REGISTER_SIZE_OFFSET)
+
+// Operation selectors for CSRRx; the value is placed in instruction
+// bits 14:12.
+#define WRITE 1
+#define SET 2
+#define CLEAR 3
+// CSR read-modify-write instruction: dst receives the CSR, src supplies the
+// operand, `type` selects write/set/clear, and the CSR number goes in
+// bits 31:20.
+#define CSRRx(type, dst, csr, src) (0x73 | ((type) << 12) | ((dst) << 7) | ((src) << 15) | (uint32_t)((csr) << 20))
+
+// Bit-field accessors driven purely by a mask.  For a contiguous mask,
+// ((mask) & ~((mask) << 1)) is its lowest set bit, so dividing or
+// multiplying by it shifts the field down to / up from bit 0.
+// get_field: value of the masked field of reg.
+#define get_field(reg, mask) (((reg) & (mask)) / ((mask) & ~((mask) << 1)))
+// set_field: reg with the masked field replaced by val (bits of val that
+// do not fit the field are discarded).
+#define set_field(reg, mask, val) (((reg) & ~(mask)) | (((val) * ((mask) & ~((mask) << 1))) & (mask)))
+
+// Runs an abstract command (same arguments as run_abstract_command) and
+// reports any non-zero cmderr through die().  Wrapped in do { } while (0) so
+// that `RUN_AC_OR_DIE(...);` is a single statement and stays safe inside
+// unbraced if/else bodies.
+#define RUN_AC_OR_DIE(a, b, c, d, e) do { \
+    uint32_t cmderr = run_abstract_command(a, b, c, d, e);      \
+    if (cmderr) {                                               \
+      die(cmderr);                                              \
+    }                                                           \
+  } while (0)
+
+// Issues one DMI request and returns the data word of its response.
+// The request is published in req_buf, then control is handed to the
+// target context; execution continues here once that context switches back
+// (tick() and return_resp() fill resp_buf before doing so).
+uint32_t dtm_t::do_command(dtm_t::req r)
+{
+  req_buf = r;
+  target->switch_to();
+  // A non-zero resp field is treated as a fatal protocol error.
+  assert(resp_buf.resp == 0);
+  return resp_buf.data;
+}
+
+// Reads the DMI register at `addr` (request op 1) and returns its value.
+uint32_t dtm_t::read(uint32_t addr)
+{
+  req request = {addr, 1, 0};
+  return do_command(request);
+}
+
+// Writes `data` to the DMI register at `addr` (request op 2) and returns
+// the data word of the response.
+uint32_t dtm_t::write(uint32_t addr, uint32_t data)
+{
+  req request = {addr, 2, data};
+  return do_command(request);
+}
+
+// Issues an all-zero request (op 0) and discards the response.
+void dtm_t::nop()
+{
+  req request = {0, 0, 0};
+  do_command(request);
+}
+
+// Makes `hartsel` the selected hart by rewriting the hartsel field of
+// dmcontrol (all other fields are written back as read) and records it as
+// current_hart.
+void dtm_t::select_hart(int hartsel)
+{
+  int dmcontrol = read(DMI_DMCONTROL);
+  const uint32_t updated = set_field(dmcontrol, DMI_DMCONTROL_HARTSEL, hartsel);
+  write(DMI_DMCONTROL, updated);
+  current_hart = hartsel;
+}
+
+// Counts the harts behind the debug module.  First the largest selectable
+// index is discovered by writing all ones to hartsel and reading the field
+// back; then harts are selected in ascending order until dmstatus reports
+// anynonexistent.  Returns the first index that does not exist (or
+// max index + 1 if all exist).  Leaves that last probed hart selected.
+int dtm_t::enumerate_harts()
+{
+  int max_hart = (1 << DMI_DMCONTROL_HARTSEL_LENGTH) - 1;
+  const uint32_t probe = set_field(read(DMI_DMCONTROL), DMI_DMCONTROL_HARTSEL, max_hart);
+  write(DMI_DMCONTROL, probe);
+  read(DMI_DMSTATUS);
+  max_hart = get_field(read(DMI_DMCONTROL), DMI_DMCONTROL_HARTSEL);
+
+  int count = 0;
+  while (count <= max_hart) {
+    select_hart(count);
+    const int dmstatus = read(DMI_DMSTATUS);
+    if (get_field(dmstatus, DMI_DMSTATUS_ANYNONEXISTENT))
+      break;
+    ++count;
+  }
+  return count;
+}
+
+// Halts hart `hartsel`: raises haltreq for it, busy-waits until dmstatus
+// reports allhalted, then drops haltreq again.  The hart becomes
+// current_hart.
+void dtm_t::halt(int hartsel)
+{
+  // Once the host program is running, dmcontrol is first rewritten with
+  // only dmactive set before the halt request is issued.
+  if (running) {
+    write(DMI_DMCONTROL, DMI_DMCONTROL_DMACTIVE);
+    // Read dmstatus to avoid back-to-back writes to dmcontrol.
+    read(DMI_DMSTATUS);
+  }
+
+  int dmcontrol = DMI_DMCONTROL_HALTREQ | DMI_DMCONTROL_DMACTIVE;
+  dmcontrol = set_field(dmcontrol, DMI_DMCONTROL_HARTSEL, hartsel);
+  write(DMI_DMCONTROL, dmcontrol);
+  int dmstatus;
+  // Poll with no timeout until the selected hart reports halted.
+  do {
+    dmstatus = read(DMI_DMSTATUS);
+  } while(get_field(dmstatus, DMI_DMSTATUS_ALLHALTED) == 0);
+  dmcontrol &= ~DMI_DMCONTROL_HALTREQ;
+  write(DMI_DMCONTROL, dmcontrol);
+  // Read dmstatus to avoid back-to-back writes to dmcontrol.
+  read(DMI_DMSTATUS);
+  current_hart = hartsel;
+}
+
+// Resumes hart `hartsel`: raises resumereq for it, busy-waits until
+// dmstatus reports allresumeack, then drops resumereq again.  The hart
+// becomes current_hart.
+void dtm_t::resume(int hartsel)
+{
+  int dmcontrol = DMI_DMCONTROL_RESUMEREQ | DMI_DMCONTROL_DMACTIVE;
+  dmcontrol = set_field(dmcontrol, DMI_DMCONTROL_HARTSEL, hartsel);
+  write(DMI_DMCONTROL, dmcontrol);
+  int dmstatus;
+  // Poll with no timeout until the resume has been acknowledged.
+  do {
+    dmstatus = read(DMI_DMSTATUS);
+  } while (get_field(dmstatus, DMI_DMSTATUS_ALLRESUMEACK) == 0);
+  dmcontrol &= ~DMI_DMCONTROL_RESUMEREQ;
+  write(DMI_DMCONTROL, dmcontrol);
+  // Read dmstatus to avoid back-to-back writes to dmcontrol.
+  read(DMI_DMSTATUS);
+  current_hart = hartsel;
+
+  // Mirror of the prologue in halt(): once the host program is running,
+  // dmcontrol is left with only dmactive set.
+  if (running) {
+    write(DMI_DMCONTROL, DMI_DMCONTROL_DMACTIVE);
+    // Read dmstatus to avoid back-to-back writes to dmcontrol.
+    read(DMI_DMSTATUS);
+  }
+}
+
+// Reads integer register `regno` of the current hart with an Access
+// Register abstract command and returns it zero-extended to 64 bits.
+uint64_t dtm_t::save_reg(unsigned regno)
+{
+  uint32_t data[xlen/(8*4)];
+  uint32_t command = AC_ACCESS_REGISTER_TRANSFER | AC_AR_SIZE(xlen) | AC_AR_REGNO(regno);
+  RUN_AC_OR_DIE(command, 0, 0, data, xlen / (8*4));
+
+  // The second data word only exists (and is only read) for xlen > 32.
+  const uint64_t low = data[0];
+  const uint64_t high = (xlen > 32) ? data[1] : 0;
+  return low | (high << 32);
+}
+
+// Writes `val` to integer register `regno` of the current hart with an
+// Access Register abstract command; only the low xlen bits are transferred.
+void dtm_t::restore_reg(unsigned regno, uint64_t val)
+{
+  uint32_t data[xlen/(8*4)];
+  if (xlen > 32) {
+    data[1] = (uint32_t) (val >> 32);
+  }
+  data[0] = (uint32_t) val;
+
+  uint32_t command = AC_ACCESS_REGISTER_TRANSFER | AC_ACCESS_REGISTER_WRITE;
+  command |= AC_AR_SIZE(xlen);
+  command |= AC_AR_REGNO(regno);
+
+  RUN_AC_OR_DIE(command, 0, 0, data, xlen / (8*4));
+}
+
+// Executes one abstract command.
+//   program / program_n : words copied into the program buffer first
+//   data / data_n       : for a transfer+write command the words are copied
+//                         into the data registers before the command; for a
+//                         transfer without write they are filled from the
+//                         data registers after it completes
+// Blocks until abstractcs is no longer busy and returns its cmderr field
+// (0 on success).  The caller is responsible for clearing a non-zero error.
+uint32_t dtm_t::run_abstract_command(uint32_t command,
+                                     const uint32_t program[], size_t program_n,
+                                     uint32_t data[], size_t data_n)
+{
+  assert(program_n <= ram_words);
+  assert(data_n    <= data_words);
+
+  const bool transfer = get_field(command, AC_ACCESS_REGISTER_TRANSFER) != 0;
+  const bool to_target = get_field(command, AC_ACCESS_REGISTER_WRITE) != 0;
+
+  for (size_t word = 0; word < program_n; word++)
+    write(DMI_PROGBUF0 + word, program[word]);
+
+  if (to_target && transfer) {
+    for (size_t word = 0; word < data_n; word++)
+      write(DMI_DATA0 + word, data[word]);
+  }
+
+  write(DMI_COMMAND, command);
+
+  // Wait for not busy and then check for error.
+  uint32_t abstractcs = read(DMI_ABSTRACTCS);
+  while (abstractcs & DMI_ABSTRACTCS_BUSY)
+    abstractcs = read(DMI_ABSTRACTCS);
+
+  if (!to_target && transfer) {
+    for (size_t word = 0; word < data_n; word++)
+      data[word] = read(DMI_DATA0 + word);
+  }
+
+  return get_field(abstractcs, DMI_ABSTRACTCS_CMDERR);
+}
+
+// Chunks are transferred one register at a time, so they are aligned to the
+// register width in bytes.
+size_t dtm_t::chunk_align()
+{
+  const size_t bytes_per_register = xlen / 8;
+  return bytes_per_register;
+}
+
+// Copies `len` bytes of target memory starting at `taddr` into `dst`,
+// one xlen-bit word at a time, using the current hart.  The hart is halted
+// for the duration, s0/s1 are saved and restored around the transfer, and
+// the hart is resumed afterwards.  Only len * 8 / xlen whole words are
+// read.
+void dtm_t::read_chunk(uint64_t taddr, size_t len, void* dst)
+{
+  uint32_t prog[ram_words];
+  uint32_t data[data_words];
+
+  uint8_t * curr = (uint8_t*) dst;
+
+  halt(current_hart);
+
+  uint64_t s0 = save_reg(S0);
+  uint64_t s1 = save_reg(S1);
+  
+  // Program buffer: s1 = mem[s0]; s0 += xlen/8; return to the debugger.
+  prog[0] = LOAD(xlen, S1, S0, 0);
+  prog[1] = ADDI(S0, S0, xlen/8);
+  prog[2] = EBREAK;
+
+  data[0] = (uint32_t) taddr;
+  if (xlen > 32) {
+    data[1] = (uint32_t) (taddr >> 32);
+  }
+
+  // Write s0 with the address, then execute program buffer.
+  // This will get S1 with the data and increment s0.
+  uint32_t command = AC_ACCESS_REGISTER_TRANSFER |
+    AC_ACCESS_REGISTER_WRITE |
+    AC_ACCESS_REGISTER_POSTEXEC |
+    AC_AR_SIZE(xlen) | 
+    AC_AR_REGNO(S0);
+
+  RUN_AC_OR_DIE(command, prog, 3, data, xlen/(4*8));
+
+  // TODO: could use autoexec here.
+  // Each iteration reads s1 back; on all but the last word the program
+  // buffer is executed again afterwards to fetch the next word into s1.
+  for (size_t i = 0; i < (len * 8 / xlen); i++){
+    command = AC_ACCESS_REGISTER_TRANSFER |
+      AC_AR_SIZE(xlen) |
+      AC_AR_REGNO(S1);
+    if ((i + 1) < (len * 8 / xlen)) {
+      command |= AC_ACCESS_REGISTER_POSTEXEC;
+    }
+    
+    RUN_AC_OR_DIE(command, 0, 0, data, xlen/(4*8));
+
+    memcpy(curr, data, xlen/8);
+    curr += xlen/8;
+  }
+
+  restore_reg(S0, s0);
+  restore_reg(S1, s1);
+
+  resume(current_hart); 
+
+}
+
+// Copies `len` bytes from `src` into target memory starting at `taddr`,
+// one xlen-bit word at a time, using the current hart.  The hart is halted
+// for the duration, s0/s1 are saved and restored around the transfer, and
+// the hart is resumed afterwards.
+// NOTE(review): the first word is always transferred, even when
+// len * 8 / xlen is 0 — presumably callers never pass less than one word;
+// confirm against the chunking code in the caller.
+void dtm_t::write_chunk(uint64_t taddr, size_t len, const void* src)
+{  
+  uint32_t prog[ram_words];
+  uint32_t data[data_words];
+
+  const uint8_t * curr = (const uint8_t*) src;
+
+  halt(current_hart);
+
+  uint64_t s0 = save_reg(S0);
+  uint64_t s1 = save_reg(S1);
+  
+  // Program buffer: mem[s0] = s1; s0 += xlen/8; return to the debugger.
+  prog[0] = STORE(xlen, S1, S0, 0);
+  prog[1] = ADDI(S0, S0, xlen/8);
+  prog[2] = EBREAK;
+  
+  data[0] = (uint32_t) taddr;
+  if (xlen > 32) {
+    data[1] = (uint32_t) (taddr >> 32);
+  }
+
+  // Write the program (not used yet).
+  // Write s0 with the address. 
+  uint32_t command = AC_ACCESS_REGISTER_TRANSFER |
+    AC_ACCESS_REGISTER_WRITE |
+    AC_AR_SIZE(xlen) |
+    AC_AR_REGNO(S0);
+  
+  RUN_AC_OR_DIE(command, prog, 3, data, xlen/(4*8));
+
+  // Use Autoexec for more than one word of transfer.
+  // Write S1 with data, then execution stores S1 to
+  // 0(S0) and increments S0.
+  // Each time we write XLEN bits.
+  memcpy(data, curr, xlen/8);
+  curr += xlen/8;
+  
+  command = AC_ACCESS_REGISTER_TRANSFER |
+    AC_ACCESS_REGISTER_POSTEXEC |
+    AC_ACCESS_REGISTER_WRITE | 
+    AC_AR_SIZE(xlen) |
+    AC_AR_REGNO(S1);
+
+  RUN_AC_OR_DIE(command, 0, 0, data, xlen/(4*8));
+
+  uint32_t abstractcs;
+  // Remaining words: autoexec is switched on once, after which every write
+  // to data0 re-issues the previous command.
+  for (size_t i = 1; i < (len * 8 / xlen); i++){
+    if (i == 1) {
+      write(DMI_ABSTRACTAUTO, 1 << DMI_ABSTRACTAUTO_AUTOEXECDATA_OFFSET);
+    }
+    memcpy(data, curr, xlen/8);
+    curr += xlen/8;
+    // The upper half must be in place before data0 is written, because the
+    // data0 write is what triggers the command.
+    if (xlen == 64) {
+      write(DMI_DATA0 + 1, data[1]);
+    }
+    write(DMI_DATA0, data[0]); //Triggers a command w/ autoexec.
+    
+    do {
+      abstractcs = read(DMI_ABSTRACTCS);
+    } while (abstractcs & DMI_ABSTRACTCS_BUSY);
+    if ( get_field(abstractcs, DMI_ABSTRACTCS_CMDERR)) {
+      die(get_field(abstractcs, DMI_ABSTRACTCS_CMDERR));
+    }
+  }
+  // Switch autoexec off again if the loop above switched it on.
+  if ((len * 8 / xlen) > 1) {
+    write(DMI_ABSTRACTAUTO, 0);
+  }
+  
+  restore_reg(S0, s0);
+  restore_reg(S1, s1);
+  resume(current_hart);
+}
+
+// Reports a failed abstract command.  Despite the name this does not
+// terminate: it prints the error, writes the cmderr mask back to abstractcs
+// (the same write get_xlen() uses to reset the error between attempts) and
+// returns, so the simulation carries on and is expected to fail later.
+void dtm_t::die(uint32_t cmderr)
+{
+  const char * codes[] = {
+    "OK",
+    "BUSY",
+    "NOT_SUPPORTED",
+    "EXCEPTION",
+    "HALT/RESUME"
+  };
+  const size_t known_codes = sizeof(codes) / sizeof(*codes);
+  const char * msg = (cmderr < known_codes) ? codes[cmderr] : "OTHER";
+
+  //throw std::runtime_error("Debug Abstract Command Error #" + std::to_string(cmderr) + "(" +  msg + ")");
+  // %u with an explicit cast matches the unsigned argument; the trailing
+  // newlines keep the two messages from running into each other.
+  printf("ERROR: %s:%d, Debug Abstract Command Error #%u (%s)\n", __FILE__, __LINE__, (unsigned)cmderr, msg);
+  printf("ERROR: %s:%d, Should die, but allowing simulation to continue and fail.\n", __FILE__, __LINE__);
+  write(DMI_ABSTRACTCS, DMI_ABSTRACTCS_CMDERR);
+}
+
+// Zero-fills `len` bytes of target memory starting at `taddr` by running a
+// store loop on the current hart.  The hart is halted for the duration,
+// s0/s1 are saved and restored, and the hart is resumed afterwards.
+// NOTE(review): the loop stores before it compares, so len == 0 would not
+// terminate at taddr — presumably callers never pass 0; confirm.
+void dtm_t::clear_chunk(uint64_t taddr, size_t len)
+{
+  uint32_t prog[ram_words];
+  uint32_t data[data_words];
+
+  halt(current_hart);
+  uint64_t s0 = save_reg(S0);
+  uint64_t s1 = save_reg(S1);
+
+  uint32_t command;
+
+  // S0 = Addr
+  // The upper word is only staged for xlen > 32, as in read_chunk() and
+  // write_chunk(); writing data[1] unconditionally would run past the end
+  // of `data` on a target that exposes a single data word.
+  data[0] = (uint32_t) taddr;
+  if (xlen > 32) {
+    data[1] = (uint32_t) (taddr >> 32);
+  }
+  command = AC_ACCESS_REGISTER_TRANSFER |
+    AC_ACCESS_REGISTER_WRITE |
+    AC_AR_SIZE(xlen) |
+    AC_AR_REGNO(S0);
+  RUN_AC_OR_DIE(command, 0, 0, data, xlen/(4*8));
+
+  // S1 = Addr + len, loop until S0 = S1
+  prog[0] = STORE(xlen, X0, S0, 0);
+  prog[1] = ADDI(S0, S0, xlen/8);
+  prog[2] = BNE(S0, S1, 0*4, 2*4); // branch from word 2 back to word 0
+  prog[3] = EBREAK;
+
+  data[0] = (uint32_t) (taddr + len);
+  if (xlen > 32) {
+    data[1] = (uint32_t) ((taddr + len) >> 32);
+  }
+  command = AC_ACCESS_REGISTER_TRANSFER |
+    AC_ACCESS_REGISTER_WRITE |
+    AC_AR_SIZE(xlen) |
+    AC_AR_REGNO(S1)  |
+    AC_ACCESS_REGISTER_POSTEXEC;
+  RUN_AC_OR_DIE(command, prog, 4, data, xlen/(4*8));
+
+  restore_reg(S0, s0);
+  restore_reg(S1, s1);
+
+  resume(current_hart);
+}
+
+// Overwrites CSR `which` with `data`; returns whatever modify_csr() reads
+// back for the operation.
+uint64_t dtm_t::write_csr(unsigned which, uint64_t data)
+{
+  const uint64_t result = modify_csr(which, data, WRITE);
+  return result;
+}
+
+// Sets the bits of `data` in CSR `which`; returns whatever modify_csr()
+// reads back for the operation.
+uint64_t dtm_t::set_csr(unsigned which, uint64_t data)
+{
+  const uint64_t result = modify_csr(which, data, SET);
+  return result;
+}
+
+// Clears the bits of `data` in CSR `which`; returns whatever modify_csr()
+// reads back for the operation.
+uint64_t dtm_t::clear_csr(unsigned which, uint64_t data)
+{
+  const uint64_t result = modify_csr(which, data, CLEAR);
+  return result;
+}
+
+// Reads CSR `which` by issuing a "set" with an all-zero mask, which leaves
+// the register unchanged.
+uint64_t dtm_t::read_csr(unsigned which)
+{
+  const uint64_t no_bits = 0;
+  return set_csr(which, no_bits);
+}
+
+// Performs a CSR read-modify-write on the current hart.
+//   which : CSR number
+//   data  : operand for the operation
+//   type  : WRITE, SET or CLEAR (see CSRRx)
+// The operand is passed through the debug data registers, which the program
+// reaches as memory at data_base.  Returns the value the CSR instruction
+// read, i.e. the CSR contents before the modification.  The hart is halted
+// for the duration and resumed afterwards.
+uint64_t dtm_t::modify_csr(unsigned which, uint64_t data, uint32_t type)
+{
+  halt(current_hart);
+
+  // This code just uses DSCRATCH to save S0
+  // and data_base to do the transfer so we don't
+  // need to run more commands to save and restore
+  // S0.
+  uint32_t prog[] = {
+    CSRRx(WRITE, S0, CSR_DSCRATCH0, S0),  // stash s0 in dscratch0
+    LOAD(xlen, S0, X0, data_base),        // s0 = operand
+    CSRRx(type, S0, which, S0),           // s0 = old CSR, CSR updated
+    STORE(xlen, S0, X0, data_base),       // publish old CSR value
+    CSRRx(WRITE, S0, CSR_DSCRATCH0, S0),  // recover s0
+    EBREAK
+  };
+
+  //TODO: Use transfer = 0. For now both HW and OpenOCD
+  // ignore transfer bit, so use "store to X0" NOOP.
+  // We sort of need this anyway because run_abstract_command
+  // needs the DATA to be written so may as well use the WRITE flag.
+
+  uint32_t adata[] = {(uint32_t) data,
+                      (uint32_t) (data >> 32)};
+
+  uint32_t command = AC_ACCESS_REGISTER_POSTEXEC |
+    AC_ACCESS_REGISTER_TRANSFER |
+    AC_ACCESS_REGISTER_WRITE |
+    AC_AR_SIZE(xlen) |
+    AC_AR_REGNO(X0);
+
+  RUN_AC_OR_DIE(command, prog, sizeof(prog) / sizeof(*prog), adata, xlen/(4*8));
+
+  // Reassemble the result from the data registers.  The second register
+  // holds bits 63:32 and must be shifted into place; OR-ing it in unshifted
+  // would corrupt the low word and lose the high word.
+  uint64_t res = read(DMI_DATA0);
+  if (xlen == 64)
+    res |= ((uint64_t) read(DMI_DATA0 + 1)) << 32;
+
+  resume(current_hart);
+  return res;
+}
+
+// Largest chunk handed to read_chunk/write_chunk/clear_chunk at once.
+size_t dtm_t::chunk_max_size()
+{
+  // Arbitrary choice. 4k Page size seems reasonable.
+  const size_t page_bytes = 4096;
+  return page_bytes;
+}
+
+// Determines the register width of the selected hart by trying to read s0
+// with decreasing access sizes (128, 64, 32 bits); the first size the debug
+// module accepts is the answer.  The cmderr left behind by a rejected
+// attempt is cleared before the next one.
+// Returns 64 or 32.  Throws std::runtime_error if the hart turns out to be
+// 128-bit (unsupported here) or if no size is accepted.
+uint32_t dtm_t::get_xlen()
+{
+  // Attempt to read S0 to find out what size it is.
+  // You could also attempt to run code, but you need to save registers
+  // to do that anyway. If what you really want to do is figure out
+  // the size of S0 so you can save it later, then do that.
+  uint32_t command = AC_ACCESS_REGISTER_TRANSFER | AC_AR_REGNO(S0);
+  uint32_t cmderr;
+
+  // No program words and no data words are transferred, so null buffers
+  // are sufficient: run_abstract_command never dereferences them when the
+  // counts are 0.
+  cmderr = run_abstract_command(command | AC_AR_SIZE(128), nullptr, 0, nullptr, 0);
+  if (cmderr == 0){
+    throw std::runtime_error("FESVR DTM Does not support 128-bit");
+  }
+  write(DMI_ABSTRACTCS, DMI_ABSTRACTCS_CMDERR);
+
+  cmderr = run_abstract_command(command | AC_AR_SIZE(64), nullptr, 0, nullptr, 0);
+  if (cmderr == 0){
+    return 64;
+  }
+  write(DMI_ABSTRACTCS, DMI_ABSTRACTCS_CMDERR);
+
+  cmderr = run_abstract_command(command | AC_AR_SIZE(32), nullptr, 0, nullptr, 0);
+  if (cmderr == 0){
+    return 32;
+  }
+
+  throw std::runtime_error("FESVR DTM can't determine XLEN. Aborting");
+}
+
+// Executes a FENCE.I on the current hart through the program buffer.  The
+// hart is halted for the duration and resumed afterwards.
+void dtm_t::fence_i()
+{
+  halt(current_hart);
+
+  const uint32_t prog[] = { FENCE_I, EBREAK };
+  const size_t prog_words = sizeof(prog)/sizeof(*prog);
+
+  //TODO: Use the transfer = 0.
+  // Until then the command is a write to x0 with no data words, followed
+  // by execution of the program buffer.
+  uint32_t command = AC_ACCESS_REGISTER_POSTEXEC | AC_ACCESS_REGISTER_TRANSFER;
+  command |= AC_ACCESS_REGISTER_WRITE;
+  command |= AC_AR_SIZE(xlen);
+  command |= AC_AR_REGNO(X0);
+
+  RUN_AC_OR_DIE(command, prog, prog_words, 0, 0);
+
+  resume(current_hart);
+}
+
+// Entry point of the host context: `arg` is the dtm_t that created it.
+void host_thread_main(void* arg)
+{
+  dtm_t* self = static_cast<dtm_t*>(arg);
+  self->producer_thread();
+}
+
+// Prepares every hart to start the loaded program: each one gets a FENCE.I
+// and has CSR 0x7b1 loaded with the program entry point.  Hart 0 is left
+// selected for subsequent memory accesses.
+void dtm_t::reset()
+{
+  int hart = 0;
+  while (hart < num_harts) {
+    select_hart(hart);
+    // this command also does a halt and resume
+    fence_i();
+    // after this command, the hart will run from _start.
+    write_csr(0x7b1, get_entry_point());
+    hart++;
+  }
+
+  // In theory any hart can handle the memory accesses,
+  // this will enforce that hart 0 handles them.
+  select_hart(0);
+  read(DMI_DMSTATUS);
+}
+
+// Lets the target run for a while by issuing max_idle_cycles no-op
+// requests.
+void dtm_t::idle()
+{
+  int remaining = max_idle_cycles;
+  while (remaining > 0) {
+    nop();
+    --remaining;
+  }
+}
+
+// Body of the host context.  Activates the debug module, caches its
+// capabilities (program buffer size, data register count, data register
+// address), counts the harts, determines xlen from hart 0 and then runs the
+// htif main loop.  Never returns: once htif_t::run() finishes it keeps
+// issuing no-op requests forever.
+void dtm_t::producer_thread()
+{
+  // Learn about the Debug Module and assert things we
+  // depend on in this code.
+
+  // Enable the debugger.
+  write(DMI_DMCONTROL, DMI_DMCONTROL_DMACTIVE);
+  // Poll until the debugger agrees it's enabled.
+  while ((read(DMI_DMCONTROL) & DMI_DMCONTROL_DMACTIVE) == 0) ;
+    
+  // These are checked every time we run an abstract command.
+  uint32_t abstractcs = read(DMI_ABSTRACTCS);
+  ram_words = get_field(abstractcs, DMI_ABSTRACTCS_PROGSIZE);
+  data_words = get_field(abstractcs, DMI_ABSTRACTCS_DATACOUNT);
+
+  // These things are only needed for the 'modify_csr' function.
+  // That could be re-written to not use these at some performance
+  // overhead.
+  uint32_t hartinfo = read(DMI_HARTINFO);
+  assert(get_field(hartinfo, DMI_HARTINFO_NSCRATCH) > 0);
+  assert(get_field(hartinfo, DMI_HARTINFO_DATAACCESS));
+
+  data_base = get_field(hartinfo, DMI_HARTINFO_DATAADDR);
+  
+  num_harts = enumerate_harts();
+  // get_xlen() issues register accesses, so hart 0 is halted around it.
+  halt(0);
+  // Note: We don't support systems with heterogeneous XLEN.
+  // It's possible to do this at the cost of extra cycles.
+  xlen = get_xlen();
+  resume(0);
+
+  // From here on halt()/resume() also rewrite dmcontrol with only dmactive
+  // set (see the `running` checks there).
+  running = true;
+
+  htif_t::run();
+
+  while (true)
+    nop();
+}
+
+// Creates the host context and runs it until it issues its first request.
+// The caller's context is recorded as `target` so that do_command() can
+// switch back to it.
+void dtm_t::start_host_thread()
+{
+  resp_wait = false;
+  req_wait = false;
+
+  target = context_t::current();
+
+  host.init(host_thread_main, this);
+  host.switch_to();
+}
+
+// Forwards the command line to htif_t and immediately starts the host
+// context; `running` stays false until producer_thread() has finished its
+// bring-up.
+dtm_t::dtm_t(int argc, char** argv)
+  : htif_t(argc, argv),
+    running(false)
+{
+  start_host_thread();
+}
+
+// Nothing to release explicitly.
+dtm_t::~dtm_t() {}
+
+// Advances the request/response handshake by one step.
+//   req_ready  : the target accepted the request currently presented
+//   resp_valid : resp_bits carries a response this cycle
+// While no response is outstanding, a request is first presented
+// (req_wait) and, once accepted, a response is awaited (resp_wait).  When
+// the response arrives it is stored and control switches to the host
+// context so that do_command() can return it.
+void dtm_t::tick(
+  bool      req_ready,
+  bool      resp_valid,
+  resp      resp_bits)
+{
+  if (!resp_wait) {
+    if (req_wait) {
+      if (req_ready) {
+        req_wait = false;
+        resp_wait = true;
+      }
+    } else {
+      req_wait = true;
+    }
+  }
+
+  if (!resp_valid)
+    return;
+
+  assert(resp_wait);
+  resp_wait = false;
+  resp_buf = resp_bits;
+
+  // update the target with the current context
+  target = context_t::current();
+  host.switch_to();
+}
+
+// Hands `resp_bits` to the host context unconditionally, without the
+// handshake bookkeeping done by tick().
+void dtm_t::return_resp(resp resp_bits)
+{
+  resp_buf = resp_bits;
+
+  // Remember where to come back to before giving up control.
+  target = context_t::current();
+  host.switch_to();
+}
diff --git a/fesvr/dtm.h b/fesvr/dtm.h
new file mode 100644
index 0000000000..fbf161efec
--- /dev/null
+++ b/fesvr/dtm.h
@@ -0,0 +1,115 @@
+#ifndef _ROCKET_DTM_H
+#define _ROCKET_DTM_H
+
+#include "htif.h"
+#include "context.h"
+#include <stdint.h>
+#include <queue>
+#include <semaphore.h>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+
+// abstract debug transport module
+// Host interface that reaches the target through its debug module.  The
+// htif logic runs in its own context (`host`) and issues DMI requests; the
+// simulator side drives tick() / return_resp() to deliver the responses.
+class dtm_t : public htif_t
+{
+ public:
+  dtm_t(int argc, char**argv);
+  ~dtm_t();
+
+  // DMI request.  dtm.cc issues op 0 for nop(), 1 for read() and 2 for
+  // write().
+  struct req {
+    uint32_t addr;
+    uint32_t op;
+    uint32_t data;
+  };
+
+  // DMI response; do_command() asserts that resp is 0.
+  struct resp {
+    uint32_t resp;
+    uint32_t data;
+  };
+
+  // Advances the request/response handshake by one step.
+  void tick(
+    bool  req_ready,
+    bool  resp_valid,
+    resp  resp_bits
+  );
+  // Akin to tick, but the target thread returns a response on every invocation
+  void return_resp(
+    resp  resp_bits
+  );
+
+  
+  // Signals presented to the target side of the handshake.
+  bool req_valid() { return req_wait; }
+  req req_bits() { return req_buf; }
+  bool resp_ready() { return true; }
+
+  // Raw DMI register access; each call blocks until the response arrives.
+  uint32_t read(uint32_t addr);
+  uint32_t write(uint32_t addr, uint32_t data);
+  void nop();
+
+  // CSR access on the current hart (halts and resumes it).
+  uint64_t read_csr(unsigned which);
+  uint64_t write_csr(unsigned which, uint64_t data);
+  uint64_t clear_csr(unsigned which, uint64_t data);
+  uint64_t set_csr(unsigned which, uint64_t data);
+  void fence_i();
+
+  // Body of the host context; does not return.
+  void producer_thread();
+
+ protected:
+  virtual void read_chunk(addr_t taddr, size_t len, void* dst) override;
+  virtual void write_chunk(addr_t taddr, size_t len, const void* src) override;
+  virtual void clear_chunk(addr_t taddr, size_t len) override;
+  virtual size_t chunk_align() override;
+  virtual size_t chunk_max_size() override;
+  virtual void reset() override;
+  virtual void idle() override;
+
+ private:
+  context_t host;     // context that runs producer_thread()
+  context_t* target;  // context to switch to while waiting for a response
+  // NOTE(review): the thread and semaphore members below are not referenced
+  // anywhere in dtm.cc — presumably left over from an earlier
+  // implementation.
+  pthread_t producer;
+  sem_t req_produce;
+  sem_t req_consume;
+  sem_t resp_produce;
+  sem_t resp_consume;
+  req req_buf;        // request currently being presented
+  resp resp_buf;      // most recent response
+  bool running;       // set once producer_thread() has finished bring-up
+
+  uint32_t run_abstract_command(uint32_t command, const uint32_t program[], size_t program_n,
+                                uint32_t data[], size_t data_n);
+
+  void die(uint32_t cmderr);
+  void halt(int);
+  int enumerate_harts();
+  void select_hart(int);
+  void resume(int);
+  uint64_t save_reg(unsigned regno);
+  void restore_reg(unsigned regno, uint64_t val);
+  
+  uint64_t modify_csr(unsigned which, uint64_t data, uint32_t type);
+
+  bool req_wait;      // a request is presented and not yet accepted
+  bool resp_wait;     // a request was accepted and its response is pending
+  uint32_t data_base; // dataaddr field of hartinfo
+  
+  uint32_t xlen;      // register width in bits, from get_xlen()
+
+  static const int max_idle_cycles = 10000;
+
+  size_t ram_words;   // progsize field of abstractcs
+  size_t data_words;  // datacount field of abstractcs
+  int num_harts;
+  int current_hart;
+  
+  uint32_t get_xlen();
+  uint32_t do_command(dtm_t::req r);
+
+  // NOTE(review): parse_args and register_devices are declared here but
+  // not defined in dtm.cc.
+  void parse_args(const std::vector<std::string>& args);
+  void register_devices();
+  void start_host_thread();
+
+  friend class memif_t;
+};
+
+#endif
diff --git a/fesvr/dummy.cc b/fesvr/dummy.cc
new file mode 100644
index 0000000000..a155d3e56c
--- /dev/null
+++ b/fesvr/dummy.cc
@@ -0,0 +1,4 @@
+// See LICENSE for license details.
+
+// help out poor, C-centric autoconf: an empty symbol with C linkage, so
+// that a plain C link test can detect the library.
+extern "C" void libfesvr_is_present() {}
diff --git a/fesvr/elf.h b/fesvr/elf.h
new file mode 100644
index 0000000000..a213832755
--- /dev/null
+++ b/fesvr/elf.h
@@ -0,0 +1,132 @@
+// See LICENSE for details.
+
+#ifndef _ELF_H
+#define _ELF_H
+
+#include <stdint.h>
+
+// Values compared against the ELF header fields by the predicates below.
+#define ET_EXEC 2      // e_type: executable file
+#define EM_RISCV 243   // e_machine: RISC-V
+#define EM_NONE 0      // e_machine: no machine
+#define EV_CURRENT 1   // e_version: current version
+
+// True if the header starts with the ELF magic bytes 0x7f 'E' 'L' 'F'.
+#define IS_ELF(hdr) \
+  ((hdr).e_ident[0] == 0x7f && (hdr).e_ident[1] == 'E' && \
+   (hdr).e_ident[2] == 'L'  && (hdr).e_ident[3] == 'F')
+
+// Header predicates; each one also requires the ELF magic.
+// e_ident[4] selects the class (1 = 32-bit, 2 = 64-bit) and e_ident[5]
+// the byte order (1 = little-endian, 2 = big-endian).
+#define IS_ELF32(hdr) (IS_ELF(hdr) && (hdr).e_ident[4] == 1)
+#define IS_ELF64(hdr) (IS_ELF(hdr) && (hdr).e_ident[4] == 2)
+#define IS_ELFLE(hdr) (IS_ELF(hdr) && (hdr).e_ident[5] == 1)
+#define IS_ELFBE(hdr) (IS_ELF(hdr) && (hdr).e_ident[5] == 2)
+#define IS_ELF_EXEC(hdr) (IS_ELF(hdr) && (hdr).e_type == ET_EXEC)
+#define IS_ELF_RISCV(hdr) (IS_ELF(hdr) && (hdr).e_machine == EM_RISCV)
+#define IS_ELF_EM_NONE(hdr) (IS_ELF(hdr) && (hdr).e_machine == EM_NONE)
+#define IS_ELF_VCURRENT(hdr) (IS_ELF(hdr) && (hdr).e_version == EV_CURRENT)
+
+// p_type of a loadable program segment.
+#define PT_LOAD 1
+
+// sh_type of a section that occupies no space in the file.
+#define SHT_NOBITS 8
+
+// 32-bit ELF file header.  The field order and widths mirror the on-disk
+// layout and must not be changed.
+typedef struct {
+  uint8_t  e_ident[16];  // magic, class and byte order (see IS_ELF*)
+  uint16_t e_type;
+  uint16_t e_machine;
+  uint32_t e_version;
+  uint32_t e_entry;      // entry point address
+  uint32_t e_phoff;      // file offset of the program header table
+  uint32_t e_shoff;      // file offset of the section header table
+  uint32_t e_flags;
+  uint16_t e_ehsize;
+  uint16_t e_phentsize;
+  uint16_t e_phnum;      // number of program headers
+  uint16_t e_shentsize;
+  uint16_t e_shnum;      // number of section headers
+  uint16_t e_shstrndx;
+} Elf32_Ehdr;
+
+// 32-bit ELF section header.  The field order and widths mirror the
+// on-disk layout and must not be changed.
+typedef struct {
+  uint32_t sh_name;
+  uint32_t sh_type;      // e.g. SHT_NOBITS
+  uint32_t sh_flags;
+  uint32_t sh_addr;
+  uint32_t sh_offset;
+  uint32_t sh_size;
+  uint32_t sh_link;
+  uint32_t sh_info;
+  uint32_t sh_addralign;
+  uint32_t sh_entsize;
+} Elf32_Shdr;
+
+// 32-bit ELF program header.  The field order and widths mirror the
+// on-disk layout and must not be changed.
+typedef struct
+{
+  uint32_t p_type;    // e.g. PT_LOAD
+  uint32_t p_offset;  // file offset of the segment data
+  uint32_t p_vaddr;
+  uint32_t p_paddr;   // address the loader writes the segment to
+  uint32_t p_filesz;  // bytes present in the file
+  uint32_t p_memsz;   // bytes occupied in memory
+  uint32_t p_flags;
+  uint32_t p_align;
+} Elf32_Phdr;
+
+// 32-bit ELF symbol table entry.  The field order and widths mirror the
+// on-disk layout and must not be changed.
+typedef struct
+{
+  uint32_t st_name;
+  uint32_t st_value;
+  uint32_t st_size;
+  uint8_t  st_info;
+  uint8_t  st_other;
+  uint16_t st_shndx;
+} Elf32_Sym;
+
+// 64-bit ELF file header.  Same fields as Elf32_Ehdr with the entry point
+// and table offsets widened to 64 bits.  The field order and widths mirror
+// the on-disk layout and must not be changed.
+typedef struct {
+  uint8_t  e_ident[16];  // magic, class and byte order (see IS_ELF*)
+  uint16_t e_type;
+  uint16_t e_machine;
+  uint32_t e_version;
+  uint64_t e_entry;      // entry point address
+  uint64_t e_phoff;      // file offset of the program header table
+  uint64_t e_shoff;      // file offset of the section header table
+  uint32_t e_flags;
+  uint16_t e_ehsize;
+  uint16_t e_phentsize;
+  uint16_t e_phnum;      // number of program headers
+  uint16_t e_shentsize;
+  uint16_t e_shnum;      // number of section headers
+  uint16_t e_shstrndx;
+} Elf64_Ehdr;
+
+// 64-bit ELF section header.  The field order and widths mirror the
+// on-disk layout and must not be changed.
+typedef struct {
+  uint32_t sh_name;
+  uint32_t sh_type;      // e.g. SHT_NOBITS
+  uint64_t sh_flags;
+  uint64_t sh_addr;
+  uint64_t sh_offset;
+  uint64_t sh_size;
+  uint32_t sh_link;
+  uint32_t sh_info;
+  uint64_t sh_addralign;
+  uint64_t sh_entsize;
+} Elf64_Shdr;
+
+// 64-bit ELF program header.  Note that p_flags directly follows p_type
+// here, unlike in Elf32_Phdr.  The field order and widths mirror the
+// on-disk layout and must not be changed.
+typedef struct {
+  uint32_t p_type;    // e.g. PT_LOAD
+  uint32_t p_flags;
+  uint64_t p_offset;  // file offset of the segment data
+  uint64_t p_vaddr;
+  uint64_t p_paddr;   // address the loader writes the segment to
+  uint64_t p_filesz;  // bytes present in the file
+  uint64_t p_memsz;   // bytes occupied in memory
+  uint64_t p_align;
+} Elf64_Phdr;
+
+// 64-bit ELF symbol table entry.  Note that the field order differs from
+// Elf32_Sym: st_value and st_size come last.  The field order and widths
+// mirror the on-disk layout and must not be changed.
+typedef struct {
+  uint32_t st_name;
+  uint8_t  st_info;
+  uint8_t  st_other;
+  uint16_t st_shndx;
+  uint64_t st_value;
+  uint64_t st_size;
+} Elf64_Sym;
+
+#endif
diff --git a/fesvr/elf2hex.cc b/fesvr/elf2hex.cc
new file mode 100644
index 0000000000..327cf2d933
--- /dev/null
+++ b/fesvr/elf2hex.cc
@@ -0,0 +1,47 @@
+// See LICENSE for license details.
+
+#include <iostream>
+#include "htif_hexwriter.h"
+#include "memif.h"
+#include "elfloader.h"
+
+// elf2hex <width> <depth> <elf_file> [base]
+// Loads the ELF file into a hex-writer backed memory image and prints that
+// image to stdout.  width and depth must be powers of two, and base (0 when
+// omitted) must be a multiple of width.  Returns 1 on a usage or argument
+// error, 0 otherwise.
+int main(int argc, char** argv)
+{
+  const bool has_base = (argc == 5);
+  if (argc != 4 && !has_base)
+  {
+    std::cerr << "Usage: " << argv[0] << " <width> <depth> <elf_file> [base]" << std::endl;
+    return 1;
+  }
+
+  // Zero is rejected explicitly; v & (v-1) clears the lowest set bit.
+  auto is_power_of_two = [](unsigned v) { return v != 0 && (v & (v-1)) == 0; };
+
+  unsigned width = atoi(argv[1]);
+  if (!is_power_of_two(width))
+  {
+    std::cerr << "width must be a power of 2" << std::endl;
+    return 1;
+  }
+
+  unsigned long long int base = 0;
+  if (has_base)
+  {
+    base = atoll(argv[4]);
+    // width is a power of two, so masking with width-1 yields base % width.
+    if ((base & (width-1)) != 0)
+    {
+      std::cerr << "base must be divisible by width" << std::endl;
+      return 1;
+    }
+  }
+
+  unsigned depth = atoi(argv[2]);
+  if (!is_power_of_two(depth))
+  {
+    std::cerr << "depth must be a power of 2" << std::endl;
+    return 1;
+  }
+
+  htif_hexwriter_t htif(base, width, depth);
+  memif_t memif(&htif);
+  reg_t entry;
+  load_elf(argv[3], &memif, &entry);
+  std::cout << htif;
+
+  return 0;
+}
diff --git a/fesvr/elfloader.cc b/fesvr/elfloader.cc
new file mode 100644
index 0000000000..a4bae1e7d7
--- /dev/null
+++ b/fesvr/elfloader.cc
@@ -0,0 +1,94 @@
+// See LICENSE for license details.
+
+#include "elf.h"
+#include "memif.h"
+#include "byteorder.h"
+#include <cstring>
+#include <string>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <vector>
+#include <map>
+
+std::map<std::string, uint64_t> load_elf(const char* fn, memif_t* memif, reg_t* entry)
+{
+  // Writes every PT_LOAD segment of the ELF file `fn` through `memif`, stores the ELF
+  // entry point in `*entry`, and returns the .symtab symbols as a name -> value map.
+  int fd = open(fn, O_RDONLY);
+  assert(fd != -1);
+  struct stat s;
+  if (fstat(fd, &s) < 0) abort();
+  size_t size = s.st_size;
+  char* buf = (char*)mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
+  assert(buf != MAP_FAILED);
+  close(fd);
+
+  assert(size >= sizeof(Elf64_Ehdr));
+  const Elf64_Ehdr* eh64 = (const Elf64_Ehdr*)buf;
+  assert(IS_ELF32(*eh64) || IS_ELF64(*eh64));
+  assert(IS_ELFLE(*eh64));
+  assert(IS_ELF_EXEC(*eh64));
+  assert(IS_ELF_RISCV(*eh64) || IS_ELF_EM_NONE(*eh64));
+  assert(IS_ELF_VCURRENT(*eh64));
+
+  std::vector<uint8_t> zeros;
+  std::map<std::string, uint64_t> symbols;
+  // One macro body serves both ELF classes; only the header/symbol struct types differ.
+  #define LOAD_ELF(ehdr_t, phdr_t, shdr_t, sym_t, bswap) do { \
+    ehdr_t* eh = (ehdr_t*)buf; \
+    phdr_t* ph = (phdr_t*)(buf + bswap(eh->e_phoff)); \
+    *entry = bswap(eh->e_entry); \
+    assert(size >= bswap(eh->e_phoff) + bswap(eh->e_phnum)*sizeof(*ph)); \
+    for (unsigned i = 0; i < bswap(eh->e_phnum); i++) { \
+      if(bswap(ph[i].p_type) == PT_LOAD && bswap(ph[i].p_memsz)) { \
+        if (bswap(ph[i].p_filesz)) { \
+          assert(size >= bswap(ph[i].p_offset) + bswap(ph[i].p_filesz)); \
+          memif->write(bswap(ph[i].p_paddr), bswap(ph[i].p_filesz), (uint8_t*)buf + bswap(ph[i].p_offset)); \
+        } \
+        zeros.resize(bswap(ph[i].p_memsz) - bswap(ph[i].p_filesz)); /* zero-fill for the memsz - filesz tail */ \
+        memif->write(bswap(ph[i].p_paddr) + bswap(ph[i].p_filesz), bswap(ph[i].p_memsz) - bswap(ph[i].p_filesz), zeros.data()); /* data() stays valid when the tail is empty; indexing element 0 would not */ \
+      } \
+    } \
+    shdr_t* sh = (shdr_t*)(buf + bswap(eh->e_shoff)); \
+    assert(size >= bswap(eh->e_shoff) + bswap(eh->e_shnum)*sizeof(*sh)); \
+    assert(bswap(eh->e_shstrndx) < bswap(eh->e_shnum)); \
+    assert(size >= bswap(sh[bswap(eh->e_shstrndx)].sh_offset) + bswap(sh[bswap(eh->e_shstrndx)].sh_size)); \
+    char *shstrtab = buf + bswap(sh[bswap(eh->e_shstrndx)].sh_offset); \
+    unsigned strtabidx = 0, symtabidx = 0; \
+    for (unsigned i = 0; i < bswap(eh->e_shnum); i++) { \
+      unsigned max_len = bswap(sh[bswap(eh->e_shstrndx)].sh_size) - bswap(sh[i].sh_name); \
+      assert(bswap(sh[i].sh_name) < bswap(sh[bswap(eh->e_shstrndx)].sh_size)); \
+      assert(strnlen(shstrtab + bswap(sh[i].sh_name), max_len) < max_len); /* name must be NUL-terminated inside the table */ \
+      if (bswap(sh[i].sh_type) & SHT_NOBITS) continue; /* NOTE(review): bitwise test, so types sharing a bit with SHT_NOBITS are skipped too; '==' may be intended */ \
+      assert(size >= bswap(sh[i].sh_offset) + bswap(sh[i].sh_size)); \
+      if (strcmp(shstrtab + bswap(sh[i].sh_name), ".strtab") == 0) \
+        strtabidx = i; \
+      if (strcmp(shstrtab + bswap(sh[i].sh_name), ".symtab") == 0) \
+        symtabidx = i; \
+    } \
+    if (strtabidx && symtabidx) { \
+      char* strtab = buf + bswap(sh[strtabidx].sh_offset); \
+      sym_t* sym = (sym_t*)(buf + bswap(sh[symtabidx].sh_offset)); \
+      for (unsigned i = 0; i < bswap(sh[symtabidx].sh_size)/sizeof(sym_t); i++) { \
+        unsigned max_len = bswap(sh[strtabidx].sh_size) - bswap(sym[i].st_name); \
+        assert(bswap(sym[i].st_name) < bswap(sh[strtabidx].sh_size)); \
+        assert(strnlen(strtab + bswap(sym[i].st_name), max_len) < max_len); \
+        symbols[strtab + bswap(sym[i].st_name)] = bswap(sym[i].st_value); \
+      } \
+    } \
+  } while(0)
+
+  if (IS_ELF32(*eh64))
+    LOAD_ELF(Elf32_Ehdr, Elf32_Phdr, Elf32_Shdr, Elf32_Sym, from_le);
+  else
+    LOAD_ELF(Elf64_Ehdr, Elf64_Phdr, Elf64_Shdr, Elf64_Sym, from_le);
+
+  munmap(buf, size);
+
+  return symbols;
+}
diff --git a/fesvr/elfloader.h b/fesvr/elfloader.h
new file mode 100644
index 0000000000..696ef47849
--- /dev/null
+++ b/fesvr/elfloader.h
@@ -0,0 +1,13 @@
+// See LICENSE for license details.
+
+#ifndef _ELFLOADER_H
+#define _ELFLOADER_H
+
+#include "elf.h"
+#include <map>
+#include <string>
+
+class memif_t;
+std::map<std::string, uint64_t> load_elf(const char* fn, memif_t* memif, reg_t* entry);
+
+#endif
diff --git a/fesvr/fesvr.ac b/fesvr/fesvr.ac
new file mode 100644
index 0000000000..60e6c57fc3
--- /dev/null
+++ b/fesvr/fesvr.ac
@@ -0,0 +1 @@
+AC_CHECK_LIB(pthread, pthread_create, [], [AC_MSG_ERROR([libpthread is required])])
diff --git a/fesvr/fesvr.mk.in b/fesvr/fesvr.mk.in
new file mode 100644
index 0000000000..30c8bfeb94
--- /dev/null
+++ b/fesvr/fesvr.mk.in
@@ -0,0 +1,40 @@
+fesvr_hdrs = \
+  elf.h \
+  elfloader.h \
+  htif.h \
+  dtm.h \
+  memif.h \
+  syscall.h \
+  context.h \
+  htif_pthread.h \
+  htif_hexwriter.h \
+  option_parser.h \
+  term.h \
+  device.h \
+  rfb.h \
+  tsi.h \
+
+fesvr_CFLAGS = -fPIC
+
+fesvr_install_hdrs = $(fesvr_hdrs)
+
+fesvr_install_lib = yes
+
+fesvr_srcs = \
+  elfloader.cc \
+  htif.cc \
+  memif.cc \
+  dtm.cc \
+  syscall.cc \
+  device.cc \
+  rfb.cc \
+  context.cc \
+  htif_pthread.cc \
+  htif_hexwriter.cc \
+  dummy.cc \
+  option_parser.cc \
+  term.cc \
+  tsi.cc \
+
+fesvr_install_prog_srcs = \
+  elf2hex.cc \
diff --git a/fesvr/fesvr.pc.in b/fesvr/fesvr.pc.in
new file mode 100644
index 0000000000..f2d12563b4
--- /dev/null
+++ b/fesvr/fesvr.pc.in
@@ -0,0 +1,26 @@
+#=========================================================================
+# Modular C++ Build System Subproject Package Config
+#=========================================================================
+# Please read the documentation in 'mcppbs-uguide.txt' for more details
+# on how the Modular C++ Build System works.
+
+#-------------------------------------------------------------------------
+# Generic variables 
+#-------------------------------------------------------------------------
+
+prefix=@prefix@
+include_dir=${prefix}/include/fesvr
+lib_dir=${prefix}/lib
+
+#-------------------------------------------------------------------------
+# Keywords
+#-------------------------------------------------------------------------
+
+Name        : fesvr
+Version     : @PACKAGE_VERSION@
+Description : Frontend Server C/C++ API
+Requires    : @fesvr_pkcdeps@
+Cflags      : -I${include_dir} @CPPFLAGS@ @fesvr_extra_cppflags@
+Libs        : -L${lib_dir} @LDFLAGS@ @fesvr_extra_ldflags@ \
+              -lfesvr @fesvr_extra_libs@
+
diff --git a/fesvr/htif.cc b/fesvr/htif.cc
new file mode 100644
index 0000000000..f828494654
--- /dev/null
+++ b/fesvr/htif.cc
@@ -0,0 +1,371 @@
+// See LICENSE for license details.
+
+#include "htif.h"
+#include "rfb.h"
+#include "elfloader.h"
+#include "encoding.h"
+#include "byteorder.h"
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#include <queue>
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <getopt.h>
+
+/* Attempt to determine the execution prefix automatically.  autoconf
+ * sets PREFIX, and pconfigure sets __PCONFIGURE__PREFIX. */
+#if !defined(PREFIX) && defined(__PCONFIGURE__PREFIX)
+# define PREFIX __PCONFIGURE__PREFIX
+#endif
+
+#ifndef TARGET_ARCH
+# define TARGET_ARCH "riscv64-unknown-elf"
+#endif
+
+#ifndef TARGET_DIR
+# define TARGET_DIR "/" TARGET_ARCH "/bin/"
+#endif
+
+static volatile bool signal_exit = false;  // set by handle_signal(); polled by the loop in htif_t::run()
+static void handle_signal(int sig)  // installed for SIGINT, SIGTERM and SIGABRT by htif_t::htif_t()
+{
+  if (sig == SIGABRT || signal_exit) // someone set up us the bomb!
+    exit(-1);  // SIGABRT, or a second signal after shutdown was already requested
+  signal_exit = true;  // first SIGINT/SIGTERM only requests a shutdown
+  signal(sig, &handle_signal);  // install the handler again for this signal
+}
+
+htif_t::htif_t()
+  : mem(this), entry(DRAM_BASE), sig_addr(0), sig_len(0),
+    tohost_addr(0), fromhost_addr(0), exitcode(0), stopped(false),
+    syscall_proxy(this)
+{
+  signal(SIGINT, &handle_signal);
+  signal(SIGTERM, &handle_signal);
+  signal(SIGABRT, &handle_signal); // we still want to call static destructors
+}
+
+htif_t::htif_t(int argc, char** argv) : htif_t()
+{
+  parse_arguments(argc, argv);
+  register_devices();
+}
+
+htif_t::htif_t(const std::vector<std::string>& args) : htif_t()
+{
+  // Build a C-style argument vector for parse_arguments(): a placeholder program
+  // name followed by pointers into `args`. A std::vector is used because a
+  // variable-length array (`char* argv[argc]`) is not standard C++.
+  std::vector<char*> argv{(char *) "htif"};
+  for (const std::string& arg : args)
+    argv.push_back((char *) arg.c_str());
+
+  parse_arguments((int) argv.size(), argv.data());
+  register_devices();
+}
+
+htif_t::~htif_t()
+{
+  for (auto d : dynamic_devices)
+    delete d;
+}
+
+void htif_t::start()
+{
+  if (!targs.empty() && targs[0] != "none")
+      load_program();
+
+  reset();
+}
+
+std::map<std::string, uint64_t> htif_t::load_payload(const std::string& payload, reg_t* entry)
+{
+  std::string path;  // resolved location of the payload; stays empty if no candidate exists
+  if (access(payload.c_str(), F_OK) == 0)  // first choice: the name exactly as given
+    path = payload;
+  else if (payload.find('/') == std::string::npos)  // bare names are also looked up under PREFIX TARGET_DIR
+  {
+    std::string test_path = PREFIX TARGET_DIR + payload;
+    if (access(test_path.c_str(), F_OK) == 0)
+      path = test_path;
+  }
+
+  if (path.empty())  // neither candidate exists
+    throw std::runtime_error(
+        "could not open " + payload +
+        " (did you misspell it? If VCS, did you forget +permissive/+permissive-off?)");
+
+  // temporarily construct a memory interface that skips writing bytes
+  // that have already been preloaded through a sideband
+  class preload_aware_memif_t : public memif_t {
+   public:
+    preload_aware_memif_t(htif_t* htif) : memif_t(htif), htif(htif) {}
+
+    void write(addr_t taddr, size_t len, const void* src) override
+    {
+      if (!htif->is_address_preloaded(taddr, len))  // the whole write is dropped when the range is reported as preloaded
+        memif_t::write(taddr, len, src);
+    }
+
+   private:
+    htif_t* htif;
+  } preload_aware_memif(this);
+
+  return load_elf(path.c_str(), &preload_aware_memif, entry);  // symbol map of the ELF; *entry receives its entry point
+}
+
+void htif_t::load_program()
+{
+  auto symtab = load_payload(targs[0], &entry);
+  const auto absent = symtab.end();
+
+  // The tohost/fromhost mailbox addresses are taken only when both symbols exist.
+  const auto to_sym = symtab.find("tohost");
+  const auto from_sym = symtab.find("fromhost");
+  if (to_sym == absent || from_sym == absent) {
+    fprintf(stderr, "warning: tohost and fromhost symbols not in ELF; can't communicate with target\n");
+  } else {
+    tohost_addr = to_sym->second;
+    fromhost_addr = from_sym->second;
+  }
+
+  // detect torture tests so we can print the memory signature at the end
+  const auto sig_begin = symtab.find("begin_signature");
+  const auto sig_end = symtab.find("end_signature");
+  if (sig_begin != absent && sig_end != absent) {
+    sig_addr = sig_begin->second;
+    sig_len = sig_end->second - sig_addr;
+  }
+
+  // Additional payloads are loaded for their contents only; their entry points are dropped.
+  for (const std::string& extra : payloads) {
+    reg_t unused_entry;
+    load_payload(extra, &unused_entry);
+  }
+
+  // Map each address to the first name that claims it (symbols are visited in name order).
+  for (const auto& sym : symtab)
+    addr2symbol.emplace(sym.second, sym.first);
+}
+
+const char* htif_t::get_symbol(uint64_t addr)
+{
+  // Name recorded for `addr` in addr2symbol, or nullptr when no symbol has that address.
+  // The returned pointer refers to the string owned by the map.
+  const auto found = addr2symbol.find(addr);
+  if (found != addr2symbol.end())
+    return found->second.c_str();
+  return nullptr;
+}
+
+void htif_t::stop()  // optionally dump the signature region to sig_file, then mark the interface stopped
+{
+  if (!sig_file.empty() && sig_len) // print final torture test signature
+  {
+    std::vector<uint8_t> buf(sig_len);
+    mem.read(sig_addr, sig_len, &buf[0]);  // fetch the whole signature region from the target
+
+    std::ofstream sigs(sig_file);
+    assert(sigs && "can't open signature file!");
+    sigs << std::setfill('0') << std::hex;
+
+    const addr_t incr = 16;  // bytes printed per output line
+    assert(sig_len % incr == 0);
+    for (addr_t i = 0; i < sig_len; i += incr)
+    {
+      for (addr_t j = incr; j > 0; j--)  // highest-addressed byte of the line is printed first
+        sigs << std::setw(2) << (uint16_t)buf[i+j-1];  // cast so the byte prints as a number, not a character
+      sigs << '\n';
+    }
+
+    sigs.close();
+  }
+
+  stopped = true;  // reported by done()
+}
+
+void htif_t::clear_chunk(addr_t taddr, size_t len)
+{
+  // Zero `len` bytes in chunk_max_size() pieces; std::vector replaces a non-standard VLA.
+  const size_t max_chunk = chunk_max_size();
+  std::vector<char> zeros(max_chunk, 0);
+  for (size_t pos = 0; pos < len; pos += max_chunk)
+    write_chunk(taddr + pos, std::min(len - pos, max_chunk), zeros.data());
+}
+
+int htif_t::run()  // main host loop: start the target, service tohost/fromhost until exit, return exit_code()
+{
+  start();
+
+  auto enq_func = [](std::queue<reg_t>* q, uint64_t x) { q->push(x); };
+  std::queue<reg_t> fromhost_queue;  // responses waiting to be delivered to the target
+  std::function<void(reg_t)> fromhost_callback =
+    std::bind(enq_func, &fromhost_queue, std::placeholders::_1);  // devices reply by enqueueing here
+
+  if (tohost_addr == 0) {  // tohost_addr was never set, so there is nothing to poll
+    while (true)
+      idle();
+  }
+
+  while (!signal_exit && exitcode == 0)  // exitcode is not modified in this function
+  {
+    if (auto tohost = from_le(mem.read_uint64(tohost_addr))) {  // non-zero word is handled as a command
+      mem.write_uint64(tohost_addr, 0);  // clear the word before dispatching the command
+      command_t cmd(mem, tohost, fromhost_callback);
+      device_list.handle_command(cmd);
+    } else {
+      idle();
+    }
+
+    device_list.tick();
+
+    if (!fromhost_queue.empty() && mem.read_uint64(fromhost_addr) == 0) {  // deliver only once the fromhost word reads zero
+      mem.write_uint64(fromhost_addr, to_le(fromhost_queue.front()));
+      fromhost_queue.pop();
+    }
+  }
+
+  stop();
+
+  return exit_code();
+}
+
+bool htif_t::done()
+{
+  return stopped;
+}
+
+int htif_t::exit_code()
+{
+  return exitcode >> 1;
+}
+
+void htif_t::parse_arguments(int argc, char ** argv)  // fills dynamic_devices, sig_file, payloads and targs from the command line
+{
+  optind = 0; // reset optind as HTIF may run getopt _after_ others
+  while (1) {
+    static struct option long_options[] = { HTIF_LONG_OPTIONS };
+    int option_index = 0;
+    int c = getopt_long(argc, argv, "-h", long_options, &option_index);  // leading '-' makes getopt return non-options as code 1
+
+    if (c == -1) break;
+ retry:  // re-entered from `case 1` after a +plusarg was translated to its long-option code
+    switch (c) {
+      case 'h': usage(argv[0]);
+        throw std::invalid_argument("User queried htif_t help text");
+      case HTIF_LONG_OPTIONS_OPTIND:  // --rfb / +rfb
+        if (optarg) dynamic_devices.push_back(new rfb_t(atoi(optarg)));
+        else        dynamic_devices.push_back(new rfb_t);
+        break;
+      case HTIF_LONG_OPTIONS_OPTIND + 1:  // --disk / +disk
+        // [TODO] Remove once disks are supported again
+        throw std::invalid_argument("--disk/+disk unsupported (use a ramdisk)");
+        dynamic_devices.push_back(new disk_t(optarg));  // unreachable while the throw above is in place
+        break;
+      case HTIF_LONG_OPTIONS_OPTIND + 2:  // --signature / +signature
+        sig_file = optarg;
+        break;
+      case HTIF_LONG_OPTIONS_OPTIND + 3:  // --chroot / +chroot
+        syscall_proxy.set_chroot(optarg);
+        break;
+      case HTIF_LONG_OPTIONS_OPTIND + 4:  // --payload / +payload
+        payloads.push_back(optarg);
+        break;
+      case '?':
+        if (!opterr)  // opterr == 0 doubles as the "permissive" flag: unknown options are ignored
+          break;
+        throw std::invalid_argument("Unknown argument (did you mean to enable +permissive parsing?)");
+      case 1: {  // non-option argument: either a +plusarg or the start of the target arguments
+        std::string arg = optarg;
+        if (arg == "+h" || arg == "+help") {
+          c = 'h';
+          optarg = nullptr;
+        }
+        else if (arg == "+rfb") {
+          c = HTIF_LONG_OPTIONS_OPTIND;
+          optarg = nullptr;
+        }
+        else if (arg.find("+rfb=") == 0) {
+          c = HTIF_LONG_OPTIONS_OPTIND;
+          optarg = optarg + 5;  // skip past "+rfb="
+        }
+        else if (arg.find("+disk=") == 0) {
+          c = HTIF_LONG_OPTIONS_OPTIND + 1;
+          optarg = optarg + 6;
+        }
+        else if (arg.find("+signature=") == 0) {
+          c = HTIF_LONG_OPTIONS_OPTIND + 2;
+          optarg = optarg + 11;
+        }
+        else if (arg.find("+chroot=") == 0) {
+          c = HTIF_LONG_OPTIONS_OPTIND + 3;
+          optarg = optarg + 8;
+        }
+        else if (arg.find("+payload=") == 0) {
+          c = HTIF_LONG_OPTIONS_OPTIND + 4;
+          optarg = optarg + 9;
+        }
+        else if (arg.find("+permissive-off") == 0) {  // tested before "+permissive", which is a prefix of it
+          if (opterr)
+            throw std::invalid_argument("Found +permissive-off when not parsing permissively");
+          opterr = 1;
+          break;
+        }
+        else if (arg.find("+permissive") == 0) {
+          if (!opterr)
+            throw std::invalid_argument("Found +permissive when already parsing permissively");
+          opterr = 0;
+          break;
+        }
+        else {
+          if (!opterr)  // permissive mode: silently skip the unrecognized argument
+            break;
+          else {
+            optind--;  // step back so this argument becomes the first target argument
+            goto done_processing;
+          }
+        }
+        goto retry;  // dispatch the translated option code through the switch above
+      }
+    }
+  }
+
+done_processing:
+  while (optind < argc)  // everything left is the target binary and its arguments
+    targs.push_back(argv[optind++]);
+  if (!targs.size()) {
+    usage(argv[0]);
+    throw std::invalid_argument("No binary specified (Did you forget it? Did you forget '+permissive-off' if running with +permissive?)");
+  }
+}
+
+void htif_t::register_devices()
+{
+  device_list.register_device(&syscall_proxy);
+  device_list.register_device(&bcd);
+  for (auto d : dynamic_devices)
+    device_list.register_device(d);
+}
+
+void htif_t::usage(const char * program_name)  // print the command-line help text to stdout
+{
+  printf("Usage: %s [EMULATOR OPTION]... [VERILOG PLUSARG]... [HOST OPTION]... BINARY [TARGET OPTION]...\n ",
+         program_name);
+  fputs("\
+Run a BINARY on the Rocket Chip emulator.\n\
+\n\
+Mandatory arguments to long options are mandatory for short options too.\n\
+\n\
+EMULATOR OPTIONS\n\
+  Consult emulator.cc if using Verilator or VCS documentation if using VCS\n\
+    for available options.\n\
+EMULATOR VERILOG PLUSARGS\n\
+  Consult generated-src*/*.plusArgs for available options\n\
+", stdout);
+  fputs("\n" HTIF_USAGE_OPTIONS, stdout);  // host and target option help shared via htif.h
+}
diff --git a/fesvr/htif.h b/fesvr/htif.h
new file mode 100644
index 0000000000..5b16a60db4
--- /dev/null
+++ b/fesvr/htif.h
@@ -0,0 +1,126 @@
+// See LICENSE for license details.
+
+#ifndef __HTIF_H
+#define __HTIF_H
+
+#include "memif.h"
+#include "syscall.h"
+#include "device.h"
+#include <string.h>
+#include <map>
+#include <vector>
+
+class htif_t : public chunked_memif_t
+{
+ public:
+  htif_t();
+  htif_t(int argc, char** argv);
+  htif_t(const std::vector<std::string>& args);
+  virtual ~htif_t();
+
+  virtual void start();
+  virtual void stop();
+
+  int run();
+  bool done();
+  int exit_code();
+
+  virtual memif_t& memif() { return mem; }
+
+ protected:
+  virtual void reset() = 0;
+
+  virtual void read_chunk(addr_t taddr, size_t len, void* dst) = 0;
+  virtual void write_chunk(addr_t taddr, size_t len, const void* src) = 0;
+  virtual void clear_chunk(addr_t taddr, size_t len);
+
+  virtual size_t chunk_align() = 0;
+  virtual size_t chunk_max_size() = 0;
+
+  virtual std::map<std::string, uint64_t> load_payload(const std::string& payload, reg_t* entry);
+  virtual void load_program();
+  virtual void idle() {}
+
+  const std::vector<std::string>& host_args() { return hargs; }
+
+  reg_t get_entry_point() { return entry; }
+
+  // indicates that the initial program load can skip writing this address
+  // range to memory, because it has already been loaded through a sideband
+  virtual bool is_address_preloaded(addr_t taddr, size_t len) { return false; }
+
+  // Given an address, return symbol from addr2symbol map
+  const char* get_symbol(uint64_t addr);
+
+ private:
+  void parse_arguments(int argc, char ** argv);
+  void register_devices();
+  void usage(const char * program_name);
+
+  memif_t mem;
+  reg_t entry;
+  bool writezeros;
+  std::vector<std::string> hargs;
+  std::vector<std::string> targs;
+  std::string sig_file;
+  addr_t sig_addr; // torture
+  addr_t sig_len; // torture
+  addr_t tohost_addr;
+  addr_t fromhost_addr;
+  int exitcode;
+  bool stopped;
+
+  device_list_t device_list;
+  syscall_t syscall_proxy;
+  bcd_t bcd;
+  std::vector<device_t*> dynamic_devices;
+  std::vector<std::string> payloads;
+
+  const std::vector<std::string>& target_args() { return targs; }
+
+  std::map<uint64_t, std::string> addr2symbol;
+
+  friend class memif_t;
+  friend class syscall_t;
+};
+
+/* Alignment guide for emulator.cc options:
+  -x, --long-option        Description with max 80 characters --------------->\n\
+       +plus-arg-equivalent\n\
+ */
+#define HTIF_USAGE_OPTIONS \
+"HOST OPTIONS\n\
+  -h, --help               Display this help and exit\n\
+  +h,  +help\n\
+       +permissive         The host will ignore any unparsed options up until\n\
+                             +permissive-off (Only needed for VCS)\n\
+       +permissive-off     Stop ignoring options. This is mandatory if using\n\
+                             +permissive (Only needed for VCS)\n\
+      --rfb=DISPLAY        Add new remote frame buffer on display DISPLAY\n\
+       +rfb=DISPLAY          to be accessible on 5900 + DISPLAY (default = 0)\n\
+      --signature=FILE     Write torture test signature to FILE\n\
+       +signature=FILE\n\
+      --chroot=PATH        Use PATH as location of syscall-servicing binaries\n\
+       +chroot=PATH\n\
+      --payload=PATH       Load PATH memory as an additional ELF payload\n\
+       +payload=PATH\n\
+\n\
+HOST OPTIONS (currently unsupported)\n\
+      --disk=DISK          Add DISK device. Use a ramdisk since this isn't\n\
+       +disk=DISK            supported\n\
+\n\
+TARGET (RISC-V BINARY) OPTIONS\n\
+  These are the options passed to the program executing on the emulated RISC-V\n\
+  microprocessor.\n"
+
+#define HTIF_LONG_OPTIONS_OPTIND 1024
+#define HTIF_LONG_OPTIONS                                               \
+{"help",      no_argument,       0, 'h'                          },     \
+{"rfb",       optional_argument, 0, HTIF_LONG_OPTIONS_OPTIND     },     \
+{"disk",      required_argument, 0, HTIF_LONG_OPTIONS_OPTIND + 1 },     \
+{"signature", required_argument, 0, HTIF_LONG_OPTIONS_OPTIND + 2 },     \
+{"chroot",    required_argument, 0, HTIF_LONG_OPTIONS_OPTIND + 3 },     \
+{"payload",   required_argument, 0, HTIF_LONG_OPTIONS_OPTIND + 4 },     \
+{0, 0, 0, 0}
+
+#endif // __HTIF_H
diff --git a/fesvr/htif_hexwriter.cc b/fesvr/htif_hexwriter.cc
new file mode 100644
index 0000000000..e4811b3bee
--- /dev/null
+++ b/fesvr/htif_hexwriter.cc
@@ -0,0 +1,76 @@
+// See LICENSE for license details.
+
+#include <iostream>
+#include <assert.h>
+#include "htif_hexwriter.h"
+
+htif_hexwriter_t::htif_hexwriter_t(size_t b, size_t w, size_t d)
+  : base(b), width(w), depth(d)
+{
+}
+
+void htif_hexwriter_t::read_chunk(addr_t taddr, size_t len, void* vdst)
+{
+  // Copy `len` bytes starting at target address `taddr` into `vdst`, one
+  // `width`-byte row at a time; rows never touched before are created zero-filled.
+  taddr -= base;
+
+  assert(len % chunk_align() == 0);
+  assert(taddr < width*depth);
+  assert(taddr+len <= width*depth);
+
+  // Each pass handles exactly one row and then moves address and output forward.
+  for (uint8_t* out = (uint8_t*)vdst; len != 0; len -= width, taddr += width, out += width)
+  {
+    std::vector<char>& row = mem[taddr/width];
+    if (row.empty())
+      row.resize(width, 0);
+
+    // Byte-wise copy of the row into the caller's buffer.
+    for (size_t col = 0; col != width; ++col)
+      out[col] = row[col];
+  }
+}
+
+void htif_hexwriter_t::write_chunk(addr_t taddr, size_t len, const void* vsrc)
+{
+  // Store `len` bytes from `vsrc` at target address `taddr`, one `width`-byte row at a time.
+  taddr -= base;
+
+  assert(len % chunk_align() == 0);
+  assert(taddr < width*depth);
+  assert(taddr+len <= width*depth);
+
+  const uint8_t* src = (const uint8_t*)vsrc;
+  while(len)
+  {
+    if(mem[taddr/width].size() == 0)
+      mem[taddr/width].resize(width,0);
+    for(size_t j = 0; j < width; j++)
+      mem[taddr/width][j] = src[j];
+    len -= width;
+    taddr += width;
+    src += width;  // advance the source with every row so multi-row writes do not repeat the first row
+  }
+}
+
+std::ostream& operator<< (std::ostream& o, const htif_hexwriter_t& h)
+{
+  // Emit `depth` text lines, one per row; each row is printed as 2*width hex digits
+  // with the highest-indexed byte first. Rows never written are printed as zeros.
+  const std::ios_base::fmtflags saved_flags = o.setf(std::ios::hex,std::ios::basefield);
+
+  for(size_t addr = 0; addr < h.depth; addr++)
+  {
+    std::map<addr_t,std::vector<char> >::const_iterator i = h.mem.find(addr);
+    if(i == h.mem.end())
+      for(size_t j = 0; j < h.width; j++)
+        o << "00";
+    else
+      for(size_t j = 0; j < h.width; j++)
+        o << ((i->second[h.width-j-1] >> 4) & 0xF) << (i->second[h.width-j-1] & 0xF);
+    o << std::endl;
+  }
+  o.flags(saved_flags);  // restore the caller's flags exactly; setf(saved_flags) would only OR bits in
+  return o;
+}
diff --git a/fesvr/htif_hexwriter.h b/fesvr/htif_hexwriter.h
new file mode 100644
index 0000000000..725616626e
--- /dev/null
+++ b/fesvr/htif_hexwriter.h
@@ -0,0 +1,32 @@
+// See LICENSE for license details.
+
+#ifndef __HTIF_HEXWRITER_H
+#define __HTIF_HEXWRITER_H
+
+#include <map>
+#include <vector>
+#include <stdlib.h>
+#include "memif.h"
+
+class htif_hexwriter_t : public chunked_memif_t
+{
+public:
+  htif_hexwriter_t(size_t b, size_t w, size_t d);
+
+protected:
+  size_t base;
+  size_t width;
+  size_t depth;
+  std::map<addr_t,std::vector<char> > mem;
+
+  void read_chunk(addr_t taddr, size_t len, void* dst);
+  void write_chunk(addr_t taddr, size_t len, const void* src);
+  void clear_chunk(addr_t taddr, size_t len) {}
+
+  size_t chunk_max_size() { return width; }
+  size_t chunk_align() { return width; }
+
+  friend std::ostream& operator<< (std::ostream&, const htif_hexwriter_t&);
+};
+
+#endif // __HTIF_HEXWRITER_H
diff --git a/fesvr/htif_pthread.cc b/fesvr/htif_pthread.cc
new file mode 100644
index 0000000000..b9e3832b94
--- /dev/null
+++ b/fesvr/htif_pthread.cc
@@ -0,0 +1,66 @@
+// See LICENSE for license details.
+
+#include "htif_pthread.h"
+#include <algorithm>
+#include <stdio.h>
+
+void htif_pthread_t::thread_main(void* arg)
+{
+  htif_pthread_t* htif = static_cast<htif_pthread_t*>(arg);
+  htif->run();
+  while (true)
+    htif->target->switch_to();
+}
+
+htif_pthread_t::htif_pthread_t(int argc, char** argv)
+    : htif_t(argc, argv)
+{
+  target = context_t::current();
+  host.init(thread_main, this);
+}
+
+htif_pthread_t::~htif_pthread_t()
+{
+}
+
+ssize_t htif_pthread_t::read(void* buf, size_t max_size)
+{
+  // Yield to the target until it has queued data, then pop at most max_size bytes into buf.
+  while (th_data.empty())
+    target->switch_to();
+  const size_t count = std::min(max_size, th_data.size());
+  const auto last = th_data.begin() + count;
+  std::copy(th_data.begin(), last, (char*)buf);
+  th_data.erase(th_data.begin(), last);
+  return count;
+}
+
+ssize_t htif_pthread_t::write(const void* buf, size_t size)
+{
+  ht_data.insert(ht_data.end(), (const char*)buf, (const char*)buf + size);
+  return size;
+}
+
+void htif_pthread_t::send(const void* buf, size_t size)
+{
+  th_data.insert(th_data.end(), (const char*)buf, (const char*)buf + size);
+}
+
+void htif_pthread_t::recv(void* buf, size_t size)
+{
+  while (!this->recv_nonblocking(buf, size))
+    ;
+}
+
+bool htif_pthread_t::recv_nonblocking(void* buf, size_t size)
+{
+  // Pops exactly `size` queued bytes into buf; otherwise yields to the host context once.
+  if (size <= ht_data.size())
+  {
+    std::copy(ht_data.begin(), ht_data.begin() + size, (char*)buf);
+    ht_data.erase(ht_data.begin(), ht_data.begin() + size);
+    return true;
+  }
+  host.switch_to();
+  return false;
+}
diff --git a/fesvr/htif_pthread.h b/fesvr/htif_pthread.h
new file mode 100644
index 0000000000..c00c38230c
--- /dev/null
+++ b/fesvr/htif_pthread.h
@@ -0,0 +1,38 @@
+// See LICENSE for license details.
+
+#ifndef _HTIF_PTHREAD_H
+#define _HTIF_PTHREAD_H
+
+#include "htif.h"
+#include "context.h"
+#include <deque>
+
+class htif_pthread_t : public htif_t
+{
+ public:
+  htif_pthread_t(int argc, char** argv);
+  virtual ~htif_pthread_t();
+
+  // target interface
+  void send(const void* buf, size_t size);
+  void recv(void* buf, size_t size);
+  bool recv_nonblocking(void* buf, size_t size);
+
+ protected:
+  // host interface
+  virtual ssize_t read(void* buf, size_t max_size);
+  virtual ssize_t write(const void* buf, size_t size);
+
+  virtual size_t chunk_align() { return 64; }
+  virtual size_t chunk_max_size() { return 1024; }
+
+ private:
+  context_t host;
+  context_t* target;
+  std::deque<char> th_data;
+  std::deque<char> ht_data;
+
+  static void thread_main(void* htif);
+};
+
+#endif
diff --git a/fesvr/memif.cc b/fesvr/memif.cc
new file mode 100644
index 0000000000..fd9629144f
--- /dev/null
+++ b/fesvr/memif.cc
@@ -0,0 +1,183 @@
+// See LICENSE for license details.
+
+#include <algorithm>
+#include <stdlib.h>
+#include <string.h>
+#include <stdexcept>
+#include "memif.h"
+
+void memif_t::read(addr_t addr, size_t len, void* bytes)  // read `len` bytes at `addr` using only aligned chunk reads
+{
+  size_t align = cmemif->chunk_align();  // the mask arithmetic below assumes this is a power of two
+  if (len && (addr & (align-1)))  // unaligned head: fetch the enclosing aligned chunk
+  {
+    size_t this_len = std::min(len, align - size_t(addr & (align-1)));
+    uint8_t chunk[align];  // NOTE(review): variable-length array, a compiler extension in C++
+
+    cmemif->read_chunk(addr & ~(align-1), align, chunk);
+    memcpy(bytes, chunk + (addr & (align-1)), this_len);  // keep only the requested part of the chunk
+
+    bytes = (char*)bytes + this_len;
+    addr += this_len;
+    len -= this_len;
+  }
+
+  if (len & (align-1))  // tail shorter than one aligned chunk
+  {
+    size_t this_len = len & (align-1);
+    size_t start = len - this_len;  // offset of the tail within the remaining range
+    uint8_t chunk[align];
+
+    cmemif->read_chunk(addr + start, align, chunk);
+    memcpy((char*)bytes + start, chunk, this_len);  // only the leading this_len bytes are wanted
+
+    len -= this_len;
+  }
+
+  // now we're aligned
+  for (size_t pos = 0; pos < len; pos += cmemif->chunk_max_size())  // middle part, at most chunk_max_size() per call
+    cmemif->read_chunk(addr + pos, std::min(cmemif->chunk_max_size(), len - pos), (char*)bytes + pos);
+}
+
+void memif_t::write(addr_t addr, size_t len, const void* bytes)  // write `len` bytes at `addr` using only aligned chunk accesses
+{
+  size_t align = cmemif->chunk_align();  // the mask arithmetic below assumes this is a power of two
+  if (len && (addr & (align-1)))  // unaligned head: read-modify-write the enclosing aligned chunk
+  {
+    size_t this_len = std::min(len, align - size_t(addr & (align-1)));
+    uint8_t chunk[align];  // NOTE(review): variable-length array, a compiler extension in C++
+
+    cmemif->read_chunk(addr & ~(align-1), align, chunk);
+    memcpy(chunk + (addr & (align-1)), bytes, this_len);
+    cmemif->write_chunk(addr & ~(align-1), align, chunk);
+
+    bytes = (char*)bytes + this_len;
+    addr += this_len;
+    len -= this_len;
+  }
+
+  if (len & (align-1))  // tail shorter than one aligned chunk: read-modify-write as well
+  {
+    size_t this_len = len & (align-1);
+    size_t start = len - this_len;  // offset of the tail within the remaining range
+    uint8_t chunk[align];
+
+    cmemif->read_chunk(addr + start, align, chunk);
+    memcpy(chunk, (char*)bytes + start, this_len);
+    cmemif->write_chunk(addr + start, align, chunk);
+
+    len -= this_len;
+  }
+
+  // now we're aligned
+  bool all_zero = len != 0;  // an empty middle part is never treated as "all zero"
+  for (size_t i = 0; i < len; i++)
+    all_zero &= ((const char*)bytes)[i] == 0;
+
+  if (all_zero) {
+    cmemif->clear_chunk(addr, len);  // let the backend zero the range instead of sending zero bytes
+  } else {
+    size_t max_chunk = cmemif->chunk_max_size();
+    for (size_t pos = 0; pos < len; pos += max_chunk)  // middle part, at most max_chunk bytes per call
+      cmemif->write_chunk(addr + pos, std::min(max_chunk, len - pos), (char*)bytes + pos);
+  }
+}
+
+#define MEMIF_READ_FUNC \
+  if(addr & (sizeof(val)-1)) \
+    throw std::runtime_error("misaligned address"); \
+  this->read(addr, sizeof(val), &val); \
+  return val
+
+#define MEMIF_WRITE_FUNC \
+  if(addr & (sizeof(val)-1)) \
+    throw std::runtime_error("misaligned address"); \
+  this->write(addr, sizeof(val), &val)
+
+uint8_t memif_t::read_uint8(addr_t addr)
+{
+  uint8_t val;
+  MEMIF_READ_FUNC;
+}
+
+int8_t memif_t::read_int8(addr_t addr)
+{
+  int8_t val;
+  MEMIF_READ_FUNC;
+}
+
+void memif_t::write_uint8(addr_t addr, uint8_t val)
+{
+  MEMIF_WRITE_FUNC;
+}
+
+void memif_t::write_int8(addr_t addr, int8_t val)
+{
+  MEMIF_WRITE_FUNC;
+}
+
+uint16_t memif_t::read_uint16(addr_t addr)
+{
+  uint16_t val;
+  MEMIF_READ_FUNC;
+}
+
+int16_t memif_t::read_int16(addr_t addr)
+{
+  int16_t val;
+  MEMIF_READ_FUNC;
+}
+
+void memif_t::write_uint16(addr_t addr, uint16_t val)
+{
+  MEMIF_WRITE_FUNC;
+}
+
+void memif_t::write_int16(addr_t addr, int16_t val)
+{
+  MEMIF_WRITE_FUNC;
+}
+
+uint32_t memif_t::read_uint32(addr_t addr)
+{
+  uint32_t val;
+  MEMIF_READ_FUNC;
+}
+
+int32_t memif_t::read_int32(addr_t addr)
+{
+  int32_t val;
+  MEMIF_READ_FUNC;
+}
+
+void memif_t::write_uint32(addr_t addr, uint32_t val)
+{
+  MEMIF_WRITE_FUNC;
+}
+
+void memif_t::write_int32(addr_t addr, int32_t val)
+{
+  MEMIF_WRITE_FUNC;
+}
+
+uint64_t memif_t::read_uint64(addr_t addr)
+{
+  uint64_t val;
+  MEMIF_READ_FUNC;
+}
+
+int64_t memif_t::read_int64(addr_t addr)
+{
+  int64_t val;
+  MEMIF_READ_FUNC;
+}
+
+void memif_t::write_uint64(addr_t addr, uint64_t val)
+{
+  MEMIF_WRITE_FUNC;
+}
+
+void memif_t::write_int64(addr_t addr, int64_t val)
+{
+  MEMIF_WRITE_FUNC;
+}
diff --git a/fesvr/memif.h b/fesvr/memif.h
new file mode 100644
index 0000000000..3854d664c8
--- /dev/null
+++ b/fesvr/memif.h
@@ -0,0 +1,62 @@
+// See LICENSE for license details.
+
+#ifndef __MEMIF_H
+#define __MEMIF_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+typedef uint64_t reg_t;
+typedef int64_t sreg_t;
+typedef reg_t addr_t;
+
+class chunked_memif_t
+{
+public:
+  virtual void read_chunk(addr_t taddr, size_t len, void* dst) = 0;
+  virtual void write_chunk(addr_t taddr, size_t len, const void* src) = 0;
+  virtual void clear_chunk(addr_t taddr, size_t len) = 0;
+
+  virtual size_t chunk_align() = 0;
+  virtual size_t chunk_max_size() = 0;
+};
+
+class memif_t
+{
+public:
+  memif_t(chunked_memif_t* _cmemif) : cmemif(_cmemif) {}
+  virtual ~memif_t(){}
+
+  // read and write byte arrays
+  virtual void read(addr_t addr, size_t len, void* bytes);
+  virtual void write(addr_t addr, size_t len, const void* bytes);
+
+  // read and write 8-bit words
+  virtual uint8_t read_uint8(addr_t addr);
+  virtual int8_t read_int8(addr_t addr);
+  virtual void write_uint8(addr_t addr, uint8_t val);
+  virtual void write_int8(addr_t addr, int8_t val);
+
+  // read and write 16-bit words
+  virtual uint16_t read_uint16(addr_t addr);
+  virtual int16_t read_int16(addr_t addr);
+  virtual void write_uint16(addr_t addr, uint16_t val);
+  virtual void write_int16(addr_t addr, int16_t val);
+
+  // read and write 32-bit words
+  virtual uint32_t read_uint32(addr_t addr);
+  virtual int32_t read_int32(addr_t addr);
+  virtual void write_uint32(addr_t addr, uint32_t val);
+  virtual void write_int32(addr_t addr, int32_t val);
+
+  // read and write 64-bit words
+  virtual uint64_t read_uint64(addr_t addr);
+  virtual int64_t read_int64(addr_t addr);
+  virtual void write_uint64(addr_t addr, uint64_t val);
+  virtual void write_int64(addr_t addr, int64_t val);
+
+protected:
+  chunked_memif_t* cmemif;
+};
+
+#endif // __MEMIF_H
diff --git a/fesvr/option_parser.cc b/fesvr/option_parser.cc
new file mode 100644
index 0000000000..72daec40ef
--- /dev/null
+++ b/fesvr/option_parser.cc
@@ -0,0 +1,51 @@
+// See LICENSE for license details.
+
+#include "option_parser.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+
+void option_parser_t::option(char c, const char* s, int arg, std::function<void(const char*)> action)
+{
+  opts.emplace_back(c, s, arg, action);  // remember short flag, long name, takes-a-value flag and handler
+}
+
+const char* const* option_parser_t::parse(const char* const* argv0)
+{
+  assert(argv0);  // argv0[0] only prefixes error messages; scanning starts at argv0[1]
+  const char* const* argv = argv0 + 1;
+  for (const char* opt; (opt = *argv) != NULL && opt[0] == '-'; argv++)
+  {
+    const bool is_long = opt[1] == '-';
+    bool found = false;
+    for (auto it = opts.begin(); !found && it != opts.end(); ++it)
+    {
+      const size_t slen = it->str ? strlen(it->str) : 0;
+      const bool chr_match = !is_long && it->chr && opt[1] == it->chr;
+      const bool str_match = is_long && slen && strncmp(opt+2, it->str, slen) == 0
+                             && (opt[2+slen] == '=' || opt[2+slen] == '\0');
+      if (!chr_match && !str_match) continue;
+      const char* optarg = NULL;
+      if (chr_match)         optarg = opt[2] ? &opt[2] : NULL;  // "-xVALUE"
+      else if (opt[2+slen])  optarg = &opt[3+slen];             // "--name=VALUE"
+      else if (it->arg)      optarg = *(++argv);                // "--name VALUE" (may be NULL at end of argv)
+      if (optarg && !it->arg)
+        error("no argument allowed for option", *argv0, opt);
+      if (!optarg && it->arg)
+        error("argument required for option", *argv0, opt);
+      it->func(optarg);
+      found = true;
+    }
+    if (!found)
+      error("unrecognized option", *argv0, opt);
+  }
+  return argv;  // first argument that does not start with '-'
+}
+
+void option_parser_t::error(const char* msg, const char* argv0, const char* arg)
+{
+  std::fprintf(stderr, "%s: %s %s\n", argv0, msg, arg != NULL ? arg : "");  // "<argv0>: <msg> <arg>"
+  if (helpmsg != NULL) helpmsg();  // show the help text when a callback was registered via help()
+  std::exit(1);                    // option errors are fatal
+}
diff --git a/fesvr/option_parser.h b/fesvr/option_parser.h
new file mode 100644
index 0000000000..b2cb8edf9d
--- /dev/null
+++ b/fesvr/option_parser.h
@@ -0,0 +1,31 @@
+// See LICENSE for license details.
+
+#ifndef _OPTION_PARSER_H
+#define _OPTION_PARSER_H
+
+#include <vector>
+#include <functional>
+
+class option_parser_t  // small command-line option parser: register options with option(), then call parse()
+{
+ public:
+  option_parser_t() : helpmsg(nullptr) {}
+  void help(void (*helpm)(void)) { helpmsg = helpm; }  // callback run by error() before it exits
+  void option(char c, const char* s, int arg, std::function<void(const char*)> action);
+  const char* const* parse(const char* const* argv0);
+ private:
+  struct option_t
+  {
+    char chr;         // short form ("-x"); 0 when the option has none
+    const char* str;  // long form ("--name"); may be null
+    int arg;          // nonzero when the option takes a value
+    std::function<void(const char*)> func;  // handler; receives the value, or NULL when there is none
+    option_t(char c, const char* s, int takes_arg, std::function<void(const char*)> handler)
+     : chr(c), str(s), arg(takes_arg), func(handler) {}
+  };
+  std::vector<option_t> opts;  // searched by parse() in registration order
+  void (*helpmsg)(void);       // null until help() is called
+  void error(const char* msg, const char* argv0, const char* arg);
+};
+
+#endif
diff --git a/fesvr/rfb.cc b/fesvr/rfb.cc
new file mode 100644
index 0000000000..2594a1b871
--- /dev/null
+++ b/fesvr/rfb.cc
@@ -0,0 +1,230 @@
+#include "rfb.h"
+#include "memif.h"
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sched.h>
+#include <netinet/in.h>
+#include <unistd.h>
+#include <cstdlib>
+#include <stdexcept>
+#include <string>
+#include <cstring>
+#include <cinttypes>
+using namespace std::placeholders;
+
+rfb_t::rfb_t(int display)
+  : sockfd(-1), afd(-1), memif(0), addr(0),
+    width(0), height(0), bpp(0), display(display),
+    thread(pthread_self()),  // placeholder id until handle_configure() starts the worker thread
+    fb1(0), fb2(0), read_pos(0), lock(PTHREAD_MUTEX_INITIALIZER)
+{
+  register_command(0, std::bind(&rfb_t::handle_configure, this, std::placeholders::_1), "configure");
+  register_command(1, std::bind(&rfb_t::handle_set_address, this, std::placeholders::_1), "set_address");
+}
+
+void* rfb_thread_main(void* arg)
+{
+  static_cast<rfb_t*>(arg)->thread_main();  // pthread start routine: arg is the rfb_t given to pthread_create()
+  return NULL;
+}
+
+void rfb_t::thread_main()
+{
+  // Serve VNC clients one after another.  The original restarted by calling thread_main() from its
+  // own tail, which added a stack frame per reconnect; this loop runs the same steps iteratively.
+  for (;;)
+  {
+    pthread_mutex_lock(&lock);  // held during the handshake, so tick()'s trylock skips fb_update()
+
+    int port = 5900 + display;
+    sockfd = socket(PF_INET, SOCK_STREAM, 0);
+    if (sockfd < 0)
+      throw std::runtime_error("could not acquire tcp socket");
+
+    struct sockaddr_in saddr = {}, caddr = {};  // zeroed: sin_zero/padding must not be stack garbage
+    saddr.sin_family = AF_INET;
+    saddr.sin_addr.s_addr = INADDR_ANY;
+    saddr.sin_port = htons(port);
+    if (bind(sockfd, (struct sockaddr*)&saddr, sizeof(saddr)) < 0)
+      throw std::runtime_error("could not bind to port " + std::to_string(port));
+    if (listen(sockfd, 0) < 0)
+      throw std::runtime_error("could not listen on port " + std::to_string(port));
+
+    socklen_t clen = sizeof(caddr);
+    afd = accept(sockfd, (struct sockaddr*)&caddr, &clen);
+    if (afd < 0)
+      throw std::runtime_error("could not accept connection");
+
+    std::string version = "RFB 003.003\n";
+    write(version);
+    if (read() != version)
+      throw std::runtime_error("bad client version");
+
+    write(str(uint32_t(htonl(1))));
+    read(); // clientinit
+
+    std::string serverinit;
+    serverinit += str(uint16_t(htons(width)));
+    serverinit += str(uint16_t(htons(height)));
+    serverinit += pixel_format();
+    std::string name = "RISC-V";
+    serverinit += str(uint32_t(htonl(name.length())));
+    serverinit += name;
+    write(serverinit);
+
+    pthread_mutex_unlock(&lock);
+    while (memif == NULL)  // wait until handle_set_address() has attached a memory interface
+      sched_yield();
+
+    while (memif != NULL)
+    {
+      std::string s = read();
+      if (s.length() < 4)
+        break; //throw std::runtime_error("bad command");
+      switch (s[0])
+      {
+        case 0: set_pixel_format(s); break;
+        case 2: set_encodings(s); break;
+        case 3: break;
+      }
+    }
+
+    pthread_mutex_lock(&lock);
+    close(afd);
+    close(sockfd);
+    afd = -1;
+    sockfd = -1;
+    pthread_mutex_unlock(&lock);
+  }
+}
+
+rfb_t::~rfb_t()
+{
+  memif = NULL;  // thread_main()'s command loop only runs while memif is non-null
+  // `thread` starts out as the constructing thread's id, so never try to join ourselves.
+  if (!pthread_equal(pthread_self(), thread)) pthread_join(thread, NULL);
+  delete [] fb1;
+  delete [] fb2;
+}
+
+void rfb_t::set_encodings(const std::string& s)
+{
+  uint16_t n = ntohs(*(const uint16_t*)&s[2]);  // s[2..3]: big-endian count of 4-byte encoding entries
+  for (size_t b = s.length(), got = 1; got && b < 4U+4U*n; b += got) got = read().length();  // drain the rest; an empty read (EOF) ends the loop instead of spinning
+}
+
+void rfb_t::set_pixel_format(const std::string& s)
+{
+  const bool supported = s.length() == 20 && s.substr(4, 16) == pixel_format();  // bytes 4..19 must equal our own format
+  if (!supported) throw std::runtime_error("bad pixel format");
+}
+
+void rfb_t::fb_update(const std::string& s)
+{
+  // Build one FramebufferUpdate message (RFB protocol) holding a single full-screen rectangle of
+  // fb1's pixels.  The parameter s is not used.
+  const std::string header =
+      str(uint8_t(0)) + str(uint8_t(0))                             // message type 0, padding
+    + str(uint16_t(htons(1)))                                       // number of rectangles
+    + str(uint16_t(htons(0))) + str(uint16_t(htons(0)))             // x, y
+    + str(uint16_t(htons(width))) + str(uint16_t(htons(height)))    // width, height
+    + str(uint32_t(htonl(0)));                                      // encoding type 0
+  const std::string pixels((char*)fb1, fb_bytes());
+
+  try
+  {
+    write(header + pixels);
+  }
+  catch (std::runtime_error& e)
+  {
+    // a failed send is ignored here; the frame is simply dropped
+  }
+}
+
+void rfb_t::tick()
+{
+  // Nothing to do until a geometry is configured and a memory interface is attached.
+  if (fb_bytes() == 0 || memif == NULL)
+    return;
+  // Copy the next FB_ALIGN-byte slice of the target frame into the back buffer (fb2).
+  memif->read(addr + read_pos, FB_ALIGN, const_cast<char*>(fb2 + read_pos));
+  read_pos = (read_pos + FB_ALIGN) % fb_bytes();
+  if (read_pos != 0)
+    return;  // frame not complete yet
+  // Whole frame captured: swap buffers, then send it unless the lock is currently held elsewhere.
+  std::swap(fb1, fb2);
+  if (pthread_mutex_trylock(&lock) != 0)
+    return;
+  fb_update("");
+  pthread_mutex_unlock(&lock);
+}
+
+std::string rfb_t::pixel_format()
+{
+  // 16-byte pixel-format block: 8 bits per colour channel, pixel size rounded up to a power of two.
+  const int red_bits = 8, green_bits = 8, blue_bits = 8;
+  const int depth = red_bits + green_bits + blue_bits;
+  int pixel_bits = depth;
+  while (pixel_bits & (pixel_bits-1)) pixel_bits++;  // 24 -> 32
+  std::string fmt = str(uint8_t(pixel_bits));
+  fmt += str(uint8_t(depth));
+  fmt += str(uint8_t(0)); // little-endian
+  fmt += str(uint8_t(1)); // true color
+  fmt += str(uint16_t(htons((1<<red_bits)-1)));   // per-channel maxima
+  fmt += str(uint16_t(htons((1<<green_bits)-1)));
+  fmt += str(uint16_t(htons((1<<blue_bits)-1)));
+  fmt += str(uint8_t(blue_bits+green_bits));      // channel shifts
+  fmt += str(uint8_t(blue_bits));
+  fmt += str(uint8_t(0));
+  fmt += str(uint16_t(0)); // pad
+  fmt += str(uint8_t(0)); // pad
+  return fmt;
+}
+
+void rfb_t::write(const std::string& s)
+{
+  const ssize_t sent = ::write(afd, s.c_str(), s.length());  // a short or failed write counts as an error
+  if ((size_t)sent != s.length()) throw std::runtime_error("could not write");
+}
+
+std::string rfb_t::read()
+{
+  char buf[2048];
+  const ssize_t got = ::read(afd, buf, sizeof(buf));
+  if (got < 0)
+    throw std::runtime_error("could not read");
+  // A completely filled buffer is treated as a message too large for this reader.
+  if (got == sizeof(buf)) throw std::runtime_error("received oversized packet");
+  return std::string(buf, got);  // empty when ::read returned 0
+}
+
+void rfb_t::handle_configure(command_t cmd)
+{
+  if (fb1)
+    throw std::runtime_error("you must only set the rfb configuration once");
+
+  // Payload bits: 15:0 -> width, 31:16 -> height, 47:32 -> bits per pixel (each field is 16 bits wide).
+  width = cmd.payload();
+  height = cmd.payload() >> 16;
+  bpp = cmd.payload() >> 32;
+  if (bpp != 32)
+    throw std::runtime_error("rfb requires 32 bpp true color");
+  if (fb_bytes() % FB_ALIGN != 0)
+    throw std::runtime_error("rfb size must be a multiple of " + std::to_string(FB_ALIGN));
+
+  const size_t nbytes = fb_bytes();  // one full frame; two buffers so tick() can fill one while the other is sent
+  fb1 = new char[nbytes];
+  fb2 = new char[nbytes];
+  if (pthread_create(&thread, 0, rfb_thread_main, this))
+    throw std::runtime_error("could not create thread");
+  cmd.respond(1);
+}
+
+void rfb_t::handle_set_address(command_t cmd)
+{
+  addr = cmd.payload();  // stored before it is validated
+  const bool aligned = addr % FB_ALIGN == 0;
+  if (!aligned) throw std::runtime_error("rfb address must be " + std::to_string(FB_ALIGN) + "-byte aligned");
+  memif = &cmd.memif();  // a non-null memif lets tick() start copying and thread_main() enter its command loop
+  cmd.respond(1);
+}
diff --git a/fesvr/rfb.h b/fesvr/rfb.h
new file mode 100644
index 0000000000..263663a24d
--- /dev/null
+++ b/fesvr/rfb.h
@@ -0,0 +1,53 @@
+#ifndef _RFB_H
+#define _RFB_H
+
+#include "device.h"
+#include "memif.h"
+#include <pthread.h>
+
+// remote frame buffer
+class rfb_t : public device_t  // remote frame buffer device: serves the target's frame buffer over a TCP socket
+{
+ public:
+  rfb_t(int display = 0);  // thread_main() listens on TCP port 5900 + display
+  ~rfb_t();
+  void tick();  // copies FB_ALIGN bytes of the frame per call; sends an update once a frame is complete
+  std::string name() { return "RISC-V"; }
+  const char* identity() { return "rfb"; }
+
+ private:
+  template <typename T>
+  std::string str(T x)  // the object bytes of x exactly as stored in host memory
+  {
+    return std::string((char*)&x, sizeof(x));
+  }
+  size_t fb_bytes() { return size_t(width) * height * bpp/8; }  // bytes in one frame; 0 until configured
+  void thread_main();  // server loop run on the worker thread
+  friend void* rfb_thread_main(void*);  // pthread entry point that calls thread_main()
+  std::string pixel_format();  // the 16-byte pixel format this server supports
+  void fb_update(const std::string& s);  // sends fb1 as one full-screen update; s is unused
+  void set_encodings(const std::string& s);  // drains a client SetEncodings message
+  void set_pixel_format(const std::string& s);  // throws unless the client asks for pixel_format()
+  void write(const std::string& s);  // writes s to afd; throws on a short or failed write
+  std::string read();  // one ::read from afd, at most 2047 bytes
+  void handle_configure(command_t cmd);  // device command 0
+  void handle_set_address(command_t cmd);  // device command 1
+
+  int sockfd;  // listening socket, -1 when closed
+  int afd;  // accepted client connection, -1 when closed
+  memif_t* memif;  // null until handle_set_address(); also polled by the worker thread
+  reg_t addr;  // target address of the frame buffer
+  uint16_t width;
+  uint16_t height;
+  uint16_t bpp;  // bits per pixel; handle_configure() accepts only 32
+  int display;
+  pthread_t thread;  // worker thread; holds the constructing thread's id until one is created
+  volatile char* volatile fb1;  // front buffer: the frame fb_update() sends
+  volatile char* volatile fb2;  // back buffer: the frame tick() is filling; NOTE(review): volatile is not a synchronisation primitive
+  size_t read_pos;  // byte offset of the next slice tick() will copy
+  pthread_mutex_t lock;  // held by thread_main() during handshake/teardown; tick() only trylocks it
+
+  static const int FB_ALIGN = 256;  // slice size used by tick(); frame size and address must be multiples of it
+};
+
+#endif
diff --git a/fesvr/syscall.cc b/fesvr/syscall.cc
new file mode 100644
index 0000000000..f0bdd259bd
--- /dev/null
+++ b/fesvr/syscall.cc
@@ -0,0 +1,395 @@
+// See LICENSE for license details.
+
+#include "syscall.h"
+#include "htif.h"
+#include "byteorder.h"
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <termios.h>
+#include <sstream>
+#include <iostream>
+using namespace std::placeholders;
+
+#define RISCV_AT_FDCWD -100
+
+struct riscv_stat  // stat record in the layout the *stat proxies copy into target memory
+{
+  uint64_t dev;
+  uint64_t ino;
+  uint32_t mode;
+  uint32_t nlink;
+  uint32_t uid;
+  uint32_t gid;
+  uint64_t rdev;
+  uint64_t __pad1;
+  uint64_t size;
+  uint32_t blksize;
+  uint32_t __pad2;
+  uint64_t blocks;
+  uint64_t atime;
+  uint64_t __pad3;
+  uint64_t mtime;
+  uint64_t __pad4;
+  uint64_t ctime;
+  uint64_t __pad5;
+  uint32_t __unused4;
+  uint32_t __unused5;
+
+  // Field-by-field conversion from the host's struct stat; pad and unused words are zeroed.
+  riscv_stat(const struct stat& s)
+    : dev(s.st_dev), ino(s.st_ino), mode(s.st_mode), nlink(s.st_nlink), uid(s.st_uid), gid(s.st_gid),
+      rdev(s.st_rdev), __pad1(0), size(s.st_size), blksize(s.st_blksize), __pad2(0), blocks(s.st_blocks),
+      atime(s.st_atime), __pad3(0), mtime(s.st_mtime), __pad4(0), ctime(s.st_ctime), __pad5(0),
+      __unused4(0), __unused5(0)
+  {}
+};
+
+syscall_t::syscall_t(htif_t* htif)
+  : htif(htif), memif(&htif->memif()), table(2048)
+{
+  // Syscall numbers this proxy implements; every other table slot stays null and is
+  // rejected by dispatch().
+  static const struct { size_t num; syscall_func_t fn; } handlers[] = {
+    {17, &syscall_t::sys_getcwd},      {25, &syscall_t::sys_fcntl},
+    {34, &syscall_t::sys_mkdirat},     {35, &syscall_t::sys_unlinkat},
+    {37, &syscall_t::sys_linkat},      {38, &syscall_t::sys_renameat},
+    {46, &syscall_t::sys_ftruncate},   {48, &syscall_t::sys_faccessat},
+    {49, &syscall_t::sys_chdir},       {56, &syscall_t::sys_openat},
+    {57, &syscall_t::sys_close},       {62, &syscall_t::sys_lseek},
+    {63, &syscall_t::sys_read},        {64, &syscall_t::sys_write},
+    {67, &syscall_t::sys_pread},       {68, &syscall_t::sys_pwrite},
+    {79, &syscall_t::sys_fstatat},     {80, &syscall_t::sys_fstat},
+    {93, &syscall_t::sys_exit},        {1039, &syscall_t::sys_lstat},
+    {2011, &syscall_t::sys_getmainvars},
+  };
+
+  for (const auto& h : handlers)
+    table[h.num] = h.fn;
+
+  register_command(0, std::bind(&syscall_t::handle_syscall, this, _1), "syscall");
+
+  // Private duplicates of the host's stdin/stdout back the first three descriptors handed out.
+  const int stdin_fd = dup(0);
+  const int stdout_fd0 = dup(1);
+  const int stdout_fd1 = dup(1);
+  if (stdin_fd < 0 || stdout_fd0 < 0 || stdout_fd1 < 0)
+    throw std::runtime_error("could not dup stdin/stdout");
+
+  fds.alloc(stdin_fd); // stdin -> stdin
+  fds.alloc(stdout_fd0); // stdout -> stdout
+  fds.alloc(stdout_fd1); // stderr -> stdout
+}
+
+std::string syscall_t::do_chroot(const char* fn)
+{
+  // Only absolute paths are re-rooted, and only when a chroot directory has been set.
+  const bool rebase = !chroot.empty() && *fn == '/';
+  return rebase ? chroot + fn : std::string(fn);
+}
+
+std::string syscall_t::undo_chroot(const char* fn)
+{
+  if (chroot.empty())
+    return fn;
+  // Strip the chroot prefix but keep the '/' that follows it; host paths outside the chroot collapse to "/".
+  const bool slash_terminated = chroot.back() == '/';
+  const bool inside = strncmp(fn, chroot.c_str(), chroot.size()) == 0 && (slash_terminated || fn[chroot.size()] == '/');
+  return inside ? std::string(fn + chroot.size() - slash_terminated) : std::string("/");
+}
+
+void syscall_t::handle_syscall(command_t cmd)
+{
+  const bool is_exit_report = cmd.payload() & 1;  // low bit set: test pass/fail report, not a syscall
+  if (!is_exit_report)
+  {
+    dispatch(cmd.payload());  // proxied system call: the payload is the address of its argument block
+    cmd.respond(1);
+    return;
+  }
+
+  htif->exitcode = cmd.payload();  // no response is sent for an exit report
+  if (htif->exit_code())
+    std::cerr << "*** FAILED *** (tohost = " << htif->exit_code() << ")" << std::endl;
+}
+
+reg_t syscall_t::sys_exit(reg_t code, reg_t a1, reg_t a2, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  htif->exitcode = (code << 1) | 1;  // low bit set, the same form handle_syscall() treats as a pass/fail report
+  return 0;
+}
+
+static reg_t sysret_errno(sreg_t ret)
+{
+  return ret != -1 ? reg_t(ret) : reg_t(-errno);  // -1 becomes the negated host errno; any other value passes through
+}
+
+reg_t syscall_t::sys_read(reg_t fd, reg_t pbuf, reg_t len, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  // Read on the host into a bounce buffer, then copy only the bytes actually read into target memory.
+  std::vector<char> buf(len);
+  ssize_t ret = read(fds.lookup(fd), buf.data(), len);  // data() is valid for len == 0; &buf[0] is not
+  reg_t ret_errno = sysret_errno(ret);  // errno is sampled before the copy-out below
+  if (ret > 0) memif->write(pbuf, ret, buf.data());
+  return ret_errno;
+}
+
+reg_t syscall_t::sys_pread(reg_t fd, reg_t pbuf, reg_t len, reg_t off, reg_t a4, reg_t a5, reg_t a6)
+{
+  // Positional read on the host into a bounce buffer; only the bytes actually read reach target memory.
+  std::vector<char> buf(len);
+  ssize_t ret = pread(fds.lookup(fd), buf.data(), len, off);  // data() is valid for len == 0; &buf[0] is not
+  reg_t ret_errno = sysret_errno(ret);  // errno is sampled before the copy-out below
+  if (ret > 0) memif->write(pbuf, ret, buf.data());
+  return ret_errno;
+}
+
+reg_t syscall_t::sys_write(reg_t fd, reg_t pbuf, reg_t len, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  // Stage the target's bytes in a host buffer, then hand them to the host's write().
+  std::vector<char> buf(len);
+  memif->read(pbuf, len, buf.data());  // data() is valid for a zero-length write; &buf[0] is not
+  return sysret_errno(write(fds.lookup(fd), buf.data(), len));
+}
+
+reg_t syscall_t::sys_pwrite(reg_t fd, reg_t pbuf, reg_t len, reg_t off, reg_t a4, reg_t a5, reg_t a6)
+{
+  // Stage the target's bytes in a host buffer, then hand them to the host's pwrite() at offset off.
+  std::vector<char> buf(len);
+  memif->read(pbuf, len, buf.data());  // data() is valid for a zero-length write; &buf[0] is not
+  return sysret_errno(pwrite(fds.lookup(fd), buf.data(), len, off));
+}
+
+reg_t syscall_t::sys_close(reg_t fd, reg_t a1, reg_t a2, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  const int host_fd = fds.lookup(fd);
+  if (close(host_fd) < 0) return sysret_errno(-1);  // the slot stays allocated when the host close fails
+  fds.dealloc(fd);
+  return 0;
+}
+
+reg_t syscall_t::sys_lseek(reg_t fd, reg_t ptr, reg_t dir, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  return sysret_errno(lseek(fds.lookup(fd), off_t(ptr), int(dir)));  // ptr is the offset, dir the whence value
+}
+
+reg_t syscall_t::sys_fstat(reg_t fd, reg_t pbuf, reg_t a2, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  struct stat buf;
+  const int rc = fstat(fds.lookup(fd), &buf);
+  if (rc != 0)  // test the host result: sysret_errno() yields -errno, which equals -1 only when errno is 1
+    return sysret_errno(rc);  // buf is uninitialised here, so nothing is copied to the target
+  // Success: convert to the target's stat layout and copy it out.
+  riscv_stat rbuf(buf);
+  memif->write(pbuf, sizeof(rbuf), &rbuf);
+  return 0;
+}
+
+reg_t syscall_t::sys_fcntl(reg_t fd, reg_t cmd, reg_t arg, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  return sysret_errno(fcntl(fds.lookup(fd), int(cmd), arg));  // cmd and arg are forwarded to the host untranslated
+}
+
+reg_t syscall_t::sys_ftruncate(reg_t fd, reg_t len, reg_t a2, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  return sysret_errno(ftruncate(fds.lookup(fd), off_t(len)));  // len is the new file length in bytes
+}
+
+reg_t syscall_t::sys_lstat(reg_t pname, reg_t len, reg_t pbuf, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  // std::string always keeps a NUL after its len bytes and is valid for len == 0.
+  std::string name(len, '\0');
+  memif->read(pname, len, &name[0]);
+
+  struct stat buf;
+  const int rc = lstat(do_chroot(name.c_str()).c_str(), &buf);
+  if (rc != 0)  // test the host result: sysret_errno() yields -errno, which equals -1 only when errno is 1
+    return sysret_errno(rc);  // buf is uninitialised here, so nothing is converted or copied out
+
+  riscv_stat rbuf(buf);
+  memif->write(pbuf, sizeof(rbuf), &rbuf);
+  return 0;
+}
+
+#define AT_SYSCALL(syscall, fd, name, ...) \
+  (syscall(fds.lookup(fd), int(fd) == RISCV_AT_FDCWD ? do_chroot(name).c_str() : (name), __VA_ARGS__))  // name goes through do_chroot() only when fd is RISCV_AT_FDCWD; `name` is expanded twice
+
+reg_t syscall_t::sys_openat(reg_t dirfd, reg_t pname, reg_t len, reg_t flags, reg_t mode, reg_t a5, reg_t a6)
+{
+  std::string name(len, '\0');  // std::string keeps a NUL after the len target bytes and is valid for len == 0
+  memif->read(pname, len, &name[0]);
+  const int fd = AT_SYSCALL(openat, dirfd, name.c_str(), flags, mode);
+  if (fd < 0)
+    return sysret_errno(-1);
+  return fds.alloc(fd);  // the target receives the table index, not the host descriptor
+}
+
+reg_t syscall_t::sys_fstatat(reg_t dirfd, reg_t pname, reg_t len, reg_t pbuf, reg_t flags, reg_t a5, reg_t a6)
+{
+  std::string name(len, '\0');  // std::string keeps a NUL after the len target bytes and is valid for len == 0
+  memif->read(pname, len, &name[0]);
+
+  struct stat buf;
+  const int rc = AT_SYSCALL(fstatat, dirfd, name.c_str(), &buf, flags);
+  if (rc != 0)  // test the host result: sysret_errno() yields -errno, which equals -1 only when errno is 1
+    return sysret_errno(rc);
+  // Success: convert to the target's stat layout and copy it out.
+  riscv_stat rbuf(buf);
+  memif->write(pbuf, sizeof(rbuf), &rbuf);
+  return 0;
+}
+
+reg_t syscall_t::sys_faccessat(reg_t dirfd, reg_t pname, reg_t len, reg_t mode, reg_t a4, reg_t a5, reg_t a6)
+{
+  std::string name(len, '\0');  // std::string keeps a NUL after the len target bytes and is valid for len == 0
+  memif->read(pname, len, &name[0]);
+  return sysret_errno(AT_SYSCALL(faccessat, dirfd, name.c_str(), mode, 0));
+}
+
+reg_t syscall_t::sys_renameat(reg_t odirfd, reg_t popath, reg_t olen, reg_t ndirfd, reg_t pnpath, reg_t nlen, reg_t a6)
+{
+  std::string opath(olen, '\0'), npath(nlen, '\0');  // std::string: always NUL-terminated, valid for length 0
+  memif->read(popath, olen, &opath[0]);
+  memif->read(pnpath, nlen, &npath[0]);
+  return sysret_errno(renameat(fds.lookup(odirfd), int(odirfd) == RISCV_AT_FDCWD ? do_chroot(opath.c_str()).c_str() : opath.c_str(),
+                             fds.lookup(ndirfd), int(ndirfd) == RISCV_AT_FDCWD ? do_chroot(npath.c_str()).c_str() : npath.c_str()));
+}
+
+reg_t syscall_t::sys_linkat(reg_t odirfd, reg_t poname, reg_t olen, reg_t ndirfd, reg_t pnname, reg_t nlen, reg_t flags)
+{
+  std::string oname(olen, '\0'), nname(nlen, '\0');  // std::string: always NUL-terminated, valid for length 0
+  memif->read(poname, olen, &oname[0]);
+  memif->read(pnname, nlen, &nname[0]);
+  return sysret_errno(linkat(fds.lookup(odirfd), int(odirfd) == RISCV_AT_FDCWD ? do_chroot(oname.c_str()).c_str() : oname.c_str(),
+                             fds.lookup(ndirfd), int(ndirfd) == RISCV_AT_FDCWD ? do_chroot(nname.c_str()).c_str() : nname.c_str(),
+                             flags));
+}
+
+reg_t syscall_t::sys_unlinkat(reg_t dirfd, reg_t pname, reg_t len, reg_t flags, reg_t a4, reg_t a5, reg_t a6)
+{
+  std::string name(len, '\0');  // std::string keeps a NUL after the len target bytes and is valid for len == 0
+  memif->read(pname, len, &name[0]);
+  return sysret_errno(AT_SYSCALL(unlinkat, dirfd, name.c_str(), flags));
+}
+
+reg_t syscall_t::sys_mkdirat(reg_t dirfd, reg_t pname, reg_t len, reg_t mode, reg_t a4, reg_t a5, reg_t a6)
+{
+  std::string name(len, '\0');  // std::string keeps a NUL after the len target bytes and is valid for len == 0
+  memif->read(pname, len, &name[0]);
+  return sysret_errno(AT_SYSCALL(mkdirat, dirfd, name.c_str(), mode));
+}
+
+reg_t syscall_t::sys_getcwd(reg_t pbuf, reg_t size, reg_t a2, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  // A zero-byte buffer can never hold a path; reject it up front (getcwd reports EINVAL for
+  // size 0) instead of indexing an empty vector below.
+  if (size == 0) return -EINVAL;
+  std::vector<char> buf(size);
+  if (getcwd(&buf[0], size) == NULL) return sysret_errno(-1);
+  std::string tmp = undo_chroot(&buf[0]);  // host cwd with the chroot prefix removed
+  if (size <= tmp.size()) return -ENOMEM;
+  memif->write(pbuf, tmp.size() + 1, &tmp[0]);  // +1 copies the terminating NUL as well
+  return tmp.size() + 1;
+}
+
+reg_t syscall_t::sys_getmainvars(reg_t pbuf, reg_t limit, reg_t a2, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  // Image written at pbuf: argc, argv[0..argc-1], 0 (argv terminator), 0 (empty envp), then the
+  // NUL-terminated argument strings back to back.  Pointers are target addresses; words go through to_le().
+  const std::vector<std::string> args = htif->target_args();
+  const size_t argc = args.size();
+  std::vector<uint64_t> words(argc + 3);  // value-initialised, so both terminators are already 0
+  words[0] = to_le(args.size());
+  std::vector<size_t> offsets(argc);      // byte offset of each string inside the image
+  size_t sz = (args.size() + 3) * sizeof(words[0]);
+  for (size_t i = 0; i < argc; i++)
+  {
+    offsets[i] = sz;
+    words[i+1] = to_le(sz + pbuf);
+    sz += args[i].length() + 1;
+  }
+
+  std::vector<char> bytes(sz);
+  memcpy(&bytes[0], &words[0], sizeof(words[0]) * words.size());
+  for (size_t i = 0; i < argc; i++)
+    strcpy(&bytes[offsets[i]], args[i].c_str());
+  if (bytes.size() > limit)
+    return -ENOMEM;  // the target's buffer is too small; nothing is written
+  memif->write(pbuf, bytes.size(), &bytes[0]);
+  return 0;
+}
+
+reg_t syscall_t::sys_chdir(reg_t path, reg_t a1, reg_t a2, reg_t a3, reg_t a4, reg_t a5, reg_t a6)
+{
+  // Copy the NUL-terminated path out of target memory in one pass.  (The earlier version measured it,
+  // then read it again into a fixed-size buffer.)  NOTE(review): unlike the *at calls, no do_chroot() here.
+  std::vector<char> buf;
+  for (reg_t offset = 0;; offset++)
+  {
+    const char c = memif->read_uint8(path + offset);
+    buf.push_back(c);
+    if (!c)
+      break;
+  }
+  return sysret_errno(chdir(buf.data()));
+}
+
+void syscall_t::dispatch(reg_t mm)
+{
+  reg_t magicmem[8];  // [0] = syscall number, [1..7] = its arguments; each word goes through from_le()
+  memif->read(mm, sizeof(magicmem), magicmem);
+  const reg_t n = from_le(magicmem[0]);
+  if (!(n < table.size() && table[n]))
+    throw std::runtime_error("bad syscall #" + std::to_string(n));
+
+  reg_t args[7];
+  for (size_t i = 0; i < 7; i++) args[i] = from_le(magicmem[i+1]);
+  magicmem[0] = to_le((this->*table[n])(args[0], args[1], args[2], args[3], args[4], args[5], args[6]));  // result replaces the number
+  memif->write(mm, sizeof(magicmem), magicmem);
+}
+
+reg_t fds_t::alloc(int fd)
+{
+  // Reuse the lowest free slot (marked -1); grow the table by one when every slot is taken.
+  reg_t slot = 0;
+  while (slot < fds.size() && fds[slot] != -1)
+    slot++;
+
+  if (slot == fds.size())
+    fds.push_back(fd);
+  else
+    fds[slot] = fd;
+  return slot;
+}
+
+void fds_t::dealloc(reg_t fd)
+{
+  if (fd < fds.size()) fds[fd] = -1;  // out-of-range indices are ignored (same bound lookup() applies) rather than written past the table
+}
+
+int fds_t::lookup(reg_t fd)
+{
+  if (int(fd) == RISCV_AT_FDCWD) return AT_FDCWD;  // translate the target's cwd sentinel to the host's
+  if (fd >= fds.size()) return -1;                  // index outside the table
+  return fds[fd];                                   // may itself be -1 for a freed slot
+}
+
+void syscall_t::set_chroot(const char* where)
+{
+  // Resolve `where` to an absolute path by visiting it and asking for the cwd, then go back.
+  char origin[PATH_MAX], resolved[PATH_MAX];
+  const bool ok = getcwd(origin, sizeof(origin)) != NULL
+               && chdir(where) == 0
+               && getcwd(resolved, sizeof(resolved)) != NULL
+               && chdir(origin) == 0;
+  if (!ok)
+  {
+    fprintf(stderr, "could not chroot to %s\n", where);
+    exit(-1);
+  }
+  chroot = resolved;
+}
diff --git a/fesvr/syscall.h b/fesvr/syscall.h
new file mode 100644
index 0000000000..82946969b3
--- /dev/null
+++ b/fesvr/syscall.h
@@ -0,0 +1,72 @@
+// See LICENSE for license details.
+
+#ifndef __SYSCALL_H
+#define __SYSCALL_H
+
+#include "device.h"
+#include "memif.h"
+#include <vector>
+#include <string>
+
+class syscall_t;
+typedef reg_t (syscall_t::*syscall_func_t)(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+
+class htif_t;
+class memif_t;
+
+class fds_t  // table mapping target-side descriptor numbers (indices) to host file descriptors
+{
+ public:
+  reg_t alloc(int fd);  // stores fd in the lowest free slot and returns that slot's index
+  void dealloc(reg_t fd);  // marks slot fd as free (-1)
+  int lookup(reg_t fd);  // host descriptor for slot fd, AT_FDCWD for the cwd sentinel, -1 if out of range
+ private:
+  std::vector<int> fds;  // host descriptors indexed by target descriptor; -1 marks a free slot
+};
+
+class syscall_t : public device_t  // device that proxies the target's system calls to the host
+{
+ public:
+  syscall_t(htif_t*);  // fills the syscall table and reserves descriptors 0..2 for stdin/stdout/stderr
+
+  void set_chroot(const char* where);  // resolves `where` to an absolute path; exits the process on failure
+  
+ private:
+  const char* identity() { return "syscall_proxy"; }
+
+  htif_t* htif;  // used for the exit code and the target's argument list
+  memif_t* memif;  // target memory accessor obtained from htif
+  std::vector<syscall_func_t> table;  // indexed by syscall number; null entries are unsupported
+  fds_t fds;  // target descriptor table
+
+  void handle_syscall(command_t cmd);  // device command 0: exit report or proxied syscall
+  void dispatch(addr_t mm);  // runs the syscall described by the 8-word block at target address mm
+
+  std::string chroot;  // absolute host directory used as the target's root; empty when unset
+  std::string do_chroot(const char* fn);  // prefixes absolute paths with chroot
+  std::string undo_chroot(const char* fn);  // strips the chroot prefix from a host path
+
+  reg_t sys_exit(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);  // every handler takes the seven raw argument words
+  reg_t sys_openat(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_read(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_pread(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_write(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_pwrite(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_close(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_lseek(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_fstat(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_lstat(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_fstatat(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_faccessat(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_fcntl(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_ftruncate(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_renameat(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_linkat(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_unlinkat(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_mkdirat(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_getcwd(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_getmainvars(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+  reg_t sys_chdir(reg_t, reg_t, reg_t, reg_t, reg_t, reg_t, reg_t);
+};
+
+#endif
diff --git a/fesvr/term.cc b/fesvr/term.cc
new file mode 100644
index 0000000000..c4cba0c07c
--- /dev/null
+++ b/fesvr/term.cc
@@ -0,0 +1,53 @@
+#include "term.h"
+#include <termios.h>
+#include <unistd.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdlib.h>
+
+class canonical_termios_t
+{
+ public:
+  // Turn off line buffering (ICANON) and echo on stdin (fd 0), remembering the previous settings.
+  canonical_termios_t()
+   : restore_tios(false)
+  {
+    if (tcgetattr(0, &old_tios) != 0)
+      return;  // settings could not be read (e.g. stdin is not a terminal): leave everything alone
+    struct termios raw = old_tios;
+    raw.c_lflag &= ~(ICANON | ECHO);
+    restore_tios = tcsetattr(0, TCSANOW, &raw) == 0;
+  }
+
+  // Put the saved settings back, but only if the constructor actually changed them.
+  ~canonical_termios_t()
+  {
+    if (restore_tios) tcsetattr(0, TCSANOW, &old_tios);
+  }
+
+ private:
+  struct termios old_tios;  // settings in effect before construction
+  bool restore_tios;        // true once the modified settings were applied
+};
+
+static canonical_termios_t tios; // exit() will clean up for us
+
+int canonical_terminal_t::read()
+{
+  // Non-blocking: poll stdin with a zero timeout and report -1 when no byte is ready.
+  struct pollfd pfd;
+  pfd.fd = 0;
+  pfd.events = POLLIN;
+  const bool ready = poll(&pfd, 1, 0) > 0 && (pfd.revents & POLLIN);
+  if (!ready)
+    return -1;
+  unsigned char ch;
+  if (::read(0, &ch, 1) <= 0) return -1;  // end of input or read error
+  return ch;
+}
+
+void canonical_terminal_t::write(char ch)
+{
+  const ssize_t written = ::write(1, &ch, 1);  // one byte to stdout (fd 1)
+  if (written != 1) abort();                   // a failed console write is treated as fatal
+}
diff --git a/fesvr/term.h b/fesvr/term.h
new file mode 100644
index 0000000000..7a2c22fc28
--- /dev/null
+++ b/fesvr/term.h
@@ -0,0 +1,11 @@
+#ifndef _TERM_H
+#define _TERM_H
+
+class canonical_terminal_t  // static helpers for single-character console I/O on the host's stdin/stdout
+{
+ public:
+  static int read();  // next byte from stdin if one is ready, otherwise -1 (never blocks)
+  static void write(char);  // writes one byte to stdout; aborts if the write fails
+};
+
+#endif
diff --git a/fesvr/tsi.cc b/fesvr/tsi.cc
new file mode 100644
index 0000000000..5ccafc4b77
--- /dev/null
+++ b/fesvr/tsi.cc
@@ -0,0 +1,115 @@
+#include "tsi.h"
+#include <cstdio>
+#include <cstdlib>
+
+#define NHARTS_MAX 16
+
+void tsi_t::host_thread(void *arg)
+{
+  tsi_t *self = static_cast<tsi_t*>(arg);
+  self->run();
+  // run() has returned: from now on just keep handing control back to the target context.
+  for (;;)
+    self->target->switch_to();
+}
+
+tsi_t::tsi_t(int argc, char** argv)
+  : htif_t(argc, argv), target(context_t::current())  // the context constructing this object is the target side
+{
+  host.init(host_thread, this);  // NOTE(review): presumably host_thread(this) runs when `host` is first switched to -- confirm
+}
+
+tsi_t::~tsi_t()
+{
+}  // nothing to release explicitly; members clean up after themselves
+
+#define MSIP_BASE 0x2000000
+
+// Interrupt core 0 so that it starts executing the program in DRAM:
+// write the 32-bit value 1 to the word at MSIP_BASE.
+void tsi_t::reset()
+{
+  const uint32_t msip_set = 1;
+  write_chunk(MSIP_BASE, sizeof(msip_set), &msip_set);
+}
+
+void tsi_t::push_addr(addr_t addr)
+{
+  // Queue addr on in_data as SAI_ADDR_CHUNKS 32-bit words,
+  // least-significant word first.
+  for (int i = 0; i < SAI_ADDR_CHUNKS; i++, addr >>= 32)
+    in_data.push_back(addr & 0xffffffff);
+}
+
+void tsi_t::push_len(addr_t len)
+{
+  // Queue len on in_data as SAI_LEN_CHUNKS 32-bit words,
+  // least-significant word first.
+  for (int i = 0; i < SAI_LEN_CHUNKS; i++, len >>= 32)
+    in_data.push_back(len & 0xffffffff);
+}
+
+void tsi_t::read_chunk(addr_t taddr, size_t nbytes, void* dst)
+{
+  // Request: command word, address words, then (word count - 1).  Any remainder of nbytes / 4 is dropped.
+  const size_t nwords = nbytes / sizeof(uint32_t);
+  in_data.push_back(SAI_CMD_READ);
+  push_addr(taddr);
+  push_len(nwords - 1);
+  // Response: one word per requested word; yield to the target until each one has arrived.
+  uint32_t *out = static_cast<uint32_t*>(dst);
+  for (uint32_t *end = out + nwords; out != end; ++out)
+  {
+    while (out_data.empty()) switch_to_target();
+    *out = out_data.front();
+    out_data.pop_front();
+  }
+}
+
+void tsi_t::write_chunk(addr_t taddr, size_t nbytes, const void* src)
+{
+  // Request: command word, address words, (word count - 1), then the payload words themselves.
+  const size_t nwords = nbytes / sizeof(uint32_t);
+  const uint32_t *first = static_cast<const uint32_t*>(src);
+
+  in_data.push_back(SAI_CMD_WRITE);
+  push_addr(taddr);
+  push_len(nwords - 1);
+  in_data.insert(in_data.end(), first, first + nwords);
+}
+
+void tsi_t::send_word(uint32_t word)
+{
+  out_data.emplace_back(word);  // appended to the queue read_chunk() consumes
+}
+
+uint32_t tsi_t::recv_word()
+{
+  const uint32_t head = in_data.front();  // caller must check data_available() first: front() on an empty deque is undefined
+  in_data.pop_front();
+  return head;
+}
+
+bool tsi_t::data_available()
+{
+  return in_data.size() > 0;  // true while words are queued for the target
+}
+
+void tsi_t::switch_to_host()
+{
+  host.switch_to();  // resume the host-side context set up in the constructor
+}
+
+void tsi_t::switch_to_target()
+{
+  target->switch_to();  // resume the context that constructed this object
+}
+
+void tsi_t::tick(bool out_valid, uint32_t out_bits, bool in_ready)
+{
+  // Target -> host: latch the offered word when both sides of the handshake agree.
+  const bool out_fire = out_valid && out_ready();
+  if (out_fire) out_data.push_back(out_bits);
+  // Host -> target: the word at the head of in_data was accepted, so retire it.
+  const bool in_fire = in_valid() && in_ready;
+  if (in_fire) in_data.pop_front();
+}
diff --git a/fesvr/tsi.h b/fesvr/tsi.h
new file mode 100644
index 0000000000..825a3a0038
--- /dev/null
+++ b/fesvr/tsi.h
@@ -0,0 +1,57 @@
+#ifndef __SAI_H
+#define __SAI_H
+
+#include "htif.h"
+#include "context.h"
+
+#include <string>
+#include <vector>
+#include <deque>
+#include <stdint.h>
+
+#define SAI_CMD_READ 0
+#define SAI_CMD_WRITE 1
+
+#define SAI_ADDR_CHUNKS 2
+#define SAI_LEN_CHUNKS 2
+
+class tsi_t : public htif_t  // htif_t variant that exchanges 32-bit words with the target through two queues
+{
+ public:
+  tsi_t(int argc, char** argv);
+  virtual ~tsi_t();
+
+  bool data_available();  // true while in_data holds words for the target
+  void send_word(uint32_t word);  // appends a word from the target to out_data
+  uint32_t recv_word();  // pops the next word for the target; in_data must not be empty
+  void switch_to_host();  // resumes the host context
+
+  uint32_t in_bits() { return in_data.front(); }  // head of in_data; only meaningful while in_valid()
+  bool in_valid() { return !in_data.empty(); }
+  bool out_ready() { return true; }  // out_data is unbounded, so a word can always be accepted
+  void tick(bool out_valid, uint32_t out_bits, bool in_ready);  // one handshake step in each direction
+
+ protected:
+  void reset() override;  // writes 1 to the word at MSIP_BASE
+  void read_chunk(addr_t taddr, size_t nbytes, void* dst) override;  // queues a read request and waits for the reply words
+  void write_chunk(addr_t taddr, size_t nbytes, const void* src) override;  // queues a write request with its payload
+  void switch_to_target();  // resumes the target context
+
+  size_t chunk_align() override { return 4; }  // transfers are counted in 32-bit words
+  size_t chunk_max_size() override { return 1024; }
+
+  int get_ipi_addrs(addr_t *addrs);  // NOTE(review): declared here, but tsi.cc in this diff has no definition
+
+ private:
+  context_t host;  // context that runs host_thread()
+  context_t* target;  // context that constructed this object
+  std::deque<uint32_t> in_data;  // words travelling host -> target
+  std::deque<uint32_t> out_data;  // words travelling target -> host
+
+  void push_addr(addr_t addr);  // queues addr as SAI_ADDR_CHUNKS words
+  void push_len(addr_t len);  // queues len as SAI_LEN_CHUNKS words
+
+  static void host_thread(void *tsi);  // entry point of the host context; tsi is the tsi_t instance
+};
+
+#endif
diff --git a/riscv-disasm.pc.in b/riscv-disasm.pc.in
new file mode 100644
index 0000000000..8e022e930f
--- /dev/null
+++ b/riscv-disasm.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@prefix@
+libdir=${prefix}/@libdir@
+includedir=${prefix}/@includedir@
+
+Name: riscv-disasm
+Description: RISC-V disassembler
+Version: git
+Libs: -Wl,-rpath,${libdir} -L${libdir} -ldisasm
+Cflags: -I${includedir} 
+URL: http://riscv.org/download.html#tab_disasm
diff --git a/riscv-dummy_rocc.pc.in b/riscv-dummy_rocc.pc.in
deleted file mode 100644
index 31635f6dad..0000000000
--- a/riscv-dummy_rocc.pc.in
+++ /dev/null
@@ -1,11 +0,0 @@
-prefix=@prefix@
-exec_prefix=@prefix@
-libdir=${prefix}/@libdir@
-includedir=${prefix}/@includedir@
-
-Name: riscv-dummy_rocc
-Description: Example RISC-V ROCC accelerator
-Version: git
-Libs: -Wl,-rpath,${libdir} -L${libdir} -ldummy_rocc
-Cflags: -I${includedir}
-URL: http://riscv.org/download.html#tab_spike
diff --git a/riscv-fesvr.pc.in b/riscv-fesvr.pc.in
new file mode 100644
index 0000000000..efd7eed1e3
--- /dev/null
+++ b/riscv-fesvr.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@prefix@
+libdir=${prefix}/@libdir@
+includedir=${prefix}/@includedir@
+
+Name: riscv-fesvr
+Description: RISC-V front-end server
+Version: git
+Libs: -Wl,-rpath,${libdir} -L${libdir} -lfesvr
+Cflags: -I${includedir} 
+URL: http://riscv.org/download.html#tab_fesvr
diff --git a/riscv-riscv.pc.in b/riscv-riscv.pc.in
deleted file mode 100644
index 5e86b1c448..0000000000
--- a/riscv-riscv.pc.in
+++ /dev/null
@@ -1,11 +0,0 @@
-prefix=@prefix@
-exec_prefix=@prefix@
-libdir=${prefix}/@libdir@
-includedir=${prefix}/@includedir@
-
-Name: riscv-riscv
-Description: RISC-V 
-Version: git
-Libs: -Wl,-rpath,${libdir} -L${libdir} -lriscv
-Cflags: -I${includedir}
-URL: http://riscv.org/download.html#tab_spike
diff --git a/riscv-softfloat.pc.in b/riscv-softfloat.pc.in
deleted file mode 100644
index 6b18e88441..0000000000
--- a/riscv-softfloat.pc.in
+++ /dev/null
@@ -1,11 +0,0 @@
-prefix=@prefix@
-exec_prefix=@prefix@
-libdir=${prefix}/@libdir@
-includedir=${prefix}/@includedir@
-
-Name: riscv-softfloat
-Description: RISC-V softfloat library
-Version: git
-Libs: -Wl,-rpath,${libdir} -L${libdir} -lsoftfloat
-Cflags: -I${includedir}
-URL: http://riscv.org/download.html#tab_spike
diff --git a/riscv-spike.pc.in b/riscv-spike.pc.in
deleted file mode 100644
index 007ad392bd..0000000000
--- a/riscv-spike.pc.in
+++ /dev/null
@@ -1,10 +0,0 @@
-prefix=@prefix@
-exec_prefix=@prefix@
-libdir=${prefix}/@libdir@
-includedir=${prefix}/@includedir@
-
-Name: riscv-spike
-Description: RISC-V spike meta library
-Version: git
-Depends: riscv-spike_main riscv-riscv riscv-softfloat
-URL: http://riscv.org/download.html#tab_spike
diff --git a/riscv-spike_main.pc.in b/riscv-spike_main.pc.in
deleted file mode 100644
index c9b0eccca8..0000000000
--- a/riscv-spike_main.pc.in
+++ /dev/null
@@ -1,12 +0,0 @@
-prefix=@prefix@
-exec_prefix=@prefix@
-libdir=${prefix}/@libdir@
-includedir=${prefix}/@includedir@
-
-Name: riscv-spike_main
-Description: RISC-V ISA simulator library
-Version: git
-Depends: riscv-riscv riscv-softfloat
-Libs: -Wl,-rpath,${libdir} -L${libdir} -lspike_main
-Cflags: -I${includedir}
-URL: http://riscv.org/download.html#tab_spike
diff --git a/riscv/arith.h b/riscv/arith.h
new file mode 100644
index 0000000000..35dd53051e
--- /dev/null
+++ b/riscv/arith.h
@@ -0,0 +1,123 @@
+// See LICENSE for license details.
+
+#ifndef _RISCV_ARITH_H
+#define _RISCV_ARITH_H
+
+#include <cassert>
+#include <cstdint>
+#include <climits>
+
+inline uint64_t mulhu(uint64_t a, uint64_t b)
+{
+  // Upper 64 bits of the unsigned 128-bit product a*b, built from 32-bit limbs.
+  const uint64_t a_lo = a & 0xffffffffu, a_hi = a >> 32;
+  const uint64_t b_lo = b & 0xffffffffu, b_hi = b >> 32;
+
+  // First cross term plus the carry out of the lowest partial product.
+  const uint64_t mid1 = a_hi * b_lo + ((a_lo * b_lo) >> 32);
+  const uint64_t mid1_lo = mid1 & 0xffffffffu;
+  const uint64_t mid1_hi = mid1 >> 32;
+
+  // Second cross term, accumulated onto the low half of the first one.
+  const uint64_t mid2 = a_lo * b_hi + mid1_lo;
+
+  // Highest partial product plus the upper halves of both cross-term sums.
+  const uint64_t upper = a_hi * b_hi + mid1_hi + (mid2 >> 32);
+
+  // `upper` already holds bits 64..127 of the product.
+  return upper;
+}
+
+inline int64_t mulh(int64_t a, int64_t b)
+{
+  // High 64 bits of signed a*b; all wrapping math is unsigned (no signed-overflow UB, INT64_MIN ok).
+  const uint64_t ua = a, ub = b, hi = mulhu(a < 0 ? -ua : ua, b < 0 ? -ub : ub); // hi of |a|*|b|
+  return (a < 0) != (b < 0) ? ~hi + (ua * ub == 0) : hi; // negative product: ~hi, +1 only if low word is 0
+}
+
+inline int64_t mulhsu(int64_t a, uint64_t b)
+{
+  // High 64 bits of (signed a) * (unsigned b); |a| is formed in uint64_t so INT64_MIN is not UB.
+  const uint64_t ua = a, hi = mulhu(a < 0 ? -ua : ua, b); // hi of |a|*b
+  return a < 0 ? ~hi + (ua * b == 0) : hi; // negative product: ~hi, +1 only if low word is 0
+}
+
+// Saturating arithmetic helpers; technique from
+// https://locklessinc.com/articles/sat_arithmetic/
+template<typename T, typename UT>
+static inline T sat_add(T x, T y, bool &sat)
+{
+  // Add in the unsigned twin UT so the raw sum wraps instead of overflowing.
+  const int sign_bit = sizeof(T) * 8 - 1;
+  const UT lhs = x;
+  const UT rhs = y;
+  const UT wrapped = lhs + rhs;
+
+  // Clamp value chosen by x's sign: sign bit set -> minimum, else maximum.
+  const UT clamp = (lhs >> sign_bit) + (((UT)0x1 << sign_bit) - 1);
+
+  // Overflow iff x and y share a sign and the wrapped sum's sign differs from y's.
+  sat = (T) ((clamp ^ rhs) | ~(rhs ^ wrapped)) >= 0;
+  if (sat)
+    return clamp;
+
+  return wrapped;
+}
+
+template<typename T, typename UT>
+static inline T sat_sub(T x, T y, bool &sat)
+{
+  // Subtract in the unsigned twin UT so the raw difference simply wraps
+  // around instead of overflowing.
+  const int sign_bit = sizeof(T) * 8 - 1;
+  const UT lhs = x;
+  const UT rhs = y;
+  const UT wrapped = lhs - rhs;
+
+  // Clamp value chosen by x's sign: sign bit set -> minimum, else maximum.
+  const UT clamp = (lhs >> sign_bit) + (((UT)0x1 << sign_bit) - 1);
+
+  // Overflow iff x and y differ in sign and the difference's sign differs from x's.
+  sat = (T) ((clamp ^ rhs) & (clamp ^ wrapped)) < 0;
+  if (sat)
+    return clamp;
+
+  return wrapped;
+}
+
+template<typename T>
+T sat_addu(T x, T y, bool &sat)
+{
+  // Unsigned saturating add: a sum that wrapped around is smaller than x.
+  const T sum = x + y;
+  sat = sum < x;
+
+  // On wrap-around the result is the all-ones pattern, i.e. the largest
+  // value an unsigned T can hold; otherwise the plain sum.
+  return sat ? static_cast<T>(-1) : sum;
+}
+
+template<typename T>
+T sat_subu(T x, T y, bool &sat)
+{
+  // Unsigned saturating subtract: a difference that wrapped exceeds x.
+  const T diff = x - y;
+  sat = diff > x;
+
+  // On wrap-around the result is clamped to zero; otherwise it is the
+  // plain difference.
+  return sat ? static_cast<T>(0) : diff;
+}
+
+static inline uint64_t extract64(uint64_t val, int pos, int len)
+{
+  assert(0 <= pos && 0 < len && len <= 64 - pos); // the field must lie inside the 64-bit word
+  return (UINT64_MAX >> (64 - len)) & (val >> pos); // keep the len low bits of val shifted down by pos
+}
+
+static inline uint64_t make_mask64(int pos, int len)
+{
+  assert(0 <= pos && pos < 64 && 0 < len && len <= 64); // keeps both shift counts within 0..63
+  return (UINT64_MAX >> (64 - len)) << pos; // len one-bits, moved up so the lowest sits at bit pos
+}
+#endif
diff --git a/riscv/byteorder.h b/riscv/byteorder.h
new file mode 100644
index 0000000000..393a70bd7b
--- /dev/null
+++ b/riscv/byteorder.h
@@ -0,0 +1,30 @@
+// See LICENSE for license details.
+
+#ifndef _RISCV_BYTEORDER_H
+#define _RISCV_BYTEORDER_H
+
+#include "config.h"
+#include <stdint.h>
+
+static inline uint8_t swap(uint8_t v) { return v; } // one byte: nothing to reverse
+static inline uint16_t swap(uint16_t v) { return __builtin_bswap16(v); } // byte-reverse via compiler builtin
+static inline uint32_t swap(uint32_t v) { return __builtin_bswap32(v); }
+static inline uint64_t swap(uint64_t v) { return __builtin_bswap64(v); }
+static inline int8_t swap(int8_t v) { return v; }
+static inline int16_t swap(int16_t v) { return swap((uint16_t)v); } // signed: reuse the unsigned overload
+static inline int32_t swap(int32_t v) { return swap((uint32_t)v); }
+static inline int64_t swap(int64_t v) { return swap((uint64_t)v); }
+
+#ifndef WORDS_BIGENDIAN  // little-endian forms are the identity, big-endian forms byte-swap
+template<typename T> static inline T from_le(T value) { return value; }
+template<typename T> static inline T to_le(T value) { return value; }
+template<typename T> static inline T from_be(T value) { return swap(value); }
+template<typename T> static inline T to_be(T value) { return swap(value); }
+#else  // WORDS_BIGENDIAN defined: big-endian forms are the identity, little-endian forms byte-swap
+template<typename T> static inline T from_be(T value) { return value; }
+template<typename T> static inline T to_be(T value) { return value; }
+template<typename T> static inline T from_le(T value) { return swap(value); }
+template<typename T> static inline T to_le(T value) { return swap(value); }
+#endif  // WORDS_BIGENDIAN
+
+#endif
diff --git a/riscv/clint.cc b/riscv/clint.cc
index 08508b43fa..aee995bfbb 100644
--- a/riscv/clint.cc
+++ b/riscv/clint.cc
@@ -1,9 +1,16 @@
+#include <sys/time.h>
 #include "devices.h"
 #include "processor.h"
 
-clint_t::clint_t(std::vector<processor_t*>& procs)
-  : procs(procs), mtimecmp(procs.size())
+clint_t::clint_t(std::vector<processor_t*>& procs, uint64_t freq_hz, bool real_time)
+  : procs(procs), freq_hz(freq_hz), real_time(real_time), mtime(0), mtimecmp(procs.size())
 {
+  struct timeval base;
+  // Sample the wall clock once at construction.
+  gettimeofday(&base, NULL);
+  // Keep it as the reference point that increment() subtracts from "now" when real_time is set.
+  real_time_ref_secs = base.tv_sec;
+  real_time_ref_usecs = base.tv_usec;
 }
 
 /* 0000 msip hart 0
@@ -22,6 +29,7 @@ clint_t::clint_t(std::vector<processor_t*>& procs)
 
 bool clint_t::load(reg_t addr, size_t len, uint8_t* bytes)
 {
+  increment(0);
   if (addr >= MSIP_BASE && addr + len <= MSIP_BASE + procs.size()*sizeof(msip_t)) {
     std::vector<msip_t> msip(procs.size());
     for (size_t i = 0; i < procs.size(); ++i)
@@ -63,7 +71,16 @@ bool clint_t::store(reg_t addr, size_t len, const uint8_t* bytes)
 
 void clint_t::increment(reg_t inc)
 {
-  mtime += inc;
+  if (real_time) {
+   struct timeval now;
+   uint64_t diff_usecs;
+
+   gettimeofday(&now, NULL);
+   diff_usecs = ((now.tv_sec - real_time_ref_secs) * 1000000) + (now.tv_usec - real_time_ref_usecs);
+   mtime = diff_usecs * freq_hz / 1000000;
+  } else {
+    mtime += inc;
+  }
   for (size_t i = 0; i < procs.size(); i++) {
     procs[i]->state.mip &= ~MIP_MTIP;
     if (mtime >= mtimecmp[i])
diff --git a/riscv/common.h b/riscv/common.h
index 8ddd9849d8..3c523d00d7 100644
--- a/riscv/common.h
+++ b/riscv/common.h
@@ -6,4 +6,6 @@
 #define   likely(x) __builtin_expect(x, 1)
 #define unlikely(x) __builtin_expect(x, 0)
 
+#define NOINLINE __attribute__ ((noinline)) /* function attribute asking the compiler not to inline the annotated function */
+
 #endif
diff --git a/riscv/debug_defines.h b/riscv/debug_defines.h
index d6ddd4ff1e..e6c2c5d3ea 100644
--- a/riscv/debug_defines.h
+++ b/riscv/debug_defines.h
@@ -84,8 +84,7 @@
 /*
 * 0: Version described in spec version 0.11.
 *
-* 1: Version described in spec version 0.13 (and later?), which
-* reduces the DMI data width to 32 bits.
+* 1: Version described in spec version 0.13.
 *
 * 15: Version not described in any available version of this spec.
  */
@@ -134,7 +133,7 @@
 * cleared by writing \Fdmireset in \Rdtmcs.
 *
 * This indicates that the DM itself responded with an error.
-* Note: there are no specified cases in which the DM would
+* There are no specified cases in which the DM would
 * respond with an error, and DMI is not required to support
 * returning errors.
 *
@@ -145,11 +144,6 @@
 * needs to give the target more TCK edges between Update-DR and
 * Capture-DR. The simplest way to do that is to add extra transitions
 * in Run-Test/Idle.
-*
-* (The DTM, DM, and/or component may be in different clock domains,
-* so synchronization may be required. Some relatively fixed number of
-* TCK ticks may be needed for the request to reach the DM, complete,
-* and for the response to be synchronized back into the TCK domain.)
  */
 #define DTM_DMI_OP_OFFSET                   0
 #define DTM_DMI_OP_LENGTH                   2
@@ -167,20 +161,28 @@
 #define CSR_DCSR_XDEBUGVER_LENGTH           4
 #define CSR_DCSR_XDEBUGVER                  (0xfU << CSR_DCSR_XDEBUGVER_OFFSET)
 /*
-* When 1, {\tt ebreak} instructions in Machine Mode enter Debug Mode.
+* 0: {\tt ebreak} instructions in M-mode behave as described in the
+* Privileged Spec.
+*
+* 1: {\tt ebreak} instructions in M-mode enter Debug Mode.
  */
 #define CSR_DCSR_EBREAKM_OFFSET             15
 #define CSR_DCSR_EBREAKM_LENGTH             1
 #define CSR_DCSR_EBREAKM                    (0x1U << CSR_DCSR_EBREAKM_OFFSET)
 /*
-* When 1, {\tt ebreak} instructions in Supervisor Mode enter Debug Mode.
+* 0: {\tt ebreak} instructions in S-mode behave as described in the
+* Privileged Spec.
+*
+* 1: {\tt ebreak} instructions in S-mode enter Debug Mode.
  */
 #define CSR_DCSR_EBREAKS_OFFSET             13
 #define CSR_DCSR_EBREAKS_LENGTH             1
 #define CSR_DCSR_EBREAKS                    (0x1U << CSR_DCSR_EBREAKS_OFFSET)
 /*
-* When 1, {\tt ebreak} instructions in User/Application Mode enter
-* Debug Mode.
+* 0: {\tt ebreak} instructions in U-mode behave as described in the
+* Privileged Spec.
+*
+* 1: {\tt ebreak} instructions in U-mode enter Debug Mode.
  */
 #define CSR_DCSR_EBREAKU_OFFSET             12
 #define CSR_DCSR_EBREAKU_LENGTH             1
@@ -191,9 +193,10 @@
 * 1: Interrupts are enabled during single stepping.
 *
 * Implementations may hard wire this bit to 0.
-* The debugger must read back the value it
-* writes to check whether the feature is supported. If not
-* supported, interrupt behavior can be emulated by the debugger.
+* In that case interrupt behavior can be emulated by the debugger.
+*
+* The debugger must not change the value of this bit while the hart
+* is running.
  */
 #define CSR_DCSR_STEPIE_OFFSET              11
 #define CSR_DCSR_STEPIE_LENGTH              1
@@ -201,14 +204,13 @@
 /*
 * 0: Increment counters as usual.
 *
-* 1: Don't increment any counters while in Debug Mode or on {\tt
-* ebreak} instructions that cause entry into Debug Mode.  These
-* counters include the {\tt cycle} and {\tt instret} CSRs. This is
-* preferred for most debugging scenarios.
+* 1: Don't increment any hart-local counters while in Debug Mode or
+* on {\tt ebreak} instructions that cause entry into Debug Mode.
+* These counters include the {\tt instret} CSR. On single-hart cores
+* {\tt cycle} should be stopped, but on multi-hart cores it must keep
+* incrementing.
 *
-* An implementation may choose not to support writing to this bit.
-* The debugger must read back the value it writes to check whether
-* the feature is supported.
+* An implementation may hardwire this bit to 0 or 1.
  */
 #define CSR_DCSR_STOPCOUNT_OFFSET           10
 #define CSR_DCSR_STOPCOUNT_LENGTH           1
@@ -218,9 +220,7 @@
 *
 * 1: Don't increment any hart-local timers while in Debug Mode.
 *
-* An implementation may choose not to support writing to this bit.
-* The debugger must read back the value it writes to check whether
-* the feature is supported.
+* An implementation may hardwire this bit to 0 or 1.
  */
 #define CSR_DCSR_STOPTIME_OFFSET            9
 #define CSR_DCSR_STOPTIME_LENGTH            1
@@ -236,9 +236,16 @@
 *
 * 2: The Trigger Module caused a breakpoint exception. (priority 4)
 *
-* 3: The debugger requested entry to Debug Mode. (priority 2)
+* 3: The debugger requested entry to Debug Mode using \Fhaltreq.
+* (priority 1)
 *
-* 4: The hart single stepped because \Fstep was set. (priority 1)
+* 4: The hart single stepped because \Fstep was set. (priority 0, lowest)
+*
+* 5: The hart halted directly out of reset due to \Fresethaltreq. It
+* is also acceptable to report 3 when this happens. (priority 2)
+*
+* 6: The hart halted because it's part of a halt group. (priority 5,
+* highest) Harts may report 3 for this cause instead.
 *
 * Other values are reserved for future use.
  */
@@ -246,10 +253,11 @@
 #define CSR_DCSR_CAUSE_LENGTH               3
 #define CSR_DCSR_CAUSE                      (0x7U << CSR_DCSR_CAUSE_OFFSET)
 /*
-* When 1, \Fmprv in \Rmstatus takes effect during debug mode.
-* When 0, it is ignored during debug mode.
-* Implementing this bit is optional.
-* If not implemented it should be tied to 0.
+* 0: \Fmprv in \Rmstatus is ignored in Debug Mode.
+*
+* 1: \Fmprv in \Rmstatus takes effect in Debug Mode.
+*
+* Implementing this bit is optional. It may be tied to either 0 or 1.
  */
 #define CSR_DCSR_MPRVEN_OFFSET              4
 #define CSR_DCSR_MPRVEN_LENGTH              1
@@ -270,6 +278,9 @@
 * If the instruction does not complete due to an exception,
 * the hart will immediately enter Debug Mode before executing
 * the trap handler, with appropriate exception registers set.
+*
+* The debugger must not change the value of this bit while the hart
+* is running.
  */
 #define CSR_DCSR_STEP_OFFSET                2
 #define CSR_DCSR_STEP_LENGTH                1
@@ -289,14 +300,14 @@
 #define CSR_DCSR_PRV                        (0x3U << CSR_DCSR_PRV_OFFSET)
 #define CSR_DPC                             0x7b1
 #define CSR_DPC_DPC_OFFSET                  0
-#define CSR_DPC_DPC_LENGTH                  MXLEN
-#define CSR_DPC_DPC                         (((1L<<MXLEN)-1) << CSR_DPC_DPC_OFFSET)
+#define CSR_DPC_DPC_LENGTH                  DXLEN
+#define CSR_DPC_DPC                         (((1L<<DXLEN)-1) << CSR_DPC_DPC_OFFSET)
 #define CSR_DSCRATCH0                       0x7b2
 #define CSR_DSCRATCH1                       0x7b3
 #define CSR_TSELECT                         0x7a0
 #define CSR_TSELECT_INDEX_OFFSET            0
-#define CSR_TSELECT_INDEX_LENGTH            MXLEN
-#define CSR_TSELECT_INDEX                   (((1L<<MXLEN)-1) << CSR_TSELECT_INDEX_OFFSET)
+#define CSR_TSELECT_INDEX_LENGTH            XLEN
+#define CSR_TSELECT_INDEX                   (((1L<<XLEN)-1) << CSR_TSELECT_INDEX_OFFSET)
 #define CSR_TDATA1                          0x7a1
 /*
 * 0: There is no trigger at this \Rtselect.
@@ -316,20 +327,18 @@
 * 5: The trigger is an exception trigger. The remaining bits
 * in this register act as described in \Retrigger.
 *
+* 12--14: These trigger types are available for non-standard use.
+*
 * 15: This trigger exists (so enumeration shouldn't terminate), but
 * is not currently available.
 *
 * Other values are reserved for future use.
-*
-* When this field is written to an unsupported value, it takes on its
-* reset value instead. The reset value is any one of the types
-* supported by the trigger selected by \Rtselect.
  */
-#define CSR_TDATA1_TYPE_OFFSET              (MXLEN-4)
+#define CSR_TDATA1_TYPE_OFFSET              (XLEN-4)
 #define CSR_TDATA1_TYPE_LENGTH              4
 #define CSR_TDATA1_TYPE                     (0xfULL << CSR_TDATA1_TYPE_OFFSET)
 /*
-* 0: Both Debug and M Mode can write the {\tt tdata} registers at the
+* 0: Both Debug and M-mode can write the {\tt tdata} registers at the
 * selected \Rtselect.
 *
 * 1: Only Debug Mode can write the {\tt tdata} registers at the
@@ -337,23 +346,23 @@
 *
 * This bit is only writable from Debug Mode.
  */
-#define CSR_TDATA1_DMODE_OFFSET             (MXLEN-5)
+#define CSR_TDATA1_DMODE_OFFSET             (XLEN-5)
 #define CSR_TDATA1_DMODE_LENGTH             1
 #define CSR_TDATA1_DMODE                    (0x1ULL << CSR_TDATA1_DMODE_OFFSET)
 /*
 * Trigger-specific data.
  */
 #define CSR_TDATA1_DATA_OFFSET              0
-#define CSR_TDATA1_DATA_LENGTH              (MXLEN - 5)
-#define CSR_TDATA1_DATA                     (((1L<<MXLEN - 5)-1) << CSR_TDATA1_DATA_OFFSET)
+#define CSR_TDATA1_DATA_LENGTH              (XLEN - 5)
+#define CSR_TDATA1_DATA                     (((1L<<XLEN - 5)-1) << CSR_TDATA1_DATA_OFFSET)
 #define CSR_TDATA2                          0x7a2
 #define CSR_TDATA2_DATA_OFFSET              0
-#define CSR_TDATA2_DATA_LENGTH              MXLEN
-#define CSR_TDATA2_DATA                     (((1L<<MXLEN)-1) << CSR_TDATA2_DATA_OFFSET)
+#define CSR_TDATA2_DATA_LENGTH              XLEN
+#define CSR_TDATA2_DATA                     (((1L<<XLEN)-1) << CSR_TDATA2_DATA_OFFSET)
 #define CSR_TDATA3                          0x7a3
 #define CSR_TDATA3_DATA_OFFSET              0
-#define CSR_TDATA3_DATA_LENGTH              MXLEN
-#define CSR_TDATA3_DATA                     (((1L<<MXLEN)-1) << CSR_TDATA3_DATA_OFFSET)
+#define CSR_TDATA3_DATA_LENGTH              XLEN
+#define CSR_TDATA3_DATA                     (((1L<<XLEN)-1) << CSR_TDATA3_DATA_OFFSET)
 #define CSR_TINFO                           0x7a4
 /*
 * One bit for each possible \Ftype enumerated in \Rtdataone. Bit N
@@ -371,11 +380,60 @@
 #define CSR_TINFO_INFO_OFFSET               0
 #define CSR_TINFO_INFO_LENGTH               16
 #define CSR_TINFO_INFO                      (0xffffULL << CSR_TINFO_INFO_OFFSET)
+#define CSR_TCONTROL                        0x7a5
+/*
+* M-mode previous trigger enable field.
+*
+* When a trap into M-mode is taken, \Fmpte is set to the value of
+* \Fmte.
+ */
+#define CSR_TCONTROL_MPTE_OFFSET            7
+#define CSR_TCONTROL_MPTE_LENGTH            1
+#define CSR_TCONTROL_MPTE                   (0x1ULL << CSR_TCONTROL_MPTE_OFFSET)
+/*
+* M-mode trigger enable field.
+*
+* 0: Triggers with action=0 do not match/fire while the hart is in M-mode.
+*
+* 1: Triggers do match/fire while the hart is in M-mode.
+*
+* When a trap into M-mode is taken, \Fmte is set to 0. When {\tt
+* mret} is executed, \Fmte is set to the value of \Fmpte.
+ */
+#define CSR_TCONTROL_MTE_OFFSET             3
+#define CSR_TCONTROL_MTE_LENGTH             1
+#define CSR_TCONTROL_MTE                    (0x1ULL << CSR_TCONTROL_MTE_OFFSET)
+#define CSR_MCONTEXT                        0x7a8
+/*
+* Machine mode software can write a context number to this register,
+* which can be used to set triggers that only fire in that specific
+* context.
+*
+* An implementation may tie any number of upper bits in this field to
+* 0. It's recommended to implement no more than 6 bits on RV32, and
+* 13 on RV64.
+ */
+#define CSR_MCONTEXT_MCONTEXT_OFFSET        0
+#define CSR_MCONTEXT_MCONTEXT_LENGTH        XLEN
+#define CSR_MCONTEXT_MCONTEXT               (((1L<<XLEN)-1) << CSR_MCONTEXT_MCONTEXT_OFFSET)
+#define CSR_SCONTEXT                        0x7aa
+/*
+* Supervisor mode software can write a context number to this
+* register, which can be used to set triggers that only fire in that
+* specific context.
+*
+* An implementation may tie any number of high bits in this field to
+* 0. It's recommended to implement no more than 16 bits on RV32, and
+* 34 on RV64.
+ */
+#define CSR_SCONTEXT_DATA_OFFSET            0
+#define CSR_SCONTEXT_DATA_LENGTH            XLEN
+#define CSR_SCONTEXT_DATA                   (((1L<<XLEN)-1) << CSR_SCONTEXT_DATA_OFFSET)
 #define CSR_MCONTROL                        0x7a1
-#define CSR_MCONTROL_TYPE_OFFSET            (MXLEN-4)
+#define CSR_MCONTROL_TYPE_OFFSET            (XLEN-4)
 #define CSR_MCONTROL_TYPE_LENGTH            4
 #define CSR_MCONTROL_TYPE                   (0xfULL << CSR_MCONTROL_TYPE_OFFSET)
-#define CSR_MCONTROL_DMODE_OFFSET           (MXLEN-5)
+#define CSR_MCONTROL_DMODE_OFFSET           (XLEN-5)
 #define CSR_MCONTROL_DMODE_LENGTH           1
 #define CSR_MCONTROL_DMODE                  (0x1ULL << CSR_MCONTROL_DMODE_OFFSET)
 /*
@@ -387,13 +445,21 @@
 * corresponds to the maximum NAPOT range, which is $2^{63}$ bytes in
 * size.
  */
-#define CSR_MCONTROL_MASKMAX_OFFSET         (MXLEN-11)
+#define CSR_MCONTROL_MASKMAX_OFFSET         (XLEN-11)
 #define CSR_MCONTROL_MASKMAX_LENGTH         6
 #define CSR_MCONTROL_MASKMAX                (0x3fULL << CSR_MCONTROL_MASKMAX_OFFSET)
 /*
+* This field only exists if XLEN is greater than 32. In that case it
+* extends \Fsize. If it does not exist then hardware operates as if
+* the field contains 0.
+ */
+#define CSR_MCONTROL_SIZEHI_OFFSET          21
+#define CSR_MCONTROL_SIZEHI_LENGTH          2
+#define CSR_MCONTROL_SIZEHI                 (0x3ULL << CSR_MCONTROL_SIZEHI_OFFSET)
+/*
 * If this optional bit is implemented, the hardware sets it when this
 * trigger matches. The trigger's user can set or clear it at any
-* time. The trigger's user can use this bit to determine which
+* time. It is used to determine which
 * trigger(s) matched.  If the bit is not implemented, it is always 0
 * and writing it has no effect.
  */
@@ -401,9 +467,10 @@
 #define CSR_MCONTROL_HIT_LENGTH             1
 #define CSR_MCONTROL_HIT                    (0x1ULL << CSR_MCONTROL_HIT_OFFSET)
 /*
-* 0: Perform a match on the virtual address.
+* 0: Perform a match on the virtual base address of the access.
+* (E.g. on a 32-bit read from 0x4000, the base address is 0x4000.)
 *
-* 1: Perform a match on the data value loaded/stored, or the
+* 1: Perform a match on the data value loaded or stored, or the
 * instruction executed.
  */
 #define CSR_MCONTROL_SELECT_OFFSET          19
@@ -412,7 +479,7 @@
 /*
 * 0: The action for this trigger will be taken just before the
 * instruction that triggered it is executed, but after all preceding
-* instructions are are committed.
+* instructions are committed.
 *
 * 1: The action for this trigger will be taken after the instruction
 * that triggered it is executed. It should be taken before the next
@@ -433,23 +500,68 @@
 * A chain of triggers that don't all have the same \Ftiming value
 * will never fire (unless consecutive instructions match the
 * appropriate triggers).
+*
+* If a trigger with \Ftiming of 0 matches, it is
+* implementation-dependent whether that prevents a trigger with
+* \Ftiming of 1 matching as well.
  */
 #define CSR_MCONTROL_TIMING_OFFSET          18
 #define CSR_MCONTROL_TIMING_LENGTH          1
 #define CSR_MCONTROL_TIMING                 (0x1ULL << CSR_MCONTROL_TIMING_OFFSET)
 /*
+* This field contains the 2 low bits of \Fsize. The high bits come
+* from \Fsizehi. The combined value is interpreted as follows:
+*
+* 0: The trigger will attempt to match against an access of any size.
+* The behavior is only well-defined if $|select|=0$, or if the access
+* size is XLEN.
+*
+* 1: The trigger will only match against 8-bit memory accesses.
+*
+* 2: The trigger will only match against 16-bit memory accesses or
+* execution of 16-bit instructions.
+*
+* 3: The trigger will only match against 32-bit memory accesses or
+* execution of 32-bit instructions.
+*
+* 4: The trigger will only match against execution of 48-bit instructions.
+*
+* 5: The trigger will only match against 64-bit memory accesses or
+* execution of 64-bit instructions.
+*
+* 6: The trigger will only match against execution of 80-bit instructions.
+*
+* 7: The trigger will only match against execution of 96-bit instructions.
+*
+* 8: The trigger will only match against execution of 112-bit instructions.
+*
+* 9: The trigger will only match against 128-bit memory accesses or
+* execution of 128-bit instructions.
+ */
+#define CSR_MCONTROL_SIZELO_OFFSET          16
+#define CSR_MCONTROL_SIZELO_LENGTH          2
+#define CSR_MCONTROL_SIZELO                 (0x3ULL << CSR_MCONTROL_SIZELO_OFFSET)
+/*
 * The action to take when the trigger fires. The values are explained
 * in Table~\ref{tab:action}.
  */
 #define CSR_MCONTROL_ACTION_OFFSET          12
-#define CSR_MCONTROL_ACTION_LENGTH          6
-#define CSR_MCONTROL_ACTION                 (0x3fULL << CSR_MCONTROL_ACTION_OFFSET)
+#define CSR_MCONTROL_ACTION_LENGTH          4
+#define CSR_MCONTROL_ACTION                 (0xfULL << CSR_MCONTROL_ACTION_OFFSET)
 /*
 * 0: When this trigger matches, the configured action is taken.
 *
 * 1: While this trigger does not match, it prevents the trigger with
 * the next index from matching.
 *
+* A trigger chain starts on the first trigger with $|chain|=1$ after
+* a trigger with $|chain|=0$, or simply on the first trigger if that
+* has $|chain|=1$. It ends on the first trigger after that which has
+* $|chain|=0$. This final trigger is part of the chain. The action
+* on all but the final trigger is ignored.  The action on that final
+* trigger will be taken if and only if all the triggers in the chain
+* match at the same time.
+*
 * Because \Fchain affects the next trigger, hardware must zero it in
 * writes to \Rmcontrol that set \Fdmode to 0 if the next trigger has
 * \Fdmode of 1.
@@ -466,10 +578,14 @@
 #define CSR_MCONTROL_CHAIN_LENGTH           1
 #define CSR_MCONTROL_CHAIN                  (0x1ULL << CSR_MCONTROL_CHAIN_OFFSET)
 /*
-* 0: Matches when the value equals \Rtdatatwo.
+* 0: Matches when the value equals \Rtdatatwo. Additionally, if
+* \Fselect=0 then it is recommended that the trigger also matches if
+* any of the accessed addresses equal \Rtdatatwo. (E.g. on a 32-bit
+* read from 0x4000, the following addresses are accessed: 0x4000,
+* 0x4001, 0x4002, and 0x4003.)
 *
 * 1: Matches when the top M bits of the value match the top M bits of
-* \Rtdatatwo. M is MXLEN-1 minus the index of the least-significant
+* \Rtdatatwo. M is XLEN-1 minus the index of the least-significant
 * bit containing 0 in \Rtdatatwo.
 *
 * 2: Matches when the value is greater than (unsigned) or equal to
@@ -491,19 +607,19 @@
 #define CSR_MCONTROL_MATCH_LENGTH           4
 #define CSR_MCONTROL_MATCH                  (0xfULL << CSR_MCONTROL_MATCH_OFFSET)
 /*
-* When set, enable this trigger in M mode.
+* When set, enable this trigger in M-mode.
  */
 #define CSR_MCONTROL_M_OFFSET               6
 #define CSR_MCONTROL_M_LENGTH               1
 #define CSR_MCONTROL_M                      (0x1ULL << CSR_MCONTROL_M_OFFSET)
 /*
-* When set, enable this trigger in S mode.
+* When set, enable this trigger in S-mode.
  */
 #define CSR_MCONTROL_S_OFFSET               4
 #define CSR_MCONTROL_S_LENGTH               1
 #define CSR_MCONTROL_S                      (0x1ULL << CSR_MCONTROL_S_OFFSET)
 /*
-* When set, enable this trigger in U mode.
+* When set, enable this trigger in U-mode.
  */
 #define CSR_MCONTROL_U_OFFSET               3
 #define CSR_MCONTROL_U_LENGTH               1
@@ -528,16 +644,16 @@
 #define CSR_MCONTROL_LOAD_LENGTH            1
 #define CSR_MCONTROL_LOAD                   (0x1ULL << CSR_MCONTROL_LOAD_OFFSET)
 #define CSR_ICOUNT                          0x7a1
-#define CSR_ICOUNT_TYPE_OFFSET              (MXLEN-4)
+#define CSR_ICOUNT_TYPE_OFFSET              (XLEN-4)
 #define CSR_ICOUNT_TYPE_LENGTH              4
 #define CSR_ICOUNT_TYPE                     (0xfULL << CSR_ICOUNT_TYPE_OFFSET)
-#define CSR_ICOUNT_DMODE_OFFSET             (MXLEN-5)
+#define CSR_ICOUNT_DMODE_OFFSET             (XLEN-5)
 #define CSR_ICOUNT_DMODE_LENGTH             1
 #define CSR_ICOUNT_DMODE                    (0x1ULL << CSR_ICOUNT_DMODE_OFFSET)
 /*
 * If this optional bit is implemented, the hardware sets it when this
 * trigger matches. The trigger's user can set or clear it at any
-* time. The trigger's user can use this bit to determine which
+* time. It is used to determine which
 * trigger(s) matched.  If the bit is not implemented, it is always 0
 * and writing it has no effect.
  */
@@ -554,21 +670,21 @@
 #define CSR_ICOUNT_COUNT_LENGTH             14
 #define CSR_ICOUNT_COUNT                    (0x3fffULL << CSR_ICOUNT_COUNT_OFFSET)
 /*
-* When set, every instruction completed or exception taken in M mode decrements \Fcount
+* When set, every instruction completed or exception taken in M-mode decrements \Fcount
 * by 1.
  */
 #define CSR_ICOUNT_M_OFFSET                 9
 #define CSR_ICOUNT_M_LENGTH                 1
 #define CSR_ICOUNT_M                        (0x1ULL << CSR_ICOUNT_M_OFFSET)
 /*
-* When set, every instruction completed or exception taken in S mode decrements \Fcount
+* When set, every instruction completed or exception taken in S-mode decrements \Fcount
 * by 1.
  */
 #define CSR_ICOUNT_S_OFFSET                 7
 #define CSR_ICOUNT_S_LENGTH                 1
 #define CSR_ICOUNT_S                        (0x1ULL << CSR_ICOUNT_S_OFFSET)
 /*
-* When set, every instruction completed or exception taken in U mode decrements \Fcount
+* When set, every instruction completed or exception taken in U-mode decrements \Fcount
 * by 1.
  */
 #define CSR_ICOUNT_U_OFFSET                 6
@@ -582,20 +698,20 @@
 #define CSR_ICOUNT_ACTION_LENGTH            6
 #define CSR_ICOUNT_ACTION                   (0x3fULL << CSR_ICOUNT_ACTION_OFFSET)
 #define CSR_ITRIGGER                        0x7a1
-#define CSR_ITRIGGER_TYPE_OFFSET            (MXLEN-4)
+#define CSR_ITRIGGER_TYPE_OFFSET            (XLEN-4)
 #define CSR_ITRIGGER_TYPE_LENGTH            4
 #define CSR_ITRIGGER_TYPE                   (0xfULL << CSR_ITRIGGER_TYPE_OFFSET)
-#define CSR_ITRIGGER_DMODE_OFFSET           (MXLEN-5)
+#define CSR_ITRIGGER_DMODE_OFFSET           (XLEN-5)
 #define CSR_ITRIGGER_DMODE_LENGTH           1
 #define CSR_ITRIGGER_DMODE                  (0x1ULL << CSR_ITRIGGER_DMODE_OFFSET)
 /*
 * If this optional bit is implemented, the hardware sets it when this
 * trigger matches. The trigger's user can set or clear it at any
-* time. The trigger's user can use this bit to determine which
+* time. It is used to determine which
 * trigger(s) matched.  If the bit is not implemented, it is always 0
 * and writing it has no effect.
  */
-#define CSR_ITRIGGER_HIT_OFFSET             (MXLEN-6)
+#define CSR_ITRIGGER_HIT_OFFSET             (XLEN-6)
 #define CSR_ITRIGGER_HIT_LENGTH             1
 #define CSR_ITRIGGER_HIT                    (0x1ULL << CSR_ITRIGGER_HIT_OFFSET)
 /*
@@ -627,23 +743,30 @@
 #define CSR_ITRIGGER_ACTION_LENGTH          6
 #define CSR_ITRIGGER_ACTION                 (0x3fULL << CSR_ITRIGGER_ACTION_OFFSET)
 #define CSR_ETRIGGER                        0x7a1
-#define CSR_ETRIGGER_TYPE_OFFSET            (MXLEN-4)
+#define CSR_ETRIGGER_TYPE_OFFSET            (XLEN-4)
 #define CSR_ETRIGGER_TYPE_LENGTH            4
 #define CSR_ETRIGGER_TYPE                   (0xfULL << CSR_ETRIGGER_TYPE_OFFSET)
-#define CSR_ETRIGGER_DMODE_OFFSET           (MXLEN-5)
+#define CSR_ETRIGGER_DMODE_OFFSET           (XLEN-5)
 #define CSR_ETRIGGER_DMODE_LENGTH           1
 #define CSR_ETRIGGER_DMODE                  (0x1ULL << CSR_ETRIGGER_DMODE_OFFSET)
 /*
 * If this optional bit is implemented, the hardware sets it when this
 * trigger matches. The trigger's user can set or clear it at any
-* time. The trigger's user can use this bit to determine which
+* time. It is used to determine which
 * trigger(s) matched.  If the bit is not implemented, it is always 0
 * and writing it has no effect.
  */
-#define CSR_ETRIGGER_HIT_OFFSET             (MXLEN-6)
+#define CSR_ETRIGGER_HIT_OFFSET             (XLEN-6)
 #define CSR_ETRIGGER_HIT_LENGTH             1
 #define CSR_ETRIGGER_HIT                    (0x1ULL << CSR_ETRIGGER_HIT_OFFSET)
 /*
+* When this optional bit is set, non-maskable interrupts cause this
+* trigger to fire, regardless of the values of \Fm, \Fs, and \Fu.
+ */
+#define CSR_ETRIGGER_NMI_OFFSET             10
+#define CSR_ETRIGGER_NMI_LENGTH             1
+#define CSR_ETRIGGER_NMI                    (0x1ULL << CSR_ETRIGGER_NMI_OFFSET)
+/*
 * When set, enable this trigger for exceptions that are taken from M
 * mode.
  */
@@ -671,6 +794,54 @@
 #define CSR_ETRIGGER_ACTION_OFFSET          0
 #define CSR_ETRIGGER_ACTION_LENGTH          6
 #define CSR_ETRIGGER_ACTION                 (0x3fULL << CSR_ETRIGGER_ACTION_OFFSET)
+#define CSR_TEXTRA32                        0x7a3
+/*
+* Data used together with \Fmselect.
+ */
+#define CSR_TEXTRA32_MVALUE_OFFSET          26
+#define CSR_TEXTRA32_MVALUE_LENGTH          6
+#define CSR_TEXTRA32_MVALUE                 (0x3fU << CSR_TEXTRA32_MVALUE_OFFSET)
+/*
+* 0: Ignore \Fmvalue.
+*
+* 1: This trigger will only match if the low bits of
+* \Rmcontext equal \Fmvalue.
+ */
+#define CSR_TEXTRA32_MSELECT_OFFSET         25
+#define CSR_TEXTRA32_MSELECT_LENGTH         1
+#define CSR_TEXTRA32_MSELECT                (0x1U << CSR_TEXTRA32_MSELECT_OFFSET)
+/*
+* Data used together with \Fsselect.
+ */
+#define CSR_TEXTRA32_SVALUE_OFFSET          2
+#define CSR_TEXTRA32_SVALUE_LENGTH          16
+#define CSR_TEXTRA32_SVALUE                 (0xffffU << CSR_TEXTRA32_SVALUE_OFFSET)
+/*
+* 0: Ignore \Fsvalue.
+*
+* 1: This trigger will only match if the low bits of
+* \Rscontext equal \Fsvalue.
+*
+* 2: This trigger will only match if \Fasid in \Rsatp
+* equals the lower ASIDMAX (defined in the Privileged Spec) bits of
+* \Fsvalue.
+ */
+#define CSR_TEXTRA32_SSELECT_OFFSET         0
+#define CSR_TEXTRA32_SSELECT_LENGTH         2
+#define CSR_TEXTRA32_SSELECT                (0x3U << CSR_TEXTRA32_SSELECT_OFFSET)
+#define CSR_TEXTRA64                        0x7a3
+#define CSR_TEXTRA64_MVALUE_OFFSET          51
+#define CSR_TEXTRA64_MVALUE_LENGTH          13
+#define CSR_TEXTRA64_MVALUE                 (0x1fffULL << CSR_TEXTRA64_MVALUE_OFFSET)
+#define CSR_TEXTRA64_MSELECT_OFFSET         50
+#define CSR_TEXTRA64_MSELECT_LENGTH         1
+#define CSR_TEXTRA64_MSELECT                (0x1ULL << CSR_TEXTRA64_MSELECT_OFFSET)
+#define CSR_TEXTRA64_SVALUE_OFFSET          2
+#define CSR_TEXTRA64_SVALUE_LENGTH          34
+#define CSR_TEXTRA64_SVALUE                 (0x3ffffffffULL << CSR_TEXTRA64_SVALUE_OFFSET)
+#define CSR_TEXTRA64_SSELECT_OFFSET         0
+#define CSR_TEXTRA64_SSELECT_LENGTH         2
+#define CSR_TEXTRA64_SSELECT                (0x3ULL << CSR_TEXTRA64_SSELECT_OFFSET)
 #define DMI_DMSTATUS                        0x11
 /*
 * If 1, then there is an implicit {\tt ebreak} instruction at the
@@ -684,39 +855,43 @@
 #define DMI_DMSTATUS_IMPEBREAK_LENGTH       1
 #define DMI_DMSTATUS_IMPEBREAK              (0x1U << DMI_DMSTATUS_IMPEBREAK_OFFSET)
 /*
-* This field is 1 when all currently selected harts have been reset but the reset has not been acknowledged.
+* This field is 1 when all currently selected harts have been reset
+* and reset has not been acknowledged for any of them.
  */
 #define DMI_DMSTATUS_ALLHAVERESET_OFFSET    19
 #define DMI_DMSTATUS_ALLHAVERESET_LENGTH    1
 #define DMI_DMSTATUS_ALLHAVERESET           (0x1U << DMI_DMSTATUS_ALLHAVERESET_OFFSET)
 /*
-* This field is 1 when any currently selected hart has been reset but the reset has not been acknowledged.
+* This field is 1 when at least one currently selected hart has been
+* reset and reset has not been acknowledged for that hart.
  */
 #define DMI_DMSTATUS_ANYHAVERESET_OFFSET    18
 #define DMI_DMSTATUS_ANYHAVERESET_LENGTH    1
 #define DMI_DMSTATUS_ANYHAVERESET           (0x1U << DMI_DMSTATUS_ANYHAVERESET_OFFSET)
 /*
 * This field is 1 when all currently selected harts have acknowledged
-* the previous resume request.
+* their last resume request.
  */
 #define DMI_DMSTATUS_ALLRESUMEACK_OFFSET    17
 #define DMI_DMSTATUS_ALLRESUMEACK_LENGTH    1
 #define DMI_DMSTATUS_ALLRESUMEACK           (0x1U << DMI_DMSTATUS_ALLRESUMEACK_OFFSET)
 /*
 * This field is 1 when any currently selected hart has acknowledged
-* the previous resume request.
+* its last resume request.
  */
 #define DMI_DMSTATUS_ANYRESUMEACK_OFFSET    16
 #define DMI_DMSTATUS_ANYRESUMEACK_LENGTH    1
 #define DMI_DMSTATUS_ANYRESUMEACK           (0x1U << DMI_DMSTATUS_ANYRESUMEACK_OFFSET)
 /*
-* This field is 1 when all currently selected harts do not exist in this system.
+* This field is 1 when all currently selected harts do not exist in
+* this platform.
  */
 #define DMI_DMSTATUS_ALLNONEXISTENT_OFFSET  15
 #define DMI_DMSTATUS_ALLNONEXISTENT_LENGTH  1
 #define DMI_DMSTATUS_ALLNONEXISTENT         (0x1U << DMI_DMSTATUS_ALLNONEXISTENT_OFFSET)
 /*
-* This field is 1 when any currently selected hart does not exist in this system.
+* This field is 1 when any currently selected hart does not exist in
+* this platform.
  */
 #define DMI_DMSTATUS_ANYNONEXISTENT_OFFSET  14
 #define DMI_DMSTATUS_ANYNONEXISTENT_LENGTH  1
@@ -758,9 +933,12 @@
 #define DMI_DMSTATUS_ANYHALTED_LENGTH       1
 #define DMI_DMSTATUS_ANYHALTED              (0x1U << DMI_DMSTATUS_ANYHALTED_OFFSET)
 /*
-* 0 when authentication is required before using the DM.  1 when the
-* authentication check has passed. On components that don't implement
-* authentication, this bit must be preset as 1.
+* 0: Authentication is required before using the DM.
+*
+* 1: The authentication check has passed.
+*
+* On components that don't implement authentication, this bit must be
+* preset as 1.
  */
 #define DMI_DMSTATUS_AUTHENTICATED_OFFSET   7
 #define DMI_DMSTATUS_AUTHENTICATED_LENGTH   1
@@ -787,15 +965,15 @@
 #define DMI_DMSTATUS_HASRESETHALTREQ_LENGTH 1
 #define DMI_DMSTATUS_HASRESETHALTREQ        (0x1U << DMI_DMSTATUS_HASRESETHALTREQ_OFFSET)
 /*
-* 0: \Rdevtreeaddrzero--\Rdevtreeaddrthree hold information which
-* is not relevant to the Device Tree.
+* 0: \Rconfstrptrzero--\Rconfstrptrthree hold information which
+* is not relevant to the configuration string.
 *
-* 1: \Rdevtreeaddrzero--\Rdevtreeaddrthree registers hold the address of the
-* Device Tree.
+* 1: \Rconfstrptrzero--\Rconfstrptrthree hold the address of the
+* configuration string.
  */
-#define DMI_DMSTATUS_DEVTREEVALID_OFFSET    4
-#define DMI_DMSTATUS_DEVTREEVALID_LENGTH    1
-#define DMI_DMSTATUS_DEVTREEVALID           (0x1U << DMI_DMSTATUS_DEVTREEVALID_OFFSET)
+#define DMI_DMSTATUS_CONFSTRPTRVALID_OFFSET 4
+#define DMI_DMSTATUS_CONFSTRPTRVALID_LENGTH 1
+#define DMI_DMSTATUS_CONFSTRPTRVALID        (0x1U << DMI_DMSTATUS_CONFSTRPTRVALID_OFFSET)
 /*
 * 0: There is no Debug Module present.
 *
@@ -813,12 +991,12 @@
 #define DMI_DMSTATUS_VERSION                (0xfU << DMI_DMSTATUS_VERSION_OFFSET)
 #define DMI_DMCONTROL                       0x10
 /*
-* Writes the halt request bit for all currently selected harts.
-* When set to 1, each selected hart will halt if it is not currently
-* halted.
+* Writing 0 clears the halt request bit for all currently selected
+* harts. This may cancel outstanding halt requests for those harts.
 *
-* Writing 1 or 0 has no effect on a hart which is already halted, but
-* the bit must be cleared to 0 before the hart is resumed.
+* Writing 1 sets the halt request bit for all currently selected
+* harts. Running harts will halt whenever their halt request bit is
+* set.
 *
 * Writes apply to the new value of \Fhartsel and \Fhasel.
  */
@@ -826,12 +1004,11 @@
 #define DMI_DMCONTROL_HALTREQ_LENGTH        1
 #define DMI_DMCONTROL_HALTREQ               (0x1U << DMI_DMCONTROL_HALTREQ_OFFSET)
 /*
-* Writes the resume request bit for all currently selected harts.
-* When set to 1, each selected hart will resume if it is currently
-* halted.
+* Writing 1 causes the currently selected harts to resume once, if
+* they are halted when the write occurs. It also clears the resume
+* ack bit for those harts.
 *
-* The resume request bit is ignored while the halt request bit is
-* set.
+* \Fresumereq is ignored if \Fhaltreq is set.
 *
 * Writes apply to the new value of \Fhartsel and \Fhasel.
  */
@@ -843,6 +1020,9 @@
 * selected harts.  To perform a reset the debugger writes 1, and then
 * writes 0 to deassert the reset signal.
 *
+* While this bit is 1, the debugger must not change which harts are
+* selected.
+*
 * If this feature is not implemented, the bit always stays 0, so
 * after writing 1 the debugger can read the register back to see if
 * the feature is supported.
@@ -853,8 +1033,9 @@
 #define DMI_DMCONTROL_HARTRESET_LENGTH      1
 #define DMI_DMCONTROL_HARTRESET             (0x1U << DMI_DMCONTROL_HARTRESET_OFFSET)
 /*
-* Writing 1 to this bit clears the {\tt havereset} bits for
-* any selected harts.
+* 0: No effect.
+*
+* 1: Clears {\tt havereset} for any selected harts.
 *
 * Writes apply to the new value of \Fhartsel and \Fhasel.
  */
@@ -862,12 +1043,13 @@
 #define DMI_DMCONTROL_ACKHAVERESET_LENGTH   1
 #define DMI_DMCONTROL_ACKHAVERESET          (0x1U << DMI_DMCONTROL_ACKHAVERESET_OFFSET)
 /*
-* Selects the  definition of currently selected harts.
+* Selects the definition of currently selected harts.
 *
-* 0: There is a single currently selected hart, that selected by \Fhartsel.
+* 0: There is a single currently selected hart, that is selected by \Fhartsel.
 *
-* 1: There may be multiple currently selected harts -- that selected by \Fhartsel,
-* plus those selected by the hart array mask register.
+* 1: There may be multiple currently selected harts -- the hart
+* selected by \Fhartsel, plus those selected by the hart array mask
+* register.
 *
 * An implementation which does not implement the hart array mask register
 * must tie this field to 0. A debugger which wishes to use the hart array
@@ -893,7 +1075,8 @@
 #define DMI_DMCONTROL_HARTSELHI             (0x3ffU << DMI_DMCONTROL_HARTSELHI_OFFSET)
 /*
 * This optional field writes the halt-on-reset request bit for all
-* currently selected harts.
+* currently selected harts, unless \Fclrresethaltreq is
+* simultaneously set to 1.
 * When set to 1, each selected hart will halt upon the next deassertion
 * of its reset. The halt-on-reset request bit is not automatically
 * cleared. The debugger must write to \Fclrresethaltreq to clear it.
@@ -931,20 +1114,26 @@
 *
 * 0: The module's state, including authentication mechanism,
 * takes its reset values (the \Fdmactive bit is the only bit which can
-* be written to something other than its reset value).
+* be written to something other than its reset value). Any accesses
+* to the module may fail. Specifically, \Fversion may not return
+* correct data.
 *
 * 1: The module functions normally.
 *
 * No other mechanism should exist that may result in resetting the
-* Debug Module after power up, including the platform's system reset
-* or Debug Transport reset signals.
+* Debug Module after power up, with the possible (but not
+* recommended) exception of a global reset signal that resets the
+* entire platform.
 *
 * A debugger may pulse this bit low to get the Debug Module into a
 * known state.
 *
-* Implementations may use this bit to aid debugging, for example by
-* preventing the Debug Module from being power gated while debugging
-* is active.
+* Implementations may pay attention to this bit to further aid
+* debugging, for example by preventing the Debug Module from being
+* power gated while debugging is active.
+*
+* For forward compatibility, \Fversion will always be readable when
+* \Fdmactive is 1.
  */
 #define DMI_DMCONTROL_DMACTIVE_OFFSET       0
 #define DMI_DMCONTROL_DMACTIVE_LENGTH       1
@@ -960,8 +1149,8 @@
 #define DMI_HARTINFO_NSCRATCH_LENGTH        4
 #define DMI_HARTINFO_NSCRATCH               (0xfU << DMI_HARTINFO_NSCRATCH_OFFSET)
 /*
-* 0: The {\tt data} registers are shadowed in the hart by CSR
-* registers. Each CSR register is MXLEN bits in size, and corresponds
+* 0: The {\tt data} registers are shadowed in the hart by CSRs.
+* Each CSR is DXLEN bits in size, and corresponds
 * to a single argument, per Table~\ref{tab:datareg}.
 *
 * 1: The {\tt data} registers are shadowed in the hart's memory map.
@@ -971,7 +1160,7 @@
 #define DMI_HARTINFO_DATAACCESS_LENGTH      1
 #define DMI_HARTINFO_DATAACCESS             (0x1U << DMI_HARTINFO_DATAACCESS_OFFSET)
 /*
-* If \Fdataaccess is 0: Number of CSR registers dedicated to
+* If \Fdataaccess is 0: Number of CSRs dedicated to
 * shadowing the {\tt data} registers.
 *
 * If \Fdataaccess is 1: Number of 32-bit words in the memory map
@@ -996,7 +1185,7 @@
 #define DMI_HAWINDOWSEL                     0x14
 /*
 * The high bits of this field may be tied to 0, depending on how large
-* the array mask register is.  Eg. on a system with 48 harts only bit 0
+* the array mask register is.  E.g.\ on a system with 48 harts only bit 0
 * of this field may actually be writable.
  */
 #define DMI_HAWINDOWSEL_HAWINDOWSEL_OFFSET  0
@@ -1027,20 +1216,26 @@
 * they are cleared by writing 1 to them. No abstract command is
 * started until the value is reset to 0.
 *
+* This field only contains a valid value if \Fbusy is 0.
+*
 * 0 (none): No error.
 *
 * 1 (busy): An abstract command was executing while \Rcommand,
-* \Rabstractcs, \Rabstractauto was written, or when one
+* \Rabstractcs, or \Rabstractauto was written, or when one
 * of the {\tt data} or {\tt progbuf} registers was read or written.
+* This status is only written if \Fcmderr contains 0.
 *
 * 2 (not supported): The requested command is not supported,
 * regardless of whether the hart is running or not.
 *
 * 3 (exception): An exception occurred while executing the command
-* (eg. while executing the Program Buffer).
+* (e.g.\ while executing the Program Buffer).
 *
 * 4 (halt/resume): The abstract command couldn't execute because the
-* hart wasn't in the required state (running/halted).
+* hart wasn't in the required state (running/halted), or was unavailable.
+*
+* 5 (bus): The abstract command failed due to a bus error (e.g.\
+* alignment, access size, or timeout).
 *
 * 7 (other): The command failed for another reason.
  */
@@ -1049,7 +1244,7 @@
 #define DMI_ABSTRACTCS_CMDERR               (0x7U << DMI_ABSTRACTCS_CMDERR_OFFSET)
 /*
 * Number of {\tt data} registers that are implemented as part of the
-* abstract command interface. Valid sizes are 0 - 12.
+* abstract command interface. Valid sizes are 1 -- 12.
  */
 #define DMI_ABSTRACTCS_DATACOUNT_OFFSET     0
 #define DMI_ABSTRACTCS_DATACOUNT_LENGTH     4
@@ -1071,26 +1266,28 @@
 #define DMI_COMMAND_CONTROL                 (0xffffffU << DMI_COMMAND_CONTROL_OFFSET)
 #define DMI_ABSTRACTAUTO                    0x18
 /*
-* When a bit in this field is 1, read or write accesses to the corresponding {\tt progbuf} word
-* cause the command in \Rcommand to be executed again.
+* When a bit in this field is 1, read or write accesses to the
+* corresponding {\tt progbuf} word cause the command in \Rcommand to
+* be executed again.
  */
 #define DMI_ABSTRACTAUTO_AUTOEXECPROGBUF_OFFSET 16
 #define DMI_ABSTRACTAUTO_AUTOEXECPROGBUF_LENGTH 16
 #define DMI_ABSTRACTAUTO_AUTOEXECPROGBUF    (0xffffU << DMI_ABSTRACTAUTO_AUTOEXECPROGBUF_OFFSET)
 /*
-* When a bit in this field is 1, read or write accesses to the corresponding {\tt data} word
-* cause the command in \Rcommand to be executed again.
+* When a bit in this field is 1, read or write accesses to the
+* corresponding {\tt data} word cause the command in \Rcommand to be
+* executed again.
  */
 #define DMI_ABSTRACTAUTO_AUTOEXECDATA_OFFSET 0
 #define DMI_ABSTRACTAUTO_AUTOEXECDATA_LENGTH 12
 #define DMI_ABSTRACTAUTO_AUTOEXECDATA       (0xfffU << DMI_ABSTRACTAUTO_AUTOEXECDATA_OFFSET)
-#define DMI_DEVTREEADDR0                    0x19
-#define DMI_DEVTREEADDR0_ADDR_OFFSET        0
-#define DMI_DEVTREEADDR0_ADDR_LENGTH        32
-#define DMI_DEVTREEADDR0_ADDR               (0xffffffffU << DMI_DEVTREEADDR0_ADDR_OFFSET)
-#define DMI_DEVTREEADDR1                    0x1a
-#define DMI_DEVTREEADDR2                    0x1b
-#define DMI_DEVTREEADDR3                    0x1c
+#define DMI_CONFSTRPTR0                     0x19
+#define DMI_CONFSTRPTR0_ADDR_OFFSET         0
+#define DMI_CONFSTRPTR0_ADDR_LENGTH         32
+#define DMI_CONFSTRPTR0_ADDR                (0xffffffffU << DMI_CONFSTRPTR0_ADDR_OFFSET)
+#define DMI_CONFSTRPTR1                     0x1a
+#define DMI_CONFSTRPTR2                     0x1b
+#define DMI_CONFSTRPTR3                     0x1c
 #define DMI_NEXTDM                          0x1d
 #define DMI_NEXTDM_ADDR_OFFSET              0
 #define DMI_NEXTDM_ADDR_LENGTH              32
@@ -1109,6 +1306,55 @@
 #define DMI_AUTHDATA_DATA_OFFSET            0
 #define DMI_AUTHDATA_DATA_LENGTH            32
 #define DMI_AUTHDATA_DATA                   (0xffffffffU << DMI_AUTHDATA_DATA_OFFSET)
+#define DMI_DMCS2                           0x32
+/*
+* This field contains the currently selected external trigger.
+*
+* If a non-existent trigger value is written here, the hardware will
+* change it to a valid one, or to 0 if no external triggers exist.
+ */
+#define DMI_DMCS2_EXTTRIGGER_OFFSET         7
+#define DMI_DMCS2_EXTTRIGGER_LENGTH         4
+#define DMI_DMCS2_EXTTRIGGER                (0xfU << DMI_DMCS2_EXTTRIGGER_OFFSET)
+/*
+* When \Fhgselect is 0, contains the halt group of the hart
+* specified by \Fhartsel.
+*
+* When \Fhgselect is 1, contains the halt group of the external
+* trigger selected by \Fexttrigger.
+*
+* Writes only have an effect if \Fhgwrite is also written 1.
+*
+* An implementation may tie any number of upper bits in this field to
+* 0. If halt groups aren't implemented, then this entire field
+* is 0.
+ */
+#define DMI_DMCS2_HALTGROUP_OFFSET          2
+#define DMI_DMCS2_HALTGROUP_LENGTH          5
+#define DMI_DMCS2_HALTGROUP                 (0x1fU << DMI_DMCS2_HALTGROUP_OFFSET)
+/*
+* When \Fhgselect is 0, writing 1 changes the halt group of all
+* selected harts to the value written to \Fhaltgroup.
+*
+* When \Fhgselect is 1, writing 1 changes the halt group of the
+* external trigger selected by \Fexttrigger to the value written to
+* \Fhaltgroup.
+*
+* Writing 0 has no effect.
+ */
+#define DMI_DMCS2_HGWRITE_OFFSET            1
+#define DMI_DMCS2_HGWRITE_LENGTH            1
+#define DMI_DMCS2_HGWRITE                   (0x1U << DMI_DMCS2_HGWRITE_OFFSET)
+/*
+* 0: Operate on harts.
+*
+* 1: Operate on external triggers.
+*
+* If there are no external triggers, this field must be tied to 0.
+ */
+#define DMI_DMCS2_HGSELECT_OFFSET           0
+#define DMI_DMCS2_HGSELECT_LENGTH           1
+#define DMI_DMCS2_HGSELECT                  (0x1U << DMI_DMCS2_HGSELECT_OFFSET)
 #define DMI_HALTSUM0                        0x40
 #define DMI_HALTSUM0_HALTSUM0_OFFSET        0
 #define DMI_HALTSUM0_HALTSUM0_LENGTH        32
@@ -1125,14 +1371,6 @@
 #define DMI_HALTSUM3_HALTSUM3_OFFSET        0
 #define DMI_HALTSUM3_HALTSUM3_LENGTH        32
 #define DMI_HALTSUM3_HALTSUM3               (0xffffffffU << DMI_HALTSUM3_HALTSUM3_OFFSET)
-#define DMI_SBADDRESS3                      0x37
-/*
-* Accesses bits 127:96 of the physical address in {\tt sbaddress} (if
-* the system address bus is that wide).
- */
-#define DMI_SBADDRESS3_ADDRESS_OFFSET       0
-#define DMI_SBADDRESS3_ADDRESS_LENGTH       32
-#define DMI_SBADDRESS3_ADDRESS              (0xffffffffU << DMI_SBADDRESS3_ADDRESS_OFFSET)
 #define DMI_SBCS                            0x38
 /*
 * 0: The System Bus interface conforms to mainline drafts of this
@@ -1151,7 +1389,7 @@
 * already in progress (while \Fsbbusy is set). It remains set until
 * it's explicitly cleared by the debugger.
 *
-* While this field is non-zero, no more system bus accesses can be
+* While this field is set, no more system bus accesses can be
 * initiated by the Debug Module.
  */
 #define DMI_SBCS_SBBUSYERROR_OFFSET         22
@@ -1191,7 +1429,7 @@
 * 4: 128-bit
 *
 * If \Fsbaccess has an unsupported value when the DM starts a bus
-* access, the access is not performed and \Fsberror is set to 3.
+* access, the access is not performed and \Fsberror is set to 4.
  */
 #define DMI_SBCS_SBACCESS_OFFSET            17
 #define DMI_SBCS_SBACCESS_LENGTH            3
@@ -1212,12 +1450,12 @@
 #define DMI_SBCS_SBREADONDATA               (0x1U << DMI_SBCS_SBREADONDATA_OFFSET)
 /*
 * When the Debug Module's system bus
-* master causes a bus error, this field gets set. The bits in this
+* master encounters an error, this field gets set. The bits in this
 * field remain set until they are cleared by writing 1 to them.
 * While this field is non-zero, no more system bus accesses can be
 * initiated by the Debug Module.
 *
-* An implementation may report "Other" (7) for any error condition.
+* An implementation may report ``Other'' (7) for any error condition.
 *
 * 0: There was no bus error.
 *
@@ -1294,6 +1532,14 @@
 #define DMI_SBADDRESS2_ADDRESS_OFFSET       0
 #define DMI_SBADDRESS2_ADDRESS_LENGTH       32
 #define DMI_SBADDRESS2_ADDRESS              (0xffffffffU << DMI_SBADDRESS2_ADDRESS_OFFSET)
+#define DMI_SBADDRESS3                      0x37
+/*
+* Accesses bits 127:96 of the physical address in {\tt sbaddress} (if
+* the system address bus is that wide).
+ */
+#define DMI_SBADDRESS3_ADDRESS_OFFSET       0
+#define DMI_SBADDRESS3_ADDRESS_LENGTH       32
+#define DMI_SBADDRESS3_ADDRESS              (0xffffffffU << DMI_SBADDRESS3_ADDRESS_OFFSET)
 #define DMI_SBDATA0                         0x3c
 /*
 * Accesses bits 31:0 of {\tt sbdata}.
@@ -1325,6 +1571,9 @@
 #define DMI_SBDATA3_DATA_OFFSET             0
 #define DMI_SBDATA3_DATA_LENGTH             32
 #define DMI_SBDATA3_DATA                    (0xffffffffU << DMI_SBDATA3_DATA_OFFSET)
+#define DMI_CUSTOM                          0x1f
+#define DMI_CUSTOM0                         0x70
+#define DMI_CUSTOM15                        0x7f
 #define SHORTNAME                           0x123
 /*
 * Description of what this field is used for.
@@ -1346,19 +1595,32 @@
 *
 * 4: Access the lowest 128 bits of the register.
 *
-* If \Fsize specifies a size larger than the register's actual size,
-* then the access must fail. If a register is accessible, then reads of \Fsize
+* If \Faarsize specifies a size larger than the register's actual size,
+* then the access must fail. If a register is accessible, then reads of \Faarsize
 * less than or equal to the register's actual size must be supported.
 *
 * This field controls the Argument Width as referenced in
 * Table~\ref{tab:datareg}.
  */
-#define AC_ACCESS_REGISTER_SIZE_OFFSET      20
-#define AC_ACCESS_REGISTER_SIZE_LENGTH      3
-#define AC_ACCESS_REGISTER_SIZE             (0x7U << AC_ACCESS_REGISTER_SIZE_OFFSET)
+#define AC_ACCESS_REGISTER_AARSIZE_OFFSET   20
+#define AC_ACCESS_REGISTER_AARSIZE_LENGTH   3
+#define AC_ACCESS_REGISTER_AARSIZE          (0x7U << AC_ACCESS_REGISTER_AARSIZE_OFFSET)
+/*
+* 0: No effect. This variant must be supported.
+*
+* 1: After a successful register access, \Fregno is incremented
+* (wrapping around to 0). Supporting this variant is optional.
+ */
+#define AC_ACCESS_REGISTER_AARPOSTINCREMENT_OFFSET 19
+#define AC_ACCESS_REGISTER_AARPOSTINCREMENT_LENGTH 1
+#define AC_ACCESS_REGISTER_AARPOSTINCREMENT (0x1U << AC_ACCESS_REGISTER_AARPOSTINCREMENT_OFFSET)
 /*
-* When 1, execute the program in the Program Buffer exactly once
-* after performing the transfer, if any.
+* 0: No effect. This variant must be supported, and is the only
+* supported one if \Fprogbufsize is 0.
+*
+* 1: Execute the program in the Program Buffer exactly once after
+* performing the transfer, if any. Supporting this variant is
+* optional.
  */
 #define AC_ACCESS_REGISTER_POSTEXEC_OFFSET  18
 #define AC_ACCESS_REGISTER_POSTEXEC_LENGTH  1
@@ -1369,7 +1631,7 @@
 * 1: Do the operation specified by \Fwrite.
 *
 * This bit can be used to just execute the Program Buffer without
-* having to worry about placing valid values into \Fsize or \Fregno.
+* having to worry about placing valid values into \Faarsize or \Fregno.
  */
 #define AC_ACCESS_REGISTER_TRANSFER_OFFSET  17
 #define AC_ACCESS_REGISTER_TRANSFER_LENGTH  1
@@ -1401,14 +1663,177 @@
 #define AC_QUICK_ACCESS_CMDTYPE_OFFSET      24
 #define AC_QUICK_ACCESS_CMDTYPE_LENGTH      8
 #define AC_QUICK_ACCESS_CMDTYPE             (0xffU << AC_QUICK_ACCESS_CMDTYPE_OFFSET)
+#define AC_ACCESS_MEMORY                    None
+/*
+* This is 2 to indicate Access Memory Command.
+ */
+#define AC_ACCESS_MEMORY_CMDTYPE_OFFSET     24
+#define AC_ACCESS_MEMORY_CMDTYPE_LENGTH     8
+#define AC_ACCESS_MEMORY_CMDTYPE            (0xffU << AC_ACCESS_MEMORY_CMDTYPE_OFFSET)
+/*
+* An implementation does not have to implement both virtual and
+* physical accesses, but it must fail accesses that it doesn't
+* support.
+*
+* 0: Addresses are physical (to the hart they are performed on).
+*
+* 1: Addresses are virtual, and translated the way they would be from
+* M-mode, with \Fmprv set.
+ */
+#define AC_ACCESS_MEMORY_AAMVIRTUAL_OFFSET  23
+#define AC_ACCESS_MEMORY_AAMVIRTUAL_LENGTH  1
+#define AC_ACCESS_MEMORY_AAMVIRTUAL         (0x1U << AC_ACCESS_MEMORY_AAMVIRTUAL_OFFSET)
+/*
+* 0: Access the lowest 8 bits of the memory location.
+*
+* 1: Access the lowest 16 bits of the memory location.
+*
+* 2: Access the lowest 32 bits of the memory location.
+*
+* 3: Access the lowest 64 bits of the memory location.
+*
+* 4: Access the lowest 128 bits of the memory location.
+ */
+#define AC_ACCESS_MEMORY_AAMSIZE_OFFSET     20
+#define AC_ACCESS_MEMORY_AAMSIZE_LENGTH     3
+#define AC_ACCESS_MEMORY_AAMSIZE            (0x7U << AC_ACCESS_MEMORY_AAMSIZE_OFFSET)
+/*
+* After a memory access has completed, if this bit is 1, increment
+* {\tt arg1} (which contains the address used) by the number of bytes
+* encoded in \Faamsize.
+ */
+#define AC_ACCESS_MEMORY_AAMPOSTINCREMENT_OFFSET 19
+#define AC_ACCESS_MEMORY_AAMPOSTINCREMENT_LENGTH 1
+#define AC_ACCESS_MEMORY_AAMPOSTINCREMENT   (0x1U << AC_ACCESS_MEMORY_AAMPOSTINCREMENT_OFFSET)
+/*
+* 0: Copy data from the memory location specified in {\tt arg1} into
+* the low bits of {\tt arg0}. Any remaining bits of {\tt arg0} now
+* have an undefined value.
+*
+* 1: Copy data from the low bits of {\tt arg0} into the memory
+* location specified in {\tt arg1}.
+ */
+#define AC_ACCESS_MEMORY_WRITE_OFFSET       16
+#define AC_ACCESS_MEMORY_WRITE_LENGTH       1
+#define AC_ACCESS_MEMORY_WRITE              (0x1U << AC_ACCESS_MEMORY_WRITE_OFFSET)
+/*
+* These bits are reserved for target-specific uses.
+ */
+#define AC_ACCESS_MEMORY_TARGET_SPECIFIC_OFFSET 14
+#define AC_ACCESS_MEMORY_TARGET_SPECIFIC_LENGTH 2
+#define AC_ACCESS_MEMORY_TARGET_SPECIFIC    (0x3U << AC_ACCESS_MEMORY_TARGET_SPECIFIC_OFFSET)
 #define VIRT_PRIV                           virtual
 /*
 * Contains the privilege level the hart was operating in when Debug
 * Mode was entered. The encoding is described in Table
 * \ref{tab:privlevel}, and matches the privilege level encoding from
-* the RISC-V Privileged ISA Specification. A user can write this
+* the Privileged Spec. A user can write this
 * value to change the hart's privilege level when exiting Debug Mode.
  */
 #define VIRT_PRIV_PRV_OFFSET                0
 #define VIRT_PRIV_PRV_LENGTH                2
 #define VIRT_PRIV_PRV                       (0x3U << VIRT_PRIV_PRV_OFFSET)
+#define DMI_SERCS                           0x34
+/*
+* Number of supported serial ports.
+ */
+#define DMI_SERCS_SERIALCOUNT_OFFSET        28
+#define DMI_SERCS_SERIALCOUNT_LENGTH        4
+#define DMI_SERCS_SERIALCOUNT               (0xfU << DMI_SERCS_SERIALCOUNT_OFFSET)
+/*
+* Select which serial port is accessed by \Rserrx and \Rsertx.
+ */
+#define DMI_SERCS_SERIAL_OFFSET             24
+#define DMI_SERCS_SERIAL_LENGTH             3
+#define DMI_SERCS_SERIAL                    (0x7U << DMI_SERCS_SERIAL_OFFSET)
+#define DMI_SERCS_ERROR7_OFFSET             23
+#define DMI_SERCS_ERROR7_LENGTH             1
+#define DMI_SERCS_ERROR7                    (0x1U << DMI_SERCS_ERROR7_OFFSET)
+#define DMI_SERCS_VALID7_OFFSET             22
+#define DMI_SERCS_VALID7_LENGTH             1
+#define DMI_SERCS_VALID7                    (0x1U << DMI_SERCS_VALID7_OFFSET)
+#define DMI_SERCS_FULL7_OFFSET              21
+#define DMI_SERCS_FULL7_LENGTH              1
+#define DMI_SERCS_FULL7                     (0x1U << DMI_SERCS_FULL7_OFFSET)
+#define DMI_SERCS_ERROR6_OFFSET             20
+#define DMI_SERCS_ERROR6_LENGTH             1
+#define DMI_SERCS_ERROR6                    (0x1U << DMI_SERCS_ERROR6_OFFSET)
+#define DMI_SERCS_VALID6_OFFSET             19
+#define DMI_SERCS_VALID6_LENGTH             1
+#define DMI_SERCS_VALID6                    (0x1U << DMI_SERCS_VALID6_OFFSET)
+#define DMI_SERCS_FULL6_OFFSET              18
+#define DMI_SERCS_FULL6_LENGTH              1
+#define DMI_SERCS_FULL6                     (0x1U << DMI_SERCS_FULL6_OFFSET)
+#define DMI_SERCS_ERROR5_OFFSET             17
+#define DMI_SERCS_ERROR5_LENGTH             1
+#define DMI_SERCS_ERROR5                    (0x1U << DMI_SERCS_ERROR5_OFFSET)
+#define DMI_SERCS_VALID5_OFFSET             16
+#define DMI_SERCS_VALID5_LENGTH             1
+#define DMI_SERCS_VALID5                    (0x1U << DMI_SERCS_VALID5_OFFSET)
+#define DMI_SERCS_FULL5_OFFSET              15
+#define DMI_SERCS_FULL5_LENGTH              1
+#define DMI_SERCS_FULL5                     (0x1U << DMI_SERCS_FULL5_OFFSET)
+#define DMI_SERCS_ERROR4_OFFSET             14
+#define DMI_SERCS_ERROR4_LENGTH             1
+#define DMI_SERCS_ERROR4                    (0x1U << DMI_SERCS_ERROR4_OFFSET)
+#define DMI_SERCS_VALID4_OFFSET             13
+#define DMI_SERCS_VALID4_LENGTH             1
+#define DMI_SERCS_VALID4                    (0x1U << DMI_SERCS_VALID4_OFFSET)
+#define DMI_SERCS_FULL4_OFFSET              12
+#define DMI_SERCS_FULL4_LENGTH              1
+#define DMI_SERCS_FULL4                     (0x1U << DMI_SERCS_FULL4_OFFSET)
+#define DMI_SERCS_ERROR3_OFFSET             11
+#define DMI_SERCS_ERROR3_LENGTH             1
+#define DMI_SERCS_ERROR3                    (0x1U << DMI_SERCS_ERROR3_OFFSET)
+#define DMI_SERCS_VALID3_OFFSET             10
+#define DMI_SERCS_VALID3_LENGTH             1
+#define DMI_SERCS_VALID3                    (0x1U << DMI_SERCS_VALID3_OFFSET)
+#define DMI_SERCS_FULL3_OFFSET              9
+#define DMI_SERCS_FULL3_LENGTH              1
+#define DMI_SERCS_FULL3                     (0x1U << DMI_SERCS_FULL3_OFFSET)
+#define DMI_SERCS_ERROR2_OFFSET             8
+#define DMI_SERCS_ERROR2_LENGTH             1
+#define DMI_SERCS_ERROR2                    (0x1U << DMI_SERCS_ERROR2_OFFSET)
+#define DMI_SERCS_VALID2_OFFSET             7
+#define DMI_SERCS_VALID2_LENGTH             1
+#define DMI_SERCS_VALID2                    (0x1U << DMI_SERCS_VALID2_OFFSET)
+#define DMI_SERCS_FULL2_OFFSET              6
+#define DMI_SERCS_FULL2_LENGTH              1
+#define DMI_SERCS_FULL2                     (0x1U << DMI_SERCS_FULL2_OFFSET)
+#define DMI_SERCS_ERROR1_OFFSET             5
+#define DMI_SERCS_ERROR1_LENGTH             1
+#define DMI_SERCS_ERROR1                    (0x1U << DMI_SERCS_ERROR1_OFFSET)
+#define DMI_SERCS_VALID1_OFFSET             4
+#define DMI_SERCS_VALID1_LENGTH             1
+#define DMI_SERCS_VALID1                    (0x1U << DMI_SERCS_VALID1_OFFSET)
+#define DMI_SERCS_FULL1_OFFSET              3
+#define DMI_SERCS_FULL1_LENGTH              1
+#define DMI_SERCS_FULL1                     (0x1U << DMI_SERCS_FULL1_OFFSET)
+/*
+* 1 when the debugger-to-core queue for serial port 0 has
+* overflowed or underflowed. This bit will remain set until it is
+* reset by writing 1 to this bit.
+ */
+#define DMI_SERCS_ERROR0_OFFSET             2
+#define DMI_SERCS_ERROR0_LENGTH             1
+#define DMI_SERCS_ERROR0                    (0x1U << DMI_SERCS_ERROR0_OFFSET)
+/*
+* 1 when the core-to-debugger queue for serial port 0 is not empty.
+ */
+#define DMI_SERCS_VALID0_OFFSET             1
+#define DMI_SERCS_VALID0_LENGTH             1
+#define DMI_SERCS_VALID0                    (0x1U << DMI_SERCS_VALID0_OFFSET)
+/*
+* 1 when the debugger-to-core queue for serial port 0 is full.
+ */
+#define DMI_SERCS_FULL0_OFFSET              0
+#define DMI_SERCS_FULL0_LENGTH              1
+#define DMI_SERCS_FULL0                     (0x1U << DMI_SERCS_FULL0_OFFSET)
+#define DMI_SERTX                           0x35
+#define DMI_SERTX_DATA_OFFSET               0
+#define DMI_SERTX_DATA_LENGTH               32
+#define DMI_SERTX_DATA                      (0xffffffffU << DMI_SERTX_DATA_OFFSET)
+#define DMI_SERRX                           0x36
+#define DMI_SERRX_DATA_OFFSET               0
+#define DMI_SERRX_DATA_LENGTH               32
+#define DMI_SERRX_DATA                      (0xffffffffU << DMI_SERRX_DATA_OFFSET)
diff --git a/riscv/debug_module.cc b/riscv/debug_module.cc
index 96de3c8ab2..5490d0edcf 100644
--- a/riscv/debug_module.cc
+++ b/riscv/debug_module.cc
@@ -15,36 +15,56 @@
 #  define D(x)
 #endif
 
+// Return the width in bits of a field that can encode n different values
+// (0 .. n-1), i.e. ceil(log2(n)); n == 0 is treated like n == 1.
+// 0->0, 1->0, 2->1, 3->2, 4->2, 5->3
+static unsigned field_width(unsigned n)
+{
+  // n == 0 would wrap to UINT_MAX below and report 32 bits; report 0 instead.
+  if (n == 0) return 0;
+  unsigned width = 0;
+  // Count the bits needed to represent n - 1, the largest encoded value.
+  for (unsigned largest = n - 1; largest != 0; largest >>= 1)
+    width++;
+  return width;
+}
+
 ///////////////////////// debug_module_t
 
-debug_module_t::debug_module_t(sim_t *sim, unsigned progbufsize, unsigned max_bus_master_bits,
-    bool require_authentication) :
-  progbufsize(progbufsize),
-  program_buffer_bytes(4 + 4*progbufsize),
-  max_bus_master_bits(max_bus_master_bits),
-  require_authentication(require_authentication),
+debug_module_t::debug_module_t(sim_t *sim, const debug_module_config_t &config) :
+  nprocs(sim->nprocs()),
+  config(config),
+  program_buffer_bytes((config.support_impebreak ? 4 : 0) + 4*config.progbufsize),
   debug_progbuf_start(debug_data_start - program_buffer_bytes),
   debug_abstract_start(debug_progbuf_start - debug_abstract_size*4),
   custom_base(0),
-  sim(sim)
+  hartsellen(field_width(sim->nprocs())),
+  sim(sim),
+  // The spec lets a debugger select nonexistent harts. Create hart_state for
+  // them because I'm too lazy to add the code to just ignore accesses.
+  hart_state(1 << field_width(sim->nprocs())),
+  hart_array_mask(sim->nprocs()),
+  rti_remaining(0)
 {
   D(fprintf(stderr, "debug_data_start=0x%x\n", debug_data_start));
   D(fprintf(stderr, "debug_progbuf_start=0x%x\n", debug_progbuf_start));
   D(fprintf(stderr, "debug_abstract_start=0x%x\n", debug_abstract_start));
 
+  assert(nprocs <= 1024);
+
   program_buffer = new uint8_t[program_buffer_bytes];
 
-  memset(halted, 0, sizeof(halted));
   memset(debug_rom_flags, 0, sizeof(debug_rom_flags));
-  memset(resumeack, 0, sizeof(resumeack));
-  memset(havereset, 0, sizeof(havereset));
   memset(program_buffer, 0, program_buffer_bytes);
-  program_buffer[4*progbufsize] = ebreak();
-  program_buffer[4*progbufsize+1] = ebreak() >> 8;
-  program_buffer[4*progbufsize+2] = ebreak() >> 16;
-  program_buffer[4*progbufsize+3] = ebreak() >> 24;
   memset(dmdata, 0, sizeof(dmdata));
 
+  if (config.support_impebreak) {
+    program_buffer[4*config.progbufsize] = ebreak();
+    program_buffer[4*config.progbufsize+1] = ebreak() >> 8;
+    program_buffer[4*config.progbufsize+2] = ebreak() >> 16;
+    program_buffer[4*config.progbufsize+3] = ebreak() >> 24;
+  }
+
   write32(debug_rom_whereto, 0,
           jal(ZERO, debug_abstract_start - DEBUG_ROM_WHERETO));
 
@@ -60,37 +80,38 @@ debug_module_t::~debug_module_t()
 
 void debug_module_t::reset()
 {
+  assert(sim->nprocs() > 0);
   for (unsigned i = 0; i < sim->nprocs(); i++) {
     processor_t *proc = sim->get_core(i);
     if (proc)
-      proc->halt_request = false;
+      proc->halt_request = proc->HR_NONE;
   }
 
   dmcontrol = {0};
 
   dmstatus = {0};
-  dmstatus.impebreak = true;
-  dmstatus.authenticated = !require_authentication;
+  dmstatus.impebreak = config.support_impebreak;
+  dmstatus.authenticated = !config.require_authentication;
   dmstatus.version = 2;
 
   abstractcs = {0};
   abstractcs.datacount = sizeof(dmdata) / 4;
-  abstractcs.progbufsize = progbufsize;
+  abstractcs.progbufsize = config.progbufsize;
 
   abstractauto = {0};
 
   sbcs = {0};
-  if (max_bus_master_bits > 0) {
+  if (config.max_bus_master_bits > 0) {
     sbcs.version = 1;
     sbcs.asize = sizeof(reg_t) * 8;
   }
-  if (max_bus_master_bits >= 64)
+  if (config.max_bus_master_bits >= 64)
     sbcs.access64 = true;
-  if (max_bus_master_bits >= 32)
+  if (config.max_bus_master_bits >= 32)
     sbcs.access32 = true;
-  if (max_bus_master_bits >= 16)
+  if (config.max_bus_master_bits >= 16)
     sbcs.access16 = true;
-  if (max_bus_master_bits >= 8)
+  if (config.max_bus_master_bits >= 8)
     sbcs.access8 = true;
 
   challenge = random();
@@ -135,8 +156,8 @@ bool debug_module_t::load(reg_t addr, size_t len, uint8_t* bytes)
     return true;
   }
 
-  fprintf(stderr, "ERROR: invalid load from debug module: %zd bytes at 0x%016"
-          PRIx64 "\n", len, addr);
+  D(fprintf(stderr, "ERROR: invalid load from debug module: %zd bytes at 0x%016"
+          PRIx64 "\n", len, addr));
 
   return false;
 }
@@ -179,11 +200,24 @@ bool debug_module_t::store(reg_t addr, size_t len, const uint8_t* bytes)
 
   if (addr == DEBUG_ROM_HALTED) {
     assert (len == 4);
-    halted[id] = true;
+    if (!hart_state[id].halted) {
+      hart_state[id].halted = true;
+      if (hart_state[id].haltgroup) {
+        for (unsigned i = 0; i < nprocs; i++) {
+          if (!hart_state[i].halted &&
+              hart_state[i].haltgroup == hart_state[id].haltgroup) {
+            processor_t *proc = sim->get_core(i);
+            proc->halt_request = proc->HR_GROUP;
+            // TODO: What if the debugger comes and writes dmcontrol before the
+            // halt occurs?
+          }
+        }
+      }
+    }
     if (dmcontrol.hartsel == id) {
         if (0 == (debug_rom_flags[id] & (1 << DEBUG_ROM_FLAG_GO))){
           if (dmcontrol.hartsel == id) {
-              abstractcs.busy = false;
+              abstract_command_completed = true;
           }
         }
     }
@@ -191,14 +225,15 @@ bool debug_module_t::store(reg_t addr, size_t len, const uint8_t* bytes)
   }
 
   if (addr == DEBUG_ROM_GOING) {
-    debug_rom_flags[dmcontrol.hartsel] &= ~(1 << DEBUG_ROM_FLAG_GO);
+    assert(len == 4);
+    debug_rom_flags[id] &= ~(1 << DEBUG_ROM_FLAG_GO);
     return true;
   }
 
   if (addr == DEBUG_ROM_RESUMING) {
     assert (len == 4);
-    halted[id] = false;
-    resumeack[id] = true;
+    hart_state[id].halted = false;
+    hart_state[id].resumeack = true;
     debug_rom_flags[id] &= ~(1 << DEBUG_ROM_FLAG_RESUME);
     return true;
   }
@@ -210,8 +245,8 @@ bool debug_module_t::store(reg_t addr, size_t len, const uint8_t* bytes)
     return true;
   }
 
-  fprintf(stderr, "ERROR: invalid store to debug module: %zd bytes at 0x%016"
-          PRIx64 "\n", len, addr);
+  D(fprintf(stderr, "ERROR: invalid store to debug module: %zd bytes at 0x%016"
+          PRIx64 "\n", len, addr));
   return false;
 }
 
@@ -234,16 +269,25 @@ uint32_t debug_module_t::read32(uint8_t *memory, unsigned int index)
   return value;
 }
 
-processor_t *debug_module_t::current_proc() const
+processor_t *debug_module_t::processor(unsigned hartid) const
 {
   processor_t *proc = NULL;
   try {
-    proc = sim->get_core(dmcontrol.hartsel);
+    proc = sim->get_core(hartid);  // std::out_of_range is swallowed below, leaving proc NULL
   } catch (const std::out_of_range&) {
   }
   return proc;
 }
 
+bool debug_module_t::hart_selected(unsigned hartid) const
+{
+  // The hart named by dmcontrol.hartsel is always part of the selection.
+  if (hartid == dmcontrol.hartsel)
+    return true;
+  // With hasel set, harts flagged in hart_array_mask are selected as well.
+  return dmcontrol.hasel && hart_array_mask[hartid];
+}
+
 unsigned debug_module_t::sb_access_bits()
 {
   return 8 << sbcs.sbaccess;
@@ -251,7 +295,7 @@ unsigned debug_module_t::sb_access_bits()
 
 void debug_module_t::sb_autoincrement()
 {
-  if (!sbcs.autoincrement || !max_bus_master_bits)
+  if (!sbcs.autoincrement || !config.max_bus_master_bits)
     return;
 
   uint64_t value = sbaddress[0] + sb_access_bits() / 8;
@@ -273,13 +317,13 @@ void debug_module_t::sb_read()
 {
   reg_t address = ((uint64_t) sbaddress[1] << 32) | sbaddress[0];
   try {
-    if (sbcs.sbaccess == 0 && max_bus_master_bits >= 8) {
+    if (sbcs.sbaccess == 0 && config.max_bus_master_bits >= 8) {
       sbdata[0] = sim->debug_mmu->load_uint8(address);
-    } else if (sbcs.sbaccess == 1 && max_bus_master_bits >= 16) {
+    } else if (sbcs.sbaccess == 1 && config.max_bus_master_bits >= 16) {
       sbdata[0] = sim->debug_mmu->load_uint16(address);
-    } else if (sbcs.sbaccess == 2 && max_bus_master_bits >= 32) {
+    } else if (sbcs.sbaccess == 2 && config.max_bus_master_bits >= 32) {
       sbdata[0] = sim->debug_mmu->load_uint32(address);
-    } else if (sbcs.sbaccess == 3 && max_bus_master_bits >= 64) {
+    } else if (sbcs.sbaccess == 3 && config.max_bus_master_bits >= 64) {
       uint64_t value = sim->debug_mmu->load_uint64(address);
       sbdata[0] = value;
       sbdata[1] = value >> 32;
@@ -295,13 +339,13 @@ void debug_module_t::sb_write()
 {
   reg_t address = ((uint64_t) sbaddress[1] << 32) | sbaddress[0];
   D(fprintf(stderr, "sb_write() 0x%x @ 0x%lx\n", sbdata[0], address));
-  if (sbcs.sbaccess == 0 && max_bus_master_bits >= 8) {
+  if (sbcs.sbaccess == 0 && config.max_bus_master_bits >= 8) {
     sim->debug_mmu->store_uint8(address, sbdata[0]);
-  } else if (sbcs.sbaccess == 1 && max_bus_master_bits >= 16) {
+  } else if (sbcs.sbaccess == 1 && config.max_bus_master_bits >= 16) {
     sim->debug_mmu->store_uint16(address, sbdata[0]);
-  } else if (sbcs.sbaccess == 2 && max_bus_master_bits >= 32) {
+  } else if (sbcs.sbaccess == 2 && config.max_bus_master_bits >= 32) {
     sim->debug_mmu->store_uint32(address, sbdata[0]);
-  } else if (sbcs.sbaccess == 3 && max_bus_master_bits >= 64) {
+  } else if (sbcs.sbaccess == 3 && config.max_bus_master_bits >= 64) {
     sim->debug_mmu->store_uint64(address,
         (((uint64_t) sbdata[1]) << 32) | sbdata[0]);
   } else {
@@ -318,7 +362,7 @@ bool debug_module_t::dmi_read(unsigned address, uint32_t *value)
     result = read32(dmdata, i);
     if (abstractcs.busy) {
       result = -1;
-      fprintf(stderr, "\ndmi_read(0x%02x (data[%d]) -> -1 because abstractcs.busy==true\n", address, i);
+      D(fprintf(stderr, "\ndmi_read(0x%02x (data[%d]) -> -1 because abstractcs.busy==true\n", address, i));
     }
 
     if (abstractcs.busy && abstractcs.cmderr == CMDERR_NONE) {
@@ -328,12 +372,12 @@ bool debug_module_t::dmi_read(unsigned address, uint32_t *value)
     if (!abstractcs.busy && ((abstractauto.autoexecdata >> i) & 1)) {
       perform_abstract_command();
     }
-  } else if (address >= DMI_PROGBUF0 && address < DMI_PROGBUF0 + progbufsize) {
+  } else if (address >= DMI_PROGBUF0 && address < DMI_PROGBUF0 + config.progbufsize) {
     unsigned i = address - DMI_PROGBUF0;
     result = read32(program_buffer, i);
     if (abstractcs.busy) {
       result = -1;
-      fprintf(stderr, "\ndmi_read(0x%02x (progbuf[%d]) -> -1 because abstractcs.busy==true\n", address, i);
+      D(fprintf(stderr, "\ndmi_read(0x%02x (progbuf[%d]) -> -1 because abstractcs.busy==true\n", address, i));
     }
     if (!abstractcs.busy && ((abstractauto.autoexecprogbuf >> i) & 1)) {
       perform_abstract_command();
@@ -343,14 +387,11 @@ bool debug_module_t::dmi_read(unsigned address, uint32_t *value)
     switch (address) {
       case DMI_DMCONTROL:
         {
-          processor_t *proc = current_proc();
-          if (proc)
-            dmcontrol.haltreq = proc->halt_request;
-
           result = set_field(result, DMI_DMCONTROL_HALTREQ, dmcontrol.haltreq);
           result = set_field(result, DMI_DMCONTROL_RESUMEREQ, dmcontrol.resumereq);
           result = set_field(result, DMI_DMCONTROL_HARTSELHI,
               dmcontrol.hartsel >> DMI_DMCONTROL_HARTSELLO_LENGTH);
+          result = set_field(result, DMI_DMCONTROL_HASEL, dmcontrol.hasel);
           result = set_field(result, DMI_DMCONTROL_HARTSELLO, dmcontrol.hartsel);
           result = set_field(result, DMI_DMCONTROL_HARTRESET, dmcontrol.hartreset);
 	  result = set_field(result, DMI_DMCONTROL_NDMRESET, dmcontrol.ndmreset);
@@ -359,42 +400,45 @@ bool debug_module_t::dmi_read(unsigned address, uint32_t *value)
         break;
       case DMI_DMSTATUS:
         {
-          processor_t *proc = current_proc();
-
-	  dmstatus.allnonexistant = false;
-	  dmstatus.allunavail = false;
-	  dmstatus.allrunning = false;
-	  dmstatus.allhalted = false;
-          dmstatus.allresumeack = false;
-          if (proc) {
-            if (halted[dmcontrol.hartsel]) {
-              dmstatus.allhalted = true;
-            } else {
-              dmstatus.allrunning = true;
-            }
-          } else {
-	    dmstatus.allnonexistant = true;
-          }
-	  dmstatus.anynonexistant = dmstatus.allnonexistant;
-	  dmstatus.anyunavail = dmstatus.allunavail;
-	  dmstatus.anyrunning = dmstatus.allrunning;
-	  dmstatus.anyhalted = dmstatus.allhalted;
-          if (proc) {
-            if (resumeack[dmcontrol.hartsel]) {
-              dmstatus.allresumeack = true;
-            } else {
-              dmstatus.allresumeack = false;
+	  dmstatus.allhalted = true;
+          dmstatus.anyhalted = false;
+	  dmstatus.allrunning = true;
+          dmstatus.anyrunning = false;
+          dmstatus.allnonexistant = true;
+          dmstatus.allresumeack = true;
+          dmstatus.anyresumeack = false;
+          for (unsigned i = 0; i < nprocs; i++) {
+            if (hart_selected(i)) {
+              dmstatus.allnonexistant = false;
+              if (hart_state[i].resumeack) {
+                dmstatus.anyresumeack = true;
+              } else {
+                dmstatus.allresumeack = false;
+              }
+              if (hart_state[i].halted) {
+                dmstatus.allrunning = false;
+                dmstatus.anyhalted = true;
+              } else {
+                dmstatus.allhalted = false;
+                dmstatus.anyrunning = true;
+              }
             }
-          } else {
-            dmstatus.allresumeack = false;
           }
 
+          // We don't allow selecting nonexistent harts through
+          // hart_array_mask, so the only way it's possible is by writing a
+          // nonexistent hartsel.
+          dmstatus.anynonexistant = (dmcontrol.hartsel >= nprocs);
+
+	  dmstatus.allunavail = false;
+	  dmstatus.anyunavail = false;
+
           result = set_field(result, DMI_DMSTATUS_IMPEBREAK,
               dmstatus.impebreak);
           result = set_field(result, DMI_DMSTATUS_ALLHAVERESET,
-              havereset[dmcontrol.hartsel]);
+              hart_state[dmcontrol.hartsel].havereset);
           result = set_field(result, DMI_DMSTATUS_ANYHAVERESET,
-              havereset[dmcontrol.hartsel]);
+              hart_state[dmcontrol.hartsel].havereset);
 	  result = set_field(result, DMI_DMSTATUS_ALLNONEXISTENT, dmstatus.allnonexistant);
 	  result = set_field(result, DMI_DMSTATUS_ALLUNAVAIL, dmstatus.allunavail);
 	  result = set_field(result, DMI_DMSTATUS_ALLRUNNING, dmstatus.allrunning);
@@ -430,6 +474,20 @@ bool debug_module_t::dmi_read(unsigned address, uint32_t *value)
         result = set_field(result, DMI_HARTINFO_DATASIZE, abstractcs.datacount);
         result = set_field(result, DMI_HARTINFO_DATAADDR, debug_data_start);
         break;
+      case DMI_HAWINDOWSEL:
+        result = hawindowsel;
+        break;
+      case DMI_HAWINDOW:
+        {
+          unsigned base = hawindowsel * 32;
+          for (unsigned i = 0; i < 32; i++) {
+            unsigned n = base + i;
+            if (n < nprocs && hart_array_mask[n]) {
+              result |= 1 << i;
+            }
+          }
+        }
+        break;
       case DMI_SBCS:
         result = set_field(result, DMI_SBCS_SBVERSION, sbcs.version);
         result = set_field(result, DMI_SBCS_SBREADONADDR, sbcs.readonaddr);
@@ -459,10 +517,12 @@ bool debug_module_t::dmi_read(unsigned address, uint32_t *value)
       case DMI_SBDATA0:
         result = sbdata[0];
         if (sbcs.error == 0) {
-          sb_autoincrement();
           if (sbcs.readondata) {
             sb_read();
           }
+          if (sbcs.error == 0) {
+            sb_autoincrement();
+          }
         }
         break;
       case DMI_SBDATA1:
@@ -477,6 +537,10 @@ bool debug_module_t::dmi_read(unsigned address, uint32_t *value)
       case DMI_AUTHDATA:
         result = challenge;
         break;
+      case DMI_DMCS2:
+        result = set_field(result, DMI_DMCS2_HALTGROUP,
+            hart_state[dmcontrol.hartsel].haltgroup);
+        break;
       default:
         result = 0;
         D(fprintf(stderr, "Unexpected. Returning Error."));
@@ -488,6 +552,22 @@ bool debug_module_t::dmi_read(unsigned address, uint32_t *value)
   return true;
 }
 
+void debug_module_t::run_test_idle()
+{
+  // Consume one of the extra Run-Test/Idle cycles, if any are still owed.
+  if (rti_remaining != 0)
+    --rti_remaining;
+  // Drop busy only after the delay has run out and the command completed.
+  if (rti_remaining == 0 && abstractcs.busy && abstract_command_completed)
+    abstractcs.busy = false;
+}
+
+static bool is_fpu_reg(unsigned regno)
+{
+  if (regno >= 0x1020 && regno < 0x1040) return true;  // FPR register numbers
+  return regno == CSR_FFLAGS || regno == CSR_FRM || regno == CSR_FCSR;
+}
+
 bool debug_module_t::perform_abstract_command()
 {
   if (abstractcs.cmderr != CMDERR_NONE)
@@ -499,11 +579,11 @@ bool debug_module_t::perform_abstract_command()
 
   if ((command >> 24) == 0) {
     // register access
-    unsigned size = get_field(command, AC_ACCESS_REGISTER_SIZE);
+    unsigned size = get_field(command, AC_ACCESS_REGISTER_AARSIZE);
     bool write = get_field(command, AC_ACCESS_REGISTER_WRITE);
     unsigned regno = get_field(command, AC_ACCESS_REGISTER_REGNO);
 
-    if (!halted[dmcontrol.hartsel]) {
+    if (!hart_state[dmcontrol.hartsel].halted) {
       abstractcs.cmderr = CMDERR_HALTRESUME;
       return true;
     }
@@ -511,10 +591,22 @@ bool debug_module_t::perform_abstract_command()
     unsigned i = 0;
     if (get_field(command, AC_ACCESS_REGISTER_TRANSFER)) {
 
-      if (regno < 0x1000 && progbufsize < 2) {
-        // Make the debugger use the program buffer if it's available, so it
-        // can test both use cases.
-        write32(debug_abstract, i++, csrw(S0, CSR_DSCRATCH));
+      if (is_fpu_reg(regno)) {
+        // Save S0
+        write32(debug_abstract, i++, csrw(S0, CSR_DSCRATCH0));
+        // Save mstatus
+        write32(debug_abstract, i++, csrr(S0, CSR_MSTATUS));
+        write32(debug_abstract, i++, csrw(S0, CSR_DSCRATCH1));
+        // Set mstatus.fs
+        assert((MSTATUS_FS & 0xfff) == 0);
+        write32(debug_abstract, i++, lui(S0, MSTATUS_FS >> 12));
+        write32(debug_abstract, i++, csrrs(ZERO, S0, CSR_MSTATUS));
+      }
+
+      if (regno < 0x1000 && config.support_abstract_csr_access) {
+        if (!is_fpu_reg(regno)) {
+          write32(debug_abstract, i++, csrw(S0, CSR_DSCRATCH0));
+        }
 
         if (write) {
           switch (size) {
@@ -544,7 +636,9 @@ bool debug_module_t::perform_abstract_command()
               return true;
           }
         }
-        write32(debug_abstract, i++, csrr(S0, CSR_DSCRATCH));
+        if (!is_fpu_reg(regno)) {
+          write32(debug_abstract, i++, csrr(S0, CSR_DSCRATCH0));
+        }
 
       } else if (regno >= 0x1000 && regno < 0x1020) {
         unsigned regnum = regno - 0x1000;
@@ -568,9 +662,6 @@ bool debug_module_t::perform_abstract_command()
         }
 
       } else if (regno >= 0x1020 && regno < 0x1040) {
-        // Don't force the debugger to use progbuf if it exists, so the
-        // debugger has to make the decision not to use abstract commands to
-        // access 64-bit FPRs on 32-bit targets.
         unsigned fprnum = regno - 0x1020;
 
         if (write) {
@@ -618,6 +709,14 @@ bool debug_module_t::perform_abstract_command()
         abstractcs.cmderr = CMDERR_NOTSUP;
         return true;
       }
+
+      if (is_fpu_reg(regno)) {
+        // restore mstatus
+        write32(debug_abstract, i++, csrr(S0, CSR_DSCRATCH1));
+        write32(debug_abstract, i++, csrw(S0, CSR_MSTATUS));
+        // restore s0
+        write32(debug_abstract, i++, csrr(S0, CSR_DSCRATCH0));
+      }
     }
 
     if (get_field(command, AC_ACCESS_REGISTER_POSTEXEC)) {
@@ -629,6 +728,8 @@ bool debug_module_t::perform_abstract_command()
     }
 
     debug_rom_flags[dmcontrol.hartsel] |= 1 << DEBUG_ROM_FLAG_GO;
+    rti_remaining = config.abstract_rti;
+    abstract_command_completed = false;
 
     abstractcs.busy = true;
   } else {
@@ -659,7 +760,7 @@ bool debug_module_t::dmi_write(unsigned address, uint32_t value)
     }
     return true;
 
-  } else if (address >= DMI_PROGBUF0 && address < DMI_PROGBUF0 + progbufsize) {
+  } else if (address >= DMI_PROGBUF0 && address < DMI_PROGBUF0 + config.progbufsize) {
     unsigned i = address - DMI_PROGBUF0;
 
     if (!abstractcs.busy)
@@ -677,35 +778,47 @@ bool debug_module_t::dmi_write(unsigned address, uint32_t value)
           if (!dmcontrol.dmactive && get_field(value, DMI_DMCONTROL_DMACTIVE))
             reset();
           dmcontrol.dmactive = get_field(value, DMI_DMCONTROL_DMACTIVE);
-          if (!dmstatus.authenticated)
+          if (!dmstatus.authenticated || !dmcontrol.dmactive)
             return true;
-          if (dmcontrol.dmactive) {
-            dmcontrol.haltreq = get_field(value, DMI_DMCONTROL_HALTREQ);
-            dmcontrol.resumereq = get_field(value, DMI_DMCONTROL_RESUMEREQ);
-            dmcontrol.hartreset = get_field(value, DMI_DMCONTROL_HARTRESET);
-            dmcontrol.ndmreset = get_field(value, DMI_DMCONTROL_NDMRESET);
-            dmcontrol.hartsel = get_field(value, DMI_DMCONTROL_HARTSELHI) <<
-              DMI_DMCONTROL_HARTSELLO_LENGTH;
-            dmcontrol.hartsel |= get_field(value, DMI_DMCONTROL_HARTSELLO);
-            dmcontrol.hartsel &= (1L<<hartsellen) - 1;
-            if (get_field(value, DMI_DMCONTROL_ACKHAVERESET)) {
-              havereset[dmcontrol.hartsel] = false;
-            }
-          }
-          processor_t *proc = current_proc();
-          if (proc) {
-            proc->halt_request = dmcontrol.haltreq;
-            if (dmcontrol.resumereq) {
-              debug_rom_flags[dmcontrol.hartsel] |= (1 << DEBUG_ROM_FLAG_RESUME);
-              resumeack[dmcontrol.hartsel] = false;
+
+          dmcontrol.haltreq = get_field(value, DMI_DMCONTROL_HALTREQ);
+          dmcontrol.resumereq = get_field(value, DMI_DMCONTROL_RESUMEREQ);
+          dmcontrol.hartreset = get_field(value, DMI_DMCONTROL_HARTRESET);
+          dmcontrol.ndmreset = get_field(value, DMI_DMCONTROL_NDMRESET);
+          if (config.support_hasel)
+            dmcontrol.hasel = get_field(value, DMI_DMCONTROL_HASEL);
+          else
+            dmcontrol.hasel = 0;
+          dmcontrol.hartsel = get_field(value, DMI_DMCONTROL_HARTSELHI) <<
+            DMI_DMCONTROL_HARTSELLO_LENGTH;
+          dmcontrol.hartsel |= get_field(value, DMI_DMCONTROL_HARTSELLO);
+          dmcontrol.hartsel &= (1L<<hartsellen) - 1;
+          for (unsigned i = 0; i < nprocs; i++) {
+            if (hart_selected(i)) {
+              if (get_field(value, DMI_DMCONTROL_ACKHAVERESET)) {
+                hart_state[i].havereset = false;
+              }
+              processor_t *proc = processor(i);
+              if (proc) {
+                proc->halt_request = dmcontrol.haltreq ? proc->HR_REGULAR : proc->HR_NONE;
+                if (dmcontrol.haltreq) {
+                  D(fprintf(stderr, "halt hart %d\n", i));
+                }
+                if (dmcontrol.resumereq) {
+                  D(fprintf(stderr, "resume hart %d\n", i));
+                  debug_rom_flags[i] |= (1 << DEBUG_ROM_FLAG_RESUME);
+                  hart_state[i].resumeack = false;
+                }
+                if (dmcontrol.hartreset) {
+                  proc->reset();
+                }
+              }
             }
-	    if (dmcontrol.hartreset) {
-	      proc->reset();
-	    }
           }
+
           if (dmcontrol.ndmreset) {
             for (size_t i = 0; i < sim->nprocs(); i++) {
-              proc = sim->get_core(i);
+              processor_t *proc = sim->get_core(i);
               proc->reset();
             }
           }
@@ -716,6 +829,22 @@ bool debug_module_t::dmi_write(unsigned address, uint32_t value)
         command = value;
         return perform_abstract_command();
 
+      case DMI_HAWINDOWSEL:
+        hawindowsel = value & ((1U<<field_width(nprocs))-1);
+        return true;
+
+      case DMI_HAWINDOW:
+        {
+          unsigned base = hawindowsel * 32;
+          for (unsigned i = 0; i < 32; i++) {
+            unsigned n = base + i;
+            if (n < nprocs) {
+              hart_array_mask[n] = (value >> i) & 1;
+            }
+          }
+        }
+        return true;
+
       case DMI_ABSTRACTCS:
         abstractcs.cmderr = (cmderr_t) (((uint32_t) (abstractcs.cmderr)) & (~(uint32_t)(get_field(value, DMI_ABSTRACTCS_CMDERR))));
         return true;
@@ -737,6 +866,7 @@ bool debug_module_t::dmi_write(unsigned address, uint32_t value)
         sbaddress[0] = value;
         if (sbcs.error == 0 && sbcs.readonaddr) {
           sb_read();
+          sb_autoincrement();
         }
         return true;
       case DMI_SBADDRESS1:
@@ -752,7 +882,7 @@ bool debug_module_t::dmi_write(unsigned address, uint32_t value)
         sbdata[0] = value;
         if (sbcs.error == 0) {
           sb_write();
-          if (sbcs.autoincrement && sbcs.error == 0) {
+          if (sbcs.error == 0) {
             sb_autoincrement();
           }
         }
@@ -769,7 +899,7 @@ bool debug_module_t::dmi_write(unsigned address, uint32_t value)
       case DMI_AUTHDATA:
         D(fprintf(stderr, "debug authentication: got 0x%x; 0x%x unlocks\n", value,
             challenge + secret));
-        if (require_authentication) {
+        if (config.require_authentication) {
           if (value == challenge + secret) {
             dmstatus.authenticated = true;
           } else {
@@ -778,6 +908,12 @@ bool debug_module_t::dmi_write(unsigned address, uint32_t value)
           }
         }
         return true;
+      case DMI_DMCS2:
+        if (config.support_haltgroups && get_field(value, DMI_DMCS2_HGWRITE)) {
+          hart_state[dmcontrol.hartsel].haltgroup = get_field(value,
+              DMI_DMCS2_HALTGROUP);
+        }
+        return true;
     }
   }
   return false;
@@ -785,6 +921,7 @@ bool debug_module_t::dmi_write(unsigned address, uint32_t value)
 
 void debug_module_t::proc_reset(unsigned id)
 {
-  havereset[id] = true;
-  halted[id] = false;
+  hart_state[id].havereset = true;   // reported in dmstatus until ackhavereset is written
+  hart_state[id].halted = false;     // the hart is no longer considered halted after a reset
+  hart_state[id].haltgroup = 0;      // 0 means the hart belongs to no halt group
 }
diff --git a/riscv/debug_module.h b/riscv/debug_module.h
index 5b43ed628c..2bcdba4228 100644
--- a/riscv/debug_module.h
+++ b/riscv/debug_module.h
@@ -8,9 +8,23 @@
 
 class sim_t;
 
+typedef struct {
+    // Size of program_buffer in 32-bit words, as exposed to the rest of the
+    // world.
+    unsigned progbufsize;
+    unsigned max_bus_master_bits;       // widest system bus access in bits; 0 leaves sbcs zeroed
+    bool require_authentication;        // dmstatus.authenticated starts out false when set
+    unsigned abstract_rti;              // extra Run-Test/Idle cycles per abstract command
+    bool support_hasel;                 // dmcontrol.hasel is forced to 0 unless set
+    bool support_abstract_csr_access;   // gates abstract access to regno < 0x1000
+    bool support_haltgroups;            // dmcs2 haltgroup writes are ignored unless set
+    bool support_impebreak;             // append an ebreak word after the program buffer
+} debug_module_config_t;
+
 typedef struct {
   bool haltreq;
   bool resumereq;
+  bool hasel;
   unsigned hartsel;
   bool hartreset;
   bool dmactive;
@@ -73,6 +87,13 @@ typedef struct {
   bool access8;
 } sbcs_t;
 
+typedef struct {
+  bool halted;        // set on a DEBUG_ROM_HALTED store, cleared on resume or reset
+  bool resumeack;     // set on a DEBUG_ROM_RESUMING store, cleared by a resumereq
+  bool havereset;     // set by proc_reset(), cleared by dmcontrol.ackhavereset
+  uint8_t haltgroup;  // halt group number written via dmcs2; 0 means no group
+} hart_debug_state_t;
+
 class debug_module_t : public abstract_device_t
 {
   public:
@@ -81,9 +102,11 @@ class debug_module_t : public abstract_device_t
      * follows:
      * 1. Read a 32-bit value from authdata:
      * 2. Write the value that was read back, plus one, to authdata.
+     *
+     * abstract_rti is extra run-test/idle cycles that each abstract command
+     * takes to execute. Useful for testing OpenOCD.
      */
-    debug_module_t(sim_t *sim, unsigned progbufsize, unsigned max_bus_master_bits,
-        bool require_authentication);
+    debug_module_t(sim_t *sim, const debug_module_config_t &config);
     ~debug_module_t();
 
     void add_device(bus_t *bus);
@@ -97,23 +120,23 @@ class debug_module_t : public abstract_device_t
     bool dmi_read(unsigned address, uint32_t *value);
     bool dmi_write(unsigned address, uint32_t value);
 
+    // Called for every cycle the JTAG TAP spends in Run-Test/Idle.
+    void run_test_idle();
+
     // Called when one of the attached harts was reset.
     void proc_reset(unsigned id);
 
   private:
     static const unsigned datasize = 2;
-    // Size of program_buffer in 32-bit words, as exposed to the rest of the
-    // world.
-    unsigned progbufsize;
+    unsigned nprocs;
+    debug_module_config_t config;
     // Actual size of the program buffer, which is 1 word bigger than we let on
     // to implement the implicit ebreak at the end.
     unsigned program_buffer_bytes;
-    unsigned max_bus_master_bits;
-    bool require_authentication;
     static const unsigned debug_data_start = 0x380;
     unsigned debug_progbuf_start;
 
-    static const unsigned debug_abstract_size = 5;
+    static const unsigned debug_abstract_size = 12;
     unsigned debug_abstract_start;
     // R/W this through custom registers, to allow debuggers to test that
     // functionality.
@@ -121,7 +144,7 @@ class debug_module_t : public abstract_device_t
 
     // We only support 1024 harts currently. More requires at least resizing
     // the arrays below, and their corresponding special memory regions.
-    static const unsigned hartsellen = 10;
+    unsigned hartsellen = 10;
 
     sim_t *sim;
 
@@ -130,9 +153,7 @@ class debug_module_t : public abstract_device_t
     uint8_t *program_buffer;
     uint8_t dmdata[datasize * 4];
 
-    bool halted[1024];
-    bool resumeack[1024];
-    bool havereset[1024];
+    std::vector<hart_debug_state_t> hart_state;
     uint8_t debug_rom_flags[1024];
 
     void write32(uint8_t *rom, unsigned int index, uint32_t value);
@@ -148,6 +169,8 @@ class debug_module_t : public abstract_device_t
     abstractcs_t abstractcs;
     abstractauto_t abstractauto;
     uint32_t command;
+    uint16_t hawindowsel = 0;           // 32-hart window used by DMI_HAWINDOW; zero until written
+    std::vector<bool> hart_array_mask;  // per-hart flag; selects extra harts while hasel is set
 
     sbcs_t sbcs;
     uint32_t sbaddress[4];
@@ -156,9 +179,13 @@ class debug_module_t : public abstract_device_t
     uint32_t challenge;
     const uint32_t secret = 1;
 
-    processor_t *current_proc() const;
+    processor_t *processor(unsigned hartid) const;
+    bool hart_selected(unsigned hartid) const;
     void reset();
     bool perform_abstract_command();
+
+    bool abstract_command_completed;
+    unsigned rti_remaining;
 };
 
 #endif
diff --git a/riscv/decode.h b/riscv/decode.h
index f9e3b6f649..3bb74996b4 100644
--- a/riscv/decode.h
+++ b/riscv/decode.h
@@ -7,10 +7,7 @@
 # error spike requires a two''s-complement c++ implementation
 #endif
 
-#ifdef WORDS_BIGENDIAN
-# error spike requires a little-endian host
-#endif
-
+#include <algorithm>
 #include <cstdint>
 #include <string.h>
 #include <strings.h>
@@ -24,13 +21,25 @@
 typedef int64_t sreg_t;
 typedef uint64_t reg_t;
 
+#ifdef __SIZEOF_INT128__
+typedef __int128 int128_t;
+typedef unsigned __int128 uint128_t;
+#endif
+
 const int NXPR = 32;
 const int NFPR = 32;
+const int NVPR = 32;
 const int NCSR = 4096;
 
 #define X_RA 1
 #define X_SP 2
 
+#define VCSR_VXRM_SHIFT 1
+#define VCSR_VXRM  (0x3 << VCSR_VXRM_SHIFT)
+
+#define VCSR_VXSAT_SHIFT 0
+#define VCSR_VXSAT  (0x1 << VCSR_VXSAT_SHIFT)
+
 #define FP_RD_NE  0
 #define FP_RD_0   1
 #define FP_RD_DN  2
@@ -68,7 +77,7 @@ class insn_t
 public:
   insn_t() = default;
   insn_t(insn_bits_t bits) : b(bits) {}
-  insn_bits_t bits() { return b; }
+  insn_bits_t bits() { return b & ~((UINT64_MAX) << (length() * 8)); }
   int length() { return insn_length(b); }
   int64_t i_imm() { return int64_t(b) >> 20; }
   int64_t shamt() { return x(20, 6); }
@@ -101,6 +110,36 @@ class insn_t
   uint64_t rvc_rs2() { return x(2, 5); }
   uint64_t rvc_rs1s() { return 8 + x(7, 3); }
   uint64_t rvc_rs2s() { return 8 + x(2, 3); }
+
+  uint64_t v_vm() { return x(25, 1); }
+  uint64_t v_wd() { return x(26, 1); }
+  uint64_t v_nf() { return x(29, 3); }
+  uint64_t v_simm5() { return xs(15, 5); }
+  uint64_t v_zimm5() { return x(15, 5); }
+  uint64_t v_zimm11() { return x(20, 11); }
+  uint64_t v_lmul() { return x(20, 2); }
+  uint64_t v_frac_lmul() { return x(22, 1); }
+  uint64_t v_sew() { return 1 << (x(23, 3) + 3); }
+  uint64_t v_width() { return x(12, 3); }
+  uint64_t v_mop() { return x(26, 2); }
+  uint64_t v_lumop() { return x(20, 5); }
+  uint64_t v_sumop() { return x(20, 5); }
+  uint64_t v_vta() { return x(26, 1); }
+  uint64_t v_vma() { return x(27, 1); }
+  uint64_t v_mew() { return x(28, 1); }
+
+  // Xpulpimg
+  uint64_t p_uimmL() { return x(20, 12); }  // [31:20] same bits as I-type imm12
+  uint64_t p_uimmS() { return x(15, 5); }   // [19:15] same bits as RS1
+  uint64_t p_loop() { return x(7, 1); }     // [7] called 'L' in the docs
+  uint64_t p_Luimm5() { return x(25, 5); }  // [29:25]
+  uint64_t p_zimm5() { return x(20, 5); }   // [24:20]
+  int64_t p_simm5() { return xs(20, 5); }   // [24:20]
+  uint64_t p_rs3() { return x(7, 5); }      // [11:07] alias for RD
+  uint64_t p_zimm6() { return x(25,1) + (x(20, 5) << 1); }
+  int64_t p_simm6() { return x(25,1) + (xs(20, 5) << 1); }
+
+
 private:
   insn_bits_t b;
   uint64_t x(int lo, int len) { return (b >> lo) & ((insn_bits_t(1) << len)-1); }
@@ -121,6 +160,14 @@ class regfile_t
   {
     return data[i];
   }
+  regfile_t()
+  {
+    reset();
+  }
+  void reset()
+  {
+    memset(data, 0, sizeof(data));
+  }
 private:
   T data[N];
 };
@@ -128,26 +175,38 @@ class regfile_t
 // helpful macros, etc
 #define MMU (*p->get_mmu())
 #define STATE (*p->get_state())
+#define P (*p)
+#define FLEN (p->get_flen())
 #define READ_REG(reg) STATE.XPR[reg]
 #define READ_FREG(reg) STATE.FPR[reg]
+#define RD READ_REG(insn.rd())
 #define RS1 READ_REG(insn.rs1())
 #define RS2 READ_REG(insn.rs2())
+#define RS3 READ_REG(insn.rs3())
 #define WRITE_RD(value) WRITE_REG(insn.rd(), value)
 
 #ifndef RISCV_ENABLE_COMMITLOG
 # define WRITE_REG(reg, value) STATE.XPR.write(reg, value)
 # define WRITE_FREG(reg, value) DO_WRITE_FREG(reg, freg(value))
+# define WRITE_VSTATUS
 #else
+   /* 0 : int
+    * 1 : floating
+    * 2 : vector reg
+    * 3 : vector hint
+    * 4 : csr
+    */
 # define WRITE_REG(reg, value) ({ \
     reg_t wdata = (value); /* value may have side effects */ \
-    STATE.log_reg_write = (commit_log_reg_t){(reg) << 1, {wdata, 0}}; \
+    STATE.log_reg_write[(reg) << 4] = {wdata, 0}; \
     STATE.XPR.write(reg, wdata); \
   })
 # define WRITE_FREG(reg, value) ({ \
     freg_t wdata = freg(value); /* value may have side effects */ \
-    STATE.log_reg_write = (commit_log_reg_t){((reg) << 1) | 1, wdata}; \
+    STATE.log_reg_write[((reg) << 4) | 1] = wdata; \
     DO_WRITE_FREG(reg, wdata); \
   })
+# define WRITE_VSTATUS STATE.log_reg_write[3] = {0, 0};
 #endif
 
 // RVC macros
@@ -168,34 +227,85 @@ class regfile_t
 #define FRS3 READ_FREG(insn.rs3())
 #define dirty_fp_state (STATE.mstatus |= MSTATUS_FS | (xlen == 64 ? MSTATUS64_SD : MSTATUS32_SD))
 #define dirty_ext_state (STATE.mstatus |= MSTATUS_XS | (xlen == 64 ? MSTATUS64_SD : MSTATUS32_SD))
+#define dirty_vs_state (STATE.mstatus |= MSTATUS_VS | (xlen == 64 ? MSTATUS64_SD : MSTATUS32_SD))
 #define DO_WRITE_FREG(reg, value) (STATE.FPR.write(reg, value), dirty_fp_state)
 #define WRITE_FRD(value) WRITE_FREG(insn.rd(), value)
- 
+
 #define SHAMT (insn.i_imm() & 0x3F)
 #define BRANCH_TARGET (pc + insn.sb_imm())
 #define JUMP_TARGET (pc + insn.uj_imm())
 #define RM ({ int rm = insn.rm(); \
               if(rm == 7) rm = STATE.frm; \
-              if(rm > 4) throw trap_illegal_instruction(0); \
+              if(rm > 4) throw trap_illegal_instruction(insn.bits()); \
               rm; })
 
 #define get_field(reg, mask) (((reg) & (decltype(reg))(mask)) / ((mask) & ~((mask) << 1)))
 #define set_field(reg, mask, val) (((reg) & ~(decltype(reg))(mask)) | (((decltype(reg))(val) * ((mask) & ~((mask) << 1))) & (decltype(reg))(mask)))
 
-#define require(x) if (unlikely(!(x))) throw trap_illegal_instruction(0)
+#define require(x) if (unlikely(!(x))) throw trap_illegal_instruction(insn.bits())
 #define require_privilege(p) require(STATE.prv >= (p))
+#define require_novirt() if (unlikely(STATE.v)) throw trap_virtual_instruction(insn.bits())
 #define require_rv64 require(xlen == 64)
 #define require_rv32 require(xlen == 32)
 #define require_extension(s) require(p->supports_extension(s))
 #define require_fp require((STATE.mstatus & MSTATUS_FS) != 0)
 #define require_accelerator require((STATE.mstatus & MSTATUS_XS) != 0)
 
+#define require_vector_vs require((STATE.mstatus & MSTATUS_VS) != 0);
+#define require_vector(alu) \
+  do { \
+    require_vector_vs; \
+    require_extension('V'); \
+    require(!P.VU.vill); \
+    if (alu && !P.VU.vstart_alu) \
+      require(P.VU.vstart == 0); \
+    WRITE_VSTATUS; \
+    dirty_vs_state; \
+  } while (0);
+#define require_vector_novtype(is_log, alu) \
+  do {  \
+    require_vector_vs; \
+    require_extension('V'); \
+    if (alu && !P.VU.vstart_alu) \
+      require(P.VU.vstart == 0); \
+    if (is_log) \
+      WRITE_VSTATUS; \
+    dirty_vs_state; \
+  } while (0);
+#define require_align(val, pos) require(is_aligned(val, pos))
+#define require_noover(astart, asize, bstart, bsize) \
+  require(!is_overlapped(astart, asize, bstart, bsize))
+#define require_noover_widen(astart, asize, bstart, bsize) \
+  require(!is_overlapped_widen(astart, asize, bstart, bsize))
+#define require_vm do { if (insn.v_vm() == 0) require(insn.rd() != 0);} while(0);
+
 #define set_fp_exceptions ({ if (softfloat_exceptionFlags) { \
                                dirty_fp_state; \
                                STATE.fflags |= softfloat_exceptionFlags; \
                              } \
                              softfloat_exceptionFlags = 0; })
 
+// Xpulpimg macros
+#define sext16(x) ((sreg_t)(int16_t)(x))
+#define zext16(x) ((reg_t)(uint16_t)(x))
+
+#define sext8(x)  ((sreg_t)(int8_t)(x))
+#define zext8(x)  ((reg_t)(uint8_t)(x))
+
+#define sextr(x, u, l) ( (sreg_t)( ((sreg_t)(x)) << (63-(u)) >> ((l)+63-(u)) ) )  // sext(x[u:l])
+#define zextr(x, u, l) ((reg_t)( ((x) >> (l)) & ( ((reg_t)2 << ((u)-(l)))-1 ) ))  // zext(x[u:l]); 64-bit mask, valid for field widths 1..64
+
+#define P_RS3 READ_REG(insn.p_rs3()) /* same as RD, just different semantical value */
+#define WRITE_RS1(value) WRITE_REG(insn.rs1(), value)
+
+#define RS1_H(i) ((RS1 >> ((xlen >> 1) * ((i) & 0x1))) & 0xFFFF) /* select rs1 half: i should only be 0 or 1 */
+#define RS1_B(i) ((RS1 >> ((xlen >> 2) * ((i) & 0x3))) & 0xFF) /* select rs1 byte: i should only be from 0 to 3 */
+#define RS2_H(i) ((RS2 >> ((xlen >> 1) * ((i) & 0x1))) & 0xFFFF) /* select rs2 half: i should only be 0 or 1 */
+#define RS2_B(i) ((RS2 >> ((xlen >> 2) * ((i) & 0x3))) & 0xFF) /* select rs2 byte: i should only be from 0 to 3 */
+#define RD_H(i) ((RD >> ((xlen >> 1) * ((i) & 0x1))) & 0xFFFF) /* select rd half: i should only be 0 or 1 */
+#define RD_B(i) ((RD >> ((xlen >> 2) * ((i) & 0x3))) & 0xFF) /* select rd byte: i should only be from 0 to 3 */
+
+
 #define sext32(x) ((sreg_t)(int32_t)(x))
 #define zext32(x) ((reg_t)(uint32_t)(x))
 #define sext_xlen(x) (((sreg_t)(x) << (64-xlen)) >> (64-xlen))
@@ -212,9 +322,12 @@ class regfile_t
        STATE.pc = __npc; \
      } while(0)
 
+class wait_for_interrupt_t {};
+
 #define wfi() \
   do { set_pc_and_serialize(npc); \
        npc = PC_SERIALIZE_WFI; \
+       throw wait_for_interrupt_t(); \
      } while(0)
 
 #define serialize() set_pc_and_serialize(npc)
@@ -226,21 +339,29 @@ class regfile_t
 #define invalid_pc(pc) ((pc) & 1)
 
 /* Convenience wrappers to simplify softfloat code sequences */
+#define isBoxedF16(r) (isBoxedF32(r) && ((uint64_t)((r.v[0] >> 16) + 1) == ((uint64_t)1 << 48)))
+#define unboxF16(r) (isBoxedF16(r) ? (uint16_t)r.v[0] : defaultNaNF16UI)
 #define isBoxedF32(r) (isBoxedF64(r) && ((uint32_t)((r.v[0] >> 32) + 1) == 0))
 #define unboxF32(r) (isBoxedF32(r) ? (uint32_t)r.v[0] : defaultNaNF32UI)
 #define isBoxedF64(r) ((r.v[1] + 1) == 0)
 #define unboxF64(r) (isBoxedF64(r) ? r.v[0] : defaultNaNF64UI)
 typedef float128_t freg_t;
+inline float16_t f16(uint16_t v) { return { v }; }
 inline float32_t f32(uint32_t v) { return { v }; }
 inline float64_t f64(uint64_t v) { return { v }; }
+inline float16_t f16(freg_t r) { return f16(unboxF16(r)); }
 inline float32_t f32(freg_t r) { return f32(unboxF32(r)); }
 inline float64_t f64(freg_t r) { return f64(unboxF64(r)); }
 inline float128_t f128(freg_t r) { return r; }
+inline freg_t freg(float16_t f) { return { ((uint64_t)-1 << 16) | f.v, (uint64_t)-1 }; }
 inline freg_t freg(float32_t f) { return { ((uint64_t)-1 << 32) | f.v, (uint64_t)-1 }; }
 inline freg_t freg(float64_t f) { return { f.v, (uint64_t)-1 }; }
 inline freg_t freg(float128_t f) { return f; }
+#define F16_SIGN ((uint16_t)1 << 15)
 #define F32_SIGN ((uint32_t)1 << 31)
 #define F64_SIGN ((uint64_t)1 << 63)
+#define fsgnj16(a, b, n, x) \
+  f16((f16(a).v & ~F16_SIGN) | ((((x) ? f16(a).v : (n) ? F16_SIGN : 0) ^ f16(b).v) & F16_SIGN))
 #define fsgnj32(a, b, n, x) \
   f32((f32(a).v & ~F32_SIGN) | ((((x) ? f32(a).v : (n) ? F32_SIGN : 0) ^ f32(b).v) & F32_SIGN))
 #define fsgnj64(a, b, n, x) \
@@ -268,14 +389,2034 @@ inline freg_t f128_negate(freg_t a)
 #define validate_csr(which, write) ({ \
   if (!STATE.serialized) return PC_SERIALIZE_BEFORE; \
   STATE.serialized = false; \
-  unsigned csr_priv = get_field((which), 0x300); \
-  unsigned csr_read_only = get_field((which), 0xC00) == 3; \
-  if (((write) && csr_read_only) || STATE.prv < csr_priv) \
-    throw trap_illegal_instruction(0); \
+  /* permissions check occurs in get_csr */ \
   (which); })
 
-// Seems that 0x0 doesn't work.
-#define DEBUG_START             0x100
-#define DEBUG_END                 (0x1000 - 1)
+/* For debug only. This will fail if the native machine's float types are not IEEE */
+inline float to_f(float32_t f){float r; memcpy(&r, &f, sizeof(r)); return r;}
+inline double to_f(float64_t f){double r; memcpy(&r, &f, sizeof(r)); return r;}
+inline long double to_f(float128_t f){long double r; memcpy(&r, &f, sizeof(r)); return r;}
+
+
+// Interpret register as packed SIMD
+union simd_reg {
+    reg_t reg;
+    sreg_t sreg;
+    // halfwords (signed and unsigned)
+    int16_t h[4];
+    uint16_t hu[4];
+    // bytes (signed and unsigned)
+    int8_t b[8];
+    uint8_t bu[8];
+};
+
+
+// Vector macros
+#define e8 8      // 8b elements
+#define e16 16    // 16b elements
+#define e32 32    // 32b elements
+#define e64 64    // 64b elements
+#define e128 128  // 128b elements
+#define e256 256  // 256b elements
+#define e512 512  // 512b elements
+#define e1024 1024  // 1024b elements
+
+#define vsext(x, sew) ( ((sreg_t)(x) << (64-(sew))) >> (64-(sew)) )
+#define vzext(x, sew) ( ((reg_t)(x) << (64-(sew))) >> (64-(sew)) )
+
+#define DEBUG_RVV 0
+
+#if DEBUG_RVV
+#define DEBUG_RVV_FP_VV \
+  printf("vfp(%lu) vd=%f vs1=%f vs2=%f\n", i, to_f(vd), to_f(vs1), to_f(vs2));
+#define DEBUG_RVV_FP_VF \
+  printf("vfp(%lu) vd=%f vs1=%f vs2=%f\n", i, to_f(vd), to_f(rs1), to_f(vs2));
+#define DEBUG_RVV_FMA_VV \
+  printf("vfma(%lu) vd=%f vs1=%f vs2=%f vd_old=%f\n", i, to_f(vd), to_f(vs1), to_f(vs2), to_f(vd_old));
+#define DEBUG_RVV_FMA_VF \
+  printf("vfma(%lu) vd=%f vs1=%f vs2=%f vd_old=%f\n", i, to_f(vd), to_f(rs1), to_f(vs2), to_f(vd_old));
+#else
+#define DEBUG_RVV_FP_VV 0
+#define DEBUG_RVV_FP_VF 0
+#define DEBUG_RVV_FMA_VV 0
+#define DEBUG_RVV_FMA_VF 0
+#endif
+
+//
+// vector: masking skip helper
+//
+#define VI_MASK_VARS \
+  const int midx = i / 64; \
+  const int mpos = i % 64;
+
+#define VI_LOOP_ELEMENT_SKIP(BODY) \
+  VI_MASK_VARS \
+  if (insn.v_vm() == 0) { \
+    BODY; \
+    bool skip = ((P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1) == 0; \
+    if (skip) {\
+        continue; \
+    }\
+  }
+
+#define VI_ELEMENT_SKIP(inx) \
+  if (inx >= vl) { \
+    continue; \
+  } else if (inx < P.VU.vstart) { \
+    continue; \
+  } else { \
+    VI_LOOP_ELEMENT_SKIP(); \
+  }
+
+//
+// vector: operation and register access check helper
+//
+static inline bool is_overlapped(const int astart, int asize,
+                                const int bstart, int bsize)
+{
+  asize = asize == 0 ? 1 : asize;
+  bsize = bsize == 0 ? 1 : bsize;
+
+  const int aend = astart + asize;
+  const int bend = bstart + bsize;
+
+  return std::max(aend, bend) - std::min(astart, bstart) < asize + bsize;
+}
+
+static inline bool is_overlapped_widen(const int astart, int asize,
+                                       const int bstart, int bsize)
+{
+  asize = asize == 0 ? 1 : asize;
+  bsize = bsize == 0 ? 1 : bsize;
+
+  const int aend = astart + asize;
+  const int bend = bstart + bsize;
+
+  if (astart < bstart &&
+      is_overlapped(astart, asize, bstart, bsize) &&
+      !is_overlapped(astart, asize, bstart + bsize, bsize)) {
+      return false;
+  } else  {
+    return std::max(aend, bend) - std::min(astart, bstart) < asize + bsize;
+  }
+}
+
+static inline bool is_aligned(const unsigned val, const unsigned pos)
+{
+  return pos ? (val & (pos - 1)) == 0 : true;
+}
+
+#define VI_NARROW_CHECK_COMMON \
+  require_vector(true);\
+  require(P.VU.vflmul <= 4); \
+  require(P.VU.vsew * 2 <= P.VU.ELEN); \
+  require_align(insn.rs2(), P.VU.vflmul * 2); \
+  require_align(insn.rd(), P.VU.vflmul); \
+  require_vm; \
+
+#define VI_WIDE_CHECK_COMMON \
+  require_vector(true);\
+  require(P.VU.vflmul <= 4); \
+  require(P.VU.vsew * 2 <= P.VU.ELEN); \
+  require_align(insn.rd(), P.VU.vflmul * 2); \
+  require_vm; \
+
+#define VI_CHECK_ST_INDEX(elt_width) \
+  require_vector(false); \
+  float vemul = ((float)elt_width / P.VU.vsew * P.VU.vflmul); \
+  require(vemul >= 0.125 && vemul <= 8); \
+  reg_t emul = vemul < 1 ? 1 : vemul; \
+  reg_t flmul = P.VU.vflmul < 1 ? 1 : P.VU.vflmul; \
+  require_align(insn.rd(), P.VU.vflmul); \
+  require_align(insn.rs2(), vemul); \
+  require((nf * flmul) <= (NVPR / 4) && \
+          (insn.rd() + nf * flmul) <= NVPR); \
+
+#define VI_CHECK_LD_INDEX(elt_width) \
+  VI_CHECK_ST_INDEX(elt_width); \
+  if (elt_width > P.VU.vsew) { \
+    if (insn.rd() != insn.rs2()) \
+      require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
+  } else if (elt_width < P.VU.vsew) { \
+    if (vemul < 1) {\
+      require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
+    } else {\
+      require_noover_widen(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
+    } \
+  } \
+  if (insn.v_nf() > 0) {\
+    require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
+    require_noover(vd, nf, insn.rs2(), 1); \
+  } \
+  require_vm; \
+
+#define VI_CHECK_MSS(is_vs1) \
+  if (insn.rd() != insn.rs2()) \
+    require_noover(insn.rd(), 1, insn.rs2(), P.VU.vflmul); \
+  require_align(insn.rs2(), P.VU.vflmul); \
+  if (is_vs1) {\
+    if (insn.rd() != insn.rs1()) \
+      require_noover(insn.rd(), 1, insn.rs1(), P.VU.vflmul); \
+    require_align(insn.rs1(), P.VU.vflmul); \
+  } \
+
+#define VI_CHECK_SSS(is_vs1) \
+  require_vm; \
+  if (P.VU.vflmul > 1) { \
+    require_align(insn.rd(), P.VU.vflmul); \
+    require_align(insn.rs2(), P.VU.vflmul); \
+    if (is_vs1) { \
+      require_align(insn.rs1(), P.VU.vflmul); \
+    } \
+  }
+
+#define VI_CHECK_STORE(elt_width) \
+  require_vector(false); \
+  reg_t veew = sizeof(elt_width##_t) * 8; \
+  float vemul = ((float)veew / P.VU.vsew * P.VU.vflmul); \
+  reg_t emul = vemul < 1 ? 1 : vemul; \
+  require(vemul >= 0.125 && vemul <= 8); \
+  require_align(insn.rd(), vemul); \
+  require((nf * emul) <= (NVPR / 4) && \
+          (insn.rd() + nf * emul) <= NVPR); \
+
+#define VI_CHECK_LOAD(elt_width) \
+  VI_CHECK_STORE(elt_width); \
+  require_vm; \
+
+#define VI_CHECK_DSS(is_vs1) \
+  VI_WIDE_CHECK_COMMON; \
+  require_align(insn.rs2(), P.VU.vflmul); \
+  if (P.VU.vflmul < 1) {\
+    require_noover(insn.rd(), P.VU.vflmul * 2, insn.rs2(), P.VU.vflmul); \
+  } else {\
+    require_noover_widen(insn.rd(), P.VU.vflmul * 2, insn.rs2(), P.VU.vflmul); \
+  } \
+  if (is_vs1) {\
+    require_align(insn.rs1(), P.VU.vflmul); \
+    if (P.VU.vflmul < 1) {\
+      require_noover(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
+    } else {\
+      require_noover_widen(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
+    } \
+  }
+
+#define VI_CHECK_QSS(is_vs1) \
+  require_vector(true);\
+  require(p->supports_extension(EXT_ZVQMAC)); /* result was previously discarded, so the check never trapped */ \
+  require(P.VU.vflmul <= 2); \
+  require(P.VU.vsew * 4 <= P.VU.ELEN); \
+  require_align(insn.rd(), P.VU.vflmul * 4); \
+  require_align(insn.rs2(), P.VU.vflmul); \
+  require_vm; \
+  if (P.VU.vflmul < 1) {\
+    require_noover(insn.rd(), P.VU.vflmul * 4, insn.rs2(), P.VU.vflmul); \
+  } else {\
+    require_noover_widen(insn.rd(), P.VU.vflmul * 4, insn.rs2(), P.VU.vflmul); \
+  } \
+  if (is_vs1) {\
+     require_align(insn.rs1(), P.VU.vflmul); \
+    if (P.VU.vflmul < 1) {\
+      require_noover(insn.rd(), P.VU.vflmul * 4, insn.rs1(), P.VU.vflmul); \
+    } else {\
+      require_noover_widen(insn.rd(), P.VU.vflmul * 4, insn.rs1(), P.VU.vflmul); \
+    } \
+  }
+
+#define VI_CHECK_DDS(is_rs) \
+  VI_WIDE_CHECK_COMMON; \
+  require_align(insn.rs2(), P.VU.vflmul * 2); \
+  if (is_rs) { \
+     require_align(insn.rs1(), P.VU.vflmul); \
+    if (P.VU.vflmul < 1) {\
+      require_noover(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
+    } else {\
+      require_noover_widen(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
+    } \
+  }
+
+#define VI_CHECK_SDS(is_vs1) \
+  VI_NARROW_CHECK_COMMON; \
+  if (insn.rd() != insn.rs2()) \
+    require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), P.VU.vflmul * 2); \
+  if (is_vs1) \
+    require_align(insn.rs1(), P.VU.vflmul); \
+
+#define VI_CHECK_REDUCTION(is_wide) \
+  require_vector(true);\
+  if (is_wide) {\
+    require(P.VU.vsew * 2 <= P.VU.ELEN); \
+  } \
+  require_align(insn.rs2(), P.VU.vflmul); \
+  require(P.VU.vstart == 0); \
+
+#define VI_CHECK_SLIDE(is_over) \
+  require_align(insn.rs2(), P.VU.vflmul); \
+  require_align(insn.rd(), P.VU.vflmul); \
+  require_vm; \
+  if (is_over) \
+    require(insn.rd() != insn.rs2()); \
+
+
+//
+// vector: loop header and end helper
+//
+#define VI_GENERAL_LOOP_BASE \
+  require(P.VU.vsew >= e8 && P.VU.vsew <= e64); \
+  require_vector(true);\
+  reg_t vl = P.VU.vl; \
+  reg_t sew = P.VU.vsew; \
+  reg_t rd_num = insn.rd(); \
+  reg_t rs1_num = insn.rs1(); \
+  reg_t rs2_num = insn.rs2(); \
+  for (reg_t i=P.VU.vstart; i<vl; ++i){
+
+#define VI_LOOP_BASE \
+    VI_GENERAL_LOOP_BASE \
+    VI_LOOP_ELEMENT_SKIP();
+
+#define VI_LOOP_END \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_LOOP_REDUCTION_END(x) \
+  } \
+  if (vl > 0) { \
+    vd_0_des = vd_0_res; \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_LOOP_CMP_BASE \
+  require(P.VU.vsew >= e8 && P.VU.vsew <= e64); \
+  require_vector(true);\
+  reg_t vl = P.VU.vl; \
+  reg_t sew = P.VU.vsew; \
+  reg_t rd_num = insn.rd(); \
+  reg_t rs1_num = insn.rs1(); \
+  reg_t rs2_num = insn.rs2(); \
+  for (reg_t i=P.VU.vstart; i<vl; ++i){ \
+    VI_LOOP_ELEMENT_SKIP(); \
+    uint64_t mmask = UINT64_C(1) << mpos; \
+    uint64_t &vdi = P.VU.elt<uint64_t>(insn.rd(), midx, true); \
+    uint64_t res = 0;
+
+#define VI_LOOP_CMP_END \
+    vdi = (vdi & ~mmask) | (((res) << mpos) & mmask); \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_LOOP_MASK(op) \
+  require(P.VU.vsew <= e64); \
+  require_vector(true);\
+  reg_t vl = P.VU.vl; \
+  for (reg_t i = P.VU.vstart; i < vl; ++i) { \
+    int midx = i / 64; \
+    int mpos = i % 64; \
+    uint64_t mmask = UINT64_C(1) << mpos; \
+    uint64_t vs2 = P.VU.elt<uint64_t>(insn.rs2(), midx); \
+    uint64_t vs1 = P.VU.elt<uint64_t>(insn.rs1(), midx); \
+    uint64_t &res = P.VU.elt<uint64_t>(insn.rd(), midx, true); \
+    res = (res & ~mmask) | ((op) & (1ULL << mpos)); \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_LOOP_NSHIFT_BASE \
+  VI_GENERAL_LOOP_BASE; \
+  VI_LOOP_ELEMENT_SKIP({\
+    require(!(insn.rd() == 0 && P.VU.vflmul > 1));\
+  });
+
+
+#define INT_ROUNDING(result, xrm, gb) \
+  do { \
+    const uint64_t lsb = UINT64_C(1) << (gb); /* bit 'gb'; 64-bit constant on every host */ \
+    const uint64_t lsb_half = lsb >> 1; /* bit 'gb'-1 (0 when gb == 0) */ \
+    switch (xrm) {\
+      case VRM::RNU:\
+        result += lsb_half; \
+        break;\
+      case VRM::RNE:\
+        if ((result & lsb_half) && ((result & (lsb_half - 1)) || (result & lsb))) \
+          result += lsb; \
+        break;\
+      case VRM::RDN:\
+        break;\
+      case VRM::ROD:\
+        if (result & (lsb - 1)) \
+          result |= lsb; \
+        break;\
+      case VRM::INVALID_RM:\
+        assert(false); /* was assert(true), which can never fire */ \
+    } \
+  } while (0)
+
+//
+// vector: integer and masking operand access helper
+//
+#define VXI_PARAMS(x) \
+  type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
+  type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
+  type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
+  type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
+  type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5();
+
+#define VV_U_PARAMS(x) \
+  type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
+  type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
+  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define VX_U_PARAMS(x) \
+  type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
+  type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
+  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define VI_U_PARAMS(x) \
+  type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
+  type_usew_t<x>::type zimm5 = (type_usew_t<x>::type)insn.v_zimm5(); \
+  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define VV_PARAMS(x) \
+  type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
+  type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
+  type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
+
+#define VX_PARAMS(x) \
+  type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
+  type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
+  type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
+
+#define VI_PARAMS(x) \
+  type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
+  type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
+  type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
+
+#define XV_PARAMS(x) \
+  type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
+  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, RS1);
+
+#define VV_UCMP_PARAMS(x) \
+  type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
+  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define VX_UCMP_PARAMS(x) \
+  type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
+  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define VI_UCMP_PARAMS(x) \
+  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define VV_CMP_PARAMS(x) \
+  type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
+  type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
+
+#define VX_CMP_PARAMS(x) \
+  type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
+  type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
+
+#define VI_CMP_PARAMS(x) \
+  type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
+  type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
+
+#define VI_XI_SLIDEDOWN_PARAMS(x, off) \
+  auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
+  auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i + off);
+
+#define VI_XI_SLIDEUP_PARAMS(x, offset) \
+  auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
+  auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i - offset);
+
+#define VI_NSHIFT_PARAMS(sew1, sew2) \
+  auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
+  auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
+  auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
+  auto zimm5 = (type_usew_t<sew1>::type)insn.v_zimm5();
+
+#define VX_NSHIFT_PARAMS(sew1, sew2) \
+  auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
+  auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
+  auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
+  auto rs1 = (type_sew_t<sew1>::type)RS1;
+
+#define VV_NSHIFT_PARAMS(sew1, sew2) \
+  auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
+  auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
+  auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
+  auto vs1 = P.VU.elt<type_sew_t<sew1>::type>(rs1_num, i);
+
+#define XI_CARRY_PARAMS(x) \
+  auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
+  auto rs1 = (type_sew_t<x>::type)RS1; \
+  auto simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
+  auto &vd = P.VU.elt<uint64_t>(rd_num, midx, true);
+
+#define VV_CARRY_PARAMS(x) \
+  auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
+  auto vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
+  auto &vd = P.VU.elt<uint64_t>(rd_num, midx, true);
+
+#define XI_WITH_CARRY_PARAMS(x) \
+  auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
+  auto rs1 = (type_sew_t<x>::type)RS1; \
+  auto simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
+  auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true);
+
+#define VV_WITH_CARRY_PARAMS(x) \
+  auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
+  auto vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
+  auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true);
+
+//
+// vector: integer and masking operation loop
+//
+
+// comparison result to masking register
+#define VI_VV_LOOP_CMP(BODY) \
+  VI_CHECK_MSS(true); \
+  VI_LOOP_CMP_BASE \
+  if (sew == e8){ \
+    VV_CMP_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VV_CMP_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VV_CMP_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VV_CMP_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_CMP_END
+
+#define VI_VX_LOOP_CMP(BODY) \
+  VI_CHECK_MSS(false); \
+  VI_LOOP_CMP_BASE \
+  if (sew == e8){ \
+    VX_CMP_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VX_CMP_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VX_CMP_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VX_CMP_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_CMP_END
+
+#define VI_VI_LOOP_CMP(BODY) \
+  VI_CHECK_MSS(false); \
+  VI_LOOP_CMP_BASE \
+  if (sew == e8){ \
+    VI_CMP_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VI_CMP_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VI_CMP_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VI_CMP_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_CMP_END
+
+#define VI_VV_ULOOP_CMP(BODY) \
+  VI_CHECK_MSS(true); \
+  VI_LOOP_CMP_BASE \
+  if (sew == e8){ \
+    VV_UCMP_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VV_UCMP_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VV_UCMP_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VV_UCMP_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_CMP_END
+
+#define VI_VX_ULOOP_CMP(BODY) \
+  VI_CHECK_MSS(false); \
+  VI_LOOP_CMP_BASE \
+  if (sew == e8){ \
+    VX_UCMP_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VX_UCMP_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VX_UCMP_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VX_UCMP_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_CMP_END
+
+#define VI_VI_ULOOP_CMP(BODY) \
+  VI_CHECK_MSS(false); \
+  VI_LOOP_CMP_BASE \
+  if (sew == e8){ \
+    VI_UCMP_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VI_UCMP_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VI_UCMP_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VI_UCMP_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_CMP_END
+
+// merge and copy loop
+#define VI_VVXI_MERGE_LOOP(BODY) \
+  VI_GENERAL_LOOP_BASE \
+  if (sew == e8){ \
+    VXI_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VXI_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VXI_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VXI_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_END
+
+// reduction loop - signed
+#define VI_LOOP_REDUCTION_BASE(x) \
+  require(x >= e8 && x <= e64); \
+  reg_t vl = P.VU.vl; \
+  reg_t rd_num = insn.rd(); \
+  reg_t rs1_num = insn.rs1(); \
+  reg_t rs2_num = insn.rs2(); \
+  auto &vd_0_des = P.VU.elt<type_sew_t<x>::type>(rd_num, 0, true); \
+  auto vd_0_res = P.VU.elt<type_sew_t<x>::type>(rs1_num, 0); \
+  for (reg_t i=P.VU.vstart; i<vl; ++i){ \
+    VI_LOOP_ELEMENT_SKIP(); \
+    auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
+
+#define REDUCTION_LOOP(x, BODY) \
+  VI_LOOP_REDUCTION_BASE(x) \
+  BODY; \
+  VI_LOOP_REDUCTION_END(x)
+
+#define VI_VV_LOOP_REDUCTION(BODY) \
+  VI_CHECK_REDUCTION(false); \
+  reg_t sew = P.VU.vsew; \
+  if (sew == e8) { \
+    REDUCTION_LOOP(e8, BODY) \
+  } else if(sew == e16) { \
+    REDUCTION_LOOP(e16, BODY) \
+  } else if(sew == e32) { \
+    REDUCTION_LOOP(e32, BODY) \
+  } else if(sew == e64) { \
+    REDUCTION_LOOP(e64, BODY) \
+  }
+
+// reduction loop - unsigned
+#define VI_ULOOP_REDUCTION_BASE(x) \
+  require(x >= e8 && x <= e64); \
+  reg_t vl = P.VU.vl; \
+  reg_t rd_num = insn.rd(); \
+  reg_t rs1_num = insn.rs1(); \
+  reg_t rs2_num = insn.rs2(); \
+  auto &vd_0_des = P.VU.elt<type_usew_t<x>::type>(rd_num, 0, true); \
+  auto vd_0_res = P.VU.elt<type_usew_t<x>::type>(rs1_num, 0); \
+  for (reg_t i=P.VU.vstart; i<vl; ++i){ \
+    VI_LOOP_ELEMENT_SKIP(); \
+    auto vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define REDUCTION_ULOOP(x, BODY) \
+  VI_ULOOP_REDUCTION_BASE(x) \
+  BODY; \
+  VI_LOOP_REDUCTION_END(x)
+
+#define VI_VV_ULOOP_REDUCTION(BODY) \
+  VI_CHECK_REDUCTION(false); \
+  reg_t sew = P.VU.vsew; \
+  if (sew == e8){ \
+    REDUCTION_ULOOP(e8, BODY) \
+  } else if(sew == e16) { \
+    REDUCTION_ULOOP(e16, BODY) \
+  } else if(sew == e32) { \
+    REDUCTION_ULOOP(e32, BODY) \
+  } else if(sew == e64) { \
+    REDUCTION_ULOOP(e64, BODY) \
+  }
+
+
+// general VXI signed/unsigned loop
+#define VI_VV_ULOOP(BODY) \
+  VI_CHECK_SSS(true) \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VV_U_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VV_U_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VV_U_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VV_U_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_END
+
+#define VI_VV_LOOP(BODY) \
+  VI_CHECK_SSS(true) \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VV_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VV_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VV_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VV_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_END
+
+#define VI_VX_ULOOP(BODY) \
+  VI_CHECK_SSS(false) \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VX_U_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VX_U_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VX_U_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VX_U_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_END
+
+#define VI_VX_LOOP(BODY) \
+  VI_CHECK_SSS(false) \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VX_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VX_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VX_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VX_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_END
+
+#define VI_VI_ULOOP(BODY) \
+  VI_CHECK_SSS(false) \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VI_U_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VI_U_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VI_U_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VI_U_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_END
+
+#define VI_VI_LOOP(BODY) \
+  VI_CHECK_SSS(false) \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VI_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VI_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VI_PARAMS(e32); \
+    BODY; \
+  }else if(sew == e64){ \
+    VI_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_END
+
+// narrow operation loop
+#define VI_VV_LOOP_NARROW(BODY) \
+VI_NARROW_CHECK_COMMON; \
+VI_LOOP_BASE \
+if (sew == e8){ \
+  VI_NARROW_SHIFT(e8, e16) \
+  BODY; \
+}else if(sew == e16){ \
+  VI_NARROW_SHIFT(e16, e32) \
+  BODY; \
+}else if(sew == e32){ \
+  VI_NARROW_SHIFT(e32, e64) \
+  BODY; \
+} \
+VI_LOOP_END
+
+#define VI_NARROW_SHIFT(sew1, sew2) \
+  type_usew_t<sew1>::type &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
+  type_usew_t<sew2>::type vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
+  type_usew_t<sew1>::type zimm5 = (type_usew_t<sew1>::type)insn.v_zimm5(); \
+  type_sew_t<sew2>::type vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
+  type_sew_t<sew1>::type vs1 = P.VU.elt<type_sew_t<sew1>::type>(rs1_num, i); \
+  type_sew_t<sew1>::type rs1 = (type_sew_t<sew1>::type)RS1;
+
+#define VI_VVXI_LOOP_NARROW(BODY, is_vs1) /* Narrowing loop: is_vs1 is forwarded to VI_CHECK_SDS; operands come from VI_NARROW_SHIFT. */ \
+  VI_CHECK_SDS(is_vs1); \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VI_NARROW_SHIFT(e8, e16) \
+    BODY; \
+  } else if (sew == e16) { \
+    VI_NARROW_SHIFT(e16, e32) \
+    BODY; \
+  } else if (sew == e32) { \
+    VI_NARROW_SHIFT(e32, e64) \
+    BODY; \
+  } /* no branch for e64: BODY is not run */ \
+  VI_LOOP_END
+
+#define VI_VI_LOOP_NSHIFT(BODY, is_vs1) /* Narrowing-shift loop: operands come from VI_NSHIFT_PARAMS(sew, 2*sew). */ \
+  VI_CHECK_SDS(is_vs1); \
+  VI_LOOP_NSHIFT_BASE /* helper defined elsewhere; expected to open the loop and provide `sew` */ \
+  if (sew == e8){ \
+    VI_NSHIFT_PARAMS(e8, e16) \
+    BODY; \
+  } else if (sew == e16) { \
+    VI_NSHIFT_PARAMS(e16, e32) \
+    BODY; \
+  } else if (sew == e32) { \
+    VI_NSHIFT_PARAMS(e32, e64) \
+    BODY; \
+  } /* no branch for e64: BODY is not run */ \
+  VI_LOOP_END
+
+#define VI_VX_LOOP_NSHIFT(BODY, is_vs1) /* Narrowing-shift loop: operands come from VX_NSHIFT_PARAMS(sew, 2*sew). */ \
+  VI_CHECK_SDS(is_vs1); \
+  VI_LOOP_NSHIFT_BASE /* helper defined elsewhere; expected to open the loop and provide `sew` */ \
+  if (sew == e8){ \
+    VX_NSHIFT_PARAMS(e8, e16) \
+    BODY; \
+  } else if (sew == e16) { \
+    VX_NSHIFT_PARAMS(e16, e32) \
+    BODY; \
+  } else if (sew == e32) { \
+    VX_NSHIFT_PARAMS(e32, e64) \
+    BODY; \
+  } /* no branch for e64: BODY is not run */ \
+  VI_LOOP_END
+
+#define VI_VV_LOOP_NSHIFT(BODY, is_vs1) /* Narrowing-shift loop: operands come from VV_NSHIFT_PARAMS(sew, 2*sew). */ \
+  VI_CHECK_SDS(is_vs1); \
+  VI_LOOP_NSHIFT_BASE /* helper defined elsewhere; expected to open the loop and provide `sew` */ \
+  if (sew == e8){ \
+    VV_NSHIFT_PARAMS(e8, e16) \
+    BODY; \
+  } else if (sew == e16) { \
+    VV_NSHIFT_PARAMS(e16, e32) \
+    BODY; \
+  } else if (sew == e32) { \
+    VV_NSHIFT_PARAMS(e32, e64) \
+    BODY; \
+  } /* no branch for e64: BODY is not run */ \
+  VI_LOOP_END
+
+// Widening operation loop: source operands are declared at sew; only e8/e16/e32 are dispatched.
+#define VI_VV_LOOP_WIDEN(BODY) /* Runs BODY per element with VV_PARAMS operands; note that no VI_CHECK_* macro is invoked here. */ \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VV_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VV_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VV_PARAMS(e32); \
+    BODY; \
+  } /* no branch for e64: BODY is not run */ \
+  VI_LOOP_END
+
+#define VI_VX_LOOP_WIDEN(BODY) /* Widening loop with VX_PARAMS operands; only e8/e16/e32 are dispatched and no VI_CHECK_* macro is invoked here. */ \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VX_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VX_PARAMS(e16); \
+    BODY; \
+  }else if(sew == e32){ \
+    VX_PARAMS(e32); \
+    BODY; \
+  } /* no branch for e64: BODY is not run */ \
+  VI_LOOP_END
+
+#define VI_WIDE_OP_AND_ASSIGN(var0, var1, var2, op0, op1, sign) /* Writes vd[i] (2*vsew wide) = op1(ext(var0) op0 ext(var1)) + var2. `sign` is pasted onto 8_t/16_t/32_t/64_t, so it must be a type-name prefix. */ \
+  switch(P.VU.vsew) { \
+  case e8: { \
+    sign##16_t vd_w = P.VU.elt<sign##16_t>(rd_num, i); /* current wide destination value; in scope for the macro arguments */ \
+    P.VU.elt<uint16_t>(rd_num, i, true) = \
+      op1((sign##16_t)(sign##8_t)var0 op0 (sign##16_t)(sign##8_t)var1) + var2; \
+    } \
+    break; \
+  case e16: { \
+    sign##32_t vd_w = P.VU.elt<sign##32_t>(rd_num, i); \
+    P.VU.elt<uint32_t>(rd_num, i, true) = \
+      op1((sign##32_t)(sign##16_t)var0 op0 (sign##32_t)(sign##16_t)var1) + var2; \
+    } \
+    break; \
+  default: { /* every vsew other than e8/e16 is handled as 32 -> 64 bit */ \
+    sign##64_t vd_w = P.VU.elt<sign##64_t>(rd_num, i); \
+    P.VU.elt<uint64_t>(rd_num, i, true) = \
+      op1((sign##64_t)(sign##32_t)var0 op0 (sign##64_t)(sign##32_t)var1) + var2; \
+    } \
+    break; \
+  }
+
+#define VI_WIDE_OP_AND_ASSIGN_MIX(var0, var1, var2, op0, op1, sign_d, sign_1, sign_2) /* Same as VI_WIDE_OP_AND_ASSIGN but with separate type prefixes: sign_d for vd_w, sign_1 for var0, sign_2 for var1. */ \
+  switch(P.VU.vsew) { \
+  case e8: { \
+    sign_d##16_t vd_w = P.VU.elt<sign_d##16_t>(rd_num, i); /* current wide destination value; in scope for the macro arguments */ \
+    P.VU.elt<uint16_t>(rd_num, i, true) = \
+      op1((sign_1##16_t)(sign_1##8_t)var0 op0 (sign_2##16_t)(sign_2##8_t)var1) + var2; \
+    } \
+    break; \
+  case e16: { \
+    sign_d##32_t vd_w = P.VU.elt<sign_d##32_t>(rd_num, i); \
+    P.VU.elt<uint32_t>(rd_num, i, true) = \
+      op1((sign_1##32_t)(sign_1##16_t)var0 op0 (sign_2##32_t)(sign_2##16_t)var1) + var2; \
+    } \
+    break; \
+  default: { /* every vsew other than e8/e16 is handled as 32 -> 64 bit */ \
+    sign_d##64_t vd_w = P.VU.elt<sign_d##64_t>(rd_num, i); \
+    P.VU.elt<uint64_t>(rd_num, i, true) = \
+      op1((sign_1##64_t)(sign_1##32_t)var0 op0 (sign_2##64_t)(sign_2##32_t)var1) + var2; \
+    } \
+    break; \
+  }
+
+#define VI_WIDE_WVX_OP(var0, op0, sign) /* vd[i] = vs2[i] op0 ext(var0), where vd and vs2 are 2*vsew wide and var0 is first truncated to vsew then extended. `sign` is a type-name prefix. */ \
+  switch(P.VU.vsew) { \
+  case e8: { \
+    sign##16_t &vd_w = P.VU.elt<sign##16_t>(rd_num, i, true); /* writable wide destination element */ \
+    sign##16_t vs2_w = P.VU.elt<sign##16_t>(rs2_num, i); /* wide source element */ \
+    vd_w = vs2_w op0 (sign##16_t)(sign##8_t)var0; \
+    } \
+    break; \
+  case e16: { \
+    sign##32_t &vd_w = P.VU.elt<sign##32_t>(rd_num, i, true); \
+    sign##32_t vs2_w = P.VU.elt<sign##32_t>(rs2_num, i); \
+    vd_w = vs2_w op0 (sign##32_t)(sign##16_t)var0; \
+    } \
+    break; \
+  default: { /* every vsew other than e8/e16 is handled as 32 -> 64 bit */ \
+    sign##64_t &vd_w = P.VU.elt<sign##64_t>(rd_num, i, true); \
+    sign##64_t vs2_w = P.VU.elt<sign##64_t>(rs2_num, i); \
+    vd_w = vs2_w op0 (sign##64_t)(sign##32_t)var0; \
+    } \
+    break; \
+  }
+
+// Quad-widening operation loop: only sew = e8/e16 are dispatched.
+#define VI_VV_LOOP_QUAD(BODY) /* Runs BODY per element with VV_PARAMS operands after VI_CHECK_QSS(true). */ \
+  VI_CHECK_QSS(true); \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VV_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VV_PARAMS(e16); \
+    BODY; \
+  } /* wider sew values run no BODY */ \
+  VI_LOOP_END
+
+#define VI_VX_LOOP_QUAD(BODY) /* Quad-widening loop with VX_PARAMS operands after VI_CHECK_QSS(false); only e8/e16 are dispatched. */ \
+  VI_CHECK_QSS(false); \
+  VI_LOOP_BASE \
+  if (sew == e8){ \
+    VX_PARAMS(e8); \
+    BODY; \
+  }else if(sew == e16){ \
+    VX_PARAMS(e16); \
+    BODY; \
+  } /* wider sew values run no BODY */ \
+  VI_LOOP_END
+
+#define VI_QUAD_OP_AND_ASSIGN(var0, var1, var2, op0, op1, sign) /* Writes vd[i] (4*vsew wide) = op1(ext(var0) op0 ext(var1)) + var2. `sign` is a type-name prefix pasted onto 8_t/16_t/32_t/64_t. */ \
+  switch(P.VU.vsew) { \
+  case e8: { \
+    sign##32_t vd_w = P.VU.elt<sign##32_t>(rd_num, i); /* current 32-bit destination value; in scope for the macro arguments */ \
+    P.VU.elt<uint32_t>(rd_num, i, true) = \
+      op1((sign##32_t)(sign##8_t)var0 op0 (sign##32_t)(sign##8_t)var1) + var2; \
+    } \
+    break; \
+  default: { /* every vsew other than e8 is handled as 16 -> 64 bit */ \
+    sign##64_t vd_w = P.VU.elt<sign##64_t>(rd_num, i); \
+    P.VU.elt<uint64_t>(rd_num, i, true) = \
+      op1((sign##64_t)(sign##16_t)var0 op0 (sign##64_t)(sign##16_t)var1) + var2; \
+    } \
+    break; \
+  }
+
+#define VI_QUAD_OP_AND_ASSIGN_MIX(var0, var1, var2, op0, op1, sign_d, sign_1, sign_2) /* Same as VI_QUAD_OP_AND_ASSIGN but with separate type prefixes: sign_d for vd_w, sign_1 for var0, sign_2 for var1. */ \
+  switch(P.VU.vsew) { \
+  case e8: { \
+    sign_d##32_t vd_w = P.VU.elt<sign_d##32_t>(rd_num, i); /* current 32-bit destination value; in scope for the macro arguments */ \
+    P.VU.elt<uint32_t>(rd_num, i, true) = \
+      op1((sign_1##32_t)(sign_1##8_t)var0 op0 (sign_2##32_t)(sign_2##8_t)var1) + var2; \
+    } \
+    break; \
+  default: { /* every vsew other than e8 is handled as 16 -> 64 bit */ \
+    sign_d##64_t vd_w = P.VU.elt<sign_d##64_t>(rd_num, i); \
+    P.VU.elt<uint64_t>(rd_num, i, true) = \
+      op1((sign_1##64_t)(sign_1##16_t)var0 op0 (sign_2##64_t)(sign_2##16_t)var1) + var2; \
+    } \
+    break; \
+  }
+
+// Wide reduction loop, signed: sources are sew1 wide, the accumulator and destination are sew2 wide.
+#define VI_LOOP_WIDE_REDUCTION_BASE(sew1, sew2) /* Declares the reduction state and opens the element loop; the loop is left open for the caller to close. */ \
+  reg_t vl = P.VU.vl; \
+  reg_t rd_num = insn.rd(); \
+  reg_t rs1_num = insn.rs1(); \
+  reg_t rs2_num = insn.rs2(); \
+  auto &vd_0_des = P.VU.elt<type_sew_t<sew2>::type>(rd_num, 0, true); /* reference to destination element 0 */ \
+  auto vd_0_res = P.VU.elt<type_sew_t<sew2>::type>(rs1_num, 0); /* accumulator, seeded with a copy of vs1[0] */ \
+  for (reg_t i=P.VU.vstart; i<vl; ++i){ \
+    VI_LOOP_ELEMENT_SKIP(); \
+    auto vs2 = P.VU.elt<type_sew_t<sew1>::type>(rs2_num, i);
+
+#define WIDE_REDUCTION_LOOP(sew1, sew2, BODY) /* One complete signed wide reduction: open the loop, run BODY per element, close via VI_LOOP_REDUCTION_END(sew2). */ \
+  VI_LOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
+  BODY; \
+  VI_LOOP_REDUCTION_END(sew2)
+
+#define VI_VV_LOOP_WIDE_REDUCTION(BODY) /* Signed wide reduction entry point: picks the (sew, 2*sew) pair from P.VU.vsew; vsew = e64 or other values do nothing. */ \
+  VI_CHECK_REDUCTION(true); \
+  reg_t sew = P.VU.vsew; \
+  if (sew == e8){ \
+    WIDE_REDUCTION_LOOP(e8, e16, BODY) \
+  } else if(sew == e16){ \
+    WIDE_REDUCTION_LOOP(e16, e32, BODY) \
+  } else if(sew == e32){ \
+    WIDE_REDUCTION_LOOP(e32, e64, BODY) \
+  }
+
+// Wide reduction loop, unsigned: same structure as the signed variant but built on type_usew_t.
+#define VI_ULOOP_WIDE_REDUCTION_BASE(sew1, sew2) /* Declares the reduction state and opens the element loop; the loop is left open for the caller to close. */ \
+  reg_t vl = P.VU.vl; \
+  reg_t rd_num = insn.rd(); \
+  reg_t rs1_num = insn.rs1(); \
+  reg_t rs2_num = insn.rs2(); \
+  auto &vd_0_des = P.VU.elt<type_usew_t<sew2>::type>(rd_num, 0, true); /* reference to destination element 0 */ \
+  auto vd_0_res = P.VU.elt<type_usew_t<sew2>::type>(rs1_num, 0); /* accumulator, seeded with a copy of vs1[0] */ \
+  for (reg_t i=P.VU.vstart; i<vl; ++i) { \
+    VI_LOOP_ELEMENT_SKIP(); \
+    auto vs2 = P.VU.elt<type_usew_t<sew1>::type>(rs2_num, i);
+
+#define WIDE_REDUCTION_ULOOP(sew1, sew2, BODY) /* One complete unsigned wide reduction: open the loop, run BODY per element, close via VI_LOOP_REDUCTION_END(sew2). */ \
+  VI_ULOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
+  BODY; \
+  VI_LOOP_REDUCTION_END(sew2)
+
+#define VI_VV_ULOOP_WIDE_REDUCTION(BODY) /* Unsigned wide reduction entry point: picks the (sew, 2*sew) pair from P.VU.vsew; vsew = e64 or other values do nothing. */ \
+  VI_CHECK_REDUCTION(true); \
+  reg_t sew = P.VU.vsew; \
+  if (sew == e8){ \
+    WIDE_REDUCTION_ULOOP(e8, e16, BODY) \
+  } else if(sew == e16){ \
+    WIDE_REDUCTION_ULOOP(e16, e32, BODY) \
+  } else if(sew == e32){ \
+    WIDE_REDUCTION_ULOOP(e32, e64, BODY) \
+  }
+
+// Carry/borrow bit loops: BODY runs per element with operands declared by the *_CARRY_PARAMS helpers.
+#define VI_VV_LOOP_CARRY(BODY) /* Vector-vector form: VI_CHECK_MSS(true), then VV_CARRY_PARAMS for the current sew. */ \
+  VI_CHECK_MSS(true); \
+  VI_GENERAL_LOOP_BASE \
+  VI_MASK_VARS \
+    if (sew == e8){ \
+      VV_CARRY_PARAMS(e8) \
+      BODY; \
+    } else if (sew == e16) { \
+      VV_CARRY_PARAMS(e16) \
+      BODY; \
+    } else if (sew == e32) { \
+      VV_CARRY_PARAMS(e32) \
+      BODY; \
+    } else if (sew == e64) { \
+      VV_CARRY_PARAMS(e64) \
+      BODY; \
+    } /* no else: BODY is skipped for any other sew value */ \
+  VI_LOOP_END
+
+#define VI_XI_LOOP_CARRY(BODY) /* Carry/borrow loop, scalar/immediate form: VI_CHECK_MSS(false), then XI_CARRY_PARAMS for the current sew. */ \
+  VI_CHECK_MSS(false); \
+  VI_GENERAL_LOOP_BASE \
+  VI_MASK_VARS \
+    if (sew == e8){ \
+      XI_CARRY_PARAMS(e8) \
+      BODY; \
+    } else if (sew == e16) { \
+      XI_CARRY_PARAMS(e16) \
+      BODY; \
+    } else if (sew == e32) { \
+      XI_CARRY_PARAMS(e32) \
+      BODY; \
+    } else if (sew == e64) { \
+      XI_CARRY_PARAMS(e64) \
+      BODY; \
+    } /* no else: BODY is skipped for any other sew value */ \
+  VI_LOOP_END
+
+#define VI_VV_LOOP_WITH_CARRY(BODY) /* Add/sub-with-carry loop, vector-vector form: operands come from VV_WITH_CARRY_PARAMS. */ \
+  require(insn.rd() != 0); /* the rd field must be non-zero for these instructions */ \
+  VI_CHECK_SSS(true); \
+  VI_GENERAL_LOOP_BASE \
+  VI_MASK_VARS \
+    if (sew == e8){ \
+      VV_WITH_CARRY_PARAMS(e8) \
+      BODY; \
+    } else if (sew == e16) { \
+      VV_WITH_CARRY_PARAMS(e16) \
+      BODY; \
+    } else if (sew == e32) { \
+      VV_WITH_CARRY_PARAMS(e32) \
+      BODY; \
+    } else if (sew == e64) { \
+      VV_WITH_CARRY_PARAMS(e64) \
+      BODY; \
+    } /* no else: BODY is skipped for any other sew value */ \
+  VI_LOOP_END
+
+#define VI_XI_LOOP_WITH_CARRY(BODY) /* Add/sub-with-carry loop, scalar/immediate form: operands come from XI_WITH_CARRY_PARAMS. */ \
+  require(insn.rd() != 0); /* the rd field must be non-zero for these instructions */ \
+  VI_CHECK_SSS(false); \
+  VI_GENERAL_LOOP_BASE \
+  VI_MASK_VARS \
+    if (sew == e8){ \
+      XI_WITH_CARRY_PARAMS(e8) \
+      BODY; \
+    } else if (sew == e16) { \
+      XI_WITH_CARRY_PARAMS(e16) \
+      BODY; \
+    } else if (sew == e32) { \
+      XI_WITH_CARRY_PARAMS(e32) \
+      BODY; \
+    } else if (sew == e64) { \
+      XI_WITH_CARRY_PARAMS(e64) \
+      BODY; \
+    } /* no else: BODY is skipped for any other sew value */ \
+  VI_LOOP_END
+
+// Averaging loops: vd = round((vs2 op opd)) >> 1, evaluated in a type wider than the element.
+#define VI_VVX_LOOP_AVG(opd, op, is_vs1) /* Signed form. `opd` is an expression naming the second operand; both vs1 (via VV_PARAMS, presumably) and rs1 are declared in each case so either can be used. */ \
+VI_CHECK_SSS(is_vs1); \
+VRM xrm = p->VU.get_vround_mode(); /* rounding mode handed to INT_ROUNDING below */ \
+VI_LOOP_BASE \
+  switch(sew) { \
+    case e8: { \
+     VV_PARAMS(e8); \
+     type_sew_t<e8>::type rs1 = RS1; \
+     auto res = (int32_t)vs2 op opd; /* widen first so the intermediate result cannot wrap at element width */ \
+     INT_ROUNDING(res, xrm, 1); \
+     vd = res >> 1; \
+     break; \
+    } \
+    case e16: { \
+     VV_PARAMS(e16); \
+     type_sew_t<e16>::type rs1 = RS1; \
+     auto res = (int32_t)vs2 op opd; \
+     INT_ROUNDING(res, xrm, 1); \
+     vd = res >> 1; \
+     break; \
+    } \
+    case e32: { \
+     VV_PARAMS(e32); \
+     type_sew_t<e32>::type rs1 = RS1; \
+     auto res = (int64_t)vs2 op opd; \
+     INT_ROUNDING(res, xrm, 1); \
+     vd = res >> 1; \
+     break; \
+    } \
+    default: { /* every other sew is handled as e64, using a 128-bit intermediate */ \
+     VV_PARAMS(e64); \
+     type_sew_t<e64>::type rs1 = RS1; \
+     auto res = (int128_t)vs2 op opd; \
+     INT_ROUNDING(res, xrm, 1); \
+     vd = res >> 1; \
+     break; \
+    } \
+  } \
+VI_LOOP_END
+
+#define VI_VVX_ULOOP_AVG(opd, op, is_vs1) /* Unsigned averaging loop: vd = round(vs2 op opd) >> 1 with an unsigned intermediate wider than the element. */ \
+VI_CHECK_SSS(is_vs1); \
+VRM xrm = p->VU.get_vround_mode(); /* rounding mode handed to INT_ROUNDING below */ \
+VI_LOOP_BASE \
+  switch(sew) { \
+    case e8: { \
+     VV_U_PARAMS(e8); \
+     type_usew_t<e8>::type rs1 = RS1; \
+     auto res = (uint16_t)vs2 op opd; /* NOTE(review): usual integer promotion makes `res` an int here, not uint16_t */ \
+     INT_ROUNDING(res, xrm, 1); \
+     vd = res >> 1; \
+     break; \
+    } \
+    case e16: { \
+     VV_U_PARAMS(e16); \
+     type_usew_t<e16>::type rs1 = RS1; \
+     auto res = (uint32_t)vs2 op opd; \
+     INT_ROUNDING(res, xrm, 1); \
+     vd = res >> 1; \
+     break; \
+    } \
+    case e32: { \
+     VV_U_PARAMS(e32); \
+     type_usew_t<e32>::type rs1 = RS1; \
+     auto res = (uint64_t)vs2 op opd; \
+     INT_ROUNDING(res, xrm, 1); \
+     vd = res >> 1; \
+     break; \
+    } \
+    default: { /* every other sew is handled as e64, using a 128-bit intermediate */ \
+     VV_U_PARAMS(e64); \
+     type_usew_t<e64>::type rs1 = RS1; \
+     auto res = (uint128_t)vs2 op opd; \
+     INT_ROUNDING(res, xrm, 1); \
+     vd = res >> 1; \
+     break; \
+    } \
+  } \
+VI_LOOP_END
+
+//
+// vector: load/store helper
+//
+#define VI_STRIP(inx) /* Declares vreg_inx, the element index the load/store helpers use to address the vector register. */ \
+  reg_t vreg_inx = inx;
+
+#define VI_DUPLICATE_VREG(reg_num, idx_sew) /* Snapshots elements 0..vlmax-1 of register group reg_num, read as unsigned idx_sew-bit values, into a local array `index`. */ \
+reg_t index[P.VU.vlmax]; /* NOTE(review): runtime-sized array - a compiler extension in C++, not standard */ \
+for (reg_t i = 0; i < P.VU.vlmax && P.VU.vl != 0; ++i) { /* nothing is copied when vl == 0, leaving index[] uninitialised */ \
+  switch(idx_sew) { \
+    case e8: \
+      index[i] = P.VU.elt<uint8_t>(reg_num, i); \
+      break; \
+    case e16: \
+      index[i] = P.VU.elt<uint16_t>(reg_num, i); \
+      break; \
+    case e32: \
+      index[i] = P.VU.elt<uint32_t>(reg_num, i); \
+      break; \
+    case e64: \
+      index[i] = P.VU.elt<uint64_t>(reg_num, i); \
+      break; \
+  } /* no default: other idx_sew values leave index[i] unwritten */ \
+}
+
+#define VI_LD(stride, offset, elt_width) /* Vector load: for each element i and field fn, loads an elt_width value from baseAddr + stride + offset*sizeof(elt) into register vd + fn*emul. `stride`/`offset` are expressions evaluated inside the loops. */ \
+  const reg_t nf = insn.v_nf() + 1; /* number of fields per element */ \
+  const reg_t vl = P.VU.vl; \
+  const reg_t baseAddr = RS1; \
+  const reg_t vd = insn.rd(); \
+  VI_CHECK_LOAD(elt_width); /* defined elsewhere; `emul` used below is expected to come from it */ \
+  for (reg_t i = 0; i < vl; ++i) { \
+    VI_ELEMENT_SKIP(i); \
+    VI_STRIP(i); \
+    P.VU.vstart = i; /* record the element in progress before touching memory */ \
+    for (reg_t fn = 0; fn < nf; ++fn) { \
+      elt_width##_t val = MMU.load_##elt_width( \
+        baseAddr + (stride) + (offset) * sizeof(elt_width##_t)); \
+      P.VU.elt<elt_width##_t>(vd + fn * emul, vreg_inx, true) = val; \
+    } \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_LD_INDEX(elt_width, is_seg) /* Indexed vector load: elt_width is the width of the index elements in rs2; the data width is chosen from P.VU.vsew. */ \
+  const reg_t nf = insn.v_nf() + 1; /* number of fields per element */ \
+  const reg_t vl = P.VU.vl; \
+  const reg_t baseAddr = RS1; \
+  const reg_t vd = insn.rd(); \
+  if (!is_seg) \
+    require(nf == 1); /* the non-segment form allows a single field only */ \
+  VI_CHECK_LD_INDEX(elt_width); /* defined elsewhere; `flmul` used below is expected to come from it */ \
+  VI_DUPLICATE_VREG(insn.rs2(), elt_width); /* index[] = copy of the offsets, taken before vd is written */ \
+  for (reg_t i = 0; i < vl; ++i) { \
+    VI_ELEMENT_SKIP(i); \
+    VI_STRIP(i); \
+    P.VU.vstart = i; /* record the element in progress before touching memory */ \
+    for (reg_t fn = 0; fn < nf; ++fn) { \
+      switch(P.VU.vsew){ \
+        case e8: \
+          P.VU.elt<uint8_t>(vd + fn * flmul, vreg_inx, true) = \
+            MMU.load_uint8(baseAddr + index[i] + fn * 1); \
+          break; \
+        case e16: \
+          P.VU.elt<uint16_t>(vd + fn * flmul, vreg_inx, true) = \
+            MMU.load_uint16(baseAddr + index[i] + fn * 2); \
+          break; \
+        case e32: \
+          P.VU.elt<uint32_t>(vd + fn * flmul, vreg_inx, true) = \
+            MMU.load_uint32(baseAddr + index[i] + fn * 4); \
+          break; \
+        default: /* every other vsew is handled as 64-bit data */ \
+          P.VU.elt<uint64_t>(vd + fn * flmul, vreg_inx, true) = \
+            MMU.load_uint64(baseAddr + index[i] + fn * 8); \
+          break; \
+      } \
+    } \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_ST(stride, offset, elt_width) /* Vector store: for each element i and field fn, stores the elt_width value from register vs3 + fn*emul to baseAddr + stride + offset*sizeof(elt). `stride`/`offset` are expressions evaluated inside the loops. */ \
+  const reg_t nf = insn.v_nf() + 1; /* number of fields per element */ \
+  const reg_t vl = P.VU.vl; \
+  const reg_t baseAddr = RS1; \
+  const reg_t vs3 = insn.rd(); /* the store-data register number is taken from the rd field */ \
+  VI_CHECK_STORE(elt_width); /* defined elsewhere; `emul` used below is expected to come from it */ \
+  for (reg_t i = 0; i < vl; ++i) { \
+    VI_STRIP(i) \
+    VI_ELEMENT_SKIP(i); \
+    P.VU.vstart = i; /* record the element in progress before touching memory */ \
+    for (reg_t fn = 0; fn < nf; ++fn) { \
+      elt_width##_t val = P.VU.elt<elt_width##_t>(vs3 + fn * emul, vreg_inx); \
+      MMU.store_##elt_width( \
+        baseAddr + (stride) + (offset) * sizeof(elt_width##_t), val); \
+    } \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_ST_INDEX(elt_width, is_seg) /* Indexed vector store: elt_width is the width of the index elements in rs2; the data width is chosen from P.VU.vsew. */ \
+  const reg_t nf = insn.v_nf() + 1; /* number of fields per element */ \
+  const reg_t vl = P.VU.vl; \
+  const reg_t baseAddr = RS1; \
+  const reg_t vs3 = insn.rd(); /* the store-data register number is taken from the rd field */ \
+  if (!is_seg) \
+    require(nf == 1); /* the non-segment form allows a single field only */ \
+  VI_CHECK_ST_INDEX(elt_width); /* defined elsewhere; `flmul` used below is expected to come from it */ \
+  VI_DUPLICATE_VREG(insn.rs2(), elt_width);   \
+  for (reg_t i = 0; i < vl; ++i) { \
+    VI_STRIP(i) \
+    VI_ELEMENT_SKIP(i); \
+    P.VU.vstart = i; /* record the element in progress before touching memory */ \
+    for (reg_t fn = 0; fn < nf; ++fn) { \
+      switch (P.VU.vsew) { \
+      case e8: \
+        MMU.store_uint8(baseAddr + index[i] + fn * 1, \
+          P.VU.elt<uint8_t>(vs3 + fn * flmul, vreg_inx)); \
+        break; \
+      case e16: \
+        MMU.store_uint16(baseAddr + index[i] + fn * 2, \
+          P.VU.elt<uint16_t>(vs3 + fn * flmul, vreg_inx)); \
+        break; \
+      case e32: \
+        MMU.store_uint32(baseAddr + index[i] + fn * 4, \
+          P.VU.elt<uint32_t>(vs3 + fn * flmul, vreg_inx)); \
+        break; \
+      default: /* every other vsew is handled as 64-bit data */ \
+        MMU.store_uint64(baseAddr + index[i] + fn * 8, \
+          P.VU.elt<uint64_t>(vs3 + fn * flmul, vreg_inx)); \
+        break; \
+      } \
+    } \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_LDST_FF(elt_width) /* Fault-only-first load: a trap on element 0 propagates; a trap on any later element is swallowed and vl is cut back to that element index. */ \
+  const reg_t nf = insn.v_nf() + 1; /* number of fields per element */ \
+  const reg_t sew = p->VU.vsew; \
+  const reg_t vl = p->VU.vl; \
+  const reg_t baseAddr = RS1; \
+  const reg_t rd_num = insn.rd(); \
+  VI_CHECK_LOAD(elt_width); /* defined elsewhere; `emul` used below is expected to come from it */ \
+  bool early_stop = false; \
+  for (reg_t i = p->VU.vstart; i < vl; ++i) { \
+    VI_STRIP(i); \
+    VI_ELEMENT_SKIP(i); \
+    \
+    for (reg_t fn = 0; fn < nf; ++fn) { \
+      uint64_t val; \
+      try { \
+        val = MMU.load_##elt_width( \
+          baseAddr + (i * nf + fn) * sizeof(elt_width##_t)); /* fields of one element are contiguous in memory */ \
+      } catch (trap_t& t) { \
+        if (i == 0) \
+          throw; /* Only take exception on zeroth element */ \
+        /* Reduce VL if an exception occurs on a later element */ \
+        early_stop = true; \
+        P.VU.vl = i; \
+        break; \
+      } \
+      p->VU.elt<elt_width##_t>(rd_num + fn * emul, vreg_inx, true) = val; \
+    } \
+    \
+    if (early_stop) { /* leave the element loop too; fields already written for element i stay written */ \
+      break; \
+    } \
+  } \
+  p->VU.vstart = 0;
+
+#define VI_LD_WHOLE(elt_width) /* Whole-register load: fills `len` consecutive registers starting at vd from memory, resuming at P.VU.vstart, which counts elt_width elements across the whole group. */ \
+  require_vector_novtype(true, false); \
+  const reg_t baseAddr = RS1; \
+  const reg_t vd = insn.rd(); \
+  const reg_t len = insn.v_nf() + 1; /* number of registers to load */ \
+  require_align(vd, len); \
+  const reg_t elt_per_reg = P.VU.vlenb / sizeof(elt_width ## _t); \
+  const reg_t size = len * elt_per_reg; /* total element count of the group */ \
+  if (P.VU.vstart < size) { \
+    reg_t i = P.VU.vstart / elt_per_reg; /* first register to touch */ \
+    reg_t off = P.VU.vstart % elt_per_reg; /* first element inside that register */ \
+    if (off) { /* finish the partially loaded register first */ \
+      for (reg_t pos = off; pos < elt_per_reg; ++pos) { \
+        auto val = MMU.load_## elt_width(baseAddr + \
+          P.VU.vstart * sizeof(elt_width ## _t)); \
+        P.VU.elt<elt_width ## _t>(vd + i, pos, true) = val; \
+        P.VU.vstart++; /* advanced per element, so it tracks progress if a later load traps */ \
+      } \
+      ++i; \
+    } \
+    for (; i < len; ++i) { /* remaining registers are loaded in full */ \
+      for (reg_t pos = 0; pos < elt_per_reg; ++pos) { \
+        auto val = MMU.load_## elt_width(baseAddr + \
+          P.VU.vstart * sizeof(elt_width ## _t)); \
+        P.VU.elt<elt_width ## _t>(vd + i, pos, true) = val; \
+        P.VU.vstart++; \
+      } \
+    } \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_ST_WHOLE /* Whole-register store: writes `len` consecutive registers starting at vs3 to memory byte by byte, resuming at P.VU.vstart, which counts bytes across the whole group. */ \
+  require_vector_novtype(true, false); \
+  const reg_t baseAddr = RS1; \
+  const reg_t vs3 = insn.rd(); /* the store-data register number is taken from the rd field */ \
+  const reg_t len = insn.v_nf() + 1; /* number of registers to store */ \
+  require_align(vs3, len); \
+  const reg_t size = len * P.VU.vlenb; /* total byte count of the group */ \
+   \
+  if (P.VU.vstart < size) { \
+    reg_t i = P.VU.vstart / P.VU.vlenb; /* first register to touch */ \
+    reg_t off = P.VU.vstart % P.VU.vlenb; /* first byte inside that register */ \
+    if (off) { /* finish the partially stored register first */ \
+      for (reg_t pos = off; pos < P.VU.vlenb; ++pos) { \
+        auto val = P.VU.elt<uint8_t>(vs3 + i, pos); \
+        MMU.store_uint8(baseAddr + P.VU.vstart, val); \
+        P.VU.vstart++; /* advanced per byte, so it tracks progress if a later store traps */ \
+      } \
+      i++; \
+    } \
+    for (; i < len; ++i) { /* remaining registers are stored in full */ \
+      for (reg_t pos = 0; pos < P.VU.vlenb; ++pos) { \
+        auto val = P.VU.elt<uint8_t>(vs3 + i, pos); \
+        MMU.store_uint8(baseAddr + P.VU.vstart, val); \
+        P.VU.vstart++; \
+      } \
+    } \
+  } \
+  P.VU.vstart = 0;
+
+//
+// vector: amo
+//
+#define VI_AMO(op, type, idx_type) /* Vector AMO: for every active element performs an atomic read-modify-write at baseAddr + index[i]. `op` is the body of a lambda taking `lhs`; `type` is a type-name prefix pasted onto 32_t/64_t; idx_type is the index element width. */ \
+  require_vector(false); \
+  require_align(insn.rd(), P.VU.vflmul); \
+  require(P.VU.vsew <= P.get_xlen() && P.VU.vsew >= 32); /* only 32-bit and (up to xlen) 64-bit elements are accepted */ \
+  /* vemul: register-group multiplier of the index operand = idx_type / vsew * vflmul */ \
+  float vemul = ((float)idx_type / P.VU.vsew * P.VU.vflmul); \
+  require(vemul >= 0.125 && vemul <= 8); \
+  require_align(insn.rs2(), vemul); \
+  if (insn.v_wd()) { /* the result is written back to vd, so vd must not illegally overlap the index group */ \
+    require_vm; \
+    if (idx_type > P.VU.vsew) { \
+      if (insn.rd() != insn.rs2()) \
+        require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
+    } else if (idx_type < P.VU.vsew) { \
+      if (vemul < 1) {\
+        require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
+      } else {\
+        require_noover_widen(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
+      } \
+    } \
+  } \
+  VI_DUPLICATE_VREG(insn.rs2(), idx_type); /* index[] = copy of the offsets, taken before vd is written */ \
+  const reg_t vl = P.VU.vl; \
+  const reg_t baseAddr = RS1; \
+  const reg_t vd = insn.rd(); \
+  for (reg_t i = P.VU.vstart; i < vl; ++i) { \
+    VI_ELEMENT_SKIP(i); \
+    VI_STRIP(i); \
+    switch (P.VU.vsew) { \
+    case e32: {\
+      auto vs3 = P.VU.elt< type ## 32_t>(vd, vreg_inx); /* register operand, visible to `op` through the [&] capture */ \
+      auto val = MMU.amo_uint32(baseAddr + index[i], [&]( type ## 32_t lhs) { op }); \
+      if (insn.v_wd()) \
+        P.VU.elt< type ## 32_t>(vd, vreg_inx, true) = val; \
+      } \
+      break; \
+    case e64: {\
+      auto vs3 = P.VU.elt< type ## 64_t>(vd, vreg_inx); \
+      auto val = MMU.amo_uint64(baseAddr + index[i], [&]( type ## 64_t lhs) { op }); \
+      if (insn.v_wd()) \
+        P.VU.elt< type ## 64_t>(vd, vreg_inx, true) = val; \
+      } \
+      break; \
+    default: \
+      require(0); \
+      break; \
+    } \
+  } \
+  P.VU.vstart = 0;
+
+// vector: sign/unsigned extension
+#define VI_VV_EXT(div, type) /* Extends source elements of width vsew/div to vsew. `type` is a type-name prefix pasted onto 8_t..64_t and thereby selects sign or zero extension. */ \
+  require(insn.rd() != insn.rs2()); \
+  require_vm; \
+  reg_t from = P.VU.vsew / div; /* source element width in bits */ \
+  require(from >= e8 && from <= e64); \
+  require_align(insn.rd(), P.VU.vflmul); \
+  require_align(insn.rs2(), P.VU.vflmul / div); \
+  if ((P.VU.vflmul / div) < 1) { \
+    require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), P.VU.vflmul / div); \
+  } else {\
+    require_noover_widen(insn.rd(), P.VU.vflmul, insn.rs2(), P.VU.vflmul / div); \
+  } \
+  reg_t pat = (((P.VU.vsew >> 3) << 4) | from >> 3); /* high nibble = destination bytes, low nibble = source bytes */ \
+  VI_GENERAL_LOOP_BASE \
+  VI_LOOP_ELEMENT_SKIP(); \
+    switch (pat) { \
+      case 0x21: \
+        P.VU.elt<type##16_t>(rd_num, i, true) = P.VU.elt<type##8_t>(rs2_num, i); \
+        break; \
+      case 0x41: \
+        P.VU.elt<type##32_t>(rd_num, i, true) = P.VU.elt<type##8_t>(rs2_num, i); \
+        break; \
+      case 0x81: \
+        P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##8_t>(rs2_num, i); \
+        break; \
+      case 0x42: \
+        P.VU.elt<type##32_t>(rd_num, i, true) = P.VU.elt<type##16_t>(rs2_num, i); \
+        break; \
+      case 0x82: \
+        P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##16_t>(rs2_num, i); \
+        break; \
+      case 0x84: \
+        P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##32_t>(rs2_num, i); \
+        break; \
+      case 0x88: /* NOTE(review): 0x88 means 64-bit source (div == 1) yet a 32-bit element is read; looks unreachable for div >= 2 - confirm */ \
+        P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##32_t>(rs2_num, i); \
+        break; \
+      default: /* any other width pair silently writes nothing */ \
+        break; \
+    } \
+  VI_LOOP_END
+
+//
+// vector: vfp helper
+//
+#define VI_VFP_COMMON /* Shared prologue of the vector floating-point helpers: legality checks, register numbers, and the softfloat rounding mode. */ \
+  require_fp; \
+  require((P.VU.vsew == e16 && p->supports_extension(EXT_ZFH)) || \
+          (P.VU.vsew == e32 && p->supports_extension('F')) || \
+          (P.VU.vsew == e64 && p->supports_extension('D'))); /* the element width must have a matching scalar FP extension */ \
+  require_vector(true);\
+  require(STATE.frm < 0x5); /* frm values 5 and above are rejected */ \
+  reg_t vl = P.VU.vl; \
+  reg_t rd_num = insn.rd(); \
+  reg_t rs1_num = insn.rs1(); \
+  reg_t rs2_num = insn.rs2(); \
+  softfloat_roundingMode = STATE.frm;
+
+#define VI_VFP_LOOP_BASE /* VI_VFP_COMMON prologue plus the opening of the element loop; the loop is left open for VI_VFP_LOOP_END to close. */ \
+  VI_VFP_COMMON \
+  for (reg_t i=P.VU.vstart; i<vl; ++i){ \
+    VI_LOOP_ELEMENT_SKIP();
+
+#define VI_VFP_LOOP_CMP_BASE /* Loop opening for FP compares: besides the common prologue it declares the mask word/bit of element i and a `res` flag for BODY to set. */ \
+  VI_VFP_COMMON \
+  for (reg_t i = P.VU.vstart; i < vl; ++i) { \
+    VI_LOOP_ELEMENT_SKIP(); /* `mpos` and `midx` used below are not declared here; presumably provided by this helper - confirm */ \
+    uint64_t mmask = UINT64_C(1) << mpos; \
+    uint64_t &vdi = P.VU.elt<uint64_t>(rd_num, midx, true); /* 64-bit destination word holding this element's result bit */ \
+    uint64_t res = 0;
+
+#define VI_VFP_LOOP_REDUCTION_BASE(width) /* Opens an FP reduction loop over float<width>_t elements. Expects rd_num/rs1_num/rs2_num/vl in scope (e.g. from VI_VFP_COMMON); the loop is left open for VI_VFP_LOOP_REDUCTION_END. */ \
+  float##width##_t vd_0 = P.VU.elt<float##width##_t>(rd_num, 0); \
+  float##width##_t vs1_0 = P.VU.elt<float##width##_t>(rs1_num, 0); \
+  vd_0 = vs1_0; /* the accumulator starts from vs1[0]; the value read from vd above is overwritten */ \
+  bool is_active = false; /* becomes true once any element passes VI_LOOP_ELEMENT_SKIP */ \
+  for (reg_t i=P.VU.vstart; i<vl; ++i){ \
+    VI_LOOP_ELEMENT_SKIP(); \
+    float##width##_t vs2 = P.VU.elt<float##width##_t>(rs2_num, i); \
+    is_active = true;
+
+#define VI_VFP_LOOP_WIDE_REDUCTION_BASE /* Common prologue, a float64 accumulator seeded from vs1[0], and the opening of the element loop (left open for the caller to close). */ \
+  VI_VFP_COMMON \
+  float64_t vd_0 = f64(P.VU.elt<float64_t>(rs1_num, 0).v); \
+  for (reg_t i=P.VU.vstart; i<vl; ++i) { \
+    VI_LOOP_ELEMENT_SKIP();
+
+#define VI_VFP_LOOP_END /* Closes the element loop opened by a VI_VFP_*_BASE macro and resets vstart. */ \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_VFP_LOOP_REDUCTION_END(x) /* Closes an FP reduction loop and writes the accumulator vd_0 to element 0 of rd; `x` is the result element width. Expects `vd_0`, `is_active` and `is_propagate` in scope at the expansion site. */ \
+  } \
+  P.VU.vstart = 0; \
+  if (vl > 0) { /* nothing is written when vl == 0 */ \
+    if (is_propagate && !is_active) { /* no element was processed: vd_0 is still the initial value, so NaNs are handled here */ \
+      switch (x) { \
+        case e16: {\
+            auto ret = f16_classify(f16(vd_0.v)); \
+            if (ret & 0x300) { /* presumably the signaling|quiet NaN bits of the classify mask - confirm */ \
+              if (ret & 0x100) { /* presumably the signaling-NaN bit: raise invalid */ \
+                softfloat_exceptionFlags |= softfloat_flag_invalid; \
+                set_fp_exceptions; \
+              } \
+              P.VU.elt<uint16_t>(rd_num, 0, true) = defaultNaNF16UI; \
+            } else { \
+              P.VU.elt<uint16_t>(rd_num, 0, true) = vd_0.v; \
+            } \
+          } \
+          break; \
+        case e32: { \
+            auto ret = f32_classify(f32(vd_0.v)); \
+            if (ret & 0x300) { \
+              if (ret & 0x100) { \
+                softfloat_exceptionFlags |= softfloat_flag_invalid; \
+                set_fp_exceptions; \
+              } \
+              P.VU.elt<uint32_t>(rd_num, 0, true) = defaultNaNF32UI; \
+            } else { \
+              P.VU.elt<uint32_t>(rd_num, 0, true) = vd_0.v; \
+            } \
+          } \
+          break; \
+        case e64: {\
+            auto ret = f64_classify(f64(vd_0.v)); \
+            if (ret & 0x300) { \
+              if (ret & 0x100) { \
+                softfloat_exceptionFlags |= softfloat_flag_invalid; \
+                set_fp_exceptions; \
+              } \
+              P.VU.elt<uint64_t>(rd_num, 0, true) = defaultNaNF64UI; \
+            } else { \
+              P.VU.elt<uint64_t>(rd_num, 0, true) = vd_0.v; \
+            } \
+          } \
+          break; \
+      } /* no default: other widths write nothing on this path */ \
+    } else { \
+      P.VU.elt<type_sew_t<x>::type>(rd_num, 0, true) = vd_0.v; /* normal path: store the raw bits of the accumulator */ \
+    } \
+  }
+
+#define VI_VFP_LOOP_CMP_END /* Merges `res` into the element's bit of the destination mask word, then closes the loop opened by VI_VFP_LOOP_CMP_BASE and resets vstart. */ \
+  switch(P.VU.vsew) { \
+    case e16: \
+    case e32: \
+    case e64: { \
+      vdi = (vdi & ~mmask) | (((res) << mpos) & mmask); /* replace only bit mpos, keep every other bit of the word */ \
+      break; \
+    } \
+    default: \
+      require(0); \
+      break; \
+    }; \
+  } \
+  P.VU.vstart = 0;
+
+#define VI_VFP_VV_LOOP(BODY16, BODY32, BODY64) /* FP vector-vector loop: per element declares vd (writable), vs1 and vs2 at the current vsew and runs the matching BODYnn. */ \
+  VI_CHECK_SSS(true); \
+  VI_VFP_LOOP_BASE \
+  switch(P.VU.vsew) { \
+    case e16: {\
+      float16_t &vd = P.VU.elt<float16_t>(rd_num, i, true); \
+      float16_t vs1 = P.VU.elt<float16_t>(rs1_num, i); \
+      float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
+      BODY16; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    case e32: {\
+      float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
+      float32_t vs1 = P.VU.elt<float32_t>(rs1_num, i); \
+      float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
+      BODY32; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    case e64: {\
+      float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
+      float64_t vs1 = P.VU.elt<float64_t>(rs1_num, i); \
+      float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
+      BODY64; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    default: \
+      require(0); \
+      break; \
+  }; \
+  DEBUG_RVV_FP_VV; /* debug hook defined elsewhere */ \
+  VI_VFP_LOOP_END
+
+#define VI_VFP_V_LOOP(BODY16, BODY32, BODY64) /* FP single-source loop: per element declares vd (writable) and vs2 at the current vsew and runs the matching BODYnn. */ \
+  VI_CHECK_SSS(false); \
+  VI_VFP_LOOP_BASE \
+  switch(P.VU.vsew) { \
+    case e16: {\
+      float16_t &vd = P.VU.elt<float16_t>(rd_num, i, true); \
+      float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
+      BODY16; \
+      break; \
+    }\
+    case e32: {\
+      float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
+      float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
+      BODY32; \
+      break; \
+    }\
+    case e64: {\
+      float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
+      float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
+      BODY64; \
+      break; \
+    }\
+    default: \
+      require(0); \
+      break; \
+  }; \
+  set_fp_exceptions; /* here the flags are folded in once after the switch rather than inside each case */ \
+  VI_VFP_LOOP_END
+
+#define VI_VFP_VV_LOOP_REDUCTION(BODY16, BODY32, BODY64) /* Same-width FP reduction: BODYnn runs per active element with `vd_0` (accumulator) and `vs2` in scope; the result goes to element 0 of rd. */ \
+  VI_CHECK_REDUCTION(false) \
+  VI_VFP_COMMON \
+  switch(P.VU.vsew) { \
+    case e16: {\
+      VI_VFP_LOOP_REDUCTION_BASE(16) \
+        BODY16; \
+        set_fp_exceptions; \
+      VI_VFP_LOOP_REDUCTION_END(e16) \
+      break; \
+    }\
+    case e32: {\
+      VI_VFP_LOOP_REDUCTION_BASE(32) \
+        BODY32; \
+        set_fp_exceptions; \
+      VI_VFP_LOOP_REDUCTION_END(e32) \
+      break; \
+    }\
+    case e64: {\
+      VI_VFP_LOOP_REDUCTION_BASE(64) \
+        BODY64; \
+        set_fp_exceptions; \
+      VI_VFP_LOOP_REDUCTION_END(e64) \
+      break; \
+    }\
+    default: \
+      require(0); \
+      break; \
+  };
+
+#define VI_VFP_VV_LOOP_WIDE_REDUCTION(BODY16, BODY32) /* Widening FP reduction: each vs2 element is converted to the 2*vsew format before BODYnn runs; the accumulator `vd_0` and the result are 2*vsew wide. */ \
+  VI_CHECK_REDUCTION(true) \
+  VI_VFP_COMMON \
+  require((P.VU.vsew == e16 && p->supports_extension('F')) || \
+          (P.VU.vsew == e32 && p->supports_extension('D'))); /* the wide format needs its own extension */ \
+  bool is_active = false; /* read by VI_VFP_LOOP_REDUCTION_END */ \
+  switch(P.VU.vsew) { \
+    case e16: {\
+      float32_t vd_0 = P.VU.elt<float32_t>(rs1_num, 0); /* accumulator seeded from the wide vs1[0] */ \
+      for (reg_t i=P.VU.vstart; i<vl; ++i) { \
+        VI_LOOP_ELEMENT_SKIP(); \
+        is_active = true; \
+        float32_t vs2 = f16_to_f32(P.VU.elt<float16_t>(rs2_num, i)); \
+        BODY16; \
+        set_fp_exceptions; \
+      VI_VFP_LOOP_REDUCTION_END(e32) /* also supplies the closing brace of the for loop above */ \
+      break; \
+    }\
+    case e32: {\
+      float64_t vd_0 = P.VU.elt<float64_t>(rs1_num, 0); \
+      for (reg_t i=P.VU.vstart; i<vl; ++i) { \
+        VI_LOOP_ELEMENT_SKIP(); \
+        is_active = true; \
+        float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
+        BODY32; \
+        set_fp_exceptions; \
+      VI_VFP_LOOP_REDUCTION_END(e64) \
+      break; \
+    }\
+    default: \
+      require(0); \
+      break; \
+  };
+
+#define VI_VFP_VF_LOOP(BODY16, BODY32, BODY64) /* FP vector-scalar loop: per element declares vd (writable), vs2, and the scalar FP register operand `rs1` at the current vsew, then runs the matching BODYnn. */ \
+  VI_CHECK_SSS(false); \
+  VI_VFP_LOOP_BASE \
+  switch(P.VU.vsew) { \
+    case e16: {\
+      float16_t &vd = P.VU.elt<float16_t>(rd_num, i, true); \
+      float16_t rs1 = f16(READ_FREG(rs1_num)); \
+      float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
+      BODY16; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    case e32: {\
+      float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
+      float32_t rs1 = f32(READ_FREG(rs1_num)); \
+      float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
+      BODY32; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    case e64: {\
+      float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
+      float64_t rs1 = f64(READ_FREG(rs1_num)); \
+      float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
+      BODY64; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    default: \
+      require(0); \
+      break; \
+  }; \
+  DEBUG_RVV_FP_VF; /* debug hook defined elsewhere */ \
+  VI_VFP_LOOP_END
+
+#define VI_VFP_LOOP_CMP(BODY16, BODY32, BODY64, is_vs1) /* FP compare loop: BODYnn sets `res`, which VI_VFP_LOOP_CMP_END stores into the element's bit of the destination mask. Both the vector operand vs1 and the scalar operand rs1 are declared so BODY can use either. */ \
+  VI_CHECK_MSS(is_vs1); \
+  VI_VFP_LOOP_CMP_BASE \
+  switch(P.VU.vsew) { \
+    case e16: {\
+      float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
+      float16_t vs1 = P.VU.elt<float16_t>(rs1_num, i); \
+      float16_t rs1 = f16(READ_FREG(rs1_num)); \
+      BODY16; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    case e32: {\
+      float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
+      float32_t vs1 = P.VU.elt<float32_t>(rs1_num, i); \
+      float32_t rs1 = f32(READ_FREG(rs1_num)); \
+      BODY32; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    case e64: {\
+      float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
+      float64_t vs1 = P.VU.elt<float64_t>(rs1_num, i); \
+      float64_t rs1 = f64(READ_FREG(rs1_num)); \
+      BODY64; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    default: \
+      require(0); \
+      break; \
+  }; \
+  VI_VFP_LOOP_CMP_END
+
+#define VI_VFP_VF_LOOP_WIDE(BODY16, BODY32) /* Widening FP vector-scalar loop: vs2 and the scalar rs1 are read at vsew and converted to 2*vsew; vd is 2*vsew wide. */ \
+  VI_CHECK_DSS(false); \
+  VI_VFP_LOOP_BASE \
+  switch(P.VU.vsew) { \
+    case e16: { \
+      float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
+      float32_t vs2 = f16_to_f32(P.VU.elt<float16_t>(rs2_num, i)); \
+      float32_t rs1 = f16_to_f32(f16(READ_FREG(rs1_num))); \
+      BODY16; \
+      set_fp_exceptions; \
+      break; \
+    } \
+    case e32: {\
+      float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
+      float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
+      float64_t rs1 = f32_to_f64(f32(READ_FREG(rs1_num))); \
+      BODY32; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    default: \
+      require(0); \
+      break; \
+  }; \
+  DEBUG_RVV_FP_VF; /* scalar-operand debug hook, matching VI_VFP_VF_LOOP */ \
+  VI_VFP_LOOP_END
+
+
+#define VI_VFP_VV_LOOP_WIDE(BODY16, BODY32) /* Widening FP vector-vector loop: vs1 and vs2 are read at vsew and converted to 2*vsew; vd is 2*vsew wide. */ \
+  VI_CHECK_DSS(true); \
+  VI_VFP_LOOP_BASE \
+  switch(P.VU.vsew) { \
+    case e16: {\
+      float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
+      float32_t vs2 = f16_to_f32(P.VU.elt<float16_t>(rs2_num, i)); \
+      float32_t vs1 = f16_to_f32(P.VU.elt<float16_t>(rs1_num, i)); \
+      BODY16; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    case e32: {\
+      float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
+      float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
+      float64_t vs1 = f32_to_f64(P.VU.elt<float32_t>(rs1_num, i)); \
+      BODY32; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    default: \
+      require(0); \
+      break; \
+  }; \
+  DEBUG_RVV_FP_VV; /* debug hook defined elsewhere */ \
+  VI_VFP_LOOP_END
+
+#define VI_VFP_WF_LOOP_WIDE(BODY16, BODY32) /* Wide-source FP vector-scalar loop: vs2 is already 2*vsew wide; only the scalar rs1 is converted from vsew to 2*vsew. */ \
+  VI_CHECK_DDS(false); \
+  VI_VFP_LOOP_BASE \
+  switch(P.VU.vsew) { \
+    case e16: {\
+      float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
+      float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
+      float32_t rs1 = f16_to_f32(f16(READ_FREG(rs1_num))); \
+      BODY16; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    case e32: {\
+      float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
+      float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
+      float64_t rs1 = f32_to_f64(f32(READ_FREG(rs1_num))); \
+      BODY32; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    default: \
+      require(0); \
+  }; \
+  DEBUG_RVV_FP_VF; /* scalar-operand debug hook, matching VI_VFP_VF_LOOP */ \
+  VI_VFP_LOOP_END
+
+#define VI_VFP_WV_LOOP_WIDE(BODY16, BODY32) /* Wide-source FP vector-vector loop: vs2 is already 2*vsew wide; only vs1 is converted from vsew to 2*vsew. */ \
+  VI_CHECK_DDS(true); \
+  VI_VFP_LOOP_BASE \
+  switch(P.VU.vsew) { \
+    case e16: {\
+      float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
+      float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
+      float32_t vs1 = f16_to_f32(P.VU.elt<float16_t>(rs1_num, i)); \
+      BODY16; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    case e32: {\
+      float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
+      float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
+      float64_t vs1 = f32_to_f64(P.VU.elt<float32_t>(rs1_num, i)); \
+      BODY32; \
+      set_fp_exceptions; \
+      break; \
+    }\
+    default: /* last label of the switch, so the missing break is harmless */ \
+      require(0); \
+  }; \
+  DEBUG_RVV_FP_VV; /* debug hook defined elsewhere */ \
+  VI_VFP_LOOP_END
+
+#define VI_VFP_LOOP_SCALE_BASE /* Prologue and loop opening for width-changing FP conversions; like VI_VFP_COMMON, but the extension check is keyed one step wider (e8->Zfh, e16->F, e32->D). */ \
+  require_fp; \
+  require_vector(true);\
+  require((P.VU.vsew == e8 && p->supports_extension(EXT_ZFH)) || \
+          (P.VU.vsew == e16 && p->supports_extension('F')) || \
+          (P.VU.vsew == e32 && p->supports_extension('D'))); \
+  require(STATE.frm < 0x5); /* frm values 5 and above are rejected */ \
+  reg_t vl = P.VU.vl; \
+  reg_t rd_num = insn.rd(); \
+  reg_t rs1_num = insn.rs1(); \
+  reg_t rs2_num = insn.rs2(); \
+  softfloat_roundingMode = STATE.frm; \
+  for (reg_t i=P.VU.vstart; i<vl; ++i){ /* left open; closed by VI_VFP_LOOP_END */ \
+    VI_LOOP_ELEMENT_SKIP();
+
+#define VI_VFP_CVT_SCALE(BODY8, BODY16, BODY32, /* Width-changing FP convert: per vsew runs CHECKnn once, then BODYnn for every element. */ \
+                         CHECK8, CHECK16, CHECK32, \
+                         is_widen, eew_check) \
+  if (is_widen) { /* widening and narrowing forms have different register-overlap rules */ \
+    VI_CHECK_DSS(false);\
+  } else { \
+    VI_CHECK_SDS(false); \
+  } \
+  require(eew_check); /* caller-supplied legality condition */ \
+  switch(P.VU.vsew) { \
+    case e8: {\
+      CHECK8 \
+      VI_VFP_LOOP_SCALE_BASE \
+        BODY8 \
+        set_fp_exceptions; \
+      VI_VFP_LOOP_END \
+      } \
+      break; \
+    case e16: {\
+      CHECK16 \
+      VI_VFP_LOOP_SCALE_BASE \
+        BODY16 \
+        set_fp_exceptions; \
+      VI_VFP_LOOP_END \
+      } \
+      break; \
+    case e32: {\
+      CHECK32 \
+      VI_VFP_LOOP_SCALE_BASE \
+        BODY32 \
+        set_fp_exceptions; \
+      VI_VFP_LOOP_END \
+      } \
+      break; \
+    default: \
+      require(0); \
+      break; \
+  }
+
+#define DEBUG_START             0x0 /* first address of the debug range - presumably the debug module's window; confirm against users */
+#define DEBUG_END               (0x1000 - 1) /* last address, inclusive: the range spans 0x1000 bytes */
 
 #endif
diff --git a/riscv/devices.cc b/riscv/devices.cc
index bcdd3a19ea..4b768b6092 100644
--- a/riscv/devices.cc
+++ b/riscv/devices.cc
@@ -48,3 +48,43 @@ std::pair<reg_t, abstract_device_t*> bus_t::find_device(reg_t addr)
   it--;
   return std::make_pair(it->first, it->second);
 }
+
+// Registry type: maps a plugin's name to its mmio_plugin_t descriptor.
+using mmio_plugin_map_t = std::map<std::string, mmio_plugin_t>;
+
+// Returns the one shared registry, a function-local static built on first use.
+static mmio_plugin_map_t& mmio_plugin_map()
+{
+  static mmio_plugin_map_t instance;
+  return instance;
+}
+
+void register_mmio_plugin(const char* name_cstr,
+                          const mmio_plugin_t* mmio_plugin)
+{
+  // The descriptor is copied in; each name may be registered only once.
+  const std::string key(name_cstr);
+  if (mmio_plugin_map().emplace(key, *mmio_plugin).second) return;
+  throw std::runtime_error("Plugin \"" + key + "\" already registered!");
+}
+
+// Throws std::out_of_range (from std::map::at) if `name` was never registered.
+mmio_plugin_device_t::mmio_plugin_device_t(const std::string& name,
+                                           const std::string& args)
+  : plugin(mmio_plugin_map().at(name)),
+    user_data((*plugin.alloc)(args.c_str())) {}
+
+mmio_plugin_device_t::~mmio_plugin_device_t()
+{
+  (*plugin.dealloc)(user_data);  // hand back the pointer the alloc callback returned
+}
+
+bool mmio_plugin_device_t::load(reg_t addr, size_t len, uint8_t* bytes)
+{
+  return (*plugin.load)(user_data, addr, len, bytes);  // forwarded unchanged; result is the plugin's
+}
+
+bool mmio_plugin_device_t::store(reg_t addr, size_t len, const uint8_t* bytes)
+{
+  return (*plugin.store)(user_data, addr, len, bytes);  // forwarded unchanged; result is the plugin's
+}
diff --git a/riscv/devices.h b/riscv/devices.h
index 4e4d27ff60..3dd6c66936 100644
--- a/riscv/devices.h
+++ b/riscv/devices.h
@@ -2,10 +2,12 @@
 #define _RISCV_DEVICES_H
 
 #include "decode.h"
+#include "mmio_plugin.h"
 #include <cstdlib>
 #include <string>
 #include <map>
 #include <vector>
+#include <stdexcept>
 
 class processor_t;
 
@@ -62,7 +64,7 @@ class mem_t : public abstract_device_t {
 
 class clint_t : public abstract_device_t {
  public:
-  clint_t(std::vector<processor_t*>&);
+  clint_t(std::vector<processor_t*>&, uint64_t freq_hz, bool real_time);
   bool load(reg_t addr, size_t len, uint8_t* bytes);
   bool store(reg_t addr, size_t len, const uint8_t* bytes);
   size_t size() { return CLINT_SIZE; }
@@ -72,8 +74,25 @@ class clint_t : public abstract_device_t {
   typedef uint64_t mtimecmp_t;
   typedef uint32_t msip_t;
   std::vector<processor_t*>& procs;
+  uint64_t freq_hz;
+  bool real_time;
+  uint64_t real_time_ref_secs;
+  uint64_t real_time_ref_usecs;
   mtime_t mtime;
   std::vector<mtimecmp_t> mtimecmp;
 };
 
+class mmio_plugin_device_t : public abstract_device_t {  // device backed by a plugin
+ public:
+  mmio_plugin_device_t(const std::string& name, const std::string& args);
+  mmio_plugin_device_t(const mmio_plugin_device_t&) = delete;  // owns user_data
+  mmio_plugin_device_t& operator=(const mmio_plugin_device_t&) = delete;
+  virtual ~mmio_plugin_device_t() override;  // calls plugin.dealloc(user_data)
+  virtual bool load(reg_t addr, size_t len, uint8_t* bytes) override;
+  virtual bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
+ private:
+  mmio_plugin_t plugin;  // copy of the registered descriptor; declared first on purpose
+  void* user_data;       // result of plugin.alloc(); initialised after `plugin`
+};
+
 #endif
diff --git a/riscv/disasm.h b/riscv/disasm.h
index 94e007a12d..88d0e9b3ad 100644
--- a/riscv/disasm.h
+++ b/riscv/disasm.h
@@ -10,6 +10,7 @@
 
 extern const char* xpr_name[NXPR];
 extern const char* fpr_name[NFPR];
+extern const char* vr_name[NVPR];
 extern const char* csr_name(int which);
 
 class arg_t
@@ -19,18 +20,31 @@ class arg_t
   virtual ~arg_t() {}
 };
 
+// Marker argument: makes the argument that follows it (and only that one)
+// optional. disasm_insn_t::to_string() omits an optional argument whose
+// string form is ""; the marker itself is never printed.
+struct : public arg_t {
+  std::string to_string(insn_t insn) const { return ""; }
+} opt;
+
 class disasm_insn_t
 {
  public:
-  disasm_insn_t(const char* name, uint32_t match, uint32_t mask,
-                const std::vector<const arg_t*>& args)
-    : match(match), mask(mask), args(args), name(name) {}
+  NOINLINE disasm_insn_t(const char* name, uint32_t match, uint32_t mask,
+                         const std::vector<const arg_t*>& args)  // name is strdup()'d
+    : match(match), mask(mask), args(args), name(strdup(name)) {}
+  ~disasm_insn_t() { free(const_cast<char *>(name)); }  // frees the constructor's strdup() copy
 
   bool operator == (insn_t insn) const
   {
     return (insn.bits() & mask) == match;
   }
 
+  const char* get_name() const
+  {
+    return name;  // points at this object's own copy; freed by the destructor
+  }
+
   std::string to_string(insn_t insn) const
   {
     std::stringstream s;
@@ -40,10 +54,21 @@ class disasm_insn_t
 
     if (args.size())
     {
+      bool next_arg_optional  = false;
       s << std::string(std::max(1, 8 - len), ' ');
-      for (size_t i = 0; i < args.size()-1; i++)
-        s << args[i]->to_string(insn) << ", ";
-      s << args[args.size()-1]->to_string(insn);
+      for (size_t i = 0; i < args.size(); i++) {
+        if (args[i] == &opt) {
+          next_arg_optional = true;
+          continue;
+        }
+        std::string argString = args[i]->to_string(insn);
+        if (next_arg_optional) {
+          next_arg_optional = false;
+          if (argString.empty()) continue;
+        }
+        if (i != 0) s << ", ";
+        s << argString;
+      }
     }
     return s.str();
   }
@@ -63,12 +88,15 @@ class disassembler_t
  public:
   disassembler_t(int xlen);
   ~disassembler_t();
+
   std::string disassemble(insn_t insn) const;
+  const disasm_insn_t* lookup(insn_t insn) const;
+
   void add_insn(disasm_insn_t* insn);
+
  private:
   static const int HASH_SIZE = 256;
   std::vector<const disasm_insn_t*> chain[HASH_SIZE+1];
-  const disasm_insn_t* lookup(insn_t insn) const;
 };
 
 #endif
diff --git a/riscv/dts.cc b/riscv/dts.cc
index b8a5f9d7c1..56b76e6c50 100644
--- a/riscv/dts.cc
+++ b/riscv/dts.cc
@@ -1,6 +1,7 @@
 // See LICENSE for license details.
 
 #include "dts.h"
+#include "libfdt.h"
 #include <iostream>
 #include <sstream>
 #include <signal.h>
@@ -9,6 +10,8 @@
 #include <sys/types.h>
 
 std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
+                     reg_t initrd_start, reg_t initrd_end,
+                     const char* bootargs,
                      std::vector<processor_t*> procs,
                      std::vector<std::pair<reg_t, mem_t*>> mems)
 {
@@ -21,6 +24,25 @@ std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
          "  #size-cells = <2>;\n"
          "  compatible = \"ucbbar,spike-bare-dev\";\n"
          "  model = \"ucbbar,spike-bare\";\n"
+         "  chosen {\n";
+  if (initrd_start < initrd_end) {
+    s << "    linux,initrd-start = <" << (size_t)initrd_start << ">;\n"
+         "    linux,initrd-end = <" << (size_t)initrd_end << ">;\n";
+    if (!bootargs)
+      bootargs = "root=/dev/ram console=hvc0 earlycon=sbi";
+  } else {
+    if (!bootargs)
+      bootargs = "console=hvc0 earlycon=sbi";
+  }
+    s << "    bootargs = \"";
+  for (size_t i = 0; i < strlen(bootargs); i++) {
+    if (bootargs[i] == '"')
+    s << '\\' << bootargs[i];
+    else
+    s << bootargs[i];
+  }
+    s << "\";\n";
+    s << "  };\n"
          "  cpus {\n"
          "    #address-cells = <1>;\n"
          "    #size-cells = <0>;\n"
@@ -33,6 +55,8 @@ std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
          "      compatible = \"riscv\";\n"
          "      riscv,isa = \"" << procs[i]->get_isa_string() << "\";\n"
          "      mmu-type = \"riscv," << (procs[i]->get_max_xlen() <= 32 ? "sv32" : "sv48") << "\";\n"
+         "      riscv,pmpregions = <16>;\n"
+         "      riscv,pmpgranularity = <4>;\n"
          "      clock-frequency = <" << cpu_hz << ">;\n"
          "      CPU" << i << "_intc: interrupt-controller {\n"
          "        #interrupt-cells = <1>;\n"
@@ -47,7 +71,7 @@ std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
          "  memory@" << m.first << " {\n"
          "    device_type = \"memory\";\n"
          "    reg = <0x" << (m.first >> 32) << " 0x" << (m.first & (uint32_t)-1) <<
-                   " 0x" << (m.second->size() >> 32) << " 0x" << (m.second->size() & (uint32_t)-1) << ">;\n"
+                   " 0x" << (m.second->size() >> 16 >> 16) << " 0x" << (m.second->size() & (uint32_t)-1) << ">;\n"
          "  };\n";
   }
   s <<   "  soc {\n"
@@ -80,6 +104,7 @@ std::string dts_compile(const std::string& dts)
   int dts_pipe[2];
   pid_t dts_pid;
 
+  fflush(NULL); // flush stdout/stderr before forking
   if (pipe(dts_pipe) != 0 || (dts_pid = fork()) < 0) {
     std::cerr << "Failed to fork dts child: " << strerror(errno) << std::endl;
     exit(1);
@@ -116,7 +141,7 @@ std::string dts_compile(const std::string& dts)
     close(dts_pipe[1]);
     close(dtb_pipe[0]);
     close(dtb_pipe[1]);
-    execl(DTC, DTC, "-O", "dtb", 0);
+    execlp(DTC, DTC, "-O", "dtb", 0);
     std::cerr << "Failed to run " DTC ": " << strerror(errno) << std::endl;
     exit(1);
   }
@@ -154,3 +179,97 @@ std::string dts_compile(const std::string& dts)
 
   return dtb.str();
 }
+
+
+// Decodes the leading address cells -- and, if `size` is non-NULL, the size
+// cells after them -- of property `field` on `node`; cell counts come from the
+// parent node. Returns 0 on success or a negative error code.
+static int fdt_get_node_addr_size(void *fdt, int node, reg_t *addr,
+                                  unsigned long *size, const char *field)
+{
+  int parent, len, i;
+  int cell_addr, cell_size, cells_needed;
+  const fdt32_t *prop_addr, *prop_size;
+  uint64_t temp = 0;
+
+  parent = fdt_parent_offset(fdt, node);
+  if (parent < 0)
+    return parent;
+
+  cell_addr = fdt_address_cells(fdt, parent);
+  cell_size = fdt_size_cells(fdt, parent);
+  if (cell_addr < 1 || cell_size < 0 || !field)
+    return -ENODEV;
+
+  prop_addr = (const fdt32_t *)fdt_getprop(fdt, node, field, &len);
+  if (!prop_addr)
+    return -ENODEV;
+
+  // Reject properties too short for the cells decoded below (no overread).
+  cells_needed = size ? cell_addr + cell_size : (addr ? cell_addr : 0);
+  if (len < 0 || (size_t)len < cells_needed * sizeof(fdt32_t))
+    return -ENODEV;
+  prop_size = prop_addr + cell_addr;
+
+  if (addr) {
+    for (i = 0; i < cell_addr; i++)
+      temp = (temp << 32) | fdt32_to_cpu(*prop_addr++);
+    *addr = temp;
+  }
+  temp = 0;
+  if (size) {
+    for (i = 0; i < cell_size; i++)
+      temp = (temp << 32) | fdt32_to_cpu(*prop_size++);
+    *size = temp;
+  }
+  return 0;
+}
+
+// Stores the address from the "reg" property of the first node matching
+// `compatible` in *clint_addr. Returns 0 on success; a negative value if no
+// node matches or the property cannot be decoded.
+int fdt_parse_clint(void *fdt, reg_t *clint_addr,
+                    const char *compatible)
+{
+  const int node = fdt_node_offset_by_compatible(fdt, -1, compatible);
+  if (node < 0)
+    return node;
+
+  // Only the address cells are wanted, so no size pointer is passed.
+  // A NULL clint_addr is reported as a failure as well.
+  const int rc = fdt_get_node_addr_size(fdt, node, clint_addr, NULL, "reg");
+  return (rc < 0 || !clint_addr) ? -ENODEV : 0;
+}
+
+// Reads "riscv,pmpregions" from the first node matching `compatible` into
+// *pmp_num. Returns 0 on success or a negative value on failure.
+int fdt_parse_pmp_num(void *fdt, reg_t *pmp_num, const char *compatible)
+{
+  const int node = fdt_node_offset_by_compatible(fdt, -1, compatible);
+  if (node < 0)
+    return node;
+
+  // The value is decoded by the address/size helper, so it is read using as
+  // many cells as the parent node's #address-cells specifies.
+  const int rc = fdt_get_node_addr_size(fdt, node, pmp_num, NULL,
+                                        "riscv,pmpregions");
+
+  return (rc < 0 || !pmp_num) ? -ENODEV : 0;
+}
+
+// Reads "riscv,pmpgranularity" from the first node matching `compatible` into
+// *pmp_align. Returns 0 on success or a negative value on failure.
+int fdt_parse_pmp_alignment(void *fdt, reg_t *pmp_align,
+                            const char *compatible)
+{
+  const int node = fdt_node_offset_by_compatible(fdt, -1, compatible);
+  if (node < 0)
+    return node;
+
+  // The value is decoded by the address/size helper, so it is read using as
+  // many cells as the parent node's #address-cells specifies.
+  const int rc = fdt_get_node_addr_size(fdt, node, pmp_align, NULL,
+                                        "riscv,pmpgranularity");
+
+  return (rc < 0 || !pmp_align) ? -ENODEV : 0;
+}
diff --git a/riscv/dts.h b/riscv/dts.h
index ec0aa6161b..1f01e0f8c8 100644
--- a/riscv/dts.h
+++ b/riscv/dts.h
@@ -7,9 +7,17 @@
 #include <string>
 
 std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
+                     reg_t initrd_start, reg_t initrd_end,
+                     const char* bootargs,
                      std::vector<processor_t*> procs,
                      std::vector<std::pair<reg_t, mem_t*>> mems);
 
 std::string dts_compile(const std::string& dts);
 
+int fdt_parse_clint(void *fdt, reg_t *clint_addr,
+                    const char *compatible);
+int fdt_parse_pmp_num(void *fdt, reg_t *pmp_num,
+                      const char *compatible);
+int fdt_parse_pmp_alignment(void *fdt, reg_t *pmp_align,
+                            const char *compatible);
 #endif
diff --git a/riscv/encoding.h b/riscv/encoding.h
deleted file mode 100644
index c109ce189d..0000000000
--- a/riscv/encoding.h
+++ /dev/null
@@ -1,1471 +0,0 @@
-// See LICENSE for license details.
-
-#ifndef RISCV_CSR_ENCODING_H
-#define RISCV_CSR_ENCODING_H
-
-#define MSTATUS_UIE         0x00000001
-#define MSTATUS_SIE         0x00000002
-#define MSTATUS_HIE         0x00000004
-#define MSTATUS_MIE         0x00000008
-#define MSTATUS_UPIE        0x00000010
-#define MSTATUS_SPIE        0x00000020
-#define MSTATUS_HPIE        0x00000040
-#define MSTATUS_MPIE        0x00000080
-#define MSTATUS_SPP         0x00000100
-#define MSTATUS_HPP         0x00000600
-#define MSTATUS_MPP         0x00001800
-#define MSTATUS_FS          0x00006000
-#define MSTATUS_XS          0x00018000
-#define MSTATUS_MPRV        0x00020000
-#define MSTATUS_SUM         0x00040000
-#define MSTATUS_MXR         0x00080000
-#define MSTATUS_TVM         0x00100000
-#define MSTATUS_TW          0x00200000
-#define MSTATUS_TSR         0x00400000
-#define MSTATUS32_SD        0x80000000
-#define MSTATUS_UXL         0x0000000300000000
-#define MSTATUS_SXL         0x0000000C00000000
-#define MSTATUS64_SD        0x8000000000000000
-
-#define SSTATUS_UIE         0x00000001
-#define SSTATUS_SIE         0x00000002
-#define SSTATUS_UPIE        0x00000010
-#define SSTATUS_SPIE        0x00000020
-#define SSTATUS_SPP         0x00000100
-#define SSTATUS_FS          0x00006000
-#define SSTATUS_XS          0x00018000
-#define SSTATUS_SUM         0x00040000
-#define SSTATUS_MXR         0x00080000
-#define SSTATUS32_SD        0x80000000
-#define SSTATUS_UXL         0x0000000300000000
-#define SSTATUS64_SD        0x8000000000000000
-
-#define DCSR_XDEBUGVER      (3U<<30)
-#define DCSR_NDRESET        (1<<29)
-#define DCSR_FULLRESET      (1<<28)
-#define DCSR_EBREAKM        (1<<15)
-#define DCSR_EBREAKH        (1<<14)
-#define DCSR_EBREAKS        (1<<13)
-#define DCSR_EBREAKU        (1<<12)
-#define DCSR_STOPCYCLE      (1<<10)
-#define DCSR_STOPTIME       (1<<9)
-#define DCSR_CAUSE          (7<<6)
-#define DCSR_DEBUGINT       (1<<5)
-#define DCSR_HALT           (1<<3)
-#define DCSR_STEP           (1<<2)
-#define DCSR_PRV            (3<<0)
-
-#define DCSR_CAUSE_NONE     0
-#define DCSR_CAUSE_SWBP     1
-#define DCSR_CAUSE_HWBP     2
-#define DCSR_CAUSE_DEBUGINT 3
-#define DCSR_CAUSE_STEP     4
-#define DCSR_CAUSE_HALT     5
-
-#define MCONTROL_TYPE(xlen)    (0xfULL<<((xlen)-4))
-#define MCONTROL_DMODE(xlen)   (1ULL<<((xlen)-5))
-#define MCONTROL_MASKMAX(xlen) (0x3fULL<<((xlen)-11))
-
-#define MCONTROL_SELECT     (1<<19)
-#define MCONTROL_TIMING     (1<<18)
-#define MCONTROL_ACTION     (0x3f<<12)
-#define MCONTROL_CHAIN      (1<<11)
-#define MCONTROL_MATCH      (0xf<<7)
-#define MCONTROL_M          (1<<6)
-#define MCONTROL_H          (1<<5)
-#define MCONTROL_S          (1<<4)
-#define MCONTROL_U          (1<<3)
-#define MCONTROL_EXECUTE    (1<<2)
-#define MCONTROL_STORE      (1<<1)
-#define MCONTROL_LOAD       (1<<0)
-
-#define MCONTROL_TYPE_NONE      0
-#define MCONTROL_TYPE_MATCH     2
-
-#define MCONTROL_ACTION_DEBUG_EXCEPTION   0
-#define MCONTROL_ACTION_DEBUG_MODE        1
-#define MCONTROL_ACTION_TRACE_START       2
-#define MCONTROL_ACTION_TRACE_STOP        3
-#define MCONTROL_ACTION_TRACE_EMIT        4
-
-#define MCONTROL_MATCH_EQUAL     0
-#define MCONTROL_MATCH_NAPOT     1
-#define MCONTROL_MATCH_GE        2
-#define MCONTROL_MATCH_LT        3
-#define MCONTROL_MATCH_MASK_LOW  4
-#define MCONTROL_MATCH_MASK_HIGH 5
-
-#define MIP_SSIP            (1 << IRQ_S_SOFT)
-#define MIP_HSIP            (1 << IRQ_H_SOFT)
-#define MIP_MSIP            (1 << IRQ_M_SOFT)
-#define MIP_STIP            (1 << IRQ_S_TIMER)
-#define MIP_HTIP            (1 << IRQ_H_TIMER)
-#define MIP_MTIP            (1 << IRQ_M_TIMER)
-#define MIP_SEIP            (1 << IRQ_S_EXT)
-#define MIP_HEIP            (1 << IRQ_H_EXT)
-#define MIP_MEIP            (1 << IRQ_M_EXT)
-
-#define SIP_SSIP MIP_SSIP
-#define SIP_STIP MIP_STIP
-
-#define PRV_U 0
-#define PRV_S 1
-#define PRV_H 2
-#define PRV_M 3
-
-#define SATP32_MODE 0x80000000
-#define SATP32_ASID 0x7FC00000
-#define SATP32_PPN  0x003FFFFF
-#define SATP64_MODE 0xF000000000000000
-#define SATP64_ASID 0x0FFFF00000000000
-#define SATP64_PPN  0x00000FFFFFFFFFFF
-
-#define SATP_MODE_OFF  0
-#define SATP_MODE_SV32 1
-#define SATP_MODE_SV39 8
-#define SATP_MODE_SV48 9
-#define SATP_MODE_SV57 10
-#define SATP_MODE_SV64 11
-
-#define PMP_R     0x01
-#define PMP_W     0x02
-#define PMP_X     0x04
-#define PMP_A     0x18
-#define PMP_L     0x80
-#define PMP_SHIFT 2
-
-#define PMP_TOR   0x08
-#define PMP_NA4   0x10
-#define PMP_NAPOT 0x18
-
-#define IRQ_S_SOFT   1
-#define IRQ_H_SOFT   2
-#define IRQ_M_SOFT   3
-#define IRQ_S_TIMER  5
-#define IRQ_H_TIMER  6
-#define IRQ_M_TIMER  7
-#define IRQ_S_EXT    9
-#define IRQ_H_EXT    10
-#define IRQ_M_EXT    11
-#define IRQ_COP      12
-#define IRQ_HOST     13
-
-#define DEFAULT_RSTVEC     0x00001000
-#define CLINT_BASE         0x02000000
-#define CLINT_SIZE         0x000c0000
-#define EXT_IO_BASE        0x40000000
-#define DRAM_BASE          0x80000000
-
-// page table entry (PTE) fields
-#define PTE_V     0x001 // Valid
-#define PTE_R     0x002 // Read
-#define PTE_W     0x004 // Write
-#define PTE_X     0x008 // Execute
-#define PTE_U     0x010 // User
-#define PTE_G     0x020 // Global
-#define PTE_A     0x040 // Accessed
-#define PTE_D     0x080 // Dirty
-#define PTE_SOFT  0x300 // Reserved for Software
-
-#define PTE_PPN_SHIFT 10
-
-#define PTE_TABLE(PTE) (((PTE) & (PTE_V | PTE_R | PTE_W | PTE_X)) == PTE_V)
-
-#ifdef __riscv
-
-#if __riscv_xlen == 64
-# define MSTATUS_SD MSTATUS64_SD
-# define SSTATUS_SD SSTATUS64_SD
-# define RISCV_PGLEVEL_BITS 9
-# define SATP_MODE SATP64_MODE
-#else
-# define MSTATUS_SD MSTATUS32_SD
-# define SSTATUS_SD SSTATUS32_SD
-# define RISCV_PGLEVEL_BITS 10
-# define SATP_MODE SATP32_MODE
-#endif
-#define RISCV_PGSHIFT 12
-#define RISCV_PGSIZE (1 << RISCV_PGSHIFT)
-
-#ifndef __ASSEMBLER__
-
-#ifdef __GNUC__
-
-#define read_csr(reg) ({ unsigned long __tmp; \
-  asm volatile ("csrr %0, " #reg : "=r"(__tmp)); \
-  __tmp; })
-
-#define write_csr(reg, val) ({ \
-  asm volatile ("csrw " #reg ", %0" :: "rK"(val)); })
-
-#define swap_csr(reg, val) ({ unsigned long __tmp; \
-  asm volatile ("csrrw %0, " #reg ", %1" : "=r"(__tmp) : "rK"(val)); \
-  __tmp; })
-
-#define set_csr(reg, bit) ({ unsigned long __tmp; \
-  asm volatile ("csrrs %0, " #reg ", %1" : "=r"(__tmp) : "rK"(bit)); \
-  __tmp; })
-
-#define clear_csr(reg, bit) ({ unsigned long __tmp; \
-  asm volatile ("csrrc %0, " #reg ", %1" : "=r"(__tmp) : "rK"(bit)); \
-  __tmp; })
-
-#define rdtime() read_csr(time)
-#define rdcycle() read_csr(cycle)
-#define rdinstret() read_csr(instret)
-
-#endif
-
-#endif
-
-#endif
-
-#endif
-/* Automatically generated by parse-opcodes.  */
-#ifndef RISCV_ENCODING_H
-#define RISCV_ENCODING_H
-#define MATCH_BEQ 0x63
-#define MASK_BEQ  0x707f
-#define MATCH_BNE 0x1063
-#define MASK_BNE  0x707f
-#define MATCH_BLT 0x4063
-#define MASK_BLT  0x707f
-#define MATCH_BGE 0x5063
-#define MASK_BGE  0x707f
-#define MATCH_BLTU 0x6063
-#define MASK_BLTU  0x707f
-#define MATCH_BGEU 0x7063
-#define MASK_BGEU  0x707f
-#define MATCH_JALR 0x67
-#define MASK_JALR  0x707f
-#define MATCH_JAL 0x6f
-#define MASK_JAL  0x7f
-#define MATCH_LUI 0x37
-#define MASK_LUI  0x7f
-#define MATCH_AUIPC 0x17
-#define MASK_AUIPC  0x7f
-#define MATCH_ADDI 0x13
-#define MASK_ADDI  0x707f
-#define MATCH_SLLI 0x1013
-#define MASK_SLLI  0xfc00707f
-#define MATCH_SLTI 0x2013
-#define MASK_SLTI  0x707f
-#define MATCH_SLTIU 0x3013
-#define MASK_SLTIU  0x707f
-#define MATCH_XORI 0x4013
-#define MASK_XORI  0x707f
-#define MATCH_SRLI 0x5013
-#define MASK_SRLI  0xfc00707f
-#define MATCH_SRAI 0x40005013
-#define MASK_SRAI  0xfc00707f
-#define MATCH_ORI 0x6013
-#define MASK_ORI  0x707f
-#define MATCH_ANDI 0x7013
-#define MASK_ANDI  0x707f
-#define MATCH_ADD 0x33
-#define MASK_ADD  0xfe00707f
-#define MATCH_SUB 0x40000033
-#define MASK_SUB  0xfe00707f
-#define MATCH_SLL 0x1033
-#define MASK_SLL  0xfe00707f
-#define MATCH_SLT 0x2033
-#define MASK_SLT  0xfe00707f
-#define MATCH_SLTU 0x3033
-#define MASK_SLTU  0xfe00707f
-#define MATCH_XOR 0x4033
-#define MASK_XOR  0xfe00707f
-#define MATCH_SRL 0x5033
-#define MASK_SRL  0xfe00707f
-#define MATCH_SRA 0x40005033
-#define MASK_SRA  0xfe00707f
-#define MATCH_OR 0x6033
-#define MASK_OR  0xfe00707f
-#define MATCH_AND 0x7033
-#define MASK_AND  0xfe00707f
-#define MATCH_ADDIW 0x1b
-#define MASK_ADDIW  0x707f
-#define MATCH_SLLIW 0x101b
-#define MASK_SLLIW  0xfe00707f
-#define MATCH_SRLIW 0x501b
-#define MASK_SRLIW  0xfe00707f
-#define MATCH_SRAIW 0x4000501b
-#define MASK_SRAIW  0xfe00707f
-#define MATCH_ADDW 0x3b
-#define MASK_ADDW  0xfe00707f
-#define MATCH_SUBW 0x4000003b
-#define MASK_SUBW  0xfe00707f
-#define MATCH_SLLW 0x103b
-#define MASK_SLLW  0xfe00707f
-#define MATCH_SRLW 0x503b
-#define MASK_SRLW  0xfe00707f
-#define MATCH_SRAW 0x4000503b
-#define MASK_SRAW  0xfe00707f
-#define MATCH_LB 0x3
-#define MASK_LB  0x707f
-#define MATCH_LH 0x1003
-#define MASK_LH  0x707f
-#define MATCH_LW 0x2003
-#define MASK_LW  0x707f
-#define MATCH_LD 0x3003
-#define MASK_LD  0x707f
-#define MATCH_LBU 0x4003
-#define MASK_LBU  0x707f
-#define MATCH_LHU 0x5003
-#define MASK_LHU  0x707f
-#define MATCH_LWU 0x6003
-#define MASK_LWU  0x707f
-#define MATCH_SB 0x23
-#define MASK_SB  0x707f
-#define MATCH_SH 0x1023
-#define MASK_SH  0x707f
-#define MATCH_SW 0x2023
-#define MASK_SW  0x707f
-#define MATCH_SD 0x3023
-#define MASK_SD  0x707f
-#define MATCH_FENCE 0xf
-#define MASK_FENCE  0x707f
-#define MATCH_FENCE_I 0x100f
-#define MASK_FENCE_I  0x707f
-#define MATCH_MUL 0x2000033
-#define MASK_MUL  0xfe00707f
-#define MATCH_MULH 0x2001033
-#define MASK_MULH  0xfe00707f
-#define MATCH_MULHSU 0x2002033
-#define MASK_MULHSU  0xfe00707f
-#define MATCH_MULHU 0x2003033
-#define MASK_MULHU  0xfe00707f
-#define MATCH_DIV 0x2004033
-#define MASK_DIV  0xfe00707f
-#define MATCH_DIVU 0x2005033
-#define MASK_DIVU  0xfe00707f
-#define MATCH_REM 0x2006033
-#define MASK_REM  0xfe00707f
-#define MATCH_REMU 0x2007033
-#define MASK_REMU  0xfe00707f
-#define MATCH_MULW 0x200003b
-#define MASK_MULW  0xfe00707f
-#define MATCH_DIVW 0x200403b
-#define MASK_DIVW  0xfe00707f
-#define MATCH_DIVUW 0x200503b
-#define MASK_DIVUW  0xfe00707f
-#define MATCH_REMW 0x200603b
-#define MASK_REMW  0xfe00707f
-#define MATCH_REMUW 0x200703b
-#define MASK_REMUW  0xfe00707f
-#define MATCH_AMOADD_W 0x202f
-#define MASK_AMOADD_W  0xf800707f
-#define MATCH_AMOXOR_W 0x2000202f
-#define MASK_AMOXOR_W  0xf800707f
-#define MATCH_AMOOR_W 0x4000202f
-#define MASK_AMOOR_W  0xf800707f
-#define MATCH_AMOAND_W 0x6000202f
-#define MASK_AMOAND_W  0xf800707f
-#define MATCH_AMOMIN_W 0x8000202f
-#define MASK_AMOMIN_W  0xf800707f
-#define MATCH_AMOMAX_W 0xa000202f
-#define MASK_AMOMAX_W  0xf800707f
-#define MATCH_AMOMINU_W 0xc000202f
-#define MASK_AMOMINU_W  0xf800707f
-#define MATCH_AMOMAXU_W 0xe000202f
-#define MASK_AMOMAXU_W  0xf800707f
-#define MATCH_AMOSWAP_W 0x800202f
-#define MASK_AMOSWAP_W  0xf800707f
-#define MATCH_LR_W 0x1000202f
-#define MASK_LR_W  0xf9f0707f
-#define MATCH_SC_W 0x1800202f
-#define MASK_SC_W  0xf800707f
-#define MATCH_AMOADD_D 0x302f
-#define MASK_AMOADD_D  0xf800707f
-#define MATCH_AMOXOR_D 0x2000302f
-#define MASK_AMOXOR_D  0xf800707f
-#define MATCH_AMOOR_D 0x4000302f
-#define MASK_AMOOR_D  0xf800707f
-#define MATCH_AMOAND_D 0x6000302f
-#define MASK_AMOAND_D  0xf800707f
-#define MATCH_AMOMIN_D 0x8000302f
-#define MASK_AMOMIN_D  0xf800707f
-#define MATCH_AMOMAX_D 0xa000302f
-#define MASK_AMOMAX_D  0xf800707f
-#define MATCH_AMOMINU_D 0xc000302f
-#define MASK_AMOMINU_D  0xf800707f
-#define MATCH_AMOMAXU_D 0xe000302f
-#define MASK_AMOMAXU_D  0xf800707f
-#define MATCH_AMOSWAP_D 0x800302f
-#define MASK_AMOSWAP_D  0xf800707f
-#define MATCH_LR_D 0x1000302f
-#define MASK_LR_D  0xf9f0707f
-#define MATCH_SC_D 0x1800302f
-#define MASK_SC_D  0xf800707f
-#define MATCH_ECALL 0x73
-#define MASK_ECALL  0xffffffff
-#define MATCH_EBREAK 0x100073
-#define MASK_EBREAK  0xffffffff
-#define MATCH_URET 0x200073
-#define MASK_URET  0xffffffff
-#define MATCH_SRET 0x10200073
-#define MASK_SRET  0xffffffff
-#define MATCH_MRET 0x30200073
-#define MASK_MRET  0xffffffff
-#define MATCH_DRET 0x7b200073
-#define MASK_DRET  0xffffffff
-#define MATCH_SFENCE_VMA 0x12000073
-#define MASK_SFENCE_VMA  0xfe007fff
-#define MATCH_WFI 0x10500073
-#define MASK_WFI  0xffffffff
-#define MATCH_CSRRW 0x1073
-#define MASK_CSRRW  0x707f
-#define MATCH_CSRRS 0x2073
-#define MASK_CSRRS  0x707f
-#define MATCH_CSRRC 0x3073
-#define MASK_CSRRC  0x707f
-#define MATCH_CSRRWI 0x5073
-#define MASK_CSRRWI  0x707f
-#define MATCH_CSRRSI 0x6073
-#define MASK_CSRRSI  0x707f
-#define MATCH_CSRRCI 0x7073
-#define MASK_CSRRCI  0x707f
-#define MATCH_FADD_S 0x53
-#define MASK_FADD_S  0xfe00007f
-#define MATCH_FSUB_S 0x8000053
-#define MASK_FSUB_S  0xfe00007f
-#define MATCH_FMUL_S 0x10000053
-#define MASK_FMUL_S  0xfe00007f
-#define MATCH_FDIV_S 0x18000053
-#define MASK_FDIV_S  0xfe00007f
-#define MATCH_FSGNJ_S 0x20000053
-#define MASK_FSGNJ_S  0xfe00707f
-#define MATCH_FSGNJN_S 0x20001053
-#define MASK_FSGNJN_S  0xfe00707f
-#define MATCH_FSGNJX_S 0x20002053
-#define MASK_FSGNJX_S  0xfe00707f
-#define MATCH_FMIN_S 0x28000053
-#define MASK_FMIN_S  0xfe00707f
-#define MATCH_FMAX_S 0x28001053
-#define MASK_FMAX_S  0xfe00707f
-#define MATCH_FSQRT_S 0x58000053
-#define MASK_FSQRT_S  0xfff0007f
-#define MATCH_FADD_D 0x2000053
-#define MASK_FADD_D  0xfe00007f
-#define MATCH_FSUB_D 0xa000053
-#define MASK_FSUB_D  0xfe00007f
-#define MATCH_FMUL_D 0x12000053
-#define MASK_FMUL_D  0xfe00007f
-#define MATCH_FDIV_D 0x1a000053
-#define MASK_FDIV_D  0xfe00007f
-#define MATCH_FSGNJ_D 0x22000053
-#define MASK_FSGNJ_D  0xfe00707f
-#define MATCH_FSGNJN_D 0x22001053
-#define MASK_FSGNJN_D  0xfe00707f
-#define MATCH_FSGNJX_D 0x22002053
-#define MASK_FSGNJX_D  0xfe00707f
-#define MATCH_FMIN_D 0x2a000053
-#define MASK_FMIN_D  0xfe00707f
-#define MATCH_FMAX_D 0x2a001053
-#define MASK_FMAX_D  0xfe00707f
-#define MATCH_FCVT_S_D 0x40100053
-#define MASK_FCVT_S_D  0xfff0007f
-#define MATCH_FCVT_D_S 0x42000053
-#define MASK_FCVT_D_S  0xfff0007f
-#define MATCH_FSQRT_D 0x5a000053
-#define MASK_FSQRT_D  0xfff0007f
-#define MATCH_FADD_Q 0x6000053
-#define MASK_FADD_Q  0xfe00007f
-#define MATCH_FSUB_Q 0xe000053
-#define MASK_FSUB_Q  0xfe00007f
-#define MATCH_FMUL_Q 0x16000053
-#define MASK_FMUL_Q  0xfe00007f
-#define MATCH_FDIV_Q 0x1e000053
-#define MASK_FDIV_Q  0xfe00007f
-#define MATCH_FSGNJ_Q 0x26000053
-#define MASK_FSGNJ_Q  0xfe00707f
-#define MATCH_FSGNJN_Q 0x26001053
-#define MASK_FSGNJN_Q  0xfe00707f
-#define MATCH_FSGNJX_Q 0x26002053
-#define MASK_FSGNJX_Q  0xfe00707f
-#define MATCH_FMIN_Q 0x2e000053
-#define MASK_FMIN_Q  0xfe00707f
-#define MATCH_FMAX_Q 0x2e001053
-#define MASK_FMAX_Q  0xfe00707f
-#define MATCH_FCVT_S_Q 0x40300053
-#define MASK_FCVT_S_Q  0xfff0007f
-#define MATCH_FCVT_Q_S 0x46000053
-#define MASK_FCVT_Q_S  0xfff0007f
-#define MATCH_FCVT_D_Q 0x42300053
-#define MASK_FCVT_D_Q  0xfff0007f
-#define MATCH_FCVT_Q_D 0x46100053
-#define MASK_FCVT_Q_D  0xfff0007f
-#define MATCH_FSQRT_Q 0x5e000053
-#define MASK_FSQRT_Q  0xfff0007f
-#define MATCH_FLE_S 0xa0000053
-#define MASK_FLE_S  0xfe00707f
-#define MATCH_FLT_S 0xa0001053
-#define MASK_FLT_S  0xfe00707f
-#define MATCH_FEQ_S 0xa0002053
-#define MASK_FEQ_S  0xfe00707f
-#define MATCH_FLE_D 0xa2000053
-#define MASK_FLE_D  0xfe00707f
-#define MATCH_FLT_D 0xa2001053
-#define MASK_FLT_D  0xfe00707f
-#define MATCH_FEQ_D 0xa2002053
-#define MASK_FEQ_D  0xfe00707f
-#define MATCH_FLE_Q 0xa6000053
-#define MASK_FLE_Q  0xfe00707f
-#define MATCH_FLT_Q 0xa6001053
-#define MASK_FLT_Q  0xfe00707f
-#define MATCH_FEQ_Q 0xa6002053
-#define MASK_FEQ_Q  0xfe00707f
-#define MATCH_FCVT_W_S 0xc0000053
-#define MASK_FCVT_W_S  0xfff0007f
-#define MATCH_FCVT_WU_S 0xc0100053
-#define MASK_FCVT_WU_S  0xfff0007f
-#define MATCH_FCVT_L_S 0xc0200053
-#define MASK_FCVT_L_S  0xfff0007f
-#define MATCH_FCVT_LU_S 0xc0300053
-#define MASK_FCVT_LU_S  0xfff0007f
-#define MATCH_FMV_X_W 0xe0000053
-#define MASK_FMV_X_W  0xfff0707f
-#define MATCH_FCLASS_S 0xe0001053
-#define MASK_FCLASS_S  0xfff0707f
-#define MATCH_FCVT_W_D 0xc2000053
-#define MASK_FCVT_W_D  0xfff0007f
-#define MATCH_FCVT_WU_D 0xc2100053
-#define MASK_FCVT_WU_D  0xfff0007f
-#define MATCH_FCVT_L_D 0xc2200053
-#define MASK_FCVT_L_D  0xfff0007f
-#define MATCH_FCVT_LU_D 0xc2300053
-#define MASK_FCVT_LU_D  0xfff0007f
-#define MATCH_FMV_X_D 0xe2000053
-#define MASK_FMV_X_D  0xfff0707f
-#define MATCH_FCLASS_D 0xe2001053
-#define MASK_FCLASS_D  0xfff0707f
-#define MATCH_FCVT_W_Q 0xc6000053
-#define MASK_FCVT_W_Q  0xfff0007f
-#define MATCH_FCVT_WU_Q 0xc6100053
-#define MASK_FCVT_WU_Q  0xfff0007f
-#define MATCH_FCVT_L_Q 0xc6200053
-#define MASK_FCVT_L_Q  0xfff0007f
-#define MATCH_FCVT_LU_Q 0xc6300053
-#define MASK_FCVT_LU_Q  0xfff0007f
-#define MATCH_FMV_X_Q 0xe6000053
-#define MASK_FMV_X_Q  0xfff0707f
-#define MATCH_FCLASS_Q 0xe6001053
-#define MASK_FCLASS_Q  0xfff0707f
-#define MATCH_FCVT_S_W 0xd0000053
-#define MASK_FCVT_S_W  0xfff0007f
-#define MATCH_FCVT_S_WU 0xd0100053
-#define MASK_FCVT_S_WU  0xfff0007f
-#define MATCH_FCVT_S_L 0xd0200053
-#define MASK_FCVT_S_L  0xfff0007f
-#define MATCH_FCVT_S_LU 0xd0300053
-#define MASK_FCVT_S_LU  0xfff0007f
-#define MATCH_FMV_W_X 0xf0000053
-#define MASK_FMV_W_X  0xfff0707f
-#define MATCH_FCVT_D_W 0xd2000053
-#define MASK_FCVT_D_W  0xfff0007f
-#define MATCH_FCVT_D_WU 0xd2100053
-#define MASK_FCVT_D_WU  0xfff0007f
-#define MATCH_FCVT_D_L 0xd2200053
-#define MASK_FCVT_D_L  0xfff0007f
-#define MATCH_FCVT_D_LU 0xd2300053
-#define MASK_FCVT_D_LU  0xfff0007f
-#define MATCH_FMV_D_X 0xf2000053
-#define MASK_FMV_D_X  0xfff0707f
-#define MATCH_FCVT_Q_W 0xd6000053
-#define MASK_FCVT_Q_W  0xfff0007f
-#define MATCH_FCVT_Q_WU 0xd6100053
-#define MASK_FCVT_Q_WU  0xfff0007f
-#define MATCH_FCVT_Q_L 0xd6200053
-#define MASK_FCVT_Q_L  0xfff0007f
-#define MATCH_FCVT_Q_LU 0xd6300053
-#define MASK_FCVT_Q_LU  0xfff0007f
-#define MATCH_FMV_Q_X 0xf6000053
-#define MASK_FMV_Q_X  0xfff0707f
-#define MATCH_FLW 0x2007
-#define MASK_FLW  0x707f
-#define MATCH_FLD 0x3007
-#define MASK_FLD  0x707f
-#define MATCH_FLQ 0x4007
-#define MASK_FLQ  0x707f
-#define MATCH_FSW 0x2027
-#define MASK_FSW  0x707f
-#define MATCH_FSD 0x3027
-#define MASK_FSD  0x707f
-#define MATCH_FSQ 0x4027
-#define MASK_FSQ  0x707f
-#define MATCH_FMADD_S 0x43
-#define MASK_FMADD_S  0x600007f
-#define MATCH_FMSUB_S 0x47
-#define MASK_FMSUB_S  0x600007f
-#define MATCH_FNMSUB_S 0x4b
-#define MASK_FNMSUB_S  0x600007f
-#define MATCH_FNMADD_S 0x4f
-#define MASK_FNMADD_S  0x600007f
-#define MATCH_FMADD_D 0x2000043
-#define MASK_FMADD_D  0x600007f
-#define MATCH_FMSUB_D 0x2000047
-#define MASK_FMSUB_D  0x600007f
-#define MATCH_FNMSUB_D 0x200004b
-#define MASK_FNMSUB_D  0x600007f
-#define MATCH_FNMADD_D 0x200004f
-#define MASK_FNMADD_D  0x600007f
-#define MATCH_FMADD_Q 0x6000043
-#define MASK_FMADD_Q  0x600007f
-#define MATCH_FMSUB_Q 0x6000047
-#define MASK_FMSUB_Q  0x600007f
-#define MATCH_FNMSUB_Q 0x600004b
-#define MASK_FNMSUB_Q  0x600007f
-#define MATCH_FNMADD_Q 0x600004f
-#define MASK_FNMADD_Q  0x600007f
-#define MATCH_C_NOP 0x1
-#define MASK_C_NOP  0xffff
-#define MATCH_C_ADDI16SP 0x6101
-#define MASK_C_ADDI16SP  0xef83
-#define MATCH_C_JR 0x8002
-#define MASK_C_JR  0xf07f
-#define MATCH_C_JALR 0x9002
-#define MASK_C_JALR  0xf07f
-#define MATCH_C_EBREAK 0x9002
-#define MASK_C_EBREAK  0xffff
-#define MATCH_C_LD 0x6000
-#define MASK_C_LD  0xe003
-#define MATCH_C_SD 0xe000
-#define MASK_C_SD  0xe003
-#define MATCH_C_ADDIW 0x2001
-#define MASK_C_ADDIW  0xe003
-#define MATCH_C_LDSP 0x6002
-#define MASK_C_LDSP  0xe003
-#define MATCH_C_SDSP 0xe002
-#define MASK_C_SDSP  0xe003
-#define MATCH_C_ADDI4SPN 0x0
-#define MASK_C_ADDI4SPN  0xe003
-#define MATCH_C_FLD 0x2000
-#define MASK_C_FLD  0xe003
-#define MATCH_C_LW 0x4000
-#define MASK_C_LW  0xe003
-#define MATCH_C_FLW 0x6000
-#define MASK_C_FLW  0xe003
-#define MATCH_C_FSD 0xa000
-#define MASK_C_FSD  0xe003
-#define MATCH_C_SW 0xc000
-#define MASK_C_SW  0xe003
-#define MATCH_C_FSW 0xe000
-#define MASK_C_FSW  0xe003
-#define MATCH_C_ADDI 0x1
-#define MASK_C_ADDI  0xe003
-#define MATCH_C_JAL 0x2001
-#define MASK_C_JAL  0xe003
-#define MATCH_C_LI 0x4001
-#define MASK_C_LI  0xe003
-#define MATCH_C_LUI 0x6001
-#define MASK_C_LUI  0xe003
-#define MATCH_C_SRLI 0x8001
-#define MASK_C_SRLI  0xec03
-#define MATCH_C_SRAI 0x8401
-#define MASK_C_SRAI  0xec03
-#define MATCH_C_ANDI 0x8801
-#define MASK_C_ANDI  0xec03
-#define MATCH_C_SUB 0x8c01
-#define MASK_C_SUB  0xfc63
-#define MATCH_C_XOR 0x8c21
-#define MASK_C_XOR  0xfc63
-#define MATCH_C_OR 0x8c41
-#define MASK_C_OR  0xfc63
-#define MATCH_C_AND 0x8c61
-#define MASK_C_AND  0xfc63
-#define MATCH_C_SUBW 0x9c01
-#define MASK_C_SUBW  0xfc63
-#define MATCH_C_ADDW 0x9c21
-#define MASK_C_ADDW  0xfc63
-#define MATCH_C_J 0xa001
-#define MASK_C_J  0xe003
-#define MATCH_C_BEQZ 0xc001
-#define MASK_C_BEQZ  0xe003
-#define MATCH_C_BNEZ 0xe001
-#define MASK_C_BNEZ  0xe003
-#define MATCH_C_SLLI 0x2
-#define MASK_C_SLLI  0xe003
-#define MATCH_C_FLDSP 0x2002
-#define MASK_C_FLDSP  0xe003
-#define MATCH_C_LWSP 0x4002
-#define MASK_C_LWSP  0xe003
-#define MATCH_C_FLWSP 0x6002
-#define MASK_C_FLWSP  0xe003
-#define MATCH_C_MV 0x8002
-#define MASK_C_MV  0xf003
-#define MATCH_C_ADD 0x9002
-#define MASK_C_ADD  0xf003
-#define MATCH_C_FSDSP 0xa002
-#define MASK_C_FSDSP  0xe003
-#define MATCH_C_SWSP 0xc002
-#define MASK_C_SWSP  0xe003
-#define MATCH_C_FSWSP 0xe002
-#define MASK_C_FSWSP  0xe003
-#define MATCH_CUSTOM0 0xb
-#define MASK_CUSTOM0  0x707f
-#define MATCH_CUSTOM0_RS1 0x200b
-#define MASK_CUSTOM0_RS1  0x707f
-#define MATCH_CUSTOM0_RS1_RS2 0x300b
-#define MASK_CUSTOM0_RS1_RS2  0x707f
-#define MATCH_CUSTOM0_RD 0x400b
-#define MASK_CUSTOM0_RD  0x707f
-#define MATCH_CUSTOM0_RD_RS1 0x600b
-#define MASK_CUSTOM0_RD_RS1  0x707f
-#define MATCH_CUSTOM0_RD_RS1_RS2 0x700b
-#define MASK_CUSTOM0_RD_RS1_RS2  0x707f
-#define MATCH_CUSTOM1 0x2b
-#define MASK_CUSTOM1  0x707f
-#define MATCH_CUSTOM1_RS1 0x202b
-#define MASK_CUSTOM1_RS1  0x707f
-#define MATCH_CUSTOM1_RS1_RS2 0x302b
-#define MASK_CUSTOM1_RS1_RS2  0x707f
-#define MATCH_CUSTOM1_RD 0x402b
-#define MASK_CUSTOM1_RD  0x707f
-#define MATCH_CUSTOM1_RD_RS1 0x602b
-#define MASK_CUSTOM1_RD_RS1  0x707f
-#define MATCH_CUSTOM1_RD_RS1_RS2 0x702b
-#define MASK_CUSTOM1_RD_RS1_RS2  0x707f
-#define MATCH_CUSTOM2 0x5b
-#define MASK_CUSTOM2  0x707f
-#define MATCH_CUSTOM2_RS1 0x205b
-#define MASK_CUSTOM2_RS1  0x707f
-#define MATCH_CUSTOM2_RS1_RS2 0x305b
-#define MASK_CUSTOM2_RS1_RS2  0x707f
-#define MATCH_CUSTOM2_RD 0x405b
-#define MASK_CUSTOM2_RD  0x707f
-#define MATCH_CUSTOM2_RD_RS1 0x605b
-#define MASK_CUSTOM2_RD_RS1  0x707f
-#define MATCH_CUSTOM2_RD_RS1_RS2 0x705b
-#define MASK_CUSTOM2_RD_RS1_RS2  0x707f
-#define MATCH_CUSTOM3 0x7b
-#define MASK_CUSTOM3  0x707f
-#define MATCH_CUSTOM3_RS1 0x207b
-#define MASK_CUSTOM3_RS1  0x707f
-#define MATCH_CUSTOM3_RS1_RS2 0x307b
-#define MASK_CUSTOM3_RS1_RS2  0x707f
-#define MATCH_CUSTOM3_RD 0x407b
-#define MASK_CUSTOM3_RD  0x707f
-#define MATCH_CUSTOM3_RD_RS1 0x607b
-#define MASK_CUSTOM3_RD_RS1  0x707f
-#define MATCH_CUSTOM3_RD_RS1_RS2 0x707b
-#define MASK_CUSTOM3_RD_RS1_RS2  0x707f
-#define CSR_FFLAGS 0x1
-#define CSR_FRM 0x2
-#define CSR_FCSR 0x3
-#define CSR_CYCLE 0xc00
-#define CSR_TIME 0xc01
-#define CSR_INSTRET 0xc02
-#define CSR_HPMCOUNTER3 0xc03
-#define CSR_HPMCOUNTER4 0xc04
-#define CSR_HPMCOUNTER5 0xc05
-#define CSR_HPMCOUNTER6 0xc06
-#define CSR_HPMCOUNTER7 0xc07
-#define CSR_HPMCOUNTER8 0xc08
-#define CSR_HPMCOUNTER9 0xc09
-#define CSR_HPMCOUNTER10 0xc0a
-#define CSR_HPMCOUNTER11 0xc0b
-#define CSR_HPMCOUNTER12 0xc0c
-#define CSR_HPMCOUNTER13 0xc0d
-#define CSR_HPMCOUNTER14 0xc0e
-#define CSR_HPMCOUNTER15 0xc0f
-#define CSR_HPMCOUNTER16 0xc10
-#define CSR_HPMCOUNTER17 0xc11
-#define CSR_HPMCOUNTER18 0xc12
-#define CSR_HPMCOUNTER19 0xc13
-#define CSR_HPMCOUNTER20 0xc14
-#define CSR_HPMCOUNTER21 0xc15
-#define CSR_HPMCOUNTER22 0xc16
-#define CSR_HPMCOUNTER23 0xc17
-#define CSR_HPMCOUNTER24 0xc18
-#define CSR_HPMCOUNTER25 0xc19
-#define CSR_HPMCOUNTER26 0xc1a
-#define CSR_HPMCOUNTER27 0xc1b
-#define CSR_HPMCOUNTER28 0xc1c
-#define CSR_HPMCOUNTER29 0xc1d
-#define CSR_HPMCOUNTER30 0xc1e
-#define CSR_HPMCOUNTER31 0xc1f
-#define CSR_SSTATUS 0x100
-#define CSR_SIE 0x104
-#define CSR_STVEC 0x105
-#define CSR_SCOUNTEREN 0x106
-#define CSR_SSCRATCH 0x140
-#define CSR_SEPC 0x141
-#define CSR_SCAUSE 0x142
-#define CSR_STVAL 0x143
-#define CSR_SIP 0x144
-#define CSR_SATP 0x180
-#define CSR_MSTATUS 0x300
-#define CSR_MISA 0x301
-#define CSR_MEDELEG 0x302
-#define CSR_MIDELEG 0x303
-#define CSR_MIE 0x304
-#define CSR_MTVEC 0x305
-#define CSR_MCOUNTEREN 0x306
-#define CSR_MSCRATCH 0x340
-#define CSR_MEPC 0x341
-#define CSR_MCAUSE 0x342
-#define CSR_MTVAL 0x343
-#define CSR_MIP 0x344
-#define CSR_PMPCFG0 0x3a0
-#define CSR_PMPCFG1 0x3a1
-#define CSR_PMPCFG2 0x3a2
-#define CSR_PMPCFG3 0x3a3
-#define CSR_PMPADDR0 0x3b0
-#define CSR_PMPADDR1 0x3b1
-#define CSR_PMPADDR2 0x3b2
-#define CSR_PMPADDR3 0x3b3
-#define CSR_PMPADDR4 0x3b4
-#define CSR_PMPADDR5 0x3b5
-#define CSR_PMPADDR6 0x3b6
-#define CSR_PMPADDR7 0x3b7
-#define CSR_PMPADDR8 0x3b8
-#define CSR_PMPADDR9 0x3b9
-#define CSR_PMPADDR10 0x3ba
-#define CSR_PMPADDR11 0x3bb
-#define CSR_PMPADDR12 0x3bc
-#define CSR_PMPADDR13 0x3bd
-#define CSR_PMPADDR14 0x3be
-#define CSR_PMPADDR15 0x3bf
-#define CSR_TSELECT 0x7a0
-#define CSR_TDATA1 0x7a1
-#define CSR_TDATA2 0x7a2
-#define CSR_TDATA3 0x7a3
-#define CSR_DCSR 0x7b0
-#define CSR_DPC 0x7b1
-#define CSR_DSCRATCH 0x7b2
-#define CSR_MCYCLE 0xb00
-#define CSR_MINSTRET 0xb02
-#define CSR_MHPMCOUNTER3 0xb03
-#define CSR_MHPMCOUNTER4 0xb04
-#define CSR_MHPMCOUNTER5 0xb05
-#define CSR_MHPMCOUNTER6 0xb06
-#define CSR_MHPMCOUNTER7 0xb07
-#define CSR_MHPMCOUNTER8 0xb08
-#define CSR_MHPMCOUNTER9 0xb09
-#define CSR_MHPMCOUNTER10 0xb0a
-#define CSR_MHPMCOUNTER11 0xb0b
-#define CSR_MHPMCOUNTER12 0xb0c
-#define CSR_MHPMCOUNTER13 0xb0d
-#define CSR_MHPMCOUNTER14 0xb0e
-#define CSR_MHPMCOUNTER15 0xb0f
-#define CSR_MHPMCOUNTER16 0xb10
-#define CSR_MHPMCOUNTER17 0xb11
-#define CSR_MHPMCOUNTER18 0xb12
-#define CSR_MHPMCOUNTER19 0xb13
-#define CSR_MHPMCOUNTER20 0xb14
-#define CSR_MHPMCOUNTER21 0xb15
-#define CSR_MHPMCOUNTER22 0xb16
-#define CSR_MHPMCOUNTER23 0xb17
-#define CSR_MHPMCOUNTER24 0xb18
-#define CSR_MHPMCOUNTER25 0xb19
-#define CSR_MHPMCOUNTER26 0xb1a
-#define CSR_MHPMCOUNTER27 0xb1b
-#define CSR_MHPMCOUNTER28 0xb1c
-#define CSR_MHPMCOUNTER29 0xb1d
-#define CSR_MHPMCOUNTER30 0xb1e
-#define CSR_MHPMCOUNTER31 0xb1f
-#define CSR_MHPMEVENT3 0x323
-#define CSR_MHPMEVENT4 0x324
-#define CSR_MHPMEVENT5 0x325
-#define CSR_MHPMEVENT6 0x326
-#define CSR_MHPMEVENT7 0x327
-#define CSR_MHPMEVENT8 0x328
-#define CSR_MHPMEVENT9 0x329
-#define CSR_MHPMEVENT10 0x32a
-#define CSR_MHPMEVENT11 0x32b
-#define CSR_MHPMEVENT12 0x32c
-#define CSR_MHPMEVENT13 0x32d
-#define CSR_MHPMEVENT14 0x32e
-#define CSR_MHPMEVENT15 0x32f
-#define CSR_MHPMEVENT16 0x330
-#define CSR_MHPMEVENT17 0x331
-#define CSR_MHPMEVENT18 0x332
-#define CSR_MHPMEVENT19 0x333
-#define CSR_MHPMEVENT20 0x334
-#define CSR_MHPMEVENT21 0x335
-#define CSR_MHPMEVENT22 0x336
-#define CSR_MHPMEVENT23 0x337
-#define CSR_MHPMEVENT24 0x338
-#define CSR_MHPMEVENT25 0x339
-#define CSR_MHPMEVENT26 0x33a
-#define CSR_MHPMEVENT27 0x33b
-#define CSR_MHPMEVENT28 0x33c
-#define CSR_MHPMEVENT29 0x33d
-#define CSR_MHPMEVENT30 0x33e
-#define CSR_MHPMEVENT31 0x33f
-#define CSR_MVENDORID 0xf11
-#define CSR_MARCHID 0xf12
-#define CSR_MIMPID 0xf13
-#define CSR_MHARTID 0xf14
-#define CSR_CYCLEH 0xc80
-#define CSR_TIMEH 0xc81
-#define CSR_INSTRETH 0xc82
-#define CSR_HPMCOUNTER3H 0xc83
-#define CSR_HPMCOUNTER4H 0xc84
-#define CSR_HPMCOUNTER5H 0xc85
-#define CSR_HPMCOUNTER6H 0xc86
-#define CSR_HPMCOUNTER7H 0xc87
-#define CSR_HPMCOUNTER8H 0xc88
-#define CSR_HPMCOUNTER9H 0xc89
-#define CSR_HPMCOUNTER10H 0xc8a
-#define CSR_HPMCOUNTER11H 0xc8b
-#define CSR_HPMCOUNTER12H 0xc8c
-#define CSR_HPMCOUNTER13H 0xc8d
-#define CSR_HPMCOUNTER14H 0xc8e
-#define CSR_HPMCOUNTER15H 0xc8f
-#define CSR_HPMCOUNTER16H 0xc90
-#define CSR_HPMCOUNTER17H 0xc91
-#define CSR_HPMCOUNTER18H 0xc92
-#define CSR_HPMCOUNTER19H 0xc93
-#define CSR_HPMCOUNTER20H 0xc94
-#define CSR_HPMCOUNTER21H 0xc95
-#define CSR_HPMCOUNTER22H 0xc96
-#define CSR_HPMCOUNTER23H 0xc97
-#define CSR_HPMCOUNTER24H 0xc98
-#define CSR_HPMCOUNTER25H 0xc99
-#define CSR_HPMCOUNTER26H 0xc9a
-#define CSR_HPMCOUNTER27H 0xc9b
-#define CSR_HPMCOUNTER28H 0xc9c
-#define CSR_HPMCOUNTER29H 0xc9d
-#define CSR_HPMCOUNTER30H 0xc9e
-#define CSR_HPMCOUNTER31H 0xc9f
-#define CSR_MCYCLEH 0xb80
-#define CSR_MINSTRETH 0xb82
-#define CSR_MHPMCOUNTER3H 0xb83
-#define CSR_MHPMCOUNTER4H 0xb84
-#define CSR_MHPMCOUNTER5H 0xb85
-#define CSR_MHPMCOUNTER6H 0xb86
-#define CSR_MHPMCOUNTER7H 0xb87
-#define CSR_MHPMCOUNTER8H 0xb88
-#define CSR_MHPMCOUNTER9H 0xb89
-#define CSR_MHPMCOUNTER10H 0xb8a
-#define CSR_MHPMCOUNTER11H 0xb8b
-#define CSR_MHPMCOUNTER12H 0xb8c
-#define CSR_MHPMCOUNTER13H 0xb8d
-#define CSR_MHPMCOUNTER14H 0xb8e
-#define CSR_MHPMCOUNTER15H 0xb8f
-#define CSR_MHPMCOUNTER16H 0xb90
-#define CSR_MHPMCOUNTER17H 0xb91
-#define CSR_MHPMCOUNTER18H 0xb92
-#define CSR_MHPMCOUNTER19H 0xb93
-#define CSR_MHPMCOUNTER20H 0xb94
-#define CSR_MHPMCOUNTER21H 0xb95
-#define CSR_MHPMCOUNTER22H 0xb96
-#define CSR_MHPMCOUNTER23H 0xb97
-#define CSR_MHPMCOUNTER24H 0xb98
-#define CSR_MHPMCOUNTER25H 0xb99
-#define CSR_MHPMCOUNTER26H 0xb9a
-#define CSR_MHPMCOUNTER27H 0xb9b
-#define CSR_MHPMCOUNTER28H 0xb9c
-#define CSR_MHPMCOUNTER29H 0xb9d
-#define CSR_MHPMCOUNTER30H 0xb9e
-#define CSR_MHPMCOUNTER31H 0xb9f
-#define CAUSE_MISALIGNED_FETCH 0x0
-#define CAUSE_FETCH_ACCESS 0x1
-#define CAUSE_ILLEGAL_INSTRUCTION 0x2
-#define CAUSE_BREAKPOINT 0x3
-#define CAUSE_MISALIGNED_LOAD 0x4
-#define CAUSE_LOAD_ACCESS 0x5
-#define CAUSE_MISALIGNED_STORE 0x6
-#define CAUSE_STORE_ACCESS 0x7
-#define CAUSE_USER_ECALL 0x8
-#define CAUSE_SUPERVISOR_ECALL 0x9
-#define CAUSE_HYPERVISOR_ECALL 0xa
-#define CAUSE_MACHINE_ECALL 0xb
-#define CAUSE_FETCH_PAGE_FAULT 0xc
-#define CAUSE_LOAD_PAGE_FAULT 0xd
-#define CAUSE_STORE_PAGE_FAULT 0xf
-#endif
-#ifdef DECLARE_INSN
-DECLARE_INSN(beq, MATCH_BEQ, MASK_BEQ)
-DECLARE_INSN(bne, MATCH_BNE, MASK_BNE)
-DECLARE_INSN(blt, MATCH_BLT, MASK_BLT)
-DECLARE_INSN(bge, MATCH_BGE, MASK_BGE)
-DECLARE_INSN(bltu, MATCH_BLTU, MASK_BLTU)
-DECLARE_INSN(bgeu, MATCH_BGEU, MASK_BGEU)
-DECLARE_INSN(jalr, MATCH_JALR, MASK_JALR)
-DECLARE_INSN(jal, MATCH_JAL, MASK_JAL)
-DECLARE_INSN(lui, MATCH_LUI, MASK_LUI)
-DECLARE_INSN(auipc, MATCH_AUIPC, MASK_AUIPC)
-DECLARE_INSN(addi, MATCH_ADDI, MASK_ADDI)
-DECLARE_INSN(slli, MATCH_SLLI, MASK_SLLI)
-DECLARE_INSN(slti, MATCH_SLTI, MASK_SLTI)
-DECLARE_INSN(sltiu, MATCH_SLTIU, MASK_SLTIU)
-DECLARE_INSN(xori, MATCH_XORI, MASK_XORI)
-DECLARE_INSN(srli, MATCH_SRLI, MASK_SRLI)
-DECLARE_INSN(srai, MATCH_SRAI, MASK_SRAI)
-DECLARE_INSN(ori, MATCH_ORI, MASK_ORI)
-DECLARE_INSN(andi, MATCH_ANDI, MASK_ANDI)
-DECLARE_INSN(add, MATCH_ADD, MASK_ADD)
-DECLARE_INSN(sub, MATCH_SUB, MASK_SUB)
-DECLARE_INSN(sll, MATCH_SLL, MASK_SLL)
-DECLARE_INSN(slt, MATCH_SLT, MASK_SLT)
-DECLARE_INSN(sltu, MATCH_SLTU, MASK_SLTU)
-DECLARE_INSN(xor, MATCH_XOR, MASK_XOR)
-DECLARE_INSN(srl, MATCH_SRL, MASK_SRL)
-DECLARE_INSN(sra, MATCH_SRA, MASK_SRA)
-DECLARE_INSN(or, MATCH_OR, MASK_OR)
-DECLARE_INSN(and, MATCH_AND, MASK_AND)
-DECLARE_INSN(addiw, MATCH_ADDIW, MASK_ADDIW)
-DECLARE_INSN(slliw, MATCH_SLLIW, MASK_SLLIW)
-DECLARE_INSN(srliw, MATCH_SRLIW, MASK_SRLIW)
-DECLARE_INSN(sraiw, MATCH_SRAIW, MASK_SRAIW)
-DECLARE_INSN(addw, MATCH_ADDW, MASK_ADDW)
-DECLARE_INSN(subw, MATCH_SUBW, MASK_SUBW)
-DECLARE_INSN(sllw, MATCH_SLLW, MASK_SLLW)
-DECLARE_INSN(srlw, MATCH_SRLW, MASK_SRLW)
-DECLARE_INSN(sraw, MATCH_SRAW, MASK_SRAW)
-DECLARE_INSN(lb, MATCH_LB, MASK_LB)
-DECLARE_INSN(lh, MATCH_LH, MASK_LH)
-DECLARE_INSN(lw, MATCH_LW, MASK_LW)
-DECLARE_INSN(ld, MATCH_LD, MASK_LD)
-DECLARE_INSN(lbu, MATCH_LBU, MASK_LBU)
-DECLARE_INSN(lhu, MATCH_LHU, MASK_LHU)
-DECLARE_INSN(lwu, MATCH_LWU, MASK_LWU)
-DECLARE_INSN(sb, MATCH_SB, MASK_SB)
-DECLARE_INSN(sh, MATCH_SH, MASK_SH)
-DECLARE_INSN(sw, MATCH_SW, MASK_SW)
-DECLARE_INSN(sd, MATCH_SD, MASK_SD)
-DECLARE_INSN(fence, MATCH_FENCE, MASK_FENCE)
-DECLARE_INSN(fence_i, MATCH_FENCE_I, MASK_FENCE_I)
-DECLARE_INSN(mul, MATCH_MUL, MASK_MUL)
-DECLARE_INSN(mulh, MATCH_MULH, MASK_MULH)
-DECLARE_INSN(mulhsu, MATCH_MULHSU, MASK_MULHSU)
-DECLARE_INSN(mulhu, MATCH_MULHU, MASK_MULHU)
-DECLARE_INSN(div, MATCH_DIV, MASK_DIV)
-DECLARE_INSN(divu, MATCH_DIVU, MASK_DIVU)
-DECLARE_INSN(rem, MATCH_REM, MASK_REM)
-DECLARE_INSN(remu, MATCH_REMU, MASK_REMU)
-DECLARE_INSN(mulw, MATCH_MULW, MASK_MULW)
-DECLARE_INSN(divw, MATCH_DIVW, MASK_DIVW)
-DECLARE_INSN(divuw, MATCH_DIVUW, MASK_DIVUW)
-DECLARE_INSN(remw, MATCH_REMW, MASK_REMW)
-DECLARE_INSN(remuw, MATCH_REMUW, MASK_REMUW)
-DECLARE_INSN(amoadd_w, MATCH_AMOADD_W, MASK_AMOADD_W)
-DECLARE_INSN(amoxor_w, MATCH_AMOXOR_W, MASK_AMOXOR_W)
-DECLARE_INSN(amoor_w, MATCH_AMOOR_W, MASK_AMOOR_W)
-DECLARE_INSN(amoand_w, MATCH_AMOAND_W, MASK_AMOAND_W)
-DECLARE_INSN(amomin_w, MATCH_AMOMIN_W, MASK_AMOMIN_W)
-DECLARE_INSN(amomax_w, MATCH_AMOMAX_W, MASK_AMOMAX_W)
-DECLARE_INSN(amominu_w, MATCH_AMOMINU_W, MASK_AMOMINU_W)
-DECLARE_INSN(amomaxu_w, MATCH_AMOMAXU_W, MASK_AMOMAXU_W)
-DECLARE_INSN(amoswap_w, MATCH_AMOSWAP_W, MASK_AMOSWAP_W)
-DECLARE_INSN(lr_w, MATCH_LR_W, MASK_LR_W)
-DECLARE_INSN(sc_w, MATCH_SC_W, MASK_SC_W)
-DECLARE_INSN(amoadd_d, MATCH_AMOADD_D, MASK_AMOADD_D)
-DECLARE_INSN(amoxor_d, MATCH_AMOXOR_D, MASK_AMOXOR_D)
-DECLARE_INSN(amoor_d, MATCH_AMOOR_D, MASK_AMOOR_D)
-DECLARE_INSN(amoand_d, MATCH_AMOAND_D, MASK_AMOAND_D)
-DECLARE_INSN(amomin_d, MATCH_AMOMIN_D, MASK_AMOMIN_D)
-DECLARE_INSN(amomax_d, MATCH_AMOMAX_D, MASK_AMOMAX_D)
-DECLARE_INSN(amominu_d, MATCH_AMOMINU_D, MASK_AMOMINU_D)
-DECLARE_INSN(amomaxu_d, MATCH_AMOMAXU_D, MASK_AMOMAXU_D)
-DECLARE_INSN(amoswap_d, MATCH_AMOSWAP_D, MASK_AMOSWAP_D)
-DECLARE_INSN(lr_d, MATCH_LR_D, MASK_LR_D)
-DECLARE_INSN(sc_d, MATCH_SC_D, MASK_SC_D)
-DECLARE_INSN(ecall, MATCH_ECALL, MASK_ECALL)
-DECLARE_INSN(ebreak, MATCH_EBREAK, MASK_EBREAK)
-DECLARE_INSN(uret, MATCH_URET, MASK_URET)
-DECLARE_INSN(sret, MATCH_SRET, MASK_SRET)
-DECLARE_INSN(mret, MATCH_MRET, MASK_MRET)
-DECLARE_INSN(dret, MATCH_DRET, MASK_DRET)
-DECLARE_INSN(sfence_vma, MATCH_SFENCE_VMA, MASK_SFENCE_VMA)
-DECLARE_INSN(wfi, MATCH_WFI, MASK_WFI)
-DECLARE_INSN(csrrw, MATCH_CSRRW, MASK_CSRRW)
-DECLARE_INSN(csrrs, MATCH_CSRRS, MASK_CSRRS)
-DECLARE_INSN(csrrc, MATCH_CSRRC, MASK_CSRRC)
-DECLARE_INSN(csrrwi, MATCH_CSRRWI, MASK_CSRRWI)
-DECLARE_INSN(csrrsi, MATCH_CSRRSI, MASK_CSRRSI)
-DECLARE_INSN(csrrci, MATCH_CSRRCI, MASK_CSRRCI)
-DECLARE_INSN(fadd_s, MATCH_FADD_S, MASK_FADD_S)
-DECLARE_INSN(fsub_s, MATCH_FSUB_S, MASK_FSUB_S)
-DECLARE_INSN(fmul_s, MATCH_FMUL_S, MASK_FMUL_S)
-DECLARE_INSN(fdiv_s, MATCH_FDIV_S, MASK_FDIV_S)
-DECLARE_INSN(fsgnj_s, MATCH_FSGNJ_S, MASK_FSGNJ_S)
-DECLARE_INSN(fsgnjn_s, MATCH_FSGNJN_S, MASK_FSGNJN_S)
-DECLARE_INSN(fsgnjx_s, MATCH_FSGNJX_S, MASK_FSGNJX_S)
-DECLARE_INSN(fmin_s, MATCH_FMIN_S, MASK_FMIN_S)
-DECLARE_INSN(fmax_s, MATCH_FMAX_S, MASK_FMAX_S)
-DECLARE_INSN(fsqrt_s, MATCH_FSQRT_S, MASK_FSQRT_S)
-DECLARE_INSN(fadd_d, MATCH_FADD_D, MASK_FADD_D)
-DECLARE_INSN(fsub_d, MATCH_FSUB_D, MASK_FSUB_D)
-DECLARE_INSN(fmul_d, MATCH_FMUL_D, MASK_FMUL_D)
-DECLARE_INSN(fdiv_d, MATCH_FDIV_D, MASK_FDIV_D)
-DECLARE_INSN(fsgnj_d, MATCH_FSGNJ_D, MASK_FSGNJ_D)
-DECLARE_INSN(fsgnjn_d, MATCH_FSGNJN_D, MASK_FSGNJN_D)
-DECLARE_INSN(fsgnjx_d, MATCH_FSGNJX_D, MASK_FSGNJX_D)
-DECLARE_INSN(fmin_d, MATCH_FMIN_D, MASK_FMIN_D)
-DECLARE_INSN(fmax_d, MATCH_FMAX_D, MASK_FMAX_D)
-DECLARE_INSN(fcvt_s_d, MATCH_FCVT_S_D, MASK_FCVT_S_D)
-DECLARE_INSN(fcvt_d_s, MATCH_FCVT_D_S, MASK_FCVT_D_S)
-DECLARE_INSN(fsqrt_d, MATCH_FSQRT_D, MASK_FSQRT_D)
-DECLARE_INSN(fadd_q, MATCH_FADD_Q, MASK_FADD_Q)
-DECLARE_INSN(fsub_q, MATCH_FSUB_Q, MASK_FSUB_Q)
-DECLARE_INSN(fmul_q, MATCH_FMUL_Q, MASK_FMUL_Q)
-DECLARE_INSN(fdiv_q, MATCH_FDIV_Q, MASK_FDIV_Q)
-DECLARE_INSN(fsgnj_q, MATCH_FSGNJ_Q, MASK_FSGNJ_Q)
-DECLARE_INSN(fsgnjn_q, MATCH_FSGNJN_Q, MASK_FSGNJN_Q)
-DECLARE_INSN(fsgnjx_q, MATCH_FSGNJX_Q, MASK_FSGNJX_Q)
-DECLARE_INSN(fmin_q, MATCH_FMIN_Q, MASK_FMIN_Q)
-DECLARE_INSN(fmax_q, MATCH_FMAX_Q, MASK_FMAX_Q)
-DECLARE_INSN(fcvt_s_q, MATCH_FCVT_S_Q, MASK_FCVT_S_Q)
-DECLARE_INSN(fcvt_q_s, MATCH_FCVT_Q_S, MASK_FCVT_Q_S)
-DECLARE_INSN(fcvt_d_q, MATCH_FCVT_D_Q, MASK_FCVT_D_Q)
-DECLARE_INSN(fcvt_q_d, MATCH_FCVT_Q_D, MASK_FCVT_Q_D)
-DECLARE_INSN(fsqrt_q, MATCH_FSQRT_Q, MASK_FSQRT_Q)
-DECLARE_INSN(fle_s, MATCH_FLE_S, MASK_FLE_S)
-DECLARE_INSN(flt_s, MATCH_FLT_S, MASK_FLT_S)
-DECLARE_INSN(feq_s, MATCH_FEQ_S, MASK_FEQ_S)
-DECLARE_INSN(fle_d, MATCH_FLE_D, MASK_FLE_D)
-DECLARE_INSN(flt_d, MATCH_FLT_D, MASK_FLT_D)
-DECLARE_INSN(feq_d, MATCH_FEQ_D, MASK_FEQ_D)
-DECLARE_INSN(fle_q, MATCH_FLE_Q, MASK_FLE_Q)
-DECLARE_INSN(flt_q, MATCH_FLT_Q, MASK_FLT_Q)
-DECLARE_INSN(feq_q, MATCH_FEQ_Q, MASK_FEQ_Q)
-DECLARE_INSN(fcvt_w_s, MATCH_FCVT_W_S, MASK_FCVT_W_S)
-DECLARE_INSN(fcvt_wu_s, MATCH_FCVT_WU_S, MASK_FCVT_WU_S)
-DECLARE_INSN(fcvt_l_s, MATCH_FCVT_L_S, MASK_FCVT_L_S)
-DECLARE_INSN(fcvt_lu_s, MATCH_FCVT_LU_S, MASK_FCVT_LU_S)
-DECLARE_INSN(fmv_x_w, MATCH_FMV_X_W, MASK_FMV_X_W)
-DECLARE_INSN(fclass_s, MATCH_FCLASS_S, MASK_FCLASS_S)
-DECLARE_INSN(fcvt_w_d, MATCH_FCVT_W_D, MASK_FCVT_W_D)
-DECLARE_INSN(fcvt_wu_d, MATCH_FCVT_WU_D, MASK_FCVT_WU_D)
-DECLARE_INSN(fcvt_l_d, MATCH_FCVT_L_D, MASK_FCVT_L_D)
-DECLARE_INSN(fcvt_lu_d, MATCH_FCVT_LU_D, MASK_FCVT_LU_D)
-DECLARE_INSN(fmv_x_d, MATCH_FMV_X_D, MASK_FMV_X_D)
-DECLARE_INSN(fclass_d, MATCH_FCLASS_D, MASK_FCLASS_D)
-DECLARE_INSN(fcvt_w_q, MATCH_FCVT_W_Q, MASK_FCVT_W_Q)
-DECLARE_INSN(fcvt_wu_q, MATCH_FCVT_WU_Q, MASK_FCVT_WU_Q)
-DECLARE_INSN(fcvt_l_q, MATCH_FCVT_L_Q, MASK_FCVT_L_Q)
-DECLARE_INSN(fcvt_lu_q, MATCH_FCVT_LU_Q, MASK_FCVT_LU_Q)
-DECLARE_INSN(fmv_x_q, MATCH_FMV_X_Q, MASK_FMV_X_Q)
-DECLARE_INSN(fclass_q, MATCH_FCLASS_Q, MASK_FCLASS_Q)
-DECLARE_INSN(fcvt_s_w, MATCH_FCVT_S_W, MASK_FCVT_S_W)
-DECLARE_INSN(fcvt_s_wu, MATCH_FCVT_S_WU, MASK_FCVT_S_WU)
-DECLARE_INSN(fcvt_s_l, MATCH_FCVT_S_L, MASK_FCVT_S_L)
-DECLARE_INSN(fcvt_s_lu, MATCH_FCVT_S_LU, MASK_FCVT_S_LU)
-DECLARE_INSN(fmv_w_x, MATCH_FMV_W_X, MASK_FMV_W_X)
-DECLARE_INSN(fcvt_d_w, MATCH_FCVT_D_W, MASK_FCVT_D_W)
-DECLARE_INSN(fcvt_d_wu, MATCH_FCVT_D_WU, MASK_FCVT_D_WU)
-DECLARE_INSN(fcvt_d_l, MATCH_FCVT_D_L, MASK_FCVT_D_L)
-DECLARE_INSN(fcvt_d_lu, MATCH_FCVT_D_LU, MASK_FCVT_D_LU)
-DECLARE_INSN(fmv_d_x, MATCH_FMV_D_X, MASK_FMV_D_X)
-DECLARE_INSN(fcvt_q_w, MATCH_FCVT_Q_W, MASK_FCVT_Q_W)
-DECLARE_INSN(fcvt_q_wu, MATCH_FCVT_Q_WU, MASK_FCVT_Q_WU)
-DECLARE_INSN(fcvt_q_l, MATCH_FCVT_Q_L, MASK_FCVT_Q_L)
-DECLARE_INSN(fcvt_q_lu, MATCH_FCVT_Q_LU, MASK_FCVT_Q_LU)
-DECLARE_INSN(fmv_q_x, MATCH_FMV_Q_X, MASK_FMV_Q_X)
-DECLARE_INSN(flw, MATCH_FLW, MASK_FLW)
-DECLARE_INSN(fld, MATCH_FLD, MASK_FLD)
-DECLARE_INSN(flq, MATCH_FLQ, MASK_FLQ)
-DECLARE_INSN(fsw, MATCH_FSW, MASK_FSW)
-DECLARE_INSN(fsd, MATCH_FSD, MASK_FSD)
-DECLARE_INSN(fsq, MATCH_FSQ, MASK_FSQ)
-DECLARE_INSN(fmadd_s, MATCH_FMADD_S, MASK_FMADD_S)
-DECLARE_INSN(fmsub_s, MATCH_FMSUB_S, MASK_FMSUB_S)
-DECLARE_INSN(fnmsub_s, MATCH_FNMSUB_S, MASK_FNMSUB_S)
-DECLARE_INSN(fnmadd_s, MATCH_FNMADD_S, MASK_FNMADD_S)
-DECLARE_INSN(fmadd_d, MATCH_FMADD_D, MASK_FMADD_D)
-DECLARE_INSN(fmsub_d, MATCH_FMSUB_D, MASK_FMSUB_D)
-DECLARE_INSN(fnmsub_d, MATCH_FNMSUB_D, MASK_FNMSUB_D)
-DECLARE_INSN(fnmadd_d, MATCH_FNMADD_D, MASK_FNMADD_D)
-DECLARE_INSN(fmadd_q, MATCH_FMADD_Q, MASK_FMADD_Q)
-DECLARE_INSN(fmsub_q, MATCH_FMSUB_Q, MASK_FMSUB_Q)
-DECLARE_INSN(fnmsub_q, MATCH_FNMSUB_Q, MASK_FNMSUB_Q)
-DECLARE_INSN(fnmadd_q, MATCH_FNMADD_Q, MASK_FNMADD_Q)
-DECLARE_INSN(c_nop, MATCH_C_NOP, MASK_C_NOP)
-DECLARE_INSN(c_addi16sp, MATCH_C_ADDI16SP, MASK_C_ADDI16SP)
-DECLARE_INSN(c_jr, MATCH_C_JR, MASK_C_JR)
-DECLARE_INSN(c_jalr, MATCH_C_JALR, MASK_C_JALR)
-DECLARE_INSN(c_ebreak, MATCH_C_EBREAK, MASK_C_EBREAK)
-DECLARE_INSN(c_ld, MATCH_C_LD, MASK_C_LD)
-DECLARE_INSN(c_sd, MATCH_C_SD, MASK_C_SD)
-DECLARE_INSN(c_addiw, MATCH_C_ADDIW, MASK_C_ADDIW)
-DECLARE_INSN(c_ldsp, MATCH_C_LDSP, MASK_C_LDSP)
-DECLARE_INSN(c_sdsp, MATCH_C_SDSP, MASK_C_SDSP)
-DECLARE_INSN(c_addi4spn, MATCH_C_ADDI4SPN, MASK_C_ADDI4SPN)
-DECLARE_INSN(c_fld, MATCH_C_FLD, MASK_C_FLD)
-DECLARE_INSN(c_lw, MATCH_C_LW, MASK_C_LW)
-DECLARE_INSN(c_flw, MATCH_C_FLW, MASK_C_FLW)
-DECLARE_INSN(c_fsd, MATCH_C_FSD, MASK_C_FSD)
-DECLARE_INSN(c_sw, MATCH_C_SW, MASK_C_SW)
-DECLARE_INSN(c_fsw, MATCH_C_FSW, MASK_C_FSW)
-DECLARE_INSN(c_addi, MATCH_C_ADDI, MASK_C_ADDI)
-DECLARE_INSN(c_jal, MATCH_C_JAL, MASK_C_JAL)
-DECLARE_INSN(c_li, MATCH_C_LI, MASK_C_LI)
-DECLARE_INSN(c_lui, MATCH_C_LUI, MASK_C_LUI)
-DECLARE_INSN(c_srli, MATCH_C_SRLI, MASK_C_SRLI)
-DECLARE_INSN(c_srai, MATCH_C_SRAI, MASK_C_SRAI)
-DECLARE_INSN(c_andi, MATCH_C_ANDI, MASK_C_ANDI)
-DECLARE_INSN(c_sub, MATCH_C_SUB, MASK_C_SUB)
-DECLARE_INSN(c_xor, MATCH_C_XOR, MASK_C_XOR)
-DECLARE_INSN(c_or, MATCH_C_OR, MASK_C_OR)
-DECLARE_INSN(c_and, MATCH_C_AND, MASK_C_AND)
-DECLARE_INSN(c_subw, MATCH_C_SUBW, MASK_C_SUBW)
-DECLARE_INSN(c_addw, MATCH_C_ADDW, MASK_C_ADDW)
-DECLARE_INSN(c_j, MATCH_C_J, MASK_C_J)
-DECLARE_INSN(c_beqz, MATCH_C_BEQZ, MASK_C_BEQZ)
-DECLARE_INSN(c_bnez, MATCH_C_BNEZ, MASK_C_BNEZ)
-DECLARE_INSN(c_slli, MATCH_C_SLLI, MASK_C_SLLI)
-DECLARE_INSN(c_fldsp, MATCH_C_FLDSP, MASK_C_FLDSP)
-DECLARE_INSN(c_lwsp, MATCH_C_LWSP, MASK_C_LWSP)
-DECLARE_INSN(c_flwsp, MATCH_C_FLWSP, MASK_C_FLWSP)
-DECLARE_INSN(c_mv, MATCH_C_MV, MASK_C_MV)
-DECLARE_INSN(c_add, MATCH_C_ADD, MASK_C_ADD)
-DECLARE_INSN(c_fsdsp, MATCH_C_FSDSP, MASK_C_FSDSP)
-DECLARE_INSN(c_swsp, MATCH_C_SWSP, MASK_C_SWSP)
-DECLARE_INSN(c_fswsp, MATCH_C_FSWSP, MASK_C_FSWSP)
-DECLARE_INSN(custom0, MATCH_CUSTOM0, MASK_CUSTOM0)
-DECLARE_INSN(custom0_rs1, MATCH_CUSTOM0_RS1, MASK_CUSTOM0_RS1)
-DECLARE_INSN(custom0_rs1_rs2, MATCH_CUSTOM0_RS1_RS2, MASK_CUSTOM0_RS1_RS2)
-DECLARE_INSN(custom0_rd, MATCH_CUSTOM0_RD, MASK_CUSTOM0_RD)
-DECLARE_INSN(custom0_rd_rs1, MATCH_CUSTOM0_RD_RS1, MASK_CUSTOM0_RD_RS1)
-DECLARE_INSN(custom0_rd_rs1_rs2, MATCH_CUSTOM0_RD_RS1_RS2, MASK_CUSTOM0_RD_RS1_RS2)
-DECLARE_INSN(custom1, MATCH_CUSTOM1, MASK_CUSTOM1)
-DECLARE_INSN(custom1_rs1, MATCH_CUSTOM1_RS1, MASK_CUSTOM1_RS1)
-DECLARE_INSN(custom1_rs1_rs2, MATCH_CUSTOM1_RS1_RS2, MASK_CUSTOM1_RS1_RS2)
-DECLARE_INSN(custom1_rd, MATCH_CUSTOM1_RD, MASK_CUSTOM1_RD)
-DECLARE_INSN(custom1_rd_rs1, MATCH_CUSTOM1_RD_RS1, MASK_CUSTOM1_RD_RS1)
-DECLARE_INSN(custom1_rd_rs1_rs2, MATCH_CUSTOM1_RD_RS1_RS2, MASK_CUSTOM1_RD_RS1_RS2)
-DECLARE_INSN(custom2, MATCH_CUSTOM2, MASK_CUSTOM2)
-DECLARE_INSN(custom2_rs1, MATCH_CUSTOM2_RS1, MASK_CUSTOM2_RS1)
-DECLARE_INSN(custom2_rs1_rs2, MATCH_CUSTOM2_RS1_RS2, MASK_CUSTOM2_RS1_RS2)
-DECLARE_INSN(custom2_rd, MATCH_CUSTOM2_RD, MASK_CUSTOM2_RD)
-DECLARE_INSN(custom2_rd_rs1, MATCH_CUSTOM2_RD_RS1, MASK_CUSTOM2_RD_RS1)
-DECLARE_INSN(custom2_rd_rs1_rs2, MATCH_CUSTOM2_RD_RS1_RS2, MASK_CUSTOM2_RD_RS1_RS2)
-DECLARE_INSN(custom3, MATCH_CUSTOM3, MASK_CUSTOM3)
-DECLARE_INSN(custom3_rs1, MATCH_CUSTOM3_RS1, MASK_CUSTOM3_RS1)
-DECLARE_INSN(custom3_rs1_rs2, MATCH_CUSTOM3_RS1_RS2, MASK_CUSTOM3_RS1_RS2)
-DECLARE_INSN(custom3_rd, MATCH_CUSTOM3_RD, MASK_CUSTOM3_RD)
-DECLARE_INSN(custom3_rd_rs1, MATCH_CUSTOM3_RD_RS1, MASK_CUSTOM3_RD_RS1)
-DECLARE_INSN(custom3_rd_rs1_rs2, MATCH_CUSTOM3_RD_RS1_RS2, MASK_CUSTOM3_RD_RS1_RS2)
-#endif
-#ifdef DECLARE_CSR
-DECLARE_CSR(fflags, CSR_FFLAGS)
-DECLARE_CSR(frm, CSR_FRM)
-DECLARE_CSR(fcsr, CSR_FCSR)
-DECLARE_CSR(cycle, CSR_CYCLE)
-DECLARE_CSR(time, CSR_TIME)
-DECLARE_CSR(instret, CSR_INSTRET)
-DECLARE_CSR(hpmcounter3, CSR_HPMCOUNTER3)
-DECLARE_CSR(hpmcounter4, CSR_HPMCOUNTER4)
-DECLARE_CSR(hpmcounter5, CSR_HPMCOUNTER5)
-DECLARE_CSR(hpmcounter6, CSR_HPMCOUNTER6)
-DECLARE_CSR(hpmcounter7, CSR_HPMCOUNTER7)
-DECLARE_CSR(hpmcounter8, CSR_HPMCOUNTER8)
-DECLARE_CSR(hpmcounter9, CSR_HPMCOUNTER9)
-DECLARE_CSR(hpmcounter10, CSR_HPMCOUNTER10)
-DECLARE_CSR(hpmcounter11, CSR_HPMCOUNTER11)
-DECLARE_CSR(hpmcounter12, CSR_HPMCOUNTER12)
-DECLARE_CSR(hpmcounter13, CSR_HPMCOUNTER13)
-DECLARE_CSR(hpmcounter14, CSR_HPMCOUNTER14)
-DECLARE_CSR(hpmcounter15, CSR_HPMCOUNTER15)
-DECLARE_CSR(hpmcounter16, CSR_HPMCOUNTER16)
-DECLARE_CSR(hpmcounter17, CSR_HPMCOUNTER17)
-DECLARE_CSR(hpmcounter18, CSR_HPMCOUNTER18)
-DECLARE_CSR(hpmcounter19, CSR_HPMCOUNTER19)
-DECLARE_CSR(hpmcounter20, CSR_HPMCOUNTER20)
-DECLARE_CSR(hpmcounter21, CSR_HPMCOUNTER21)
-DECLARE_CSR(hpmcounter22, CSR_HPMCOUNTER22)
-DECLARE_CSR(hpmcounter23, CSR_HPMCOUNTER23)
-DECLARE_CSR(hpmcounter24, CSR_HPMCOUNTER24)
-DECLARE_CSR(hpmcounter25, CSR_HPMCOUNTER25)
-DECLARE_CSR(hpmcounter26, CSR_HPMCOUNTER26)
-DECLARE_CSR(hpmcounter27, CSR_HPMCOUNTER27)
-DECLARE_CSR(hpmcounter28, CSR_HPMCOUNTER28)
-DECLARE_CSR(hpmcounter29, CSR_HPMCOUNTER29)
-DECLARE_CSR(hpmcounter30, CSR_HPMCOUNTER30)
-DECLARE_CSR(hpmcounter31, CSR_HPMCOUNTER31)
-DECLARE_CSR(sstatus, CSR_SSTATUS)
-DECLARE_CSR(sie, CSR_SIE)
-DECLARE_CSR(stvec, CSR_STVEC)
-DECLARE_CSR(scounteren, CSR_SCOUNTEREN)
-DECLARE_CSR(sscratch, CSR_SSCRATCH)
-DECLARE_CSR(sepc, CSR_SEPC)
-DECLARE_CSR(scause, CSR_SCAUSE)
-DECLARE_CSR(stval, CSR_STVAL)
-DECLARE_CSR(sip, CSR_SIP)
-DECLARE_CSR(satp, CSR_SATP)
-DECLARE_CSR(mstatus, CSR_MSTATUS)
-DECLARE_CSR(misa, CSR_MISA)
-DECLARE_CSR(medeleg, CSR_MEDELEG)
-DECLARE_CSR(mideleg, CSR_MIDELEG)
-DECLARE_CSR(mie, CSR_MIE)
-DECLARE_CSR(mtvec, CSR_MTVEC)
-DECLARE_CSR(mcounteren, CSR_MCOUNTEREN)
-DECLARE_CSR(mscratch, CSR_MSCRATCH)
-DECLARE_CSR(mepc, CSR_MEPC)
-DECLARE_CSR(mcause, CSR_MCAUSE)
-DECLARE_CSR(mtval, CSR_MTVAL)
-DECLARE_CSR(mip, CSR_MIP)
-DECLARE_CSR(pmpcfg0, CSR_PMPCFG0)
-DECLARE_CSR(pmpcfg1, CSR_PMPCFG1)
-DECLARE_CSR(pmpcfg2, CSR_PMPCFG2)
-DECLARE_CSR(pmpcfg3, CSR_PMPCFG3)
-DECLARE_CSR(pmpaddr0, CSR_PMPADDR0)
-DECLARE_CSR(pmpaddr1, CSR_PMPADDR1)
-DECLARE_CSR(pmpaddr2, CSR_PMPADDR2)
-DECLARE_CSR(pmpaddr3, CSR_PMPADDR3)
-DECLARE_CSR(pmpaddr4, CSR_PMPADDR4)
-DECLARE_CSR(pmpaddr5, CSR_PMPADDR5)
-DECLARE_CSR(pmpaddr6, CSR_PMPADDR6)
-DECLARE_CSR(pmpaddr7, CSR_PMPADDR7)
-DECLARE_CSR(pmpaddr8, CSR_PMPADDR8)
-DECLARE_CSR(pmpaddr9, CSR_PMPADDR9)
-DECLARE_CSR(pmpaddr10, CSR_PMPADDR10)
-DECLARE_CSR(pmpaddr11, CSR_PMPADDR11)
-DECLARE_CSR(pmpaddr12, CSR_PMPADDR12)
-DECLARE_CSR(pmpaddr13, CSR_PMPADDR13)
-DECLARE_CSR(pmpaddr14, CSR_PMPADDR14)
-DECLARE_CSR(pmpaddr15, CSR_PMPADDR15)
-DECLARE_CSR(tselect, CSR_TSELECT)
-DECLARE_CSR(tdata1, CSR_TDATA1)
-DECLARE_CSR(tdata2, CSR_TDATA2)
-DECLARE_CSR(tdata3, CSR_TDATA3)
-DECLARE_CSR(dcsr, CSR_DCSR)
-DECLARE_CSR(dpc, CSR_DPC)
-DECLARE_CSR(dscratch, CSR_DSCRATCH)
-DECLARE_CSR(mcycle, CSR_MCYCLE)
-DECLARE_CSR(minstret, CSR_MINSTRET)
-DECLARE_CSR(mhpmcounter3, CSR_MHPMCOUNTER3)
-DECLARE_CSR(mhpmcounter4, CSR_MHPMCOUNTER4)
-DECLARE_CSR(mhpmcounter5, CSR_MHPMCOUNTER5)
-DECLARE_CSR(mhpmcounter6, CSR_MHPMCOUNTER6)
-DECLARE_CSR(mhpmcounter7, CSR_MHPMCOUNTER7)
-DECLARE_CSR(mhpmcounter8, CSR_MHPMCOUNTER8)
-DECLARE_CSR(mhpmcounter9, CSR_MHPMCOUNTER9)
-DECLARE_CSR(mhpmcounter10, CSR_MHPMCOUNTER10)
-DECLARE_CSR(mhpmcounter11, CSR_MHPMCOUNTER11)
-DECLARE_CSR(mhpmcounter12, CSR_MHPMCOUNTER12)
-DECLARE_CSR(mhpmcounter13, CSR_MHPMCOUNTER13)
-DECLARE_CSR(mhpmcounter14, CSR_MHPMCOUNTER14)
-DECLARE_CSR(mhpmcounter15, CSR_MHPMCOUNTER15)
-DECLARE_CSR(mhpmcounter16, CSR_MHPMCOUNTER16)
-DECLARE_CSR(mhpmcounter17, CSR_MHPMCOUNTER17)
-DECLARE_CSR(mhpmcounter18, CSR_MHPMCOUNTER18)
-DECLARE_CSR(mhpmcounter19, CSR_MHPMCOUNTER19)
-DECLARE_CSR(mhpmcounter20, CSR_MHPMCOUNTER20)
-DECLARE_CSR(mhpmcounter21, CSR_MHPMCOUNTER21)
-DECLARE_CSR(mhpmcounter22, CSR_MHPMCOUNTER22)
-DECLARE_CSR(mhpmcounter23, CSR_MHPMCOUNTER23)
-DECLARE_CSR(mhpmcounter24, CSR_MHPMCOUNTER24)
-DECLARE_CSR(mhpmcounter25, CSR_MHPMCOUNTER25)
-DECLARE_CSR(mhpmcounter26, CSR_MHPMCOUNTER26)
-DECLARE_CSR(mhpmcounter27, CSR_MHPMCOUNTER27)
-DECLARE_CSR(mhpmcounter28, CSR_MHPMCOUNTER28)
-DECLARE_CSR(mhpmcounter29, CSR_MHPMCOUNTER29)
-DECLARE_CSR(mhpmcounter30, CSR_MHPMCOUNTER30)
-DECLARE_CSR(mhpmcounter31, CSR_MHPMCOUNTER31)
-DECLARE_CSR(mhpmevent3, CSR_MHPMEVENT3)
-DECLARE_CSR(mhpmevent4, CSR_MHPMEVENT4)
-DECLARE_CSR(mhpmevent5, CSR_MHPMEVENT5)
-DECLARE_CSR(mhpmevent6, CSR_MHPMEVENT6)
-DECLARE_CSR(mhpmevent7, CSR_MHPMEVENT7)
-DECLARE_CSR(mhpmevent8, CSR_MHPMEVENT8)
-DECLARE_CSR(mhpmevent9, CSR_MHPMEVENT9)
-DECLARE_CSR(mhpmevent10, CSR_MHPMEVENT10)
-DECLARE_CSR(mhpmevent11, CSR_MHPMEVENT11)
-DECLARE_CSR(mhpmevent12, CSR_MHPMEVENT12)
-DECLARE_CSR(mhpmevent13, CSR_MHPMEVENT13)
-DECLARE_CSR(mhpmevent14, CSR_MHPMEVENT14)
-DECLARE_CSR(mhpmevent15, CSR_MHPMEVENT15)
-DECLARE_CSR(mhpmevent16, CSR_MHPMEVENT16)
-DECLARE_CSR(mhpmevent17, CSR_MHPMEVENT17)
-DECLARE_CSR(mhpmevent18, CSR_MHPMEVENT18)
-DECLARE_CSR(mhpmevent19, CSR_MHPMEVENT19)
-DECLARE_CSR(mhpmevent20, CSR_MHPMEVENT20)
-DECLARE_CSR(mhpmevent21, CSR_MHPMEVENT21)
-DECLARE_CSR(mhpmevent22, CSR_MHPMEVENT22)
-DECLARE_CSR(mhpmevent23, CSR_MHPMEVENT23)
-DECLARE_CSR(mhpmevent24, CSR_MHPMEVENT24)
-DECLARE_CSR(mhpmevent25, CSR_MHPMEVENT25)
-DECLARE_CSR(mhpmevent26, CSR_MHPMEVENT26)
-DECLARE_CSR(mhpmevent27, CSR_MHPMEVENT27)
-DECLARE_CSR(mhpmevent28, CSR_MHPMEVENT28)
-DECLARE_CSR(mhpmevent29, CSR_MHPMEVENT29)
-DECLARE_CSR(mhpmevent30, CSR_MHPMEVENT30)
-DECLARE_CSR(mhpmevent31, CSR_MHPMEVENT31)
-DECLARE_CSR(mvendorid, CSR_MVENDORID)
-DECLARE_CSR(marchid, CSR_MARCHID)
-DECLARE_CSR(mimpid, CSR_MIMPID)
-DECLARE_CSR(mhartid, CSR_MHARTID)
-DECLARE_CSR(cycleh, CSR_CYCLEH)
-DECLARE_CSR(timeh, CSR_TIMEH)
-DECLARE_CSR(instreth, CSR_INSTRETH)
-DECLARE_CSR(hpmcounter3h, CSR_HPMCOUNTER3H)
-DECLARE_CSR(hpmcounter4h, CSR_HPMCOUNTER4H)
-DECLARE_CSR(hpmcounter5h, CSR_HPMCOUNTER5H)
-DECLARE_CSR(hpmcounter6h, CSR_HPMCOUNTER6H)
-DECLARE_CSR(hpmcounter7h, CSR_HPMCOUNTER7H)
-DECLARE_CSR(hpmcounter8h, CSR_HPMCOUNTER8H)
-DECLARE_CSR(hpmcounter9h, CSR_HPMCOUNTER9H)
-DECLARE_CSR(hpmcounter10h, CSR_HPMCOUNTER10H)
-DECLARE_CSR(hpmcounter11h, CSR_HPMCOUNTER11H)
-DECLARE_CSR(hpmcounter12h, CSR_HPMCOUNTER12H)
-DECLARE_CSR(hpmcounter13h, CSR_HPMCOUNTER13H)
-DECLARE_CSR(hpmcounter14h, CSR_HPMCOUNTER14H)
-DECLARE_CSR(hpmcounter15h, CSR_HPMCOUNTER15H)
-DECLARE_CSR(hpmcounter16h, CSR_HPMCOUNTER16H)
-DECLARE_CSR(hpmcounter17h, CSR_HPMCOUNTER17H)
-DECLARE_CSR(hpmcounter18h, CSR_HPMCOUNTER18H)
-DECLARE_CSR(hpmcounter19h, CSR_HPMCOUNTER19H)
-DECLARE_CSR(hpmcounter20h, CSR_HPMCOUNTER20H)
-DECLARE_CSR(hpmcounter21h, CSR_HPMCOUNTER21H)
-DECLARE_CSR(hpmcounter22h, CSR_HPMCOUNTER22H)
-DECLARE_CSR(hpmcounter23h, CSR_HPMCOUNTER23H)
-DECLARE_CSR(hpmcounter24h, CSR_HPMCOUNTER24H)
-DECLARE_CSR(hpmcounter25h, CSR_HPMCOUNTER25H)
-DECLARE_CSR(hpmcounter26h, CSR_HPMCOUNTER26H)
-DECLARE_CSR(hpmcounter27h, CSR_HPMCOUNTER27H)
-DECLARE_CSR(hpmcounter28h, CSR_HPMCOUNTER28H)
-DECLARE_CSR(hpmcounter29h, CSR_HPMCOUNTER29H)
-DECLARE_CSR(hpmcounter30h, CSR_HPMCOUNTER30H)
-DECLARE_CSR(hpmcounter31h, CSR_HPMCOUNTER31H)
-DECLARE_CSR(mcycleh, CSR_MCYCLEH)
-DECLARE_CSR(minstreth, CSR_MINSTRETH)
-DECLARE_CSR(mhpmcounter3h, CSR_MHPMCOUNTER3H)
-DECLARE_CSR(mhpmcounter4h, CSR_MHPMCOUNTER4H)
-DECLARE_CSR(mhpmcounter5h, CSR_MHPMCOUNTER5H)
-DECLARE_CSR(mhpmcounter6h, CSR_MHPMCOUNTER6H)
-DECLARE_CSR(mhpmcounter7h, CSR_MHPMCOUNTER7H)
-DECLARE_CSR(mhpmcounter8h, CSR_MHPMCOUNTER8H)
-DECLARE_CSR(mhpmcounter9h, CSR_MHPMCOUNTER9H)
-DECLARE_CSR(mhpmcounter10h, CSR_MHPMCOUNTER10H)
-DECLARE_CSR(mhpmcounter11h, CSR_MHPMCOUNTER11H)
-DECLARE_CSR(mhpmcounter12h, CSR_MHPMCOUNTER12H)
-DECLARE_CSR(mhpmcounter13h, CSR_MHPMCOUNTER13H)
-DECLARE_CSR(mhpmcounter14h, CSR_MHPMCOUNTER14H)
-DECLARE_CSR(mhpmcounter15h, CSR_MHPMCOUNTER15H)
-DECLARE_CSR(mhpmcounter16h, CSR_MHPMCOUNTER16H)
-DECLARE_CSR(mhpmcounter17h, CSR_MHPMCOUNTER17H)
-DECLARE_CSR(mhpmcounter18h, CSR_MHPMCOUNTER18H)
-DECLARE_CSR(mhpmcounter19h, CSR_MHPMCOUNTER19H)
-DECLARE_CSR(mhpmcounter20h, CSR_MHPMCOUNTER20H)
-DECLARE_CSR(mhpmcounter21h, CSR_MHPMCOUNTER21H)
-DECLARE_CSR(mhpmcounter22h, CSR_MHPMCOUNTER22H)
-DECLARE_CSR(mhpmcounter23h, CSR_MHPMCOUNTER23H)
-DECLARE_CSR(mhpmcounter24h, CSR_MHPMCOUNTER24H)
-DECLARE_CSR(mhpmcounter25h, CSR_MHPMCOUNTER25H)
-DECLARE_CSR(mhpmcounter26h, CSR_MHPMCOUNTER26H)
-DECLARE_CSR(mhpmcounter27h, CSR_MHPMCOUNTER27H)
-DECLARE_CSR(mhpmcounter28h, CSR_MHPMCOUNTER28H)
-DECLARE_CSR(mhpmcounter29h, CSR_MHPMCOUNTER29H)
-DECLARE_CSR(mhpmcounter30h, CSR_MHPMCOUNTER30H)
-DECLARE_CSR(mhpmcounter31h, CSR_MHPMCOUNTER31H)
-#endif
-#ifdef DECLARE_CAUSE
-DECLARE_CAUSE("misaligned fetch", CAUSE_MISALIGNED_FETCH)
-DECLARE_CAUSE("fetch access", CAUSE_FETCH_ACCESS)
-DECLARE_CAUSE("illegal instruction", CAUSE_ILLEGAL_INSTRUCTION)
-DECLARE_CAUSE("breakpoint", CAUSE_BREAKPOINT)
-DECLARE_CAUSE("misaligned load", CAUSE_MISALIGNED_LOAD)
-DECLARE_CAUSE("load access", CAUSE_LOAD_ACCESS)
-DECLARE_CAUSE("misaligned store", CAUSE_MISALIGNED_STORE)
-DECLARE_CAUSE("store access", CAUSE_STORE_ACCESS)
-DECLARE_CAUSE("user_ecall", CAUSE_USER_ECALL)
-DECLARE_CAUSE("supervisor_ecall", CAUSE_SUPERVISOR_ECALL)
-DECLARE_CAUSE("hypervisor_ecall", CAUSE_HYPERVISOR_ECALL)
-DECLARE_CAUSE("machine_ecall", CAUSE_MACHINE_ECALL)
-DECLARE_CAUSE("fetch page fault", CAUSE_FETCH_PAGE_FAULT)
-DECLARE_CAUSE("load page fault", CAUSE_LOAD_PAGE_FAULT)
-DECLARE_CAUSE("store page fault", CAUSE_STORE_PAGE_FAULT)
-#endif
diff --git a/riscv/encoding.h b/riscv/encoding.h
new file mode 120000
index 0000000000..1075f15317
--- /dev/null
+++ b/riscv/encoding.h
@@ -0,0 +1 @@
+../../../software/runtime/encoding.h
\ No newline at end of file
diff --git a/riscv/execute.cc b/riscv/execute.cc
index e639e90462..84c6d91071 100644
--- a/riscv/execute.cc
+++ b/riscv/execute.cc
@@ -2,65 +2,164 @@
 
 #include "processor.h"
 #include "mmu.h"
+#include "disasm.h"
 #include <cassert>
 
+#ifdef RISCV_ENABLE_COMMITLOG
+static void commit_log_reset(processor_t* p)
+{
+  p->get_state()->log_reg_write.clear();
+  p->get_state()->log_mem_read.clear();
+  p->get_state()->log_mem_write.clear();
+}
 
 static void commit_log_stash_privilege(processor_t* p)
 {
-#ifdef RISCV_ENABLE_COMMITLOG
   state_t* state = p->get_state();
   state->last_inst_priv = state->prv;
   state->last_inst_xlen = p->get_xlen();
   state->last_inst_flen = p->get_flen();
-#endif
 }
 
-static void commit_log_print_value(int width, uint64_t hi, uint64_t lo)
+static void commit_log_print_value(FILE *log_file, int width, const void *data)  // print `width` bits read from `data` as hex to log_file
 {
+  assert(log_file);  // a log stream is required
+
   switch (width) {
+    case 8:
+      fprintf(log_file, "0x%02" PRIx8, *(const uint8_t *)data);  // two hex digits, zero-padded like the wider cases
+      break;
     case 16:
-      fprintf(stderr, "0x%04" PRIx16, (uint16_t)lo);
+      fprintf(log_file, "0x%04" PRIx16, *(const uint16_t *)data);
       break;
     case 32:
-      fprintf(stderr, "0x%08" PRIx32, (uint32_t)lo);
+      fprintf(log_file, "0x%08" PRIx32, *(const uint32_t *)data);
       break;
     case 64:
-      fprintf(stderr, "0x%016" PRIx64, lo);
-      break;
-    case 128:
-      fprintf(stderr, "0x%016" PRIx64 "%016" PRIx64, hi, lo);
+      fprintf(log_file, "0x%016" PRIx64, *(const uint64_t *)data);
       break;
     default:
-      abort();
+      // any other width (up to the max length of a vector) must be a power of two
+      if (((width - 1) & width) == 0) {
+        const uint64_t *arr = (const uint64_t *)data;
+
+        fprintf(log_file, "0x");
+        for (int idx = width / 64 - 1; idx >= 0; --idx) {  // highest-indexed 64-bit word first
+          fprintf(log_file, "%016" PRIx64, arr[idx]);
+        }
+      } else {
+        abort();  // widths that are not a power of two are rejected
+      }
+      break;
   }
 }
 
-static void commit_log_print_insn(state_t* state, reg_t pc, insn_t insn)
+static void commit_log_print_value(FILE *log_file, int width, uint64_t val)
 {
-#ifdef RISCV_ENABLE_COMMITLOG
-  auto& reg = state->log_reg_write;
-  int priv = state->last_inst_priv;
-  int xlen = state->last_inst_xlen;
-  int flen = state->last_inst_flen;
-
-  fprintf(stderr, "%1d ", priv);
-  commit_log_print_value(xlen, 0, pc);
-  fprintf(stderr, " (");
-  commit_log_print_value(insn.length() * 8, 0, insn.bits());
-
-  if (reg.addr) {
-    bool fp = reg.addr & 1;
-    int rd = reg.addr >> 1;
-    int size = fp ? flen : xlen;
-    fprintf(stderr, ") %c%2d ", fp ? 'f' : 'x', rd);
-    commit_log_print_value(size, reg.data.v[1], reg.data.v[0]);
-    fprintf(stderr, "\n");
-  } else {
-    fprintf(stderr, ")\n");
+  commit_log_print_value(log_file, width, &val);
+}
+
+const char* processor_t::get_symbol(uint64_t addr)
+{
+  return sim->get_symbol(addr);
+}
+
+static void commit_log_print_insn(processor_t *p, reg_t pc, insn_t insn)  // write one commit-log line for `insn` at `pc` to p's log file
+{
+  FILE *log_file = p->get_log_file();
+
+  auto& reg = p->get_state()->log_reg_write;
+  auto& load = p->get_state()->log_mem_read;
+  auto& store = p->get_state()->log_mem_write;
+  int priv = p->get_state()->last_inst_priv;
+  int xlen = p->get_state()->last_inst_xlen;
+  int flen = p->get_state()->last_inst_flen;
+
+  // print core id on all lines so it is easy to grep
+  uint64_t id = p->get_csr(CSR_MHARTID);
+  fprintf(log_file, "core%4" PRId64 ": ", id);
+
+  fprintf(log_file, "%1d ", priv);
+  commit_log_print_value(log_file, xlen, pc);
+  fprintf(log_file, " (");
+  commit_log_print_value(log_file, insn.length() * 8, insn.bits());
+  fprintf(log_file, ")");
+  bool show_vec = false;  // the vector configuration banner is printed at most once per line
+
+  for (const auto& item : reg) {  // bind by reference: entries are only read
+    if (item.first == 0)  // key 0 (class 0, index 0) is never logged
+      continue;
+
+    char prefix = '?';  // defined placeholder in case the default branch is reached with asserts disabled
+    int size = 0;  // likewise; every printing branch below overwrites it
+    int rd = item.first >> 4;  // upper bits of the key: register/CSR number
+    bool is_vec = false;
+    bool is_vreg = false;
+    switch (item.first & 0xf) {  // low four bits of the key: entry class
+    case 0:
+      size = xlen;
+      prefix = 'x';
+      break;
+    case 1:
+      size = flen;
+      prefix = 'f';
+      break;
+    case 2:
+      size = p->VU.VLEN;
+      prefix = 'v';
+      is_vreg = true;
+      break;
+    case 3:  // no value is printed for this class; it only triggers the vector banner
+      is_vec = true;
+      break;
+    case 4:
+      size = xlen;
+      prefix = 'c';
+      break;
+    default:
+      assert("can't been here" && 0);
+      break;
+    }
+
+    if (!show_vec && (is_vreg || is_vec)) {
+        fprintf(log_file, " e%ld %s%ld l%ld",  // NOTE(review): %ld assumes long-sized operands - confirm for non-LP64 hosts
+                p->VU.vsew,
+                p->VU.vflmul < 1 ? "mf" : "m",
+                p->VU.vflmul < 1 ? (reg_t)(1 / p->VU.vflmul) : (reg_t)p->VU.vflmul,
+                p->VU.vl);
+        show_vec = true;
+    }
+
+    if (!is_vec) {
+      if (prefix == 'c')
+        fprintf(log_file, " c%d_%s ", rd, csr_name(rd));
+      else
+        fprintf(log_file, " %c%2d ", prefix, rd);
+      if (is_vreg)
+        commit_log_print_value(log_file, size, &p->VU.elt<uint8_t>(rd, 0));  // read the value from the vector unit, not from the log entry
+      else
+        commit_log_print_value(log_file, size, item.second.v);
+    }
   }
-  reg.addr = 0;
-#endif
+
+  for (const auto& item : load) {  // loads: only element 0 of each entry is printed
+    fprintf(log_file, " mem ");
+    commit_log_print_value(log_file, xlen, std::get<0>(item));
+  }
+
+  for (const auto& item : store) {  // stores: element 0, then element 1 printed with a width of (element 2 << 3) bits
+    fprintf(log_file, " mem ");
+    commit_log_print_value(log_file, xlen, std::get<0>(item));
+    fprintf(log_file, " ");
+    commit_log_print_value(log_file, std::get<2>(item) << 3, std::get<1>(item));
+  }
+  fprintf(log_file, "\n");
 }
+#else
+static void commit_log_reset(processor_t* p) {}
+static void commit_log_stash_privilege(processor_t* p) {}
+static void commit_log_print_insn(processor_t* p, reg_t pc, insn_t insn) {}
+#endif
 
 inline void processor_t::update_histogram(reg_t pc)
 {
@@ -74,26 +173,55 @@ inline void processor_t::update_histogram(reg_t pc)
 // function calls.
 static reg_t execute_insn(processor_t* p, reg_t pc, insn_fetch_t fetch)
 {
+  commit_log_reset(p);
   commit_log_stash_privilege(p);
-  reg_t npc = fetch.func(p, fetch.insn, pc);
-  if (npc != PC_SERIALIZE_BEFORE) {
-    commit_log_print_insn(p->get_state(), pc, fetch.insn);
-    p->update_histogram(pc);
+  reg_t npc;
+
+  try {
+    npc = fetch.func(p, fetch.insn, pc);
+    if (npc != PC_SERIALIZE_BEFORE) {
+
+#ifdef RISCV_ENABLE_COMMITLOG
+      if (p->get_log_commits_enabled()) {
+        commit_log_print_insn(p, pc, fetch.insn);
+      }
+#endif
+
+     }
+#ifdef RISCV_ENABLE_COMMITLOG
+  } catch(mem_trap_t& t) {
+      // handle a segfault in the middle of a vector load/store
+      if (p->get_log_commits_enabled()) {
+        for (auto item : p->get_state()->log_reg_write) {
+          if ((item.first & 3) == 3) {
+            commit_log_print_insn(p, pc, fetch.insn);
+            break;
+          }
+        }
+      }
+      throw;
+#endif
+  } catch(...) {
+    throw;
   }
+  p->update_histogram(pc);
+
   return npc;
 }
 
 bool processor_t::slow_path()
 {
-  return debug || state.single_step != state.STEP_NONE || state.dcsr.cause;
+  return debug || state.single_step != state.STEP_NONE || state.debug_mode;
 }
 
 // fetch/decode/execute loop
 void processor_t::step(size_t n)
 {
-  if (state.dcsr.cause == DCSR_CAUSE_NONE) {
-    if (halt_request) {
+  if (!state.debug_mode) {
+    if (halt_request == HR_REGULAR) {
       enter_debug_mode(DCSR_CAUSE_DEBUGINT);
+    } else if (halt_request == HR_GROUP) {
+      enter_debug_mode(DCSR_CAUSE_GROUP);
     } // !!!The halt bit in DCSR is deprecated.
     else if (state.dcsr.halt) {
       enter_debug_mode(DCSR_CAUSE_HALT);
@@ -130,7 +258,7 @@ void processor_t::step(size_t n)
         {
           if (unlikely(!state.serialized && state.single_step == state.STEP_STEPPED)) {
             state.single_step = state.STEP_NONE;
-            if (state.dcsr.cause == DCSR_CAUSE_NONE) {
+            if (!state.debug_mode) {
               enter_debug_mode(DCSR_CAUSE_STEP);
               // enter_debug_mode changed state.pc, so we can't just continue.
               break;
@@ -145,15 +273,8 @@ void processor_t::step(size_t n)
           if (debug && !state.serialized)
             disasm(fetch.insn);
           pc = execute_insn(this, pc, fetch);
-
+          pc = this->hwLoops.handle_loops(state.pc, pc, fetch.insn);
           advance_pc();
-
-          if (unlikely(state.pc >= DEBUG_ROM_ENTRY &&
-                       state.pc < DEBUG_END)) {
-            // We're waiting for the debugger to tell us something.
-            return;
-          }
-
         }
       }
       else while (instret < n)
@@ -185,9 +306,13 @@ void processor_t::step(size_t n)
         // This macro is included in "icache.h" included within the switch
         // statement below. The indirect jump corresponding to the instruction
         // is located within the execute_insn() function call.
+        
+        // Todo: Is it a good idea to add hwloops here or is forcing slow-path better?
+        // trade-off between speed of hwloops and speed of everything else
         #define ICACHE_ACCESS(i) { \
           insn_fetch_t fetch = ic_entry->data; \
           pc = execute_insn(this, pc, fetch); \
+          pc = this->hwLoops.handle_loops(state.pc, pc, fetch.insn);  \
           ic_entry = ic_entry->next; \
           if (i == mmu_t::ICACHE_ENTRIES-1) break; \
           if (unlikely(ic_entry->tag != pc)) break; \
@@ -236,7 +361,7 @@ void processor_t::step(size_t n)
           enter_debug_mode(DCSR_CAUSE_HWBP);
           break;
         case ACTION_DEBUG_EXCEPTION: {
-          mem_trap_t trap(CAUSE_BREAKPOINT, t.address);
+          insn_trap_t trap(CAUSE_BREAKPOINT, t.address);
           take_trap(trap, pc);
           break;
         }
@@ -244,6 +369,16 @@ void processor_t::step(size_t n)
           abort();
       }
     }
+    catch (wait_for_interrupt_t &t)
+    {
+      // Return to the outer simulation loop, which gives other devices/harts a
+      // chance to generate interrupts.
+      //
+      // In the debug ROM this prevents us from wasting time looping, but also
+      // allows us to switch to other threads only once per idle loop in case
+      // there is activity.
+      n = instret;
+    }
 
     state.minstret += instret;
     n -= instret;
diff --git a/riscv/extensions.cc b/riscv/extensions.cc
index d1690c4b8f..347dc5e915 100644
--- a/riscv/extensions.cc
+++ b/riscv/extensions.cc
@@ -21,14 +21,23 @@ std::function<extension_t*()> find_extension(const char* name)
   if (!extensions().count(name)) {
     // try to find extension xyz by loading libxyz.so
     std::string libname = std::string("lib") + name + ".so";
-    if (!dlopen(libname.c_str(), RTLD_LAZY)) {
-      fprintf(stderr, "couldn't find extension '%s' (or library '%s')\n",
-              name, libname.c_str());
-      exit(-1);
+    std::string libdefault = "libcustomext.so";  // fallback library tried when lib<name>.so cannot be loaded
+    bool is_default = false;  // records which library was loaded, for the error message below
+    auto dlh = dlopen(libname.c_str(), RTLD_LAZY);
+    if (!dlh) {
+      dlh = dlopen(libdefault.c_str(), RTLD_LAZY);
+      if (!dlh) {
+        fprintf(stderr, "couldn't find either shared library '%s' or '%s'\n",
+                libname.c_str(), libdefault.c_str());
+        exit(-1);
+      }
+
+      is_default = true;
     }
+
     if (!extensions().count(name)) {
       fprintf(stderr, "couldn't find extension '%s' in shared library '%s'\n",
-              name, libname.c_str());
+              name, is_default ? libdefault.c_str() : libname.c_str());
       exit(-1);
     }
   }
diff --git a/riscv/gen_icache b/riscv/gen_icache
index 7ec3c69434..67c0d69f1a 100755
--- a/riscv/gen_icache
+++ b/riscv/gen_icache
@@ -1,7 +1,8 @@
 #!/bin/sh
-n=$(($1-1))
-for i in `seq 0 $n`
+i=0  # case index; the loop emits one case line for each value from 0 up to $1 - 1
+while [ $i -lt $1 ]  # $1 is the number of case lines to emit
 do
   echo case $i: ICACHE_ACCESS\($i\)\;
+  i=$((i+1))  # shell arithmetic replaces the removed `seq` call
 done
 echo
diff --git a/riscv/insn_template.h b/riscv/insn_template.h
index 07aa16ba05..3c36d10e52 100644
--- a/riscv/insn_template.h
+++ b/riscv/insn_template.h
@@ -1,7 +1,7 @@
 // See LICENSE for license details.
 
+#include "arith.h"
 #include "mmu.h"
-#include "mulhi.h"
 #include "softfloat.h"
 #include "internals.h"
 #include "specialize.h"
diff --git a/riscv/insns/c_ebreak.h b/riscv/insns/c_ebreak.h
index 128b86b22c..1c36b2418b 100644
--- a/riscv/insns/c_ebreak.h
+++ b/riscv/insns/c_ebreak.h
@@ -1,2 +1,2 @@
 require_extension('C');
-throw trap_breakpoint(pc);
+throw trap_breakpoint(0);
diff --git a/riscv/insns/csrrc.h b/riscv/insns/csrrc.h
index 0472d80efd..37384b0e52 100644
--- a/riscv/insns/csrrc.h
+++ b/riscv/insns/csrrc.h
@@ -1,6 +1,6 @@
 bool write = insn.rs1() != 0;
 int csr = validate_csr(insn.csr(), write);
-reg_t old = p->get_csr(csr);
+reg_t old = p->get_csr(csr, insn, write);
 if (write) {
   p->set_csr(csr, old & ~RS1);
 }
diff --git a/riscv/insns/csrrci.h b/riscv/insns/csrrci.h
index 4d83cc0617..ad40c8f4c2 100644
--- a/riscv/insns/csrrci.h
+++ b/riscv/insns/csrrci.h
@@ -1,6 +1,6 @@
 bool write = insn.rs1() != 0;
 int csr = validate_csr(insn.csr(), write);
-reg_t old = p->get_csr(csr);
+reg_t old = p->get_csr(csr, insn, write);
 if (write) {
   p->set_csr(csr, old & ~(reg_t)insn.rs1());
 }
diff --git a/riscv/insns/csrrs.h b/riscv/insns/csrrs.h
index 4e8bde9637..91fcc7a347 100644
--- a/riscv/insns/csrrs.h
+++ b/riscv/insns/csrrs.h
@@ -1,6 +1,6 @@
 bool write = insn.rs1() != 0;
 int csr = validate_csr(insn.csr(), write);
-reg_t old = p->get_csr(csr);
+reg_t old = p->get_csr(csr, insn, write);
 if (write) {
   p->set_csr(csr, old | RS1);
 }
diff --git a/riscv/insns/csrrsi.h b/riscv/insns/csrrsi.h
index b673725b54..f348e570bd 100644
--- a/riscv/insns/csrrsi.h
+++ b/riscv/insns/csrrsi.h
@@ -1,6 +1,6 @@
 bool write = insn.rs1() != 0;
 int csr = validate_csr(insn.csr(), write);
-reg_t old = p->get_csr(csr);
+reg_t old = p->get_csr(csr, insn, write);
 if (write) {
   p->set_csr(csr, old | insn.rs1());
 }
diff --git a/riscv/insns/csrrw.h b/riscv/insns/csrrw.h
index e45420b570..cc0c28dc95 100644
--- a/riscv/insns/csrrw.h
+++ b/riscv/insns/csrrw.h
@@ -1,5 +1,5 @@
 int csr = validate_csr(insn.csr(), true);
-reg_t old = p->get_csr(csr);
+reg_t old = p->get_csr(csr, insn, true);
 p->set_csr(csr, RS1);
 WRITE_RD(sext_xlen(old));
 serialize();
diff --git a/riscv/insns/csrrwi.h b/riscv/insns/csrrwi.h
index decadf4121..4d5d06468b 100644
--- a/riscv/insns/csrrwi.h
+++ b/riscv/insns/csrrwi.h
@@ -1,5 +1,5 @@
 int csr = validate_csr(insn.csr(), true);
-reg_t old = p->get_csr(csr);
+reg_t old = p->get_csr(csr, insn, true);
 p->set_csr(csr, insn.rs1());
 WRITE_RD(sext_xlen(old));
 serialize();
diff --git a/riscv/insns/dret.h b/riscv/insns/dret.h
index 35c19cb8a2..ba503a0c7f 100644
--- a/riscv/insns/dret.h
+++ b/riscv/insns/dret.h
@@ -1,9 +1,9 @@
-require_privilege(PRV_M);
+require(STATE.debug_mode);
 set_pc_and_serialize(STATE.dpc);
 p->set_privilege(STATE.dcsr.prv);
 
 /* We're not in Debug Mode anymore. */
-STATE.dcsr.cause = 0;
+STATE.debug_mode = false;
 
 if (STATE.dcsr.step)
   STATE.single_step = STATE.STEP_STEPPING;
diff --git a/riscv/insns/ebreak.h b/riscv/insns/ebreak.h
index 736cebef4b..f123f9544d 100644
--- a/riscv/insns/ebreak.h
+++ b/riscv/insns/ebreak.h
@@ -1 +1 @@
-throw trap_breakpoint(pc);
+throw trap_breakpoint(0);
diff --git a/riscv/insns/ecall.h b/riscv/insns/ecall.h
index e298ac722b..e6c723f4e3 100644
--- a/riscv/insns/ecall.h
+++ b/riscv/insns/ecall.h
@@ -1,7 +1,11 @@
 switch (STATE.prv)
 {
   case PRV_U: throw trap_user_ecall();
-  case PRV_S: throw trap_supervisor_ecall();
+  case PRV_S:
+    if (STATE.v)
+      throw trap_virtual_supervisor_ecall();
+    else
+      throw trap_supervisor_ecall();
   case PRV_M: throw trap_machine_ecall();
   default: abort();
 }
diff --git a/riscv/insns/fadd_h.h b/riscv/insns/fadd_h.h
new file mode 100644
index 0000000000..2b646ae77b
--- /dev/null
+++ b/riscv/insns/fadd_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_add(f16(FRS1), f16(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/fclass_h.h b/riscv/insns/fclass_h.h
new file mode 100644
index 0000000000..066a2d24d6
--- /dev/null
+++ b/riscv/insns/fclass_h.h
@@ -0,0 +1,3 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_RD(f16_classify(f16(FRS1)));
diff --git a/riscv/insns/fcvt_d_h.h b/riscv/insns/fcvt_d_h.h
new file mode 100644
index 0000000000..6906fc06c1
--- /dev/null
+++ b/riscv/insns/fcvt_d_h.h
@@ -0,0 +1,6 @@
+require_extension(EXT_ZFH);
+require_extension('D');
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_to_f64(f16(FRS1)));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_h_d.h b/riscv/insns/fcvt_h_d.h
new file mode 100644
index 0000000000..f463dd58e6
--- /dev/null
+++ b/riscv/insns/fcvt_h_d.h
@@ -0,0 +1,6 @@
+require_extension(EXT_ZFH);
+require_extension('D');
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f64_to_f16(f64(FRS1)));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_h_l.h b/riscv/insns/fcvt_h_l.h
new file mode 100644
index 0000000000..39178c2fd3
--- /dev/null
+++ b/riscv/insns/fcvt_h_l.h
@@ -0,0 +1,6 @@
+require_extension(EXT_ZFH);
+require_rv64;
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(i64_to_f16(RS1));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_h_lu.h b/riscv/insns/fcvt_h_lu.h
new file mode 100644
index 0000000000..a872c48091
--- /dev/null
+++ b/riscv/insns/fcvt_h_lu.h
@@ -0,0 +1,6 @@
+require_extension(EXT_ZFH);
+require_rv64;
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(ui64_to_f16(RS1));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_h_q.h b/riscv/insns/fcvt_h_q.h
new file mode 100644
index 0000000000..94b0001635
--- /dev/null
+++ b/riscv/insns/fcvt_h_q.h
@@ -0,0 +1,6 @@
+require_extension(EXT_ZFH);
+require_extension('Q');
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f128_to_f16(f128(FRS1)));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_h_s.h b/riscv/insns/fcvt_h_s.h
new file mode 100644
index 0000000000..eb928e9785
--- /dev/null
+++ b/riscv/insns/fcvt_h_s.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f32_to_f16(f32(FRS1)));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_h_w.h b/riscv/insns/fcvt_h_w.h
new file mode 100644
index 0000000000..c08245451f
--- /dev/null
+++ b/riscv/insns/fcvt_h_w.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(i32_to_f16((int32_t)RS1));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_h_wu.h b/riscv/insns/fcvt_h_wu.h
new file mode 100644
index 0000000000..9f2f5f6a7c
--- /dev/null
+++ b/riscv/insns/fcvt_h_wu.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(ui32_to_f16((uint32_t)RS1));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_l_h.h b/riscv/insns/fcvt_l_h.h
new file mode 100644
index 0000000000..5a1fea850d
--- /dev/null
+++ b/riscv/insns/fcvt_l_h.h
@@ -0,0 +1,6 @@
+require_extension(EXT_ZFH);
+require_rv64;
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_RD(f16_to_i64(f16(FRS1), RM, true));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_lu_h.h b/riscv/insns/fcvt_lu_h.h
new file mode 100644
index 0000000000..f1454c3e99
--- /dev/null
+++ b/riscv/insns/fcvt_lu_h.h
@@ -0,0 +1,6 @@
+require_extension(EXT_ZFH);
+require_rv64;
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_RD(f16_to_ui64(f16(FRS1), RM, true));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_q_h.h b/riscv/insns/fcvt_q_h.h
new file mode 100644
index 0000000000..8a5f6805c5
--- /dev/null
+++ b/riscv/insns/fcvt_q_h.h
@@ -0,0 +1,6 @@
+require_extension(EXT_ZFH);
+require_extension('Q');
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_to_f128(f16(FRS1)));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_s_h.h b/riscv/insns/fcvt_s_h.h
new file mode 100644
index 0000000000..bfa2e91497
--- /dev/null
+++ b/riscv/insns/fcvt_s_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_to_f32(f16(FRS1)));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_w_h.h b/riscv/insns/fcvt_w_h.h
new file mode 100644
index 0000000000..fe8bb48fb2
--- /dev/null
+++ b/riscv/insns/fcvt_w_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_RD(sext32(f16_to_i32(f16(FRS1), RM, true)));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_wu_h.h b/riscv/insns/fcvt_wu_h.h
new file mode 100644
index 0000000000..bf6648d3c6
--- /dev/null
+++ b/riscv/insns/fcvt_wu_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_RD(sext32(f16_to_ui32(f16(FRS1), RM, true)));
+set_fp_exceptions;
diff --git a/riscv/insns/fdiv_h.h b/riscv/insns/fdiv_h.h
new file mode 100644
index 0000000000..a169eae83a
--- /dev/null
+++ b/riscv/insns/fdiv_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_div(f16(FRS1), f16(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/feq_h.h b/riscv/insns/feq_h.h
new file mode 100644
index 0000000000..47e75a5b92
--- /dev/null
+++ b/riscv/insns/feq_h.h
@@ -0,0 +1,4 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_RD(f16_eq(f16(FRS1), f16(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/fle_h.h b/riscv/insns/fle_h.h
new file mode 100644
index 0000000000..9fc5968532
--- /dev/null
+++ b/riscv/insns/fle_h.h
@@ -0,0 +1,4 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_RD(f16_le(f16(FRS1), f16(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/flh.h b/riscv/insns/flh.h
new file mode 100644
index 0000000000..c887999398
--- /dev/null
+++ b/riscv/insns/flh.h
@@ -0,0 +1,3 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_FRD(f16(MMU.load_uint16(RS1 + insn.i_imm())));
diff --git a/riscv/insns/flt_h.h b/riscv/insns/flt_h.h
new file mode 100644
index 0000000000..f516a38a62
--- /dev/null
+++ b/riscv/insns/flt_h.h
@@ -0,0 +1,4 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_RD(f16_lt(f16(FRS1), f16(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/fmadd_h.h b/riscv/insns/fmadd_h.h
new file mode 100644
index 0000000000..6551de5e30
--- /dev/null
+++ b/riscv/insns/fmadd_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_mulAdd(f16(FRS1), f16(FRS2), f16(FRS3)));
+set_fp_exceptions;
diff --git a/riscv/insns/fmax_h.h b/riscv/insns/fmax_h.h
new file mode 100644
index 0000000000..3d4c40ebf9
--- /dev/null
+++ b/riscv/insns/fmax_h.h
@@ -0,0 +1,4 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_FRD(f16_max(f16(FRS1), f16(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/fmin_h.h b/riscv/insns/fmin_h.h
new file mode 100644
index 0000000000..5fb1404fe0
--- /dev/null
+++ b/riscv/insns/fmin_h.h
@@ -0,0 +1,4 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_FRD(f16_min(f16(FRS1), f16(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/fmsub_h.h b/riscv/insns/fmsub_h.h
new file mode 100644
index 0000000000..934291fc81
--- /dev/null
+++ b/riscv/insns/fmsub_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_mulAdd(f16(FRS1), f16(FRS2), f16(f16(FRS3).v ^ F16_SIGN)));
+set_fp_exceptions;
diff --git a/riscv/insns/fmul_h.h b/riscv/insns/fmul_h.h
new file mode 100644
index 0000000000..0152df8f09
--- /dev/null
+++ b/riscv/insns/fmul_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_mul(f16(FRS1), f16(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/fmv_h_x.h b/riscv/insns/fmv_h_x.h
new file mode 100644
index 0000000000..c022508e24
--- /dev/null
+++ b/riscv/insns/fmv_h_x.h
@@ -0,0 +1,3 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_FRD(f16(RS1));
diff --git a/riscv/insns/fmv_x_h.h b/riscv/insns/fmv_x_h.h
new file mode 100644
index 0000000000..5e89c4f0f0
--- /dev/null
+++ b/riscv/insns/fmv_x_h.h
@@ -0,0 +1,3 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_RD(sext32((int16_t)(FRS1.v[0])));
diff --git a/riscv/insns/fnmadd_h.h b/riscv/insns/fnmadd_h.h
new file mode 100644
index 0000000000..e4c619e77a
--- /dev/null
+++ b/riscv/insns/fnmadd_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_mulAdd(f16(f16(FRS1).v ^ F16_SIGN), f16(FRS2), f16(f16(FRS3).v ^ F16_SIGN)));
+set_fp_exceptions;
diff --git a/riscv/insns/fnmsub_h.h b/riscv/insns/fnmsub_h.h
new file mode 100644
index 0000000000..0410c3bba6
--- /dev/null
+++ b/riscv/insns/fnmsub_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_mulAdd(f16(f16(FRS1).v ^ F16_SIGN), f16(FRS2), f16(FRS3)));
+set_fp_exceptions;
diff --git a/riscv/insns/fsgnj_h.h b/riscv/insns/fsgnj_h.h
new file mode 100644
index 0000000000..79d50f5fa3
--- /dev/null
+++ b/riscv/insns/fsgnj_h.h
@@ -0,0 +1,3 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_FRD(fsgnj16(FRS1, FRS2, false, false));
diff --git a/riscv/insns/fsgnjn_h.h b/riscv/insns/fsgnjn_h.h
new file mode 100644
index 0000000000..ebb4ac9f50
--- /dev/null
+++ b/riscv/insns/fsgnjn_h.h
@@ -0,0 +1,3 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_FRD(fsgnj16(FRS1, FRS2, true, false));
diff --git a/riscv/insns/fsgnjx_h.h b/riscv/insns/fsgnjx_h.h
new file mode 100644
index 0000000000..9310269545
--- /dev/null
+++ b/riscv/insns/fsgnjx_h.h
@@ -0,0 +1,3 @@
+require_extension(EXT_ZFH);
+require_fp;
+WRITE_FRD(fsgnj16(FRS1, FRS2, false, true));
diff --git a/riscv/insns/fsh.h b/riscv/insns/fsh.h
new file mode 100644
index 0000000000..b9fa4e0557
--- /dev/null
+++ b/riscv/insns/fsh.h
@@ -0,0 +1,3 @@
+require_extension(EXT_ZFH);
+require_fp;
+MMU.store_uint16(RS1 + insn.s_imm(), FRS2.v[0]);
diff --git a/riscv/insns/fsqrt_h.h b/riscv/insns/fsqrt_h.h
new file mode 100644
index 0000000000..138d572744
--- /dev/null
+++ b/riscv/insns/fsqrt_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_sqrt(f16(FRS1)));
+set_fp_exceptions;
diff --git a/riscv/insns/fsub_h.h b/riscv/insns/fsub_h.h
new file mode 100644
index 0000000000..43b51cc2eb
--- /dev/null
+++ b/riscv/insns/fsub_h.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFH);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD(f16_sub(f16(FRS1), f16(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/hfence_gvma.h b/riscv/insns/hfence_gvma.h
new file mode 100644
index 0000000000..f1996d921f
--- /dev/null
+++ b/riscv/insns/hfence_gvma.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.mstatus, MSTATUS_TVM) ? PRV_M : PRV_S);
+MMU.flush_tlb();
diff --git a/riscv/insns/hfence_vvma.h b/riscv/insns/hfence_vvma.h
new file mode 100644
index 0000000000..ecd42c198c
--- /dev/null
+++ b/riscv/insns/hfence_vvma.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(PRV_S);
+MMU.flush_tlb();
diff --git a/riscv/insns/hlv_b.h b/riscv/insns/hlv_b.h
new file mode 100644
index 0000000000..86192c6328
--- /dev/null
+++ b/riscv/insns/hlv_b.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+WRITE_RD(MMU.guest_load_int8(RS1));
diff --git a/riscv/insns/hlv_bu.h b/riscv/insns/hlv_bu.h
new file mode 100644
index 0000000000..2f951947d0
--- /dev/null
+++ b/riscv/insns/hlv_bu.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+WRITE_RD(MMU.guest_load_uint8(RS1));
diff --git a/riscv/insns/hlv_d.h b/riscv/insns/hlv_d.h
new file mode 100644
index 0000000000..1bbd0277b4
--- /dev/null
+++ b/riscv/insns/hlv_d.h
@@ -0,0 +1,5 @@
+require_extension('H');
+require_rv64;
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+WRITE_RD(MMU.guest_load_int64(RS1));
diff --git a/riscv/insns/hlv_h.h b/riscv/insns/hlv_h.h
new file mode 100644
index 0000000000..6825fe46bd
--- /dev/null
+++ b/riscv/insns/hlv_h.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+WRITE_RD(MMU.guest_load_int16(RS1));
diff --git a/riscv/insns/hlv_hu.h b/riscv/insns/hlv_hu.h
new file mode 100644
index 0000000000..3d9d98e2f4
--- /dev/null
+++ b/riscv/insns/hlv_hu.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+WRITE_RD(MMU.guest_load_uint16(RS1));
diff --git a/riscv/insns/hlv_w.h b/riscv/insns/hlv_w.h
new file mode 100644
index 0000000000..be420d37c5
--- /dev/null
+++ b/riscv/insns/hlv_w.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+WRITE_RD(MMU.guest_load_int32(RS1));
diff --git a/riscv/insns/hlv_wu.h b/riscv/insns/hlv_wu.h
new file mode 100644
index 0000000000..851be27c6a
--- /dev/null
+++ b/riscv/insns/hlv_wu.h
@@ -0,0 +1,5 @@
+require_extension('H');
+require_rv64;
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+WRITE_RD(MMU.guest_load_uint32(RS1));
diff --git a/riscv/insns/hlvx_hu.h b/riscv/insns/hlvx_hu.h
new file mode 100644
index 0000000000..19dbcfbf63
--- /dev/null
+++ b/riscv/insns/hlvx_hu.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+WRITE_RD(MMU.guest_load_x_uint16(RS1));
diff --git a/riscv/insns/hlvx_wu.h b/riscv/insns/hlvx_wu.h
new file mode 100644
index 0000000000..4dfe702350
--- /dev/null
+++ b/riscv/insns/hlvx_wu.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+WRITE_RD(MMU.guest_load_x_uint32(RS1));
diff --git a/riscv/insns/hsv_b.h b/riscv/insns/hsv_b.h
new file mode 100644
index 0000000000..a5c34ff072
--- /dev/null
+++ b/riscv/insns/hsv_b.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+MMU.guest_store_uint8(RS1, RS2);
diff --git a/riscv/insns/hsv_d.h b/riscv/insns/hsv_d.h
new file mode 100644
index 0000000000..14c6d5d913
--- /dev/null
+++ b/riscv/insns/hsv_d.h
@@ -0,0 +1,5 @@
+require_extension('H');
+require_rv64;
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+MMU.guest_store_uint64(RS1, RS2);
diff --git a/riscv/insns/hsv_h.h b/riscv/insns/hsv_h.h
new file mode 100644
index 0000000000..1cfe77aae8
--- /dev/null
+++ b/riscv/insns/hsv_h.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+MMU.guest_store_uint16(RS1, RS2);
diff --git a/riscv/insns/hsv_w.h b/riscv/insns/hsv_w.h
new file mode 100644
index 0000000000..d54f6731ee
--- /dev/null
+++ b/riscv/insns/hsv_w.h
@@ -0,0 +1,4 @@
+require_extension('H');
+require_novirt();
+require_privilege(get_field(STATE.hstatus, HSTATUS_HU) ? PRV_U : PRV_S);
+MMU.guest_store_uint32(RS1, RS2);
diff --git a/riscv/insns/lp_count.h b/riscv/insns/lp_count.h
new file mode 100644
index 0000000000..99c8099882
--- /dev/null
+++ b/riscv/insns/lp_count.h
@@ -0,0 +1,4 @@
+reg_t num_iter = zext_xlen(RS1);
+bool loopNr = insn.p_loop();
+
+p->hwLoops.set_count(loopNr, num_iter);
diff --git a/riscv/insns/lp_counti.h b/riscv/insns/lp_counti.h
new file mode 100644
index 0000000000..6c78ceaa89
--- /dev/null
+++ b/riscv/insns/lp_counti.h
@@ -0,0 +1,4 @@
+reg_t num_iter = zext_xlen(insn.p_uimmL());
+bool loopNr = insn.p_loop();
+
+p->hwLoops.set_count(loopNr, num_iter);
diff --git a/riscv/insns/lp_endi.h b/riscv/insns/lp_endi.h
new file mode 100644
index 0000000000..74d155f95d
--- /dev/null
+++ b/riscv/insns/lp_endi.h
@@ -0,0 +1,6 @@
+reg_t offset = zext_xlen(insn.p_uimmL() << 1);
+bool loopNr = insn.p_loop();
+
+reg_t end_addr = pc + offset;
+
+p->hwLoops.set_end(loopNr, end_addr);
diff --git a/riscv/insns/lp_setup.h b/riscv/insns/lp_setup.h
new file mode 100644
index 0000000000..2c29a1f4ca
--- /dev/null
+++ b/riscv/insns/lp_setup.h
@@ -0,0 +1,10 @@
+// lp_setup: configure end, start and iteration count of the selected hardware loop in one go.
+const bool loop_idx = insn.p_loop();
+const reg_t iterations = zext_xlen(RS1);
+const reg_t distance = zext_xlen(insn.p_uimmL() << 1);
+
+// The loop starts at the next pc (pc+4) and ends at pc + distance.
+// The setters are called in the order end, start, count.
+p->hwLoops.set_end(loop_idx, pc + distance);
+p->hwLoops.set_start(loop_idx, npc);
+p->hwLoops.set_count(loop_idx, iterations);
diff --git a/riscv/insns/lp_setupi.h b/riscv/insns/lp_setupi.h
new file mode 100644
index 0000000000..f824b7bcf4
--- /dev/null
+++ b/riscv/insns/lp_setupi.h
@@ -0,0 +1,10 @@
+// lp_setupi: like a full loop setup, but the count comes from uimmL and the end offset from uimmS.
+const bool loop_idx = insn.p_loop();
+const reg_t iterations = zext_xlen(insn.p_uimmL());
+const reg_t distance = zext_xlen(insn.p_uimmS() << 1);
+
+// The loop starts at the next pc (pc+4) and ends at pc + distance.
+// The setters are called in the order end, start, count.
+p->hwLoops.set_end(loop_idx, pc + distance);
+p->hwLoops.set_start(loop_idx, npc);
+p->hwLoops.set_count(loop_idx, iterations);
diff --git a/riscv/insns/lp_starti.h b/riscv/insns/lp_starti.h
new file mode 100644
index 0000000000..09b3acd96f
--- /dev/null
+++ b/riscv/insns/lp_starti.h
@@ -0,0 +1,6 @@
+// lp_starti: set the start address of the selected hardware loop relative to this instruction.
+const bool loop_idx = insn.p_loop();
+
+// The uimmL immediate is doubled before it is added to the current pc.
+const reg_t distance = zext_xlen(insn.p_uimmL() << 1);
+p->hwLoops.set_start(loop_idx, pc + distance);
diff --git a/riscv/insns/lr_d.h b/riscv/insns/lr_d.h
index 52090c31b8..3f3521be54 100644
--- a/riscv/insns/lr_d.h
+++ b/riscv/insns/lr_d.h
@@ -1,4 +1,5 @@
 require_extension('A');
 require_rv64;
+auto res = MMU.load_int64(RS1);  // the load is now issued before the reservation is acquired
 MMU.acquire_load_reservation(RS1);
-WRITE_RD(MMU.load_int64(RS1));
+WRITE_RD(res);  // rd receives the value loaded above
diff --git a/riscv/insns/lr_w.h b/riscv/insns/lr_w.h
index c5845a68e1..8605cc5df9 100644
--- a/riscv/insns/lr_w.h
+++ b/riscv/insns/lr_w.h
@@ -1,3 +1,4 @@
 require_extension('A');
+auto res = MMU.load_int32(RS1);  // the load is now issued before the reservation is acquired
 MMU.acquire_load_reservation(RS1);
-WRITE_RD(MMU.load_int32(RS1));
+WRITE_RD(res);  // rd receives the value loaded above
diff --git a/riscv/insns/mret.h b/riscv/insns/mret.h
index 96933cf672..cedfc72840 100644
--- a/riscv/insns/mret.h
+++ b/riscv/insns/mret.h
@@ -2,8 +2,10 @@ require_privilege(PRV_M);
 set_pc_and_serialize(p->get_state()->mepc);
 reg_t s = STATE.mstatus;
 reg_t prev_prv = get_field(s, MSTATUS_MPP);
+reg_t prev_virt = get_field(s, MSTATUS_MPV);  // MPV field of mstatus, captured before s is modified
 s = set_field(s, MSTATUS_MIE, get_field(s, MSTATUS_MPIE));
 s = set_field(s, MSTATUS_MPIE, 1);
 s = set_field(s, MSTATUS_MPP, PRV_U);
-p->set_privilege(prev_prv);
 p->set_csr(CSR_MSTATUS, s);
+p->set_privilege(prev_prv);  // the privilege switch now happens after the mstatus write
+p->set_virt(prev_virt);      // then the saved MPV value is applied through set_virt
diff --git a/riscv/insns/p_abs.h b/riscv/insns/p_abs.h
new file mode 100644
index 0000000000..409446b218
--- /dev/null
+++ b/riscv/insns/p_abs.h
@@ -0,0 +1,4 @@
+// p_abs: rd = |rs1|, with rs1 interpreted as a signed value.
+// The negated value goes through sext_xlen (as the arithmetic p_* snippets do for their results),
+// so negating the most negative XLEN-bit input still leaves a sign-extended value in rd.
+WRITE_RD(sreg_t(RS1) > 0 ? RS1 : sext_xlen(-RS1));
diff --git a/riscv/insns/p_addN.h b/riscv/insns/p_addN.h
new file mode 100644
index 0000000000..303363ee9c
--- /dev/null
+++ b/riscv/insns/p_addN.h
@@ -0,0 +1,7 @@
+// p_addN: rd = (rs1 + rs2) >> Luimm5, computed on sign-extended operands.
+const int shamt = insn.p_Luimm5();
+const sreg_t lhs = sext_xlen(RS1);
+const sreg_t rhs = sext_xlen(RS2);
+
+const sreg_t sum = lhs + rhs;
+WRITE_RD(sext_xlen(sum >> shamt));
diff --git a/riscv/insns/p_addNr.h b/riscv/insns/p_addNr.h
new file mode 100644
index 0000000000..22018dd3d0
--- /dev/null
+++ b/riscv/insns/p_addNr.h
@@ -0,0 +1,7 @@
+// p_addNr: rd = (rd + rs1) >> rs2[4:0], computed on sign-extended operands; rd is also a source.
+const int shamt = RS2 & 0x1F;
+const sreg_t lhs = sext_xlen(RD);
+const sreg_t rhs = sext_xlen(RS1);
+
+const sreg_t sum = lhs + rhs;
+WRITE_RD(sext_xlen(sum >> shamt));
diff --git a/riscv/insns/p_addRN.h b/riscv/insns/p_addRN.h
new file mode 100644
index 0000000000..7fe4082288
--- /dev/null
+++ b/riscv/insns/p_addRN.h
@@ -0,0 +1,8 @@
+// p_addRN: rd = (rs1 + rs2 + half_step) >> Luimm5 on sign-extended operands (rounding variant).
+const int shamt = insn.p_Luimm5();
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const sreg_t lhs = sext_xlen(RS1);
+const sreg_t rhs = sext_xlen(RS2);
+
+const sreg_t rounded_sum = (lhs + rhs) + half_step;
+WRITE_RD(sext_xlen(rounded_sum >> shamt));
diff --git a/riscv/insns/p_addRNr.h b/riscv/insns/p_addRNr.h
new file mode 100644
index 0000000000..a41ec1569f
--- /dev/null
+++ b/riscv/insns/p_addRNr.h
@@ -0,0 +1,8 @@
+// p_addRNr: rd = (rd + rs1 + half_step) >> rs2[4:0] on sign-extended operands (rounding variant).
+const int shamt = RS2 & 0x1F;
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const sreg_t lhs = sext_xlen(RD);
+const sreg_t rhs = sext_xlen(RS1);
+
+const sreg_t rounded_sum = (lhs + rhs) + half_step;
+WRITE_RD(sext_xlen(rounded_sum >> shamt));
diff --git a/riscv/insns/p_adduN.h b/riscv/insns/p_adduN.h
new file mode 100644
index 0000000000..d1844e2ebf
--- /dev/null
+++ b/riscv/insns/p_adduN.h
@@ -0,0 +1,9 @@
+// p_adduN: rd = (rs1 + rs2) >> Luimm5, computed on zero-extended operands.
+const int shamt = insn.p_Luimm5();
+const reg_t lhs = zext_xlen(RS1);
+const reg_t rhs = zext_xlen(RS2);
+const reg_t sum = lhs + rhs;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(sum >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_adduNr.h b/riscv/insns/p_adduNr.h
new file mode 100644
index 0000000000..e0205d2571
--- /dev/null
+++ b/riscv/insns/p_adduNr.h
@@ -0,0 +1,9 @@
+// p_adduNr: rd = (rd + rs1) >> rs2[4:0], computed on zero-extended operands; rd is also a source.
+const int shamt = RS2 & 0x1F;
+const reg_t lhs = zext_xlen(RD);
+const reg_t rhs = zext_xlen(RS1);
+const reg_t sum = lhs + rhs;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(sum >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_adduRN.h b/riscv/insns/p_adduRN.h
new file mode 100644
index 0000000000..53734b3c98
--- /dev/null
+++ b/riscv/insns/p_adduRN.h
@@ -0,0 +1,10 @@
+// p_adduRN: rd = (rs1 + rs2 + half_step) >> Luimm5 on zero-extended operands (rounding variant).
+const int shamt = insn.p_Luimm5();
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const reg_t lhs = zext_xlen(RS1);
+const reg_t rhs = zext_xlen(RS2);
+const reg_t rounded_sum = (lhs + rhs) + half_step;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(rounded_sum >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_adduRNr.h b/riscv/insns/p_adduRNr.h
new file mode 100644
index 0000000000..7ab19b3cd3
--- /dev/null
+++ b/riscv/insns/p_adduRNr.h
@@ -0,0 +1,10 @@
+// p_adduRNr: rd = (rd + rs1 + half_step) >> rs2[4:0] on zero-extended operands (rounding variant).
+const int shamt = RS2 & 0x1F;
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const reg_t lhs = zext_xlen(RD);
+const reg_t rhs = zext_xlen(RS1);
+const reg_t rounded_sum = (lhs + rhs) + half_step;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(rounded_sum >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_bclr.h b/riscv/insns/p_bclr.h
new file mode 100644
index 0000000000..b8244e6fc7
--- /dev/null
+++ b/riscv/insns/p_bclr.h
@@ -0,0 +1,8 @@
+// p_bclr: clear the (Luimm5 + 1)-bit wide field of rs1 that starts at bit zimm5.
+const reg_t lsb = insn.p_zimm5();
+const reg_t top = insn.p_Luimm5();
+const sreg_t source = sext_xlen(RS1);
+
+// (top + 1) one-bits, moved up to the field position.
+const reg_t field = ((((reg_t)1 << top) << 1) - 1) << lsb;
+WRITE_RD(sext_xlen(~field & source));
\ No newline at end of file
diff --git a/riscv/insns/p_bclrr.h b/riscv/insns/p_bclrr.h
new file mode 100644
index 0000000000..2a7dff3d95
--- /dev/null
+++ b/riscv/insns/p_bclrr.h
@@ -0,0 +1,8 @@
+sreg_t val = sext_xlen(RS1);
+reg_t first = zextr(RS2, 4, 0); // rs2[4:0]: lowest bit of the field to clear
+reg_t upto = zextr(RS2, 9, 5);  // rs2[9:5]: field width minus one
+// Mask of (upto + 1) one-bits positioned at bit 'first'; those bits are cleared in val.
+reg_t clr_mask = ( (((reg_t)1<<upto)<<1) -1 ) << first;
+reg_t res = ~clr_mask & val;
+
+WRITE_RD(sext_xlen(res));
\ No newline at end of file
diff --git a/riscv/insns/p_beqimm.h b/riscv/insns/p_beqimm.h
new file mode 100644
index 0000000000..5d63493212
--- /dev/null
+++ b/riscv/insns/p_beqimm.h
@@ -0,0 +1,2 @@
+if(sreg_t(RS1) == insn.p_simm5())  // rs1 (read as signed) equals the simm5 immediate
+  set_pc(BRANCH_TARGET);           // then the branch is taken
diff --git a/riscv/insns/p_bitrev.h b/riscv/insns/p_bitrev.h
new file mode 100644
index 0000000000..7eae6c07ee
--- /dev/null
+++ b/riscv/insns/p_bitrev.h
@@ -0,0 +1,43 @@
+// p_bitrev: builds rd from (rs1 << zimm5); the two low bits go through the switch below,
+// the next 30 bits are processed as five 6-bit blocks, emitted in reverse block order.
+reg_t val = zext_xlen(RS1);
+reg_t shift = insn.p_zimm5();
+reg_t group = insn.p_Luimm5();
+reg_t res;
+// mult_masks/and_masks only have entries for group 0..2, but group comes from the Luimm5
+// immediate, so reject larger values rather than index past the end of the tables.
+require(group < 3);
+// using method from:
+// https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv
+// with 6bits per block, padded to 8bit spacing
+// groups of 2 and 3 bits could be done faster (12bit blocks)
+const reg_t mult_masks[3] = {0x208208, 0x1001, 0x2008};
+const reg_t and_masks[3] = {0x2240910, 0x3300C, 0x70038};
+
+val <<= shift;
+// handle two LSB bits
+switch(group)
+{
+    case 0: // res[1:0] = val[0:1]
+        res = (val>>1)%2;
+        res |= (val<<1)%4;
+        break;
+    case 1: // res[1:0] = val[1:0]
+        res = val%4;
+        break;
+    default: // group == 2: the two low bits contribute nothing
+        res = 0;
+        break;
+}
+val >>=2; // remove LSBs from val (prep first block)
+
+reg_t temp;
+for(int i=0; i<5; i++)
+{
+    res <<=6; // shift finished part to safety
+    temp = (val & 0x3F) * mult_masks[group]; // create copies
+    temp = temp & and_masks[group]; // select bits from copies
+    res |= (temp % 255); // collapse selected bits together
+    val >>=6; // prep next block
+}
+WRITE_RD(sext_xlen(res));
diff --git a/riscv/insns/p_bneimm.h b/riscv/insns/p_bneimm.h
new file mode 100644
index 0000000000..ef565ccfd4
--- /dev/null
+++ b/riscv/insns/p_bneimm.h
@@ -0,0 +1,2 @@
+if(sreg_t(RS1) != insn.p_simm5())  // rs1 (read as signed) differs from the simm5 immediate
+  set_pc(BRANCH_TARGET);           // then the branch is taken
diff --git a/riscv/insns/p_bset.h b/riscv/insns/p_bset.h
new file mode 100644
index 0000000000..d7fec87de5
--- /dev/null
+++ b/riscv/insns/p_bset.h
@@ -0,0 +1,8 @@
+// p_bset: set the (Luimm5 + 1)-bit wide field of rs1 that starts at bit zimm5.
+const reg_t lsb = insn.p_zimm5();
+const reg_t top = insn.p_Luimm5();
+const sreg_t source = sext_xlen(RS1);
+
+// (top + 1) one-bits, moved up to the field position.
+const reg_t field = ((((reg_t)1 << top) << 1) - 1) << lsb;
+WRITE_RD(sext_xlen(field | source));
\ No newline at end of file
diff --git a/riscv/insns/p_bsetr.h b/riscv/insns/p_bsetr.h
new file mode 100644
index 0000000000..4c20eafe18
--- /dev/null
+++ b/riscv/insns/p_bsetr.h
@@ -0,0 +1,8 @@
+sreg_t val = sext_xlen(RS1);
+reg_t first = zextr(RS2, 4, 0); // rs2[4:0]: lowest bit of the field to set
+reg_t upto = zextr(RS2, 9, 5);  // rs2[9:5]: field width minus one
+// Mask of (upto + 1) one-bits positioned at bit 'first'; those bits are set in val.
+reg_t set_mask = ( (((reg_t)1<<upto)<<1) -1 ) << first;
+reg_t res = set_mask | val;
+
+WRITE_RD(sext_xlen(res));
\ No newline at end of file
diff --git a/riscv/insns/p_clb.h b/riscv/insns/p_clb.h
new file mode 100644
index 0000000000..b124f9f2dc
--- /dev/null
+++ b/riscv/insns/p_clb.h
@@ -0,0 +1,37 @@
+reg_t val = sext_xlen(RS1);;
+reg_t cnt = 0;
+// p_clb: rd = number of leading bits of rs1 that equal its top bit; rd = 0 when rs1 is 0.
+#ifdef __GNUC__
+  if(val) {
+    // GCC: "Returns the number of leading redundant sign bits"; the +1 also counts the sign bit
+    cnt = __builtin_clrsb(val) +1;
+  } else {
+    cnt = 0; // rs1 = 0 -> rd = 0, not 32
+  }
+#else
+  if(val != 0x00) // rs1 = 0 -> rd = 0, not 32
+  {
+    if(val >= ((reg_t)1 << 31)) {
+      // turn leading 1s into leading 0s
+      val = ~val;
+    }
+    val <<= 1; // to distinguish -1 from -2
+
+    // modified log2() from Stanford's bit hacks (find highest '1')
+    const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+    const unsigned int S[] = {1, 2, 4, 8, 16};
+    for (int i = 4; i >= 0; i--)
+    {
+      if (val & b[i])
+      {
+        val >>= S[i];
+        cnt |= S[i];
+      } 
+    }
+
+    cnt = 32 - cnt; // pos of MSB+1 to #leading bits
+  }
+
+#endif
+
+WRITE_RD(cnt);
diff --git a/riscv/insns/p_clip.h b/riscv/insns/p_clip.h
new file mode 100644
index 0000000000..4cc255f00d
--- /dev/null
+++ b/riscv/insns/p_clip.h
@@ -0,0 +1,9 @@
+// p_clip: saturate signed rs1 to [-2^(zimm5-1), 2^(zimm5-1) - 1]; zimm5 == 0 selects [-1, 0].
+const auto width = insn.p_zimm5();
+const sreg_t value = sreg_t(RS1);
+const sreg_t floor_val = width ? -(1 << (width - 1)) : -1;
+const sreg_t ceil_val = width ? ((1 << (width - 1)) - 1) : 0;
+
+// Lower bound first, then upper bound; anything in between passes through unchanged.
+const sreg_t clipped = (value <= floor_val) ? floor_val : (value >= ceil_val) ? ceil_val : value;
+WRITE_RD(clipped);
diff --git a/riscv/insns/p_clipr.h b/riscv/insns/p_clipr.h
new file mode 100644
index 0000000000..abe0846e57
--- /dev/null
+++ b/riscv/insns/p_clipr.h
@@ -0,0 +1,6 @@
+// p_clipr: saturate signed rs1 to the range [-(rs2 + 1), rs2], with rs2 read as signed.
+const sreg_t value = sreg_t(RS1);
+const sreg_t ceil_val = sreg_t(RS2);
+const sreg_t floor_val = -(ceil_val + 1);
+// The lower bound is tested first, so it wins if rs2 is negative and both tests hold.
+WRITE_RD(value <= floor_val ? floor_val : value >= ceil_val ? ceil_val : value);
diff --git a/riscv/insns/p_clipu.h b/riscv/insns/p_clipu.h
new file mode 100644
index 0000000000..aa4da5858e
--- /dev/null
+++ b/riscv/insns/p_clipu.h
@@ -0,0 +1,8 @@
+// p_clipu: saturate signed rs1 to [0, 2^(zimm5-1) - 1]; zimm5 == 0 gives an upper bound of 0.
+const auto width = insn.p_zimm5();
+const sreg_t value = sreg_t(RS1);
+const sreg_t ceil_val = width ? ((1 << (width - 1)) - 1) : 0;
+
+// Non-positive inputs clip to 0 before the upper bound is considered.
+const sreg_t clipped = (value <= 0) ? 0 : (value >= ceil_val) ? ceil_val : value;
+WRITE_RD(clipped);
diff --git a/riscv/insns/p_clipur.h b/riscv/insns/p_clipur.h
new file mode 100644
index 0000000000..08c03703a4
--- /dev/null
+++ b/riscv/insns/p_clipur.h
@@ -0,0 +1,6 @@
+// p_clipur: saturate signed rs1 to the range [0, rs2], with rs2 read as signed.
+const sreg_t value = sreg_t(RS1);
+const sreg_t ceil_val = sreg_t(RS2);
+
+// Non-positive inputs clip to 0 before rs2 is considered.
+WRITE_RD(value <= 0 ? 0 : value >= ceil_val ? ceil_val : value);
diff --git a/riscv/insns/p_cnt.h b/riscv/insns/p_cnt.h
new file mode 100644
index 0000000000..7ffc93ad2b
--- /dev/null
+++ b/riscv/insns/p_cnt.h
@@ -0,0 +1,14 @@
+reg_t val = zext_xlen(RS1);
+reg_t cnt;
+// p_cnt: rd = number of 1-bits in the zero-extended rs1.
+#ifdef __GNUC__
+  // "Returns the number of 1-bits in x."
+  cnt = __builtin_popcount(val);
+#else
+  cnt = 0;
+  for(cnt = 0; val != 0x00; ++cnt)
+  {
+    val &= val - 1; // clears the lowest set bit, so the loop runs once per 1-bit
+  }
+#endif
+WRITE_RD(cnt);
diff --git a/riscv/insns/p_extbs.h b/riscv/insns/p_extbs.h
new file mode 100644
index 0000000000..de3e100e89
--- /dev/null
+++ b/riscv/insns/p_extbs.h
@@ -0,0 +1 @@
+WRITE_RD(sext8(RS1));  // rd = rs1 passed through sext8 (8-bit sign extension, going by the helper's name)
diff --git a/riscv/insns/p_extbz.h b/riscv/insns/p_extbz.h
new file mode 100644
index 0000000000..419622cbaf
--- /dev/null
+++ b/riscv/insns/p_extbz.h
@@ -0,0 +1 @@
+WRITE_RD(zext8(RS1));  // rd = rs1 passed through zext8 (8-bit zero extension, going by the helper's name)
diff --git a/riscv/insns/p_exths.h b/riscv/insns/p_exths.h
new file mode 100644
index 0000000000..16cbe7b5a7
--- /dev/null
+++ b/riscv/insns/p_exths.h
@@ -0,0 +1 @@
+WRITE_RD(sext16(RS1));  // rd = rs1 passed through sext16 (16-bit sign extension, going by the helper's name)
diff --git a/riscv/insns/p_exthz.h b/riscv/insns/p_exthz.h
new file mode 100644
index 0000000000..8b548b6ffa
--- /dev/null
+++ b/riscv/insns/p_exthz.h
@@ -0,0 +1 @@
+WRITE_RD(zext16(RS1));  // rd = rs1 passed through zext16 (16-bit zero extension, going by the helper's name)
diff --git a/riscv/insns/p_extract.h b/riscv/insns/p_extract.h
new file mode 100644
index 0000000000..c45ffde0a8
--- /dev/null
+++ b/riscv/insns/p_extract.h
@@ -0,0 +1,8 @@
+// p_extract: rd = (rs1 >> zimm5) passed through vsext with a width of Luimm5 + 1 bits.
+const reg_t lsb = insn.p_zimm5();
+const reg_t top = insn.p_Luimm5();
+const sreg_t source = sext_xlen(RS1);
+
+const sreg_t shifted = source >> lsb;
+const sreg_t field = vsext(shifted, top+1);
+WRITE_RD(field);
diff --git a/riscv/insns/p_extractr.h b/riscv/insns/p_extractr.h
new file mode 100644
index 0000000000..db131ee423
--- /dev/null
+++ b/riscv/insns/p_extractr.h
@@ -0,0 +1,8 @@
+sreg_t val = sext_xlen(RS1);
+reg_t first = zextr(RS2, 4, 0); // rs2[4:0]: lowest bit of the field
+reg_t upto = zextr(RS2, 9, 5);  // rs2[9:5]: field width minus one
+// Shift the field down to bit 0, then hand it to vsext together with its width (upto + 1).
+sreg_t res = val >> first;
+res = vsext(res, upto+1);
+
+WRITE_RD(res);
diff --git a/riscv/insns/p_extractu.h b/riscv/insns/p_extractu.h
new file mode 100644
index 0000000000..60b5f78870
--- /dev/null
+++ b/riscv/insns/p_extractu.h
@@ -0,0 +1,8 @@
+// p_extractu: rd = (rs1 >> zimm5) passed through vzext with a width of Luimm5 + 1 bits.
+const reg_t lsb = insn.p_zimm5();
+const reg_t top = insn.p_Luimm5();
+const sreg_t source = zext_xlen(RS1);
+
+const sreg_t shifted = source >> lsb;
+const sreg_t field = vzext(shifted, top+1);
+WRITE_RD(sext_xlen(field));
diff --git a/riscv/insns/p_extractur.h b/riscv/insns/p_extractur.h
new file mode 100644
index 0000000000..eb30730b7c
--- /dev/null
+++ b/riscv/insns/p_extractur.h
@@ -0,0 +1,8 @@
+sreg_t val = zext_xlen(RS1);
+reg_t first = zextr(RS2, 4, 0); // rs2[4:0]: lowest bit of the field
+reg_t upto = zextr(RS2, 9, 5);  // rs2[9:5]: field width minus one
+// Shift the field down to bit 0, then hand it to vzext together with its width (upto + 1).
+sreg_t res = val >> first;
+res = vzext(res, upto+1);
+
+WRITE_RD(sext_xlen(res));
\ No newline at end of file
diff --git a/riscv/insns/p_ff1.h b/riscv/insns/p_ff1.h
new file mode 100644
index 0000000000..8b8a0dc519
--- /dev/null
+++ b/riscv/insns/p_ff1.h
@@ -0,0 +1,24 @@
+reg_t val = zext_xlen(RS1);
+reg_t cnt;
+// p_ff1: rd = index of the least significant 1-bit of rs1, or 32 when rs1 is 0.
+#ifdef __GNUC__
+    // "Returns one plus the index of the least significant 1-bit of x, or if x is zero, returns zero. "
+    cnt = __builtin_ffs(val);
+    if(cnt == 0) {
+      cnt = 32; // rs = 0 -> rd = 32
+    } else {
+      cnt--; // ffs is one-based, rd is zero-based
+    }
+#else
+  // count trailing zero bits from Stanford's bit hacks
+  cnt = 32;
+  val &= -signed(val); // keep only the lowest set bit
+  if (val) cnt--;
+  if (val & 0x0000FFFF) cnt -= 16;
+  if (val & 0x00FF00FF) cnt -= 8;
+  if (val & 0x0F0F0F0F) cnt -= 4;
+  if (val & 0x33333333) cnt -= 2;
+  if (val & 0x55555555) cnt -= 1;
+#endif
+
+WRITE_RD(cnt);
diff --git a/riscv/insns/p_fl1.h b/riscv/insns/p_fl1.h
new file mode 100644
index 0000000000..37ee52b37c
--- /dev/null
+++ b/riscv/insns/p_fl1.h
@@ -0,0 +1,32 @@
+reg_t val = zext_xlen(RS1);
+reg_t cnt;
+// p_fl1: rd = index of the most significant 1-bit of rs1, or 32 when rs1 is 0.
+#ifdef __GNUC__
+  if(val) {
+    // "Returns the number of leading 0-bits in x" x=0 -> undef
+    cnt = 31 - __builtin_clz(val);
+  } else {
+    // rs1 = 0 -> rd = 32
+    cnt = 32;
+  }
+#else
+  if(val){
+    // log2() from Stanford's bit hacks (find highest '1')
+    const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+    const unsigned int S[] = {1, 2, 4, 8, 16};
+    cnt = 0;
+    for (int i = 4; i >= 0; i--)
+    {
+      if (val & b[i])
+      {
+        val >>= S[i];
+        cnt |= S[i];
+      }
+    }
+  } else {
+    // rs1 = 0 -> rd = 32
+    cnt = 32;
+  }
+#endif
+
+WRITE_RD(cnt);
diff --git a/riscv/insns/p_insert.h b/riscv/insns/p_insert.h
new file mode 100644
index 0000000000..7989718553
--- /dev/null
+++ b/riscv/insns/p_insert.h
@@ -0,0 +1,19 @@
+sreg_t res = RD;
+sreg_t val = zext_xlen(RS1);
+reg_t first = insn.p_zimm5();
+reg_t upto = insn.p_Luimm5();
+// p_insert: overwrite the (upto + 1)-bit field of rd starting at bit 'first' with bits from rs1.
+int offset = first+upto-31;
+if(offset < 0) {
+    offset = 0;
+}
+// offset > 0 means first + upto exceeds bit 31; the source is then shifted left by less.
+reg_t bit_mask = ( (((reg_t)1<<upto)<<1) -1 );
+
+reg_t set_mask = (bit_mask << first) & (val << (first-offset));
+reg_t clr_mask = ~(bit_mask << first) | (val << (first-offset));
+// Inside the field: set the bits that are 1 in the shifted source, then clear those that are 0.
+res |= set_mask;
+res &= clr_mask;
+
+WRITE_RD(sext_xlen(res));
\ No newline at end of file
diff --git a/riscv/insns/p_insertr.h b/riscv/insns/p_insertr.h
new file mode 100644
index 0000000000..afd5cb4f99
--- /dev/null
+++ b/riscv/insns/p_insertr.h
@@ -0,0 +1,19 @@
+sreg_t res = RD;
+sreg_t val = zext_xlen(RS1);
+reg_t first = zextr(RS2, 4, 0); // rs2[4:0]: lowest bit of the field
+reg_t upto = zextr(RS2, 9, 5);  // rs2[9:5]: field width minus one
+// p_insertr: overwrite the (upto + 1)-bit field of rd starting at bit 'first' with bits from rs1.
+int offset = first+upto-31;
+if(offset < 0) {
+    offset = 0;
+}
+// offset > 0 means first + upto exceeds bit 31; the source is then shifted left by less.
+reg_t bit_mask = ( (((reg_t)1<<upto)<<1) -1 );
+
+reg_t set_mask = (bit_mask << first) & (val << (first-offset));
+reg_t clr_mask = ~(bit_mask << first) | (val << (first-offset));
+// Inside the field: set the bits that are 1 in the shifted source, then clear those that are 0.
+res |= set_mask;
+res &= clr_mask;
+
+WRITE_RD(sext_xlen(res));
diff --git a/riscv/insns/p_lb_irpost.h b/riscv/insns/p_lb_irpost.h
new file mode 100644
index 0000000000..ed17db1627
--- /dev/null
+++ b/riscv/insns/p_lb_irpost.h
@@ -0,0 +1,2 @@
+WRITE_RD(MMU.load_int8(RS1));   // rd = MMU.load_int8 at the address currently in rs1
+WRITE_RS1(RS1 + insn.i_imm());  // post-increment: rs1 += i_imm(); RS1 is re-read here, after rd was written
diff --git a/riscv/insns/p_lb_rr.h b/riscv/insns/p_lb_rr.h
new file mode 100644
index 0000000000..c32237fe12
--- /dev/null
+++ b/riscv/insns/p_lb_rr.h
@@ -0,0 +1 @@
+WRITE_RD(MMU.load_int8(RS1 + sreg_t(RS2)));  // rd = MMU.load_int8 at address rs1 + rs2 (register-register addressing)
diff --git a/riscv/insns/p_lb_rrpost.h b/riscv/insns/p_lb_rrpost.h
new file mode 100644
index 0000000000..9dc2bd93da
--- /dev/null
+++ b/riscv/insns/p_lb_rrpost.h
@@ -0,0 +1,2 @@
+WRITE_RD(MMU.load_int8(RS1));  // rd = MMU.load_int8 at the address currently in rs1
+WRITE_RS1(RS1 + sreg_t(RS2));  // post-increment: rs1 += rs2; both are re-read here, after rd was written
diff --git a/riscv/insns/p_lbu_irpost.h b/riscv/insns/p_lbu_irpost.h
new file mode 100644
index 0000000000..0f015c3766
--- /dev/null
+++ b/riscv/insns/p_lbu_irpost.h
@@ -0,0 +1,2 @@
+WRITE_RD(MMU.load_uint8(RS1));  // rd = MMU.load_uint8 at the address currently in rs1
+WRITE_RS1(RS1 + insn.i_imm());  // post-increment: rs1 += i_imm(); RS1 is re-read here, after rd was written
diff --git a/riscv/insns/p_lbu_rr.h b/riscv/insns/p_lbu_rr.h
new file mode 100644
index 0000000000..a95ca2a9a9
--- /dev/null
+++ b/riscv/insns/p_lbu_rr.h
@@ -0,0 +1 @@
+WRITE_RD(MMU.load_uint8(RS1 + sreg_t(RS2)));  // rd = MMU.load_uint8 at address rs1 + rs2 (register-register addressing)
diff --git a/riscv/insns/p_lbu_rrpost.h b/riscv/insns/p_lbu_rrpost.h
new file mode 100644
index 0000000000..3456c8aecb
--- /dev/null
+++ b/riscv/insns/p_lbu_rrpost.h
@@ -0,0 +1,2 @@
+WRITE_RD(MMU.load_uint8(RS1));  // rd = MMU.load_uint8 at the address currently in rs1
+WRITE_RS1(RS1 + sreg_t(RS2));   // post-increment: rs1 += rs2; both are re-read here, after rd was written
diff --git a/riscv/insns/p_lh_irpost.h b/riscv/insns/p_lh_irpost.h
new file mode 100644
index 0000000000..3fea47c188
--- /dev/null
+++ b/riscv/insns/p_lh_irpost.h
@@ -0,0 +1,2 @@
+WRITE_RD(MMU.load_int16(RS1));  // rd = MMU.load_int16 at the address currently in rs1
+WRITE_RS1(RS1 + insn.i_imm());  // post-increment: rs1 += i_imm(); RS1 is re-read here, after rd was written
diff --git a/riscv/insns/p_lh_rr.h b/riscv/insns/p_lh_rr.h
new file mode 100644
index 0000000000..cd5bf82194
--- /dev/null
+++ b/riscv/insns/p_lh_rr.h
@@ -0,0 +1 @@
+WRITE_RD(MMU.load_int16(RS1 + sreg_t(RS2)));  // rd = MMU.load_int16 at address rs1 + rs2 (register-register addressing)
diff --git a/riscv/insns/p_lh_rrpost.h b/riscv/insns/p_lh_rrpost.h
new file mode 100644
index 0000000000..60353fd3e0
--- /dev/null
+++ b/riscv/insns/p_lh_rrpost.h
@@ -0,0 +1,2 @@
+WRITE_RD(MMU.load_int16(RS1));  // rd = MMU.load_int16 at the address currently in rs1
+WRITE_RS1(RS1 + sreg_t(RS2));   // post-increment: rs1 += rs2; both are re-read here, after rd was written
diff --git a/riscv/insns/p_lhu_irpost.h b/riscv/insns/p_lhu_irpost.h
new file mode 100644
index 0000000000..8e7cfb6beb
--- /dev/null
+++ b/riscv/insns/p_lhu_irpost.h
@@ -0,0 +1,2 @@
+WRITE_RD(MMU.load_uint16(RS1));  // rd = MMU.load_uint16 at the address currently in rs1
+WRITE_RS1(RS1 + insn.i_imm());   // post-increment: rs1 += i_imm(); RS1 is re-read here, after rd was written
diff --git a/riscv/insns/p_lhu_rr.h b/riscv/insns/p_lhu_rr.h
new file mode 100644
index 0000000000..6568736a78
--- /dev/null
+++ b/riscv/insns/p_lhu_rr.h
@@ -0,0 +1 @@
+WRITE_RD(MMU.load_uint16(RS1 + sreg_t(RS2)));  // rd = MMU.load_uint16 at address rs1 + rs2 (register-register addressing)
diff --git a/riscv/insns/p_lhu_rrpost.h b/riscv/insns/p_lhu_rrpost.h
new file mode 100644
index 0000000000..195222ac04
--- /dev/null
+++ b/riscv/insns/p_lhu_rrpost.h
@@ -0,0 +1,2 @@
+WRITE_RD(MMU.load_uint16(RS1));  // rd = MMU.load_uint16 at the address currently in rs1
+WRITE_RS1(RS1 + sreg_t(RS2));    // post-increment: rs1 += rs2; both are re-read here, after rd was written
diff --git a/riscv/insns/p_lw_irpost.h b/riscv/insns/p_lw_irpost.h
new file mode 100644
index 0000000000..fb77d87236
--- /dev/null
+++ b/riscv/insns/p_lw_irpost.h
@@ -0,0 +1,2 @@
+WRITE_RD(MMU.load_int32(RS1));  // rd = MMU.load_int32 at the address currently in rs1
+WRITE_RS1(RS1 + insn.i_imm());  // post-increment: rs1 += i_imm(); RS1 is re-read here, after rd was written
diff --git a/riscv/insns/p_lw_rr.h b/riscv/insns/p_lw_rr.h
new file mode 100644
index 0000000000..78fa33231b
--- /dev/null
+++ b/riscv/insns/p_lw_rr.h
@@ -0,0 +1 @@
+WRITE_RD(MMU.load_int32(RS1 + sreg_t(RS2)));  // rd = MMU.load_int32 at address rs1 + rs2 (register-register addressing)
diff --git a/riscv/insns/p_lw_rrpost.h b/riscv/insns/p_lw_rrpost.h
new file mode 100644
index 0000000000..e315c5dfe1
--- /dev/null
+++ b/riscv/insns/p_lw_rrpost.h
@@ -0,0 +1,2 @@
+WRITE_RD(MMU.load_int32(RS1));  // rd = MMU.load_int32 at the address currently in rs1
+WRITE_RS1(RS1 + sreg_t(RS2));   // post-increment: rs1 += rs2; both are re-read here, after rd was written
diff --git a/riscv/insns/p_mac.h b/riscv/insns/p_mac.h
new file mode 100644
index 0000000000..bf5c77a149
--- /dev/null
+++ b/riscv/insns/p_mac.h
@@ -0,0 +1 @@
+WRITE_RD(sext_xlen(sreg_t(RD) + sext_xlen(sreg_t(RS1) * sreg_t(RS2))));  // rd = rd + rs1 * rs2 (signed); product and sum are each reduced with sext_xlen
diff --git a/riscv/insns/p_machhsN.h b/riscv/insns/p_machhsN.h
new file mode 100644
index 0000000000..5afc9a0dc2
--- /dev/null
+++ b/riscv/insns/p_machhsN.h
@@ -0,0 +1,8 @@
+// p_machhsN: rd = (sext16(RS1_H(1)) * sext16(RS2_H(1)) + P_RS3) >> Luimm5, on signed values.
+const int shamt = insn.p_Luimm5();
+const sreg_t factor_a = sext16(RS1_H(1));
+const sreg_t factor_b = sext16(RS2_H(1));
+const sreg_t addend = sext_xlen(P_RS3);
+
+const sreg_t total = (factor_a * factor_b) + addend;
+WRITE_RD(sext_xlen(total >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_machhsRN.h b/riscv/insns/p_machhsRN.h
new file mode 100644
index 0000000000..7cd965b45b
--- /dev/null
+++ b/riscv/insns/p_machhsRN.h
@@ -0,0 +1,9 @@
+// p_machhsRN: rd = (sext16(RS1_H(1)) * sext16(RS2_H(1)) + P_RS3 + half_step) >> Luimm5, signed.
+const int shamt = insn.p_Luimm5();
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const sreg_t factor_a = sext16(RS1_H(1));
+const sreg_t factor_b = sext16(RS2_H(1));
+const sreg_t addend = sext_xlen(P_RS3);
+
+const sreg_t total = (factor_a * factor_b) + addend + half_step;
+WRITE_RD(sext_xlen(total >> shamt));
diff --git a/riscv/insns/p_machhuN.h b/riscv/insns/p_machhuN.h
new file mode 100644
index 0000000000..aa1d44d93b
--- /dev/null
+++ b/riscv/insns/p_machhuN.h
@@ -0,0 +1,10 @@
+// p_machhuN: rd = (zext16(RS1_H(1)) * zext16(RS2_H(1)) + P_RS3) >> Luimm5, on unsigned values.
+const int shamt = insn.p_Luimm5();
+const reg_t factor_a = zext16(RS1_H(1));
+const reg_t factor_b = zext16(RS2_H(1));
+const reg_t addend = zext_xlen(P_RS3);
+const reg_t total = (factor_a * factor_b) + addend;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(total >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_machhuRN.h b/riscv/insns/p_machhuRN.h
new file mode 100644
index 0000000000..3fb0027056
--- /dev/null
+++ b/riscv/insns/p_machhuRN.h
@@ -0,0 +1,11 @@
+// p_machhuRN: rd = (zext16(RS1_H(1)) * zext16(RS2_H(1)) + P_RS3 + half_step) >> Luimm5, unsigned.
+const int shamt = insn.p_Luimm5();
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const reg_t factor_a = zext16(RS1_H(1));
+const reg_t factor_b = zext16(RS2_H(1));
+const reg_t addend = zext_xlen(P_RS3);
+const reg_t total = (factor_a * factor_b) + addend + half_step;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(total >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_macsN.h b/riscv/insns/p_macsN.h
new file mode 100644
index 0000000000..e32a4abbe2
--- /dev/null
+++ b/riscv/insns/p_macsN.h
@@ -0,0 +1,8 @@
+// p_macsN: rd = (sext16(RS1_H(0)) * sext16(RS2_H(0)) + P_RS3) >> Luimm5, on signed values.
+const int shamt = insn.p_Luimm5();
+const sreg_t factor_a = sext16(RS1_H(0));
+const sreg_t factor_b = sext16(RS2_H(0));
+const sreg_t addend = sext_xlen(P_RS3);
+
+const sreg_t total = (factor_a * factor_b) + addend;
+WRITE_RD(sext_xlen(total >> shamt));
diff --git a/riscv/insns/p_macsRN.h b/riscv/insns/p_macsRN.h
new file mode 100644
index 0000000000..e5dbe68afa
--- /dev/null
+++ b/riscv/insns/p_macsRN.h
@@ -0,0 +1,9 @@
+// p_macsRN: rd = (sext16(RS1_H(0)) * sext16(RS2_H(0)) + P_RS3 + half_step) >> Luimm5, signed.
+const int shamt = insn.p_Luimm5();
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const sreg_t factor_a = sext16(RS1_H(0));
+const sreg_t factor_b = sext16(RS2_H(0));
+const sreg_t addend = sext_xlen(P_RS3);
+
+const sreg_t total = (factor_a * factor_b) + addend + half_step;
+WRITE_RD(sext_xlen(total >> shamt));
diff --git a/riscv/insns/p_macuN.h b/riscv/insns/p_macuN.h
new file mode 100644
index 0000000000..60570d7c07
--- /dev/null
+++ b/riscv/insns/p_macuN.h
@@ -0,0 +1,10 @@
+// p_macuN: rd = (zext16(RS1_H(0)) * zext16(RS2_H(0)) + P_RS3) >> Luimm5, on unsigned values.
+const int shamt = insn.p_Luimm5();
+const reg_t factor_a = zext16(RS1_H(0));
+const reg_t factor_b = zext16(RS2_H(0));
+const reg_t addend = zext_xlen(P_RS3);
+const reg_t total = (factor_a * factor_b) + addend;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(total >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_macuRN.h b/riscv/insns/p_macuRN.h
new file mode 100644
index 0000000000..ed360e8ebd
--- /dev/null
+++ b/riscv/insns/p_macuRN.h
@@ -0,0 +1,11 @@
+// p_macuRN: rd = (zext16(RS1_H(0)) * zext16(RS2_H(0)) + P_RS3 + half_step) >> Luimm5, unsigned.
+const int shamt = insn.p_Luimm5();
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const reg_t factor_a = zext16(RS1_H(0));
+const reg_t factor_b = zext16(RS2_H(0));
+const reg_t addend = zext_xlen(P_RS3);
+const reg_t total = (factor_a * factor_b) + addend + half_step;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(total >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_max.h b/riscv/insns/p_max.h
new file mode 100644
index 0000000000..7a7ddcd8e9
--- /dev/null
+++ b/riscv/insns/p_max.h
@@ -0,0 +1,4 @@
+// p_max: rd = the larger of rs1 and rs2 under a signed comparison.
+// When the two compare equal, RS2 is the value written.
+const bool pick_rs1 = sreg_t(RS1) > sreg_t(RS2);
+WRITE_RD(pick_rs1 ? RS1 : RS2);
diff --git a/riscv/insns/p_maxu.h b/riscv/insns/p_maxu.h
new file mode 100644
index 0000000000..8cba177c7b
--- /dev/null
+++ b/riscv/insns/p_maxu.h
@@ -0,0 +1,4 @@
+// p_maxu: rd = the larger of rs1 and rs2, compared as the raw register values.
+// When the two compare equal, RS2 is the value written.
+const bool pick_rs1 = RS1 > RS2;
+WRITE_RD(pick_rs1 ? RS1 : RS2);
diff --git a/riscv/insns/p_min.h b/riscv/insns/p_min.h
new file mode 100644
index 0000000000..1fc66807ed
--- /dev/null
+++ b/riscv/insns/p_min.h
@@ -0,0 +1,4 @@
+// p_min: rd = the smaller of rs1 and rs2 under a signed comparison.
+// When the two compare equal, RS1 is the value written.
+const bool pick_rs1 = sreg_t(RS1) <= sreg_t(RS2);
+WRITE_RD(pick_rs1 ? RS1 : RS2);
diff --git a/riscv/insns/p_minu.h b/riscv/insns/p_minu.h
new file mode 100644
index 0000000000..42339df5e6
--- /dev/null
+++ b/riscv/insns/p_minu.h
@@ -0,0 +1,4 @@
+// p_minu: rd = the smaller of rs1 and rs2, compared as the raw register values.
+// When the two compare equal, RS1 is the value written.
+const bool pick_rs1 = RS1 <= RS2;
+WRITE_RD(pick_rs1 ? RS1 : RS2);
diff --git a/riscv/insns/p_msu.h b/riscv/insns/p_msu.h
new file mode 100644
index 0000000000..2a42cf05e0
--- /dev/null
+++ b/riscv/insns/p_msu.h
@@ -0,0 +1 @@
+WRITE_RD(sext_xlen(sreg_t(RD) - sext_xlen(sreg_t(RS1) * sreg_t(RS2))));  // rd = rd - rs1 * rs2 (signed); product and difference are each reduced with sext_xlen
diff --git a/riscv/insns/p_mulhhsN.h b/riscv/insns/p_mulhhsN.h
new file mode 100644
index 0000000000..e7cb59b6d1
--- /dev/null
+++ b/riscv/insns/p_mulhhsN.h
@@ -0,0 +1,7 @@
+// p_mulhhsN: rd = (sext16(RS1_H(1)) * sext16(RS2_H(1))) >> Luimm5, on signed values.
+const int shamt = insn.p_Luimm5();
+const sreg_t factor_a = sext16(RS1_H(1));
+const sreg_t factor_b = sext16(RS2_H(1));
+
+const sreg_t product = factor_a * factor_b;
+WRITE_RD(sext_xlen(product >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_mulhhsRN.h b/riscv/insns/p_mulhhsRN.h
new file mode 100644
index 0000000000..a5ca521559
--- /dev/null
+++ b/riscv/insns/p_mulhhsRN.h
@@ -0,0 +1,8 @@
+// p_mulhhsRN: rd = (sext16(RS1_H(1)) * sext16(RS2_H(1)) + half_step) >> Luimm5, signed.
+const int shamt = insn.p_Luimm5();
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const sreg_t factor_a = sext16(RS1_H(1));
+const sreg_t factor_b = sext16(RS2_H(1));
+
+const sreg_t rounded = (factor_a * factor_b) + half_step;
+WRITE_RD(sext_xlen(rounded >> shamt));
diff --git a/riscv/insns/p_mulhhuN.h b/riscv/insns/p_mulhhuN.h
new file mode 100644
index 0000000000..5c1a4b38fa
--- /dev/null
+++ b/riscv/insns/p_mulhhuN.h
@@ -0,0 +1,9 @@
+// p_mulhhuN: rd = (zext16(RS1_H(1)) * zext16(RS2_H(1))) >> Luimm5, on unsigned values.
+const int shamt = insn.p_Luimm5();
+const reg_t factor_a = zext16(RS1_H(1));
+const reg_t factor_b = zext16(RS2_H(1));
+const reg_t product = factor_a * factor_b;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(product >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_mulhhuRN.h b/riscv/insns/p_mulhhuRN.h
new file mode 100644
index 0000000000..c51bfbcf0e
--- /dev/null
+++ b/riscv/insns/p_mulhhuRN.h
@@ -0,0 +1,10 @@
+// p_mulhhuRN: rd = (zext16(RS1_H(1)) * zext16(RS2_H(1)) + half_step) >> Luimm5, unsigned.
+const int shamt = insn.p_Luimm5();
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const reg_t factor_a = zext16(RS1_H(1));
+const reg_t factor_b = zext16(RS2_H(1));
+const reg_t rounded = (factor_a * factor_b) + half_step;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(rounded >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_mulsN.h b/riscv/insns/p_mulsN.h
new file mode 100644
index 0000000000..f1cbb96194
--- /dev/null
+++ b/riscv/insns/p_mulsN.h
@@ -0,0 +1,7 @@
+// p_mulsN: rd = (sext16(RS1_H(0)) * sext16(RS2_H(0))) >> Luimm5, on signed values.
+const int shamt = insn.p_Luimm5();
+const sreg_t factor_a = sext16(RS1_H(0));
+const sreg_t factor_b = sext16(RS2_H(0));
+
+const sreg_t product = factor_a * factor_b;
+WRITE_RD(sext_xlen(product >> shamt));
diff --git a/riscv/insns/p_mulsRN.h b/riscv/insns/p_mulsRN.h
new file mode 100644
index 0000000000..d7dfccdd05
--- /dev/null
+++ b/riscv/insns/p_mulsRN.h
@@ -0,0 +1,8 @@
+// p_mulsRN: rd = (sext16(RS1_H(0)) * sext16(RS2_H(0)) + half_step) >> Luimm5, signed.
+const int shamt = insn.p_Luimm5();
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const sreg_t factor_a = sext16(RS1_H(0));
+const sreg_t factor_b = sext16(RS2_H(0));
+
+const sreg_t rounded = (factor_a * factor_b) + half_step;
+WRITE_RD(sext_xlen(rounded >> shamt));
diff --git a/riscv/insns/p_muluN.h b/riscv/insns/p_muluN.h
new file mode 100644
index 0000000000..539610f750
--- /dev/null
+++ b/riscv/insns/p_muluN.h
@@ -0,0 +1,9 @@
+// p_muluN: rd = (zext16(RS1_H(0)) * zext16(RS2_H(0))) >> Luimm5, on unsigned values.
+const int shamt = insn.p_Luimm5();
+const reg_t factor_a = zext16(RS1_H(0));
+const reg_t factor_b = zext16(RS2_H(0));
+const reg_t product = factor_a * factor_b;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(product >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_muluRN.h b/riscv/insns/p_muluRN.h
new file mode 100644
index 0000000000..9fd8178615
--- /dev/null
+++ b/riscv/insns/p_muluRN.h
@@ -0,0 +1,10 @@
+// p_muluRN: rd = (zext16(RS1_H(0)) * zext16(RS2_H(0)) + half_step) >> Luimm5, unsigned.
+const int shamt = insn.p_Luimm5();
+const uint half_step = ((uint)1 << shamt) >> 1;  // half of 2^shamt; 0 when shamt is 0
+const reg_t factor_a = zext16(RS1_H(0));
+const reg_t factor_b = zext16(RS2_H(0));
+const reg_t rounded = (factor_a * factor_b) + half_step;
+
+// sext_xlen is intentional here: Spike keeps registers in a 64-bit internal representation
+// and expects results to be written sign-extended (see e.g. slli).
+WRITE_RD(sext_xlen(rounded >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_ror.h b/riscv/insns/p_ror.h
new file mode 100644
index 0000000000..124c3aa3b0
--- /dev/null
+++ b/riscv/insns/p_ror.h
@@ -0,0 +1,6 @@
+// p_ror: rotate the zero-extended rs1 right by (rs2 mod 32); the result is written via sext_xlen.
+const reg_t amount = zext_xlen(RS2) & 31;
+const reg_t word = zext_xlen(RS1);
+// A copy of the word placed 32 bits higher supplies the bits that wrap around to the top.
+const reg_t rotated = (word >> amount) | ((word << 32) >> amount);
+WRITE_RD(sext_xlen(rotated));
diff --git a/riscv/insns/p_sb_irpost.h b/riscv/insns/p_sb_irpost.h
new file mode 100644
index 0000000000..9339bc9cac
--- /dev/null
+++ b/riscv/insns/p_sb_irpost.h
@@ -0,0 +1,2 @@
+MMU.store_uint8(RS1, RS2);      // store_uint8 of RS2 at the address currently in rs1
+WRITE_RS1(RS1 + insn.s_imm());  // post-increment: rs1 += s_imm()
diff --git a/riscv/insns/p_sb_rr.h b/riscv/insns/p_sb_rr.h
new file mode 100644
index 0000000000..73e49727c4
--- /dev/null
+++ b/riscv/insns/p_sb_rr.h
@@ -0,0 +1 @@
+MMU.store_uint8(RS1 + sreg_t(P_RS3), RS2);  // p.sb, register offset: 8-bit store of rs2 at rs1 + P_RS3 (cast to signed)
diff --git a/riscv/insns/p_sb_rrpost.h b/riscv/insns/p_sb_rrpost.h
new file mode 100644
index 0000000000..0442551744
--- /dev/null
+++ b/riscv/insns/p_sb_rrpost.h
@@ -0,0 +1,2 @@
+MMU.store_uint8(RS1, RS2);       // p.sb, register post-increment: 8-bit store of rs2 at address rs1
+WRITE_RS1(RS1 + sreg_t(P_RS3));  // then rs1 += P_RS3 (cast to signed); must follow the store
diff --git a/riscv/insns/p_sh_irpost.h b/riscv/insns/p_sh_irpost.h
new file mode 100644
index 0000000000..f915c518dc
--- /dev/null
+++ b/riscv/insns/p_sh_irpost.h
@@ -0,0 +1,2 @@
+MMU.store_uint16(RS1, RS2);     // p.sh, immediate post-increment: 16-bit store of rs2 at address rs1
+WRITE_RS1(RS1 + insn.s_imm());  // then rs1 += s_imm; must follow the store, which uses the old rs1
diff --git a/riscv/insns/p_sh_rr.h b/riscv/insns/p_sh_rr.h
new file mode 100644
index 0000000000..f3270bd561
--- /dev/null
+++ b/riscv/insns/p_sh_rr.h
@@ -0,0 +1 @@
+MMU.store_uint16(RS1 + sreg_t(P_RS3), RS2);  // p.sh, register offset: 16-bit store of rs2 at rs1 + P_RS3 (cast to signed)
diff --git a/riscv/insns/p_sh_rrpost.h b/riscv/insns/p_sh_rrpost.h
new file mode 100644
index 0000000000..5043c62876
--- /dev/null
+++ b/riscv/insns/p_sh_rrpost.h
@@ -0,0 +1,2 @@
+MMU.store_uint16(RS1, RS2);      // p.sh, register post-increment: 16-bit store of rs2 at address rs1
+WRITE_RS1(RS1 + sreg_t(P_RS3));  // then rs1 += P_RS3 (cast to signed); must follow the store
diff --git a/riscv/insns/p_slet.h b/riscv/insns/p_slet.h
new file mode 100644
index 0000000000..82f7cc2db1
--- /dev/null
+++ b/riscv/insns/p_slet.h
@@ -0,0 +1 @@
+WRITE_RD(sreg_t(RS1) <= sreg_t(RS2));  // p.slet: rd = 1 if rs1 <= rs2 compared as signed values, else 0
diff --git a/riscv/insns/p_sletu.h b/riscv/insns/p_sletu.h
new file mode 100644
index 0000000000..12547170bb
--- /dev/null
+++ b/riscv/insns/p_sletu.h
@@ -0,0 +1 @@
+WRITE_RD(RS1 <= RS2);  // p.sletu: rd = 1 if rs1 <= rs2 on the raw register values (no signed cast), else 0
diff --git a/riscv/insns/p_subN.h b/riscv/insns/p_subN.h
new file mode 100644
index 0000000000..dec2151c3a
--- /dev/null
+++ b/riscv/insns/p_subN.h
@@ -0,0 +1,7 @@
+// p.subN: rd = (rs1 - rs2) >> imm, a signed subtraction followed by a right
+// shift (normalization) of the signed difference by the p_Luimm5 immediate.
+const int shamt = insn.p_Luimm5();
+const sreg_t minuend = sext_xlen(RS1);
+const sreg_t subtrahend = sext_xlen(RS2);
+const sreg_t diff = minuend - subtrahend;
+WRITE_RD(sext_xlen(diff >> shamt));
diff --git a/riscv/insns/p_subNr.h b/riscv/insns/p_subNr.h
new file mode 100644
index 0000000000..e1db45dfcc
--- /dev/null
+++ b/riscv/insns/p_subNr.h
@@ -0,0 +1,7 @@
+// p.subNr: rd = (rd - rs1) >> rs2[4:0]; the destination register doubles as
+// the minuend and the shift amount comes from a register.
+const int shamt = RS2 % 0x20;  // low five bits of rs2
+const sreg_t minuend = sext_xlen(RD);
+const sreg_t subtrahend = sext_xlen(RS1);
+const sreg_t diff = minuend - subtrahend;
+WRITE_RD(sext_xlen(diff >> shamt));
diff --git a/riscv/insns/p_subRN.h b/riscv/insns/p_subRN.h
new file mode 100644
index 0000000000..d30ddb9ac3
--- /dev/null
+++ b/riscv/insns/p_subRN.h
@@ -0,0 +1,8 @@
+// p.subRN: rd = (rs1 - rs2 + round) >> imm, a signed subtraction with
+// rounding: half of (1 << imm) is added before the right shift by imm.
+const int shamt = insn.p_Luimm5();
+const uint round_add = ((uint)0x01 << shamt) / 2;  // 0 when shamt == 0
+const sreg_t minuend = sext_xlen(RS1);
+const sreg_t subtrahend = sext_xlen(RS2);
+const sreg_t rounded = (minuend - subtrahend) + round_add;
+WRITE_RD(sext_xlen(rounded >> shamt));
diff --git a/riscv/insns/p_subRNr.h b/riscv/insns/p_subRNr.h
new file mode 100644
index 0000000000..be5d175f47
--- /dev/null
+++ b/riscv/insns/p_subRNr.h
@@ -0,0 +1,8 @@
+// p.subRNr: rd = (rd - rs1 + round) >> rs2[4:0], a signed subtraction with
+// rounding; the destination register doubles as the minuend.
+const int shamt = RS2 % 0x20;  // low five bits of rs2
+const uint round_add = ((uint)0x01 << shamt) / 2;  // 0 when shamt == 0
+const sreg_t minuend = sext_xlen(RD);
+const sreg_t subtrahend = sext_xlen(RS1);
+const sreg_t rounded = (minuend - subtrahend) + round_add;
+WRITE_RD(sext_xlen(rounded >> shamt));
diff --git a/riscv/insns/p_subuN.h b/riscv/insns/p_subuN.h
new file mode 100644
index 0000000000..3363cfe583
--- /dev/null
+++ b/riscv/insns/p_subuN.h
@@ -0,0 +1,9 @@
+// p.subuN: rd = (rs1 - rs2) >> imm on zero-extended operands, i.e. a logical
+// right shift (normalization) of the unsigned difference by p_Luimm5.
+const int shamt = insn.p_Luimm5();
+const reg_t minuend = zext_xlen(RS1);
+const reg_t subtrahend = zext_xlen(RS2);
+const reg_t diff = minuend - subtrahend;
+// Not a mistake: Spike keeps registers as sign-extended 64-bit values (see
+// e.g. slli), so even this unsigned result is written through sext_xlen.
+WRITE_RD(sext_xlen(diff >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_subuNr.h b/riscv/insns/p_subuNr.h
new file mode 100644
index 0000000000..074ed6c82b
--- /dev/null
+++ b/riscv/insns/p_subuNr.h
@@ -0,0 +1,9 @@
+// p.subuNr: rd = (rd - rs1) >> rs2[4:0] on zero-extended operands; the
+// destination register doubles as the minuend.
+const int shamt = RS2 % 0x20;  // low five bits of rs2
+const reg_t minuend = zext_xlen(RD);
+const reg_t subtrahend = zext_xlen(RS1);
+const reg_t diff = minuend - subtrahend;
+// Not a mistake: Spike keeps registers as sign-extended 64-bit values (see
+// e.g. slli), so even this unsigned result is written through sext_xlen.
+WRITE_RD(sext_xlen(diff >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_subuRN.h b/riscv/insns/p_subuRN.h
new file mode 100644
index 0000000000..656c483886
--- /dev/null
+++ b/riscv/insns/p_subuRN.h
@@ -0,0 +1,10 @@
+// p.subuRN: rd = (rs1 - rs2 + round) >> imm on zero-extended operands;
+// half of (1 << imm) is added for rounding before the shift by p_Luimm5.
+const int shamt = insn.p_Luimm5();
+const uint round_add = ((uint)0x01 << shamt) / 2;  // 0 when shamt == 0
+const reg_t minuend = zext_xlen(RS1);
+const reg_t subtrahend = zext_xlen(RS2);
+const reg_t rounded = (minuend - subtrahend) + round_add;
+// Not a mistake: Spike keeps registers as sign-extended 64-bit values (see
+// e.g. slli), so even this unsigned result is written through sext_xlen.
+WRITE_RD(sext_xlen(rounded >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_subuRNr.h b/riscv/insns/p_subuRNr.h
new file mode 100644
index 0000000000..da421a4e08
--- /dev/null
+++ b/riscv/insns/p_subuRNr.h
@@ -0,0 +1,10 @@
+// p.subuRNr: rd = (rd - rs1 + round) >> rs2[4:0] on zero-extended operands;
+// the destination register doubles as the minuend.
+const int shamt = RS2 % 0x20;  // low five bits of rs2
+const uint round_add = ((uint)0x01 << shamt) / 2;  // 0 when shamt == 0
+const reg_t minuend = zext_xlen(RD);
+const reg_t subtrahend = zext_xlen(RS1);
+const reg_t rounded = (minuend - subtrahend) + round_add;
+// Not a mistake: Spike keeps registers as sign-extended 64-bit values (see
+// e.g. slli), so even this unsigned result is written through sext_xlen.
+WRITE_RD(sext_xlen(rounded >> shamt));
\ No newline at end of file
diff --git a/riscv/insns/p_sw_irpost.h b/riscv/insns/p_sw_irpost.h
new file mode 100644
index 0000000000..7ff0406fea
--- /dev/null
+++ b/riscv/insns/p_sw_irpost.h
@@ -0,0 +1,2 @@
+MMU.store_uint32(RS1, RS2);     // p.sw, immediate post-increment: 32-bit store of rs2 at address rs1
+WRITE_RS1(RS1 + insn.s_imm());  // then rs1 += s_imm; must follow the store, which uses the old rs1
diff --git a/riscv/insns/p_sw_rr.h b/riscv/insns/p_sw_rr.h
new file mode 100644
index 0000000000..6bef97f73d
--- /dev/null
+++ b/riscv/insns/p_sw_rr.h
@@ -0,0 +1 @@
+MMU.store_uint32(RS1 + sreg_t(P_RS3), RS2);  // p.sw, register offset: 32-bit store of rs2 at rs1 + P_RS3 (cast to signed)
diff --git a/riscv/insns/p_sw_rrpost.h b/riscv/insns/p_sw_rrpost.h
new file mode 100644
index 0000000000..6382d6d801
--- /dev/null
+++ b/riscv/insns/p_sw_rrpost.h
@@ -0,0 +1,2 @@
+MMU.store_uint32(RS1, RS2);      // p.sw, register post-increment: 32-bit store of rs2 at address rs1
+WRITE_RS1(RS1 + sreg_t(P_RS3));  // then rs1 += P_RS3 (cast to signed); must follow the store
diff --git a/riscv/insns/pv_abs_b.h b/riscv/insns/pv_abs_b.h
new file mode 100644
index 0000000000..c0bc089cc1
--- /dev/null
+++ b/riscv/insns/pv_abs_b.h
@@ -0,0 +1,9 @@
+// pv.abs.b: absolute value of every 8-bit lane of rs1, repacked into rd.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const int8_t magnitude = sext8(RS1_B(lane)) > 0 ? RS1_B(lane) : -sext8(RS1_B(lane));
+  packed = (packed << 8) | ((uint32_t)magnitude & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_abs_h.h b/riscv/insns/pv_abs_h.h
new file mode 100644
index 0000000000..42ca4ff3ca
--- /dev/null
+++ b/riscv/insns/pv_abs_h.h
@@ -0,0 +1,9 @@
+// pv.abs.h: absolute value of every 16-bit lane of rs1, repacked into rd.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const int16_t magnitude = sext16(RS1_H(lane)) > 0 ? RS1_H(lane) : -sext16(RS1_H(lane));
+  packed = (packed << 16) | ((uint32_t)magnitude & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_add_b.h b/riscv/insns/pv_add_b.h
new file mode 100644
index 0000000000..ecae63a049
--- /dev/null
+++ b/riscv/insns/pv_add_b.h
@@ -0,0 +1,9 @@
+// pv.add.b: lane-wise 8-bit addition of rs1 and rs2, repacked into rd.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const int8_t sum = sext8(RS1_B(lane)) + sext8(RS2_B(lane));
+  packed = (packed << 8) | ((uint32_t)sum & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_add_h.h b/riscv/insns/pv_add_h.h
new file mode 100644
index 0000000000..0a78665af0
--- /dev/null
+++ b/riscv/insns/pv_add_h.h
@@ -0,0 +1,9 @@
+// pv.add.h: lane-wise 16-bit addition of rs1 and rs2, repacked into rd.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const int16_t sum = sext16(RS1_H(lane)) + sext16(RS2_H(lane));
+  packed = (packed << 16) | ((uint32_t)sum & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_add_h_div2.h b/riscv/insns/pv_add_h_div2.h
new file mode 100644
index 0000000000..d09a9e41f5
--- /dev/null
+++ b/riscv/insns/pv_add_h_div2.h
@@ -0,0 +1,18 @@
+// pv.add.h.div2: add the 16-bit lanes of rs1 and rs2, then shift each sum
+// right by one. Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+
+union simd_reg lhs = {.reg = RS1};
+union simd_reg rhs = {.reg = RS2};
+union simd_reg out;
+
+for (int lane = 0; lane < (64/e16); ++lane)
+{
+    // The sum is narrowed to int16_t on purpose: shifting the int32 result
+    // of the addition directly would pull the overflow bits of the add back
+    // into the lane.
+    const int16_t wrapped = lhs.h[lane] + rhs.h[lane];
+    out.h[lane] = wrapped >> 1;
+}
+
+WRITE_RD(sext_xlen(out.sreg));
+
diff --git a/riscv/insns/pv_add_h_div4.h b/riscv/insns/pv_add_h_div4.h
new file mode 100644
index 0000000000..0dc157025c
--- /dev/null
+++ b/riscv/insns/pv_add_h_div4.h
@@ -0,0 +1,18 @@
+// pv.add.h.div4: add the 16-bit lanes of rs1 and rs2, then shift each sum
+// right by two. Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+
+union simd_reg lhs = {.reg = RS1};
+union simd_reg rhs = {.reg = RS2};
+union simd_reg out;
+
+for (int lane = 0; lane < (64/e16); ++lane)
+{
+    // The sum is narrowed to int16_t on purpose: shifting the int32 result
+    // of the addition directly would pull the overflow bits of the add back
+    // into the lane.
+    const int16_t wrapped = lhs.h[lane] + rhs.h[lane];
+    out.h[lane] = wrapped >> 2;
+}
+
+WRITE_RD(sext_xlen(out.sreg));
+
diff --git a/riscv/insns/pv_add_h_div8.h b/riscv/insns/pv_add_h_div8.h
new file mode 100644
index 0000000000..5abd486118
--- /dev/null
+++ b/riscv/insns/pv_add_h_div8.h
@@ -0,0 +1,18 @@
+// pv.add.h.div8: add the 16-bit lanes of rs1 and rs2, then shift each sum
+// right by three. Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+
+union simd_reg lhs = {.reg = RS1};
+union simd_reg rhs = {.reg = RS2};
+union simd_reg out;
+
+for (int lane = 0; lane < (64/e16); ++lane)
+{
+    // The sum is narrowed to int16_t on purpose: shifting the int32 result
+    // of the addition directly would pull the overflow bits of the add back
+    // into the lane.
+    const int16_t wrapped = lhs.h[lane] + rhs.h[lane];
+    out.h[lane] = wrapped >> 3;
+}
+
+WRITE_RD(sext_xlen(out.sreg));
+
diff --git a/riscv/insns/pv_add_sc_b.h b/riscv/insns/pv_add_sc_b.h
new file mode 100644
index 0000000000..572b61c07f
--- /dev/null
+++ b/riscv/insns/pv_add_sc_b.h
@@ -0,0 +1,9 @@
+// pv.add.sc.b: add byte lane 0 of rs2 (scalar) to every 8-bit lane of rs1.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const int8_t sum = sext8(RS1_B(lane)) + sext8(RS2_B(0));
+  packed = (packed << 8) | ((uint32_t)sum & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_add_sc_h.h b/riscv/insns/pv_add_sc_h.h
new file mode 100644
index 0000000000..734a911b8f
--- /dev/null
+++ b/riscv/insns/pv_add_sc_h.h
@@ -0,0 +1,9 @@
+// pv.add.sc.h: add halfword lane 0 of rs2 (scalar) to every 16-bit lane of rs1.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const int16_t sum = sext16(RS1_H(lane)) + sext16(RS2_H(0));
+  packed = (packed << 16) | ((uint32_t)sum & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_add_sci_b.h b/riscv/insns/pv_add_sci_b.h
new file mode 100644
index 0000000000..df47f1cb50
--- /dev/null
+++ b/riscv/insns/pv_add_sci_b.h
@@ -0,0 +1,9 @@
+// pv.add.sci.b: add the p_simm6 immediate to every 8-bit lane of rs1.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const int8_t sum = sext8(RS1_B(lane)) + insn.p_simm6();
+  packed = (packed << 8) | ((uint32_t)sum & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_add_sci_h.h b/riscv/insns/pv_add_sci_h.h
new file mode 100644
index 0000000000..907621c09f
--- /dev/null
+++ b/riscv/insns/pv_add_sci_h.h
@@ -0,0 +1,9 @@
+// pv.add.sci.h: add the p_simm6 immediate to every 16-bit lane of rs1.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const int16_t sum = sext16(RS1_H(lane)) + insn.p_simm6();
+  packed = (packed << 16) | ((uint32_t)sum & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_and_b.h b/riscv/insns/pv_and_b.h
new file mode 100644
index 0000000000..d3711b762d
--- /dev/null
+++ b/riscv/insns/pv_and_b.h
@@ -0,0 +1,9 @@
+// pv.and.b: lane-wise AND of the 8-bit lanes of rs1 and rs2, repacked into rd.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const uint8_t masked = RS1_B(lane) & RS2_B(lane);
+  packed = (packed << 8) | ((uint32_t)masked & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_and_h.h b/riscv/insns/pv_and_h.h
new file mode 100644
index 0000000000..8bae35685b
--- /dev/null
+++ b/riscv/insns/pv_and_h.h
@@ -0,0 +1,9 @@
+// pv.and.h: lane-wise AND of the 16-bit lanes of rs1 and rs2, repacked into rd.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const uint16_t masked = RS1_H(lane) & RS2_H(lane);
+  packed = (packed << 16) | ((uint32_t)masked & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_and_sc_b.h b/riscv/insns/pv_and_sc_b.h
new file mode 100644
index 0000000000..b1e6c865e5
--- /dev/null
+++ b/riscv/insns/pv_and_sc_b.h
@@ -0,0 +1,9 @@
+// pv.and.sc.b: AND every 8-bit lane of rs1 with byte lane 0 of rs2 (scalar).
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const uint8_t masked = RS1_B(lane) & RS2_B(0);
+  packed = (packed << 8) | ((uint32_t)masked & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_and_sc_h.h b/riscv/insns/pv_and_sc_h.h
new file mode 100644
index 0000000000..2389d11e1b
--- /dev/null
+++ b/riscv/insns/pv_and_sc_h.h
@@ -0,0 +1,9 @@
+// pv.and.sc.h: AND every 16-bit lane of rs1 with halfword lane 0 of rs2 (scalar).
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const uint16_t masked = RS1_H(lane) & RS2_H(0);
+  packed = (packed << 16) | ((uint32_t)masked & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_and_sci_b.h b/riscv/insns/pv_and_sci_b.h
new file mode 100644
index 0000000000..7e4e9e0ac5
--- /dev/null
+++ b/riscv/insns/pv_and_sci_b.h
@@ -0,0 +1,9 @@
+// pv.and.sci.b: AND every 8-bit lane of rs1 with the p_simm6 immediate.
+// Lanes are visited from the highest index down; shifting the accumulator
+// by one lane per step leaves lane 0 in the least significant byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const uint8_t masked = RS1_B(lane) & insn.p_simm6();
+  packed = (packed << 8) | ((uint32_t)masked & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_and_sci_h.h b/riscv/insns/pv_and_sci_h.h
new file mode 100644
index 0000000000..fbd57d1163
--- /dev/null
+++ b/riscv/insns/pv_and_sci_h.h
@@ -0,0 +1,9 @@
+// pv.and.sci.h: AND every 16-bit lane of rs1 with the p_simm6 immediate.
+// Halfword lanes throughout (lane count, temp width, mask), as in pv_and_sc_h;
+// the byte-sized temp/mask used before dropped the upper byte of each lane.
+uint32_t simd_rd = 0;
+for(int i = xlen/16 - 1; i >= 0; i--){
+  const uint16_t temp = RS1_H(i) & insn.p_simm6();
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_avg_b.h b/riscv/insns/pv_avg_b.h
new file mode 100644
index 0000000000..3d5d6d4723
--- /dev/null
+++ b/riscv/insns/pv_avg_b.h
@@ -0,0 +1,9 @@
+// pv.avg.b: signed average of the 8-bit lanes of rs1 and rs2; the lane sum
+// is wrapped to 8 bit (sext8) before the right shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const int8_t mean = sext8(sext8(RS1_B(lane)) + sext8(RS2_B(lane))) >> 1;
+  packed = (packed << 8) | ((uint32_t)mean & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avg_h.h b/riscv/insns/pv_avg_h.h
new file mode 100644
index 0000000000..725f2f2e09
--- /dev/null
+++ b/riscv/insns/pv_avg_h.h
@@ -0,0 +1,9 @@
+// pv.avg.h: signed average of the 16-bit lanes of rs1 and rs2; the lane sum
+// is wrapped to 16 bit (sext16) before the right shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const int16_t mean = sext16(sext16(RS1_H(lane)) + sext16(RS2_H(lane))) >> 1;
+  packed = (packed << 16) | ((uint32_t)mean & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avg_sc_b.h b/riscv/insns/pv_avg_sc_b.h
new file mode 100644
index 0000000000..0b7d2f8d2c
--- /dev/null
+++ b/riscv/insns/pv_avg_sc_b.h
@@ -0,0 +1,9 @@
+// pv.avg.sc.b: signed average of every 8-bit lane of rs1 with byte lane 0 of
+// rs2; the lane sum is wrapped to 8 bit (sext8) before the shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const int8_t mean = sext8(sext8(RS1_B(lane)) + sext8(RS2_B(0))) >> 1;
+  packed = (packed << 8) | ((uint32_t)mean & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avg_sc_h.h b/riscv/insns/pv_avg_sc_h.h
new file mode 100644
index 0000000000..8a6cb5e504
--- /dev/null
+++ b/riscv/insns/pv_avg_sc_h.h
@@ -0,0 +1,9 @@
+// pv.avg.sc.h: signed average of every 16-bit lane of rs1 with halfword lane 0
+// of rs2; the lane sum is wrapped to 16 bit (sext16) before the shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const int16_t mean = sext16(sext16(RS1_H(lane)) + sext16(RS2_H(0))) >> 1;
+  packed = (packed << 16) | ((uint32_t)mean & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avg_sci_b.h b/riscv/insns/pv_avg_sci_b.h
new file mode 100644
index 0000000000..ff67065e34
--- /dev/null
+++ b/riscv/insns/pv_avg_sci_b.h
@@ -0,0 +1,9 @@
+// pv.avg.sci.b: signed average of every 8-bit lane of rs1 with the p_simm6
+// immediate; the lane sum is wrapped to 8 bit (sext8) before the shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const int8_t mean = sext8(sext8(RS1_B(lane)) + insn.p_simm6()) >> 1;
+  packed = (packed << 8) | ((uint32_t)mean & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avg_sci_h.h b/riscv/insns/pv_avg_sci_h.h
new file mode 100644
index 0000000000..f7deefd25b
--- /dev/null
+++ b/riscv/insns/pv_avg_sci_h.h
@@ -0,0 +1,9 @@
+// pv.avg.sci.h: signed average of every 16-bit lane of rs1 with the p_simm6
+// immediate; the lane sum is wrapped to 16 bit (sext16) before the shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const int16_t mean = sext16(sext16(RS1_H(lane)) + insn.p_simm6()) >> 1;
+  packed = (packed << 16) | ((uint32_t)mean & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avgu_b.h b/riscv/insns/pv_avgu_b.h
new file mode 100644
index 0000000000..435c4d22cc
--- /dev/null
+++ b/riscv/insns/pv_avgu_b.h
@@ -0,0 +1,9 @@
+// pv.avgu.b: unsigned average of the 8-bit lanes of rs1 and rs2; the lane sum
+// is truncated to 8 bit (zext8) before the right shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const uint8_t mean = zext8(zext8(RS1_B(lane)) + zext8(RS2_B(lane))) >> 1;
+  packed = (packed << 8) | ((uint32_t)mean & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avgu_h.h b/riscv/insns/pv_avgu_h.h
new file mode 100644
index 0000000000..3fdbaf4ddb
--- /dev/null
+++ b/riscv/insns/pv_avgu_h.h
@@ -0,0 +1,9 @@
+// pv.avgu.h: unsigned average of the 16-bit lanes of rs1 and rs2; the lane sum
+// is truncated to 16 bit (zext16) before the right shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const uint16_t mean = zext16(zext16(RS1_H(lane)) + zext16(RS2_H(lane))) >> 1;
+  packed = (packed << 16) | ((uint32_t)mean & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avgu_sc_b.h b/riscv/insns/pv_avgu_sc_b.h
new file mode 100644
index 0000000000..47ca3888bc
--- /dev/null
+++ b/riscv/insns/pv_avgu_sc_b.h
@@ -0,0 +1,9 @@
+// pv.avgu.sc.b: unsigned average of every 8-bit lane of rs1 with byte lane 0
+// of rs2; the lane sum is truncated to 8 bit (zext8) before the shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const uint8_t mean = zext8(zext8(RS1_B(lane)) + zext8(RS2_B(0))) >> 1;
+  packed = (packed << 8) | ((uint32_t)mean & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avgu_sc_h.h b/riscv/insns/pv_avgu_sc_h.h
new file mode 100644
index 0000000000..0bf92f93b3
--- /dev/null
+++ b/riscv/insns/pv_avgu_sc_h.h
@@ -0,0 +1,9 @@
+// pv.avgu.sc.h: unsigned average of every 16-bit lane of rs1 with halfword lane
+// 0 of rs2; the lane sum is truncated to 16 bit (zext16) before the shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const uint16_t mean = zext16(zext16(RS1_H(lane)) + zext16(RS2_H(0))) >> 1;
+  packed = (packed << 16) | ((uint32_t)mean & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avgu_sci_b.h b/riscv/insns/pv_avgu_sci_b.h
new file mode 100644
index 0000000000..fbc0dff921
--- /dev/null
+++ b/riscv/insns/pv_avgu_sci_b.h
@@ -0,0 +1,9 @@
+// pv.avgu.sci.b: unsigned average of every 8-bit lane of rs1 with the p_zimm6
+// immediate; the lane sum is truncated to 8 bit (zext8) before the shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest byte.
+uint32_t packed = 0;
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const uint8_t mean = zext8(zext8(RS1_B(lane)) + insn.p_zimm6()) >> 1;
+  packed = (packed << 8) | ((uint32_t)mean & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_avgu_sci_h.h b/riscv/insns/pv_avgu_sci_h.h
new file mode 100644
index 0000000000..dd8cd35442
--- /dev/null
+++ b/riscv/insns/pv_avgu_sci_h.h
@@ -0,0 +1,9 @@
+// pv.avgu.sci.h: unsigned average of every 16-bit lane of rs1 with the p_zimm6
+// immediate; the lane sum is truncated to 16 bit (zext16) before the shift by one.
+// Lanes are packed highest index first, so lane 0 lands in the lowest halfword.
+uint32_t packed = 0;
+for (int lane = xlen/16 - 1; lane >= 0; --lane) {
+  const uint16_t mean = zext16(zext16(RS1_H(lane)) + insn.p_zimm6()) >> 1;
+  packed = (packed << 16) | ((uint32_t)mean & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_cmpeq_b.h b/riscv/insns/pv_cmpeq_b.h
new file mode 100644
index 0000000000..b2e912cb1f
--- /dev/null
+++ b/riscv/insns/pv_cmpeq_b.h
@@ -0,0 +1,12 @@
+// pv.cmpeq.b: byte-wise equality compare of rs1 and rs2.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.b[i] == src2.b[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpeq_h.h b/riscv/insns/pv_cmpeq_h.h
new file mode 100644
index 0000000000..6524f55b54
--- /dev/null
+++ b/riscv/insns/pv_cmpeq_h.h
@@ -0,0 +1,13 @@
+// pv.cmpeq.h: halfword-wise equality compare of rs1 and rs2.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.h[i] == src2.h[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_cmpeq_sc_b.h b/riscv/insns/pv_cmpeq_sc_b.h
new file mode 100644
index 0000000000..6103f22750
--- /dev/null
+++ b/riscv/insns/pv_cmpeq_sc_b.h
@@ -0,0 +1,13 @@
+// pv.cmpeq.sc.b: equality compare of every byte of rs1 with the lowest byte of rs2.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.b[i] == src2.b[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpeq_sc_h.h b/riscv/insns/pv_cmpeq_sc_h.h
new file mode 100644
index 0000000000..82eecef2b9
--- /dev/null
+++ b/riscv/insns/pv_cmpeq_sc_h.h
@@ -0,0 +1,13 @@
+// pv.cmpeq.sc.h: equality compare of every halfword of rs1 with the lowest halfword of rs2.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.h[i] == src2.h[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpeq_sci_b.h b/riscv/insns/pv_cmpeq_sci_b.h
new file mode 100644
index 0000000000..8ec09d3702
--- /dev/null
+++ b/riscv/insns/pv_cmpeq_sci_b.h
@@ -0,0 +1,13 @@
+// pv.cmpeq.sci.b: equality compare of every byte of rs1 with the p_simm6 immediate.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate the immediate (truncated to 8 bit) into every lane of the second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.b[i] == src2.b[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpeq_sci_h.h b/riscv/insns/pv_cmpeq_sci_h.h
new file mode 100644
index 0000000000..1079f724bc
--- /dev/null
+++ b/riscv/insns/pv_cmpeq_sci_h.h
@@ -0,0 +1,13 @@
+// pv.cmpeq.sci.h: equality compare of every halfword of rs1 with the p_simm6 immediate.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate the immediate (truncated to 16 bit) into every lane of the second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.h[i] == src2.h[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpge_b.h b/riscv/insns/pv_cmpge_b.h
new file mode 100644
index 0000000000..fe8a2f3871
--- /dev/null
+++ b/riscv/insns/pv_cmpge_b.h
@@ -0,0 +1,12 @@
+// pv.cmpge.b: byte-wise signed rs1 >= rs2 compare.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.b[i] >= src2.b[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpge_h.h b/riscv/insns/pv_cmpge_h.h
new file mode 100644
index 0000000000..f6d8883637
--- /dev/null
+++ b/riscv/insns/pv_cmpge_h.h
@@ -0,0 +1,13 @@
+// pv.cmpge.h: halfword-wise signed rs1 >= rs2 compare.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.h[i] >= src2.h[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_cmpge_sc_b.h b/riscv/insns/pv_cmpge_sc_b.h
new file mode 100644
index 0000000000..5471af6961
--- /dev/null
+++ b/riscv/insns/pv_cmpge_sc_b.h
@@ -0,0 +1,13 @@
+// pv.cmpge.sc.b: signed >= compare of every byte of rs1 with the lowest byte of rs2.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.b[i] >= src2.b[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpge_sc_h.h b/riscv/insns/pv_cmpge_sc_h.h
new file mode 100644
index 0000000000..91a7f412ab
--- /dev/null
+++ b/riscv/insns/pv_cmpge_sc_h.h
@@ -0,0 +1,13 @@
+// pv.cmpge.sc.h: signed >= compare of every halfword of rs1 with the lowest halfword of rs2.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.h[i] >= src2.h[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpge_sci_b.h b/riscv/insns/pv_cmpge_sci_b.h
new file mode 100644
index 0000000000..1f6fa2d92b
--- /dev/null
+++ b/riscv/insns/pv_cmpge_sci_b.h
@@ -0,0 +1,13 @@
+// pv.cmpge.sci.b: signed >= compare of every byte of rs1 with the p_simm6 immediate.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate the immediate (truncated to 8 bit) into every lane of the second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.b[i] >= src2.b[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpge_sci_h.h b/riscv/insns/pv_cmpge_sci_h.h
new file mode 100644
index 0000000000..6a1ce3db51
--- /dev/null
+++ b/riscv/insns/pv_cmpge_sci_h.h
@@ -0,0 +1,13 @@
+// pv.cmpge.sci.h: signed >= compare of every halfword of rs1 with the p_simm6 immediate.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate the immediate (truncated to 16 bit) into every lane of the second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.h[i] >= src2.h[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgeu_b.h b/riscv/insns/pv_cmpgeu_b.h
new file mode 100644
index 0000000000..e587e93361
--- /dev/null
+++ b/riscv/insns/pv_cmpgeu_b.h
@@ -0,0 +1,12 @@
+// pv.cmpgeu.b: byte-wise unsigned rs1 >= rs2 compare (uses the bu lanes).
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.bu[i] >= src2.bu[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgeu_h.h b/riscv/insns/pv_cmpgeu_h.h
new file mode 100644
index 0000000000..ab67fff8c0
--- /dev/null
+++ b/riscv/insns/pv_cmpgeu_h.h
@@ -0,0 +1,13 @@
+// pv.cmpgeu.h: halfword-wise unsigned rs1 >= rs2 compare (uses the hu lanes).
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.hu[i] >= src2.hu[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_cmpgeu_sc_b.h b/riscv/insns/pv_cmpgeu_sc_b.h
new file mode 100644
index 0000000000..a364f6c903
--- /dev/null
+++ b/riscv/insns/pv_cmpgeu_sc_b.h
@@ -0,0 +1,13 @@
+// pv.cmpgeu.sc.b: unsigned >= compare of every byte of rs1 with the lowest byte of rs2.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.bu[i] >= src2.bu[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgeu_sc_h.h b/riscv/insns/pv_cmpgeu_sc_h.h
new file mode 100644
index 0000000000..f8898f4fab
--- /dev/null
+++ b/riscv/insns/pv_cmpgeu_sc_h.h
@@ -0,0 +1,13 @@
+// pv.cmpgeu.sc.h: unsigned >= compare of every halfword of rs1 with the lowest halfword of rs2.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.hu[i] >= src2.hu[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgeu_sci_b.h b/riscv/insns/pv_cmpgeu_sci_b.h
new file mode 100644
index 0000000000..e4840524f0
--- /dev/null
+++ b/riscv/insns/pv_cmpgeu_sci_b.h
@@ -0,0 +1,13 @@
+// pv.cmpgeu.sci.b: unsigned >= compare of every byte of rs1 with the p_zimm6 immediate.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate the immediate into every lane of the second operand
+union simd_reg src2 = {.reg = ((insn.p_zimm6())*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.bu[i] >= src2.bu[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgeu_sci_h.h b/riscv/insns/pv_cmpgeu_sci_h.h
new file mode 100644
index 0000000000..09eb4dd8ab
--- /dev/null
+++ b/riscv/insns/pv_cmpgeu_sci_h.h
@@ -0,0 +1,13 @@
+// pv.cmpgeu.sci.h: unsigned >= compare of every halfword of rs1 with the p_zimm6 immediate.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate the immediate into every lane of the second operand
+union simd_reg src2 = {.reg = ((insn.p_zimm6())*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.hu[i] >= src2.hu[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgt_b.h b/riscv/insns/pv_cmpgt_b.h
new file mode 100644
index 0000000000..c2c4ba30e9
--- /dev/null
+++ b/riscv/insns/pv_cmpgt_b.h
@@ -0,0 +1,12 @@
+// pv.cmpgt.b: byte-wise signed rs1 > rs2 compare.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.b[i] > src2.b[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgt_h.h b/riscv/insns/pv_cmpgt_h.h
new file mode 100644
index 0000000000..338b923ece
--- /dev/null
+++ b/riscv/insns/pv_cmpgt_h.h
@@ -0,0 +1,13 @@
+// pv.cmpgt.h: halfword-wise signed rs1 > rs2 compare.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.h[i] > src2.h[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_cmpgt_sc_b.h b/riscv/insns/pv_cmpgt_sc_b.h
new file mode 100644
index 0000000000..3dd660c2be
--- /dev/null
+++ b/riscv/insns/pv_cmpgt_sc_b.h
@@ -0,0 +1,13 @@
+// pv.cmpgt.sc.b: signed > compare of every byte of rs1 with the lowest byte of rs2.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.b[i] > src2.b[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgt_sc_h.h b/riscv/insns/pv_cmpgt_sc_h.h
new file mode 100644
index 0000000000..77258f332e
--- /dev/null
+++ b/riscv/insns/pv_cmpgt_sc_h.h
@@ -0,0 +1,13 @@
+// pv.cmpgt.sc.h: signed > compare of every halfword of rs1 with the lowest halfword of rs2.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.h[i] > src2.h[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgt_sci_b.h b/riscv/insns/pv_cmpgt_sci_b.h
new file mode 100644
index 0000000000..597f1c7af0
--- /dev/null
+++ b/riscv/insns/pv_cmpgt_sci_b.h
@@ -0,0 +1,13 @@
+// pv.cmpgt.sci.b: signed > compare of every byte of rs1 with the p_simm6 immediate.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate the immediate (truncated to 8 bit) into every lane of the second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.b[i] > src2.b[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgt_sci_h.h b/riscv/insns/pv_cmpgt_sci_h.h
new file mode 100644
index 0000000000..5281a3c409
--- /dev/null
+++ b/riscv/insns/pv_cmpgt_sci_h.h
@@ -0,0 +1,13 @@
+// pv.cmpgt.sci.h: signed > compare of every halfword of rs1 with the p_simm6 immediate.
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+// replicate the immediate (truncated to 16 bit) into every lane of the second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.h[i] = (src1.h[i] > src2.h[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgtu_b.h b/riscv/insns/pv_cmpgtu_b.h
new file mode 100644
index 0000000000..49a34a6d71
--- /dev/null
+++ b/riscv/insns/pv_cmpgtu_b.h
@@ -0,0 +1,12 @@
+// pv.cmpgtu.b: byte-wise unsigned rs1 > rs2 compare (uses the bu lanes).
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    // A true lane is all ones (-1), a false lane all zeros, per the Xpulp spec.
+    res.b[i] = (src1.bu[i] > src2.bu[i]) ? -1 : 0;
+}
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgtu_h.h b/riscv/insns/pv_cmpgtu_h.h
new file mode 100644
index 0000000000..a83073fa8c
--- /dev/null
+++ b/riscv/insns/pv_cmpgtu_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpgtu.h: a halfword lane of rd is all ones where rs1 > rs2 (compared via .hu), else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.hu[i] > src2.hu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_cmpgtu_sc_b.h b/riscv/insns/pv_cmpgtu_sc_b.h
new file mode 100644
index 0000000000..641d11040b
--- /dev/null
+++ b/riscv/insns/pv_cmpgtu_sc_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpgtu.sc.b: a byte lane of rd is all ones where rs1 > low byte of rs2 (compared via .bu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.bu[i] > src2.bu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgtu_sc_h.h b/riscv/insns/pv_cmpgtu_sc_h.h
new file mode 100644
index 0000000000..290d0a9795
--- /dev/null
+++ b/riscv/insns/pv_cmpgtu_sc_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpgtu.sc.h: a halfword lane of rd is all ones where rs1 > low halfword of rs2 (via .hu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.hu[i] > src2.hu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgtu_sci_b.h b/riscv/insns/pv_cmpgtu_sci_b.h
new file mode 100644
index 0000000000..acf42fa6b7
--- /dev/null
+++ b/riscv/insns/pv_cmpgtu_sci_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpgtu.sci.b: a byte lane of rd is all ones where rs1 > p_zimm6() immediate (via .bu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((insn.p_zimm6())*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.bu[i] > src2.bu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpgtu_sci_h.h b/riscv/insns/pv_cmpgtu_sci_h.h
new file mode 100644
index 0000000000..7947ab2dea
--- /dev/null
+++ b/riscv/insns/pv_cmpgtu_sci_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpgtu.sci.h: a halfword lane of rd is all ones where rs1 > p_zimm6() immediate (via .hu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((insn.p_zimm6())*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.hu[i] > src2.hu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmple_b.h b/riscv/insns/pv_cmple_b.h
new file mode 100644
index 0000000000..cef9591bfe
--- /dev/null
+++ b/riscv/insns/pv_cmple_b.h
@@ -0,0 +1,12 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmple.b: a byte lane of rd is all ones where rs1 <= rs2 (compared via .b), else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.b[i] <= src2.b[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmple_h.h b/riscv/insns/pv_cmple_h.h
new file mode 100644
index 0000000000..f10b555558
--- /dev/null
+++ b/riscv/insns/pv_cmple_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmple.h: a halfword lane of rd is all ones where rs1 <= rs2 (compared via .h), else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.h[i] <= src2.h[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_cmple_sc_b.h b/riscv/insns/pv_cmple_sc_b.h
new file mode 100644
index 0000000000..3975bc9047
--- /dev/null
+++ b/riscv/insns/pv_cmple_sc_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmple.sc.b: a byte lane of rd is all ones where rs1 <= low byte of rs2 (compared via .b), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.b[i] <= src2.b[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmple_sc_h.h b/riscv/insns/pv_cmple_sc_h.h
new file mode 100644
index 0000000000..f4612313dc
--- /dev/null
+++ b/riscv/insns/pv_cmple_sc_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmple.sc.h: a halfword lane of rd is all ones where rs1 <= low halfword of rs2 (via .h), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.h[i] <= src2.h[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmple_sci_b.h b/riscv/insns/pv_cmple_sci_b.h
new file mode 100644
index 0000000000..54a5c1f830
--- /dev/null
+++ b/riscv/insns/pv_cmple_sci_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmple.sci.b: a byte lane of rd is all ones where rs1 <= p_simm6() immediate (via .b), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.b[i] <= src2.b[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmple_sci_h.h b/riscv/insns/pv_cmple_sci_h.h
new file mode 100644
index 0000000000..158616e96f
--- /dev/null
+++ b/riscv/insns/pv_cmple_sci_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmple.sci.h: a halfword lane of rd is all ones where rs1 <= p_simm6() immediate (via .h), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.h[i] <= src2.h[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpleu_b.h b/riscv/insns/pv_cmpleu_b.h
new file mode 100644
index 0000000000..038c4ff7ab
--- /dev/null
+++ b/riscv/insns/pv_cmpleu_b.h
@@ -0,0 +1,12 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpleu.b: a byte lane of rd is all ones where rs1 <= rs2 (compared via .bu), else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.bu[i] <= src2.bu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpleu_h.h b/riscv/insns/pv_cmpleu_h.h
new file mode 100644
index 0000000000..7f0577b409
--- /dev/null
+++ b/riscv/insns/pv_cmpleu_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpleu.h: a halfword lane of rd is all ones where rs1 <= rs2 (compared via .hu), else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.hu[i] <= src2.hu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_cmpleu_sc_b.h b/riscv/insns/pv_cmpleu_sc_b.h
new file mode 100644
index 0000000000..92232bf865
--- /dev/null
+++ b/riscv/insns/pv_cmpleu_sc_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpleu.sc.b: a byte lane of rd is all ones where rs1 <= low byte of rs2 (compared via .bu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.bu[i] <= src2.bu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpleu_sc_h.h b/riscv/insns/pv_cmpleu_sc_h.h
new file mode 100644
index 0000000000..19e8a957a0
--- /dev/null
+++ b/riscv/insns/pv_cmpleu_sc_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpleu.sc.h: a halfword lane of rd is all ones where rs1 <= low halfword of rs2 (via .hu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.hu[i] <= src2.hu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpleu_sci_b.h b/riscv/insns/pv_cmpleu_sci_b.h
new file mode 100644
index 0000000000..d400010f6f
--- /dev/null
+++ b/riscv/insns/pv_cmpleu_sci_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpleu.sci.b: a byte lane of rd is all ones where rs1 <= p_zimm6() immediate (via .bu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((insn.p_zimm6())*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.bu[i] <= src2.bu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpleu_sci_h.h b/riscv/insns/pv_cmpleu_sci_h.h
new file mode 100644
index 0000000000..dbd2ca7940
--- /dev/null
+++ b/riscv/insns/pv_cmpleu_sci_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpleu.sci.h: a halfword lane of rd is all ones where rs1 <= p_zimm6() immediate (via .hu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((insn.p_zimm6())*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.hu[i] <= src2.hu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmplt_b.h b/riscv/insns/pv_cmplt_b.h
new file mode 100644
index 0000000000..5e1dda25ee
--- /dev/null
+++ b/riscv/insns/pv_cmplt_b.h
@@ -0,0 +1,12 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmplt.b: a byte lane of rd is all ones where rs1 < rs2 (compared via .b), else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.b[i] < src2.b[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmplt_h.h b/riscv/insns/pv_cmplt_h.h
new file mode 100644
index 0000000000..d6cad0c119
--- /dev/null
+++ b/riscv/insns/pv_cmplt_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmplt.h: a halfword lane of rd is all ones where rs1 < rs2 (compared via .h), else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.h[i] < src2.h[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_cmplt_sc_b.h b/riscv/insns/pv_cmplt_sc_b.h
new file mode 100644
index 0000000000..d6d7d6823f
--- /dev/null
+++ b/riscv/insns/pv_cmplt_sc_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmplt.sc.b: a byte lane of rd is all ones where rs1 < low byte of rs2 (compared via .b), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.b[i] < src2.b[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmplt_sc_h.h b/riscv/insns/pv_cmplt_sc_h.h
new file mode 100644
index 0000000000..6d0b77f8eb
--- /dev/null
+++ b/riscv/insns/pv_cmplt_sc_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmplt.sc.h: a halfword lane of rd is all ones where rs1 < low halfword of rs2 (via .h), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.h[i] < src2.h[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmplt_sci_b.h b/riscv/insns/pv_cmplt_sci_b.h
new file mode 100644
index 0000000000..1382c7b43b
--- /dev/null
+++ b/riscv/insns/pv_cmplt_sci_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmplt.sci.b: a byte lane of rd is all ones where rs1 < p_simm6() immediate (via .b), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.b[i] < src2.b[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmplt_sci_h.h b/riscv/insns/pv_cmplt_sci_h.h
new file mode 100644
index 0000000000..43a060c523
--- /dev/null
+++ b/riscv/insns/pv_cmplt_sci_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmplt.sci.h: a halfword lane of rd is all ones where rs1 < p_simm6() immediate (via .h), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.h[i] < src2.h[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpltu_b.h b/riscv/insns/pv_cmpltu_b.h
new file mode 100644
index 0000000000..fae2d4265f
--- /dev/null
+++ b/riscv/insns/pv_cmpltu_b.h
@@ -0,0 +1,12 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpltu.b: a byte lane of rd is all ones where rs1 < rs2 (compared via .bu), else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.bu[i] < src2.bu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpltu_h.h b/riscv/insns/pv_cmpltu_h.h
new file mode 100644
index 0000000000..932f2747c0
--- /dev/null
+++ b/riscv/insns/pv_cmpltu_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpltu.h: a halfword lane of rd is all ones where rs1 < rs2 (compared via .hu), else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.hu[i] < src2.hu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_cmpltu_sc_b.h b/riscv/insns/pv_cmpltu_sc_b.h
new file mode 100644
index 0000000000..b29e612da8
--- /dev/null
+++ b/riscv/insns/pv_cmpltu_sc_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpltu.sc.b: a byte lane of rd is all ones where rs1 < low byte of rs2 (compared via .bu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.bu[i] < src2.bu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpltu_sc_h.h b/riscv/insns/pv_cmpltu_sc_h.h
new file mode 100644
index 0000000000..0c3ee69195
--- /dev/null
+++ b/riscv/insns/pv_cmpltu_sc_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpltu.sc.h: a halfword lane of rd is all ones where rs1 < low halfword of rs2 (via .hu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.hu[i] < src2.hu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpltu_sci_b.h b/riscv/insns/pv_cmpltu_sci_b.h
new file mode 100644
index 0000000000..67ff64c89e
--- /dev/null
+++ b/riscv/insns/pv_cmpltu_sci_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpltu.sci.b: a byte lane of rd is all ones where rs1 < p_zimm6() immediate (via .bu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((insn.p_zimm6())*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.bu[i] < src2.bu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpltu_sci_h.h b/riscv/insns/pv_cmpltu_sci_h.h
new file mode 100644
index 0000000000..edff0c5f52
--- /dev/null
+++ b/riscv/insns/pv_cmpltu_sci_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpltu.sci.h: a halfword lane of rd is all ones where rs1 < p_zimm6() immediate (via .hu), else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((insn.p_zimm6())*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.hu[i] < src2.hu[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpne_b.h b/riscv/insns/pv_cmpne_b.h
new file mode 100644
index 0000000000..4f8763897a
--- /dev/null
+++ b/riscv/insns/pv_cmpne_b.h
@@ -0,0 +1,12 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpne.b: a byte lane of rd is all ones where the rs1 and rs2 bytes differ, else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.b[i] != src2.b[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpne_h.h b/riscv/insns/pv_cmpne_h.h
new file mode 100644
index 0000000000..8f74a35562
--- /dev/null
+++ b/riscv/insns/pv_cmpne_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpne.h: a halfword lane of rd is all ones where the rs1 and rs2 halfwords differ, else 0.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.h[i] != src2.h[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_cmpne_sc_b.h b/riscv/insns/pv_cmpne_sc_b.h
new file mode 100644
index 0000000000..c94538bbe5
--- /dev/null
+++ b/riscv/insns/pv_cmpne_sc_b.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpne.sc.b: a byte lane of rd is all ones where the rs1 byte differs from the low byte of rs2, else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.b[i] != src2.b[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpne_sc_h.h b/riscv/insns/pv_cmpne_sc_h.h
new file mode 100644
index 0000000000..f79334cca5
--- /dev/null
+++ b/riscv/insns/pv_cmpne_sc_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpne.sc.h: a halfword lane of rd is all ones where rs1 differs from the low halfword of rs2, else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.reg = ((RS2 & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.h[i] != src2.h[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpne_sci_b.h b/riscv/insns/pv_cmpne_sci_b.h
new file mode 100644
index 0000000000..1930903b60
--- /dev/null
+++ b/riscv/insns/pv_cmpne_sci_b.h
@@ -0,0 +1,11 @@
+union simd_reg src1 = {.reg = RS1};
+// pv.cmpne.sci.b operand: replicate the low 8 bits of the p_simm6() immediate into every byte lane
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FF)*0x0101010101010101)};
+union simd_reg res;
+
+for(int i=0; i<(64/e8); i++)
+{
+    res.b[i] = (src1.b[i] != src2.b[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cmpne_sci_h.h b/riscv/insns/pv_cmpne_sci_h.h
new file mode 100644
index 0000000000..1a2b03671b
--- /dev/null
+++ b/riscv/insns/pv_cmpne_sci_h.h
@@ -0,0 +1,13 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// pv.cmpne.sci.h: a halfword lane of rd is all ones where rs1 differs from the p_simm6() immediate, else 0.
+union simd_reg src1 = {.reg = RS1};
+// replicate lowest element in second operand
+union simd_reg src2 = {.sreg = ((insn.p_simm6() & 0x0FFFF)*0x0001000100010001)};
+union simd_reg res;
+
+for(int i=0; i<(64/e16); i++)
+{
+    res.h[i] = (src1.h[i] != src2.h[i]) ? -1 : 0; // true sets every bit of the lane (-1), not just bit 0
+}
+
+WRITE_RD(sext_xlen(res.sreg));
\ No newline at end of file
diff --git a/riscv/insns/pv_cplxconj_h.h b/riscv/insns/pv_cplxconj_h.h
new file mode 100644
index 0000000000..95c91e5bd5
--- /dev/null
+++ b/riscv/insns/pv_cplxconj_h.h
@@ -0,0 +1,11 @@
+// pv.cplxconj.h: negate the upper halfword of rs1 and keep the lower halfword unchanged.
+const reg_t packed_in = RS1;
+
+// the lower half stays unsigned so it zero-extends when OR-ed in below
+const uint16_t lower = packed_in;
+int16_t upper = (packed_in >> 16);
+upper = -upper;
+
+const reg_t flipped = (upper << 16) | lower;
+
+WRITE_RD(sext_xlen(flipped));
diff --git a/riscv/insns/pv_cplxmul_h_i.h b/riscv/insns/pv_cplxmul_h_i.h
new file mode 100644
index 0000000000..e899c49e78
--- /dev/null
+++ b/riscv/insns/pv_cplxmul_h_i.h
@@ -0,0 +1,16 @@
+// pv.cplxmul.h.i: (re1*im2 + im1*re2) >> 15 fills the upper halfword; the lower one comes from P_RS3.
+const reg_t op_a = RS1;
+const reg_t op_b = RS2;
+const reg_t op_c = P_RS3;
+
+const int16_t a_re = op_a;
+const int16_t a_im = (op_a >> 16);
+const int16_t b_re = op_b;
+const int16_t b_im = (op_b >> 16);
+// kept unsigned so it zero-extends when OR-ed into the result
+const uint16_t c_re = op_c;
+
+const sreg_t cross = (sreg_t)a_re*b_im + a_im*b_re;
+const int16_t im_out = cross >> 15;
+const reg_t packed = (im_out << 16) | c_re;
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_cplxmul_h_i_div2.h b/riscv/insns/pv_cplxmul_h_i_div2.h
new file mode 100644
index 0000000000..9d477176cd
--- /dev/null
+++ b/riscv/insns/pv_cplxmul_h_i_div2.h
@@ -0,0 +1,16 @@
+// pv.cplxmul.h.i.div2: (re1*im2 + im1*re2) >> 16 fills the upper halfword; the lower one comes from P_RS3.
+const reg_t op_a = RS1;
+const reg_t op_b = RS2;
+const reg_t op_c = P_RS3;
+
+const int16_t a_re = op_a;
+const int16_t a_im = (op_a >> 16);
+const int16_t b_re = op_b;
+const int16_t b_im = (op_b >> 16);
+// kept unsigned so it zero-extends when OR-ed into the result
+const uint16_t c_re = op_c;
+
+const sreg_t cross = (sreg_t)a_re*b_im + a_im*b_re;
+const int16_t im_out = cross >> 16;
+const reg_t packed = (im_out << 16) | c_re;
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_cplxmul_h_i_div4.h b/riscv/insns/pv_cplxmul_h_i_div4.h
new file mode 100644
index 0000000000..f23395b070
--- /dev/null
+++ b/riscv/insns/pv_cplxmul_h_i_div4.h
@@ -0,0 +1,16 @@
+// pv.cplxmul.h.i.div4: (re1*im2 + im1*re2) >> 17 fills the upper halfword; the lower one comes from P_RS3.
+const reg_t op_a = RS1;
+const reg_t op_b = RS2;
+const reg_t op_c = P_RS3;
+
+const int16_t a_re = op_a;
+const int16_t a_im = (op_a >> 16);
+const int16_t b_re = op_b;
+const int16_t b_im = (op_b >> 16);
+// kept unsigned so it zero-extends when OR-ed into the result
+const uint16_t c_re = op_c;
+
+const sreg_t cross = (sreg_t)a_re*b_im + a_im*b_re;
+const int16_t im_out = cross >> 17;
+const reg_t packed = (im_out << 16) | c_re;
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_cplxmul_h_i_div8.h b/riscv/insns/pv_cplxmul_h_i_div8.h
new file mode 100644
index 0000000000..735bf9be2d
--- /dev/null
+++ b/riscv/insns/pv_cplxmul_h_i_div8.h
@@ -0,0 +1,16 @@
+// pv.cplxmul.h.i.div8: (re1*im2 + im1*re2) >> 18 fills the upper halfword; the lower one comes from P_RS3.
+const reg_t op_a = RS1;
+const reg_t op_b = RS2;
+const reg_t op_c = P_RS3;
+
+const int16_t a_re = op_a;
+const int16_t a_im = (op_a >> 16);
+const int16_t b_re = op_b;
+const int16_t b_im = (op_b >> 16);
+// kept unsigned so it zero-extends when OR-ed into the result
+const uint16_t c_re = op_c;
+
+const sreg_t cross = (sreg_t)a_re*b_im + a_im*b_re;
+const int16_t im_out = cross >> 18;
+const reg_t packed = (im_out << 16) | c_re;
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_cplxmul_h_r.h b/riscv/insns/pv_cplxmul_h_r.h
new file mode 100644
index 0000000000..b101bfe153
--- /dev/null
+++ b/riscv/insns/pv_cplxmul_h_r.h
@@ -0,0 +1,16 @@
+// pv.cplxmul.h.r: (re1*re2 - im1*im2) >> 15 fills the lower halfword; the upper one comes from P_RS3.
+const reg_t op_a = RS1;
+const reg_t op_b = RS2;
+const reg_t op_c = P_RS3;
+
+const int16_t a_re = op_a;
+const int16_t a_im = (op_a >> 16);
+const int16_t b_re = op_b;
+const int16_t b_im = (op_b >> 16);
+const int16_t c_im = (op_c >> 16);
+
+const sreg_t diff = (sreg_t)a_re*b_re - a_im*b_im;
+// stored unsigned so it zero-extends when OR-ed into the result
+const uint16_t re_out = diff >> 15;
+const reg_t packed = (c_im << 16) | re_out;
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_cplxmul_h_r_div2.h b/riscv/insns/pv_cplxmul_h_r_div2.h
new file mode 100644
index 0000000000..3b72aed003
--- /dev/null
+++ b/riscv/insns/pv_cplxmul_h_r_div2.h
@@ -0,0 +1,16 @@
+// pv.cplxmul.h.r.div2: (re1*re2 - im1*im2) >> 16 fills the lower halfword; the upper one comes from P_RS3.
+const reg_t op_a = RS1;
+const reg_t op_b = RS2;
+const reg_t op_c = P_RS3;
+
+const int16_t a_re = op_a;
+const int16_t a_im = (op_a >> 16);
+const int16_t b_re = op_b;
+const int16_t b_im = (op_b >> 16);
+const int16_t c_im = (op_c >> 16);
+
+const sreg_t diff = (sreg_t)a_re*b_re - a_im*b_im;
+// stored unsigned so it zero-extends when OR-ed into the result
+const uint16_t re_out = diff >> 16;
+const reg_t packed = (c_im << 16) | re_out;
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_cplxmul_h_r_div4.h b/riscv/insns/pv_cplxmul_h_r_div4.h
new file mode 100644
index 0000000000..d5091837ed
--- /dev/null
+++ b/riscv/insns/pv_cplxmul_h_r_div4.h
@@ -0,0 +1,16 @@
+// pv.cplxmul.h.r.div4: (re1*re2 - im1*im2) >> 17 fills the lower halfword; the upper one comes from P_RS3.
+const reg_t op_a = RS1;
+const reg_t op_b = RS2;
+const reg_t op_c = P_RS3;
+
+const int16_t a_re = op_a;
+const int16_t a_im = (op_a >> 16);
+const int16_t b_re = op_b;
+const int16_t b_im = (op_b >> 16);
+const int16_t c_im = (op_c >> 16);
+
+const sreg_t diff = (sreg_t)a_re*b_re - a_im*b_im;
+// stored unsigned so it zero-extends when OR-ed into the result
+const uint16_t re_out = diff >> 17;
+const reg_t packed = (c_im << 16) | re_out;
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_cplxmul_h_r_div8.h b/riscv/insns/pv_cplxmul_h_r_div8.h
new file mode 100644
index 0000000000..0b503ad84a
--- /dev/null
+++ b/riscv/insns/pv_cplxmul_h_r_div8.h
@@ -0,0 +1,16 @@
+// pv.cplxmul.h.r.div8: (re1*re2 - im1*im2) >> 18 fills the lower halfword; the upper one comes from P_RS3.
+const reg_t op_a = RS1;
+const reg_t op_b = RS2;
+const reg_t op_c = P_RS3;
+
+const int16_t a_re = op_a;
+const int16_t a_im = (op_a >> 16);
+const int16_t b_re = op_b;
+const int16_t b_im = (op_b >> 16);
+const int16_t c_im = (op_c >> 16);
+
+const sreg_t diff = (sreg_t)a_re*b_re - a_im*b_im;
+// stored unsigned so it zero-extends when OR-ed into the result
+const uint16_t re_out = diff >> 18;
+const reg_t packed = (c_im << 16) | re_out;
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_dotsp_b.h b/riscv/insns/pv_dotsp_b.h
new file mode 100644
index 0000000000..93b7233cca
--- /dev/null
+++ b/riscv/insns/pv_dotsp_b.h
@@ -0,0 +1,6 @@
+// pv.dotsp.b: accumulate sext8(rs1 byte) * sext8(rs2 byte) over the xlen/8 byte lanes.
+int32_t dot = 0;
+int lane = xlen/8;
+while (lane-- > 0)
+  dot += sext8(RS1_B(lane)) * sext8(RS2_B(lane));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotsp_h.h b/riscv/insns/pv_dotsp_h.h
new file mode 100644
index 0000000000..9feed35efd
--- /dev/null
+++ b/riscv/insns/pv_dotsp_h.h
@@ -0,0 +1,6 @@
+// pv.dotsp.h: accumulate sext16(rs1 halfword) * sext16(rs2 halfword) over the xlen/16 halfword lanes.
+int32_t dot = 0;
+int lane = xlen/16;
+while (lane-- > 0)
+  dot += sext16(RS1_H(lane)) * sext16(RS2_H(lane));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotsp_sc_b.h b/riscv/insns/pv_dotsp_sc_b.h
new file mode 100644
index 0000000000..cef11d5e77
--- /dev/null
+++ b/riscv/insns/pv_dotsp_sc_b.h
@@ -0,0 +1,6 @@
+// pv.dotsp.sc.b: accumulate sext8(rs1 byte) * sext8(rs2 byte 0) over the xlen/8 byte lanes of rs1.
+int32_t dot = 0;
+int lane = xlen/8;
+while (lane-- > 0)
+  dot += sext8(RS1_B(lane)) * sext8(RS2_B(0));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotsp_sc_h.h b/riscv/insns/pv_dotsp_sc_h.h
new file mode 100644
index 0000000000..ef558d39f0
--- /dev/null
+++ b/riscv/insns/pv_dotsp_sc_h.h
@@ -0,0 +1,6 @@
+// pv.dotsp.sc.h: accumulate sext16(rs1 halfword) * sext16(rs2 halfword 0) over the xlen/16 lanes of rs1.
+int32_t dot = 0;
+int lane = xlen/16;
+while (lane-- > 0)
+  dot += sext16(RS1_H(lane)) * sext16(RS2_H(0));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotsp_sci_b.h b/riscv/insns/pv_dotsp_sci_b.h
new file mode 100644
index 0000000000..3470fd55a3
--- /dev/null
+++ b/riscv/insns/pv_dotsp_sci_b.h
@@ -0,0 +1,6 @@
+// pv.dotsp.sci.b: accumulate sext8(rs1 byte) * p_simm6() immediate over the xlen/8 byte lanes of rs1.
+int32_t dot = 0;
+int lane = xlen/8;
+while (lane-- > 0)
+  dot += sext8(RS1_B(lane)) * insn.p_simm6();
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotsp_sci_h.h b/riscv/insns/pv_dotsp_sci_h.h
new file mode 100644
index 0000000000..97e30eb290
--- /dev/null
+++ b/riscv/insns/pv_dotsp_sci_h.h
@@ -0,0 +1,6 @@
+// pv.dotsp.sci.h: accumulate sext16(rs1 halfword) * p_simm6() immediate over the xlen/16 lanes of rs1.
+int32_t dot = 0;
+int lane = xlen/16;
+while (lane-- > 0)
+  dot += sext16(RS1_H(lane)) * insn.p_simm6();
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotup_b.h b/riscv/insns/pv_dotup_b.h
new file mode 100644
index 0000000000..fa77f36678
--- /dev/null
+++ b/riscv/insns/pv_dotup_b.h
@@ -0,0 +1,6 @@
+// pv.dotup.b: accumulate zext8(rs1 byte) * zext8(rs2 byte) over the xlen/8 byte lanes.
+uint32_t dot = 0;
+int lane = xlen/8;
+while (lane-- > 0)
+  dot += zext8(RS1_B(lane)) * zext8(RS2_B(lane));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotup_h.h b/riscv/insns/pv_dotup_h.h
new file mode 100644
index 0000000000..4e170b238b
--- /dev/null
+++ b/riscv/insns/pv_dotup_h.h
@@ -0,0 +1,6 @@
+// pv.dotup.h: accumulate zext16(rs1 halfword) * zext16(rs2 halfword) over the xlen/16 halfword lanes.
+uint32_t dot = 0;
+int lane = xlen/16;
+while (lane-- > 0)
+  dot += zext16(RS1_H(lane)) * zext16(RS2_H(lane));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotup_sc_b.h b/riscv/insns/pv_dotup_sc_b.h
new file mode 100644
index 0000000000..a581d0162a
--- /dev/null
+++ b/riscv/insns/pv_dotup_sc_b.h
@@ -0,0 +1,6 @@
+// pv.dotup.sc.b: accumulate zext8(rs1 byte) * zext8(rs2 byte 0) over the xlen/8 byte lanes of rs1.
+uint32_t dot = 0;
+int lane = xlen/8;
+while (lane-- > 0)
+  dot += zext8(RS1_B(lane)) * zext8(RS2_B(0));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotup_sc_h.h b/riscv/insns/pv_dotup_sc_h.h
new file mode 100644
index 0000000000..b78762a871
--- /dev/null
+++ b/riscv/insns/pv_dotup_sc_h.h
@@ -0,0 +1,6 @@
+// pv.dotup.sc.h: accumulate zext16(rs1 halfword) * zext16(rs2 halfword 0) over the xlen/16 lanes of rs1.
+uint32_t dot = 0;
+int lane = xlen/16;
+while (lane-- > 0)
+  dot += zext16(RS1_H(lane)) * zext16(RS2_H(0));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotup_sci_b.h b/riscv/insns/pv_dotup_sci_b.h
new file mode 100644
index 0000000000..0dedb1caf4
--- /dev/null
+++ b/riscv/insns/pv_dotup_sci_b.h
@@ -0,0 +1,6 @@
+// pv.dotup.sci.b: accumulate zext8(rs1 byte) * p_zimm6() immediate over the xlen/8 byte lanes of rs1.
+uint32_t dot = 0;
+int lane = xlen/8;
+while (lane-- > 0)
+  dot += zext8(RS1_B(lane)) * insn.p_zimm6();
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotup_sci_h.h b/riscv/insns/pv_dotup_sci_h.h
new file mode 100644
index 0000000000..64a36d5690
--- /dev/null
+++ b/riscv/insns/pv_dotup_sci_h.h
@@ -0,0 +1,6 @@
+// pv.dotup.sci.h: accumulate zext16(rs1 halfword) * p_zimm6() immediate over the xlen/16 lanes of rs1.
+uint32_t dot = 0;
+int lane = xlen/16;
+while (lane-- > 0)
+  dot += zext16(RS1_H(lane)) * insn.p_zimm6();
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotusp_b.h b/riscv/insns/pv_dotusp_b.h
new file mode 100644
index 0000000000..1cdfc2f2c2
--- /dev/null
+++ b/riscv/insns/pv_dotusp_b.h
@@ -0,0 +1,6 @@
+// pv.dotusp.b: accumulate zext8(rs1 byte) * sext8(rs2 byte) over the xlen/8 byte lanes.
+int32_t dot = 0;
+int lane = xlen/8;
+while (lane-- > 0)
+  dot += sreg_t(zext8(RS1_B(lane))) * sext8(RS2_B(lane));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotusp_h.h b/riscv/insns/pv_dotusp_h.h
new file mode 100644
index 0000000000..81968a14d6
--- /dev/null
+++ b/riscv/insns/pv_dotusp_h.h
@@ -0,0 +1,6 @@
+// pv.dotusp.h: accumulate zext16(rs1 halfword) * sext16(rs2 halfword) over the xlen/16 halfword lanes.
+int32_t dot = 0;
+int lane = xlen/16;
+while (lane-- > 0)
+  dot += sreg_t(zext16(RS1_H(lane))) * sext16(RS2_H(lane));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotusp_sc_b.h b/riscv/insns/pv_dotusp_sc_b.h
new file mode 100644
index 0000000000..d562a7d4d3
--- /dev/null
+++ b/riscv/insns/pv_dotusp_sc_b.h
@@ -0,0 +1,6 @@
+// pv.dotusp.sc.b: accumulate zext8(rs1 byte) * sext8(rs2 byte 0) over the xlen/8 byte lanes of rs1.
+int32_t dot = 0;
+int lane = xlen/8;
+while (lane-- > 0)
+  dot += sreg_t(zext8(RS1_B(lane))) * sext8(RS2_B(0));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotusp_sc_h.h b/riscv/insns/pv_dotusp_sc_h.h
new file mode 100644
index 0000000000..3815c3721f
--- /dev/null
+++ b/riscv/insns/pv_dotusp_sc_h.h
@@ -0,0 +1,6 @@
+// pv.dotusp.sc.h: accumulate zext16(rs1 halfword) * sext16(rs2 halfword 0) over the xlen/16 lanes of rs1.
+int32_t dot = 0;
+int lane = xlen/16;
+while (lane-- > 0)
+  dot += sreg_t(zext16(RS1_H(lane))) * sext16(RS2_H(0));
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotusp_sci_b.h b/riscv/insns/pv_dotusp_sci_b.h
new file mode 100644
index 0000000000..92c229540e
--- /dev/null
+++ b/riscv/insns/pv_dotusp_sci_b.h
@@ -0,0 +1,6 @@
+// pv.dotusp.sci.b: accumulate zext8(rs1 byte) * p_simm6() immediate over the xlen/8 byte lanes of rs1.
+int32_t dot = 0;
+int lane = xlen/8;
+while (lane-- > 0)
+  dot += sreg_t(zext8(RS1_B(lane))) * insn.p_simm6();
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_dotusp_sci_h.h b/riscv/insns/pv_dotusp_sci_h.h
new file mode 100644
index 0000000000..8f91a89a33
--- /dev/null
+++ b/riscv/insns/pv_dotusp_sci_h.h
@@ -0,0 +1,6 @@
+// pv.dotusp.sci.h: accumulate zext16(rs1 halfword) * p_simm6() immediate over the xlen/16 lanes of rs1.
+int32_t dot = 0;
+int lane = xlen/16;
+while (lane-- > 0)
+  dot += sreg_t(zext16(RS1_H(lane))) * insn.p_simm6();
+WRITE_RD(sext_xlen(dot));
diff --git a/riscv/insns/pv_extract_b.h b/riscv/insns/pv_extract_b.h
new file mode 100644
index 0000000000..fce80bbb6e
--- /dev/null
+++ b/riscv/insns/pv_extract_b.h
@@ -0,0 +1 @@
+WRITE_RD(sext8(RS1_B(insn.p_zimm6() & 0x03))); // pv.extract.b: byte lane picked by the low 2 bits of p_zimm6(), passed through sext8
diff --git a/riscv/insns/pv_extract_h.h b/riscv/insns/pv_extract_h.h
new file mode 100644
index 0000000000..ee35393d49
--- /dev/null
+++ b/riscv/insns/pv_extract_h.h
@@ -0,0 +1 @@
+WRITE_RD(sext16(RS1_H(insn.p_zimm6() & 0x01))); // pv.extract.h: halfword lane picked by bit 0 of p_zimm6(), passed through sext16
diff --git a/riscv/insns/pv_extractu_b.h b/riscv/insns/pv_extractu_b.h
new file mode 100644
index 0000000000..c240233874
--- /dev/null
+++ b/riscv/insns/pv_extractu_b.h
@@ -0,0 +1 @@
+WRITE_RD(zext8(RS1_B(insn.p_zimm6() & 0x03))); // pv.extractu.b: byte lane picked by the low 2 bits of p_zimm6(), passed through zext8
diff --git a/riscv/insns/pv_extractu_h.h b/riscv/insns/pv_extractu_h.h
new file mode 100644
index 0000000000..90b679afd0
--- /dev/null
+++ b/riscv/insns/pv_extractu_h.h
@@ -0,0 +1 @@
+WRITE_RD(zext16(RS1_H(insn.p_zimm6() & 0x01))); // pv.extractu.h: halfword lane picked by bit 0 of p_zimm6(), passed through zext16
diff --git a/riscv/insns/pv_insert_b.h b/riscv/insns/pv_insert_b.h
new file mode 100644
index 0000000000..5575e79671
--- /dev/null
+++ b/riscv/insns/pv_insert_b.h
@@ -0,0 +1,6 @@
+// pv.insert.b: overwrite one byte lane of rd with RS1_H(0) & 0xFF, leaving the other lanes untouched.
+uint32_t ins_rd = RD;
+// Lane width is fixed at 8 bits; deriving it from xlen (xlen >> 2) is only right for xlen == 32.
+const unsigned shift = 8 * (insn.p_zimm6() & 0x03); /* bit offset of the rd byte lane to overwrite */
+ins_rd = (ins_rd & ~(uint32_t(0xFF) << shift)) | (uint32_t(RS1_H(0) & 0xFF) << shift);
+WRITE_RD(sext_xlen(ins_rd));
diff --git a/riscv/insns/pv_insert_h.h b/riscv/insns/pv_insert_h.h
new file mode 100644
index 0000000000..eccb0eda6d
--- /dev/null
+++ b/riscv/insns/pv_insert_h.h
@@ -0,0 +1,6 @@
+// pv.insert.h: overwrite one halfword lane of rd with RS1_H(0), leaving the other lane untouched.
+uint32_t ins_rd = RD;
+// Lane width is fixed at 16 bits; deriving it from xlen (xlen >> 1) is only right for xlen == 32.
+const unsigned shift = 16 * (insn.p_zimm6() & 0x01); /* bit offset of the rd half to overwrite */
+ins_rd = (ins_rd & ~(uint32_t(0xFFFF) << shift)) | (uint32_t(RS1_H(0) & 0xFFFF) << shift);
+WRITE_RD(sext_xlen(ins_rd));
diff --git a/riscv/insns/pv_max_b.h b/riscv/insns/pv_max_b.h
new file mode 100644
index 0000000000..4dc3e6be89
--- /dev/null
+++ b/riscv/insns/pv_max_b.h
@@ -0,0 +1,9 @@
+// pv.max.b: per byte lane, keep whichever of the rs1 / rs2 bytes is larger after sext8.
+uint32_t packed = 0;
+
+// walk from the top lane down so each pick is shifted up into its final position
+for(int lane = xlen/8; lane-- > 0; ){
+  const int8_t pick = sext8(RS1_B(lane)) > sext8(RS2_B(lane)) ? RS1_B(lane) : RS2_B(lane);
+  packed = (packed << 8) + ((uint32_t)pick & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_max_h.h b/riscv/insns/pv_max_h.h
new file mode 100644
index 0000000000..c65a32da64
--- /dev/null
+++ b/riscv/insns/pv_max_h.h
@@ -0,0 +1,9 @@
+int16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = sext16(RS1_H(i)) > sext16(RS2_H(i)) ? RS1_H(i) : RS2_H(i);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_max_sc_b.h b/riscv/insns/pv_max_sc_b.h
new file mode 100644
index 0000000000..896087f623
--- /dev/null
+++ b/riscv/insns/pv_max_sc_b.h
@@ -0,0 +1,9 @@
+int8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = sext8(RS1_B(i)) > sext8(RS2_B(0)) ? RS1_B(i) : RS2_B(0);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_max_sc_h.h b/riscv/insns/pv_max_sc_h.h
new file mode 100644
index 0000000000..fd55fb49b3
--- /dev/null
+++ b/riscv/insns/pv_max_sc_h.h
@@ -0,0 +1,9 @@
+int16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = sext16(RS1_H(i)) > sext16(RS2_H(0)) ? RS1_H(i) : RS2_H(0);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_max_sci_b.h b/riscv/insns/pv_max_sci_b.h
new file mode 100644
index 0000000000..5e06669faa
--- /dev/null
+++ b/riscv/insns/pv_max_sci_b.h
@@ -0,0 +1,9 @@
+int8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = sext8(RS1_B(i)) > insn.p_simm6() ? RS1_B(i) : insn.p_simm6();
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_max_sci_h.h b/riscv/insns/pv_max_sci_h.h
new file mode 100644
index 0000000000..ce1df2ee1d
--- /dev/null
+++ b/riscv/insns/pv_max_sci_h.h
@@ -0,0 +1,9 @@
+int16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = sext16(RS1_H(i)) > insn.p_simm6() ? RS1_H(i) : insn.p_simm6();
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_maxu_b.h b/riscv/insns/pv_maxu_b.h
new file mode 100644
index 0000000000..5821c17266
--- /dev/null
+++ b/riscv/insns/pv_maxu_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) > zext8(RS2_B(i)) ? RS1_B(i) : RS2_B(i);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_maxu_h.h b/riscv/insns/pv_maxu_h.h
new file mode 100644
index 0000000000..3e587c3c99
--- /dev/null
+++ b/riscv/insns/pv_maxu_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) > zext16(RS2_H(i)) ? RS1_H(i) : RS2_H(i);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_maxu_sc_b.h b/riscv/insns/pv_maxu_sc_b.h
new file mode 100644
index 0000000000..c297b87ab2
--- /dev/null
+++ b/riscv/insns/pv_maxu_sc_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) > zext8(RS2_B(0)) ? RS1_B(i) : RS2_B(0);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_maxu_sc_h.h b/riscv/insns/pv_maxu_sc_h.h
new file mode 100644
index 0000000000..fbb5c7feb9
--- /dev/null
+++ b/riscv/insns/pv_maxu_sc_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) > zext16(RS2_H(0)) ? RS1_H(i) : RS2_H(0);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_maxu_sci_b.h b/riscv/insns/pv_maxu_sci_b.h
new file mode 100644
index 0000000000..ab5f6e5f98
--- /dev/null
+++ b/riscv/insns/pv_maxu_sci_b.h
@@ -0,0 +1,10 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) > insn.p_zimm6() ? RS1_B(i) : insn.p_zimm6();
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
+
diff --git a/riscv/insns/pv_maxu_sci_h.h b/riscv/insns/pv_maxu_sci_h.h
new file mode 100644
index 0000000000..9aaf9effc6
--- /dev/null
+++ b/riscv/insns/pv_maxu_sci_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) > insn.p_zimm6() ? RS1_H(i) : insn.p_zimm6();
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_min_b.h b/riscv/insns/pv_min_b.h
new file mode 100644
index 0000000000..1b9104b55f
--- /dev/null
+++ b/riscv/insns/pv_min_b.h
@@ -0,0 +1,9 @@
+int8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = sext8(RS1_B(i)) <= sext8(RS2_B(i)) ? RS1_B(i) : RS2_B(i);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_min_h.h b/riscv/insns/pv_min_h.h
new file mode 100644
index 0000000000..bbc83caea8
--- /dev/null
+++ b/riscv/insns/pv_min_h.h
@@ -0,0 +1,9 @@
+int16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = sext16(RS1_H(i)) <= sext16(RS2_H(i)) ? RS1_H(i) : RS2_H(i);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_min_sc_b.h b/riscv/insns/pv_min_sc_b.h
new file mode 100644
index 0000000000..1d2aac507b
--- /dev/null
+++ b/riscv/insns/pv_min_sc_b.h
@@ -0,0 +1,9 @@
+int8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = sext8(RS1_B(i)) <= sext8(RS2_B(0)) ? RS1_B(i) : RS2_B(0);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_min_sc_h.h b/riscv/insns/pv_min_sc_h.h
new file mode 100644
index 0000000000..b2b8ab1109
--- /dev/null
+++ b/riscv/insns/pv_min_sc_h.h
@@ -0,0 +1,9 @@
+int16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = sext16(RS1_H(i)) <= sext16(RS2_H(0)) ? RS1_H(i) : RS2_H(0);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_min_sci_b.h b/riscv/insns/pv_min_sci_b.h
new file mode 100644
index 0000000000..031b51f362
--- /dev/null
+++ b/riscv/insns/pv_min_sci_b.h
@@ -0,0 +1,9 @@
+int8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = sext8(RS1_B(i)) <= insn.p_simm6() ? RS1_B(i) : insn.p_simm6();
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_min_sci_h.h b/riscv/insns/pv_min_sci_h.h
new file mode 100644
index 0000000000..d007e06620
--- /dev/null
+++ b/riscv/insns/pv_min_sci_h.h
@@ -0,0 +1,9 @@
+int16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = sext16(RS1_H(i)) <= insn.p_simm6() ? RS1_H(i) : insn.p_simm6();
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_minu_b.h b/riscv/insns/pv_minu_b.h
new file mode 100644
index 0000000000..bbb92ca55a
--- /dev/null
+++ b/riscv/insns/pv_minu_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) <= zext8(RS2_B(i)) ? RS1_B(i) : RS2_B(i);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_minu_h.h b/riscv/insns/pv_minu_h.h
new file mode 100644
index 0000000000..fa7b0a4e21
--- /dev/null
+++ b/riscv/insns/pv_minu_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) <= zext16(RS2_H(i)) ? RS1_H(i) : RS2_H(i);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_minu_sc_b.h b/riscv/insns/pv_minu_sc_b.h
new file mode 100644
index 0000000000..566bcce6da
--- /dev/null
+++ b/riscv/insns/pv_minu_sc_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) <= zext8(RS2_B(0)) ? RS1_B(i) : RS2_B(0);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_minu_sc_h.h b/riscv/insns/pv_minu_sc_h.h
new file mode 100644
index 0000000000..7471d96780
--- /dev/null
+++ b/riscv/insns/pv_minu_sc_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) <= zext16(RS2_H(0)) ? RS1_H(i) : RS2_H(0);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_minu_sci_b.h b/riscv/insns/pv_minu_sci_b.h
new file mode 100644
index 0000000000..75c43787c3
--- /dev/null
+++ b/riscv/insns/pv_minu_sci_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) <= insn.p_zimm6() ? RS1_B(i) : insn.p_zimm6();
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_minu_sci_h.h b/riscv/insns/pv_minu_sci_h.h
new file mode 100644
index 0000000000..c665e92f40
--- /dev/null
+++ b/riscv/insns/pv_minu_sci_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) <= insn.p_zimm6() ? RS1_H(i) : insn.p_zimm6();
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_or_b.h b/riscv/insns/pv_or_b.h
new file mode 100644
index 0000000000..d27a6e5d17
--- /dev/null
+++ b/riscv/insns/pv_or_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = RS1_B(i) | RS2_B(i);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_or_h.h b/riscv/insns/pv_or_h.h
new file mode 100644
index 0000000000..65b112893e
--- /dev/null
+++ b/riscv/insns/pv_or_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = RS1_H(i) | RS2_H(i);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_or_sc_b.h b/riscv/insns/pv_or_sc_b.h
new file mode 100644
index 0000000000..cac508744b
--- /dev/null
+++ b/riscv/insns/pv_or_sc_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = RS1_B(i) | RS2_B(0);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_or_sc_h.h b/riscv/insns/pv_or_sc_h.h
new file mode 100644
index 0000000000..e6f567cf3b
--- /dev/null
+++ b/riscv/insns/pv_or_sc_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = RS1_H(i) | RS2_H(0);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_or_sci_b.h b/riscv/insns/pv_or_sci_b.h
new file mode 100644
index 0000000000..0cb7b5cb6c
--- /dev/null
+++ b/riscv/insns/pv_or_sci_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = RS1_B(i) | insn.p_simm6();
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_or_sci_h.h b/riscv/insns/pv_or_sci_h.h
new file mode 100644
index 0000000000..e95922e1eb
--- /dev/null
+++ b/riscv/insns/pv_or_sci_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = RS1_H(i) | insn.p_simm6();
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_pack.h b/riscv/insns/pv_pack.h
new file mode 100644
index 0000000000..a7c9b69e5b
--- /dev/null
+++ b/riscv/insns/pv_pack.h
@@ -0,0 +1,7 @@
+// rd = RS1_H(0) shifted up by 16 bits, OR-ed with RS2_H(0).
+const reg_t upper = RS1_H(0);
+const reg_t lower = RS2_H(0);
+const reg_t packed = (upper << 16) | lower;
+
+WRITE_RD(sext_xlen(packed));
+
diff --git a/riscv/insns/pv_pack_h.h b/riscv/insns/pv_pack_h.h
new file mode 100644
index 0000000000..7ad94faa9d
--- /dev/null
+++ b/riscv/insns/pv_pack_h.h
@@ -0,0 +1,7 @@
+// rd = RS1_H(1) shifted up by 16 bits, OR-ed with RS2_H(1).
+const reg_t upper = RS1_H(1);
+const reg_t lower = RS2_H(1);
+const reg_t packed = (upper << 16) | lower;
+
+WRITE_RD(sext_xlen(packed));
+
diff --git a/riscv/insns/pv_packhi_b.h b/riscv/insns/pv_packhi_b.h
new file mode 100644
index 0000000000..b8a7a46c8d
--- /dev/null
+++ b/riscv/insns/pv_packhi_b.h
@@ -0,0 +1,8 @@
+// Keep the low 16 bits of RD; place RS1_B(0) at bit 24 and RS2_B(0) at bit 16.
+const reg_t byte_hi = RS1_B(0);
+const reg_t byte_lo = RS2_B(0);
+const reg_t kept = RD & 0x0FFFF;
+const reg_t merged = kept | (byte_hi << 24) | (byte_lo << 16);
+
+WRITE_RD(sext_xlen(merged));
+
diff --git a/riscv/insns/pv_packlo_b.h b/riscv/insns/pv_packlo_b.h
new file mode 100644
index 0000000000..4606fe1f01
--- /dev/null
+++ b/riscv/insns/pv_packlo_b.h
@@ -0,0 +1,8 @@
+// Keep bits [31:16] of RD; place RS1_B(0) at bit 8 and RS2_B(0) at bit 0.
+const reg_t byte_hi = RS1_B(0);
+const reg_t byte_lo = RS2_B(0);
+const reg_t kept = RD & 0xFFFF0000;
+const reg_t merged = kept | (byte_hi << 8) | byte_lo;
+
+WRITE_RD(sext_xlen(merged));
+
diff --git a/riscv/insns/pv_sdotsp_b.h b/riscv/insns/pv_sdotsp_b.h
new file mode 100644
index 0000000000..812e3d4369
--- /dev/null
+++ b/riscv/insns/pv_sdotsp_b.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/8 - 1; i >= 0; i--)
+  acc += sext8(RS1_B(i)) * sext8(RS2_B(i));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotsp_h.h b/riscv/insns/pv_sdotsp_h.h
new file mode 100644
index 0000000000..9ccfae939d
--- /dev/null
+++ b/riscv/insns/pv_sdotsp_h.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/16 - 1; i >= 0; i--)
+  acc += sext16(RS1_H(i)) * sext16(RS2_H(i));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotsp_sc_b.h b/riscv/insns/pv_sdotsp_sc_b.h
new file mode 100644
index 0000000000..e665a669f2
--- /dev/null
+++ b/riscv/insns/pv_sdotsp_sc_b.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/8 - 1; i >= 0; i--)
+  acc += sext8(RS1_B(i)) * sext8(RS2_B(0));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotsp_sc_h.h b/riscv/insns/pv_sdotsp_sc_h.h
new file mode 100644
index 0000000000..fa1ca93fee
--- /dev/null
+++ b/riscv/insns/pv_sdotsp_sc_h.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/16 - 1; i >= 0; i--)
+  acc += sext16(RS1_H(i)) * sext16(RS2_H(0));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotsp_sci_b.h b/riscv/insns/pv_sdotsp_sci_b.h
new file mode 100644
index 0000000000..31aab1fe54
--- /dev/null
+++ b/riscv/insns/pv_sdotsp_sci_b.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/8 - 1; i >= 0; i--)
+  acc += sext8(RS1_B(i)) * insn.p_simm6();
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotsp_sci_h.h b/riscv/insns/pv_sdotsp_sci_h.h
new file mode 100644
index 0000000000..151d16a2e9
--- /dev/null
+++ b/riscv/insns/pv_sdotsp_sci_h.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/16 - 1; i >= 0; i--)
+  acc += sext16(RS1_H(i)) * insn.p_simm6();
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotup_b.h b/riscv/insns/pv_sdotup_b.h
new file mode 100644
index 0000000000..82e47b4f82
--- /dev/null
+++ b/riscv/insns/pv_sdotup_b.h
@@ -0,0 +1,6 @@
+uint32_t acc = RD;
+
+for(int i = xlen/8 - 1; i >= 0; i--)
+  acc += zext8(RS1_B(i)) * zext8(RS2_B(i));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotup_h.h b/riscv/insns/pv_sdotup_h.h
new file mode 100644
index 0000000000..de77009a07
--- /dev/null
+++ b/riscv/insns/pv_sdotup_h.h
@@ -0,0 +1,6 @@
+uint32_t acc = RD;
+
+for(int i = xlen/16 - 1; i >= 0; i--)
+  acc += zext16(RS1_H(i)) * zext16(RS2_H(i));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotup_sc_b.h b/riscv/insns/pv_sdotup_sc_b.h
new file mode 100644
index 0000000000..717fffc119
--- /dev/null
+++ b/riscv/insns/pv_sdotup_sc_b.h
@@ -0,0 +1,6 @@
+uint32_t acc = RD;
+
+for(int i = xlen/8 - 1; i >= 0; i--)
+  acc += zext8(RS1_B(i)) * zext8(RS2_B(0));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotup_sc_h.h b/riscv/insns/pv_sdotup_sc_h.h
new file mode 100644
index 0000000000..ecf0485664
--- /dev/null
+++ b/riscv/insns/pv_sdotup_sc_h.h
@@ -0,0 +1,6 @@
+uint32_t acc = RD;
+
+for(int i = xlen/16 - 1; i >= 0; i--)
+  acc += zext16(RS1_H(i)) * zext16(RS2_H(0));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotup_sci_b.h b/riscv/insns/pv_sdotup_sci_b.h
new file mode 100644
index 0000000000..bd4d850e6d
--- /dev/null
+++ b/riscv/insns/pv_sdotup_sci_b.h
@@ -0,0 +1,6 @@
+uint32_t acc = RD;
+
+for(int i = xlen/8 - 1; i >= 0; i--)
+  acc += zext8(RS1_B(i)) * insn.p_zimm6();
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotup_sci_h.h b/riscv/insns/pv_sdotup_sci_h.h
new file mode 100644
index 0000000000..145e73717f
--- /dev/null
+++ b/riscv/insns/pv_sdotup_sci_h.h
@@ -0,0 +1,6 @@
+uint32_t acc = RD;
+
+for(int i = xlen/16 - 1; i >= 0; i--)
+  acc += zext16(RS1_H(i)) * insn.p_zimm6();
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotusp_b.h b/riscv/insns/pv_sdotusp_b.h
new file mode 100644
index 0000000000..05d268ed21
--- /dev/null
+++ b/riscv/insns/pv_sdotusp_b.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/8 - 1; i >= 0; i--)
+  acc += sreg_t(zext8(RS1_B(i))) * sext8(RS2_B(i));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotusp_h.h b/riscv/insns/pv_sdotusp_h.h
new file mode 100644
index 0000000000..fdc550db1c
--- /dev/null
+++ b/riscv/insns/pv_sdotusp_h.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/16 - 1; i >= 0; i--)
+  acc += sreg_t(zext16(RS1_H(i))) * sext16(RS2_H(i));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotusp_sc_b.h b/riscv/insns/pv_sdotusp_sc_b.h
new file mode 100644
index 0000000000..2840cd1483
--- /dev/null
+++ b/riscv/insns/pv_sdotusp_sc_b.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/8 - 1; i >= 0; i--)
+  acc += sreg_t(zext8(RS1_B(i))) * sext8(RS2_B(0));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotusp_sc_h.h b/riscv/insns/pv_sdotusp_sc_h.h
new file mode 100644
index 0000000000..ca4c25ac10
--- /dev/null
+++ b/riscv/insns/pv_sdotusp_sc_h.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/16 - 1; i >= 0; i--)
+  acc += sreg_t(zext16(RS1_H(i))) * sext16(RS2_H(0));
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotusp_sci_b.h b/riscv/insns/pv_sdotusp_sci_b.h
new file mode 100644
index 0000000000..d6823f83a9
--- /dev/null
+++ b/riscv/insns/pv_sdotusp_sci_b.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/8 - 1; i >= 0; i--)
+  acc += sreg_t(zext8(RS1_B(i))) * insn.p_simm6();
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_sdotusp_sci_h.h b/riscv/insns/pv_sdotusp_sci_h.h
new file mode 100644
index 0000000000..42c4fbe88b
--- /dev/null
+++ b/riscv/insns/pv_sdotusp_sci_h.h
@@ -0,0 +1,6 @@
+int32_t acc = RD;
+
+for(int i = xlen/16 - 1; i >= 0; i--)
+  acc += sreg_t(zext16(RS1_H(i))) * insn.p_simm6();
+
+WRITE_RD(sext_xlen(acc));
diff --git a/riscv/insns/pv_shuffle2_b.h b/riscv/insns/pv_shuffle2_b.h
new file mode 100644
index 0000000000..8dd4e99945
--- /dev/null
+++ b/riscv/insns/pv_shuffle2_b.h
@@ -0,0 +1,14 @@
+uint8_t src_sel; // select rd or rs1 as source (bit [2] of second operand)
+uint8_t byte_sel; // select which byte from source (bits [1:0] of second operand)
+uint8_t source;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  byte_sel = RS2_B(i) & 0x03; // bits [1:0] of RS2_B(i)
+  src_sel = (RS2_B(i) >> 2) & 0x01; // bit [2] of RS2_B(i)
+  source = src_sel ? RS1_B(byte_sel) : RD_B(byte_sel);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)source & 0x000000FF;
+}
+
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_shuffle2_h.h b/riscv/insns/pv_shuffle2_h.h
new file mode 100644
index 0000000000..362a4bdc77
--- /dev/null
+++ b/riscv/insns/pv_shuffle2_h.h
@@ -0,0 +1,14 @@
+uint8_t src_sel; // select rd or rs1 as source (bit [1] of second operand)
+uint8_t half_sel; // select which half from source (bit [0] of second operand)
+uint16_t source;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  half_sel = RS2_H(i) & 0x01; // bit [0] of RS2_H(i)
+  src_sel = (RS2_H(i) >> 1) & 0x01; // bit [1] of RS2_H(i)
+  source = src_sel ? RS1_H(half_sel) : RD_H(half_sel);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)source & 0x0000FFFF;
+}
+
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_shuffle_b.h b/riscv/insns/pv_shuffle_b.h
new file mode 100644
index 0000000000..efd7790c81
--- /dev/null
+++ b/riscv/insns/pv_shuffle_b.h
@@ -0,0 +1,16 @@
+// Byte permutation of RS1 steered by RS2: for each of the four result bytes,
+// the low two bits of the matching RS2 byte give the RS1 byte index to copy.
+const reg_t control = RS2;
+const reg_t idx3 = (control >> 24) % 4;
+const reg_t idx2 = (control >> 16) % 4;
+const reg_t idx1 = (control >> 8) % 4;
+const reg_t idx0 = control % 4;
+
+const reg_t top = RS1_B(idx3);
+const reg_t upper_mid = RS1_B(idx2);
+const reg_t lower_mid = RS1_B(idx1);
+const reg_t bottom = RS1_B(idx0);
+
+const reg_t shuffled = (top << 24) | (upper_mid << 16) | (lower_mid << 8) | bottom;
+WRITE_RD(sext_xlen(shuffled));
+
diff --git a/riscv/insns/pv_shuffle_h.h b/riscv/insns/pv_shuffle_h.h
new file mode 100644
index 0000000000..3920c9f619
--- /dev/null
+++ b/riscv/insns/pv_shuffle_h.h
@@ -0,0 +1,11 @@
+// Half-word permutation of RS1: bit 16 of RS2 picks the RS1 half for the
+// upper result half, bit 0 of RS2 picks the RS1 half for the lower one.
+const reg_t control = RS2;
+const reg_t idx_hi = (control >> 16) % 2;
+const reg_t idx_lo = control % 2;
+
+const reg_t half_hi = RS1_H(idx_hi);
+const reg_t half_lo = RS1_H(idx_lo);
+const reg_t shuffled = (half_hi << 16) | half_lo;
+WRITE_RD(sext_xlen(shuffled));
+
diff --git a/riscv/insns/pv_shuffle_sci_h.h b/riscv/insns/pv_shuffle_sci_h.h
new file mode 100644
index 0000000000..2358492738
--- /dev/null
+++ b/riscv/insns/pv_shuffle_sci_h.h
@@ -0,0 +1,11 @@
+// Half-word permutation of RS1 by immediate: bit 1 of insn.p_zimm6() picks the
+// RS1 half for the upper result half, bit 0 picks the half for the lower one.
+const reg_t control = insn.p_zimm6();
+const reg_t idx_hi = (control >> 1) % 2;
+const reg_t idx_lo = control % 2;
+
+const reg_t half_hi = RS1_H(idx_hi);
+const reg_t half_lo = RS1_H(idx_lo);
+const reg_t shuffled = (half_hi << 16) | half_lo;
+WRITE_RD(sext_xlen(shuffled));
+
diff --git a/riscv/insns/pv_shufflei0_sci_b.h b/riscv/insns/pv_shufflei0_sci_b.h
new file mode 100644
index 0000000000..a1524c8d46
--- /dev/null
+++ b/riscv/insns/pv_shufflei0_sci_b.h
@@ -0,0 +1,15 @@
+// Immediate byte shuffle: result byte 3 is always RS1_B(0); bytes 2..0 are
+// picked from RS1 by the 2-bit fields [5:4], [3:2] and [1:0] of the immediate.
+const reg_t control = insn.p_zimm6();
+const reg_t idx2 = (control >> 4) % 4;
+const reg_t idx1 = (control >> 2) % 4;
+const reg_t idx0 = control % 4;
+
+const reg_t top = RS1_B(0);
+const reg_t upper_mid = RS1_B(idx2);
+const reg_t lower_mid = RS1_B(idx1);
+const reg_t bottom = RS1_B(idx0);
+
+const reg_t shuffled = (top << 24) | (upper_mid << 16) | (lower_mid << 8) | bottom;
+WRITE_RD(sext_xlen(shuffled));
+
diff --git a/riscv/insns/pv_shufflei1_sci_b.h b/riscv/insns/pv_shufflei1_sci_b.h
new file mode 100644
index 0000000000..5ccb2c7549
--- /dev/null
+++ b/riscv/insns/pv_shufflei1_sci_b.h
@@ -0,0 +1,15 @@
+// Immediate byte shuffle: result byte 3 is always RS1_B(1); bytes 2..0 are
+// picked from RS1 by the 2-bit fields [5:4], [3:2] and [1:0] of the immediate.
+const reg_t control = insn.p_zimm6();
+const reg_t idx2 = (control >> 4) % 4;
+const reg_t idx1 = (control >> 2) % 4;
+const reg_t idx0 = control % 4;
+
+const reg_t top = RS1_B(1);
+const reg_t upper_mid = RS1_B(idx2);
+const reg_t lower_mid = RS1_B(idx1);
+const reg_t bottom = RS1_B(idx0);
+
+const reg_t shuffled = (top << 24) | (upper_mid << 16) | (lower_mid << 8) | bottom;
+WRITE_RD(sext_xlen(shuffled));
+
diff --git a/riscv/insns/pv_shufflei2_sci_b.h b/riscv/insns/pv_shufflei2_sci_b.h
new file mode 100644
index 0000000000..a84166929d
--- /dev/null
+++ b/riscv/insns/pv_shufflei2_sci_b.h
@@ -0,0 +1,15 @@
+// Immediate byte shuffle: result byte 3 is always RS1_B(2); bytes 2..0 are
+// picked from RS1 by the 2-bit fields [5:4], [3:2] and [1:0] of the immediate.
+const reg_t control = insn.p_zimm6();
+const reg_t idx2 = (control >> 4) % 4;
+const reg_t idx1 = (control >> 2) % 4;
+const reg_t idx0 = control % 4;
+
+const reg_t top = RS1_B(2);
+const reg_t upper_mid = RS1_B(idx2);
+const reg_t lower_mid = RS1_B(idx1);
+const reg_t bottom = RS1_B(idx0);
+
+const reg_t shuffled = (top << 24) | (upper_mid << 16) | (lower_mid << 8) | bottom;
+WRITE_RD(sext_xlen(shuffled));
+
diff --git a/riscv/insns/pv_shufflei3_sci_b.h b/riscv/insns/pv_shufflei3_sci_b.h
new file mode 100644
index 0000000000..89c1d91132
--- /dev/null
+++ b/riscv/insns/pv_shufflei3_sci_b.h
@@ -0,0 +1,15 @@
+// Immediate byte shuffle: result byte 3 is always RS1_B(3); bytes 2..0 are
+// picked from RS1 by the 2-bit fields [5:4], [3:2] and [1:0] of the immediate.
+const reg_t control = insn.p_zimm6();
+const reg_t idx2 = (control >> 4) % 4;
+const reg_t idx1 = (control >> 2) % 4;
+const reg_t idx0 = control % 4;
+
+const reg_t top = RS1_B(3);
+const reg_t upper_mid = RS1_B(idx2);
+const reg_t lower_mid = RS1_B(idx1);
+const reg_t bottom = RS1_B(idx0);
+
+const reg_t shuffled = (top << 24) | (upper_mid << 16) | (lower_mid << 8) | bottom;
+WRITE_RD(sext_xlen(shuffled));
+
diff --git a/riscv/insns/pv_sll_b.h b/riscv/insns/pv_sll_b.h
new file mode 100644
index 0000000000..ca8bcd6883
--- /dev/null
+++ b/riscv/insns/pv_sll_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) << (zext8(RS2_B(i)) & 0x07);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sll_h.h b/riscv/insns/pv_sll_h.h
new file mode 100644
index 0000000000..cb9200caca
--- /dev/null
+++ b/riscv/insns/pv_sll_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) << (zext16(RS2_H(i)) & 0x0F);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sll_sc_b.h b/riscv/insns/pv_sll_sc_b.h
new file mode 100644
index 0000000000..d320519982
--- /dev/null
+++ b/riscv/insns/pv_sll_sc_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) << (zext8(RS2_B(0)) & 0x07);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sll_sc_h.h b/riscv/insns/pv_sll_sc_h.h
new file mode 100644
index 0000000000..e84cf0214a
--- /dev/null
+++ b/riscv/insns/pv_sll_sc_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) << (zext16(RS2_H(0)) & 0x0F);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sll_sci_b.h b/riscv/insns/pv_sll_sci_b.h
new file mode 100644
index 0000000000..8e637bea82
--- /dev/null
+++ b/riscv/insns/pv_sll_sci_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) << (insn.p_simm6() & 0x07);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sll_sci_h.h b/riscv/insns/pv_sll_sci_h.h
new file mode 100644
index 0000000000..ec94a2e288
--- /dev/null
+++ b/riscv/insns/pv_sll_sci_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) << (insn.p_simm6() & 0x0F);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sra_b.h b/riscv/insns/pv_sra_b.h
new file mode 100644
index 0000000000..9525a0afc1
--- /dev/null
+++ b/riscv/insns/pv_sra_b.h
@@ -0,0 +1,9 @@
+int8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = sext8(RS1_B(i)) >> (zext8(RS2_B(i)) & 0x07);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sra_h.h b/riscv/insns/pv_sra_h.h
new file mode 100644
index 0000000000..b3e8a0b944
--- /dev/null
+++ b/riscv/insns/pv_sra_h.h
@@ -0,0 +1,9 @@
+int16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = sext16(RS1_H(i)) >> (zext16(RS2_H(i)) & 0x0F);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sra_sc_b.h b/riscv/insns/pv_sra_sc_b.h
new file mode 100644
index 0000000000..9442d92804
--- /dev/null
+++ b/riscv/insns/pv_sra_sc_b.h
@@ -0,0 +1,9 @@
+int8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = sext8(RS1_B(i)) >> (zext8(RS2_B(0)) & 0x07);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sra_sc_h.h b/riscv/insns/pv_sra_sc_h.h
new file mode 100644
index 0000000000..1e012f750e
--- /dev/null
+++ b/riscv/insns/pv_sra_sc_h.h
@@ -0,0 +1,9 @@
+int16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = sext16(RS1_H(i)) >> (zext16(RS2_H(0)) & 0x0F);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sra_sci_b.h b/riscv/insns/pv_sra_sci_b.h
new file mode 100644
index 0000000000..3dafb3cb51
--- /dev/null
+++ b/riscv/insns/pv_sra_sci_b.h
@@ -0,0 +1,9 @@
+int8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = sext8(RS1_B(i)) >> (insn.p_simm6() & 0x07);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sra_sci_h.h b/riscv/insns/pv_sra_sci_h.h
new file mode 100644
index 0000000000..4f56d0e5ee
--- /dev/null
+++ b/riscv/insns/pv_sra_sci_h.h
@@ -0,0 +1,9 @@
+int16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = sext16(RS1_H(i)) >> (insn.p_simm6() & 0x0F);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_srl_b.h b/riscv/insns/pv_srl_b.h
new file mode 100644
index 0000000000..37be2e23aa
--- /dev/null
+++ b/riscv/insns/pv_srl_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) >> (zext8(RS2_B(i)) & 0x07);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_srl_h.h b/riscv/insns/pv_srl_h.h
new file mode 100644
index 0000000000..1b35116d3b
--- /dev/null
+++ b/riscv/insns/pv_srl_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) >> (zext16(RS2_H(i)) & 0x0F);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_srl_sc_b.h b/riscv/insns/pv_srl_sc_b.h
new file mode 100644
index 0000000000..4b04ab6f74
--- /dev/null
+++ b/riscv/insns/pv_srl_sc_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) >> (zext8(RS2_B(0)) & 0x07);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_srl_sc_h.h b/riscv/insns/pv_srl_sc_h.h
new file mode 100644
index 0000000000..f49f784db8
--- /dev/null
+++ b/riscv/insns/pv_srl_sc_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) >> (zext16(RS2_H(0)) & 0x0F);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_srl_sci_b.h b/riscv/insns/pv_srl_sci_b.h
new file mode 100644
index 0000000000..b0b38f2a90
--- /dev/null
+++ b/riscv/insns/pv_srl_sci_b.h
@@ -0,0 +1,9 @@
+uint8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = zext8(RS1_B(i)) >> (insn.p_simm6() & 0x07);
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_srl_sci_h.h b/riscv/insns/pv_srl_sci_h.h
new file mode 100644
index 0000000000..5aba29cc91
--- /dev/null
+++ b/riscv/insns/pv_srl_sci_h.h
@@ -0,0 +1,9 @@
+uint16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = zext16(RS1_H(i)) >> (insn.p_simm6() & 0x0F);
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sub_b.h b/riscv/insns/pv_sub_b.h
new file mode 100644
index 0000000000..2ce1fe224f
--- /dev/null
+++ b/riscv/insns/pv_sub_b.h
@@ -0,0 +1,9 @@
+int8_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/8 - 1; i >= 0; i--){
+  temp = sext8(RS1_B(i)) - sext8(RS2_B(i));
+  simd_rd <<= 8;
+  simd_rd += (uint32_t)temp & 0x000000FF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sub_h.h b/riscv/insns/pv_sub_h.h
new file mode 100644
index 0000000000..4ec5137268
--- /dev/null
+++ b/riscv/insns/pv_sub_h.h
@@ -0,0 +1,9 @@
+int16_t temp;
+uint32_t simd_rd = 0;
+
+for(int i = xlen/16 - 1; i >= 0; i--){
+  temp = sext16(RS1_H(i)) - sext16(RS2_H(i));
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/riscv/insns/pv_sub_h_div2.h b/riscv/insns/pv_sub_h_div2.h
new file mode 100644
index 0000000000..a5ea860a0a
--- /dev/null
+++ b/riscv/insns/pv_sub_h_div2.h
@@ -0,0 +1,18 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+
+// Halfword-wise (RS1 - RS2) >> 1, computed through the simd_reg .h[] view.
+union simd_reg op1 = {.reg = RS1};
+union simd_reg op2 = {.reg = RS2};
+union simd_reg out;
+
+for (int lane = 0; lane < (64 / e16); ++lane)
+{
+    // The difference is narrowed to int16_t before shifting: shifting the
+    // promoted int expression directly would keep the overflow bit.
+    const int16_t diff = op1.h[lane] - op2.h[lane];
+    out.h[lane] = diff >> 1;
+}
+
+// The .sreg view of the result is written back, sign-extended to xlen.
+WRITE_RD(sext_xlen(out.sreg));
+
diff --git a/riscv/insns/pv_sub_h_div4.h b/riscv/insns/pv_sub_h_div4.h
new file mode 100644
index 0000000000..8615108286
--- /dev/null
+++ b/riscv/insns/pv_sub_h_div4.h
@@ -0,0 +1,18 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+// Halfword-wise (RS1 - RS2) >> 2, computed through the simd_reg .h[] view.
+union simd_reg src1 = {.reg = RS1};
+union simd_reg src2 = {.reg = RS2};
+union simd_reg res;
+
+int16_t temp;
+
+for(int i=0; i<(64/e16); i++)
+{
+    // (src1.h[i] - src2.h[i]) >> 2 doesn't work as shift
+    // will be performed in int32 (keeping the overflow of the subtraction)
+    temp = src1.h[i] - src2.h[i];
+    res.h[i] = temp >> 2;
+}
+
+WRITE_RD(sext_xlen(res.sreg));
+
diff --git a/riscv/insns/pv_sub_h_div8.h b/riscv/insns/pv_sub_h_div8.h
new file mode 100644
index 0000000000..027ea66f45
--- /dev/null
+++ b/riscv/insns/pv_sub_h_div8.h
@@ -0,0 +1,18 @@
+// Todo: explicit use of SIMD insns (ie xsimd lib) would be better
+
+// Halfword-wise (RS1 - RS2) >> 3, computed through the simd_reg .h[] view.
+union simd_reg op1 = {.reg = RS1};
+union simd_reg op2 = {.reg = RS2};
+union simd_reg out;
+
+for (int lane = 0; lane < (64 / e16); ++lane)
+{
+    // The difference is narrowed to int16_t before shifting: shifting the
+    // promoted int expression directly would keep the overflow bit.
+    const int16_t diff = op1.h[lane] - op2.h[lane];
+    out.h[lane] = diff >> 3;
+}
+
+// The .sreg view of the result is written back, sign-extended to xlen.
+WRITE_RD(sext_xlen(out.sreg));
+
diff --git a/riscv/insns/pv_sub_sc_b.h b/riscv/insns/pv_sub_sc_b.h
new file mode 100644
index 0000000000..3375e64c24
--- /dev/null
+++ b/riscv/insns/pv_sub_sc_b.h
@@ -0,0 +1,9 @@
+// Per-lane 8-bit subtract of one scalar: sext8(RS1_B(lane)) - sext8(RS2_B(0)),
+// truncated to 8 bits and packed into rd.
+uint32_t packed = 0;
+for (int lane = xlen / 8 - 1; lane >= 0; --lane) {
+  // Every lane uses element 0 of rs2 as the subtrahend.
+  const int8_t diff = sext8(RS1_B(lane)) - sext8(RS2_B(0));
+  packed = (packed << 8) | ((uint32_t)diff & 0xFFu);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_sub_sc_h.h b/riscv/insns/pv_sub_sc_h.h
new file mode 100644
index 0000000000..4bb12839c4
--- /dev/null
+++ b/riscv/insns/pv_sub_sc_h.h
@@ -0,0 +1,9 @@
+// Per-lane 16-bit subtract of one scalar: sext16(RS1_H(lane)) - sext16(RS2_H(0)),
+// truncated to 16 bits and packed into rd.
+uint32_t packed = 0;
+for (int lane = xlen / 16 - 1; lane >= 0; --lane) {
+  // Every lane uses element 0 of rs2 as the subtrahend.
+  const int16_t diff = sext16(RS1_H(lane)) - sext16(RS2_H(0));
+  packed = (packed << 16) | ((uint32_t)diff & 0xFFFFu);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_sub_sci_b.h b/riscv/insns/pv_sub_sci_b.h
new file mode 100644
index 0000000000..20cc941239
--- /dev/null
+++ b/riscv/insns/pv_sub_sci_b.h
@@ -0,0 +1,9 @@
+// Per-lane 8-bit subtract of an immediate: sext8(RS1_B(lane)) - insn.p_simm6(),
+// truncated to 8 bits and packed into rd.
+uint32_t packed = 0;
+for (int lane = xlen / 8 - 1; lane >= 0; --lane) {
+  // Narrowing to int8_t discards any borrow out of the lane.
+  const int8_t diff = sext8(RS1_B(lane)) - insn.p_simm6();
+  packed = (packed << 8) | ((uint32_t)diff & 0xFFu);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_sub_sci_h.h b/riscv/insns/pv_sub_sci_h.h
new file mode 100644
index 0000000000..50b11a6653
--- /dev/null
+++ b/riscv/insns/pv_sub_sci_h.h
@@ -0,0 +1,9 @@
+// Per-lane 16-bit subtract of an immediate: sext16(RS1_H(lane)) - insn.p_simm6(),
+// truncated to 16 bits and packed into rd.
+uint32_t packed = 0;
+for (int lane = xlen / 16 - 1; lane >= 0; --lane) {
+  // Narrowing to int16_t discards any borrow out of the lane.
+  const int16_t diff = sext16(RS1_H(lane)) - insn.p_simm6();
+  packed = (packed << 16) | ((uint32_t)diff & 0xFFFFu);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_subrotmj_h.h b/riscv/insns/pv_subrotmj_h.h
new file mode 100644
index 0000000000..04289fcead
--- /dev/null
+++ b/riscv/insns/pv_subrotmj_h.h
@@ -0,0 +1,16 @@
+// Complex halfword subtract rotated by -j: the low half of rd receives
+// (imag1 - imag2) and the high half receives (real2 - real1).
+const reg_t op1 = RS1;
+const reg_t op2 = RS2;
+
+// Bits 15:0 are read as the real part, bits 31:16 as the imaginary part.
+const int16_t re1 = op1;
+const int16_t im1 = (op1 >> 16);
+const int16_t re2 = op2;
+const int16_t im2 = (op2 >> 16);
+
+// Narrowing to uint16_t keeps each result confined to its own half.
+const uint16_t lo = im1 - im2;
+const uint16_t hi = re2 - re1;
+WRITE_RD(sext_xlen((reg_t)((hi << 16) | lo)));
+
diff --git a/riscv/insns/pv_subrotmj_h_div2.h b/riscv/insns/pv_subrotmj_h_div2.h
new file mode 100644
index 0000000000..3e2d698e3d
--- /dev/null
+++ b/riscv/insns/pv_subrotmj_h_div2.h
@@ -0,0 +1,16 @@
+// Complex halfword subtract rotated by -j, each part arithmetically shifted
+// right by 1: low half = (imag1 - imag2) >> 1, high half = (real2 - real1) >> 1.
+const reg_t op1 = RS1;
+const reg_t op2 = RS2;
+
+// Bits 15:0 are read as the real part, bits 31:16 as the imaginary part.
+const int16_t re1 = op1;
+const int16_t im1 = (op1 >> 16);
+const int16_t re2 = op2;
+const int16_t im2 = (op2 >> 16);
+
+// The difference is wrapped to int16_t first so the shift sees 16-bit overflow.
+const uint16_t lo = (int16_t)(im1 - im2) >> 1;
+const uint16_t hi = (int16_t)(re2 - re1) >> 1;
+WRITE_RD(sext_xlen((reg_t)((hi << 16) | lo)));
+
diff --git a/riscv/insns/pv_subrotmj_h_div4.h b/riscv/insns/pv_subrotmj_h_div4.h
new file mode 100644
index 0000000000..21e980869c
--- /dev/null
+++ b/riscv/insns/pv_subrotmj_h_div4.h
@@ -0,0 +1,16 @@
+// Complex halfword subtract rotated by -j, each part arithmetically shifted
+// right by 2: low half = (imag1 - imag2) >> 2, high half = (real2 - real1) >> 2.
+const reg_t op1 = RS1;
+const reg_t op2 = RS2;
+
+// Bits 15:0 are read as the real part, bits 31:16 as the imaginary part.
+const int16_t re1 = op1;
+const int16_t im1 = (op1 >> 16);
+const int16_t re2 = op2;
+const int16_t im2 = (op2 >> 16);
+
+// The difference is wrapped to int16_t first so the shift sees 16-bit overflow.
+const uint16_t lo = (int16_t)(im1 - im2) >> 2;
+const uint16_t hi = (int16_t)(re2 - re1) >> 2;
+WRITE_RD(sext_xlen((reg_t)((hi << 16) | lo)));
+
diff --git a/riscv/insns/pv_subrotmj_h_div8.h b/riscv/insns/pv_subrotmj_h_div8.h
new file mode 100644
index 0000000000..13babf74db
--- /dev/null
+++ b/riscv/insns/pv_subrotmj_h_div8.h
@@ -0,0 +1,16 @@
+// Complex halfword subtract rotated by -j, each part arithmetically shifted
+// right by 3: low half = (imag1 - imag2) >> 3, high half = (real2 - real1) >> 3.
+const reg_t op1 = RS1;
+const reg_t op2 = RS2;
+
+// Bits 15:0 are read as the real part, bits 31:16 as the imaginary part.
+const int16_t re1 = op1;
+const int16_t im1 = (op1 >> 16);
+const int16_t re2 = op2;
+const int16_t im2 = (op2 >> 16);
+
+// The difference is wrapped to int16_t first so the shift sees 16-bit overflow.
+const uint16_t lo = (int16_t)(im1 - im2) >> 3;
+const uint16_t hi = (int16_t)(re2 - re1) >> 3;
+WRITE_RD(sext_xlen((reg_t)((hi << 16) | lo)));
+
diff --git a/riscv/insns/pv_xor_b.h b/riscv/insns/pv_xor_b.h
new file mode 100644
index 0000000000..2fc203b4d6
--- /dev/null
+++ b/riscv/insns/pv_xor_b.h
@@ -0,0 +1,9 @@
+// Per-lane 8-bit XOR: RS1_B(lane) ^ RS2_B(lane), truncated to 8 bits
+// and packed into rd.
+uint32_t packed = 0;
+for (int lane = xlen / 8 - 1; lane >= 0; --lane) {
+  // Top lane first: later lanes push earlier results toward the upper bits.
+  const uint8_t mixed = RS1_B(lane) ^ RS2_B(lane);
+  packed = (packed << 8) | ((uint32_t)mixed & 0xFFu);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_xor_h.h b/riscv/insns/pv_xor_h.h
new file mode 100644
index 0000000000..56cf0b7c9a
--- /dev/null
+++ b/riscv/insns/pv_xor_h.h
@@ -0,0 +1,9 @@
+// Per-lane 16-bit XOR: RS1_H(lane) ^ RS2_H(lane), truncated to 16 bits
+// and packed into rd.
+uint32_t packed = 0;
+for (int lane = xlen / 16 - 1; lane >= 0; --lane) {
+  // Top lane first: later lanes push earlier results toward the upper bits.
+  const uint16_t mixed = RS1_H(lane) ^ RS2_H(lane);
+  packed = (packed << 16) | ((uint32_t)mixed & 0xFFFFu);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_xor_sc_b.h b/riscv/insns/pv_xor_sc_b.h
new file mode 100644
index 0000000000..ed3d5075ab
--- /dev/null
+++ b/riscv/insns/pv_xor_sc_b.h
@@ -0,0 +1,9 @@
+// Per-lane 8-bit XOR with one scalar: RS1_B(lane) ^ RS2_B(0), truncated to
+// 8 bits and packed into rd.
+uint32_t packed = 0;
+for (int lane = xlen / 8 - 1; lane >= 0; --lane) {
+  // Every lane is combined with element 0 of rs2.
+  const uint8_t mixed = RS1_B(lane) ^ RS2_B(0);
+  packed = (packed << 8) | ((uint32_t)mixed & 0xFFu);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_xor_sc_h.h b/riscv/insns/pv_xor_sc_h.h
new file mode 100644
index 0000000000..9d632f367b
--- /dev/null
+++ b/riscv/insns/pv_xor_sc_h.h
@@ -0,0 +1,9 @@
+// Per-lane 16-bit XOR with one scalar: RS1_H(lane) ^ RS2_H(0), truncated to
+// 16 bits and packed into rd.
+uint32_t packed = 0;
+for (int lane = xlen / 16 - 1; lane >= 0; --lane) {
+  // Every lane is combined with element 0 of rs2.
+  const uint16_t mixed = RS1_H(lane) ^ RS2_H(0);
+  packed = (packed << 16) | ((uint32_t)mixed & 0xFFFFu);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_xor_sci_b.h b/riscv/insns/pv_xor_sci_b.h
new file mode 100644
index 0000000000..7ecbf94fc4
--- /dev/null
+++ b/riscv/insns/pv_xor_sci_b.h
@@ -0,0 +1,9 @@
+// Per-lane 8-bit XOR with an immediate: RS1_B(lane) ^ insn.p_simm6(),
+// truncated to 8 bits and packed into rd.
+uint32_t packed = 0;
+for (int lane = xlen / 8 - 1; lane >= 0; --lane) {
+  // Narrowing to uint8_t keeps only the low 8 bits of the XOR.
+  const uint8_t mixed = RS1_B(lane) ^ insn.p_simm6();
+  packed = (packed << 8) | ((uint32_t)mixed & 0xFFu);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/pv_xor_sci_h.h b/riscv/insns/pv_xor_sci_h.h
new file mode 100644
index 0000000000..0a02ced601
--- /dev/null
+++ b/riscv/insns/pv_xor_sci_h.h
@@ -0,0 +1,9 @@
+// Per-lane 16-bit XOR with an immediate: RS1_H(lane) ^ insn.p_simm6(),
+// truncated to 16 bits and packed into rd.
+uint32_t packed = 0;
+for (int lane = xlen / 16 - 1; lane >= 0; --lane) {
+  // Narrowing to uint16_t keeps only the low 16 bits of the XOR.
+  const uint16_t mixed = RS1_H(lane) ^ insn.p_simm6();
+  packed = (packed << 16) | ((uint32_t)mixed & 0xFFFFu);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/riscv/insns/sc_d.h b/riscv/insns/sc_d.h
index aeeabd350d..54023ed449 100644
--- a/riscv/insns/sc_d.h
+++ b/riscv/insns/sc_d.h
@@ -1,11 +1,11 @@
 require_extension('A');
 require_rv64;
-if (MMU.check_load_reservation(RS1))
-{
+// sc.d: store RS2 to address RS1 only if the load reservation check passes.
+bool have_reservation = MMU.check_load_reservation(RS1, 8);  // 8: presumably the access size in bytes -- confirm
+
+if (have_reservation)  // the 64-bit store is performed only on success
   MMU.store_uint64(RS1, RS2);
-  WRITE_RD(0);
-}
-else
-  WRITE_RD(1);
 
 MMU.yield_load_reservation();
+// rd is written after the reservation is yielded, whether or not the store ran.
+WRITE_RD(!have_reservation);  // 0 when the store was performed, 1 otherwise
diff --git a/riscv/insns/sc_w.h b/riscv/insns/sc_w.h
index 4b4be50584..e430dcb2e5 100644
--- a/riscv/insns/sc_w.h
+++ b/riscv/insns/sc_w.h
@@ -1,10 +1,10 @@
 require_extension('A');
-if (MMU.check_load_reservation(RS1))
-{
+// sc.w: store RS2 to address RS1 only if the load reservation check passes.
+bool have_reservation = MMU.check_load_reservation(RS1, 4);  // 4: presumably the access size in bytes -- confirm
+
+if (have_reservation)  // the 32-bit store is performed only on success
   MMU.store_uint32(RS1, RS2);
-  WRITE_RD(0);
-}
-else
-  WRITE_RD(1);
 
 MMU.yield_load_reservation();
+// rd is written after the reservation is yielded, whether or not the store ran.
+WRITE_RD(!have_reservation);  // 0 when the store was performed, 1 otherwise
diff --git a/riscv/insns/sfence_vma.h b/riscv/insns/sfence_vma.h
index fc4625f0bf..ff949c7fad 100644
--- a/riscv/insns/sfence_vma.h
+++ b/riscv/insns/sfence_vma.h
@@ -1,2 +1,8 @@
-require_privilege(get_field(STATE.mstatus, MSTATUS_TVM) ? PRV_M : PRV_S);
+require_extension('S');  // sfence.vma: gate on the 'S' extension, then flush the TLB
+if (STATE.v) {  // STATE.v set: presumably executing in a virtualized mode -- confirm
+  if (STATE.prv == PRV_U || get_field(STATE.hstatus, HSTATUS_VTVM))  // U privilege, or hstatus.VTVM set
+    require_novirt();
+} else {
+  require_privilege(get_field(STATE.mstatus, MSTATUS_TVM) ? PRV_M : PRV_S);  // mstatus.TVM raises the requirement to M
+}
 MMU.flush_tlb();
diff --git a/riscv/insns/sret.h b/riscv/insns/sret.h
index ae841de93f..315f4f0ec2 100644
--- a/riscv/insns/sret.h
+++ b/riscv/insns/sret.h
@@ -1,9 +1,20 @@
-require_privilege(get_field(STATE.mstatus, MSTATUS_TSR) ? PRV_M : PRV_S);
-set_pc_and_serialize(p->get_state()->sepc);
+require_extension('S');  // sret: gate on the 'S' extension
+if (STATE.v) {  // STATE.v set: presumably executing in a virtualized mode -- confirm
+  if (STATE.prv == PRV_U || get_field(STATE.hstatus, HSTATUS_VTSR))  // U privilege, or hstatus.VTSR set
+    require_novirt();
+} else {
+  require_privilege(get_field(STATE.mstatus, MSTATUS_TSR) ? PRV_M : PRV_S);  // mstatus.TSR raises the requirement to M
+}
+reg_t next_pc = (STATE.v) ? p->get_state()->vsepc : p->get_state()->sepc;  // return address: vsepc when STATE.v, else sepc
+set_pc_and_serialize(next_pc);
 reg_t s = STATE.mstatus;
 reg_t prev_prv = get_field(s, MSTATUS_SPP);
 s = set_field(s, MSTATUS_SIE, get_field(s, MSTATUS_SPIE));
 s = set_field(s, MSTATUS_SPIE, 1);
 s = set_field(s, MSTATUS_SPP, PRV_U);
-p->set_privilege(prev_prv);
 p->set_csr(CSR_MSTATUS, s);
+p->set_privilege(prev_prv);  // now applied after the mstatus write-back (this hunk moves it below set_csr)
+if (!STATE.v) {  // only when not already in the STATE.v mode
+  reg_t prev_virt = get_field(STATE.hstatus, HSTATUS_SPV);  // hstatus.SPV supplies the mode to restore
+  p->set_virt(prev_virt);
+}
diff --git a/riscv/insns/vaadd_vv.h b/riscv/insns/vaadd_vv.h
new file mode 100644
index 0000000000..0a14467f61
--- /dev/null
+++ b/riscv/insns/vaadd_vv.h
@@ -0,0 +1,2 @@
+// vaadd.vv vd, vs2, vs1, vm -- signed averaging add, implemented by VI_VVX_LOOP_AVG
+VI_VVX_LOOP_AVG(vs1, +, true);  // operand vs1, operator +; the .vx form passes (rs1, +, false)
diff --git a/riscv/insns/vaadd_vx.h b/riscv/insns/vaadd_vx.h
new file mode 100644
index 0000000000..ae00d8e46c
--- /dev/null
+++ b/riscv/insns/vaadd_vx.h
@@ -0,0 +1,2 @@
+// vaadd.vx vd, vs2, rs1, vm -- signed averaging add, implemented by VI_VVX_LOOP_AVG
+VI_VVX_LOOP_AVG(rs1, +, false);  // operand rs1, operator +; the .vv form passes (vs1, +, true)
diff --git a/riscv/insns/vaaddu_vv.h b/riscv/insns/vaaddu_vv.h
new file mode 100644
index 0000000000..2f3fe745e7
--- /dev/null
+++ b/riscv/insns/vaaddu_vv.h
@@ -0,0 +1,2 @@
+// vaaddu.vv vd, vs2, vs1, vm -- unsigned averaging add, implemented by VI_VVX_ULOOP_AVG
+VI_VVX_ULOOP_AVG(vs1, +, true);  // operand vs1, operator +; the .vx form passes (rs1, +, false)
diff --git a/riscv/insns/vaaddu_vx.h b/riscv/insns/vaaddu_vx.h
new file mode 100644
index 0000000000..0e9fddcb1b
--- /dev/null
+++ b/riscv/insns/vaaddu_vx.h
@@ -0,0 +1,2 @@
+// vaaddu.vx vd, vs2, rs1, vm -- unsigned averaging add, implemented by VI_VVX_ULOOP_AVG
+VI_VVX_ULOOP_AVG(rs1, +, false);  // operand rs1, operator +; the .vv form passes (vs1, +, true)
diff --git a/riscv/insns/vadc_vim.h b/riscv/insns/vadc_vim.h
new file mode 100644
index 0000000000..824fac970e
--- /dev/null
+++ b/riscv/insns/vadc_vim.h
@@ -0,0 +1,10 @@
+// vadc.vim vd, vs2, simm5, v0 -- vd = vs2 + simm5 + carry-in bit taken from v0
+VI_XI_LOOP_WITH_CARRY
+({
+  auto &v0 = P.VU.elt<uint64_t>(0, midx);  // 64-bit word midx of vector register 0 (midx/mpos are not declared here)
+  const uint128_t op_mask = (UINT64_MAX >> (64 - sew));  // low sew bits set
+  uint64_t carry = (v0 >> mpos) & 0x1;  // carry-in: bit mpos of that word
+
+  uint128_t res = (op_mask & simm5) + (op_mask & vs2) + carry;  // operands masked to sew bits before adding
+  vd = res;  // assignment narrows the 128-bit sum to vd's type
+})
diff --git a/riscv/insns/vadc_vvm.h b/riscv/insns/vadc_vvm.h
new file mode 100644
index 0000000000..2d6803a8b4
--- /dev/null
+++ b/riscv/insns/vadc_vvm.h
@@ -0,0 +1,10 @@
+// vadc.vvm vd, vs2, vs1, v0 -- vd = vs2 + vs1 + carry-in bit taken from v0
+VI_VV_LOOP_WITH_CARRY
+({
+  auto &v0 = P.VU.elt<uint64_t>(0, midx);  // 64-bit word midx of vector register 0 (midx/mpos are not declared here)
+  const uint128_t op_mask = (UINT64_MAX >> (64 - sew));  // low sew bits set
+  uint64_t carry = (v0 >> mpos) & 0x1;  // carry-in: bit mpos of that word
+
+  uint128_t res = (op_mask & vs1) + (op_mask & vs2) + carry;  // operands masked to sew bits before adding
+  vd = res;  // assignment narrows the 128-bit sum to vd's type
+})
diff --git a/riscv/insns/vadc_vxm.h b/riscv/insns/vadc_vxm.h
new file mode 100644
index 0000000000..0d2d052cca
--- /dev/null
+++ b/riscv/insns/vadc_vxm.h
@@ -0,0 +1,10 @@
+// vadc.vxm vd, vs2, rs1, v0 -- vd = vs2 + rs1 + carry-in bit taken from v0
+VI_XI_LOOP_WITH_CARRY
+({
+  auto &v0 = P.VU.elt<uint64_t>(0, midx);  // 64-bit word midx of vector register 0 (midx/mpos are not declared here)
+  const uint128_t op_mask = (UINT64_MAX >> (64 - sew));  // low sew bits set
+  uint64_t carry = (v0 >> mpos) & 0x1;  // carry-in: bit mpos of that word
+
+  uint128_t res = (op_mask & rs1) + (op_mask & vs2) + carry;  // operands masked to sew bits before adding
+  vd = res;  // assignment narrows the 128-bit sum to vd's type
+})
diff --git a/riscv/insns/vadd_vi.h b/riscv/insns/vadd_vi.h
new file mode 100644
index 0000000000..45fc6b74e7
--- /dev/null
+++ b/riscv/insns/vadd_vi.h
@@ -0,0 +1,5 @@
+// vadd.vi vd, vs2, simm5, vm
+VI_VI_LOOP
+({
+  vd = simm5 + vs2;  // element-wise: immediate plus vs2 element
+})
diff --git a/riscv/insns/vadd_vv.h b/riscv/insns/vadd_vv.h
new file mode 100644
index 0000000000..45c6bdcba7
--- /dev/null
+++ b/riscv/insns/vadd_vv.h
@@ -0,0 +1,5 @@
+// vadd.vv vd, vs2, vs1, vm
+VI_VV_LOOP
+({
+  vd = vs1 + vs2;  // element-wise sum of the two vector operands
+})
diff --git a/riscv/insns/vadd_vx.h b/riscv/insns/vadd_vx.h
new file mode 100644
index 0000000000..33e72ee495
--- /dev/null
+++ b/riscv/insns/vadd_vx.h
@@ -0,0 +1,5 @@
+// vadd.vx vd, vs2, rs1, vm
+VI_VX_LOOP
+({
+  vd = rs1 + vs2;  // element-wise: scalar rs1 plus vs2 element
+})
diff --git a/riscv/insns/vamoaddei16_v.h b/riscv/insns/vamoaddei16_v.h
new file mode 100644
index 0000000000..3cb3db709d
--- /dev/null
+++ b/riscv/insns/vamoaddei16_v.h
@@ -0,0 +1,2 @@
+// vamoaddei16.v vd, (rs1), vs2, vd -- AMO add: the op handed to VI_AMO returns lhs + vs3
+VI_AMO({ return lhs + vs3; }, uint, e16);
diff --git a/riscv/insns/vamoaddei32_v.h b/riscv/insns/vamoaddei32_v.h
new file mode 100644
index 0000000000..2bd77fcbd2
--- /dev/null
+++ b/riscv/insns/vamoaddei32_v.h
@@ -0,0 +1,2 @@
+// vamoaddei32.v vd, (rs1), vs2, vd -- AMO add: the op handed to VI_AMO returns lhs + vs3
+VI_AMO({ return lhs + vs3; }, uint, e32);
diff --git a/riscv/insns/vamoaddei64_v.h b/riscv/insns/vamoaddei64_v.h
new file mode 100644
index 0000000000..79ca748205
--- /dev/null
+++ b/riscv/insns/vamoaddei64_v.h
@@ -0,0 +1,2 @@
+// vamoaddei64.v vd, (rs1), vs2, vd -- AMO add: the op handed to VI_AMO returns lhs + vs3
+VI_AMO({ return lhs + vs3; }, uint, e64);
diff --git a/riscv/insns/vamoaddei8_v.h b/riscv/insns/vamoaddei8_v.h
new file mode 100644
index 0000000000..06b8c79302
--- /dev/null
+++ b/riscv/insns/vamoaddei8_v.h
@@ -0,0 +1,2 @@
+// vamoaddei8.v vd, (rs1), vs2, vd -- AMO add: the op handed to VI_AMO returns lhs + vs3
+VI_AMO({ return lhs + vs3; }, uint, e8);
diff --git a/riscv/insns/vamoandei16_v.h b/riscv/insns/vamoandei16_v.h
new file mode 100644
index 0000000000..be119497f3
--- /dev/null
+++ b/riscv/insns/vamoandei16_v.h
@@ -0,0 +1,2 @@
+// vamoandei16.v vd, (rs1), vs2, vd -- AMO and: the op handed to VI_AMO returns lhs & vs3
+VI_AMO({ return lhs & vs3; }, uint, e16);
diff --git a/riscv/insns/vamoandei32_v.h b/riscv/insns/vamoandei32_v.h
new file mode 100644
index 0000000000..71506704ff
--- /dev/null
+++ b/riscv/insns/vamoandei32_v.h
@@ -0,0 +1,2 @@
+// vamoandei32.v vd, (rs1), vs2, vd -- AMO and: the op handed to VI_AMO returns lhs & vs3
+VI_AMO({ return lhs & vs3; }, uint, e32);
diff --git a/riscv/insns/vamoandei64_v.h b/riscv/insns/vamoandei64_v.h
new file mode 100644
index 0000000000..3efae3b59f
--- /dev/null
+++ b/riscv/insns/vamoandei64_v.h
@@ -0,0 +1,2 @@
+// vamoandei64.v vd, (rs1), vs2, vd -- AMO and: the op handed to VI_AMO returns lhs & vs3
+VI_AMO({ return lhs & vs3; }, uint, e64);
diff --git a/riscv/insns/vamoandei8_v.h b/riscv/insns/vamoandei8_v.h
new file mode 100644
index 0000000000..c47645d3e0
--- /dev/null
+++ b/riscv/insns/vamoandei8_v.h
@@ -0,0 +1,2 @@
+// vamoandei8.v vd, (rs1), vs2, vd -- AMO and: the op handed to VI_AMO returns lhs & vs3
+VI_AMO({ return lhs & vs3; }, uint, e8);
diff --git a/riscv/insns/vamomaxei16_v.h b/riscv/insns/vamomaxei16_v.h
new file mode 100644
index 0000000000..ca67893e99
--- /dev/null
+++ b/riscv/insns/vamomaxei16_v.h
@@ -0,0 +1,2 @@
+// vamomaxei16.v vd, (rs1), vs2, vd -- AMO max (int type token): the op returns the larger of lhs and vs3
+VI_AMO({ return lhs >= vs3 ? lhs : vs3; }, int, e16);
diff --git a/riscv/insns/vamomaxei32_v.h b/riscv/insns/vamomaxei32_v.h
new file mode 100644
index 0000000000..b6823cd042
--- /dev/null
+++ b/riscv/insns/vamomaxei32_v.h
@@ -0,0 +1,2 @@
+// vamomaxei32.v vd, (rs1), vs2, vd -- AMO max (int type token): the op returns the larger of lhs and vs3
+VI_AMO({ return lhs >= vs3 ? lhs : vs3; }, int, e32);
diff --git a/riscv/insns/vamomaxei64_v.h b/riscv/insns/vamomaxei64_v.h
new file mode 100644
index 0000000000..46e8a3bbd1
--- /dev/null
+++ b/riscv/insns/vamomaxei64_v.h
@@ -0,0 +1,2 @@
+// vamomaxei64.v vd, (rs1), vs2, vd -- AMO max (int type token): the op returns the larger of lhs and vs3
+VI_AMO({ return lhs >= vs3 ? lhs : vs3; }, int, e64);
diff --git a/riscv/insns/vamomaxei8_v.h b/riscv/insns/vamomaxei8_v.h
new file mode 100644
index 0000000000..9697b3a4cb
--- /dev/null
+++ b/riscv/insns/vamomaxei8_v.h
@@ -0,0 +1,2 @@
+// vamomaxei8.v vd, (rs1), vs2, vd -- AMO max (int type token): the op returns the larger of lhs and vs3
+VI_AMO({ return lhs >= vs3 ? lhs : vs3; }, int, e8);
diff --git a/riscv/insns/vamomaxuei16_v.h b/riscv/insns/vamomaxuei16_v.h
new file mode 100644
index 0000000000..e05971dfcf
--- /dev/null
+++ b/riscv/insns/vamomaxuei16_v.h
@@ -0,0 +1,2 @@
+// vamomaxuei16.v vd, (rs1), vs2, vd -- AMO max (uint type token): the op returns the larger of lhs and vs3
+VI_AMO({ return lhs >= vs3 ? lhs : vs3; }, uint, e16);
diff --git a/riscv/insns/vamomaxuei32_v.h b/riscv/insns/vamomaxuei32_v.h
new file mode 100644
index 0000000000..9b873543b9
--- /dev/null
+++ b/riscv/insns/vamomaxuei32_v.h
@@ -0,0 +1,2 @@
+// vamomaxuei32.v vd, (rs1), vs2, vd -- AMO max (uint type token): the op returns the larger of lhs and vs3
+VI_AMO({ return lhs >= vs3 ? lhs : vs3; }, uint, e32);
diff --git a/riscv/insns/vamomaxuei64_v.h b/riscv/insns/vamomaxuei64_v.h
new file mode 100644
index 0000000000..bbfbc9f2a3
--- /dev/null
+++ b/riscv/insns/vamomaxuei64_v.h
@@ -0,0 +1,2 @@
+// vamomaxuei64.v vd, (rs1), vs2, vd -- AMO max (uint type token): the op returns the larger of lhs and vs3
+VI_AMO({ return lhs >= vs3 ? lhs : vs3; }, uint, e64);
diff --git a/riscv/insns/vamomaxuei8_v.h b/riscv/insns/vamomaxuei8_v.h
new file mode 100644
index 0000000000..357ba2454a
--- /dev/null
+++ b/riscv/insns/vamomaxuei8_v.h
@@ -0,0 +1,2 @@
+// vamomaxuei8.v vd, (rs1), vs2, vd -- AMO max (uint type token): the op returns the larger of lhs and vs3
+VI_AMO({ return lhs >= vs3 ? lhs : vs3; }, uint, e8);
diff --git a/riscv/insns/vamominei16_v.h b/riscv/insns/vamominei16_v.h
new file mode 100644
index 0000000000..9d1ecac643
--- /dev/null
+++ b/riscv/insns/vamominei16_v.h
@@ -0,0 +1,2 @@
+// vamominei16.v vd, (rs1), vs2, vd -- AMO min (int type token): the op returns the smaller of lhs and vs3
+VI_AMO({ return lhs < vs3 ? lhs : vs3; }, int, e16);
diff --git a/riscv/insns/vamominei32_v.h b/riscv/insns/vamominei32_v.h
new file mode 100644
index 0000000000..6cb8475e39
--- /dev/null
+++ b/riscv/insns/vamominei32_v.h
@@ -0,0 +1,2 @@
+// vamominei32.v vd, (rs1), vs2, vd -- AMO min (int type token): the op returns the smaller of lhs and vs3
+VI_AMO({ return lhs < vs3 ? lhs : vs3; }, int, e32);
diff --git a/riscv/insns/vamominei64_v.h b/riscv/insns/vamominei64_v.h
new file mode 100644
index 0000000000..9ef3d4ee3b
--- /dev/null
+++ b/riscv/insns/vamominei64_v.h
@@ -0,0 +1,2 @@
+// vamominei64.v vd, (rs1), vs2, vd -- AMO min (int type token): the op returns the smaller of lhs and vs3
+VI_AMO({ return lhs < vs3 ? lhs : vs3; }, int, e64);
diff --git a/riscv/insns/vamominei8_v.h b/riscv/insns/vamominei8_v.h
new file mode 100644
index 0000000000..5c035ea47b
--- /dev/null
+++ b/riscv/insns/vamominei8_v.h
@@ -0,0 +1,2 @@
+// vamominei8.v vd, (rs1), vs2, vd -- AMO min (int type token): the op returns the smaller of lhs and vs3
+VI_AMO({ return lhs < vs3 ? lhs : vs3; }, int, e8);
diff --git a/riscv/insns/vamominuei16_v.h b/riscv/insns/vamominuei16_v.h
new file mode 100644
index 0000000000..d4a8f89292
--- /dev/null
+++ b/riscv/insns/vamominuei16_v.h
@@ -0,0 +1,2 @@
+// vamominuei16.v vd, (rs1), vs2, vd -- AMO min (uint type token): the op returns the smaller of lhs and vs3
+VI_AMO({ return lhs < vs3 ? lhs : vs3; }, uint, e16);
diff --git a/riscv/insns/vamominuei32_v.h b/riscv/insns/vamominuei32_v.h
new file mode 100644
index 0000000000..16296c5beb
--- /dev/null
+++ b/riscv/insns/vamominuei32_v.h
@@ -0,0 +1,2 @@
+// vamominuei32.v vd, (rs1), vs2, vd -- AMO min (uint type token): the op returns the smaller of lhs and vs3
+VI_AMO({ return lhs < vs3 ? lhs : vs3; }, uint, e32);
diff --git a/riscv/insns/vamominuei64_v.h b/riscv/insns/vamominuei64_v.h
new file mode 100644
index 0000000000..fd850fd063
--- /dev/null
+++ b/riscv/insns/vamominuei64_v.h
@@ -0,0 +1,2 @@
+// vamominuei64.v vd, (rs1), vs2, vd -- AMO min (uint type token): the op returns the smaller of lhs and vs3
+VI_AMO({ return lhs < vs3 ? lhs : vs3; }, uint, e64);
diff --git a/riscv/insns/vamominuei8_v.h b/riscv/insns/vamominuei8_v.h
new file mode 100644
index 0000000000..3749d0525d
--- /dev/null
+++ b/riscv/insns/vamominuei8_v.h
@@ -0,0 +1,2 @@
+// vamominuei8.v vd, (rs1), vs2, vd -- AMO min (uint type token): the op returns the smaller of lhs and vs3
+VI_AMO({ return lhs < vs3 ? lhs : vs3; }, uint, e8);
diff --git a/riscv/insns/vamoorei16_v.h b/riscv/insns/vamoorei16_v.h
new file mode 100644
index 0000000000..a5ba1caa74
--- /dev/null
+++ b/riscv/insns/vamoorei16_v.h
@@ -0,0 +1,2 @@
+// vamoorei16.v vd, (rs1), vs2, vd -- AMO or: the op handed to VI_AMO returns lhs | vs3
+VI_AMO({ return lhs | vs3; }, uint, e16);
diff --git a/riscv/insns/vamoorei32_v.h b/riscv/insns/vamoorei32_v.h
new file mode 100644
index 0000000000..94e4458e49
--- /dev/null
+++ b/riscv/insns/vamoorei32_v.h
@@ -0,0 +1,2 @@
+// vamoorei32.v vd, (rs1), vs2, vd -- AMO or: the op handed to VI_AMO returns lhs | vs3
+VI_AMO({ return lhs | vs3; }, uint, e32);
diff --git a/riscv/insns/vamoorei64_v.h b/riscv/insns/vamoorei64_v.h
new file mode 100644
index 0000000000..84e03944e5
--- /dev/null
+++ b/riscv/insns/vamoorei64_v.h
@@ -0,0 +1,2 @@
+// vamoorei64.v vd, (rs1), vs2, vd -- AMO or: the op handed to VI_AMO returns lhs | vs3
+VI_AMO({ return lhs | vs3; }, uint, e64);
diff --git a/riscv/insns/vamoorei8_v.h b/riscv/insns/vamoorei8_v.h
new file mode 100644
index 0000000000..364035dbb2
--- /dev/null
+++ b/riscv/insns/vamoorei8_v.h
@@ -0,0 +1,2 @@
+// vamoorei8.v vd, (rs1), vs2, vd -- AMO or: the op handed to VI_AMO returns lhs | vs3
+VI_AMO({ return lhs | vs3; }, uint, e8);
diff --git a/riscv/insns/vamoswapei16_v.h b/riscv/insns/vamoswapei16_v.h
new file mode 100644
index 0000000000..31ff021030
--- /dev/null
+++ b/riscv/insns/vamoswapei16_v.h
@@ -0,0 +1,2 @@
+// vamoswapei16.v vd, (rs1), vs2, vd -- AMO swap: the op handed to VI_AMO returns vs3 and ignores lhs
+VI_AMO({ return vs3; }, uint, e16);
diff --git a/riscv/insns/vamoswapei32_v.h b/riscv/insns/vamoswapei32_v.h
new file mode 100644
index 0000000000..a5741929ab
--- /dev/null
+++ b/riscv/insns/vamoswapei32_v.h
@@ -0,0 +1,2 @@
+// vamoswapei32.v vd, (rs1), vs2, vd -- AMO swap: the op handed to VI_AMO returns vs3 and ignores lhs
+VI_AMO({ return vs3; }, uint, e32);
diff --git a/riscv/insns/vamoswapei64_v.h b/riscv/insns/vamoswapei64_v.h
new file mode 100644
index 0000000000..58bd035217
--- /dev/null
+++ b/riscv/insns/vamoswapei64_v.h
@@ -0,0 +1,2 @@
+// vamoswapei64.v vd, (rs1), vs2, vd -- AMO swap: the op handed to VI_AMO returns vs3 and ignores lhs
+VI_AMO({ return vs3; }, uint, e64);
diff --git a/riscv/insns/vamoswapei8_v.h b/riscv/insns/vamoswapei8_v.h
new file mode 100644
index 0000000000..af37c8c3f8
--- /dev/null
+++ b/riscv/insns/vamoswapei8_v.h
@@ -0,0 +1,2 @@
+// vamoswapei8.v vd, (rs1), vs2, vd -- AMO swap: the op handed to VI_AMO returns vs3 and ignores lhs
+VI_AMO({ return vs3; }, uint, e8);
diff --git a/riscv/insns/vamoxorei16_v.h b/riscv/insns/vamoxorei16_v.h
new file mode 100644
index 0000000000..61e8c3272c
--- /dev/null
+++ b/riscv/insns/vamoxorei16_v.h
@@ -0,0 +1,2 @@
+// vamoxorei16.v vd, (rs1), vs2, vd -- AMO xor: the op handed to VI_AMO returns lhs ^ vs3
+VI_AMO({ return lhs ^ vs3; }, uint, e16);
diff --git a/riscv/insns/vamoxorei32_v.h b/riscv/insns/vamoxorei32_v.h
new file mode 100644
index 0000000000..d48d951504
--- /dev/null
+++ b/riscv/insns/vamoxorei32_v.h
@@ -0,0 +1,2 @@
+// vamoxorei32.v vd, (rs1), vs2, vd -- AMO xor: the op handed to VI_AMO returns lhs ^ vs3
+VI_AMO({ return lhs ^ vs3; }, uint, e32);
diff --git a/riscv/insns/vamoxorei64_v.h b/riscv/insns/vamoxorei64_v.h
new file mode 100644
index 0000000000..f7a3ca42e1
--- /dev/null
+++ b/riscv/insns/vamoxorei64_v.h
@@ -0,0 +1,2 @@
+// vamoxorei64.v vd, (rs1), vs2, vd -- AMO xor: the op handed to VI_AMO returns lhs ^ vs3
+VI_AMO({ return lhs ^ vs3; }, uint, e64);
diff --git a/riscv/insns/vamoxorei8_v.h b/riscv/insns/vamoxorei8_v.h
new file mode 100644
index 0000000000..4b6c79824c
--- /dev/null
+++ b/riscv/insns/vamoxorei8_v.h
@@ -0,0 +1,2 @@
+// vamoxorei8.v vd, (rs1), vs2, vd -- AMO xor: the op handed to VI_AMO returns lhs ^ vs3
+VI_AMO({ return lhs ^ vs3; }, uint, e8);
diff --git a/riscv/insns/vand_vi.h b/riscv/insns/vand_vi.h
new file mode 100644
index 0000000000..dd9618ba94
--- /dev/null
+++ b/riscv/insns/vand_vi.h
@@ -0,0 +1,5 @@
+// vand.vi vd, vs2, simm5, vm
+VI_VI_LOOP
+({
+  vd = simm5 & vs2;  // element-wise: immediate AND vs2 element
+})
diff --git a/riscv/insns/vand_vv.h b/riscv/insns/vand_vv.h
new file mode 100644
index 0000000000..65558e4b6a
--- /dev/null
+++ b/riscv/insns/vand_vv.h
@@ -0,0 +1,5 @@
+// vand.vv vd, vs2, vs1, vm
+VI_VV_LOOP
+({
+  vd = vs1 & vs2;  // element-wise AND of the two vector operands
+})
diff --git a/riscv/insns/vand_vx.h b/riscv/insns/vand_vx.h
new file mode 100644
index 0000000000..8eea1ed526
--- /dev/null
+++ b/riscv/insns/vand_vx.h
@@ -0,0 +1,5 @@
+// vand.vx vd, vs2, rs1, vm
+VI_VX_LOOP
+({
+  vd = rs1 & vs2;  // element-wise: scalar rs1 AND vs2 element
+})
diff --git a/riscv/insns/vasub_vv.h b/riscv/insns/vasub_vv.h
new file mode 100644
index 0000000000..a45c18db7d
--- /dev/null
+++ b/riscv/insns/vasub_vv.h
@@ -0,0 +1,2 @@
+// vasub.vv vd, vs2, vs1, vm -- signed averaging subtract, implemented by VI_VVX_LOOP_AVG
+VI_VVX_LOOP_AVG(vs1, -, true);  // operand vs1, operator -; the .vx form passes (rs1, -, false)
diff --git a/riscv/insns/vasub_vx.h b/riscv/insns/vasub_vx.h
new file mode 100644
index 0000000000..4e8dba1c9a
--- /dev/null
+++ b/riscv/insns/vasub_vx.h
@@ -0,0 +1,2 @@
+// vasub.vx vd, vs2, rs1, vm -- signed averaging subtract, implemented by VI_VVX_LOOP_AVG
+VI_VVX_LOOP_AVG(rs1, -, false);  // operand rs1, operator -; the .vv form passes (vs1, -, true)
diff --git a/riscv/insns/vasubu_vv.h b/riscv/insns/vasubu_vv.h
new file mode 100644
index 0000000000..8e2be01aca
--- /dev/null
+++ b/riscv/insns/vasubu_vv.h
@@ -0,0 +1,2 @@
+// vasubu.vv vd, vs2, vs1, vm -- unsigned averaging subtract, implemented by VI_VVX_ULOOP_AVG
+VI_VVX_ULOOP_AVG(vs1, -, true);  // operand vs1, operator -; the .vx form passes (rs1, -, false)
diff --git a/riscv/insns/vasubu_vx.h b/riscv/insns/vasubu_vx.h
new file mode 100644
index 0000000000..3cc9ca8a01
--- /dev/null
+++ b/riscv/insns/vasubu_vx.h
@@ -0,0 +1,2 @@
+// vasubu.vx vd, vs2, rs1, vm -- unsigned averaging subtract, implemented by VI_VVX_ULOOP_AVG
+VI_VVX_ULOOP_AVG(rs1, -, false);  // operand rs1, operator -; the .vv form passes (vs1, -, true)
diff --git a/riscv/insns/vcompress_vm.h b/riscv/insns/vcompress_vm.h
new file mode 100644
index 0000000000..325e40adb4
--- /dev/null
+++ b/riscv/insns/vcompress_vm.h
@@ -0,0 +1,33 @@
+// vcompress.vm vd, vs2, vs1 -- copy the vs2 elements whose vs1 mask bit is set to consecutive slots of vd
+require(P.VU.vstart == 0);
+require_align(insn.rd(), P.VU.vflmul);
+require_align(insn.rs2(), P.VU.vflmul);
+require(insn.rd() != insn.rs2());  // destination must differ from the data source
+require_noover(insn.rd(), P.VU.vflmul, insn.rs1(), 1);
+
+reg_t pos = 0;  // next destination element index in vd
+
+VI_GENERAL_LOOP_BASE
+  const int midx = i / 64;  // index of the 64-bit mask word holding bit i
+  const int mpos = i % 64;  // bit position inside that word
+
+  bool do_mask = (P.VU.elt<uint64_t>(rs1_num, midx) >> mpos) & 0x1;  // mask bit i, read from vs1
+  if (do_mask) {
+    switch (sew) {  // copy one element using the width selected by sew
+    case e8:
+      P.VU.elt<uint8_t>(rd_num, pos, true) = P.VU.elt<uint8_t>(rs2_num, i);
+      break;
+    case e16:
+      P.VU.elt<uint16_t>(rd_num, pos, true) = P.VU.elt<uint16_t>(rs2_num, i);
+      break;
+    case e32:
+      P.VU.elt<uint32_t>(rd_num, pos, true) = P.VU.elt<uint32_t>(rs2_num, i);
+      break;
+    default:  // every other sew value is copied as a 64-bit element
+      P.VU.elt<uint64_t>(rd_num, pos, true) = P.VU.elt<uint64_t>(rs2_num, i);
+      break;
+    }
+
+    ++pos;  // advance only after an element has been written
+  }
+VI_LOOP_END;
diff --git a/riscv/insns/vdiv_vv.h b/riscv/insns/vdiv_vv.h
new file mode 100644
index 0000000000..0d4bd0d8e4
--- /dev/null
+++ b/riscv/insns/vdiv_vv.h
@@ -0,0 +1,10 @@
+// vdiv.vv vd, vs2, vs1, vm -- signed divide, vd = vs2 / vs1
+VI_VV_LOOP
+({
+  if (vs1 == 0)  // division by zero: result is -1 (all bits set)
+    vd = -1;
+  else if (vs2 == (INT64_MIN >> (64 - sew)) && vs1 == -1)  // most negative sew-bit value divided by -1
+    vd = vs2;  // result is the dividend itself; the host division is skipped
+  else
+    vd = vs2 / vs1;
+})
diff --git a/riscv/insns/vdiv_vx.h b/riscv/insns/vdiv_vx.h
new file mode 100644
index 0000000000..405295270e
--- /dev/null
+++ b/riscv/insns/vdiv_vx.h
@@ -0,0 +1,10 @@
+// vdiv.vx vd, vs2, rs1, vm -- signed divide, vd = vs2 / rs1
+VI_VX_LOOP
+({
+  if(rs1 == 0)  // division by zero: result is -1 (all bits set)
+    vd = -1;
+  else if(vs2 == (INT64_MIN >> (64 - sew)) && rs1 == -1)  // most negative sew-bit value divided by -1
+    vd = vs2;  // result is the dividend itself; the host division is skipped
+  else
+    vd = vs2 / rs1;
+})
diff --git a/riscv/insns/vdivu_vv.h b/riscv/insns/vdivu_vv.h
new file mode 100644
index 0000000000..ef6e777d6b
--- /dev/null
+++ b/riscv/insns/vdivu_vv.h
@@ -0,0 +1,8 @@
+// vdivu.vv vd, vs2, vs1, vm -- unsigned divide, vd = vs2 / vs1
+VI_VV_ULOOP
+({
+  if(vs1 == 0)  // division by zero: -1 is assigned to vd
+    vd = -1;
+  else
+    vd = vs2 / vs1;
+})
diff --git a/riscv/insns/vdivu_vx.h b/riscv/insns/vdivu_vx.h
new file mode 100644
index 0000000000..7ffe1c6803
--- /dev/null
+++ b/riscv/insns/vdivu_vx.h
@@ -0,0 +1,8 @@
+// vdivu.vx vd, vs2, rs1, vm -- unsigned divide, vd = vs2 / rs1
+VI_VX_ULOOP
+({
+  if(rs1 == 0)  // division by zero: -1 is assigned to vd
+    vd = -1;
+  else
+    vd = vs2 / rs1;
+})
diff --git a/riscv/insns/vdot_vv.h b/riscv/insns/vdot_vv.h
new file mode 100644
index 0000000000..7685230497
--- /dev/null
+++ b/riscv/insns/vdot_vv.h
@@ -0,0 +1,5 @@
+// vdot.vv vd, vs2, vs1 -- as written here: per-element multiply-accumulate into vd (VI_VV_LOOP)
+VI_VV_LOOP
+({
+  vd += vs2 * vs1;  // product is added to the existing vd element
+})
diff --git a/riscv/insns/vdotu_vv.h b/riscv/insns/vdotu_vv.h
new file mode 100644
index 0000000000..9c4c59dde2
--- /dev/null
+++ b/riscv/insns/vdotu_vv.h
@@ -0,0 +1,5 @@
+// vdotu.vv vd, vs2, vs1 -- as written here: per-element multiply-accumulate into vd (VI_VV_ULOOP)
+VI_VV_ULOOP
+({
+  vd += vs2 * vs1;  // product is added to the existing vd element
+})
diff --git a/riscv/insns/vfadd_vf.h b/riscv/insns/vfadd_vf.h
new file mode 100644
index 0000000000..2b808e0ccd
--- /dev/null
+++ b/riscv/insns/vfadd_vf.h
@@ -0,0 +1,11 @@
+// vfadd.vf vd, vs2, rs1, vm -- vd = rs1 + vs2; one body per float width (f16, f32, f64)
+VI_VFP_VF_LOOP
+({
+  vd = f16_add(rs1, vs2);  // f16 body
+},
+{
+  vd = f32_add(rs1, vs2);  // f32 body
+},
+{
+  vd = f64_add(rs1, vs2);  // f64 body
+})
diff --git a/riscv/insns/vfadd_vv.h b/riscv/insns/vfadd_vv.h
new file mode 100644
index 0000000000..ce94921d56
--- /dev/null
+++ b/riscv/insns/vfadd_vv.h
@@ -0,0 +1,11 @@
+// vfadd.vv vd, vs2, vs1, vm -- vd = vs1 + vs2; one body per float width (f16, f32, f64)
+VI_VFP_VV_LOOP
+({
+  vd = f16_add(vs1, vs2);  // f16 body
+},
+{
+  vd = f32_add(vs1, vs2);  // f32 body
+},
+{
+  vd = f64_add(vs1, vs2);  // f64 body
+})
diff --git a/riscv/insns/vfclass_v.h b/riscv/insns/vfclass_v.h
new file mode 100644
index 0000000000..658f28a23f
--- /dev/null
+++ b/riscv/insns/vfclass_v.h
@@ -0,0 +1,11 @@
+// vfclass.v vd, vs2, vm -- store the fNN_classify() result of each vs2 element in vd.v
+VI_VFP_V_LOOP
+({
+  vd.v = f16_classify(vs2);  // f16 body
+},
+{
+  vd.v = f32_classify(vs2);  // f32 body
+},
+{
+  vd.v = f64_classify(vs2);  // f64 body
+})
diff --git a/riscv/insns/vfcvt_f_x_v.h b/riscv/insns/vfcvt_f_x_v.h
new file mode 100644
index 0000000000..c53b0e1fd6
--- /dev/null
+++ b/riscv/insns/vfcvt_f_x_v.h
@@ -0,0 +1,14 @@
+// vfcvt.f.x.v vd, vs2, vm -- signed integer to float of the same element width
+VI_VFP_VF_LOOP
+({
+  auto vs2_i = P.VU.elt<int16_t>(rs2_num, i);  // re-read the source element as a signed 16-bit integer
+  vd = i32_to_f16(vs2_i);  // the int16_t value is passed to the 32-bit-input converter
+},
+{
+  auto vs2_i = P.VU.elt<int32_t>(rs2_num, i);  // source element as a signed 32-bit integer
+  vd = i32_to_f32(vs2_i);
+},
+{
+  auto vs2_i = P.VU.elt<int64_t>(rs2_num, i);  // source element as a signed 64-bit integer
+  vd = i64_to_f64(vs2_i);
+})
diff --git a/riscv/insns/vfcvt_f_xu_v.h b/riscv/insns/vfcvt_f_xu_v.h
new file mode 100644
index 0000000000..bd03768dbd
--- /dev/null
+++ b/riscv/insns/vfcvt_f_xu_v.h
@@ -0,0 +1,14 @@
+// vfcvt.f.xu.v vd, vs2, vm -- unsigned integer to float of the same element width
+VI_VFP_VF_LOOP
+({
+  auto vs2_u = P.VU.elt<uint16_t>(rs2_num, i);  // re-read the source element as an unsigned 16-bit integer
+  vd = ui32_to_f16(vs2_u);  // the uint16_t value is passed to the 32-bit-input converter
+},
+{
+  auto vs2_u = P.VU.elt<uint32_t>(rs2_num, i);  // source element as an unsigned 32-bit integer
+  vd = ui32_to_f32(vs2_u);
+},
+{
+  auto vs2_u = P.VU.elt<uint64_t>(rs2_num, i);  // source element as an unsigned 64-bit integer
+  vd = ui64_to_f64(vs2_u);
+})
diff --git a/riscv/insns/vfcvt_rtz_x_f_v.h b/riscv/insns/vfcvt_rtz_x_f_v.h
new file mode 100644
index 0000000000..e7241bd033
--- /dev/null
+++ b/riscv/insns/vfcvt_rtz_x_f_v.h
@@ -0,0 +1,11 @@
+// vfcvt.rtz.x.f.v vd, vs2, vm -- float to signed integer, always using softfloat_round_minMag
+VI_VFP_VF_LOOP
+({
+  P.VU.elt<int16_t>(rd_num, i) = f16_to_i16(vs2, softfloat_round_minMag, true);  // result stored through an int16_t view of vd
+},
+{
+  P.VU.elt<int32_t>(rd_num, i) = f32_to_i32(vs2, softfloat_round_minMag, true);  // int32_t view of vd
+},
+{
+  P.VU.elt<int64_t>(rd_num, i) = f64_to_i64(vs2, softfloat_round_minMag, true);  // int64_t view of vd
+})
diff --git a/riscv/insns/vfcvt_rtz_xu_f_v.h b/riscv/insns/vfcvt_rtz_xu_f_v.h
new file mode 100644
index 0000000000..d3d266d0c9
--- /dev/null
+++ b/riscv/insns/vfcvt_rtz_xu_f_v.h
@@ -0,0 +1,11 @@
+// vfcvt.rtz.xu.f.v vd, vs2, vm -- float to unsigned integer, always using softfloat_round_minMag
+VI_VFP_VF_LOOP
+({
+  P.VU.elt<uint16_t>(rd_num, i) = f16_to_ui16(vs2, softfloat_round_minMag, true);  // result stored through a uint16_t view of vd
+},
+{
+  P.VU.elt<uint32_t>(rd_num, i) = f32_to_ui32(vs2, softfloat_round_minMag, true);  // uint32_t view of vd
+},
+{
+  P.VU.elt<uint64_t>(rd_num, i) = f64_to_ui64(vs2, softfloat_round_minMag, true);  // uint64_t view of vd
+})
diff --git a/riscv/insns/vfcvt_x_f_v.h b/riscv/insns/vfcvt_x_f_v.h
new file mode 100644
index 0000000000..01e5ca17f4
--- /dev/null
+++ b/riscv/insns/vfcvt_x_f_v.h
@@ -0,0 +1,11 @@
+// vfcvt.x.f.v vd, vs2, vm -- float to signed integer using the rounding mode in STATE.frm
+VI_VFP_VF_LOOP
+({
+  P.VU.elt<int16_t>(rd_num, i) = f16_to_i16(vs2, STATE.frm, true);  // result stored through an int16_t view of vd
+},
+{
+  P.VU.elt<int32_t>(rd_num, i) = f32_to_i32(vs2, STATE.frm, true);  // int32_t view of vd
+},
+{
+  P.VU.elt<int64_t>(rd_num, i) = f64_to_i64(vs2, STATE.frm, true);  // int64_t view of vd
+})
diff --git a/riscv/insns/vfcvt_xu_f_v.h b/riscv/insns/vfcvt_xu_f_v.h
new file mode 100644
index 0000000000..725cbda23a
--- /dev/null
+++ b/riscv/insns/vfcvt_xu_f_v.h
@@ -0,0 +1,11 @@
+// vfcvt.xu.f.v vd, vs2, vm -- float to unsigned integer using STATE.frm. NOTE(review): the other vfcvt_* files use VI_VFP_VF_LOOP; confirm VV is intended
+VI_VFP_VV_LOOP
+({
+  P.VU.elt<uint16_t>(rd_num, i) = f16_to_ui16(vs2, STATE.frm, true);  // result stored through a uint16_t view of vd
+},
+{
+  P.VU.elt<uint32_t>(rd_num, i) = f32_to_ui32(vs2, STATE.frm, true);  // uint32_t view of vd
+},
+{
+  P.VU.elt<uint64_t>(rd_num, i) = f64_to_ui64(vs2, STATE.frm, true);  // uint64_t view of vd
+})
diff --git a/riscv/insns/vfdiv_vf.h b/riscv/insns/vfdiv_vf.h
new file mode 100644
index 0000000000..a703ef02bf
--- /dev/null
+++ b/riscv/insns/vfdiv_vf.h
@@ -0,0 +1,11 @@
+// vfdiv.vf vd, vs2, rs1, vm -- vd = vs2 / rs1; one body per float width (f16, f32, f64)
+VI_VFP_VF_LOOP
+({
+  vd = f16_div(vs2, rs1);  // f16 body; vs2 is the dividend
+},
+{
+  vd = f32_div(vs2, rs1);  // f32 body
+},
+{
+  vd = f64_div(vs2, rs1);  // f64 body
+})
diff --git a/riscv/insns/vfdiv_vv.h b/riscv/insns/vfdiv_vv.h
new file mode 100644
index 0000000000..c66d751659
--- /dev/null
+++ b/riscv/insns/vfdiv_vv.h
@@ -0,0 +1,11 @@
+// vfdiv.vv vd, vs2, vs1, vm -- vd = vs2 / vs1; one body per float width (f16, f32, f64)
+VI_VFP_VV_LOOP
+({
+  vd = f16_div(vs2, vs1);  // f16 body; vs2 is the dividend
+},
+{
+  vd = f32_div(vs2, vs1);  // f32 body
+},
+{
+  vd = f64_div(vs2, vs1);  // f64 body
+})
diff --git a/riscv/insns/vfdot_vv.h b/riscv/insns/vfdot_vv.h
new file mode 100644
index 0000000000..8f5225acd4
--- /dev/null
+++ b/riscv/insns/vfdot_vv.h
@@ -0,0 +1,11 @@
+// vfdot.vv vd, vs2, vs1
+VI_VFP_VV_LOOP
+({
+  vd = f16_add(vd, f16_mul(vs2, vs1));
+},
+{
+  vd = f32_add(vd, f32_mul(vs2, vs1));
+},
+{
+  vd = f64_add(vd, f64_mul(vs2, vs1));
+})
diff --git a/riscv/insns/vfirst_m.h b/riscv/insns/vfirst_m.h
new file mode 100644
index 0000000000..309572374d
--- /dev/null
+++ b/riscv/insns/vfirst_m.h
@@ -0,0 +1,20 @@
+// vfirst.m rd, vs2  # rd = index of the first non-skipped element whose vs2 mask bit is set; all-ones (-1) if none
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require_vector(true);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs2_num = insn.rs2();
+require(P.VU.vstart == 0);  // only proceeds when vstart is 0
+reg_t pos = -1;  // sentinel written to rd when no set bit is found
+for (reg_t i=P.VU.vstart; i < vl; ++i) {
+  VI_LOOP_ELEMENT_SKIP()  // NOTE(review): presumably defines midx/mpos and skips masked-off elements -- confirm against the macro
+
+  bool vs2_lsb = ((P.VU.elt<uint64_t>(rs2_num, midx ) >> mpos) & 0x1) == 1;  // bit mpos of 64-bit word midx of vs2
+  if (vs2_lsb) {
+    pos = i;  // record the first hit and stop scanning
+    break;
+  }
+}
+P.VU.vstart = 0;
+WRITE_RD(pos);
diff --git a/riscv/insns/vfmacc_vf.h b/riscv/insns/vfmacc_vf.h
new file mode 100644
index 0000000000..61578d3318
--- /dev/null
+++ b/riscv/insns/vfmacc_vf.h
@@ -0,0 +1,11 @@
+// vfmacc.vf vd, rs1, vs2, vm    # vd[i] = +(vs2[i] * f[rs1]) + vd[i]
+VI_VFP_VF_LOOP
+({
+  vd = f16_mulAdd(rs1, vs2, vd);
+},
+{
+  vd = f32_mulAdd(rs1, vs2, vd);
+},
+{
+  vd = f64_mulAdd(rs1, vs2, vd);
+})
diff --git a/riscv/insns/vfmacc_vv.h b/riscv/insns/vfmacc_vv.h
new file mode 100644
index 0000000000..499b1d4d22
--- /dev/null
+++ b/riscv/insns/vfmacc_vv.h
@@ -0,0 +1,11 @@
+// vfmacc.vv vd, vs1, vs2, vm    # vd[i] = +(vs2[i] * vs1[i]) + vd[i]
+VI_VFP_VV_LOOP
+({
+  vd = f16_mulAdd(vs1, vs2, vd);
+},
+{
+  vd = f32_mulAdd(vs1, vs2, vd);
+},
+{
+  vd = f64_mulAdd(vs1, vs2, vd);
+})
diff --git a/riscv/insns/vfmadd_vf.h b/riscv/insns/vfmadd_vf.h
new file mode 100644
index 0000000000..2a01429506
--- /dev/null
+++ b/riscv/insns/vfmadd_vf.h
@@ -0,0 +1,11 @@
+// vfmadd: vd[i] = +(vd[i] * f[rs1]) + vs2[i]
+VI_VFP_VF_LOOP
+({
+  vd = f16_mulAdd(vd, rs1, vs2);
+},
+{
+  vd = f32_mulAdd(vd, rs1, vs2);
+},
+{
+  vd = f64_mulAdd(vd, rs1, vs2);
+})
diff --git a/riscv/insns/vfmadd_vv.h b/riscv/insns/vfmadd_vv.h
new file mode 100644
index 0000000000..7ef734f847
--- /dev/null
+++ b/riscv/insns/vfmadd_vv.h
@@ -0,0 +1,11 @@
+// vfmadd: vd[i] = +(vd[i] * vs1[i]) + vs2[i]
+VI_VFP_VV_LOOP
+({
+  vd = f16_mulAdd(vd, vs1, vs2);
+},
+{
+  vd = f32_mulAdd(vd, vs1, vs2);
+},
+{
+  vd = f64_mulAdd(vd, vs1, vs2);
+})
diff --git a/riscv/insns/vfmax_vf.h b/riscv/insns/vfmax_vf.h
new file mode 100644
index 0000000000..c4b74cbd54
--- /dev/null
+++ b/riscv/insns/vfmax_vf.h
@@ -0,0 +1,11 @@
+// vfmax.vf vd, vs2, rs1
+VI_VFP_VF_LOOP
+({
+  vd = f16_max(vs2, rs1);
+},
+{
+  vd = f32_max(vs2, rs1);
+},
+{
+  vd = f64_max(vs2, rs1);
+})
diff --git a/riscv/insns/vfmax_vv.h b/riscv/insns/vfmax_vv.h
new file mode 100644
index 0000000000..6439c8997f
--- /dev/null
+++ b/riscv/insns/vfmax_vv.h
@@ -0,0 +1,11 @@
+// vfmax.vv vd, vs2, vs1
+VI_VFP_VV_LOOP
+({
+  vd = f16_max(vs2, vs1);
+},
+{
+  vd = f32_max(vs2, vs1);
+},
+{
+  vd = f64_max(vs2, vs1);
+})
diff --git a/riscv/insns/vfmerge_vfm.h b/riscv/insns/vfmerge_vfm.h
new file mode 100644
index 0000000000..c9b39fe052
--- /dev/null
+++ b/riscv/insns/vfmerge_vfm.h
@@ -0,0 +1,50 @@
+// vfmerge.vfm vd, vs2, rs1, v0  # vd[i] = (mask bit i of register 0) ? f[rs1] : vs2[i]
+VI_CHECK_SSS(false);
+VI_VFP_COMMON;
+
+switch(P.VU.vsew) {
+  case e16:
+    for (reg_t i=P.VU.vstart; i<vl; ++i) {
+      auto &vd = P.VU.elt<float16_t>(rd_num, i, true);
+      auto rs1 = f16(READ_FREG(rs1_num));
+      auto vs2 = P.VU.elt<float16_t>(rs2_num, i);
+
+      int midx = i / 64;  // which 64-bit word of register 0 holds mask bit i
+      int mpos = i % 64;  // bit position inside that word
+      bool use_first = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;  // mask bit i
+
+      vd = use_first ? rs1 : vs2;  // set bit selects the scalar; clear bit selects vs2[i]
+    }
+    break;
+  case e32:
+    for (reg_t i=P.VU.vstart; i<vl; ++i) {
+      auto &vd = P.VU.elt<float32_t>(rd_num, i, true);
+      auto rs1 = f32(READ_FREG(rs1_num));
+      auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+
+      int midx = i / 64;
+      int mpos = i % 64;
+      bool use_first = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+      vd = use_first ? rs1 : vs2;
+    }
+    break;
+  case e64:
+    for (reg_t i=P.VU.vstart; i<vl; ++i) {
+      auto &vd = P.VU.elt<float64_t>(rd_num, i, true);
+      auto rs1 = f64(READ_FREG(rs1_num));
+      auto vs2 = P.VU.elt<float64_t>(rs2_num, i);
+
+      int midx = i / 64;
+      int mpos = i % 64;
+      bool use_first = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+      vd = use_first ? rs1 : vs2;
+    }
+    break;
+  default:  // no other element width is handled
+    require(0);
+    break;
+}
+
+P.VU.vstart = 0;
diff --git a/riscv/insns/vfmin_vf.h b/riscv/insns/vfmin_vf.h
new file mode 100644
index 0000000000..1560cdf7dc
--- /dev/null
+++ b/riscv/insns/vfmin_vf.h
@@ -0,0 +1,11 @@
+// vfmin.vf vd, vs2, rs1
+VI_VFP_VF_LOOP
+({
+  vd = f16_min(vs2, rs1);
+},
+{
+  vd = f32_min(vs2, rs1);
+},
+{
+  vd = f64_min(vs2, rs1);
+})
diff --git a/riscv/insns/vfmin_vv.h b/riscv/insns/vfmin_vv.h
new file mode 100644
index 0000000000..882a774044
--- /dev/null
+++ b/riscv/insns/vfmin_vv.h
@@ -0,0 +1,11 @@
+// vfmin.vv vd, vs2, vs1
+VI_VFP_VV_LOOP
+({
+  vd = f16_min(vs2, vs1);
+},
+{
+  vd = f32_min(vs2, vs1);
+},
+{
+  vd = f64_min(vs2, vs1);
+})
diff --git a/riscv/insns/vfmsac_vf.h b/riscv/insns/vfmsac_vf.h
new file mode 100644
index 0000000000..8af397b999
--- /dev/null
+++ b/riscv/insns/vfmsac_vf.h
@@ -0,0 +1,11 @@
+// vfmsac: vd[i] = +(f[rs1] * vs2[i]) - vd[i]
+VI_VFP_VF_LOOP
+({
+  vd = f16_mulAdd(rs1, vs2, f16(vd.v ^ F16_SIGN));
+},
+{
+  vd = f32_mulAdd(rs1, vs2, f32(vd.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(rs1, vs2, f64(vd.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfmsac_vv.h b/riscv/insns/vfmsac_vv.h
new file mode 100644
index 0000000000..3bb50e50a9
--- /dev/null
+++ b/riscv/insns/vfmsac_vv.h
@@ -0,0 +1,11 @@
+// vfmsac: vd[i] = +(vs1[i] * vs2[i]) - vd[i]
+VI_VFP_VV_LOOP
+({
+  vd = f16_mulAdd(vs1, vs2, f16(vd.v ^ F16_SIGN));
+},
+{
+  vd = f32_mulAdd(vs1, vs2, f32(vd.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(vs1, vs2, f64(vd.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfmsub_vf.h b/riscv/insns/vfmsub_vf.h
new file mode 100644
index 0000000000..ab77b4c6e1
--- /dev/null
+++ b/riscv/insns/vfmsub_vf.h
@@ -0,0 +1,11 @@
+// vfmsub: vd[i] = +(vd[i] * f[rs1]) - vs2[i]
+VI_VFP_VF_LOOP
+({
+  vd = f16_mulAdd(vd, rs1, f16(vs2.v ^ F16_SIGN));
+},
+{
+  vd = f32_mulAdd(vd, rs1, f32(vs2.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(vd, rs1, f64(vs2.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfmsub_vv.h b/riscv/insns/vfmsub_vv.h
new file mode 100644
index 0000000000..3cac937fd1
--- /dev/null
+++ b/riscv/insns/vfmsub_vv.h
@@ -0,0 +1,11 @@
+// vfmsub: vd[i] = +(vd[i] * vs1[i]) - vs2[i]
+VI_VFP_VV_LOOP
+({
+  vd = f16_mulAdd(vd, vs1, f16(vs2.v ^ F16_SIGN));
+},
+{
+  vd = f32_mulAdd(vd, vs1, f32(vs2.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(vd, vs1, f64(vs2.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfmul_vf.h b/riscv/insns/vfmul_vf.h
new file mode 100644
index 0000000000..f5f63e497a
--- /dev/null
+++ b/riscv/insns/vfmul_vf.h
@@ -0,0 +1,11 @@
+// vfmul.vf vd, vs2, rs1, vm
+VI_VFP_VF_LOOP
+({
+  vd = f16_mul(vs2, rs1);
+},
+{
+  vd = f32_mul(vs2, rs1);
+},
+{
+  vd = f64_mul(vs2, rs1);
+})
diff --git a/riscv/insns/vfmul_vv.h b/riscv/insns/vfmul_vv.h
new file mode 100644
index 0000000000..7930fd034e
--- /dev/null
+++ b/riscv/insns/vfmul_vv.h
@@ -0,0 +1,11 @@
+// vfmul.vv vd, vs2, vs1, vm
+VI_VFP_VV_LOOP
+({
+  vd = f16_mul(vs1, vs2);
+},
+{
+  vd = f32_mul(vs1, vs2);
+},
+{
+  vd = f64_mul(vs1, vs2);
+})
diff --git a/riscv/insns/vfmv_f_s.h b/riscv/insns/vfmv_f_s.h
new file mode 100644
index 0000000000..4a81436c10
--- /dev/null
+++ b/riscv/insns/vfmv_f_s.h
@@ -0,0 +1,38 @@
+// vfmv.f.s rd, vs2  # f[rd] = vs2[0] (rs1=0)
+require_vector(true);
+require_fp;
+require((P.VU.vsew == e16 && p->supports_extension(EXT_ZFH)) ||
+        (P.VU.vsew == e32 && p->supports_extension('F')) ||
+        (P.VU.vsew == e64 && p->supports_extension('D')));  // each SEW is accepted only with its paired extension
+require(STATE.frm < 0x5);
+
+reg_t rs2_num = insn.rs2();
+uint64_t vs2_0 = 0;
+const reg_t sew = P.VU.vsew;
+switch(sew) {
+  case e16:
+    vs2_0 = P.VU.elt<uint16_t>(rs2_num, 0);
+    break;
+  case e32:
+    vs2_0 = P.VU.elt<uint32_t>(rs2_num, 0);
+    break;
+  case e64:
+    vs2_0 = P.VU.elt<uint64_t>(rs2_num, 0);
+    break;
+  default:
+    require(0);
+    break;
+}
+
+// NaN-extend: when FLEN > sew, set every bit of the 64-bit value from position sew upward
+if (FLEN > sew) {
+  vs2_0 = vs2_0 | (UINT64_MAX << sew);
+}
+
+if (FLEN == 64) {
+  WRITE_FRD(f64(vs2_0));
+} else {
+  WRITE_FRD(f32(vs2_0));
+}
+
+P.VU.vstart = 0;
diff --git a/riscv/insns/vfmv_s_f.h b/riscv/insns/vfmv_s_f.h
new file mode 100644
index 0000000000..52ed7b2d11
--- /dev/null
+++ b/riscv/insns/vfmv_s_f.h
@@ -0,0 +1,29 @@
+// vfmv.s.f vd, rs1  # vd[0] = f[rs1] (vs2=0)
+require_vector(true);
+require_fp;
+require((P.VU.vsew == e16 && p->supports_extension(EXT_ZFH)) ||
+        (P.VU.vsew == e32 && p->supports_extension('F')) ||
+        (P.VU.vsew == e64 && p->supports_extension('D')));  // each SEW is accepted only with its paired extension
+require(STATE.frm < 0x5);
+
+reg_t vl = P.VU.vl;
+
+if (vl > 0 && P.VU.vstart < vl) {  // nothing is written when vl == 0 or vstart >= vl
+  reg_t rd_num = insn.rd();
+
+  switch(P.VU.vsew) {
+    case e16:
+      P.VU.elt<uint16_t>(rd_num, 0, true) = f16(FRS1).v;
+      break;
+    case e32:
+      P.VU.elt<uint32_t>(rd_num, 0, true) = f32(FRS1).v;
+      break;
+    case e64:
+      if (FLEN == 64)
+        P.VU.elt<uint64_t>(rd_num, 0, true) = f64(FRS1).v;
+      else  // NOTE(review): only the f32 bit pattern is stored into the 64-bit element here -- confirm intended
+        P.VU.elt<uint64_t>(rd_num, 0, true) = f32(FRS1).v;
+      break;
+  }
+}
+P.VU.vstart = 0;
diff --git a/riscv/insns/vfmv_v_f.h b/riscv/insns/vfmv_v_f.h
new file mode 100644
index 0000000000..fb9c78827e
--- /dev/null
+++ b/riscv/insns/vfmv_v_f.h
@@ -0,0 +1,31 @@
+// vfmv.v.f vd, rs1  # vd[i] = f[rs1] for each i in [vstart, vl)
+require_align(insn.rd(), P.VU.vflmul);
+VI_VFP_COMMON  // NOTE(review): presumably declares vl, rd_num and rs1_num used below -- confirm against the macro
+switch(P.VU.vsew) {
+  case e16:
+    for (reg_t i=P.VU.vstart; i<vl; ++i) {
+      auto &vd = P.VU.elt<float16_t>(rd_num, i, true);
+      auto rs1 = f16(READ_FREG(rs1_num));
+
+      vd = rs1;
+    }
+    break;
+  case e32:
+    for (reg_t i=P.VU.vstart; i<vl; ++i) {
+      auto &vd = P.VU.elt<float32_t>(rd_num, i, true);
+      auto rs1 = f32(READ_FREG(rs1_num));
+
+      vd = rs1;
+    }
+    break;
+  case e64:
+    for (reg_t i=P.VU.vstart; i<vl; ++i) {
+      auto &vd = P.VU.elt<float64_t>(rd_num, i, true);
+      auto rs1 = f64(READ_FREG(rs1_num));
+
+      vd = rs1;
+    }
+    break;
+}
+
+P.VU.vstart = 0;
diff --git a/riscv/insns/vfncvt_f_f_w.h b/riscv/insns/vfncvt_f_f_w.h
new file mode 100644
index 0000000000..e9f3b25746
--- /dev/null
+++ b/riscv/insns/vfncvt_f_f_w.h
@@ -0,0 +1,23 @@
+// vfncvt.f.f.w vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  ;
+},
+{
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<float16_t>(rd_num, i, true) = f32_to_f16(vs2);
+},
+{
+  auto vs2 = P.VU.elt<float64_t>(rs2_num, i);
+  P.VU.elt<float32_t>(rd_num, i, true) = f64_to_f32(vs2);
+},
+{
+  ;
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('D'));
+},
+false, (P.VU.vsew >= 16))
diff --git a/riscv/insns/vfncvt_f_x_w.h b/riscv/insns/vfncvt_f_x_w.h
new file mode 100644
index 0000000000..556ee3c878
--- /dev/null
+++ b/riscv/insns/vfncvt_f_x_w.h
@@ -0,0 +1,23 @@
+// vfncvt.f.x.w vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  ;
+},
+{
+  auto vs2 = P.VU.elt<int32_t>(rs2_num, i);
+  P.VU.elt<float16_t>(rd_num, i, true) = i32_to_f16(vs2);
+},
+{
+  auto vs2 = P.VU.elt<int64_t>(rs2_num, i);
+  P.VU.elt<float32_t>(rd_num, i, true) = i64_to_f32(vs2);
+},
+{
+  ;
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+false, (P.VU.vsew >= 16))
diff --git a/riscv/insns/vfncvt_f_xu_w.h b/riscv/insns/vfncvt_f_xu_w.h
new file mode 100644
index 0000000000..0626ecb4ad
--- /dev/null
+++ b/riscv/insns/vfncvt_f_xu_w.h
@@ -0,0 +1,23 @@
+// vfncvt.f.xu.w vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  ;
+},
+{
+  auto vs2 = P.VU.elt<uint32_t>(rs2_num, i);
+  P.VU.elt<float16_t>(rd_num, i, true) = ui32_to_f16(vs2);
+},
+{
+  auto vs2 = P.VU.elt<uint64_t>(rs2_num, i);
+  P.VU.elt<float32_t>(rd_num, i, true) = ui64_to_f32(vs2);
+},
+{
+  ;
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+false, (P.VU.vsew >= 16))
diff --git a/riscv/insns/vfncvt_rod_f_f_w.h b/riscv/insns/vfncvt_rod_f_f_w.h
new file mode 100644
index 0000000000..7113fd572a
--- /dev/null
+++ b/riscv/insns/vfncvt_rod_f_f_w.h
@@ -0,0 +1,25 @@
+// vfncvt.rod.f.f.w vd, vs2, vm  # narrowing FP->FP conversion performed with round-to-odd
+VI_VFP_CVT_SCALE
+({
+  ;
+},
+{
+  softfloat_roundingMode = softfloat_round_odd;  // force round-to-odd for this conversion
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<float16_t>(rd_num, i, true) = f32_to_f16(vs2);
+},
+{
+  softfloat_roundingMode = softfloat_round_odd;  // force round-to-odd for this conversion
+  auto vs2 = P.VU.elt<float64_t>(rs2_num, i);
+  P.VU.elt<float32_t>(rd_num, i, true) = f64_to_f32(vs2);
+},
+{
+  ;
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('D'));  // source elements are float64_t so D is required -- same check as vfncvt.f.f.w
+},
+false, (P.VU.vsew >= 16))
diff --git a/riscv/insns/vfncvt_rtz_x_f_w.h b/riscv/insns/vfncvt_rtz_x_f_w.h
new file mode 100644
index 0000000000..1dfc6ecea4
--- /dev/null
+++ b/riscv/insns/vfncvt_rtz_x_f_w.h
@@ -0,0 +1,24 @@
+// vfncvt.rtz.x.f.w vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  auto vs2 = P.VU.elt<float16_t>(rs2_num, i);
+  P.VU.elt<int8_t>(rd_num, i, true) = f16_to_i8(vs2, softfloat_round_minMag, true);
+},
+{
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<int16_t>(rd_num, i, true) = f32_to_i16(vs2, softfloat_round_minMag, true);
+},
+{
+  auto vs2 = P.VU.elt<float64_t>(rs2_num, i);
+  P.VU.elt<int32_t>(rd_num, i, true) = f64_to_i32(vs2, softfloat_round_minMag, true);
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+{
+  require(p->supports_extension('D'));
+},
+false, (P.VU.vsew <= 32))
diff --git a/riscv/insns/vfncvt_rtz_xu_f_w.h b/riscv/insns/vfncvt_rtz_xu_f_w.h
new file mode 100644
index 0000000000..c6adcec9ab
--- /dev/null
+++ b/riscv/insns/vfncvt_rtz_xu_f_w.h
@@ -0,0 +1,24 @@
+// vfncvt.rtz.xu.f.w vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  auto vs2 = P.VU.elt<float16_t>(rs2_num, i);
+  P.VU.elt<uint8_t>(rd_num, i, true) = f16_to_ui8(vs2, softfloat_round_minMag, true);
+},
+{
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<uint16_t>(rd_num, i, true) = f32_to_ui16(vs2, softfloat_round_minMag, true);
+},
+{
+  auto vs2 = P.VU.elt<float64_t>(rs2_num, i);
+  P.VU.elt<uint32_t>(rd_num, i, true) = f64_to_ui32(vs2, softfloat_round_minMag, true);
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+{
+  require(p->supports_extension('D'));
+},
+false, (P.VU.vsew <= 32))
diff --git a/riscv/insns/vfncvt_x_f_w.h b/riscv/insns/vfncvt_x_f_w.h
new file mode 100644
index 0000000000..01b52a2af5
--- /dev/null
+++ b/riscv/insns/vfncvt_x_f_w.h
@@ -0,0 +1,24 @@
+// vfncvt.x.f.w vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  auto vs2 = P.VU.elt<float16_t>(rs2_num, i);
+  P.VU.elt<int8_t>(rd_num, i, true) = f16_to_i8(vs2, STATE.frm, true);
+},
+{
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<int16_t>(rd_num, i, true) = f32_to_i16(vs2, STATE.frm, true);
+},
+{
+  auto vs2 = P.VU.elt<float64_t>(rs2_num, i);
+  P.VU.elt<int32_t>(rd_num, i, true) = f64_to_i32(vs2, STATE.frm, true);
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+{
+  require(p->supports_extension('D'));
+},
+false, (P.VU.vsew <= 32))
diff --git a/riscv/insns/vfncvt_xu_f_w.h b/riscv/insns/vfncvt_xu_f_w.h
new file mode 100644
index 0000000000..bb55ec318d
--- /dev/null
+++ b/riscv/insns/vfncvt_xu_f_w.h
@@ -0,0 +1,24 @@
+// vfncvt.xu.f.w vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  auto vs2 = P.VU.elt<float16_t>(rs2_num, i);
+  P.VU.elt<uint8_t>(rd_num, i, true) = f16_to_ui8(vs2, STATE.frm, true);
+},
+{
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<uint16_t>(rd_num, i, true) = f32_to_ui16(vs2, STATE.frm, true);
+},
+{
+  auto vs2 = P.VU.elt<float64_t>(rs2_num, i);
+  P.VU.elt<uint32_t>(rd_num, i, true) = f64_to_ui32(vs2, STATE.frm, true);
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+{
+  require(p->supports_extension('D'));
+},
+false, (P.VU.vsew <= 32))
diff --git a/riscv/insns/vfnmacc_vf.h b/riscv/insns/vfnmacc_vf.h
new file mode 100644
index 0000000000..1b99302c6c
--- /dev/null
+++ b/riscv/insns/vfnmacc_vf.h
@@ -0,0 +1,11 @@
+// vfnmacc: vd[i] = -(f[rs1] * vs2[i]) - vd[i]
+VI_VFP_VF_LOOP
+({
+  vd = f16_mulAdd(rs1, f16(vs2.v ^ F16_SIGN), f16(vd.v ^ F16_SIGN));
+},
+{
+  vd = f32_mulAdd(rs1, f32(vs2.v ^ F32_SIGN), f32(vd.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(rs1, f64(vs2.v ^ F64_SIGN), f64(vd.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfnmacc_vv.h b/riscv/insns/vfnmacc_vv.h
new file mode 100644
index 0000000000..7200e063ab
--- /dev/null
+++ b/riscv/insns/vfnmacc_vv.h
@@ -0,0 +1,11 @@
+// vfnmacc: vd[i] = -(vs1[i] * vs2[i]) - vd[i]
+VI_VFP_VV_LOOP
+({
+  vd = f16_mulAdd(f16(vs2.v ^ F16_SIGN), vs1, f16(vd.v ^ F16_SIGN));
+},
+{
+  vd = f32_mulAdd(f32(vs2.v ^ F32_SIGN), vs1, f32(vd.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(f64(vs2.v ^ F64_SIGN), vs1, f64(vd.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfnmadd_vf.h b/riscv/insns/vfnmadd_vf.h
new file mode 100644
index 0000000000..cb9c217ff2
--- /dev/null
+++ b/riscv/insns/vfnmadd_vf.h
@@ -0,0 +1,11 @@
+// vfnmadd: vd[i] = -(vd[i] * f[rs1]) - vs2[i]
+VI_VFP_VF_LOOP
+({
+  vd = f16_mulAdd(f16(vd.v ^ F16_SIGN), rs1, f16(vs2.v ^ F16_SIGN));
+},
+{
+  vd = f32_mulAdd(f32(vd.v ^ F32_SIGN), rs1, f32(vs2.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(f64(vd.v ^ F64_SIGN), rs1, f64(vs2.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfnmadd_vv.h b/riscv/insns/vfnmadd_vv.h
new file mode 100644
index 0000000000..7160ed7d6f
--- /dev/null
+++ b/riscv/insns/vfnmadd_vv.h
@@ -0,0 +1,11 @@
+// vfnmadd: vd[i] = -(vd[i] * vs1[i]) - vs2[i]
+VI_VFP_VV_LOOP
+({
+  vd = f16_mulAdd(f16(vd.v ^ F16_SIGN), vs1, f16(vs2.v ^ F16_SIGN));
+},
+{
+  vd = f32_mulAdd(f32(vd.v ^ F32_SIGN), vs1, f32(vs2.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(f64(vd.v ^ F64_SIGN), vs1, f64(vs2.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfnmsac_vf.h b/riscv/insns/vfnmsac_vf.h
new file mode 100644
index 0000000000..aa6baa30c6
--- /dev/null
+++ b/riscv/insns/vfnmsac_vf.h
@@ -0,0 +1,11 @@
+// vfnmsac: vd[i] = -(f[rs1] * vs2[i]) + vd[i]
+VI_VFP_VF_LOOP
+({
+  vd = f16_mulAdd(rs1, f16(vs2.v ^ F16_SIGN), vd);
+},
+{
+  vd = f32_mulAdd(rs1, f32(vs2.v ^ F32_SIGN), vd);
+},
+{
+  vd = f64_mulAdd(rs1, f64(vs2.v ^ F64_SIGN), vd);
+})
diff --git a/riscv/insns/vfnmsac_vv.h b/riscv/insns/vfnmsac_vv.h
new file mode 100644
index 0000000000..47db61d2d0
--- /dev/null
+++ b/riscv/insns/vfnmsac_vv.h
@@ -0,0 +1,11 @@
+// vfnmsac.vv vd, vs1, vs2, vm   # vd[i] = -(vs2[i] * vs1[i]) + vd[i]
+VI_VFP_VV_LOOP
+({
+  vd = f16_mulAdd(f16(vs1.v ^ F16_SIGN), vs2, vd);
+},
+{
+  vd = f32_mulAdd(f32(vs1.v ^ F32_SIGN), vs2, vd);
+},
+{
+  vd = f64_mulAdd(f64(vs1.v ^ F64_SIGN), vs2, vd);
+})
diff --git a/riscv/insns/vfnmsub_vf.h b/riscv/insns/vfnmsub_vf.h
new file mode 100644
index 0000000000..43aa9e2685
--- /dev/null
+++ b/riscv/insns/vfnmsub_vf.h
@@ -0,0 +1,11 @@
+// vfnmsub: vd[i] = -(vd[i] * f[rs1]) + vs2[i]
+VI_VFP_VF_LOOP
+({
+  vd = f16_mulAdd(f16(vd.v ^ F16_SIGN), rs1, vs2);
+},
+{
+  vd = f32_mulAdd(f32(vd.v ^ F32_SIGN), rs1, vs2);
+},
+{
+  vd = f64_mulAdd(f64(vd.v ^ F64_SIGN), rs1, vs2);
+})
diff --git a/riscv/insns/vfnmsub_vv.h b/riscv/insns/vfnmsub_vv.h
new file mode 100644
index 0000000000..2a45c8fca0
--- /dev/null
+++ b/riscv/insns/vfnmsub_vv.h
@@ -0,0 +1,11 @@
+// vfnmsub: vd[i] = -(vd[i] * vs1[i]) + vs2[i]
+VI_VFP_VV_LOOP
+({
+  vd = f16_mulAdd(f16(vd.v ^ F16_SIGN), vs1, vs2);
+},
+{
+  vd = f32_mulAdd(f32(vd.v ^ F32_SIGN), vs1, vs2);
+},
+{
+  vd = f64_mulAdd(f64(vd.v ^ F64_SIGN), vs1, vs2);
+})
diff --git a/riscv/insns/vfrdiv_vf.h b/riscv/insns/vfrdiv_vf.h
new file mode 100644
index 0000000000..b283343cc2
--- /dev/null
+++ b/riscv/insns/vfrdiv_vf.h
@@ -0,0 +1,11 @@
+// vfrdiv.vf vd, vs2, rs1, vm  # scalar-vector, vd[i] = f[rs1]/vs2[i]
+VI_VFP_VF_LOOP
+({
+  vd = f16_div(rs1, vs2);
+},
+{
+  vd = f32_div(rs1, vs2);
+},
+{
+  vd = f64_div(rs1, vs2);
+})
diff --git a/riscv/insns/vfrece7_v.h b/riscv/insns/vfrece7_v.h
new file mode 100644
index 0000000000..69c026b058
--- /dev/null
+++ b/riscv/insns/vfrece7_v.h
@@ -0,0 +1,11 @@
+// vfrece7.v vd, vs2, vm
+VI_VFP_V_LOOP
+({
+  vd = f16_recip7(vs2);
+},
+{
+  vd = f32_recip7(vs2);
+},
+{
+  vd = f64_recip7(vs2);
+})
diff --git a/riscv/insns/vfredmax_vs.h b/riscv/insns/vfredmax_vs.h
new file mode 100644
index 0000000000..f19ec59791
--- /dev/null
+++ b/riscv/insns/vfredmax_vs.h
@@ -0,0 +1,12 @@
+// vfredmax vd, vs2, vs1
+bool is_propagate = false;
+VI_VFP_VV_LOOP_REDUCTION
+({
+  vd_0 = f16_max(vd_0, vs2);
+},
+{
+  vd_0 = f32_max(vd_0, vs2);
+},
+{
+  vd_0 = f64_max(vd_0, vs2);
+})
diff --git a/riscv/insns/vfredmin_vs.h b/riscv/insns/vfredmin_vs.h
new file mode 100644
index 0000000000..e3cf151324
--- /dev/null
+++ b/riscv/insns/vfredmin_vs.h
@@ -0,0 +1,12 @@
+// vfredmin vd, vs2, vs1
+bool is_propagate = false;
+VI_VFP_VV_LOOP_REDUCTION
+({
+  vd_0 = f16_min(vd_0, vs2);
+},
+{
+  vd_0 = f32_min(vd_0, vs2);
+},
+{
+  vd_0 = f64_min(vd_0, vs2);
+})
diff --git a/riscv/insns/vfredosum_vs.h b/riscv/insns/vfredosum_vs.h
new file mode 100644
index 0000000000..2438a7ba9c
--- /dev/null
+++ b/riscv/insns/vfredosum_vs.h
@@ -0,0 +1,12 @@
+// vfredosum: vd[0] =  sum( vs2[*] , vs1[0] )
+bool is_propagate = false;
+VI_VFP_VV_LOOP_REDUCTION
+({
+  vd_0 = f16_add(vd_0, vs2);
+},
+{
+  vd_0 = f32_add(vd_0, vs2);
+},
+{
+  vd_0 = f64_add(vd_0, vs2);
+})
diff --git a/riscv/insns/vfredsum_vs.h b/riscv/insns/vfredsum_vs.h
new file mode 100644
index 0000000000..bad7308e54
--- /dev/null
+++ b/riscv/insns/vfredsum_vs.h
@@ -0,0 +1,12 @@
+// vfredsum: vd[0] =  sum( vs2[*] , vs1[0] )
+bool is_propagate = true;
+VI_VFP_VV_LOOP_REDUCTION
+({
+  vd_0 = f16_add(vd_0, vs2);
+},
+{
+  vd_0 = f32_add(vd_0, vs2);
+},
+{
+  vd_0 = f64_add(vd_0, vs2);
+})
diff --git a/riscv/insns/vfrsqrte7_v.h b/riscv/insns/vfrsqrte7_v.h
new file mode 100644
index 0000000000..a073764127
--- /dev/null
+++ b/riscv/insns/vfrsqrte7_v.h
@@ -0,0 +1,11 @@
+// vfrsqrte7.v vd, vs2, vm
+VI_VFP_V_LOOP
+({
+  vd = f16_rsqrte7(vs2);
+},
+{
+  vd = f32_rsqrte7(vs2);
+},
+{
+  vd = f64_rsqrte7(vs2);
+})
diff --git a/riscv/insns/vfrsub_vf.h b/riscv/insns/vfrsub_vf.h
new file mode 100644
index 0000000000..7fb26a5b5d
--- /dev/null
+++ b/riscv/insns/vfrsub_vf.h
@@ -0,0 +1,11 @@
+// vfrsub.vf vd, vs2, rs1  # vd[i] = f[rs1] - vs2[i]
+VI_VFP_VF_LOOP
+({
+  vd = f16_sub(rs1, vs2);
+},
+{
+  vd = f32_sub(rs1, vs2);
+},
+{
+  vd = f64_sub(rs1, vs2);
+})
diff --git a/riscv/insns/vfsgnj_vf.h b/riscv/insns/vfsgnj_vf.h
new file mode 100644
index 0000000000..ce06185ee9
--- /dev/null
+++ b/riscv/insns/vfsgnj_vf.h
@@ -0,0 +1,11 @@
+// vfsgnj.vf vd, vs2, rs1
+VI_VFP_VF_LOOP
+({
+  vd = fsgnj16(vs2.v, rs1.v, false, false);
+},
+{
+  vd = fsgnj32(vs2.v, rs1.v, false, false);
+},
+{
+  vd = fsgnj64(vs2.v, rs1.v, false, false);
+})
diff --git a/riscv/insns/vfsgnj_vv.h b/riscv/insns/vfsgnj_vv.h
new file mode 100644
index 0000000000..722cb29cf0
--- /dev/null
+++ b/riscv/insns/vfsgnj_vv.h
@@ -0,0 +1,11 @@
+// vfsgnj.vv vd, vs2, vs1
+VI_VFP_VV_LOOP
+({
+  vd = fsgnj16(vs2.v, vs1.v, false, false);
+},
+{
+  vd = fsgnj32(vs2.v, vs1.v, false, false);
+},
+{
+  vd = fsgnj64(vs2.v, vs1.v, false, false);
+})
diff --git a/riscv/insns/vfsgnjn_vf.h b/riscv/insns/vfsgnjn_vf.h
new file mode 100644
index 0000000000..e4894124aa
--- /dev/null
+++ b/riscv/insns/vfsgnjn_vf.h
@@ -0,0 +1,11 @@
+// vfsgnjn.vf vd, vs2, rs1
+VI_VFP_VF_LOOP
+({
+  vd = fsgnj16(vs2.v, rs1.v, true, false);
+},
+{
+  vd = fsgnj32(vs2.v, rs1.v, true, false);
+},
+{
+  vd = fsgnj64(vs2.v, rs1.v, true, false);
+})
diff --git a/riscv/insns/vfsgnjn_vv.h b/riscv/insns/vfsgnjn_vv.h
new file mode 100644
index 0000000000..1d91f69199
--- /dev/null
+++ b/riscv/insns/vfsgnjn_vv.h
@@ -0,0 +1,11 @@
+// vfsgnjn.vv vd, vs2, vs1
+VI_VFP_VV_LOOP
+({
+  vd = fsgnj16(vs2.v, vs1.v, true, false);
+},
+{
+  vd = fsgnj32(vs2.v, vs1.v, true, false);
+},
+{
+  vd = fsgnj64(vs2.v, vs1.v, true, false);
+})
diff --git a/riscv/insns/vfsgnjx_vf.h b/riscv/insns/vfsgnjx_vf.h
new file mode 100644
index 0000000000..7be164c770
--- /dev/null
+++ b/riscv/insns/vfsgnjx_vf.h
@@ -0,0 +1,11 @@
+// vfsgnjx.vf vd, vs2, rs1
+VI_VFP_VF_LOOP
+({
+  vd = fsgnj16(vs2.v, rs1.v, false, true);
+},
+{
+  vd = fsgnj32(vs2.v, rs1.v, false, true);
+},
+{
+  vd = fsgnj64(vs2.v, rs1.v, false, true);
+})
diff --git a/riscv/insns/vfsgnjx_vv.h b/riscv/insns/vfsgnjx_vv.h
new file mode 100644
index 0000000000..b04b8454ac
--- /dev/null
+++ b/riscv/insns/vfsgnjx_vv.h
@@ -0,0 +1,11 @@
+// vfsgnjx.vv vd, vs2, vs1
+VI_VFP_VV_LOOP
+({
+  vd = fsgnj16(vs2.v, vs1.v, false, true);
+},
+{
+  vd = fsgnj32(vs2.v, vs1.v, false, true);
+},
+{
+  vd = fsgnj64(vs2.v, vs1.v, false, true);
+})
diff --git a/riscv/insns/vfslide1down_vf.h b/riscv/insns/vfslide1down_vf.h
new file mode 100644
index 0000000000..66eeaccbf4
--- /dev/null
+++ b/riscv/insns/vfslide1down_vf.h
@@ -0,0 +1,36 @@
+//vfslide1down.vf vd, vs2, rs1  # element vl-1 of vd = f[rs1]; every other element comes from the slide-down-by-1 setup
+VI_CHECK_SLIDE(false);
+
+VI_VFP_LOOP_BASE
+if (i != vl - 1) {  // all elements except the last: copy the vs2 prepared by VI_XI_SLIDEDOWN_PARAMS (offset 1)
+  switch (P.VU.vsew) {
+    case e16: {
+      VI_XI_SLIDEDOWN_PARAMS(e16, 1);
+      vd = vs2;
+    }
+    break;
+    case e32: {
+      VI_XI_SLIDEDOWN_PARAMS(e32, 1);
+      vd = vs2;
+    }
+    break;
+    case e64: {
+      VI_XI_SLIDEDOWN_PARAMS(e64, 1);
+      vd = vs2;
+    }
+    break;
+  }
+} else {  // last element: written directly from the scalar FP register
+  switch (P.VU.vsew) {
+    case e16:
+      P.VU.elt<float16_t>(rd_num, vl - 1, true) = f16(FRS1);
+      break;
+    case e32:
+      P.VU.elt<float32_t>(rd_num, vl - 1, true) = f32(FRS1);
+      break;
+    case e64:
+      P.VU.elt<float64_t>(rd_num, vl - 1, true) = f64(FRS1);
+      break;
+  }
+}
+VI_VFP_LOOP_END
diff --git a/riscv/insns/vfslide1up_vf.h b/riscv/insns/vfslide1up_vf.h
new file mode 100644
index 0000000000..b9c2817c28
--- /dev/null
+++ b/riscv/insns/vfslide1up_vf.h
@@ -0,0 +1,36 @@
+//vfslide1up.vf vd, vs2, rs1  # element 0 of vd = f[rs1]; every other element comes from the slide-up-by-1 setup
+VI_CHECK_SLIDE(true);
+
+VI_VFP_LOOP_BASE
+if (i != 0) {  // all elements except the first: copy the vs2 prepared by VI_XI_SLIDEUP_PARAMS (offset 1)
+  switch (P.VU.vsew) {
+    case e16: {
+      VI_XI_SLIDEUP_PARAMS(e16, 1);
+      vd = vs2;
+    }
+    break;
+    case e32: {
+      VI_XI_SLIDEUP_PARAMS(e32, 1);
+      vd = vs2;
+    }
+    break;
+    case e64: {
+      VI_XI_SLIDEUP_PARAMS(e64, 1);
+      vd = vs2;
+    }
+    break;
+  }
+} else {  // first element: written directly from the scalar FP register
+  switch (P.VU.vsew) {
+    case e16:
+      P.VU.elt<float16_t>(rd_num, 0, true) = f16(FRS1);
+      break;
+    case e32:
+      P.VU.elt<float32_t>(rd_num, 0, true) = f32(FRS1);
+      break;
+    case e64:
+      P.VU.elt<float64_t>(rd_num, 0, true) = f64(FRS1);
+      break;
+  }
+}
+VI_VFP_LOOP_END
diff --git a/riscv/insns/vfsqrt_v.h b/riscv/insns/vfsqrt_v.h
new file mode 100644
index 0000000000..86f0148d4b
--- /dev/null
+++ b/riscv/insns/vfsqrt_v.h
@@ -0,0 +1,11 @@
+// vfsqrt.v vd, vs2, vm
+VI_VFP_V_LOOP
+({
+  vd = f16_sqrt(vs2);
+},
+{
+  vd = f32_sqrt(vs2);
+},
+{
+  vd = f64_sqrt(vs2);
+})
diff --git a/riscv/insns/vfsub_vf.h b/riscv/insns/vfsub_vf.h
new file mode 100644
index 0000000000..fc6877ca5a
--- /dev/null
+++ b/riscv/insns/vfsub_vf.h
@@ -0,0 +1,11 @@
+// vfsub.vf vd, vs2, rs1
+VI_VFP_VF_LOOP
+({
+  vd = f16_sub(vs2, rs1);
+},
+{
+  vd = f32_sub(vs2, rs1);
+},
+{
+  vd = f64_sub(vs2, rs1);
+})
diff --git a/riscv/insns/vfsub_vv.h b/riscv/insns/vfsub_vv.h
new file mode 100644
index 0000000000..b0403f1180
--- /dev/null
+++ b/riscv/insns/vfsub_vv.h
@@ -0,0 +1,11 @@
+// vfsub.vv vd, vs2, vs1
+VI_VFP_VV_LOOP
+({
+  vd = f16_sub(vs2, vs1);
+},
+{
+  vd = f32_sub(vs2, vs1);
+},
+{
+  vd = f64_sub(vs2, vs1);
+})
diff --git a/riscv/insns/vfwadd_vf.h b/riscv/insns/vfwadd_vf.h
new file mode 100644
index 0000000000..b8249001e0
--- /dev/null
+++ b/riscv/insns/vfwadd_vf.h
@@ -0,0 +1,8 @@
+// vfwadd.vf vd, vs2, rs1
+VI_VFP_VF_LOOP_WIDE
+({
+  vd = f32_add(vs2, rs1);
+},
+{
+  vd = f64_add(vs2, rs1);
+})
diff --git a/riscv/insns/vfwadd_vv.h b/riscv/insns/vfwadd_vv.h
new file mode 100644
index 0000000000..7255a50e13
--- /dev/null
+++ b/riscv/insns/vfwadd_vv.h
@@ -0,0 +1,8 @@
+// vfwadd.vv vd, vs2, vs1
+VI_VFP_VV_LOOP_WIDE
+({
+  vd = f32_add(vs2, vs1);
+},
+{
+  vd = f64_add(vs2, vs1);
+})
diff --git a/riscv/insns/vfwadd_wf.h b/riscv/insns/vfwadd_wf.h
new file mode 100644
index 0000000000..021b17f049
--- /dev/null
+++ b/riscv/insns/vfwadd_wf.h
@@ -0,0 +1,8 @@
+// vfwadd.wf vd, vs2, rs1
+VI_VFP_WF_LOOP_WIDE
+({
+  vd = f32_add(vs2, rs1);
+},
+{
+  vd = f64_add(vs2, rs1);
+})
diff --git a/riscv/insns/vfwadd_wv.h b/riscv/insns/vfwadd_wv.h
new file mode 100644
index 0000000000..c1ed038925
--- /dev/null
+++ b/riscv/insns/vfwadd_wv.h
@@ -0,0 +1,8 @@
+// vfwadd.wv vd, vs2, vs1
+VI_VFP_WV_LOOP_WIDE
+({
+  vd = f32_add(vs2, vs1);
+},
+{
+  vd = f64_add(vs2, vs1);
+})
diff --git a/riscv/insns/vfwcvt_f_f_v.h b/riscv/insns/vfwcvt_f_f_v.h
new file mode 100644
index 0000000000..9bf3f386e3
--- /dev/null
+++ b/riscv/insns/vfwcvt_f_f_v.h
@@ -0,0 +1,23 @@
+// vfwcvt.f.f.v vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  ;
+},
+{
+  auto vs2 = P.VU.elt<float16_t>(rs2_num, i);
+  P.VU.elt<float32_t>(rd_num, i, true) = f16_to_f32(vs2);
+},
+{
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<float64_t>(rd_num, i, true) = f32_to_f64(vs2);
+},
+{
+  ;
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('D'));
+},
+true, (P.VU.vsew >= 16))
diff --git a/riscv/insns/vfwcvt_f_x_v.h b/riscv/insns/vfwcvt_f_x_v.h
new file mode 100644
index 0000000000..481f37122a
--- /dev/null
+++ b/riscv/insns/vfwcvt_f_x_v.h
@@ -0,0 +1,24 @@
+// vfwcvt.f.x.v vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  auto vs2 = P.VU.elt<int8_t>(rs2_num, i);
+  P.VU.elt<float16_t>(rd_num, i, true) = i32_to_f16(vs2);
+},
+{
+  auto vs2 = P.VU.elt<int16_t>(rs2_num, i);
+  P.VU.elt<float32_t>(rd_num, i, true) = i32_to_f32(vs2);
+},
+{
+  auto vs2 = P.VU.elt<int32_t>(rs2_num, i);
+  P.VU.elt<float64_t>(rd_num, i, true) = i32_to_f64(vs2);
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+{
+  require(p->supports_extension('D'));
+},
+true, (P.VU.vsew >= 8))
diff --git a/riscv/insns/vfwcvt_f_xu_v.h b/riscv/insns/vfwcvt_f_xu_v.h
new file mode 100644
index 0000000000..544f33dd4b
--- /dev/null
+++ b/riscv/insns/vfwcvt_f_xu_v.h
@@ -0,0 +1,24 @@
+// vfwcvt.f.xu.v vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  auto vs2 = P.VU.elt<uint8_t>(rs2_num, i);
+  P.VU.elt<float16_t>(rd_num, i, true) = ui32_to_f16(vs2);
+},
+{
+  auto vs2 = P.VU.elt<uint16_t>(rs2_num, i);
+  P.VU.elt<float32_t>(rd_num, i, true) = ui32_to_f32(vs2);
+},
+{
+  auto vs2 = P.VU.elt<uint32_t>(rs2_num, i);
+  P.VU.elt<float64_t>(rd_num, i, true) = ui32_to_f64(vs2);
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+{
+  require(p->supports_extension('D'));
+},
+true, (P.VU.vsew >= 8))
diff --git a/riscv/insns/vfwcvt_rtz_x_f_v.h b/riscv/insns/vfwcvt_rtz_x_f_v.h
new file mode 100644
index 0000000000..7cbcf3116e
--- /dev/null
+++ b/riscv/insns/vfwcvt_rtz_x_f_v.h
@@ -0,0 +1,23 @@
+// vfwcvt.rtz.x.f.v vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  ;
+},
+{
+  auto vs2 = P.VU.elt<float16_t>(rs2_num, i);
+  P.VU.elt<int32_t>(rd_num, i, true) = f16_to_i32(vs2, softfloat_round_minMag, true);
+},
+{
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<int64_t>(rd_num, i, true) = f32_to_i64(vs2, softfloat_round_minMag, true);
+},
+{
+  ;
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+true, (P.VU.vsew >= 16))
diff --git a/riscv/insns/vfwcvt_rtz_xu_f_v.h b/riscv/insns/vfwcvt_rtz_xu_f_v.h
new file mode 100644
index 0000000000..81be047a3c
--- /dev/null
+++ b/riscv/insns/vfwcvt_rtz_xu_f_v.h
@@ -0,0 +1,23 @@
+// vfwcvt.rtz.xu.f.v vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  ;
+},
+{
+  auto vs2 = P.VU.elt<float16_t>(rs2_num, i);
+  P.VU.elt<uint32_t>(rd_num, i, true) = f16_to_ui32(vs2, softfloat_round_minMag, true);
+},
+{
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<uint64_t>(rd_num, i, true) = f32_to_ui64(vs2, softfloat_round_minMag, true);
+},
+{
+  ;
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+true, (P.VU.vsew >= 16))
diff --git a/riscv/insns/vfwcvt_x_f_v.h b/riscv/insns/vfwcvt_x_f_v.h
new file mode 100644
index 0000000000..ebd99c0cd1
--- /dev/null
+++ b/riscv/insns/vfwcvt_x_f_v.h
@@ -0,0 +1,23 @@
+// vfwcvt.x.f.v vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  ;
+},
+{
+  auto vs2 = P.VU.elt<float16_t>(rs2_num, i);
+  P.VU.elt<int32_t>(rd_num, i, true) = f16_to_i32(vs2, STATE.frm, true);
+},
+{
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<int64_t>(rd_num, i, true) = f32_to_i64(vs2, STATE.frm, true);
+},
+{
+  ;
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+true, (P.VU.vsew >= 16))
diff --git a/riscv/insns/vfwcvt_xu_f_v.h b/riscv/insns/vfwcvt_xu_f_v.h
new file mode 100644
index 0000000000..55036f6c55
--- /dev/null
+++ b/riscv/insns/vfwcvt_xu_f_v.h
@@ -0,0 +1,23 @@
+// vfwcvt.xu.f.v vd, vs2, vm
+VI_VFP_CVT_SCALE
+({
+  ;
+},
+{
+  auto vs2 = P.VU.elt<float16_t>(rs2_num, i);
+  P.VU.elt<uint32_t>(rd_num, i, true) = f16_to_ui32(vs2, STATE.frm, true);
+},
+{
+  auto vs2 = P.VU.elt<float32_t>(rs2_num, i);
+  P.VU.elt<uint64_t>(rd_num, i, true) = f32_to_ui64(vs2, STATE.frm, true);
+},
+{
+  ;
+},
+{
+  require(p->supports_extension(EXT_ZFH));
+},
+{
+  require(p->supports_extension('F'));
+},
+true, (P.VU.vsew >= 16))
diff --git a/riscv/insns/vfwmacc_vf.h b/riscv/insns/vfwmacc_vf.h
new file mode 100644
index 0000000000..441fa0a791
--- /dev/null
+++ b/riscv/insns/vfwmacc_vf.h
@@ -0,0 +1,8 @@
+// vfwmacc.vf vd, vs2, rs1
+VI_VFP_VF_LOOP_WIDE
+({
+  vd = f32_mulAdd(rs1, vs2, vd);
+},
+{
+  vd = f64_mulAdd(rs1, vs2, vd);
+})
diff --git a/riscv/insns/vfwmacc_vv.h b/riscv/insns/vfwmacc_vv.h
new file mode 100644
index 0000000000..a654198bfd
--- /dev/null
+++ b/riscv/insns/vfwmacc_vv.h
@@ -0,0 +1,8 @@
+// vfwmacc.vv vd, vs2, vs1
+VI_VFP_VV_LOOP_WIDE
+({
+  vd = f32_mulAdd(vs1, vs2, vd);
+},
+{
+  vd = f64_mulAdd(vs1, vs2, vd);
+})
diff --git a/riscv/insns/vfwmsac_vf.h b/riscv/insns/vfwmsac_vf.h
new file mode 100644
index 0000000000..18010ff490
--- /dev/null
+++ b/riscv/insns/vfwmsac_vf.h
@@ -0,0 +1,8 @@
+// vfwmsac.vf vd, vs2, rs1
+VI_VFP_VF_LOOP_WIDE
+({
+  vd = f32_mulAdd(rs1, vs2, f32(vd.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(rs1, vs2, f64(vd.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfwmsac_vv.h b/riscv/insns/vfwmsac_vv.h
new file mode 100644
index 0000000000..9dc4073fef
--- /dev/null
+++ b/riscv/insns/vfwmsac_vv.h
@@ -0,0 +1,8 @@
+// vfwmsac.vv  vd, vs2, vs1
+VI_VFP_VV_LOOP_WIDE
+({
+  vd = f32_mulAdd(vs1, vs2, f32(vd.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(vs1, vs2, f64(vd.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfwmul_vf.h b/riscv/insns/vfwmul_vf.h
new file mode 100644
index 0000000000..2bb543f63a
--- /dev/null
+++ b/riscv/insns/vfwmul_vf.h
@@ -0,0 +1,8 @@
+// vfwmul.vf vd, vs2, rs1
+VI_VFP_VF_LOOP_WIDE
+({
+  vd = f32_mul(vs2, rs1);
+},
+{
+  vd = f64_mul(vs2, rs1);
+})
diff --git a/riscv/insns/vfwmul_vv.h b/riscv/insns/vfwmul_vv.h
new file mode 100644
index 0000000000..2ce38e62c1
--- /dev/null
+++ b/riscv/insns/vfwmul_vv.h
@@ -0,0 +1,8 @@
+// vfwmul.vv vd, vs2, vs1
+VI_VFP_VV_LOOP_WIDE
+({
+  vd = f32_mul(vs2, vs1);
+},
+{
+  vd = f64_mul(vs2, vs1);
+})
diff --git a/riscv/insns/vfwnmacc_vf.h b/riscv/insns/vfwnmacc_vf.h
new file mode 100644
index 0000000000..038bda08ca
--- /dev/null
+++ b/riscv/insns/vfwnmacc_vf.h
@@ -0,0 +1,8 @@
+// vfwnmacc.vf vd, vs2, rs1
+VI_VFP_VF_LOOP_WIDE
+({
+  vd = f32_mulAdd(f32(rs1.v ^ F32_SIGN), vs2, f32(vd.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(f64(rs1.v ^ F64_SIGN), vs2, f64(vd.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfwnmacc_vv.h b/riscv/insns/vfwnmacc_vv.h
new file mode 100644
index 0000000000..bf863e04c2
--- /dev/null
+++ b/riscv/insns/vfwnmacc_vv.h
@@ -0,0 +1,8 @@
+// vfwnmacc.vv vd, vs2, vs1
+VI_VFP_VV_LOOP_WIDE
+({
+  vd = f32_mulAdd(f32(vs1.v ^ F32_SIGN), vs2, f32(vd.v ^ F32_SIGN));
+},
+{
+  vd = f64_mulAdd(f64(vs1.v ^ F64_SIGN), vs2, f64(vd.v ^ F64_SIGN));
+})
diff --git a/riscv/insns/vfwnmsac_vf.h b/riscv/insns/vfwnmsac_vf.h
new file mode 100644
index 0000000000..1e288e1b91
--- /dev/null
+++ b/riscv/insns/vfwnmsac_vf.h
@@ -0,0 +1,8 @@
+// vfwnmsac.vf vd, vs2, rs1
+VI_VFP_VF_LOOP_WIDE
+({
+  vd = f32_mulAdd(f32(rs1.v ^ F32_SIGN), vs2, vd);
+},
+{
+  vd = f64_mulAdd(f64(rs1.v ^ F64_SIGN), vs2, vd);
+})
diff --git a/riscv/insns/vfwnmsac_vv.h b/riscv/insns/vfwnmsac_vv.h
new file mode 100644
index 0000000000..ce97749e1c
--- /dev/null
+++ b/riscv/insns/vfwnmsac_vv.h
@@ -0,0 +1,8 @@
+// vfwnmsac.vv vd, vs2, vs1
+VI_VFP_VV_LOOP_WIDE
+({
+  vd = f32_mulAdd(f32(vs1.v ^ F32_SIGN), vs2, vd);
+},
+{
+  vd = f64_mulAdd(f64(vs1.v ^ F64_SIGN), vs2, vd);
+})
diff --git a/riscv/insns/vfwredosum_vs.h b/riscv/insns/vfwredosum_vs.h
new file mode 100644
index 0000000000..1f42d8ff2f
--- /dev/null
+++ b/riscv/insns/vfwredosum_vs.h
@@ -0,0 +1,9 @@
+// vfwredosum.vs vd, vs2, vs1
+bool is_propagate = false;
+VI_VFP_VV_LOOP_WIDE_REDUCTION
+({
+  vd_0 = f32_add(vd_0, vs2);
+},
+{
+  vd_0 = f64_add(vd_0, vs2);
+})
diff --git a/riscv/insns/vfwredsum_vs.h b/riscv/insns/vfwredsum_vs.h
new file mode 100644
index 0000000000..4ef28969dc
--- /dev/null
+++ b/riscv/insns/vfwredsum_vs.h
@@ -0,0 +1,9 @@
+// vfwredsum.vs vd, vs2, vs1
+bool is_propagate = true;
+VI_VFP_VV_LOOP_WIDE_REDUCTION
+({
+  vd_0 = f32_add(vd_0, vs2);
+},
+{
+  vd_0 = f64_add(vd_0, vs2);
+})
diff --git a/riscv/insns/vfwsub_vf.h b/riscv/insns/vfwsub_vf.h
new file mode 100644
index 0000000000..8c37688419
--- /dev/null
+++ b/riscv/insns/vfwsub_vf.h
@@ -0,0 +1,8 @@
+// vfwsub.vf vd, vs2, rs1
+VI_VFP_VF_LOOP_WIDE
+({
+  vd = f32_sub(vs2, rs1);
+},
+{
+  vd = f64_sub(vs2, rs1);
+})
diff --git a/riscv/insns/vfwsub_vv.h b/riscv/insns/vfwsub_vv.h
new file mode 100644
index 0000000000..ce08e36af7
--- /dev/null
+++ b/riscv/insns/vfwsub_vv.h
@@ -0,0 +1,8 @@
+// vfwsub.vv vd, vs2, vs1
+VI_VFP_VV_LOOP_WIDE
+({
+  vd = f32_sub(vs2, vs1);
+},
+{
+  vd = f64_sub(vs2, vs1);
+})
diff --git a/riscv/insns/vfwsub_wf.h b/riscv/insns/vfwsub_wf.h
new file mode 100644
index 0000000000..f6f47ca5cf
--- /dev/null
+++ b/riscv/insns/vfwsub_wf.h
@@ -0,0 +1,8 @@
+// vfwsub.wf vd, vs2, rs1
+VI_VFP_WF_LOOP_WIDE
+({
+  vd = f32_sub(vs2, rs1);
+},
+{
+  vd = f64_sub(vs2, rs1);
+})
diff --git a/riscv/insns/vfwsub_wv.h b/riscv/insns/vfwsub_wv.h
new file mode 100644
index 0000000000..eef904dcc2
--- /dev/null
+++ b/riscv/insns/vfwsub_wv.h
@@ -0,0 +1,8 @@
+// vfwsub.wv vd, vs2, vs1
+VI_VFP_WV_LOOP_WIDE
+({
+  vd = f32_sub(vs2, vs1);
+},
+{
+  vd = f64_sub(vs2, vs1);
+})
diff --git a/riscv/insns/vid_v.h b/riscv/insns/vid_v.h
new file mode 100644
index 0000000000..012d124a43
--- /dev/null
+++ b/riscv/insns/vid_v.h
@@ -0,0 +1,31 @@
+// vid.v vd, vm
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require_vector(true);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs1_num = insn.rs1();
+reg_t rs2_num = insn.rs2();
+require_align(rd_num, P.VU.vflmul);
+require_vm;
+
+for (reg_t i = P.VU.vstart ; i < P.VU.vl; ++i) {
+  VI_LOOP_ELEMENT_SKIP();
+
+  switch (sew) {
+  case e8:
+    P.VU.elt<uint8_t>(rd_num, i, true) = i;
+    break;
+  case e16:
+    P.VU.elt<uint16_t>(rd_num, i, true) = i;
+    break;
+  case e32:
+    P.VU.elt<uint32_t>(rd_num, i, true) = i;
+    break;
+  default:
+    P.VU.elt<uint64_t>(rd_num, i, true) = i;
+    break;
+  }
+}
+
+P.VU.vstart = 0;
diff --git a/riscv/insns/viota_m.h b/riscv/insns/viota_m.h
new file mode 100644
index 0000000000..a4368254e3
--- /dev/null
+++ b/riscv/insns/viota_m.h
@@ -0,0 +1,53 @@
+// viota.m vd, vs2, vm
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require_vector(true);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs1_num = insn.rs1();
+reg_t rs2_num = insn.rs2();
+require(P.VU.vstart == 0);
+require_vm;
+require_align(rd_num, P.VU.vflmul);
+require_noover(rd_num, P.VU.vflmul, rs2_num, 1);
+
+int cnt = 0;
+for (reg_t i = 0; i < vl; ++i) {
+  const int midx = i / 64;
+  const int mpos = i % 64;
+
+  bool vs2_lsb = ((P.VU.elt<uint64_t>(rs2_num, midx) >> mpos) & 0x1) == 1;
+  bool do_mask = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+  bool has_one = false;
+  if (insn.v_vm() == 1 || (insn.v_vm() == 0 && do_mask)) {
+    if (vs2_lsb) {
+      has_one = true;
+    }
+  }
+
+  bool use_ori = (insn.v_vm() == 0) && !do_mask;
+  switch (sew) {
+  case e8:
+    P.VU.elt<uint8_t>(rd_num, i, true) = use_ori ?
+                                   P.VU.elt<uint8_t>(rd_num, i) : cnt;
+    break;
+  case e16:
+    P.VU.elt<uint16_t>(rd_num, i, true) = use_ori ?
+                                    P.VU.elt<uint16_t>(rd_num, i) : cnt;
+    break;
+  case e32:
+    P.VU.elt<uint32_t>(rd_num, i, true) = use_ori ?
+                                    P.VU.elt<uint32_t>(rd_num, i) : cnt;
+    break;
+  default:
+    P.VU.elt<uint64_t>(rd_num, i, true) = use_ori ?
+                                    P.VU.elt<uint64_t>(rd_num, i) : cnt;
+    break;
+  }
+
+  if (has_one) {
+    cnt++;
+  }
+}
+
diff --git a/riscv/insns/vl1re16_v.h b/riscv/insns/vl1re16_v.h
new file mode 100644
index 0000000000..220e83e6bb
--- /dev/null
+++ b/riscv/insns/vl1re16_v.h
@@ -0,0 +1,2 @@
+// vl1re16.v vd, (rs1)
+VI_LD_WHOLE(uint16);
diff --git a/riscv/insns/vl1re32_v.h b/riscv/insns/vl1re32_v.h
new file mode 100644
index 0000000000..e72ca02a3d
--- /dev/null
+++ b/riscv/insns/vl1re32_v.h
@@ -0,0 +1,2 @@
+// vl1re32.v vd, (rs1)
+VI_LD_WHOLE(uint32);
diff --git a/riscv/insns/vl1re64_v.h b/riscv/insns/vl1re64_v.h
new file mode 100644
index 0000000000..265701a06f
--- /dev/null
+++ b/riscv/insns/vl1re64_v.h
@@ -0,0 +1,2 @@
+// vl1re64.v vd, (rs1)
+VI_LD_WHOLE(uint64);
diff --git a/riscv/insns/vl1re8_v.h b/riscv/insns/vl1re8_v.h
new file mode 100644
index 0000000000..b4ce661688
--- /dev/null
+++ b/riscv/insns/vl1re8_v.h
@@ -0,0 +1,2 @@
+// vl1re8.v vd, (rs1)
+VI_LD_WHOLE(uint8);
diff --git a/riscv/insns/vl2re16_v.h b/riscv/insns/vl2re16_v.h
new file mode 100644
index 0000000000..2846edd980
--- /dev/null
+++ b/riscv/insns/vl2re16_v.h
@@ -0,0 +1,2 @@
+// vl2re16.v vd, (rs1)
+VI_LD_WHOLE(uint16);
diff --git a/riscv/insns/vl2re32_v.h b/riscv/insns/vl2re32_v.h
new file mode 100644
index 0000000000..5cea835524
--- /dev/null
+++ b/riscv/insns/vl2re32_v.h
@@ -0,0 +1,2 @@
+// vl2re32.v vd, (rs1)
+VI_LD_WHOLE(uint32);
diff --git a/riscv/insns/vl2re64_v.h b/riscv/insns/vl2re64_v.h
new file mode 100644
index 0000000000..efdf2ce2ac
--- /dev/null
+++ b/riscv/insns/vl2re64_v.h
@@ -0,0 +1,2 @@
+// vl2re64.v vd, (rs1)
+VI_LD_WHOLE(uint64);
diff --git a/riscv/insns/vl2re8_v.h b/riscv/insns/vl2re8_v.h
new file mode 100644
index 0000000000..fcc3c4c057
--- /dev/null
+++ b/riscv/insns/vl2re8_v.h
@@ -0,0 +1,2 @@
+// vl2re8.v vd, (rs1)
+VI_LD_WHOLE(uint8);
diff --git a/riscv/insns/vl4re16_v.h b/riscv/insns/vl4re16_v.h
new file mode 100644
index 0000000000..0363418319
--- /dev/null
+++ b/riscv/insns/vl4re16_v.h
@@ -0,0 +1,2 @@
+// vl4re16.v vd, (rs1)
+VI_LD_WHOLE(uint16);
diff --git a/riscv/insns/vl4re32_v.h b/riscv/insns/vl4re32_v.h
new file mode 100644
index 0000000000..e37cc1ab7b
--- /dev/null
+++ b/riscv/insns/vl4re32_v.h
@@ -0,0 +1,2 @@
+// vl4re32.v vd, (rs1)
+VI_LD_WHOLE(uint32);
diff --git a/riscv/insns/vl4re64_v.h b/riscv/insns/vl4re64_v.h
new file mode 100644
index 0000000000..11486f5d1e
--- /dev/null
+++ b/riscv/insns/vl4re64_v.h
@@ -0,0 +1,2 @@
+// vl4re64.v vd, (rs1)
+VI_LD_WHOLE(uint64);
diff --git a/riscv/insns/vl4re8_v.h b/riscv/insns/vl4re8_v.h
new file mode 100644
index 0000000000..f9ce3ff7c7
--- /dev/null
+++ b/riscv/insns/vl4re8_v.h
@@ -0,0 +1,2 @@
+// vl4re8.v vd, (rs1)
+VI_LD_WHOLE(uint8);
diff --git a/riscv/insns/vl8re16_v.h b/riscv/insns/vl8re16_v.h
new file mode 100644
index 0000000000..0b3f1413ba
--- /dev/null
+++ b/riscv/insns/vl8re16_v.h
@@ -0,0 +1,2 @@
+// vl8re16.v vd, (rs1)
+VI_LD_WHOLE(uint16);
diff --git a/riscv/insns/vl8re32_v.h b/riscv/insns/vl8re32_v.h
new file mode 100644
index 0000000000..3372b89d05
--- /dev/null
+++ b/riscv/insns/vl8re32_v.h
@@ -0,0 +1,2 @@
+// vl8re32.v vd, (rs1)
+VI_LD_WHOLE(uint32);
diff --git a/riscv/insns/vl8re64_v.h b/riscv/insns/vl8re64_v.h
new file mode 100644
index 0000000000..f9a9ca981f
--- /dev/null
+++ b/riscv/insns/vl8re64_v.h
@@ -0,0 +1,2 @@
+// vl8re64.v vd, (rs1)
+VI_LD_WHOLE(uint64);
diff --git a/riscv/insns/vl8re8_v.h b/riscv/insns/vl8re8_v.h
new file mode 100644
index 0000000000..ee05e81a9d
--- /dev/null
+++ b/riscv/insns/vl8re8_v.h
@@ -0,0 +1,2 @@
+// vl8re8.v vd, (rs1)
+VI_LD_WHOLE(uint8);
diff --git a/riscv/insns/vle16_v.h b/riscv/insns/vle16_v.h
new file mode 100644
index 0000000000..7bd2e837af
--- /dev/null
+++ b/riscv/insns/vle16_v.h
@@ -0,0 +1,2 @@
+// vle16.v and vlseg[2-8]e16.v
+VI_LD(0, (i * nf + fn), int16);
diff --git a/riscv/insns/vle16ff_v.h b/riscv/insns/vle16ff_v.h
new file mode 100644
index 0000000000..53c8889137
--- /dev/null
+++ b/riscv/insns/vle16ff_v.h
@@ -0,0 +1,2 @@
+// vle16ff.v and vlseg[2-8]e16ff.v
+VI_LDST_FF(int16);
diff --git a/riscv/insns/vle32_v.h b/riscv/insns/vle32_v.h
new file mode 100644
index 0000000000..9399fd621f
--- /dev/null
+++ b/riscv/insns/vle32_v.h
@@ -0,0 +1,2 @@
+// vle32.v and vlseg[2-8]e32.v
+VI_LD(0, (i * nf + fn), int32);
diff --git a/riscv/insns/vle32ff_v.h b/riscv/insns/vle32ff_v.h
new file mode 100644
index 0000000000..7d03d7ddd5
--- /dev/null
+++ b/riscv/insns/vle32ff_v.h
@@ -0,0 +1,2 @@
+// vle32ff.v and vlseg[2-8]e32ff.v
+VI_LDST_FF(int32);
diff --git a/riscv/insns/vle64_v.h b/riscv/insns/vle64_v.h
new file mode 100644
index 0000000000..3f2654dd8b
--- /dev/null
+++ b/riscv/insns/vle64_v.h
@@ -0,0 +1,2 @@
+// vle64.v and vlseg[2-8]e64.v
+VI_LD(0, (i * nf + fn), int64);
diff --git a/riscv/insns/vle64ff_v.h b/riscv/insns/vle64ff_v.h
new file mode 100644
index 0000000000..39996da6f1
--- /dev/null
+++ b/riscv/insns/vle64ff_v.h
@@ -0,0 +1,2 @@
+// vle64ff.v and vlseg[2-8]e64ff.v
+VI_LDST_FF(int64);
diff --git a/riscv/insns/vle8_v.h b/riscv/insns/vle8_v.h
new file mode 100644
index 0000000000..5613a1dd3e
--- /dev/null
+++ b/riscv/insns/vle8_v.h
@@ -0,0 +1,2 @@
+// vle8.v and vlseg[2-8]e8.v
+VI_LD(0, (i * nf + fn), int8);
diff --git a/riscv/insns/vle8ff_v.h b/riscv/insns/vle8ff_v.h
new file mode 100644
index 0000000000..b56d1d339c
--- /dev/null
+++ b/riscv/insns/vle8ff_v.h
@@ -0,0 +1,2 @@
+// vle8ff.v and vlseg[2-8]e8ff.v
+VI_LDST_FF(int8);
diff --git a/riscv/insns/vlse16_v.h b/riscv/insns/vlse16_v.h
new file mode 100644
index 0000000000..7622ded97d
--- /dev/null
+++ b/riscv/insns/vlse16_v.h
@@ -0,0 +1,2 @@
+// vlse16.v and vlsseg[2-8]e16.v
+VI_LD(i * RS2, fn, int16);
diff --git a/riscv/insns/vlse32_v.h b/riscv/insns/vlse32_v.h
new file mode 100644
index 0000000000..1afc5e9cf4
--- /dev/null
+++ b/riscv/insns/vlse32_v.h
@@ -0,0 +1,2 @@
+// vlse32.v and vlsseg[2-8]e32.v
+VI_LD(i * RS2, fn, int32);
diff --git a/riscv/insns/vlse64_v.h b/riscv/insns/vlse64_v.h
new file mode 100644
index 0000000000..c6d999955e
--- /dev/null
+++ b/riscv/insns/vlse64_v.h
@@ -0,0 +1,2 @@
+// vlse64.v and vlsseg[2-8]e64.v
+VI_LD(i * RS2, fn, int64);
diff --git a/riscv/insns/vlse8_v.h b/riscv/insns/vlse8_v.h
new file mode 100644
index 0000000000..021a1fbcb4
--- /dev/null
+++ b/riscv/insns/vlse8_v.h
@@ -0,0 +1,2 @@
+// vlse8.v and vlsseg[2-8]e8.v
+VI_LD(i * RS2, fn, int8);
diff --git a/riscv/insns/vlxei16_v.h b/riscv/insns/vlxei16_v.h
new file mode 100644
index 0000000000..6e4ed49b57
--- /dev/null
+++ b/riscv/insns/vlxei16_v.h
@@ -0,0 +1,2 @@
+// vlxei16.v and vlxseg[2-8]ei16.v
+VI_LD_INDEX(e16, true);
diff --git a/riscv/insns/vlxei32_v.h b/riscv/insns/vlxei32_v.h
new file mode 100644
index 0000000000..a7da8ff035
--- /dev/null
+++ b/riscv/insns/vlxei32_v.h
@@ -0,0 +1,2 @@
+// vlxei32.v and vlxseg[2-8]ei32.v
+VI_LD_INDEX(e32, true);
diff --git a/riscv/insns/vlxei64_v.h b/riscv/insns/vlxei64_v.h
new file mode 100644
index 0000000000..067224e4c5
--- /dev/null
+++ b/riscv/insns/vlxei64_v.h
@@ -0,0 +1,3 @@
+// vlxei64.v and vlxseg[2-8]ei64.v
+VI_LD_INDEX(e64, true);
+
diff --git a/riscv/insns/vlxei8_v.h b/riscv/insns/vlxei8_v.h
new file mode 100644
index 0000000000..d27304996b
--- /dev/null
+++ b/riscv/insns/vlxei8_v.h
@@ -0,0 +1,2 @@
+// vlxei8.v and vlxseg[2-8]ei8.v
+VI_LD_INDEX(e8, true);
diff --git a/riscv/insns/vmacc_vv.h b/riscv/insns/vmacc_vv.h
new file mode 100644
index 0000000000..e6ec93ff71
--- /dev/null
+++ b/riscv/insns/vmacc_vv.h
@@ -0,0 +1,5 @@
+// vmacc.vv: vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+VI_VV_LOOP
+({
+  vd = vs1 * vs2 + vd;
+})
diff --git a/riscv/insns/vmacc_vx.h b/riscv/insns/vmacc_vx.h
new file mode 100644
index 0000000000..d40b264a05
--- /dev/null
+++ b/riscv/insns/vmacc_vx.h
@@ -0,0 +1,5 @@
+// vmacc.vx: vd[i] = +(x[rs1] * vs2[i]) + vd[i]
+VI_VX_LOOP
+({
+  vd = rs1 * vs2 + vd;
+})
diff --git a/riscv/insns/vmadc_vim.h b/riscv/insns/vmadc_vim.h
new file mode 100644
index 0000000000..afdca7e3c5
--- /dev/null
+++ b/riscv/insns/vmadc_vim.h
@@ -0,0 +1,13 @@
+// vmadc.vim vd, vs2, simm5: vd mask bit = carry-out of vs2 + simm5 + carry-in
+VI_XI_LOOP_CARRY
+({
+  auto v0 = P.VU.elt<uint64_t>(0, midx);
+  const uint64_t mmask = UINT64_C(1) << mpos;
+  const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+  uint64_t carry = insn.v_vm() == 0 ? (v0 >> mpos) & 0x1 : 0;
+  // Operands are truncated to sew bits and summed in 128 bits: carry-out is bit sew.
+  uint128_t res = (op_mask & simm5) + (op_mask & vs2) + carry;
+  // Write the carry-out into bit mpos of vd, leaving every other bit untouched.
+  carry = (res >> sew) & 0x1u;
+  vd = (vd & ~mmask) | ((carry << mpos) & mmask);
+})
diff --git a/riscv/insns/vmadc_vvm.h b/riscv/insns/vmadc_vvm.h
new file mode 100644
index 0000000000..a5d54c6dff
--- /dev/null
+++ b/riscv/insns/vmadc_vvm.h
@@ -0,0 +1,13 @@
+// vmadc.vvm vd, vs2, vs1: vd mask bit = carry-out of vs2 + vs1 + carry-in
+VI_VV_LOOP_CARRY
+({
+  auto v0 = P.VU.elt<uint64_t>(0, midx);
+  const uint64_t mmask = UINT64_C(1) << mpos;
+  const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+  uint64_t carry = insn.v_vm() == 0 ? (v0 >> mpos) & 0x1 : 0;
+  // Operands are truncated to sew bits and summed in 128 bits: carry-out is bit sew.
+  uint128_t res = (op_mask & vs1) + (op_mask & vs2) + carry;
+  // Write the carry-out into bit mpos of vd, leaving every other bit untouched.
+  carry = (res >> sew) & 0x1u;
+  vd = (vd & ~mmask) | ((carry << mpos) & mmask);
+})
diff --git a/riscv/insns/vmadc_vxm.h b/riscv/insns/vmadc_vxm.h
new file mode 100644
index 0000000000..ca0342e0ce
--- /dev/null
+++ b/riscv/insns/vmadc_vxm.h
@@ -0,0 +1,13 @@
+// vmadc.vxm vd, vs2, rs1: vd mask bit = carry-out of vs2 + rs1 + carry-in
+VI_XI_LOOP_CARRY
+({
+  auto v0 = P.VU.elt<uint64_t>(0, midx);
+  const uint64_t mmask = UINT64_C(1) << mpos;
+  const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+  uint64_t carry = insn.v_vm() == 0 ? (v0 >> mpos) & 0x1 : 0;
+  // Operands are truncated to sew bits and summed in 128 bits: carry-out is bit sew.
+  uint128_t res = (op_mask & rs1) + (op_mask & vs2) + carry;
+  // Write the carry-out into bit mpos of vd, leaving every other bit untouched.
+  carry = (res >> sew) & 0x1u;
+  vd = (vd & ~mmask) | ((carry << mpos) & mmask);
+})
diff --git a/riscv/insns/vmadd_vv.h b/riscv/insns/vmadd_vv.h
new file mode 100644
index 0000000000..a1c0d2ed64
--- /dev/null
+++ b/riscv/insns/vmadd_vv.h
@@ -0,0 +1,5 @@
+// vmadd: vd[i] = (vd[i] * vs1[i]) + vs2[i]
+VI_VV_LOOP
+({
+  vd = vd * vs1 + vs2;
+})
diff --git a/riscv/insns/vmadd_vx.h b/riscv/insns/vmadd_vx.h
new file mode 100644
index 0000000000..1a8a001593
--- /dev/null
+++ b/riscv/insns/vmadd_vx.h
@@ -0,0 +1,5 @@
+// vmadd: vd[i] = (vd[i] * x[rs1]) + vs2[i]
+VI_VX_LOOP
+({
+  vd = vd * rs1 + vs2;
+})
diff --git a/riscv/insns/vmand_mm.h b/riscv/insns/vmand_mm.h
new file mode 100644
index 0000000000..04615c60fc
--- /dev/null
+++ b/riscv/insns/vmand_mm.h
@@ -0,0 +1,2 @@
+// vmand.mm vd, vs2, vs1
+VI_LOOP_MASK(vs2 & vs1);
diff --git a/riscv/insns/vmandnot_mm.h b/riscv/insns/vmandnot_mm.h
new file mode 100644
index 0000000000..4c26469c7e
--- /dev/null
+++ b/riscv/insns/vmandnot_mm.h
@@ -0,0 +1,2 @@
+// vmandnot.mm vd, vs2, vs1
+VI_LOOP_MASK(vs2 & ~vs1);
diff --git a/riscv/insns/vmax_vv.h b/riscv/insns/vmax_vv.h
new file mode 100644
index 0000000000..b9f15c5f18
--- /dev/null
+++ b/riscv/insns/vmax_vv.h
@@ -0,0 +1,10 @@
+// vmax.vv vd, vs2, vs1, vm   # Vector-vector
+VI_VV_LOOP
+({
+  if (vs1 >= vs2) {
+    vd = vs1;
+  } else {
+    vd = vs2;
+  }
+
+})
diff --git a/riscv/insns/vmax_vx.h b/riscv/insns/vmax_vx.h
new file mode 100644
index 0000000000..06f3f43160
--- /dev/null
+++ b/riscv/insns/vmax_vx.h
@@ -0,0 +1,10 @@
+// vmax.vx vd, vs2, rs1, vm   # vector-scalar
+VI_VX_LOOP
+({
+  if (rs1 >= vs2) {
+    vd = rs1;
+  } else {
+    vd = vs2;
+  }
+
+})
diff --git a/riscv/insns/vmaxu_vv.h b/riscv/insns/vmaxu_vv.h
new file mode 100644
index 0000000000..4e6868d19e
--- /dev/null
+++ b/riscv/insns/vmaxu_vv.h
@@ -0,0 +1,9 @@
+// vmaxu.vv vd, vs2, vs1, vm   # Vector-vector
+VI_VV_ULOOP
+({
+  if (vs1 >= vs2) {
+    vd = vs1;
+  } else {
+    vd = vs2;
+  }
+})
diff --git a/riscv/insns/vmaxu_vx.h b/riscv/insns/vmaxu_vx.h
new file mode 100644
index 0000000000..cab89188f7
--- /dev/null
+++ b/riscv/insns/vmaxu_vx.h
@@ -0,0 +1,9 @@
+// vmaxu.vx vd, vs2, rs1, vm   # vector-scalar
+VI_VX_ULOOP
+({
+  if (rs1 >= vs2) {
+    vd = rs1;
+  } else {
+    vd = vs2;
+  }
+})
diff --git a/riscv/insns/vmerge_vim.h b/riscv/insns/vmerge_vim.h
new file mode 100644
index 0000000000..fd6ae1cdc4
--- /dev/null
+++ b/riscv/insns/vmerge_vim.h
@@ -0,0 +1,11 @@
+// vmerge.vim vd, vs2, simm5
+require_vector(true);
+VI_CHECK_SSS(false);
+VI_VVXI_MERGE_LOOP
+({
+  int midx = i / 64;
+  int mpos = i % 64;
+  bool use_first = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+  vd = use_first ? simm5 : vs2;
+})
diff --git a/riscv/insns/vmerge_vvm.h b/riscv/insns/vmerge_vvm.h
new file mode 100644
index 0000000000..df416b2c78
--- /dev/null
+++ b/riscv/insns/vmerge_vvm.h
@@ -0,0 +1,11 @@
+// vmerge.vvm vd, vs2, vs1
+require_vector(true);
+VI_CHECK_SSS(true);
+VI_VVXI_MERGE_LOOP
+({
+  int midx = i / 64;
+  int mpos = i % 64;
+  bool use_first = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+  vd = use_first ? vs1 : vs2;
+})
diff --git a/riscv/insns/vmerge_vxm.h b/riscv/insns/vmerge_vxm.h
new file mode 100644
index 0000000000..122a7b733e
--- /dev/null
+++ b/riscv/insns/vmerge_vxm.h
@@ -0,0 +1,11 @@
+// vmerge.vxm vd, vs2, rs1
+require_vector(true);
+VI_CHECK_SSS(false);
+VI_VVXI_MERGE_LOOP
+({
+  int midx = i / 64;
+  int mpos = i % 64;
+  bool use_first = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+  vd = use_first ? rs1 : vs2;
+})
diff --git a/riscv/insns/vmfeq_vf.h b/riscv/insns/vmfeq_vf.h
new file mode 100644
index 0000000000..040f2b0b9d
--- /dev/null
+++ b/riscv/insns/vmfeq_vf.h
@@ -0,0 +1,12 @@
+// vmfeq.vf vd, vs2, fs1
+VI_VFP_LOOP_CMP
+({
+  res = f16_eq(vs2, rs1);
+},
+{
+  res = f32_eq(vs2, rs1);
+},
+{
+  res = f64_eq(vs2, rs1);
+},
+false)
diff --git a/riscv/insns/vmfeq_vv.h b/riscv/insns/vmfeq_vv.h
new file mode 100644
index 0000000000..fb24d1329c
--- /dev/null
+++ b/riscv/insns/vmfeq_vv.h
@@ -0,0 +1,12 @@
+// vmfeq.vv vd, vs2, vs1
+VI_VFP_LOOP_CMP
+({
+  res = f16_eq(vs2, vs1);
+},
+{
+  res = f32_eq(vs2, vs1);
+},
+{
+  res = f64_eq(vs2, vs1);
+},
+true)
diff --git a/riscv/insns/vmfge_vf.h b/riscv/insns/vmfge_vf.h
new file mode 100644
index 0000000000..9e69855b51
--- /dev/null
+++ b/riscv/insns/vmfge_vf.h
@@ -0,0 +1,12 @@
+// vmfge.vf vd, vs2, rs1
+VI_VFP_LOOP_CMP
+({
+  res = f16_le(rs1, vs2);
+},
+{
+  res = f32_le(rs1, vs2);
+},
+{
+  res = f64_le(rs1, vs2);
+},
+false)
diff --git a/riscv/insns/vmfgt_vf.h b/riscv/insns/vmfgt_vf.h
new file mode 100644
index 0000000000..bd5d99b70d
--- /dev/null
+++ b/riscv/insns/vmfgt_vf.h
@@ -0,0 +1,12 @@
+// vmfgt.vf vd, vs2, rs1
+VI_VFP_LOOP_CMP
+({
+  res = f16_lt(rs1, vs2);
+},
+{
+  res = f32_lt(rs1, vs2);
+},
+{
+  res = f64_lt(rs1, vs2);
+},
+false)
diff --git a/riscv/insns/vmfle_vf.h b/riscv/insns/vmfle_vf.h
new file mode 100644
index 0000000000..3d2852fca9
--- /dev/null
+++ b/riscv/insns/vmfle_vf.h
@@ -0,0 +1,12 @@
+// vmfle.vf vd, vs2, rs1
+VI_VFP_LOOP_CMP
+({
+  res = f16_le(vs2, rs1);
+},
+{
+  res = f32_le(vs2, rs1);
+},
+{
+  res = f64_le(vs2, rs1);
+},
+false)
diff --git a/riscv/insns/vmfle_vv.h b/riscv/insns/vmfle_vv.h
new file mode 100644
index 0000000000..203ef210ff
--- /dev/null
+++ b/riscv/insns/vmfle_vv.h
@@ -0,0 +1,12 @@
+// vmfle.vv vd, vs2, vs1
+VI_VFP_LOOP_CMP
+({
+  res = f16_le(vs2, vs1);
+},
+{
+  res = f32_le(vs2, vs1);
+},
+{
+  res = f64_le(vs2, vs1);
+},
+true)
diff --git a/riscv/insns/vmflt_vf.h b/riscv/insns/vmflt_vf.h
new file mode 100644
index 0000000000..4780adc556
--- /dev/null
+++ b/riscv/insns/vmflt_vf.h
@@ -0,0 +1,12 @@
+// vmflt.vf vd, vs2, rs1
+VI_VFP_LOOP_CMP
+({
+  res = f16_lt(vs2, rs1);
+},
+{
+  res = f32_lt(vs2, rs1);
+},
+{
+  res = f64_lt(vs2, rs1);
+},
+false)
diff --git a/riscv/insns/vmflt_vv.h b/riscv/insns/vmflt_vv.h
new file mode 100644
index 0000000000..cdfc3fae90
--- /dev/null
+++ b/riscv/insns/vmflt_vv.h
@@ -0,0 +1,12 @@
+// vmflt.vv vd, vs2, vs1
+VI_VFP_LOOP_CMP
+({
+  res = f16_lt(vs2, vs1);
+},
+{
+  res = f32_lt(vs2, vs1);
+},
+{
+  res = f64_lt(vs2, vs1);
+},
+true)
diff --git a/riscv/insns/vmfne_vf.h b/riscv/insns/vmfne_vf.h
new file mode 100644
index 0000000000..84016993ae
--- /dev/null
+++ b/riscv/insns/vmfne_vf.h
@@ -0,0 +1,12 @@
+// vmfne.vf vd, vs2, rs1
+VI_VFP_LOOP_CMP
+({
+  res = !f16_eq(vs2, rs1);
+},
+{
+  res = !f32_eq(vs2, rs1);
+},
+{
+  res = !f64_eq(vs2, rs1);
+},
+false)
diff --git a/riscv/insns/vmfne_vv.h b/riscv/insns/vmfne_vv.h
new file mode 100644
index 0000000000..50dfa9c047
--- /dev/null
+++ b/riscv/insns/vmfne_vv.h
@@ -0,0 +1,12 @@
+// vmfne.vv vd, vs2, vs1
+VI_VFP_LOOP_CMP
+({
+  res = !f16_eq(vs2, vs1);
+},
+{
+  res = !f32_eq(vs2, vs1);
+},
+{
+  res = !f64_eq(vs2, vs1);
+},
+true)
diff --git a/riscv/insns/vmin_vv.h b/riscv/insns/vmin_vv.h
new file mode 100644
index 0000000000..21da0b3c5b
--- /dev/null
+++ b/riscv/insns/vmin_vv.h
@@ -0,0 +1,11 @@
+// vmin.vv vd, vs2, vs1, vm   # Vector-vector
+VI_VV_LOOP
+({
+  if (vs1 <= vs2) {
+    vd = vs1;
+  } else {
+    vd = vs2;
+  }
+
+
+})
diff --git a/riscv/insns/vmin_vx.h b/riscv/insns/vmin_vx.h
new file mode 100644
index 0000000000..3291776d05
--- /dev/null
+++ b/riscv/insns/vmin_vx.h
@@ -0,0 +1,11 @@
+// vmin.vx vd, vs2, rs1, vm   # vector-scalar
+VI_VX_LOOP
+({
+  if (rs1 <= vs2) {
+    vd = rs1;
+  } else {
+    vd = vs2;
+  }
+
+
+})
diff --git a/riscv/insns/vminu_vv.h b/riscv/insns/vminu_vv.h
new file mode 100644
index 0000000000..c0ab1958d0
--- /dev/null
+++ b/riscv/insns/vminu_vv.h
@@ -0,0 +1,9 @@
+// vminu.vv vd, vs2, vs1, vm   # Vector-vector
+VI_VV_ULOOP
+({
+  if (vs1 <= vs2) {
+    vd = vs1;
+  } else {
+    vd = vs2;
+  }
+})
diff --git a/riscv/insns/vminu_vx.h b/riscv/insns/vminu_vx.h
new file mode 100644
index 0000000000..1055895ac3
--- /dev/null
+++ b/riscv/insns/vminu_vx.h
@@ -0,0 +1,10 @@
+// vminu.vx vd, vs2, rs1, vm   # vector-scalar
+VI_VX_ULOOP
+({
+  if (rs1 <= vs2) {
+    vd = rs1;
+  } else {
+    vd = vs2;
+  }
+
+})
diff --git a/riscv/insns/vmnand_mm.h b/riscv/insns/vmnand_mm.h
new file mode 100644
index 0000000000..5a3ab090ae
--- /dev/null
+++ b/riscv/insns/vmnand_mm.h
@@ -0,0 +1,2 @@
+// vmnand.mm vd, vs2, vs1
+VI_LOOP_MASK(~(vs2 & vs1));
diff --git a/riscv/insns/vmnor_mm.h b/riscv/insns/vmnor_mm.h
new file mode 100644
index 0000000000..ab933786c9
--- /dev/null
+++ b/riscv/insns/vmnor_mm.h
@@ -0,0 +1,2 @@
+// vmnor.mm vd, vs2, vs1
+VI_LOOP_MASK(~(vs2 | vs1));
diff --git a/riscv/insns/vmor_mm.h b/riscv/insns/vmor_mm.h
new file mode 100644
index 0000000000..32e71b934a
--- /dev/null
+++ b/riscv/insns/vmor_mm.h
@@ -0,0 +1,2 @@
+// vmor.mm vd, vs2, vs1
+VI_LOOP_MASK(vs2 | vs1);
diff --git a/riscv/insns/vmornot_mm.h b/riscv/insns/vmornot_mm.h
new file mode 100644
index 0000000000..bdc1d8b6e7
--- /dev/null
+++ b/riscv/insns/vmornot_mm.h
@@ -0,0 +1,2 @@
+// vmornot.mm vd, vs2, vs1
+VI_LOOP_MASK(vs2 | ~vs1);
diff --git a/riscv/insns/vmsbc_vvm.h b/riscv/insns/vmsbc_vvm.h
new file mode 100644
index 0000000000..ff95464d05
--- /dev/null
+++ b/riscv/insns/vmsbc_vvm.h
@@ -0,0 +1,13 @@
+// vmsbc.vvm vd, vs2, vs1
+VI_VV_LOOP_CARRY
+({
+  auto v0 = P.VU.elt<uint64_t>(0, midx);
+  const uint64_t mmask = UINT64_C(1) << mpos;
+  const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+  uint64_t carry = insn.v_vm() == 0 ? (v0 >> mpos) & 0x1 : 0;
+
+  uint128_t res = (op_mask & vs2) - (op_mask & vs1) - carry;
+
+  carry = (res >> sew) & 0x1u;
+  vd = (vd & ~mmask) | ((carry << mpos) & mmask);
+})
diff --git a/riscv/insns/vmsbc_vxm.h b/riscv/insns/vmsbc_vxm.h
new file mode 100644
index 0000000000..29fa012af9
--- /dev/null
+++ b/riscv/insns/vmsbc_vxm.h
@@ -0,0 +1,13 @@
+// vmsbc.vxm vd, vs2, rs1: vd mask bit = borrow-out of vs2 - rs1 - borrow-in
+VI_XI_LOOP_CARRY
+({
+  auto v0 = P.VU.elt<uint64_t>(0, midx);
+  const uint64_t mmask = UINT64_C(1) << mpos;
+  const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+  uint64_t carry = insn.v_vm() == 0 ? (v0 >> mpos) & 0x1 : 0;
+  // Subtract sew-bit operands in 128 bits: a borrow wraps the result and sets bit sew.
+  uint128_t res = (op_mask & vs2) - (op_mask & rs1) - carry;
+  // Write the borrow-out into bit mpos of vd, leaving every other bit untouched.
+  carry = (res >> sew) & 0x1u;
+  vd = (vd & ~mmask) | ((carry << mpos) & mmask);
+})
diff --git a/riscv/insns/vmsbf_m.h b/riscv/insns/vmsbf_m.h
new file mode 100644
index 0000000000..a4195cfd08
--- /dev/null
+++ b/riscv/insns/vmsbf_m.h
@@ -0,0 +1,32 @@
+// vmsbf.m vd, vs2, vm: set vd mask bits before the first set bit of vs2
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require_vector(true);
+require(P.VU.vstart == 0);
+require_vm;
+require(insn.rd() != insn.rs2());
+
+reg_t vl = P.VU.vl;
+reg_t rd_num = insn.rd();
+reg_t rs2_num = insn.rs2();
+
+bool has_one = false;  // latches at the first active element whose vs2 bit is set
+for (reg_t i = P.VU.vstart; i < vl; ++i) {
+  const int midx = i / 64;
+  const int mpos = i % 64;
+  const uint64_t mmask = UINT64_C(1) << mpos;
+
+  bool vs2_lsb = ((P.VU.elt<uint64_t>(rs2_num, midx) >> mpos) & 0x1) == 1;
+  bool do_mask = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+  // Only active elements (vm == 1, or v0 bit set) are examined and written.
+  if (insn.v_vm() == 1 || (insn.v_vm() == 0 && do_mask)) {
+    auto &vd = P.VU.elt<uint64_t>(rd_num, midx, true);
+    uint64_t res = 0;
+    if (!has_one && !vs2_lsb) {
+      res = 1;
+    } else if (!has_one && vs2_lsb) {
+      has_one = true;
+    }
+    vd = (vd & ~mmask) | ((res << mpos) & mmask);
+  }
+}
diff --git a/riscv/insns/vmseq_vi.h b/riscv/insns/vmseq_vi.h
new file mode 100644
index 0000000000..cfc16825f2
--- /dev/null
+++ b/riscv/insns/vmseq_vi.h
@@ -0,0 +1,5 @@
+// vmseq.vi vd, vs2, simm5
+VI_VI_LOOP_CMP
+({
+  res = simm5 == vs2;
+})
diff --git a/riscv/insns/vmseq_vv.h b/riscv/insns/vmseq_vv.h
new file mode 100644
index 0000000000..91fd204a50
--- /dev/null
+++ b/riscv/insns/vmseq_vv.h
@@ -0,0 +1,6 @@
+// vmseq.vv vd, vs2, vs1
+VI_VV_LOOP_CMP
+({
+  res = vs2 == vs1;
+})
+
diff --git a/riscv/insns/vmseq_vx.h b/riscv/insns/vmseq_vx.h
new file mode 100644
index 0000000000..ab63323134
--- /dev/null
+++ b/riscv/insns/vmseq_vx.h
@@ -0,0 +1,5 @@
+// vmseq.vx vd, vs2, rs1
+VI_VX_LOOP_CMP
+({
+  res = rs1 == vs2;
+})
diff --git a/riscv/insns/vmsgt_vi.h b/riscv/insns/vmsgt_vi.h
new file mode 100644
index 0000000000..4f7dea8e4b
--- /dev/null
+++ b/riscv/insns/vmsgt_vi.h
@@ -0,0 +1,5 @@
+// vmsgt.vi  vd, vs2, simm5
+VI_VI_LOOP_CMP
+({
+  res = vs2 > simm5;
+})
diff --git a/riscv/insns/vmsgt_vx.h b/riscv/insns/vmsgt_vx.h
new file mode 100644
index 0000000000..5f24db6964
--- /dev/null
+++ b/riscv/insns/vmsgt_vx.h
@@ -0,0 +1,5 @@
+// vmsgt.vx  vd, vs2, rs1
+VI_VX_LOOP_CMP
+({
+  res = vs2 > rs1;
+})
diff --git a/riscv/insns/vmsgtu_vi.h b/riscv/insns/vmsgtu_vi.h
new file mode 100644
index 0000000000..be28fee1e7
--- /dev/null
+++ b/riscv/insns/vmsgtu_vi.h
@@ -0,0 +1,5 @@
+// vmsgtu.vi  vd, vs2, simm5
+VI_VI_ULOOP_CMP
+({
+  res = vs2 > (insn.v_simm5() & (UINT64_MAX >> (64 - P.VU.vsew)));
+})
diff --git a/riscv/insns/vmsgtu_vx.h b/riscv/insns/vmsgtu_vx.h
new file mode 100644
index 0000000000..7f39800804
--- /dev/null
+++ b/riscv/insns/vmsgtu_vx.h
@@ -0,0 +1,5 @@
+// vmsgtu.vx  vd, vs2, rs1
+VI_VX_ULOOP_CMP
+({
+  res = vs2 > rs1;
+})
diff --git a/riscv/insns/vmsif_m.h b/riscv/insns/vmsif_m.h
new file mode 100644
index 0000000000..a16ef681c4
--- /dev/null
+++ b/riscv/insns/vmsif_m.h
@@ -0,0 +1,32 @@
+// vmsif.m vd, vs2, vm: set vd mask bits up to and including the first set bit of vs2
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require_vector(true);
+require(P.VU.vstart == 0);
+require_vm;
+require(insn.rd() != insn.rs2());
+
+reg_t vl = P.VU.vl;
+reg_t rd_num = insn.rd();
+reg_t rs2_num = insn.rs2();
+
+bool has_one = false;  // latches at the first active element whose vs2 bit is set
+for (reg_t i = P.VU.vstart; i < vl; ++i) {
+  const int midx = i / 64;
+  const int mpos = i % 64;
+  const uint64_t mmask = UINT64_C(1) << mpos;
+
+  bool vs2_lsb = ((P.VU.elt<uint64_t>(rs2_num, midx) >> mpos) & 0x1) == 1;
+  bool do_mask = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+  // Only active elements (vm == 1, or v0 bit set) are examined and written.
+  if (insn.v_vm() == 1 || (insn.v_vm() == 0 && do_mask)) {
+    auto &vd = P.VU.elt<uint64_t>(rd_num, midx, true);
+    uint64_t res = 0;
+    if (!has_one && !vs2_lsb) {
+      res = 1;
+    } else if (!has_one && vs2_lsb) {
+      has_one = true;
+      res = 1;
+    }
+    vd = (vd & ~mmask) | ((res << mpos) & mmask);
+  }
+}
diff --git a/riscv/insns/vmsle_vi.h b/riscv/insns/vmsle_vi.h
new file mode 100644
index 0000000000..f0f67d0213
--- /dev/null
+++ b/riscv/insns/vmsle_vi.h
@@ -0,0 +1,5 @@
+// vmsle.vi vd, vs2, simm5
+VI_VI_LOOP_CMP
+({
+  res = vs2 <= simm5;
+})
diff --git a/riscv/insns/vmsle_vv.h b/riscv/insns/vmsle_vv.h
new file mode 100644
index 0000000000..30aba06d41
--- /dev/null
+++ b/riscv/insns/vmsle_vv.h
@@ -0,0 +1,5 @@
+// vmsle.vv vd, vs2, vs1
+VI_VV_LOOP_CMP
+({
+  res = vs2 <= vs1;
+})
diff --git a/riscv/insns/vmsle_vx.h b/riscv/insns/vmsle_vx.h
new file mode 100644
index 0000000000..c26d59692e
--- /dev/null
+++ b/riscv/insns/vmsle_vx.h
@@ -0,0 +1,5 @@
+// vmsle.vx vd, vs2, rs1
+VI_VX_LOOP_CMP
+({
+  res = vs2 <= rs1;
+})
diff --git a/riscv/insns/vmsleu_vi.h b/riscv/insns/vmsleu_vi.h
new file mode 100644
index 0000000000..0e66b781a0
--- /dev/null
+++ b/riscv/insns/vmsleu_vi.h
@@ -0,0 +1,5 @@
+// vmsleu.vi vd, vs2, simm5
+VI_VI_ULOOP_CMP
+({
+  res = vs2 <= (insn.v_simm5() & (UINT64_MAX >> (64 - P.VU.vsew)));
+})
diff --git a/riscv/insns/vmsleu_vv.h b/riscv/insns/vmsleu_vv.h
new file mode 100644
index 0000000000..0e460326f8
--- /dev/null
+++ b/riscv/insns/vmsleu_vv.h
@@ -0,0 +1,5 @@
+// vmsleu.vv vd, vs2, vs1
+VI_VV_ULOOP_CMP
+({
+  res = vs2 <= vs1;
+})
diff --git a/riscv/insns/vmsleu_vx.h b/riscv/insns/vmsleu_vx.h
new file mode 100644
index 0000000000..935b17681c
--- /dev/null
+++ b/riscv/insns/vmsleu_vx.h
@@ -0,0 +1,5 @@
+// vmsleu.vx vd, vs2, rs1
+VI_VX_ULOOP_CMP
+({
+  res = vs2 <= rs1;
+})
diff --git a/riscv/insns/vmslt_vv.h b/riscv/insns/vmslt_vv.h
new file mode 100644
index 0000000000..71e6f87f1f
--- /dev/null
+++ b/riscv/insns/vmslt_vv.h
@@ -0,0 +1,5 @@
+// vmslt.vv vd, vs2, vs1
+VI_VV_LOOP_CMP
+({
+  res = vs2 < vs1;
+})
diff --git a/riscv/insns/vmslt_vx.h b/riscv/insns/vmslt_vx.h
new file mode 100644
index 0000000000..b32bb14537
--- /dev/null
+++ b/riscv/insns/vmslt_vx.h
@@ -0,0 +1,5 @@
+// vmslt.vx vd, vs2, rs1
+VI_VX_LOOP_CMP
+({
+  res = vs2 < rs1;
+})
diff --git a/riscv/insns/vmsltu_vv.h b/riscv/insns/vmsltu_vv.h
new file mode 100644
index 0000000000..53a570ae9d
--- /dev/null
+++ b/riscv/insns/vmsltu_vv.h
@@ -0,0 +1,5 @@
+// vmsltu.vv vd, vs2, vs1
+VI_VV_ULOOP_CMP
+({
+  res = vs2 < vs1;
+})
diff --git a/riscv/insns/vmsltu_vx.h b/riscv/insns/vmsltu_vx.h
new file mode 100644
index 0000000000..8082544876
--- /dev/null
+++ b/riscv/insns/vmsltu_vx.h
@@ -0,0 +1,5 @@
+// vmsltu.vx vd, vs2, rs1
+VI_VX_ULOOP_CMP
+({
+  res = vs2 < rs1;
+})
diff --git a/riscv/insns/vmsne_vi.h b/riscv/insns/vmsne_vi.h
new file mode 100644
index 0000000000..5e9758ef94
--- /dev/null
+++ b/riscv/insns/vmsne_vi.h
@@ -0,0 +1,5 @@
+// vmsne.vi vd, vs2, simm5
+VI_VI_LOOP_CMP
+({
+  res = vs2 != simm5;
+})
diff --git a/riscv/insns/vmsne_vv.h b/riscv/insns/vmsne_vv.h
new file mode 100644
index 0000000000..e6a7174a48
--- /dev/null
+++ b/riscv/insns/vmsne_vv.h
@@ -0,0 +1,5 @@
+// vmsne.vv vd, vs2, vs1
+VI_VV_LOOP_CMP
+({
+  res = vs2 != vs1;
+})
diff --git a/riscv/insns/vmsne_vx.h b/riscv/insns/vmsne_vx.h
new file mode 100644
index 0000000000..9e4c155387
--- /dev/null
+++ b/riscv/insns/vmsne_vx.h
@@ -0,0 +1,5 @@
+// vmsne.vx vd, vs2, rs1
+VI_VX_LOOP_CMP
+({
+  res = vs2 != rs1;
+})
diff --git a/riscv/insns/vmsof_m.h b/riscv/insns/vmsof_m.h
new file mode 100644
index 0000000000..5ef0bfd4ac
--- /dev/null
+++ b/riscv/insns/vmsof_m.h
@@ -0,0 +1,30 @@
+// vmsof.m vd, vs2, vm
+// Set-only-first: among the active elements, only the mask bit of vd at the
+// first set bit of vs2 is set; every other active bit is cleared.
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require_vector(true);
+require(P.VU.vstart == 0);
+require_vm;
+require(insn.rd() != insn.rs2());
+
+const reg_t vl = P.VU.vl;
+const reg_t rd_num = insn.rd();
+const reg_t rs2_num = insn.rs2();
+
+bool seen_first = false;
+for (reg_t i = P.VU.vstart; i < vl; ++i) {
+  const int word = i / 64;
+  const int bit = i % 64;
+  const uint64_t bit_mask = UINT64_C(1) << bit;
+
+  const bool src_bit = (P.VU.elt<uint64_t>(rs2_num, word) & bit_mask) != 0;
+  const bool v0_bit = (P.VU.elt<uint64_t>(0, word) & bit_mask) != 0;
+  const bool active = insn.v_vm() == 1 || (insn.v_vm() == 0 && v0_bit);
+  if (!active)
+    continue;
+
+  uint64_t &vd = P.VU.elt<uint64_t>(rd_num, word, true);
+  // Only the first set source bit yields a 1; any other active bit gets 0.
+  vd = (!seen_first && src_bit) ? (vd | bit_mask) : (vd & ~bit_mask);
+  seen_first = seen_first || src_bit;
+}
diff --git a/riscv/insns/vmul_vv.h b/riscv/insns/vmul_vv.h
new file mode 100644
index 0000000000..a3278171dd
--- /dev/null
+++ b/riscv/insns/vmul_vv.h
@@ -0,0 +1,5 @@
+// vmul vd, vs2, vs1
+VI_VV_LOOP
+({
+  vd = vs2 * vs1;
+})
diff --git a/riscv/insns/vmul_vx.h b/riscv/insns/vmul_vx.h
new file mode 100644
index 0000000000..8d68390276
--- /dev/null
+++ b/riscv/insns/vmul_vx.h
@@ -0,0 +1,5 @@
+// vmul vd, vs2, rs1
+VI_VX_LOOP
+({
+  vd = vs2 * rs1;
+})
diff --git a/riscv/insns/vmulh_vv.h b/riscv/insns/vmulh_vv.h
new file mode 100644
index 0000000000..e861a3397a
--- /dev/null
+++ b/riscv/insns/vmulh_vv.h
@@ -0,0 +1,5 @@
+// vmulh vd, vs2, vs1
+VI_VV_LOOP
+({
+  vd = ((int128_t)vs2 * vs1) >> sew;
+})
diff --git a/riscv/insns/vmulh_vx.h b/riscv/insns/vmulh_vx.h
new file mode 100644
index 0000000000..b6b5503674
--- /dev/null
+++ b/riscv/insns/vmulh_vx.h
@@ -0,0 +1,5 @@
+// vmulh vd, vs2, rs1
+VI_VX_LOOP
+({
+  vd = ((int128_t)vs2 * rs1) >> sew;
+})
diff --git a/riscv/insns/vmulhsu_vv.h b/riscv/insns/vmulhsu_vv.h
new file mode 100644
index 0000000000..f77a7d3f21
--- /dev/null
+++ b/riscv/insns/vmulhsu_vv.h
@@ -0,0 +1,38 @@
+// vmulhsu.vv vd, vs2, vs1
+VI_CHECK_SSS(true);
+VI_LOOP_BASE
+switch(sew) {
+case e8: {
+  auto &vd = P.VU.elt<int8_t>(rd_num, i, true);
+  auto vs2 = P.VU.elt<int8_t>(rs2_num, i);
+  auto vs1 = P.VU.elt<uint8_t>(rs1_num, i);
+
+  vd = ((int16_t)vs2 * (uint16_t)vs1) >> sew;
+  break;
+}
+case e16: {
+  auto &vd = P.VU.elt<int16_t>(rd_num, i, true);
+  auto vs2 = P.VU.elt<int16_t>(rs2_num, i);
+  auto vs1 = P.VU.elt<uint16_t>(rs1_num, i);
+
+  vd = ((int32_t)vs2 * (uint32_t)vs1) >> sew;
+  break;
+}
+case e32: {
+  auto &vd = P.VU.elt<int32_t>(rd_num, i, true);
+  auto vs2 = P.VU.elt<int32_t>(rs2_num, i);
+  auto vs1 = P.VU.elt<uint32_t>(rs1_num, i);
+
+  vd = ((int64_t)vs2 * (uint64_t)vs1) >> sew;
+  break;
+}
+default: {
+  auto &vd = P.VU.elt<int64_t>(rd_num, i, true);
+  auto vs2 = P.VU.elt<int64_t>(rs2_num, i);
+  auto vs1 = P.VU.elt<uint64_t>(rs1_num, i);
+
+  vd = ((int128_t)vs2 * (uint128_t)vs1) >> sew;
+  break;
+}
+}
+VI_LOOP_END
diff --git a/riscv/insns/vmulhsu_vx.h b/riscv/insns/vmulhsu_vx.h
new file mode 100644
index 0000000000..b0699f6f93
--- /dev/null
+++ b/riscv/insns/vmulhsu_vx.h
@@ -0,0 +1,38 @@
+// vmulhsu.vx vd, vs2, rs1
+VI_CHECK_SSS(false);
+VI_LOOP_BASE
+switch(sew) {
+case e8: {
+  auto &vd = P.VU.elt<int8_t>(rd_num, i, true);
+  auto vs2 = P.VU.elt<int8_t>(rs2_num, i);
+  uint8_t rs1 = RS1;
+
+  vd = ((int16_t)vs2 * (uint16_t)rs1) >> sew;
+  break;
+}
+case e16: {
+  auto &vd = P.VU.elt<int16_t>(rd_num, i, true);
+  auto vs2 = P.VU.elt<int16_t>(rs2_num, i);
+  uint16_t rs1 = RS1;
+
+  vd = ((int32_t)vs2 * (uint32_t)rs1) >> sew;
+  break;
+}
+case e32: {
+  auto &vd = P.VU.elt<int32_t>(rd_num, i, true);
+  auto vs2 = P.VU.elt<int32_t>(rs2_num, i);
+  uint32_t rs1 = RS1;
+
+  vd = ((int64_t)vs2 * (uint64_t)rs1) >> sew;
+  break;
+}
+default: {
+  auto &vd = P.VU.elt<int64_t>(rd_num, i, true);
+  auto vs2 = P.VU.elt<int64_t>(rs2_num, i);
+  uint64_t rs1 = RS1;
+
+  vd = ((int128_t)vs2 * (uint128_t)rs1) >> sew;
+  break;
+}
+}
+VI_LOOP_END
diff --git a/riscv/insns/vmulhu_vv.h b/riscv/insns/vmulhu_vv.h
new file mode 100644
index 0000000000..8e318edb75
--- /dev/null
+++ b/riscv/insns/vmulhu_vv.h
@@ -0,0 +1,5 @@
+// vmulhu vd ,vs2, vs1
+VI_VV_ULOOP
+({
+  vd = ((uint128_t)vs2 * vs1) >> sew;
+})
diff --git a/riscv/insns/vmulhu_vx.h b/riscv/insns/vmulhu_vx.h
new file mode 100644
index 0000000000..672ad32df2
--- /dev/null
+++ b/riscv/insns/vmulhu_vx.h
@@ -0,0 +1,5 @@
+// vmulhu vd ,vs2, rs1
+VI_VX_ULOOP
+({
+  vd = ((uint128_t)vs2 * rs1) >> sew;
+})
diff --git a/riscv/insns/vmv1r_v.h b/riscv/insns/vmv1r_v.h
new file mode 100644
index 0000000000..bbdeab9a1d
--- /dev/null
+++ b/riscv/insns/vmv1r_v.h
@@ -0,0 +1,2 @@
+// vmv1r.v vd, vs2
+#include "vmvnfr_v.h"
diff --git a/riscv/insns/vmv2r_v.h b/riscv/insns/vmv2r_v.h
new file mode 100644
index 0000000000..1ac8e09eb0
--- /dev/null
+++ b/riscv/insns/vmv2r_v.h
@@ -0,0 +1,2 @@
+// vmv2r.v vd, vs2
+#include "vmvnfr_v.h"
diff --git a/riscv/insns/vmv4r_v.h b/riscv/insns/vmv4r_v.h
new file mode 100644
index 0000000000..2068731a9e
--- /dev/null
+++ b/riscv/insns/vmv4r_v.h
@@ -0,0 +1,2 @@
+// vmv4r.v vd, vs2
+#include "vmvnfr_v.h"
diff --git a/riscv/insns/vmv8r_v.h b/riscv/insns/vmv8r_v.h
new file mode 100644
index 0000000000..2b205fc79e
--- /dev/null
+++ b/riscv/insns/vmv8r_v.h
@@ -0,0 +1,2 @@
+// vmv8r.v vd, vs2
+#include "vmvnfr_v.h"
diff --git a/riscv/insns/vmv_s_x.h b/riscv/insns/vmv_s_x.h
new file mode 100644
index 0000000000..0e6a13e56a
--- /dev/null
+++ b/riscv/insns/vmv_s_x.h
@@ -0,0 +1,29 @@
+// vmv.s.x vd, rs1: vd[0] = x[rs1]
+require_vector(true);
+require(insn.v_vm() == 1);
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+const reg_t vl = P.VU.vl;
+
+// For unsigned values vstart < vl already implies vl > 0, so a single
+// comparison decides whether element 0 is written.
+if (P.VU.vstart < vl) {
+  const reg_t dest = insn.rd();
+  const reg_t sew = P.VU.vsew;
+
+  switch (sew) {
+  case e8:
+    P.VU.elt<uint8_t>(dest, 0, true) = RS1;
+    break;
+  case e16:
+    P.VU.elt<uint16_t>(dest, 0, true) = RS1;
+    break;
+  case e32:
+    P.VU.elt<uint32_t>(dest, 0, true) = RS1;
+    break;
+  default:
+    P.VU.elt<uint64_t>(dest, 0, true) = RS1;
+    break;
+  }
+}
+
+P.VU.vstart = 0;
diff --git a/riscv/insns/vmv_v_i.h b/riscv/insns/vmv_v_i.h
new file mode 100644
index 0000000000..a760779107
--- /dev/null
+++ b/riscv/insns/vmv_v_i.h
@@ -0,0 +1,7 @@
+// vmv.v.i vd, simm5
+require_vector(true);
+VI_CHECK_SSS(false);
+VI_VVXI_MERGE_LOOP
+({
+  vd = simm5;
+})
diff --git a/riscv/insns/vmv_v_v.h b/riscv/insns/vmv_v_v.h
new file mode 100644
index 0000000000..d7f47d0882
--- /dev/null
+++ b/riscv/insns/vmv_v_v.h
@@ -0,0 +1,7 @@
+// vmv.v.v vd, vs1
+require_vector(true);
+VI_CHECK_SSS(true);
+VI_VVXI_MERGE_LOOP
+({
+  vd = vs1;
+})
diff --git a/riscv/insns/vmv_v_x.h b/riscv/insns/vmv_v_x.h
new file mode 100644
index 0000000000..fa7c920be1
--- /dev/null
+++ b/riscv/insns/vmv_v_x.h
@@ -0,0 +1,7 @@
+// vmv.v.x vd, rs1
+require_vector(true);
+VI_CHECK_SSS(false);
+VI_VVXI_MERGE_LOOP
+({
+  vd = rs1;
+})
diff --git a/riscv/insns/vmv_x_s.h b/riscv/insns/vmv_x_s.h
new file mode 100644
index 0000000000..2c03e43e8c
--- /dev/null
+++ b/riscv/insns/vmv_x_s.h
@@ -0,0 +1,31 @@
+// vmv_x_s: rd = vs2[rs1]
+require_vector(true);
+require(insn.v_vm() == 1);
+uint64_t xmask = UINT64_MAX >> (64 - P.get_max_xlen());
+reg_t rs1 = RS1;
+reg_t sew = P.VU.vsew;
+reg_t rs2_num = insn.rs2();
+
+if (!(rs1 >= 0 && rs1 < (P.VU.get_vlen() / sew))) {
+  WRITE_RD(0);
+} else {
+  switch(sew) {
+  case e8:
+    WRITE_RD(P.VU.elt<int8_t>(rs2_num, rs1));
+    break;
+  case e16:
+    WRITE_RD(P.VU.elt<int16_t>(rs2_num, rs1));
+    break;
+  case e32:
+    WRITE_RD(P.VU.elt<int32_t>(rs2_num, rs1));
+    break;
+  case e64:
+    if (P.get_max_xlen() <= sew)
+      WRITE_RD(P.VU.elt<uint64_t>(rs2_num, rs1) & xmask);
+    else
+      WRITE_RD(P.VU.elt<uint64_t>(rs2_num, rs1));
+    break;
+  }
+}
+
+P.VU.vstart = 0;
diff --git a/riscv/insns/vmvnfr_v.h b/riscv/insns/vmvnfr_v.h
new file mode 100644
index 0000000000..96f0074ce1
--- /dev/null
+++ b/riscv/insns/vmvnfr_v.h
@@ -0,0 +1,27 @@
+// vmv<nr>r.v vd, vs2: whole-register move, included by vmv1r/2r/4r/8r_v.h
+require_vector_novtype(true, true);
+// insn.rs1() is (register count - 1) here, not a GPR index: no scalar read.
+const reg_t vd = insn.rd();
+const reg_t vs2 = insn.rs2();
+const reg_t len = insn.rs1() + 1;
+require_align(vd, len);
+require_align(vs2, len);
+const reg_t size = len * P.VU.vlenb;
+
+// Copy one register at a time so the commit log stays correct.
+if (vd != vs2 && P.VU.vstart < size) {
+  reg_t i = P.VU.vstart / P.VU.vlenb;    // first register still to be copied
+  reg_t off = P.VU.vstart % P.VU.vlenb;  // byte offset inside that register
+  if (off) {  // finish the partially copied register first
+    memcpy(&P.VU.elt<uint8_t>(vd + i, off, true),
+           &P.VU.elt<uint8_t>(vs2 + i, off), P.VU.vlenb - off);
+    i++;
+  }
+
+  for (; i < len; ++i) {
+    memcpy(&P.VU.elt<uint8_t>(vd + i, 0, true),
+           &P.VU.elt<uint8_t>(vs2 + i, 0), P.VU.vlenb);
+  }
+}
+
+P.VU.vstart = 0;
diff --git a/riscv/insns/vmxnor_mm.h b/riscv/insns/vmxnor_mm.h
new file mode 100644
index 0000000000..0736d5b21e
--- /dev/null
+++ b/riscv/insns/vmxnor_mm.h
@@ -0,0 +1,2 @@
+// vmxnor.mm vd, vs2, vs1
+VI_LOOP_MASK(~(vs2 ^ vs1));
diff --git a/riscv/insns/vmxor_mm.h b/riscv/insns/vmxor_mm.h
new file mode 100644
index 0000000000..7f0c576e37
--- /dev/null
+++ b/riscv/insns/vmxor_mm.h
@@ -0,0 +1,2 @@
+// vmxor.mm vd, vs2, vs1
+VI_LOOP_MASK(vs2 ^ vs1);
diff --git a/riscv/insns/vnclip_wi.h b/riscv/insns/vnclip_wi.h
new file mode 100644
index 0000000000..1647212392
--- /dev/null
+++ b/riscv/insns/vnclip_wi.h
@@ -0,0 +1,25 @@
+// vnclip.wi: vd[i] = clip(round(vs2[i] + rnd) >> zimm5)
+VRM xrm = P.VU.get_vround_mode();
+int64_t int_max = INT64_MAX >> (64 - P.VU.vsew);
+int64_t int_min = INT64_MIN >> (64 - P.VU.vsew);
+VI_VVXI_LOOP_NARROW
+({
+  int128_t result = vs2;
+  unsigned shift = zimm5 & ((sew * 2) - 1);
+
+  // rounding
+  INT_ROUNDING(result, xrm, shift);
+
+  result = result >> shift;
+
+  // saturation
+  if (result < int_min) {
+    result = int_min;
+    P.VU.vxsat = 1;
+  } else if (result > int_max) {
+    result = int_max;
+    P.VU.vxsat = 1;
+  }
+
+  vd = result;
+}, false)
diff --git a/riscv/insns/vnclip_wv.h b/riscv/insns/vnclip_wv.h
new file mode 100644
index 0000000000..d87a3378e7
--- /dev/null
+++ b/riscv/insns/vnclip_wv.h
@@ -0,0 +1,25 @@
+// vnclip: vd[i] = clip(round(vs2[i] + rnd) >> vs1[i])
+VRM xrm = P.VU.get_vround_mode();
+int64_t int_max = INT64_MAX >> (64 - P.VU.vsew);
+int64_t int_min = INT64_MIN >> (64 - P.VU.vsew);
+VI_VVXI_LOOP_NARROW
+({
+  int128_t result = vs2;
+  unsigned shift = vs1 & ((sew * 2) - 1);
+
+  // rounding
+  INT_ROUNDING(result, xrm, shift);
+
+  result = result >> shift;
+
+  // saturation
+  if (result < int_min) {
+    result = int_min;
+    P.VU.vxsat = 1;
+  } else if (result > int_max) {
+    result = int_max;
+    P.VU.vxsat = 1;
+  }
+
+  vd = result;
+}, true)
diff --git a/riscv/insns/vnclip_wx.h b/riscv/insns/vnclip_wx.h
new file mode 100644
index 0000000000..9dbfcd7084
--- /dev/null
+++ b/riscv/insns/vnclip_wx.h
@@ -0,0 +1,25 @@
+// vnclip.wx: vd[i] = clip(round(vs2[i] + rnd) >> x[rs1])
+VRM xrm = P.VU.get_vround_mode();
+int64_t int_max = INT64_MAX >> (64 - P.VU.vsew);
+int64_t int_min = INT64_MIN >> (64 - P.VU.vsew);
+VI_VVXI_LOOP_NARROW
+({
+  int128_t result = vs2;
+  unsigned shift = rs1 & ((sew * 2) - 1);
+
+  // rounding
+  INT_ROUNDING(result, xrm, shift);
+
+  result = result >> shift;
+
+  // saturation
+  if (result < int_min) {
+    result = int_min;
+    P.VU.vxsat = 1;
+  } else if (result > int_max) {
+    result = int_max;
+    P.VU.vxsat = 1;
+  }
+
+  vd = result;
+}, false)
diff --git a/riscv/insns/vnclipu_wi.h b/riscv/insns/vnclipu_wi.h
new file mode 100644
index 0000000000..8e4e0dad27
--- /dev/null
+++ b/riscv/insns/vnclipu_wi.h
@@ -0,0 +1,23 @@
+// vnclipu.wi: vd[i] = clip(round(vs2[i] + rnd) >> zimm5)
+VRM xrm = P.VU.get_vround_mode();
+uint64_t uint_max = UINT64_MAX >> (64 - P.VU.vsew);
+uint64_t sign_mask = UINT64_MAX << P.VU.vsew;
+VI_VVXI_LOOP_NARROW
+({
+  uint128_t result = vs2_u;
+  unsigned shift = zimm5 & ((sew * 2) - 1);
+
+  // rounding
+  INT_ROUNDING(result, xrm, shift);
+
+  // logical (unsigned) right shift by the masked zimm5 amount
+  result = result >> shift;
+
+  // saturation
+  if (result & sign_mask) {
+    result = uint_max;
+    P.VU.vxsat = 1;
+  }
+
+  vd = result;
+}, false)
diff --git a/riscv/insns/vnclipu_wv.h b/riscv/insns/vnclipu_wv.h
new file mode 100644
index 0000000000..f045964f71
--- /dev/null
+++ b/riscv/insns/vnclipu_wv.h
@@ -0,0 +1,22 @@
+// vnclipu: vd[i] = clip(round(vs2[i] + rnd) >> vs1[i])
+VRM xrm = P.VU.get_vround_mode();
+uint64_t uint_max = UINT64_MAX >> (64 - P.VU.vsew);
+uint64_t sign_mask = UINT64_MAX << P.VU.vsew;
+VI_VVXI_LOOP_NARROW
+({
+  uint128_t result = vs2_u;
+  unsigned shift = vs1 & ((sew * 2) - 1);
+
+  // rounding
+  INT_ROUNDING(result, xrm, shift);
+
+  result = result >> shift;
+
+  // saturation
+  if (result & sign_mask) {
+    result = uint_max;
+    P.VU.vxsat = 1;
+  }
+
+  vd = result;
+}, true)
diff --git a/riscv/insns/vnclipu_wx.h b/riscv/insns/vnclipu_wx.h
new file mode 100644
index 0000000000..d5155c11b3
--- /dev/null
+++ b/riscv/insns/vnclipu_wx.h
@@ -0,0 +1,22 @@
+// vnclipu.wx: vd[i] = clip(round(vs2[i] + rnd) >> x[rs1])
+VRM xrm = P.VU.get_vround_mode();
+uint64_t uint_max = UINT64_MAX >> (64 - P.VU.vsew);
+uint64_t sign_mask = UINT64_MAX << P.VU.vsew;
+VI_VVXI_LOOP_NARROW
+({
+  uint128_t result = vs2_u;
+  unsigned shift = rs1 & ((sew * 2) - 1);
+
+  // rounding
+  INT_ROUNDING(result, xrm, shift);
+
+  result = result >> shift;
+
+  // saturation
+  if (result & sign_mask) {
+    result = uint_max;
+    P.VU.vxsat = 1;
+  }
+
+  vd = result;
+}, false)
diff --git a/riscv/insns/vnmsac_vv.h b/riscv/insns/vnmsac_vv.h
new file mode 100644
index 0000000000..7c10f29af7
--- /dev/null
+++ b/riscv/insns/vnmsac_vv.h
@@ -0,0 +1,5 @@
+// vnmsac.vv: vd[i] = -(vs1[i] * vs2[i]) + vd[i]
+VI_VV_LOOP
+({
+  vd = -(vs1 * vs2) + vd;
+})
diff --git a/riscv/insns/vnmsac_vx.h b/riscv/insns/vnmsac_vx.h
new file mode 100644
index 0000000000..44920be4b2
--- /dev/null
+++ b/riscv/insns/vnmsac_vx.h
@@ -0,0 +1,5 @@
+// vnmsac.vx: vd[i] = -(x[rs1] * vs2[i]) + vd[i]
+VI_VX_LOOP
+({
+  vd = -(rs1 * vs2) + vd;
+})
diff --git a/riscv/insns/vnmsub_vv.h b/riscv/insns/vnmsub_vv.h
new file mode 100644
index 0000000000..37f82286c4
--- /dev/null
+++ b/riscv/insns/vnmsub_vv.h
@@ -0,0 +1,5 @@
+// vnmsub.vv: vd[i] = -(vd[i] * vs1[i]) + vs2[i]
+VI_VV_LOOP
+({
+  vd = -(vd * vs1) + vs2;
+})
diff --git a/riscv/insns/vnmsub_vx.h b/riscv/insns/vnmsub_vx.h
new file mode 100644
index 0000000000..2e00d22e4a
--- /dev/null
+++ b/riscv/insns/vnmsub_vx.h
@@ -0,0 +1,5 @@
+// vnmsub.vx: vd[i] = -(vd[i] * x[rs1]) + vs2[i]
+VI_VX_LOOP
+({
+  vd = -(vd * rs1) + vs2;
+})
diff --git a/riscv/insns/vnsra_wi.h b/riscv/insns/vnsra_wi.h
new file mode 100644
index 0000000000..f41979edff
--- /dev/null
+++ b/riscv/insns/vnsra_wi.h
@@ -0,0 +1,5 @@
+// vnsra.wi vd, vs2, zimm5
+VI_VI_LOOP_NSHIFT
+({
+  vd = vs2 >> (zimm5 & (sew * 2 - 1) & 0x1f);
+}, false)
diff --git a/riscv/insns/vnsra_wv.h b/riscv/insns/vnsra_wv.h
new file mode 100644
index 0000000000..59f255ef30
--- /dev/null
+++ b/riscv/insns/vnsra_wv.h
@@ -0,0 +1,5 @@
+// vnsra.wv vd, vs2, vs1
+VI_VV_LOOP_NSHIFT
+({
+  vd = vs2 >> (vs1 & (sew * 2 - 1));
+}, true)
diff --git a/riscv/insns/vnsra_wx.h b/riscv/insns/vnsra_wx.h
new file mode 100644
index 0000000000..adaa24c384
--- /dev/null
+++ b/riscv/insns/vnsra_wx.h
@@ -0,0 +1,5 @@
+// vnsra.wx vd, vs2, rs1
+VI_VX_LOOP_NSHIFT
+({
+  vd = vs2 >> (rs1 & (sew * 2 - 1));
+}, false)
diff --git a/riscv/insns/vnsrl_wi.h b/riscv/insns/vnsrl_wi.h
new file mode 100644
index 0000000000..91402c0c2a
--- /dev/null
+++ b/riscv/insns/vnsrl_wi.h
@@ -0,0 +1,5 @@
+// vnsrl.wi vd, vs2, zimm5
+VI_VI_LOOP_NSHIFT
+({
+  vd = vs2_u >> (zimm5 & (sew * 2 - 1));
+}, false)
diff --git a/riscv/insns/vnsrl_wv.h b/riscv/insns/vnsrl_wv.h
new file mode 100644
index 0000000000..609299faf8
--- /dev/null
+++ b/riscv/insns/vnsrl_wv.h
@@ -0,0 +1,5 @@
+// vnsrl.wv vd, vs2, vs1
+VI_VV_LOOP_NSHIFT
+({
+  vd = vs2_u >> (vs1 & (sew * 2 - 1));
+}, true)
diff --git a/riscv/insns/vnsrl_wx.h b/riscv/insns/vnsrl_wx.h
new file mode 100644
index 0000000000..8356a2bd77
--- /dev/null
+++ b/riscv/insns/vnsrl_wx.h
@@ -0,0 +1,5 @@
+// vnsrl.wx vd, vs2, rs1
+VI_VX_LOOP_NSHIFT
+({
+  vd = vs2_u >> (rs1 & (sew * 2 - 1));
+}, false)
diff --git a/riscv/insns/vor_vi.h b/riscv/insns/vor_vi.h
new file mode 100644
index 0000000000..f759607497
--- /dev/null
+++ b/riscv/insns/vor_vi.h
@@ -0,0 +1,5 @@
+// vor
+VI_VI_LOOP
+({
+  vd = simm5 | vs2;
+})
diff --git a/riscv/insns/vor_vv.h b/riscv/insns/vor_vv.h
new file mode 100644
index 0000000000..0c460662bf
--- /dev/null
+++ b/riscv/insns/vor_vv.h
@@ -0,0 +1,5 @@
+// vor
+VI_VV_LOOP
+({
+  vd = vs1 | vs2;
+})
diff --git a/riscv/insns/vor_vx.h b/riscv/insns/vor_vx.h
new file mode 100644
index 0000000000..01c003ab35
--- /dev/null
+++ b/riscv/insns/vor_vx.h
@@ -0,0 +1,5 @@
+// vor
+VI_VX_LOOP
+({
+  vd = rs1 | vs2;
+})
diff --git a/riscv/insns/vpopc_m.h b/riscv/insns/vpopc_m.h
new file mode 100644
index 0000000000..c204b2c0e2
--- /dev/null
+++ b/riscv/insns/vpopc_m.h
@@ -0,0 +1,23 @@
+// vpopc.m rd, vs2, vm
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require_vector(true);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs2_num = insn.rs2();
+require(P.VU.vstart == 0);
+reg_t popcount = 0;
+for (reg_t i=P.VU.vstart; i<vl; ++i) {
+  const int midx = i / 32;
+  const int mpos = i % 32;
+
+  bool vs2_lsb = ((P.VU.elt<uint32_t>(rs2_num, midx ) >> mpos) & 0x1) == 1;
+  if (insn.v_vm() == 1) {
+    popcount += vs2_lsb;
+  } else {
+    bool do_mask = (P.VU.elt<uint32_t>(0, midx) >> mpos) & 0x1;
+    popcount += (vs2_lsb && do_mask);
+  }
+}
+P.VU.vstart = 0;
+WRITE_RD(popcount);
diff --git a/riscv/insns/vredand_vs.h b/riscv/insns/vredand_vs.h
new file mode 100644
index 0000000000..6c2d9089fa
--- /dev/null
+++ b/riscv/insns/vredand_vs.h
@@ -0,0 +1,5 @@
+// vredand.vs vd, vs2 ,vs1
+VI_VV_LOOP_REDUCTION
+({
+  vd_0_res &= vs2;
+})
diff --git a/riscv/insns/vredmax_vs.h b/riscv/insns/vredmax_vs.h
new file mode 100644
index 0000000000..be2e76ab3a
--- /dev/null
+++ b/riscv/insns/vredmax_vs.h
@@ -0,0 +1,5 @@
+// vredmax.vs vd, vs2 ,vs1
+VI_VV_LOOP_REDUCTION
+({
+  vd_0_res = (vd_0_res >= vs2) ? vd_0_res : vs2;
+})
diff --git a/riscv/insns/vredmaxu_vs.h b/riscv/insns/vredmaxu_vs.h
new file mode 100644
index 0000000000..960f486181
--- /dev/null
+++ b/riscv/insns/vredmaxu_vs.h
@@ -0,0 +1,5 @@
+// vredmaxu.vs vd, vs2 ,vs1
+VI_VV_ULOOP_REDUCTION
+({
+  vd_0_res = (vd_0_res >= vs2) ? vd_0_res : vs2;
+})
diff --git a/riscv/insns/vredmin_vs.h b/riscv/insns/vredmin_vs.h
new file mode 100644
index 0000000000..50359b7a53
--- /dev/null
+++ b/riscv/insns/vredmin_vs.h
@@ -0,0 +1,5 @@
+// vredmin.vs vd, vs2 ,vs1
+VI_VV_LOOP_REDUCTION
+({
+  vd_0_res = (vd_0_res <= vs2) ? vd_0_res : vs2;
+})
diff --git a/riscv/insns/vredminu_vs.h b/riscv/insns/vredminu_vs.h
new file mode 100644
index 0000000000..708247592f
--- /dev/null
+++ b/riscv/insns/vredminu_vs.h
@@ -0,0 +1,5 @@
+// vredminu.vs vd, vs2 ,vs1
+VI_VV_ULOOP_REDUCTION
+({
+  vd_0_res = (vd_0_res <= vs2) ? vd_0_res : vs2;
+})
diff --git a/riscv/insns/vredor_vs.h b/riscv/insns/vredor_vs.h
new file mode 100644
index 0000000000..f7acd9aa9d
--- /dev/null
+++ b/riscv/insns/vredor_vs.h
@@ -0,0 +1,5 @@
+// vredor.vs vd, vs2 ,vs1
+VI_VV_LOOP_REDUCTION
+({
+  vd_0_res |= vs2;
+})
diff --git a/riscv/insns/vredsum_vs.h b/riscv/insns/vredsum_vs.h
new file mode 100644
index 0000000000..c4fefe57f6
--- /dev/null
+++ b/riscv/insns/vredsum_vs.h
@@ -0,0 +1,5 @@
+// vredsum.vs vd, vs2 ,vs1
+VI_VV_LOOP_REDUCTION
+({
+  vd_0_res += vs2;
+})
diff --git a/riscv/insns/vredxor_vs.h b/riscv/insns/vredxor_vs.h
new file mode 100644
index 0000000000..bb81ad9a4f
--- /dev/null
+++ b/riscv/insns/vredxor_vs.h
@@ -0,0 +1,5 @@
+// vredxor.vs vd, vs2 ,vs1
+VI_VV_LOOP_REDUCTION
+({
+  vd_0_res ^= vs2;
+})
diff --git a/riscv/insns/vrem_vv.h b/riscv/insns/vrem_vv.h
new file mode 100644
index 0000000000..260716a0eb
--- /dev/null
+++ b/riscv/insns/vrem_vv.h
@@ -0,0 +1,11 @@
+// vrem.vv vd, vs2, vs1
+VI_VV_LOOP
+({
+  if (vs1 == 0)
+    vd = vs2;
+  else if(vs2 == -(((intmax_t)1) << (sew - 1)) && vs1 == -1)
+    vd = 0;
+  else {
+    vd = vs2 % vs1;
+  }
+})
diff --git a/riscv/insns/vrem_vx.h b/riscv/insns/vrem_vx.h
new file mode 100644
index 0000000000..3702f02f41
--- /dev/null
+++ b/riscv/insns/vrem_vx.h
@@ -0,0 +1,10 @@
+// vrem.vx vd, vs2, rs1
+VI_VX_LOOP
+({
+  if (rs1 == 0)
+    vd = vs2;
+  else if (vs2 == -(((intmax_t)1) << (sew - 1)) && rs1 == -1)
+    vd = 0;
+  else
+    vd = vs2 % rs1;
+})
diff --git a/riscv/insns/vremu_vv.h b/riscv/insns/vremu_vv.h
new file mode 100644
index 0000000000..7e1507235a
--- /dev/null
+++ b/riscv/insns/vremu_vv.h
@@ -0,0 +1,8 @@
+// vremu.vv vd, vs2, vs1
+VI_VV_ULOOP
+({
+  if (vs1 == 0)
+    vd = vs2;
+  else
+    vd = vs2 % vs1;
+})
diff --git a/riscv/insns/vremu_vx.h b/riscv/insns/vremu_vx.h
new file mode 100644
index 0000000000..a87a8200a8
--- /dev/null
+++ b/riscv/insns/vremu_vx.h
@@ -0,0 +1,8 @@
+// vremu.vx vd, vs2, rs1
+VI_VX_ULOOP
+({
+  if (rs1 == 0)
+    vd = vs2;
+  else
+    vd = vs2 % rs1;
+})
diff --git a/riscv/insns/vrgather_vi.h b/riscv/insns/vrgather_vi.h
new file mode 100644
index 0000000000..385e9be973
--- /dev/null
+++ b/riscv/insns/vrgather_vi.h
@@ -0,0 +1,30 @@
+// vrgather.vi vd, vs2, zimm5 vm # vd[i] = (zimm5 >= VLMAX) ? 0 : vs2[zimm5];
+//
+// Every processed element of vd receives the same value: vs2[zimm5], or 0
+// when zimm5 is not below P.VU.vlmax.
+require_align(insn.rd(), P.VU.vflmul);
+require_align(insn.rs2(), P.VU.vflmul);
+require(insn.rd() != insn.rs2());
+require_vm;
+
+reg_t zimm5 = insn.v_zimm5();
+
+// VI_LOOP_BASE already iterates i over the elements (vrgather_vx.h uses it
+// the same way, with no explicit loop or element-skip of its own), so the
+// body must not open a second loop that repeats the whole gather each pass.
+VI_LOOP_BASE
+  switch (sew) {
+  case e8:
+    P.VU.elt<uint8_t>(rd_num, i, true) = zimm5 >= P.VU.vlmax ? 0 : P.VU.elt<uint8_t>(rs2_num, zimm5);
+    break;
+  case e16:
+    P.VU.elt<uint16_t>(rd_num, i, true) = zimm5 >= P.VU.vlmax ? 0 : P.VU.elt<uint16_t>(rs2_num, zimm5);
+    break;
+  case e32:
+    P.VU.elt<uint32_t>(rd_num, i, true) = zimm5 >= P.VU.vlmax ? 0 : P.VU.elt<uint32_t>(rs2_num, zimm5);
+    break;
+  default:
+    P.VU.elt<uint64_t>(rd_num, i, true) = zimm5 >= P.VU.vlmax ? 0 : P.VU.elt<uint64_t>(rs2_num, zimm5);
+    break;
+  }
+VI_LOOP_END;
diff --git a/riscv/insns/vrgather_vv.h b/riscv/insns/vrgather_vv.h
new file mode 100644
index 0000000000..a3a32f560f
--- /dev/null
+++ b/riscv/insns/vrgather_vv.h
@@ -0,0 +1,32 @@
+// vrgather.vv vd, vs2, vs1, vm # vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+require_align(insn.rd(), P.VU.vflmul);
+require_align(insn.rs2(), P.VU.vflmul);
+require_align(insn.rs1(), P.VU.vflmul);
+require(insn.rd() != insn.rs2() && insn.rd() != insn.rs1());
+require_vm;
+
+VI_LOOP_BASE
+  switch (sew) {
+  case e8: {
+    auto vs1 = P.VU.elt<uint8_t>(rs1_num, i);
+    //if (i > 255) continue;
+    P.VU.elt<uint8_t>(rd_num, i, true) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint8_t>(rs2_num, vs1);
+    break;
+  }
+  case e16: {
+    auto vs1 = P.VU.elt<uint16_t>(rs1_num, i);
+    P.VU.elt<uint16_t>(rd_num, i, true) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint16_t>(rs2_num, vs1);
+    break;
+  }
+  case e32: {
+    auto vs1 = P.VU.elt<uint32_t>(rs1_num, i);
+    P.VU.elt<uint32_t>(rd_num, i, true) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint32_t>(rs2_num, vs1);
+    break;
+  }
+  default: {
+    auto vs1 = P.VU.elt<uint64_t>(rs1_num, i);
+    P.VU.elt<uint64_t>(rd_num, i, true) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint64_t>(rs2_num, vs1);
+    break;
+  }
+  }
+VI_LOOP_END;
diff --git a/riscv/insns/vrgather_vx.h b/riscv/insns/vrgather_vx.h
new file mode 100644
index 0000000000..058ffae104
--- /dev/null
+++ b/riscv/insns/vrgather_vx.h
@@ -0,0 +1,24 @@
+// vrgather.vx vd, vs2, rs1, vm # vd[i] = (rs1 >= VLMAX) ? 0 : vs2[rs1];
+require_align(insn.rd(), P.VU.vflmul);
+require_align(insn.rs2(), P.VU.vflmul);
+require(insn.rd() != insn.rs2());
+require_vm;
+
+reg_t rs1 = RS1;
+
+VI_LOOP_BASE
+  switch (sew) {
+  case e8:
+    P.VU.elt<uint8_t>(rd_num, i, true) = rs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint8_t>(rs2_num, rs1);
+    break;
+  case e16:
+    P.VU.elt<uint16_t>(rd_num, i, true) = rs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint16_t>(rs2_num, rs1);
+    break;
+  case e32:
+    P.VU.elt<uint32_t>(rd_num, i, true) = rs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint32_t>(rs2_num, rs1);
+    break;
+  default:
+    P.VU.elt<uint64_t>(rd_num, i, true) = rs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint64_t>(rs2_num, rs1);
+    break;
+  }
+VI_LOOP_END;
diff --git a/riscv/insns/vrgatherei16_vv.h b/riscv/insns/vrgatherei16_vv.h
new file mode 100644
index 0000000000..3bb166a237
--- /dev/null
+++ b/riscv/insns/vrgatherei16_vv.h
@@ -0,0 +1,34 @@
+// vrgatherei16.vv vd, vs2, vs1, vm # vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+float vemul = (16.0 / P.VU.vsew * P.VU.vflmul);
+require(vemul >= 0.125 && vemul <= 8);
+require_align(insn.rd(), P.VU.vflmul);
+require_align(insn.rs2(), P.VU.vflmul);
+require_align(insn.rs1(), vemul);
+require_noover(insn.rd(), P.VU.vflmul, insn.rs1(), vemul);
+require(insn.rd() != insn.rs2());
+require_vm;
+
+VI_LOOP_BASE
+  switch (sew) {
+  case e8: {
+    auto vs1 = P.VU.elt<uint16_t>(rs1_num, i);
+    P.VU.elt<uint8_t>(rd_num, i, true) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint8_t>(rs2_num, vs1);
+    break;
+  }
+  case e16: {
+    auto vs1 = P.VU.elt<uint16_t>(rs1_num, i);
+    P.VU.elt<uint16_t>(rd_num, i, true) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint16_t>(rs2_num, vs1);
+    break;
+  }
+  case e32: {
+    auto vs1 = P.VU.elt<uint16_t>(rs1_num, i);
+    P.VU.elt<uint32_t>(rd_num, i, true) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint32_t>(rs2_num, vs1);
+    break;
+  }
+  default: {
+    auto vs1 = P.VU.elt<uint16_t>(rs1_num, i);
+    P.VU.elt<uint64_t>(rd_num, i, true) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint64_t>(rs2_num, vs1);
+    break;
+  }
+  }
+VI_LOOP_END;
diff --git a/riscv/insns/vrsub_vi.h b/riscv/insns/vrsub_vi.h
new file mode 100644
index 0000000000..198c33f927
--- /dev/null
+++ b/riscv/insns/vrsub_vi.h
@@ -0,0 +1,5 @@
+// vrsub.vi vd, vs2, imm, vm   # vd[i] = imm - vs2[i]
+VI_VI_LOOP
+({
+  vd = simm5 - vs2;
+})
diff --git a/riscv/insns/vrsub_vx.h b/riscv/insns/vrsub_vx.h
new file mode 100644
index 0000000000..bfd62594db
--- /dev/null
+++ b/riscv/insns/vrsub_vx.h
@@ -0,0 +1,5 @@
+// vrsub.vx vd, vs2, rs1, vm   # vd[i] = rs1 - vs2[i]
+VI_VX_LOOP
+({
+  vd = rs1 - vs2;
+})
diff --git a/riscv/insns/vs1r_v.h b/riscv/insns/vs1r_v.h
new file mode 100644
index 0000000000..1932ec0b88
--- /dev/null
+++ b/riscv/insns/vs1r_v.h
@@ -0,0 +1,2 @@
+// vs1r.v vs3, (rs1)
+VI_ST_WHOLE
diff --git a/riscv/insns/vs2r_v.h b/riscv/insns/vs2r_v.h
new file mode 100644
index 0000000000..2e515b476c
--- /dev/null
+++ b/riscv/insns/vs2r_v.h
@@ -0,0 +1,2 @@
+// vs2r.v vs3, (rs1)
+VI_ST_WHOLE
diff --git a/riscv/insns/vs4r_v.h b/riscv/insns/vs4r_v.h
new file mode 100644
index 0000000000..161bf89a31
--- /dev/null
+++ b/riscv/insns/vs4r_v.h
@@ -0,0 +1,2 @@
+// vs4r.v vs3, (rs1)
+VI_ST_WHOLE
diff --git a/riscv/insns/vs8r_v.h b/riscv/insns/vs8r_v.h
new file mode 100644
index 0000000000..1ad2575638
--- /dev/null
+++ b/riscv/insns/vs8r_v.h
@@ -0,0 +1,2 @@
+// vs8r.v vs3, (rs1)
+VI_ST_WHOLE
diff --git a/riscv/insns/vsadd_vi.h b/riscv/insns/vsadd_vi.h
new file mode 100644
index 0000000000..c361f087f7
--- /dev/null
+++ b/riscv/insns/vsadd_vi.h
@@ -0,0 +1,28 @@
+// vsadd.vi vd, vs2 simm5
+VI_CHECK_SSS(false);
+VI_LOOP_BASE
+bool sat = false;
+switch(sew) {
+case e8: {
+  VI_PARAMS(e8);
+  vd = sat_add<int8_t, uint8_t>(vs2, vsext(simm5, sew), sat);
+  break;
+}
+case e16: {
+  VI_PARAMS(e16);
+  vd = sat_add<int16_t, uint16_t>(vs2, vsext(simm5, sew), sat);
+  break;
+}
+case e32: {
+  VI_PARAMS(e32);
+  vd = sat_add<int32_t, uint32_t>(vs2, vsext(simm5, sew), sat);
+  break;
+}
+default: {
+  VI_PARAMS(e64);
+  vd = sat_add<int64_t, uint64_t>(vs2, vsext(simm5, sew), sat);
+  break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
diff --git a/riscv/insns/vsadd_vv.h b/riscv/insns/vsadd_vv.h
new file mode 100644
index 0000000000..ce0ef4071b
--- /dev/null
+++ b/riscv/insns/vsadd_vv.h
@@ -0,0 +1,29 @@
+// vsadd.vv vd, vs2, vs1
+VI_CHECK_SSS(true);
+VI_LOOP_BASE
+bool sat = false;
+switch(sew) {
+case e8: {
+  VV_PARAMS(e8);
+  vd = sat_add<int8_t, uint8_t>(vs2, vs1, sat);
+  break;
+}
+case e16: {
+  VV_PARAMS(e16);
+  vd = sat_add<int16_t, uint16_t>(vs2, vs1, sat);
+  break;
+}
+case e32: {
+  VV_PARAMS(e32);
+  vd = sat_add<int32_t, uint32_t>(vs2, vs1, sat);
+  break;
+}
+default: {
+  VV_PARAMS(e64);
+  vd = sat_add<int64_t, uint64_t>(vs2, vs1, sat);
+  break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
+
diff --git a/riscv/insns/vsadd_vx.h b/riscv/insns/vsadd_vx.h
new file mode 100644
index 0000000000..691f017fff
--- /dev/null
+++ b/riscv/insns/vsadd_vx.h
@@ -0,0 +1,28 @@
+// vsadd.vx vd, vs2, rs1
+VI_CHECK_SSS(false);
+VI_LOOP_BASE
+bool sat = false;
+switch(sew) {
+case e8: {
+  VX_PARAMS(e8);
+  vd = sat_add<int8_t, uint8_t>(vs2, rs1, sat);
+  break;
+}
+case e16: {
+  VX_PARAMS(e16);
+  vd = sat_add<int16_t, uint16_t>(vs2, rs1, sat);
+  break;
+}
+case e32: {
+  VX_PARAMS(e32);
+  vd = sat_add<int32_t, uint32_t>(vs2, rs1, sat);
+  break;
+}
+default: {
+  VX_PARAMS(e64);
+  vd = sat_add<int64_t, uint64_t>(vs2, rs1, sat);
+  break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
diff --git a/riscv/insns/vsaddu_vi.h b/riscv/insns/vsaddu_vi.h
new file mode 100644
index 0000000000..7a200dff74
--- /dev/null
+++ b/riscv/insns/vsaddu_vi.h
@@ -0,0 +1,11 @@
+// vsaddu.vi vd, vs2, simm5 (immediate truncated to SEW bits)
+VI_VI_ULOOP
+({
+  bool sat = false;
+  vd = vs2 + (insn.v_simm5() & (UINT64_MAX >> (64 - P.VU.vsew)));
+
+  sat = vd < vs2;
+  vd |= -(vd < vs2);
+
+  P.VU.vxsat |= sat;
+})
diff --git a/riscv/insns/vsaddu_vv.h b/riscv/insns/vsaddu_vv.h
new file mode 100644
index 0000000000..e5d7025f05
--- /dev/null
+++ b/riscv/insns/vsaddu_vv.h
@@ -0,0 +1,11 @@
+// vsaddu.vv vd, vs2, vs1
+VI_VV_ULOOP
+({
+  bool sat = false;
+  vd = vs2 + vs1;
+
+  sat = vd < vs2;
+  vd |= -(vd < vs2);
+
+  P.VU.vxsat |= sat;
+})
diff --git a/riscv/insns/vsaddu_vx.h b/riscv/insns/vsaddu_vx.h
new file mode 100644
index 0000000000..46ec29d9b7
--- /dev/null
+++ b/riscv/insns/vsaddu_vx.h
@@ -0,0 +1,12 @@
+// vsaddu.vx vd, vs2, rs1
+VI_VX_ULOOP
+({
+  bool sat = false;
+  vd = vs2 + rs1;
+
+  sat = vd < vs2;
+  vd |= -(vd < vs2);
+
+  P.VU.vxsat |= sat;
+
+})
diff --git a/riscv/insns/vsbc_vvm.h b/riscv/insns/vsbc_vvm.h
new file mode 100644
index 0000000000..96b8bb8041
--- /dev/null
+++ b/riscv/insns/vsbc_vvm.h
@@ -0,0 +1,10 @@
+// vsbc.vvm vd, vs2, vs1, v0
+VI_VV_LOOP_WITH_CARRY
+({
+  auto &v0 = P.VU.elt<uint64_t>(0, midx);
+  const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+  uint64_t carry = (v0 >> mpos) & 0x1;
+
+  uint128_t res = (op_mask & vs2) - (op_mask & vs1) - carry;
+  vd = res;
+})
diff --git a/riscv/insns/vsbc_vxm.h b/riscv/insns/vsbc_vxm.h
new file mode 100644
index 0000000000..c6f9ca82a2
--- /dev/null
+++ b/riscv/insns/vsbc_vxm.h
@@ -0,0 +1,10 @@
+// vsbc.vxm vd, vs2, rs1, v0
+VI_XI_LOOP_WITH_CARRY
+({
+  auto &v0 = P.VU.elt<uint64_t>(0, midx);
+  const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+  uint64_t carry = (v0 >> mpos) & 0x1;
+
+  uint128_t res = (op_mask & vs2) - (op_mask & rs1) - carry;
+  vd = res;
+})
diff --git a/riscv/insns/vse16_v.h b/riscv/insns/vse16_v.h
new file mode 100644
index 0000000000..20b04c869b
--- /dev/null
+++ b/riscv/insns/vse16_v.h
@@ -0,0 +1,2 @@
+// vse16.v and vsseg[2-8]e16.v
+VI_ST(0, (i * nf + fn), uint16);
diff --git a/riscv/insns/vse32_v.h b/riscv/insns/vse32_v.h
new file mode 100644
index 0000000000..efd2973d07
--- /dev/null
+++ b/riscv/insns/vse32_v.h
@@ -0,0 +1,2 @@
+// vse32.v and vsseg[2-8]e32.v
+VI_ST(0, (i * nf + fn), uint32);
diff --git a/riscv/insns/vse64_v.h b/riscv/insns/vse64_v.h
new file mode 100644
index 0000000000..9b36c8d8d2
--- /dev/null
+++ b/riscv/insns/vse64_v.h
@@ -0,0 +1,2 @@
+// vse64.v and vsseg[2-8]e64.v
+VI_ST(0, (i * nf + fn), uint64);
diff --git a/riscv/insns/vse8_v.h b/riscv/insns/vse8_v.h
new file mode 100644
index 0000000000..32dee14b4c
--- /dev/null
+++ b/riscv/insns/vse8_v.h
@@ -0,0 +1,2 @@
+// vse8.v and vsseg[2-8]e8.v
+VI_ST(0, (i * nf + fn), uint8);
diff --git a/riscv/insns/vsetvl.h b/riscv/insns/vsetvl.h
new file mode 100644
index 0000000000..2969edc6ce
--- /dev/null
+++ b/riscv/insns/vsetvl.h
@@ -0,0 +1,2 @@
+require_vector_novtype(false, false);
+WRITE_RD(P.VU.set_vl(insn.rd(), insn.rs1(), RS1, RS2));
diff --git a/riscv/insns/vsetvli.h b/riscv/insns/vsetvli.h
new file mode 100644
index 0000000000..7b1f1d716c
--- /dev/null
+++ b/riscv/insns/vsetvli.h
@@ -0,0 +1,2 @@
+require_vector_novtype(false, false);
+WRITE_RD(P.VU.set_vl(insn.rd(), insn.rs1(), RS1, insn.v_zimm11()));
diff --git a/riscv/insns/vsext_vf2.h b/riscv/insns/vsext_vf2.h
new file mode 100644
index 0000000000..16ccfac607
--- /dev/null
+++ b/riscv/insns/vsext_vf2.h
@@ -0,0 +1 @@
+VI_VV_EXT(2, int);
diff --git a/riscv/insns/vsext_vf4.h b/riscv/insns/vsext_vf4.h
new file mode 100644
index 0000000000..d4476a310f
--- /dev/null
+++ b/riscv/insns/vsext_vf4.h
@@ -0,0 +1 @@
+VI_VV_EXT(4, int);
diff --git a/riscv/insns/vsext_vf8.h b/riscv/insns/vsext_vf8.h
new file mode 100644
index 0000000000..09fdc2c75c
--- /dev/null
+++ b/riscv/insns/vsext_vf8.h
@@ -0,0 +1 @@
+VI_VV_EXT(8, int);
diff --git a/riscv/insns/vslide1down_vx.h b/riscv/insns/vslide1down_vx.h
new file mode 100644
index 0000000000..e867722fa8
--- /dev/null
+++ b/riscv/insns/vslide1down_vx.h
@@ -0,0 +1,44 @@
+//vslide1down.vx vd, vs2, rs1
+VI_CHECK_SLIDE(false);
+
+VI_LOOP_BASE
+if (i != vl - 1) {
+  switch (sew) {
+  case e8: {
+    VI_XI_SLIDEDOWN_PARAMS(e8, 1);
+    vd = vs2;
+  }
+  break;
+  case e16: {
+    VI_XI_SLIDEDOWN_PARAMS(e16, 1);
+    vd = vs2;
+  }
+  break;
+  case e32: {
+    VI_XI_SLIDEDOWN_PARAMS(e32, 1);
+    vd = vs2;
+  }
+  break;
+  default: {
+    VI_XI_SLIDEDOWN_PARAMS(e64, 1);
+    vd = vs2;
+  }
+  break;
+  }
+} else {
+  switch (sew) {
+  case e8:
+    P.VU.elt<uint8_t>(rd_num, vl - 1, true) = RS1;
+    break;
+  case e16:
+    P.VU.elt<uint16_t>(rd_num, vl - 1, true) = RS1;
+    break;
+  case e32:
+    P.VU.elt<uint32_t>(rd_num, vl - 1, true) = RS1;
+    break;
+  default:
+    P.VU.elt<uint64_t>(rd_num, vl - 1, true) = RS1;
+    break;
+  }
+}
+VI_LOOP_END
diff --git a/riscv/insns/vslide1up_vx.h b/riscv/insns/vslide1up_vx.h
new file mode 100644
index 0000000000..33cb9ed641
--- /dev/null
+++ b/riscv/insns/vslide1up_vx.h
@@ -0,0 +1,30 @@
+//vslide1up.vx vd, vs2, rs1
+VI_CHECK_SLIDE(true);
+
+VI_LOOP_BASE
+if (i != 0) {
+  if (sew == e8) {
+    VI_XI_SLIDEUP_PARAMS(e8, 1);
+    vd = vs2;
+  } else if(sew == e16) {
+    VI_XI_SLIDEUP_PARAMS(e16, 1);
+    vd = vs2;
+  } else if(sew == e32) {
+    VI_XI_SLIDEUP_PARAMS(e32, 1);
+    vd = vs2;
+  } else if(sew == e64) {
+    VI_XI_SLIDEUP_PARAMS(e64, 1);
+    vd = vs2;
+  }
+} else {
+  if (sew == e8) {
+    P.VU.elt<uint8_t>(rd_num, 0, true) = RS1;
+  } else if(sew == e16) {
+    P.VU.elt<uint16_t>(rd_num, 0, true) = RS1;
+  } else if(sew == e32) {
+    P.VU.elt<uint32_t>(rd_num, 0, true) = RS1;
+  } else if(sew == e64) {
+    P.VU.elt<uint64_t>(rd_num, 0, true) = RS1;
+  }
+}
+VI_LOOP_END
diff --git a/riscv/insns/vslidedown_vi.h b/riscv/insns/vslidedown_vi.h
new file mode 100644
index 0000000000..bc440cf2bd
--- /dev/null
+++ b/riscv/insns/vslidedown_vi.h
@@ -0,0 +1,36 @@
+// vslidedown.vi vd, vs2, zimm5
+VI_CHECK_SLIDE(false);
+
+const reg_t sh = insn.v_zimm5();
+VI_LOOP_BASE
+
+reg_t offset = 0;
+bool is_valid = (i + sh) < P.VU.vlmax;
+
+if (is_valid) {
+  offset = sh;
+}
+
+switch (sew) {
+case e8: {
+  VI_XI_SLIDEDOWN_PARAMS(e8, offset);
+  vd = is_valid ? vs2 : 0;
+}
+break;
+case e16: {
+  VI_XI_SLIDEDOWN_PARAMS(e16, offset);
+  vd = is_valid ? vs2 : 0;
+}
+break;
+case e32: {
+  VI_XI_SLIDEDOWN_PARAMS(e32, offset);
+  vd = is_valid ? vs2 : 0;
+}
+break;
+default: {
+  VI_XI_SLIDEDOWN_PARAMS(e64, offset);
+  vd = is_valid ? vs2 : 0;
+}
+break;
+}
+VI_LOOP_END
diff --git a/riscv/insns/vslidedown_vx.h b/riscv/insns/vslidedown_vx.h
new file mode 100644
index 0000000000..074aa50868
--- /dev/null
+++ b/riscv/insns/vslidedown_vx.h
@@ -0,0 +1,36 @@
+//vslidedown.vx vd, vs2, rs1
+VI_CHECK_SLIDE(false);
+
+const uint128_t sh = RS1;
+VI_LOOP_BASE
+
+reg_t offset = 0;
+bool is_valid = (i + sh) < P.VU.vlmax;
+
+if (is_valid) {
+  offset = sh;
+}
+
+switch (sew) {
+case e8: {
+  VI_XI_SLIDEDOWN_PARAMS(e8, offset);
+  vd = is_valid ? vs2 : 0;
+}
+break;
+case e16: {
+  VI_XI_SLIDEDOWN_PARAMS(e16, offset);
+  vd = is_valid ? vs2 : 0;
+}
+break;
+case e32: {
+  VI_XI_SLIDEDOWN_PARAMS(e32, offset);
+  vd = is_valid ? vs2 : 0;
+}
+break;
+default: {
+  VI_XI_SLIDEDOWN_PARAMS(e64, offset);
+  vd = is_valid ? vs2 : 0;
+}
+break;
+}
+VI_LOOP_END
diff --git a/riscv/insns/vslideup_vi.h b/riscv/insns/vslideup_vi.h
new file mode 100644
index 0000000000..42657892af
--- /dev/null
+++ b/riscv/insns/vslideup_vi.h
@@ -0,0 +1,31 @@
+// vslideup.vi vd, vs2, zimm5
+VI_CHECK_SLIDE(true);
+
+const reg_t offset = insn.v_zimm5();
+VI_LOOP_BASE
+if (P.VU.vstart < offset && i < offset)
+  continue;
+
+switch (sew) {
+case e8: {
+  VI_XI_SLIDEUP_PARAMS(e8, offset);
+  vd = vs2;
+}
+break;
+case e16: {
+  VI_XI_SLIDEUP_PARAMS(e16, offset);
+  vd = vs2;
+}
+break;
+case e32: {
+  VI_XI_SLIDEUP_PARAMS(e32, offset);
+  vd = vs2;
+}
+break;
+default: {
+  VI_XI_SLIDEUP_PARAMS(e64, offset);
+  vd = vs2;
+}
+break;
+}
+VI_LOOP_END
diff --git a/riscv/insns/vslideup_vx.h b/riscv/insns/vslideup_vx.h
new file mode 100644
index 0000000000..720d2ab53b
--- /dev/null
+++ b/riscv/insns/vslideup_vx.h
@@ -0,0 +1,31 @@
+//vslideup.vx vd, vs2, rs1
+VI_CHECK_SLIDE(true);
+
+const reg_t offset = RS1;
+VI_LOOP_BASE
+if (P.VU.vstart < offset && i < offset)
+  continue;
+
+switch (sew) {
+case e8: {
+  VI_XI_SLIDEUP_PARAMS(e8, offset);
+  vd = vs2;
+}
+break;
+case e16: {
+  VI_XI_SLIDEUP_PARAMS(e16, offset);
+  vd = vs2;
+}
+break;
+case e32: {
+  VI_XI_SLIDEUP_PARAMS(e32, offset);
+  vd = vs2;
+}
+break;
+default: {
+  VI_XI_SLIDEUP_PARAMS(e64, offset);
+  vd = vs2;
+}
+break;
+}
+VI_LOOP_END
diff --git a/riscv/insns/vsll_vi.h b/riscv/insns/vsll_vi.h
new file mode 100644
index 0000000000..be4650669f
--- /dev/null
+++ b/riscv/insns/vsll_vi.h
@@ -0,0 +1,5 @@
+// vsll.vi  vd, vs2, zimm5
+VI_VI_LOOP
+({
+  vd = vs2 << (simm5 & (sew - 1) & 0x1f);
+})
diff --git a/riscv/insns/vsll_vv.h b/riscv/insns/vsll_vv.h
new file mode 100644
index 0000000000..ce82022504
--- /dev/null
+++ b/riscv/insns/vsll_vv.h
@@ -0,0 +1,5 @@
+// vsll.vv vd, vs2, vs1
+VI_VV_LOOP
+({
+  vd = vs2 << (vs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsll_vx.h b/riscv/insns/vsll_vx.h
new file mode 100644
index 0000000000..823510b2c5
--- /dev/null
+++ b/riscv/insns/vsll_vx.h
@@ -0,0 +1,5 @@
+// vsll.vx vd, vs2, rs1
+VI_VX_LOOP
+({
+  vd = vs2 << (rs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsmul_vv.h b/riscv/insns/vsmul_vv.h
new file mode 100644
index 0000000000..e7ce306e9e
--- /dev/null
+++ b/riscv/insns/vsmul_vv.h
@@ -0,0 +1,32 @@
+// vsmul.vv vd, vs2, vs1
+VRM xrm = P.VU.get_vround_mode();
+int64_t int_max = INT64_MAX >> (64 - P.VU.vsew);
+int64_t int_min = INT64_MIN >> (64 - P.VU.vsew);
+int64_t sign_mask = uint64_t(1) << (P.VU.vsew - 1);
+
+VI_VV_LOOP
+({
+  int64_t vs1_sign;
+  int64_t vs2_sign;
+  int64_t result_sign;
+
+  vs1_sign = vs1 & sign_mask;
+  vs2_sign = vs2 & sign_mask;
+  bool overflow = vs1 == vs2 && vs1 == int_min;
+
+  int128_t result = (int128_t)vs1 * (int128_t)vs2;
+  result_sign = (vs1_sign ^ vs2_sign) & sign_mask;
+
+  // rounding
+  INT_ROUNDING(result, xrm, sew - 1);
+  // remove guard bits
+  result = result >> (sew - 1);
+
+  // saturate the single overflow case (both operands equal int_min)
+  if (overflow) {
+    result = int_max;
+    P.VU.vxsat |= 1;
+  }
+
+  vd = result;
+})
diff --git a/riscv/insns/vsmul_vx.h b/riscv/insns/vsmul_vx.h
new file mode 100644
index 0000000000..cf4b511423
--- /dev/null
+++ b/riscv/insns/vsmul_vx.h
@@ -0,0 +1,33 @@
+// vsmul.vx vd, vs2, rs1
+VRM xrm = P.VU.get_vround_mode();
+int64_t int_max = INT64_MAX >> (64 - P.VU.vsew);
+int64_t int_min = INT64_MIN >> (64 - P.VU.vsew);
+int64_t sign_mask = uint64_t(1) << (P.VU.vsew - 1);
+
+VI_VX_LOOP
+({
+  int64_t rs1_sign;
+  int64_t vs2_sign;
+  int64_t result_sign;
+
+  rs1_sign = rs1 & sign_mask;
+  vs2_sign = vs2 & sign_mask;
+  bool overflow = rs1 == vs2 && rs1 == int_min;
+
+  int128_t result = (int128_t)rs1 * (int128_t)vs2;
+  result_sign = (rs1_sign ^ vs2_sign) & sign_mask;
+
+  // rounding
+  INT_ROUNDING(result, xrm, sew - 1);
+
+  // remove guard bits
+  result = result >> (sew - 1);
+
+  // saturate to int_max in the single overflow case (both operands equal int_min)
+  if (overflow) {
+    result = int_max;
+    P.VU.vxsat |= 1;
+  }
+
+  vd = result;
+})
diff --git a/riscv/insns/vsra_vi.h b/riscv/insns/vsra_vi.h
new file mode 100644
index 0000000000..5c589274ef
--- /dev/null
+++ b/riscv/insns/vsra_vi.h
@@ -0,0 +1,5 @@
+// vsra.vi vd, vs2, zimm5
+VI_VI_LOOP
+({
+  vd = vs2 >> (simm5 & (sew - 1) & 0x1f);
+})
diff --git a/riscv/insns/vsra_vv.h b/riscv/insns/vsra_vv.h
new file mode 100644
index 0000000000..8889af9c08
--- /dev/null
+++ b/riscv/insns/vsra_vv.h
@@ -0,0 +1,5 @@
+// vsra.vv  vd, vs2, vs1
+VI_VV_LOOP
+({
+  vd = vs2 >> (vs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsra_vx.h b/riscv/insns/vsra_vx.h
new file mode 100644
index 0000000000..c1b0c10728
--- /dev/null
+++ b/riscv/insns/vsra_vx.h
@@ -0,0 +1,5 @@
+// vsra.vx vd, vs2, rs1
+VI_VX_LOOP
+({
+  vd = vs2 >> (rs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsrl_vi.h b/riscv/insns/vsrl_vi.h
new file mode 100644
index 0000000000..fe5d272025
--- /dev/null
+++ b/riscv/insns/vsrl_vi.h
@@ -0,0 +1,5 @@
+// vsrl.vi vd, vs2, zimm5
+VI_VI_ULOOP
+({
+  vd = vs2 >> (zimm5 & (sew - 1) & 0x1f);
+})
diff --git a/riscv/insns/vsrl_vv.h b/riscv/insns/vsrl_vv.h
new file mode 100644
index 0000000000..6376af36bc
--- /dev/null
+++ b/riscv/insns/vsrl_vv.h
@@ -0,0 +1,5 @@
+// vsrl.vv  vd, vs2, vs1
+VI_VV_ULOOP
+({
+  vd = vs2 >> (vs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsrl_vx.h b/riscv/insns/vsrl_vx.h
new file mode 100644
index 0000000000..a4f899ca2c
--- /dev/null
+++ b/riscv/insns/vsrl_vx.h
@@ -0,0 +1,5 @@
+// vsrl.vx vd, vs2, rs1
+VI_VX_ULOOP
+({
+  vd = vs2 >> (rs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsse16_v.h b/riscv/insns/vsse16_v.h
new file mode 100644
index 0000000000..adbbcf5c5d
--- /dev/null
+++ b/riscv/insns/vsse16_v.h
@@ -0,0 +1,2 @@
+// vsse16.v and vssseg[2-8]e16.v
+VI_ST(i * RS2, fn, uint16);
diff --git a/riscv/insns/vsse32_v.h b/riscv/insns/vsse32_v.h
new file mode 100644
index 0000000000..73bd272b0a
--- /dev/null
+++ b/riscv/insns/vsse32_v.h
@@ -0,0 +1,2 @@
+// vsse32.v and vssseg[2-8]e32.v
+VI_ST(i * RS2, fn, uint32);
diff --git a/riscv/insns/vsse64_v.h b/riscv/insns/vsse64_v.h
new file mode 100644
index 0000000000..1785a56877
--- /dev/null
+++ b/riscv/insns/vsse64_v.h
@@ -0,0 +1,2 @@
+// vsse64.v and vssseg[2-8]e64.v
+VI_ST(i * RS2, fn, uint64);
diff --git a/riscv/insns/vsse8_v.h b/riscv/insns/vsse8_v.h
new file mode 100644
index 0000000000..c5daf0bce8
--- /dev/null
+++ b/riscv/insns/vsse8_v.h
@@ -0,0 +1,2 @@
+// vsse8.v and vssseg[2-8]e8.v
+VI_ST(i * RS2, fn, uint8);
diff --git a/riscv/insns/vssra_vi.h b/riscv/insns/vssra_vi.h
new file mode 100644
index 0000000000..ff2e1c585c
--- /dev/null
+++ b/riscv/insns/vssra_vi.h
@@ -0,0 +1,10 @@
+// vssra.vi vd, vs2, simm5
+VRM xrm = P.VU.get_vround_mode();
+VI_VI_LOOP
+({
+  int sh = simm5 & (sew - 1) & 0x1f;
+  int128_t val = vs2;
+
+  INT_ROUNDING(val, xrm, sh);
+  vd = val >> sh;
+})
diff --git a/riscv/insns/vssra_vv.h b/riscv/insns/vssra_vv.h
new file mode 100644
index 0000000000..7bbc766ff1
--- /dev/null
+++ b/riscv/insns/vssra_vv.h
@@ -0,0 +1,10 @@
+// vssra.vv vd, vs2, vs1
+VRM xrm = P.VU.get_vround_mode();
+VI_VV_LOOP
+({
+  int sh = vs1 & (sew - 1);
+  int128_t val = vs2;
+
+  INT_ROUNDING(val, xrm, sh);
+  vd = val >> sh;
+})
diff --git a/riscv/insns/vssra_vx.h b/riscv/insns/vssra_vx.h
new file mode 100644
index 0000000000..068a22b692
--- /dev/null
+++ b/riscv/insns/vssra_vx.h
@@ -0,0 +1,10 @@
+// vssra.vx vd, vs2, rs1
+VRM xrm = P.VU.get_vround_mode();
+VI_VX_LOOP
+({
+  int sh = rs1 & (sew - 1);
+  int128_t val = vs2;
+
+  INT_ROUNDING(val, xrm, sh);
+  vd = val >> sh;
+})
diff --git a/riscv/insns/vssrl_vi.h b/riscv/insns/vssrl_vi.h
new file mode 100644
index 0000000000..d125164d6a
--- /dev/null
+++ b/riscv/insns/vssrl_vi.h
@@ -0,0 +1,10 @@
+// vssrl.vi vd, vs2, zimm5
+VRM xrm = P.VU.get_vround_mode();
+VI_VI_ULOOP
+({
+  int sh = zimm5 & (sew - 1) & 0x1f;
+  uint128_t val = vs2;
+
+  INT_ROUNDING(val, xrm, sh);
+  vd = val >> sh;
+})
diff --git a/riscv/insns/vssrl_vv.h b/riscv/insns/vssrl_vv.h
new file mode 100644
index 0000000000..a8e5d16423
--- /dev/null
+++ b/riscv/insns/vssrl_vv.h
@@ -0,0 +1,10 @@
+// vssrl.vv vd, vs2, vs1
+VRM xrm = P.VU.get_vround_mode();
+VI_VV_ULOOP
+({
+  int sh = vs1 & (sew - 1);
+  uint128_t val = vs2;
+
+  INT_ROUNDING(val, xrm, sh);
+  vd = val >> sh;
+})
diff --git a/riscv/insns/vssrl_vx.h b/riscv/insns/vssrl_vx.h
new file mode 100644
index 0000000000..ee3cb3462f
--- /dev/null
+++ b/riscv/insns/vssrl_vx.h
@@ -0,0 +1,10 @@
+// vssrl.vx vd, vs2, rs1
+VRM xrm = P.VU.get_vround_mode();
+VI_VX_ULOOP
+({
+  int sh = rs1 & (sew - 1);
+  uint128_t val = vs2;
+
+  INT_ROUNDING(val, xrm, sh);
+  vd = val >> sh;
+})
diff --git a/riscv/insns/vssub_vv.h b/riscv/insns/vssub_vv.h
new file mode 100644
index 0000000000..18fe4fb5f0
--- /dev/null
+++ b/riscv/insns/vssub_vv.h
@@ -0,0 +1,29 @@
+// vssub.vv vd, vs2, vs1
+VI_CHECK_SSS(true);
+VI_LOOP_BASE
+bool sat = false;
+
+switch (sew) {
+case e8: {
+  VV_PARAMS(e8);
+  vd = sat_sub<int8_t, uint8_t>(vs2, vs1, sat);
+  break;
+}
+case e16: {
+  VV_PARAMS(e16);
+  vd = sat_sub<int16_t, uint16_t>(vs2, vs1, sat);
+  break;
+}
+case e32: {
+  VV_PARAMS(e32);
+  vd = sat_sub<int32_t, uint32_t>(vs2, vs1, sat);
+  break;
+}
+default: {
+  VV_PARAMS(e64);
+  vd = sat_sub<int64_t, uint64_t>(vs2, vs1, sat);
+  break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
diff --git a/riscv/insns/vssub_vx.h b/riscv/insns/vssub_vx.h
new file mode 100644
index 0000000000..7a01125644
--- /dev/null
+++ b/riscv/insns/vssub_vx.h
@@ -0,0 +1,29 @@
+// vssub.vx vd, vs2, rs1
+VI_CHECK_SSS(false);
+VI_LOOP_BASE
+bool sat = false;
+
+switch (sew) {
+case e8: {
+  VX_PARAMS(e8);
+  vd = sat_sub<int8_t, uint8_t>(vs2, rs1, sat);
+  break;
+}
+case e16: {
+  VX_PARAMS(e16);
+  vd = sat_sub<int16_t, uint16_t>(vs2, rs1, sat);
+  break;
+}
+case e32: {
+  VX_PARAMS(e32);
+  vd = sat_sub<int32_t, uint32_t>(vs2, rs1, sat);
+  break;
+}
+default: {
+  VX_PARAMS(e64);
+  vd = sat_sub<int64_t, uint64_t>(vs2, rs1, sat);
+  break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
diff --git a/riscv/insns/vssubu_vv.h b/riscv/insns/vssubu_vv.h
new file mode 100644
index 0000000000..e58076ebe4
--- /dev/null
+++ b/riscv/insns/vssubu_vv.h
@@ -0,0 +1,30 @@
+// vssubu.vv vd, vs2, vs1
+VI_CHECK_SSS(true);
+VI_LOOP_BASE
+bool sat = false;
+
+switch (sew) {
+case e8: {
+  VV_U_PARAMS(e8);
+  vd = sat_subu<uint8_t>(vs2, vs1, sat);
+  break;
+}
+case e16: {
+  VV_U_PARAMS(e16);
+  vd = sat_subu<uint16_t>(vs2, vs1, sat);
+  break;
+}
+case e32: {
+  VV_U_PARAMS(e32);
+  vd = sat_subu<uint32_t>(vs2, vs1, sat);
+  break;
+}
+default: {
+  VV_U_PARAMS(e64);
+  vd = sat_subu<uint64_t>(vs2, vs1, sat);
+  break;
+}
+}
+P.VU.vxsat |= sat;
+
+VI_LOOP_END
diff --git a/riscv/insns/vssubu_vx.h b/riscv/insns/vssubu_vx.h
new file mode 100644
index 0000000000..556c759f59
--- /dev/null
+++ b/riscv/insns/vssubu_vx.h
@@ -0,0 +1,29 @@
+// vssubu.vx vd, vs2, rs1
+VI_CHECK_SSS(false);
+VI_LOOP_BASE
+bool sat = false;
+
+switch (sew) {
+case e8: {
+  VX_U_PARAMS(e8);
+  vd = sat_subu<uint8_t>(vs2, rs1, sat);
+  break;
+}
+case e16: {
+  VX_U_PARAMS(e16);
+  vd = sat_subu<uint16_t>(vs2, rs1, sat);
+  break;
+}
+case e32: {
+  VX_U_PARAMS(e32);
+  vd = sat_subu<uint32_t>(vs2, rs1, sat);
+  break;
+}
+default: {
+  VX_U_PARAMS(e64);
+  vd = sat_subu<uint64_t>(vs2, rs1, sat);
+  break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
diff --git a/riscv/insns/vsub_vv.h b/riscv/insns/vsub_vv.h
new file mode 100644
index 0000000000..7d119d50fd
--- /dev/null
+++ b/riscv/insns/vsub_vv.h
@@ -0,0 +1,5 @@
+// vsub.vv vd, vs2, vs1
+VI_VV_LOOP
+({
+  vd = vs2 - vs1;
+})
diff --git a/riscv/insns/vsub_vx.h b/riscv/insns/vsub_vx.h
new file mode 100644
index 0000000000..e075b42370
--- /dev/null
+++ b/riscv/insns/vsub_vx.h
@@ -0,0 +1,5 @@
+// vsub.vx vd, vs2, rs1: vd[i] = vs2[i] - x[rs1]
+VI_VX_LOOP
+({
+  vd = vs2 - rs1;
+})
diff --git a/riscv/insns/vsuxei16_v.h b/riscv/insns/vsuxei16_v.h
new file mode 100644
index 0000000000..f361c03f81
--- /dev/null
+++ b/riscv/insns/vsuxei16_v.h
@@ -0,0 +1,2 @@
+// vsuxei16.v
+VI_ST_INDEX(e16, false);
diff --git a/riscv/insns/vsuxei32_v.h b/riscv/insns/vsuxei32_v.h
new file mode 100644
index 0000000000..c1c8dc7c51
--- /dev/null
+++ b/riscv/insns/vsuxei32_v.h
@@ -0,0 +1,2 @@
+// vsuxei32.v
+VI_ST_INDEX(e32, false);
diff --git a/riscv/insns/vsuxei64_v.h b/riscv/insns/vsuxei64_v.h
new file mode 100644
index 0000000000..0c619cf1d8
--- /dev/null
+++ b/riscv/insns/vsuxei64_v.h
@@ -0,0 +1,2 @@
+// vsuxei64.v
+VI_ST_INDEX(e64, false);
diff --git a/riscv/insns/vsuxei8_v.h b/riscv/insns/vsuxei8_v.h
new file mode 100644
index 0000000000..36d2a11c01
--- /dev/null
+++ b/riscv/insns/vsuxei8_v.h
@@ -0,0 +1,2 @@
+// vsuxei8.v
+VI_ST_INDEX(e8, false);
diff --git a/riscv/insns/vsxei16_v.h b/riscv/insns/vsxei16_v.h
new file mode 100644
index 0000000000..42c3c78dbb
--- /dev/null
+++ b/riscv/insns/vsxei16_v.h
@@ -0,0 +1,2 @@
+// vsxei16.v and vsxseg[2-8]ei16.v
+VI_ST_INDEX(e16, true);
diff --git a/riscv/insns/vsxei32_v.h b/riscv/insns/vsxei32_v.h
new file mode 100644
index 0000000000..f0aed6bd6e
--- /dev/null
+++ b/riscv/insns/vsxei32_v.h
@@ -0,0 +1,2 @@
+// vsxei32.v and vsxseg[2-8]ei32.v
+VI_ST_INDEX(e32, true);
diff --git a/riscv/insns/vsxei64_v.h b/riscv/insns/vsxei64_v.h
new file mode 100644
index 0000000000..88ddaf3fd7
--- /dev/null
+++ b/riscv/insns/vsxei64_v.h
@@ -0,0 +1,2 @@
+// vsxei64.v and vsxseg[2-8]ei64.v
+VI_ST_INDEX(e64, true);
diff --git a/riscv/insns/vsxei8_v.h b/riscv/insns/vsxei8_v.h
new file mode 100644
index 0000000000..621512c50c
--- /dev/null
+++ b/riscv/insns/vsxei8_v.h
@@ -0,0 +1,2 @@
+// vsxei8.v and vsxseg[2-8]ei8.v
+VI_ST_INDEX(e8, true);
diff --git a/riscv/insns/vwadd_vv.h b/riscv/insns/vwadd_vv.h
new file mode 100644
index 0000000000..df4a13534d
--- /dev/null
+++ b/riscv/insns/vwadd_vv.h
@@ -0,0 +1,6 @@
+// vwadd.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, +, +, int);
+})
diff --git a/riscv/insns/vwadd_vx.h b/riscv/insns/vwadd_vx.h
new file mode 100644
index 0000000000..c226389342
--- /dev/null
+++ b/riscv/insns/vwadd_vx.h
@@ -0,0 +1,6 @@
+// vwadd.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, +, +, int);
+})
diff --git a/riscv/insns/vwadd_wv.h b/riscv/insns/vwadd_wv.h
new file mode 100644
index 0000000000..54d2ba4072
--- /dev/null
+++ b/riscv/insns/vwadd_wv.h
@@ -0,0 +1,6 @@
+// vwadd.wv vd, vs2, vs1
+VI_CHECK_DDS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_WVX_OP(vs1, +, int);
+})
diff --git a/riscv/insns/vwadd_wx.h b/riscv/insns/vwadd_wx.h
new file mode 100644
index 0000000000..bb4cee5100
--- /dev/null
+++ b/riscv/insns/vwadd_wx.h
@@ -0,0 +1,6 @@
+// vwadd.wx vd, vs2, rs1
+VI_CHECK_DDS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_WVX_OP(rs1, +, int);
+})
diff --git a/riscv/insns/vwaddu_vv.h b/riscv/insns/vwaddu_vv.h
new file mode 100644
index 0000000000..286ebc858e
--- /dev/null
+++ b/riscv/insns/vwaddu_vv.h
@@ -0,0 +1,6 @@
+// vwaddu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, +, +, uint);
+})
diff --git a/riscv/insns/vwaddu_vx.h b/riscv/insns/vwaddu_vx.h
new file mode 100644
index 0000000000..61cddfc897
--- /dev/null
+++ b/riscv/insns/vwaddu_vx.h
@@ -0,0 +1,6 @@
+// vwaddu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, +, +, uint);
+})
diff --git a/riscv/insns/vwaddu_wv.h b/riscv/insns/vwaddu_wv.h
new file mode 100644
index 0000000000..fee813657e
--- /dev/null
+++ b/riscv/insns/vwaddu_wv.h
@@ -0,0 +1,6 @@
+// vwaddu.wv vd, vs2, vs1
+VI_CHECK_DDS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_WVX_OP(vs1, +, uint);
+})
diff --git a/riscv/insns/vwaddu_wx.h b/riscv/insns/vwaddu_wx.h
new file mode 100644
index 0000000000..0073ac35c5
--- /dev/null
+++ b/riscv/insns/vwaddu_wx.h
@@ -0,0 +1,6 @@
+// vwaddu.wx vd, vs2, rs1
+VI_CHECK_DDS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_WVX_OP(rs1, +, uint);
+})
diff --git a/riscv/insns/vwmacc_vv.h b/riscv/insns/vwmacc_vv.h
new file mode 100644
index 0000000000..7208c6d696
--- /dev/null
+++ b/riscv/insns/vwmacc_vv.h
@@ -0,0 +1,6 @@
+// vwmacc.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, vs1, vd_w, *, +, int);
+})
diff --git a/riscv/insns/vwmacc_vx.h b/riscv/insns/vwmacc_vx.h
new file mode 100644
index 0000000000..5ae597a267
--- /dev/null
+++ b/riscv/insns/vwmacc_vx.h
@@ -0,0 +1,6 @@
+// vwmacc.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, rs1, vd_w, *, +, int);
+})
diff --git a/riscv/insns/vwmaccsu_vv.h b/riscv/insns/vwmaccsu_vv.h
new file mode 100644
index 0000000000..3aa43ef44d
--- /dev/null
+++ b/riscv/insns/vwmaccsu_vv.h
@@ -0,0 +1,6 @@
+// vwmaccsu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN_MIX(vs2, vs1, vd_w, *, +, int, uint, int);
+})
diff --git a/riscv/insns/vwmaccsu_vx.h b/riscv/insns/vwmaccsu_vx.h
new file mode 100644
index 0000000000..e00a21ddc1
--- /dev/null
+++ b/riscv/insns/vwmaccsu_vx.h
@@ -0,0 +1,6 @@
+// vwmaccsu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN_MIX(vs2, rs1, vd_w, *, +, int, uint, int);
+})
diff --git a/riscv/insns/vwmaccu_vv.h b/riscv/insns/vwmaccu_vv.h
new file mode 100644
index 0000000000..2cbdaa312b
--- /dev/null
+++ b/riscv/insns/vwmaccu_vv.h
@@ -0,0 +1,6 @@
+// vwmaccu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, vs1, vd_w, *, +, uint);
+})
diff --git a/riscv/insns/vwmaccu_vx.h b/riscv/insns/vwmaccu_vx.h
new file mode 100644
index 0000000000..533297f3fa
--- /dev/null
+++ b/riscv/insns/vwmaccu_vx.h
@@ -0,0 +1,6 @@
+// vwmaccu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, rs1, vd_w, *, +, uint);
+})
diff --git a/riscv/insns/vwmaccus_vx.h b/riscv/insns/vwmaccus_vx.h
new file mode 100644
index 0000000000..5310f0e9be
--- /dev/null
+++ b/riscv/insns/vwmaccus_vx.h
@@ -0,0 +1,6 @@
+// vwmaccus.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN_MIX(vs2, rs1, vd_w, *, +, int, int, uint);
+})
diff --git a/riscv/insns/vwmul_vv.h b/riscv/insns/vwmul_vv.h
new file mode 100644
index 0000000000..2197edbfbf
--- /dev/null
+++ b/riscv/insns/vwmul_vv.h
@@ -0,0 +1,6 @@
+// vwmul.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, *, +, int);
+})
diff --git a/riscv/insns/vwmul_vx.h b/riscv/insns/vwmul_vx.h
new file mode 100644
index 0000000000..bc1422d400
--- /dev/null
+++ b/riscv/insns/vwmul_vx.h
@@ -0,0 +1,6 @@
+// vwmul.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, *, +, int);
+})
diff --git a/riscv/insns/vwmulsu_vv.h b/riscv/insns/vwmulsu_vv.h
new file mode 100644
index 0000000000..5f84721f6f
--- /dev/null
+++ b/riscv/insns/vwmulsu_vv.h
@@ -0,0 +1,16 @@
+// vwmulsu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+  switch(P.VU.vsew) {
+  case e8:
+    P.VU.elt<uint16_t>(rd_num, i, true) = (int16_t)(int8_t)vs2 * (int16_t)(uint8_t)vs1;
+    break;
+  case e16:
+    P.VU.elt<uint32_t>(rd_num, i, true) = (int32_t)(int16_t)vs2 * (int32_t)(uint16_t)vs1;
+    break;
+  default:
+    P.VU.elt<uint64_t>(rd_num, i, true) = (int64_t)(int32_t)vs2 * (int64_t)(uint32_t)vs1;
+    break;
+  }
+})
diff --git a/riscv/insns/vwmulsu_vx.h b/riscv/insns/vwmulsu_vx.h
new file mode 100644
index 0000000000..68d6d276ea
--- /dev/null
+++ b/riscv/insns/vwmulsu_vx.h
@@ -0,0 +1,16 @@
+// vwmulsu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  switch(P.VU.vsew) {
+  case e8:
+    P.VU.elt<uint16_t>(rd_num, i, true) = (int16_t)(int8_t)vs2 * (int16_t)(uint8_t)rs1;
+    break;
+  case e16:
+    P.VU.elt<uint32_t>(rd_num, i, true) = (int32_t)(int16_t)vs2 * (int32_t)(uint16_t)rs1;
+    break;
+  default:
+    P.VU.elt<uint64_t>(rd_num, i, true) = (int64_t)(int32_t)vs2 * (int64_t)(uint32_t)rs1;
+    break;
+  }
+})
diff --git a/riscv/insns/vwmulu_vv.h b/riscv/insns/vwmulu_vv.h
new file mode 100644
index 0000000000..8ddbb4b488
--- /dev/null
+++ b/riscv/insns/vwmulu_vv.h
@@ -0,0 +1,6 @@
+// vwmulu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, *, +, uint);
+})
diff --git a/riscv/insns/vwmulu_vx.h b/riscv/insns/vwmulu_vx.h
new file mode 100644
index 0000000000..1ce77eefdc
--- /dev/null
+++ b/riscv/insns/vwmulu_vx.h
@@ -0,0 +1,6 @@
+// vwmulu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, *, +, uint);
+})
diff --git a/riscv/insns/vwredsum_vs.h b/riscv/insns/vwredsum_vs.h
new file mode 100644
index 0000000000..c7a87db431
--- /dev/null
+++ b/riscv/insns/vwredsum_vs.h
@@ -0,0 +1,5 @@
+// vwredsum.vs vd, vs2, vs1
+VI_VV_LOOP_WIDE_REDUCTION
+({
+  vd_0_res += vs2;
+})
diff --git a/riscv/insns/vwredsumu_vs.h b/riscv/insns/vwredsumu_vs.h
new file mode 100644
index 0000000000..889a77d310
--- /dev/null
+++ b/riscv/insns/vwredsumu_vs.h
@@ -0,0 +1,5 @@
+// vwredsumu.vs vd, vs2, vs1
+VI_VV_ULOOP_WIDE_REDUCTION
+({
+  vd_0_res += vs2;
+})
diff --git a/riscv/insns/vwsub_vv.h b/riscv/insns/vwsub_vv.h
new file mode 100644
index 0000000000..99f9348985
--- /dev/null
+++ b/riscv/insns/vwsub_vv.h
@@ -0,0 +1,6 @@
+// vwsub.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, -, +, int);
+})
diff --git a/riscv/insns/vwsub_vx.h b/riscv/insns/vwsub_vx.h
new file mode 100644
index 0000000000..affdf62ce0
--- /dev/null
+++ b/riscv/insns/vwsub_vx.h
@@ -0,0 +1,6 @@
+// vwsub.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, -, +, int);
+})
diff --git a/riscv/insns/vwsub_wv.h b/riscv/insns/vwsub_wv.h
new file mode 100644
index 0000000000..10db7308e2
--- /dev/null
+++ b/riscv/insns/vwsub_wv.h
@@ -0,0 +1,6 @@
+// vwsub.wv vd, vs2, vs1
+VI_CHECK_DDS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_WVX_OP(vs1, -, int);
+})
diff --git a/riscv/insns/vwsub_wx.h b/riscv/insns/vwsub_wx.h
new file mode 100644
index 0000000000..f72341ba80
--- /dev/null
+++ b/riscv/insns/vwsub_wx.h
@@ -0,0 +1,6 @@
+// vwsub.wx vd, vs2, rs1
+VI_CHECK_DDS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_WVX_OP(rs1, -, int);
+})
diff --git a/riscv/insns/vwsubu_vv.h b/riscv/insns/vwsubu_vv.h
new file mode 100644
index 0000000000..cf68adb9fa
--- /dev/null
+++ b/riscv/insns/vwsubu_vv.h
@@ -0,0 +1,6 @@
+// vwsubu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, -, +, uint);
+})
diff --git a/riscv/insns/vwsubu_vx.h b/riscv/insns/vwsubu_vx.h
new file mode 100644
index 0000000000..3e972dd211
--- /dev/null
+++ b/riscv/insns/vwsubu_vx.h
@@ -0,0 +1,6 @@
+// vwsubu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, -, +, uint);
+})
diff --git a/riscv/insns/vwsubu_wv.h b/riscv/insns/vwsubu_wv.h
new file mode 100644
index 0000000000..3687c3d237
--- /dev/null
+++ b/riscv/insns/vwsubu_wv.h
@@ -0,0 +1,6 @@
+// vwsubu.wv vd, vs2, vs1
+VI_CHECK_DDS(true);
+VI_VV_LOOP_WIDEN
+({
+  VI_WIDE_WVX_OP(vs1, -, uint);
+})
diff --git a/riscv/insns/vwsubu_wx.h b/riscv/insns/vwsubu_wx.h
new file mode 100644
index 0000000000..c7f20edd79
--- /dev/null
+++ b/riscv/insns/vwsubu_wx.h
@@ -0,0 +1,6 @@
+// vwsubu.wx vd, vs2, rs1
+VI_CHECK_DDS(false);
+VI_VX_LOOP_WIDEN
+({
+  VI_WIDE_WVX_OP(rs1, -, uint);
+})
diff --git a/riscv/insns/vxor_vi.h b/riscv/insns/vxor_vi.h
new file mode 100644
index 0000000000..b2dcf946dc
--- /dev/null
+++ b/riscv/insns/vxor_vi.h
@@ -0,0 +1,5 @@
+// vxor.vi vd, vs2, simm5
+VI_VI_LOOP
+({
+  vd = simm5 ^ vs2;
+})
diff --git a/riscv/insns/vxor_vv.h b/riscv/insns/vxor_vv.h
new file mode 100644
index 0000000000..c37b6ab729
--- /dev/null
+++ b/riscv/insns/vxor_vv.h
@@ -0,0 +1,5 @@
+// vxor.vv vd, vs2, vs1
+VI_VV_LOOP
+({
+  vd = vs1 ^ vs2;
+})
diff --git a/riscv/insns/vxor_vx.h b/riscv/insns/vxor_vx.h
new file mode 100644
index 0000000000..8021e0e851
--- /dev/null
+++ b/riscv/insns/vxor_vx.h
@@ -0,0 +1,5 @@
+// vxor.vx vd, vs2, rs1
+VI_VX_LOOP
+({
+  vd = rs1 ^ vs2;
+})
diff --git a/riscv/insns/vzext_vf2.h b/riscv/insns/vzext_vf2.h
new file mode 100644
index 0000000000..100f2e359a
--- /dev/null
+++ b/riscv/insns/vzext_vf2.h
@@ -0,0 +1 @@
+VI_VV_EXT(2, uint);
diff --git a/riscv/insns/vzext_vf4.h b/riscv/insns/vzext_vf4.h
new file mode 100644
index 0000000000..6ff920e0bc
--- /dev/null
+++ b/riscv/insns/vzext_vf4.h
@@ -0,0 +1 @@
+VI_VV_EXT(4, uint);
diff --git a/riscv/insns/vzext_vf8.h b/riscv/insns/vzext_vf8.h
new file mode 100644
index 0000000000..b1762fbf67
--- /dev/null
+++ b/riscv/insns/vzext_vf8.h
@@ -0,0 +1 @@
+VI_VV_EXT(8, uint);
diff --git a/riscv/insns/wfi.h b/riscv/insns/wfi.h
index 6504b78c60..59ed35bb6f 100644
--- a/riscv/insns/wfi.h
+++ b/riscv/insns/wfi.h
@@ -1,2 +1,11 @@
-require_privilege(get_field(STATE.mstatus, MSTATUS_TW) ? PRV_M : PRV_S);
+if (STATE.v && STATE.prv == PRV_U) { // VU-mode: virtualized and at user privilege
+  require_novirt();
+} else if (get_field(STATE.mstatus, MSTATUS_TW)) { // mstatus.TW set: only M-mode may execute WFI
+  require_privilege(PRV_M);
+} else if (STATE.v) { // VS-mode
+  if (get_field(STATE.hstatus, HSTATUS_VTW)) // hstatus.VTW set: WFI is refused while virtualized
+    require_novirt();
+} else { // neither virtualized nor TW: fall back to the S-mode privilege check
+  require_privilege(PRV_S);
+}
 wfi();
diff --git a/riscv/interactive.cc b/riscv/interactive.cc
index c96c71ace7..00e505d896 100644
--- a/riscv/interactive.cc
+++ b/riscv/interactive.cc
@@ -66,8 +66,10 @@ void sim_t::interactive()
   funcs["run"] = &sim_t::interactive_run_noisy;
   funcs["r"] = funcs["run"];
   funcs["rs"] = &sim_t::interactive_run_silent;
+  funcs["vreg"] = &sim_t::interactive_vreg;
   funcs["reg"] = &sim_t::interactive_reg;
   funcs["freg"] = &sim_t::interactive_freg;
+  funcs["fregh"] = &sim_t::interactive_fregh;
   funcs["fregs"] = &sim_t::interactive_fregs;
   funcs["fregd"] = &sim_t::interactive_fregd;
   funcs["pc"] = &sim_t::interactive_pc;
@@ -107,7 +109,7 @@ void sim_t::interactive()
       else
         fprintf(stderr, "Unknown command %s\n", cmd.c_str());
     }
-    catch(trap_t t) {}
+    catch(trap_t& t) {}
   }
   ctrlc_pressed = false;
 }
@@ -117,8 +119,10 @@ void sim_t::interactive_help(const std::string& cmd, const std::vector<std::stri
   std::cerr <<
     "Interactive commands:\n"
     "reg <core> [reg]                # Display [reg] (all if omitted) in <core>\n"
+    "fregh <core> <reg>              # Display half precision <reg> in <core>\n"
     "fregs <core> <reg>              # Display single precision <reg> in <core>\n"
     "fregd <core> <reg>              # Display double precision <reg> in <core>\n"
+    "vreg <core> [reg]               # Display vector [reg] (all if omitted) in <core>\n"
     "pc <core>                       # Show current PC in <core>\n"
     "mem <hex addr>                  # Show contents of physical memory\n"
     "str <hex addr>                  # Show NUL-terminated C string\n"
@@ -218,6 +222,54 @@ freg_t sim_t::get_freg(const std::vector<std::string>& args)
   return p->get_state()->FPR[r];
 }
 
+// "vreg <core> [reg]": dump vector register [reg] of <core> to stderr as
+// ELEN-wide hex elements; an omitted or out-of-range [reg] dumps all of them.
+void sim_t::interactive_vreg(const std::string& cmd, const std::vector<std::string>& args)
+{
+  if (args.empty()) { // <core> is mandatory: args[0] would be out of bounds
+    fprintf(stderr, "vreg: missing <core> argument\n");
+    return;
+  }
+  int rstart = 0;
+  int rend = NVPR;
+  if (args.size() >= 2) {
+    rstart = strtol(args[1].c_str(), NULL, 0);
+    if (!(rstart >= 0 && rstart < NVPR)) {
+      rstart = 0;
+    } else {
+      rend = rstart + 1;
+    }
+  }
+
+  processor_t *p = get_core(args[0]);
+  const int vlen = (int)(p->VU.get_vlen()) >> 3; // bytes per vector register
+  const int elen = (int)(p->VU.get_elen()) >> 3; // bytes per printed element
+  const int num_elem = vlen/elen;
+  fprintf(stderr, "VLEN=%d bits; ELEN=%d bits\n", vlen << 3, elen << 3);
+
+  for (int r = rstart; r < rend; ++r) {
+    fprintf(stderr, "%-4s: ", vr_name[r]);
+    for (int e = num_elem-1; e >= 0; --e){ // highest element index first
+      switch(elen){ // hex digits are zero-padded to the element width
+        case 8:
+          fprintf(stderr, "[%d]: 0x%016" PRIx64 "  ", e, (uint64_t)p->VU.elt<uint64_t>(r, e));
+          break;
+        case 4:
+          fprintf(stderr, "[%d]: 0x%08" PRIx32 "  ", e, (uint32_t)p->VU.elt<uint32_t>(r, e));
+          break;
+        case 2:
+          fprintf(stderr, "[%d]: 0x%04" PRIx16 "  ", e, (uint16_t)p->VU.elt<uint16_t>(r, e));
+          break;
+        case 1:
+          fprintf(stderr, "[%d]: 0x%02" PRIx8 "  ", e, (uint8_t)p->VU.elt<uint8_t>(r, e));
+          break;
+      }
+    }
+    fprintf(stderr, "\n");
+  }
+}
+
+
 void sim_t::interactive_reg(const std::string& cmd, const std::vector<std::string>& args)
 {
   if (args.size() == 1) {
@@ -246,6 +298,13 @@ void sim_t::interactive_freg(const std::string& cmd, const std::vector<std::stri
   fprintf(stderr, "0x%016" PRIx64 "%016" PRIx64 "\n", r.v[1], r.v[0]);
 }
 
+void sim_t::interactive_fregh(const std::string& cmd, const std::vector<std::string>& args)
+{ // "fregh <core> <reg>": print <reg> of <core> interpreted as a half-precision value
+  fpr f;
+  f.r = freg(f16_to_f32(f16(get_freg(args)))); // convert the f16 view to f32 so it can be read back through f.s
+  fprintf(stderr, "%g\n", isBoxedF32(f.r) ? (double)f.s : NAN); // prints NAN when isBoxedF32 rejects the converted value
+}
+
 void sim_t::interactive_fregs(const std::string& cmd, const std::vector<std::string>& args)
 {
   fpr f;
@@ -361,7 +420,7 @@ void sim_t::interactive_until(const std::string& cmd, const std::vector<std::str
       if (ctrlc_pressed)
         break;
     }
-    catch (trap_t t) {}
+    catch (trap_t& t) {}
 
     set_procs_debug(noisy);
     step(1);
diff --git a/riscv/jtag_dtm.cc b/riscv/jtag_dtm.cc
index 365528a49a..1baeca89be 100644
--- a/riscv/jtag_dtm.cc
+++ b/riscv/jtag_dtm.cc
@@ -20,13 +20,14 @@ enum {
 
 #define DTMCONTROL_VERSION      0xf
 #define DTMCONTROL_ABITS        (0x3f << 4)
-#define DTMCONTROL_DBUSSTAT     (3<<10)
+#define DTMCONTROL_DMISTAT      (3<<10)
 #define DTMCONTROL_IDLE         (7<<12)
-#define DTMCONTROL_DBUSRESET    (1<<16)
+#define DTMCONTROL_DMIRESET     (1<<16)
+#define DTMCONTROL_DMIHARDRESET (1<<17)
 
 #define DMI_OP                 3
-#define DMI_DATA               (0xffffffffL<<2)
-#define DMI_ADDRESS            ((1L<<(abits+34)) - (1L<<34))
+#define DMI_DATA               (0xffffffffLL<<2)
+#define DMI_ADDRESS            ((1LL<<(abits+34)) - (1LL<<34))
 
 #define DMI_OP_STATUS_SUCCESS	0
 #define DMI_OP_STATUS_RESERVED	1
@@ -38,8 +39,8 @@ enum {
 #define DMI_OP_WRITE	        2
 #define DMI_OP_RESERVED	        3
 
-jtag_dtm_t::jtag_dtm_t(debug_module_t *dm) :
-  dm(dm),
+jtag_dtm_t::jtag_dtm_t(debug_module_t *dm, unsigned required_rti_cycles) :
+  dm(dm), required_rti_cycles(required_rti_cycles),
   _tck(false), _tms(false), _tdi(false), _tdo(false),
   dtmcontrol((abits << DTM_DTMCS_ABITS_OFFSET) | 1),
   dmi(DMI_OP_STATUS_SUCCESS << DTM_DMI_OP_OFFSET),
@@ -49,6 +50,9 @@ jtag_dtm_t::jtag_dtm_t(debug_module_t *dm) :
 
 void jtag_dtm_t::reset() {
   _state = TEST_LOGIC_RESET;
+  busy_stuck = false;
+  rti_remaining = 0;
+  dmi = 0;
 }
 
 void jtag_dtm_t::set_pins(bool tck, bool tms, bool tdi) {
@@ -88,6 +92,11 @@ void jtag_dtm_t::set_pins(bool tck, bool tms, bool tdi) {
     }
     _state = next[_state][_tms];
     switch (_state) {
+      case RUN_TEST_IDLE:
+        if (rti_remaining > 0)
+          rti_remaining--;
+        dm->run_test_idle();
+        break;
       case TEST_LOGIC_RESET:
         ir = IR_IDCODE;
         break;
@@ -136,7 +145,12 @@ void jtag_dtm_t::capture_dr()
       dr_length = 32;
       break;
     case IR_DBUS:
-      dr = dmi;
+      if (rti_remaining > 0 || busy_stuck) {
+        dr = DMI_OP_STATUS_BUSY;
+        busy_stuck = true;
+      } else {
+        dr = dmi;
+      }
       dr_length = abits + 34;
       break;
     default:
@@ -151,34 +165,37 @@ void jtag_dtm_t::update_dr()
 {
   D(fprintf(stderr, "Update DR; IR=0x%x, DR=0x%lx (%d bits)\n",
         ir, dr, dr_length));
-  switch (ir) {
-    case IR_DBUS:
-      {
-        unsigned op = get_field(dr, DMI_OP);
-        uint32_t data = get_field(dr, DMI_DATA);
-        unsigned address = get_field(dr, DMI_ADDRESS);
-
-        dmi = dr;
-
-        bool success = true;
-        if (op == DMI_OP_READ) {
-          uint32_t value;
-          if (dm->dmi_read(address, &value)) {
-            dmi = set_field(dmi, DMI_DATA, value);
-          } else {
-            success = false;
-          }
-        } else if (op == DMI_OP_WRITE) {
-          success = dm->dmi_write(address, data);
-        }
-
-        if (success) {
-          dmi = set_field(dmi, DMI_OP, DMI_OP_STATUS_SUCCESS);
-        } else {
-          dmi = set_field(dmi, DMI_OP, DMI_OP_STATUS_FAILED);
-        }
-        D(fprintf(stderr, "dmi=0x%lx\n", dmi));
+  if (ir == IR_DTMCONTROL) {
+    if (dr & DTMCONTROL_DMIRESET)
+      busy_stuck = false;
+    if (dr & DTMCONTROL_DMIHARDRESET)
+      reset();
+  } else if (ir == IR_DBUS && !busy_stuck) {
+    unsigned op = get_field(dr, DMI_OP);
+    uint32_t data = get_field(dr, DMI_DATA);
+    unsigned address = get_field(dr, DMI_ADDRESS);
+
+    dmi = dr;
+
+    bool success = true;
+    if (op == DMI_OP_READ) {
+      uint32_t value;
+      if (dm->dmi_read(address, &value)) {
+        dmi = set_field(dmi, DMI_DATA, value);
+      } else {
+        success = false;
       }
-      break;
+    } else if (op == DMI_OP_WRITE) {
+      success = dm->dmi_write(address, data);
+    }
+
+    if (success) {
+      dmi = set_field(dmi, DMI_OP, DMI_OP_STATUS_SUCCESS);
+    } else {
+      dmi = set_field(dmi, DMI_OP, DMI_OP_STATUS_FAILED);
+    }
+    D(fprintf(stderr, "dmi=0x%lx\n", dmi));
+
+    rti_remaining = required_rti_cycles;
   }
 }
diff --git a/riscv/jtag_dtm.h b/riscv/jtag_dtm.h
index 063e3f40b3..3482b8a06b 100644
--- a/riscv/jtag_dtm.h
+++ b/riscv/jtag_dtm.h
@@ -29,7 +29,7 @@ class jtag_dtm_t
   static const unsigned idcode = 0xdeadbeef;
 
   public:
-    jtag_dtm_t(debug_module_t *dm);
+    jtag_dtm_t(debug_module_t *dm, unsigned required_rti_cycles);
     void reset();
 
     void set_pins(bool tck, bool tms, bool tdi);
@@ -40,6 +40,9 @@ class jtag_dtm_t
 
   private:
     debug_module_t *dm;
+    // The number of Run-Test/Idle cycles required before a DMI access is
+    // complete.
+    unsigned required_rti_cycles;
     bool _tck, _tms, _tdi, _tdo;
     uint32_t ir;
     const unsigned ir_length = 5;
@@ -51,6 +54,10 @@ class jtag_dtm_t
     const unsigned abits = 6;
     uint32_t dtmcontrol;
     uint64_t dmi;
+    // Number of Run-Test/Idle cycles needed before we call this access
+    // complete.
+    unsigned rti_remaining;
+    bool busy_stuck;
 
     jtag_state_t _state;
 
diff --git a/riscv/log_file.h b/riscv/log_file.h
new file mode 100644
index 0000000000..d039859dc6
--- /dev/null
+++ b/riscv/log_file.h
@@ -0,0 +1,46 @@
+// See LICENSE for license details.
+#ifndef _RISCV_LOGFILE_H
+#define _RISCV_LOGFILE_H
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+
+// Header-only class wrapping a log file. When constructed with an
+// actual path, it opens the named file for writing. When constructed
+// with the null path, it wraps stderr.
+class log_file_t
+{
+public:
+  // Opens `path` for writing (mode "w"), or wraps stderr when `path` is
+  // null. Throws std::runtime_error if the file cannot be opened.
+  log_file_t(const char *path)
+    : wrapped_file (nullptr, &fclose)
+  {
+    if (!path)
+      return;
+
+    wrapped_file.reset(fopen(path, "w"));
+    if (! wrapped_file) {
+      // Capture errno immediately: the stream insertions below may clobber it.
+      const int open_errno = errno;
+      std::ostringstream oss;
+      oss << "Failed to open log file at `" << path << "': "
+          << strerror (open_errno);
+      throw std::runtime_error(oss.str());
+    }
+  }
+
+  // Returns the wrapped stream: the opened file, or stderr if none was opened.
+  // The log_file_t keeps ownership; callers must not fclose() the result.
+  FILE *get() { return wrapped_file ? wrapped_file.get() : stderr; }
+
+private:
+  // Owns the opened file (released with fclose); stays null when wrapping stderr.
+  std::unique_ptr<FILE, decltype(&fclose)> wrapped_file;
+};
+
+#endif
diff --git a/riscv/mmio_plugin.h b/riscv/mmio_plugin.h
new file mode 100644
index 0000000000..f14470bf38
--- /dev/null
+++ b/riscv/mmio_plugin.h
@@ -0,0 +1,91 @@
+#ifndef _RISCV_MMIO_PLUGIN_H
+#define _RISCV_MMIO_PLUGIN_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+typedef uint64_t reg_t;
+
+typedef struct {
+  // Allocate user data for an instance of the plugin. The parameter is a simple
+  // c-string containing arguments used to construct the plugin. It returns a
+  // void* to the allocated data.
+  void* (*alloc)(const char*);
+
+  // Load a memory address of the MMIO plugin. The parameters are the user_data
+  // (void*), memory offset (reg_t), number of bytes to load (size_t), and the
+  // buffer into which the loaded data should be written (uint8_t*). Return true
+  // if the load is successful and false otherwise.
+  bool (*load)(void*, reg_t, size_t, uint8_t*);
+
+  // Store some bytes to a memory address of the MMIO plugin. The parameters are
+  // the user_data (void*), memory offset (reg_t), number of bytes to store
+  // (size_t), and the buffer containing the data to be stored (const uint8_t*).
+  // Return true if the store is successful and false otherwise.
+  bool (*store)(void*, reg_t, size_t, const uint8_t*);
+
+  // Deallocate the data allocated during the call to alloc. The parameter is a
+  // pointer to the user data allocated during the call to alloc.
+  void (*dealloc)(void*);
+} mmio_plugin_t;
+
+// Register an mmio plugin with the application. This should be called by
+// plugins as part of their loading process.
+extern void register_mmio_plugin(const char* name_cstr,
+                                 const mmio_plugin_t* mmio_plugin);
+
+#ifdef __cplusplus
+}
+
+#include <string>
+
+// Wrapper around the C plugin API that makes registering a C++ class with
+// correctly formed constructor, load, and store functions easier. The template
+// type should be the type that implements the MMIO plugin interface. Simply
+// make a global mmio_plugin_registration_t and your plugin should register
+// itself with the application when it is loaded because the
+// mmio_plugin_registration_t constructor will be called.
+template <typename T>
+struct mmio_plugin_registration_t
+{
+  static void* alloc(const char* args)
+  {
+    return static_cast<void*>(new T(std::string(args ? args : ""))); // null args => "" (std::string(nullptr) is undefined)
+  }
+
+  static bool load(void* self, reg_t addr, size_t len, uint8_t* bytes)
+  {
+    return static_cast<T*>(self)->load(addr, len, bytes); // self is the pointer alloc() returned
+  }
+
+  static bool store(void* self, reg_t addr, size_t len, const uint8_t* bytes)
+  {
+    return static_cast<T*>(self)->store(addr, len, bytes);
+  }
+
+  static void dealloc(void* self)
+  {
+    delete static_cast<T*>(self); // pairs with the new in alloc()
+  }
+
+  mmio_plugin_registration_t(const std::string& name)
+  {
+    static const mmio_plugin_t plugin = { // static: stays valid even if the registrar keeps the pointer
+      mmio_plugin_registration_t<T>::alloc,
+      mmio_plugin_registration_t<T>::load,
+      mmio_plugin_registration_t<T>::store,
+      mmio_plugin_registration_t<T>::dealloc,
+    };
+
+    register_mmio_plugin(name.c_str(), &plugin);
+  }
+};
+#endif // __cplusplus
+
+#endif
diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index 3e1fc25552..e8dca6a85d 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -37,39 +37,51 @@ void mmu_t::flush_tlb()
 static void throw_access_exception(reg_t addr, access_type type)
 {
   switch (type) {
-    case FETCH: throw trap_instruction_access_fault(addr);
-    case LOAD: throw trap_load_access_fault(addr);
-    case STORE: throw trap_store_access_fault(addr);
+    case FETCH: throw trap_instruction_access_fault(addr, 0, 0);
+    case LOAD: throw trap_load_access_fault(addr, 0, 0);
+    case STORE: throw trap_store_access_fault(addr, 0, 0);
     default: abort();
   }
 }
 
-reg_t mmu_t::translate(reg_t addr, reg_t len, access_type type)
+reg_t mmu_t::translate(reg_t addr, reg_t len, access_type type, uint32_t xlate_flags)
 {
   if (!proc)
     return addr;
 
+  bool mxr = get_field(proc->state.mstatus, MSTATUS_MXR);
+  bool virt = proc->state.v;
   reg_t mode = proc->state.prv;
   if (type != FETCH) {
-    if (!proc->state.dcsr.cause && get_field(proc->state.mstatus, MSTATUS_MPRV))
+    if (!proc->state.debug_mode && get_field(proc->state.mstatus, MSTATUS_MPRV)) {
       mode = get_field(proc->state.mstatus, MSTATUS_MPP);
+      if (get_field(proc->state.mstatus, MSTATUS_MPV))
+        virt = true;
+    }
+    if (!proc->state.debug_mode && (xlate_flags & RISCV_XLATE_VIRT)) {
+      virt = true;
+      mode = get_field(proc->state.hstatus, HSTATUS_SPVP);
+      if (type == LOAD && (xlate_flags & RISCV_XLATE_VIRT_MXR)) {
+        mxr = true;
+      }
+    }
   }
 
-  reg_t paddr = walk(addr, type, mode) | (addr & (PGSIZE-1));
-  if (!pmp_ok(paddr, type, mode) || !pmp_homogeneous(paddr, len))
+  reg_t paddr = walk(addr, type, mode, virt, mxr) | (addr & (PGSIZE-1));
+  if (!pmp_ok(paddr, len, type, mode))
     throw_access_exception(addr, type);
   return paddr;
 }
 
 tlb_entry_t mmu_t::fetch_slow_path(reg_t vaddr)
 {
-  reg_t paddr = translate(vaddr, sizeof(fetch_temp), FETCH);
+  reg_t paddr = translate(vaddr, sizeof(fetch_temp), FETCH, 0);
 
   if (auto host_addr = sim->addr_to_mem(paddr)) {
     return refill_tlb(vaddr, paddr, host_addr, FETCH);
   } else {
-    if (!sim->mmio_load(paddr, sizeof fetch_temp, (uint8_t*)&fetch_temp))
-      throw trap_instruction_access_fault(vaddr);
+    if (!mmio_load(paddr, sizeof fetch_temp, (uint8_t*)&fetch_temp))
+      throw trap_instruction_access_fault(vaddr, 0, 0);
     tlb_entry_t entry = {(char*)&fetch_temp - vaddr, paddr - vaddr};
     return entry;
   }
@@ -101,9 +113,34 @@ reg_t reg_from_bytes(size_t len, const uint8_t* bytes)
   abort();
 }
 
-void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes)
+bool mmu_t::mmio_ok(reg_t addr, access_type type)
+{
+  // The debug region is reachable only while an attached hart is in debug mode.
+  const bool in_debug_region = DEBUG_START <= addr && addr <= DEBUG_END;
+  const bool hart_outside_debug = proc && !proc->state.debug_mode;
+
+  return !(in_debug_region && hart_outside_debug);
+}
+
+bool mmu_t::mmio_load(reg_t addr, size_t len, uint8_t* bytes)
+{
+  // Loads that mmio_ok() rejects never reach the simulator's devices.
+  const bool permitted = mmio_ok(addr, LOAD);
+
+  return permitted && sim->mmio_load(addr, len, bytes);
+}
+
+bool mmu_t::mmio_store(reg_t addr, size_t len, const uint8_t* bytes)
+{
+  // Stores that mmio_ok() rejects never reach the simulator's devices.
+  const bool permitted = mmio_ok(addr, STORE);
+
+  return permitted && sim->mmio_store(addr, len, bytes);
+}
+
+void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, uint32_t xlate_flags)
 {
-  reg_t paddr = translate(addr, len, LOAD);
+  reg_t paddr = translate(addr, len, LOAD, xlate_flags);
 
   if (auto host_addr = sim->addr_to_mem(paddr)) {
     memcpy(bytes, host_addr, len);
@@ -111,8 +148,8 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes)
       tracer.trace(paddr, len, LOAD);
     else
       refill_tlb(addr, paddr, host_addr, LOAD);
-  } else if (!sim->mmio_load(paddr, len, bytes)) {
-    throw trap_load_access_fault(addr);
+  } else if (!mmio_load(paddr, len, bytes)) {
+    throw trap_load_access_fault(addr, 0, 0);
   }
 
   if (!matched_trigger) {
@@ -123,9 +160,9 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes)
   }
 }
 
-void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes)
+void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, uint32_t xlate_flags)
 {
-  reg_t paddr = translate(addr, len, STORE);
+  reg_t paddr = translate(addr, len, STORE, xlate_flags);
 
   if (!matched_trigger) {
     reg_t data = reg_from_bytes(len, bytes);
@@ -140,8 +177,8 @@ void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes)
       tracer.trace(paddr, len, STORE);
     else
       refill_tlb(addr, paddr, host_addr, STORE);
-  } else if (!sim->mmio_store(paddr, len, bytes)) {
-    throw trap_store_access_fault(addr);
+  } else if (!mmio_store(paddr, len, bytes)) {
+    throw trap_store_access_fault(addr, 0, 0);
   }
 }
 
@@ -173,26 +210,40 @@ tlb_entry_t mmu_t::refill_tlb(reg_t vaddr, reg_t paddr, char* host_addr, access_
   return entry;
 }
 
-reg_t mmu_t::pmp_ok(reg_t addr, access_type type, reg_t mode)
+reg_t mmu_t::pmp_ok(reg_t addr, reg_t len, access_type type, reg_t mode)
 {
-  if (!proc)
+  if (!proc || proc->n_pmp == 0)
     return true;
 
   reg_t base = 0;
-  for (size_t i = 0; i < proc->state.n_pmp; i++) {
-    reg_t tor = proc->state.pmpaddr[i] << PMP_SHIFT;
+  for (size_t i = 0; i < proc->n_pmp; i++) {
+    reg_t tor = (proc->state.pmpaddr[i] & proc->pmp_tor_mask()) << PMP_SHIFT;
     uint8_t cfg = proc->state.pmpcfg[i];
 
     if (cfg & PMP_A) {
       bool is_tor = (cfg & PMP_A) == PMP_TOR;
       bool is_na4 = (cfg & PMP_A) == PMP_NA4;
 
-      reg_t mask = (proc->state.pmpaddr[i] << 1) | (!is_na4);
+      reg_t mask = (proc->state.pmpaddr[i] << 1) | (!is_na4) | ~proc->pmp_tor_mask();
       mask = ~(mask & ~(mask + 1)) << PMP_SHIFT;
-      bool napot_match = ((addr ^ tor) & mask) == 0;
-      bool tor_match = base <= addr && addr < tor;
 
-      if (is_tor ? tor_match : napot_match) {
+      // Check each 4-byte sector of the access
+      bool any_match = false;
+      bool all_match = true;
+      for (reg_t offset = 0; offset < len; offset += 1 << PMP_SHIFT) {
+        reg_t cur_addr = addr + offset;
+        bool napot_match = ((cur_addr ^ tor) & mask) == 0;
+        bool tor_match = base <= cur_addr && cur_addr < tor;
+        bool match = is_tor ? tor_match : napot_match;
+        any_match |= match;
+        all_match &= match;
+      }
+
+      if (any_match) {
+        // If the PMP matches only a strict subset of the access, fail it
+        if (!all_match)
+          return false;
+
         return
           (mode == PRV_M && !(cfg & PMP_L)) ||
           (type == LOAD && (cfg & PMP_R)) ||
@@ -216,8 +267,8 @@ reg_t mmu_t::pmp_homogeneous(reg_t addr, reg_t len)
     return true;
 
   reg_t base = 0;
-  for (size_t i = 0; i < proc->state.n_pmp; i++) {
-    reg_t tor = proc->state.pmpaddr[i] << PMP_SHIFT;
+  for (size_t i = 0; i < proc->n_pmp; i++) {
+    reg_t tor = (proc->state.pmpaddr[i] & proc->pmp_tor_mask()) << PMP_SHIFT;
     uint8_t cfg = proc->state.pmpcfg[i];
 
     if (cfg & PMP_A) {
@@ -231,7 +282,7 @@ reg_t mmu_t::pmp_homogeneous(reg_t addr, reg_t len)
       bool tor_homogeneous = ends_before_lower || begins_after_upper ||
         (begins_after_lower && ends_before_upper);
 
-      reg_t mask = (proc->state.pmpaddr[i] << 1) | (!is_na4);
+      reg_t mask = (proc->state.pmpaddr[i] << 1) | (!is_na4) | ~proc->pmp_tor_mask();
       mask = ~(mask & ~(mask + 1)) << PMP_SHIFT;
       bool mask_homogeneous = ~(mask << 1) & len;
       bool napot_homogeneous = mask_homogeneous || ((addr ^ tor) / len) != 0;
@@ -246,15 +297,82 @@ reg_t mmu_t::pmp_homogeneous(reg_t addr, reg_t len)
   return true;
 }
 
-reg_t mmu_t::walk(reg_t addr, access_type type, reg_t mode)
+reg_t mmu_t::s2xlate(reg_t gva, reg_t gpa, access_type type, bool virt, bool mxr)
+{ // Translates gpa through the page table described by hgatp; gva is only reported in raised traps.
+  if (!virt) // not a virtualized access: gpa is returned untouched
+    return gpa;
+
+  vm_info vm = decode_vm_info(proc->max_xlen, true, 0, proc->get_state()->hgatp);
+  if (vm.levels == 0) // hgatp decodes to zero levels: gpa passes through unchanged
+    return gpa;
+
+  reg_t base = vm.ptbase;
+  for (int i = vm.levels - 1; i >= 0; i--) { // walk from the root level down to level 0
+    int ptshift = i * vm.idxbits;
+    int idxbits = (i == (vm.levels - 1)) ? vm.idxbits + vm.widenbits : vm.idxbits; // root index is vm.widenbits wider
+    reg_t idx = (gpa >> (PGSHIFT + ptshift)) & ((reg_t(1) << idxbits) - 1);
+
+    // check that physical address of PTE is legal
+    auto pte_paddr = base + idx * vm.ptesize;
+    auto ppte = sim->addr_to_mem(pte_paddr);
+    if (!ppte || !pmp_ok(pte_paddr, vm.ptesize, LOAD, PRV_S)) { // PTE not in simulated memory, or PMP denies the read
+      throw_access_exception(gva, type);
+    }
+
+    reg_t pte = vm.ptesize == 4 ? from_le(*(uint32_t*)ppte) : from_le(*(uint64_t*)ppte);
+    reg_t ppn = pte >> PTE_PPN_SHIFT;
+
+    if (PTE_TABLE(pte)) { // next level of page table
+      base = ppn << PGSHIFT;
+    } else if (!(pte & PTE_V) || (!(pte & PTE_R) && (pte & PTE_W))) { // not valid, or writable without readable
+      break;
+    } else if (!(pte & PTE_U)) { // leaf without the U bit is rejected
+      break;
+    } else if (type == FETCH ? !(pte & PTE_X) :
+               type == LOAD ?  !(pte & PTE_R) && !(mxr && (pte & PTE_X)) :
+                               !((pte & PTE_R) && (pte & PTE_W))) { // permission bits do not allow this access type
+      break;
+    } else if ((ppn & ((reg_t(1) << ptshift) - 1)) != 0) { // superpage PPN has non-zero low bits
+      break;
+    } else {
+      reg_t ad = PTE_A | ((type == STORE) * PTE_D); // A always; D as well for stores
+#ifdef RISCV_ENABLE_DIRTY
+      // set accessed and possibly dirty bits.
+      if ((pte & ad) != ad) {
+        if (!pmp_ok(pte_paddr, vm.ptesize, STORE, PRV_S))
+          throw_access_exception(gva, type);
+        *(uint32_t*)ppte |= to_le((uint32_t)ad); // updated through a 32-bit access at the start of the PTE
+      }
+#else
+      // take exception if access or possibly dirty bit is not set.
+      if ((pte & ad) != ad)
+        break;
+#endif
+      reg_t vpn = gpa >> PGSHIFT;
+      reg_t page_mask = (reg_t(1) << PGSHIFT) - 1;
+      reg_t page_base = (ppn | (vpn & ((reg_t(1) << ptshift) - 1))) << PGSHIFT; // superpage: low VPN bits come from gpa
+      return page_base | (gpa & page_mask);
+    }
+  }
+
+  switch (type) { // walk failed: raise the guest page fault for this access type, passing gva and gpa >> 2
+    case FETCH: throw trap_instruction_guest_page_fault(gva, gpa >> 2, 0);
+    case LOAD: throw trap_load_guest_page_fault(gva, gpa >> 2, 0);
+    case STORE: throw trap_store_guest_page_fault(gva, gpa >> 2, 0);
+    default: abort();
+  }
+}
+
+reg_t mmu_t::walk(reg_t addr, access_type type, reg_t mode, bool virt, bool mxr)
 {
-  vm_info vm = decode_vm_info(proc->max_xlen, mode, proc->get_state()->satp);
+  reg_t page_mask = (reg_t(1) << PGSHIFT) - 1;
+  reg_t satp = (virt) ? proc->get_state()->vsatp : proc->get_state()->satp;
+  vm_info vm = decode_vm_info(proc->max_xlen, false, mode, satp);
   if (vm.levels == 0)
-    return addr & ((reg_t(2) << (proc->xlen-1))-1); // zero-extend from xlen
+    return s2xlate(addr, addr & ((reg_t(2) << (proc->xlen-1))-1), type, virt, mxr) & ~page_mask; // zero-extend from xlen
 
   bool s_mode = mode == PRV_S;
   bool sum = get_field(proc->state.mstatus, MSTATUS_SUM);
-  bool mxr = get_field(proc->state.mstatus, MSTATUS_MXR);
 
   // verify bits xlen-1:va_bits-1 are all equal
   int va_bits = PGSHIFT + vm.levels * vm.idxbits;
@@ -269,12 +387,12 @@ reg_t mmu_t::walk(reg_t addr, access_type type, reg_t mode)
     reg_t idx = (addr >> (PGSHIFT + ptshift)) & ((1 << vm.idxbits) - 1);
 
     // check that physical address of PTE is legal
-    auto pte_paddr = base + idx * vm.ptesize;
+    auto pte_paddr = s2xlate(addr, base + idx * vm.ptesize, LOAD, virt, false);
     auto ppte = sim->addr_to_mem(pte_paddr);
-    if (!ppte || !pmp_ok(pte_paddr, LOAD, PRV_S))
+    if (!ppte || !pmp_ok(pte_paddr, vm.ptesize, LOAD, PRV_S))
       throw_access_exception(addr, type);
 
-    reg_t pte = vm.ptesize == 4 ? *(uint32_t*)ppte : *(uint64_t*)ppte;
+    reg_t pte = vm.ptesize == 4 ? from_le(*(uint32_t*)ppte) : from_le(*(uint64_t*)ppte);
     reg_t ppn = pte >> PTE_PPN_SHIFT;
 
     if (PTE_TABLE(pte)) { // next level of page table
@@ -294,9 +412,9 @@ reg_t mmu_t::walk(reg_t addr, access_type type, reg_t mode)
 #ifdef RISCV_ENABLE_DIRTY
       // set accessed and possibly dirty bits.
       if ((pte & ad) != ad) {
-        if (!pmp_ok(pte_paddr, STORE, PRV_S))
+        if (!pmp_ok(pte_paddr, vm.ptesize, STORE, PRV_S))
           throw_access_exception(addr, type);
-        *(uint32_t*)ppte |= ad;
+        *(uint32_t*)ppte |= to_le((uint32_t)ad);
       }
 #else
       // take exception if access or possibly dirty bit is not set.
@@ -305,15 +423,16 @@ reg_t mmu_t::walk(reg_t addr, access_type type, reg_t mode)
 #endif
       // for superpage mappings, make a fake leaf PTE for the TLB's benefit.
       reg_t vpn = addr >> PGSHIFT;
-      reg_t value = (ppn | (vpn & ((reg_t(1) << ptshift) - 1))) << PGSHIFT;
-      return value;
+      reg_t page_base = (ppn | (vpn & ((reg_t(1) << ptshift) - 1))) << PGSHIFT;
+      reg_t phys = page_base | (addr & page_mask);
+      return s2xlate(addr, phys, type, virt, mxr) & ~page_mask;
     }
   }
 
   switch (type) {
-    case FETCH: throw trap_instruction_page_fault(addr);
-    case LOAD: throw trap_load_page_fault(addr);
-    case STORE: throw trap_store_page_fault(addr);
+    case FETCH: throw trap_instruction_page_fault(addr, 0, 0);
+    case LOAD: throw trap_load_page_fault(addr, 0, 0);
+    case STORE: throw trap_store_page_fault(addr, 0, 0);
     default: abort();
   }
 }
diff --git a/riscv/mmu.h b/riscv/mmu.h
index 7617367436..990f137287 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -10,6 +10,7 @@
 #include "simif.h"
 #include "processor.h"
 #include "memtracer.h"
+#include "byteorder.h"
 #include <stdlib.h>
 #include <vector>
 
@@ -17,6 +18,7 @@
 #define PGSHIFT 12
 const reg_t PGSIZE = 1 << PGSHIFT;
 const reg_t PGMASK = ~(PGSIZE-1);
+#define MAX_PADDR_BITS 56 // imposed by Sv39 / Sv48
 
 struct insn_fetch_t
 {
@@ -64,7 +66,7 @@ class mmu_t
       res += (reg_t)load_uint8(addr + i) << (i * 8);
     return res;
 #else
-    throw trap_load_address_misaligned(addr);
+    throw trap_load_address_misaligned(addr, 0, 0);
 #endif
   }
 
@@ -74,80 +76,131 @@ class mmu_t
     for (size_t i = 0; i < size; i++)
       store_uint8(addr + i, data >> (i * 8));
 #else
-    throw trap_store_address_misaligned(addr);
+    throw trap_store_address_misaligned(addr, 0, 0);
 #endif
   }
 
+#ifndef RISCV_ENABLE_COMMITLOG
+# define READ_MEM(addr, size) ({})
+#else
+# define READ_MEM(addr, size) \
+  proc->state.log_mem_read.push_back(std::make_tuple(addr, 0, size));
+#endif
+
+#define RISCV_XLATE_VIRT (1U << 0)
+#define RISCV_XLATE_VIRT_MXR (1U << 1)
+
   // template for functions that load an aligned value from memory
-  #define load_func(type) \
-    inline type##_t load_##type(reg_t addr) { \
+  #define load_func(type, prefix, xlate_flags) \
+    inline type##_t prefix##_##type(reg_t addr) { \
+      if (xlate_flags) \
+        flush_tlb(); \
       if (unlikely(addr & (sizeof(type##_t)-1))) \
         return misaligned_load(addr, sizeof(type##_t)); \
       reg_t vpn = addr >> PGSHIFT; \
-      if (likely(tlb_load_tag[vpn % TLB_ENTRIES] == vpn)) \
-        return *(type##_t*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr); \
+      size_t size = sizeof(type##_t); \
+      if (likely(tlb_load_tag[vpn % TLB_ENTRIES] == vpn)) { \
+        if (proc) READ_MEM(addr, size); \
+        return from_le(*(type##_t*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr)); \
+      } \
       if (unlikely(tlb_load_tag[vpn % TLB_ENTRIES] == (vpn | TLB_CHECK_TRIGGERS))) { \
-        type##_t data = *(type##_t*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr); \
+        type##_t data = from_le(*(type##_t*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr)); \
         if (!matched_trigger) { \
           matched_trigger = trigger_exception(OPERATION_LOAD, addr, data); \
           if (matched_trigger) \
             throw *matched_trigger; \
         } \
+        if (proc) READ_MEM(addr, size); \
         return data; \
       } \
       type##_t res; \
-      load_slow_path(addr, sizeof(type##_t), (uint8_t*)&res); \
-      return res; \
+      load_slow_path(addr, sizeof(type##_t), (uint8_t*)&res, (xlate_flags)); \
+      if (proc) READ_MEM(addr, size); \
+      if (xlate_flags) \
+        flush_tlb(); \
+      return from_le(res); \
     }
 
   // load value from memory at aligned address; zero extend to register width
-  load_func(uint8)
-  load_func(uint16)
-  load_func(uint32)
-  load_func(uint64)
+  load_func(uint8, load, 0)
+  load_func(uint16, load, 0)
+  load_func(uint32, load, 0)
+  load_func(uint64, load, 0)
+
+  // load value from guest memory at aligned address; zero extend to register width
+  load_func(uint8, guest_load, RISCV_XLATE_VIRT)
+  load_func(uint16, guest_load, RISCV_XLATE_VIRT)
+  load_func(uint32, guest_load, RISCV_XLATE_VIRT)
+  load_func(uint64, guest_load, RISCV_XLATE_VIRT)
+  load_func(uint16, guest_load_x, RISCV_XLATE_VIRT|RISCV_XLATE_VIRT_MXR)
+  load_func(uint32, guest_load_x, RISCV_XLATE_VIRT|RISCV_XLATE_VIRT_MXR)
 
   // load value from memory at aligned address; sign extend to register width
-  load_func(int8)
-  load_func(int16)
-  load_func(int32)
-  load_func(int64)
+  load_func(int8, load, 0)
+  load_func(int16, load, 0)
+  load_func(int32, load, 0)
+  load_func(int64, load, 0)
+
+  // load value from guest memory at aligned address; sign extend to register width
+  load_func(int8, guest_load, RISCV_XLATE_VIRT)
+  load_func(int16, guest_load, RISCV_XLATE_VIRT)
+  load_func(int32, guest_load, RISCV_XLATE_VIRT)
+  load_func(int64, guest_load, RISCV_XLATE_VIRT)
+
+#ifndef RISCV_ENABLE_COMMITLOG
+# define WRITE_MEM(addr, value, size) ({})
+#else
+# define WRITE_MEM(addr, val, size) \
+  proc->state.log_mem_write.push_back(std::make_tuple(addr, val, size));
+#endif
 
   // template for functions that store an aligned value to memory
-  #define store_func(type) \
-    void store_##type(reg_t addr, type##_t val) { \
+  #define store_func(type, prefix, xlate_flags) \
+    void prefix##_##type(reg_t addr, type##_t val) { \
+      if (xlate_flags) \
+        flush_tlb(); \
       if (unlikely(addr & (sizeof(type##_t)-1))) \
         return misaligned_store(addr, val, sizeof(type##_t)); \
       reg_t vpn = addr >> PGSHIFT; \
-      if (likely(tlb_store_tag[vpn % TLB_ENTRIES] == vpn)) \
-        *(type##_t*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr) = val; \
+      size_t size = sizeof(type##_t); \
+      if (likely(tlb_store_tag[vpn % TLB_ENTRIES] == vpn)) { \
+        if (proc) WRITE_MEM(addr, val, size); \
+        *(type##_t*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr) = to_le(val); \
+      } \
       else if (unlikely(tlb_store_tag[vpn % TLB_ENTRIES] == (vpn | TLB_CHECK_TRIGGERS))) { \
         if (!matched_trigger) { \
           matched_trigger = trigger_exception(OPERATION_STORE, addr, val); \
           if (matched_trigger) \
             throw *matched_trigger; \
         } \
-        *(type##_t*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr) = val; \
+        if (proc) WRITE_MEM(addr, val, size); \
+        *(type##_t*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr) = to_le(val); \
       } \
-      else \
-        store_slow_path(addr, sizeof(type##_t), (const uint8_t*)&val); \
-    }
+      else { \
+        type##_t le_val = to_le(val); \
+        store_slow_path(addr, sizeof(type##_t), (const uint8_t*)&le_val, (xlate_flags)); \
+        if (proc) WRITE_MEM(addr, val, size); \
+      } \
+      if (xlate_flags) \
+        flush_tlb(); \
+  }
 
   // template for functions that perform an atomic memory operation
   #define amo_func(type) \
     template<typename op> \
     type##_t amo_##type(reg_t addr, op f) { \
       if (addr & (sizeof(type##_t)-1)) \
-        throw trap_store_address_misaligned(addr); \
+        throw trap_store_address_misaligned(addr, 0, 0); \
       try { \
         auto lhs = load_##type(addr); \
         store_##type(addr, f(lhs)); \
         return lhs; \
       } catch (trap_load_page_fault& t) { \
         /* AMO faults should be reported as store faults */ \
-        throw trap_store_page_fault(t.get_tval()); \
+        throw trap_store_page_fault(t.get_tval(), t.get_tval2(), t.get_tinst()); \
       } catch (trap_load_access_fault& t) { \
         /* AMO faults should be reported as store faults */ \
-        throw trap_store_access_fault(t.get_tval()); \
+        throw trap_store_access_fault(t.get_tval(), t.get_tval2(), t.get_tinst()); \
       } \
     }
 
@@ -155,7 +208,7 @@ class mmu_t
   {
 #ifndef RISCV_ENABLE_MISALIGNED
     if (unlikely(addr & (sizeof(float128_t)-1)))
-      throw trap_store_address_misaligned(addr);
+      throw trap_store_address_misaligned(addr, 0, 0);
 #endif
     store_uint64(addr, val.v[0]);
     store_uint64(addr + 8, val.v[1]);
@@ -165,16 +218,22 @@ class mmu_t
   {
 #ifndef RISCV_ENABLE_MISALIGNED
     if (unlikely(addr & (sizeof(float128_t)-1)))
-      throw trap_load_address_misaligned(addr);
+      throw trap_load_address_misaligned(addr, 0, 0);
 #endif
     return (float128_t){load_uint64(addr), load_uint64(addr + 8)};
   }
 
   // store value to memory at aligned address
-  store_func(uint8)
-  store_func(uint16)
-  store_func(uint32)
-  store_func(uint64)
+  store_func(uint8, store, 0)
+  store_func(uint16, store, 0)
+  store_func(uint32, store, 0)
+  store_func(uint64, store, 0)
+
+  // store value to guest memory at aligned address
+  store_func(uint8, guest_store, RISCV_XLATE_VIRT)
+  store_func(uint16, guest_store, RISCV_XLATE_VIRT)
+  store_func(uint32, guest_store, RISCV_XLATE_VIRT)
+  store_func(uint64, guest_store, RISCV_XLATE_VIRT)
 
   // perform an atomic memory operation at an aligned address
   amo_func(uint32)
@@ -187,20 +246,23 @@ class mmu_t
 
   inline void acquire_load_reservation(reg_t vaddr)
   {
-    reg_t paddr = translate(vaddr, 1, LOAD);
+    reg_t paddr = translate(vaddr, 1, LOAD, 0);
     if (auto host_addr = sim->addr_to_mem(paddr))
       load_reservation_address = refill_tlb(vaddr, paddr, host_addr, LOAD).target_offset + vaddr;
     else
-      throw trap_load_access_fault(vaddr); // disallow LR to I/O space
+      throw trap_load_access_fault(vaddr, 0, 0); // disallow LR to I/O space
   }
 
-  inline bool check_load_reservation(reg_t vaddr)
+  inline bool check_load_reservation(reg_t vaddr, size_t size)
   {
-    reg_t paddr = translate(vaddr, 1, STORE);
+    if (vaddr & (size-1))
+      throw trap_store_address_misaligned(vaddr, 0, 0);
+
+    reg_t paddr = translate(vaddr, 1, STORE, 0);
     if (auto host_addr = sim->addr_to_mem(paddr))
       return load_reservation_address == refill_tlb(vaddr, paddr, host_addr, STORE).target_offset + vaddr;
     else
-      throw trap_store_access_fault(vaddr); // disallow SC to I/O space
+      throw trap_store_access_fault(vaddr, 0, 0); // disallow SC to I/O space
   }
 
   static const reg_t ICACHE_ENTRIES = 1024;
@@ -213,21 +275,21 @@ class mmu_t
   inline icache_entry_t* refill_icache(reg_t addr, icache_entry_t* entry)
   {
     auto tlb_entry = translate_insn_addr(addr);
-    insn_bits_t insn = *(uint16_t*)(tlb_entry.host_offset + addr);
+    insn_bits_t insn = from_le(*(uint16_t*)(tlb_entry.host_offset + addr));
     int length = insn_length(insn);
 
     if (likely(length == 4)) {
-      insn |= (insn_bits_t)*(const int16_t*)translate_insn_addr_to_host(addr + 2) << 16;
+      insn |= (insn_bits_t)from_le(*(const int16_t*)translate_insn_addr_to_host(addr + 2)) << 16;
     } else if (length == 2) {
       insn = (int16_t)insn;
     } else if (length == 6) {
-      insn |= (insn_bits_t)*(const int16_t*)translate_insn_addr_to_host(addr + 4) << 32;
-      insn |= (insn_bits_t)*(const uint16_t*)translate_insn_addr_to_host(addr + 2) << 16;
+      insn |= (insn_bits_t)from_le(*(const int16_t*)translate_insn_addr_to_host(addr + 4)) << 32;
+      insn |= (insn_bits_t)from_le(*(const uint16_t*)translate_insn_addr_to_host(addr + 2)) << 16;
     } else {
       static_assert(sizeof(insn_bits_t) == 8, "insn_bits_t must be uint64_t");
-      insn |= (insn_bits_t)*(const int16_t*)translate_insn_addr_to_host(addr + 6) << 48;
-      insn |= (insn_bits_t)*(const uint16_t*)translate_insn_addr_to_host(addr + 4) << 32;
-      insn |= (insn_bits_t)*(const uint16_t*)translate_insn_addr_to_host(addr + 2) << 16;
+      insn |= (insn_bits_t)from_le(*(const int16_t*)translate_insn_addr_to_host(addr + 6)) << 48;
+      insn |= (insn_bits_t)from_le(*(const uint16_t*)translate_insn_addr_to_host(addr + 4)) << 32;
+      insn |= (insn_bits_t)from_le(*(const uint16_t*)translate_insn_addr_to_host(addr + 2)) << 16;
     }
 
     insn_fetch_t fetch = {proc->decode_insn(insn), insn};
@@ -304,14 +366,20 @@ class mmu_t
   tlb_entry_t refill_tlb(reg_t vaddr, reg_t paddr, char* host_addr, access_type type);
   const char* fill_from_mmio(reg_t vaddr, reg_t paddr);
 
+  // perform a stage2 translation for a given guest address
+  reg_t s2xlate(reg_t gva, reg_t gpa, access_type type, bool virt, bool mxr);
+
   // perform a page table walk for a given VA; set referenced/dirty bits
-  reg_t walk(reg_t addr, access_type type, reg_t prv);
+  reg_t walk(reg_t addr, access_type type, reg_t prv, bool virt, bool mxr);
 
   // handle uncommon cases: TLB misses, page faults, MMIO
   tlb_entry_t fetch_slow_path(reg_t addr);
-  void load_slow_path(reg_t addr, reg_t len, uint8_t* bytes);
-  void store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes);
-  reg_t translate(reg_t addr, reg_t len, access_type type);
+  void load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, uint32_t xlate_flags);
+  void store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, uint32_t xlate_flags);
+  bool mmio_load(reg_t addr, size_t len, uint8_t* bytes);
+  bool mmio_store(reg_t addr, size_t len, const uint8_t* bytes);
+  bool mmio_ok(reg_t addr, access_type type);
+  reg_t translate(reg_t addr, reg_t len, access_type type, uint32_t xlate_flags);
 
   // ITLB lookup
   inline tlb_entry_t translate_insn_addr(reg_t addr) {
@@ -326,9 +394,9 @@ class mmu_t
     }
     if (unlikely(tlb_insn_tag[vpn % TLB_ENTRIES] == (vpn | TLB_CHECK_TRIGGERS))) {
       uint16_t* ptr = (uint16_t*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr);
-      int match = proc->trigger_match(OPERATION_EXECUTE, addr, *ptr);
+      int match = proc->trigger_match(OPERATION_EXECUTE, addr, from_le(*ptr));
       if (match >= 0) {
-        throw trigger_matched_t(match, OPERATION_EXECUTE, addr, *ptr);
+        throw trigger_matched_t(match, OPERATION_EXECUTE, addr, from_le(*ptr));
       }
     }
     return result;
@@ -354,7 +422,7 @@ class mmu_t
   }
 
   reg_t pmp_homogeneous(reg_t addr, reg_t len);
-  reg_t pmp_ok(reg_t addr, access_type type, reg_t mode);
+  reg_t pmp_ok(reg_t addr, reg_t len, access_type type, reg_t mode);
 
   bool check_triggers_fetch;
   bool check_triggers_load;
@@ -368,27 +436,41 @@ class mmu_t
 struct vm_info {
   int levels;
   int idxbits;
+  int widenbits;
   int ptesize;
   reg_t ptbase;
 };
 
-inline vm_info decode_vm_info(int xlen, reg_t prv, reg_t satp)
+inline vm_info decode_vm_info(int xlen, bool stage2, reg_t prv, reg_t satp)
 {
   if (prv == PRV_M) {
-    return {0, 0, 0, 0};
-  } else if (prv <= PRV_S && xlen == 32) {
+    return {0, 0, 0, 0, 0};
+  } else if (!stage2 && prv <= PRV_S && xlen == 32) {
     switch (get_field(satp, SATP32_MODE)) {
-      case SATP_MODE_OFF: return {0, 0, 0, 0};
-      case SATP_MODE_SV32: return {2, 10, 4, (satp & SATP32_PPN) << PGSHIFT};
+      case SATP_MODE_OFF: return {0, 0, 0, 0, 0};
+      case SATP_MODE_SV32: return {2, 10, 0, 4, (satp & SATP32_PPN) << PGSHIFT};
       default: abort();
     }
-  } else if (prv <= PRV_S && xlen == 64) {
+  } else if (!stage2 && prv <= PRV_S && xlen == 64) {
     switch (get_field(satp, SATP64_MODE)) {
-      case SATP_MODE_OFF: return {0, 0, 0, 0};
-      case SATP_MODE_SV39: return {3, 9, 8, (satp & SATP64_PPN) << PGSHIFT};
-      case SATP_MODE_SV48: return {4, 9, 8, (satp & SATP64_PPN) << PGSHIFT};
-      case SATP_MODE_SV57: return {5, 9, 8, (satp & SATP64_PPN) << PGSHIFT};
-      case SATP_MODE_SV64: return {6, 9, 8, (satp & SATP64_PPN) << PGSHIFT};
+      case SATP_MODE_OFF: return {0, 0, 0, 0, 0};
+      case SATP_MODE_SV39: return {3, 9, 0, 8, (satp & SATP64_PPN) << PGSHIFT};
+      case SATP_MODE_SV48: return {4, 9, 0, 8, (satp & SATP64_PPN) << PGSHIFT};
+      case SATP_MODE_SV57: return {5, 9, 0, 8, (satp & SATP64_PPN) << PGSHIFT};
+      case SATP_MODE_SV64: return {6, 9, 0, 8, (satp & SATP64_PPN) << PGSHIFT};
+      default: abort();
+    }
+  } else if (stage2 && xlen == 32) {
+    switch (get_field(satp, HGATP32_MODE)) {
+      case HGATP_MODE_OFF: return {0, 0, 0, 0, 0};
+      case HGATP_MODE_SV32X4: return {2, 10, 2, 4, (satp & HGATP32_PPN) << PGSHIFT};
+      default: abort();
+    }
+  } else if (stage2 && xlen == 64) {
+    switch (get_field(satp, HGATP64_MODE)) {
+      case HGATP_MODE_OFF: return {0, 0, 0, 0, 0};
+      case HGATP_MODE_SV39X4: return {3, 9, 2, 8, (satp & HGATP64_PPN) << PGSHIFT};
+      case HGATP_MODE_SV48X4: return {4, 9, 2, 8, (satp & HGATP64_PPN) << PGSHIFT};
       default: abort();
     }
   } else {
diff --git a/riscv/mulhi.h b/riscv/mulhi.h
deleted file mode 100644
index bb4a484a6d..0000000000
--- a/riscv/mulhi.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// See LICENSE for license details.
-
-#ifndef _RISCV_MULHI_H
-#define _RISCV_MULHI_H
-
-#include <cstdint>
-
-inline uint64_t mulhu(uint64_t a, uint64_t b)
-{
-  uint64_t t;
-  uint32_t y1, y2, y3;
-  uint64_t a0 = (uint32_t)a, a1 = a >> 32;
-  uint64_t b0 = (uint32_t)b, b1 = b >> 32;
-
-  t = a1*b0 + ((a0*b0) >> 32);
-  y1 = t;
-  y2 = t >> 32;
-
-  t = a0*b1 + y1;
-  y1 = t;
-
-  t = a1*b1 + y2 + (t >> 32);
-  y2 = t;
-  y3 = t >> 32;
-
-  return ((uint64_t)y3 << 32) | y2;
-}
-
-inline int64_t mulh(int64_t a, int64_t b)
-{
-  int negate = (a < 0) != (b < 0);
-  uint64_t res = mulhu(a < 0 ? -a : a, b < 0 ? -b : b);
-  return negate ? ~res + (a * b == 0) : res;
-}
-
-inline int64_t mulhsu(int64_t a, uint64_t b)
-{
-  int negate = a < 0;
-  uint64_t res = mulhu(a < 0 ? -a : a, b);
-  return negate ? ~res + (a * b == 0) : res;
-}
-
-#endif
diff --git a/riscv/opcodes.h b/riscv/opcodes.h
index 34c089ebb7..065934a238 100644
--- a/riscv/opcodes.h
+++ b/riscv/opcodes.h
@@ -125,6 +125,11 @@ static uint32_t csrr(unsigned int rd, unsigned int csr) {
   return (csr << 20) | (rd << 7) | MATCH_CSRRS;
 }
 
+static uint32_t csrrs(unsigned int rd, unsigned int rs1, unsigned int csr) __attribute__ ((unused));
+static uint32_t csrrs(unsigned int rd, unsigned int rs1, unsigned int csr) {
+  return (csr << 20) | (rs1 << 15) | (rd << 7) | MATCH_CSRRS;
+}
+
 static uint32_t fsw(unsigned int src, unsigned int base, uint16_t offset) __attribute__ ((unused));
 static uint32_t fsw(unsigned int src, unsigned int base, uint16_t offset)
 {
@@ -177,7 +182,6 @@ static uint32_t fence_i(void)
   return MATCH_FENCE_I;
 }
 
-/*
 static uint32_t lui(unsigned int dest, uint32_t imm) __attribute__ ((unused));
 static uint32_t lui(unsigned int dest, uint32_t imm)
 {
@@ -186,6 +190,7 @@ static uint32_t lui(unsigned int dest, uint32_t imm)
     MATCH_LUI;
 }
 
+/*
 static uint32_t csrci(unsigned int csr, uint16_t imm) __attribute__ ((unused));
 static uint32_t csrci(unsigned int csr, uint16_t imm) {
   return (csr << 20) |
diff --git a/riscv/processor.cc b/riscv/processor.cc
index 00612f0944..b601f1fbf7 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -1,5 +1,6 @@
 // See LICENSE for license details.
 
+#include "arith.h"
 #include "processor.h"
 #include "extension.h"
 #include "common.h"
@@ -14,19 +15,29 @@
 #include <assert.h>
 #include <limits.h>
 #include <stdexcept>
+#include <string>
 #include <algorithm>
 
 #undef STATE
 #define STATE state
 
-processor_t::processor_t(const char* isa, simif_t* sim, uint32_t id,
-        bool halt_on_reset)
-  : debug(false), halt_request(false), sim(sim), ext(NULL), id(id),
-  halt_on_reset(halt_on_reset), last_pc(1), executions(1)
+processor_t::processor_t(const char* isa, const char* priv, const char* varch,
+                         simif_t* sim, uint32_t id, bool halt_on_reset,
+                         FILE* log_file)
+  : debug(false), halt_request(HR_NONE), sim(sim), ext(NULL), id(id), xlen(0),
+  histogram_enabled(false), log_commits_enabled(false),
+  log_file(log_file), halt_on_reset(halt_on_reset),
+  extension_table(256, false), last_pc(1), executions(1)
 {
+  VU.p = this;
+
+  hwLoops.p = this;
+
   parse_isa_string(isa);
-  register_base_instructions();
+  parse_priv_string(priv);
+  parse_varch_string(varch);
 
+  register_base_instructions();
   mmu = new mmu_t(sim, this);
 
   disassembler = new disassembler_t(max_xlen);
@@ -34,6 +45,8 @@ processor_t::processor_t(const char* isa, simif_t* sim, uint32_t id,
     for (auto disasm_insn : ext->get_disasms())
       disassembler->add_insn(disasm_insn);
 
+  set_pmp_granularity(1 << PMP_SHIFT);
+  set_pmp_num(state.max_pmp);
   reset();
 }
 
@@ -52,26 +65,160 @@ processor_t::~processor_t()
   delete disassembler;
 }
 
-static void bad_isa_string(const char* isa)
+static void bad_option_string(const char *option, const char *value,
+                              const char *msg)
 {
-  fprintf(stderr, "error: bad --isa option %s\n", isa);
+  fprintf(stderr, "error: bad %s option '%s'. %s\n", option, value, msg);
   abort();
 }
 
-void processor_t::parse_isa_string(const char* str)
+static void bad_isa_string(const char* isa, const char* msg)
+{
+  bad_option_string("--isa", isa, msg);
+}
+
+static void bad_priv_string(const char* priv)
+{
+  fprintf(stderr, "error: bad --priv option %s\n", priv);
+  abort();
+}
+
+static void bad_varch_string(const char* varch, const char *msg)
+{
+  bad_option_string("--varch", varch, msg);
+}
+
+static std::string get_string_token(const std::string& str, const char delimiter, size_t& pos)
+{ // Returns the text from pos up to (not including) the next delimiter; pos is left on that delimiter or at the end.
+  const size_t start = pos; // token begins at the caller's cursor
+  while (pos < str.length() && str[pos] != delimiter) ++pos; // the delimiter itself is not consumed
+  return str.substr(start, pos - start);
+}
+
+static int get_int_token(const std::string& str, const char delimiter, size_t& pos)
+{ // Parses the decimal digits from pos up to the next delimiter; pos is left on that delimiter or at the end.
+  const size_t start = pos;
+  while (pos < str.length() && str[pos] != delimiter) {
+    if (!isdigit((unsigned char)str[pos])) // cast: isdigit() on a negative char value is undefined
+      bad_varch_string(str.c_str(), "Unsupported value"); // An integer is expected
+    ++pos;
+  }
+  return (pos == start) ? 0 : stoi(str.substr(start, pos - start)); // an empty token reads as 0
+}
+
+static bool check_pow2(int val)
 {
-  std::string lowercase, tmp;
+  return ((val & (val - 1))) == 0;
+}
+
+void processor_t::parse_varch_string(const char* s)
+{
+  std::string str; // lower-cased copy of s, so attribute names match case-insensitively
+  for (const char *r = s; *r; r++)
+    str += (char)std::tolower((unsigned char)*r); // cast: tolower() on a negative char value is undefined
+  // Expected form: comma-separated "name:integer" pairs; names are vlen, slen, elen, vstartalu.
+  size_t pos = 0;
+  size_t len = str.length();
+  int vlen = 0;
+  int elen = 0;
+  int slen = 0;
+  int vstart_alu = 1;
+
+  while (pos < len) {
+    std::string attr = get_string_token(str, ':', pos);
+
+    ++pos; // step past the ':' that ended the name (or past the end of the string)
+
+    if (attr == "vlen")
+      vlen = get_int_token(str, ',', pos);
+    else if (attr == "slen")
+      slen = get_int_token(str, ',', pos);
+    else if (attr == "elen")
+      elen = get_int_token(str, ',', pos);
+    else if (attr == "vstartalu")
+      vstart_alu = get_int_token(str, ',', pos);
+    else
+      bad_varch_string(s, "Unsupported token");
+
+    ++pos; // step past the ',' that ended the value (or past the end of the string)
+  }
+
+  // vlen, elen and slen must each be a power of 2 (check_pow2 also accepts 0)
+  if (!check_pow2(vlen) || !check_pow2(elen) || !check_pow2(slen)){
+    bad_varch_string(s, "The integer value should be the power of 2");
+  }
+
+  if (slen == 0) // slen not given: default it to vlen
+    slen = vlen;
+
+  /* Vector spec requirements. */
+  if (vlen < elen)
+    bad_varch_string(s, "vlen must be >= elen");
+  if ((unsigned) elen < std::max(max_xlen, get_flen()))
+    bad_varch_string(s, "elen must be >= max(xlen, flen)");
+  if (vlen != slen)
+    bad_varch_string(s, "vlen must be == slen for current limitation");
+
+  /* spike requirements. */
+  if (vlen > 4096)
+    bad_varch_string(s, "vlen must be <= 4096");
+
+  VU.VLEN = vlen;
+  VU.ELEN = elen;
+  VU.vlenb = vlen / 8;
+  VU.vstart_alu = vstart_alu;
+}
+
+static std::string strtolower(const char* str)
+{
+  std::string res;
   for (const char *r = str; *r; r++)
-    lowercase += std::tolower(*r);
+    res += std::tolower(*r);
+  return res;
+}
+
+void processor_t::parse_priv_string(const char* str)
+{
+  std::string lowercase = strtolower(str); // compare case-insensitively
+  bool user = false, supervisor = false;
+  // Accepted values are "m", "mu" and "msu"; anything else is rejected below.
+  if (lowercase == "m")
+    ; // "m" alone: neither the 'u' nor the 's' bit is added
+  else if (lowercase == "mu")
+    user = true;
+  else if (lowercase == "msu")
+    user = supervisor = true;
+  else
+    bad_priv_string(str); // prints an error to stderr and calls abort()
+
+  if (user) {
+    max_isa |= reg_t(user) << ('u' - 'a'); // set the 'u' letter bit in max_isa
+    extension_table['U'] = true;
+  }
+
+  if (supervisor) {
+    max_isa |= reg_t(supervisor) << ('s' - 'a'); // set the 's' letter bit in max_isa
+    extension_table['S'] = true;
+  }
+}
+
+void processor_t::parse_isa_string(const char* str)
+{
+  std::string lowercase = strtolower(str), tmp;
 
+  char error_msg[256];
   const char* p = lowercase.c_str();
-  const char* all_subsets = "imafdqc";
+  const char* all_subsets = "imafdqch"
+#ifdef __SIZEOF_INT128__
+    "v"
+#endif
+    "";
 
   max_xlen = 64;
-  state.misa = reg_t(2) << 62;
+  max_isa = reg_t(2) << 62;
 
   if (strncmp(p, "rv32", 4) == 0)
-    max_xlen = 32, state.misa = reg_t(1) << 30, p += 4;
+    max_xlen = 32, max_isa = reg_t(1) << 30, p += 4;
   else if (strncmp(p, "rv64", 4) == 0)
     p += 4;
   else if (strncmp(p, "rv", 2) == 0)
@@ -82,55 +229,196 @@ void processor_t::parse_isa_string(const char* str)
   } else if (*p == 'g') { // treat "G" as "IMAFD"
     tmp = std::string("imafd") + (p+1);
     p = &tmp[0];
-  } else if (*p != 'i') {
-    bad_isa_string(str);
   }
 
   isa_string = "rv" + std::to_string(max_xlen) + p;
-  state.misa |= 1L << ('s' - 'a'); // advertise support for supervisor mode
-  state.misa |= 1L << ('u' - 'a'); // advertise support for user mode
 
   while (*p) {
-    state.misa |= 1L << (*p - 'a');
+    if (islower(*p)) {
+      max_isa |= 1L << (*p - 'a');
+      extension_table[toupper(*p)] = true;
+
+      if (strchr(all_subsets, *p)) {
+        p++;
+      } else if (*p == 'x') {
+        const char* ext = p + 1, *end = ext;
+        while (islower(*end) || *end == '_')
+          end++;
+
+        auto ext_str = std::string(ext, end - ext);
+        if (ext_str != "dummy")
+          register_extension(find_extension(ext_str.c_str())());
+
+        p = end;
+      } else {
+        sprintf(error_msg, "unsupported extension '%c'", *p);
+        bad_isa_string(str, error_msg);
+      }
+    } else if (*p == '_') {
+      const char* ext = p + 1, *end = ext;
+      if (*ext == 'x') {
+        p++;
+        continue;
+      }
 
-    if (auto next = strchr(all_subsets, *p)) {
-      all_subsets = next + 1;
-      p++;
-    } else if (*p == 'x') {
-      const char* ext = p+1, *end = ext;
       while (islower(*end))
         end++;
-      register_extension(find_extension(std::string(ext, end - ext).c_str())());
+
+      auto ext_str = std::string(ext, end - ext);
+      if (ext_str == "zfh") {
+        extension_table[EXT_ZFH] = true;
+      } else {
+        snprintf(error_msg, sizeof(error_msg), "unsupported extension '%s'", ext_str.c_str()); // bounded: ext_str length is user-controlled
+        bad_isa_string(str, error_msg);
+      }
+
       p = end;
     } else {
-      bad_isa_string(str);
+      sprintf(error_msg, "can't parse '%c(%d)'", *p, *p);
+      bad_isa_string(str, error_msg);
     }
   }
 
-  if (supports_extension('D') && !supports_extension('F'))
-    bad_isa_string(str);
+  state.misa = max_isa;
 
-  if (supports_extension('Q') && !supports_extension('D'))
-    bad_isa_string(str);
+  if (!supports_extension('I'))
+    bad_isa_string(str, "'I' extension is required");
+
+  if (supports_extension(EXT_ZFH) && !supports_extension('F'))
+    bad_isa_string(str, "'Zfh' extension requires 'F'");
 
-  if (supports_extension('Q') && max_xlen < 64)
-    bad_isa_string(str);
+  if (supports_extension('D') && !supports_extension('F'))
+    bad_isa_string(str, "'D' extension requires 'F'");
 
-  max_isa = state.misa;
+  if (supports_extension('Q') && !supports_extension('D'))
+    bad_isa_string(str, "'Q' extension requires 'D'");
 }
 
 void state_t::reset(reg_t max_isa)
 {
-  memset(this, 0, sizeof(*this));
-  misa = max_isa;
-  prv = PRV_M;
   pc = DEFAULT_RSTVEC;
+  XPR.reset();
+  FPR.reset();
+
+  prv = PRV_M;
+  v = false;
+  misa = max_isa;
+  mstatus = 0;
+  mepc = 0;
+  mtval = 0;
+  mscratch = 0;
+  mtvec = 0;
+  mcause = 0;
+  minstret = 0;
+  mie = 0;
+  mip = 0;
+  medeleg = 0;
+  mideleg = 0;
+  mcounteren = 0;
+  scounteren = 0;
+  sepc = 0;
+  stval = 0;
+  sscratch = 0;
+  stvec = 0;
+  satp = 0;
+  scause = 0;
+  mtval2 = 0;
+  mtinst = 0;
+  hstatus = 0;
+  hideleg = 0;
+  hedeleg = 0;
+  hcounteren = 0;
+  htval = 0;
+  htinst = 0;
+  hgatp = 0;
+  vsstatus = 0;
+  vstvec = 0;
+  vsscratch = 0;
+  vsepc = 0;
+  vscause = 0;
+  vstval = 0;
+  vsatp = 0;
+
+  dpc = 0;
+  dscratch0 = 0;
+  dscratch1 = 0;
+  memset(&this->dcsr, 0, sizeof(this->dcsr));
+
   tselect = 0;
-  for (unsigned int i = 0; i < num_triggers; i++)
-    mcontrol[i].type = 2;
+  memset(this->mcontrol, 0, sizeof(this->mcontrol));
+  for (auto &item : mcontrol)
+    item.type = 2;
+
+  memset(this->tdata2, 0, sizeof(this->tdata2));
+  debug_mode = false;
+  single_step = STEP_NONE;
+
+  memset(this->pmpcfg, 0, sizeof(this->pmpcfg));
+  memset(this->pmpaddr, 0, sizeof(this->pmpaddr));
+
+  fflags = 0;
+  frm = 0;
+  serialized = false;
+
+#ifdef RISCV_ENABLE_COMMITLOG
+  log_reg_write.clear();
+  log_mem_read.clear();
+  log_mem_write.clear();
+  last_inst_priv = 0;
+  last_inst_xlen = 0;
+  last_inst_flen = 0;
+#endif
+}
+
+void processor_t::vectorUnit_t::reset(){
+  free(reg_file);
+  VLEN = get_vlen();
+  ELEN = get_elen();
+  reg_file = malloc(NVPR * vlenb);
+  memset(reg_file, 0, NVPR * vlenb);
 
-  pmpcfg[0] = PMP_R | PMP_W | PMP_X | PMP_NAPOT;
-  pmpaddr[0] = ~reg_t(0);
+  vtype = 0;
+  set_vl(0, 0, 0, -1); // default to illegal configuration
+}
+
+reg_t processor_t::vectorUnit_t::set_vl(int rd, int rs1, reg_t reqVL, reg_t newType){
+  int new_vlmul = 0;
+  if (vtype != newType){
+    vtype = newType;
+    vsew = 1 << (extract64(newType, 3, 3) + 3);
+    new_vlmul = int8_t(extract64(newType, 0, 3) << 5) >> 5;
+    vflmul = new_vlmul >= 0 ? 1 << new_vlmul : 1.0 / (1 << -new_vlmul);
+    vlmax = (VLEN/vsew) * vflmul;
+    vta = extract64(newType, 6, 1);
+    vma = extract64(newType, 7, 1);
+    vediv = 1 << extract64(newType, 8, 2);
+
+    vill = !(vflmul >= 0.125 && vflmul <= 8)
+           || vsew > ELEN
+           || vflmul < ((float)vsew / ELEN)
+           || vediv != 1
+           || (newType >> 8) != 0;
+
+    if (vill) {
+      vlmax = 0;
+      vtype = UINT64_MAX << (p->get_xlen() - 1);
+    }
+  }
+
+  // set vl
+  if (vlmax == 0) {
+    vl = 0;
+  } else if (rd == 0 && rs1 == 0) {
+    vl = vl > vlmax ? vlmax : vl;
+  } else if (rd != 0 && rs1 == 0) {
+    vl = vlmax;
+  } else if (rs1 != 0) {
+    vl = reqVL > vlmax ? vlmax : reqVL;
+  }
+
+  vstart = 0;
+  setvl_count++;
+  return vl;
 }
 
 void processor_t::set_debug(bool value)
@@ -146,17 +434,36 @@ void processor_t::set_histogram(bool value)
 #ifndef RISCV_ENABLE_HISTOGRAM
   if (value) {
     fprintf(stderr, "PC Histogram support has not been properly enabled;");
-    fprintf(stderr, " please re-build the riscv-isa-run project using \"configure --enable-histogram\".\n");
+    fprintf(stderr, " please re-build the riscv-isa-sim project using \"configure --enable-histogram\".\n");
+    abort();
   }
 #endif
 }
 
+#ifdef RISCV_ENABLE_COMMITLOG
+void processor_t::enable_log_commits()
+{
+  log_commits_enabled = true;
+}
+#endif
+
 void processor_t::reset()
 {
   state.reset(max_isa);
+
+  state.mideleg = supports_extension('H') ? MIDELEG_FORCED_MASK : 0;
+
   state.dcsr.halt = halt_on_reset;
   halt_on_reset = false;
   set_csr(CSR_MSTATUS, state.mstatus);
+  VU.reset();
+
+  if (n_pmp > 0) {
+    // For backwards compatibility with software that is unaware of PMP,
+    // initialize PMP to permit unprivileged access to all of memory.
+    set_csr(CSR_PMPADDR0, ~reg_t(0));
+    set_csr(CSR_PMPCFG0, PMP_R | PMP_W | PMP_X | PMP_NAPOT);
+  }
 
   if (ext)
     ext->reset(); // reset the extension
@@ -175,31 +482,79 @@ static int ctz(reg_t val)
   return res;
 }
 
+void processor_t::set_pmp_num(reg_t n)
+{
+  // Reject a PMP region count larger than state.max_pmp.
+  if (n > state.max_pmp) {
+    fprintf(stderr, "error: bad number of pmp regions: '%lu' from the dtb\n", (unsigned long)n); // %lu matches the unsigned cast
+    abort();
+  }
+  n_pmp = n;
+}
+
+void processor_t::set_pmp_granularity(reg_t gran) {
+  // The granularity must be at least 1 << PMP_SHIFT (which also rejects 0) and a power of 2.
+  if (gran < (1 << PMP_SHIFT) || (gran & (gran - 1)) != 0) {
+    fprintf(stderr, "error: bad pmp granularity '%lu' from the dtb\n", (unsigned long)gran); // %lu matches the unsigned cast
+    abort();
+  }
+
+  lg_pmp_granularity = ctz(gran); // gran has exactly one bit set at this point
+}
+
 void processor_t::take_interrupt(reg_t pending_interrupts)
 {
-  reg_t mie = get_field(state.mstatus, MSTATUS_MIE);
-  reg_t m_enabled = state.prv < PRV_M || (state.prv == PRV_M && mie);
-  reg_t enabled_interrupts = pending_interrupts & ~state.mideleg & -m_enabled;
+  reg_t enabled_interrupts, deleg, status, mie, m_enabled;
+  reg_t hsie, hs_enabled, vsie, vs_enabled;
 
-  reg_t sie = get_field(state.mstatus, MSTATUS_SIE);
-  reg_t s_enabled = state.prv < PRV_S || (state.prv == PRV_S && sie);
-  // M-ints have highest priority; consider S-ints only if no M-ints pending
-  if (enabled_interrupts == 0)
-    enabled_interrupts = pending_interrupts & state.mideleg & -s_enabled;
+  // Do nothing if no pending interrupts
+  if (!pending_interrupts) {
+    return;
+  }
 
-  if (state.dcsr.cause == 0 && enabled_interrupts) {
+  // M-ints have higher priority over HS-ints and VS-ints
+  mie = get_field(state.mstatus, MSTATUS_MIE);
+  m_enabled = state.prv < PRV_M || (state.prv == PRV_M && mie);
+  enabled_interrupts = pending_interrupts & ~state.mideleg & -m_enabled;
+  if (enabled_interrupts == 0) {
+    // HS-ints have higher priority over VS-ints
+    deleg = state.mideleg & ~MIP_VS_MASK;
+    status = (state.v) ? state.vsstatus : state.mstatus;
+    hsie = get_field(status, MSTATUS_SIE);
+    hs_enabled = state.prv < PRV_S || (state.prv == PRV_S && hsie);
+    enabled_interrupts = pending_interrupts & deleg & -hs_enabled;
+    if (state.v && enabled_interrupts == 0) {
+      // VS-ints have least priority and can only be taken with virt enabled
+      deleg = state.mideleg & state.hideleg;
+      vsie = get_field(state.mstatus, MSTATUS_SIE);
+      vs_enabled = state.prv < PRV_S || (state.prv == PRV_S && vsie);
+      enabled_interrupts = pending_interrupts & deleg & -vs_enabled;
+    }
+  }
+
+  if (!state.debug_mode && enabled_interrupts) {
     // nonstandard interrupts have highest priority
     if (enabled_interrupts >> IRQ_M_EXT)
       enabled_interrupts = enabled_interrupts >> IRQ_M_EXT << IRQ_M_EXT;
-    // external interrupts have next-highest priority
-    else if (enabled_interrupts & (MIP_MEIP | MIP_SEIP))
-      enabled_interrupts = enabled_interrupts & (MIP_MEIP | MIP_SEIP);
-    // software interrupts have next-highest priority
-    else if (enabled_interrupts & (MIP_MSIP | MIP_SSIP))
-      enabled_interrupts = enabled_interrupts & (MIP_MSIP | MIP_SSIP);
-    // timer interrupts have next-highest priority
-    else if (enabled_interrupts & (MIP_MTIP | MIP_STIP))
-      enabled_interrupts = enabled_interrupts & (MIP_MTIP | MIP_STIP);
+    // standard interrupt priority is MEI, MSI, MTI, SEI, SSI, STI
+    else if (enabled_interrupts & MIP_MEIP)
+      enabled_interrupts = MIP_MEIP;
+    else if (enabled_interrupts & MIP_MSIP)
+      enabled_interrupts = MIP_MSIP;
+    else if (enabled_interrupts & MIP_MTIP)
+      enabled_interrupts = MIP_MTIP;
+    else if (enabled_interrupts & MIP_SEIP)
+      enabled_interrupts = MIP_SEIP;
+    else if (enabled_interrupts & MIP_SSIP)
+      enabled_interrupts = MIP_SSIP;
+    else if (enabled_interrupts & MIP_STIP)
+      enabled_interrupts = MIP_STIP;
+    else if (enabled_interrupts & MIP_VSEIP)
+      enabled_interrupts = MIP_VSEIP;
+    else if (enabled_interrupts & MIP_VSSIP)
+      enabled_interrupts = MIP_VSSIP;
+    else if (enabled_interrupts & MIP_VSTIP)
+      enabled_interrupts = MIP_VSTIP;
     else
       abort();
 
@@ -223,7 +578,7 @@ reg_t processor_t::legalize_privilege(reg_t prv)
   if (!supports_extension('U'))
     return PRV_M;
 
-  if (prv == PRV_H || !supports_extension('S'))
+  if ((prv == PRV_HS && !supports_extension('H')) || (prv == PRV_S && !supports_extension('S')))
     return PRV_U;
 
   return prv;
@@ -235,8 +590,52 @@ void processor_t::set_privilege(reg_t prv)
   state.prv = legalize_privilege(prv);
 }
 
+void processor_t::set_virt(bool virt)
+{
+  reg_t tmp, mask;
+
+  if (state.prv == PRV_M)
+    return;
+
+  if (state.v != virt) {
+    /*
+     * Ideally, we should flush TLB here but we don't need it because
+     * set_virt() is always used in conjunction with set_privilege() and
+     * set_privilege() will flush TLB unconditionally.
+     */
+    if (state.v and !virt) {
+      /*
+       * When transitioning from virt-on (VS/VU) to virt-off (HS/M)
+       * we should sync Guest/VM FS, VS, and XS state with Host FS,
+       * VS, and XS state.
+       */
+       if ((state.mstatus & SSTATUS_FS) == SSTATUS_FS) {
+         state.vsstatus |= SSTATUS_FS;
+         state.vsstatus |= (xlen == 64 ? SSTATUS64_SD : SSTATUS32_SD);
+       }
+       if ((state.mstatus & SSTATUS_VS) == SSTATUS_VS) {
+         state.vsstatus |= SSTATUS_VS;
+         state.vsstatus |= (xlen == 64 ? SSTATUS64_SD : SSTATUS32_SD);
+       }
+       if ((state.mstatus & SSTATUS_XS) == SSTATUS_XS) {
+         state.vsstatus |= SSTATUS_XS;
+         state.vsstatus |= (xlen == 64 ? SSTATUS64_SD : SSTATUS32_SD);
+       }
+    }
+    mask = SSTATUS_VS_MASK;
+    mask |= (supports_extension('F') ? SSTATUS_FS : 0);
+    mask |= (supports_extension('V') ? SSTATUS_VS : 0);
+    mask |= (xlen == 64 ? SSTATUS64_SD : SSTATUS32_SD);
+    tmp = state.mstatus & mask;
+    state.mstatus = (state.mstatus & ~mask) | (state.vsstatus & mask);
+    state.vsstatus = tmp;
+    state.v = virt;
+  }
+}
+
 void processor_t::enter_debug_mode(uint8_t cause)
 {
+  state.debug_mode = true;
   state.dcsr.cause = cause;
   state.dcsr.prv = state.prv;
   set_privilege(PRV_M);
@@ -247,14 +646,14 @@ void processor_t::enter_debug_mode(uint8_t cause)
 void processor_t::take_trap(trap_t& t, reg_t epc)
 {
   if (debug) {
-    fprintf(stderr, "core %3d: exception %s, epc 0x%016" PRIx64 "\n",
+    fprintf(log_file, "core %3d: exception %s, epc 0x%016" PRIx64 "\n",
             id, t.name(), epc);
     if (t.has_tval())
-      fprintf(stderr, "core %3d:           tval 0x%016" PRIx64 "\n", id,
-          t.get_tval());
+      fprintf(log_file, "core %3d:           tval 0x%016" PRIx64 "\n",
+              id, t.get_tval());
   }
 
-  if (state.dcsr.cause) {
+  if (state.debug_mode) {
     if (t.cause() == CAUSE_BREAKPOINT) {
       state.pc = DEBUG_ROM_ENTRY;
     } else {
@@ -271,36 +670,72 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
     return;
   }
 
-  // by default, trap to M-mode, unless delegated to S-mode
+  // By default, trap to M-mode, unless delegated to HS-mode or VS-mode
+  reg_t vsdeleg, hsdeleg;
   reg_t bit = t.cause();
-  reg_t deleg = state.medeleg;
+  bool curr_virt = state.v;
   bool interrupt = (bit & ((reg_t)1 << (max_xlen-1))) != 0;
-  if (interrupt)
-    deleg = state.mideleg, bit &= ~((reg_t)1 << (max_xlen-1));
-  if (state.prv <= PRV_S && bit < max_xlen && ((deleg >> bit) & 1)) {
-    // handle the trap in S-mode
-    state.pc = state.stvec;
+  if (interrupt) {
+    vsdeleg = (curr_virt && state.prv <= PRV_S) ? (state.mideleg & state.hideleg) : 0;
+    hsdeleg = (state.prv <= PRV_S) ? state.mideleg : 0;
+    bit &= ~((reg_t)1 << (max_xlen-1));
+  } else {
+    vsdeleg = (curr_virt && state.prv <= PRV_S) ? (state.medeleg & state.hedeleg) : 0;
+    hsdeleg = (state.prv <= PRV_S) ? state.medeleg : 0;
+  }
+  if (state.prv <= PRV_S && bit < max_xlen && ((vsdeleg >> bit) & 1)) {
+    // Handle the trap in VS-mode; a VS-level interrupt is reported to the guest as cause - 1
+    reg_t vector = (state.vstvec & 1) && interrupt ? 4*(bit - 1) : 0; // vectored offset must use the same number that is written to vscause
+    state.pc = (state.vstvec & ~(reg_t)1) + vector;
+    state.vscause = (interrupt) ? (t.cause() - 1) : t.cause();
+    state.vsepc = epc;
+    state.vstval = t.get_tval();
+
+    reg_t s = state.mstatus;
+    s = set_field(s, MSTATUS_SPIE, get_field(s, MSTATUS_SIE));
+    s = set_field(s, MSTATUS_SPP, state.prv);
+    s = set_field(s, MSTATUS_SIE, 0);
+    set_csr(CSR_MSTATUS, s);
+    set_privilege(PRV_S);
+  } else if (state.prv <= PRV_S && bit < max_xlen && ((hsdeleg >> bit) & 1)) {
+    // Handle the trap in HS-mode
+    set_virt(false);
+    reg_t vector = (state.stvec & 1) && interrupt ? 4*bit : 0;
+    state.pc = (state.stvec & ~(reg_t)1) + vector;
     state.scause = t.cause();
     state.sepc = epc;
     state.stval = t.get_tval();
+    state.htval = t.get_tval2();
+    state.htinst = t.get_tinst();
 
     reg_t s = state.mstatus;
     s = set_field(s, MSTATUS_SPIE, get_field(s, MSTATUS_SIE));
     s = set_field(s, MSTATUS_SPP, state.prv);
     s = set_field(s, MSTATUS_SIE, 0);
     set_csr(CSR_MSTATUS, s);
+    s = state.hstatus;
+    if (curr_virt) s = set_field(s, HSTATUS_SPVP, state.prv); // SPVP is left unchanged when the trap was taken with V=0
+    s = set_field(s, HSTATUS_SPV, curr_virt);
+    s = set_field(s, HSTATUS_GVA, t.has_gva());
+    set_csr(CSR_HSTATUS, s);
     set_privilege(PRV_S);
   } else {
+    // Handle the trap in M-mode
+    set_virt(false);
     reg_t vector = (state.mtvec & 1) && interrupt ? 4*bit : 0;
     state.pc = (state.mtvec & ~(reg_t)1) + vector;
     state.mepc = epc;
     state.mcause = t.cause();
     state.mtval = t.get_tval();
+    state.mtval2 = t.get_tval2();
+    state.mtinst = t.get_tinst();
 
     reg_t s = state.mstatus;
     s = set_field(s, MSTATUS_MPIE, get_field(s, MSTATUS_MIE));
     s = set_field(s, MSTATUS_MPP, state.prv);
     s = set_field(s, MSTATUS_MIE, 0);
+    s = set_field(s, MSTATUS_MPV, curr_virt);
+    s = set_field(s, MSTATUS_GVA, t.has_gva());
     set_csr(CSR_MSTATUS, s);
     set_privilege(PRV_M);
   }
@@ -310,11 +745,20 @@ void processor_t::disasm(insn_t insn)
 {
   uint64_t bits = insn.bits() & ((1ULL << (8 * insn_length(insn.bits()))) - 1);
   if (last_pc != state.pc || last_bits != bits) {
+
+#ifdef RISCV_ENABLE_COMMITLOG
+    const char* sym = get_symbol(state.pc);
+    if (sym != nullptr)
+    {
+      fprintf(log_file, "core %3d: >>>>  %s\n", id, sym);
+    }
+#endif
+
     if (executions != 1) {
-      fprintf(stderr, "core %3d: Executed %" PRIx64 " times\n", id, executions);
+      fprintf(log_file, "core %3d: Executed %" PRIx64 " times\n", id, executions);
     }
 
-    fprintf(stderr, "core %3d: 0x%016" PRIx64 " (0x%08" PRIx64 ") %s\n",
+    fprintf(log_file, "core %3d: 0x%016" PRIx64 " (0x%08" PRIx64 ") %s\n",
             id, state.pc, bits, disassembler->disassemble(insn).c_str());
     last_pc = state.pc;
     last_bits = bits;
@@ -332,26 +776,52 @@ int processor_t::paddr_bits()
 
 void processor_t::set_csr(int which, reg_t val)
 {
+#if defined(RISCV_ENABLE_COMMITLOG) // LOG_CSR(rd): record the current value of CSR rd in the commit log, keyed by rd itself
+#define LOG_CSR(rd) \
+  STATE.log_reg_write[((rd) << 4) | 4] = {get_csr(rd), 0};
+#else // commit log disabled: LOG_CSR expands to nothing
+#define LOG_CSR(rd)
+#endif
+
   val = zext_xlen(val);
-  reg_t delegable_ints = MIP_SSIP | MIP_STIP | MIP_SEIP
-                       | ((ext != NULL) << IRQ_COP);
-  reg_t all_ints = delegable_ints | MIP_MSIP | MIP_MTIP;
+  reg_t supervisor_ints = supports_extension('S') ? MIP_SSIP | MIP_STIP | MIP_SEIP : 0;
+  reg_t vssip_int = supports_extension('H') ? MIP_VSSIP : 0;
+  reg_t hypervisor_ints = supports_extension('H') ? MIP_HS_MASK : 0;
+  reg_t coprocessor_ints = (ext != NULL) << IRQ_COP;
+  reg_t delegable_ints = supervisor_ints | coprocessor_ints;
+  reg_t all_ints = delegable_ints | hypervisor_ints | MIP_MSIP | MIP_MTIP | MIP_MEIP;
+
+  if (which >= CSR_PMPADDR0 && which < CSR_PMPADDR0 + state.max_pmp) {
+    // If no PMPs are configured, disallow access to all.  Otherwise, allow
+    // access to all, but unimplemented ones are hardwired to zero.
+    if (n_pmp == 0)
+      return;
 
-  if (which >= CSR_PMPADDR0 && which < CSR_PMPADDR0 + state.n_pmp) {
     size_t i = which - CSR_PMPADDR0;
     bool locked = state.pmpcfg[i] & PMP_L;
-    bool next_locked = i+1 < state.n_pmp && (state.pmpcfg[i+1] & PMP_L);
-    bool next_tor = i+1 < state.n_pmp && (state.pmpcfg[i+1] & PMP_A) == PMP_TOR;
-    if (!locked && !(next_locked && next_tor))
-      state.pmpaddr[i] = val;
+    bool next_locked = i+1 < state.max_pmp && (state.pmpcfg[i+1] & PMP_L);
+    bool next_tor = i+1 < state.max_pmp && (state.pmpcfg[i+1] & PMP_A) == PMP_TOR;
+    if (i < n_pmp && !locked && !(next_locked && next_tor)) {
+      state.pmpaddr[i] = val & ((reg_t(1) << (MAX_PADDR_BITS - PMP_SHIFT)) - 1);
+      LOG_CSR(which);
+    }
 
     mmu->flush_tlb();
   }
 
-  if (which >= CSR_PMPCFG0 && which < CSR_PMPCFG0 + state.n_pmp / 4) {
+  if (which >= CSR_PMPCFG0 && which < CSR_PMPCFG0 + state.max_pmp / 4) {
+    if (n_pmp == 0)
+      return;
+
     for (size_t i0 = (which - CSR_PMPCFG0) * 4, i = i0; i < i0 + xlen / 8; i++) {
-      if (!(state.pmpcfg[i] & PMP_L))
-        state.pmpcfg[i] = (val >> (8 * (i - i0))) & (PMP_R | PMP_W | PMP_X | PMP_A | PMP_L);
+      if (i < n_pmp && !(state.pmpcfg[i] & PMP_L)) {
+        uint8_t cfg = (val >> (8 * (i - i0))) & (PMP_R | PMP_W | PMP_X | PMP_A | PMP_L);
+        cfg &= ~PMP_W | ((cfg & PMP_R) ? PMP_W : 0); // Disallow R=0 W=1
+        if (lg_pmp_granularity != PMP_SHIFT && (cfg & PMP_A) == PMP_NA4)
+          cfg |= PMP_NAPOT; // Disallow A=NA4 when granularity > 4
+        state.pmpcfg[i] = cfg;
+        LOG_CSR(which);
+      }
     }
     mmu->flush_tlb();
   }
@@ -371,16 +841,30 @@ void processor_t::set_csr(int which, reg_t val)
       state.fflags = (val & FSR_AEXC) >> FSR_AEXC_SHIFT;
       state.frm = (val & FSR_RD) >> FSR_RD_SHIFT;
       break;
+    case CSR_VCSR:
+      dirty_vs_state;
+      VU.vxsat = (val & VCSR_VXSAT) >> VCSR_VXSAT_SHIFT;
+      VU.vxrm = (val & VCSR_VXRM) >> VCSR_VXRM_SHIFT;
+      break;
     case CSR_MSTATUS: {
       if ((val ^ state.mstatus) &
           (MSTATUS_MPP | MSTATUS_MPRV | MSTATUS_SUM | MSTATUS_MXR))
         mmu->flush_tlb();
 
-      reg_t mask = MSTATUS_SIE | MSTATUS_SPIE | MSTATUS_MIE | MSTATUS_MPIE
-                 | MSTATUS_FS | MSTATUS_MPRV | MSTATUS_SUM
-                 | MSTATUS_MXR | MSTATUS_TW | MSTATUS_TVM
-                 | MSTATUS_TSR | MSTATUS_UXL | MSTATUS_SXL |
-                 (ext ? MSTATUS_XS : 0);
+      bool has_fs = supports_extension('S') || supports_extension('F')
+                  || supports_extension('V');
+      bool has_vs = supports_extension('V');
+      bool has_mpv = supports_extension('S') && supports_extension('H');
+      bool has_gva = has_mpv;
+
+      reg_t mask = MSTATUS_MIE | MSTATUS_MPIE | MSTATUS_MPRV
+                 | (supports_extension('S') ? (MSTATUS_SUM | MSTATUS_SIE | MSTATUS_SPIE) : 0)
+                 | MSTATUS_MXR | MSTATUS_TW | MSTATUS_TVM | MSTATUS_TSR
+                 | (has_fs ? MSTATUS_FS : 0)
+                 | (has_vs ? MSTATUS_VS : 0)
+                 | (ext ? MSTATUS_XS : 0)
+                 | (has_gva ? MSTATUS_GVA : 0)
+                 | (has_mpv ? MSTATUS_MPV : 0);
 
       reg_t requested_mpp = legalize_privilege(get_field(val, MSTATUS_MPP));
       state.mstatus = set_field(state.mstatus, MSTATUS_MPP, requested_mpp);
@@ -391,20 +875,22 @@ void processor_t::set_csr(int which, reg_t val)
 
       bool dirty = (state.mstatus & MSTATUS_FS) == MSTATUS_FS;
       dirty |= (state.mstatus & MSTATUS_XS) == MSTATUS_XS;
+      dirty |= (state.mstatus & MSTATUS_VS) == MSTATUS_VS;
       if (max_xlen == 32)
         state.mstatus = set_field(state.mstatus, MSTATUS32_SD, dirty);
       else
         state.mstatus = set_field(state.mstatus, MSTATUS64_SD, dirty);
 
-      state.mstatus = set_field(state.mstatus, MSTATUS_UXL, xlen_to_uxl(max_xlen));
-      state.mstatus = set_field(state.mstatus, MSTATUS_UXL, xlen_to_uxl(max_xlen));
-      state.mstatus = set_field(state.mstatus, MSTATUS_SXL, xlen_to_uxl(max_xlen));
+      if (supports_extension('U'))
+        state.mstatus = set_field(state.mstatus, MSTATUS_UXL, xlen_to_uxl(max_xlen));
+      if (supports_extension('S'))
+        state.mstatus = set_field(state.mstatus, MSTATUS_SXL, xlen_to_uxl(max_xlen));
       // U-XLEN == S-XLEN == M-XLEN
       xlen = max_xlen;
       break;
     }
     case CSR_MIP: {
-      reg_t mask = MIP_SSIP | MIP_STIP;
+      reg_t mask = (supervisor_ints | hypervisor_ints) & (MIP_SSIP | MIP_STIP | vssip_int);
       state.mip = (state.mip & ~mask) | (val & mask);
       break;
     }
@@ -419,9 +905,17 @@ void processor_t::set_csr(int which, reg_t val)
         (1 << CAUSE_MISALIGNED_FETCH) |
         (1 << CAUSE_BREAKPOINT) |
         (1 << CAUSE_USER_ECALL) |
+        (1 << CAUSE_SUPERVISOR_ECALL) |
         (1 << CAUSE_FETCH_PAGE_FAULT) |
         (1 << CAUSE_LOAD_PAGE_FAULT) |
         (1 << CAUSE_STORE_PAGE_FAULT);
+      mask |= supports_extension('H') ?
+        (1 << CAUSE_VIRTUAL_SUPERVISOR_ECALL) |
+        (1 << CAUSE_FETCH_GUEST_PAGE_FAULT) |
+        (1 << CAUSE_LOAD_GUEST_PAGE_FAULT) |
+        (1 << CAUSE_VIRTUAL_INSTRUCTION) |
+        (1 << CAUSE_STORE_GUEST_PAGE_FAULT)
+        : 0;
       state.medeleg = (state.medeleg & ~mask) | (val & mask);
       break;
     }
@@ -450,36 +944,85 @@ void processor_t::set_csr(int which, reg_t val)
       break;
     case CSR_SSTATUS: {
       reg_t mask = SSTATUS_SIE | SSTATUS_SPIE | SSTATUS_SPP | SSTATUS_FS
-                 | SSTATUS_XS | SSTATUS_SUM | SSTATUS_MXR;
+                 | SSTATUS_XS | SSTATUS_SUM | SSTATUS_MXR
+                 | (supports_extension('V') ? SSTATUS_VS : 0);
       return set_csr(CSR_MSTATUS, (state.mstatus & ~mask) | (val & mask));
     }
     case CSR_SIP: {
-      reg_t mask = MIP_SSIP & state.mideleg;
-      return set_csr(CSR_MIP, (state.mip & ~mask) | (val & mask));
+      reg_t mask;
+      if (state.v) {
+        mask = state.hideleg & MIP_VSSIP;
+        val = val << 1;
+      } else {
+        mask = state.mideleg & MIP_SSIP;
+      }
+      state.mip = (state.mip & ~mask) | (val & mask);
+      break;
+    }
+    case CSR_SIE: {
+      reg_t mask;
+      if (state.v) {
+        mask = state.hideleg & MIP_VS_MASK;
+        val = val << 1;
+      } else {
+        mask = state.mideleg & ~MIP_HS_MASK;
+      }
+      state.mie = (state.mie & ~mask) | (val & mask);
+      break;
     }
-    case CSR_SIE:
-      return set_csr(CSR_MIE,
-                     (state.mie & ~state.mideleg) | (val & state.mideleg));
     case CSR_SATP: {
+      reg_t reg_val = state.v ? state.vsatp : state.satp; // start from the current value: a write with an unsupported MODE must have no effect
+      reg_t rv64_ppn_mask = (reg_t(1) << (MAX_PADDR_BITS - PGSHIFT)) - 1;
       mmu->flush_tlb();
       if (max_xlen == 32)
-        state.satp = val & (SATP32_PPN | SATP32_MODE);
+        reg_val = val & (SATP32_PPN | SATP32_MODE);
       if (max_xlen == 64 && (get_field(val, SATP64_MODE) == SATP_MODE_OFF ||
                              get_field(val, SATP64_MODE) == SATP_MODE_SV39 ||
                              get_field(val, SATP64_MODE) == SATP_MODE_SV48))
-        state.satp = val & (SATP64_PPN | SATP64_MODE);
+        reg_val = val & (SATP64_PPN | SATP64_MODE | rv64_ppn_mask);
+      if (state.v)
+        state.vsatp = reg_val;
+      else
+        state.satp = reg_val;
       break;
     }
-    case CSR_SEPC: state.sepc = val & ~(reg_t)1; break;
-    case CSR_STVEC: state.stvec = val >> 2 << 2; break;
-    case CSR_SSCRATCH: state.sscratch = val; break;
-    case CSR_SCAUSE: state.scause = val; break;
-    case CSR_STVAL: state.stval = val; break;
+    case CSR_SEPC:
+      if (state.v)
+        state.vsepc = val & ~(reg_t)1;
+      else
+        state.sepc = val & ~(reg_t)1;
+      break;
+    case CSR_STVEC:
+      if (state.v)
+        state.vstvec = val & ~(reg_t)2;
+      else
+        state.stvec = val & ~(reg_t)2;
+      break;
+    case CSR_SSCRATCH:
+      if (state.v)
+        state.vsscratch = val;
+      else
+        state.sscratch = val;
+      break;
+    case CSR_SCAUSE:
+      if (state.v)
+        state.vscause = val;
+      else
+        state.scause = val;
+      break;
+    case CSR_STVAL:
+      if (state.v)
+        state.vstval = val;
+      else
+        state.stval = val;
+      break;
     case CSR_MEPC: state.mepc = val & ~(reg_t)1; break;
     case CSR_MTVEC: state.mtvec = val & ~(reg_t)2; break;
     case CSR_MSCRATCH: state.mscratch = val; break;
     case CSR_MCAUSE: state.mcause = val; break;
     case CSR_MTVAL: state.mtval = val; break;
+    case CSR_MTVAL2: state.mtval2 = val; break;
+    case CSR_MTINST: state.mtinst = val; break;
     case CSR_MISA: {
       // the write is ignored if increasing IALIGN would misalign the PC
       if (!(val & (1L << ('C' - 'A'))) && (state.pc & 2))
@@ -495,9 +1038,118 @@ void processor_t::set_csr(int which, reg_t val)
       mask |= 1L << ('F' - 'A');
       mask |= 1L << ('D' - 'A');
       mask |= 1L << ('C' - 'A');
+      mask |= 1L << ('H' - 'A');
       mask &= max_isa;
 
       state.misa = (val & mask) | (state.misa & ~mask);
+
+      // update the forced bits in MIDELEG
+      if (supports_extension('H'))
+          state.mideleg |= MIDELEG_FORCED_MASK;
+      else
+          state.mideleg &= ~MIDELEG_FORCED_MASK;
+      break;
+    }
+    case CSR_HSTATUS: {
+      reg_t mask = HSTATUS_VTSR | HSTATUS_VTW | HSTATUS_VTVM |
+                   HSTATUS_HU | HSTATUS_SPVP | HSTATUS_SPV | HSTATUS_GVA;
+      state.hstatus = (state.hstatus & ~mask) | (val & mask);
+      break;
+    }
+    case CSR_HEDELEG: {
+      reg_t mask =
+        (1 << CAUSE_MISALIGNED_FETCH) |
+        (1 << CAUSE_BREAKPOINT) |
+        (1 << CAUSE_MISALIGNED_LOAD) |
+        (1 << CAUSE_LOAD_ACCESS) |
+        (1 << CAUSE_MISALIGNED_STORE) |
+        (1 << CAUSE_STORE_ACCESS) |
+        (1 << CAUSE_USER_ECALL) |
+        (1 << CAUSE_FETCH_PAGE_FAULT) |
+        (1 << CAUSE_LOAD_PAGE_FAULT) |
+        (1 << CAUSE_STORE_PAGE_FAULT);
+      state.hedeleg = (state.hedeleg & ~mask) | (val & mask);
+      break;
+    }
+    case CSR_HIDELEG: {
+      reg_t mask = MIP_VS_MASK;
+      state.hideleg = (state.hideleg & ~mask) | (val & mask);
+      break;
+    }
+    case CSR_HIE: {
+      reg_t mask = MIP_HS_MASK;
+      state.mie = (state.mie & ~mask) | (val & mask);
+      break;
+    }
+    case CSR_HCOUNTEREN:
+      state.hcounteren = val;
+      break;
+    case CSR_HGEIE:
+      /* Ignore */
+      break;
+    case CSR_HTVAL:
+      state.htval = val; // htval has its own storage (read back from state.htval); htinst is written by the CSR_HTINST case
+      break;
+    case CSR_HIP: {
+      reg_t mask = MIP_VSSIP;
+      state.mip = (state.mip & ~mask) | (val & mask);
+      break;
+    }
+    case CSR_HVIP: {
+      reg_t mask = MIP_VS_MASK;
+      state.mip = (state.mip & ~mask) | (val & mask);
+      break;
+    }
+    case CSR_HTINST:
+      state.htinst = val;
+      break;
+    case CSR_HGATP: {
+      reg_t reg_val = 0; // stays 0 when neither branch below accepts the write, so hgatp is then cleared
+      reg_t rv64_ppn_mask = (reg_t(1) << (MAX_PADDR_BITS - PGSHIFT)) - 1;
+      mmu->flush_tlb(); // flushed on every write, before the new value is stored
+      if (max_xlen == 32)
+        reg_val = val & (HGATP32_PPN | HGATP32_MODE);
+      if (max_xlen == 64 && (get_field(val, HGATP64_MODE) == HGATP_MODE_OFF ||
+                             get_field(val, HGATP64_MODE) == HGATP_MODE_SV39X4 ||
+                             get_field(val, HGATP64_MODE) == HGATP_MODE_SV48X4))
+        reg_val = val & (HGATP64_PPN | HGATP64_MODE | rv64_ppn_mask);
+      state.hgatp = reg_val;
+      break;
+    }
+    case CSR_VSSTATUS: {
+      reg_t mask = SSTATUS_VS_MASK;
+      mask |= (supports_extension('F') ? SSTATUS_FS : 0);
+      mask |= (supports_extension('V') ? SSTATUS_VS : 0);
+      mask |= (xlen == 64 ? SSTATUS64_SD : SSTATUS32_SD);
+      state.vsstatus = (state.vsstatus & ~mask) | (val & mask);
+      break;
+    }
+    case CSR_VSIE: {
+      reg_t mask = state.hideleg & MIP_VS_MASK; // only VS-level bits that are set in hideleg are writable
+      state.mie = (state.mie & ~mask) | ((val << 1) & mask); // written value is shifted up one bit before being masked into mie
+      break;
+    }
+    case CSR_VSTVEC: state.vstvec = val & ~(reg_t)2; break;
+    case CSR_VSSCRATCH: state.vsscratch = val; break;
+    case CSR_VSEPC: state.vsepc = val & ~(reg_t)1; break;
+    case CSR_VSCAUSE: state.vscause = val; break;
+    case CSR_VSTVAL: state.vstval = val; break;
+    case CSR_VSIP: {
+      reg_t mask = state.hideleg & MIP_VSSIP; // writable only when the VSSIP bit is set in hideleg
+      state.mip = (state.mip & ~mask) | ((val << 1) & mask); // written value is shifted up one bit before being masked into mip
+      break;
+    }
+    case CSR_VSATP: {
+      reg_t reg_val = 0;
+      reg_t rv64_ppn_mask = (reg_t(1) << (MAX_PADDR_BITS - PGSHIFT)) - 1;
+      mmu->flush_tlb();
+      if (max_xlen == 32)
+        reg_val = val & (SATP32_PPN | SATP32_MODE);
+      if (max_xlen == 64 && (get_field(val, SATP64_MODE) == SATP_MODE_OFF ||
+                             get_field(val, SATP64_MODE) == SATP_MODE_SV39 ||
+                             get_field(val, SATP64_MODE) == SATP_MODE_SV48))
+        reg_val = val & (SATP64_PPN | SATP64_MODE | rv64_ppn_mask);
+      state.vsatp = reg_val;
       break;
     }
     case CSR_TSELECT:
@@ -508,7 +1160,7 @@ void processor_t::set_csr(int which, reg_t val)
     case CSR_TDATA1:
       {
         mcontrol_t *mc = &state.mcontrol[state.tselect];
-        if (mc->dmode && !state.dcsr.cause) {
+        if (mc->dmode && !state.debug_mode) {
           break;
         }
         mc->dmode = get_field(val, MCONTROL_DMODE(xlen));
@@ -531,7 +1183,7 @@ void processor_t::set_csr(int which, reg_t val)
       }
       break;
     case CSR_TDATA2:
-      if (state.mcontrol[state.tselect].dmode && !state.dcsr.cause) {
+      if (state.mcontrol[state.tselect].dmode && !state.debug_mode) {
         break;
       }
       if (state.tselect < state.num_triggers) {
@@ -551,47 +1203,180 @@ void processor_t::set_csr(int which, reg_t val)
     case CSR_DPC:
       state.dpc = val & ~(reg_t)1;
       break;
-    case CSR_DSCRATCH:
-      state.dscratch = val;
+    case CSR_DSCRATCH0:
+      state.dscratch0 = val;
+      break;
+    case CSR_DSCRATCH1:
+      state.dscratch1 = val;
+      break;
+    case CSR_VSTART:
+      dirty_vs_state;
+      VU.vstart = val & (VU.get_vlen() - 1);
+      break;
+    case CSR_VXSAT:
+      dirty_vs_state;
+      VU.vxsat = val & 0x1ul;
+      break;
+    case CSR_VXRM:
+      dirty_vs_state;
+      VU.vxrm = val & 0x3ul;
+      break;
+    // xpulphwloop
+    case CSR_LPSTART0:
+      hwLoops.set_start(0, val);
+      break;
+    case CSR_LPEND0:
+      hwLoops.set_end(0, val);
+      break;
+    case CSR_LPCOUNT0:
+      hwLoops.set_count(0, val);
+      break;
+    case CSR_LPSTART1:
+      hwLoops.set_start(1, val);
+      break;
+    case CSR_LPEND1:
+      hwLoops.set_end(1, val);
+      break;
+    case CSR_LPCOUNT1:
+      hwLoops.set_count(1, val);
+      break;
+  }
+
+#if defined(RISCV_ENABLE_COMMITLOG)
+  switch (which)
+  {
+    case CSR_FFLAGS:
+      LOG_CSR(CSR_MSTATUS);
+      LOG_CSR(CSR_FFLAGS);
+      break;
+    case CSR_FRM:
+      LOG_CSR(CSR_MSTATUS);
+      LOG_CSR(CSR_FRM);
+      break;
+    case CSR_FCSR:
+      LOG_CSR(CSR_MSTATUS);
+      LOG_CSR(CSR_FFLAGS);
+      LOG_CSR(CSR_FRM);
+      break;
+    case CSR_VCSR:
+      LOG_CSR(CSR_MSTATUS);
+      LOG_CSR(CSR_VXSAT);
+      LOG_CSR(CSR_VXRM);
+      break;
+
+    case CSR_VSTART:
+      LOG_CSR(CSR_MSTATUS);
+      LOG_CSR(CSR_VSTART);
+      break;
+    case CSR_VXSAT:
+      LOG_CSR(CSR_MSTATUS);
+      LOG_CSR(CSR_VXSAT);
+      break;
+    case CSR_VXRM:
+      LOG_CSR(CSR_MSTATUS);
+      LOG_CSR(CSR_VXRM);
+      break;
+
+    case CSR_SSTATUS:
+      LOG_CSR(CSR_MSTATUS);
+      LOG_CSR(CSR_SSTATUS);
+      break;
+    case CSR_SIP:
+      LOG_CSR(CSR_MIP);
+      LOG_CSR(CSR_SIP);
+      break;
+    case CSR_SIE:
+      LOG_CSR(CSR_MIE);
+      LOG_CSR(CSR_SIE);
+      break;
+
+    case CSR_MSTATUS:
+    case CSR_MIP:
+    case CSR_MIE:
+    case CSR_MIDELEG:
+    case CSR_MEDELEG:
+    case CSR_MINSTRET:
+    case CSR_MCYCLE:
+    case CSR_MINSTRETH:
+    case CSR_MCYCLEH:
+    case CSR_SCOUNTEREN:
+    case CSR_MCOUNTEREN:
+    case CSR_SATP:
+    case CSR_SEPC:
+    case CSR_STVEC:
+    case CSR_SSCRATCH:
+    case CSR_SCAUSE:
+    case CSR_STVAL:
+    case CSR_MEPC:
+    case CSR_MTVEC:
+    case CSR_MSCRATCH:
+    case CSR_MCAUSE:
+    case CSR_MTVAL:
+    case CSR_MISA:
+    case CSR_TSELECT:
+    case CSR_TDATA1:
+    case CSR_TDATA2:
+    case CSR_DCSR:
+    case CSR_DPC:
+    case CSR_DSCRATCH0:
+    case CSR_DSCRATCH1:
+      LOG_CSR(which);
       break;
   }
+#endif
 }
 
 // Note that get_csr is sometimes called when read side-effects should not
 // be actioned.  In other words, Spike cannot currently support CSRs with
 // side effects on reads.
-reg_t processor_t::get_csr(int which)
+reg_t processor_t::get_csr(int which, insn_t insn, bool write, bool peek)
 {
   uint32_t ctr_en = -1;
   if (state.prv < PRV_M)
     ctr_en &= state.mcounteren;
+  if (state.v)
+    ctr_en &= state.hcounteren;
   if (state.prv < PRV_S)
     ctr_en &= state.scounteren;
   bool ctr_ok = (ctr_en >> (which & 31)) & 1;
 
+  reg_t res = 0;
+#define ret(n) do { \
+    res = (n); \
+    goto out; \
+  } while (false)
+
   if (ctr_ok) {
     if (which >= CSR_HPMCOUNTER3 && which <= CSR_HPMCOUNTER31)
-      return 0;
+      ret(0);
     if (xlen == 32 && which >= CSR_HPMCOUNTER3H && which <= CSR_HPMCOUNTER31H)
-      return 0;
+      ret(0);
   }
   if (which >= CSR_MHPMCOUNTER3 && which <= CSR_MHPMCOUNTER31)
-    return 0;
+    ret(0);
   if (xlen == 32 && which >= CSR_MHPMCOUNTER3H && which <= CSR_MHPMCOUNTER31H)
-    return 0;
+    ret(0);
   if (which >= CSR_MHPMEVENT3 && which <= CSR_MHPMEVENT31)
-    return 0;
-
-  if (which >= CSR_PMPADDR0 && which < CSR_PMPADDR0 + state.n_pmp)
-    return state.pmpaddr[which - CSR_PMPADDR0];
+    ret(0);
+
+  if (which >= CSR_PMPADDR0 && which < CSR_PMPADDR0 + state.max_pmp) {
+    // If n_pmp is zero, that means pmp is not implemented hence raise trap if it tries to access the csr
+    if (n_pmp == 0)
+      goto throw_illegal;
+    reg_t i = which - CSR_PMPADDR0;
+    if ((state.pmpcfg[i] & PMP_A) >= PMP_NAPOT)
+      ret(state.pmpaddr[i] | (~pmp_tor_mask() >> 1));
+    else
+      ret(state.pmpaddr[i] & pmp_tor_mask());
+  }
 
-  if (which >= CSR_PMPCFG0 && which < CSR_PMPCFG0 + state.n_pmp / 4) {
+  if (which >= CSR_PMPCFG0 && which < CSR_PMPCFG0 + state.max_pmp / 4) {
     require((which & ((xlen / 32) - 1)) == 0);
 
-    reg_t res = 0;
-    for (size_t i0 = (which - CSR_PMPCFG0) * 4, i = i0; i < i0 + xlen / 8 && i < state.n_pmp; i++)
-      res |= reg_t(state.pmpcfg[i]) << (8 * (i - i0));
-    return res;
+    reg_t cfg_res = 0;
+    for (size_t i0 = (which - CSR_PMPCFG0) * 4, i = i0; i < i0 + xlen / 8 && i < state.max_pmp; i++)
+      cfg_res |= reg_t(state.pmpcfg[i]) << (8 * (i - i0));
+    ret(cfg_res);
   }
 
   switch (which)
@@ -600,76 +1385,191 @@ reg_t processor_t::get_csr(int which)
       require_fp;
       if (!supports_extension('F'))
         break;
-      return state.fflags;
+      ret(state.fflags);
     case CSR_FRM:
       require_fp;
       if (!supports_extension('F'))
         break;
-      return state.frm;
+      ret(state.frm);
     case CSR_FCSR:
       require_fp;
       if (!supports_extension('F'))
         break;
-      return (state.fflags << FSR_AEXC_SHIFT) | (state.frm << FSR_RD_SHIFT);
+      ret((state.fflags << FSR_AEXC_SHIFT) | (state.frm << FSR_RD_SHIFT));
+    case CSR_VCSR:
+      require_vector_vs;
+      if (!supports_extension('V'))
+        break;
+      ret((VU.vxsat << VCSR_VXSAT_SHIFT) | (VU.vxrm << VCSR_VXRM_SHIFT));
     case CSR_INSTRET:
     case CSR_CYCLE:
       if (ctr_ok)
-        return state.minstret;
+        ret(state.minstret);
+      if (state.v &&
+          ((state.mcounteren >> (which & 31)) & 1) &&
+          !((state.hcounteren >> (which & 31)) & 1)) {
+        goto throw_virtual;
+      }
       break;
     case CSR_MINSTRET:
     case CSR_MCYCLE:
-      return state.minstret;
+      ret(state.minstret);
     case CSR_INSTRETH:
     case CSR_CYCLEH:
       if (ctr_ok && xlen == 32)
-        return state.minstret >> 32;
+        ret(state.minstret >> 32);
+      if (state.v &&
+          ((state.mcounteren >> (which & 31)) & 1) &&
+          !((state.hcounteren >> (which & 31)) & 1)) {
+        goto throw_virtual;
+      }
       break;
     case CSR_MINSTRETH:
     case CSR_MCYCLEH:
       if (xlen == 32)
-        return state.minstret >> 32;
+        ret(state.minstret >> 32);
       break;
-    case CSR_SCOUNTEREN: return state.scounteren;
-    case CSR_MCOUNTEREN: return state.mcounteren;
+    case CSR_SCOUNTEREN: ret(state.scounteren);
+    case CSR_MCOUNTEREN:
+      if (!supports_extension('U'))
+        break;
+      ret(state.mcounteren);
+    case CSR_MCOUNTINHIBIT: ret(0);
     case CSR_SSTATUS: {
       reg_t mask = SSTATUS_SIE | SSTATUS_SPIE | SSTATUS_SPP | SSTATUS_FS
+                 | (supports_extension('V') ? SSTATUS_VS : 0)
                  | SSTATUS_XS | SSTATUS_SUM | SSTATUS_MXR | SSTATUS_UXL;
       reg_t sstatus = state.mstatus & mask;
       if ((sstatus & SSTATUS_FS) == SSTATUS_FS ||
           (sstatus & SSTATUS_XS) == SSTATUS_XS)
         sstatus |= (xlen == 32 ? SSTATUS32_SD : SSTATUS64_SD);
-      return sstatus;
+      ret(sstatus);
     }
-    case CSR_SIP: return state.mip & state.mideleg;
-    case CSR_SIE: return state.mie & state.mideleg;
-    case CSR_SEPC: return state.sepc & pc_alignment_mask();
-    case CSR_STVAL: return state.stval;
-    case CSR_STVEC: return state.stvec;
-    case CSR_SCAUSE:
-      if (max_xlen > xlen)
-        return state.scause | ((state.scause >> (max_xlen-1)) << (xlen-1));
-      return state.scause;
-    case CSR_SATP:
-      if (get_field(state.mstatus, MSTATUS_TVM))
+    case CSR_SIP: {
+      if (state.v) {
+        ret((state.mip & state.hideleg & MIP_VS_MASK) >> 1);
+      } else {
+        ret(state.mip & state.mideleg & ~MIP_HS_MASK);
+      }
+    }
+    case CSR_SIE: {
+      if (state.v) {
+        ret((state.mie & state.hideleg & MIP_VS_MASK) >> 1);
+      } else {
+        ret(state.mie & state.mideleg & ~MIP_HS_MASK);
+      }
+    }
+    case CSR_SEPC: {
+      if (state.v) {
+        ret(state.vsepc & pc_alignment_mask());
+      } else {
+        ret(state.sepc & pc_alignment_mask());
+      }
+    }
+    case CSR_STVAL: {
+      if (state.v) {
+        ret(state.vstval);
+      } else {
+        ret(state.stval);
+      }
+    }
+    case CSR_STVEC: {
+      if (state.v) {
+        ret(state.vstvec);
+      } else {
+        ret(state.stvec);
+      }
+    }
+    case CSR_SCAUSE: {
+      if (state.v) {
+        if (max_xlen > xlen)
+          ret(state.vscause | ((state.vscause >> (max_xlen-1)) << (xlen-1)));
+        ret(state.vscause);
+      } else {
+        if (max_xlen > xlen)
+          ret(state.scause | ((state.scause >> (max_xlen-1)) << (xlen-1)));
+        ret(state.scause);
+      }
+    }
+    case CSR_SATP: {
+      if (state.v) {
+        if (get_field(state.hstatus, HSTATUS_VTVM))
+          goto throw_virtual;
+        ret(state.vsatp);
+      } else {
+        if (get_field(state.mstatus, MSTATUS_TVM))
+          require_privilege(PRV_M);
+        ret(state.satp);
+      }
+    }
+    case CSR_SSCRATCH: {
+      if (state.v) {
+        ret(state.vsscratch);
+      } else {
+        ret(state.sscratch);
+      }
+    }
+    case CSR_MSTATUS: ret(state.mstatus);
+    case CSR_MIP: ret(state.mip);
+    case CSR_MIE: ret(state.mie);
+    case CSR_MEPC: ret(state.mepc & pc_alignment_mask());
+    case CSR_MSCRATCH: ret(state.mscratch);
+    case CSR_MCAUSE: ret(state.mcause);
+    case CSR_MTVAL: ret(state.mtval);
+    case CSR_MTVAL2:
+      if (supports_extension('H'))
+        ret(state.mtval2);
+      break;
+    case CSR_MTINST:
+      if (supports_extension('H'))
+        ret(state.mtinst);
+      break;
+    case CSR_MISA: ret(state.misa);
+    case CSR_MARCHID: ret(5);
+    case CSR_MIMPID: ret(0);
+    case CSR_MVENDORID: ret(0);
+    case CSR_MHARTID: ret(id);
+    case CSR_MTVEC: ret(state.mtvec);
+    case CSR_MEDELEG:
+      if (!supports_extension('S'))
+        break;
+      ret(state.medeleg);
+    case CSR_MIDELEG:
+      if (!supports_extension('S'))
+        break;
+      ret(state.mideleg);
+    case CSR_HSTATUS: ret(state.hstatus);
+    case CSR_HEDELEG: ret(state.hedeleg);
+    case CSR_HIDELEG: ret(state.hideleg);
+    case CSR_HIE: ret(state.mie & MIP_HS_MASK);
+    case CSR_HCOUNTEREN: ret(state.hcounteren);
+    case CSR_HGEIE: ret(0);
+    case CSR_HTVAL: ret(state.htval);
+    case CSR_HIP: ret(state.mip & MIP_HS_MASK);
+    case CSR_HVIP: ret(state.mip & MIP_VS_MASK);
+    case CSR_HTINST: ret(state.htinst);
+    case CSR_HGATP: {
+      if (!state.v && get_field(state.mstatus, MSTATUS_TVM))
         require_privilege(PRV_M);
-      return state.satp;
-    case CSR_SSCRATCH: return state.sscratch;
-    case CSR_MSTATUS: return state.mstatus;
-    case CSR_MIP: return state.mip;
-    case CSR_MIE: return state.mie;
-    case CSR_MEPC: return state.mepc & pc_alignment_mask();
-    case CSR_MSCRATCH: return state.mscratch;
-    case CSR_MCAUSE: return state.mcause;
-    case CSR_MTVAL: return state.mtval;
-    case CSR_MISA: return state.misa;
-    case CSR_MARCHID: return 5;
-    case CSR_MIMPID: return 0;
-    case CSR_MVENDORID: return 0;
-    case CSR_MHARTID: return id;
-    case CSR_MTVEC: return state.mtvec;
-    case CSR_MEDELEG: return state.medeleg;
-    case CSR_MIDELEG: return state.mideleg;
-    case CSR_TSELECT: return state.tselect;
+      ret(state.hgatp);
+    }
+    case CSR_HGEIP: ret(0);
+    case CSR_VSSTATUS: {
+      reg_t mask = SSTATUS_VS_MASK;
+      mask |= (supports_extension('F') ? SSTATUS_FS : 0);
+      mask |= (supports_extension('V') ? SSTATUS_VS : 0);
+      mask |= (xlen == 64 ? SSTATUS64_SD : SSTATUS32_SD);
+      ret(state.vsstatus & mask);
+    }
+    case CSR_VSIE: ret((state.mie & state.hideleg & MIP_VS_MASK) >> 1);
+    case CSR_VSTVEC: ret(state.vstvec);
+    case CSR_VSSCRATCH: ret(state.vsscratch);
+    case CSR_VSEPC: ret(state.vsepc & pc_alignment_mask());
+    case CSR_VSCAUSE: ret(state.vscause);
+    case CSR_VSTVAL: ret(state.vstval);
+    case CSR_VSIP: ret((state.mip & state.hideleg & MIP_VS_MASK) >> 1);
+    case CSR_VSATP: ret(state.vsatp);
+    case CSR_TSELECT: ret(state.tselect);
     case CSR_TDATA1:
       if (state.tselect < state.num_triggers) {
         reg_t v = 0;
@@ -689,21 +1589,23 @@ reg_t processor_t::get_csr(int which)
         v = set_field(v, MCONTROL_EXECUTE, mc->execute);
         v = set_field(v, MCONTROL_STORE, mc->store);
         v = set_field(v, MCONTROL_LOAD, mc->load);
-        return v;
+        ret(v);
       } else {
-        return 0;
+        ret(0);
       }
       break;
     case CSR_TDATA2:
       if (state.tselect < state.num_triggers) {
-        return state.tdata2[state.tselect];
+        ret(state.tdata2[state.tselect]);
       } else {
-        return 0;
+        ret(0);
       }
       break;
-    case CSR_TDATA3: return 0;
+    case CSR_TDATA3: ret(0);
     case CSR_DCSR:
       {
+        if (!state.debug_mode)
+          break;
         uint32_t v = 0;
         v = set_field(v, DCSR_XDEBUGVER, 1);
         v = set_field(v, DCSR_EBREAKM, state.dcsr.ebreakm);
@@ -715,19 +1617,109 @@ reg_t processor_t::get_csr(int which)
         v = set_field(v, DCSR_CAUSE, state.dcsr.cause);
         v = set_field(v, DCSR_STEP, state.dcsr.step);
         v = set_field(v, DCSR_PRV, state.dcsr.prv);
-        return v;
+        ret(v);
       }
     case CSR_DPC:
-      return state.dpc & pc_alignment_mask();
-    case CSR_DSCRATCH:
-      return state.dscratch;
+      if (!state.debug_mode)
+        break;
+      ret(state.dpc & pc_alignment_mask());
+    case CSR_DSCRATCH0:
+      if (!state.debug_mode)
+        break;
+      ret(state.dscratch0);
+    case CSR_DSCRATCH1:
+      if (!state.debug_mode)
+        break;
+      ret(state.dscratch1);
+    case CSR_VSTART:
+      require_vector_vs;
+      if (!supports_extension('V'))
+        break;
+      ret(VU.vstart);
+    case CSR_VXSAT:
+      require_vector_vs;
+      if (!supports_extension('V'))
+        break;
+      ret(VU.vxsat);
+    case CSR_VXRM:
+      require_vector_vs;
+      if (!supports_extension('V'))
+        break;
+      ret(VU.vxrm);
+    case CSR_VL:
+      require_vector_vs;
+      if (!supports_extension('V'))
+        break;
+      ret(VU.vl);
+    case CSR_VTYPE:
+      require_vector_vs;
+      if (!supports_extension('V'))
+        break;
+      ret(VU.vtype);
+    case CSR_VLENB:
+      require_vector_vs;
+      if (!supports_extension('V'))
+        break;
+      ret(VU.vlenb);
+    // xpulphwloop
+    case CSR_LPSTART0:
+      ret(state.lpstart0);
+      break;
+    case CSR_LPEND0:
+      ret(state.lpend0);
+      break;
+    case CSR_LPCOUNT0:
+      ret(state.lpcount0);
+      break;
+    case CSR_LPSTART1:
+      ret(state.lpstart1);
+      break;
+    case CSR_LPEND1:
+      ret(state.lpend1);
+      break;
+    case CSR_LPCOUNT1:
+      ret(state.lpcount1);
+      break;
+  }
+
+#undef ret
+
+  // If we get here, the CSR doesn't exist.  Unimplemented CSRs always throw
+  // illegal-instruction exceptions, not virtual-instruction exceptions.
+throw_illegal:
+  throw trap_illegal_instruction(insn.bits());
+
+throw_virtual:
+  throw trap_virtual_instruction(insn.bits());
+
+out:
+  // Check permissions.  Raise virtual-instruction exception if V=1,
+  // privileges are insufficient, and the CSR belongs to supervisor or
+  // hypervisor.  Raise illegal-instruction exception otherwise.
+
+  if (peek)
+    return res;
+
+  unsigned csr_priv = get_field(which, 0x300);
+  bool csr_read_only = get_field(which, 0xC00) == 3;
+  unsigned priv = state.prv == PRV_S && !state.v ? PRV_HS : state.prv;
+
+  if ((csr_priv == PRV_S && !supports_extension('S')) ||
+      (csr_priv == PRV_HS && !supports_extension('H')))
+    goto throw_illegal;
+
+  if ((write && csr_read_only) || priv < csr_priv) {
+    if (state.v && csr_priv <= PRV_HS)
+      goto throw_virtual;
+    goto throw_illegal;
   }
-  throw trap_illegal_instruction(0);
+
+  return res;
 }
 
 reg_t illegal_instruction(processor_t* p, insn_t insn, reg_t pc)
 {
-  throw trap_illegal_instruction(0);
+  throw trap_illegal_instruction(insn.bits());
 }
 
 insn_func_t processor_t::decode_insn(insn_t insn)
@@ -784,8 +1776,11 @@ void processor_t::register_extension(extension_t* x)
   for (auto insn : x->get_instructions())
     register_insn(insn);
   build_opcode_map();
-  for (auto disasm_insn : x->get_disasms())
-    disassembler->add_insn(disasm_insn);
+
+  if (disassembler)
+    for (auto disasm_insn : x->get_disasms())
+      disassembler->add_insn(disasm_insn);
+
   if (ext != NULL)
     throw std::logic_error("only one extension may be registered");
   ext = x;
@@ -858,3 +1853,92 @@ void processor_t::trigger_updated()
     }
   }
 }
+
+
+// PULP HW-Loop extension (xpulphwloop)
+
+// Recomputes the active flag of hardware loop `i` from its start/end/count CSRs
+// and ORs it into any_active; traps if an active loop has a too-short body.
+void processor_t::hwLoopUnit_t::set_active(int i)
+{
+  const reg_t first = get_start(i), last = get_end(i);
+  const bool enabled = (first < last) && (get_count(i) > 0);
+  lp_active[i] = enabled;
+  any_active |= enabled;
+  // An active body must hold at least 3 instructions (last >= first + 8).
+  if (enabled && first + 8 > last)
+    throw trap_illegal_instruction(0);
+}
+
+// CSR write methods with the side-effects (set activate, check constraints)
+// Writes the start-address CSR of loop 1 (loopNr != 0) or loop 0, then
+// re-evaluates that loop's active flag, which may raise an illegal-instruction trap.
+void processor_t::hwLoopUnit_t::set_start(int loopNr, reg_t val)
+{
+  reg_t& start_csr = loopNr ? p->state.lpstart1 : p->state.lpstart0;
+  start_csr = val;
+  set_active(loopNr);
+}
+
+// Writes the end-address CSR of the selected loop, enforces the nesting
+// constraint between both end addresses, then refreshes the active flag.
+void processor_t::hwLoopUnit_t::set_end(int loopNr, reg_t val)
+{
+  (loopNr ? p->state.lpend1 : p->state.lpend0) = val;
+
+  // Loop 1's end must lie at least 8 bytes past loop 0's end.
+  // NOTE(review): checked even when a loop is unused -- confirm this is intended.
+  if (p->state.lpend0 + 8 > p->state.lpend1)
+    throw trap_illegal_instruction(0);
+  set_active(loopNr);
+}
+
+// Writes the iteration-count CSR of loop 1 (loopNr != 0) or loop 0 and then
+// re-evaluates that loop's active flag via set_active().
+void processor_t::hwLoopUnit_t::set_count(int loopNr, reg_t val)
+{
+  reg_t& count_csr = loopNr ? p->state.lpcount1 : p->state.lpcount0;
+  count_csr = val;
+  set_active(loopNr);
+}
+
+// Applies the hardware-loop rules after an instruction has executed.
+// pc:   current pc (state.pc), i.e. the instruction just executed
+// npc:  next pc computed by that instruction
+// Returns npc, or the loop start address when a loop iterates again.
+reg_t processor_t::hwLoopUnit_t::handle_loops(reg_t pc, reg_t npc, insn_t insn)
+{
+  // Fast exit keeps the common no-loop case cheap.
+  if (!any_active)
+    return npc;
+
+  for (int loop = 0; loop < 2; ++loop) {
+    // Skip loops that are inactive or whose body does not contain pc.
+    if (!lp_active[loop] || pc < get_start(loop) || pc > get_end(loop))
+      continue;
+
+    // Constraint: compressed instructions are not allowed in a loop body.
+    if (insn.length() < 4)
+      throw trap_illegal_instruction(insn.bits());
+
+    // TODO: finish the remaining constraint checks:
+    //   no unconditional jumps
+    //   no conditional branching
+    //   no privileged instructions except ebreak
+    //   no memory ordering (fence) instructions
+
+    // Only the last instruction of the body updates the iteration count.
+    if (pc != get_end(loop))
+      continue;
+
+    const reg_t remaining = get_count(loop) - 1;
+    set_count(loop, remaining);  // also re-evaluates lp_active[loop]
+    lp_active[loop] &= (remaining > 0);
+    any_active = lp_active[0] || lp_active[1];
+    // Jump back to the loop start while iterations remain.
+    if (lp_active[loop])
+      return get_start(loop);
+  }
+
+  return npc;
+}
diff --git a/riscv/processor.h b/riscv/processor.h
index de0be7829b..70f54bed71 100644
--- a/riscv/processor.h
+++ b/riscv/processor.h
@@ -8,7 +8,9 @@
 #include "trap.h"
 #include <string>
 #include <vector>
+#include <unordered_map>
 #include <map>
+#include <cassert>
 #include "debug_rom_defines.h"
 
 class processor_t;
@@ -27,11 +29,11 @@ struct insn_desc_t
   insn_func_t rv64;
 };
 
-struct commit_log_reg_t
-{
-  reg_t addr;
-  freg_t data;
-};
+// regnum, data
+typedef std::unordered_map<reg_t, freg_t> commit_log_reg_t;
+
+// addr, value, size
+typedef std::vector<std::tuple<reg_t, uint64_t, uint8_t>> commit_log_mem_t;
 
 typedef struct
 {
@@ -83,6 +85,68 @@ typedef struct
   bool load;
 } mcontrol_t;
 
+enum VRM{       // rounding-mode values; vectorUnit_t::get_vround_mode() returns vxrm cast to this type
+  RNU = 0,      // presumably round-to-nearest-up (RVV vxrm encoding) -- verify against the spec
+  RNE,          // presumably round-to-nearest-even -- verify
+  RDN,          // presumably round-down (truncate) -- verify
+  ROD,          // presumably round-to-odd -- verify
+  INVALID_RM
+};
+
+// Maps an element width in bits (8, 16, 32 or 64) to the unsigned
+// integer type of exactly that width, exposed as type_usew_t<N>::type.
+// The primary template is only declared, so any other width fails
+// to compile.
+template<uint64_t N>
+struct type_usew_t;
+
+template<>
+struct type_usew_t<8> {
+  typedef uint8_t type;
+};
+
+template<>
+struct type_usew_t<16> {
+  typedef uint16_t type;
+};
+
+template<>
+struct type_usew_t<32> {
+  typedef uint32_t type;
+};
+
+template<>
+struct type_usew_t<64> {
+  typedef uint64_t type;
+};
+
+// Maps an element width in bits (8, 16, 32 or 64) to the signed
+// integer type of exactly that width, exposed as type_sew_t<N>::type.
+// The primary template is only declared, so any other width fails
+// to compile.
+template<uint64_t N>
+struct type_sew_t;
+
+template<>
+struct type_sew_t<8> {
+  typedef int8_t type;
+};
+
+template<>
+struct type_sew_t<16> {
+  typedef int16_t type;
+};
+
+template<>
+struct type_sew_t<32> {
+  typedef int32_t type;
+};
+
+template<>
+struct type_sew_t<64> {
+  typedef int64_t type;
+};
+
 // architectural state of a RISC-V hart
 struct state_t
 {
@@ -96,6 +160,7 @@ struct state_t
 
   // control and status registers
   reg_t prv;    // TODO: Can this be an enum instead?
+  bool v;
   reg_t misa;
   reg_t mstatus;
   reg_t mepc;
@@ -116,19 +181,47 @@ struct state_t
   reg_t stvec;
   reg_t satp;
   reg_t scause;
+
+  reg_t mtval2;
+  reg_t mtinst;
+  reg_t hstatus;
+  reg_t hideleg;
+  reg_t hedeleg;
+  uint32_t hcounteren;
+  reg_t htval;
+  reg_t htinst;
+  reg_t hgatp;
+  reg_t vsstatus;
+  reg_t vstvec;
+  reg_t vsscratch;
+  reg_t vsepc;
+  reg_t vscause;
+  reg_t vstval;
+  reg_t vsatp;
+
   reg_t dpc;
-  reg_t dscratch;
+  reg_t dscratch0, dscratch1;
   dcsr_t dcsr;
   reg_t tselect;
   mcontrol_t mcontrol[num_triggers];
   reg_t tdata2[num_triggers];
+  bool debug_mode;
 
-  static const int n_pmp = 16;
-  uint8_t pmpcfg[n_pmp];
-  reg_t pmpaddr[n_pmp];
+  static const int max_pmp = 16;
+  uint8_t pmpcfg[max_pmp];
+  reg_t pmpaddr[max_pmp];
 
   uint32_t fflags;
   uint32_t frm;
+
+  // xpulphwloop
+  reg_t lpstart0;
+  reg_t lpend0;
+  reg_t lpcount0;
+  reg_t lpstart1;
+  reg_t lpend1;
+  reg_t lpcount1;
+
   bool serialized; // whether timer CSRs are in a well-defined state
 
   // When true, execute a single instruction and then enter debug mode.  This
@@ -141,6 +234,8 @@ struct state_t
 
 #ifdef RISCV_ENABLE_COMMITLOG
   commit_log_reg_t log_reg_write;
+  commit_log_mem_t log_mem_read;
+  commit_log_mem_t log_mem_write;
   reg_t last_inst_priv;
   int last_inst_xlen;
   int last_inst_flen;
@@ -153,6 +248,12 @@ typedef enum {
   OPERATION_LOAD,
 } trigger_operation_t;
 
+typedef enum {
+  // Non-letter extension ids; 65('A') ~ 90('Z') are reserved for the standard ISA letters in misa. Presumably used as extension_table indices via supports_extension() -- confirm.
+  EXT_ZFH   = 0,
+  EXT_ZVEDIV,
+} isa_extension_t;
+
 // Count number of contiguous 1 bits starting from the LSB.
 static int cto(reg_t val)
 {
@@ -166,15 +267,22 @@ static int cto(reg_t val)
 class processor_t : public abstract_device_t
 {
 public:
-  processor_t(const char* isa, simif_t* sim, uint32_t id, bool halt_on_reset=false);
+  processor_t(const char* isa, const char* priv, const char* varch,
+              simif_t* sim, uint32_t id, bool halt_on_reset,
+              FILE *log_file);
   ~processor_t();
 
   void set_debug(bool value);
   void set_histogram(bool value);
+#ifdef RISCV_ENABLE_COMMITLOG
+  void enable_log_commits();
+  bool get_log_commits_enabled() const { return log_commits_enabled; }
+#endif
   void reset();
   void step(size_t n); // run for n cycles
   void set_csr(int which, reg_t val);
-  reg_t get_csr(int which);
+  reg_t get_csr(int which, insn_t insn, bool write, bool peek = 0);
+  reg_t get_csr(int which) { return get_csr(which, insn_t(0), false, true); }
   mmu_t* get_mmu() { return mmu; }
   state_t* get_state() { return &state; }
   unsigned get_xlen() { return xlen; }
@@ -187,21 +295,26 @@ class processor_t : public abstract_device_t
   }
   extension_t* get_extension() { return ext; }
   bool supports_extension(unsigned char ext) {
-    if (ext >= 'a' && ext <= 'z') ext += 'A' - 'a';
-    return ext >= 'A' && ext <= 'Z' && ((state.misa >> (ext - 'A')) & 1);
+    if (ext >= 'A' && ext <= 'Z')
+      return ((state.misa >> (ext - 'A')) & 1);
+    else
+      return extension_table[ext];
   }
   reg_t pc_alignment_mask() {
     return ~(reg_t)(supports_extension('C') ? 0 : 2);
   }
   void check_pc_alignment(reg_t pc) {
     if (unlikely(pc & ~pc_alignment_mask()))
-      throw trap_instruction_address_misaligned(pc);
+      throw trap_instruction_address_misaligned(pc, 0, 0);
   }
   reg_t legalize_privilege(reg_t);
   void set_privilege(reg_t);
+  void set_virt(bool);
   void update_histogram(reg_t pc);
   const disassembler_t* get_disassembler() { return disassembler; }
 
+  FILE *get_log_file() { return log_file; }
+
   void register_insn(insn_desc_t);
   void register_extension(extension_t*);
 
@@ -213,13 +326,17 @@ class processor_t : public abstract_device_t
   bool debug;
   // When true, take the slow simulation path.
   bool slow_path();
-  bool halted() { return state.dcsr.cause ? true : false; }
-  bool halt_request;
+  bool halted() { return state.debug_mode; }
+  enum {
+    HR_NONE,    /* Halt request is inactive. */
+    HR_REGULAR, /* Regular halt request/debug interrupt. */
+    HR_GROUP    /* Halt requested due to halt group. */
+  } halt_request;
 
   // Return the index of a trigger that matched, or -1.
   inline int trigger_match(trigger_operation_t operation, reg_t address, reg_t data)
   {
-    if (state.dcsr.cause)
+    if (state.debug_mode)
       return -1;
 
     bool chain_ok = true;
@@ -259,7 +376,7 @@ class processor_t : public abstract_device_t
           break;
         case MATCH_NAPOT:
           {
-            reg_t mask = ~((1 << cto(state.tdata2[i])) - 1);
+            reg_t mask = ~((1 << (cto(state.tdata2[i])+1)) - 1);
             if ((value & mask) != (state.tdata2[i] & mask))
               continue;
           }
@@ -298,6 +415,11 @@ class processor_t : public abstract_device_t
 
   void trigger_updated();
 
+  void set_pmp_num(reg_t pmp_num);
+  void set_pmp_granularity(reg_t pmp_granularity);
+
+  const char* get_symbol(uint64_t addr);
+
 private:
   simif_t* sim;
   mmu_t* mmu; // main memory is always accessed via the mmu
@@ -310,7 +432,11 @@ class processor_t : public abstract_device_t
   reg_t max_isa;
   std::string isa_string;
   bool histogram_enabled;
+  bool log_commits_enabled;
+  FILE *log_file;
   bool halt_on_reset;
+  std::vector<bool> extension_table;
+  
 
   std::vector<insn_desc_t> instructions;
   std::map<reg_t,uint64_t> pc_histogram;
@@ -324,19 +450,125 @@ class processor_t : public abstract_device_t
   void disasm(insn_t insn); // disassemble and print an instruction
   int paddr_bits();
 
+  reg_t pmp_tor_mask() { return -(reg_t(1) << (lg_pmp_granularity - PMP_SHIFT)); }  // all-ones mask with the low (lg_pmp_granularity - PMP_SHIFT) bits cleared
+
   void enter_debug_mode(uint8_t cause);
 
   friend class mmu_t;
   friend class clint_t;
   friend class extension_t;
 
-  void parse_isa_string(const char* isa);
+  void parse_varch_string(const char*);
+  void parse_priv_string(const char*);
+  void parse_isa_string(const char*);
   void build_opcode_map();
   void register_base_instructions();
   insn_func_t decode_insn(insn_t insn);
 
   // Track repeated executions for processor_t::disasm()
   uint64_t last_pc, last_bits, executions;
+  reg_t n_pmp;
+  reg_t lg_pmp_granularity;
+
+public:
+  class vectorUnit_t {  // vector unit state: register-file storage plus the values returned for the vector CSRs
+    public:
+      processor_t* p;  // processor used by elt() to reach the commit log; not set by the constructor
+      void *reg_file;  // raw register-file storage; released with free() in the destructor
+      char reg_referenced[NVPR];  // elt() sets entry vReg to 1 for every register it touches
+      int setvl_count;
+      reg_t vlmax;
+      reg_t vstart, vxrm, vxsat, vl, vtype, vlenb;
+      reg_t vma, vta;
+      reg_t vediv, vsew;
+      float vflmul;
+      reg_t ELEN, VLEN;
+      bool vill;
+      bool vstart_alu;
+
+      // Returns a reference to element n of type T, counted from the start of register vReg (n may spill into following registers).
+      template<class T>
+        T& elt(reg_t vReg, reg_t n, bool is_write = false){
+          assert(vsew != 0);
+          assert((VLEN >> 3)/sizeof(T) > 0);
+          reg_t elts_per_reg = (VLEN >> 3) / (sizeof(T));  // VLEN >> 3 is the register size in bytes
+          vReg += n / elts_per_reg;
+          n = n % elts_per_reg;
+#ifdef WORDS_BIGENDIAN
+          // "V" spec 0.7.1 requires lower indices to map to lower significant
+          // bits when changing SEW, thus we need to index from the end on BE.
+  	  n ^= elts_per_reg - 1;
+#endif
+          reg_referenced[vReg] = 1;
+
+#ifdef RISCV_ENABLE_COMMITLOG
+          if (is_write)
+            p->get_state()->log_reg_write[((vReg) << 4) | 2] = {0, 0};  // key holds the register number; the low nibble 2 presumably tags vector registers -- confirm
+#endif
+
+          T *regStart = (T*)((char*)reg_file + vReg * (VLEN >> 3));
+          return regStart[n];
+        }
+    public:
+
+      void reset();
+
+      vectorUnit_t(){  // NOTE(review): only reg_file is initialized here; other members are presumably set in reset() -- confirm
+        reg_file = 0;
+      }
+
+      ~vectorUnit_t(){
+        free(reg_file);
+        reg_file = 0;
+      }
+
+      reg_t set_vl(int rd, int rs1, reg_t reqVL, reg_t newType);
+
+      reg_t get_vlen() { return VLEN; }
+      reg_t get_elen() { return ELEN; }
+      reg_t get_slen() { return VLEN; }  // SLEN is reported as VLEN
+
+      VRM get_vround_mode() {
+        return (VRM)vxrm;  // vxrm reinterpreted as a VRM enumerator, no range check
+      }
+  };
+
+  vectorUnit_t VU;
+
+
+  // PULP HW-Loop extension (xpulphwloop): state flags and CSR helpers for
+  // the two hardware loops (index 0 and 1).
+  class hwLoopUnit_t {
+    public:
+      processor_t* p;     // back-pointer to the processor; null until assigned
+      bool lp_active[2];  // per-loop flag: start < end and count > 0
+      bool any_active;    // fast-path flag consulted by handle_loops()
+
+      // Start with both loops inactive so no flag is ever read uninitialized.
+      hwLoopUnit_t() : p(0), any_active(false) {
+        lp_active[0] = lp_active[1] = false;
+      }
+
+      void set_active(int i); // handles exceptions and sets active-flags
+
+      reg_t handle_loops(reg_t pc, reg_t npc, insn_t insn); // returns next pc
+
+      // Control and Status Register access (callable by instructions).
+      // It would also be possible to use p->get_csr here, but since reading
+      // the hwloop CSRs has no side effects, direct access is fine and faster.
+      reg_t get_start(int lpNr) { return (lpNr) ? p->state.lpstart1 : p->state.lpstart0; }
+      reg_t get_end(int lpNr)   { return (lpNr) ? p->state.lpend1 : p->state.lpend0; }
+      reg_t get_count(int lpNr) { return (lpNr) ? p->state.lpcount1 : p->state.lpcount0; }
+
+      // Also used in set_csr() so that CSR writes are checked against the
+      // loop constraints as well; a violated constraint raises
+      // trap_illegal_instruction(0).
+      void set_start(int loopNr, reg_t val);
+      void set_end(int loopNr, reg_t val);
+      void set_count(int loopNr, reg_t val);
+  };
+  hwLoopUnit_t hwLoops;
+
 };
 
 reg_t illegal_instruction(processor_t* p, insn_t insn, reg_t pc);
diff --git a/riscv/remote_bitbang.cc b/riscv/remote_bitbang.cc
index 21306dd166..8453e85abd 100644
--- a/riscv/remote_bitbang.cc
+++ b/riscv/remote_bitbang.cc
@@ -5,6 +5,13 @@
 #include <string.h>
 #include <unistd.h>
 
+#ifndef AF_INET
+#include <sys/socket.h>
+#endif
+#ifndef INADDR_ANY
+#include <netinet/in.h>
+#endif
+
 #include <algorithm>
 #include <cassert>
 #include <cstdio>
diff --git a/riscv/riscv.ac b/riscv/riscv.ac
index 68bcdb55d1..64693e9144 100644
--- a/riscv/riscv.ac
+++ b/riscv/riscv.ac
@@ -6,21 +6,24 @@ AC_ARG_WITH(isa,
   AC_DEFINE_UNQUOTED([DEFAULT_ISA], "$withval", [Default value for --isa switch]),
   AC_DEFINE_UNQUOTED([DEFAULT_ISA], "RV64IMAFDC", [Default value for --isa switch]))
 
-AC_SEARCH_LIBS([dlopen], [dl dld], [], [
-  AC_MSG_ERROR([unable to find the dlopen() function])
+AC_ARG_WITH(priv,
+	[AS_HELP_STRING([--with-priv=MSU],
+		[Sets the default RISC-V privilege modes supported])],
+  AC_DEFINE_UNQUOTED([DEFAULT_PRIV], "$withval", [Default value for --priv switch]),
+  AC_DEFINE_UNQUOTED([DEFAULT_PRIV], "MSU", [Default value for --priv switch]))
+
+AC_ARG_WITH(varch,
+	[AS_HELP_STRING([--with-varch=vlen:128,elen:64,slen:128],
+		[Sets the default vector config])],
+  AC_DEFINE_UNQUOTED([DEFAULT_VARCH], "$withval", [Default value for --varch switch]),
+  AC_DEFINE_UNQUOTED([DEFAULT_VARCH], ["vlen:128,elen:64,slen:128"], [Default value for --varch switch]))
+
+
+AC_SEARCH_LIBS([dlopen], [dl dld], [
+  AC_DEFINE([HAVE_DLOPEN], [], [Dynamic library loading is supported])
+  AC_SUBST([HAVE_DLOPEN], [yes])
 ])
 
-AC_ARG_WITH([fesvr],
-  [AS_HELP_STRING([--with-fesvr],
-    [path to your fesvr installation if not in a standard location])],
-  [
-    LDFLAGS="-L$withval/lib $LDFLAGS"
-    CPPFLAGS="-I$withval/include $CPPFLAGS"
-  ]
-)
-
-AC_CHECK_LIB(fesvr, libfesvr_is_present, [], [AC_MSG_ERROR([libfesvr is required])], [-pthread])
-
 AC_CHECK_LIB(pthread, pthread_create, [], [AC_MSG_ERROR([libpthread is required])])
 
 AC_ARG_ENABLE([commitlog], AS_HELP_STRING([--enable-commitlog], [Enable commit log generation]))
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 80755e711c..d547a1efac 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -2,10 +2,13 @@ get_insn_list = $(shell grep ^DECLARE_INSN $(1) | sed 's/DECLARE_INSN(\(.*\),.*,
 get_opcode = $(shell grep ^DECLARE_INSN.*\\\<$(2)\\\> $(1) | sed 's/DECLARE_INSN(.*,\(.*\),.*)/\1/')
 
 riscv_subproject_deps = \
+	fdt \
 	softfloat \
 
 riscv_install_prog_srcs = \
 
+riscv_CFLAGS = -fPIC
+
 riscv_hdrs = \
 	common.h \
 	decode.h \
@@ -20,16 +23,18 @@ riscv_hdrs = \
 	encoding.h \
 	cachesim.h \
 	memtracer.h \
+	mmio_plugin.h \
 	tracer.h \
 	extension.h \
 	rocc.h \
 	insn_template.h \
-	mulhi.h \
 	debug_module.h \
 	debug_rom_defines.h \
 	remote_bitbang.h \
 	jtag_dtm.h \
 
+riscv_install_hdrs = mmio_plugin.h
+
 riscv_precompiled_hdrs = \
 	insn_template.h \
 
@@ -46,7 +51,6 @@ riscv_srcs = \
 	extension.cc \
 	extensions.cc \
 	rocc.cc \
-	regnames.cc \
 	devices.cc \
 	rom.cc \
 	clint.cc \
@@ -61,11 +65,61 @@ riscv_gen_hdrs = \
 	icache.h \
 	insn_list.h \
 
-riscv_insn_list = \
+
+riscv_insn_ext_i = \
 	add \
 	addi \
 	addiw \
 	addw \
+	and \
+	andi \
+	auipc \
+	beq \
+	bge \
+	bgeu \
+	blt \
+	bltu \
+	bne \
+	jal \
+	jalr \
+	lb \
+	lbu \
+	ld \
+	lh \
+	lhu \
+	lui \
+	lw \
+	lwu \
+	or \
+	ori \
+	sb \
+	sd \
+	sh \
+	sll \
+	slli \
+	slliw \
+	sllw \
+	slt \
+	slti \
+	sltiu \
+	sltu \
+	sra \
+	srai \
+	sraiw \
+	sraw \
+	srl \
+	srli \
+	srliw \
+	srlw \
+	sub \
+	subw \
+	sw \
+	xor \
+	xori \
+	fence \
+	fence_i \
+
+riscv_insn_ext_a = \
 	amoadd_d \
 	amoadd_w \
 	amoand_d \
@@ -84,18 +138,15 @@ riscv_insn_list = \
 	amoswap_w \
 	amoxor_d \
 	amoxor_w \
-	and \
-	andi \
-	auipc \
-	beq \
-	bge \
-	bgeu \
-	blt \
-	bltu \
-	bne \
+	lr_d \
+	lr_w \
+	sc_d \
+	sc_w \
+
+riscv_insn_ext_c = \
 	c_add \
-	c_addi4spn \
 	c_addi \
+	c_addi4spn \
 	c_addw \
 	c_and \
 	c_andi \
@@ -110,9 +161,9 @@ riscv_insn_list = \
 	c_fsdsp \
 	c_fsw \
 	c_fswsp \
+	c_j \
 	c_jal \
 	c_jalr \
-	c_j \
 	c_jr \
 	c_li \
 	c_lui \
@@ -125,28 +176,60 @@ riscv_insn_list = \
 	c_srli \
 	c_sub \
 	c_subw \
-	c_xor \
-	csrrc \
-	csrrci \
-	csrrs \
-	csrrsi \
-	csrrw \
-	csrrwi \
 	c_sw \
 	c_swsp \
+	c_xor \
+
+riscv_insn_ext_m = \
 	div \
 	divu \
 	divuw \
 	divw \
-	dret \
-	ebreak \
-	ecall \
-	fadd_d \
-	fadd_q \
+	mul \
+	mulh \
+	mulhsu \
+	mulhu \
+	mulw \
+	rem \
+	remu \
+	remuw \
+	remw \
+
+riscv_insn_ext_f = \
 	fadd_s \
-	fclass_d \
-	fclass_q \
 	fclass_s \
+	fcvt_l_s \
+	fcvt_lu_s \
+	fcvt_s_l \
+	fcvt_s_lu \
+	fcvt_s_w \
+	fcvt_s_wu \
+	fcvt_w_s \
+	fcvt_wu_s \
+	fdiv_s \
+	feq_s \
+	fle_s \
+	flt_s \
+	flw \
+	fmadd_s \
+	fmax_s \
+	fmin_s \
+	fmsub_s \
+	fmul_s \
+	fmv_w_x \
+	fmv_x_w \
+	fnmadd_s \
+	fnmsub_s \
+	fsgnj_s \
+	fsgnjn_s \
+	fsgnjx_s \
+	fsqrt_s \
+	fsub_s \
+	fsw \
+
+riscv_insn_ext_d = \
+	fadd_d \
+	fclass_d \
 	fcvt_d_l \
 	fcvt_d_lu \
 	fcvt_d_q \
@@ -154,142 +237,952 @@ riscv_insn_list = \
 	fcvt_d_w \
 	fcvt_d_wu \
 	fcvt_l_d \
-	fcvt_l_q \
-	fcvt_l_s \
 	fcvt_lu_d \
+	fcvt_s_d \
+	fcvt_w_d \
+	fcvt_wu_d \
+	fdiv_d \
+	feq_d \
+	fld \
+	fle_d \
+	flt_d \
+	fmadd_d \
+	fmax_d \
+	fmin_d \
+	fmsub_d \
+	fmul_d \
+	fmv_d_x \
+	fmv_x_d \
+	fnmadd_d \
+	fnmsub_d \
+	fsd \
+	fsgnj_d \
+	fsgnjn_d \
+	fsgnjx_d \
+	fsqrt_d \
+	fsub_d \
+
+# Disabled riscv_insn_ext_zfh instructions (kept out of the list, because a '#'
+# inside a backslash-continued list comments out all following entries): fcvt_h_q fcvt_q_h
+riscv_insn_ext_zfh = \
+	fadd_h \
+	fclass_h \
+	fcvt_l_h \
+	fcvt_lu_h \
+	fcvt_d_h \
+	fcvt_h_d \
+	fcvt_h_l \
+	fcvt_h_lu \
+	fcvt_h_s \
+	fcvt_h_w \
+	fcvt_h_wu \
+	fcvt_s_h \
+	fcvt_w_h \
+	fcvt_wu_h \
+	fdiv_h \
+	feq_h \
+	fle_h \
+	flh \
+	flt_h \
+	fmadd_h \
+	fmax_h \
+	fmin_h \
+	fmsub_h \
+	fmul_h \
+	fmv_h_x \
+	fmv_x_h \
+	fnmadd_h \
+	fnmsub_h \
+	fsgnj_h \
+	fsgnjn_h \
+	fsgnjx_h \
+	fsh \
+	fsqrt_h \
+	fsub_h \
+
+riscv_insn_ext_q = \
+	fadd_q \
+	fclass_q \
+	fcvt_l_q \
 	fcvt_lu_q \
-	fcvt_lu_s \
 	fcvt_q_d \
 	fcvt_q_l \
 	fcvt_q_lu \
 	fcvt_q_s \
 	fcvt_q_w \
 	fcvt_q_wu \
-	fcvt_s_d \
-	fcvt_s_l \
-	fcvt_s_lu \
 	fcvt_s_q \
-	fcvt_s_w \
-	fcvt_s_wu \
-	fcvt_w_d \
 	fcvt_w_q \
-	fcvt_w_s \
-	fcvt_wu_d \
 	fcvt_wu_q \
-	fcvt_wu_s \
-	fdiv_d \
 	fdiv_q \
-	fdiv_s \
-	fence \
-	fence_i \
-	feq_d \
 	feq_q \
-	feq_s \
-	fld \
-	fle_d \
 	fle_q \
-	fle_s \
 	flq \
-	flt_d \
 	flt_q \
-	flt_s \
-	flw \
-	fmadd_d \
 	fmadd_q \
-	fmadd_s \
-	fmax_d \
 	fmax_q \
-	fmax_s \
-	fmin_d \
 	fmin_q \
-	fmin_s \
-	fmsub_d \
 	fmsub_q \
-	fmsub_s \
-	fmul_d \
 	fmul_q \
-	fmul_s \
-	fmv_d_x \
-	fmv_w_x \
-	fmv_x_d \
-	fmv_x_w \
-	fnmadd_d \
 	fnmadd_q \
-	fnmadd_s \
-	fnmsub_d \
 	fnmsub_q \
-	fnmsub_s \
-	fsd \
-	fsgnj_d \
 	fsgnj_q \
-	fsgnjn_d \
 	fsgnjn_q \
-	fsgnjn_s \
-	fsgnj_s \
-	fsgnjx_d \
 	fsgnjx_q \
-	fsgnjx_s \
 	fsq \
-	fsqrt_d \
 	fsqrt_q \
-	fsqrt_s \
-	fsub_d \
 	fsub_q \
-	fsub_s \
-	fsw \
-	jal \
-	jalr \
-	lb \
-	lbu \
-	ld \
-	lh \
-	lhu \
-	lr_d \
-	lr_w \
-	lui \
-	lw \
-	lwu \
+
+# Disabled riscv_insn_ext_v_alu_int instructions for opcode overlap:
+#vasubu_vx
+#vslide1up_vx
+#vaaddu_vx
+#vadc_vvm
+#vadc_vxm
+#vsbc_vvm
+#vsbc_vxm
+#vmulhu_vx
+#vdivu_vx
+#vmulhsu_vx
+
+riscv_insn_ext_v_alu_int = \
+	vaadd_vv \
+	vaaddu_vv \
+	vaadd_vx \
+	vadc_vim \
+	vadd_vi \
+	vadd_vv \
+	vadd_vx \
+	vand_vi \
+	vand_vv \
+	vand_vx \
+	vasub_vv \
+	vasubu_vv \
+	vasub_vx \
+	vcompress_vm \
+	vdiv_vv \
+	vdiv_vx \
+	vdivu_vv \
+	vdot_vv \
+	vdotu_vv \
+	vid_v \
+	viota_m \
+	vmacc_vv \
+	vmacc_vx \
+	vmadc_vim \
+	vmadc_vvm \
+	vmadc_vxm \
+	vmadd_vv \
+	vmadd_vx \
+	vmand_mm \
+	vmandnot_mm \
+	vmax_vv \
+	vmax_vx \
+	vmaxu_vv \
+	vmaxu_vx \
+	vmerge_vim \
+	vmerge_vvm \
+	vmerge_vxm \
+	vfirst_m \
+	vmin_vv \
+	vmin_vx \
+	vminu_vv \
+	vminu_vx \
+	vmnand_mm \
+	vmnor_mm \
+	vmor_mm \
+	vmornot_mm \
+	vpopc_m \
+	vmsbc_vvm \
+	vmsbc_vxm \
+	vmsbf_m \
+	vmseq_vi \
+	vmseq_vv \
+	vmseq_vx \
+	vmsgt_vi \
+	vmsgt_vx \
+	vmsgtu_vi \
+	vmsgtu_vx \
+	vmsif_m \
+	vmsle_vi \
+	vmsle_vv \
+	vmsle_vx \
+	vmsleu_vi \
+	vmsleu_vv \
+	vmsleu_vx \
+	vmslt_vv \
+	vmslt_vx \
+	vmsltu_vv \
+	vmsltu_vx \
+	vmsne_vi \
+	vmsne_vv \
+	vmsne_vx \
+	vmsof_m \
+	vmul_vv \
+	vmul_vx \
+	vmulh_vv \
+	vmulh_vx \
+	vmulhsu_vv \
+	vmulhu_vv \
+	vmv_s_x \
+	vmv_v_i \
+	vmv_v_v \
+	vmv_v_x \
+	vmv_x_s \
+	vmv1r_v \
+	vmv2r_v \
+	vmv4r_v \
+	vmv8r_v \
+	vmxnor_mm \
+	vmxor_mm \
+	vnclip_wi \
+	vnclip_wv \
+	vnclip_wx \
+	vnclipu_wi \
+	vnclipu_wv \
+	vnclipu_wx \
+	vnmsac_vv \
+	vnmsac_vx \
+	vnmsub_vv \
+	vnmsub_vx \
+	vnsra_wi \
+	vnsra_wv \
+	vnsra_wx \
+	vnsrl_wi \
+	vnsrl_wv \
+	vnsrl_wx \
+	vor_vi \
+	vor_vv \
+	vor_vx \
+	vredand_vs \
+	vredmax_vs \
+	vredmaxu_vs \
+	vredmin_vs \
+	vredminu_vs \
+	vredor_vs \
+	vredsum_vs \
+	vredxor_vs \
+	vrem_vv \
+	vrem_vx \
+	vremu_vv \
+	vremu_vx \
+	vrgather_vi \
+	vrgather_vv \
+	vrgather_vx \
+	vrgatherei16_vv \
+	vrsub_vi \
+	vrsub_vx \
+	vsadd_vi \
+	vsadd_vv \
+	vsadd_vx \
+	vsaddu_vi \
+	vsaddu_vv \
+	vsaddu_vx \
+	vsext_vf2 \
+	vsext_vf4 \
+	vsext_vf8 \
+	vslide1down_vx \
+	vslidedown_vi \
+	vslidedown_vx \
+	vslideup_vi \
+	vslideup_vx \
+	vsll_vi \
+	vsll_vv \
+	vsll_vx \
+	vsmul_vv \
+	vsmul_vx \
+	vsra_vi \
+	vsra_vv \
+	vsra_vx \
+	vsrl_vi \
+	vsrl_vv \
+	vsrl_vx \
+	vssra_vi \
+	vssra_vv \
+	vssra_vx \
+	vssrl_vi \
+	vssrl_vv \
+	vssrl_vx \
+	vssub_vv \
+	vssub_vx \
+	vssubu_vv \
+	vssubu_vx \
+	vsub_vv \
+	vsub_vx \
+	vwadd_vv \
+	vwadd_vx \
+	vwadd_wv \
+	vwadd_wx \
+	vwaddu_vv \
+	vwaddu_vx \
+	vwaddu_wv \
+	vwaddu_wx \
+	vwmacc_vv \
+	vwmacc_vx \
+	vwmaccsu_vv \
+	vwmaccsu_vx \
+	vwmaccu_vv \
+	vwmaccu_vx \
+	vwmaccus_vx \
+	vwmul_vv \
+	vwmul_vx \
+	vwmulsu_vv \
+	vwmulsu_vx \
+	vwmulu_vv \
+	vwmulu_vx \
+	vwredsum_vs \
+	vwredsumu_vs \
+	vwsub_vv \
+	vwsub_vx \
+	vwsub_wv \
+	vwsub_wx \
+	vwsubu_vv \
+	vwsubu_vx \
+	vwsubu_wv \
+	vwsubu_wx \
+	vxor_vi \
+	vxor_vv \
+	vxor_vx \
+	vzext_vf2 \
+	vzext_vf4 \
+	vzext_vf8 \
+
+# Disabled riscv_insn_ext_v_alu_fp instructions for opcode overlap:
+#vfcvt_x_f_v
+
+riscv_insn_ext_v_alu_fp = \
+	vfadd_vf \
+	vfadd_vv \
+	vfclass_v \
+	vfcvt_f_x_v \
+	vfcvt_f_xu_v \
+	vfcvt_rtz_x_f_v \
+	vfcvt_rtz_xu_f_v \
+	vfcvt_xu_f_v \
+	vfdiv_vf \
+	vfdiv_vv \
+	vfdot_vv \
+	vfmacc_vf \
+	vfmacc_vv \
+	vfmadd_vf \
+	vfmadd_vv \
+	vfmax_vf \
+	vfmax_vv \
+	vfmerge_vfm \
+	vfmin_vf \
+	vfmin_vv \
+	vfmsac_vf \
+	vfmsac_vv \
+	vfmsub_vf \
+	vfmsub_vv \
+	vfmul_vf \
+	vfmul_vv \
+	vfmv_f_s \
+	vfmv_s_f \
+	vfmv_v_f \
+	vfncvt_f_f_w \
+	vfncvt_f_x_w \
+	vfncvt_f_xu_w \
+	vfncvt_rod_f_f_w \
+	vfncvt_rtz_x_f_w \
+	vfncvt_rtz_xu_f_w \
+	vfncvt_x_f_w \
+	vfncvt_xu_f_w \
+	vfnmacc_vf \
+	vfnmacc_vv \
+	vfnmadd_vf \
+	vfnmadd_vv \
+	vfnmsac_vf \
+	vfnmsac_vv \
+	vfnmsub_vf \
+	vfnmsub_vv \
+	vfrdiv_vf \
+	vfredmax_vs \
+	vfredmin_vs \
+	vfredosum_vs \
+	vfredsum_vs \
+	vfrece7_v \
+	vfrsub_vf \
+	vfrsqrte7_v \
+	vfsgnj_vf \
+	vfsgnj_vv \
+	vfsgnjn_vf \
+	vfsgnjn_vv \
+	vfsgnjx_vf \
+	vfsgnjx_vv \
+	vfsqrt_v \
+	vfslide1down_vf \
+	vfslide1up_vf \
+	vfsub_vf \
+	vfsub_vv \
+	vfwadd_vf \
+	vfwadd_vv \
+	vfwadd_wf \
+	vfwadd_wv \
+	vfwcvt_f_f_v \
+	vfwcvt_f_x_v \
+	vfwcvt_f_xu_v \
+	vfwcvt_rtz_x_f_v \
+	vfwcvt_rtz_xu_f_v \
+	vfwcvt_x_f_v \
+	vfwcvt_xu_f_v \
+	vfwmacc_vf \
+	vfwmacc_vv \
+	vfwmsac_vf \
+	vfwmsac_vv \
+	vfwmul_vf \
+	vfwmul_vv \
+	vfwnmacc_vf \
+	vfwnmacc_vv \
+	vfwnmsac_vf \
+	vfwnmsac_vv \
+	vfwredosum_vs \
+	vfwredsum_vs \
+	vfwsub_vf \
+	vfwsub_vv \
+	vfwsub_wf \
+	vfwsub_wv \
+	vmfeq_vf \
+	vmfeq_vv \
+	vmfge_vf \
+	vmfgt_vf \
+	vmfle_vf \
+	vmfle_vv \
+	vmflt_vf \
+	vmflt_vv \
+	vmfne_vf \
+	vmfne_vv \
+
+riscv_insn_ext_v_amo = \
+	vamoswapei8_v \
+	vamoaddei8_v \
+	vamoandei8_v \
+	vamomaxei8_v \
+	vamomaxuei8_v \
+	vamominei8_v \
+	vamominuei8_v \
+	vamoorei8_v \
+	vamoxorei8_v \
+	vamoswapei16_v \
+	vamoaddei16_v \
+	vamoandei16_v \
+	vamomaxei16_v \
+	vamomaxuei16_v \
+	vamominei16_v \
+	vamominuei16_v \
+	vamoorei16_v \
+	vamoxorei16_v \
+	vamoswapei32_v \
+	vamoaddei32_v \
+	vamoandei32_v \
+	vamomaxei32_v \
+	vamomaxuei32_v \
+	vamominei32_v \
+	vamominuei32_v \
+	vamoorei32_v \
+	vamoxorei32_v \
+	vamoswapei64_v \
+	vamoaddei64_v \
+	vamoandei64_v \
+	vamomaxei64_v \
+	vamomaxuei64_v \
+	vamominei64_v \
+	vamominuei64_v \
+	vamoorei64_v \
+	vamoxorei64_v \
+
+riscv_insn_ext_v_ldst = \
+	vle8_v \
+	vle16_v \
+	vle32_v \
+	vle64_v \
+	vlse8_v \
+	vlse16_v \
+	vlse32_v \
+	vlse64_v \
+	vlxei8_v \
+	vlxei16_v \
+	vlxei32_v \
+	vlxei64_v \
+	vle8ff_v \
+	vle16ff_v \
+	vle32ff_v \
+	vle64ff_v \
+	vl1re8_v \
+	vl2re8_v \
+	vl4re8_v \
+	vl8re8_v \
+	vl1re16_v \
+	vl2re16_v \
+	vl4re16_v \
+	vl8re16_v \
+	vl1re32_v \
+	vl2re32_v \
+	vl4re32_v \
+	vl8re32_v \
+	vl1re64_v \
+	vl2re64_v \
+	vl4re64_v \
+	vl8re64_v \
+	vse8_v \
+	vse16_v \
+	vse32_v \
+	vse64_v \
+	vsse8_v \
+	vsse16_v \
+	vsse32_v \
+	vsse64_v \
+	vsxei8_v \
+	vsxei16_v \
+	vsxei32_v \
+	vsxei64_v \
+	vsuxei8_v \
+	vsuxei16_v \
+	vsuxei32_v \
+	vsuxei64_v \
+	vs1r_v \
+	vs2r_v \
+	vs4r_v \
+	vs8r_v \
+
+# Disabled riscv_insn_ext_v_ctrl instructions for opcode overlap:
+#vsetvl
+
+riscv_insn_ext_v_ctrl = \
+	vsetvli \
+
+riscv_insn_ext_v = \
+	$(riscv_insn_ext_v_alu_fp) \
+	$(riscv_insn_ext_v_alu_int) \
+	$(riscv_insn_ext_v_amo) \
+	$(riscv_insn_ext_v_ctrl) \
+	$(riscv_insn_ext_v_ldst) \
+	
+	
+riscv_insn_ext_pulphwloop = \
+	lp_starti \
+	lp_endi \
+	lp_count \
+	lp_counti \
+	lp_setup \
+	lp_setupi \
+	
+riscv_insn_ext_pulppostmod = \
+	p_lb_irpost \
+	p_lbu_irpost \
+	p_lh_irpost \
+	p_lhu_irpost \
+	p_lw_irpost \
+	p_lb_rrpost \
+	p_lbu_rrpost \
+	p_lh_rrpost \
+	p_lhu_rrpost \
+	p_lw_rrpost \
+	p_lb_rr \
+	p_lbu_rr \
+	p_lh_rr \
+	p_lhu_rr \
+	p_lw_rr \
+	p_sb_irpost \
+	p_sh_irpost \
+	p_sw_irpost \
+	p_sb_rrpost \
+	p_sh_rrpost \
+	p_sw_rrpost \
+	p_sb_rr \
+	p_sh_rr \
+	p_sw_rr \
+
+riscv_insn_ext_pulpabs = \
+	p_abs \
+
+riscv_insn_ext_pulpslet = \
+	p_slet \
+	p_sletu \
+	
+riscv_insn_ext_pulpmacsi = \
+	p_mac \
+	p_msu \
+	
+riscv_insn_ext_pulpmulrnhi = \
+  p_mulsN \
+  p_mulsRN \
+  p_muluN \
+  p_muluRN \
+  p_mulhhsN \
+  p_mulhhuN \
+  p_mulhhsRN \
+  p_mulhhuRN \
+	
+riscv_insn_ext_pulpmacrnhi = \
+  p_macsN \
+  p_macuN \
+  p_macsRN \
+  p_macuRN \
+  p_machhsN \
+  p_machhuN \
+  p_machhsRN \
+  p_machhuRN \
+
+riscv_insn_ext_pulppartmac = \
+	p_macs \
+	p_macu \
+	p_machhs \
+	p_machhu \
+	
+riscv_insn_ext_pulpminmax = \
+	p_min \
+	p_max \
+	p_minu \
+	p_maxu \
+	
+riscv_insn_ext_pulpbitopsmall = \
+  p_cnt \
+  p_clb \
+  p_fl1 \
+  p_ff1 \
+  p_ror \
+  p_exths \
+  p_exthz \
+  p_extbs \
+  p_extbz \
+  
+riscv_insn_ext_pulpbitop = \
+  $(riscv_insn_ext_pulpbitopsmall) \
+  p_extract \
+  p_extractr \
+  p_extractu \
+  p_extractur \
+  p_insert \
+  p_insertr \
+  p_bset \
+  p_bsetr \
+  p_bclr \
+  p_bclrr \
+	
+riscv_insn_ext_pulpvect = \
+  pv_add_h \
+	pv_add_sc_h \
+	pv_add_sci_h \
+	pv_add_b \
+	pv_add_sc_b \
+	pv_add_sci_b \
+	pv_add_h_div2 \
+	pv_add_h_div4 \
+	pv_add_h_div8 \
+	pv_sub_h \
+	pv_sub_sc_h \
+	pv_sub_sci_h \
+	pv_sub_b \
+	pv_sub_sc_b \
+	pv_sub_sci_b \
+	pv_sub_h_div2 \
+	pv_sub_h_div4 \
+	pv_sub_h_div8 \
+	pv_avg_h \
+	pv_avg_sc_h \
+	pv_avg_sci_h \
+	pv_avg_b \
+	pv_avg_sc_b \
+	pv_avg_sci_b \
+	pv_avgu_h \
+	pv_avgu_sc_h \
+	pv_avgu_sci_h \
+	pv_avgu_b \
+	pv_avgu_sc_b \
+	pv_avgu_sci_b \
+	pv_min_h \
+	pv_min_sc_h \
+	pv_min_sci_h \
+	pv_min_b \
+	pv_min_sc_b \
+	pv_min_sci_b \
+	pv_minu_h \
+	pv_minu_sc_h \
+	pv_minu_sci_h \
+	pv_minu_b \
+	pv_minu_sc_b \
+	pv_minu_sci_b \
+	pv_max_h \
+	pv_max_sc_h \
+	pv_max_sci_h \
+	pv_max_b \
+	pv_max_sc_b \
+	pv_max_sci_b \
+	pv_maxu_h \
+	pv_maxu_sc_h \
+	pv_maxu_sci_h \
+	pv_maxu_b \
+	pv_maxu_sc_b \
+	pv_maxu_sci_b \
+	pv_srl_h \
+	pv_srl_sc_h \
+	pv_srl_sci_h \
+	pv_srl_b \
+	pv_srl_sc_b \
+	pv_srl_sci_b \
+	pv_sra_h \
+	pv_sra_sc_h \
+	pv_sra_sci_h \
+	pv_sra_b \
+	pv_sra_sc_b \
+	pv_sra_sci_b \
+	pv_sll_h \
+	pv_sll_sc_h \
+	pv_sll_sci_h \
+	pv_sll_b \
+	pv_sll_sc_b \
+	pv_sll_sci_b \
+	pv_or_h \
+	pv_or_sc_h \
+	pv_or_sci_h \
+	pv_or_b \
+	pv_or_sc_b \
+	pv_or_sci_b \
+	pv_xor_h \
+	pv_xor_sc_h \
+	pv_xor_sci_h \
+	pv_xor_b \
+	pv_xor_sc_b \
+	pv_xor_sci_b \
+	pv_and_h \
+	pv_and_sc_h \
+	pv_and_sci_h \
+	pv_and_b \
+	pv_and_sc_b \
+	pv_and_sci_b \
+	pv_abs_h \
+	pv_abs_b \
+	pv_extract_h \
+	pv_extract_b \
+	pv_extractu_h \
+	pv_extractu_b \
+	pv_insert_h \
+	pv_insert_b \
+	pv_dotup_h \
+	pv_dotup_sc_h \
+	pv_dotup_sci_h \
+	pv_dotup_b \
+	pv_dotup_sc_b \
+	pv_dotup_sci_b \
+	pv_dotusp_h \
+	pv_dotusp_sc_h \
+	pv_dotusp_sci_h \
+	pv_dotusp_b \
+	pv_dotusp_sc_b \
+	pv_dotusp_sci_b \
+	pv_dotsp_h \
+	pv_dotsp_sc_h \
+	pv_dotsp_sci_h \
+	pv_dotsp_b \
+	pv_dotsp_sc_b \
+	pv_dotsp_sci_b \
+	pv_sdotup_h \
+	pv_sdotup_sc_h \
+	pv_sdotup_sci_h \
+	pv_sdotup_b \
+	pv_sdotup_sc_b \
+	pv_sdotup_sci_b \
+	pv_sdotusp_h \
+	pv_sdotusp_sc_h \
+	pv_sdotusp_sci_h \
+	pv_sdotusp_b \
+	pv_sdotusp_sc_b \
+	pv_sdotusp_sci_b \
+	pv_sdotsp_h \
+	pv_sdotsp_sc_h \
+	pv_sdotsp_sci_h \
+	pv_sdotsp_b \
+	pv_sdotsp_sc_b \
+	pv_sdotsp_sci_b \
+	pv_cmpeq_h \
+	pv_cmpeq_sc_h \
+	pv_cmpeq_sci_h \
+	pv_cmpeq_b \
+	pv_cmpeq_sc_b \
+	pv_cmpeq_sci_b \
+	pv_cmpne_h \
+	pv_cmpne_sc_h \
+	pv_cmpne_sci_h \
+	pv_cmpne_b \
+	pv_cmpne_sc_b \
+	pv_cmpne_sci_b \
+	pv_cmpgt_h \
+	pv_cmpgt_sc_h \
+	pv_cmpgt_sci_h \
+	pv_cmpgt_b \
+	pv_cmpgt_sc_b \
+	pv_cmpgt_sci_b \
+	pv_cmpge_h \
+	pv_cmpge_sc_h \
+	pv_cmpge_sci_h \
+	pv_cmpge_b \
+	pv_cmpge_sc_b \
+	pv_cmpge_sci_b \
+	pv_cmplt_h \
+	pv_cmplt_sc_h \
+	pv_cmplt_sci_h \
+	pv_cmplt_b \
+	pv_cmplt_sc_b \
+	pv_cmplt_sci_b \
+	pv_cmple_h \
+	pv_cmple_sc_h \
+	pv_cmple_sci_h \
+	pv_cmple_b \
+	pv_cmple_sc_b \
+	pv_cmple_sci_b \
+	pv_cmpgtu_h \
+	pv_cmpgtu_sc_h \
+	pv_cmpgtu_sci_h \
+	pv_cmpgtu_b \
+	pv_cmpgtu_sc_b \
+	pv_cmpgtu_sci_b \
+	pv_cmpgeu_h \
+	pv_cmpgeu_sc_h \
+	pv_cmpgeu_sci_h \
+	pv_cmpgeu_b \
+	pv_cmpgeu_sc_b \
+	pv_cmpgeu_sci_b \
+	pv_cmpltu_h \
+	pv_cmpltu_sc_h \
+	pv_cmpltu_sci_h \
+	pv_cmpltu_b \
+	pv_cmpltu_sc_b \
+	pv_cmpltu_sci_b \
+	pv_cmpleu_h \
+	pv_cmpleu_sc_h \
+	pv_cmpleu_sci_h \
+	pv_cmpleu_b \
+	pv_cmpleu_sc_b \
+	pv_cmpleu_sci_b \
+	
+riscv_insn_ext_pulpvectcomplex = \
+  pv_cplxconj_h \
+	pv_subrotmj_h \
+	pv_subrotmj_h_div2 \
+	pv_subrotmj_h_div4 \
+	pv_subrotmj_h_div8 \
+	pv_cplxmul_h_r \
+	pv_cplxmul_h_r_div2 \
+	pv_cplxmul_h_r_div4 \
+	pv_cplxmul_h_r_div8 \
+	pv_cplxmul_h_i \
+	pv_cplxmul_h_i_div2 \
+	pv_cplxmul_h_i_div4 \
+	pv_cplxmul_h_i_div8 \
+	
+riscv_insn_ext_pulpvectshufflepack = \
+  pv_shuffle_h \
+  pv_shuffle_sci_h \
+  pv_shuffle_b \
+  pv_shufflei0_sci_b \
+  pv_shufflei1_sci_b \
+  pv_shufflei2_sci_b \
+  pv_shufflei3_sci_b \
+  pv_shuffle2_h \
+  pv_shuffle2_b \
+  pv_pack \
+  pv_pack_h \
+  pv_packhi_b \
+  pv_packlo_b \
+
+riscv_insn_ext_pulpclip = \
+  p_clip \
+	p_clipu \
+	p_clipr \
+	p_clipur \
+	
+riscv_insn_ext_pulpaddsubrn = \
+  p_addN \
+  p_adduN \
+  p_addRN \
+  p_adduRN \
+  p_subN \
+  p_subuN \
+  p_subRN \
+  p_subuRN \
+  p_addNr \
+  p_adduNr \
+  p_addRNr \
+  p_adduRNr \
+  p_subNr \
+  p_subuNr \
+  p_subRNr \
+  p_subuRNr \
+
+riscv_insn_ext_pulpbr = \
+  p_beqimm \
+  p_bneimm \
+
+riscv_insn_ext_pulpbitrev = \
+  p_bitrev \
+
+riscv_insn_ext_pulpimg = \
+	$(riscv_insn_ext_pulphwloop) \
+	$(riscv_insn_ext_pulppostmod) \
+	$(riscv_insn_ext_pulpabs) \
+	$(riscv_insn_ext_pulpslet) \
+	$(riscv_insn_ext_pulpmacsi) \
+	$(riscv_insn_ext_pulpmulrnhi) \
+	$(riscv_insn_ext_pulpmacrnhi) \
+	$(riscv_insn_ext_pulpminmax) \
+	$(riscv_insn_ext_pulpbitop) \
+	$(riscv_insn_ext_pulpvect) \
+	$(riscv_insn_ext_pulpvectcomplex) \
+	$(riscv_insn_ext_pulpvectshufflepack) \
+	$(riscv_insn_ext_pulpclip) \
+	$(riscv_insn_ext_pulpaddsubrn) \
+	$(riscv_insn_ext_pulpbr) \
+	$(riscv_insn_ext_pulpbitrev) \
+# disabled: $(riscv_insn_ext_pulppartmac)  (no trailing backslash, so this comment cannot swallow the next line)
+
+
+riscv_insn_ext_h = \
+	hfence_gvma \
+	hfence_vvma \
+	hlv_b \
+	hlv_bu \
+	hlv_h \
+	hlv_hu \
+	hlvx_hu \
+	hlv_w \
+	hlv_wu \
+	hlvx_wu \
+	hlv_d \
+	hsv_b \
+	hsv_h \
+	hsv_w \
+	hsv_d \
+
+riscv_insn_priv = \
+	csrrc \
+	csrrci \
+	csrrs \
+	csrrsi \
+	csrrw \
+	csrrwi \
+	dret \
+	ebreak \
+	ecall \
 	mret \
-	mul \
-	mulh \
-	mulhsu \
-	mulhu \
-	mulw \
-	or \
-	ori \
-	rem \
-	remu \
-	remuw \
-	remw \
-	sb \
-	sc_d \
-	sc_w \
-	sd \
 	sfence_vma \
-	sh \
-	sll \
-	slli \
-	slliw \
-	sllw \
-	slt \
-	slti \
-	sltiu \
-	sltu \
-	sra \
-	srai \
-	sraiw \
-	sraw \
 	sret \
-	srl \
-	srli \
-	srliw \
-	srlw \
-	sub \
-	subw \
-	sw \
 	wfi \
-	xor \
-	xori \
+
+
+riscv_insn_list = \
+	$(riscv_insn_ext_a) \
+	$(riscv_insn_ext_c) \
+	$(riscv_insn_ext_i) \
+	$(riscv_insn_ext_m) \
+	$(riscv_insn_ext_f) \
+	$(riscv_insn_ext_d) \
+	$(riscv_insn_ext_zfh) \
+	$(riscv_insn_ext_q) \
+	$(riscv_insn_ext_pulpimg) \
+	$(riscv_insn_ext_h) \
+	$(riscv_insn_priv) \
+	# disabled: $(if $(HAVE_INT128),$(riscv_insn_ext_v),)  (no trailing backslash, so this comment cannot swallow the next line)
 
 riscv_gen_srcs = \
 	$(addsuffix .cc,$(riscv_insn_list))
@@ -311,3 +1204,4 @@ $(riscv_gen_srcs): %.cc: insns/%.h insn_template.cc
 
 riscv_junk = \
 	$(riscv_gen_srcs) \
+	
diff --git a/riscv/sim.cc b/riscv/sim.cc
index 44223a7d90..76bb3cdff0 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -4,6 +4,8 @@
 #include "mmu.h"
 #include "dts.h"
 #include "remote_bitbang.h"
+#include "byteorder.h"
+#include <fstream>
 #include <map>
 #include <iostream>
 #include <sstream>
@@ -24,42 +26,79 @@ static void handle_signal(int sig)
   signal(sig, &handle_signal);
 }
 
-sim_t::sim_t(const char* isa, size_t nprocs, bool halted, reg_t start_pc,
-             std::vector<std::pair<reg_t, mem_t*>> mems,
+sim_t::sim_t(const char* isa, const char* priv, const char* varch,
+             size_t nprocs, bool halted, bool real_time_clint,
+             reg_t initrd_start, reg_t initrd_end, const char* bootargs,
+             reg_t start_pc, std::vector<std::pair<reg_t, mem_t*>> mems,
+             std::vector<std::pair<reg_t, abstract_device_t*>> plugin_devices,
              const std::vector<std::string>& args,
-             std::vector<int> const hartids, unsigned progsize,
-             unsigned max_bus_master_bits, bool require_authentication)
-  : htif_t(args), mems(mems), procs(std::max(nprocs, size_t(1))),
-    start_pc(start_pc), current_step(0), current_proc(0), debug(false),
-    histogram_enabled(false), dtb_enabled(true), remote_bitbang(NULL),
-    debug_module(this, progsize, max_bus_master_bits, require_authentication)
+             std::vector<int> const hartids,
+             const debug_module_config_t &dm_config,
+             const char *log_path,
+             bool dtb_enabled, const char *dtb_file)
+  : htif_t(args),
+    mems(mems),
+    plugin_devices(plugin_devices),
+    procs(std::max(nprocs, size_t(1))),  // always at least one slot, even when nprocs == 0
+    initrd_start(initrd_start),
+    initrd_end(initrd_end),
+    bootargs(bootargs),
+    start_pc(start_pc),
+    dtb_file(dtb_file ? dtb_file : ""),  // a null path is stored as the empty string
+    dtb_enabled(dtb_enabled),
+    log_file(log_path),
+    current_step(0),
+    current_proc(0),
+    debug(false),
+    histogram_enabled(false),
+    log(false),
+    remote_bitbang(NULL),
+    debug_module(this, dm_config)
 {
   signal(SIGINT, &handle_signal);
 
   for (auto& x : mems)
     bus.add_device(x.first, x.second);
 
+  for (auto& x : plugin_devices)  // plugin devices are mapped on the same bus as the memories
+    bus.add_device(x.first, x.second);
+
   debug_module.add_device(&bus);
 
   debug_mmu = new mmu_t(this, NULL);
 
-  if (hartids.size() == 0) {
-    for (size_t i = 0; i < procs.size(); i++) {
-      procs[i] = new processor_t(isa, this, i, halted);
-    }
-  }
-  else {
-    if (hartids.size() != procs.size()) {
-      std::cerr << "Number of specified hartids doesn't match number of processors" << strerror(errno) << std::endl;
+  if (! (hartids.empty() || hartids.size() == procs.size())) {  // explicit hart IDs must cover every slot
+      std::cerr << "Number of specified hartids ("
+                << hartids.size()
+                << ") doesn't match number of processors ("
+                << procs.size() << ").\n";
       exit(1);
-    }
-    for (size_t i = 0; i < procs.size(); i++) {
-      procs[i] = new processor_t(isa, this, hartids[i], halted);
-    }
   }
 
-  clint.reset(new clint_t(procs));
-  bus.add_device(CLINT_BASE, clint.get());
+  for (size_t i = 0; i < procs.size(); i++) {  // fill every allocated slot so none is left null
+    int hart_id = hartids.empty() ? i : hartids[i];  // default hart ID is the slot index
+    procs[i] = new processor_t(isa, priv, varch, this, hart_id, halted,
+                               log_file.get());
+  }
+
+  make_dtb();  // dtb is read below for the CLINT base and the PMP settings
+
+  clint.reset(new clint_t(procs, CPU_HZ / INSNS_PER_RTC_TICK, real_time_clint));
+  reg_t clint_base;
+  if (fdt_parse_clint((void *)dtb.c_str(), &clint_base, "riscv,clint0")) {  // nonzero result: map at the default CLINT_BASE
+    bus.add_device(CLINT_BASE, clint.get());
+  } else {
+    bus.add_device(clint_base, clint.get());
+  }
+
+  for (size_t i = 0; i < procs.size(); i++) {  // apply the PMP settings read from the dtb to each processor
+    reg_t pmp_num = 0, pmp_granularity = 0;  // values used if the parse calls leave them untouched
+    fdt_parse_pmp_num((void *)dtb.c_str(), &pmp_num, "riscv");
+    fdt_parse_pmp_alignment((void *)dtb.c_str(), &pmp_granularity, "riscv");
+
+    procs[i]->set_pmp_num(pmp_num);
+    procs[i]->set_pmp_granularity(pmp_granularity);
+  }
 }
 
 sim_t::~sim_t()
@@ -125,11 +164,6 @@ void sim_t::set_debug(bool value)
   debug = value;
 }
 
-void sim_t::set_log(bool value)
-{
-  log = value;
-}
-
 void sim_t::set_histogram(bool value)
 {
   histogram_enabled = value;
@@ -138,27 +172,71 @@ void sim_t::set_histogram(bool value)
   }
 }
 
+void sim_t::configure_log(bool enable_log, bool enable_commitlog)
+{
+  // The instruction-trace switch is stored regardless of build options.
+  log = enable_log;
+  if (enable_commitlog) {
+#ifdef RISCV_ENABLE_COMMITLOG
+    // Commit logging is compiled in: switch it on for every processor.
+    for (size_t i = 0; i < procs.size(); i++)
+      procs[i]->enable_log_commits();
+#else
+    // Commit logging is compiled out: report the problem and abort.
+    fputs("Commit logging support has not been properly enabled; "
+          "please re-build the riscv-isa-sim project using "
+          "\"configure --enable-commitlog\".\n",
+          stderr);
+    abort();
+#endif
+  }
+}
+
 void sim_t::set_procs_debug(bool value)
 {
   for (size_t i=0; i< procs.size(); i++)
     procs[i]->set_debug(value);
 }
 
+static bool paddr_ok(reg_t addr)
+{
+  return !(addr >> MAX_PADDR_BITS);  // true iff no bit at or above MAX_PADDR_BITS is set
+}
+
 bool sim_t::mmio_load(reg_t addr, size_t len, uint8_t* bytes)
 {
-  if (addr + len < addr)
+  if (addr + len < addr || !paddr_ok(addr + len - 1))  // refuse wraparound, or a last byte that fails paddr_ok
     return false;
   return bus.load(addr, len, bytes);
 }
 
 bool sim_t::mmio_store(reg_t addr, size_t len, const uint8_t* bytes)
 {
-  if (addr + len < addr)
+  if (addr + len < addr || !paddr_ok(addr + len - 1))  // refuse wraparound, or a last byte that fails paddr_ok
     return false;
   return bus.store(addr, len, bytes);
 }
 
 void sim_t::make_dtb()
+{
+  // No dtb file given: generate the device tree source and compile it.
+  if (dtb_file.empty()) {
+    dts = make_dts(INSNS_PER_RTC_TICK, CPU_HZ, initrd_start, initrd_end, bootargs, procs, mems);
+    dtb = dts_compile(dts);
+    return;
+  }
+  // Otherwise read the whole file named by dtb_file into dtb.
+  std::ifstream blob(dtb_file.c_str(), std::ios::binary);
+  if (!blob.good()) {
+    std::cerr << "can't find dtb file: " << dtb_file << std::endl;
+    exit(-1);
+  }
+  std::stringstream contents;
+  contents << blob.rdbuf();
+  dtb = contents.str();
+}
+
+void sim_t::set_rom()
 {
   const int reset_vec_size = 8;
 
@@ -176,11 +254,27 @@ void sim_t::make_dtb()
     (uint32_t) (start_pc & 0xffffffff),
     (uint32_t) (start_pc >> 32)
   };
+  for(int i = 0; i < reset_vec_size; i++)
+    reset_vec[i] = to_le(reset_vec[i]);
 
   std::vector<char> rom((char*)reset_vec, (char*)reset_vec + sizeof(reset_vec));
 
-  dts = make_dts(INSNS_PER_RTC_TICK, CPU_HZ, procs, mems);
-  std::string dtb = dts_compile(dts);
+  std::string dtb;
+  if (!dtb_file.empty()) {
+    std::ifstream fin(dtb_file.c_str(), std::ios::binary);
+    if (!fin.good()) {
+      std::cerr << "can't find dtb file: " << dtb_file << std::endl;
+      exit(-1);
+    }
+
+    std::stringstream strstream;
+    strstream << fin.rdbuf();
+
+    dtb = strstream.str();
+  } else {
+    dts = make_dts(INSNS_PER_RTC_TICK, CPU_HZ, initrd_start, initrd_end, bootargs, procs, mems);
+    dtb = dts_compile(dts);
+  }
 
   rom.insert(rom.end(), dtb.begin(), dtb.end());
   const int align = 0x1000;
@@ -191,6 +285,8 @@ void sim_t::make_dtb()
 }
 
 char* sim_t::addr_to_mem(reg_t addr) {
+  if (!paddr_ok(addr))
+    return NULL;
   auto desc = bus.find_device(addr);
   if (auto mem = dynamic_cast<mem_t*>(desc.second))
     if (addr - desc.first < mem->size())
@@ -198,12 +294,17 @@ char* sim_t::addr_to_mem(reg_t addr) {
   return NULL;
 }
 
+const char* sim_t::get_symbol(uint64_t addr)
+{
+  return this->htif_t::get_symbol(addr);  // forward to the htif_t base-class lookup
+}
+
 // htif
 
 void sim_t::reset()
 {
   if (dtb_enabled)
-    make_dtb();
+    set_rom();  // set_rom assembles the reset vector followed by the dtb; skipped when dtb_enabled is false
 }
 
 void sim_t::idle()
@@ -214,7 +315,7 @@ void sim_t::idle()
 void sim_t::read_chunk(addr_t taddr, size_t len, void* dst)
 {
   assert(len == 8);
-  auto data = debug_mmu->load_uint64(taddr);
+  auto data = to_le(debug_mmu->load_uint64(taddr));  // NOTE(review): to_le presumably converts host order to little-endian — confirm in byteorder.h
   memcpy(dst, &data, sizeof data);
 }
 
@@ -223,7 +324,7 @@ void sim_t::write_chunk(addr_t taddr, size_t len, const void* src)
   assert(len == 8);
   uint64_t data;
   memcpy(&data, src, sizeof data);
-  debug_mmu->store_uint64(taddr, data);
+  debug_mmu->store_uint64(taddr, from_le(data));
 }
 
 void sim_t::proc_reset(unsigned id)
diff --git a/riscv/sim.h b/riscv/sim.h
index e42808b4ae..c7e3de4f71 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -3,15 +3,18 @@
 #ifndef _RISCV_SIM_H
 #define _RISCV_SIM_H
 
-#include "processor.h"
-#include "devices.h"
 #include "debug_module.h"
+#include "devices.h"
+#include "log_file.h"
+#include "processor.h"
 #include "simif.h"
+
 #include <fesvr/htif.h>
 #include <fesvr/context.h>
 #include <vector>
 #include <string>
 #include <memory>
+#include <sys/types.h>
 
 class mmu_t;
 class remote_bitbang_t;
@@ -20,21 +23,30 @@ class remote_bitbang_t;
 class sim_t : public htif_t, public simif_t
 {
 public:
-  sim_t(const char* isa, size_t _nprocs,  bool halted, reg_t start_pc,
-        std::vector<std::pair<reg_t, mem_t*>> mems,
+  sim_t(const char* isa, const char* priv, const char* varch, size_t _nprocs,
+        bool halted, bool real_time_clint,
+        reg_t initrd_start, reg_t initrd_end, const char* bootargs,
+        reg_t start_pc, std::vector<std::pair<reg_t, mem_t*>> mems,
+        std::vector<std::pair<reg_t, abstract_device_t*>> plugin_devices,
         const std::vector<std::string>& args, const std::vector<int> hartids,
-        unsigned progsize, unsigned max_bus_master_bits, bool require_authentication);
+        const debug_module_config_t &dm_config, const char *log_path,
+        bool dtb_enabled, const char *dtb_file);
   ~sim_t();
 
   // run the simulation to completion
   int run();
   void set_debug(bool value);
-  void set_log(bool value);
   void set_histogram(bool value);
+
+  // Configure logging
+  //
+  // If enable_log is true, an instruction trace will be generated. If
+  // enable_commitlog is true, so will the commit results (if this
+  // build was configured without support for commit logging, the
+  // function will print an error message and abort).
+  void configure_log(bool enable_log, bool enable_commitlog);
+
   void set_procs_debug(bool value);
-  void set_dtb_enabled(bool value) {
-    this->dtb_enabled = value;
-  }
   void set_remote_bitbang(remote_bitbang_t* remote_bitbang) {
     this->remote_bitbang = remote_bitbang;
   }
@@ -47,13 +59,21 @@ class sim_t : public htif_t, public simif_t
 
 private:
   std::vector<std::pair<reg_t, mem_t*>> mems;
+  std::vector<std::pair<reg_t, abstract_device_t*>> plugin_devices;
   mmu_t* debug_mmu;  // debug port into main memory
   std::vector<processor_t*> procs;
+  reg_t initrd_start;
+  reg_t initrd_end;
+  const char* bootargs;
   reg_t start_pc;
   std::string dts;
+  std::string dtb;
+  std::string dtb_file;
+  bool dtb_enabled;
   std::unique_ptr<rom_device_t> boot_rom;
   std::unique_ptr<clint_t> clint;
   bus_t bus;
+  log_file_t log_file;
 
   processor_t* get_core(const std::string& i);
   void step(size_t n); // step through simulation
@@ -63,9 +83,8 @@ class sim_t : public htif_t, public simif_t
   size_t current_step;
   size_t current_proc;
   bool debug;
-  bool log;
   bool histogram_enabled; // provide a histogram of PCs
-  bool dtb_enabled;
+  bool log;
   remote_bitbang_t* remote_bitbang;
 
   // memory-mapped I/O routines
@@ -73,6 +92,9 @@ class sim_t : public htif_t, public simif_t
   bool mmio_load(reg_t addr, size_t len, uint8_t* bytes);
   bool mmio_store(reg_t addr, size_t len, const uint8_t* bytes);
   void make_dtb();
+  void set_rom();
+
+  const char* get_symbol(uint64_t addr);
 
   // presents a prompt for introspection into the simulation
   void interactive();
@@ -83,8 +105,10 @@ class sim_t : public htif_t, public simif_t
   void interactive_run(const std::string& cmd, const std::vector<std::string>& args, bool noisy);
   void interactive_run_noisy(const std::string& cmd, const std::vector<std::string>& args);
   void interactive_run_silent(const std::string& cmd, const std::vector<std::string>& args);
+  void interactive_vreg(const std::string& cmd, const std::vector<std::string>& args);
   void interactive_reg(const std::string& cmd, const std::vector<std::string>& args);
   void interactive_freg(const std::string& cmd, const std::vector<std::string>& args);
+  void interactive_fregh(const std::string& cmd, const std::vector<std::string>& args);
   void interactive_fregs(const std::string& cmd, const std::vector<std::string>& args);
   void interactive_fregd(const std::string& cmd, const std::vector<std::string>& args);
   void interactive_pc(const std::string& cmd, const std::vector<std::string>& args);
diff --git a/riscv/simif.h b/riscv/simif.h
index 1d982b3396..0e75d45b16 100644
--- a/riscv/simif.h
+++ b/riscv/simif.h
@@ -16,6 +16,9 @@ class simif_t
   virtual bool mmio_store(reg_t addr, size_t len, const uint8_t* bytes) = 0;
   // Callback for processors to let the simulation know they were reset.
   virtual void proc_reset(unsigned id) = 0;
+
+  virtual const char* get_symbol(uint64_t addr) = 0;
+
 };
 
 #endif
diff --git a/riscv/trap.h b/riscv/trap.h
index b5b8a5080a..4431d8a94e 100644
--- a/riscv/trap.h
+++ b/riscv/trap.h
@@ -13,18 +13,23 @@ class trap_t
  public:
   trap_t(reg_t which) : which(which) {}
   virtual const char* name();
+  virtual bool has_gva() { return false; }
   virtual bool has_tval() { return false; }
   virtual reg_t get_tval() { return 0; }
+  virtual bool has_tval2() { return false; }
+  virtual reg_t get_tval2() { return 0; }
+  virtual bool has_tinst() { return false; }
+  virtual reg_t get_tinst() { return 0; }
   reg_t cause() { return which; }
  private:
   char _name[16];
   reg_t which;
 };
 
-class mem_trap_t : public trap_t
+class insn_trap_t : public trap_t
 {
  public:
-  mem_trap_t(reg_t which, reg_t tval)
+  insn_trap_t(reg_t which, reg_t tval)
     : trap_t(which), tval(tval) {}
   bool has_tval() override { return true; }
   reg_t get_tval() override { return tval; }
@@ -32,32 +37,59 @@ class mem_trap_t : public trap_t
   reg_t tval;
 };
 
+class mem_trap_t : public trap_t  // trap that also carries a gva flag plus tval, tval2 and tinst
+{
+ public:
+  mem_trap_t(reg_t which, bool gva, reg_t tval, reg_t tval2, reg_t tinst)
+    : trap_t(which), gva(gva), tval(tval), tval2(tval2), tinst(tinst) {}
+  bool has_gva() override { return gva; }      // per-instance flag supplied at construction
+  bool has_tval() override { return true; }    // tval is always reported
+  bool has_tval2() override { return true; }   // tval2 is always reported
+  bool has_tinst() override { return true; }   // tinst is always reported
+  reg_t get_tval() override { return tval; }
+  reg_t get_tval2() override { return tval2; }
+  reg_t get_tinst() override { return tinst; }
+ private:
+  bool gva;
+  reg_t tval, tval2, tinst;
+};
+
 #define DECLARE_TRAP(n, x) class trap_##x : public trap_t { \
  public: \
   trap_##x() : trap_t(n) {} \
   const char* name() { return "trap_"#x; } \
 };
 
+#define DECLARE_INST_TRAP(n, x) class trap_##x : public insn_trap_t { /* declares trap_<x> with cause n, carrying only a tval */ \
+ public: \
+  trap_##x(reg_t tval) : insn_trap_t(n, tval) {} \
+  const char* name() { return "trap_"#x; } \
+};
+
 #define DECLARE_MEM_TRAP(n, x) class trap_##x : public mem_trap_t { \
  public: \
-  trap_##x(reg_t tval) : mem_trap_t(n, tval) {} \
+  trap_##x(reg_t tval, reg_t tval2, reg_t tinst) : mem_trap_t(n, true, tval, tval2, tinst) {} /* gva is hard-coded to true here */ \
   const char* name() { return "trap_"#x; } \
 };
 
 DECLARE_MEM_TRAP(CAUSE_MISALIGNED_FETCH, instruction_address_misaligned)
 DECLARE_MEM_TRAP(CAUSE_FETCH_ACCESS, instruction_access_fault)
-DECLARE_MEM_TRAP(CAUSE_ILLEGAL_INSTRUCTION, illegal_instruction)
-DECLARE_MEM_TRAP(CAUSE_BREAKPOINT, breakpoint)
+DECLARE_INST_TRAP(CAUSE_ILLEGAL_INSTRUCTION, illegal_instruction)  // insn_trap_t: constructed from tval only
+DECLARE_INST_TRAP(CAUSE_BREAKPOINT, breakpoint)  // insn_trap_t: constructed from tval only
 DECLARE_MEM_TRAP(CAUSE_MISALIGNED_LOAD, load_address_misaligned)
 DECLARE_MEM_TRAP(CAUSE_MISALIGNED_STORE, store_address_misaligned)
 DECLARE_MEM_TRAP(CAUSE_LOAD_ACCESS, load_access_fault)
 DECLARE_MEM_TRAP(CAUSE_STORE_ACCESS, store_access_fault)
 DECLARE_TRAP(CAUSE_USER_ECALL, user_ecall)
 DECLARE_TRAP(CAUSE_SUPERVISOR_ECALL, supervisor_ecall)
-DECLARE_TRAP(CAUSE_HYPERVISOR_ECALL, hypervisor_ecall)
+DECLARE_TRAP(CAUSE_VIRTUAL_SUPERVISOR_ECALL, virtual_supervisor_ecall)  // replaces the former hypervisor_ecall trap
 DECLARE_MEM_TRAP(CAUSE_MISALIGNED_FETCH_PLACEHOLDER_DO_NOT_USE, placeholder)
 
 #endif
diff --git a/scripts/vcs-version.sh b/scripts/vcs-version.sh
index 31fae86951..692c071ef7 100755
--- a/scripts/vcs-version.sh
+++ b/scripts/vcs-version.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #=========================================================================
 # vcs-version.sh [options] [src-dir]
 #=========================================================================
diff --git a/softfloat/f16_classify.c b/softfloat/f16_classify.c
new file mode 100755
index 0000000000..9402ff13e8
--- /dev/null
+++ b/softfloat/f16_classify.c
@@ -0,0 +1,36 @@
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "platform.h"
+#include "internals.h"
+#include "specialize.h"
+#include "softfloat.h"
+
+uint_fast16_t f16_classify( float16_t a )
+{
+    union ui16_f16 uA;
+    uint_fast16_t uiA;
+
+    uA.f = a;
+    uiA = uA.ui;
+
+    bool neg = signF16UI( uiA );
+    bool expAllOnes = expF16UI( uiA ) == 0x1F;      /* infinity or NaN */
+    bool expZero = expF16UI( uiA ) == 0;            /* zero or subnormal */
+    bool fracIsZero = fracF16UI( uiA ) == 0;
+    bool nan = isNaNF16UI( uiA );
+    bool sigNaN = softfloat_isSigNaNF16UI( uiA );
+
+    return                                          /* one result bit per class, listed in bit order */
+        (  neg && expAllOnes && fracIsZero )   << 0 |   /* negative infinity */
+        (  neg && !expAllOnes && !expZero )    << 1 |   /* negative normal */
+        (  neg && expZero && !fracIsZero )     << 2 |   /* negative subnormal */
+        (  neg && expZero && fracIsZero )      << 3 |   /* negative zero */
+        ( !neg && expZero && fracIsZero )      << 4 |   /* positive zero */
+        ( !neg && expZero && !fracIsZero )     << 5 |   /* positive subnormal */
+        ( !neg && !expAllOnes && !expZero )    << 6 |   /* positive normal */
+        ( !neg && expAllOnes && fracIsZero )   << 7 |   /* positive infinity */
+        ( nan &&  sigNaN )                     << 8 |   /* signaling NaN */
+        ( nan && !sigNaN )                     << 9;    /* quiet NaN */
+}
+
diff --git a/softfloat/f16_to_i16.c b/softfloat/f16_to_i16.c
new file mode 100644
index 0000000000..b0fbb7cc75
--- /dev/null
+++ b/softfloat/f16_to_i16.c
@@ -0,0 +1,57 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
+University of California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <stdint.h>
+#include "specialize.h"
+#include "softfloat.h"
+
+int_fast16_t f16_to_i16( float16_t a, uint_fast8_t roundingMode, bool exact )
+{
+    /* Remember the flags as they were before the 32-bit conversion runs. */
+    const uint_fast8_t flagsBefore = softfloat_exceptionFlags;
+    const int_fast32_t wide = f16_to_i32(a, roundingMode, exact);
+
+    /* Value fits in 16 bits: pass it through; flags stay as the callee left them. */
+    if (wide >= INT16_MIN && wide <= INT16_MAX)
+        return wide;
+
+    /* Out of range: saved flags plus `invalid`, then the matching overflow constant. */
+    softfloat_exceptionFlags = flagsBefore | softfloat_flag_invalid;
+    if (wide > INT16_MAX)
+        return i16_fromPosOverflow;
+    return i16_fromNegOverflow;
+}
+
diff --git a/softfloat/f16_to_i8.c b/softfloat/f16_to_i8.c
new file mode 100644
index 0000000000..23638cc102
--- /dev/null
+++ b/softfloat/f16_to_i8.c
@@ -0,0 +1,57 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
+University of California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <stdint.h>
+#include "specialize.h"
+#include "softfloat.h"
+
+int_fast8_t f16_to_i8( float16_t a, uint_fast8_t roundingMode, bool exact )
+{
+    /* Remember the flags as they were before the 32-bit conversion runs. */
+    const uint_fast8_t flagsBefore = softfloat_exceptionFlags;
+    const int_fast32_t wide = f16_to_i32(a, roundingMode, exact);
+
+    /* Value fits in 8 bits: pass it through; flags stay as the callee left them. */
+    if (wide >= INT8_MIN && wide <= INT8_MAX)
+        return wide;
+
+    /* Out of range: saved flags plus `invalid`, then the matching overflow constant. */
+    softfloat_exceptionFlags = flagsBefore | softfloat_flag_invalid;
+    if (wide > INT8_MAX)
+        return i8_fromPosOverflow;
+    return i8_fromNegOverflow;
+}
+
diff --git a/softfloat/f16_to_ui16.c b/softfloat/f16_to_ui16.c
new file mode 100644
index 0000000000..81c4f8d9e0
--- /dev/null
+++ b/softfloat/f16_to_ui16.c
@@ -0,0 +1,54 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
+University of California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <stdint.h>
+#include "specialize.h"
+#include "softfloat.h"
+
+uint_fast16_t f16_to_ui16( float16_t a, uint_fast8_t roundingMode, bool exact )
+{
+    /* Remember the flags as they were before the 32-bit conversion runs. */
+    const uint_fast8_t flagsBefore = softfloat_exceptionFlags;
+    const uint_fast32_t wide = f16_to_ui32(a, roundingMode, exact);
+
+    /* Value fits in 16 bits: pass it through; flags stay as the callee left them. */
+    if (wide <= UINT16_MAX)
+        return wide;
+    /* Too large: saved flags plus `invalid`, then the overflow constant. */
+    softfloat_exceptionFlags = flagsBefore | softfloat_flag_invalid;
+    return ui16_fromPosOverflow;
+}
+
diff --git a/softfloat/f16_to_ui8.c b/softfloat/f16_to_ui8.c
new file mode 100644
index 0000000000..96124e1275
--- /dev/null
+++ b/softfloat/f16_to_ui8.c
@@ -0,0 +1,54 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
+University of California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <stdint.h>
+#include "specialize.h"
+#include "softfloat.h"
+
+uint_fast8_t f16_to_ui8( float16_t a, uint_fast8_t roundingMode, bool exact )
+{
+    /* Remember the flags that were set before the 32-bit conversion. */
+    uint_fast8_t flagsBefore = softfloat_exceptionFlags;
+    uint_fast32_t wide = f16_to_ui32( a, roundingMode, exact );
+
+    /* Fits in 8 bits: return it, leaving the conversion's flags alone. */
+    if ( wide <= UINT8_MAX ) return wide;
+
+    /* Does not fit: flags become the earlier state plus invalid only. */
+    softfloat_exceptionFlags = flagsBefore | softfloat_flag_invalid;
+    return ui8_fromPosOverflow;
+}
+
diff --git a/softfloat/f32_to_i16.c b/softfloat/f32_to_i16.c
new file mode 100644
index 0000000000..bde4c76c9a
--- /dev/null
+++ b/softfloat/f32_to_i16.c
@@ -0,0 +1,57 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
+University of California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <stdint.h>
+#include "specialize.h"
+#include "softfloat.h"
+
+int_fast16_t f32_to_i16( float32_t a, uint_fast8_t roundingMode, bool exact )
+{
+    /* Flags as they stood before the 32-bit conversion ran. */
+    uint_fast8_t flagsBefore = softfloat_exceptionFlags;
+    int_fast32_t wide = f32_to_i32( a, roundingMode, exact );
+
+    /* In range: pass the value (and any flags the conversion set) through. */
+    if ( INT16_MIN <= wide && wide <= INT16_MAX ) return wide;
+
+    /*
+     * Out of range: rebuild the flags as prior state plus invalid, then
+     * return the overflow constant for whichever bound was crossed.
+     */
+    softfloat_exceptionFlags = flagsBefore | softfloat_flag_invalid;
+    return ( wide > INT16_MAX ) ? i16_fromPosOverflow : i16_fromNegOverflow;
+}
+
diff --git a/softfloat/f32_to_ui16.c b/softfloat/f32_to_ui16.c
new file mode 100644
index 0000000000..073492bfaa
--- /dev/null
+++ b/softfloat/f32_to_ui16.c
@@ -0,0 +1,53 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
+University of California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <stdint.h>
+#include "specialize.h"
+#include "softfloat.h"
+
+uint_fast16_t f32_to_ui16( float32_t a, uint_fast8_t roundingMode, bool exact )
+{
+    /* Keep a copy of the flags from before the 32-bit conversion. */
+    uint_fast8_t flagsBefore = softfloat_exceptionFlags;
+    uint_fast32_t wide = f32_to_ui32( a, roundingMode, exact );
+
+    /* Representable in 16 bits: hand it back with the flags untouched. */
+    if ( wide <= UINT16_MAX ) return wide;
+
+    /* Not representable: flags become the earlier state plus invalid only. */
+    softfloat_exceptionFlags = flagsBefore | softfloat_flag_invalid;
+    return ui16_fromPosOverflow;
+}
diff --git a/softfloat/fall_maxmin.c b/softfloat/fall_maxmin.c
new file mode 100644
index 0000000000..32a9ade59e
--- /dev/null
+++ b/softfloat/fall_maxmin.c
@@ -0,0 +1,81 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of
+California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include "platform.h"
+#include "internals.h"
+#include "specialize.h"
+#include "softfloat.h"
+
+#define COMPARE_MAX(a, b, bits) /* defines f<bits>_max( a, b ) for one float width */ \
+float ## bits ## _t f ## bits ## _max( float ## bits ## _t a, float ## bits ## _t b )          \
+{                                                                                              \
+    bool greater = f ## bits ## _lt_quiet(b, a) ||      /* true when b < a ...            */   \
+               (f ## bits ## _eq(b, a) && signF ## bits ## UI(b.v)); /* ... or b == a, signF(b) set */ \
+    /* Tie-break presumably ranks +0 above -0 (confirm signF*UI). Two NaNs give the default NaN. */ \
+    if (isNaNF ## bits ## UI(a.v) && isNaNF ## bits ## UI(b.v)) {                              \
+        union ui ## bits ## _f ## bits  ui;                                                    \
+        ui.ui = defaultNaNF ## bits ## UI;                                                     \
+        return ui.f;                                                                           \
+    } else {                                                                                   \
+        return greater || isNaNF ## bits ## UI((b).v) ? a : b; /* a if greater or b is NaN, else b */ \
+    }                                                                                          \
+}
+
+#define COMPARE_MIN(a, b, bits) /* defines f<bits>_min( a, b ) for one float width */ \
+float ## bits ## _t f ## bits ## _min( float ## bits ## _t a, float ## bits ## _t b )          \
+{                                                                                              \
+    bool less = f ## bits ## _lt_quiet(a, b) ||         /* true when a < b ...            */   \
+               (f ## bits ## _eq(a, b) && signF ## bits ## UI(a.v)); /* ... or a == b, signF(a) set */ \
+    /* Tie-break presumably ranks -0 below +0 (confirm signF*UI). Two NaNs give the default NaN. */ \
+    if (isNaNF ## bits ## UI(a.v) && isNaNF ## bits ## UI(b.v)) {                              \
+        union ui ## bits ## _f ## bits  ui;                                                    \
+        ui.ui = defaultNaNF ## bits ## UI;                                                     \
+        return ui.f;                                                                           \
+    } else {                                                                                   \
+        return less || isNaNF ## bits ## UI((b).v) ? a : b;    /* a if less or b is NaN, else b */ \
+    }                                                                                          \
+}
+
+COMPARE_MAX(a, b, 16)  /* f16_max; no ';' because the macro body is a whole function definition */
+COMPARE_MAX(a, b, 32)  /* f32_max */
+COMPARE_MAX(a, b, 64)  /* f64_max */
+
+COMPARE_MIN(a, b, 16)  /* f16_min */
+COMPARE_MIN(a, b, 32)  /* f32_min */
+COMPARE_MIN(a, b, 64)  /* f64_min */
diff --git a/softfloat/fall_reciprocal.c b/softfloat/fall_reciprocal.c
new file mode 100644
index 0000000000..1c96458935
--- /dev/null
+++ b/softfloat/fall_reciprocal.c
@@ -0,0 +1,392 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of
+California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include "platform.h"
+#include "internals.h"
+#include "specialize.h"
+#include "softfloat.h"
+
+static inline uint64_t extract64(uint64_t val, int pos, int len)
+{ /* Yields the len-bit field of val that starts at bit pos, right-aligned. */
+  assert(pos >= 0 && len > 0 && len <= 64 - pos);
+  return (UINT64_MAX >> (64 - len)) & (val >> pos);
+}
+
+static inline uint64_t make_mask64(int pos, int len)
+{ /* Returns a mask with len consecutive one bits, the lowest at bit pos. */
+    assert(pos >= 0 && len > 0 && len <= 64 - pos); /* run must end at or below bit 63 */
+    return (UINT64_MAX >> (64 - len)) << pos;
+}
+
+// Table-driven estimate used by the f16/f32/f64_rsqrte7 wrappers; e = exponent bits, s = fraction bits. Caller must truncate the result to its own width.
+static inline uint64_t rsqrte7(uint64_t val, int e, int s, bool sub) {
+  uint64_t exp = extract64(val, s, e);       // exponent field: e bits starting at bit s
+  uint64_t sig = extract64(val, 0, s);       // fraction field: low s bits
+  uint64_t sign = extract64(val, s + e, 1);  // sign: the bit just above the exponent
+  const int p = 7;                           // width in bits of each table entry / of the index
+
+  static const uint8_t table[] = {           // 128 entries, indexed by idx computed below
+      52, 51, 50, 48, 47, 46, 44, 43,        // idx 0..63: low exponent bit clear
+      42, 41, 40, 39, 38, 36, 35, 34,
+      33, 32, 31, 30, 30, 29, 28, 27,
+      26, 25, 24, 23, 23, 22, 21, 20,
+      19, 19, 18, 17, 16, 16, 15, 14,
+      14, 13, 12, 12, 11, 10, 10, 9,
+      9, 8, 7, 7, 6, 6, 5, 4,
+      4, 3, 3, 2, 2, 1, 1, 0,
+      127, 125, 123, 121, 119, 118, 116, 114, // idx 64..127: low exponent bit set
+      113, 111, 109, 108, 106, 105, 103, 102,
+      100, 99, 97, 96, 95, 93, 92, 91,
+      90, 88, 87, 86, 85, 84, 83, 82,
+      80, 79, 78, 77, 76, 75, 74, 73,
+      72, 71, 70, 70, 69, 68, 67, 66,
+      65, 64, 63, 63, 62, 61, 60, 59,
+      59, 58, 57, 56, 56, 55, 54, 53};
+
+  if (sub) {  // caller flagged a subnormal input: normalize the fraction first
+      while (extract64(sig, s - 1, 1) == 0)  // until the top fraction bit is set (never ends if sig == 0)
+          exp--, sig <<= 1;                  // exp is unsigned, so it wraps below zero here
+
+      sig = (sig << 1) & make_mask64(0 ,s);  // shift the leading one out and keep s bits
+  }
+
+  int idx = ((exp & 1) << (p-1)) | (sig >> (s-p+1));   // low exponent bit, then the top 6 fraction bits
+  uint64_t out_sig = (uint64_t)(table[idx]) << (s-p);  // entry becomes the top 7 fraction bits
+  uint64_t out_exp = (3 * make_mask64(0, e - 1) + ~exp) / 2;  // (3*(2^(e-1)-1) - exp - 1) / 2, modulo 2^64
+
+  return (sign << (s+e)) | (out_exp << s) | out_sig;   // repack sign | exponent | fraction
+}
+
+float16_t f16_rsqrte7(float16_t in)
+{ /* Half-precision entry point: special classes are answered here, the rest go to rsqrte7(). */
+    union ui16_f16 uA;
+
+    uA.f = in;
+    unsigned int ret = f16_classify(in); // class value; the case labels below name each one
+    bool sub = false;                    // becomes true only for a positive subnormal input
+    switch(ret) {
+    case 0x001: // -inf
+    case 0x002: // -normal
+    case 0x004: // -subnormal
+    case 0x100: // sNaN
+        softfloat_exceptionFlags |= softfloat_flag_invalid; // deliberate fall through to the NaN result
+    case 0x200: // qNaN: default NaN result, no flag of its own
+        uA.ui = defaultNaNF16UI;
+        break;
+    case 0x008: // -0
+        uA.ui = 0xfc00; // sign set, exponent all ones, fraction zero (-inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x010: // +0
+        uA.ui = 0x7c00; // exponent all ones, fraction zero (+inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x080: // +inf
+        uA.ui = 0x0;    // all bits clear (+0)
+        break;
+    case 0x020: // +subnormal
+        sub = true;     // deliberate fall through to the estimate
+    default: // all remaining classes (positive normal numbers, presumably)
+        uA.ui = rsqrte7(uA.ui, 5, 10, sub); // 5 exponent bits, 10 fraction bits; the store narrows the result
+        break;
+    }
+
+    return uA.f;
+}
+
+float32_t f32_rsqrte7(float32_t in)
+{ /* Single-precision entry point: special classes are answered here, the rest go to rsqrte7(). */
+    union ui32_f32 uA;
+
+    uA.f = in;
+    unsigned int ret = f32_classify(in); // class value; the case labels below name each one
+    bool sub = false;                    // becomes true only for a positive subnormal input
+    switch(ret) {
+    case 0x001: // -inf
+    case 0x002: // -normal
+    case 0x004: // -subnormal
+    case 0x100: // sNaN
+        softfloat_exceptionFlags |= softfloat_flag_invalid; // deliberate fall through to the NaN result
+    case 0x200: // qNaN: default NaN result, no flag of its own
+        uA.ui = defaultNaNF32UI;
+        break;
+    case 0x008: // -0
+        uA.ui = 0xff800000; // sign set, exponent all ones, fraction zero (-inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x010: // +0
+        uA.ui = 0x7f800000; // exponent all ones, fraction zero (+inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x080: // +inf
+        uA.ui = 0x0;        // all bits clear (+0)
+        break;
+    case 0x020: // +subnormal
+        sub = true;         // deliberate fall through to the estimate
+    default: // all remaining classes (positive normal numbers, presumably)
+        uA.ui = rsqrte7(uA.ui, 8, 23, sub); // 8 exponent bits, 23 fraction bits; the store narrows the result
+        break;
+    }
+
+    return uA.f;
+}
+
+float64_t f64_rsqrte7(float64_t in)
+{ /* Double-precision entry point: special classes are answered here, the rest go to rsqrte7(). */
+    union ui64_f64 uA;
+
+    uA.f = in;
+    unsigned int ret = f64_classify(in); // class value; the case labels below name each one
+    bool sub = false;                    // becomes true only for a positive subnormal input
+    switch(ret) {
+    case 0x001: // -inf
+    case 0x002: // -normal
+    case 0x004: // -subnormal
+    case 0x100: // sNaN
+        softfloat_exceptionFlags |= softfloat_flag_invalid; // deliberate fall through to the NaN result
+    case 0x200: // qNaN: default NaN result, no flag of its own
+        uA.ui = defaultNaNF64UI;
+        break;
+    case 0x008: // -0
+        uA.ui = 0xfff0000000000000ul; // sign set, exponent all ones, fraction zero (-inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x010: // +0
+        uA.ui = 0x7ff0000000000000ul; // exponent all ones, fraction zero (+inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x080: // +inf
+        uA.ui = 0x0;                  // all bits clear (+0)
+        break;
+    case 0x020: // +subnormal
+        sub = true;                   // deliberate fall through to the estimate
+    default: // all remaining classes (positive normal numbers, presumably)
+        uA.ui = rsqrte7(uA.ui, 11, 52, sub); // 11 exponent bits, 52 fraction bits
+        break;
+    }
+
+    return uA.f;
+}
+
+// Table-driven estimate used by the f16/f32/f64_recip7 wrappers; e = exponent bits, s = fraction bits. Caller must truncate the result to its own width.
+static inline uint64_t recip7(uint64_t val, int e, int s, int rm, bool sub,
+                              bool *round_abnormal) // set to true only on the early-return path below
+{
+    uint64_t exp = extract64(val, s, e);       // exponent field: e bits starting at bit s
+    uint64_t sig = extract64(val, 0, s);       // fraction field: low s bits
+    uint64_t sign = extract64(val, s + e, 1);  // sign: the bit just above the exponent
+    const int p = 7;                           // width in bits of each table entry / of the index
+
+    static const uint8_t table[] = {           // 128 entries, indexed by the top 7 fraction bits
+        127, 125, 123, 121, 119, 117, 116, 114,
+        112, 110, 109, 107, 105, 104, 102, 100,
+        99, 97, 96, 94, 93, 91, 90, 88,
+        87, 85, 84, 83, 81, 80, 79, 77,
+        76, 75, 74, 72, 71, 70, 69, 68,
+        66, 65, 64, 63, 62, 61, 60, 59,
+        58, 57, 56, 55, 54, 53, 52, 51,
+        50, 49, 48, 47, 46, 45, 44, 43,
+        42, 41, 40, 40, 39, 38, 37, 36,
+        35, 35, 34, 33, 32, 31, 31, 30,
+        29, 28, 28, 27, 26, 25, 25, 24,
+        23, 23, 22, 21, 21, 20, 19, 19,
+        18, 17, 17, 16, 15, 15, 14, 14,
+        13, 12, 12, 11, 11, 10, 9, 9,
+        8, 8, 7, 7, 6, 5, 5, 4,
+        4, 3, 3, 2, 2, 1, 1, 0};
+
+    if (sub) {  // caller flagged a subnormal input: normalize the fraction first
+        while (extract64(sig, s - 1, 1) == 0)  // until the top fraction bit is set (never ends if sig == 0)
+            exp--, sig <<= 1;                  // exp is unsigned, so it wraps below zero here
+
+        sig = (sig << 1) & make_mask64(0 ,s);  // shift the leading one out and keep s bits
+
+        if (exp != 0 && exp != UINT64_MAX) {   // the loop above ran two or more times
+            *round_abnormal = true;            // the wrappers turn this into inexact | overflow
+            if (rm == 1 ||                     // NOTE(review): 1/2/3 presumably are softfloat's
+                (rm == 2 && !sign) ||          // round_minMag / round_min / round_max values;
+                (rm == 3 && sign))             // confirm against softfloat.h
+                return ((sign << (s+e)) | make_mask64(s, e)) - 1;  // exponent all-ones minus one, fraction all ones
+            else
+                return (sign << (s+e)) | make_mask64(s, e);        // exponent all ones, fraction zero (signed infinity)
+        }
+    }
+
+    int idx = sig >> (s-p);                               // top 7 fraction bits
+    uint64_t out_sig = (uint64_t)(table[idx]) << (s-p);   // entry becomes the top 7 fraction bits
+    uint64_t out_exp = 2 * make_mask64(0, e - 1) + ~exp;  // 2*(2^(e-1)-1) - exp - 1, modulo 2^64
+    if (out_exp == 0 || out_exp == UINT64_MAX) {          // exponent came out as 0 or -1: build a subnormal result
+        out_sig = (out_sig >> 1) | make_mask64(s - 1, 1); // shift down one place and set the top fraction bit
+        if (out_exp == UINT64_MAX) {                      // -1 needs one more place of shift
+            out_sig >>= 1;
+            out_exp = 0;                                  // stored exponent is zero either way
+        }
+    }
+
+    return (sign << (s+e)) | (out_exp << s) | out_sig;    // repack sign | exponent | fraction
+}
+
+float16_t f16_recip7(float16_t in)
+{ /* Half-precision entry point: special classes are answered here, the rest go to recip7(). */
+    union ui16_f16 uA;
+
+    uA.f = in;
+    unsigned int ret = f16_classify(in); // class value; the case labels below name each one
+    bool sub = false;                    // becomes true for a subnormal input of either sign
+    bool round_abnormal = false;         // recip7() sets this on its early-return path
+    switch(ret) {
+    case 0x001: // -inf
+        uA.ui = 0x8000; // only the sign bit set (-0)
+        break;
+    case 0x080: // +inf
+        uA.ui = 0x0;    // all bits clear (+0)
+        break;
+    case 0x008: // -0
+        uA.ui = 0xfc00; // sign set, exponent all ones, fraction zero (-inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x010: // +0
+        uA.ui = 0x7c00; // exponent all ones, fraction zero (+inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x100: // sNaN
+        softfloat_exceptionFlags |= softfloat_flag_invalid; // deliberate fall through to the NaN result
+    case 0x200: // qNaN: default NaN result, no flag of its own
+        uA.ui = defaultNaNF16UI;
+        break;
+    case 0x004: // -subnormal
+    case 0x020: // +subnormal
+        sub = true;     // deliberate fall through to the estimate
+    default: // +- normal
+        uA.ui = recip7(uA.ui, 5, 10,   // 5 exponent bits, 10 fraction bits; the store narrows the result
+                       softfloat_roundingMode, sub, &round_abnormal);
+        if (round_abnormal)            // recip7() took its early-return path
+            softfloat_exceptionFlags |= softfloat_flag_inexact |
+                                        softfloat_flag_overflow;
+        break;
+    }
+
+    return uA.f;
+}
+
+float32_t f32_recip7(float32_t in)
+{ /* Single-precision entry point: special classes are answered here, the rest go to recip7(). */
+    union ui32_f32 uA;
+
+    uA.f = in;
+    unsigned int ret = f32_classify(in); // class value; the case labels below name each one
+    bool sub = false;                    // becomes true for a subnormal input of either sign
+    bool round_abnormal = false;         // recip7() sets this on its early-return path
+    switch(ret) {
+    case 0x001: // -inf
+        uA.ui = 0x80000000; // only the sign bit set (-0)
+        break;
+    case 0x080: // +inf
+        uA.ui = 0x0;        // all bits clear (+0)
+        break;
+    case 0x008: // -0
+        uA.ui = 0xff800000; // sign set, exponent all ones, fraction zero (-inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x010: // +0
+        uA.ui = 0x7f800000; // exponent all ones, fraction zero (+inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x100: // sNaN
+        softfloat_exceptionFlags |= softfloat_flag_invalid; // deliberate fall through to the NaN result
+    case 0x200: // qNaN: default NaN result, no flag of its own
+        uA.ui = defaultNaNF32UI;
+        break;
+    case 0x004: // -subnormal
+    case 0x020: // +subnormal
+        sub = true;         // deliberate fall through to the estimate
+    default: // +- normal
+        uA.ui = recip7(uA.ui, 8, 23,   // 8 exponent bits, 23 fraction bits; the store narrows the result
+                       softfloat_roundingMode, sub, &round_abnormal);
+        if (round_abnormal)            // recip7() took its early-return path
+          softfloat_exceptionFlags |= softfloat_flag_inexact |
+                                      softfloat_flag_overflow;
+        break;
+    }
+
+    return uA.f;
+}
+
+float64_t f64_recip7(float64_t in)
+{ /* Double-precision entry point: special classes are answered here, the rest go to recip7(). */
+    union ui64_f64 uA;
+
+    uA.f = in;
+    unsigned int ret = f64_classify(in); // class value; the case labels below name each one
+    bool sub = false;                    // becomes true for a subnormal input of either sign
+    bool round_abnormal = false;         // recip7() sets this on its early-return path
+    switch(ret) {
+    case 0x001: // -inf
+        uA.ui = 0x8000000000000000; // only the sign bit set (-0)
+        break;
+    case 0x080: // +inf
+        uA.ui = 0x0;                // all bits clear (+0)
+        break;
+    case 0x008: // -0
+        uA.ui = 0xfff0000000000000; // sign set, exponent all ones, fraction zero (-inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x010: // +0
+        uA.ui = 0x7ff0000000000000; // exponent all ones, fraction zero (+inf)
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+        break;
+    case 0x100: // sNaN
+        softfloat_exceptionFlags |= softfloat_flag_invalid; // deliberate fall through to the NaN result
+    case 0x200: // qNaN: default NaN result, no flag of its own
+        uA.ui = defaultNaNF64UI;
+        break;
+    case 0x004: // -subnormal
+    case 0x020: // +subnormal
+        sub = true;                 // deliberate fall through to the estimate
+    default: // +- normal
+        uA.ui = recip7(uA.ui, 11, 52,  // 11 exponent bits, 52 fraction bits
+                       softfloat_roundingMode, sub, &round_abnormal);
+        if (round_abnormal)            // recip7() took its early-return path
+            softfloat_exceptionFlags |= softfloat_flag_inexact |
+                                        softfloat_flag_overflow;
+        break;
+    }
+
+    return uA.f;
+}
diff --git a/softfloat/platform.h b/softfloat/platform.h
index 03dd429faf..55de1941a7 100644
--- a/softfloat/platform.h
+++ b/softfloat/platform.h
@@ -36,11 +36,15 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 /*----------------------------------------------------------------------------
 *----------------------------------------------------------------------------*/
+#include "config.h"
+#ifndef WORDS_BIGENDIAN
 #define LITTLEENDIAN 1
+#endif
 
 #define INLINE_LEVEL 5
 #define SOFTFLOAT_FAST_INT64
 #define SOFTFLOAT_FAST_DIV64TO32
+#define SOFTFLOAT_ROUND_ODD
 
 /*----------------------------------------------------------------------------
 *----------------------------------------------------------------------------*/
diff --git a/softfloat/softfloat.h b/softfloat/softfloat.h
index b277281ec2..bdac1be263 100644
--- a/softfloat/softfloat.h
+++ b/softfloat/softfloat.h
@@ -141,8 +141,12 @@ void i64_to_f128M( int64_t, float128_t * );
 /*----------------------------------------------------------------------------
 | 16-bit (half-precision) floating-point operations.
 *----------------------------------------------------------------------------*/
+uint_fast8_t f16_to_ui8( float16_t, uint_fast8_t, bool );
+uint_fast16_t f16_to_ui16( float16_t, uint_fast8_t, bool );
 uint_fast32_t f16_to_ui32( float16_t, uint_fast8_t, bool );
 uint_fast64_t f16_to_ui64( float16_t, uint_fast8_t, bool );
+int_fast8_t f16_to_i8( float16_t, uint_fast8_t, bool );
+int_fast16_t f16_to_i16( float16_t, uint_fast8_t, bool );
 int_fast32_t f16_to_i32( float16_t, uint_fast8_t, bool );
 int_fast64_t f16_to_i64( float16_t, uint_fast8_t, bool );
 uint_fast32_t f16_to_ui32_r_minMag( float16_t, bool );
@@ -160,6 +164,8 @@ void f16_to_f128M( float16_t, float128_t * );
 float16_t f16_roundToInt( float16_t, uint_fast8_t, bool );
 float16_t f16_add( float16_t, float16_t );
 float16_t f16_sub( float16_t, float16_t );
+float16_t f16_max( float16_t, float16_t );
+float16_t f16_min( float16_t, float16_t );
 float16_t f16_mul( float16_t, float16_t );
 float16_t f16_mulAdd( float16_t, float16_t, float16_t );
 float16_t f16_div( float16_t, float16_t );
@@ -172,12 +178,17 @@ bool f16_eq_signaling( float16_t, float16_t );
 bool f16_le_quiet( float16_t, float16_t );
 bool f16_lt_quiet( float16_t, float16_t );
 bool f16_isSignalingNaN( float16_t );
+uint_fast16_t f16_classify( float16_t );
+float16_t f16_rsqrte7( float16_t );
+float16_t f16_recip7( float16_t );
 
 /*----------------------------------------------------------------------------
 | 32-bit (single-precision) floating-point operations.
 *----------------------------------------------------------------------------*/
+uint_fast16_t f32_to_ui16( float32_t, uint_fast8_t, bool );
 uint_fast32_t f32_to_ui32( float32_t, uint_fast8_t, bool );
 uint_fast64_t f32_to_ui64( float32_t, uint_fast8_t, bool );
+int_fast16_t f32_to_i16( float32_t, uint_fast8_t, bool );
 int_fast32_t f32_to_i32( float32_t, uint_fast8_t, bool );
 int_fast64_t f32_to_i64( float32_t, uint_fast8_t, bool );
 uint_fast32_t f32_to_ui32_r_minMag( float32_t, bool );
@@ -195,6 +206,8 @@ void f32_to_f128M( float32_t, float128_t * );
 float32_t f32_roundToInt( float32_t, uint_fast8_t, bool );
 float32_t f32_add( float32_t, float32_t );
 float32_t f32_sub( float32_t, float32_t );
+float32_t f32_max( float32_t, float32_t );
+float32_t f32_min( float32_t, float32_t );
 float32_t f32_mul( float32_t, float32_t );
 float32_t f32_mulAdd( float32_t, float32_t, float32_t );
 float32_t f32_div( float32_t, float32_t );
@@ -208,6 +221,8 @@ bool f32_le_quiet( float32_t, float32_t );
 bool f32_lt_quiet( float32_t, float32_t );
 bool f32_isSignalingNaN( float32_t );
 uint_fast16_t f32_classify( float32_t );
+float32_t f32_rsqrte7( float32_t );
+float32_t f32_recip7( float32_t );
 
 /*----------------------------------------------------------------------------
 | 64-bit (double-precision) floating-point operations.
@@ -231,6 +246,8 @@ void f64_to_f128M( float64_t, float128_t * );
 float64_t f64_roundToInt( float64_t, uint_fast8_t, bool );
 float64_t f64_add( float64_t, float64_t );
 float64_t f64_sub( float64_t, float64_t );
+float64_t f64_max( float64_t, float64_t );
+float64_t f64_min( float64_t, float64_t );
 float64_t f64_mul( float64_t, float64_t );
 float64_t f64_mulAdd( float64_t, float64_t, float64_t );
 float64_t f64_div( float64_t, float64_t );
@@ -244,6 +261,8 @@ bool f64_le_quiet( float64_t, float64_t );
 bool f64_lt_quiet( float64_t, float64_t );
 bool f64_isSignalingNaN( float64_t );
 uint_fast16_t f64_classify( float64_t );
+float64_t f64_rsqrte7( float64_t );
+float64_t f64_recip7( float64_t );
 
 /*----------------------------------------------------------------------------
 | Rounding precision for 80-bit extended double-precision floating-point.
diff --git a/softfloat/softfloat.mk.in b/softfloat/softfloat.mk.in
index ff7637b13e..07dca1618a 100644
--- a/softfloat/softfloat.mk.in
+++ b/softfloat/softfloat.mk.in
@@ -38,6 +38,7 @@ softfloat_c_srcs = \
 	f128_to_ui64.c \
 	f128_to_ui64_r_minMag.c \
 	f16_add.c \
+	f16_classify.c \
 	f16_div.c \
 	f16_eq.c \
 	f16_eq_signaling.c \
@@ -55,10 +56,14 @@ softfloat_c_srcs = \
 	f16_to_f128.c \
 	f16_to_f32.c \
 	f16_to_f64.c \
+	f16_to_i8.c \
+	f16_to_i16.c \
 	f16_to_i32.c \
 	f16_to_i32_r_minMag.c \
 	f16_to_i64.c \
 	f16_to_i64_r_minMag.c \
+	f16_to_ui8.c \
+	f16_to_ui16.c \
 	f16_to_ui32.c \
 	f16_to_ui32_r_minMag.c \
 	f16_to_ui64.c \
@@ -82,10 +87,12 @@ softfloat_c_srcs = \
 	f32_to_f128.c \
 	f32_to_f16.c \
 	f32_to_f64.c \
+	f32_to_i16.c \
 	f32_to_i32.c \
 	f32_to_i32_r_minMag.c \
 	f32_to_i64.c \
 	f32_to_i64_r_minMag.c \
+	f32_to_ui16.c \
 	f32_to_ui32.c \
 	f32_to_ui32_r_minMag.c \
 	f32_to_ui64.c \
@@ -117,6 +124,8 @@ softfloat_c_srcs = \
 	f64_to_ui32_r_minMag.c \
 	f64_to_ui64.c \
 	f64_to_ui64_r_minMag.c \
+	fall_maxmin.c \
+	fall_reciprocal.c \
 	i32_to_f128.c \
 	i32_to_f16.c \
 	i32_to_f32.c \
@@ -225,6 +234,10 @@ softfloat_c_srcs = \
 	ui64_to_f32.c \
 	ui64_to_f64.c \
 
+softfloat_CFLAGS = -fPIC
+
+softfloat_install_shared_lib = yes
+
 softfloat_test_srcs =
 
 softfloat_install_prog_srcs =
diff --git a/softfloat/specialize.h b/softfloat/specialize.h
index 629d5185b9..556476c1a5 100644
--- a/softfloat/specialize.h
+++ b/softfloat/specialize.h
@@ -55,6 +55,20 @@ extern "C" {
 | The values to return on conversions to 32-bit integer formats that raise an
 | invalid exception.
 *----------------------------------------------------------------------------*/
+#define ui8_fromPosOverflow  0xFF
+#define ui8_fromNegOverflow  0
+#define ui8_fromNaN          0xFF
+#define i8_fromPosOverflow   0x7F
+#define i8_fromNegOverflow   (-0x7F - 1)
+#define i8_fromNaN           0x7F
+
+#define ui16_fromPosOverflow 0xFFFF
+#define ui16_fromNegOverflow 0
+#define ui16_fromNaN         0xFFFF
+#define i16_fromPosOverflow  0x7FFF
+#define i16_fromNegOverflow  (-0x7FFF - 1)
+#define i16_fromNaN          0x7FFF
+
 #define ui32_fromPosOverflow 0xFFFFFFFF
 #define ui32_fromNegOverflow 0
 #define ui32_fromNaN         0xFFFFFFFF
diff --git a/spike_main/spike-dasm.cc b/spike_dasm/spike-dasm.cc
similarity index 70%
rename from spike_main/spike-dasm.cc
rename to spike_dasm/spike-dasm.cc
index 1161825c2e..fa6a25ae6a 100644
--- a/spike_main/spike-dasm.cc
+++ b/spike_dasm/spike-dasm.cc
@@ -21,13 +21,32 @@ int main(int argc, char** argv)
 
   std::function<extension_t*()> extension;
   option_parser_t parser;
+#ifdef HAVE_DLOPEN
   parser.option(0, "extension", 1, [&](const char* s){extension = find_extension(s);});
+#endif
   parser.option(0, "isa", 1, [&](const char* s){isa = s;});
   parser.parse(argv);
 
-  processor_t p(isa, 0, 0);
-  if (extension)
-    p.register_extension(extension());
+  std::string lowercase;
+  for (const char *p = isa; *p; p++)
+    lowercase += std::tolower(*p);
+
+  int xlen;
+  if (lowercase.compare(0, 4, "rv32") == 0) {
+    xlen = 32;
+  } else if (lowercase.compare(0, 4, "rv64") == 0) {
+    xlen = 64;
+  } else {
+    fprintf(stderr, "bad ISA string: %s\n", isa);
+    return 1;
+  }
+
+  disassembler_t* disassembler = new disassembler_t(xlen);
+  if (extension) {
+    for (auto disasm_insn : extension()->get_disasms()) {
+      disassembler->add_insn(disasm_insn);
+    }
+  }
 
   while (getline(cin, s))
   {
@@ -52,7 +71,7 @@ int main(int argc, char** argv)
       if (nbits < 64)
         bits = bits << (64 - nbits) >> (64 - nbits);
 
-      string dis = p.get_disassembler()->disassemble(bits);
+      string dis = disassembler->disassemble(bits);
       s = s.substr(0, start) + dis + s.substr(endp - &s[0] + 1);
       pos = start + dis.length();
     }
diff --git a/spike_dasm/spike_dasm.ac b/spike_dasm/spike_dasm.ac
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/spike_dasm/spike_dasm.mk.in b/spike_dasm/spike_dasm.mk.in
new file mode 100644
index 0000000000..b6118fd5c6
--- /dev/null
+++ b/spike_dasm/spike_dasm.mk.in
@@ -0,0 +1,9 @@
+spike_dasm_subproject_deps = \
+	disasm \
+  $(if $(HAVE_DLOPEN),riscv,) \
+
+spike_dasm_srcs = \
+  spike_dasm_option_parser.cc \
+
+spike_dasm_install_prog_srcs = \
+	spike-dasm.cc \
diff --git a/spike_dasm/spike_dasm_option_parser.cc b/spike_dasm/spike_dasm_option_parser.cc
new file mode 120000
index 0000000000..4244c15de1
--- /dev/null
+++ b/spike_dasm/spike_dasm_option_parser.cc
@@ -0,0 +1 @@
+../fesvr/option_parser.cc
\ No newline at end of file
diff --git a/spike_main/disasm.cc b/spike_main/disasm.cc
deleted file mode 100644
index 81264dd802..0000000000
--- a/spike_main/disasm.cc
+++ /dev/null
@@ -1,648 +0,0 @@
-// See LICENSE for license details.
-
-#include "disasm.h"
-#include <string>
-#include <vector>
-#include <cstdarg>
-#include <sstream>
-#include <stdlib.h>
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.i_imm()) + '(' + xpr_name[insn.rs1()] + ')';
-  }
-} load_address;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.s_imm()) + '(' + xpr_name[insn.rs1()] + ')';
-  }
-} store_address;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::string("(") + xpr_name[insn.rs1()] + ')';
-  }
-} amo_address;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return xpr_name[insn.rd()];
-  }
-} xrd;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return xpr_name[insn.rs1()];
-  }
-} xrs1;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return xpr_name[insn.rs2()];
-  }
-} xrs2;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return fpr_name[insn.rd()];
-  }
-} frd;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return fpr_name[insn.rs1()];
-  }
-} frs1;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return fpr_name[insn.rs2()];
-  }
-} frs2;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return fpr_name[insn.rs3()];
-  }
-} frs3;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    switch (insn.csr())
-    {
-      #define DECLARE_CSR(name, num) case num: return #name;
-      #include "encoding.h"
-      #undef DECLARE_CSR
-      default:
-      {
-        char buf[16];
-        snprintf(buf, sizeof buf, "unknown_%03" PRIx64, insn.csr());
-        return std::string(buf);
-      }
-    }
-  }
-} csr;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.i_imm());
-  }
-} imm;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.shamt());
-  }
-} shamt;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    std::stringstream s;
-    s << std::hex << "0x" << ((uint32_t)insn.u_imm() >> 12);
-    return s.str();
-  }
-} bigimm;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string(insn.rs1());
-  }
-} zimm5;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    std::stringstream s;
-    int32_t target = insn.sb_imm();
-    char sign = target >= 0 ? '+' : '-';
-    s << "pc " << sign << ' ' << abs(target);
-    return s.str();
-  }
-} branch_target;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    std::stringstream s;
-    int32_t target = insn.uj_imm();
-    char sign = target >= 0 ? '+' : '-';
-    s << "pc " << sign << std::hex << " 0x" << abs(target);
-    return s.str();
-  }
-} jump_target;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return xpr_name[insn.rvc_rs1()];
-  }
-} rvc_rs1;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return xpr_name[insn.rvc_rs2()];
-  }
-} rvc_rs2;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return fpr_name[insn.rvc_rs2()];
-  }
-} rvc_fp_rs2;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return xpr_name[insn.rvc_rs1s()];
-  }
-} rvc_rs1s;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return xpr_name[insn.rvc_rs2s()];
-  }
-} rvc_rs2s;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return fpr_name[insn.rvc_rs2s()];
-  }
-} rvc_fp_rs2s;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return xpr_name[X_SP];
-  }
-} rvc_sp;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.rvc_imm());
-  }
-} rvc_imm;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.rvc_addi4spn_imm());
-  }
-} rvc_addi4spn_imm;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.rvc_addi16sp_imm());
-  }
-} rvc_addi16sp_imm;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.rvc_lwsp_imm());
-  }
-} rvc_lwsp_imm;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)(insn.rvc_imm() & 0x3f));
-  }
-} rvc_shamt;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    std::stringstream s;
-    s << std::hex << "0x" << ((uint32_t)insn.rvc_imm() << 12 >> 12);
-    return s.str();
-  }
-} rvc_uimm;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.rvc_lwsp_imm()) + '(' + xpr_name[X_SP] + ')';
-  }
-} rvc_lwsp_address;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.rvc_ldsp_imm()) + '(' + xpr_name[X_SP] + ')';
-  }
-} rvc_ldsp_address;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.rvc_swsp_imm()) + '(' + xpr_name[X_SP] + ')';
-  }
-} rvc_swsp_address;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.rvc_sdsp_imm()) + '(' + xpr_name[X_SP] + ')';
-  }
-} rvc_sdsp_address;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.rvc_lw_imm()) + '(' + xpr_name[insn.rvc_rs1s()] + ')';
-  }
-} rvc_lw_address;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    return std::to_string((int)insn.rvc_ld_imm()) + '(' + xpr_name[insn.rvc_rs1s()] + ')';
-  }
-} rvc_ld_address;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    std::stringstream s;
-    int32_t target = insn.rvc_b_imm();
-    char sign = target >= 0 ? '+' : '-';
-    s << "pc " << sign << ' ' << abs(target);
-    return s.str();
-  }
-} rvc_branch_target;
-
-struct : public arg_t {
-  std::string to_string(insn_t insn) const {
-    std::stringstream s;
-    int32_t target = insn.rvc_j_imm();
-    char sign = target >= 0 ? '+' : '-';
-    s << "pc " << sign << ' ' << abs(target);
-    return s.str();
-  }
-} rvc_jump_target;
-
-std::string disassembler_t::disassemble(insn_t insn) const
-{
-  const disasm_insn_t* disasm_insn = lookup(insn);
-  return disasm_insn ? disasm_insn->to_string(insn) : "unknown";
-}
-
-disassembler_t::disassembler_t(int xlen)
-{
-  const uint32_t mask_rd = 0x1fUL << 7;
-  const uint32_t match_rd_ra = 1UL << 7;
-  const uint32_t mask_rs1 = 0x1fUL << 15;
-  const uint32_t match_rs1_ra = 1UL << 15;
-  const uint32_t mask_rs2 = 0x1fUL << 20;
-  const uint32_t mask_imm = 0xfffUL << 20;
-  const uint32_t match_imm_1 = 1UL << 20;
-  const uint32_t mask_rvc_rs2 = 0x1fUL << 2;
-  const uint32_t mask_rvc_imm = mask_rvc_rs2 | 0x1000UL;
-
-  #define DECLARE_INSN(code, match, mask) \
-   const uint32_t match_##code = match; \
-   const uint32_t mask_##code = mask;
-  #include "encoding.h"
-  #undef DECLARE_INSN
-
-  // explicit per-instruction disassembly
-  #define DISASM_INSN(name, code, extra, ...) \
-    add_insn(new disasm_insn_t(name, match_##code, mask_##code | (extra), __VA_ARGS__));
-  #define DEFINE_NOARG(code) \
-    add_insn(new disasm_insn_t(#code, match_##code, mask_##code, {}));
-  #define DEFINE_RTYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &xrs2})
-  #define DEFINE_ITYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &imm})
-  #define DEFINE_ITYPE_SHIFT(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &shamt})
-  #define DEFINE_I0TYPE(name, code) DISASM_INSN(name, code, mask_rs1, {&xrd, &imm})
-  #define DEFINE_I1TYPE(name, code) DISASM_INSN(name, code, mask_imm, {&xrd, &xrs1})
-  #define DEFINE_I2TYPE(name, code) DISASM_INSN(name, code, mask_rd | mask_imm, {&xrs1})
-  #define DEFINE_LTYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &bigimm})
-  #define DEFINE_BTYPE(code) DISASM_INSN(#code, code, 0, {&xrs1, &xrs2, &branch_target})
-  #define DEFINE_B0TYPE(name, code) DISASM_INSN(name, code, mask_rs1 | mask_rs2, {&branch_target})
-  #define DEFINE_B1TYPE(name, code) DISASM_INSN(name, code, mask_rs2, {&xrs1, &branch_target})
-  #define DEFINE_XLOAD(code) DISASM_INSN(#code, code, 0, {&xrd, &load_address})
-  #define DEFINE_XSTORE(code) DISASM_INSN(#code, code, 0, {&xrs2, &store_address})
-  #define DEFINE_XAMO(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs2, &amo_address})
-  #define DEFINE_XAMO_LR(code) DISASM_INSN(#code, code, 0, {&xrd, &amo_address})
-  #define DEFINE_FLOAD(code) DISASM_INSN(#code, code, 0, {&frd, &load_address})
-  #define DEFINE_FSTORE(code) DISASM_INSN(#code, code, 0, {&frs2, &store_address})
-  #define DEFINE_FRTYPE(code) DISASM_INSN(#code, code, 0, {&frd, &frs1, &frs2})
-  #define DEFINE_FR1TYPE(code) DISASM_INSN(#code, code, 0, {&frd, &frs1})
-  #define DEFINE_FR3TYPE(code) DISASM_INSN(#code, code, 0, {&frd, &frs1, &frs2, &frs3})
-  #define DEFINE_FXTYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &frs1})
-  #define DEFINE_FX2TYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &frs1, &frs2})
-  #define DEFINE_XFTYPE(code) DISASM_INSN(#code, code, 0, {&frd, &xrs1})
-  #define DEFINE_SFENCE_TYPE(code) DISASM_INSN(#code, code, 0, {&xrs1, &xrs2})
-
-  DEFINE_XLOAD(lb)
-  DEFINE_XLOAD(lbu)
-  DEFINE_XLOAD(lh)
-  DEFINE_XLOAD(lhu)
-  DEFINE_XLOAD(lw)
-  DEFINE_XLOAD(lwu)
-  DEFINE_XLOAD(ld)
-
-  DEFINE_XSTORE(sb)
-  DEFINE_XSTORE(sh)
-  DEFINE_XSTORE(sw)
-  DEFINE_XSTORE(sd)
-
-  DEFINE_XAMO(amoadd_w)
-  DEFINE_XAMO(amoswap_w)
-  DEFINE_XAMO(amoand_w)
-  DEFINE_XAMO(amoor_w)
-  DEFINE_XAMO(amoxor_w)
-  DEFINE_XAMO(amomin_w)
-  DEFINE_XAMO(amomax_w)
-  DEFINE_XAMO(amominu_w)
-  DEFINE_XAMO(amomaxu_w)
-  DEFINE_XAMO(amoadd_d)
-  DEFINE_XAMO(amoswap_d)
-  DEFINE_XAMO(amoand_d)
-  DEFINE_XAMO(amoor_d)
-  DEFINE_XAMO(amoxor_d)
-  DEFINE_XAMO(amomin_d)
-  DEFINE_XAMO(amomax_d)
-  DEFINE_XAMO(amominu_d)
-  DEFINE_XAMO(amomaxu_d)
-
-  DEFINE_XAMO_LR(lr_w)
-  DEFINE_XAMO(sc_w)
-  DEFINE_XAMO_LR(lr_d)
-  DEFINE_XAMO(sc_d)
-
-  DEFINE_FLOAD(flw)
-  DEFINE_FLOAD(fld)
-  DEFINE_FLOAD(flq)
-
-  DEFINE_FSTORE(fsw)
-  DEFINE_FSTORE(fsd)
-  DEFINE_FSTORE(fsq)
-
-  add_insn(new disasm_insn_t("j", match_jal, mask_jal | mask_rd, {&jump_target}));
-  add_insn(new disasm_insn_t("jal", match_jal | match_rd_ra, mask_jal | mask_rd, {&jump_target}));
-  add_insn(new disasm_insn_t("jal", match_jal, mask_jal, {&xrd, &jump_target}));
-
-  DEFINE_B1TYPE("beqz", beq);
-  DEFINE_B1TYPE("bnez", bne);
-  DEFINE_B1TYPE("bltz", blt);
-  DEFINE_B1TYPE("bgez", bge);
-  DEFINE_BTYPE(beq)
-  DEFINE_BTYPE(bne)
-  DEFINE_BTYPE(blt)
-  DEFINE_BTYPE(bge)
-  DEFINE_BTYPE(bltu)
-  DEFINE_BTYPE(bgeu)
-
-  DEFINE_LTYPE(lui);
-  DEFINE_LTYPE(auipc);
-
-  add_insn(new disasm_insn_t("ret", match_jalr | match_rs1_ra, mask_jalr | mask_rd | mask_rs1 | mask_imm, {}));
-  DEFINE_I2TYPE("jr", jalr);
-  add_insn(new disasm_insn_t("jalr", match_jalr | match_rd_ra, mask_jalr | mask_rd | mask_imm, {&xrs1}));
-  DEFINE_ITYPE(jalr);
-
-  add_insn(new disasm_insn_t("nop", match_addi, mask_addi | mask_rd | mask_rs1 | mask_imm, {}));
-  add_insn(new disasm_insn_t(" - ", match_xor, mask_xor | mask_rd | mask_rs1 | mask_rs2, {})); // for machine-generated bubbles
-  DEFINE_I0TYPE("li", addi);
-  DEFINE_I1TYPE("mv", addi);
-  DEFINE_ITYPE(addi);
-  DEFINE_ITYPE(slti);
-  add_insn(new disasm_insn_t("seqz", match_sltiu | match_imm_1, mask_sltiu | mask_imm, {&xrd, &xrs1}));
-  DEFINE_ITYPE(sltiu);
-  add_insn(new disasm_insn_t("not", match_xori | mask_imm, mask_xori | mask_imm, {&xrd, &xrs1}));
-  DEFINE_ITYPE(xori);
-
-  DEFINE_ITYPE_SHIFT(slli);
-  DEFINE_ITYPE_SHIFT(srli);
-  DEFINE_ITYPE_SHIFT(srai);
-
-  DEFINE_ITYPE(ori);
-  DEFINE_ITYPE(andi);
-  DEFINE_I1TYPE("sext.w", addiw);
-  DEFINE_ITYPE(addiw);
-
-  DEFINE_ITYPE_SHIFT(slliw);
-  DEFINE_ITYPE_SHIFT(srliw);
-  DEFINE_ITYPE_SHIFT(sraiw);
-
-  DEFINE_RTYPE(add);
-  DEFINE_RTYPE(sub);
-  DEFINE_RTYPE(sll);
-  DEFINE_RTYPE(slt);
-  add_insn(new disasm_insn_t("snez", match_sltu, mask_sltu | mask_rs1, {&xrd, &xrs2}));
-  DEFINE_RTYPE(sltu);
-  DEFINE_RTYPE(xor);
-  DEFINE_RTYPE(srl);
-  DEFINE_RTYPE(sra);
-  DEFINE_RTYPE(or);
-  DEFINE_RTYPE(and);
-  DEFINE_RTYPE(mul);
-  DEFINE_RTYPE(mulh);
-  DEFINE_RTYPE(mulhu);
-  DEFINE_RTYPE(mulhsu);
-  DEFINE_RTYPE(div);
-  DEFINE_RTYPE(divu);
-  DEFINE_RTYPE(rem);
-  DEFINE_RTYPE(remu);
-  DEFINE_RTYPE(addw);
-  DEFINE_RTYPE(subw);
-  DEFINE_RTYPE(sllw);
-  DEFINE_RTYPE(srlw);
-  DEFINE_RTYPE(sraw);
-  DEFINE_RTYPE(mulw);
-  DEFINE_RTYPE(divw);
-  DEFINE_RTYPE(divuw);
-  DEFINE_RTYPE(remw);
-  DEFINE_RTYPE(remuw);
-
-  DEFINE_NOARG(ecall);
-  DEFINE_NOARG(ebreak);
-  DEFINE_NOARG(uret);
-  DEFINE_NOARG(sret);
-  DEFINE_NOARG(mret);
-  DEFINE_NOARG(dret);
-  DEFINE_NOARG(wfi);
-  DEFINE_NOARG(fence);
-  DEFINE_NOARG(fence_i);
-  DEFINE_SFENCE_TYPE(sfence_vma);
-
-  add_insn(new disasm_insn_t("csrr", match_csrrs, mask_csrrs | mask_rs1, {&xrd, &csr}));
-  add_insn(new disasm_insn_t("csrw", match_csrrw, mask_csrrw | mask_rd, {&csr, &xrs1}));
-  add_insn(new disasm_insn_t("csrs", match_csrrs, mask_csrrs | mask_rd, {&csr, &xrs1}));
-  add_insn(new disasm_insn_t("csrc", match_csrrc, mask_csrrc | mask_rd, {&csr, &xrs1}));
-  add_insn(new disasm_insn_t("csrwi", match_csrrwi, mask_csrrwi | mask_rd, {&csr, &zimm5}));
-  add_insn(new disasm_insn_t("csrsi", match_csrrsi, mask_csrrsi | mask_rd, {&csr, &zimm5}));
-  add_insn(new disasm_insn_t("csrci", match_csrrci, mask_csrrci | mask_rd, {&csr, &zimm5}));
-  add_insn(new disasm_insn_t("csrrw", match_csrrw, mask_csrrw, {&xrd, &csr, &xrs1}));
-  add_insn(new disasm_insn_t("csrrs", match_csrrs, mask_csrrs, {&xrd, &csr, &xrs1}));
-  add_insn(new disasm_insn_t("csrrc", match_csrrc, mask_csrrc, {&xrd, &csr, &xrs1}));
-  add_insn(new disasm_insn_t("csrrwi", match_csrrwi, mask_csrrwi, {&xrd, &csr, &zimm5}));
-  add_insn(new disasm_insn_t("csrrsi", match_csrrsi, mask_csrrsi, {&xrd, &csr, &zimm5}));
-  add_insn(new disasm_insn_t("csrrci", match_csrrci, mask_csrrci, {&xrd, &csr, &zimm5}));
-
-  DEFINE_FRTYPE(fadd_s);
-  DEFINE_FRTYPE(fsub_s);
-  DEFINE_FRTYPE(fmul_s);
-  DEFINE_FRTYPE(fdiv_s);
-  DEFINE_FR1TYPE(fsqrt_s);
-  DEFINE_FRTYPE(fmin_s);
-  DEFINE_FRTYPE(fmax_s);
-  DEFINE_FR3TYPE(fmadd_s);
-  DEFINE_FR3TYPE(fmsub_s);
-  DEFINE_FR3TYPE(fnmadd_s);
-  DEFINE_FR3TYPE(fnmsub_s);
-  DEFINE_FRTYPE(fsgnj_s);
-  DEFINE_FRTYPE(fsgnjn_s);
-  DEFINE_FRTYPE(fsgnjx_s);
-  DEFINE_FR1TYPE(fcvt_s_d);
-  DEFINE_FR1TYPE(fcvt_s_q);
-  DEFINE_XFTYPE(fcvt_s_l);
-  DEFINE_XFTYPE(fcvt_s_lu);
-  DEFINE_XFTYPE(fcvt_s_w);
-  DEFINE_XFTYPE(fcvt_s_wu);
-  DEFINE_XFTYPE(fcvt_s_wu);
-  DEFINE_XFTYPE(fmv_w_x);
-  DEFINE_FXTYPE(fcvt_l_s);
-  DEFINE_FXTYPE(fcvt_lu_s);
-  DEFINE_FXTYPE(fcvt_w_s);
-  DEFINE_FXTYPE(fcvt_wu_s);
-  DEFINE_FXTYPE(fclass_s);
-  DEFINE_FXTYPE(fmv_x_w);
-  DEFINE_FX2TYPE(feq_s);
-  DEFINE_FX2TYPE(flt_s);
-  DEFINE_FX2TYPE(fle_s);
-
-  DEFINE_FRTYPE(fadd_d);
-  DEFINE_FRTYPE(fsub_d);
-  DEFINE_FRTYPE(fmul_d);
-  DEFINE_FRTYPE(fdiv_d);
-  DEFINE_FR1TYPE(fsqrt_d);
-  DEFINE_FRTYPE(fmin_d);
-  DEFINE_FRTYPE(fmax_d);
-  DEFINE_FR3TYPE(fmadd_d);
-  DEFINE_FR3TYPE(fmsub_d);
-  DEFINE_FR3TYPE(fnmadd_d);
-  DEFINE_FR3TYPE(fnmsub_d);
-  DEFINE_FRTYPE(fsgnj_d);
-  DEFINE_FRTYPE(fsgnjn_d);
-  DEFINE_FRTYPE(fsgnjx_d);
-  DEFINE_FR1TYPE(fcvt_d_s);
-  DEFINE_FR1TYPE(fcvt_d_q);
-  DEFINE_XFTYPE(fcvt_d_l);
-  DEFINE_XFTYPE(fcvt_d_lu);
-  DEFINE_XFTYPE(fcvt_d_w);
-  DEFINE_XFTYPE(fcvt_d_wu);
-  DEFINE_XFTYPE(fcvt_d_wu);
-  DEFINE_XFTYPE(fmv_d_x);
-  DEFINE_FXTYPE(fcvt_l_d);
-  DEFINE_FXTYPE(fcvt_lu_d);
-  DEFINE_FXTYPE(fcvt_w_d);
-  DEFINE_FXTYPE(fcvt_wu_d);
-  DEFINE_FXTYPE(fclass_d);
-  DEFINE_FXTYPE(fmv_x_d);
-  DEFINE_FX2TYPE(feq_d);
-  DEFINE_FX2TYPE(flt_d);
-  DEFINE_FX2TYPE(fle_d);
-
-  DEFINE_FRTYPE(fadd_q);
-  DEFINE_FRTYPE(fsub_q);
-  DEFINE_FRTYPE(fmul_q);
-  DEFINE_FRTYPE(fdiv_q);
-  DEFINE_FR1TYPE(fsqrt_q);
-  DEFINE_FRTYPE(fmin_q);
-  DEFINE_FRTYPE(fmax_q);
-  DEFINE_FR3TYPE(fmadd_q);
-  DEFINE_FR3TYPE(fmsub_q);
-  DEFINE_FR3TYPE(fnmadd_q);
-  DEFINE_FR3TYPE(fnmsub_q);
-  DEFINE_FRTYPE(fsgnj_q);
-  DEFINE_FRTYPE(fsgnjn_q);
-  DEFINE_FRTYPE(fsgnjx_q);
-  DEFINE_FR1TYPE(fcvt_q_s);
-  DEFINE_FR1TYPE(fcvt_q_d);
-  DEFINE_XFTYPE(fcvt_q_l);
-  DEFINE_XFTYPE(fcvt_q_lu);
-  DEFINE_XFTYPE(fcvt_q_w);
-  DEFINE_XFTYPE(fcvt_q_wu);
-  DEFINE_XFTYPE(fcvt_q_wu);
-  DEFINE_XFTYPE(fmv_q_x);
-  DEFINE_FXTYPE(fcvt_l_q);
-  DEFINE_FXTYPE(fcvt_lu_q);
-  DEFINE_FXTYPE(fcvt_w_q);
-  DEFINE_FXTYPE(fcvt_wu_q);
-  DEFINE_FXTYPE(fclass_q);
-  DEFINE_FXTYPE(fmv_x_q);
-  DEFINE_FX2TYPE(feq_q);
-  DEFINE_FX2TYPE(flt_q);
-  DEFINE_FX2TYPE(fle_q);
-
-  DISASM_INSN("c.ebreak", c_add, mask_rd | mask_rvc_rs2, {});
-  add_insn(new disasm_insn_t("ret", match_c_jr | match_rd_ra, mask_c_jr | mask_rd | mask_rvc_imm, {}));
-  DISASM_INSN("c.jr", c_jr, mask_rvc_imm, {&rvc_rs1});
-  DISASM_INSN("c.jalr", c_jalr, mask_rvc_imm, {&rvc_rs1});
-  DISASM_INSN("c.nop", c_addi, mask_rd | mask_rvc_imm, {});
-  DISASM_INSN("c.addi16sp", c_addi16sp, mask_rd, {&rvc_sp, &rvc_addi16sp_imm});
-  DISASM_INSN("c.addi4spn", c_addi4spn, 0, {&rvc_rs2s, &rvc_sp, &rvc_addi4spn_imm});
-  DISASM_INSN("c.li", c_li, 0, {&xrd, &rvc_imm});
-  DISASM_INSN("c.lui", c_lui, 0, {&xrd, &rvc_uimm});
-  DISASM_INSN("c.addi", c_addi, 0, {&xrd, &rvc_imm});
-  DISASM_INSN("c.slli", c_slli, 0, {&rvc_rs1, &rvc_shamt});
-  DISASM_INSN("c.srli", c_srli, 0, {&rvc_rs1s, &rvc_shamt});
-  DISASM_INSN("c.srai", c_srai, 0, {&rvc_rs1s, &rvc_shamt});
-  DISASM_INSN("c.andi", c_andi, 0, {&rvc_rs1s, &rvc_imm});
-  DISASM_INSN("c.mv", c_mv, 0, {&xrd, &rvc_rs2});
-  DISASM_INSN("c.add", c_add, 0, {&xrd, &rvc_rs2});
-  DISASM_INSN("c.addw", c_addw, 0, {&rvc_rs1s, &rvc_rs2s});
-  DISASM_INSN("c.sub", c_sub, 0, {&rvc_rs1s, &rvc_rs2s});
-  DISASM_INSN("c.subw", c_subw, 0, {&rvc_rs1s, &rvc_rs2s});
-  DISASM_INSN("c.and", c_and, 0, {&rvc_rs1s, &rvc_rs2s});
-  DISASM_INSN("c.or", c_or, 0, {&rvc_rs1s, &rvc_rs2s});
-  DISASM_INSN("c.xor", c_xor, 0, {&rvc_rs1s, &rvc_rs2s});
-  DISASM_INSN("c.lwsp", c_lwsp, 0, {&xrd, &rvc_lwsp_address});
-  DISASM_INSN("c.fld", c_fld, 0, {&rvc_fp_rs2s, &rvc_ld_address});
-  DISASM_INSN("c.swsp", c_swsp, 0, {&rvc_rs2, &rvc_swsp_address});
-  DISASM_INSN("c.lw", c_lw, 0, {&rvc_rs2s, &rvc_lw_address});
-  DISASM_INSN("c.sw", c_sw, 0, {&rvc_rs2s, &rvc_lw_address});
-  DISASM_INSN("c.beqz", c_beqz, 0, {&rvc_rs1s, &rvc_branch_target});
-  DISASM_INSN("c.bnez", c_bnez, 0, {&rvc_rs1s, &rvc_branch_target});
-  DISASM_INSN("c.j", c_j, 0, {&rvc_jump_target});
-  DISASM_INSN("c.fldsp", c_fldsp, 0, {&rvc_fp_rs2s, &rvc_ldsp_address});
-  DISASM_INSN("c.fsd", c_fsd, 0, {&rvc_fp_rs2s, &rvc_ld_address});
-  DISASM_INSN("c.fsdsp", c_fsdsp, 0, {&rvc_fp_rs2s, &rvc_sdsp_address});
-
-  if (xlen == 32) {
-    DISASM_INSN("c.flw", c_flw, 0, {&rvc_fp_rs2s, &rvc_lw_address});
-    DISASM_INSN("c.flwsp", c_flwsp, 0, {&frd, &rvc_lwsp_address});
-    DISASM_INSN("c.fsw", c_fsw, 0, {&rvc_fp_rs2s, &rvc_lw_address});
-    DISASM_INSN("c.fswsp", c_fswsp, 0, {&rvc_fp_rs2, &rvc_swsp_address});
-    DISASM_INSN("c.jal", c_jal, 0, {&rvc_jump_target});
-  } else {
-    DISASM_INSN("c.ld", c_ld, 0, {&rvc_rs2s, &rvc_ld_address});
-    DISASM_INSN("c.ldsp", c_ldsp, 0, {&xrd, &rvc_ldsp_address});
-    DISASM_INSN("c.sd", c_sd, 0, {&rvc_rs2s, &rvc_ld_address});
-    DISASM_INSN("c.sdsp", c_sdsp, 0, {&rvc_rs2, &rvc_sdsp_address});
-    DISASM_INSN("c.addiw", c_addiw, 0, {&xrd, &rvc_imm});
-  }
-
-  // provide a default disassembly for all instructions as a fallback
-  #define DECLARE_INSN(code, match, mask) \
-   add_insn(new disasm_insn_t(#code " (args unknown)", match, mask, {}));
-  #include "encoding.h"
-  #undef DECLARE_INSN
-}
-
-const disasm_insn_t* disassembler_t::lookup(insn_t insn) const
-{
-  size_t idx = insn.bits() % HASH_SIZE;
-  for (size_t j = 0; j < chain[idx].size(); j++)
-    if(*chain[idx][j] == insn)
-      return chain[idx][j];
-
-  idx = HASH_SIZE;
-  for (size_t j = 0; j < chain[idx].size(); j++)
-    if(*chain[idx][j] == insn)
-      return chain[idx][j];
-
-  return NULL;
-}
-
-void disassembler_t::add_insn(disasm_insn_t* insn)
-{
-  size_t idx = HASH_SIZE;
-  if (insn->get_mask() % HASH_SIZE == HASH_SIZE - 1)
-    idx = insn->get_match() % HASH_SIZE;
-  chain[idx].push_back(insn);
-}
-
-disassembler_t::~disassembler_t()
-{
-  for (size_t i = 0; i < HASH_SIZE+1; i++)
-    for (size_t j = 0; j < chain[i].size(); j++)
-      delete chain[i][j];
-}
diff --git a/spike_main/spike-log-parser.cc b/spike_main/spike-log-parser.cc
new file mode 100644
index 0000000000..d174afc6ba
--- /dev/null
+++ b/spike_main/spike-log-parser.cc
@@ -0,0 +1,60 @@
+// See LICENSE for license details.
+
+// This little program finds occurrences of strings like
+//   core   0: 0x000000008000c36c (0xfe843783) ld      a5, -24(s0)
+// in its input, then outputs the name of the RISC-V instruction encoded by
+// the parenthesized hexadecimal number.
+
+#include <iostream>
+#include <string>
+#include <cstdint>
+#include <regex>
+#include "fesvr/option_parser.h"
+
+#include "disasm.h"
+#include "extension.h"
+
+using namespace std;
+
+int main(int argc, char** argv)  // reads lines from stdin, prints one instruction name per matching line
+{
+  string s;
+  const char* isa = DEFAULT_ISA;
+  // Register the "extension" and "isa" options; each handler stores its argument.
+  std::function<extension_t*()> extension;
+  option_parser_t parser;
+  parser.option(0, "extension", 1, [&](const char* s){extension = find_extension(s);});
+  parser.option(0, "isa", 1, [&](const char* s){isa = s;});
+  parser.parse(argv);
+  // Build a processor for the chosen ISA; below it is only used for its disassembler.
+  processor_t p(isa, DEFAULT_PRIV, DEFAULT_VARCH, 0, 0, false, nullptr);
+  if (extension) {
+    p.register_extension(extension());
+  }
+  // Matches lines starting with "core <n>: 0x<hex> (0x<bits>)" and captures <bits> (case-insensitive).
+  std::regex reg("^core\\s+\\d+:\\s+0x[0-9a-f]+\\s+\\(0x([0-9a-f]+)\\)", std::regex_constants::icase);
+  std::smatch m;
+  std::ssub_match sm ;  // NOTE(review): sm is never used in this function
+
+  while (getline(cin,s)){
+    if (regex_search(s, m, reg)){
+      // the opcode string
+      string op = m[1].str();
+      uint32_t bit_num = op.size() * 4;  // 4 bits per captured hex digit
+      uint64_t opcode = strtoull(op.c_str(), nullptr, 16);
+
+      if (bit_num<64){
+          opcode = opcode << (64-bit_num) >> (64-bit_num);  // keep only the low bit_num bits
+      }
+      // Look the encoding up in the disassembler table and print only the entry's name.
+      const disasm_insn_t* disasm = p.get_disassembler()->lookup(opcode);
+      if (disasm) {
+          cout << disasm->get_name() << '\n';
+      } else {
+          cout << "unknown_op\n";  // no table entry matched this encoding
+      }
+    }
+  }
+
+  return 0;
+}
diff --git a/spike_main/spike.cc b/spike_main/spike.cc
index 3e5c7e6c60..89bf915103 100644
--- a/spike_main/spike.cc
+++ b/spike_main/spike.cc
@@ -12,9 +12,12 @@
 #include <vector>
 #include <string>
 #include <memory>
+#include <fstream>
+#include "../VERSION"
 
-static void help()
+static void help(int exit_code = 1)  // prints the usage text to stderr, then exits with exit_code
 {
+  fprintf(stderr, "Spike RISC-V ISA Simulator " SPIKE_VERSION "\n\n");
   fprintf(stderr, "usage: spike [host options] <target program> [target options]\n");
   fprintf(stderr, "Host Options:\n");
   fprintf(stderr, "  -p<n>                 Simulate <n> processors [default 1]\n");
@@ -24,27 +27,113 @@ static void help()
   fprintf(stderr, "  -d                    Interactive debug mode\n");
   fprintf(stderr, "  -g                    Track histogram of PCs\n");
   fprintf(stderr, "  -l                    Generate a log of execution\n");
-  fprintf(stderr, "  -h                    Print this help message\n");
+  fprintf(stderr, "  -h, --help            Print this help message\n");
   fprintf(stderr, "  -H                    Start halted, allowing a debugger to connect\n");
   fprintf(stderr, "  --isa=<name>          RISC-V ISA string [default %s]\n", DEFAULT_ISA);
+  fprintf(stderr, "  --priv=<m|mu|msu>     RISC-V privilege modes supported [default %s]\n", DEFAULT_PRIV);
+  fprintf(stderr, "  --varch=<name>        RISC-V Vector uArch string [default %s]\n", DEFAULT_VARCH);
   fprintf(stderr, "  --pc=<address>        Override ELF entry point\n");
   fprintf(stderr, "  --hartids=<a,b,...>   Explicitly specify hartids, default is 0,1,...\n");
   fprintf(stderr, "  --ic=<S>:<W>:<B>      Instantiate a cache model with S sets,\n");
   fprintf(stderr, "  --dc=<S>:<W>:<B>        W ways, and B-byte blocks (with S and\n");
   fprintf(stderr, "  --l2=<S>:<W>:<B>        B both powers of 2).\n");
+  fprintf(stderr, "  --device=<P,B,A>      Attach MMIO plugin device from an --extlib library\n");
+  fprintf(stderr, "                          P -- Name of the MMIO plugin\n");
+  fprintf(stderr, "                          B -- Base memory address of the device\n");
+  fprintf(stderr, "                          A -- String arguments to pass to the plugin\n");
+  fprintf(stderr, "                          This flag can be used multiple times.\n");
+  fprintf(stderr, "                          The extlib flag for the library must come first.\n");
   fprintf(stderr, "  --log-cache-miss      Generate a log of cache miss\n");
   fprintf(stderr, "  --extension=<name>    Specify RoCC Extension\n");
   fprintf(stderr, "  --extlib=<name>       Shared library to load\n");
+  fprintf(stderr, "                        This flag can be used multiple times.\n");
   fprintf(stderr, "  --rbb-port=<port>     Listen on <port> for remote bitbang connection\n");
   fprintf(stderr, "  --dump-dts            Print device tree string and exit\n");
   fprintf(stderr, "  --disable-dtb         Don't write the device tree blob into memory\n");
-  fprintf(stderr, "  --progsize=<words>    Progsize for the debug module [default 2]\n");
-  fprintf(stderr, "  --debug-sba=<bits>    Debug bus master supports up to "
+  fprintf(stderr, "  --kernel=<path>       Load kernel flat image into memory\n");
+  fprintf(stderr, "  --initrd=<path>       Load kernel initrd into memory\n");
+  fprintf(stderr, "  --bootargs=<args>     Provide custom bootargs for kernel [default: console=hvc0 earlycon=sbi]\n");
+  fprintf(stderr, "  --real-time-clint     Increment clint time at real-time rate\n");
+  fprintf(stderr, "  --dm-progsize=<words> Progsize for the debug module [default 2]\n");
+  fprintf(stderr, "  --dm-sba=<bits>       Debug bus master supports up to "
       "<bits> wide accesses [default 0]\n");
-  fprintf(stderr, "  --debug-auth          Debug module requires debugger to authenticate\n");
+  fprintf(stderr, "  --dm-auth             Debug module requires debugger to authenticate\n");
+  fprintf(stderr, "  --dmi-rti=<n>         Number of Run-Test/Idle cycles "
+      "required for a DMI access [default 0]\n");
+  fprintf(stderr, "  --dm-abstract-rti=<n> Number of Run-Test/Idle cycles "
+      "required for an abstract command to execute [default 0]\n");
+  fprintf(stderr, "  --dm-no-hasel         Debug module won't support hasel\n");
+  fprintf(stderr, "  --dm-no-abstract-csr  Debug module won't support abstract CSR access\n");
+  fprintf(stderr, "  --dm-no-halt-groups   Debug module won't support halt groups\n");
+  fprintf(stderr, "  --dm-no-impebreak     Debug module won't support implicit ebreak in program buffer\n");
+
+  exit(exit_code);
+}
+
+static void suggest_help()  // prints a one-line pointer to 'spike --help' on stderr, then exits with status 1
+{
+  fprintf(stderr, "Try 'spike --help' for more information.\n");
   exit(1);
 }
 
+static bool check_file_exists(const char *fileName)  // true if fileName could be opened for reading
+{
+  std::ifstream infile(fileName);
+  return infile.good();  // good() is false when the open failed
+}
+
+static std::ifstream::pos_type get_file_size(const char *filename)  // file size = end-of-file position
+{
+  std::ifstream in(filename, std::ios::ate | std::ios::binary);  // ate: open positioned at the end
+  return in.tellg();  // tellg() yields pos_type(-1) if the stream is in a failed state (e.g. open error)
+}
+
+static void read_file_bytes(const char *filename,size_t fileoff,  // reads up to read_sz bytes, starting at
+                            char *read_buf, size_t read_sz)       // byte offset fileoff, into read_buf
+{
+  std::ifstream in(filename, std::ios::in | std::ios::binary);
+  in.seekg(fileoff, std::ios::beg);
+  in.read(read_buf, read_sz);  // NOTE: stream state is never checked, so a failed or short read is silent
+}
+
+bool sort_mem_region(const std::pair<reg_t, mem_t*> &a,  // std::sort comparator: true if a orders before b
+                       const std::pair<reg_t, mem_t*> &b)
+{
+  if (a.first == b.first)  // same base: the region with the smaller size() comes first
+    return (a.second->size() < b.second->size());
+  else
+    return (a.first < b.first);  // otherwise order by ascending base
+}
+
+void merge_overlapping_memory_regions(std::vector<std::pair<reg_t, mem_t*>>& mems)
+{
+  // Sort the user-specified regions by base (ties: smaller size first), then walk them
+  // from the highest base down, erasing entries contained in or overlapping a kept one.
+  std::sort(mems.begin(), mems.end(), sort_mem_region);
+  reg_t start_page = 0, end_page = 0;  // page span tracked for the kept entry
+  std::vector<std::pair<reg_t, mem_t*>>::reverse_iterator it = mems.rbegin();  // entry being examined
+  std::vector<std::pair<reg_t, mem_t*>>::reverse_iterator _it = mems.rbegin();  // kept entry
+  for(; it != mems.rend(); ++it) {
+    reg_t _start_page = it->first/PGSIZE;
+    reg_t _end_page = _start_page + it->second->size()/PGSIZE;
+    if (_start_page >= start_page && _end_page <= end_page) {
+      // contained in the tracked span: drop the entry (the mem_t it points to is not freed here)
+      mems.erase(std::next(it).base());  // NOTE(review): vector::erase invalidates iterators at/after this element (incl. it, _it) - confirm
+    }else if ( _start_page < start_page && _end_page > start_page) {
+      // starts below the tracked span and reaches into it: rebase the kept entry, drop this one
+      _it->first = _start_page;  // NOTE(review): stores a page number, not a byte address, and no mem_t is resized - confirm
+      if (_end_page > end_page)
+        end_page = _end_page;
+      mems.erase(std::next(it).base());  // NOTE(review): same iterator-invalidation concern as above
+    }else {
+      _it = it;  // otherwise this entry becomes the kept one
+      start_page = _start_page;
+      end_page = _end_page;
+      assert(start_page < end_page);
+    }
+  }
+}
+
 static std::vector<std::pair<reg_t, mem_t*>> make_mems(const char* arg)
 {
   // handle legacy mem argument
@@ -64,8 +153,23 @@ static std::vector<std::pair<reg_t, mem_t*>> make_mems(const char* arg)
     if (!*p || *p != ':')
       help();
     auto size = strtoull(p + 1, &p, 0);
-    if ((size | base) % PGSIZE != 0)
+
+    // page-align base and size
+    auto base0 = base, size0 = size;
+    size += base0 % PGSIZE;
+    base -= base0 % PGSIZE;
+    if (size % PGSIZE != 0)
+      size += PGSIZE - size % PGSIZE;
+
+    if (base + size < base)
       help();
+
+    if (size != size0) {
+      fprintf(stderr, "Warning: the memory at  [0x%llX, 0x%llX] has been realigned\n"
+                      "to the %ld KiB page size: [0x%llX, 0x%llX]\n",
+              base0, base0 + size0 - 1, PGSIZE / 1024, base, base + size - 1);
+    }
+
     res.push_back(std::make_pair(reg_t(base), new mem_t(size)));
     if (!*p)
       break;
@@ -73,6 +177,8 @@ static std::vector<std::pair<reg_t, mem_t*>> make_mems(const char* arg)
       help();
     arg = p + 1;
   }
+
+  merge_overlapping_memory_regions(res);
   return res;
 }
 
@@ -84,20 +190,41 @@ int main(int argc, char** argv)
   bool log = false;
   bool dump_dts = false;
   bool dtb_enabled = true;
+  bool real_time_clint = false;
   size_t nprocs = 1;
+  const char* kernel = NULL;
+  reg_t kernel_offset, kernel_size;
+  size_t initrd_size;
+  reg_t initrd_start = 0, initrd_end = 0;
+  const char* bootargs = NULL;
   reg_t start_pc = reg_t(-1);
   std::vector<std::pair<reg_t, mem_t*>> mems;
+  std::vector<std::pair<reg_t, abstract_device_t*>> plugin_devices;
   std::unique_ptr<icache_sim_t> ic;
   std::unique_ptr<dcache_sim_t> dc;
   std::unique_ptr<cache_sim_t> l2;
   bool log_cache = false;
+  bool log_commits = false;
+  const char *log_path = nullptr;
   std::function<extension_t*()> extension;
+  const char* initrd = NULL;
   const char* isa = DEFAULT_ISA;
+  const char* priv = DEFAULT_PRIV;
+  const char* varch = DEFAULT_VARCH;
+  const char* dtb_file = NULL;
   uint16_t rbb_port = 0;
   bool use_rbb = false;
-  unsigned progsize = 2;
-  unsigned max_bus_master_bits = 0;
-  bool require_authentication = false;
+  unsigned dmi_rti = 0;
+  debug_module_config_t dm_config = {
+    .progbufsize = 2,
+    .max_bus_master_bits = 0,
+    .require_authentication = false,
+    .abstract_rti = 0,
+    .support_hasel = true,
+    .support_abstract_csr_access = true,
+    .support_haltgroups = true,
+    .support_impebreak = true
+  };
   std::vector<int> hartids;
 
   auto const hartids_parser = [&](const char *s) {
@@ -112,9 +239,52 @@ int main(int argc, char** argv)
     }
   };
 
+  auto const device_parser = [&plugin_devices](const char *s) {
+    const std::string str(s);
+    std::istringstream stream(str);
+
+    // We are parsing a string like name,base,args.
+
+    // Parse the name, which is simply all of the characters leading up to the
+    // first comma. The validity of the plugin name will be checked later.
+    std::string name;
+    std::getline(stream, name, ',');
+    if (name.empty()) {
+      throw std::runtime_error("Plugin name is empty.");
+    }
+
+    // Parse the base address. First, get all of the characters up to the next
+    // comma (or up to the end of the string if there is no comma). Then try to
+    // parse that string as an integer according to the rules of strtoull. It
+    // could be in decimal, hex, or octal. Fail if we were able to parse a
+    // number but there were garbage characters after the valid number. We must
+    // consume the entire string between the commas.
+    std::string base_str;
+    std::getline(stream, base_str, ',');
+    if (base_str.empty()) {
+      throw std::runtime_error("Device base address is empty.");
+    }
+    char* end;
+    reg_t base = static_cast<reg_t>(strtoull(base_str.c_str(), &end, 0));
+    if (end != &*base_str.cend()) {
+      throw std::runtime_error("Error parsing device base address.");
+    }
+
+    // The remainder of the string is the arguments. We could use getline, but
+    // that could ignore newline characters in the arguments. That should be
+    // rare and discouraged, but handle it here anyway with this weird in_avail
+    // technique. The arguments are optional, so if there were no arguments
+    // specified we could end up with an empty string here. That's okay.
+    auto avail = stream.rdbuf()->in_avail();
+    std::string args(avail, '\0');
+    stream.readsome(&args[0], avail);
+
+    plugin_devices.emplace_back(base, new mmio_plugin_device_t(name, args));
+  };
+
   option_parser_t parser;
-  parser.help(&help);
-  parser.option('h', 0, 0, [&](const char* s){help();});
+  parser.help(&suggest_help);
+  parser.option('h', "help", 0, [&](const char* s){help(0);});
   parser.option('d', 0, 0, [&](const char* s){debug = true;});
   parser.option('g', 0, 0, [&](const char* s){histogram = true;});
   parser.option('l', 0, 0, [&](const char* s){log = true;});
@@ -130,9 +300,17 @@ int main(int argc, char** argv)
   parser.option(0, "l2", 1, [&](const char* s){l2.reset(cache_sim_t::construct(s, "L2$"));});
   parser.option(0, "log-cache-miss", 0, [&](const char* s){log_cache = true;});
   parser.option(0, "isa", 1, [&](const char* s){isa = s;});
+  parser.option(0, "priv", 1, [&](const char* s){priv = s;});
+  parser.option(0, "varch", 1, [&](const char* s){varch = s;});
+  parser.option(0, "device", 1, device_parser);
   parser.option(0, "extension", 1, [&](const char* s){extension = find_extension(s);});
   parser.option(0, "dump-dts", 0, [&](const char *s){dump_dts = true;});
   parser.option(0, "disable-dtb", 0, [&](const char *s){dtb_enabled = false;});
+  parser.option(0, "dtb", 1, [&](const char *s){dtb_file = s;});
+  parser.option(0, "kernel", 1, [&](const char* s){kernel = s;});
+  parser.option(0, "initrd", 1, [&](const char* s){initrd = s;});
+  parser.option(0, "bootargs", 1, [&](const char* s){bootargs = s;});
+  parser.option(0, "real-time-clint", 0, [&](const char *s){real_time_clint = true;});
   parser.option(0, "extlib", 1, [&](const char *s){
     void *lib = dlopen(s, RTLD_NOW | RTLD_GLOBAL);
     if (lib == NULL) {
@@ -140,11 +318,28 @@ int main(int argc, char** argv)
       exit(-1);
     }
   });
-  parser.option(0, "progsize", 1, [&](const char* s){progsize = atoi(s);});
-  parser.option(0, "debug-sba", 1,
-      [&](const char* s){max_bus_master_bits = atoi(s);});
-  parser.option(0, "debug-auth", 0,
-      [&](const char* s){require_authentication = true;});
+  parser.option(0, "dm-progsize", 1,
+      [&](const char* s){dm_config.progbufsize = atoi(s);});
+  parser.option(0, "dm-no-impebreak", 0,
+      [&](const char* s){dm_config.support_impebreak = false;});
+  parser.option(0, "dm-sba", 1,
+      [&](const char* s){dm_config.max_bus_master_bits = atoi(s);});
+  parser.option(0, "dm-auth", 0,
+      [&](const char* s){dm_config.require_authentication = true;});
+  parser.option(0, "dmi-rti", 1,
+      [&](const char* s){dmi_rti = atoi(s);});
+  parser.option(0, "dm-abstract-rti", 1,
+      [&](const char* s){dm_config.abstract_rti = atoi(s);});
+  parser.option(0, "dm-no-hasel", 0,
+      [&](const char* s){dm_config.support_hasel = false;});
+  parser.option(0, "dm-no-abstract-csr", 0,
+      [&](const char* s){dm_config.support_abstract_csr_access = false;});
+  parser.option(0, "dm-no-halt-groups", 0,
+      [&](const char* s){dm_config.support_haltgroups = false;});
+  parser.option(0, "log-commits", 0,
+                [&](const char* s){log_commits = true;});
+  parser.option(0, "log", 1,
+                [&](const char* s){log_path = s;});
 
   auto argv1 = parser.parse(argv);
   std::vector<std::string> htif_args(argv1, (const char*const*)argv + argc);
@@ -154,15 +349,42 @@ int main(int argc, char** argv)
   if (!*argv1)
     help();
 
-  sim_t s(isa, nprocs, halted, start_pc, mems, htif_args, std::move(hartids),
-      progsize, max_bus_master_bits, require_authentication);
+  if (kernel && check_file_exists(kernel)) {
+    kernel_size = get_file_size(kernel);
+    if (isa[2] == '6' && isa[3] == '4')
+      kernel_offset = 0x200000;
+    else
+      kernel_offset = 0x400000;
+    for (auto& m : mems) {
+      if (kernel_size && (kernel_offset + kernel_size) < m.second->size()) {
+         read_file_bytes(kernel, 0, m.second->contents() + kernel_offset, kernel_size);
+         break;
+      }
+    }
+  }
+
+  if (initrd && check_file_exists(initrd)) {
+    initrd_size = get_file_size(initrd);
+    for (auto& m : mems) {
+      if (initrd_size && (initrd_size + 0x1000) < m.second->size()) {
+         initrd_end = m.first + m.second->size() - 0x1000;
+         initrd_start = initrd_end - initrd_size;
+         read_file_bytes(initrd, 0, m.second->contents() + (initrd_start - m.first), initrd_size);
+         break;
+      }
+    }
+  }
+
+  sim_t s(isa, priv, varch, nprocs, halted, real_time_clint,
+      initrd_start, initrd_end, bootargs, start_pc, mems, plugin_devices, htif_args,
+      std::move(hartids), dm_config, log_path, dtb_enabled, dtb_file);
   std::unique_ptr<remote_bitbang_t> remote_bitbang((remote_bitbang_t *) NULL);
-  std::unique_ptr<jtag_dtm_t> jtag_dtm(new jtag_dtm_t(&s.debug_module));
+  std::unique_ptr<jtag_dtm_t> jtag_dtm(
+      new jtag_dtm_t(&s.debug_module, dmi_rti));
   if (use_rbb) {
     remote_bitbang.reset(new remote_bitbang_t(rbb_port, &(*jtag_dtm)));
     s.set_remote_bitbang(&(*remote_bitbang));
   }
-  s.set_dtb_enabled(dtb_enabled);
 
   if (dump_dts) {
     printf("%s", s.get_dts());
@@ -181,7 +403,16 @@ int main(int argc, char** argv)
   }
 
   s.set_debug(debug);
-  s.set_log(log);
+  s.configure_log(log, log_commits);
   s.set_histogram(histogram);
-  return s.run();
+
+  auto return_code = s.run();
+
+  for (auto& mem : mems)
+    delete mem.second;
+
+  for (auto& plugin_device : plugin_devices)
+    delete plugin_device.second;
+
+  return return_code;
 }
diff --git a/spike_main/spike_main.mk.in b/spike_main/spike_main.mk.in
index 500446fa6c..35bef398c4 100644
--- a/spike_main/spike_main.mk.in
+++ b/spike_main/spike_main.mk.in
@@ -1,10 +1,13 @@
 spike_main_subproject_deps = \
+	fdt \
+	fesvr \
 	softfloat \
+	disasm \
 	riscv \
 
 spike_main_install_prog_srcs = \
 	spike.cc \
-	spike-dasm.cc \
+	spike-log-parser.cc \
 	xspike.cc \
 	termios-xspike.cc \