diff --git a/go.mod b/go.mod index 5cacb2e521..0fb1cb2b3b 100644 --- a/go.mod +++ b/go.mod @@ -42,7 +42,7 @@ require ( github.com/spf13/cobra v1.8.1 github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.9.0 - github.com/tetratelabs/wazero v1.7.2 + github.com/tetratelabs/wazero v1.8.0 github.com/things-go/go-socks5 v0.0.5 github.com/ulikunitz/xz v0.5.12 github.com/xlab/treeprint v1.2.0 diff --git a/go.sum b/go.sum index 4db757a329..1bb9176ade 100644 --- a/go.sum +++ b/go.sum @@ -387,8 +387,8 @@ github.com/tc-hib/winres v0.2.1 h1:YDE0FiP0VmtRaDn7+aaChp1KiF4owBiJa5l964l5ujA= github.com/tc-hib/winres v0.2.1/go.mod h1:C/JaNhH3KBvhNKVbvdlDWkbMDO9H4fKKDaN7/07SSuk= github.com/tcnksm/go-httpstat v0.2.0 h1:rP7T5e5U2HfmOBmZzGgGZjBQ5/GluWUylujl0tJ04I0= github.com/tcnksm/go-httpstat v0.2.0/go.mod h1:s3JVJFtQxtBEBC9dwcdTTXS9xFnM3SXAZwPG41aurT8= -github.com/tetratelabs/wazero v1.7.2 h1:1+z5nXJNwMLPAWaTePFi49SSTL0IMx/i3Fg8Yc25GDc= -github.com/tetratelabs/wazero v1.7.2/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y= +github.com/tetratelabs/wazero v1.8.0 h1:iEKu0d4c2Pd+QSRieYbnQC9yiFlMS9D+Jr0LsRmcF4g= +github.com/tetratelabs/wazero v1.8.0/go.mod h1:yAI0XTsMBhREkM/YDAK/zNou3GoiAce1P6+rp/wQhjs= github.com/thedevsaddam/gojsonq/v2 v2.5.2 h1:CoMVaYyKFsVj6TjU6APqAhAvC07hTI6IQen8PHzHYY0= github.com/thedevsaddam/gojsonq/v2 v2.5.2/go.mod h1:bv6Xa7kWy82uT0LnXPE2SzGqTj33TAEeR560MdJkiXs= github.com/things-go/go-socks5 v0.0.5 h1:qvKaGcBkfDrUL33SchHN93srAmYGzb4CxSM2DPYufe8= diff --git a/vendor/github.com/tetratelabs/wazero/Makefile b/vendor/github.com/tetratelabs/wazero/Makefile index e5ae8a2619..17e264e01c 100644 --- a/vendor/github.com/tetratelabs/wazero/Makefile +++ b/vendor/github.com/tetratelabs/wazero/Makefile @@ -1,7 +1,7 @@ -gofumpt := mvdan.cc/gofumpt@v0.5.0 +gofumpt := mvdan.cc/gofumpt@v0.6.0 gosimports := github.com/rinchsan/gosimports/cmd/gosimports@v0.3.8 -golangci_lint := github.com/golangci/golangci-lint/cmd/golangci-lint@v1.55.2 +golangci_lint := github.com/golangci/golangci-lint/cmd/golangci-lint@v1.60.0 asmfmt := github.com/klauspost/asmfmt/cmd/asmfmt@v1.3.2 # sync this with netlify.toml! hugo := github.com/gohugoio/hugo@v0.115.2 @@ -20,22 +20,6 @@ main_packages := $(sort $(foreach f,$(dir $(main_sources)),$(if $(findstring ./, go_test_options ?= -timeout 300s -ensureCompilerFastest := -ldflags '-X github.com/tetratelabs/wazero/internal/integration_test/vs.ensureCompilerFastest=true' -.PHONY: bench -bench: - @go build ./internal/integration_test/bench/... - @# Don't use -test.benchmem as it isn't accurate when comparing against CGO libs - @for d in vs/time vs/wasmedge vs/wasmtime ; do \ - cd ./internal/integration_test/$$d ; \ - go test -bench=. . -tags='wasmedge' $(ensureCompilerFastest) ; \ - cd - ;\ - done - -bench_testdata_dir := internal/integration_test/bench/testdata -.PHONY: build.bench -build.bench: - @tinygo build -o $(bench_testdata_dir)/case.wasm -scheduler=none --no-debug -target=wasi $(bench_testdata_dir)/case.go - .PHONY: test.examples test.examples: @go test $(go_test_options) ./examples/... ./imports/assemblyscript/example/... ./imports/emscripten/... ./imports/wasi_snapshot_preview1/example/... @@ -183,7 +167,7 @@ build.spectest.threads: .PHONY: test test: - @go test $(go_test_options) $$(go list ./... | grep -vE '$(spectest_v1_dir)|$(spectest_v2_dir)') + @go test $(go_test_options) ./... @cd internal/version/testdata && go test $(go_test_options) ./... @cd internal/integration_test/fuzz/wazerolib && CGO_ENABLED=0 WASM_BINARY_PATH=testdata/test.wasm go test ./... 
@@ -194,17 +178,6 @@ coverage: ## Generate test coverage
	@go test -coverprofile=coverage.txt -covermode=atomic --coverpkg=$(coverpkg) $(main_packages)
	@go tool cover -func coverage.txt

-.PHONY: spectest
-spectest:
-	@$(MAKE) spectest.v1
-	@$(MAKE) spectest.v2
-
-spectest.v1:
-	@go test $(go_test_options) $$(go list ./... | grep $(spectest_v1_dir))
-
-spectest.v2:
-	@go test $(go_test_options) $$(go list ./... | grep $(spectest_v2_dir))
-
 golangci_lint_path := $(shell go env GOPATH)/bin/golangci-lint

 $(golangci_lint_path):
@@ -214,7 +187,7 @@ golangci_lint_goarch ?= $(shell go env GOARCH)

 .PHONY: lint
 lint: $(golangci_lint_path)
-	@GOARCH=$(golangci_lint_goarch) CGO_ENABLED=0 $(golangci_lint_path) run --timeout 5m
+	@GOARCH=$(golangci_lint_goarch) CGO_ENABLED=0 $(golangci_lint_path) run --timeout 5m -E testableexamples

 .PHONY: format
 format:
diff --git a/vendor/github.com/tetratelabs/wazero/README.md b/vendor/github.com/tetratelabs/wazero/README.md
index 657da29594..f020be99a7 100644
--- a/vendor/github.com/tetratelabs/wazero/README.md
+++ b/vendor/github.com/tetratelabs/wazero/README.md
@@ -1,6 +1,6 @@
 # wazero: the zero dependency WebAssembly runtime for Go developers

-[![WebAssembly Core Specification Test](https://github.com/tetratelabs/wazero/actions/workflows/spectest.yaml/badge.svg)](https://github.com/tetratelabs/wazero/actions/workflows/spectest.yaml) [![Go Reference](https://pkg.go.dev/badge/github.com/tetratelabs/wazero.svg)](https://pkg.go.dev/github.com/tetratelabs/wazero) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![Go Reference](https://pkg.go.dev/badge/github.com/tetratelabs/wazero.svg)](https://pkg.go.dev/github.com/tetratelabs/wazero) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

 WebAssembly is a way to safely run code compiled in other languages. Runtimes
 execute WebAssembly Modules (Wasm), which are most often binaries with a `.wasm`
diff --git a/vendor/github.com/tetratelabs/wazero/api/wasm.go b/vendor/github.com/tetratelabs/wazero/api/wasm.go
index c66b582fae..d99c1a7569 100644
--- a/vendor/github.com/tetratelabs/wazero/api/wasm.go
+++ b/vendor/github.com/tetratelabs/wazero/api/wasm.go
@@ -151,9 +151,13 @@ type Module interface {
 	// ExportedFunction returns a function exported from this module or nil if it wasn't.
 	//
-	// Note: The default wazero.ModuleConfig attempts to invoke `_start`, which
-	// in rare cases can close the module. When in doubt, check IsClosed prior
-	// to invoking a function export after instantiation.
+	// # Notes
+	//   - The default wazero.ModuleConfig attempts to invoke `_start`, which
+	//     in rare cases can close the module. When in doubt, check IsClosed prior
+	//     to invoking a function export after instantiation.
+	//   - The semantics of host functions assume the existence of an "importing module" because, for example, a host function needs access to
+	//     the memory of the importing module. Therefore, direct use of ExportedFunction is forbidden for host modules.
+	//     Practically speaking, it is usually meaningless to call a host function directly from Go code, as it is already Go code.
	ExportedFunction(name string) Function

	// ExportedFunctionDefinitions returns all the exported function
diff --git a/vendor/github.com/tetratelabs/wazero/builder.go b/vendor/github.com/tetratelabs/wazero/builder.go
index f64afabdf0..b60a9e0978 100644
--- a/vendor/github.com/tetratelabs/wazero/builder.go
+++ b/vendor/github.com/tetratelabs/wazero/builder.go
@@ -179,6 +179,9 @@ type HostFunctionBuilder interface {
 //     are deferred until Compile.
 //   - Functions are indexed in order of calls to NewFunctionBuilder as
 //     insertion ordering is needed by ABI such as Emscripten (invoke_*).
+//   - The semantics of host functions assume the existence of an "importing module" because, for example, a host function needs access to
+//     the memory of the importing module. Therefore, direct use of ExportedFunction is forbidden for host modules.
+//     Practically speaking, it is usually meaningless to call a host function directly from Go code, as it is already Go code.
 type HostModuleBuilder interface {
 	// Note: until golang/go#5860, we can't use example tests to embed code in interface godocs.
@@ -341,12 +344,24 @@ func (b *hostModuleBuilder) Compile(ctx context.Context) (CompiledModule, error)
 	return c, nil
 }

+// hostModuleInstance is a wrapper around api.Module that prevents calling ExportedFunction.
+type hostModuleInstance struct{ api.Module }
+
+// ExportedFunction implements api.Module ExportedFunction.
+func (h hostModuleInstance) ExportedFunction(name string) api.Function {
+	panic("calling ExportedFunction is forbidden on host modules. See the note on ExportedFunction interface")
+}
+
 // Instantiate implements HostModuleBuilder.Instantiate
 func (b *hostModuleBuilder) Instantiate(ctx context.Context) (api.Module, error) {
 	if compiled, err := b.Compile(ctx); err != nil {
 		return nil, err
 	} else {
 		compiled.(*compiledModule).closeWithModule = true
-		return b.r.InstantiateModule(ctx, compiled, NewModuleConfig())
+		m, err := b.r.InstantiateModule(ctx, compiled, NewModuleConfig())
+		if err != nil {
+			return nil, err
+		}
+		return hostModuleInstance{m}, nil
 	}
 }
diff --git a/vendor/github.com/tetratelabs/wazero/cache.go b/vendor/github.com/tetratelabs/wazero/cache.go
index 2d1b4e3b9c..83cdb94ef3 100644
--- a/vendor/github.com/tetratelabs/wazero/cache.go
+++ b/vendor/github.com/tetratelabs/wazero/cache.go
@@ -24,6 +24,13 @@ import (
 //     All implementations are in wazero.
 //   - Instances of this can be reused across multiple runtimes, if configured
 //     via RuntimeConfig.
+//   - The cache check happens before compilation, so if multiple Goroutines
+//     try to compile the same module simultaneously, it is possible that they
+//     all compile the module. By design, the lock is not held for the "Compile" action itself,
+//     but only for checking and saving the compiled result. Therefore, we strongly recommend that the embedder
+//     centralize compilation in a single Goroutine (or one Goroutine per Wasm binary) to populate the cache,
+//     rather than compiling the same module in parallel. In other words, produce a CompiledModule once and
+//     share it across multiple Goroutines to avoid compiling the same module simultaneously.
 type CompilationCache interface{ api.Closer }

 // NewCompilationCache returns a new CompilationCache to be passed to RuntimeConfig.
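Taken together, the cache.go note above and the anonymous-module support added to ModuleConfig.WithName in the config.go hunks that follow amount to a compile-once, instantiate-many pattern. The sketch below is illustrative rather than part of this diff; it assumes `module.wasm` is a placeholder path to a Wasm binary with no unresolved imports:

	package main

	import (
		"context"
		"log"
		"os"
		"sync"

		"github.com/tetratelabs/wazero"
	)

	func main() {
		ctx := context.Background()

		// "module.wasm" is a placeholder for any Wasm binary.
		wasmBytes, err := os.ReadFile("module.wasm")
		if err != nil {
			log.Fatal(err)
		}

		// One CompilationCache instance may back multiple runtimes.
		cache := wazero.NewCompilationCache()
		defer cache.Close(ctx)

		r := wazero.NewRuntimeWithConfig(ctx,
			wazero.NewRuntimeConfig().WithCompilationCache(cache))
		defer r.Close(ctx)

		// Compile exactly once, in a single Goroutine, then share the result.
		compiled, err := r.CompileModule(ctx, wasmBytes)
		if err != nil {
			log.Fatal(err)
		}

		// Each Goroutine instantiates its own anonymous instance of the
		// already-compiled module; WithName("") avoids duplicate-name errors.
		var wg sync.WaitGroup
		for i := 0; i < 4; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				mod, err := r.InstantiateModule(ctx, compiled,
					wazero.NewModuleConfig().WithName(""))
				if err != nil {
					log.Println(err)
					return
				}
				defer mod.Close(ctx)
				// ... call mod.ExportedFunction(...) as needed ...
			}()
		}
		wg.Wait()
	}

Because the CompiledModule is produced once and only shared afterwards, this avoids the duplicate-compilation window described in the CompilationCache note; it mirrors the `concurrent-instantiation` example referenced in the config.go comments below.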
diff --git a/vendor/github.com/tetratelabs/wazero/config.go b/vendor/github.com/tetratelabs/wazero/config.go
index 819a76df5e..ea7b84f443 100644
--- a/vendor/github.com/tetratelabs/wazero/config.go
+++ b/vendor/github.com/tetratelabs/wazero/config.go
@@ -148,7 +148,7 @@ type RuntimeConfig interface {
 	//	customSections := c.CustomSections()
 	WithCustomSections(bool) RuntimeConfig

-	// WithCloseOnContextDone ensures the executions of functions to be closed under one of the following circumstances:
+	// WithCloseOnContextDone ensures that function executions are terminated under one of the following circumstances:
 	//
 	//   - context.Context passed to the Call method of api.Function is canceled during execution. (i.e. ctx by context.WithCancel)
 	//   - context.Context passed to the Call method of api.Function reaches timeout during execution. (i.e. ctx by context.WithTimeout or context.WithDeadline)
@@ -159,6 +159,8 @@ type RuntimeConfig interface {
 	// entire underlying OS thread which runs the api.Function call. See "Why it's safe to execute runtime-generated
 	// machine codes against async Goroutine preemption" section in RATIONALE.md for detail.
 	//
+	// When a function execution is terminated this way, the api.Module is closed.
+	//
 	// Note that this comes with a bit of extra cost when enabled. The reason is that internally this forces
 	// interpreter and compiler runtimes to insert the periodical checks on the conditions above. For that reason,
 	// this is disabled by default.
@@ -217,9 +219,18 @@ const (
 // part. wazero automatically performs ahead-of-time compilation as needed when
 // Runtime.CompileModule is invoked.
 //
-// Warning: This panics at runtime if the runtime.GOOS or runtime.GOARCH does not
-// support compiler. Use NewRuntimeConfig to safely detect and fallback to
-// NewRuntimeConfigInterpreter if needed.
+// # Warning
+//
+//   - This panics at runtime if the runtime.GOOS or runtime.GOARCH does not
+//     support the compiler. Use NewRuntimeConfig to safely detect and fall back to
+//     NewRuntimeConfigInterpreter if needed.
+//
+//   - If you are using wazero in buildmode=c-archive or c-shared, make sure that you set up the alternate signal stack
+//     by using, e.g., `sigaltstack` combined with the `SA_ONSTACK` flag on `sigaction` on Linux,
+//     before calling any api.Function. This is because the Go runtime does not set up the alternate signal stack
+//     for c-archive or c-shared modes, and wazero uses a different stack from the calling Goroutine.
+//     Hence, the signal handler might be invoked on wazero's stack, which may cause a stack overflow.
+//     https://github.com/tetratelabs/wazero/blob/2092c0a879f30d49d7b37f333f4547574b8afe0d/internal/integration_test/fuzz/fuzz/tests/sigstack.rs#L19-L36
 func NewRuntimeConfigCompiler() RuntimeConfig {
 	ret := engineLessConfig.clone()
 	ret.engineKind = engineKindCompiler
@@ -484,7 +495,20 @@ type ModuleConfig interface {
 	WithFSConfig(FSConfig) ModuleConfig

 	// WithName configures the module name. Defaults to what was decoded from
-	// the name section. Empty string ("") clears any name.
+	// the name section. Duplicate names are not allowed in a single Runtime.
+	//
+	// Calling this with the empty string "" makes the module anonymous.
+	// That is useful when you want to instantiate the same CompiledModule multiple times, as shown below:
+	//
+	//	for i := 0; i < N; i++ {
+	//		// Instantiate a new Wasm module from the already compiled `compiledWasm` anonymously without a name.
+ // instance, err := r.InstantiateModule(ctx, compiledWasm, wazero.NewModuleConfig().WithName("")) + // // .... + // } + // + // See the `concurrent-instantiation` example for a complete usage. + // + // Non-empty named modules are available for other modules to import by name. WithName(string) ModuleConfig // WithStartFunctions configures the functions to call after the module is diff --git a/vendor/github.com/tetratelabs/wazero/experimental/checkpoint.go b/vendor/github.com/tetratelabs/wazero/experimental/checkpoint.go index 443c5a294f..c75db615e6 100644 --- a/vendor/github.com/tetratelabs/wazero/experimental/checkpoint.go +++ b/vendor/github.com/tetratelabs/wazero/experimental/checkpoint.go @@ -21,13 +21,6 @@ type Snapshotter interface { Snapshot() Snapshot } -// EnableSnapshotterKey is a context key to indicate that snapshotting should be enabled. -// The context.Context passed to a exported function invocation should have this key set -// to a non-nil value, and host functions will be able to retrieve it using SnapshotterKey. -// -// Deprecated: use WithSnapshotter to enable snapshots. -type EnableSnapshotterKey = expctxkeys.EnableSnapshotterKey - // WithSnapshotter enables snapshots. // Passing the returned context to a exported function invocation enables snapshots, // and allows host functions to retrieve the Snapshotter using GetSnapshotter. @@ -35,12 +28,6 @@ func WithSnapshotter(ctx context.Context) context.Context { return context.WithValue(ctx, expctxkeys.EnableSnapshotterKey{}, struct{}{}) } -// SnapshotterKey is a context key to access a Snapshotter from a host function. -// It is only present if EnableSnapshotter was set in the function invocation context. -// -// Deprecated: use GetSnapshotter to get the snapshotter. -type SnapshotterKey = expctxkeys.SnapshotterKey - // GetSnapshotter gets the Snapshotter from a host function. // It is only present if WithSnapshotter was called with the function invocation context. func GetSnapshotter(ctx context.Context) Snapshotter { diff --git a/vendor/github.com/tetratelabs/wazero/experimental/importresolver.go b/vendor/github.com/tetratelabs/wazero/experimental/importresolver.go new file mode 100644 index 0000000000..36c0e22b15 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/experimental/importresolver.go @@ -0,0 +1,19 @@ +package experimental + +import ( + "context" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/internal/expctxkeys" +) + +// ImportResolver is an experimental func type that, if set, +// will be used as the first step in resolving imports. +// See issue 2294. +// If the import name is not found, it should return nil. +type ImportResolver func(name string) api.Module + +// WithImportResolver returns a new context with the given ImportResolver. +func WithImportResolver(ctx context.Context, resolver ImportResolver) context.Context { + return context.WithValue(ctx, expctxkeys.ImportResolverKey{}, resolver) +} diff --git a/vendor/github.com/tetratelabs/wazero/experimental/listener.go b/vendor/github.com/tetratelabs/wazero/experimental/listener.go index b2ba1fe834..55fc6b668e 100644 --- a/vendor/github.com/tetratelabs/wazero/experimental/listener.go +++ b/vendor/github.com/tetratelabs/wazero/experimental/listener.go @@ -24,12 +24,6 @@ type StackIterator interface { ProgramCounter() ProgramCounter } -// FunctionListenerFactoryKey is a context.Context Value key. -// Its associated value should be a FunctionListenerFactory. 
-// -// Deprecated: use WithFunctionListenerFactory to enable snapshots. -type FunctionListenerFactoryKey = expctxkeys.FunctionListenerFactoryKey - // WithFunctionListenerFactory registers a FunctionListenerFactory // with the context. func WithFunctionListenerFactory(ctx context.Context, factory FunctionListenerFactory) context.Context { diff --git a/vendor/github.com/tetratelabs/wazero/experimental/sys/syscall_errno_windows.go b/vendor/github.com/tetratelabs/wazero/experimental/sys/syscall_errno_windows.go index 761a1f9dc2..5ebc1780f4 100644 --- a/vendor/github.com/tetratelabs/wazero/experimental/sys/syscall_errno_windows.go +++ b/vendor/github.com/tetratelabs/wazero/experimental/sys/syscall_errno_windows.go @@ -23,6 +23,10 @@ const ( // instead of syscall.ENOTDIR _ERROR_DIRECTORY = syscall.Errno(0x10B) + // _ERROR_NOT_A_REPARSE_POINT is a Windows error returned by os.Readlink + // instead of syscall.EINVAL + _ERROR_NOT_A_REPARSE_POINT = syscall.Errno(0x1126) + // _ERROR_INVALID_SOCKET is a Windows error returned by winsock_select // when a given handle is not a socket. _ERROR_INVALID_SOCKET = syscall.Errno(0x2736) @@ -51,7 +55,7 @@ func errorToErrno(err error) Errno { return EBADF case syscall.ERROR_PRIVILEGE_NOT_HELD: return EPERM - case _ERROR_NEGATIVE_SEEK, _ERROR_INVALID_NAME: + case _ERROR_NEGATIVE_SEEK, _ERROR_INVALID_NAME, _ERROR_NOT_A_REPARSE_POINT: return EINVAL } errno, _ := syscallToErrno(err) diff --git a/vendor/github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1/fs.go b/vendor/github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1/fs.go index 384036a275..150f75cc16 100644 --- a/vendor/github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1/fs.go +++ b/vendor/github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1/fs.go @@ -1596,6 +1596,10 @@ func pathOpenFn(_ context.Context, mod api.Module, params []uint64) experimental return errno } + if pathLen == 0 { + return experimentalsys.EINVAL + } + fileOpenFlags := openFlags(dirflags, oflags, fdflags, rights) isDir := fileOpenFlags&experimentalsys.O_DIRECTORY != 0 @@ -1704,7 +1708,6 @@ func openFlags(dirflags, oflags, fdflags uint16, rights uint32) (openFlags exper } if oflags&wasip1.O_DIRECTORY != 0 { openFlags |= experimentalsys.O_DIRECTORY - return // Early return for directories as the rest of flags doesn't make sense for it. } else if oflags&wasip1.O_EXCL != 0 { openFlags |= experimentalsys.O_EXCL } @@ -1951,25 +1954,19 @@ func pathSymlinkFn(_ context.Context, mod api.Module, params []uint64) experimen return experimentalsys.EFAULT } - newPathBuf, ok := mem.Read(newPath, newPathLen) - if !ok { - return experimentalsys.EFAULT + _, newPathName, errno := atPath(fsc, mod.Memory(), fd, newPath, newPathLen) + if errno != 0 { + return errno } return dir.FS.Symlink( // Do not join old path since it's only resolved when dereference the link created here. // And the dereference result depends on the opening directory's file descriptor at that point. - bufToStr(oldPathBuf), - path.Join(dir.Name, bufToStr(newPathBuf)), + unsafe.String(&oldPathBuf[0], int(oldPathLen)), + newPathName, ) } -// bufToStr converts the given byte slice as string unsafely. -func bufToStr(buf []byte) string { - // TODO: use unsafe.String after flooring Go 1.20. - return *(*string)(unsafe.Pointer(&buf)) -} - // pathUnlinkFile is the WASI function named PathUnlinkFileName which unlinks a // file. 
// diff --git a/vendor/github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1/poll.go b/vendor/github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1/poll.go index d09f30245b..4f96af2df3 100644 --- a/vendor/github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1/poll.go +++ b/vendor/github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1/poll.go @@ -68,9 +68,7 @@ func pollOneoffFn(_ context.Context, mod api.Module, params []uint64) sys.Errno } outBuf, ok := mem.Read(out, nsubscriptions*32) // zero-out all buffer before writing - for i := range outBuf { - outBuf[i] = 0 - } + clear(outBuf) if !ok { return sys.EFAULT diff --git a/vendor/github.com/tetratelabs/wazero/internal/descriptor/table.go b/vendor/github.com/tetratelabs/wazero/internal/descriptor/table.go index 542958bc7e..03761e6ec4 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/descriptor/table.go +++ b/vendor/github.com/tetratelabs/wazero/internal/descriptor/table.go @@ -154,11 +154,6 @@ func (t *Table[Key, Item]) Range(f func(Key, Item) bool) { // Reset clears the content of the table. func (t *Table[Key, Item]) Reset() { - for i := range t.masks { - t.masks[i] = 0 - } - var zero Item - for i := range t.items { - t.items[i] = zero - } + clear(t.masks) + clear(t.items) } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/compiler.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/compiler.go index 56dfac6206..4e20e4b2cb 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/compiler.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/compiler.go @@ -26,11 +26,14 @@ const ( type ( controlFrame struct { frameID uint32 - // originalStackLen holds the number of values on the stack + // originalStackLenWithoutParam holds the number of values on the stack // when Start executing this control frame minus params for the block. originalStackLenWithoutParam int - blockType *wasm.FunctionType - kind controlFrameKind + // originalStackLenWithoutParamUint64 is almost the same as originalStackLenWithoutParam + // except that it holds the number of values on the stack in uint64. + originalStackLenWithoutParamUint64 int + blockType *wasm.FunctionType + kind controlFrameKind } controlFrames struct{ frames []controlFrame } ) @@ -157,9 +160,11 @@ type compiler struct { enabledFeatures api.CoreFeatures callFrameStackSizeInUint64 int stack []unsignedType - currentFrameID uint32 - controlFrames controlFrames - unreachableState struct { + // stackLenInUint64 is the length of the stack in uint64. + stackLenInUint64 int + currentFrameID uint32 + controlFrames controlFrames + unreachableState struct { on bool depth int } @@ -341,6 +346,7 @@ func (c *compiler) Next() (*compilationResult, error) { c.pc = 0 c.currentOpPC = 0 c.currentFrameID = 0 + c.stackLenInUint64 = 0 c.unreachableState.on, c.unreachableState.depth = false, 0 if err := c.compile(sig, code.Body, code.LocalTypes, code.BodyOffsetInCodeSection); err != nil { @@ -449,10 +455,11 @@ operatorSwitch: // Create a new frame -- entering this block. 
frame := controlFrame{ - frameID: c.nextFrameID(), - originalStackLenWithoutParam: len(c.stack) - len(bt.Params), - kind: controlFrameKindBlockWithoutContinuationLabel, - blockType: bt, + frameID: c.nextFrameID(), + originalStackLenWithoutParam: len(c.stack) - len(bt.Params), + originalStackLenWithoutParamUint64: c.stackLenInUint64 - bt.ParamNumInUint64, + kind: controlFrameKindBlockWithoutContinuationLabel, + blockType: bt, } c.controlFrames.push(frame) @@ -473,10 +480,11 @@ operatorSwitch: // Create a new frame -- entering loop. frame := controlFrame{ - frameID: c.nextFrameID(), - originalStackLenWithoutParam: len(c.stack) - len(bt.Params), - kind: controlFrameKindLoop, - blockType: bt, + frameID: c.nextFrameID(), + originalStackLenWithoutParam: len(c.stack) - len(bt.Params), + originalStackLenWithoutParamUint64: c.stackLenInUint64 - bt.ParamNumInUint64, + kind: controlFrameKindLoop, + blockType: bt, } c.controlFrames.push(frame) @@ -515,8 +523,9 @@ operatorSwitch: // Create a new frame -- entering if. frame := controlFrame{ - frameID: c.nextFrameID(), - originalStackLenWithoutParam: len(c.stack) - len(bt.Params), + frameID: c.nextFrameID(), + originalStackLenWithoutParam: len(c.stack) - len(bt.Params), + originalStackLenWithoutParamUint64: c.stackLenInUint64 - bt.ParamNumInUint64, // Note this will be set to controlFrameKindIfWithElse // when else opcode found later. kind: controlFrameKindIfWithoutElse, @@ -543,7 +552,7 @@ operatorSwitch: // If it is currently in unreachable, and the non-nested if, // reset the stack so we can correctly handle the else block. top := c.controlFrames.top() - c.stack = c.stack[:top.originalStackLenWithoutParam] + c.stackSwitchAt(top) top.kind = controlFrameKindIfWithElse // Re-push the parameters to the if block so that else block can use them. @@ -572,7 +581,7 @@ operatorSwitch: // Reset the stack manipulated by the then block, and re-push the block param types to the stack. - c.stack = c.stack[:frame.originalStackLenWithoutParam] + c.stackSwitchAt(frame) for _, t := range frame.blockType.Params { c.stackPush(wasmValueTypeTounsignedType(t)) } @@ -601,7 +610,7 @@ operatorSwitch: return nil } - c.stack = c.stack[:frame.originalStackLenWithoutParam] + c.stackSwitchAt(frame) for _, t := range frame.blockType.Results { c.stackPush(wasmValueTypeTounsignedType(t)) } @@ -628,7 +637,7 @@ operatorSwitch: // We need to reset the stack so that // the values pushed inside the block. dropOp := newOperationDrop(c.getFrameDropRange(frame, true)) - c.stack = c.stack[:frame.originalStackLenWithoutParam] + c.stackSwitchAt(frame) // Push the result types onto the stack. for _, t := range frame.blockType.Results { @@ -3505,6 +3514,11 @@ func (c *compiler) stackPeek() (ret unsignedType) { return } +func (c *compiler) stackSwitchAt(frame *controlFrame) { + c.stack = c.stack[:frame.originalStackLenWithoutParam] + c.stackLenInUint64 = frame.originalStackLenWithoutParamUint64 +} + func (c *compiler) stackPop() (ret unsignedType) { // No need to check stack bound // as we can assume that all the operations @@ -3512,11 +3526,13 @@ func (c *compiler) stackPop() (ret unsignedType) { // at module validation phase. ret = c.stack[len(c.stack)-1] c.stack = c.stack[:len(c.stack)-1] + c.stackLenInUint64 -= 1 + int(unsignedTypeV128&ret>>2) return } func (c *compiler) stackPush(ts unsignedType) { c.stack = append(c.stack, ts) + c.stackLenInUint64 += 1 + int(unsignedTypeV128&ts>>2) } // emit adds the operations into the result. 
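In the stackPush/stackPop hunks above, stackLenInUint64 is maintained incrementally: each value contributes `1 + int(unsignedTypeV128&ts>>2)` slots, i.e. one uint64 for scalar types and two for v128, replacing the O(n) rescan done by the old stackLenInUint64 helper that the following hunks delete. Here is a self-contained model of that bookkeeping; the concrete enum values are an assumption chosen so the bit trick behaves as in the diff, not values copied from the wazero source:

	package main

	import "fmt"

	// unsignedType models the interpreter's value-type tags. The assumption:
	// the v128 variant is the only value with bit 2 set, so the slot count
	// below needs no branch.
	type unsignedType byte

	const (
		unsignedTypeI32 unsignedType = iota // 0
		unsignedTypeI64                     // 1
		unsignedTypeF32                     // 2
		unsignedTypeF64                     // 3
		unsignedTypeV128                    // 4
	)

	// slots returns 1 for scalar types and 2 for v128:
	// (unsignedTypeV128 & t) >> 2 is 1 exactly when t == unsignedTypeV128.
	func slots(t unsignedType) int { return 1 + int(unsignedTypeV128&t>>2) }

	func main() {
		stackLenInUint64 := 0 // what the compiler now tracks on every push/pop
		for _, t := range []unsignedType{unsignedTypeI32, unsignedTypeV128, unsignedTypeF64} {
			stackLenInUint64 += slots(t)
		}
		fmt.Println(stackLenInUint64) // 4 = 1 + 2 + 1
	}

Since `&` and `>>` share precedence in Go and associate left-to-right, `unsignedTypeV128&t>>2` parses as `(unsignedTypeV128 & t) >> 2`, which is nonzero only for the v128 variant.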
@@ -3565,7 +3581,7 @@ func (c *compiler) emitDefaultValue(t wasm.ValueType) { // of the n-th local. func (c *compiler) localDepth(index wasm.Index) int { height := c.localIndexToStackHeightInUint64[index] - return c.stackLenInUint64(len(c.stack)) - 1 - int(height) + return c.stackLenInUint64 - 1 - height } func (c *compiler) localType(index wasm.Index) (t wasm.ValueType) { @@ -3592,14 +3608,7 @@ func (c *compiler) getFrameDropRange(frame *controlFrame, isEnd bool) inclusiveR } else { start = frame.blockType.ResultNumInUint64 } - var end int - if frame.kind == controlFrameKindFunction { - // On the function return, we eliminate all the contents on the stack - // including locals (existing below of frame.originalStackLen) - end = c.stackLenInUint64(len(c.stack)) - 1 - } else { - end = c.stackLenInUint64(len(c.stack)) - 1 - c.stackLenInUint64(frame.originalStackLenWithoutParam) - } + end := c.stackLenInUint64 - 1 - frame.originalStackLenWithoutParamUint64 if start <= end { return inclusiveRange{Start: int32(start), End: int32(end)} } else { @@ -3607,17 +3616,6 @@ func (c *compiler) getFrameDropRange(frame *controlFrame, isEnd bool) inclusiveR } } -func (c *compiler) stackLenInUint64(ceil int) (ret int) { - for i := 0; i < ceil; i++ { - if c.stack[i] == unsignedTypeV128 { - ret += 2 - } else { - ret++ - } - } - return -} - func (c *compiler) readMemoryArg(tag string) (memoryArg, error) { c.result.UsesMemory = true alignment, num, err := leb128.LoadUint32(c.body[c.pc+1:]) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go index a89ddc4573..ee0b453ca0 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go @@ -98,6 +98,9 @@ func (e *moduleEngine) SetGlobalValue(idx wasm.Index, lo, hi uint64) { // OwnsGlobals implements the same method as documented on wasm.ModuleEngine. func (e *moduleEngine) OwnsGlobals() bool { return false } +// MemoryGrown implements wasm.ModuleEngine. +func (e *moduleEngine) MemoryGrown() {} + // callEngine holds context per moduleEngine.Call, and shared across all the // function calls originating from the same moduleEngine.Call execution. // @@ -3898,14 +3901,9 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance case operationKindV128Dot: x2Hi, x2Lo := ce.popValue(), ce.popValue() x1Hi, x1Lo := ce.popValue(), ce.popValue() - ce.pushValue( - uint64(uint32(int32(int16(x1Lo>>0))*int32(int16(x2Lo>>0))+int32(int16(x1Lo>>16))*int32(int16(x2Lo>>16)))) | - (uint64(uint32(int32(int16(x1Lo>>32))*int32(int16(x2Lo>>32))+int32(int16(x1Lo>>48))*int32(int16(x2Lo>>48)))) << 32), - ) - ce.pushValue( - uint64(uint32(int32(int16(x1Hi>>0))*int32(int16(x2Hi>>0))+int32(int16(x1Hi>>16))*int32(int16(x2Hi>>16)))) | - (uint64(uint32(int32(int16(x1Hi>>32))*int32(int16(x2Hi>>32))+int32(int16(x1Hi>>48))*int32(int16(x2Hi>>48)))) << 32), - ) + lo, hi := v128Dot(x1Hi, x1Lo, x2Hi, x2Lo) + ce.pushValue(lo) + ce.pushValue(hi) frame.pc++ case operationKindV128ITruncSatFromF: hi, lo := ce.popValue(), ce.popValue() @@ -4581,3 +4579,18 @@ func (ce *callEngine) callGoFuncWithStack(ctx context.Context, m *wasm.ModuleIns ce.stack = ce.stack[0 : len(ce.stack)-shrinkLen] } } + +// v128Dot performs a dot product of two 64-bit vectors. 
+// Note: for some reason (which I suspect is due to a bug in Go compiler's regalloc), +// inlining this function causes a bug which happens **only when** we run with -race AND arm64 AND Go 1.22. +func v128Dot(x1Hi, x1Lo, x2Hi, x2Lo uint64) (uint64, uint64) { + r1 := int32(int16(x1Lo>>0)) * int32(int16(x2Lo>>0)) + r2 := int32(int16(x1Lo>>16)) * int32(int16(x2Lo>>16)) + r3 := int32(int16(x1Lo>>32)) * int32(int16(x2Lo>>32)) + r4 := int32(int16(x1Lo>>48)) * int32(int16(x2Lo>>48)) + r5 := int32(int16(x1Hi>>0)) * int32(int16(x2Hi>>0)) + r6 := int32(int16(x1Hi>>16)) * int32(int16(x2Hi>>16)) + r7 := int32(int16(x1Hi>>32)) * int32(int16(x2Hi>>32)) + r8 := int32(int16(x1Hi>>48)) * int32(int16(x2Hi>>48)) + return uint64(uint32(r1+r2)) | (uint64(uint32(r3+r4)) << 32), uint64(uint32(r5+r6)) | (uint64(uint32(r7+r8)) << 32) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go index 59bbfe02d2..62d3650152 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go @@ -69,7 +69,7 @@ type Compiler interface { AllocateVReg(typ ssa.Type) regalloc.VReg // ValueDefinition returns the definition of the given value. - ValueDefinition(ssa.Value) *SSAValueDefinition + ValueDefinition(ssa.Value) SSAValueDefinition // VRegOf returns the virtual register of the given ssa.Value. VRegOf(value ssa.Value) regalloc.VReg @@ -79,13 +79,13 @@ type Compiler interface { // MatchInstr returns true if the given definition is from an instruction with the given opcode, the current group ID, // and a refcount of 1. That means, the instruction can be merged/swapped within the current instruction group. - MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool + MatchInstr(def SSAValueDefinition, opcode ssa.Opcode) bool // MatchInstrOneOf is the same as MatchInstr but for multiple opcodes. If it matches one of ssa.Opcode, // this returns the opcode. Otherwise, this returns ssa.OpcodeInvalid. // // Note: caller should be careful to avoid excessive allocation on opcodes slice. - MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode + MatchInstrOneOf(def SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode // AddRelocationInfo appends the relocation information for the function reference at the current buffer offset. AddRelocationInfo(funcRef ssa.FuncRef) @@ -126,10 +126,7 @@ type compiler struct { nextVRegID regalloc.VRegID // ssaValueToVRegs maps ssa.ValueID to regalloc.VReg. ssaValueToVRegs [] /* VRegID to */ regalloc.VReg - // ssaValueDefinitions maps ssa.ValueID to its definition. - ssaValueDefinitions []SSAValueDefinition - // ssaValueRefCounts is a cached list obtained by ssa.Builder.ValueRefCounts(). - ssaValueRefCounts []int + ssaValuesInfo []ssa.ValueInfo // returnVRegs is the list of virtual registers that store the return values. returnVRegs []regalloc.VReg varEdges [][2]regalloc.VReg @@ -206,15 +203,10 @@ func (c *compiler) setCurrentGroupID(gid ssa.InstructionGroupID) { // assignVirtualRegisters assigns a virtual register to each ssa.ValueID Valid in the ssa.Builder. 
func (c *compiler) assignVirtualRegisters() { builder := c.ssaBuilder - refCounts := builder.ValueRefCounts() - c.ssaValueRefCounts = refCounts + c.ssaValuesInfo = builder.ValuesInfo() - need := len(refCounts) - if need >= len(c.ssaValueToVRegs) { - c.ssaValueToVRegs = append(c.ssaValueToVRegs, make([]regalloc.VReg, need+1)...) - } - if need >= len(c.ssaValueDefinitions) { - c.ssaValueDefinitions = append(c.ssaValueDefinitions, make([]SSAValueDefinition, need+1)...) + if diff := len(c.ssaValuesInfo) - len(c.ssaValueToVRegs); diff > 0 { + c.ssaValueToVRegs = append(c.ssaValueToVRegs, make([]regalloc.VReg, diff+1)...) } for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() { @@ -225,40 +217,26 @@ func (c *compiler) assignVirtualRegisters() { typ := p.Type() vreg := c.AllocateVReg(typ) c.ssaValueToVRegs[pid] = vreg - c.ssaValueDefinitions[pid] = SSAValueDefinition{BlockParamValue: p, BlkParamVReg: vreg} c.ssaTypeOfVRegID[vreg.ID()] = p.Type() } // Assigns each value to a virtual register produced by instructions. for cur := blk.Root(); cur != nil; cur = cur.Next() { r, rs := cur.Returns() - var N int if r.Valid() { id := r.ID() ssaTyp := r.Type() typ := r.Type() vReg := c.AllocateVReg(typ) c.ssaValueToVRegs[id] = vReg - c.ssaValueDefinitions[id] = SSAValueDefinition{ - Instr: cur, - N: 0, - RefCount: refCounts[id], - } c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp - N++ } for _, r := range rs { id := r.ID() ssaTyp := r.Type() vReg := c.AllocateVReg(ssaTyp) c.ssaValueToVRegs[id] = vReg - c.ssaValueDefinitions[id] = SSAValueDefinition{ - Instr: cur, - N: N, - RefCount: refCounts[id], - } c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp - N++ } } } @@ -299,8 +277,12 @@ func (c *compiler) Init() { } // ValueDefinition implements Compiler.ValueDefinition. -func (c *compiler) ValueDefinition(value ssa.Value) *SSAValueDefinition { - return &c.ssaValueDefinitions[value.ID()] +func (c *compiler) ValueDefinition(value ssa.Value) SSAValueDefinition { + return SSAValueDefinition{ + V: value, + Instr: c.ssaBuilder.InstructionOfValue(value), + RefCount: c.ssaValuesInfo[value.ID()].RefCount, + } } // VRegOf implements Compiler.VRegOf. @@ -319,7 +301,7 @@ func (c *compiler) TypeOf(v regalloc.VReg) ssa.Type { } // MatchInstr implements Compiler.MatchInstr. -func (c *compiler) MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool { +func (c *compiler) MatchInstr(def SSAValueDefinition, opcode ssa.Opcode) bool { instr := def.Instr return def.IsFromInstr() && instr.Opcode() == opcode && @@ -328,7 +310,7 @@ func (c *compiler) MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool { } // MatchInstrOneOf implements Compiler.MatchInstrOneOf. 
-func (c *compiler) MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode { +func (c *compiler) MatchInstrOneOf(def SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode { instr := def.Instr if !def.IsFromInstr() { return ssa.OpcodeInvalid diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go index 80e65668ad..735cfa3d35 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go @@ -9,7 +9,7 @@ import ( func (c *compiler) Lower() { c.assignVirtualRegisters() c.mach.SetCurrentABI(c.GetFunctionABI(c.ssaBuilder.Signature())) - c.mach.ExecutableContext().StartLoweringFunction(c.ssaBuilder.BlockIDMax()) + c.mach.StartLoweringFunction(c.ssaBuilder.BlockIDMax()) c.lowerBlocks() } @@ -20,12 +20,11 @@ func (c *compiler) lowerBlocks() { c.lowerBlock(blk) } - ectx := c.mach.ExecutableContext() // After lowering all blocks, we need to link adjacent blocks to layout one single instruction list. var prev ssa.BasicBlock for next := builder.BlockIteratorReversePostOrderBegin(); next != nil; next = builder.BlockIteratorReversePostOrderNext() { if prev != nil { - ectx.LinkAdjacentBlocks(prev, next) + c.mach.LinkAdjacentBlocks(prev, next) } prev = next } @@ -33,8 +32,7 @@ func (c *compiler) lowerBlocks() { func (c *compiler) lowerBlock(blk ssa.BasicBlock) { mach := c.mach - ectx := mach.ExecutableContext() - ectx.StartBlock(blk) + mach.StartBlock(blk) // We traverse the instructions in reverse order because we might want to lower multiple // instructions together. @@ -76,7 +74,7 @@ func (c *compiler) lowerBlock(blk ssa.BasicBlock) { default: mach.LowerInstr(cur) } - ectx.FlushPendingInstructions() + mach.FlushPendingInstructions() } // Finally, if this is the entry block, we have to insert copies of arguments from the real location to the VReg. @@ -84,7 +82,7 @@ func (c *compiler) lowerBlock(blk ssa.BasicBlock) { c.lowerFunctionArguments(blk) } - ectx.EndBlock() + mach.EndBlock() } // lowerBranches is called right after StartBlock and before any LowerInstr call if @@ -93,23 +91,24 @@ func (c *compiler) lowerBlock(blk ssa.BasicBlock) { // // See ssa.Instruction IsBranching, and the comment on ssa.BasicBlock. 
func (c *compiler) lowerBranches(br0, br1 *ssa.Instruction) { - ectx := c.mach.ExecutableContext() + mach := c.mach c.setCurrentGroupID(br0.GroupID()) c.mach.LowerSingleBranch(br0) - ectx.FlushPendingInstructions() + mach.FlushPendingInstructions() if br1 != nil { c.setCurrentGroupID(br1.GroupID()) c.mach.LowerConditionalBranch(br1) - ectx.FlushPendingInstructions() + mach.FlushPendingInstructions() } if br0.Opcode() == ssa.OpcodeJump { - _, args, target := br0.BranchData() + _, args, targetBlockID := br0.BranchData() argExists := len(args) != 0 if argExists && br1 != nil { panic("BUG: critical edge split failed") } + target := c.ssaBuilder.BasicBlock(targetBlockID) if argExists && target.ReturnBlock() { if len(args) > 0 { c.mach.LowerReturns(args) @@ -118,24 +117,25 @@ func (c *compiler) lowerBranches(br0, br1 *ssa.Instruction) { c.lowerBlockArguments(args, target) } } - ectx.FlushPendingInstructions() + mach.FlushPendingInstructions() } func (c *compiler) lowerFunctionArguments(entry ssa.BasicBlock) { - ectx := c.mach.ExecutableContext() + mach := c.mach c.tmpVals = c.tmpVals[:0] + data := c.ssaBuilder.ValuesInfo() for i := 0; i < entry.Params(); i++ { p := entry.Param(i) - if c.ssaValueRefCounts[p.ID()] > 0 { + if data[p.ID()].RefCount > 0 { c.tmpVals = append(c.tmpVals, p) } else { // If the argument is not used, we can just pass an invalid value. c.tmpVals = append(c.tmpVals, ssa.ValueInvalid) } } - c.mach.LowerParams(c.tmpVals) - ectx.FlushPendingInstructions() + mach.LowerParams(c.tmpVals) + mach.FlushPendingInstructions() } // lowerBlockArguments lowers how to pass arguments to the given successor block. @@ -152,12 +152,12 @@ func (c *compiler) lowerBlockArguments(args []ssa.Value, succ ssa.BasicBlock) { src := args[i] dstReg := c.VRegOf(dst) - srcDef := c.ssaValueDefinitions[src.ID()] - if srcDef.IsFromInstr() && srcDef.Instr.Constant() { + srcInstr := c.ssaBuilder.InstructionOfValue(src) + if srcInstr != nil && srcInstr.Constant() { c.constEdges = append(c.constEdges, struct { cInst *ssa.Instruction dst regalloc.VReg - }{cInst: srcDef.Instr, dst: dstReg}) + }{cInst: srcInstr, dst: dstReg}) } else { srcReg := c.VRegOf(src) // Even when the src=dst, insert the move so that we can keep such registers keep-alive. diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go deleted file mode 100644 index 81c6a6b62e..0000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go +++ /dev/null @@ -1,219 +0,0 @@ -package backend - -import ( - "fmt" - "math" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -type ExecutableContext interface { - // StartLoweringFunction is called when the lowering of the given function is started. - // maximumBlockID is the maximum value of ssa.BasicBlockID existing in the function. - StartLoweringFunction(maximumBlockID ssa.BasicBlockID) - - // LinkAdjacentBlocks is called after finished lowering all blocks in order to create one single instruction list. - LinkAdjacentBlocks(prev, next ssa.BasicBlock) - - // StartBlock is called when the compilation of the given block is started. - // The order of this being called is the reverse post order of the ssa.BasicBlock(s) as we iterate with - // ssa.Builder BlockIteratorReversePostOrderBegin and BlockIteratorReversePostOrderEnd. 
- StartBlock(ssa.BasicBlock) - - // EndBlock is called when the compilation of the current block is finished. - EndBlock() - - // FlushPendingInstructions flushes the pending instructions to the buffer. - // This will be called after the lowering of each SSA Instruction. - FlushPendingInstructions() -} - -type ExecutableContextT[Instr any] struct { - CurrentSSABlk ssa.BasicBlock - - // InstrPool is the InstructionPool of instructions. - InstructionPool wazevoapi.Pool[Instr] - asNop func(*Instr) - setNext func(*Instr, *Instr) - setPrev func(*Instr, *Instr) - - // RootInstr is the root instruction of the executable. - RootInstr *Instr - labelPositionPool wazevoapi.Pool[LabelPosition[Instr]] - NextLabel Label - // LabelPositions maps a label to the instructions of the region which the label represents. - LabelPositions map[Label]*LabelPosition[Instr] - OrderedBlockLabels []*LabelPosition[Instr] - - // PerBlockHead and PerBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock. - PerBlockHead, PerBlockEnd *Instr - // PendingInstructions are the instructions which are not yet emitted into the instruction list. - PendingInstructions []*Instr - - // SsaBlockIDToLabels maps an SSA block ID to the label. - SsaBlockIDToLabels []Label -} - -func NewExecutableContextT[Instr any]( - resetInstruction func(*Instr), - setNext func(*Instr, *Instr), - setPrev func(*Instr, *Instr), - asNop func(*Instr), -) *ExecutableContextT[Instr] { - return &ExecutableContextT[Instr]{ - InstructionPool: wazevoapi.NewPool[Instr](resetInstruction), - asNop: asNop, - setNext: setNext, - setPrev: setPrev, - labelPositionPool: wazevoapi.NewPool[LabelPosition[Instr]](resetLabelPosition[Instr]), - LabelPositions: make(map[Label]*LabelPosition[Instr]), - NextLabel: LabelInvalid, - } -} - -func resetLabelPosition[T any](l *LabelPosition[T]) { - *l = LabelPosition[T]{} -} - -// StartLoweringFunction implements ExecutableContext. -func (e *ExecutableContextT[Instr]) StartLoweringFunction(max ssa.BasicBlockID) { - imax := int(max) - if len(e.SsaBlockIDToLabels) <= imax { - // Eagerly allocate labels for the blocks since the underlying slice will be used for the next iteration. - e.SsaBlockIDToLabels = append(e.SsaBlockIDToLabels, make([]Label, imax+1)...) - } -} - -func (e *ExecutableContextT[Instr]) StartBlock(blk ssa.BasicBlock) { - e.CurrentSSABlk = blk - - l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()] - if l == LabelInvalid { - l = e.AllocateLabel() - e.SsaBlockIDToLabels[blk.ID()] = l - } - - end := e.allocateNop0() - e.PerBlockHead, e.PerBlockEnd = end, end - - labelPos, ok := e.LabelPositions[l] - if !ok { - labelPos = e.AllocateLabelPosition(l) - e.LabelPositions[l] = labelPos - } - e.OrderedBlockLabels = append(e.OrderedBlockLabels, labelPos) - labelPos.Begin, labelPos.End = end, end - labelPos.SB = blk -} - -// EndBlock implements ExecutableContext. -func (e *ExecutableContextT[T]) EndBlock() { - // Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions. 
- e.insertAtPerBlockHead(e.allocateNop0()) - - l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()] - e.LabelPositions[l].Begin = e.PerBlockHead - - if e.CurrentSSABlk.EntryBlock() { - e.RootInstr = e.PerBlockHead - } -} - -func (e *ExecutableContextT[T]) insertAtPerBlockHead(i *T) { - if e.PerBlockHead == nil { - e.PerBlockHead = i - e.PerBlockEnd = i - return - } - e.setNext(i, e.PerBlockHead) - e.setPrev(e.PerBlockHead, i) - e.PerBlockHead = i -} - -// FlushPendingInstructions implements ExecutableContext. -func (e *ExecutableContextT[T]) FlushPendingInstructions() { - l := len(e.PendingInstructions) - if l == 0 { - return - } - for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order. - e.insertAtPerBlockHead(e.PendingInstructions[i]) - } - e.PendingInstructions = e.PendingInstructions[:0] -} - -func (e *ExecutableContextT[T]) Reset() { - e.labelPositionPool.Reset() - e.InstructionPool.Reset() - for l := Label(0); l <= e.NextLabel; l++ { - delete(e.LabelPositions, l) - } - e.PendingInstructions = e.PendingInstructions[:0] - e.OrderedBlockLabels = e.OrderedBlockLabels[:0] - e.RootInstr = nil - e.SsaBlockIDToLabels = e.SsaBlockIDToLabels[:0] - e.PerBlockHead, e.PerBlockEnd = nil, nil - e.NextLabel = LabelInvalid -} - -// AllocateLabel allocates an unused label. -func (e *ExecutableContextT[T]) AllocateLabel() Label { - e.NextLabel++ - return e.NextLabel -} - -func (e *ExecutableContextT[T]) AllocateLabelPosition(la Label) *LabelPosition[T] { - l := e.labelPositionPool.Allocate() - l.L = la - return l -} - -func (e *ExecutableContextT[T]) GetOrAllocateSSABlockLabel(blk ssa.BasicBlock) Label { - if blk.ReturnBlock() { - return LabelReturn - } - l := e.SsaBlockIDToLabels[blk.ID()] - if l == LabelInvalid { - l = e.AllocateLabel() - e.SsaBlockIDToLabels[blk.ID()] = l - } - return l -} - -func (e *ExecutableContextT[T]) allocateNop0() *T { - i := e.InstructionPool.Allocate() - e.asNop(i) - return i -} - -// LinkAdjacentBlocks implements backend.Machine. -func (e *ExecutableContextT[T]) LinkAdjacentBlocks(prev, next ssa.BasicBlock) { - prevLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(prev)] - nextLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(next)] - e.setNext(prevLabelPos.End, nextLabelPos.Begin) -} - -// LabelPosition represents the regions of the generated code which the label represents. -type LabelPosition[Instr any] struct { - SB ssa.BasicBlock - L Label - Begin, End *Instr - BinaryOffset int64 -} - -// Label represents a position in the generated code which is either -// a real instruction or the constant InstructionPool (e.g. jump tables). -// -// This is exactly the same as the traditional "label" in assembly code. -type Label uint32 - -const ( - LabelInvalid Label = 0 - LabelReturn Label = math.MaxUint32 -) - -// String implements backend.Machine. -func (l Label) String() string { - return fmt.Sprintf("L%d", l) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go index 751050aff0..96f035e582 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go @@ -14,7 +14,6 @@ var calleeSavedVRegs = []regalloc.VReg{ // CompileGoFunctionTrampoline implements backend.Machine. 
func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte { - ectx := m.ectx argBegin := 1 // Skips exec context by default. if needModuleContextPtr { argBegin++ @@ -25,7 +24,7 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * m.currentABI = abi cur := m.allocateNop() - ectx.RootInstr = cur + m.rootInstr = cur // Execution context is always the first argument. execCtrPtr := raxVReg @@ -272,7 +271,7 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * cur = m.revertRBPRSP(cur) linkInstr(cur, m.allocateInstr().asRet()) - m.encodeWithoutSSA(ectx.RootInstr) + m.encodeWithoutSSA(m.rootInstr) return m.c.Buf() } @@ -347,10 +346,8 @@ var stackGrowSaveVRegs = []regalloc.VReg{ // CompileStackGrowCallSequence implements backend.Machine. func (m *machine) CompileStackGrowCallSequence() []byte { - ectx := m.ectx - cur := m.allocateNop() - ectx.RootInstr = cur + m.rootInstr = cur cur = m.setupRBPRSP(cur) @@ -379,7 +376,7 @@ func (m *machine) CompileStackGrowCallSequence() []byte { cur = m.revertRBPRSP(cur) linkInstr(cur, m.allocateInstr().asRet()) - m.encodeWithoutSSA(ectx.RootInstr) + m.encodeWithoutSSA(m.rootInstr) return m.c.Buf() } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go index d27e79c0e5..6a3e58f51f 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go @@ -17,16 +17,6 @@ type instruction struct { kind instructionKind } -// Next implements regalloc.Instr. -func (i *instruction) Next() regalloc.Instr { - return i.next -} - -// Prev implements regalloc.Instr. -func (i *instruction) Prev() regalloc.Instr { - return i.prev -} - // IsCall implements regalloc.Instr. func (i *instruction) IsCall() bool { return i.kind == call } @@ -36,9 +26,6 @@ func (i *instruction) IsIndirectCall() bool { return i.kind == callIndirect } // IsReturn implements regalloc.Instr. func (i *instruction) IsReturn() bool { return i.kind == ret } -// AddedBeforeRegAlloc implements regalloc.Instr. -func (i *instruction) AddedBeforeRegAlloc() bool { return i.addedBeforeRegAlloc } - // String implements regalloc.Instr. 
func (i *instruction) String() string { switch i.kind { @@ -651,26 +638,14 @@ func resetInstruction(i *instruction) { *i = instruction{} } -func setNext(i *instruction, next *instruction) { - i.next = next -} - -func setPrev(i *instruction, prev *instruction) { - i.prev = prev -} - -func asNop(i *instruction) { - i.kind = nop0 -} - -func (i *instruction) asNop0WithLabel(label backend.Label) *instruction { //nolint +func (i *instruction) asNop0WithLabel(label label) *instruction { //nolint i.kind = nop0 i.u1 = uint64(label) return i } -func (i *instruction) nop0Label() backend.Label { - return backend.Label(i.u1) +func (i *instruction) nop0Label() label { + return label(i.u1) } type instructionKind byte @@ -1161,7 +1136,7 @@ func (i *instruction) asJmp(target operand) *instruction { return i } -func (i *instruction) jmpLabel() backend.Label { +func (i *instruction) jmpLabel() label { switch i.kind { case jmp, jmpIf, lea, xmmUnaryRmR: return i.op1.label() diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go index bee673d25c..befe8c6436 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go @@ -130,9 +130,9 @@ func (m *machine) lowerAddendsToAmode(x, y addend, offBase uint32) *amode { } } -func (m *machine) lowerAddend(x *backend.SSAValueDefinition) addend { - if x.IsFromBlockParam() { - return addend{x.BlkParamVReg, 0, 0} +func (m *machine) lowerAddend(x backend.SSAValueDefinition) addend { + if !x.IsFromInstr() { + return addend{m.c.VRegOf(x.V), 0, 0} } // Ensure the addend is not referenced in multiple places; we will discard nested Iadds. op := m.c.MatchInstrOneOf(x, addendsMatchOpcodes[:]) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go index 310ad2203a..aeeb6b6454 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go @@ -16,18 +16,13 @@ import ( // NewBackend returns a new backend for arm64. func NewBackend() backend.Machine { - ectx := backend.NewExecutableContextT[instruction]( - resetInstruction, - setNext, - setPrev, - asNop, - ) - return &machine{ - ectx: ectx, + m := &machine{ cpuFeatures: platform.CpuFeatures, - regAlloc: regalloc.NewAllocator(regInfo), + regAlloc: regalloc.NewAllocator[*instruction, *labelPosition, *regAllocFn](regInfo), spillSlots: map[regalloc.VRegID]int64{}, amodePool: wazevoapi.NewPool[amode](nil), + labelPositionPool: wazevoapi.NewIDedPool[labelPosition](resetLabelPosition), + instrPool: wazevoapi.NewPool[instruction](resetInstruction), constSwizzleMaskConstIndex: -1, constSqmulRoundSatIndex: -1, constI8x16SHLMaskTableIndex: -1, @@ -41,23 +36,46 @@ func NewBackend() backend.Machine { constExtAddPairwiseI16x8uMask1Index: -1, constExtAddPairwiseI16x8uMask2Index: -1, } + m.regAllocFn.m = m + return m } type ( // machine implements backend.Machine for amd64. 
machine struct { c backend.Compiler - ectx *backend.ExecutableContextT[instruction] stackBoundsCheckDisabled bool + instrPool wazevoapi.Pool[instruction] amodePool wazevoapi.Pool[amode] cpuFeatures platform.CpuFeatureFlags - regAlloc regalloc.Allocator - regAllocFn *backend.RegAllocFunction[*instruction, *machine] + regAlloc regalloc.Allocator[*instruction, *labelPosition, *regAllocFn] + regAllocFn regAllocFn regAllocStarted bool + // labelPositionPool is the pool of labelPosition. The id is the label where + // if the label is less than the maxSSABlockID, it's the ssa.BasicBlockID. + labelPositionPool wazevoapi.IDedPool[labelPosition] + // nextLabel is the next label to be allocated. The first free label comes after maxSSABlockID + // so that we can have an identical label for the SSA block ID, which is useful for debugging. + nextLabel label + // rootInstr is the first instruction of the function. + rootInstr *instruction + // currentLabelPos is the currently-compiled ssa.BasicBlock's labelPosition. + currentLabelPos *labelPosition + // orderedSSABlockLabelPos is the ordered list of labelPosition in the generated code for each ssa.BasicBlock. + orderedSSABlockLabelPos []*labelPosition + // returnLabelPos is the labelPosition for the return block. + returnLabelPos labelPosition + // perBlockHead and perBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock. + perBlockHead, perBlockEnd *instruction + // pendingInstructions are the instructions which are not yet emitted into the instruction list. + pendingInstructions []*instruction + // maxSSABlockID is the maximum ssa.BasicBlockID in the current function. + maxSSABlockID label + spillSlotSize int64 spillSlots map[regalloc.VRegID]int64 currentABI *backend.FunctionABI @@ -67,8 +85,11 @@ type ( labelResolutionPends []labelResolutionPend + // jmpTableTargets holds the labels of the jump table targets. jmpTableTargets [][]uint32 - consts []_const + // jmpTableTargetNext is the index to the jmpTableTargets slice to be used for the next jump table. + jmpTableTargetsNext int + consts []_const constSwizzleMaskConstIndex, constSqmulRoundSatIndex, constI8x16SHLMaskTableIndex, constI8x16LogicalSHRMaskTableIndex, @@ -79,9 +100,10 @@ type ( } _const struct { - lo, hi uint64 - _var []byte - label *labelPosition + lo, hi uint64 + _var []byte + label label + labelPos *labelPosition } labelResolutionPend struct { @@ -90,22 +112,73 @@ type ( // imm32Offset is the offset of the last 4 bytes of the instruction. imm32Offset int64 } +) - labelPosition = backend.LabelPosition[instruction] +type ( + // label represents a position in the generated code which is either + // a real instruction or the constant InstructionPool (e.g. jump tables). + // + // This is exactly the same as the traditional "label" in assembly code. + label uint32 + + // labelPosition represents the regions of the generated code which the label represents. + // This implements regalloc.Block. + labelPosition struct { + // sb is not nil if this corresponds to a ssa.BasicBlock. + sb ssa.BasicBlock + // cur is used to walk through the instructions in the block during the register allocation. + cur, + // begin and end are the first and last instructions of the block. + begin, end *instruction + // binaryOffset is the offset in the binary where the label is located. + binaryOffset int64 + } ) -func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) backend.Label { +// String implements backend.Machine. 
+func (l label) String() string { + return fmt.Sprintf("L%d", l) +} + +func resetLabelPosition(l *labelPosition) { + *l = labelPosition{} +} + +const labelReturn = math.MaxUint32 + +func ssaBlockLabel(sb ssa.BasicBlock) label { + if sb.ReturnBlock() { + return labelReturn + } + return label(sb.ID()) +} + +// getOrAllocateSSABlockLabelPosition returns the labelPosition for the given basic block. +func (m *machine) getOrAllocateSSABlockLabelPosition(sb ssa.BasicBlock) *labelPosition { + if sb.ReturnBlock() { + m.returnLabelPos.sb = sb + return &m.returnLabelPos + } + + l := ssaBlockLabel(sb) + pos := m.labelPositionPool.GetOrAllocate(int(l)) + pos.sb = sb + return pos +} + +func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) label { index := *i if index == -1 { - label := m.allocateLabel() + l, pos := m.allocateLabel() index = len(m.consts) m.consts = append(m.consts, _const{ - _var: _var, - label: label, + _var: _var, + label: l, + labelPos: pos, }) *i = index } - return m.consts[index].label.L + return m.consts[index].label } // Reset implements backend.Machine. @@ -120,18 +193,20 @@ func (m *machine) Reset() { } m.stackBoundsCheckDisabled = false - m.ectx.Reset() - - m.regAllocFn.Reset() m.regAlloc.Reset() + m.labelPositionPool.Reset() + m.instrPool.Reset() m.regAllocStarted = false m.clobberedRegs = m.clobberedRegs[:0] m.spillSlotSize = 0 m.maxRequiredStackSizeForCalls = 0 + m.perBlockHead, m.perBlockEnd, m.rootInstr = nil, nil, nil + m.pendingInstructions = m.pendingInstructions[:0] + m.orderedSSABlockLabelPos = m.orderedSSABlockLabelPos[:0] m.amodePool.Reset() - m.jmpTableTargets = m.jmpTableTargets[:0] + m.jmpTableTargetsNext = 0 m.constSwizzleMaskConstIndex = -1 m.constSqmulRoundSatIndex = -1 m.constI8x16SHLMaskTableIndex = -1 @@ -146,8 +221,63 @@ func (m *machine) Reset() { m.constExtAddPairwiseI16x8uMask2Index = -1 } -// ExecutableContext implements backend.Machine. -func (m *machine) ExecutableContext() backend.ExecutableContext { return m.ectx } +// StartLoweringFunction implements backend.Machine StartLoweringFunction. +func (m *machine) StartLoweringFunction(maxBlockID ssa.BasicBlockID) { + m.maxSSABlockID = label(maxBlockID) + m.nextLabel = label(maxBlockID) + 1 +} + +// LinkAdjacentBlocks implements backend.Machine. +func (m *machine) LinkAdjacentBlocks(prev, next ssa.BasicBlock) { + prevPos, nextPos := m.getOrAllocateSSABlockLabelPosition(prev), m.getOrAllocateSSABlockLabelPosition(next) + prevPos.end.next = nextPos.begin +} + +// StartBlock implements backend.Machine. +func (m *machine) StartBlock(blk ssa.BasicBlock) { + m.currentLabelPos = m.getOrAllocateSSABlockLabelPosition(blk) + labelPos := m.currentLabelPos + end := m.allocateNop() + m.perBlockHead, m.perBlockEnd = end, end + labelPos.begin, labelPos.end = end, end + m.orderedSSABlockLabelPos = append(m.orderedSSABlockLabelPos, labelPos) +} + +// EndBlock implements ExecutableContext. +func (m *machine) EndBlock() { + // Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions. + m.insertAtPerBlockHead(m.allocateNop()) + + m.currentLabelPos.begin = m.perBlockHead + + if m.currentLabelPos.sb.EntryBlock() { + m.rootInstr = m.perBlockHead + } +} + +func (m *machine) insertAtPerBlockHead(i *instruction) { + if m.perBlockHead == nil { + m.perBlockHead = i + m.perBlockEnd = i + return + } + + i.next = m.perBlockHead + m.perBlockHead.prev = i + m.perBlockHead = i +} + +// FlushPendingInstructions implements backend.Machine. 
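The labeling scheme above keeps labels and SSA block IDs numerically identical below maxSSABlockID (which is what makes the debug output readable), reserves math.MaxUint32 for the return block, and hands out fresh labels for constants and branch targets only above the block-ID range. A small sketch of that invariant:

package main

import (
	"fmt"
	"math"
)

type label uint32

const labelReturn label = math.MaxUint32

type allocator struct{ next label }

// start mirrors StartLoweringFunction: the first free label comes after the
// largest SSA block ID, so block IDs and labels coincide for SSA blocks.
func (a *allocator) start(maxSSABlockID uint32) { a.next = label(maxSSABlockID) + 1 }

func (a *allocator) allocate() label { l := a.next; a.next++; return l }

func main() {
	var a allocator
	a.start(9) // the function has blocks blk0..blk9, labels L0..L9
	fmt.Println(a.allocate(), a.allocate(), labelReturn) // 10 11 4294967295
}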
+func (m *machine) FlushPendingInstructions() { + l := len(m.pendingInstructions) + if l == 0 { + return + } + for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order. + m.insertAtPerBlockHead(m.pendingInstructions[i]) + } + m.pendingInstructions = m.pendingInstructions[:0] +} // DisableStackCheck implements backend.Machine. func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true } @@ -155,23 +285,17 @@ func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true } // SetCompiler implements backend.Machine. func (m *machine) SetCompiler(c backend.Compiler) { m.c = c - m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, c.SSABuilder(), c) + m.regAllocFn.ssaB = c.SSABuilder() } // SetCurrentABI implements backend.Machine. -func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { - m.currentABI = abi -} +func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { m.currentABI = abi } // RegAlloc implements backend.Machine. func (m *machine) RegAlloc() { rf := m.regAllocFn - for _, pos := range m.ectx.OrderedBlockLabels { - rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End) - } - m.regAllocStarted = true - m.regAlloc.DoAllocation(rf) + m.regAlloc.DoAllocation(&rf) // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 } @@ -184,49 +308,54 @@ func (m *machine) InsertReturn() { // LowerSingleBranch implements backend.Machine. func (m *machine) LowerSingleBranch(b *ssa.Instruction) { - ectx := m.ectx switch b.Opcode() { case ssa.OpcodeJump: - _, _, targetBlk := b.BranchData() + _, _, targetBlkID := b.BranchData() if b.IsFallthroughJump() { return } jmp := m.allocateInstr() - target := ectx.GetOrAllocateSSABlockLabel(targetBlk) - if target == backend.LabelReturn { + target := ssaBlockLabel(m.c.SSABuilder().BasicBlock(targetBlkID)) + if target == labelReturn { jmp.asRet() } else { jmp.asJmp(newOperandLabel(target)) } m.insert(jmp) case ssa.OpcodeBrTable: - index, target := b.BrTableData() - m.lowerBrTable(index, target) + index, targetBlkIDs := b.BrTableData() + m.lowerBrTable(index, targetBlkIDs) default: panic("BUG: unexpected branch opcode" + b.Opcode().String()) } } -func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) { - // TODO: reuse the slice! - labels := make([]uint32, len(targets)) - for j, target := range targets { - labels[j] = uint32(m.ectx.GetOrAllocateSSABlockLabel(target)) +func (m *machine) addJmpTableTarget(targets ssa.Values) (index int) { + if m.jmpTableTargetsNext == len(m.jmpTableTargets) { + m.jmpTableTargets = append(m.jmpTableTargets, make([]uint32, 0, len(targets.View()))) + } + + index = m.jmpTableTargetsNext + m.jmpTableTargetsNext++ + m.jmpTableTargets[index] = m.jmpTableTargets[index][:0] + for _, targetBlockID := range targets.View() { + target := m.c.SSABuilder().BasicBlock(ssa.BasicBlockID(targetBlockID)) + m.jmpTableTargets[index] = append(m.jmpTableTargets[index], uint32(ssaBlockLabel(target))) } - index = len(m.jmpTableTargets) - m.jmpTableTargets = append(m.jmpTableTargets, labels) return } var condBranchMatches = [...]ssa.Opcode{ssa.OpcodeIcmp, ssa.OpcodeFcmp} -func (m *machine) lowerBrTable(index ssa.Value, targets []ssa.BasicBlock) { +func (m *machine) lowerBrTable(index ssa.Value, targets ssa.Values) { _v := m.getOperand_Reg(m.c.ValueDefinition(index)) v := m.copyToTmp(_v.reg()) + targetCount := len(targets.View()) + // First, we need to do the bounds check. 
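Why FlushPendingInstructions walks the slice backwards: blocks are lowered bottom-up, so each flushed batch must be prepended to the per-block list while preserving the batch's internal order, and reverse iteration plus head insertion does exactly that. A runnable illustration with illustrative types:

package main

import "fmt"

type node struct {
	val        string
	prev, next *node
}

type list struct{ head *node }

func (l *list) insertAtHead(n *node) {
	n.next = l.head
	if l.head != nil {
		l.head.prev = n
	}
	l.head = n
}

func main() {
	var l list
	l.insertAtHead(&node{val: "X"}) // already-emitted tail of the block
	// One lowered group, in forward program order:
	pending := []*node{{val: "A"}, {val: "B"}, {val: "C"}}
	for i := len(pending) - 1; i >= 0; i-- { // reverse, as in the hunk above
		l.insertAtHead(pending[i])
	}
	for n := l.head; n != nil; n = n.next {
		fmt.Print(n.val, " ") // A B C X
	}
}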
maxIndex := m.c.AllocateVReg(ssa.TypeI32) - m.lowerIconst(maxIndex, uint64(len(targets)-1), false) + m.lowerIconst(maxIndex, uint64(targetCount-1), false) cmp := m.allocateInstr().asCmpRmiR(true, newOperandReg(maxIndex), v, false) m.insert(cmp) @@ -255,23 +384,22 @@ func (m *machine) lowerBrTable(index ssa.Value, targets []ssa.BasicBlock) { jmpTable := m.allocateInstr() targetSliceIndex := m.addJmpTableTarget(targets) - jmpTable.asJmpTableSequence(targetSliceIndex, len(targets)) + jmpTable.asJmpTableSequence(targetSliceIndex, targetCount) m.insert(jmpTable) } // LowerConditionalBranch implements backend.Machine. func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { - exctx := m.ectx - cval, args, targetBlk := b.BranchData() + cval, args, targetBlkID := b.BranchData() if len(args) > 0 { panic(fmt.Sprintf( "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", - exctx.CurrentSSABlk, - targetBlk, + m.currentLabelPos.sb, + targetBlkID, )) } - target := exctx.GetOrAllocateSSABlockLabel(targetBlk) + target := ssaBlockLabel(m.c.SSABuilder().BasicBlock(targetBlkID)) cvalDef := m.c.ValueDefinition(cval) switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { @@ -1272,9 +1400,9 @@ func (m *machine) lowerVconst(dst regalloc.VReg, lo, hi uint64) { } load := m.allocateInstr() - constLabel := m.allocateLabel() - m.consts = append(m.consts, _const{label: constLabel, lo: lo, hi: hi}) - load.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(constLabel.L)), dst) + l, pos := m.allocateLabel() + m.consts = append(m.consts, _const{label: l, labelPos: pos, lo: lo, hi: hi}) + load.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(l)), dst) m.insert(load) } @@ -1473,21 +1601,24 @@ func (m *machine) lowerExitIfTrueWithCode(execCtx regalloc.VReg, cond ssa.Value, jmpIf.asJmpIf(condFromSSAIntCmpCond(c).invert(), newOperandLabel(l)) } -func (m *machine) tryLowerBandToFlag(x, y *backend.SSAValueDefinition) (ok bool) { - var target *backend.SSAValueDefinition +func (m *machine) tryLowerBandToFlag(x, y backend.SSAValueDefinition) (ok bool) { + var target backend.SSAValueDefinition + var got bool if x.IsFromInstr() && x.Instr.Constant() && x.Instr.ConstantVal() == 0 { if m.c.MatchInstr(y, ssa.OpcodeBand) { target = y + got = true } } if y.IsFromInstr() && y.Instr.Constant() && y.Instr.ConstantVal() == 0 { if m.c.MatchInstr(x, ssa.OpcodeBand) { target = x + got = true } } - if target == nil { + if !got { return false } @@ -1522,7 +1653,7 @@ func (m *machine) allocateExitInstructions(execCtx, exitCodeReg regalloc.VReg) ( return } -func (m *machine) lowerExitWithCode(execCtx regalloc.VReg, code wazevoapi.ExitCode) (afterLabel backend.Label) { +func (m *machine) lowerExitWithCode(execCtx regalloc.VReg, code wazevoapi.ExitCode) (afterLabel label) { exitCodeReg := rbpVReg saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtx, exitCodeReg) @@ -1819,9 +1950,9 @@ func (m *machine) lowerCall(si *ssa.Instruction) { // callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the // caller side of the function call. 
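The rewritten addJmpTableTarget replaces the old per-call allocation (note the deleted "TODO: reuse the slice!") with a cursor, jmpTableTargetsNext, into a retained slice-of-slices, so Reset can recycle every table at zero cost. A sketch of that reuse pattern:

package main

import "fmt"

type tables struct {
	data [][]uint32
	next int
}

func (t *tables) add(targets []uint32) (index int) {
	if t.next == len(t.data) {
		t.data = append(t.data, make([]uint32, 0, len(targets)))
	}
	index = t.next
	t.next++
	t.data[index] = append(t.data[index][:0], targets...)
	return index
}

// reset keeps the backing arrays but marks every table free for reuse.
func (t *tables) reset() { t.next = 0 }

func main() {
	var t tables
	i := t.add([]uint32{1, 2, 3})
	t.reset()
	j := t.add([]uint32{4, 5})   // reuses the first table's storage
	fmt.Println(i, j, t.data[j]) // 0 0 [4 5]
}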
-func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, stackSlotSize int64) { +func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def backend.SSAValueDefinition, stackSlotSize int64) { arg := &a.Args[argIndex] - if def != nil && def.IsFromInstr() { + if def.IsFromInstr() { // Constant instructions are inlined. if inst := def.Instr; inst.Constant() { m.insertLoadConstant(inst, reg) @@ -1904,23 +2035,20 @@ func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { // Format implements backend.Machine. func (m *machine) Format() string { - ectx := m.ectx - begins := map[*instruction]backend.Label{} - for l, pos := range ectx.LabelPositions { - begins[pos.Begin] = l - } - - irBlocks := map[backend.Label]ssa.BasicBlockID{} - for i, l := range ectx.SsaBlockIDToLabels { - irBlocks[l] = ssa.BasicBlockID(i) + begins := map[*instruction]label{} + for l := label(0); l < m.nextLabel; l++ { + pos := m.labelPositionPool.Get(int(l)) + if pos != nil { + begins[pos.begin] = l + } } var lines []string - for cur := ectx.RootInstr; cur != nil; cur = cur.next { + for cur := m.rootInstr; cur != nil; cur = cur.next { if l, ok := begins[cur]; ok { var labelStr string - if blkID, ok := irBlocks[l]; ok { - labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) + if l <= m.maxSSABlockID { + labelStr = fmt.Sprintf("%s (SSA Block: blk%d):", l, l) } else { labelStr = fmt.Sprintf("%s:", l) } @@ -1933,9 +2061,9 @@ func (m *machine) Format() string { } for _, vc := range m.consts { if vc._var == nil { - lines = append(lines, fmt.Sprintf("%s: const [%d %d]", vc.label.L, vc.lo, vc.hi)) + lines = append(lines, fmt.Sprintf("%s: const [%d %d]", vc.label, vc.lo, vc.hi)) } else { - lines = append(lines, fmt.Sprintf("%s: const %#x", vc.label.L, vc._var)) + lines = append(lines, fmt.Sprintf("%s: const %#x", vc.label, vc._var)) } } return "\n" + strings.Join(lines, "\n") + "\n" @@ -1943,15 +2071,14 @@ func (m *machine) Format() string { func (m *machine) encodeWithoutSSA(root *instruction) { m.labelResolutionPends = m.labelResolutionPends[:0] - ectx := m.ectx - bufPtr := m.c.BufPtr() for cur := root; cur != nil; cur = cur.next { offset := int64(len(*bufPtr)) if cur.kind == nop0 { l := cur.nop0Label() - if pos, ok := ectx.LabelPositions[l]; ok { - pos.BinaryOffset = offset + pos := m.labelPositionPool.Get(int(l)) + if pos != nil { + pos.binaryOffset = offset } } @@ -1968,7 +2095,7 @@ func (m *machine) encodeWithoutSSA(root *instruction) { switch p.instr.kind { case jmp, jmpIf, lea: target := p.instr.jmpLabel() - targetOffset := ectx.LabelPositions[target].BinaryOffset + targetOffset := m.labelPositionPool.Get(int(target)).binaryOffset imm32Offset := p.imm32Offset jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. binary.LittleEndian.PutUint32((*bufPtr)[imm32Offset:], uint32(jmpOffset)) @@ -1980,33 +2107,33 @@ func (m *machine) encodeWithoutSSA(root *instruction) { // Encode implements backend.Machine Encode. 
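The signature change from def *backend.SSAValueDefinition to a plain value swaps the old nil-pointer checks (def != nil && def.IsFromInstr()) for a zero-value predicate. A sketch of the pattern, with an illustrative definition type rather than wazero's real one:

package main

import "fmt"

type valueDef struct {
	instr *string // nil when the value is a block parameter, not an instruction result
}

// isFromInstr replaces the old double check: the zero value is itself
// meaningful, so no pointer indirection (and no nil guard) is needed.
func (d valueDef) isFromInstr() bool { return d.instr != nil }

func main() {
	s := "iconst 42"
	fmt.Println(valueDef{}.isFromInstr(), valueDef{instr: &s}.isFromInstr()) // false true
}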
func (m *machine) Encode(ctx context.Context) (err error) { - ectx := m.ectx bufPtr := m.c.BufPtr() var fn string var fnIndex int - var labelToSSABlockID map[backend.Label]ssa.BasicBlockID + var labelPosToLabel map[*labelPosition]label if wazevoapi.PerfMapEnabled { fn = wazevoapi.GetCurrentFunctionName(ctx) - labelToSSABlockID = make(map[backend.Label]ssa.BasicBlockID) - for i, l := range ectx.SsaBlockIDToLabels { - labelToSSABlockID[l] = ssa.BasicBlockID(i) + labelPosToLabel = make(map[*labelPosition]label) + for i := 0; i <= m.labelPositionPool.MaxIDEncountered(); i++ { + pos := m.labelPositionPool.Get(i) + labelPosToLabel[pos] = label(i) } fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) } m.labelResolutionPends = m.labelResolutionPends[:0] - for _, pos := range ectx.OrderedBlockLabels { + for _, pos := range m.orderedSSABlockLabelPos { offset := int64(len(*bufPtr)) - pos.BinaryOffset = offset - for cur := pos.Begin; cur != pos.End.next; cur = cur.next { + pos.binaryOffset = offset + for cur := pos.begin; cur != pos.end.next; cur = cur.next { offset := int64(len(*bufPtr)) switch cur.kind { case nop0: l := cur.nop0Label() - if pos, ok := ectx.LabelPositions[l]; ok { - pos.BinaryOffset = offset + if pos := m.labelPositionPool.Get(int(l)); pos != nil { + pos.binaryOffset = offset } case sourceOffsetInfo: m.c.AddSourceOffsetInfo(offset, cur.sourceOffsetInfo()) @@ -2021,22 +2148,16 @@ func (m *machine) Encode(ctx context.Context) (err error) { } if wazevoapi.PerfMapEnabled { - l := pos.L - var labelStr string - if blkID, ok := labelToSSABlockID[l]; ok { - labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID) - } else { - labelStr = l.String() - } + l := labelPosToLabel[pos] size := int64(len(*bufPtr)) - offset - wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr)) + wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, l)) } } for i := range m.consts { offset := int64(len(*bufPtr)) vc := &m.consts[i] - vc.label.BinaryOffset = offset + vc.labelPos.binaryOffset = offset if vc._var == nil { lo, hi := vc.lo, vc.hi m.c.Emit8Bytes(lo) @@ -2054,7 +2175,7 @@ func (m *machine) Encode(ctx context.Context) (err error) { switch p.instr.kind { case jmp, jmpIf, lea, xmmUnaryRmR: target := p.instr.jmpLabel() - targetOffset := ectx.LabelPositions[target].BinaryOffset + targetOffset := m.labelPositionPool.Get(int(target)).binaryOffset imm32Offset := p.imm32Offset jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. binary.LittleEndian.PutUint32(buf[imm32Offset:], uint32(jmpOffset)) @@ -2063,7 +2184,7 @@ func (m *machine) Encode(ctx context.Context) (err error) { // Each entry is the offset from the beginning of the jmpTableIsland instruction in 8 bytes. targets := m.jmpTableTargets[p.instr.u1] for i, l := range targets { - targetOffset := ectx.LabelPositions[backend.Label(l)].BinaryOffset + targetOffset := m.labelPositionPool.Get(int(l)).binaryOffset jmpOffset := targetOffset - tableBegin binary.LittleEndian.PutUint64(buf[tableBegin+int64(i)*8:], uint64(jmpOffset)) } @@ -2092,7 +2213,7 @@ func (m *machine) ResolveRelocations(refToBinaryOffset []int, binary []byte, rel // CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo. 
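The displacement math in the label-resolution loops deserves a worked example: the rel32 is measured from the end of the 4-byte immediate, because at run time RIP already points at the next instruction. A self-contained sketch:

package main

import (
	"encoding/binary"
	"fmt"
)

func patchRel32(buf []byte, imm32Offset, targetOffset int64) {
	jmpOffset := int32(targetOffset - (imm32Offset + 4)) // +4: RIP is past the imm32
	binary.LittleEndian.PutUint32(buf[imm32Offset:], uint32(jmpOffset))
}

func main() {
	buf := make([]byte, 16)
	// Suppose a jmp's 4-byte displacement lives at offset 1 and the target is at 10:
	patchRel32(buf, 1, 10)
	fmt.Println(int32(binary.LittleEndian.Uint32(buf[1:]))) // 5 == 10 - (1+4)
}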
func (m *machine) CallTrampolineIslandInfo(_ int) (_, _ int, _ error) { return } -func (m *machine) lowerIcmpToFlag(xd, yd *backend.SSAValueDefinition, _64 bool) { +func (m *machine) lowerIcmpToFlag(xd, yd backend.SSAValueDefinition, _64 bool) { x := m.getOperand_Reg(xd) y := m.getOperand_Mem_Imm32_Reg(yd) cmp := m.allocateInstr().asCmpRmiR(true, y, x.reg(), _64) @@ -2135,7 +2256,7 @@ func (m *machine) lowerFcmpToFlags(instr *ssa.Instruction) (f1, f2 cond, and boo // allocateInstr allocates an instruction. func (m *machine) allocateInstr() *instruction { - instr := m.ectx.InstructionPool.Allocate() + instr := m.instrPool.Allocate() if !m.regAllocStarted { instr.addedBeforeRegAlloc = true } @@ -2149,25 +2270,22 @@ func (m *machine) allocateNop() *instruction { } func (m *machine) insert(i *instruction) { - ectx := m.ectx - ectx.PendingInstructions = append(ectx.PendingInstructions, i) + m.pendingInstructions = append(m.pendingInstructions, i) } -func (m *machine) allocateBrTarget() (nop *instruction, l backend.Label) { //nolint - pos := m.allocateLabel() - l = pos.L +func (m *machine) allocateBrTarget() (nop *instruction, l label) { //nolint + l, pos := m.allocateLabel() nop = m.allocateInstr() nop.asNop0WithLabel(l) - pos.Begin, pos.End = nop, nop + pos.begin, pos.end = nop, nop return } -func (m *machine) allocateLabel() *labelPosition { - ectx := m.ectx - l := ectx.AllocateLabel() - pos := ectx.AllocateLabelPosition(l) - ectx.LabelPositions[l] = pos - return pos +func (m *machine) allocateLabel() (label, *labelPosition) { + l := m.nextLabel + pos := m.labelPositionPool.GetOrAllocate(int(l)) + m.nextLabel++ + return l, pos } func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { @@ -3181,22 +3299,22 @@ func (m *machine) lowerShuffle(x, y ssa.Value, lo, hi uint64, ret ssa.Value) { } } - xmaskLabel := m.allocateLabel() - m.consts = append(m.consts, _const{lo: xMask[0], hi: xMask[1], label: xmaskLabel}) - ymaskLabel := m.allocateLabel() - m.consts = append(m.consts, _const{lo: yMask[0], hi: yMask[1], label: ymaskLabel}) + xl, xmaskPos := m.allocateLabel() + m.consts = append(m.consts, _const{lo: xMask[0], hi: xMask[1], label: xl, labelPos: xmaskPos}) + yl, ymaskPos := m.allocateLabel() + m.consts = append(m.consts, _const{lo: yMask[0], hi: yMask[1], label: yl, labelPos: ymaskPos}) xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) tmpX, tmpY := m.copyToTmp(xx.reg()), m.copyToTmp(yy.reg()) // Apply mask to X. tmp := m.c.AllocateVReg(ssa.TypeV128) - loadMaskLo := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(xmaskLabel.L)), tmp) + loadMaskLo := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(xl)), tmp) m.insert(loadMaskLo) m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpX)) // Apply mask to Y. 
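allocateBrTarget above binds a label by emitting a zero-width nop0 that carries the label number; encoding later records the nop's offset as the label's address. A sketch of the idea with illustrative types:

package main

import "fmt"

type label uint32

type ins struct {
	isNop bool
	lbl   label
	size  int
}

func main() {
	prog := []ins{{size: 3}, {isNop: true, lbl: 7}, {size: 5}}
	offsets := map[label]int{}
	off := 0
	for _, i := range prog {
		if i.isNop {
			offsets[i.lbl] = off // the nop emits no bytes; it only pins the label
		}
		off += i.size
	}
	fmt.Println(offsets[7]) // 3
}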
- loadMaskHi := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(ymaskLabel.L)), tmp) + loadMaskHi := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(yl)), tmp) m.insert(loadMaskHi) m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpY)) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go index 8fa974c661..e53729860d 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go @@ -12,7 +12,7 @@ func (m *machine) PostRegAlloc() { } func (m *machine) setupPrologue() { - cur := m.ectx.RootInstr + cur := m.rootInstr prevInitInst := cur.next // At this point, we have the stack layout as follows: @@ -130,14 +130,13 @@ func (m *machine) setupPrologue() { // 3. Inserts the dec/inc RSP instruction right before/after the call instruction. // 4. Lowering that is supposed to be done after regalloc. func (m *machine) postRegAlloc() { - ectx := m.ectx - for cur := ectx.RootInstr; cur != nil; cur = cur.next { + for cur := m.rootInstr; cur != nil; cur = cur.next { switch k := cur.kind; k { case ret: m.setupEpilogueAfter(cur.prev) continue case fcvtToSintSequence, fcvtToUintSequence: - m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + m.pendingInstructions = m.pendingInstructions[:0] if k == fcvtToSintSequence { m.lowerFcvtToSintSequenceAfterRegalloc(cur) } else { @@ -146,29 +145,29 @@ func (m *machine) postRegAlloc() { prev := cur.prev next := cur.next cur := prev - for _, instr := range m.ectx.PendingInstructions { + for _, instr := range m.pendingInstructions { cur = linkInstr(cur, instr) } linkInstr(cur, next) continue case xmmCMov: - m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + m.pendingInstructions = m.pendingInstructions[:0] m.lowerXmmCmovAfterRegAlloc(cur) prev := cur.prev next := cur.next cur := prev - for _, instr := range m.ectx.PendingInstructions { + for _, instr := range m.pendingInstructions { cur = linkInstr(cur, instr) } linkInstr(cur, next) continue case idivRemSequence: - m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + m.pendingInstructions = m.pendingInstructions[:0] m.lowerIDivRemSequenceAfterRegAlloc(cur) prev := cur.prev next := cur.next cur := prev - for _, instr := range m.ectx.PendingInstructions { + for _, instr := range m.pendingInstructions { cur = linkInstr(cur, instr) } linkInstr(cur, next) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go index 0bb28ee9e7..de9dcc9444 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go @@ -1,13 +1,226 @@ package amd64 import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" ) -// InsertMoveBefore implements backend.RegAllocFunctionMachine. 
-func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { +// regAllocFn implements regalloc.Function. +type regAllocFn struct { + ssaB ssa.Builder + m *machine + loopNestingForestRoots []ssa.BasicBlock + blockIter int +} + +// PostOrderBlockIteratorBegin implements regalloc.Function. +func (f *regAllocFn) PostOrderBlockIteratorBegin() *labelPosition { + f.blockIter = len(f.m.orderedSSABlockLabelPos) - 1 + return f.PostOrderBlockIteratorNext() +} + +// PostOrderBlockIteratorNext implements regalloc.Function. +func (f *regAllocFn) PostOrderBlockIteratorNext() *labelPosition { + if f.blockIter < 0 { + return nil + } + b := f.m.orderedSSABlockLabelPos[f.blockIter] + f.blockIter-- + return b +} + +// ReversePostOrderBlockIteratorBegin implements regalloc.Function. +func (f *regAllocFn) ReversePostOrderBlockIteratorBegin() *labelPosition { + f.blockIter = 0 + return f.ReversePostOrderBlockIteratorNext() +} + +// ReversePostOrderBlockIteratorNext implements regalloc.Function. +func (f *regAllocFn) ReversePostOrderBlockIteratorNext() *labelPosition { + if f.blockIter >= len(f.m.orderedSSABlockLabelPos) { + return nil + } + b := f.m.orderedSSABlockLabelPos[f.blockIter] + f.blockIter++ + return b +} + +// ClobberedRegisters implements regalloc.Function. +func (f *regAllocFn) ClobberedRegisters(regs []regalloc.VReg) { + f.m.clobberedRegs = append(f.m.clobberedRegs[:0], regs...) +} + +// LoopNestingForestRoots implements regalloc.Function. +func (f *regAllocFn) LoopNestingForestRoots() int { + f.loopNestingForestRoots = f.ssaB.LoopNestingForestRoots() + return len(f.loopNestingForestRoots) +} + +// LoopNestingForestRoot implements regalloc.Function. +func (f *regAllocFn) LoopNestingForestRoot(i int) *labelPosition { + root := f.loopNestingForestRoots[i] + pos := f.m.getOrAllocateSSABlockLabelPosition(root) + return pos +} + +// LowestCommonAncestor implements regalloc.Function. +func (f *regAllocFn) LowestCommonAncestor(blk1, blk2 *labelPosition) *labelPosition { + sb := f.ssaB.LowestCommonAncestor(blk1.sb, blk2.sb) + pos := f.m.getOrAllocateSSABlockLabelPosition(sb) + return pos +} + +// Idom implements regalloc.Function. +func (f *regAllocFn) Idom(blk *labelPosition) *labelPosition { + sb := f.ssaB.Idom(blk.sb) + pos := f.m.getOrAllocateSSABlockLabelPosition(sb) + return pos +} + +// SwapBefore implements regalloc.Function. +func (f *regAllocFn) SwapBefore(x1, x2, tmp regalloc.VReg, instr *instruction) { + f.m.swap(instr.prev, x1, x2, tmp) +} + +// StoreRegisterBefore implements regalloc.Function. +func (f *regAllocFn) StoreRegisterBefore(v regalloc.VReg, instr *instruction) { + m := f.m + m.insertStoreRegisterAt(v, instr, false) +} + +// StoreRegisterAfter implements regalloc.Function. +func (f *regAllocFn) StoreRegisterAfter(v regalloc.VReg, instr *instruction) { + m := f.m + m.insertStoreRegisterAt(v, instr, true) +} + +// ReloadRegisterBefore implements regalloc.Function. +func (f *regAllocFn) ReloadRegisterBefore(v regalloc.VReg, instr *instruction) { + m := f.m + m.insertReloadRegisterAt(v, instr, false) +} + +// ReloadRegisterAfter implements regalloc.Function. +func (f *regAllocFn) ReloadRegisterAfter(v regalloc.VReg, instr *instruction) { + m := f.m + m.insertReloadRegisterAt(v, instr, true) +} + +// InsertMoveBefore implements regalloc.Function. +func (f *regAllocFn) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { + f.m.insertMoveBefore(dst, src, instr) +} + +// LoopNestingForestChild implements regalloc.Function. 
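The iterator methods above expose a single slice, orderedSSABlockLabelPos (kept in reverse post order), two ways: walking it backwards yields post order, walking it forwards yields reverse post order. A compact sketch of the cursor pattern:

package main

import "fmt"

type iter struct {
	blocks []string // assumed stored in reverse post order (layout order)
	cur    int
}

func (it *iter) postOrderBegin() string { it.cur = len(it.blocks) - 1; return it.postOrderNext() }

func (it *iter) postOrderNext() string {
	if it.cur < 0 {
		return "" // sentinel for "done", like the nil return above
	}
	b := it.blocks[it.cur]
	it.cur--
	return b
}

func main() {
	it := iter{blocks: []string{"entry", "body", "exit"}}
	for b := it.postOrderBegin(); b != ""; b = it.postOrderNext() {
		fmt.Print(b, " ") // exit body entry
	}
}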
+func (f *regAllocFn) LoopNestingForestChild(pos *labelPosition, i int) *labelPosition { + childSB := pos.sb.LoopNestingForestChildren()[i] + return f.m.getOrAllocateSSABlockLabelPosition(childSB) +} + +// Succ implements regalloc.Block. +func (f *regAllocFn) Succ(pos *labelPosition, i int) *labelPosition { + succSB := pos.sb.Succ(i) + if succSB.ReturnBlock() { + return nil + } + return f.m.getOrAllocateSSABlockLabelPosition(succSB) +} + +// Pred implements regalloc.Block. +func (f *regAllocFn) Pred(pos *labelPosition, i int) *labelPosition { + predSB := pos.sb.Pred(i) + return f.m.getOrAllocateSSABlockLabelPosition(predSB) +} + +// BlockParams implements regalloc.Function. +func (f *regAllocFn) BlockParams(pos *labelPosition, regs *[]regalloc.VReg) []regalloc.VReg { + c := f.m.c + *regs = (*regs)[:0] + for i := 0; i < pos.sb.Params(); i++ { + v := c.VRegOf(pos.sb.Param(i)) + *regs = append(*regs, v) + } + return *regs +} + +// ID implements regalloc.Block. +func (pos *labelPosition) ID() int32 { + return int32(pos.sb.ID()) +} + +// InstrIteratorBegin implements regalloc.Block. +func (pos *labelPosition) InstrIteratorBegin() *instruction { + ret := pos.begin + pos.cur = ret + return ret +} + +// InstrIteratorNext implements regalloc.Block. +func (pos *labelPosition) InstrIteratorNext() *instruction { + for { + if pos.cur == pos.end { + return nil + } + instr := pos.cur.next + pos.cur = instr + if instr == nil { + return nil + } else if instr.addedBeforeRegAlloc { + // Only concerned about the instruction added before regalloc. + return instr + } + } +} + +// InstrRevIteratorBegin implements regalloc.Block. +func (pos *labelPosition) InstrRevIteratorBegin() *instruction { + pos.cur = pos.end + return pos.cur +} + +// InstrRevIteratorNext implements regalloc.Block. +func (pos *labelPosition) InstrRevIteratorNext() *instruction { + for { + if pos.cur == pos.begin { + return nil + } + instr := pos.cur.prev + pos.cur = instr + if instr == nil { + return nil + } else if instr.addedBeforeRegAlloc { + // Only concerned about the instruction added before regalloc. + return instr + } + } +} + +// FirstInstr implements regalloc.Block. +func (pos *labelPosition) FirstInstr() *instruction { return pos.begin } + +// LastInstrForInsertion implements regalloc.Block. +func (pos *labelPosition) LastInstrForInsertion() *instruction { + return lastInstrForInsertion(pos.begin, pos.end) +} + +// Preds implements regalloc.Block. +func (pos *labelPosition) Preds() int { return pos.sb.Preds() } + +// Entry implements regalloc.Block. +func (pos *labelPosition) Entry() bool { return pos.sb.EntryBlock() } + +// Succs implements regalloc.Block. +func (pos *labelPosition) Succs() int { return pos.sb.Succs() } + +// LoopHeader implements regalloc.Block. +func (pos *labelPosition) LoopHeader() bool { return pos.sb.LoopHeader() } + +// LoopNestingForestChildren implements regalloc.Block. +func (pos *labelPosition) LoopNestingForestChildren() int { + return len(pos.sb.LoopNestingForestChildren()) +} + +func (m *machine) insertMoveBefore(dst, src regalloc.VReg, instr *instruction) { typ := src.RegType() if typ != dst.RegType() { panic("BUG: src and dst must have the same type") @@ -26,8 +239,7 @@ func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { linkInstr(cur, prevNext) } -// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine. 
-func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { +func (m *machine) insertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { if !v.IsRealReg() { panic("BUG: VReg must be backed by real reg to be stored") } @@ -61,8 +273,7 @@ func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, aft return linkInstr(cur, prevNext) } -// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine. -func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { +func (m *machine) insertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { if !v.IsRealReg() { panic("BUG: VReg must be backed by real reg to be stored") } @@ -98,13 +309,7 @@ func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, af return linkInstr(cur, prevNext) } -// ClobberedRegisters implements backend.RegAllocFunctionMachine. -func (m *machine) ClobberedRegisters(regs []regalloc.VReg) { - m.clobberedRegs = append(m.clobberedRegs[:0], regs...) -} - -// Swap implements backend.RegAllocFunctionMachine. -func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) { +func (m *machine) swap(cur *instruction, x1, x2, tmp regalloc.VReg) { if x1.RegType() == regalloc.RegTypeInt { prevNext := cur.next xc := m.allocateInstr().asXCHG(x1, newOperandReg(x2), 8) @@ -113,25 +318,24 @@ func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) { } else { if tmp.Valid() { prevNext := cur.next - m.InsertMoveBefore(tmp, x1, prevNext) - m.InsertMoveBefore(x1, x2, prevNext) - m.InsertMoveBefore(x2, tmp, prevNext) + m.insertMoveBefore(tmp, x1, prevNext) + m.insertMoveBefore(x1, x2, prevNext) + m.insertMoveBefore(x2, tmp, prevNext) } else { prevNext := cur.next r2 := x2.RealReg() // Temporarily spill x1 to stack. - cur = m.InsertStoreRegisterAt(x1, cur, true).prev + cur = m.insertStoreRegisterAt(x1, cur, true).prev // Then move x2 to x1. cur = linkInstr(cur, m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqa, newOperandReg(x2), x1)) linkInstr(cur, prevNext) // Then reload the original value on x1 from stack to r2. - m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true) + m.insertReloadRegisterAt(x1.SetRealReg(r2), cur, true) } } } -// LastInstrForInsertion implements backend.RegAllocFunctionMachine. -func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction { +func lastInstrForInsertion(begin, end *instruction) *instruction { cur := end for cur.kind == nop0 { cur = cur.prev @@ -146,8 +350,3 @@ func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction { return end } } - -// SSABlockLabel implements backend.RegAllocFunctionMachine. 
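The float path of swap above uses the classic three-move swap through a scratch register when one is available, and falls back to spill/reload otherwise. A sketch of the three-move case, modeling registers as a map:

package main

import "fmt"

func swapWithTmp(regs map[string]int, x1, x2, tmp string) {
	regs[tmp] = regs[x1] // insertMoveBefore(tmp, x1, ...)
	regs[x1] = regs[x2]  // insertMoveBefore(x1, x2, ...)
	regs[x2] = regs[tmp] // insertMoveBefore(x2, tmp, ...)
}

func main() {
	regs := map[string]int{"xmm0": 1, "xmm1": 2, "xmm15": 0}
	swapWithTmp(regs, "xmm0", "xmm1", "xmm15")
	fmt.Println(regs["xmm0"], regs["xmm1"]) // 2 1
}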
-func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label { - return m.ectx.SsaBlockIDToLabels[id] -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go index 539a8b754b..8d514d8576 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go @@ -127,7 +127,7 @@ func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) { tmpX := m.copyToTmp(xx.reg()) m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmpX), tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqw, newOperandReg(tmpX), tmp)) m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX)) m.copyTo(tmpX, m.c.VRegOf(ret)) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go index c6fcb86731..7879756833 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go @@ -59,7 +59,7 @@ func (o *operand) format(_64 bool) string { case operandKindImm32: return fmt.Sprintf("$%d", int32(o.imm32())) case operandKindLabel: - return backend.Label(o.imm32()).String() + return label(o.imm32()).String() default: panic(fmt.Sprintf("BUG: invalid operand: %s", o.kind)) } @@ -85,22 +85,22 @@ func (o *operand) imm32() uint32 { return uint32(o.data) } -func (o *operand) label() backend.Label { +func (o *operand) label() label { switch o.kind { case operandKindLabel: - return backend.Label(o.data) + return label(o.data) case operandKindMem: mem := o.addressMode() if mem.kind() != amodeRipRel { panic("BUG: invalid label") } - return backend.Label(mem.imm32) + return label(mem.imm32) default: panic("BUG: invalid operand kind") } } -func newOperandLabel(label backend.Label) operand { +func newOperandLabel(label label) operand { return operand{kind: operandKindLabel, data: uint64(label)} } @@ -221,7 +221,7 @@ func (m *machine) newAmodeRegRegShift(imm32 uint32, base, index regalloc.VReg, s return ret } -func (m *machine) newAmodeRipRel(label backend.Label) *amode { +func (m *machine) newAmodeRipRel(label label) *amode { ret := m.amodePool.Allocate() *ret = amode{kindWithShift: uint32(amodeRipRel), imm32: uint32(label)} return ret @@ -246,18 +246,18 @@ func (a *amode) String() string { "%d(%s,%s,%d)", int32(a.imm32), formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift) case amodeRipRel: - return fmt.Sprintf("%s(%%rip)", backend.Label(a.imm32)) + return fmt.Sprintf("%s(%%rip)", label(a.imm32)) default: panic("BUG: invalid amode kind") } } -func (m *machine) getOperand_Mem_Reg(def *backend.SSAValueDefinition) (op operand) { - if def.IsFromBlockParam() { - return newOperandReg(def.BlkParamVReg) +func (m *machine) getOperand_Mem_Reg(def backend.SSAValueDefinition) (op operand) { + if !def.IsFromInstr() { + return newOperandReg(m.c.VRegOf(def.V)) } - if def.SSAValue().Type() == ssa.TypeV128 { + if def.V.Type() == ssa.TypeV128 { // SIMD instructions require strict memory alignment, so we don't support the memory operand for V128 at the moment. 
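Context for the pcmpeqd -> pcmpeqw fix in lowerSqmulRoundSat above: i16x8.q15mulr_sat_s operates on 16-bit lanes, and the only overflowing input pair is -32768 * -32768, which PMULHRSW yields as -32768 instead of the saturated +32767; the equality mask that flips exactly those lanes must therefore compare words, not doublewords. A scalar sketch of the lane semantics (not wazero's SIMD lowering):

package main

import "fmt"

func q15MulRSat(a, b int16) int16 {
	p := (int32(a)*int32(b) + (1 << 14)) >> 15 // rounding Q15 multiply, as PMULHRSW does
	if p > 32767 {                             // only reachable when a == b == -32768
		return 32767
	}
	return int16(p)
}

func main() {
	fmt.Println(q15MulRSat(-32768, -32768)) // 32767, the saturated result
	fmt.Println(q15MulRSat(16384, 16384))   // 8192 (0.5 * 0.5 = 0.25 in Q15)
}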
return m.getOperand_Reg(def) } @@ -272,9 +272,9 @@ func (m *machine) getOperand_Mem_Reg(def *backend.SSAValueDefinition) (op operan return m.getOperand_Reg(def) } -func (m *machine) getOperand_Mem_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) { - if def.IsFromBlockParam() { - return newOperandReg(def.BlkParamVReg) +func (m *machine) getOperand_Mem_Imm32_Reg(def backend.SSAValueDefinition) (op operand) { + if !def.IsFromInstr() { + return newOperandReg(m.c.VRegOf(def.V)) } if m.c.MatchInstr(def, ssa.OpcodeLoad) { @@ -287,9 +287,9 @@ func (m *machine) getOperand_Mem_Imm32_Reg(def *backend.SSAValueDefinition) (op return m.getOperand_Imm32_Reg(def) } -func (m *machine) getOperand_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) { - if def.IsFromBlockParam() { - return newOperandReg(def.BlkParamVReg) +func (m *machine) getOperand_Imm32_Reg(def backend.SSAValueDefinition) (op operand) { + if !def.IsFromInstr() { + return newOperandReg(m.c.VRegOf(def.V)) } instr := def.Instr @@ -323,24 +323,14 @@ func asImm32(val uint64, allowSignExt bool) (uint32, bool) { return u32val, true } -func (m *machine) getOperand_Reg(def *backend.SSAValueDefinition) (op operand) { +func (m *machine) getOperand_Reg(def backend.SSAValueDefinition) (op operand) { var v regalloc.VReg - if def.IsFromBlockParam() { - v = def.BlkParamVReg + if instr := def.Instr; instr != nil && instr.Constant() { + // We inline all the constant instructions so that we could reduce the register usage. + v = m.lowerConstant(instr) + instr.MarkLowered() } else { - instr := def.Instr - if instr.Constant() { - // We inline all the constant instructions so that we could reduce the register usage. - v = m.lowerConstant(instr) - instr.MarkLowered() - } else { - if n := def.N; n == 0 { - v = m.c.VRegOf(instr.Return()) - } else { - _, rs := instr.Returns() - v = m.c.VRegOf(rs[n-1]) - } - } + v = m.c.VRegOf(def.V) } return newOperandReg(v) } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go deleted file mode 100644 index 5219837e35..0000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go +++ /dev/null @@ -1,11 +0,0 @@ -//go:build !tinygo - -package amd64 - -import "reflect" - -// setSliceLimits sets both Cap and Len for the given reflected slice. -func setSliceLimits(s *reflect.SliceHeader, limit uintptr) { - s.Len = int(limit) - s.Cap = int(limit) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go deleted file mode 100644 index df4cf46ec5..0000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go +++ /dev/null @@ -1,11 +0,0 @@ -//go:build tinygo - -package amd64 - -import "reflect" - -// setSliceLimits sets both Cap and Len for the given reflected slice. 
-func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
-	s.Len = limit
-	s.Cap = limit
-}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go
index 05ba5f027e..ef823bdbdc 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go
@@ -9,12 +9,14 @@ import (
 )
 
 func stackView(rbp, top uintptr) []byte {
+	l := int(top - rbp)
 	var stackBuf []byte
 	{
-		// TODO: use unsafe.Slice after floor version is set to Go 1.20.
+		//nolint:staticcheck
 		hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf))
 		hdr.Data = rbp
-		setSliceLimits(hdr, top-rbp)
+		hdr.Len = l
+		hdr.Cap = l
 	}
 	return stackBuf
 }
@@ -72,9 +74,9 @@ func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
 	// | SizeInBytes     |
 	// +-----------------+ <---- stackPointerBeforeGoCall
 	// (low address)
-	data := unsafe.Pointer(uintptr(unsafe.Pointer(stackPointerBeforeGoCall)) + 8)
+	data := unsafe.Add(unsafe.Pointer(stackPointerBeforeGoCall), 8)
 	size := *stackPointerBeforeGoCall / 8
-	return unsafe.Slice((*uint64)(data), int(size))
+	return unsafe.Slice((*uint64)(data), size)
 }
 
 func AdjustClonedStack(oldRsp, oldTop, rsp, rbp, top uintptr) {
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
index 6615471c6a..d1eaa7cd4f 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
@@ -101,13 +101,14 @@ func (m *machine) LowerParams(args []ssa.Value) {
 			bits := arg.Type.Bits()
 			// At this point of compilation, we don't yet know how much space exist below the return address.
 			// So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation.
-			amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace}
+			amode := m.amodePool.Allocate()
+			*amode = addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace}
 			load := m.allocateInstr()
 			switch arg.Type {
 			case ssa.TypeI32, ssa.TypeI64:
-				load.asULoad(operandNR(reg), amode, bits)
+				load.asULoad(reg, amode, bits)
 			case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
-				load.asFpuLoad(operandNR(reg), amode, bits)
+				load.asFpuLoad(reg, amode, bits)
 			default:
 				panic("BUG")
 			}
@@ -169,7 +170,8 @@ func (m *machine) LowerReturns(rets []ssa.Value) {
 
 			// At this point of compilation, we don't yet know how much space exist below the return address.
 			// So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation.
-			amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace}
+			amode := m.amodePool.Allocate()
+			*amode = addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace}
 			store := m.allocateInstr()
 			store.asStore(operandNR(reg), amode, bits)
 			m.insert(store)
@@ -180,9 +182,9 @@ func (m *machine) LowerReturns(rets []ssa.Value) {
 
 // callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the
 // caller side of the function call.
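The stack.go hunks above retire the reflect.SliceHeader surgery (and the per-build setSliceLimits helpers) in favor of the Go 1.17+ unsafe.Add/unsafe.Slice primitives. A self-contained sketch of the same view-building pattern over memory the program actually owns:

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	words := [4]uint64{10, 20, 30, 40}
	base := unsafe.Pointer(&words[0])

	// Skip the first 8-byte word, then view the next 3 words as a slice,
	// mirroring GoCallStackView's unsafe.Add(ptr, 8) + unsafe.Slice(...).
	data := unsafe.Add(base, 8)
	view := unsafe.Slice((*uint64)(data), 3)
	fmt.Println(view) // [20 30 40]
}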
-func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, slotBegin int64) { +func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def backend.SSAValueDefinition, slotBegin int64) { arg := &a.Args[argIndex] - if def != nil && def.IsFromInstr() { + if def.IsFromInstr() { // Constant instructions are inlined. if inst := def.Instr; inst.Constant() { val := inst.Return() @@ -215,9 +217,9 @@ func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex i ldr := m.allocateInstr() switch r.Type { case ssa.TypeI32, ssa.TypeI64: - ldr.asULoad(operandNR(reg), amode, r.Type.Bits()) + ldr.asULoad(reg, amode, r.Type.Bits()) case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits()) + ldr.asFpuLoad(reg, amode, r.Type.Bits()) default: panic("BUG") } @@ -225,25 +227,24 @@ func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex i } } -func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) { - exct := m.executableContext - exct.PendingInstructions = exct.PendingInstructions[:0] +func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, *addressMode) { + m.pendingInstructions = m.pendingInstructions[:0] mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse) - for _, instr := range exct.PendingInstructions { + for _, instr := range m.pendingInstructions { cur = linkInstr(cur, instr) } return cur, mode } -func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode { +func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) *addressMode { if rn.RegType() != regalloc.RegTypeInt { panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64)) } - var amode addressMode + amode := m.amodePool.Allocate() if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) { - amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset} + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset} } else if offsetFitsInAddressModeKindRegSignedImm9(offset) { - amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset} + *amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset} } else { var indexReg regalloc.VReg if allowTmpRegUse { @@ -253,7 +254,7 @@ func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn reg indexReg = m.compiler.AllocateVReg(ssa.TypeI64) m.lowerConstantI64(indexReg, offset) } - amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */} + *amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */} } return amode } @@ -315,7 +316,7 @@ func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add b } else { ao = aluOpSub } - alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true) + alu.asALU(ao, rd, operandNR(spVReg), imm12Operand, true) m.insert(alu) } else { m.lowerConstantI64(tmpRegVReg, diff) @@ -326,7 +327,7 @@ func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add b } 
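resolveAddressModeForOffset above classifies an offset into one of three AArch64 addressing forms: a scaled unsigned 12-bit immediate, a signed 9-bit immediate, or a register index materialized from the constant. A sketch of that classification; the bounds follow the ARM64 load/store encodings, and the kind names are illustrative:

package main

import "fmt"

func classify(offset int64, dstBits byte) string {
	scale := int64(dstBits) / 8 // imm12 is scaled by the access size
	if offset >= 0 && offset%scale == 0 && offset/scale < 4096 {
		return "regUnsignedImm12"
	}
	if offset >= -256 && offset <= 255 {
		return "regSignedImm9"
	}
	return "regReg (offset materialized into a temporary register)"
}

func main() {
	fmt.Println(classify(32760, 64)) // regUnsignedImm12: 32760 == 4095*8
	fmt.Println(classify(-16, 64))   // regSignedImm9
	fmt.Println(classify(1<<20, 64)) // regReg (offset materialized into a temporary register)
}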
else { ao = aluOpSub } - alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true) + alu.asALU(ao, rd, operandNR(spVReg), operandNR(tmpRegVReg), true) m.insert(alu) } } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go index 7a9cceb332..f8b5d97ac7 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go @@ -59,25 +59,26 @@ func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regallo } else { postIndexImm = 8 } - loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm} + loadMode := m.amodePool.Allocate() + *loadMode = addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm} instr := m.allocateInstr() switch typ { case ssa.TypeI32: - instr.asULoad(loadTargetReg, loadMode, 32) + instr.asULoad(loadTargetReg.reg(), loadMode, 32) case ssa.TypeI64: - instr.asULoad(loadTargetReg, loadMode, 64) + instr.asULoad(loadTargetReg.reg(), loadMode, 64) case ssa.TypeF32: - instr.asFpuLoad(loadTargetReg, loadMode, 32) + instr.asFpuLoad(loadTargetReg.reg(), loadMode, 32) case ssa.TypeF64: - instr.asFpuLoad(loadTargetReg, loadMode, 64) + instr.asFpuLoad(loadTargetReg.reg(), loadMode, 64) case ssa.TypeV128: - instr.asFpuLoad(loadTargetReg, loadMode, 128) + instr.asFpuLoad(loadTargetReg.reg(), loadMode, 128) } cur = linkInstr(cur, instr) if isStackArg { - var storeMode addressMode + var storeMode *addressMode cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true) toStack := m.allocateInstr() toStack.asStore(loadTargetReg, storeMode, bits) @@ -113,21 +114,22 @@ func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr reg } if isStackArg { - var loadMode addressMode + var loadMode *addressMode cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true) toReg := m.allocateInstr() switch typ { case ssa.TypeI32, ssa.TypeI64: - toReg.asULoad(storeTargetReg, loadMode, bits) + toReg.asULoad(storeTargetReg.reg(), loadMode, bits) case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - toReg.asFpuLoad(storeTargetReg, loadMode, bits) + toReg.asFpuLoad(storeTargetReg.reg(), loadMode, bits) default: panic("TODO?") } cur = linkInstr(cur, toReg) } - mode := addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm} instr := m.allocateInstr() instr.asStore(storeTargetReg, mode, bits) cur = linkInstr(cur, instr) @@ -214,11 +216,12 @@ func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction { instr := m.allocateInstr() - mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()} if store { instr.asStore(operandNR(d), mode, 64) } else { - instr.asULoad(operandNR(d), mode, 64) + instr.asULoad(d, 
mode, 64) } return linkInstr(prev, instr) } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go index 466b1f9609..06f8a4a053 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go @@ -14,7 +14,6 @@ var calleeSavedRegistersSorted = []regalloc.VReg{ // CompileGoFunctionTrampoline implements backend.Machine. func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte { - exct := m.executableContext argBegin := 1 // Skips exec context by default. if needModuleContextPtr { argBegin++ @@ -26,7 +25,7 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * cur := m.allocateInstr() cur.asNop0() - exct.RootInstr = cur + m.rootInstr = cur // Execution context is always the first argument. execCtrPtr := x0VReg @@ -87,7 +86,8 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * // Module context is always the second argument. moduleCtrPtr := x1VReg store := m.allocateInstr() - amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset} + amode := m.amodePool.Allocate() + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset} store.asStore(operandNR(moduleCtrPtr), amode, 64) cur = linkInstr(cur, store) } @@ -120,11 +120,9 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * } else { sizeInBits = 64 } - store.asStore(operandNR(v), - addressMode{ - kind: addressModeKindPostIndex, - rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8), - }, sizeInBits) + amode := m.amodePool.Allocate() + *amode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8)} + store.asStore(operandNR(v), amode, sizeInBits) cur = linkInstr(cur, store) } @@ -139,7 +137,7 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * frameSizeReg = xzrVReg sliceSizeReg = xzrVReg } - _amode := addressModePreOrPostIndex(spVReg, -16, true) + _amode := addressModePreOrPostIndex(m, spVReg, -16, true) storeP := m.allocateInstr() storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode) cur = linkInstr(cur, storeP) @@ -165,8 +163,8 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true) ldr := m.allocateInstr() // And load the return address. - ldr.asULoad(operandNR(lrVReg), - addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) + amode := addressModePreOrPostIndex(m, spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */) + ldr.asULoad(lrVReg, amode, 64) cur = linkInstr(cur, ldr) originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want. 
@@ -183,23 +181,24 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * r := &abi.Rets[i] if r.Kind == backend.ABIArgKindReg { loadIntoReg := m.allocateInstr() - mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} switch r.Type { case ssa.TypeI32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asULoad(operandNR(r.Reg), mode, 32) + loadIntoReg.asULoad(r.Reg, mode, 32) case ssa.TypeI64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asULoad(operandNR(r.Reg), mode, 64) + loadIntoReg.asULoad(r.Reg, mode, 64) case ssa.TypeF32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32) + loadIntoReg.asFpuLoad(r.Reg, mode, 32) case ssa.TypeF64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64) + loadIntoReg.asFpuLoad(r.Reg, mode, 64) case ssa.TypeV128: mode.imm = 16 - loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128) + loadIntoReg.asFpuLoad(r.Reg, mode, 128) default: panic("TODO") } @@ -208,28 +207,29 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * // First we need to load the value to a temporary just like ^^. intTmp, floatTmp := x11VReg, v11VReg loadIntoTmpReg := m.allocateInstr() - mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} var resultReg regalloc.VReg switch r.Type { case ssa.TypeI32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32) + loadIntoTmpReg.asULoad(intTmp, mode, 32) resultReg = intTmp case ssa.TypeI64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64) + loadIntoTmpReg.asULoad(intTmp, mode, 64) resultReg = intTmp case ssa.TypeF32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32) + loadIntoTmpReg.asFpuLoad(floatTmp, mode, 32) resultReg = floatTmp case ssa.TypeF64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64) + loadIntoTmpReg.asFpuLoad(floatTmp, mode, 64) resultReg = floatTmp case ssa.TypeV128: mode.imm = 16 - loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128) + loadIntoTmpReg.asFpuLoad(floatTmp, mode, 128) resultReg = floatTmp default: panic("TODO") @@ -243,7 +243,7 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * ret.asRet() linkInstr(cur, ret) - m.encode(m.executableContext.RootInstr) + m.encode(m.rootInstr) return m.compiler.Buf() } @@ -258,12 +258,13 @@ func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regal case regalloc.RegTypeFloat: sizeInBits = 128 } - store.asStore(operandNR(v), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: offset, - }, sizeInBits) + mode := m.amodePool.Allocate() + *mode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. 
+ rn: x0VReg, imm: offset, + } + store.asStore(operandNR(v), mode, sizeInBits) store.prev = cur cur.next = store cur = store @@ -276,7 +277,7 @@ func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []re offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() for _, v := range regs { load := m.allocateInstr() - var as func(dst operand, amode addressMode, sizeInBits byte) + var as func(dst regalloc.VReg, amode *addressMode, sizeInBits byte) var sizeInBits byte switch v.RegType() { case regalloc.RegTypeInt: @@ -286,12 +287,13 @@ func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []re as = load.asFpuLoad sizeInBits = 128 } - as(operandNR(v), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: offset, - }, sizeInBits) + mode := m.amodePool.Allocate() + *mode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: offset, + } + as(v, mode, sizeInBits) cur = linkInstr(cur, load) offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16. } @@ -299,20 +301,18 @@ func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []re } func (m *machine) lowerConstantI64AndInsert(cur *instruction, dst regalloc.VReg, v int64) *instruction { - exct := m.executableContext - exct.PendingInstructions = exct.PendingInstructions[:0] + m.pendingInstructions = m.pendingInstructions[:0] m.lowerConstantI64(dst, v) - for _, instr := range exct.PendingInstructions { + for _, instr := range m.pendingInstructions { cur = linkInstr(cur, instr) } return cur } func (m *machine) lowerConstantI32AndInsert(cur *instruction, dst regalloc.VReg, v int32) *instruction { - exct := m.executableContext - exct.PendingInstructions = exct.PendingInstructions[:0] + m.pendingInstructions = m.pendingInstructions[:0] m.lowerConstantI32(dst, v) - for _, instr := range exct.PendingInstructions { + for _, instr := range m.pendingInstructions { cur = linkInstr(cur, instr) } return cur @@ -324,11 +324,9 @@ func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode // Set the exit status on the execution context. setExistStatus := m.allocateInstr() - setExistStatus.asStore(operandNR(constReg), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), - }, 32) + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64()} + setExistStatus.asStore(operandNR(constReg), mode, 32) cur = linkInstr(cur, setExistStatus) return cur } @@ -340,12 +338,13 @@ func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction { cur = linkInstr(cur, adr) storeReturnAddr := m.allocateInstr() - storeReturnAddr.asStore(operandNR(tmpRegVReg), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), - }, 64) + mode := m.amodePool.Allocate() + *mode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), + } + storeReturnAddr.asStore(operandNR(tmpRegVReg), mode, 64) cur = linkInstr(cur, storeReturnAddr) // Exit the execution. 
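The restore loop above picks a load flavor once per register by storing a method value (as = load.asULoad or load.asFpuLoad) and then calling it uniformly for either register class. A minimal sketch of the method-value pattern:

package main

import "fmt"

type instr struct{ desc string }

func (i *instr) asULoad(reg string, bits byte)   { i.desc = fmt.Sprintf("uload %s/%d", reg, bits) }
func (i *instr) asFpuLoad(reg string, bits byte) { i.desc = fmt.Sprintf("fpuload %s/%d", reg, bits) }

func main() {
	for _, r := range []struct {
		name  string
		isInt bool
	}{{"x19", true}, {"v18", false}} {
		var i instr
		var as func(string, byte)
		if r.isInt {
			as = i.asULoad // method value captures the receiver
		} else {
			as = i.asFpuLoad
		}
		as(r.name, 64)
		fmt.Println(i.desc) // uload x19/64, then fpuload v18/64
	}
}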
@@ -364,11 +363,12 @@ func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VRe cur = linkInstr(cur, movSp) strSp := m.allocateInstr() - strSp.asStore(operandNR(tmpRegVReg), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), - }, 64) + mode := m.amodePool.Allocate() + *mode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), + } + strSp.asStore(operandNR(tmpRegVReg), mode, 64) cur = linkInstr(cur, strSp) return cur } @@ -376,27 +376,28 @@ func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VRe func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) { load := m.allocateInstr() var result regalloc.VReg - mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg} switch arg.Type { case ssa.TypeI32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asULoad(operandNR(intVReg), mode, 32) + load.asULoad(intVReg, mode, 32) result = intVReg case ssa.TypeI64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asULoad(operandNR(intVReg), mode, 64) + load.asULoad(intVReg, mode, 64) result = intVReg case ssa.TypeF32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asFpuLoad(operandNR(floatVReg), mode, 32) + load.asFpuLoad(floatVReg, mode, 32) result = floatVReg case ssa.TypeF64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asFpuLoad(operandNR(floatVReg), mode, 64) + load.asFpuLoad(floatVReg, mode, 64) result = floatVReg case ssa.TypeV128: mode.imm = 16 - load.asFpuLoad(operandNR(floatVReg), mode, 128) + load.asFpuLoad(floatVReg, mode, 128) result = floatVReg default: panic("TODO") @@ -408,7 +409,8 @@ func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg r func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction { store := m.allocateInstr() - mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg} var sizeInBits byte switch result.Type { case ssa.TypeI32, ssa.TypeF32: diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go index 8aabc5997b..1f563428aa 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go @@ -3,10 +3,12 @@ package arm64 import ( "fmt" "math" + "unsafe" "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" ) type ( @@ -22,9 +24,9 @@ type ( // TODO: optimize the layout later once the impl settles. 
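// --- editorial sketch (not part of the patch) --------------------------------
// The struct change just below is the heart of this refactor: u3, ra, and the
// inline amode are removed, and rd narrows from operand to regalloc.VReg. The
// dropped state is re-homed into spare bits of u1/u2 (flags, the ra register)
// or behind a pooled pointer (amode). A rough way to see why this matters is
// to compare layouts with stand-in field types; the sizes here are assumptions
// (VReg as one word, operand as three words, addressMode as four words), not
// the real wazero definitions.
package main

import (
	"fmt"
	"unsafe"
)

type operandStandIn struct{ data, data2, kind uint64 }
type amodeStandIn struct{ rn, rm, imm, kind uint64 }

type oldInstruction struct {
	prev, next          *oldInstruction
	u1, u2, u3          uint64
	rd, rm, rn, ra      operandStandIn
	amode               amodeStandIn
	kind                byte
	addedBeforeRegAlloc bool
}

type newInstruction struct {
	prev, next          *newInstruction
	u1, u2              uint64
	rd                  uint64 // regalloc.VReg
	rm, rn              operandStandIn
	kind                byte
	addedBeforeRegAlloc bool
}

func main() {
	// A compiler allocates huge numbers of these, so shaving words off the
	// hot struct reduces allocation and cache pressure.
	fmt.Println(unsafe.Sizeof(oldInstruction{}), unsafe.Sizeof(newInstruction{}))
}
// ------------------------------------------------------------------------------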
instruction struct { prev, next *instruction - u1, u2, u3 uint64 - rd, rm, rn, ra operand - amode addressMode + u1, u2 uint64 + rd regalloc.VReg + rm, rn operand kind instructionKind addedBeforeRegAlloc bool } @@ -34,18 +36,6 @@ type ( instructionKind byte ) -func asNop0(i *instruction) { - i.kind = nop0 -} - -func setNext(i, next *instruction) { - i.next = next -} - -func setPrev(i, prev *instruction) { - i.prev = prev -} - // IsCall implements regalloc.Instr IsCall. func (i *instruction) IsCall() bool { return i.kind == call @@ -61,21 +51,6 @@ func (i *instruction) IsReturn() bool { return i.kind == ret } -// Next implements regalloc.Instr Next. -func (i *instruction) Next() regalloc.Instr { - return i.next -} - -// Prev implements regalloc.Instr Prev. -func (i *instruction) Prev() regalloc.Instr { - return i.prev -} - -// AddedBeforeRegAlloc implements regalloc.Instr AddedBeforeRegAlloc. -func (i *instruction) AddedBeforeRegAlloc() bool { - return i.addedBeforeRegAlloc -} - type defKind byte const ( @@ -174,7 +149,7 @@ func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg { switch defKinds[i.kind] { case defKindNone: case defKindRD: - *regs = append(*regs, i.rd.nr()) + *regs = append(*regs, i.rd) case defKindCall: _, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2) for i := byte(0); i < retIntRealRegs; i++ { @@ -194,7 +169,7 @@ func (i *instruction) AssignDef(reg regalloc.VReg) { switch defKinds[i.kind] { case defKindNone: case defKindRD: - i.rd = i.rd.assignReg(reg) + i.rd = reg case defKindCall: panic("BUG: call instructions shouldn't be assigned") default: @@ -329,7 +304,7 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { if rm := i.rm.reg(); rm.Valid() { *regs = append(*regs, rm) } - if ra := i.ra.reg(); ra.Valid() { + if ra := regalloc.VReg(i.u2); ra.Valid() { *regs = append(*regs, ra) } case useKindRNRN1RM: @@ -341,18 +316,20 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { *regs = append(*regs, rm) } case useKindAMode: - if amodeRN := i.amode.rn; amodeRN.Valid() { + amode := i.getAmode() + if amodeRN := amode.rn; amodeRN.Valid() { *regs = append(*regs, amodeRN) } - if amodeRM := i.amode.rm; amodeRM.Valid() { + if amodeRM := amode.rm; amodeRM.Valid() { *regs = append(*regs, amodeRM) } case useKindRNAMode: *regs = append(*regs, i.rn.reg()) - if amodeRN := i.amode.rn; amodeRN.Valid() { + amode := i.getAmode() + if amodeRN := amode.rn; amodeRN.Valid() { *regs = append(*regs, amodeRN) } - if amodeRM := i.amode.rm; amodeRM.Valid() { + if amodeRM := amode.rm; amodeRM.Valid() { *regs = append(*regs, amodeRM) } case useKindCond: @@ -374,7 +351,7 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { case useKindRDRewrite: *regs = append(*regs, i.rn.reg()) *regs = append(*regs, i.rm.reg()) - *regs = append(*regs, i.rd.reg()) + *regs = append(*regs, i.rd) default: panic(fmt.Sprintf("useKind for %v not defined", i)) } @@ -408,8 +385,8 @@ func (i *instruction) AssignUse(index int, reg regalloc.VReg) { i.rm = i.rm.assignReg(reg) } } else { - if rd := i.rd.reg(); rd.Valid() { - i.rd = i.rd.assignReg(reg) + if rd := i.rd; rd.Valid() { + i.rd = reg } } case useKindRNRN1RM: @@ -435,32 +412,36 @@ func (i *instruction) AssignUse(index int, reg regalloc.VReg) { i.rm = i.rm.assignReg(reg) } } else { - if ra := i.ra.reg(); ra.Valid() { - i.ra = i.ra.assignReg(reg) + if ra := regalloc.VReg(i.u2); ra.Valid() { + i.u2 = uint64(reg) } } case useKindAMode: if index == 0 { - if amodeRN := i.amode.rn; 
amodeRN.Valid() { - i.amode.rn = reg + amode := i.getAmode() + if amodeRN := amode.rn; amodeRN.Valid() { + amode.rn = reg } } else { - if amodeRM := i.amode.rm; amodeRM.Valid() { - i.amode.rm = reg + amode := i.getAmode() + if amodeRM := amode.rm; amodeRM.Valid() { + amode.rm = reg } } case useKindRNAMode: if index == 0 { i.rn = i.rn.assignReg(reg) } else if index == 1 { - if amodeRN := i.amode.rn; amodeRN.Valid() { - i.amode.rn = reg + amode := i.getAmode() + if amodeRN := amode.rn; amodeRN.Valid() { + amode.rn = reg } else { panic("BUG") } } else { - if amodeRM := i.amode.rm; amodeRM.Valid() { - i.amode.rm = reg + amode := i.getAmode() + if amodeRM := amode.rm; amodeRM.Valid() { + amode.rm = reg } else { panic("BUG") } @@ -503,35 +484,35 @@ func (i *instruction) callFuncRef() ssa.FuncRef { } // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) -func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { +func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) { i.kind = movZ - i.rd = operandNR(dst) + i.rd = dst i.u1 = imm - i.u2 = shift + i.u2 = uint64(shift) if dst64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) -func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { +func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) { i.kind = movK - i.rd = operandNR(dst) + i.rd = dst i.u1 = imm - i.u2 = shift + i.u2 = uint64(shift) if dst64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) -func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { +func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) { i.kind = movN - i.rd = operandNR(dst) + i.rd = dst i.u1 = imm - i.u2 = shift + i.u2 = uint64(shift) if dst64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } @@ -553,21 +534,21 @@ func (i *instruction) asRet() { i.kind = ret } -func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode addressMode) { +func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode *addressMode) { i.kind = storeP64 i.rn = operandNR(src1) i.rm = operandNR(src2) - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode addressMode) { +func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode *addressMode) { i.kind = loadP64 i.rn = operandNR(src1) i.rm = operandNR(src2) - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) { +func (i *instruction) asStore(src operand, amode *addressMode, sizeInBits byte) { switch sizeInBits { case 8: i.kind = store8 @@ -589,10 +570,10 @@ func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) { i.kind = fpuStore128 } i.rn = src - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) { +func (i *instruction) asSLoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) { switch sizeInBits { case 8: i.kind = sLoad8 @@ -604,10 +585,10 @@ func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) { panic("BUG") } i.rd = dst - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asULoad(dst operand, amode 
addressMode, sizeInBits byte) { +func (i *instruction) asULoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) { switch sizeInBits { case 8: i.kind = uLoad8 @@ -619,10 +600,10 @@ func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) { i.kind = uLoad64 } i.rd = dst - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) { +func (i *instruction) asFpuLoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) { switch sizeInBits { case 32: i.kind = fpuLoad32 @@ -632,10 +613,18 @@ func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) i.kind = fpuLoad128 } i.rd = dst - i.amode = amode + i.setAmode(amode) +} + +func (i *instruction) getAmode() *addressMode { + return wazevoapi.PtrFromUintptr[addressMode](uintptr(i.u1)) } -func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) { +func (i *instruction) setAmode(a *addressMode) { + i.u1 = uint64(uintptr(unsafe.Pointer(a))) +} + +func (i *instruction) asVecLoad1R(rd regalloc.VReg, rn operand, arr vecArrangement) { // NOTE: currently only has support for no-offset loads, though it is suspicious that // we would need to support offset load (that is only available for post-index). i.kind = vecLoad1R @@ -646,32 +635,32 @@ func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) { func (i *instruction) asCSet(rd regalloc.VReg, mask bool, c condFlag) { i.kind = cSet - i.rd = operandNR(rd) + i.rd = rd i.u1 = uint64(c) if mask { i.u2 = 1 } } -func (i *instruction) asCSel(rd, rn, rm operand, c condFlag, _64bit bool) { +func (i *instruction) asCSel(rd regalloc.VReg, rn, rm operand, c condFlag, _64bit bool) { i.kind = cSel i.rd = rd i.rn = rn i.rm = rm i.u1 = uint64(c) if _64bit { - i.u3 = 1 + i.u2 = 1 } } -func (i *instruction) asFpuCSel(rd, rn, rm operand, c condFlag, _64bit bool) { +func (i *instruction) asFpuCSel(rd regalloc.VReg, rn, rm operand, c condFlag, _64bit bool) { i.kind = fpuCSel i.rd = rd i.rn = rn i.rm = rm i.u1 = uint64(c) if _64bit { - i.u3 = 1 + i.u2 = 1 } } @@ -691,7 +680,7 @@ func (i *instruction) asBrTableSequence(indexReg regalloc.VReg, targetIndex, tar } func (i *instruction) brTableSequenceOffsetsResolved() { - i.u3 = 1 // indicate that the offsets are resolved, for debugging. + i.rm.data = 1 // indicate that the offsets are resolved, for debugging. } func (i *instruction) brLabel() label { @@ -701,7 +690,7 @@ func (i *instruction) brLabel() label { // brOffsetResolved is called when the target label is resolved. func (i *instruction) brOffsetResolve(offset int64) { i.u2 = uint64(offset) - i.u3 = 1 // indicate that the offset is resolved, for debugging. + i.rm.data = 1 // indicate that the offset is resolved, for debugging. } func (i *instruction) brOffset() int64 { @@ -714,7 +703,7 @@ func (i *instruction) asCondBr(c cond, target label, is64bit bool) { i.u1 = c.asUint64() i.u2 = uint64(target) if is64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } @@ -728,17 +717,17 @@ func (i *instruction) condBrLabel() label { // condBrOffsetResolve is called when the target label is resolved. func (i *instruction) condBrOffsetResolve(offset int64) { - i.rd.data = uint64(offset) - i.rd.data2 = 1 // indicate that the offset is resolved, for debugging. + i.rn.data = uint64(offset) + i.rn.data2 = 1 // indicate that the offset is resolved, for debugging. } // condBrOffsetResolved returns true if condBrOffsetResolve is already called. 
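// --- editorial sketch (not part of the patch) --------------------------------
// getAmode/setAmode above stash a *addressMode in the spare u1 word by
// round-tripping through uintptr. This is only sound because the amode pool
// keeps every allocated addressMode reachable for the lifetime of the
// compilation; a uintptr on its own does not keep its target alive, and
// `go vet` will (rightly) flag the uintptr-to-unsafe.Pointer conversion.
// Minimal demo, with ptrFromUintptr as a hypothetical stand-in for
// wazevoapi.PtrFromUintptr:
package main

import (
	"fmt"
	"unsafe"
)

type addressMode struct{ imm int64 }

func ptrFromUintptr[T any](u uintptr) *T {
	return (*T)(unsafe.Pointer(u)) // mirrors the vendored code; vet warns here.
}

func main() {
	keptAlive := &addressMode{imm: 16} // in wazero, the pool plays this role.
	word := uint64(uintptr(unsafe.Pointer(keptAlive)))

	fmt.Println(ptrFromUintptr[addressMode](uintptr(word)).imm) // prints 16
}
// ------------------------------------------------------------------------------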
func (i *instruction) condBrOffsetResolved() bool { - return i.rd.data2 == 1 + return i.rn.data2 == 1 } func (i *instruction) condBrOffset() int64 { - return int64(i.rd.data) + return int64(i.rn.data) } func (i *instruction) condBrCond() cond { @@ -746,33 +735,33 @@ func (i *instruction) condBrCond() cond { } func (i *instruction) condBr64bit() bool { - return i.u3 == 1 + return i.u2&(1<<32) != 0 } func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) { i.kind = loadFpuConst32 i.u1 = raw - i.rd = operandNR(rd) + i.rd = rd } func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) { i.kind = loadFpuConst64 i.u1 = raw - i.rd = operandNR(rd) + i.rd = rd } func (i *instruction) asLoadFpuConst128(rd regalloc.VReg, lo, hi uint64) { i.kind = loadFpuConst128 i.u1 = lo i.u2 = hi - i.rd = operandNR(rd) + i.rd = rd } func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) { i.kind = fpuCmp i.rn, i.rm = rn, rm if is64bit { - i.u3 = 1 + i.u1 = 1 } } @@ -783,12 +772,12 @@ func (i *instruction) asCCmpImm(rn operand, imm uint64, c condFlag, flag byte, i i.u1 = uint64(c) i.u2 = uint64(flag) if is64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } // asALU setups a basic ALU instruction. -func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { +func (i *instruction) asALU(aluOp aluOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) { switch rm.kind { case operandKindNR: i.kind = aluRRR @@ -804,22 +793,22 @@ func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { i.u1 = uint64(aluOp) i.rd, i.rn, i.rm = rd, rn, rm if dst64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } // asALU setups a basic ALU instruction. -func (i *instruction) asALURRRR(aluOp aluOp, rd, rn, rm, ra operand, dst64bit bool) { +func (i *instruction) asALURRRR(aluOp aluOp, rd regalloc.VReg, rn, rm operand, ra regalloc.VReg, dst64bit bool) { i.kind = aluRRRR i.u1 = uint64(aluOp) - i.rd, i.rn, i.rm, i.ra = rd, rn, rm, ra + i.rd, i.rn, i.rm, i.u2 = rd, rn, rm, uint64(ra) if dst64bit { - i.u3 = 1 + i.u1 |= 1 << 32 } } // asALUShift setups a shift based ALU instruction. -func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { +func (i *instruction) asALUShift(aluOp aluOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) { switch rm.kind { case operandKindNR: i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands. 
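// --- editorial sketch (not part of the patch) --------------------------------
// With u3 gone, single-bit flags such as dst64bit move into the upper half of
// u2 (or of u1, depending on the instruction kind) while the lower 32 bits
// keep their original payload -- compare `i.u2 |= 1 << 32` in the setters
// above with `i.u2>>32 == 1` and `i.u2&(1<<32) != 0` in the readers.
// Hypothetical helpers making that convention explicit:

// packFlag32 stores a 32-bit payload in the low half and a bool in bit 32.
func packFlag32(payload uint32, flag bool) (u uint64) {
	u = uint64(payload)
	if flag {
		u |= 1 << 32
	}
	return u
}

// unpackFlag32 recovers both halves.
func unpackFlag32(u uint64) (payload uint32, flag bool) {
	return uint32(u), u&(1<<32) != 0
}
// ------------------------------------------------------------------------------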
@@ -831,17 +820,17 @@ func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) i.u1 = uint64(aluOp) i.rd, i.rn, i.rm = rd, rn, rm if dst64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } func (i *instruction) asALUBitmaskImm(aluOp aluOp, rd, rn regalloc.VReg, imm uint64, dst64bit bool) { i.kind = aluRRBitmaskImm i.u1 = uint64(aluOp) - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd i.u2 = imm if dst64bit { - i.u3 = 1 + i.u1 |= 1 << 32 } } @@ -852,76 +841,76 @@ func (i *instruction) asMovToFPSR(rn regalloc.VReg) { func (i *instruction) asMovFromFPSR(rd regalloc.VReg) { i.kind = movFromFPSR - i.rd = operandNR(rd) + i.rd = rd } func (i *instruction) asBitRR(bitOp bitOp, rd, rn regalloc.VReg, is64bit bool) { i.kind = bitRR - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd i.u1 = uint64(bitOp) if is64bit { i.u2 = 1 } } -func (i *instruction) asFpuRRR(op fpuBinOp, rd, rn, rm operand, dst64bit bool) { +func (i *instruction) asFpuRRR(op fpuBinOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) { i.kind = fpuRRR i.u1 = uint64(op) i.rd, i.rn, i.rm = rd, rn, rm if dst64bit { - i.u3 = 1 + i.u2 = 1 } } -func (i *instruction) asFpuRR(op fpuUniOp, rd, rn operand, dst64bit bool) { +func (i *instruction) asFpuRR(op fpuUniOp, rd regalloc.VReg, rn operand, dst64bit bool) { i.kind = fpuRR i.u1 = uint64(op) i.rd, i.rn = rd, rn if dst64bit { - i.u3 = 1 + i.u2 = 1 } } func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) { i.kind = extend - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd i.u1 = uint64(fromBits) i.u2 = uint64(toBits) if signed { - i.u3 = 1 + i.u2 |= 1 << 32 } } func (i *instruction) asMove32(rd, rn regalloc.VReg) { i.kind = mov32 - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd } func (i *instruction) asMove64(rd, rn regalloc.VReg) *instruction { i.kind = mov64 - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd return i } func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) { i.kind = fpuMov64 - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd } func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) *instruction { i.kind = fpuMov128 - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd return i } -func (i *instruction) asMovToVec(rd, rn operand, arr vecArrangement, index vecIndex) { +func (i *instruction) asMovToVec(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex) { i.kind = movToVec i.rd = rd i.rn = rn i.u1, i.u2 = uint64(arr), uint64(index) } -func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vecIndex, signed bool) { +func (i *instruction) asMovFromVec(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex, signed bool) { if signed { i.kind = movFromVecSigned } else { @@ -932,48 +921,48 @@ func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vec i.u1, i.u2 = uint64(arr), uint64(index) } -func (i *instruction) asVecDup(rd, rn operand, arr vecArrangement) { +func (i *instruction) asVecDup(rd regalloc.VReg, rn operand, arr vecArrangement) { i.kind = vecDup i.u1 = uint64(arr) i.rn, i.rd = rn, rd } -func (i *instruction) asVecDupElement(rd, rn operand, arr vecArrangement, index vecIndex) { +func (i *instruction) asVecDupElement(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex) { i.kind = vecDupElement i.u1 = uint64(arr) i.rn, i.rd = rn, rd i.u2 = uint64(index) } -func (i 
*instruction) asVecExtract(rd, rn, rm operand, arr vecArrangement, index uint32) { +func (i *instruction) asVecExtract(rd regalloc.VReg, rn, rm operand, arr vecArrangement, index uint32) { i.kind = vecExtract i.u1 = uint64(arr) i.rn, i.rm, i.rd = rn, rm, rd i.u2 = uint64(index) } -func (i *instruction) asVecMovElement(rd, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) { +func (i *instruction) asVecMovElement(rd regalloc.VReg, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) { i.kind = vecMovElement i.u1 = uint64(arr) - i.u2, i.u3 = uint64(rdIndex), uint64(rnIndex) + i.u2 = uint64(rdIndex) | uint64(rnIndex)<<32 i.rn, i.rd = rn, rd } -func (i *instruction) asVecMisc(op vecOp, rd, rn operand, arr vecArrangement) { +func (i *instruction) asVecMisc(op vecOp, rd regalloc.VReg, rn operand, arr vecArrangement) { i.kind = vecMisc i.u1 = uint64(op) i.rn, i.rd = rn, rd i.u2 = uint64(arr) } -func (i *instruction) asVecLanes(op vecOp, rd, rn operand, arr vecArrangement) { +func (i *instruction) asVecLanes(op vecOp, rd regalloc.VReg, rn operand, arr vecArrangement) { i.kind = vecLanes i.u1 = uint64(op) i.rn, i.rd = rn, rd i.u2 = uint64(arr) } -func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction { +func (i *instruction) asVecShiftImm(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) *instruction { i.kind = vecShiftImm i.u1 = uint64(op) i.rn, i.rm, i.rd = rn, rm, rd @@ -981,7 +970,7 @@ func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrange return i } -func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangement) { +func (i *instruction) asVecTbl(nregs byte, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { switch nregs { case 0, 1: i.kind = vecTbl @@ -1000,14 +989,14 @@ func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangemen i.u2 = uint64(arr) } -func (i *instruction) asVecPermute(op vecOp, rd, rn, rm operand, arr vecArrangement) { +func (i *instruction) asVecPermute(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { i.kind = vecPermute i.u1 = uint64(op) i.rn, i.rm, i.rd = rn, rm, rd i.u2 = uint64(arr) } -func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction { +func (i *instruction) asVecRRR(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) *instruction { i.kind = vecRRR i.u1 = uint64(op) i.rn, i.rd, i.rm = rn, rd, rm @@ -1017,7 +1006,7 @@ func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) // asVecRRRRewrite encodes a vector instruction that rewrites the destination register. // IMPORTANT: the destination register must be already defined before this instruction. -func (i *instruction) asVecRRRRewrite(op vecOp, rd, rn, rm operand, arr vecArrangement) { +func (i *instruction) asVecRRRRewrite(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { i.kind = vecRRRRewrite i.u1 = uint64(op) i.rn, i.rd, i.rm = rn, rd, rm @@ -1033,8 +1022,8 @@ func (i *instruction) IsCopy() bool { // String implements fmt.Stringer. 
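// --- editorial sketch (not part of the patch) --------------------------------
// Where two small values used to occupy u2 and u3 (e.g. vecMovElement's
// rdIndex/rnIndex above), they now share u2 as a lo/hi pair; the String and
// encoding hunks further down read them back with `u2 & 0xffffffff` and
// `u2 >> 32`. A sketch of the pair convention, assuming each index fits in
// 32 bits (laneIndex here is a stand-in, not the real vecIndex type):
type laneIndex uint32

func packPair(lo, hi laneIndex) uint64 { return uint64(lo) | uint64(hi)<<32 }

func unpackPair(u uint64) (lo, hi laneIndex) {
	return laneIndex(u & 0xffffffff), laneIndex(u >> 32)
}
// ------------------------------------------------------------------------------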
func (i *instruction) String() (str string) { - is64SizeBitToSize := func(u3 uint64) byte { - if u3 == 0 { + is64SizeBitToSize := func(v uint64) byte { + if v == 0 { return 32 } return 64 @@ -1049,46 +1038,46 @@ func (i *instruction) String() (str string) { str = "nop0" } case aluRRR: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.format(size)) case aluRRRR: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u1 >> 32) str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.ra.nr(), size)) + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(regalloc.VReg(i.u2), size)) case aluRRImm12: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), i.rm.format(size)) + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.format(size)) case aluRRBitmaskImm: - size := is64SizeBitToSize(i.u3) - rd, rn := formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size) + size := is64SizeBitToSize(i.u1 >> 32) + rd, rn := formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size) if size == 32 { str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2)) } else { str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2) } case aluRRImmShift: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("%s %s, %s, %#x", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.shiftImm(), ) case aluRRRShift: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.format(size), ) case aluRRRExtend: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), // Regardless of the source size, the register is formatted in 32-bit. 
i.rm.format(32), @@ -1097,57 +1086,57 @@ func (i *instruction) String() (str string) { size := is64SizeBitToSize(i.u2) str = fmt.Sprintf("%s %s, %s", bitOp(i.u1), - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), ) case uLoad8: - str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case sLoad8: - str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case uLoad16: - str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case sLoad16: - str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case uLoad32: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case sLoad32: - str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case uLoad64: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64)) + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 64), i.getAmode().format(64)) case store8: - str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(8)) + str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(8)) case store16: - str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(16)) + str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(16)) case store32: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(32)) case store64: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64)) + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.getAmode().format(64)) case storeP64: str = fmt.Sprintf("stp %s, %s, %s", - formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64)) + formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.getAmode().format(64)) case loadP64: str = fmt.Sprintf("ldp %s, %s, %s", - formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64)) + formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.getAmode().format(64)) case mov64: str = fmt.Sprintf("mov %s, %s", - formatVRegSized(i.rd.nr(), 64), + formatVRegSized(i.rd, 64), formatVRegSized(i.rn.nr(), 64)) case mov32: - str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd.nr(), 32), formatVRegSized(i.rn.nr(), 32)) + str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd, 32), formatVRegSized(i.rn.nr(), 32)) case movZ: - size := is64SizeBitToSize(i.u3) - str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + size := is64SizeBitToSize(i.u2 >> 32) + str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16) case movN: - size := is64SizeBitToSize(i.u3) - str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), 
size), uint16(i.u1), i.u2*16) + size := is64SizeBitToSize(i.u2 >> 32) + str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16) case movK: - size := is64SizeBitToSize(i.u3) - str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + size := is64SizeBitToSize(i.u2 >> 32) + str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16) case extend: fromBits, toBits := byte(i.u1), byte(i.u2) var signedStr string - if i.u3 == 1 { + if i.u2>>32 == 1 { signedStr = "s" } else { signedStr = "u" @@ -1161,39 +1150,39 @@ func (i *instruction) String() (str string) { case 32: fromStr = "w" } - str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd.nr(), toBits), formatVRegSized(i.rn.nr(), 32)) + str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd, toBits), formatVRegSized(i.rn.nr(), 32)) case cSel: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2) str = fmt.Sprintf("csel %s, %s, %s, %s", - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), condFlag(i.u1), ) case cSet: if i.u2 != 0 { - str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1)) + str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd, 64), condFlag(i.u1)) } else { - str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1)) + str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd, 64), condFlag(i.u1)) } case cCmpImm: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("ccmp %s, #%#x, #%#x, %s", formatVRegSized(i.rn.nr(), size), i.rm.data, i.u2&0b1111, condFlag(i.u1)) case fpuMov64: str = fmt.Sprintf("mov %s, %s", - formatVRegVec(i.rd.nr(), vecArrangement8B, vecIndexNone), + formatVRegVec(i.rd, vecArrangement8B, vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement8B, vecIndexNone)) case fpuMov128: str = fmt.Sprintf("mov %s, %s", - formatVRegVec(i.rd.nr(), vecArrangement16B, vecIndexNone), + formatVRegVec(i.rd, vecArrangement16B, vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone)) case fpuMovFromVec: panic("TODO") case fpuRR: - dstSz := is64SizeBitToSize(i.u3) + dstSz := is64SizeBitToSize(i.u2) srcSz := dstSz op := fpuUniOp(i.u1) switch op { @@ -1203,38 +1192,38 @@ func (i *instruction) String() (str string) { srcSz = 64 } str = fmt.Sprintf("%s %s, %s", op.String(), - formatVRegSized(i.rd.nr(), dstSz), formatVRegSized(i.rn.nr(), srcSz)) + formatVRegSized(i.rd, dstSz), formatVRegSized(i.rn.nr(), srcSz)) case fpuRRR: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2) str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) case fpuRRI: panic("TODO") case fpuRRRR: panic("TODO") case fpuCmp: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u1) str = fmt.Sprintf("fcmp %s, %s", formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) case fpuLoad32: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case fpuStore32: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(64)) + 
str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(64)) case fpuLoad64: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64)) + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 64), i.getAmode().format(64)) case fpuStore64: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64)) + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.getAmode().format(64)) case fpuLoad128: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 128), i.amode.format(64)) + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 128), i.getAmode().format(64)) case fpuStore128: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.amode.format(64)) + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.getAmode().format(64)) case loadFpuConst32: - str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd.nr(), 32), math.Float32frombits(uint32(i.u1))) + str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd, 32), math.Float32frombits(uint32(i.u1))) case loadFpuConst64: - str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd.nr(), 64), math.Float64frombits(i.u1)) + str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd, 64), math.Float64frombits(i.u1)) case loadFpuConst128: str = fmt.Sprintf("ldr %s, #8; b 32; data.v128 %016x %016x", - formatVRegSized(i.rd.nr(), 128), i.u1, i.u2) + formatVRegSized(i.rd, 128), i.u1, i.u2) case fpuToInt: var op, src, dst string if signed := i.u1 == 1; signed { @@ -1242,15 +1231,15 @@ func (i *instruction) String() (str string) { } else { op = "fcvtzu" } - if src64 := i.u2 == 1; src64 { + if src64 := i.u2&1 != 0; src64 { src = formatVRegWidthVec(i.rn.nr(), vecArrangementD) } else { src = formatVRegWidthVec(i.rn.nr(), vecArrangementS) } - if dst64 := i.u3 == 1; dst64 { - dst = formatVRegSized(i.rd.nr(), 64) + if dst64 := i.u2&2 != 0; dst64 { + dst = formatVRegSized(i.rd, 64) } else { - dst = formatVRegSized(i.rd.nr(), 32) + dst = formatVRegSized(i.rd, 32) } str = fmt.Sprintf("%s %s, %s", op, dst, src) @@ -1261,21 +1250,21 @@ func (i *instruction) String() (str string) { } else { op = "ucvtf" } - if src64 := i.u2 == 1; src64 { + if src64 := i.u2&1 != 0; src64 { src = formatVRegSized(i.rn.nr(), 64) } else { src = formatVRegSized(i.rn.nr(), 32) } - if dst64 := i.u3 == 1; dst64 { - dst = formatVRegWidthVec(i.rd.nr(), vecArrangementD) + if dst64 := i.u2&2 != 0; dst64 { + dst = formatVRegWidthVec(i.rd, vecArrangementD) } else { - dst = formatVRegWidthVec(i.rd.nr(), vecArrangementS) + dst = formatVRegWidthVec(i.rd, vecArrangementS) } str = fmt.Sprintf("%s %s, %s", op, dst, src) case fpuCSel: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2) str = fmt.Sprintf("fcsel %s, %s, %s, %s", - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), condFlag(i.u1), @@ -1291,7 +1280,7 @@ func (i *instruction) String() (str string) { default: panic("unsupported arrangement " + arr.String()) } - str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size)) + str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd, arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size)) case movFromVec, movFromVecSigned: var size byte var opcode string @@ -1315,23 +1304,23 @@ func (i *instruction) String() (str string) { default: panic("unsupported arrangement " + arr.String()) 
} - str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2))) + str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd, size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2))) case vecDup: str = fmt.Sprintf("dup %s, %s", - formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64), ) case vecDupElement: arr := vecArrangement(i.u1) str = fmt.Sprintf("dup %s, %s", - formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rd, arr, vecIndexNone), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)), ) case vecDupFromFpu: panic("TODO") case vecExtract: str = fmt.Sprintf("ext %s, %s, %s, #%d", - formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegVec(i.rm.nr(), vecArrangement(i.u1), vecIndexNone), uint32(i.u2), @@ -1340,15 +1329,15 @@ func (i *instruction) String() (str string) { panic("TODO") case vecMovElement: str = fmt.Sprintf("mov %s, %s", - formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndex(i.u2)), - formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u3)), + formatVRegVec(i.rd, vecArrangement(i.u1), vecIndex(i.u2&0xffffffff)), + formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u2>>32)), ) case vecMiscNarrow: panic("TODO") case vecRRR, vecRRRRewrite: str = fmt.Sprintf("%s %s, %s, %s", vecOp(i.u1), - formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone), formatVRegVec(i.rm.nr(), vecArrangement(i.u2), vecIndexNone), ) @@ -1356,12 +1345,12 @@ func (i *instruction) String() (str string) { vop := vecOp(i.u1) if vop == vecOpCmeq0 { str = fmt.Sprintf("cmeq %s, %s, #0", - formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) } else { str = fmt.Sprintf("%s %s, %s", vop, - formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) } case vecLanes: @@ -1379,24 +1368,24 @@ func (i *instruction) String() (str string) { } str = fmt.Sprintf("%s %s, %s", vecOp(i.u1), - formatVRegWidthVec(i.rd.nr(), destArr), + formatVRegWidthVec(i.rd, destArr), formatVRegVec(i.rn.nr(), arr, vecIndexNone)) case vecShiftImm: arr := vecArrangement(i.u2) str = fmt.Sprintf("%s %s, %s, #%d", vecOp(i.u1), - formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rd, arr, vecIndexNone), formatVRegVec(i.rn.nr(), arr, vecIndexNone), i.rm.shiftImm()) case vecTbl: arr := vecArrangement(i.u2) str = fmt.Sprintf("tbl %s, { %s }, %s", - formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rd, arr, vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone), formatVRegVec(i.rm.nr(), arr, vecIndexNone)) case vecTbl2: arr := vecArrangement(i.u2) - rd, rn, rm := i.rd.nr(), i.rn.nr(), i.rm.nr() + rd, rn, rm := i.rd, i.rn.nr(), i.rm.nr() rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType()) str = fmt.Sprintf("tbl %s, { %s, %s }, %s", formatVRegVec(rd, arr, vecIndexNone), @@ -1407,13 +1396,13 @@ func (i *instruction) String() (str string) { arr := vecArrangement(i.u2) str = fmt.Sprintf("%s %s, %s, %s", vecOp(i.u1), - 
formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rd, arr, vecIndexNone), formatVRegVec(i.rn.nr(), arr, vecIndexNone), formatVRegVec(i.rm.nr(), arr, vecIndexNone)) case movToFPSR: str = fmt.Sprintf("msr fpsr, %s", formatVRegSized(i.rn.nr(), 64)) case movFromFPSR: - str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd.nr(), 64)) + str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd, 64)) case call: str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1)) case callInd: @@ -1422,15 +1411,15 @@ func (i *instruction) String() (str string) { str = "ret" case br: target := label(i.u1) - if i.u3 != 0 { + if i.rm.data != 0 { str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String()) } else { str = fmt.Sprintf("b %s", target.String()) } case condBr: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) c := cond(i.u1) - target := label(i.u2) + target := label(i.u2 & 0xffffffff) switch c.kind() { case condKindRegisterZero: if !i.condBrOffsetResolved() { @@ -1456,7 +1445,7 @@ func (i *instruction) String() (str string) { } } case adr: - str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd.nr(), 64), int64(i.u1)) + str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd, 64), int64(i.u1)) case brTableSequence: targetIndex := i.u1 str = fmt.Sprintf("br_table_sequence %s, table_index=%d", formatVRegSized(i.rn.nr(), 64), targetIndex) @@ -1473,7 +1462,7 @@ func (i *instruction) String() (str string) { case 1: m = m + "b" } - str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64)) + str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), 64)) case atomicCas: m := "casal" size := byte(32) @@ -1485,7 +1474,7 @@ func (i *instruction) String() (str string) { case 1: m = m + "b" } - str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) + str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd, size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) case atomicLoad: m := "ldar" size := byte(32) @@ -1497,7 +1486,7 @@ func (i *instruction) String() (str string) { case 1: m = m + "b" } - str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64)) + str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), 64)) case atomicStore: m := "stlr" size := byte(32) @@ -1517,9 +1506,9 @@ func (i *instruction) String() (str string) { case emitSourceOffsetInfo: str = fmt.Sprintf("source_offset_info %d", ssa.SourceOffset(i.u1)) case vecLoad1R: - str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64)) + str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64)) case loadConstBlockArg: - str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd.nr(), 64), i.u1) + str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd, 64), i.u1) default: panic(i.kind) } @@ -1528,26 +1517,26 @@ func (i *instruction) String() (str string) { func (i *instruction) asAdr(rd regalloc.VReg, offset int64) { i.kind = adr - i.rd = operandNR(rd) + i.rd = rd i.u1 = uint64(offset) } -func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt operand, size uint64) { +func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt 
regalloc.VReg, size uint64) { i.kind = atomicRmw - i.rd, i.rn, i.rm = rt, rn, rs + i.rd, i.rn, i.rm = rt, operandNR(rn), operandNR(rs) i.u1 = uint64(op) i.u2 = size } -func (i *instruction) asAtomicCas(rn, rs, rt operand, size uint64) { +func (i *instruction) asAtomicCas(rn, rs, rt regalloc.VReg, size uint64) { i.kind = atomicCas - i.rm, i.rn, i.rd = rt, rn, rs + i.rm, i.rn, i.rd = operandNR(rt), operandNR(rn), rs i.u2 = size } -func (i *instruction) asAtomicLoad(rn, rt operand, size uint64) { +func (i *instruction) asAtomicLoad(rn, rt regalloc.VReg, size uint64) { i.kind = atomicLoad - i.rn, i.rd = rn, rt + i.rn, i.rd = operandNR(rn), rt i.u2 = size } @@ -1755,12 +1744,12 @@ func (i *instruction) asLoadConstBlockArg(v uint64, typ ssa.Type, dst regalloc.V i.kind = loadConstBlockArg i.u1 = v i.u2 = uint64(typ) - i.rd = operandNR(dst) + i.rd = dst return i } func (i *instruction) loadConstBlockArgData() (v uint64, typ ssa.Type, dst regalloc.VReg) { - return i.u1, ssa.Type(i.u2), i.rd.nr() + return i.u1, ssa.Type(i.u2), i.rd } func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction { @@ -1778,7 +1767,7 @@ func (i *instruction) asUDF() *instruction { return i } -func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bool) { +func (i *instruction) asFpuToInt(rd regalloc.VReg, rn operand, rdSigned, src64bit, dst64bit bool) { i.kind = fpuToInt i.rn = rn i.rd = rd @@ -1789,11 +1778,11 @@ func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bo i.u2 = 1 } if dst64bit { - i.u3 = 1 + i.u2 |= 2 } } -func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bool) { +func (i *instruction) asIntToFpu(rd regalloc.VReg, rn operand, rnSigned, src64bit, dst64bit bool) { i.kind = intToFpu i.rn = rn i.rd = rd @@ -1804,7 +1793,7 @@ func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bo i.u2 = 1 } if dst64bit { - i.u3 = 1 + i.u2 |= 2 } } @@ -1817,7 +1806,7 @@ func (i *instruction) asExitSequence(ctx regalloc.VReg) *instruction { // aluOp determines the type of ALU operation. Instructions whose kind is one of // aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift and aluRRRExtend // would use this type. -type aluOp int +type aluOp uint32 func (a aluOp) String() string { switch a { diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go index 227a964741..21be9b71e7 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go @@ -12,7 +12,7 @@ import ( // Encode implements backend.Machine Encode. 
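// --- editorial sketch (not part of the patch) --------------------------------
// The executableContext indirection is gone: the entry instruction now hangs
// off the machine itself as m.rootInstr, as the function just below shows.
// Assuming m.encode still walks the intrusive prev/next list that linkInstr
// maintains (the list stitching is visible in the abi_go_call.go hunks above),
// the traversal is essentially:
func (m *machine) encodeAll(root *instruction) {
	for cur := root; cur != nil; cur = cur.next {
		cur.encode(m) // emits the instruction's 4-byte words into m.compiler's buffer.
	}
}
// ------------------------------------------------------------------------------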
func (m *machine) Encode(ctx context.Context) error { m.resolveRelativeAddresses(ctx) - m.encode(m.executableContext.RootInstr) + m.encode(m.rootInstr) if l := len(m.compiler.Buf()); l > maxFunctionExecutableSize { return fmt.Errorf("function size exceeds the limit: %d > %d", l, maxFunctionExecutableSize) } @@ -44,12 +44,12 @@ func (i *instruction) encode(m *machine) { case callInd: c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], true)) case store8, store16, store32, store64, fpuStore32, fpuStore64, fpuStore128: - c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], i.amode)) + c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], *i.getAmode())) case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128: - c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.realReg()], i.amode)) + c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.RealReg()], *i.getAmode())) case vecLoad1R: c.Emit4Bytes(encodeVecLoad1R( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(i.u1))) case condBr: @@ -75,22 +75,22 @@ func (i *instruction) encode(m *machine) { panic("BUG") } case movN: - c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32))) case movZ: - c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32))) case movK: - c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32))) case mov32: - to, from := i.rd.realReg(), i.rn.realReg() + to, from := i.rd.RealReg(), i.rn.realReg() c.Emit4Bytes(encodeAsMov32(regNumberInEncoding[from], regNumberInEncoding[to])) case mov64: - to, from := i.rd.realReg(), i.rn.realReg() + to, from := i.rd.RealReg(), i.rn.realReg() toIsSp := to == sp fromIsSp := from == sp c.Emit4Bytes(encodeMov64(regNumberInEncoding[to], regNumberInEncoding[from], toIsSp, fromIsSp)) case loadP64, storeP64: rt, rt2 := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] - amode := i.amode + amode := i.getAmode() rn := regNumberInEncoding[amode.rn.RealReg()] var pre bool switch amode.kind { @@ -102,21 +102,21 @@ func (i *instruction) encode(m *machine) { } c.Emit4Bytes(encodePreOrPostIndexLoadStorePair64(pre, kind == loadP64, rn, rt, rt2, amode.imm)) case loadFpuConst32: - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] if i.u1 == 0 { c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) } else { encodeLoadFpuConst32(c, rd, i.u1) } case loadFpuConst64: - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] if i.u1 == 0 { c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) } else { - encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.realReg()], i.u1) + encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.RealReg()], i.u1) } case loadFpuConst128: - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] lo, hi := i.u1, i.u2 if lo == 0 && hi == 0 { 
c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement16B)) @@ -126,35 +126,35 @@ func (i *instruction) encode(m *machine) { case aluRRRR: c.Emit4Bytes(encodeAluRRRR( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], - regNumberInEncoding[i.ra.realReg()], - uint32(i.u3), + regNumberInEncoding[regalloc.VReg(i.u2).RealReg()], + uint32(i.u1>>32), )) case aluRRImmShift: c.Emit4Bytes(encodeAluRRImm( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], uint32(i.rm.shiftImm()), - uint32(i.u3), + uint32(i.u2>>32), )) case aluRRR: rn := i.rn.realReg() c.Emit4Bytes(encodeAluRRR( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[rn], regNumberInEncoding[i.rm.realReg()], - i.u3 == 1, + i.u2>>32 == 1, rn == sp, )) case aluRRRExtend: rm, exo, to := i.rm.er() c.Emit4Bytes(encodeAluRRRExtend( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[rm.RealReg()], exo, @@ -164,25 +164,25 @@ func (i *instruction) encode(m *machine) { r, amt, sop := i.rm.sr() c.Emit4Bytes(encodeAluRRRShift( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[r.RealReg()], uint32(amt), sop, - i.u3 == 1, + i.u2>>32 == 1, )) case aluRRBitmaskImm: c.Emit4Bytes(encodeAluBitmaskImmediate( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], i.u2, - i.u3 == 1, + i.u1>>32 == 1, )) case bitRR: c.Emit4Bytes(encodeBitRR( bitOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], uint32(i.u2)), ) @@ -190,22 +190,22 @@ func (i *instruction) encode(m *machine) { imm12, shift := i.rm.imm12() c.Emit4Bytes(encodeAluRRImm12( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], imm12, shift, - i.u3 == 1, + i.u2>>32 == 1, )) case fpuRRR: c.Emit4Bytes(encodeFpuRRR( fpuBinOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], - i.u3 == 1, + i.u2 == 1, )) case fpuMov64, fpuMov128: // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register-- - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] rn := regNumberInEncoding[i.rn.realReg()] var q uint32 if kind == fpuMov128 { @@ -213,7 +213,7 @@ func (i *instruction) encode(m *machine) { } c.Emit4Bytes(q<<30 | 0b1110101<<21 | rn<<16 | 0b000111<<10 | rn<<5 | rd) case cSet: - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] cf := condFlag(i.u1) if i.u2 == 1 { // https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/CSETM--Conditional-Set-Mask--an-alias-of-CSINV- @@ -225,12 +225,12 @@ func (i *instruction) encode(m *machine) { c.Emit4Bytes(0b1001101010011111<<16 | uint32(cf.invert())<<12 | 0b111111<<5 | rd) } case extend: - c.Emit4Bytes(encodeExtend(i.u3 == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.realReg()], regNumberInEncoding[i.rn.realReg()])) + 
c.Emit4Bytes(encodeExtend((i.u2>>32) == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()])) case fpuCmp: // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FCMP--Floating-point-quiet-Compare--scalar--?lang=en rn, rm := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] var ftype uint32 - if i.u3 == 1 { + if i.u1 == 1 { ftype = 0b01 // double precision. } c.Emit4Bytes(0b1111<<25 | ftype<<22 | 1<<21 | rm<<16 | 0b1<<13 | rn<<5) @@ -242,34 +242,34 @@ func (i *instruction) encode(m *machine) { c.Emit4Bytes(0) } case adr: - c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.realReg()], uint32(i.u1))) + c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.RealReg()], uint32(i.u1))) case cSel: c.Emit4Bytes(encodeConditionalSelect( kind, - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], condFlag(i.u1), - i.u3 == 1, + i.u2 == 1, )) case fpuCSel: c.Emit4Bytes(encodeFpuCSel( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], condFlag(i.u1), - i.u3 == 1, + i.u2 == 1, )) case movToVec: c.Emit4Bytes(encodeMoveToVec( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(byte(i.u1)), vecIndex(i.u2), )) case movFromVec, movFromVecSigned: c.Emit4Bytes(encodeMoveFromVec( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(byte(i.u1)), vecIndex(i.u2), @@ -277,18 +277,18 @@ func (i *instruction) encode(m *machine) { )) case vecDup: c.Emit4Bytes(encodeVecDup( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(byte(i.u1)))) case vecDupElement: c.Emit4Bytes(encodeVecDupElement( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(byte(i.u1)), vecIndex(i.u2))) case vecExtract: c.Emit4Bytes(encodeVecExtract( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], vecArrangement(byte(i.u1)), @@ -296,35 +296,35 @@ func (i *instruction) encode(m *machine) { case vecPermute: c.Emit4Bytes(encodeVecPermute( vecOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], vecArrangement(byte(i.u2)))) case vecMovElement: c.Emit4Bytes(encodeVecMovElement( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(i.u1), - uint32(i.u2), uint32(i.u3), + uint32(i.u2), uint32(i.u2>>32), )) case vecMisc: c.Emit4Bytes(encodeAdvancedSIMDTwoMisc( vecOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(i.u2), )) case vecLanes: c.Emit4Bytes(encodeVecLanes( vecOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(i.u2), )) case vecShiftImm: c.Emit4Bytes(encodeVecShiftImm( vecOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], 
uint32(i.rm.shiftImm()), vecArrangement(i.u2), @@ -332,7 +332,7 @@ func (i *instruction) encode(m *machine) { case vecTbl: c.Emit4Bytes(encodeVecTbl( 1, - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], vecArrangement(i.u2)), @@ -340,7 +340,7 @@ func (i *instruction) encode(m *machine) { case vecTbl2: c.Emit4Bytes(encodeVecTbl( 2, - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], vecArrangement(i.u2)), @@ -353,9 +353,9 @@ func (i *instruction) encode(m *machine) { case fpuRR: c.Emit4Bytes(encodeFloatDataOneSource( fpuUniOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], - i.u3 == 1, + i.u2 == 1, )) case vecRRR: if op := vecOp(i.u1); op == vecOpBsl || op == vecOpBit || op == vecOpUmlal { @@ -365,14 +365,14 @@ func (i *instruction) encode(m *machine) { case vecRRRRewrite: c.Emit4Bytes(encodeVecRRR( vecOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], vecArrangement(i.u2), )) case cCmpImm: // Conditional compare (immediate) in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en - sf := uint32(i.u3 & 0b1) + sf := uint32((i.u2 >> 32) & 0b1) nzcv := uint32(i.u2 & 0b1111) cond := uint32(condFlag(i.u1)) imm := uint32(i.rm.data & 0b11111) @@ -381,7 +381,7 @@ func (i *instruction) encode(m *machine) { sf<<31 | 0b111101001<<22 | imm<<16 | cond<<12 | 0b1<<11 | rn<<5 | nzcv, ) case movFromFPSR: - rt := regNumberInEncoding[i.rd.realReg()] + rt := regNumberInEncoding[i.rd.RealReg()] c.Emit4Bytes(encodeSystemRegisterMove(rt, true)) case movToFPSR: rt := regNumberInEncoding[i.rn.realReg()] @@ -390,13 +390,13 @@ func (i *instruction) encode(m *machine) { c.Emit4Bytes(encodeAtomicRmw( atomicRmwOp(i.u1), regNumberInEncoding[i.rm.realReg()], - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], uint32(i.u2), )) case atomicCas: c.Emit4Bytes(encodeAtomicCas( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rm.realReg()], regNumberInEncoding[i.rn.realReg()], uint32(i.u2), @@ -404,7 +404,7 @@ func (i *instruction) encode(m *machine) { case atomicLoad: c.Emit4Bytes(encodeAtomicLoadStore( regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], uint32(i.u2), 1, )) @@ -810,7 +810,7 @@ func encodeFloatDataOneSource(op fpuUniOp, rd, rn uint32, dst64bit bool) uint32 // encodeCnvBetweenFloatInt encodes as "Conversion between floating-point and integer" in // https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en func encodeCnvBetweenFloatInt(i *instruction) uint32 { - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] rn := regNumberInEncoding[i.rn.realReg()] var opcode uint32 @@ -822,8 +822,8 @@ func encodeCnvBetweenFloatInt(i *instruction) uint32 { rmode = 0b00 signed := i.u1 == 1 - src64bit := i.u2 == 1 - dst64bit := i.u3 == 1 + src64bit := i.u2&1 != 0 + dst64bit := i.u2&2 != 0 if signed { opcode = 0b010 } else { @@ -841,8 +841,8 @@ func encodeCnvBetweenFloatInt(i *instruction) uint32 { rmode 
= 0b11 signed := i.u1 == 1 - src64bit := i.u2 == 1 - dst64bit := i.u3 == 1 + src64bit := i.u2&1 != 0 + dst64bit := i.u2&2 != 0 if signed { opcode = 0b000 @@ -1787,13 +1787,13 @@ func encodeCBZCBNZ(rt uint32, nz bool, imm19 uint32, _64bit bool) (ret uint32) { // https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en // // "shift" must have been divided by 16 at this point. -func encodeMoveWideImmediate(opc uint32, rd uint32, imm, shift, _64bit uint64) (ret uint32) { +func encodeMoveWideImmediate(opc uint32, rd uint32, imm uint64, shift, _64bit uint32) (ret uint32) { ret = rd ret |= uint32(imm&0xffff) << 5 - ret |= (uint32(shift)) << 21 + ret |= (shift) << 21 ret |= 0b100101 << 23 ret |= opc << 29 - ret |= uint32(_64bit) << 31 + ret |= _64bit << 31 return } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go index 698b382d46..6c6824fb0a 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go @@ -284,18 +284,18 @@ func (m *machine) load64bitConst(c int64, dst regalloc.VReg) { func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) { instr := m.allocateInstr() - instr.asMOVZ(dst, v, uint64(shift), dst64) + instr.asMOVZ(dst, v, uint32(shift), dst64) m.insert(instr) } func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) { instr := m.allocateInstr() - instr.asMOVK(dst, v, uint64(shift), dst64) + instr.asMOVK(dst, v, uint32(shift), dst64) m.insert(instr) } func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) { instr := m.allocateInstr() - instr.asMOVN(dst, v, uint64(shift), dst64) + instr.asMOVN(dst, v, uint32(shift), dst64) m.insert(instr) } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go index 2bb234e8c1..f9df356c0e 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go @@ -17,19 +17,18 @@ import ( // LowerSingleBranch implements backend.Machine. 
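// NOTE(review, illustrative sketch — not part of the patch): the lower_instr.go
// hunks below drop the cached executableContext labels; branch instructions now
// carry ssa.BasicBlockID, and targets are resolved through the SSA builder at
// lowering time. A minimal standalone model of that flow, with hypothetical
// blockID/block/builder types standing in for wazero's internals:
type blockID int
type block struct {
	id  blockID
	ret bool // true for the synthetic return block
}
type builder struct{ blocks []*block }

func (b *builder) basicBlock(id blockID) *block { return b.blocks[id] }

func lowerJump(b *builder, target blockID, emitRet func(), emitBr func(blockID)) {
	if blk := b.basicBlock(target); blk.ret {
		emitRet() // a jump to the return block lowers to RET
	} else {
		emitBr(blk.id)
	}
}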
func (m *machine) LowerSingleBranch(br *ssa.Instruction) { - ectx := m.executableContext switch br.Opcode() { case ssa.OpcodeJump: - _, _, targetBlk := br.BranchData() + _, _, targetBlkID := br.BranchData() if br.IsFallthroughJump() { return } b := m.allocateInstr() - target := ectx.GetOrAllocateSSABlockLabel(targetBlk) - if target == labelReturn { + targetBlk := m.compiler.SSABuilder().BasicBlock(targetBlkID) + if targetBlk.ReturnBlock() { b.asRet() } else { - b.asBr(target) + b.asBr(ssaBlockLabel(targetBlk)) } m.insert(b) case ssa.OpcodeBrTable: @@ -40,7 +39,8 @@ func (m *machine) LowerSingleBranch(br *ssa.Instruction) { } func (m *machine) lowerBrTable(i *ssa.Instruction) { - index, targets := i.BrTableData() + index, targetBlockIDs := i.BrTableData() + targetBlockCount := len(targetBlockIDs.View()) indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone) // Firstly, we have to do the bounds check of the index, and @@ -50,35 +50,35 @@ func (m *machine) lowerBrTable(i *ssa.Instruction) { // subs wzr, index, maxIndexReg // csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher or equal than maxIndexReg. maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32) - m.lowerConstantI32(maxIndexReg, int32(len(targets)-1)) + m.lowerConstantI32(maxIndexReg, int32(targetBlockCount-1)) subs := m.allocateInstr() - subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false) + subs.asALU(aluOpSubS, xzrVReg, indexOperand, operandNR(maxIndexReg), false) m.insert(subs) csel := m.allocateInstr() adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32) - csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false) + csel.asCSel(adjustedIndex, operandNR(maxIndexReg), indexOperand, hs, false) m.insert(csel) brSequence := m.allocateInstr() - tableIndex := m.addJmpTableTarget(targets) - brSequence.asBrTableSequence(adjustedIndex, tableIndex, len(targets)) + tableIndex := m.addJmpTableTarget(targetBlockIDs) + brSequence.asBrTableSequence(adjustedIndex, tableIndex, targetBlockCount) m.insert(brSequence) } // LowerConditionalBranch implements backend.Machine. 
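// NOTE(review, aside on lowerBrTable above — not part of the patch): the
// SUBS+CSEL pair clamps the table index so any out-of-range value selects the
// last (default) target before indexing the jump table. Scalar equivalent:
func clampBrTableIndex(index, targetCount uint32) uint32 {
	max := targetCount - 1 // br_table always has at least the default target
	if index >= max {      // csel ..., hs: pick max when index is higher-or-same
		return max
	}
	return index
}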
func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { - exctx := m.executableContext - cval, args, targetBlk := b.BranchData() + cval, args, targetBlkID := b.BranchData() if len(args) > 0 { panic(fmt.Sprintf( "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", - exctx.CurrentSSABlk, - targetBlk, + m.currentLabelPos.sb, + targetBlkID, )) } - target := exctx.GetOrAllocateSSABlockLabel(targetBlk) + targetBlk := m.compiler.SSABuilder().BasicBlock(targetBlkID) + target := ssaBlockLabel(targetBlk) cvalDef := m.compiler.ValueDefinition(cval) switch { @@ -249,7 +249,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerSelectVec(rc, rn, rm, rd) } else { m.lowerSelect(c, x, y, instr.Return()) @@ -270,7 +270,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x, ctx := instr.Arg2() result := instr.Return() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) ctxVReg := m.compiler.VRegOf(ctx) m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64, result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) @@ -278,7 +278,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x, ctx := instr.Arg2() result := instr.Return() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) ctxVReg := m.compiler.VRegOf(ctx) m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64, result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) @@ -286,25 +286,25 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x := instr.Arg() result := instr.Return() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) case ssa.OpcodeFcvtFromUint: x := instr.Arg() result := instr.Return() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) case ssa.OpcodeFdemote: v := instr.Arg() rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) cnt := m.allocateInstr() cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false) m.insert(cnt) case ssa.OpcodeFpromote: v := instr.Arg() rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) cnt := m.allocateInstr() cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true) m.insert(cnt) @@ -343,15 +343,15 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { ctxVReg := m.compiler.VRegOf(ctx) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv) case 
ssa.OpcodeSrem, ssa.OpcodeUrem: x, y, ctx := instr.Arg3() ctxVReg := m.compiler.VRegOf(ctx) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) - m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) + rd := m.compiler.VRegOf(instr.Return()) + m.lowerIRem(ctxVReg, rd, rn.nr(), rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) case ssa.OpcodeVconst: result := m.compiler.VRegOf(instr.Return()) lo, hi := instr.VconstData() @@ -362,7 +362,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x := instr.Arg() ins := m.allocateInstr() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B) m.insert(ins) case ssa.OpcodeVbxor: @@ -382,12 +382,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) - tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp := m.compiler.AllocateVReg(ssa.TypeV128) // creg is overwritten by BSL, so we need to move it to the result register before the instruction // in case when it is used somewhere else. mov := m.allocateInstr() - mov.asFpuMov128(tmp.nr(), creg.nr()) + mov.asFpuMov128(tmp, creg.nr()) m.insert(mov) ins := m.allocateInstr() @@ -396,7 +396,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { mov2 := m.allocateInstr() rd := m.compiler.VRegOf(instr.Return()) - mov2.asFpuMov128(rd, tmp.nr()) + mov2.asFpuMov128(rd, tmp) m.insert(mov2) case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue: x, lane := instr.ArgWithLane() @@ -405,12 +405,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { arr = ssaLaneToArrangement(lane) } rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerVcheckTrue(op, rm, rd, arr) case ssa.OpcodeVhighBits: x, lane := instr.ArgWithLane() rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) arr := ssaLaneToArrangement(lane) m.lowerVhighBits(rm, rd, arr) case ssa.OpcodeVIadd: @@ -441,9 +441,9 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { panic("unsupported lane " + lane.String()) } - widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo, vv, operandShiftImm(0), loArr) - widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi, vv, operandShiftImm(0), hiArr) - addp := m.allocateInstr().asVecRRR(vecOpAddp, operandNR(m.compiler.VRegOf(instr.Return())), tmpLo, tmpHi, dstArr) + widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo.nr(), vv, operandShiftImm(0), loArr) + widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi.nr(), vv, operandShiftImm(0), hiArr) + addp := m.allocateInstr().asVecRRR(vecOpAddp, m.compiler.VRegOf(instr.Return()), tmpLo, tmpHi, dstArr) m.insert(widenLo) m.insert(widenHi) m.insert(addp) @@ -493,7 +493,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { arr := ssaLaneToArrangement(lane) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := 
operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerVIMul(rd, rn, rm, arr) case ssa.OpcodeVIabs: m.lowerVecMisc(vecOpAbs, instr) @@ -507,7 +507,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { arr := ssaLaneToArrangement(lane) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerVShift(op, rd, rn, rm, arr) case ssa.OpcodeVSqrt: m.lowerVecMisc(vecOpFsqrt, instr) @@ -547,18 +547,18 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x, lane := instr.ArgWithLane() arr := ssaLaneToArrangement(lane) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat) case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint: x, lane := instr.ArgWithLane() arr := ssaLaneToArrangement(lane) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint) case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow: x, lane := instr.ArgWithLane() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) var arr vecArrangement switch lane { @@ -580,7 +580,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh: x, lane := instr.ArgWithLane() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) arr := ssaLaneToArrangement(lane) @@ -607,9 +607,9 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { } rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) - tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp := m.compiler.AllocateVReg(ssa.TypeV128) loQxtn := m.allocateInstr() hiQxtn := m.allocateInstr() @@ -628,7 +628,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { m.insert(hiQxtn) mov := m.allocateInstr() - mov.asFpuMov128(rd.nr(), tmp.nr()) + mov.asFpuMov128(rd, tmp) m.insert(mov) case ssa.OpcodeFvpromoteLow: x, lane := instr.ArgWithLane() @@ -637,7 +637,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { } ins := m.allocateInstr() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S) m.insert(ins) case ssa.OpcodeFvdemote: @@ -647,14 +647,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { } ins := m.allocateInstr() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S) m.insert(ins) case ssa.OpcodeExtractlane: x, index, signed, lane := instr.ExtractlaneData() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := 
m.compiler.VRegOf(instr.Return()) mov := m.allocateInstr() switch lane { @@ -680,12 +680,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x, y, index, lane := instr.InsertlaneData() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) - tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + rd := m.compiler.VRegOf(instr.Return()) + tmpReg := m.compiler.AllocateVReg(ssa.TypeV128) // Initially mov rn to tmp. mov1 := m.allocateInstr() - mov1.asFpuMov128(tmpReg.nr(), rn.nr()) + mov1.asFpuMov128(tmpReg, rn.nr()) m.insert(mov1) // movToVec and vecMovElement do not clear the remaining bits to zero, @@ -709,14 +709,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { // Finally mov tmp to rd. mov3 := m.allocateInstr() - mov3.asFpuMov128(rd.nr(), tmpReg.nr()) + mov3.asFpuMov128(rd, tmpReg) m.insert(mov3) case ssa.OpcodeSwizzle: x, y, lane := instr.Arg2WithLane() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) arr := ssaLaneToArrangement(lane) @@ -729,14 +729,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x, y, lane1, lane2 := instr.ShuffleData() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerShuffle(rd, rn, rm, lane1, lane2) case ssa.OpcodeSplat: x, lane := instr.ArgWithLane() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) dup := m.allocateInstr() switch lane { @@ -760,12 +760,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp, xx, yy, vecArrangement8H)) - m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2, xx, yy, vecArrangement8H)) - m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp, tmp, tmp2, vecArrangement4S)) + m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp.nr(), xx, yy, vecArrangement8H)) + m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2.nr(), xx, yy, vecArrangement8H)) + m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp.nr(), tmp, tmp2, vecArrangement4S)) - rd := operandNR(m.compiler.VRegOf(instr.Return())) - m.insert(m.allocateInstr().asFpuMov128(rd.nr(), tmp.nr())) + rd := m.compiler.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asFpuMov128(rd, tmp.nr())) case ssa.OpcodeLoadSplat: ptr, offset, lane := instr.LoadSplatData() @@ -791,10 +791,10 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { default: panic("TODO: lowering " + op.String()) } - m.executableContext.FlushPendingInstructions() + m.FlushPendingInstructions() } -func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) { +func (m *machine) lowerShuffle(rd regalloc.VReg, rn, rm operand, lane1, lane2 uint64) { // `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30. 
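// NOTE(review, sketch — not part of the patch, and the little-endian packing
// below is an assumption): the two uint64 lane arguments carry the 16 shuffle
// byte-indices, 8 per value; TBL then looks each byte up across the
// concatenated v29:v30 table registers.
func shuffleMask(lane1, lane2 uint64) (mask [16]byte) {
	for i := 0; i < 8; i++ {
		mask[i] = byte(lane1 >> (8 * i))   // indices 0..7
		mask[i+8] = byte(lane2 >> (8 * i)) // indices 8..15
	}
	return
}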
vReg, wReg := v29VReg, v30VReg @@ -822,7 +822,7 @@ func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) { m.insert(tbl2) } -func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) { +func (m *machine) lowerVShift(op ssa.Opcode, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { var modulo byte switch arr { case vecArrangement16B: @@ -847,13 +847,13 @@ func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangem if op != ssa.OpcodeVIshl { // Negate the amount to make this as right shift. neg := m.allocateInstr() - neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true) + neg.asALU(aluOpSub, rtmp.nr(), operandNR(xzrVReg), rtmp, true) m.insert(neg) } // Copy the shift amount into a vector register as sshl/ushl requires it to be there. dup := m.allocateInstr() - dup.asVecDup(vtmp, rtmp, arr) + dup.asVecDup(vtmp.nr(), rtmp, arr) m.insert(dup) if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr { @@ -867,7 +867,7 @@ func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangem } } -func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) { +func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm operand, rd regalloc.VReg, arr vecArrangement) { tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) // Special case VallTrue for i64x2. @@ -878,11 +878,11 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem // cset dst, eq ins := m.allocateInstr() - ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D) + ins.asVecMisc(vecOpCmeq0, tmp.nr(), rm, vecArrangement2D) m.insert(ins) addp := m.allocateInstr() - addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D) + addp.asVecRRR(vecOpAddp, tmp.nr(), tmp, tmp, vecArrangement2D) m.insert(addp) fcmp := m.allocateInstr() @@ -890,7 +890,7 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem m.insert(fcmp) cset := m.allocateInstr() - cset.asCSet(rd.nr(), false, eq) + cset.asCSet(rd, false, eq) m.insert(cset) return @@ -900,10 +900,10 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem ins := m.allocateInstr() if op == ssa.OpcodeVanyTrue { // umaxp v4?.16b, v2?.16b, v2?.16b - ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B) + ins.asVecRRR(vecOpUmaxp, tmp.nr(), rm, rm, vecArrangement16B) } else { // uminv d4?, v2?.4s - ins.asVecLanes(vecOpUminv, tmp, rm, arr) + ins.asVecLanes(vecOpUminv, tmp.nr(), rm, arr) } m.insert(ins) @@ -917,15 +917,15 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem m.insert(movv) fc := m.allocateInstr() - fc.asCCmpImm(rd, uint64(0), al, 0, true) + fc.asCCmpImm(operandNR(rd), uint64(0), al, 0, true) m.insert(fc) cset := m.allocateInstr() - cset.asCSet(rd.nr(), false, ne) + cset.asCSet(rd, false, ne) m.insert(cset) } -func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { +func (m *machine) lowerVhighBits(rm operand, rd regalloc.VReg, arr vecArrangement) { r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) @@ -947,7 +947,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // Right arithmetic shift on the original vector and store the result into v1. So we have: // v1[i] = 0xff if vi<0, 0 otherwise. 
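// NOTE(review, not part of the patch): the whole 16B sequence below (sshr,
// and with the bit mask, ext, zip1, addv) is a SIMD "movemask". Scalar model
// of what it computes:
func highBits8x16(v [16]int8) (mask uint16) {
	for i, lane := range v {
		if lane < 0 {
			mask |= 1 << i // wi := (1 << i) if vi < 0
		}
	}
	return
}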
sshr := m.allocateInstr() - sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B) + sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(7), vecArrangement16B) m.insert(sshr) // Load the bit mask into r0. @@ -958,7 +958,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // dup r0 to v0. dup := m.allocateInstr() - dup.asVecDup(v0, r0, vecArrangement2D) + dup.asVecDup(v0.nr(), r0, vecArrangement2D) m.insert(dup) // Lane-wise logical AND with the bit mask, meaning that we have @@ -967,23 +967,23 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // Below, we use the following notation: // wi := (1 << i) if vi<0, 0 otherwise. and := m.allocateInstr() - and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B) + and.asVecRRR(vecOpAnd, v1.nr(), v1, v0, vecArrangement16B) m.insert(and) // Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have // v0[i] = w(i+8) if i < 8, w(i-8) otherwise. ext := m.allocateInstr() - ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8)) + ext.asVecExtract(v0.nr(), v1, v1, vecArrangement16B, uint32(8)) m.insert(ext) // v = [w0, w8, ..., w7, w15] zip1 := m.allocateInstr() - zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B) + zip1.asVecPermute(vecOpZip1, v0.nr(), v1, v0, vecArrangement16B) m.insert(zip1) // v.h[0] = w0 + ... + w15 addv := m.allocateInstr() - addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) + addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement8H) m.insert(addv) // Extract the v.h[0] as the result. @@ -1006,7 +1006,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // Right arithmetic shift on the original vector and store the result into v1. So we have: // v[i] = 0xffff if vi<0, 0 otherwise. sshr := m.allocateInstr() - sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H) + sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(15), vecArrangement8H) m.insert(sshr) // Load the bit mask into r0. @@ -1014,26 +1014,26 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // dup r0 to vector v0. dup := m.allocateInstr() - dup.asVecDup(v0, r0, vecArrangement2D) + dup.asVecDup(v0.nr(), r0, vecArrangement2D) m.insert(dup) lsl := m.allocateInstr() - lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true) + lsl.asALUShift(aluOpLsl, r0.nr(), r0, operandShiftImm(4), true) m.insert(lsl) movv := m.allocateInstr() - movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) + movv.asMovToVec(v0.nr(), r0, vecArrangementD, vecIndex(1)) m.insert(movv) // Lane-wise logical AND with the bitmask, meaning that we have // v[i] = (1 << i) if vi<0, 0 otherwise for i=0..3 // = (1 << (i+4)) if vi<0, 0 otherwise for i=3..7 and := m.allocateInstr() - and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) + and.asVecRRR(vecOpAnd, v0.nr(), v1, v0, vecArrangement16B) m.insert(and) addv := m.allocateInstr() - addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) + addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement8H) m.insert(addv) movfv := m.allocateInstr() @@ -1055,7 +1055,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // Right arithmetic shift on the original vector and store the result into v1. So we have: // v[i] = 0xffffffff if vi<0, 0 otherwise. 
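// NOTE(review, not part of the patch): same movemask idea for 4 x 32-bit
// lanes; the LSL #2 below shifts the shared 64-bit mask constant so the upper
// vector half contributes bits 2 and 3. Scalar model of the end result:
func highBits4x32(v [4]int32) (mask uint8) {
	for i, lane := range v {
		if lane < 0 {
			mask |= 1 << i
		}
	}
	return
}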
sshr := m.allocateInstr() - sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S) + sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(31), vecArrangement4S) m.insert(sshr) // Load the bit mask into r0. @@ -1063,26 +1063,26 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // dup r0 to vector v0. dup := m.allocateInstr() - dup.asVecDup(v0, r0, vecArrangement2D) + dup.asVecDup(v0.nr(), r0, vecArrangement2D) m.insert(dup) lsl := m.allocateInstr() - lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true) + lsl.asALUShift(aluOpLsl, r0.nr(), r0, operandShiftImm(2), true) m.insert(lsl) movv := m.allocateInstr() - movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) + movv.asMovToVec(v0.nr(), r0, vecArrangementD, vecIndex(1)) m.insert(movv) // Lane-wise logical AND with the bitmask, meaning that we have // v[i] = (1 << i) if vi<0, 0 otherwise for i in [0, 1] // = (1 << (i+4)) if vi<0, 0 otherwise for i in [2, 3] and := m.allocateInstr() - and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) + and.asVecRRR(vecOpAnd, v0.nr(), v1, v0, vecArrangement16B) m.insert(and) addv := m.allocateInstr() - addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S) + addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement4S) m.insert(addv) movfv := m.allocateInstr() @@ -1102,21 +1102,21 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // Move the higher 64-bit int into r0. movv1 := m.allocateInstr() - movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false) + movv1.asMovFromVec(r0.nr(), rm, vecArrangementD, vecIndex(1), false) m.insert(movv1) // Move the sign bit into the least significant bit. lsr1 := m.allocateInstr() - lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true) + lsr1.asALUShift(aluOpLsr, r0.nr(), r0, operandShiftImm(63), true) m.insert(lsr1) lsr2 := m.allocateInstr() - lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true) + lsr2.asALUShift(aluOpLsr, rd, operandNR(rd), operandShiftImm(63), true) m.insert(lsr2) // rd = (r0<<1) | rd lsl := m.allocateInstr() - lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false) + lsl.asALU(aluOpAdd, rd, operandNR(rd), operandSR(r0.nr(), 1, shiftOpLSL), false) m.insert(lsl) default: panic("Unsupported " + arr.String()) @@ -1128,7 +1128,7 @@ func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) { arr := ssaLaneToArrangement(lane) ins := m.allocateInstr() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) ins.asVecMisc(op, rd, rn, arr) m.insert(ins) } @@ -1137,22 +1137,22 @@ func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) ins := m.allocateInstr() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(ret)) + rd := m.compiler.VRegOf(ret) ins.asVecRRR(op, rd, rn, rm, arr) m.insert(ins) } -func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { +func (m *machine) lowerVIMul(rd regalloc.VReg, rn, rm operand, arr vecArrangement) { if arr != vecArrangement2D { mul := m.allocateInstr() mul.asVecRRR(vecOpMul, rd, rn, rm, arr) m.insert(mul) } else { - tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp1 := m.compiler.AllocateVReg(ssa.TypeV128) + tmp2 := 
m.compiler.AllocateVReg(ssa.TypeV128) + tmp3 := m.compiler.AllocateVReg(ssa.TypeV128) - tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmpRes := m.compiler.AllocateVReg(ssa.TypeV128) // Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696 rev64 := m.allocateInstr() @@ -1160,7 +1160,7 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { m.insert(rev64) mul := m.allocateInstr() - mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S) + mul.asVecRRR(vecOpMul, tmp2, operandNR(tmp2), rn, vecArrangement4S) m.insert(mul) xtn1 := m.allocateInstr() @@ -1168,7 +1168,7 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { m.insert(xtn1) addp := m.allocateInstr() - addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S) + addp.asVecRRR(vecOpAddp, tmp2, operandNR(tmp2), operandNR(tmp2), vecArrangement4S) m.insert(addp) xtn2 := m.allocateInstr() @@ -1179,15 +1179,15 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { // In short, in UMLAL instruction, the result register is also one of the source register, and // the value on the result register is significant. shll := m.allocateInstr() - shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S) + shll.asVecMisc(vecOpShll, tmpRes, operandNR(tmp2), vecArrangement2S) m.insert(shll) umlal := m.allocateInstr() - umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S) + umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, operandNR(tmp3), operandNR(tmp1), vecArrangement2S) m.insert(umlal) mov := m.allocateInstr() - mov.asFpuMov128(rd.nr(), tmpRes.nr()) + mov.asFpuMov128(rd, tmpRes) m.insert(mov) } } @@ -1203,7 +1203,7 @@ func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { // BSL modifies the destination register, so we need to use a temporary register so that // the actual definition of the destination register happens *after* the BSL instruction. // That way, we can force the spill instruction to be inserted after the BSL instruction. - tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp := m.compiler.AllocateVReg(ssa.TypeV128) fcmgt := m.allocateInstr() if max { @@ -1220,17 +1220,17 @@ func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { res := operandNR(m.compiler.VRegOf(instr.Return())) mov2 := m.allocateInstr() - mov2.asFpuMov128(res.nr(), tmp.nr()) + mov2.asFpuMov128(res.nr(), tmp) m.insert(mov2) } -func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { +func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn regalloc.VReg, rm operand, _64bit, signed bool) { div := m.allocateInstr() if signed { - div.asALU(aluOpSDiv, rd, rn, rm, _64bit) + div.asALU(aluOpSDiv, rd, operandNR(rn), rm, _64bit) } else { - div.asALU(aluOpUDiv, rd, rn, rm, _64bit) + div.asALU(aluOpUDiv, rd, operandNR(rn), rm, _64bit) } m.insert(div) @@ -1239,11 +1239,11 @@ func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bi // rd = rn-rd*rm by MSUB instruction. 
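// NOTE(review, not part of the patch): after the UDIV/SDIV above computes
// rd = rn/rm, MSUB folds the remainder in one instruction. Scalar equivalent
// (the zero-divisor trap is handled separately via exitIfNot):
func rem(rn, rm int64) int64 {
	q := rn / rm
	return rn - q*rm // msub: rd = rn - rd*rm
}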
msub := m.allocateInstr() - msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit) + msub.asALURRRR(aluOpMSub, rd, operandNR(rd), rm, rn, _64bit) m.insert(msub) } -func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { +func (m *machine) lowerIDiv(execCtxVReg, rd regalloc.VReg, rn, rm operand, _64bit, signed bool) { div := m.allocateInstr() if signed { @@ -1260,7 +1260,7 @@ func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bi // We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1" minusOneCheck := m.allocateInstr() // Sets eq condition if rm == -1. - minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit) + minusOneCheck.asALU(aluOpAddS, xzrVReg, rm, operandImm12(1, 0), _64bit) m.insert(minusOneCheck) ccmp := m.allocateInstr() @@ -1290,20 +1290,20 @@ func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, c func (m *machine) lowerFcopysign(x, y, ret ssa.Value) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - var tmpI, tmpF operand + var tmpI, tmpF regalloc.VReg _64 := x.Type() == ssa.TypeF64 if _64 { - tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) - tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + tmpF = m.compiler.AllocateVReg(ssa.TypeF64) + tmpI = m.compiler.AllocateVReg(ssa.TypeI64) } else { - tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32)) - tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + tmpF = m.compiler.AllocateVReg(ssa.TypeF32) + tmpI = m.compiler.AllocateVReg(ssa.TypeI32) } rd := m.compiler.VRegOf(ret) - m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64) + m.lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF, _64) } -func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) { +func (m *machine) lowerFcopysignImpl(rd regalloc.VReg, rn, rm operand, tmpI, tmpF regalloc.VReg, _64bit bool) { // This is exactly the same code emitted by GCC for "__builtin_copysign": // // mov x0, -9223372036854775808 @@ -1313,26 +1313,26 @@ func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool setMSB := m.allocateInstr() if _64bit { - m.lowerConstantI64(tmpI.nr(), math.MinInt64) - setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0)) + m.lowerConstantI64(tmpI, math.MinInt64) + setMSB.asMovToVec(tmpF, operandNR(tmpI), vecArrangementD, vecIndex(0)) } else { - m.lowerConstantI32(tmpI.nr(), math.MinInt32) - setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0)) + m.lowerConstantI32(tmpI, math.MinInt32) + setMSB.asMovToVec(tmpF, operandNR(tmpI), vecArrangementS, vecIndex(0)) } m.insert(setMSB) - tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + tmpReg := m.compiler.AllocateVReg(ssa.TypeF64) mov := m.allocateInstr() - mov.asFpuMov64(tmpReg.nr(), rn.nr()) + mov.asFpuMov64(tmpReg, rn.nr()) m.insert(mov) vbit := m.allocateInstr() - vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B) + vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, operandNR(tmpF), vecArrangement8B) m.insert(vbit) movDst := m.allocateInstr() - movDst.asFpuMov64(rd.nr(), tmpReg.nr()) + movDst.asFpuMov64(rd, tmpReg) m.insert(movDst) } @@ -1340,7 +1340,7 @@ func (m *machine) lowerBitcast(instr *ssa.Instruction) { v, dstType := instr.BitcastData() srcType := v.Type() rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := 
m.compiler.VRegOf(instr.Return()) srcInt := srcType.IsInt() dstInt := dstType.IsInt() switch { @@ -1371,14 +1371,14 @@ func (m *machine) lowerBitcast(instr *ssa.Instruction) { func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) { rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone) - rd := operandNR(m.compiler.VRegOf(out)) + rd := m.compiler.VRegOf(out) neg := m.allocateInstr() neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64) m.insert(neg) } -func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { +func (m *machine) lowerFpuToInt(rd regalloc.VReg, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { if !nonTrapping { // First of all, we have to clear the FPU flags. flagClear := m.allocateInstr() @@ -1405,7 +1405,7 @@ func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64 // Check if the conversion was undefined by comparing the status with 1. // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register alu := m.allocateInstr() - alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true) + alu.asALU(aluOpSubS, xzrVReg, operandNR(tmpReg), operandImm12(1, 0), true) m.insert(alu) // If it is not undefined, we can return the result. @@ -1429,7 +1429,7 @@ func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64 } } -func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) { +func (m *machine) lowerIntToFpu(rd regalloc.VReg, rn operand, signed, src64bit, dst64bit bool) { cvt := m.allocateInstr() cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit) m.insert(cvt) @@ -1456,7 +1456,7 @@ func (m *machine) lowerFpuBinOp(si *ssa.Instruction) { xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) rn := m.getOperand_NR(xDef, extModeNone) rm := m.getOperand_NR(yDef, extModeNone) - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64) m.insert(instr) } @@ -1482,7 +1482,7 @@ func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) { case !add && yNegated: // rn+rm = x-(-y) = x-y aop = aluOpAdd } - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) alu := m.allocateInstr() alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64) m.insert(alu) @@ -1527,7 +1527,7 @@ func (m *machine) lowerIcmp(si *ssa.Instruction) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext) alu := m.allocateInstr() - alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit) + alu.asALU(aluOpSubS, xzrVReg, rn, rm, in64bit) m.insert(alu) cset := m.allocateInstr() @@ -1542,7 +1542,7 @@ func (m *machine) lowerVIcmp(si *ssa.Instruction) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) switch flag { case eq: @@ -1554,7 +1554,7 @@ func (m *machine) lowerVIcmp(si *ssa.Instruction) { cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) m.insert(cmp) not := m.allocateInstr() - not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) + not.asVecMisc(vecOpNot, rd, operandNR(rd), vecArrangement16B) m.insert(not) case ge: cmp := m.allocateInstr() @@ -1598,7 +1598,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) { 
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) switch flag { case eq: @@ -1610,7 +1610,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) { cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) m.insert(cmp) not := m.allocateInstr() - not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) + not.asVecMisc(vecOpNot, rd, operandNR(rd), vecArrangement16B) m.insert(not) case ge: cmp := m.allocateInstr() @@ -1631,7 +1631,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) { } } -func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) { +func (m *machine) lowerVfpuToInt(rd regalloc.VReg, rn operand, arr vecArrangement, signed bool) { cvt := m.allocateInstr() if signed { cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr) @@ -1643,15 +1643,15 @@ func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool if arr == vecArrangement2D { narrow := m.allocateInstr() if signed { - narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S) + narrow.asVecMisc(vecOpSqxtn, rd, operandNR(rd), vecArrangement2S) } else { - narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S) + narrow.asVecMisc(vecOpUqxtn, rd, operandNR(rd), vecArrangement2S) } m.insert(narrow) } } -func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) { +func (m *machine) lowerVfpuFromInt(rd regalloc.VReg, rn operand, arr vecArrangement, signed bool) { cvt := m.allocateInstr() if signed { cvt.asVecMisc(vecOpScvtf, rd, rn, arr) @@ -1665,7 +1665,7 @@ func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) { x, amount := si.Arg2() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits()) - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) alu := m.allocateInstr() alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64) @@ -1678,11 +1678,11 @@ func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) rn := m.getOperand_NR(xDef, extModeNone) - var rd operand + var rd regalloc.VReg if ignoreResult { - rd = operandNR(xzrVReg) + rd = xzrVReg } else { - rd = operandNR(m.compiler.VRegOf(si.Return())) + rd = m.compiler.VRegOf(si.Return()) } _64 := x.Type().Bits() == 64 @@ -1691,7 +1691,7 @@ func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult c := instr.ConstantVal() if isBitMaskImmediate(c, _64) { // Constant bit wise operations can be lowered to a single instruction. - alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64) + alu.asALUBitmaskImm(op, rd, rn.nr(), c, _64) m.insert(alu) return } @@ -1709,25 +1709,25 @@ func (m *machine) lowerRotl(si *ssa.Instruction) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - var tmp operand + var tmp regalloc.VReg if _64 { - tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + tmp = m.compiler.AllocateVReg(ssa.TypeI64) } else { - tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + tmp = m.compiler.AllocateVReg(ssa.TypeI32) } - rd := operandNR(m.compiler.VRegOf(r)) + rd := m.compiler.VRegOf(r) // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 
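// NOTE(review, not part of the patch): NEG + RORV is the standard rotl
// encoding on arm64, which has no rotate-left instruction. Scalar check,
// assuming import "math/bits":
func rotl64(x, amount uint64) uint64 {
	neg := -amount                            // sub tmp, xzr, amount
	return bits.RotateLeft64(x, -int(neg%64)) // ror by neg ≡ rotl by amount
}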
m.lowerRotlImpl(rd, rn, rm, tmp, _64) } -func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) { +func (m *machine) lowerRotlImpl(rd regalloc.VReg, rn, rm operand, tmp regalloc.VReg, is64bit bool) { // Encode rotl as neg + rotr: neg is a sub against the zero-reg. neg := m.allocateInstr() neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit) m.insert(neg) alu := m.allocateInstr() - alu.asALU(aluOpRotR, rd, rn, tmp, is64bit) + alu.asALU(aluOpRotR, rd, rn, operandNR(tmp), is64bit) m.insert(alu) } @@ -1737,7 +1737,7 @@ func (m *machine) lowerRotr(si *ssa.Instruction) { xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) rn := m.getOperand_NR(xDef, extModeNone) rm := m.getOperand_NR(yDef, extModeNone) - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) alu := m.allocateInstr() alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64) @@ -1797,7 +1797,7 @@ func (m *machine) lowerImul(x, y, result ssa.Value) { // TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg. mul := m.allocateInstr() - mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64) + mul.asALURRRR(aluOpMAdd, rd, rn, rm, xzrVReg, x.Type().Bits() == 64) m.insert(mul) } @@ -1849,22 +1849,22 @@ func (m *machine) lowerPopcnt(x, result ssa.Value) { // mov x5, v0.d[0] ;; finally we mov the result back to a GPR // - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) ins := m.allocateInstr() - ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0)) + ins.asMovToVec(rf1.nr(), rn, vecArrangementD, vecIndex(0)) m.insert(ins) rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) cnt := m.allocateInstr() - cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B) + cnt.asVecMisc(vecOpCnt, rf2.nr(), rf1, vecArrangement16B) m.insert(cnt) rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) uaddlv := m.allocateInstr() - uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B) + uaddlv.asVecLanes(vecOpUaddlv, rf3.nr(), rf2, vecArrangement8B) m.insert(uaddlv) mov := m.allocateInstr() @@ -1879,32 +1879,35 @@ func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.Ex loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true) setExitCode := m.allocateInstr() - setExitCode.asStore(operandNR(tmpReg1), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), - }, 32) + mode := m.amodePool.Allocate() + *mode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), + } + setExitCode.asStore(operandNR(tmpReg1), mode, 32) // In order to unwind the stack, we also need to push the current stack pointer: tmp2 := m.compiler.AllocateVReg(ssa.TypeI64) movSpToTmp := m.allocateInstr() movSpToTmp.asMove64(tmp2, spVReg) strSpToExecCtx := m.allocateInstr() - strSpToExecCtx.asStore(operandNR(tmp2), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), - }, 64) + mode2 := m.amodePool.Allocate() + *mode2 = addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), + } + strSpToExecCtx.asStore(operandNR(tmp2), mode2, 64) // Also 
the address of this exit. tmp3 := m.compiler.AllocateVReg(ssa.TypeI64) currentAddrToTmp := m.allocateInstr() currentAddrToTmp.asAdr(tmp3, 0) storeCurrentAddrToExecCtx := m.allocateInstr() - storeCurrentAddrToExecCtx.asStore(operandNR(tmp3), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), - }, 64) + mode3 := m.amodePool.Allocate() + *mode3 = addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), + } + storeCurrentAddrToExecCtx.asStore(operandNR(tmp3), mode3, 64) exitSeq := m.allocateInstr() exitSeq.asExitSequence(execCtxVReg) @@ -1937,7 +1940,7 @@ func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) { alu.asALU( aluOpSubS, // We don't need the result, just need to set flags. - operandNR(xzrVReg), + xzrVReg, rn, rm, x.Type().Bits() == 64, @@ -2012,7 +2015,7 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) { alu.asALU( aluOpSubS, // We don't need the result, just need to set flags. - operandNR(xzrVReg), + xzrVReg, rn, operandNR(xzrVReg), c.Type().Bits() == 64, @@ -2024,7 +2027,7 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) switch x.Type() { case ssa.TypeI32, ssa.TypeI64: // csel rd, rn, rm, cc @@ -2041,10 +2044,10 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) { } } -func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) { +func (m *machine) lowerSelectVec(rc, rn, rm operand, rd regalloc.VReg) { // First check if `rc` is zero or not. checkZero := m.allocateInstr() - checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false) + checkZero.asALU(aluOpSubS, xzrVReg, rc, operandNR(xzrVReg), false) m.insert(checkZero) // Then use CSETM to set all bits to one if `rc` is zero. @@ -2054,7 +2057,7 @@ func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) { m.insert(cset) // Then move the bits to the result vector register. - tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp2 := m.compiler.AllocateVReg(ssa.TypeV128) dup := m.allocateInstr() dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D) m.insert(dup) @@ -2067,7 +2070,7 @@ func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) { // Finally, move the result to the destination register. 
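// NOTE(review, not part of the patch): lowerSelectVec reduces vector select
// to a lanewise bit-select: CSETM materializes an all-ones/all-zeros mask
// from the flags, DUP splats it, and BSL keeps bits from one source where the
// mask is set and from the other where it is clear. Per 64-bit chunk:
func bsl(mask, a, b uint64) uint64 {
	return (a & mask) | (b &^ mask)
}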
mov2 := m.allocateInstr() - mov2.asFpuMov128(rd.nr(), tmp2.nr()) + mov2.asFpuMov128(rd, tmp2) m.insert(mov2) } @@ -2099,28 +2102,28 @@ func (m *machine) lowerAtomicRmw(si *ssa.Instruction) { addr, val := si.Arg2() addrDef, valDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(val) rn := m.getOperand_NR(addrDef, extModeNone) - rt := operandNR(m.compiler.VRegOf(si.Return())) + rt := m.compiler.VRegOf(si.Return()) rs := m.getOperand_NR(valDef, extModeNone) _64 := si.Return().Type().Bits() == 64 - var tmp operand + var tmp regalloc.VReg if _64 { - tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + tmp = m.compiler.AllocateVReg(ssa.TypeI64) } else { - tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + tmp = m.compiler.AllocateVReg(ssa.TypeI32) } - m.lowerAtomicRmwImpl(op, rn, rs, rt, tmp, size, negateArg, flipArg, _64) + m.lowerAtomicRmwImpl(op, rn.nr(), rs.nr(), rt, tmp, size, negateArg, flipArg, _64) } -func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp operand, size uint64, negateArg, flipArg, dst64bit bool) { +func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp regalloc.VReg, size uint64, negateArg, flipArg, dst64bit bool) { switch { case negateArg: neg := m.allocateInstr() - neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rs, dst64bit) + neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), operandNR(rs), dst64bit) m.insert(neg) case flipArg: flip := m.allocateInstr() - flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), rs, dst64bit) + flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), operandNR(rs), dst64bit) m.insert(flip) default: tmp = rs @@ -2139,32 +2142,32 @@ func (m *machine) lowerAtomicCas(si *ssa.Instruction) { rn := m.getOperand_NR(addrDef, extModeNone) rt := m.getOperand_NR(replDef, extModeNone) rs := m.getOperand_NR(expDef, extModeNone) - tmp := operandNR(m.compiler.AllocateVReg(si.Return().Type())) + tmp := m.compiler.AllocateVReg(si.Return().Type()) _64 := si.Return().Type().Bits() == 64 // rs is overwritten by CAS, so we need to move it to the result register before the instruction // in case when it is used somewhere else. 
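// NOTE(review, not part of the patch): CAS writes the value loaded from
// memory back into the "expected" register, which is also this lowering's
// result; hence the copy into tmp first. Model of the instruction's effect
// (the hardware performs this atomically):
func casModel(mem *uint64, expectedAndOld *uint64, repl uint64) {
	old := *mem
	if old == *expectedAndOld {
		*mem = repl
	}
	*expectedAndOld = old // rs is clobbered with the observed value
}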
mov := m.allocateInstr() if _64 { - mov.asMove64(tmp.nr(), rs.nr()) + mov.asMove64(tmp, rs.nr()) } else { - mov.asMove32(tmp.nr(), rs.nr()) + mov.asMove32(tmp, rs.nr()) } m.insert(mov) - m.lowerAtomicCasImpl(rn, tmp, rt, size) + m.lowerAtomicCasImpl(rn.nr(), tmp, rt.nr(), size) mov2 := m.allocateInstr() rd := m.compiler.VRegOf(si.Return()) if _64 { - mov2.asMove64(rd, tmp.nr()) + mov2.asMove64(rd, tmp) } else { - mov2.asMove32(rd, tmp.nr()) + mov2.asMove32(rd, tmp) } m.insert(mov2) } -func (m *machine) lowerAtomicCasImpl(rn, rs, rt operand, size uint64) { +func (m *machine) lowerAtomicCasImpl(rn, rs, rt regalloc.VReg, size uint64) { cas := m.allocateInstr() cas.asAtomicCas(rn, rs, rt, size) m.insert(cas) @@ -2176,12 +2179,12 @@ func (m *machine) lowerAtomicLoad(si *ssa.Instruction) { addrDef := m.compiler.ValueDefinition(addr) rn := m.getOperand_NR(addrDef, extModeNone) - rt := operandNR(m.compiler.VRegOf(si.Return())) + rt := m.compiler.VRegOf(si.Return()) - m.lowerAtomicLoadImpl(rn, rt, size) + m.lowerAtomicLoadImpl(rn.nr(), rt, size) } -func (m *machine) lowerAtomicLoadImpl(rn, rt operand, size uint64) { +func (m *machine) lowerAtomicLoadImpl(rn, rt regalloc.VReg, size uint64) { ld := m.allocateInstr() ld.asAtomicLoad(rn, rt, size) m.insert(ld) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go index d9fbf1789b..7a398c3d09 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go @@ -162,9 +162,9 @@ func (o operand) assignReg(v regalloc.VReg) operand { // // `mode` is used to extend the operand if the bit length is smaller than mode.bits(). // If the operand can be expressed as operandKindImm12, `mode` is ignored. -func (m *machine) getOperand_Imm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) { - if def.IsFromBlockParam() { - return operandNR(def.BlkParamVReg) +func (m *machine) getOperand_Imm12_ER_SR_NR(def backend.SSAValueDefinition, mode extMode) (op operand) { + if !def.IsFromInstr() { + return operandNR(m.compiler.VRegOf(def.V)) } instr := def.Instr @@ -179,9 +179,9 @@ func (m *machine) getOperand_Imm12_ER_SR_NR(def *backend.SSAValueDefinition, mod // getOperand_MaybeNegatedImm12_ER_SR_NR is almost the same as getOperand_Imm12_ER_SR_NR, but this might negate the immediate value. // If the immediate value is negated, the second return value is true, otherwise always false. 
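// NOTE(review, not part of the patch): negation lets e.g. "add rd, rn, #-c"
// be encoded as "sub rd, rn, #c" when only -c fits in imm12. A hypothetical
// helper showing the imm12 encoding space (a 12-bit value, optionally
// shifted left by 12):
func fitsImm12(v uint64) (imm12 uint16, lsl12 bool, ok bool) {
	switch {
	case v < 0x1000:
		return uint16(v), false, true
	case v&0xfff == 0 && v>>12 < 0x1000:
		return uint16(v >> 12), true, true
	default:
		return 0, false, false
	}
}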
-func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand, negatedImm12 bool) {
-	if def.IsFromBlockParam() {
-		return operandNR(def.BlkParamVReg), false
+func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def backend.SSAValueDefinition, mode extMode) (op operand, negatedImm12 bool) {
+	if !def.IsFromInstr() {
+		return operandNR(m.compiler.VRegOf(def.V)), false
 	}
 
 	instr := def.Instr
@@ -193,7 +193,7 @@ func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def *backend.SSAValueDef
 	}
 
 	signExtended := int64(c)
-	if def.SSAValue().Type().Bits() == 32 {
+	if def.V.Type().Bits() == 32 {
 		signExtended = (signExtended << 32) >> 32
 	}
 	negatedWithoutSign := -signExtended
@@ -208,9 +208,9 @@ func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def *backend.SSAValueDef
 // getOperand_ER_SR_NR returns an operand of either operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
 //
 // `mode` is used to extend the operand if the bit length is smaller than mode.bits().
-func (m *machine) getOperand_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
-	if def.IsFromBlockParam() {
-		return operandNR(def.BlkParamVReg)
+func (m *machine) getOperand_ER_SR_NR(def backend.SSAValueDefinition, mode extMode) (op operand) {
+	if !def.IsFromInstr() {
+		return operandNR(m.compiler.VRegOf(def.V))
 	}
 
 	if m.compiler.MatchInstr(def, ssa.OpcodeSExtend) || m.compiler.MatchInstr(def, ssa.OpcodeUExtend) {
@@ -251,9 +251,9 @@ func (m *machine) getOperand_ER_SR_NR(def *backend.SSAValueDefinition, mode extM
 // getOperand_SR_NR returns an operand of either operandKindSR or operandKindNR from the given value (defined by `def`).
 //
 // `mode` is used to extend the operand if the bit length is smaller than mode.bits().
-func (m *machine) getOperand_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
-	if def.IsFromBlockParam() {
-		return operandNR(def.BlkParamVReg)
+func (m *machine) getOperand_SR_NR(def backend.SSAValueDefinition, mode extMode) (op operand) {
+	if !def.IsFromInstr() {
+		return operandNR(m.compiler.VRegOf(def.V))
 	}
 
 	if m.compiler.MatchInstr(def, ssa.OpcodeIshl) {
@@ -273,9 +273,9 @@ func (m *machine) getOperand_SR_NR(def *backend.SSAValueDefinition, mode extMode
 }
 
 // getOperand_ShiftImm_NR returns an operand of either operandKindShiftImm or operandKindNR from the given value (defined by `def`).
-func (m *machine) getOperand_ShiftImm_NR(def *backend.SSAValueDefinition, mode extMode, shiftBitWidth byte) (op operand) {
-	if def.IsFromBlockParam() {
-		return operandNR(def.BlkParamVReg)
+func (m *machine) getOperand_ShiftImm_NR(def backend.SSAValueDefinition, mode extMode, shiftBitWidth byte) (op operand) {
+	if !def.IsFromInstr() {
+		return operandNR(m.compiler.VRegOf(def.V))
 	}
 
 	instr := def.Instr
@@ -289,28 +289,18 @@ func (m *machine) getOperand_ShiftImm_NR(def *backend.SSAValueDefinition, mode e
 // getOperand_NR returns an operand of operandKindNR from the given value (defined by `def`).
 //
 // `mode` is used to extend the operand if the bit length is smaller than mode.bits().
-func (m *machine) getOperand_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
+func (m *machine) getOperand_NR(def backend.SSAValueDefinition, mode extMode) (op operand) {
 	var v regalloc.VReg
-	if def.IsFromBlockParam() {
-		v = def.BlkParamVReg
+	if def.IsFromInstr() && def.Instr.Constant() {
+		// We inline all the constant instructions so that we can reduce register usage.
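+		// Rematerializing a constant at each use keeps its live range short, so the
+		// allocator never has to dedicate a register to it across unrelated code.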
+ v = m.lowerConstant(def.Instr) + def.Instr.MarkLowered() } else { - instr := def.Instr - if instr.Constant() { - // We inline all the constant instructions so that we could reduce the register usage. - v = m.lowerConstant(instr) - instr.MarkLowered() - } else { - if n := def.N; n == 0 { - v = m.compiler.VRegOf(instr.Return()) - } else { - _, rs := instr.Returns() - v = m.compiler.VRegOf(rs[n-1]) - } - } + v = m.compiler.VRegOf(def.V) } r := v - switch inBits := def.SSAValue().Type().Bits(); { + switch inBits := def.V.Type().Bits(); { case mode == extModeNone: case inBits == 32 && (mode == extModeZeroExtend32 || mode == extModeSignExtend32): case inBits == 32 && mode == extModeZeroExtend64: diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go index 4842eaa382..fd0760d723 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go @@ -24,6 +24,14 @@ type ( addressModeKind byte ) +func resetAddressMode(a *addressMode) { + a.kind = 0 + a.rn = 0 + a.rm = 0 + a.extOp = 0 + a.imm = 0 +} + const ( // addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended, // and then scaled by bits(type)/8. @@ -140,15 +148,17 @@ func (a addressMode) format(dstSizeBits byte) (ret string) { return } -func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode { +func addressModePreOrPostIndex(m *machine, rn regalloc.VReg, imm int64, preIndex bool) *addressMode { if !offsetFitsInAddressModeKindRegSignedImm9(imm) { panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm)) } + mode := m.amodePool.Allocate() if preIndex { - return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm} + *mode = addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm} } else { - return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm} + *mode = addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm} } + return mode } func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool { @@ -207,9 +217,9 @@ func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret amode := m.lowerToAddressMode(ptr, offset, size) load := m.allocateInstr() if signed { - load.asSLoad(operandNR(ret), amode, size) + load.asSLoad(ret, amode, size) } else { - load.asULoad(operandNR(ret), amode, size) + load.asULoad(ret, amode, size) } m.insert(load) } @@ -221,11 +231,11 @@ func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa. 
load := m.allocateInstr() switch typ { case ssa.TypeI32, ssa.TypeI64: - load.asULoad(operandNR(dst), amode, typ.Bits()) + load.asULoad(dst, amode, typ.Bits()) case ssa.TypeF32, ssa.TypeF64: - load.asFpuLoad(operandNR(dst), amode, typ.Bits()) + load.asFpuLoad(dst, amode, typ.Bits()) case ssa.TypeV128: - load.asFpuLoad(operandNR(dst), amode, 128) + load.asFpuLoad(dst, amode, 128) default: panic("TODO") } @@ -239,7 +249,7 @@ func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, m.lowerConstantI64(offsetReg, int64(offset)) addedBase := m.addReg64ToReg64(base, offsetReg) - rd := operandNR(m.compiler.VRegOf(ret)) + rd := m.compiler.VRegOf(ret) ld1r := m.allocateInstr() ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane)) @@ -258,7 +268,7 @@ func (m *machine) lowerStore(si *ssa.Instruction) { } // lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions. -func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) { +func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode *addressMode) { // TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and // addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed // to support more efficient address resolution. @@ -272,32 +282,33 @@ func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte // During the construction, this might emit additional instructions. // // Extracted as a separate function for easy testing. -func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) { +func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode *addressMode) { + amode = m.amodePool.Allocate() switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); { case a64sExist && a32sExist: var base regalloc.VReg base = a64s.Dequeue() var a32 addend32 a32 = a32s.Dequeue() - amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext} + *amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext} case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset): var base regalloc.VReg base = a64s.Dequeue() - amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset} + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset} offset = 0 case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset): var base regalloc.VReg base = a64s.Dequeue() - amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset} + *amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset} offset = 0 case a64sExist: var base regalloc.VReg base = a64s.Dequeue() if !a64s.Empty() { index := a64s.Dequeue() - amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */} + *amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */} } else { - amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} } case a32sExist: base32 := 
a32s.Dequeue() @@ -314,14 +325,14 @@ func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], if !a32s.Empty() { index := a32s.Dequeue() - amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext} + *amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext} } else { - amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} } default: // Only static offsets. tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) m.lowerConstantI64(tmpReg, offset) - amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0} + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0} offset = 0 } @@ -411,13 +422,13 @@ func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) { rd = m.compiler.AllocateVReg(ssa.TypeI64) alu := m.allocateInstr() if imm12Op, ok := asImm12Operand(uint64(c)); ok { - alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true) + alu.asALU(aluOpAdd, rd, operandNR(r), imm12Op, true) } else if imm12Op, ok = asImm12Operand(uint64(-c)); ok { - alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true) + alu.asALU(aluOpSub, rd, operandNR(r), imm12Op, true) } else { tmp := m.compiler.AllocateVReg(ssa.TypeI64) m.load64bitConst(c, tmp) - alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true) + alu.asALU(aluOpAdd, rd, operandNR(r), operandNR(tmp), true) } m.insert(alu) return @@ -426,7 +437,7 @@ func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) { func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) { rd = m.compiler.AllocateVReg(ssa.TypeI64) alu := m.allocateInstr() - alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true) + alu.asALU(aluOpAdd, rd, operandNR(rn), operandNR(rm), true) m.insert(alu) return } @@ -434,7 +445,7 @@ func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) { func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) { rd = m.compiler.AllocateVReg(ssa.TypeI64) alu := m.allocateInstr() - alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true) + alu.asALU(aluOpAdd, rd, operandNR(rn), operandER(rm, ext, 64), true) m.insert(alu) return } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go index b435d9ba96..00e6b238f9 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go @@ -3,6 +3,7 @@ package arm64 import ( "context" "fmt" + "math" "strings" "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" @@ -14,12 +15,35 @@ import ( type ( // machine implements backend.Machine. machine struct { - compiler backend.Compiler - executableContext *backend.ExecutableContextT[instruction] - currentABI *backend.FunctionABI - - regAlloc regalloc.Allocator - regAllocFn *backend.RegAllocFunction[*instruction, *machine] + compiler backend.Compiler + currentABI *backend.FunctionABI + instrPool wazevoapi.Pool[instruction] + // labelPositionPool is the pool of labelPosition. The id is the label where + // if the label is less than the maxSSABlockID, it's the ssa.BasicBlockID. 
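+	// Labels greater than maxSSABlockID are branch targets allocated by the machine
+	// itself via nextLabel (see allocateBrTarget).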
+	labelPositionPool wazevoapi.IDedPool[labelPosition]
+
+	// nextLabel is the next label to be allocated. The first free label comes after maxSSABlockID
+	// so that we can have an identical label for the SSA block ID, which is useful for debugging.
+	nextLabel label
+	// rootInstr is the first instruction of the function.
+	rootInstr *instruction
+	// currentLabelPos is the currently-compiled ssa.BasicBlock's labelPosition.
+	currentLabelPos *labelPosition
+	// orderedSSABlockLabelPos is the ordered list of labelPosition in the generated code for each ssa.BasicBlock.
+	orderedSSABlockLabelPos []*labelPosition
+	// returnLabelPos is the labelPosition for the return block.
+	returnLabelPos labelPosition
+	// perBlockHead and perBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock.
+	perBlockHead, perBlockEnd *instruction
+	// pendingInstructions are the instructions which are not yet emitted into the instruction list.
+	pendingInstructions []*instruction
+	// maxSSABlockID is the maximum ssa.BasicBlockID in the current function.
+	maxSSABlockID label
+
+	regAlloc   regalloc.Allocator[*instruction, *labelPosition, *regAllocFn]
+	regAllocFn regAllocFn
+
+	amodePool wazevoapi.Pool[addressMode]
 
 		// addendsWorkQueue is used during address lowering, defined here for reuse.
 		addendsWorkQueue wazevoapi.Queue[ssa.Value]
@@ -33,6 +57,8 @@ type (
 
 		// jmpTableTargets holds the labels of the jump table targets.
 		jmpTableTargets [][]uint32
+		// jmpTableTargetsNext is the index into the jmpTableTargets slice to be used for the next jump table.
+		jmpTableTargetsNext int
 
 		// spillSlotSize is the size of the stack slot in bytes used for spilling registers.
 		// During the execution of the function, the stack looks like:
@@ -89,44 +115,132 @@ type (
 		nextLabel label
 		offset    int64
 	}
+)
-	labelPosition = backend.LabelPosition[instruction]
-	label         = backend.Label
+type (
+	// label represents a position in the generated code which is either
+	// a real instruction or the constant pool (e.g. jump tables).
+	//
+	// This is exactly the same as the traditional "label" in assembly code.
+	label uint32
+
+	// labelPosition represents the region of the generated code covered by a label.
+	// This implements regalloc.Block.
+	labelPosition struct {
+		// sb is not nil if this corresponds to a ssa.BasicBlock.
+		sb ssa.BasicBlock
+		// cur is used to walk through the instructions in the block during the register allocation.
+		cur,
+		// begin and end are the first and last instructions of the block.
+		begin, end *instruction
+		// binaryOffset is the offset in the binary where the label is located.
+		binaryOffset int64
+	}
 )
 
 const (
-	labelReturn  = backend.LabelReturn
-	labelInvalid = backend.LabelInvalid
+	labelReturn  label = math.MaxUint32
+	labelInvalid       = labelReturn - 1
 )
 
+// String implements fmt.Stringer.
+func (l label) String() string {
+	return fmt.Sprintf("L%d", l)
+}
+
+func resetLabelPosition(l *labelPosition) {
+	*l = labelPosition{}
+}
+
 // NewBackend returns a new backend for arm64.
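// The instruction, address-mode, and labelPosition pools are constructed with reset
// hooks so a single machine value can be reused across compilations without reallocating.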
func NewBackend() backend.Machine {
 	m := &machine{
 		spillSlots:        make(map[regalloc.VRegID]int64),
-		executableContext: newExecutableContext(),
-		regAlloc:          regalloc.NewAllocator(regInfo),
+		regAlloc:          regalloc.NewAllocator[*instruction, *labelPosition, *regAllocFn](regInfo),
+		amodePool:         wazevoapi.NewPool[addressMode](resetAddressMode),
+		instrPool:         wazevoapi.NewPool[instruction](resetInstruction),
+		labelPositionPool: wazevoapi.NewIDedPool[labelPosition](resetLabelPosition),
 	}
+	m.regAllocFn.m = m
 	return m
 }
 
-func newExecutableContext() *backend.ExecutableContextT[instruction] {
-	return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0)
+func ssaBlockLabel(sb ssa.BasicBlock) label {
+	if sb.ReturnBlock() {
+		return labelReturn
+	}
+	return label(sb.ID())
+}
+
+// getOrAllocateSSABlockLabelPosition returns the labelPosition for the given basic block.
+func (m *machine) getOrAllocateSSABlockLabelPosition(sb ssa.BasicBlock) *labelPosition {
+	if sb.ReturnBlock() {
+		m.returnLabelPos.sb = sb
+		return &m.returnLabelPos
+	}
+
+	l := ssaBlockLabel(sb)
+	pos := m.labelPositionPool.GetOrAllocate(int(l))
+	pos.sb = sb
+	return pos
 }
 
-// ExecutableContext implements backend.Machine.
-func (m *machine) ExecutableContext() backend.ExecutableContext {
-	return m.executableContext
+// LinkAdjacentBlocks implements backend.Machine.
+func (m *machine) LinkAdjacentBlocks(prev, next ssa.BasicBlock) {
+	prevPos, nextPos := m.getOrAllocateSSABlockLabelPosition(prev), m.getOrAllocateSSABlockLabelPosition(next)
+	prevPos.end.next = nextPos.begin
 }
 
-// RegAlloc implements backend.Machine Function.
-func (m *machine) RegAlloc() {
-	rf := m.regAllocFn
-	for _, pos := range m.executableContext.OrderedBlockLabels {
-		rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End)
+// StartBlock implements backend.Machine.
+func (m *machine) StartBlock(blk ssa.BasicBlock) {
+	m.currentLabelPos = m.getOrAllocateSSABlockLabelPosition(blk)
+	labelPos := m.currentLabelPos
+	end := m.allocateNop()
+	m.perBlockHead, m.perBlockEnd = end, end
+	labelPos.begin, labelPos.end = end, end
+	m.orderedSSABlockLabelPos = append(m.orderedSSABlockLabelPos, labelPos)
+}
+
+// EndBlock implements backend.Machine.
+func (m *machine) EndBlock() {
+	// Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions.
+	m.insertAtPerBlockHead(m.allocateNop())
+
+	m.currentLabelPos.begin = m.perBlockHead
+
+	if m.currentLabelPos.sb.EntryBlock() {
+		m.rootInstr = m.perBlockHead
 	}
+}
 
+func (m *machine) insertAtPerBlockHead(i *instruction) {
+	if m.perBlockHead == nil {
+		m.perBlockHead = i
+		m.perBlockEnd = i
+		return
+	}
+
+	i.next = m.perBlockHead
+	m.perBlockHead.prev = i
+	m.perBlockHead = i
+}
+
+// FlushPendingInstructions implements backend.Machine.
+func (m *machine) FlushPendingInstructions() {
+	l := len(m.pendingInstructions)
+	if l == 0 {
+		return
+	}
+	for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order.
+		m.insertAtPerBlockHead(m.pendingInstructions[i])
+	}
+	m.pendingInstructions = m.pendingInstructions[:0]
+}
+
+// RegAlloc implements backend.Machine Function.
+func (m *machine) RegAlloc() {
 	m.regAllocStarted = true
-	m.regAlloc.DoAllocation(rf)
+	m.regAlloc.DoAllocation(&m.regAllocFn)
 	// Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes.
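	// (x + 15) &^ 15 clears the low four bits of x + 15, i.e. rounds x up to the next
	// multiple of 16: 0 -> 0, 1 -> 16, 17 -> 32.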
m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 } @@ -143,12 +257,22 @@ func (m *machine) Reset() { m.clobberedRegs = m.clobberedRegs[:0] m.regAllocStarted = false m.regAlloc.Reset() - m.regAllocFn.Reset() m.spillSlotSize = 0 m.unresolvedAddressModes = m.unresolvedAddressModes[:0] m.maxRequiredStackSizeForCalls = 0 - m.executableContext.Reset() - m.jmpTableTargets = m.jmpTableTargets[:0] + m.jmpTableTargetsNext = 0 + m.amodePool.Reset() + m.instrPool.Reset() + m.labelPositionPool.Reset() + m.pendingInstructions = m.pendingInstructions[:0] + m.perBlockHead, m.perBlockEnd, m.rootInstr = nil, nil, nil + m.orderedSSABlockLabelPos = m.orderedSSABlockLabelPos[:0] +} + +// StartLoweringFunction implements backend.Machine StartLoweringFunction. +func (m *machine) StartLoweringFunction(maxBlockID ssa.BasicBlockID) { + m.maxSSABlockID = label(maxBlockID) + m.nextLabel = label(maxBlockID) + 1 } // SetCurrentABI implements backend.Machine SetCurrentABI. @@ -164,12 +288,11 @@ func (m *machine) DisableStackCheck() { // SetCompiler implements backend.Machine. func (m *machine) SetCompiler(ctx backend.Compiler) { m.compiler = ctx - m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, ctx.SSABuilder(), ctx) + m.regAllocFn.ssaB = ctx.SSABuilder() } func (m *machine) insert(i *instruction) { - ectx := m.executableContext - ectx.PendingInstructions = append(ectx.PendingInstructions, i) + m.pendingInstructions = append(m.pendingInstructions, i) } func (m *machine) insertBrTargetLabel() label { @@ -179,19 +302,18 @@ func (m *machine) insertBrTargetLabel() label { } func (m *machine) allocateBrTarget() (nop *instruction, l label) { - ectx := m.executableContext - l = ectx.AllocateLabel() + l = m.nextLabel + m.nextLabel++ nop = m.allocateInstr() nop.asNop0WithLabel(l) - pos := ectx.AllocateLabelPosition(l) - pos.Begin, pos.End = nop, nop - ectx.LabelPositions[l] = pos + pos := m.labelPositionPool.GetOrAllocate(int(l)) + pos.begin, pos.end = nop, nop return } // allocateInstr allocates an instruction. func (m *machine) allocateInstr() *instruction { - instr := m.executableContext.InstructionPool.Allocate() + instr := m.instrPool.Allocate() if !m.regAllocStarted { instr.addedBeforeRegAlloc = true } @@ -209,7 +331,7 @@ func (m *machine) allocateNop() *instruction { } func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) { - amode := &i.amode + amode := i.getAmode() switch amode.kind { case addressModeKindResultStackSpace: amode.imm += ret0offset @@ -248,7 +370,6 @@ func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruc // resolveRelativeAddresses resolves the relative addresses before encoding. 
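// It runs as a fixed-point loop: whenever a conditional branch target does not fit the
// signed 19-bit offset field, a trampoline is appended to the branch's block and the
// enclosing loop recomputes all offsets from scratch.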
func (m *machine) resolveRelativeAddresses(ctx context.Context) { - ectx := m.executableContext for { if len(m.unresolvedAddressModes) > 0 { arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP() @@ -262,35 +383,36 @@ func (m *machine) resolveRelativeAddresses(ctx context.Context) { var fn string var fnIndex int - var labelToSSABlockID map[label]ssa.BasicBlockID + var labelPosToLabel map[*labelPosition]label if wazevoapi.PerfMapEnabled { - fn = wazevoapi.GetCurrentFunctionName(ctx) - labelToSSABlockID = make(map[label]ssa.BasicBlockID) - for i, l := range ectx.SsaBlockIDToLabels { - labelToSSABlockID[l] = ssa.BasicBlockID(i) + labelPosToLabel = make(map[*labelPosition]label) + for i := 0; i <= m.labelPositionPool.MaxIDEncountered(); i++ { + labelPosToLabel[m.labelPositionPool.Get(i)] = label(i) } + + fn = wazevoapi.GetCurrentFunctionName(ctx) fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) } // Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label. var offset int64 - for i, pos := range ectx.OrderedBlockLabels { - pos.BinaryOffset = offset + for i, pos := range m.orderedSSABlockLabelPos { + pos.binaryOffset = offset var size int64 - for cur := pos.Begin; ; cur = cur.next { + for cur := pos.begin; ; cur = cur.next { switch cur.kind { case nop0: l := cur.nop0Label() - if pos, ok := ectx.LabelPositions[l]; ok { - pos.BinaryOffset = offset + size + if pos := m.labelPositionPool.Get(int(l)); pos != nil { + pos.binaryOffset = offset + size } case condBr: if !cur.condBrOffsetResolved() { var nextLabel label - if i < len(ectx.OrderedBlockLabels)-1 { + if i < len(m.orderedSSABlockLabelPos)-1 { // Note: this is only used when the block ends with fallthrough, // therefore can be safely assumed that the next block exists when it's needed. - nextLabel = ectx.OrderedBlockLabels[i+1].L + nextLabel = ssaBlockLabel(m.orderedSSABlockLabelPos[i+1].sb) } m.condBrRelocs = append(m.condBrRelocs, condBrReloc{ cbr: cur, currentLabelPos: pos, offset: offset + size, @@ -299,21 +421,14 @@ func (m *machine) resolveRelativeAddresses(ctx context.Context) { } } size += cur.size() - if cur == pos.End { + if cur == pos.end { break } } if wazevoapi.PerfMapEnabled { if size > 0 { - l := pos.L - var labelStr string - if blkID, ok := labelToSSABlockID[l]; ok { - labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID) - } else { - labelStr = l.String() - } - wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr)) + wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelPosToLabel[pos])) } } offset += size @@ -327,7 +442,7 @@ func (m *machine) resolveRelativeAddresses(ctx context.Context) { offset := reloc.offset target := cbr.condBrLabel() - offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + offsetOfTarget := m.labelPositionPool.Get(int(target)).binaryOffset diff := offsetOfTarget - offset if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { // This case the conditional branch is too huge. 
We place the trampoline instructions at the end of the current block, @@ -348,11 +463,11 @@ func (m *machine) resolveRelativeAddresses(ctx context.Context) { } var currentOffset int64 - for cur := ectx.RootInstr; cur != nil; cur = cur.next { + for cur := m.rootInstr; cur != nil; cur = cur.next { switch cur.kind { case br: target := cur.brLabel() - offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + offsetOfTarget := m.labelPositionPool.Get(int(target)).binaryOffset diff := offsetOfTarget - currentOffset divided := diff >> 2 if divided < minSignedInt26 || divided > maxSignedInt26 { @@ -363,7 +478,7 @@ func (m *machine) resolveRelativeAddresses(ctx context.Context) { case condBr: if !cur.condBrOffsetResolved() { target := cur.condBrLabel() - offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + offsetOfTarget := m.labelPositionPool.Get(int(target)).binaryOffset diff := offsetOfTarget - currentOffset if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly") @@ -375,7 +490,7 @@ func (m *machine) resolveRelativeAddresses(ctx context.Context) { targets := m.jmpTableTargets[tableIndex] for i := range targets { l := label(targets[i]) - offsetOfTarget := ectx.LabelPositions[l].BinaryOffset + offsetOfTarget := m.labelPositionPool.Get(int(l)).binaryOffset diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin) targets[i] = uint32(diff) } @@ -396,7 +511,7 @@ const ( ) func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) { - cur := currentBlk.End + cur := currentBlk.end originalTarget := cbr.condBrLabel() endNext := cur.next @@ -419,30 +534,27 @@ func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk * cur = linkInstr(cur, br) // Update the end of the current block. - currentBlk.End = cur + currentBlk.end = cur linkInstr(cur, endNext) } // Format implements backend.Machine. func (m *machine) Format() string { - ectx := m.executableContext begins := map[*instruction]label{} - for l, pos := range ectx.LabelPositions { - begins[pos.Begin] = l - } - - irBlocks := map[label]ssa.BasicBlockID{} - for i, l := range ectx.SsaBlockIDToLabels { - irBlocks[l] = ssa.BasicBlockID(i) + for l := label(0); l < m.nextLabel; l++ { + pos := m.labelPositionPool.Get(int(l)) + if pos != nil { + begins[pos.begin] = l + } } var lines []string - for cur := ectx.RootInstr; cur != nil; cur = cur.next { + for cur := m.rootInstr; cur != nil; cur = cur.next { if l, ok := begins[cur]; ok { var labelStr string - if blkID, ok := irBlocks[l]; ok { - labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) + if l <= m.maxSSABlockID { + labelStr = fmt.Sprintf("%s (SSA Block: blk%d):", l, int(l)) } else { labelStr = fmt.Sprintf("%s:", l) } @@ -503,13 +615,17 @@ func (m *machine) frameSize() int64 { return s } -func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) { - // TODO: reuse the slice! 
- labels := make([]uint32, len(targets)) - for j, target := range targets { - labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target)) +func (m *machine) addJmpTableTarget(targets ssa.Values) (index int) { + if m.jmpTableTargetsNext == len(m.jmpTableTargets) { + m.jmpTableTargets = append(m.jmpTableTargets, make([]uint32, 0, len(targets.View()))) + } + + index = m.jmpTableTargetsNext + m.jmpTableTargetsNext++ + m.jmpTableTargets[index] = m.jmpTableTargets[index][:0] + for _, targetBlockID := range targets.View() { + target := m.compiler.SSABuilder().BasicBlock(ssa.BasicBlockID(targetBlockID)) + m.jmpTableTargets[index] = append(m.jmpTableTargets[index], uint32(target.ID())) } - index = len(m.jmpTableTargets) - m.jmpTableTargets = append(m.jmpTableTargets, labels) return } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go index 466fac4640..c646a8fab0 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go @@ -15,9 +15,7 @@ func (m *machine) PostRegAlloc() { // setupPrologue initializes the prologue of the function. func (m *machine) setupPrologue() { - ectx := m.executableContext - - cur := ectx.RootInstr + cur := m.rootInstr prevInitInst := cur.next // @@ -70,7 +68,7 @@ func (m *machine) setupPrologue() { // +-----------------+ <----- SP // (low address) // - _amode := addressModePreOrPostIndex(spVReg, + _amode := addressModePreOrPostIndex(m, spVReg, -16, // stack pointer must be 16-byte aligned. true, // Decrement before store. ) @@ -159,7 +157,7 @@ func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruc sizeOfArgRetReg = tmpRegVReg subSp := m.allocateInstr() - subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true) + subSp.asALU(aluOpSub, spVReg, operandNR(spVReg), operandNR(sizeOfArgRetReg), true) cur = linkInstr(cur, subSp) } else { sizeOfArgRetReg = xzrVReg @@ -168,7 +166,7 @@ func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruc // Saves the return address (lr) and the size_of_arg_ret below the SP. // size_of_arg_ret is used for stack unwinding. pstr := m.allocateInstr() - amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */) + amode := addressModePreOrPostIndex(m, spVReg, -16, true /* decrement before store */) pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode) cur = linkInstr(cur, pstr) return cur @@ -182,7 +180,7 @@ func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction { } else { frameSizeReg = xzrVReg } - _amode := addressModePreOrPostIndex(spVReg, + _amode := addressModePreOrPostIndex(m, spVReg, -16, // stack pointer must be 16-byte aligned. true, // Decrement before store. ) @@ -196,24 +194,23 @@ func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction { // 1. Removes the redundant copy instruction. // 2. Inserts the epilogue. 
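// 3. Lowers the loadConstBlockArg instructions that were deferred until after register allocation.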
func (m *machine) postRegAlloc() { - ectx := m.executableContext - for cur := ectx.RootInstr; cur != nil; cur = cur.next { + for cur := m.rootInstr; cur != nil; cur = cur.next { switch cur.kind { case ret: m.setupEpilogueAfter(cur.prev) case loadConstBlockArg: lc := cur next := lc.next - m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0] + m.pendingInstructions = m.pendingInstructions[:0] m.lowerLoadConstantBlockArgAfterRegAlloc(lc) - for _, instr := range m.executableContext.PendingInstructions { + for _, instr := range m.pendingInstructions { cur = linkInstr(cur, instr) } linkInstr(cur, next) - m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0] + m.pendingInstructions = m.pendingInstructions[:0] default: // Removes the redundant copy instruction. - if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() { + if cur.IsCopy() && cur.rn.realReg() == cur.rd.RealReg() { prev, next := cur.prev, cur.next // Remove the copy instruction. prev.next = next @@ -286,16 +283,16 @@ func (m *machine) setupEpilogueAfter(cur *instruction) { for i := range m.clobberedRegs { vr := m.clobberedRegs[l-i] // reverse order to restore. load := m.allocateInstr() - amode := addressModePreOrPostIndex(spVReg, + amode := addressModePreOrPostIndex(m, spVReg, 16, // stack pointer must be 16-byte aligned. false, // Increment after store. ) // TODO: pair loads to reduce the number of instructions. switch regTypeToRegisterSizeInBits(vr.RegType()) { case 64: // save int reg. - load.asULoad(operandNR(vr), amode, 64) + load.asULoad(vr, amode, 64) case 128: // save vector reg. - load.asFpuLoad(operandNR(vr), amode, 128) + load.asFpuLoad(vr, amode, 128) } cur = linkInstr(cur, load) } @@ -317,8 +314,8 @@ func (m *machine) setupEpilogueAfter(cur *instruction) { // SP----> +-----------------+ ldr := m.allocateInstr() - ldr.asULoad(operandNR(lrVReg), - addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) + ldr.asULoad(lrVReg, + addressModePreOrPostIndex(m, spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) cur = linkInstr(cur, ldr) if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 { @@ -351,14 +348,14 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok { // sub tmp, sp, #requiredStackSize sub := m.allocateInstr() - sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true) + sub.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), immm12op, true) cur = linkInstr(cur, sub) } else { // This case, we first load the requiredStackSize into the temporary register, cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) // Then subtract it. sub := m.allocateInstr() - sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true) + sub.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), operandNR(tmpRegVReg), true) cur = linkInstr(cur, sub) } @@ -366,16 +363,18 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi // ldr tmp2, [executionContext #StackBottomPtr] ldr := m.allocateInstr() - ldr.asULoad(operandNR(tmp2), addressMode{ + amode := m.amodePool.Allocate() + *amode = addressMode{ kind: addressModeKindRegUnsignedImm12, rn: x0VReg, // execution context is always the first argument. 
imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(), - }, 64) + } + ldr.asULoad(tmp2, amode, 64) cur = linkInstr(cur, ldr) // subs xzr, tmp, tmp2 subs := m.allocateInstr() - subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true) + subs.asALU(aluOpSubS, xzrVReg, operandNR(tmpRegVReg), operandNR(tmp2), true) cur = linkInstr(cur, subs) // b.ge #imm @@ -388,22 +387,25 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi // First load the requiredStackSize into the temporary register, cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) setRequiredStackSize := m.allocateInstr() - setRequiredStackSize.asStore(operandNR(tmpRegVReg), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(), - }, 64) + amode := m.amodePool.Allocate() + *amode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(), + } + setRequiredStackSize.asStore(operandNR(tmpRegVReg), amode, 64) cur = linkInstr(cur, setRequiredStackSize) } ldrAddress := m.allocateInstr() - ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{ + amode2 := m.amodePool.Allocate() + *amode2 = addressMode{ kind: addressModeKindRegUnsignedImm12, rn: x0VReg, // execution context is always the first argument imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(), - }, 64) + } + ldrAddress.asULoad(tmpRegVReg, amode2, 64) cur = linkInstr(cur, ldrAddress) // Then jumps to the stack grow call sequence's address, meaning @@ -427,11 +429,9 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi // CompileStackGrowCallSequence implements backend.Machine. func (m *machine) CompileStackGrowCallSequence() []byte { - ectx := m.executableContext - cur := m.allocateInstr() cur.asNop0() - ectx.RootInstr = cur + m.rootInstr = cur // Save the callee saved and argument registers. cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs) @@ -453,16 +453,14 @@ func (m *machine) CompileStackGrowCallSequence() []byte { ret.asRet() linkInstr(cur, ret) - m.encode(ectx.RootInstr) + m.encode(m.rootInstr) return m.compiler.Buf() } func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction { - ectx := m.executableContext - - ectx.PendingInstructions = ectx.PendingInstructions[:0] + m.pendingInstructions = m.pendingInstructions[:0] m.insertAddOrSubStackPointer(rd, diff, add) - for _, inserted := range ectx.PendingInstructions { + for _, inserted := range m.pendingInstructions { cur = linkInstr(cur, inserted) } return cur diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go index 1c8793b73d..f2ed53ae55 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go @@ -3,18 +3,226 @@ package arm64 // This file implements the interfaces required for register allocations. See backend.RegAllocFunctionMachine. 
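// (The generic backend.RegAllocFunction/RegAllocFunctionMachine wrappers are gone;
// regAllocFn and labelPosition below implement regalloc.Function and regalloc.Block directly.)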
import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" ) -// ClobberedRegisters implements backend.RegAllocFunctionMachine. -func (m *machine) ClobberedRegisters(regs []regalloc.VReg) { - m.clobberedRegs = append(m.clobberedRegs[:0], regs...) +// regAllocFn implements regalloc.Function. +type regAllocFn struct { + ssaB ssa.Builder + m *machine + loopNestingForestRoots []ssa.BasicBlock + blockIter int } -// Swap implements backend.RegAllocFunctionMachine. -func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) { +// PostOrderBlockIteratorBegin implements regalloc.Function. +func (f *regAllocFn) PostOrderBlockIteratorBegin() *labelPosition { + f.blockIter = len(f.m.orderedSSABlockLabelPos) - 1 + return f.PostOrderBlockIteratorNext() +} + +// PostOrderBlockIteratorNext implements regalloc.Function. +func (f *regAllocFn) PostOrderBlockIteratorNext() *labelPosition { + if f.blockIter < 0 { + return nil + } + b := f.m.orderedSSABlockLabelPos[f.blockIter] + f.blockIter-- + return b +} + +// ReversePostOrderBlockIteratorBegin implements regalloc.Function. +func (f *regAllocFn) ReversePostOrderBlockIteratorBegin() *labelPosition { + f.blockIter = 0 + return f.ReversePostOrderBlockIteratorNext() +} + +// ReversePostOrderBlockIteratorNext implements regalloc.Function. +func (f *regAllocFn) ReversePostOrderBlockIteratorNext() *labelPosition { + if f.blockIter >= len(f.m.orderedSSABlockLabelPos) { + return nil + } + b := f.m.orderedSSABlockLabelPos[f.blockIter] + f.blockIter++ + return b +} + +// ClobberedRegisters implements regalloc.Function. +func (f *regAllocFn) ClobberedRegisters(regs []regalloc.VReg) { + f.m.clobberedRegs = append(f.m.clobberedRegs[:0], regs...) +} + +// LoopNestingForestRoots implements regalloc.Function. +func (f *regAllocFn) LoopNestingForestRoots() int { + f.loopNestingForestRoots = f.ssaB.LoopNestingForestRoots() + return len(f.loopNestingForestRoots) +} + +// LoopNestingForestRoot implements regalloc.Function. +func (f *regAllocFn) LoopNestingForestRoot(i int) *labelPosition { + root := f.loopNestingForestRoots[i] + pos := f.m.getOrAllocateSSABlockLabelPosition(root) + return pos +} + +// LowestCommonAncestor implements regalloc.Function. +func (f *regAllocFn) LowestCommonAncestor(blk1, blk2 *labelPosition) *labelPosition { + sb := f.ssaB.LowestCommonAncestor(blk1.sb, blk2.sb) + pos := f.m.getOrAllocateSSABlockLabelPosition(sb) + return pos +} + +// Idom implements regalloc.Function. +func (f *regAllocFn) Idom(blk *labelPosition) *labelPosition { + sb := f.ssaB.Idom(blk.sb) + pos := f.m.getOrAllocateSSABlockLabelPosition(sb) + return pos +} + +// SwapBefore implements regalloc.Function. +func (f *regAllocFn) SwapBefore(x1, x2, tmp regalloc.VReg, instr *instruction) { + f.m.swap(instr.prev, x1, x2, tmp) +} + +// StoreRegisterBefore implements regalloc.Function. +func (f *regAllocFn) StoreRegisterBefore(v regalloc.VReg, instr *instruction) { + m := f.m + m.insertStoreRegisterAt(v, instr, false) +} + +// StoreRegisterAfter implements regalloc.Function. +func (f *regAllocFn) StoreRegisterAfter(v regalloc.VReg, instr *instruction) { + m := f.m + m.insertStoreRegisterAt(v, instr, true) +} + +// ReloadRegisterBefore implements regalloc.Function. 
+func (f *regAllocFn) ReloadRegisterBefore(v regalloc.VReg, instr *instruction) {
+	m := f.m
+	m.insertReloadRegisterAt(v, instr, false)
+}
+
+// ReloadRegisterAfter implements regalloc.Function.
+func (f *regAllocFn) ReloadRegisterAfter(v regalloc.VReg, instr *instruction) {
+	m := f.m
+	m.insertReloadRegisterAt(v, instr, true)
+}
+
+// InsertMoveBefore implements regalloc.Function.
+func (f *regAllocFn) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) {
+	f.m.insertMoveBefore(dst, src, instr)
+}
+
+// LoopNestingForestChild implements regalloc.Function.
+func (f *regAllocFn) LoopNestingForestChild(pos *labelPosition, i int) *labelPosition {
+	childSB := pos.sb.LoopNestingForestChildren()[i]
+	return f.m.getOrAllocateSSABlockLabelPosition(childSB)
+}
+
+// Succ implements regalloc.Function.
+func (f *regAllocFn) Succ(pos *labelPosition, i int) *labelPosition {
+	succSB := pos.sb.Succ(i)
+	if succSB.ReturnBlock() {
+		return nil
+	}
+	return f.m.getOrAllocateSSABlockLabelPosition(succSB)
+}
+
+// Pred implements regalloc.Function.
+func (f *regAllocFn) Pred(pos *labelPosition, i int) *labelPosition {
+	predSB := pos.sb.Pred(i)
+	return f.m.getOrAllocateSSABlockLabelPosition(predSB)
+}
+
+// BlockParams implements regalloc.Function.
+func (f *regAllocFn) BlockParams(pos *labelPosition, regs *[]regalloc.VReg) []regalloc.VReg {
+	c := f.m.compiler
+	*regs = (*regs)[:0]
+	for i := 0; i < pos.sb.Params(); i++ {
+		v := c.VRegOf(pos.sb.Param(i))
+		*regs = append(*regs, v)
+	}
+	return *regs
+}
+
+// ID implements regalloc.Block.
+func (pos *labelPosition) ID() int32 {
+	return int32(pos.sb.ID())
+}
+
+// InstrIteratorBegin implements regalloc.Block.
+func (pos *labelPosition) InstrIteratorBegin() *instruction {
+	ret := pos.begin
+	pos.cur = ret
+	return ret
+}
+
+// InstrIteratorNext implements regalloc.Block.
+func (pos *labelPosition) InstrIteratorNext() *instruction {
+	for {
+		if pos.cur == pos.end {
+			return nil
+		}
+		instr := pos.cur.next
+		pos.cur = instr
+		if instr == nil {
+			return nil
+		} else if instr.addedBeforeRegAlloc {
+			// Only concerned about the instructions added before regalloc.
+			return instr
+		}
+	}
+}
+
+// InstrRevIteratorBegin implements regalloc.Block.
+func (pos *labelPosition) InstrRevIteratorBegin() *instruction {
+	pos.cur = pos.end
+	return pos.cur
+}
+
+// InstrRevIteratorNext implements regalloc.Block.
+func (pos *labelPosition) InstrRevIteratorNext() *instruction {
+	for {
+		if pos.cur == pos.begin {
+			return nil
+		}
+		instr := pos.cur.prev
+		pos.cur = instr
+		if instr == nil {
+			return nil
+		} else if instr.addedBeforeRegAlloc {
+			// Only concerned about the instructions added before regalloc.
+			return instr
+		}
+	}
+}
+
+// FirstInstr implements regalloc.Block.
+func (pos *labelPosition) FirstInstr() *instruction { return pos.begin }
+
+// LastInstrForInsertion implements regalloc.Block.
+func (pos *labelPosition) LastInstrForInsertion() *instruction {
+	return lastInstrForInsertion(pos.begin, pos.end)
+}
+
+// Preds implements regalloc.Block.
+func (pos *labelPosition) Preds() int { return pos.sb.Preds() }
+
+// Entry implements regalloc.Block.
+func (pos *labelPosition) Entry() bool { return pos.sb.EntryBlock() }
+
+// Succs implements regalloc.Block.
+func (pos *labelPosition) Succs() int { return pos.sb.Succs() }
+
+// LoopHeader implements regalloc.Block.
+func (pos *labelPosition) LoopHeader() bool { return pos.sb.LoopHeader() }
+
+// LoopNestingForestChildren implements regalloc.Block.
+func (pos *labelPosition) LoopNestingForestChildren() int { + return len(pos.sb.LoopNestingForestChildren()) +} + +func (m *machine) swap(cur *instruction, x1, x2, tmp regalloc.VReg) { prevNext := cur.next var mov1, mov2, mov3 *instruction if x1.RegType() == regalloc.RegTypeInt { @@ -32,12 +240,12 @@ func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) { if !tmp.Valid() { r2 := x2.RealReg() // Temporarily spill x1 to stack. - cur = m.InsertStoreRegisterAt(x1, cur, true).prev + cur = m.insertStoreRegisterAt(x1, cur, true).prev // Then move x2 to x1. cur = linkInstr(cur, m.allocateInstr().asFpuMov128(x1, x2)) linkInstr(cur, prevNext) // Then reload the original value on x1 from stack to r2. - m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true) + m.insertReloadRegisterAt(x1.SetRealReg(r2), cur, true) } else { mov1 = m.allocateInstr().asFpuMov128(tmp, x1) mov2 = m.allocateInstr().asFpuMov128(x1, x2) @@ -50,8 +258,7 @@ func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) { } } -// InsertMoveBefore implements backend.RegAllocFunctionMachine. -func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { +func (m *machine) insertMoveBefore(dst, src regalloc.VReg, instr *instruction) { typ := src.RegType() if typ != dst.RegType() { panic("BUG: src and dst must have the same type") @@ -70,13 +277,7 @@ func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { linkInstr(cur, prevNext) } -// SSABlockLabel implements backend.RegAllocFunctionMachine. -func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label { - return m.executableContext.SsaBlockIDToLabels[id] -} - -// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine. -func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { +func (m *machine) insertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { if !v.IsRealReg() { panic("BUG: VReg must be backed by real reg to be stored") } @@ -91,7 +292,7 @@ func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, aft } offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) - var amode addressMode + var amode *addressMode cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true) store := m.allocateInstr() store.asStore(operandNR(v), amode, typ.Bits()) @@ -100,8 +301,7 @@ func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, aft return linkInstr(cur, prevNext) } -// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine. 
-func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { +func (m *machine) insertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { if !v.IsRealReg() { panic("BUG: VReg must be backed by real reg to be stored") } @@ -116,16 +316,16 @@ func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, af } offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) - var amode addressMode + var amode *addressMode cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true) load := m.allocateInstr() switch typ { case ssa.TypeI32, ssa.TypeI64: - load.asULoad(operandNR(v), amode, typ.Bits()) + load.asULoad(v, amode, typ.Bits()) case ssa.TypeF32, ssa.TypeF64: - load.asFpuLoad(operandNR(v), amode, typ.Bits()) + load.asFpuLoad(v, amode, typ.Bits()) case ssa.TypeV128: - load.asFpuLoad(operandNR(v), amode, 128) + load.asFpuLoad(v, amode, 128) default: panic("TODO") } @@ -134,8 +334,7 @@ func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, af return linkInstr(cur, prevNext) } -// LastInstrForInsertion implements backend.RegAllocFunctionMachine. -func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction { +func lastInstrForInsertion(begin, end *instruction) *instruction { cur := end for cur.kind == nop0 { cur = cur.prev diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go index edb0e36e33..a72b86f6bf 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go @@ -14,7 +14,7 @@ func UnwindStack(sp, _, top uintptr, returnAddresses []uintptr) []uintptr { var stackBuf []byte { - // TODO: use unsafe.Slice after floor version is set to Go 1.20. + //nolint:staticcheck hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf)) hdr.Data = sp hdr.Len = l @@ -78,13 +78,7 @@ func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { // +-----------------+ <---- stackPointerBeforeGoCall // (low address) ptr := unsafe.Pointer(stackPointerBeforeGoCall) + data := (*uint64)(unsafe.Add(ptr, 16)) // skips the (frame_size, sliceSize). size := *(*uint64)(unsafe.Add(ptr, 8)) - var view []uint64 - { - sh := (*reflect.SliceHeader)(unsafe.Pointer(&view)) - sh.Data = uintptr(unsafe.Add(ptr, 16)) // skips the (frame_size, sliceSize). - sh.Len = int(size) - sh.Cap = int(size) - } - return view + return unsafe.Slice(data, size) } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go index 54ce89e468..9044a9e4bc 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go @@ -11,7 +11,24 @@ import ( type ( // Machine is a backend for a specific ISA machine. Machine interface { - ExecutableContext() ExecutableContext + // StartLoweringFunction is called when the compilation of the given function is started. + // The maxBlockID is the maximum ssa.BasicBlockID in the function. 
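+		// Implementations can use maxBlockID to size their label pools so that labels up to
+		// maxBlockID mirror the ssa.BasicBlockID numbering (the arm64 backend does this).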
+ StartLoweringFunction(maxBlockID ssa.BasicBlockID) + + // LinkAdjacentBlocks is called after finished lowering all blocks in order to create one single instruction list. + LinkAdjacentBlocks(prev, next ssa.BasicBlock) + + // StartBlock is called when the compilation of the given block is started. + // The order of this being called is the reverse post order of the ssa.BasicBlock(s) as we iterate with + // ssa.Builder BlockIteratorReversePostOrderBegin and BlockIteratorReversePostOrderEnd. + StartBlock(ssa.BasicBlock) + + // EndBlock is called when the compilation of the current block is finished. + EndBlock() + + // FlushPendingInstructions flushes the pending instructions to the buffer. + // This will be called after the lowering of each SSA Instruction. + FlushPendingInstructions() // DisableStackCheck disables the stack check for the current compilation for debugging/testing. DisableStackCheck() diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go deleted file mode 100644 index 3f36c84e57..0000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go +++ /dev/null @@ -1,319 +0,0 @@ -package backend - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -// RegAllocFunctionMachine is the interface for the machine specific logic that will be used in RegAllocFunction. -type RegAllocFunctionMachine[I regalloc.InstrConstraint] interface { - // InsertMoveBefore inserts the move instruction from src to dst before the given instruction. - InsertMoveBefore(dst, src regalloc.VReg, instr I) - // InsertStoreRegisterAt inserts the instruction(s) to store the given virtual register at the given instruction. - // If after is true, the instruction(s) will be inserted after the given instruction, otherwise before. - InsertStoreRegisterAt(v regalloc.VReg, instr I, after bool) I - // InsertReloadRegisterAt inserts the instruction(s) to reload the given virtual register at the given instruction. - // If after is true, the instruction(s) will be inserted after the given instruction, otherwise before. - InsertReloadRegisterAt(v regalloc.VReg, instr I, after bool) I - // ClobberedRegisters is called when the register allocation is done and the clobbered registers are known. - ClobberedRegisters(regs []regalloc.VReg) - // Swap swaps the two virtual registers after the given instruction. - Swap(cur I, x1, x2, tmp regalloc.VReg) - // LastInstrForInsertion implements LastInstrForInsertion of regalloc.Function. See its comment for details. - LastInstrForInsertion(begin, end I) I - // SSABlockLabel returns the label of the given ssa.BasicBlockID. - SSABlockLabel(id ssa.BasicBlockID) Label -} - -type ( - // RegAllocFunction implements regalloc.Function. - RegAllocFunction[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct { - m m - ssb ssa.Builder - c Compiler - // iter is the iterator for reversePostOrderBlocks - iter int - reversePostOrderBlocks []RegAllocBlock[I, m] - // labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks. - labelToRegAllocBlockIndex map[Label]int - loopNestingForestRoots []ssa.BasicBlock - } - - // RegAllocBlock implements regalloc.Block. - RegAllocBlock[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct { - // f is the function this instruction belongs to. 
Used to reuse the regAllocFunctionImpl.predsSlice slice for Defs() and Uses(). - f *RegAllocFunction[I, m] - sb ssa.BasicBlock - l Label - begin, end I - loopNestingForestChildren []ssa.BasicBlock - cur I - id int - cachedLastInstrForInsertion I - } -) - -// NewRegAllocFunction returns a new RegAllocFunction. -func NewRegAllocFunction[I regalloc.InstrConstraint, M RegAllocFunctionMachine[I]](m M, ssb ssa.Builder, c Compiler) *RegAllocFunction[I, M] { - return &RegAllocFunction[I, M]{ - m: m, - ssb: ssb, - c: c, - labelToRegAllocBlockIndex: make(map[Label]int), - } -} - -// AddBlock adds a new block to the function. -func (f *RegAllocFunction[I, M]) AddBlock(sb ssa.BasicBlock, l Label, begin, end I) { - i := len(f.reversePostOrderBlocks) - f.reversePostOrderBlocks = append(f.reversePostOrderBlocks, RegAllocBlock[I, M]{ - f: f, - sb: sb, - l: l, - begin: begin, - end: end, - id: int(sb.ID()), - }) - f.labelToRegAllocBlockIndex[l] = i -} - -// Reset resets the function for the next compilation. -func (f *RegAllocFunction[I, M]) Reset() { - f.reversePostOrderBlocks = f.reversePostOrderBlocks[:0] - f.iter = 0 -} - -// StoreRegisterAfter implements regalloc.Function StoreRegisterAfter. -func (f *RegAllocFunction[I, M]) StoreRegisterAfter(v regalloc.VReg, instr regalloc.Instr) { - m := f.m - m.InsertStoreRegisterAt(v, instr.(I), true) -} - -// ReloadRegisterBefore implements regalloc.Function ReloadRegisterBefore. -func (f *RegAllocFunction[I, M]) ReloadRegisterBefore(v regalloc.VReg, instr regalloc.Instr) { - m := f.m - m.InsertReloadRegisterAt(v, instr.(I), false) -} - -// ReloadRegisterAfter implements regalloc.Function ReloadRegisterAfter. -func (f *RegAllocFunction[I, M]) ReloadRegisterAfter(v regalloc.VReg, instr regalloc.Instr) { - m := f.m - m.InsertReloadRegisterAt(v, instr.(I), true) -} - -// StoreRegisterBefore implements regalloc.Function StoreRegisterBefore. -func (f *RegAllocFunction[I, M]) StoreRegisterBefore(v regalloc.VReg, instr regalloc.Instr) { - m := f.m - m.InsertStoreRegisterAt(v, instr.(I), false) -} - -// ClobberedRegisters implements regalloc.Function ClobberedRegisters. -func (f *RegAllocFunction[I, M]) ClobberedRegisters(regs []regalloc.VReg) { - f.m.ClobberedRegisters(regs) -} - -// SwapBefore implements regalloc.Function SwapBefore. -func (f *RegAllocFunction[I, M]) SwapBefore(x1, x2, tmp regalloc.VReg, instr regalloc.Instr) { - f.m.Swap(instr.Prev().(I), x1, x2, tmp) -} - -// PostOrderBlockIteratorBegin implements regalloc.Function PostOrderBlockIteratorBegin. -func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorBegin() regalloc.Block { - f.iter = len(f.reversePostOrderBlocks) - 1 - return f.PostOrderBlockIteratorNext() -} - -// PostOrderBlockIteratorNext implements regalloc.Function PostOrderBlockIteratorNext. -func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorNext() regalloc.Block { - if f.iter < 0 { - return nil - } - b := &f.reversePostOrderBlocks[f.iter] - f.iter-- - return b -} - -// ReversePostOrderBlockIteratorBegin implements regalloc.Function ReversePostOrderBlockIteratorBegin. -func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorBegin() regalloc.Block { - f.iter = 0 - return f.ReversePostOrderBlockIteratorNext() -} - -// ReversePostOrderBlockIteratorNext implements regalloc.Function ReversePostOrderBlockIteratorNext. 
-func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorNext() regalloc.Block { - if f.iter >= len(f.reversePostOrderBlocks) { - return nil - } - b := &f.reversePostOrderBlocks[f.iter] - f.iter++ - return b -} - -// LoopNestingForestRoots implements regalloc.Function LoopNestingForestRoots. -func (f *RegAllocFunction[I, M]) LoopNestingForestRoots() int { - f.loopNestingForestRoots = f.ssb.LoopNestingForestRoots() - return len(f.loopNestingForestRoots) -} - -// LoopNestingForestRoot implements regalloc.Function LoopNestingForestRoot. -func (f *RegAllocFunction[I, M]) LoopNestingForestRoot(i int) regalloc.Block { - blk := f.loopNestingForestRoots[i] - l := f.m.SSABlockLabel(blk.ID()) - index := f.labelToRegAllocBlockIndex[l] - return &f.reversePostOrderBlocks[index] -} - -// InsertMoveBefore implements regalloc.Function InsertMoveBefore. -func (f *RegAllocFunction[I, M]) InsertMoveBefore(dst, src regalloc.VReg, instr regalloc.Instr) { - f.m.InsertMoveBefore(dst, src, instr.(I)) -} - -// LowestCommonAncestor implements regalloc.Function LowestCommonAncestor. -func (f *RegAllocFunction[I, M]) LowestCommonAncestor(blk1, blk2 regalloc.Block) regalloc.Block { - ret := f.ssb.LowestCommonAncestor(blk1.(*RegAllocBlock[I, M]).sb, blk2.(*RegAllocBlock[I, M]).sb) - l := f.m.SSABlockLabel(ret.ID()) - index := f.labelToRegAllocBlockIndex[l] - return &f.reversePostOrderBlocks[index] -} - -// Idom implements regalloc.Function Idom. -func (f *RegAllocFunction[I, M]) Idom(blk regalloc.Block) regalloc.Block { - builder := f.ssb - idom := builder.Idom(blk.(*RegAllocBlock[I, M]).sb) - if idom == nil { - panic("BUG: idom must not be nil") - } - l := f.m.SSABlockLabel(idom.ID()) - index := f.labelToRegAllocBlockIndex[l] - return &f.reversePostOrderBlocks[index] -} - -// ID implements regalloc.Block. -func (r *RegAllocBlock[I, m]) ID() int32 { return int32(r.id) } - -// BlockParams implements regalloc.Block. -func (r *RegAllocBlock[I, m]) BlockParams(regs *[]regalloc.VReg) []regalloc.VReg { - c := r.f.c - *regs = (*regs)[:0] - for i := 0; i < r.sb.Params(); i++ { - v := c.VRegOf(r.sb.Param(i)) - *regs = append(*regs, v) - } - return *regs -} - -// InstrIteratorBegin implements regalloc.Block. -func (r *RegAllocBlock[I, m]) InstrIteratorBegin() regalloc.Instr { - r.cur = r.begin - return r.cur -} - -// InstrIteratorNext implements regalloc.Block. -func (r *RegAllocBlock[I, m]) InstrIteratorNext() regalloc.Instr { - for { - if r.cur == r.end { - return nil - } - instr := r.cur.Next() - r.cur = instr.(I) - if instr == nil { - return nil - } else if instr.AddedBeforeRegAlloc() { - // Only concerned about the instruction added before regalloc. - return instr - } - } -} - -// InstrRevIteratorBegin implements regalloc.Block. -func (r *RegAllocBlock[I, m]) InstrRevIteratorBegin() regalloc.Instr { - r.cur = r.end - return r.cur -} - -// InstrRevIteratorNext implements regalloc.Block. -func (r *RegAllocBlock[I, m]) InstrRevIteratorNext() regalloc.Instr { - for { - if r.cur == r.begin { - return nil - } - instr := r.cur.Prev() - r.cur = instr.(I) - if instr == nil { - return nil - } else if instr.AddedBeforeRegAlloc() { - // Only concerned about the instruction added before regalloc. - return instr - } - } -} - -// FirstInstr implements regalloc.Block. -func (r *RegAllocBlock[I, m]) FirstInstr() regalloc.Instr { - return r.begin -} - -// EndInstr implements regalloc.Block. -func (r *RegAllocBlock[I, m]) EndInstr() regalloc.Instr { - return r.end -} - -// LastInstrForInsertion implements regalloc.Block. 
-func (r *RegAllocBlock[I, m]) LastInstrForInsertion() regalloc.Instr { - var nil I - if r.cachedLastInstrForInsertion == nil { - r.cachedLastInstrForInsertion = r.f.m.LastInstrForInsertion(r.begin, r.end) - } - return r.cachedLastInstrForInsertion -} - -// Preds implements regalloc.Block. -func (r *RegAllocBlock[I, m]) Preds() int { return r.sb.Preds() } - -// Pred implements regalloc.Block. -func (r *RegAllocBlock[I, m]) Pred(i int) regalloc.Block { - sb := r.sb - pred := sb.Pred(i) - l := r.f.m.SSABlockLabel(pred.ID()) - index := r.f.labelToRegAllocBlockIndex[l] - return &r.f.reversePostOrderBlocks[index] -} - -// Entry implements regalloc.Block. -func (r *RegAllocBlock[I, m]) Entry() bool { return r.sb.EntryBlock() } - -// Succs implements regalloc.Block. -func (r *RegAllocBlock[I, m]) Succs() int { - return r.sb.Succs() -} - -// Succ implements regalloc.Block. -func (r *RegAllocBlock[I, m]) Succ(i int) regalloc.Block { - sb := r.sb - succ := sb.Succ(i) - if succ.ReturnBlock() { - return nil - } - l := r.f.m.SSABlockLabel(succ.ID()) - index := r.f.labelToRegAllocBlockIndex[l] - return &r.f.reversePostOrderBlocks[index] -} - -// LoopHeader implements regalloc.Block. -func (r *RegAllocBlock[I, m]) LoopHeader() bool { - return r.sb.LoopHeader() -} - -// LoopNestingForestChildren implements regalloc.Block. -func (r *RegAllocBlock[I, m]) LoopNestingForestChildren() int { - r.loopNestingForestChildren = r.sb.LoopNestingForestChildren() - return len(r.loopNestingForestChildren) -} - -// LoopNestingForestChild implements regalloc.Block. -func (r *RegAllocBlock[I, m]) LoopNestingForestChild(i int) regalloc.Block { - blk := r.loopNestingForestChildren[i] - l := r.f.m.SSABlockLabel(blk.ID()) - index := r.f.labelToRegAllocBlockIndex[l] - return &r.f.reversePostOrderBlocks[index] -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go index 23157b4782..5d15bd9dc1 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go @@ -4,104 +4,100 @@ import "fmt" // These interfaces are implemented by ISA-specific backends to abstract away the details, and allow the register // allocators to work on any ISA. -// -// TODO: the interfaces are not stabilized yet, especially x64 will need some changes. E.g. x64 has an addressing mode -// where index can be in memory. That kind of info will be useful to reduce the register pressure, and should be leveraged -// by the register allocators, like https://docs.rs/regalloc2/latest/regalloc2/enum.OperandConstraint.html type ( // Function is the top-level interface to do register allocation, which corresponds to a CFG containing // Blocks(s). - Function interface { + // + // I is the type of the instruction, and B is the type of the basic block. + Function[I Instr, B Block[I]] interface { // PostOrderBlockIteratorBegin returns the first block in the post-order traversal of the CFG. // In other words, the last blocks in the CFG will be returned first. - PostOrderBlockIteratorBegin() Block + PostOrderBlockIteratorBegin() B // PostOrderBlockIteratorNext returns the next block in the post-order traversal of the CFG. - PostOrderBlockIteratorNext() Block + PostOrderBlockIteratorNext() B // ReversePostOrderBlockIteratorBegin returns the first block in the reverse post-order traversal of the CFG. 
// In other words, the first blocks in the CFG will be returned first. - ReversePostOrderBlockIteratorBegin() Block + ReversePostOrderBlockIteratorBegin() B // ReversePostOrderBlockIteratorNext returns the next block in the reverse post-order traversal of the CFG. - ReversePostOrderBlockIteratorNext() Block + ReversePostOrderBlockIteratorNext() B // ClobberedRegisters tell the clobbered registers by this function. ClobberedRegisters([]VReg) // LoopNestingForestRoots returns the number of roots of the loop nesting forest in a function. LoopNestingForestRoots() int // LoopNestingForestRoot returns the i-th root of the loop nesting forest in a function. - LoopNestingForestRoot(i int) Block + LoopNestingForestRoot(i int) B // LowestCommonAncestor returns the lowest common ancestor of two blocks in the dominator tree. - LowestCommonAncestor(blk1, blk2 Block) Block + LowestCommonAncestor(blk1, blk2 B) B // Idom returns the immediate dominator of the given block. - Idom(blk Block) Block + Idom(blk B) B + + // LoopNestingForestChild returns the i-th child of the block in the loop nesting forest. + LoopNestingForestChild(b B, i int) B + // Pred returns the i-th predecessor of the block in the CFG. + Pred(b B, i int) B + // Succ returns the i-th successor of the block in the CFG. + Succ(b B, i int) B + // BlockParams returns the virtual registers used as the parameters of this block. + BlockParams(B, *[]VReg) []VReg // Followings are for rewriting the function. - // SwapAtEndOfBlock swaps the two virtual registers at the end of the given block. - SwapBefore(x1, x2, tmp VReg, instr Instr) + // SwapBefore swaps the two virtual registers at the end of the given block. + SwapBefore(x1, x2, tmp VReg, instr I) // StoreRegisterBefore inserts store instruction(s) before the given instruction for the given virtual register. - StoreRegisterBefore(v VReg, instr Instr) + StoreRegisterBefore(v VReg, instr I) // StoreRegisterAfter inserts store instruction(s) after the given instruction for the given virtual register. - StoreRegisterAfter(v VReg, instr Instr) + StoreRegisterAfter(v VReg, instr I) // ReloadRegisterBefore inserts reload instruction(s) before the given instruction for the given virtual register. - ReloadRegisterBefore(v VReg, instr Instr) + ReloadRegisterBefore(v VReg, instr I) // ReloadRegisterAfter inserts reload instruction(s) after the given instruction for the given virtual register. - ReloadRegisterAfter(v VReg, instr Instr) + ReloadRegisterAfter(v VReg, instr I) // InsertMoveBefore inserts move instruction(s) before the given instruction for the given virtual registers. - InsertMoveBefore(dst, src VReg, instr Instr) + InsertMoveBefore(dst, src VReg, instr I) } // Block is a basic block in the CFG of a function, and it consists of multiple instructions, and predecessor Block(s). - Block interface { + // Right now, this corresponds to a ssa.BasicBlock lowered to the machine level. + Block[I Instr] interface { + comparable // ID returns the unique identifier of this block which is ordered in the reverse post-order traversal of the CFG. ID() int32 - // BlockParams returns the virtual registers used as the parameters of this block. - BlockParams(*[]VReg) []VReg // InstrIteratorBegin returns the first instruction in this block. Instructions added after lowering must be skipped. // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr. 
- InstrIteratorBegin() Instr + InstrIteratorBegin() I // InstrIteratorNext returns the next instruction in this block. Instructions added after lowering must be skipped. // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr. - InstrIteratorNext() Instr + InstrIteratorNext() I // InstrRevIteratorBegin is the same as InstrIteratorBegin, but in the reverse order. - InstrRevIteratorBegin() Instr + InstrRevIteratorBegin() I // InstrRevIteratorNext is the same as InstrIteratorNext, but in the reverse order. - InstrRevIteratorNext() Instr + InstrRevIteratorNext() I // FirstInstr returns the first instruction in this block where instructions will be inserted after it. - FirstInstr() Instr - // EndInstr returns the end instruction in this block. - EndInstr() Instr + FirstInstr() I // LastInstrForInsertion returns the last instruction in this block where instructions will be inserted before it. // Such insertions only happen when we need to insert spill/reload instructions to adjust the merge edges. // At the time of register allocation, all the critical edges are already split, so there is no need // to worry about the case where a branching instruction has multiple successors. // Therefore, usually, it is the nop instruction, but if the block ends with an unconditional branching, then it returns // the unconditional branch, not the nop. In other words it is either nop or unconditional branch. - LastInstrForInsertion() Instr + LastInstrForInsertion() I // Preds returns the number of predecessors of this block in the CFG. Preds() int - // Pred returns the i-th predecessor of this block in the CFG. - Pred(i int) Block // Entry returns true if the block is for the entry block. Entry() bool // Succs returns the number of successors of this block in the CFG. Succs() int - // Succ returns the i-th successor of this block in the CFG. - Succ(i int) Block // LoopHeader returns true if this block is a loop header. LoopHeader() bool // LoopNestingForestChildren returns the number of children of this block in the loop nesting forest. LoopNestingForestChildren() int - // LoopNestingForestChild returns the i-th child of this block in the loop nesting forest. - LoopNestingForestChild(i int) Block } // Instr is an instruction in a block, abstracting away the underlying ISA. Instr interface { + comparable fmt.Stringer - // Next returns the next instruction in the same block. - Next() Instr - // Prev returns the previous instruction in the same block. - Prev() Instr // Defs returns the virtual registers defined by this instruction. Defs(*[]VReg) []VReg // Uses returns the virtual registers used by this instruction. @@ -124,13 +120,5 @@ type ( IsIndirectCall() bool // IsReturn returns true if this instruction is a return instruction. IsReturn() bool - // AddedBeforeRegAlloc returns true if this instruction is added before register allocation. - AddedBeforeRegAlloc() bool - } - - // InstrConstraint is an interface for arch-specific instruction constraints.
- InstrConstraint interface { - comparable - Instr } ) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go index b4450d56fb..a5857f4f26 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go @@ -18,13 +18,13 @@ import ( ) // NewAllocator returns a new Allocator. -func NewAllocator(allocatableRegs *RegisterInfo) Allocator { - a := Allocator{ +func NewAllocator[I Instr, B Block[I], F Function[I, B]](allocatableRegs *RegisterInfo) Allocator[I, B, F] { + a := Allocator[I, B, F]{ regInfo: allocatableRegs, - phiDefInstListPool: wazevoapi.NewPool[phiDefInstList](resetPhiDefInstList), - blockStates: wazevoapi.NewIDedPool[blockState](resetBlockState), + phiDefInstListPool: wazevoapi.NewPool[phiDefInstList[I]](resetPhiDefInstList[I]), + blockStates: wazevoapi.NewIDedPool[blockState[I, B, F]](resetBlockState[I, B, F]), } - a.state.vrStates = wazevoapi.NewIDedPool[vrState](resetVrState) + a.state.vrStates = wazevoapi.NewIDedPool[vrState[I, B, F]](resetVrState[I, B, F]) a.state.reset() for _, regs := range allocatableRegs.AllocatableRegisters { for _, r := range regs { @@ -49,33 +49,39 @@ type ( } // Allocator is a register allocator. - Allocator struct { + Allocator[I Instr, B Block[I], F Function[I, B]] struct { // regInfo is static per ABI/ISA, and is initialized by the machine during Machine.PrepareRegisterAllocator. regInfo *RegisterInfo // allocatableSet is a set of allocatable RealReg derived from regInfo. Static per ABI/ISA. allocatableSet RegSet allocatedCalleeSavedRegs []VReg vs []VReg - vs2 []VRegID - phiDefInstListPool wazevoapi.Pool[phiDefInstList] + ss []*vrState[I, B, F] + copies []_copy[I, B, F] + phiDefInstListPool wazevoapi.Pool[phiDefInstList[I]] // Followings are re-used during various places. - blks []Block - reals []RealReg - currentOccupants regInUseSet + blks []B + reals []RealReg // Following two fields are updated while iterating the blocks in the reverse postorder. - state state - blockStates wazevoapi.IDedPool[blockState] + state state[I, B, F] + blockStates wazevoapi.IDedPool[blockState[I, B, F]] + } + + // _copy represents a source and destination pair of a copy instruction. + _copy[I Instr, B Block[I], F Function[I, B]] struct { + src *vrState[I, B, F] + dstID VRegID } // programCounter represents an opaque index into the program which is used to represents a LiveInterval of a VReg. programCounter int32 - state struct { + state[I Instr, B Block[I], F Function[I, B]] struct { argRealRegs []VReg - regsInUse regInUseSet - vrStates wazevoapi.IDedPool[vrState] + regsInUse regInUseSet[I, B, F] + vrStates wazevoapi.IDedPool[vrState[I, B, F]] currentBlockID int32 @@ -83,30 +89,30 @@ type ( allocatedRegSet RegSet } - blockState struct { + blockState[I Instr, B Block[I], F Function[I, B]] struct { // liveIns is a list of VReg that are live at the beginning of the block. - liveIns []VRegID + liveIns []*vrState[I, B, F] // seen is true if the block is visited during the liveness analysis. seen bool // visited is true if the block is visited during the allocation phase. visited bool startFromPredIndex int // startRegs is a list of RealReg that are used at the beginning of the block. This is used to fix the merge edges. 
- startRegs regInUseSet + startRegs regInUseSet[I, B, F] // endRegs is a list of RealReg that are used at the end of the block. This is used to fix the merge edges. - endRegs regInUseSet + endRegs regInUseSet[I, B, F] } - vrState struct { + vrState[I Instr, B Block[I], f Function[I, B]] struct { v VReg r RealReg // defInstr is the instruction that defines this value. If this is the phi value and not the entry block, this is nil. - defInstr Instr + defInstr I // defBlk is the block that defines this value. If this is the phi value, this is the block whose arguments contain this value. - defBlk Block + defBlk B // lca = lowest common ancestor. This is the block that is the lowest common ancestor of all the blocks that // reloads this value. This is used to determine the spill location. Only valid if spilled=true. - lca Block + lca B // lastUse is the program counter of the last use of this value. This changes while iterating the block, and // should not be used across the blocks as it becomes invalid. To check the validity, use lastUseUpdatedAtBlockID. lastUse programCounter @@ -121,14 +127,14 @@ type ( desiredLoc desiredLoc // phiDefInstList is a list of instructions that defines this phi value. // This is used to determine the spill location, and only valid if isPhi=true. - *phiDefInstList + *phiDefInstList[I] } // phiDefInstList is a linked list of instructions that defines a phi value. - phiDefInstList struct { - instr Instr + phiDefInstList[I Instr] struct { + instr I v VReg - next *phiDefInstList + next *phiDefInstList[I] } // desiredLoc represents a desired location for a VReg. @@ -160,13 +166,14 @@ func (d desiredLoc) stack() bool { return d&3 == desiredLoc(desiredLocKindStack) } -func resetPhiDefInstList(l *phiDefInstList) { - l.instr = nil +func resetPhiDefInstList[I Instr](l *phiDefInstList[I]) { + var nilInstr I + l.instr = nilInstr l.next = nil l.v = VRegInvalid } -func (s *state) dump(info *RegisterInfo) { //nolint:unused +func (s *state[I, B, F]) dump(info *RegisterInfo) { //nolint:unused fmt.Println("\t\tstate:") fmt.Println("\t\t\targRealRegs:", s.argRealRegs) fmt.Println("\t\t\tregsInUse", s.regsInUse.format(info)) @@ -185,7 +192,7 @@ func (s *state) dump(info *RegisterInfo) { //nolint:unused fmt.Println("\t\t\tvrStates:", strings.Join(strs, ", ")) } -func (s *state) reset() { +func (s *state[I, B, F]) reset() { s.argRealRegs = s.argRealRegs[:0] s.vrStates.Reset() s.allocatedRegSet = RegSet(0) @@ -193,79 +200,74 @@ func (s *state) reset() { s.currentBlockID = -1 } -func (s *state) setVRegState(v VReg, r RealReg) { - id := int(v.ID()) - st := s.vrStates.GetOrAllocate(id) - st.r = r - st.v = v -} - -func resetVrState(vs *vrState) { +func resetVrState[I Instr, B Block[I], F Function[I, B]](vs *vrState[I, B, F]) { vs.v = VRegInvalid vs.r = RealRegInvalid - vs.defInstr = nil - vs.defBlk = nil + var nilInstr I + vs.defInstr = nilInstr + var nilBlk B + vs.defBlk = nilBlk vs.spilled = false vs.lastUse = -1 vs.lastUseUpdatedAtBlockID = -1 - vs.lca = nil + vs.lca = nilBlk vs.isPhi = false vs.phiDefInstList = nil vs.desiredLoc = desiredLocUnspecified } -func (s *state) getVRegState(v VRegID) *vrState { - return s.vrStates.GetOrAllocate(int(v)) +func (s *state[I, B, F]) getOrAllocateVRegState(v VReg) *vrState[I, B, F] { + st := s.vrStates.GetOrAllocate(int(v.ID())) + if st.v == VRegInvalid { + st.v = v + } + return st } -func (s *state) useRealReg(r RealReg, v VReg) { - if s.regsInUse.has(r) { - panic("BUG: useRealReg: the given real register is already used") - } - s.regsInUse.add(r, 
v) - s.setVRegState(v, r) +func (s *state[I, B, F]) getVRegState(v VRegID) *vrState[I, B, F] { + return s.vrStates.Get(int(v)) +} + +func (s *state[I, B, F]) useRealReg(r RealReg, vr *vrState[I, B, F]) { + s.regsInUse.add(r, vr) + vr.r = r s.allocatedRegSet = s.allocatedRegSet.add(r) } -func (s *state) releaseRealReg(r RealReg) { +func (s *state[I, B, F]) releaseRealReg(r RealReg) { current := s.regsInUse.get(r) - if current.Valid() { + if current != nil { s.regsInUse.remove(r) - s.setVRegState(current, RealRegInvalid) + current.r = RealRegInvalid } } // recordReload records that the given VReg is reloaded in the given block. // This is used to determine the spill location by tracking the lowest common ancestor of all the blocks that reloads the value. -func (vs *vrState) recordReload(f Function, blk Block) { +func (vs *vrState[I, B, F]) recordReload(f F, blk B) { vs.spilled = true - if vs.lca == nil { + var nilBlk B + if lca := vs.lca; lca == nilBlk { if wazevoapi.RegAllocLoggingEnabled { fmt.Printf("\t\tv%d is reloaded in blk%d,\n", vs.v.ID(), blk.ID()) } vs.lca = blk - } else { + } else if lca != blk { if wazevoapi.RegAllocLoggingEnabled { fmt.Printf("\t\tv%d is reloaded in blk%d, lca=%d\n", vs.v.ID(), blk.ID(), vs.lca.ID()) } - vs.lca = f.LowestCommonAncestor(vs.lca, blk) + vs.lca = f.LowestCommonAncestor(lca, blk) if wazevoapi.RegAllocLoggingEnabled { fmt.Printf("updated lca=%d\n", vs.lca.ID()) } } } -func (s *state) findOrSpillAllocatable(a *Allocator, allocatable []RealReg, forbiddenMask RegSet, preferred RealReg) (r RealReg) { +func (a *Allocator[I, B, F]) findOrSpillAllocatable(s *state[I, B, F], allocatable []RealReg, forbiddenMask RegSet, preferred RealReg) (r RealReg) { r = RealRegInvalid // First, check if the preferredMask has any allocatable register. if preferred != RealRegInvalid && !forbiddenMask.has(preferred) && !s.regsInUse.has(preferred) { - for _, candidateReal := range allocatable { - // TODO: we should ensure the preferred register is in the allocatable set in the first place, - // but right now, just in case, we check it here. - if candidateReal == preferred { - return preferred - } - } + return preferred } var lastUseAt programCounter @@ -276,7 +278,7 @@ func (s *state) findOrSpillAllocatable(a *Allocator, allocatable []RealReg, forb } using := s.regsInUse.get(candidateReal) - if using == VRegInvalid { + if using == nil { // This is not used at this point. return candidateReal } @@ -285,17 +287,17 @@ func (s *state) findOrSpillAllocatable(a *Allocator, allocatable []RealReg, forb // For example, if the register is used as an argument register, and it might be // spilled and not reloaded when it ends up being used as a temporary to pass // stack based argument. - if using.IsRealReg() { + if using.v.IsRealReg() { continue } isPreferred := candidateReal == preferred // last == -1 means the value won't be used anymore. 
- if last := s.getVRegState(using.ID()).lastUse; r == RealRegInvalid || isPreferred || last == -1 || (lastUseAt != -1 && last > lastUseAt) { + if last := using.lastUse; r == RealRegInvalid || isPreferred || last == -1 || (lastUseAt != -1 && last > lastUseAt) { lastUseAt = last r = candidateReal - spillVReg = using + spillVReg = using.v if isPreferred { break } @@ -313,7 +315,7 @@ func (s *state) findOrSpillAllocatable(a *Allocator, allocatable []RealReg, forb return r } -func (s *state) findAllocatable(allocatable []RealReg, forbiddenMask RegSet) RealReg { +func (s *state[I, B, F]) findAllocatable(allocatable []RealReg, forbiddenMask RegSet) RealReg { for _, r := range allocatable { if !s.regsInUse.has(r) && !forbiddenMask.has(r) { return r @@ -322,22 +324,20 @@ func (s *state) findAllocatable(allocatable []RealReg, forbiddenMask RegSet) Rea return RealRegInvalid } -func (s *state) resetAt(bs *blockState) { - s.regsInUse.range_(func(_ RealReg, vr VReg) { - s.setVRegState(vr, RealRegInvalid) +func (s *state[I, B, F]) resetAt(bs *blockState[I, B, F]) { + s.regsInUse.range_(func(_ RealReg, vs *vrState[I, B, F]) { + vs.r = RealRegInvalid }) s.regsInUse.reset() - bs.endRegs.range_(func(r RealReg, v VReg) { - id := int(v.ID()) - st := s.vrStates.GetOrAllocate(id) - if st.lastUseUpdatedAtBlockID == s.currentBlockID && st.lastUse == programCounterLiveIn { - s.regsInUse.add(r, v) - s.setVRegState(v, r) + bs.endRegs.range_(func(r RealReg, vs *vrState[I, B, F]) { + if vs.lastUseUpdatedAtBlockID == s.currentBlockID && vs.lastUse == programCounterLiveIn { + s.regsInUse.add(r, vs) + vs.r = r } }) } -func resetBlockState(b *blockState) { +func resetBlockState[I Instr, B Block[I], F Function[I, B]](b *blockState[I, B, F]) { b.seen = false b.visited = false b.endRegs.reset() @@ -346,7 +346,7 @@ func resetBlockState(b *blockState) { b.liveIns = b.liveIns[:0] } -func (b *blockState) dump(a *RegisterInfo) { +func (b *blockState[I, B, F]) dump(a *RegisterInfo) { fmt.Println("\t\tblockState:") fmt.Println("\t\t\tstartRegs:", b.startRegs.format(a)) fmt.Println("\t\t\tendRegs:", b.endRegs.format(a)) @@ -355,13 +355,13 @@ func (b *blockState) dump(a *RegisterInfo) { } // DoAllocation performs register allocation on the given Function. -func (a *Allocator) DoAllocation(f Function) { +func (a *Allocator[I, B, F]) DoAllocation(f F) { a.livenessAnalysis(f) a.alloc(f) a.determineCalleeSavedRealRegs(f) } -func (a *Allocator) determineCalleeSavedRealRegs(f Function) { +func (a *Allocator[I, B, F]) determineCalleeSavedRealRegs(f F) { a.allocatedCalleeSavedRegs = a.allocatedCalleeSavedRegs[:0] a.state.allocatedRegSet.Range(func(allocatedRealReg RealReg) { if a.regInfo.CalleeSavedRegisters.has(allocatedRealReg) { @@ -371,17 +371,17 @@ func (a *Allocator) determineCalleeSavedRealRegs(f Function) { f.ClobberedRegisters(a.allocatedCalleeSavedRegs) } -func (a *Allocator) getOrAllocateBlockState(blockID int32) *blockState { +func (a *Allocator[I, B, F]) getOrAllocateBlockState(blockID int32) *blockState[I, B, F] { return a.blockStates.GetOrAllocate(int(blockID)) } // phiBlk returns the block that defines the given phi value, nil otherwise. -func (s *state) phiBlk(v VRegID) Block { - vs := s.getVRegState(v) +func (vs *vrState[I, B, F]) phiBlk() B { if vs.isPhi { return vs.defBlk } - return nil + var nilBlk B + return nilBlk } const ( @@ -391,31 +391,35 @@ const ( // liveAnalysis constructs Allocator.blockLivenessData. // The algorithm here is described in https://pfalcon.github.io/ssabook/latest/book-full.pdf Chapter 9.2. 
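// Illustrative sketch, not from the wazero sources: stripped of the allocator's state
// reuse, the per-block live-in computation implemented below is the classic backward
// dataflow. All names and types in this sketch are assumptions.
type sketchBlk struct {
	succs  []*sketchBlk
	defs   [][]int // defs[i]: virtual registers defined by instruction i
	uses   [][]int // uses[i]: virtual registers used by instruction i
	liveIn map[int]bool
}

// computeLiveIns visits blocks in post-order (successors before predecessors) and runs
// one backward scan per block: start from the union of the successors' live-ins, kill
// definitions, then add uses. Phi parameters and the loop-forest extension that the
// real pass layers on top are omitted here.
func computeLiveIns(postOrder []*sketchBlk) {
	for _, b := range postOrder {
		live := map[int]bool{}
		for _, s := range b.succs {
			for v := range s.liveIn {
				live[v] = true
			}
		}
		for i := len(b.defs) - 1; i >= 0; i-- {
			for _, d := range b.defs[i] {
				delete(live, d) // defined at i, so not live above i
			}
			for _, u := range b.uses[i] {
				live[u] = true // used at i, so live above i
			}
		}
		b.liveIn = live
	}
}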
-func (a *Allocator) livenessAnalysis(f Function) { +func (a *Allocator[I, B, F]) livenessAnalysis(f F) { s := &a.state - for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() { // Order doesn't matter. + for i := VRegID(0); i < vRegIDReservedForRealNum; i++ { + s.getOrAllocateVRegState(VReg(i).SetRealReg(RealReg(i))) + } + + var nilBlk B + var nilInstr I + for blk := f.PostOrderBlockIteratorBegin(); blk != nilBlk; blk = f.PostOrderBlockIteratorNext() { // We should gather phi value data. - for _, p := range blk.BlockParams(&a.vs) { - vs := s.getVRegState(p.ID()) + for _, p := range f.BlockParams(blk, &a.vs) { + vs := s.getOrAllocateVRegState(p) vs.isPhi = true vs.defBlk = blk } - } - for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() { blkID := blk.ID() info := a.getOrAllocateBlockState(blkID) - a.vs2 = a.vs2[:0] + a.ss = a.ss[:0] const ( flagDeleted = false flagLive = true ) ns := blk.Succs() for i := 0; i < ns; i++ { - succ := blk.Succ(i) - if succ == nil { + succ := f.Succ(blk, i) + if succ == nilBlk { continue } @@ -425,39 +429,39 @@ func (a *Allocator) livenessAnalysis(f Function) { continue } - for _, v := range succInfo.liveIns { - if s.phiBlk(v) != succ { - st := s.getVRegState(v) + for _, st := range succInfo.liveIns { + if st.phiBlk() != succ && st.spilled != flagLive { //nolint:gosimple // We use .spilled field to store the flag. st.spilled = flagLive - a.vs2 = append(a.vs2, v) + a.ss = append(a.ss, st) } } } - for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() { + for instr := blk.InstrRevIteratorBegin(); instr != nilInstr; instr = blk.InstrRevIteratorNext() { var use, def VReg + var defIsPhi bool for _, def = range instr.Defs(&a.vs) { if !def.IsRealReg() { - id := def.ID() - st := s.getVRegState(id) - // We use .spilled field to store the flag. + st := s.getOrAllocateVRegState(def) + defIsPhi = st.isPhi + // Note: We use .spilled field to store the flag. st.spilled = flagDeleted - a.vs2 = append(a.vs2, id) } } for _, use = range instr.Uses(&a.vs) { if !use.IsRealReg() { - id := use.ID() - st := s.getVRegState(id) - // We use .spilled field to store the flag. - st.spilled = flagLive - a.vs2 = append(a.vs2, id) + st := s.getOrAllocateVRegState(use) + // Note: We use .spilled field to store the flag. + if st.spilled != flagLive { //nolint:gosimple + st.spilled = flagLive + a.ss = append(a.ss, st) + } } } - if def.Valid() && s.phiBlk(def.ID()) != nil { + if defIsPhi { if use.Valid() && use.IsRealReg() { // If the destination is a phi value, and the source is a real register, this is the beginning of the function. a.state.argRealRegs = append(a.state.argRealRegs, use) @@ -465,11 +469,10 @@ func (a *Allocator) livenessAnalysis(f Function) { } } - for _, v := range a.vs2 { - st := s.getVRegState(v) + for _, st := range a.ss { // We use .spilled field to store the flag. if st.spilled == flagLive { //nolint:gosimple - info.liveIns = append(info.liveIns, v) + info.liveIns = append(info.liveIns, st) st.spilled = false } } @@ -480,51 +483,48 @@ func (a *Allocator) livenessAnalysis(f Function) { nrs := f.LoopNestingForestRoots() for i := 0; i < nrs; i++ { root := f.LoopNestingForestRoot(i) - a.loopTreeDFS(root) + a.loopTreeDFS(f, root) } } // loopTreeDFS implements the Algorithm 9.3 in the book in an iterative way. 
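// Continuing the illustrative sketch above: the loop-forest pass below compensates for
// back edges by treating anything live into a loop header as live throughout the loop
// body. An explicit stack replaces the recursion, matching the "iterative way" noted
// in the comment above; the children lookup is an assumed helper, not wazero's API.
func propagateLoopLiveIns(roots []*sketchBlk, children func(*sketchBlk) []*sketchBlk) {
	stack := append([]*sketchBlk(nil), roots...)
	for len(stack) > 0 {
		n := stack[len(stack)-1]
		stack = stack[:len(stack)-1]
		for _, c := range children(n) { // children in the loop nesting forest, not CFG successors
			if c.liveIn == nil {
				c.liveIn = map[int]bool{}
			}
			for v := range n.liveIn {
				c.liveIn[v] = true // live across the whole loop, not just at the header
			}
			stack = append(stack, c)
		}
	}
}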
-func (a *Allocator) loopTreeDFS(entry Block) { +func (a *Allocator[I, B, F]) loopTreeDFS(f F, entry B) { a.blks = a.blks[:0] a.blks = append(a.blks, entry) - s := &a.state for len(a.blks) > 0 { tail := len(a.blks) - 1 loop := a.blks[tail] a.blks = a.blks[:tail] - a.vs2 = a.vs2[:0] + a.ss = a.ss[:0] const ( flagDone = false flagPending = true ) info := a.getOrAllocateBlockState(loop.ID()) - for _, v := range info.liveIns { - if s.phiBlk(v) != loop { - a.vs2 = append(a.vs2, v) - st := s.getVRegState(v) + for _, st := range info.liveIns { + if st.phiBlk() != loop { + a.ss = append(a.ss, st) // We use .spilled field to store the flag. st.spilled = flagPending } } - var siblingAddedView []VRegID + var siblingAddedView []*vrState[I, B, F] cn := loop.LoopNestingForestChildren() for i := 0; i < cn; i++ { - child := loop.LoopNestingForestChild(i) + child := f.LoopNestingForestChild(loop, i) childID := child.ID() childInfo := a.getOrAllocateBlockState(childID) if i == 0 { begin := len(childInfo.liveIns) - for _, v := range a.vs2 { - st := s.getVRegState(v) + for _, st := range a.ss { // We use .spilled field to store the flag. if st.spilled == flagPending { //nolint:gosimple st.spilled = flagDone // TODO: deduplicate, though I don't think it has much impact. - childInfo.liveIns = append(childInfo.liveIns, v) + childInfo.liveIns = append(childInfo.liveIns, st) } } siblingAddedView = childInfo.liveIns[begin:] @@ -540,8 +540,7 @@ func (a *Allocator) loopTreeDFS(entry Block) { if cn == 0 { // If there's no forest child, we haven't cleared the .spilled field at this point. - for _, v := range a.vs2 { - st := s.getVRegState(v) + for _, st := range a.ss { st.spilled = false } } @@ -558,37 +557,36 @@ func (a *Allocator) loopTreeDFS(entry Block) { // the spill happens in the block that is the lowest common ancestor of all the blocks that reloads the value. // // All of these logics are almost the same as Go's compiler which has a dedicated description in the source file ^^. -func (a *Allocator) alloc(f Function) { +func (a *Allocator[I, B, F]) alloc(f F) { // First we allocate each block in the reverse postorder (at least one predecessor should be allocated for each block). - for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() { + var nilBlk B + for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nilBlk; blk = f.ReversePostOrderBlockIteratorNext() { if wazevoapi.RegAllocLoggingEnabled { fmt.Printf("========== allocating blk%d ========\n", blk.ID()) } if blk.Entry() { - a.finalizeStartReg(blk) + a.finalizeStartReg(f, blk) } a.allocBlock(f, blk) } // After the allocation, we all know the start and end state of each block. So we can fix the merge states. - for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() { + for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nilBlk; blk = f.ReversePostOrderBlockIteratorNext() { a.fixMergeState(f, blk) } // Finally, we insert the spill instructions as we know all the places where the reloads happen. 
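// Illustrative sketch: the "lowest common ancestor" used for spill placement is taken
// over the dominator tree, so the spill lands in the one block that dominates every
// reload. A standard way to compute it, assuming idom and depth helpers that are not
// part of this diff:
func lowestCommonAncestor(a, b int, idom func(int) int, depth func(int) int) int {
	for a != b {
		if depth(a) < depth(b) {
			b = idom(b) // raise the deeper block one level toward the root
		} else {
			a = idom(a)
		}
	}
	return a
}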
a.scheduleSpills(f) } -func (a *Allocator) updateLiveInVRState(liveness *blockState) { +func (a *Allocator[I, B, F]) updateLiveInVRState(liveness *blockState[I, B, F]) { currentBlockID := a.state.currentBlockID - for _, v := range liveness.liveIns { - vs := a.state.getVRegState(v) + for _, vs := range liveness.liveIns { vs.lastUse = programCounterLiveIn vs.lastUseUpdatedAtBlockID = currentBlockID } } -func (a *Allocator) finalizeStartReg(blk Block) { +func (a *Allocator[I, B, F]) finalizeStartReg(f F, blk B) { bID := blk.ID() - liveness := a.getOrAllocateBlockState(bID) s := &a.state currentBlkState := a.getOrAllocateBlockState(bID) if currentBlkState.startFromPredIndex > -1 { @@ -596,20 +594,20 @@ } s.currentBlockID = bID - a.updateLiveInVRState(liveness) + a.updateLiveInVRState(currentBlkState) preds := blk.Preds() - var predState *blockState + var predState *blockState[I, B, F] switch preds { case 0: // This is the entry block. case 1: - predID := blk.Pred(0).ID() + predID := f.Pred(blk, 0).ID() predState = a.getOrAllocateBlockState(predID) currentBlkState.startFromPredIndex = 0 default: // TODO: there should be some better heuristic to choose the predecessor. for i := 0; i < preds; i++ { - predID := blk.Pred(i).ID() + predID := f.Pred(blk, i).ID() if _predState := a.getOrAllocateBlockState(predID); _predState.visited { predState = _predState currentBlkState.startFromPredIndex = i @@ -622,18 +620,18 @@ panic(fmt.Sprintf("BUG: at least one predecessor should be visited for blk%d", blk.ID())) } for _, u := range s.argRealRegs { - s.useRealReg(u.RealReg(), u) + s.useRealReg(u.RealReg(), s.getVRegState(u.ID())) } currentBlkState.startFromPredIndex = 0 - } else if predState != nil { + } else { if wazevoapi.RegAllocLoggingEnabled { fmt.Printf("allocating blk%d starting from blk%d (on index=%d) \n", - bID, blk.Pred(currentBlkState.startFromPredIndex).ID(), currentBlkState.startFromPredIndex) + bID, f.Pred(blk, currentBlkState.startFromPredIndex).ID(), currentBlkState.startFromPredIndex) } s.resetAt(predState) } - s.regsInUse.range_(func(allocated RealReg, v VReg) { + s.regsInUse.range_(func(allocated RealReg, v *vrState[I, B, F]) { currentBlkState.startRegs.add(allocated, v) }) if wazevoapi.RegAllocLoggingEnabled { @@ -641,7 +639,7 @@ } } -func (a *Allocator) allocBlock(f Function, blk Block) { +func (a *Allocator[I, B, F]) allocBlock(f F, blk B) { bID := blk.ID() s := &a.state currentBlkState := a.getOrAllocateBlockState(bID) @@ -652,36 +650,34 @@ } // Clears the previous state. - s.regsInUse.range_(func(allocatedRealReg RealReg, vr VReg) { - s.setVRegState(vr, RealRegInvalid) - }) + s.regsInUse.range_(func(allocatedRealReg RealReg, vr *vrState[I, B, F]) { vr.r = RealRegInvalid }) s.regsInUse.reset() // Then set the start state. - currentBlkState.startRegs.range_(func(allocatedRealReg RealReg, vr VReg) { - s.useRealReg(allocatedRealReg, vr) - }) + currentBlkState.startRegs.range_(func(allocatedRealReg RealReg, vr *vrState[I, B, F]) { s.useRealReg(allocatedRealReg, vr) }) - desiredUpdated := a.vs2[:0] + desiredUpdated := a.ss[:0] // Update the last use of each VReg. + a.copies = a.copies[:0] // Stores the copy instructions.
var pc programCounter - for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() { - var use, def VReg - for _, use = range instr.Uses(&a.vs) { + var nilInstr I + for instr := blk.InstrIteratorBegin(); instr != nilInstr; instr = blk.InstrIteratorNext() { + var useState *vrState[I, B, F] + for _, use := range instr.Uses(&a.vs) { + useState = s.getVRegState(use.ID()) if !use.IsRealReg() { - s.getVRegState(use.ID()).lastUse = pc + useState.lastUse = pc } } if instr.IsCopy() { - def = instr.Defs(&a.vs)[0] + def := instr.Defs(&a.vs)[0] + a.copies = append(a.copies, _copy[I, B, F]{src: useState, dstID: def.ID()}) r := def.RealReg() if r != RealRegInvalid { - useID := use.ID() - vs := s.getVRegState(useID) - if !vs.isPhi { // TODO: no idea why do we need this. - vs.desiredLoc = newDesiredLocReg(r) - desiredUpdated = append(desiredUpdated, useID) + if !useState.isPhi { // TODO: no idea why do we need this. + useState.desiredLoc = newDesiredLocReg(r) + desiredUpdated = append(desiredUpdated, useState) } } } @@ -690,18 +686,18 @@ func (a *Allocator) allocBlock(f Function, blk Block) { // Mark all live-out values by checking live-in of the successors. // While doing so, we also update the desired register values. - var succ Block + var succ B + var nilBlk B for i, ns := 0, blk.Succs(); i < ns; i++ { - succ = blk.Succ(i) - if succ == nil { + succ = f.Succ(blk, i) + if succ == nilBlk { continue } succID := succ.ID() succState := a.getOrAllocateBlockState(succID) - for _, v := range succState.liveIns { - if s.phiBlk(v) != succ { - st := s.getVRegState(v) + for _, st := range succState.liveIns { + if st.phiBlk() != succ { st.lastUse = programCounterLiveOut } } @@ -710,43 +706,33 @@ func (a *Allocator) allocBlock(f Function, blk Block) { if wazevoapi.RegAllocLoggingEnabled { fmt.Printf("blk%d -> blk%d: start_regs: %s\n", bID, succID, succState.startRegs.format(a.regInfo)) } - succState.startRegs.range_(func(allocatedRealReg RealReg, vr VReg) { - vs := s.getVRegState(vr.ID()) + succState.startRegs.range_(func(allocatedRealReg RealReg, vs *vrState[I, B, F]) { vs.desiredLoc = newDesiredLocReg(allocatedRealReg) - desiredUpdated = append(desiredUpdated, vr.ID()) + desiredUpdated = append(desiredUpdated, vs) }) - for _, p := range succ.BlockParams(&a.vs) { + for _, p := range f.BlockParams(succ, &a.vs) { vs := s.getVRegState(p.ID()) if vs.desiredLoc.realReg() == RealRegInvalid { vs.desiredLoc = desiredLocStack - desiredUpdated = append(desiredUpdated, p.ID()) + desiredUpdated = append(desiredUpdated, vs) } } } } // Propagate the desired register values from the end of the block to the beginning. 
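// Illustrative sketch of what the rewritten loop below achieves: if a copy's
// destination wants real register R, its source wants R too (unless it already has a
// preference), so hints flow backward through chains of copies. The names here are
// hypothetical, not wazero's.
type copyPair struct{ src, dst int }

func propagateHints(copies []copyPair, desired map[int]int /* vreg -> real reg */) {
	for i := len(copies) - 1; i >= 0; i-- { // reverse program order: block end to beginning
		c := copies[i]
		if r, ok := desired[c.dst]; ok {
			if _, taken := desired[c.src]; !taken {
				desired[c.src] = r
			}
		}
	}
}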
- for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() { - if instr.IsCopy() { - def := instr.Defs(&a.vs)[0] - defState := s.getVRegState(def.ID()) - desired := defState.desiredLoc.realReg() - if desired == RealRegInvalid { - continue - } - - use := instr.Uses(&a.vs)[0] - useID := use.ID() - useState := s.getVRegState(useID) - if s.phiBlk(useID) != succ && useState.desiredLoc == desiredLocUnspecified { - useState.desiredLoc = newDesiredLocReg(desired) - desiredUpdated = append(desiredUpdated, useID) - } + for _, instr := range a.copies { + defState := s.getVRegState(instr.dstID) + desired := defState.desiredLoc.realReg() + useState := instr.src + if useState.phiBlk() != succ && useState.desiredLoc == desiredLocUnspecified { + useState.desiredLoc = newDesiredLocReg(desired) + desiredUpdated = append(desiredUpdated, useState) } } pc = 0 - for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() { + for instr := blk.InstrIteratorBegin(); instr != nilInstr; instr = blk.InstrIteratorNext() { if wazevoapi.RegAllocLoggingEnabled { fmt.Println(instr) } @@ -755,7 +741,8 @@ func (a *Allocator) allocBlock(f Function, blk Block) { killSet := a.reals[:0] // Gather the set of registers that will be used in the current instruction. - for _, use := range instr.Uses(&a.vs) { + uses := instr.Uses(&a.vs) + for _, use := range uses { if use.IsRealReg() { r := use.RealReg() currentUsedSet = currentUsedSet.add(r) @@ -770,19 +757,19 @@ func (a *Allocator) allocBlock(f Function, blk Block) { } } - for i, use := range instr.Uses(&a.vs) { + for i, use := range uses { if !use.IsRealReg() { vs := s.getVRegState(use.ID()) killed := vs.lastUse == pc r := vs.r if r == RealRegInvalid { - r = s.findOrSpillAllocatable(a, a.regInfo.AllocatableRegisters[use.RegType()], currentUsedSet, + r = a.findOrSpillAllocatable(s, a.regInfo.AllocatableRegisters[use.RegType()], currentUsedSet, // Prefer the desired register if it's available. vs.desiredLoc.realReg()) vs.recordReload(f, blk) f.ReloadRegisterBefore(use.SetRealReg(r), instr) - s.useRealReg(r, use) + s.useRealReg(r, vs) } if wazevoapi.RegAllocLoggingEnabled { fmt.Printf("\ttrying to use v%v on %s\n", use.ID(), a.regInfo.RealRegName(r)) @@ -799,10 +786,9 @@ func (a *Allocator) allocBlock(f Function, blk Block) { } isIndirect := instr.IsIndirectCall() - call := instr.IsCall() || isIndirect - if call { + if instr.IsCall() || isIndirect { addr := RealRegInvalid - if instr.IsIndirectCall() { + if isIndirect { addr = a.vs[0].RealReg() } a.releaseCallerSavedRegs(addr) @@ -814,8 +800,8 @@ func (a *Allocator) allocBlock(f Function, blk Block) { a.reals = killSet defs := instr.Defs(&a.vs) - switch { - case len(defs) > 1: + switch len(defs) { + default: // Some instructions define multiple values on real registers. // E.g. call instructions (following calling convention) / div instruction on x64 that defines both rax and rdx. 
// @@ -830,20 +816,21 @@ func (a *Allocator) allocBlock(f Function, blk Block) { if s.regsInUse.has(r) { s.releaseRealReg(r) } - s.useRealReg(r, def) + s.useRealReg(r, s.getVRegState(def.ID())) } - case len(defs) == 1: + case 0: + case 1: def := defs[0] + vState := s.getVRegState(def.ID()) if def.IsRealReg() { r := def.RealReg() if a.allocatableSet.has(r) { if s.regsInUse.has(r) { s.releaseRealReg(r) } - s.useRealReg(r, def) + s.useRealReg(r, vState) } } else { - vState := s.getVRegState(def.ID()) r := vState.r if desired := vState.desiredLoc.realReg(); desired != RealRegInvalid { @@ -864,7 +851,7 @@ func (a *Allocator) allocBlock(f Function, blk Block) { } r = desired s.releaseRealReg(r) - s.useRealReg(r, def) + s.useRealReg(r, vState) } } } @@ -880,9 +867,9 @@ func (a *Allocator) allocBlock(f Function, blk Block) { } if r == RealRegInvalid { typ := def.RegType() - r = s.findOrSpillAllocatable(a, a.regInfo.AllocatableRegisters[typ], RegSet(0), RealRegInvalid) + r = a.findOrSpillAllocatable(s, a.regInfo.AllocatableRegisters[typ], RegSet(0), RealRegInvalid) } - s.useRealReg(r, def) + s.useRealReg(r, vState) } dr := def.SetRealReg(r) instr.AssignDef(dr) @@ -915,9 +902,7 @@ func (a *Allocator) allocBlock(f Function, blk Block) { pc++ } - s.regsInUse.range_(func(allocated RealReg, v VReg) { - currentBlkState.endRegs.add(allocated, v) - }) + s.regsInUse.range_(func(allocated RealReg, v *vrState[I, B, F]) { currentBlkState.endRegs.add(allocated, v) }) currentBlkState.visited = true if wazevoapi.RegAllocLoggingEnabled { @@ -925,32 +910,30 @@ func (a *Allocator) allocBlock(f Function, blk Block) { } // Reset the desired end location. - for _, v := range desiredUpdated { - vs := s.getVRegState(v) + for _, vs := range desiredUpdated { vs.desiredLoc = desiredLocUnspecified } - a.vs2 = desiredUpdated[:0] + a.ss = desiredUpdated[:0] for i := 0; i < blk.Succs(); i++ { - succ := blk.Succ(i) - if succ == nil { + succ := f.Succ(blk, i) + if succ == nilBlk { continue } // If the successor is not visited yet, finalize the start state. - a.finalizeStartReg(succ) + a.finalizeStartReg(f, succ) } } -func (a *Allocator) releaseCallerSavedRegs(addrReg RealReg) { +func (a *Allocator[I, B, F]) releaseCallerSavedRegs(addrReg RealReg) { s := &a.state - for i := 0; i < 64; i++ { - allocated := RealReg(i) + for allocated := RealReg(0); allocated < 64; allocated++ { if allocated == addrReg { // If this is the call indirect, we should not touch the addr register. continue } - if v := s.regsInUse.get(allocated); v.Valid() { - if v.IsRealReg() { + if vs := s.regsInUse.get(allocated); vs != nil { + if vs.v.IsRealReg() { continue // This is the argument register as it's already used by VReg backed by the corresponding RealReg. 
} if !a.regInfo.CallerSavedRegisters.has(allocated) { @@ -962,7 +945,7 @@ func (a *Allocator) releaseCallerSavedRegs(addrReg RealReg) { } } -func (a *Allocator) fixMergeState(f Function, blk Block) { +func (a *Allocator[I, B, F]) fixMergeState(f F, blk B) { preds := blk.Preds() if preds <= 1 { return @@ -974,11 +957,10 @@ func (a *Allocator) fixMergeState(f Function, blk Block) { bID := blk.ID() blkSt := a.getOrAllocateBlockState(bID) desiredOccupants := &blkSt.startRegs - aliveOnRegVRegs := make(map[VReg]RealReg) - for i := 0; i < 64; i++ { - r := RealReg(i) - if v := blkSt.startRegs.get(r); v.Valid() { - aliveOnRegVRegs[v] = r + var desiredOccupantsSet RegSet + for i, v := range desiredOccupants { + if v != nil { + desiredOccupantsSet = desiredOccupantsSet.add(RealReg(i)) } } @@ -987,151 +969,146 @@ func (a *Allocator) fixMergeState(f Function, blk Block) { } s.currentBlockID = bID - a.updateLiveInVRState(a.getOrAllocateBlockState(bID)) + a.updateLiveInVRState(blkSt) - currentOccupants := &a.currentOccupants for i := 0; i < preds; i++ { - currentOccupants.reset() if i == blkSt.startFromPredIndex { continue } - currentOccupantsRev := make(map[VReg]RealReg) - pred := blk.Pred(i) + pred := f.Pred(blk, i) predSt := a.getOrAllocateBlockState(pred.ID()) - for ii := 0; ii < 64; ii++ { - r := RealReg(ii) - if v := predSt.endRegs.get(r); v.Valid() { - if _, ok := aliveOnRegVRegs[v]; !ok { - continue - } - currentOccupants.add(r, v) - currentOccupantsRev[v] = r - } - } s.resetAt(predSt) // Finds the free registers if any. intTmp, floatTmp := VRegInvalid, VRegInvalid if intFree := s.findAllocatable( - a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupants.set, + a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupantsSet, ); intFree != RealRegInvalid { intTmp = FromRealReg(intFree, RegTypeInt) } if floatFree := s.findAllocatable( - a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupants.set, + a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupantsSet, ); floatFree != RealRegInvalid { floatTmp = FromRealReg(floatFree, RegTypeFloat) } - if wazevoapi.RegAllocLoggingEnabled { - fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo)) - } - - for ii := 0; ii < 64; ii++ { - r := RealReg(ii) + for r := RealReg(0); r < 64; r++ { desiredVReg := desiredOccupants.get(r) - if !desiredVReg.Valid() { + if desiredVReg == nil { continue } - currentVReg := currentOccupants.get(r) - if desiredVReg.ID() == currentVReg.ID() { + currentVReg := s.regsInUse.get(r) + if currentVReg != nil && desiredVReg.v.ID() == currentVReg.v.ID() { continue } - typ := desiredVReg.RegType() + typ := desiredVReg.v.RegType() var tmpRealReg VReg if typ == RegTypeInt { tmpRealReg = intTmp } else { tmpRealReg = floatTmp } - a.reconcileEdge(f, r, pred, currentOccupants, currentOccupantsRev, currentVReg, desiredVReg, tmpRealReg, typ) + a.reconcileEdge(f, r, pred, currentVReg, desiredVReg, tmpRealReg, typ) } } } -func (a *Allocator) reconcileEdge(f Function, +// reconcileEdge reconciles the register state between the current block and the predecessor for the real register `r`. +// +// - currentVReg is the current VReg value that sits on the register `r`. This can be VRegInvalid if the register is not used at the end of the predecessor. +// - desiredVReg is the desired VReg value that should be on the register `r`. +// - freeReg is the temporary register that can be used to swap the values, which may or may not be used. +// - typ is the register type of the `r`. 
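// Illustrative sketch, not the function that follows: stripped of the allocator's
// bookkeeping, the four cases enumerated above reduce to this dispatch. cur is the
// value sitting in r at the end of the predecessor (-1 if none), want is the value
// required in r on entry, wantReg is where want currently lives (-1 meaning the
// stack); store, reload, swap, and move stand in for the insertion helpers.
func fixEdge(r, cur, want, wantReg int, store, reload func(v, reg int), swap func(a, b int), move func(dst, src int)) {
	switch {
	case cur >= 0 && wantReg < 0: // case 1: evict cur to the stack, then reload want into r
		store(cur, r)
		reload(want, r)
	case cur >= 0: // case 2: both live in registers: swap r with wantReg
		swap(r, wantReg)
	case wantReg >= 0: // case 3: r is free: move want over from wantReg
		move(r, wantReg)
	default: // case 4: r is free and want is on the stack: reload it
		reload(want, r)
	}
}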
+func (a *Allocator[I, B, F]) reconcileEdge(f F, r RealReg, - pred Block, - currentOccupants *regInUseSet, - currentOccupantsRev map[VReg]RealReg, - currentVReg, desiredVReg VReg, + pred B, + currentState, desiredState *vrState[I, B, F], freeReg VReg, typ RegType, ) { + desiredVReg := desiredState.v + currentVReg := VRegInvalid + if currentState != nil { + currentVReg = currentState.v + } + // There are four cases to consider: + // 1. currentVReg is valid, but desiredVReg is on the stack. + // 2. Both currentVReg and desiredVReg are valid. + // 3. Desired is on a different register than `r` and currentReg is not valid. + // 4. Desired is on the stack and currentReg is not valid. + s := &a.state if currentVReg.Valid() { - // Both are on reg. - er, ok := currentOccupantsRev[desiredVReg] - if !ok { + er := desiredState.r + if er == RealRegInvalid { + // Case 1: currentVReg is valid, but desiredVReg is on the stack. if wazevoapi.RegAllocLoggingEnabled { fmt.Printf("\t\tv%d is desired to be on %s, but currently on the stack\n", desiredVReg.ID(), a.regInfo.RealRegName(r), ) } - // This case is that the desired value is on the stack, but currentVReg is on the target register. - // We need to move the current value to the stack, and reload the desired value. + // We need to move the current value to the stack, and reload the desired value into the register. // TODO: we can do better here. f.StoreRegisterBefore(currentVReg.SetRealReg(r), pred.LastInstrForInsertion()) - delete(currentOccupantsRev, currentVReg) + s.releaseRealReg(r) - s.getVRegState(desiredVReg.ID()).recordReload(f, pred) + desiredState.recordReload(f, pred) f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion()) - currentOccupants.add(r, desiredVReg) - currentOccupantsRev[desiredVReg] = r + s.useRealReg(r, desiredState) return - } - - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n", - desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er), + } else { + // Case 2: Both currentVReg and desiredVReg are valid. + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n", + desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er), + ) + } + // This case, we need to swap the values between the current and desired values. + f.SwapBefore( + currentVReg.SetRealReg(r), + desiredVReg.SetRealReg(er), + freeReg, + pred.LastInstrForInsertion(), ) - } - f.SwapBefore( - currentVReg.SetRealReg(r), - desiredVReg.SetRealReg(er), - freeReg, - pred.LastInstrForInsertion(), - ) - s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg()) - currentOccupantsRev[desiredVReg] = r - currentOccupantsRev[currentVReg] = er - currentOccupants.add(r, desiredVReg) - currentOccupants.add(er, currentVReg) - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er)) + s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg()) + s.releaseRealReg(r) + s.releaseRealReg(er) + s.useRealReg(r, desiredState) + s.useRealReg(er, currentState) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er)) + } } } else { - // Desired is on reg, but currently the target register is not used. 
if wazevoapi.RegAllocLoggingEnabled { fmt.Printf("\t\tv%d is desired to be on %s, current not used\n", desiredVReg.ID(), a.regInfo.RealRegName(r), ) } - if currentReg, ok := currentOccupantsRev[desiredVReg]; ok { + if currentReg := desiredState.r; currentReg != RealRegInvalid { + // Case 3: Desired is on a different register than `r` and currentReg is not valid. + // We simply need to move the desired value to the register. f.InsertMoveBefore( FromRealReg(r, typ), desiredVReg.SetRealReg(currentReg), pred.LastInstrForInsertion(), ) - currentOccupants.remove(currentReg) + s.releaseRealReg(currentReg) } else { - s.getVRegState(desiredVReg.ID()).recordReload(f, pred) + // Case 4: Both currentVReg and desiredVReg are not valid. + // We simply need to reload the desired value into the register. + desiredState.recordReload(f, pred) f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion()) } - currentOccupantsRev[desiredVReg] = r - currentOccupants.add(r, desiredVReg) - } - - if wazevoapi.RegAllocLoggingEnabled { - fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo)) + s.useRealReg(r, desiredState) } } -func (a *Allocator) scheduleSpills(f Function) { +func (a *Allocator[I, B, F]) scheduleSpills(f F) { states := a.state.vrStates for i := 0; i <= states.MaxIDEncountered(); i++ { vs := states.Get(i) @@ -1144,7 +1121,7 @@ } } -func (a *Allocator) scheduleSpill(f Function, vs *vrState) { +func (a *Allocator[I, B, F]) scheduleSpill(f F, vs *vrState[I, B, F]) { v := vs.v // If the value is the phi value, we need to insert a spill after each phi definition. if vs.isPhi { @@ -1157,10 +1134,11 @@ pos := vs.lca definingBlk := vs.defBlk r := RealRegInvalid - if definingBlk == nil { + var nilBlk B + if definingBlk == nilBlk { panic(fmt.Sprintf("BUG: definingBlk should not be nil for %s. This is likely a bug in backend lowering logic", vs.v.String())) } - if pos == nil { + if pos == nilBlk { panic(fmt.Sprintf("BUG: pos should not be nil for %s. This is likely a bug in backend lowering logic", vs.v.String())) } @@ -1169,9 +1147,8 @@ } for pos != definingBlk { st := a.getOrAllocateBlockState(pos.ID()) - for ii := 0; ii < 64; ii++ { - rr := RealReg(ii) - if st.startRegs.get(rr) == v { + for rr := RealReg(0); rr < 64; rr++ { + if vs := st.startRegs.get(rr); vs != nil && vs.v == v { r = rr // Already in the register, so we can place the spill at the beginning of the block. break @@ -1204,7 +1181,7 @@ } // Reset resets the allocator's internal state so that it can be reused.
-func (a *Allocator) Reset() { +func (a *Allocator[I, B, F]) Reset() { a.state.reset() a.blockStates.Reset() a.phiDefInstListPool.Reset() diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go index e9bf60661c..ce84c9c0cd 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go @@ -46,63 +46,51 @@ func (rs RegSet) Range(f func(allocatedRealReg RealReg)) { } } -type regInUseSet struct { - set RegSet - vrs [64]VReg +type regInUseSet[I Instr, B Block[I], F Function[I, B]] [64]*vrState[I, B, F] + +func newRegInUseSet[I Instr, B Block[I], F Function[I, B]]() regInUseSet[I, B, F] { + var ret regInUseSet[I, B, F] + ret.reset() + return ret } -func (rs *regInUseSet) reset() { - rs.set = 0 - for i := range rs.vrs { - rs.vrs[i] = VRegInvalid - } +func (rs *regInUseSet[I, B, F]) reset() { + clear(rs[:]) } -func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused +func (rs *regInUseSet[I, B, F]) format(info *RegisterInfo) string { //nolint:unused var ret []string - for i := 0; i < 64; i++ { - if rs.set&(1<<uint(i)) != 0 { - vr := rs.vrs[i] - ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.ID())) + for i, vr := range rs { + if vr != nil { + ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.v.ID())) } } return strings.Join(ret, ", ") } -func (rs *regInUseSet) has(r RealReg) bool { - if r >= 64 { - return false - } - return rs.set&(1<<uint(r)) != 0 +func (rs *regInUseSet[I, B, F]) has(r RealReg) bool { + return rs[r] != nil } -func (rs *regInUseSet) get(r RealReg) VReg { - if r >= 64 { - return VRegInvalid - } - return rs.vrs[r] +func (rs *regInUseSet[I, B, F]) get(r RealReg) *vrState[I, B, F] { + return rs[r] } -func (rs *regInUseSet) remove(r RealReg) { - if r >= 64 { - return - } - rs.set &= ^(1 << uint(r)) - rs.vrs[r] = VRegInvalid +func (rs *regInUseSet[I, B, F]) remove(r RealReg) { + rs[r] = nil } -func (rs *regInUseSet) add(r RealReg, vr VReg) { +func (rs *regInUseSet[I, B, F]) add(r RealReg, vr *vrState[I, B, F]) { if r >= 64 { return } - rs.set |= 1 << uint(r) - rs.vrs[r] = vr + rs[r] = vr } -func (rs *regInUseSet) range_(f func(allocatedRealReg RealReg, vr VReg)) { - for i := 0; i < 64; i++ { - if rs.set&(1<<uint(i)) != 0 { - f(RealReg(i), rs.vrs[i]) - } - } -} +func (rs *regInUseSet[I, B, F]) range_(f func(allocatedRealReg RealReg, vr *vrState[I, B, F])) { + for i, vr := range rs { + if vr != nil { + f(RealReg(i), vr) + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go if len(listeners) > 0 { + //nolint:staticcheck sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&listeners)) binary.LittleEndian.PutUint64(ret[8:], uint64(sliceHeader.Data)) binary.LittleEndian.PutUint64(ret[16:], uint64(sliceHeader.Len)) @@ -33,6 +34,7 @@ func buildHostModuleOpaque(m *wasm.Module, listeners []experimental.FunctionList func hostModuleFromOpaque(opaqueBegin uintptr) *wasm.Module { var opaqueViewOverSlice []byte + //nolint:staticcheck sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverSlice)) sh.Data = opaqueBegin sh.Len = 32 @@ -42,6 +44,7 @@ func hostModuleFromOpaque(opaqueBegin uintptr) *wasm.Module { func hostModuleListenersSliceFromOpaque(opaqueBegin uintptr) []experimental.FunctionListener { var opaqueViewOverSlice []byte + //nolint:staticcheck sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverSlice)) sh.Data = opaqueBegin sh.Len = 32 @@ -51,9 +54,11 @@ func hostModuleListenersSliceFromOpaque(opaqueBegin uintptr) []experimental.Func l := binary.LittleEndian.Uint64(opaqueViewOverSlice[16:]) c := binary.LittleEndian.Uint64(opaqueViewOverSlice[24:]) var ret []experimental.FunctionListener + //nolint:staticcheck sh = (*reflect.SliceHeader)(unsafe.Pointer(&ret)) sh.Data = uintptr(b) - setSliceLimits(sh, uintptr(l), uintptr(c)) + sh.Len = int(l) + sh.Cap =
int(c) return ret } @@ -62,6 +67,7 @@ func hostModuleGoFuncFromOpaque[T any](index int, opaqueBegin uintptr) T { ptr := opaqueBegin + offset var opaqueViewOverFunction []byte + //nolint:staticcheck sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverFunction)) sh.Data = ptr sh.Len = 16 diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go index ba8f546c0d..efa1b9bbaa 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go @@ -86,16 +86,6 @@ func newAlignedOpaque(size int) moduleContextOpaque { return buf } -func putLocalMemory(opaque []byte, offset wazevoapi.Offset, mem *wasm.MemoryInstance) { - s := uint64(len(mem.Buffer)) - var b uint64 - if len(mem.Buffer) > 0 { - b = uint64(uintptr(unsafe.Pointer(&mem.Buffer[0]))) - } - binary.LittleEndian.PutUint64(opaque[offset:], b) - binary.LittleEndian.PutUint64(opaque[offset+8:], s) -} - func (m *moduleEngine) setupOpaque() { inst := m.module offsets := &m.parent.offsets @@ -106,7 +96,7 @@ func (m *moduleEngine) setupOpaque() { ) if lm := offsets.LocalMemoryBegin; lm >= 0 { - putLocalMemory(opaque, lm, inst.MemoryInstance) + m.putLocalMemory() } // Note: imported memory is resolved in ResolveImportedFunction. @@ -227,6 +217,25 @@ func (m *moduleEngine) SetGlobalValue(i wasm.Index, lo, hi uint64) { // OwnsGlobals implements the same method as documented on wasm.ModuleEngine. func (m *moduleEngine) OwnsGlobals() bool { return true } +// MemoryGrown implements wasm.ModuleEngine. +func (m *moduleEngine) MemoryGrown() { + m.putLocalMemory() +} + +// putLocalMemory writes the local memory buffer pointer and length to the opaque buffer. +func (m *moduleEngine) putLocalMemory() { + mem := m.module.MemoryInstance + offset := m.parent.offsets.LocalMemoryBegin + + s := uint64(len(mem.Buffer)) + var b uint64 + if len(mem.Buffer) > 0 { + b = uint64(uintptr(unsafe.Pointer(&mem.Buffer[0]))) + } + binary.LittleEndian.PutUint64(m.opaque[offset:], b) + binary.LittleEndian.PutUint64(m.opaque[offset+8:], s) +} + // ResolveImportedFunction implements wasm.ModuleEngine. func (m *moduleEngine) ResolveImportedFunction(index, indexInImportedModule wasm.Index, importedModuleEngine wasm.ModuleEngine) { executableOffset, moduleCtxOffset, typeIDOffset := m.parent.offsets.ImportedFunctionOffset(index) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect.go deleted file mode 100644 index 6a03fc65c7..0000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect.go +++ /dev/null @@ -1,11 +0,0 @@ -//go:build !tinygo - -package wazevo - -import "reflect" - -// setSliceLimits sets both Cap and Len for the given reflected slice. -func setSliceLimits(s *reflect.SliceHeader, l, c uintptr) { - s.Len = int(l) - s.Cap = int(c) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect_tinygo.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect_tinygo.go deleted file mode 100644 index eda3e706ac..0000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect_tinygo.go +++ /dev/null @@ -1,11 +0,0 @@ -//go:build tinygo - -package wazevo - -import "reflect" - -// setSliceLimits sets both Cap and Len for the given reflected slice. 
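Both files above lean on the same trick: a Go slice is rebuilt from a raw (pointer, len, cap) triple stored in the opaque module context, and v1.8.0 writes Len/Cap directly (with `//nolint:staticcheck` for the deprecated reflect.SliceHeader) instead of going through the build-tag-split setSliceLimits helpers deleted here. A minimal, self-contained sketch of the pattern — `rebuild` and its parameters are illustrative names, not wazero API; on recent Go, unsafe.Slice is the cleaner route:

```go
package main

import (
	"fmt"
	"reflect"
	"runtime"
	"unsafe"
)

// rebuild reconstructs a []byte from a raw pointer, length, and capacity,
// mirroring what hostModuleListenersSliceFromOpaque does above.
func rebuild(data uintptr, l, c uint64) []byte {
	var out []byte
	//nolint:staticcheck // reflect.SliceHeader is deprecated; unsafe.Slice is the modern equivalent.
	sh := (*reflect.SliceHeader)(unsafe.Pointer(&out))
	sh.Data = data
	sh.Len = int(l)
	sh.Cap = int(c)
	return out
}

func main() {
	src := []byte("hello")
	p := uintptr(unsafe.Pointer(&src[0]))
	fmt.Println(string(rebuild(p, 2, 5))) // "he"
	runtime.KeepAlive(src)                // keep the backing array alive past the unsafe read
}
```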
-func setSliceLimits(s *reflect.SliceHeader, l, c uintptr) {
-	s.Len = l
-	s.Cap = c
-}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go
index 10b6b4b62b..cf7f14d3b1 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go
@@ -34,9 +34,6 @@ type BasicBlock interface {
 	// The returned Value is the definition of the param in this block.
 	Param(i int) Value
 
-	// InsertInstruction inserts an instruction that implements Value into the tail of this block.
-	InsertInstruction(raw *Instruction)
-
 	// Root returns the root instruction of this block.
 	Root() *Instruction
 
@@ -49,21 +46,12 @@ type BasicBlock interface {
 	// ReturnBlock returns true if this block represents the function return.
 	ReturnBlock() bool
 
-	// FormatHeader returns the debug string of this block, not including instruction.
-	FormatHeader(b Builder) string
-
 	// Valid is true if this block is still valid even after optimizations.
 	Valid() bool
 
 	// Sealed is true if this block has been sealed.
 	Sealed() bool
 
-	// BeginPredIterator returns the first predecessor of this block.
-	BeginPredIterator() BasicBlock
-
-	// NextPredIterator returns the next predecessor of this block.
-	NextPredIterator() BasicBlock
-
 	// Preds returns the number of predecessors of this block.
 	Preds() int
 
@@ -88,10 +76,11 @@ type (
 	basicBlock struct {
 		id BasicBlockID
 		rootInstr, currentInstr *Instruction
-		params []blockParam
-		predIter int
-		preds []basicBlockPredecessorInfo
-		success []*basicBlock
+		// params are Values that represent parameters to a basicBlock.
+		// Each parameter can be considered as an output of PHI instruction in traditional SSA.
+		params Values
+		preds []basicBlockPredecessorInfo
+		success []*basicBlock
 		// singlePred is the alias to preds[0] for fast lookup, and only set after Seal is called.
 		singlePred *basicBlock
 		// lastDefinitions maps Variable to its last definition in this block.
@@ -116,11 +105,14 @@ type (
 
 		// loopNestingForestChildren holds the children of this block in the loop nesting forest.
 		// Non-empty if and only if this block is a loop header (i.e. loopHeader=true)
-		loopNestingForestChildren []BasicBlock
+		loopNestingForestChildren wazevoapi.VarLength[BasicBlock]
 
 		// reversePostOrder is used to sort all the blocks in the function in reverse post order.
 		// This is used in builder.LayoutBlocks.
-		reversePostOrder int
+		reversePostOrder int32
+
+		// visited is used during various traversals.
+		visited int32
 
 		// child and sibling are the ones in the dominator tree.
 		child, sibling *basicBlock
@@ -128,15 +120,6 @@ type (
 	// BasicBlockID is the unique ID of a basicBlock.
 	BasicBlockID uint32
 
-	// blockParam implements Value and represents a parameter to a basicBlock.
-	blockParam struct {
-		// value is the Value that corresponds to the parameter in this block,
-		// and can be considered as an output of PHI instruction in traditional SSA.
-		value Value
-		// typ is the type of the parameter.
-		typ Type
-	}
-
 	unknownValue struct {
 		// variable is the variable that this unknownValue represents.
 		variable Variable
@@ -145,6 +128,9 @@ type (
 	}
 )
 
+// basicBlockVarLengthNil is the default nil value for basicBlock.loopNestingForestChildren.
+var basicBlockVarLengthNil = wazevoapi.NewNilVarLength[BasicBlock]()
+
 const basicBlockIDReturnBlock = 0xffffffff
 
 // Name implements BasicBlock.Name.
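With the predIter cursor gone from basicBlock, predecessor walks use the counting accessors that stay on the interface. A hedged sketch of the call-site migration; `block` here is a stand-in type, and Pred(i) is assumed from the full wazero interface (it returns the i-th predecessor):

```go
package main

import "fmt"

// block is a pared-down stand-in for ssa.BasicBlock, keeping only the
// accessors this example needs.
type block struct {
	name  string
	preds []*block
}

func (b *block) Preds() int        { return len(b.preds) }
func (b *block) Pred(i int) *block { return b.preds[i] }

// visitPreds shows the index-based loop that replaces the removed
// BeginPredIterator/NextPredIterator pair (and the predIter cursor field).
func visitPreds(b *block) {
	for i := 0; i < b.Preds(); i++ {
		fmt.Println("pred:", b.Pred(i).name)
	}
}

func main() {
	entry := &block{name: "blk0"}
	loop := &block{name: "blk1", preds: []*block{entry}}
	loop.preds = append(loop.preds, loop) // back edge
	visitPreds(loop)
}
```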
@@ -190,24 +176,23 @@ func (bb *basicBlock) ReturnBlock() bool { // AddParam implements BasicBlock.AddParam. func (bb *basicBlock) AddParam(b Builder, typ Type) Value { paramValue := b.allocateValue(typ) - bb.params = append(bb.params, blockParam{typ: typ, value: paramValue}) + bb.params = bb.params.Append(&b.(*builder).varLengthPool, paramValue) return paramValue } // addParamOn adds a parameter to this block whose value is already allocated. -func (bb *basicBlock) addParamOn(typ Type, value Value) { - bb.params = append(bb.params, blockParam{typ: typ, value: value}) +func (bb *basicBlock) addParamOn(b *builder, value Value) { + bb.params = bb.params.Append(&b.varLengthPool, value) } // Params implements BasicBlock.Params. func (bb *basicBlock) Params() int { - return len(bb.params) + return len(bb.params.View()) } // Param implements BasicBlock.Param. func (bb *basicBlock) Param(i int) Value { - p := &bb.params[i] - return p.value + return bb.params.View()[i] } // Valid implements BasicBlock.Valid. @@ -220,8 +205,8 @@ func (bb *basicBlock) Sealed() bool { return bb.sealed } -// InsertInstruction implements BasicBlock.InsertInstruction. -func (bb *basicBlock) InsertInstruction(next *Instruction) { +// insertInstruction implements BasicBlock.InsertInstruction. +func (bb *basicBlock) insertInstruction(b *builder, next *Instruction) { current := bb.currentInstr if current != nil { current.next = next @@ -233,12 +218,12 @@ func (bb *basicBlock) InsertInstruction(next *Instruction) { switch next.opcode { case OpcodeJump, OpcodeBrz, OpcodeBrnz: - target := next.blk.(*basicBlock) - target.addPred(bb, next) + target := BasicBlockID(next.rValue) + b.basicBlock(target).addPred(bb, next) case OpcodeBrTable: - for _, _target := range next.targets { - target := _target.(*basicBlock) - target.addPred(bb, next) + for _, _target := range next.rValues.View() { + target := BasicBlockID(_target) + b.basicBlock(target).addPred(bb, next) } } } @@ -248,22 +233,6 @@ func (bb *basicBlock) NumPreds() int { return len(bb.preds) } -// BeginPredIterator implements BasicBlock.BeginPredIterator. -func (bb *basicBlock) BeginPredIterator() BasicBlock { - bb.predIter = 0 - return bb.NextPredIterator() -} - -// NextPredIterator implements BasicBlock.NextPredIterator. -func (bb *basicBlock) NextPredIterator() BasicBlock { - if bb.predIter >= len(bb.preds) { - return nil - } - pred := bb.preds[bb.predIter].blk - bb.predIter++ - return pred -} - // Preds implements BasicBlock.Preds. func (bb *basicBlock) Preds() int { return len(bb.preds) @@ -296,7 +265,7 @@ func (bb *basicBlock) Tail() *Instruction { // reset resets the basicBlock to its initial state so that it can be reused for another function. func resetBasicBlock(bb *basicBlock) { - bb.params = bb.params[:0] + bb.params = ValuesNil bb.rootInstr, bb.currentInstr = nil, nil bb.preds = bb.preds[:0] bb.success = bb.success[:0] @@ -305,7 +274,8 @@ func resetBasicBlock(bb *basicBlock) { bb.unknownValues = bb.unknownValues[:0] bb.lastDefinitions = wazevoapi.ResetMap(bb.lastDefinitions) bb.reversePostOrder = -1 - bb.loopNestingForestChildren = bb.loopNestingForestChildren[:0] + bb.visited = 0 + bb.loopNestingForestChildren = basicBlockVarLengthNil bb.loopHeader = false bb.sibling = nil bb.child = nil @@ -335,11 +305,11 @@ func (bb *basicBlock) addPred(blk BasicBlock, branch *Instruction) { pred.success = append(pred.success, bb) } -// FormatHeader implements BasicBlock.FormatHeader. 
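insertInstruction above reads branch targets back with BasicBlockID(next.rValue): targets are no longer stored as interface pointers but as block IDs cast into the Value-typed rValue field. The invariant is a lossless integer round-trip, sketched here with reduced stand-in types (not the real ssa definitions):

```go
package main

import "fmt"

// Reduced stand-ins: ssa.Value is backed by uint64 and ssa.BasicBlockID by
// uint32, so a block ID survives a round-trip through the rValue field.
type (
	Value        uint64
	BasicBlockID uint32
)

func main() {
	target := BasicBlockID(42)
	rValue := Value(target)           // AsJump/AsBrz/AsBrnz store the target this way
	fmt.Println(BasicBlockID(rValue)) // insertInstruction/BranchData read it back: 42
}
```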
-func (bb *basicBlock) FormatHeader(b Builder) string { - ps := make([]string, len(bb.params)) - for i, p := range bb.params { - ps[i] = p.value.formatWithType(b) +// formatHeader returns the string representation of the header of the basicBlock. +func (bb *basicBlock) formatHeader(b Builder) string { + ps := make([]string, len(bb.params.View())) + for i, p := range bb.params.View() { + ps[i] = p.formatWithType(b) } if len(bb.preds) > 0 { @@ -366,7 +336,9 @@ func (bb *basicBlock) validate(b *builder) { if len(bb.preds) > 0 { for _, pred := range bb.preds { if pred.branch.opcode != OpcodeBrTable { - if target := pred.branch.blk; target != bb { + blockID := int(pred.branch.rValue) + target := b.basicBlocksPool.View(blockID) + if target != bb { panic(fmt.Sprintf("BUG: '%s' is not branch to %s, but to %s", pred.branch.Format(b), bb.Name(), target.Name())) } @@ -376,14 +348,14 @@ func (bb *basicBlock) validate(b *builder) { if bb.ReturnBlock() { exp = len(b.currentSignature.Results) } else { - exp = len(bb.params) + exp = len(bb.params.View()) } if len(pred.branch.vs.View()) != exp { panic(fmt.Sprintf( "BUG: len(argument at %s) != len(params at %s): %d != %d: %s", pred.blk.Name(), bb.Name(), - len(pred.branch.vs.View()), len(bb.params), pred.branch.Format(b), + len(pred.branch.vs.View()), len(bb.params.View()), pred.branch.Format(b), )) } @@ -398,7 +370,7 @@ func (bb *basicBlock) String() string { // LoopNestingForestChildren implements BasicBlock.LoopNestingForestChildren. func (bb *basicBlock) LoopNestingForestChildren() []BasicBlock { - return bb.loopNestingForestChildren + return bb.loopNestingForestChildren.View() } // LoopHeader implements BasicBlock.LoopHeader. diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go index e1471edc37..fb98298f7f 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go @@ -1,5 +1,3 @@ -//go:build go1.21 - package ssa import ( diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go deleted file mode 100644 index 9dc881dae7..0000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go +++ /dev/null @@ -1,24 +0,0 @@ -//go:build !go1.21 - -// TODO: delete after the floor Go version is 1.21 - -package ssa - -import "sort" - -func sortBlocks(blocks []*basicBlock) { - sort.SliceStable(blocks, func(i, j int) bool { - iBlk, jBlk := blocks[i], blocks[j] - if jBlk.ReturnBlock() { - return true - } - if iBlk.ReturnBlock() { - return false - } - iRoot, jRoot := iBlk.rootInstr, jBlk.rootInstr - if iRoot == nil || jRoot == nil { // For testing. - return true - } - return iBlk.rootInstr.id < jBlk.rootInstr.id - }) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go index 1fc84d2eaf..43dd7d2928 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go @@ -54,9 +54,6 @@ type Builder interface { // MustFindValue searches the latest definition of the given Variable and returns the result. 
MustFindValue(variable Variable) Value - // MustFindValueInBlk is the same as MustFindValue except it searches the latest definition from the given BasicBlock. - MustFindValueInBlk(variable Variable, blk BasicBlock) Value - // FindValueInLinearPath tries to find the latest definition of the given Variable in the linear path to the current BasicBlock. // If it cannot find the definition, or it's not sealed yet, it returns ValueInvalid. FindValueInLinearPath(variable Variable) Value @@ -97,9 +94,9 @@ type Builder interface { // Returns nil if there's no unseen BasicBlock. BlockIteratorNext() BasicBlock - // ValueRefCounts returns the map of ValueID to its reference count. - // The returned slice must not be modified. - ValueRefCounts() []int + // ValuesInfo returns the data per Value used to lower the SSA in backend. + // This is indexed by ValueID. + ValuesInfo() []ValueInfo // BlockIteratorReversePostOrderBegin is almost the same as BlockIteratorBegin except it returns the BasicBlock in the reverse post-order. // This is available after RunPasses is run. @@ -127,21 +124,29 @@ type Builder interface { // Idom returns the immediate dominator of the given BasicBlock. Idom(blk BasicBlock) BasicBlock + // VarLengthPool returns the VarLengthPool of Value. VarLengthPool() *wazevoapi.VarLengthPool[Value] + + // InsertZeroValue inserts a zero value constant instruction of the given type. + InsertZeroValue(t Type) + + // BasicBlock returns the BasicBlock of the given ID. + BasicBlock(id BasicBlockID) BasicBlock + + // InstructionOfValue returns the Instruction that produces the given Value or nil if the Value is not produced by any Instruction. + InstructionOfValue(v Value) *Instruction } // NewBuilder returns a new Builder implementation. func NewBuilder() Builder { return &builder{ - instructionsPool: wazevoapi.NewPool[Instruction](resetInstruction), - basicBlocksPool: wazevoapi.NewPool[basicBlock](resetBasicBlock), - varLengthPool: wazevoapi.NewVarLengthPool[Value](), - valueAnnotations: make(map[ValueID]string), - signatures: make(map[SignatureID]*Signature), - blkVisited: make(map[*basicBlock]int), - valueIDAliases: make(map[ValueID]Value), - redundantParameterIndexToValue: make(map[int]Value), - returnBlk: &basicBlock{id: basicBlockIDReturnBlock}, + instructionsPool: wazevoapi.NewPool[Instruction](resetInstruction), + basicBlocksPool: wazevoapi.NewPool[basicBlock](resetBasicBlock), + varLengthBasicBlockPool: wazevoapi.NewVarLengthPool[BasicBlock](), + varLengthPool: wazevoapi.NewVarLengthPool[Value](), + valueAnnotations: make(map[ValueID]string), + signatures: make(map[SignatureID]*Signature), + returnBlk: &basicBlock{id: basicBlockIDReturnBlock}, } } @@ -158,36 +163,32 @@ type builder struct { currentBB *basicBlock returnBlk *basicBlock - // variables track the types for Variable with the index regarded Variable. - variables []Type // nextValueID is used by builder.AllocateValue. nextValueID ValueID // nextVariable is used by builder.AllocateVariable. nextVariable Variable - valueIDAliases map[ValueID]Value + // valueAnnotations contains the annotations for each Value, only used for debugging. valueAnnotations map[ValueID]string - // valueRefCounts is used to lower the SSA in backend, and will be calculated - // by the last SSA-level optimization pass. - valueRefCounts []int + // valuesInfo contains the data per Value used to lower the SSA in backend. This is indexed by ValueID. + valuesInfo []ValueInfo // dominators stores the immediate dominator of each BasicBlock. 
// The index is blockID of the BasicBlock. dominators []*basicBlock sparseTree dominatorSparseTree + varLengthBasicBlockPool wazevoapi.VarLengthPool[BasicBlock] + // loopNestingForestRoots are the roots of the loop nesting forest. loopNestingForestRoots []BasicBlock // The followings are used for optimization passes/deterministic compilation. - instStack []*Instruction - blkVisited map[*basicBlock]int - valueIDToInstruction []*Instruction - blkStack []*basicBlock - blkStack2 []*basicBlock - ints []int - redundantParameterIndexToValue map[int]Value + instStack []*Instruction + blkStack []*basicBlock + blkStack2 []*basicBlock + redundantParams []redundantParam // blockIterCur is used to implement blockIteratorBegin and blockIteratorNext. blockIterCur int @@ -200,6 +201,60 @@ type builder struct { donePostBlockLayoutPasses bool currentSourceOffset SourceOffset + + // zeros are the zero value constants for each type. + zeros [typeEnd]Value +} + +// ValueInfo contains the data per Value used to lower the SSA in backend. +type ValueInfo struct { + // RefCount is the reference count of the Value. + RefCount uint32 + alias Value +} + +// redundantParam is a pair of the index of the redundant parameter and the Value. +// This is used to eliminate the redundant parameters in the optimization pass. +type redundantParam struct { + // index is the index of the redundant parameter in the basicBlock. + index int + // uniqueValue is the Value which is passed to the redundant parameter. + uniqueValue Value +} + +// BasicBlock implements Builder.BasicBlock. +func (b *builder) BasicBlock(id BasicBlockID) BasicBlock { + return b.basicBlock(id) +} + +func (b *builder) basicBlock(id BasicBlockID) *basicBlock { + if id == basicBlockIDReturnBlock { + return b.returnBlk + } + return b.basicBlocksPool.View(int(id)) +} + +// InsertZeroValue implements Builder.InsertZeroValue. 
+func (b *builder) InsertZeroValue(t Type) { + if b.zeros[t].Valid() { + return + } + zeroInst := b.AllocateInstruction() + switch t { + case TypeI32: + zeroInst.AsIconst32(0) + case TypeI64: + zeroInst.AsIconst64(0) + case TypeF32: + zeroInst.AsF32const(0) + case TypeF64: + zeroInst.AsF64const(0) + case TypeV128: + zeroInst.AsVconst(0, 0) + default: + panic("TODO: " + t.String()) + } + b.zeros[t] = zeroInst.Insert(b).Return() } func (b *builder) VarLengthPool() *wazevoapi.VarLengthPool[Value] { @@ -215,10 +270,12 @@ func (b *builder) ReturnBlock() BasicBlock { func (b *builder) Init(s *Signature) { b.nextVariable = 0 b.currentSignature = s + b.zeros = [typeEnd]Value{ValueInvalid, ValueInvalid, ValueInvalid, ValueInvalid, ValueInvalid, ValueInvalid} resetBasicBlock(b.returnBlk) b.instructionsPool.Reset() b.basicBlocksPool.Reset() b.varLengthPool.Reset() + b.varLengthBasicBlockPool.Reset() b.donePreBlockLayoutPasses = false b.doneBlockLayout = false b.donePostBlockLayoutPasses = false @@ -226,31 +283,20 @@ func (b *builder) Init(s *Signature) { sig.used = false } - b.ints = b.ints[:0] + b.redundantParams = b.redundantParams[:0] b.blkStack = b.blkStack[:0] b.blkStack2 = b.blkStack2[:0] b.dominators = b.dominators[:0] b.loopNestingForestRoots = b.loopNestingForestRoots[:0] - - for i := 0; i < b.basicBlocksPool.Allocated(); i++ { - blk := b.basicBlocksPool.View(i) - delete(b.blkVisited, blk) - } b.basicBlocksPool.Reset() for v := ValueID(0); v < b.nextValueID; v++ { delete(b.valueAnnotations, v) - delete(b.valueIDAliases, v) - b.valueRefCounts[v] = 0 - b.valueIDToInstruction[v] = nil + b.valuesInfo[v] = ValueInfo{alias: ValueInvalid} } b.nextValueID = 0 b.reversePostOrderedBasicBlocks = b.reversePostOrderedBasicBlocks[:0] b.doneBlockLayout = false - for i := range b.valueRefCounts { - b.valueRefCounts[i] = 0 - } - b.currentSourceOffset = sourceOffsetUnknown } @@ -330,7 +376,7 @@ func (b *builder) Idom(blk BasicBlock) BasicBlock { // InsertInstruction implements Builder.InsertInstruction. func (b *builder) InsertInstruction(instr *Instruction) { - b.currentBB.InsertInstruction(instr) + b.currentBB.insertInstruction(b, instr) if l := b.currentSourceOffset; l.Valid() { // Emit the source offset info only when the instruction has side effect because @@ -352,7 +398,7 @@ func (b *builder) InsertInstruction(instr *Instruction) { } r1 := b.allocateValue(t1) - instr.rValue = r1 + instr.rValue = r1.setInstructionID(instr.id) tsl := len(ts) if tsl == 0 { @@ -361,20 +407,14 @@ func (b *builder) InsertInstruction(instr *Instruction) { rValues := b.varLengthPool.Allocate(tsl) for i := 0; i < tsl; i++ { - rValues = rValues.Append(&b.varLengthPool, b.allocateValue(ts[i])) + rn := b.allocateValue(ts[i]) + rValues = rValues.Append(&b.varLengthPool, rn.setInstructionID(instr.id)) } instr.rValues = rValues } // DefineVariable implements Builder.DefineVariable. func (b *builder) DefineVariable(variable Variable, value Value, block BasicBlock) { - if b.variables[variable].invalid() { - panic("BUG: trying to define variable " + variable.String() + " but is not declared yet") - } - - if b.variables[variable] != value.Type() { - panic(fmt.Sprintf("BUG: inconsistent type for variable %d: expected %s but got %s", variable, b.variables[variable], value.Type())) - } bb := block.(*basicBlock) bb.lastDefinitions[variable] = value } @@ -401,20 +441,9 @@ func (b *builder) EntryBlock() BasicBlock { // DeclareVariable implements Builder.DeclareVariable. 
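findValue above now bottoms out at b.zeros when an undefined variable reaches the entry block, and InsertZeroValue lazily materializes one zero constant per type. A toy version of that memoization, with generic names in place of the wazero types:

```go
package main

import "fmt"

type Type int

const (
	TypeI32 Type = iota
	TypeI64
	typeEnd
)

// zeroCache hands out one shared zero "constant" per type, creating it
// lazily; this mirrors the zeros [typeEnd]Value table above, where the
// stand-in int plays the role of an inserted iconst/fconst instruction.
type zeroCache struct {
	zeros [typeEnd]int // 0 means "not created yet" in this toy version
	next  int
}

func (c *zeroCache) zero(t Type) int {
	if c.zeros[t] == 0 {
		c.next++
		c.zeros[t] = c.next
	}
	return c.zeros[t]
}

func main() {
	var c zeroCache
	fmt.Println(c.zero(TypeI32), c.zero(TypeI64), c.zero(TypeI32)) // 1 2 1
}
```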
func (b *builder) DeclareVariable(typ Type) Variable { - v := b.allocateVariable() - iv := int(v) - if l := len(b.variables); l <= iv { - b.variables = append(b.variables, make([]Type, 2*(l+1))...) - } - b.variables[v] = typ - return v -} - -// allocateVariable allocates a new variable. -func (b *builder) allocateVariable() (ret Variable) { - ret = b.nextVariable + v := b.nextVariable b.nextVariable++ - return + return v.setType(typ) } // allocateValue implements Builder.AllocateValue. @@ -448,15 +477,9 @@ func (b *builder) findValueInLinearPath(variable Variable, blk *basicBlock) Valu return ValueInvalid } -func (b *builder) MustFindValueInBlk(variable Variable, blk BasicBlock) Value { - typ := b.definedVariableType(variable) - return b.findValue(typ, variable, blk.(*basicBlock)) -} - // MustFindValue implements Builder.MustFindValue. func (b *builder) MustFindValue(variable Variable) Value { - typ := b.definedVariableType(variable) - return b.findValue(typ, variable, b.currentBB) + return b.findValue(variable.getType(), variable, b.currentBB) } // findValue recursively tries to find the latest definition of a `variable`. The algorithm is described in @@ -482,6 +505,9 @@ func (b *builder) findValue(typ Type, variable Variable, blk *basicBlock) Value value: value, }) return value + } else if blk.EntryBlock() { + // If this is the entry block, we reach the uninitialized variable which has zero value. + return b.zeros[variable.getType()] } if pred := blk.singlePred; pred != nil { @@ -495,21 +521,41 @@ func (b *builder) findValue(typ Type, variable Variable, blk *basicBlock) Value // If this block has multiple predecessors, we have to gather the definitions, // and treat them as an argument to this block. // - // The first thing is to define a new parameter to this block which may or may not be redundant, but - // later we eliminate trivial params in an optimization pass. This must be done before finding the - // definitions in the predecessors so that we can break the cycle. - paramValue := blk.AddParam(b, typ) - b.DefineVariable(variable, paramValue, blk) - - // After the new param is added, we have to manipulate the original branching instructions - // in predecessors so that they would pass the definition of `variable` as the argument to - // the newly added PHI. + // But before that, we have to check if the possible definitions are the same Value. + tmpValue := b.allocateValue(typ) + // Break the cycle by defining the variable with the tmpValue. + b.DefineVariable(variable, tmpValue, blk) + // Check all the predecessors if they have the same definition. + uniqueValue := ValueInvalid for i := range blk.preds { - pred := &blk.preds[i] - value := b.findValue(typ, variable, pred.blk) - pred.branch.addArgumentBranchInst(b, value) + predValue := b.findValue(typ, variable, blk.preds[i].blk) + if uniqueValue == ValueInvalid { + uniqueValue = predValue + } else if uniqueValue != predValue { + uniqueValue = ValueInvalid + break + } + } + + if uniqueValue != ValueInvalid { + // If all the predecessors have the same definition, we can use that value. + b.alias(tmpValue, uniqueValue) + return uniqueValue + } else { + // Otherwise, add the tmpValue to this block as a parameter which may or may not be redundant, but + // later we eliminate trivial params in an optimization pass. This must be done before finding the + // definitions in the predecessors so that we can break the cycle. 
+ blk.addParamOn(b, tmpValue) + // After the new param is added, we have to manipulate the original branching instructions + // in predecessors so that they would pass the definition of `variable` as the argument to + // the newly added PHI. + for i := range blk.preds { + pred := &blk.preds[i] + value := b.findValue(typ, variable, pred.blk) + pred.branch.addArgumentBranchInst(b, value) + } + return tmpValue } - return paramValue } // Seal implements Builder.Seal. @@ -522,8 +568,8 @@ func (b *builder) Seal(raw BasicBlock) { for _, v := range blk.unknownValues { variable, phiValue := v.variable, v.value - typ := b.definedVariableType(variable) - blk.addParamOn(typ, phiValue) + typ := variable.getType() + blk.addParamOn(b, phiValue) for i := range blk.preds { pred := &blk.preds[i] predValue := b.findValue(typ, variable, pred.blk) @@ -535,15 +581,6 @@ func (b *builder) Seal(raw BasicBlock) { } } -// definedVariableType returns the type of the given variable. If the variable is not defined yet, it panics. -func (b *builder) definedVariableType(variable Variable) Type { - typ := b.variables[variable] - if typ.invalid() { - panic(fmt.Sprintf("%s is not defined yet", variable)) - } - return typ -} - // Format implements Builder.Format. func (b *builder) Format() string { str := strings.Builder{} @@ -566,7 +603,7 @@ func (b *builder) Format() string { } for bb := iterBegin(); bb != nil; bb = iterNext() { str.WriteByte('\n') - str.WriteString(bb.FormatHeader(b)) + str.WriteString(bb.formatHeader(b)) str.WriteByte('\n') for cur := bb.Root(); cur != nil; cur = cur.Next() { @@ -645,15 +682,24 @@ func (b *builder) blockIteratorReversePostOrderNext() *basicBlock { } } -// ValueRefCounts implements Builder.ValueRefCounts. -func (b *builder) ValueRefCounts() []int { - return b.valueRefCounts +// ValuesInfo implements Builder.ValuesInfo. +func (b *builder) ValuesInfo() []ValueInfo { + return b.valuesInfo } // alias records the alias of the given values. The alias(es) will be // eliminated in the optimization pass via resolveArgumentAlias. func (b *builder) alias(dst, src Value) { - b.valueIDAliases[dst.ID()] = src + did := int(dst.ID()) + if did >= len(b.valuesInfo) { + l := did + 1 - len(b.valuesInfo) + b.valuesInfo = append(b.valuesInfo, make([]ValueInfo, l)...) + view := b.valuesInfo[len(b.valuesInfo)-l:] + for i := range view { + view[i].alias = ValueInvalid + } + } + b.valuesInfo[did].alias = src } // resolveArgumentAlias resolves the alias of the arguments of the given instruction. @@ -678,10 +724,13 @@ func (b *builder) resolveArgumentAlias(instr *Instruction) { // resolveAlias resolves the alias of the given value. func (b *builder) resolveAlias(v Value) Value { + info := b.valuesInfo + l := ValueID(len(info)) // Some aliases are chained, so we need to resolve them recursively. for { - if src, ok := b.valueIDAliases[v.ID()]; ok { - v = src + vid := v.ID() + if vid < l && info[vid].alias.Valid() { + v = info[vid].alias } else { break } @@ -729,3 +778,13 @@ func (b *builder) LoopNestingForestRoots() []BasicBlock { func (b *builder) LowestCommonAncestor(blk1, blk2 BasicBlock) BasicBlock { return b.sparseTree.findLCA(blk1.ID(), blk2.ID()) } + +// InstructionOfValue returns the instruction that produces the given Value, or nil +// if the Value is not produced by any instruction. 
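resolveAlias above walks alias chains through the ValuesInfo table instead of a map. The core loop, reduced to plain ints (`invalid`, `info`, and `resolve` are illustrative names, not the ssa package's identifiers):

```go
package main

import "fmt"

const invalid = -1

// info mirrors the alias field of ValueInfo above; the slice index is the value ID.
type info struct{ alias int }

// resolve follows alias chains to the final value ID, as resolveAlias does;
// IDs at or beyond len(infos) have no entry and resolve to themselves.
func resolve(infos []info, v int) int {
	for v < len(infos) && infos[v].alias != invalid {
		v = infos[v].alias
	}
	return v
}

func main() {
	infos := []info{{alias: invalid}, {alias: 0}, {alias: 1}} // v2 -> v1 -> v0
	fmt.Println(resolve(infos, 2))                            // 0
}
```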
+func (b *builder) InstructionOfValue(v Value) *Instruction {
+	instrID := v.instructionID()
+	if instrID <= 0 {
+		return nil
+	}
+	return b.instructionsPool.View(instrID - 1)
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go
index 3e3482efc4..9a3d1da6e9 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go
@@ -25,11 +25,13 @@ type Instruction struct {
 	v3 Value
 	vs Values
 	typ Type
-	blk BasicBlock
-	targets []BasicBlock
 	prev, next *Instruction
 
-	rValue Value
+	// rValue is the (first) return value of this instruction.
+	// For branching instructions except for OpcodeBrTable, it holds the BlockID to jump to, cast to Value.
+	rValue Value
+	// rValues are the rest of the return values of this instruction.
+	// For OpcodeBrTable, it holds the list of BlockIDs to jump to, cast to Values.
 	rValues Values
 	gid InstructionGroupID
 	sourceOffset SourceOffset
@@ -105,6 +107,9 @@ type InstructionGroupID uint32
 // Returns Value(s) produced by this instruction if any.
 // The `first` is the first return value, and `rest` is the rest of the values.
 func (i *Instruction) Returns() (first Value, rest []Value) {
+	if i.IsBranching() {
+		return ValueInvalid, nil
+	}
 	return i.rValue, i.rValues.View()
 }
 
@@ -2077,7 +2082,7 @@ func (i *Instruction) InvertBrx() {
 }
 
 // BranchData returns the branch data for this instruction necessary for backends.
-func (i *Instruction) BranchData() (condVal Value, blockArgs []Value, target BasicBlock) {
+func (i *Instruction) BranchData() (condVal Value, blockArgs []Value, target BasicBlockID) {
 	switch i.opcode {
 	case OpcodeJump:
 		condVal = ValueInvalid
@@ -2087,17 +2092,17 @@ func (i *Instruction) BranchData() (condVal Value, blockArgs []Value, target Bas
 		panic("BUG")
 	}
 	blockArgs = i.vs.View()
-	target = i.blk
+	target = BasicBlockID(i.rValue)
 	return
 }
 
 // BrTableData returns the branch table data for this instruction necessary for backends.
-func (i *Instruction) BrTableData() (index Value, targets []BasicBlock) {
+func (i *Instruction) BrTableData() (index Value, targets Values) {
 	if i.opcode != OpcodeBrTable {
 		panic("BUG: BrTableData only available for OpcodeBrTable")
 	}
 	index = i.v
-	targets = i.targets
+	targets = i.rValues
 	return
 }
 
@@ -2105,7 +2110,7 @@ func (i *Instruction) BrTableData() (index Value, targets []BasicBlock) {
 func (i *Instruction) AsJump(vs Values, target BasicBlock) *Instruction {
 	i.opcode = OpcodeJump
 	i.vs = vs
-	i.blk = target
+	i.rValue = Value(target.ID())
 	return i
 }
 
@@ -2130,7 +2135,7 @@ func (i *Instruction) AsBrz(v Value, args Values, target BasicBlock) {
 	i.opcode = OpcodeBrz
 	i.v = v
 	i.vs = args
-	i.blk = target
+	i.rValue = Value(target.ID())
 }
 
 // AsBrnz initializes this instruction as a branch-if-not-zero instruction with OpcodeBrnz.
@@ -2138,15 +2143,16 @@ func (i *Instruction) AsBrnz(v Value, args Values, target BasicBlock) *Instructi
 	i.opcode = OpcodeBrnz
 	i.v = v
 	i.vs = args
-	i.blk = target
+	i.rValue = Value(target.ID())
 	return i
 }
 
 // AsBrTable initializes this instruction as a branch-table instruction with OpcodeBrTable.
+// targets is a list of basic block IDs cast to Values.
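The vs.go hunks later in this diff give Value its new layout: the ValueID in the low 32 bits, the producing instruction's ID in bits 32–59, and the Type in the top 4 bits. A worked sketch of that packing arithmetic; the constants and `pack` are illustrative, not the unexported ssa helpers. 28 bits cap instruction IDs at 2^28−1 (about 268M), and because a fresh Value leaves those bits zero, InstructionOfValue above treats instrID <= 0 as "no producer" and indexes the pool at instrID−1:

```go
package main

import "fmt"

// Toy version of the packing used by ssa.Value: low 32 bits carry the value
// ID, bits 32..59 the producing instruction's ID, bits 60..63 the type.
const (
	instrShift = 32
	instrMask  = 0x0fffffff
	typeShift  = 60
)

func pack(valueID uint32, instrID int, typ uint64) uint64 {
	return uint64(valueID) | uint64(instrID)<<instrShift | typ<<typeShift
}

func main() {
	v := pack(123, 45, 3)
	fmt.Println(uint32(v))                      // 123 (ValueID)
	fmt.Println(int(v>>instrShift) & instrMask) // 45  (instruction ID)
	fmt.Println(v >> typeShift)                 // 3   (Type)
}
```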
+func (i *Instruction) AsBrTable(index Value, targets Values) { i.opcode = OpcodeBrTable i.v = index - i.targets = targets + i.rValues = targets } // AsCall initializes this instruction as a call instruction with OpcodeCall. @@ -2531,7 +2537,8 @@ func (i *Instruction) Format(b Builder) string { if i.IsFallthroughJump() { vs[0] = " fallthrough" } else { - vs[0] = " " + i.blk.(*basicBlock).Name() + blockId := BasicBlockID(i.rValue) + vs[0] = " " + b.BasicBlock(blockId).Name() } for idx := range view { vs[idx+1] = view[idx].Format(b) @@ -2542,7 +2549,8 @@ func (i *Instruction) Format(b Builder) string { view := i.vs.View() vs := make([]string, len(view)+2) vs[0] = " " + i.v.Format(b) - vs[1] = i.blk.(*basicBlock).Name() + blockId := BasicBlockID(i.rValue) + vs[1] = b.BasicBlock(blockId).Name() for idx := range view { vs[idx+2] = view[idx].Format(b) } @@ -2551,8 +2559,8 @@ func (i *Instruction) Format(b Builder) string { // `BrTable index, [label1, label2, ... labelN]` instSuffix = fmt.Sprintf(" %s", i.v.Format(b)) instSuffix += ", [" - for i, target := range i.targets { - blk := target.(*basicBlock) + for i, target := range i.rValues.View() { + blk := b.BasicBlock(BasicBlockID(target)) if i == 0 { instSuffix += blk.Name() } else { @@ -2621,11 +2629,12 @@ func (i *Instruction) Format(b Builder) string { instr := i.opcode.String() + instSuffix var rvs []string - if rv := i.rValue; rv.Valid() { - rvs = append(rvs, rv.formatWithType(b)) + r1, rs := i.Returns() + if r1.Valid() { + rvs = append(rvs, r1.formatWithType(b)) } - for _, v := range i.rValues.View() { + for _, v := range rs { rvs = append(rvs, v.formatWithType(b)) } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go index a2e986cd15..b9763791dd 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go @@ -22,9 +22,9 @@ func (b *builder) RunPasses() { func (b *builder) runPreBlockLayoutPasses() { passSortSuccessors(b) passDeadBlockEliminationOpt(b) - passRedundantPhiEliminationOpt(b) // The result of passCalculateImmediateDominators will be used by various passes below. passCalculateImmediateDominators(b) + passRedundantPhiEliminationOpt(b) passNopInstElimination(b) // TODO: implement either conversion of irreducible CFG into reducible one, or irreducible CFG detection where we panic. @@ -78,12 +78,11 @@ func (b *builder) runFinalizingPasses() { // passDeadBlockEliminationOpt searches the unreachable blocks, and sets the basicBlock.invalid flag true if so. func passDeadBlockEliminationOpt(b *builder) { entryBlk := b.entryBlk() - b.clearBlkVisited() b.blkStack = append(b.blkStack, entryBlk) for len(b.blkStack) > 0 { reachableBlk := b.blkStack[len(b.blkStack)-1] b.blkStack = b.blkStack[:len(b.blkStack)-1] - b.blkVisited[reachableBlk] = 0 // the value won't be used in this pass. 
+ reachableBlk.visited = 1 if !reachableBlk.sealed && !reachableBlk.ReturnBlock() { panic(fmt.Sprintf("%s is not sealed", reachableBlk)) @@ -94,7 +93,7 @@ func passDeadBlockEliminationOpt(b *builder) { } for _, succ := range reachableBlk.success { - if _, ok := b.blkVisited[succ]; ok { + if succ.visited == 1 { continue } b.blkStack = append(b.blkStack, succ) @@ -102,15 +101,18 @@ func passDeadBlockEliminationOpt(b *builder) { } for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { - if _, ok := b.blkVisited[blk]; !ok { + if blk.visited != 1 { blk.invalid = true } + blk.visited = 0 } } // passRedundantPhiEliminationOpt eliminates the redundant PHIs (in our terminology, parameters of a block). +// This requires the reverse post-order traversal to be calculated before calling this function, +// hence passCalculateImmediateDominators must be called before this. func passRedundantPhiEliminationOpt(b *builder) { - redundantParameterIndexes := b.ints[:0] // reuse the slice from previous iterations. + redundantParams := b.redundantParams[:0] // reuse the slice from previous iterations. // TODO: this might be costly for large programs, but at least, as far as I did the experiment, it's almost the // same as the single iteration version in terms of the overall compilation time. That *might be* mostly thanks to the fact @@ -118,15 +120,19 @@ func passRedundantPhiEliminationOpt(b *builder) { // relatively small. For example, sqlite speedtest binary results in the large number of redundant PHIs, // the maximum number of iteration was 22, which seems to be acceptable but not that small either since the // complexity here is O(BlockNum * Iterations) at the worst case where BlockNum might be the order of thousands. + // -- Note -- + // Currently, each iteration can run in any order of blocks, but it empirically converges quickly in practice when + // running on the reverse post-order. It might be possible to optimize this further by using the dominator tree. for { changed := false - _ = b.blockIteratorBegin() // skip entry block! + _ = b.blockIteratorReversePostOrderBegin() // skip entry block! // Below, we intentionally use the named iteration variable name, as this comes with inevitable nested for loops! - for blk := b.blockIteratorNext(); blk != nil; blk = b.blockIteratorNext() { - paramNum := len(blk.params) + for blk := b.blockIteratorReversePostOrderNext(); blk != nil; blk = b.blockIteratorReversePostOrderNext() { + params := blk.params.View() + paramNum := len(params) for paramIndex := 0; paramIndex < paramNum; paramIndex++ { - phiValue := blk.params[paramIndex].value + phiValue := params[paramIndex] redundant := true nonSelfReferencingValue := ValueInvalid @@ -157,55 +163,58 @@ func passRedundantPhiEliminationOpt(b *builder) { } if redundant { - b.redundantParameterIndexToValue[paramIndex] = nonSelfReferencingValue - redundantParameterIndexes = append(redundantParameterIndexes, paramIndex) + redundantParams = append(redundantParams, redundantParam{ + index: paramIndex, uniqueValue: nonSelfReferencingValue, + }) } } - if len(b.redundantParameterIndexToValue) == 0 { + if len(redundantParams) == 0 { continue } changed = true // Remove the redundant PHIs from the argument list of branching instructions. 
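The passes above drop the blkVisited map for a visited flag stored on the block itself, cleared at the end of each pass. A self-contained sketch of the same iterative, map-free DFS (`node` is a stand-in for basicBlock):

```go
package main

import "fmt"

// node carries its own visited flag, replacing the old blkVisited map; the
// flag is reset after the sweep so later passes can reuse it, as the pass
// above does with basicBlock.visited.
type node struct {
	id      int
	succs   []*node
	visited int32
}

// markReachable is an iterative DFS like passDeadBlockEliminationOpt's loop:
// pop a block, mark it, and push any successor not yet marked.
func markReachable(entry *node) {
	stack := []*node{entry}
	for len(stack) > 0 {
		n := stack[len(stack)-1]
		stack = stack[:len(stack)-1]
		n.visited = 1
		for _, s := range n.succs {
			if s.visited == 1 {
				continue
			}
			stack = append(stack, s)
		}
	}
}

func main() {
	a, b, c := &node{id: 0}, &node{id: 1}, &node{id: 2}
	a.succs = []*node{b}
	_ = c // c is unreachable and would be marked invalid
	markReachable(a)
	fmt.Println(a.visited, b.visited, c.visited) // 1 1 0
}
```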
 		for predIndex := range blk.preds {
-			var cur int
+			redundantParamsCur, predParamCur := 0, 0
 			predBlk := blk.preds[predIndex]
 			branchInst := predBlk.branch
 			view := branchInst.vs.View()
 			for argIndex, value := range view {
-				if _, ok := b.redundantParameterIndexToValue[argIndex]; !ok {
-					view[cur] = value
-					cur++
+				if len(redundantParams) == redundantParamsCur ||
+					redundantParams[redundantParamsCur].index != argIndex {
+					view[predParamCur] = value
+					predParamCur++
+				} else {
+					redundantParamsCur++
 				}
 			}
-			branchInst.vs.Cut(cur)
+			branchInst.vs.Cut(predParamCur)
 		}
 
 		// Still need to have the definition of the value of the PHI (previously as the parameter).
-		for _, redundantParamIndex := range redundantParameterIndexes {
-			phiValue := blk.params[redundantParamIndex].value
-			onlyValue := b.redundantParameterIndexToValue[redundantParamIndex]
+		for i := range redundantParams {
+			redundantValue := &redundantParams[i]
+			phiValue := params[redundantValue.index]
 			// Create an alias in this block from the only phi argument to the phi value.
-			b.alias(phiValue, onlyValue)
+			b.alias(phiValue, redundantValue.uniqueValue)
 		}
 
 		// Finally, remove the param from the blk.
-		var cur int
+		paramsCur, redundantParamsCur := 0, 0
 		for paramIndex := 0; paramIndex < paramNum; paramIndex++ {
-			param := blk.params[paramIndex]
-			if _, ok := b.redundantParameterIndexToValue[paramIndex]; !ok {
-				blk.params[cur] = param
-				cur++
+			param := params[paramIndex]
+			if len(redundantParams) == redundantParamsCur || redundantParams[redundantParamsCur].index != paramIndex {
+				params[paramsCur] = param
+				paramsCur++
+			} else {
+				redundantParamsCur++
 			}
 		}
-		blk.params = blk.params[:cur]
+		blk.params.Cut(paramsCur)
 
 		// Clears the list of redundant params for the next iteration.
-		for _, paramIndex := range redundantParameterIndexes {
-			delete(b.redundantParameterIndexToValue, paramIndex)
-		}
-		redundantParameterIndexes = redundantParameterIndexes[:0]
+		redundantParams = redundantParams[:0]
 		}
 
 		if !changed {
@@ -214,7 +223,7 @@ func passRedundantPhiEliminationOpt(b *builder) {
 	}
 
 	// Reuse the slice for the future passes.
-	b.ints = redundantParameterIndexes
+	b.redundantParams = redundantParams
 }
 
 // passDeadCodeEliminationOpt traverses all the instructions, and calculates the reference count of each Value, and
@@ -226,11 +235,13 @@ func passRedundantPhiEliminationOpt(b *builder) {
 // TODO: the algorithm here might not be efficient. Get back to this later.
 func passDeadCodeEliminationOpt(b *builder) {
 	nvid := int(b.nextValueID)
-	if nvid >= len(b.valueRefCounts) {
-		b.valueRefCounts = append(b.valueRefCounts, make([]int, b.nextValueID)...)
-	}
-	if nvid >= len(b.valueIDToInstruction) {
-		b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, b.nextValueID)...)
+	if nvid >= len(b.valuesInfo) {
+		l := nvid - len(b.valuesInfo) + 1
+		b.valuesInfo = append(b.valuesInfo, make([]ValueInfo, l)...)
+		view := b.valuesInfo[len(b.valuesInfo)-l:]
+		for i := range view {
+			view[i].alias = ValueInvalid
+		}
 	}
 
 	// First, we gather all the instructions with side effects.
@@ -250,14 +261,6 @@ func passDeadCodeEliminationOpt(b *builder) {
 			// The strict side effect should create different instruction groups.
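The rewritten loops above remove map lookups by walking the branch arguments and the sorted redundantParams list with two cursors. The same two-cursor compaction in isolation (illustrative names, not the pass's identifiers):

```go
package main

import "fmt"

// compact removes the elements of vals whose indices appear in skip (sorted
// ascending), in place and in one pass; this is the two-cursor walk the pass
// above now uses instead of a map of redundant indices.
func compact(vals []int, skip []int) []int {
	out, s := 0, 0
	for i, v := range vals {
		if s < len(skip) && skip[s] == i {
			s++ // this index is redundant: drop it
			continue
		}
		vals[out] = v
		out++
	}
	return vals[:out]
}

func main() {
	fmt.Println(compact([]int{10, 11, 12, 13}, []int{1, 3})) // [10 12]
}
```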
gid++ } - - r1, rs := cur.Returns() - if r1.Valid() { - b.valueIDToInstruction[r1.ID()] = cur - } - for _, r := range rs { - b.valueIDToInstruction[r.ID()] = cur - } } } @@ -278,28 +281,28 @@ func passDeadCodeEliminationOpt(b *builder) { v1, v2, v3, vs := live.Args() if v1.Valid() { - producingInst := b.valueIDToInstruction[v1.ID()] + producingInst := b.InstructionOfValue(v1) if producingInst != nil { liveInstructions = append(liveInstructions, producingInst) } } if v2.Valid() { - producingInst := b.valueIDToInstruction[v2.ID()] + producingInst := b.InstructionOfValue(v2) if producingInst != nil { liveInstructions = append(liveInstructions, producingInst) } } if v3.Valid() { - producingInst := b.valueIDToInstruction[v3.ID()] + producingInst := b.InstructionOfValue(v3) if producingInst != nil { liveInstructions = append(liveInstructions, producingInst) } } for _, v := range vs { - producingInst := b.valueIDToInstruction[v.ID()] + producingInst := b.InstructionOfValue(v) if producingInst != nil { liveInstructions = append(liveInstructions, producingInst) } @@ -347,46 +350,19 @@ func (b *builder) incRefCount(id ValueID, from *Instruction) { if wazevoapi.SSALoggingEnabled { fmt.Printf("v%d referenced from %v\n", id, from.Format(b)) } - b.valueRefCounts[id]++ -} - -// clearBlkVisited clears the b.blkVisited map so that we can reuse it for multiple places. -func (b *builder) clearBlkVisited() { - b.blkStack2 = b.blkStack2[:0] - for key := range b.blkVisited { - b.blkStack2 = append(b.blkStack2, key) - } - for _, blk := range b.blkStack2 { - delete(b.blkVisited, blk) - } - b.blkStack2 = b.blkStack2[:0] + info := &b.valuesInfo[id] + info.RefCount++ } // passNopInstElimination eliminates the instructions which is essentially a no-op. func passNopInstElimination(b *builder) { - if int(b.nextValueID) >= len(b.valueIDToInstruction) { - b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, b.nextValueID)...) - } - - for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { - for cur := blk.rootInstr; cur != nil; cur = cur.next { - r1, rs := cur.Returns() - if r1.Valid() { - b.valueIDToInstruction[r1.ID()] = cur - } - for _, r := range rs { - b.valueIDToInstruction[r.ID()] = cur - } - } - } - for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { for cur := blk.rootInstr; cur != nil; cur = cur.next { switch cur.Opcode() { // TODO: add more logics here. case OpcodeIshl, OpcodeSshr, OpcodeUshr: x, amount := cur.Arg2() - definingInst := b.valueIDToInstruction[amount.ID()] + definingInst := b.InstructionOfValue(amount) if definingInst == nil { // If there's no defining instruction, that means the amount is coming from the parameter. continue diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go index 9068180a0b..0118e8b2e5 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go @@ -23,8 +23,6 @@ import ( // // This heuristic is done in maybeInvertBranches function. func passLayoutBlocks(b *builder) { - b.clearBlkVisited() - // We might end up splitting critical edges which adds more basic blocks, // so we store the currently existing basic blocks in nonSplitBlocks temporarily. 
// That way we can iterate over the original basic blocks while appending new ones into reversePostOrderedBasicBlocks. @@ -35,7 +33,7 @@ func passLayoutBlocks(b *builder) { } nonSplitBlocks = append(nonSplitBlocks, blk) if i != len(b.reversePostOrderedBasicBlocks)-1 { - _ = maybeInvertBranches(blk, b.reversePostOrderedBasicBlocks[i+1]) + _ = maybeInvertBranches(b, blk, b.reversePostOrderedBasicBlocks[i+1]) } } @@ -47,20 +45,20 @@ func passLayoutBlocks(b *builder) { for _, blk := range nonSplitBlocks { for i := range blk.preds { pred := blk.preds[i].blk - if _, ok := b.blkVisited[pred]; ok || !pred.Valid() { + if pred.visited == 1 || !pred.Valid() { continue } else if pred.reversePostOrder < blk.reversePostOrder { // This means the edge is critical, and this pred is the trampoline and yet to be inserted. // Split edge trampolines must come before the destination in reverse post-order. b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, pred) - b.blkVisited[pred] = 0 // mark as inserted, the value is not used. + pred.visited = 1 // mark as inserted. } } // Now that we've already added all the potential trampoline blocks incoming to this block, // we can add this block itself. b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, blk) - b.blkVisited[blk] = 0 // mark as inserted, the value is not used. + blk.visited = 1 // mark as inserted. if len(blk.success) < 2 { // There won't be critical edge originating from this block. @@ -113,10 +111,10 @@ func passLayoutBlocks(b *builder) { } fallthroughBranch := blk.currentInstr - if fallthroughBranch.opcode == OpcodeJump && fallthroughBranch.blk == trampoline { + if fallthroughBranch.opcode == OpcodeJump && BasicBlockID(fallthroughBranch.rValue) == trampoline.id { // This can be lowered as fallthrough at the end of the block. b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline) - b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used. + trampoline.visited = 1 // mark as inserted. } else { uninsertedTrampolines = append(uninsertedTrampolines, trampoline) } @@ -126,7 +124,7 @@ func passLayoutBlocks(b *builder) { if trampoline.success[0].reversePostOrder <= trampoline.reversePostOrder { // "<=", not "<" because the target might be itself. // This means the critical edge was backward, so we insert after the current block immediately. b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline) - b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used. + trampoline.visited = 1 // mark as inserted. } // If the target is forward, we can wait to insert until the target is inserted. } uninsertedTrampolines = uninsertedTrampolines[:0] // Reuse the stack for the next block. 
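For context on the trampolines being inserted above: an edge is critical when its source has multiple successors and its destination multiple predecessors, and only such edges are split so the register allocator has a block in which to place edge-specific moves. A minimal sketch of the test (`blk` is a stand-in; the real pass works on basicBlock.preds/success):

```go
package main

import "fmt"

type blk struct {
	preds, succs []*blk
}

// isCriticalEdge reports whether pred->succ needs a trampoline block:
// neither endpoint alone can host code that must run only on this edge.
func isCriticalEdge(pred, succ *blk) bool {
	return len(pred.succs) > 1 && len(succ.preds) > 1
}

func main() {
	a, b, c, d := &blk{}, &blk{}, &blk{}, &blk{}
	// a branches to b and c; both a and d reach c.
	a.succs = []*blk{b, c}
	b.preds = []*blk{a}
	c.preds = []*blk{a, d}
	d.succs = []*blk{c}
	fmt.Println(isCriticalEdge(a, b)) // false: b has a single predecessor
	fmt.Println(isCriticalEdge(a, c)) // true: needs a trampoline
}
```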
@@ -142,8 +140,8 @@ func passLayoutBlocks(b *builder) { if wazevoapi.SSAValidationEnabled { for _, trampoline := range trampolines { - if _, ok := b.blkVisited[trampoline]; !ok { - panic("BUG: trampoline block not inserted: " + trampoline.FormatHeader(b)) + if trampoline.visited != 1 { + panic("BUG: trampoline block not inserted: " + trampoline.formatHeader(b)) } trampoline.validate(b) } @@ -159,7 +157,7 @@ func (b *builder) markFallthroughJumps() { for i, blk := range b.reversePostOrderedBasicBlocks { if i < l { cur := blk.currentInstr - if cur.opcode == OpcodeJump && cur.blk == b.reversePostOrderedBasicBlocks[i+1] { + if cur.opcode == OpcodeJump && BasicBlockID(cur.rValue) == b.reversePostOrderedBasicBlocks[i+1].id { cur.AsFallthroughJump() } } @@ -170,7 +168,7 @@ func (b *builder) markFallthroughJumps() { // nextInRPO is the next block in the reverse post-order. // // Returns true if the branch is inverted for testing purpose. -func maybeInvertBranches(now *basicBlock, nextInRPO *basicBlock) bool { +func maybeInvertBranches(b *builder, now *basicBlock, nextInRPO *basicBlock) bool { fallthroughBranch := now.currentInstr if fallthroughBranch.opcode == OpcodeBrTable { return false @@ -189,7 +187,8 @@ func maybeInvertBranches(now *basicBlock, nextInRPO *basicBlock) bool { // So this block has two branches (a conditional branch followed by an unconditional branch) at the end. // We can invert the condition of the branch if it makes the fallthrough more likely. - fallthroughTarget, condTarget := fallthroughBranch.blk.(*basicBlock), condBranch.blk.(*basicBlock) + fallthroughTarget := b.basicBlock(BasicBlockID(fallthroughBranch.rValue)) + condTarget := b.basicBlock(BasicBlockID(condBranch.rValue)) if fallthroughTarget.loopHeader { // First, if the tail's target is loopHeader, we don't need to do anything here, @@ -233,8 +232,8 @@ invert: } condBranch.InvertBrx() - condBranch.blk = fallthroughTarget - fallthroughBranch.blk = condTarget + condBranch.rValue = Value(fallthroughTarget.ID()) + fallthroughBranch.rValue = Value(condTarget.ID()) if wazevoapi.SSALoggingEnabled { fmt.Printf("inverting branches at %d->%d and %d->%d\n", now.ID(), fallthroughTarget.ID(), now.ID(), condTarget.ID()) @@ -277,7 +276,7 @@ func (b *builder) splitCriticalEdge(pred, succ *basicBlock, predInfo *basicBlock // Replace originalBranch with the newBranch. newBranch := b.AllocateInstruction() newBranch.opcode = originalBranch.opcode - newBranch.blk = trampoline + newBranch.rValue = Value(trampoline.ID()) switch originalBranch.opcode { case OpcodeJump: case OpcodeBrz, OpcodeBrnz: @@ -305,7 +304,7 @@ func (b *builder) splitCriticalEdge(pred, succ *basicBlock, predInfo *basicBlock trampoline.validate(b) } - if len(trampoline.params) > 0 { + if len(trampoline.params.View()) > 0 { panic("trampoline should not have params") } diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go index 50cb9c4750..e8288c4bd3 100644 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go @@ -15,10 +15,6 @@ import ( // At the last of pass, this function also does the loop detection and sets the basicBlock.loop flag. 
func passCalculateImmediateDominators(b *builder) { reversePostOrder := b.reversePostOrderedBasicBlocks[:0] - exploreStack := b.blkStack[:0] - b.clearBlkVisited() - - entryBlk := b.entryBlk() // Store the reverse postorder from the entrypoint into reversePostOrder slice. // This calculation of reverse postorder is not described in the paper, @@ -28,14 +24,17 @@ func passCalculateImmediateDominators(b *builder) { // which is a reasonable assumption as long as SSA Builder is properly used. // // First we push blocks in postorder iteratively visit successors of the entry block. - exploreStack = append(exploreStack, entryBlk) + entryBlk := b.entryBlk() + exploreStack := append(b.blkStack[:0], entryBlk) + // These flags are used to track the state of the block in the DFS traversal. + // We temporarily use the reversePostOrder field to store the state. const visitStateUnseen, visitStateSeen, visitStateDone = 0, 1, 2 - b.blkVisited[entryBlk] = visitStateSeen + entryBlk.visited = visitStateSeen for len(exploreStack) > 0 { tail := len(exploreStack) - 1 blk := exploreStack[tail] exploreStack = exploreStack[:tail] - switch b.blkVisited[blk] { + switch blk.visited { case visitStateUnseen: // This is likely a bug in the frontend. panic("BUG: unsupported CFG") @@ -48,16 +47,18 @@ func passCalculateImmediateDominators(b *builder) { if succ.ReturnBlock() || succ.invalid { continue } - if b.blkVisited[succ] == visitStateUnseen { - b.blkVisited[succ] = visitStateSeen + if succ.visited == visitStateUnseen { + succ.visited = visitStateSeen exploreStack = append(exploreStack, succ) } } // Finally, we could pop this block once we pop all of its successors. - b.blkVisited[blk] = visitStateDone + blk.visited = visitStateDone case visitStateDone: // Note: at this point we push blk in postorder despite its name. reversePostOrder = append(reversePostOrder, blk) + default: + panic("BUG") } } // At this point, reversePostOrder has postorder actually, so we reverse it. @@ -67,7 +68,7 @@ func passCalculateImmediateDominators(b *builder) { } for i, blk := range reversePostOrder { - blk.reversePostOrder = i + blk.reversePostOrder = int32(i) } // Reuse the dominators slice if possible from the previous computation of function. @@ -180,7 +181,7 @@ func passBuildLoopNestingForest(b *builder) { b.loopNestingForestRoots = append(b.loopNestingForestRoots, blk) } else if n == ent { } else if n.loopHeader { - n.loopNestingForestChildren = append(n.loopNestingForestChildren, blk) + n.loopNestingForestChildren = n.loopNestingForestChildren.Append(&b.varLengthBasicBlockPool, blk) } } @@ -193,7 +194,7 @@ func passBuildLoopNestingForest(b *builder) { func printLoopNestingForest(root *basicBlock, depth int) { fmt.Println(strings.Repeat("\t", depth), "loop nesting forest root:", root.ID()) - for _, child := range root.loopNestingForestChildren { + for _, child := range root.loopNestingForestChildren.View() { fmt.Println(strings.Repeat("\t", depth+1), "child:", child.ID()) if child.LoopHeader() { printLoopNestingForest(child.(*basicBlock), depth+2) @@ -202,10 +203,10 @@ func printLoopNestingForest(root *basicBlock, depth int) { } type dominatorSparseTree struct { - time int + time int32 euler []*basicBlock - first, depth []int - table [][]int + first, depth []int32 + table [][]int32 } // passBuildDominatorTree builds the dominator tree for the function, and constructs builder.sparseTree. 
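passBuildDominatorTree and dominatorSparseTree implement LCA queries via an Euler tour plus sparse-table RMQ, now with int32 indices. A compact sketch of the underlying idea — this version answers the range-minimum query with a linear scan instead of the O(1) table, which is enough to show how first/euler/depth fit together:

```go
package main

import "fmt"

// Minimal Euler-tour LCA: walk the tree recording (node, depth) at every
// visit; the LCA of two nodes is the shallowest entry between their first
// occurrences in the tour.
type tnode struct {
	id       int32
	children []*tnode
}

type eulerTour struct {
	euler []*tnode
	depth []int32
	first map[int32]int32
}

func (t *eulerTour) tour(n *tnode, h int32) {
	if _, ok := t.first[n.id]; !ok {
		t.first[n.id] = int32(len(t.euler))
	}
	t.euler = append(t.euler, n)
	t.depth = append(t.depth, h)
	for _, c := range n.children {
		t.tour(c, h+1)
		t.euler = append(t.euler, n) // record the node again on the way back up
		t.depth = append(t.depth, h)
	}
}

func (t *eulerTour) lca(a, b int32) *tnode {
	l, r := t.first[a], t.first[b]
	if l > r {
		l, r = r, l
	}
	best := l
	for i := l + 1; i <= r; i++ { // the real code replaces this scan with a sparse table
		if t.depth[i] < t.depth[best] {
			best = i
		}
	}
	return t.euler[best]
}

func main() {
	n3, n4 := &tnode{id: 3}, &tnode{id: 4}
	n1 := &tnode{id: 1, children: []*tnode{n3, n4}}
	n2 := &tnode{id: 2}
	root := &tnode{id: 0, children: []*tnode{n1, n2}}
	t := &eulerTour{first: map[int32]int32{}}
	t.tour(root, 0)
	fmt.Println(t.lca(3, 4).id) // 1
	fmt.Println(t.lca(4, 2).id) // 0
}
```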
@@ -232,11 +233,11 @@ func passBuildDominatorTree(b *builder) { n := b.basicBlocksPool.Allocated() st := &b.sparseTree st.euler = append(st.euler[:0], make([]*basicBlock, 2*n-1)...) - st.first = append(st.first[:0], make([]int, n)...) + st.first = append(st.first[:0], make([]int32, n)...) for i := range st.first { st.first[i] = -1 } - st.depth = append(st.depth[:0], make([]int, 2*n-1)...) + st.depth = append(st.depth[:0], make([]int32, 2*n-1)...) st.time = 0 // Start building the sparse tree. @@ -244,9 +245,9 @@ func passBuildDominatorTree(b *builder) { st.buildSparseTable() } -func (dt *dominatorSparseTree) eulerTour(node *basicBlock, height int) { +func (dt *dominatorSparseTree) eulerTour(node *basicBlock, height int32) { if wazevoapi.SSALoggingEnabled { - fmt.Println(strings.Repeat("\t", height), "euler tour:", node.ID()) + fmt.Println(strings.Repeat("\t", int(height)), "euler tour:", node.ID()) } dt.euler[dt.time] = node dt.depth[dt.time] = height @@ -270,13 +271,13 @@ func (dt *dominatorSparseTree) buildSparseTable() { table := dt.table if n >= len(table) { - table = append(table, make([][]int, n+1)...) + table = append(table, make([][]int32, n-len(table)+1)...) } for i := range table { if len(table[i]) < k { - table[i] = append(table[i], make([]int, k)...) + table[i] = append(table[i], make([]int32, k-len(table[i]))...) } - table[i][0] = i + table[i][0] = int32(i) } for j := 1; 1<= 1<<28 { + panic(fmt.Sprintf("Too large variable: %d", v)) + } + return Variable(typ)<<28 | v +} + +func (v Variable) getType() Type { + return Type(v >> 28) } // Value represents an SSA value with a type information. The relationship with Variable is 1: N (including 0), // that means there might be multiple Variable(s) for a Value. // -// Higher 32-bit is used to store Type for this value. +// 32 to 59-bit is used to store the unique identifier of the Instruction that generates this value if any. +// 60 to 63-bit is used to store Type for this value. type Value uint64 // ValueID is the lower 32bit of Value, which is the pure identifier of Value without type info. @@ -33,7 +47,7 @@ type ValueID uint32 const ( valueIDInvalid ValueID = math.MaxUint32 - ValueInvalid Value = Value(valueIDInvalid) + ValueInvalid = Value(valueIDInvalid) ) // Format creates a debug string for this Value using the data stored in Builder. @@ -54,7 +68,7 @@ func (v Value) formatWithType(b Builder) (ret string) { if wazevoapi.SSALoggingEnabled { // This is useful to check live value analysis bugs. if bd := b.(*builder); bd.donePostBlockLayoutPasses { id := v.ID() - ret += fmt.Sprintf("(ref=%d)", bd.valueRefCounts[id]) + ret += fmt.Sprintf("(ref=%d)", bd.valuesInfo[id].RefCount) } } return ret @@ -67,7 +81,7 @@ func (v Value) Valid() bool { // Type returns the Type of this value. func (v Value) Type() Type { - return Type(v >> 32) + return Type(v >> 60) } // ID returns the valueID of this value. @@ -77,7 +91,20 @@ func (v Value) ID() ValueID { // setType sets a type to this Value and returns the updated Value. func (v Value) setType(typ Type) Value { - return v | Value(typ)<<32 + return v | Value(typ)<<60 +} + +// setInstructionID sets an Instruction.id to this Value and returns the updated Value. +func (v Value) setInstructionID(id int) Value { + if id < 0 || uint(id) >= 1<<28 { + panic(fmt.Sprintf("Too large instruction ID: %d", id)) + } + return v | Value(id)<<32 +} + +// instructionID() returns the Instruction.id of this Value. +func (v Value) instructionID() int { + return int(v>>32) & 0x0fffffff } // Values is a slice of Value. 
 // Use this instead of []Value to reuse the underlying memory.
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go
index 3149fdc9e1..313e34f9ae 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go
@@ -69,7 +69,7 @@ type IDedPool[T any] struct {
 
 // NewIDedPool returns a new IDedPool.
 func NewIDedPool[T any](resetFn func(*T)) IDedPool[T] {
-	return IDedPool[T]{pool: NewPool[T](resetFn)}
+	return IDedPool[T]{pool: NewPool[T](resetFn), maxIDEncountered: -1}
 }
 
 // GetOrAllocate returns the T with the given id.
@@ -97,7 +97,7 @@ func (p *IDedPool[T]) Get(id int) *T {
 // Reset resets the pool.
 func (p *IDedPool[T]) Reset() {
 	p.pool.Reset()
-	for i := range p.idToItems {
+	for i := 0; i <= p.maxIDEncountered; i++ {
 		p.idToItems[i] = nil
 	}
 	p.maxIDEncountered = -1
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go
index 7177fbb4bf..3fc7aa143d 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go
@@ -5,9 +5,7 @@ func ResetMap[K comparable, V any](m map[K]V) map[K]V {
 	if m == nil {
 		m = make(map[K]V)
 	} else {
-		for v := range m {
-			delete(m, v)
-		}
+		clear(m)
 	}
 	return m
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/expctxkeys/importresolver.go b/vendor/github.com/tetratelabs/wazero/internal/expctxkeys/importresolver.go
new file mode 100644
index 0000000000..af52cc80eb
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/expctxkeys/importresolver.go
@@ -0,0 +1,6 @@
+package expctxkeys
+
+// ImportResolverKey is a context.Context Value key.
+// Its associated value should be an ImportResolver.
+// See issue 2294.
+type ImportResolverKey struct{}
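(Aside, not part of the patch: this new context key is what the store consults in resolveImports, in the store.go hunk later in this diff, before falling back to named-module lookup. A hedged sketch of how a caller could use it; the resolver type and the public helper for installing it live in wazero's experimental package, and the helper name experimental.WithImportResolver plus the "env" module name are assumptions here, since the internal expctxkeys package is not importable by users:

package main

import (
	"context"

	"github.com/tetratelabs/wazero"
	"github.com/tetratelabs/wazero/api"
	"github.com/tetratelabs/wazero/experimental"
)

// reuseInstance returns an already-instantiated module for a given import
// module name, letting a compiled module satisfy its imports from an
// instance that was never registered under a name in the runtime's store.
func reuseInstance(shared api.Module) experimental.ImportResolver {
	return func(name string) api.Module {
		if name == "env" { // "env" is an illustrative import module name
			return shared
		}
		return nil // nil falls back to the store's named-module lookup
	}
}

func instantiate(ctx context.Context, r wazero.Runtime, shared api.Module, bin []byte) (api.Module, error) {
	// WithImportResolver is assumed to stash the resolver under
	// expctxkeys.ImportResolverKey{}, which resolveImports checks first.
	ctx = experimental.WithImportResolver(ctx, reuseInstance(shared))
	return r.Instantiate(ctx, bin)
}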
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid.go b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid.go
index 25d7d3fdca..0dc6ec19ce 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid.go
@@ -6,6 +6,9 @@ type CpuFeatureFlags interface {
 	Has(cpuFeature CpuFeature) bool
 	// HasExtra returns true when the specified extraFlag (represented as uint64) is supported
 	HasExtra(cpuFeature CpuFeature) bool
+	// Raw returns the raw bitset that represents CPU features used by wazero. This can be used for cache keying.
+	// For now, we only use four features, so uint64 is enough.
+	Raw() uint64
 }
 
 type CpuFeature uint64
@@ -17,9 +20,11 @@ const (
 	CpuFeatureAmd64SSE4_1 CpuFeature = 1 << 19
 	// CpuFeatureAmd64SSE4_2 is the flag to query CpuFeatureFlags.Has for SSEv4.2 capabilities on amd64
 	CpuFeatureAmd64SSE4_2 CpuFeature = 1 << 20
+	// Note: when adding new features, ensure that the feature is included in CpuFeatureFlags.Raw.
 )
 
 const (
 	// CpuExtraFeatureAmd64ABM is the flag to query CpuFeatureFlags.HasExtra for Advanced Bit Manipulation capabilities (e.g. LZCNT) on amd64
 	CpuExtraFeatureAmd64ABM CpuFeature = 1 << 5
+	// Note: when adding new features, ensure that the feature is included in CpuFeatureFlags.Raw.
 )
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_amd64.go
index 8c9f1a9f34..fbdb539366 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_amd64.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_amd64.go
@@ -2,10 +2,10 @@
 
 package platform
 
-// CpuFeatures exposes the capabilities for this CPU, queried via the Has, HasExtra methods
-var CpuFeatures CpuFeatureFlags = loadCpuFeatureFlags()
+// CpuFeatures exposes the capabilities for this CPU, queried via the Has, HasExtra methods.
+var CpuFeatures = loadCpuFeatureFlags()
 
-// cpuFeatureFlags implements CpuFeatureFlags interface
+// cpuFeatureFlags implements CpuFeatureFlags interface.
 type cpuFeatureFlags struct {
 	flags      uint64
 	extraFlags uint64
@@ -15,13 +15,13 @@ type cpuFeatureFlags struct {
 // implemented in impl_amd64.s
 func cpuid(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32)
 
-// cpuidAsBitmap combines the result of invoking cpuid to uint64 bitmap
+// cpuidAsBitmap combines the result of invoking cpuid to uint64 bitmap.
 func cpuidAsBitmap(arg1, arg2 uint32) uint64 {
 	_ /* eax */, _ /* ebx */, ecx, edx := cpuid(arg1, arg2)
 	return (uint64(edx) << 32) | uint64(ecx)
 }
 
-// loadStandardRange load flags from the standard range, panics otherwise
+// loadStandardRange load flags from the standard range, panics otherwise.
 func loadStandardRange(id uint32) uint64 {
 	// ensure that the id is in the valid range, returned by cpuid(0,0)
 	maxRange, _, _, _ := cpuid(0, 0)
@@ -31,7 +31,7 @@ func loadStandardRange(id uint32) uint64 {
 	return cpuidAsBitmap(id, 0)
 }
 
-// loadStandardRange load flags from the extended range, panics otherwise
+// loadStandardRange load flags from the extended range, panics otherwise.
 func loadExtendedRange(id uint32) uint64 {
 	// ensure that the id is in the valid range, returned by cpuid(0x80000000,0)
 	maxRange, _, _, _ := cpuid(0x80000000, 0)
@@ -48,12 +48,32 @@ func loadCpuFeatureFlags() CpuFeatureFlags {
 	}
 }
 
-// Has implements the same method on the CpuFeatureFlags interface
+// Has implements the same method on the CpuFeatureFlags interface.
 func (f *cpuFeatureFlags) Has(cpuFeature CpuFeature) bool {
 	return (f.flags & uint64(cpuFeature)) != 0
 }
 
-// HasExtra implements the same method on the CpuFeatureFlags interface
+// HasExtra implements the same method on the CpuFeatureFlags interface.
 func (f *cpuFeatureFlags) HasExtra(cpuFeature CpuFeature) bool {
 	return (f.extraFlags & uint64(cpuFeature)) != 0
 }
+
+// Raw implements the same method on the CpuFeatureFlags interface.
+func (f *cpuFeatureFlags) Raw() uint64 {
+	// Below, we only set the first 4 bits for the features we care about,
+	// instead of setting all the unnecessary bits obtained from the CPUID instruction.
+	var ret uint64
+	if f.Has(CpuFeatureAmd64SSE3) {
+		ret = 1 << 0
+	}
+	if f.Has(CpuFeatureAmd64SSE4_1) {
+		ret |= 1 << 1
+	}
+	if f.Has(CpuFeatureAmd64SSE4_2) {
+		ret |= 1 << 2
+	}
+	if f.HasExtra(CpuExtraFeatureAmd64ABM) {
+		ret |= 1 << 3
+	}
+	return ret
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_unsupported.go b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_unsupported.go
index 8ae826d367..291bcea65f 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_unsupported.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_unsupported.go
@@ -4,11 +4,14 @@
 
 var CpuFeatures CpuFeatureFlags = &cpuFeatureFlags{}
 
-// cpuFeatureFlags implements CpuFeatureFlags for unsupported platforms
+// cpuFeatureFlags implements CpuFeatureFlags for unsupported platforms.
 type cpuFeatureFlags struct{}
 
-// Has implements the same method on the CpuFeatureFlags interface
+// Has implements the same method on the CpuFeatureFlags interface.
 func (c *cpuFeatureFlags) Has(cpuFeature CpuFeature) bool { return false }
 
-// HasExtra implements the same method on the CpuFeatureFlags interface
+// HasExtra implements the same method on the CpuFeatureFlags interface.
 func (c *cpuFeatureFlags) HasExtra(cpuFeature CpuFeature) bool { return false }
+
+// Raw implements the same method on the CpuFeatureFlags interface.
+func (c *cpuFeatureFlags) Raw() uint64 { return 0 }
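(Aside, not part of the patch: Raw() condenses the four feature probes into a stable 4-bit value. Per the interface comment it exists for cache keying: machine code compiled assuming SSE4.2 must never be loaded from a cache on a host without it. A self-contained sketch of that idea; the key layout is illustrative, not wazero's actual file-cache format:

package main

import "fmt"

// cpuFeatures mimics the shape of platform.CpuFeatureFlags.Raw from the diff:
// a compact, stable summary of the features the compiler cares about.
type cpuFeatures struct{ sse3, sse41, sse42, abm bool }

func (c cpuFeatures) Raw() (ret uint64) {
	if c.sse3 {
		ret = 1 << 0
	}
	if c.sse41 {
		ret |= 1 << 1
	}
	if c.sse42 {
		ret |= 1 << 2
	}
	if c.abm {
		ret |= 1 << 3
	}
	return
}

// cacheKey folds the feature bits into a compilation-cache key so that the
// key changes whenever the relevant CPU capabilities change.
func cacheKey(version string, f cpuFeatures) string {
	return fmt.Sprintf("%s-cpu%x", version, f.Raw())
}

func main() {
	fmt.Println(cacheKey("v1.8.0", cpuFeatures{sse3: true, sse41: true}))
	// Output: v1.8.0-cpu3
}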
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unix.go b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unix.go
index a61996d58b..b0519003b7 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unix.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unix.go
@@ -12,8 +12,6 @@ const (
 	mmapProtARM64 = syscall.PROT_READ | syscall.PROT_WRITE
 )
 
-const MmapSupported = true
-
 func munmapCodeSegment(code []byte) error {
 	return syscall.Munmap(code)
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unsupported.go b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unsupported.go
index 27833db377..079aa643f4 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unsupported.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unsupported.go
@@ -9,8 +9,6 @@ import (
 )
 
 var errUnsupported = fmt.Errorf("mmap unsupported on GOOS=%s. Use interpreter instead.", runtime.GOOS)
 
-const MmapSupported = false
-
 func munmapCodeSegment(code []byte) error {
 	panic(errUnsupported)
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_windows.go b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_windows.go
index 69fcb6d6b6..03a254d4a6 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_windows.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_windows.go
@@ -21,8 +21,6 @@ const (
 	windows_PAGE_EXECUTE_READWRITE uintptr = 0x00000040
 )
 
-const MmapSupported = true
-
 func munmapCodeSegment(code []byte) error {
 	return freeMemory(code)
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/mremap_other.go b/vendor/github.com/tetratelabs/wazero/internal/platform/mremap_other.go
deleted file mode 100644
index 5cba99fb25..0000000000
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/mremap_other.go
+++ /dev/null
@@ -1,23 +0,0 @@
-//go:build !(darwin || linux || freebsd) || tinygo
-
-package platform
-
-func remapCodeSegmentAMD64(code []byte, size int) ([]byte, error) {
-	b, err := mmapCodeSegmentAMD64(size)
-	if err != nil {
-		return nil, err
-	}
-	copy(b, code)
-	mustMunmapCodeSegment(code)
-	return b, nil
-}
-
-func remapCodeSegmentARM64(code []byte, size int) ([]byte, error) {
-	b, err := mmapCodeSegmentARM64(size)
-	if err != nil {
-		return nil, err
-	}
-	copy(b, code)
-	mustMunmapCodeSegment(code)
-	return b, nil
-}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/mremap_unix.go b/vendor/github.com/tetratelabs/wazero/internal/platform/mremap_unix.go
deleted file mode 100644
index 8f42d44fd7..0000000000
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/mremap_unix.go
+++ /dev/null
@@ -1,21 +0,0 @@
-//go:build (darwin || linux || freebsd) && !tinygo
-
-package platform
-
-func remapCodeSegmentAMD64(code []byte, size int) ([]byte, error) {
-	return remapCodeSegment(code, size, mmapProtAMD64)
-}
-
-func remapCodeSegmentARM64(code []byte, size int) ([]byte, error) {
-	return remapCodeSegment(code, size, mmapProtARM64)
-}
-
-func remapCodeSegment(code []byte, size, prot int) ([]byte, error) {
-	b, err := mmapCodeSegment(size, prot)
-	if err != nil {
-		return nil, err
-	}
-	copy(b, code)
-	mustMunmapCodeSegment(code)
-	return b, nil
-}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/platform.go b/vendor/github.com/tetratelabs/wazero/internal/platform/platform.go
index c6dc0f857b..a275562406 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/platform.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/platform.go
@@ -36,28 +36,6 @@ func MmapCodeSegment(size int) ([]byte, error) {
 	}
 }
 
-// RemapCodeSegment reallocates the memory mapping of an existing code segment
-// to increase its size. The previous code mapping is unmapped and must not be
-// reused after the function returns.
-//
-// This is similar to mremap(2) on linux, and emulated on platforms which do not
-// have this syscall.
-//
-// See https://man7.org/linux/man-pages/man2/mremap.2.html
-func RemapCodeSegment(code []byte, size int) ([]byte, error) {
-	if size < len(code) {
-		panic("BUG: RemapCodeSegment with size less than code")
-	}
-	if code == nil {
-		return MmapCodeSegment(size)
-	}
-	if runtime.GOARCH == "amd64" {
-		return remapCodeSegmentAMD64(code, size)
-	} else {
-		return remapCodeSegmentARM64(code, size)
-	}
-}
-
 // MunmapCodeSegment unmaps the given memory region.
 func MunmapCodeSegment(code []byte) error {
 	if len(code) == 0 {
@@ -65,17 +43,3 @@ func MunmapCodeSegment(code []byte) error {
 	}
 	return munmapCodeSegment(code)
 }
-
-// mustMunmapCodeSegment panics instead of returning an error to the
-// application.
-//
-// # Why panic?
-//
-// It is less disruptive to the application to leak the previous block if it
-// could be unmapped than to leak the new block and return an error.
-// Realistically, either scenarios are pretty hard to debug, so we panic.
-func mustMunmapCodeSegment(code []byte) {
-	if err := munmapCodeSegment(code); err != nil {
-		panic(err)
-	}
-}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/sysfs/file.go b/vendor/github.com/tetratelabs/wazero/internal/sysfs/file.go
index 9a77205bb5..fdbf1fde0d 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/sysfs/file.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/sysfs/file.go
@@ -38,9 +38,6 @@ func NewStdioFile(stdin bool, f fs.File) (fsapi.File, error) {
 }
 
 func OpenFile(path string, flag experimentalsys.Oflag, perm fs.FileMode) (*os.File, experimentalsys.Errno) {
-	if flag&experimentalsys.O_DIRECTORY != 0 && flag&(experimentalsys.O_WRONLY|experimentalsys.O_RDWR) != 0 {
-		return nil, experimentalsys.EISDIR // invalid to open a directory writeable
-	}
 	return openFile(path, flag, perm)
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/wasm/binary/value.go b/vendor/github.com/tetratelabs/wazero/internal/wasm/binary/value.go
index 755ee5ea3e..dfc4417edc 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/wasm/binary/value.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/wasm/binary/value.go
@@ -54,7 +54,6 @@ func decodeUTF8(r *bytes.Reader, contextFormat string, contextArgs ...interface{
 		return "", 0, fmt.Errorf("%s is not valid UTF-8", fmt.Sprintf(contextFormat, contextArgs...))
 	}
 
-	// TODO: use unsafe.String after flooring Go 1.20.
-	ret := *(*string)(unsafe.Pointer(&buf))
+	ret := unsafe.String(&buf[0], int(size))
 	return ret, size + uint32(sizeOfSize), nil
 }
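(Aside, not part of the patch: the replaced slice-header cast predates Go 1.20; unsafe.String is the supported way to alias a byte slice's backing array as a string, which fits the new go 1.21 floor in modules.txt below. A small standalone illustration of the semantics and the aliasing caveat; this is not wazero code:

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	buf := []byte("hello")
	// unsafe.String aliases buf's backing array: no copy is made, so the
	// string is only valid while buf's bytes are not mutated.
	s := unsafe.String(&buf[0], len(buf))
	fmt.Println(s) // hello

	buf[0] = 'y' // the mutation is visible through the "immutable" string
	fmt.Println(s) // yello

	// string(buf) copies and is therefore always safe, at the cost of an
	// allocation; decodeUTF8 can skip the copy presumably because the
	// decoded buffer is not reused afterwards.
	c := string(buf)
	buf[0] = 'm'
	fmt.Println(c) // yello
}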
+	MemoryGrown()
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/wasm/engine.go b/vendor/github.com/tetratelabs/wazero/internal/wasm/engine.go
index 58a4582178..61a342ef23 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/wasm/engine.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/wasm/engine.go
@@ -69,4 +69,7 @@ type ModuleEngine interface {
 	// FunctionInstanceReference returns Reference for the given Index for a FunctionInstance. The returned values are used by
 	// the initialization via ElementSegment.
 	FunctionInstanceReference(funcIndex Index) Reference
+
+	// MemoryGrown notifies the engine that the memory has grown.
+	MemoryGrown()
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/wasm/func_validation.go b/vendor/github.com/tetratelabs/wazero/internal/wasm/func_validation.go
index 8da6890765..6044892289 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/wasm/func_validation.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/wasm/func_validation.go
@@ -67,11 +67,6 @@ func (m *Module) validateFunctionWithMaxStackValues(
 	declaredFunctionIndexes map[Index]struct{},
 	br *bytes.Reader,
 ) error {
-	nonStaticLocals := make(map[Index]struct{})
-	if len(m.NonStaticLocals) > 0 {
-		m.NonStaticLocals[idx] = nonStaticLocals
-	}
-
 	functionType := &m.TypeSection[m.FunctionSection[idx]]
 	code := &m.CodeSection[idx]
 	body := code.Body
@@ -357,7 +352,6 @@ func (m *Module) validateFunctionWithMaxStackValues(
 				return fmt.Errorf("invalid local index for %s %d >= %d(=len(locals)+len(parameters))", OpcodeLocalSetName, index, l)
 			}
-			nonStaticLocals[index] = struct{}{}
 			var expType ValueType
 			if index < inputLen {
 				expType = functionType.Params[index]
@@ -373,7 +367,6 @@ func (m *Module) validateFunctionWithMaxStackValues(
 				return fmt.Errorf("invalid local index for %s %d >= %d(=len(locals)+len(parameters))", OpcodeLocalTeeName, index, l)
 			}
-			nonStaticLocals[index] = struct{}{}
 			var expType ValueType
 			if index < inputLen {
 				expType = functionType.Params[index]
@@ -458,14 +451,14 @@ func (m *Module) validateFunctionWithMaxStackValues(
 				return fmt.Errorf("read immediate: %w", err)
 			}
 
-			list := make([]uint32, nl)
+			sts.ls = sts.ls[:0]
 			for i := uint32(0); i < nl; i++ {
 				l, n, err := leb128.DecodeUint32(br)
 				if err != nil {
 					return fmt.Errorf("read immediate: %w", err)
 				}
 				num += n
-				list[i] = l
+				sts.ls = append(sts.ls, l)
 			}
 			ln, n, err := leb128.DecodeUint32(br)
 			if err != nil {
@@ -518,7 +511,7 @@ func (m *Module) validateFunctionWithMaxStackValues(
 				}
 			}
 
-			for _, l := range list {
+			for _, l := range sts.ls {
 				if int(l) >= len(controlBlockStack.stack) {
 					return fmt.Errorf("invalid l param given for %s", OpcodeBrTableName)
 				}
@@ -2010,6 +2003,8 @@ var vecSplatValueTypes = [...]ValueType{
 type stacks struct {
 	vs valueTypeStack
 	cs controlBlockStack
+	// ls is the label slice that is reused for each br_table instruction.
+	ls []uint32
 }
 
 func (sts *stacks) reset(functionType *FunctionType) {
@@ -2019,6 +2014,7 @@ func (sts *stacks) reset(functionType *FunctionType) {
 	sts.vs.maximumStackPointer = 0
 	sts.cs.stack = sts.cs.stack[:0]
 	sts.cs.stack = append(sts.cs.stack, controlBlock{blockType: functionType})
+	sts.ls = sts.ls[:0]
 }
 
 type controlBlockStack struct {
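(Aside, not part of the patch: the br_table change above swaps a per-instruction allocation, list := make([]uint32, nl), for a slice that is truncated and refilled on each use, the same s = s[:0] pattern the stacks struct already applies to its value and control stacks. A standalone sketch of the idiom; illustrative, not wazero code:

package main

import "fmt"

// decoder reuses one label buffer across calls, mirroring stacks.ls:
// truncating to s[:0] keeps the backing array, so steady-state decoding of
// br_table-like immediates stops allocating once the buffer has warmed up.
type decoder struct {
	ls []uint32
}

func (d *decoder) decodeLabels(raw []uint32) []uint32 {
	d.ls = d.ls[:0] // reset length, keep capacity
	for _, l := range raw {
		d.ls = append(d.ls, l)
	}
	return d.ls // only valid until the next decodeLabels call
}

func main() {
	d := &decoder{}
	fmt.Println(d.decodeLabels([]uint32{1, 2, 3})) // [1 2 3]
	// The second call reuses the same backing array: no new allocation.
	fmt.Println(d.decodeLabels([]uint32{4, 5})) // [4 5]
}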
diff --git a/vendor/github.com/tetratelabs/wazero/internal/wasm/memory.go b/vendor/github.com/tetratelabs/wazero/internal/wasm/memory.go
index 5cc5012dae..8e072fd127 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/wasm/memory.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/wasm/memory.go
@@ -52,18 +52,22 @@ type MemoryInstance struct {
 	definition api.MemoryDefinition
 
 	// Mux is used in interpreter mode to prevent overlapping calls to atomic instructions,
-	// introduced with WebAssembly threads proposal.
+	// introduced with WebAssembly threads proposal, and in compiler mode to make memory modifications
+	// within Grow non-racy for the Go race detector.
 	Mux sync.Mutex
 
 	// waiters implements atomic wait and notify. It is implemented similarly to golang.org/x/sync/semaphore,
 	// with a fixed weight of 1 and no spurious notifications.
 	waiters sync.Map
 
+	// ownerModuleEngine is the module engine that owns this memory instance.
+	ownerModuleEngine ModuleEngine
+
 	expBuffer experimental.LinearMemory
 }
 
 // NewMemoryInstance creates a new instance based on the parameters in the SectionIDMemory.
-func NewMemoryInstance(memSec *Memory, allocator experimental.MemoryAllocator) *MemoryInstance {
+func NewMemoryInstance(memSec *Memory, allocator experimental.MemoryAllocator, moduleEngine ModuleEngine) *MemoryInstance {
 	minBytes := MemoryPagesToBytesNum(memSec.Min)
 	capBytes := MemoryPagesToBytesNum(memSec.Cap)
 	maxBytes := MemoryPagesToBytesNum(memSec.Max)
@@ -89,12 +93,13 @@ func NewMemoryInstance(memSec *Memory, allocator experimental.MemoryAllocator) *
 		buffer = make([]byte, minBytes, capBytes)
 	}
 	return &MemoryInstance{
-		Buffer:    buffer,
-		Min:       memSec.Min,
-		Cap:       memoryBytesNumToPages(uint64(cap(buffer))),
-		Max:       memSec.Max,
-		Shared:    memSec.IsShared,
-		expBuffer: expBuffer,
+		Buffer:            buffer,
+		Min:               memSec.Min,
+		Cap:               memoryBytesNumToPages(uint64(cap(buffer))),
+		Max:               memSec.Max,
+		Shared:            memSec.IsShared,
+		expBuffer:         expBuffer,
+		ownerModuleEngine: moduleEngine,
 	}
 }
 
@@ -223,6 +228,11 @@ func MemoryPagesToBytesNum(pages uint32) (bytesNum uint64) {
 
 // Grow implements the same method as documented on api.Memory.
 func (m *MemoryInstance) Grow(delta uint32) (result uint32, ok bool) {
+	if m.Shared {
+		m.Mux.Lock()
+		defer m.Mux.Unlock()
+	}
+
 	currentPages := m.Pages()
 	if delta == 0 {
 		return currentPages, true
@@ -247,14 +257,12 @@ func (m *MemoryInstance) Grow(delta uint32) (result uint32, ok bool) {
 			m.Buffer = buffer
 			m.Cap = newPages
 		}
-		return currentPages, true
 	} else if newPages > m.Cap { // grow the memory.
 		if m.Shared {
 			panic("shared memory cannot be grown, this is a bug in wazero")
 		}
 		m.Buffer = append(m.Buffer, make([]byte, MemoryPagesToBytesNum(delta))...)
 		m.Cap = newPages
-		return currentPages, true
 	} else { // We already have the capacity we need.
 		if m.Shared {
 			// We assume grow is called under a guest lock.
@@ -264,8 +272,9 @@ func (m *MemoryInstance) Grow(delta uint32) (result uint32, ok bool) {
 		} else {
 			m.Buffer = m.Buffer[:MemoryPagesToBytesNum(newPages)]
 		}
-		return currentPages, true
 	}
+	m.ownerModuleEngine.MemoryGrown()
+	return currentPages, true
 }
 
 // Pages implements the same method as documented on api.Memory.
@@ -296,6 +305,7 @@ func PagesToUnitOfBytes(pages uint32) string {
 
 // Uses atomic write to update the length of a slice.
 func atomicStoreLengthAndCap(slice *[]byte, length uintptr, cap uintptr) {
+	//nolint:staticcheck
 	slicePtr := (*reflect.SliceHeader)(unsafe.Pointer(slice))
 	capPtr := (*uintptr)(unsafe.Pointer(&slicePtr.Cap))
 	atomic.StoreUintptr(capPtr, cap)
@@ -305,6 +315,7 @@ func atomicStoreLengthAndCap(slice *[]byte, length uintptr, cap uintptr) {
 
 // Uses atomic write to update the length of a slice.
 func atomicStoreLength(slice *[]byte, length uintptr) {
+	//nolint:staticcheck
 	slicePtr := (*reflect.SliceHeader)(unsafe.Pointer(slice))
 	lenPtr := (*uintptr)(unsafe.Pointer(&slicePtr.Len))
 	atomic.StoreUintptr(lenPtr, length)
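(Aside, not part of the patch: Grow now funnels every successful resize through ownerModuleEngine.MemoryGrown(), one notification point instead of three early returns, so an engine can refresh whatever it derived from the old buffer. A minimal sketch of an engine reacting to the callback; the MemoryGrown signature is from this diff, but the engine internals below are invented for illustration:

package main

import "fmt"

// memory stands in for wasm.MemoryInstance: it owns the buffer and notifies
// its engine after every successful grow, as in the Grow change above.
type memory struct {
	buffer []byte
	engine interface{ MemoryGrown() }
}

func (m *memory) Grow(deltaBytes int) {
	m.buffer = append(m.buffer, make([]byte, deltaBytes)...) // may reallocate
	m.engine.MemoryGrown() // single notification point, whichever branch grew it
}

// engine caches a view of the memory, a hypothetical stand-in for a compiled
// engine caching the buffer's base address; MemoryGrown re-derives the cache.
type engine struct {
	mem        *memory
	cachedView []byte
}

func (e *engine) MemoryGrown() {
	e.cachedView = e.mem.buffer // pick up the possibly-moved buffer
}

func main() {
	e := &engine{}
	m := &memory{buffer: make([]byte, 4), engine: e}
	e.mem = m
	e.MemoryGrown()
	m.Grow(4)
	fmt.Println(len(e.cachedView)) // 8: the cache tracked the reallocation
}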
diff --git a/vendor/github.com/tetratelabs/wazero/internal/wasm/module.go b/vendor/github.com/tetratelabs/wazero/internal/wasm/module.go
index 68573b918e..8369ad9ed6 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/wasm/module.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/wasm/module.go
@@ -185,9 +185,6 @@ type Module struct {
 	// as described in https://yurydelendik.github.io/webassembly-dwarf/, though it is not specified in the Wasm
 	// specification: https://github.com/WebAssembly/debugging/issues/1
 	DWARFLines *wasmdebug.DWARFLines
-
-	// NonStaticLocals collects the local indexes that will change its value through either local.get or local.tee.
-	NonStaticLocals []map[Index]struct{}
 }
 
 // ModuleID represents sha256 hash value uniquely assigned to Module.
@@ -366,8 +363,6 @@ func (m *Module) validateFunctions(enabledFeatures api.CoreFeatures, functions [
 	br := bytes.NewReader(nil)
 	// Also, we reuse the stacks across multiple function validations to reduce allocations.
 	vs := &stacks{}
-	// Non-static locals are gathered during validation and used in the down-stream compilation.
-	m.NonStaticLocals = make([]map[Index]struct{}, len(m.FunctionSection))
 	for idx, typeIndex := range m.FunctionSection {
 		if typeIndex >= typeCount {
 			return fmt.Errorf("invalid %s: type section index %d out of range", m.funcDesc(SectionIDFunction, Index(idx)), typeIndex)
@@ -655,7 +650,7 @@ func paramNames(localNames IndirectNameMap, funcIdx uint32, paramLen int) []stri
 func (m *ModuleInstance) buildMemory(module *Module, allocator experimental.MemoryAllocator) {
 	memSec := module.MemorySection
 	if memSec != nil {
-		m.MemoryInstance = NewMemoryInstance(memSec, allocator)
+		m.MemoryInstance = NewMemoryInstance(memSec, allocator, m.Engine)
 		m.MemoryInstance.definition = &module.MemoryDefinitionSection[0]
 	}
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/wasm/store.go b/vendor/github.com/tetratelabs/wazero/internal/wasm/store.go
index 1db661e853..dda6e5b635 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/wasm/store.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/wasm/store.go
@@ -3,6 +3,7 @@ package wasm
 import (
 	"context"
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"sync"
 	"sync/atomic"
@@ -352,7 +353,7 @@ func (s *Store) instantiate(
 		return nil, err
 	}
 
-	if err = m.resolveImports(module); err != nil {
+	if err = m.resolveImports(ctx, module); err != nil {
 		return nil, err
 	}
 
@@ -410,12 +411,22 @@ func (s *Store) instantiate(
 	return
 }
 
-func (m *ModuleInstance) resolveImports(module *Module) (err error) {
+func (m *ModuleInstance) resolveImports(ctx context.Context, module *Module) (err error) {
+	// Check if ctx contains an ImportResolver.
+	resolveImport, _ := ctx.Value(expctxkeys.ImportResolverKey{}).(experimental.ImportResolver)
+
 	for moduleName, imports := range module.ImportPerModule {
 		var importedModule *ModuleInstance
-		importedModule, err = m.s.module(moduleName)
-		if err != nil {
-			return err
+		if resolveImport != nil {
+			if v := resolveImport(moduleName); v != nil {
+				importedModule = v.(*ModuleInstance)
+			}
+		}
+		if importedModule == nil {
+			importedModule, err = m.s.module(moduleName)
+			if err != nil {
+				return err
+			}
 		}
 
 		for _, i := range imports {
@@ -649,20 +660,20 @@ func (s *Store) GetFunctionTypeID(t *FunctionType) (FunctionTypeID, error) {
 }
 
 // CloseWithExitCode implements the same method as documented on wazero.Runtime.
-func (s *Store) CloseWithExitCode(ctx context.Context, exitCode uint32) (err error) {
+func (s *Store) CloseWithExitCode(ctx context.Context, exitCode uint32) error {
 	s.mux.Lock()
 	defer s.mux.Unlock()
 	// Close modules in reverse initialization order.
+	var errs []error
 	for m := s.moduleList; m != nil; m = m.next {
 		// If closing this module errs, proceed anyway to close the others.
-		if e := m.closeWithExitCode(ctx, exitCode); e != nil && err == nil {
-			// TODO: use multiple errors handling in Go 1.20.
-			err = e // first error
+		if err := m.closeWithExitCode(ctx, exitCode); err != nil {
+			errs = append(errs, err)
 		}
 	}
 	s.moduleList = nil
 	s.nameToModule = nil
 	s.nameToModuleCap = 0
 	s.typeIDs = nil
-	return
+	return errors.Join(errs...)
}
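(Aside, not part of the patch: CloseWithExitCode previously kept only the first close error, per the removed TODO; with go 1.21 as the module floor it can aggregate all of them. The property that makes errors.Join a drop-in here is that joining an empty or all-nil list returns nil, so the happy path is unchanged. A quick standalone illustration:

package main

import (
	"errors"
	"fmt"
)

func main() {
	var errs []error
	fmt.Println(errors.Join(errs...) == nil) // true: no errors, nil result

	e1 := errors.New("close module a: boom")
	e2 := errors.New("close module b: bang")
	err := errors.Join(e1, e2)
	fmt.Println(err)                // both messages, newline-separated
	fmt.Println(errors.Is(err, e2)) // true: each joined error stays matchable
}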
panic("BUG: stored dwarf.LineReaderPos is invalid") diff --git a/vendor/github.com/tetratelabs/wazero/runtime.go b/vendor/github.com/tetratelabs/wazero/runtime.go index d1f0a1a310..34742289eb 100644 --- a/vendor/github.com/tetratelabs/wazero/runtime.go +++ b/vendor/github.com/tetratelabs/wazero/runtime.go @@ -197,7 +197,13 @@ func (r *runtime) Module(moduleName string) api.Module { if len(moduleName) == 0 { return nil } - return r.store.Module(moduleName) + m := r.store.Module(moduleName) + if m == nil { + return nil + } else if m.Source.IsHostModule { + return hostModuleInstance{m} + } + return m } // CompileModule implements Runtime.CompileModule diff --git a/vendor/github.com/tetratelabs/wazero/sys/stat_unsupported.go b/vendor/github.com/tetratelabs/wazero/sys/stat_unsupported.go index 583c2adb04..cc37012cff 100644 --- a/vendor/github.com/tetratelabs/wazero/sys/stat_unsupported.go +++ b/vendor/github.com/tetratelabs/wazero/sys/stat_unsupported.go @@ -7,9 +7,6 @@ import "io/fs" // sysParseable is only used here as we define "supported" as being able to // parse `info.Sys()`. The above `go:build` constraints exclude 32-bit until // that's requested. -// -// TODO: When Go 1.21 is out, use the "unix" build constraint (as 1.21 makes -// our floor Go version 1.19. const sysParseable = false func statFromFileInfo(info fs.FileInfo) Stat_t { diff --git a/vendor/modules.txt b/vendor/modules.txt index a1dafd3957..d30f7645b0 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -704,8 +704,8 @@ github.com/tailscale/wireguard-go/tun # github.com/tcnksm/go-httpstat v0.2.0 ## explicit github.com/tcnksm/go-httpstat -# github.com/tetratelabs/wazero v1.7.2 -## explicit; go 1.20 +# github.com/tetratelabs/wazero v1.8.0 +## explicit; go 1.21 github.com/tetratelabs/wazero github.com/tetratelabs/wazero/api github.com/tetratelabs/wazero/experimental