From 75322e94e4fef05097d22a9ebed052f11850d970 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Tue, 10 Sep 2024 18:01:34 +1000
Subject: [PATCH 01/28] Use Dawn libs directly to minimize binary size. Fix
 Release build errors. Fix Android build errors.

---
 .../external/onnxruntime_external_deps.cmake  | 41 +++++++++++++++--
 cmake/onnxruntime_providers_webgpu.cmake      | 12 ++---
 cmake/patches/dawn/dawn.patch                 | 12 +++++
 .../core/providers/webgpu/buffer_manager.cc   |  8 ++--
 .../providers/webgpu/program_cache_key.cc     |  5 +-
 .../core/providers/webgpu/program_manager.h   |  2 +-
 .../core/providers/webgpu/shader_helper.cc    | 46 +++++++++----------
 .../core/providers/webgpu/webgpu_context.cc   |  5 ++
 8 files changed, 89 insertions(+), 42 deletions(-)
 create mode 100644 cmake/patches/dawn/dawn.patch

diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index a8ab4a53b9f3a..c2d5957a9910c 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -590,12 +590,45 @@ if (onnxruntime_USE_WEBGPU)
     dawn
     URL ${DEP_URL_dawn}
     URL_HASH SHA1=${DEP_SHA1_dawn}
+    PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
   )
-  set(DAWN_FETCH_DEPENDENCIES ON)
-  set(DAWN_ENABLE_INSTALL ON)
-  set(TINT_BUILD_TESTS OFF)
-  set(DAWN_USE_BUILT_DXC ON)
+
+  # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
+  set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
+  set(DAWN_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
+  set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
+  set(DAWN_ENABLE_NULL OFF CACHE BOOL "" FORCE)
+  set(DAWN_FETCH_DEPENDENCIES ON CACHE BOOL "" FORCE)
+
+  # disable things we don't use
   set(DAWN_DXC_ENABLE_ASSERTS_IN_NDEBUG OFF)
+  set(DAWN_ENABLE_DESKTOP_GL OFF CACHE BOOL "" FORCE)
+  set(DAWN_ENABLE_OPENGLES OFF CACHE BOOL "" FORCE)
+  set(DAWN_SUPPORTS_GLFW_FOR_WINDOWING OFF CACHE BOOL "" FORCE)
+  set(DAWN_USE_GLFW OFF CACHE BOOL "" FORCE)
+  set(DAWN_USE_WINDOWS_UI OFF CACHE BOOL "" FORCE)
+  set(DAWN_USE_X11 OFF CACHE BOOL "" FORCE)
+
+  set(TINT_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_CMD_TOOLS OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_GLSL_WRITER OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_GLSL_VALIDATOR OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_IR_BINARY OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_SPV_READER OFF CACHE BOOL "" FORCE)  # don't need. disabling is a large binary size saving
+  set(TINT_BUILD_WGSL_WRITER ON CACHE BOOL "" FORCE)  # needed to create cache key
+
+  # SPIR-V validation shouldn't be required given we're using Tint to create the SPIR-V.
+  if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set(DAWN_ENABLE_SPIRV_VALIDATION OFF CACHE BOOL "" FORCE)
+  endif()
+
+  if (WIN32)
+    set(DAWN_USE_BUILT_DXC ON CACHE BOOL "" FORCE)
+
+    # Vulkan may optionally be included in a Windows build. Exclude until we have an explicit use case that requires it.
+    set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)
+  endif()
+
   onnxruntime_fetchcontent_makeavailable(dawn)
 endif()
 
diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake
index 587c4b2c1ff2c..8d00ab5aa4494 100644
--- a/cmake/onnxruntime_providers_webgpu.cmake
+++ b/cmake/onnxruntime_providers_webgpu.cmake
@@ -24,14 +24,8 @@
 
   source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_providers_webgpu_cc_srcs})
   onnxruntime_add_static_library(onnxruntime_providers_webgpu ${onnxruntime_providers_webgpu_cc_srcs})
-  onnxruntime_add_include_to_target(onnxruntime_providers_webgpu onnxruntime_common onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)
-  target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn)
-
-  # Copy webgpu_dawn.dll to the output directory
-  add_custom_command(
-    TARGET onnxruntime_providers_webgpu
-    POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_FILE:dawn::webgpu_dawn>" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
-    VERBATIM )
+  onnxruntime_add_include_to_target(onnxruntime_providers_webgpu
+    onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)
+  target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native dawn::dawn_proc)
 
   set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime")
diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch
new file mode 100644
index 0000000000000..33eb430329603
--- /dev/null
+++ b/cmake/patches/dawn/dawn.patch
@@ -0,0 +1,12 @@
+diff --git a/src/tint/api/BUILD.cmake b/src/tint/api/BUILD.cmake
+index 0037d83276..6372c4ee77 100644
+--- a/src/tint/api/BUILD.cmake
++++ b/src/tint/api/BUILD.cmake
+@@ -57,6 +57,7 @@ tint_target_add_dependencies(tint_api lib
+   tint_lang_wgsl_ast_transform
+   tint_lang_wgsl_common
+   tint_lang_wgsl_features
++  tint_lang_wgsl_inspector
+   tint_lang_wgsl_program
+   tint_lang_wgsl_sem
+   tint_lang_wgsl_writer_ir_to_program
diff --git a/onnxruntime/core/providers/webgpu/buffer_manager.cc b/onnxruntime/core/providers/webgpu/buffer_manager.cc
index da544e1d1ed60..8751338d24178 100644
--- a/onnxruntime/core/providers/webgpu/buffer_manager.cc
+++ b/onnxruntime/core/providers/webgpu/buffer_manager.cc
@@ -243,10 +243,10 @@ std::ostream& operator<<(std::ostream& os, BufferCacheMode mode) {
 
 BufferManager::BufferManager(WebGpuContext& context, BufferCacheMode storage_buffer_cache_mode, BufferCacheMode uniform_buffer_cache_mode, BufferCacheMode query_resolve_buffer_cache_mode)
     : context_{context},
-      storage_cache_{std::move(CreateBufferCacheManager(storage_buffer_cache_mode))},
-      uniform_cache_{std::move(CreateBufferCacheManager(uniform_buffer_cache_mode))},
-      query_resolve_cache_{std::move(CreateBufferCacheManager(query_resolve_buffer_cache_mode))},
-      default_cache_{std::move(CreateBufferCacheManager(BufferCacheMode::Disabled))} {
+      storage_cache_{CreateBufferCacheManager(storage_buffer_cache_mode)},
+      uniform_cache_{CreateBufferCacheManager(uniform_buffer_cache_mode)},
+      query_resolve_cache_{CreateBufferCacheManager(query_resolve_buffer_cache_mode)},
+      default_cache_{CreateBufferCacheManager(BufferCacheMode::Disabled)} {
 }
 
 void BufferManager::Upload(void* src, WGPUBuffer dst, size_t size) {
diff --git a/onnxruntime/core/providers/webgpu/program_cache_key.cc b/onnxruntime/core/providers/webgpu/program_cache_key.cc
index 09a536f7916b2..6c7ef2bc89c6b 100644
--- a/onnxruntime/core/providers/webgpu/program_cache_key.cc
+++ b/onnxruntime/core/providers/webgpu/program_cache_key.cc
@@ -10,12 +10,14 @@ namespace webgpu {
 
 namespace {
 // append the info of an input or output to the cachekey
-void AppendTensorInfo(std::ostringstream& ss, const Tensor& tensor, ProgramVariableDataType var_type, ProgramTensorMetadataDependency dependency, bool& first) {
+void AppendTensorInfo(std::ostringstream& ss, const Tensor& tensor, ProgramVariableDataType var_type, ProgramTensorMetadataDependency dependency,
+                      bool& first) {
   if (first) {
     first = false;
   } else {
     ss << '|';
   }
+
   if ((dependency & ProgramTensorMetadataDependency::Type) == ProgramTensorMetadataDependency::Type) {
 #ifndef NDEBUG  // if debug build
     ss << var_type;
@@ -24,6 +26,7 @@ void AppendTensorInfo(std::ostringstream& ss, const Tensor& tensor, ProgramVaria
 #endif
     ss << ';';
   }
+
   if ((dependency & ProgramTensorMetadataDependency::Shape) == ProgramTensorMetadataDependency::Shape) {
     ss D("Dims=") << tensor.Shape().ToString();
   } else if ((dependency & ProgramTensorMetadataDependency::Rank) == ProgramTensorMetadataDependency::Rank) {
diff --git a/onnxruntime/core/providers/webgpu/program_manager.h b/onnxruntime/core/providers/webgpu/program_manager.h
index 782788910e3a5..83e5ff21c813c 100644
--- a/onnxruntime/core/providers/webgpu/program_manager.h
+++ b/onnxruntime/core/providers/webgpu/program_manager.h
@@ -30,7 +30,7 @@ class ProgramArtifact {
   const std::vector<int> shape_uniform_ranks;
 
   ProgramArtifact(ProgramArtifact&&) = default;
-  ProgramArtifact& operator=(ProgramArtifact&&) = default;
+  ProgramArtifact& operator=(ProgramArtifact&&) = delete;  // can't change const members.
 
  private:
   ORT_DISALLOW_COPY_AND_ASSIGNMENT(ProgramArtifact);
diff --git a/onnxruntime/core/providers/webgpu/shader_helper.cc b/onnxruntime/core/providers/webgpu/shader_helper.cc
index cd21f4752f300..be89efae5fc97 100644
--- a/onnxruntime/core/providers/webgpu/shader_helper.cc
+++ b/onnxruntime/core/providers/webgpu/shader_helper.cc
@@ -196,6 +196,29 @@ Status ValidateVariableDependency(ProgramTensorMetadataDependency dependency, Sh
 }
 }  // namespace
 
+Status ShaderHelper::ValidateVariable(const ProgramInput& input, const ShaderVariable& var) const {
+  ORT_RETURN_IF_ERROR(ValidateVariableDataType(input.tensor->GetElementType(), var.type_));
+  ORT_RETURN_IF_ERROR(ValidateVariableShape(input.tensor->Shape(),
+                                            input.use_override_shape,
+                                            input.use_override_shape ? input.override_shape : input.tensor->Shape(),
+                                            var.num_components_));
+  ORT_RETURN_IF_ERROR(ValidateVariableDependency(input.dependency, var.usage_, true));
+
+  return Status::OK();
+}
+Status ShaderHelper::ValidateVariable(const ProgramOutput& output, const ShaderVariable& var) const {
+  ORT_RETURN_IF_ERROR(ValidateVariableDataType(output.tensor->GetElementType(), var.type_));
+  ORT_RETURN_IF_ERROR(ValidateVariableShape(output.tensor->Shape(),
+                                            output.use_override_shape,
+                                            output.use_override_shape ? output.override_shape : output.tensor->Shape(),
+                                            var.num_components_));
+  ORT_RETURN_IF_ERROR(ValidateVariableDependency(output.dependency, var.usage_, false));
+
+  return Status::OK();
+}
+
+#endif  // NDEBUG
+
 const ShaderVariable& ShaderHelper::AddVariableImpl(ProgramVariableScope scope,
                                                     const std::string& name,
                                                     ShaderVariable::Usage usage,
@@ -224,27 +247,6 @@ const ShaderVariable& ShaderHelper::AddVariableImpl(ProgramVariableScope scope,
   return *var;
 }
 
-Status ShaderHelper::ValidateVariable(const ProgramInput& input, const ShaderVariable& var) const {
-  ORT_RETURN_IF_ERROR(ValidateVariableDataType(input.tensor->GetElementType(), var.type_));
-  ORT_RETURN_IF_ERROR(ValidateVariableShape(input.tensor->Shape(),
-                                            input.use_override_shape,
-                                            input.use_override_shape ? input.override_shape : input.tensor->Shape(),
-                                            var.num_components_));
-  ORT_RETURN_IF_ERROR(ValidateVariableDependency(input.dependency, var.usage_, true));
-
-  return Status::OK();
-}
-Status ShaderHelper::ValidateVariable(const ProgramOutput& output, const ShaderVariable& var) const {
-  ORT_RETURN_IF_ERROR(ValidateVariableDataType(output.tensor->GetElementType(), var.type_));
-  ORT_RETURN_IF_ERROR(ValidateVariableShape(output.tensor->Shape(),
-                                            output.use_override_shape,
-                                            output.use_override_shape ? output.override_shape : output.tensor->Shape(),
-                                            var.num_components_));
-  ORT_RETURN_IF_ERROR(ValidateVariableDependency(output.dependency, var.usage_, false));
-
-  return Status::OK();
-}
-
 Status ShaderHelper::ValidateShapeForInputsAndOutputs() const {
   const auto& input_vars = vars_[static_cast<int>(ProgramVariableScope::Input)];
   const auto& output_vars = vars_[static_cast<int>(ProgramVariableScope::Output)];
@@ -304,8 +306,6 @@ Status ShaderHelper::ValidateShapeForInputsAndOutputs() const {
   return Status::OK();
 }
 
-#endif
-
 Status ShaderHelper::GenerateSourceCode(std::string& code, std::vector<int>& shape_uniform_ranks) const {
   std::ostringstream ss;
   ss.imbue(std::locale::classic());
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index 276d74905adb7..0d994faeda472 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -4,6 +4,9 @@
 #include <memory>
 #include <cmath>
 
+#include "dawn/dawn_proc.h"
+#include "dawn/native/DawnNative.h"
+
 #include "core/common/common.h"
 
 #include "core/providers/webgpu/compute_context.h"
@@ -89,6 +92,8 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info
   std::call_once(init_flag_, [this, &webgpu_ep_info]() {
     // Initialization.Step.1 - Create wgpu::Instance
     if (instance_ == nullptr) {
+      dawnProcSetProcs(&dawn::native::GetProcs());
+
       wgpu::InstanceDescriptor instance_desc{};
       instance_desc.features.timedWaitAnyEnable = true;
       instance_ = wgpu::CreateInstance(&instance_desc);

From e8ed35f3d64fc0909f91f259b781b6a5f856323b Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Tue, 10 Sep 2024 18:21:23 +1000
Subject: [PATCH 02/28] Fix Windows build

---
 cmake/external/onnxruntime_external_deps.cmake | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index c2d5957a9910c..ae16e4e0b9971 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -623,7 +623,9 @@ if (onnxruntime_USE_WEBGPU)
   endif()
 
   if (WIN32)
+    # building this requires the HLSL writer to be enabled in Tint. TBD whether we need either of these to be ON.
     set(DAWN_USE_BUILT_DXC ON CACHE BOOL "" FORCE)
+    set(TINT_BUILD_HLSL_WRITER ON CACHE BOOL "" FORCE)
 
     # Vulkan may optionally be included in a Windows build. Exclude until we have an explicit use case that requires it.
     set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)

From f4cbc7654d01c9e13e3c21568a39682d061a73fc Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Wed, 11 Sep 2024 13:55:51 +1000
Subject: [PATCH 03/28] Update patch with iOS build fixes.

---
 cmake/patches/dawn/dawn.patch | 54 +++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch
index 33eb430329603..d696d386452e8 100644
--- a/cmake/patches/dawn/dawn.patch
+++ b/cmake/patches/dawn/dawn.patch
@@ -1,3 +1,57 @@
+diff --git a/src/dawn/native/CMakeLists.txt b/src/dawn/native/CMakeLists.txt
+index 9c0bd6fa4e..bf8a57aeac 100644
+--- a/src/dawn/native/CMakeLists.txt
++++ b/src/dawn/native/CMakeLists.txt
+@@ -857,6 +857,11 @@ if (DAWN_ENABLE_SWIFTSHADER)
+     target_compile_definitions(dawn_native PRIVATE "DAWN_ENABLE_SWIFTSHADER")
+ endif()
+
++if (IOS)
++    target_compile_options(dawn_native_objects PRIVATE -fno-objc-arc)
++    target_compile_options(dawn_native PRIVATE -fno-objc-arc)
++endif()
++
+ if (DAWN_BUILD_MONOLITHIC_LIBRARY)
+     ###############################################################################
+     # Do the 'complete_lib' build.
+diff --git a/src/dawn/native/Surface_metal.mm b/src/dawn/native/Surface_metal.mm
+index ce55acbd43..baa4835362 100644
+--- a/src/dawn/native/Surface_metal.mm
++++ b/src/dawn/native/Surface_metal.mm
+@@ -36,7 +36,13 @@
+ namespace dawn::native {
+
+ bool InheritsFromCAMetalLayer(void* obj) {
+-    id<NSObject> object = static_cast<id>(obj);
++    id<NSObject> object =
++#if TARGET_OS_IOS
++        (__bridge id)obj;
++#else
++        static_cast<id>(obj);
++#endif
++
+     return [object isKindOfClass:[CAMetalLayer class]];
+ }
+
+diff --git a/src/dawn/native/metal/SharedFenceMTL.mm b/src/dawn/native/metal/SharedFenceMTL.mm
+index bde8bfea07..f2f6459e91 100644
+--- a/src/dawn/native/metal/SharedFenceMTL.mm
++++ b/src/dawn/native/metal/SharedFenceMTL.mm
+@@ -40,7 +40,13 @@ ResultOrError<Ref<SharedFence>> SharedFence::Create(
+     DAWN_INVALID_IF(descriptor->sharedEvent == nullptr, "MTLSharedEvent is missing.");
+     if (@available(macOS 10.14, iOS 12.0, *)) {
+         return AcquireRef(new SharedFence(
+-            device, label, static_cast<id<MTLSharedEvent>>(descriptor->sharedEvent)));
++            device, label,
++#if TARGET_OS_IOS
++            (__bridge id<MTLSharedEvent>)(descriptor->sharedEvent)
++#else
++            static_cast<id<MTLSharedEvent>>(descriptor->sharedEvent)
++#endif
++            ));
+     } else {
+         return DAWN_INTERNAL_ERROR("MTLSharedEvent not supported.");
+     }
 diff --git a/src/tint/api/BUILD.cmake b/src/tint/api/BUILD.cmake
 index 0037d83276..6372c4ee77 100644
 --- a/src/tint/api/BUILD.cmake

From e1e75b8659317cbe29685454ccd2476c520469bf Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Wed, 11 Sep 2024 17:30:33 +1000
Subject: [PATCH 04/28] WGSL writer is only needed when Vulkan is being used

---
 cmake/external/onnxruntime_external_deps.cmake | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index cd4baa2b67c08..657f9cf5d51eb 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -663,7 +663,7 @@ if (onnxruntime_USE_WEBGPU)
   set(TINT_BUILD_GLSL_VALIDATOR OFF CACHE BOOL "" FORCE)
   set(TINT_BUILD_IR_BINARY OFF CACHE BOOL "" FORCE)
   set(TINT_BUILD_SPV_READER OFF CACHE BOOL "" FORCE)  # don't need. disabling is a large binary size saving
-  set(TINT_BUILD_WGSL_WRITER ON CACHE BOOL "" FORCE)  # needed to create cache key
+  set(TINT_BUILD_WGSL_WRITER OFF CACHE BOOL "" FORCE)
 
   # SPIR-V validation shouldn't be required given we're using Tint to create the SPIR-V.
   if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -679,6 +679,10 @@ if (onnxruntime_USE_WEBGPU)
     set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)
   endif()
 
+  if (ANDROID)
+    set(TINT_BUILD_WGSL_WRITER ON CACHE BOOL "" FORCE)  # needed to create cache key for Vulkan shader
+  endif()
+
   onnxruntime_fetchcontent_makeavailable(dawn)
 endif()
 

From afd202a9e5a33ef93c7567bc911acad0cd73a28e Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Fri, 13 Sep 2024 16:53:32 +1000
Subject: [PATCH 05/28] Fix build errors

---
 onnxruntime/core/providers/webgpu/webgpu_context.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h
index 3251364e85ce3..f74dda38fca04 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.h
@@ -110,11 +110,11 @@ class WebGpuContext final {
       : instance_{instance}, adapter_{adapter}, device_{device}, validation_mode_{validation_mode} {}
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WebGpuContext);
 
-  std::vector<const char*> WebGpuContext::GetEnabledAdapterToggles() const;
-  std::vector<const char*> WebGpuContext::GetEnabledDeviceToggles() const;
-  std::vector<const char*> WebGpuContext::GetDisabledDeviceToggles() const;
-  std::vector<wgpu::FeatureName> WebGpuContext::GetAvailableRequiredFeatures(const wgpu::Adapter& adapter) const;
-  wgpu::RequiredLimits WebGpuContext::GetRequiredLimits(const wgpu::Adapter& adapter) const;
+  std::vector<const char*> GetEnabledAdapterToggles() const;
+  std::vector<const char*> GetEnabledDeviceToggles() const;
+  std::vector<const char*> GetDisabledDeviceToggles() const;
+  std::vector<wgpu::FeatureName> GetAvailableRequiredFeatures(const wgpu::Adapter& adapter) const;
+  wgpu::RequiredLimits GetRequiredLimits(const wgpu::Adapter& adapter) const;
 
   std::once_flag init_flag_;
 

From bd25d1c1321a76160477cc7906e85e9ba4fdd894 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Fri, 13 Sep 2024 17:15:56 +1000
Subject: [PATCH 06/28] Fix transpose.cc build error.

---
 onnxruntime/core/providers/webgpu/tensor/transpose.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
index 68af858d515c2..86a9478a15b57 100644
--- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
@@ -52,7 +52,7 @@ const std::string AppendPermFunction(gsl::span<const size_t> perm) {
   ss.imbue(std::locale::classic());
   ss << "fn perm(i: y_indices_t)->x_indices_t {\n"
         "  var a: x_indices_t;\n";
-  for (auto i = 0; i < perm.size(); ++i) {
+  for (size_t i = 0; i < perm.size(); ++i) {
     ss << "  a[" << perm[i] << "] = i[" << i << "];\n";
   }
   ss << "  return a;\n"

From 3f9be822917f357d500413deb435dbd5d534592f Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Fri, 13 Sep 2024 17:35:58 +1000
Subject: [PATCH 07/28] Go back to WGSL writer being required on all builds

---
 cmake/external/onnxruntime_external_deps.cmake | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index 657f9cf5d51eb..d515e117e0718 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -663,7 +663,7 @@ if (onnxruntime_USE_WEBGPU)
   set(TINT_BUILD_GLSL_VALIDATOR OFF CACHE BOOL "" FORCE)
   set(TINT_BUILD_IR_BINARY OFF CACHE BOOL "" FORCE)
   set(TINT_BUILD_SPV_READER OFF CACHE BOOL "" FORCE)  # don't need. disabling is a large binary size saving
-  set(TINT_BUILD_WGSL_WRITER OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_WGSL_WRITER ON CACHE BOOL "" FORCE)  # needed to create cache key. runtime error if not enabled.
 
   # SPIR-V validation shouldn't be required given we're using Tint to create the SPIR-V.
   if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -680,7 +680,6 @@ if (onnxruntime_USE_WEBGPU)
   endif()
 
   if (ANDROID)
-    set(TINT_BUILD_WGSL_WRITER ON CACHE BOOL "" FORCE)  # needed to create cache key for Vulkan shader
   endif()
 
   onnxruntime_fetchcontent_makeavailable(dawn)

From 788e129b27c657ec1d162ab7dcc65e401a9dea95 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Mon, 16 Sep 2024 09:21:07 +1000
Subject: [PATCH 08/28] Refine external libraries to add dependencies

---
 .../external/onnxruntime_external_deps.cmake  | 67 +++++++++++++++++--
 1 file changed, 62 insertions(+), 5 deletions(-)

diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index d515e117e0718..daec9ad75e061 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -575,10 +575,6 @@ if (onnxruntime_USE_MIMALLOC)
   onnxruntime_fetchcontent_makeavailable(mimalloc)
 endif()
 
-#onnxruntime_EXTERNAL_LIBRARIES could contain onnx, onnx_proto,libprotobuf, cuda/cudnn,
-# dnnl/mklml, onnxruntime_codegen_tvm, tvm and pthread
-# pthread is always at the last
-set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${WIL_TARGET} nlohmann_json::nlohmann_json onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date ${ONNXRUNTIME_CLOG_TARGET_NAME})
 # The source code of onnx_proto is generated, we must build this lib first before starting to compile the other source code that uses ONNX protobuf types.
 # The other libs do not have the problem. All the sources are already there. We can compile them in any order.
 set(onnxruntime_EXTERNAL_DEPENDENCIES onnx_proto flatbuffers::flatbuffers)
@@ -701,8 +697,69 @@ endif()
 
 if(onnxruntime_USE_SNPE)
     include(external/find_snpe.cmake)
-    list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SNPE_NN_LIBS})
 endif()
 
+# add dependencies to the list of external libraries and populate onnxruntime_EXTERNAL_LIBRARIES with the result
+function(add_dependencies_to_external_libs output_var)
+  set (external_libs ${ARGN})
+  set(extended_deps)
+
+  function(get_dependencies input_target)
+    message(STATUS "get_dependencies: ${input_target}")
+    get_target_property(alias ${input_target} ALIASED_TARGET)
+    if(TARGET ${alias})
+      set(input_target ${alias})
+    endif()
+
+    if(${input_target} IN_LIST all_dependencies)
+      return()
+    endif()
+
+    list(APPEND all_dependencies ${input_target})
+
+    get_target_property(link_libraries ${input_target} LINK_LIBRARIES)
+    foreach(dependency IN LISTS link_libraries)
+      if(TARGET ${dependency})
+        get_dependencies(${dependency})
+      endif()
+    endforeach()
+
+    # get_target_property(link_libraries ${input_target} INTERFACE_LINK_LIBRARIES)
+    # foreach(dependency IN LISTS link_libraries)
+    #   if(TARGET ${dependency})
+    #     get_dependencies(${dependency})
+    #   endif()
+    # endforeach()
+
+    set(all_dependencies ${all_dependencies} PARENT_SCOPE)
+  endfunction()
+
+  foreach(external_lib IN LISTS external_libs)
+    message(STATUS "### Getting dependencies for : ${external_lib}")
+    get_dependencies(${external_lib})
+  endforeach()
+
+  foreach(dependency IN LISTS all_dependencies)
+    get_target_property(type ${dependency} TYPE)
+    if((${type} STREQUAL "STATIC_LIBRARY" OR ${type} STREQUAL "OBJECT_LIBRARY") AND
+       NOT ${dependency} IN_LIST external_libs_extended)
+      list(APPEND extended_deps ${dependency})
+    endif()
+  endforeach()
+
+  set(${output_var} ${extended_deps} PARENT_SCOPE)
+endfunction()
+
+# Create list of external libraries potentially added in this file.
+set(_external_libraries ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${SNPE_NN_LIBS} ${WIL_TARGET}
+                                   dawn::dawn_native dawn::dawn_proc nlohmann_json::nlohmann_json
+                                   onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface
+                                   flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date
+                                   ${ONNXRUNTIME_CLOG_TARGET_NAME})
+
+# add the dependencies as well. this is need in some places where we have to process the full list of libraries
+# e.g. iOS pre-linking.
+add_dependencies_to_external_libs(onnxruntime_EXTERNAL_LIBRARIES "${_external_libraries}")
+
 FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}  ORT_BINARY_DIR)
 FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR}  ORT_SOURCE_DIR)

From edf50561d4155a7dd8df21c653ca576d0aa727dc Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Mon, 16 Sep 2024 11:21:43 +1000
Subject: [PATCH 09/28] Try again

---
 cmake/external/helper_functions.cmake         | 59 +++++++++++
 .../external/onnxruntime_external_deps.cmake  | 97 ++++---------------
 2 files changed, 80 insertions(+), 76 deletions(-)

diff --git a/cmake/external/helper_functions.cmake b/cmake/external/helper_functions.cmake
index e3f2211f96158..f16b7baeeb240 100644
--- a/cmake/external/helper_functions.cmake
+++ b/cmake/external/helper_functions.cmake
@@ -14,6 +14,65 @@ function(set_folder_for_subdir_targets srcDir folderName)
   endforeach()
 endfunction()
 
+# Add a new library and its dependencies to an existing list.
+# The new library and any dependencies it has that are not already in the list will be prepended to the existing list.
+# Only STATIC_LIBRARY and OBJECT_LIBRARY targets are prepended. output_var will be set to the combined list.
+#
+# e.g. libA is new, and depends on libB and libC. libB is already in existing_dependencies.
+#      add_dependencies_to_external_libs(libA output_var "${existing_dependencies}") # need to quote existing list values
+#      before: existing_dependencies = [libB]
+#      after: output_var = [libA, libC, libB]
+function(add_dependencies_to_external_libs new_lib output_var)
+  set(existing_deps ${ARGN})  # every argument after output_var is treated as the existing list
+  set(new_deps)  # starts empty; filled in by get_dependencies()
+
+  function(get_dependencies input_target)  # recursively records input_target and its LINK_LIBRARIES targets in new_deps
+    get_target_property(alias ${input_target} ALIASED_TARGET)  # resolve an ALIAS target to the target it refers to
+    if(TARGET ${alias})
+      set(input_target ${alias})
+    endif()
+
+    # Already recorded (in the existing list or earlier in this walk), so there is no need to recurse again.
+    if(${input_target} IN_LIST existing_deps OR ${input_target} IN_LIST new_deps)
+      return()
+    endif()
+
+    list(APPEND new_deps ${input_target})
+
+    get_target_property(link_libraries ${input_target} LINK_LIBRARIES)
+    foreach(dependency IN LISTS link_libraries)
+      if(TARGET ${dependency})  # skip entries that are not known targets
+        get_dependencies(${dependency})
+      endif()
+    endforeach()
+
+    # Enable if needed. As this is primarily to update the items to link against, INTERFACE_LINK_LIBRARIES shouldn't be relevant.
+    # get_target_property(link_libraries ${input_target} INTERFACE_LINK_LIBRARIES)
+    # foreach(dependency IN LISTS link_libraries)
+    #   if(TARGET ${dependency})
+    #     get_dependencies(${dependency})
+    #   endif()
+    # endforeach()
+
+    set(new_deps ${new_deps} PARENT_SCOPE)  # make the additions visible in the caller's scope
+  endfunction()
+
+  message(STATUS "### Getting dependencies for ${new_lib}")
+  get_dependencies(${new_lib})
+
+  set(combined_deps)
+  foreach(dependency IN LISTS new_deps)
+    get_target_property(type ${dependency} TYPE)
+    if(${type} STREQUAL "STATIC_LIBRARY" OR ${type} STREQUAL "OBJECT_LIBRARY")  # other target types are dropped
+      list(APPEND combined_deps ${dependency})
+    endif()
+  endforeach()
+
+  list(APPEND combined_deps ${existing_deps})  # new entries end up in front of the existing ones
+  message(STATUS "Combined: ${combined_deps}")
+  set(${output_var} ${combined_deps} PARENT_SCOPE)
+endfunction()
+
 # This file was copied from cmake source with modifications:
 # 1. Add the EXCLUDE_FROM_ALL keyword when this function calls add_subdirectory. It will also resolve the
 #    'make install' issue.
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index daec9ad75e061..5930b292eaf75 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -575,6 +575,11 @@ if (onnxruntime_USE_MIMALLOC)
   onnxruntime_fetchcontent_makeavailable(mimalloc)
 endif()
 
+set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${WIL_TARGET} nlohmann_json::nlohmann_json
+                                   onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface
+                                   flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date
+                                   ${ONNXRUNTIME_CLOG_TARGET_NAME})
+
 # The source code of onnx_proto is generated, we must build this lib first before starting to compile the other source code that uses ONNX protobuf types.
 # The other libs do not have the problem. All the sources are already there. We can compile them in any order.
 set(onnxruntime_EXTERNAL_DEPENDENCIES onnx_proto flatbuffers::flatbuffers)
@@ -675,91 +680,31 @@ if (onnxruntime_USE_WEBGPU)
     set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)
   endif()
 
-  if (ANDROID)
-  endif()
-
   onnxruntime_fetchcontent_makeavailable(dawn)
-endif()
 
-message(STATUS "Finished fetching external dependencies")
-
-set(onnxruntime_LINK_DIRS )
-
-if (onnxruntime_USE_CUDA)
-      find_package(CUDAToolkit REQUIRED)
-
-      if(onnxruntime_CUDNN_HOME)
-        file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
-        set(CUDNN_PATH ${onnxruntime_CUDNN_HOME})
-      endif()
-      include(cuDNN)
+  # Add with dependencies in reverse order as new values are added at the front in each call
+  add_dependencies_to_external_libs(dawn::dawn_proc onnxruntime_EXTERNAL_LIBRARIES "${onnxruntime_EXTERNAL_LIBRARIES}")
+  add_dependencies_to_external_libs(dawn::native onnxruntime_EXTERNAL_LIBRARIES "${onnxruntime_EXTERNAL_LIBRARIES}")
 endif()
 
+set(onnxruntime_LINK_DIRS)
 if(onnxruntime_USE_SNPE)
-    include(external/find_snpe.cmake)
+  include(external/find_snpe.cmake)
+  list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SNPE_NN_LIBS})
 endif()
 
-# add dependencies to the list of external libraries and populate onnxruntime_EXTERNAL_LIBRARIES with the result
-function(add_dependencies_to_external_libs output_var)
-  set (external_libs ${ARGN})
-  set(extended_deps)
-
-  function(get_dependencies input_target)
-    message(STATUS "get_dependencies: ${input_target}")
-    get_target_property(alias ${input_target} ALIASED_TARGET)
-    if(TARGET ${alias})
-      set(input_target ${alias})
-    endif()
-
-    if(${input_target} IN_LIST all_dependencies)
-      return()
-    endif()
-
-    list(APPEND all_dependencies ${input_target})
-
-    get_target_property(link_libraries ${input_target} LINK_LIBRARIES)
-    foreach(dependency IN LISTS link_libraries)
-      if(TARGET ${dependency})
-        get_dependencies(${dependency})
-      endif()
-    endforeach()
-
-    # get_target_property(link_libraries ${input_target} INTERFACE_LINK_LIBRARIES)
-    # foreach(dependency IN LISTS link_libraries)
-    #   if(TARGET ${dependency})
-    #     get_dependencies(${dependency})
-    #   endif()
-    # endforeach()
-
-    set(all_dependencies ${all_dependencies} PARENT_SCOPE)
-  endfunction()
-
-  foreach(external_lib IN LISTS external_libs)
-    message(STATUS "### Getting dependencies for : ${external_lib}")
-    get_dependencies(${external_lib})
-  endforeach()
-
-  foreach(dependency IN LISTS all_dependencies)
-    get_target_property(type ${dependency} TYPE)
-    if((${type} STREQUAL "STATIC_LIBRARY" OR ${type} STREQUAL "OBJECT_LIBRARY") AND
-       NOT ${dependency} IN_LIST external_libs_extended)
-      list(APPEND extended_deps ${dependency})
-    endif()
-  endforeach()
-
-  set(${output_var} ${extended_deps} PARENT_SCOPE)
-endfunction()
+if (onnxruntime_USE_CUDA)
+  find_package(CUDAToolkit REQUIRED)
 
-# Create list of external libraries potentially added in this file.
-set(_external_libraries ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${SNPE_NN_LIBS} ${WIL_TARGET}
-                                   dawn::dawn_native dawn::dawn_proc nlohmann_json::nlohmann_json
-                                   onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface
-                                   flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date
-                                   ${ONNXRUNTIME_CLOG_TARGET_NAME})
+  if(onnxruntime_CUDNN_HOME)
+    file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
+    set(CUDNN_PATH ${onnxruntime_CUDNN_HOME})
+  endif()
 
-# add the dependencies as well. this is need in some places where we have to process the full list of libraries
-# e.g. iOS pre-linking.
-add_dependencies_to_external_libs(onnxruntime_EXTERNAL_LIBRARIES "${_external_libraries}")
+  include(cuDNN)
+endif()
 
 FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}  ORT_BINARY_DIR)
 FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR}  ORT_SOURCE_DIR)
+
+message(STATUS "Finished fetching external dependencies")

From 0ad21bfdf1287e15e12b679107c050cd872a99e5 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Mon, 16 Sep 2024 11:31:59 +1000
Subject: [PATCH 10/28] De-alias existing deps

---
 cmake/external/helper_functions.cmake          | 14 ++++++++++++--
 cmake/external/onnxruntime_external_deps.cmake |  2 +-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/cmake/external/helper_functions.cmake b/cmake/external/helper_functions.cmake
index f16b7baeeb240..506c199ad28ce 100644
--- a/cmake/external/helper_functions.cmake
+++ b/cmake/external/helper_functions.cmake
@@ -23,9 +23,19 @@ endfunction()
 #      before: existing_dependencies = [libB]
 #      after: output_var = [libA, libC, libB]
 function(add_dependencies_to_external_libs new_lib output_var)
-  set(existing_deps ${ARGN})
+  set(existing_deps_in ${ARGN})
   set(new_deps)
 
+  # need to de-alias existing_deps
+  foreach(existing_dep IN LISTS existing_deps_in)
+    get_target_property(alias ${existing_dep} ALIASED_TARGET)
+    if(TARGET ${alias})
+      list(APPEND existing_deps ${alias})
+    else()
+      list(APPEND existing_deps ${existing_dep})
+    endif()
+  endforeach()
+
   function(get_dependencies input_target)
     get_target_property(alias ${input_target} ALIASED_TARGET)
     if(TARGET ${alias})
@@ -68,7 +78,7 @@ function(add_dependencies_to_external_libs new_lib output_var)
     endif()
   endforeach()
 
-  list(APPEND combined_deps ${existing_deps})
+  list(APPEND combined_deps ${existing_deps_in})
   message(STATUS "Combined: ${combined_deps}")
   set(${output_var} ${combined_deps} PARENT_SCOPE)
 endfunction()
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index 5930b292eaf75..7bf42acf6827c 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -684,7 +684,7 @@ if (onnxruntime_USE_WEBGPU)
 
   # Add with dependencies in reverse order as new values are added at the front in each call
   add_dependencies_to_external_libs(dawn::dawn_proc onnxruntime_EXTERNAL_LIBRARIES "${onnxruntime_EXTERNAL_LIBRARIES}")
-  add_dependencies_to_external_libs(dawn::native onnxruntime_EXTERNAL_LIBRARIES "${onnxruntime_EXTERNAL_LIBRARIES}")
+  add_dependencies_to_external_libs(dawn::dawn_native onnxruntime_EXTERNAL_LIBRARIES "${onnxruntime_EXTERNAL_LIBRARIES}")
 endif()
 
 set(onnxruntime_LINK_DIRS)

From ee1d958b83d912e109cc1cdcc9a87ea76161988d Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Sun, 15 Sep 2024 19:30:07 -0700
Subject: [PATCH 11/28] Fix C++20 errors

---
 include/onnxruntime/core/common/logging/logging.h        | 9 +++++----
 onnxruntime/core/common/logging/sinks/ostream_sink.cc    | 3 ++-
 .../core/platform/apple/logging/apple_log_sink.mm        | 4 +++-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index 55b5c25d1a222..9a2d8dd71b049 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -58,10 +58,11 @@ namespace logging {
 
 using Timestamp = std::chrono::time_point<std::chrono::system_clock>;
 
-// TODO: When other compilers support std::chrono::operator<<, update this.
-// TODO: Check support for other compilers' version before enable C++20 for other compilers.
-// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4.
-#if __cplusplus >= 202002L && __MAC_OS_X_VERSION_MAX_ALLOWED >= 140400L
+// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4, but the target macOS version must be
+// >= 13.3 for it to be used.
+#if __cplusplus >= 202002L && \
+  (!defined(__MAC_OS_X_VERSION_MAX_ALLOWED) || __MAC_OS_X_VERSION_MAX_ALLOWED >= 140400L) && \
+  (!defined(__MAC_OS_X_VERSION_MIN_REQUIRED) || __MAC_OS_X_VERSION_MIN_REQUIRED >= 130300L)
 namespace timestamp_ns = std::chrono;
 #else
 namespace timestamp_ns = ::date;
diff --git a/onnxruntime/core/common/logging/sinks/ostream_sink.cc b/onnxruntime/core/common/logging/sinks/ostream_sink.cc
index 033f4d2573cda..82af514ef3c63 100644
--- a/onnxruntime/core/common/logging/sinks/ostream_sink.cc
+++ b/onnxruntime/core/common/logging/sinks/ostream_sink.cc
@@ -45,7 +45,8 @@ void OStreamSink::SendImpl(const Timestamp& timestamp, const std::string& logger
   }
 #endif
 
-  msg << timestamp << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
+  timestamp_ns::operator<<(msg, timestamp); // handle ambiguity with C++20 where date and std::chrono have operator<<
+  msg << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
       << message.Location().ToString() << "] " << message.Message();
 
 #ifndef ORT_MINIMAL_BUILD
diff --git a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
index 78614ffd2819d..88f2a828cf445 100644
--- a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
+++ b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
@@ -15,7 +15,9 @@
 void AppleLogSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) {
   using timestamp_ns::operator<<;
   std::ostringstream msg;
-  msg << timestamp << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
+
+  timestamp_ns::operator<<(msg, timestamp); // handle ambiguity with C++20 where date and std::chrono have operator<<
+  msg << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
       << message.Location().ToString() << "] " << message.Message();
   NSLog(@"%s", msg.str().c_str());
 }

From 3cb0c6a4de53a9da8e4026ca76c44ae30dff44f1 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Mon, 16 Sep 2024 12:59:44 +1000
Subject: [PATCH 12/28] Fix C++20 errors

---
 include/onnxruntime/core/common/logging/logging.h        | 9 +++++----
 onnxruntime/core/common/logging/sinks/ostream_sink.cc    | 3 ++-
 .../core/platform/apple/logging/apple_log_sink.mm        | 4 +++-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index 55b5c25d1a222..9a2d8dd71b049 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -58,10 +58,11 @@ namespace logging {
 
 using Timestamp = std::chrono::time_point<std::chrono::system_clock>;
 
-// TODO: When other compilers support std::chrono::operator<<, update this.
-// TODO: Check support for other compilers' version before enable C++20 for other compilers.
-// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4.
-#if __cplusplus >= 202002L && __MAC_OS_X_VERSION_MAX_ALLOWED >= 140400L
+// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4, but the target macOS version must be
+// >= 13.3 for it to be used.
+#if __cplusplus >= 202002L && \
+  (!defined(__MAC_OS_X_VERSION_MAX_ALLOWED) || __MAC_OS_X_VERSION_MAX_ALLOWED >= 140400L) && \
+  (!defined(__MAC_OS_X_VERSION_MIN_REQUIRED) || __MAC_OS_X_VERSION_MIN_REQUIRED >= 130300L)
 namespace timestamp_ns = std::chrono;
 #else
 namespace timestamp_ns = ::date;
diff --git a/onnxruntime/core/common/logging/sinks/ostream_sink.cc b/onnxruntime/core/common/logging/sinks/ostream_sink.cc
index 033f4d2573cda..82af514ef3c63 100644
--- a/onnxruntime/core/common/logging/sinks/ostream_sink.cc
+++ b/onnxruntime/core/common/logging/sinks/ostream_sink.cc
@@ -45,7 +45,8 @@ void OStreamSink::SendImpl(const Timestamp& timestamp, const std::string& logger
   }
 #endif
 
-  msg << timestamp << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
+  timestamp_ns::operator<<(msg, timestamp); // handle ambiguity with C++20 where date and std::chrono have operator<<
+  msg << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
       << message.Location().ToString() << "] " << message.Message();
 
 #ifndef ORT_MINIMAL_BUILD
diff --git a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
index 78614ffd2819d..88f2a828cf445 100644
--- a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
+++ b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
@@ -15,7 +15,9 @@
 void AppleLogSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) {
   using timestamp_ns::operator<<;
   std::ostringstream msg;
-  msg << timestamp << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
+
+  timestamp_ns::operator<<(msg, timestamp); // handle ambiguity with C++20 where date and std::chrono have operator<<
+  msg << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
       << message.Location().ToString() << "] " << message.Message();
   NSLog(@"%s", msg.str().c_str());
 }

From 7b85ddaf6d2b8f957289c26273b57ba22479bd28 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Mon, 16 Sep 2024 18:18:15 +1000
Subject: [PATCH 13/28] Update some apple infra

---
 cmake/onnxruntime.cmake                          | 16 ++++++++++++++--
 .../apple/build_and_assemble_apple_pods.py       |  2 ++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 81f5c20070c81..bee2cabe7460b 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -89,10 +89,22 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
   # create Info.plist for the framework and podspec for CocoaPods (optional)
   set(MACOSX_FRAMEWORK_NAME "onnxruntime")
   set(MACOSX_FRAMEWORK_IDENTIFIER "com.microsoft.onnxruntime")
-  # Need to include CoreML as a weaklink for CocoaPods package if the EP is enabled
+
+  # Setup weak frameworks for macOS/iOS. 'weak' as the CoreML or WebGPU EPs are optionally enabled.
   if(onnxruntime_USE_COREML)
-    set(APPLE_WEAK_FRAMEWORK "\\\"CoreML\\\"")
+    list(APPEND _weak_frameworks "\\\"CoreML\\\"")
+  endif()
+
+  if(onnxruntime_USE_WEBGPU)
+    # TODO: Dawn includes all these. TBD if we need any others. As we're not doing anything graphical we may not.
+    # Cocoa (MacOS only), Foundation, IOKit, IOSurface, QuartzCore
+    list(APPEND _weak_frameworks "\\\"Metal\\\"")
   endif()
+
+  if (_weak_frameworks)
+    string(JOIN ", " APPLE_WEAK_FRAMEWORK ${_weak_frameworks})
+  endif()
+
   set(INFO_PLIST_PATH "${CMAKE_CURRENT_BINARY_DIR}/Info.plist")
   configure_file(${REPO_ROOT}/cmake/Info.plist.in ${INFO_PLIST_PATH})
   configure_file(
diff --git a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py
index 71aeb9e7b0304..dd037c17ae3b3 100755
--- a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py
+++ b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py
@@ -133,6 +133,8 @@ def main():
             str(build_dir / "framework_out"),
             "--variant",
             package_variant.name,
+            "--test_project_stage_dir",  # use a specific directory so it's easier to debug
+            str(build_dir / "test_apple_packages_staging"),
         ]
 
         run(test_apple_packages_args)

From 1da2bcea0576f2b13c4907052ef6010217b8ac7e Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Mon, 16 Sep 2024 18:21:37 +1000
Subject: [PATCH 14/28] Enable webgpu in some configs to test via CI

---
 .../apple/default_full_apple_framework_build_settings.json       | 1 +
 .../github/apple/default_full_ios_framework_build_settings.json  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json
index 84d7e355ed5b4..6175ac3a0ad58 100644
--- a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json
+++ b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json
@@ -19,6 +19,7 @@
             "--build_apple_framework",
             "--use_coreml",
             "--use_xnnpack",
+            "--use_webgpu",
             "--skip_tests",
             "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF"
         ],
diff --git a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json
index e2d8f70c02cf3..91646a8fbeb38 100644
--- a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json
+++ b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json
@@ -17,6 +17,7 @@
             "--parallel",
             "--build_apple_framework",
             "--use_coreml",
+            "--use_webgpu",
             "--skip_tests",
             "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF"
         ],

From 259aa5df90b506c7ccb6a63ca4d4c9c1102b9c8e Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Mon, 16 Sep 2024 18:25:05 +1000
Subject: [PATCH 15/28] Update one more CI

---
 .../azure-pipelines/templates/mac-cpu-packing-jobs.yml      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml
index 01ec3b5a2f8ca..045de0da1fee1 100644
--- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml
@@ -98,7 +98,7 @@ jobs:
     - template: mac-cpu-packaging-steps.yml
       parameters:
         MacosArch: ${{ parameters.MacosArch }}
-        AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --use_coreml --cmake_extra_defines CMAKE_OSX_ARCHITECTURES="arm64;x86_64"
+        AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES="arm64;x86_64"
         BuildJava: false
         BuildNodejs: false
         WithCache: ${{ parameters.WithCache }}
@@ -110,7 +110,7 @@ jobs:
     - template: mac-cpu-packaging-steps.yml
       parameters:
         MacosArch: ${{ parameters.MacosArch }}
-        AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64
+        AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml  --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64
         BuildJava: true
         BuildNodejs: true
         WithCache: ${{ parameters.WithCache }}
@@ -122,7 +122,7 @@ jobs:
     - template: mac-cpu-packaging-steps.yml
       parameters:
         MacosArch: ${{ parameters.MacosArch }}
-        AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml
+        AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml  --use_webgpu
         BuildJava: true
         BuildNodejs: true
         WithCache: ${{ parameters.WithCache }}

From 64ccd2de32ca70da2490abdfa5e91245bd51a2d8 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Tue, 17 Sep 2024 16:55:38 +1000
Subject: [PATCH 16/28] Fix some build and test issues

---
 cmake/external/helper_functions.cmake         | 69 -------------------
 .../external/onnxruntime_external_deps.cmake  | 10 ++-
 cmake/onnxruntime.cmake                       | 49 +++++++++++--
 .../main/java/ai/onnxruntime/OrtProvider.java |  4 +-
 .../webgpu/webgpu_execution_provider.cc       |  4 +-
 5 files changed, 54 insertions(+), 82 deletions(-)

diff --git a/cmake/external/helper_functions.cmake b/cmake/external/helper_functions.cmake
index 506c199ad28ce..e3f2211f96158 100644
--- a/cmake/external/helper_functions.cmake
+++ b/cmake/external/helper_functions.cmake
@@ -14,75 +14,6 @@ function(set_folder_for_subdir_targets srcDir folderName)
   endforeach()
 endfunction()
 
-# Add a new library and it's dependencies to an existing list.
-# The new library and any dependencies it has that are not already in the list will be prepended to the existing list.
-# output_var will be set to the combined list.
-#
-# e.g. libA is new, and depends on libB and libC. libB is already in extended_list.
-#      add_dependencies_to_external_lib(libA output_var "${existing_dependencies}") # need to quote existing list values
-#      before: existing_dependencies = [libB]
-#      after: output_var = [libA, libC, libB]
-function(add_dependencies_to_external_libs new_lib output_var)
-  set(existing_deps_in ${ARGN})
-  set(new_deps)
-
-  # need to de-alias existing_deps
-  foreach(existing_dep IN LISTS existing_deps_in)
-    get_target_property(alias ${existing_dep} ALIASED_TARGET)
-    if(TARGET ${alias})
-      list(APPEND existing_deps ${alias})
-    else()
-      list(APPEND existing_deps ${existing_dep})
-    endif()
-  endforeach()
-
-  function(get_dependencies input_target)
-    get_target_property(alias ${input_target} ALIASED_TARGET)
-    if(TARGET ${alias})
-      set(input_target ${alias})
-    endif()
-
-    # if this already exists we don't need to recurse any more
-    if(${input_target} IN_LIST existing_deps OR ${input_target} IN_LIST new_deps)
-      return()
-    endif()
-
-    list(APPEND new_deps ${input_target})
-
-    get_target_property(link_libraries ${input_target} LINK_LIBRARIES)
-    foreach(dependency IN LISTS link_libraries)
-      if(TARGET ${dependency})
-        get_dependencies(${dependency})
-      endif()
-    endforeach()
-
-    # Add if needed. As this is to primarily update the items to link against, interface libraries shouldn't be relevant
-    # get_target_property(link_libraries ${input_target} INTERFACE_LINK_LIBRARIES)
-    # foreach(dependency IN LISTS link_libraries)
-    #   if(TARGET ${dependency})
-    #     get_dependencies(${dependency})
-    #   endif()
-    # endforeach()
-
-    set(new_deps ${new_deps} PARENT_SCOPE)
-  endfunction()
-
-  message(STATUS "### Getting dependencies for ${new_lib}")
-  get_dependencies(${new_lib})
-
-  set(combined_deps)
-  foreach(dependency IN LISTS new_deps)
-    get_target_property(type ${dependency} TYPE)
-    if(${type} STREQUAL "STATIC_LIBRARY" OR ${type} STREQUAL "OBJECT_LIBRARY")
-      list(APPEND combined_deps ${dependency})
-    endif()
-  endforeach()
-
-  list(APPEND combined_deps ${existing_deps_in})
-  message(STATUS "Combined: ${combined_deps}")
-  set(${output_var} ${combined_deps} PARENT_SCOPE)
-endfunction()
-
 # This file was copied from cmake source with modifications:
 # 1. Add the EXCLUDE_FROM_ALL keyword when this function calls add_subdirectory. It will also resolve the
 #    'make install' issue.
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index 7bf42acf6827c..22aae32d05744 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -642,7 +642,7 @@ if (onnxruntime_USE_WEBGPU)
     PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
   )
 
-  # use dawn::native_objects and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
+  # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
   set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
   set(DAWN_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
   set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
@@ -682,9 +682,7 @@ if (onnxruntime_USE_WEBGPU)
 
   onnxruntime_fetchcontent_makeavailable(dawn)
 
-  # Add with dependencies in reverse order as new values are added at the front in each call
-  add_dependencies_to_external_libs(dawn::dawn_proc onnxruntime_EXTERNAL_LIBRARIES "${onnxruntime_EXTERNAL_LIBRARIES}")
-  add_dependencies_to_external_libs(dawn::dawn_native onnxruntime_EXTERNAL_LIBRARIES "${onnxruntime_EXTERNAL_LIBRARIES}")
+  list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native dawn::dawn_proc)
 endif()
 
 set(onnxruntime_LINK_DIRS)
@@ -704,7 +702,7 @@ if (onnxruntime_USE_CUDA)
   include(cuDNN)
 endif()
 
-FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}  ORT_BINARY_DIR)
-FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR}  ORT_SOURCE_DIR)
+FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR)
+FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR)
 
 message(STATUS "Finished fetching external dependencies")
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index bee2cabe7460b..2f4ffbb6adbd7 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -376,16 +376,57 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK)
     endif()
   endforeach()
 
+  set(_processed_libs)  # keep track of processed libraries to skip any duplicate dependencies
+  function(add_symlink_for_static_lib_and_dependencies lib)
+    function(process cur_target)
+      # de-alias if applicable so a consistent target name is used
+      get_target_property(alias ${cur_target} ALIASED_TARGET)
+      if(TARGET ${alias})
+        set(cur_target ${alias})
+      endif()
+
+      if(${cur_target} IN_LIST _processed_libs OR ${cur_target} IN_LIST lib_and_dependencies)
+        return()
+      endif()
+
+      list(APPEND lib_and_dependencies ${cur_target})
+
+      get_target_property(link_libraries ${cur_target} LINK_LIBRARIES)
+      foreach(dependency ${link_libraries})
+        if(TARGET ${dependency})
+          process(${dependency})
+        endif()
+      endforeach()
+
+      set(lib_and_dependencies ${lib_and_dependencies} PARENT_SCOPE)
+    endfunction()
+
+    set(lib_and_dependencies)
+    process(${lib})
+
+    foreach(_target ${lib_and_dependencies})
+      get_target_property(type ${_target} TYPE)
+      if(${type} STREQUAL "STATIC_LIBRARY")
+        # message(STATUS "Adding symlink for ${_target}")
+        add_custom_command(TARGET onnxruntime POST_BUILD
+                           COMMAND ${CMAKE_COMMAND} -E create_symlink
+                             $<TARGET_FILE:${_target}> ${STATIC_LIB_DIR}/$<TARGET_LINKER_FILE_NAME:${_target}>)
+      endif()
+    endforeach()
+
+    list(APPEND _processed_libs ${lib_and_dependencies})
+    set(_processed_libs ${_processed_libs} PARENT_SCOPE)
+  endfunction()
+
   # for external libraries we create a symlink to the .a file
   foreach(_LIB ${onnxruntime_EXTERNAL_LIBRARIES})
-    if(NOT TARGET ${_LIB}) # if we didn't build from source. it may not a target
+    if(NOT TARGET ${_LIB}) # if we didn't build from source it may not be a target
       continue()
     endif()
+
     GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE)
     if(_LIB_TYPE STREQUAL "STATIC_LIBRARY")
-      add_custom_command(TARGET onnxruntime POST_BUILD
-                         COMMAND ${CMAKE_COMMAND} -E create_symlink
-                           $<TARGET_FILE:${_LIB}> ${STATIC_LIB_DIR}/$<TARGET_LINKER_FILE_NAME:${_LIB}>)
+      add_symlink_for_static_lib_and_dependencies(${_LIB})
     endif()
   endforeach()
 
diff --git a/java/src/main/java/ai/onnxruntime/OrtProvider.java b/java/src/main/java/ai/onnxruntime/OrtProvider.java
index ae9cb9f908629..b06f884896ee8 100644
--- a/java/src/main/java/ai/onnxruntime/OrtProvider.java
+++ b/java/src/main/java/ai/onnxruntime/OrtProvider.java
@@ -40,7 +40,9 @@ public enum OrtProvider {
   /** The XNNPACK execution provider. */
   XNNPACK("XnnpackExecutionProvider"),
   /** The Azure remote endpoint execution provider. */
-  AZURE("AzureExecutionProvider");
+  AZURE("AzureExecutionProvider"),
+  /** The WebGPU execution provider */
+  WEBGPU("WebGpuExecutionProvider");
 
   private static final Map<String, OrtProvider> valueMap = new HashMap<>(values().length);
 
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index d049cbbf64560..c689909e73c4b 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -115,7 +115,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9,
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9, Cosh);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9, Asinh);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9, Acosh);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9, Atanh);
+// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9, Atanh); TEMPORARY - Doesn't handle 1.0f -> inf with Metal
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 6, 12, Tanh);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Tanh);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, Not);
@@ -428,7 +428,7 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
       KERNEL_CREATE_INFO(9, Cosh),
       KERNEL_CREATE_INFO(9, Asinh),
       KERNEL_CREATE_INFO(9, Acosh),
-      KERNEL_CREATE_INFO(9, Atanh),
+      // KERNEL_CREATE_INFO(9, Atanh),  TEMPORARY - Doesn't handle 1.0f -> inf with Metal
       KERNEL_CREATE_INFO_VERSIONED(6, 12, Tanh),
       KERNEL_CREATE_INFO(13, Tanh),
       // KERNEL_CREATE_INFO(1, Not),

From ce23c21bd8b8cf2d6663cb4efd73f34820186aa3 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Wed, 18 Sep 2024 16:31:14 +1000
Subject: [PATCH 17/28] Expand check on whether std::chrono::operator<< can be
 used to cover catalyst. Add additional required frameworks.

---
 cmake/onnxruntime.cmake                       |  4 ++-
 .../onnxruntime/core/common/logging/logging.h | 35 +++++++++++++++----
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 2f4ffbb6adbd7..d601f15b3a3d8 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -97,7 +97,9 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
 
   if(onnxruntime_USE_WEBGPU)
     # TODO: Dawn includes all these. TBD if we need any others. As we're not doing anything graphical we may not.
-    # Cocoa (MacOS only), Foundation, IOKit, IOSurface, QuartzCore
+    # Cocoa (MacOS only), Foundation, IOKit
+    list(APPEND _weak_frameworks "\\\"QuartzCore\\\"")
+    list(APPEND _weak_frameworks "\\\"IOSurface\\\"")
     list(APPEND _weak_frameworks "\\\"Metal\\\"")
   endif()
 
diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index 9a2d8dd71b049..d16def7b91cd4 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -58,16 +58,37 @@ namespace logging {
 
 using Timestamp = std::chrono::time_point<std::chrono::system_clock>;
 
-// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4, but the target macOS version must be
-// >= 13.3 for it to be used.
-#if __cplusplus >= 202002L && \
-  (!defined(__MAC_OS_X_VERSION_MAX_ALLOWED) || __MAC_OS_X_VERSION_MAX_ALLOWED >= 140400L) && \
-  (!defined(__MAC_OS_X_VERSION_MIN_REQUIRED) || __MAC_OS_X_VERSION_MIN_REQUIRED >= 130300L)
-namespace timestamp_ns = std::chrono;
+// C++20 has operator<< in std::chrono for Timestamp type but some mac builds have additional checks on the
+// target deployment.
+#define _USE_CXX20_STD_CHRONO __cplusplus >= 202002L
+
+// Apply constraints for mac builds
+#if __APPLE__
+  #include <TargetConditionals.h>
+  // Catalyst check must be first as it has both TARGET_OS_MACCATALYST and TARGET_OS_MAC set
+  #if TARGET_OS_MACCATALYST
+    // Mac Catalyst needs a minimum deployment target of 16.3 to use std::chrono::operator<<
+    #if (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 160300)
+      #undef _USE_CXX20_STD_CHRONO
+    #endif
+  #elif TARGET_OS_MAC
+    // Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4,
+    // but the target macOS version must be >= 13.3 for it to be used.
+    #if (defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED < 140400) || \
+        (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 130300)
+      #undef _USE_CXX20_STD_CHRONO
+    #endif
+  #endif
+#endif
+
+#if defined(_USE_CXX20_STD_CHRONO)
+  namespace timestamp_ns = std::chrono;
 #else
-namespace timestamp_ns = ::date;
+  namespace timestamp_ns = ::date;
 #endif
 
+#undef _USE_CXX20_STD_CHRONO
+
 #ifndef NDEBUG
 ORT_ATTRIBUTE_UNUSED static bool vlog_enabled = true;  // Set directly based on your needs.
 #else

From 51db660147071f4ba6f4316c2465a3314f4bb92f Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Wed, 18 Sep 2024 17:10:26 +1000
Subject: [PATCH 18/28] Fix condition. Leave in some pragmas for debugging
 build failures short term

---
 .../onnxruntime/core/common/logging/logging.h    | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index d16def7b91cd4..6384594c07123 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -81,10 +81,20 @@ using Timestamp = std::chrono::time_point<std::chrono::system_clock>;
   #endif
 #endif
 
-#if defined(_USE_CXX20_STD_CHRONO)
-  namespace timestamp_ns = std::chrono;
+#define STRINGIFY(x) STRINGIFY2(x)
+#define STRINGIFY2(x) #x
+
+#pragma message("_USE_CXX20_STD_CHRONO is " STRINGIFY(_USE_CXX20_STD_CHRONO))
+#pragma message("TARGET_OS_MAC is " STRINGIFY(TARGET_OS_MAC))
+#pragma message("TARGET_OS_MACCATALYST is " STRINGIFY(TARGET_OS_MACCATALYST))
+#pragma message("__IPHONE_OS_VERSION_MIN_REQUIRED is " STRINGIFY(__IPHONE_OS_VERSION_MIN_REQUIRED))
+
+#if _USE_CXX20_STD_CHRONO
+namespace timestamp_ns = std::chrono;
+#pragma message("Using std::chrono")
 #else
-  namespace timestamp_ns = ::date;
+namespace timestamp_ns = ::date;
+#pragma message("Using ::date")
 #endif
 
 #undef _USE_CXX20_STD_CHRONO

From b712ebc5802ee5a87ff35a5b5db1e00d5464e653 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Wed, 18 Sep 2024 17:37:21 +1000
Subject: [PATCH 19/28] Add dummy header

---
 .../providers/webgpu/webgpu_provider_factory.h     | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h

diff --git a/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h b/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h
new file mode 100644
index 0000000000000..0b45b847d651f
--- /dev/null
+++ b/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h
@@ -0,0 +1,14 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// Dummy file to provide a signal in the ONNX Runtime C cocoapod as to whether the WebGPU EP was included in the build.
+// If it was, this file will be included in the cocoapod, and a test like this can be used:
+//
+//   #if __has_include(<onnxruntime/webgpu_provider_factory.h>)
+//     #define WEBGPU_EP_AVAILABLE 1
+//   #else
+//     #define WEBGPU_EP_AVAILABLE 0
+//   #endif
+
+// The WebGPU EP can be enabled via the generic SessionOptionsAppendExecutionProvider method, so no direct usage of
+// the provider factory is required.

From 45f3bbfa13eec78b3087961d45179f5bb992bb08 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Wed, 18 Sep 2024 17:52:40 +1000
Subject: [PATCH 20/28] Update apple uitest apps to run webgpu tests

---
 .../ios_package_uitest_cpp_api.mm             | 27 ++++++++++++++----
 .../macos_package_uitest_cpp_api.mm           | 28 +++++++++++++++----
 2 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
index d145a00b1348f..3783d684b891d 100644
--- a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
+++ b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
@@ -12,16 +12,20 @@
 #include <onnxruntime/onnxruntime_cxx_api.h>
 
 #if __has_include(<onnxruntime/coreml_provider_factory.h>)
-#define COREML_EP_AVAILABLE 1
+  #define COREML_EP_AVAILABLE 1
+  #include <onnxruntime/coreml_provider_factory.h>
 #else
-#define COREML_EP_AVAILABLE 0
+  #define COREML_EP_AVAILABLE 0
 #endif
 
-#if COREML_EP_AVAILABLE
-#include <onnxruntime/coreml_provider_factory.h>
+#if __has_include(<onnxruntime/webgpu_provider_factory.h>)
+  #define WEBGPUL_EP_AVAILABLE 1
+  // WebGPU EP doesn't require including the header as it's enabled via AddExecutionProvider
+#else
+  #define WEBGPU_EP_AVAILABLE 0
 #endif
 
-void testSigmoid(const char* modelPath, bool useCoreML) {
+void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU = false) {
   // This is an e2e test for ORT C++ API
   Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "testCppAPI");
 
@@ -38,6 +42,12 @@ void testSigmoid(const char* modelPath, bool useCoreML) {
   (void)useCoreML;
 #endif
 
+  if (useWebGPU) {
+    std::unordered_map<std::string, std::string> provider_options;
+    // set provider options if needed. e.g. deviceId
+    session_options.OrtSessionOptionsAppendExecutionProvider("WebGPU", provider_options);
+  }
+
   Ort::Session session(env, modelPath, session_options);
 
   size_t input_tensor_size = 3 * 4 * 5;
@@ -96,7 +106,7 @@ - (NSString*)getFilePath {
 }
 
 - (void)testCppAPI_Basic {
-  testSigmoid([self getFilePath].UTF8String, false /* useCoreML */);
+  testSigmoid([self getFilePath].UTF8String);
 }
 
 #if COREML_EP_AVAILABLE
@@ -105,4 +115,9 @@ - (void)testCppAPI_Basic_CoreML {
 }
 #endif
 
+#if WEBGPU_EP_AVAILABLE
+- (void)testCppAPI_Basic_WebGPU {
+  testSigmoid([self getFilePath].UTF8String, false /* useCoreML */, true /* useWebGPU */);
+}
+#endif
 @end
diff --git a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
index 613c6e545939f..b9a7074593488 100644
--- a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
+++ b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
@@ -12,16 +12,20 @@
 #include <onnxruntime/onnxruntime_cxx_api.h>
 
 #if __has_include(<onnxruntime/coreml_provider_factory.h>)
-#define COREML_EP_AVAILABLE 1
+  #define COREML_EP_AVAILABLE 1
+  #include <onnxruntime/coreml_provider_factory.h>
 #else
-#define COREML_EP_AVAILABLE 0
+  #define COREML_EP_AVAILABLE 0
 #endif
 
-#if COREML_EP_AVAILABLE
-#include <onnxruntime/coreml_provider_factory.h>
+#if __has_include(<onnxruntime/webgpu_provider_factory.h>)
+  #define WEBGPUL_EP_AVAILABLE 1
+  // WebGPU EP doesn't require including the header as it's enabled via AddExecutionProvider
+#else
+  #define WEBGPU_EP_AVAILABLE 0
 #endif
 
-void testSigmoid(const char* modelPath, bool useCoreML) {
+void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU = false) {
   // This is an e2e test for ORT C++ API
   Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "testCppAPI");
 
@@ -38,6 +42,12 @@ void testSigmoid(const char* modelPath, bool useCoreML) {
   (void)useCoreML;
 #endif
 
+  if (useWebGPU) {
+    std::unordered_map<std::string, std::string> provider_options;
+    // set provider options if needed. e.g. deviceId
+    session_options.OrtSessionOptionsAppendExecutionProvider("WebGPU", provider_options);
+  }
+
   Ort::Session session(env, modelPath, session_options);
 
   size_t input_tensor_size = 3 * 4 * 5;
@@ -96,7 +106,7 @@ - (NSString*)getFilePath {
 }
 
 - (void)testCppAPI_Basic {
-  testSigmoid([self getFilePath].UTF8String, false /* useCoreML */);
+  testSigmoid([self getFilePath].UTF8String);
 }
 
 #if COREML_EP_AVAILABLE
@@ -105,4 +115,10 @@ - (void)testCppAPI_Basic_CoreML {
 }
 #endif
 
+#if WEBGPU_EP_AVAILABLE
+- (void)testCppAPI_Basic_WebGPU {
+  testSigmoid([self getFilePath].UTF8String, false /* useCoreML */, true /* useWebGPU */);
+}
+#endif
+
 @end

From 9b888af338ae51b4952f2fa5221d1b851525e8f5 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Wed, 18 Sep 2024 20:14:16 +1000
Subject: [PATCH 21/28] Disable WebGPU in mac-catalyst build. APIs used by Dawn
 are not available on catalyst.

---
 .../apple/default_full_ios_framework_build_settings.json       | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json
index 91646a8fbeb38..4c2c9442ab217 100644
--- a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json
+++ b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json
@@ -17,7 +17,6 @@
             "--parallel",
             "--build_apple_framework",
             "--use_coreml",
-            "--use_webgpu",
             "--skip_tests",
             "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF"
         ],
@@ -25,12 +24,14 @@
             "--ios",
             "--use_xcode",
             "--use_xnnpack",
+            "--use_webgpu",
             "--apple_deploy_target=13.0"
         ],
         "iphonesimulator": [
             "--ios",
             "--use_xcode",
             "--use_xnnpack",
+            "--use_webgpu",
             "--apple_deploy_target=13.0"
         ],
         "macabi":[

From 210a760ade1e64062031a1441495d028a5138c80 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Thu, 19 Sep 2024 09:01:22 +1000
Subject: [PATCH 22/28] Fix AppendExecutionProvider call

---
 .../ios_package_testUITests/ios_package_uitest_cpp_api.mm       | 2 +-
 .../macos_package_testUITests/macos_package_uitest_cpp_api.mm   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
index 3783d684b891d..952209a1de244 100644
--- a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
+++ b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
@@ -45,7 +45,7 @@ void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU =
   if (useWebGPU) {
     std::unordered_map<std::string, std::string> provider_options;
     // set provider options if needed. e.g. deviceId
-    session_options.OrtSessionOptionsAppendExecutionProvider("WebGPU", provider_options);
+    session_options.AppendExecutionProvider("WebGPU", provider_options);
   }
 
   Ort::Session session(env, modelPath, session_options);
diff --git a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
index b9a7074593488..807fee92144a1 100644
--- a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
+++ b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
@@ -45,7 +45,7 @@ void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU =
   if (useWebGPU) {
     std::unordered_map<std::string, std::string> provider_options;
     // set provider options if needed. e.g. deviceId
-    session_options.OrtSessionOptionsAppendExecutionProvider("WebGPU", provider_options);
+    session_options.AppendExecutionProvider("WebGPU", provider_options);
   }
 
   Ort::Session session(env, modelPath, session_options);

From edb998034cd69f9196ea02ce35d1faed7e0629f3 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Tue, 24 Sep 2024 08:40:46 +1000
Subject: [PATCH 23/28] reduce some diffs

---
 cmake/external/onnxruntime_external_deps.cmake | 10 +++++-----
 cmake/onnxruntime.cmake                        |  2 --
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index 22aae32d05744..6f54ce1b4face 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -686,11 +686,6 @@ if (onnxruntime_USE_WEBGPU)
 endif()
 
 set(onnxruntime_LINK_DIRS)
-if(onnxruntime_USE_SNPE)
-  include(external/find_snpe.cmake)
-  list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SNPE_NN_LIBS})
-endif()
-
 if (onnxruntime_USE_CUDA)
   find_package(CUDAToolkit REQUIRED)
 
@@ -702,6 +697,11 @@ if (onnxruntime_USE_CUDA)
   include(cuDNN)
 endif()
 
+if(onnxruntime_USE_SNPE)
+  include(external/find_snpe.cmake)
+  list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SNPE_NN_LIBS})
+endif()
+
 FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR)
 FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR)
 
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 312370d96d04a..6c7b5cf2667dc 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -96,8 +96,6 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
   endif()
 
   if(onnxruntime_USE_WEBGPU)
-    # TODO: Dawn includes all these. TBD if we need any others. As we're not doing anything graphical we may not.
-    # Cocoa (MacOS only), Foundation, IOKit
     list(APPEND _weak_frameworks "\\\"QuartzCore\\\"")
     list(APPEND _weak_frameworks "\\\"IOSurface\\\"")
     list(APPEND _weak_frameworks "\\\"Metal\\\"")

From 698e6ae1051100d8226b268824cdba373fe88292 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Tue, 24 Sep 2024 11:25:09 +1000
Subject: [PATCH 24/28] Enable in Android build for automated testing. Fix
 comment.

---
 .../ios_package_uitest_cpp_api.mm                    | 12 ++++++------
 .../macos_package_uitest_cpp_api.mm                  | 12 ++++++------
 .../android/default_full_aar_build_settings.json     |  1 +
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
index 952209a1de244..0546d840471cc 100644
--- a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
+++ b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
@@ -12,17 +12,17 @@
 #include <onnxruntime/onnxruntime_cxx_api.h>
 
 #if __has_include(<onnxruntime/coreml_provider_factory.h>)
-  #define COREML_EP_AVAILABLE 1
-  #include <onnxruntime/coreml_provider_factory.h>
+#define COREML_EP_AVAILABLE 1
+#include <onnxruntime/coreml_provider_factory.h>
 #else
-  #define COREML_EP_AVAILABLE 0
+#define COREML_EP_AVAILABLE 0
 #endif
 
 #if __has_include(<onnxruntime/webgpu_provider_factory.h>)
-  #define WEBGPUL_EP_AVAILABLE 1
-  // WebGPU EP doesn't require including the header as it's enabled via AddExecutionProvider
+#define WEBGPUL_EP_AVAILABLE 1
+// WebGPU EP doesn't require including the header as it's enabled via AppendExecutionProvider
 #else
-  #define WEBGPU_EP_AVAILABLE 0
+#define WEBGPU_EP_AVAILABLE 0
 #endif
 
 void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU = false) {
diff --git a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
index 807fee92144a1..efdbd9b768ec0 100644
--- a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
+++ b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
@@ -12,17 +12,17 @@
 #include <onnxruntime/onnxruntime_cxx_api.h>
 
 #if __has_include(<onnxruntime/coreml_provider_factory.h>)
-  #define COREML_EP_AVAILABLE 1
-  #include <onnxruntime/coreml_provider_factory.h>
+#define COREML_EP_AVAILABLE 1
+#include <onnxruntime/coreml_provider_factory.h>
 #else
-  #define COREML_EP_AVAILABLE 0
+#define COREML_EP_AVAILABLE 0
 #endif
 
 #if __has_include(<onnxruntime/webgpu_provider_factory.h>)
-  #define WEBGPUL_EP_AVAILABLE 1
-  // WebGPU EP doesn't require including the header as it's enabled via AddExecutionProvider
+#define WEBGPUL_EP_AVAILABLE 1
+// WebGPU EP doesn't require including the header as it's enabled via AppendExecutionProvider
 #else
-  #define WEBGPU_EP_AVAILABLE 0
+#define WEBGPU_EP_AVAILABLE 0
 #endif
 
 void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU = false) {
diff --git a/tools/ci_build/github/android/default_full_aar_build_settings.json b/tools/ci_build/github/android/default_full_aar_build_settings.json
index b0eff75812673..f08f246748a5a 100644
--- a/tools/ci_build/github/android/default_full_aar_build_settings.json
+++ b/tools/ci_build/github/android/default_full_aar_build_settings.json
@@ -16,6 +16,7 @@
         "--build_shared_lib",
         "--use_nnapi",
         "--use_xnnpack",
+        "--use_webgpu",
         "--skip_tests"
     ]
 }

From b9e98a79e395a81c81e42069a8b06577c956b1f4 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Tue, 24 Sep 2024 11:50:10 +1000
Subject: [PATCH 25/28] Fix typo in #define

---
 .../ios_package_testUITests/ios_package_uitest_cpp_api.mm       | 2 +-
 .../macos_package_testUITests/macos_package_uitest_cpp_api.mm   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
index 0546d840471cc..32b4b32e299d6 100644
--- a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
+++ b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
@@ -19,7 +19,7 @@
 #endif
 
 #if __has_include(<onnxruntime/webgpu_provider_factory.h>)
-#define WEBGPUL_EP_AVAILABLE 1
+#define WEBGPU_EP_AVAILABLE 1
 // WebGPU EP doesn't require including the header as it's enabled via AppendExecutionProvider
 #else
 #define WEBGPU_EP_AVAILABLE 0
diff --git a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
index efdbd9b768ec0..86001b6cb50a5 100644
--- a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
+++ b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
@@ -19,7 +19,7 @@
 #endif
 
 #if __has_include(<onnxruntime/webgpu_provider_factory.h>)
-#define WEBGPUL_EP_AVAILABLE 1
+#define WEBGPU_EP_AVAILABLE 1
 // WebGPU EP doesn't require including the header as it's enabled via AppendExecutionProvider
 #else
 #define WEBGPU_EP_AVAILABLE 0

From 4b55f23d1bff28932442b07afc487687681a5cca Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Tue, 24 Sep 2024 15:19:01 +1000
Subject: [PATCH 26/28] Fix some macos warnings.

---
 onnxruntime/core/providers/webgpu/shader_variable.h | 3 +++
 onnxruntime/core/providers/webgpu/tensor/where.cc   | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h
index 326c6814410de..ce68fc04993e6 100644
--- a/onnxruntime/core/providers/webgpu/shader_variable.h
+++ b/onnxruntime/core/providers/webgpu/shader_variable.h
@@ -67,6 +67,9 @@ class ShaderIndicesHelper {
  public:
   ShaderIndicesHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims);
 
+  ShaderIndicesHelper(ShaderIndicesHelper&&) = default;
+  ShaderIndicesHelper& operator=(ShaderIndicesHelper&&) = default;
+
   inline int NumComponents() const { return num_components_; }
 
   // create a WGSL expression ({varname}_indices_t) for getting indices from offset.
diff --git a/onnxruntime/core/providers/webgpu/tensor/where.cc b/onnxruntime/core/providers/webgpu/tensor/where.cc
index 31806a0af1741..1d58538a7489c 100644
--- a/onnxruntime/core/providers/webgpu/tensor/where.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/where.cc
@@ -59,7 +59,7 @@ Status WhereProgram::GenerateShaderCode(ShaderHelper& shader) const {
   const auto& b_input = shader.AddInput("b_data", ShaderUsage::UseUniform);
   const auto& output = shader.AddOutput("output_data", ShaderUsage::UseUniform);
 
-  auto expression = [](const std::string& a, const std::string& b, const std::string& c) -> const auto {
+  const auto expression = [](const std::string& a, const std::string& b, const std::string& c) -> auto {
     return "select(" + b + ", " + a + ", " + c + ")";
   };
   std::string assignment;
@@ -74,10 +74,10 @@ Status WhereProgram::GenerateShaderCode(ShaderHelper& shader) const {
     const auto& b_indices = shader.AddIndices("b_indices");
     const auto& output_indices = shader.AddIndices("output_indices");
 
-    auto single_assignment =
+    const auto single_assignment =
         [&expression, &output_indices, &a_indices, &b_indices, &c_indices](
             const std::string& rest_str, const std::string& x, const std::string& type_cast = "")
-        -> const auto {
+        -> auto {
       const std::string a_expression = "a_data[index_a" + x + "][component_a" + x + "]";
       const std::string b_expression = "b_data[index_b" + x + "][component_b" + x + "]";
       const std::string c_expression = "bool(c_data[index_c" + x + "] & (0xffu << (component_c" + x + " * 8)))";

From af7c39ea9d3798405fd64d58bd592979732487e0 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Wed, 25 Sep 2024 13:13:43 +1000
Subject: [PATCH 27/28] Fix ATanH on Metal

---
 .../providers/webgpu/math/unary_elementwise_ops.cc   | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc
index 3b43c87fb0c82..9e8117aa34a92 100644
--- a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc
+++ b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc
@@ -165,7 +165,19 @@ WEBGPU_ELEMENTWISE_KERNEL(Asinh, 9, WebGpuSupportedFloatTypes())
 WEBGPU_ELEMENTWISE_IMPL(Acosh, "acosh(a)")
 WEBGPU_ELEMENTWISE_KERNEL(Acosh, 9, WebGpuSupportedFloatTypes())
 
+#if __APPLE__
+// Metal returns 0 for values >= 1.0.
+// Need custom impl to return +inf for 1.0 (by dividing 1 by 0), and NaN for > 1.0 (by dividing 0 by 0)
+WEBGPU_ELEMENTWISE_IMPL(Atanh,
+                        "select("
+                        " select(x_value_t(1.0), x_value_t(0.0), a > x_value_t(1.0)) / x_value_t(0.0),"
+                        " atanh(a),"
+                        " a < x_value_t(1.0))",
+                        "",
+                        ShaderUsage::UseValueTypeAlias)
+#else
 WEBGPU_ELEMENTWISE_IMPL(Atanh, "atanh(a)")
+#endif
 WEBGPU_ELEMENTWISE_KERNEL(Atanh, 9, WebGpuSupportedFloatTypes())
 
 WEBGPU_ELEMENTWISE_IMPL(Not, "!a")

From 770023f05d4c98ac958da77377d2a9b80f2c1e57 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Thu, 26 Sep 2024 09:45:09 +1000
Subject: [PATCH 28/28] Minor cleanups

---
 cmake/onnxruntime.cmake                                       | 1 +
 .../core/providers/webgpu/webgpu_execution_provider.cc        | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 6c7b5cf2667dc..b1d797ca16adc 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -376,6 +376,7 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK)
     endif()
   endforeach()
 
+  # helper function that recurses to also handle static library dependencies of the ORT external libraries
   set(_processed_libs)  # keep track of processed libraries to skip any duplicate dependencies
   function(add_symlink_for_static_lib_and_dependencies lib)
     function(process cur_target)
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index ea0615c0f4017..f5d66d6a24134 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -115,7 +115,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9,
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9, Cosh);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9, Asinh);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9, Acosh);
-// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9, Atanh); TEMPORARY - Doesn't handle 1.0f -> inf with Metal
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 9, Atanh);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 6, 12, Tanh);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Tanh);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, Not);
@@ -435,7 +435,7 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
       KERNEL_CREATE_INFO(9, Cosh),
       KERNEL_CREATE_INFO(9, Asinh),
       KERNEL_CREATE_INFO(9, Acosh),
-      // KERNEL_CREATE_INFO(9, Atanh),  TEMPORARY - Doesn't handle 1.0f -> inf with Metal
+      KERNEL_CREATE_INFO(9, Atanh),
       KERNEL_CREATE_INFO_VERSIONED(6, 12, Tanh),
       KERNEL_CREATE_INFO(13, Tanh),
       KERNEL_CREATE_INFO(1, Not),