diff --git a/src/BuildOnLinux.cmake b/src/BuildOnLinux.cmake
index 9ddc51bda..ba53d46d1 100644
--- a/src/BuildOnLinux.cmake
+++ b/src/BuildOnLinux.cmake
@@ -3,20 +3,21 @@
 set(TORCH_XPU_OPS_LIBRARIES)
 set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)
 
-add_library(
-  torch_xpu_ops
-  STATIC
-  ${ATen_XPU_CPP_SRCS}
-  ${ATen_XPU_NATIVE_CPP_SRCS}
-  ${ATen_XPU_GEN_SRCS}
-  ${ATen_XPU_XCCL_SRCS})
-
-if(USE_C10D_XCCL)
-  target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL)
-  target_link_libraries(torch_xpu_ops PUBLIC torch::xccl)
-endif()
 
 if(BUILD_SEPARATE_OPS)
+  add_library(
+    torch_xpu_ops
+    STATIC
+    ${ATen_XPU_CPP_SRCS}
+    ${ATen_XPU_NATIVE_CPP_SRCS}
+    ${ATen_XPU_GEN_SRCS}
+    ${ATen_XPU_XCCL_SRCS})
+
+  if(USE_C10D_XCCL)
+    target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL)
+    target_link_libraries(torch_xpu_ops PUBLIC torch::xccl)
+  endif()
+  list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
   foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
     get_filename_component(name ${sycl_src} NAME_WLE REALPATH)
     set(sycl_lib torch-xpu-ops-sycl-${name})
@@ -31,6 +32,19 @@ if(BUILD_SEPARATE_OPS)
     install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
   endforeach()
 elseif(BUILD_SPLIT_KERNEL_LIB)
+  add_library(
+    torch_xpu_ops
+    STATIC
+    ${ATen_XPU_CPP_SRCS}
+    ${ATen_XPU_NATIVE_CPP_SRCS}
+    ${ATen_XPU_GEN_SRCS}
+    ${ATen_XPU_XCCL_SRCS})
+
+  if(USE_C10D_XCCL)
+    target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL)
+    target_link_libraries(torch_xpu_ops PUBLIC torch::xccl)
+  endif()
+  list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
   # Split SYCL kernels into 4 libraries as categories 1) Unary+Binary 2) Reduce 3) Foreach 4) Others.
   set(ATen_XPU_SYCL_UNARY_BINARY_SRCS)
   set(ATen_XPU_SYCL_REDUCE_SRCS)
@@ -110,18 +124,24 @@ elseif(BUILD_SPLIT_KERNEL_LIB)
   install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 else()
   sycl_add_library(
-    torch_xpu_ops_sycl_kernels
-    SHARED
+    xpu_sycl
+    STATIC
+    CXX_SOURCES  ${ATen_XPU_CPP_SRCS} ${ATen_XPU_NATIVE_CPP_SRCS} ${ATen_XPU_GEN_SRCS} ${ATen_XPU_XCCL_SRCS}
     SYCL_SOURCES ${ATen_XPU_SYCL_SRCS})
-  target_link_libraries(torch_xpu_ops PUBLIC torch_xpu_ops_sycl_kernels)
-  list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_sycl_kernels)
-
-  install(TARGETS torch_xpu_ops_sycl_kernels DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  add_library(torch_xpu_ops ALIAS xpu_sycl)
+  set_target_properties(xpu_sycl PROPERTIES OUTPUT_NAME torch_xpu_ops)
+  set(SYCL_TARGET xpu_sycl)
+  if(USE_C10D_XCCL)
+    target_compile_definitions(xpu_sycl PRIVATE USE_C10D_XCCL)
+    target_link_libraries(xpu_sycl  PUBLIC torch::xccl)
+  endif()
+
+  install(TARGETS xpu_sycl DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  list(APPEND TORCH_XPU_OPS_LIBRARIES xpu_sycl)
+  
 endif()
 set(SYCL_LINK_LIBRARIES_KEYWORD)
 
-list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
-
 foreach(lib ${TORCH_XPU_OPS_LIBRARIES})
   # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake
   torch_compile_options(${lib})
diff --git a/src/BuildOnWindows.cmake b/src/BuildOnWindows.cmake
index 0f571c247..58dabb53f 100644
--- a/src/BuildOnWindows.cmake
+++ b/src/BuildOnWindows.cmake
@@ -3,25 +3,27 @@
 set(TORCH_XPU_OPS_LIBRARIES)
 set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)
 
-add_library(
-  torch_xpu_ops
-  STATIC
-  ${ATen_XPU_CPP_SRCS})
-set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\")
-target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB})
-
-add_library(
-  torch_xpu_ops_aten
-  SHARED
-  ${ATen_XPU_NATIVE_CPP_SRCS}
-  ${ATen_XPU_GEN_SRCS})
-install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
-target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu)
-target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu)
-target_link_libraries(torch_xpu_ops_aten PUBLIC c10)
 
 if(BUILD_SEPARATE_OPS)
+
+  add_library(
+    torch_xpu_ops
+    STATIC
+    ${ATen_XPU_CPP_SRCS})
+  set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\")
+  target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB})
+
+  add_library(
+    torch_xpu_ops_aten
+    SHARED
+    ${ATen_XPU_NATIVE_CPP_SRCS}
+    ${ATen_XPU_GEN_SRCS})
+  install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC c10)
+
   foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
     get_filename_component(name ${sycl_src} NAME_WLE REALPATH)
     set(sycl_lib torch-xpu-ops-sycl-${name})
@@ -35,88 +37,70 @@ if(BUILD_SEPARATE_OPS)
     # Decouple with PyTorch cmake definition.
     install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
   endforeach()
+  list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
+  list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten)
 elseif(BUILD_SPLIT_KERNEL_LIB)
-  # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others.
-  set(ATen_XPU_SYCL_BINARY_SRCS)
-  set(ATen_XPU_SYCL_UNARY_SRCS)
+
+  add_library(
+    torch_xpu_ops
+    STATIC
+    ${ATen_XPU_CPP_SRCS})
+  set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\")
+  target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB})
+
+  add_library(
+    torch_xpu_ops_aten
+    SHARED
+    ${ATen_XPU_NATIVE_CPP_SRCS}
+    ${ATen_XPU_GEN_SRCS})
+  install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC c10)
+
+  # Split SYCL kernels into 4 libraries by category: 1) Unary+Binary (also Copy, Pow and Activation) 2) Reduce 3) Foreach 4) Others.
+  set(ATen_XPU_SYCL_UNARY_BINARY_SRCS)
   set(ATen_XPU_SYCL_REDUCE_SRCS)
-  set(ATen_XPU_SYCL_ACTIVATION_SRCS)
   set(ATen_XPU_SYCL_FOREACH_SRCS)
-  set(ATen_XPU_SYCL_TENSOR_SRCS)
-  set(ATen_XPU_SYCL_NORM_LOSS_SRCS)
-  set(ATen_XPU_SYCL_POLY_SRCS)
-  set(ATen_XPU_SYCL_DISTRIBUTION_SRCS)
   set(ATen_XPU_SYCL_OTHERS_SRCS)
   foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
     string(REGEX MATCH "Binary" IS_BINARY ${sycl_src})
     string(REGEX MATCH "Unary" IS_UNARY ${sycl_src})
-    # Resolve cyclic dependences between
-    # torch_xpu_ops_sycl_unary_binary_kernels.dll and
-    # torch_xpu_ops_sycl_kernels.dll. Move definition and invoke of kernels
-    # into a same kernel library. Here we move elementwise kernel pow and copy
-    # into torch_xpu_ops_sycl_unary_binary_kernels.dll.
     string(REGEX MATCH "Pow" IS_POW ${sycl_src})
     string(REGEX MATCH "Copy" IS_COPY ${sycl_src})
+    string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src})
     string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src})
     string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src})
-    string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src})
-    string(REGEX MATCH "Tensor" IS_TENSOR ${sycl_src})
-    string(REGEX MATCH "Norm" IS_NORM ${sycl_src})
-    string(REGEX MATCH "Loss" IS_LOSS ${sycl_src})
-    string(REGEX MATCH "Polynomial" IS_POLY ${sycl_src})
-    #Move resize kernel to Norm and Loss lib, to resolve symbol.
-    string(REGEX MATCH "Resize" IS_RESIZE ${sycl_src})
-    string(REGEX MATCH "Distribution" IS_DISTRIBUTION ${sycl_src})
 
     if(NOT IS_FOREACH STREQUAL "")
       list(APPEND ATen_XPU_SYCL_FOREACH_SRCS ${sycl_src})
-    elseif(NOT IS_BINARY STREQUAL "")
-      list(APPEND ATen_XPU_SYCL_BINARY_SRCS ${sycl_src})
-    elseif(NOT IS_UNARY STREQUAL "" OR NOT IS_COPY STREQUAL "" OR NOT IS_POW STREQUAL "")
-      list(APPEND ATen_XPU_SYCL_UNARY_SRCS ${sycl_src})
     elseif(NOT IS_REDUCE STREQUAL "")
       list(APPEND ATen_XPU_SYCL_REDUCE_SRCS ${sycl_src})
+    elseif(NOT IS_UNARY STREQUAL "" OR NOT IS_BINARY STREQUAL "")
+      list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src})
+    elseif(NOT IS_COPY STREQUAL "" OR NOT IS_POW STREQUAL "")
+      list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src})
     elseif(NOT IS_ACTIVATION STREQUAL "")
-      list(APPEND ATen_XPU_SYCL_ACTIVATION_SRCS ${sycl_src})
-    elseif(NOT IS_TENSOR STREQUAL "")
-      list(APPEND ATen_XPU_SYCL_TENSOR_SRCS ${sycl_src})
-    elseif(NOT IS_DISTRIBUTION STREQUAL "")
-      list(APPEND ATen_XPU_SYCL_DISTRIBUTION_SRCS ${sycl_src})
-    elseif(NOT IS_NORM STREQUAL "" OR NOT IS_LOSS STREQUAL "" OR NOT IS_RESIZE STREQUAL "")
-      list(APPEND ATen_XPU_SYCL_NORM_LOSS_SRCS ${sycl_src})
-    elseif(NOT IS_POLY STREQUAL "")
-      list(APPEND ATen_XPU_SYCL_POLY_SRCS ${sycl_src})
+      list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src})
     else()
       list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src})
     endif()
   endforeach()
-  # Binary kernel lib
-  set(sycl_binary_lib torch_xpu_ops_sycl_binary_kernels)
+  # Unary binary kernel lib
+  set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels)
   sycl_add_library(
-    ${sycl_binary_lib}
+    ${sycl_unary_binary_lib}
     SHARED
-    SYCL_SOURCES ${ATen_XPU_SYCL_BINARY_SRCS})
-  target_compile_definitions(${sycl_binary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
-  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_binary_lib})
-  target_link_libraries(${sycl_binary_lib} PUBLIC torch_xpu)
-  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_binary_lib})
+    SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS})
+  target_compile_definitions(${sycl_unary_binary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_unary_binary_lib})
+  target_link_libraries(${sycl_unary_binary_lib} PUBLIC torch_xpu)
+  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib})
 
   # Decouple with PyTorch cmake definition.
-  install(TARGETS ${sycl_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-
-  # Unary kernel lib
-  set(sycl_unary_lib torch_xpu_ops_sycl_unary_kernels)
-  sycl_add_library(
-    ${sycl_unary_lib}
-    SHARED
-    SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_SRCS})
-  target_compile_definitions(${sycl_unary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
-  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_unary_lib})
-  target_link_libraries(${sycl_unary_lib} PUBLIC torch_xpu)
-  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_lib})
+  install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 
-  # Decouple with PyTorch cmake definition.
-  install(TARGETS ${sycl_unary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 
   # Reduce kernel lib
   set(sycl_reduce_lib torch_xpu_ops_sycl_reduce_kernels)
@@ -132,19 +116,6 @@ elseif(BUILD_SPLIT_KERNEL_LIB)
   # Decouple with PyTorch cmake definition.
   install(TARGETS ${sycl_reduce_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 
-  # Activation kernel lib
-  set(sycl_activation_lib torch_xpu_ops_sycl_activation_kernels)
-  sycl_add_library(
-    ${sycl_activation_lib}
-    SHARED
-    SYCL_SOURCES ${ATen_XPU_SYCL_ACTIVATION_SRCS})
-  target_compile_definitions(${sycl_activation_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
-  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_activation_lib})
-  target_link_libraries(${sycl_activation_lib} PUBLIC torch_xpu)
-  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_activation_lib})
-
-  # Decouple with PyTorch cmake definition.
-  install(TARGETS ${sycl_activation_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 
   # Foreach kernel lib
   set(sycl_foreach_lib torch_xpu_ops_sycl_foreach_kernels)
@@ -160,96 +131,101 @@ elseif(BUILD_SPLIT_KERNEL_LIB)
   # Decouple with PyTorch cmake definition.
   install(TARGETS ${sycl_foreach_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 
-  # Tensor kernel lib
-  set(sycl_tensor_lib torch_xpu_ops_sycl_tensor_kernels)
-  sycl_add_library(
-    ${sycl_tensor_lib}
-    SHARED
-    SYCL_SOURCES ${ATen_XPU_SYCL_TENSOR_SRCS})
-  target_compile_definitions(${sycl_tensor_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
-  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_tensor_lib})
-  target_link_libraries(${sycl_tensor_lib} PUBLIC torch_xpu)
-  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_tensor_lib})
-
-  # Decouple with PyTorch cmake definition.
-  install(TARGETS ${sycl_tensor_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-
-  # Norm and Loss kernel lib
-  set(sycl_norm_loss_lib torch_xpu_ops_sycl_norm_loss_kernels)
+  
+  # Other kernel lib
+  set(sycl_lib torch_xpu_ops_sycl_kernels)
   sycl_add_library(
-    ${sycl_norm_loss_lib}
+    ${sycl_lib}
     SHARED
-    SYCL_SOURCES ${ATen_XPU_SYCL_NORM_LOSS_SRCS})
-  target_compile_definitions(${sycl_norm_loss_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
-  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_norm_loss_lib})
-  target_link_libraries(${sycl_norm_loss_lib} PUBLIC torch_xpu)
-  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_norm_loss_lib})
+    SYCL_SOURCES ${ATen_XPU_SYCL_OTHERS_SRCS})
+  target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib})
+  target_link_libraries(${sycl_lib} PUBLIC torch_xpu)
+  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib})
 
   # Decouple with PyTorch cmake definition.
-  install(TARGETS ${sycl_norm_loss_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 
-  # Polynomial kernel lib
-  set(sycl_poly_lib torch_xpu_ops_sycl_poly_kernels)
-  sycl_add_library(
-    ${sycl_poly_lib}
-    SHARED
-    SYCL_SOURCES ${ATen_XPU_SYCL_POLY_SRCS})
-  target_compile_definitions(${sycl_poly_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
-  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_poly_lib})
-  target_link_libraries(${sycl_poly_lib} PUBLIC torch_xpu)
-  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_poly_lib})
+  list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
+  list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten)
+else()
 
-  # Decouple with PyTorch cmake definition.
-  install(TARGETS ${sycl_poly_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  # On Windows, the kernel object files cannot all be combined into a single
+  # library: kernel objects compiled on Windows are much larger than those on
+  # Linux, so a combined library would exceed 4GB, which is over the size limit
+  # of a single library on Windows. The Windows libraries will be merged into
+  # one once the compiler is fixed.
+  add_library(
+    torch_xpu_ops
+    STATIC
+    ${ATen_XPU_CPP_SRCS}
+    ${ATen_XPU_NATIVE_CPP_SRCS}
+    ${ATen_XPU_GEN_SRCS})
+  install(TARGETS torch_xpu_ops DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  target_compile_definitions(torch_xpu_ops PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
+  # Split SYCL kernels into 2 libraries by category: 1) Unary+Binary+Reduce+Pow+Copy+Activation+Foreach 2) Others.
+  set(ATen_XPU_SYCL_UNARY_BINARY_SRCS)
+  set(ATen_XPU_SYCL_OTHERS_SRCS)
+  foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
+    string(REGEX MATCH "Binary" IS_BINARY ${sycl_src})
+    string(REGEX MATCH "Unary" IS_UNARY ${sycl_src})
+    string(REGEX MATCH "Pow" IS_POW ${sycl_src})
+    string(REGEX MATCH "Copy" IS_COPY ${sycl_src})
+    string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src})
+    string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src})
+    string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src})
 
-  # Distribution kernel lib
-  set(sycl_dist_lib torch_xpu_ops_sycl_dist_kernels)
+    if(NOT IS_FOREACH STREQUAL "")
+      list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src})
+    elseif(NOT IS_REDUCE STREQUAL "")
+      list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src})
+    elseif(NOT IS_UNARY STREQUAL "" OR NOT IS_BINARY STREQUAL "")
+      list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src})
+    elseif(NOT IS_COPY STREQUAL "" OR NOT IS_POW STREQUAL "")
+      list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src})
+    elseif(NOT IS_ACTIVATION STREQUAL "")
+      list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src})
+    else()
+      list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src})
+    endif()
+  endforeach()
+  # Kernel lib for Unary, Binary, Reduce, Pow, Copy, Activation and Foreach sources (target name keeps the unary_binary label)
+  set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels)
   sycl_add_library(
-    ${sycl_dist_lib}
-    SHARED
-    SYCL_SOURCES ${ATen_XPU_SYCL_DISTRIBUTION_SRCS})
-  target_compile_definitions(${sycl_dist_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
-  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_dist_lib})
-  target_link_libraries(${sycl_dist_lib} PUBLIC torch_xpu)
-  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_dist_lib})
+    ${sycl_unary_binary_lib}
+    STATIC
+    SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS})
+  target_compile_definitions(${sycl_unary_binary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
+  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib})
 
   # Decouple with PyTorch cmake definition.
-  install(TARGETS ${sycl_dist_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 
   # Other kernel lib
   set(sycl_lib torch_xpu_ops_sycl_kernels)
   sycl_add_library(
     ${sycl_lib}
-    SHARED
+    STATIC
     SYCL_SOURCES ${ATen_XPU_SYCL_OTHERS_SRCS})
   target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
-  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib})
-  target_link_libraries(${sycl_lib} PUBLIC torch_xpu)
   list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib})
 
   # Decouple with PyTorch cmake definition.
   install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-else()
-  # Internal file name is decided by the target name. On windows, torch_xpu_ops_sycl_kernels
-  # is too long in device code linkage command.
-  sycl_add_library(
-    xpu_sycl
-    SHARED
-    SYCL_SOURCES ${ATen_XPU_SYCL_SRCS})
-  target_compile_definitions(xpu_sycl PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
-  target_link_libraries(torch_xpu_ops_aten PUBLIC xpu_sycl)
-  target_link_libraries(xpu_sycl PUBLIC torch_xpu)
-  list(APPEND TORCH_XPU_OPS_LIBRARIES xpu_sycl)
 
-  set_target_properties(xpu_sycl PROPERTIES OUTPUT_NAME torch_xpu_ops_sycl_kernels)
-  # Decouple with PyTorch cmake definition.
-  install(TARGETS xpu_sycl DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  target_link_libraries(torch_xpu_ops
+      PUBLIC
+      ${sycl_unary_binary_lib}
+      ${sycl_lib}
+  )
+  target_link_options(torch_xpu_ops PUBLIC
+      "-WHOLEARCHIVE:$<TARGET_FILE:${sycl_unary_binary_lib}>"
+      "-WHOLEARCHIVE:$<TARGET_FILE:${sycl_lib}>"
+  )
+  list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
 endif()
 set(SYCL_LINK_LIBRARIES_KEYWORD)
 
-list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops)
-list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten)
-
 foreach(lib ${TORCH_XPU_OPS_LIBRARIES})
   # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake
   torch_compile_options(${lib})
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7a427e294..7d2f8e4f2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,6 +5,7 @@ set(ATen_XPU_CPP_SRCS)
 set(ATen_XPU_NATIVE_CPP_SRCS)
 set(ATen_XPU_SYCL_SRCS)
 set(ATen_XPU_XCCL_SRCS)
+set(SYCL_TARGET torch_xpu_ops)
 
 set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory")
 
@@ -28,5 +29,5 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake)
 if(CLANG_FORMAT)
   file(GLOB_RECURSE ALL_CSRCS ${TORCH_XPU_OPS_ROOT}/**.[ch] ${TORCH_XPU_OPS_ROOT}/**.[ch]pp)
   add_custom_target(CL_FORMAT_CSRCS COMMAND ${CLANG_FORMAT_EXEC} -i -style=file ${ALL_CSRCS})
-  add_dependencies(torch_xpu_ops CL_FORMAT_CSRCS)
+  add_dependencies(${SYCL_TARGET} CL_FORMAT_CSRCS)
 endif()