intel · AlexeySachkov · Sep 11, 2024 · May 31, 2024 · Jun 27, 2024 · Jun 27, 2024
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_occupancy_queries.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_occupancy_queries.asciidoc
@@ -0,0 +1,154 @@
+= sycl_ext_oneapi_group_occupancy_queries
+
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+:dpcpp: pass:[DPC++]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+
+== Notice
+
+[%hardbreaks]
+Copyright (C) 2024 Intel Corporation.  All rights reserved.
+
+Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
+of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by
+permission by Khronos.
+
+
+== Contact
+
+To report problems with this extension, please open a new issue at:
+
+https://github.com/intel/llvm/issues
+
+
+== Dependencies
+
+This extension is written against the SYCL 2020 revision 5 specification.  All
+references below to the "core SYCL specification" or to section numbers in the
+SYCL specification refer to that revision.
+
+This extension also depends on the following other SYCL extensions:
+
+* link:../proposed/sycl_ext_oneapi_launch_queries.asciidoc[
+  sycl_ext_oneapi_launch_queries]
+
+
+== Status
+
+This is an experimental extension specification, intended to provide early
+access to features and gather community feedback.  Interfaces defined in this
+specification are implemented in {dpcpp}, but they are not finalized and may
+change incompatibly in future versions of {dpcpp} without prior notice.
+*Shipping software products should not rely on APIs defined in this
+specification.*
+
+
+== Overview
+
+This extension is based on the kernel-queue-specific specific querying mechanism
+introduced by the sycl_ext_oneapi_launch_queries extension.
+
+The purpose of queries the to be added is to aid occupancy based calculations
+for kernel launches based on hardware occupancy per compute unit granularity.
+The queries take in account the kernel resources and user-specified constraints,
+such as, but not limited to, local (work-group) size and dynamic work-group
+local memory (in bytes). The motivation behind is to aid the tuning of kernels,
+by being able to design the algorithm's implementation to maintain the highest
+possible occupancy in a portable way.
+
+List of currently planned queries.
+* max_num_work_group_occupancy_per_cu
+
+[source,c++]
+----
+sycl::queue q{};
+auto bundle = sycl::get_kernel_bundle(q.get_context());
+auto kernel = bundle.get_kernel<class KernelName>();
+
+auto wgSizeRange = sycl::range{32, 1, 1};
+size_t localMemorySize = 32;
+
+namespace syclex = sycl::ext::oneapi::experimental;
+uint32_t maxWGsPerCU = kernel.ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_group_occupancy_per_cu>(
+      q, wgSizeRange, localMemorySize);
+----
+
+NOTE: SYCL 2020 requires lambdas to be named in order to locate the associated
+`sycl::kernel` object used to query information descriptors. Reducing the
+verbosity of the queries shown above is left to a future extension.
+
+
+== Specification
+
+=== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification.  An implementation supporting this extension must predefine the
+macro `SYCL_EXT_ONEAPI_GROUP_OCCUPANCY_QUERIES` to one of the values defined in
+the table below. Applications can test for the existence of this macro to
+determine if the implementation supports this feature, or applications can test
+the macro's value to determine which of the extension's features the
+implementation supports.
+
+[%header,cols="1,5"]
+|===
+|Value
+|Description
+
+|1
+|The APIs of this experimental extension are not versioned, so the
+ feature-test macro always has this value.
+|===
+
+
+=== Occupancy queries
+
+[source, c++]
+----
+namespace ext::oneapi::experimental::info::kernel {
+
+struct max_num_work_group_occupancy_per_cu;
+
+}
+----
+
+[%header,cols="1,5,5,5"]
+|===
+|Kernel Descriptor
+|Argument Types
+|Return Type
+|Description
+
+|`max_num_work_group_occupancy_per_cu`
+|`sycl::queue`, `sycl::range`, `size_t`
+|`uint32_t`
+|Returns the maximum number of actively executing work-groups per compute unit
+granularity, when the kernel is submitted to the specified queue with the
+specified work-group size and the specified amount of dynamic work-group local
+memory (in bytes). The actively executing work-groups are those that occupy
+the fundamental hardware unit responsible for the execution of work-groups in
+parallel.
+
+|===
+
+== Implementation notes
+
+The implementation needs to define `sycl::kernel::ext_onapi_get_info` with the
+extra `sycl::range` and `size_t` parameters in addition to the `sycl::queue`.
+
+The Cuda, Hip and Level Zero backend adapters have the required infrastructure
+required to implement the extension.
@@ -30,6 +30,8 @@ template <typename T> struct is_queue_info_desc : std::false_type {};
 template <typename T> struct is_kernel_info_desc : std::false_type {};
 template <typename T>
 struct is_kernel_device_specific_info_desc : std::false_type {};
+template <typename T>
+struct is_kernel_queue_specific_info_desc : std::false_type {};
 template <typename T> struct is_event_info_desc : std::false_type {};
 template <typename T> struct is_event_profiling_info_desc : std::false_type {};
 // Normally we would just use std::enable_if to limit valid get_info template
@@ -128,6 +130,16 @@ struct IsSubGroupInfo<info::kernel_device_specific::compile_sub_group_size>
 #include <sycl/info/ext_intel_device_traits.def>
 #include <sycl/info/ext_oneapi_device_traits.def>
 #undef __SYCL_PARAM_TRAITS_SPEC
+
+#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT, PiCode)   \
+  template <>                                                                  \
+  struct is_##DescType##_info_desc<Namespace::info::DescType::Desc>            \
+      : std::true_type {                                                       \
+    using return_type = Namespace::info::DescType::Desc::return_type;          \
+  };
+#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
+#undef __SYCL_PARAM_TRAITS_SPEC
+
 #define __SYCL_PARAM_TRAITS_SPEC(DescType, Desc, ReturnT, PiCode)              \
   template <>                                                                  \
   struct is_backend_info_desc<info::DescType::Desc> : std::true_type {         \

@@ -22,9 +22,7 @@ namespace ext::oneapi::experimental {
 namespace info::kernel_queue_specific {
 // TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension once
 // #7598 is merged.
-struct max_num_work_group_sync {
-  using return_type = size_t;
-};
+// Defined in 'sycl/info/kernel_device_specific_traits.def'
 } // namespace info::kernel_queue_specific
 
 template <int Dimensions> class root_group {

@@ -0,0 +1,2 @@
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_occupancy_per_cu, uint32_t,)
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
@@ -207,6 +207,8 @@ struct work_item_progress_capabilities;
 #include <sycl/info/ext_codeplay_device_traits.def>
 #include <sycl/info/ext_intel_device_traits.def>
 #include <sycl/info/ext_oneapi_device_traits.def>
+#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
+
 #undef __SYCL_PARAM_TRAITS_SPEC
 #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
 } // namespace _V1

@@ -174,7 +174,13 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
   // TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension
   // once #7598 is merged.
   template <typename Param>
-  typename Param::return_type ext_oneapi_get_info(const queue &q) const;
+  typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+  ext_oneapi_get_info(const queue &q) const;
+
+  template <typename Param>
+  typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+  ext_oneapi_get_info(const queue &Queue, const range<3> &WorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
 
 private:
   /// Constructs a SYCL kernel object from a valid kernel_impl instance.

diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -119,8 +119,8 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
   )
 
   fetch_adapter_source(cuda
-    ${UNIFIED_RUNTIME_REPO}
-    ${UNIFIED_RUNTIME_TAG}
+    https://github.com/GeorgeWeb/unified-runtime.git
+    3883c1139bd9d799159bac3ccf9e29d151f030d4
   )
 
   fetch_adapter_source(hip

@@ -153,6 +153,11 @@ class kernel_impl {
   template <typename Param>
   typename Param::return_type ext_oneapi_get_info(const queue &q) const;
 
+  template <typename Param>
+  typename Param::return_type
+  ext_oneapi_get_info(const queue &Queue, const range<3> &MaxWorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
+
   /// Get a reference to a raw kernel object.
   ///
   /// \return a reference to a valid PiKernel instance with raw kernel object.
@@ -269,22 +274,75 @@ kernel_impl::get_info(const device &Device,
       getPlugin());
 }
 
+namespace syclex = ext::oneapi::experimental;
+
+template <>
+inline typename syclex::info::kernel_queue_specific::
+    max_num_work_group_occupancy_per_cu::return_type
+    kernel_impl::ext_oneapi_get_info<syclex::info::kernel_queue_specific::
+                                         max_num_work_group_occupancy_per_cu>(
+        const queue &Queue, const range<3> &WorkGroupSize,
+        size_t DynamicLocalMemorySize) const {
+  if (WorkGroupSize.size() == 0) {
+    throw runtime_error("The launch work-group size cannot be zero.",
+                        PI_ERROR_INVALID_WORK_GROUP_SIZE);
+  }
+
+  const auto &Plugin = getPlugin();
+  const auto &Handle = getHandleRef();
+  const auto &Device = Queue.get_device();
+
+  // Calculate max number of work-groups per compute unit
+  const auto NumCUs = Device.get_info<info::device::max_compute_units>();
+
+  pi_uint32 GroupCount{0};
+  Plugin->call<PiApiKind::piextKernelSuggestMaxCooperativeGroupCount>(
+      Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
+  return GroupCount / NumCUs;
+}
+
 template <>
-inline typename ext::oneapi::experimental::info::kernel_queue_specific::
-    max_num_work_group_sync::return_type
+inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+    return_type
     kernel_impl::ext_oneapi_get_info<
-        ext::oneapi::experimental::info::kernel_queue_specific::
-            max_num_work_group_sync>(const queue &Queue) const {
+        syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+        [[maybe_unused]] const queue &Queue, const range<3> &WorkGroupSize,
+        size_t DynamicLocalMemorySize) const {
+  if (WorkGroupSize.size() == 0) {
+    throw runtime_error("The launch work-group size cannot be zero.",
+                        PI_ERROR_INVALID_WORK_GROUP_SIZE);
+  }
+
   const auto &Plugin = getPlugin();
   const auto &Handle = getHandleRef();
-  const auto MaxWorkGroupSize =
-      Queue.get_device().get_info<info::device::max_work_group_size>();
+
   pi_uint32 GroupCount = 0;
   Plugin->call<PiApiKind::piextKernelSuggestMaxCooperativeGroupCount>(
-      Handle, MaxWorkGroupSize, /* DynamicSharedMemorySize */ 0, &GroupCount);
+      Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
   return GroupCount;
 }
 
+template <>
+inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+    return_type
+    kernel_impl::ext_oneapi_get_info<
+        syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+        const queue &Queue) const {
+  const auto &Device = Queue.get_device();
+  // Prevent out of launch resources for Cuda if this is used for calculating
+  // the total work group size for kernel launches, by restricting the max size
+  // to the kernel_device_specific maximum.
+  const auto MaxWorkGroupSize =
+      (Device.get_backend() == backend::ext_oneapi_cuda)
+          ? get_info<info::kernel_device_specific::work_group_size>(Device)
+          : Device.get_info<info::device::max_work_group_size>();
+
+  return ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+      Queue, sycl::range{MaxWorkGroupSize, 1, 1},
+      /* DynamicLocalMemorySize */ 0);
+}
+
 } // namespace detail
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in
@@ -108,6 +108,7 @@ inline namespace _V1 {
 #define SYCL_EXT_ONEAPI_FREE_FUNCTION_KERNELS 1
 #define SYCL_EXT_ONEAPI_PROD 1
 #define SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS 1
+#define SYCL_EXT_ONEAPI_GROUP_OCCUPANCY_QUERIES 1
 
 #ifndef __has_include
 #define __has_include(x) 0

@@ -106,17 +106,34 @@ kernel::get_info<info::kernel_device_specific::max_sub_group_size>(
     const device &, const sycl::range<3> &) const;
 
 template <typename Param>
-typename Param::return_type
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
 kernel::ext_oneapi_get_info(const queue &Queue) const {
   return impl->ext_oneapi_get_info<Param>(Queue);
 }
 
+template <typename Param>
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(const queue &Queue, const range<3> &WorkGroupSize,
+                            size_t DynamicLocalMemorySize) const {
+  return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
+                                          DynamicLocalMemorySize);
+}
+
 template __SYCL_EXPORT typename ext::oneapi::experimental::info::
     kernel_queue_specific::max_num_work_group_sync::return_type
     kernel::ext_oneapi_get_info<
         ext::oneapi::experimental::info::kernel_queue_specific::
             max_num_work_group_sync>(const queue &Queue) const;
 
+#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT, PiCode)   \
+  template __SYCL_EXPORT ReturnT                                               \
+  kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>(                \
+      const queue &, const range<3> &, size_t) const;
+
+#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
+
+#undef __SYCL_PARAM_TRAITS_SPEC
+
 kernel::kernel(std::shared_ptr<detail::kernel_impl> Impl) : impl(Impl) {}
 
 pi_native_handle kernel::getNative() const { return impl->getNative(); }