diff --git a/CHANGELOG.md b/CHANGELOG.md
index a77cd9840e99..096eabef6720 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -54,6 +54,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * `dpnp` uses pybind11 3.0.2 [#27734](https://github.com/IntelPython/dpnp/pull/2773)
 * Modified CMake files for the extension to explicitly mark DPC++ compiler and dpctl headers as system ones and so to suppress the build warning generated inside them [#2770](https://github.com/IntelPython/dpnp/pull/2770)
 * Updated QR tests to avoid element-wise comparisons for `raw` and `r` modes [#2785](https://github.com/IntelPython/dpnp/pull/2785)
+* Moved all SYCL kernel functors from `backend/extensions/` to a unified `backend/kernels/` directory hierarchy [#2816](https://github.com/IntelPython/dpnp/pull/2816)
 
 ### Deprecated
 
diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index 370d59f95585..e1bc34c9ae8b 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -62,7 +62,7 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
+    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/indexing/choose.cpp b/dpnp/backend/extensions/indexing/choose.cpp
index 99d91744366f..3b2df73f46ef 100644
--- a/dpnp/backend/extensions/indexing/choose.cpp
+++ b/dpnp/backend/extensions/indexing/choose.cpp
@@ -30,41 +30,123 @@
 #include <cstddef>
 #include <cstdint>
 #include <memory>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
+#include <stdexcept>
+#include <string>
+#include <tuple>
 #include <type_traits>
 #include <utility>
 #include <vector>
 
-#include "choose_kernel.hpp"
+#include <sycl/sycl.hpp>
+
 #include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
 
-// utils extension header
 #include "ext/common.hpp"
+#include "kernels/indexing/choose.hpp"
 
 // dpctl tensor headers
 #include "utils/indexing_utils.hpp"
 #include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
 #include "utils/output_validation.hpp"
 #include "utils/sycl_alloc_utils.hpp"
 #include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
 
 namespace dpnp::extensions::indexing
 {
+namespace py = pybind11;
 
+namespace impl
+{
 namespace td_ns = dpctl::tensor::type_dispatch;
 
-static kernels::choose_fn_ptr_t choose_clip_dispatch_table[td_ns::num_types]
-                                                          [td_ns::num_types];
-static kernels::choose_fn_ptr_t choose_wrap_dispatch_table[td_ns::num_types]
-                                                          [td_ns::num_types];
+using dpctl::tensor::ssize_t;
+
+typedef sycl::event (*choose_fn_ptr_t)(sycl::queue &,
+                                       size_t,
+                                       ssize_t,
+                                       int,
+                                       const ssize_t *,
+                                       const char *,
+                                       char *,
+                                       char **,
+                                       ssize_t,
+                                       ssize_t,
+                                       const ssize_t *,
+                                       const std::vector<sycl::event> &);
+
+static choose_fn_ptr_t choose_clip_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+static choose_fn_ptr_t choose_wrap_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+template <typename ProjectorT, typename indTy, typename Ty>
+sycl::event choose_impl(sycl::queue &q,
+                        size_t nelems,
+                        ssize_t n_chcs,
+                        int nd,
+                        const ssize_t *shape_and_strides,
+                        const char *ind_cp,
+                        char *dst_cp,
+                        char **chcs_cp,
+                        ssize_t ind_offset,
+                        ssize_t dst_offset,
+                        const ssize_t *chc_offsets,
+                        const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
 
-namespace py = pybind11;
+    const indTy *ind_tp = reinterpret_cast<const indTy *>(ind_cp);
+    Ty *dst_tp = reinterpret_cast<Ty *>(dst_cp);
 
-namespace detail
+    sycl::event choose_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using InOutIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        const InOutIndexerT ind_out_indexer{nd, ind_offset, dst_offset,
+                                            shape_and_strides};
+
+        using NthChoiceIndexerT =
+            dpnp::kernels::choose::strides::NthStrideOffsetUnpacked;
+        const NthChoiceIndexerT choices_indexer{
+            nd, chc_offsets, shape_and_strides, shape_and_strides + 3 * nd};
+
+        using ChooseFunc =
+            dpnp::kernels::choose::ChooseFunctor<ProjectorT, InOutIndexerT,
+                                                 NthChoiceIndexerT, indTy, Ty>;
+
+        cgh.parallel_for<ChooseFunc>(sycl::range<1>(nelems),
+                                     ChooseFunc(ind_tp, dst_tp, chcs_cp, n_chcs,
+                                                ind_out_indexer,
+                                                choices_indexer));
+    });
+
+    return choose_ev;
+}
+
+template <typename fnT, typename IndT, typename T, typename Index>
+struct ChooseFactory
 {
+    fnT get()
+    {
+        if constexpr (std::is_integral<IndT>::value &&
+                      !std::is_same<IndT, bool>::value) {
+            fnT fn = choose_impl<Index, IndT, T>;
+            return fn;
+        }
+        else {
+            fnT fn = nullptr;
+            return fn;
+        }
+    }
+};
 
+namespace detail
+{
 using host_ptrs_allocator_t =
     dpctl::tensor::alloc_utils::usm_host_allocator<char *>;
 using ptrs_t = std::vector<char *, host_ptrs_allocator_t>;
@@ -191,7 +273,6 @@ std::vector<dpctl::tensor::usm_ndarray> parse_py_chcs(const sycl::queue &q,
 
     return res;
 }
-
 } // namespace detail
 
 std::pair<sycl::event, sycl::event>
@@ -412,23 +493,6 @@ std::pair<sycl::event, sycl::event>
     return std::make_pair(arg_cleanup_ev, choose_generic_ev);
 }
 
-template <typename fnT, typename IndT, typename T, typename Index>
-struct ChooseFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_integral<IndT>::value &&
-                      !std::is_same<IndT, bool>::value) {
-            fnT fn = kernels::choose_impl<Index, IndT, T>;
-            return fn;
-        }
-        else {
-            fnT fn = nullptr;
-            return fn;
-        }
-    }
-};
-
 using dpctl::tensor::indexing_utils::ClipIndex;
 using dpctl::tensor::indexing_utils::WrapIndex;
 
@@ -441,23 +505,22 @@ using ChooseClipFactory = ChooseFactory<fnT, IndT, T, ClipIndex<IndT>>;
 void init_choose_dispatch_tables(void)
 {
     using ext::common::init_dispatch_table;
-    using kernels::choose_fn_ptr_t;
 
     init_dispatch_table<choose_fn_ptr_t, ChooseClipFactory>(
         choose_clip_dispatch_table);
     init_dispatch_table<choose_fn_ptr_t, ChooseWrapFactory>(
         choose_wrap_dispatch_table);
 }
+} // namespace impl
 
 void init_choose(py::module_ m)
 {
-    dpnp::extensions::indexing::init_choose_dispatch_tables();
+    impl::init_choose_dispatch_tables();
 
-    m.def("_choose", &py_choose, "", py::arg("src"), py::arg("chcs"),
+    m.def("_choose", &impl::py_choose, "", py::arg("src"), py::arg("chcs"),
           py::arg("dst"), py::arg("mode"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
     return;
 }
-
 } // namespace dpnp::extensions::indexing
diff --git a/dpnp/backend/extensions/indexing/choose_kernel.hpp b/dpnp/backend/extensions/indexing/choose_kernel.hpp
deleted file mode 100644
index 6b1ac8005054..000000000000
--- a/dpnp/backend/extensions/indexing/choose_kernel.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-//*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-// - Neither the name of the copyright holder nor the names of its contributors
-//   may be used to endorse or promote products derived from this software
-//   without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
-//*****************************************************************************
-
-#pragma once
-
-#include <algorithm>
-#include <complex>
-#include <cstdint>
-#include <limits>
-#include <type_traits>
-
-#include <sycl/sycl.hpp>
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "utils/indexing_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/strided_iters.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpnp::extensions::indexing::strides_detail
-{
-
-struct NthStrideOffsetUnpacked
-{
-    NthStrideOffsetUnpacked(int common_nd,
-                            dpctl::tensor::ssize_t const *_offsets,
-                            dpctl::tensor::ssize_t const *_shape,
-                            dpctl::tensor::ssize_t const *_strides)
-        : _ind(common_nd), nd(common_nd), offsets(_offsets), shape(_shape),
-          strides(_strides)
-    {
-    }
-
-    template <typename nT>
-    size_t operator()(dpctl::tensor::ssize_t gid, nT n) const
-    {
-        dpctl::tensor::ssize_t relative_offset(0);
-        _ind.get_displacement<const dpctl::tensor::ssize_t *,
-                              const dpctl::tensor::ssize_t *>(
-            gid, shape, strides + (n * nd), relative_offset);
-
-        return relative_offset + offsets[n];
-    }
-
-private:
-    dpctl::tensor::strides::CIndexer_vector<dpctl::tensor::ssize_t> _ind;
-
-    int nd;
-    dpctl::tensor::ssize_t const *offsets;
-    dpctl::tensor::ssize_t const *shape;
-    dpctl::tensor::ssize_t const *strides;
-};
-
-static_assert(sycl::is_device_copyable_v<NthStrideOffsetUnpacked>);
-
-} // namespace dpnp::extensions::indexing::strides_detail
-
-namespace dpnp::extensions::indexing::kernels
-{
-
-template <typename ProjectorT,
-          typename IndOutIndexerT,
-          typename ChoicesIndexerT,
-          typename IndT,
-          typename T>
-class ChooseFunctor
-{
-private:
-    const IndT *ind = nullptr;
-    T *dst = nullptr;
-    char **chcs = nullptr;
-    dpctl::tensor::ssize_t n_chcs;
-    const IndOutIndexerT ind_out_indexer;
-    const ChoicesIndexerT chcs_indexer;
-
-public:
-    ChooseFunctor(const IndT *ind_,
-                  T *dst_,
-                  char **chcs_,
-                  dpctl::tensor::ssize_t n_chcs_,
-                  const IndOutIndexerT &ind_out_indexer_,
-                  const ChoicesIndexerT &chcs_indexer_)
-        : ind(ind_), dst(dst_), chcs(chcs_), n_chcs(n_chcs_),
-          ind_out_indexer(ind_out_indexer_), chcs_indexer(chcs_indexer_)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        const ProjectorT proj{};
-
-        dpctl::tensor::ssize_t i = id[0];
-
-        auto ind_dst_offsets = ind_out_indexer(i);
-        dpctl::tensor::ssize_t ind_offset = ind_dst_offsets.get_first_offset();
-        dpctl::tensor::ssize_t dst_offset = ind_dst_offsets.get_second_offset();
-
-        IndT chc_idx = ind[ind_offset];
-        // proj produces an index in the range of n_chcs
-        dpctl::tensor::ssize_t projected_idx = proj(n_chcs, chc_idx);
-
-        dpctl::tensor::ssize_t chc_offset = chcs_indexer(i, projected_idx);
-
-        T *chc = reinterpret_cast<T *>(chcs[projected_idx]);
-
-        dst[dst_offset] = chc[chc_offset];
-    }
-};
-
-typedef sycl::event (*choose_fn_ptr_t)(sycl::queue &,
-                                       size_t,
-                                       dpctl::tensor::ssize_t,
-                                       int,
-                                       const dpctl::tensor::ssize_t *,
-                                       const char *,
-                                       char *,
-                                       char **,
-                                       dpctl::tensor::ssize_t,
-                                       dpctl::tensor::ssize_t,
-                                       const dpctl::tensor::ssize_t *,
-                                       const std::vector<sycl::event> &);
-
-template <typename ProjectorT, typename indTy, typename Ty>
-sycl::event choose_impl(sycl::queue &q,
-                        size_t nelems,
-                        dpctl::tensor::ssize_t n_chcs,
-                        int nd,
-                        const dpctl::tensor::ssize_t *shape_and_strides,
-                        const char *ind_cp,
-                        char *dst_cp,
-                        char **chcs_cp,
-                        dpctl::tensor::ssize_t ind_offset,
-                        dpctl::tensor::ssize_t dst_offset,
-                        const dpctl::tensor::ssize_t *chc_offsets,
-                        const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
-
-    const indTy *ind_tp = reinterpret_cast<const indTy *>(ind_cp);
-    Ty *dst_tp = reinterpret_cast<Ty *>(dst_cp);
-
-    sycl::event choose_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using InOutIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-        const InOutIndexerT ind_out_indexer{nd, ind_offset, dst_offset,
-                                            shape_and_strides};
-
-        using NthChoiceIndexerT = strides_detail::NthStrideOffsetUnpacked;
-        const NthChoiceIndexerT choices_indexer{
-            nd, chc_offsets, shape_and_strides, shape_and_strides + 3 * nd};
-
-        using ChooseFunc = ChooseFunctor<ProjectorT, InOutIndexerT,
-                                         NthChoiceIndexerT, indTy, Ty>;
-
-        cgh.parallel_for<ChooseFunc>(sycl::range<1>(nelems),
-                                     ChooseFunc(ind_tp, dst_tp, chcs_cp, n_chcs,
-                                                ind_out_indexer,
-                                                choices_indexer));
-    });
-
-    return choose_ev;
-}
-
-} // namespace dpnp::extensions::indexing::kernels
diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 7ccb05238ae4..36786c8cbaf3 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -67,7 +67,7 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
+    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/statistics/histogram_common.hpp b/dpnp/backend/extensions/statistics/histogram_common.hpp
index 02fc66f26610..8091e8874d17 100644
--- a/dpnp/backend/extensions/statistics/histogram_common.hpp
+++ b/dpnp/backend/extensions/statistics/histogram_common.hpp
@@ -28,24 +28,26 @@
 
 #pragma once
 
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <type_traits>
+
 #include <sycl/sycl.hpp>
 
+#include "dpctl4pybind11.hpp"
+
 #include "ext/common.hpp"
+#include "kernels/statistics/histogram.hpp"
 
-namespace dpctl::tensor
+namespace statistics::histogram
 {
-class usm_ndarray;
-}
-
 using dpctl::tensor::usm_ndarray;
 
 using ext::common::AtomicOp;
 using ext::common::IsNan;
 using ext::common::Less;
 
-namespace statistics::histogram
-{
-
 template <typename T, int Dims>
 struct CachedData
 {
@@ -69,23 +71,23 @@ struct CachedData
     template <int _Dims>
     void init(const sycl::nd_item<_Dims> &item) const
     {
-        uint32_t llid = item.get_local_linear_id();
+        std::uint32_t llid = item.get_local_linear_id();
         auto local_ptr = &local_data[0];
-        uint32_t size = local_data.size();
+        std::uint32_t size = local_data.size();
         auto group = item.get_group();
-        uint32_t local_size = group.get_local_linear_range();
+        std::uint32_t local_size = group.get_local_linear_range();
 
-        for (uint32_t i = llid; i < size; i += local_size) {
+        for (std::uint32_t i = llid; i < size; i += local_size) {
             local_ptr[i] = global_data[i];
         }
     }
 
-    size_t size() const { return local_data.size(); }
+    std::size_t size() const { return local_data.size(); }
 
     T &operator[](const sycl::id<Dims> &id) const { return local_data[id]; }
 
     template <typename = std::enable_if_t<Dims == 1>>
-    T &operator[](const size_t id) const
+    T &operator[](const std::size_t id) const
     {
         return local_data[id];
     }
@@ -117,12 +119,12 @@ struct UncachedData
     {
     }
 
-    size_t size() const { return _shape.size(); }
+    std::size_t size() const { return _shape.size(); }
 
     T &operator[](const sycl::id<Dims> &id) const { return global_data[id]; }
 
     template <typename = std::enable_if_t<Dims == 1>>
-    T &operator[](const size_t id) const
+    T &operator[](const std::size_t id) const
     {
         return global_data[id];
     }
@@ -139,15 +141,15 @@ struct HistLocalType
 };
 
 template <>
-struct HistLocalType<uint64_t>
+struct HistLocalType<std::uint64_t>
 {
-    using type = uint32_t;
+    using type = std::uint32_t;
 };
 
 template <>
-struct HistLocalType<int64_t>
+struct HistLocalType<std::int64_t>
 {
-    using type = int32_t;
+    using type = std::int32_t;
 };
 
 template <typename T, typename localT = typename HistLocalType<T>::type>
@@ -159,8 +161,8 @@ struct HistWithLocalCopies
     using LocalHist = sycl::local_accessor<localT, 2>;
 
     HistWithLocalCopies(T *global_data,
-                        size_t bins_count,
-                        int32_t copies_count,
+                        std::size_t bins_count,
+                        std::int32_t copies_count,
                         sycl::handler &cgh)
     {
         local_hist = LocalHist(sycl::range<2>(copies_count, bins_count), cgh);
@@ -170,23 +172,25 @@ struct HistWithLocalCopies
     template <int _Dims>
     void init(const sycl::nd_item<_Dims> &item, localT val = 0) const
     {
-        uint32_t llid = item.get_local_linear_id();
+        std::uint32_t llid = item.get_local_linear_id();
         auto *local_ptr = &local_hist[0][0];
-        uint32_t size = local_hist.size();
+        std::uint32_t size = local_hist.size();
         auto group = item.get_group();
-        uint32_t local_size = group.get_local_linear_range();
+        std::uint32_t local_size = group.get_local_linear_range();
 
-        for (uint32_t i = llid; i < size; i += local_size) {
+        for (std::uint32_t i = llid; i < size; i += local_size) {
             local_ptr[i] = val;
         }
     }
 
     template <int _Dims>
-    void add(const sycl::nd_item<_Dims> &item, int32_t bin, localT value) const
+    void add(const sycl::nd_item<_Dims> &item,
+             std::int32_t bin,
+             localT value) const
     {
-        int32_t llid = item.get_local_linear_id();
-        int32_t local_hist_count = local_hist.get_range().get(0);
-        int32_t local_copy_id =
+        std::int32_t llid = item.get_local_linear_id();
+        std::int32_t local_hist_count = local_hist.get_range().get(0);
+        std::int32_t local_copy_id =
             local_hist_count == 1 ? 0 : llid % local_hist_count;
 
         AtomicOp<localT, sycl::memory_order::relaxed,
@@ -198,15 +202,15 @@ struct HistWithLocalCopies
     template <int _Dims>
     void finalize(const sycl::nd_item<_Dims> &item) const
     {
-        uint32_t llid = item.get_local_linear_id();
-        uint32_t bins_count = local_hist.get_range().get(1);
-        uint32_t local_hist_count = local_hist.get_range().get(0);
+        std::uint32_t llid = item.get_local_linear_id();
+        std::uint32_t bins_count = local_hist.get_range().get(1);
+        std::uint32_t local_hist_count = local_hist.get_range().get(0);
         auto group = item.get_group();
-        uint32_t local_size = group.get_local_linear_range();
+        std::uint32_t local_size = group.get_local_linear_range();
 
-        for (uint32_t i = llid; i < bins_count; i += local_size) {
+        for (std::uint32_t i = llid; i < bins_count; i += local_size) {
             auto value = local_hist[0][i];
-            for (uint32_t lhc = 1; lhc < local_hist_count; ++lhc) {
+            for (std::uint32_t lhc = 1; lhc < local_hist_count; ++lhc) {
                 value += local_hist[lhc][i];
             }
             if (value != T(0)) {
@@ -217,7 +221,7 @@ struct HistWithLocalCopies
         }
     }
 
-    uint32_t size() const { return local_hist.size(); }
+    std::uint32_t size() const { return local_hist.size(); }
 
 private:
     LocalHist local_hist;
@@ -238,7 +242,7 @@ struct HistGlobalMemory
     }
 
     template <int _Dims>
-    void add(const sycl::nd_item<_Dims> &, int32_t bin, T value) const
+    void add(const sycl::nd_item<_Dims> &, std::int32_t bin, T value) const
     {
         AtomicOp<T, sycl::memory_order::relaxed,
                  sycl::memory_scope::device>::add(global_hist[bin], value);
@@ -253,10 +257,10 @@ struct HistGlobalMemory
     T *global_hist = nullptr;
 };
 
-template <typename T = uint32_t>
+template <typename T = std::uint32_t>
 struct NoWeights
 {
-    constexpr T get(size_t) const { return 1; }
+    constexpr T get(std::size_t) const { return 1; }
 };
 
 template <typename T>
@@ -264,7 +268,7 @@ struct Weights
 {
     Weights(T *weights) { data = weights; }
 
-    T get(size_t id) const { return data[id]; }
+    T get(std::size_t id) const { return data[id]; }
 
 private:
     T *data = nullptr;
@@ -277,55 +281,23 @@ bool check_in_bounds(const dT &val, const dT &min, const dT &max)
     return !_less(val, min) && !_less(max, val) && !IsNan<dT>::isnan(val);
 }
 
-template <typename T, typename HistImpl, typename Edges, typename Weights>
-class histogram_kernel;
-
 template <typename T, typename HistImpl, typename Edges, typename Weights>
 void submit_histogram(const T *in,
-                      const size_t size,
-                      const size_t dims,
-                      const uint32_t WorkPI,
+                      const std::size_t size,
+                      const std::size_t dims,
+                      const std::uint32_t WorkPI,
                       const HistImpl &hist,
                       const Edges &edges,
                       const Weights &weights,
                       sycl::nd_range<1> nd_range,
                       sycl::handler &cgh)
 {
-    cgh.parallel_for<histogram_kernel<T, HistImpl, Edges, Weights>>(
-        nd_range, [=](sycl::nd_item<1> item) {
-            auto id = item.get_group_linear_id();
-            auto lid = item.get_local_linear_id();
-            auto group = item.get_group();
-            auto local_size = item.get_local_range(0);
-
-            hist.init(item);
-            edges.init(item);
-
-            if constexpr (HistImpl::sync_after_init || Edges::sync_after_init) {
-                sycl::group_barrier(group, sycl::memory_scope::work_group);
-            }
-
-            auto bounds = edges.get_bounds();
-
-            for (uint32_t i = 0; i < WorkPI; ++i) {
-                auto data_idx = id * WorkPI * local_size + i * local_size + lid;
-                if (data_idx < size) {
-                    auto *d = &in[data_idx * dims];
-
-                    if (edges.in_bounds(d, bounds)) {
-                        auto bin = edges.get_bin(item, d, bounds);
-                        auto weight = weights.get(data_idx);
-                        hist.add(item, bin, weight);
-                    }
-                }
-            }
-
-            if constexpr (HistImpl::sync_before_finalize) {
-                sycl::group_barrier(group, sycl::memory_scope::work_group);
-            }
+    using HistogramKernel =
+        dpnp::kernels::histogram::HistogramFunctor<T, HistImpl, Edges, Weights>;
 
-            hist.finalize(item);
-        });
+    cgh.parallel_for<HistogramKernel>(
+        nd_range,
+        HistogramKernel(in, size, dims, WorkPI, hist, edges, weights));
 }
 
 void validate(const usm_ndarray &sample,
@@ -333,8 +305,8 @@ void validate(const usm_ndarray &sample,
               const std::optional<const dpctl::tensor::usm_ndarray> &weights,
               const usm_ndarray &histogram);
 
-uint32_t get_local_hist_copies_count(uint32_t loc_mem_size_in_items,
-                                     uint32_t local_size,
-                                     uint32_t hist_size_in_items);
+std::uint32_t get_local_hist_copies_count(std::uint32_t loc_mem_size_in_items,
+                                          std::uint32_t local_size,
+                                          std::uint32_t hist_size_in_items);
 
 } // namespace statistics::histogram
diff --git a/dpnp/backend/extensions/statistics/sliding_window1d.hpp b/dpnp/backend/extensions/statistics/sliding_window1d.hpp
index f33a23609666..329c96dfc1c6 100644
--- a/dpnp/backend/extensions/statistics/sliding_window1d.hpp
+++ b/dpnp/backend/extensions/statistics/sliding_window1d.hpp
@@ -28,25 +28,21 @@
 
 #pragma once
 
-#include <algorithm>
-
-#include "utils/math_utils.hpp"
-#include <sycl/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
 #include <type_traits>
 
-#include <stdio.h>
-
-#include "ext/common.hpp"
+#include <sycl/sycl.hpp>
 
-using dpctl::tensor::usm_ndarray;
+#include "dpctl4pybind11.hpp"
 
-using ext::common::Align;
-using ext::common::CeilDiv;
+#include "kernels/statistics/sliding_window1d.hpp"
 
 namespace statistics::sliding_window1d
 {
+using dpctl::tensor::usm_ndarray;
 
-template <typename T, uint32_t Size>
+template <typename T, std::uint32_t Size>
 class _RegistryDataStorage
 {
 public:
@@ -144,7 +140,7 @@ class _RegistryDataStorage
     ncT data[Size];
 };
 
-template <typename T, uint32_t Size = 1>
+template <typename T, std::uint32_t Size = 1>
 struct RegistryData : public _RegistryDataStorage<T, Size>
 {
     using SizeT = typename _RegistryDataStorage<T, Size>::SizeT;
@@ -336,7 +332,7 @@ struct RegistryData : public _RegistryDataStorage<T, Size>
     T *store(T *const data) { return store(data, true); }
 };
 
-template <typename T, uint32_t Size>
+template <typename T, std::uint32_t Size>
 struct RegistryWindow : public RegistryData<T, Size>
 {
     using SizeT = typename RegistryData<T, Size>::SizeT;
@@ -349,7 +345,7 @@ struct RegistryWindow : public RegistryData<T, Size>
         static_assert(std::is_integral_v<shT>,
                       "shift must be of an integral type");
 
-        uint32_t shift_r = this->size_x() - shift;
+        std::uint32_t shift_r = this->size_x() - shift;
         for (SizeT i = 0; i < Size; ++i) {
             this->data[i] = this->shift_left(i, shift);
             auto border =
@@ -369,7 +365,7 @@ struct RegistryWindow : public RegistryData<T, Size>
     }
 };
 
-template <typename T, typename SizeT = size_t>
+template <typename T, typename SizeT = std::size_t>
 class Span
 {
 public:
@@ -391,13 +387,13 @@ class Span
     const SizeT size_;
 };
 
-template <typename T, typename SizeT = size_t>
+template <typename T, typename SizeT = std::size_t>
 Span<T, SizeT> make_span(T *const data, const SizeT size)
 {
     return Span<T, SizeT>(data, size);
 }
 
-template <typename T, typename SizeT = size_t>
+template <typename T, typename SizeT = std::size_t>
 class PaddedSpan : public Span<T, SizeT>
 {
 public:
@@ -417,68 +413,14 @@ class PaddedSpan : public Span<T, SizeT>
     const SizeT pad_;
 };
 
-template <typename T, typename SizeT = size_t>
+template <typename T, typename SizeT = std::size_t>
 PaddedSpan<T, SizeT>
     make_padded_span(T *const data, const SizeT size, const SizeT offset)
 {
     return PaddedSpan<T, SizeT>(data, size, offset);
 }
 
-template <typename Results,
-          typename AData,
-          typename VData,
-          typename Op,
-          typename Red>
-void process_block(Results &results,
-                   uint32_t r_size,
-                   AData &a_data,
-                   VData &v_data,
-                   uint32_t block_size,
-                   Op op,
-                   Red red)
-{
-    for (uint32_t i = 0; i < block_size; ++i) {
-        auto v_val = v_data.broadcast(i);
-        for (uint32_t r = 0; r < r_size; ++r) {
-            results[r] = red(results[r], op(a_data[r], v_val));
-        }
-        a_data.advance_left();
-    }
-}
-
-template <typename SizeT>
-SizeT get_global_linear_id(const uint32_t wpi, const sycl::nd_item<1> &item)
-{
-    auto sbgroup = item.get_sub_group();
-    const auto sg_loc_id = sbgroup.get_local_linear_id();
-
-    const SizeT sg_base_id = wpi * (item.get_global_linear_id() - sg_loc_id);
-    const SizeT id = sg_base_id + sg_loc_id;
-
-    return id;
-}
-
-template <typename SizeT>
-uint32_t get_results_num(const uint32_t wpi,
-                         const SizeT size,
-                         const SizeT global_id,
-                         const sycl::nd_item<1> &item)
-{
-    auto sbgroup = item.get_sub_group();
-
-    const auto sbg_size = sbgroup.get_max_local_range()[0];
-    const auto size_ = sycl::sub_sat(size, global_id);
-    return std::min(SizeT(wpi), CeilDiv(size_, sbg_size));
-}
-
-template <uint32_t WorkPI,
-          typename T,
-          typename SizeT,
-          typename Op,
-          typename Red>
-class sliding_window1d_kernel;
-
-template <uint32_t WorkPI,
+template <std::uint32_t WorkPI,
           typename T,
           typename SizeT,
           typename Op,
@@ -491,77 +433,16 @@ void submit_sliding_window1d(const PaddedSpan<const T, SizeT> &a,
                              sycl::nd_range<1> nd_range,
                              sycl::handler &cgh)
 {
-    cgh.parallel_for<sliding_window1d_kernel<WorkPI, T, SizeT, Op, Red>>(
-        nd_range, [=](sycl::nd_item<1> item) {
-            auto glid = get_global_linear_id<SizeT>(WorkPI, item);
-
-            auto results = RegistryData<T, WorkPI>(item);
-            results.fill(0);
-
-            auto results_num = get_results_num(WorkPI, out.size(), glid, item);
-
-            const auto *a_begin = a.begin();
-            const auto *a_end = a.end();
+    using SlidingWindow1dKernel =
+        dpnp::kernels::sliding_window1d::SlidingWindow1dFunctor<
+            WorkPI, PaddedSpan<const T, SizeT>, Span<const T, SizeT>, Op, Red,
+            Span<T, SizeT>, RegistryData, RegistryWindow>;
 
-            auto sbgroup = item.get_sub_group();
-
-            const auto chunks_count =
-                CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);
-
-            const auto *a_ptr = &a.padded_begin()[glid];
-
-            auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
-                return ptr >= a_begin && ptr < a_end;
-            };
-
-            auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
-            a_ptr = a_data.load(a_ptr, _a_load_cond, 0);
-
-            const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
-            auto v_size = v.size();
-
-            for (uint32_t b = 0; b < chunks_count; ++b) {
-                auto v_data = RegistryData<const T>(item);
-                v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
-
-                uint32_t chunk_size_ =
-                    std::min(v_size, SizeT(v_data.total_size()));
-                process_block(results, results_num, a_data, v_data, chunk_size_,
-                              op, red);
-
-                if (b != chunks_count - 1) {
-                    a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
-                                             _a_load_cond, 0);
-                    v_size -= v_data.total_size();
-                }
-            }
-
-            auto *const out_ptr = out.begin();
-            // auto *const out_end = out.end();
-
-            auto y_start = glid;
-            auto y_stop =
-                std::min(y_start + WorkPI * results.size_x(), out.size());
-            uint32_t i = 0;
-            for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
-                out_ptr[y] = results[i++];
-            }
-            // while the code itself seems to be valid, inside correlate
-            // kernel it results in memory corruption. Further investigation
-            // is needed. SAT-7693
-            // corruption results.store(&out_ptr[glid],
-            //               [out_end](auto &&ptr) { return ptr < out_end; });
-        });
+    cgh.parallel_for<SlidingWindow1dKernel>(
+        nd_range, SlidingWindow1dKernel(a, v, op, red, out));
 }
 
-template <uint32_t WorkPI,
-          typename T,
-          typename SizeT,
-          typename Op,
-          typename Red>
-class sliding_window1d_small_kernel;
-
-template <uint32_t WorkPI,
+template <std::uint32_t WorkPI,
           typename T,
           typename SizeT,
           typename Op,
@@ -574,61 +455,18 @@ void submit_sliding_window1d_small_kernel(const PaddedSpan<const T, SizeT> &a,
                                           sycl::nd_range<1> nd_range,
                                           sycl::handler &cgh)
 {
-    cgh.parallel_for<sliding_window1d_small_kernel<WorkPI, T, SizeT, Op, Red>>(
-        nd_range, [=](sycl::nd_item<1> item) {
-            auto glid = get_global_linear_id<SizeT>(WorkPI, item);
-
-            auto results = RegistryData<T, WorkPI>(item);
-            results.fill(0);
-
-            auto sbgroup = item.get_sub_group();
-            auto sg_size = sbgroup.get_max_local_range()[0];
-
-            const uint32_t to_read = WorkPI * sg_size + v.size();
-            const auto *a_begin = a.begin();
-
-            const auto *a_ptr = &a.padded_begin()[glid];
-            const auto *a_end = std::min(a_ptr + to_read, a.end());
-
-            auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
-                return ptr >= a_begin && ptr < a_end;
-            };
+    using SlidingWindow1dSmallKernel =
+        dpnp::kernels::sliding_window1d::SlidingWindow1dSmallFunctor<
+            WorkPI, PaddedSpan<const T, SizeT>, Span<const T, SizeT>, Op, Red,
+            Span<T, SizeT>, RegistryData, RegistryWindow>;
 
-            auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
-            a_data.load(a_ptr, _a_load_cond, 0);
-
-            const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
-            auto v_size = v.size();
-
-            auto v_data = RegistryData<const T>(item);
-            v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
-
-            auto results_num = get_results_num(WorkPI, out.size(), glid, item);
-
-            process_block(results, results_num, a_data, v_data, v_size, op,
-                          red);
-
-            auto *const out_ptr = out.begin();
-            // auto *const out_end = out.end();
-
-            auto y_start = glid;
-            auto y_stop =
-                std::min(y_start + WorkPI * results.size_x(), out.size());
-            uint32_t i = 0;
-            for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
-                out_ptr[y] = results[i++];
-            }
-            // while the code itself seems to be valid, inside correlate
-            // kernel it results in memory corruption. Further investigation
-            // is needed. SAT-7693
-            // corruption results.store(&out_ptr[glid],
-            //               [out_end](auto &&ptr) { return ptr < out_end; });
-        });
+    cgh.parallel_for<SlidingWindow1dSmallKernel>(
+        nd_range, SlidingWindow1dSmallKernel(a, v, op, red, out));
 }
 
 void validate(const usm_ndarray &a,
               const usm_ndarray &v,
               const usm_ndarray &out,
-              const size_t l_pad,
-              const size_t r_pad);
+              const std::size_t l_pad,
+              const std::size_t r_pad);
 } // namespace statistics::sliding_window1d
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
index fca8c43f816e..8830569ce9cf 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
@@ -41,40 +41,29 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include "kernels/elementwise_functions/interpolate.hpp"
+
 // dpctl tensor headers
 #include "utils/type_dispatch.hpp"
 #include "utils/type_utils.hpp"
 
-#include "kernels/elementwise_functions/interpolate.hpp"
-
 // utils extension headers
 #include "ext/common.hpp"
 #include "ext/validation_utils.hpp"
 
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace type_utils = dpctl::tensor::type_utils;
-
-using ext::common::value_type_of;
-using ext::validation::array_names;
-using ext::validation::array_ptr;
-
-using ext::common::dtype_from_typenum;
-using ext::validation::check_has_dtype;
-using ext::validation::check_num_dims;
-using ext::validation::check_same_dtype;
-using ext::validation::check_same_size;
-using ext::validation::common_checks;
-
 namespace dpnp::extensions::ufunc
 {
+namespace py = pybind11;
 
 namespace impl
 {
-using ext::common::init_dispatch_vector;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace type_utils = dpctl::tensor::type_utils;
 
 template <typename T>
-using value_type_of_t = typename value_type_of<T>::type;
+using value_type_of_t = typename ext::common::value_type_of<T>::type;
+
+using ext::common::dtype_from_typenum;
 
 typedef sycl::event (*interpolate_fn_ptr_t)(sycl::queue &,
                                             const void *,      // x
@@ -88,8 +77,10 @@ typedef sycl::event (*interpolate_fn_ptr_t)(sycl::queue &,
                                             const std::size_t, // xp_size
                                             const std::vector<sycl::event> &);
 
+interpolate_fn_ptr_t interpolate_dispatch_vector[td_ns::num_types];
+
 template <typename T, typename TIdx = std::int64_t>
-sycl::event interpolate_call(sycl::queue &exec_q,
+sycl::event interpolate_impl(sycl::queue &q,
                              const void *vx,
                              const void *vidx,
                              const void *vxp,
@@ -101,6 +92,8 @@ sycl::event interpolate_call(sycl::queue &exec_q,
                              const std::size_t xp_size,
                              const std::vector<sycl::event> &depends)
 {
+    dpctl::tensor::type_utils::validate_type_for_device<T>(q);
+
     using type_utils::is_complex_v;
     using TCoord = std::conditional_t<is_complex_v<T>, value_type_of_t<T>, T>;
 
@@ -112,23 +105,69 @@ sycl::event interpolate_call(sycl::queue &exec_q,
     const T *right = static_cast<const T *>(vright);
     T *out = static_cast<T *>(vout);
 
-    using dpnp::kernels::interpolate::interpolate_impl;
-    sycl::event interpolate_ev = interpolate_impl<TCoord, T>(
-        exec_q, x, idx, xp, fp, left, right, out, n, xp_size, depends);
+    sycl::event interpolate_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using InterpolateFunc =
+            dpnp::kernels::interpolate::InterpolateFunctor<TCoord, T>;
+
+        cgh.parallel_for<InterpolateFunc>(
+            sycl::range<1>(n),
+            InterpolateFunc(x, idx, xp, fp, left, right, out, xp_size));
+    });
 
     return interpolate_ev;
 }
 
-interpolate_fn_ptr_t interpolate_dispatch_vector[td_ns::num_types];
+/**
+ * @brief A factory to define pairs of supported types for which
+ * interpolate function is available.
+ *
+ * @tparam T Type of input vector `a` and of result vector `y`.
+ */
+template <typename T>
+struct InterpolateOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
 
-void common_interpolate_checks(
-    const dpctl::tensor::usm_ndarray &x,
-    const dpctl::tensor::usm_ndarray &idx,
-    const dpctl::tensor::usm_ndarray &xp,
-    const dpctl::tensor::usm_ndarray &fp,
-    const dpctl::tensor::usm_ndarray &out,
-    const std::optional<const dpctl::tensor::usm_ndarray> &left,
-    const std::optional<const dpctl::tensor::usm_ndarray> &right)
+template <typename fnT, typename T>
+struct InterpolateFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename InterpolateOutputType<T>::value_type,
+                          void>) {
+            return nullptr;
+        }
+        else {
+            return interpolate_impl<T>;
+        }
+    }
+};
+
+namespace detail
+{
+using ext::validation::array_names;
+using ext::validation::check_has_dtype;
+using ext::validation::check_num_dims;
+using ext::validation::check_same_dtype;
+using ext::validation::check_same_size;
+using ext::validation::common_checks;
+
+void validate(const dpctl::tensor::usm_ndarray &x,
+              const dpctl::tensor::usm_ndarray &idx,
+              const dpctl::tensor::usm_ndarray &xp,
+              const dpctl::tensor::usm_ndarray &fp,
+              const dpctl::tensor::usm_ndarray &out,
+              const std::optional<const dpctl::tensor::usm_ndarray> &left,
+              const std::optional<const dpctl::tensor::usm_ndarray> &right)
 {
     array_names names = {{&x, "x"}, {&xp, "xp"}, {&fp, "fp"}, {&out, "out"}};
 
@@ -158,6 +197,7 @@ void common_interpolate_checks(
         throw py::value_error("array of sample points is empty");
     }
 }
+} // namespace detail
 
 std::pair<sycl::event, sycl::event>
     py_interpolate(const dpctl::tensor::usm_ndarray &x,
@@ -170,7 +210,7 @@ std::pair<sycl::event, sycl::event>
                    sycl::queue &exec_q,
                    const std::vector<sycl::event> &depends)
 {
-    common_interpolate_checks(x, idx, xp, fp, out, left, right);
+    detail::validate(x, idx, xp, fp, out, left, right);
 
     int out_typenum = out.get_typenum();
 
@@ -214,56 +254,21 @@ std::pair<sycl::event, sycl::event>
     return std::make_pair(args_ev, ev);
 }
 
-/**
- * @brief A factory to define pairs of supported types for which
- * interpolate function is available.
- *
- * @tparam T Type of input vector `a` and of result vector `y`.
- */
-template <typename T>
-struct InterpolateOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-};
-
-template <typename fnT, typename T>
-struct InterpolateFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_same_v<
-                          typename InterpolateOutputType<T>::value_type,
-                          void>) {
-            return nullptr;
-        }
-        else {
-            return interpolate_call<T>;
-        }
-    }
-};
-
 static void init_interpolate_dispatch_vectors()
 {
-    init_dispatch_vector<interpolate_fn_ptr_t, InterpolateFactory>(
+    using ext::common::init_dispatch_vector;
+    init_dispatch_vector<interpolate_fn_ptr_t, impl::InterpolateFactory>(
         interpolate_dispatch_vector);
 }
-
 } // namespace impl
 
 void init_interpolate(py::module_ m)
 {
     impl::init_interpolate_dispatch_vectors();
 
-    using impl::py_interpolate;
-    m.def("_interpolate", &py_interpolate, "", py::arg("x"), py::arg("idx"),
-          py::arg("xp"), py::arg("fp"), py::arg("left"), py::arg("right"),
-          py::arg("out"), py::arg("sycl_queue"),
+    m.def("_interpolate", &impl::py_interpolate, "", py::arg("x"),
+          py::arg("idx"), py::arg("xp"), py::arg("fp"), py::arg("left"),
+          py::arg("right"), py::arg("out"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 }
-
 } // namespace dpnp::extensions::ufunc
diff --git a/dpnp/backend/extensions/window/common.hpp b/dpnp/backend/extensions/window/common.hpp
index cb084e972d78..9e7b1192e3a2 100644
--- a/dpnp/backend/extensions/window/common.hpp
+++ b/dpnp/backend/extensions/window/common.hpp
@@ -28,11 +28,18 @@
 
 #pragma once
 
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
+#include <cstddef>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
 #include <sycl/sycl.hpp>
 
 #include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
 
 // dpctl tensor headers
 #include "utils/output_validation.hpp"
@@ -41,10 +48,8 @@
 
 namespace dpnp::extensions::window
 {
-
-namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
-
 namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
 
 typedef sycl::event (*window_fn_ptr_t)(sycl::queue &,
                                        char *,
@@ -72,6 +77,20 @@ sycl::event window_impl(sycl::queue &exec_q,
     return window_ev;
 }
 
+template <typename fnT, typename T, template <typename> typename FunctorT>
+struct Factory
+{
+    fnT get()
+    {
+        if constexpr (std::is_floating_point_v<T>) {
+            return window_impl<T, FunctorT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
 template <typename funcPtrT>
 std::tuple<size_t, char *, funcPtrT>
     window_fn(sycl::queue &exec_q,
@@ -101,7 +120,7 @@ std::tuple<size_t, char *, funcPtrT>
     }
 
     const int result_typenum = result.get_typenum();
-    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    auto array_types = td_ns::usm_ndarray_types();
     const int result_type_id = array_types.typenum_to_lookup_id(result_typenum);
     funcPtrT fn = window_dispatch_vector[result_type_id];
 
diff --git a/dpnp/backend/extensions/window/kaiser.cpp b/dpnp/backend/extensions/window/kaiser.cpp
index b83f88f69a9b..e5c1aa837a64 100644
--- a/dpnp/backend/extensions/window/kaiser.cpp
+++ b/dpnp/backend/extensions/window/kaiser.cpp
@@ -26,26 +26,24 @@
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 
-#include "kaiser.hpp"
+#include <sycl/sycl.hpp>
+
 #include "common.hpp"
+#include "kaiser.hpp"
+
+#include "kernels/window/kaiser.hpp"
 
 // utils extension header
 #include "ext/common.hpp"
 
 // dpctl tensor headers
-#include "utils/output_validation.hpp"
 #include "utils/type_dispatch.hpp"
 #include "utils/type_utils.hpp"
 
-#include <sycl/sycl.hpp>
-
-#include "kernels/elementwise_functions/i0.hpp"
-
 namespace dpnp::extensions::window
 {
-namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
-
-using ext::common::init_dispatch_vector;
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
 
 typedef sycl::event (*kaiser_fn_ptr_t)(sycl::queue &,
                                        char *,
@@ -53,34 +51,10 @@ typedef sycl::event (*kaiser_fn_ptr_t)(sycl::queue &,
                                        const py::object &,
                                        const std::vector<sycl::event> &);
 
-static kaiser_fn_ptr_t kaiser_dispatch_vector[dpctl_td_ns::num_types];
+static kaiser_fn_ptr_t kaiser_dispatch_vector[td_ns::num_types];
 
-template <typename T>
-class KaiserFunctor
+namespace impl
 {
-private:
-    T *res = nullptr;
-    const std::size_t N;
-    const T beta;
-
-public:
-    KaiserFunctor(T *res, const std::size_t N, const T beta)
-        : res(res), N(N), beta(beta)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        using dpnp::kernels::i0::cyl_bessel_i0;
-
-        const auto i = id.get(0);
-        const T alpha = (N - 1) / T(2);
-        const T tmp = (i - alpha) / alpha;
-        res[i] = cyl_bessel_i0(beta * sycl::sqrt(1 - tmp * tmp)) /
-                 cyl_bessel_i0(beta);
-    }
-};
-
 template <typename T>
 sycl::event kaiser_impl(sycl::queue &exec_q,
                         char *result,
@@ -96,7 +70,7 @@ sycl::event kaiser_impl(sycl::queue &exec_q,
     sycl::event kaiser_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
 
-        using KaiserKernel = KaiserFunctor<T>;
+        using KaiserKernel = dpnp::kernels::kaiser::KaiserFunctor<T>;
         cgh.parallel_for<KaiserKernel>(sycl::range<1>(nelems),
                                        KaiserKernel(res, nelems, beta));
     });
@@ -117,6 +91,7 @@ struct KaiserFactory
         }
     }
 };
+} // namespace impl
 
 std::pair<sycl::event, sycl::event>
     py_kaiser(sycl::queue &exec_q,
@@ -141,8 +116,8 @@ std::pair<sycl::event, sycl::event>
 
 void init_kaiser_dispatch_vectors()
 {
-    init_dispatch_vector<kaiser_fn_ptr_t, KaiserFactory>(
+    using ext::common::init_dispatch_vector;
+    init_dispatch_vector<kaiser_fn_ptr_t, impl::KaiserFactory>(
         kaiser_dispatch_vector);
 }
-
 } // namespace dpnp::extensions::window
diff --git a/dpnp/backend/extensions/window/kaiser.hpp b/dpnp/backend/extensions/window/kaiser.hpp
index 0a4712cc594e..4ba506620db2 100644
--- a/dpnp/backend/extensions/window/kaiser.hpp
+++ b/dpnp/backend/extensions/window/kaiser.hpp
@@ -28,11 +28,15 @@
 
 #pragma once
 
-#include <dpctl4pybind11.hpp>
 #include <sycl/sycl.hpp>
 
+#include <dpctl4pybind11.hpp>
+#include <pybind11/pybind11.h>
+
 namespace dpnp::extensions::window
 {
+namespace py = pybind11;
+
 extern std::pair<sycl::event, sycl::event>
     py_kaiser(sycl::queue &exec_q,
               const py::object &beta,
@@ -40,5 +44,4 @@ extern std::pair<sycl::event, sycl::event>
               const std::vector<sycl::event> &depends);
 
 extern void init_kaiser_dispatch_vectors(void);
-
 } // namespace dpnp::extensions::window
diff --git a/dpnp/backend/extensions/window/window_py.cpp b/dpnp/backend/extensions/window/window_py.cpp
index 2b8090c40cca..5ae80f4027b5 100644
--- a/dpnp/backend/extensions/window/window_py.cpp
+++ b/dpnp/backend/extensions/window/window_py.cpp
@@ -33,11 +33,12 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
-#include "bartlett.hpp"
-#include "blackman.hpp"
+#include "kernels/window/bartlett.hpp"
+#include "kernels/window/blackman.hpp"
+#include "kernels/window/hamming.hpp"
+#include "kernels/window/hanning.hpp"
+
 #include "common.hpp"
-#include "hamming.hpp"
-#include "hanning.hpp"
 #include "kaiser.hpp"
 
 // utils extension header
@@ -51,6 +52,22 @@ using window_ns::window_fn_ptr_t;
 
 namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
 
+template <typename fnT, typename T>
+using BartlettFactory =
+    window_ns::Factory<fnT, T, dpnp::kernels::bartlett::BartlettFunctor>;
+
+template <typename fnT, typename T>
+using BlackmanFactory =
+    window_ns::Factory<fnT, T, dpnp::kernels::blackman::BlackmanFunctor>;
+
+template <typename fnT, typename T>
+using HammingFactory =
+    window_ns::Factory<fnT, T, dpnp::kernels::hamming::HammingFunctor>;
+
+template <typename fnT, typename T>
+using HanningFactory =
+    window_ns::Factory<fnT, T, dpnp::kernels::hanning::HanningFunctor>;
+
 static window_fn_ptr_t bartlett_dispatch_vector[dpctl_td_ns::num_types];
 static window_fn_ptr_t blackman_dispatch_vector[dpctl_td_ns::num_types];
 static window_fn_ptr_t hamming_dispatch_vector[dpctl_td_ns::num_types];
@@ -62,8 +79,7 @@ PYBIND11_MODULE(_window_impl, m)
     using event_vecT = std::vector<sycl::event>;
 
     {
-        init_dispatch_vector<window_ns::window_fn_ptr_t,
-                             window_ns::kernels::BartlettFactory>(
+        init_dispatch_vector<window_ns::window_fn_ptr_t, BartlettFactory>(
             bartlett_dispatch_vector);
 
         auto bartlett_pyapi = [&](sycl::queue &exec_q, const arrayT &result,
@@ -78,8 +94,7 @@ PYBIND11_MODULE(_window_impl, m)
     }
 
     {
-        init_dispatch_vector<window_ns::window_fn_ptr_t,
-                             window_ns::kernels::BlackmanFactory>(
+        init_dispatch_vector<window_ns::window_fn_ptr_t, BlackmanFactory>(
             blackman_dispatch_vector);
 
         auto blackman_pyapi = [&](sycl::queue &exec_q, const arrayT &result,
@@ -94,8 +109,7 @@ PYBIND11_MODULE(_window_impl, m)
     }
 
     {
-        init_dispatch_vector<window_ns::window_fn_ptr_t,
-                             window_ns::kernels::HammingFactory>(
+        init_dispatch_vector<window_ns::window_fn_ptr_t, HammingFactory>(
             hamming_dispatch_vector);
 
         auto hamming_pyapi = [&](sycl::queue &exec_q, const arrayT &result,
@@ -110,8 +124,7 @@ PYBIND11_MODULE(_window_impl, m)
     }
 
     {
-        init_dispatch_vector<window_ns::window_fn_ptr_t,
-                             window_ns::kernels::HanningFactory>(
+        init_dispatch_vector<window_ns::window_fn_ptr_t, HanningFactory>(
             hanning_dispatch_vector);
 
         auto hanning_pyapi = [&](sycl::queue &exec_q, const arrayT &result,
diff --git a/dpnp/backend/kernels/elementwise_functions/interpolate.hpp b/dpnp/backend/kernels/elementwise_functions/interpolate.hpp
index ef38157b00e9..c85dafea24b0 100644
--- a/dpnp/backend/kernels/elementwise_functions/interpolate.hpp
+++ b/dpnp/backend/kernels/elementwise_functions/interpolate.hpp
@@ -28,67 +28,79 @@
 
 #pragma once
 
+#include <cstddef>
+#include <cstdint>
+
 #include <sycl/sycl.hpp>
-#include <vector>
 
 #include "ext/common.hpp"
 
-using ext::common::IsNan;
-
 namespace dpnp::kernels::interpolate
 {
+using ext::common::IsNan;
+
 template <typename TCoord, typename TValue, typename TIdx = std::int64_t>
-sycl::event interpolate_impl(sycl::queue &q,
-                             const TCoord *x,
-                             const TIdx *idx,
-                             const TCoord *xp,
-                             const TValue *fp,
-                             const TValue *left,
-                             const TValue *right,
-                             TValue *out,
-                             const std::size_t n,
-                             const std::size_t xp_size,
-                             const std::vector<sycl::event> &depends)
+class InterpolateFunctor
 {
+private:
+    const TCoord *x = nullptr;
+    const TIdx *idx = nullptr;
+    const TCoord *xp = nullptr;
+    const TValue *fp = nullptr;
+    const TValue *left = nullptr;
+    const TValue *right = nullptr;
+    TValue *out = nullptr;
+    const std::size_t xp_size;
+
+public:
+    InterpolateFunctor(const TCoord *x_,
+                       const TIdx *idx_,
+                       const TCoord *xp_,
+                       const TValue *fp_,
+                       const TValue *left_,
+                       const TValue *right_,
+                       TValue *out_,
+                       const std::size_t xp_size_)
+        : x(x_), idx(idx_), xp(xp_), fp(fp_), left(left_), right(right_),
+          out(out_), xp_size(xp_size_)
+    {
+    }
+
     // Selected over the work-group version
     // due to simpler execution and slightly better performance.
-    return q.submit([&](sycl::handler &h) {
-        h.depends_on(depends);
-        h.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
-            TValue left_val = left ? *left : fp[0];
-            TValue right_val = right ? *right : fp[xp_size - 1];
+    void operator()(sycl::id<1> id) const
+    {
+        TValue left_val = left ? *left : fp[0];
+        TValue right_val = right ? *right : fp[xp_size - 1];
 
-            TCoord x_val = x[i];
-            TIdx x_idx = idx[i] - 1;
+        TCoord x_val = x[id];
+        TIdx x_idx = idx[id] - 1;
 
-            if (IsNan<TCoord>::isnan(x_val)) {
-                out[i] = x_val;
-            }
-            else if (x_idx < 0) {
-                out[i] = left_val;
-            }
-            else if (x_val == xp[xp_size - 1]) {
-                out[i] = fp[xp_size - 1];
-            }
-            else if (x_idx >= static_cast<TIdx>(xp_size - 1)) {
-                out[i] = right_val;
-            }
-            else {
-                TValue slope =
-                    (fp[x_idx + 1] - fp[x_idx]) / (xp[x_idx + 1] - xp[x_idx]);
-                TValue res = slope * (x_val - xp[x_idx]) + fp[x_idx];
+        if (IsNan<TCoord>::isnan(x_val)) {
+            out[id] = x_val;
+        }
+        else if (x_idx < 0) {
+            out[id] = left_val;
+        }
+        else if (x_val == xp[xp_size - 1]) {
+            out[id] = fp[xp_size - 1];
+        }
+        else if (x_idx >= static_cast<TIdx>(xp_size - 1)) {
+            out[id] = right_val;
+        }
+        else {
+            TValue slope =
+                (fp[x_idx + 1] - fp[x_idx]) / (xp[x_idx + 1] - xp[x_idx]);
+            TValue res = slope * (x_val - xp[x_idx]) + fp[x_idx];
 
-                if (IsNan<TValue>::isnan(res)) {
-                    res = slope * (x_val - xp[x_idx + 1]) + fp[x_idx + 1];
-                    if (IsNan<TValue>::isnan(res) &&
-                        (fp[x_idx] == fp[x_idx + 1])) {
-                        res = fp[x_idx];
-                    }
+            if (IsNan<TValue>::isnan(res)) {
+                res = slope * (x_val - xp[x_idx + 1]) + fp[x_idx + 1];
+                if (IsNan<TValue>::isnan(res) && (fp[x_idx] == fp[x_idx + 1])) {
+                    res = fp[x_idx];
                 }
-                out[i] = res;
             }
-        });
-    });
-}
-
+            out[id] = res;
+        }
+    }
+};
 } // namespace dpnp::kernels::interpolate
diff --git a/dpnp/backend/kernels/indexing/choose.hpp b/dpnp/backend/kernels/indexing/choose.hpp
new file mode 100644
index 000000000000..49b71d05c96b
--- /dev/null
+++ b/dpnp/backend/kernels/indexing/choose.hpp
@@ -0,0 +1,128 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <cstddef>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "utils/strided_iters.hpp"
+
+namespace dpnp::kernels::choose
+{
+using dpctl::tensor::ssize_t;
+
+template <typename ProjectorT,
+          typename IndOutIndexerT,
+          typename ChoicesIndexerT,
+          typename IndT,
+          typename T>
+class ChooseFunctor
+{
+private:
+    const IndT *ind = nullptr;
+    T *dst = nullptr;
+    char **chcs = nullptr;
+    ssize_t n_chcs;
+    const IndOutIndexerT ind_out_indexer;
+    const ChoicesIndexerT chcs_indexer;
+
+public:
+    ChooseFunctor(const IndT *ind_,
+                  T *dst_,
+                  char **chcs_,
+                  ssize_t n_chcs_,
+                  const IndOutIndexerT &ind_out_indexer_,
+                  const ChoicesIndexerT &chcs_indexer_)
+        : ind(ind_), dst(dst_), chcs(chcs_), n_chcs(n_chcs_),
+          ind_out_indexer(ind_out_indexer_), chcs_indexer(chcs_indexer_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        const ProjectorT proj{};
+
+        ssize_t i = id[0];
+
+        auto ind_dst_offsets = ind_out_indexer(i);
+        ssize_t ind_offset = ind_dst_offsets.get_first_offset();
+        ssize_t dst_offset = ind_dst_offsets.get_second_offset();
+
+        IndT chc_idx = ind[ind_offset];
+        // proj produces an index in the range of n_chcs
+        ssize_t projected_idx = proj(n_chcs, chc_idx);
+
+        ssize_t chc_offset = chcs_indexer(i, projected_idx);
+
+        T *chc = reinterpret_cast<T *>(chcs[projected_idx]);
+
+        dst[dst_offset] = chc[chc_offset];
+    }
+};
+
+namespace strides
+{
+using dpctl::tensor::strides::CIndexer_vector;
+
+struct NthStrideOffsetUnpacked
+{
+    NthStrideOffsetUnpacked(int common_nd,
+                            ssize_t const *_offsets,
+                            ssize_t const *_shape,
+                            ssize_t const *_strides)
+        : _ind(common_nd), nd(common_nd), offsets(_offsets), shape(_shape),
+          strides(_strides)
+    {
+    }
+
+    template <typename nT>
+    size_t operator()(ssize_t gid, nT n) const
+    {
+        ssize_t relative_offset(0);
+        _ind.get_displacement<const ssize_t *, const ssize_t *>(
+            gid, shape, strides + (n * nd), relative_offset);
+
+        return relative_offset + offsets[n];
+    }
+
+private:
+    CIndexer_vector<ssize_t> _ind;
+
+    int nd;
+    ssize_t const *offsets;
+    ssize_t const *shape;
+    ssize_t const *strides;
+};
+
+static_assert(sycl::is_device_copyable_v<NthStrideOffsetUnpacked>);
+
+} // namespace strides
+} // namespace dpnp::kernels::choose
diff --git a/dpnp/backend/kernels/statistics/histogram.hpp b/dpnp/backend/kernels/statistics/histogram.hpp
new file mode 100644
index 000000000000..6d0fedbe0bc3
--- /dev/null
+++ b/dpnp/backend/kernels/statistics/histogram.hpp
@@ -0,0 +1,99 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <sycl/sycl.hpp>
+
+namespace dpnp::kernels::histogram
+{
+template <typename T, typename HistImpl, typename Edges, typename Weights>
+class HistogramFunctor
+{
+private:
+    const T *in = nullptr;
+    const std::size_t size;
+    const std::size_t dims;
+    const std::uint32_t WorkPI;
+    const HistImpl hist;
+    const Edges edges;
+    const Weights weights;
+
+public:
+    HistogramFunctor(const T *in_,
+                     const std::size_t size_,
+                     const std::size_t dims_,
+                     const std::uint32_t WorkPI_,
+                     const HistImpl &hist_,
+                     const Edges &edges_,
+                     const Weights &weights_)
+        : in(in_), size(size_), dims(dims_), WorkPI(WorkPI_), hist(hist_),
+          edges(edges_), weights(weights_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> item) const
+    {
+        auto id = item.get_group_linear_id();
+        auto lid = item.get_local_linear_id();
+        auto group = item.get_group();
+        auto local_size = item.get_local_range(0);
+
+        hist.init(item);
+        edges.init(item);
+
+        if constexpr (HistImpl::sync_after_init || Edges::sync_after_init) {
+            sycl::group_barrier(group, sycl::memory_scope::work_group);
+        }
+
+        auto bounds = edges.get_bounds();
+
+        for (std::uint32_t i = 0; i < WorkPI; ++i) {
+            auto data_idx = id * WorkPI * local_size + i * local_size + lid;
+            if (data_idx < size) {
+                auto *d = &in[data_idx * dims];
+
+                if (edges.in_bounds(d, bounds)) {
+                    auto bin = edges.get_bin(item, d, bounds);
+                    auto weight = weights.get(data_idx);
+                    hist.add(item, bin, weight);
+                }
+            }
+        }
+
+        if constexpr (HistImpl::sync_before_finalize) {
+            sycl::group_barrier(group, sycl::memory_scope::work_group);
+        }
+
+        hist.finalize(item);
+    }
+};
+} // namespace dpnp::kernels::histogram
diff --git a/dpnp/backend/kernels/statistics/sliding_window1d.hpp b/dpnp/backend/kernels/statistics/sliding_window1d.hpp
new file mode 100644
index 000000000000..5b3c5535afd4
--- /dev/null
+++ b/dpnp/backend/kernels/statistics/sliding_window1d.hpp
@@ -0,0 +1,274 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+
+#include <sycl/sycl.hpp>
+
+#include "ext/common.hpp"
+
+namespace dpnp::kernels::sliding_window1d
+{
+using ext::common::CeilDiv;
+
+namespace detail
+{
+template <typename SizeT>
+SizeT get_global_linear_id(const std::uint32_t wpi,
+                           const sycl::nd_item<1> &item)
+{
+    auto sbgroup = item.get_sub_group();
+    const auto sg_loc_id = sbgroup.get_local_linear_id();
+
+    const SizeT sg_base_id = wpi * (item.get_global_linear_id() - sg_loc_id);
+    const SizeT id = sg_base_id + sg_loc_id;
+
+    return id;
+}
+
+template <typename SizeT>
+std::uint32_t get_results_num(const std::uint32_t wpi,
+                              const SizeT size,
+                              const SizeT global_id,
+                              const sycl::nd_item<1> &item)
+{
+    auto sbgroup = item.get_sub_group();
+
+    const auto sbg_size = sbgroup.get_max_local_range()[0];
+    const auto size_ = sycl::sub_sat(size, global_id);
+    return std::min(SizeT(wpi), CeilDiv(size_, sbg_size));
+}
+
+template <typename Results,
+          typename AData,
+          typename VData,
+          typename Op,
+          typename Red>
+void process_block(Results &results,
+                   std::uint32_t r_size,
+                   AData &a_data,
+                   VData &v_data,
+                   std::uint32_t block_size,
+                   Op op,
+                   Red red)
+{
+    for (std::uint32_t i = 0; i < block_size; ++i) {
+        auto v_val = v_data.broadcast(i);
+        for (std::uint32_t r = 0; r < r_size; ++r) {
+            results[r] = red(results[r], op(a_data[r], v_val));
+        }
+        a_data.advance_left();
+    }
+}
+} // namespace detail
+
+template <std::uint32_t WorkPI,
+          typename SpanT,
+          typename KernelT,
+          typename OpT,
+          typename RedT,
+          typename ResultT,
+          template <typename, std::uint32_t> class RegistryDataT,
+          template <typename, std::uint32_t> class RegistryWindowT>
+class SlidingWindow1dFunctor
+{
+private:
+    const SpanT a;
+    const KernelT v;
+    const OpT op;
+    const RedT red;
+    ResultT out;
+
+    static constexpr std::uint32_t default_reg_data_size = 1;
+    using SizeT = typename SpanT::size_type;
+
+public:
+    SlidingWindow1dFunctor(const SpanT &a_,
+                           const KernelT &v_,
+                           const OpT &op_,
+                           const RedT &red_,
+                           ResultT &out_)
+        : a(a_), v(v_), op(op_), red(red_), out(out_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> item) const
+    {
+        auto glid = detail::get_global_linear_id<SizeT>(WorkPI, item);
+
+        auto results =
+            RegistryDataT<typename ResultT::value_type, WorkPI>(item);
+        results.fill(0);
+
+        auto results_num =
+            detail::get_results_num<SizeT>(WorkPI, out.size(), glid, item);
+
+        const auto *a_begin = a.begin();
+        const auto *a_end = a.end();
+
+        auto sbgroup = item.get_sub_group();
+
+        const auto chunks_count =
+            CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);
+
+        const auto *a_ptr = &a.padded_begin()[glid];
+
+        auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
+            return ptr >= a_begin && ptr < a_end;
+        };
+
+        auto a_data =
+            RegistryWindowT<typename SpanT::value_type, WorkPI + 1>(item);
+        a_ptr = a_data.load(a_ptr, _a_load_cond, 0);
+
+        const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
+        auto v_size = v.size();
+
+        for (std::uint32_t b = 0; b < chunks_count; ++b) {
+            auto v_data = RegistryDataT<typename KernelT::value_type,
+                                        default_reg_data_size>(item);
+            v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
+
+            std::uint32_t chunk_size_ =
+                std::min(v_size, SizeT(v_data.total_size()));
+            detail::process_block(results, results_num, a_data, v_data,
+                                  chunk_size_, op, red);
+
+            if (b != chunks_count - 1) {
+                a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
+                                         _a_load_cond, 0);
+                v_size -= v_data.total_size();
+            }
+        }
+
+        auto *const out_ptr = out.begin();
+        // auto *const out_end = out.end();
+
+        auto y_start = glid;
+        auto y_stop = std::min(y_start + WorkPI * results.size_x(), out.size());
+        std::uint32_t i = 0;
+        for (std::uint32_t y = y_start; y < y_stop; y += results.size_x()) {
+            out_ptr[y] = results[i++];
+        }
+        // while the code itself seems to be valid, inside correlate
+        // kernel it results in memory corruption. Further investigation
+        // is needed. SAT-7693
+        // corruption results.store(&out_ptr[glid],
+        //               [out_end](auto &&ptr) { return ptr < out_end; });
+    }
+};
+
+template <std::uint32_t WorkPI,
+          typename SpanT,
+          typename KernelT,
+          typename OpT,
+          typename RedT,
+          typename ResultT,
+          template <typename, std::uint32_t> class RegistryDataT,
+          template <typename, std::uint32_t> class RegistryWindowT>
+class SlidingWindow1dSmallFunctor
+{
+private:
+    const SpanT a;
+    const KernelT v;
+    const OpT op;
+    const RedT red;
+    ResultT out;
+
+    static constexpr std::uint32_t default_reg_data_size = 1;
+    using SizeT = typename SpanT::size_type;
+
+public:
+    SlidingWindow1dSmallFunctor(const SpanT &a_,
+                                const KernelT &v_,
+                                const OpT &op_,
+                                const RedT &red_,
+                                ResultT &out_)
+        : a(a_), v(v_), op(op_), red(red_), out(out_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> item) const
+    {
+        auto glid = detail::get_global_linear_id<SizeT>(WorkPI, item);
+
+        auto results =
+            RegistryDataT<typename ResultT::value_type, WorkPI>(item);
+        results.fill(0);
+
+        auto sbgroup = item.get_sub_group();
+        auto sg_size = sbgroup.get_max_local_range()[0];
+
+        const std::uint32_t to_read = WorkPI * sg_size + v.size();
+        const auto *a_begin = a.begin();
+
+        const auto *a_ptr = &a.padded_begin()[glid];
+        const auto *a_end = std::min(a_ptr + to_read, a.end());
+
+        auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
+            return ptr >= a_begin && ptr < a_end;
+        };
+
+        auto a_data =
+            RegistryWindowT<typename SpanT::value_type, WorkPI + 1>(item);
+        a_data.load(a_ptr, _a_load_cond, 0);
+
+        const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
+        auto v_size = v.size();
+
+        auto v_data =
+            RegistryDataT<typename KernelT::value_type, default_reg_data_size>(
+                item);
+        v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
+
+        auto results_num =
+            detail::get_results_num<SizeT>(WorkPI, out.size(), glid, item);
+
+        detail::process_block(results, results_num, a_data, v_data, v_size, op,
+                              red);
+
+        auto *const out_ptr = out.begin();
+        // auto *const out_end = out.end();
+
+        auto y_start = glid;
+        auto y_stop = std::min(y_start + WorkPI * results.size_x(), out.size());
+        std::uint32_t i = 0;
+        for (std::uint32_t y = y_start; y < y_stop; y += results.size_x()) {
+            out_ptr[y] = results[i++];
+        }
+        // while the code itself seems to be valid, inside correlate
+        // kernel it results in memory corruption. Further investigation
+        // is needed. SAT-7693
+        // corruption results.store(&out_ptr[glid],
+        //               [out_end](auto &&ptr) { return ptr < out_end; });
+    }
+};
+} // namespace dpnp::kernels::sliding_window1d
diff --git a/dpnp/backend/extensions/window/bartlett.hpp b/dpnp/backend/kernels/window/bartlett.hpp
similarity index 80%
rename from dpnp/backend/extensions/window/bartlett.hpp
rename to dpnp/backend/kernels/window/bartlett.hpp
index 69d3be627c84..20d410150dcb 100644
--- a/dpnp/backend/extensions/window/bartlett.hpp
+++ b/dpnp/backend/kernels/window/bartlett.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
+// Copyright (c) 2026, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -19,7 +19,7 @@
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, RES, OR PROFITS; OR BUSINESS
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
@@ -28,12 +28,12 @@
 
 #pragma once
 
-#include "common.hpp"
+#include <cstddef>
+
 #include <sycl/sycl.hpp>
 
-namespace dpnp::extensions::window::kernels
+namespace dpnp::kernels::bartlett
 {
-
 template <typename T>
 class BartlettFunctor
 {
@@ -52,19 +52,4 @@ class BartlettFunctor
         res[i] = T(1) - sycl::fabs(i - alpha) / alpha;
     }
 };
-
-template <typename fnT, typename T>
-struct BartlettFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_floating_point_v<T>) {
-            return window_impl<T, BartlettFunctor>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-} // namespace dpnp::extensions::window::kernels
+} // namespace dpnp::kernels::bartlett
diff --git a/dpnp/backend/extensions/window/blackman.hpp b/dpnp/backend/kernels/window/blackman.hpp
similarity index 83%
rename from dpnp/backend/extensions/window/blackman.hpp
rename to dpnp/backend/kernels/window/blackman.hpp
index 7a75d226792f..9df7cb8728e2 100644
--- a/dpnp/backend/extensions/window/blackman.hpp
+++ b/dpnp/backend/kernels/window/blackman.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
+// Copyright (c) 2026, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -28,12 +28,12 @@
 
 #pragma once
 
-#include "common.hpp"
+#include <cstddef>
+
 #include <sycl/sycl.hpp>
 
-namespace dpnp::extensions::window::kernels
+namespace dpnp::kernels::blackman
 {
-
 template <typename T>
 class BlackmanFunctor
 {
@@ -53,19 +53,4 @@ class BlackmanFunctor
                  T(0.08) * sycl::cospi(T(2) * alpha);
     }
 };
-
-template <typename fnT, typename T>
-struct BlackmanFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_floating_point_v<T>) {
-            return window_impl<T, BlackmanFunctor>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-} // namespace dpnp::extensions::window::kernels
+} // namespace dpnp::kernels::blackman
diff --git a/dpnp/backend/extensions/window/hamming.hpp b/dpnp/backend/kernels/window/hamming.hpp
similarity index 83%
rename from dpnp/backend/extensions/window/hamming.hpp
rename to dpnp/backend/kernels/window/hamming.hpp
index 521ebc10c281..895ecb0e588c 100644
--- a/dpnp/backend/extensions/window/hamming.hpp
+++ b/dpnp/backend/kernels/window/hamming.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
+// Copyright (c) 2026, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -28,12 +28,12 @@
 
 #pragma once
 
-#include "common.hpp"
+#include <cstddef>
+
 #include <sycl/sycl.hpp>
 
-namespace dpnp::extensions::window::kernels
+namespace dpnp::kernels::hamming
 {
-
 template <typename T>
 class HammingFunctor
 {
@@ -51,19 +51,4 @@ class HammingFunctor
         res[i] = T(0.54) - T(0.46) * sycl::cospi(T(2) * i / (N - 1));
     }
 };
-
-template <typename fnT, typename T>
-struct HammingFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_floating_point_v<T>) {
-            return window_impl<T, HammingFunctor>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-} // namespace dpnp::extensions::window::kernels
+} // namespace dpnp::kernels::hamming
diff --git a/dpnp/backend/extensions/window/hanning.hpp b/dpnp/backend/kernels/window/hanning.hpp
similarity index 83%
rename from dpnp/backend/extensions/window/hanning.hpp
rename to dpnp/backend/kernels/window/hanning.hpp
index 612036d6b05a..35b441f921f8 100644
--- a/dpnp/backend/extensions/window/hanning.hpp
+++ b/dpnp/backend/kernels/window/hanning.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
+// Copyright (c) 2026, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -28,12 +28,12 @@
 
 #pragma once
 
-#include "common.hpp"
+#include <cstddef>
+
 #include <sycl/sycl.hpp>
 
-namespace dpnp::extensions::window::kernels
+namespace dpnp::kernels::hanning
 {
-
 template <typename T>
 class HanningFunctor
 {
@@ -51,19 +51,4 @@ class HanningFunctor
         res[i] = T(0.5) - T(0.5) * sycl::cospi(T(2) * i / (N - 1));
     }
 };
-
-template <typename fnT, typename T>
-struct HanningFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_floating_point_v<T>) {
-            return window_impl<T, HanningFunctor>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-} // namespace dpnp::extensions::window::kernels
+} // namespace dpnp::kernels::hanning
diff --git a/dpnp/backend/kernels/window/kaiser.hpp b/dpnp/backend/kernels/window/kaiser.hpp
new file mode 100644
index 000000000000..ce8c8e52fd18
--- /dev/null
+++ b/dpnp/backend/kernels/window/kaiser.hpp
@@ -0,0 +1,64 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <cstddef>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/elementwise_functions/i0.hpp"
+
+namespace dpnp::kernels::kaiser
+{
+template <typename T>
+class KaiserFunctor
+{
+private:
+    T *res = nullptr;
+    const std::size_t N;
+    const T beta;
+
+public:
+    KaiserFunctor(T *res, const std::size_t N, const T beta)
+        : res(res), N(N), beta(beta)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        using dpnp::kernels::i0::cyl_bessel_i0;
+
+        const auto i = id.get(0);
+        const T alpha = (N - 1) / T(2);
+        const T tmp = (i - alpha) / alpha;
+        res[i] = cyl_bessel_i0(beta * sycl::sqrt(1 - tmp * tmp)) /
+                 cyl_bessel_i0(beta);
+    }
+};
+} // namespace dpnp::kernels::kaiser
diff --git a/scripts/gen_coverage.py b/scripts/gen_coverage.py
index 588345d91b2e..545fd888c1ba 100644
--- a/scripts/gen_coverage.py
+++ b/scripts/gen_coverage.py
@@ -259,6 +259,9 @@ def find_objects():
                     "-format=lcov",
                     "-ignore-filename-regex=/tmp/icpx*",
                     r"-ignore-filename-regex=.*/backend/kernels/elementwise_functions/.*\.hpp$",
+                    r"-ignore-filename-regex=.*/backend/kernels/indexing/.*\.hpp$",
+                    r"-ignore-filename-regex=.*/backend/kernels/statistics/.*\.hpp$",
+                    r"-ignore-filename-regex=.*/backend/kernels/window/.*\.hpp$",
                     "-instr-profile=" + instr_profile_fn,
                 ]
                 + objects