From 0defe751f6e31cd101d493c84f5f936c87066c5b Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Wed, 13 May 2026 22:31:35 +0200
Subject: [PATCH 1/2] [x86_64] Minimal support for avx512vl

avx512vl only extends the 128- and 256-bit registers with extra
operations; it does not add any 512-bit instructions, so the
architecture description is mostly empty. This is preliminary work for
#1345.
---
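A short usage sketch of what this first patch enables, assuming only the
existing xsimd API plus what the diff below adds (none of this code is
part of the patch itself):

    #include <xsimd/xsimd.hpp>

    // compile-time: true only when built with -mavx512vl (the macro
    // chains on XSIMD_WITH_AVX512CD, see xsimd_config.hpp below)
    constexpr bool vl_compiled = xsimd::avx512vl::supported();

    // runtime: new cpuid entry, consistent with the implication chain
    // checked in test_cpu_features.cpp (avx512vl implies avx512cd)
    bool vl_available = xsimd::available_architectures().avx512vl;

    // registers are aliased to the avx512cd ones, so a batch dispatched
    // on avx512vl is still 512-bit wide
    using vl_batch = xsimd::batch<float, xsimd::avx512vl>; // 16 floats, __m512
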
 README.md                                   |  2 +-
 docs/Doxyfile                               |  1 +
 include/xsimd/arch/xsimd_avx512vl.hpp       | 19 ++++++++
 include/xsimd/arch/xsimd_isa.hpp            |  4 ++
 include/xsimd/config/xsimd_arch.hpp         |  2 +-
 include/xsimd/config/xsimd_config.hpp       | 13 +++++-
 include/xsimd/config/xsimd_cpuid.hpp        |  2 ++
 include/xsimd/types/xsimd_all_registers.hpp |  1 +
 .../xsimd/types/xsimd_avx512vl_register.hpp | 51 +++++++++++++++++++
 test/test_cpu_features.cpp                  |  4 +-
 10 files changed, 95 insertions(+), 4 deletions(-)
 create mode 100644 include/xsimd/arch/xsimd_avx512vl.hpp
 create mode 100644 include/xsimd/types/xsimd_avx512vl_register.hpp

diff --git a/README.md b/README.md
index ea71b1b46..2c2e459b1 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ The following SIMD instruction set extensions are supported:
 Architecture | Instruction set extensions
 -------------|-----------------------------------------------------
 x86          | SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA3+SSE, FMA3+AVX, FMA3+AVX2
-x86          | AVX512BW, AVX512CD, AVX512DQ, AVX512F (gcc7 and higher)
+x86          | AVX512BW, AVX512CD, AVX512DQ, AVX512F, AVX512VL (gcc7 and higher)
 x86 AMD      | FMA4
 ARM          | NEON, NEON64, SVE128/256/512 (fixed vector size)
 WebAssembly  | WASM
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 72cd9c32e..c574a8579 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -15,6 +15,7 @@ INPUT = ../include/xsimd/types/xsimd_api.hpp \
         ../include/xsimd/types/xsimd_avx512cd_register.hpp \
         ../include/xsimd/types/xsimd_avx512dq_register.hpp \
         ../include/xsimd/types/xsimd_avx512f_register.hpp \
+        ../include/xsimd/types/xsimd_avx512vl_register.hpp \
         ../include/xsimd/types/xsimd_avx_register.hpp \
         ../include/xsimd/types/xsimd_fma3_avx_register.hpp \
         ../include/xsimd/types/xsimd_fma3_avx2_register.hpp \
diff --git a/include/xsimd/arch/xsimd_avx512vl.hpp b/include/xsimd/arch/xsimd_avx512vl.hpp
new file mode 100644
index 000000000..d47b0df40
--- /dev/null
+++ b/include/xsimd/arch/xsimd_avx512vl.hpp
@@ -0,0 +1,19 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512VL_HPP
+#define XSIMD_AVX512VL_HPP
+
+#include "../types/xsimd_avx512vl_register.hpp"
+
+// no 512-bit operations with avx512vl, it only provides 128- and 256-bit ones
+
+#endif
diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp
index 7975988aa..6beaa5273 100644
--- a/include/xsimd/arch/xsimd_isa.hpp
+++ b/include/xsimd/arch/xsimd_isa.hpp
@@ -73,6 +73,10 @@
 #include "./xsimd_avx512f.hpp"
 #endif
 
+#if XSIMD_WITH_AVX512VL
+#include "./xsimd_avx512vl.hpp"
+#endif
+
 #if XSIMD_WITH_AVX512DQ
 #include "./xsimd_avx512dq.hpp"
 #endif
diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp
index 85b7eebf1..e1f1e60fa 100644
--- a/include/xsimd/config/xsimd_arch.hpp
+++ b/include/xsimd/config/xsimd_arch.hpp
@@ -162,7 +162,7 @@ namespace xsimd
     } // namespace detail
 
     using all_x86_architectures = arch_list<
-        avx512vnni<avx512vbmi2>, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512cd, avx512f,
+        avx512vnni<avx512vbmi2>, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512vl, avx512cd, avx512f,
         avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, avx2_128, avx_128, fma4, fma3<sse4_2>,
         sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp
index 58458f5bf..e3887c276 100644
--- a/include/xsimd/config/xsimd_config.hpp
+++ b/include/xsimd/config/xsimd_config.hpp
@@ -307,6 +307,17 @@
 #define XSIMD_WITH_AVX512CD 0
 #endif
 
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if AVX512VL is available at compile-time, to 0 otherwise.
+ */
+#ifdef __AVX512VL__
+#define XSIMD_WITH_AVX512VL XSIMD_WITH_AVX512CD
+#else
+#define XSIMD_WITH_AVX512VL 0
+#endif
+
 /**
  * @ingroup xsimd_config_macro
  *
@@ -615,7 +626,7 @@
 
 #endif
 
-#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_VSX && !XSIMD_WITH_EMULATED && !XSIMD_WITH_VXE
+#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512VL && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_VSX && !XSIMD_WITH_EMULATED && !XSIMD_WITH_VXE
 #define XSIMD_NO_SUPPORTED_ARCHITECTURE
 #endif
diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp
index 8c167be4a..7466cd5f8 100644
--- a/include/xsimd/config/xsimd_cpuid.hpp
+++ b/include/xsimd/config/xsimd_cpuid.hpp
@@ -49,6 +49,7 @@ namespace xsimd
             ARCH_FIELD_EX(fma3<::xsimd::avx2>, fma3_avx2)
             ARCH_FIELD(avx512f)
             ARCH_FIELD(avx512cd)
+            ARCH_FIELD(avx512vl)
             ARCH_FIELD(avx512dq)
             ARCH_FIELD(avx512bw)
             ARCH_FIELD(avx512er)
@@ -121,6 +122,7 @@ namespace xsimd
 
                 avx512f = cpu.avx512f();
                 avx512cd = cpu.avx512cd();
+                avx512vl = cpu.avx512vl();
                 avx512dq = cpu.avx512dq();
                 avx512bw = cpu.avx512bw();
                 avx512er = cpu.avx512er();
diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp
index df7149d1c..eb058f9b7 100644
--- a/include/xsimd/types/xsimd_all_registers.hpp
+++ b/include/xsimd/types/xsimd_all_registers.hpp
@@ -19,6 +19,7 @@
 #include "./xsimd_avx512pf_register.hpp"
 #include "./xsimd_avx512vbmi2_register.hpp"
 #include "./xsimd_avx512vbmi_register.hpp"
+#include "./xsimd_avx512vl_register.hpp"
 #include "./xsimd_avx512vnni_avx512bw_register.hpp"
 #include "./xsimd_avx512vnni_avx512vbmi2_register.hpp"
 #include "./xsimd_avx_register.hpp"
diff --git a/include/xsimd/types/xsimd_avx512vl_register.hpp b/include/xsimd/types/xsimd_avx512vl_register.hpp
new file mode 100644
index 000000000..225590a6b
--- /dev/null
+++ b/include/xsimd/types/xsimd_avx512vl_register.hpp
@@ -0,0 +1,51 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512VL_REGISTER_HPP
+#define XSIMD_AVX512VL_REGISTER_HPP
+
+#include "./xsimd_avx512cd_register.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * @ingroup architectures
+     *
+     * AVX512VL instructions
+     */
+    struct avx512vl : avx512cd
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VL; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx512vl"; }
+    };
+
+#if XSIMD_WITH_AVX512VL
+
+#if !XSIMD_WITH_AVX512CD
+#error "architecture inconsistency: avx512vl requires avx512cd"
+#endif
+
+    namespace types
+    {
+        template <class T>
+        struct get_bool_simd_register<T, avx512vl>
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vl, avx512cd);
+
+    }
+#endif
+}
+#endif
diff --git a/test/test_cpu_features.cpp b/test/test_cpu_features.cpp
index b0a7bc79f..05958a204 100644
--- a/test/test_cpu_features.cpp
+++ b/test/test_cpu_features.cpp
@@ -66,8 +66,9 @@ TEST_CASE("[cpu_features] x86 implication chains")
     CHECK_IMPLICATION(cpu.fma4(), cpu.avx());
     CHECK_IMPLICATION(cpu.fma3(), cpu.avx());
 
-    // AVX-512 iplication chain
+    // AVX-512 implication chain
     CHECK_IMPLICATION(cpu.avx512f(), cpu.avx2());
+    CHECK_IMPLICATION(cpu.avx512vl(), cpu.avx512cd());
     CHECK_IMPLICATION(cpu.avx512dq(), cpu.avx512f());
     CHECK_IMPLICATION(cpu.avx512ifma(), cpu.avx512f());
     CHECK_IMPLICATION(cpu.avx512pf(), cpu.avx512f());
@@ -132,6 +133,7 @@
     CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX512F", cpu.avx512f());
     CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX512BW", cpu.avx512bw());
     CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX512CD", cpu.avx512cd());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX512VL", cpu.avx512vl());
     CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX512DQ", cpu.avx512dq());
     CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVXVNNI", cpu.avxvnni());
 }

From ecdac9498462f3068c1e40d3cfb65b3351b5ce11 Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Sun, 10 May 2026 18:03:49 +0200
Subject: [PATCH 2/2] Tentative support for avx512vl extensions to 256-bit
 registers

In addition to adding some missing instructions (e.g. abs on int64_t),
this mostly changes the mask representation from a vector register to a
scalar mask, hence the large diff.
---
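A rough sketch of what the switch to scalar masks looks like from user
code, assuming the tree is configured with -DXSIMD_DEFAULT_ARCH=avx512vl_256
as in the CI job added below (the instruction comments are indicative,
not guaranteed codegen; nothing here is part of the diff):

    using arch = xsimd::avx512vl_256;
    xsimd::batch<float, arch> a(1.f), b(2.f);      // __m256 payload, 8 lanes

    // batch_bool now wraps a scalar mask (simd_avx512_bool_register),
    // not a 256-bit vector register:
    xsimd::batch_bool<float, arch> m = a < b;             // vcmpps k, ymm, ymm
    xsimd::batch<float, arch> r = xsimd::select(m, a, b); // vblendmps ymm{k}
    uint64_t bits = m.mask();                             // mask bits are free
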
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512VL_256_HPP +#define XSIMD_AVX512VL_256_HPP + +#include + +#include "../types/xsimd_avx512vl_register.hpp" +#include "../types/xsimd_batch_constant.hpp" + +namespace xsimd +{ + namespace kernel + { + using namespace types; + + namespace detail + { + template + XSIMD_INLINE batch_bool compare_int_avx512vl_256(batch const& self, batch const& other) noexcept + { + using register_type = typename batch_bool::register_type; + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + // shifting to take sign into account + uint64_t mask_low0 = _mm256_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, + (batch(other.data) & batch(0x000000FF)) << 24, + Cmp); + uint64_t mask_low1 = _mm256_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, + (batch(other.data) & batch(0x0000FF00)) << 16, + Cmp); + uint64_t mask_high0 = _mm256_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, + (batch(other.data) & batch(0x00FF0000)) << 8, + Cmp); + uint64_t mask_high1 = _mm256_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), + (batch(other.data) & batch(0xFF000000)), + Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 8; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + // shifting to take sign into account + uint16_t mask_low = _mm256_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, + (batch(other.data) & batch(0x0000FFFF)) << 16, + Cmp); + uint16_t mask_high = _mm256_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), + (batch(other.data) & batch(0xFFFF0000)), + Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm256_cmp_epi32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm256_cmp_epi64_mask(self, other, Cmp); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + uint64_t mask_low0 = _mm256_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); + uint64_t mask_low1 = _mm256_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); + uint64_t mask_high0 = _mm256_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); + uint64_t mask_high1 = _mm256_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 8; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint16_t mask_low = _mm256_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); + uint16_t mask_high = _mm256_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return 
(register_type)_mm256_cmp_epu32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm256_cmp_epu64_mask(self, other, Cmp); + } + } + } + } + + // load mask + template + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr auto size = batch_bool::size; + constexpr auto iter = size / 4; + static_assert((size % 4) == 0, "incorrect size of bool batch"); + register_type mask = 0; + for (std::size_t i = 0; i < iter; ++i) + { + unsigned char block = detail::tobitset((unsigned char*)mem + i * 4); + mask |= (register_type(block) << (i * 4)); + } + return mask; + } + + // from bool + template + XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch) noexcept + { + return select(self, batch(1), batch(0)); + } + + // from_mask + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + return static_cast::register_type>(mask); + } + + // mask + template + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return self.data; + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return self.data; + } + + // set + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + using register_type = typename batch_bool::register_type; + register_type r = 0; + unsigned shift = 0; + (void)std::initializer_list { (r |= register_type(values ? 1 : 0) << (shift++))... }; + return r; + } + + // store + template + XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr auto size = batch_bool::size; + for (std::size_t i = 0; i < size; ++i) + mem[i] = self.data & (register_type(1) << i); + } + + // abs + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + return _mm256_abs_epi64(self); + } + + // load masked + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + constexpr auto imm_mask = mask.mask(); + return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); + } + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + constexpr auto imm_mask = mask.mask(); + return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); + } + + // store masked + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_store_epi32(mem, mask.mask(), src); + } + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_store_epi32(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_store_epi64(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_store_epi64(mem, mask.mask(), src); + } + template + XSIMD_INLINE void store_masked(float* 
mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_store_ps(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_store_pd(mem, mask.mask(), src); + } + + // max + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_max_epi64(self, other); + } + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_max_epu64(self, other); + } + + // min + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_min_epi64(self, other); + } + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_min_epu64(self, other); + } + + // swizzle (dynamic version) + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm256_permutexvar_ps(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm256_permutexvar_pd(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm256_permutexvar_epi64(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512vl_256 {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm256_permutexvar_epi32(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512vl_256 {})); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return swizzle(batch { self.data }, batch { mask.data }, avx2 {}).data; + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch const& mask, requires_arch req) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, req)); + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch const& mask, requires_arch req) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, req)); + } + + // swizzle + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch) noexcept + { + return swizzle(self, mask, fma3 {}); + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch) noexcept + { + return swizzle(self, mask, fma3 {}); + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch) noexcept + { + return swizzle(self, mask, fma3 {}); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3); + return _mm256_permutex_pd(self, mask); + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3); + return _mm256_permutex_epi64(self, mask); + } + + // insert + template + XSIMD_INLINE batch insert(batch const& self, float val, index, requires_arch) noexcept + { + + int32_t tmp = bit_cast(val); + 
return _mm256_castsi256_ps(_mm256_mask_set1_epi32(_mm256_castps_si256(self), __mmask8(1 << (I & 7)), tmp)); + } + + template + XSIMD_INLINE batch insert(batch const& self, double val, index, requires_arch) noexcept + { + int64_t tmp = bit_cast(val); + return _mm256_castsi256_pd(_mm256_mask_set1_epi64(_mm256_castpd_si256(self), __mmask8(1 << (I & 3)), tmp)); + } + + template ::value>> + XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_mask_set1_epi32(self, __mmask8(1 << (I & 7)), val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_mask_set1_epi64(self, __mmask8(1 << (I & 3)), val); + } + else + { + return insert(self, val, pos, common {}); + } + } + + // isnan + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, self, _CMP_UNORD_Q); + } + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, self, _CMP_UNORD_Q); + } + + // rotl + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_rolv_epi32(self, other); + } + XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_rolv_epi64(self, other); + } + return rotl(self, other, avx2 {}); + } + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, int32_t other, requires_arch) noexcept + { + return rotl(self, batch(other), A {}); + } + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, requires_arch) noexcept + { + constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; + static_assert(count < bits, "Count must be less than the number of bits in T"); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_rol_epi32(self, count); + } + XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_rol_epi64(self, count); + } + + return rotl(self, avx2 {}); + } + + // rotr + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_rorv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_rorv_epi64(self, other); + } + } + return rotr(self, other, avx2 {}); + } + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, int32_t other, requires_arch) noexcept + { + return rotr(self, batch(other), A {}); + } + + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, requires_arch) noexcept + { + constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; + static_assert(count < bits, "Count must be less than the number of bits in T"); + XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_ror_epi32(self, count); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_ror_epi64(self, count); + } + } + return rotr(self, avx2 {}); + } + + // all + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return self.data == register_type(-1) >> (sizeof(register_type) * 4); + } + + // any + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + using register_type = 
typename batch_bool::register_type; + return self.data != register_type(0); + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_EQ_OQ); + } + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_EQ_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_256(self, other); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data ^ other.data); + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_NEQ_OQ); + } + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_NEQ_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // gt + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_GT_OQ); + } + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_GT_OQ); + } + template ::value>> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_256(self, other); + } + + // ge + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_GE_OQ); + } + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_GE_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_256(self, other); + } + + // lt + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_LT_OQ); + } + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_LT_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_256(self, other); + } + + // le + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& 
other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_LE_OQ); + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_LE_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_256(self, other); + } + + // select + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm256_mask_blend_ps(cond, false_br, true_br); + } + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm256_mask_blend_pd(cond, false_br, true_br); + } + template ::value>> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + batch_bool batch_cond = batch_bool::from_mask(cond.mask()); + return _mm256_blendv_epi8(false_br, true_br, batch_cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + batch_bool batch_cond = batch_bool::from_mask(cond.mask()); + return _mm256_blendv_epi8(false_br, true_br, batch_cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_mask_blend_epi32(cond, false_br, true_br); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_mask_blend_epi64(cond, false_br, true_br); + } + } + template + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... 
}, true_br, false_br, avx512vl_256 {}); + } + + // reciprocal + template + XSIMD_INLINE batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm256_rcp14_ps(self); + } + + template + XSIMD_INLINE batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm256_rcp14_pd(self); + } + + // bitwise_and + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & other.data); + } + + // bitwise_andnot + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & ~other.data); + } + + // bitwise_not + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data); + } + + // bitwise_or + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + // bitwise_xor + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // sadd + template ::value>> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = other < 0; + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(mask, self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index 6beaa5273..5bee73b1a 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -75,6 +75,7 @@ #if XSIMD_WITH_AVX512VL #include "./xsimd_avx512vl.hpp" +#include "./xsimd_avx512vl_256.hpp" #endif #if XSIMD_WITH_AVX512DQ diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp index e1f1e60fa..1084d3faa 100644 --- a/include/xsimd/config/xsimd_arch.hpp +++ b/include/xsimd/config/xsimd_arch.hpp @@ -163,7 +163,7 @@ namespace xsimd using all_x86_architectures = arch_list< avx512vnni, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni, avx512bw, avx512er, avx512dq, avx512vl, avx512cd, avx512f, - avxvnni, fma3, avx2, fma3, avx, avx2_128, avx_128, fma4, fma3, + avxvnni, avx512vl_256, fma3, avx2, fma3, avx, avx2_128, avx_128, fma4, fma3, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>; using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; diff --git a/include/xsimd/config/xsimd_cpu_features_x86.hpp b/include/xsimd/config/xsimd_cpu_features_x86.hpp index 3c840c2c5..d9897aaee 100644 --- a/include/xsimd/config/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/config/xsimd_cpu_features_x86.hpp @@ -895,6 +895,8 @@ namespace xsimd inline bool avx512vl() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + 
+        inline bool avx512vl_256() const noexcept { return avx_enabled() && leaf7().all_bits_set(); }
+
         inline bool avx512vbmi() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); }
 
         inline bool avx512vbmi2() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); }
 
diff --git a/include/xsimd/types/xsimd_avx512f_register.hpp b/include/xsimd/types/xsimd_avx512f_register.hpp
index 279ae4caa..c54161209 100644
--- a/include/xsimd/types/xsimd_avx512f_register.hpp
+++ b/include/xsimd/types/xsimd_avx512f_register.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_AVX512F_REGISTER_HPP
 
 #include "./xsimd_common_arch.hpp"
+#include "./xsimd_fma3_avx2_register.hpp"
 
 namespace xsimd
 {
@@ -69,7 +70,6 @@ namespace xsimd
         XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i);
         XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512);
         XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d);
-
     }
 #endif
 }
diff --git a/include/xsimd/types/xsimd_avx512vl_register.hpp b/include/xsimd/types/xsimd_avx512vl_register.hpp
index 225590a6b..d08eb7704 100644
--- a/include/xsimd/types/xsimd_avx512vl_register.hpp
+++ b/include/xsimd/types/xsimd_avx512vl_register.hpp
@@ -29,6 +29,18 @@ namespace xsimd
         static constexpr char const* name() noexcept { return "avx512vl"; }
     };
 
+    /**
+     * @ingroup architectures
+     *
+     * AVX512VL instruction extension for 256-bit registers
+     */
+    struct avx512vl_256 : fma3<avx2>
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VL; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx512vl/256"; }
+    };
+
 #if XSIMD_WITH_AVX512VL
 
 #if !XSIMD_WITH_AVX512CD
@@ -45,6 +57,13 @@ namespace xsimd
 
         XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vl, avx512cd);
 
+        template <class T>
+        struct get_bool_simd_register<T, avx512vl_256>
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vl_256, avx2);
+
     }
 #endif
 }