QuEST-Kit · TysonRayJones · Jun 2, 2026 · Apr 24, 2026 · Apr 24, 2026 · May 4, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -161,6 +161,7 @@ option(
 )
 message(STATUS "Custom communicator support is turned ${QUEST_ENABLE_SUBCOMM}. Set QUEST_ENABLE_SUBCOMM to modify.")
 
+
 # GPU Acceleration
 option(
   QUEST_ENABLE_CUDA
@@ -184,6 +185,20 @@ option(
 message(STATUS "AMD GPU acceleration is turned ${QUEST_ENABLE_HIP}. Set QUEST_ENABLE_HIP to modify.")
 
 
+# GPU Performance Tuning
+# (We do not print this value when configuring CMake as it is for advanced users only)
+
+# string(CONCAT) glues the fragments into one string; a multi-value set() would instead build a list,
+# leaving stray ';' separators inside the help text of this (advanced, hence unprinted) cache variable
+string(CONCAT quest_tpb_description
+  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. "
+  "Must be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). Can be overridden at executable launch "
+  "via an environment variable of the same name, or during runtime via a corresponding API setter function."
+)
+set(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK 128 CACHE STRING "${quest_tpb_description}")
+mark_as_advanced(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK)
+
+
 # Deprecated API
 option(
   QUEST_ENABLE_DEPRECATED_API
@@ -197,9 +212,15 @@ option(
   "Whether to disable compile-time warnings ordinarily triggered by use of the deprecated API. Turned OFF by default."
   OFF
 )
-message(STATUS "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify.")
+message(STATUS # (message() joins its string arguments without a separator, so this prints as one line)
+  "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. "
+  "Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify."
+)
 
 option(QUEST_INSTALL_BINARIES "Whether to include example and user binaries in the install." OFF)
+if (QUEST_INSTALL_BINARIES) # announced only when ON, since the option defaults to OFF
+  message(STATUS "Including example and user binaries in the install (if built).")
+endif()
 
 
 
@@ -222,10 +243,12 @@ if (QUEST_ENABLE_CUQUANTUM AND NOT QUEST_ENABLE_CUDA)
   message(FATAL_ERROR "Use of cuQuantum requires CUDA.")
 endif()
 
+
 if (QUEST_ENABLE_SUBCOMM AND NOT QUEST_ENABLE_MPI)
   message(FATAL_ERROR "Distribution must be enabled to make use of a user-defined communicator for QuEST.")
 endif()
 
+
 if(WIN32)
 
   # Force MSVC to export all symbols in a shared library, like GCC and clang
@@ -243,6 +266,37 @@ if(WIN32)
 endif()
 
 
+# validate numTPB even when GPU not compiled (the warp size is then taken to be that of NVIDIA, i.e. 32)
+set(quest_warp_size 32)
+set(quest_gpu_model "NVIDIA GPUs (via CUDA), or when not targeting GPUs")
+if (QUEST_ENABLE_HIP)
+  set(quest_warp_size 64)
+  set(quest_gpu_model "AMD GPUs (via HIP)")
+endif()
+set(quest_tpb_remainder 1) # stays non-zero (and so is rejected below) unless numTPB is a plain decimal integer
+if ("${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" MATCHES "^[0-9]+$") # else math(EXPR) aborts with a parse error
+  math(EXPR quest_tpb_remainder "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} % ${quest_warp_size}")
+endif()
+if ((NOT (quest_tpb_remainder EQUAL 0)) OR NOT (QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK GREATER 0))
+  message(FATAL_ERROR "QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK was set to ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}, "
+    "but it must be a positive multiple of ${quest_warp_size} when compiling for ${quest_gpu_model}.")
+endif()
+
+
+# warn when numTPB will be later overridden by the current environment variable (an unset variable
+# expands to the empty string, so the single emptiness test below also covers "not defined")
+set(quest_env_tpb "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}")
+if(
+  NOT quest_env_tpb STREQUAL "" AND NOT quest_env_tpb STREQUAL "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}"
+)
+  message(WARNING 
+    "The CMake option QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK=${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} "
+    "differs from the current environment variable (of the same name) value of $ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}. "
+    "If not cleared before QuEST is launched, the latter will override the former."
+  )
+endif()
+
+
 # Encourage high-performance Release build
 
 # Taken from Kitware's exmaple of problematic code at
@@ -508,18 +562,19 @@ if (QUEST_ENABLE_CUDA OR QUEST_ENABLE_HIP)
 else()
   set(QUEST_COMPILE_CUDA 0)
 endif()
+set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP}) # consumed by '#cmakedefine01 QUEST_COMPILE_HIP' in config.h.in
+
+
+# non-binary vars substituted into config.h.in under a macro name which differs from the CMake option's
+set(QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK})
 
 
-# these vars are already set, but repeated here for clarity
+# these vars are already set (cmake name matches the macro name), but repeated here for clarity
 set(QUEST_FLOAT_PRECISION ${QUEST_FLOAT_PRECISION})
 set(QUEST_ENABLE_NUMA ${QUEST_ENABLE_NUMA})
 set(QUEST_DISABLE_DEPRECATION_WARNINGS ${QUEST_DISABLE_DEPRECATION_WARNINGS})
 
 
-# these do not appear in src but are saved for record-keeping in config.h.in
-set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
-
-
 
 # ============================
 # Pass files to library

diff --git a/docs/cmake.md b/docs/cmake.md
@@ -48,7 +48,7 @@ make
 | `QUEST_DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
 | `USER_SOURCE_NAMES` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `USER_OUTPUT_EXE_NAME` *must* also be defined. |
 | `USER_OUTPUT_EXE_NAME` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE_NAMES`. `USER_SOURCE_NAMES` *must* also be defined. |
-
+| `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). This CMake variable sets the default if not later overridden. The number can be overridden at process launch time using an [environment variable](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b) of the same name, or during runtime using [`setQuESTNumGpuThreadsPerBlock()`](https://quest-kit.github.io/QuEST/group__experimental.html#gae35a55c6d9366ce677e6aaaf4c1ff5ef). |
 
 
 

diff --git a/docs/launch.md b/docs/launch.md
@@ -270,6 +270,7 @@ QuEST execution can be configured prior to runtime using the below [environment
 
 - [`QUEST_PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga84b134d552464a82d29517e1ce1309a7)
 - [`QUEST_DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#gac4ab30619e411c965377c910680e242c)
+- [`QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK`](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b)
 
 Note the unit tests in the preceding section accept additional environment variables.
 

diff --git a/examples/extended/set_num_gpu_threads.c b/examples/extended/set_num_gpu_threads.c
@@ -0,0 +1,91 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * setQuESTNumGpuThreadsPerBlock() function
+ * to change the parallelisation granularity
+ * of GPU simulation
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest.h"
+#include <stdio.h>
+#include <time.h>
+
+
+const int NUM_REPS = 10;
+const int NUM_QUBITS = 25;  // 512 MiB (at double precision)
+
+
+void simulation(Qureg qureg) // the workload which benchmark() repeats and times
+{
+    // put your favourite QuEST simulation here
+    initRandomPureState(qureg);
+    applyFullQuantumFourierTransform(qureg, /*inverse=*/false);
+    calcTotalProb(qureg); // return value is deliberately discarded; the call is here only to be timed
+}
+
+
+// Times NUM_REPS calls of simulation() at the given block size, and prints the average duration of one
+void benchmark(Qureg qureg, int numThreadsPerBlock)
+{
+    printf("Using %d threads per block... ", numThreadsPerBlock);
+    fflush(stdout);
+    setQuESTNumGpuThreadsPerBlock(numThreadsPerBlock);
+
+    // warmup (untimed)
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    // measure elapsed (wall-clock) time; clock() instead reports the processor time consumed
+    // by this process, which need not match how long we actually waited for the GPU
+    struct timespec start, end;
+    timespec_get(&start, TIME_UTC);
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+    timespec_get(&end, TIME_UTC);
+
+    double dur = (double) (end.tv_sec - start.tv_sec) + (double) (end.tv_nsec - start.tv_nsec) / 1E9;
+    printf("took %fs\n", dur / NUM_REPS);
+}
+
+
+int main(void)
+{
+    initQuESTEnv();
+
+    // Without GPU acceleration there is nothing to tune, so leave early
+    if (!getQuESTEnv().isGpuAccelerated) {
+        printf(
+            "GPU acceleration is not enabled, and so changing the number "
+            "of threads per block has no effect. Exiting...\n");
+        finalizeQuESTEnv();
+        return 0;
+    }
+
+    // Report the block size in effect before this program changes it
+    printf("Initial numThreadsPerBlock: %d\n\n", getQuESTNumGpuThreadsPerBlock());
+
+    // Create a statevector parallelised only by the GPU
+    Qureg qureg = createCustomQureg(NUM_QUBITS, 0, 0, 1, 0);
+    reportQuregParams(qureg);
+
+    // Benchmark sensible parameters
+    const int goodTPB[] = {64, 128, 256, 512, 1024};
+    const int numGood = sizeof goodTPB / sizeof goodTPB[0];
+    for (int i = 0; i < numGood; i++)
+        benchmark(qureg, goodTPB[i]);
+
+    // Try silly parameters (validation is switched off first, presumably so they are not rejected)
+    setQuESTValidationOff();
+    const int badTPB[] = {31, 15, 5, 1};
+    const int numBad = sizeof badTPB / sizeof badTPB[0];
+    for (int i = 0; i < numBad; i++)
+        benchmark(qureg, badTPB[i]);
+
+    destroyQureg(qureg);
+    finalizeQuESTEnv();
+    return 0;
+}
diff --git a/examples/extended/set_num_gpu_threads.cpp b/examples/extended/set_num_gpu_threads.cpp
@@ -0,0 +1,91 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * setQuESTNumGpuThreadsPerBlock() function
+ * to change the parallelisation granularity
+ * of GPU simulation
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest.h"
+#include <iostream>
+#include <chrono>
+
+
+const int NUM_REPS = 10;
+const int NUM_QUBITS = 25;  // 512 MiB (at double precision)
+
+
+void simulation(Qureg qureg) // the workload which benchmark() repeats and times
+{
+    // put your favourite QuEST simulation here
+    initRandomPureState(qureg);
+    applyFullQuantumFourierTransform(qureg, /*inverse=*/false);
+    calcTotalProb(qureg); // return value is deliberately discarded; the call is here only to be timed
+}
+
+
+void benchmark(Qureg qureg, int numThreadsPerBlock)
+{
+    using timer = std::chrono::steady_clock;
+    using seconds = std::chrono::duration<double>;
+
+    std::cout << "Using " << numThreadsPerBlock << " threads per block... " << std::flush;
+    setQuESTNumGpuThreadsPerBlock(numThreadsPerBlock);
+
+    // untimed warmup
+    for (int rep=0; rep<NUM_REPS; rep++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    // timed repetitions
+    const auto begun = timer::now();
+    for (int rep=0; rep<NUM_REPS; rep++)
+        simulation(qureg);
+    syncQuESTEnv();
+    const auto ended = timer::now();
+
+    // average elapsed seconds per call of simulation()
+    const double average = seconds(ended - begun).count() / NUM_REPS;
+    std::cout << " took " << average << "s" << std::endl;
+}
+
+
+int main()
+{
+    initQuESTEnv();
+
+    // This example is pointless without a GPU!
+    if (!getQuESTEnv().isGpuAccelerated) {
+        std::cout 
+            << "GPU acceleration is not enabled, and so changing the number "
+            << "of threads per block has no effect. Exiting..."
+            << std::endl;
+        finalizeQuESTEnv();
+        return 0;
+    }
+
+    // The initial number of threads per block is informed by the optional environment
+    // variable QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK. If not specified, QuEST will
+    // use the value of the CMake option of the same name passed during compilation,
+    // which itself has a default of 128
+    std::cout << "Initial numThreadsPerBlock: " << getQuESTNumGpuThreadsPerBlock() << "\n\n";
+
+    // Create a statevector parallelised only by the GPU
+    Qureg qureg = createCustomQureg(NUM_QUBITS, 0, 0, 1, 0);
+    reportQuregParams(qureg);
+
+    // Benchmark QuEST with sensible numbers of threads per block (multiples of warp size)
+    for (auto numTPB : {64, 128, 256, 512, 1024})
+        benchmark(qureg, numTPB);
+
+    // Try silly parameters ¯\_(ツ)_/¯
+    setQuESTValidationOff();
+    for (auto numTPB : {31, 15, 5, 1})
+        benchmark(qureg, numTPB);
+
+    destroyQureg(qureg); // release the Qureg before the environment is finalised
+    finalizeQuESTEnv();
+    return 0;
+}
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
@@ -83,14 +83,15 @@
 #cmakedefine01 QUEST_COMPILE_SUBCOMM
 #cmakedefine01 QUEST_COMPILE_CUDA
 #cmakedefine01 QUEST_COMPILE_CUQUANTUM
+#cmakedefine01 QUEST_COMPILE_HIP
 
 
 // crucial to QuEST source (informs optional NUMA usage)
 #cmakedefine01 QUEST_ENABLE_NUMA
 
 
-// not consulted by src (included for book-keeping)
-#cmakedefine01 QUEST_COMPILE_HIP
+// default parameters which may have been tuned for performance when building the library
+#cmakedefine QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK @QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK@
 
 
 

diff --git a/quest/include/experimental.h b/quest/include/experimental.h
@@ -44,7 +44,6 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
 
 
 #if QUEST_COMPILE_SUBCOMM
-
 /** @notyetdoced
  * 
  *  Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use.
@@ -61,10 +60,46 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
  * @author Oliver Brown
  */
 void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread);
-
 #endif // QUEST_COMPILE_SUBCOMM
 
 
+/** @notyetdoced
+ * 
+ * @author Oliver Brown
+ */
+int getQuESTNumGpuThreadsPerBlock();
+
+
+/** Overrides the number of CUDA threads per block (or @p blockDim) used by QuEST's GPU-accelerated backend.
+ * 
+ * This changes the GPU parallelisation granularity and can affect performance, and is useful
+ * for performance tuning or diagnostics. Before this function is called, QuEST will use the
+ * number as specified by the environment variable @p QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK,
+ * if defined. Otherwise, it will use the value specified by the CMake/compile option of the
+ * same name, which itself presently defaults to @p 128. After this function is called, QuEST
+ * will adopt @p numThreadsPerBlock for the remainder of execution, or until this function is
+ * called again.
+ * 
+ * Practical values of @p numThreadsPerBlock can vary with the simulation size, the user's GPU hardware,
+ * and whether it is NVIDIA or AMD, which have respective warp sizes of @p 32 and @p 64.
+ * 
+ * @note
+ * This function has no effect when QuEST is not deployed with GPU-acceleration enabled.
+ *
+ * @param[in] numThreadsPerBlock the new block size.
+ * @throws @validationerror
+ * - if the @p QuESTEnv is not initialised.
+ * - if @p numThreadsPerBlock is negative.
+ * - if @p numThreadsPerBlock is not a multiple of the GPU warp size.
+ * - if @p numThreadsPerBlock exceeds the maximum @p blockDim imposed by the GPU hardware.
+ * @see
+ * - QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK
+ * @author Oliver Brown
+ * @author Tyson Jones
+ */
+void setQuESTNumGpuThreadsPerBlock(int numThreadsPerBlock);
+
+
 // end de-mangler
 #ifdef __cplusplus
 }