QuEST-Kit · TysonRayJones · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -183,21 +183,6 @@ option(
 )
 message(STATUS "AMD GPU acceleration is turned ${QUEST_ENABLE_HIP}. Set QUEST_ENABLE_HIP to modify.")
 
-# GPU Performance Tuning
-## We do not print this value when configuring CMake as it is for advanced users only.
-
-set(QUEST_GPU_NUM_THREADS_PER_BLOCK 128
-  CACHE
-  STRING
-  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. Must be a multiple of 32."
-)
-mark_as_advanced(QUEST_GPU_NUM_THREADS_PER_BLOCK)
-
-math(EXPR quest_tpb_remainder "${QUEST_GPU_NUM_THREADS_PER_BLOCK} % 32")
-if ((NOT (quest_tpb_remainder EQUAL 0)) OR (QUEST_GPU_NUM_THREADS_PER_BLOCK LESS 32))
-    message(FATAL_ERROR "QUEST_GPU_NUM_THREADS_PER_BLOCK must be a multiple of 32. QUEST_GPU_NUM_THREADS_PER_BLOCK=${QUEST_GPU_NUM_THREADS_PER_BLOCK}.")
-endif()
-
 # Deprecated API
 option(
   QUEST_ENABLE_DEPRECATED_API
@@ -514,7 +499,6 @@ set(QUEST_COMPILE_MPI ${QUEST_ENABLE_MPI})
 set(QUEST_COMPILE_SUBCOMM ${QUEST_ENABLE_SUBCOMM})
 set(QUEST_COMPILE_CUQUANTUM ${QUEST_ENABLE_CUQUANTUM})
 set(QUEST_INCLUDE_DEPRECATED_FUNCTIONS ${QUEST_ENABLE_DEPRECATED_API})
-set(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK ${QUEST_GPU_NUM_THREADS_PER_BLOCK})
 
 
 # (for the love of God cmake, create a concise syntax for this)
@@ -523,6 +507,7 @@ if (QUEST_ENABLE_CUDA OR QUEST_ENABLE_HIP)
 else()
   set(QUEST_COMPILE_CUDA 0)
 endif()
+set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})  # mirrors the HIP option into config.h.in (#cmakedefine01 QUEST_COMPILE_HIP)
 
 
 # these vars are already set, but repeated here for clarity
@@ -531,10 +516,6 @@ set(QUEST_ENABLE_NUMA ${QUEST_ENABLE_NUMA})
 set(QUEST_DISABLE_DEPRECATION_WARNINGS ${QUEST_DISABLE_DEPRECATION_WARNINGS})
 
 
-# these do not appear in src but are saved for record-keeping in config.h.in
-set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
-
-
 
 # ============================
 # Pass files to library

diff --git a/docs/cmake.md b/docs/cmake.md
@@ -48,7 +48,6 @@ make
 | `QUEST_DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
 | `USER_SOURCE_NAMES` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `USER_OUTPUT_EXE_NAME` *must* also be defined. |
 | `USER_OUTPUT_EXE_NAME` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE_NAMES`. `USER_SOURCE_NAMES` *must* also be defined. |
-| `QUEST_GPU_NUM_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32. For AMD GPUs this *should* be a multiple of 64. |
 
 
 

diff --git a/docs/launch.md b/docs/launch.md
@@ -270,6 +270,7 @@ QuEST execution can be configured prior to runtime using the below [environment
 
 - [`QUEST_PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga84b134d552464a82d29517e1ce1309a7)
 - [`QUEST_DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#gac4ab30619e411c965377c910680e242c)
+- [`QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK`](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b)
 
 Note the unit tests in the preceding section accept additional environment variables.
 

diff --git a/examples/extended/user_owned_mpi.c b/examples/extended/user_owned_mpi.c
@@ -0,0 +1,42 @@
+/** @file
+ * 
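+ * Example of initialising QuEST when the user owns MPI, i.e. the program itself calls MPI_Init() and MPI_Finalize().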
+ * 
+ * @author Oliver Brown
+ */
+
+#include "quest.h"
+#include <stdio.h>
+
+// This example requires linking with MPI, which the CMake
+// build only does when QUEST_ENABLE_SUBCOMM is ON; in that
+// case quest.h defines QUEST_COMPILE_SUBCOMM as 1
+#if ! QUEST_COMPILE_SUBCOMM
+
+int main(void) // fallback entry point, compiled when QUEST_COMPILE_SUBCOMM is 0 (or undefined)
+{    
+    printf("Example skipped since MPI is not linked.\n");
+    return 0; // skipping is reported as success
+}
+
+#else 
+
+#include <mpi.h>
+
+int main(void) // demonstrates initialising QuEST while this program, not QuEST, owns MPI
+{
+    const int  USE_DISTRIB = 1;
+    const bool USER_MPI    = 1; // declares that the user takes responsibility for MPI_Init and MPI_Finalize
+    const int  USE_OPENMP  = 1;
+    const int  USE_GPU     = 0;
+
+    MPI_Init(NULL, NULL); // MPI is started here, before QuEST is initialised
+    initCustomMpiQuESTEnv(USE_DISTRIB, USER_MPI, USE_GPU, USE_OPENMP); // parameter order: useDistrib, userOwnsMpi, useGpuAccel, useMultithread
+    reportQuESTEnv();
+    finalizeQuESTEnv();
+    MPI_Finalize(); // user-owned MPI: this program, not QuEST, shuts MPI down
+
+    return 0;
+}
+
+#endif // QUEST_COMPILE_SUBCOMM
diff --git a/examples/extended/user_owned_submpi.cpp b/examples/extended/user_owned_submpi.cpp
@@ -0,0 +1,84 @@
+/** @file
+ * 
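+ * Example of running QuEST on a sub-communicator of MPI_COMM_WORLD: odd world ranks form the communicator given to QuEST.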
+ * 
+ * @author Oliver Brown
+ */
+
+#include "quest.h"
+#include <cstdio>
+
+
+    // TODO:
+    // this example sees some processes print to stdout while
+    // QuEST is reporting, interleaving their output. May be worth
+    // introducing a sync to force non-QuEST processes to wait
+    // during QuEST reporting
+
+
+// This example requires linking with MPI, which the CMake
+// build only does when QUEST_ENABLE_SUBCOMM is ON; in that
+// case quest.h defines QUEST_COMPILE_SUBCOMM as 1
+#if ! QUEST_COMPILE_SUBCOMM
+
+int main() // fallback entry point, compiled when QUEST_COMPILE_SUBCOMM is 0 (or undefined)
+{    
+    std::printf("Example skipped since MPI is not linked.\n");
+    return 0; // skipping is reported as success
+}
+
+#else 
+
+#include <mpi.h>
+
+int main (void) // splits MPI_COMM_WORLD in two and initialises QuEST only on the "quantum" half
+{
+    int nprocs, quest_nprocs, world_rank, quest_rank;
+    MPI_Comm comm_split, comm_quantum, comm_classical;
+
+    MPI_Init(NULL, NULL); // the user owns MPI, so it is initialised here rather than by QuEST
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+
+    const int I_AM_QUANTUM = world_rank % 2; // odd world ranks are "quantum", even ranks are "classical"
+
+    std::printf("[%d] Hello from rank %d of %d in MPI_COMM_WORLD.\n", world_rank, world_rank, nprocs);
+
+    MPI_Comm_split(MPI_COMM_WORLD, I_AM_QUANTUM, world_rank, &comm_split); // colour = I_AM_QUANTUM, key = world_rank
+
+    if (I_AM_QUANTUM) {
+        MPI_Comm_dup(comm_split, &comm_quantum); // only comm_quantum is assigned on quantum processes
+        MPI_Comm_size(comm_quantum, &quest_nprocs);
+        MPI_Comm_rank(comm_quantum, &quest_rank);
+        std::printf("[%d] Hello from rank %d of %d in comm_quantum.\n", world_rank, quest_rank, quest_nprocs);
+    } else {
+        MPI_Comm_dup(comm_split, &comm_classical); // only comm_classical is assigned on classical processes
+        quest_rank = -1; // placeholder: classical processes have no rank in comm_quantum
+        quest_nprocs = -1;
+    }
+
+    // only procs in quantum comm initialise QuEST
+    if (I_AM_QUANTUM) {
+        std::printf("[%d] Initialising QuEST.\n", world_rank);
+        initCustomMpiCommQuESTEnv(comm_quantum, modeflag::USE_AUTO, modeflag::USE_AUTO); // parameter order: questComm, useGpuAccel, useMultithread
+
+        reportQuESTEnv();
+
+        std::printf("[%d] Finalising QuEST.\n", world_rank);
+        finalizeQuESTEnv();
+    }
+
+    MPI_Comm_free(&comm_split); // every process frees the split communicator, then its own duplicate below
+    if (I_AM_QUANTUM) {
+        MPI_Comm_free(&comm_quantum);
+    } else {
+        MPI_Comm_free(&comm_classical);
+    }
+
+    MPI_Finalize(); // user-owned MPI: this program, not QuEST, shuts MPI down
+
+    return 0;
+}
+
+#endif // QUEST_COMPILE_SUBCOMM
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
@@ -83,18 +83,13 @@
 #cmakedefine01 QUEST_COMPILE_SUBCOMM
 #cmakedefine01 QUEST_COMPILE_CUDA
 #cmakedefine01 QUEST_COMPILE_CUQUANTUM
+#cmakedefine01 QUEST_COMPILE_HIP
 
-// default parameters which may have been tuned for performance when building the library
-#cmakedefine QUEST_DEFAULT_NUM_THREADS_PER_BLOCK @QUEST_DEFAULT_NUM_THREADS_PER_BLOCK@
 
 // crucial to QuEST source (informs optional NUMA usage)
 #cmakedefine01 QUEST_ENABLE_NUMA
 
 
-// not consulted by src (included for book-keeping)
-#cmakedefine01 QUEST_COMPILE_HIP
-
-
 
 /*
  * inherit the version information from CMake.

diff --git a/quest/include/environment.h b/quest/include/environment.h
@@ -35,16 +35,16 @@ extern "C" {
 typedef struct {
 
     // deployment modes which can be runtime disabled
-    int isMultithreaded;
-    int isGpuAccelerated;
-    int isDistributed;
-    bool userOwnsMpi;
+    bool isMultithreaded;
+    bool isGpuAccelerated;
+    bool isDistributed;
+    bool isMpiUserOwned;
 
     // deployment modes which cannot be directly changed after compilation
-    int isCuQuantumEnabled;
+    bool isCuQuantumEnabled;
 
     // deployment configurations which can be changed via environment variables
-    int isGpuSharingEnabled;
+    bool isGpuSharingEnabled;
 
     // distributed configuration
     int rank;
@@ -64,12 +64,6 @@ void initQuESTEnv();
  */
 void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread);
 
-/** @notyetdoced
- *  Advanced initialiser which lets the user positively declare that they take responsibility for MPI.
- *  This means we assume they have called MPI_Init, and that they will call MPI_Finalize.
- */
-void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread);
-
 /// @notyetdoced
 void finalizeQuESTEnv();
 
@@ -92,14 +86,6 @@ int isQuESTEnvInit();
 QuESTEnv getQuESTEnv();
 
 
-/** @notyetdoced
- * GPU thread per block control
- * This is somehow probably the best pre-existing place for this. It only really applies to GPU, because for
- * OpenMP the user can just export OMP_NUM_THREADS or call omp_set_num_threads.
- */
-int getQuESTNumGpuThreadsPerBlock();
-void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock);
-
 
 // end de-mangler
 #ifdef __cplusplus

diff --git a/quest/include/experimental.h b/quest/include/experimental.h
@@ -0,0 +1,107 @@
+/** @file
+ * Experimental functions which are liable to
+ * API breaks within QuEST minor version releases.
+ * Some optional functions require compiling this
+ * file against MPI, despite being outside of /comm/, 
+ * and so require opt-in macros (QUEST_COMPILE_SUBCOMM)
+ * 
+ * @author Oliver Brown
+ * @author Tyson Jones (formatting)
+ * 
+ * @defgroup experimental Experimental
+ * @ingroup api
+ * @brief Experimental functions with tentative APIs
+ * @{
+ */
+
+#ifndef EXPERIMENTAL_H
+#define EXPERIMENTAL_H
+
+#include "quest/include/config.h"
+
+#if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI
+    #error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false."
+#endif
+
+#if QUEST_COMPILE_SUBCOMM
+    #include <mpi.h>
+#endif
+
+// enable invocation by both C and C++ binaries
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** @notyetdoced
+ *
+ *  Advanced initialiser which lets the user positively declare that they take responsibility for MPI.
+ *  This means we assume they have called MPI_Init, and that they will call MPI_Finalize.
+ * 
+ * @author Oliver Brown
+ */
+void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread);
+
+
+#if QUEST_COMPILE_SUBCOMM
+/** @notyetdoced
+ * 
+ *  Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use.
+ *  Use of this initialiser implies userOwnsMpi = true (as exposed by initCustomMpiQuESTEnv), and
+ *  therefore that the user has already initialised MPI and will call MPI_Finalize at the
+ *  appropriate time.
+ *
+ *  The user-provided MPI communicator undergoes the same validation procedure as any that QuEST
+ *  would use, and so must contain a power-of-2 number of processes.
+ * 
+ * This function is only compiled and exposed when macro QUEST_COMPILE_SUBCOMM is 1, as is
+ * defined when providing CMake option QUEST_ENABLE_SUBCOMM during building.
+ *
+ * @author Oliver Brown
+ */
+void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread);
+#endif // QUEST_COMPILE_SUBCOMM
+
+
+/** @notyetdoced
+ * Presumably returns the number of GPU threads per block QuEST currently uses (TODO confirm); see setQuESTNumGpuThreadsPerBlock().
+ * @author Oliver Brown
+ */
+int getQuESTNumGpuThreadsPerBlock();
+
+
+/** Overrides the number of CUDA threads per block (or @p blockDim) used by QuEST's GPU-accelerated backend.
+ * 
+ * This changes the GPU parallelisation granularity and can affect performance, and is useful
+ * for performance tuning or diagnostics. Before this function is called, QuEST will use the
+ * number as specified by the environment variable @p QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK,
+ * if defined. Otherwise, it will fall back to an internal default (presently @p 128).
+ * 
+ * Practical values of @p numThreadsPerBlock can vary with the simulation size, the user's GPU hardware,
+ * and whether it is NVIDIA or AMD, which have respective warp sizes of @p 32 and @p 64.
+ * 
+ * @note
+ * This function has no effect when QuEST is not deployed with GPU-acceleration enabled.
+ *
+ * @param[in] numThreadsPerBlock the new block size.
+ * @throws @validationerror
+ * - if the @p QuESTEnv is not initialised.
+ * - if @p numThreadsPerBlock is negative.
+ * - if @p numThreadsPerBlock is not a multiple of the GPU warp size.
+ * - if @p numThreadsPerBlock exceeds the maximum @p blockDim imposed by the GPU hardware.
+ * @see
+ * - QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK
+ * @author Oliver Brown
+ * @author Tyson Jones
+ */
+void setQuESTNumGpuThreadsPerBlock(int numThreadsPerBlock);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+#endif // EXPERIMENTAL_H
+
+/** @} */ // (end file-wide doxygen defgroup)