From c150e0b7397a6ae3f9dec2bcbf0e91b5829512d2 Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com>
Date: Fri, 24 Apr 2026 14:00:21 +0100
Subject: [PATCH 01/58] cpu_config.cpp: replaced omp_get_num_threads with
 omp_get_max_threads. Yes, it's confusing. Yes, the OpenMP ARB know.

---
 quest/src/cpu/cpu_config.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp
index c11ec224d..ad8e303a8 100644
--- a/quest/src/cpu/cpu_config.cpp
+++ b/quest/src/cpu/cpu_config.cpp
@@ -79,9 +79,7 @@ int cpu_getAvailableNumThreads() {
 #if COMPILE_OPENMP
     int n = -1;
 
-    #pragma omp parallel shared(n)
-    #pragma omp single
-    n = omp_get_num_threads();
+    n = omp_get_max_threads();
 
     return n;
 #else

From 9b8ddd1bb3d085d4a0d92b7f663b42fee138481e Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com>
Date: Fri, 24 Apr 2026 18:10:26 +0100
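Subject: [PATCH 02/58] We do these things not because they are easy, but
 because we thought they would be easy.

Replace the compile-time NUM_THREADS_PER_BLOCK constant (128) with a runtime
setting, exposed via getQuESTGpuThreadsPerBlock()/setQuESTGpuThreadsPerBlock().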

---
 quest/include/environment.h       |   8 ++
 quest/src/api/environment.cpp     |  11 +++
 quest/src/gpu/gpu_config.cpp      |  19 +++++
 quest/src/gpu/gpu_config.hpp      |   8 +-
 quest/src/gpu/gpu_kernels.cuh     |   6 +-
 quest/src/gpu/gpu_subroutines.cpp | 121 ++++++++++++++++++++----------
 6 files changed, 124 insertions(+), 49 deletions(-)

diff --git a/quest/include/environment.h b/quest/include/environment.h
index 04f24bfe2..a6724828b 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -83,6 +83,14 @@ int isQuESTEnvInit();
 QuESTEnv getQuESTEnv();
 
 
+/** @notyetdoced
+ * Control of the number of GPU threads per block.
+ * The environment API is the closest existing home for these functions. They only apply to GPU
+ * execution, because for OpenMP the user can export OMP_NUM_THREADS or call omp_set_num_threads().
+ */
+int getQuESTGpuThreadsPerBlock();
+void setQuESTGpuThreadsPerBlock(const int NEW_TPB);
+
 
 // end de-mangler
 #ifdef __cplusplus
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 541491899..1f36ee64c 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -509,5 +509,16 @@ void getEnvironmentString(char str[200]) {
 }
 
 
+int getQuESTGpuThreadsPerBlock() {
+    QuESTEnv env = getQuESTEnv();
+    return env.isGpuAccelerated? gpu_getNumThreadsPerBlock() : 0;
+}
+
+void setQuESTGpuThreadsPerBlock(const int NEW_TPB) {
+    // just rely on the internal function to throw an error if there's no GPU support compiled
+    gpu_setNumThreadsPerBlock(NEW_TPB);
+    return;
+}
+
 // end de-mangler
 }
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index c7db834b7..78ef1a414 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -41,6 +41,7 @@
     #include "quest/src/gpu/cuda_to_hip.hpp"
 #endif
 
+int numThreadsPerBlock = 128;
 
 
 /*
@@ -330,6 +331,24 @@ qindex gpu_getMaxNumConcurrentThreads() {
  * ENVIRONMENT MANAGEMENT
  */
 
+int gpu_getNumThreadsPerBlock() {
+#if COMPILE_CUDA
+    return numThreadsPerBlock;
+#else
+    error_gpuQueriedButGpuNotCompiled();
+    return -1;
+#endif
+}
+
+void gpu_setNumThreadsPerBlock(const int NEW_TPB) {
+#if COMPILE_CUDA
+    numThreadsPerBlock = NEW_TPB;
+#else
+    error_gpuQueriedButGpuNotCompiled();
+#endif
+    return;
+}
+
 
 std::array<char,17> getBoundGpuUuid() {
 #if COMPILE_CUDA
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index 1b3be6295..866475cc3 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -19,7 +19,6 @@
 #include "quest/include/channels.h"
 
 
-
 /*
  * CUDA ERROR HANDLING
  */
@@ -65,6 +64,10 @@ qindex gpu_getMaxNumConcurrentThreads();
  * ENVIRONMENT MANAGEMENT
  */
 
+int gpu_getNumThreadsPerBlock();
+
+void gpu_setNumThreadsPerBlock(const int NEW_TPB);
+
 void gpu_bindLocalGPUsToNodes();
 
 bool gpu_areAnyNodesBoundToSameGpu();
@@ -76,7 +79,6 @@ void gpu_initCuQuantum();
 void gpu_finalizeCuQuantum();
 
 
-
 /*
  * MEMORY MANAGEMENT
  */
@@ -122,4 +124,4 @@ size_t gpu_getCacheMemoryInBytes();
 
 
 
-#endif // GPU_CONFIG_HPP
\ No newline at end of file
+#endif // GPU_CONFIG_HPP
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 4f2a737e4..7459235d6 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -46,16 +46,12 @@
  * THREAD MANAGEMENT
  */
 
-
-const int NUM_THREADS_PER_BLOCK = 128;
-
-
 __forceinline__ __device__ qindex getThreadInd() {
     return blockIdx.x*blockDim.x + threadIdx.x;
 }
 
 
-__host__ qindex getNumBlocks(qindex numThreads) {
+__host__ qindex getNumBlocks(qindex numThreads, const int NUM_THREADS_PER_BLOCK) {
 
     /// @todo
     /// improve this with cudaOccupancyMaxPotentialBlockSize(),
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 5e18048f7..56b855c4e 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -66,7 +66,6 @@
 using std::vector;
 
 
-
 /*
  * GETTERS
  */
@@ -141,7 +140,8 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubits, vector<i
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(qubits.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex sendInd = getSubBufferSendInd(qureg);
 
     devints sortedQubits = util_getSorted(qubits);
@@ -169,7 +169,8 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex sendInd = getSubBufferSendInd(qureg);
 
     kernel_statevec_packPairSummedAmpsIntoBuffer <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
@@ -208,7 +209,8 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> c
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     devints sortedQubits = util_getSorted(ctrls, {targ2, targ1});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1});
@@ -232,7 +234,8 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> c
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex recvInd = getBufferRecvInd();
 
     devints sortedCtrls = util_getSorted(ctrls);
@@ -257,7 +260,8 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> c
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(1 + ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex recvInd = getBufferRecvInd();
 
     devints sortedQubits = util_getSorted(ctrls, {targ});
@@ -299,7 +303,8 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1);
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     devints sortedQubits = util_getSorted(ctrls, {targ});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});
@@ -326,7 +331,8 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, v
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex recvInd = getBufferRecvInd();
 
     devints sortedCtrls = util_getSorted(ctrls);
@@ -368,7 +374,8 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2);
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     devints sortedQubits = util_getSorted(ctrls, {targ1,targ2});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0});
@@ -463,7 +470,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
         /// global memory) and greatly sabotage performance on some GPUs.
 
         qindex numThreads = numBatches;
-        qindex numBlocks = getNumBlocks(numThreads);
+        const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+        qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
         kernel_statevec_anyCtrlFewTargDenseMatr
             <NumCtrls, NumTargs, ApplyConj, ApplyTransp> 
@@ -486,6 +494,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
         // where we assign one-block per multiprocessor because we are anyway memory-
         // bandwidth bound (so we don't expect many interweaved blocks per MP).
         qindex numThreads = gpu_getMaxNumConcurrentThreads();
+        const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
         
         // use strictly 2^# threads to maintain precondition of all kernels
         if (!isPowerOf2(numThreads))
@@ -497,7 +506,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 
         // evenly distribute the batches between threads, and the threads unevenly between blocks
         qindex numBatchesPerThread = numBatches / numThreads; // divides evenly
-        qindex numBlocks = getNumBlocks(numThreads);
+        qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
         // expand the cache if necessary
         qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK;
@@ -566,7 +575,8 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     devints deviceCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
@@ -634,7 +644,8 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     devints deviceCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
@@ -702,7 +713,8 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     devints deviceTargs = targs;
     devints deviceCtrls = util_getSorted(ctrls);
@@ -759,7 +771,8 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     kernel_densmatr_allTargDiagMatr_sub 
         <HasPower, ApplyLeft, ApplyRight, ConjRight> 
@@ -821,7 +834,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ct
     // faster than when giving threads many pair-amps to modify, due to memory movements
 
     qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     kernel_statevector_anyCtrlPauliTensorOrGadget_subA <NumCtrls, NumTargs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
         getPtr(deviceQubits), ctrls.size(), qubitStateMask, 
@@ -843,7 +857,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ct
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex recvInd = getBufferRecvInd();
 
     qcomp powI = util_getPowerOfI(y.size());
@@ -884,7 +899,8 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int> c
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     devints sortedCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
@@ -917,7 +933,8 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     // extract amp ptrs from qureg list
     vector<cu_qcomp*> ptrs;
@@ -957,7 +974,8 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     kernel_densmatr_mixQureg_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
         outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(inQureg.gpuAmps),
@@ -975,7 +993,8 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     kernel_densmatr_mixQureg_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
         outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(outQureg.gpuCommBuffer),
@@ -1007,7 +1026,8 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob)
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
     int braQubit = util_getBraQubit(ketQubit, qureg);
@@ -1033,7 +1053,8 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob)
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
@@ -1078,7 +1099,8 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi
 #if COMPILE_CUDA || COMPILE_CUQUANTUM 
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     auto term = util_getTwoQubitDephasingTerm(prob);
     int braQubitA = util_getBraQubit(ketQubitA, qureg);
@@ -1106,7 +1128,8 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDepolarisingFactors(prob);
@@ -1126,7 +1149,8 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex recvInd = getBufferRecvInd();
 
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
@@ -1154,7 +1178,8 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
@@ -1176,7 +1201,8 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 16;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
@@ -1201,7 +1227,8 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
@@ -1223,7 +1250,8 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex offset = getBufferRecvInd();
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
@@ -1246,7 +1274,8 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
@@ -1271,7 +1300,8 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex offset = getBufferRecvInd();
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
@@ -1300,7 +1330,8 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ);
@@ -1321,7 +1352,8 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex recvInd = getBufferRecvInd();
 
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
@@ -1349,7 +1381,8 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDampingFactors(prob);
@@ -1370,7 +1403,8 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     auto c2 = util_getOneQubitDampingFactors(prob).c2;
 
@@ -1389,7 +1423,8 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     auto braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto c1 = util_getOneQubitDampingFactors(prob).c1;
@@ -1409,7 +1444,8 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     qindex recvInd = getBufferRecvInd();
 
     kernel_densmatr_oneQubitDamping_subD <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
@@ -1437,7 +1473,8 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> ta
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     devints devTargs = targs;
     devints devPairTargs = pairTargs;
@@ -1557,7 +1594,8 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
 #if COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
 
     // allocate exponentially-big temporary memory (error if failed)
     devints devQubits = qubits;
@@ -1591,7 +1629,8 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
     // we decouple numColsPerNode and numThreads for clarity
     // (and in case parallelisation granularity ever changes);
     qindex numThreads = powerOf2(qureg.logNumColsPerNode);
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
     
     qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg);
     qindex numAmpsPerCol = powerOf2(qureg.numQubits);

From 680fdda0e53b9dea14e402a9cbf8cb4337ef181a Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <otbrown@users.noreply.github.com>
Date: Mon, 4 May 2026 14:06:08 +0100
Subject: [PATCH 03/58] updated var names to match QuEST style

---
 quest/src/gpu/gpu_config.cpp      |   4 +-
 quest/src/gpu/gpu_config.hpp      |   2 +-
 quest/src/gpu/gpu_kernels.cuh     |   4 +-
 quest/src/gpu/gpu_subroutines.cpp | 244 +++++++++++++++---------------
 4 files changed, 127 insertions(+), 127 deletions(-)

diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 78ef1a414..588779c4d 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -340,9 +340,9 @@ int gpu_getNumThreadsPerBlock() {
 #endif
 }
 
-void gpu_setNumThreadsPerBlock(const int NEW_TPB) {
+void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock) {
 #if COMPILE_CUDA
-    numThreadsPerBlock = NEW_TPB;
+    numThreadsPerBlock = newThreadsPerBlock;
 #else
     error_gpuQueriedButGpuNotCompiled();
 #endif
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index 866475cc3..0787e127a 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -66,7 +66,7 @@ qindex gpu_getMaxNumConcurrentThreads();
 
 int gpu_getNumThreadsPerBlock();
 
-void gpu_setNumThreadsPerBlock(const int NEW_TPB);
+void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock);
 
 void gpu_bindLocalGPUsToNodes();
 
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 7459235d6..540a409f5 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -51,14 +51,14 @@ __forceinline__ __device__ qindex getThreadInd() {
 }
 
 
-__host__ qindex getNumBlocks(qindex numThreads, const int NUM_THREADS_PER_BLOCK) {
+__host__ qindex getNumBlocks(qindex numThreads, const int numThreadsPerBlock) {
 
     /// @todo
     /// improve this with cudaOccupancyMaxPotentialBlockSize(),
     /// making it function specific
 
     // CUDA ceil
-    return ceil(numThreads / static_cast<qreal>(NUM_THREADS_PER_BLOCK));
+    return ceil(numThreads / static_cast<qreal>(numThreadsPerBlock));
 }
 
 
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 56b855c4e..a75c44ccb 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -140,14 +140,14 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubits, vector<i
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(qubits.size());
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex sendInd = getSubBufferSendInd(qureg);
 
     devints sortedQubits = util_getSorted(qubits);
     qindex qubitStateMask  = util_getBitMask(qubits, qubitStates);
 
-    kernel_statevec_packAmpsIntoBuffer <NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_packAmpsIntoBuffer <NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, 
         getPtr(sortedQubits), qubits.size(), qubitStateMask
     );
@@ -169,11 +169,11 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex sendInd = getSubBufferSendInd(qureg);
 
-    kernel_statevec_packPairSummedAmpsIntoBuffer <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_packPairSummedAmpsIntoBuffer <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, 
         qubit1, qubit2, qubit3, bit2
     );
@@ -209,13 +209,13 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> c
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size());
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = util_getSorted(ctrls, {targ2, targ1});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1});
 
-    kernel_statevec_anyCtrlSwap_subA <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subA <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2
     );
@@ -234,14 +234,14 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> c
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     devints sortedCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlSwap_subB <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subB <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, 
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask
     );
@@ -260,14 +260,14 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> c
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(1 + ctrls.size());
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     devints sortedQubits = util_getSorted(ctrls, {targ});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState});
 
-    kernel_statevec_anyCtrlSwap_subC <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subC <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask
     );
@@ -303,15 +303,15 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1);
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = util_getSorted(ctrls, {targ});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});
 
     auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr);
 
-    kernel_statevec_anyCtrlOneTargDenseMatr_subA <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDenseMatr_subA <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, 
         m00, m01, m10, m11
@@ -331,14 +331,14 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, v
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     devints sortedCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlOneTargDenseMatr_subB <NumCtrls> <<<numBlocks,NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDenseMatr_subB <NumCtrls> <<<numBlocks,numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, 
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, 
         toCuQcomp(fac0), toCuQcomp(fac1)
@@ -374,8 +374,8 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2);
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = util_getSorted(ctrls, {targ1,targ2});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0});
@@ -383,7 +383,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
     // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...)
     auto m = unpackMatrixToCuQcomps(matr);
 
-    kernel_statevec_anyCtrlTwoTargDenseMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlTwoTargDenseMatr_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2,
         m[0], m[1], m[2],  m[3],  m[4],  m[5],  m[6],  m[7],
@@ -460,7 +460,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
     if constexpr (NumTargs != -1) {
 
         // when NumTargs <= 5, each thread has a private array stored in the registers,
-        // enabling rapid IO. Given NUM_THREADS_PER_BLOCK = 128, the maximum size of 
+        // enabling rapid IO. Given numThreadsPerBlock = 128, the maximum size of 
         // this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared
         // memory capacity, but does NOT exceed maximum register capacity.
 
@@ -470,12 +470,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
         /// global memory) and greatly sabotage performance on some GPUs.
 
         qindex numThreads = numBatches;
-        const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-        qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+        const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+        qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
         kernel_statevec_anyCtrlFewTargDenseMatr
             <NumCtrls, NumTargs, ApplyConj, ApplyTransp> 
-            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+            <<<numBlocks, numThreadsPerBlock>>> (
                 ampsPtr, numThreads, 
                 qubitsPtr, nCtrls, qubitStateMask, 
                 targsPtr, matrPtr
@@ -494,7 +494,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
         // where we assign one-block per multiprocessor because we are anyway memory-
         // bandwidth bound (so we don't expect many interweaved blocks per MP).
         qindex numThreads = gpu_getMaxNumConcurrentThreads();
-        const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
+        const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
         
         // use strictly 2^# threads to maintain precondition of all kernels
         if (!isPowerOf2(numThreads))
@@ -506,15 +506,15 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 
         // evenly distribute the batches between threads, and the threads unevenly between blocks
         qindex numBatchesPerThread = numBatches / numThreads; // divides evenly
-        qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+        qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
         // expand the cache if necessary
-        qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK;
+        qindex numKernelInvocations = numBlocks * numThreadsPerBlock;
         qcomp* cache = gpu_getCacheOfSize(powerOf2(targs.size()), numKernelInvocations);
 
         kernel_statevec_anyCtrlManyTargDenseMatr 
             <NumCtrls, ApplyConj, ApplyTransp> 
-            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+            <<<numBlocks, numThreadsPerBlock>>> (
                 toCuQcomps(cache),
                 ampsPtr, numThreads, numBatchesPerThread, 
                 qubitsPtr, nCtrls, qubitStateMask, 
@@ -575,14 +575,14 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     auto elems = unpackMatrixToCuQcomps(matr);
 
-    kernel_statevec_anyCtrlOneTargDiagMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDiagMatr_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1]
     );
@@ -644,14 +644,14 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     auto elems = unpackMatrixToCuQcomps(matr);
 
-    kernel_statevec_anyCtrlTwoTargDiagMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlTwoTargDiagMatr_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2,
         elems[0], elems[1], elems[2], elems[3]
@@ -713,14 +713,14 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceTargs = targs;
     devints deviceCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlAnyTargDiagMatr_sub <NumCtrls, NumTargs, ApplyConj, HasPower> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlAnyTargDiagMatr_sub <NumCtrls, NumTargs, ApplyConj, HasPower> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), 
         toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent)
@@ -771,12 +771,12 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     kernel_densmatr_allTargDiagMatr_sub 
         <HasPower, ApplyLeft, ApplyRight, ConjRight> 
-        <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+        <<<numBlocks, numThreadsPerBlock>>> (
             toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
             toCuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toCuQcomp(exponent)
     );
@@ -834,9 +834,9 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ct
     // faster than when giving threads many pair-amps to modify, due to memory movements
 
     qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
-    kernel_statevector_anyCtrlPauliTensorOrGadget_subA <NumCtrls, NumTargs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
+    kernel_statevector_anyCtrlPauliTensorOrGadget_subA <NumCtrls, NumTargs> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
         getPtr(deviceQubits), ctrls.size(), qubitStateMask, 
         getPtr(deviceTargs), deviceTargs.size(),
@@ -857,8 +857,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ct
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     qcomp powI = util_getPowerOfI(y.size());
@@ -868,7 +868,7 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ct
     devints sortedCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevector_anyCtrlPauliTensorOrGadget_subB <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevector_anyCtrlPauliTensorOrGadget_subB <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, 
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask,
         maskXY, maskYZ, bufferMaskXY,
@@ -899,14 +899,14 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int> c
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     qindex targMask = util_getBitMask(targs);
 
-    kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, targMask,
         toCuQcomp(fac0), toCuQcomp(fac1)
@@ -933,8 +933,8 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     // extract amp ptrs from qureg list
     vector<cu_qcomp*> ptrs;
@@ -946,7 +946,7 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs
     devcuqcompptrs devQuregAmps = ptrs;
     devcomps devCoeffs = coeffs;
 
-    kernel_statevec_setQuregToWeightedSum_sub <NumQuregs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_setQuregToWeightedSum_sub <NumQuregs> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(outQureg.gpuAmps), numThreads,
         getPtr(devCoeffs), getPtr(devQuregAmps), inQuregs.size()
     );
@@ -974,10 +974,10 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    kernel_densmatr_mixQureg_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_mixQureg_subB <<<numBlocks, numThreadsPerBlock>>> (
         outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(inQureg.gpuAmps),
         numThreads, inQureg.numAmps
     );
@@ -993,10 +993,10 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    kernel_densmatr_mixQureg_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_mixQureg_subC <<<numBlocks, numThreadsPerBlock>>> (
         outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(outQureg.gpuCommBuffer),
         numThreads, outQureg.rank, powerOf2(outQureg.numQubits), outQureg.logNumAmpsPerNode        
     );
@@ -1026,13 +1026,13 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob)
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
     int braQubit = util_getBraQubit(ketQubit, qureg);
 
-    kernel_densmatr_oneQubitDephasing_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDephasing_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac
     );
 
@@ -1053,13 +1053,13 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob)
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
 
-    kernel_densmatr_oneQubitDephasing_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDephasing_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, fac
     );
 
@@ -1099,14 +1099,14 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi
 #if COMPILE_CUDA || COMPILE_CUQUANTUM 
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto term = util_getTwoQubitDephasingTerm(prob);
     int braQubitA = util_getBraQubit(ketQubitA, qureg);
     int braQubitB = util_getBraQubit(ketQubitB, qureg);
 
-    kernel_densmatr_twoQubitDephasing_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDephasing_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols
         ketQubitA, ketQubitB, braQubitA, braQubitB, term
     );
@@ -1128,13 +1128,13 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_oneQubitDepolarising_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDepolarising_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3
     );
 
@@ -1149,14 +1149,14 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_oneQubitDepolarising_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDepolarising_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, 
         ketQubit, braBit, factors.c1, factors.c2, factors.c3
     );
@@ -1178,14 +1178,14 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
     auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3;
 
-    kernel_densmatr_twoQubitDepolarising_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braQb1, braQb2, c3
     );
@@ -1201,8 +1201,8 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 16;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
@@ -1211,7 +1211,7 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
     // each kernel invocation sums all 4 amps together, so adjusts c1
     qreal altc1 = factors.c1 - factors.c2;
 
-    kernel_densmatr_twoQubitDepolarising_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2
     );
@@ -1227,14 +1227,14 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3;
 
-    kernel_densmatr_twoQubitDepolarising_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subC <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braQb1, braBit2, c3
     );
@@ -1250,15 +1250,15 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex offset = getBufferRecvInd();
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto factors = util_getTwoQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_twoQubitDepolarising_subD <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subD <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads,
         ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2
     );
@@ -1274,8 +1274,8 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
@@ -1284,7 +1284,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
     qreal fac0 = 1 + factors.c3;
     qreal fac1 = factors.c1 - fac0;
 
-    kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braBit1, braBit2, fac0, fac1
     );
@@ -1300,15 +1300,15 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex offset = getBufferRecvInd();
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2;
 
-    kernel_densmatr_twoQubitDepolarising_subF <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subF <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads,
         ketQb1, ketQb2, braBit1, braBit2, c2
     );
@@ -1330,13 +1330,13 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ);
 
-    kernel_densmatr_oneQubitPauliChannel_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitPauliChannel_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, 
         factors.c1, factors.c2, factors.c3, factors.c4
     );
@@ -1352,14 +1352,14 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ);
 
-    kernel_densmatr_oneQubitPauliChannel_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitPauliChannel_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, 
         ketQubit, braBit, factors.c1, factors.c2, factors.c3, factors.c4
     );
@@ -1381,13 +1381,13 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDampingFactors(prob);
 
-    kernel_densmatr_oneQubitDamping_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
         ketQubit, braQubit, prob, factors.c1, factors.c2
     );
@@ -1403,12 +1403,12 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto c2 = util_getOneQubitDampingFactors(prob).c2;
 
-    kernel_densmatr_oneQubitDamping_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, qubit, c2
     );
 
@@ -1423,13 +1423,13 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto c1 = util_getOneQubitDampingFactors(prob).c1;
 
-    kernel_densmatr_oneQubitDamping_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subC <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, c1
     );
 
@@ -1444,11 +1444,11 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
-    kernel_densmatr_oneQubitDamping_subD <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subD <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, 
         qubit, prob
     );
@@ -1473,14 +1473,14 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> ta
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints devTargs = targs;
     devints devPairTargs = pairTargs;
     devints devAllTargs = util_getSorted(targs, pairTargs);
 
-    kernel_densmatr_partialTrace_sub <NumTargs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_partialTrace_sub <NumTargs> <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(inQureg.gpuAmps), toCuQcomps(outQureg.gpuAmps), numThreads,
         getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size()
     );
@@ -1594,14 +1594,14 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
 #if COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     // allocate exponentially-big temporary memory (error if failed)
     devints devQubits = qubits;
     devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws
 
-    kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
         getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads, 
         qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size()
     );
@@ -1629,8 +1629,8 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
     // we decouple numColsPerNode and numThreads for clarity
     // (and in case parallelisation granularity ever changes);
     qindex numThreads = powerOf2(qureg.logNumColsPerNode);
-    const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock();
-    qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     
     qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg);
     qindex numAmpsPerCol = powerOf2(qureg.numQubits);
@@ -1639,7 +1639,7 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
     devints devQubits = qubits;
     devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws
 
-    kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
         getPtr(devProbs), toCuQcomps(qureg.gpuAmps), 
         numThreads, firstDiagInd, numAmpsPerCol,
         qureg.rank, qureg.logNumAmpsPerNode, 

From 99c3b3c9772d108d3993b12062907507edcccc66 Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com>
Date: Fri, 22 May 2026 18:54:34 +0100
Subject: [PATCH 04/58] environment.h/cpp: added num to GPU thread per block
 functions, and removed superfluous getQuESTEnv

---
 quest/include/environment.h   |  4 ++--
 quest/src/api/environment.cpp | 27 ++++++++++++++-------------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/quest/include/environment.h b/quest/include/environment.h
index a6724828b..f5630813c 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -88,8 +88,8 @@ QuESTEnv getQuESTEnv();
  * This is somehow probably the best pre-existing place for this. It only really applies to GPU, because for
  * OpenMP the user can just export OMP_NUM_THREADS or call omp_set_num_threads.
  */
-int getQuESTGpuThreadsPerBlock();
-void setQuESTGpuThreadsPerBlock(const int NEW_TPB);
+int getQuESTNumGpuThreadsPerBlock();
+void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock);
 
 
 // end de-mangler
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 1f36ee64c..3d10a6896 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -492,31 +492,32 @@ void reportQuESTEnv() {
 void getEnvironmentString(char str[200]) {
     validate_envIsInit(__func__);
 
-    QuESTEnv env = getQuESTEnv();
-
     int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
-    int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
-    int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
+    int cuQuantum = globalEnvPtr->isGpuAccelerated && gpu_isCuQuantumCompiled();
+    int gpuDirect = globalEnvPtr->isGpuAccelerated && gpu_isDirectGpuCommPossible();
 
     snprintf(str, 200, "CUDA=%d OpenMP=%d MPI=%d threads=%d ranks=%d cuQuantum=%d gpuDirect=%d",
-        env.isGpuAccelerated,
-        env.isMultithreaded,
-        env.isDistributed,
+        globalEnvPtr->isGpuAccelerated,
+        globalEnvPtr->isMultithreaded,
+        globalEnvPtr->isDistributed,
         numThreads,
-        env.numNodes,
+        globalEnvPtr->numNodes,
         cuQuantum,
         gpuDirect);
 }
 
 
-int getQuESTGpuThreadsPerBlock() {
-    QuESTEnv env = getQuESTEnv();
-    return env.isGpuAccelerated? gpu_getNumThreadsPerBlock() : 0;
+int getQuESTNumGpuThreadsPerBlock() {
+    validate_envIsInit(__func__);
+    
+    return globalEnvPtr->isGpuAccelerated? gpu_getNumThreadsPerBlock() : 0;
 }
 
-void setQuESTGpuThreadsPerBlock(const int NEW_TPB) {
+void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock) {
+    validate_envIsInit(__func__);
+
     // just rely on the internal function to throw an error if there's no GPU support compiled
-    gpu_setNumThreadsPerBlock(NEW_TPB);
+    gpu_setNumThreadsPerBlock(newThreadsPerBlock);
     return;
 }
 

From 40a16c17062fda71525d2ec8efd7f101db18e68c Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com>
Date: Mon, 25 May 2026 15:45:55 +0100
Subject: [PATCH 05/58] gpu_config.cpp: numThreadsPerBlock ->
 global_numThreadsPerBlock

---
 quest/src/gpu/gpu_config.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 588779c4d..da51b0c15 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -41,7 +41,6 @@
     #include "quest/src/gpu/cuda_to_hip.hpp"
 #endif
 
-int numThreadsPerBlock = 128;
 
 
 /*
@@ -331,18 +330,20 @@ qindex gpu_getMaxNumConcurrentThreads() {
  * ENVIRONMENT MANAGEMENT
  */
 
+int global_numThreadsPerBlock = 128;
+
 int gpu_getNumThreadsPerBlock() {
 #if COMPILE_CUDA
-    return numThreadsPerBlock;
+    return global_numThreadsPerBlock;
 #else
     error_gpuQueriedButGpuNotCompiled();
     return -1;
 #endif
 }
 
-void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock) {
+void gpu_setNumThreadsPerBlock(const int newNumThreadsPerBlock) {
 #if COMPILE_CUDA
-    numThreadsPerBlock = newThreadsPerBlock;
+    global_numThreadsPerBlock = newNumThreadsPerBlock;
 #else
     error_gpuQueriedButGpuNotCompiled();
 #endif

From 718db7c0645e0a3da98291a7a161ccfb96208275 Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com>
Date: Mon, 25 May 2026 15:53:27 +0100
Subject: [PATCH 06/58] GPU threads per block can now be set and queried even
 when GPU is not compiled

---
 quest/src/api/environment.cpp |  2 +-
 quest/src/gpu/gpu_config.cpp  | 11 ++---------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 3d10a6896..9e3d4fe7a 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -510,7 +510,7 @@ void getEnvironmentString(char str[200]) {
 int getQuESTNumGpuThreadsPerBlock() {
     validate_envIsInit(__func__);
     
-    return globalEnvPtr->isGpuAccelerated? gpu_getNumThreadsPerBlock() : 0;
+    return gpu_getNumThreadsPerBlock();
 }
 
 void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock) {
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index da51b0c15..35e18989e 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -333,20 +333,13 @@ qindex gpu_getMaxNumConcurrentThreads() {
 int global_numThreadsPerBlock = 128;
 
 int gpu_getNumThreadsPerBlock() {
-#if COMPILE_CUDA
+    // permitted even when GPU backend not compiled
     return global_numThreadsPerBlock;
-#else
-    error_gpuQueriedButGpuNotCompiled();
-    return -1;
-#endif
 }
 
 void gpu_setNumThreadsPerBlock(const int newNumThreadsPerBlock) {
-#if COMPILE_CUDA
+    // permitted even when GPU backend not compiled
     global_numThreadsPerBlock = newNumThreadsPerBlock;
-#else
-    error_gpuQueriedButGpuNotCompiled();
-#endif
     return;
 }
 

From e75c898e02f99f72809694e6478cef14cbcbbf33 Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com>
Date: Mon, 25 May 2026 16:06:17 +0100
Subject: [PATCH 07/58] environment.cpp: added gpu numThreadsPerBlock to
 reporting

---
 quest/src/api/environment.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 9e3d4fe7a..e83704dbb 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -252,6 +252,7 @@ void printGpuInfo() {
         {"gpuMemory",     isGpu?  printer_getMemoryWithUnitStr(gpu_getTotalMemoryInBytes())            + pg : na},
         {"gpuMemoryFree", isGpu?  printer_getMemoryWithUnitStr(gpu_getCurrentAvailableMemoryInBytes()) + pg : na},
         {"gpuCache",      isGpu?  printer_getMemoryWithUnitStr(gpu_getCacheMemoryInBytes())            + pg : na},
+        {"numThreadsPerBlock", isGpu? printer_toStr(gpu_getNumThreadsPerBlock()) : na},
     });
 }
 

From 2fcab1a71e1485fb2afc2804fed24b782dcc4142 Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com>
Date: Mon, 25 May 2026 17:20:46 +0100
Subject: [PATCH 08/58] Added compile time configurable default value for gpu
 threads per block

---
 CMakeLists.txt               | 16 +++++++++++++++-
 quest/include/config.h.in    |  4 +++-
 quest/src/gpu/gpu_config.cpp |  2 +-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f91c05f83..c95480b1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,6 +168,20 @@ option(
 )
 message(STATUS "AMD GPU acceleration is turned ${ENABLE_HIP}. Set ENABLE_HIP to modify.")
 
+# GPU Performance Tuning
+## We do not print this value when configuring CMake as it is for advanced users only.
+
+set(QUEST_GPU_NUM_THREADS_PER_BLOCK 128
+  CACHE
+  STRING
+  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. Must be a multiple of 32."
+)
+mark_as_advanced(QUEST_GPU_NUM_THREADS_PER_BLOCK)
+
+math(EXPR quest_tpb_remainder "${QUEST_GPU_NUM_THREADS_PER_BLOCK} % 32")
+if ((NOT (quest_tpb_remainder EQUAL 0)) OR (QUEST_GPU_NUM_THREADS_PER_BLOCK LESS 32))
+    message(FATAL_ERROR "QUEST_GPU_NUM_THREADS_PER_BLOCK must be a positive multiple of 32 (i.e. at least 32). QUEST_GPU_NUM_THREADS_PER_BLOCK=${QUEST_GPU_NUM_THREADS_PER_BLOCK}.")
+endif()
 
 # Deprecated API
 option(
@@ -478,7 +492,7 @@ set(COMPILE_OPENMP ${ENABLE_MULTITHREADING})
 set(COMPILE_MPI ${ENABLE_DISTRIBUTION})
 set(COMPILE_CUQUANTUM ${ENABLE_CUQUANTUM})
 set(INCLUDE_DEPRECATED_FUNCTIONS ${ENABLE_DEPRECATED_API})
-
+set(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK ${QUEST_GPU_NUM_THREADS_PER_BLOCK})
 
 # (for the love of God cmake, create a concise syntax for this)
 if (ENABLE_CUDA OR ENABLE_HIP)
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
index 2cb12fa90..f2a8852b4 100644
--- a/quest/include/config.h.in
+++ b/quest/include/config.h.in
@@ -82,6 +82,8 @@
 #cmakedefine01 COMPILE_CUDA
 #cmakedefine01 COMPILE_CUQUANTUM
 
+// default parameters which may have been tuned for performance when building the library
+#cmakedefine QUEST_DEFAULT_NUM_THREADS_PER_BLOCK @QUEST_DEFAULT_NUM_THREADS_PER_BLOCK@
 
 // not actually a CMake option (user cannot disable) but nonetheless crucial
 #cmakedefine01 NUMA_AWARE
@@ -166,4 +168,4 @@
 
 
 
-#endif // CONFIG_H
\ No newline at end of file
+#endif // CONFIG_H
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 35e18989e..f8d88dd8b 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -330,7 +330,7 @@ qindex gpu_getMaxNumConcurrentThreads() {
  * ENVIRONMENT MANAGEMENT
  */
 
-int global_numThreadsPerBlock = 128;
+int global_numThreadsPerBlock = QUEST_DEFAULT_NUM_THREADS_PER_BLOCK;
 
 int gpu_getNumThreadsPerBlock() {
     // permitted even when GPU backend not compiled

From 25cbe77d6bdcc35578ba8c484746c144b209d9ff Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com>
Date: Mon, 25 May 2026 17:43:20 +0100
Subject: [PATCH 09/58] errors.cpp/hpp: added error_gpuBadNumThreadsPerBlock

---
 quest/src/core/errors.cpp | 5 +++++
 quest/src/core/errors.hpp | 4 +++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 5e34ecdd4..74c005d02 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -640,6 +640,11 @@ void error_gpuDenseMatrixConjugatedAndTransposed() {
     raiseInternalError("The GPU + cuQuantum implementation of anyCtrlAnyTargDenseMatr() assumes that at most one of template arguments ApplyConj and ApplyTransp is true, though this was violated.");
 }
 
+void error_gpuBadNumThreadsPerBlock() {
+
+    raiseInternalError("The number of threads per block must be a multiple of 32 on NVIDIA GPUs or a multiple of 64 on AMD GPUs.");
+}
+
 void assert_quregIsGpuAccelerated(Qureg qureg) {
 
     if (!qureg.isGpuAccelerated)
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index dab8faf10..0d6a328e7 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -247,6 +247,8 @@ void error_gpuDeadCopyMatrixFunctionCalled();
 
 void error_gpuDenseMatrixConjugatedAndTransposed();
 
+void error_gpuBadNumThreadsPerBlock();
+
 void assert_gpuIsAccessible();
 
 void assert_gpuHasBeenBound(bool isBound);
@@ -405,4 +407,4 @@ void error_unexpectedNumLindbladSuperpropTerms();
 
 
 
-#endif // ERRORS_HPP
\ No newline at end of file
+#endif // ERRORS_HPP

From fcbd4f435f9b19c8475b8f4d892e10a0d115ceae Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com>
Date: Mon, 25 May 2026 17:48:49 +0100
Subject: [PATCH 10/58] gpu_config.cpp/hpp: added isHipCompiled and validation
 for setNumThreadsPerBlock

---
 quest/src/gpu/gpu_config.cpp | 15 +++++++++++++++
 quest/src/gpu/gpu_config.hpp |  2 ++
 2 files changed, 17 insertions(+)

diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index f8d88dd8b..3bfa5577c 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -174,6 +174,11 @@ bool gpu_isCuQuantumCompiled() {
 }
 
 
+bool gpu_isHipCompiled() {
+    return (bool) (COMPILE_CUDA && COMPILE_HIP);
+}
+
+
 int gpu_getNumberOfLocalGpus() {
 #if COMPILE_CUDA
 
@@ -338,6 +343,16 @@ int gpu_getNumThreadsPerBlock() {
 }
 
 void gpu_setNumThreadsPerBlock(const int newNumThreadsPerBlock) {
+    if (gpu_isHipCompiled()) {
+        // must be a POSITIVE multiple of 64; a bare % test would also accept 0 and negatives
+        if (newNumThreadsPerBlock <= 0 || newNumThreadsPerBlock % 64)
+            error_gpuBadNumThreadsPerBlock();
+    } else {
+        // must be a POSITIVE multiple of 32; a bare % test would also accept 0 and negatives
+        if (newNumThreadsPerBlock <= 0 || newNumThreadsPerBlock % 32)
+            error_gpuBadNumThreadsPerBlock();
+    }
+
     // permitted even when GPU backend not compiled
     global_numThreadsPerBlock = newNumThreadsPerBlock;
     return;
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index 0787e127a..fe1bdb04f 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -42,6 +42,8 @@ bool gpu_isGpuCompiled();
 
 bool gpu_isCuQuantumCompiled();
 
+bool gpu_isHipCompiled();
+
 bool gpu_isGpuAvailable();
 
 bool gpu_isDirectGpuCommPossible();

From 8c2a65b8b7814155ce18265581f954678be5c7b5 Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com>
Date: Mon, 25 May 2026 17:57:45 +0100
Subject: [PATCH 11/58] environment.cpp: added isHipCompiled to reporting and
 reordered to group GPU stuff

---
 quest/src/api/environment.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index e83704dbb..71ea2c267 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -187,9 +187,10 @@ void printCompilationInfo() {
 
     print_table(
         "compilation", {
+        {"isOmpCompiled",       cpu_isOpenmpCompiled()},
         {"isMpiCompiled",       comm_isMpiCompiled()},
         {"isGpuCompiled",       gpu_isGpuCompiled()},
-        {"isOmpCompiled",       cpu_isOpenmpCompiled()},
+        {"isHipCompiled",       gpu_isHipCompiled()},
         {"isCuQuantumCompiled", gpu_isCuQuantumCompiled()},
     });
 }
@@ -199,9 +200,9 @@ void printDeploymentInfo() {
 
     print_table(
         "deployment", {
+        {"isOmpEnabled",        globalEnvPtr->isMultithreaded},
         {"isMpiEnabled",        globalEnvPtr->isDistributed},
         {"isGpuEnabled",        globalEnvPtr->isGpuAccelerated},
-        {"isOmpEnabled",        globalEnvPtr->isMultithreaded},
         {"isCuQuantumEnabled",  globalEnvPtr->isCuQuantumEnabled},
         {"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
     });

From 2edb976bfde547a7d30f32929fc452c7f7b09e50 Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <otbrown@users.noreply.github.com>
Date: Tue, 26 May 2026 15:55:16 +0100
Subject: [PATCH 12/58] Docs: added new QUEST_GPU_NUM_THREADS_PER_BLOCK CMake
 var.

---
 docs/cmake.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/cmake.md b/docs/cmake.md
index d3c23ee4c..6fcce9845 100644
--- a/docs/cmake.md
+++ b/docs/cmake.md
@@ -46,7 +46,7 @@ make
 | `DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
 | `USER_SOURCE` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `OUTPUT_EXE` *must* also be defined. |
 | `OUTPUT_EXE` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE`. `USER_SOURCE` *must* also be defined. |
-
+| `QUEST_GPU_NUM_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32. For AMD GPUs this *should* be a multiple of 64. |
 
 
 

From d610e8ceacb91623007be18b374576be45ef75c6 Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <otbrown@users.noreply.github.com>
Date: Tue, 26 May 2026 16:01:23 +0100
Subject: [PATCH 13/58] environment.cpp: added comment to note that
 newThreadsPerBlock is validated in the internal gpu function to stop me
 wondering where that is in future

---
 quest/src/api/environment.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 71ea2c267..cbdf71c6c 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -519,6 +519,7 @@ void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock) {
     validate_envIsInit(__func__);
 
     // just rely on the internal function to throw an error if there's no GPU support compiled
+    // or if newThreadsPerBlock is not a multiple of 32 (NVIDIA) or 64 (AMD)
     gpu_setNumThreadsPerBlock(newThreadsPerBlock);
     return;
 }

From 6e975bdadf9f1946bf7453abedf92a44b0ed2745 Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <otbrown@users.noreply.github.com>
Date: Tue, 26 May 2026 17:43:06 +0100
Subject: [PATCH 14/58] tests: added basic tests for
 (set/get)QuESTNumGpuThreadsPerBlock

---
 tests/unit/environment.cpp | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tests/unit/environment.cpp b/tests/unit/environment.cpp
index 6d4efb80d..c17049561 100644
--- a/tests/unit/environment.cpp
+++ b/tests/unit/environment.cpp
@@ -166,6 +166,40 @@ TEST_CASE( "getQuESTEnv", TEST_CATEGORY ) {
 }
 
 
+TEST_CASE( "QuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+        // Check that it initially matches the compile time value
+        // stored in config.h
+        REQUIRE(getQuESTNumGpuThreadsPerBlock() == QUEST_DEFAULT_NUM_THREADS_PER_BLOCK);
+
+        // try a set/get iteration
+        const int test_num_tpb = 64;
+        REQUIRE_NOTHROW(setQuESTNumGpuThreadsPerBlock(test_num_tpb));
+        REQUIRE(getQuESTNumGpuThreadsPerBlock() == test_num_tpb);
+
+        // set it back to the original and confirm that also worked
+        REQUIRE_NOTHROW(setQuESTNumGpuThreadsPerBlock(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK));
+        REQUIRE(getQuESTNumGpuThreadsPerBlock() == QUEST_DEFAULT_NUM_THREADS_PER_BLOCK);
+
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        // The way the error-handling currently works, Catch2 can't catch these (ironically)
+        // but leaving them in case we ever update the way errors are done.
+        
+        SECTION( "Less than 32" ) {
+            //REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(31) , ContainsSubstring("number of threads per block") );
+        }
+
+        SECTION("Not a multiple of 32 or 64.") {
+            //REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(94) , ContainsSubstring("number of threads per block") );
+        }
+
+    }
+}
+
 /** @} (end defgroup) */
 
 

From 97dbf085f1812db1a01f8354a4217bc5af0918ee Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <otbrown@users.noreply.github.com>
Date: Tue, 26 May 2026 17:51:01 +0100
Subject: [PATCH 15/58] de-const

---
 quest/src/gpu/gpu_kernels.cuh     |  2 +-
 quest/src/gpu/gpu_subroutines.cpp | 80 +++++++++++++++----------------
 2 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 74ee13f66..dd34f60b0 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -47,7 +47,7 @@ __forceinline__ __device__ qindex getThreadInd() {
 }
 
 
-__host__ qindex getNumBlocks(qindex numThreads, const int numThreadsPerBlock) {
+__host__ qindex getNumBlocks(qindex numThreads, int numThreadsPerBlock) {
 
     /// @todo
     /// improve this with cudaOccupancyMaxPotentialBlockSize(),
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 0da26554c..8fe0cb7cf 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -140,7 +140,7 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, SmallList qubits, SmallList
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(qubits.size());
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex sendInd = getSubBufferSendInd(qureg);
 
@@ -169,7 +169,7 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex sendInd = getSubBufferSendInd(qureg);
 
@@ -209,7 +209,7 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, SmallList ctrls, SmallList ctrlS
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size());
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ2, targ1}));
@@ -234,7 +234,7 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, SmallList ctrls, SmallList ctrlS
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
@@ -260,7 +260,7 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, SmallList ctrls, SmallList ctrlS
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(1 + ctrls.size());
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
@@ -304,7 +304,7 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, SmallList ctrls, Sma
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1);
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ}));
@@ -332,7 +332,7 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, SmallList ctrls, Sma
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
@@ -376,7 +376,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, SmallList ctrls, Smal
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2);
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ1,targ2}));
@@ -472,7 +472,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, SmallList ctrls, Smal
         /// global memory) and greatly sabotage performance on some GPUs.
 
         qindex numThreads = numBatches;
-        const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+        int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
         qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
         kernel_statevec_anyCtrlFewTargDenseMatr
@@ -496,7 +496,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, SmallList ctrls, Smal
         // where we assign one-block per multiprocessor because we are anyway memory-
         // bandwidth bound (so we don't expect many interweaved blocks per MP).
         qindex numThreads = gpu_getMaxNumConcurrentThreads();
-        const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+        int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
         
         // use strictly 2^# threads to maintain precondition of all kernels
         if (!isPowerOf2(numThreads))
@@ -578,7 +578,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, SmallList ctrls, Small
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceCtrls = getDevInts(util_getSorted(ctrls));
@@ -649,7 +649,7 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, SmallList ctrls, Small
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceCtrls = getDevInts(util_getSorted(ctrls));
@@ -718,7 +718,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, SmallList ctrls, Small
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceTargs = getDevInts(targs);
@@ -776,7 +776,7 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     kernel_densmatr_allTargDiagMatr_sub 
@@ -839,7 +839,7 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, SmallList ctrl
     // faster than when giving threads many pair-amps to modify, due to memory movements
 
     qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     kernel_statevector_anyCtrlPauliTensorOrGadget_subA <NumCtrls, NumTargs> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
@@ -862,7 +862,7 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, SmallList ctrl
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
@@ -904,7 +904,7 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, SmallList ctr
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedCtrls = getDevInts(util_getSorted(ctrls));
@@ -938,7 +938,7 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     // extract amp ptrs from qureg list
@@ -979,7 +979,7 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     kernel_densmatr_mixQureg_subB <<<numBlocks, numThreadsPerBlock>>> (
@@ -998,7 +998,7 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     kernel_densmatr_mixQureg_subC <<<numBlocks, numThreadsPerBlock>>> (
@@ -1031,7 +1031,7 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob)
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
@@ -1058,7 +1058,7 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob)
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
@@ -1104,7 +1104,7 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi
 #if COMPILE_CUDA || COMPILE_CUQUANTUM 
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto term = util_getTwoQubitDephasingTerm(prob);
@@ -1133,7 +1133,7 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
@@ -1154,7 +1154,7 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
@@ -1183,7 +1183,7 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
@@ -1206,7 +1206,7 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 16;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
@@ -1232,7 +1232,7 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
@@ -1255,7 +1255,7 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex offset = getBufferRecvInd();
 
@@ -1279,7 +1279,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
@@ -1305,7 +1305,7 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex offset = getBufferRecvInd();
 
@@ -1335,7 +1335,7 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
@@ -1357,7 +1357,7 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
@@ -1386,7 +1386,7 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
@@ -1408,7 +1408,7 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto c2 = util_getOneQubitDampingFactors(prob).c2;
@@ -1428,7 +1428,7 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
@@ -1449,7 +1449,7 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
@@ -1478,7 +1478,7 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, SmallList targ
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints devTargs = getDevInts(targs);
@@ -1599,7 +1599,7 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
 #if COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode;
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     // allocate exponentially-big temporary memory (error if failed)
@@ -1634,7 +1634,7 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
     // we decouple numColsPerNode and numThreads for clarity
     // (and in case parallelisation granularity ever changes);
     qindex numThreads = powerOf2(qureg.logNumColsPerNode);
-    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
     qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     
     qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg);

From 460cbb5a1493a623a753e58b810701dadeae88ed Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <otbrown@users.noreply.github.com>
Date: Wed, 27 May 2026 15:03:22 +0100
Subject: [PATCH 16/58] gpu_config.cpp: corrected gpu_isHipCompiled for new
 preprocessor vars

---
 quest/src/gpu/gpu_config.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 2bf14623e..ba5e1b1da 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -175,7 +175,7 @@ bool gpu_isCuQuantumCompiled() {
 
 
 bool gpu_isHipCompiled() {
-    return (bool) (COMPILE_CUDA && COMPILE_HIP);
+    return (bool) (QUEST_COMPILE_CUDA && QUEST_COMPILE_HIP);
 }
 
 

From 16179f1f4807a53c6aa3f816fc246165871b03b5 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 00:44:45 -0400
Subject: [PATCH 17/58] Make env.isX bool to match env.userOwnsMpi

---
 quest/include/environment.h   | 12 ++++++------
 quest/src/api/environment.cpp |  7 ++++---
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/quest/include/environment.h b/quest/include/environment.h
index 608829912..440305b75 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -35,20 +35,20 @@ extern "C" {
 typedef struct {
 
     // deployment modes which can be runtime disabled
-    int isMultithreaded;
-    int isGpuAccelerated;
-    int isDistributed;
-    bool userOwnsMpi;
+    bool isMultithreaded;
+    bool isGpuAccelerated;
+    bool isDistributed;
 
     // deployment modes which cannot be directly changed after compilation
-    int isCuQuantumEnabled;
+    bool isCuQuantumEnabled;
 
     // deployment configurations which can be changed via environment variables
-    int isGpuSharingEnabled;
+    bool isGpuSharingEnabled;
 
     // distributed configuration
     int rank;
     int numNodes;
+    bool userOwnsMpi;
 
 } QuESTEnv;
 
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 1cc2f6862..82540fdcb 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -86,7 +86,8 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     // by mpirun believe they are each the main rank. This seems unavoidable.
     validate_newEnvDeploymentMode(useDistrib, useGpuAccel, useMultithread, caller);
 
-    // overwrite deployments left as modeflag::USE_AUTO
+    // overwrite deployments (left as modeflag::USE_AUTO=-1) with 0,1 (a bool),
+    // which crucially, resolves useDistrib, permitting its consultation below
     autodep_chooseQuESTEnvDeployment(useDistrib, useGpuAccel, useMultithread);
 
     // optionally initialise MPI; necessary before completing validation,
@@ -140,17 +141,17 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     if (globalEnvPtr == nullptr)
         error_allocOfQuESTEnvFailed();
 
-    // bind deployment info to global instance
+    // bind deployment info to global instance (autocasting int to bool)
     globalEnvPtr->isMultithreaded     = useMultithread;
     globalEnvPtr->isGpuAccelerated    = useGpuAccel;
     globalEnvPtr->isDistributed       = useDistrib;
-    globalEnvPtr->userOwnsMpi         = userOwnsMpi;
     globalEnvPtr->isCuQuantumEnabled  = useCuQuantum;
     globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;
 
     // bind distributed info
     globalEnvPtr->rank     = (useDistrib)? comm_getRank()     : 0;
     globalEnvPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
+    globalEnvPtr->userOwnsMpi = userOwnsMpi;
 }
 
 void updateQuESTEnvDistInfo() {

From 1ddfb6bce559a3953a13e5d2e27b3d0770ea9e84 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 00:45:46 -0400
Subject: [PATCH 18/58] Add MPI status validation

---
 quest/src/api/environment.cpp |  8 +++++-
 quest/src/core/validation.cpp | 47 +++++++++++++++++++++++++++++++++++
 quest/src/core/validation.hpp |  2 ++
 3 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 82540fdcb..9f77421d8 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -74,9 +74,12 @@ static bool hasEnvBeenFinalized = false;
 void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread, const char* caller) {
 
     // ensure that we are never re-initialising QuEST (even after finalize) because
-    // this leads to undefined behaviour in distributed mode, as per the MPI
+    // this leads to undefined behaviour in distributed mode, as per the MPI std,
+    // regardless of whether the user owns MPI
     validate_envNeverInit(globalEnvPtr != nullptr, hasEnvBeenFinalized, caller);
 
+    // load env-vars before validating deployment mode, because some env vars can
+    // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU)
     envvars_validateAndLoadEnvVars(caller);
     validateconfig_setEpsilonToDefault();
 
@@ -90,6 +93,9 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     // which crucially, resolves useDistrib, permitting its consultation below
     autodep_chooseQuESTEnvDeployment(useDistrib, useGpuAccel, useMultithread);
 
+    // ensure that current state of MPI is valid
+    validate_mpiInitStatus(useDistrib, userOwnsMpi, caller);
+
     // optionally initialise MPI; necessary before completing validation,
     // and before any GPU initialisation and validation, since we will
     // perform that specifically upon the MPI-process-bound GPU(s). Further,
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 959acb61e..871d94199 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -107,6 +107,15 @@ namespace report {
     string CUQUANTUM_DEPLOYED_ON_GPU_WITHOUT_MEM_POOLS =
         "Cannot use cuQuantum since your GPU does not support memory pools. Recompile with cuQuantum disabled to fall-back to using Thrust and custom kernels.";
 
+    string USER_OWNED_MPI_WAS_NOT_INIT =
+        "User owns MPI but did not prior initialise MPI before initialising QuEST.";
+
+    string QUEST_OWNED_MPI_WAS_PRE_INIT =
+        "MPI was already initialised prior to QuESTEnv initialisation, but the user did not declare MPI ownership.";
+
+    string QUEST_IS_NON_DISTRIBUTED_BUT_MPI_WAS_INIT =
+        "QuESTEnv was initialised to be non-distributed but MPI was externally initialised - this is presently unsupported due to a (very minor) technical limitation. If you need this facility, please raise a Github issue!";
+
     
     /*
      * EXISTING QUESTENV
@@ -1482,6 +1491,44 @@ void validate_gpuIsCuQuantumCompatible(const char* caller) {
     assertAllNodesAgreeThat(hasMemPools, report::CUQUANTUM_DEPLOYED_ON_GPU_WITHOUT_MEM_POOLS, caller);
 }
 
+void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    // Validation prior to this function confirms init(Custom*)QuESTEnv is only ever called
+    // once, but we must additionally confirm the user has interacted with MPI legally
+
+    bool isMpiInit = comm_isInit();
+
+    // (A) If the user does not declare ownership of MPI, they are forbidden to initialise it
+    if (!userOwnsMpi)
+        assertThat(!isMpiInit, report::QUEST_OWNED_MPI_WAS_PRE_INIT, caller);
+
+    // (B) If QuEST is instructed not to use distribution, we must demand the user is not
+    // using MPI, because we internally consult comm_isInit() to detect QuEST distribution
+    // in many functions, and that will give a false positive when the user inits MPI directly. 
+    if (!useDistrib)
+        assertThat(!isMpiInit, report::QUEST_IS_NON_DISTRIBUTED_BUT_MPI_WAS_INIT, caller);
+
+    // TODO: we can relax above, permitting the user to play with MPI directly while 
+    // disabling it for QuEST, by replacing internal comm_isInit() with e.g. env_isDistributed()
+
+    // (C) If QuEST will use MPI owned by the user, the user must have pre-initialised it
+    if (useDistrib && userOwnsMpi)
+        assertThat(isMpiInit, report::USER_OWNED_MPI_WAS_NOT_INIT, caller);
+    
+    // Confirmation that all 8 scenarios are handled:
+    //     useDistrib=0, userOwnsMpi=0, isMpiInit=0 (legal: nobody wants MPI)
+    // (A) useDistrib=0, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership)
+    //     useDistrib=0, userOwnsMpi=1, isMpiInit=0 (legal: user owns MPI but does nothing!)
+    // (B) useDistrib=0, userOwnsMpi=1, isMpiInit=1 (illegal: comm_isInit() limitation as above)
+    //     useDistrib=1, userOwnsMpi=0, isMpiInit=0 (legal: QuEST will init MPI)
+    // (A) useDistrib=1, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership)
+    // (C) useDistrib=1, userOwnsMpi=1, isMpiInit=0 (illegal: user has responsibility to pre-init)
+    //     useDistrib=1, userOwnsMpi=1, isMpiInit=1 (legal: user fulfilled responsibility to pre-init)
+}
+
 
 
 /*
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 66fb8f546..345931946 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -77,6 +77,8 @@ void validate_newEnvNodesEachHaveUniqueGpu(const char* caller);
 
 void validate_gpuIsCuQuantumCompatible(const char* caller);
 
+void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* caller);
+
 
 
 /*

From 507c2e46b2f8b8f7a704cac1e28273de08ccabc5 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 01:33:34 -0400
Subject: [PATCH 19/58] Simplify comm_init()

---
 quest/src/api/environment.cpp  |  3 +-
 quest/src/comm/comm_config.cpp | 53 +++++++---------------------------
 quest/src/comm/comm_config.hpp |  2 +-
 3 files changed, 14 insertions(+), 44 deletions(-)

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 9f77421d8..3c0b90999 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -100,7 +100,8 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     // and before any GPU initialisation and validation, since we will
     // perform that specifically upon the MPI-process-bound GPU(s). Further,
     // we can make sure validation errors are reported only by the root node.
-    comm_init(useDistrib, userOwnsMpi);
+    if (useDistrib)
+        comm_init(userOwnsMpi);
 
     validate_newEnvDistributedBetweenPower2Nodes(caller);
 
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index 9da8f34e1..f141d1f85 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -103,55 +103,24 @@ bool comm_isInit() {
 }
 
 
-void comm_init(int useDistrib, bool userOwnsMpi) {
+void comm_init(bool userOwnsMpi) {
 #if QUEST_COMPILE_MPI
 
-    // error if user owns MPI but has not initialised
-    if (userOwnsMpi && !comm_isInit()) {
+    // re-assert prior user-validations for robustness
+    if (userOwnsMpi && !comm_isInit())
         error_commNotInit();
-    }
+    if (!userOwnsMpi && comm_isInit())
+        error_commAlreadyInit();
    
-    // Overall mpiCommQuest should be set in the following ways
-    // however only useDistrib = 1 and userOwnsMpi = false
-    // and useDistrib = 0 and userOwnsMpi = true 
-    // require action here
-    //
-    // | useDistrib | userOwnsMpi |  mpiCommQuest  |
-    // | ---------- | ----------- | -------------- |
-    // |     0      |    false    | MPI_COMM_NULL  |
-    // | ---------- | ----------- | -------------- |
-    // |     1      |    false    | MPI_COMM_WORLD |
-    // | ---------- | ----------- | -------------- |
-    // |     0      |    true     | MPI_COMM_SELF  |
-    // | ---------- | ----------- | -------------- |
-    // |            |             | MPI_COMM_WORLD |
-    // |     1      |    true     |      or        |
-    // |            |             | userQuestComm  |
-    // | ---------- | ----------- | -------------- |
-    
+    // init MPI only when it's not the user's responsibility
+    if (!userOwnsMpi)
+        MPI_Init(NULL, NULL);
 
-    if (useDistrib && !userOwnsMpi) {
-        // error if attempting re-initialisation
-        if (comm_isInit()) {
-            error_commAlreadyInit();
-        } else {
-            MPI_Init(NULL, NULL);
-            // The user wants MPI and is leaving it to QuEST
-            MPI_Comm_dup(MPI_COMM_WORLD, &mpiCommQuest);
-        }
-    } else if (!useDistrib && userOwnsMpi) {
-        // The user has initialised MPI but wants QuEST to ignore it
-        MPI_Comm_dup(MPI_COMM_SELF, &mpiCommQuest);
-    } else if (useDistrib && userOwnsMpi) {
-        // if mpiCommQuEST is still MPI_COMM_NULL the user is not 
-        // providing their own MPI_Comm and we should set mpiCommQuest
-        // to MPI_COMM_WORLD
-        if (mpiCommQuest == MPI_COMM_NULL)
-            MPI_Comm_dup(MPI_COMM_WORLD, &mpiCommQuest);
-    }
+    // choose communicator only when the user hasn't 
+    if (mpiCommQuest == MPI_COMM_NULL)
+        MPI_Comm_dup(MPI_COMM_WORLD, &mpiCommQuest);
 
 #endif
-    return;
 }
 
 
diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp
index b2d038cd5..b061dd3e2 100644
--- a/quest/src/comm/comm_config.hpp
+++ b/quest/src/comm/comm_config.hpp
@@ -22,7 +22,7 @@ bool comm_isMpiCompiled();
 bool comm_isMpiSubCommunicatorCompiled();
 bool comm_isMpiGpuAware();
 
-void comm_init(int useDistrib, bool userOwnsMpi);
+void comm_init(bool userOwnsMpi);
 void comm_end(bool userOwnsMpi);
 void comm_sync();
 

From e80f768a1a5a8343b7131736c51a79c63cfdc4bd Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 01:37:47 -0400
Subject: [PATCH 20/58] Enable error msgs even when MPI config is invalid

These exemptions for communicator NULL-ness enable an error message to reach the user even when the user has called MPI_Init themselves but then triggered a validation error before the communicator could be set
---
 quest/src/comm/comm_config.cpp | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index f141d1f85..7bc9c2f92 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -132,6 +132,13 @@ void comm_end(bool userOwnsMpi) {
     if (!comm_isInit())
         return;
 
+    // ungracefully handle when the communicator is still NULL, because comm_end() may be
+    // triggered by "bad MPI init" validation, during which, the communicator may not yet
+    // have been set. We choose NOT to divert to MPI_COMM_WORLD, which is likely just to
+    // stall at MPI_Barrier, and instead let the user's communicator live on; then crash!
+    if (mpiCommQuest == MPI_COMM_NULL)
+        return;
+
     MPI_Barrier(mpiCommQuest);
     MPI_Comm_free(&mpiCommQuest);
     
@@ -152,8 +159,16 @@ int comm_getRank() {
     if (!comm_isInit())
         return ROOT_RANK;
 
+    // consult the (potentially sub-) communicator for rank; if it is still
+    // NULL, as can only validly happen during failed MPI status validation (the
+    // error msg is attemptedly printed on only the root process), fallback to
+    // using WORLD (and pray the user hasn't silenced world-root std-out!). We
+    // COULD safely return ROOT_RANK instead, letting all processes believe they
+    // are root, but this grossly duplicates the output across ALL processes
+    MPI_Comm comm = (mpiCommQuest == MPI_COMM_NULL)? MPI_COMM_WORLD : mpiCommQuest;
+
     int rank;
-    MPI_Comm_rank(mpiCommQuest, &rank);
+    MPI_Comm_rank(comm, &rank);
     return rank;
 
 #else
@@ -200,6 +215,12 @@ void comm_sync() {
     if (!comm_isInit())
         return;
 
+    // gracefully handle when the communicator is still NULL, because comm_sync() is
+    // triggered by "bad MPI init" validation (during the error message printing)
+    // during which, the communicator may not yet have been overridden
+    if (mpiCommQuest == MPI_COMM_NULL)
+        return;
+
     MPI_Barrier(mpiCommQuest);
 #endif
 }

From 8b73cd3b7ea5f42724e73865233f598d786e4f97 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 01:42:47 -0400
Subject: [PATCH 21/58] Add Oliver's custom MPI examples

taken from #712
---
 examples/extended/user_owned_mpi.c      | 31 ++++++++++++
 examples/extended/user_owned_submpi.cpp | 66 +++++++++++++++++++++++++
 2 files changed, 97 insertions(+)
 create mode 100644 examples/extended/user_owned_mpi.c
 create mode 100644 examples/extended/user_owned_submpi.cpp

diff --git a/examples/extended/user_owned_mpi.c b/examples/extended/user_owned_mpi.c
new file mode 100644
index 000000000..55967a4ef
--- /dev/null
+++ b/examples/extended/user_owned_mpi.c
@@ -0,0 +1,31 @@
+/** @file
+ * 
+ * TODO
+ * 
+ * @author Oliver Brown
+ */
+
+#include <mpi.h>
+#include "quest.h"
+
+
+    // TODO:
+    // this file will only receive mpi.h from CMakeLists.txt if
+    // we are also compiling with QUEST_ENABLE_SUBCOMM. Fix this!
+
+
+int main (void)
+{
+    const int  USE_DISTRIB = 1;
+    const bool USER_MPI    = 1;
+    const int  USE_OPENMP  = 1;
+    const int  USE_GPU     = 0;
+
+    MPI_Init(NULL, NULL);
+    initCustomMpiQuESTEnv(USE_DISTRIB, USER_MPI, USE_GPU, USE_OPENMP);
+    reportQuESTEnv();
+    finalizeQuESTEnv();
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/examples/extended/user_owned_submpi.cpp b/examples/extended/user_owned_submpi.cpp
new file mode 100644
index 000000000..d1d336637
--- /dev/null
+++ b/examples/extended/user_owned_submpi.cpp
@@ -0,0 +1,66 @@
+/** @file
+ * 
+ * TODO
+ * 
+ * @author Oliver Brown
+ */
+
+#include <cstdio>
+#include <mpi.h>
+#include <quest.h>
+
+
+    // TODO:
+    // this file will only receive mpi.h from CMakeLists.txt if
+    // we are also compiling with QUEST_ENABLE_SUBCOMM. Fix this!
+
+
+int main (void)
+{
+    int nprocs, quest_nprocs, world_rank, quest_rank;
+    MPI_Comm comm_split, comm_quantum, comm_classical;
+
+    MPI_Init(NULL, NULL);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+
+    const int I_AM_QUANTUM = world_rank % 2;
+
+    std::printf("[%d] Hello from rank %d of %d in MPI_COMM_WORLD.\n", world_rank, world_rank, nprocs);
+
+    MPI_Comm_split(MPI_COMM_WORLD, I_AM_QUANTUM, world_rank, &comm_split);
+
+    if (I_AM_QUANTUM) {
+        MPI_Comm_dup(comm_split, &comm_quantum);
+        MPI_Comm_size(comm_quantum, &quest_nprocs);
+        MPI_Comm_rank(comm_quantum, &quest_rank);
+        std::printf("[%d] Hello from rank %d of %d in comm_quantum.\n", world_rank, quest_rank, quest_nprocs);
+    } else {
+        MPI_Comm_dup(comm_split, &comm_classical);
+        quest_rank = -1;
+        quest_nprocs = -1;
+    }
+
+    // only procs in quantum comm initialise QuEST
+    if (I_AM_QUANTUM) {
+        std::printf("[%d] Initialising QuEST.\n", world_rank);
+        initCustomMpiCommQuESTEnv(comm_quantum, modeflag::USE_AUTO, modeflag::USE_AUTO);
+
+        reportQuESTEnv();
+
+        std::printf("[%d] Finalising QuEST.\n", world_rank);
+        finalizeQuESTEnv();
+    }
+
+    MPI_Comm_free(&comm_split);
+    if (I_AM_QUANTUM) {
+        MPI_Comm_free(&comm_quantum);
+    } else {
+        MPI_Comm_free(&comm_classical);
+    }
+
+    MPI_Finalize();
+
+    return 0;
+}

From e91f54f9126cd2777093d2920ebce89e823c2b69 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 01:52:27 -0400
Subject: [PATCH 22/58] renamed env.userOwnsMpi to env.isMpiUserOwned

for consistency with e.g. env.isMultithreaded. The user arg to e.g. initQuESTEnv() is kept as "userOwnsMpi" for a very superficial consistency with e.g. "useMultithreaded" :^)
---
 quest/include/environment.h   |  2 +-
 quest/src/api/environment.cpp | 22 ++++++++++------------
 tests/unit/environment.cpp    |  6 +++---
 3 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/quest/include/environment.h b/quest/include/environment.h
index 440305b75..a584192d7 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -38,6 +38,7 @@ typedef struct {
     bool isMultithreaded;
     bool isGpuAccelerated;
     bool isDistributed;
+    bool isMpiUserOwned;
 
     // deployment modes which cannot be directly changed after compilation
     bool isCuQuantumEnabled;
@@ -48,7 +49,6 @@ typedef struct {
     // distributed configuration
     int rank;
     int numNodes;
-    bool userOwnsMpi;
 
 } QuESTEnv;
 
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 3c0b90999..1e740e427 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -152,13 +152,13 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     globalEnvPtr->isMultithreaded     = useMultithread;
     globalEnvPtr->isGpuAccelerated    = useGpuAccel;
     globalEnvPtr->isDistributed       = useDistrib;
+    globalEnvPtr->isMpiUserOwned      = userOwnsMpi;
     globalEnvPtr->isCuQuantumEnabled  = useCuQuantum;
     globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;
 
     // bind distributed info
     globalEnvPtr->rank     = (useDistrib)? comm_getRank()     : 0;
     globalEnvPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
-    globalEnvPtr->userOwnsMpi = userOwnsMpi;
 }
 
 void updateQuESTEnvDistInfo() {
@@ -214,7 +214,7 @@ void printDeploymentInfo() {
     print_table(
         "deployment", {
         {"isMpiEnabled",        globalEnvPtr->isDistributed},
-        {"doesUserOwnMpi",      globalEnvPtr->userOwnsMpi},
+        {"isMpiUserOwned",      globalEnvPtr->isMpiUserOwned},
         {"isGpuEnabled",        globalEnvPtr->isGpuAccelerated},
         {"isOmpEnabled",        globalEnvPtr->isMultithreaded},
         {"isCuQuantumEnabled",  globalEnvPtr->isCuQuantumEnabled},
@@ -457,7 +457,7 @@ void finalizeQuESTEnv() {
 
     if (globalEnvPtr->isDistributed) {
         comm_sync();
-        comm_end(globalEnvPtr->userOwnsMpi);
+        comm_end(globalEnvPtr->isMpiUserOwned);
     }
 
     // free global env's heap memory and flag it as unallocated
@@ -517,19 +517,17 @@ void reportQuESTEnv() {
 void getQuESTEnvironmentString(char str[200]) {
     validate_envIsInit(__func__);
 
-    QuESTEnv env = getQuESTEnv();
-
     int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
-    int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
-    int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
+    int cuQuantum = globalEnvPtr->isGpuAccelerated && gpu_isCuQuantumCompiled();
+    int gpuDirect = globalEnvPtr->isGpuAccelerated && gpu_isDirectGpuCommPossible();
 
     snprintf(str, 200, "CUDA=%d OpenMP=%d MPI=%d userOwnsMPI=%d threads=%d ranks=%d cuQuantum=%d gpuDirect=%d",
-        env.isGpuAccelerated,
-        env.isMultithreaded,
-        env.isDistributed,
-        env.userOwnsMpi,
+        globalEnvPtr->isGpuAccelerated,
+        globalEnvPtr->isMultithreaded,
+        globalEnvPtr->isDistributed,
+        globalEnvPtr->isMpiUserOwned,
         numThreads,
-        env.numNodes,
+        globalEnvPtr->numNodes,
         cuQuantum,
         gpuDirect);
 }
diff --git a/tests/unit/environment.cpp b/tests/unit/environment.cpp
index 344ac5864..85d96cf8e 100644
--- a/tests/unit/environment.cpp
+++ b/tests/unit/environment.cpp
@@ -161,9 +161,9 @@ TEST_CASE( "getQuESTEnv", TEST_CATEGORY ) {
         REQUIRE( (env.isMultithreaded     == 0 || env.isMultithreaded     == 1) );
         REQUIRE( (env.isGpuAccelerated    == 0 || env.isGpuAccelerated    == 1) );
         REQUIRE( (env.isDistributed       == 0 || env.isDistributed       == 1) );
-        REQUIRE( (env.userOwnsMpi         == 0 || env.userOwnsMpi         == 1) );
-        REQUIRE( (env.isCuQuantumEnabled  == 0 || env.isCuQuantumEnabled  == 1) );
-        REQUIRE( (env.isGpuSharingEnabled == 0 || env.isGpuSharingEnabled == 1) );
+        REQUIRE( (env.isMpiUserOwned      == 0 || env.isMpiUserOwned      == 1) ); // <- pointless since bool
+        REQUIRE( (env.isCuQuantumEnabled  == 0 || env.isCuQuantumEnabled  == 1) ); //    but you can't be too
+        REQUIRE( (env.isGpuSharingEnabled == 0 || env.isGpuSharingEnabled == 1) ); //    careful ;^)
         
         REQUIRE( env.rank     >= 0 );
         REQUIRE( env.numNodes >= 0 );

From fe1020cf43a96fad2eb7b646cdc3d79f3b5cad76 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 01:53:08 -0400
Subject: [PATCH 23/58] Remove redundant stdbool include

---
 quest/src/api/subcommunicator.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/quest/src/api/subcommunicator.cpp b/quest/src/api/subcommunicator.cpp
index e248f0dba..e2fd00129 100644
--- a/quest/src/api/subcommunicator.cpp
+++ b/quest/src/api/subcommunicator.cpp
@@ -7,7 +7,6 @@
 
 #if QUEST_COMPILE_MPI && QUEST_COMPILE_SUBCOMM
 
-#include <stdbool.h>
 #include <mpi.h>
 
 void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useMultithread) {

From d363d092c692cab0625043b49e39f760db1ee899 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 02:07:27 -0400
Subject: [PATCH 24/58] Add validation to initCustomMpiCommQuESTEnv

---
 quest/src/api/subcommunicator.cpp | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/quest/src/api/subcommunicator.cpp b/quest/src/api/subcommunicator.cpp
index e2fd00129..dcef1c161 100644
--- a/quest/src/api/subcommunicator.cpp
+++ b/quest/src/api/subcommunicator.cpp
@@ -2,29 +2,36 @@
 #include "quest/include/environment.h"
 #include "quest/include/subcommunicator.h"
 
+#include "quest/src/core/validation.hpp"
 #include "quest/src/comm/comm_config.hpp"
-#include "quest/src/core/errors.hpp"
 
 #if QUEST_COMPILE_MPI && QUEST_COMPILE_SUBCOMM
 
 #include <mpi.h>
 
+
+
+// TODO:
+// We must resolve this inner function of QuEST initialisation, but which is
+// private to api/environment.cpp, and so cannot be exposed in the user-facing
+// include/environment.hpp. Grr! For now, we here just cheekily extern it c:
+extern void validateAndInitCustomQuESTEnv(
+    int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread, const char* caller);
+
+
+
 void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useMultithread) {
+
     // useDistrib and userOwnsMpi are implied by the user of this initialiser
     const int useDistrib = 1;
     const bool userOwnsMpi = true;
 
-    // set mpiCommQuest to user provided communicator
-    if (comm_isInit()) {
-        comm_setMpiComm(userQuestComm);
-    } else {
-        error_commNotInit();
-    }
-
-    // initialise QuEST around that communicator
-    initCustomMpiQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread);
+    // pre-validate that we are able to set the MPI communicator
+    validate_mpiInitStatus(useDistrib, userOwnsMpi, __func__);
+    comm_setMpiComm(userQuestComm);
 
-    return;
+    // perform remaining validation and init QuEST env
+    validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
 }
 
 #endif

From 70ac5693a37b820ad05913a43c57a616aefd37bd Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 02:09:45 -0400
Subject: [PATCH 25/58] Rename mpiCommQuest to global_mpiComm

---
 quest/src/comm/comm_config.cpp | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index 7bc9c2f92..73f8ffa3b 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -21,7 +21,7 @@
 #if QUEST_COMPILE_MPI
     #include <mpi.h>
 
-    static MPI_Comm mpiCommQuest = MPI_COMM_NULL;
+    static MPI_Comm global_mpiComm = MPI_COMM_NULL;
 #endif
 
 
@@ -117,8 +117,8 @@ void comm_init(bool userOwnsMpi) {
         MPI_Init(NULL, NULL);
 
     // choose communicator only when the user hasn't 
-    if (mpiCommQuest == MPI_COMM_NULL)
-        MPI_Comm_dup(MPI_COMM_WORLD, &mpiCommQuest);
+    if (global_mpiComm == MPI_COMM_NULL)
+        MPI_Comm_dup(MPI_COMM_WORLD, &global_mpiComm);
 
 #endif
 }
@@ -136,11 +136,11 @@ void comm_end(bool userOwnsMpi) {
     // triggered by "bad MPI init" validation, during which, the communicator may not yet
     // have been set. We choose NOT to divert to MPI_COMM_WORLD, which is likely just to
     // stall at MPI_Barrier, and instead let the user's communicator live on; then crash!
-    if (mpiCommQuest == MPI_COMM_NULL)
+    if (global_mpiComm == MPI_COMM_NULL)
         return;
 
-    MPI_Barrier(mpiCommQuest);
-    MPI_Comm_free(&mpiCommQuest);
+    MPI_Barrier(global_mpiComm);
+    MPI_Comm_free(&global_mpiComm);
     
     // QuEST must finalise MPI if the user does not own it
     if (!userOwnsMpi)
@@ -165,7 +165,7 @@ int comm_getRank() {
     // using WORLD (and pray the user hasn't silenced world-root std-out!). We
     // COULD safely return ROOT_RANK instead, letting all processes believe they
     // are root, but this grossly duplicates the output across ALL processes
-    MPI_Comm comm = (mpiCommQuest == MPI_COMM_NULL)? MPI_COMM_WORLD : mpiCommQuest;
+    MPI_Comm comm = (global_mpiComm == MPI_COMM_NULL)? MPI_COMM_WORLD : global_mpiComm;
 
     int rank;
     MPI_Comm_rank(comm, &rank);
@@ -197,7 +197,7 @@ int comm_getNumNodes() {
         return 1;
 
     int numNodes;
-    MPI_Comm_size(mpiCommQuest, &numNodes);
+    MPI_Comm_size(global_mpiComm, &numNodes);
     return numNodes;
 
 #else
@@ -218,29 +218,29 @@ void comm_sync() {
     // gracefully handle when the communicator is still NULL, because comm_sync() is
     // triggered by "bad MPI init" validation (during the error message printing)
     // during which, the communicator may not yet have been overriden
-    if (mpiCommQuest == MPI_COMM_NULL)
+    if (global_mpiComm == MPI_COMM_NULL)
         return;
 
-    MPI_Barrier(mpiCommQuest);
+    MPI_Barrier(global_mpiComm);
 #endif
 }
 
 #if QUEST_COMPILE_MPI
     MPI_Comm comm_getMpiComm() {
-        return mpiCommQuest;
+        return global_mpiComm;
     }
 
     #if QUEST_COMPILE_SUBCOMM
         void comm_setMpiComm(MPI_Comm newComm) {
 
-            // error if mpiCommQuEST is already set!
-            if (mpiCommQuest != MPI_COMM_NULL) {
-                MPI_Barrier(mpiCommQuest);
-                MPI_Comm_free(&mpiCommQuest);
+            // error if global_mpiComm is already set!
+            if (global_mpiComm != MPI_COMM_NULL) {
+                MPI_Barrier(global_mpiComm);
+                MPI_Comm_free(&global_mpiComm);
                 error_commDoubleSetMpiComm();
             }
 
-            int mpi_err = MPI_Comm_dup(newComm, &mpiCommQuest);
+            int mpi_err = MPI_Comm_dup(newComm, &global_mpiComm);
             if (mpi_err != MPI_SUCCESS) {
                 error_commInvalidMpiComm();
             }

From ef6860b543145918fd6f01d776c76a30bd984937 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 02:17:43 -0400
Subject: [PATCH 26/58] Rename mpiCommQuest (local var) to mpiComm

and inline where trivial
---
 quest/src/comm/comm_routines.cpp | 63 +++++++++++---------------------
 quest/src/core/errors.cpp        |  2 +-
 2 files changed, 23 insertions(+), 42 deletions(-)

diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp
index 0bc90563b..166586606 100644
--- a/quest/src/comm/comm_routines.cpp
+++ b/quest/src/comm/comm_routines.cpp
@@ -149,8 +149,7 @@ int getMaxNumMessages() {
     // messages. Beware the max is obtained via a void pointer and might be unset...
     void* tagUpperBoundPtr;
     int isAttribSet;
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
-    MPI_Comm_get_attr(mpiCommQuest, MPI_TAG_UB, &tagUpperBoundPtr, &isAttribSet);
+    MPI_Comm_get_attr(comm_getMpiComm(), MPI_TAG_UB, &tagUpperBoundPtr, &isAttribSet);
 
     // if something went wrong with obtaining the tag bound, return the safe minimum
     if (!isAttribSet)
@@ -217,7 +216,7 @@ std::array<qindex,3> dividePayloadIntoMessages(qindex numAmps) {
 void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
+    MPI_Comm mpiComm = comm_getMpiComm();
 
     // each message is asynchronously dispatched with a final wait, as per arxiv.org/abs/2308.07402
 
@@ -229,8 +228,8 @@ void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) {
     // so that messages are permitted to arrive out-of-order (supporting UCX adaptive-routing)
     for (qindex m=0; m<numMessages; m++) {
         int tag = static_cast<int>(m); // gauranteed int, but m*messageSize needs qindex
-        MPI_Irecv(&recv[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiCommQuest, &requests[2*m]);
-        MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiCommQuest, &requests[2*m+1]);
+        MPI_Irecv(&recv[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[2*m]);
+        MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[2*m+1]);
     }
 
     // wait for all exchanges to complete (MPI will automatically free the request memory)
@@ -251,7 +250,7 @@ void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) {
 void asynchSendArray(qcomp* send, qindex numElems, int pairRank) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
+    MPI_Comm mpiComm = comm_getMpiComm();
 
     // we will not track nor wait for the asynch send; instead, the caller will later comm_sync()
     MPI_Request nullReq = MPI_REQUEST_NULL;
@@ -262,7 +261,7 @@ void asynchSendArray(qcomp* send, qindex numElems, int pairRank) {
     // asynchronously send the uniquely-tagged messages
     for (qindex m=0; m<numMessages; m++) {
         int tag = static_cast<int>(m); // gauranteed int, but m*messageSize needs qindex
-        MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiCommQuest, &nullReq);
+        MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &nullReq);
     }
 
 #else
@@ -274,7 +273,7 @@ void asynchSendArray(qcomp* send, qindex numElems, int pairRank) {
 void receiveArray(qcomp* dest, qindex numElems, int pairRank) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
+    MPI_Comm mpiComm = comm_getMpiComm();
 
     // expect the data in multiple messages
     auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(numElems);
@@ -285,7 +284,7 @@ void receiveArray(qcomp* dest, qindex numElems, int pairRank) {
     // listen to receive each uniquely-tagged message asynchronously (as per arxiv.org/abs/2308.07402)
     for (qindex m=0; m<numMessages; m++) {
         int tag = static_cast<int>(m); // gauranteed int, but m*messageSize needs qindex
-        MPI_Irecv(&dest[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiCommQuest, &requests[m]);
+        MPI_Irecv(&dest[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[m]);
     }
 
     // receivers wait for all messages to be received (while sender asynch proceeds)
@@ -310,8 +309,7 @@ void globallyCombineNonUniformSubArrays(
 ) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
-
+    auto mpiComm = comm_getMpiComm();
     int myRank = comm_getRank();
     int numNodes = comm_getNumNodes();
 
@@ -345,14 +343,14 @@ void globallyCombineNonUniformSubArrays(
         for (int m=0; m<numBigMsgs; m++) {
             qindex recvInd = globalRecvIndPerRank[sendRank] + (m * bigMsgSize);
             requests.push_back(MPI_REQUEST_NULL);
-            MPI_Ibcast(&recv[recvInd], bigMsgSize, MPI_QCOMP, sendRank, mpiCommQuest, &requests.back());
+            MPI_Ibcast(&recv[recvInd], bigMsgSize, MPI_QCOMP, sendRank, mpiComm, &requests.back());
         }
 
         // and potentially one remaining asynch message 
         if (remMsgSize > 0) {
             qindex recvInd = globalRecvIndPerRank[sendRank] + (numBigMsgs * bigMsgSize);
             requests.push_back(MPI_REQUEST_NULL);
-            MPI_Ibcast(&recv[recvInd], remMsgSize, MPI_QCOMP, sendRank, mpiCommQuest, &requests.back());
+            MPI_Ibcast(&recv[recvInd], remMsgSize, MPI_QCOMP, sendRank, mpiComm, &requests.back());
         }
     }
 
@@ -648,9 +646,7 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank) {
 void comm_broadcastAmp(int sendRank, qcomp* sendAmp) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
-
-    MPI_Bcast(sendAmp, 1, MPI_QCOMP, sendRank, mpiCommQuest);
+    MPI_Bcast(sendAmp, 1, MPI_QCOMP, sendRank, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -661,7 +657,7 @@ void comm_broadcastAmp(int sendRank, qcomp* sendAmp) {
 void comm_sendAmpsToRoot(int sendRank, qcomp* send, qcomp* recv, qindex numAmps) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
+    MPI_Comm mpiComm = comm_getMpiComm();
 
     // only the sender and root nodes need to continue
     int recvRank = ROOT_RANK;
@@ -678,8 +674,8 @@ void comm_sendAmpsToRoot(int sendRank, qcomp* send, qcomp* recv, qindex numAmps)
     for (qindex m=0; m<numMessages; m++) {
         int tag = static_cast<int>(m);
         (myRank == sendRank)?
-            MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, recvRank, tag, mpiCommQuest, &requests[m]): // sender
-            MPI_Irecv(&recv[m*messageSize], messageSize, MPI_QCOMP, sendRank, tag, mpiCommQuest, &requests[m]); // root
+            MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, recvRank, tag, mpiComm, &requests[m]): // sender
+            MPI_Irecv(&recv[m*messageSize], messageSize, MPI_QCOMP, sendRank, tag, mpiComm, &requests[m]); // root
     }
 
     // wait for all exchanges to complete (MPI will automatically free the request memory)
@@ -692,13 +688,10 @@ void comm_sendAmpsToRoot(int sendRank, qcomp* send, qcomp* recv, qindex numAmps)
 
 
 void comm_broadcastIntsFromRoot(int* arr, qindex length) {
-
 #if QUEST_COMPILE_MPI
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
-
 
     int sendRank = ROOT_RANK;
-    MPI_Bcast(arr, length, MPI_INT, sendRank, mpiCommQuest);
+    MPI_Bcast(arr, length, MPI_INT, sendRank, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -709,10 +702,8 @@ void comm_broadcastIntsFromRoot(int* arr, qindex length) {
 void comm_broadcastUnsignedsFromRoot(unsigned* arr, qindex length) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
-
     int sendRank = ROOT_RANK;
-    MPI_Bcast(arr, length, MPI_UNSIGNED, sendRank, mpiCommQuest);
+    MPI_Bcast(arr, length, MPI_UNSIGNED, sendRank, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -739,9 +730,7 @@ void comm_combineSubArrays(qcomp* recv, vector<qindex> recvInds, vector<qindex>
 void comm_reduceAmp(qcomp* localAmp) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
-
-    MPI_Allreduce(MPI_IN_PLACE, localAmp, 1, MPI_QCOMP, MPI_SUM, mpiCommQuest);
+    MPI_Allreduce(MPI_IN_PLACE, localAmp, 1, MPI_QCOMP, MPI_SUM, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -752,9 +741,7 @@ void comm_reduceAmp(qcomp* localAmp) {
 void comm_reduceReal(qreal* localReal) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
-
-    MPI_Allreduce(MPI_IN_PLACE, localReal, 1, MPI_QREAL, MPI_SUM, mpiCommQuest);
+    MPI_Allreduce(MPI_IN_PLACE, localReal, 1, MPI_QREAL, MPI_SUM, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -765,9 +752,7 @@ void comm_reduceReal(qreal* localReal) {
 void comm_reduceReals(qreal* localReals, qindex numLocalReals) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
-
-    MPI_Allreduce(MPI_IN_PLACE, localReals, numLocalReals, MPI_QREAL, MPI_SUM, mpiCommQuest);
+    MPI_Allreduce(MPI_IN_PLACE, localReals, numLocalReals, MPI_QREAL, MPI_SUM, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -778,12 +763,10 @@ void comm_reduceReals(qreal* localReals, qindex numLocalReals) {
 bool comm_isTrueOnAllNodes(bool val) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
-
     // perform global AND and broadcast result back to all nodes
     int local = (int) val;
     int global;
-    MPI_Allreduce(&local, &global, 1, MPI_INT, MPI_LAND, mpiCommQuest);
+    MPI_Allreduce(&local, &global, 1, MPI_INT, MPI_LAND, comm_getMpiComm());
     return (bool) global;
 
 #else
@@ -819,8 +802,6 @@ bool comm_isTrueOnRootNode(bool val) {
 vector<string> comm_gatherStringsToRoot(char* localChars, int maxNumLocalChars) {
 #if QUEST_COMPILE_MPI
 
-    MPI_Comm mpiCommQuest = comm_getMpiComm();
-
     // no need to validate array sizes and memory alloc successes;
     // these are trivial O(#nodes)-size arrays containing <20 chars
     int numNodes = comm_getNumNodes();
@@ -831,7 +812,7 @@ vector<string> comm_gatherStringsToRoot(char* localChars, int maxNumLocalChars)
     // all nodes send root all their local chars
     int recvRank = ROOT_RANK;
     MPI_Gather(localChars, maxNumLocalChars, MPI_CHAR, allChars.data(),
-        maxNumLocalChars, MPI_CHAR, recvRank, mpiCommQuest);
+        maxNumLocalChars, MPI_CHAR, recvRank, comm_getMpiComm());
 
     // divide allChars into stings, delimited by each node's terminal char
     vector<string> out(numNodes);
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 7b624a2f7..2c576439c 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -188,7 +188,7 @@ void error_commNumMessagesExceedTagMax() {
 
 void error_commDoubleSetMpiComm() {
   
-    raiseInternalError("An attempt was made to set mpiCommQuest after it had already been set, as indicated by mpiCommQuest != MPI_COMM_NULL.");
+    raiseInternalError("An attempt was made to set the QuEST MPI communicator after it had already been set (and changed from MPI_COMM_NULL).");
 }
 
 void assert_commBoundsAreValid(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps) {

From d85e0641bfebf5f0f0a96dba1deb26e7f9d3c4ba Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 02:21:36 -0400
Subject: [PATCH 27/58] Made environment.cpp adhere to global_ convention

shame on 2024 me!
---
 quest/src/api/environment.cpp | 92 +++++++++++++++++------------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 1e740e427..61e2731ef 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -48,7 +48,7 @@ using std::string;
  */
 
 
-static QuESTEnv* globalEnvPtr = nullptr;
+static QuESTEnv* global_envPtr = nullptr;
 
 
 
@@ -62,7 +62,7 @@ static QuESTEnv* globalEnvPtr = nullptr;
  */
 
 
-static bool hasEnvBeenFinalized = false;
+static bool global_hasEnvBeenFinalized = false;
 
 
 
@@ -76,7 +76,7 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     // ensure that we are never re-initialising QuEST (even after finalize) because
     // this leads to undefined behaviour in distributed mode, as per the MPI std,
     // regardless of whether the user owns MPI
-    validate_envNeverInit(globalEnvPtr != nullptr, hasEnvBeenFinalized, caller);
+    validate_envNeverInit(global_envPtr != nullptr, global_hasEnvBeenFinalized, caller);
 
     // load env-vars before validating deployment mode, because some env vars can
     // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU)
@@ -142,28 +142,28 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     rand_setSeedsToDefault();
 
     // allocate space for the global QuESTEnv singleton (overwriting nullptr, unless malloc fails)
-    globalEnvPtr = (QuESTEnv*) malloc(sizeof(QuESTEnv));
+    global_envPtr = (QuESTEnv*) malloc(sizeof(QuESTEnv));
 
     // pedantically check that teeny tiny malloc just succeeded
-    if (globalEnvPtr == nullptr)
+    if (global_envPtr == nullptr)
         error_allocOfQuESTEnvFailed();
 
     // bind deployment info to global instance (autocasting int to bool)
-    globalEnvPtr->isMultithreaded     = useMultithread;
-    globalEnvPtr->isGpuAccelerated    = useGpuAccel;
-    globalEnvPtr->isDistributed       = useDistrib;
-    globalEnvPtr->isMpiUserOwned      = userOwnsMpi;
-    globalEnvPtr->isCuQuantumEnabled  = useCuQuantum;
-    globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;
+    global_envPtr->isMultithreaded     = useMultithread;
+    global_envPtr->isGpuAccelerated    = useGpuAccel;
+    global_envPtr->isDistributed       = useDistrib;
+    global_envPtr->isMpiUserOwned      = userOwnsMpi;
+    global_envPtr->isCuQuantumEnabled  = useCuQuantum;
+    global_envPtr->isGpuSharingEnabled = permitGpuSharing;
 
     // bind distributed info
-    globalEnvPtr->rank     = (useDistrib)? comm_getRank()     : 0;
-    globalEnvPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
+    global_envPtr->rank     = (useDistrib)? comm_getRank()     : 0;
+    global_envPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
 }
 
 void updateQuESTEnvDistInfo() {
-    globalEnvPtr->rank     = (globalEnvPtr->isDistributed)? comm_getRank()     : 0;
-    globalEnvPtr->numNodes = (globalEnvPtr->isDistributed)? comm_getNumNodes() : 1;
+    global_envPtr->rank     = (global_envPtr->isDistributed)? comm_getRank()     : 0;
+    global_envPtr->numNodes = (global_envPtr->isDistributed)? comm_getNumNodes() : 1;
     return;
 }
 
@@ -213,12 +213,12 @@ void printDeploymentInfo() {
 
     print_table(
         "deployment", {
-        {"isMpiEnabled",        globalEnvPtr->isDistributed},
-        {"isMpiUserOwned",      globalEnvPtr->isMpiUserOwned},
-        {"isGpuEnabled",        globalEnvPtr->isGpuAccelerated},
-        {"isOmpEnabled",        globalEnvPtr->isMultithreaded},
-        {"isCuQuantumEnabled",  globalEnvPtr->isCuQuantumEnabled},
-        {"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
+        {"isMpiEnabled",        global_envPtr->isDistributed},
+        {"isMpiUserOwned",      global_envPtr->isMpiUserOwned},
+        {"isGpuEnabled",        global_envPtr->isGpuAccelerated},
+        {"isOmpEnabled",        global_envPtr->isMultithreaded},
+        {"isCuQuantumEnabled",  global_envPtr->isCuQuantumEnabled},
+        {"isGpuSharingEnabled", global_envPtr->isGpuSharingEnabled},
     });
 }
 
@@ -278,7 +278,7 @@ void printDistributionInfo() {
     print_table(
         "distribution", {
         {"isMpiGpuAware", (comm_isMpiCompiled())? printer_toStr(comm_isMpiGpuAware()) : na},
-        {"numMpiNodes",   printer_toStr(globalEnvPtr->numNodes)},
+        {"numMpiNodes",   printer_toStr(global_envPtr->numNodes)},
     });
 }
 
@@ -288,7 +288,7 @@ void printQuregSizeLimits(bool isDensMatr) {
     using namespace printer_substrings;
 
     // for brevity
-    int numNodes = globalEnvPtr->numNodes;
+    int numNodes = global_envPtr->numNodes;
 
     // by default, CPU limits are unknown (because memory query might fail)
     string maxQbForCpu = un;
@@ -300,7 +300,7 @@ void printQuregSizeLimits(bool isDensMatr) {
         maxQbForCpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, 1, cpuMem));
 
         // and the max MPI sizes are only relevant when env is distributed
-        if (globalEnvPtr->isDistributed)
+        if (global_envPtr->isDistributed)
             maxQbForMpiCpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, cpuMem));
 
         // when MPI irrelevant, change their status from "unknown" to "N/A"
@@ -315,12 +315,12 @@ void printQuregSizeLimits(bool isDensMatr) {
     string maxQbForMpiGpu = na;
 
     // max GPU registers only relevant if env is GPU-accelerated
-    if (globalEnvPtr->isGpuAccelerated) {
+    if (global_envPtr->isGpuAccelerated) {
         qindex gpuMem = gpu_getCurrentAvailableMemoryInBytes();
         maxQbForGpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, 1, gpuMem));
 
         // and the max MPI sizes are further only relevant when env is distributed 
-        if (globalEnvPtr->isDistributed)
+        if (global_envPtr->isDistributed)
             maxQbForMpiGpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, gpuMem));
     }
 
@@ -357,7 +357,7 @@ void printQuregAutoDeployments(bool isDensMatr) {
 
     // test to theoretically max #qubits, surpassing max that can fit in RAM and GPUs, because
     // auto-deploy will still try to deploy there to (then subsequent validation will fail)
-    int maxQubits = mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(isDensMatr, globalEnvPtr->numNodes);
+    int maxQubits = mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(isDensMatr, global_envPtr->numNodes);
 
     for (int numQubits=1; numQubits<maxQubits; numQubits++) {
 
@@ -365,7 +365,7 @@ void printQuregAutoDeployments(bool isDensMatr) {
         useDistrib  = modeflag::USE_AUTO;
         useGpuAccel = modeflag::USE_AUTO;
         useMulti    = modeflag::USE_AUTO;;
-        autodep_chooseQuregDeployment(numQubits, isDensMatr, useDistrib, useGpuAccel, useMulti, *globalEnvPtr);
+        autodep_chooseQuregDeployment(numQubits, isDensMatr, useDistrib, useGpuAccel, useMulti, *global_envPtr);
 
         // skip if deployments are unchanged
         if (useDistrib  == prevDistrib  &&
@@ -430,7 +430,7 @@ void initQuESTEnv() {
 
 int isQuESTEnvInit() {
 
-    return (int) (globalEnvPtr != nullptr);
+    return (int) (global_envPtr != nullptr);
 }
 
 
@@ -438,7 +438,7 @@ QuESTEnv getQuESTEnv() {
     validate_envIsInit(__func__);
 
     // returns a copy, so cheeky users calling memcpy() upon const struct still won't mutate
-    return *globalEnvPtr;
+    return *global_envPtr;
 }
 
 
@@ -449,33 +449,33 @@ void finalizeQuESTEnv() {
     // calling this will not automatically
     // free the memory of existing Quregs
 
-    if (globalEnvPtr->isGpuAccelerated)
+    if (global_envPtr->isGpuAccelerated)
         gpu_clearCache(); // syncs first
 
-    if (globalEnvPtr->isGpuAccelerated && gpu_isCuQuantumCompiled())
+    if (global_envPtr->isGpuAccelerated && gpu_isCuQuantumCompiled())
         gpu_finalizeCuQuantum();
 
-    if (globalEnvPtr->isDistributed) {
+    if (global_envPtr->isDistributed) {
         comm_sync();
-        comm_end(globalEnvPtr->isMpiUserOwned);
+        comm_end(global_envPtr->isMpiUserOwned);
     }
 
     // free global env's heap memory and flag it as unallocated
-    free(globalEnvPtr);
-    globalEnvPtr = nullptr;
+    free(global_envPtr);
+    global_envPtr = nullptr;
 
     // flag that the environment was finalised, to ensure it is never re-initialised
-    hasEnvBeenFinalized = true;
+    global_hasEnvBeenFinalized = true;
 }
 
 
 void syncQuESTEnv() {
     validate_envIsInit(__func__);
 
-    if (globalEnvPtr->isGpuAccelerated)
+    if (global_envPtr->isGpuAccelerated)
         gpu_sync();
 
-    if (globalEnvPtr->isDistributed) {
+    if (global_envPtr->isDistributed) {
         comm_sync();
         #if QUEST_COMPILE_SUBCOMM
             updateQuESTEnvDistInfo();
@@ -518,16 +518,16 @@ void getQuESTEnvironmentString(char str[200]) {
     validate_envIsInit(__func__);
 
     int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
-    int cuQuantum = globalEnvPtr->isGpuAccelerated && gpu_isCuQuantumCompiled();
-    int gpuDirect = globalEnvPtr->isGpuAccelerated && gpu_isDirectGpuCommPossible();
+    int cuQuantum = global_envPtr->isGpuAccelerated && gpu_isCuQuantumCompiled();
+    int gpuDirect = global_envPtr->isGpuAccelerated && gpu_isDirectGpuCommPossible();
 
     snprintf(str, 200, "CUDA=%d OpenMP=%d MPI=%d userOwnsMPI=%d threads=%d ranks=%d cuQuantum=%d gpuDirect=%d",
-        globalEnvPtr->isGpuAccelerated,
-        globalEnvPtr->isMultithreaded,
-        globalEnvPtr->isDistributed,
-        globalEnvPtr->isMpiUserOwned,
+        global_envPtr->isGpuAccelerated,
+        global_envPtr->isMultithreaded,
+        global_envPtr->isDistributed,
+        global_envPtr->isMpiUserOwned,
         numThreads,
-        globalEnvPtr->numNodes,
+        global_envPtr->numNodes,
         cuQuantum,
         gpuDirect);
 }

From 8fe9bbec3d43a7db13773940cf902b5fa39f9022 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 02:24:22 -0400
Subject: [PATCH 28/58] Remove suspicious updateQuESTEnvDistInfo()

---
 quest/src/api/environment.cpp | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 61e2731ef..f0d990ad1 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -161,11 +161,6 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     global_envPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
 }
 
-void updateQuESTEnvDistInfo() {
-    global_envPtr->rank     = (global_envPtr->isDistributed)? comm_getRank()     : 0;
-    global_envPtr->numNodes = (global_envPtr->isDistributed)? comm_getNumNodes() : 1;
-    return;
-}
 
 
 /*
@@ -475,12 +470,8 @@ void syncQuESTEnv() {
     if (global_envPtr->isGpuAccelerated)
         gpu_sync();
 
-    if (global_envPtr->isDistributed) {
+    if (global_envPtr->isDistributed)
         comm_sync();
-        #if QUEST_COMPILE_SUBCOMM
-            updateQuESTEnvDistInfo();
-        #endif
-    }
 }
 
 

From 314e72e0e040973bd369876af066f17d696db30b Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 02:31:48 -0400
Subject: [PATCH 29/58] Error in comm_getMpiComm() when comm=NULL

---
 quest/src/comm/comm_config.cpp | 4 ++++
 quest/src/core/errors.cpp      | 5 +++++
 quest/src/core/errors.hpp      | 2 ++
 3 files changed, 11 insertions(+)

diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index 73f8ffa3b..aa627a444 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -227,6 +227,10 @@ void comm_sync() {
 
 #if QUEST_COMPILE_MPI
     MPI_Comm comm_getMpiComm() {
+
+        if (global_mpiComm == MPI_COMM_NULL)
+            error_commMpiCommIsNull();
+
         return global_mpiComm;
     }
 
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 2c576439c..d77bb2d38 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -191,6 +191,11 @@ void error_commDoubleSetMpiComm() {
     raiseInternalError("An attempt was made to set the QuEST MPI communicator after it had already been set (and changed from MPI_COMM_NULL).");
 }
 
+void error_commMpiCommIsNull() {
+
+    raiseInternalError("The MPI communicator was queried but was unexpectedly still MPI_COMM_NULL.");
+}
+
 void assert_commBoundsAreValid(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps) {
 
     bool valid = (
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index f276c06ad..c41b69851 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -95,6 +95,8 @@ void error_commNumMessagesExceedTagMax();
 
 void error_commDoubleSetMpiComm();
 
+void error_commMpiCommIsNull();
+
 void assert_commBoundsAreValid(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps);
 
 void assert_commPayloadIsPowerOf2(qindex numAmps);

From 51c0731b6fcc2a666f7b454777676cf148de786c Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 02:58:25 -0400
Subject: [PATCH 30/58] Remove MPI leak from comm_config.hpp

by just using extern. This is terrible and inadvisable, but offers an easier understanding of the software architecture (and so is easier to fix correctly) than the previous "macros change which signatures this header exposes" design.

Also, comm_setMpiComm is now always defined whenever MPI is compiled, rather than only when SUBCOMM is also defined; that extra conditional made the architecture even more confusing. Now, SUBCOMM only influences the contents of subcommunicator.cpp and subcommunicator.hpp. Simple!
---
 quest/src/api/subcommunicator.cpp |  8 +++--
 quest/src/comm/comm_config.cpp    | 56 +++++++++++++++++++------------
 quest/src/comm/comm_config.hpp    | 15 ++-------
 quest/src/comm/comm_routines.cpp  |  3 +-
 4 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/quest/src/api/subcommunicator.cpp b/quest/src/api/subcommunicator.cpp
index dcef1c161..688b2e9eb 100644
--- a/quest/src/api/subcommunicator.cpp
+++ b/quest/src/api/subcommunicator.cpp
@@ -7,9 +7,14 @@
 
 #if QUEST_COMPILE_MPI && QUEST_COMPILE_SUBCOMM
 
-#include <mpi.h>
+#include <mpi.h> // MPI_Comm
 
 
+// TODO:
+// We must resolve this communicator function which contains an MPI type
+// and ergo should not be leaked outside comm_config.cpp. For now, we cheat! 
+extern void comm_setMpiComm(MPI_Comm newComm);
+
 
 // TODO:
 // We must resolve this inner function of QuEST initialisation, but which is
@@ -19,7 +24,6 @@ extern void validateAndInitCustomQuESTEnv(
     int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread, const char* caller);
 
 
-
 void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useMultithread) {
 
     // useDistrib and userOwnsMpi are implied by the user of this initialiser
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index aa627a444..5f7a90a24 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -30,6 +30,7 @@
  * WARN ABOUT CUDA-AWARENESS
  */
 
+
 #if QUEST_COMPILE_MPI && QUEST_COMPILE_CUDA
 
     // this check is OpenMPI specific
@@ -54,6 +55,7 @@
 
 /*
  * MPI ENVIRONMENT MANAGEMENT
+ *
  * all of which is safely callable in non-distributed mode
  */
 
@@ -124,7 +126,6 @@ void comm_init(bool userOwnsMpi) {
 }
 
 
-
 void comm_end(bool userOwnsMpi) {
 #if QUEST_COMPILE_MPI
 
@@ -225,31 +226,42 @@ void comm_sync() {
 #endif
 }
 
+
+
+/*
+ * MPI COMMUNICATOR MANAGEMENT
+ *
+ * which requires exposing MPI_Comm in external-facing signatures.
+ * In lieu of leaking these into comm_config.hpp, callers must
+ * declare them as extern
+ */
+
+
 #if QUEST_COMPILE_MPI
-    MPI_Comm comm_getMpiComm() {
 
-        if (global_mpiComm == MPI_COMM_NULL)
-            error_commMpiCommIsNull();
+MPI_Comm comm_getMpiComm() {
 
-        return global_mpiComm;
-    }
+    if (global_mpiComm == MPI_COMM_NULL)
+        error_commMpiCommIsNull();
 
-    #if QUEST_COMPILE_SUBCOMM
-        void comm_setMpiComm(MPI_Comm newComm) {
+    return global_mpiComm;
+}
 
-            // error if global_mpiComm is already set!
-            if (global_mpiComm != MPI_COMM_NULL) {
-                MPI_Barrier(global_mpiComm);
-                MPI_Comm_free(&global_mpiComm);
-                error_commDoubleSetMpiComm();
-            }
+void comm_setMpiComm(MPI_Comm newComm) {
 
-            int mpi_err = MPI_Comm_dup(newComm, &global_mpiComm);
-            if (mpi_err != MPI_SUCCESS) {
-                error_commInvalidMpiComm();
-            }
+    // error if global_mpiComm is already set!
+    if (global_mpiComm != MPI_COMM_NULL) {
+        MPI_Barrier(global_mpiComm);
+        MPI_Comm_free(&global_mpiComm);
+        error_commDoubleSetMpiComm();
+    }
 
-            return;
-        }
-    #endif
-#endif
+    int mpi_err = MPI_Comm_dup(newComm, &global_mpiComm);
+    if (mpi_err != MPI_SUCCESS) {
+        error_commInvalidMpiComm();
+    }
+
+    return;
+}
+
+#endif // QUEST_COMPILE_MPI
diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp
index b061dd3e2..6b304575c 100644
--- a/quest/src/comm/comm_config.hpp
+++ b/quest/src/comm/comm_config.hpp
@@ -10,12 +10,6 @@
 #ifndef COMM_CONFIG_HPP
 #define COMM_CONFIG_HPP
 
-#include "quest/include/config.h"
-
-#if QUEST_COMPILE_MPI
-  #include <mpi.h>
-#endif
-
 constexpr int ROOT_RANK = 0;
 
 bool comm_isMpiCompiled();
@@ -33,11 +27,8 @@ bool comm_isInit();
 bool comm_isRootNode();
 bool comm_isRootNode(int rank);
 
-#if QUEST_COMPILE_MPI
-  MPI_Comm comm_getMpiComm();
-  #if QUEST_COMPILE_SUBCOMM
-    void comm_setMpiComm(MPI_Comm newComm);
-  #endif
-#endif
+// Signatures containing MPI types which callers must extern:
+// extern MPI_Comm comm_getMpiComm()
+// extern void comm_setMpiComm(MPI_Comm newComm)
 
 #endif // COMM_CONFIG_HPP
diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp
index 166586606..cf6956454 100644
--- a/quest/src/comm/comm_routines.cpp
+++ b/quest/src/comm/comm_routines.cpp
@@ -6,7 +6,7 @@
  * 
  * @author Tyson Jones
  * @author Jakub Adamski (sped-up large comm by asynch messages)
- * @author Oliver Brown (patched max-message inference, consulted on AR and MPICH support)
+ * @author Oliver Brown (added custom communicators, patched max-message inference, consulted on AR and MPICH support)
  * @author Ania (Anna) Brown (developed QuEST v1 logic)
  */
 
@@ -24,6 +24,7 @@
 
 #if QUEST_COMPILE_MPI
     #include <mpi.h>
+    extern MPI_Comm comm_getMpiComm(); // comm_config.cpp does not leak MPI_Comm
 #endif
 
 #include <vector>

From 047ede75d6ceee2e121c47c135e5e8eef1bc9daa Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 03:02:19 -0400
Subject: [PATCH 31/58] Rename comm_isMpiSubCommunicatorCompiled to
 comm_isMpiSubCommCompiled

although I suspect this is a poor choice of name. The logic should always be considered "compiled" when MPI is known, and we choose instead whether to "expose" the MPI signature to the users
---
 quest/src/api/environment.cpp  | 2 +-
 quest/src/comm/comm_config.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index f0d990ad1..ab18ced91 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -196,7 +196,7 @@ void printCompilationInfo() {
     print_table(
         "compilation", {
         {"isMpiCompiled",                comm_isMpiCompiled()},
-        {"isMpiSubCommunicatorCompiled", comm_isMpiSubCommunicatorCompiled()},
+        {"isMpiSubCommCompiled",         comm_isMpiSubCommCompiled()},
         {"isGpuCompiled",                gpu_isGpuCompiled()},
         {"isOmpCompiled",                cpu_isOpenmpCompiled()},
         {"isCuQuantumCompiled",          gpu_isCuQuantumCompiled()},
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index 5f7a90a24..c71cc16b8 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -64,7 +64,7 @@ bool comm_isMpiCompiled() {
     return (bool) QUEST_COMPILE_MPI;
 }
 
-bool comm_isMpiSubCommunicatorCompiled() {
+bool comm_isMpiSubCommCompiled() {
     return (bool) QUEST_COMPILE_SUBCOMM;
 }
 

From 1680a12600429fc94f3e6cc1ec14616644601838 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 03:05:37 -0400
Subject: [PATCH 32/58] Replace magic number

---
 quest/src/core/validation.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 871d94199..7f53e87bc 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -1164,10 +1164,10 @@ void default_inputErrorHandler(const char* func, const char* msg) {
     // will then attempt to instantly abort all nodes, losing the error message.
     comm_sync();
 
-    // finalise MPI before error-exit to avoid scaring user with giant MPI error message
+    // finalise MPI before error-exit to avoid scaring user with giant MPI error message;
     // we always "take ownership" of MPI here since we're about to kill the whole program
     if (comm_isInit())
-        comm_end(0);
+        comm_end(/*userOwnsMpi=*/false);
 
     // simply exit, interrupting any other process (potentially leaking)
     exit(EXIT_FAILURE);

From 00332a844ba5edfb12d1cdf2f744aff2dfb1cc0e Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 03:25:30 -0400
Subject: [PATCH 33/58] Make initCustomMpiCommQuESTEnv validate against re-init

without triggering an internal error
---
 quest/src/api/subcommunicator.cpp | 11 ++++++++---
 quest/src/comm/comm_config.cpp    | 26 ++++++++++++++++----------
 quest/src/comm/comm_config.hpp    |  4 +++-
 quest/src/core/errors.cpp         |  2 +-
 4 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/quest/src/api/subcommunicator.cpp b/quest/src/api/subcommunicator.cpp
index 688b2e9eb..47461cbab 100644
--- a/quest/src/api/subcommunicator.cpp
+++ b/quest/src/api/subcommunicator.cpp
@@ -7,7 +7,7 @@
 
 #if QUEST_COMPILE_MPI && QUEST_COMPILE_SUBCOMM
 
-#include <mpi.h> // MPI_Comm
+#include <mpi.h>
 
 
 // TODO:
@@ -32,9 +32,14 @@ void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useM
 
     // pre-validate that we are able to set the MPI communicator
     validate_mpiInitStatus(useDistrib, userOwnsMpi, __func__);
-    comm_setMpiComm(userQuestComm);
 
-    // perform remaining validation and init QuEST env
+    // avoid re-setting the MPI comm (to avoid an internal error), which happens
+    // if a user illegally re-calls this function, which will be subsequently
+    // caught by the validation in validateAndInitCustomQuESTEnv() below
+    if (comm_isMpiCommSet())
+        comm_setMpiComm(userQuestComm);
+
+    // perform remaining validation (some is harmlessly repeated) and init QuEST env
     validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
 }
 
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index c71cc16b8..b90861cc9 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -231,11 +231,16 @@ void comm_sync() {
 /*
  * MPI COMMUNICATOR MANAGEMENT
  *
- * which requires exposing MPI_Comm in external-facing signatures.
- * In lieu of leaking these into comm_config.hpp, callers must
- * declare them as extern
+ * some of which requires exposing MPI_Comm in external-facing signatures.
+ * In lieu of leaking these into comm_config.hpp, callers must extern them.
  */
 
+bool comm_isMpiCommSet() {
+
+    // once comm_init() or comm_setMpiComm() overwrite
+    // the communicator, is can never return to NULL  
+    return (global_mpiComm == MPI_COMM_NULL);
+}
 
 #if QUEST_COMPILE_MPI
 
@@ -249,17 +254,18 @@ MPI_Comm comm_getMpiComm() {
 
 void comm_setMpiComm(MPI_Comm newComm) {
 
-    // error if global_mpiComm is already set!
-    if (global_mpiComm != MPI_COMM_NULL) {
-        MPI_Barrier(global_mpiComm);
-        MPI_Comm_free(&global_mpiComm);
+    // this is called prior to QuEST initialisation,
+    // and merely seeks to overwrite global_mpiComm 
+
+    if (global_mpiComm != MPI_COMM_NULL)
         error_commDoubleSetMpiComm();
-    }
+    if (newComm == MPI_COMM_NULL)
+        error_commMpiCommIsNull();
 
     int mpi_err = MPI_Comm_dup(newComm, &global_mpiComm);
-    if (mpi_err != MPI_SUCCESS) {
+
+    if (mpi_err != MPI_SUCCESS)
         error_commInvalidMpiComm();
-    }
 
     return;
 }
diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp
index 6b304575c..ffe49eaed 100644
--- a/quest/src/comm/comm_config.hpp
+++ b/quest/src/comm/comm_config.hpp
@@ -13,7 +13,7 @@
 constexpr int ROOT_RANK = 0;
 
 bool comm_isMpiCompiled();
-bool comm_isMpiSubCommunicatorCompiled();
+bool comm_isMpiSubCommCompiled();
 bool comm_isMpiGpuAware();
 
 void comm_init(bool userOwnsMpi);
@@ -27,6 +27,8 @@ bool comm_isInit();
 bool comm_isRootNode();
 bool comm_isRootNode(int rank);
 
+bool comm_isMpiCommSet();
+
 // Signatures containing MPI types which callers must extern:
 // extern MPI_Comm comm_getMpiComm()
 // extern void comm_setMpiComm(MPI_Comm newComm)
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index d77bb2d38..6e354d267 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -193,7 +193,7 @@ void error_commDoubleSetMpiComm() {
 
 void error_commMpiCommIsNull() {
 
-    raiseInternalError("The MPI communicator was queried but was unexpectedly still MPI_COMM_NULL.");
+    raiseInternalError("The MPI communicator was queried (or set) but was unexpectedly MPI_COMM_NULL (or set to be).");
 }
 
 void assert_commBoundsAreValid(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps) {

From 6763af08b3a6e65f15d43f6ca5996d386069a272 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 03:33:22 -0400
Subject: [PATCH 34/58] Make initCustomMpiCommQuESTEnv validate subcomm is
 non-null

---
 quest/src/api/subcommunicator.cpp |  1 +
 quest/src/comm/comm_config.cpp    | 20 +++++++++++---------
 quest/src/core/validation.cpp     | 11 +++++++++++
 quest/src/core/validation.hpp     |  2 ++
 4 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/quest/src/api/subcommunicator.cpp b/quest/src/api/subcommunicator.cpp
index 47461cbab..e85b939ad 100644
--- a/quest/src/api/subcommunicator.cpp
+++ b/quest/src/api/subcommunicator.cpp
@@ -32,6 +32,7 @@ void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useM
 
     // pre-validate that we are able to set the MPI communicator
     validate_mpiInitStatus(useDistrib, userOwnsMpi, __func__);
+    validate_mpiSubCommIsNonNull(userQuestComm != MPI_COMM_NULL, __func__);
 
     // avoid re-setting the MPI comm (to avoid an internal error), which happens
     // if a user illegally re-calls this function, which will be subsequently
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index b90861cc9..c5a36a3f0 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -133,7 +133,7 @@ void comm_end(bool userOwnsMpi) {
     if (!comm_isInit())
         return;
 
-    // ungracefully handle when the communicator is still NULL, because comm_end() may be
+    // gracefully handle when the communicator is still NULL, because comm_end() may be
     // triggered by "bad MPI init" validation, during which, the communicator may not yet
     // have been set. We choose NOT to divert to MPI_COMM_WORLD, which is likely just to
     // stall at MPI_Barrier, and instead let the user's communicator live on; then crash!
@@ -160,16 +160,18 @@ int comm_getRank() {
     if (!comm_isInit())
         return ROOT_RANK;
 
-    // consult the (potentially sub-) communicator for rank; if it is still
-    // NULL, as can only validly happen during failed MPI status validation (the
-    // error msg is attemptedly printed on only the root process), fallback to
-    // using WORLD (and pray the user hasn't silenced world-root std-out!). We
-    // COULD safely return ROOT_RANK instead, letting all processes believe they
-    // are root, but this grossly duplicates the output across ALL processes
-    MPI_Comm comm = (global_mpiComm == MPI_COMM_NULL)? MPI_COMM_WORLD : global_mpiComm;
+    // Consult the (potentially sub-) communicator for rank; if it is still
+    // NULL, as can only validly happen during failed QuESTEnv init validation
+    // (which triggers root-only error printing and ergo this function), we
+    // fall back to every process believing it is root and so attempting to
+    // print. This safely avoids consulting a potentially bugged MPI communicator
+    // and losing the message. We once tried to fallback to MPI_COMM_WORLD here,
+    // to avoid duplicate output, but it is not worth the risk of msg loss!
+    if (global_mpiComm == MPI_COMM_NULL)
+        return ROOT_RANK;
 
     int rank;
-    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_rank(global_mpiComm, &rank);
     return rank;
 
 #else
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 7f53e87bc..0d8a49423 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -110,6 +110,9 @@ namespace report {
     string USER_OWNED_MPI_WAS_NOT_INIT =
         "User owns MPI but did not prior initialise MPI before initialising QuEST.";
 
+    string USER_GIVEN_MPI_COMMUNICATOR_IS_NULL =
+        "The provided MPI communicator was null (MPI_COMM_NULL).";
+
     string QUEST_OWNED_MPI_WAS_PRE_INIT =
         "MPI was already initialised prior to QuESTEnv initialisation, but the user did not declare MPI ownership.";
 
@@ -1529,6 +1532,14 @@ void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* calle
     //     useDistrib=1, userOwnsMpi=1, isMpiInit=1 (legal: user fulfilled responsibility to pre-init)
 }
 
+void validate_mpiSubCommIsNonNull(bool isNonNull, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertThat(isNonNull, report::USER_GIVEN_MPI_COMMUNICATOR_IS_NULL, caller);
+}
+
 
 
 /*
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 345931946..109728643 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -79,6 +79,8 @@ void validate_gpuIsCuQuantumCompatible(const char* caller);
 
 void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* caller);
 
+void validate_mpiSubCommIsNonNull(bool isNonNull, const char* caller);
+
 
 
 /*

From 1c9072cee12f8158c300f317957b1c835b8fe949 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 03:45:51 -0400
Subject: [PATCH 35/58] Make initCustomMpiCommQuESTEnv validate set-subcomm
 succeeds

replacing the original internal error. Note that all of the other MPI calls in comm_config.cpp and comm_routines.cpp are unguarded; we should create a macro around them
---
 quest/src/api/subcommunicator.cpp |  8 +++++---
 quest/src/comm/comm_config.cpp    | 12 ++++--------
 quest/src/comm/comm_config.hpp    |  2 +-
 quest/src/core/errors.cpp         |  5 -----
 quest/src/core/errors.hpp         |  2 --
 quest/src/core/validation.cpp     | 11 +++++++++++
 quest/src/core/validation.hpp     |  2 ++
 7 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/quest/src/api/subcommunicator.cpp b/quest/src/api/subcommunicator.cpp
index e85b939ad..74c05293a 100644
--- a/quest/src/api/subcommunicator.cpp
+++ b/quest/src/api/subcommunicator.cpp
@@ -13,7 +13,7 @@
 // TODO:
 // We must resolve this communicator function which contains an MPI type
 // and ergo should not be leaked outside comm_config.cpp. For now, we cheat! 
-extern void comm_setMpiComm(MPI_Comm newComm);
+extern bool comm_setMpiComm(MPI_Comm newComm);
 
 
 // TODO:
@@ -37,8 +37,10 @@ void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useM
     // avoid re-setting the MPI comm (to avoid an internal error), which happens
     // if a user illegally re-calls this function, which will be subsequently
     // caught by the validation in validateAndInitCustomQuESTEnv() below
-    if (comm_isMpiCommSet())
-        comm_setMpiComm(userQuestComm);
+    if (!comm_isMpiCommSet()) {
+        bool success = comm_setMpiComm(userQuestComm);
+        validate_mpiSubCommSetSucceeded(success, __func__);
+    }
 
     // perform remaining validation (some is harmlessly repeated) and init QuEST env
     validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index c5a36a3f0..20f8ac026 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -241,7 +241,7 @@ bool comm_isMpiCommSet() {
 
     // once comm_init() or comm_setMpiComm() overwrite
     // the communicator, is can never return to NULL  
-    return (global_mpiComm == MPI_COMM_NULL);
+    return (global_mpiComm != MPI_COMM_NULL);
 }
 
 #if QUEST_COMPILE_MPI
@@ -254,7 +254,7 @@ MPI_Comm comm_getMpiComm() {
     return global_mpiComm;
 }
 
-void comm_setMpiComm(MPI_Comm newComm) {
+bool comm_setMpiComm(MPI_Comm newComm) {
 
     // this is called prior to QuEST initialisation,
     // and merely seeks to overwrite global_mpiComm 
@@ -264,12 +264,8 @@ void comm_setMpiComm(MPI_Comm newComm) {
     if (newComm == MPI_COMM_NULL)
         error_commMpiCommIsNull();
 
-    int mpi_err = MPI_Comm_dup(newComm, &global_mpiComm);
-
-    if (mpi_err != MPI_SUCCESS)
-        error_commInvalidMpiComm();
-
-    return;
+    auto status = MPI_Comm_dup(newComm, &global_mpiComm);
+    return status == MPI_SUCCESS;
 }
 
 #endif // QUEST_COMPILE_MPI
diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp
index ffe49eaed..8441dbc23 100644
--- a/quest/src/comm/comm_config.hpp
+++ b/quest/src/comm/comm_config.hpp
@@ -31,6 +31,6 @@ bool comm_isMpiCommSet();
 
 // Signatures containing MPI types which callers must extern:
 // extern MPI_Comm comm_getMpiComm()
-// extern void comm_setMpiComm(MPI_Comm newComm)
+// extern bool comm_setMpiComm(MPI_Comm newComm)
 
 #endif // COMM_CONFIG_HPP
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 6e354d267..7e2c6ab73 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -156,11 +156,6 @@ void error_commAlreadyInit() {
     raiseInternalError("The MPI communication environment was attemptedly re-initialised despite the QuEST environment already existing.");
 }
 
-void error_commInvalidMpiComm() {
-
-    raiseInternalError("The supplied MPI communicator was MPI_COMM_NULL, or duplication failed.");
-}
-
 void error_commButEnvNotDistributed() {
 
     raiseInternalError("A function attempted to invoke communication despite QuEST being compiled in non-distributed mode.");
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index c41b69851..d557d39b2 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -81,8 +81,6 @@ void error_commNotInit();
 
 void error_commAlreadyInit();
 
-void error_commInvalidMpiComm();
-
 void error_commButEnvNotDistributed();
 
 void error_commOutOfBounds();
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 0d8a49423..e1df0af76 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -113,6 +113,9 @@ namespace report {
     string USER_GIVEN_MPI_COMMUNICATOR_IS_NULL =
         "The provided MPI communicator was null (MPI_COMM_NULL).";
 
+    string USER_GIVEN_MPI_COMMUNICATOR_FAILED_TO_SET =
+        "The provided MPI communicator could not be used; MPI_Comm_dup() was not successful.";
+
     string QUEST_OWNED_MPI_WAS_PRE_INIT =
         "MPI was already initialised prior to QuESTEnv initialisation, but the user did not declare MPI ownership.";
 
@@ -1540,6 +1543,14 @@ void validate_mpiSubCommIsNonNull(bool isNonNull, const char* caller) {
     assertThat(isNonNull, report::USER_GIVEN_MPI_COMMUNICATOR_IS_NULL, caller);
 }
 
+void validate_mpiSubCommSetSucceeded(bool success, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertThat(success, report::USER_GIVEN_MPI_COMMUNICATOR_FAILED_TO_SET, caller);
+}
+
 
 
 /*
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 109728643..787316326 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -81,6 +81,8 @@ void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* calle
 
 void validate_mpiSubCommIsNonNull(bool isNonNull, const char* caller);
 
+void validate_mpiSubCommSetSucceeded(bool success, const char* caller);
+
 
 
 /*

From 7c75e72f42f3950c173c41d12ca057be74e1ebbc Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 03:47:44 -0400
Subject: [PATCH 36/58] Remove redundant env.bool tests

---
 tests/unit/environment.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tests/unit/environment.cpp b/tests/unit/environment.cpp
index 85d96cf8e..9ecf8e376 100644
--- a/tests/unit/environment.cpp
+++ b/tests/unit/environment.cpp
@@ -158,13 +158,6 @@ TEST_CASE( "getQuESTEnv", TEST_CATEGORY ) {
 
         QuESTEnv env = getQuESTEnv();
 
-        REQUIRE( (env.isMultithreaded     == 0 || env.isMultithreaded     == 1) );
-        REQUIRE( (env.isGpuAccelerated    == 0 || env.isGpuAccelerated    == 1) );
-        REQUIRE( (env.isDistributed       == 0 || env.isDistributed       == 1) );
-        REQUIRE( (env.isMpiUserOwned      == 0 || env.isMpiUserOwned      == 1) ); // <- pointless since bool
-        REQUIRE( (env.isCuQuantumEnabled  == 0 || env.isCuQuantumEnabled  == 1) ); //    but you can't be too
-        REQUIRE( (env.isGpuSharingEnabled == 0 || env.isGpuSharingEnabled == 1) ); //    careful ;^)
-        
         REQUIRE( env.rank     >= 0 );
         REQUIRE( env.numNodes >= 0 );
         

From 93f30f28d51f39ce4edc35a7a9c87457f1f9b949 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 03:48:55 -0400
Subject: [PATCH 37/58] Rename error_commDoubleSetMpiComm

---
 quest/src/comm/comm_config.cpp | 2 +-
 quest/src/core/errors.cpp      | 2 +-
 quest/src/core/errors.hpp      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index 20f8ac026..27926f5d8 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -260,7 +260,7 @@ bool comm_setMpiComm(MPI_Comm newComm) {
     // and merely seeks to overwrite global_mpiComm 
 
     if (global_mpiComm != MPI_COMM_NULL)
-        error_commDoubleSetMpiComm();
+        error_commAlreadyHasSetMpiComm();
     if (newComm == MPI_COMM_NULL)
         error_commMpiCommIsNull();
 
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 7e2c6ab73..862136a9c 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -181,7 +181,7 @@ void error_commNumMessagesExceedTagMax() {
     raiseInternalError("A function attempted to communicate via more messages than permitted (since there would be more uniquely-tagged messages than the tag upperbound).");
 }
 
-void error_commDoubleSetMpiComm() {
+void error_commAlreadyHasSetMpiComm() {
   
     raiseInternalError("An attempt was made to set the QuEST MPI communicator after it had already been set (and changed from MPI_COMM_NULL).");
 }
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index d557d39b2..33cc0661d 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -91,7 +91,7 @@ void error_commGivenInconsistentNumSubArraysANodes();
 
 void error_commNumMessagesExceedTagMax();
 
-void error_commDoubleSetMpiComm();
+void error_commAlreadyHasSetMpiComm();
 
 void error_commMpiCommIsNull();
 

From 790d11c4041f2c6a94d810f5388bdaac93251c41 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 04:03:49 -0400
Subject: [PATCH 38/58] Skip custom MPI examples when no MPI

---
 examples/extended/user_owned_mpi.c      | 20 ++++++++++++++-----
 examples/extended/user_owned_submpi.cpp | 26 +++++++++++++++++++++----
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/examples/extended/user_owned_mpi.c b/examples/extended/user_owned_mpi.c
index 55967a4ef..5142b9c3c 100644
--- a/examples/extended/user_owned_mpi.c
+++ b/examples/extended/user_owned_mpi.c
@@ -5,16 +5,24 @@
  * @author Oliver Brown
  */
 
-#include <mpi.h>
 #include "quest.h"
 
+// This example requires linking with MPI, which the CMake
+// build only enables when QUEST_ENABLE_SUBCOMM is ON, which
+// results in quest.h defining QUEST_COMPILE_SUBCOMM
+#if ! QUEST_COMPILE_SUBCOMM
+
+int main(void)
+{    
+    std::printf("Example skipped since MPI is not linked.\n");
+    return 0;
+}
 
-    // TODO:
-    // this file will only receive mpi.h from CMakeLists.txt if
-    // we are also compiling with QUEST_ENABLE_SUBCOMM. Fix this!
+#else 
 
+#include <mpi.h>
 
-int main (void)
+int main(void)
 {
     const int  USE_DISTRIB = 1;
     const bool USER_MPI    = 1;
@@ -29,3 +37,5 @@ int main (void)
 
     return 0;
 }
+
+#endif // QUEST_COMPILE_SUBCOMM
diff --git a/examples/extended/user_owned_submpi.cpp b/examples/extended/user_owned_submpi.cpp
index d1d336637..164ecd419 100644
--- a/examples/extended/user_owned_submpi.cpp
+++ b/examples/extended/user_owned_submpi.cpp
@@ -5,15 +5,31 @@
  * @author Oliver Brown
  */
 
+#include "quest.h"
 #include <cstdio>
-#include <mpi.h>
-#include <quest.h>
 
 
     // TODO:
-    // this file will only receive mpi.h from CMakeLists.txt if
-    // we are also compiling with QUEST_ENABLE_SUBCOMM. Fix this!
+    // this example sees some processes print to std-out while
+    // QuEST is reporting, colliding with output. May be worth
+    // introducing a sync to force non-QuEST-processes to wait
+    // during QUEST reporting
+
+
+// This example requires linking with MPI, which the CMake
+// build only enables when QUEST_ENABLE_SUBCOMM is ON, which
+// results in quest.h defining QUEST_COMPILE_SUBCOMM
+#if ! QUEST_COMPILE_SUBCOMM
 
+int main()
+{    
+    std::printf("Example skipped since MPI is not linked.\n");
+    return 0;
+}
+
+#else 
+
+#include <mpi.h>
 
 int main (void)
 {
@@ -64,3 +80,5 @@ int main (void)
 
     return 0;
 }
+
+#endif // QUEST_COMPILE_SUBCOMM

From ac86d12370832ab34af6229dde340f4d6231f18f Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 29 May 2026 04:07:49 -0400
Subject: [PATCH 39/58] Patches

---
 examples/extended/user_owned_mpi.c | 3 ++-
 quest/src/comm/comm_config.cpp     | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/extended/user_owned_mpi.c b/examples/extended/user_owned_mpi.c
index 5142b9c3c..f5117f48f 100644
--- a/examples/extended/user_owned_mpi.c
+++ b/examples/extended/user_owned_mpi.c
@@ -6,6 +6,7 @@
  */
 
 #include "quest.h"
+#include <stdio.h>
 
 // This example requires linking with MPI, which the CMake
 // build only enables when QUEST_ENABLE_SUBCOMM is ON, which
@@ -14,7 +15,7 @@
 
 int main(void)
 {    
-    std::printf("Example skipped since MPI is not linked.\n");
+    printf("Example skipped since MPI is not linked.\n");
     return 0;
 }
 
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index 27926f5d8..c69e72919 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -238,10 +238,14 @@ void comm_sync() {
  */
 
 bool comm_isMpiCommSet() {
+#if QUEST_COMPILE_MPI
 
     // once comm_init() or comm_setMpiComm() overwrite
     // the communicator, is can never return to NULL  
     return (global_mpiComm != MPI_COMM_NULL);
+# else
+    return false;
+#endif
 }
 
 #if QUEST_COMPILE_MPI

From a483af5f17e457744f900329270aa8a3457e8c9d Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sat, 30 May 2026 18:05:04 -0400
Subject: [PATCH 40/58] Permit usage of MPI when QuEST is non-distributed

- added comm_isActive to indicate whether QuEST is using MPI (which is distinct from whether MPI itself is initialised)
- renamed comm_isInit to comm_isMpiInit, since it queries MPI directly/globally, and when true, does not indicate whether QuEST is actually using MPI
- record isMpiUserOwned within comm_config.cpp, since failed-validation must not kill user-owned MPI, and it must know user-ownership before QuESTEnv succeeds/records it (because validation can fail DURING QuESTEnv initialisation)
- explicitly divided (through doc) comm_config.cpp into things which query MPI globally, and things which query only QuEST's MPI env/communicator
-
---
 quest/src/api/channels.cpp        |   2 +-
 quest/src/api/environment.cpp     |   2 +-
 quest/src/api/matrices.cpp        |   2 +-
 quest/src/api/paulis.cpp          |   2 +-
 quest/src/api/qureg.cpp           |   2 +-
 quest/src/api/subcommunicator.cpp |   2 +-
 quest/src/comm/comm_config.cpp    | 249 +++++++++++++++++++-----------
 quest/src/comm/comm_config.hpp    |  11 +-
 quest/src/core/errors.cpp         |  12 +-
 quest/src/core/errors.hpp         |   4 +
 quest/src/core/randomiser.cpp     |   4 +-
 quest/src/core/validation.cpp     |  40 ++---
 quest/src/gpu/gpu_config.cpp      |   2 +-
 13 files changed, 204 insertions(+), 130 deletions(-)

diff --git a/quest/src/api/channels.cpp b/quest/src/api/channels.cpp
index d6e3ac4fb..afdeea1d4 100644
--- a/quest/src/api/channels.cpp
+++ b/quest/src/api/channels.cpp
@@ -107,7 +107,7 @@ void freeAllMemoryIfAnyAllocsFailed(T& obj) {
 
     // determine whether any node experienced a failure
     bool anyFail = didAnyLocalAllocsFail(obj);
-    if (comm_isInit())
+    if (comm_isActive())
         anyFail = comm_isTrueOnAllNodes(anyFail);
 
     // if so, free all memory before subsequent validation
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index ab18ced91..91a274fd9 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -452,7 +452,7 @@ void finalizeQuESTEnv() {
 
     if (global_envPtr->isDistributed) {
         comm_sync();
-        comm_end(global_envPtr->isMpiUserOwned);
+        comm_end();
     }
 
     // free global env's heap memory and flag it as unallocated
diff --git a/quest/src/api/matrices.cpp b/quest/src/api/matrices.cpp
index b17987eb4..de27a360c 100644
--- a/quest/src/api/matrices.cpp
+++ b/quest/src/api/matrices.cpp
@@ -165,7 +165,7 @@ void freeAllMemoryIfAnyAllocsFailed(T matr) {
 
     // ascertain whether any allocs failed on any node
     bool anyFail = didAnyLocalAllocsFail(matr);
-    if (comm_isInit())
+    if (comm_isActive())
         anyFail = comm_isTrueOnAllNodes(anyFail);
 
     // if so, free all heap fields
diff --git a/quest/src/api/paulis.cpp b/quest/src/api/paulis.cpp
index 855a9cfd8..7d367ed23 100644
--- a/quest/src/api/paulis.cpp
+++ b/quest/src/api/paulis.cpp
@@ -38,7 +38,7 @@ bool didAnyAllocsFailOnAnyNode(PauliStrSum sum) {
         ! mem_isAllocated(sum.coeffs)  || 
         ! mem_isAllocated(sum.isApproxHermitian) );
     
-    if (comm_isInit())
+    if (comm_isActive())
         anyFail = comm_isTrueOnAllNodes(anyFail);
 
     return anyFail;
diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp
index 034c96e5c..286785047 100644
--- a/quest/src/api/qureg.cpp
+++ b/quest/src/api/qureg.cpp
@@ -116,7 +116,7 @@ bool didAnyLocalAllocsFail(Qureg qureg) {
 bool didAnyAllocsFailOnAnyNode(Qureg qureg) {
 
     bool anyFail = didAnyLocalAllocsFail(qureg);
-    if (comm_isInit())
+    if (comm_isActive())
         anyFail = comm_isTrueOnAllNodes(anyFail);
 
     return anyFail;
diff --git a/quest/src/api/subcommunicator.cpp b/quest/src/api/subcommunicator.cpp
index 74c05293a..6560f2f24 100644
--- a/quest/src/api/subcommunicator.cpp
+++ b/quest/src/api/subcommunicator.cpp
@@ -37,7 +37,7 @@ void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useM
     // avoid re-setting the MPI comm (to avoid an internal error), which happens
     // if a user illegally re-calls this function, which will be subsequently
     // caught by the validation in validateAndInitCustomQuESTEnv() below
-    if (!comm_isMpiCommSet()) {
+    if (!comm_isActive()) {
         bool success = comm_setMpiComm(userQuestComm);
         validate_mpiSubCommSetSucceeded(success, __func__);
     }
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index c69e72919..acd1e223c 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -6,8 +6,11 @@
  * 
  * Note that even when QUEST_COMPILE_MPI=1, the user may have
  * disabled distribution when creating the QuEST environment
- * at runtime. Ergo we use comm_isInit() to determine whether
- * functions should invoke the MPI API.
+ * at runtime - even though they may themselves initialise and
+ * use MPI. So we must be careful about consulting MPI status!
+ * Furthermore, all routines here will only ever consult/affect
+ * the QuEST communicator, never the entire MPI environment,
+ * the latter of which may contain non-participating processes.
  * 
  * @author Tyson Jones
  */
@@ -20,8 +23,6 @@
 
 #if QUEST_COMPILE_MPI
     #include <mpi.h>
-
-    static MPI_Comm global_mpiComm = MPI_COMM_NULL;
 #endif
 
 
@@ -53,10 +54,92 @@
 
 
 
+/*
+ * COMMUNICATOR MANAGEMENT
+ *
+ * QuEST will only ever use the overridable global_mpiComm communicator,
+ * so that superusers can dedicate external MPI processes to other tasks.
+ * Beware that it's valid for QuEST to be compiled with MPI, but have
+ * distribution runtime-disabled, while the user is themselves using
+ * (and ergo have initialised) MPI. In that scenario, we must not touch
+ * MPI, hence why comm_isActive() below is distinct from comm_isMpiInit().
+ */
+
+
+// We must record whether the user owns MPI, so that we do not ever attempt
+// to kill it when gracefully exiting, or due to a validation error
+static bool global_isMpiUserOwned = false;
+
+
+// Guarded since MPI_Comm cannot be exposed when not compiling MPI. This
+// communicator is overridden from NULL either BEFORE or DURING comm_init()
+#if QUEST_COMPILE_MPI
+    static MPI_Comm global_mpiComm = MPI_COMM_NULL;
+#endif
+
+
+bool comm_isActive() {
+#if QUEST_COMPILE_MPI
+
+    // comm_init(), or potentially comm_setMpiComm() before it, will only
+    // ever override mpiComm with non-NULL, indicating active comm. Note
+    // it's possible in principle for mpiComm to later return to NULL, via
+    // comm_end(), with QuEST execution continuing (not presently supported).
+    // If comm_isActive() is true, then it is guaranteed MPI is initialised
+    return global_mpiComm != MPI_COMM_NULL;
+
+    // note it is legal for QuEST distribution to be disabled (and ergo
+    // mpiComm never initialised) even when the user is themselves accessing
+    // MPI, hence this function is semantically distinct from comm_isMpiInit()
+#else
+
+    // QuEST communication is obviously never active if
+    // not even MPI is compiled; though this does not
+    // imply at all the user isn't themselves using MPI!
+    return false;
+
+#endif
+}
+
+
+// Hide MPI_Comm from signatures when MPI is not compiled. Beware that
+// these are not exposed in comm_config.hpp; callers must 'extern' them!
+#if QUEST_COMPILE_MPI
+
+
+MPI_Comm comm_getMpiComm() {
+
+    // illegal to call before communicator has been overridden
+    if (global_mpiComm == MPI_COMM_NULL)
+        error_commMpiCommIsNull();
+
+    return global_mpiComm;
+}
+
+
+bool comm_setMpiComm(MPI_Comm newComm) {
+
+    // illegal to re-set, or set to null
+    if (global_mpiComm != MPI_COMM_NULL)
+        error_commAlreadyHasSetMpiComm();
+    if (newComm == MPI_COMM_NULL)
+        error_commNewMpiCommIsNull();
+
+    // detect bad communicator, and inform validation
+    auto status = MPI_Comm_dup(newComm, &global_mpiComm);
+    return status == MPI_SUCCESS;
+}
+
+
+#endif // QUEST_COMPILE_MPI
+
+
+
 /*
  * MPI ENVIRONMENT MANAGEMENT
  *
- * all of which is safely callable in non-distributed mode
+ * which queries MPI itself (as may be user-activated), rather
+ * than QuEST's (possibly more limited) MPI environment
  */
 
 
@@ -89,64 +172,96 @@ bool comm_isMpiGpuAware() {
 }
 
 
-bool comm_isInit() {
+bool comm_isMpiInit() {
 #if QUEST_COMPILE_MPI
 
     // safely callable before MPI initialisation, but NOT after comm_end()
     int isInit;
     MPI_Initialized(&isInit);
+
+    // when MPI is not initialised, it is guaranteed that QuEST's communicator
+    // is inactive, which we double check here so callers can be absolutely sure
+    if (!isInit && comm_isActive())
+        error_commActiveButMpiNotInit();
+
     return (bool) isInit;
 
 #else
 
     // obviously MPI is never initialised if not even compiled
     return false;
+
 #endif
 }
 
 
+
+/*
+ * QUEST COMMUNICATION MANAGEMENT
+ *
+ * which interacts only with QuEST's MPI environment,
+ * which may be smaller than the user-controlled MPI env
+ */
+
+
 void comm_init(bool userOwnsMpi) {
 #if QUEST_COMPILE_MPI
 
-    // re-assert prior user-validations for robustness
-    if (userOwnsMpi && !comm_isInit())
+    // re-assert prior user-validations for clarity
+    if (userOwnsMpi && !comm_isMpiInit())
         error_commNotInit();
-    if (!userOwnsMpi && comm_isInit())
+    if (!userOwnsMpi && comm_isMpiInit())
         error_commAlreadyInit();
    
     // init MPI only when it's not the user's responsibility
     if (!userOwnsMpi)
         MPI_Init(NULL, NULL);
 
-    // choose communicator only when the user hasn't 
+    // choose communicator only when the user hasn't already
+    // (via comm_setMpiComm, during custom env initialisation)
     if (global_mpiComm == MPI_COMM_NULL)
-        MPI_Comm_dup(MPI_COMM_WORLD, &global_mpiComm);
+        comm_setMpiComm(MPI_COMM_WORLD);
+
+    // remember user ownership, so we avoid later killing user-owned MPI
+    global_isMpiUserOwned = userOwnsMpi;
 
 #endif
 }
 
 
-void comm_end(bool userOwnsMpi) {
+void comm_end() {
 #if QUEST_COMPILE_MPI
 
-    // gracefully permit comm_end() before comm_init(), as input validation can trigger
-    if (!comm_isInit())
-        return;
-
-    // gracefully handle when the communicator is still NULL, because comm_end() may be
-    // triggered by "bad MPI init" validation, during which, the communicator may not yet
-    // have been set. We choose NOT to divert to MPI_COMM_WORLD, which is likely just to
-    // stall at MPI_Barrier, and instead let the user's communicator live on; then crash!
-    if (global_mpiComm == MPI_COMM_NULL)
+    // If QuEST isn't using distribution, regardless of whether the user is using MPI,
+    // then we gracefully exit. We do NOT attempt to end MPI on the user's behalf (as we
+    // may be tempted to do during validation failure to avoid their MPI-crash), because
+    // it's possible/legal that not all processes are participating in this comm_end()
+    // call, in which case MPI_Finalize() could just cause a hang.
+    if (!comm_isActive())
         return;
 
+    // Syncing is not strictly necessary, but it ensures that finalizeQuESTEnv() never
+    // completes on one process while another process is still performing simulation
+    // (though that'd be weird), and so may avoid a silly user benchmarking pitfall
     MPI_Barrier(global_mpiComm);
     MPI_Comm_free(&global_mpiComm);
     
-    // QuEST must finalise MPI if the user does not own it
-    if (!userOwnsMpi)
+    // Do NOT close MPI if the user owns; they may still wish to use it after QuEST!
+    if (!global_isMpiUserOwned)
         MPI_Finalize();
 
+    // Presently, comm_end() is only ever called during QuESTEnv destruction (either
+    // deliberately, or because of failed validation during QuESTEnv initialisation).
+    // This means any comm_*() call hereafter is invalid/illegal and will be prevented
+    // by validation. However, we can imagine a future where distribution gets runtime
+    // disabled while QuEST execution continues (e.g. initQuESTEnv automatically
+    // disabled distribution), and so we must indicate that communication is no longer
+    // active by overwriting comm to NULL. BEWARE that this is "hacky"; we have
+    // updated mpiComm here without MPI_Comm_dup(), but that's fine, because hereafter
+    // MPI will never be used again (illegal to re-init both MPI, and QuEST!)
+    global_mpiComm = MPI_COMM_NULL;
+    global_isMpiUserOwned = false;
+
 #endif
 }
 
@@ -155,21 +270,13 @@ int comm_getRank() {
 #if QUEST_COMPILE_MPI
 
     // if distribution was not runtime enabled (or a validation error was 
-    // triggered), every node (if many MPI processes were launched)
-    // believes it is the root rank
-    if (!comm_isInit())
-        return ROOT_RANK;
-
-    // Consult the (potentially sub-) communicator for rank; if it is still
-    // NULL, as can only validly happen during failed QuESTEnv init validation
-    // (which triggers root-only error printing and ergo this function), we
-    // fall back to every process believing it is root and so attempting to
-    // print. This safely avoids consulting a potentially bugged MPI communicator
-    // and losing the message. We once tried to fallback to MPI_COMM_WORLD here,
-    // to avoid duplicate output, but it is not worth the risk of msg loss!
-    if (global_mpiComm == MPI_COMM_NULL)
+    // triggered during distributed initialisation), every process believes
+    // it is the root rank; this may lead to unavoidable error msg spam!
+    if (!comm_isActive())
         return ROOT_RANK;
 
+    // obtain the process rank within the QuEST communicator, which can
+    // differ from the global MPI process rank when users own MPI
     int rank;
     MPI_Comm_rank(global_mpiComm, &rank);
     return rank;
@@ -178,6 +285,7 @@ int comm_getRank() {
 
     // if MPI isn't compiled, we're definitely non-distributed; return main rank 
     return ROOT_RANK;
+
 #endif
 }
 
@@ -194,19 +302,25 @@ int comm_getNumNodes() {
 #if QUEST_COMPILE_MPI
 
     // if distribution was not runtime enabled (or a validation error was 
-    // triggered), every node (if many MPI processes were launched)
-    // believes it is the one and only node
-    if (!comm_isInit())
+    // triggered during distributed initialisation), every process is told
+    // it is the one and only node; this may lead to error msg spam, but
+    // appears unavoidable!
+    if (!comm_isActive())
         return 1;
 
+    // obtain the number of processes within the QuEST communicator, which
+    // can be smaller than global MPI process count when users own MPI
     int numNodes;
     MPI_Comm_size(global_mpiComm, &numNodes);
     return numNodes;
 
 #else
 
-    // if MPI isn't compiled, we're definitely non-distributed; return single node
+    // if MPI isn't compiled, QuEST is definitely non-distributed and
+    // each process only knows itself (though users may own MPI and
+    // actually have many processes; that's none of our business!)
     return 1;
+
 #endif
 }
 
@@ -214,62 +328,13 @@ int comm_getNumNodes() {
 void comm_sync() {
 #if QUEST_COMPILE_MPI
 
-    // gracefully handle when not distributed, needed by e.g. pre-MPI-setup validation 
-    if (!comm_isInit())
-        return;
-
-    // gracefully handle when the communicator is still NULL, because comm_sync() is
-    // triggered by "bad MPI init" validation (during the error message printing)
-    // during which, the communicator may not yet have been overriden
-    if (global_mpiComm == MPI_COMM_NULL)
+    // gracefully handle when not distributed, needed by e.g. pre-MPI-setup validation
+    if (!comm_isActive())
         return;
 
     MPI_Barrier(global_mpiComm);
-#endif
-}
-
-
-
-/*
- * MPI COMMUNICATOR MANAGEMENT
- *
- * some of which requires exposing MPI_Comm in external-facing signatures.
- * In lieu of leaking these into comm_config.hpp, callers must extern them.
- */
-
-bool comm_isMpiCommSet() {
-#if QUEST_COMPILE_MPI
 
-    // once comm_init() or comm_setMpiComm() overwrite
-    // the communicator, is can never return to NULL  
-    return (global_mpiComm != MPI_COMM_NULL);
-# else
-    return false;
 #endif
-}
-
-#if QUEST_COMPILE_MPI
 
-MPI_Comm comm_getMpiComm() {
-
-    if (global_mpiComm == MPI_COMM_NULL)
-        error_commMpiCommIsNull();
-
-    return global_mpiComm;
+    // do nothing at all when MPI is not compiled (user owned MPI processes go unsynced)
 }
-
-bool comm_setMpiComm(MPI_Comm newComm) {
-
-    // this is called prior to QuEST initialisation,
-    // and merely seeks to overwrite global_mpiComm 
-
-    if (global_mpiComm != MPI_COMM_NULL)
-        error_commAlreadyHasSetMpiComm();
-    if (newComm == MPI_COMM_NULL)
-        error_commMpiCommIsNull();
-
-    auto status = MPI_Comm_dup(newComm, &global_mpiComm);
-    return status == MPI_SUCCESS;
-}
-
-#endif // QUEST_COMPILE_MPI
diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp
index 8441dbc23..83e0aced4 100644
--- a/quest/src/comm/comm_config.hpp
+++ b/quest/src/comm/comm_config.hpp
@@ -12,23 +12,24 @@
 
 constexpr int ROOT_RANK = 0;
 
+// queries of MPI's global/general status (when visible)
 bool comm_isMpiCompiled();
 bool comm_isMpiSubCommCompiled();
 bool comm_isMpiGpuAware();
+bool comm_isMpiInit();
 
+// control of QuEST's (possibly more limited) MPI env
+bool comm_isActive();
 void comm_init(bool userOwnsMpi);
-void comm_end(bool userOwnsMpi);
+void comm_end();
 void comm_sync();
 
+// queries of QuEST's (possibly more limited) MPI env
 int comm_getRank();
 int comm_getNumNodes();
-
-bool comm_isInit();
 bool comm_isRootNode();
 bool comm_isRootNode(int rank);
 
-bool comm_isMpiCommSet();
-
 // Signatures containing MPI types which callers must extern:
 // extern MPI_Comm comm_getMpiComm()
 // extern bool comm_setMpiComm(MPI_Comm newComm)
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 862136a9c..63e44e71e 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -188,7 +188,17 @@ void error_commAlreadyHasSetMpiComm() {
 
 void error_commMpiCommIsNull() {
 
-    raiseInternalError("The MPI communicator was queried (or set) but was unexpectedly MPI_COMM_NULL (or set to be).");
+    raiseInternalError("The MPI communicator was queried but was unexpectedly MPI_COMM_NULL.");
+}
+
+void error_commNewMpiCommIsNull() {
+
+    raiseInternalError("The MPI communicator was attemptedly set to MPI_COMM_NULL, which validation should have prior caught.");
+}
+
+void error_commActiveButMpiNotInit() {
+
+    raiseInternalError("QuEST believed communication was active, but MPI_Init reported MPI was not initialised.");
 }
 
 void assert_commBoundsAreValid(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps) {
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 33cc0661d..33cc182c7 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -95,6 +95,10 @@ void error_commAlreadyHasSetMpiComm();
 
 void error_commMpiCommIsNull();
 
+void error_commNewMpiCommIsNull();
+
+void error_commActiveButMpiNotInit();
+
 void assert_commBoundsAreValid(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps);
 
 void assert_commPayloadIsPowerOf2(qindex numAmps);
diff --git a/quest/src/core/randomiser.cpp b/quest/src/core/randomiser.cpp
index 65c6da4eb..7b35a29fc 100644
--- a/quest/src/core/randomiser.cpp
+++ b/quest/src/core/randomiser.cpp
@@ -66,14 +66,14 @@ void rand_setSeeds(vector<unsigned> seeds) {
 
     // all nodes learn root node's #seeds
     unsigned numRootSeeds = seeds.size();
-    if (comm_isInit())
+    if (comm_isActive())
         comm_broadcastUnsignedsFromRoot(&numRootSeeds, 1);
 
     // all nodes ensure they have space to receive root node's seeds
     seeds.resize(numRootSeeds);
     
     // all nodes receive root seeds
-    if (comm_isInit())
+    if (comm_isActive())
         comm_broadcastUnsignedsFromRoot(seeds.data(), seeds.size());
 
     // all nodes remember seeds (in case user wishes to later recall them)
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index e1df0af76..0f9cecc97 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -1167,13 +1167,14 @@ void default_inputErrorHandler(const char* func, const char* msg) {
         + "Exiting...\n");
 
     // force a synch because otherwise non-main nodes may exit before print, and MPI
-    // will then attempt to instantly abort all nodes, losing the error message.
+    // will then attempt to instantly abort all nodes, losing the error message
     comm_sync();
 
-    // finalise MPI before error-exit to avoid scaring user with giant MPI error message;
-    // we always "take ownership" of MPI here since we're about to kill the whole program
-    if (comm_isInit())
-        comm_end(/*userOwnsMpi=*/false);
+    // finalise QuEST-owned MPI before error-exit to avoid scaring user with giant MPI crash
+    // message. note user-owned MPI is NOT killed because it's possible only SOME processes
+    // reach here, and attempting to sync/kill them would result in an MPI hang/crash anyway
+    if (comm_isActive())
+        comm_end(); // keeps user-owned MPI alive
 
     // simply exit, interrupting any other process (potentially leaking)
     exit(EXIT_FAILURE);
@@ -1355,7 +1356,7 @@ void assertAllNodesAgreeThat(bool valid, string msg, tokenSubs vars, const char*
     // when performing validation that may be non-uniform between nodes. For
     // example, mallocs may succeed on one node but fail on another due to
     // inhomogeneous loads.
-    if (comm_isInit())
+    if (comm_isActive())
         valid = comm_isTrueOnAllNodes(valid);
 
     // prepare error message only if validation will fail
@@ -1499,28 +1500,21 @@ void validate_gpuIsCuQuantumCompatible(const char* caller) {
 
 void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* caller) {
 
-    if (!global_isValidationEnabled)
-        return;
-
     // Validation prior to this function confirms init(Custom*)QuESTEnv is only ever called
     // once, but we must additionally confirm the user has interacted with MPI legally
 
-    bool isMpiInit = comm_isInit();
+    if (!global_isValidationEnabled)
+        return;
+
+    // We consult whether MPI itself has been initialised, NOT whether QuEST is using it
+    bool isMpiInit = comm_isMpiInit();
 
-    // (A) If the user does not declare ownership of MPI, they are forbidden to initialise it
+    // (A) If the user does not declare ownership of MPI, they are forbidden to initialise it,
+    //     even when they are not distributing QuEST (i.e. useDistrib=0), just for clarity!
     if (!userOwnsMpi)
         assertThat(!isMpiInit, report::QUEST_OWNED_MPI_WAS_PRE_INIT, caller);
 
-    // (B) If QuEST is instructed not to use distribution, we must demand the user is not
-    // using MPI, because we internally consult comm_isInit() to detect QuEST distribution
-    // in many functions, and that will give a false positive when the user inits MPI directly. 
-    if (!useDistrib)
-        assertThat(!isMpiInit, report::QUEST_IS_NON_DISTRIBUTED_BUT_MPI_WAS_INIT, caller);
-
-    // TODO: we can relax above, permitting the user to play with MPI directly while 
-    // disabling it for QuEST, by replacing internal comm_isInit() with e.g. env_isDistributed()
-
-    // (C) If QuEST will use MPI owned by the user, the user must have pre-initialised it
+    // (B) If QuEST will use MPI owned by the user, the user must have pre-initialised it
     if (useDistrib && userOwnsMpi)
         assertThat(isMpiInit, report::USER_OWNED_MPI_WAS_NOT_INIT, caller);
     
@@ -1528,10 +1522,10 @@ void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* calle
     //     useDistrib=0, userOwnsMpi=0, isMpiInit=0 (legal: nobody wants MPI)
     // (A) useDistrib=0, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership)
     //     useDistrib=0, userOwnsMpi=1, isMpiInit=0 (legal: user owns MPI but does nothing!)
-    // (B) useDistrib=0, userOwnsMpi=1, isMpiInit=1 (illegal: comm_isInit() limitation as above)
+    //     useDistrib=0, userOwnsMpi=1, isMpiInit=1 (legal: user owns MPI, QuEST won't use it)
     //     useDistrib=1, userOwnsMpi=0, isMpiInit=0 (legal: QuEST will init MPI)
     // (A) useDistrib=1, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership)
-    // (C) useDistrib=1, userOwnsMpi=1, isMpiInit=0 (illegal: user has reponsibility to pre-init)
+    // (B) useDistrib=1, userOwnsMpi=1, isMpiInit=0 (illegal: user has responsibility to pre-init)
     //     useDistrib=1, userOwnsMpi=1, isMpiInit=1 (legal: user fulfilled responsibility to pre-init)
 }
 
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 5bf4b257f..4e03217e5 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -395,7 +395,7 @@ bool gpu_areAnyNodesBoundToSameGpu() {
 #if QUEST_COMPILE_CUDA
     assert_gpuHasBeenBound(hasGpuBeenBound);
 
-    if (!comm_isInit())
+    if (!comm_isActive())
         return false;
 
     // obtain bound GPU's UUID; a unique identifier 16-char identifier

From 752e89f20623973e6504b2318c58caa2bf126dbc Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sat, 30 May 2026 18:20:06 -0400
Subject: [PATCH 41/58] patch bug where user-MPI was finalised

as found by Codex! All hail our new overlords
---
 quest/src/api/subcommunicator.cpp |  4 ++--
 quest/src/comm/comm_config.cpp    | 15 +++++++++------
 quest/src/comm/comm_config.hpp    |  2 +-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/quest/src/api/subcommunicator.cpp b/quest/src/api/subcommunicator.cpp
index 6560f2f24..c497b7402 100644
--- a/quest/src/api/subcommunicator.cpp
+++ b/quest/src/api/subcommunicator.cpp
@@ -13,7 +13,7 @@
 // TODO:
 // We must resolve this communicator function which contains an MPI type
 // and ergo should not be leaked outside comm_config.cpp. For now, we cheat! 
-extern bool comm_setMpiComm(MPI_Comm newComm);
+extern bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi);
 
 
 // TODO:
@@ -38,7 +38,7 @@ void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useM
     // if a user illegally re-calls this function, which will be subsequently
     // caught by the validation in validateAndInitCustomQuESTEnv() below
     if (!comm_isActive()) {
-        bool success = comm_setMpiComm(userQuestComm);
+        bool success = comm_setMpiComm(userQuestComm, userOwnsMpi);
         validate_mpiSubCommSetSucceeded(success, __func__);
     }
 
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index acd1e223c..8b7a72ff5 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -117,7 +117,7 @@ MPI_Comm comm_getMpiComm() {
 }
 
 
-bool comm_setMpiComm(MPI_Comm newComm) {
+bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi) {
 
     // illegal to re-set, or set to null
     if (global_mpiComm != MPI_COMM_NULL)
@@ -127,7 +127,13 @@ bool comm_setMpiComm(MPI_Comm newComm) {
 
     // detect bad communicator, and inform validation
     auto status = MPI_Comm_dup(newComm, &global_mpiComm);
-    return status == MPI_SUCCESS;
+    if (status != MPI_SUCCESS)
+        return false;
+
+    // record ownership as soon as QuEST communication becomes active, so
+    // validation errors during env initialisation never kill user-owned MPI
+    global_isMpiUserOwned = userOwnsMpi;
+    return true;
 }
 
 
@@ -220,10 +226,7 @@ void comm_init(bool userOwnsMpi) {
     // choose communicator only when the user hasn't already
     // (via comm_setMpiComm, during custom env initialisation)
     if (global_mpiComm == MPI_COMM_NULL)
-        comm_setMpiComm(MPI_COMM_WORLD);
-
-    // remember user ownership, so we avoid later killing user-owned MPI
-    global_isMpiUserOwned = userOwnsMpi;
+        comm_setMpiComm(MPI_COMM_WORLD, userOwnsMpi);
 
 #endif
 }
diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp
index 83e0aced4..826ebdf1c 100644
--- a/quest/src/comm/comm_config.hpp
+++ b/quest/src/comm/comm_config.hpp
@@ -32,6 +32,6 @@ bool comm_isRootNode(int rank);
 
 // Signatures containing MPI types which callers must extern:
 // extern MPI_Comm comm_getMpiComm()
-// extern bool comm_setMpiComm(MPI_Comm newComm)
+// extern bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi)
 
 #endif // COMM_CONFIG_HPP

From 53d3f28d07a48daa33287eaebcf0a4b3796ac922 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 31 May 2026 01:24:44 -0400
Subject: [PATCH 42/58] moved new custom-env funcs to experimental.h

---
 quest/include/environment.h                   |  6 --
 quest/include/experimental.h                  | 75 +++++++++++++++++++
 quest/include/quest.h                         |  2 +-
 quest/include/subcommunicator.h               | 31 --------
 quest/src/api/CMakeLists.txt                  |  2 +-
 quest/src/api/environment.cpp                 |  4 -
 .../{subcommunicator.cpp => experimental.cpp} | 64 +++++++++++++---
 7 files changed, 129 insertions(+), 55 deletions(-)
 create mode 100644 quest/include/experimental.h
 delete mode 100644 quest/include/subcommunicator.h
 rename quest/src/api/{subcommunicator.cpp => experimental.cpp} (51%)

diff --git a/quest/include/environment.h b/quest/include/environment.h
index a584192d7..c3d867671 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -64,12 +64,6 @@ void initQuESTEnv();
  */
 void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread);
 
-/** @notyetdoced
- *  Advanced initialiser which lets the user positively declare that they take responsibility for MPI.
- *  This means we assume they have called MPI_Init, and that they will call MPI_Finalize.
- */
-void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread);
-
 /// @notyetdoced
 void finalizeQuESTEnv();
 
diff --git a/quest/include/experimental.h b/quest/include/experimental.h
new file mode 100644
index 000000000..2fabdc34f
--- /dev/null
+++ b/quest/include/experimental.h
@@ -0,0 +1,75 @@
+/** @file
+ * Experimental functions which are liable to
+ * API breaks within QuEST minor version releases.
+ * Some optional functions require compiling this
+ * file against MPI, despite being outside of /comm/, 
+ * and so require opt-in macros (QUEST_COMPILE_SUBCOMM)
+ * 
+ * @author Oliver Brown
+ * @author Tyson Jones (formatting)
+ * 
+ * @defgroup experimental Experimental
+ * @ingroup api
+ * @brief Experimental functions with tentative APIs
+ * @{
+ */
+
+#ifndef EXPERIMENTAL_H
+#define EXPERIMENTAL_H
+
+#include "quest/include/config.h"
+
+#if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI
+    #error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false."
+#endif
+
+#if QUEST_COMPILE_SUBCOMM
+    #include <mpi.h>
+#endif
+
+// enable invocation by both C and C++ binaries
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** @notyetdoced
+ *
+ *  Advanced initialiser which lets the user positively declare that they take responsibility for MPI.
+ *  This means we assume they have called MPI_Init, and that they will call MPI_Finalize.
+ * 
+ * @author Oliver Brown
+ */
+void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread);
+
+
+#if QUEST_COMPILE_SUBCOMM
+
+/** @notyetdoced
+ * 
+ *  Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use.
+ *  Use of this initialiser implies userOwnsMpi = true, (exposed by initCustomMpiQuESTEnv) and 
+ *  therefore that they have already initialised MPI, and they will call MPI_Finalize at the 
+ *  appropriate time.
+ *
+ *  The user-provided MPI communicator undergoes the same validation procedure as any that QuEST
+ *  would use, and so must contain a power-of-2 number of processes.
+ * 
+ * This function is only compiled and exposed when macro QUEST_COMPILE_SUBCOMM is 1, as is
+ * defined when providing CMake option QUEST_ENABLE_SUBCOMM during building.
+ *
+ * @author Oliver Brown
+ */
+void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread);
+
+#endif // QUEST_COMPILE_SUBCOMM
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+#endif // EXPERIMENTAL_H
+
+/** @} */ // (end file-wide doxygen defgroup)
diff --git a/quest/include/quest.h b/quest/include/quest.h
index 16f8e9b49..da1c778e2 100644
--- a/quest/include/quest.h
+++ b/quest/include/quest.h
@@ -38,6 +38,7 @@
 #include "quest/include/debug.h"
 #include "quest/include/decoherence.h"
 #include "quest/include/environment.h"
+#include "quest/include/experimental.h"
 #include "quest/include/trotterisation.h"
 #include "quest/include/initialisations.h"
 #include "quest/include/channels.h"
@@ -45,7 +46,6 @@
 #include "quest/include/operations.h"
 #include "quest/include/paulis.h"
 #include "quest/include/qureg.h"
-#include "quest/include/subcommunicator.h"
 #include "quest/include/matrices.h"
 #include "quest/include/wrappers.h"
 
diff --git a/quest/include/subcommunicator.h b/quest/include/subcommunicator.h
deleted file mode 100644
index 8854404d6..000000000
--- a/quest/include/subcommunicator.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef SUBCOMMUNICATOR_H
-#define SUBCOMMUNICATOR_H
-
-#include "quest/include/config.h" 
-
-#if QUEST_COMPILE_MPI && QUEST_COMPILE_SUBCOMM
-
-#include <mpi.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/** @notyetdoced
- *  Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use.
- *  Use of this initialiser implies userOwnsMpi = true, (exposed by initCustomMpiQuESTEnv) and 
- *  therefore that they have already initialised MPI, and they will call MPI_Finalize at the 
- *  appropriate time.
- *
- *  The user-provided MPI communicator undergoes the same validation procedure as any that QuEST
- *  would use, and so must contain a power-of-2 number of processes.
- */
-void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
-
-#endif
diff --git a/quest/src/api/CMakeLists.txt b/quest/src/api/CMakeLists.txt
index 43b61df7d..7f90dcf17 100644
--- a/quest/src/api/CMakeLists.txt
+++ b/quest/src/api/CMakeLists.txt
@@ -5,6 +5,7 @@ target_sources(QuEST
   debug.cpp
   decoherence.cpp
   environment.cpp
+  experimental.cpp
   initialisations.cpp
   matrices.cpp
   modes.cpp
@@ -12,7 +13,6 @@ target_sources(QuEST
   operations.cpp
   paulis.cpp
   qureg.cpp
-  subcommunicator.cpp
   trotterisation.cpp
   types.cpp
 )
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 91a274fd9..e7db211ff 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -412,10 +412,6 @@ void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread) {
 }
 
 
-void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread) {
-    validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
-}
-
 void initQuESTEnv() {
 
     const bool userOwnsMpi = false;
diff --git a/quest/src/api/subcommunicator.cpp b/quest/src/api/experimental.cpp
similarity index 51%
rename from quest/src/api/subcommunicator.cpp
rename to quest/src/api/experimental.cpp
index c497b7402..1ad6fdb42 100644
--- a/quest/src/api/subcommunicator.cpp
+++ b/quest/src/api/experimental.cpp
@@ -1,29 +1,65 @@
+/** @file
+ * Experimental functions which are liable to
+ * API breaks within QuEST minor version releases.
+ * Some optional functions require compiling this
+ * file against MPI, despite being outside of /comm/, 
+ * and so require opt-in macros (QUEST_COMPILE_SUBCOMM)
+ * 
+ * @author Oliver Brown
+ */
+
 #include "quest/include/config.h"
 #include "quest/include/environment.h"
-#include "quest/include/subcommunicator.h"
 
 #include "quest/src/core/validation.hpp"
 #include "quest/src/comm/comm_config.hpp"
 
-#if QUEST_COMPILE_MPI && QUEST_COMPILE_SUBCOMM
+#if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI
+    #error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false."
+#endif
+
+#if QUEST_COMPILE_SUBCOMM
+    #include <mpi.h>
+#endif
 
-#include <mpi.h>
 
 
-// TODO:
-// We must resolve this communicator function which contains an MPI type
-// and ergo should not be leaked outside comm_config.cpp. For now, we cheat! 
-extern bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi);
+/*
+ * EXTERNAL FUNCTIONS
+ *
+ * which we here regretfully 'extern' because we are either
+ * unsure which header should expose them, or because they
+ * contain deployment-specific types (like MPI_Comm) which
+ * we do not wish to expose within internal headers 
+ */
 
 
-// TODO:
-// We must resolve this inner function of QuEST initialisation, but which is
-// private to api/environment.cpp, and so cannot be exposed in the user-facing
-// include/environment.hpp. Grr! For now, we here just cheekily extern it c:
 extern void validateAndInitCustomQuESTEnv(
     int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread, const char* caller);
 
 
+#if QUEST_COMPILE_SUBCOMM // hide MPI_Comm
+    extern bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi);
+#endif
+
+
+
+/*
+ * API FUNCTIONS
+ */
+
+
+// enable invocation by both C and C++ binaries
+extern "C" {
+
+
+void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread) {
+    validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
+}
+
+
+#if QUEST_COMPILE_SUBCOMM // hide MPI_Comm
+
 void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useMultithread) {
 
     // useDistrib and userOwnsMpi are implied by the user of this initialiser
@@ -46,4 +82,8 @@ void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useM
     validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
 }
 
-#endif
+#endif // QUEST_COMPILE_SUBCOMM
+
+
+// end de-mangler
+}

From dc2cf6c7673b12e6b9427b229011c82e84b77937 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 31 May 2026 18:49:17 -0400
Subject: [PATCH 43/58] moved numTBP API to experimental.h

---
 quest/include/environment.h    |  8 --------
 quest/include/experimental.h   | 15 +++++++++++++--
 quest/src/api/environment.cpp  | 27 ++++++---------------------
 quest/src/api/experimental.cpp | 19 +++++++++++++++++--
 4 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/quest/include/environment.h b/quest/include/environment.h
index ef86977eb..c3d867671 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -86,14 +86,6 @@ int isQuESTEnvInit();
 QuESTEnv getQuESTEnv();
 
 
-/** @notyetdoced
- * GPU thread per block control
- * This is somehow probably the best pre-existing place for this. It only really applies to GPU, because for
- * OpenMP the user can just export OMP_NUM_THREADS or call omp_set_num_threads.
- */
-int getQuESTNumGpuThreadsPerBlock();
-void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock);
-
 
 // end de-mangler
 #ifdef __cplusplus
diff --git a/quest/include/experimental.h b/quest/include/experimental.h
index 2fabdc34f..4994610b4 100644
--- a/quest/include/experimental.h
+++ b/quest/include/experimental.h
@@ -44,7 +44,6 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
 
 
 #if QUEST_COMPILE_SUBCOMM
-
 /** @notyetdoced
  * 
  *  Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use.
@@ -61,10 +60,22 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
  * @author Oliver Brown
  */
 void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread);
-
 #endif // QUEST_COMPILE_SUBCOMM
 
 
+/** @notyetdoced
+ * 
+ * GPU thread per block control
+ * This is somehow probably the best pre-existing place for this. It only really applies to GPU, because for
+ * OpenMP the user can just export OMP_NUM_THREADS or call omp_set_num_threads.
+ */
+int getQuESTNumGpuThreadsPerBlock();
+
+
+/// notyetdoced
+void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock);
+
+
 // end de-mangler
 #ifdef __cplusplus
 }
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index f6b3708ab..16698c88f 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -209,12 +209,12 @@ void printDeploymentInfo() {
 
     print_table(
         "deployment", {
-        {"isOmpEnabled",        globalEnvPtr->isMultithreaded},
-        {"isMpiEnabled",        globalEnvPtr->isDistributed},
-        {"isMpiUserOwned",      globalEnvPtr->isMpiUserOwned},
-        {"isGpuEnabled",        globalEnvPtr->isGpuAccelerated},
-        {"isCuQuantumEnabled",  globalEnvPtr->isCuQuantumEnabled},
-        {"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
+        {"isOmpEnabled",        global_envPtr->isMultithreaded},
+        {"isMpiEnabled",        global_envPtr->isDistributed},
+        {"isMpiUserOwned",      global_envPtr->isMpiUserOwned},
+        {"isGpuEnabled",        global_envPtr->isGpuAccelerated},
+        {"isCuQuantumEnabled",  global_envPtr->isCuQuantumEnabled},
+        {"isGpuSharingEnabled", global_envPtr->isGpuSharingEnabled},
     });
 }
 
@@ -522,20 +522,5 @@ void getQuESTEnvironmentString(char str[200]) {
 }
 
 
-int getQuESTNumGpuThreadsPerBlock() {
-    validate_envIsInit(__func__);
-    
-    return gpu_getNumThreadsPerBlock();
-}
-
-void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock) {
-    validate_envIsInit(__func__);
-
-    // just rely on the internal function to throw an error if there's no GPU support compiled
-    // or if newThreadsPerBlock is not a multiple of 32 (NVIDIA) or 64 (AMD)
-    gpu_setNumThreadsPerBlock(newThreadsPerBlock);
-    return;
-}
-
 // end de-mangler
 }
diff --git a/quest/src/api/experimental.cpp b/quest/src/api/experimental.cpp
index 1ad6fdb42..dc8a050a9 100644
--- a/quest/src/api/experimental.cpp
+++ b/quest/src/api/experimental.cpp
@@ -13,6 +13,7 @@
 
 #include "quest/src/core/validation.hpp"
 #include "quest/src/comm/comm_config.hpp"
+#include "quest/src/gpu/gpu_config.hpp"
 
 #if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI
     #error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false."
@@ -59,7 +60,6 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
 
 
 #if QUEST_COMPILE_SUBCOMM // hide MPI_Comm
-
 void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useMultithread) {
 
     // useDistrib and userOwnsMpi are implied by the user of this initialiser
@@ -81,9 +81,24 @@ void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useM
     // perform remaining validation (some is harmlessly repeated) and init QuEST env
     validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
 }
-
 #endif // QUEST_COMPILE_SUBCOMM
 
 
+int getQuESTNumGpuThreadsPerBlock() {
+    validate_envIsInit(__func__);
+    
+    return gpu_getNumThreadsPerBlock();
+}
+
+
+void setQuESTNumGpuThreadsPerBlock(int newThreadsPerBlock) {
+    validate_envIsInit(__func__);
+
+    // TODO: validate
+
+    gpu_setNumThreadsPerBlock(newThreadsPerBlock);
+}
+
+
 // end de-mangler
 }

From fe5aaf88e863f71a57b403d90d28e283270511e9 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 31 May 2026 18:59:55 -0400
Subject: [PATCH 44/58] Flag register-spill risk when increasing TBP

---
 quest/src/gpu/gpu_kernels.cuh     | 6 +++++-
 quest/src/gpu/gpu_subroutines.cpp | 9 ++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 16af56621..b6954f701 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -301,7 +301,11 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr(
     // must be strictly through compile-time-known indices, otherwise it will auto-
     // spill to local memory). Hence, this _subA() function is not a subroutine 
     // despite some logic being common to non-compile-time _subB(), and hence
-    // why the loops below are explicitly compile-time unrolled
+    // why the loops below are explicitly compile-time unrolled. Beware that when
+    // numThreadsPerBlock is increased from 128, this kernel will still behave
+    // correctly, but privateCache below will spill over into local memory at a
+    // performance penalty for NumTargs <= 5, with spillage occurring for fewer
+    // NumTargs as numThreadsPerBlock increases.
     REGISTER gpu_qcomp privateCache[1 << NumTargs];
 
     // we know NumTargs <= 5, though NumCtrls is permitted anything (including -1)
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index b994f46e0..9b8e819b5 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -462,9 +462,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co
     if constexpr (NumTargs != -1) {
 
         // when NumTargs <= 5, each thread has a private array stored in the registers,
-        // enabling rapid IO. Given numThreadsPerBlock = 128, the maximum size of 
-        // this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared
-        // memory capacity, but does NOT exceed maximum register capacity.
+        // enabling rapid IO. When using the default numThreadsPerBlock = 128, the max
+        // size of this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared
+        // memory capacity, but does NOT exceed maximum register capacity. When the user
+        // increases numThreadsPerBlock, the thread-private array in the below kernel
+        // will spill from registers into local memory, degrading performance, but
+        // behaving correctly and stably.
 
         /// @todo
         /// We should really check the above claims, otherwise the thread-private arrays could

From 3f550a770562e6594d3dd490328a50cfd026d266 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 31 May 2026 23:07:31 -0400
Subject: [PATCH 45/58] add numTBP validation

(existing checks were internal errors, and incomplete)
---
 quest/include/experimental.h   | 12 ++++---
 quest/src/api/environment.cpp  |  3 ++
 quest/src/api/experimental.cpp |  4 ++-
 quest/src/core/errors.cpp      |  7 ++--
 quest/src/core/errors.hpp      |  2 ++
 quest/src/core/validation.cpp  | 66 ++++++++++++++++++++++++++++++++++
 quest/src/core/validation.hpp  |  2 ++
 quest/src/gpu/gpu_config.cpp   | 46 ++++++++++++++++--------
 quest/src/gpu/gpu_config.hpp   | 14 +++++++-
 9 files changed, 131 insertions(+), 25 deletions(-)

diff --git a/quest/include/experimental.h b/quest/include/experimental.h
index 4994610b4..fa26a7521 100644
--- a/quest/include/experimental.h
+++ b/quest/include/experimental.h
@@ -65,15 +65,17 @@ void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMulti
 
 /** @notyetdoced
  * 
- * GPU thread per block control
- * This is somehow probably the best pre-existing place for this. It only really applies to GPU, because for
- * OpenMP the user can just export OMP_NUM_THREADS or call omp_set_num_threads.
+ * @author Oliver Brown
  */
 int getQuESTNumGpuThreadsPerBlock();
 
 
-/// notyetdoced
-void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock);
+/** @notyetdoced
+ * 
+ * @author Oliver Brown
+ * @author Tyson Jones (input validation)
+ */
+void setQuESTNumGpuThreadsPerBlock(int newThreadsPerBlock);
 
 
 // end de-mangler
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 16698c88f..2de1a808e 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -131,6 +131,9 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     /// should we warn here if each machine contains
     /// more GPUs than deployed MPI-processes (some GPUs idle)?
 
+    // validate the initial numTBP is valid (we will change this to an env-var subsequently)
+    validate_numGpuThreadsPerBlock(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK, useGpuAccel, caller);
+
     // cuQuantum is always used in GPU-accelerated envs when available
     bool useCuQuantum = useGpuAccel && gpu_isCuQuantumCompiled();
     if (useCuQuantum) {
diff --git a/quest/src/api/experimental.cpp b/quest/src/api/experimental.cpp
index dc8a050a9..f72ecb321 100644
--- a/quest/src/api/experimental.cpp
+++ b/quest/src/api/experimental.cpp
@@ -94,7 +94,9 @@ int getQuESTNumGpuThreadsPerBlock() {
 void setQuESTNumGpuThreadsPerBlock(int newThreadsPerBlock) {
     validate_envIsInit(__func__);
 
-    // TODO: validate
+    // validation messages and queries depend upon GPU usage
+    bool gpuIsActive = getQuESTEnv().isGpuAccelerated;
+    validate_numGpuThreadsPerBlock(newThreadsPerBlock, gpuIsActive, __func__);
 
     gpu_setNumThreadsPerBlock(newThreadsPerBlock);
 }
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 7668ff403..f13579dc3 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -661,9 +661,10 @@ void error_gpuDenseMatrixConjugatedAndTransposed() {
     raiseInternalError("The GPU + cuQuantum implementation of anyCtrlAnyTargDenseMatr() assumes that at most one of template arguments ApplyConj and ApplyTransp is true, though this was violated.");
 }
 
-void error_gpuBadNumThreadsPerBlock() {
-
-    raiseInternalError("The number of threads per block must be a multiple of 32 on NVIDIA GPUs or a multiple of 64 on AMD GPUs.");
+void assert_gpuNumThreadsPerBlockIsWarpDivisible(int numThreadsPerBlock) {
+    int warpSize = gpu_isHipCompiled()? HIP_WARP_SIZE : CUDA_WARP_SIZE; // warp width of the compiled backend
+    if (numThreadsPerBlock <= 0 || numThreadsPerBlock % warpSize != 0) // non-positive values are also illegal, as the message states
+        raiseInternalError("The number of threads per block was not a positive multiple of the platform warp size (32 for NVIDIA, 64 for AMD).");
 }
 
 void assert_quregIsGpuAccelerated(Qureg qureg) {
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 56a5aaa40..8973f7856 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -261,6 +261,8 @@ void assert_gpuIsAccessible();
 
 void assert_gpuHasBeenBound(bool isBound);
 
+void assert_gpuNumThreadsPerBlockIsWarpDivisible(int numThreadsPerBlock);
+
 void assert_quregIsGpuAccelerated(Qureg qureg);
 
 void assert_mixQuregTempGpuAllocSucceeded(qcomp* gpuPtr);
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 0f9cecc97..ec1d6a2e0 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -159,6 +159,29 @@ namespace report {
     string INVALID_REPORTED_PAULI_STR_STYLE_FLAG =
         "Given an unrecognised style flag (${FLAG}). Legal flags are 0 and 1.";
 
+    // substrings re-used below
+    string _invalid_num_tpb_prefix =
+        "Given an invalid number of threads per GPU block (possibly specified by environment variable) of ${NUM_TPB}.";
+    string _num_tpb_ineffectual_suffix =
+        "Note GPU acceleration is not active so this parameter has no effect anyway.";
+    string _num_tpb_warp_indivisible_infix =
+        "Number does not divide evenly into the warp size of ${CUDA_WARP_SIZE} (NVIDIA GPUs) or ${HIP_WARP_SIZE} (AMD GPUs).";
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE =
+        _invalid_num_tpb_prefix + " Number must be positive.";
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE_BUT_GPU_NOT_ACTIVE_ANYWAY =
+        _invalid_num_tpb_prefix + " Number must be positive. " + _num_tpb_ineffectual_suffix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_indivisible_infix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE_BUT_GPU_NOT_AVAILABLE_ANYWAY =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_indivisible_infix + " " + _num_tpb_ineffectual_suffix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_EXCEEDS_HARDWARE_MAX =
+        _invalid_num_tpb_prefix + " Exceeds the hardware-imposed maximum of ${MAX_TPB}.";
+
 
     /*
      * QUREG CREATION
@@ -1643,6 +1666,49 @@ void validate_reportedPauliStrStyleFlag(int flag, const char* caller) {
     assertThat(flag==0 || flag==1, report::INVALID_REPORTED_PAULI_STR_STYLE_FLAG, {{"${FLAG}",flag}}, caller);
 }
 
+void validate_numGpuThreadsPerBlock(int numTPB, bool isGpuActive, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    // var 'isGpuActive' indicates that the GPU backend is compiled, a physical
+    // GPU is available, AND that the QuESTEnv has GPU-acceleration enabled, i.e.
+    // isGpuActive = gpu_isGpuCompiled() && gpu_isGpuAvailable() && env.isGpuAccelerated,
+    // though is established before QuESTEnv initialisation has completed.
+
+    // validate numTPB > 0 with an error message that points out TPB may be redundant
+    tokenSubs vars = {{"${NUM_TPB}", numTPB}};
+    auto errorMsg = isGpuActive? 
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE :
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE_BUT_GPU_NOT_ACTIVE_ANYWAY;
+    assertThat(numTPB > 0, errorMsg, vars, caller);
+
+    // prepare to validate TPB is warp-divisible, again pointing out redundancy...
+    vars["${CUDA_WARP_SIZE}"] = CUDA_WARP_SIZE;
+    vars["${HIP_WARP_SIZE}"] = HIP_WARP_SIZE;
+    errorMsg = isGpuActive? 
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE :
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE_BUT_GPU_NOT_AVAILABLE_ANYWAY;
+
+    // ... but note that when the GPU backend isn't compiled, we don't know whether the
+    // user has an NVIDIA or AMD GPU, which have distinct warps of 32 (CUDA) and 64 (HIP),
+    // and so choose the smaller divisor (32,CUDA), ergo potentially permitting warp TPB
+    // that are incompatible with HIP. An extremely unimportant subtlety!
+    static_assert(HIP_WARP_SIZE >= CUDA_WARP_SIZE);
+    int warpSize = gpu_isHipCompiled()? HIP_WARP_SIZE : CUDA_WARP_SIZE;
+    assertThat(numTPB % warpSize == 0, errorMsg, vars, caller);
+
+    // the final check of max numTPB requires querying the hardware device, which obviously
+    // isn't possible if not available (and is pointless if available but we're not using!)
+    if (!isGpuActive)
+        return;
+
+    // otherwise, we verify numTPB doesn't exceed the hardware-declared maximum
+    auto maxNumTPB = gpu_getMaxNumThreadsPerBlock();
+    vars = {{"${NUM_TPB}", numTPB}, {"${MAX_TPB}", maxNumTPB}};
+    assertThat(numTPB <= maxNumTPB, report::GPU_NUM_THREADS_PER_BLOCK_EXCEEDS_HARDWARE_MAX, vars, caller);
+}
+
 
 
 /*
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 787316326..0ed780856 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -113,6 +113,8 @@ void validate_numPauliChars(const char* paulis, const char* caller);
 
 void validate_reportedPauliStrStyleFlag(int flag, const char* caller);
 
+void validate_numGpuThreadsPerBlock(int numTPB, bool isGpuActive, const char* caller);
+
 
 
 /*
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 19c0233bb..5036813b6 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -335,27 +335,43 @@ qindex gpu_getMaxNumConcurrentThreads() {
  * ENVIRONMENT MANAGEMENT
  */
 
-int global_numThreadsPerBlock = QUEST_DEFAULT_NUM_THREADS_PER_BLOCK;
+
+int global_numThreadsPerBlock = QUEST_DEFAULT_NUM_THREADS_PER_BLOCK; // TODO!!! make this read env-var
+
 
 int gpu_getNumThreadsPerBlock() {
-    // permitted even when GPU backend not compiled
+#if QUEST_COMPILE_CUDA
+
     return global_numThreadsPerBlock;
+#else
+    error_gpuQueriedButGpuNotCompiled();
+    return -1;
+#endif
 }
 
-void gpu_setNumThreadsPerBlock(const int newNumThreadsPerBlock) {
-    if (gpu_isHipCompiled()) {
-        // number of threads per block should be a multiple of 64
-        if (newNumThreadsPerBlock % 64)
-            error_gpuBadNumThreadsPerBlock();
-    } else {
-        // number of threads per block should be a multiple of 32
-        if (newNumThreadsPerBlock % 32)
-            error_gpuBadNumThreadsPerBlock();
-    }
 
-    // permitted even when GPU backend not compiled
-    global_numThreadsPerBlock = newNumThreadsPerBlock;
-    return;
+void gpu_setNumThreadsPerBlock(int newNumTPB) {
+#if QUEST_COMPILE_CUDA
+    assert_gpuNumThreadsPerBlockIsWarpDivisible(newNumTPB); // CUDA vs HIP specific
+
+    global_numThreadsPerBlock = newNumTPB;
+#else
+    error_gpuQueriedButGpuNotCompiled(); // not really a query, but eh
+#endif
+}
+
+
+int gpu_getMaxNumThreadsPerBlock() {
+#if QUEST_COMPILE_CUDA
+    // query the bound device, checking the call succeeded so we never read an unpopulated prop
+    cudaDeviceProp prop;
+    CUDA_CHECK( cudaGetDeviceProperties(&prop, getBoundGpuId()) );
+    return prop.maxThreadsPerBlock; // HIP compatible
+
+#else
+    error_gpuQueriedButGpuNotCompiled();
+    return -1;
+#endif
 }
 
 
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index e95c9f4f7..184f586d2 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -19,6 +19,16 @@
 #include "quest/include/channels.h"
 
 
+
+/*
+ * CONSTANTS
+ */
+
+constexpr int CUDA_WARP_SIZE = 32;
+constexpr int HIP_WARP_SIZE = 64;
+
+
+
 /*
  * CUDA ERROR HANDLING
  */
@@ -68,7 +78,9 @@ qindex gpu_getMaxNumConcurrentThreads();
 
 int gpu_getNumThreadsPerBlock();
 
-void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock);
+void gpu_setNumThreadsPerBlock(int newThreadsPerBlock);
+
+int gpu_getMaxNumThreadsPerBlock();
 
 void gpu_bindLocalGPUsToNodes();
 

From db43c1757b6276852839a213c8d961b061f4c753 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 00:16:28 -0400
Subject: [PATCH 46/58] Allow numTPB query/set when GPU not compiled

woops!
---
 quest/src/gpu/gpu_config.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 5036813b6..886293442 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -340,24 +340,17 @@ int global_numThreadsPerBlock = QUEST_DEFAULT_NUM_THREADS_PER_BLOCK; // TODO!!!
 
 
 int gpu_getNumThreadsPerBlock() {
-#if QUEST_COMPILE_CUDA
 
     return global_numThreadsPerBlock;
-#else
-    error_gpuQueriedButGpuNotCompiled();
-    return -1;
-#endif
 }
 
 
 void gpu_setNumThreadsPerBlock(int newNumTPB) {
 #if QUEST_COMPILE_CUDA
     assert_gpuNumThreadsPerBlockIsWarpDivisible(newNumTPB); // CUDA vs HIP specific
+#endif
 
     global_numThreadsPerBlock = newNumTPB;
-#else
-    error_gpuQueriedButGpuNotCompiled(); // not really a query, but eh
-#endif
 }
 
 

From 853a1516fffe065f74d365e51c938ad004f16e54 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 00:20:00 -0400
Subject: [PATCH 47/58] improve unit tests

---
 quest/include/environment.h   |   6 ++
 quest/include/experimental.h  |   2 +-
 quest/src/api/environment.cpp |  13 ++++
 tests/unit/CMakeLists.txt     |   1 +
 tests/unit/decoherence.cpp    |   3 +-
 tests/unit/environment.cpp    |  34 ---------
 tests/unit/experimental.cpp   | 127 ++++++++++++++++++++++++++++++++++
 tests/utils/config.cpp        |  21 +++++-
 tests/utils/config.hpp        |   4 ++
 9 files changed, 172 insertions(+), 39 deletions(-)
 create mode 100644 tests/unit/experimental.cpp

diff --git a/quest/include/environment.h b/quest/include/environment.h
index c3d867671..bf855fa55 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -46,6 +46,12 @@ typedef struct {
     // deployment configurations which can be changed via environment variables
     bool isGpuSharingEnabled;
 
+
+        // TODO: we are attaching this for now, but we should, in a subsequent PR,
+        // attach all the important info to the QuESTEnv, for user consumption!
+        bool isHipCompiled;
+
+
     // distributed configuration
     int rank;
     int numNodes;
diff --git a/quest/include/experimental.h b/quest/include/experimental.h
index fa26a7521..b313628b1 100644
--- a/quest/include/experimental.h
+++ b/quest/include/experimental.h
@@ -73,7 +73,7 @@ int getQuESTNumGpuThreadsPerBlock();
 /** @notyetdoced
  * 
  * @author Oliver Brown
- * @author Tyson Jones (input validation)
+ * @author Tyson Jones (input validation, tests)
  */
 void setQuESTNumGpuThreadsPerBlock(int newThreadsPerBlock);
 
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 2de1a808e..d9ec128c0 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -159,6 +159,19 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     global_envPtr->isCuQuantumEnabled  = useCuQuantum;
     global_envPtr->isGpuSharingEnabled = permitGpuSharing;
 
+
+        // DEBUG / TODO
+        // We are attaching isHipCompiled here, as needed by the
+        // setNumTPB unit tests; but it's a great idea to attach
+        // all compilation information to the env, so that users
+        // can programmatically query. Even the compiled facilities
+        // not actively used by the environment are useful to know,
+        // since they inform how users re-initialise QuEST later
+        // (in a different runtime)!  
+
+        global_envPtr->isHipCompiled = gpu_isHipCompiled();
+
+
     // bind distributed info
     global_envPtr->rank     = (useDistrib)? comm_getRank()     : 0;
     global_envPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index d617ba8df..59341759f 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -7,6 +7,7 @@ target_sources(tests
   debug.cpp
   decoherence.cpp
   environment.cpp
+  experimental.cpp
   initialisations.cpp
   matrices.cpp
   multiplication.cpp
diff --git a/tests/unit/decoherence.cpp b/tests/unit/decoherence.cpp
index f36c491bb..60b4cd640 100644
--- a/tests/unit/decoherence.cpp
+++ b/tests/unit/decoherence.cpp
@@ -38,7 +38,8 @@ using std::vector;
  */
 
 
-#define TEST_CATEGORY "[unit][decoherence]"
+#define TEST_CATEGORY \
+    LABEL_UNIT_TAG "[decoherence]"
 
 
 void TEST_ON_CACHED_QUREGS(auto apiFunc, vector<int> targs, vector<qmatrix> kraus) {
diff --git a/tests/unit/environment.cpp b/tests/unit/environment.cpp
index ee259e220..9ecf8e376 100644
--- a/tests/unit/environment.cpp
+++ b/tests/unit/environment.cpp
@@ -178,40 +178,6 @@ TEST_CASE( "getQuESTEnv", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "QuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
-
-    SECTION( LABEL_CORRECTNESS ) {
-        // Check that it initially matches the compile time value
-        // stored in config.h
-        REQUIRE(getQuESTNumGpuThreadsPerBlock() == QUEST_DEFAULT_NUM_THREADS_PER_BLOCK);
-
-        // try a set/get iteration
-        const int test_num_tpb = 64;
-        REQUIRE_NOTHROW(setQuESTNumGpuThreadsPerBlock(test_num_tpb));
-        REQUIRE(getQuESTNumGpuThreadsPerBlock() == test_num_tpb);
-
-        // set it back to the original and confirm that also worked
-        REQUIRE_NOTHROW(setQuESTNumGpuThreadsPerBlock(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK));
-        REQUIRE(getQuESTNumGpuThreadsPerBlock() == QUEST_DEFAULT_NUM_THREADS_PER_BLOCK);
-
-    }
-
-    SECTION( LABEL_VALIDATION ) {
-
-        // The way the error-handling currently works, Catch2 can't catch these (ironically)
-        // but leaving them in case we ever update the way errors are done.
-        
-        SECTION( "Less than 32" ) {
-            //REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(31) , ContainsSubstring("number of threads per block") );
-        }
-
-        SECTION("Not a multiple of 32 or 64.") {
-            //REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(94) , ContainsSubstring("number of threads per block") );
-        }
-
-    }
-}
-
 /** @} (end defgroup) */
 
 
diff --git a/tests/unit/experimental.cpp b/tests/unit/experimental.cpp
new file mode 100644
index 000000000..ea309de57
--- /dev/null
+++ b/tests/unit/experimental.cpp
@@ -0,0 +1,127 @@
+/** @file
+ * Unit tests of the experimental module.
+ *
+ * @author Oliver Brown
+ * @author Tyson Jones
+ * 
+ * @defgroup unitexperi Experimental
+ * @ingroup unittests
+ */
+
+#include "quest.h"
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_string.hpp>
+#include <catch2/generators/catch_generators_range.hpp>
+
+#include "tests/utils/macros.hpp"
+#include "tests/utils/config.hpp"
+
+using Catch::Matchers::ContainsSubstring;
+
+
+
+/*
+ * UTILITIES
+ */
+
+#define TEST_CATEGORY \
+    LABEL_UNIT_TAG "[experimental]"
+
+
+
+/** 
+ * TESTS
+ * 
+ * @ingroup unitexperi
+ * @{
+ */
+
+
+TEST_CASE( "setQuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
+
+    // remember the default number for later restoration (hence static)
+    static int initNumTPB = getQuESTNumGpuThreadsPerBlock();
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // begin at 64 (AMD min, larger than NVIDIA min of 32),
+        // stop at 1024 (should be less than dev-specific max)
+        int inNumTPB = GENERATE( 64, 128, 256, 512, 1024 ); 
+        setQuESTNumGpuThreadsPerBlock(inNumTPB);
+
+        int outNumTPB = getQuESTNumGpuThreadsPerBlock();
+        REQUIRE( inNumTPB == outNumTPB );
+        
+        // BEWARE that we do not here test whether all QuEST
+        // operators succeed with the various numTPB; that must
+        // be assessed ad hoc by updating the numTPB env-var
+        // before launching the entirety of the tests
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        SECTION( "Negative" ) {
+
+            int badNumTPB = GENERATE( 0, -1, -9999 );
+            REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "must be positive" ) );
+        }
+
+        SECTION( "Indivisible by warp size" ) {
+
+            QuESTEnv env = getQuESTEnv();
+            int warpSize = (env.isGpuAccelerated && env.isHipCompiled)? 64 : 32;
+            int badNumTPB = GENERATE_COPY( warpSize - 1, warpSize + 1, warpSize + warpSize/2, 3*warpSize + warpSize/2 );
+
+            REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "Number does not divide evenly into the warp size" ) );
+        }
+
+        SECTION( "Exceeds device maximum" ) {
+
+            int badNumTPB = 999999; // exceeds expected 1024 max
+
+            // Cannot be tested (since validation not imposed) when GPU is not actively used
+            if (getQuESTEnv().isGpuAccelerated)
+                REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "Exceeds the hardware-imposed maximum" ) );
+
+            SUCCEED( );
+        }
+    }
+
+    // restore numTPB, so as not to interfere with other tests
+    setQuESTNumGpuThreadsPerBlock(initNumTPB);
+}
+
+
+TEST_CASE( "getQuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // check initial value matches either the env-var (if set),
+        // or the fixed default in the codebase (hardcoded in test utils)
+        int defaultNum = getDefaultNumGpuThreadsPerBlock(); // test util via env-var
+        int reportedNum = getQuESTNumGpuThreadsPerBlock();  // QuEST API
+
+        REQUIRE( defaultNum == reportedNum );
+
+        // further testing of this function appears in setQuESTNumGpuThreadsPerBlock()
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        // there is none (except untestable env is init!)
+        SUCCEED( );
+    }
+}
+
+
+/** @} (end defgroup) */
+
+
+
+/**
+ * @todo
+ * UNTESTED FUNCTIONS
+ */
+
+// nothing! :^)
diff --git a/tests/utils/config.cpp b/tests/utils/config.cpp
index 30a3844ba..d8eeab605 100644
--- a/tests/utils/config.cpp
+++ b/tests/utils/config.cpp
@@ -40,9 +40,7 @@ int getIntEnvVarValueOrDefault(string name, int defaultValue) {
 
 
 /*
- * PUBLIC
- *
- * which each call std::getenv only once
+ * PUBLIC TEST ENV VARS
  */
 
 int getNumQubitsInUnitTestedQuregs() {
@@ -74,3 +72,20 @@ bool getWhetherToTestAllDeployments() {
     static bool value = getIntEnvVarValueOrDefault("QUEST_TEST_TRY_ALL_DEPLOYMENTS", 1);
     return value;
 }
+
+
+
+/*
+ * PUBLIC QUEST ENV VARS
+ */
+
+int getDefaultNumGpuThreadsPerBlock() {
+
+    // when the env-var is not present, we MUST return the default assumed by the QuEST src code,
+    // which at the time of writing, is a fixed 128 (rather than hardware-specific value)
+    const int compileTimeDefaultTPB = 128;
+
+    // when the env-var is present, we consult that, just like QuEST
+    static int value = getIntEnvVarValueOrDefault("QUEST_NUM_GPU_THREADS_PER_BLOCK", compileTimeDefaultTPB);
+    return value;
+}
diff --git a/tests/utils/config.hpp b/tests/utils/config.hpp
index 10a61f67a..80be56e01 100644
--- a/tests/utils/config.hpp
+++ b/tests/utils/config.hpp
@@ -82,12 +82,16 @@
  * ACCESSING ENV-VARS 
  */
 
+// test env-vars
 int getNumQubitsInUnitTestedQuregs();
 int getMaxNumTestedQubitPermutations();
 int getMaxNumTestedSuperoperatorTargets();
 int getNumTestedMixedDeploymentRepetitions();
 bool getWhetherToTestAllDeployments();
 
+// quest env-vars
+int getDefaultNumGpuThreadsPerBlock();
+
 
 #endif // CONFIG_PP
 

From 4a0a079789b0b4bd2f48b7d1c04e11ee7c7329af Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 00:30:57 -0400
Subject: [PATCH 48/58] revise HIP significance

---
 CMakeLists.txt            | 5 +----
 quest/include/config.h.in | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3acdbbb1d..f7a657579 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -523,6 +523,7 @@ if (QUEST_ENABLE_CUDA OR QUEST_ENABLE_HIP)
 else()
   set(QUEST_COMPILE_CUDA 0)
 endif()
+set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
 
 
 # these vars are already set, but repeated here for clarity
@@ -531,10 +532,6 @@ set(QUEST_ENABLE_NUMA ${QUEST_ENABLE_NUMA})
 set(QUEST_DISABLE_DEPRECATION_WARNINGS ${QUEST_DISABLE_DEPRECATION_WARNINGS})
 
 
-# these do not appear in src but are saved for record-keeping in config.h.in
-set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
-
-
 
 # ============================
 # Pass files to library
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
index c372c793a..bab1fd3cb 100644
--- a/quest/include/config.h.in
+++ b/quest/include/config.h.in
@@ -83,6 +83,7 @@
 #cmakedefine01 QUEST_COMPILE_SUBCOMM
 #cmakedefine01 QUEST_COMPILE_CUDA
 #cmakedefine01 QUEST_COMPILE_CUQUANTUM
+#cmakedefine01 QUEST_COMPILE_HIP
 
 // default parameters which may have been tuned for performance when building the library
 #cmakedefine QUEST_DEFAULT_NUM_THREADS_PER_BLOCK @QUEST_DEFAULT_NUM_THREADS_PER_BLOCK@
@@ -91,10 +92,6 @@
 #cmakedefine01 QUEST_ENABLE_NUMA
 
 
-// not consulted by src (included for book-keeping)
-#cmakedefine01 QUEST_COMPILE_HIP
-
-
 
 /*
  * inherit the version information from CMake.

From 7e68a4d5b4789aaa60e43e037fc478a2ca5f6ae3 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 02:18:54 -0400
Subject: [PATCH 49/58] Replace TBP cmake var with environment var
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Woof that's a lot of boilerplate - but at least we have the safest environment variables in the business! 😅
---
 CMakeLists.txt                 | 16 ---------
 docs/cmake.md                  |  1 -
 docs/launch.md                 |  1 +
 quest/include/config.h.in      |  2 --
 quest/include/experimental.h   | 25 ++++++++++++--
 quest/include/modes.h          | 37 ++++++++++++++++++++-
 quest/src/api/environment.cpp  | 11 ++++--
 quest/src/api/experimental.cpp |  6 ++--
 quest/src/core/envvars.cpp     | 33 ++++++++++++++++--
 quest/src/core/envvars.hpp     |  3 ++
 quest/src/core/errors.cpp      | 17 +++++++++-
 quest/src/core/errors.hpp      |  6 ++++
 quest/src/core/parser.cpp      | 61 ++++++++++++++++++++++++++++++++++
 quest/src/core/parser.hpp      |  4 +++
 quest/src/core/validation.cpp  | 35 ++++++++++++++++---
 quest/src/core/validation.hpp  |  2 ++
 quest/src/gpu/gpu_config.cpp   |  9 ++++-
 quest/src/gpu/gpu_config.hpp   |  9 +++--
 18 files changed, 239 insertions(+), 39 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7a657579..f52d0a877 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -183,21 +183,6 @@ option(
 )
 message(STATUS "AMD GPU acceleration is turned ${QUEST_ENABLE_HIP}. Set QUEST_ENABLE_HIP to modify.")
 
-# GPU Performance Tuning
-## We do not print this value when configuring CMake as it is for advanced users only.
-
-set(QUEST_GPU_NUM_THREADS_PER_BLOCK 128
-  CACHE
-  STRING
-  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. Must be a multiple of 32."
-)
-mark_as_advanced(QUEST_GPU_NUM_THREADS_PER_BLOCK)
-
-math(EXPR quest_tpb_remainder "${QUEST_GPU_NUM_THREADS_PER_BLOCK} % 32")
-if ((NOT (quest_tpb_remainder EQUAL 0)) OR (QUEST_GPU_NUM_THREADS_PER_BLOCK LESS 32))
-    message(FATAL_ERROR "QUEST_GPU_NUM_THREADS_PER_BLOCK must be a multiple of 32. QUEST_GPU_NUM_THREADS_PER_BLOCK=${QUEST_GPU_NUM_THREADS_PER_BLOCK}.")
-endif()
-
 # Deprecated API
 option(
   QUEST_ENABLE_DEPRECATED_API
@@ -514,7 +499,6 @@ set(QUEST_COMPILE_MPI ${QUEST_ENABLE_MPI})
 set(QUEST_COMPILE_SUBCOMM ${QUEST_ENABLE_SUBCOMM})
 set(QUEST_COMPILE_CUQUANTUM ${QUEST_ENABLE_CUQUANTUM})
 set(QUEST_INCLUDE_DEPRECATED_FUNCTIONS ${QUEST_ENABLE_DEPRECATED_API})
-set(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK ${QUEST_GPU_NUM_THREADS_PER_BLOCK})
 
 
 # (for the love of God cmake, create a concise syntax for this)
diff --git a/docs/cmake.md b/docs/cmake.md
index 34a4c0aeb..6d0baeb9f 100644
--- a/docs/cmake.md
+++ b/docs/cmake.md
@@ -48,7 +48,6 @@ make
 | `QUEST_DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
 | `USER_SOURCE_NAMES` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `USER_OUTPUT_EXE_NAME` *must* also be defined. |
 | `USER_OUTPUT_EXE_NAME` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE_NAMES`. `USER_SOURCE_NAMES` *must* also be defined. |
-| `QUEST_GPU_NUM_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32. For AMD GPUs this *should* be a multiple of 64. |
 
 
 
diff --git a/docs/launch.md b/docs/launch.md
index 9d5e6ac22..3eb8493ee 100644
--- a/docs/launch.md
+++ b/docs/launch.md
@@ -270,6 +270,7 @@ QuEST execution can be configured prior to runtime using the below [environment
 
 - [`QUEST_PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga84b134d552464a82d29517e1ce1309a7)
 - [`QUEST_DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#gac4ab30619e411c965377c910680e242c)
+- [`QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK`](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b)
 
 Note the unit tests in the preceding section accept additional environment variables.
 
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
index bab1fd3cb..b2ccf8f54 100644
--- a/quest/include/config.h.in
+++ b/quest/include/config.h.in
@@ -85,8 +85,6 @@
 #cmakedefine01 QUEST_COMPILE_CUQUANTUM
 #cmakedefine01 QUEST_COMPILE_HIP
 
-// default parameters which may have been tuned for performance when building the library
-#cmakedefine QUEST_DEFAULT_NUM_THREADS_PER_BLOCK @QUEST_DEFAULT_NUM_THREADS_PER_BLOCK@
 
 // crucial to QuEST source (informs optional NUMA usage)
 #cmakedefine01 QUEST_ENABLE_NUMA
diff --git a/quest/include/experimental.h b/quest/include/experimental.h
index b313628b1..2efad722a 100644
--- a/quest/include/experimental.h
+++ b/quest/include/experimental.h
@@ -70,12 +70,31 @@ void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMulti
 int getQuESTNumGpuThreadsPerBlock();
 
 
-/** @notyetdoced
+/** Overrides the number of CUDA threads per block (or @p blockDim) used by QuEST's GPU-accelerated backend.
+ * 
+ * This changes the GPU parallelisation granularity and can affect performance, and is useful
+ * for performance tuning or diagnostics. Before this function is called, QuEST will use the
+ * number as specified by the environment variable @p QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK,
+ * if defined. Otherwise, it will fall back to an internal default (presently @p 128).
  * 
+ * Practical values of @p numThreadsPerBlock can vary with the simulation size, the user's GPU hardware,
+ * and whether it is NVIDIA or AMD, which have respective warp sizes of @p 32 and @p 64.
+ * 
+ * @note
+ * This function has no effect when QuEST is not deployed with GPU-acceleration enabled.
+ *
+ * @param[in] numThreadsPerBlock the new block size.
+ * @throws @validationerror
+ * - if the @p QuESTEnv is not initialised.
+ * - if @p numThreadsPerBlock is zero or negative.
+ * - if @p numThreadsPerBlock is not a multiple of the GPU warp size.
+ * - if @p numThreadsPerBlock exceeds the maximum @p blockDim imposed by the GPU hardware.
+ * @see
+ * - QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK
  * @author Oliver Brown
- * @author Tyson Jones (input validation, tests)
+ * @author Tyson Jones
  */
-void setQuESTNumGpuThreadsPerBlock(int newThreadsPerBlock);
+void setQuESTNumGpuThreadsPerBlock(int numThreadsPerBlock);
 
 
 // end de-mangler
diff --git a/quest/include/modes.h b/quest/include/modes.h
index 285b1cb5d..180e85879 100644
--- a/quest/include/modes.h
+++ b/quest/include/modes.h
@@ -43,6 +43,10 @@
      *  - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified)
      *  - permit sharing: @p 1, @p '1'
      * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
+     *   - The specified string (when non-empty) must evaluate to an integer @p 0 or @p 1.
+     * 
      * @author Tyson Jones
      */
     const int QUEST_PERMIT_NODES_TO_SHARE_GPU = 0;
@@ -68,7 +72,7 @@
      *    default validation epsilon.
      * 
      * @constraints
-     * The function initQuESTEnv() will throw a validation error if:
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
      *   - The specified epsilon must be `0` or positive.
      *   - The specified epsilon must not exceed that maximum or minimum value which can be stored
      *     in a `qreal`, which is specific to its precision.
@@ -78,6 +82,37 @@
     const qreal QUEST_DEFAULT_VALIDATION_EPSILON = 0;
 
 
+    /** @envvardoc
+     * 
+     * Specifies the default number of threads per block used by GPU acceleration. 
+     * 
+     * The number of dispatched CUDA threads per block controls the parallelisation granularity of
+     * QuEST's GPU backend, affecting performance.
+     * Specifying `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` to a valid, positive integer overrides
+     * QuEST's hardcoded default of 128. The specified number will be used by all of QuEST's
+     * GPU backend functions, unless overridden at runtime via setQuESTNumGpuThreadsPerBlock().
+     * The actual number of threads per block used at any time can be queried via 
+     * getQuESTNumGpuThreadsPerBlock(). 
+     * 
+     * @envvarvalues
+     *  - use internal default of `128`: @p '', @p , (unspecified)
+     *  - use number `x`: @p x, @p 'x', @p '+x'
+     * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
+     *   - The specified number must be a positive integer.
+     *   - The specified number must not exceed the minimum or maximum value which can be stored in an @p int.
+     *   - The specified number must be divisible by the GPU warp size, which is 32 or 64, depending on
+     *     whether deployed to an NVIDIA or AMD GPU. This restriction is imposed even when QuEST is not
+     *     deployed with GPU-acceleration.
+     *   - The specified number must not exceed the maximum imposed by the available GPU hardware.
+     * 
+     * @author Oliver Brown
+     * @author Tyson Jones
+     */
+    const qreal QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = 0;
+
+
 #endif
 
 
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index d9ec128c0..370fd8709 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -79,7 +79,10 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     validate_envNeverInit(global_envPtr != nullptr, global_hasEnvBeenFinalized, caller);
 
     // load env-vars before validating deployment mode, because some env vars can
-    // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU)
+    // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU). note that
+    // some env-vars (like QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK) will be here
+    // validated to have a correct format (like an int), but the validity of its
+    // actual value will be checked later (since it requires deciding GPU-accel).
     envvars_validateAndLoadEnvVars(caller);
     validateconfig_setEpsilonToDefault();
 
@@ -131,8 +134,10 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     /// should we warn here if each machine contains
     /// more GPUs than deployed MPI-processes (some GPUs idle)?
 
-    // validate the initial numTBP is valid (we will change this to an env-var subsequently)
-    validate_numGpuThreadsPerBlock(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK, useGpuAccel, caller);
+    // validate the initial numTPB env-var (if specified) is valid
+    int initNumThreadsPerBlock = envvars_getDefaultNumGpuThreadsPerBlock();
+    validate_numGpuThreadsPerBlock(initNumThreadsPerBlock, useGpuAccel, caller);
+    gpu_setNumThreadsPerBlock(initNumThreadsPerBlock);
 
     // cuQuantum is always used in GPU-accelerated envs when available
     bool useCuQuantum = useGpuAccel && gpu_isCuQuantumCompiled();
diff --git a/quest/src/api/experimental.cpp b/quest/src/api/experimental.cpp
index f72ecb321..8b413cfa0 100644
--- a/quest/src/api/experimental.cpp
+++ b/quest/src/api/experimental.cpp
@@ -91,14 +91,14 @@ int getQuESTNumGpuThreadsPerBlock() {
 }
 
 
-void setQuESTNumGpuThreadsPerBlock(int newThreadsPerBlock) {
+void setQuESTNumGpuThreadsPerBlock(int numTPB) {
     validate_envIsInit(__func__);
 
     // validation messages and queries depend upon GPU usage
     bool gpuIsActive = getQuESTEnv().isGpuAccelerated;
-    validate_numGpuThreadsPerBlock(newThreadsPerBlock, gpuIsActive, __func__);
+    validate_numGpuThreadsPerBlock(numTPB, gpuIsActive, __func__);
 
-    gpu_setNumThreadsPerBlock(newThreadsPerBlock);
+    gpu_setNumThreadsPerBlock(numTPB);
 }
 
 
diff --git a/quest/src/core/envvars.cpp b/quest/src/core/envvars.cpp
index bd9f87b6f..4f9d9f0e3 100644
--- a/quest/src/core/envvars.cpp
+++ b/quest/src/core/envvars.cpp
@@ -12,6 +12,7 @@
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/parser.hpp"
 #include "quest/src/core/validation.hpp"
+#include "quest/src/gpu/gpu_config.hpp"
 
 #include <string>
 #include <cstdlib>
@@ -26,8 +27,9 @@ using std::string;
 
 
 namespace envvar_names {
-    string QUEST_PERMIT_NODES_TO_SHARE_GPU = "QUEST_PERMIT_NODES_TO_SHARE_GPU";
-    string QUEST_DEFAULT_VALIDATION_EPSILON = "QUEST_DEFAULT_VALIDATION_EPSILON";
+    string QUEST_PERMIT_NODES_TO_SHARE_GPU         = "QUEST_PERMIT_NODES_TO_SHARE_GPU";
+    string QUEST_DEFAULT_VALIDATION_EPSILON        = "QUEST_DEFAULT_VALIDATION_EPSILON";
+    string QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = "QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK";
 }
 
 
@@ -46,6 +48,10 @@ namespace envvar_values {
     // by default, the initial validation epsilon (before being overriden
     // by users at runtime) should depend on qreal (i.e. FLOAT_PRECISION)
     qreal QUEST_DEFAULT_VALIDATION_EPSILON = UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
+
+    // by default, the initial number of GPU threads per block is given
+    // by the constants of gpu_config.hpp, before env-var or runtime overriding
+    int QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = gpu_UNSPECIFIED_DEFAULT_NUM_THREADS_PER_BLOCK;
 }
 
 
@@ -123,6 +129,21 @@ void validateAndSetDefaultValidationEpsilon(const char* caller) {
 }
 
 
+void validateAndSetDefaultNumGpuThreadsPerBlock(const char* caller) {
+
+    // permit unspecified, falling back to the hardcoded default
+    string name = envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
+    if (!isEnvVarSpecified(name))
+        return;
+
+    string value = getSpecifiedEnvVarValue(name);
+    validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(value, caller);
+
+    // overwrite default env-var value
+    envvar_values::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = parser_parseInteger(value);
+}
+
+
 
 /*
  * PUBLIC
@@ -138,6 +159,7 @@ void envvars_validateAndLoadEnvVars(const char* caller) {
     // load all env-vars
     validateAndSetWhetherGpuSharingIsPermitted(caller);
     validateAndSetDefaultValidationEpsilon(caller);
+    validateAndSetDefaultNumGpuThreadsPerBlock(caller);
 
     // ensure no re-loading
     global_areEnvVarsLoaded = true;
@@ -156,3 +178,10 @@ qreal envvars_getDefaultValidationEpsilon() {
 
     return envvar_values::QUEST_DEFAULT_VALIDATION_EPSILON;
 }
+
+
+int envvars_getDefaultNumGpuThreadsPerBlock() {
+    assertEnvVarsAreLoaded();
+
+    return envvar_values::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
+}
diff --git a/quest/src/core/envvars.hpp b/quest/src/core/envvars.hpp
index 555e76f15..4862e8d08 100644
--- a/quest/src/core/envvars.hpp
+++ b/quest/src/core/envvars.hpp
@@ -15,6 +15,7 @@
 namespace envvar_names { 
     extern std::string QUEST_PERMIT_NODES_TO_SHARE_GPU;
     extern std::string QUEST_DEFAULT_VALIDATION_EPSILON;
+    extern std::string QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
 }
 
 
@@ -33,5 +34,7 @@ bool envvars_getWhetherGpuSharingIsPermitted();
 
 qreal envvars_getDefaultValidationEpsilon();
 
+int envvars_getDefaultNumGpuThreadsPerBlock();
+
 
 #endif // ENVVARS_HPP
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index f13579dc3..4055ae56b 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -646,6 +646,11 @@ void error_gpuUnexpectedlyInaccessible() {
     raiseInternalError("A function internally assumed (as a precondition) that QuEST was compiled with GPU-acceleration enabled, and that one was physically accessible, though this was untrue.");
 }
 
+void error_gpuNumThreadsPerBlockNotSet() {
+
+    raiseInternalError("A function queried the GPU numThreadsPerBlock before it had been set (intendedly by QuESTEnv initialisation).");
+}
+
 void error_gpuMemSyncQueriedButEnvNotGpuAccelerated() {
 
     raiseInternalError("A function checked whether persistent GPU memory (such as in a CompMatr) had been synchronised, but the QuEST environment is not GPU accelerated.");  
@@ -662,7 +667,7 @@ void error_gpuDenseMatrixConjugatedAndTransposed() {
 }
 
 void assert_gpuNumThreadsPerBlockIsWarpDivisible(int numThreadsPerBlock) {
-    int warpSize = gpu_isHipCompiled()? HIP_WARP_SIZE : CUDA_WARP_SIZE;
+    int warpSize = gpu_isHipCompiled()? gpu_HIP_WARP_SIZE : gpu_CUDA_WARP_SIZE;
     if (numThreadsPerBlock > 0 && numThreadsPerBlock % warpSize != 0)
         raiseInternalError("The number of threads per block was not a positive multiple of the platform warp size (32 for NVIDIA, 64 for AMD).");
 }
@@ -886,6 +891,16 @@ void error_attemptedToParseRealFromInvalidString() {
     raiseInternalError("A function attempted to parse a string to a qreal but the string was not validly formatted. This should have been caught by prior user validation.");
 }
 
+void error_attemptedToParseIntegerFromInvalidString() {
+
+    raiseInternalError("A function attempted to parse a string to an int but the string was not validly formatted. This should have been caught by prior user validation.");
+}
+
+void error_attemptedToParseOutOfRangeInteger() {
+
+    raiseInternalError("A function attempted to parse a string to an integer but the numerical value of the string literal exceeded the range of the integer. This should have been caught by prior validation.");
+}
+
 void error_attemptedToParseOutOfRangeReal() {
 
     raiseInternalError("A function attempted to parse a string to a qreal but the numerical value of the string literal exceeded the range of the qreal. This should have been caught by prior user validation.");
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 8973f7856..12465917f 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -249,6 +249,8 @@ void error_gpuCopyButMatrixNotGpuAccelerated();
 
 void error_gpuMemSyncQueriedButEnvNotGpuAccelerated();
 
+void error_gpuNumThreadsPerBlockNotSet();
+
 void error_gpuUnexpectedlyInaccessible();
 
 void error_gpuDeadCopyMatrixFunctionCalled();
@@ -369,6 +371,10 @@ void error_attemptedToParseComplexFromInvalidString();
 
 void error_attemptedToParseRealFromInvalidString();
 
+void error_attemptedToParseIntegerFromInvalidString();
+
+void error_attemptedToParseOutOfRangeInteger();
+
 void error_attemptedToParseOutOfRangeReal();
 
 void error_attemptedToParsePauliStringFromInvalidString();
diff --git a/quest/src/core/parser.cpp b/quest/src/core/parser.cpp
index 140d77745..9d9194a3f 100644
--- a/quest/src/core/parser.cpp
+++ b/quest/src/core/parser.cpp
@@ -82,6 +82,9 @@ namespace patterns {
     // full complex; any format, importantly in order of decreasing specificity. do not consult for captured groups
     string num = group(comp) + "|" + group(imag) + "|" + group(real);
 
+    // full signed integer
+    string signedInt = optSign + "[0-9]+";
+
     // no capturing because 'num' pollutes captured groups, and pauli syntax overlaps real integers
     string pauli = "[" + parser_RECOGNISED_PAULI_CHARS + "]";
     string paulis = group(optSpace + pauli + optSpace) + "+";
@@ -96,6 +99,7 @@ namespace regexes {
     regex imag(patterns::imag);
     regex comp(patterns::comp);
     regex num(patterns::num);
+    regex signedInt(patterns::signedInt);
     regex paulis(patterns::paulis);
     regex weightedPaulis(patterns::weightedPaulis);
 }
@@ -173,6 +177,63 @@ int getNumPaulisInLine(string line) {
 
 
 
+/*
+ * INTEGER PARSING
+ */
+
+
+bool parser_isAnySizedInteger(string str) {
+
+    smatch match;
+    return regex_match(str, match, regexes::signedInt);
+}
+
+
+bool parser_isValidInteger(string str) {
+
+    // reject str if it doesn't match regex
+    if (!parser_isAnySizedInteger(str))
+        return false;
+
+    // remove whitespace which stoi() below cannot handle after the sign
+    removeWhiteSpace(str);
+
+    // check number is in-range of int via duck-typing
+    try {
+        std::stoi(str);
+    } catch (const out_of_range&) {
+        return false;
+
+    // error if our regex permitted an unparsable string
+    } catch (const invalid_argument&) {
+        error_attemptedToParseIntegerFromInvalidString();
+    }
+
+    return true;
+}
+
+
+int parser_parseInteger(string str) {
+
+    if (!parser_isValidInteger(str))
+        error_attemptedToParseIntegerFromInvalidString();
+
+    removeWhiteSpace(str); // stoi can't handle
+
+    try {
+        return std::stoi(str);
+    } catch (const invalid_argument&) {
+        error_attemptedToParseIntegerFromInvalidString();
+    } catch (const out_of_range&) {
+        error_attemptedToParseOutOfRangeInteger();
+    }
+
+    // unreachable
+    return -1;
+}
+
+
+
 /*
  * REAL NUMBER PARSING
  */
diff --git a/quest/src/core/parser.hpp b/quest/src/core/parser.hpp
index 4a9df2d02..3d34588ae 100644
--- a/quest/src/core/parser.hpp
+++ b/quest/src/core/parser.hpp
@@ -20,12 +20,16 @@ using std::string;
  * PARSING NUMBERS
  */
 
+bool parser_isAnySizedInteger(string str);
+bool parser_isValidInteger(string str);
+
 bool parser_isAnySizedReal(string str);
 bool parser_isAnySizedComplex(string str);
 
 bool parser_isValidReal(string str);
 bool parser_isValidComplex(string str);
 
+int parser_parseInteger(string str);
 qreal parser_parseReal(string str);
 qcomp parser_parseComplex(string str);
 
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index ec1d6a2e0..704db1cdc 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -1170,6 +1170,13 @@ namespace report {
 
     string DEFAULT_EPSILON_ENV_VAR_IS_NEGATIVE =
         "The optional '" + envvar_names::QUEST_DEFAULT_VALIDATION_EPSILON + "' environment variable was negative. The value must be zero or positive.";
+
+    string DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_NOT_AN_INT =
+        "The optional '" + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + "' environment variable was not a recognisable integer.";
+
+    string DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_EXCEEDS_INT_RANGE =
+        "The optional '" + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + "' environment variable was larger (in magnitude) than the maximum value which can be stored in an integer.";
+
 }
 
 
@@ -1684,8 +1691,8 @@ void validate_numGpuThreadsPerBlock(int numTPB, bool isGpuActive, const char* ca
     assertThat(numTPB > 0, errorMsg, vars, caller);
 
     // prepare to validate TPB is warp-divisible, again pointing out redundancy...
-    vars["${CUDA_WARP_SIZE}"] = CUDA_WARP_SIZE;
-    vars["${HIP_WARP_SIZE}"] = HIP_WARP_SIZE;
+    vars["${CUDA_WARP_SIZE}"] = gpu_CUDA_WARP_SIZE;
+    vars["${HIP_WARP_SIZE}"] = gpu_HIP_WARP_SIZE;
     errorMsg = isGpuActive? 
         report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE :
         report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE_BUT_GPU_NOT_AVAILABLE_ANYWAY;
@@ -1694,8 +1701,8 @@ void validate_numGpuThreadsPerBlock(int numTPB, bool isGpuActive, const char* ca
     // user has an NVIDIA or AMD GPU, which have distinct warps of 32 (CUDA) and 64 (HIP),
     // and so choose the smaller divisor (32,CUDA), ergo potentially permitting warp TPB
     // that are incompatible with HIP. An extremely unimportant subtlety!
-    static_assert(HIP_WARP_SIZE >= CUDA_WARP_SIZE);
-    int warpSize = gpu_isHipCompiled()? HIP_WARP_SIZE : CUDA_WARP_SIZE;
+    static_assert(gpu_HIP_WARP_SIZE >= gpu_CUDA_WARP_SIZE);
+    int warpSize = gpu_isHipCompiled()? gpu_HIP_WARP_SIZE : gpu_CUDA_WARP_SIZE;
     assertThat(numTPB % warpSize == 0, errorMsg, vars, caller);
 
     // the final check of max numTBP requires querying the hardware device, which obviously
@@ -5053,6 +5060,9 @@ void validate_tempAllocSucceeded(bool succeeded, size_t numBytes, const char* ca
 
 void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller) {
 
+    // this presently does absolutely nothing; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no
+    // way to disable validation... but we keep for clarity/consistency!
     if (!global_isValidationEnabled)
         return;
 
@@ -5064,6 +5074,9 @@ void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller) {
 
 void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller) {
 
+    // this presently does absolutely nothing; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no
+    // way to disable validation... but we keep for clarity/consistency!
     if (!global_isValidationEnabled)
         return;
 
@@ -5073,3 +5086,17 @@ void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller
     qreal eps = parser_parseReal(varValue);
     assertThat(eps >= 0, report::DEFAULT_EPSILON_ENV_VAR_IS_NEGATIVE, caller);
 }
+
+void validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(string varValue, const char* caller) {
+
+    // this presently does absolutely nothing; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no
+    // way to disable validation... but we keep for clarity/consistency!
+    if (!global_isValidationEnabled)
+        return;
+
+    // we here only validate that the value is a valid signed integer, using two asserts so that a
+    // non-integer and an out-of-range integer get distinct errors; GPU-compatibility is validated by another func
+    assertThat(parser_isAnySizedInteger(varValue), report::DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_NOT_AN_INT, caller);
+    assertThat(parser_isValidInteger(varValue), report::DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_EXCEEDS_INT_RANGE, caller);
+}
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 0ed780856..58a0b632f 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -556,6 +556,8 @@ void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller);
 
 void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller);
 
+void validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(string varValue, const char* caller);
+
 
 
 #endif // VALIDATION_HPP
\ No newline at end of file
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 886293442..7bfc0a6bc 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -336,10 +336,17 @@ qindex gpu_getMaxNumConcurrentThreads() {
  */
 
 
-int global_numThreadsPerBlock = QUEST_DEFAULT_NUM_THREADS_PER_BLOCK; // TODO!!! make this read env-var
+// the default numTPB is not known until runtime since the macro
+// gpu_UNSPECIFIED_DEFAULT_NUM_THREADS_PER_BLOCK may be overriden by the
+// QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK environment variable. We do 
+// not read the env-var immediately since it may malformed; we must wait
+// for initQuESTEnv() to validate and potentially throw an error
+static int global_numThreadsPerBlock = -1;
 
 
 int gpu_getNumThreadsPerBlock() {
+    if (global_numThreadsPerBlock == -1)  // -1 is the value the global is initialised with at its definition
+        error_gpuNumThreadsPerBlockNotSet();
 
     return global_numThreadsPerBlock;
 }
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index 184f586d2..58030ba63 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -24,8 +24,13 @@
  * CONSTANTS
  */
 
-constexpr int CUDA_WARP_SIZE = 32;
-constexpr int HIP_WARP_SIZE = 64;
+constexpr int gpu_CUDA_WARP_SIZE = 32;
+constexpr int gpu_HIP_WARP_SIZE = 64;
+
+// The default numTPB when the associated, overriding environment
+// variable is not specified. This hardcoded constant MUST pass
+// validation, else the user will be told their env-var is bad
+constexpr int gpu_UNSPECIFIED_DEFAULT_NUM_THREADS_PER_BLOCK = 128;
 
 
 

From 9494a2942d8a9cdc42b3ce5c9bf3041dff16ef85 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 02:24:01 -0400
Subject: [PATCH 50/58] Remove env.isHipCompiled

since it should be added in a separate PR with the other intendedly programmatically-accessible fields. I know in my heart of hearts that if I left isHipCompiled attached, the other fields would never follow hehe
---
 quest/include/environment.h   |  6 ------
 quest/src/api/environment.cpp | 13 -------------
 tests/unit/experimental.cpp   | 10 ++++++++--
 3 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/quest/include/environment.h b/quest/include/environment.h
index bf855fa55..c3d867671 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -46,12 +46,6 @@ typedef struct {
     // deployment configurations which can be changed via environment variables
     bool isGpuSharingEnabled;
 
-
-        // TODO: we are attaching this for now, but we should, in a subsequent PR,
-        // attach all the important info to the QuESTEnv, for user consumption!
-        bool isHipCompiled;
-
-
     // distributed configuration
     int rank;
     int numNodes;
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 370fd8709..a765a7aa6 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -164,19 +164,6 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     global_envPtr->isCuQuantumEnabled  = useCuQuantum;
     global_envPtr->isGpuSharingEnabled = permitGpuSharing;
 
-
-        // DEBUG / TODO
-        // We are attaching isHipCompiled here, as needed by the
-        // setNumTPB unit tests; but it's a great idea to attach
-        // all compilation information to the env, so that users
-        // can programmatically query. Even the compiled facilities
-        // not actively used by the environment are useful to know,
-        // since they inform how users re-initialise QuEST later
-        // (in a different runtime)!  
-
-        global_envPtr->isHipCompiled = gpu_isHipCompiled();
-
-
     // bind distributed info
     global_envPtr->rank     = (useDistrib)? comm_getRank()     : 0;
     global_envPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
diff --git a/tests/unit/experimental.cpp b/tests/unit/experimental.cpp
index ea309de57..b36f67ad1 100644
--- a/tests/unit/experimental.cpp
+++ b/tests/unit/experimental.cpp
@@ -69,8 +69,14 @@ TEST_CASE( "setQuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
 
         SECTION( "Indivisible by warp size" ) {
 
-            QuESTEnv env = getQuESTEnv();
-            int warpSize = (env.isGpuAccelerated && env.isHipCompiled)? 64 : 32;
+            // If HIP status was attached to QuESTEnv, we could do:
+            //     QuESTEnv env = getQuESTEnv();
+            //     int warpSize = (env.isGpuAccelerated && env.isHipCompiled)? 64 : 32;
+            // Since this currently isn't the case, we assume a warp size of 32,
+            // which will mean when this test is run on AMD GPUs, the below tested
+            // badNumTPB won't be as interestingly/rigorously spread
+            int warpSize = 32;
+
             int badNumTPB = GENERATE_COPY( warpSize - 1, warpSize + 1, warpSize + warpSize/2, 3*warpSize + warpSize/2 );
 
             REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "Number does not divide evenly into the warp size" ) );

From d1afb18aaa93b2ddf20c2b63684ebfbfd4db4140 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 17:42:24 -0400
Subject: [PATCH 51/58] restored numTPB cmake var

which is now the lowest-priority default, overridden at executable launch via the environment variable, in-turn overridden at runtime using the setter
---
 CMakeLists.txt               | 38 +++++++++++++++++++++++++++++++++++-
 docs/cmake.md                |  1 +
 quest/include/config.h.in    |  4 ++++
 quest/src/core/envvars.cpp   |  7 ++++---
 quest/src/gpu/gpu_config.cpp |  8 +++-----
 quest/src/gpu/gpu_config.hpp |  5 -----
 6 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f52d0a877..57921ced3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -183,6 +183,21 @@ option(
 )
 message(STATUS "AMD GPU acceleration is turned ${QUEST_ENABLE_HIP}. Set QUEST_ENABLE_HIP to modify.")
 
+
+# GPU Performance Tuning
+# (We do not print this value when configuring CMake as it is for advanced users only)
+
+string(CONCAT quest_tpb_description # (not set(), which would join these pieces into a ';'-separated list)
+  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. "
+  "Must be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). Can be overridden at executable launch "
+  "via an environment variable of the same name, or during runtime via a corresponding API setter function."
+)
+set(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK 128 
+  CACHE STRING
+  "${quest_tpb_description}")
+mark_as_advanced(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK)
+
+
 # Deprecated API
 option(
   QUEST_ENABLE_DEPRECATED_API
@@ -242,6 +257,23 @@ if(WIN32)
 endif()
 
 
+# validate numTPB even when GPU not compiled
+if (QUEST_ENABLE_HIP)
+  set(quest_warp_size 64)
+  set(quest_gpu_model "AMD GPUs (via HIP)")
+else()
+  set(quest_warp_size 32)
+  set(quest_gpu_model "NVIDIA GPUs (via CUDA), or when not targeting GPUs")
+endif()
+math(EXPR quest_tpb_remainder "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} % ${quest_warp_size}")
+if ((NOT (quest_tpb_remainder EQUAL 0)) OR NOT (QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK GREATER 0))
+  message(FATAL_ERROR
+    "QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK was set to ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}, "
+    "but it must be a positive multiple of ${quest_warp_size} when compiling for ${quest_gpu_model}."
+  )
+endif()
+
+
 # Encourage high-performance Release build
 
 # Taken from Kitware's exmaple of problematic code at
@@ -510,7 +542,11 @@ endif()
 set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
 
 
-# these vars are already set, but repeated here for clarity
+# non-binary set vars which will be written to config.h.in (with a differing name) 
+set(QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK})
+
+
+# these vars are already set (cmake name matches the macro name), but repeated here for clarity
 set(QUEST_FLOAT_PRECISION ${QUEST_FLOAT_PRECISION})
 set(QUEST_ENABLE_NUMA ${QUEST_ENABLE_NUMA})
 set(QUEST_DISABLE_DEPRECATION_WARNINGS ${QUEST_DISABLE_DEPRECATION_WARNINGS})
diff --git a/docs/cmake.md b/docs/cmake.md
index 6d0baeb9f..e2377d03c 100644
--- a/docs/cmake.md
+++ b/docs/cmake.md
@@ -48,6 +48,7 @@ make
 | `QUEST_DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
 | `USER_SOURCE_NAMES` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `USER_OUTPUT_EXE_NAME` *must* also be defined. |
 | `USER_OUTPUT_EXE_NAME` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE_NAMES`. `USER_SOURCE_NAMES` *must* also be defined. |
+| `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). This CMake variable sets the default when not later overridden. The number can be overridden at process launch time using an [environment variable](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b) of the same name, or during runtime using [`setQuESTNumGpuThreadsPerBlock()`](https://quest-kit.github.io/QuEST/group__experimental.html#gae35a55c6d9366ce677e6aaaf4c1ff5ef). |
 
 
 
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
index b2ccf8f54..1bb8a0470 100644
--- a/quest/include/config.h.in
+++ b/quest/include/config.h.in
@@ -90,6 +90,10 @@
 #cmakedefine01 QUEST_ENABLE_NUMA
 
 
+// default parameters which may have been tuned for performance when building the library
+#cmakedefine QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK @QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK@
+
+
 
 /*
  * inherit the version information from CMake.
diff --git a/quest/src/core/envvars.cpp b/quest/src/core/envvars.cpp
index 4f9d9f0e3..b54250479 100644
--- a/quest/src/core/envvars.cpp
+++ b/quest/src/core/envvars.cpp
@@ -6,6 +6,7 @@
  * @author Tyson Jones
  */
 
+#include "quest/include/config.h"
 #include "quest/include/precision.h"
 #include "quest/include/types.h"
 
@@ -49,9 +50,9 @@ namespace envvar_values {
     // by users at runtime) should depend on qreal (i.e. FLOAT_PRECISION)
     qreal QUEST_DEFAULT_VALIDATION_EPSILON = UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
 
-    // by default, the initial number of GPU threads per block is given
-    // by the constants of gpu_config.hpp, before env-var or runtime overriding
-    int QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = gpu_UNSPECIFIED_DEFAULT_NUM_THREADS_PER_BLOCK;
+    // by default, the initial number of GPU threads per block is informed by
+    // the below cmake variable (before being overridden by env-var or at runtime)
+    int QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
 }
 
 
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 7bfc0a6bc..fc68969ad 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -336,11 +336,9 @@ qindex gpu_getMaxNumConcurrentThreads() {
  */
 
 
-// the default numTPB is not known until runtime since the macro
-// gpu_UNSPECIFIED_DEFAULT_NUM_THREADS_PER_BLOCK may be overriden by the
-// QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK environment variable. We do 
-// not read the env-var immediately since it may malformed; we must wait
-// for initQuESTEnv() to validate and potentially throw an error
+// the default numTPB is not known until runtime since the initial value
+// (provided either by the CMake var, or the environment variable) must
+// be validated during QuEST initialisation.
 static int global_numThreadsPerBlock = -1;
 
 
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index 58030ba63..98cb9c8a3 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -27,11 +27,6 @@
 constexpr int gpu_CUDA_WARP_SIZE = 32;
 constexpr int gpu_HIP_WARP_SIZE = 64;
 
-// The default numTPB when the associated, overriding environment
-// variable is not specified. This hardcoded constant MUST pass
-// validation, else the user will be told their env-var is bad
-constexpr int gpu_UNSPECIFIED_DEFAULT_NUM_THREADS_PER_BLOCK = 128;
-
 
 
 /*

From 9532b26b91124b35aad60278934638046f26a3d6 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 17:43:02 -0400
Subject: [PATCH 52/58] warn during config when CMake-var contradicts env-var

---
 CMakeLists.txt | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57921ced3..c6da1edeb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -236,10 +236,12 @@ if (QUEST_ENABLE_CUQUANTUM AND NOT QUEST_ENABLE_CUDA)
   message(FATAL_ERROR "Use of cuQuantum requires CUDA.")
 endif()
 
+
 if (QUEST_ENABLE_SUBCOMM AND NOT QUEST_ENABLE_MPI)
   message(FATAL_ERROR "Distribution must be enabled to make use of a user-defined communicator for QuEST.")
 endif()
 
+
 if(WIN32)
   
   # Force MSVC to export all symbols in a shared library, like GCC and clang
@@ -274,6 +276,20 @@ if ((NOT (quest_tpb_remainder EQUAL 0)) OR NOT (QUEST_DEFAULT_NUM_GPU_THREADS_PE
 endif()
 
 
+# warn when numTPB will be later overridden by the current environment variable
+if(
+  DEFINED ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} 
+  AND NOT "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" STREQUAL ""
+  AND NOT "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" STREQUAL "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}"
+)
+  message(WARNING 
+    "The CMake option QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK=${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} "
+    "differs from the current environment variable (of the same name) value of $ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}. "
+    "If not cleared before QuEST is launched, the latter will override the former."
+  )
+endif()
+
+
 # Encourage high-performance Release build
 
 # Taken from Kitware's exmaple of problematic code at

From 9f6c82fdfdadf1afb313d1493c7ebc26045a69c7 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 17:43:16 -0400
Subject: [PATCH 53/58] lil clean

---
 CMakeLists.txt                 | 9 ++++++++-
 quest/include/precision.h      | 6 +++---
 quest/src/comm/comm_config.cpp | 8 ++++++++
 quest/src/comm/comm_config.hpp | 1 +
 quest/src/core/envvars.cpp     | 2 +-
 5 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c6da1edeb..b5a438713 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,6 +161,7 @@ option(
 )
 message(STATUS "Custom communicator support is turned ${QUEST_ENABLE_SUBCOMM}. Set QUEST_ENABLE_SUBCOMM to modify.")
 
+
 # GPU Acceleration
 option(
   QUEST_ENABLE_CUDA
@@ -211,9 +212,15 @@ option(
   "Whether to disable compile-time warnings ordinarily triggered by use of the deprecated API. Turned OFF by default."
   OFF
 )
-message(STATUS "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify.")
+message(STATUS 
+  "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. "
+  "Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify."
+)
 
 option(QUEST_INSTALL_BINARIES "Whether to include example and user binaries in the install." OFF)
+if (QUEST_INSTALL_BINARIES)
+  message(STATUS "Including example and user binaries in the install (if built).")
+endif()
 
 
 
diff --git a/quest/include/precision.h b/quest/include/precision.h
index 2c89545f7..7b932e678 100644
--- a/quest/include/precision.h
+++ b/quest/include/precision.h
@@ -126,13 +126,13 @@
  */
 
 #if QUEST_FLOAT_PRECISION == 1
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
 
 #elif QUEST_FLOAT_PRECISION == 2
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
 
 #elif QUEST_FLOAT_PRECISION == 4
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
 
 #endif
 
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index 8b7a72ff5..011fdd8de 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -201,6 +201,14 @@ bool comm_isMpiInit() {
 }
 
 
+bool comm_isMpiUserOwned() {
+
+    // plain getter for global_isMpiUserOwned. It isn't presently used by the code base; I'm just
+    // naughtily silencing the "unused var" warning when compiling without MPI :^)
+    return global_isMpiUserOwned;
+}
+
+
 
 /*
  * QUEST COMMUNICATION MANAGEMENT
diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp
index 826ebdf1c..cc009ab9a 100644
--- a/quest/src/comm/comm_config.hpp
+++ b/quest/src/comm/comm_config.hpp
@@ -17,6 +17,7 @@ bool comm_isMpiCompiled();
 bool comm_isMpiSubCommCompiled();
 bool comm_isMpiGpuAware();
 bool comm_isMpiInit();
+bool comm_isMpiUserOwned();
 
 // control of QuEST's (possibly more limited) MPI env
 bool comm_isActive();
diff --git a/quest/src/core/envvars.cpp b/quest/src/core/envvars.cpp
index b54250479..c1d3e81ed 100644
--- a/quest/src/core/envvars.cpp
+++ b/quest/src/core/envvars.cpp
@@ -48,7 +48,7 @@ namespace envvar_values {
 
     // by default, the initial validation epsilon (before being overriden
     // by users at runtime) should depend on qreal (i.e. FLOAT_PRECISION)
-    qreal QUEST_DEFAULT_VALIDATION_EPSILON = UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
+    qreal QUEST_DEFAULT_VALIDATION_EPSILON = QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
 
     // by default, the initial number of GPU threads per block is informed by
     // the below cmake variable (before being overridden by env-var or at runtime)

From e75459d8876e98c8921bd2e4b21e6320618cb3ce Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 18:02:03 -0400
Subject: [PATCH 54/58] Improve numTPB validation msg

given it can now be a result of the cmake var, the env var, or the runtime setter argument
---
 quest/src/core/validation.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 704db1cdc..603cf2b86 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -161,17 +161,19 @@ namespace report {
 
     // substrings re-used below
     string _invalid_num_tpb_prefix =
-        "Given an invalid number of threads per GPU block (possibly specified by environment variable) of ${NUM_TPB}.";
+        "An invalid number of GPU threads per block (${NUM_TPB}) was passed, or specified via environment variable " + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + ", or compiled into the QuEST library through the CMake option of the same name.";
+    string _num_tpb_warp_indivisible_infix = // NOTE(review): tests matching the old "Number does not divide..." (capital N) would need updating - confirm
+        "The specified number does not divide evenly into the warp size of ${CUDA_WARP_SIZE} (NVIDIA GPUs) or ${HIP_WARP_SIZE} (AMD GPUs).";
+    string _num_tpb_warp_negative_infix =
+        "The specified number must be positive.";
     string _num_tpb_ineffectual_suffix =
         "Note GPU acceleration is not active so this parameter has no effect anyway.";
-    string _num_tpb_warp_indivisible_infix =
-        "Number does not divide evenly into the warp size of ${CUDA_WARP_SIZE} (NVIDIA GPUs) or ${HIP_WARP_SIZE} (AMD GPUs).";
 
     string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE =
-        _invalid_num_tpb_prefix + " Number must be positive.";
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_negative_infix;
 
     string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE_BUT_GPU_NOT_ACTIVE_ANYWAY =
-        _invalid_num_tpb_prefix + " Number must be positive. " + _num_tpb_ineffectual_suffix;
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_negative_infix + " " + _num_tpb_ineffectual_suffix;
 
     string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE =
         _invalid_num_tpb_prefix + " " + _num_tpb_warp_indivisible_infix;

From e90941930534b150eaf0e59c3459bd565be00df8 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 19:27:13 -0400
Subject: [PATCH 55/58] Crossref'd cmake var, env-var and setter in docs

---
 docs/cmake.md                |  2 +-
 quest/include/experimental.h |  5 ++++-
 quest/include/modes.h        | 11 +++++++----
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/docs/cmake.md b/docs/cmake.md
index e2377d03c..fec90d76a 100644
--- a/docs/cmake.md
+++ b/docs/cmake.md
@@ -48,7 +48,7 @@ make
 | `QUEST_DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
 | `USER_SOURCE_NAMES` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `USER_OUTPUT_EXE_NAME` *must* also be defined. |
 | `USER_OUTPUT_EXE_NAME` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE_NAMES`. `USER_SOURCE_NAMES` *must* also be defined. |
-| `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). This CMake variable sets the default when not later overridden. The number can be overridden at process launch time using an [environment variable](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b) of the same name, or during runtime using [`setQuESTNumGpuThreadsPerBlock()`](https://quest-kit.github.io/QuEST/group__experimental.html#gae35a55c6d9366ce677e6aaaf4c1ff5ef). |
+| `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). This CMake variable sets the default if not later overridden. The number can be overridden at process launch time using an [environment variable](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b) of the same name, or during runtime using [`setQuESTNumGpuThreadsPerBlock()`](https://quest-kit.github.io/QuEST/group__experimental.html#gae35a55c6d9366ce677e6aaaf4c1ff5ef). |
 
 
 
diff --git a/quest/include/experimental.h b/quest/include/experimental.h
index 2efad722a..8c2cc4e0a 100644
--- a/quest/include/experimental.h
+++ b/quest/include/experimental.h
@@ -75,7 +75,10 @@ int getQuESTNumGpuThreadsPerBlock();
  * This changes the GPU parallelisation granularity and can affect performance, and is useful
  * for performance tuning or diagnostics. Before this function is called, QuEST will use the
  * number as specified by the environment variable @p QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK,
- * if defined. Otherwise, it will fallback to an internal default (presently @p 128).
+ * if defined. Otherwise, it will use the value specified by the CMake/compile option of the
+ * same name, which itself presently defaults to @p 128. After this function is called, QuEST
+ * will adopt @p numThreadsPerBlock for the remainder of execution, or until this function is
+ * called again.
  * 
  * Practical values of @p numThreadsPerBlock can vary with the simulation size, the user's GPU hardware,
  * and whether it is NVIDIA or AMD, which have respective warp sizes of @p 32 and @p 64.
diff --git a/quest/include/modes.h b/quest/include/modes.h
index 180e85879..25ad8bb54 100644
--- a/quest/include/modes.h
+++ b/quest/include/modes.h
@@ -84,15 +84,18 @@
 
     /** @envvardoc
      * 
-     * Specifies the default number of threads per block used by GPU acceleration. 
+     * Specifies the default number of threads per block (or "block dimension") used by GPU acceleration. 
      * 
      * The number of dispatched CUDA threads per block controls the parallelisation granularity of
      * QuEST's GPU backend, affecting performance.
      * Specifying `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` to a valid, positive integer overrides
-     * QuEST's hardcoded default of 128. The specified number will be used by all of QuEST's
-     * GPU backend functions, unless overridden at runtime via setQuESTNumGpuThreadsPerBlock().
+     * QuEST's default otherwise set during compilation via a CMake option of the same name. If 
+     * that CMake option was not set, the default is assumed to be @p 128.
+     * 
+     * The number specified by this environment variable will be used as the block dimension by all of
+     * QuEST's GPU backend functions, unless overridden at runtime via setQuESTNumGpuThreadsPerBlock().
      * The actual number of threads per block used at any time can be queried via 
-     * getQuESTNumGpuThreadsPerBlock(). 
+     * getQuESTNumGpuThreadsPerBlock(), or reported by reportQuESTEnv().
      * 
      * @envvarvalues
      *  - use internal default of `128`: @p '', @p , (unspecified)

From 72ac142b29ac39acd153c74f6382490c3691874b Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 20:24:46 -0400
Subject: [PATCH 56/58] Permit non-warp-multiple when validation off

which is achieved by simply removing the overzealous internal error handling. Let users have fun!
---
 quest/src/core/errors.cpp    | 6 ------
 quest/src/core/errors.hpp    | 2 --
 quest/src/gpu/gpu_config.cpp | 3 ---
 3 files changed, 11 deletions(-)

diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 0d4f7ea16..807cad105 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -670,12 +670,6 @@ void error_gpuDenseMatrixConjugatedAndTransposed() {
     raiseInternalError("The GPU + cuQuantum implementation of anyCtrlAnyTargDenseMatr() assumes that at most one of template arguments ApplyConj and ApplyTransp is true, though this was violated.");
 }
 
-void assert_gpuNumThreadsPerBlockIsWarpDivisible(int numThreadsPerBlock) {
-    int warpSize = gpu_isHipCompiled()? gpu_HIP_WARP_SIZE : gpu_CUDA_WARP_SIZE;
-    if (numThreadsPerBlock > 0 && numThreadsPerBlock % warpSize != 0)
-        raiseInternalError("The number of threads per block was not a positive multiple of the platform warp size (32 for NVIDIA, 64 for AMD).");
-}
-
 void assert_quregIsGpuAccelerated(Qureg qureg) {
 
     if (!qureg.isGpuAccelerated)
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 12465917f..f91f890b0 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -263,8 +263,6 @@ void assert_gpuIsAccessible();
 
 void assert_gpuHasBeenBound(bool isBound);
 
-void assert_gpuNumThreadsPerBlockIsWarpDivisible(int numThreadsPerBlock);
-
 void assert_quregIsGpuAccelerated(Qureg qureg);
 
 void assert_mixQuregTempGpuAllocSucceeded(qcomp* gpuPtr);
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index fc68969ad..001cc62c0 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -351,9 +351,6 @@ int gpu_getNumThreadsPerBlock() {
 
 
 void gpu_setNumThreadsPerBlock(int newNumTPB) {
-#if QUEST_COMPILE_CUDA
-    assert_gpuNumThreadsPerBlockIsWarpDivisible(newNumTPB); // CUDA vs HIP specific
-#endif
 
     global_numThreadsPerBlock = newNumTPB;
 }

From 557a0609678559a362c2f1f4fd3afa1135f0afa9 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 20:35:49 -0400
Subject: [PATCH 57/58] added examples

---
 .../extended/set_num_gpu_threads_per_block.c  | 91 +++++++++++++++++++
 .../set_num_gpu_threads_per_block.cpp         | 91 +++++++++++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 examples/extended/set_num_gpu_threads_per_block.c
 create mode 100644 examples/extended/set_num_gpu_threads_per_block.cpp

diff --git a/examples/extended/set_num_gpu_threads_per_block.c b/examples/extended/set_num_gpu_threads_per_block.c
new file mode 100644
index 000000000..1b3dc175f
--- /dev/null
+++ b/examples/extended/set_num_gpu_threads_per_block.c
@@ -0,0 +1,91 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * setQuESTNumGpuThreadsPerBlock() function
+ * to change the parallelisation granularity
+ * of GPU simulation
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest.h"
+#include <stdio.h>
+#include <time.h>
+
+
+const int NUM_REPS = 10;
+const int NUM_QUBITS = 25;  // 512 MiB (at double precision)
+
+
+void simulation(Qureg qureg)
+{
+    // the workload timed by benchmark(); put your favourite QuEST simulation here
+    initRandomPureState(qureg);
+    applyFullQuantumFourierTransform(qureg, /*inverse=*/false);
+    calcTotalProb(qureg);  // any returned value is unused here
+}
+
+
+void benchmark(Qureg qureg, int numThreadsPerBlock)
+{
+    printf("Using %d threads per block... ", numThreadsPerBlock);
+    fflush(stdout);  // show the message before the runs below begin
+
+    setQuESTNumGpuThreadsPerBlock(numThreadsPerBlock);
+
+    // warmup (untimed)
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    // time with a wall clock; clock() would report CPU time, not elapsed time
+    struct timespec start, end;
+    timespec_get(&start, TIME_UTC);
+    for (int r = 0; r < NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+    timespec_get(&end, TIME_UTC);
+
+    double dur = (double) (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
+    double av = dur / NUM_REPS;  // mean seconds per repetition
+    printf("took %fs\n", av);
+}
+
+
+int main(void)
+{
+    initQuESTEnv();
+
+    // This example is pointless without a GPU, so finalise the environment and exit early
+    if (!getQuESTEnv().isGpuAccelerated)
+    {
+        printf(
+            "GPU acceleration is not enabled, and so changing the number "
+            "of threads per block has no effect. Exiting...\n");
+        finalizeQuESTEnv();
+        return 0;
+    }
+
+    int initNumTPB = getQuESTNumGpuThreadsPerBlock();
+    printf("Initial numThreadsPerBlock: %d\n\n", initNumTPB);
+
+    // Create a statevector parallelised only by the GPU (TODO confirm the meaning of each 0/1 flag against createCustomQureg)
+    Qureg qureg = createCustomQureg(NUM_QUBITS, 0, 0, 1, 0);
+    reportQuregParams(qureg);
+
+    // Benchmark sensible parameters (every value is a multiple of 64, and hence of 32)
+    int goodTPB[] = {64, 128, 256, 512, 1024};
+    for (int i = 0; i < 5; i++)
+        benchmark(qureg, goodTPB[i]);
+
+    // Try silly parameters (none is a multiple of 32); validation is disabled first, presumably so they are not rejected
+    setQuESTValidationOff();
+    int badTPB[] = {31, 15, 5, 1};
+    for (int i = 0; i < 4; i++)
+        benchmark(qureg, badTPB[i]);
+
+    destroyQureg(qureg);
+    finalizeQuESTEnv();
+
+    return 0;
+}
diff --git a/examples/extended/set_num_gpu_threads_per_block.cpp b/examples/extended/set_num_gpu_threads_per_block.cpp
new file mode 100644
index 000000000..c298d736d
--- /dev/null
+++ b/examples/extended/set_num_gpu_threads_per_block.cpp
@@ -0,0 +1,91 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * setQuESTNumGpuThreadsPerBlock() function
+ * to change the parallelisation granularity
+ * of GPU simulation
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest.h"
+#include <iostream>
+#include <chrono>
+
+
+const int NUM_REPS = 10;
+const int NUM_QUBITS = 25;  // 512 MiB (at double precision)
+
+
+void simulation(Qureg qureg)
+{
+    // the workload timed by benchmark(); put your favourite QuEST simulation here
+    initRandomPureState(qureg);
+    applyFullQuantumFourierTransform(qureg, /*inverse=*/false);
+    calcTotalProb(qureg);  // any returned value is unused here
+}
+
+
+void benchmark(Qureg qureg, int numThreadsPerBlock)
+{
+    std::cout << "Using " << numThreadsPerBlock << " threads per block... " << std::flush;
+
+    setQuESTNumGpuThreadsPerBlock(numThreadsPerBlock);
+
+    // warmup (untimed); NOTE(review): syncQuESTEnv() presumably waits for outstanding work - confirm
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    using clock = std::chrono::steady_clock;  // monotonic clock
+    auto start = clock::now();
+
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    auto end = clock::now();
+    auto dur = std::chrono::duration<double>(end - start).count();  // elapsed seconds as a double
+    auto av  = dur / NUM_REPS;  // mean seconds per repetition
+
+    std::cout << " took " << av << "s" << std::endl;
+}
+
+
+int main()
+{
+    initQuESTEnv();
+
+    // This example is pointless without a GPU, so finalise the environment and exit early
+    if (!getQuESTEnv().isGpuAccelerated) {
+        std::cout 
+            << "GPU acceleration is not enabled, and so changing the number "
+            << "of threads per block has no effect. Exiting..."
+            << std::endl;
+        finalizeQuESTEnv();
+        return 0;
+    }
+
+    // The initial number of threads per block is informed by the optional environment
+    // variable QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK. If not specified, QuEST will
+    // use the value of the CMake option of the same name passed during compilation,
+    // which itself has a default of 128
+    auto initNumTPB = getQuESTNumGpuThreadsPerBlock();
+    std::cout << "Initial numThreadsPerBlock: " << initNumTPB << "\n\n";
+
+    // Create a statevector parallelised only by the GPU
+    Qureg qureg = createCustomQureg(NUM_QUBITS, 0, 0, 1, 0);
+    reportQuregParams(qureg);
+
+    // Benchmark QuEST with sensible numbers of threads per block (multiples of warp size)
+    for (auto numTPB : {64, 128, 256, 512, 1024})
+        benchmark(qureg, numTPB);
+
+    // Try silly parameters ¯\_(ツ)_/¯
+    setQuESTValidationOff();
+    for (auto numTPB : {31, 15, 5, 1})
+        benchmark(qureg, numTPB);
+    destroyQureg(qureg);  // destroy the Qureg created above before finalising the environment
+    finalizeQuESTEnv();
+    return 0;
+}

From 91c9ade4e9658ac9431afb20c3d1e90925dd2c8f Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 1 Jun 2026 20:40:01 -0400
Subject: [PATCH 58/58] Renamed examples

for brevity! pixels r precious
---
 .../{set_num_gpu_threads_per_block.c => set_num_gpu_threads.c}    | 0
 ...{set_num_gpu_threads_per_block.cpp => set_num_gpu_threads.cpp} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename examples/extended/{set_num_gpu_threads_per_block.c => set_num_gpu_threads.c} (100%)
 rename examples/extended/{set_num_gpu_threads_per_block.cpp => set_num_gpu_threads.cpp} (100%)

diff --git a/examples/extended/set_num_gpu_threads_per_block.c b/examples/extended/set_num_gpu_threads.c
similarity index 100%
rename from examples/extended/set_num_gpu_threads_per_block.c
rename to examples/extended/set_num_gpu_threads.c
diff --git a/examples/extended/set_num_gpu_threads_per_block.cpp b/examples/extended/set_num_gpu_threads.cpp
similarity index 100%
rename from examples/extended/set_num_gpu_threads_per_block.cpp
rename to examples/extended/set_num_gpu_threads.cpp