Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
16179f1
Make env.isX bool to match env.userOwnsMpi
TysonRayJones May 29, 2026
1ddfb6b
Add MPI status validation
TysonRayJones May 29, 2026
507c2e4
Simplify comm_init()
TysonRayJones May 29, 2026
e80f768
Enable error msgs even when MPI config is invalid
TysonRayJones May 29, 2026
8b73cd3
Add Oliver's custom MPI examples
TysonRayJones May 29, 2026
e91f54f
renamed env.userOwnsMpi to env.isMpiUserOwned
TysonRayJones May 29, 2026
fe1020c
Remove redundant stdbool include
TysonRayJones May 29, 2026
d363d09
Add validation to initCustomMpiCommQuESTEnv
TysonRayJones May 29, 2026
70ac569
Rename mpiCommQuest to global_mpiComm
TysonRayJones May 29, 2026
ef6860b
Rename mpiCommQuest (local var) to mpiComm
TysonRayJones May 29, 2026
d85e064
Made environment.cpp adhere to global_ convention
TysonRayJones May 29, 2026
8fe9bbe
Remove suspicious updateQuESTEnvDistInfo()
TysonRayJones May 29, 2026
314e72e
Error in comm_getMpiComm() when comm=NULL
TysonRayJones May 29, 2026
51c0731
Remove MPI leak from comm_config.hpp
TysonRayJones May 29, 2026
047ede7
Rename comm_isMpiSubCommunicatorCompiled to comm_isMpiSubCommCompiled
TysonRayJones May 29, 2026
1680a12
Replace magic number
TysonRayJones May 29, 2026
00332a8
Make initCustomMpiCommQuESTEnv validate against re-init
TysonRayJones May 29, 2026
6763af0
Make initCustomMpiCommQuESTEnv validate subcomm is non-null
TysonRayJones May 29, 2026
1c9072c
Make initCustomMpiCommQuESTEnv validate set-subcomm succeeds
TysonRayJones May 29, 2026
7c75e72
Remove redundant env.bool tests
TysonRayJones May 29, 2026
93f30f2
Rename error_commDoubleSetMpiComm
TysonRayJones May 29, 2026
790d11c
Skip custom MPI examples when no MPI
TysonRayJones May 29, 2026
ac86d12
Patches
TysonRayJones May 29, 2026
a483af5
Permit usage of MPI when QuEST is non-distributed
TysonRayJones May 30, 2026
752e89f
patch bug where user-MPI was finalised
TysonRayJones May 30, 2026
53d3f28
moved new custom-env funcs to experimental.h
TysonRayJones May 31, 2026
6c07b28
Merge remote-tracking branch 'origin/permit-mpi-usage-without-distrib…
TysonRayJones May 31, 2026
dc2cf6c
moved numTBP API to experimental.h
TysonRayJones May 31, 2026
fe5aaf8
Flag register-spill risk when increasing TBP
TysonRayJones May 31, 2026
3f550a7
add numTBP validation
TysonRayJones Jun 1, 2026
db43c17
Allow numTPB query/set when GPU not compiled
TysonRayJones Jun 1, 2026
853a151
improve unit tests
TysonRayJones Jun 1, 2026
4a0a079
revise HIP significance
TysonRayJones Jun 1, 2026
7e68a4d
Replace TBP cmake var with environment var
TysonRayJones Jun 1, 2026
9494a29
Remove env.isHipCompiled
TysonRayJones Jun 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 1 addition & 20 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -183,21 +183,6 @@ option(
)
message(STATUS "AMD GPU acceleration is turned ${QUEST_ENABLE_HIP}. Set QUEST_ENABLE_HIP to modify.")

# GPU Performance Tuning
## We do not print this value when configuring CMake as it is for advanced users only.

set(QUEST_GPU_NUM_THREADS_PER_BLOCK 128
CACHE
STRING
"The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. Must be a multiple of 32."
)
mark_as_advanced(QUEST_GPU_NUM_THREADS_PER_BLOCK)

# Reject any block size which is not a positive multiple of 32. The modulo
# test alone would accept 0 and negative multiples of 32, so the LESS check
# is needed to exclude them; the error message states both requirements.
math(EXPR quest_tpb_remainder "${QUEST_GPU_NUM_THREADS_PER_BLOCK} % 32")
if ((NOT (quest_tpb_remainder EQUAL 0)) OR (QUEST_GPU_NUM_THREADS_PER_BLOCK LESS 32))
    message(FATAL_ERROR "QUEST_GPU_NUM_THREADS_PER_BLOCK must be a positive multiple of 32. QUEST_GPU_NUM_THREADS_PER_BLOCK=${QUEST_GPU_NUM_THREADS_PER_BLOCK}.")
endif()

# Deprecated API
option(
QUEST_ENABLE_DEPRECATED_API
Expand Down Expand Up @@ -514,7 +499,6 @@ set(QUEST_COMPILE_MPI ${QUEST_ENABLE_MPI})
set(QUEST_COMPILE_SUBCOMM ${QUEST_ENABLE_SUBCOMM})
set(QUEST_COMPILE_CUQUANTUM ${QUEST_ENABLE_CUQUANTUM})
set(QUEST_INCLUDE_DEPRECATED_FUNCTIONS ${QUEST_ENABLE_DEPRECATED_API})
set(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK ${QUEST_GPU_NUM_THREADS_PER_BLOCK})


# (for the love of God cmake, create a concise syntax for this)
Expand All @@ -523,6 +507,7 @@ if (QUEST_ENABLE_CUDA OR QUEST_ENABLE_HIP)
else()
set(QUEST_COMPILE_CUDA 0)
endif()
set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})


# these vars are already set, but repeated here for clarity
Expand All @@ -531,10 +516,6 @@ set(QUEST_ENABLE_NUMA ${QUEST_ENABLE_NUMA})
set(QUEST_DISABLE_DEPRECATION_WARNINGS ${QUEST_DISABLE_DEPRECATION_WARNINGS})


# these do not appear in src but are saved for record-keeping in config.h.in
set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})



# ============================
# Pass files to library
Expand Down
1 change: 0 additions & 1 deletion docs/cmake.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ make
| `QUEST_DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
| `USER_SOURCE_NAMES` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `USER_OUTPUT_EXE_NAME` *must* also be defined. |
| `USER_OUTPUT_EXE_NAME` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE_NAMES`. `USER_SOURCE_NAMES` *must* also be defined. |
| `QUEST_GPU_NUM_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32. For AMD GPUs this *should* be a multiple of 64. |



Expand Down
1 change: 1 addition & 0 deletions docs/launch.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ QuEST execution can be configured prior to runtime using the below [environment

- [`QUEST_PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga84b134d552464a82d29517e1ce1309a7)
- [`QUEST_DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#gac4ab30619e411c965377c910680e242c)
- [`QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK`](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b)

Note the unit tests in the preceding section accept additional environment variables.

Expand Down
42 changes: 42 additions & 0 deletions examples/extended/user_owned_mpi.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/** @file
*
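* An example of running QuEST under user-owned MPI: the program
* itself calls MPI_Init() before initialising the QuEST environment,
* and MPI_Finalize() after finalising it.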
*
* @author Oliver Brown
*/

#include "quest.h"
#include <stdio.h>

// This example requires linking with MPI, which the CMake
// build only enables when QUEST_ENABLE_SUBCOMM is ON, which
// results in quest.h defining QUEST_COMPILE_SUBCOMM
#if QUEST_COMPILE_SUBCOMM

#include <mpi.h>

int main(void)
{
    // deployment choices handed to QuEST; the second flag declares
    // that this program (not QuEST) takes responsibility for MPI
    const int  distribute  = 1;
    const bool mpiIsOurs   = 1;
    const int  multithread = 1;
    const int  gpuAccel    = 0;

    // MPI is initialised here, before the QuEST environment...
    MPI_Init(NULL, NULL);

    initCustomMpiQuESTEnv(distribute, mpiIsOurs, gpuAccel, multithread);
    reportQuESTEnv();
    finalizeQuESTEnv();

    // ...and finalised here, after the QuEST environment
    MPI_Finalize();

    return 0;
}

#else

// fallback entry point for builds without MPI linked
int main(void)
{
    printf("Example skipped since MPI is not linked.\n");
    return 0;
}

#endif // QUEST_COMPILE_SUBCOMM
84 changes: 84 additions & 0 deletions examples/extended/user_owned_submpi.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/** @file
*
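* An example of running QuEST on a user-provided MPI sub-communicator:
* MPI_COMM_WORLD is split by rank parity, and only the odd-ranked
* processes initialise QuEST, passing their communicator to
* initCustomMpiCommQuESTEnv().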
*
* @author Oliver Brown
*/

#include "quest.h"
#include <cstdio>


// TODO:
// in this example, some processes print to stdout while
// QuEST is reporting, so their output interleaves with
// QuEST's. It may be worth introducing a sync to force
// the non-QuEST processes to wait during QuEST reporting


// This example requires linking with MPI, which the CMake
// build only enables when QUEST_ENABLE_SUBCOMM is ON, which
// results in quest.h defining QUEST_COMPILE_SUBCOMM
#if QUEST_COMPILE_SUBCOMM

#include <mpi.h>

int main (void)
{
    MPI_Init(NULL, NULL);

    // locate this process within the world communicator
    int numWorldProcs, worldRank;
    MPI_Comm_size(MPI_COMM_WORLD, &numWorldProcs);
    MPI_Comm_rank(MPI_COMM_WORLD, &worldRank);

    // odd world ranks form the quantum group, even ranks the classical group
    const int groupColour = worldRank % 2;
    const bool isQuantum = (groupColour != 0);

    std::printf("[%d] Hello from rank %d of %d in MPI_COMM_WORLD.\n", worldRank, worldRank, numWorldProcs);

    MPI_Comm splitComm;
    MPI_Comm_split(MPI_COMM_WORLD, groupColour, worldRank, &splitComm);

    // classical processes never join QuEST, so keep sentinel values
    int questRank = -1;
    int questNumProcs = -1;
    MPI_Comm quantumComm, classicalComm;

    if (isQuantum) {
        MPI_Comm_dup(splitComm, &quantumComm);
        MPI_Comm_size(quantumComm, &questNumProcs);
        MPI_Comm_rank(quantumComm, &questRank);
        std::printf("[%d] Hello from rank %d of %d in comm_quantum.\n", worldRank, questRank, questNumProcs);

        // only procs in quantum comm initialise QuEST
        std::printf("[%d] Initialising QuEST.\n", worldRank);
        initCustomMpiCommQuESTEnv(quantumComm, modeflag::USE_AUTO, modeflag::USE_AUTO);

        reportQuESTEnv();

        std::printf("[%d] Finalising QuEST.\n", worldRank);
        finalizeQuESTEnv();
    } else {
        MPI_Comm_dup(splitComm, &classicalComm);
    }

    // release the communicators this process created, then MPI itself
    MPI_Comm_free(&splitComm);
    MPI_Comm_free(isQuantum ? &quantumComm : &classicalComm);

    MPI_Finalize();

    return 0;
}

#else

// fallback entry point for builds without MPI linked
int main()
{
    std::printf("Example skipped since MPI is not linked.\n");
    return 0;
}

#endif // QUEST_COMPILE_SUBCOMM
7 changes: 1 addition & 6 deletions quest/include/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,13 @@
#cmakedefine01 QUEST_COMPILE_SUBCOMM
#cmakedefine01 QUEST_COMPILE_CUDA
#cmakedefine01 QUEST_COMPILE_CUQUANTUM
#cmakedefine01 QUEST_COMPILE_HIP

// default parameters which may have been tuned for performance when building the library
#cmakedefine QUEST_DEFAULT_NUM_THREADS_PER_BLOCK @QUEST_DEFAULT_NUM_THREADS_PER_BLOCK@

// crucial to QuEST source (informs optional NUMA usage)
#cmakedefine01 QUEST_ENABLE_NUMA


// not consulted by src (included for book-keeping)
#cmakedefine01 QUEST_COMPILE_HIP



/*
* inherit the version information from CMake.
Expand Down
26 changes: 6 additions & 20 deletions quest/include/environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,16 @@ extern "C" {
typedef struct {

// deployment modes which can be runtime disabled
int isMultithreaded;
int isGpuAccelerated;
int isDistributed;
bool userOwnsMpi;
bool isMultithreaded;
bool isGpuAccelerated;
bool isDistributed;
bool isMpiUserOwned;

// deployment modes which cannot be directly changed after compilation
int isCuQuantumEnabled;
bool isCuQuantumEnabled;

// deployment configurations which can be changed via environment variables
int isGpuSharingEnabled;
bool isGpuSharingEnabled;

// distributed configuration
int rank;
Expand All @@ -64,12 +64,6 @@ void initQuESTEnv();
*/
void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread);

/** @notyetdoced
* Advanced initialiser which lets the user positively declare that they take responsibility for MPI.
* This means we assume they have called MPI_Init, and that they will call MPI_Finalize.
*/
void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread);

/// @notyetdoced
void finalizeQuESTEnv();

Expand All @@ -92,14 +86,6 @@ int isQuESTEnvInit();
QuESTEnv getQuESTEnv();


/** @notyetdoced
* GPU thread per block control
* This is somehow probably the best pre-existing place for this. It only really applies to GPU, because for
* OpenMP the user can just export OMP_NUM_THREADS or call omp_set_num_threads.
*/
int getQuESTNumGpuThreadsPerBlock();
void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock);


// end de-mangler
#ifdef __cplusplus
Expand Down
107 changes: 107 additions & 0 deletions quest/include/experimental.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/** @file
* Experimental functions which are liable to
* API breaks within QuEST minor version releases.
* Some optional functions require compiling this
* file against MPI, despite being outside of /comm/,
* and so require opt-in macros (QUEST_COMPILE_SUBCOMM)
*
* @author Oliver Brown
* @author Tyson Jones (formatting)
*
* @defgroup experimental Experimental
* @ingroup api
* @brief Experimental functions with tentative APIs
* @{
*/

#ifndef EXPERIMENTAL_H
#define EXPERIMENTAL_H

#include "quest/include/config.h"

#if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI
#error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false."
#endif

#if QUEST_COMPILE_SUBCOMM
#include <mpi.h>
#endif

// enable invocation by both C and C++ binaries
#ifdef __cplusplus
extern "C" {
#endif


/** @notyetdoced
*
* Advanced initialiser which lets the user positively declare that they take responsibility for MPI.
* This means we assume they have called MPI_Init, and that they will call MPI_Finalize.
*
* @author Oliver Brown
*/
void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread);


#if QUEST_COMPILE_SUBCOMM
/** @notyetdoced
*
* Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use.
* Use of this initialiser implies userOwnsMpi = true (as exposed by initCustomMpiQuESTEnv),
* and therefore that the user has already initialised MPI, and will call MPI_Finalize at the
* appropriate time.
*
* The user-provided MPI communicator undergoes the same validation procedure as any that QuEST
* would use, and so must contain a power-of-2 number of processes.
*
* This function is only compiled and exposed when macro QUEST_COMPILE_SUBCOMM is 1, as is
* defined when providing CMake option QUEST_ENABLE_SUBCOMM during building.
*
* @author Oliver Brown
*/
void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread);
#endif // QUEST_COMPILE_SUBCOMM


/** @notyetdoced
*
* @author Oliver Brown
*/
int getQuESTNumGpuThreadsPerBlock();


/** Overrides the number of CUDA threads per block (or @p blockDim) used by QuEST's GPU-accelerated backend.
*
* This changes the GPU parallelisation granularity and can affect performance, and is useful
* for performance tuning or diagnostics. Before this function is called, QuEST will use the
* number as specified by the environment variable @p QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK,
* if defined. Otherwise, it will fall back to an internal default (presently @p 128).
*
* Practical values of @p numThreadsPerBlock can vary with the simulation size, the user's GPU hardware,
* and whether it is NVIDIA or AMD, which have respective warp sizes of @p 32 and @p 64.
*
* @note
* This function has no effect when QuEST is not deployed with GPU-acceleration enabled.
*
* @param[in] numThreadsPerBlock the new block size.
* @throws @validationerror
* - if the @p QuESTEnv is not initialised.
* - if @p numThreadsPerBlock is negative.
* - if @p numThreadsPerBlock is not a multiple of the GPU warp size.
* - if @p numThreadsPerBlock exceeds the maximum @p blockDim imposed by the GPU hardware.
* @see
* - QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK
* @author Oliver Brown
* @author Tyson Jones
*/
void setQuESTNumGpuThreadsPerBlock(int numThreadsPerBlock);


// end de-mangler
#ifdef __cplusplus
}
#endif

#endif // EXPERIMENTAL_H

/** @} */ // (end file-wide doxygen defgroup)
Loading