Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
222 changes: 218 additions & 4 deletions cpp/command/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@

using namespace std;

static NNEvaluator* createNNEval(int maxNumThreads, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params);
static NNEvaluator* createNNEval(int expectedConcurrentEvals, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params);
static NNEvaluator* createNNEvalWithBatchSize(int expectedConcurrentEvals, int defaultMaxBatchSize, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params);

static vector<PlayUtils::BenchmarkResults> doFixedTuneThreads(
const SearchParams& params,
Expand Down Expand Up @@ -51,6 +52,52 @@ static const int64_t defaultMaxVisits = 800;
static constexpr double defaultSecondsPerGameMove = 5.0;
static const int ternarySearchInitialMax = 32;

// Default neural net max batch size for a given number of expected concurrent
// evaluations: the count rounded up to a multiple of 4, but never less than 8.
static int getDefaultMaxBatchSize(int expectedConcurrentEvals) {
  const int numGroupsOfFour = (expectedConcurrentEvals + 3) / 4;
  const int roundedUpToFour = numGroupsOfFour * 4;
  // Enforce the minimum batch size of 8.
  return roundedUpToFour < 8 ? 8 : roundedUpToFour;
}

// Appends value to values unless it lies outside [1, 65536] or is already
// present. The order of existing elements is left untouched.
static void addUniqueInt(vector<int>& values, int value) {
  const bool inRange = value >= 1 && value <= 65536;
  if(!inRange)
    return;
  const bool alreadyPresent = std::find(values.begin(), values.end(), value) != values.end();
  if(!alreadyPresent)
    values.push_back(value);
}

// Builds the list of numNNServerThreadsPerModel values to benchmark: the base
// count multiplied by 1, 2 and 4 in turn, stopping at the first product that
// exceeds max(baseNumNNServerThreads, 4). The base count itself is always
// part of the result.
static vector<int> getNNServerThreadsToTest(int baseNumNNServerThreads) {
  testAssert(baseNumNNServerThreads >= 1);
  const int upperLimit = baseNumNNServerThreads > 4 ? baseNumNNServerThreads : 4;
  vector<int> candidates;
  for(int scale = 1; scale <= 4; scale *= 2) {
    const int candidate = baseNumNNServerThreads * scale;
    // Products only grow, so nothing past this point can fit either.
    if(candidate > upperLimit)
      return candidates;
    candidates.push_back(candidate);
  }
  return candidates;
}

static vector<int> getNNMaxBatchSizesToTest(int numSearchThreads) {
testAssert(numSearchThreads >= 1);
const int defaultMaxBatchSize = getDefaultMaxBatchSize(numSearchThreads);
vector<int> ret;
const int fixedCandidates[] = {8,16,32,64};
for(int batchSize: fixedCandidates)
addUniqueInt(ret,batchSize);
if(defaultMaxBatchSize >= 128)
addUniqueInt(ret,128);
addUniqueInt(ret,defaultMaxBatchSize);
sort(ret.begin(),ret.end());
return ret;
}

// Neural net evaluations per second for one benchmark result. A small constant
// is added to the elapsed time so the division is well-defined even when
// totalSeconds is zero.
static double getNNEvalsPerSecond(const PlayUtils::BenchmarkResults& result) {
  const auto paddedSeconds = result.totalSeconds + 0.00001;
  return result.numNNEvals / paddedSeconds;
}

int MainCmds::benchmark(const vector<string>& args) {
Board::initHash();
ScoreValue::initTables();
Expand Down Expand Up @@ -256,6 +303,7 @@ int MainCmds::benchmark(const vector<string>& args) {
cout << "Your GTP config is currently set to trtUseFP16 = " << nnEval->getUsingFP16Mode().toString() << endl;
if(nnEval->getUsingFP16Mode() == enabled_t::False)
cout << "If you have a strong GPU capable of FP16 tensor cores (e.g. RTX2080) setting this to true may give a large performance boost." << endl;
cout << "For repeated TensorRT benchmark or genconfig runs with the same model/GPU/batch size, building with -DUSE_CACHE_TENSORRT_PLAN=1 can greatly reduce startup time." << endl;
#endif
#ifdef USE_METAL_BACKEND
cout << "You are currently using the Metal version of KataGo." << endl;
Expand Down Expand Up @@ -320,10 +368,11 @@ static void warmStartNNEval(const CompactSgf& sgf, Logger& logger, const SearchP
delete bot;
}

static NNEvaluator* createNNEval(int maxNumThreads, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params) {
int expectedConcurrentEvals = maxNumThreads;
const int defaultMaxBatchSize = std::max(8,((maxNumThreads+3)/4)*4);
// Creates an NNEvaluator using the default max batch size derived from
// expectedConcurrentEvals; every other argument is forwarded unchanged to
// createNNEvalWithBatchSize.
static NNEvaluator* createNNEval(int expectedConcurrentEvals, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params) {
  const int defaultMaxBatchSize = getDefaultMaxBatchSize(expectedConcurrentEvals);
  return createNNEvalWithBatchSize(
    expectedConcurrentEvals, defaultMaxBatchSize, sgf, modelFile, logger, cfg, params
  );
}

static NNEvaluator* createNNEvalWithBatchSize(int expectedConcurrentEvals, int defaultMaxBatchSize, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params) {
Rand seedRand;

#ifdef USE_EIGEN_BACKEND
Expand Down Expand Up @@ -632,6 +681,8 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
vector<int> configDeviceIdxs;
int configNNCacheSizePowerOfTwo = 20;
int configNNMutexPoolSizePowerOfTwo = 16;
int configNNMaxBatchSize = -1;
int configNumNNServerThreadsPerModel = 1;
int configNumSearchThreads = 6;

cout << endl;
Expand Down Expand Up @@ -783,6 +834,8 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
}
});
}
if(configDeviceIdxs.size() > 0)
configNumNNServerThreadsPerModel = (int)configDeviceIdxs.size();
#endif

{
Expand Down Expand Up @@ -825,9 +878,15 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
bool skipThreadTuning = false;
if(FileUtils::exists(outputFile)) {
int oldConfigNumSearchThreads = -1;
int oldConfigNumNNServerThreadsPerModel = -1;
int oldConfigNNMaxBatchSize = -1;
try {
ConfigParser oldCfg(outputFile);
oldConfigNumSearchThreads = oldCfg.getInt("numSearchThreads",1,4096);
if(oldCfg.contains("numNNServerThreadsPerModel"))
oldConfigNumNNServerThreadsPerModel = oldCfg.getInt("numNNServerThreadsPerModel",1,1024);
if(oldCfg.contains("nnMaxBatchSize"))
oldConfigNNMaxBatchSize = oldCfg.getInt("nnMaxBatchSize",1,65536);
}
catch(const StringError&) {
cout << "NOTE: Overwritten config does not specify numSearchThreads or otherwise could not be parsed." << endl;
Expand All @@ -842,6 +901,10 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
);
if(skipThreadTuning) {
configNumSearchThreads = oldConfigNumSearchThreads;
if(oldConfigNumNNServerThreadsPerModel > 0)
configNumNNServerThreadsPerModel = oldConfigNumNNServerThreadsPerModel;
if(oldConfigNNMaxBatchSize > 0)
configNNMaxBatchSize = oldConfigNNMaxBatchSize;
}
}
}
Expand All @@ -855,8 +918,10 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
configMaxTime,
configMaxPonderTime,
configDeviceIdxs,
configNNMaxBatchSize,
configNNCacheSizePowerOfTwo,
configNNMutexPoolSizePowerOfTwo,
configNumNNServerThreadsPerModel,
configNumSearchThreads
);
};
Expand Down Expand Up @@ -968,6 +1033,155 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
configNumSearchThreads = results[bestIdx].numThreads;

delete nnEval;
nnEval = NULL;

#ifndef USE_EIGEN_BACKEND
#ifdef USE_TENSORRT_BACKEND
cout << "Tip: For repeated TensorRT genconfig runs on the same model/GPU/batch size, a build with -DUSE_CACHE_TENSORRT_PLAN=1 can make startup much faster." << endl;
#endif

{
int baseNumNNServerThreads = configDeviceIdxs.size() > 0 ? (int)configDeviceIdxs.size() : 1;
vector<int> numNNServerThreadsToTest = getNNServerThreadsToTest(baseNumNNServerThreads);

if(numNNServerThreadsToTest.size() > 1) {
cout << endl;
cout << "=========================================================================" << endl;
cout << "TUNING NEURAL NET SERVER THREADS NOW" << endl;
cout << "Tuning numNNServerThreadsPerModel using nnEvals/s at "
<< configNumSearchThreads << " numSearchThreads." << endl;

int bestNumNNServerThreads = configNumNNServerThreadsPerModel;
double bestNNEvalsPerSecond = -1.0;

for(int numNNServerThreads: numNNServerThreadsToTest) {
configNumNNServerThreadsPerModel = numNNServerThreads;
updateConfigContents();

istringstream nnServerInConfig(configFileContents);
ConfigParser nnServerCfg(nnServerInConfig);
Logger nnServerLogger(&nnServerCfg, logToStdOut);
Setup::initializeSession(nnServerCfg);

SearchParams nnServerParams = Setup::loadSingleParams(nnServerCfg,Setup::SETUP_FOR_BENCHMARK);
nnServerParams.maxVisits = maxVisits;
nnServerParams.maxPlayouts = maxVisits;
nnServerParams.maxTime = 1e20;
nnServerParams.searchFactorAfterOnePass = 1.0;
nnServerParams.searchFactorAfterTwoPass = 1.0;

int maxNumThreadsForBatch = std::max(configNumSearchThreads,numNNServerThreads);
NNEvaluator* nnServerEval = createNNEval(maxNumThreadsForBatch, *sgf, modelFile, nnServerLogger, nnServerCfg, nnServerParams);
auto getNNServerDesiredBatchSize = [&](int currentNumThreads) {
(void)currentNumThreads;
return nnServerEval->getMaxBatchSize();
};

vector<int> numThreads = {configNumSearchThreads};
vector<PlayUtils::BenchmarkResults> nnServerResults = doFixedTuneThreads(
nnServerParams,*sgf,numPositionsPerGame,nnServerEval,nnServerLogger,secondsPerGameMove,numThreads,false,getNNServerDesiredBatchSize
);
testAssert(nnServerResults.size() == 1);
double nnEvalsPerSecond = getNNEvalsPerSecond(nnServerResults[0]);
cout << "numNNServerThreadsPerModel = " << numNNServerThreads
<< ": nnEvals/s = " << Global::strprintf("%.2f",nnEvalsPerSecond)
<< " visits/s = " << Global::strprintf("%.2f",nnServerResults[0].totalVisits / (nnServerResults[0].totalSeconds + 0.00001))
<< " avgBatchSize = " << Global::strprintf("%.2f",nnServerResults[0].avgBatchSize)
<< endl;

if(nnEvalsPerSecond > bestNNEvalsPerSecond) {
bestNNEvalsPerSecond = nnEvalsPerSecond;
bestNumNNServerThreads = numNNServerThreads;
}

delete nnServerEval;
}

configNumNNServerThreadsPerModel = bestNumNNServerThreads;
cout << "Using " << configNumNNServerThreadsPerModel
<< " numNNServerThreadsPerModel based on nnEvals/s!" << endl;
}
}

{
vector<int> nnMaxBatchSizesToTest = getNNMaxBatchSizesToTest(configNumSearchThreads);

if(nnMaxBatchSizesToTest.size() > 1) {
cout << endl;
cout << "=========================================================================" << endl;
cout << "TUNING NEURAL NET MAX BATCH SIZE NOW" << endl;
cout << "Tuning nnMaxBatchSize using nnEvals/s at "
<< configNumSearchThreads << " numSearchThreads and "
<< configNumNNServerThreadsPerModel << " numNNServerThreadsPerModel." << endl;

int bestNNMaxBatchSize = getDefaultMaxBatchSize(configNumSearchThreads);
double bestNNEvalsPerSecond = -1.0;

for(int nnMaxBatchSize: nnMaxBatchSizesToTest) {
configNNMaxBatchSize = nnMaxBatchSize;
updateConfigContents();

double nnEvalsPerSecond = -1.0;
double visitsPerSecond = -1.0;
double avgBatchSize = -1.0;
NNEvaluator* batchEval = NULL;

try {
istringstream batchInConfig(configFileContents);
ConfigParser batchCfg(batchInConfig);
Logger batchLogger(&batchCfg, logToStdOut);
Setup::initializeSession(batchCfg);

SearchParams batchParams = Setup::loadSingleParams(batchCfg,Setup::SETUP_FOR_BENCHMARK);
batchParams.maxVisits = maxVisits;
batchParams.maxPlayouts = maxVisits;
batchParams.maxTime = 1e20;
batchParams.searchFactorAfterOnePass = 1.0;
batchParams.searchFactorAfterTwoPass = 1.0;

int expectedConcurrentEvals = std::max(configNumSearchThreads,configNumNNServerThreadsPerModel);
batchEval = createNNEvalWithBatchSize(expectedConcurrentEvals, nnMaxBatchSize, *sgf, modelFile, batchLogger, batchCfg, batchParams);
auto getBatchDesiredBatchSize = [&](int currentNumThreads) {
(void)currentNumThreads;
return batchEval->getMaxBatchSize();
};

vector<int> numThreads = {configNumSearchThreads};
vector<PlayUtils::BenchmarkResults> batchResults = doFixedTuneThreads(
batchParams,*sgf,numPositionsPerGame,batchEval,batchLogger,secondsPerGameMove,numThreads,false,getBatchDesiredBatchSize
);
testAssert(batchResults.size() == 1);

nnEvalsPerSecond = getNNEvalsPerSecond(batchResults[0]);
visitsPerSecond = batchResults[0].totalVisits / (batchResults[0].totalSeconds + 0.00001);
avgBatchSize = batchResults[0].avgBatchSize;

if(nnEvalsPerSecond > bestNNEvalsPerSecond) {
bestNNEvalsPerSecond = nnEvalsPerSecond;
bestNNMaxBatchSize = nnMaxBatchSize;
}
}
catch(const StringError& e) {
cout << "nnMaxBatchSize = " << nnMaxBatchSize << " failed: " << e.what() << endl;
}

delete batchEval;

if(nnEvalsPerSecond >= 0.0) {
cout << "nnMaxBatchSize = " << nnMaxBatchSize
<< ": nnEvals/s = " << Global::strprintf("%.2f",nnEvalsPerSecond)
<< " visits/s = " << Global::strprintf("%.2f",visitsPerSecond)
<< " avgBatchSize = " << Global::strprintf("%.2f",avgBatchSize)
<< endl;
}
}

configNNMaxBatchSize = bestNNMaxBatchSize;
cout << "Using " << configNNMaxBatchSize
<< " nnMaxBatchSize based on nnEvals/s!" << endl;
}
}
#endif
}

updateConfigContents();
Expand Down
2 changes: 2 additions & 0 deletions cpp/command/runtests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -377,8 +377,10 @@ int MainCmds::runtinynntests(const vector<string>& args) {
maxTime,
maxPonderTime,
std::vector<int>(),
-1,
nnCacheSizePowerOfTwo,
nnMutexPoolSizePowerOfTwo,
1,
numSearchThreads
);
istringstream in(cfgStr);
Expand Down
5 changes: 5 additions & 0 deletions cpp/configs/gtp_example.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,11 @@ searchFactorWhenWinningThreshold = 0.95
# if running out of memory, or using multiple GPUs that expect to share work.
# nnMaxBatchSize = <integer>

# TensorRT users who repeatedly run genconfig or benchmark with the same
# model/GPU/batch settings may greatly reduce startup time by building with
# CMake option -DUSE_CACHE_TENSORRT_PLAN=1. This is not recommended for
# distributed clients, which update models frequently.

# Controls the neural network cache size, which is the primary RAM/memory use.
# KataGo will cache up to (2 ** nnCacheSizePowerOfTwo) many neural net
# evaluations in case of transpositions in the tree.
Expand Down
30 changes: 23 additions & 7 deletions cpp/program/gtpconfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,17 @@ searchFactorWhenWinningThreshold = 0.95
# Maximum number of positions to send to a single GPU at once. The default
# value is roughly equal to numSearchThreads, but can be specified manually
# if running out of memory, or using multiple GPUs that expect to share work.
# nnMaxBatchSize = <integer>
$$NN_MAX_BATCH_SIZE
#
# Number of neural net server threads per model. Usually this is the number of
# GPUs, but genconfig may tune this higher to run multiple backend contexts on
# the same GPU when that improves nnEvals/s.
# numNNServerThreadsPerModel = 1
#
# TensorRT users who repeatedly run genconfig or benchmark with the same
# model/GPU/batch settings may greatly reduce startup time by building with
# CMake option -DUSE_CACHE_TENSORRT_PLAN=1. This is not recommended for
# distributed clients, which update models frequently.

# Controls the neural network cache size, which is the primary RAM/memory use.
# KataGo will cache up to (2 ** nnCacheSizePowerOfTwo) many neural net
Expand Down Expand Up @@ -466,10 +476,13 @@ string GTPConfig::makeConfig(
double maxTime,
double maxPonderTime,
const std::vector<int>& deviceIdxs,
int nnMaxBatchSize,
int nnCacheSizePowerOfTwo,
int nnMutexPoolSizePowerOfTwo,
int numNNServerThreadsPerModel,
int numSearchThreads
) {
testAssert(numNNServerThreadsPerModel >= 1);
string config = gtpBasePart1 + gtpBasePart2;
auto replace = [&](const string& key, const string& replacement) {
size_t pos = config.find(key);
Expand Down Expand Up @@ -518,25 +531,28 @@ string GTPConfig::makeConfig(
else replace("$$PONDERING", "ponderingEnabled = true\n# maxTimePondering = 60.0");

replace("$$NUM_SEARCH_THREADS", Global::intToString(numSearchThreads));
if(nnMaxBatchSize > 0) replace("$$NN_MAX_BATCH_SIZE", "nnMaxBatchSize = " + Global::intToString(nnMaxBatchSize));
else replace("$$NN_MAX_BATCH_SIZE", "# nnMaxBatchSize = <integer>");
replace("$$NN_CACHE_SIZE_POWER_OF_TWO", Global::intToString(nnCacheSizePowerOfTwo));
replace("$$NN_MUTEX_POOL_SIZE_POWER_OF_TWO", Global::intToString(nnMutexPoolSizePowerOfTwo));

if(deviceIdxs.size() <= 0) {
if(deviceIdxs.size() <= 0 && numNNServerThreadsPerModel <= 1) {
replace("$$MULTIPLE_GPUS", "");
}
else {
string replacement = "";
replacement += "numNNServerThreadsPerModel = " + Global::uint64ToString(deviceIdxs.size()) + "\n";
replacement += "numNNServerThreadsPerModel = " + Global::intToString(numNNServerThreadsPerModel) + "\n";

for(int i = 0; i<deviceIdxs.size(); i++) {
for(int i = 0; i<numNNServerThreadsPerModel; i++) {
int deviceIdx = deviceIdxs.size() <= 0 ? 0 : deviceIdxs[(size_t)i % deviceIdxs.size()];
#ifdef USE_CUDA_BACKEND
replacement += "cudaDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n";
replacement += "cudaDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdx) + "\n";
#endif
#ifdef USE_TENSORRT_BACKEND
replacement += "trtDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n";
replacement += "trtDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdx) + "\n";
#endif
#ifdef USE_OPENCL_BACKEND
replacement += "openclDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n";
replacement += "openclDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdx) + "\n";
#endif
}
replace("$$MULTIPLE_GPUS", replacement);
Expand Down
2 changes: 2 additions & 0 deletions cpp/program/gtpconfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ namespace GTPConfig {
double maxTime,
double maxPonderTime,
const std::vector<int>& deviceIdxs,
int nnMaxBatchSize,
int nnCacheSizePowerOfTwo,
int nnMutexPoolSizePowerOfTwo,
int numNNServerThreadsPerModel,
int numSearchThreads
);
}
Expand Down