Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 47 additions & 3 deletions mllm/backends/qnn/QNNAllocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include "mllm/utils/Common.hpp"
#include "mllm/utils/Log.hpp"
#include <dlfcn.h>
#include <cstdio>
#include <sstream>

namespace mllm::qnn {

Expand Down Expand Up @@ -78,7 +80,25 @@ void QNNAllocator::free(Storage* storage) {

void QNNAllocator::registerQnnTensorToSharedBuffer(void* ptr, Qnn_Tensor_t& qnn_tensor) {
// Make sure there is memory that we can register to.
MLLM_RT_ASSERT(qnnMemPtrSet_.count(ptr));
if (!qnnMemPtrSet_.count(ptr)) {
std::ostringstream dims;
dims << "[";
const auto rank = QNN_TENSOR_GET_RANK(qnn_tensor);
const auto* shape = QNN_TENSOR_GET_DIMENSIONS(qnn_tensor);
for (uint32_t i = 0; i < rank; ++i) { dims << (i == 0 ? "" : ",") << shape[i]; }
dims << "]";
std::fprintf(stderr,
"QNN shared-buffer register failed: tensor='%s', ptr=%p, dtype=%d, rank=%u, dims=%s is not owned by "
"QNNAllocator (owned ptr count=%zu)\n",
QNN_TENSOR_GET_NAME(qnn_tensor) ? QNN_TENSOR_GET_NAME(qnn_tensor) : "<null>", ptr,
static_cast<int>(QNN_TENSOR_GET_DATA_TYPE(qnn_tensor)), rank, dims.str().c_str(), qnnMemPtrSet_.size());
std::fflush(stderr);
MLLM_ERROR("QNN shared-buffer register failed: tensor='{}', ptr={}, dtype={}, rank={}, dims={} is not owned by "
"QNNAllocator (owned ptr count={})",
QNN_TENSOR_GET_NAME(qnn_tensor) ? QNN_TENSOR_GET_NAME(qnn_tensor) : "<null>", ptr,
static_cast<int>(QNN_TENSOR_GET_DATA_TYPE(qnn_tensor)), rank, dims.str(), qnnMemPtrSet_.size());
MLLM_RT_ASSERT(qnnMemPtrSet_.count(ptr));
}

// if already registered, just set the mem handle
if (ptrToFdAndMemHandleMap_.count(ptr) > 0) {
Expand All @@ -90,7 +110,14 @@ void QNNAllocator::registerQnnTensorToSharedBuffer(void* ptr, Qnn_Tensor_t& qnn_

// Get the file id of this memory space.
int mem_fd = rpcmem_to_fd(ptr);
MLLM_RT_ASSERT(mem_fd != -1);
if (mem_fd == -1) {
std::fprintf(stderr, "QNN shared-buffer register failed: rpcmem_to_fd returned -1 for tensor='%s', ptr=%p\n",
QNN_TENSOR_GET_NAME(qnn_tensor) ? QNN_TENSOR_GET_NAME(qnn_tensor) : "<null>", ptr);
std::fflush(stderr);
MLLM_ERROR("QNN shared-buffer register failed: rpcmem_to_fd returned -1 for tensor='{}', ptr={}",
QNN_TENSOR_GET_NAME(qnn_tensor) ? QNN_TENSOR_GET_NAME(qnn_tensor) : "<null>", ptr);
MLLM_RT_ASSERT(mem_fd != -1);
}

// Make qnn memory descriptor. Set ION.
Qnn_MemDescriptor_t mem_descriptor = QNN_MEM_DESCRIPTOR_INIT;
Expand All @@ -106,7 +133,24 @@ void QNNAllocator::registerQnnTensorToSharedBuffer(void* ptr, Qnn_Tensor_t& qnn_

// Register to QNN memory
Qnn_MemHandle_t mem_handle = QNN_TENSOR_GET_MEM_HANDLE(qnn_tensor);
MLLM_RT_ASSERT_EQ(QNN_SUCCESS, qnnInterface_.memRegister(context_, &mem_descriptor, 1u, &mem_handle));
Qnn_ErrorHandle_t status = qnnInterface_.memRegister(context_, &mem_descriptor, 1u, &mem_handle);
if (QNN_SUCCESS != status) {
std::ostringstream dims;
dims << "[";
const auto rank = QNN_TENSOR_GET_RANK(qnn_tensor);
const auto* shape = QNN_TENSOR_GET_DIMENSIONS(qnn_tensor);
for (uint32_t i = 0; i < rank; ++i) { dims << (i == 0 ? "" : ",") << shape[i]; }
dims << "]";
std::fprintf(stderr, "QNN memRegister failed: status=%lu, tensor='%s', ptr=%p, fd=%d, dtype=%d, rank=%u, dims=%s\n",
static_cast<unsigned long>(status),
QNN_TENSOR_GET_NAME(qnn_tensor) ? QNN_TENSOR_GET_NAME(qnn_tensor) : "<null>", ptr, mem_fd,
static_cast<int>(QNN_TENSOR_GET_DATA_TYPE(qnn_tensor)), rank, dims.str().c_str());
std::fflush(stderr);
MLLM_ERROR("QNN memRegister failed: status={}, tensor='{}', ptr={}, fd={}, dtype={}, rank={}, dims={}", status,
QNN_TENSOR_GET_NAME(qnn_tensor) ? QNN_TENSOR_GET_NAME(qnn_tensor) : "<null>", ptr, mem_fd,
static_cast<int>(QNN_TENSOR_GET_DATA_TYPE(qnn_tensor)), rank, dims.str());
MLLM_RT_ASSERT_EQ(QNN_SUCCESS, status);
}

QNN_TENSOR_SET_MEM_HANDLE(qnn_tensor, mem_handle);

Expand Down
18 changes: 11 additions & 7 deletions mllm/backends/qnn/QNNBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -657,13 +657,17 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
inputs.size(), graphName);
return;
}
if (outputs.size() != model->getGraphOutputTensorWrappers().size()) {
MLLM_ERROR("Output size mismatch: expected {}, got {} for graph '{}'", model->getGraphOutputTensorWrappers().size(),
outputs.size(), graphName);
return;
}

std::vector<Qnn_Tensor_t> qnn_inputs;
std::vector<Qnn_Tensor_t> qnn_outputs;
// Prepare QNN inputs
for (int i = 0; i < model->getGraphInputTensorWrappers().size(); i++) {
auto wrapper = model->getGraphInputTensorWrappers()[i];
auto& wrapper_tensor = wrapper->getDataContainer();
const auto& runtime_input = inputs[i];

// Validate input tensors
Expand All @@ -672,9 +676,9 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
return;
}

// Case of executing retrieved graph created by AOT
// input wrapper is empty, set wrapper's dataContainer(mllm::Tensor)
if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); }
// Retrieved AOT graphs may be executed repeatedly with different runtime buffers
// in diagnostic paths. Rebind on every execution so QNN sees the current tensor.
wrapper->__setDataContainer(runtime_input);

// Allocate and register the wrapper tensor with QNN allocator
// QNNAllocator will handle registered memory descriptor when needed
Expand All @@ -684,7 +688,6 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
// Prepare QNN outputs
for (int j = 0; j < model->getGraphOutputTensorWrappers().size(); j++) {
auto wrapper = model->getGraphOutputTensorWrappers()[j];
auto& wrapper_tensor = wrapper->getDataContainer();
const auto& runtime_output = outputs[j];

// Validate output tensors
Expand All @@ -693,8 +696,9 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
return;
}

// output wrapper is empty, set wrapper's dataContainer(mllm::Tensor)
if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_output); }
// Retrieved AOT graphs may be executed repeatedly with different runtime buffers
// in diagnostic paths. Rebind on every execution so QNN writes to the current tensor.
wrapper->__setDataContainer(runtime_output);

// alloc and register qnn tensor
wrapper->alloc(); // QNNAllocator will handle registered memory descriptor
Expand Down
32 changes: 32 additions & 0 deletions mllm/backends/qnn/QNNModel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,34 @@

#include "mllm/backends/qnn/QNNModel.hpp"
#include <cassert>
#include <cstdlib>
#include <sstream>
#include "mllm/backends/qnn/QNNTypeMacros.hpp"
#include "mllm/backends/qnn/QNNUtils.hpp"
#include "mllm/utils/Log.hpp"

namespace mllm::qnn {

namespace {

// Reports whether QNN graph I/O tensor dumping is switched on through the
// MLLM_QNN_DUMP_IO environment variable.
//
// Returns false when the variable is unset or is exactly "0"; any other value
// (including an empty string) enables dumping. The environment is queried on
// every call.
bool shouldDumpQnnIO() {
  const char* env_value = std::getenv("MLLM_QNN_DUMP_IO");
  if (env_value == nullptr) { return false; }

  const std::string setting(env_value);
  const bool explicitly_disabled = (setting == "0");
  return !explicitly_disabled;
}

// Renders a dimension list as a bracketed, comma-separated string for logging,
// e.g. {1, 2, 3} -> "[1, 2, 3]". An empty list yields "[]".
std::string dimsToString(const std::vector<uint32_t>& dims) {
  std::ostringstream out;
  out << "[";

  // The separator is empty before the first element and ", " afterwards, so no
  // trailing delimiter is ever written.
  const char* separator = "";
  for (const uint32_t extent : dims) {
    out << separator << extent;
    separator = ", ";
  }

  out << "]";
  return out.str();
}

} // namespace

template<typename... Args>
void freeMultiPtr(Args... args) {
(free(args), ...);
Expand Down Expand Up @@ -112,6 +134,11 @@ ModelError_t QNNModel::loadGraphTensorInfo(const Qnn_Tensor_t* inputTensors, uin

inputTensorWrappers_.push_back(wrapper);
tensorWrapperMap_[tensorName] = wrapper;

if (shouldDumpQnnIO()) {
MLLM_INFO("QNN graph {} input[{}]: name='{}', dtype={}, dims={}", graphName_, i, tensorName,
static_cast<int>(QNN_TENSOR_GET_DATA_TYPE(tensor)), dimsToString(dimensions));
}
}

// Create wrappers for output tensors
Expand All @@ -134,6 +161,11 @@ ModelError_t QNNModel::loadGraphTensorInfo(const Qnn_Tensor_t* inputTensors, uin

outputTensorWrappers_.push_back(wrapper);
tensorWrapperMap_[tensorName] = wrapper;

if (shouldDumpQnnIO()) {
MLLM_INFO("QNN graph {} output[{}]: name='{}', dtype={}, dims={}", graphName_, i, tensorName,
static_cast<int>(QNN_TENSOR_GET_DATA_TYPE(tensor)), dimsToString(dimensions));
}
}

MLLM_INFO("QNNModel::loadGraphTensorInfo() loaded {} input tensors and {} output tensors for graph: {}", numInputTensors,
Expand Down
3 changes: 1 addition & 2 deletions mllm/backends/qnn/QNNUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,9 +209,8 @@ class QNNTensorWrapper {

bool isAlloc() { return isAlloc_; }
void __setDataContainer(const Tensor& tensor) {
MLLM_RT_ASSERT(dataContainer_.isNil())
dataContainer_ = tensor;
if (!tensor.isNil()) { isAlloc_ = true; }
isAlloc_ = !tensor.isNil();
}

// Helper to set complex quantization params and manage memory
Expand Down
52 changes: 44 additions & 8 deletions mllm/backends/qnn/aot/QnnWrappersAPI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,18 @@

namespace mllm::qnn::aot {

namespace {

// Picks the name used for the QNN tensor that corresponds to an IR tensor value.
//
// - Null `v`: returns an empty string.
// - `v` carries a symbol attribute: returns that attribute's string.
// - Otherwise: returns the IR value's own name.
std::string qnnTensorNameFromIR(const ir::tensor::TensorValue::ptr_t& v) {
  if (!v) { return ""; }

  // The symbol attribute takes precedence over the plain IR value name.
  if (v->hasSymbolAttr()) { return v->getSymbolAttr()->str(); }

  return v->name();
}

} // namespace

QnnAOTNodeTensor::QnnAOTNodeTensor(const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight) {
auto type = parseQnnTensorTypeFromIR(v);
auto name = v->name();
auto name = parseQnnTensorNameFromIR(v);
auto quant = parseQnnQuantizeParamFromIR(v);

if (force_static_weight || type == QNN_TENSOR_TYPE_STATIC) {
Expand Down Expand Up @@ -103,7 +112,9 @@ Qnn_DataType_t QnnAOTNodeTensor::parseQnnDataTypeFromIR(const ir::tensor::Tensor
return mllm::qnn::mllmDataTypeToQnnDataType(v->tensor_.dtype());
}

std::string QnnAOTNodeTensor::parseQnnTensorNameFromIR(const ir::tensor::TensorValue::ptr_t& v) { return v->name(); }
// Returns the QNN tensor name for the IR tensor value `v`.
// Thin wrapper: forwards to the file-local qnnTensorNameFromIR() helper so the
// member function and other callers in this file share one naming rule.
std::string QnnAOTNodeTensor::parseQnnTensorNameFromIR(const ir::tensor::TensorValue::ptr_t& v) {
return qnnTensorNameFromIR(v);
}

Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::tensor::TensorValue::ptr_t& v) {
Qnn_QuantizeParams_t ret = QNN_QUANTIZE_PARAMS_INIT;
Expand Down Expand Up @@ -139,10 +150,30 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten
MLLM_ERROR_EXIT(ExitCode::kCoreError, "SymPerTensor quant recipe has no scale. tensor: {}", v->name());
}

MLLM_RT_ASSERT_EQ(cfg->quant_to_type, kUInt8);
int32_t offset = 0;
switch (cfg->quant_to_type) {
case kUInt8: {
offset = -128;
break;
}
case kUInt16: {
offset = -32768;
break;
}
case kInt8:
case kInt16: {
offset = 0;
break;
}
default: {
MLLM_ERROR_EXIT(ExitCode::kCoreError, "Unsupported SymPerTensor quant target type {} for tensor: {}",
nameOfType(cfg->quant_to_type), v->name());
}
}

ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = -128};
MLLM_INFO("Configuring SymPerTensor quantization for tensor: {}, scale: {}", v->name(), cfg->scale.item<float>());
ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = offset};
MLLM_INFO("Configuring SymPerTensor quantization for tensor: {}, scale: {}, offset: {}", v->name(),
cfg->scale.item<float>(), offset);
break;
}
default: {
Expand Down Expand Up @@ -335,8 +366,13 @@ void QnnAOTGraph::addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op) {
for (auto& in : qnn_op->inputs) qnn_model_->addTensorWrapper(in->getWrapper());
for (auto& out : qnn_op->outputs) qnn_model_->addTensorWrapper(out->getWrapper());

qnn_model_->addNode(QNN_OPCONFIG_VERSION_1, qnn_op->name_, qnn_op->package_name_, qnn_op->op_name_, qnn_op->param_tensor,
qnn_op->param_scalar, inputNames, outputNames);
auto add_node_status =
qnn_model_->addNode(QNN_OPCONFIG_VERSION_1, qnn_op->name_, qnn_op->package_name_, qnn_op->op_name_,
qnn_op->param_tensor, qnn_op->param_scalar, inputNames, outputNames);
if (add_node_status != mllm::qnn::MODEL_NO_ERROR) {
MLLM_ERROR_EXIT(ExitCode::kCoreError, "QNN AOT failed to add node {} (op type {}) to graph.", qnn_op->name_,
qnn_op->op_name_);
}

op_node_.insert({qnn_op->getName(), qnn_op});
}
Expand Down Expand Up @@ -686,7 +722,7 @@ void QnnAOTEnv::captureAOTNodeOp(const std::string& qnn_context_name, const std:

QnnAOTNodeTensor::ptr_t QnnAOTEnv::captureQnnAOTNodeTensor(const std::string& qnn_context_name, const std::string& graph_name,
const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight) {
auto __qnn_tensor_name = v->name();
auto __qnn_tensor_name = qnnTensorNameFromIR(v);

bool __qnn_enable_static_weight = force_static_weight;

Expand Down
10 changes: 10 additions & 0 deletions mllm/backends/qnn/aot/visitor/Matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "mllm/utils/Common.hpp"
#include "mllm/compile/ir/linalg/Op.hpp"
#include "mllm/compile/ir/builtin/Attribute.hpp"
#include "mllm/core/aops/MatMulOp.hpp"
#include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp"
#include "mllm/backends/qnn/aot/visitor/Matmul.hpp"
#include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp"
Expand All @@ -22,6 +23,11 @@ bool QnnAOTMatMulPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op)
MLLM_ERROR("Failed to cast to linalg::MatMulOp");
return false;
}
auto aop = dynamic_cast<mllm::aops::MatMulOp*>(matmul_op->getAOp());
if (!aop) {
MLLM_ERROR("Failed to cast AOp to aops::MatMulOp");
return false;
}

MLLM_RETURN_FALSE_IF_NOT(op->getAttr("qnn_graph_name"));
auto qnn_graph_name = op->getAttr("qnn_graph_name")->cast_<ir::StrAttr>()->data();
Expand All @@ -44,6 +50,10 @@ bool QnnAOTMatMulPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op)
->emplaceOutput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, output))
->setName(matmul_op->getAOp()->getName());

const auto& options = aop->options();
qnn_op_node->emplaceParamScalar(QNNParamScalarWrapper::create("transpose_in0", options.transpose_a));
qnn_op_node->emplaceParamScalar(QNNParamScalarWrapper::create("transpose_in1", options.transpose_b));

// Register this op node into one graph.
env->captureAOTNodeOp(qnn_context_name, qnn_graph_name, qnn_op_node);

Expand Down
Loading