[MNN:Sync] Sync Internal 2.9.1
xiaying committed Jun 3, 2024
1 parent cf23784 commit 5b19cbb
Showing 106 changed files with 6,757 additions and 2,117 deletions.
13 changes: 12 additions & 1 deletion docs/faq.md
@@ -225,8 +225,19 @@ OpenCL / Vulkan backends register themselves with the main MNN library via static-variable self-registration.
- Usually this means the tensor's host memory was accessed directly
- Create a host tensor and copy as described in [Input Data](./inference/session.html#id8) and [Get Output](./inference/session.html#id21), and adjust the calling code accordingly
- Can a GPU address be passed directly based on deviceId?
- Yes, but this requires understanding MNN's GPU memory layout and passing the context to MNN; the implementation is fairly complex
- Yes: use setDevicePtr to set the GPU address of an input VARP, and copyToDevicePtr to set the GPU address an output VARP is copied to
- For usage see tools/cpp/GpuInterTest.cpp (a sketch also follows this list)
- Currently OPENCL inference supports OPENCL/OPENGL memory as input/output, and CUDA inference supports CUDA memory as input/output
- With the MNN_Express family of APIs, memory can be passed directly between models without copying
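
A minimal sketch of this device-pointer path, loosely modeled on tools/cpp/GpuInterTest.cpp, is shown below. The exact signatures of setDevicePtr / copyToDevicePtr and the memory-type flag are assumptions to verify against the current MNN/expr headers.

```cpp
// Hypothetical sketch: run a module with GPU buffers as input and output,
// avoiding host round-trips. The setDevicePtr / copyToDevicePtr signatures
// (device pointer + backend-specific memory-type flag) are assumed.
#include <MNN/expr/ExprCreator.hpp>
#include <MNN/expr/Module.hpp>

using namespace MNN::Express;

void runWithDevicePointers(Module* net, void* gpuInput, void* gpuOutput, int memoryType) {
    // Describe the input shape/layout, then point the VARP at the existing
    // GPU buffer instead of filling host memory.
    auto input = _Input({1, 3, 224, 224}, NCHW);
    input->setDevicePtr(gpuInput, memoryType);           // assumed API

    auto outputs = net->onForward({input});

    // Copy the result straight into a caller-owned GPU buffer.
    outputs[0]->copyToDevicePtr(gpuOutput, memoryType);  // assumed API
}
```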
### Selecting a specific GPU for inference on multi-GPU machines
- Specify the target GPU by setting the fields of the MNNDeviceContext struct
- The selection is made through the platformSize, platformId and deviceId fields
- Currently the OpenCL and CUDA backends support this setting
- For details see tools/cpp/testModel.cpp (a sketch follows this list)
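
The following sketch shows one way the selection might look; it assumes MNNDeviceContext is declared in MNN/MNNSharedContext.h and is handed to MNN through BackendConfig::sharedContext, which should be checked against tools/cpp/testModel.cpp.

```cpp
// Hypothetical sketch: run OpenCL inference on a chosen platform/device.
// Field names follow this FAQ; the header location and the use of
// BackendConfig::sharedContext are assumptions.
#include <MNN/Interpreter.hpp>
#include <MNN/MNNSharedContext.h>   // assumed to declare MNNDeviceContext

MNN::Session* createSessionOnGpu(MNN::Interpreter* net, int platformId, int deviceId) {
    MNNDeviceContext gpuContext;
    gpuContext.platformSize = 1;          // number of platforms being described
    gpuContext.platformId   = platformId; // which OpenCL platform
    gpuContext.deviceId     = deviceId;   // which device on that platform

    MNN::BackendConfig backendConfig;
    backendConfig.sharedContext = &gpuContext;

    MNN::ScheduleConfig config;
    config.type          = MNN_FORWARD_OPENCL;
    config.backendConfig = &backendConfig;
    return net->createSession(config);
}
```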
## Performance
### copyToHostTensor / copyFromHostTensor is very slow when using the GPU
The time a GPU backend spends in a copy call consists of two parts
8 changes: 7 additions & 1 deletion express/Executor.cpp
@@ -595,7 +595,13 @@ void Executor::_makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
Session::ModeGroup group;
group.inputMode = Interpreter::Session_Input_User;
group.outputMode = Interpreter::Session_Output_User;
group.callBackMode = Interpreter::Session_Release;
auto globalExecutor = ExecutorScope::Current();
auto debug = globalExecutor->getDebugTools();
if (debug->after != nullptr && debug->before != nullptr) {
group.callBackMode = Interpreter::Session_Debug;
} else {
group.callBackMode = Interpreter::Session_Release;
}
group.memoryUsageMode = Interpreter::Session_Memory_Cache;
std::shared_ptr<ComputeCache> cahce(new ComputeCache);
for (auto& iter : dstExpr) {
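
The hunk above makes _makeCache pick Session_Debug whenever both per-op callbacks are registered on the current Executor. A sketch of registering such callbacks follows; it assumes Executor::setCallBack taking TensorCallBackWithInfo functors, as MNN's debug tooling uses, so verify against MNN/expr/Executor.hpp.

```cpp
// Hypothetical sketch: register before/after per-op callbacks so that
// Express modules run their sessions in debug mode (per the change above).
// Executor::setCallBack is an assumed name; TensorCallBackWithInfo and
// OperatorInfo come from MNN/Interpreter.hpp.
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>

using namespace MNN;
using namespace MNN::Express;

void enableOpTracing() {
    TensorCallBackWithInfo before = [](const std::vector<Tensor*>& inputs, const OperatorInfo* info) {
        MNN_PRINT("before %s\n", info->name().c_str());
        return true;   // returning false would skip the op's execution
    };
    TensorCallBackWithInfo after = [](const std::vector<Tensor*>& outputs, const OperatorInfo* info) {
        MNN_PRINT("after  %s\n", info->name().c_str());
        return true;
    };
    ExecutorScope::Current()->setCallBack(std::move(before), std::move(after));
}
```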
16 changes: 15 additions & 1 deletion express/Utils.cpp
@@ -10,6 +10,7 @@
#include <map>
#include <set>
#include <stack>
#include <MNN/expr/ExecutorScope.hpp>
#include "MNN_generated.h"
#include "core/TensorUtils.hpp"
#include "core/Session.hpp"
@@ -188,6 +189,9 @@ void* Executor::ComputeCache::mapOutput(int offset, Tensor* dest) {
//MNN_ASSERT(nullptr != ptr);
return ptr;
}
if (0 == tensor->usize()) {
return nullptr;
}
Utils::allocMemoryForHostTensor(dest);
tensor->copyToHostTensor(dest);
MNN_ASSERT(nullptr != dest->host<void>());
@@ -213,6 +217,9 @@ ErrorCode Executor::ComputeCache::compute() {
std::stack<ComputeCache*> dfsStack;
std::set<ComputeCache*> visited;
dfsStack.push(this);
ErrorCode code = NO_ERROR;
auto globalExecutor = ExecutorScope::Current();
auto debug = globalExecutor->getDebugTools();
while (!dfsStack.empty()) {
//printf("stcak = %d\n", dfsStack.size());
auto cache = dfsStack.top();
@@ -248,7 +255,14 @@
} else {
visited.insert(cache);
dfsStack.pop();
cache->mSession->run();
if (debug->after != nullptr && debug->before != nullptr) {
code = mSession->runWithCallBack(debug->before, debug->after);
} else {
code = mSession->run();
}
if (NO_ERROR != code) {
return code;
}
cache->mContentDirty = false;
}
}
24 changes: 18 additions & 6 deletions express/module/StaticModule.cpp
@@ -185,7 +185,8 @@ void StaticModule::resetInputOutputs() {
des->usage = Tensor::InsideDescribe::INPUT;
}
pipelineInfo.first.inputTensorCopyCache.insert(std::make_pair(mInputTensors[i], std::make_tuple(nullptr, nullptr, true, true)));
mPrevInputTensor[i] = nullptr;
mPrevInputTensor[i].first = nullptr;
mPrevInputTensor[i].second = nullptr;
}
mOutputTensors.resize(mResource->mOutputFromTensor.size());
for (int i = 0; i < mResource->mOutputFromTensor.size(); ++i) {
@@ -221,9 +222,15 @@ StaticModule::StaticModule(std::vector<int> inputs,
if (bnCache.cache.first->type() == MNN_FORWARD_CPU) {
bnCache.cache.second = bnCache.cache.first;
} else {
// Use Multi-thread if user has set numberthread > 1
BackendConfig defaultConfig;
defaultConfig.flags = 4;
bnCache.cache.second.reset(rt.second->onCreate(&defaultConfig));
auto cpurt = rt.first.find(MNN_FORWARD_CPU);
if (cpurt != rt.first.end()) {
bnCache.cache.second.reset(cpurt->second->onCreate(&defaultConfig));
} else {
bnCache.cache.second.reset(rt.second->onCreate(&defaultConfig));
}
}
if (config.rearrange) {
mResource->mBuffer = preRearrangeWeights(scheduleInfo, bnCache.cache.first.get(), bnCache.cache.second.get());
@@ -285,7 +292,8 @@ StaticModule::~StaticModule() {
void StaticModule::onClearCache() {
if (nullptr != mSession) {
for (int i=0; i<mPrevInputTensor.size(); ++i) {
mPrevInputTensor[i] = nullptr;
mPrevInputTensor[i].first = nullptr;
mPrevInputTensor[i].second = nullptr;
}
for (auto& iter : mSession->getPipelineInfo(0).first.inputTensorCopyCache) {
std::get<3>(iter.second) = true;
@@ -318,19 +326,23 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
MNN_ASSERT(inputs.size() == mInputTensors.size());
auto& pipelineInfo = mSession->getPipelineInfo(0);
if (mResource->mModes.inputMode == Interpreter::Session_Input_User) {
pipelineInfo.first.inputBackendChange = false;
for (int i = 0; i < inputs.size(); ++i) {
if (nullptr == mInputTensors[i]) {
continue;
}
auto inputTensor = Utils::getTensor(inputs[i]);
Schedule::TENSORCACHE* cacheTensor = nullptr;

if (mPrevInputTensor[i] != inputTensor) {
if (mPrevInputTensor[i].first != inputTensor) {
auto newBackend = TensorUtils::getDescribeOrigin(inputTensor)->getBackend();
if (mPrevInputTensor[i].second != newBackend) {
pipelineInfo.first.inputBackendChange = true;
}
auto cacheIter = pipelineInfo.first.inputTensorCopyCache.find(mInputTensors[i]);
cacheTensor = &cacheIter->second;
MNN_ASSERT(cacheIter != pipelineInfo.first.inputTensorCopyCache.end());
std::get<3>(cacheIter->second) = true;
mPrevInputTensor[i] = inputTensor;
mPrevInputTensor[i] = std::make_pair(inputTensor, newBackend);
if (std::get<1>(*cacheTensor) != nullptr) {
if (!WrapExecution::needWrap(inputTensor, TensorUtils::getDescribeOrigin(std::get<0>(*cacheTensor))->getBackend())) {
// No need copy now, reset it
2 changes: 1 addition & 1 deletion express/module/StaticModule.hpp
@@ -49,7 +49,7 @@ class StaticModule : public Module {
};
std::shared_ptr<Session> mSession;
std::vector<Tensor*> mInputTensors;
std::vector<Tensor*> mPrevInputTensor;
std::vector<std::pair<Tensor*, Backend*>> mPrevInputTensor;
std::vector<Tensor*> mOutputTensors;
std::shared_ptr<Resource> mResource;
};
2 changes: 1 addition & 1 deletion include/MNN/MNNDefine.h
@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 9
#define MNN_VERSION_PATCH 0
#define MNN_VERSION_PATCH 1
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
10 changes: 7 additions & 3 deletions source/backend/arm82/Arm82Unary.cpp
@@ -114,8 +114,10 @@ struct _Exp {
void operator()(void* outRaw, const void* inpRaw, int realSize) const {
auto out = (float*)outRaw;
auto inp = (const float*)inpRaw;
float offset[2] = {
float offset[4] = {
1.0f,
0.0f,
0.0f,
0.0f
};
MNNExp(out, inp, offset, realSize);
@@ -125,9 +127,11 @@ struct _ExpM1 {
void operator()(void* outRaw, const void* inpRaw, int realSize) const {
auto out = (float*)outRaw;
auto inp = (const float*)inpRaw;
float offset[2] = {
float offset[4] = {
1.0f,
-1.0f
-1.0f,
0.0f,
0.0f
};
MNNExp(out, inp, offset, realSize);
}
61 changes: 54 additions & 7 deletions source/backend/cpu/BinaryUtils.hpp
@@ -198,7 +198,26 @@ void executeVec(void* outputRaw, const void* inputRaw0, const void* inputRaw1, i

if (-1 == needBroadcastIndex) {
if (sizeDivUnit > 0) {
for (int i = 0; i < sizeDivUnit; ++i) {
int sizeDivC4 = sizeDivUnit / 4;
int sizeDivUnitRemain = sizeDivUnit % 4;
for (int i = 0; i < sizeDivC4; ++i) {
V a0 = V::load(src0);
V b0 = V::load(src1);
V a1 = V::load(src0 + 1 * pack);
V b1 = V::load(src1 + 1 * pack);
V a2 = V::load(src0 + 2 * pack);
V b2 = V::load(src1 + 2 * pack);
V a3 = V::load(src0 + 3 * pack);
V b3 = V::load(src1 + 3 * pack);
V::save(dst, compute(a0, b0));
V::save(dst+1*pack, compute(a1, b1));
V::save(dst+2*pack, compute(a2, b2));
V::save(dst+3*pack, compute(a3, b3));
src0 += 4*pack;
src1 += 4*pack;
dst += 4*pack;
}
for (int i = 0; i < sizeDivUnitRemain; ++i) {
V a = V::load(src0);
V b = V::load(src1);
V::save(dst, compute(a, b));
@@ -222,11 +241,23 @@ void executeVec(void* outputRaw, const void* inputRaw0, const void* inputRaw1, i
const U srcValue0 = src0[0];
V a = V(srcValue0);
if (sizeDivUnit > 0) {
for (int i = 0; i < sizeDivUnit; ++i) {
const auto src1Ptr = src1;
auto dstPtr = dst;
V b = V::load(src1Ptr);
V::save(dstPtr, compute(a, b));
int sizeDivC4 = sizeDivUnit / 4;
int sizeUnitRemain = sizeDivUnit % 4;
for (int i = 0; i < sizeDivC4; ++i) {
V b0 = V::load(src1);
V b1 = V::load(src1 + 1*pack);
V b2 = V::load(src1 + 2*pack);
V b3 = V::load(src1 + 3*pack);
V::save(dst, compute(a, b0));
V::save(dst+1*pack, compute(a, b1));
V::save(dst+2*pack, compute(a, b2));
V::save(dst+3*pack, compute(a, b3));
src1 += 4*pack;
dst += 4*pack;
}
for (int i = 0; i < sizeUnitRemain; ++i) {
V b = V::load(src1);
V::save(dst, compute(a, b));
src1 += pack;
dst += pack;
}
@@ -243,7 +274,23 @@ void executeVec(void* outputRaw, const void* inputRaw0, const void* inputRaw1, i
const auto srcValue1 = static_cast<U>(src1[0]);
V b = V(srcValue1);
if (sizeDivUnit > 0) {
for (int i = 0; i < sizeDivUnit; ++i) {
int sizeDivC4 = sizeDivUnit / 4;
int sizeUnitRemain = sizeDivUnit % 4;
for (int i = 0; i < sizeDivC4; ++i) {
const auto src0Ptr = src0;
auto dstPtr = dst;
V a0 = V::load(src0Ptr);
V a1 = V::load(src0Ptr + 1*pack);
V a2 = V::load(src0Ptr + 2*pack);
V a3 = V::load(src0Ptr + 3*pack);
V::save(dstPtr, compute(a0, b));
V::save(dstPtr+1*pack, compute(a1, b));
V::save(dstPtr+2*pack, compute(a2, b));
V::save(dstPtr+3*pack, compute(a3, b));
src0 += 4*pack;
dst += 4*pack;
}
for (int i = 0; i < sizeUnitRemain; ++i) {
const auto src0Ptr = src0;
auto dstPtr = dst;
V a = V::load(src0Ptr);
26 changes: 18 additions & 8 deletions source/backend/cpu/CPUAttention.cpp
@@ -113,14 +113,23 @@ static void prefill_unpack(char* pack_qkv, char* unpack_qkv, int mNumHead, int m

template <typename T>
static void prefill_softmax(int* mask_ptr, float* mask_qk, float* softmax_qk, char* unpack_qk, char* pack_qk,
float mScale, int eP, int query_e, int seq_len, float min_val) {
float mScale, int eP, int query_e, int seq_len, float min_val, bool float_mask) {
T* qk_src = reinterpret_cast<T*>(unpack_qk);
T* qk_dst = reinterpret_cast<T*>(pack_qk);
for (int i = 0; i < seq_len * seq_len; i++) {
if (mask_ptr[i]) {
mask_qk[i] = qk_src[i] * mScale;
} else {
mask_qk[i] = min_val;
if (float_mask) {
T* fpmask_ptr = reinterpret_cast<T*>(mask_ptr);
// float mask
for (int i = 0; i < seq_len * seq_len; i++) {
mask_qk[i] = qk_src[i] * mScale + fpmask_ptr[i];
}
} else {
// int mask
for (int i = 0; i < seq_len * seq_len; i++) {
if (mask_ptr[i]) {
mask_qk[i] = qk_src[i] * mScale;
} else {
mask_qk[i] = min_val;
}
}
}
for (int i = 0; i < seq_len; i++) {
@@ -258,6 +267,7 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vector<Tenso
auto key = inputs[1];
auto value = inputs[2];
auto mask = inputs[3];
bool float_mask = (mask->getType() == halide_type_of<float>());
auto shape = query->shape();
int seq_len = shape[1];
mThreadNum = ((CPUBackend *)backend())->threadNumber();
@@ -324,9 +334,9 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vector<Tenso
// div scale and mask
auto mask_ptr = mask->host<int>();
if (bytes == 2) {
prefill_softmax<FLOAT16_T>(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, query_e, seq_len, -65504.0);
prefill_softmax<FLOAT16_T>(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, query_e, seq_len, -65504.0, float_mask);
} else {
prefill_softmax<float>(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, query_e, seq_len, std::numeric_limits<float>::lowest());
prefill_softmax<float>(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, query_e, seq_len, std::numeric_limits<float>::lowest(), float_mask);
}
// qk @ v
for (int i = 0 ; i < loop_e; i++) {
10 changes: 5 additions & 5 deletions source/backend/cpu/CPUCast.cpp
@@ -14,31 +14,31 @@
#include <cmath>

namespace MNN {
ErrorCode CPUCastCreator::cast(void* const inputRaw, void* outputRaw, ConvertType type,
ErrorCode CPUCastCreator::cast(const void* inputRaw, void* outputRaw, ConvertType type,
int number, float scale, float zero, float min, float max, const CPUBackend* bn) {
auto pack = bn->functions()->pack;
int c4Size = number / pack;
int remain = number % pack;
if (type == FlOAT_TO_INT8) {
scale = (scale == 0.f ? 0.f : 1.f / scale);
std::vector<float> scales(pack, scale);
bn->int8Functions()->MNNFloat2Int8(static_cast<float*>(inputRaw), static_cast<int8_t*>(outputRaw), c4Size, scales.data(), min, max, zero);
bn->int8Functions()->MNNFloat2Int8((float*)(inputRaw), (int8_t*)(outputRaw), c4Size, scales.data(), min, max, zero);
if (remain > 0) {
std::vector<float> tempSrc(pack);
std::vector<int8_t> tempDst(pack);
::memcpy(tempSrc.data(), static_cast<float* const>(inputRaw) + c4Size * pack, remain * sizeof(float));
::memcpy(tempSrc.data(), (float*)(inputRaw) + c4Size * pack, remain * sizeof(float));
bn->int8Functions()->MNNFloat2Int8(tempSrc.data(), tempDst.data(), 1, scales.data(), min, max, zero);
::memcpy(static_cast<int8_t*>(outputRaw) + c4Size * pack, tempDst.data(), remain * sizeof(int8_t));
}
return NO_ERROR;
}
if (type == INT8_TO_FlOAT) {
std::vector<float> scales(pack, scale);
bn->int8Functions()->MNNInt8ScaleToFloat(static_cast<float*>(outputRaw), static_cast<int8_t*>(inputRaw), scales.data(), c4Size, zero);
bn->int8Functions()->MNNInt8ScaleToFloat((float*)(outputRaw), (int8_t*)(inputRaw), scales.data(), c4Size, zero);
if (remain > 0) {
std::vector<float> tempDst(pack);
std::vector<int8_t> tempSrc(pack);
::memcpy(tempSrc.data(), static_cast<int8_t* const>(inputRaw) + c4Size * pack, remain * sizeof(int8_t));
::memcpy(tempSrc.data(), (int8_t*)(inputRaw) + c4Size * pack, remain * sizeof(int8_t));
bn->int8Functions()->MNNInt8ScaleToFloat(tempDst.data(), tempSrc.data(), scales.data(), 1, zero);
::memcpy(static_cast<float*>(outputRaw) + c4Size * pack, tempDst.data(), remain * sizeof(float));
}
2 changes: 1 addition & 1 deletion source/backend/cpu/CPUCast.hpp
@@ -21,7 +21,7 @@ class CPUCastCreator : public CPUBackend::Creator {
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override;
static ErrorCode cast(const Tensor* input, const Tensor* output, const CPUBackend* bn, ConvertType type);
static ErrorCode cast(void* const inputRaw, void* outputRaw, ConvertType type, int number, float scale, float zero, float min, float max, const CPUBackend* bn);
static ErrorCode cast(const void* inputRaw, void* outputRaw, ConvertType type, int number, float scale, float zero, float min, float max, const CPUBackend* bn);
};
} // namespace MNN
#endif /* CPUCast_hpp */