Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support hit cache feature. #1266

Open
wants to merge 6 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
249 changes: 249 additions & 0 deletions backends/npu/kernels/funcs/npu_op_runner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,255 @@
#include "runtime/flags.h"
#include "runtime/runtime.h"

thread_local char g_hash_buf[g_hash_buf_size];
thread_local int g_hash_offset = 0;
constexpr int g_rShift33Bits = 33;
constexpr uint64_t MIX_STEP1 = 18397679294719823053LLU;
constexpr uint64_t MIX_STEP2 = 14181476777654086739LLU;

typedef void (*AddTensorAddrToCachedList)(void *addr);

void add_param_to_buf(const phi::DenseTensor &at_tensor) {
static const auto addTensorAddrToCachedListAddr =
GetOpApiFuncAddr("AddTensorAddrToCachedList");

AddTensorAddrToCachedList addTensorAddrToCachedListFunc =
reinterpret_cast<AddTensorAddrToCachedList>(
addTensorAddrToCachedListAddr);
if (!at_tensor.initialized()) {
MEMCPY_TO_BUF(",", 1);
return;
}
auto origin_sizes = phi::vectorize(at_tensor.dims());
auto origin_strides = phi::vectorize(at_tensor.strides());

// view shape
MEMCPY_TO_BUF(origin_sizes.data(),
static_cast<int64_t>(origin_sizes.size() * sizeof(int64_t)));
// data type
auto st = at_tensor.dtype();
MEMCPY_TO_BUF(&st, sizeof(st));
// seperator
MEMCPY_TO_BUF(",", 1);
// strides
MEMCPY_TO_BUF(origin_strides.data(),
static_cast<int64_t>(origin_strides.size() * sizeof(int64_t)));
// offset
// storage shape
aclDataType acl_data_type = ConvertToNpuDtype(st);

std::vector<int64_t> storageDims(5);
if (acl_data_type != ACL_STRING) {
storageDims.push_back(at_tensor.numel() * sizeof(st));
}
MEMCPY_TO_BUF(storageDims.data(),
static_cast<int64_t>(storageDims.size() * sizeof(int64_t)));

addTensorAddrToCachedListFunc(const_cast<void *>(at_tensor.data()));
}

void add_param_to_buf(const phi::Scalar &at_scalar) {
auto scalar_data_type = at_scalar.dtype();
switch (scalar_data_type) {
case phi::DataType::FLOAT64: {
double value = at_scalar.to<double>();
MEMCPY_TO_BUF(&value, sizeof(double));
break;
}
case phi::DataType::FLOAT32: {
float value = at_scalar.to<float>();
MEMCPY_TO_BUF(&value, sizeof(float));
break;
}
case phi::DataType::INT64: {
int64_t value = at_scalar.to<int64_t>();
MEMCPY_TO_BUF(&value, sizeof(int64_t));
break;
}
case phi::DataType::BOOL: {
bool value = at_scalar.to<bool>();
MEMCPY_TO_BUF(&value, sizeof(bool));
break;
}
default: {
break;
}
}
}

void add_param_to_buf(const phi::IntArray &phi_array) {
std::vector<int64_t> temp_array(phi_array.GetData().begin(),
phi_array.GetData().end());
MEMCPY_TO_BUF(temp_array.data(),
static_cast<int64_t>(temp_array.size() * sizeof(int64_t)));
}

void add_param_to_buf(const std::vector<phi::DenseTensor *> &phi_tensor_list) {
for (size_t i = 0; i < phi_tensor_list.size(); i++) {
add_param_to_buf(phi_tensor_list[i]);
}
auto counter = phi_tensor_list.size();
MEMCPY_TO_BUF(&counter, sizeof(counter));
}

void add_param_to_buf(const std::string &s) {
MEMCPY_TO_BUF(s.c_str(), static_cast<int64_t>(s.size()));
}

void add_param_to_buf() {}

inline uint64_t rotating_left(uint64_t x, uint8_t n) {
return (x << n) | (x >> (64 - n));
}

inline uint64_t mixture(uint64_t x) {
// constants step1(18397679294719823053) and step2(14181476777654086739) are
// used to allow hash values to be more evenly distributed after
// multiplication.
x ^= x >> g_rShift33Bits;
x *= MIX_STEP1;
x ^= x >> g_rShift33Bits;
x *= MIX_STEP2;
x ^= x >> g_rShift33Bits;

return x;
}

// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
uint64_t gen_hash(const void *key,
const int len,
const uint32_t seed = 0xdeadb0d7) {
const uint8_t *data = (const uint8_t *)key;
// the length of each block is 16 bytes
const int block_num = len / 16;
// has and hax are literal appromix to hash, and hax is the return value of
// this function.
uint64_t has = seed;
uint64_t hax = seed;

// use 9782798678568883157 and 5545529020109919103 for blocking and
// obfuscation of input data
const uint64_t c1 = 9782798678568883157LLU;
const uint64_t c2 = 5545529020109919103LLU;

const uint64_t *blocks = (const uint64_t *)(data);

for (int i = 0; i < block_num; i++) {
int even_num = 2;
uint64_t tmp1 = blocks[i * even_num];
uint64_t tmp2 = blocks[i * even_num + 1];

int8_t bits_31 = 31;
tmp1 *= c1;
tmp1 = rotating_left(tmp1, bits_31);
tmp1 *= c2;
has ^= tmp1;

int8_t bits_27 = 27;
has = rotating_left(has, bits_27);
has += hax;
// increase randomness by mul by 5 and adding a constant
has = has * 5 + 1390208809;

int8_t bits_33 = 33;
tmp2 *= c2;
tmp2 = rotating_left(tmp2, bits_33);
tmp2 *= c1;
hax ^= tmp2;

hax = rotating_left(hax, bits_31);
hax += has;
// increase randomness by mul by 5 and adding a constant
hax = hax * 5 + 944331445;
}

// the length of each block is 16 bytes
const uint8_t *tail = (const uint8_t *)(data + block_num * 16);
uint64_t t1 = 0;
uint64_t t2 = 0;
// because the size of a block is 16, different offsets are calculated for
// tail blocks for different sizes
switch (static_cast<uint64_t>(len) & 15) {
case 15:
t2 ^= ((uint64_t)tail[14]) << 48;
[[fallthrough]];
case 14:
t2 ^= ((uint64_t)tail[13]) << 40;
[[fallthrough]];
case 13:
t2 ^= ((uint64_t)tail[12]) << 32;
[[fallthrough]];
case 12:
t2 ^= ((uint64_t)tail[11]) << 24;
[[fallthrough]];
case 11:
t2 ^= ((uint64_t)tail[10]) << 16;
[[fallthrough]];
case 10:
t2 ^= ((uint64_t)tail[9]) << 8;
[[fallthrough]];
case 9:
t2 ^= ((uint64_t)tail[8]) << 0;
t2 *= c2;
t2 = rotating_left(t2, 33);
t2 *= c1;
hax ^= t2;
[[fallthrough]];
case 8:
t1 ^= ((uint64_t)tail[7]) << 56;
[[fallthrough]];
case 7:
t1 ^= ((uint64_t)tail[6]) << 48;
[[fallthrough]];
case 6:
t1 ^= ((uint64_t)tail[5]) << 40;
[[fallthrough]];
case 5:
t1 ^= ((uint64_t)tail[4]) << 32;
[[fallthrough]];
case 4:
t1 ^= ((uint64_t)tail[3]) << 24;
[[fallthrough]];
case 3:
t1 ^= ((uint64_t)tail[2]) << 16;
[[fallthrough]];
case 2:
t1 ^= ((uint64_t)tail[1]) << 8;
[[fallthrough]];
case 1:
t1 ^= ((uint64_t)tail[0]) << 0;
t1 *= c1;
t1 = rotating_left(t1, 31);
t1 *= c2;
has ^= t1;
[[fallthrough]];
default:
break; // 空语句
}

has ^= static_cast<uint64_t>(len);
hax ^= static_cast<uint64_t>(len);

has += hax;
hax += has;

has = mixture(has);
hax = mixture(hax);

has += hax;
hax += has;
return hax;
}

uint64_t calc_hash_id() {
if (g_hash_offset == g_hash_buf_max_size) {
return 0;
}
uint64_t hash_id = gen_hash(g_hash_buf, g_hash_offset);
return hash_id;
}

#ifndef PADDLE_ON_INFERENCE
#define PY_GIL_RELEASE(expr) \
if (PyGILState_Check()) { \
Expand Down
Loading