-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add a clause SAMPLING, has the ability to sample based on probability #4700
Open
YolandaLyj
wants to merge
12
commits into
vesoft-inc:master
Choose a base branch
from
YolandaLyj:sampling_dev
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 2 commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
256d9b2
Add SAMPLING clause, probability-based sample
YolandaLyj d720e12
Rename sampler.h to Sampler.h
YolandaLyj 96fd97f
fix: remove unused warning
YolandaLyj 3ad90fc
Merge branch 'master' into sampling_dev
codesigner 4ad0257
fix compile
codesigner 1145b9a
Merge branch 'master' into sampling_dev
codesigner 7804f35
fix bugs found when checking cpplint code style
YolandaLyj 3536434
Merge branch 'master' into sampling_dev
codesigner 241b253
format change using clang-format
codesigner de461ae
format change using clang-format
codesigner 809cd49
Merge branch 'master' into sampling_dev
Sophie-Xie 70c9c87
Merge branch 'master' into sampling_dev
Sophie-Xie File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
/* Copyright (c) 2020 vesoft inc. All rights reserved. | ||
* | ||
* This source code is licensed under Apache 2.0 License. | ||
*/ | ||
|
||
#ifndef COMMON_ALGORITHM_SAMPLER_H_ | ||
#define COMMON_ALGORITHM_SAMPLER_H_ | ||
|
||
#include <cfloat> | ||
#include <ctime> | ||
#include <random> | ||
#include <type_traits> | ||
#include <utility> | ||
#include <vector> | ||
|
||
namespace nebula { | ||
namespace algorithm { | ||
|
||
namespace { | ||
template <typename T = float> | ||
T UniformRandom() { | ||
static_assert(std::is_floating_point<T>::value, | ||
"Only support float point type"); | ||
#if defined(__clang__) | ||
static std::default_random_engine e(std::time(nullptr)); | ||
static std::uniform_real_distribution<T> u(0., 1.); | ||
#elif defined(__GNUC__) || defined(__GNUG__) | ||
static thread_local std::default_random_engine e(std::time(nullptr)); | ||
static thread_local std::uniform_real_distribution<T> u(0., 1.); | ||
#endif | ||
return u(e); | ||
} | ||
} // namespace | ||
|
||
template <typename T> | ||
void Normalization(std::vector<T>& distribution) { | ||
static_assert(std::is_floating_point<T>::value, | ||
"Only support float point type"); | ||
T norm_sum = 0.0f; | ||
for (auto& dist : distribution) { | ||
norm_sum += dist; | ||
} | ||
if (norm_sum <= FLT_EPSILON and !distribution.empty()) { | ||
for (size_t i = 0; i < distribution.size(); ++i) { | ||
distribution[i] = 1.0f / static_cast<T>(distribution.size()); | ||
} | ||
return; | ||
} | ||
for (size_t i = 0; i < distribution.size(); ++i) { | ||
distribution[i] /= norm_sum; | ||
} | ||
} | ||
|
||
// https://en.wikipedia.org/wiki/Alias_method | ||
template <typename T = float> | ||
class AliasSampler { | ||
public: | ||
static_assert(std::is_floating_point<T>::value, | ||
"Only support float point type"); | ||
using AliasType = uint32_t; | ||
bool Init(std::vector<T>& distribution); | ||
inline bool Init(const std::vector<T>& distribution); | ||
AliasType Sample() const; | ||
inline size_t Size() const; | ||
|
||
private: | ||
std::vector<T> prob_; | ||
std::vector<AliasType> alias_; | ||
}; | ||
|
||
template <typename T> | ||
bool AliasSampler<T>::Init(std::vector<T>& distribution) { | ||
// normalization sum of distribution to 1 | ||
Normalization(distribution); | ||
|
||
prob_.resize(distribution.size()); | ||
alias_.resize(distribution.size()); | ||
std::vector<AliasType> smaller, larger; | ||
smaller.reserve(distribution.size()); | ||
larger.reserve(distribution.size()); | ||
|
||
for (size_t i = 0; i < distribution.size(); ++i) { | ||
prob_[i] = distribution[i] * distribution.size(); | ||
if (prob_[i] < 1.0) { | ||
smaller.push_back(i); | ||
} else { | ||
larger.push_back(i); | ||
} | ||
} | ||
// Construct the probability and alias tables | ||
AliasType small, large; | ||
while (!smaller.empty() && !larger.empty()) { | ||
small = smaller.back(); | ||
smaller.pop_back(); | ||
large = larger.back(); | ||
larger.pop_back(); | ||
alias_[small] = large; | ||
prob_[large] = prob_[large] + prob_[small] - 1.0; | ||
if (prob_[large] < 1.0) { | ||
smaller.push_back(large); | ||
} else { | ||
larger.push_back(large); | ||
} | ||
} | ||
while (!smaller.empty()) { | ||
small = smaller.back(); | ||
smaller.pop_back(); | ||
prob_[small] = 1.0; | ||
} | ||
while (!larger.empty()) { | ||
large = larger.back(); | ||
larger.pop_back(); | ||
prob_[large] = 1.0; | ||
} | ||
return true; | ||
} | ||
|
||
template <typename T> | ||
bool AliasSampler<T>::Init(const std::vector<T>& distribution) { | ||
std::vector<T> dist = distribution; | ||
return Init(dist); | ||
} | ||
|
||
template <typename T> | ||
typename AliasSampler<T>::AliasType AliasSampler<T>::Sample() const { | ||
AliasType roll = floor(prob_.size() * UniformRandom()); | ||
bool coin = UniformRandom() < prob_[roll]; | ||
return coin ? roll : alias_[roll]; | ||
} | ||
|
||
template <typename T> | ||
size_t AliasSampler<T>::Size() const { | ||
return prob_.size(); | ||
} | ||
|
||
/** | ||
* binary sample in accumulation weights | ||
*/ | ||
template <typename T = float> | ||
size_t BinarySampleAcc(const std::vector<T>& accumulate_weights) { | ||
if (accumulate_weights.empty()) { | ||
return 0; | ||
} | ||
T rnd = UniformRandom() * accumulate_weights.back(); | ||
size_t low = 0, high = accumulate_weights.size() - 1, mid = 0; | ||
while (low <= high) { | ||
mid = ((high - low) >> 1) + low; | ||
if (rnd < accumulate_weights[mid]) { | ||
if (mid == 0) { | ||
return mid; | ||
} | ||
high = mid - 1; | ||
if (high >= 0 && rnd >= accumulate_weights[high]) { | ||
// rnd in [mid-1, mid) | ||
return mid; | ||
} | ||
} else { | ||
low = mid + 1; | ||
if (low < accumulate_weights.size() && rnd < accumulate_weights[low]) { | ||
// rnd in [mid, mid+1) | ||
return low; | ||
} | ||
} | ||
} | ||
return mid; | ||
} | ||
|
||
/** | ||
* binary sample in weights | ||
*/ | ||
template <typename T = float> | ||
size_t BinarySample(const std::vector<T>& weights) { | ||
std::vector<T> accumulate_weights(weights.size(), 0.0f); | ||
T cur_weight = 0.0f; | ||
for (size_t i = 0; i < weights.size(); ++i) { | ||
cur_weight += weights[i]; | ||
accumulate_weights[i] = cur_weight; | ||
} | ||
Normalization(accumulate_weights); | ||
return BinarySampleAcc(accumulate_weights); | ||
} | ||
|
||
} // namespace algorithm | ||
} // namespace nebula | ||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
// Copyright (c) 2020 vesoft inc. All rights reserved. | ||
// | ||
// This source code is licensed under Apache 2.0 License. | ||
|
||
#include "graph/executor/query/SamplingExecutor.h" | ||
|
||
#include "common/algorithm/Sampler.h" | ||
#include "graph/planner/plan/Query.h" | ||
|
||
namespace nebula { | ||
namespace graph { | ||
|
||
using WeightType = float; | ||
|
||
folly::Future<Status> SamplingExecutor::execute() { | ||
SCOPED_TIMER(&execTime_); | ||
auto *sampling = asNode<Sampling>(node()); | ||
Result result = ectx_->getResult(sampling->inputVar()); | ||
auto *iter = result.iterRef(); | ||
if (UNLIKELY(iter == nullptr)) { | ||
return Status::Error( | ||
"Internal error: nullptr iterator in sampling executor"); | ||
} | ||
if (UNLIKELY(!result.iter()->isSequentialIter())) { | ||
std::stringstream ss; | ||
ss << "Internal error: Sampling executor does not supported " | ||
<< iter->kind(); | ||
return Status::Error(ss.str()); | ||
} | ||
auto &factors = sampling->factors(); | ||
auto size = iter->size(); | ||
if (size <= 0) { | ||
iter->clear(); | ||
return finish(ResultBuilder() | ||
.value(result.valuePtr()) | ||
.iter(std::move(result).iter()) | ||
.build()); | ||
} | ||
auto colNames = result.value().getDataSet().colNames; | ||
DataSet dataset(std::move(colNames)); | ||
for (auto factor : factors) { | ||
if (factor.count <= 0) { | ||
iter->clear(); | ||
return finish(ResultBuilder() | ||
.value(result.valuePtr()) | ||
.iter(std::move(result).iter()) | ||
.build()); | ||
} | ||
if (factor.samplingType == SamplingFactor::SamplingType::BINARY) { | ||
executeBinarySample<SequentialIter>(iter, factor.colIdx, factor.count, | ||
dataset); | ||
} else { | ||
executeAliasSample<SequentialIter>(iter, factor.colIdx, factor.count, | ||
dataset); | ||
} | ||
} | ||
return finish(ResultBuilder() | ||
.value(Value(std::move(dataset))) | ||
.iter(Iterator::Kind::kSequential) | ||
.build()); | ||
} | ||
|
||
template <typename U> | ||
void SamplingExecutor::executeBinarySample(Iterator *iter, size_t index, | ||
size_t count, DataSet &list) { | ||
auto uIter = static_cast<U *>(iter); | ||
std::vector<WeightType> accumulate_weights; | ||
auto it = uIter->begin(); | ||
WeightType v; | ||
while (it != uIter->end()) { | ||
v = 1.0; | ||
if ((*it)[index].type() == Value::Type::NULLVALUE) { | ||
LOG(WARNING) << "Sampling type is nullvalue"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the dataset have many null, seem this may print a lot WARNING logs? If that is the condition, I advise not use WARNING logs |
||
} else if ((*it)[index].type() == Value::Type::FLOAT) { | ||
v = (float)((*it)[index].getFloat()); | ||
} else if ((*it)[index].type() == Value::Type::INT) { | ||
v = (float)((*it)[index].getInt()); | ||
} else { | ||
LOG(WARNING) << "Sampling type is wrong, must be int or float."; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
} | ||
if (!accumulate_weights.empty()) { | ||
v += accumulate_weights.back(); | ||
} | ||
accumulate_weights.emplace_back(std::move(v)); | ||
++it; | ||
} | ||
nebula::algorithm::Normalization<WeightType>(accumulate_weights); | ||
auto beg = uIter->begin(); | ||
for (size_t i = 0; i < count; ++i) { | ||
auto idx = | ||
nebula::algorithm::BinarySampleAcc<WeightType>(accumulate_weights); | ||
list.emplace_back(*(beg + idx)); | ||
} | ||
uIter->clear(); | ||
} | ||
|
||
template <typename U> | ||
void SamplingExecutor::executeAliasSample(Iterator *iter, size_t index, | ||
size_t count, DataSet &list) { | ||
auto uIter = static_cast<U *>(iter); | ||
std::vector<WeightType> weights; | ||
auto it = uIter->begin(); | ||
WeightType v; | ||
while (it != uIter->end()) { | ||
v = 1.0; | ||
if ((*it)[index].type() == Value::Type::NULLVALUE) { | ||
LOG(WARNING) << "Sampling type is nullvalue"; | ||
|
||
} else if ((*it)[index].type() == Value::Type::FLOAT) { | ||
v = (float)((*it)[index].getFloat()); | ||
} else if ((*it)[index].type() == Value::Type::INT) { | ||
v = (float)((*it)[index].getInt()); | ||
} else { | ||
LOG(WARNING) << "Sampling type is wrong, must be int or float."; | ||
} | ||
LOG(ERROR) << "lyj debug v:" << v; | ||
weights.emplace_back(std::move(v)); | ||
++it; | ||
} | ||
nebula::algorithm::AliasSampler<WeightType> sampler_; | ||
sampler_.Init(weights); | ||
auto beg = uIter->begin(); | ||
for (size_t i = 0; i < count; ++i) { | ||
auto idx = sampler_.Sample(); | ||
list.emplace_back(*(beg + idx)); | ||
} | ||
uIter->clear(); | ||
} | ||
|
||
} // namespace graph | ||
} // namespace nebula |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please use the CamelCase format