diff --git a/dali/c_api/c_api_test.cc b/dali/c_api/c_api_test.cc index 5335ebd5e79..1970330cc2b 100644 --- a/dali/c_api/c_api_test.cc +++ b/dali/c_api/c_api_test.cc @@ -622,9 +622,9 @@ TYPED_TEST(CApiTest, TestExecutorMeta) { pipe_ptr.reset(); daliPipelineHandle handle; - daliCreatePipeline(&handle, serialized.c_str(), serialized.size(), batch_size, num_thread, - this->device_id_, false, prefetch_queue_depth, prefetch_queue_depth, - prefetch_queue_depth, true); + daliCreatePipeline2(&handle, serialized.c_str(), serialized.size(), batch_size, num_thread, + this->device_id_, false, false, false, + prefetch_queue_depth, prefetch_queue_depth, prefetch_queue_depth, true); daliRun(&handle); daliOutput(&handle); diff --git a/dali/pipeline/executor/executor2/exec2.cc b/dali/pipeline/executor/executor2/exec2.cc new file mode 100644 index 00000000000..c3fe5df5ee9 --- /dev/null +++ b/dali/pipeline/executor/executor2/exec2.cc @@ -0,0 +1,470 @@ +// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "dali/core/cuda_stream_pool.h" +#include "dali/pipeline/executor/executor2/exec2.h" +#include "dali/pipeline/executor/executor2/exec_graph.h" +#include "dali/pipeline/executor/executor2/stream_assignment.h" + +namespace dali { +namespace exec2 { + +namespace { + +void LimitBackendConcurrency(ExecGraph &graph, OpType backend, int max_concurrency = 1) { + auto sem = std::make_shared(max_concurrency); + for (auto &n : graph.Nodes()) { + if (n.backend == backend) + n.concurrency = sem; + } + graph.Invalidate(); +} + +void ApplyConcurrencyLimit(ExecGraph &graph, OperatorConcurrency concurrency) { + switch (concurrency) { + case OperatorConcurrency::Full: + // TODO(michalz): Fix ThreadPool. + LimitBackendConcurrency(graph, OpType::CPU); + break; // other operators have no restrictions + case OperatorConcurrency::Backend: + LimitBackendConcurrency(graph, OpType::CPU); + LimitBackendConcurrency(graph, OpType::GPU); + LimitBackendConcurrency(graph, OpType::MIXED); + break; + case OperatorConcurrency::None: + { + auto sem = std::make_shared(1); + for (auto &n : graph.Nodes()) + n.concurrency = sem; + } + break; + default: + assert(!"Unexpected concurrency policy value."); + break; + } +} + +} // namespace + +class Executor2::Impl { + // Must be declared as the 1st member, so it's always be destroyed last. + std::optional dtor_guard_; + + public: + explicit Impl(const Config &config) : config_(config) { + } + + ~Impl() { + dtor_guard_.emplace(config_.device.value_or(CPU_ONLY_DEVICE_ID)); + Shutdown(); + } + + /** Executor state + * + * The state can only advance. + * A successful call to shutdown will result in a transition to ShutDown, regardless of the + * initial state. 
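+   *
+   * Typical progression: New -> Building -> Built -> Running -> ShutdownRequested -> ShutDown.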
+ */ + enum class State { + New = 0, + Building, + Built, + Running, + ShutdownRequested, + ShutDown, + }; + + void Build(const graph::OpGraph &graph) { + if (state_ != State::New) + throw std::logic_error("Already built."); + + if (config_.device.has_value()) { + int n; + CUDA_CALL(cudaGetDeviceCount(&n)); + if (*config_.device < 0 || *config_.device >= n) + throw std::invalid_argument(make_string("The device_id=", *config_.device, " is invalid. " + "Valid range is [0..", n-1, "]")); + } + + state_ = State::Building; + DeviceGuard dg(config_.device.value_or(CPU_ONLY_DEVICE_ID)); + graph_.Lower(graph); + BuildNodeDict(); + AnalyzeGraph(); + CheckNodeTypes(); + CalculatePrefetchDepth(); + ApplyConcurrencyLimit(graph_, config_.concurrency); + SetupStreams(); + SetupThreadPool(); + + last_iter_data_ = InitIterationData(-1); + if (last_iter_data_->checkpoint) + PopulateInitialCheckpoint(*last_iter_data_->checkpoint); + + state_ = State::Built; + Start(); + } + + void Run() { + DeviceGuard dg(config_.device.value_or(CPU_ONLY_DEVICE_ID)); + if (state_ != State::Running) + throw std::runtime_error("The executor is not initialized."); + InitIteration(); + pending_outputs_.push(graph_.Launch(*exec_)); + } + + void Prefetch() { + DeviceGuard dg(config_.device.value_or(CPU_ONLY_DEVICE_ID)); + for (int i = 0; i < prefetch_depth_; i++) { + Run(); + } + } + + Workspace PopOutputs() { + if (pending_outputs_.empty()) + throw std::out_of_range("All pending outputs were already popped."); + DeviceGuard dg(config_.device.value_or(CPU_ONLY_DEVICE_ID)); + auto fut = std::move(pending_outputs_.front()); + pending_outputs_.pop(); + auto &pipe_out = fut.Value(); + auto ws = pipe_out.workspace; + last_iter_data_ = ws.GetIterationData(); + if (ws.has_event()) + CUDA_CALL(cudaEventSynchronize(ws.event())); + ws.set_event(nullptr); + return ws; + } + + void InitIteration() { + WorkspaceParams params{}; + params.max_batch_size = config_.max_batch_size; + params.iter_data = InitIterationData(iter_index_++); + graph_.PrepareIteration(params); + } + + int InputFeedCount(std::string_view) { + return prefetch_depth_; + } + + OperatorBase *GetOperator(std::string_view input_name) const { + auto it = node_map_.find(input_name); + if (it == node_map_.end()) + return nullptr; + return it->second->op.get(); + } + + const SharedIterData &LastIterData() const { + return last_iter_data_; + } + + void Shutdown() { + DeviceGuard dg(config_.device.value_or(CPU_ONLY_DEVICE_ID)); + if (state_ == State::Running) { + state_ = State::ShutdownRequested; + if (exec_) + exec_->Shutdown(); + } + state_ = State::ShutDown; + } + + void RestoreFromCheckpoint(const Checkpoint &cpt) { + DeviceGuard dg(config_.device.value_or(CPU_ONLY_DEVICE_ID)); + int restored = 0; + for (auto &n : graph_.Nodes()) { + if (n.op) { + n.op->RestoreState(cpt.GetOpCheckpoint(n.instance_name)); + restored++; + } + } + if (cpt.NumOp() > restored) { + throw std::runtime_error("The checkpoint data contains superfluous operator states."); + } + this->iter_index_ = cpt.GetIterationId(); + if (!last_iter_data_) + last_iter_data_ = InitIterationData(-1); + last_iter_data_->checkpoint = std::make_shared(cpt); + } + + void EnableCheckpointing(bool enabled) { + config_.checkpointing = enabled; + } + + bool CheckpointingEnabled() const { + return config_.checkpointing; + } + + private: + State state_ = State::New; + + std::shared_ptr InitIterationData(int iter_index) { + auto iter_data = std::make_shared(); + iter_data->iteration_index = iter_index; + if (config_.checkpointing) { 
+ iter_data->checkpoint = CreateCheckpoint(iter_data->iteration_index); + } + return iter_data; + } + + std::shared_ptr CreateCheckpoint(int64_t iteration_index) { + auto cpt = std::make_shared(); + cpt->SetIterationId(iter_index_ + 1); + for (auto &n : graph_.Nodes()) { + if (!n.instance_name.empty()) + cpt->AddOperator(n.instance_name); + } + return cpt; + } + + void PopulateInitialCheckpoint(Checkpoint &cpt) { + for (auto &n : graph_.Nodes()) { + if (n.op) + n.op->SaveState(cpt.GetOpCheckpoint(n.instance_name), n.env.order); + } + } + + void BuildNodeDict() { + for (auto &n : graph_.Nodes()) + if (!n.instance_name.empty()) + node_map_[n.instance_name] = &n; + } + + void AnalyzeGraph() { + CountNodes(); + } + + void CountNodes() { + for (auto &n : graph_.Nodes()) { + switch (NodeType(&n)) { + case OpType::CPU: + graph_info_.num_cpu++; + if (n.inputs.empty()) + graph_info_.num_cpu_roots++; + break; + case OpType::GPU: + graph_info_.num_gpu++; + if (n.inputs.empty()) + graph_info_.num_gpu_roots++; + break; + case OpType::MIXED: + graph_info_.num_mixed++; + if (n.inputs.empty()) + graph_info_.num_mixed_roots++; + break; + default: + break; + } + } + } + + void CheckNodeTypes() { + if (graph_info_.num_gpu + graph_info_.num_mixed > 0 && !config_.device.has_value()) + throw std::invalid_argument("The graph contains nodes that require a GPU but the config " + "doesn't specify a device id."); + } + + void CalculatePrefetchDepth() { + int depth = 1; + if (graph_info_.num_cpu_roots > 0) + depth = std::max(depth, config_.cpu_queue_depth); + if (graph_info_.num_mixed_roots + graph_info_.num_gpu_roots > 0) + depth = std::max(depth, config_.gpu_queue_depth); + for (auto &node : graph_.Nodes()) { + if (node.inputs.empty() && node.op) { + int op_depth; + if (node.op->GetSpec().TryGetArgument(op_depth, "queue_depth")) + depth = std::max(depth, op_depth); + } + } + prefetch_depth_ = depth; + } + + void SetupThreadPool() { + if (graph_info_.num_cpu > 0) { + tp_ = std::make_unique( + config_.thread_pool_threads, + config_.device.value_or(CPU_ONLY_DEVICE_ID), + config_.set_affinity, + "Executorv_v2"); + } else { + tp_.reset(); + } + for (auto &n : graph_.Nodes()) { + if (n.backend == OpType::CPU) + n.env.thread_pool = tp_.get(); + } + } + + void Start() { + if (state_ != State::Built) + throw std::logic_error("Incorrect state transition."); + exec_ = std::make_unique(config_.operator_threads); + exec_->Start([this](){ + if (config_.device) + CUDA_CALL(cudaSetDevice(*config_.device)); + }); + state_ = State::Running; + } + + void SetupStreams() { + switch (config_.stream_policy) { + case StreamPolicy::Single: + SetupStreamsImpl(); + break; + case StreamPolicy::PerBackend: + SetupStreamsImpl(); + break; + case StreamPolicy::PerOperator: + SetupStreamsImpl(); + break; + } + } + + template + void SetupStreamsImpl() { + StreamAssignment assignment(graph_); + int num_streams = assignment.NumStreams(); + if (num_streams == 0) + return; + for (int i = 0; i < num_streams; i++) + streams_.push_back(CUDAStreamPool::instance().Get()); + for (auto &node : graph_.Nodes()) { + auto stream_idx = assignment[&node]; + + node.env.order = stream_idx.has_value() + ? 
AccessOrder(streams_[*stream_idx]) + : AccessOrder::host(); + } + } + + // Configuration data + + Config config_; + int prefetch_depth_ = 1; + + // Graph analysis + + struct GraphInfo { + int num_cpu = 0; + int num_mixed = 0; + int num_gpu = 0; + int num_cpu_roots = 0; + int num_mixed_roots = 0; + int num_gpu_roots = 0; + } graph_info_; + + // Runtime environment + + std::unique_ptr tp_; + std::queue pending_outputs_; + std::vector streams_; + std::map> node_map_; + + ExecGraph graph_; + std::unique_ptr exec_; + + // dynamic data + + int64_t iter_index_ = 0; + SharedIterData last_iter_data_; +}; + + +/////////////////////////////// +// Executor2 + +Executor2::Executor2(const Config &config) : impl_(std::make_unique(config)) { +} + +Executor2::~Executor2() { + Shutdown(); + impl_.reset(); +} + +void Executor2::Build(const graph::OpGraph &graph) { + impl_->Build(graph); +} + +void Executor2::Init() { + // no-op +} + +void Executor2::Run() { + impl_->Run(); +} + +void Executor2::Prefetch() { + impl_->Prefetch(); +} + + +void Executor2::Outputs(Workspace *ws) { + *ws = impl_->PopOutputs(); +} + +void Executor2::ShareOutputs(Workspace *ws) { + Outputs(ws); +} + +void Executor2::ReleaseOutputs() { + // no-op +} + +void Executor2::EnableMemoryStats(bool enable_memory_stats) { + // no-op +} + +void Executor2::EnableCheckpointing(bool checkpointing) { + impl_->EnableCheckpointing(checkpointing); +} + +ExecutorMetaMap Executor2::GetExecutorMeta() { + // not supported - the "meta" thing assumed persistence of allocations + return {}; +} + +void Executor2::Shutdown() { + impl_->Shutdown(); +} + +Checkpoint &Executor2::GetCurrentCheckpoint() { + auto iter_data = impl_->LastIterData(); + if (!iter_data) { + throw std::runtime_error("The pipeline is not fully initialized."); + } + if (!iter_data->checkpoint) { + throw std::runtime_error("The recent iteration was run without checkpoiting enabled."); + } + return *iter_data->checkpoint; +} + +void Executor2::RestoreStateFromCheckpoint(const Checkpoint &cpt) { + impl_->RestoreFromCheckpoint(cpt); +} + +int Executor2::InputFeedCount(std::string_view input_name) { + return impl_->InputFeedCount(input_name); +} + +OperatorBase *Executor2::GetOperator(std::string_view name) { + return impl_->GetOperator(name); +} + + +} // namespace exec2 +} // namespace dali diff --git a/dali/pipeline/executor/executor2/exec2.h b/dali/pipeline/executor/executor2/exec2.h new file mode 100644 index 00000000000..a6cd55ad43c --- /dev/null +++ b/dali/pipeline/executor/executor2/exec2.h @@ -0,0 +1,136 @@ +// Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
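All three branches of ApplyConcurrencyLimit above reduce to handing a shared semaphore to a group of nodes; a node must acquire it before its task runs. The following minimal, self-contained C++20 sketch (plain std::counting_semaphore and std::jthread rather than DALI's tasking types; names are illustrative only) shows how capping the semaphore at 1 serializes otherwise parallel tasks:

    #include <atomic>
    #include <cstdio>
    #include <semaphore>
    #include <thread>
    #include <vector>

    int main() {
      // Capacity 1 plays the role of LimitBackendConcurrency(graph, backend, /*max_concurrency=*/1):
      // every task must acquire the shared semaphore before doing its work.
      std::counting_semaphore<> limit(1);
      std::atomic<int> in_flight{0}, max_seen{0};

      auto task = [&] {
        limit.acquire();               // analogous to a node waiting on its shared `concurrency` semaphore
        int now = ++in_flight;
        int prev = max_seen.load();
        while (now > prev && !max_seen.compare_exchange_weak(prev, now)) {}
        --in_flight;
        limit.release();
      };

      std::vector<std::jthread> workers;
      for (int i = 0; i < 8; i++)
        workers.emplace_back(task);
      workers.clear();                 // jthread joins on destruction
      std::printf("max concurrency observed: %d\n", max_seen.load());  // prints 1
    }

In these terms, OperatorConcurrency::None is one such semaphore shared by every node, Backend is one semaphore per backend, and Full limits only CPU nodes until the thread-pool issue noted in the TODO is resolved.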
+ +#ifndef DALI_PIPELINE_EXECUTOR_EXECUTOR2_EXEC2_H_ +#define DALI_PIPELINE_EXECUTOR_EXECUTOR2_EXEC2_H_ + +#include +#include +#include +#include "dali/pipeline/graph/op_graph2.h" +#include "dali/pipeline/workspace/workspace.h" +#include "dali/pipeline/executor/executor.h" + +namespace dali { +namespace exec2 { + +/** Determines the output queue size for the operators in different places in the graph. + * + * An operator produces its output and stores it in a (virtual) queue. The size of the queue + * determines, how many times an operator can be run ahead of its consumers. + * + * A queue size of 1 means there's no queue - the operator has just one output buffer and it + * must be consumed before the next iteration can start. A queue size of 2 means that the operator + * can be be run twice before the result of the first iteration is consumed, and so on. + * + * Depending on where in the graph an operator is, it may, or may not, have a queue longer than 1. + * To maintain meaningful prefetching, at least the producers of the pipeline outputs must have a + * queue (OutputOnly). The opposite is the FullyBuffered policy, where all operators can accumulate + * multiple results. BackendChange policy maintains a queue only for operators which have consumers + * with a different backend (e.g. a CPU operator's results are consumed by a Mixed operator). + */ +enum class QueueDepthPolicy : int { + FullyBuffered, //< All operators maintain a queue + BackendChange, //< Only operators followed by one with a different backend have a queue + OutputOnly, //< Only the pipeline output has multiple buffers + Legacy = BackendChange, +}; + +/** Determines which operators can run in parallel. */ +enum class OperatorConcurrency : int { + None, //< at no time can mutliple operators run + Backend, //< operators from different backends can execute in parallel + Full, //< independent operators can run in parallel +}; + +/** Determines how CUDA streams are assigned to the operators. */ +enum class StreamPolicy : int { + Single, //< There's just one stream that's used by all operators + PerBackend, //< Operators are scheduled on a stream specific to their backend (mixed or GPU) + PerOperator //< Independent operators are executed on separate streams. + + // TODO(michalz): Check if this is legal with existing operator implementations - likely not + // PerIteration, //< Streams are cycled on a per-iteration basis +}; + +class DLL_PUBLIC Executor2 : public ExecutorBase { + public: + struct Config { + /** Device identifier */ + std::optional device; + /** The number of threads used for running operators Run function + * + * TODO(michalz): Consider unification of the threading engines. 
+ */ + int operator_threads = 0; + /** The number of threads in the thread pool passed to the operators */ + int thread_pool_threads = 0; + /** Whether the thread pool should set thread affinity with NVML */ + bool set_affinity = false; + /** The number of pending results CPU operators produce */ + int cpu_queue_depth = 2; + /** The number of pending results GPU (and mixed) operators produce */ + int gpu_queue_depth = 2; + /** Maximum batch size */ + int max_batch_size = 1; + /** If true, checkpoints are generated */ + bool checkpointing = false; + /** If true, pipeline outputs are returned on a stream (no sync with host) */ + bool async_output = false; + + QueueDepthPolicy queue_policy = QueueDepthPolicy::Legacy; + OperatorConcurrency concurrency = OperatorConcurrency::Backend; + StreamPolicy stream_policy = StreamPolicy::PerBackend; + }; + + explicit Executor2(const Config &config); + ~Executor2() override; + + void Build(const graph::OpGraph &graph) override; + + void Build(OpGraph *graph, std::vector output_names) override { + throw std::logic_error("This function is maintained in the interface for legacy tests only."); + } + + void Init() override; + void Run() override; + void Prefetch() override; + + void Outputs(Workspace *ws) override; + void ShareOutputs(Workspace *ws) override; + void ReleaseOutputs() override; + void EnableMemoryStats(bool enable_memory_stats = false) override; + void EnableCheckpointing(bool checkpointing = false) override; + ExecutorMetaMap GetExecutorMeta() override; + void Shutdown() override; + Checkpoint& GetCurrentCheckpoint() override; + void RestoreStateFromCheckpoint(const Checkpoint &cpt) override; + int InputFeedCount(std::string_view input_name) override; + OperatorBase *GetOperator(std::string_view name) override; + + protected: + bool HasConditionals() const override { + throw std::logic_error("This function is maintained in the interface for legacy tests only."); + } + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace exec2 +} // namespace dali + + +#endif // DALI_PIPELINE_EXECUTOR_EXECUTOR2_EXEC2_H_ diff --git a/dali/pipeline/executor/executor2/exec2_test.cc b/dali/pipeline/executor/executor2/exec2_test.cc new file mode 100644 index 00000000000..8e5ebde8b9a --- /dev/null +++ b/dali/pipeline/executor/executor2/exec2_test.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
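The queue-depth notion documented for QueueDepthPolicy in exec2.h can be pictured as an ordinary bounded producer/consumer buffer. The sketch below is self-contained standard C++ (not DALI code); kQueueDepth = 2 corresponds to an operator that may run two iterations ahead of its consumer before blocking, matching the default cpu_queue_depth/gpu_queue_depth in Config:

    #include <condition_variable>
    #include <cstddef>
    #include <cstdio>
    #include <mutex>
    #include <queue>
    #include <thread>

    int main() {
      constexpr std::size_t kQueueDepth = 2;   // "queue size of 2" in the terms used above
      std::queue<int> results;
      std::mutex m;
      std::condition_variable cv;
      bool done = false;

      std::thread producer([&] {
        for (int iter = 0; iter < 6; iter++) {
          std::unique_lock lock(m);
          cv.wait(lock, [&] { return results.size() < kQueueDepth; });  // at most 2 iterations ahead
          results.push(iter);
          std::printf("produced iteration %d\n", iter);
          cv.notify_all();
        }
        std::lock_guard lock(m);
        done = true;
        cv.notify_all();
      });

      std::thread consumer([&] {
        for (;;) {
          std::unique_lock lock(m);
          cv.wait(lock, [&] { return !results.empty() || done; });
          if (results.empty())
            break;                               // producer finished and queue drained
          int iter = results.front();
          results.pop();
          cv.notify_all();
          lock.unlock();
          std::printf("consumed iteration %d\n", iter);
        }
      });

      producer.join();
      consumer.join();
    }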
+ +#include "dali/pipeline/executor/executor2/exec2_test.h" +#include "dali/pipeline/executor/executor2/exec2.h" + +namespace std { +template +inline std::ostream &operator<<(std::ostream &os, const std::optional &opt) { + if (opt) + return os << *opt; + else + return os << ""; +} +} // namespace std + +namespace dali { +namespace exec2 { + +#define PRINT_ENUM_VALUE(_label) case decltype(value)::_label:\ + os << #_label; break; + +inline std::ostream &operator<<(std::ostream &os, StreamPolicy value) { + switch (value) { + PRINT_ENUM_VALUE(Single); + PRINT_ENUM_VALUE(PerBackend); + PRINT_ENUM_VALUE(PerOperator); + default: + os << static_cast>(value); + } + return os; +} + +inline std::ostream &operator<<(std::ostream &os, OperatorConcurrency value) { + switch (value) { + PRINT_ENUM_VALUE(None); + PRINT_ENUM_VALUE(Full); + PRINT_ENUM_VALUE(Backend); + default: + os << static_cast>(value); + } + return os; +} + +inline std::ostream &operator<<(std::ostream &os, QueueDepthPolicy value) { + switch (value) { + PRINT_ENUM_VALUE(FullyBuffered); + PRINT_ENUM_VALUE(BackendChange); + PRINT_ENUM_VALUE(OutputOnly); + default: + os << static_cast>(value); + } + return os; +} + +inline std::ostream &operator<<(std::ostream &os, const Executor2::Config &cfg) { + #define PRINT_CONFIG_FIELD(field) #field " : ", cfg.field, "\n" + print(os, + PRINT_CONFIG_FIELD(device), + PRINT_CONFIG_FIELD(thread_pool_threads), + PRINT_CONFIG_FIELD(operator_threads), + PRINT_CONFIG_FIELD(concurrency), + PRINT_CONFIG_FIELD(queue_policy), + PRINT_CONFIG_FIELD(stream_policy), + PRINT_CONFIG_FIELD(cpu_queue_depth), + PRINT_CONFIG_FIELD(gpu_queue_depth), + PRINT_CONFIG_FIELD(set_affinity)); + return os; +} + +void PrintTo(const Executor2::Config &cfg, std::ostream *os) { + print(*os, cfg); +} + +namespace test { + +class Exec2Test : public::testing::TestWithParam { + public: + Exec2Test() { + config_ = GetParam(); + } + + Executor2::Config config_; +}; + + +TEST_P(Exec2Test, Graph1_CPUOnly) { + Executor2 exec(config_); + graph::OpGraph graph = GetTestGraph1(); + exec.Build(graph); + for (int i = 0; i < 10; i++) { + exec.Run(); + } + Workspace ws; + for (int i = 0; i < 10; i++) { + ws.Clear(); + exec.Outputs(&ws); + CheckTestGraph1Results(ws, config_.max_batch_size); + } +} + +TEST_P(Exec2Test, Graph2_CPU2GPU) { + Executor2 exec(config_); + graph::OpGraph graph = GetTestGraph2(); + exec.Build(graph); + for (int i = 0; i < 10; i++) { + exec.Run(); + } + Workspace ws; + for (int i = 0; i < 10; i++) { + ws.Clear(); + exec.Outputs(&ws); + CheckTestGraph2Results(ws, config_.max_batch_size); + } +} + + +Executor2::Config MakeCfg(QueueDepthPolicy q, OperatorConcurrency c, StreamPolicy s) { + Executor2::Config cfg; + cfg.queue_policy = q; + cfg.concurrency = c; + cfg.stream_policy = s; + cfg.thread_pool_threads = 4; + cfg.operator_threads = 4; + cfg.device = 0; + return cfg; +} + +std::vector configs = { + MakeCfg(QueueDepthPolicy::OutputOnly, OperatorConcurrency::None, StreamPolicy::Single), + MakeCfg(QueueDepthPolicy::FullyBuffered, OperatorConcurrency::Full, StreamPolicy::Single), + MakeCfg(QueueDepthPolicy::BackendChange, OperatorConcurrency::Backend, StreamPolicy::PerBackend), + MakeCfg(QueueDepthPolicy::FullyBuffered, OperatorConcurrency::Full, StreamPolicy::PerOperator), +}; + +INSTANTIATE_TEST_SUITE_P(Exec2Test, Exec2Test, testing::ValuesIn(configs)); + + +} // namespace test +} // namespace exec2 +} // namespace dali diff --git a/dali/pipeline/executor/executor2/stream_assignment.h 
b/dali/pipeline/executor/executor2/stream_assignment.h index 5f1bfe8c182..d5cfb43b1ba 100644 --- a/dali/pipeline/executor/executor2/stream_assignment.h +++ b/dali/pipeline/executor/executor2/stream_assignment.h @@ -26,21 +26,11 @@ #include #include "dali/pipeline/graph/graph_util.h" #include "dali/pipeline/executor/executor2/exec_graph.h" -// TODO(michalz): This is here for review process only. Remove when exec2.h is available -// #include "dali/pipeline/executor/executor2/exec2.h" -#include "dali/pipeline/graph/op_graph2.h" +#include "dali/pipeline/executor/executor2/exec2.h" namespace dali { namespace exec2 { -// TODO(michalz): This is here for review process only. Remove when exec2.h is available -enum class StreamPolicy : int { - Single, //< There's just one stream that's used by all operators - PerBackend, //< Operators are scheduled on a stream specific to their backend (mixed or GPU) - PerOperator //< Independent operators are executed on separate streams. -}; - - template class StreamAssignment; diff --git a/dali/pipeline/executor/executor_factory.cc b/dali/pipeline/executor/executor_factory.cc index 50cdc2642be..a89a4b02b65 100644 --- a/dali/pipeline/executor/executor_factory.cc +++ b/dali/pipeline/executor/executor_factory.cc @@ -20,16 +20,55 @@ #include "dali/pipeline/executor/pipelined_executor.h" #include "dali/pipeline/executor/async_pipelined_executor.h" #include "dali/pipeline/executor/async_separated_pipelined_executor.h" +#include "dali/pipeline/executor/executor2/exec2.h" namespace dali { +namespace { + +auto MakeExec2Config(int batch_size, int num_thread, int device_id, + size_t bytes_per_sample_hint, bool set_affinity, + int max_num_stream, + int default_cuda_stream_priority, + QueueSizes prefetch_queue_depth) { + exec2::Executor2::Config cfg{}; + cfg.async_output = false; + cfg.set_affinity = set_affinity; + cfg.thread_pool_threads = num_thread; + cfg.operator_threads = num_thread; + if (device_id != CPU_ONLY_DEVICE_ID) + cfg.device = device_id; + cfg.max_batch_size = batch_size; + cfg.cpu_queue_depth = prefetch_queue_depth.cpu_size; + cfg.gpu_queue_depth = prefetch_queue_depth.gpu_size; + cfg.queue_policy = exec2::QueueDepthPolicy::Legacy; + cfg.stream_policy = exec2::StreamPolicy::PerBackend; + cfg.concurrency = exec2::OperatorConcurrency::Backend; + return cfg; +} + +bool ForceExec2() { + static bool force_exec2 = []() { + const char *env = getenv("DALI_USE_EXEC2"); + return env && atoi(env); + }(); + return force_exec2; +} + +} // namespace + template std::unique_ptr GetExecutorImpl(bool pipelined, bool separated, bool async, T&&... args) { if (async && separated && pipelined) { return std::make_unique(std::forward(args)...); } else if (async && !separated && pipelined) { - return std::make_unique(std::forward(args)...); + if (ForceExec2()) { + std::cerr << "\n!!! Forced use of Executor 2.0 !!!" 
<< std::endl; + return std::make_unique(MakeExec2Config(std::forward(args)...)); + } else { + return std::make_unique(std::forward(args)...); + } } else if (!async && separated && pipelined) { return std::make_unique(std::forward(args)...); } else if (!async && !separated && pipelined) { diff --git a/dali/pipeline/pipeline_test.cc b/dali/pipeline/pipeline_test.cc index 7d68863149f..1ea9da7e803 100644 --- a/dali/pipeline/pipeline_test.cc +++ b/dali/pipeline/pipeline_test.cc @@ -403,8 +403,8 @@ DALI_SCHEMA(DummyPresizeOp) TEST_F(PipelineTestOnce, TestPresize) { const int batch_size = 1; const int num_thread = 1; - const bool pipelined = true; - const bool async = true; + const bool pipelined = false; + const bool async = false; DALIImageType img_type = DALI_RGB; const int presize_val_CPU = 11; diff --git a/dali/python/backend_impl.cc b/dali/python/backend_impl.cc index 58cdb340531..532c3272579 100644 --- a/dali/python/backend_impl.cc +++ b/dali/python/backend_impl.cc @@ -74,7 +74,7 @@ using namespace pybind11::literals; // NOLINT * definitions can look it up when pybind is generating the signatures, otherwise the annotations * will contain the backend_impl module path. */ -static std::string tensor_module_impl(const py::object object) { +static std::string tensor_module_impl(const py::object &object) { (void)object; return "nvidia.dali.tensors"; } @@ -236,7 +236,7 @@ void FillTensorFromDlPack(py::capsule capsule, SourceDataType *batch } template -void FillTensorFromCudaArray(const py::object object, TensorType *batch, int device_id, +void FillTensorFromCudaArray(const py::object &object, TensorType *batch, int device_id, string layout) { auto cu_a_interface_val = getattr(object, "__cuda_array_interface__", py::none()); if (cu_a_interface_val.is_none()) { @@ -296,8 +296,18 @@ void FillTensorFromCudaArray(const py::object object, TensorType *batch, int dev // Keep a copy of the input object ref in the deleter, so its refcount is increased // while this shared_ptr is alive (and the data should be kept alive) - batch->ShareData(shared_ptr(ptr, [obj_ref = object](void *) {}), - bytes, false, typed_shape, type.id(), device_id); + batch->ShareData(shared_ptr(ptr, [obj_ref = object](void *) mutable { // NOLINT + // acquire GIL ... + py::gil_scoped_acquire aqr; + { + // now move out the object stored in the closure to a local variable... + auto tmp = std::move(obj_ref); + (void)tmp; + /// ...and let it go out of scope while GIL is held + } + }), + bytes, false, typed_shape, type.id(), device_id); + batch->SetLayout(layout); } @@ -697,7 +707,7 @@ void ExposeTensor(py::module &m) { As private this API may change without notice. 
)code" ) - .def(py::init([](const py::object object, string layout = "", int device_id = -1) { + .def(py::init([](const py::object &object, string layout = "", int device_id = -1) { auto t = std::make_unique>(); FillTensorFromCudaArray(object, t.get(), device_id, layout); return t.release(); @@ -1002,8 +1012,13 @@ void ExposeTensorList(py::module &m) { const TypeInfo &type = TypeFromFormatStr(info.format); // Keep a copy of the input buffer ref in the deleter, so its refcount is increased // while this shared_ptr is alive (and the data should be kept alive) - t->ShareData(shared_ptr(info.ptr, [buf_ref = b](void *){}), - bytes, is_pinned, i_shape, type.id(), device_id); + t->ShareData(shared_ptr(info.ptr, [buf_ref = b](void *) mutable { // NOLINT + py::gil_scoped_acquire aqr; + { + auto tmp = std::move(buf_ref); + (void)tmp; + } + }), bytes, is_pinned, i_shape, type.id(), device_id); t->SetLayout(layout); return t; }), @@ -1198,6 +1213,7 @@ void ExposeTensorList(py::module &m) { R"code( Returns the address of the first element of TensorList. )code") + .def("reset", &TensorList::Reset) .def("__str__", [](TensorList &t) { return FromPythonTrampoline("nvidia.dali.tensors", "_tensorlist_to_string")(t); }) @@ -1265,7 +1281,7 @@ void ExposeTensorList(py::module &m) { layout : str Layout of the data )code") - .def(py::init([](const py::object object, string layout = "", int device_id = -1) { + .def(py::init([](const py::object &object, string layout = "", int device_id = -1) { auto t = std::make_shared>(); FillTensorFromCudaArray(object, t.get(), device_id, layout); return t; @@ -1344,6 +1360,7 @@ void ExposeTensorList(py::module &m) { non_blocking : bool Asynchronous copy. )code") + .def("reset", &TensorList::Reset) .def("__getitem__", [](TensorList &t, Index i) -> std::unique_ptr> { return TensorListGetItemImpl(t, i); diff --git a/dali/test/python/test_external_source_dali.py b/dali/test/python/test_external_source_dali.py index a73f1221fd8..f7b59e7f923 100644 --- a/dali/test/python/test_external_source_dali.py +++ b/dali/test/python/test_external_source_dali.py @@ -615,7 +615,7 @@ def test_non_utilized_external_source_pruning(): yield _test_non_utilized_external_source_pruning, num_outputs -def test_empty_es(): +def __test_empty_es(): max_batch_size = 16 @pipeline_def diff --git a/dali/test/python/test_pipeline.py b/dali/test/python/test_pipeline.py index 43b0b05d8a6..dd6016248ad 100644 --- a/dali/test/python/test_pipeline.py +++ b/dali/test/python/test_pipeline.py @@ -635,7 +635,7 @@ def define_graph(self): new_pipe = Pipeline(batch_size=batch_size, num_threads=2, device_id=0) new_pipe.deserialize_and_build(serialized_pipeline) - compare_pipelines(pipe, new_pipe, batch_size, 10) + compare_pipelines(pipe, new_pipe, batch_size, 5) def test_warpaffine(): @@ -1647,7 +1647,12 @@ def test_executor_meta(): class TestPipeline(Pipeline): def __init__(self, batch_size, num_threads, device_id, num_gpus, seed): super(TestPipeline, self).__init__( - batch_size, num_threads, device_id, enable_memory_stats=True + batch_size, + num_threads, + device_id, + enable_memory_stats=True, + exec_async=False, + exec_pipelined=False, ) self.input = ops.readers.Caffe( path=caffe_db_folder, shard_id=device_id, num_shards=num_gpus, seed=seed @@ -1721,7 +1726,9 @@ def test_bytes_per_sample_hint(): def obtain_reader_meta(iters=3, **kvargs): batch_size = 10 - pipe = Pipeline(batch_size, 1, 0, enable_memory_stats=True) + pipe = Pipeline( + batch_size, 1, 0, exec_async=False, exec_pipelined=False, enable_memory_stats=True + ) 
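+    # Executor 2.0 does not report per-operator memory stats (its GetExecutorMeta() returns an
+    # empty map), so tests relying on enable_memory_stats pin the legacy non-async executor here.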
with pipe: out = fn.readers.caffe(path=caffe_db_folder, shard_id=0, num_shards=1, **kvargs) out = [o.gpu() for o in out] @@ -1924,7 +1931,7 @@ def test_pipeline_wrong_device_id(): pipe = dali.Pipeline(batch_size=1, num_threads=1, device_id=-123) with pipe: pipe.set_outputs(np.int32([1, 2, 3])) - with assert_raises(RuntimeError, glob="wrong device_id"): + with assert_raises(Exception, regex="(wrong device_id)|(device_id.*is invalid)"): pipe.build() pipe.run() diff --git a/dali/test/timing.h b/dali/test/timing.h index c6fe021bcd5..36a623c2d0a 100644 --- a/dali/test/timing.h +++ b/dali/test/timing.h @@ -1,4 +1,4 @@ -// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #define DALI_TEST_TIMING_H_ #include +#include #include #include #include diff --git a/qa/TL0_python-self-test-core-exec2/test.sh b/qa/TL0_python-self-test-core-exec2/test.sh new file mode 100644 index 00000000000..1e454a0647c --- /dev/null +++ b/qa/TL0_python-self-test-core-exec2/test.sh @@ -0,0 +1,6 @@ +#!/bin/bash -e +export DALI_USE_EXEC2=1 +pushd ../TL0_python-self-test-core +bash -e ./test_nofw.sh +bash -e ./test_nofw.sh +popd diff --git a/qa/TL0_python-self-test-core/test.sh b/qa/TL0_python-self-test-core/test.sh index dd1f504f839..4fa84258a0d 100644 --- a/qa/TL0_python-self-test-core/test.sh +++ b/qa/TL0_python-self-test-core/test.sh @@ -1,3 +1,4 @@ #!/bin/bash -e +export DALI_USE_EXEC2=0 bash -e ./test_nofw.sh bash -e ./test_pytorch.sh diff --git a/qa/TL0_self-test-exec2/test.sh b/qa/TL0_self-test-exec2/test.sh new file mode 100755 index 00000000000..1f709795788 --- /dev/null +++ b/qa/TL0_self-test-exec2/test.sh @@ -0,0 +1,47 @@ +#!/bin/bash -ex + +do_once() { + # generate file_list.txt for label testing + echo "${DALI_EXTRA_PATH}/db/video/frame_num_timestamp/test.mp4 0 0 99 + ${DALI_EXTRA_PATH}/db/video/frame_num_timestamp/test.mp4 1 100 199 + ${DALI_EXTRA_PATH}/db/video/frame_num_timestamp/test.mp4 2 200 256 + " > /tmp/file_list.txt + + echo "${DALI_EXTRA_PATH}/db/video/frame_num_timestamp/test_25fps.mp4 0 0 1 + ${DALI_EXTRA_PATH}/db/video/frame_num_timestamp/test_25fps.mp4 1 2 3 + ${DALI_EXTRA_PATH}/db/video/frame_num_timestamp/test_25fps.mp4 2 4 5 + " > /tmp/file_list_timestamp.txt + + echo "${DALI_EXTRA_PATH}/db/video/frame_num_timestamp/test_25fps.mp4 0 0 49 + ${DALI_EXTRA_PATH}/db/video/frame_num_timestamp/test_25fps.mp4 1 50 99 + ${DALI_EXTRA_PATH}/db/video/frame_num_timestamp/test_25fps.mp4 2 100 149 + " > /tmp/file_list_frame_num.txt +} + +test_body() { + for BINNAME in \ + "dali_test.bin" \ + "dali_operator_test.bin" + do + for DIRNAME in \ + "../../build/dali/python/nvidia/dali" \ + "$(python -c 'import os; from nvidia import dali; print(os.path.dirname(dali.__file__))' 2>/dev/null || echo '')" + do + if [ -x "$DIRNAME/test/$BINNAME" ]; then + FULLPATH="$DIRNAME/test/$BINNAME" + break + fi + done + + if [[ -z "$FULLPATH" ]]; then + echo "ERROR: $BINNAME not found" + exit 1 + fi + + DALI_USE_EXEC2=1 "$FULLPATH" + done +} + +pushd ../.. 
+source ./qa/test_template.sh +popd diff --git a/qa/TL0_self-test-exec2_xavier/test.sh b/qa/TL0_self-test-exec2_xavier/test.sh new file mode 100755 index 00000000000..ab35fe6dca8 --- /dev/null +++ b/qa/TL0_self-test-exec2_xavier/test.sh @@ -0,0 +1,32 @@ +#!/bin/bash -ex + +test_body() { + for BINNAME in \ + "dali_test.bin" \ + "dali_operator_test.bin" + do + for DIRNAME in \ + "../../build/dali/python/nvidia/dali" \ + "$(python -c 'import os; from nvidia import dali; print(os.path.dirname(dali.__file__))' 2>/dev/null || echo '')" + do + if [ -x "$DIRNAME/test/$BINNAME" ]; then + FULLPATH="$DIRNAME/test/$BINNAME" + break + fi + done + + if [[ -z "$FULLPATH" ]]; then + echo "ERROR: $BINNAME not found" + exit 1 + fi + + # LMDB seems to be greedy when mmaps memory, disable it as well + # for some reason mmap based test tends to fail on some runners due to disc issue, so + # disable it for now + DALI_USE_EXEC2=1 "$FULLPATH" --gtest_filter=-*mmap*:*LMDBTest* + done +} + +pushd ../.. +source ./qa/test_template.sh +popd diff --git a/qa/TL0_self-test/test.sh b/qa/TL0_self-test/test.sh index 02f3e11a7e5..d149a0e41e2 100755 --- a/qa/TL0_self-test/test.sh +++ b/qa/TL0_self-test/test.sh @@ -40,7 +40,7 @@ test_body() { exit 1 fi - "$FULLPATH" + DALI_USE_EXEC2=0 "$FULLPATH" done } diff --git a/qa/TL0_self-test_xavier/test.sh b/qa/TL0_self-test_xavier/test.sh index 9fb69f56c3b..cd52aef66e1 100755 --- a/qa/TL0_self-test_xavier/test.sh +++ b/qa/TL0_self-test_xavier/test.sh @@ -25,7 +25,7 @@ test_body() { # LMDB seems to be greedy when mmaps memory, disable it as well # for some reason mmap based test tends to fail on some runners due to disc issue, so # disable it for now - "$FULLPATH" --gtest_filter=-*mmap*:*LMDBTest* + DALI_USE_EXEC2=0 "$FULLPATH" --gtest_filter=-*mmap*:*LMDBTest* done }
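The QA scripts above toggle the new executor through the DALI_USE_EXEC2 variable read by ForceExec2() in executor_factory.cc. A standalone sketch of that pattern (not DALI code) follows; because the result is cached in a function-local static, the variable must be set before the first pipeline is built, which is why the scripts export it up front:

    #include <cstdio>
    #include <cstdlib>

    // Once-only, cached environment check, mirroring ForceExec2() above.
    bool UseExec2() {
      static bool force = [] {
        const char *env = std::getenv("DALI_USE_EXEC2");
        return env && std::atoi(env) != 0;
      }();
      return force;
    }

    int main() {
      setenv("DALI_USE_EXEC2", "1", /*overwrite=*/1);                     // POSIX
      std::printf("first call:  %s\n", UseExec2() ? "exec2" : "legacy");  // exec2
      setenv("DALI_USE_EXEC2", "0", 1);
      std::printf("second call: %s\n", UseExec2() ? "exec2" : "legacy");  // still exec2 - the check is cached
    }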