Skip to content

Commit

Permalink
Adding -iree-stream-schedule-execution + -concurrency passes. (#7549)
Browse files Browse the repository at this point in the history
The passes themselves are rather simple and call into a partitioning
routine that performs the real work with the intent being that we
can have many and specify which one to use based on scoped attributes
in the IR (kind of like lowering configs in codegen). Today there's
just a reference implementation that does a single level of concurrency.
The hope is that someone who actually knows how to write a good
partitioning algorithm can contribute something better, but it's at
least no worse than what we have today and better than simple ML
systems that have no concurrency.

Though the passes are similar they operate at different scopes and
will have different partitioning algorithms. I thought about trying
to unify them; however, keeping them separate allows us to do things
like use a more complex execution partitioning pass while using the
same generic concurrency scheduling etc - including disabling the
concurrency scheduling entirely for debugging or environments where
there may be no benefits to such scheduling (single core execution,
etc). It's easy enough to reason about how they could be unified that
I wanted to err on the side of flexibility until we have an owner and
at least one or two more algorithms we can use to feel out the shape of
things.

A benefit of the independent execution and concurrency partitioning is
that debugging either is much simpler (and there's pretty good `-debug`
output). Since the concurrency scheduling operates only within the
scheduled execution regions there's no need to worry about host/device
interactions or the parent op CFG.
  • Loading branch information
benvanik committed Nov 8, 2021
1 parent bbb9eab commit b585156
Show file tree
Hide file tree
Showing 16 changed files with 1,500 additions and 0 deletions.
3 changes: 3 additions & 0 deletions iree/compiler/Dialect/Stream/Analysis/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@ package(
cc_library(
name = "Analysis",
srcs = [
"Partitioning.cpp",
"Partitioning/ReferencePartitioning.cpp",
"ResourceUsage.cpp",
],
hdrs = [
"Partitioning.h",
"ResourceUsage.h",
],
deps = [
Expand Down
3 changes: 3 additions & 0 deletions iree/compiler/Dialect/Stream/Analysis/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@ iree_cc_library(
NAME
Analysis
HDRS
"Partitioning.h"
"ResourceUsage.h"
SRCS
"Partitioning.cpp"
"Partitioning/ReferencePartitioning.cpp"
"ResourceUsage.cpp"
DEPS
LLVMSupport
Expand Down
183 changes: 183 additions & 0 deletions iree/compiler/Dialect/Stream/Analysis/Partitioning.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Dialect/Stream/Analysis/Partitioning.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/PatternMatch.h"

#define DEBUG_TYPE "iree-stream-partitioning"

namespace mlir {
namespace iree_compiler {
namespace IREE {
namespace Stream {

#ifndef NDEBUG

// Writes the inputs, outputs, and ops of |partition| to llvm::dbgs().
// |state| is forwarded to every value/op printer call.
void dumpPartition(Partition &partition, AsmState &state) {
  auto &os = llvm::dbgs();
  // Shared printer for both the captured and the escaping value lists.
  auto printValue = [&](Value value) { value.printAsOperand(os, state); };

  os << " INS:\n ";
  llvm::interleaveComma(partition.ins, os, printValue);

  os << "\n OUTS:\n ";
  llvm::interleaveComma(partition.outs, os, printValue);

  os << "\n OPS:\n";
  for (Operation *memberOp : partition.ops) {
    os << " ";
    memberOp->print(os, state);
    os << "\n";
  }
}

// Dumps this partition to llvm::dbgs() using an AsmState built from
// |parentOp|.
void Partition::dump(Operation *parentOp) {
  AsmState asmState(parentOp);
  dumpPartition(*this, asmState);
}

// Dumps every partition in the set to llvm::dbgs(), each prefixed with its
// index in |partitions|. A single AsmState built from |parentOp| is shared by
// all partitions.
void PartitionSet::dump(Operation *parentOp) {
  AsmState asmState(parentOp);
  for (size_t i = 0, e = partitions.size(); i < e; ++i) {
    llvm::dbgs() << "PARTITION[" << i << "]:\n";
    dumpPartition(partitions[i], asmState);
  }
}

#else
// NDEBUG builds: the dump helpers above are compiled out, so both dump()
// methods are intentionally empty.
void Partition::dump(Operation *parentOp) {}
void PartitionSet::dump(Operation *parentOp) {}
#endif // !NDEBUG

// Checks that the partition is self-consistent:
//   - every operand of every op is either a declared input or a result of an
//     op within the partition, and
//   - every declared output is either produced by an op within the partition
//     or is a (pass-through) declared input.
// Emits an error at |loc| and returns failure on the first violation found.
LogicalResult Partition::verify(Location loc) {
  // Collect all results produced by ops inside of the partition.
  SetVector<Value> producedValues;
  for (Operation *op : ops) {
    for (Value result : op->getResults()) producedValues.insert(result);
  }

  // A value is usable within the partition if it was captured as an input or
  // produced by one of the partition's own ops.
  auto isAvailable = [&](Value value) {
    return ins.contains(value) || producedValues.contains(value);
  };

  for (Operation *op : ops) {
    for (Value operand : op->getOperands()) {
      if (isAvailable(operand)) continue;
      return mlir::emitError(loc)
             << "operand not declared in partition inputs or by an op within "
                "the partition";
    }
  }

  // Outputs follow the same rule; pass-through operands are allowed even
  // though they are silly.
  for (Value out : outs) {
    if (isAvailable(out)) continue;
    return mlir::emitError(loc) << "output not defined by an op within the "
                                   "partition (or captured)";
  }

  return success();
}

// Checks the whole set:
//   1. each partition passes Partition::verify,
//   2. no value appears in the outputs of more than one partition, and
//   3. no partition captures a value that escapes from a partition that comes
//      after it in |partitions|.
// Emits an error at |loc| and returns failure on the first violation found.
LogicalResult PartitionSet::verify(Location loc) {
  // Every partition has to be consistent on its own first.
  for (Partition &partition : partitions) {
    if (failed(partition.verify(loc))) return failure();
  }

  // A single def is needed to remap each escaping value in the parent block,
  // so no two partitions may list the same output.
  SetVector<Value> escapingValues;
  for (Partition &partition : partitions) {
    for (Value out : partition.outs) {
      if (!escapingValues.insert(out)) {
        return mlir::emitError(loc)
               << "duplicate value found in partition set outputs";
      }
    }
  }

  // Walk from the last partition to the first, accumulating the values that
  // escape from the partitions already visited. Any capture of such a value
  // means a consumer precedes its producer. Only the order of the partitions
  // is checked here, not of any ops that aren't covered by one.
  SetVector<Value> escapingBelow;
  for (auto it = partitions.rbegin(), end = partitions.rend(); it != end;
       ++it) {
    for (Value captured : it->ins) {
      if (!escapingBelow.contains(captured)) continue;
      return mlir::emitError(loc) << "partition set out of order; value "
                                     "captured declared as escaping below";
    }
    for (Value out : it->outs) escapingBelow.insert(out);
  }

  return success();
}

void PartitionSet::topologicalSort() {
if (partitions.empty()) return;

SetVector<Partition *> unsortedSet;
DenseMap<Value, SmallVector<Partition *>> consumers;
for (auto &partition : partitions) {
unsortedSet.insert(&partition);
for (auto in : partition.ins) {
consumers[in].push_back(&partition);
}
}

struct DFSState {
SmallVector<Partition *, 16> topologicalCounts;
DenseSet<Partition *> seen;
} state;
std::function<void(Partition *)> postorderWalk;
postorderWalk = [&](Partition *current) {
for (auto out : current->outs) {
for (auto *consumer : consumers[out]) {
postorderWalk(consumer);
}
}
auto it = state.seen.insert(current);
if (/*inserted=*/it.second) {
if (unsortedSet.contains(current)) {
state.topologicalCounts.push_back(current);
}
}
};
for (auto *partition : unsortedSet) postorderWalk(partition);

SmallVector<Partition> sortedSet;
sortedSet.reserve(partitions.size());
for (auto *partition : llvm::reverse(state.topologicalCounts)) {
sortedSet.push_back(std::move(*partition));
}
partitions = std::move(sortedSet);
}

// Entry point for partitioning the ops in |block|. |config| is forwarded
// unchanged to the selected algorithm; currently this always dispatches to
// partitionStreamableOpsReference.
PartitionSet partitionStreamableOps(IREE::Stream::PartitioningConfigAttr config,
Block *block) {
// Only one algorithm today.
return partitionStreamableOpsReference(config, block);
}

// Entry point for concurrency partitioning of the ops in |block|. |config| is
// forwarded unchanged to the selected algorithm; currently this always
// dispatches to partitionRegionConcurrencyReference.
PartitionSet partitionRegionConcurrency(
IREE::Stream::PartitioningConfigAttr config, Block *block) {
// Only one algorithm today.
return partitionRegionConcurrencyReference(config, block);
}

} // namespace Stream
} // namespace IREE
} // namespace iree_compiler
} // namespace mlir
131 changes: 131 additions & 0 deletions iree/compiler/Dialect/Stream/Analysis/Partitioning.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef IREE_COMPILER_DIALECT_STREAM_ANALYSIS_PARTITIONING_H_
#define IREE_COMPILER_DIALECT_STREAM_ANALYSIS_PARTITIONING_H_

#include "iree/compiler/Dialect/Stream/IR/StreamTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Support/LLVM.h"

namespace mlir {
namespace iree_compiler {
namespace IREE {
namespace Stream {

//===----------------------------------------------------------------------===//
// Data structures
//===----------------------------------------------------------------------===//

// A single slice of ops along with the SSA values crossing its boundary.
struct Partition {
// SSA values defined outside of the partition.
// All values not defined by ops in the partition must be declared.
// Multiple partitions may capture the same value.
SetVector<Value> ins;
// SSA values defined by the partition with uses outside.
// All values used by ops outside of the partition must be declared.
// Only one partition may produce a new value (PartitionSet::verify rejects
// sets in which two partitions list the same output).
SetVector<Value> outs;
// All ops covered by the partition. May contain ops that exist in other
// partitions in cases where the op is to be duplicated. Not all ops are
// streamable (such as constants and arithmetic).
SetVector<Operation *> ops;

// Prints the ins/outs/ops to llvm::dbgs(); |parentOp| is used to construct
// the AsmState used for printing. No-op when built with NDEBUG.
void dump(Operation *parentOp);

// Verifies that the partition meets the required conditions: all operands of
// its ops and all of its outputs must either be declared in |ins| or be
// produced by an op in |ops|. Emits an error at |loc| on failure.
LogicalResult verify(Location loc);
};

// A set of all partitions.
struct PartitionSet {
// All partitions. The order is unspecified until topologicalSort() has been
// called; verify() fails if a partition captures a value that escapes from a
// partition after it in this list.
SmallVector<Partition> partitions;

// Total number of partitions in the set.
size_t size() const { return partitions.size(); }
// Returns true if the set is empty (no streamable ops).
bool empty() const { return partitions.empty(); }

// Prints each partition to llvm::dbgs(), labeled with its index;
// |parentOp| is used to construct the AsmState used for printing. No-op when
// built with NDEBUG.
void dump(Operation *parentOp);

// Verifies that the partition set meets the required conditions: every
// partition verifies, no value is an output of more than one partition, and
// the partitions are in a valid topological order. Emits an error at |loc| on
// failure.
LogicalResult verify(Location loc);

// Sorts all partitions in a topological order (producers of a value before
// the partitions that capture it).
void topologicalSort();
};

//===----------------------------------------------------------------------===//
// Stream partitioning algorithms
//===----------------------------------------------------------------------===//
//
// When these algorithms run all streamable operations have had an affinity
// assigned and are lowered out of tensor form. Some resources may have
// lifetimes associated but most will remain unassigned (`!stream.resource<*>`)
// until after partitioning. Note that partitioned ops may already exist in
// stream.execute regions.
//
// The intent is that we can use the information we have about each operation,
// the resources moving between them, and where they should execute to better
// partition the DAG. This could optimize for reducing memory transfer between
// devices, reducing latency by minimizing cuts, maximizing concurrency by
// separating non-interfering subgraphs, etc.
//
// This is a well-researched area and there are many algorithms to choose from.
// We'll mostly want to focus on ones that are able to handle multiple criteria
// (like memory consumption, compute utilization, available capacity, etc).
//
// See for example:
// dagP: https://github.com/GT-TDAlab/dagP
// Multilevel Algorithms for Acyclic Partitioning of Directed Acyclic Graphs
// https://hal.inria.fr/hal-02306566/document
// METIS: https://github.com/KarypisLab/METIS
// A Fast and High Quality Multilevel Scheme for Partitioning Irregular
// Graphs
// http://glaros.dtc.umn.edu/gkhome/metis/metis/publications
// SCOTCH: https://www.labri.fr/perso/pelegrin/scotch/
// Contributions to Parallel Multilevel Graph Partitioning
// https://www.labri.fr/perso/pelegrin/papers/hdr.pdf
// Zoltan: https://cs.sandia.gov/Zoltan/
// https://cs.sandia.gov/Zoltan/Zoltan_pubs.html
// https://cs.sandia.gov/Zoltan/papers/zoltan_tutorial_dagstuhl09.pdf
//
// And some good papers/overviews:
// - Edge Partitioning of Large Graphs
// https://tel.archives-ouvertes.fr/tel-01956979/document
//

// Partitions the ops in |block| such that all streamable ops are in one or more
// partitions (with >1 implying duplication). Partitions may contain
// non-streamable ops if it is safe to do so (such as std arithmetic). Not all
// ops in the block will be covered by a partition.
PartitionSet partitionStreamableOps(IREE::Stream::PartitioningConfigAttr config,
Block *block);
// Partitions the ops in |block| for concurrency scheduling. Currently this
// always uses partitionRegionConcurrencyReference, which is intended for use
// within partitioned streams to produce waves of concurrently executable work.
PartitionSet partitionRegionConcurrency(
IREE::Stream::PartitioningConfigAttr config, Block *block);

//===----------------------------------------------------------------------===//
// Reference partitioning
//===----------------------------------------------------------------------===//

// Naive clustering based solely on correctness with no cost model or weighting.
// Produces the largest possible streams for any given block. Unsatisfactory.
// This is currently the only algorithm partitionStreamableOps dispatches to.
PartitionSet partitionStreamableOpsReference(
IREE::Stream::PartitioningConfigAttr config, Block *block);

// Similarly poor algorithm to partitionStreamableOpsReference but for use
// within partitioned streams to produce waves of concurrently executable work.
// This is currently the only algorithm partitionRegionConcurrency dispatches
// to.
PartitionSet partitionRegionConcurrencyReference(
IREE::Stream::PartitioningConfigAttr config, Block *block);

} // namespace Stream
} // namespace IREE
} // namespace iree_compiler
} // namespace mlir

#endif // IREE_COMPILER_DIALECT_STREAM_ANALYSIS_PARTITIONING_H_
Loading

0 comments on commit b585156

Please sign in to comment.