Adding -iree-stream-schedule-execution + -concurrency passes. (#7549)

The passes themselves are rather simple and call into a partitioning routine that performs the real work, with the intent being that we can have many such routines and specify which one to use based on scoped attributes in the IR (kind of like lowering configs in codegen). Today there's just a reference implementation that does a single level of concurrency. The hope is that someone who actually knows how to write a good partitioning algorithm can contribute something better, but it's at least no worse than what we have today and better than simple ML systems that have no concurrency. Though the passes are similar, they operate at different scopes and will have different partitioning algorithms. I thought about trying to unify them; however, keeping them separate allows us to do things like use a more complex execution partitioning pass while using the same generic concurrency scheduling, etc. - including disabling the concurrency scheduling entirely for debugging or environments where there may be no benefits to such scheduling (single core execution, etc.). It's easy enough to reason about how they could be unified that I wanted to err on the side of flexibility until we have an owner and at least one or two more algorithms we can use to feel out the shape of things. A benefit of the independent execution and concurrency partitioning is that debugging either is much simpler (and there's pretty good `-debug` output). Since the concurrency scheduling operates only within the scheduled execution regions, there's no need to worry about host/device interactions or the parent op CFG.
iree-org · Nov 8, 2021 · b585156 · b585156
1 parent bbb9eab
commit b585156
Show file tree

Hide file tree

Showing 16 changed files with 1,500 additions and 0 deletions.
diff --git a/iree/compiler/Dialect/Stream/Analysis/BUILD b/iree/compiler/Dialect/Stream/Analysis/BUILD
@@ -13,9 +13,12 @@ package(
 cc_library(
     name = "Analysis",
     srcs = [
+        "Partitioning.cpp",
+        "Partitioning/ReferencePartitioning.cpp",
         "ResourceUsage.cpp",
     ],
     hdrs = [
+        "Partitioning.h",
         "ResourceUsage.h",
     ],
     deps = [

diff --git a/iree/compiler/Dialect/Stream/Analysis/CMakeLists.txt b/iree/compiler/Dialect/Stream/Analysis/CMakeLists.txt
@@ -14,8 +14,11 @@ iree_cc_library(
   NAME
     Analysis
   HDRS
+    "Partitioning.h"
     "ResourceUsage.h"
   SRCS
+    "Partitioning.cpp"
+    "Partitioning/ReferencePartitioning.cpp"
     "ResourceUsage.cpp"
   DEPS
     LLVMSupport

diff --git a/iree/compiler/Dialect/Stream/Analysis/Partitioning.cpp b/iree/compiler/Dialect/Stream/Analysis/Partitioning.cpp
@@ -0,0 +1,183 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Dialect/Stream/Analysis/Partitioning.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/IR/AsmState.h"
+#include "mlir/IR/PatternMatch.h"
+
+#define DEBUG_TYPE "iree-stream-partitioning"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace Stream {
+
+#ifndef NDEBUG
+
+// Writes the captured inputs, escaping outputs, and covered ops of |partition|
+// to the LLVM debug stream. |state| is reused for all value/op printing so
+// that SSA names stay consistent across the whole dump.
+void dumpPartition(Partition &partition, AsmState &state) {
+  auto &os = llvm::dbgs();
+  // Shared printer for both the input and output value lists.
+  auto printValue = [&](Value value) { value.printAsOperand(os, state); };
+
+  os << " INS:\n  ";
+  llvm::interleaveComma(partition.ins, os, printValue);
+
+  os << "\n OUTS:\n  ";
+  llvm::interleaveComma(partition.outs, os, printValue);
+
+  os << "\n OPS:\n";
+  for (Operation *coveredOp : partition.ops) {
+    os << "  ";
+    coveredOp->print(os, state);
+    os << "\n";
+  }
+}
+
+// Dumps this partition (inputs, outputs, and ops) to the LLVM debug stream.
+// |parentOp| is used to construct the AsmState that the values and ops are
+// printed with.
+void Partition::dump(Operation *parentOp) {
+  AsmState state(parentOp);
+  dumpPartition(*this, state);
+}
+
+// Dumps every partition in the set to the LLVM debug stream, each preceded by
+// a `PARTITION[i]:` header carrying its position in the set. A single AsmState
+// built from |parentOp| is shared by all partitions.
+void PartitionSet::dump(Operation *parentOp) {
+  AsmState state(parentOp);
+  size_t ordinal = 0;
+  for (Partition &partition : partitions) {
+    llvm::dbgs() << "PARTITION[" << ordinal << "]:\n";
+    dumpPartition(partition, state);
+    ++ordinal;
+  }
+}
+
+#else
+// Dumping is compiled out when NDEBUG is defined; these definitions are
+// intentionally empty so that callers do not need their own #ifdef guards.
+void Partition::dump(Operation *parentOp) {}
+void PartitionSet::dump(Operation *parentOp) {}
+#endif  // !NDEBUG
+
+LogicalResult Partition::verify(Location loc) {
+  // Ensure values are defined either by other ops in the partition or are
+  // declared as inputs.
+  SetVector<Value> defValues;
+  for (auto *op : ops) {
+    for (auto result : op->getResults()) {
+      defValues.insert(result);
+    }
+  }
+  for (auto *op : ops) {
+    for (auto operand : op->getOperands()) {
+      if (!ins.contains(operand) && !defValues.contains(operand)) {
+        return mlir::emitError(loc)
+               << "operand not declared in partition inputs or by an op within "
+                  "the partition";
+      }
+    }
+  }
+
+  // Ensure all outputs come from ops in the partition (or are pass-through
+  // operands, though those are silly).
+  for (auto out : outs) {
+    if (!ins.contains(out) && !defValues.contains(out)) {
+      return mlir::emitError(loc) << "output not defined by an op within the "
+                                     "partition (or captured)";
+    }
+  }
+
+  return success();
+}
+
+LogicalResult PartitionSet::verify(Location loc) {
+  // Verify each partition is consistent.
+  for (auto &partition : partitions) {
+    if (failed(partition.verify(loc))) return failure();
+  }
+
+  // Ensure no partitions duplicate escaping values as we need a single def to
+  // remap the value in the parent block.
+  SetVector<Value> outs;
+  for (auto &partition : partitions) {
+    for (auto out : partition.outs) {
+      if (outs.contains(out)) {
+        return mlir::emitError(loc)
+               << "duplicate value found in partition set outputs";
+      }
+      outs.insert(out);
+    }
+  }
+
+  // Ensure a correct topological order of partitions. This only checks the
+  // order of the partitions and not any ops that aren't covered. We do this
+  // by walking backwards and checking that no partition captures values
+  // escaping any partitions after it.
+  SetVector<Value> declaredBelow;
+  for (auto &partition : llvm::reverse(partitions)) {
+    for (auto in : partition.ins) {
+      if (declaredBelow.contains(in)) {
+        return mlir::emitError(loc) << "partition set out of order; value "
+                                       "captured declared as escaping below";
+      }
+    }
+    for (auto out : partition.outs) {
+      declaredBelow.insert(out);
+    }
+  }
+
+  return success();
+}
+
+// Reorders |partitions| in place so that every partition producing a value
+// appears before all partitions capturing that value.
+//
+// The order is computed with a depth-first post-order walk over the
+// producer->consumer edges (an edge exists from A to B when a value in A.outs
+// is in B.ins); the reversed post-order is a topological order. Partitions are
+// moved, not copied, into their final position.
+void PartitionSet::topologicalSort() {
+  if (partitions.empty()) return;
+
+  // Map each captured value to the partitions that consume it. The map is
+  // fully populated here and must not be mutated during the walk below.
+  DenseMap<Value, SmallVector<Partition *>> consumers;
+  for (auto &partition : partitions) {
+    for (auto in : partition.ins) {
+      consumers[in].push_back(&partition);
+    }
+  }
+
+  SmallVector<Partition *, 16> postorder;
+  postorder.reserve(partitions.size());
+  DenseSet<Partition *> seen;
+  std::function<void(Partition *)> postorderWalk;
+  postorderWalk = [&](Partition *current) {
+    // Mark on entry so each partition is expanded exactly once. This keeps the
+    // walk linear in the number of edges and guarantees termination even when
+    // a partition consumes its own output (pass-through values) or the input
+    // is otherwise cyclic.
+    if (!seen.insert(current).second) return;
+    for (auto out : current->outs) {
+      // Use find() instead of operator[]: operator[] would default-insert an
+      // entry for outputs with no consumers, potentially rehashing the map and
+      // invalidating the consumer lists being iterated by outer frames.
+      auto it = consumers.find(out);
+      if (it == consumers.end()) continue;
+      for (auto *consumer : it->second) {
+        postorderWalk(consumer);
+      }
+    }
+    postorder.push_back(current);
+  };
+  // Roots are visited in the existing partition order so the result is
+  // deterministic for a given input.
+  for (auto &partition : partitions) postorderWalk(&partition);
+
+  // Reverse post-order == topological order (producers before consumers).
+  SmallVector<Partition> sortedSet;
+  sortedSet.reserve(partitions.size());
+  for (auto *partition : llvm::reverse(postorder)) {
+    sortedSet.push_back(std::move(*partition));
+  }
+  partitions = std::move(sortedSet);
+}
+
+// Partitions the ops in |block| using the algorithm selected for |config|.
+// Acts as the dispatch point for algorithm selection; at present |config| is
+// forwarded unchanged to the reference implementation.
+PartitionSet partitionStreamableOps(IREE::Stream::PartitioningConfigAttr config,
+                                    Block *block) {
+  // Only one algorithm today.
+  return partitionStreamableOpsReference(config, block);
+}
+
+// Partitions the ops in |block| for concurrency using the algorithm selected
+// for |config|. Acts as the dispatch point for algorithm selection; at present
+// |config| is forwarded unchanged to the reference implementation.
+PartitionSet partitionRegionConcurrency(
+    IREE::Stream::PartitioningConfigAttr config, Block *block) {
+  // Only one algorithm today.
+  return partitionRegionConcurrencyReference(config, block);
+}
+
+}  // namespace Stream
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
diff --git a/iree/compiler/Dialect/Stream/Analysis/Partitioning.h b/iree/compiler/Dialect/Stream/Analysis/Partitioning.h
@@ -0,0 +1,131 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_DIALECT_STREAM_ANALYSIS_PARTITIONING_H_
+#define IREE_COMPILER_DIALECT_STREAM_ANALYSIS_PARTITIONING_H_
+
+#include "iree/compiler/Dialect/Stream/IR/StreamTypes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/Support/LLVM.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace Stream {
+
+//===----------------------------------------------------------------------===//
+// Data structures
+//===----------------------------------------------------------------------===//
+
+// A single slice of ops together with the SSA values crossing its boundary.
+struct Partition {
+  // SSA values defined outside of the partition.
+  // All values not defined by ops in the partition must be declared.
+  // Multiple partitions may capture the same value.
+  SetVector<Value> ins;
+  // SSA values defined by the partition with uses outside.
+  // All values used by ops outside of the partition must be declared.
+  // Only one partition may produce a new value.
+  SetVector<Value> outs;
+  // All ops covered by the partition. May contain ops that exist in other
+  // partitions in cases where the op is to be duplicated. Not all ops are
+  // streamable (such as constants and arithmetic).
+  SetVector<Operation *> ops;
+
+  // Prints the partition's ins, outs, and ops to the LLVM debug stream.
+  // |parentOp| is used to build the AsmState the values and ops are printed
+  // with. This is a no-op when NDEBUG is defined.
+  void dump(Operation *parentOp);
+
+  // Verifies that the partition meets the required conditions: every operand
+  // of every op is either in |ins| or defined by an op in |ops|, and every
+  // value in |outs| is either defined by an op in |ops| or is in |ins|.
+  // Emits an error at |loc| and returns failure on the first violation.
+  LogicalResult verify(Location loc);
+};
+
+// A set of all partitions.
+struct PartitionSet {
+  // All partitions. The order is unspecified until topologicalSort() has been
+  // called; verify() requires a topological order.
+  SmallVector<Partition> partitions;
+
+  // Total number of partitions in the set.
+  size_t size() const { return partitions.size(); }
+  // Returns true if the set is empty (no streamable ops).
+  bool empty() const { return partitions.empty(); }
+
+  // Prints each partition, prefixed with its index in the set, to the LLVM
+  // debug stream. |parentOp| is used to build the AsmState used for printing.
+  // This is a no-op when NDEBUG is defined.
+  void dump(Operation *parentOp);
+
+  // Verifies that the partition set meets the required conditions: each
+  // partition verifies individually, no value appears in the outputs of more
+  // than one partition, and no partition captures a value that escapes from a
+  // partition later in the set. Emits an error at |loc| and returns failure on
+  // the first violation.
+  LogicalResult verify(Location loc);
+
+  // Sorts all partitions in a topological order such that partitions producing
+  // a value precede the partitions capturing it. Reorders |partitions| in
+  // place; any previously obtained pointers/references to elements are
+  // invalidated.
+  void topologicalSort();
+};
+
+//===----------------------------------------------------------------------===//
+// Stream partitioning algorithms
+//===----------------------------------------------------------------------===//
+//
+// When these algorithms run all streamable operations have had an affinity
+// assigned and are lowered out of tensor form. Some resources may have
+// lifetimes associated but most will remain unassigned (`!stream.resource<*>`)
+// until after partitioning. Note that partitioned ops may already exist in
+// stream.execute regions.
+//
+// The intent is that we can use the information we have about each operation,
+// the resources moving between them, and where they should execute to better
+// partition the DAG. This could optimize for reducing memory transfer between
+// devices, reducing latency by minimizing cuts, maximizing concurrency by
+// separating non-interfering subgraphs, etc.
+//
+// This is a well-researched area and there are many algorithms to choose from.
+// We'll mostly want to focus on ones that are able to handle multiple criteria
+// (like memory consumption, compute utilization, available capacity, etc).
+//
+// See for example:
+//   dagP: https://github.com/GT-TDAlab/dagP
+//     Multilevel Algorithms for Acyclic Partitioning of Directed Acyclic Graphs
+//     https://hal.inria.fr/hal-02306566/document
+//  METIS: https://github.com/KarypisLab/METIS
+//     A Fast and High Quality Multilevel Scheme for Partitioning Irregular
+//     Graphs
+//     http://glaros.dtc.umn.edu/gkhome/metis/metis/publications
+// SCOTCH: https://www.labri.fr/perso/pelegrin/scotch/
+//     Contributions to Parallel Multilevel Graph Partitioning
+//     https://www.labri.fr/perso/pelegrin/papers/hdr.pdf
+// Zoltan: https://cs.sandia.gov/Zoltan/
+//     https://cs.sandia.gov/Zoltan/Zoltan_pubs.html
+//     https://cs.sandia.gov/Zoltan/papers/zoltan_tutorial_dagstuhl09.pdf
+//
+// And some good papers/overviews:
+// - Edge Partitioning of Large Graphs
+//   https://tel.archives-ouvertes.fr/tel-01956979/document
+//
+
+// Partitions the ops in |block| such that all streamable ops are in one or more
+// partitions (with >1 implying duplication). Partitions may contain
+// non-streamable ops if it is safe to do so (such as std arithmetic). Not all
+// ops in the block will be covered by a partition.
+// |config| selects the partitioning algorithm to use.
+PartitionSet partitionStreamableOps(IREE::Stream::PartitioningConfigAttr config,
+                                    Block *block);
+// Partitions the ops in |block| for concurrent execution using the algorithm
+// selected by |config|.
+// NOTE(review): presumably |block| is the body of an already-partitioned
+// execution region, as with partitionRegionConcurrencyReference - confirm.
+PartitionSet partitionRegionConcurrency(
+    IREE::Stream::PartitioningConfigAttr config, Block *block);
+
+//===----------------------------------------------------------------------===//
+// Reference partitioning
+//===----------------------------------------------------------------------===//
+
+// Naive clustering based solely on correctness with no cost model or weighting.
+// Produces the largest possible streams for any given block. Unsatisfactory.
+// Currently the only algorithm partitionStreamableOps dispatches to.
+PartitionSet partitionStreamableOpsReference(
+    IREE::Stream::PartitioningConfigAttr config, Block *block);
+
+// Similarly poor algorithm to partitionStreamableOpsReference but for use
+// within partitioned streams to produce waves of concurrently executable work.
+// Currently the only algorithm partitionRegionConcurrency dispatches to.
+PartitionSet partitionRegionConcurrencyReference(
+    IREE::Stream::PartitioningConfigAttr config, Block *block);
+
+}  // namespace Stream
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_COMPILER_DIALECT_STREAM_ANALYSIS_PARTITIONING_H_