Skip to content

Commit

Permalink
[XLA] Add rule in latency hiding scheduler to hold back scheduling in…
Browse files Browse the repository at this point in the history
…structions valuable for selective overlaps.

Add a rule to latency hiding scheduler that holds back scheduling instructions that are valuable for selective overlaps. This rule applies if there are no selective overlaps currently open and there will be overlaps opened in the near future.

PiperOrigin-RevId: 661500850
  • Loading branch information
tensorflower-gardener committed Aug 10, 2024
1 parent 0679a36 commit 75fca3d
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 1 deletion.
74 changes: 74 additions & 0 deletions third_party/xla/xla/service/latency_hiding_scheduler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,15 @@ bool AsyncTracker::ReleasesSelectiveResource(const HloGraphNode* node) const {
});
}

bool AsyncTracker::OccupiesSelectiveResource(const HloGraphNode* node) const {
return absl::c_any_of(
node->GetResources(), [&](const ResourcePair& resource) {
return resource.second == ResourceUsageType::kResourceOccupy &&
GetResourceHazardType(resource.first) ==
ResourceHazardType::kSelective;
});
}

BufferInfoTracker::BufferInfoTracker(
const HloModule* module, const HloAliasAnalysis* alias_analysis,
const HloCostAnalysis::ShapeSizeFunction& shape_size_bytes) {
Expand Down Expand Up @@ -731,6 +740,25 @@ DefaultSchedulerCore::ScheduleCandidate InitializeCandidate(

namespace {

// Computes how close the nearest selective overlap is that `node` could be
// scheduled inside of: the minimum, over every node in `ready_set` other than
// `node` itself, of that node's hop count to its closest selective resource
// occupier. Returns std::numeric_limits<int64_t>::max() when no other ready
// node reports a smaller value (e.g. the ready set holds only `node`).
int64_t GetNumHopsToClosestSelectiveOverlap(
    const DefaultSchedulerCore::ReadyQueueSet& ready_set,
    const HloGraphNode* node) {
  int64_t min_hops = std::numeric_limits<int64_t>::max();
  for (const HloGraphNode* candidate : ready_set) {
    // A node cannot be overlapped with itself, so it is excluded.
    if (candidate != node) {
      const int64_t candidate_hops =
          candidate->GetNumHopsToClosestSelectiveResourceOccupier();
      if (candidate_hops < min_hops) {
        min_hops = candidate_hops;
      }
    }
  }
  return min_hops;
}

// Comparator for the ready set. This class represents the priority policies
// for the nodes in the ready set. The policy can be whatever is appropriate to
// reduce the execution time of the graph or achieve interesting properties
Expand Down Expand Up @@ -1002,6 +1030,31 @@ class ReadySetLt {
return *value;
}
}
// If there are no selective overlaps open currently and there will be
// overlaps opened in the near future, hold off scheduling instructions
// that are valuable for selective overlaps.
if (sched_state_.config.enable_selective_resources &&
sched_state_.selective_resource_releasers.empty()) {
int64_t distance_to_selective_overlap_for_a =
GetNumHopsToClosestSelectiveOverlap(sched_state_.ready_set, a.node);
int64_t distance_to_selective_overlap_for_b =
GetNumHopsToClosestSelectiveOverlap(sched_state_.ready_set, b.node);
// If a is valuable for selective overlap and there is a selective
// overlap in the near future a can be scheduled inside, hold off
// scheduling a and schedule b instead. Same logic applies in reverse.
int64_t max_distance =
sched_state_.config.max_hops_to_closest_selective_overlap;
if (auto value = DefaultSchedulerCore::ChooseBestCandidate(
(a.node->GetValuableForSelectiveOverlap() &&
distance_to_selective_overlap_for_a <= max_distance),
b,
(b.node->GetValuableForSelectiveOverlap() &&
distance_to_selective_overlap_for_b <= max_distance),
a, "kNotValuableForSelectiveOverlap")) {
return *value;
}
}

if (sched_state_.config.aggressive_scheduling_policies) {
// Favor nodes that unlock other nodes to be scheduled if possible.
// This makes us more flexible in what we can use in scheduling.
Expand Down Expand Up @@ -1693,6 +1746,8 @@ HloScheduleGraph::HloScheduleGraph(
new_node_it->second->GetResources());
new_node_it->second->releases_selective_resource_ =
async_tracker->ReleasesSelectiveResource(new_node_it->second.get());
new_node_it->second->occupies_selective_resource_ =
async_tracker->OccupiesSelectiveResource(new_node_it->second.get());
// Gather while instructions for subsequent send-done dependency checks.
if (instr->opcode() == HloOpcode::kWhile) {
while_instrs.push_back(instr);
Expand Down Expand Up @@ -1900,6 +1955,25 @@ void HloScheduleGraph::InitializeGraphAnalysis(
while (!stack.empty()) {
auto* node = stack.back();
stack.pop_back();
// If a node occupies a selective resource, it is the closest selective
// resource occupier to itself and is 0 hops away. Otherwise, the num hops
// to closest selective resource occupier is the minimum of that of all
// predecessors plus 1.
if (async_tracker->OccupiesSelectiveResource(node)) {
node->num_hops_to_closest_selective_resource_occupier_ = 0;
} else {
int64_t closest_predecessor_distance =
std::numeric_limits<int64_t>::max();
for (auto& pred : node->GetPredecessors()) {
closest_predecessor_distance = std::min(
closest_predecessor_distance,
pred.Target().num_hops_to_closest_selective_resource_occupier_);
}
if (closest_predecessor_distance != std::numeric_limits<int64_t>::max()) {
node->num_hops_to_closest_selective_resource_occupier_ =
closest_predecessor_distance + 1;
}
}
if (async_tracker->IsSupportedAsyncDone(node->GetInstr())) {
for (auto& pred : node->GetPredecessors()) {
node->SetAsyncDepth(
Expand Down
21 changes: 20 additions & 1 deletion third_party/xla/xla/service/latency_hiding_scheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ struct SchedulerConfig {
bool resource_serializing = false;
bool depth_based_memory_pressure_reduction = false;
bool enable_selective_resources = false;
int64_t max_hops_to_closest_selective_overlap = 0;
int64_t rerun = 0;
};

Expand Down Expand Up @@ -284,6 +285,9 @@ class AsyncTracker {
// Returns whether the provided node releases a selective resource.
bool ReleasesSelectiveResource(const HloGraphNode* node) const;

// Returns whether the provided node occupies a selective resource.
bool OccupiesSelectiveResource(const HloGraphNode* node) const;

// Returns the CanonicalAsyncOp for `hlo` by forwarding it to the
// get_canonical_async_op_ callable held by this tracker.
inline CanonicalAsyncOp GetCanonicalAsyncOp(const HloInstruction& hlo) const {
return get_canonical_async_op_(hlo);
}
Expand Down Expand Up @@ -386,6 +390,17 @@ class HloGraphNode {
// Returns the stored releases_selective_resource_ flag. The flag is filled in
// from AsyncTracker::ReleasesSelectiveResource() while the HloScheduleGraph is
// constructed and defaults to false.
bool ReleasesSelectiveResource() const {
return releases_selective_resource_;
}
// Returns the stored occupies_selective_resource_ flag. The flag is filled in
// from AsyncTracker::OccupiesSelectiveResource() while the HloScheduleGraph is
// constructed and defaults to false.
bool OccupiesSelectiveResource() const {
return occupies_selective_resource_;
}
// Returns the number of hops from this node to its closest selective resource
// occupier. The value is 0 for a node that itself occupies a selective
// resource, and stays at std::numeric_limits<int64_t>::max() (the member's
// initial value) when graph analysis found no occupier among the predecessors.
int64_t GetNumHopsToClosestSelectiveResourceOccupier() const {
return num_hops_to_closest_selective_resource_occupier_;
}
// Overwrites the stored hop count to the closest selective resource occupier
// with the given value. No validation is performed on the argument.
void SetNumHopsToClosestSelectiveResourceOccupier(
int64_t num_hops_to_closest_selective_resource_occupier) {
num_hops_to_closest_selective_resource_occupier_ =
num_hops_to_closest_selective_resource_occupier;
}

// Returns the resources_ member of this node.
// NOTE(review): the return type is ResourcesVector by value, so each call
// presumably copies the container — confirm before using it in hot loops.
ResourcesVector GetResources() const { return resources_; }
bool DoesOccupyAnyResource() const {
Expand Down Expand Up @@ -525,6 +540,11 @@ class HloGraphNode {
bool valuable_for_selective_overlap_ = true;
// Whether this node releases a selective resource.
bool releases_selective_resource_ = false;
// Whether this node occupies a selective resource.
bool occupies_selective_resource_ = false;
// Num hops to closest selective resource occupier.
int64_t num_hops_to_closest_selective_resource_occupier_ =
std::numeric_limits<int64_t>::max();
};

// Schedule graph that can be used to drive scheduling
Expand Down Expand Up @@ -920,7 +940,6 @@ class DefaultSchedulerCore : public SchedulerCore {
virtual absl::StatusOr<HloGraphNode*> FindAndExtractBestNodeAvailable(
SchedulingState& sched_state,
DefaultSchedulerCore::ShouldSkipNodeFunction should_skip_node);
bool DoesNodeReleaseSelectiveResource(const HloGraphNode* node) const;
void DumpLatencyHidingSchedule(
const HloComputation* computation, const HloScheduleGraph& schedule_graph,
const std::vector<HloInstruction*>& instructions,
Expand Down

0 comments on commit 75fca3d

Please sign in to comment.