Should GuardWithIf use likely, or likely_if_innermost #8091

abadams · 2024-02-12T23:05:19Z

abadams
Feb 12, 2024
Maintainer

This was brought up in several dev meetings. I finally did a large experiment to try to settle the question. The surprising answer is that switching it to likely_if_innermost increased code size and runtime on average (by 5% and 3%, respectively).

To understand why, it's helpful to consider a small example with typical GuardWithIf usage:

#include "Halide.h"
using namespace Halide;

// Minimal example of typical GuardWithIf usage in which the split factor for
// y is not a constant: it is computed from the output buffer's height.
int main() {
    Func f{"f"};
    Var x{"x"}, y{"y"}, xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};

    f(x, y) = x + y;

    // Split factor for y: the output height divided by 16, rounded up.
    Expr slice_size = (f.output_buffer().height() + 15) / 16;

    // Split x by 8 and y by slice_size, both with a GuardWithIf tail, then
    // vectorize the inner x loop (xi) and parallelize the outer y loop (yo).
    f.split(x, xo, xi, 8, TailStrategy::GuardWithIf)
        .split(y, yo, yi, slice_size, TailStrategy::GuardWithIf)
        .vectorize(xi)
        .parallel(yo);

    // Realize a 100x100 output.
    f.realize({100, 100});

    return 0;
}

Before loop partitioning it looks like this:

  parallel (f.s0.y.yo, 0, f.s0.y.yo.loop_extent) {
   for (f.s0.y.yi, 0, (f.extent.1 + 15)/16) {
    if ((uint1)likely(((((((f.extent.1 + 15)/16)*f.s0.y.yo) + f.min.1) + f.s0.y.yi) + 1) <= (f.extent.1 + f.min.1))) {
     for (f.s0.x.xo, 0, (f.extent.0 + 7)/8) {
      if ((uint1)likely(((f.s0.x.xo*8) + 8) <= f.extent.0)) {
       f[ramp((((((((f.extent.1 + 15)/16)*f.s0.y.yo) + f.min.1) + f.s0.y.yi)*f.stride.1) + ((f.s0.x.xo*8) + f.min.0)) - ((f.min.1*f.stride.1) + f.min.0), 1, 8)] = ramp(((f.s0.x.xo*8) + f.min.0) + (((((f.extent.1 + 15)/16)*f.s0.y.yo) + f.min.1) + f.s0.y.yi), 1, 8)
      } else {
       predicate ((uint1x8)likely(ramp(((f.s0.x.xo*8) + f.min.0) + 1, 1, 8) <= x8(f.extent.0 + f.min.0)))
        f[ramp((((((((f.extent.1 + 15)/16)*f.s0.y.yo) + f.min.1) + f.s0.y.yi)*f.stride.1) + ((f.s0.x.xo*8) + f.min.0)) - ((f.min.1*f.stride.1) + f.min.0), 1, 8)] = ramp(((f.s0.x.xo*8) + f.min.0) + (((((f.extent.1 + 15)/16)*f.s0.y.yo) + f.min.1) + f.s0.y.yi), 1, 8)
      }
     }
    }
   }
  }

Note that the 'likely' in the outer if has no else case. Suppose the loop is partitioned so that every iteration where the condition is true lands in the steady state and every iteration where it is false lands in the epilogue. If that partition is perfect, the epilogue is a no-op, because there is no else branch to run, so you get this:

  parallel (f.s0.y.yo, 0, f.s0.y.yo.loop_extent) {
   let f.s0.y.yi.epilogue.s = min((f.extent.1 + 15)/16, f.extent.1 - (((f.extent.1 + 15)/16)*f.s0.y.yo))
   for (f.s0.y.yi, 0, max(f.s0.y.yi.epilogue.s, 0)) {
    for (f.s0.x.xo, 0, f.extent.0/8) {
     f[ramp((((((((f.extent.1 + 15)/16)*f.s0.y.yo) + f.min.1) + f.s0.y.yi)*f.stride.1) + ((f.s0.x.xo*8) + f.min.0)) - ((f.min.1*f.stride.1) + f.min.0), 1, 8)] = ramp(((f.s0.x.xo*8) + f.min.0) + (((((f.extent.1 + 15)/16)*f.s0.y.yo) + f.min.1) + f.s0.y.yi), 1, 8)
    }
    if ((f.extent.0 % 8) != 0) {
     predicate (ramp((((f.extent.0/8)*8) + f.min.0) + 1, 1, 8) <= x8(f.extent.0 + f.min.0))
      f[ramp((((((((f.extent.1 + 15)/16)*f.s0.y.yo) + f.min.1) + f.s0.y.yi)*f.stride.1) + (((f.extent.0/8)*8) + f.min.0)) - ((f.min.1*f.stride.1) + f.min.0), 1, 8)] = ramp((((f.extent.0/8)*8) + f.min.0) + (((((f.extent.1 + 15)/16)*f.s0.y.yo) + f.min.1) + f.s0.y.yi), 1, 8)
    }
   }
  }

Loop partitioning as directed by that likely has simply reduced the extent of the yi loop, without increasing code size. It's basically the same code as if you had said .never_partition(y).

So with this kind of scheduling idiom, using likely doesn't hurt, and there are cases in the codebase I'm looking at where it helps (for reasons that aren't entirely clear to me).

But now consider a case where the split factor is a constant:

#include "Halide.h"
using namespace Halide;

// Example in which the split factor for y is a constant (8) rather than an
// expression computed from the output buffer.
int main() {
    Func f{"f"};
    Var x{"x"}, y{"y"}, xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};

    f(x, y) = x + y;

    // Constant split factor for y.
    Expr slice_size = 8;

    // Split x by 8 and y by slice_size, both with a GuardWithIf tail, then
    // vectorize the inner x loop (xi) and parallelize the outer y loop (yo).
    f.split(x, xo, xi, 8, TailStrategy::GuardWithIf)
        .split(y, yo, yi, slice_size, TailStrategy::GuardWithIf)
        .vectorize(xi)
        .parallel(yo);

    // Realize a 100x100 output.
    f.realize({100, 100});

    return 0;
}

Before loop partitioning we have:

  parallel (f.s0.y.yo, 0, (f.extent.1 + 7)/8) {
   for (f.s0.y.yi, 0, 8) {
    if ((uint1)likely(((((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi) + 1) <= (f.extent.1 + f.min.1))) {
     for (f.s0.x.xo, 0, (f.extent.0 + 7)/8) {
      if ((uint1)likely(((f.s0.x.xo*8) + 8) <= f.extent.0)) {
       f[ramp((((((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi)*f.stride.1) + ((f.s0.x.xo*8) + f.min.0)) - ((f.min.1*f.stride.1) + f.min.0), 1, 8)] = ramp(((f.s0.x.xo*8) + f.min.0) + (((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi), 1, 8)
      } else {
       predicate ((uint1x8)likely(ramp(((f.s0.x.xo*8) + f.min.0) + 1, 1, 8) <= x8(f.extent.0 + f.min.0)))
        f[ramp((((((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi)*f.stride.1) + ((f.s0.x.xo*8) + f.min.0)) - ((f.min.1*f.stride.1) + f.min.0), 1, 8)] = ramp(((f.s0.x.xo*8) + f.min.0) + (((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi), 1, 8)
      }
     }
    }
   }
  }

It's much the same. After loop partitioning we get:

  parallel (f.s0.y.yo, 0, (f.extent.1 + 7)/8) {
   if (f.s0.y.yo < (f.extent.1/8)) {
    for (f.s0.y.yi, 0, 8) {
     for (f.s0.x.xo, 0, f.extent.0/8) {
      f[ramp((((((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi)*f.stride.1) + ((f.s0.x.xo*8) + f.min.0)) - ((f.min.1*f.stride.1) + f.min.0), 1, 8)] = ramp(((f.s0.x.xo*8) + f.min.0) + (((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi), 1, 8)
     }
     if ((f.extent.0 % 8) != 0) {
      predicate (ramp((((f.extent.0/8)*8) + f.min.0) + 1, 1, 8) <= x8(f.extent.0 + f.min.0))
       f[ramp((((((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi)*f.stride.1) + (((f.extent.0/8)*8) + f.min.0)) - ((f.min.1*f.stride.1) + f.min.0), 1, 8)] = ramp((((f.extent.0/8)*8) + f.min.0) + (((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi), 1, 8)
     }
    }
   } else {
    for (f.s0.y.yi, 0, 8) {
     if (((((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi) + 1) <= (f.extent.1 + f.min.1)) {
      for (f.s0.x.xo, 0, (f.extent.0 + 7)/8) {
       if (((f.s0.x.xo*8) + 8) <= f.extent.0) {
        f[ramp((((((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi)*f.stride.1) + ((f.s0.x.xo*8) + f.min.0)) - ((f.min.1*f.stride.1) + f.min.0), 1, 8)] = ramp(((f.s0.x.xo*8) + f.min.0) + (((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi), 1, 8)
       } else {
        predicate (ramp(((f.s0.x.xo*8) + f.min.0) + 1, 1, 8) <= x8(f.extent.0 + f.min.0))
         f[ramp((((((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi)*f.stride.1) + ((f.s0.x.xo*8) + f.min.0)) - ((f.min.1*f.stride.1) + f.min.0), 1, 8)] = ramp(((f.s0.x.xo*8) + f.min.0) + (((f.s0.y.yo*8) + f.min.1) + f.s0.y.yi), 1, 8)
       }
      }
     }
    }
   }
  }

Uh oh.

Digging into this more, it's happening because we try to partition loops from the outside in. In the more complicated case, first we try to partition the loop over yo. This fails because the IR is complicated, so we try to partition the loop over yi. This succeeds, and is what we want in this instance. In the second case we try to partition the loop over yo, and succeed. So we get a steady state group of 8 scanlines, and a tail group of 8 scanlines with the if still in it, in separate pieces of code.

So our heuristic for deciding which loops to split is not so good in this case. You can manually control it with .partition, and indeed if you say .never_partition(yo) you get good code.

We partition loops from the outside in because there are other cases where it's much better. If you're doing a blur on an input with a boundary condition, you probably want to partition the loop over output vectors, not the loop over the kernel taps. If you're doing something on a GPU, you probably want to partition the GPU block loop, so you get the same control flow within each warp, as opposed to partitioning the thread loop, which would lead to warp divergence.

The next questions are:

1. Why am I seeing lots of cases where likely_if_innermost is worse?
2. If we keep it as likely, can the heuristic be improved?

abadams · 2024-02-13T21:26:44Z

abadams
Feb 13, 2024
Maintainer Author

The effect I was seeing where likely_if_innermost was substantially worse in some cases was almost entirely a quirk of how NoAsserts works, which I'll fix shortly. With that fixed, likely vs likely_if_innermost barely matters. Using likely_if_innermost instead of likely increases runtime by 1.3% and decreases code size by 0.8%. So that answers question 1. We still have the is-this-the-right-heuristic question though.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Should GuardWithIf use likely, or likely_if_innermost #8091

{{title}}

Replies: 1 comment

{{title}}

Select a reply

Should GuardWithIf use likely, or likely_if_innermost #8091

abadams Feb 12, 2024 Maintainer

Replies: 1 comment

abadams Feb 13, 2024 Maintainer Author

abadams
Feb 12, 2024
Maintainer

abadams
Feb 13, 2024
Maintainer Author