From ed9a54a5096620acd68c6766295afb468a49eab8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 3 Mar 2022 14:34:13 -0600 Subject: [PATCH] implement vectorization fallback mechanisms --- loopy/__init__.py | 5 +++-- loopy/codegen/__init__.py | 42 ++++++++++++++++++++++++++++++--------- loopy/codegen/loop.py | 32 +++++++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 62529016e..9d2177ef0 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -43,7 +43,8 @@ AddressSpace, TemporaryVariable, SubstitutionRule, - CallMangleInfo) + CallMangleInfo, + VectorizeTag) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.translation_unit import ( @@ -190,7 +191,7 @@ "AddressSpace", "TemporaryVariable", "SubstitutionRule", - "CallMangleInfo", + "CallMangleInfo", "VectorizeTag", "make_kernel", "UniqueName", "make_function", diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index a700b153a..9e2182a0f 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -281,23 +281,47 @@ def try_vectorized(self, what, func): return self.unvectorize(func) def unvectorize(self, func): + from loopy.codegen.result import (merge_codegen_results, + CodeGenerationResult) + from loopy.target import VectorizationFallback + vinf = self.vectorization_info assert vinf is not None result = [] novec_self = self.copy(vectorization_info=None) - for i in range(vinf.length): - idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i - new_codegen_state = novec_self.fix(vinf.iname, idx_aff) - generated = func(new_codegen_state) - - if isinstance(generated, list): - result.extend(generated) + if self.target.vectorization_fallback == VectorizationFallback.UNROLL: + for i in range(vinf.length): + idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i + new_codegen_state = novec_self.fix(vinf.iname, idx_aff) + generated = func(new_codegen_state) + + if isinstance(generated, list): + result.extend(generated) + else: + result.append(generated) + elif self.target.vectorization_fallback == VectorizationFallback.OMP_SIMD: + astb = self.ast_builder + inner = func(novec_self) + if isinstance(inner, list): + inner = merge_codegen_results(novec_self, inner) + assert isinstance(inner, CodeGenerationResult) + if isinstance(inner.current_ast(novec_self), + astb.ast_comment_class): + # loop body is a comment => do not emit the loop + loop_cgr = inner else: - result.append(generated) + result.append(astb.emit_pragma("omp simd")) + loop_cgr = inner.with_new_ast( + novec_self, + astb.emit_sequential_loop( + novec_self, vinf.iname, self.kernel.index_dtype, + 0, vinf.length-1, inner.current_ast(novec_self))) + result.append(loop_cgr) + else: + raise NotImplementedError(self.target.vectorization_fallback) - from loopy.codegen.result import merge_codegen_results return merge_codegen_results(self, result) @property diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index a0d22330f..a2ea89c9b 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -160,10 +160,25 @@ def generate_unroll_loop(codegen_state, sched_index): # {{{ vectorized loops +def raise_for_unvectorizable_loop(codegen_state, sched_index): + kernel = codegen_state.kernel + raise RuntimeError(f"Cannot vectorize {kernel.schedule[sched_index]}") + + def generate_vectorize_loop(codegen_state, sched_index): + from loopy.kernel.data import VectorizeTag + from loopy.target import VectorizationFallback kernel = codegen_state.kernel iname = kernel.linearization[sched_index].iname + vec_tag, = kernel.inames[iname].tags_of_type(VectorizeTag) + + if kernel.target.vectorization_fallback == VectorizationFallback.UNROLL: + fallback_codegen_routine = generate_unroll_loop + elif kernel.target.vectorization_fallback == VectorizationFallback.OMP_SIMD: + fallback_codegen_routine = generate_openmp_simd_loop + else: + raise NotImplementedError(kernel.target.vectorization_fallback) bounds = kernel.get_iname_bounds(iname, constants_only=True) @@ -177,7 +192,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_upper_not_const", "upper bound for vectorized loop '%s' is not a constant, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return fallback_codegen_routine(codegen_state, sched_index) length = int(pw_aff_to_expr(length_aff)) @@ -192,7 +207,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_lower_not_0", "lower bound for vectorized loop '%s' is not zero, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return fallback_codegen_routine(codegen_state, sched_index) # {{{ 'implement' vectorization bounds @@ -484,4 +499,17 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): # }}} + +# {{{ omp simd loop + +def generate_openmp_simd_loop(codegen_state, sched_index): + return merge_codegen_results( + codegen_state, + [codegen_state.ast_builder.emit_pragma("omp simd"), + generate_sequential_loop_dim_code(codegen_state, + sched_index)]) + +# }}} + + # vim: foldmethod=marker