diff --git a/examples/python/call-external.py b/examples/python/call-external.py index 49b25d6e0..feb00e029 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -22,7 +22,7 @@ def with_types(self, arg_id_to_dtype, callables_table): if vec_dtype.numpy_dtype == np.float32: name_in_target = "cblas_sgemv" - elif vec_dtype. numpy_dtype == np.float64: + elif vec_dtype.numpy_dtype == np.float64: name_in_target = "cblas_dgemv" else: raise LoopyError("GEMV is only supported for float32 and float64 " @@ -47,30 +47,37 @@ def with_descrs(self, arg_id_to_descr, callables_table): assert mat_descr.shape[0] == res_descr.shape[0] assert len(vec_descr.shape) == len(res_descr.shape) == 1 # handling only the easy case when stride == 1 - assert vec_descr.dim_tags[0].stride == 1 assert mat_descr.dim_tags[1].stride == 1 - assert res_descr.dim_tags[0].stride == 1 return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table def emit_call_insn(self, insn, target, expression_to_code_mapper): from pymbolic import var + from loopy.codegen import UnvectorizableError mat_descr = self.arg_id_to_descr[0] + vec_descr = self.arg_id_to_descr[1] + res_descr = self.arg_id_to_descr[-1] m, n = mat_descr.shape ecm = expression_to_code_mapper + + if ecm.codegen_state.vectorization_info is not None: + raise UnvectorizableError("cannot vectorize BLAS-gemv.") + mat, vec = insn.expression.parameters result, = insn.assignees c_parameters = [var("CblasRowMajor"), var("CblasNoTrans"), m, n, - 1, + 1, # alpha ecm(mat).expr, - 1, + 1, # LDA ecm(vec).expr, - 1, + vec_descr.dim_tags[0].stride, # INCX + 1, # beta ecm(result).expr, - 1] + res_descr.dim_tags[0].stride # INCY + ] return (var(self.name_in_target)(*c_parameters), False # cblas_gemv does not return anything ) @@ -83,17 +90,66 @@ def generate_preambles(self, target): # }}} -n = 10 +def transform_1(knl): + return knl + + +def transform_2(knl): + # A similar transformation is applied to kernels containing + # SLATE + # callables. + knl = lp.split_iname(knl, "e", 4, inner_iname="e_inner", slabs=(0, 1)) + knl = lp.privatize_temporaries_with_inames(knl, "e_inner") + knl = lp.tag_inames(knl, {"e_inner": "vec"}) + if 0: + # Easy codegen exercise, but misses vectorizing certain instructions. + knl = lp.tag_array_axes(knl, "tmp3", "c,vec") + else: + knl = lp.tag_array_axes(knl, "tmp3,tmp2", "c,vec") + return knl + + +def main(): -knl = lp.make_kernel( - "{:}", + knl = lp.make_kernel( + "{[e,i1,i2]: 0<=e tmp3[i2] = 2 * tmp2[i2] + out[e, i2] = tmp3[i2] + end + end + """, + kernel_data=[ + lp.TemporaryVariable("tmp1", + shape=(4, ), + dtype=None), + lp.TemporaryVariable("tmp2", + shape=(4, ), + dtype=None), + lp.GlobalArg("A", + shape=(4, 4), + dtype="float64"), + lp.GlobalArg("x", + shape=lp.auto, + dtype="float64"), + ...], + target=lp.CVectorExtensionsTarget(), + lang_version=(2018, 2)) + + knl = lp.register_callable(knl, "matvec", CBLASGEMV("matvec")) + + for transform_func in [transform_1, transform_2]: + knl = transform_func(knl) + print("Generated code from '{transform_func.__name__} -----'") + print(lp.generate_code_v2(knl).device_code()) + print(75 * "-") + + +if __name__ == "__main__": + main()