Skip to content

Commit

Permalink
test vectorizability of blas-callables with vector inputs
Browse files Browse the repository at this point in the history
  • Loading branch information
kaushikcfd committed May 11, 2022
1 parent 76bd6b5 commit 4c0c013
Showing 1 changed file with 107 additions and 20 deletions.
127 changes: 107 additions & 20 deletions examples/python/call-external.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from loopy.diagnostic import LoopyError
from loopy.target.c import CTarget
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
from loopy.target.c.c_execution import CCompiler
from codepy.toolchain import GCCToolchain


# {{{ blas callable
Expand All @@ -22,7 +24,7 @@ def with_types(self, arg_id_to_dtype, callables_table):

if vec_dtype.numpy_dtype == np.float32:
name_in_target = "cblas_sgemv"
elif vec_dtype. numpy_dtype == np.float64:
elif vec_dtype.numpy_dtype == np.float64:
name_in_target = "cblas_dgemv"
else:
raise LoopyError("GEMV is only supported for float32 and float64 "
Expand All @@ -47,30 +49,37 @@ def with_descrs(self, arg_id_to_descr, callables_table):
assert mat_descr.shape[0] == res_descr.shape[0]
assert len(vec_descr.shape) == len(res_descr.shape) == 1
# handling only the easy case when stride == 1
assert vec_descr.dim_tags[0].stride == 1
assert mat_descr.dim_tags[1].stride == 1
assert res_descr.dim_tags[0].stride == 1

return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table

def emit_call_insn(self, insn, target, expression_to_code_mapper):
from pymbolic import var
from loopy.codegen import UnvectorizableError
mat_descr = self.arg_id_to_descr[0]
vec_descr = self.arg_id_to_descr[1]
res_descr = self.arg_id_to_descr[-1]
m, n = mat_descr.shape
ecm = expression_to_code_mapper

if ecm.codegen_state.vectorization_info is not None:
raise UnvectorizableError("cannot vectorize BLAS-gemv.")

mat, vec = insn.expression.parameters
result, = insn.assignees

c_parameters = [var("CblasRowMajor"),
var("CblasNoTrans"),
m, n,
1,
1, # alpha
ecm(mat).expr,
1,
mat_descr.dim_tags[0].stride, # LDA
ecm(vec).expr,
1,
vec_descr.dim_tags[0].stride, # INCX
0, # beta
ecm(result).expr,
1]
res_descr.dim_tags[0].stride # INCY
]
return (var(self.name_in_target)(*c_parameters),
False # cblas_gemv does not return anything
)
Expand All @@ -83,17 +92,95 @@ def generate_preambles(self, target):
# }}}


n = 10

knl = lp.make_kernel(
"{:}",
def transform_1(knl):
return knl


def transform_2(knl):
# A similar transformation is applied to kernels containing
# SLATE <https://www.firedrakeproject.org/firedrake.slate.html>
# callables.
knl = lp.split_iname(knl, "e", 4, inner_iname="e_inner", slabs=(0, 1))
knl = lp.privatize_temporaries_with_inames(knl, "e_inner")
knl = lp.tag_inames(knl, {"e_inner": "vec"})
if 0:
# Easy codegen exercise, but misses vectorizing certain instructions.
knl = lp.tag_array_axes(knl, "tmp3", "c,vec")
else:
knl = lp.tag_array_axes(knl, "tmp3,tmp2", "c,vec")
return knl


def main():

compiler = CCompiler(toolchain=GCCToolchain(
cc="gcc",
cflags="-std=c99 -O3 -fPIC".split(),
ldflags="-shared".split(),
libraries=["blas"],
library_dirs=[],
defines=[],
undefines=[],
source_suffix="c",
so_ext=".so",
o_ext=".o",
include_dirs=[]))

knl = lp.make_kernel(
"{[e,i1,i2]: 0<=e<n and 0<=i1,i2<4}",
"""
y[:] = gemv(A[:, :], x[:])
""", [
lp.GlobalArg("A", dtype=np.float64, shape=(n, n)),
lp.GlobalArg("x", dtype=np.float64, shape=(n, )),
lp.GlobalArg("y", shape=(n, )), ...],
target=CTarget())

knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv"))
print(lp.generate_code_v2(knl).device_code())
for e
for i1
tmp1[i1] = 3*x[e, i1]
end
tmp2[:] = matvec(A[:, :], tmp1[:])
for i2
<> tmp3[i2] = 2 * tmp2[i2]
out[e, i2] = tmp3[i2]
end
end
""",
kernel_data=[
lp.TemporaryVariable("tmp1",
shape=(4, ),
dtype=None),
lp.TemporaryVariable("tmp2",
shape=(4, ),
dtype=None),
lp.GlobalArg("A",
shape=(4, 4),
dtype="float64"),
lp.GlobalArg("x",
shape=lp.auto,
dtype="float64"),
...],
target=lp.ExecutableCVectorExtensionsTarget(compiler=compiler),
lang_version=(2018, 2))

knl = lp.register_callable(knl, "matvec", CBLASGEMV("matvec"))

for transform_func in [transform_1, transform_2]:
knl = transform_func(knl)
print("Generated code from '{transform_func.__name__} -----'")
print(lp.generate_code_v2(knl).device_code())
print(75 * "-")

# {{ verify the result is correct.

from numpy.random import default_rng

rng = default_rng(seed=0)
a = rng.random((4, 4))
x = rng.random((100, 4))

_, (out,) = knl(A=a, x=x)

np.testing.assert_allclose(6*np.einsum("ij,ej->ei",
a, x),
out)

# }}}


if __name__ == "__main__":
main()

0 comments on commit 4c0c013

Please sign in to comment.