We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
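I'm not sure why the kernel below becomes unschedulable after the `split_iname` and `duplicate_inames` transformations at the end of the script.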
# Reproducer script: builds the "e2p_from_single_box" loopy kernel, generates
# code once, then applies split_iname + duplicate_inames and generates code
# a second time.
import immutables
import loopy as lp
import numpy as np
from pymbolic.primitives import Product, Sum, Variable

e2p_from_single_box_knl = lp.make_kernel(
    # Loop domains (ISL set syntax), one per group of inames.
    [
        "[ntgt_boxes] -> { [itgt_box] : 0 <= itgt_box < ntgt_boxes }",
        "{ [idim, idim_0] : 0 <= idim <= 2 and 0 <= idim_0 <= 2 }",
        "{ [itgt_offset_outer, itgt_offset_inner] : itgt_offset_inner >= 0 and -32itgt_offset_outer <= itgt_offset_inner <= 46 - 32itgt_offset_outer and itgt_offset_inner <= 31 }",
        "{ [icoeff_outer, icoeff_inner] : icoeff_inner >= 0 and -32icoeff_outer <= icoeff_inner <= 120 - 32icoeff_outer and icoeff_inner <= 31 }",
        "{ [iknl, iknl_0] : iknl = 0 and iknl_0 = 0 }",
        "{ [dummy] : 0 <= dummy <= 31 }",
        "[ntargets] -> { [] : ntargets > 0 }",
        "{ [e2p_idim] : 0 <= e2p_idim <= 2 }",
        "{ [e2p_iorder0] : 0 < e2p_iorder0 <= 10 }",
        "{ [e2p_zero_idx] : 1 = 0 }",
        "{ [e2p_icoeff_outer, e2p_icoeff_inner] : e2p_icoeff_inner >= 0 and -32e2p_icoeff_outer <= e2p_icoeff_inner <= 120 - 32e2p_icoeff_outer and e2p_icoeff_inner <= 31 }",
        "{ [e2p_x0] : 0 <= e2p_x0 <= 10 }",
        "[e2p_x0] -> { [e2p_iorder1] : e2p_x0 <= e2p_iorder1 <= 10 }",
        "[e2p_iorder1, e2p_x0] -> { [e2p_x2] : 0 <= e2p_x2 <= e2p_iorder1 - e2p_x0 }",
        "[e2p_iorder1, e2p_x0, e2p_x2] -> { [e2p_x1] : e2p_x1 = e2p_iorder1 - e2p_x0 - e2p_x2 }",
        "[e2p_x0] -> { [e2p_iorder2] : e2p_x0 <= e2p_iorder2 <= 10 }",
        "[e2p_iorder2, e2p_x0] -> { [e2p_y2] : 0 <= e2p_y2 <= e2p_iorder2 - e2p_x0 }",
        "[e2p_iorder2, e2p_x0, e2p_y2] -> { [e2p_y1] : e2p_y1 = e2p_iorder2 - e2p_x0 - e2p_y2 }",
    ],
    # Instructions: one per line, each terminated by its {id=..., ...} options.
    # The long lines are intentional; an instruction must not be wrapped.
    '''
    kernel_scaling = (1 / 4)*3.141592653589793**(-1) {id=kernel_scaling, inames=+dummy:itgt_box}
    tgt_ibox = target_boxes[itgt_box] {id=fetch_init0, inames=dummy:itgt_box}
    itgt_start = box_target_starts[tgt_ibox] {id=fetch_init1, dep=fetch_init0, inames=dummy:itgt_box}
    itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] {id=fetch_init2, dep=fetch_init0:fetch_init1, inames=dummy:itgt_box}
    center[idim] = centers[idim, tgt_ibox] {id=fetch_center, dep=fetch_init0, inames=dummy:itgt_box:idim}
    coeffs[icoeff_inner + icoeff_outer*32] = src_expansions[tgt_ibox + (-1)*src_base_ibox, icoeff_inner + icoeff_outer*32] {id=fetch_coeffs, dep=fetch_init0, inames=icoeff_outer:itgt_box:icoeff_inner}
    itgt = itgt_start + itgt_offset_inner + itgt_offset_outer*32 {id=insn, dep=fetch_init1, inames=itgt_offset_outer:itgt_offset_inner:itgt_box}
    run_itgt = itgt < itgt_end {id=insn_0, dep=fetch_init2:insn, inames=itgt_offset_outer:itgt_offset_inner:itgt_box}
    tgt[idim_0] = targets[idim_0, itgt] {id=fetch_tgt, dep=insn:insn_0, inames=itgt_offset_outer:itgt_offset_inner:itgt_box:idim_0}
    result_temp[iknl_0] = 0 {id=init_result, dep=insn_0, inames=itgt_offset_outer:itgt_offset_inner:itgt_box:iknl_0}
    ... nop {id=e2p__start, dep=fetch_coeffs:fetch_tgt:insn_0:init_result:fetch_center:insn, inames=+itgt_offset_outer:itgt_offset_inner:itgt_box}
    e2p_b[e2p_idim] = (tgt[e2p_idim] + (-1)*center[e2p_idim])*(1 / rscale) {id=e2p_set_b, dep=e2p__start, inames=itgt_offset_outer:itgt_offset_inner:e2p_idim:itgt_box}
    e2p_power_b[e2p_idim, e2p_zero_idx] = 0 {id=e2p_zero_monomials, dep=e2p__start, inames=+itgt_offset_inner:e2p_idim:itgt_offset_outer:e2p_zero_idx:itgt_box}
    e2p_power_b[e2p_idim, 0] = 1 {id=e2p_init_monomials, dep=e2p__start:e2p_zero_monomials, inames=+itgt_offset_outer:itgt_offset_inner:e2p_idim:itgt_box}
    e2p_power_b[e2p_idim, e2p_iorder0] = e2p_power_b[e2p_idim, e2p_iorder0 + -1]*e2p_b[e2p_idim]*(1 / e2p_iorder0) {id=e2p_update_monomials, dep=e2p_set_b:e2p_init_monomials:e2p__start, inames=+itgt_offset_inner:e2p_idim:itgt_offset_outer:e2p_iorder0:itgt_box}
    e2p_coeffs_copy[e2p_icoeff_inner + e2p_icoeff_outer*32] = coeffs[e2p_icoeff_inner + e2p_icoeff_outer*32] {id=e2p_copy_coeffs, dep=e2p__start, inames=+e2p_icoeff_outer:e2p_icoeff_inner:itgt_box:itgt_offset_outer}
    e2p_coeffs_copy[((e2p_x0 % 2 + e2p_x1 + e2p_x2)*(e2p_x0 % 2 + e2p_x1 + e2p_x2 + 1)*(e2p_x0 % 2 + e2p_x1 + e2p_x2 + 2)) // 6 + (e2p_x0 % 2 + e2p_x1 + e2p_x2 + 2)*(e2p_x0 % 2) + (-1)*(((e2p_x0 % 2)*(e2p_x0 % 2 + 1)) // 2) + e2p_x1 if e2p_x0 % 2 + e2p_x1 + e2p_x2 < 1 else (2*(e2p_x0 % 2 + e2p_x1 + e2p_x2)*(2 + e2p_x0 % 2 + e2p_x1 + e2p_x2 + -2) + (e2p_x0 % 2)*(3 + (-1)*(e2p_x0 % 2) + 2*(e2p_x0 % 2 + e2p_x1 + e2p_x2))) // 2 + e2p_x1] = e2p_coeffs_copy[(((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2)*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 1)*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 2)) // 6 + ((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 2)*((e2p_x0 + -2) % 2) + (-1)*((((e2p_x0 + -2) % 2)*((e2p_x0 + -2) % 2 + 1)) // 2) + e2p_x1 + 2 if (e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 < 1 else (2*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2)*(2 + (e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + -2) + ((e2p_x0 + -2) % 2)*(3 + (-1)*((e2p_x0 + -2) % 2) + 2*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2))) // 2 + e2p_x1 + 2]*(-1.0) + e2p_coeffs_copy[(((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2)*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 1)*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 2)) // 6 + ((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 2)*((e2p_x0 + -2) % 2) + (-1)*((((e2p_x0 + -2) % 2)*((e2p_x0 + -2) % 2 + 1)) // 2) + e2p_x1 if (e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 < 1 else (2*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2)*(2 + (e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + -2) + ((e2p_x0 + -2) % 2)*(3 + (-1)*((e2p_x0 + -2) % 2) + 2*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2))) // 2 + e2p_x1]*(-1.0) {id=e2p_update_coeffs, dep=e2p__start:e2p_copy_coeffs, inames=+e2p_x2:e2p_iorder1:itgt_offset_outer:e2p_x0:e2p_x1:itgt_box}
    result_temp[0] = result_temp[0] + e2p_coeffs_copy[((e2p_x0 % 2 + e2p_y1 + e2p_y2)*(e2p_x0 % 2 + e2p_y1 + e2p_y2 + 1)*(e2p_x0 % 2 + e2p_y1 + e2p_y2 + 2)) // 6 + (e2p_x0 % 2 + e2p_y1 + e2p_y2 + 2)*(e2p_x0 % 2) + (-1)*(((e2p_x0 % 2)*(e2p_x0 % 2 + 1)) // 2) + e2p_y1 if e2p_x0 % 2 + e2p_y1 + e2p_y2 < 1 else (2*(e2p_x0 % 2 + e2p_y1 + e2p_y2)*(2 + e2p_x0 % 2 + e2p_y1 + e2p_y2 + -2) + (e2p_x0 % 2)*(3 + (-1)*(e2p_x0 % 2) + 2*(e2p_x0 % 2 + e2p_y1 + e2p_y2))) // 2 + e2p_y1]*e2p_power_b[0, e2p_x0]*e2p_power_b[1, e2p_y1]*e2p_power_b[2, e2p_y2] {id=e2p_write_0, dep=e2p_update_monomials:e2p_update_coeffs:e2p__start, inames=+itgt_offset_inner:itgt_offset_outer:e2p_iorder2:e2p_x0:e2p_y1:e2p_y2:itgt_box}
    ... nop {id=update_result, dep=e2p_write_0:e2p_update_monomials:e2p_zero_monomials:e2p_update_coeffs:e2p_set_b:e2p_init_monomials:e2p_copy_coeffs, inames=+itgt_offset_outer:itgt_offset_inner:itgt_box}
    result[iknl, itgt] = result_temp[iknl]*kernel_scaling {id=write_result, dep=update_result:insn:insn_0:kernel_scaling, inames=iknl:itgt_offset_inner:itgt_box:itgt_offset_outer}
    ''',
    # Kernel arguments followed by temporaries.
    [
        lp.GlobalArg(
            name="targets", dtype=None,
            shape=(3, Variable('ntargets')), for_atomic=False),
        lp.GlobalArg(
            name="box_target_starts", dtype=None,
            shape=None, for_atomic=False),
        lp.GlobalArg(
            name="box_target_counts_nonchild", dtype=None,
            shape=None, for_atomic=False),
        lp.GlobalArg(
            name="centers", dtype=None,
            shape=(3, Variable('naligned_boxes')), for_atomic=False),
        lp.ValueArg(
            name="rscale", dtype=None),
        lp.GlobalArg(
            name="result", dtype=None,
            shape=(1, Variable('ntargets')), for_atomic=False),
        lp.GlobalArg(
            name="src_expansions", dtype=None,
            shape=(Variable('nsrc_level_boxes'), 121), for_atomic=False),
        lp.ValueArg(
            name="nsrc_level_boxes", dtype=np.int32),
        lp.ValueArg(
            name="naligned_boxes", dtype=np.int32),
        lp.ValueArg(
            name="src_base_ibox", dtype=np.int32),
        lp.ValueArg(
            name="ntargets", dtype=np.int32),
        lp.ValueArg(
            name="ntgt_boxes", dtype=None),
        lp.GlobalArg(
            name="target_boxes", dtype=None,
            shape=(Variable('ntgt_boxes'),), for_atomic=False),
        lp.TemporaryVariable(
            name="kernel_scaling",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="tgt_ibox",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="itgt_start",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="itgt_end",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="center",
            shape=(3,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="coeffs",
            shape=(121,), for_atomic=False,
            address_space=lp.AddressSpace.LOCAL,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="itgt",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="run_itgt",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="tgt",
            shape=(3,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="result_temp",
            shape=(1,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="e2p_b",
            shape=(3,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="e2p_power_b",
            shape=(3, 11), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
        ),
        lp.TemporaryVariable(
            name="e2p_coeffs_copy",
            shape=(121,), for_atomic=False,
            address_space=lp.AddressSpace.LOCAL,
            read_only=False,
        ),
    ],
    lang_version=(2018, 2),
    iname_slab_increments=immutables.Map({
        'itgt_offset_outer': (0, 0),
        'e2p_icoeff_outer': (0, 0),
        'icoeff_outer': (0, 0),
    }),
    applied_iname_rewrites=(
        {Variable('itgt_offset'): Sum((
            Variable('itgt_offset_inner'),
            Product((Variable('itgt_offset_outer'), 32))))},
        {Variable('icoeff'): Sum((
            Variable('icoeff_inner'),
            Product((Variable('icoeff_outer'), 32))))},
        {Variable('e2p_icoeff'): Sum((
            Variable('e2p_icoeff_inner'),
            Product((Variable('e2p_icoeff_outer'), 32))))},
    ),
    name="e2p_from_single_box",
)

# Apply the iname tags one at a time, in the same order as the original
# sequence of tag_inames calls.
for iname_tag in (
    "idim_0:unr",
    "itgt_offset_inner:l.0",
    "idim:unr",
    "e2p_idim:unr",
    "iknl_0:unr",
    "e2p_iorder1:l.0",
    "e2p_icoeff_inner:l.0",
    "e2p_iorder0:unr",
    "icoeff_inner:l.0",
    "e2p_iorder2:unr",
    "iknl:unr",
    "itgt_box:g.0",
    "dummy:l.0",
):
    e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, iname_tag)

knl = lp.merge([e2p_from_single_box_knl])
knl = lp.add_and_infer_dtypes(knl, {
    "targets": np.float64,
    "box_target_starts": np.int32,
    "box_target_counts_nonchild": np.int32,
    "target_boxes": np.int32,
    "centers": np.float64,
    "rscale": np.float64,
    "result": np.float64,
    "src_expansions": np.float64,
})

# First code generation: the kernel as built above.
print(lp.generate_code_v2(knl).device_code())

# Split e2p_x0 by 2, then duplicate the resulting inner iname only within
# the e2p_update_coeffs instruction.
knl = lp.split_iname(knl, "e2p_x0", 2)
knl = lp.duplicate_inames(knl, "e2p_x0_inner", within="id:e2p_update_coeffs")

# Second code generation, after the transformations.
# NOTE(review): presumably this is the call reported as unschedulable -- confirm.
print(lp.generate_code_v2(knl).device_code())
The text was updated successfully, but these errors were encountered:
I did not look closely at the provided kernel, but that can happen in the following case:
# Minimal reproducer: duplicating the outer iname "i" only for the
# instruction that writes out1, while out2 shares the same inner "j" loop.
import loopy as lp

knl = lp.make_kernel(
    "{[i, j]: 0<=i,j<10}",
    """
    for i
        <> tmp = 10
        for j
            out1[i, j] = i*j*tmp
            out2[i, j] = (i+j)*tmp
        end
    end
    """)
lp.generate_code_v2(knl)  # generates code

knl = lp.duplicate_inames(knl, "i", within="writes:out1")
lp.generate_code_v2(knl)  # FAILS due to unschedulable loop nesting
I would not consider the above behavior a loopy bug as it simply did what the user demanded.
Sorry, something went wrong.
Agree with @kaushikcfd. Good to close?
No branches or pull requests
I'm not sure why it becomes unschedulable.
The text was updated successfully, but these errors were encountered: