Skip to content

Commit

Permalink
Actually flip local_max in CUDA (tinygrad#1462)
Browse files Browse the repository at this point in the history
* Actually do the flip

* Fixed typo

---------

Co-authored-by: terafo <[email protected]>
  • Loading branch information
terafo and terafo committed Aug 6, 2023
1 parent d7d1011 commit 24933ab
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 1 deletion.
1 change: 1 addition & 0 deletions tinygrad/codegen/linearizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ class LinearizerOptions(NamedTuple):
supports_float4: bool = True
supports_float4_alu: bool = True
has_local: bool = True
+# NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
global_max: Optional[List[int]] = None
local_max: Optional[List[int]] = None

Expand Down
2 changes: 1 addition & 1 deletion tinygrad/runtime/ops_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,4 @@ def __call__(self, global_size, local_size, *args, wait=False):
__device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); }
};
"""))
-CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [1024, 1024, 64]), renderer, CUDAProgram, cuda.Context.synchronize)
+CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024]), renderer, CUDAProgram, cuda.Context.synchronize)

0 comments on commit 24933ab

Please sign in to comment.