diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 211dff5..92e88d0 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -3,9 +3,19 @@ steps: plugins: - JuliaCI/julia#v1: version: "1.10" - - JuliaCI/julia-test#v1: ~ - JuliaCI/julia-coverage#v1: codecov: true + commands: | + julia --project -e ' + using Pkg + + println("--- :julia: Instantiating project") + Pkg.develop(path="lib/intrinsics") + Pkg.instantiate() + + + println("+++ :julia: Running tests") + Pkg.test(; coverage=true)' agents: queue: "juliagpu" cuda: "*" diff --git a/codecov.yml b/.github/codecov.yml similarity index 57% rename from codecov.yml rename to .github/codecov.yml index 8f3b3ed..11b566a 100644 --- a/codecov.yml +++ b/.github/codecov.yml @@ -1,7 +1,8 @@ coverage: ignore: - - "lib/lib*.jl" - - "src/kernels" + - "lib/*/lib*.jl" + - "src/kernels/" + - "src/device/" status: patch: false project: false diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 5fce3b2..57ad5df 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -28,7 +28,7 @@ jobs: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - uses: julia-actions/cache@v2 - - uses: julia-actions/julia-buildpkg@v1 + - run: julia --project -e 'using Pkg; Pkg.develop(path="lib/intrinsics")' - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v4 diff --git a/Project.toml b/Project.toml index e614d80..b53059b 100644 --- a/Project.toml +++ b/Project.toml @@ -3,11 +3,17 @@ uuid = "08131aa3-fb12-5dee-8b74-c09406e224a2" version = "0.10.0" [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" +LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" +SPIRV_LLVM_Translator_unified_jll = "85f0d8ed-5b39-5caa-b1ae-7472de402361" [compat] +LLVM = "9.1" OpenCL_jll = "2024.5.8" julia = "1.10" diff --git a/README.md b/README.md index 8b906ac..a8d8d31 100644 --- a/README.md +++ b/README.md @@ -58,38 +58,64 @@ Available platforms: 3 ## Basic example: vector add +The traditional way of using OpenCL is by writing kernel source code in OpenCL C. For +example, a simple vector addition: + ```julia -using LinearAlgebra using OpenCL, pocl_jll -const sum_kernel = " - __kernel void sum(__global const float *a, - __global const float *b, - __global float *c) - { +const source = """ + __kernel void vadd(__global const float *a, + __global const float *b, + __global float *c) { int gid = get_global_id(0); c[gid] = a[gid] + b[gid]; - } -" + }""" + a = rand(Float32, 50_000) b = rand(Float32, 50_000) -a_buff = cl.Buffer(Float32, length(a), (:r, :copy), hostbuf=a) -b_buff = cl.Buffer(Float32, length(b), (:r, :copy), hostbuf=b) -c_buff = cl.Buffer(Float32, length(a), :w) +d_a = CLArray(a; access=:r) +d_b = CLArray(b; access=:r) +d_c = similar(d_a; access=:w) + +p = cl.Program(; source) |> cl.build! +k = cl.Kernel(p, "vadd") -p = cl.Program(source=sum_kernel) |> cl.build! 
-k = cl.Kernel(p, "sum") +clcall(k, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}}, + d_a, d_b, d_c; global_size=size(a)) -cl.call(k, a_buff, b_buff, c_buff; global_size=size(a)) +c = Array(d_c) + +@assert a + b ≈ c +``` -r = cl.read(c_buff) -if isapprox(norm(r - (a+b)), zero(Float32)) - @info "Success!" -else - @error "Norm should be 0.0f" +## Native example: vector add + +If your platform supports SPIR-V, it's possible to use Julia functions as kernels: + +```julia +using OpenCL, pocl_jll + +function vadd(a, b, c) + gid = get_global_id(1) + @inbounds c[gid] = a[gid] + b[gid] + return end + +a = rand(Float32, 50_000) +b = rand(Float32, 50_000) + +d_a = CLArray(a; access=:r) +d_b = CLArray(b; access=:r) +d_c = similar(d_a; access=:w) + +@opencl global_size=size(a) vadd(d_a, d_b, d_c) + +c = Array(d_c) + +@assert a + b ≈ c ``` diff --git a/examples/demo.jl b/examples/demo.jl deleted file mode 100644 index f79c2fc..0000000 --- a/examples/demo.jl +++ /dev/null @@ -1,36 +0,0 @@ -using OpenCL, LinearAlgebra - -const sum_kernel_src = " - __kernel void sum(__global const float *a, - __global const float *b, - __global float *c) - { - int gid = get_global_id(0); - c[gid] = a[gid] + b[gid]; - } -" -a = rand(Float32, 50_000) -b = rand(Float32, 50_000) - -# create opencl buffer objects -# copies to the device initiated when the kernel function is called -a_buff = cl.Buffer(Float32, length(a), (:r, :copy); hostbuf=a) -b_buff = cl.Buffer(Float32, length(b), (:r, :copy); hostbuf=b) -c_buff = cl.Buffer(Float32, length(a), :w) - -# build the program and construct a kernel object -p = cl.Program(source=sum_kernel_src) |> cl.build! -sum_kernel = cl.Kernel(p, "sum") - -# call the kernel object with global size set to the size our arrays -sum_kernel[size(a)](a_buff, b_buff, c_buff) - -# perform a blocking read of the result from the device -r = cl.read(c_buff) - -# check to see if our result is what we expect! -if isapprox(norm(r - (a+b)), zero(Float32)) - @info("Success!") -else - error("Norm should be 0.0f") -end diff --git a/examples/vadd.jl b/examples/vadd.jl new file mode 100644 index 0000000..d07851e --- /dev/null +++ b/examples/vadd.jl @@ -0,0 +1,27 @@ +using OpenCL, pocl_jll, Test + +const source = """ + __kernel void vadd(__global const float *a, + __global const float *b, + __global float *c) { + int i = get_global_id(0); + c[i] = a[i] + b[i]; + }""" + +dims = (2,) +a = round.(rand(Float32, dims) * 100) +b = round.(rand(Float32, dims) * 100) +c = similar(a) + +d_a = CLArray(a) +d_b = CLArray(b) +d_c = CLArray(c) + +prog = cl.Program(; source) |> cl.build! 
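+# look up the kernel by the name it was given in the OpenCL C source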
+kern = cl.Kernel(prog, "vadd") + +len = prod(dims) +clcall(kern, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}}, + d_a, d_b, d_c; global_size=(len,)) +c = Array(d_c) +@test a+b ≈ c diff --git a/examples/vadd_native.jl b/examples/vadd_native.jl new file mode 100644 index 0000000..972209b --- /dev/null +++ b/examples/vadd_native.jl @@ -0,0 +1,21 @@ +using OpenCL, pocl_jll, Test + +function vadd(a, b, c) + i = get_global_id() + @inbounds c[i] = a[i] + b[i] + return +end + +dims = (2,) +a = round.(rand(Float32, dims) * 100) +b = round.(rand(Float32, dims) * 100) +c = similar(a) + +d_a = CLArray(a) +d_b = CLArray(b) +d_c = CLArray(c) + +len = prod(dims) +@opencl global_size=len vadd(d_a, d_b, d_c) +c = Array(d_c) +@test a+b ≈ c diff --git a/lib/CL.jl b/lib/cl/CL.jl similarity index 100% rename from lib/CL.jl rename to lib/cl/CL.jl diff --git a/lib/api.jl b/lib/cl/api.jl similarity index 98% rename from lib/api.jl rename to lib/cl/api.jl index c3a52c5..4340876 100644 --- a/lib/api.jl +++ b/lib/cl/api.jl @@ -69,7 +69,7 @@ function retry_reclaim(f, isfailed) ret end -include("../lib/libopencl.jl") +include("libopencl.jl") # lazy initialization const initialized = Ref{Bool}(false) diff --git a/lib/buffer.jl b/lib/cl/buffer.jl similarity index 100% rename from lib/buffer.jl rename to lib/cl/buffer.jl diff --git a/lib/cmdqueue.jl b/lib/cl/cmdqueue.jl similarity index 100% rename from lib/cmdqueue.jl rename to lib/cl/cmdqueue.jl diff --git a/lib/context.jl b/lib/cl/context.jl similarity index 100% rename from lib/context.jl rename to lib/cl/context.jl diff --git a/lib/device.jl b/lib/cl/device.jl similarity index 100% rename from lib/device.jl rename to lib/cl/device.jl diff --git a/lib/error.jl b/lib/cl/error.jl similarity index 100% rename from lib/error.jl rename to lib/cl/error.jl diff --git a/lib/event.jl b/lib/cl/event.jl similarity index 100% rename from lib/event.jl rename to lib/cl/event.jl diff --git a/lib/kernel.jl b/lib/cl/kernel.jl similarity index 96% rename from lib/kernel.jl rename to lib/cl/kernel.jl index fb3273a..b06c3de 100644 --- a/lib/kernel.jl +++ b/lib/cl/kernel.jl @@ -181,8 +181,13 @@ function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing; end function call(k::Kernel, args...; global_size=(1,), local_size=nothing, - global_work_offset=nothing, wait_on::Vector{Event}=Event[]) + global_work_offset=nothing, wait_on::Vector{Event}=Event[], + svm_pointers::Vector{Ptr{Cvoid}}=Ptr{Cvoid}[]) set_args!(k, args...) + if !isempty(svm_pointers) + clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS, + sizeof(svm_pointers), svm_pointers) + end enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on) end @@ -215,8 +220,8 @@ clcall(f::F, types::Tuple, args::Vararg{Any,N}; kwargs...) where {N,F} = clcall(f, _to_tuple_type(types), args...; kwargs...) function clcall(k::Kernel, types::Type{T}, args::Vararg{Any,N}; kwargs...) where {T,N} - call_closure = function (pointers::Vararg{Any,N}) - call(k, pointers...; kwargs...) + call_closure = function (converted_args::Vararg{Any,N}) + call(k, converted_args...; kwargs...) end convert_arguments(call_closure, types, args...) 
end diff --git a/lib/libopencl.jl b/lib/cl/libopencl.jl similarity index 100% rename from lib/libopencl.jl rename to lib/cl/libopencl.jl diff --git a/lib/memory.jl b/lib/cl/memory.jl similarity index 100% rename from lib/memory.jl rename to lib/cl/memory.jl diff --git a/lib/platform.jl b/lib/cl/platform.jl similarity index 100% rename from lib/platform.jl rename to lib/cl/platform.jl diff --git a/lib/program.jl b/lib/cl/program.jl similarity index 100% rename from lib/program.jl rename to lib/cl/program.jl diff --git a/lib/state.jl b/lib/cl/state.jl similarity index 100% rename from lib/state.jl rename to lib/cl/state.jl diff --git a/lib/svm.jl b/lib/cl/svm.jl similarity index 100% rename from lib/svm.jl rename to lib/cl/svm.jl diff --git a/lib/intrinsics/Project.toml b/lib/intrinsics/Project.toml new file mode 100644 index 0000000..6d9ff96 --- /dev/null +++ b/lib/intrinsics/Project.toml @@ -0,0 +1,9 @@ +name = "SPIRVIntrinsics" +uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" +authors = ["Tim Besard "] +version = "0.1.0" + +[deps] +ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" diff --git a/lib/intrinsics/src/SPIRVIntrinsics.jl b/lib/intrinsics/src/SPIRVIntrinsics.jl new file mode 100644 index 0000000..848de1a --- /dev/null +++ b/lib/intrinsics/src/SPIRVIntrinsics.jl @@ -0,0 +1,25 @@ +module SPIRVIntrinsics + +using LLVM, LLVM.Interop +using Core: LLVMPtr + +import ExprTools + +import SpecialFunctions + +include("pointer.jl") +include("utils.jl") + +# OpenCL intrinsics +# +# we currently don't implement SPIR-V intrinsics directly, but rely on +# the SPIR-V to LLVM translator supporting OpenCL intrinsics +include("work_item.jl") +include("synchronization.jl") +include("memory.jl") +include("printf.jl") +include("math.jl") +include("integer.jl") +include("atomic.jl") + +end diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl new file mode 100644 index 0000000..144026f --- /dev/null +++ b/lib/intrinsics/src/atomic.jl @@ -0,0 +1,264 @@ +# Atomic Functions + +# TODO: support for 64-bit atomics via atom_cmpxchg (from cl_khr_int64_base_atomics) + +# "atomic operations on 32-bit signed, unsigned integers and single precision +# floating-point to locations in __global or __local memory" + +const atomic_integer_types = [UInt32, Int32] +# TODO: 64-bit atomics with ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS +# TODO: additional floating-point atomics with ZE_extension_float_atomics +const atomic_memory_types = [AS.Local, AS.Global] + + +# generically typed + +for gentype in atomic_integer_types, as in atomic_memory_types +@eval begin + +@device_function atomic_add!(p::LLVMPtr{$gentype,$as}, val::$gentype) = + @builtin_ccall("atomic_add", $gentype, + (LLVMPtr{$gentype,$as}, $gentype), p, val) + +@device_function atomic_sub!(p::LLVMPtr{$gentype,$as}, val::$gentype) = + @builtin_ccall("atomic_sub", $gentype, + (LLVMPtr{$gentype,$as}, $gentype), p, val) + +@device_function atomic_inc!(p::LLVMPtr{$gentype,$as}) = + @builtin_ccall("atomic_inc", $gentype, (LLVMPtr{$gentype,$as},), p) + +@device_function atomic_dec!(p::LLVMPtr{$gentype,$as}) = + @builtin_ccall("atomic_dec", $gentype, (LLVMPtr{$gentype,$as},), p) + +@device_function atomic_min!(p::LLVMPtr{$gentype,$as}, val::$gentype) = + @builtin_ccall("atomic_min", $gentype, + (LLVMPtr{$gentype,$as}, $gentype), p, val) + +@device_function atomic_max!(p::LLVMPtr{$gentype,$as}, val::$gentype) = + @builtin_ccall("atomic_max", $gentype, + 
(LLVMPtr{$gentype,$as}, $gentype), p, val)
+
+@device_function atomic_and!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
+    @builtin_ccall("atomic_and", $gentype,
+                   (LLVMPtr{$gentype,$as}, $gentype), p, val)
+
+@device_function atomic_or!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
+    @builtin_ccall("atomic_or", $gentype,
+                   (LLVMPtr{$gentype,$as}, $gentype), p, val)
+
+@device_function atomic_xor!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
+    @builtin_ccall("atomic_xor", $gentype,
+                   (LLVMPtr{$gentype,$as}, $gentype), p, val)
+
+@device_function atomic_xchg!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
+    @builtin_ccall("atomic_xchg", $gentype,
+                   (LLVMPtr{$gentype,$as}, $gentype), p, val)
+
+@device_function atomic_cmpxchg!(p::LLVMPtr{$gentype,$as}, cmp::$gentype, val::$gentype) =
+    @builtin_ccall("atomic_cmpxchg", $gentype,
+                   (LLVMPtr{$gentype,$as}, $gentype, $gentype), p, cmp, val)
+
+end
+end
+
+
+# specifically typed
+
+for as in atomic_memory_types
+@eval begin
+
+@device_function atomic_xchg!(p::LLVMPtr{Float32,$as}, val::Float32) =
+    @builtin_ccall("atomic_xchg", Float32, (LLVMPtr{Float32,$as}, Float32,), p, val)
+
+# XXX: why is only xchg supported on floats? isn't it safe for cmpxchg too,
+#      which should only perform bitwise comparisons?
+@device_function atomic_cmpxchg!(p::LLVMPtr{Float32,$as}, cmp::Float32, val::Float32) =
+    reinterpret(Float32, atomic_cmpxchg!(reinterpret(LLVMPtr{UInt32,$as}, p),
+                                         reinterpret(UInt32, cmp),
+                                         reinterpret(UInt32, val)))
+
+end
+end
+
+
+
+# documentation
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old + val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_add!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old - val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_sub!
+
+"""
+Swaps the old value stored at location `p` with new value given by `val`.
+Returns old value.
+"""
+atomic_xchg!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute (`old` + 1) and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_inc!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute (`old` - 1) and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_dec!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `(old == cmp) ? val : old` and store result at location pointed by `p`.
+The function returns `old`.
+"""
+atomic_cmpxchg!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `min(old, val)` and store minimum value at location pointed by `p`. The
+function returns `old`.
+"""
+atomic_min!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `max(old, val)` and store maximum value at location pointed by `p`. The
+function returns `old`.
+"""
+atomic_max!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old & val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_and!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old | val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_or!
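+
+# NOTE: hypothetical usage sketch (not part of this file's API): inside a kernel, an
+#       Int32 histogram could be updated with the intrinsics above, e.g.
+#
+#           function histogram!(bins, data)
+#               i = get_global_id()
+#               @inbounds atomic_add!(pointer(bins, data[i]), Int32(1))
+#               return
+#           end
+#
+#       assuming `data` holds valid 1-based bin indices.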
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old ⊻ val` (bitwise xor) and store result at location pointed by `p`.
+The function returns `old`.
+"""
+atomic_xor!
+
+
+
+#
+# High-level interface
+#
+
+# prototype of a high-level interface for performing atomic operations on arrays
+#
+# this design could be generalized by having atomic {field,array}{set,ref} accessors, as
+# well as acquire/release operations to implement the fallback functionality where any
+# operation can be applied atomically.
+
+const inplace_ops = Dict(
+    :(+=) => :(+),
+    :(-=) => :(-),
+    :(*=) => :(*),
+    :(/=) => :(/),
+    :(÷=) => :(÷),
+    :(&=) => :(&),
+    :(|=) => :(|),
+    :(⊻=) => :(⊻),
+)
+
+struct AtomicError <: Exception
+    msg::AbstractString
+end
+
+Base.showerror(io::IO, err::AtomicError) =
+    print(io, "AtomicError: ", err.msg)
+
+"""
+    @atomic a[I] = op(a[I], val)
+    @atomic a[I] ...= val
+
+Atomically perform a sequence of operations that loads an array element `a[I]`, performs the
+operation `op` on that value and a second value `val`, and writes the result back to the
+array. This sequence can be written out as a regular assignment, in which case the same
+array element should be used in the left and right hand side of the assignment, or as an
+in-place application of a known operator. In both cases, the array reference should be pure
+and not induce any side-effects.
+
+!!! warning
+    This interface is experimental, and might change without warning. Use the lower-level
+    `atomic_...!` functions for a stable API.
+"""
+macro atomic(ex)
+    # decode assignment and call
+    if ex.head == :(=)
+        ref = ex.args[1]
+        rhs = ex.args[2]
+        Meta.isexpr(rhs, :call) || throw(AtomicError("right-hand side of an @atomic assignment should be a call"))
+        op = rhs.args[1]
+        if rhs.args[2] != ref
+            throw(AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side"))
+        end
+        val = rhs.args[3]
+    elseif haskey(inplace_ops, ex.head)
+        op = inplace_ops[ex.head]
+        ref = ex.args[1]
+        val = ex.args[2]
+    else
+        throw(AtomicError("unknown @atomic expression"))
+    end
+
+    # decode array expression
+    Meta.isexpr(ref, :ref) || throw(AtomicError("@atomic should be applied to an array reference expression"))
+    array = ref.args[1]
+    indices = Expr(:tuple, ref.args[2:end]...)
+
+    esc(quote
+        $atomic_arrayset($array, $indices, $op, $val)
+    end)
+end
+
+# FIXME: make this respect the indexing style
+@inline atomic_arrayset(A::AbstractArray{T}, Is::Tuple, op::Function, val) where {T} =
+    atomic_arrayset(A, Base._to_linear_index(A, Is...), op, convert(T, val))
+
+# native atomics
+for (op,impl) in [(+)      => atomic_add!,
+                  (-)      => atomic_sub!,
+                  (&)      => atomic_and!,
+                  (|)      => atomic_or!,
+                  (⊻)      => atomic_xor!,
+                  Base.max => atomic_max!,
+                  Base.min => atomic_min!]
+ @eval @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, ::typeof($op), + val::T) where {T <: Union{Int32,UInt32}} = + $impl(pointer(A, I), val) +end + +# fallback using compare-and-swap +function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} + ptr = pointer(A, I) + old = Base.unsafe_load(ptr, 1) + while true + cmp = old + new = convert(T, op(old, val)) + old = atomic_cmpxchg!(ptr, cmp, new) + (old == cmp) && return new + end +end diff --git a/lib/intrinsics/src/integer.jl b/lib/intrinsics/src/integer.jl new file mode 100644 index 0000000..edbd4bb --- /dev/null +++ b/lib/intrinsics/src/integer.jl @@ -0,0 +1,53 @@ +# Integer Functions + +# TODO: vector types +const generic_integer_types = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64] + + +# generically typed + +for gentype in generic_integer_types +@eval begin + +@device_override Base.abs(x::$gentype) = @builtin_ccall("abs", $gentype, ($gentype,), x) +@device_function abs_diff(x::$gentype, y::$gentype) = @builtin_ccall("abs_diff", $gentype, ($gentype, $gentype), x, y) + +@device_function add_sat(x::$gentype, y::$gentype) = @builtin_ccall("add_sat", $gentype, ($gentype, $gentype), x, y) +@device_function hadd(x::$gentype, y::$gentype) = @builtin_ccall("hadd", $gentype, ($gentype, $gentype), x, y) +@device_function rhadd(x::$gentype, y::$gentype) = @builtin_ccall("rhadd", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.clamp(x::$gentype, minval::$gentype, maxval::$gentype) = @builtin_ccall("clamp", $gentype, ($gentype, $gentype, $gentype), x, minval, maxval) + +@device_function clz(x::$gentype) = @builtin_ccall("clz", $gentype, ($gentype,), x) +@device_function ctz(x::$gentype) = @builtin_ccall("ctz", $gentype, ($gentype,), x) + +@device_function mad_hi(a::$gentype, b::$gentype, c::$gentype) = @builtin_ccall("mad_hi", $gentype, ($gentype, $gentype, $gentype), a, b, c) +@device_function mad_sat(a::$gentype, b::$gentype, c::$gentype) = @builtin_ccall("mad_sat", $gentype, ($gentype, $gentype, $gentype), a, b, c) + +# XXX: these definitions introduce ambiguities +#@device_override Base.max(x::$gentype, y::$gentype) = @builtin_ccall("max", $gentype, ($gentype, $gentype), x, y) +#@device_override Base.min(x::$gentype, y::$gentype) = @builtin_ccall("min", $gentype, ($gentype, $gentype), x, y) + +@device_function mul_hi(x::$gentype, y::$gentype) = @builtin_ccall("mul_hi", $gentype, ($gentype, $gentype), x, y) + +@device_function rotate(v::$gentype, i::$gentype) = @builtin_ccall("rotate", $gentype, ($gentype, $gentype), v, i) + +@device_function sub_sat(x::$gentype, y::$gentype) = @builtin_ccall("sub_sat", $gentype, ($gentype, $gentype), x, y) + +@device_function popcount(x::$gentype) = @builtin_ccall("popcount", $gentype, ($gentype,), x) + +@device_function mad24(x::$gentype, y::$gentype, z::$gentype) = @builtin_ccall("mad24", $gentype, ($gentype, $gentype, $gentype), x, y, z) +@device_function mul24(x::$gentype, y::$gentype) = @builtin_ccall("mul24", $gentype, ($gentype, $gentype), x, y) + +end +end + + +# specifically typed + +@device_function upsample(hi::Cchar, lo::Cuchar) = @builtin_ccall("upsample", Cshort, (Cchar, Cuchar), hi, lo) +upsample(hi::Cuchar, lo::Cuchar) = @builtin_ccall("upsample", Cushort, (Cuchar, Cuchar), hi, lo) +upsample(hi::Cshort, lo::Cushort) = @builtin_ccall("upsample", Cint, (Cshort, Cushort), hi, lo) +upsample(hi::Cushort, lo::Cushort) = @builtin_ccall("upsample", Cuint, (Cushort, Cushort), hi, lo) +upsample(hi::Cint, lo::Cuint) = 
@builtin_ccall("upsample", Clong, (Cint, Cuint), hi, lo) +upsample(hi::Cuint, lo::Cuint) = @builtin_ccall("upsample", Culong, (Cuint, Cuint), hi, lo) diff --git a/lib/intrinsics/src/math.jl b/lib/intrinsics/src/math.jl new file mode 100644 index 0000000..1e4c2a9 --- /dev/null +++ b/lib/intrinsics/src/math.jl @@ -0,0 +1,214 @@ +# Math Functions + +# TODO: vector types +const generic_types = [Float32,Float64] +const generic_types_float = [Float32] +const generic_types_double = [Float64] + + +# generically typed + +for gentype in generic_types +@eval begin + +@device_override Base.acos(x::$gentype) = @builtin_ccall("acos", $gentype, ($gentype,), x) +@device_override Base.acosh(x::$gentype) = @builtin_ccall("acosh", $gentype, ($gentype,), x) +@device_function acospi(x::$gentype) = @builtin_ccall("acospi", $gentype, ($gentype,), x) + +@device_override Base.asin(x::$gentype) = @builtin_ccall("asin", $gentype, ($gentype,), x) +@device_override Base.asinh(x::$gentype) = @builtin_ccall("asinh", $gentype, ($gentype,), x) +@device_function asinpi(x::$gentype) = @builtin_ccall("asinpi", $gentype, ($gentype,), x) + +@device_override Base.atan(y_over_x::$gentype) = @builtin_ccall("atan", $gentype, ($gentype,), y_over_x) +@device_override Base.atan(y::$gentype, x::$gentype) = @builtin_ccall("atan2", $gentype, ($gentype, $gentype), y, x) +@device_override Base.atanh(x::$gentype) = @builtin_ccall("atanh", $gentype, ($gentype,), x) +@device_function atanpi(x::$gentype) = @builtin_ccall("atanpi", $gentype, ($gentype,), x) +@device_function atanpi(y::$gentype, x::$gentype) = @builtin_ccall("atan2pi", $gentype, ($gentype, $gentype), y, x) + +@device_override Base.cbrt(x::$gentype) = @builtin_ccall("cbrt", $gentype, ($gentype,), x) + +@device_override Base.ceil(x::$gentype) = @builtin_ccall("ceil", $gentype, ($gentype,), x) + +@device_override Base.copysign(x::$gentype, y::$gentype) = @builtin_ccall("copysign", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.cos(x::$gentype) = @builtin_ccall("cos", $gentype, ($gentype,), x) +@device_override Base.cosh(x::$gentype) = @builtin_ccall("cosh", $gentype, ($gentype,), x) +@device_function cospi(x::$gentype) = @builtin_ccall("cospi", $gentype, ($gentype,), x) + +@device_override SpecialFunctions.erfc(x::$gentype) = @builtin_ccall("erfc", $gentype, ($gentype,), x) +@device_override SpecialFunctions.erf(x::$gentype) = @builtin_ccall("erf", $gentype, ($gentype,), x) + +@device_override Base.exp(x::$gentype) = @builtin_ccall("exp", $gentype, ($gentype,), x) +@device_override Base.exp2(x::$gentype) = @builtin_ccall("exp2", $gentype, ($gentype,), x) +@device_override Base.exp10(x::$gentype) = @builtin_ccall("exp10", $gentype, ($gentype,), x) +@device_override Base.expm1(x::$gentype) = @builtin_ccall("expm1", $gentype, ($gentype,), x) + +@device_override Base.abs(x::$gentype) = @builtin_ccall("fabs", $gentype, ($gentype,), x) + +@device_function dim(x::$gentype, y::$gentype) = @builtin_ccall("fdim", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.floor(x::$gentype) = @builtin_ccall("floor", $gentype, ($gentype,), x) + +@device_override Base.fma(a::$gentype, b::$gentype, c::$gentype) = @builtin_ccall("fma", $gentype, ($gentype, $gentype, $gentype), a, b, c) + +@device_override Base.max(x::$gentype, y::$gentype) = @builtin_ccall("fmax", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.min(x::$gentype, y::$gentype) = @builtin_ccall("fmin", $gentype, ($gentype, $gentype), x, y) + +# NOTE: Julia's mod behaves differently than fmod 
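+#       e.g. `mod(-1.0, 3.0) == 2.0` in Julia, whereas C's `fmod(-1.0, 3.0) == -1.0`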
+#@device_override Base.mod(x::$gentype, y::$gentype) = @builtin_ccall("fmod", $gentype, ($gentype, $gentype), x, y) +# fract(x::$gentype, $gentype *iptr) = @builtin_ccall("fract", $gentype, ($gentype, $gentype *), x, iptr) + +@device_override Base.hypot(x::$gentype, y::$gentype) = @builtin_ccall("hypot", $gentype, ($gentype, $gentype), x, y) + +@device_override SpecialFunctions.loggamma(x::$gentype) = @builtin_ccall("lgamma", $gentype, ($gentype,), x) + +@device_override Base.log(x::$gentype) = @builtin_ccall("log", $gentype, ($gentype,), x) +@device_override Base.log2(x::$gentype) = @builtin_ccall("log2", $gentype, ($gentype,), x) +@device_override Base.log10(x::$gentype) = @builtin_ccall("log10", $gentype, ($gentype,), x) +@device_override Base.log1p(x::$gentype) = @builtin_ccall("log1p", $gentype, ($gentype,), x) +@device_function logb(x::$gentype) = @builtin_ccall("logb", $gentype, ($gentype,), x) + +@device_function mad(a::$gentype, b::$gentype, c::$gentype) = @builtin_ccall("mad", $gentype, ($gentype, $gentype, $gentype), a, b, c) + +@device_function maxmag(x::$gentype, y::$gentype) = @builtin_ccall("maxmag", $gentype, ($gentype, $gentype), x, y) +@device_function minmag(x::$gentype, y::$gentype) = @builtin_ccall("minmag", $gentype, ($gentype, $gentype), x, y) + +# modf(x::$gentype, $gentype *iptr) = @builtin_ccall("modf", $gentype, ($gentype, $gentype *), x, iptr) + +@device_function nextafter(x::$gentype, y::$gentype) = @builtin_ccall("nextafter", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.:(^)(x::$gentype, y::$gentype) = @builtin_ccall("pow", $gentype, ($gentype, $gentype), x, y) +@device_function powr(x::$gentype, y::$gentype) = @builtin_ccall("powr", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.rem(x::$gentype, y::$gentype) = @builtin_ccall("remainder", $gentype, ($gentype, $gentype), x, y) + +@device_function rint(x::$gentype) = @builtin_ccall("rint", $gentype, ($gentype,), x) + +@device_override Base.round(x::$gentype) = @builtin_ccall("round", $gentype, ($gentype,), x) + +@device_function rsqrt(x::$gentype) = @builtin_ccall("rsqrt", $gentype, ($gentype,), x) + +@device_override Base.sin(x::$gentype) = @builtin_ccall("sin", $gentype, ($gentype,), x) +@device_override function Base.sincos(x::$gentype) + cosval = Ref{$gentype}() + sinval = GC.@preserve cosval begin + ptr = Base.unsafe_convert(Ptr{$gentype}, cosval) + llvm_ptr = reinterpret(LLVMPtr{$gentype, AS.Private}, ptr) + @builtin_ccall("sincos", $gentype, ($gentype, LLVMPtr{$gentype, AS.Private}), x, llvm_ptr) + end + return sinval, cosval[] +end +@device_override Base.sinh(x::$gentype) = @builtin_ccall("sinh", $gentype, ($gentype,), x) +@device_function sinpi(x::$gentype) = @builtin_ccall("sinpi", $gentype, ($gentype,), x) + +@device_override Base.sqrt(x::$gentype) = @builtin_ccall("sqrt", $gentype, ($gentype,), x) + +@device_override Base.tan(x::$gentype) = @builtin_ccall("tan", $gentype, ($gentype,), x) +@device_override Base.tanh(x::$gentype) = @builtin_ccall("tanh", $gentype, ($gentype,), x) +@device_function tanpi(x::$gentype) = @builtin_ccall("tanpi", $gentype, ($gentype,), x) + +@device_override SpecialFunctions.gamma(x::$gentype) = @builtin_ccall("tgamma", $gentype, ($gentype,), x) + +@device_override Base.trunc(x::$gentype) = @builtin_ccall("trunc", $gentype, ($gentype,), x) + +end +end + + +# generically typed -- only floats + +for gentypef in generic_types_float + +if gentypef !== Float32 +@eval begin +@device_override Base.max(x::$gentypef, y::Float32) = 
@builtin_ccall("fmax", $gentypef, ($gentypef, Float32), x, y) +@device_override Base.min(x::$gentypef, y::Float32) = @builtin_ccall("fmin", $gentypef, ($gentypef, Float32), x, y) +end +end + +end + + +# generically typed -- only doubles + +for gentyped in generic_types_double + +if gentyped !== Float64 +@eval begin +@device_override Base.min(x::$gentyped, y::Float64) = @builtin_ccall("fmin", $gentyped, ($gentyped, Float64), x, y) +@device_override Base.max(x::$gentyped, y::Float64) = @builtin_ccall("fmax", $gentyped, ($gentyped, Float64), x, y) +end +end + +end + + +# specifically typed + +# frexp(x::Float32{n}, Int32{n} *exp) = @builtin_ccall("frexp", Float32{n}, (Float32{n}, Int32{n} *), x, exp) +# frexp(x::Float32, Int32 *exp) = @builtin_ccall("frexp", Float32, (Float32, Int32 *), x, exp) +# frexp(x::Float64{n}, Int32{n} *exp) = @builtin_ccall("frexp", Float64{n}, (Float64{n}, Int32{n} *), x, exp) +# frexp(x::Float64, Int32 *exp) = @builtin_ccall("frexp", Float64, (Float64, Int32 *), x, exp) + +# ilogb(x::Float32{n}) = @builtin_ccall("ilogb", Int32{n}, (Float32{n},), x) +@device_function ilogb(x::Float32) = @builtin_ccall("ilogb", Int32, (Float32,), x) +# ilogb(x::Float64{n}) = @builtin_ccall("ilogb", Int32{n}, (Float64{n},), x) +@device_function ilogb(x::Float64) = @builtin_ccall("ilogb", Int32, (Float64,), x) + +# ldexp(x::Float32{n}, k::Int32{n}) = @builtin_ccall("ldexp", Float32{n}, (Float32{n}, Int32{n}), x, k) +# ldexp(x::Float32{n}, k::Int32) = @builtin_ccall("ldexp", Float32{n}, (Float32{n}, Int32), x, k) +@device_override Base.ldexp(x::Float32, k::Int32) = @builtin_ccall("ldexp", Float32, (Float32, Int32), x, k) +# ldexp(x::Float64{n}, k::Int32{n}) = @builtin_ccall("ldexp", Float64{n}, (Float64{n}, Int32{n}), x, k) +# ldexp(x::Float64{n}, k::Int32) = @builtin_ccall("ldexp", Float64{n}, (Float64{n}, Int32), x, k) +@device_override Base.ldexp(x::Float64, k::Int32) = @builtin_ccall("ldexp", Float64, (Float64, Int32), x, k) + +# lgamma_r(x::Float32{n}, Int32{n} *signp) = @builtin_ccall("lgamma_r", Float32{n}, (Float32{n}, Int32{n} *), x, signp) +# lgamma_r(x::Float32, Int32 *signp) = @builtin_ccall("lgamma_r", Float32, (Float32, Int32 *), x, signp) +# lgamma_r(x::Float64{n}, Int32{n} *signp) = @builtin_ccall("lgamma_r", Float64{n}, (Float64{n}, Int32{n} *), x, signp) +# Float64 lgamma_r(x::Float64, Int32 *signp) = @builtin_ccall("lgamma_r", Float64, (Float64, Int32 *), x, signp) + +# nan(nancode::uintn) = @builtin_ccall("nan", Float32{n}, (uintn,), nancode) +@device_function nan(nancode::UInt32) = @builtin_ccall("nan", Float32, (UInt32,), nancode) +# nan(nancode::UInt64{n}) = @builtin_ccall("nan", Float64{n}, (UInt64{n},), nancode) +@device_function nan(nancode::UInt64) = @builtin_ccall("nan", Float64, (UInt64,), nancode) + +# pown(x::Float32{n}, y::Int32{n}) = @builtin_ccall("pown", Float32{n}, (Float32{n}, Int32{n}), x, y) +@device_override Base.:(^)(x::Float32, y::Int32) = @builtin_ccall("pown", Float32, (Float32, Int32), x, y) +# pown(x::Float64{n}, y::Int32{n}) = @builtin_ccall("pown", Float64{n}, (Float64{n}, Int32{n}), x, y) +@device_override Base.:(^)(x::Float64, y::Int32) = @builtin_ccall("pown", Float64, (Float64, Int32), x, y) + +# remquo(x::Float32{n}, y::Float32{n}, Int32{n} *quo) = @builtin_ccall("remquo", Float32{n}, (Float32{n}, Float32{n}, Int32{n} *), x, y, quo) +# remquo(x::Float32, y::Float32, Int32 *quo) = @builtin_ccall("remquo", Float32, (Float32, Float32, Int32 *), x::Float32, y, quo) +# remquo(x::Float64{n}, y::Float64{n}, Int32{n} *quo) = 
@builtin_ccall("remquo", Float64{n}, (Float64{n}, Float64{n}, Int32{n} *), x, y, quo) +# remquo(x::Float64, y::Float64, Int32 *quo) = @builtin_ccall("remquo", Float64, (Float64, Float64, Int32 *), x, y, quo) + +# rootn(x::Float32{n}, y::Int32{n}) = @builtin_ccall("rootn", Float32{n}, (Float32{n}, Int32{n}), x, y) +@device_function rootn(x::Float32, y::Int32) = @builtin_ccall("rootn", Float32, (Float32, Int32), x, y) +# rootn(x::Float64{n}, y::Int32{n}) = @builtin_ccall("rootn", Float64{n}, (Float64{n}, Int32{n}), x, y) +# rootn(x::Float64, y::Int32) = @builtin_ccall("rootn", Float64{n}, (Float64, Int32), x, y) + + +# TODO: half and native + +function _mulhi(a::Int64, b::Int64) + shift = sizeof(a) * 4 + mask = typemax(UInt32) + a1, a2 = (a >> shift), a & mask + b1, b2 = (b >> shift), b & mask + a1b1, a1b2, a2b1 = a1*b1, a1*b2, a2*b1 + t1 = a1b2 + _mulhi(a2 % UInt32, b2 % UInt32) + t2 = a2b1 + (t1 & mask) + a1b1 + (t1 >> shift) + (t2 >> shift) +end +@static if isdefined(Base.MultiplicativeInverses, :_mul_high) + _mulhi(a::T, b::T) where {T<:Union{Signed, Unsigned}} = Base.MultiplicativeInverses._mul_high(a, b) + @device_override Base.MultiplicativeInverses._mul_high(a::Int64, b::Int64) = _mulhi(a, b) +else + _mulhi(a::T, b::T) where {T<:Union{Signed, Unsigned}} = ((widen(a)*b) >>> (sizeof(a)*8)) % T + @device_override function Base.div(a::Int64, b::Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}) + x = _mulhi(a, b.multiplier) + x += (a*b.addmul) % Int64 + ifelse(abs(b.divisor) == 1, a*b.divisor, (signbit(x) + (x >> b.shift)) % Int64) + end +end diff --git a/lib/intrinsics/src/memory.jl b/lib/intrinsics/src/memory.jl new file mode 100644 index 0000000..d5ffe26 --- /dev/null +++ b/lib/intrinsics/src/memory.jl @@ -0,0 +1,38 @@ +# local memory + +# get a pointer to local memory, with known (static) or zero length (dynamic) +@generated function emit_localmemory(::Type{T}, ::Val{len}=Val(0)) where {T,len} + Context() do ctx + # XXX: as long as LLVMPtr is emitted as i8*, it doesn't make sense to type the GV + eltyp = convert(LLVMType, LLVM.Int8Type()) + T_ptr = convert(LLVMType, LLVMPtr{T,AS.Local}) + + # create a function + llvm_f, _ = create_function(T_ptr) + + # create the global variable + mod = LLVM.parent(llvm_f) + gv_typ = LLVM.ArrayType(eltyp, len * sizeof(T)) + gv = GlobalVariable(mod, gv_typ, "local_memory", AS.Local) + if len > 0 + linkage!(gv, LLVM.API.LLVMInternalLinkage) + initializer!(gv, null(gv_typ)) + end + # TODO: Make the alignment configurable + alignment!(gv, Base.datatype_alignment(T)) + + # generate IR + IRBuilder() do builder + entry = BasicBlock(llvm_f, "entry") + position!(builder, entry) + + ptr = gep!(builder, gv_typ, gv, [ConstantInt(0), ConstantInt(0)]) + + untyped_ptr = bitcast!(builder, ptr, T_ptr) + + ret!(builder, untyped_ptr) + end + + call_function(llvm_f, LLVMPtr{T,AS.Local}) + end +end diff --git a/lib/intrinsics/src/pointer.jl b/lib/intrinsics/src/pointer.jl new file mode 100644 index 0000000..228740e --- /dev/null +++ b/lib/intrinsics/src/pointer.jl @@ -0,0 +1,16 @@ +## adrspace aliases + +export AS + +module AS + +const Private = 0 +const Global = 1 +const Constant = 2 +const Local = 3 +const Generic = 4 +const Input = 5 +const Output = 6 +const Count = 7 + +end diff --git a/lib/intrinsics/src/printf.jl b/lib/intrinsics/src/printf.jl new file mode 100644 index 0000000..f7429f8 --- /dev/null +++ b/lib/intrinsics/src/printf.jl @@ -0,0 +1,205 @@ +# printf + +# Formatted Output (B.17) + +@generated function promote_c_argument(arg) + # > When a 
function with a variable-length argument list is called, the variable
+    # > arguments are passed using C's old ``default argument promotions.'' These say that
+    # > types char and short int are automatically promoted to int, and type float is
+    # > automatically promoted to double. Therefore, varargs functions will never receive
+    # > arguments of type char, short int, or float.
+
+    if arg == Cchar || arg == Cshort || arg == Cuchar || arg == Cushort
+        return :(Cint(arg))
+    elseif arg == Cfloat
+        return :(Cdouble(arg))
+    else
+        return :(arg)
+    end
+end
+
+macro printf(fmt::String, args...)
+    fmt_val = Val(Symbol(fmt))
+
+    return :(emit_printf($fmt_val, $(map(arg -> :(promote_c_argument($arg)), esc.(args))...)))
+end
+
+@generated function emit_printf(::Val{fmt}, argspec...) where {fmt}
+    arg_exprs = [:( argspec[$i] ) for i in 1:length(argspec)]
+    arg_types = [argspec...]
+
+    Context() do ctx
+        T_void = LLVM.VoidType()
+        T_int32 = LLVM.Int32Type()
+        T_pint8 = LLVM.PointerType(LLVM.Int8Type(), AS.Constant)
+
+        # create functions
+        param_types = LLVMType[convert(LLVMType, typ) for typ in arg_types]
+        llvm_f, _ = create_function(T_int32, param_types)
+        mod = LLVM.parent(llvm_f)
+
+        # generate IR
+        IRBuilder() do builder
+            entry = BasicBlock(llvm_f, "entry")
+            position!(builder, entry)
+
+            str = globalstring_ptr!(builder, String(fmt); addrspace=AS.Constant)
+
+            # invoke printf and return
+            printf_typ = LLVM.FunctionType(T_int32, [T_pint8]; vararg=true)
+            printf = LLVM.Function(mod, "printf", printf_typ)
+            push!(function_attributes(printf), EnumAttribute("nobuiltin"))
+            chars = call!(builder, printf_typ, printf, [str, parameters(llvm_f)...])
+
+            ret!(builder, chars)
+        end
+
+        call_function(llvm_f, Int32, Tuple{arg_types...}, arg_exprs...)
+    end
+end
+
+
+## print-like functionality
+
+# simple conversions, defining an expression and the resulting argument type. nothing fancy,
+# `@print` pretty directly maps to `@printf`; we should just support `write(::IO)`.
+const print_conversions = Dict(
+    Float32    => (x->:(Float64($x)), Float64),
+    Ptr{<:Any} => (x->:(convert(Ptr{Cvoid}, $x)), Ptr{Cvoid}),
+    Bool       => (x->:(Int32($x)), Int32),
+)
+
+# format specifiers
+const print_specifiers = Dict(
+    # integers
+    Int16  => "%hd",
+    Int32  => "%d",
+    Int64  => Sys.iswindows() ? "%lld" : "%ld",
+    UInt16 => "%hu",
+    UInt32 => "%u",
+    UInt64 => Sys.iswindows() ? "%llu" : "%lu",
+
+    # floating-point
+    Float64 => "%f",
+
+    # other
+    Cchar => "%c",
+    Ptr{Cvoid} => "%p",
+)
+
+@generated function _print(parts...)
+    fmt = ""
+    args = Expr[]
+
+    for i in 1:length(parts)
+        part = :(parts[$i])
+        T = parts[i]
+
+        # put literals directly in the format string
+        if T <: Val
+            fmt *= string(T.parameters[1])
+            continue
+        end
+
+        # try to convert arguments if they are not supported directly
+        if !haskey(print_specifiers, T)
+            for Tmatch in keys(print_conversions)
+                if T <: Tmatch
+                    conv, T = print_conversions[Tmatch]
+                    part = conv(part)
+                    break
+                end
+            end
+        end
+
+        # render the argument
+        if haskey(print_specifiers, T)
+            fmt *= print_specifiers[T]
+            push!(args, part)
+        elseif T <: String
+            @error("@print does not support non-literal strings")
+        else
+            @error("@print does not support values of type $T")
+        end
+    end
+
+    quote
+        Base.@_inline_meta
+        @printf($fmt, $(args...))
+    end
+end
+
+"""
+    @print(xs...)
+    @println(xs...)
+
+Print a textual representation of values `xs` to standard output from the GPU. The
+functionality builds on `@printf`, and is intended as a more user-friendly alternative to
+that API. However, that also means there's only limited support for argument types, handling
+16/32/64-bit signed and unsigned integers, 32- and 64-bit floating-point numbers, `Cchar`s
+and pointers. For more complex output, use `@printf` directly.
+
+Limited string interpolation is also possible:
+
+```julia
+    @print("Hello, World ", 42, "\\n")
+    @print "Hello, World \$(42)\\n"
+```
+"""
+macro print(parts...)
+    args = Union{Val,Expr,Symbol}[]
+
+    parts = [parts...]
+    while true
+        isempty(parts) && break
+
+        part = popfirst!(parts)
+
+        # handle string interpolation
+        if isa(part, Expr) && part.head == :string
+            parts = vcat(part.args, parts)
+            continue
+        end
+
+        # expose literals to the generator by using Val types
+        if isbits(part) # literal numbers, etc
+            push!(args, Val(part))
+        elseif isa(part, QuoteNode) # literal symbols
+            push!(args, Val(part.value))
+        elseif isa(part, String) # literal strings need to be interned
+            push!(args, Val(Symbol(part)))
+        else # actual values that will be passed to printf
+            push!(args, part)
+        end
+    end
+
+    quote
+        _print($(map(esc, args)...))
+    end
+end
+
+@doc (@doc @print) ->
+macro println(parts...)
+    esc(quote
+        SPIRVIntrinsics.@print($(parts...), "\n")
+    end)
+end
+
+"""
+    @show(ex)
+
+GPU analog of `Base.@show`. It comes with the same type restrictions as [`@printf`](@ref).
+
+```julia
+@show get_global_id()
+```
+"""
+macro show(exs...)
+    blk = Expr(:block)
+    for ex in exs
+        push!(blk.args, :(SPIRVIntrinsics.@println($(sprint(Base.show_unquoted,ex)*" = "),
+                                                   begin local value = $(esc(ex)) end)))
+    end
+    isempty(exs) || push!(blk.args, :value)
+    blk
+end
diff --git a/lib/intrinsics/src/synchronization.jl b/lib/intrinsics/src/synchronization.jl
new file mode 100644
index 0000000..0d6c413
--- /dev/null
+++ b/lib/intrinsics/src/synchronization.jl
@@ -0,0 +1,21 @@
+# Synchronization Functions
+
+export barrier
+
+const cl_mem_fence_flags = UInt32
+const CLK_LOCAL_MEM_FENCE = cl_mem_fence_flags(1)
+const CLK_GLOBAL_MEM_FENCE = cl_mem_fence_flags(2)
+
+#barrier(flags=0) = @builtin_ccall("barrier", Cvoid, (UInt32,), flags)
+@device_function barrier(flags=0) = Base.llvmcall(("""
+    declare void @_Z7barrierj(i32) #0
+    define void @entry(i32 %0) #1 {
+        call void @_Z7barrierj(i32 %0)
+        ret void
+    }
+    attributes #0 = { convergent }
+    attributes #1 = { alwaysinline }
+    """, "entry"),
+    Cvoid, Tuple{Int32}, convert(Int32, flags))
+push!(opencl_builtins, "_Z7barrierj")
+# TODO: add support for attributes to @builtin_ccall/LLVM.@typed_ccall
diff --git a/lib/intrinsics/src/utils.jl b/lib/intrinsics/src/utils.jl
new file mode 100644
index 0000000..0c69b80
--- /dev/null
+++ b/lib/intrinsics/src/utils.jl
@@ -0,0 +1,109 @@
+const opencl_builtins = String["printf"]
+
+# OpenCL functions need to be mangled according to the C++ Itanium spec. We implement a very
+# limited version of that spec here, just enough to support OpenCL built-ins.
+#
+# This macro also keeps track of called builtins, generating `ccall("extern...", llvmcall)`
+# expressions for them (so that we can exclude them during IR verification).
+macro builtin_ccall(name, ret, argtypes, args...)
+    @assert Meta.isexpr(argtypes, :tuple)
+    argtypes = argtypes.args
+
+    function mangle(T::Type)
+        if T == Cint
+            "i"
+        elseif T == Cuint
+            "j"
+        elseif T == Clong
+            "l"
+        elseif T == Culong
+            "m"
+        elseif T == Clonglong
+            "x"
+        elseif T == Culonglong
+            "y"
+        elseif T == Cshort
+            "s"
+        elseif T == Cushort
+            "t"
+        elseif T == Cchar
+            "c"
+        elseif T == Cuchar
+            "h"
+        elseif T == Cfloat
+            "f"
+        elseif T == Cdouble
+            "d"
+        elseif T <: LLVMPtr
+            elt, as = T.parameters
+
+            # mangle address space
+            ASstr = if as == AS.Global
+                "CLglobal"
+            #elseif as == AS.Global_device
+            #    "CLdevice"
+            #elseif as == AS.Global_host
+            #    "CLhost"
+            elseif as == AS.Local
+                "CLlocal"
+            elseif as == AS.Constant
+                "CLconstant"
+            elseif as == AS.Private
+                "CLprivate"
+            elseif as == AS.Generic
+                "CLgeneric"
+            else
+                error("Unknown address space $as")
+            end
+
+            # encode as vendor qualifier
+            ASstr = "U" * string(length(ASstr)) * ASstr
+
+            # XXX: where does the V come from?
+            "P" * ASstr * "V" * mangle(elt)
+        else
+            error("Unknown type $T")
+        end
+    end
+
+    # C++-style mangling; very limited, just enough to support these intrinsics
+    # TODO: generalize for use with other intrinsics? do we need to mangle those?
+    mangled = "_Z$(length(name))$name"
+    for t in argtypes
+        # with `@eval @builtin_ccall`, we get actual types in the AST, otherwise symbols
+        t = (isa(t, Symbol) || isa(t, Expr)) ? eval(t) : t
+        mangled *= mangle(t)
+    end
+
+    push!(opencl_builtins, mangled)
+    esc(quote
+        @typed_ccall($mangled, llvmcall, $ret, ($(argtypes...),), $(args...))
+    end)
+end
+
+
+## device overrides
+
+# local method table for device functions
+Base.Experimental.@MethodTable(method_table)
+
+macro device_override(ex)
+    esc(quote
+        Base.Experimental.@overlay($method_table, $ex)
+    end)
+end
+
+macro device_function(ex)
+    ex = macroexpand(__module__, ex)
+    def = ExprTools.splitdef(ex)
+
+    # generate a function that errors
+    def[:body] = quote
+        error("This function is not intended for use on the CPU")
+    end
+
+    esc(quote
+        $(ExprTools.combinedef(def))
+        @device_override $ex
+    end)
+end
diff --git a/lib/intrinsics/src/work_item.jl b/lib/intrinsics/src/work_item.jl
new file mode 100644
index 0000000..ee3cf73
--- /dev/null
+++ b/lib/intrinsics/src/work_item.jl
@@ -0,0 +1,28 @@
+# Work-Item Functions
+
+export get_work_dim,
+       get_global_size, get_global_id,
+       get_local_size, get_enqueued_local_size, get_local_id,
+       get_num_groups, get_group_id,
+       get_global_offset,
+       get_global_linear_id, get_local_linear_id
+
+# NOTE: these functions now unsafely truncate to Int to avoid top bit checks.
+#       we should probably use range metadata instead.
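+#
+# NOTE: hypothetical usage sketch: these wrappers use 1-based indices, unlike OpenCL C,
+#       so a typical bounds-checked kernel body looks like
+#
+#           i = get_global_id(1)        # equals OpenCL C's get_global_id(0) + 1
+#           i > length(a) && return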
+ +@device_function get_work_dim() = @builtin_ccall("get_work_dim", UInt32, ()) % Int + +@device_function get_global_size(dimindx::Integer=1) = @builtin_ccall("get_global_size", UInt, (UInt32,), dimindx-1) % Int +@device_function get_global_id(dimindx::Integer=1) = @builtin_ccall("get_global_id", UInt, (UInt32,), dimindx-1) % Int + 1 + +@device_function get_local_size(dimindx::Integer=1) = @builtin_ccall("get_local_size", UInt, (UInt32,), dimindx-1) % Int +@device_function get_enqueued_local_size(dimindx::Integer=1) = @builtin_ccall("get_enqueued_local_size", UInt, (UInt32,), dimindx-1) % Int +@device_function get_local_id(dimindx::Integer=1) = @builtin_ccall("get_local_id", UInt, (UInt32,), dimindx-1) % Int + 1 + +@device_function get_num_groups(dimindx::Integer=1) = @builtin_ccall("get_num_groups", UInt, (UInt32,), dimindx-1) % Int +@device_function get_group_id(dimindx::Integer=1) = @builtin_ccall("get_group_id", UInt, (UInt32,), dimindx-1) % Int + 1 + +@device_function get_global_offset(dimindx::Integer=1) = @builtin_ccall("get_global_offset", UInt, (UInt32,), dimindx-1) % Int + 1 + +@device_function get_global_linear_id() = @builtin_ccall("get_global_linear_id", UInt, ()) % Int + 1 +@device_function get_local_linear_id() = @builtin_ccall("get_local_linear_id", UInt, ()) % Int + 1 diff --git a/src/OpenCL.jl b/src/OpenCL.jl index e69faed..70e19b9 100644 --- a/src/OpenCL.jl +++ b/src/OpenCL.jl @@ -1,12 +1,45 @@ module OpenCL +using GPUCompiler +using LLVM, LLVM.Interop +using SPIRV_LLVM_Translator_unified_jll +using Adapt using Reexport +using Core: LLVMPtr + # library wrappers -include("../lib/CL.jl") +include("../lib/cl/CL.jl") @reexport using .cl export cl +# device functionality +include("device/runtime.jl") +import SPIRVIntrinsics +let + # re-export functionality from SPIRVIntrinsics + for name in names(SPIRVIntrinsics) + name == :SPIRVIntrinsics && continue + @eval export $name + end + + # import all the others so that the user can refer to them through the OpenCL module + for name in names(SPIRVIntrinsics; all=true) + # bring all the names of this module in scope + name in (:SPIRVIntrinsics, :eval, :include) && continue + startswith(string(name), "#") && continue + @eval begin + using .SPIRVIntrinsics: $name + end + end +end +include("device/array.jl") + +# compiler implementation +include("compiler/compilation.jl") +include("compiler/execution.jl") +include("compiler/reflection.jl") + # high-level functionality include("util.jl") include("array.jl") diff --git a/src/array.jl b/src/array.jl index bd1a73b..5be7542 100644 --- a/src/array.jl +++ b/src/array.jl @@ -93,6 +93,11 @@ function Base.cconvert(::Type{Ptr{T}}, A::CLArray{T}) where T buffer(A) end +function Adapt.adapt_storage(to::KernelAdaptor, xs::CLArray{T,N}) where {T,N} + ptr = adapt(to, buffer(xs)) + CLDeviceArray{T,N,AS.Global}(size(xs), reinterpret(LLVMPtr{T,AS.Global}, ptr)) +end + ## utilities diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl new file mode 100644 index 0000000..423439c --- /dev/null +++ b/src/compiler/compilation.jl @@ -0,0 +1,82 @@ +## gpucompiler interface + +struct OpenCLCompilerParams <: AbstractCompilerParams end +const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams} +const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams} + +GPUCompiler.runtime_module(::CompilerJob{<:Any,OpenCLCompilerParams}) = OpenCL + +GPUCompiler.method_table(::OpenCLCompilerJob) = method_table + +# filter out OpenCL built-ins +# TODO: eagerly 
lower these using the translator API +GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) = + invoke(GPUCompiler.isintrinsic, + Tuple{CompilerJob{SPIRVCompilerTarget}, typeof(fn)}, + job, fn) || + in(fn, opencl_builtins) + + +## compiler implementation (cache, configure, compile, and link) + +# cache of compilation caches, per context +const _compiler_caches = Dict{cl.Context, Dict{Any, Any}}() +function compiler_cache(ctx::cl.Context) + cache = get(_compiler_caches, ctx, nothing) + if cache === nothing + cache = Dict{Any, Any}() + _compiler_caches[ctx] = cache + end + return cache +end + +# cache of compiler configurations, per device (but additionally configurable via kwargs) +const _toolchain = Ref{Any}() +const _compiler_configs = Dict{UInt, OpenCLCompilerConfig}() +function compiler_config(dev::cl.Device; kwargs...) + h = hash(dev, hash(kwargs)) + config = get(_compiler_configs, h, nothing) + if config === nothing + config = _compiler_config(dev; kwargs...) + _compiler_configs[h] = config + end + return config +end +@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, kwargs...) + supports_fp16 = "cl_khr_fp16" in dev.extensions + supports_fp64 = "cl_khr_fp64" in dev.extensions + + # create GPUCompiler objects + target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...) + params = OpenCLCompilerParams() + CompilerConfig(target, params; kernel, name, always_inline) +end + +# compile to executable machine code +function compile(@nospecialize(job::CompilerJob)) + # TODO: this creates a context; cache those. + obj, meta = JuliaContext() do ctx + GPUCompiler.compile(:obj, job) + end + + (obj, entry=LLVM.name(meta.entry)) +end + +# link into an executable kernel +function link(@nospecialize(job::CompilerJob), compiled) + prog = if "cl_khr_il_program" in cl.device().extensions + cl.Program(; il=compiled.obj) + else + error("Your device does not support SPIR-V, which is currently required for native execution.") + # XXX: kpet/spirv2clc#87, caused by KhronosGroup/SPIRV-LLVM-Translator#2029 + source = mktempdir() do dir + il = joinpath(dir, "kernel.spv") + write(il, compiled.obj) + cmd = `spirv2clc $il` + read(cmd, String) + end + cl.Program(; source) + end + cl.build!(prog) + cl.Kernel(prog, compiled.entry) +end diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl new file mode 100644 index 0000000..235f70e --- /dev/null +++ b/src/compiler/execution.jl @@ -0,0 +1,211 @@ +export @opencl, clfunction, clconvert + + +## high-level @opencl interface + +const MACRO_KWARGS = [:launch] +const COMPILER_KWARGS = [:kernel, :name, :always_inline] +const LAUNCH_KWARGS = [:global_size, :local_size, :queue] + +macro opencl(ex...) 
+    call = ex[end]
+    kwargs = map(ex[1:end-1]) do kwarg
+        if kwarg isa Symbol
+            :($kwarg = $kwarg)
+        elseif Meta.isexpr(kwarg, :(=))
+            kwarg
+        else
+            throw(ArgumentError("Invalid keyword argument '$kwarg'"))
+        end
+    end
+
+    # destructure the kernel call
+    Meta.isexpr(call, :call) || throw(ArgumentError("final argument to @opencl should be a function call"))
+    f = call.args[1]
+    args = call.args[2:end]
+
+    code = quote end
+    vars, var_exprs = assign_args!(code, args)
+
+    # group keyword arguments
+    macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs =
+        split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS)
+    if !isempty(other_kwargs)
+        key,val = first(other_kwargs).args
+        throw(ArgumentError("Unsupported keyword argument '$key'"))
+    end
+
+    # handle keyword arguments that influence the macro's behavior
+    launch = true
+    for kwarg in macro_kwargs
+        key,val = kwarg.args
+        if key == :launch
+            isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @opencl should be a constant value"))
+            launch = val::Bool
+        else
+            throw(ArgumentError("Unsupported keyword argument '$key'"))
+        end
+    end
+    if !launch && !isempty(call_kwargs)
+        error("@opencl with launch=false does not support launch-time keyword arguments; use them when calling the kernel")
+    end
+
+    # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5)
+    #        we esc() the whole thing now, necessitating gensyms...
+    @gensym f_var kernel_f kernel_args kernel_tt kernel
+
+    # convert the arguments, call the compiler and launch the kernel
+    # while keeping the original arguments alive
+    push!(code.args,
+        quote
+            $f_var = $f
+            GC.@preserve $(vars...) $f_var begin
+                $kernel_f = $clconvert($f_var)
+                $kernel_args = map($clconvert, ($(var_exprs...),))
+                $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
+                $kernel = $clfunction($kernel_f, $kernel_tt; $(compiler_kwargs...))
+                if $launch
+                    $kernel($(var_exprs...); $(call_kwargs...))
+                end
+                $kernel
+            end
+        end)
+
+    return esc(quote
+        let
+            $code
+        end
+    end)
+end
+
+
+## argument conversion
+
+struct KernelAdaptor
+    svm_pointers::Vector{Ptr{Cvoid}}
+end
+
+# assume directly-passed pointers are SVM pointers
+function Adapt.adapt_storage(to::KernelAdaptor, ptr::Ptr{T}) where {T}
+    push!(to.svm_pointers, ptr)
+    return ptr
+end
+
+# convert SVM buffers to their GPU address
+function Adapt.adapt_storage(to::KernelAdaptor, buf::cl.SVMBuffer)
+    ptr = pointer(buf)
+    push!(to.svm_pointers, ptr)
+    return ptr
+end
+
+# Base.RefValue isn't GPU compatible, so provide a compatible alternative
+# TODO: port improvements from CUDA.jl
+struct CLRefValue{T} <: Ref{T}
+    x::T
+end
+Base.getindex(r::CLRefValue) = r.x
+Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue) = CLRefValue(adapt(to, r[]))
+
+# broadcast sometimes passes a ref(type), resulting in a GPU-incompatible DataType box.
+# avoid that by using a special kind of ref that knows about the boxed type.
+struct CLRefType{T} <: Ref{DataType} end
+Base.getindex(r::CLRefType{T}) where T = T
+Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue{<:Union{DataType,Type}}) =
+    CLRefType{r[]}()
+
+# case where type is the function being broadcasted
+Adapt.adapt_structure(to::KernelAdaptor,
+                      bc::Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} =
+    Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes)
+
+"""
+    clconvert(x, [pointers])
+
+This function is called for every argument to be passed to a kernel, allowing it to be
+converted to a GPU-friendly format. By default, the function does nothing and returns the
+input object `x` as-is.
+
+Do not add methods to this function, but instead extend the underlying Adapt.jl package and
+register methods for the `OpenCL.KernelAdaptor` type.
+
+The `pointers` argument is used to collect pointers to indirect SVM buffers, which need to
+be registered with OpenCL before invoking the kernel.
+"""
+function clconvert(arg, pointers::Vector{Ptr{Cvoid}}=Ptr{Cvoid}[])
+    adapt(KernelAdaptor(pointers), arg)
+end
+
+
+
+## abstract kernel functionality
+
+abstract type AbstractKernel{F,TT} end
+
+@inline @generated function (kernel::AbstractKernel{F,TT})(args...;
+                                                           call_kwargs...) where {F,TT}
+    sig = Tuple{F, TT.parameters...}    # Base.signature_type with a function type
+    args = (:(kernel.f), (:( clconvert(args[$i], svm_pointers) ) for i in 1:length(args))...)
+
+    # filter out ghost arguments that shouldn't be passed
+    predicate = dt -> isghosttype(dt) || Core.Compiler.isconstType(dt)
+    to_pass = map(!predicate, sig.parameters)
+    call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]]
+    call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]]
+
+    # replace non-isbits arguments (they should be unused, or compilation would have failed)
+    for (i,dt) in enumerate(call_t)
+        if !isbitstype(dt)
+            call_t[i] = Ptr{Any}
+            call_args[i] = :C_NULL
+        end
+    end
+
+    # finalize types
+    call_tt = Base.to_tuple_type(call_t)
+
+    quote
+        svm_pointers = Ptr{Cvoid}[]
+        clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...)
+    end
+end
+
+
+
+## host-side kernels
+
+struct HostKernel{F,TT} <: AbstractKernel{F,TT}
+    f::F
+    fun::cl.Kernel
+end
+
+
+## host-side API
+
+const clfunction_lock = ReentrantLock()
+
+function clfunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
+    ctx = cl.context()
+    dev = cl.device()
+
+    Base.@lock clfunction_lock begin
+        # compile the function
+        cache = compiler_cache(ctx)
+        source = methodinstance(F, tt)
+        config = compiler_config(dev; kwargs...)::OpenCLCompilerConfig
+        fun = GPUCompiler.cached_compilation(cache, source, config, compile, link)
+
+        # create a callable object that captures the function instance. we don't need to think
+        # about world age here, as GPUCompiler already does and will return a different object
+        h = hash(fun, hash(f, hash(tt)))
+        kernel = get(_kernel_instances, h, nothing)
+        if kernel === nothing
+            # create the kernel state object
+            kernel = HostKernel{F,tt}(f, fun)
+            _kernel_instances[h] = kernel
+        end
+        return kernel::HostKernel{F,tt}
+    end
+end
+
+# cache of kernel instances
+const _kernel_instances = Dict{UInt, Any}()
diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
new file mode 100644
index 0000000..a8bb714
--- /dev/null
+++ b/src/compiler/reflection.jl
@@ -0,0 +1,74 @@
+# code reflection entry-points
+
+#
+# code_* replacements
+#
+
+# function to split off certain kwargs for selective forwarding, at run time.
+# `@opencl` does something similar at parse time, using `GPUCompiler.split_kwargs`.
+function split_kwargs_runtime(kwargs, wanted::Vector{Symbol})
+    remaining = Dict{Symbol, Any}()
+    extracted = Dict{Symbol, Any}()
+    for (key, value) in kwargs
+        if key in wanted
+            extracted[key] = value
+        else
+            remaining[key] = value
+        end
+    end
+    return extracted, remaining
+end
+
+for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
+    # only code_typed doesn't take an io argument
+    args = method == :code_typed ? (:job,) : (:io, :job)
diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
new file mode 100644
index 0000000..a8bb714
--- /dev/null
+++ b/src/compiler/reflection.jl
@@ -0,0 +1,74 @@
+# code reflection entry-points
+
+#
+# code_* replacements
+#
+
+# function to split off certain kwargs for selective forwarding, at run time.
+# `@opencl` does something similar at parse time, using `GPUCompiler.split_kwargs`.
+function split_kwargs_runtime(kwargs, wanted::Vector{Symbol})
+    remaining = Dict{Symbol, Any}()
+    extracted = Dict{Symbol, Any}()
+    for (key, value) in kwargs
+        if key in wanted
+            extracted[key] = value
+        else
+            remaining[key] = value
+        end
+    end
+    return extracted, remaining
+end
+
+for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
+    # only code_typed doesn't take an io argument
+    args = method == :code_typed ? (:job,) : (:io, :job)
+
+    @eval begin
+        function $method(io::IO, @nospecialize(func), @nospecialize(types);
+                         kernel::Bool=false, kwargs...)
+            compiler_kwargs, kwargs = split_kwargs_runtime(kwargs, COMPILER_KWARGS)
+            source = methodinstance(typeof(func), Base.to_tuple_type(types))
+            config = compiler_config(cl.device(); kernel, compiler_kwargs...)
+            job = CompilerJob(source, config)
+            GPUCompiler.$method($(args...); kwargs...)
+        end
+        $method(@nospecialize(func), @nospecialize(types); kwargs...) =
+            $method(stdout, func, types; kwargs...)
+    end
+end
+
+
+
+#
+# @device_code_* functions
+#
+
+export @device_code_lowered, @device_code_typed, @device_code_warntype, @device_code_llvm,
+       @device_code_native, @device_code
+
+# forward to GPUCompiler
+@eval $(Symbol("@device_code_lowered")) = $(getfield(GPUCompiler, Symbol("@device_code_lowered")))
+@eval $(Symbol("@device_code_typed")) = $(getfield(GPUCompiler, Symbol("@device_code_typed")))
+@eval $(Symbol("@device_code_warntype")) = $(getfield(GPUCompiler, Symbol("@device_code_warntype")))
+@eval $(Symbol("@device_code_llvm")) = $(getfield(GPUCompiler, Symbol("@device_code_llvm")))
+@eval $(Symbol("@device_code_native")) = $(getfield(GPUCompiler, Symbol("@device_code_native")))
+@eval $(Symbol("@device_code")) = $(getfield(GPUCompiler, Symbol("@device_code")))
+
+
+#
+# other
+#
+
+"""
+    OpenCL.return_type(f, tt) -> r::Type
+
+Return a type `r` such that `f(args...)::r` where `args::tt`.
+"""
+function return_type(@nospecialize(func), @nospecialize(tt))
+    source = methodinstance(typeof(func), tt)
+    config = compiler_config(cl.device())
+    job = CompilerJob(source, config)
+    interp = GPUCompiler.get_interpreter(job)
+    sig = Base.signature_type(func, tt)
+    Core.Compiler.return_type(interp, sig)
+end
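+
+# For example (consistent with the tests in test/execution.jl):
+#
+#     OpenCL.return_type(identity, Tuple{Int})    # === Int
+#     OpenCL.return_type(sin, Tuple{Float32})     # === Float32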
diff --git a/src/device/array.jl b/src/device/array.jl
new file mode 100644
index 0000000..24ae808
--- /dev/null
+++ b/src/device/array.jl
@@ -0,0 +1,239 @@
+# Contiguous on-device arrays
+
+export CLDeviceArray, CLDeviceVector, CLDeviceMatrix
+
+
+## construction
+
+# NOTE: we can't support the typical `tuple or series of integers`-style construction,
+#       because we currently require a trailing pointer argument.
+
+struct CLDeviceArray{T,N,A} <: DenseArray{T,N}
+    ptr::LLVMPtr{T,A}
+    maxsize::Int
+
+    dims::Dims{N}
+    len::Int
+
+    # inner constructors, fully parameterized, exact types (i.e. Int, not <:Integer)
+    # TODO: deprecate; put `ptr` first like oneArray
+    CLDeviceArray{T,N,A}(dims::Dims{N}, ptr::LLVMPtr{T,A},
+                         maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
+        new(ptr, maxsize, dims, prod(dims))
+end
+
+const CLDeviceVector = CLDeviceArray{T,1,A} where {T,A}
+const CLDeviceMatrix = CLDeviceArray{T,2,A} where {T,A}
+
+# outer constructors, non-parameterized
+CLDeviceArray(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(dims, p)
+CLDeviceArray(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((len,), p)
+
+# outer constructors, partially parameterized
+CLDeviceArray{T}(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(dims, p)
+CLDeviceArray{T}(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((len,), p)
+CLDeviceArray{T,N}(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(dims, p)
+CLDeviceVector{T}(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((len,), p)
+
+# outer constructors, fully parameterized
+CLDeviceArray{T,N,A}(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(Int.(dims), p)
+CLDeviceVector{T,A}(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((Int(len),), p)
+
+
+## array interface
+
+Base.elsize(::Type{<:CLDeviceArray{T}}) where {T} = sizeof(T)
+
+Base.size(g::CLDeviceArray) = g.dims
+Base.sizeof(x::CLDeviceArray) = Base.elsize(x) * length(x)
+
+# we store the array length too; computing prod(size) is expensive
+Base.length(g::CLDeviceArray) = g.len
+
+Base.pointer(x::CLDeviceArray{T,<:Any,A}) where {T,A} = Base.unsafe_convert(LLVMPtr{T,A}, x)
+@inline function Base.pointer(x::CLDeviceArray{T,<:Any,A}, i::Integer) where {T,A}
+    Base.unsafe_convert(LLVMPtr{T,A}, x) + Base._memory_offset(x, i)
+end
+
+typetagdata(a::CLDeviceArray{<:Any,<:Any,A}, i=1) where {A} =
+    reinterpret(LLVMPtr{UInt8,A}, a.ptr + a.maxsize) + i - one(i)
+
+
+## conversions
+
+Base.unsafe_convert(::Type{LLVMPtr{T,A}}, x::CLDeviceArray{T,<:Any,A}) where {T,A} =
+    x.ptr
+
+
+## indexing intrinsics
+
+# TODO: how are allocations aligned by the OpenCL API? keep track of this
+#       because it enables optimizations like Load Store Vectorization
+#       (cf. shared memory and its wider-than-datatype alignment)
+
+@generated function alignment(::CLDeviceArray{T}) where {T}
+    if Base.isbitsunion(T)
+        _, sz, al = Base.uniontype_layout(T)
+        al
+    else
+        Base.datatype_alignment(T)
+    end
+end
+
+@device_function @inline function arrayref(A::CLDeviceArray{T}, index::Integer) where {T}
+    @boundscheck checkbounds(A, index)
+    if isbitstype(T)
+        arrayref_bits(A, index)
+    else #if isbitsunion(T)
+        arrayref_union(A, index)
+    end
+end
+
+@inline function arrayref_bits(A::CLDeviceArray{T}, index::Integer) where {T}
+    align = alignment(A)
+    unsafe_load(pointer(A), index, Val(align))
+end
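+
+# For an element type like Union{Int32,Float32}, the branch chain generated below behaves
+# roughly like this (illustrative sketch, not the literal expansion):
+#
+#     if selector == 0
+#         unsafe_load(reinterpret(LLVMPtr{Int32,AS}, data_ptr), 1, Val(align))
+#     elseif selector == 1
+#         unsafe_load(reinterpret(LLVMPtr{Float32,AS}, data_ptr), 1, Val(align))
+#     else
+#         unreachable
+#     end
+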
+@inline @generated function arrayref_union(A::CLDeviceArray{T,<:Any,AS}, index::Integer) where {T,AS}
+    typs = Base.uniontypes(T)
+
+    # generate code that conditionally loads a value based on the selector value.
+    # lacking noreturn, we return T to avoid inference thinking this can return Nothing.
+    ex = :(Base.llvmcall("unreachable", $T, Tuple{}))
+    for (sel, typ) in Iterators.reverse(enumerate(typs))
+        ex = quote
+            if selector == $(sel-1)
+                ptr = reinterpret(LLVMPtr{$typ,AS}, data_ptr)
+                unsafe_load(ptr, 1, Val(align))
+            else
+                $ex
+            end
+        end
+    end
+
+    quote
+        selector_ptr = typetagdata(A, index)
+        selector = unsafe_load(selector_ptr)
+
+        align = alignment(A)
+        data_ptr = pointer(A, index)
+
+        return $ex
+    end
+end
+
+@device_function @inline function arrayset(A::CLDeviceArray{T}, x::T, index::Integer) where {T}
+    @boundscheck checkbounds(A, index)
+    if isbitstype(T)
+        arrayset_bits(A, x, index)
+    else #if isbitsunion(T)
+        arrayset_union(A, x, index)
+    end
+    return A
+end
+
+@inline function arrayset_bits(A::CLDeviceArray{T}, x::T, index::Integer) where {T}
+    align = alignment(A)
+    unsafe_store!(pointer(A), x, index, Val(align))
+end
+
+@inline @generated function arrayset_union(A::CLDeviceArray{T,<:Any,AS}, x::T, index::Integer) where {T,AS}
+    typs = Base.uniontypes(T)
+    sel = findfirst(isequal(x), typs)
+
+    quote
+        selector_ptr = typetagdata(A, index)
+        unsafe_store!(selector_ptr, $(UInt8(sel-1)))
+
+        align = alignment(A)
+        data_ptr = pointer(A, index)
+
+        unsafe_store!(reinterpret(LLVMPtr{$x,AS}, data_ptr), x, 1, Val(align))
+        return
+    end
+end
+
+@device_function @inline function const_arrayref(A::CLDeviceArray{T}, index::Integer) where {T}
+    @boundscheck checkbounds(A, index)
+    align = alignment(A)
+    unsafe_cached_load(pointer(A), index, Val(align))
+end
+
+
+## indexing
+
+Base.IndexStyle(::Type{<:CLDeviceArray}) = Base.IndexLinear()
+
+Base.@propagate_inbounds Base.getindex(A::CLDeviceArray{T}, i1::Integer) where {T} =
+    arrayref(A, i1)
+Base.@propagate_inbounds Base.setindex!(A::CLDeviceArray{T}, x, i1::Integer) where {T} =
+    arrayset(A, convert(T,x)::T, i1)
+
+# preserve the specific integer type when indexing device arrays,
+# to avoid extending 32-bit hardware indices to 64-bit.
+Base.to_index(::CLDeviceArray, i::Integer) = i
+
+# Base doesn't like Integer indices, so we need our own ND get and setindex! routines.
+# See also: https://github.com/JuliaLang/julia/pull/42289
+Base.@propagate_inbounds Base.getindex(A::CLDeviceArray,
+                                       I::Union{Integer, CartesianIndex}...) =
+    A[Base._to_linear_index(A, to_indices(A, I)...)]
+Base.@propagate_inbounds Base.setindex!(A::CLDeviceArray, x,
+                                        I::Union{Integer, CartesianIndex}...) =
+    A[Base._to_linear_index(A, to_indices(A, I)...)] = x
+
+
+## const indexing
+
+"""
+    Const(A::CLDeviceArray)
+
+Mark a `CLDeviceArray` as constant/read-only. The invariant guaranteed is that you will not
+modify the array for the duration of the current kernel.
+
+!!! warning
+    Experimental API. Subject to change without deprecation.
+"""
+struct Const{T,N,AS} <: DenseArray{T,N}
+    a::CLDeviceArray{T,N,AS}
+end
+Base.Experimental.Const(A::CLDeviceArray) = Const(A)
+
+Base.IndexStyle(::Type{<:Const}) = IndexLinear()
+Base.size(C::Const) = size(C.a)
+Base.axes(C::Const) = axes(C.a)
+Base.@propagate_inbounds Base.getindex(A::Const, i1::Integer) = const_arrayref(A.a, i1)
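+
+# Usage sketch (inside a kernel body, `A` being a CLDeviceArray argument):
+#
+#     B = Base.Experimental.Const(A)
+#     val = B[get_global_id(1)]    # reads dispatch to const_arrayref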
+""" +struct Const{T,N,AS} <: DenseArray{T,N} + a::CLDeviceArray{T,N,AS} +end +Base.Experimental.Const(A::CLDeviceArray) = Const(A) + +Base.IndexStyle(::Type{<:Const}) = IndexLinear() +Base.size(C::Const) = size(C.a) +Base.axes(C::Const) = axes(C.a) +Base.@propagate_inbounds Base.getindex(A::Const, i1::Integer) = const_arrayref(A.a, i1) + + +## other + +Base.show(io::IO, a::CLDeviceVector) = + print(io, "$(length(a))-element device array at $(pointer(a))") +Base.show(io::IO, a::CLDeviceArray) = + print(io, "$(join(a.shape, '×')) device array at $(pointer(a))") + +Base.show(io::IO, mime::MIME"text/plain", a::CLDeviceArray) = show(io, a) + +@inline function Base.iterate(A::CLDeviceArray, i=1) + if (i % UInt) - 1 < length(A) + (@inbounds A[i], i + 1) + else + nothing + end +end + +function Base.reinterpret(::Type{T}, a::CLDeviceArray{S,N,A}) where {T,S,N,A} + err = _reinterpret_exception(T, a) + err === nothing || throw(err) + + if sizeof(T) == sizeof(S) # fast case + return CLDeviceArray{T,N,A}(size(a), reinterpret(LLVMPtr{T,A}, a.ptr), a.maxsize) + end + + isize = size(a) + size1 = div(isize[1]*sizeof(S), sizeof(T)) + osize = tuple(size1, Base.tail(isize)...) + return CLDeviceArray{T,N,A}(osize, reinterpret(LLVMPtr{T,A}, a.ptr), a.maxsize) +end diff --git a/src/device/runtime.jl b/src/device/runtime.jl new file mode 100644 index 0000000..cdd146a --- /dev/null +++ b/src/device/runtime.jl @@ -0,0 +1,11 @@ +signal_exception() = return + +malloc(sz) = C_NULL + +report_oom(sz) = return + +report_exception(ex) = return + +report_exception_name(ex) = return + +report_exception_frame(idx, func, file, line) = return diff --git a/test/Project.toml b/test/Project.toml index 382d28b..ba8f907 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd" diff --git a/test/execution.jl b/test/execution.jl new file mode 100644 index 0000000..a64b542 --- /dev/null +++ b/test/execution.jl @@ -0,0 +1,124 @@ +if !in("cl_khr_il_program", cl.device().extensions) +@warn "Skipping execution tests on $(cl.platform().name)" +else + +@testset "execution" begin + +@testset "@opencl" begin + +dummy() = nothing + +@test_throws UndefVarError @opencl undefined() +@test_throws MethodError @opencl dummy(1) + + +@testset "launch configuration" begin + @opencl dummy() + + global_size = 1 + @opencl global_size dummy() + @opencl global_size=1 dummy() + @opencl global_size=(1,1) dummy() + @opencl global_size=(1,1,1) dummy() + + local_size = 1 + @opencl global_size local_size dummy() + @opencl global_size=1 local_size=1 dummy() + @opencl global_size=(1,1) local_size=(1,1) dummy() + @opencl global_size=(1,1,1) local_size=(1,1,1) dummy() + + @test_throws ArgumentError @opencl global_size=(1,) local_size=(1,1) dummy() + @test_throws InexactError @opencl global_size=(-2) dummy() + @test_throws InexactError @opencl local_size=(-2) dummy() +end + +@testset "launch=false" begin + # XXX: how are svm_pointers handled here? 
+    k = @opencl launch=false dummy()
+    k()
+    k(; global_size=1)
+end
+
+@testset "inference" begin
+    foo() = @opencl dummy()
+    @inferred foo()
+
+    # with arguments, we call clconvert
+    kernel(a) = return
+    bar(a) = @opencl kernel(a)
+    @inferred bar(CLArray([1]))
+end
+
+
+@testset "reflection" begin
+    OpenCL.code_lowered(dummy, Tuple{})
+    OpenCL.code_typed(dummy, Tuple{})
+    OpenCL.code_warntype(devnull, dummy, Tuple{})
+    OpenCL.code_llvm(devnull, dummy, Tuple{})
+    OpenCL.code_native(devnull, dummy, Tuple{})
+
+    @device_code_lowered @opencl dummy()
+    @device_code_typed @opencl dummy()
+    @device_code_warntype io=devnull @opencl dummy()
+    @device_code_llvm io=devnull @opencl dummy()
+    @device_code_native io=devnull @opencl dummy()
+
+    mktempdir() do dir
+        @device_code dir=dir @opencl dummy()
+    end
+
+    @test_throws ErrorException @device_code_lowered nothing
+
+    # make sure kernel name aliases are preserved in the generated code
+    @test occursin("dummy", sprint(io->(@device_code_llvm io=io optimize=false @opencl dummy())))
+    @test occursin("dummy", sprint(io->(@device_code_llvm io=io @opencl dummy())))
+    @test occursin("dummy", sprint(io->(@device_code_native io=io @opencl dummy())))
+
+    # make sure invalid kernels can be partially reflected upon
+    let
+        invalid_kernel() = throw()
+        @test_throws OpenCL.InvalidIRError @opencl invalid_kernel()
+        @test_throws OpenCL.InvalidIRError IOCapture.capture() do
+            @device_code_warntype @opencl invalid_kernel()
+        end
+        c = IOCapture.capture() do
+            try
+                @device_code_warntype @opencl invalid_kernel()
+            catch
+            end
+        end
+        @test occursin("Body::Union{}", c.output)
+    end
+
+    # set name of kernel
+    @test occursin("mykernel", sprint(io->(@device_code_llvm io=io begin
+        @opencl name="mykernel" dummy()
+    end)))
+
+    @test OpenCL.return_type(identity, Tuple{Int}) === Int
+    @test OpenCL.return_type(sin, Tuple{Float32}) === Float32
+    @test OpenCL.return_type(getindex, Tuple{CLDeviceArray{Float32,1,AS.Global},Int32}) === Float32
+    @test OpenCL.return_type(getindex, Tuple{Base.RefValue{Integer}}) === Integer
+end
+
+end
+
+###############################################################################
+
+@testset "argument passing" begin
+
+function memset(a, val)
+    gid = get_global_id(1)
+    @inbounds a[gid] = val
+    return
+end
+
+a = CLArray{Int}(undef, 10)
+@opencl global_size=length(a) memset(a, 42)
+@test all(Array(a) .== 42)
+
+end
+
+end
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index c41a005..bacb223 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,16 +2,19 @@ using Test
 using OpenCL
 using pocl_jll
 
+using IOCapture
+
 @info "System information:\n" * sprint(io->OpenCL.versioninfo(io))
 
 @testset "OpenCL.jl" begin
 
 @testset "$(platform.name): $(device.name)" for platform in cl.platforms(),
-    device in cl.devices(platform)
+        device in cl.devices(platform)
 
 cl.platform!(platform)
 cl.device!(device)
 
+# libopencl wrappers
 include("platform.jl")
 include("context.jl")
 include("device.jl")
@@ -22,7 +25,9 @@ include("kernel.jl")
 include("behaviour.jl")
 include("memory.jl")
 include("buffer.jl")
+include("array.jl")
 
+include("execution.jl")
 
 end