From 8584ac79ff4701df509e54245cd05f275cd71d93 Mon Sep 17 00:00:00 2001 From: SolDev69 Date: Thu, 21 Dec 2023 20:13:50 -0500 Subject: [PATCH] apply pancsf patches --- .ci-farms-disabled/lima | 0 README-CSF.rst | 88 + README.rst | 145 +- bin/ci/custom_logger.py | 334 + bin/ci/test/requirements.txt | 5 + bin/ci/test/test_custom_logger.py | 669 + docs/features.txt.rej | 10 + include/dma-uapi/dma-buf.h | 182 + include/drm-uapi/drm_fourcc.h | 7 + meson.build.rej | 18 + patch.diff | 25515 ++++++++++++++++ src/amd/vulkan/radv_buffer_view.c | 149 + src/amd/vulkan/radv_image_view.c | 945 + src/amd/vulkan/radv_sdma.h | 93 + src/android_stub/meson.build.rej | 10 + src/compiler/glsl/glsl_to_nir.cpp.rej | 39 + src/compiler/glsl/standalone_scaffolding.cpp | 3 + src/drm-shim/device.c | 4 + src/egl/drivers/dri2/egl_dri2.c | 2 + src/egl/drivers/dri2/egl_dri2.c.rej | 60 + src/egl/drivers/dri2/egl_dri2.h | 4 + src/egl/drivers/dri2/platform_wayland.c | 92 +- src/egl/drivers/dri2/platform_wayland.c.rej | 89 + src/egl/meson.build.rej | 19 + .../mali-buffer-sharing/mali-buffer-sharing.c | 170 + .../mali-buffer-sharing/mali-buffer-sharing.h | 12 + .../mali-buffer-sharing.xml | 50 + .../wayland/mali-buffer-sharing/meson.build | 51 + src/egl/wayland/wayland-drm/wayland-drm.c.rej | 10 + src/gallium/auxiliary/cso_cache/cso_context.c | 5 + src/gallium/auxiliary/cso_cache/cso_context.h | 3 + .../auxiliary/gallivm/lp_bld_nir_soa.c.rej | 19 + .../pipe-loader/pipe_loader_drm.c.rej | 10 + .../target-helpers/inline_sw_helper.h.rej | 43 + src/gallium/drivers/panfrost/meson.build | 1 + src/gallium/drivers/panfrost/meson.build.rej | 10 + .../drivers/panfrost/pan_cmdstream.c.rej | 1186 + .../drivers/panfrost/pan_context.c.rej | 178 + .../drivers/panfrost/pan_context.h.rej | 42 + src/gallium/drivers/panfrost/pan_disk_cache.c | 2 + .../drivers/panfrost/pan_disk_cache.c.rej | 23 + src/gallium/drivers/panfrost/pan_fence.c.rej | 66 + src/gallium/drivers/panfrost/pan_fence.h.rej | 9 + src/gallium/drivers/panfrost/pan_job.c.rej | 596 + src/gallium/drivers/panfrost/pan_job.h.rej | 42 + .../drivers/panfrost/pan_resource.c.rej | 426 + src/gallium/drivers/panfrost/pan_screen.c.rej | 87 + src/gallium/drivers/panfrost/pan_screen.h | 1 + src/gallium/drivers/panfrost/pan_screen.h.rej | 28 + src/gallium/frontends/nine/nine_ff.c | 2 +- src/gallium/frontends/nine/nine_shader.c | 4 +- src/gallium/frontends/nine/nine_shader.c.rej | 10 + src/gallium/frontends/nine/nine_state.c | 4 +- src/gallium/frontends/nine/nine_state.c.rej | 13 + .../targets/d3dadapter9/meson.build.rej | 11 + src/gallium/targets/osmesa/meson.build.rej | 14 + src/gallium/targets/rusticl/meson.build.rej | 9 + .../winsys/kmsro/drm/kmsro_drm_winsys.c.rej | 19 + src/mesa/main/shaderapi.c | 7 - src/mesa/main/shaderapi.c.rej | 9 + src/meson.build | 1 + .../base/include/csf/mali_base_csf_kernel.h | 596 + .../base/include/csf/mali_gpu_csf_registers.h | 43 + .../base/include/csf/mali_kbase_csf_ioctl.h | 530 + .../base/include/jm/mali_base_jm_kernel.h | 1051 + .../base/include/jm/mali_kbase_jm_ioctl.h | 231 + .../base/include/mali_base_common_kernel.h | 231 + src/panfrost/base/include/mali_base_kernel.h | 700 + .../base/include/mali_kbase_gpuprops.h | 127 + src/panfrost/base/include/mali_kbase_ioctl.h | 759 + .../base/include/old/mali-ioctl-midgard.h | 80 + src/panfrost/base/include/old/mali-ioctl.h | 743 + src/panfrost/base/include/old/mali-props.h | 262 + src/panfrost/base/meson.build | 55 + src/panfrost/base/pan_base.c | 301 + src/panfrost/base/pan_base.h | 234 + 
src/panfrost/base/pan_base_noop.h | 152 + src/panfrost/base/pan_cache.h | 95 + src/panfrost/base/pan_vX_base.c | 1825 ++ src/panfrost/ci/deqp-panfrost-g610.toml | 11 + src/panfrost/csf_test/interpret.py | 1820 ++ src/panfrost/csf_test/mali_base_csf_kernel.h | 721 + src/panfrost/csf_test/mali_base_kernel.h | 746 + .../csf_test/mali_gpu_csf_registers.h | 43 + src/panfrost/csf_test/mali_kbase_csf_ioctl.h | 483 + src/panfrost/csf_test/mali_kbase_ioctl.h | 854 + src/panfrost/csf_test/test.c | 1903 ++ src/panfrost/lib/genxml/common.xml | 2 +- src/panfrost/lib/genxml/decode.c.rej | 940 + src/panfrost/lib/genxml/decode.h.rej | 28 + src/panfrost/lib/genxml/decode_common.c.rej | 52 + src/panfrost/lib/genxml/gen_macros.h.rej | 11 + src/panfrost/lib/genxml/gen_pack.py | 317 +- src/panfrost/lib/genxml/meson.build.rej | 19 + src/panfrost/lib/genxml/v4.xml | 2 +- src/panfrost/lib/genxml/v5.xml | 2 +- src/panfrost/lib/genxml/v6.xml | 8 +- src/panfrost/lib/genxml/v7.xml | 12 +- src/panfrost/lib/genxml/v9.xml | 75 +- src/panfrost/lib/genxml/v9.xml.rej | 28 + src/panfrost/lib/meson.build | 2 +- src/panfrost/lib/meson.build.rej | 10 + src/panfrost/lib/pan_afbc.c.rej | 25 + src/panfrost/lib/pan_blend.c.rej | 10 + src/panfrost/lib/pan_blitter.c.rej | 28 + src/panfrost/lib/pan_bo.c | 1 + src/panfrost/lib/pan_bo.c.rej | 584 + src/panfrost/lib/pan_bo.h.rej | 84 + src/panfrost/lib/pan_device.h.rej | 88 + src/panfrost/lib/pan_layout.c.rej | 66 + src/panfrost/lib/pan_pool.h.rej | 19 + src/panfrost/lib/pan_props.c.rej | 365 + src/panfrost/lib/pan_texture.h.rej | 55 + src/panfrost/lib/pan_util.h.rej | 19 + src/panfrost/lib/wrap.h.rej | 21 + src/panfrost/meson.build | 42 +- src/panfrost/meson.build.rej | 10 + src/panfrost/midgard/disassemble.c.rej | 12 + src/panfrost/tiler/tiler-hex-read | 400 + src/util/os_misc.c.rej | 103 + src/util/perf/cpu_trace.h.rej | 21 + src/util/stable_array.h | 132 + src/util/u_debug_stack_android.cpp.rej | 83 + 123 files changed, 49782 insertions(+), 145 deletions(-) create mode 100644 .ci-farms-disabled/lima create mode 100644 README-CSF.rst create mode 100644 bin/ci/custom_logger.py create mode 100644 bin/ci/test/requirements.txt create mode 100644 bin/ci/test/test_custom_logger.py create mode 100644 docs/features.txt.rej create mode 100644 include/dma-uapi/dma-buf.h create mode 100644 meson.build.rej create mode 100644 patch.diff create mode 100644 src/amd/vulkan/radv_buffer_view.c create mode 100644 src/amd/vulkan/radv_image_view.c create mode 100644 src/amd/vulkan/radv_sdma.h create mode 100644 src/android_stub/meson.build.rej create mode 100644 src/compiler/glsl/glsl_to_nir.cpp.rej create mode 100644 src/egl/drivers/dri2/egl_dri2.c.rej create mode 100644 src/egl/drivers/dri2/platform_wayland.c.rej create mode 100644 src/egl/meson.build.rej create mode 100644 src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c create mode 100644 src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h create mode 100644 src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml create mode 100644 src/egl/wayland/mali-buffer-sharing/meson.build create mode 100644 src/egl/wayland/wayland-drm/wayland-drm.c.rej create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c.rej create mode 100644 src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c.rej create mode 100644 src/gallium/auxiliary/target-helpers/inline_sw_helper.h.rej create mode 100644 src/gallium/drivers/panfrost/meson.build.rej create mode 100644 src/gallium/drivers/panfrost/pan_cmdstream.c.rej create mode 100644 
src/gallium/drivers/panfrost/pan_context.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_context.h.rej create mode 100644 src/gallium/drivers/panfrost/pan_disk_cache.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_fence.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_fence.h.rej create mode 100644 src/gallium/drivers/panfrost/pan_job.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_job.h.rej create mode 100644 src/gallium/drivers/panfrost/pan_resource.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_screen.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_screen.h.rej create mode 100644 src/gallium/frontends/nine/nine_shader.c.rej create mode 100644 src/gallium/frontends/nine/nine_state.c.rej create mode 100644 src/gallium/targets/d3dadapter9/meson.build.rej create mode 100644 src/gallium/targets/osmesa/meson.build.rej create mode 100644 src/gallium/targets/rusticl/meson.build.rej create mode 100644 src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c.rej create mode 100644 src/mesa/main/shaderapi.c.rej create mode 100644 src/panfrost/base/include/csf/mali_base_csf_kernel.h create mode 100644 src/panfrost/base/include/csf/mali_gpu_csf_registers.h create mode 100644 src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h create mode 100644 src/panfrost/base/include/jm/mali_base_jm_kernel.h create mode 100644 src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h create mode 100644 src/panfrost/base/include/mali_base_common_kernel.h create mode 100644 src/panfrost/base/include/mali_base_kernel.h create mode 100644 src/panfrost/base/include/mali_kbase_gpuprops.h create mode 100644 src/panfrost/base/include/mali_kbase_ioctl.h create mode 100644 src/panfrost/base/include/old/mali-ioctl-midgard.h create mode 100644 src/panfrost/base/include/old/mali-ioctl.h create mode 100644 src/panfrost/base/include/old/mali-props.h create mode 100644 src/panfrost/base/meson.build create mode 100644 src/panfrost/base/pan_base.c create mode 100644 src/panfrost/base/pan_base.h create mode 100644 src/panfrost/base/pan_base_noop.h create mode 100644 src/panfrost/base/pan_cache.h create mode 100644 src/panfrost/base/pan_vX_base.c create mode 100644 src/panfrost/ci/deqp-panfrost-g610.toml create mode 100755 src/panfrost/csf_test/interpret.py create mode 100644 src/panfrost/csf_test/mali_base_csf_kernel.h create mode 100644 src/panfrost/csf_test/mali_base_kernel.h create mode 100644 src/panfrost/csf_test/mali_gpu_csf_registers.h create mode 100644 src/panfrost/csf_test/mali_kbase_csf_ioctl.h create mode 100644 src/panfrost/csf_test/mali_kbase_ioctl.h create mode 100644 src/panfrost/csf_test/test.c create mode 100644 src/panfrost/lib/genxml/decode.c.rej create mode 100644 src/panfrost/lib/genxml/decode.h.rej create mode 100644 src/panfrost/lib/genxml/decode_common.c.rej create mode 100644 src/panfrost/lib/genxml/gen_macros.h.rej create mode 100644 src/panfrost/lib/genxml/meson.build.rej create mode 100644 src/panfrost/lib/genxml/v9.xml.rej create mode 100644 src/panfrost/lib/meson.build.rej create mode 100644 src/panfrost/lib/pan_afbc.c.rej create mode 100644 src/panfrost/lib/pan_blend.c.rej create mode 100644 src/panfrost/lib/pan_blitter.c.rej create mode 100644 src/panfrost/lib/pan_bo.c.rej create mode 100644 src/panfrost/lib/pan_bo.h.rej create mode 100644 src/panfrost/lib/pan_device.h.rej create mode 100644 src/panfrost/lib/pan_layout.c.rej create mode 100644 src/panfrost/lib/pan_pool.h.rej create mode 100644 src/panfrost/lib/pan_props.c.rej create mode 100644 
src/panfrost/lib/pan_texture.h.rej create mode 100644 src/panfrost/lib/pan_util.h.rej create mode 100644 src/panfrost/lib/wrap.h.rej create mode 100644 src/panfrost/meson.build.rej create mode 100644 src/panfrost/midgard/disassemble.c.rej create mode 100755 src/panfrost/tiler/tiler-hex-read create mode 100644 src/util/os_misc.c.rej create mode 100644 src/util/perf/cpu_trace.h.rej create mode 100644 src/util/stable_array.h create mode 100644 src/util/u_debug_stack_android.cpp.rej diff --git a/.ci-farms-disabled/lima b/.ci-farms-disabled/lima new file mode 100644 index 00000000000..e69de29bb2d diff --git a/README-CSF.rst b/README-CSF.rst new file mode 100644 index 00000000000..9bd161005f9 --- /dev/null +++ b/README-CSF.rst @@ -0,0 +1,88 @@ +Valhall CSF Tests +================= + +The ``csf`` branch contains a test program for v10 Valhall GPUs (G710 +etc.) which uses the Arm ``kbase`` kernel driver, which is generally +present on vendor kernels but is not in the upstream Linux kernel. + +However, the kernel driver source can also be downloaded `from Arm +`_, +of which the newer releases should work well enough with a mainline +kernel (though some work may be needed to integrate the vendor +platform). + +Making sure that the ``libmali`` blob drivers work before trying this +program is recommended, otherwise you will be trying to debug +userspace and kernel bugs at the same time. + +Note that firmware is required for these GPUs, for RK3588 try +downloading the file from the Rockchip `libmali +`_ +repo, and placing it in ``/lib/firmware/``. + +Compiling +--------- + +.. code-block:: sh + + $ mkdir build + $ cd build + $ meson --buildtype=debug -Dgallium-drivers=panfrost -Dvulkan-drivers= + $ ninja src/panfrost/csf_test + +Running +------- + +.. code-block:: sh + + $ src/panfrost/csf_test + +will run the tests. + +Normally it will start running cleanup steps as soon as one test +fails, though setting the environment variable ``TEST_KEEP_GOING=1`` +will change this behaviour. + +Test failures +------------- + +Gitlab issues can be created against `my repo +`_, though +some problems should be easy to fix (wrong permissions on +``/dev/mali0``?). + +Include all output from running the test program. Including logs from +``strace`` might also help. + +Command stream test script +-------------------------- + +``src/panfrost/csf_test/interpret.py`` is a test script for assembling +and executing command streams. + +To use it, symlink the ``csf_test`` binary into ``$PATH`` and optionally +also write a ``rebuild-mesa`` script which recompiles ``csf_test``. + +Then running ``interpret.py`` will execute the ``cmds`` variable, +which is defined inside the script file. + +Example: + +.. code-block:: txt + + @ comments are started with '@' + + @ run on command stream 2 + !cs 2 + @ allocate some memory + !alloc x 4096 + @ allocate event memory, for evstr instructions + !alloc ev 4096 0x8200f + + mov x50, $x + + @ dump all registers to the memory starting at x50 + regdump x50 + + @ dump the memory region named 'x' + !dump x 0 4096 diff --git a/README.rst b/README.rst index b35246e034c..fd140a96013 100644 --- a/README.rst +++ b/README.rst @@ -1,59 +1,136 @@ `Mesa `_ - The 3D Graphics Library ====================================================== +Valhall v10 "CSF" support branch—for Mali G710/G610. + +Note that firmware is required for these GPUs, for RK3588 try +downloading the file from the Rockchip `libmali +`_ +repo, and placing it in ``/lib/firmware/``. 
+
+Windowing system support
+------------------------
+
+Panfrost Wayland compositor (wlroots):
+
+#. Panfrost Wayland clients
+#. Panfrost X11 clients via Xwayland [1]_
+#. Blob X11 clients via Xwayland + dri2to3 [2]_
+
+Panfrost Wayland compositor (non-wlroots):
+
+#. Panfrost Wayland clients
+#. Panfrost X11 clients via Xwayland
+#. Blob Wayland clients
+#. Blob X11 clients via Xwayland + dri2to3 [2]_
+
+Blob Wayland compositor:
+
+#. Panfrost Wayland clients
+#. Blob Wayland clients
+
+Panfrost Xorg server: [3]_
+
+#. Panfrost X11 clients
+#. Blob X11 clients
+
+Blob Xorg server:
+
+#. Panfrost X11 clients
+#. Blob X11 clients
+
+Applications using KMS/DRM will also work.
+
+.. [1] Requires ``CONFIG_DRM_IGNORE_IOTCL_PERMIT`` to be disabled in
+   the kernel configuration. The option is broken and should never
+   be enabled anyway.
+
+.. [2] See https://gitlab.com/panfork/dri2to3
+
+.. [3] For Radxa Debian/Ubuntu, the ``xserver-xorg-core`` version
+   installed by default is not compatible with Panfrost. But note
+   that upstream Xorg does not work with the blob, so Mesa must be
+   installed so that it is used by default (see the "Usage"
+   section below). To switch between the upstream and Rockchip
+   versions, run:
+
+.. code-block:: sh
+
+  $ sudo apt install xserver-xorg-core="$(apt-cache show xserver-xorg-core | grep Version | grep -v "$(dpkg -s xserver-xorg-core | grep Version)" | cut -d" " -f2)"
+
+Broken combinations:
+
+#. Panfrost wlroots + Blob Wayland does not work because wlroots does
+   not expose the ``mali_buffer_sharing`` protocol. This might be
+   fixable.
+#. Blob Wayland compositor + Panfrost X11 does not work because the
+   blob does not expose the required protocols for Xwayland
+   acceleration to work.
+
 Source
 ------
 
-This repository lives at https://gitlab.freedesktop.org/mesa/mesa.
-Other repositories are likely forks, and code found there is not supported.
+This repository lives at https://gitlab.com/panfork/mesa, and is a
+fork, so not supported by upstream.
+Upstream source is at https://gitlab.freedesktop.org/mesa/mesa.
 
-Build & install
----------------
+Dependencies
+-------------
 
-You can find more information in our documentation (`docs/install.rst
-`_), but the recommended way is to use
-Meson (`docs/meson.rst `_):
+For Debian-based distributions:
 
 .. code-block:: sh
 
-  $ mkdir build
-  $ cd build
-  $ meson ..
+  $ sudo apt install build-essential meson git python3-mako libexpat1-dev bison flex libwayland-egl-backend-dev libxext-dev libxfixes-dev libxcb-glx0-dev libxcb-shm0-dev libxcb-dri2-0-dev libxcb-dri3-dev libxcb-present-dev libxshmfence-dev libxxf86vm-dev libxrandr-dev
+
+Also needed is ``libdrm`` and ``wayland-protocols``, but those
+packages are too old in Debian Bullseye, and must be compiled from
+source:
+
+.. code-block:: sh
+
+  $ git clone https://gitlab.freedesktop.org/mesa/drm
+  $ mkdir drm/build
+  $ cd drm/build
+  $ meson
+  $ sudo ninja install
+
+.. code-block:: sh
+
+  $ git clone https://gitlab.freedesktop.org/wayland/wayland-protocols
+  $ mkdir wayland-protocols/build
+  $ cd wayland-protocols/build
+  $ git checkout 1.24
+  $ meson
   $ sudo ninja install
 
+Build & install
+---------------
 
-Support
--------
+To install to ``/opt/panfrost``:
 
-Many Mesa devs hang on IRC; if you're not sure which channel is
-appropriate, you should ask your question on `OFTC's #dri-devel
-`_, someone will redirect you if
-necessary.
-Remember that not everyone is in the same timezone as you, so it might
-take a while before someone qualified sees your question.
-To figure out who you're talking to, or which nick to ping for your -question, check out `Who's Who on IRC -`_. +.. code-block:: sh -The next best option is to ask your question in an email to the -mailing lists: `mesa-dev\@lists.freedesktop.org -`_ + $ mkdir build + $ cd build + $ meson -Dgallium-drivers=panfrost -Dvulkan-drivers= -Dllvm=disabled --prefix=/opt/panfrost + $ sudo ninja install +Usage +----- -Bug reports ------------ +To run an application with Panfrost (note the windowing system support +section above): -If you think something isn't working properly, please file a bug report -(`docs/bugs.rst `_). +.. code-block:: sh + $ LD_LIBRARY_PATH=/opt/panfrost/lib/aarch64-linux-gnu glmark2-es2-wayland -Contributing ------------- +To use Panfrost by default, add the directory where you installed it +to the library search path: -Contributions are welcome, and step-by-step instructions can be found in our -documentation (`docs/submittingpatches.rst -`_). +.. code-block:: sh -Note that Mesa uses gitlab for patches submission, review and discussions. + $ echo /opt/panfrost/lib/aarch64-linux-gnu | sudo tee /etc/ld.so.conf.d/0-panfrost.conf + $ sudo ldconfig diff --git a/bin/ci/custom_logger.py b/bin/ci/custom_logger.py new file mode 100644 index 00000000000..7721be2f66e --- /dev/null +++ b/bin/ci/custom_logger.py @@ -0,0 +1,334 @@ +import argparse +import logging +from datetime import datetime +from pathlib import Path + +from structured_logger import StructuredLogger + + +class CustomLogger: + def __init__(self, log_file): + self.log_file = log_file + self.logger = StructuredLogger(file_name=self.log_file) + + def get_last_dut_job(self): + """ + Gets the details of the most recent DUT job. + + Returns: + dict: Details of the most recent DUT job. + + Raises: + ValueError: If no DUT jobs are found in the logger's data. + """ + try: + job = self.logger.data["dut_jobs"][-1] + except KeyError: + raise ValueError( + "No DUT jobs found. Please create a job via create_dut_job call." + ) + + return job + + def update(self, **kwargs): + """ + Updates the log file with provided key-value pairs. + + Args: + **kwargs: Key-value pairs to be updated. + + """ + with self.logger.edit_context(): + for key, value in kwargs.items(): + self.logger.data[key] = value + + def create_dut_job(self, **kwargs): + """ + Creates a new DUT job with provided key-value pairs. + + Args: + **kwargs: Key-value pairs for the new DUT job. + + """ + with self.logger.edit_context(): + if "dut_jobs" not in self.logger.data: + self.logger.data["dut_jobs"] = [] + new_job = { + "status": "", + "submitter_start_time": datetime.now().isoformat(), + "dut_submit_time": "", + "dut_start_time": "", + "dut_end_time": "", + "dut_name": "", + "dut_state": "pending", + "dut_job_phases": [], + **kwargs, + } + self.logger.data["dut_jobs"].append(new_job) + + def update_dut_job(self, key, value): + """ + Updates the last DUT job with a key-value pair. + + Args: + key : The key to be updated. + value: The value to be assigned. + + """ + with self.logger.edit_context(): + job = self.get_last_dut_job() + job[key] = value + + def update_status_fail(self, reason=""): + """ + Sets the status of the last DUT job to 'fail' and logs the failure reason. + + Args: + reason (str, optional): The reason for the failure. Defaults to "". 
+ + """ + with self.logger.edit_context(): + job = self.get_last_dut_job() + job["status"] = "fail" + job["dut_job_fail_reason"] = reason + + def create_job_phase(self, phase_name): + """ + Creates a new job phase for the last DUT job. + + Args: + phase_name : The name of the new job phase. + + """ + with self.logger.edit_context(): + job = self.get_last_dut_job() + if job["dut_job_phases"] and job["dut_job_phases"][-1]["end_time"] == "": + # If the last phase exists and its end time is empty, set the end time + job["dut_job_phases"][-1]["end_time"] = datetime.now().isoformat() + + # Create a new phase + phase_data = { + "name": phase_name, + "start_time": datetime.now().isoformat(), + "end_time": "", + } + job["dut_job_phases"].append(phase_data) + + def check_dut_timings(self, job): + """ + Check the timing sequence of a job to ensure logical consistency. + + The function verifies that the job's submission time is not earlier than its start time and that + the job's end time is not earlier than its start time. If either of these conditions is found to be true, + an error is logged for each instance of inconsistency. + + Args: + job (dict): A dictionary containing timing information of a job. Expected keys are 'dut_start_time', + 'dut_submit_time', and 'dut_end_time'. + + Returns: + None: This function does not return a value; it logs errors if timing inconsistencies are detected. + + The function checks the following: + - If 'dut_start_time' and 'dut_submit_time' are both present and correctly sequenced. + - If 'dut_start_time' and 'dut_end_time' are both present and correctly sequenced. + """ + + # Check if the start time and submit time exist + if job.get("dut_start_time") and job.get("dut_submit_time"): + # If they exist, check if the submission time is before the start time + if job["dut_start_time"] < job["dut_submit_time"]: + logging.error("Job submission is happening before job start.") + + # Check if the start time and end time exist + if job.get("dut_start_time") and job.get("dut_end_time"): + # If they exist, check if the end time is after the start time + if job["dut_end_time"] < job["dut_start_time"]: + logging.error("Job ended before it started.") + + # Method to update DUT start, submit and end time + def update_dut_time(self, value, custom_time): + """ + Updates DUT start, submit, and end times. + + Args: + value : Specifies which DUT time to update. Options: 'start', 'submit', 'end'. + custom_time : Custom time to set. If None, use current time. + + Raises: + ValueError: If an invalid argument is provided for value. + + """ + with self.logger.edit_context(): + job = self.get_last_dut_job() + timestamp = custom_time if custom_time else datetime.now().isoformat() + if value == "start": + job["dut_start_time"] = timestamp + job["dut_state"] = "running" + elif value == "submit": + job["dut_submit_time"] = timestamp + job["dut_state"] = "submitted" + elif value == "end": + job["dut_end_time"] = timestamp + job["dut_state"] = "finished" + else: + raise ValueError( + "Error: Invalid argument provided for --update-dut-time. Use 'start', 'submit', 'end'." + ) + # check the sanity of the partial structured log + self.check_dut_timings(job) + + def close_dut_job(self): + """ + Closes the most recent DUT (Device Under Test) job in the logger's data. + + The method performs the following operations: + 1. Validates if there are any DUT jobs in the logger's data. + 2. If the last phase of the most recent DUT job has an empty end time, it sets the end time to the current time. 
+ + Raises: + ValueError: If no DUT jobs are found in the logger's data. + """ + with self.logger.edit_context(): + job = self.get_last_dut_job() + # Check if the last phase exists and its end time is empty, then set the end time + if job["dut_job_phases"] and job["dut_job_phases"][-1]["end_time"] == "": + job["dut_job_phases"][-1]["end_time"] = datetime.now().isoformat() + + def close(self): + """ + Closes the most recent DUT (Device Under Test) job in the logger's data. + + The method performs the following operations: + 1. Determines the combined status of all DUT jobs. + 2. Sets the submitter's end time to the current time. + 3. Updates the DUT attempt counter to reflect the total number of DUT jobs. + + """ + with self.logger.edit_context(): + job_status = [] + for job in self.logger.data["dut_jobs"]: + if "status" in job: + job_status.append(job["status"]) + + if not job_status: + job_combined_status = "null" + else: + # Get job_combined_status + if "pass" in job_status: + job_combined_status = "pass" + else: + job_combined_status = "fail" + + self.logger.data["job_combined_status"] = job_combined_status + self.logger.data["dut_attempt_counter"] = len(self.logger.data["dut_jobs"]) + job["submitter_end_time"] = datetime.now().isoformat() + + +def process_args(args): + # Function to process key-value pairs and call corresponding logger methods + def process_key_value_pairs(args_list, action_func): + if not args_list: + raise ValueError( + f"No key-value pairs provided for {action_func.__name__.replace('_', '-')}" + ) + if len(args_list) % 2 != 0: + raise ValueError( + f"Incomplete key-value pairs for {action_func.__name__.replace('_', '-')}" + ) + kwargs = dict(zip(args_list[::2], args_list[1::2])) + action_func(**kwargs) + + # Create a CustomLogger object with the specified log file path + custom_logger = CustomLogger(Path(args.log_file)) + + if args.update: + process_key_value_pairs(args.update, custom_logger.update) + + if args.create_dut_job: + process_key_value_pairs(args.create_dut_job, custom_logger.create_dut_job) + + if args.update_dut_job: + key, value = args.update_dut_job + custom_logger.update_dut_job(key, value) + + if args.create_job_phase: + custom_logger.create_job_phase(args.create_job_phase) + + if args.update_status_fail: + custom_logger.update_status_fail(args.update_status_fail) + + if args.update_dut_time: + if len(args.update_dut_time) == 2: + action, custom_time = args.update_dut_time + elif len(args.update_dut_time) == 1: + action, custom_time = args.update_dut_time[0], None + else: + raise ValueError("Invalid number of values for --update-dut-time") + + if action in ["start", "end", "submit"]: + custom_logger.update_dut_time(action, custom_time) + else: + raise ValueError( + "Error: Invalid argument provided for --update-dut-time. Use 'start', 'submit', 'end'." 
+ ) + + if args.close_dut_job: + custom_logger.close_dut_job() + + if args.close: + custom_logger.close() + + +def main(): + parser = argparse.ArgumentParser(description="Custom Logger Command Line Tool") + parser.add_argument("log_file", help="Path to the log file") + parser.add_argument( + "--update", + nargs=argparse.ZERO_OR_MORE, + metavar=("key", "value"), + help="Update a key-value pair e.g., --update key1 value1 key2 value2)", + ) + parser.add_argument( + "--create-dut-job", + nargs=argparse.ZERO_OR_MORE, + metavar=("key", "value"), + help="Create a new DUT job with key-value pairs (e.g., --create-dut-job key1 value1 key2 value2)", + ) + parser.add_argument( + "--update-dut-job", + nargs=argparse.ZERO_OR_MORE, + metavar=("key", "value"), + help="Update a key-value pair in DUT job", + ) + parser.add_argument( + "--create-job-phase", + help="Create a new job phase (e.g., --create-job-phase name)", + ) + parser.add_argument( + "--update-status-fail", + help="Update fail as the status and log the failure reason (e.g., --update-status-fail reason)", + ) + parser.add_argument( + "--update-dut-time", + nargs=argparse.ZERO_OR_MORE, + metavar=("action", "custom_time"), + help="Update DUT start and end time. Provide action ('start', 'submit', 'end') and custom_time (e.g., '2023-01-01T12:00:00')", + ) + parser.add_argument( + "--close-dut-job", + action="store_true", + help="Close the dut job by updating end time of last dut job)", + ) + parser.add_argument( + "--close", + action="store_true", + help="Updates combined status, submitter's end time and DUT attempt counter", + ) + args = parser.parse_args() + + process_args(args) + + +if __name__ == "__main__": + main() diff --git a/bin/ci/test/requirements.txt b/bin/ci/test/requirements.txt new file mode 100644 index 00000000000..f80621af285 --- /dev/null +++ b/bin/ci/test/requirements.txt @@ -0,0 +1,5 @@ +filelock==3.12.4 +fire==0.5.0 +mock==5.1.0 +polars==0.19.3 +pytest==7.4.2 diff --git a/bin/ci/test/test_custom_logger.py b/bin/ci/test/test_custom_logger.py new file mode 100644 index 00000000000..98ad9c00494 --- /dev/null +++ b/bin/ci/test/test_custom_logger.py @@ -0,0 +1,669 @@ +import logging +import subprocess +from datetime import datetime + +import pytest +from custom_logger import CustomLogger + + +@pytest.fixture +def tmp_log_file(tmp_path): + return tmp_path / "test_log.json" + + +@pytest.fixture +def custom_logger(tmp_log_file): + return CustomLogger(tmp_log_file) + + +def run_script_with_args(args): + import custom_logger + + script_path = custom_logger.__file__ + return subprocess.run( + ["python3", str(script_path), *args], capture_output=True, text=True + ) + + +# Test case for missing log file +@pytest.mark.parametrize( + "key, value", [("dut_attempt_counter", "1"), ("job_combined_status", "pass")] +) +def test_missing_log_file_argument(key, value): + result = run_script_with_args(["--update", "key", "value"]) + assert result.returncode != 0 + + +# Parametrize test case for valid update arguments +@pytest.mark.parametrize( + "key, value", [("dut_attempt_counter", "1"), ("job_combined_status", "pass")] +) +def test_update_argument_valid(custom_logger, tmp_log_file, key, value): + result = run_script_with_args([str(tmp_log_file), "--update", key, value]) + assert result.returncode == 0 + + +# Test case for passing only the key without a value +def test_update_argument_key_only(custom_logger, tmp_log_file): + key = "dut_attempt_counter" + result = run_script_with_args([str(tmp_log_file), "--update", key]) + assert result.returncode 
!= 0 + + +# Test case for not passing any key-value pair +def test_update_argument_no_values(custom_logger, tmp_log_file): + result = run_script_with_args([str(tmp_log_file), "--update"]) + assert result.returncode == 0 + + +# Parametrize test case for valid arguments +@pytest.mark.parametrize( + "key, value", [("dut_attempt_counter", "1"), ("job_combined_status", "pass")] +) +def test_create_argument_valid(custom_logger, tmp_log_file, key, value): + result = run_script_with_args([str(tmp_log_file), "--create-dut-job", key, value]) + assert result.returncode == 0 + + +# Test case for passing only the key without a value +def test_create_argument_key_only(custom_logger, tmp_log_file): + key = "dut_attempt_counter" + result = run_script_with_args([str(tmp_log_file), "--create-dut-job", key]) + assert result.returncode != 0 + + +# Test case for not passing any key-value pair +def test_create_argument_no_values(custom_logger, tmp_log_file): + result = run_script_with_args([str(tmp_log_file), "--create-dut-job"]) + assert result.returncode == 0 + + +# Test case for updating a DUT job +@pytest.mark.parametrize( + "key, value", [("status", "hung"), ("dut_state", "Canceling"), ("dut_name", "asus")] +) +def test_update_dut_job(custom_logger, tmp_log_file, key, value): + result = run_script_with_args([str(tmp_log_file), "--update-dut-job", key, value]) + assert result.returncode != 0 + + result = run_script_with_args([str(tmp_log_file), "--create-dut-job", key, value]) + assert result.returncode == 0 + + result = run_script_with_args([str(tmp_log_file), "--update-dut-job", key, value]) + assert result.returncode == 0 + + +# Test case for updating last DUT job +def test_update_dut_multiple_job(custom_logger, tmp_log_file): + # Create the first DUT job with the first key + result = run_script_with_args( + [str(tmp_log_file), "--create-dut-job", "status", "hung"] + ) + assert result.returncode == 0 + + # Create the second DUT job with the second key + result = run_script_with_args( + [str(tmp_log_file), "--create-dut-job", "dut_state", "Canceling"] + ) + assert result.returncode == 0 + + result = run_script_with_args( + [str(tmp_log_file), "--update-dut-job", "dut_name", "asus"] + ) + assert result.returncode == 0 + + +# Parametrize test case for valid phase arguments +@pytest.mark.parametrize( + "phase_name", + [("Phase1"), ("Phase2"), ("Phase3")], +) +def test_create_job_phase_valid(custom_logger, tmp_log_file, phase_name): + custom_logger.create_dut_job(status="pass") + + result = run_script_with_args([str(tmp_log_file), "--create-job-phase", phase_name]) + assert result.returncode == 0 + + +# Test case for not passing any arguments for create-job-phase +def test_create_job_phase_no_arguments(custom_logger, tmp_log_file): + custom_logger.create_dut_job(status="pass") + + result = run_script_with_args([str(tmp_log_file), "--create-job-phase"]) + assert result.returncode != 0 + + +# Test case for trying to create a phase job without an existing DUT job +def test_create_job_phase_no_dut_job(custom_logger, tmp_log_file): + phase_name = "Phase1" + + result = run_script_with_args([str(tmp_log_file), "--create-job-phase", phase_name]) + assert result.returncode != 0 + + +# Combined test cases for valid scenarios +def test_valid_scenarios(custom_logger, tmp_log_file): + valid_update_args = [("dut_attempt_counter", "1"), ("job_combined_status", "pass")] + for key, value in valid_update_args: + result = run_script_with_args([str(tmp_log_file), "--update", key, value]) + assert result.returncode == 0 + + 
valid_create_args = [ + ("status", "hung"), + ("dut_state", "Canceling"), + ("dut_name", "asus"), + ("phase_name", "Bootloader"), + ] + for key, value in valid_create_args: + result = run_script_with_args( + [str(tmp_log_file), "--create-dut-job", key, value] + ) + assert result.returncode == 0 + + result = run_script_with_args( + [str(tmp_log_file), "--create-dut-job", "status", "hung"] + ) + assert result.returncode == 0 + + result = run_script_with_args( + [str(tmp_log_file), "--update-dut-job", "dut_name", "asus"] + ) + assert result.returncode == 0 + + result = run_script_with_args( + [ + str(tmp_log_file), + "--create-job-phase", + "phase_name", + ] + ) + assert result.returncode == 0 + + +# Parametrize test case for valid update arguments +@pytest.mark.parametrize( + "key, value", [("dut_attempt_counter", "1"), ("job_combined_status", "pass")] +) +def test_update(custom_logger, key, value): + custom_logger.update(**{key: value}) + logger_data = custom_logger.logger.data + + assert key in logger_data + assert logger_data[key] == value + + +# Test case for updating with a key that already exists +def test_update_existing_key(custom_logger): + key = "status" + value = "new_value" + custom_logger.logger.data[key] = "old_value" + custom_logger.update(**{key: value}) + logger_data = custom_logger.logger.data + + assert key in logger_data + assert logger_data[key] == value + + +# Test case for updating "dut_jobs" +def test_update_dut_jobs(custom_logger): + key1 = "status" + value1 = "fail" + key2 = "state" + value2 = "hung" + + custom_logger.create_dut_job(**{key1: value1}) + logger_data = custom_logger.logger.data + + job1 = logger_data["dut_jobs"][0] + assert key1 in job1 + assert job1[key1] == value1 + + custom_logger.update_dut_job(key2, value2) + logger_data = custom_logger.logger.data + + job2 = logger_data["dut_jobs"][0] + assert key2 in job2 + assert job2[key2] == value2 + + +# Test case for creating and updating DUT job +def test_create_dut_job(custom_logger): + key = "status" + value1 = "pass" + value2 = "fail" + value3 = "hung" + + reason = "job_combined_status" + result = "Finished" + + custom_logger.update(**{reason: result}) + logger_data = custom_logger.logger.data + + assert reason in logger_data + assert logger_data[reason] == result + + # Create the first DUT job + custom_logger.create_dut_job(**{key: value1}) + logger_data = custom_logger.logger.data + + assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 1 + assert isinstance(logger_data["dut_jobs"][0], dict) + + # Check the values of the keys in the created first DUT job + job1 = logger_data["dut_jobs"][0] + assert key in job1 + assert job1[key] == value1 + + # Create the second DUT job + custom_logger.create_dut_job(**{key: value2}) + logger_data = custom_logger.logger.data + + assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 2 + assert isinstance(logger_data["dut_jobs"][1], dict) + + # Check the values of the keys in the created second DUT job + job2 = logger_data["dut_jobs"][1] + assert key in job2 + assert job2[key] == value2 + + # Update the second DUT job with value3 + custom_logger.update_dut_job(key, value3) + logger_data = custom_logger.logger.data + + # Check the updated value in the second DUT job + job2 = logger_data["dut_jobs"][1] + assert key in job2 + assert job2[key] == value3 + + # Find the index of the last DUT job + last_job_index = 
len(logger_data["dut_jobs"]) - 1 + + # Update the last DUT job + custom_logger.update_dut_job("dut_name", "asus") + logger_data = custom_logger.logger.data + + # Check the updated value in the last DUT job + job2 = logger_data["dut_jobs"][last_job_index] + assert "dut_name" in job2 + assert job2["dut_name"] == "asus" + + # Check that "dut_name" is not present in other DUT jobs + for idx, job in enumerate(logger_data["dut_jobs"]): + if idx != last_job_index: + assert job.get("dut_name") == "" + + +# Test case for updating with missing "dut_jobs" key +def test_update_dut_job_missing_dut_jobs(custom_logger): + key = "status" + value = "fail" + + # Attempt to update a DUT job when "dut_jobs" is missing + with pytest.raises(ValueError, match="No DUT jobs found."): + custom_logger.update_dut_job(key, value) + + +# Test case for creating a job phase +def test_create_job_phase(custom_logger): + custom_logger.create_dut_job(status="pass") + phase_name = "Phase1" + + custom_logger.create_job_phase(phase_name) + logger_data = custom_logger.logger.data + + assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 1 + + job = logger_data["dut_jobs"][0] + assert "dut_job_phases" in job + assert isinstance(job["dut_job_phases"], list) + assert len(job["dut_job_phases"]) == 1 + + phase = job["dut_job_phases"][0] + assert phase["name"] == phase_name + try: + datetime.fromisoformat(phase["start_time"]) + assert True + except ValueError: + assert False + assert phase["end_time"] == "" + + +# Test case for creating multiple phase jobs +def test_create_multiple_phase_jobs(custom_logger): + custom_logger.create_dut_job(status="pass") + + phase_data = [ + { + "phase_name": "Phase1", + }, + { + "phase_name": "Phase2", + }, + { + "phase_name": "Phase3", + }, + ] + + for data in phase_data: + phase_name = data["phase_name"] + + custom_logger.create_job_phase(phase_name) + + logger_data = custom_logger.logger.data + + assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 1 + + job = logger_data["dut_jobs"][0] + assert "dut_job_phases" in job + assert isinstance(job["dut_job_phases"], list) + assert len(job["dut_job_phases"]) == len(phase_data) + + for data in phase_data: + phase_name = data["phase_name"] + + phase = job["dut_job_phases"][phase_data.index(data)] + + assert phase["name"] == phase_name + try: + datetime.fromisoformat(phase["start_time"]) + assert True + except ValueError: + assert False + + if phase_data.index(data) != len(phase_data) - 1: + try: + datetime.fromisoformat(phase["end_time"]) + assert True + except ValueError: + assert False + + # Check if the end_time of the last phase is an empty string + last_phase = job["dut_job_phases"][-1] + assert last_phase["end_time"] == "" + + +# Test case for creating multiple dut jobs and updating phase job for last dut job +def test_create_two_dut_jobs_and_add_phase(custom_logger): + # Create the first DUT job + custom_logger.create_dut_job(status="pass") + + # Create the second DUT job + custom_logger.create_dut_job(status="fail") + + logger_data = custom_logger.logger.data + + assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 2 + + first_dut_job = logger_data["dut_jobs"][0] + second_dut_job = logger_data["dut_jobs"][1] + + # Add a phase to the second DUT job + custom_logger.create_job_phase("Phase1") + + logger_data = custom_logger.logger.data + + 
assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 2 + + first_dut_job = logger_data["dut_jobs"][0] + second_dut_job = logger_data["dut_jobs"][1] + + # Check first DUT job does not have a phase + assert not first_dut_job.get("dut_job_phases") + + # Check second DUT job has a phase + assert second_dut_job.get("dut_job_phases") + assert isinstance(second_dut_job["dut_job_phases"], list) + assert len(second_dut_job["dut_job_phases"]) == 1 + + +# Test case for updating DUT start time +def test_update_dut_start_time(custom_logger): + custom_logger.create_dut_job(status="pass") + custom_logger.update_dut_time("start", None) + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "dut_start_time" in dut_job + assert dut_job["dut_start_time"] != "" + + try: + datetime.fromisoformat(dut_job["dut_start_time"]) + assert True + except ValueError: + assert False + + +# Test case for updating DUT submit time +def test_update_dut_submit_time(custom_logger): + custom_time = "2023-11-09T02:37:06Z" + custom_logger.create_dut_job(status="pass") + custom_logger.update_dut_time("submit", custom_time) + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "dut_submit_time" in dut_job + + try: + datetime.fromisoformat(dut_job["dut_submit_time"]) + assert True + except ValueError: + assert False + + +# Test case for updating DUT end time +def test_update_dut_end_time(custom_logger): + custom_logger.create_dut_job(status="pass") + custom_logger.update_dut_time("end", None) + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "dut_end_time" in dut_job + + try: + datetime.fromisoformat(dut_job["dut_end_time"]) + assert True + except ValueError: + assert False + + +# Test case for updating DUT time with invalid value +def test_update_dut_time_invalid_value(custom_logger): + custom_logger.create_dut_job(status="pass") + with pytest.raises( + ValueError, + match="Error: Invalid argument provided for --update-dut-time. 
Use 'start', 'submit', 'end'.", + ): + custom_logger.update_dut_time("invalid_value", None) + + +# Test case for close_dut_job +def test_close_dut_job(custom_logger): + custom_logger.create_dut_job(status="pass") + + custom_logger.create_job_phase("Phase1") + custom_logger.create_job_phase("Phase2") + + custom_logger.close_dut_job() + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "dut_job_phases" in dut_job + dut_job_phases = dut_job["dut_job_phases"] + + phase1 = dut_job_phases[0] + assert phase1["name"] == "Phase1" + + try: + datetime.fromisoformat(phase1["start_time"]) + assert True + except ValueError: + assert False + + try: + datetime.fromisoformat(phase1["end_time"]) + assert True + except ValueError: + assert False + + phase2 = dut_job_phases[1] + assert phase2["name"] == "Phase2" + + try: + datetime.fromisoformat(phase2["start_time"]) + assert True + except ValueError: + assert False + + try: + datetime.fromisoformat(phase2["end_time"]) + assert True + except ValueError: + assert False + + +# Test case for close +def test_close(custom_logger): + custom_logger.create_dut_job(status="pass") + + custom_logger.close() + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + assert "dut_attempt_counter" in logger_data + assert logger_data["dut_attempt_counter"] == len(logger_data["dut_jobs"]) + assert "job_combined_status" in logger_data + assert logger_data["job_combined_status"] != "" + + dut_job = logger_data["dut_jobs"][0] + assert "submitter_end_time" in dut_job + try: + datetime.fromisoformat(dut_job["submitter_end_time"]) + assert True + except ValueError: + assert False + + +# Test case for updating status to fail with a reason +def test_update_status_fail_with_reason(custom_logger): + custom_logger.create_dut_job() + + reason = "kernel panic" + custom_logger.update_status_fail(reason) + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "status" in dut_job + assert dut_job["status"] == "fail" + assert "dut_job_fail_reason" in dut_job + assert dut_job["dut_job_fail_reason"] == reason + + +# Test case for updating status to fail without providing a reason +def test_update_status_fail_without_reason(custom_logger): + custom_logger.create_dut_job() + + custom_logger.update_status_fail() + + # Check if the status is updated and fail reason is empty + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "status" in dut_job + assert dut_job["status"] == "fail" + assert "dut_job_fail_reason" in dut_job + assert dut_job["dut_job_fail_reason"] == "" + + +# Test case for check_dut_timings with submission time earlier than start time +def test_check_dut_timings_submission_earlier_than_start(custom_logger, caplog): + custom_logger.create_dut_job() + + # Set submission time to be earlier than start time + custom_logger.update_dut_time("start", "2023-01-01T11:00:00") + custom_logger.update_dut_time("submit", "2023-01-01T12:00:00") + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + job = logger_data["dut_jobs"][0] + + # Call check_dut_timings + custom_logger.check_dut_timings(job) + + # Check if an 
error message is logged + assert "Job submission is happening before job start." in caplog.text + + +# Test case for check_dut_timings with end time earlier than start time +def test_check_dut_timings_end_earlier_than_start(custom_logger, caplog): + custom_logger.create_dut_job() + + # Set end time to be earlier than start time + custom_logger.update_dut_time("end", "2023-01-01T11:00:00") + custom_logger.update_dut_time("start", "2023-01-01T12:00:00") + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + job = logger_data["dut_jobs"][0] + + # Call check_dut_timings + custom_logger.check_dut_timings(job) + + # Check if an error message is logged + assert "Job ended before it started." in caplog.text + + +# Test case for check_dut_timings with valid timing sequence +def test_check_dut_timings_valid_timing_sequence(custom_logger, caplog): + custom_logger.create_dut_job() + + # Set valid timing sequence + custom_logger.update_dut_time("submit", "2023-01-01T12:00:00") + custom_logger.update_dut_time("start", "2023-01-01T12:30:00") + custom_logger.update_dut_time("end", "2023-01-01T13:00:00") + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + job = logger_data["dut_jobs"][0] + + # Call check_dut_timings + custom_logger.check_dut_timings(job) + + # Check that no error messages are logged + assert "Job submission is happening before job start." not in caplog.text + assert "Job ended before it started." not in caplog.text diff --git a/docs/features.txt.rej b/docs/features.txt.rej new file mode 100644 index 00000000000..cb296b346d3 --- /dev/null +++ b/docs/features.txt.rej @@ -0,0 +1,10 @@ +diff a/docs/features.txt b/docs/features.txt (rejected hunks) +@@ -213,7 +213,7 @@ GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, i965/gen8+, nvc0, r600, radeonsi, + GL 4.5, GLSL 4.50 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, zink + + GL_ARB_ES3_1_compatibility DONE (freedreno/a6xx, i965/hsw+, softpipe, virgl) +- GL_ARB_clip_control DONE (freedreno, i965, nv50, softpipe, virgl, lima, d3d12) ++ GL_ARB_clip_control DONE (freedreno, i965, nv50, softpipe, virgl, lima, d3d12, panfrost) + GL_ARB_conditional_render_inverted DONE (freedreno, i965, nv50, softpipe, virgl, panfrost, d3d12) + GL_ARB_cull_distance DONE (freedreno/a6xx, i965, nv50, softpipe, virgl) + GL_ARB_derivative_control DONE (freedreno/a3xx+, i965, nv50, softpipe, virgl) diff --git a/include/dma-uapi/dma-buf.h b/include/dma-uapi/dma-buf.h new file mode 100644 index 00000000000..5a6fda66d9a --- /dev/null +++ b/include/dma-uapi/dma-buf.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Framework for buffer objects that can be shared across devices/subsystems. + * + * Copyright(C) 2015 Intel Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef _DMA_BUF_UAPI_H_ +#define _DMA_BUF_UAPI_H_ + +#include + +/** + * struct dma_buf_sync - Synchronize with CPU access. + * + * When a DMA buffer is accessed from the CPU via mmap, it is not always + * possible to guarantee coherency between the CPU-visible map and underlying + * memory. To manage coherency, DMA_BUF_IOCTL_SYNC must be used to bracket + * any CPU access to give the kernel the chance to shuffle memory around if + * needed. + * + * Prior to accessing the map, the client must call DMA_BUF_IOCTL_SYNC + * with DMA_BUF_SYNC_START and the appropriate read/write flags. Once the + * access is complete, the client should call DMA_BUF_IOCTL_SYNC with + * DMA_BUF_SYNC_END and the same read/write flags. + * + * The synchronization provided via DMA_BUF_IOCTL_SYNC only provides cache + * coherency. It does not prevent other processes or devices from + * accessing the memory at the same time. If synchronization with a GPU or + * other device driver is required, it is the client's responsibility to + * wait for buffer to be ready for reading or writing before calling this + * ioctl with DMA_BUF_SYNC_START. Likewise, the client must ensure that + * follow-up work is not submitted to GPU or other device driver until + * after this ioctl has been called with DMA_BUF_SYNC_END? + * + * If the driver or API with which the client is interacting uses implicit + * synchronization, waiting for prior work to complete can be done via + * poll() on the DMA buffer file descriptor. If the driver or API requires + * explicit synchronization, the client may have to wait on a sync_file or + * other synchronization primitive outside the scope of the DMA buffer API. + */ +struct dma_buf_sync { + /** + * @flags: Set of access flags + * + * DMA_BUF_SYNC_START: + * Indicates the start of a map access session. + * + * DMA_BUF_SYNC_END: + * Indicates the end of a map access session. + * + * DMA_BUF_SYNC_READ: + * Indicates that the mapped DMA buffer will be read by the + * client via the CPU map. + * + * DMA_BUF_SYNC_WRITE: + * Indicates that the mapped DMA buffer will be written by the + * client via the CPU map. + * + * DMA_BUF_SYNC_RW: + * An alias for DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE. + */ + __u64 flags; +}; + +#define DMA_BUF_SYNC_READ (1 << 0) +#define DMA_BUF_SYNC_WRITE (2 << 0) +#define DMA_BUF_SYNC_RW (DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE) +#define DMA_BUF_SYNC_START (0 << 2) +#define DMA_BUF_SYNC_END (1 << 2) +#define DMA_BUF_SYNC_VALID_FLAGS_MASK \ + (DMA_BUF_SYNC_RW | DMA_BUF_SYNC_END) + +#define DMA_BUF_NAME_LEN 32 + +/** + * struct dma_buf_export_sync_file - Get a sync_file from a dma-buf + * + * Userspace can perform a DMA_BUF_IOCTL_EXPORT_SYNC_FILE to retrieve the + * current set of fences on a dma-buf file descriptor as a sync_file. CPU + * waits via poll() or other driver-specific mechanisms typically wait on + * whatever fences are on the dma-buf at the time the wait begins. This + * is similar except that it takes a snapshot of the current fences on the + * dma-buf for waiting later instead of waiting immediately. This is + * useful for modern graphics APIs such as Vulkan which assume an explicit + * synchronization model but still need to inter-operate with dma-buf. + * + * The intended usage pattern is the following: + * + * 1. Export a sync_file with flags corresponding to the expected GPU usage + * via DMA_BUF_IOCTL_EXPORT_SYNC_FILE. + * + * 2. Submit rendering work which uses the dma-buf. 
The work should wait on + * the exported sync file before rendering and produce another sync_file + * when complete. + * + * 3. Import the rendering-complete sync_file into the dma-buf with flags + * corresponding to the GPU usage via DMA_BUF_IOCTL_IMPORT_SYNC_FILE. + * + * Unlike doing implicit synchronization via a GPU kernel driver's exec ioctl, + * the above is not a single atomic operation. If userspace wants to ensure + * ordering via these fences, it is the respnosibility of userspace to use + * locks or other mechanisms to ensure that no other context adds fences or + * submits work between steps 1 and 3 above. + */ +struct dma_buf_export_sync_file { + /** + * @flags: Read/write flags + * + * Must be DMA_BUF_SYNC_READ, DMA_BUF_SYNC_WRITE, or both. + * + * If DMA_BUF_SYNC_READ is set and DMA_BUF_SYNC_WRITE is not set, + * the returned sync file waits on any writers of the dma-buf to + * complete. Waiting on the returned sync file is equivalent to + * poll() with POLLIN. + * + * If DMA_BUF_SYNC_WRITE is set, the returned sync file waits on + * any users of the dma-buf (read or write) to complete. Waiting + * on the returned sync file is equivalent to poll() with POLLOUT. + * If both DMA_BUF_SYNC_WRITE and DMA_BUF_SYNC_READ are set, this + * is equivalent to just DMA_BUF_SYNC_WRITE. + */ + __u32 flags; + /** @fd: Returned sync file descriptor */ + __s32 fd; +}; + +/** + * struct dma_buf_import_sync_file - Insert a sync_file into a dma-buf + * + * Userspace can perform a DMA_BUF_IOCTL_IMPORT_SYNC_FILE to insert a + * sync_file into a dma-buf for the purposes of implicit synchronization + * with other dma-buf consumers. This allows clients using explicitly + * synchronized APIs such as Vulkan to inter-op with dma-buf consumers + * which expect implicit synchronization such as OpenGL or most media + * drivers/video. + */ +struct dma_buf_import_sync_file { + /** + * @flags: Read/write flags + * + * Must be DMA_BUF_SYNC_READ, DMA_BUF_SYNC_WRITE, or both. + * + * If DMA_BUF_SYNC_READ is set and DMA_BUF_SYNC_WRITE is not set, + * this inserts the sync_file as a read-only fence. Any subsequent + * implicitly synchronized writes to this dma-buf will wait on this + * fence but reads will not. + * + * If DMA_BUF_SYNC_WRITE is set, this inserts the sync_file as a + * write fence. All subsequent implicitly synchronized access to + * this dma-buf will wait on this fence. + */ + __u32 flags; + /** @fd: Sync file descriptor */ + __s32 fd; +}; + +#define DMA_BUF_BASE 'b' +#define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync) + +/* 32/64bitness of this uapi was botched in android, there's no difference + * between them in actual uapi, they're just different numbers. + */ +#define DMA_BUF_SET_NAME _IOW(DMA_BUF_BASE, 1, const char *) +#define DMA_BUF_SET_NAME_A _IOW(DMA_BUF_BASE, 1, __u32) +#define DMA_BUF_SET_NAME_B _IOW(DMA_BUF_BASE, 1, __u64) +#define DMA_BUF_IOCTL_EXPORT_SYNC_FILE _IOWR(DMA_BUF_BASE, 2, struct dma_buf_export_sync_file) +#define DMA_BUF_IOCTL_IMPORT_SYNC_FILE _IOW(DMA_BUF_BASE, 3, struct dma_buf_import_sync_file) + +#endif diff --git a/include/drm-uapi/drm_fourcc.h b/include/drm-uapi/drm_fourcc.h index 6b6235f7a7c..30343c7c9c3 100644 --- a/include/drm-uapi/drm_fourcc.h +++ b/include/drm-uapi/drm_fourcc.h @@ -1219,6 +1219,13 @@ drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) */ #define AFBC_FORMAT_MOD_USM (1ULL << 12) +/* AFBC native swizzle + * + * Indicates that the buffer is using RGBA component order regardless of the + * actual format. 
+ */ +#define AFBC_FORMAT_MOD_NATIVE_SWIZZLE (1ULL << 32) + /* * Arm Fixed-Rate Compression (AFRC) modifiers * diff --git a/meson.build.rej b/meson.build.rej new file mode 100644 index 00000000000..322ff95a368 --- /dev/null +++ b/meson.build.rej @@ -0,0 +1,18 @@ +diff a/meson.build b/meson.build (rejected hunks) +@@ -865,14 +865,13 @@ endif + + with_gallium_st_nine = get_option('gallium-nine') + if with_gallium_st_nine +- if not with_gallium_softpipe +- error('The nine state tracker requires gallium softpipe/llvmpipe.') +- elif not [ ++ if not [ + with_gallium_crocus, + with_gallium_freedreno, + with_gallium_i915, + with_gallium_iris, + with_gallium_nouveau, ++ with_gallium_panfrost, + with_gallium_r300, + with_gallium_r600, + with_gallium_radeonsi, diff --git a/patch.diff b/patch.diff new file mode 100644 index 00000000000..ad36794a25b --- /dev/null +++ b/patch.diff @@ -0,0 +1,25515 @@ +diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml +new file mode 100644 +index 00000000000..37fcc4b92b1 +--- /dev/null ++++ b/.github/workflows/android.yml +@@ -0,0 +1,67 @@ ++name: Build Android ++ ++on: ++ [push, pull_request] ++ ++# A workflow run is made up of one or more jobs that can run sequentially or in parallel ++jobs: ++ build: ++ strategy: ++ matrix: ++ arch: [ "arm32", "aarch64" ] ++ fail-fast: false ++ ++ name: "Build for ${{matrix.arch}}" ++ ++ runs-on: ubuntu-latest ++ ++ steps: ++ ++ - uses: actions/checkout@v2 ++ ++ - name: Build ++ run: | ++ sudo apt update ++ sudo apt install -y meson libxrandr-dev libxxf86vm-dev libxcb-*-dev libx11-xcb-dev libxfixes-dev libdrm-dev libx11-dev ++ pip3 install mako ++ export ANDROID_NDK_HOME="$ANDROID_SDK_ROOT/ndk-bundle" ++ envsubst build-crossfile-drm ++ git clone --depth 1 https://gitlab.freedesktop.org/mesa/drm.git ++ cd drm ++ meson setup "build-android" \ ++ --prefix=/tmp/drm-static \ ++ --cross-file "../build-crossfile-drm" \ ++ -Ddefault_library=static \ ++ -Dintel=disabled \ ++ -Dradeon=disabled \ ++ -Damdgpu=disabled \ ++ -Dnouveau=disabled \ ++ -Dvmwgfx=disabled \ ++ -Dfreedreno=disabled \ ++ -Dvc4=disabled \ ++ -Detnaviv=disabled ++ ninja -C "build-android" install ++ cd .. ++ envsubst build-crossfile ++ meson setup "build-android" \ ++ --prefix=/tmp/pan \ ++ --cross-file "build-crossfile" \ ++ -Dplatforms=android \ ++ -Dplatform-sdk-version=26 \ ++ -Dandroid-stub=true \ ++ -Dllvm=disabled \ ++ -Dxlib-lease=disabled \ ++ -Degl=disabled \ ++ -Dgbm=disabled \ ++ -Dglx=disabled \ ++ -Dopengl=true \ ++ -Dosmesa=true \ ++ -Dvulkan-drivers= \ ++ -Dgallium-drivers=swrast,panfrost \ ++ -Dshared-glapi=false ++ ninja -C "build-android" install ++ - name: Upload libraries ++ uses: actions/upload-artifact@v2 ++ with: ++ name: panfrost_${{matrix.arch}} ++ path: /tmp/pan +diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml +deleted file mode 100644 +index d1b66ef4cad..00000000000 +--- a/.github/workflows/macos.yml ++++ /dev/null +@@ -1,60 +0,0 @@ +-name: macOS-CI +-on: push +- +-permissions: +- contents: read +- +-jobs: +- macOS-CI: +- strategy: +- matrix: +- glx_option: ['dri', 'xlib'] +- runs-on: macos-11 +- env: +- GALLIUM_DUMP_CPU: true +- MESON_EXEC: /Users/runner/Library/Python/3.11/bin/meson +- steps: +- - name: Checkout +- uses: actions/checkout@v3 +- - name: Install Dependencies +- run: | +- cat > Brewfile < native_config <`_, ++of which the newer releases should work well enough with a mainline ++kernel (though some work may be needed to integrate the vendor ++platform). 
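++
++A quick sanity check that kbase is usable is to open the device node
++directly, with the same open flags that this branch's kbase code uses.
++The program below is only an illustrative sketch (the node path is the
++default ``/dev/mali0``):
++
++.. code-block:: c
++
++   /* mali0-check.c: confirm that the kbase device node can be opened. */
++   #include <errno.h>
++   #include <fcntl.h>
++   #include <stdio.h>
++   #include <string.h>
++   #include <unistd.h>
++
++   int main(void)
++   {
++      /* Same flags used when the driver opens the kbase device. */
++      int fd = open("/dev/mali0", O_RDWR | O_CLOEXEC | O_NONBLOCK);
++
++      if (fd == -1) {
++         fprintf(stderr, "open /dev/mali0: %s\n", strerror(errno));
++         return 1;
++      }
++
++      printf("kbase device node is accessible\n");
++      close(fd);
++      return 0;
++   }
++
++If the open fails with a permission error, fix the permissions on
++``/dev/mali0`` before filing an issue.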
++ ++Making sure that the ``libmali`` blob drivers work before trying this ++program is recommended, otherwise you will be trying to debug ++userspace and kernel bugs at the same time. ++ ++Note that firmware is required for these GPUs, for RK3588 try ++downloading the file from the Rockchip `libmali ++`_ ++repo, and placing it in ``/lib/firmware/``. ++ ++Compiling ++--------- ++ ++.. code-block:: sh ++ ++ $ mkdir build ++ $ cd build ++ $ meson --buildtype=debug -Dgallium-drivers=panfrost -Dvulkan-drivers= ++ $ ninja src/panfrost/csf_test ++ ++Running ++------- ++ ++.. code-block:: sh ++ ++ $ src/panfrost/csf_test ++ ++will run the tests. ++ ++Normally it will start running cleanup steps as soon as one test ++fails, though setting the environment variable ``TEST_KEEP_GOING=1`` ++will change this behaviour. ++ ++Test failures ++------------- ++ ++Gitlab issues can be created against `my repo ++`_, though ++some problems should be easy to fix (wrong permissions on ++``/dev/mali0``?). ++ ++Include all output from running the test program. Including logs from ++``strace`` might also help. ++ ++Command stream test script ++-------------------------- ++ ++``src/panfrost/csf_test/interpret.py`` is a test script for assembling ++and executing command streams. ++ ++To use it, symlink the ``csf_test`` binary into ``$PATH`` and optionally ++also write a ``rebuild-mesa`` script which recompiles ``csf_test``. ++ ++Then running ``interpret.py`` will execute the ``cmds`` variable, ++which is defined inside the script file. ++ ++Example: ++ ++.. code-block:: txt ++ ++ @ comments are started with '@' ++ ++ @ run on command stream 2 ++ !cs 2 ++ @ allocate some memory ++ !alloc x 4096 ++ @ allocate event memory, for evstr instructions ++ !alloc ev 4096 0x8200f ++ ++ mov x50, $x ++ ++ @ dump all registers to the memory starting at x50 ++ regdump x50 ++ ++ @ dump the memory region named 'x' ++ !dump x 0 4096 +diff --git a/README.rst b/README.rst +index b35246e034c..fd140a96013 100644 +--- a/README.rst ++++ b/README.rst +@@ -1,59 +1,136 @@ + `Mesa `_ - The 3D Graphics Library + ====================================================== + ++Valhall v10 "CSF" support branch—for Mali G710/G610. ++ ++Note that firmware is required for these GPUs, for RK3588 try ++downloading the file from the Rockchip `libmali ++`_ ++repo, and placing it in ``/lib/firmware/``. ++ ++Windowing system support ++------------------------ ++ ++Panfrost Wayland compositor (wlroots): ++ ++#. Panfrost Wayland clients ++#. Panfrost X11 clients via Xwayland [1]_ ++#. Blob X11 clients via Xwayland + dri2to3 [2]_ ++ ++Panfrost Wayland compositor (non-wlroots): ++ ++#. Panfrost Wayland clients ++#. Panfrost X11 clients via Xwayland ++#. Blob Wayland clients ++#. Blob X11 clients via Xwayland + dri2to3 [2]_ ++ ++Blob Wayland compositor: ++ ++#. Panfrost Wayland clients ++#. Blob Wayland clients ++ ++Panfrost Xorg server: [3]_ ++ ++#. Panfrost X11 clients ++#. Blob X11 clients ++ ++Blob Xorg server: ++ ++#. Panfrost X11 clients ++#. Blob X11 clients ++ ++Applications using KMS/DRM will also work. ++ ++.. [1] Requires ``CONFIG_DRM_IGNORE_IOTCL_PERMIT`` to be disabled in ++ the kernel configuration. The option is broken and should never ++ be enabled anyway. ++ ++.. [2] See https://gitlab.com/panfork/dri2to3 ++ ++.. [3] For Radxa Debian/Ubuntu, the ``xserver-xorg-core`` version ++ installed by default is not compatible with Panfrost. But note ++ that upstream Xorg does not work will the blob, so Mesa must be ++ installed so that it is used by default. 
(see the "Usage" ++ section below). To switch between the upstream and Rockchip ++ versions, run: ++ ++.. code-block:: sh ++ ++ $ sudo apt install xserver-xorg-core="$(apt-cache show xserver-xorg-core | grep Version | grep -v "$(dpkg -s xserver-xorg-core | grep Version)" | cut -d" " -f2)" ++ ++Broken combinations: ++ ++#. Panfrost wlroots + Blob Wayland does not work because wlroots does ++ not expose the ``mali_buffer_sharing`` protocol. This might be ++ fixable. ++#. Blob Wayland compositor + Panfrost X11 does not work because the ++ blob does not expose the required protocols for Xwayland ++ acceleration to work + + Source + ------ + +-This repository lives at https://gitlab.freedesktop.org/mesa/mesa. +-Other repositories are likely forks, and code found there is not supported. ++This repository lives at https://gitlab.com/panfork/mesa, and is a ++fork, so not supported by upstream. + ++Upstream source is at https://gitlab.freedesktop.org/mesa/mesa. + +-Build & install +---------------- ++Depdendencies ++------------- + +-You can find more information in our documentation (`docs/install.rst +-`_), but the recommended way is to use +-Meson (`docs/meson.rst `_): ++For Debian-based distributions: + + .. code-block:: sh + +- $ mkdir build +- $ cd build +- $ meson .. ++ $ sudo apt install build-essential meson git python3-mako libexpat1-dev bison flex libwayland-egl-backend-dev libxext-dev libxfixes-dev libxcb-glx0-dev libxcb-shm0-dev libxcb-dri2-0-dev libxcb-dri3-dev libxcb-present-dev libxshmfence-dev libxxf86vm-dev libxrandr-dev ++ ++Also needed is ``libdrm`` and ``wayland-protocols``, but those ++packages are too old in Debian Bullseye, and must be compiled from ++source: ++ ++.. code-block:: sh ++ ++ $ git clone https://gitlab.freedesktop.org/mesa/drm ++ $ mkdir drm/build ++ $ cd drm/build ++ $ meson ++ $ sudo ninja install ++ ++.. code-block:: sh ++ ++ $ git clone https://gitlab.freedesktop.org/wayland/wayland-protocols ++ $ mkdir wayland-protocols/build ++ $ cd wayland-protocols/build ++ $ git checkout 1.24 ++ $ meson + $ sudo ninja install + ++Build & install ++--------------- + +-Support +-------- ++To install to ``/opt/panfrost``: + +-Many Mesa devs hang on IRC; if you're not sure which channel is +-appropriate, you should ask your question on `OFTC's #dri-devel +-`_, someone will redirect you if +-necessary. +-Remember that not everyone is in the same timezone as you, so it might +-take a while before someone qualified sees your question. +-To figure out who you're talking to, or which nick to ping for your +-question, check out `Who's Who on IRC +-`_. ++.. code-block:: sh + +-The next best option is to ask your question in an email to the +-mailing lists: `mesa-dev\@lists.freedesktop.org +-`_ ++ $ mkdir build ++ $ cd build ++ $ meson -Dgallium-drivers=panfrost -Dvulkan-drivers= -Dllvm=disabled --prefix=/opt/panfrost ++ $ sudo ninja install + ++Usage ++----- + +-Bug reports +------------ ++To run an application with Panfrost (note the windowing system support ++section above): + +-If you think something isn't working properly, please file a bug report +-(`docs/bugs.rst `_). ++.. code-block:: sh + ++ $ LD_LIBRARY_PATH=/opt/panfrost/lib/aarch64-linux-gnu glmark2-es2-wayland + +-Contributing +------------- ++To use Panfrost by default, add the directory where you installed it ++to the library search path: + +-Contributions are welcome, and step-by-step instructions can be found in our +-documentation (`docs/submittingpatches.rst +-`_). ++.. 
code-block:: sh + +-Note that Mesa uses gitlab for patches submission, review and discussions. ++ $ echo /opt/panfrost/lib/aarch64-linux-gnu | sudo tee /etc/ld.so.conf.d/0-panfrost.conf ++ $ sudo ldconfig +diff --git a/android-aarch64 b/android-aarch64 +new file mode 100644 +index 00000000000..2737a2d01bd +--- /dev/null ++++ b/android-aarch64 +@@ -0,0 +1,26 @@ ++[binaries] ++ar = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-ar' ++c = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android26-clang', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC'] ++cpp = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android26-clang++', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC', '-fno-exceptions', '-fno-unwind-tables', '-fno-asynchronous-unwind-tables', '-static-libstdc++'] ++c_ld = 'lld' ++cpp_ld = 'lld' ++strip = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-strip' ++# Android doesn't come with a pkg-config, but we need one for meson to be happy not ++# finding all the optional deps it looks for. Use system pkg-config pointing at a ++# directory we get to populate with any .pc files we want to add for Android ++ ++# Also, include the plain DRM lib we found earlier. Panfrost relies on it rather heavily, especially when ++# interacting with the panfrost DRM module and not kbase ++ ++pkgconfig = ['env', 'PKG_CONFIG_LIBDIR=.:/tmp/drm-static/lib/pkgconfig', '/usr/bin/pkg-config'] ++ ++[host_machine] ++system = 'linux' ++# cpu_family = 'x86_64' ++# cpu = 'amd64' ++ ++# ik this is wrong but workaround sanity check ++cpu_family = 'arm' ++cpu = 'armv8' ++ ++endian = 'little' +diff --git a/android-arm32 b/android-arm32 +new file mode 100644 +index 00000000000..6bd6af4e902 +--- /dev/null ++++ b/android-arm32 +@@ -0,0 +1,26 @@ ++[binaries] ++ar = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-ar' ++c = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/armv7a-linux-androideabi26-clang', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC'] ++cpp = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/armv7a-linux-androideabi26-clang++', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC', '-fno-exceptions', '-fno-unwind-tables', '-fno-asynchronous-unwind-tables', '-static-libstdc++'] ++c_ld = 'lld' ++cpp_ld = 'lld' ++strip = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-strip' ++# Android doesn't come with a pkg-config, but we need one for meson to be happy not ++# finding all the optional deps it looks for. Use system pkg-config pointing at a ++# directory we get to populate with any .pc files we want to add for Android ++ ++# Also, include the plain DRM lib we found earlier. 
Panfrost relies on it rather heavily, especially when ++# interacting with the panfrost DRM module and not kbase ++ ++pkgconfig = ['env', 'PKG_CONFIG_LIBDIR=.:/tmp/drm-static/lib/pkgconfig', '/usr/bin/pkg-config'] ++ ++[host_machine] ++system = 'linux' ++# cpu_family = 'x86_64' ++# cpu = 'amd64' ++ ++# ik this is wrong but workaround sanity check ++cpu_family = 'arm' ++cpu = 'armv7' ++ ++endian = 'little' +diff --git a/android-drm-aarch64 b/android-drm-aarch64 +new file mode 100644 +index 00000000000..eb91f638435 +--- /dev/null ++++ b/android-drm-aarch64 +@@ -0,0 +1,22 @@ ++[binaries] ++ar = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-ar' ++c = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android24-clang', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC'] ++cpp = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android24-clang++', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC', '-fno-exceptions', '-fno-unwind-tables', '-fno-asynchronous-unwind-tables', '-static-libstdc++'] ++c_ld = 'lld' ++cpp_ld = 'lld' ++strip = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-strip' ++# Android doesn't come with a pkg-config, but we need one for meson to be happy not ++# finding all the optional deps it looks for. Use system pkg-config pointing at a ++# directory we get to populate with any .pc files we want to add for Android ++pkgconfig = ['env', 'PKG_CONFIG_LIBDIR=.', '/usr/bin/pkg-config'] ++ ++[host_machine] ++system = 'linux' ++# cpu_family = 'x86_64' ++# cpu = 'amd64' ++ ++# ik this is wrong but workaround sanity check ++cpu_family = 'arm' ++cpu = 'armv8' ++ ++endian = 'little' +diff --git a/android-drm-arm32 b/android-drm-arm32 +new file mode 100644 +index 00000000000..5fae96b7d1e +--- /dev/null ++++ b/android-drm-arm32 +@@ -0,0 +1,22 @@ ++[binaries] ++ar = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-ar' ++c = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/armv7a-linux-androideabi24-clang', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC'] ++cpp = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/armv7a-linux-androideabi24-clang++', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC', '-fno-exceptions', '-fno-unwind-tables', '-fno-asynchronous-unwind-tables', '-static-libstdc++'] ++c_ld = 'lld' ++cpp_ld = 'lld' ++strip = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-strip' ++# Android doesn't come with a pkg-config, but we need one for meson to be happy not ++# finding all the optional deps it looks for. 
Use system pkg-config pointing at a ++# directory we get to populate with any .pc files we want to add for Android ++pkgconfig = ['env', 'PKG_CONFIG_LIBDIR=.', '/usr/bin/pkg-config'] ++ ++[host_machine] ++system = 'linux' ++# cpu_family = 'x86_64' ++# cpu = 'amd64' ++ ++# ik this is wrong but workaround sanity check ++cpu_family = 'arm' ++cpu = 'armv7' ++ ++endian = 'little' +diff --git a/docs/features.txt b/docs/features.txt +index 40ebfd68028..d5233eb5010 100644 +--- a/docs/features.txt ++++ b/docs/features.txt +@@ -213,7 +213,7 @@ GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, i965/gen8+, nvc0, r600, radeonsi, + GL 4.5, GLSL 4.50 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, zink + + GL_ARB_ES3_1_compatibility DONE (freedreno/a6xx, i965/hsw+, softpipe, virgl) +- GL_ARB_clip_control DONE (freedreno, i965, nv50, softpipe, virgl, lima, d3d12) ++ GL_ARB_clip_control DONE (freedreno, i965, nv50, softpipe, virgl, lima, d3d12, panfrost) + GL_ARB_conditional_render_inverted DONE (freedreno, i965, nv50, softpipe, virgl, panfrost, d3d12) + GL_ARB_cull_distance DONE (freedreno/a6xx, i965, nv50, softpipe, virgl) + GL_ARB_derivative_control DONE (freedreno/a3xx+, i965, nv50, softpipe, virgl) +diff --git a/include/dma-uapi/dma-buf.h b/include/dma-uapi/dma-buf.h +new file mode 100644 +index 00000000000..5a6fda66d9a +--- /dev/null ++++ b/include/dma-uapi/dma-buf.h +@@ -0,0 +1,182 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * Framework for buffer objects that can be shared across devices/subsystems. ++ * ++ * Copyright(C) 2015 Intel Ltd ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published by ++ * the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program. If not, see . ++ */ ++ ++#ifndef _DMA_BUF_UAPI_H_ ++#define _DMA_BUF_UAPI_H_ ++ ++#include ++ ++/** ++ * struct dma_buf_sync - Synchronize with CPU access. ++ * ++ * When a DMA buffer is accessed from the CPU via mmap, it is not always ++ * possible to guarantee coherency between the CPU-visible map and underlying ++ * memory. To manage coherency, DMA_BUF_IOCTL_SYNC must be used to bracket ++ * any CPU access to give the kernel the chance to shuffle memory around if ++ * needed. ++ * ++ * Prior to accessing the map, the client must call DMA_BUF_IOCTL_SYNC ++ * with DMA_BUF_SYNC_START and the appropriate read/write flags. Once the ++ * access is complete, the client should call DMA_BUF_IOCTL_SYNC with ++ * DMA_BUF_SYNC_END and the same read/write flags. ++ * ++ * The synchronization provided via DMA_BUF_IOCTL_SYNC only provides cache ++ * coherency. It does not prevent other processes or devices from ++ * accessing the memory at the same time. If synchronization with a GPU or ++ * other device driver is required, it is the client's responsibility to ++ * wait for buffer to be ready for reading or writing before calling this ++ * ioctl with DMA_BUF_SYNC_START. Likewise, the client must ensure that ++ * follow-up work is not submitted to GPU or other device driver until ++ * after this ioctl has been called with DMA_BUF_SYNC_END? 
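++ *
++ * For example, a CPU write to the mapping can be bracketed as follows
++ * (illustrative only: dmabuf_fd, map, data and size stand in for the
++ * dma-buf file descriptor, its mmap()ed pointer and the data copied):
++ *
++ *     struct dma_buf_sync sync = { .flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE };
++ *
++ *     ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);
++ *     memcpy(map, data, size);    // CPU access through the mapping
++ *     sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
++ *     ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);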
++ * ++ * If the driver or API with which the client is interacting uses implicit ++ * synchronization, waiting for prior work to complete can be done via ++ * poll() on the DMA buffer file descriptor. If the driver or API requires ++ * explicit synchronization, the client may have to wait on a sync_file or ++ * other synchronization primitive outside the scope of the DMA buffer API. ++ */ ++struct dma_buf_sync { ++ /** ++ * @flags: Set of access flags ++ * ++ * DMA_BUF_SYNC_START: ++ * Indicates the start of a map access session. ++ * ++ * DMA_BUF_SYNC_END: ++ * Indicates the end of a map access session. ++ * ++ * DMA_BUF_SYNC_READ: ++ * Indicates that the mapped DMA buffer will be read by the ++ * client via the CPU map. ++ * ++ * DMA_BUF_SYNC_WRITE: ++ * Indicates that the mapped DMA buffer will be written by the ++ * client via the CPU map. ++ * ++ * DMA_BUF_SYNC_RW: ++ * An alias for DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE. ++ */ ++ __u64 flags; ++}; ++ ++#define DMA_BUF_SYNC_READ (1 << 0) ++#define DMA_BUF_SYNC_WRITE (2 << 0) ++#define DMA_BUF_SYNC_RW (DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE) ++#define DMA_BUF_SYNC_START (0 << 2) ++#define DMA_BUF_SYNC_END (1 << 2) ++#define DMA_BUF_SYNC_VALID_FLAGS_MASK \ ++ (DMA_BUF_SYNC_RW | DMA_BUF_SYNC_END) ++ ++#define DMA_BUF_NAME_LEN 32 ++ ++/** ++ * struct dma_buf_export_sync_file - Get a sync_file from a dma-buf ++ * ++ * Userspace can perform a DMA_BUF_IOCTL_EXPORT_SYNC_FILE to retrieve the ++ * current set of fences on a dma-buf file descriptor as a sync_file. CPU ++ * waits via poll() or other driver-specific mechanisms typically wait on ++ * whatever fences are on the dma-buf at the time the wait begins. This ++ * is similar except that it takes a snapshot of the current fences on the ++ * dma-buf for waiting later instead of waiting immediately. This is ++ * useful for modern graphics APIs such as Vulkan which assume an explicit ++ * synchronization model but still need to inter-operate with dma-buf. ++ * ++ * The intended usage pattern is the following: ++ * ++ * 1. Export a sync_file with flags corresponding to the expected GPU usage ++ * via DMA_BUF_IOCTL_EXPORT_SYNC_FILE. ++ * ++ * 2. Submit rendering work which uses the dma-buf. The work should wait on ++ * the exported sync file before rendering and produce another sync_file ++ * when complete. ++ * ++ * 3. Import the rendering-complete sync_file into the dma-buf with flags ++ * corresponding to the GPU usage via DMA_BUF_IOCTL_IMPORT_SYNC_FILE. ++ * ++ * Unlike doing implicit synchronization via a GPU kernel driver's exec ioctl, ++ * the above is not a single atomic operation. If userspace wants to ensure ++ * ordering via these fences, it is the respnosibility of userspace to use ++ * locks or other mechanisms to ensure that no other context adds fences or ++ * submits work between steps 1 and 3 above. ++ */ ++struct dma_buf_export_sync_file { ++ /** ++ * @flags: Read/write flags ++ * ++ * Must be DMA_BUF_SYNC_READ, DMA_BUF_SYNC_WRITE, or both. ++ * ++ * If DMA_BUF_SYNC_READ is set and DMA_BUF_SYNC_WRITE is not set, ++ * the returned sync file waits on any writers of the dma-buf to ++ * complete. Waiting on the returned sync file is equivalent to ++ * poll() with POLLIN. ++ * ++ * If DMA_BUF_SYNC_WRITE is set, the returned sync file waits on ++ * any users of the dma-buf (read or write) to complete. Waiting ++ * on the returned sync file is equivalent to poll() with POLLOUT. 
++ * If both DMA_BUF_SYNC_WRITE and DMA_BUF_SYNC_READ are set, this ++ * is equivalent to just DMA_BUF_SYNC_WRITE. ++ */ ++ __u32 flags; ++ /** @fd: Returned sync file descriptor */ ++ __s32 fd; ++}; ++ ++/** ++ * struct dma_buf_import_sync_file - Insert a sync_file into a dma-buf ++ * ++ * Userspace can perform a DMA_BUF_IOCTL_IMPORT_SYNC_FILE to insert a ++ * sync_file into a dma-buf for the purposes of implicit synchronization ++ * with other dma-buf consumers. This allows clients using explicitly ++ * synchronized APIs such as Vulkan to inter-op with dma-buf consumers ++ * which expect implicit synchronization such as OpenGL or most media ++ * drivers/video. ++ */ ++struct dma_buf_import_sync_file { ++ /** ++ * @flags: Read/write flags ++ * ++ * Must be DMA_BUF_SYNC_READ, DMA_BUF_SYNC_WRITE, or both. ++ * ++ * If DMA_BUF_SYNC_READ is set and DMA_BUF_SYNC_WRITE is not set, ++ * this inserts the sync_file as a read-only fence. Any subsequent ++ * implicitly synchronized writes to this dma-buf will wait on this ++ * fence but reads will not. ++ * ++ * If DMA_BUF_SYNC_WRITE is set, this inserts the sync_file as a ++ * write fence. All subsequent implicitly synchronized access to ++ * this dma-buf will wait on this fence. ++ */ ++ __u32 flags; ++ /** @fd: Sync file descriptor */ ++ __s32 fd; ++}; ++ ++#define DMA_BUF_BASE 'b' ++#define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync) ++ ++/* 32/64bitness of this uapi was botched in android, there's no difference ++ * between them in actual uapi, they're just different numbers. ++ */ ++#define DMA_BUF_SET_NAME _IOW(DMA_BUF_BASE, 1, const char *) ++#define DMA_BUF_SET_NAME_A _IOW(DMA_BUF_BASE, 1, __u32) ++#define DMA_BUF_SET_NAME_B _IOW(DMA_BUF_BASE, 1, __u64) ++#define DMA_BUF_IOCTL_EXPORT_SYNC_FILE _IOWR(DMA_BUF_BASE, 2, struct dma_buf_export_sync_file) ++#define DMA_BUF_IOCTL_IMPORT_SYNC_FILE _IOW(DMA_BUF_BASE, 3, struct dma_buf_import_sync_file) ++ ++#endif +diff --git a/include/drm-uapi/drm_fourcc.h b/include/drm-uapi/drm_fourcc.h +index 0e70e36cd9d..37711252619 100644 +--- a/include/drm-uapi/drm_fourcc.h ++++ b/include/drm-uapi/drm_fourcc.h +@@ -1164,6 +1164,13 @@ drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) + */ + #define AFBC_FORMAT_MOD_USM (1ULL << 12) + ++/* AFBC native swizzle ++ * ++ * Indicates that the buffer is using RGBA component order regardless of the ++ * actual format. 
++ */ ++#define AFBC_FORMAT_MOD_NATIVE_SWIZZLE (1ULL << 32) ++ + /* + * Arm Fixed-Rate Compression (AFRC) modifiers + * +diff --git a/meson.build b/meson.build +index 1e6ccd8cbb9..2a305cdc742 100644 +--- a/meson.build ++++ b/meson.build +@@ -865,14 +865,13 @@ endif + + with_gallium_st_nine = get_option('gallium-nine') + if with_gallium_st_nine +- if not with_gallium_softpipe +- error('The nine state tracker requires gallium softpipe/llvmpipe.') +- elif not [ ++ if not [ + with_gallium_crocus, + with_gallium_freedreno, + with_gallium_i915, + with_gallium_iris, + with_gallium_nouveau, ++ with_gallium_panfrost, + with_gallium_r300, + with_gallium_r600, + with_gallium_radeonsi, +diff --git a/src/android_stub/meson.build b/src/android_stub/meson.build +index 86f88caea34..a43a9ddfd6b 100644 +--- a/src/android_stub/meson.build ++++ b/src/android_stub/meson.build +@@ -1,7 +1,7 @@ + if with_android_stub + stub_libs = [] + +- foreach lib : ['backtrace', 'cutils', 'hardware', 'log', 'nativewindow', 'sync'] ++ foreach lib : ['hardware', 'log', 'nativewindow'] + stub_libs += shared_library( + lib, + files(lib + '_stub.cpp'), +diff --git a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp +index fc498fc8a24..6073c912c19 100644 +--- a/src/compiler/glsl/glsl_to_nir.cpp ++++ b/src/compiler/glsl/glsl_to_nir.cpp +@@ -81,9 +81,6 @@ class nir_visitor : public ir_visitor + + void create_function(ir_function_signature *ir); + +- /* True if we have any output rvalues */ +- bool has_output_rvalue; +- + private: + void add_instr(nir_instr *instr, unsigned num_components, unsigned bit_size); + nir_ssa_def *evaluate_rvalue(ir_rvalue *ir); +@@ -274,9 +271,6 @@ glsl_to_nir(const struct gl_constants *consts, + if (var->data.mode == nir_var_shader_in && var->data.sample) + shader->info.fs.uses_sample_shading = true; + } +- +- if (v1.has_output_rvalue) +- shader->info.fs.uses_sample_shading = true; + } + + return shader; +@@ -287,7 +281,6 @@ nir_visitor::nir_visitor(const struct gl_constants *consts, nir_shader *shader) + this->supports_std430 = consts->UseSTD430AsDefaultPacking; + this->shader = shader; + this->is_global = true; +- this->has_output_rvalue = false; + this->var_table = _mesa_pointer_hash_table_create(NULL); + this->overload_table = _mesa_pointer_hash_table_create(NULL); + this->sparse_variable_set = _mesa_pointer_set_create(NULL); +@@ -1826,9 +1819,6 @@ nir_visitor::evaluate_rvalue(ir_rvalue* ir) + + enum gl_access_qualifier access = deref_get_qualifier(this->deref); + this->result = nir_load_deref_with_access(&b, this->deref, access); +- +- if (nir_deref_mode_is(this->deref, nir_var_shader_out)) +- this->has_output_rvalue = true; + } + + return this->result; +diff --git a/src/compiler/glsl/standalone_scaffolding.cpp b/src/compiler/glsl/standalone_scaffolding.cpp +index f875e2e08bf..7de2edf586e 100644 +--- a/src/compiler/glsl/standalone_scaffolding.cpp ++++ b/src/compiler/glsl/standalone_scaffolding.cpp +@@ -262,6 +262,9 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api) + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxUniformComponents = 1024; + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxInputComponents = 0; /* not used */ + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxOutputComponents = 0; /* not used */ ++ ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicBuffers = 16; ++ ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicCounters = 16; ++ ctx->Const.Program[MESA_SHADER_COMPUTE].MaxShaderStorageBlocks = 16; + + /* Set up default shader compiler options. 
*/ + struct gl_shader_compiler_options options; +diff --git a/src/drm-shim/device.c b/src/drm-shim/device.c +index 6c9c994643b..11825d717c4 100644 +--- a/src/drm-shim/device.c ++++ b/src/drm-shim/device.c +@@ -292,6 +292,10 @@ drm_shim_ioctl(int fd, unsigned long request, void *arg) + ASSERTED int type = _IOC_TYPE(request); + int nr = _IOC_NR(request); + ++ /* Used by kbase; do not claim to be a kbase FD */ ++ if (type == 0x80) ++ return -EINVAL; ++ + assert(type == DRM_IOCTL_BASE); + + if (nr >= DRM_COMMAND_BASE && nr < DRM_COMMAND_END) { +diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c +index de29d03940f..2fb7d55b508 100644 +--- a/src/egl/drivers/dri2/egl_dri2.c ++++ b/src/egl/drivers/dri2/egl_dri2.c +@@ -52,6 +52,8 @@ + #include + #include "wayland-drm.h" + #include "wayland-drm-client-protocol.h" ++#include "mali-buffer-sharing.h" ++#include "mali-buffer-sharing-client-protocol.h" + #include "linux-dmabuf-unstable-v1-client-protocol.h" + #endif + +@@ -2259,6 +2261,9 @@ dri2_create_image_wayland_wl_buffer(_EGLDisplay *disp, _EGLContext *ctx, + + buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_drm, + (struct wl_resource *) _buffer); ++ if (!buffer) ++ buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_mali, ++ (struct wl_resource *) _buffer); + if (!buffer) + return NULL; + +@@ -3256,11 +3261,27 @@ dri2_bind_wayland_display_wl(_EGLDisplay *disp, struct wl_display *wl_dpy) + wayland_drm_init(wl_dpy, device_name, + &wl_drm_callbacks, disp, flags); + ++ drmSetVersion sv = { ++ .drm_di_major = 1, ++ .drm_di_minor = 4, ++ .drm_dd_major = -1, ++ .drm_dd_minor = -1, ++ }; ++ drmSetInterfaceVersion(dri2_dpy->fd, &sv); ++ ++ char *busid = drmGetBusid(dri2_dpy->fd); ++ dri2_dpy->wl_server_mali = ++ mali_buffer_sharing_init(wl_dpy, busid, ++ &wl_drm_callbacks, ++ disp); ++ drmFreeBusid(busid); ++ + free(device_name); + + if (!dri2_dpy->wl_server_drm) + goto fail; + ++ // TODO: Do this for mali_buffer_sharing + #ifdef HAVE_DRM_PLATFORM + /* We have to share the wl_drm instance with gbm, so gbm can convert + * wl_buffers to gbm bos. 
*/ +@@ -3281,6 +3302,11 @@ dri2_unbind_wayland_display_wl(_EGLDisplay *disp, struct wl_display *wl_dpy) + { + struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); + ++ if (dri2_dpy->wl_server_mali) { ++ wayland_drm_uninit(dri2_dpy->wl_server_mali); ++ dri2_dpy->wl_server_mali = NULL; ++ } ++ + if (!dri2_dpy->wl_server_drm) + return EGL_FALSE; + +@@ -3299,6 +3325,8 @@ dri2_query_wayland_buffer_wl(_EGLDisplay *disp, struct wl_resource *buffer_resou + const struct wl_drm_components_descriptor *format; + + buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_drm, buffer_resource); ++ if (!buffer) ++ buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_mali, buffer_resource); + if (!buffer) + return EGL_FALSE; + +diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h +index e4c15875091..f5143147014 100644 +--- a/src/egl/drivers/dri2/egl_dri2.h ++++ b/src/egl/drivers/dri2/egl_dri2.h +@@ -284,8 +284,11 @@ struct dri2_egl_display + struct wl_display *wl_dpy_wrapper; + struct wl_registry *wl_registry; + struct wl_drm *wl_server_drm; ++ struct wl_drm *wl_server_mali; + struct wl_drm *wl_drm; ++ struct wl_drm *wl_mali; + uint32_t wl_drm_version, wl_drm_name; ++ uint32_t wl_mali_version, wl_mali_name; + struct wl_shm *wl_shm; + struct wl_event_queue *wl_queue; + struct zwp_linux_dmabuf_v1 *wl_dmabuf; +@@ -337,6 +340,7 @@ struct dri2_egl_surface + struct wl_surface *wl_surface_wrapper; + struct wl_display *wl_dpy_wrapper; + struct wl_drm *wl_drm_wrapper; ++ struct wl_drm *wl_mali_wrapper; + struct wl_callback *throttle_callback; + struct zwp_linux_dmabuf_feedback_v1 *wl_dmabuf_feedback; + struct dmabuf_feedback dmabuf_feedback, pending_dmabuf_feedback; +diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c +index 260eb9c82af..70c07ccf127 100644 +--- a/src/egl/drivers/dri2/platform_wayland.c ++++ b/src/egl/drivers/dri2/platform_wayland.c +@@ -51,6 +51,7 @@ + #include + #include + #include "wayland-drm-client-protocol.h" ++#include "mali-buffer-sharing-client-protocol.h" + #include "linux-dmabuf-unstable-v1-client-protocol.h" + + /* +@@ -668,7 +669,7 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, + dri2_surf->base.PresentOpaque); + assert(visual_idx != -1); + +- if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm) { ++ if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm || dri2_dpy->wl_mali) { + dri2_surf->format = dri2_wl_visuals[visual_idx].wl_drm_format; + } else { + assert(dri2_dpy->wl_shm); +@@ -691,6 +692,16 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, + dri2_surf->wl_queue); + } + ++ if (dri2_dpy->wl_mali) { ++ dri2_surf->wl_mali_wrapper = wl_proxy_create_wrapper(dri2_dpy->wl_mali); ++ if (!dri2_surf->wl_mali_wrapper) { ++ _eglError(EGL_BAD_ALLOC, "dri2_create_surface"); ++ goto cleanup_queue; ++ } ++ wl_proxy_set_queue((struct wl_proxy *)dri2_surf->wl_mali_wrapper, ++ dri2_surf->wl_queue); ++ } ++ + dri2_surf->wl_dpy_wrapper = wl_proxy_create_wrapper(dri2_dpy->wl_dpy); + if (!dri2_surf->wl_dpy_wrapper) { + _eglError(EGL_BAD_ALLOC, "dri2_create_surface"); +@@ -765,6 +776,8 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, + cleanup_drm: + if (dri2_surf->wl_drm_wrapper) + wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper); ++ if (dri2_surf->wl_mali_wrapper) ++ wl_proxy_wrapper_destroy(dri2_surf->wl_mali_wrapper); + cleanup_queue: + wl_event_queue_destroy(dri2_surf->wl_queue); + cleanup_surf: +@@ -827,6 +840,8 @@ dri2_wl_destroy_surface(_EGLDisplay *disp, _EGLSurface *surf) + 
wl_proxy_wrapper_destroy(dri2_surf->wl_dpy_wrapper); + if (dri2_surf->wl_drm_wrapper) + wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper); ++ if (dri2_surf->wl_mali_wrapper) ++ wl_proxy_wrapper_destroy(dri2_surf->wl_mali_wrapper); + if (dri2_surf->wl_dmabuf_feedback) { + zwp_linux_dmabuf_feedback_v1_destroy(dri2_surf->wl_dmabuf_feedback); + dmabuf_feedback_fini(&dri2_surf->dmabuf_feedback); +@@ -1460,6 +1475,26 @@ create_wl_buffer(struct dri2_egl_display *dri2_dpy, + ret = zwp_linux_buffer_params_v1_create_immed(params, width, height, + fourcc, 0); + zwp_linux_buffer_params_v1_destroy(params); ++ } else if (dri2_surf->wl_mali_wrapper || dri2_dpy->wl_mali) { ++ struct wl_drm *wl_mali = ++ dri2_surf ? dri2_surf->wl_mali_wrapper : dri2_dpy->wl_mali; ++ int fd = -1, stride; ++ ++ if (num_planes > 1) ++ return NULL; ++ ++ query = dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FD, &fd); ++ query &= dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, &stride); ++ if (!query) { ++ if (fd >= 0) ++ close(fd); ++ return NULL; ++ } ++ ++ ret = mali_buffer_sharing_create_buffer((void *)wl_mali, ++ width, height, stride, ++ fourcc, 0, 0, fd); ++ close(fd); + } else { + struct wl_drm *wl_drm = + dri2_surf ? dri2_surf->wl_drm_wrapper : dri2_dpy->wl_drm; +@@ -1733,6 +1768,62 @@ drm_handle_device(void *data, struct wl_drm *drm, const char *device) + } + } + ++static void ++mali_handle_device(void *data, struct mali_buffer_sharing *drm, const char *device) ++{ ++ struct dri2_egl_display *dri2_dpy = data; ++ drm_magic_t magic; ++ ++ // hack ++ //printf("device '%s'\n", device); ++ dri2_dpy->device_name = strdup("/dev/dri/card0"); ++ ++ dri2_dpy->fd = loader_open_device(dri2_dpy->device_name); ++ if (dri2_dpy->fd == -1) { ++ _eglLog(_EGL_WARNING, "wayland-egl: could not open %s (%s)", ++ dri2_dpy->device_name, strerror(errno)); ++ free(dri2_dpy->device_name); ++ dri2_dpy->device_name = NULL; ++ return; ++ } ++ ++ if (drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER) { ++ dri2_dpy->authenticated = true; ++ } else { ++ roundtrip(dri2_dpy); ++ if (drmGetMagic(dri2_dpy->fd, &magic)) { ++ close(dri2_dpy->fd); ++ dri2_dpy->fd = -1; ++ free(dri2_dpy->device_name); ++ dri2_dpy->device_name = NULL; ++ _eglLog(_EGL_WARNING, "wayland-egl: drmGetMagic failed"); ++ return; ++ } ++ ++ mali_buffer_sharing_auth((void *)dri2_dpy->wl_mali, magic); ++ roundtrip(dri2_dpy); ++ // We don't get a callback ++ dri2_dpy->authenticated = true; ++ } ++ ++ int supported_fourcc[] = { ++ WL_DRM_FORMAT_ABGR16F, ++ WL_DRM_FORMAT_ABGR2101010, ++ WL_DRM_FORMAT_XRGB8888, ++ WL_DRM_FORMAT_ARGB8888, ++ WL_DRM_FORMAT_ABGR8888, ++ WL_DRM_FORMAT_XBGR8888, ++ WL_DRM_FORMAT_RGB565, ++ }; ++ ++ for (unsigned i = 0; i < ARRAY_SIZE(supported_fourcc); ++i) { ++ int visual_idx = dri2_wl_visual_idx_from_fourcc(supported_fourcc[i]); ++ assert(visual_idx != -1); ++ ++ BITSET_SET(dri2_dpy->formats.formats_bitmap, visual_idx); ++ } ++} ++ + static void + drm_handle_format(void *data, struct wl_drm *drm, uint32_t format) + { +@@ -1768,6 +1859,10 @@ static const struct wl_drm_listener drm_listener = { + .capabilities = drm_handle_capabilities + }; + ++static const struct mali_buffer_sharing_listener mali_listener = { ++ .alloc_device = mali_handle_device, ++}; ++ + static void + dmabuf_ignore_format(void *data, struct zwp_linux_dmabuf_v1 *dmabuf, + uint32_t format) +@@ -1813,6 +1908,14 @@ wl_drm_bind(struct dri2_egl_display *dri2_dpy) + wl_drm_add_listener(dri2_dpy->wl_drm, &drm_listener, dri2_dpy); + } + ++static void ++wl_mali_bind(struct 
dri2_egl_display *dri2_dpy) ++{ ++ dri2_dpy->wl_mali = wl_registry_bind(dri2_dpy->wl_registry, dri2_dpy->wl_mali_name, ++ &mali_buffer_sharing_interface, dri2_dpy->wl_mali_version); ++ mali_buffer_sharing_add_listener((void *)dri2_dpy->wl_mali, &mali_listener, dri2_dpy); ++} ++ + static void + default_dmabuf_feedback_format_table(void *data, + struct zwp_linux_dmabuf_feedback_v1 *zwp_linux_dmabuf_feedback_v1, +@@ -1943,6 +2046,9 @@ registry_handle_global_drm(void *data, struct wl_registry *registry, + if (strcmp(interface, wl_drm_interface.name) == 0) { + dri2_dpy->wl_drm_version = MIN2(version, 2); + dri2_dpy->wl_drm_name = name; ++ } else if (strcmp(interface, mali_buffer_sharing_interface.name) == 0) { ++ dri2_dpy->wl_mali_version = MIN2(version, 5); ++ dri2_dpy->wl_mali_name = name; + } else if (strcmp(interface, zwp_linux_dmabuf_v1_interface.name) == 0 && version >= 3) { + dri2_dpy->wl_dmabuf = + wl_registry_bind(registry, name, &zwp_linux_dmabuf_v1_interface, +@@ -2145,10 +2251,7 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) + + /* We couldn't retrieve a render node from the dma-buf feedback (or the + * feedback was not advertised at all), so we must fallback to wl_drm. */ +- if (dri2_dpy->fd == -1) { +- /* wl_drm not advertised by compositor, so can't continue */ +- if (dri2_dpy->wl_drm_name == 0) +- goto cleanup; ++ if (dri2_dpy->fd == -1 && dri2_dpy->wl_drm_name) { + wl_drm_bind(dri2_dpy); + + if (dri2_dpy->wl_drm == NULL) +@@ -2161,6 +2264,22 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) + goto cleanup; + } + ++ if (dri2_dpy->fd == -1 && dri2_dpy->wl_mali_name) { ++ wl_mali_bind(dri2_dpy); ++ ++ if (dri2_dpy->wl_mali == NULL) ++ goto cleanup; ++ if (roundtrip(dri2_dpy) < 0 || dri2_dpy->fd == -1) ++ goto cleanup; ++ ++ if (!dri2_dpy->authenticated && ++ (roundtrip(dri2_dpy) < 0 || !dri2_dpy->authenticated)) ++ goto cleanup; ++ } ++ ++ if (dri2_dpy->fd == -1) ++ goto cleanup; ++ + dri2_dpy->fd = loader_get_user_preferred_fd(dri2_dpy->fd, + &dri2_dpy->is_different_gpu); + dev = _eglAddDevice(dri2_dpy->fd, false); +@@ -2786,6 +2905,8 @@ dri2_teardown_wayland(struct dri2_egl_display *dri2_dpy) + dri2_wl_formats_fini(&dri2_dpy->formats); + if (dri2_dpy->wl_drm) + wl_drm_destroy(dri2_dpy->wl_drm); ++ if (dri2_dpy->wl_mali) ++ wl_drm_destroy(dri2_dpy->wl_mali); + if (dri2_dpy->wl_dmabuf) + zwp_linux_dmabuf_v1_destroy(dri2_dpy->wl_dmabuf); + if (dri2_dpy->wl_shm) +diff --git a/src/egl/meson.build b/src/egl/meson.build +index 5b4644940a5..80dc6c94f33 100644 +--- a/src/egl/meson.build ++++ b/src/egl/meson.build +@@ -122,14 +122,16 @@ if with_dri2 + endif + if with_platform_wayland + deps_for_egl += [dep_wayland_client, dep_wayland_server, dep_wayland_egl_headers] +- link_for_egl += libwayland_drm ++ link_for_egl += [libwayland_drm, libmali_buffer_sharing] + files_egl += files('drivers/dri2/platform_wayland.c') + files_egl += [ + linux_dmabuf_unstable_v1_protocol_c, + linux_dmabuf_unstable_v1_client_protocol_h, + wayland_drm_client_protocol_h, ++ mali_buffer_sharing_client_protocol_h, + ] + incs_for_egl += include_directories('wayland/wayland-drm') ++ incs_for_egl += include_directories('wayland/mali-buffer-sharing') + endif + if with_platform_android + deps_for_egl += dep_android +diff --git a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c +new file mode 100644 +index 00000000000..d3c9a6f0dd2 +--- /dev/null ++++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c +@@ -0,0 +1,170 @@ ++/* ++ * Copyright © 
2022 Icecream95 ++ * Copyright © 2011 Kristian Høgsberg ++ * Copyright © 2011 Benjamin Franzke ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT ++ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, ++ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ * ++ * Authors: ++ * Kristian Høgsberg ++ * Benjamin Franzke ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "mali-buffer-sharing.h" ++#include "mali-buffer-sharing-server-protocol.h" ++#include "wayland-drm-client-protocol.h" ++ ++#define MIN(x,y) (((x)<(y))?(x):(y)) ++ ++static void ++destroy_buffer(struct wl_resource *resource) ++{ ++ struct wl_drm_buffer *buffer = wl_resource_get_user_data(resource); ++ struct wl_drm *drm = buffer->drm; ++ ++ drm->callbacks.release_buffer(drm->user_data, buffer); ++ free(buffer); ++} ++ ++static void ++buffer_destroy(struct wl_client *client, struct wl_resource *resource) ++{ ++ wl_resource_destroy(resource); ++} ++ ++static void ++create_buffer(struct wl_client *client, struct wl_resource *resource, ++ uint32_t id, uint32_t name, int fd, ++ int32_t width, int32_t height, ++ uint32_t format, ++ int32_t offset, int32_t stride) ++{ ++ struct wl_drm *drm = wl_resource_get_user_data(resource); ++ struct wl_drm_buffer *buffer; ++ ++ buffer = calloc(1, sizeof *buffer); ++ if (buffer == NULL) { ++ wl_resource_post_no_memory(resource); ++ return; ++ } ++ ++ buffer->drm = drm; ++ buffer->width = width; ++ buffer->height = height; ++ buffer->format = format; ++ buffer->offset[0] = offset; ++ buffer->stride[0] = stride; ++ ++ drm->callbacks.reference_buffer(drm->user_data, name, fd, buffer); ++ if (buffer->driver_buffer == NULL) { ++ // TODO: We should return an error ++ return; ++ } ++ ++ buffer->resource = ++ wl_resource_create(client, &wl_buffer_interface, 1, id); ++ if (!buffer->resource) { ++ wl_resource_post_no_memory(resource); ++ free(buffer); ++ return; ++ } ++ ++ wl_resource_set_implementation(buffer->resource, ++ (void (**)(void)) &drm->buffer_interface, ++ buffer, destroy_buffer); ++} ++ ++static void ++mali_create_buffer(struct wl_client *client, ++ struct wl_resource *resource, ++ uint32_t id, ++ int32_t width, int32_t height, uint32_t stride, ++ enum wl_drm_format format, uint32_t unk1, uint32_t unk2, ++ int fd) ++{ ++ create_buffer(client, resource, id, 0, fd, width, height, format, ++ 0, stride); ++ close(fd); ++} ++ ++static void ++mali_auth(struct wl_client *client, ++ struct wl_resource *resource, uint32_t id) ++{ ++ struct wl_drm *drm = wl_resource_get_user_data(resource); ++ 
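++   /* 'id' is the DRM magic sent by the client via mali_buffer_sharing.auth;
++    * validate it through the same authenticate callback that wl_drm uses. */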
++ drm->callbacks.authenticate(drm->user_data, id); ++} ++ ++static const struct mali_buffer_sharing_interface mali_interface = { ++ mali_create_buffer, ++ mali_auth, ++}; ++ ++static void ++bind_mali(struct wl_client *client, void *data, uint32_t version, uint32_t id) ++{ ++ struct wl_drm *drm = data; ++ struct wl_resource *resource; ++ ++ resource = wl_resource_create(client, &mali_buffer_sharing_interface, ++ MIN(version, 4), id); ++ if (!resource) { ++ wl_client_post_no_memory(client); ++ return; ++ } ++ ++ wl_resource_set_implementation(resource, &mali_interface, data, NULL); ++ ++ mali_buffer_sharing_send_alloc_device(resource, drm->device_name); ++} ++ ++struct wl_drm * ++mali_buffer_sharing_init(struct wl_display *display, char *device_name, ++ const struct wayland_drm_callbacks *callbacks, void *user_data) ++{ ++ struct wl_drm *drm; ++ ++ drm = malloc(sizeof *drm); ++ if (!drm) ++ return NULL; ++ ++ drm->display = display; ++ drm->device_name = strdup(device_name ?: ""); ++ drm->callbacks = *callbacks; ++ drm->user_data = user_data; ++ drm->flags = 1; ++ ++ drm->buffer_interface.destroy = buffer_destroy; ++ ++ drm->wl_drm_global = ++ wl_global_create(display, &mali_buffer_sharing_interface, 5, ++ drm, bind_mali); ++ ++ return drm; ++} +diff --git a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h +new file mode 100644 +index 00000000000..f7f2c4811df +--- /dev/null ++++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h +@@ -0,0 +1,12 @@ ++#ifndef MALI_BUFFER_H ++#define MALI_BUFFER_H ++ ++#include ++ ++#include "wayland-drm.h" ++ ++struct wl_drm * ++mali_buffer_sharing_init(struct wl_display *display, char *device_name, ++ const struct wayland_drm_callbacks *callbacks, void *user_data); ++ ++#endif +diff --git a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml +new file mode 100644 +index 00000000000..0ad02488118 +--- /dev/null ++++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml +@@ -0,0 +1,50 @@ ++ ++ ++ ++ ++ Copyright © 2022 Icecream95 ++ ++ Permission to use, copy, modify, distribute, and sell this ++ software and its documentation for any purpose is hereby granted ++ without fee, provided that\n the above copyright notice appear in ++ all copies and that both that copyright notice and this permission ++ notice appear in supporting documentation, and that the name of ++ the copyright holders not be used in advertising or publicity ++ pertaining to distribution of the software without specific, ++ written prior permission. The copyright holders make no ++ representations about the suitability of this software for any ++ purpose. It is provided "as is" without express or implied ++ warranty. ++ ++ THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS ++ SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND ++ FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ++ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ++ AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ++ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ++ THIS SOFTWARE. 
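++
++  <!-- The interface mirrors wl_drm: the server advertises a DRM device
++       node via the alloc_device event, and clients respond with the auth
++       request (carrying a DRM magic) and the create_buffer request
++       (carrying a dma-buf fd plus width, height, stride and format). -->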
++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/src/egl/wayland/mali-buffer-sharing/meson.build b/src/egl/wayland/mali-buffer-sharing/meson.build +new file mode 100644 +index 00000000000..0693bf8668c +--- /dev/null ++++ b/src/egl/wayland/mali-buffer-sharing/meson.build +@@ -0,0 +1,51 @@ ++# Copyright © 2017 Intel Corporation ++ ++# Permission is hereby granted, free of charge, to any person obtaining a copy ++# of this software and associated documentation files (the "Software"), to deal ++# in the Software without restriction, including without limitation the rights ++# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++# copies of the Software, and to permit persons to whom the Software is ++# furnished to do so, subject to the following conditions: ++ ++# The above copyright notice and this permission notice shall be included in ++# all copies or substantial portions of the Software. ++ ++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++# SOFTWARE. ++ ++inc_mali_buffer_sharing = include_directories('.') ++ ++mali_buffer_sharing_protocol_c = custom_target( ++ 'mali-buffer-sharing-protocol.c', ++ input : 'mali-buffer-sharing.xml', ++ output : 'mali-buffer-sharing-protocol.c', ++ command : [prog_wl_scanner, wl_scanner_arg, '@INPUT@', '@OUTPUT@'], ++) ++ ++mali_buffer_sharing_client_protocol_h = custom_target( ++ 'mali-buffer-sharing-client-protocol.h', ++ input : 'mali-buffer-sharing.xml', ++ output : 'mali-buffer-sharing-client-protocol.h', ++ command : [prog_wl_scanner, 'client-header', '@INPUT@', '@OUTPUT@'], ++) ++ ++mali_buffer_sharing_server_protocol_h = custom_target( ++ 'mali-buffer-sharing-server-protocol.h', ++ input : 'mali-buffer-sharing.xml', ++ output : 'mali-buffer-sharing-server-protocol.h', ++ command : [prog_wl_scanner, 'server-header', '@INPUT@', '@OUTPUT@'], ++) ++ ++libmali_buffer_sharing = static_library( ++ 'mali-buffer-sharing', ++ ['mali-buffer-sharing.c', mali_buffer_sharing_protocol_c, mali_buffer_sharing_server_protocol_h, wayland_drm_client_protocol_h], ++ include_directories : inc_wayland_drm, ++ gnu_symbol_visibility : 'hidden', ++ dependencies : [dep_wayland_server], ++ build_by_default : false, ++) +diff --git a/src/egl/wayland/wayland-drm/wayland-drm.c b/src/egl/wayland/wayland-drm/wayland-drm.c +index 29558ea910e..ad9e64b72ee 100644 +--- a/src/egl/wayland/wayland-drm/wayland-drm.c ++++ b/src/egl/wayland/wayland-drm/wayland-drm.c +@@ -212,7 +212,7 @@ bind_drm(struct wl_client *client, void *data, uint32_t version, uint32_t id) + + wl_resource_set_implementation(resource, &drm_interface, data, NULL); + +- wl_resource_post_event(resource, WL_DRM_DEVICE, drm->device_name); ++ wl_resource_post_event(resource, WL_DRM_DEVICE, "/dev/dri/card0"); + + if (drm->callbacks.is_format_supported(drm->user_data, + WL_DRM_FORMAT_ARGB2101010)) { +diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c +index efce6f6737e..6c0242770c6 100644 +--- a/src/gallium/auxiliary/cso_cache/cso_context.c ++++ b/src/gallium/auxiliary/cso_cache/cso_context.c 
+@@ -1368,6 +1368,11 @@ cso_single_sampler(struct cso_context *ctx, enum pipe_shader_type shader_stage, + } + } + ++void ++cso_set_max_sampler(struct cso_context *ctx, int max_sampler_seen) ++{ ++ ctx->max_sampler_seen = max_sampler_seen; ++} + + /** + * Send staged sampler state to the driver. +diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h +index 4b9ec2098bf..24535f62b35 100644 +--- a/src/gallium/auxiliary/cso_cache/cso_context.h ++++ b/src/gallium/auxiliary/cso_cache/cso_context.h +@@ -83,6 +83,9 @@ void + cso_single_sampler(struct cso_context *cso, enum pipe_shader_type shader_stage, + unsigned idx, const struct pipe_sampler_state *states); + ++void ++cso_set_max_sampler(struct cso_context *ctx, int max_sampler_seen); ++ + void + cso_single_sampler_done(struct cso_context *cso, + enum pipe_shader_type shader_stage); +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c +index 57c953a8d3b..ed76910c66d 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c +@@ -1027,7 +1027,7 @@ static void emit_atomic_global(struct lp_build_nir_context *bld_base, + case nir_intrinsic_global_atomic_fadd: + op = LLVMAtomicRMWBinOpFAdd; + break; +-#if LLVM_VERSION_MAJOR >= 15 ++#if LLVM_VERSION_MAJOR >= 16 + case nir_intrinsic_global_atomic_fmin: + op = LLVMAtomicRMWBinOpFMin; + break; +@@ -1542,7 +1542,7 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base, + case nir_intrinsic_ssbo_atomic_fadd: + op = LLVMAtomicRMWBinOpFAdd; + break; +-#if LLVM_VERSION_MAJOR >= 15 ++#if LLVM_VERSION_MAJOR >= 16 + case nir_intrinsic_shared_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmin: + op = LLVMAtomicRMWBinOpFMin; +diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c +index b27858ab467..ba7c1b8d586 100644 +--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c ++++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c +@@ -189,7 +189,7 @@ pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd) + int new_fd; + + if (fd < 0 || (new_fd = os_dupfd_cloexec(fd)) < 0) +- return false; ++ return false; + + ret = pipe_loader_drm_probe_fd_nodup(dev, new_fd); + if (!ret) +diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h +index d821008b534..dfef7a24c8c 100644 +--- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h ++++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h +@@ -8,6 +8,10 @@ + #include "frontend/sw_winsys.h" + #include "target-helpers/inline_debug_helper.h" + ++#include ++#include ++#include ++ + /* Helper function to choose and instantiate one of the software rasterizers: + * llvmpipe, softpipe. 
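++ * With this patch, a kbase-backed panfrost screen can also be created
++ * here when the "panfrost" driver is requested.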
+ */ +@@ -33,6 +37,10 @@ + #include "asahi/agx_public.h" + #endif + ++#if defined(GALLIUM_PANFROST) ++#include "panfrost/pan_public.h" ++#endif ++ + static inline struct pipe_screen * + sw_screen_create_named(struct sw_winsys *winsys, const char *driver) + { +@@ -71,6 +79,19 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver) + screen = agx_screen_create(0, NULL, winsys); + #endif + ++#if defined(GALLIUM_PANFROST) ++ if(screen == NULL && strcmp(driver, "panfrost") == 0) { ++ int kbase_device_fd = open("/dev/mali0", O_RDWR | O_CLOEXEC | O_NONBLOCK); ++ if(kbase_device_fd == -1) { ++ printf("PAN_OSMESA: Failed to open kbase device: %s", strerror(errno)); ++ }else { ++ screen = panfrost_create_screen(kbase_device_fd, NULL); ++ } ++ } ++#else ++#error You forgot to include Panfrost ++#endif ++ + return screen ? debug_screen_wrap(screen) : NULL; + } + +diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build +index 8d6317292e9..58634b46c40 100644 +--- a/src/gallium/drivers/panfrost/meson.build ++++ b/src/gallium/drivers/panfrost/meson.build +@@ -44,6 +44,7 @@ panfrost_includes = [ + inc_include, + inc_src, + inc_panfrost, ++ inc_panfrost_hw, + ] + + compile_args_panfrost = [ +@@ -51,7 +52,7 @@ compile_args_panfrost = [ + '-Wno-pointer-arith' + ] + +-panfrost_versions = ['4', '5', '6', '7', '9'] ++panfrost_versions = ['4', '5', '6', '7', '9', '10'] + libpanfrost_versions = [] + + foreach ver : panfrost_versions +diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c +index eda56974409..227b6550d19 100644 +--- a/src/gallium/drivers/panfrost/pan_cmdstream.c ++++ b/src/gallium/drivers/panfrost/pan_cmdstream.c +@@ -23,12 +23,15 @@ + * SOFTWARE. + */ + ++#include "dma-uapi/dma-buf.h" ++ + #include "util/macros.h" + #include "util/u_prim.h" + #include "util/u_vbuf.h" + #include "util/u_helpers.h" + #include "util/u_draw.h" + #include "util/u_memory.h" ++#include "util/u_viewport.h" + #include "pipe/p_defines.h" + #include "pipe/p_state.h" + #include "gallium/auxiliary/util/u_blend.h" +@@ -749,8 +752,8 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]); + float vp_miny = vp->translate[1] - fabsf(vp->scale[1]); + float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]); +- float minz = (vp->translate[2] - fabsf(vp->scale[2])); +- float maxz = (vp->translate[2] + fabsf(vp->scale[2])); ++ float minz, maxz; ++ util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz); + + /* Scissor to the intersection of viewport and to the scissor, clamped + * to the framebuffer */ +@@ -778,10 +781,16 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + maxx--; + maxy--; + +- batch->minimum_z = rast->depth_clip_near ? minz : -INFINITY; +- batch->maximum_z = rast->depth_clip_far ? maxz : +INFINITY; +- + #if PAN_ARCH <= 7 ++ /* Proper depth clamp support was only introduced in v9, before then ++ * all that can be done is disabling clipping by adjusting the ++ * viewport. This means that the result will be wrong for float depth ++ * buffers or non-[0, 1] depth range. 
*/ ++ if (!rast->depth_clip_near) ++ minz = -INFINITY; ++ if (!rast->depth_clip_far) ++ maxz = +INFINITY; ++ + struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT); + + pan_pack(T.cpu, VIEWPORT, cfg) { +@@ -790,19 +799,22 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + +- cfg.minimum_z = batch->minimum_z; +- cfg.maximum_z = batch->maximum_z; ++ cfg.minimum_z = minz; ++ cfg.maximum_z = maxz; + } + + return T.gpu; + #else +- pan_pack(&batch->scissor, SCISSOR, cfg) { ++ pan_pack_cs_v10(&batch->scissor, &batch->cs_vertex, SCISSOR, cfg) { + cfg.scissor_minimum_x = minx; + cfg.scissor_minimum_y = miny; + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + } + ++ batch->minimum_z = minz; ++ batch->maximum_z = maxz; ++ + return 0; + #endif + } +@@ -838,6 +850,14 @@ panfrost_emit_depth_stencil(struct panfrost_batch *batch) + cfg.depth_units = rast->base.offset_units * 2.0f; + cfg.depth_factor = rast->base.offset_scale; + cfg.depth_bias_clamp = rast->base.offset_clamp; ++ ++ if (rast->base.depth_clip_near && rast->base.depth_clip_far) { ++ cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_0_1; ++ cfg.depth_cull_enable = true; ++ } else { ++ cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_BOUNDS; ++ cfg.depth_cull_enable = false; ++ } + } + + pan_merge(dynamic, zsa->desc, DEPTH_STENCIL); +@@ -1482,9 +1502,17 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count; + struct panfrost_ptr transfer = + pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16); ++ void *sys_cpu = malloc(sys_size); ++ ++ /* Write to a shadow buffer to make pushing cheaper */ ++ struct panfrost_ptr sys_shadow = { ++ .cpu = sys_cpu, ++ .gpu = transfer.gpu, ++ }; + + /* Upload sysvals requested by the shader */ +- panfrost_upload_sysvals(batch, &transfer, ss, stage); ++ panfrost_upload_sysvals(batch, &sys_shadow, ss, stage); ++ memcpy(transfer.cpu, sys_cpu, sys_size); + + /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */ + struct panfrost_compiled_shader *shader = ctx->prog[stage]; +@@ -1527,8 +1555,10 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + if (pushed_words) + *pushed_words = ss->info.push.count; + +- if (ss->info.push.count == 0) ++ if (ss->info.push.count == 0) { ++ free(sys_cpu); + return ubos.gpu; ++ } + + /* Copy push constants required by the shader */ + struct panfrost_ptr push_transfer = +@@ -1580,13 +1610,15 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + * off to upload sysvals to a staging buffer on the CPU on the + * assumption sysvals will get pushed (TODO) */ + +- const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu : ++ const void *mapped_ubo = (src.ubo == sysval_ubo) ? 
sys_cpu : + panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); + + /* TODO: Is there any benefit to combining ranges */ + memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4); + } + ++ free(sys_cpu); ++ + return ubos.gpu; + } + +@@ -2777,6 +2809,385 @@ emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb) + return transfer.gpu; + } + ++#if PAN_ARCH >= 10 ++ ++static int ++panfrost_export_dmabuf_fence(int dmabuf) ++{ ++ struct dma_buf_export_sync_file export = { ++ .flags = DMA_BUF_SYNC_RW, ++ }; ++ ++ int err = drmIoctl(dmabuf, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &export); ++ if (err < 0) { ++ fprintf(stderr, "failed to export fence: %s\n", ++ strerror(errno)); ++ return -1; ++ } ++ ++ return export.fd; ++} ++ ++static bool ++panfrost_import_dmabuf_fence(int dmabuf, int fence) ++{ ++ struct dma_buf_import_sync_file import = { ++ .flags = DMA_BUF_SYNC_RW, ++ .fd = fence, ++ }; ++ ++ int err = drmIoctl(dmabuf, DMA_BUF_IOCTL_IMPORT_SYNC_FILE, &import); ++ if (err < 0) { ++ fprintf(stderr, "failed to import fence: %s\n", ++ strerror(errno)); ++ return false; ++ } ++ ++ return true; ++} ++ ++static uint64_t * ++panfrost_cs_ring_allocate_instrs(struct panfrost_cs *cs, unsigned count) ++{ ++ pan_command_stream c = cs->cs; ++ ++ if (c.ptr + count > c.end) { ++ assert(c.ptr <= c.end); ++ assert(c.begin + count <= c.ptr); ++ ++ /* Instructions are in a ring buffer, simply NOP out the end ++ * and start back from the start. Possibly, doing a TAILCALL ++ * straight to the start could also work. */ ++ memset(c.ptr, 0, (c.end - c.ptr) * 8); ++ c.ptr = c.begin; ++ ++ cs->offset += cs->base.size; ++ cs->cs = c; ++ } ++ ++ /* TODO: Check against the extract offset */ ++ return c.ptr + count; ++} ++ ++// TODO: Rewrite this! ++static void ++emit_csf_queue(struct panfrost_batch *batch, struct panfrost_cs *cs, ++ pan_command_stream s, struct util_dynarray *deps, ++ bool first, bool last) ++{ ++ struct panfrost_device *dev = pan_device(batch->ctx->base.screen); ++ ++ assert(s.ptr <= s.end); ++ ++ bool fragment = (cs->hw_resources & 2); ++ bool vertex = (cs->hw_resources & 12); /* TILER | IDVS */ ++ ++ uint64_t *limit = panfrost_cs_ring_allocate_instrs(cs, ++ 128 + util_dynarray_num_elements(deps, struct panfrost_usage) * 4); ++ ++ pan_command_stream *c = &cs->cs; ++ ++ /* First, do some waiting at the start of the job */ ++ ++ pan_emit_cs_32(c, 0x54, *cs->base.latest_flush); ++ // TODO genxmlify ++ pan_emit_cs_ins(c, 0x24, 0x540000000233ULL); ++ // TODO: What does this need to be? ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 0xff; } ++ ++ /* For the first job in the batch, wait on dependencies */ ++ // TODO: Usually the vertex job shouldn't have to wait for dmabufs! ++ if (first) { ++ mali_ptr seqnum_ptr_base = dev->mali.event_mem.gpu; ++ ++ util_dynarray_foreach(deps, struct panfrost_usage, u) { ++ /* Note the multiplication in the call to ++ * cs_ring_allocate_instrs. pan_emit_cs_64 might be ++ * split, so the total is four instructions. */ ++ pan_emit_cs_48(c, 0x42, seqnum_ptr_base + ++ u->queue * PAN_EVENT_SIZE); ++ pan_emit_cs_64(c, 0x40, u->seqnum); ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.no_error = true; ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ } ++ ++ uint64_t kcpu_seqnum = ++cs->kcpu_seqnum; ++ ++ util_dynarray_foreach(&batch->dmabufs, int, fd) { ++ int fence = panfrost_export_dmabuf_fence(*fd); ++ ++ /* TODO: poll on the dma-buf? 
*/ ++ if (fence == -1) ++ continue; ++ ++ // TODO: What if we reach the limit for number of KCPU ++ // commands in a queue? It's pretty low (256) ++ dev->mali.kcpu_fence_import(&dev->mali, cs->base.ctx, ++ fence); ++ ++ close(fence); ++ } ++ ++ bool ret = dev->mali.kcpu_cqs_set(&dev->mali, cs->base.ctx, ++ cs->kcpu_event_ptr, kcpu_seqnum + 1); ++ ++ if (ret) { ++ /* If we don't set no_error, kbase might decide to ++ * pass on errors from waiting for fences. */ ++ pan_emit_cs_48(c, 0x42, cs->kcpu_event_ptr); ++ pan_emit_cs_64(c, 0x40, kcpu_seqnum); ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.no_error = true; ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ } ++ } ++ ++ /* Fragment jobs need to wait for the vertex job */ ++ if (fragment && !first) { ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x4e; ++ cfg.addr = 0x4c; ++ } ++ } ++ ++ if (vertex) { ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 3; } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 3; } ++ pan_pack_ins(c, CS_HEAPINC, cfg) { ++ cfg.type = MALI_HEAP_STATISTIC_V_T_START; ++ } ++ } else if (fragment) { ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 4; } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 4; } ++ } ++ ++ // copying to the main buffer can make debugging easier. ++ // TODO: This needs to be more reliable. ++#if 0 ++ unsigned length = (s.ptr - s.begin) * 8; ++ unsigned clamped = MIN2(length, cs->bo->ptr.cpu + cs->bo->size - (void *)c->ptr); ++ memcpy(c->ptr, s->begin, clamped); ++ c->ptr += clamped / 8; ++ ++ if (clamped != length) { ++ unsigned rest = length - clamped; ++ c->ptr = cs->bo->ptr.cpu; ++ memcpy(c->ptr, s->begin, rest); ++ c->ptr += rest / 8; ++ ++ cs->offset += cs->bo->size; ++ } ++#else ++ ++ pan_emit_cs_48(c, 0x48, s.gpu); ++ pan_emit_cs_32(c, 0x4a, (s.ptr - s.begin) * 8); ++ pan_pack_ins(c, CS_CALL, cfg) { cfg.address = 0x48; cfg.length = 0x4a; } ++#endif ++ ++ if (vertex) { ++ pan_pack_ins(c, CS_FLUSH_TILER, _) { } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 3; } ++ pan_pack_ins(c, CS_HEAPINC, cfg) { ++ cfg.type = MALI_HEAP_STATISTIC_V_T_END; ++ } ++ } ++ ++ if (fragment) { ++ /* Skip the next operation if the batch doesn't use a tiler ++ * heap (i.e. 
it's just a blit) */ ++ pan_emit_cs_ins(c, 22, 0x560030000001); /* b.ne w56, skip 1 */ ++ pan_emit_cs_ins(c, 22, 0x570020000007); /* b.eq w57, skip 7 */ ++ ++ pan_pack_ins(c, CS_LDR, cfg) { ++ cfg.offset = 4 * 10; /* Heap Start */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ pan_pack_ins(c, CS_LDR, cfg) { ++ cfg.offset = 4 * 12; /* Heap End */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4c; ++ } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = (1 << 0) | (1 << 3); } ++ ++ pan_pack_ins(c, CS_HEAPCLEAR, cfg) { ++ cfg.start = 0x4a; ++ cfg.end = 0x4c; ++ cfg.slots = 1 << 3; ++ } ++ ++ /* Reset the fields so that the clear operation isn't done again */ ++ pan_emit_cs_48(c, 0x4a, 0); ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.offset = 4 * 10; /* Heap Start */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.offset = 4 * 12; /* Heap End */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ ++ /* Branch target for above branch */ ++ ++ // This seems to be done by the HEAPCLEAR ++ //pan_pack_ins(c, CS_HEAPINC, cfg) { ++ // cfg.type = MALI_HEAP_STATISTIC_FRAGMENT_END; ++ //} ++ } ++ ++ if (fragment) { ++ pan_emit_cs_32(c, 0x54, 0); ++ pan_emit_cs_ins(c, 0x24, 0x2540000f80211); ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 1; } ++ } ++ ++ { ++ // This could I think be optimised to 0xf80211 rather than 0x233 ++ // TODO: Does this need to run for vertex jobs? ++ // What about when doing transform feedback? ++ // I think we at least need it for compute? ++ ++ //pan_emit_cs_32(c, 0x54, 0); ++ //pan_emit_cs_ins(c, 0x24, 0x540000000233ULL); ++ } ++ ++ if (last) { ++ uint64_t kcpu_seqnum = ++cs->kcpu_seqnum; ++ ++ pan_emit_cs_64(c, 0x40, kcpu_seqnum + 1); ++ pan_emit_cs_48(c, 0x42, cs->kcpu_event_ptr); ++ pan_pack_ins(c, CS_EVSTR_64, cfg) { ++ /* This is the scoreboard mask, right?.. */ ++ cfg.unk_2 = (3 << 3); ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ ++ dev->mali.kcpu_cqs_wait(&dev->mali, cs->base.ctx, ++ cs->kcpu_event_ptr, kcpu_seqnum); ++ ++ int fence = dev->mali.kcpu_fence_export(&dev->mali, cs->base.ctx); ++ ++ if (fence != -1) { ++ util_dynarray_foreach(&batch->dmabufs, int, fd) { ++ panfrost_import_dmabuf_fence(*fd, fence); ++ } ++ } ++ ++ close(fence); ++ } ++ ++ pan_emit_cs_48(c, 0x48, cs->event_ptr); ++ pan_emit_cs_64(c, 0x4a, cs->seqnum + 1); ++ pan_pack_ins(c, CS_EVSTR_64, cfg) { ++ /* This is the scoreboard mask, right?.. */ ++ cfg.unk_2 = (3 << 3); ++ cfg.value = 0x4a; ++ cfg.addr = 0x48; ++ } ++ ++ // TODO: is this just a weird ddk thing, or is it required? ++ // Probably it just lessens the WC impact ++ while ((uintptr_t)c->ptr & 63) ++ pan_emit_cs_ins(c, 0, 0); ++ ++ assert(c->ptr <= limit); ++} ++ ++static void ++emit_csf_toplevel(struct panfrost_batch *batch) ++{ ++ pan_command_stream *cv = &batch->ctx->kbase_cs_vertex.cs; ++ pan_command_stream *cf = &batch->ctx->kbase_cs_fragment.cs; ++ ++ pan_command_stream v = batch->cs_vertex; ++ pan_command_stream f = batch->cs_fragment; ++ ++ if (batch->cs_vertex_last_size) { ++ assert(v.ptr <= v.end); ++ *batch->cs_vertex_last_size = (v.ptr - v.begin) * 8; ++ v = batch->cs_vertex_first; ++ } ++ ++ bool vert = (v.ptr != v.begin); ++ bool frag = (f.ptr != f.begin); ++ ++ // TODO: Clean up control-flow? 
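The ring handling in panfrost_cs_ring_allocate_instrs above comes down to: when a reservation would run past the end of the buffer, fill the tail with NOP (zero) instructions, wrap the write pointer back to the start, and bump a running offset so positions derived from it keep increasing; panfrost_batch_submit_csf later computes the submission offset as cs->offset plus the write pointer's distance from the start of the BO. Below is a minimal stand-alone model of that scheme, assuming 8-byte instructions in a CPU-visible buffer; the struct and function names are illustrative, not the driver's.

#include <assert.h>
#include <stdint.h>
#include <string.h>

struct cs_ring {
   uint64_t *begin, *end, *ptr;  /* instructions live in [begin, end) */
   uint64_t offset;              /* grows by the ring size at every wrap */
};

/* Reserve `count` instructions, wrapping to the start of the ring if the
 * request would not fit before `end`. Returns the first reserved slot. */
static uint64_t *
cs_ring_reserve(struct cs_ring *r, unsigned count)
{
   if (r->ptr + count > r->end) {
      /* Must not catch up with instructions the GPU has not executed yet;
       * the real code still has a TODO to check the extract offset. */
      assert(r->begin + count <= r->ptr);

      /* Zero (NOP) the tail and start again from the beginning. */
      memset(r->ptr, 0, (r->end - r->ptr) * sizeof(*r->ptr));
      r->offset += (r->end - r->begin) * sizeof(*r->ptr);
      r->ptr = r->begin;
   }

   uint64_t *out = r->ptr;
   r->ptr += count;
   return out;
}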
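panfrost_export_dmabuf_fence and panfrost_import_dmabuf_fence above bridge implicit dma-buf fencing and the CSF queues through the sync_file export/import ioctls (the patch includes its own dma-uapi/dma-buf.h at the top of pan_cmdstream.c, presumably so the build does not depend on recent kernel headers). For the "TODO: poll on the dma-buf?" note, one option is to export a sync_file and poll() it, since a sync_file fd reports POLLIN once every fence it contains has signalled. A hedged sketch, assuming <linux/dma-buf.h> defines DMA_BUF_IOCTL_EXPORT_SYNC_FILE; dmabuf_wait_idle is a hypothetical helper, not part of the patch:

#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dma-buf.h>

/* Wait (with a timeout) for all pending GPU access to a dma-buf by
 * exporting its implicit fences as a sync_file and polling that fd.
 * Returns 0 when idle, -1 on error or timeout. Illustrative only. */
static int
dmabuf_wait_idle(int dmabuf_fd, int timeout_ms)
{
   struct dma_buf_export_sync_file exp = { .flags = DMA_BUF_SYNC_RW };
   int ret;

   do {
      ret = ioctl(dmabuf_fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &exp);
   } while (ret == -1 && (errno == EINTR || errno == EAGAIN));

   if (ret == -1) {
      fprintf(stderr, "EXPORT_SYNC_FILE: %s\n", strerror(errno));
      return -1;
   }

   struct pollfd p = { .fd = exp.fd, .events = POLLIN };
   do {
      ret = poll(&p, 1, timeout_ms);
   } while (ret == -1 && errno == EINTR);

   close(exp.fd);
   return ret > 0 ? 0 : -1;
}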
++ ++ if (vert) { ++ pan_emit_cs_48(cv, 0x48, batch->ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(cv, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ ++ emit_csf_queue(batch, &batch->ctx->kbase_cs_vertex, v, ++ &batch->vert_deps, true, !frag); ++ } ++ ++ if (!frag) ++ return; ++ ++ pan_emit_cs_48(cf, 0x48, batch->ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(cf, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ ++ uint64_t vertex_seqnum = batch->ctx->kbase_cs_vertex.seqnum; ++ // TODO: this assumes SAME_VA ++ mali_ptr seqnum_ptr = (uintptr_t) batch->ctx->kbase_cs_vertex.event_ptr; ++ ++ pan_emit_cs_48(cf, 0x4c, seqnum_ptr); ++ pan_emit_cs_64(cf, 0x4e, vertex_seqnum); ++ ++ // What does this instruction do? ++ //pan_emit_cs_32(cf, 0x54, 0); ++ //pan_emit_cs_ins(cf, 0x24, 0x540000000200); ++ ++ assert(vert || batch->tiler_ctx.bifrost == 0); ++ pan_emit_cs_48(cf, 0x56, batch->tiler_ctx.bifrost); ++ ++ emit_csf_queue(batch, &batch->ctx->kbase_cs_fragment, f, ++ &batch->frag_deps, !vert, true); ++} ++ ++static void ++init_cs(struct panfrost_context *ctx, struct panfrost_cs *cs) ++{ ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ pan_command_stream *c = &cs->cs; ++ ++ cs->seqnum = 0; ++ ++ cs->offset = 0; ++ c->ptr = cs->bo->ptr.cpu; ++ c->begin = cs->bo->ptr.cpu; ++ c->end = cs->bo->ptr.cpu + cs->base.size; ++ c->gpu = cs->bo->ptr.gpu; ++ ++ // eight instructions == 64 bytes ++ pan_pack_ins(c, CS_RESOURCES, cfg) { cfg.mask = cs->hw_resources; } ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 2; } ++ pan_emit_cs_48(c, 0x48, ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(c, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ for (unsigned i = 0; i < 4; ++i) ++ pan_pack_ins(c, CS_NOP, _); ++ ++ dev->mali.cs_submit(&dev->mali, &cs->base, 64, NULL, 0); ++ //dev->mali.cs_wait(&dev->mali, &cs->base, 64); ++} ++ ++#endif ++ + #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c; + + static uint8_t +@@ -2904,14 +3315,14 @@ panfrost_draw_emit_vertex(struct panfrost_batch *batch, + #endif + + static void +-panfrost_emit_primitive_size(struct panfrost_context *ctx, ++panfrost_emit_primitive_size(struct panfrost_batch *batch, + bool points, mali_ptr size_array, + void *prim_size) + { +- struct panfrost_rasterizer *rast = ctx->rasterizer; ++ struct panfrost_rasterizer *rast = batch->ctx->rasterizer; + +- pan_pack(prim_size, PRIMITIVE_SIZE, cfg) { +- if (panfrost_writes_point_size(ctx)) { ++ pan_pack_cs_v10(prim_size, &batch->cs_vertex, PRIMITIVE_SIZE, cfg) { ++ if (panfrost_writes_point_size(batch->ctx)) { + cfg.size_array = size_array; + } else { + cfg.constant = points ? 
+@@ -3037,6 +3448,43 @@ panfrost_update_state_3d(struct panfrost_batch *batch) + } + + #if PAN_ARCH >= 6 ++ ++#if PAN_ARCH >= 10 ++static mali_ptr ++panfrost_get_tiler_heap_desc(struct panfrost_batch *batch) ++{ ++ struct panfrost_context *ctx = batch->ctx; ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ ++ if (ctx->tiler_heap_desc) ++ return ctx->tiler_heap_desc->ptr.gpu; ++ ++ ctx->tiler_heap_desc = panfrost_bo_create(dev, 4096, 0, "Tiler heap descriptor"); ++ ++ pan_pack(ctx->tiler_heap_desc->ptr.cpu, TILER_HEAP, heap) { ++ heap.size = ctx->kbase_ctx->tiler_heap_chunk_size; ++ heap.base = ctx->kbase_ctx->tiler_heap_header; ++ heap.bottom = heap.base + 64; ++ heap.top = heap.base + heap.size; ++ } ++ ++ return ctx->tiler_heap_desc->ptr.gpu; ++} ++#else ++static mali_ptr ++panfrost_get_tiler_heap_desc(struct panfrost_batch *batch) ++{ ++ struct panfrost_device *dev = pan_device(batch->ctx->base.screen); ++ ++ struct panfrost_ptr t = ++ pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); ++ ++ GENX(pan_emit_tiler_heap)(dev, t.cpu); ++ ++ return t.gpu; ++} ++#endif ++ + static mali_ptr + panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count) + { +@@ -3048,18 +3496,32 @@ panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_c + if (batch->tiler_ctx.bifrost) + return batch->tiler_ctx.bifrost; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); ++ mali_ptr heap = panfrost_get_tiler_heap_desc(batch); + +- GENX(pan_emit_tiler_heap)(dev, t.cpu); ++ mali_ptr scratch = 0; ++ ++#if PAN_ARCH >= 10 ++ // TODO: Dynamically size? ++ unsigned scratch_bits = 16; ++ ++ /* Allocate scratch space for vertex positions / point sizes */ ++ // TODO: Should this be shared? ++ struct panfrost_ptr sc = ++ pan_pool_alloc_aligned(&batch->pool.base, 1 << scratch_bits, 4096); ++ ++ /* I think the scratch size is passed in the low bits of the ++ * pointer... but trying to go above 16 gives a CS_INHERIT_FAULT. 
++ */ ++ scratch = sc.gpu + scratch_bits; ++#endif + +- mali_ptr heap = t.gpu; ++ struct panfrost_ptr t = ++ pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); + +- t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); + GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height, + util_framebuffer_get_num_samples(&batch->key), + pan_tristate_get(batch->first_provoking_vertex), +- heap, t.cpu); ++ heap, scratch, t.cpu); + + batch->tiler_ctx.bifrost = t.gpu; + return batch->tiler_ctx.bifrost; +@@ -3070,18 +3532,19 @@ panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_c + * jobs and Valhall IDVS jobs + */ + static void +-panfrost_emit_primitive(struct panfrost_context *ctx, ++panfrost_emit_primitive(struct panfrost_batch *batch, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw, + mali_ptr indices, bool secondary_shader, void *out) + { +- UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; ++ struct panfrost_context *ctx = batch->ctx; ++ struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + + bool lines = (info->mode == PIPE_PRIM_LINES || + info->mode == PIPE_PRIM_LINE_LOOP || + info->mode == PIPE_PRIM_LINE_STRIP); + +- pan_pack(out, PRIMITIVE, cfg) { ++ pan_pack_cs_v10(out, &batch->cs_vertex, PRIMITIVE, cfg) { + cfg.draw_mode = pan_draw_mode(info->mode); + if (panfrost_writes_point_size(ctx)) + cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16; +@@ -3113,12 +3576,20 @@ panfrost_emit_primitive(struct panfrost_context *ctx, + + /* Non-fixed restart indices should have been lowered */ + assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info)); ++ ++ /* TODO: This is in a hot function, optimise? */ ++ if (ctx->pipe_viewport.scale[2] > 0) { ++ cfg.low_depth_cull = rast->depth_clip_near; ++ cfg.high_depth_cull = rast->depth_clip_far; ++ } else { ++ cfg.low_depth_cull = rast->depth_clip_far; ++ cfg.high_depth_cull = rast->depth_clip_near; ++ } + #endif + + cfg.index_count = ctx->indirect_draw ? 1 : draw->count; + cfg.index_type = panfrost_translate_index_size(info->index_size); + +- + if (PAN_ARCH >= 9) { + /* Base vertex offset on Valhall is used for both + * indexed and non-indexed draws, in a simple way for +@@ -3240,7 +3711,7 @@ panfrost_emit_draw(void *out, + struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + bool polygon = (prim == PIPE_PRIM_TRIANGLES); + +- pan_pack(out, DRAW, cfg) { ++ pan_pack_cs_v10(out, &batch->cs_vertex, DRAW, cfg) { + /* + * From the Gallium documentation, + * pipe_rasterizer_state::cull_face "indicates which faces of +@@ -3270,6 +3741,7 @@ panfrost_emit_draw(void *out, + ctx->prog[PIPE_SHADER_FRAGMENT]; + + cfg.multisample_enable = rast->multisample; ++ + cfg.sample_mask = rast->multisample ? 
ctx->sample_mask : 0xFFFF; + + /* Use per-sample shading if required by API Also use it when a +@@ -3283,7 +3755,10 @@ panfrost_emit_draw(void *out, + + cfg.single_sampled_lines = !rast->multisample; + ++ /* This is filled in by hardware on v10 */ ++#if PAN_ARCH < 10 + cfg.vertex_array.packet = true; ++#endif + + cfg.minimum_z = batch->minimum_z; + cfg.maximum_z = batch->maximum_z; +@@ -3411,14 +3886,18 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + */ + secondary_shader &= fs_required; + +- panfrost_emit_primitive(ctx, info, draw, 0, secondary_shader, ++#if PAN_ARCH < 10 ++ panfrost_emit_primitive(batch, info, draw, 0, secondary_shader, + pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE)); ++#else ++ panfrost_emit_primitive(batch, info, draw, 0, secondary_shader, job); ++#endif + +- pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { + cfg.count = info->instance_count; + } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { + if (secondary_shader) { + unsigned v = vs->info.varyings.output_count; + unsigned f = fs->info.varyings.input_count; +@@ -3427,34 +3906,45 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + unsigned size = slots * 16; + + /* Assumes 16 byte slots. We could do better. */ ++#if PAN_ARCH < 10 + cfg.vertex_packet_stride = size + 16; ++#endif + cfg.vertex_attribute_stride = size; + } else { + /* Hardware requirement for "no varyings" */ ++#if PAN_ARCH < 10 + cfg.vertex_packet_stride = 16; ++#endif + cfg.vertex_attribute_stride = 0; + } + } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, TILER, cfg) { + cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); + } + ++ /* For v10, the scissor is emitted directly by ++ * panfrost_emit_viewport */ ++#if PAN_ARCH < 10 + STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR)); + memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), + &batch->scissor, pan_size(SCISSOR)); ++#endif + +- panfrost_emit_primitive_size(ctx, info->mode == PIPE_PRIM_POINTS, 0, ++ panfrost_emit_primitive_size(batch, info->mode == PIPE_PRIM_POINTS, 0, + pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE)); + +- pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, INDICES, cfg) { + cfg.address = indices; ++#if PAN_ARCH >= 10 ++ cfg.size = draw->count * info->index_size; ++#endif + } + + panfrost_emit_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), + batch, fs_required, u_reduced_prim(info->mode), 0, 0, 0); + +- pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, POSITION, cfg) { + /* IDVS/points vertex shader */ + mali_ptr vs_ptr = batch->rsd[PIPE_SHADER_VERTEX]; + +@@ -3464,20 +3954,21 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + + panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, vs_ptr, + batch->tls.gpu); +- } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) { +- /* If a varying shader is used, we configure it with the same +- * state as the position shader for backwards compatible +- * behaviour with Bifrost. This could be optimized. 
+- */ +- if (!secondary_shader) continue; ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, VARYING, vary) { ++ /* If a varying shader is used, we configure it with the same ++ * state as the position shader for backwards compatible ++ * behaviour with Bifrost. This could be optimized. ++ */ ++ if (!secondary_shader) continue; + +- mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] + ++ mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] + + (2 * pan_size(SHADER_PROGRAM)); + +- panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, +- ptr, batch->tls.gpu); ++ vary.shader = ptr; ++ ++ // TODO: Fix this function for v9! ++ } + } + } + #endif +@@ -3492,12 +3983,10 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch, + mali_ptr pos, mali_ptr psiz, bool secondary_shader, + void *job) + { +- struct panfrost_context *ctx = batch->ctx; +- + void *section = pan_section_ptr(job, TILER_JOB, INVOCATION); + memcpy(section, invocation_template, pan_size(INVOCATION)); + +- panfrost_emit_primitive(ctx, info, draw, indices, secondary_shader, ++ panfrost_emit_primitive(batch, info, draw, indices, secondary_shader, + pan_section_ptr(job, TILER_JOB, PRIMITIVE)); + + void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE); +@@ -3514,7 +4003,7 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch, + panfrost_emit_draw(pan_section_ptr(job, TILER_JOB, DRAW), + batch, true, prim, pos, fs_vary, varyings); + +- panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size); ++ panfrost_emit_primitive_size(batch, prim == PIPE_PRIM_POINTS, psiz, prim_size); + } + #endif + +@@ -3526,8 +4015,8 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + { + struct panfrost_context *ctx = batch->ctx; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); ++ UNUSED struct panfrost_ptr t = ++ pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); + + /* Nothing to do */ + if (batch->ctx->streamout.num_targets == 0) +@@ -3556,7 +4045,7 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->rsd[PIPE_SHADER_VERTEX] = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX); + + #if PAN_ARCH >= 9 +- pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { ++ pan_section_pack_cs_v10(t.cpu, &batch->cs_vertex, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; +@@ -3569,15 +4058,20 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->rsd[PIPE_SHADER_VERTEX], + batch->tls.gpu); + ++#if PAN_ARCH < 10 + /* TODO: Indexing. Also, this is a legacy feature... */ + cfg.compute.attribute_offset = batch->ctx->offset_start; ++#endif + + /* Transform feedback shaders do not use barriers or shared + * memory, so we may merge workgroups. + */ + cfg.allow_merging_workgroups = true; ++ ++#if PAN_ARCH < 10 + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; ++#endif + } + #else + struct mali_invocation_packed invocation; +@@ -3593,12 +4087,20 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0, + attribs, attrib_bufs, t.cpu); + #endif ++#if PAN_ARCH >= 10 ++ // TODO: Use a seperate compute queue? 
++ pan_pack_ins(&batch->cs_vertex, COMPUTE_LAUNCH, cfg) { ++ // TODO v10: Set parameters ++ } ++ batch->scoreboard.first_job = 1; ++#else + enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE; + #if PAN_ARCH <= 5 + job_type = MALI_JOB_TYPE_VERTEX; + #endif + panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type, + true, false, 0, 0, &t, false); ++#endif + + ctx->uncompiled[PIPE_SHADER_VERTEX] = vs_uncompiled; + ctx->prog[PIPE_SHADER_VERTEX] = vs; +@@ -3607,6 +4109,54 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push; + } + ++#if PAN_ARCH >= 10 ++static pan_command_stream ++panfrost_batch_create_cs(struct panfrost_batch *batch, unsigned count) ++{ ++ struct panfrost_ptr cs = pan_pool_alloc_aligned(&batch->pool.base, count * 8, 64); ++ ++ return (pan_command_stream) { ++ .ptr = cs.cpu, ++ .begin = cs.cpu, ++ .end = cs.cpu + count, ++ .gpu = cs.gpu, ++ }; ++} ++ ++static uint64_t * ++panfrost_cs_vertex_allocate_instrs(struct panfrost_batch *batch, unsigned count) ++{ ++ /* Doing a tail call to another buffer takes three instructions */ ++ count += 3; ++ ++ pan_command_stream v = batch->cs_vertex; ++ ++ if (v.ptr + count > v.end) { ++ batch->cs_vertex = panfrost_batch_create_cs(batch, MAX2(count, 1 << 13)); ++ ++ /* The size will be filled in later. */ ++ uint32_t *last_size = (uint32_t *)v.ptr; ++ pan_emit_cs_32(&v, 0x5e, 0); ++ ++ pan_emit_cs_48(&v, 0x5c, batch->cs_vertex.gpu); ++ pan_pack_ins(&v, CS_TAILCALL, cfg) { cfg.address = 0x5c; cfg.length = 0x5e; } ++ ++ assert(v.ptr <= v.end); ++ ++ /* This is not strictly required, but makes disassembly look ++ * nicer */ ++ if (batch->cs_vertex_last_size) ++ *batch->cs_vertex_last_size = (v.ptr - v.begin) * 8; ++ ++ batch->cs_vertex_last_size = last_size; ++ if (!batch->cs_vertex_first.gpu) ++ batch->cs_vertex_first = v; ++ } ++ ++ return batch->cs_vertex.ptr + count; ++} ++#endif ++ + static void + panfrost_direct_draw(struct panfrost_batch *batch, + const struct pipe_draw_info *info, +@@ -3618,6 +4168,11 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + struct panfrost_context *ctx = batch->ctx; + ++#if PAN_ARCH >= 10 ++ /* TODO: We don't need quite so much space */ ++ uint64_t *limit = panfrost_cs_vertex_allocate_instrs(batch, 64); ++#endif ++ + /* If we change whether we're drawing points, or whether point sprites + * are enabled (specified in the rasterizer), we may need to rebind + * shaders accordingly. 
This implicitly covers the case of rebinding +@@ -3647,18 +4202,19 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + UNUSED struct panfrost_ptr tiler, vertex; + +- if (idvs) { + #if PAN_ARCH >= 9 +- tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB); +-#elif PAN_ARCH >= 6 ++ tiler = pan_pool_alloc_desc_cs_v10(&batch->pool.base, MALLOC_VERTEX_JOB); ++#else /* PAN_ARCH < 9 */ ++ if (idvs) { ++#if PAN_ARCH >= 6 + tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB); +-#else +- unreachable("IDVS is unsupported on Midgard"); + #endif ++ unreachable("IDVS is unsupported on Midgard"); + } else { +- vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); +- tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB); ++ vertex = pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); ++ tiler = pan_pool_alloc_desc_cs_v10(&batch->pool.base, TILER_JOB); + } ++#endif /* PAN_ARCH */ + + unsigned vertex_count = ctx->vertex_count; + +@@ -3726,7 +4282,7 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + mali_ptr attribs, attrib_bufs; + attribs = panfrost_emit_vertex_data(batch, &attrib_bufs); +-#endif ++#endif /* PAN_ARCH <= 7 */ + + panfrost_update_state_3d(batch); + panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX); +@@ -3752,13 +4308,25 @@ panfrost_direct_draw(struct panfrost_batch *batch, + #if PAN_ARCH >= 9 + assert(idvs && "Memory allocated IDVS required on Valhall"); + +- panfrost_emit_malloc_vertex(batch, info, draw, indices, +- secondary_shader, tiler.cpu); ++ panfrost_emit_malloc_vertex(batch, info, draw, indices, secondary_shader, tiler.cpu); + ++#if PAN_ARCH >= 10 ++ pan_pack_ins(&batch->cs_vertex, IDVS_LAUNCH, _); ++ /* TODO: Find a better way to specify that there were jobs */ ++ batch->scoreboard.first_job = 1; ++ batch->scoreboard.first_tiler = NULL + 1; ++ ++ /* Make sure we didn't use more CS instructions than we allocated ++ * space for */ ++ assert(batch->cs_vertex.ptr <= limit); ++ ++#else /* PAN_ARCH < 10 */ + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0, + 0, &tiler, false); +-#else ++#endif ++#else /* PAN_ARCH < 9 */ ++ + /* Fire off the draw itself */ + panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices, + fs_vary, varyings, pos, psiz, secondary_shader, +@@ -3773,7 +4341,7 @@ panfrost_direct_draw(struct panfrost_batch *batch, + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_INDEXED_VERTEX, false, false, + 0, 0, &tiler, false); +-#endif ++#endif /* PAN_ARCH < 6 */ + } else { + panfrost_draw_emit_vertex(batch, info, &invocation, + vs_vary, varyings, attribs, attrib_bufs, vertex.cpu); +@@ -4102,8 +4670,8 @@ panfrost_launch_grid(struct pipe_context *pipe, + + ctx->compute_grid = info; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); ++ UNUSED struct panfrost_ptr t = ++ pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); + + /* Invoke according to the grid info */ + +@@ -4143,7 +4711,7 @@ panfrost_launch_grid(struct pipe_context *pipe, + #else + struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE]; + +- pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { ++ pan_section_pack_cs_v10(t.cpu, &batch->cs_vertex, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = info->block[0]; + cfg.workgroup_size_y = info->block[1]; + cfg.workgroup_size_z = info->block[2]; +@@ -4166,12 +4734,14 @@ panfrost_launch_grid(struct pipe_context *pipe, + cs->info.cs.allow_merging_workgroups && + 
(info->variable_shared_mem == 0); + ++#if PAN_ARCH < 10 + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; ++#endif + } + #endif + +- unsigned indirect_dep = 0; ++ UNUSED unsigned indirect_dep = 0; // TODO v10 (unused) + #if PAN_GPU_INDIRECTS + if (info->indirect) { + struct pan_indirect_dispatch_info indirect = { +@@ -4191,9 +4761,17 @@ panfrost_launch_grid(struct pipe_context *pipe, + } + #endif + ++#if PAN_ARCH >= 10 ++ pan_pack_ins(&batch->cs_vertex, COMPUTE_LAUNCH, cfg) { ++ /* TODO: Change this as needed */ ++ cfg.unk_1 = 512; ++ } ++ batch->scoreboard.first_job = 1; ++#else + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_COMPUTE, true, false, + indirect_dep, 0, &t, false); ++#endif + panfrost_flush_all_batches(ctx, "Launch grid post-barrier"); + } + +@@ -4453,6 +5031,30 @@ panfrost_create_sampler_view( + return (struct pipe_sampler_view *) so; + } + ++static void ++panfrost_init_logicop_blend_state(struct panfrost_blend_state *so) ++{ ++ for (unsigned c = 0; c < so->pan.rt_count; ++c) { ++ unsigned g = so->base.independent_blend_enable ? c : 0; ++ const struct pipe_rt_blend_state pipe = so->base.rt[g]; ++ ++ struct pan_blend_equation equation = {0}; ++ ++ equation.color_mask = pipe.colormask; ++ equation.blend_enable = false; ++ ++ so->info[c] = (struct pan_blend_info) { ++ .enabled = (pipe.colormask != 0), ++ .load_dest = true, ++ .fixed_function = false, ++ }; ++ ++ so->pan.rts[c].equation = equation; ++ ++ so->load_dest_mask |= BITFIELD_BIT(c); ++ } ++} ++ + /* A given Gallium blend state can be encoded to the hardware in numerous, + * dramatically divergent ways due to the interactions of blending with + * framebuffer formats. Conceptually, there are two modes: +@@ -4492,6 +5094,11 @@ panfrost_create_blend_state(struct pipe_context *pipe, + so->pan.logicop_func = blend->logicop_func; + so->pan.rt_count = blend->max_rt + 1; + ++ if (blend->logicop_enable) { ++ panfrost_init_logicop_blend_state(so); ++ return so; ++ } ++ + for (unsigned c = 0; c < so->pan.rt_count; ++c) { + unsigned g = blend->independent_blend_enable ? c : 0; + const struct pipe_rt_blend_state pipe = blend->rt[g]; +@@ -4521,12 +5128,10 @@ panfrost_create_blend_state(struct pipe_context *pipe, + .opaque = pan_blend_is_opaque(equation), + .constant_mask = constant_mask, + +- /* TODO: check the dest for the logicop */ +- .load_dest = blend->logicop_enable || +- pan_blend_reads_dest(equation), ++ .load_dest = pan_blend_reads_dest(equation), + + /* Could this possibly be fixed-function? */ +- .fixed_function = !blend->logicop_enable && ++ .fixed_function = + pan_blend_can_fixed_function(equation, + supports_2src) && + (!constant_mask || +@@ -4612,10 +5217,12 @@ prepare_shader(struct panfrost_compiled_shader *state, + + state->state = panfrost_pool_take_ref(pool, ptr.gpu); + ++ // TODO: Why set primary_shader to false again? 
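The logic-op path added above (panfrost_init_logicop_blend_state plus the early return in panfrost_create_blend_state) replaces the old "load_dest = blend->logicop_enable || ..." special-casing: with a logic op bound, blending is never fixed-function and the destination tile is always read back, with only the colour mask deciding whether the render target is written at all. A condensed sketch of that per-render-target decision, with simplified types and illustrative names:

#include <stdbool.h>
#include <stdint.h>

struct rt_blend_info {
   bool enabled;        /* is anything written to this render target? */
   bool load_dest;      /* does the shader have to read back the tile? */
   bool fixed_function; /* can the blend unit handle it on its own? */
};

/* Logic ops combine the fragment and the destination bitwise, so the
 * destination is always loaded and the blend equation stays disabled. */
static struct rt_blend_info
logicop_rt_info(uint8_t colormask)
{
   return (struct rt_blend_info) {
      .enabled = colormask != 0,
      .load_dest = true,
      .fixed_function = false,
   };
}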
++ + /* Generic, or IDVS/points */ + pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) { + cfg.stage = pan_shader_stage(&state->info); +- cfg.primary_shader = true; ++ cfg.primary_shader = false; + cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); + cfg.binary = state->bin.gpu; + cfg.preload.r48_r63 = (state->info.preload >> 48); +@@ -4631,7 +5238,7 @@ prepare_shader(struct panfrost_compiled_shader *state, + /* IDVS/triangles */ + pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) { + cfg.stage = pan_shader_stage(&state->info); +- cfg.primary_shader = true; ++ cfg.primary_shader = false; + cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); + cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset; + cfg.preload.r48_r63 = (state->info.preload >> 48); +@@ -4707,6 +5314,11 @@ init_batch(struct panfrost_batch *batch) + /* On Midgard, the TLS is embedded in the FB descriptor */ + batch->tls = batch->framebuffer; + #endif ++ ++#if PAN_ARCH >= 10 ++ batch->cs_vertex = panfrost_batch_create_cs(batch, 1 << 13); ++ batch->cs_fragment = panfrost_batch_create_cs(batch, 1 << 9); ++#endif + } + + static void +@@ -4821,6 +5433,10 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen) + screen->vtbl.init_polygon_list = init_polygon_list; + screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options); + screen->vtbl.compile_shader = GENX(pan_shader_compile); ++#if PAN_ARCH >= 10 ++ screen->vtbl.emit_csf_toplevel = emit_csf_toplevel; ++ screen->vtbl.init_cs = init_cs; ++#endif + + GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base, + &screen->blitter.desc_pool.base); +diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c +index 80a39a3a220..7b0f021bf47 100644 +--- a/src/gallium/drivers/panfrost/pan_context.c ++++ b/src/gallium/drivers/panfrost/pan_context.c +@@ -34,7 +34,6 @@ + + #include "util/macros.h" + #include "util/format/u_format.h" +-#include "util/libsync.h" + #include "util/u_inlines.h" + #include "util/u_upload_mgr.h" + #include "util/u_memory.h" +@@ -571,6 +570,19 @@ panfrost_destroy(struct pipe_context *pipe) + struct panfrost_context *panfrost = pan_context(pipe); + struct panfrost_device *dev = pan_device(pipe->screen); + ++ if (dev->kbase && dev->mali.context_create) { ++ dev->mali.cs_term(&dev->mali, &panfrost->kbase_cs_vertex.base); ++ dev->mali.cs_term(&dev->mali, &panfrost->kbase_cs_fragment.base); ++ ++ dev->mali.context_destroy(&dev->mali, panfrost->kbase_ctx); ++ ++ panfrost_bo_unreference(panfrost->kbase_cs_vertex.bo); ++ panfrost_bo_unreference(panfrost->kbase_cs_fragment.bo); ++ } ++ ++ if (panfrost->tiler_heap_desc) ++ panfrost_bo_unreference(panfrost->tiler_heap_desc); ++ + _mesa_hash_table_destroy(panfrost->writers, NULL); + + if (panfrost->blitter) +@@ -582,11 +594,15 @@ panfrost_destroy(struct pipe_context *pipe) + panfrost_pool_cleanup(&panfrost->descs); + panfrost_pool_cleanup(&panfrost->shaders); + +- drmSyncobjDestroy(dev->fd, panfrost->in_sync_obj); +- if (panfrost->in_sync_fd != -1) +- close(panfrost->in_sync_fd); ++ if (dev->kbase) { ++ dev->mali.syncobj_destroy(&dev->mali, panfrost->syncobj_kbase); ++ } else { ++ drmSyncobjDestroy(dev->fd, panfrost->in_sync_obj); ++ if (panfrost->in_sync_fd != -1) ++ close(panfrost->in_sync_fd); + +- drmSyncobjDestroy(dev->fd, panfrost->syncobj); ++ drmSyncobjDestroy(dev->fd, panfrost->syncobj); ++ } + ralloc_free(pipe); + } + +@@ -873,6 +889,58 @@ panfrost_create_fence_fd(struct 
pipe_context *pctx, + *pfence = panfrost_fence_from_fd(pan_context(pctx), fd, type); + } + ++struct sync_merge_data { ++ char name[32]; ++ int32_t fd2; ++ int32_t fence; ++ uint32_t flags; ++ uint32_t pad; ++}; ++ ++#define SYNC_IOC_MAGIC '>' ++#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data) ++ ++static inline int sync_merge(const char *name, int fd1, int fd2) ++{ ++ struct sync_merge_data data = {{0}}; ++ int ret; ++ ++ data.fd2 = fd2; ++ strncpy(data.name, name, sizeof(data.name)); ++ ++ do { ++ ret = ioctl(fd1, SYNC_IOC_MERGE, &data); ++ } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); ++ ++ if (ret < 0) ++ return ret; ++ ++ return data.fence; ++} ++ ++static inline int sync_accumulate(const char *name, int *fd1, int fd2) ++{ ++ int ret; ++ ++ assert(fd2 >= 0); ++ ++ if (*fd1 < 0) { ++ *fd1 = dup(fd2); ++ return 0; ++ } ++ ++ ret = sync_merge(name, *fd1, fd2); ++ if (ret < 0) { ++ /* leave *fd1 as it is */ ++ return ret; ++ } ++ ++ close(*fd1); ++ *fd1 = ret; ++ ++ return 0; ++} ++ + static void + panfrost_fence_server_sync(struct pipe_context *pctx, + struct pipe_fence_handle *f) +@@ -888,6 +956,28 @@ panfrost_fence_server_sync(struct pipe_context *pctx, + close(fd); + } + ++static struct panfrost_cs ++panfrost_cs_create(struct panfrost_context *ctx, unsigned size, unsigned mask) ++{ ++ struct panfrost_screen *screen = pan_screen(ctx->base.screen); ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ struct kbase_context *kctx = ctx->kbase_ctx; ++ ++ struct panfrost_cs c = {0}; ++ ++ c.bo = panfrost_bo_create(dev, size, 0, "Command stream"); ++ ++ c.base = dev->mali.cs_bind(&dev->mali, kctx, c.bo->ptr.gpu, size); ++ ++ c.event_ptr = dev->mali.event_mem.gpu + c.base.event_mem_offset * PAN_EVENT_SIZE; ++ c.kcpu_event_ptr = dev->mali.kcpu_event_mem.gpu + c.base.event_mem_offset * PAN_EVENT_SIZE; ++ ++ c.hw_resources = mask; ++ screen->vtbl.init_cs(ctx, &c); ++ ++ return c; ++} ++ + struct pipe_context * + panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + { +@@ -981,6 +1071,14 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + + assert(ctx->blitter); + ++ if (dev->kbase && dev->mali.context_create) ++ ctx->kbase_ctx = dev->mali.context_create(&dev->mali); ++ ++ if (dev->arch >= 10) { ++ ctx->kbase_cs_vertex = panfrost_cs_create(ctx, 65536, 13); ++ ctx->kbase_cs_fragment = panfrost_cs_create(ctx, 65536, 2); ++ } ++ + /* Prepare for render! */ + + /* By default mask everything on */ +@@ -992,13 +1090,18 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + /* Create a syncobj in a signaled state. Will be updated to point to the + * last queued job out_sync every time we submit a new job. + */ +- ret = drmSyncobjCreate(dev->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &ctx->syncobj); +- assert(!ret && ctx->syncobj); +- +- /* Sync object/FD used for NATIVE_FENCE_FD. */ +- ctx->in_sync_fd = -1; +- ret = drmSyncobjCreate(dev->fd, 0, &ctx->in_sync_obj); +- assert(!ret); ++ if (dev->kbase) { ++ ctx->syncobj_kbase = dev->mali.syncobj_create(&dev->mali); ++ ctx->in_sync_fd = -1; ++ } else { ++ ret = drmSyncobjCreate(dev->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &ctx->syncobj); ++ assert(!ret && ctx->syncobj); ++ ++ /* Sync object/FD used for NATIVE_FENCE_FD. 
*/ ++ ctx->in_sync_fd = -1; ++ ret = drmSyncobjCreate(dev->fd, 0, &ctx->in_sync_obj); ++ assert(!ret); ++ } + + return gallium; + } +diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h +index 37c0f6fc099..197f5641362 100644 +--- a/src/gallium/drivers/panfrost/pan_context.h ++++ b/src/gallium/drivers/panfrost/pan_context.h +@@ -117,6 +117,19 @@ struct panfrost_streamout { + unsigned num_targets; + }; + ++// TODO: This struct is a mess ++struct panfrost_cs { ++ struct kbase_cs base; ++ struct panfrost_bo *bo; ++ pan_command_stream cs; ++ mali_ptr event_ptr; ++ uint64_t seqnum; ++ mali_ptr kcpu_event_ptr; ++ uint64_t kcpu_seqnum; ++ uint64_t offset; ++ unsigned hw_resources; ++}; ++ + struct panfrost_context { + /* Gallium context */ + struct pipe_context base; +@@ -132,6 +145,7 @@ struct panfrost_context { + + /* Sync obj used to keep track of in-flight jobs. */ + uint32_t syncobj; ++ struct kbase_syncobj *syncobj_kbase; + + /* Set of 32 batches. When the set is full, the LRU entry (the batch + * with the smallest seqnum) is flushed to free a slot. +@@ -229,6 +243,12 @@ struct panfrost_context { + + int in_sync_fd; + uint32_t in_sync_obj; ++ ++ struct kbase_context *kbase_ctx; ++ struct panfrost_bo *event_bo; ++ struct panfrost_cs kbase_cs_vertex; ++ struct panfrost_cs kbase_cs_fragment; ++ struct panfrost_bo *tiler_heap_desc; + }; + + /* Corresponds to the CSO */ +diff --git a/src/gallium/drivers/panfrost/pan_disk_cache.c b/src/gallium/drivers/panfrost/pan_disk_cache.c +index e00053aad44..e1ad57ce3e8 100644 +--- a/src/gallium/drivers/panfrost/pan_disk_cache.c ++++ b/src/gallium/drivers/panfrost/pan_disk_cache.c +@@ -34,7 +34,9 @@ + + #include "pan_context.h" + ++#ifdef ENABLE_SHADER_CACHE + static bool debug = false; ++#endif + + extern int midgard_debug; + extern int bifrost_debug; +@@ -141,6 +143,8 @@ panfrost_disk_cache_retrieve(struct disk_cache *cache, + blob_copy_bytes(&blob, ptr, binary_size); + blob_copy_bytes(&blob, &binary->info, sizeof(binary->info)); + ++ free(buffer); ++ + return true; + #else + return false; +@@ -156,11 +160,7 @@ panfrost_disk_cache_init(struct panfrost_screen *screen) + #ifdef ENABLE_SHADER_CACHE + const char *renderer = screen->base.get_name(&screen->base); + +- const struct build_id_note *note = +- build_id_find_nhdr_for_addr(panfrost_disk_cache_init); +- assert(note && build_id_length(note) == 20); /* sha1 */ +- +- const uint8_t *id_sha1 = build_id_data(note); ++ const uint8_t *id_sha1 = "1"; + assert(id_sha1); + + char timestamp[41]; +diff --git a/src/gallium/drivers/panfrost/pan_fence.c b/src/gallium/drivers/panfrost/pan_fence.c +index 655644ec495..f989269978c 100644 +--- a/src/gallium/drivers/panfrost/pan_fence.c ++++ b/src/gallium/drivers/panfrost/pan_fence.c +@@ -42,7 +42,10 @@ panfrost_fence_reference(struct pipe_screen *pscreen, + struct pipe_fence_handle *old = *ptr; + + if (pipe_reference(&old->reference, &fence->reference)) { +- drmSyncobjDestroy(dev->fd, old->syncobj); ++ if (dev->kbase) ++ dev->mali.syncobj_destroy(&dev->mali, old->kbase); ++ else ++ drmSyncobjDestroy(dev->fd, old->syncobj); + free(old); + } + +@@ -65,6 +68,13 @@ panfrost_fence_finish(struct pipe_screen *pscreen, + if (abs_timeout == OS_TIMEOUT_INFINITE) + abs_timeout = INT64_MAX; + ++ if (dev->kbase) { ++ /* TODO: Use the timeout */ ++ bool ret = dev->mali.syncobj_wait(&dev->mali, fence->kbase); ++ fence->signaled = ret; ++ return ret; ++ } ++ + ret = drmSyncobjWait(dev->fd, &fence->syncobj, + 1, + abs_timeout, 
DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, +@@ -81,6 +91,10 @@ panfrost_fence_get_fd(struct pipe_screen *screen, + struct panfrost_device *dev = pan_device(screen); + int fd = -1; + ++ /* TODO: Export a sync file using KCPU */ ++ if (dev->kbase) ++ return fd; ++ + drmSyncobjExportSyncFile(dev->fd, f->syncobj, &fd); + return fd; + } +@@ -92,6 +106,10 @@ panfrost_fence_from_fd(struct panfrost_context *ctx, int fd, + struct panfrost_device *dev = pan_device(ctx->base.screen); + int ret; + ++ /* TODO: Implement this for kbase */ ++ if (dev->kbase) ++ return NULL; ++ + struct pipe_fence_handle *f = calloc(1, sizeof(*f)); + if (!f) + return NULL; +@@ -134,6 +152,16 @@ panfrost_fence_create(struct panfrost_context *ctx) + struct panfrost_device *dev = pan_device(ctx->base.screen); + int fd = -1, ret; + ++ if (dev->kbase) { ++ struct pipe_fence_handle *f = calloc(1, sizeof(*f)); ++ if (!f) ++ return NULL; ++ ++ f->kbase = dev->mali.syncobj_dup(&dev->mali, ctx->syncobj_kbase); ++ pipe_reference_init(&f->reference, 1); ++ return f; ++ } ++ + /* Snapshot the last rendering out fence. We'd rather have another + * syncobj instead of a sync file, but this is all we get. + * (HandleToFD/FDToHandle just gives you another syncobj ID for the +diff --git a/src/gallium/drivers/panfrost/pan_fence.h b/src/gallium/drivers/panfrost/pan_fence.h +index 350f3682343..a52c5c72c92 100644 +--- a/src/gallium/drivers/panfrost/pan_fence.h ++++ b/src/gallium/drivers/panfrost/pan_fence.h +@@ -32,6 +32,7 @@ struct panfrost_context; + struct pipe_fence_handle { + struct pipe_reference reference; + uint32_t syncobj; ++ struct kbase_syncobj *kbase; + bool signaled; + }; + +diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c +index 75408594735..4eb1a941f1e 100644 +--- a/src/gallium/drivers/panfrost/pan_job.c ++++ b/src/gallium/drivers/panfrost/pan_job.c +@@ -25,6 +25,7 @@ + */ + + #include ++#include + + #include "drm-uapi/panfrost_drm.h" + +@@ -81,6 +82,14 @@ panfrost_batch_init(struct panfrost_context *ctx, + batch->resources =_mesa_set_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) ++ util_dynarray_init(&batch->resource_bos[i], NULL); ++ ++ util_dynarray_init(&batch->vert_deps, NULL); ++ util_dynarray_init(&batch->frag_deps, NULL); ++ ++ util_dynarray_init(&batch->dmabufs, NULL); ++ + /* Preallocate the main pool, since every batch has at least one job + * structure so it will be used */ + panfrost_pool_init(&batch->pool, NULL, dev, 0, 65536, "Batch pool", true, true); +@@ -96,6 +105,9 @@ panfrost_batch_init(struct panfrost_context *ctx, + + panfrost_batch_add_surface(batch, batch->key.zsbuf); + ++ if ((dev->debug & PAN_DBG_SYNC) || !(dev->debug & PAN_DBG_GOFASTER)) ++ batch->needs_sync = true; ++ + screen->vtbl.init_batch(batch); + } + +@@ -115,15 +127,30 @@ static void + panfrost_batch_add_resource(struct panfrost_batch *batch, + struct panfrost_resource *rsrc) + { ++ struct panfrost_context *ctx = batch->ctx; ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ + bool found = false; + _mesa_set_search_or_add(batch->resources, rsrc, &found); + +- if (!found) { +- /* Cache number of batches accessing a resource */ +- rsrc->track.nr_users++; ++ /* Nothing to do if we already have the resource */ ++ if (found) ++ return; ++ ++ /* Cache number of batches accessing a resource */ ++ rsrc->track.nr_users++; ++ ++ /* Reference the resource on the batch */ ++ pipe_reference(NULL, &rsrc->base.reference); + +- /* Reference the 
resource on the batch */ +- pipe_reference(NULL, &rsrc->base.reference); ++ if (rsrc->scanout) { ++ if (dev->has_dmabuf_fence) { ++ int fd = rsrc->image.data.bo->dmabuf_fd; ++ util_dynarray_append(&batch->dmabufs, int, fd); ++ } else { ++ perf_debug_ctx(ctx, "Forcing sync on batch"); ++ batch->needs_sync = true; ++ } + } + } + +@@ -172,6 +199,10 @@ panfrost_batch_cleanup(struct panfrost_context *ctx, struct panfrost_batch *batc + { + struct panfrost_device *dev = pan_device(ctx->base.screen); + ++ /* Make sure we keep handling events, to free old BOs */ ++ if (dev->kbase) ++ kbase_ensure_handle_events(&dev->mali); ++ + assert(batch->seqnum); + + if (ctx->batch == batch) +@@ -186,10 +217,18 @@ panfrost_batch_cleanup(struct panfrost_context *ctx, struct panfrost_batch *batc + if (!flags[i]) + continue; + +- struct panfrost_bo *bo = pan_lookup_bo(dev, i); ++ struct panfrost_bo *bo = pan_lookup_bo_existing(dev, i); + panfrost_bo_unreference(bo); + } + ++ util_dynarray_fini(&batch->dmabufs); ++ ++ util_dynarray_fini(&batch->vert_deps); ++ util_dynarray_fini(&batch->frag_deps); ++ ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) ++ util_dynarray_fini(&batch->resource_bos[i]); ++ + panfrost_batch_destroy_resources(ctx, batch); + panfrost_pool_cleanup(&batch->pool); + panfrost_pool_cleanup(&batch->invisible_pool); +@@ -313,7 +352,7 @@ panfrost_batch_update_access(struct panfrost_batch *batch, + } + } + +- if (writes) { ++ if (writes && (writer != batch)) { + _mesa_hash_table_insert(ctx->writers, rsrc, batch); + rsrc->track.nr_writers++; + } +@@ -380,6 +419,12 @@ panfrost_batch_read_rsrc(struct panfrost_batch *batch, + uint32_t access = PAN_BO_ACCESS_READ | + panfrost_access_for_stage(stage); + ++ enum panfrost_usage_type type = (stage == MESA_SHADER_FRAGMENT) ? ++ PAN_USAGE_READ_FRAGMENT : PAN_USAGE_READ_VERTEX; ++ ++ util_dynarray_append(&batch->resource_bos[type], struct panfrost_bo *, ++ rsrc->image.data.bo); ++ + panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); + + if (rsrc->separate_stencil) +@@ -396,6 +441,12 @@ panfrost_batch_write_rsrc(struct panfrost_batch *batch, + uint32_t access = PAN_BO_ACCESS_WRITE | + panfrost_access_for_stage(stage); + ++ enum panfrost_usage_type type = (stage == MESA_SHADER_FRAGMENT) ? 
++ PAN_USAGE_WRITE_FRAGMENT : PAN_USAGE_WRITE_VERTEX; ++ ++ util_dynarray_append(&batch->resource_bos[type], struct panfrost_bo *, ++ rsrc->image.data.bo); ++ + panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); + + if (rsrc->separate_stencil) +@@ -489,7 +540,7 @@ panfrost_batch_get_shared_memory(struct panfrost_batch *batch, + } + + static void +-panfrost_batch_to_fb_info(const struct panfrost_batch *batch, ++panfrost_batch_to_fb_info(struct panfrost_batch *batch, + struct pan_fb_info *fb, + struct pan_image_view *rts, + struct pan_image_view *zs, +@@ -511,6 +562,7 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + fb->rt_count = batch->key.nr_cbufs; + fb->sprite_coord_origin = pan_tristate_get(batch->sprite_coord_origin); + fb->first_provoking_vertex = pan_tristate_get(batch->first_provoking_vertex); ++ fb->cs_fragment = &batch->cs_fragment; + + static const unsigned char id_swz[] = { + PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W, +@@ -604,22 +656,22 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + fb->zs.discard.z = !reserve && !(batch->resolve & PIPE_CLEAR_DEPTH); + fb->zs.discard.s = !reserve && !(batch->resolve & PIPE_CLEAR_STENCIL); + +- if (!fb->zs.clear.z && ++ if (!fb->zs.clear.z && z_rsrc && + ((batch->read & PIPE_CLEAR_DEPTH) || + ((batch->draws & PIPE_CLEAR_DEPTH) && +- z_rsrc && BITSET_TEST(z_rsrc->valid.data, z_view->first_level)))) ++ BITSET_TEST(z_rsrc->valid.data, z_view->first_level)))) + fb->zs.preload.z = true; + +- if (!fb->zs.clear.s && ++ if (!fb->zs.clear.s && s_rsrc && + ((batch->read & PIPE_CLEAR_STENCIL) || + ((batch->draws & PIPE_CLEAR_STENCIL) && +- s_rsrc && BITSET_TEST(s_rsrc->valid.data, s_view->first_level)))) ++ BITSET_TEST(s_rsrc->valid.data, s_view->first_level)))) + fb->zs.preload.s = true; + + /* Preserve both component if we have a combined ZS view and + * one component needs to be preserved. + */ +- if (s_view == z_view && fb->zs.discard.z != fb->zs.discard.s) { ++ if (z_view && s_view == z_view && fb->zs.discard.z != fb->zs.discard.s) { + bool valid = BITSET_TEST(z_rsrc->valid.data, z_view->first_level); + + fb->zs.discard.z = false; +@@ -629,6 +681,28 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + } + } + ++static int ++panfrost_batch_submit_kbase(struct panfrost_device *dev, ++ struct drm_panfrost_submit *submit, ++ struct kbase_syncobj *syncobj) ++{ ++ dev->mali.handle_events(&dev->mali); ++ ++ int atom = dev->mali.submit(&dev->mali, ++ submit->jc, ++ submit->requirements, ++ syncobj, ++ (int32_t *)(uintptr_t) submit->bo_handles, ++ submit->bo_handle_count); ++ ++ if (atom == -1) { ++ errno = EINVAL; ++ return -1; ++ } ++ ++ return 0; ++} ++ + static int + panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + mali_ptr first_job_desc, +@@ -695,7 +769,7 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + * We also preserve existing flags as this batch might not + * be the first one to access the BO. 
+ */ +- struct panfrost_bo *bo = pan_lookup_bo(dev, i); ++ struct panfrost_bo *bo = pan_lookup_bo_existing(dev, i); + + bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW); + } +@@ -718,6 +792,8 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + submit.bo_handles = (u64) (uintptr_t) bo_handles; + if (ctx->is_noop) + ret = 0; ++ else if (dev->kbase) ++ ret = panfrost_batch_submit_kbase(dev, &submit, ctx->syncobj_kbase); + else + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit); + free(bo_handles); +@@ -728,8 +804,11 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + /* Trace the job if we're doing that */ + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + /* Wait so we can get errors reported back */ +- drmSyncobjWait(dev->fd, &out_sync, 1, +- INT64_MAX, 0, NULL); ++ if (dev->kbase) ++ dev->mali.syncobj_wait(&dev->mali, ctx->syncobj_kbase); ++ else ++ drmSyncobjWait(dev->fd, &out_sync, 1, ++ INT64_MAX, 0, NULL); + + if (dev->debug & PAN_DBG_TRACE) + pandecode_jc(submit.jc, dev->gpu_id); +@@ -799,6 +878,323 @@ panfrost_batch_submit_jobs(struct panfrost_batch *batch, + return ret; + } + ++#define BASE_MEM_MMU_DUMP_HANDLE (1 << 12) ++ ++static void ++mmu_dump(struct panfrost_device *dev) ++{ ++ unsigned size = 16 * 1024 * 1024; ++ ++ fprintf(stderr, "dumping MMU tables\n"); ++ sleep(3); ++ ++ void *mem = mmap(NULL, size, PROT_READ, MAP_SHARED, ++ dev->mali.fd, BASE_MEM_MMU_DUMP_HANDLE); ++ if (mem == MAP_FAILED) { ++ perror("mmap(BASE_MEM_MMU_DUMP_HANDLE)"); ++ return;; ++ } ++ ++ fprintf(stderr, "writing to file\n"); ++ sleep(1); ++ ++ char template[] = {"/tmp/mmu-dump.XXXXXX"}; ++ int fd = mkstemp(template); ++ if (fd == -1) { ++ perror("mkstemp(/tmp/mmu-dump.XXXXXX)"); ++ goto unmap; ++ } ++ ++ write(fd, mem, size); ++ close(fd); ++ ++unmap: ++ munmap(mem, size); ++} ++ ++static void ++reset_context(struct panfrost_context *ctx) ++{ ++ struct pipe_screen *pscreen = ctx->base.screen; ++ struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); ++ ++ /* Don't recover from the fault if PAN_MESA_DEBUG=sync is specified, ++ * to somewhat mimic behaviour with JM GPUs. TODO: Just abort? 
*/ ++ bool recover = !(dev->debug & PAN_DBG_SYNC); ++ ++ mesa_loge("Context reset"); ++ ++ dev->mali.cs_term(&dev->mali, &ctx->kbase_cs_vertex.base); ++ dev->mali.cs_term(&dev->mali, &ctx->kbase_cs_fragment.base); ++ ++ dev->mali.context_recreate(&dev->mali, ctx->kbase_ctx); ++ ++ //mmu_dump(dev); ++ ++ if (recover) { ++ dev->mali.cs_rebind(&dev->mali, &ctx->kbase_cs_vertex.base); ++ dev->mali.cs_rebind(&dev->mali, &ctx->kbase_cs_fragment.base); ++ } else { ++ ctx->kbase_cs_vertex.base.user_io = NULL; ++ ctx->kbase_cs_fragment.base.user_io = NULL; ++ } ++ ++ ctx->kbase_cs_vertex.base.last_insert = 0; ++ ctx->kbase_cs_fragment.base.last_insert = 0; ++ ++ screen->vtbl.init_cs(ctx, &ctx->kbase_cs_vertex); ++ screen->vtbl.init_cs(ctx, &ctx->kbase_cs_fragment); ++ ++ /* TODO: this leaks memory */ ++ ctx->tiler_heap_desc = 0; ++} ++ ++static void ++pandecode_cs_ring(struct panfrost_device *dev, struct panfrost_cs *cs, ++ uint64_t insert) ++{ ++ insert %= cs->base.size; ++ uint64_t start = cs->base.last_insert % cs->base.size; ++ ++ if (insert < start) { ++ pandecode_cs(cs->base.va + start, cs->base.size - start, dev->gpu_id); ++ start = 0; ++ } ++ ++ pandecode_cs(cs->base.va + start, insert - start, dev->gpu_id); ++} ++ ++static unsigned ++panfrost_add_dep_after(struct util_dynarray *deps, ++ struct panfrost_usage u, ++ unsigned index) ++{ ++ unsigned size = util_dynarray_num_elements(deps, struct panfrost_usage); ++ ++ for (unsigned i = index; i < size; ++i) { ++ struct panfrost_usage *d = ++ util_dynarray_element(deps, struct panfrost_usage, i); ++ ++ /* TODO: Remove d if it is an invalid entry? */ ++ ++ if ((d->queue == u.queue) && (d->write == u.write)) { ++ d->seqnum = MAX2(d->seqnum, u.seqnum); ++ return i; ++ ++ } else if (d->queue > u.queue) { ++ void *p = util_dynarray_grow(deps, struct panfrost_usage, 1); ++ assert(p); ++ memmove(util_dynarray_element(deps, struct panfrost_usage, i + 1), ++ util_dynarray_element(deps, struct panfrost_usage, i), ++ (size - i) * sizeof(struct panfrost_usage)); ++ ++ *util_dynarray_element(deps, struct panfrost_usage, i) = u; ++ return i; ++ } ++ } ++ ++ util_dynarray_append(deps, struct panfrost_usage, u); ++ return size; ++} ++ ++static void ++panfrost_update_deps(struct util_dynarray *deps, struct panfrost_bo *bo, bool write) ++{ ++ /* Both lists should be sorted, so each dependency is at a higher ++ * index than the last */ ++ unsigned index = 0; ++ util_dynarray_foreach(&bo->usage, struct panfrost_usage, u) { ++ /* read->read access does not require a dependency */ ++ if (!write && !u->write) ++ continue; ++ ++ index = panfrost_add_dep_after(deps, *u, index); ++ } ++} ++ ++static inline bool ++panfrost_usage_writes(enum panfrost_usage_type usage) ++{ ++ return (usage == PAN_USAGE_WRITE_VERTEX) || (usage == PAN_USAGE_WRITE_FRAGMENT); ++} ++ ++static inline bool ++panfrost_usage_fragment(enum panfrost_usage_type usage) ++{ ++ return (usage == PAN_USAGE_READ_FRAGMENT) || (usage == PAN_USAGE_WRITE_FRAGMENT); ++} ++ ++/* Removes invalid dependencies from deps */ ++static void ++panfrost_clean_deps(struct panfrost_device *dev, struct util_dynarray *deps) ++{ ++ kbase k = &dev->mali; ++ ++ struct panfrost_usage *rebuild = util_dynarray_begin(deps); ++ unsigned index = 0; ++ ++ util_dynarray_foreach(deps, struct panfrost_usage, u) { ++ /* Usages are ordered, so we can break here */ ++ if (u->queue >= k->event_slot_usage) ++ break; ++ ++ struct kbase_event_slot *slot = &k->event_slots[u->queue]; ++ uint64_t seqnum = u->seqnum; ++ ++ /* There is a race 
condition, where we can depend on an ++ * unsubmitted batch. In that case, decrease the seqnum. ++ * Otherwise, skip invalid dependencies. */ ++ if (slot->last_submit == seqnum) ++ --seqnum; ++ else if (slot->last_submit < seqnum) ++ continue; ++ ++ /* This usage is valid, add it to the returned list */ ++ rebuild[index++] = (struct panfrost_usage) { ++ .queue = u->queue, ++ .write = u->write, ++ .seqnum = seqnum, ++ }; ++ } ++ ++ /* No need to check the return value, it can only shrink */ ++ (void)! util_dynarray_resize(deps, struct panfrost_usage, index); ++} ++ ++static int ++panfrost_batch_submit_csf(struct panfrost_batch *batch, ++ const struct pan_fb_info *fb) ++{ ++ struct panfrost_context *ctx = batch->ctx; ++ struct pipe_screen *pscreen = ctx->base.screen; ++ struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); ++ ++ ++ctx->kbase_cs_vertex.seqnum; ++ ++ if (panfrost_has_fragment_job(batch)) { ++ screen->vtbl.emit_fragment_job(batch, fb); ++ ++ctx->kbase_cs_fragment.seqnum; ++ } ++ ++ pthread_mutex_lock(&dev->bo_usage_lock); ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) { ++ ++ bool write = panfrost_usage_writes(i); ++ pan_bo_access access = write ? PAN_BO_ACCESS_RW : PAN_BO_ACCESS_READ; ++ struct util_dynarray *deps; ++ unsigned queue; ++ uint64_t seqnum; ++ ++ if (panfrost_usage_fragment(i)) { ++ deps = &batch->frag_deps; ++ queue = ctx->kbase_cs_fragment.base.event_mem_offset; ++ seqnum = ctx->kbase_cs_fragment.seqnum; ++ } else { ++ deps = &batch->vert_deps; ++ queue = ctx->kbase_cs_vertex.base.event_mem_offset; ++ seqnum = ctx->kbase_cs_vertex.seqnum; ++ } ++ ++ util_dynarray_foreach(&batch->resource_bos[i], struct panfrost_bo *, bo) { ++ panfrost_update_deps(deps, *bo, write); ++ struct panfrost_usage u = { ++ .queue = queue, ++ .write = write, ++ .seqnum = seqnum, ++ }; ++ ++ panfrost_add_dep_after(&(*bo)->usage, u, 0); ++ (*bo)->gpu_access |= access; ++ } ++ } ++ pthread_mutex_unlock(&dev->bo_usage_lock); ++ ++ /* For now, only a single batch can use each tiler heap at once */ ++ if (ctx->tiler_heap_desc) { ++ panfrost_update_deps(&batch->vert_deps, ctx->tiler_heap_desc, true); ++ ++ struct panfrost_usage u = { ++ .queue = ctx->kbase_cs_fragment.base.event_mem_offset, ++ .write = true, ++ .seqnum = ctx->kbase_cs_fragment.seqnum, ++ }; ++ panfrost_add_dep_after(&ctx->tiler_heap_desc->usage, u, 0); ++ } ++ ++ /* TODO: Use atomics in kbase code to avoid lock? */ ++ pthread_mutex_lock(&dev->mali.queue_lock); ++ ++ panfrost_clean_deps(dev, &batch->vert_deps); ++ panfrost_clean_deps(dev, &batch->frag_deps); ++ ++ pthread_mutex_unlock(&dev->mali.queue_lock); ++ ++ screen->vtbl.emit_csf_toplevel(batch); ++ ++ uint64_t vs_offset = ctx->kbase_cs_vertex.offset + ++ (void *)ctx->kbase_cs_vertex.cs.ptr - ctx->kbase_cs_vertex.bo->ptr.cpu; ++ uint64_t fs_offset = ctx->kbase_cs_fragment.offset + ++ (void *)ctx->kbase_cs_fragment.cs.ptr - ctx->kbase_cs_fragment.bo->ptr.cpu; ++ ++ if (dev->debug & PAN_DBG_TRACE) { ++ pandecode_cs_ring(dev, &ctx->kbase_cs_vertex, vs_offset); ++ pandecode_cs_ring(dev, &ctx->kbase_cs_fragment, fs_offset); ++ } ++ ++ bool log = (dev->debug & PAN_DBG_LOG); ++ ++ // TODO: We need better synchronisation than a single fake syncobj!
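/* Editor's note (not part of the patch): the dependency bookkeeping above
 * keeps each BO's usage list sorted by queue id and merges duplicate
 * (queue, write) entries by keeping only the highest seqnum (see
 * panfrost_add_dep_after). The standalone sketch below illustrates just that
 * merge rule with a hypothetical fixed-size array in place of util_dynarray;
 * the names usage_list and usage_list_add are invented for illustration. */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct usage { uint32_t queue; bool write; uint64_t seqnum; };
struct usage_list { struct usage e[64]; unsigned n; };

static void
usage_list_add(struct usage_list *l, struct usage u)
{
   if (l->n >= 64)
      return; /* sketch only: a real implementation would grow the array */

   for (unsigned i = 0; i < l->n; ++i) {
      if (l->e[i].queue == u.queue && l->e[i].write == u.write) {
         /* Same queue and access type: keep only the newest seqnum */
         if (u.seqnum > l->e[i].seqnum)
            l->e[i].seqnum = u.seqnum;
         return;
      }
      if (l->e[i].queue > u.queue) {
         /* Keep the list sorted: insert before the first larger queue id */
         memmove(&l->e[i + 1], &l->e[i], (l->n - i) * sizeof(u));
         l->e[i] = u;
         l->n++;
         return;
      }
   }

   /* Largest queue id seen so far: append at the end */
   l->e[l->n++] = u;
}
/* Example: usage_list_add(&bo_usage, (struct usage){ .queue = 3, .write = true, .seqnum = 42 }); */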
++ ++ if (log) ++ printf("About to submit\n"); ++ ++ dev->mali.cs_submit(&dev->mali, &ctx->kbase_cs_vertex.base, vs_offset, ++ ctx->syncobj_kbase, ctx->kbase_cs_vertex.seqnum); ++ ++ dev->mali.cs_submit(&dev->mali, &ctx->kbase_cs_fragment.base, fs_offset, ++ ctx->syncobj_kbase, ctx->kbase_cs_fragment.seqnum); ++ ++ bool reset = false; ++ ++ // TODO: How will we know to reset a CS when waiting is not done? ++ if (batch->needs_sync) { ++ if (!dev->mali.cs_wait(&dev->mali, &ctx->kbase_cs_vertex.base, vs_offset, ctx->syncobj_kbase)) ++ reset = true; ++ ++ if (!dev->mali.cs_wait(&dev->mali, &ctx->kbase_cs_fragment.base, fs_offset, ctx->syncobj_kbase)) ++ reset = true; ++ } ++ ++ if (dev->debug & PAN_DBG_TILER) { ++ fflush(stdout); ++ FILE *stream = popen("tiler-hex-read", "w"); ++ ++ /* TODO: Dump more than just the first chunk */ ++ unsigned size = batch->ctx->kbase_ctx->tiler_heap_chunk_size; ++ uint64_t va = batch->ctx->kbase_ctx->tiler_heap_header; ++ ++ fprintf(stream, "width %i\n" "height %i\n" "mask %i\n" ++ "vaheap 0x%"PRIx64"\n" "size %i\n", ++ batch->key.width, batch->key.height, 0xfe, va, size); ++ ++ void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, ++ MAP_SHARED, dev->mali.fd, va); ++ ++ pan_hexdump(stream, ptr, size, false); ++ //memset(ptr, 0, size); ++ munmap(ptr, size); ++ ++ pclose(stream); ++ } ++ ++ if (reset) ++ reset_context(ctx); ++ ++ return 0; ++} ++ + static void + panfrost_emit_tile_map(struct panfrost_batch *batch, struct pan_fb_info *fb) + { +@@ -824,6 +1220,7 @@ panfrost_batch_submit(struct panfrost_context *ctx, + { + struct pipe_screen *pscreen = ctx->base.screen; + struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); + int ret; + + /* Nothing to do! */ +@@ -867,7 +1264,11 @@ panfrost_batch_submit(struct panfrost_context *ctx, + if (batch->scoreboard.first_tiler || batch->clear) + screen->vtbl.emit_fbd(batch, &fb); + +- ret = panfrost_batch_submit_jobs(batch, &fb, 0, ctx->syncobj); ++ /* TODO: Don't hardcode the arch number */ ++ if (dev->arch < 10) ++ ret = panfrost_batch_submit_jobs(batch, &fb, 0, ctx->syncobj); ++ else ++ ret = panfrost_batch_submit_csf(batch, &fb); + + if (ret) + fprintf(stderr, "panfrost_batch_submit failed: %d\n", ret); +@@ -969,6 +1370,8 @@ panfrost_batch_clear(struct panfrost_batch *batch, + for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) { + if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) + continue; ++ if (!ctx->pipe_framebuffer.cbufs[i]) ++ continue; + + enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format; + pan_pack_color(batch->clear_color[i], color, format, false); +diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h +index 23263c54e07..6867476a3dc 100644 +--- a/src/gallium/drivers/panfrost/pan_job.h ++++ b/src/gallium/drivers/panfrost/pan_job.h +@@ -79,6 +79,14 @@ pan_tristate_get(struct pan_tristate state) + return (state.v == PAN_TRISTATE_TRUE); + } + ++enum panfrost_usage_type { ++ PAN_USAGE_READ_VERTEX, ++ PAN_USAGE_WRITE_VERTEX, ++ PAN_USAGE_READ_FRAGMENT, ++ PAN_USAGE_WRITE_FRAGMENT, ++ PAN_USAGE_COUNT, ++}; ++ + /* A panfrost_batch corresponds to a bound FBO we're rendering to, + * collecting over multiple draws. */ + +@@ -194,6 +202,25 @@ struct panfrost_batch { + + /* Referenced resources, holds a pipe_reference. 
*/ + struct set *resources; ++ ++ struct util_dynarray resource_bos[PAN_USAGE_COUNT]; ++ ++ /* struct panfrost_usage */ ++ struct util_dynarray vert_deps; ++ struct util_dynarray frag_deps; ++ ++ /* Referenced dma-bufs FDs, for emitting synchronisation commands. */ ++ struct util_dynarray dmabufs; ++ ++ /* Command stream pointers for CSF Valhall. Vertex CS tracking is more ++ * complicated as there may be multiple buffers. */ ++ pan_command_stream cs_vertex; ++ uint32_t *cs_vertex_last_size; ++ pan_command_stream cs_vertex_first; ++ ++ pan_command_stream cs_fragment; ++ ++ bool needs_sync; + }; + + /* Functions for managing the above */ +diff --git a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c +index 9e95b793391..c8127987ad2 100644 +--- a/src/gallium/drivers/panfrost/pan_resource.c ++++ b/src/gallium/drivers/panfrost/pan_resource.c +@@ -33,6 +33,7 @@ + #include + #include + #include "drm-uapi/drm_fourcc.h" ++#include "drm-uapi/drm.h" + + #include "frontend/winsys_handle.h" + #include "util/format/u_format.h" +@@ -51,6 +52,46 @@ + #include "pan_tiling.h" + #include "decode.h" + ++/* The kbase kernel driver always maps imported BOs with caching. When we ++ * don't want that, instead do mmap from the display driver side to get a ++ * write-combine mapping. ++ */ ++static void ++panfrost_bo_mmap_scanout(struct panfrost_bo *bo, ++ struct renderonly *ro, ++ struct renderonly_scanout *scanout) ++{ ++ struct panfrost_device *dev = bo->dev; ++ ++ /* If we are fine with a cached mapping, just return */ ++ if (!(dev->debug & PAN_DBG_UNCACHED_CPU)) ++ return; ++ ++ struct drm_mode_map_dumb map_dumb = { ++ .handle = scanout->handle, ++ }; ++ ++ int err = drmIoctl(ro->kms_fd, DRM_IOCTL_MODE_MAP_DUMB, &map_dumb); ++ if (err < 0) { ++ fprintf(stderr, "DRM_IOCTL_MODE_MAP_DUMB failed: %s\n", ++ strerror(errno)); ++ return; ++ } ++ ++ void *addr = mmap(NULL, bo->size, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ ro->kms_fd, map_dumb.offset); ++ if (addr == MAP_FAILED) { ++ fprintf(stderr, "kms_fd mmap failed: %s\n", ++ strerror(errno)); ++ return; ++ } ++ ++ bo->munmap_ptr = bo->ptr.cpu; ++ bo->ptr.cpu = addr; ++ bo->cached = false; ++} ++ + static struct pipe_resource * + panfrost_resource_from_handle(struct pipe_screen *pscreen, + const struct pipe_resource *templat, +@@ -102,15 +143,17 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, + return NULL; + } + +- rsc->image.data.bo = panfrost_bo_import(dev, whandle->handle); ++ struct panfrost_bo *bo = panfrost_bo_import(dev, whandle->handle); + /* Sometimes an import can fail e.g. on an invalid buffer fd, out of + * memory space to mmap it etc. + */ +- if (!rsc->image.data.bo) { ++ if (!bo) { + FREE(rsc); + return NULL; + } + ++ rsc->image.data.bo = bo; ++ + rsc->modifier_constant = true; + + BITSET_SET(rsc->valid.data, 0); +@@ -122,6 +165,9 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, + /* failure is expected in some cases.. 
*/ + } + ++ if (rsc->scanout) ++ panfrost_bo_mmap_scanout(bo, dev->ro, rsc->scanout); ++ + return prsc; + } + +@@ -473,7 +519,9 @@ panfrost_resource_setup(struct panfrost_device *dev, + static void + panfrost_resource_init_afbc_headers(struct panfrost_resource *pres) + { +- panfrost_bo_mmap(pres->image.data.bo); ++ struct panfrost_bo *bo = pres->image.data.bo; ++ ++ panfrost_bo_mmap(bo); + + unsigned nr_samples = MAX2(pres->base.nr_samples, 1); + +@@ -482,16 +530,16 @@ panfrost_resource_init_afbc_headers(struct panfrost_resource *pres) + struct pan_image_slice_layout *slice = &pres->image.layout.slices[l]; + + for (unsigned s = 0; s < nr_samples; ++s) { +- void *ptr = pres->image.data.bo->ptr.cpu + +- (i * pres->image.layout.array_stride) + +- slice->offset + +- (s * slice->afbc.surface_stride); ++ size_t offset = (i * pres->image.layout.array_stride) + ++ slice->offset + ++ (s * slice->afbc.surface_stride); + + /* Zero-ed AFBC headers seem to encode a plain + * black. Let's use this pattern to keep the + * initialization simple. + */ +- memset(ptr, 0, slice->afbc.header_size); ++ memset(bo->ptr.cpu + offset, 0, slice->afbc.header_size); ++ panfrost_bo_mem_clean(bo, offset, slice->afbc.header_size); + } + } + } +@@ -643,7 +691,9 @@ panfrost_resource_create_with_modifier(struct pipe_screen *screen, + (bind & PIPE_BIND_SHADER_IMAGE) ? "Shader image" : + "Other resource"; + +- if (dev->ro && (template->bind & PIPE_BIND_SCANOUT)) { ++ /* Revert to doing a kmsro allocation for any shared BO, because kbase ++ * cannot do export */ ++ if (dev->ro && (template->bind & PAN_BIND_SHARED_MASK)) { + struct winsys_handle handle; + struct pan_block_size blocksize = panfrost_block_size(modifier, template->format); + +@@ -702,12 +752,21 @@ panfrost_resource_create_with_modifier(struct pipe_screen *screen, + free(so); + return NULL; + } ++ ++ panfrost_bo_mmap_scanout(so->image.data.bo, dev->ro, so->scanout); + } else { + /* We create a BO immediately but don't bother mapping, since we don't + * care to map e.g. FBOs which the CPU probably won't touch */ + ++ /* For now, don't cache buffers as syncing can be slow when ++ * too much memory is mapped. TODO: dynamically switch, or use ++ * the STREAM_READ etc. hints? */ ++ bool buffer = (template->target == PIPE_BUFFER); ++ unsigned cache_flag = buffer ? 0 : PAN_BO_CACHEABLE; ++ + so->image.data.bo = +- panfrost_bo_create(dev, so->image.layout.data_size, PAN_BO_DELAY_MMAP, label); ++ panfrost_bo_create(dev, so->image.layout.data_size, ++ PAN_BO_DELAY_MMAP | cache_flag, label); + + so->constant_stencil = true; + } +@@ -741,10 +800,22 @@ panfrost_resource_create_with_modifiers(struct pipe_screen *screen, + const struct pipe_resource *template, + const uint64_t *modifiers, int count) + { ++ struct panfrost_device *dev = pan_device(screen); ++ + for (unsigned i = 0; i < PAN_MODIFIER_COUNT; ++i) { +- if (drm_find_modifier(pan_best_modifiers[i], modifiers, count)) { +- return panfrost_resource_create_with_modifier(screen, template, +- pan_best_modifiers[i]); ++ uint64_t mod = pan_best_modifiers[i]; ++ ++ if (drm_is_afbc(mod) && !dev->has_afbc) ++ continue; ++ ++ if (mod != DRM_FORMAT_MOD_LINEAR && (dev->debug & PAN_DBG_LINEAR)) ++ continue; ++ ++ /* TODO: What if mod is an unsupported AFBC variant for this ++ * format? 
*/ ++ ++ if (drm_find_modifier(mod, modifiers, count)) { ++ return panfrost_resource_create_with_modifier(screen, template, mod); + } + } + +@@ -773,6 +844,71 @@ panfrost_resource_destroy(struct pipe_screen *screen, + free(rsrc); + } + ++static void ++panfrost_clear_render_target(struct pipe_context *pipe, ++ struct pipe_surface *dst, ++ const union pipe_color_union *color, ++ unsigned dstx, unsigned dsty, ++ unsigned width, unsigned height, ++ bool render_condition_enabled) ++{ ++ struct panfrost_context *ctx = pan_context(pipe); ++ ++ /* TODO: dstx, etc. */ ++ ++ struct pipe_framebuffer_state tmp = {0}; ++ util_copy_framebuffer_state(&tmp, &ctx->pipe_framebuffer); ++ ++ struct pipe_framebuffer_state fb = { ++ .width = dst->width, ++ .height = dst->height, ++ .layers = 1, ++ .samples = 1, ++ .nr_cbufs = 1, ++ .cbufs[0] = dst, ++ }; ++ pipe->set_framebuffer_state(pipe, &fb); ++ ++ struct panfrost_batch *batch = panfrost_get_fresh_batch_for_fbo(ctx, "Clear render target"); ++ panfrost_batch_clear(batch, PIPE_CLEAR_COLOR0, color, 0, 0); ++ ++ pipe->set_framebuffer_state(pipe, &tmp); ++ util_unreference_framebuffer_state(&tmp); ++} ++ ++static void ++panfrost_clear_depth_stencil(struct pipe_context *pipe, ++ struct pipe_surface *dst, ++ unsigned clear_flags, ++ double depth, unsigned stencil, ++ unsigned dstx, unsigned dsty, ++ unsigned width, unsigned height, ++ bool render_condition_enabled) ++{ ++ struct panfrost_context *ctx = pan_context(pipe); ++ ++ /* TODO: dstx, etc. */ ++ ++ struct pipe_framebuffer_state tmp = {0}; ++ util_copy_framebuffer_state(&tmp, &ctx->pipe_framebuffer); ++ ++ struct pipe_framebuffer_state fb = { ++ .width = dst->width, ++ .height = dst->height, ++ .layers = 1, ++ .samples = 1, ++ .nr_cbufs = 0, ++ .zsbuf = dst, ++ }; ++ pipe->set_framebuffer_state(pipe, &fb); ++ ++ struct panfrost_batch *batch = panfrost_get_fresh_batch_for_fbo(ctx, "Clear depth/stencil"); ++ panfrost_batch_clear(batch, clear_flags, NULL, depth, stencil); ++ ++ pipe->set_framebuffer_state(pipe, &tmp); ++ util_unreference_framebuffer_state(&tmp); ++} ++ + /* Most of the time we can do CPU-side transfers, but sometimes we need to use + * the 3D pipe for this. Let's wrap u_blitter to blit to/from staging textures. + * Code adapted from freedreno */ +@@ -968,6 +1104,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + struct panfrost_resource *staging = pan_alloc_staging(ctx, rsrc, level, box); + assert(staging); + ++ panfrost_bo_mmap(staging->image.data.bo); ++ + /* Staging resources have one LOD: level 0. Query the strides + * on this LOD. 
+ */ +@@ -990,9 +1128,11 @@ panfrost_ptr_map(struct pipe_context *pctx, + pan_blit_to_staging(pctx, transfer); + panfrost_flush_writer(ctx, staging, "AFBC read staging blit"); + panfrost_bo_wait(staging->image.data.bo, INT64_MAX, false); ++ ++ panfrost_bo_mem_invalidate(staging->image.data.bo, 0, ++ staging->image.data.bo->size); + } + +- panfrost_bo_mmap(staging->image.data.bo); + return staging->image.data.bo->ptr.cpu; + } + +@@ -1029,7 +1169,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + !(usage & PIPE_MAP_UNSYNCHRONIZED) && + !(resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && + (usage & PIPE_MAP_WRITE) && +- rsrc->track.nr_users > 0) { ++ rsrc->track.nr_users > 0 && ++ bo->size < 16 * 1024 * 1024) { + + /* When a resource to be modified is already being used by a + * pending batch, it is often faster to copy the whole BO than +@@ -1051,6 +1192,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + copy_resource = false; + } + ++ bool cache_inval = true; ++ + if (create_new_bo) { + /* Make sure we re-emit any descriptors using this resource */ + panfrost_dirty_state_all(ctx); +@@ -1075,12 +1218,14 @@ panfrost_ptr_map(struct pipe_context *pctx, + flags, bo->label); + + if (newbo) { +- if (copy_resource) +- memcpy(newbo->ptr.cpu, rsrc->image.data.bo->ptr.cpu, bo->size); ++ if (copy_resource) { ++ panfrost_bo_mem_invalidate(bo, 0, bo->size); ++ memcpy(newbo->ptr.cpu, bo->ptr.cpu, bo->size); ++ } + + panfrost_resource_swap_bo(ctx, rsrc, newbo); + +- if (!copy_resource && ++ if (!copy_resource && + drm_is_afbc(rsrc->image.layout.modifier)) + panfrost_resource_init_afbc_headers(rsrc); + +@@ -1102,6 +1247,22 @@ panfrost_ptr_map(struct pipe_context *pctx, + panfrost_flush_writer(ctx, rsrc, "Synchronized read"); + panfrost_bo_wait(bo, INT64_MAX, false); + } ++ } else { ++ /* No flush for writes to uninitialized */ ++ cache_inval = false; ++ } ++ ++ /* TODO: Only the accessed region for textures */ ++ if (cache_inval) { ++ size_t offset = 0; ++ size_t size = bo->size; ++ ++ if (resource->target == PIPE_BUFFER) { ++ offset = box->x * (size_t) bytes_per_block; ++ size = box->width * (size_t) bytes_per_block; ++ } ++ ++ panfrost_bo_mem_invalidate(bo, offset, size); + } + + /* For access to compressed textures, we want the (x, y, w, h) +@@ -1128,6 +1289,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + * caching... 
I don't know if this is actually possible but we + * should still get it right */ + ++ // TODO: Fix this for cached BOs ++ + unsigned dpw = PIPE_MAP_DIRECTLY | PIPE_MAP_WRITE | PIPE_MAP_PERSISTENT; + + if ((usage & dpw) == dpw && rsrc->index_cache) +@@ -1281,8 +1444,15 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + * reloads that can cascade into DATA_INVALID_FAULTs due to reading + * malformed AFBC data if uninitialized */ + +- if (trans->staging.rsrc) { ++ bool afbc = trans->staging.rsrc; ++ ++ if (afbc) { + if (transfer->usage & PIPE_MAP_WRITE) { ++ struct panfrost_resource *trans_rsrc = pan_resource(trans->staging.rsrc); ++ struct panfrost_bo *trans_bo = trans_rsrc->image.data.bo; ++ ++ panfrost_bo_mem_clean(trans_bo, 0, trans_bo->size); ++ + if (panfrost_should_linear_convert(dev, prsrc, transfer)) { + + panfrost_bo_unreference(prsrc->image.data.bo); +@@ -1290,7 +1460,7 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, + prsrc->image.layout.format); + +- prsrc->image.data.bo = pan_resource(trans->staging.rsrc)->image.data.bo; ++ prsrc->image.data.bo = trans_bo; + panfrost_bo_reference(prsrc->image.data.bo); + } else { + pan_blit_from_staging(pctx, trans); +@@ -1315,10 +1485,13 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, + prsrc->image.layout.format); + if (prsrc->image.layout.data_size > bo->size) { ++ /* We want the BO to be MMAPed. */ ++ uint32_t flags = bo->flags & ~PAN_BO_DELAY_MMAP; + const char *label = bo->label; ++ + panfrost_bo_unreference(bo); + bo = prsrc->image.data.bo = +- panfrost_bo_create(dev, prsrc->image.layout.data_size, 0, label); ++ panfrost_bo_create(dev, prsrc->image.layout.data_size, flags, label); + assert(bo); + } + +@@ -1339,6 +1512,25 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + } + } + ++ /* TODO: Only the accessed region */ ++ /* It is important to not do this for AFBC resources, or else the ++ * clean might overwrite the result of the blit. */ ++ if (!afbc && (transfer->usage & PIPE_MAP_WRITE)) { ++ size_t offset = 0; ++ size_t size = prsrc->image.data.bo->size; ++ ++ /* TODO: Don't recalculate */ ++ if (prsrc->base.target == PIPE_BUFFER) { ++ enum pipe_format format = prsrc->image.layout.format; ++ int bytes_per_block = util_format_get_blocksize(format); ++ ++ offset = transfer->box.x * (size_t) bytes_per_block; ++ size = transfer->box.width * (size_t) bytes_per_block; ++ } ++ ++ panfrost_bo_mem_clean(prsrc->image.data.bo, ++ offset, size); ++ } + + util_range_add(&prsrc->base, &prsrc->valid_buffer_range, + transfer->box.x, +@@ -1353,6 +1545,7 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + ralloc_free(transfer); + } + ++// TODO: does this need to be changed for cached resources? 
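/* Editor's note (not part of the patch): the map/unmap paths above pair
 * cached kbase BOs with explicit CPU cache maintenance: invalidate before the
 * CPU reads data the GPU may have written, and clean (write back) after the
 * CPU writes data the GPU will read. A minimal sketch of that pattern,
 * assuming the driver's struct panfrost_bo and the
 * panfrost_bo_mem_invalidate()/panfrost_bo_mem_clean() helpers introduced
 * elsewhere in this patch, with the (bo, offset, size) signature used in the
 * hunks above; the function name here is invented for illustration. */
static void
example_cpu_access(struct panfrost_bo *bo, size_t offset, size_t size,
                   bool will_read, bool will_write)
{
   if (will_read) {
      /* Drop stale CPU cache lines so earlier GPU writes become visible */
      panfrost_bo_mem_invalidate(bo, offset, size);
   }

   /* ... CPU accesses bo->ptr.cpu + offset here ... */

   if (will_write) {
      /* Write dirty lines back so the GPU sees the CPU's writes */
      panfrost_bo_mem_clean(bo, offset, size);
   }
}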
+ static void + panfrost_ptr_flush_region(struct pipe_context *pctx, + struct pipe_transfer *transfer, +@@ -1486,6 +1679,8 @@ panfrost_resource_context_init(struct pipe_context *pctx) + pctx->texture_unmap = u_transfer_helper_transfer_unmap; + pctx->create_surface = panfrost_create_surface; + pctx->surface_destroy = panfrost_surface_destroy; ++ pctx->clear_render_target = panfrost_clear_render_target; ++ pctx->clear_depth_stencil = panfrost_clear_depth_stencil; + pctx->resource_copy_region = util_resource_copy_region; + pctx->blit = panfrost_blit; + pctx->generate_mipmap = panfrost_generate_mipmap; +diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c +index ee6dbb7b57f..ea315f8be64 100644 +--- a/src/gallium/drivers/panfrost/pan_screen.c ++++ b/src/gallium/drivers/panfrost/pan_screen.c +@@ -56,7 +56,7 @@ + + static const struct debug_named_value panfrost_debug_options[] = { + {"perf", PAN_DBG_PERF, "Enable performance warnings"}, +- {"trace", PAN_DBG_TRACE, "Trace the command stream"}, ++ {"trace", PAN_DBG_TRACE | PAN_DBG_BO_CLEAR, "Trace the command stream"}, + {"deqp", PAN_DBG_DEQP, "Hacks for dEQP"}, + {"dirty", PAN_DBG_DIRTY, "Always re-emit all state"}, + {"sync", PAN_DBG_SYNC, "Wait for each job's completion and abort on GPU faults"}, +@@ -72,6 +72,13 @@ static const struct debug_named_value panfrost_debug_options[] = { + #ifdef PAN_DBG_OVERFLOW + {"overflow", PAN_DBG_OVERFLOW, "Check for buffer overflows in pool uploads"}, + #endif ++ {"tiler", PAN_DBG_TILER, "Decode the tiler heap"}, ++ {"bolog", PAN_DBG_BO_LOG, "Log BO allocations/deallocations"}, ++ {"boclear", PAN_DBG_BO_CLEAR, "Clear BOs on allocation"}, ++ {"nogpuc", PAN_DBG_UNCACHED_GPU, "Use uncached GPU memory for textures"}, ++ {"nocpuc", PAN_DBG_UNCACHED_CPU, "Use uncached CPU mappings for textures"}, ++ {"log", PAN_DBG_LOG, "Log job submission etc."}, ++ {"gofaster", PAN_DBG_GOFASTER, "Experimental performance improvements"}, + DEBUG_NAMED_VALUE_END + }; + +@@ -122,6 +129,7 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + case PIPE_CAP_SHADER_PACK_HALF_FLOAT: ++ case PIPE_CAP_CLIP_HALFZ: + return 1; + + case PIPE_CAP_MAX_RENDER_TARGETS: +@@ -300,7 +308,7 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) + * still supported as it is core GLES3.0 functionality + */ + case PIPE_CAP_PRIMITIVE_RESTART: +- return dev->arch <= 7; ++ return is_gl3 || dev->arch <= 7; + + case PIPE_CAP_FLATSHADE: + case PIPE_CAP_TWO_SIDED_COLOR: +@@ -606,6 +614,7 @@ panfrost_walk_dmabuf_modifiers(struct pipe_screen *screen, + bool afbc = dev->has_afbc && panfrost_format_supports_afbc(dev, format); + bool ytr = panfrost_afbc_can_ytr(format); + bool tiled_afbc = panfrost_afbc_can_tile(dev); ++ bool native = panfrost_afbc_only_native(dev->arch, format); + + unsigned count = 0; + +@@ -619,6 +628,9 @@ panfrost_walk_dmabuf_modifiers(struct pipe_screen *screen, + if ((pan_best_modifiers[i] & AFBC_FORMAT_MOD_TILED) && !tiled_afbc) + continue; + ++ if (drm_is_afbc(pan_best_modifiers[i]) && !(pan_best_modifiers[i] & AFBC_FORMAT_MOD_NATIVE_SWIZZLE) && native) ++ continue; ++ + if (test_modifier != DRM_FORMAT_MOD_INVALID && + test_modifier != pan_best_modifiers[i]) + continue; +@@ -822,13 +834,17 @@ panfrost_create_screen(int fd, struct renderonly *ro) + + /* Bail early on unsupported hardware */ + if (dev->model == NULL) { +- debug_printf("panfrost: Unsupported model 
%X", dev->gpu_id); ++ debug_printf("panfrost: Unsupported model %X\n", dev->gpu_id); + panfrost_destroy_screen(&(screen->base)); + return NULL; + } + + dev->ro = ro; + ++ /* The functionality is only useful with kbase */ ++ if (dev->kbase) ++ dev->has_dmabuf_fence = panfrost_check_dmabuf_fence(dev); ++ + screen->base.destroy = panfrost_destroy_screen; + + screen->base.get_name = panfrost_get_name; +@@ -874,6 +890,8 @@ panfrost_create_screen(int fd, struct renderonly *ro) + panfrost_cmdstream_screen_init_v7(screen); + else if (dev->arch == 9) + panfrost_cmdstream_screen_init_v9(screen); ++ else if (dev->arch == 10) ++ panfrost_cmdstream_screen_init_v10(screen); + else + unreachable("Unhandled architecture major"); + +diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h +index 656a4948a42..94cfcf472a5 100644 +--- a/src/gallium/drivers/panfrost/pan_screen.h ++++ b/src/gallium/drivers/panfrost/pan_screen.h +@@ -50,6 +50,7 @@ static const struct pipe_driver_query_info panfrost_driver_query_list[] = { + + struct panfrost_batch; + struct panfrost_context; ++struct panfrost_cs; + struct panfrost_resource; + struct panfrost_compiled_shader; + struct pan_fb_info; +@@ -57,6 +58,7 @@ struct pan_blend_state; + + /* Virtual table of per-generation (GenXML) functions */ + ++ + struct panfrost_vtable { + /* Prepares the renderer state descriptor or shader program descriptor + * for a given compiled shader, and if desired uploads it as well */ +@@ -100,6 +102,10 @@ struct panfrost_vtable { + struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); ++ ++ void (*emit_csf_toplevel)(struct panfrost_batch *); ++ ++ void (*init_cs)(struct panfrost_context *ctx, struct panfrost_cs *cs); + }; + + struct panfrost_screen { +@@ -138,6 +144,7 @@ void panfrost_cmdstream_screen_init_v5(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v6(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v7(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen); ++void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen); + + #define perf_debug(dev, ...) 
\ + do { \ +diff --git a/src/gallium/frontends/nine/nine_ff.c b/src/gallium/frontends/nine/nine_ff.c +index 6705fc2208c..6eb94ef8ccd 100644 +--- a/src/gallium/frontends/nine/nine_ff.c ++++ b/src/gallium/frontends/nine/nine_ff.c +@@ -1413,7 +1413,7 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key) + struct ureg_src texture_coord = ps.vT[s]; + struct ureg_dst delta; + switch (key->ts[s].textarget) { +- case 0: target = TGSI_TEXTURE_1D; break; ++ case 0: target = TGSI_TEXTURE_2D; break; + case 1: target = TGSI_TEXTURE_2D; break; + case 2: target = TGSI_TEXTURE_3D; break; + case 3: target = TGSI_TEXTURE_CUBE; break; +diff --git a/src/gallium/frontends/nine/nine_shader.c b/src/gallium/frontends/nine/nine_shader.c +index d1742a59c0e..e78288c5010 100644 +--- a/src/gallium/frontends/nine/nine_shader.c ++++ b/src/gallium/frontends/nine/nine_shader.c +@@ -2159,7 +2159,7 @@ static inline unsigned + d3dstt_to_tgsi_tex(BYTE sampler_type) + { + switch (sampler_type) { +- case NINED3DSTT_1D: return TGSI_TEXTURE_1D; ++ case NINED3DSTT_1D: return TGSI_TEXTURE_2D; + case NINED3DSTT_2D: return TGSI_TEXTURE_2D; + case NINED3DSTT_VOLUME: return TGSI_TEXTURE_3D; + case NINED3DSTT_CUBE: return TGSI_TEXTURE_CUBE; +@@ -2172,7 +2172,7 @@ static inline unsigned + d3dstt_to_tgsi_tex_shadow(BYTE sampler_type) + { + switch (sampler_type) { +- case NINED3DSTT_1D: return TGSI_TEXTURE_SHADOW1D; ++ case NINED3DSTT_1D: return TGSI_TEXTURE_SHADOW2D; + case NINED3DSTT_2D: return TGSI_TEXTURE_SHADOW2D; + case NINED3DSTT_VOLUME: + case NINED3DSTT_CUBE: +@@ -2186,7 +2186,7 @@ ps1x_sampler_type(const struct nine_shader_info *info, unsigned stage) + { + boolean shadow = !!(info->sampler_mask_shadow & (1 << stage)); + switch ((info->sampler_ps1xtypes >> (stage * 2)) & 0x3) { +- case 1: return shadow ? TGSI_TEXTURE_SHADOW1D : TGSI_TEXTURE_1D; ++ case 1: return shadow ? TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D; + case 0: return shadow ? TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D; + case 3: return TGSI_TEXTURE_3D; + default: +diff --git a/src/gallium/frontends/nine/nine_state.c b/src/gallium/frontends/nine/nine_state.c +index cd627c83d1e..b07e361ff41 100644 +--- a/src/gallium/frontends/nine/nine_state.c ++++ b/src/gallium/frontends/nine/nine_state.c +@@ -1039,8 +1039,10 @@ update_textures_and_samplers(struct NineDevice9 *device) + false, view); + context->enabled_sampler_count_ps = num_textures; + +- if (commit_samplers) ++ if (commit_samplers) { ++ cso_set_max_sampler(context->cso, num_textures - 1); + cso_single_sampler_done(context->cso, PIPE_SHADER_FRAGMENT); ++ } + + commit_samplers = FALSE; + sampler_mask = context->programmable_vs ? 
context->vs->sampler_mask : 0; +@@ -1084,8 +1086,10 @@ update_textures_and_samplers(struct NineDevice9 *device) + false, view); + context->enabled_sampler_count_vs = num_textures; + +- if (commit_samplers) ++ if (commit_samplers) { ++ cso_set_max_sampler(context->cso, num_textures - 1); + cso_single_sampler_done(context->cso, PIPE_SHADER_VERTEX); ++ } + } + + /* State commit only */ +diff --git a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build +index 73f948c5460..e9f942901b8 100644 +--- a/src/gallium/targets/d3dadapter9/meson.build ++++ b/src/gallium/targets/d3dadapter9/meson.build +@@ -64,7 +64,8 @@ libgallium_nine = shared_library( + dep_selinux, dep_libdrm, dep_llvm, dep_thread, + idep_xmlconfig, idep_mesautil, idep_nir, + driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau, +- driver_i915, driver_svga, driver_iris, driver_crocus, driver_zink, driver_freedreno ++ driver_i915, driver_svga, driver_iris, driver_crocus, driver_zink, driver_freedreno, ++ driver_panfrost, driver_kmsro, + ], + name_prefix : '', + version : '.'.join(nine_version), +diff --git a/src/gallium/targets/osmesa/meson.build b/src/gallium/targets/osmesa/meson.build +index 024bac32b58..23938ec73a1 100644 +--- a/src/gallium/targets/osmesa/meson.build ++++ b/src/gallium/targets/osmesa/meson.build +@@ -55,10 +55,10 @@ libosmesa = shared_library( + libmesa, libgallium, libws_null, osmesa_link_with, + ], + dependencies : [ +- dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind, driver_swrast ++ dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind, driver_swrast, driver_panfrost, dep_libdrm + ], + name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libosmesa.dll +- soversion : host_machine.system() == 'windows' ? 
'' : '8', ++ soversion : '', + version : '8.0.0', + darwin_versions : '9.0.0', + install : true, +diff --git a/src/gallium/targets/rusticl/meson.build b/src/gallium/targets/rusticl/meson.build +index 71c5da2129e..a4b4c7639f0 100644 +--- a/src/gallium/targets/rusticl/meson.build ++++ b/src/gallium/targets/rusticl/meson.build +@@ -43,6 +43,7 @@ librusticl = shared_library( + ], + dependencies : [ + driver_iris, ++ driver_kmsro, + driver_nouveau, + driver_panfrost, + driver_swrast, +diff --git a/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c b/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c +index 3c8a3c4519f..4011f45f853 100644 +--- a/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c ++++ b/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c +@@ -101,9 +101,15 @@ struct pipe_screen *kmsro_drm_screen_create(int fd, + #endif + + #if defined(GALLIUM_PANFROST) +- ro->gpu_fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); ++ bool noop = getenv("KBASE_NOOP"); + +- if (ro->gpu_fd >= 0) { ++ if (!noop) { ++ ro->gpu_fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); ++ if (ro->gpu_fd < 0) ++ ro->gpu_fd = open("/dev/mali0", O_RDWR | O_CLOEXEC | O_NONBLOCK); ++ } ++ ++ if ((ro->gpu_fd >= 0) || noop) { + ro->create_for_resource = renderonly_create_kms_dumb_buffer_for_resource; + screen = panfrost_drm_screen_create_renderonly(ro); + if (!screen) +diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c +index 048106dccd5..71992ca72c5 100644 +--- a/src/mesa/main/shaderapi.c ++++ b/src/mesa/main/shaderapi.c +@@ -70,7 +70,6 @@ + #include "state_tracker/st_context.h" + #include "state_tracker/st_program.h" + +-#ifdef ENABLE_SHADER_CACHE + #if CUSTOM_SHADER_REPLACEMENT + #include "shader_replacement.h" + /* shader_replacement.h must declare a variable like this: +@@ -116,7 +115,6 @@ static char* load_shader_replacement(struct _shader_replacement *repl) + return NULL; + } + #endif +-#endif + + /** + * Return mask of GLSL_x flags by examining the MESA_GLSL env var. +@@ -1929,8 +1927,6 @@ _mesa_LinkProgram(GLuint programObj) + link_program_error(ctx, shProg); + } + +-#ifdef ENABLE_SHADER_CACHE +- + /** + * Construct a full path for shader replacement functionality using + * following format: +@@ -2063,8 +2059,6 @@ _mesa_read_shader_source(const gl_shader_stage stage, const char *source, + return buffer; + } + +-#endif /* ENABLE_SHADER_CACHE */ +- + /** + * Called via glShaderSource() and glShaderSourceARB() API functions. 
+ * Basically, concatenate the source code strings into one long string +@@ -2146,7 +2140,6 @@ shader_source(struct gl_context *ctx, GLuint shaderObj, GLsizei count, + uint8_t original_sha1[SHA1_DIGEST_LENGTH]; + _mesa_sha1_compute(source, strlen(source), original_sha1); + +-#ifdef ENABLE_SHADER_CACHE + GLcharARB *replacement; + + /* Dump original shader source to MESA_SHADER_DUMP_PATH and replace +@@ -2159,7 +2152,6 @@ shader_source(struct gl_context *ctx, GLuint shaderObj, GLsizei count, + free(source); + source = replacement; + } +-#endif /* ENABLE_SHADER_CACHE */ + + set_shader_source(sh, source, original_sha1); + +diff --git a/src/meson.build b/src/meson.build +index e5510452775..1890db00c0d 100644 +--- a/src/meson.build ++++ b/src/meson.build +@@ -77,6 +77,7 @@ if with_imgui + endif + if with_platform_wayland + subdir('egl/wayland/wayland-drm') ++ subdir('egl/wayland/mali-buffer-sharing') + endif + if with_any_vk or with_gallium_zink + subdir('vulkan') +diff --git a/src/panfrost/base/include/csf/mali_base_csf_kernel.h b/src/panfrost/base/include/csf/mali_base_csf_kernel.h +new file mode 100644 +index 00000000000..3b02350c08b +--- /dev/null ++++ b/src/panfrost/base/include/csf/mali_base_csf_kernel.h +@@ -0,0 +1,596 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_BASE_CSF_KERNEL_H_ ++#define _UAPI_BASE_CSF_KERNEL_H_ ++ ++#include ++#include "../mali_base_common_kernel.h" ++ ++/* Memory allocation, access/hint flags & mask specific to CSF GPU. ++ * ++ * See base_mem_alloc_flags. ++ */ ++ ++/* Must be FIXED memory. */ ++#define BASE_MEM_FIXED ((base_mem_alloc_flags)1 << 8) ++ ++/* CSF event memory ++ * ++ * If Outer shareable coherence is not specified or not available, then on ++ * allocation kbase will automatically use the uncached GPU mapping. ++ * There is no need for the client to specify BASE_MEM_UNCACHED_GPU ++ * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. ++ * ++ * This memory requires a permanent mapping ++ * ++ * See also kbase_reg_needs_kernel_mapping() ++ */ ++#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) ++ ++#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) ++ ++ ++/* Must be FIXABLE memory: its GPU VA will be determined at a later point, ++ * at which time it will be at a fixed GPU VA. ++ */ ++#define BASE_MEM_FIXABLE ((base_mem_alloc_flags)1 << 29) ++ ++/* Note that the number of bits used for base_mem_alloc_flags ++ * must be less than BASE_MEM_FLAGS_NR_BITS !!! ++ */ ++ ++/* A mask of all the flags which are only valid for allocations within kbase, ++ * and may not be passed from user space. 
++ */ ++#define BASEP_MEM_FLAGS_KERNEL_ONLY \ ++ (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) ++ ++/* A mask of all currently reserved flags ++ */ ++#define BASE_MEM_FLAGS_RESERVED BASE_MEM_RESERVED_BIT_20 ++ ++/* Special base mem handles specific to CSF. ++ */ ++#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) ++#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) ++ ++#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ ++ ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ ++ LOCAL_PAGE_SHIFT) ++ ++/* Valid set of just-in-time memory allocation flags */ ++#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) ++ ++/* flags for base context specific to CSF */ ++ ++/* Base context creates a CSF event notification thread. ++ * ++ * The creation of a CSF event notification thread is conditional but ++ * mandatory for the handling of CSF events. ++ */ ++#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) ++ ++/* Bitpattern describing the ::base_context_create_flags that can be ++ * passed to base_context_init() ++ */ ++#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ ++ (BASE_CONTEXT_CCTX_EMBEDDED | \ ++ BASE_CONTEXT_CSF_EVENT_THREAD | \ ++ BASEP_CONTEXT_CREATE_KERNEL_FLAGS) ++ ++/* Flags for base tracepoint specific to CSF */ ++ ++/* Enable KBase tracepoints for CSF builds */ ++#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) ++ ++/* Enable additional CSF Firmware side tracepoints */ ++#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) ++ ++#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ ++ BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ ++ BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ ++ BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) ++ ++/* Number of pages mapped into the process address space for a bound GPU ++ * command queue. A pair of input/output pages and a Hw doorbell page ++ * are mapped to enable direct submission of commands to Hw. ++ */ ++#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) ++ ++#define BASE_QUEUE_MAX_PRIORITY (15U) ++ ++/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ ++#define BASEP_EVENT_VAL_INDEX (0U) ++#define BASEP_EVENT_ERR_INDEX (1U) ++ ++/* The upper limit for number of objects that could be waited/set per command. ++ * This limit is now enforced as internally the error inherit inputs are ++ * converted to 32-bit flags in a __u32 variable occupying a previously padding ++ * field. ++ */ ++#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) ++ ++/* CSF CSI EXCEPTION_HANDLER_FLAGS */ ++#define BASE_CSF_TILER_OOM_EXCEPTION_FLAG (1u << 0) ++#define BASE_CSF_EXCEPTION_HANDLER_FLAGS_MASK (BASE_CSF_TILER_OOM_EXCEPTION_FLAG) ++ ++/** ++ * enum base_kcpu_command_type - Kernel CPU queue command type. 
++ * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, ++ * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, ++ * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, ++ * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, ++ * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, ++ * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, ++ * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, ++ * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, ++ * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, ++ */ ++enum base_kcpu_command_type { ++ BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, ++ BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, ++ BASE_KCPU_COMMAND_TYPE_CQS_WAIT, ++ BASE_KCPU_COMMAND_TYPE_CQS_SET, ++ BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, ++ BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, ++ BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, ++ BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, ++ BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, ++ BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, ++ BASE_KCPU_COMMAND_TYPE_JIT_FREE, ++ BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, ++ BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER ++}; ++ ++/** ++ * enum base_queue_group_priority - Priority of a GPU Command Queue Group. ++ * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group ++ * priority levels. ++ * ++ * Currently this is in order of highest to lowest, but if new levels are added ++ * then those new levels may be out of order to preserve the ABI compatibility ++ * with previous releases. At that point, ensure assignment to ++ * the 'priority' member in &kbase_queue_group is updated to ensure it remains ++ * a linear ordering. ++ * ++ * There should be no gaps in the enum, otherwise use of ++ * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. 
++ */ ++enum base_queue_group_priority { ++ BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, ++ BASE_QUEUE_GROUP_PRIORITY_MEDIUM, ++ BASE_QUEUE_GROUP_PRIORITY_LOW, ++ BASE_QUEUE_GROUP_PRIORITY_REALTIME, ++ BASE_QUEUE_GROUP_PRIORITY_COUNT ++}; ++ ++struct base_kcpu_command_fence_info { ++ __u64 fence; ++}; ++ ++struct base_cqs_wait_info { ++ __u64 addr; ++ __u32 val; ++ __u32 padding; ++}; ++ ++struct base_kcpu_command_cqs_wait_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 inherit_err_flags; ++}; ++ ++struct base_cqs_set { ++ __u64 addr; ++}; ++ ++struct base_kcpu_command_cqs_set_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 padding; ++}; ++ ++/** ++ * typedef basep_cqs_data_type - Enumeration of CQS Data Types ++ * ++ * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value ++ * is an unsigned 32-bit integer ++ * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value ++ * is an unsigned 64-bit integer ++ */ ++typedef enum PACKED { ++ BASEP_CQS_DATA_TYPE_U32 = 0, ++ BASEP_CQS_DATA_TYPE_U64 = 1, ++} basep_cqs_data_type; ++ ++/** ++ * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait ++ * Operation conditions ++ * ++ * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a ++ * wait will be satisfied when a CQS Object's ++ * value is Less than or Equal to ++ * the Wait Operation value ++ * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a ++ * wait will be satisfied when a CQS Object's ++ * value is Greater than the Wait Operation value ++ */ ++typedef enum { ++ BASEP_CQS_WAIT_OPERATION_LE = 0, ++ BASEP_CQS_WAIT_OPERATION_GT = 1, ++} basep_cqs_wait_operation_op; ++ ++struct base_cqs_wait_operation_info { ++ __u64 addr; ++ __u64 val; ++ __u8 operation; ++ __u8 data_type; ++ __u8 padding[6]; ++}; ++ ++/** ++ * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information ++ * about the Timeline CQS wait objects ++ * ++ * @objs: An array of Timeline CQS waits. ++ * @nr_objs: Number of Timeline CQS waits in the array. ++ * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field ++ * to be served as the source for importing into the ++ * queue's error-state. ++ */ ++struct base_kcpu_command_cqs_wait_operation_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 inherit_err_flags; ++}; ++ ++/** ++ * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations ++ * ++ * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value ++ * to a synchronization object ++ * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value ++ * of a synchronization object ++ */ ++typedef enum { ++ BASEP_CQS_SET_OPERATION_ADD = 0, ++ BASEP_CQS_SET_OPERATION_SET = 1, ++} basep_cqs_set_operation_op; ++ ++struct base_cqs_set_operation_info { ++ __u64 addr; ++ __u64 val; ++ __u8 operation; ++ __u8 data_type; ++ __u8 padding[6]; ++}; ++ ++/** ++ * struct base_kcpu_command_cqs_set_operation_info - structure which contains information ++ * about the Timeline CQS set objects ++ * ++ * @objs: An array of Timeline CQS sets. ++ * @nr_objs: Number of Timeline CQS sets in the array. ++ * @padding: Structure padding, unused bytes. ++ */ ++struct base_kcpu_command_cqs_set_operation_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_kcpu_command_import_info - structure which contains information ++ * about the imported buffer. ++ * ++ * @handle: Address of imported user buffer. 
++ */ ++struct base_kcpu_command_import_info { ++ __u64 handle; ++}; ++ ++/** ++ * struct base_kcpu_command_jit_alloc_info - structure which contains ++ * information about jit memory allocation. ++ * ++ * @info: An array of elements of the ++ * struct base_jit_alloc_info type. ++ * @count: The number of elements in the info array. ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_jit_alloc_info { ++ __u64 info; ++ __u8 count; ++ __u8 padding[7]; ++}; ++ ++/** ++ * struct base_kcpu_command_jit_free_info - structure which contains ++ * information about jit memory which is to be freed. ++ * ++ * @ids: An array containing the JIT IDs to free. ++ * @count: The number of elements in the ids array. ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_jit_free_info { ++ __u64 ids; ++ __u8 count; ++ __u8 padding[7]; ++}; ++ ++/** ++ * struct base_kcpu_command_group_suspend_info - structure which contains ++ * suspend buffer data captured for a suspended queue group. ++ * ++ * @buffer: Pointer to an array of elements of the type char. ++ * @size: Number of elements in the @buffer array. ++ * @group_handle: Handle to the mapping of CSG. ++ * @padding: padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_group_suspend_info { ++ __u64 buffer; ++ __u32 size; ++ __u8 group_handle; ++ __u8 padding[3]; ++}; ++ ++ ++/** ++ * struct base_kcpu_command - kcpu command. ++ * @type: type of the kcpu command, one enum base_kcpu_command_type ++ * @padding: padding to a multiple of 64 bits ++ * @info: structure which contains information about the kcpu command; ++ * actual type is determined by @p type ++ * @info.fence: Fence ++ * @info.cqs_wait: CQS wait ++ * @info.cqs_set: CQS set ++ * @info.cqs_wait_operation: CQS wait operation ++ * @info.cqs_set_operation: CQS set operation ++ * @info.import: import ++ * @info.jit_alloc: JIT allocation ++ * @info.jit_free: JIT deallocation ++ * @info.suspend_buf_copy: suspend buffer copy ++ * @info.sample_time: sample time ++ * @info.padding: padding ++ */ ++struct base_kcpu_command { ++ __u8 type; ++ __u8 padding[sizeof(__u64) - sizeof(__u8)]; ++ union { ++ struct base_kcpu_command_fence_info fence; ++ struct base_kcpu_command_cqs_wait_info cqs_wait; ++ struct base_kcpu_command_cqs_set_info cqs_set; ++ struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; ++ struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; ++ struct base_kcpu_command_import_info import; ++ struct base_kcpu_command_jit_alloc_info jit_alloc; ++ struct base_kcpu_command_jit_free_info jit_free; ++ struct base_kcpu_command_group_suspend_info suspend_buf_copy; ++ __u64 padding[2]; /* No sub-struct should be larger */ ++ } info; ++}; ++ ++/** ++ * struct basep_cs_stream_control - CSI capabilities. ++ * ++ * @features: Features of this stream ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct basep_cs_stream_control { ++ __u32 features; ++ __u32 padding; ++}; ++ ++/** ++ * struct basep_cs_group_control - CSG interface capabilities. ++ * ++ * @features: Features of this group ++ * @stream_num: Number of streams in this group ++ * @suspend_size: Size in bytes of the suspend buffer for this group ++ * @padding: Padding to a multiple of 64 bits. 
++ */ ++struct basep_cs_group_control { ++ __u32 features; ++ __u32 stream_num; ++ __u32 suspend_size; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault ++ * error information associated with GPU command queue group. ++ * ++ * @sideband: Additional information of the unrecoverable fault. ++ * @status: Unrecoverable fault information. ++ * This consists of exception type (least significant byte) and ++ * data (remaining bytes). One example of exception type is ++ * CS_INVALID_INSTRUCTION (0x49). ++ * @padding: Padding to make multiple of 64bits ++ */ ++struct base_gpu_queue_group_error_fatal_payload { ++ __u64 sideband; ++ __u32 status; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault ++ * error information related to GPU command queue. ++ * ++ * @sideband: Additional information about this unrecoverable fault. ++ * @status: Unrecoverable fault information. ++ * This consists of exception type (least significant byte) and ++ * data (remaining bytes). One example of exception type is ++ * CS_INVALID_INSTRUCTION (0x49). ++ * @csi_index: Index of the CSF interface the queue is bound to. ++ * @padding: Padding to make multiple of 64bits ++ */ ++struct base_gpu_queue_error_fatal_payload { ++ __u64 sideband; ++ __u32 status; ++ __u8 csi_index; ++ __u8 padding[3]; ++}; ++ ++/** ++ * enum base_gpu_queue_group_error_type - GPU Fatal error type. ++ * ++ * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU ++ * command queue group. ++ * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU ++ * command queue. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with ++ * progress timeout. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out ++ * of tiler heap memory. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types ++ * ++ * This type is used for &struct_base_gpu_queue_group_error.error_type. ++ */ ++enum base_gpu_queue_group_error_type { ++ BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, ++ BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, ++ BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, ++ BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, ++ BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT ++}; ++ ++/** ++ * struct base_gpu_queue_group_error - Unrecoverable fault information ++ * @error_type: Error type of @base_gpu_queue_group_error_type ++ * indicating which field in union payload is filled ++ * @padding: Unused bytes for 64bit boundary ++ * @payload: Input Payload ++ * @payload.fatal_group: Unrecoverable fault error associated with ++ * GPU command queue group ++ * @payload.fatal_queue: Unrecoverable fault error associated with command queue ++ */ ++struct base_gpu_queue_group_error { ++ __u8 error_type; ++ __u8 padding[7]; ++ union { ++ struct base_gpu_queue_group_error_fatal_payload fatal_group; ++ struct base_gpu_queue_error_fatal_payload fatal_queue; ++ } payload; ++}; ++ ++/** ++ * enum base_csf_notification_type - Notification type ++ * ++ * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event ++ * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal ++ * error ++ * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu ++ * queue ++ * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type ++ * ++ * This type is used for &struct_base_csf_notification.type. 
++ */ ++enum base_csf_notification_type { ++ BASE_CSF_NOTIFICATION_EVENT = 0, ++ BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, ++ BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, ++ BASE_CSF_NOTIFICATION_COUNT ++}; ++ ++/** ++ * struct base_csf_notification - Event or error notification ++ * ++ * @type: Notification type of @base_csf_notification_type ++ * @padding: Padding for 64bit boundary ++ * @payload: Input Payload ++ * @payload.align: To fit the struct into a 64-byte cache line ++ * @payload.csg_error: CSG error ++ * @payload.csg_error.handle: Handle of GPU command queue group associated with ++ * fatal error ++ * @payload.csg_error.padding: Padding ++ * @payload.csg_error.error: Unrecoverable fault error ++ * ++ */ ++struct base_csf_notification { ++ __u8 type; ++ __u8 padding[7]; ++ union { ++ struct { ++ __u8 handle; ++ __u8 padding[7]; ++ struct base_gpu_queue_group_error error; ++ } csg_error; ++ ++ __u8 align[56]; ++ } payload; ++}; ++ ++/** ++ * struct mali_base_gpu_core_props - GPU core props info ++ * ++ * @product_id: Pro specific value. ++ * @version_status: Status of the GPU release. No defined values, but starts at ++ * 0 and increases by one for each release status (alpha, beta, EAC, etc.). ++ * 4 bit values (0-15). ++ * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" ++ * release number. ++ * 8 bit values (0-255). ++ * @major_revision: Major release number of the GPU. "R" part of an "RnPn" ++ * release number. ++ * 4 bit values (0-15). ++ * @padding: padding to align to 8-byte ++ * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by ++ * clGetDeviceInfo() ++ * @log2_program_counter_size: Size of the shader program counter, in bits. ++ * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This ++ * is a bitpattern where a set bit indicates that the format is supported. ++ * Before using a texture format, it is recommended that the corresponding ++ * bit be checked. ++ * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. ++ * It is unlikely that a client will be able to allocate all of this memory ++ * for their own purposes, but this at least provides an upper bound on the ++ * memory available to the GPU. ++ * This is required for OpenCL's clGetDeviceInfo() call when ++ * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The ++ * client will not be expecting to allocate anywhere near this value. ++ */ ++struct mali_base_gpu_core_props { ++ __u32 product_id; ++ __u16 version_status; ++ __u16 minor_revision; ++ __u16 major_revision; ++ __u16 padding; ++ __u32 gpu_freq_khz_max; ++ __u32 log2_program_counter_size; ++ __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ __u64 gpu_available_memory_size; ++}; ++ ++#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ +diff --git a/src/panfrost/base/include/csf/mali_gpu_csf_registers.h b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h +new file mode 100644 +index 00000000000..17e338cb238 +--- /dev/null ++++ b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++/* ++ * This header was originally autogenerated, but it is now ok (and ++ * expected) to have to add to it. ++ */ ++ ++#ifndef _UAPI_GPU_CSF_REGISTERS_H_ ++#define _UAPI_GPU_CSF_REGISTERS_H_ ++ ++/* Only user block defines are included. HI words have been removed */ ++ ++/* CS_USER_INPUT_BLOCK register offsets */ ++#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ ++#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ ++ ++/* CS_USER_OUTPUT_BLOCK register offsets */ ++#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ ++#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ ++ ++/* USER register offsets */ ++#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ ++ ++#endif +diff --git a/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h +new file mode 100644 +index 00000000000..db7252605f0 +--- /dev/null ++++ b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h +@@ -0,0 +1,530 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_KBASE_CSF_IOCTL_H_ ++#define _UAPI_KBASE_CSF_IOCTL_H_ ++ ++#include ++#include ++ ++/* ++ * 1.0: ++ * - CSF IOCTL header separated from JM ++ * 1.1: ++ * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME ++ * - Add ioctl 54: This controls the priority setting. ++ * 1.2: ++ * - Add new CSF GPU_FEATURES register into the property structure ++ * returned by KBASE_IOCTL_GET_GPUPROPS ++ * 1.3: ++ * - Add __u32 group_uid member to ++ * &struct_kbase_ioctl_cs_queue_group_create.out ++ * 1.4: ++ * - Replace padding in kbase_ioctl_cs_get_glb_iface with ++ * instr_features member of same size ++ * 1.5: ++ * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new ++ * queue registration call with extended format for supporting CS ++ * trace configurations with CSF trace_command. ++ * 1.6: ++ * - Added new HW performance counters interface to all GPUs. 
++ * 1.7: ++ * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use ++ * 1.8: ++ * - Removed Kernel legacy HWC interface ++ * 1.9: ++ * - Reorganization of GPU-VA memory zones, including addition of ++ * FIXED_VA zone and auto-initialization of EXEC_VA zone. ++ * - Added new Base memory allocation interface ++ * 1.10: ++ * - First release of new HW performance counters interface. ++ * 1.11: ++ * - Dummy model (no mali) backend will now clear HWC values after each sample ++ * 1.12: ++ * - Added support for incremental rendering flag in CSG create call ++ */ ++ ++#define BASE_UK_VERSION_MAJOR 1 ++#define BASE_UK_VERSION_MINOR 12 ++ ++/** ++ * struct kbase_ioctl_version_check - Check version compatibility between ++ * kernel and userspace ++ * ++ * @major: Major version number ++ * @minor: Minor version number ++ */ ++struct kbase_ioctl_version_check { ++ __u16 major; ++ __u16 minor; ++}; ++ ++#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ ++ _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) ++ ++/** ++ * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the ++ * base back-end ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @buffer_size: Size of the buffer in bytes ++ * @priority: Priority of the queue within a group when run within a process ++ * @padding: Currently unused, must be zero ++ * ++ * Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. ++ * Any change of this struct should also be mirrored to the latter. ++ */ ++struct kbase_ioctl_cs_queue_register { ++ __u64 buffer_gpu_addr; ++ __u32 buffer_size; ++ __u8 priority; ++ __u8 padding[3]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_REGISTER \ ++ _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) ++ ++/** ++ * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler ++ * to notify that a queue has been updated ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ */ ++struct kbase_ioctl_cs_queue_kick { ++ __u64 buffer_gpu_addr; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_KICK \ ++ _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) ++ ++/** ++ * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group ++ * ++ * @in: Input parameters ++ * @in.buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @in.group_handle: Handle of the group to which the queue should be bound ++ * @in.csi_index: Index of the CSF interface the queue should be bound to ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.mmap_handle: Handle to be used for creating the mapping of CS ++ * input/output pages ++ */ ++union kbase_ioctl_cs_queue_bind { ++ struct { ++ __u64 buffer_gpu_addr; ++ __u8 group_handle; ++ __u8 csi_index; ++ __u8 padding[6]; ++ } in; ++ struct { ++ __u64 mmap_handle; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_BIND \ ++ _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) ++ ++/** ++ * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the ++ * base back-end in extended format, ++ * involving trace buffer configuration ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @buffer_size: Size of the buffer in bytes ++ * @priority: Priority of the queue within a group when run within a process ++ * @padding: Currently unused, must be zero ++ * @ex_offset_var_addr: GPU address of the trace buffer write offset variable ++ * @ex_buffer_base: Trace buffer GPU base address for the 
queue ++ * @ex_buffer_size: Size of the trace buffer in bytes ++ * @ex_event_size: Trace event write size, in log2 designation ++ * @ex_event_state: Trace event states configuration ++ * @ex_padding: Currently unused, must be zero ++ * ++ * Note: There is an identical sub-section at the start of this struct to that ++ * of @ref kbase_ioctl_cs_queue_register. Any change of this sub-section ++ * must also be mirrored to the latter. Following the said sub-section, ++ * the remaining fields forms the extension, marked with ex_*. ++ */ ++struct kbase_ioctl_cs_queue_register_ex { ++ __u64 buffer_gpu_addr; ++ __u32 buffer_size; ++ __u8 priority; ++ __u8 padding[3]; ++ __u64 ex_offset_var_addr; ++ __u64 ex_buffer_base; ++ __u32 ex_buffer_size; ++ __u8 ex_event_size; ++ __u8 ex_event_state; ++ __u8 ex_padding[2]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ ++ _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) ++ ++/** ++ * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ */ ++struct kbase_ioctl_cs_queue_terminate { ++ __u64 buffer_gpu_addr; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ ++ _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) ++ ++/** ++ * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue ++ * group ++ * @in: Input parameters ++ * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. ++ * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. ++ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. ++ * @in.cs_min: Minimum number of CSs required. ++ * @in.priority: Queue group's priority within a process. ++ * @in.tiler_max: Maximum number of tiler endpoints the group is allowed ++ * to use. ++ * @in.fragment_max: Maximum number of fragment endpoints the group is ++ * allowed to use. ++ * @in.compute_max: Maximum number of compute endpoints the group is allowed ++ * to use. ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.group_handle: Handle of a newly created queue group. ++ * @out.padding: Currently unused, must be zero ++ * @out.group_uid: UID of the queue group available to base. ++ */ ++union kbase_ioctl_cs_queue_group_create_1_6 { ++ struct { ++ __u64 tiler_mask; ++ __u64 fragment_mask; ++ __u64 compute_mask; ++ __u8 cs_min; ++ __u8 priority; ++ __u8 tiler_max; ++ __u8 fragment_max; ++ __u8 compute_max; ++ __u8 padding[3]; ++ ++ } in; ++ struct { ++ __u8 group_handle; ++ __u8 padding[3]; ++ __u32 group_uid; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ ++ _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) ++ ++/** ++ * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group ++ * @in: Input parameters ++ * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. ++ * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. ++ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. ++ * @in.cs_min: Minimum number of CSs required. ++ * @in.priority: Queue group's priority within a process. ++ * @in.tiler_max: Maximum number of tiler endpoints the group is allowed ++ * to use. ++ * @in.fragment_max: Maximum number of fragment endpoints the group is ++ * allowed to use. ++ * @in.compute_max: Maximum number of compute endpoints the group is allowed ++ * to use. 
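/*
 * Illustrative sketch only: the queue setup sequence suggested by the
 * ioctls above - register the ring buffer, bind it to an existing queue
 * group, then kick it. The include arrangement, the already-open kbase
 * fd and the pre-created group handle are assumptions; error handling
 * and the mmap() of the returned mmap_handle are omitted.
 */
#include <sys/ioctl.h>
#include <linux/types.h>
#include "mali_kbase_ioctl.h"   /* assumed to pull in the CSF ioctl header */

static int example_setup_cs_queue(int kbase_fd, __u64 ringbuf_gpu_va,
                                  __u32 ringbuf_size, __u8 group_handle,
                                  __u8 csi_index, __u64 *mmap_handle)
{
    struct kbase_ioctl_cs_queue_register reg = {
        .buffer_gpu_addr = ringbuf_gpu_va,
        .buffer_size = ringbuf_size,
        /* priority and padding stay zero */
    };
    union kbase_ioctl_cs_queue_bind bind = { .in = {
        .buffer_gpu_addr = ringbuf_gpu_va,
        .group_handle = group_handle,
        .csi_index = csi_index,
    } };
    struct kbase_ioctl_cs_queue_kick kick = {
        .buffer_gpu_addr = ringbuf_gpu_va,
    };

    if (ioctl(kbase_fd, KBASE_IOCTL_CS_QUEUE_REGISTER, &reg) < 0)
        return -1;
    if (ioctl(kbase_fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind) < 0)
        return -1;
    *mmap_handle = bind.out.mmap_handle;

    return ioctl(kbase_fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick);
}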
++ * @in.csi_handlers: Flags to signal that the application intends to use CSI ++ * exception handlers in some linear buffers to deal with ++ * the given exception types. ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.group_handle: Handle of a newly created queue group. ++ * @out.padding: Currently unused, must be zero ++ * @out.group_uid: UID of the queue group available to base. ++ */ ++union kbase_ioctl_cs_queue_group_create { ++ struct { ++ __u64 tiler_mask; ++ __u64 fragment_mask; ++ __u64 compute_mask; ++ __u8 cs_min; ++ __u8 priority; ++ __u8 tiler_max; ++ __u8 fragment_max; ++ __u8 compute_max; ++ __u8 csi_handlers; ++ __u8 padding[2]; ++ /** ++ * @in.reserved: Reserved ++ */ ++ __u64 reserved; ++ } in; ++ struct { ++ __u8 group_handle; ++ __u8 padding[3]; ++ __u32 group_uid; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ ++ _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) ++ ++/** ++ * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group ++ * ++ * @group_handle: Handle of the queue group to be terminated ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_cs_queue_group_term { ++ __u8 group_handle; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ ++ _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) ++ ++#define KBASE_IOCTL_CS_EVENT_SIGNAL \ ++ _IO(KBASE_IOCTL_TYPE, 44) ++ ++typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue ++ * ++ * @id: ID of the new command queue returned by the kernel ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_new { ++ base_kcpu_queue_id id; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ ++ _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue ++ * ++ * @id: ID of the command queue to be destroyed ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_delete { ++ base_kcpu_queue_id id; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ ++ _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue ++ * ++ * @addr: Memory address of an array of struct base_kcpu_queue_command ++ * @nr_commands: Number of commands in the array ++ * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_enqueue { ++ __u64 addr; ++ __u32 nr_commands; ++ base_kcpu_queue_id id; ++ __u8 padding[3]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ ++ _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) ++ ++/** ++ * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap ++ * @in: Input parameters ++ * @in.chunk_size: Size of each chunk. ++ * @in.initial_chunks: Initial number of chunks that heap will be created with. ++ * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. ++ * @in.target_in_flight: Number of render-passes that the driver should attempt to ++ * keep in flight for which allocation of new chunks is ++ * allowed. 
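/*
 * Illustrative sketch only: creating and later terminating a queue group
 * with the structures above. The endpoint masks and maximum counts are
 * placeholders; real values depend on the GPU topology reported by the
 * kernel. Fields left out of the initializer (csi_handlers, padding,
 * reserved) stay zero, as the comments above require.
 */
#include <sys/ioctl.h>
#include <linux/types.h>
#include "mali_kbase_ioctl.h"   /* assumed to pull in the CSF ioctl header */

static int example_create_queue_group(int kbase_fd, __u8 *group_handle)
{
    union kbase_ioctl_cs_queue_group_create create = { .in = {
        .tiler_mask = ~0ULL,     /* placeholder: allow every tiler endpoint */
        .fragment_mask = ~0ULL,
        .compute_mask = ~0ULL,
        .cs_min = 1,
        .priority = 0,
        .tiler_max = 64,         /* placeholder maximums */
        .fragment_max = 64,
        .compute_max = 64,
    } };

    if (ioctl(kbase_fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE, &create) < 0)
        return -1;

    *group_handle = create.out.group_handle;
    return 0;
}

static int example_destroy_queue_group(int kbase_fd, __u8 group_handle)
{
    struct kbase_ioctl_cs_queue_group_term term = {
        .group_handle = group_handle,
    };

    return ioctl(kbase_fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term);
}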
++ * @in.group_id: Group ID to be used for physical allocations. ++ * @in.padding: Padding ++ * @out: Output parameters ++ * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up ++ * for the heap. ++ * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, ++ * actually points to the header of heap chunk and not to ++ * the low address of free memory in the chunk. ++ */ ++union kbase_ioctl_cs_tiler_heap_init { ++ struct { ++ __u32 chunk_size; ++ __u32 initial_chunks; ++ __u32 max_chunks; ++ __u16 target_in_flight; ++ __u8 group_id; ++ __u8 padding; ++ } in; ++ struct { ++ __u64 gpu_heap_va; ++ __u64 first_chunk_va; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ ++ _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) ++ ++/** ++ * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap ++ * instance ++ * ++ * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. ++ */ ++struct kbase_ioctl_cs_tiler_heap_term { ++ __u64 gpu_heap_va; ++}; ++ ++#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ ++ _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) ++ ++/** ++ * union kbase_ioctl_cs_get_glb_iface - Request the global control block ++ * of CSF interface capabilities ++ * ++ * @in: Input parameters ++ * @in.max_group_num: The maximum number of groups to be read. Can be 0, in ++ * which case groups_ptr is unused. ++ * @in.max_total_stream_num: The maximum number of CSs to be read. Can be 0, in ++ * which case streams_ptr is unused. ++ * @in.groups_ptr: Pointer where to store all the group data (sequentially). ++ * @in.streams_ptr: Pointer where to store all the CS data (sequentially). ++ * @out: Output parameters ++ * @out.glb_version: Global interface version. ++ * @out.features: Bit mask of features (e.g. whether certain types of job ++ * can be suspended). ++ * @out.group_num: Number of CSGs supported. ++ * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 ++ * hold the size of firmware performance counter data ++ * and 15:0 hold the size of hardware performance counter ++ * data. ++ * @out.total_stream_num: Total number of CSs, summed across all groups. ++ * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum ++ * size of events. Bits 3:0 hold the offset update rate. 
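/*
 * Illustrative sketch only: setting up and tearing down a chunked tiler
 * heap with the union above. The chunk geometry shown is an arbitrary
 * example, not a recommended configuration; the include arrangement and
 * kbase fd are assumptions.
 */
#include <sys/ioctl.h>
#include <linux/types.h>
#include "mali_kbase_ioctl.h"   /* assumed to pull in the CSF ioctl header */

static int example_tiler_heap_init(int kbase_fd, __u64 *heap_ctx_va,
                                   __u64 *first_chunk_va)
{
    union kbase_ioctl_cs_tiler_heap_init heap = { .in = {
        .chunk_size = 1u << 21,   /* example: 2 MiB chunks */
        .initial_chunks = 5,
        .max_chunks = 200,
        .target_in_flight = 2,
        /* group_id and padding stay zero */
    } };

    if (ioctl(kbase_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &heap) < 0)
        return -1;

    *heap_ctx_va = heap.out.gpu_heap_va;
    *first_chunk_va = heap.out.first_chunk_va;
    return 0;
}

static int example_tiler_heap_term(int kbase_fd, __u64 heap_ctx_va)
{
    struct kbase_ioctl_cs_tiler_heap_term term = {
        .gpu_heap_va = heap_ctx_va,
    };

    return ioctl(kbase_fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term);
}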
++ * (csf >= 1.1.0) ++ * ++ */ ++union kbase_ioctl_cs_get_glb_iface { ++ struct { ++ __u32 max_group_num; ++ __u32 max_total_stream_num; ++ __u64 groups_ptr; ++ __u64 streams_ptr; ++ } in; ++ struct { ++ __u32 glb_version; ++ __u32 features; ++ __u32 group_num; ++ __u32 prfcnt_size; ++ __u32 total_stream_num; ++ __u32 instr_features; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_GET_GLB_IFACE \ ++ _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) ++ ++struct kbase_ioctl_cs_cpu_queue_info { ++ __u64 buffer; ++ __u64 size; ++}; ++ ++#define KBASE_IOCTL_VERSION_CHECK \ ++ _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) ++ ++#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ ++ _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) ++ ++/** ++ * union kbase_ioctl_mem_alloc_ex - Allocate memory on the GPU ++ * @in: Input parameters ++ * @in.va_pages: The number of pages of virtual address space to reserve ++ * @in.commit_pages: The number of physical pages to allocate ++ * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region ++ * @in.flags: Flags ++ * @in.fixed_address: The GPU virtual address requested for the allocation, ++ * if the allocation is using the BASE_MEM_FIXED flag. ++ * @in.extra: Space for extra parameters that may be added in the future. ++ * @out: Output parameters ++ * @out.flags: Flags ++ * @out.gpu_va: The GPU virtual address which is allocated ++ */ ++union kbase_ioctl_mem_alloc_ex { ++ struct { ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u64 flags; ++ __u64 fixed_address; ++ __u64 extra[3]; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_ALLOC_EX _IOWR(KBASE_IOCTL_TYPE, 59, union kbase_ioctl_mem_alloc_ex) ++ ++/*************** ++ * test ioctls * ++ ***************/ ++#if MALI_UNIT_TEST ++/* These ioctls are purely for test purposes and are not used in the production ++ * driver, they therefore may change without notice ++ */ ++ ++/** ++ * struct kbase_ioctl_cs_event_memory_write - Write an event memory address ++ * @cpu_addr: Memory address to write ++ * @value: Value to write ++ * @padding: Currently unused, must be zero ++ */ ++struct kbase_ioctl_cs_event_memory_write { ++ __u64 cpu_addr; ++ __u8 value; ++ __u8 padding[7]; ++}; ++ ++/** ++ * union kbase_ioctl_cs_event_memory_read - Read an event memory address ++ * @in: Input parameters ++ * @in.cpu_addr: Memory address to read ++ * @out: Output parameters ++ * @out.value: Value read ++ * @out.padding: Currently unused, must be zero ++ */ ++union kbase_ioctl_cs_event_memory_read { ++ struct { ++ __u64 cpu_addr; ++ } in; ++ struct { ++ __u8 value; ++ __u8 padding[7]; ++ } out; ++}; ++ ++#endif /* MALI_UNIT_TEST */ ++ ++#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ +diff --git a/src/panfrost/base/include/jm/mali_base_jm_kernel.h b/src/panfrost/base/include/jm/mali_base_jm_kernel.h +new file mode 100644 +index 00000000000..ae43908b936 +--- /dev/null ++++ b/src/panfrost/base/include/jm/mali_base_jm_kernel.h +@@ -0,0 +1,1051 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_BASE_JM_KERNEL_H_ ++#define _UAPI_BASE_JM_KERNEL_H_ ++ ++#include ++#include "../mali_base_common_kernel.h" ++ ++/* Memory allocation, access/hint flags & mask specific to JM GPU. ++ * ++ * See base_mem_alloc_flags. ++ */ ++ ++/* Used as BASE_MEM_FIXED in other backends */ ++#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) ++ ++/** ++ * BASE_MEM_RESERVED_BIT_19 - Bit 19 is reserved. ++ * ++ * Do not remove, use the next unreserved bit for new flags ++ */ ++#define BASE_MEM_RESERVED_BIT_19 ((base_mem_alloc_flags)1 << 19) ++ ++/** ++ * BASE_MEM_TILER_ALIGN_TOP - Memory starting from the end of the initial commit is aligned ++ * to 'extension' pages, where 'extension' must be a power of 2 and no more than ++ * BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES ++ */ ++#define BASE_MEM_TILER_ALIGN_TOP ((base_mem_alloc_flags)1 << 20) ++ ++/* Use the GPU VA chosen by the kernel client */ ++#define BASE_MEM_FLAG_MAP_FIXED ((base_mem_alloc_flags)1 << 27) ++ ++/* Force trimming of JIT allocations when creating a new allocation */ ++#define BASEP_MEM_PERFORM_JIT_TRIM ((base_mem_alloc_flags)1 << 29) ++ ++/* Note that the number of bits used for base_mem_alloc_flags ++ * must be less than BASE_MEM_FLAGS_NR_BITS !!! ++ */ ++ ++/* A mask of all the flags which are only valid for allocations within kbase, ++ * and may not be passed from user space. ++ */ ++#define BASEP_MEM_FLAGS_KERNEL_ONLY \ ++ (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE | \ ++ BASE_MEM_FLAG_MAP_FIXED | BASEP_MEM_PERFORM_JIT_TRIM) ++ ++/* A mask of all currently reserved flags ++ */ ++#define BASE_MEM_FLAGS_RESERVED \ ++ (BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_19) ++ ++ ++/* Similar to BASE_MEM_TILER_ALIGN_TOP, memory starting from the end of the ++ * initial commit is aligned to 'extension' pages, where 'extension' must be a power ++ * of 2 and no more than BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES ++ */ ++#define BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP (1 << 0) ++ ++/** ++ * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE - If set, the heap info address points ++ * to a __u32 holding the used size in bytes; ++ * otherwise it points to a __u64 holding the lowest address of unused memory. ++ */ ++#define BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE (1 << 1) ++ ++/** ++ * BASE_JIT_ALLOC_VALID_FLAGS - Valid set of just-in-time memory allocation flags ++ * ++ * Note: BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE cannot be set if heap_info_gpu_addr ++ * in %base_jit_alloc_info is 0 (atom with BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE set ++ * and heap_info_gpu_addr being 0 will be rejected). ++ */ ++#define BASE_JIT_ALLOC_VALID_FLAGS \ ++ (BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP | BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE) ++ ++/* Bitpattern describing the ::base_context_create_flags that can be ++ * passed to base_context_init() ++ */ ++#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ ++ (BASE_CONTEXT_CCTX_EMBEDDED | BASEP_CONTEXT_CREATE_KERNEL_FLAGS) ++ ++/* ++ * Private flags used on the base context ++ * ++ * These start at bit 31, and run down to zero. 
++ * ++ * They share the same space as base_context_create_flags, and so must ++ * not collide with them. ++ */ ++ ++/* Private flag tracking whether job descriptor dumping is disabled */ ++#define BASEP_CONTEXT_FLAG_JOB_DUMP_DISABLED \ ++ ((base_context_create_flags)(1 << 31)) ++ ++/* Flags for base tracepoint specific to JM */ ++#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ ++ BASE_TLSTREAM_JOB_DUMPING_ENABLED) ++/* ++ * Dependency stuff, keep it private for now. May want to expose it if ++ * we decide to make the number of semaphores a configurable ++ * option. ++ */ ++#define BASE_JD_ATOM_COUNT 256 ++ ++/* Maximum number of concurrent render passes. ++ */ ++#define BASE_JD_RP_COUNT (256) ++ ++/* Set/reset values for a software event */ ++#define BASE_JD_SOFT_EVENT_SET ((unsigned char)1) ++#define BASE_JD_SOFT_EVENT_RESET ((unsigned char)0) ++ ++/** ++ * struct base_jd_udata - Per-job data ++ * ++ * @blob: per-job data array ++ * ++ * This structure is used to store per-job data, and is completely unused ++ * by the Base driver. It can be used to store things such as callback ++ * function pointer, data to handle job completion. It is guaranteed to be ++ * untouched by the Base driver. ++ */ ++struct base_jd_udata { ++ __u64 blob[2]; ++}; ++ ++/** ++ * typedef base_jd_dep_type - Job dependency type. ++ * ++ * A flags field will be inserted into the atom structure to specify whether a ++ * dependency is a data or ordering dependency (by putting it before/after ++ * 'core_req' in the structure it should be possible to add without changing ++ * the structure size). ++ * When the flag is set for a particular dependency to signal that it is an ++ * ordering only dependency then errors will not be propagated. ++ */ ++typedef __u8 base_jd_dep_type; ++ ++#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ ++#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ ++#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ ++ ++/** ++ * typedef base_jd_core_req - Job chain hardware requirements. ++ * ++ * A job chain must specify what GPU features it needs to allow the ++ * driver to schedule the job correctly. By not specifying the ++ * correct settings can/will cause an early job termination. Multiple ++ * values can be ORed together to specify multiple requirements. ++ * Special case is ::BASE_JD_REQ_DEP, which is used to express complex ++ * dependencies, and that doesn't execute anything on the hardware. ++ */ ++typedef __u32 base_jd_core_req; ++ ++/* Requirements that come from the HW */ ++ ++/* No requirement, dependency only ++ */ ++#define BASE_JD_REQ_DEP ((base_jd_core_req)0) ++ ++/* Requires fragment shaders ++ */ ++#define BASE_JD_REQ_FS ((base_jd_core_req)1 << 0) ++ ++/* Requires compute shaders ++ * ++ * This covers any of the following GPU job types: ++ * - Vertex Shader Job ++ * - Geometry Shader Job ++ * - An actual Compute Shader Job ++ * ++ * Compare this with BASE_JD_REQ_ONLY_COMPUTE, which specifies that the ++ * job is specifically just the "Compute Shader" job type, and not the "Vertex ++ * Shader" nor the "Geometry Shader" job type. 
++ */ ++#define BASE_JD_REQ_CS ((base_jd_core_req)1 << 1) ++ ++/* Requires tiling */ ++#define BASE_JD_REQ_T ((base_jd_core_req)1 << 2) ++ ++/* Requires cache flushes */ ++#define BASE_JD_REQ_CF ((base_jd_core_req)1 << 3) ++ ++/* Requires value writeback */ ++#define BASE_JD_REQ_V ((base_jd_core_req)1 << 4) ++ ++/* SW-only requirements - the HW does not expose these as part of the job slot ++ * capabilities ++ */ ++ ++/* Requires fragment job with AFBC encoding */ ++#define BASE_JD_REQ_FS_AFBC ((base_jd_core_req)1 << 13) ++ ++/* SW-only requirement: coalesce completion events. ++ * If this bit is set then completion of this atom will not cause an event to ++ * be sent to userspace, whether successful or not; completion events will be ++ * deferred until an atom completes which does not have this bit set. ++ * ++ * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. ++ */ ++#define BASE_JD_REQ_EVENT_COALESCE ((base_jd_core_req)1 << 5) ++ ++/* SW Only requirement: the job chain requires a coherent core group. We don't ++ * mind which coherent core group is used. ++ */ ++#define BASE_JD_REQ_COHERENT_GROUP ((base_jd_core_req)1 << 6) ++ ++/* SW Only requirement: The performance counters should be enabled only when ++ * they are needed, to reduce power consumption. ++ */ ++#define BASE_JD_REQ_PERMON ((base_jd_core_req)1 << 7) ++ ++/* SW Only requirement: External resources are referenced by this atom. ++ * ++ * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE and ++ * BASE_JD_REQ_SOFT_EVENT_WAIT. ++ */ ++#define BASE_JD_REQ_EXTERNAL_RESOURCES ((base_jd_core_req)1 << 8) ++ ++/* SW Only requirement: Software defined job. Jobs with this bit set will not be ++ * submitted to the hardware but will cause some action to happen within the ++ * driver ++ */ ++#define BASE_JD_REQ_SOFT_JOB ((base_jd_core_req)1 << 9) ++ ++#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) ++#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) ++#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) ++ ++/* 0x4 RESERVED for now */ ++ ++/* SW only requirement: event wait/trigger job. ++ * ++ * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. ++ * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the ++ * other waiting jobs. It completes immediately. ++ * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it ++ * possible for other jobs to wait upon. It completes immediately. ++ */ ++#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) ++#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) ++#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) ++ ++#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) ++ ++/* SW only requirement: Just In Time allocation ++ * ++ * This job requests a single or multiple just-in-time allocations through a ++ * list of base_jit_alloc_info structure which is passed via the jc element of ++ * the atom. The number of base_jit_alloc_info structures present in the ++ * list is passed via the nr_extres element of the atom ++ * ++ * It should be noted that the id entry in base_jit_alloc_info must not ++ * be reused until it has been released via BASE_JD_REQ_SOFT_JIT_FREE. ++ * ++ * Should this soft job fail it is expected that a BASE_JD_REQ_SOFT_JIT_FREE ++ * soft job to free the JIT allocation is still made. ++ * ++ * The job will complete immediately. 
++ */ ++#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) ++ ++/* SW only requirement: Just In Time free ++ * ++ * This job requests a single or multiple just-in-time allocations created by ++ * BASE_JD_REQ_SOFT_JIT_ALLOC to be freed. The ID list of the just-in-time ++ * allocations is passed via the jc element of the atom. ++ * ++ * The job will complete immediately. ++ */ ++#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) ++ ++/* SW only requirement: Map external resource ++ * ++ * This job requests external resource(s) are mapped once the dependencies ++ * of the job have been satisfied. The list of external resources are ++ * passed via the jc element of the atom which is a pointer to a ++ * base_external_resource_list. ++ */ ++#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) ++ ++/* SW only requirement: Unmap external resource ++ * ++ * This job requests external resource(s) are unmapped once the dependencies ++ * of the job has been satisfied. The list of external resources are ++ * passed via the jc element of the atom which is a pointer to a ++ * base_external_resource_list. ++ */ ++#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) ++ ++/* HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) ++ * ++ * This indicates that the Job Chain contains GPU jobs of the 'Compute ++ * Shaders' type. ++ * ++ * In contrast to BASE_JD_REQ_CS, this does not indicate that the Job ++ * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. ++ */ ++#define BASE_JD_REQ_ONLY_COMPUTE ((base_jd_core_req)1 << 10) ++ ++/* HW Requirement: Use the base_jd_atom::device_nr field to specify a ++ * particular core group ++ * ++ * If both BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag ++ * takes priority ++ * ++ * This is only guaranteed to work for BASE_JD_REQ_ONLY_COMPUTE atoms. ++ */ ++#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((base_jd_core_req)1 << 11) ++ ++/* SW Flag: If this bit is set then the successful completion of this atom ++ * will not cause an event to be sent to userspace ++ */ ++#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((base_jd_core_req)1 << 12) ++ ++/* SW Flag: If this bit is set then completion of this atom will not cause an ++ * event to be sent to userspace, whether successful or not. ++ */ ++#define BASEP_JD_REQ_EVENT_NEVER ((base_jd_core_req)1 << 14) ++ ++/* SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. ++ * ++ * If this bit is set then the GPU's cache will not be cleaned and invalidated ++ * until a GPU job starts which does not have this bit set or a job completes ++ * which does not have the BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use ++ * if the CPU may have written to memory addressed by the job since the last job ++ * without this bit set was submitted. ++ */ ++#define BASE_JD_REQ_SKIP_CACHE_START ((base_jd_core_req)1 << 15) ++ ++/* SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. ++ * ++ * If this bit is set then the GPU's cache will not be cleaned and invalidated ++ * until a GPU job completes which does not have this bit set or a job starts ++ * which does not have the BASE_JD_REQ_SKIP_CACHE_START bit set. Do not use ++ * if the CPU may read from or partially overwrite memory addressed by the job ++ * before the next job without this bit set completes. ++ */ ++#define BASE_JD_REQ_SKIP_CACHE_END ((base_jd_core_req)1 << 16) ++ ++/* Request the atom be executed on a specific job slot. 
++ * ++ * When this flag is specified, it takes precedence over any existing job slot ++ * selection logic. ++ */ ++#define BASE_JD_REQ_JOB_SLOT ((base_jd_core_req)1 << 17) ++ ++/* SW-only requirement: The atom is the start of a renderpass. ++ * ++ * If this bit is set then the job chain will be soft-stopped if it causes the ++ * GPU to write beyond the end of the physical pages backing the tiler heap, and ++ * committing more memory to the heap would exceed an internal threshold. It may ++ * be resumed after running one of the job chains attached to an atom with ++ * BASE_JD_REQ_END_RENDERPASS set and the same renderpass ID. It may be ++ * resumed multiple times until it completes without memory usage exceeding the ++ * threshold. ++ * ++ * Usually used with BASE_JD_REQ_T. ++ */ ++#define BASE_JD_REQ_START_RENDERPASS ((base_jd_core_req)1 << 18) ++ ++/* SW-only requirement: The atom is the end of a renderpass. ++ * ++ * If this bit is set then the atom incorporates the CPU address of a ++ * base_jd_fragment object instead of the GPU address of a job chain. ++ * ++ * Which job chain is run depends upon whether the atom with the same renderpass ++ * ID and the BASE_JD_REQ_START_RENDERPASS bit set completed normally or ++ * was soft-stopped when it exceeded an upper threshold for tiler heap memory ++ * usage. ++ * ++ * It also depends upon whether one of the job chains attached to the atom has ++ * already been run as part of the same renderpass (in which case it would have ++ * written unresolved multisampled and otherwise-discarded output to temporary ++ * buffers that need to be read back). The job chain for doing a forced read and ++ * forced write (from/to temporary buffers) is run as many times as necessary. ++ * ++ * Usually used with BASE_JD_REQ_FS. ++ */ ++#define BASE_JD_REQ_END_RENDERPASS ((base_jd_core_req)1 << 19) ++ ++/* SW-only requirement: The atom needs to run on a limited core mask affinity. ++ * ++ * If this bit is set then the kbase_context.limited_core_mask will be applied ++ * to the affinity. ++ */ ++#define BASE_JD_REQ_LIMITED_CORE_MASK ((base_jd_core_req)1 << 20) ++ ++/* These requirement bits are currently unused in base_jd_core_req ++ */ ++#define BASEP_JD_REQ_RESERVED \ ++ (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ ++ BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | BASEP_JD_REQ_EVENT_NEVER | \ ++ BASE_JD_REQ_EVENT_COALESCE | \ ++ BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ ++ BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ ++ BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END | \ ++ BASE_JD_REQ_JOB_SLOT | BASE_JD_REQ_START_RENDERPASS | \ ++ BASE_JD_REQ_END_RENDERPASS | BASE_JD_REQ_LIMITED_CORE_MASK)) ++ ++/* Mask of all bits in base_jd_core_req that control the type of the atom. ++ * ++ * This allows dependency only atoms to have flags set ++ */ ++#define BASE_JD_REQ_ATOM_TYPE \ ++ (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ ++ BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) ++ ++/** ++ * BASE_JD_REQ_SOFT_JOB_TYPE - Mask of all bits in base_jd_core_req that ++ * controls the type of a soft job. ++ */ ++#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) ++ ++/* Returns non-zero value if core requirements passed define a soft job or ++ * a dependency only job. 
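/*
 * Small illustrative sketch (not from the kbase headers, and assuming the
 * definitions above are in scope) of how the requirement bits compose,
 * following the "usually used with" notes: a tiler atom that starts a
 * renderpass and a fragment atom that ends it. A real submission would
 * also pair the two through the same renderpass_id in the atom structure.
 */
static const base_jd_core_req example_tiler_start_rp_req =
    BASE_JD_REQ_T | BASE_JD_REQ_START_RENDERPASS;

static const base_jd_core_req example_fragment_end_rp_req =
    BASE_JD_REQ_FS | BASE_JD_REQ_END_RENDERPASS;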
++ */ ++#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ ++ (((core_req) & BASE_JD_REQ_SOFT_JOB) || \ ++ ((core_req) & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) ++ ++/** ++ * enum kbase_jd_atom_state - Atom states ++ * ++ * @KBASE_JD_ATOM_STATE_UNUSED: Atom is not used. ++ * @KBASE_JD_ATOM_STATE_QUEUED: Atom is queued in JD. ++ * @KBASE_JD_ATOM_STATE_IN_JS: Atom has been given to JS (is runnable/running). ++ * @KBASE_JD_ATOM_STATE_HW_COMPLETED: Atom has been completed, but not yet ++ * handed back to job dispatcher for ++ * dependency resolution. ++ * @KBASE_JD_ATOM_STATE_COMPLETED: Atom has been completed, but not yet handed ++ * back to userspace. ++ */ ++enum kbase_jd_atom_state { ++ KBASE_JD_ATOM_STATE_UNUSED, ++ KBASE_JD_ATOM_STATE_QUEUED, ++ KBASE_JD_ATOM_STATE_IN_JS, ++ KBASE_JD_ATOM_STATE_HW_COMPLETED, ++ KBASE_JD_ATOM_STATE_COMPLETED ++}; ++ ++/** ++ * typedef base_atom_id - Type big enough to store an atom number in. ++ */ ++typedef __u8 base_atom_id; ++ ++/** ++ * struct base_dependency - base dependency ++ * ++ * @atom_id: An atom number ++ * @dependency_type: Dependency type ++ */ ++struct base_dependency { ++ base_atom_id atom_id; ++ base_jd_dep_type dependency_type; ++}; ++ ++/** ++ * struct base_jd_fragment - Set of GPU fragment job chains used for rendering. ++ * ++ * @norm_read_norm_write: Job chain for full rendering. ++ * GPU address of a fragment job chain to render in the ++ * circumstance where the tiler job chain did not exceed ++ * its memory usage threshold and no fragment job chain ++ * was previously run for the same renderpass. ++ * It is used no more than once per renderpass. ++ * @norm_read_forced_write: Job chain for starting incremental ++ * rendering. ++ * GPU address of a fragment job chain to render in ++ * the circumstance where the tiler job chain exceeded ++ * its memory usage threshold for the first time and ++ * no fragment job chain was previously run for the ++ * same renderpass. ++ * Writes unresolved multisampled and normally- ++ * discarded output to temporary buffers that must be ++ * read back by a subsequent forced_read job chain ++ * before the renderpass is complete. ++ * It is used no more than once per renderpass. ++ * @forced_read_forced_write: Job chain for continuing incremental ++ * rendering. ++ * GPU address of a fragment job chain to render in ++ * the circumstance where the tiler job chain ++ * exceeded its memory usage threshold again ++ * and a fragment job chain was previously run for ++ * the same renderpass. ++ * Reads unresolved multisampled and ++ * normally-discarded output from temporary buffers ++ * written by a previous forced_write job chain and ++ * writes the same to temporary buffers again. ++ * It is used as many times as required until ++ * rendering completes. ++ * @forced_read_norm_write: Job chain for ending incremental rendering. ++ * GPU address of a fragment job chain to render in the ++ * circumstance where the tiler job chain did not ++ * exceed its memory usage threshold this time and a ++ * fragment job chain was previously run for the same ++ * renderpass. ++ * Reads unresolved multisampled and normally-discarded ++ * output from temporary buffers written by a previous ++ * forced_write job chain in order to complete a ++ * renderpass. ++ * It is used no more than once per renderpass. ++ * ++ * This structure is referenced by the main atom structure if ++ * BASE_JD_REQ_END_RENDERPASS is set in the base_jd_core_req. 
++ */ ++struct base_jd_fragment { ++ __u64 norm_read_norm_write; ++ __u64 norm_read_forced_write; ++ __u64 forced_read_forced_write; ++ __u64 forced_read_norm_write; ++}; ++ ++/** ++ * typedef base_jd_prio - Base Atom priority. ++ * ++ * Only certain priority levels are actually implemented, as specified by the ++ * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority ++ * level that is not one of those defined below. ++ * ++ * Priority levels only affect scheduling after the atoms have had dependencies ++ * resolved. For example, a low priority atom that has had its dependencies ++ * resolved might run before a higher priority atom that has not had its ++ * dependencies resolved. ++ * ++ * In general, fragment atoms do not affect non-fragment atoms with ++ * lower priorities, and vice versa. One exception is that there is only one ++ * priority value for each context. So a high-priority (e.g.) fragment atom ++ * could increase its context priority, causing its non-fragment atoms to also ++ * be scheduled sooner. ++ * ++ * The atoms are scheduled as follows with respect to their priorities: ++ * * Let atoms 'X' and 'Y' be for the same job slot who have dependencies ++ * resolved, and atom 'X' has a higher priority than atom 'Y' ++ * * If atom 'Y' is currently running on the HW, then it is interrupted to ++ * allow atom 'X' to run soon after ++ * * If instead neither atom 'Y' nor atom 'X' are running, then when choosing ++ * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' ++ * * Any two atoms that have the same priority could run in any order with ++ * respect to each other. That is, there is no ordering constraint between ++ * atoms of the same priority. ++ * ++ * The sysfs file 'js_ctx_scheduling_mode' is used to control how atoms are ++ * scheduled between contexts. The default value, 0, will cause higher-priority ++ * atoms to be scheduled first, regardless of their context. The value 1 will ++ * use a round-robin algorithm when deciding which context's atoms to schedule ++ * next, so higher-priority atoms can only preempt lower priority atoms within ++ * the same context. See KBASE_JS_SYSTEM_PRIORITY_MODE and ++ * KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE for more details. ++ */ ++typedef __u8 base_jd_prio; ++ ++/* Medium atom priority. This is a priority higher than BASE_JD_PRIO_LOW */ ++#define BASE_JD_PRIO_MEDIUM ((base_jd_prio)0) ++/* High atom priority. This is a priority higher than BASE_JD_PRIO_MEDIUM and ++ * BASE_JD_PRIO_LOW ++ */ ++#define BASE_JD_PRIO_HIGH ((base_jd_prio)1) ++/* Low atom priority. */ ++#define BASE_JD_PRIO_LOW ((base_jd_prio)2) ++/* Real-Time atom priority. This is a priority higher than BASE_JD_PRIO_HIGH, ++ * BASE_JD_PRIO_MEDIUM, and BASE_JD_PRIO_LOW ++ */ ++#define BASE_JD_PRIO_REALTIME ((base_jd_prio)3) ++ ++/* Invalid atom priority (max uint8_t value) */ ++#define BASE_JD_PRIO_INVALID ((base_jd_prio)255) ++ ++/* Count of the number of priority levels. This itself is not a valid ++ * base_jd_prio setting ++ */ ++#define BASE_JD_NR_PRIO_LEVELS 4 ++ ++/** ++ * struct base_jd_atom_v2 - Node of a dependency graph used to submit a ++ * GPU job chain or soft-job to the kernel driver. ++ * ++ * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS ++ * is set in the base_jd_core_req) the CPU address of a ++ * base_jd_fragment object. ++ * @udata: User data. ++ * @extres_list: List of external resources. ++ * @nr_extres: Number of external resources or JIT allocations. 
++ * @jit_id: Zero-terminated array of IDs of just-in-time memory ++ * allocations written to by the atom. When the atom ++ * completes, the value stored at the ++ * &struct_base_jit_alloc_info.heap_info_gpu_addr of ++ * each allocation is read in order to enforce an ++ * overall physical memory usage limit. ++ * @pre_dep: Pre-dependencies. One need to use SETTER function to assign ++ * this field; this is done in order to reduce possibility of ++ * improper assignment of a dependency field. ++ * @atom_number: Unique number to identify the atom. ++ * @prio: Atom priority. Refer to base_jd_prio for more details. ++ * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ++ * specified. ++ * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. ++ * @core_req: Core requirements. ++ * @renderpass_id: Renderpass identifier used to associate an atom that has ++ * BASE_JD_REQ_START_RENDERPASS set in its core requirements ++ * with an atom that has BASE_JD_REQ_END_RENDERPASS set. ++ * @padding: Unused. Must be zero. ++ * ++ * This structure has changed since UK 10.2 for which base_jd_core_req was a ++ * __u16 value. ++ * ++ * In UK 10.3 a core_req field of a __u32 type was added to the end of the ++ * structure, and the place in the structure previously occupied by __u16 ++ * core_req was kept but renamed to compat_core_req. ++ * ++ * From UK 11.20 - compat_core_req is now occupied by __u8 jit_id[2]. ++ * Compatibility with UK 10.x from UK 11.y is not handled because ++ * the major version increase prevents this. ++ * ++ * For UK 11.20 jit_id[2] must be initialized to zero. ++ */ ++struct base_jd_atom_v2 { ++ __u64 jc; ++ struct base_jd_udata udata; ++ __u64 extres_list; ++ __u16 nr_extres; ++ __u8 jit_id[2]; ++ struct base_dependency pre_dep[2]; ++ base_atom_id atom_number; ++ base_jd_prio prio; ++ __u8 device_nr; ++ __u8 jobslot; ++ base_jd_core_req core_req; ++ __u8 renderpass_id; ++ __u8 padding[7]; ++}; ++ ++/** ++ * struct base_jd_atom - Same as base_jd_atom_v2, but has an extra seq_nr ++ * at the beginning. ++ * ++ * @seq_nr: Sequence number of logical grouping of atoms. ++ * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS ++ * is set in the base_jd_core_req) the CPU address of a ++ * base_jd_fragment object. ++ * @udata: User data. ++ * @extres_list: List of external resources. ++ * @nr_extres: Number of external resources or JIT allocations. ++ * @jit_id: Zero-terminated array of IDs of just-in-time memory ++ * allocations written to by the atom. When the atom ++ * completes, the value stored at the ++ * &struct_base_jit_alloc_info.heap_info_gpu_addr of ++ * each allocation is read in order to enforce an ++ * overall physical memory usage limit. ++ * @pre_dep: Pre-dependencies. One need to use SETTER function to assign ++ * this field; this is done in order to reduce possibility of ++ * improper assignment of a dependency field. ++ * @atom_number: Unique number to identify the atom. ++ * @prio: Atom priority. Refer to base_jd_prio for more details. ++ * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ++ * specified. ++ * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. ++ * @core_req: Core requirements. ++ * @renderpass_id: Renderpass identifier used to associate an atom that has ++ * BASE_JD_REQ_START_RENDERPASS set in its core requirements ++ * with an atom that has BASE_JD_REQ_END_RENDERPASS set. ++ * @padding: Unused. Must be zero. 
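/*
 * Illustrative sketch only, assuming the definitions above are in scope:
 * filling in a base_jd_atom_v2 for a plain hardware job chain. The GPU
 * address and atom number are placeholders. Zero-initialising the whole
 * structure first covers jit_id[] and padding, which the comments above
 * require to be zero.
 */
#include <string.h>

static void example_fill_atom(struct base_jd_atom_v2 *atom,
                              __u64 jc_gpu_va, base_atom_id number)
{
    memset(atom, 0, sizeof(*atom));

    atom->jc = jc_gpu_va;            /* GPU address of the job chain */
    atom->atom_number = number;      /* unique among in-flight atoms */
    atom->core_req = BASE_JD_REQ_FS; /* e.g. a fragment job chain */
    atom->prio = BASE_JD_PRIO_MEDIUM;
}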
++ */ ++typedef struct base_jd_atom { ++ __u64 seq_nr; ++ __u64 jc; ++ struct base_jd_udata udata; ++ __u64 extres_list; ++ __u16 nr_extres; ++ __u8 jit_id[2]; ++ struct base_dependency pre_dep[2]; ++ base_atom_id atom_number; ++ base_jd_prio prio; ++ __u8 device_nr; ++ __u8 jobslot; ++ base_jd_core_req core_req; ++ __u8 renderpass_id; ++ __u8 padding[7]; ++} base_jd_atom; ++ ++/* Job chain event code bits ++ * Defines the bits used to create ::base_jd_event_code ++ */ ++enum { ++ BASE_JD_SW_EVENT_KERNEL = (1u << 15), /* Kernel side event */ ++ BASE_JD_SW_EVENT = (1u << 14), /* SW defined event */ ++ /* Event indicates success (SW events only) */ ++ BASE_JD_SW_EVENT_SUCCESS = (1u << 13), ++ BASE_JD_SW_EVENT_JOB = (0u << 11), /* Job related event */ ++ BASE_JD_SW_EVENT_BAG = (1u << 11), /* Bag related event */ ++ BASE_JD_SW_EVENT_INFO = (2u << 11), /* Misc/info event */ ++ BASE_JD_SW_EVENT_RESERVED = (3u << 11), /* Reserved event type */ ++ /* Mask to extract the type from an event code */ ++ BASE_JD_SW_EVENT_TYPE_MASK = (3u << 11) ++}; ++ ++/** ++ * enum base_jd_event_code - Job chain event codes ++ * ++ * @BASE_JD_EVENT_RANGE_HW_NONFAULT_START: Start of hardware non-fault status ++ * codes. ++ * Obscurely, BASE_JD_EVENT_TERMINATED ++ * indicates a real fault, because the ++ * job was hard-stopped. ++ * @BASE_JD_EVENT_NOT_STARTED: Can't be seen by userspace, treated as ++ * 'previous job done'. ++ * @BASE_JD_EVENT_STOPPED: Can't be seen by userspace, becomes ++ * TERMINATED, DONE or JOB_CANCELLED. ++ * @BASE_JD_EVENT_TERMINATED: This is actually a fault status code - the job ++ * was hard stopped. ++ * @BASE_JD_EVENT_ACTIVE: Can't be seen by userspace, jobs only returned on ++ * complete/fail/cancel. ++ * @BASE_JD_EVENT_RANGE_HW_NONFAULT_END: End of hardware non-fault status codes. ++ * Obscurely, BASE_JD_EVENT_TERMINATED ++ * indicates a real fault, ++ * because the job was hard-stopped. ++ * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START: Start of hardware fault and ++ * software error status codes. ++ * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END: End of hardware fault and ++ * software error status codes. ++ * @BASE_JD_EVENT_RANGE_SW_SUCCESS_START: Start of software success status ++ * codes. ++ * @BASE_JD_EVENT_RANGE_SW_SUCCESS_END: End of software success status codes. ++ * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_START: Start of kernel-only status codes. ++ * Such codes are never returned to ++ * user-space. ++ * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_END: End of kernel-only status codes. ++ * @BASE_JD_EVENT_DONE: atom has completed successfull ++ * @BASE_JD_EVENT_JOB_CONFIG_FAULT: Atom dependencies configuration error which ++ * shall result in a failed atom ++ * @BASE_JD_EVENT_JOB_POWER_FAULT: The job could not be executed because the ++ * part of the memory system required to access ++ * job descriptors was not powered on ++ * @BASE_JD_EVENT_JOB_READ_FAULT: Reading a job descriptor into the Job ++ * manager failed ++ * @BASE_JD_EVENT_JOB_WRITE_FAULT: Writing a job descriptor from the Job ++ * manager failed ++ * @BASE_JD_EVENT_JOB_AFFINITY_FAULT: The job could not be executed because the ++ * specified affinity mask does not intersect ++ * any available cores ++ * @BASE_JD_EVENT_JOB_BUS_FAULT: A bus access failed while executing a job ++ * @BASE_JD_EVENT_INSTR_INVALID_PC: A shader instruction with an illegal program ++ * counter was executed. ++ * @BASE_JD_EVENT_INSTR_INVALID_ENC: A shader instruction with an illegal ++ * encoding was executed. 
++ * @BASE_JD_EVENT_INSTR_TYPE_MISMATCH: A shader instruction was executed where ++ * the instruction encoding did not match the ++ * instruction type encoded in the program ++ * counter. ++ * @BASE_JD_EVENT_INSTR_OPERAND_FAULT: A shader instruction was executed that ++ * contained invalid combinations of operands. ++ * @BASE_JD_EVENT_INSTR_TLS_FAULT: A shader instruction was executed that tried ++ * to access the thread local storage section ++ * of another thread. ++ * @BASE_JD_EVENT_INSTR_ALIGN_FAULT: A shader instruction was executed that ++ * tried to do an unsupported unaligned memory ++ * access. ++ * @BASE_JD_EVENT_INSTR_BARRIER_FAULT: A shader instruction was executed that ++ * failed to complete an instruction barrier. ++ * @BASE_JD_EVENT_DATA_INVALID_FAULT: Any data structure read as part of the job ++ * contains invalid combinations of data. ++ * @BASE_JD_EVENT_TILE_RANGE_FAULT: Tile or fragment shading was asked to ++ * process a tile that is entirely outside the ++ * bounding box of the frame. ++ * @BASE_JD_EVENT_STATE_FAULT: Matches ADDR_RANGE_FAULT. A virtual address ++ * has been found that exceeds the virtual ++ * address range. ++ * @BASE_JD_EVENT_OUT_OF_MEMORY: The tiler ran out of memory when executing a job. ++ * @BASE_JD_EVENT_UNKNOWN: If multiple jobs in a job chain fail, only ++ * the first one the reports an error will set ++ * and return full error information. ++ * Subsequent failing jobs will not update the ++ * error status registers, and may write an ++ * error status of UNKNOWN. ++ * @BASE_JD_EVENT_DELAYED_BUS_FAULT: The GPU received a bus fault for access to ++ * physical memory where the original virtual ++ * address is no longer available. ++ * @BASE_JD_EVENT_SHAREABILITY_FAULT: Matches GPU_SHAREABILITY_FAULT. A cache ++ * has detected that the same line has been ++ * accessed as both shareable and non-shareable ++ * memory from inside the GPU. ++ * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1: A memory access hit an invalid table ++ * entry at level 1 of the translation table. ++ * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2: A memory access hit an invalid table ++ * entry at level 2 of the translation table. ++ * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3: A memory access hit an invalid table ++ * entry at level 3 of the translation table. ++ * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4: A memory access hit an invalid table ++ * entry at level 4 of the translation table. ++ * @BASE_JD_EVENT_PERMISSION_FAULT: A memory access could not be allowed due to ++ * the permission flags set in translation ++ * table ++ * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1: A bus fault occurred while reading ++ * level 0 of the translation tables. ++ * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2: A bus fault occurred while reading ++ * level 1 of the translation tables. ++ * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3: A bus fault occurred while reading ++ * level 2 of the translation tables. ++ * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4: A bus fault occurred while reading ++ * level 3 of the translation tables. ++ * @BASE_JD_EVENT_ACCESS_FLAG: Matches ACCESS_FLAG_0. A memory access hit a ++ * translation table entry with the ACCESS_FLAG ++ * bit set to zero in level 0 of the ++ * page table, and the DISABLE_AF_FAULT flag ++ * was not set. 
++ * @BASE_JD_EVENT_MEM_GROWTH_FAILED: raised for JIT_ALLOC atoms that failed to ++ * grow memory on demand ++ * @BASE_JD_EVENT_JOB_CANCELLED: raised when this atom was hard-stopped or its ++ * dependencies failed ++ * @BASE_JD_EVENT_JOB_INVALID: raised for many reasons, including invalid data ++ * in the atom which overlaps with ++ * BASE_JD_EVENT_JOB_CONFIG_FAULT, or if the ++ * platform doesn't support the feature specified in ++ * the atom. ++ * @BASE_JD_EVENT_DRV_TERMINATED: this is a special event generated to indicate ++ * to userspace that the KBase context has been ++ * destroyed and Base should stop listening for ++ * further events ++ * @BASE_JD_EVENT_REMOVED_FROM_NEXT: raised when an atom that was configured in ++ * the GPU has to be retried (but it has not ++ * started) due to e.g., GPU reset ++ * @BASE_JD_EVENT_END_RP_DONE: this is used for incremental rendering to signal ++ * the completion of a renderpass. This value ++ * shouldn't be returned to userspace but I haven't ++ * seen where it is reset back to JD_EVENT_DONE. ++ * ++ * HW and low-level SW events are represented by event codes. ++ * The status of jobs which succeeded are also represented by ++ * an event code (see @BASE_JD_EVENT_DONE). ++ * Events are usually reported as part of a &struct base_jd_event. ++ * ++ * The event codes are encoded in the following way: ++ * * 10:0 - subtype ++ * * 12:11 - type ++ * * 13 - SW success (only valid if the SW bit is set) ++ * * 14 - SW event (HW event if not set) ++ * * 15 - Kernel event (should never be seen in userspace) ++ * ++ * Events are split up into ranges as follows: ++ * * BASE_JD_EVENT_RANGE__START ++ * * BASE_JD_EVENT_RANGE__END ++ * ++ * code is in 's range when: ++ * BASE_JD_EVENT_RANGE__START <= code < ++ * BASE_JD_EVENT_RANGE__END ++ * ++ * Ranges can be asserted for adjacency by testing that the END of the previous ++ * is equal to the START of the next. This is useful for optimizing some tests ++ * for range. ++ * ++ * A limitation is that the last member of this enum must explicitly be handled ++ * (with an assert-unreachable statement) in switch statements that use ++ * variables of this type. Otherwise, the compiler warns that we have not ++ * handled that enum value. 
++ */ ++enum base_jd_event_code { ++ /* HW defined exceptions */ ++ BASE_JD_EVENT_RANGE_HW_NONFAULT_START = 0, ++ ++ /* non-fatal exceptions */ ++ BASE_JD_EVENT_NOT_STARTED = 0x00, ++ BASE_JD_EVENT_DONE = 0x01, ++ BASE_JD_EVENT_STOPPED = 0x03, ++ BASE_JD_EVENT_TERMINATED = 0x04, ++ BASE_JD_EVENT_ACTIVE = 0x08, ++ ++ BASE_JD_EVENT_RANGE_HW_NONFAULT_END = 0x40, ++ BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START = 0x40, ++ ++ /* job exceptions */ ++ BASE_JD_EVENT_JOB_CONFIG_FAULT = 0x40, ++ BASE_JD_EVENT_JOB_POWER_FAULT = 0x41, ++ BASE_JD_EVENT_JOB_READ_FAULT = 0x42, ++ BASE_JD_EVENT_JOB_WRITE_FAULT = 0x43, ++ BASE_JD_EVENT_JOB_AFFINITY_FAULT = 0x44, ++ BASE_JD_EVENT_JOB_BUS_FAULT = 0x48, ++ BASE_JD_EVENT_INSTR_INVALID_PC = 0x50, ++ BASE_JD_EVENT_INSTR_INVALID_ENC = 0x51, ++ BASE_JD_EVENT_INSTR_TYPE_MISMATCH = 0x52, ++ BASE_JD_EVENT_INSTR_OPERAND_FAULT = 0x53, ++ BASE_JD_EVENT_INSTR_TLS_FAULT = 0x54, ++ BASE_JD_EVENT_INSTR_BARRIER_FAULT = 0x55, ++ BASE_JD_EVENT_INSTR_ALIGN_FAULT = 0x56, ++ BASE_JD_EVENT_DATA_INVALID_FAULT = 0x58, ++ BASE_JD_EVENT_TILE_RANGE_FAULT = 0x59, ++ BASE_JD_EVENT_STATE_FAULT = 0x5A, ++ BASE_JD_EVENT_OUT_OF_MEMORY = 0x60, ++ BASE_JD_EVENT_UNKNOWN = 0x7F, ++ ++ /* GPU exceptions */ ++ BASE_JD_EVENT_DELAYED_BUS_FAULT = 0x80, ++ BASE_JD_EVENT_SHAREABILITY_FAULT = 0x88, ++ ++ /* MMU exceptions */ ++ BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1 = 0xC1, ++ BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2 = 0xC2, ++ BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3 = 0xC3, ++ BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4 = 0xC4, ++ BASE_JD_EVENT_PERMISSION_FAULT = 0xC8, ++ BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1 = 0xD1, ++ BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2 = 0xD2, ++ BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3 = 0xD3, ++ BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4 = 0xD4, ++ BASE_JD_EVENT_ACCESS_FLAG = 0xD8, ++ ++ /* SW defined exceptions */ ++ BASE_JD_EVENT_MEM_GROWTH_FAILED = ++ BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x000, ++ BASE_JD_EVENT_JOB_CANCELLED = ++ BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x002, ++ BASE_JD_EVENT_JOB_INVALID = ++ BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x003, ++ ++ BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_RESERVED | 0x3FF, ++ ++ BASE_JD_EVENT_RANGE_SW_SUCCESS_START = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_SUCCESS | 0x000, ++ ++ BASE_JD_EVENT_DRV_TERMINATED = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_INFO | 0x000, ++ ++ BASE_JD_EVENT_RANGE_SW_SUCCESS_END = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_RESERVED | 0x3FF, ++ ++ BASE_JD_EVENT_RANGE_KERNEL_ONLY_START = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_KERNEL | 0x000, ++ BASE_JD_EVENT_REMOVED_FROM_NEXT = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x000, ++ BASE_JD_EVENT_END_RP_DONE = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x001, ++ ++ BASE_JD_EVENT_RANGE_KERNEL_ONLY_END = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_RESERVED | 0x3FF ++}; ++ ++/** ++ * struct base_jd_event_v2 - Event reporting structure ++ * ++ * @event_code: event code of type @ref base_jd_event_code. ++ * @atom_number: the atom number that has completed. ++ * @padding: padding. ++ * @udata: user data. ++ * ++ * This structure is used by the kernel driver to report information ++ * about GPU events. They can either be HW-specific events or low-level ++ * SW events, such as job-chain completion. 
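/*
 * Illustrative sketch only, assuming the definitions above are in scope:
 * decoding a base_jd_event_code along the bit layout described above
 * (bit 14 marks SW events, bit 13 marks SW success, and
 * BASE_JD_SW_EVENT_TYPE_MASK extracts the event type).
 */
static inline int example_event_is_sw(__u32 event_code)
{
    return (event_code & BASE_JD_SW_EVENT) != 0;
}

static inline int example_event_succeeded(__u32 event_code)
{
    /* BASE_JD_EVENT_DONE for HW events, the SW success bit otherwise. */
    if (example_event_is_sw(event_code))
        return (event_code & BASE_JD_SW_EVENT_SUCCESS) != 0;

    return event_code == BASE_JD_EVENT_DONE;
}

static inline __u32 example_event_type(__u32 event_code)
{
    return event_code & BASE_JD_SW_EVENT_TYPE_MASK;
}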
++ * ++ * The event code contains an event type field which can be extracted ++ * by ANDing with BASE_JD_SW_EVENT_TYPE_MASK. ++ */ ++struct base_jd_event_v2 { ++ __u32 event_code; ++ base_atom_id atom_number; ++ __u8 padding[3]; ++ struct base_jd_udata udata; ++}; ++ ++/** ++ * struct base_dump_cpu_gpu_counters - Structure for ++ * BASE_JD_REQ_SOFT_DUMP_CPU_GPU_COUNTERS ++ * jobs. ++ * @system_time: gpu timestamp ++ * @cycle_counter: gpu cycle count ++ * @sec: cpu time(sec) ++ * @usec: cpu time(usec) ++ * @padding: padding ++ * ++ * This structure is stored into the memory pointed to by the @jc field ++ * of &struct base_jd_atom. ++ * ++ * It must not occupy the same CPU cache line(s) as any neighboring data. ++ * This is to avoid cases where access to pages containing the structure ++ * is shared between cached and un-cached memory regions, which would ++ * cause memory corruption. ++ */ ++ ++struct base_dump_cpu_gpu_counters { ++ __u64 system_time; ++ __u64 cycle_counter; ++ __u64 sec; ++ __u32 usec; ++ __u8 padding[36]; ++}; ++ ++/** ++ * struct mali_base_gpu_core_props - GPU core props info ++ * ++ * @product_id: Pro specific value. ++ * @version_status: Status of the GPU release. No defined values, but starts at ++ * 0 and increases by one for each release status (alpha, beta, EAC, etc.). ++ * 4 bit values (0-15). ++ * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" ++ * release number. ++ * 8 bit values (0-255). ++ * @major_revision: Major release number of the GPU. "R" part of an "RnPn" ++ * release number. ++ * 4 bit values (0-15). ++ * @padding: padding to align to 8-byte ++ * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by ++ * clGetDeviceInfo() ++ * @log2_program_counter_size: Size of the shader program counter, in bits. ++ * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This ++ * is a bitpattern where a set bit indicates that the format is supported. ++ * Before using a texture format, it is recommended that the corresponding ++ * bit be checked. ++ * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. ++ * It is unlikely that a client will be able to allocate all of this memory ++ * for their own purposes, but this at least provides an upper bound on the ++ * memory available to the GPU. ++ * This is required for OpenCL's clGetDeviceInfo() call when ++ * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The ++ * client will not be expecting to allocate anywhere near this value. ++ * @num_exec_engines: The number of execution engines. Only valid for tGOX ++ * (Bifrost) GPUs, where GPU_HAS_REG_CORE_FEATURES is defined. Otherwise, ++ * this is always 0. ++ */ ++struct mali_base_gpu_core_props { ++ __u32 product_id; ++ __u16 version_status; ++ __u16 minor_revision; ++ __u16 major_revision; ++ __u16 padding; ++ __u32 gpu_freq_khz_max; ++ __u32 log2_program_counter_size; ++ __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ __u64 gpu_available_memory_size; ++ __u8 num_exec_engines; ++}; ++ ++#endif /* _UAPI_BASE_JM_KERNEL_H_ */ +diff --git a/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h +new file mode 100644 +index 00000000000..20d931adc9b +--- /dev/null ++++ b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h +@@ -0,0 +1,231 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. 
++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_KBASE_JM_IOCTL_H_ ++#define _UAPI_KBASE_JM_IOCTL_H_ ++ ++#include ++#include ++ ++/* ++ * 11.1: ++ * - Add BASE_MEM_TILER_ALIGN_TOP under base_mem_alloc_flags ++ * 11.2: ++ * - KBASE_MEM_QUERY_FLAGS can return KBASE_REG_PF_GROW and KBASE_REG_PROTECTED, ++ * which some user-side clients prior to 11.2 might fault if they received ++ * them ++ * 11.3: ++ * - New ioctls KBASE_IOCTL_STICKY_RESOURCE_MAP and ++ * KBASE_IOCTL_STICKY_RESOURCE_UNMAP ++ * 11.4: ++ * - New ioctl KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET ++ * 11.5: ++ * - New ioctl: KBASE_IOCTL_MEM_JIT_INIT (old ioctl renamed to _OLD) ++ * 11.6: ++ * - Added flags field to base_jit_alloc_info structure, which can be used to ++ * specify pseudo chunked tiler alignment for JIT allocations. ++ * 11.7: ++ * - Removed UMP support ++ * 11.8: ++ * - Added BASE_MEM_UNCACHED_GPU under base_mem_alloc_flags ++ * 11.9: ++ * - Added BASE_MEM_PERMANENT_KERNEL_MAPPING and BASE_MEM_FLAGS_KERNEL_ONLY ++ * under base_mem_alloc_flags ++ * 11.10: ++ * - Enabled the use of nr_extres field of base_jd_atom_v2 structure for ++ * JIT_ALLOC and JIT_FREE type softjobs to enable multiple JIT allocations ++ * with one softjob. ++ * 11.11: ++ * - Added BASE_MEM_GPU_VA_SAME_4GB_PAGE under base_mem_alloc_flags ++ * 11.12: ++ * - Removed ioctl: KBASE_IOCTL_GET_PROFILING_CONTROLS ++ * 11.13: ++ * - New ioctl: KBASE_IOCTL_MEM_EXEC_INIT ++ * 11.14: ++ * - Add BASE_MEM_GROUP_ID_MASK, base_mem_group_id_get, base_mem_group_id_set ++ * under base_mem_alloc_flags ++ * 11.15: ++ * - Added BASEP_CONTEXT_MMU_GROUP_ID_MASK under base_context_create_flags. ++ * - Require KBASE_IOCTL_SET_FLAGS before BASE_MEM_MAP_TRACKING_HANDLE can be ++ * passed to mmap(). ++ * 11.16: ++ * - Extended ioctl KBASE_IOCTL_MEM_SYNC to accept imported dma-buf. ++ * - Modified (backwards compatible) ioctl KBASE_IOCTL_MEM_IMPORT behavior for ++ * dma-buf. Now, buffers are mapped on GPU when first imported, no longer ++ * requiring external resource or sticky resource tracking. UNLESS, ++ * CONFIG_MALI_DMA_BUF_MAP_ON_DEMAND is enabled. ++ * 11.17: ++ * - Added BASE_JD_REQ_JOB_SLOT. ++ * - Reused padding field in base_jd_atom_v2 to pass job slot number. ++ * - New ioctl: KBASE_IOCTL_GET_CPU_GPU_TIMEINFO ++ * 11.18: ++ * - Added BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP under base_mem_alloc_flags ++ * 11.19: ++ * - Extended base_jd_atom_v2 to allow a renderpass ID to be specified. ++ * 11.20: ++ * - Added new phys_pages member to kbase_ioctl_mem_jit_init for ++ * KBASE_IOCTL_MEM_JIT_INIT, previous variants of this renamed to use _10_2 ++ * (replacing '_OLD') and _11_5 suffixes ++ * - Replaced compat_core_req (deprecated in 10.3) with jit_id[2] in ++ * base_jd_atom_v2. It must currently be initialized to zero. 
++ * - Added heap_info_gpu_addr to base_jit_alloc_info, and ++ * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE allowable in base_jit_alloc_info's ++ * flags member. Previous variants of this structure are kept and given _10_2 ++ * and _11_5 suffixes. ++ * - The above changes are checked for safe values in usual builds ++ * 11.21: ++ * - v2.0 of mali_trace debugfs file, which now versions the file separately ++ * 11.22: ++ * - Added base_jd_atom (v3), which is seq_nr + base_jd_atom_v2. ++ * KBASE_IOCTL_JOB_SUBMIT supports both in parallel. ++ * 11.23: ++ * - Modified KBASE_IOCTL_MEM_COMMIT behavior to reject requests to modify ++ * the physical memory backing of JIT allocations. This was not supposed ++ * to be a valid use case, but it was allowed by the previous implementation. ++ * 11.24: ++ * - Added a sysfs file 'serialize_jobs' inside a new sub-directory ++ * 'scheduling'. ++ * 11.25: ++ * - Enabled JIT pressure limit in base/kbase by default ++ * 11.26 ++ * - Added kinstr_jm API ++ * 11.27 ++ * - Backwards compatible extension to HWC ioctl. ++ * 11.28: ++ * - Added kernel side cache ops needed hint ++ * 11.29: ++ * - Reserve ioctl 52 ++ * 11.30: ++ * - Add a new priority level BASE_JD_PRIO_REALTIME ++ * - Add ioctl 54: This controls the priority setting. ++ * 11.31: ++ * - Added BASE_JD_REQ_LIMITED_CORE_MASK. ++ * - Added ioctl 55: set_limited_core_count. ++ * 11.32: ++ * - Added new HW performance counters interface to all GPUs. ++ * 11.33: ++ * - Removed Kernel legacy HWC interface ++ * 11.34: ++ * - First release of new HW performance counters interface. ++ * 11.35: ++ * - Dummy model (no mali) backend will now clear HWC values after each sample ++ */ ++#define BASE_UK_VERSION_MAJOR 11 ++#define BASE_UK_VERSION_MINOR 35 ++ ++/** ++ * struct kbase_ioctl_version_check - Check version compatibility between ++ * kernel and userspace ++ * ++ * @major: Major version number ++ * @minor: Minor version number ++ */ ++struct kbase_ioctl_version_check { ++ __u16 major; ++ __u16 minor; ++}; ++ ++#define KBASE_IOCTL_VERSION_CHECK \ ++ _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) ++ ++ ++/** ++ * struct kbase_ioctl_job_submit - Submit jobs/atoms to the kernel ++ * ++ * @addr: Memory address of an array of struct base_jd_atom_v2 or v3 ++ * @nr_atoms: Number of entries in the array ++ * @stride: sizeof(struct base_jd_atom_v2) or sizeof(struct base_jd_atom) ++ */ ++struct kbase_ioctl_job_submit { ++ __u64 addr; ++ __u32 nr_atoms; ++ __u32 stride; ++}; ++ ++#define KBASE_IOCTL_JOB_SUBMIT \ ++ _IOW(KBASE_IOCTL_TYPE, 2, struct kbase_ioctl_job_submit) ++ ++#define KBASE_IOCTL_POST_TERM \ ++ _IO(KBASE_IOCTL_TYPE, 4) ++ ++/** ++ * struct kbase_ioctl_soft_event_update - Update the status of a soft-event ++ * @event: GPU address of the event which has been updated ++ * @new_status: The new status to set ++ * @flags: Flags for future expansion ++ */ ++struct kbase_ioctl_soft_event_update { ++ __u64 event; ++ __u32 new_status; ++ __u32 flags; ++}; ++ ++#define KBASE_IOCTL_SOFT_EVENT_UPDATE \ ++ _IOW(KBASE_IOCTL_TYPE, 28, struct kbase_ioctl_soft_event_update) ++ ++/** ++ * struct kbase_kinstr_jm_fd_out - Explains the compatibility information for ++ * the `struct kbase_kinstr_jm_atom_state_change` structure returned from the ++ * kernel ++ * ++ * @size: The size of the `struct kbase_kinstr_jm_atom_state_change` ++ * @version: Represents a breaking change in the ++ * `struct kbase_kinstr_jm_atom_state_change` ++ * @padding: Explicit padding to get the structure up to 64bits. 
See ++ * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst ++ * ++ * The `struct kbase_kinstr_jm_atom_state_change` may have extra members at the ++ * end of the structure that older user space might not understand. If the ++ * `version` is the same, the structure is still compatible with newer kernels. ++ * The `size` can be used to cast the opaque memory returned from the kernel. ++ */ ++struct kbase_kinstr_jm_fd_out { ++ __u16 size; ++ __u8 version; ++ __u8 padding[5]; ++}; ++ ++/** ++ * struct kbase_kinstr_jm_fd_in - Options when creating the file descriptor ++ * ++ * @count: Number of atom states that can be stored in the kernel circular ++ * buffer. Must be a power of two ++ * @padding: Explicit padding to get the structure up to 64bits. See ++ * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst ++ */ ++struct kbase_kinstr_jm_fd_in { ++ __u16 count; ++ __u8 padding[6]; ++}; ++ ++union kbase_kinstr_jm_fd { ++ struct kbase_kinstr_jm_fd_in in; ++ struct kbase_kinstr_jm_fd_out out; ++}; ++ ++#define KBASE_IOCTL_KINSTR_JM_FD \ ++ _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_kinstr_jm_fd) ++ ++ ++#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ ++ _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) ++ ++#endif /* _UAPI_KBASE_JM_IOCTL_H_ */ +diff --git a/src/panfrost/base/include/mali_base_common_kernel.h b/src/panfrost/base/include/mali_base_common_kernel.h +new file mode 100644 +index 00000000000..f8378146ace +--- /dev/null ++++ b/src/panfrost/base/include/mali_base_common_kernel.h +@@ -0,0 +1,231 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_BASE_COMMON_KERNEL_H_ ++#define _UAPI_BASE_COMMON_KERNEL_H_ ++ ++#include ++ ++struct base_mem_handle { ++ struct { ++ __u64 handle; ++ } basep; ++}; ++ ++#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 ++ ++/* Memory allocation, access/hint flags & mask. ++ * ++ * See base_mem_alloc_flags. ++ */ ++ ++/* IN */ ++/* Read access CPU side ++ */ ++#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) ++ ++/* Write access CPU side ++ */ ++#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) ++ ++/* Read access GPU side ++ */ ++#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) ++ ++/* Write access GPU side ++ */ ++#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) ++ ++/* Execute allowed on the GPU side ++ */ ++#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) ++ ++/* Will be permanently mapped in kernel space. ++ * Flag is only allowed on allocations originating from kbase. ++ */ ++#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) ++ ++/* The allocation will completely reside within the same 4GB chunk in the GPU ++ * virtual space. 
++ * Since this flag is primarily required only for the TLS memory which will ++ * not be used to contain executable code and also not used for Tiler heap, ++ * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. ++ */ ++#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) ++ ++/* Userspace is not allowed to free this memory. ++ * Flag is only allowed on allocations originating from kbase. ++ */ ++#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) ++ ++/* Grow backing store on GPU Page Fault ++ */ ++#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) ++ ++/* Page coherence Outer shareable, if available ++ */ ++#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) ++ ++/* Page coherence Inner shareable ++ */ ++#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) ++ ++/* IN/OUT */ ++/* Should be cached on the CPU, returned if actually cached ++ */ ++#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) ++ ++/* IN/OUT */ ++/* Must have same VA on both the GPU and the CPU ++ */ ++#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) ++ ++/* OUT */ ++/* Must call mmap to acquire a GPU address for the allocation ++ */ ++#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) ++ ++/* IN */ ++/* Page coherence Outer shareable, required. ++ */ ++#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) ++ ++/* Protected memory ++ */ ++#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) ++ ++/* Not needed physical memory ++ */ ++#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) ++ ++/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the ++ * addresses to be the same ++ */ ++#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) ++ ++/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu ++ * mode. Some components within the GPU might only be able to access memory ++ * that is GPU cacheable. Refer to the specific GPU implementation for more ++ * details. The 3 shareability flags will be ignored for GPU uncached memory. ++ * If used while importing USER_BUFFER type memory, then the import will fail ++ * if the memory is not aligned to GPU and CPU cache line width. ++ */ ++#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) ++ ++/* ++ * Bits [22:25] for group_id (0~15). ++ * ++ * base_mem_group_id_set() should be used to pack a memory group ID into a ++ * base_mem_alloc_flags value instead of accessing the bits directly. ++ * base_mem_group_id_get() should be used to extract the memory group ID from ++ * a base_mem_alloc_flags value. ++ */ ++#define BASEP_MEM_GROUP_ID_SHIFT 22 ++#define BASE_MEM_GROUP_ID_MASK ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) ++ ++/* Must do CPU cache maintenance when imported memory is mapped/unmapped ++ * on GPU. Currently applicable to dma-buf type only. ++ */ ++#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) ++ ++/* OUT */ ++/* Kernel side cache sync ops required */ ++#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) ++ ++/* Number of bits used as flags for base memory management ++ * ++ * Must be kept in sync with the base_mem_alloc_flags flags ++ */ ++#define BASE_MEM_FLAGS_NR_BITS 30 ++ ++/* A mask for all output bits, excluding IN/OUT bits. ++ */ ++#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP ++ ++/* A mask for all input bits, including IN/OUT bits. 
++ */ ++#define BASE_MEM_FLAGS_INPUT_MASK \ ++ (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) ++ ++/* Special base mem handles. ++ */ ++#define BASEP_MEM_INVALID_HANDLE (0ul) ++#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) ++#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) ++/* reserved handles ..-47< for future special handles */ ++#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_FIRST_FREE_ADDRESS ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) ++ ++/* Flags to pass to ::base_context_init. ++ * Flags can be ORed together to enable multiple things. ++ * ++ * These share the same space as BASEP_CONTEXT_FLAG_*, and so must ++ * not collide with them. ++ */ ++typedef __u32 base_context_create_flags; ++ ++/* Flags for base context */ ++ ++/* No flags set */ ++#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) ++ ++/* Base context is embedded in a cctx object (flag used for CINSTR ++ * software counter macros) ++ */ ++#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) ++ ++/* Base context is a 'System Monitor' context for Hardware counters. ++ * ++ * One important side effect of this is that job submission is disabled. ++ */ ++#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED ((base_context_create_flags)1 << 1) ++ ++/* Bit-shift used to encode a memory group ID in base_context_create_flags ++ */ ++#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) ++ ++/* Bitmask used to encode a memory group ID in base_context_create_flags ++ */ ++#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ ++ ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) ++ ++/* Bitpattern describing the base_context_create_flags that can be ++ * passed to the kernel ++ */ ++#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ ++ (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | BASEP_CONTEXT_MMU_GROUP_ID_MASK) ++ ++/* Flags for base tracepoint ++ */ ++ ++/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, ++ * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) ++ */ ++#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) ++ ++/* Indicate that job dumping is enabled. This could affect certain timers ++ * to account for the performance impact. ++ */ ++#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) ++ ++#endif /* _UAPI_BASE_COMMON_KERNEL_H_ */ +diff --git a/src/panfrost/base/include/mali_base_kernel.h b/src/panfrost/base/include/mali_base_kernel.h +new file mode 100644 +index 00000000000..3d826c720b2 +--- /dev/null ++++ b/src/panfrost/base/include/mali_base_kernel.h +@@ -0,0 +1,700 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++/* ++ * Base structures shared with the kernel. ++ */ ++ ++#ifndef _UAPI_BASE_KERNEL_H_ ++#define _UAPI_BASE_KERNEL_H_ ++ ++#include ++#include "mali_base_common_kernel.h" ++ ++#define BASE_MAX_COHERENT_GROUPS 16 ++ ++#if defined(PAGE_MASK) && defined(PAGE_SHIFT) ++#define LOCAL_PAGE_SHIFT PAGE_SHIFT ++#define LOCAL_PAGE_LSB ~PAGE_MASK ++#else ++#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 ++#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 ++#endif ++ ++#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) ++#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 ++#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) ++#else ++#error Failed to find page size ++#endif ++#endif ++ ++/* Physical memory group ID for normal usage. ++ */ ++#define BASE_MEM_GROUP_DEFAULT (0) ++ ++/* Number of physical memory groups. ++ */ ++#define BASE_MEM_GROUP_COUNT (16) ++ ++/** ++ * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. ++ * ++ * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator ++ * in order to determine the best cache policy. Some combinations are ++ * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), ++ * which defines a write-only region on the CPU side, which is ++ * heavily read by the CPU... ++ * Other flags are only meaningful to a particular allocator. ++ * More flags can be added to this list, as long as they don't clash ++ * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). ++ */ ++typedef __u32 base_mem_alloc_flags; ++ ++/* A mask for all the flags which are modifiable via the base_mem_set_flags ++ * interface. ++ */ ++#define BASE_MEM_FLAGS_MODIFIABLE \ ++ (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ ++ BASE_MEM_COHERENT_LOCAL) ++ ++/* A mask of all the flags that can be returned via the base_mem_get_flags() ++ * interface. ++ */ ++#define BASE_MEM_FLAGS_QUERYABLE \ ++ (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ ++ BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ ++ BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ ++ BASEP_MEM_FLAGS_KERNEL_ONLY)) ++ ++/** ++ * enum base_mem_import_type - Memory types supported by @a base_mem_import ++ * ++ * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type ++ * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) ++ * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a ++ * base_mem_import_user_buffer ++ * ++ * Each type defines what the supported handle type is. ++ * ++ * If any new type is added here ARM must be contacted ++ * to allocate a numeric value for it. ++ * Do not just add a new type without synchronizing with ARM ++ * as future releases from ARM might include other new types ++ * which could clash with your custom types. ++ */ ++enum base_mem_import_type { ++ BASE_MEM_IMPORT_TYPE_INVALID = 0, ++ /* ++ * Import type with value 1 is deprecated. ++ */ ++ BASE_MEM_IMPORT_TYPE_UMM = 2, ++ BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 ++}; ++ ++/** ++ * struct base_mem_import_user_buffer - Handle of an imported user buffer ++ * ++ * @ptr: address of imported user buffer ++ * @length: length of imported user buffer in bytes ++ * ++ * This structure is used to represent a handle of an imported user buffer. 
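As a minimal sketch of how the base_mem_alloc_flags bits defined in mali_base_common_kernel.h combine in practice, the hypothetical helper below builds flags for an ordinary allocation that both the CPU and the GPU read and write at the same VA. The particular combination and the group ID handling are examples only (as noted above, not every combination is valid), and the BASE_MEM_GROUP_ID_SET() macro defined further down in this header performs the same packing with range clamping:

static inline base_mem_alloc_flags
example_alloc_flags(unsigned int group_id)
{
	base_mem_alloc_flags flags =
		BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR |
		BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |
		BASE_MEM_SAME_VA | BASE_MEM_CACHED_CPU;

	/* Pack the physical memory group ID into bits [25:22]. */
	flags |= ((base_mem_alloc_flags)group_id << BASEP_MEM_GROUP_ID_SHIFT) &
		 BASE_MEM_GROUP_ID_MASK;

	return flags;
}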
++ */ ++ ++struct base_mem_import_user_buffer { ++ __u64 ptr; ++ __u64 length; ++}; ++ ++/* Mask to detect 4GB boundary alignment */ ++#define BASE_MEM_MASK_4GB 0xfffff000UL ++/* Mask to detect 4GB boundary (in page units) alignment */ ++#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) ++ ++/* Limit on the 'extension' parameter for an allocation with the ++ * BASE_MEM_TILER_ALIGN_TOP flag set ++ * ++ * This is the same as the maximum limit for a Buffer Descriptor's chunk size ++ */ ++#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ ++ (21u - (LOCAL_PAGE_SHIFT)) ++#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ ++ (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) ++ ++/* Bit mask of cookies used for memory allocation setup */ ++#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ ++ ++/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ ++#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ ++ ++/* ++ * struct base_fence - Cross-device synchronisation fence. ++ * ++ * A fence is used to signal when the GPU has finished accessing a resource that ++ * may be shared with other devices, and also to delay work done asynchronously ++ * by the GPU until other devices have finished accessing a shared resource. ++ */ ++struct base_fence { ++ struct { ++ int fd; ++ int stream_fd; ++ } basep; ++}; ++ ++/** ++ * struct base_mem_aliasing_info - Memory aliasing info ++ * ++ * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE ++ * @offset: Offset within the handle to start aliasing from, in pages. ++ * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. ++ * @length: Length to alias, in pages. For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE ++ * specifies the number of times the special page is needed. ++ * ++ * Describes a memory handle to be aliased. ++ * A subset of the handle can be chosen for aliasing, given an offset and a ++ * length. ++ * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a ++ * region where a special page is mapped with a write-alloc cache setup, ++ * typically used when the write result of the GPU isn't needed, but the GPU ++ * must write anyway. ++ * ++ * Offset and length are specified in pages. ++ * Offset must be within the size of the handle. ++ * Offset+length must not overrun the size of the handle. ++ */ ++struct base_mem_aliasing_info { ++ struct base_mem_handle handle; ++ __u64 offset; ++ __u64 length; ++}; ++ ++/* Maximum percentage of just-in-time memory allocation trimming to perform ++ * on free. ++ */ ++#define BASE_JIT_MAX_TRIM_LEVEL (100) ++ ++/* Maximum number of concurrent just-in-time memory allocations. 
++ */ ++#define BASE_JIT_ALLOC_COUNT (255) ++ ++/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 ++ * ++ * jit_version is 1 ++ * ++ * Due to the lack of padding specified, user clients between 32 and 64-bit ++ * may have assumed a different size of the struct ++ * ++ * An array of structures was not supported ++ */ ++struct base_jit_alloc_info_10_2 { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++}; ++ ++/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up ++ * to 11.19 ++ * ++ * This structure had a number of modifications during and after kernel driver ++ * version 11.5, but remains size-compatible throughout its version history, and ++ * with earlier variants compatible with future variants by requiring ++ * zero-initialization to the unused space in the structure. ++ * ++ * jit_version is 2 ++ * ++ * Kernel driver version history: ++ * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes ++ * must be zero. Kbase minor version was not incremented, so some ++ * versions of 11.5 do not have this change. ++ * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase ++ * minor version not incremented) ++ * 11.6: Added 'flags', replacing 1 padding byte ++ * 11.10: Arrays of this structure are supported ++ */ ++struct base_jit_alloc_info_11_5 { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++ __u8 bin_id; ++ __u8 max_allocations; ++ __u8 flags; ++ __u8 padding[2]; ++ __u16 usage_id; ++}; ++ ++/** ++ * struct base_jit_alloc_info - Structure which describes a JIT allocation ++ * request. ++ * @gpu_alloc_addr: The GPU virtual address to write the JIT ++ * allocated GPU virtual address to. ++ * @va_pages: The minimum number of virtual pages required. ++ * @commit_pages: The minimum number of physical pages which ++ * should back the allocation. ++ * @extension: Granularity of physical pages to grow the ++ * allocation by during a fault. ++ * @id: Unique ID provided by the caller, this is used ++ * to pair allocation and free requests. ++ * Zero is not a valid value. ++ * @bin_id: The JIT allocation bin, used in conjunction with ++ * @max_allocations to limit the number of each ++ * type of JIT allocation. ++ * @max_allocations: The maximum number of allocations allowed within ++ * the bin specified by @bin_id. Should be the same ++ * for all allocations within the same bin. ++ * @flags: flags specifying the special requirements for ++ * the JIT allocation, see ++ * %BASE_JIT_ALLOC_VALID_FLAGS ++ * @padding: Expansion space - should be initialised to zero ++ * @usage_id: A hint about which allocation should be reused. ++ * The kernel should attempt to use a previous ++ * allocation with the same usage_id ++ * @heap_info_gpu_addr: Pointer to an object in GPU memory describing ++ * the actual usage of the region. ++ * ++ * jit_version is 3. ++ * ++ * When modifications are made to this structure, it is still compatible with ++ * jit_version 3 when: a) the size is unchanged, and b) new members only ++ * replace the padding bytes. 
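To make the zero-initialization requirement concrete, here is a hypothetical sketch of filling the 11.5 variant from user space; the page counts and the ID are illustrative values only:

#include <string.h>

static void example_fill_jit_request(struct base_jit_alloc_info_11_5 *info,
				     __u64 result_gpu_va)
{
	/* Clear the whole structure first: unused and padding bytes must be
	 * zero so the layout stays forward compatible with newer kernels. */
	memset(info, 0, sizeof(*info));

	info->gpu_alloc_addr = result_gpu_va; /* kernel writes the JIT GPU VA here */
	info->va_pages = 64;                  /* minimum GPU VA pages to reserve */
	info->commit_pages = 16;              /* minimum physically backed pages */
	info->extension = 16;                 /* grow granularity on GPU fault */
	info->id = 1;                         /* pairs alloc/free requests; zero is not valid */
}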
++ * ++ * Previous jit_version history: ++ * jit_version == 1, refer to &base_jit_alloc_info_10_2 ++ * jit_version == 2, refer to &base_jit_alloc_info_11_5 ++ * ++ * Kbase version history: ++ * 11.20: added @heap_info_gpu_addr ++ */ ++struct base_jit_alloc_info { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++ __u8 bin_id; ++ __u8 max_allocations; ++ __u8 flags; ++ __u8 padding[2]; ++ __u16 usage_id; ++ __u64 heap_info_gpu_addr; ++}; ++ ++enum base_external_resource_access { ++ BASE_EXT_RES_ACCESS_SHARED, ++ BASE_EXT_RES_ACCESS_EXCLUSIVE ++}; ++ ++struct base_external_resource { ++ __u64 ext_resource; ++}; ++ ++/** ++ * BASE_EXT_RES_COUNT_MAX - The maximum number of external resources ++ * which can be mapped/unmapped in a single request. ++ */ ++#define BASE_EXT_RES_COUNT_MAX 10 ++ ++/** ++ * struct base_external_resource_list - Structure which describes a list of ++ * external resources. ++ * @count: The number of resources. ++ * @ext_res: Array of external resources which is ++ * sized at allocation time. ++ */ ++struct base_external_resource_list { ++ __u64 count; ++ struct base_external_resource ext_res[1]; ++}; ++ ++struct base_jd_debug_copy_buffer { ++ __u64 address; ++ __u64 size; ++ struct base_external_resource extres; ++}; ++ ++#define GPU_MAX_JOB_SLOTS 16 ++ ++/** ++ * DOC: User-side Base GPU Property Queries ++ * ++ * The User-side Base GPU Property Query interface encapsulates two ++ * sub-modules: ++ * ++ * - "Dynamic GPU Properties" ++ * - "Base Platform Config GPU Properties" ++ * ++ * Base only deals with properties that vary between different GPU ++ * implementations - the Dynamic GPU properties and the Platform Config ++ * properties. ++ * ++ * For properties that are constant for the GPU Architecture, refer to the ++ * GPU module. However, we will discuss their relevance here just to ++ * provide background information. ++ * ++ * About the GPU Properties in Base and GPU modules ++ * ++ * The compile-time properties (Platform Config, GPU Compile-time ++ * properties) are exposed as pre-processor macros. ++ * ++ * Complementing the compile-time properties are the Dynamic GPU ++ * Properties, which act as a conduit for the GPU Configuration ++ * Discovery. ++ * ++ * In general, the dynamic properties are present to verify that the platform ++ * has been configured correctly with the right set of Platform Config ++ * Compile-time Properties. ++ * ++ * As a consistent guide across the entire DDK, the choice for dynamic or ++ * compile-time should consider the following, in order: ++ * 1. Can the code be written so that it doesn't need to know the ++ * implementation limits at all? ++ * 2. If you need the limits, get the information from the Dynamic Property ++ * lookup. This should be done once as you fetch the context, and then cached ++ * as part of the context data structure, so it's cheap to access. ++ * 3. If there's a clear and arguable inefficiency in using Dynamic Properties, ++ * then use a Compile-Time Property (Platform Config, or GPU Compile-time ++ * property). Examples of where this might be sensible follow: ++ * - Part of a critical inner-loop ++ * - Frequent re-use throughout the driver, causing significant extra load ++ * instructions or control flow that would be worthwhile optimizing out. ++ * ++ * We cannot provide an exhaustive set of examples, neither can we provide a ++ * rule for every possible situation. 
Use common sense, and think about: what ++ * the rest of the driver will be doing; how the compiler might represent the ++ * value if it is a compile-time constant; whether an OEM shipping multiple ++ * devices would benefit much more from a single DDK binary, instead of ++ * insignificant micro-optimizations. ++ * ++ * Dynamic GPU Properties ++ * ++ * Dynamic GPU properties are presented in two sets: ++ * 1. the commonly used properties in @ref base_gpu_props, which have been ++ * unpacked from GPU register bitfields. ++ * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props ++ * (also a member of base_gpu_props). All of these are presented in ++ * the packed form, as presented by the GPU registers themselves. ++ * ++ * The raw properties in gpu_raw_gpu_props are necessary to ++ * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device ++ * behaving differently?". In this case, all information about the ++ * configuration is potentially useful, but it does not need to be processed ++ * by the driver. Instead, the raw registers can be processed by the Mali ++ * Tools software on the host PC. ++ * ++ * The properties returned extend the GPU Configuration Discovery ++ * registers. For example, GPU clock speed is not specified in the GPU ++ * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. ++ * ++ * The GPU properties are obtained by a call to ++ * base_get_gpu_props(). This simply returns a pointer to a const ++ * base_gpu_props structure. It is constant for the life of a base ++ * context. Multiple calls to base_get_gpu_props() to a base context ++ * return the same pointer to a constant structure. This avoids cache pollution ++ * of the common data. ++ * ++ * This pointer must not be freed, because it does not point to the start of a ++ * region allocated by the memory allocator; instead, just close the @ref ++ * base_context. ++ * ++ * ++ * Kernel Operation ++ * ++ * During Base Context Create time, user-side makes a single kernel call: ++ * - A call to fill user memory with GPU information structures ++ * ++ * The kernel-side will fill the provided the entire processed base_gpu_props ++ * structure, because this information is required in both ++ * user and kernel side; it does not make sense to decode it twice. ++ * ++ * Coherency groups must be derived from the bitmasks, but this can be done ++ * kernel side, and just once at kernel startup: Coherency groups must already ++ * be known kernel-side, to support chains that specify a 'Only Coherent Group' ++ * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. ++ * ++ * Coherency Group calculation ++ * ++ * Creation of the coherent group data is done at device-driver startup, and so ++ * is one-time. This will most likely involve a loop with CLZ, shifting, and ++ * bit clearing on the L2_PRESENT mask, depending on whether the ++ * system is L2 Coherent. The number of shader cores is done by a ++ * population count, since faulty cores may be disabled during production, ++ * producing a non-contiguous mask. ++ * ++ * The memory requirements for this algorithm can be determined either by a __u64 ++ * population count on the L2_PRESENT mask (a LUT helper already is ++ * required for the above), or simple assumption that there can be no more than ++ * 16 coherent groups, since core groups are typically 4 cores. ++ */ ++ ++/* ++ * More information is possible - but associativity and bus width are not ++ * required by upper-level apis. 
++ */ ++struct mali_base_gpu_l2_cache_props { ++ __u8 log2_line_size; ++ __u8 log2_cache_size; ++ __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ ++ __u8 padding[5]; ++}; ++ ++struct mali_base_gpu_tiler_props { ++ __u32 bin_size_bytes; /* Max is 4*2^15 */ ++ __u32 max_active_levels; /* Max is 2^15 */ ++}; ++ ++/** ++ * struct mali_base_gpu_thread_props - GPU threading system details. ++ * @max_threads: Max. number of threads per core ++ * @max_workgroup_size: Max. number of threads per workgroup ++ * @max_barrier_size: Max. number of threads that can synchronize on a ++ * simple barrier ++ * @max_registers: Total size [1..65535] of the register file available ++ * per core. ++ * @max_task_queue: Max. tasks [1..255] which may be sent to a core ++ * before it becomes blocked. ++ * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split ++ * field. ++ * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, ++ * 3 = SW Model/Emulation ++ * @padding: padding to align to 8-byte ++ * @tls_alloc: Number of threads per core that TLS must be ++ * allocated for ++ */ ++struct mali_base_gpu_thread_props { ++ __u32 max_threads; ++ __u32 max_workgroup_size; ++ __u32 max_barrier_size; ++ __u16 max_registers; ++ __u8 max_task_queue; ++ __u8 max_thread_group_split; ++ __u8 impl_tech; ++ __u8 padding[3]; ++ __u32 tls_alloc; ++}; ++ ++/** ++ * struct mali_base_gpu_coherent_group - descriptor for a coherent group ++ * @core_mask: Core restriction mask required for the group ++ * @num_cores: Number of cores in the group ++ * @padding: padding to align to 8-byte ++ * ++ * \c core_mask exposes all cores in that coherent group, and \c num_cores ++ * provides a cached population-count for that mask. ++ * ++ * @note Whilst all cores are exposed in the mask, not all may be available to ++ * the application, depending on the Kernel Power policy. ++ * ++ * @note if u64s must be 8-byte aligned, then this structure has 32-bits of ++ * wastage. ++ */ ++struct mali_base_gpu_coherent_group { ++ __u64 core_mask; ++ __u16 num_cores; ++ __u16 padding[3]; ++}; ++ ++/** ++ * struct mali_base_gpu_coherent_group_info - Coherency group information ++ * @num_groups: Number of coherent groups in the GPU. ++ * @num_core_groups: Number of core groups (coherent or not) in the GPU. ++ * Equivalent to the number of L2 Caches. ++ * The GPU Counter dumping writes 2048 bytes per core group, ++ * regardless of whether the core groups are coherent or not. ++ * Hence this member is needed to calculate how much memory ++ * is required for dumping. ++ * @note Do not use it to work out how many valid elements ++ * are in the group[] member. Use num_groups instead. ++ * @coherency: Coherency features of the memory, accessed by gpu_mem_features ++ * methods ++ * @padding: padding to align to 8-byte ++ * @group: Descriptors of coherent groups ++ * ++ * Note that the sizes of the members could be reduced. However, the \c group ++ * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte ++ * aligned, thus leading to wastage if the other members sizes were reduced. ++ * ++ * The groups are sorted by core mask. The core masks are non-repeating and do ++ * not intersect. 
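Since @num_cores is documented as a cached population count of @core_mask, the relationship can be written down directly. The helper below is a hypothetical sketch using the GCC/Clang popcount builtin:

static inline __u16
coherent_group_core_count(const struct mali_base_gpu_coherent_group *group)
{
	/* core_mask may be non-contiguous (faulty cores can be disabled in
	 * production), so a population count rather than highest-bit math
	 * is required. */
	return (__u16)__builtin_popcountll(group->core_mask);
}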
++ */ ++struct mali_base_gpu_coherent_group_info { ++ __u32 num_groups; ++ __u32 num_core_groups; ++ __u32 coherency; ++ __u32 padding; ++ struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; ++}; ++ ++#if MALI_USE_CSF ++#include "csf/mali_base_csf_kernel.h" ++#else ++#include "jm/mali_base_jm_kernel.h" ++#endif ++ ++/** ++ * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware ++ * Configuration Discovery registers. ++ * @shader_present: Shader core present bitmap ++ * @tiler_present: Tiler core present bitmap ++ * @l2_present: Level 2 cache present bitmap ++ * @stack_present: Core stack present bitmap ++ * @l2_features: L2 features ++ * @core_features: Core features ++ * @mem_features: Mem features ++ * @mmu_features: Mmu features ++ * @as_present: Bitmap of address spaces present ++ * @js_present: Job slots present ++ * @js_features: Array of job slot features. ++ * @tiler_features: Tiler features ++ * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU ++ * @gpu_id: GPU and revision identifier ++ * @thread_max_threads: Maximum number of threads per core ++ * @thread_max_workgroup_size: Maximum number of threads per workgroup ++ * @thread_max_barrier_size: Maximum number of threads per barrier ++ * @thread_features: Thread features ++ * @coherency_mode: Note: This is the _selected_ coherency mode rather than the ++ * available modes as exposed in the coherency_features register ++ * @thread_tls_alloc: Number of threads per core that TLS must be allocated for ++ * @gpu_features: GPU features ++ * ++ * The information is presented inefficiently for access. For frequent access, ++ * the values should be better expressed in an unpacked form in the ++ * base_gpu_props structure. ++ * ++ * The raw properties in gpu_raw_gpu_props are necessary to ++ * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device ++ * behaving differently?". In this case, all information about the ++ * configuration is potentially useful, but it does not need to be processed ++ * by the driver. Instead, the raw registers can be processed by the Mali ++ * Tools software on the host PC. ++ * ++ */ ++struct gpu_raw_gpu_props { ++ __u64 shader_present; ++ __u64 tiler_present; ++ __u64 l2_present; ++ __u64 stack_present; ++ __u32 l2_features; ++ __u32 core_features; ++ __u32 mem_features; ++ __u32 mmu_features; ++ ++ __u32 as_present; ++ ++ __u32 js_present; ++ __u32 js_features[GPU_MAX_JOB_SLOTS]; ++ __u32 tiler_features; ++ __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ ++ __u32 gpu_id; ++ ++ __u32 thread_max_threads; ++ __u32 thread_max_workgroup_size; ++ __u32 thread_max_barrier_size; ++ __u32 thread_features; ++ ++ /* ++ * Note: This is the _selected_ coherency mode rather than the ++ * available modes as exposed in the coherency_features register. ++ */ ++ __u32 coherency_mode; ++ ++ __u32 thread_tls_alloc; ++ __u64 gpu_features; ++}; ++ ++/** ++ * struct base_gpu_props - Return structure for base_get_gpu_props(). ++ * @core_props: Core props. ++ * @l2_props: L2 props. ++ * @unused_1: Keep for backwards compatibility. ++ * @tiler_props: Tiler props. ++ * @thread_props: Thread props. ++ * @raw_props: This member is large, likely to be 128 bytes. ++ * @coherency_info: This must be last member of the structure. ++ * ++ * NOTE: the raw_props member in this data structure contains the register ++ * values from which the value of the other members are derived. 
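The texture_features words are described earlier as bit patterns in which a set bit marks a supported format, so a support check reduces to picking a register and a bit. In the hypothetical helper below, the split at 32 bits per register is an assumption based on the __u32 register width:

#include <stdbool.h>

static inline bool
raw_props_texture_format_supported(const struct gpu_raw_gpu_props *raw,
				   unsigned int format_bit)
{
	unsigned int reg = format_bit / 32;   /* which TEXTURE_FEATURES_x register */
	unsigned int bit = format_bit % 32;   /* bit within that register */

	if (reg >= BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS)
		return false;

	return (raw->texture_features[reg] >> bit) & 1u;
}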
The derived ++ * members exist to allow for efficient access and/or shielding the details ++ * of the layout of the registers. ++ */ ++struct base_gpu_props { ++ struct mali_base_gpu_core_props core_props; ++ struct mali_base_gpu_l2_cache_props l2_props; ++ __u64 unused_1; ++ struct mali_base_gpu_tiler_props tiler_props; ++ struct mali_base_gpu_thread_props thread_props; ++ struct gpu_raw_gpu_props raw_props; ++ struct mali_base_gpu_coherent_group_info coherency_info; ++}; ++ ++#define BASE_MEM_GROUP_ID_GET(flags) \ ++ ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) ++ ++#define BASE_MEM_GROUP_ID_SET(id) \ ++ (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? \ ++ BASE_MEM_GROUP_DEFAULT : \ ++ id) \ ++ << BASEP_MEM_GROUP_ID_SHIFT) & \ ++ BASE_MEM_GROUP_ID_MASK) ++ ++#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ ++ (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ ++ ((base_context_create_flags)(group_id) \ ++ << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) ++ ++#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ ++ ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ ++ BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) ++ ++/* ++ * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These ++ * flags are also used, where applicable, for specifying which fields ++ * are valid following the request operation. ++ */ ++ ++/* For monotonic (counter) timefield */ ++#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) ++/* For system wide timestamp */ ++#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) ++/* For GPU cycle counter */ ++#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) ++/* Specify kernel GPU register timestamp */ ++#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) ++/* Specify userspace cntvct_el0 timestamp source */ ++#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) ++ ++#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ ++ BASE_TIMEINFO_MONOTONIC_FLAG | \ ++ BASE_TIMEINFO_TIMESTAMP_FLAG | \ ++ BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ ++ BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ ++ BASE_TIMEINFO_USER_SOURCE_FLAG) ++ ++/* Maximum number of source allocations allowed to create an alias allocation. ++ * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array ++ * layers, since each cube map in the array will have 6 faces. ++ */ ++#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) ++ ++#endif /* _UAPI_BASE_KERNEL_H_ */ +diff --git a/src/panfrost/base/include/mali_kbase_gpuprops.h b/src/panfrost/base/include/mali_kbase_gpuprops.h +new file mode 100644 +index 00000000000..b250feca022 +--- /dev/null ++++ b/src/panfrost/base/include/mali_kbase_gpuprops.h +@@ -0,0 +1,127 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. 
++ * ++ */ ++ ++#ifndef _UAPI_KBASE_GPUPROP_H_ ++#define _UAPI_KBASE_GPUPROP_H_ ++ ++/********************************** ++ * Definitions for GPU properties * ++ **********************************/ ++#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) ++#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) ++#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) ++#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) ++ ++#define KBASE_GPUPROP_PRODUCT_ID 1 ++#define KBASE_GPUPROP_VERSION_STATUS 2 ++#define KBASE_GPUPROP_MINOR_REVISION 3 ++#define KBASE_GPUPROP_MAJOR_REVISION 4 ++/* 5 previously used for GPU speed */ ++#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 ++/* 7 previously used for minimum GPU speed */ ++#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 ++#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 ++ ++#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 ++#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 ++#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 ++ ++#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 ++#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 ++ ++#define KBASE_GPUPROP_MAX_THREADS 18 ++#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 ++#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 ++#define KBASE_GPUPROP_MAX_REGISTERS 21 ++#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 ++#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 ++#define KBASE_GPUPROP_IMPL_TECH 24 ++ ++#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 ++#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 ++#define KBASE_GPUPROP_RAW_L2_PRESENT 27 ++#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 ++#define KBASE_GPUPROP_RAW_L2_FEATURES 29 ++#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 ++#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 ++#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 ++#define KBASE_GPUPROP_RAW_AS_PRESENT 33 ++#define KBASE_GPUPROP_RAW_JS_PRESENT 34 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 ++#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 ++#define KBASE_GPUPROP_RAW_GPU_ID 55 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 ++#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 ++#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 ++ ++#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 ++#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 ++#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 ++#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 ++#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 ++#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 ++#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 ++#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 ++#define KBASE_GPUPROP_COHERENCY_GROUP_5 69 ++#define 
KBASE_GPUPROP_COHERENCY_GROUP_6 70 ++#define KBASE_GPUPROP_COHERENCY_GROUP_7 71 ++#define KBASE_GPUPROP_COHERENCY_GROUP_8 72 ++#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 ++#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 ++#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 ++#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 ++#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 ++#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 ++#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 ++ ++#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 ++ ++#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 ++ ++#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 ++#define KBASE_GPUPROP_TLS_ALLOC 84 ++#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 ++ ++#endif +diff --git a/src/panfrost/base/include/mali_kbase_ioctl.h b/src/panfrost/base/include/mali_kbase_ioctl.h +new file mode 100644 +index 00000000000..96f606af5f8 +--- /dev/null ++++ b/src/panfrost/base/include/mali_kbase_ioctl.h +@@ -0,0 +1,759 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_KBASE_IOCTL_H_ ++#define _UAPI_KBASE_IOCTL_H_ ++ ++#ifdef __cpluscplus ++extern "C" { ++#endif ++ ++#include ++#include ++ ++#if MALI_USE_CSF ++#include "csf/mali_kbase_csf_ioctl.h" ++#else ++#include "jm/mali_kbase_jm_ioctl.h" ++#endif /* MALI_USE_CSF */ ++ ++#define KBASE_IOCTL_TYPE 0x80 ++ ++/** ++ * struct kbase_ioctl_set_flags - Set kernel context creation flags ++ * ++ * @create_flags: Flags - see base_context_create_flags ++ */ ++struct kbase_ioctl_set_flags { ++ __u32 create_flags; ++}; ++ ++#define KBASE_IOCTL_SET_FLAGS \ ++ _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) ++ ++/** ++ * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel ++ * ++ * @buffer: Pointer to the buffer to store properties into ++ * @size: Size of the buffer ++ * @flags: Flags - must be zero for now ++ * ++ * The ioctl will return the number of bytes stored into @buffer or an error ++ * on failure (e.g. @size is too small). If @size is specified as 0 then no ++ * data will be written but the return value will be the number of bytes needed ++ * for all the properties. ++ * ++ * @flags may be used in the future to request a different format for the ++ * buffer. With @flags == 0 the following format is used. ++ * ++ * The buffer will be filled with pairs of values, a __u32 key identifying the ++ * property followed by the value. The size of the value is identified using ++ * the bottom bits of the key. The value then immediately followed the key and ++ * is tightly packed (there is no padding). All keys and values are ++ * little-endian. 
++ * ++ * 00 = __u8 ++ * 01 = __u16 ++ * 10 = __u32 ++ * 11 = __u64 ++ */ ++struct kbase_ioctl_get_gpuprops { ++ __u64 buffer; ++ __u32 size; ++ __u32 flags; ++}; ++ ++#define KBASE_IOCTL_GET_GPUPROPS \ ++ _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) ++ ++/** ++ * union kbase_ioctl_mem_alloc - Allocate memory on the GPU ++ * @in: Input parameters ++ * @in.va_pages: The number of pages of virtual address space to reserve ++ * @in.commit_pages: The number of physical pages to allocate ++ * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region ++ * @in.flags: Flags ++ * @out: Output parameters ++ * @out.flags: Flags ++ * @out.gpu_va: The GPU virtual address which is allocated ++ */ ++union kbase_ioctl_mem_alloc { ++ struct { ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u64 flags; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_ALLOC \ ++ _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) ++ ++/** ++ * struct kbase_ioctl_mem_query - Query properties of a GPU memory region ++ * @in: Input parameters ++ * @in.gpu_addr: A GPU address contained within the region ++ * @in.query: The type of query ++ * @out: Output parameters ++ * @out.value: The result of the query ++ * ++ * Use a %KBASE_MEM_QUERY_xxx flag as input for @query. ++ */ ++union kbase_ioctl_mem_query { ++ struct { ++ __u64 gpu_addr; ++ __u64 query; ++ } in; ++ struct { ++ __u64 value; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_QUERY \ ++ _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) ++ ++#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) ++#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) ++#define KBASE_MEM_QUERY_FLAGS ((__u64)3) ++ ++/** ++ * struct kbase_ioctl_mem_free - Free a memory region ++ * @gpu_addr: Handle to the region to free ++ */ ++struct kbase_ioctl_mem_free { ++ __u64 gpu_addr; ++}; ++ ++#define KBASE_IOCTL_MEM_FREE \ ++ _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) ++ ++/** ++ * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader ++ * @buffer_count: requested number of dumping buffers ++ * @fe_bm: counters selection bitmask (Front end) ++ * @shader_bm: counters selection bitmask (Shader) ++ * @tiler_bm: counters selection bitmask (Tiler) ++ * @mmu_l2_bm: counters selection bitmask (MMU_L2) ++ * ++ * A fd is returned from the ioctl if successful, or a negative value on error ++ */ ++struct kbase_ioctl_hwcnt_reader_setup { ++ __u32 buffer_count; ++ __u32 fe_bm; ++ __u32 shader_bm; ++ __u32 tiler_bm; ++ __u32 mmu_l2_bm; ++}; ++ ++#define KBASE_IOCTL_HWCNT_READER_SETUP \ ++ _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) ++ ++/** ++ * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. ++ * @data: Counter samples for the dummy model. ++ * @size: Size of the counter sample data. ++ * @padding: Padding. 
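Putting the KBASE_IOCTL_GET_GPUPROPS encoding described above into code, a reader of the returned blob only needs the two size bits of each key. This is a hypothetical userspace sketch; it assumes a little-endian host and that the property number occupies the key bits above the size code:

#include <stdio.h>
#include <string.h>
#include <linux/types.h>

static void dump_gpuprops(const __u8 *buf, size_t len)
{
	size_t pos = 0;

	while (pos + sizeof(__u32) <= len) {
		__u32 key;
		__u64 value = 0;
		size_t value_size;

		memcpy(&key, buf + pos, sizeof(key));
		pos += sizeof(key);

		/* Bottom two bits are KBASE_GPUPROP_VALUE_SIZE_U8..U64. */
		value_size = (size_t)1 << (key & 0x3);
		if (pos + value_size > len)
			break;

		memcpy(&value, buf + pos, value_size); /* little-endian host assumed */
		pos += value_size;

		/* key >> 2 is assumed to be the KBASE_GPUPROP_* number. */
		printf("prop %u = %llu\n", key >> 2, (unsigned long long)value);
	}
}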
++ */ ++struct kbase_ioctl_hwcnt_values { ++ __u64 data; ++ __u32 size; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_HWCNT_SET \ ++ _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) ++ ++/** ++ * struct kbase_ioctl_disjoint_query - Query the disjoint counter ++ * @counter: A counter of disjoint events in the kernel ++ */ ++struct kbase_ioctl_disjoint_query { ++ __u32 counter; ++}; ++ ++#define KBASE_IOCTL_DISJOINT_QUERY \ ++ _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) ++ ++/** ++ * struct kbase_ioctl_get_ddk_version - Query the kernel version ++ * @version_buffer: Buffer to receive the kernel version string ++ * @size: Size of the buffer ++ * @padding: Padding ++ * ++ * The ioctl will return the number of bytes written into version_buffer ++ * (which includes a NULL byte) or a negative error code ++ * ++ * The ioctl request code has to be _IOW because the data in ioctl struct is ++ * being copied to the kernel, even though the kernel then writes out the ++ * version info to the buffer specified in the ioctl. ++ */ ++struct kbase_ioctl_get_ddk_version { ++ __u64 version_buffer; ++ __u32 size; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_GET_DDK_VERSION \ ++ _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory ++ * allocator (between kernel driver ++ * version 10.2--11.4) ++ * @va_pages: Number of VA pages to reserve for JIT ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. ++ * ++ * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for ++ * backwards compatibility. ++ */ ++struct kbase_ioctl_mem_jit_init_10_2 { ++ __u64 va_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory ++ * allocator (between kernel driver ++ * version 11.5--11.19) ++ * @va_pages: Number of VA pages to reserve for JIT ++ * @max_allocations: Maximum number of concurrent allocations ++ * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) ++ * @group_id: Group ID to be used for physical allocations ++ * @padding: Currently unused, must be zero ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. ++ * ++ * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for ++ * backwards compatibility. ++ */ ++struct kbase_ioctl_mem_jit_init_11_5 { ++ __u64 va_pages; ++ __u8 max_allocations; ++ __u8 trim_level; ++ __u8 group_id; ++ __u8 padding[5]; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory ++ * allocator ++ * @va_pages: Number of GPU virtual address pages to reserve for just-in-time ++ * memory allocations ++ * @max_allocations: Maximum number of concurrent allocations ++ * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) ++ * @group_id: Group ID to be used for physical allocations ++ * @padding: Currently unused, must be zero ++ * @phys_pages: Maximum number of physical pages to allocate just-in-time ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. 
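As a usage sketch for KBASE_IOCTL_GET_DDK_VERSION defined just above: the caller passes a user buffer and its size, and the ioctl reports how many bytes (including the terminating NUL) it wrote. The function name and the kbase device node are assumptions for illustration:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

/* kbase_fd: an open kbase device file descriptor (typically /dev/mali0). */
static int example_query_ddk_version(int kbase_fd, char *out, __u32 out_size)
{
	struct kbase_ioctl_get_ddk_version req;

	memset(&req, 0, sizeof(req));
	req.version_buffer = (__u64)(uintptr_t)out;
	req.size = out_size;

	/* Returns the number of bytes written into the buffer, or a
	 * negative value on failure. */
	return ioctl(kbase_fd, KBASE_IOCTL_GET_DDK_VERSION, &req);
}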
++ */ ++struct kbase_ioctl_mem_jit_init { ++ __u64 va_pages; ++ __u8 max_allocations; ++ __u8 trim_level; ++ __u8 group_id; ++ __u8 padding[5]; ++ __u64 phys_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) ++ ++/** ++ * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory ++ * ++ * @handle: GPU memory handle (GPU VA) ++ * @user_addr: The address where it is mapped in user space ++ * @size: The number of bytes to synchronise ++ * @type: The direction to synchronise: 0 is sync to memory (clean), ++ * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_mem_sync { ++ __u64 handle; ++ __u64 user_addr; ++ __u64 size; ++ __u8 type; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_MEM_SYNC \ ++ _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) ++ ++/** ++ * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer ++ * ++ * @in: Input parameters ++ * @in.gpu_addr: The GPU address of the memory region ++ * @in.cpu_addr: The CPU address to locate ++ * @in.size: A size in bytes to validate is contained within the region ++ * @out: Output parameters ++ * @out.offset: The offset from the start of the memory region to @cpu_addr ++ */ ++union kbase_ioctl_mem_find_cpu_offset { ++ struct { ++ __u64 gpu_addr; ++ __u64 cpu_addr; ++ __u64 size; ++ } in; ++ struct { ++ __u64 offset; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ ++ _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) ++ ++/** ++ * struct kbase_ioctl_get_context_id - Get the kernel context ID ++ * ++ * @id: The kernel context ID ++ */ ++struct kbase_ioctl_get_context_id { ++ __u32 id; ++}; ++ ++#define KBASE_IOCTL_GET_CONTEXT_ID \ ++ _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) ++ ++/** ++ * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd ++ * ++ * @flags: Flags ++ * ++ * The ioctl returns a file descriptor when successful ++ */ ++struct kbase_ioctl_tlstream_acquire { ++ __u32 flags; ++}; ++ ++#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ ++ _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) ++ ++#define KBASE_IOCTL_TLSTREAM_FLUSH \ ++ _IO(KBASE_IOCTL_TYPE, 19) ++ ++/** ++ * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region ++ * ++ * @gpu_addr: The memory region to modify ++ * @pages: The number of physical pages that should be present ++ * ++ * The ioctl may return on the following error codes or 0 for success: ++ * -ENOMEM: Out of memory ++ * -EINVAL: Invalid arguments ++ */ ++struct kbase_ioctl_mem_commit { ++ __u64 gpu_addr; ++ __u64 pages; ++}; ++ ++#define KBASE_IOCTL_MEM_COMMIT \ ++ _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) ++ ++/** ++ * union kbase_ioctl_mem_alias - Create an alias of memory regions ++ * @in: Input parameters ++ * @in.flags: Flags, see BASE_MEM_xxx ++ * @in.stride: Bytes between start of each memory region ++ * @in.nents: The number of regions to pack together into the alias ++ * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info ++ * @out: Output parameters ++ * @out.flags: Flags, see BASE_MEM_xxx ++ * @out.gpu_va: Address of the new alias ++ * @out.va_pages: Size of the new alias ++ */ ++union kbase_ioctl_mem_alias { ++ struct { ++ __u64 flags; ++ __u64 stride; ++ __u64 nents; ++ __u64 aliasing_info; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ __u64 
va_pages; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_ALIAS \ ++ _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) ++ ++/** ++ * union kbase_ioctl_mem_import - Import memory for use by the GPU ++ * @in: Input parameters ++ * @in.flags: Flags, see BASE_MEM_xxx ++ * @in.phandle: Handle to the external memory ++ * @in.type: Type of external memory, see base_mem_import_type ++ * @in.padding: Amount of extra VA pages to append to the imported buffer ++ * @out: Output parameters ++ * @out.flags: Flags, see BASE_MEM_xxx ++ * @out.gpu_va: Address of the new alias ++ * @out.va_pages: Size of the new alias ++ */ ++union kbase_ioctl_mem_import { ++ struct { ++ __u64 flags; ++ __u64 phandle; ++ __u32 type; ++ __u32 padding; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ __u64 va_pages; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_IMPORT \ ++ _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) ++ ++/** ++ * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region ++ * @gpu_va: The GPU region to modify ++ * @flags: The new flags to set ++ * @mask: Mask of the flags to modify ++ */ ++struct kbase_ioctl_mem_flags_change { ++ __u64 gpu_va; ++ __u64 flags; ++ __u64 mask; ++}; ++ ++#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ ++ _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) ++ ++/** ++ * struct kbase_ioctl_stream_create - Create a synchronisation stream ++ * @name: A name to identify this stream. Must be NULL-terminated. ++ * ++ * Note that this is also called a "timeline", but is named stream to avoid ++ * confusion with other uses of the word. ++ * ++ * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. ++ * ++ * The ioctl returns a file descriptor. ++ */ ++struct kbase_ioctl_stream_create { ++ char name[32]; ++}; ++ ++#define KBASE_IOCTL_STREAM_CREATE \ ++ _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) ++ ++/** ++ * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence ++ * @fd: The file descriptor to validate ++ */ ++struct kbase_ioctl_fence_validate { ++ int fd; ++}; ++ ++#define KBASE_IOCTL_FENCE_VALIDATE \ ++ _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) ++ ++/** ++ * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel ++ * @buffer: Pointer to the information ++ * @len: Length ++ * @padding: Padding ++ * ++ * The data provided is accessible through a debugfs file ++ */ ++struct kbase_ioctl_mem_profile_add { ++ __u64 buffer; ++ __u32 len; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_MEM_PROFILE_ADD \ ++ _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) ++ ++/** ++ * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource ++ * @count: Number of resources ++ * @address: Array of __u64 GPU addresses of the external resources to map ++ */ ++struct kbase_ioctl_sticky_resource_map { ++ __u64 count; ++ __u64 address; ++}; ++ ++#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ ++ _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) ++ ++/** ++ * struct kbase_ioctl_sticky_resource_unmap - Unmap a resource mapped which was ++ * previously permanently mapped ++ * @count: Number of resources ++ * @address: Array of __u64 GPU addresses of the external resources to unmap ++ */ ++struct kbase_ioctl_sticky_resource_unmap { ++ __u64 count; ++ __u64 address; ++}; ++ ++#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ ++ _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) ++ ++/** ++ * union 
kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of ++ * the GPU memory region for ++ * the given gpu address and ++ * the offset of that address ++ * into the region ++ * @in: Input parameters ++ * @in.gpu_addr: GPU virtual address ++ * @in.size: Size in bytes within the region ++ * @out: Output parameters ++ * @out.start: Address of the beginning of the memory region enclosing @gpu_addr ++ * for the length of @offset bytes ++ * @out.offset: The offset from the start of the memory region to @gpu_addr ++ */ ++union kbase_ioctl_mem_find_gpu_start_and_offset { ++ struct { ++ __u64 gpu_addr; ++ __u64 size; ++ } in; ++ struct { ++ __u64 start; ++ __u64 offset; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ ++ _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) ++ ++#define KBASE_IOCTL_CINSTR_GWT_START \ ++ _IO(KBASE_IOCTL_TYPE, 33) ++ ++#define KBASE_IOCTL_CINSTR_GWT_STOP \ ++ _IO(KBASE_IOCTL_TYPE, 34) ++ ++/** ++ * union kbase_ioctl_cinstr_gwt_dump - Used to collect all GPU write fault ++ * addresses. ++ * @in: Input parameters ++ * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. ++ * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) ++ * @in.len: Number of addresses the buffers can hold. ++ * @in.padding: padding ++ * @out: Output parameters ++ * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. ++ * @out.more_data_available: Status indicating if more addresses are available. ++ * @out.padding: padding ++ * ++ * This structure is used when performing a call to dump GPU write fault ++ * addresses. ++ */ ++union kbase_ioctl_cinstr_gwt_dump { ++ struct { ++ __u64 addr_buffer; ++ __u64 size_buffer; ++ __u32 len; ++ __u32 padding; ++ ++ } in; ++ struct { ++ __u32 no_of_addr_collected; ++ __u8 more_data_available; ++ __u8 padding[27]; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CINSTR_GWT_DUMP \ ++ _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) ++ ++/** ++ * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone ++ * ++ * @va_pages: Number of VA pages to reserve for EXEC_VA ++ */ ++struct kbase_ioctl_mem_exec_init { ++ __u64 va_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_EXEC_INIT \ ++ _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) ++ ++/** ++ * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of ++ * cpu/gpu time (counter values) ++ * @in: Input parameters ++ * @in.request_flags: Bit-flags indicating the requested types. ++ * @in.paddings: Unused, size alignment matching the out. ++ * @out: Output parameters ++ * @out.sec: Integer field of the monotonic time, unit in seconds. ++ * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. ++ * @out.padding: Unused, for __u64 alignment ++ * @out.timestamp: System wide timestamp (counter) value. ++ * @out.cycle_counter: GPU cycle counter value. 
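The GWT ioctls above work as a start/drain/stop sequence: KBASE_IOCTL_CINSTR_GWT_START enables write tracking, repeated KBASE_IOCTL_CINSTR_GWT_DUMP calls drain the modified-area list until more_data_available clears, and KBASE_IOCTL_CINSTR_GWT_STOP disables it. A sketch of the drain loop; the batch size is arbitrary and the fd is assumed to be an initialised kbase device:

    /* Sketch of draining GPU write-fault addresses with the GWT ioctls above. */
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    #include "mali_kbase_ioctl.h"

    #define GWT_BATCH 256

    static int dump_gpu_writes(int fd)
    {
       uint64_t addrs[GWT_BATCH], sizes[GWT_BATCH];

       if (ioctl(fd, KBASE_IOCTL_CINSTR_GWT_START) < 0)
          return -1;

       union kbase_ioctl_cinstr_gwt_dump dump;
       do {
          dump = (union kbase_ioctl_cinstr_gwt_dump) {
             .in = {
                .addr_buffer = (uintptr_t)addrs,
                .size_buffer = (uintptr_t)sizes,
                .len = GWT_BATCH,
             },
          };
          if (ioctl(fd, KBASE_IOCTL_CINSTR_GWT_DUMP, &dump) < 0)
             break;

          for (uint32_t i = 0; i < dump.out.no_of_addr_collected; i++)
             printf("modified: 0x%llx (%llu pages)\n",
                    (unsigned long long)addrs[i], (unsigned long long)sizes[i]);
       } while (dump.out.more_data_available);

       return ioctl(fd, KBASE_IOCTL_CINSTR_GWT_STOP);
    }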
++ */ ++union kbase_ioctl_get_cpu_gpu_timeinfo { ++ struct { ++ __u32 request_flags; ++ __u32 paddings[7]; ++ } in; ++ struct { ++ __u64 sec; ++ __u32 nsec; ++ __u32 padding; ++ __u64 timestamp; ++ __u64 cycle_counter; ++ } out; ++}; ++ ++#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ ++ _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) ++ ++/** ++ * struct kbase_ioctl_context_priority_check - Check the max possible priority ++ * @priority: Input priority & output priority ++ */ ++ ++struct kbase_ioctl_context_priority_check { ++ __u8 priority; ++}; ++ ++#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ ++ _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) ++ ++/** ++ * struct kbase_ioctl_set_limited_core_count - Set the limited core count. ++ * ++ * @max_core_count: Maximum core count ++ */ ++struct kbase_ioctl_set_limited_core_count { ++ __u8 max_core_count; ++}; ++ ++#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ ++ _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) ++ ++/** ++ * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter ++ * information ++ * @info_item_size: Performance counter item size in bytes. ++ * @info_item_count: Performance counter item count in the info_list_ptr. ++ * @info_list_ptr: Performance counter item list pointer which points to a ++ * list with info_item_count of items. ++ * ++ * On success: returns info_item_size and info_item_count if info_list_ptr is ++ * NULL, returns performance counter information if info_list_ptr is not NULL. ++ * On error: returns a negative error code. ++ */ ++struct kbase_ioctl_kinstr_prfcnt_enum_info { ++ __u32 info_item_size; ++ __u32 info_item_count; ++ __u64 info_list_ptr; ++}; ++ ++#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ ++ _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) ++ ++/** ++ * struct kbase_ioctl_kinstr_prfcnt_setup - Setup HWC dumper/reader ++ * @in: input parameters. ++ * @in.request_item_count: Number of requests in the requests array. ++ * @in.request_item_size: Size in bytes of each request in the requests array. ++ * @in.requests_ptr: Pointer to the requests array. ++ * @out: output parameters. ++ * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for ++ * each sample. ++ * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap ++ * for reading performance counter samples. ++ * ++ * A fd is returned from the ioctl if successful, or a negative value on error. 
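KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO above is a two-phase protocol: call it once with info_list_ptr set to NULL to learn info_item_size and info_item_count, then again with a buffer big enough for that many items. The sketch below shows only the sizing protocol and treats the item layout as opaque, since it is defined elsewhere:

    /* Two-phase enumeration of performance-counter info, as described above. */
    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    #include "mali_kbase_ioctl.h"

    static void *enum_prfcnt_info(int fd, uint32_t *count, uint32_t *item_size)
    {
       struct kbase_ioctl_kinstr_prfcnt_enum_info info = {
          .info_list_ptr = 0,   /* NULL: first call only asks for the sizes */
       };
       if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO, &info) < 0)
          return NULL;

       void *items = calloc(info.info_item_count, info.info_item_size);
       if (!items)
          return NULL;

       /* Second call: same sizes, now with a destination buffer. */
       info.info_list_ptr = (uintptr_t)items;
       if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO, &info) < 0) {
          free(items);
          return NULL;
       }

       *count = info.info_item_count;
       *item_size = info.info_item_size;
       return items;
    }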
++ */ ++union kbase_ioctl_kinstr_prfcnt_setup { ++ struct { ++ __u32 request_item_count; ++ __u32 request_item_size; ++ __u64 requests_ptr; ++ } in; ++ struct { ++ __u32 prfcnt_metadata_item_size; ++ __u32 prfcnt_mmap_size_bytes; ++ } out; ++}; ++ ++#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ ++ _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) ++ ++/*************** ++ * test ioctls * ++ ***************/ ++#if MALI_UNIT_TEST ++/* These ioctls are purely for test purposes and are not used in the production ++ * driver, they therefore may change without notice ++ */ ++ ++#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) ++ ++ ++/** ++ * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes ++ * @bytes_collected: number of bytes read by user ++ * @bytes_generated: number of bytes generated by tracepoints ++ */ ++struct kbase_ioctl_tlstream_stats { ++ __u32 bytes_collected; ++ __u32 bytes_generated; ++}; ++ ++#define KBASE_IOCTL_TLSTREAM_STATS \ ++ _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) ++ ++#endif /* MALI_UNIT_TEST */ ++ ++/* Customer extension range */ ++#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) ++ ++/* If the integration needs extra ioctl add them there ++ * like this: ++ * ++ * struct my_ioctl_args { ++ * .... ++ * } ++ * ++ * #define KBASE_IOCTL_MY_IOCTL \ ++ * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) ++ */ ++ ++#ifdef __cpluscplus ++} ++#endif ++ ++#endif /* _UAPI_KBASE_IOCTL_H_ */ +diff --git a/src/panfrost/base/include/old/mali-ioctl-midgard.h b/src/panfrost/base/include/old/mali-ioctl-midgard.h +new file mode 100644 +index 00000000000..5f33f5c4c4b +--- /dev/null ++++ b/src/panfrost/base/include/old/mali-ioctl-midgard.h +@@ -0,0 +1,80 @@ ++/* ++ * © Copyright 2017-2018 The Panfrost Community ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * A copy of the licence is included with the program, and can also be obtained ++ * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ++ * Boston, MA 02110-1301, USA. 
++ * ++ */ ++ ++#ifndef __KBASE_IOCTL_MIDGARD_H__ ++#define __KBASE_IOCTL_MIDGARD_H__ ++ ++#define KBASE_IOCTL_TYPE_BASE 0x80 ++#define KBASE_IOCTL_TYPE_MAX 0x82 ++ ++union kbase_ioctl_mem_alloc { ++ struct { ++ union kbase_ioctl_header header; ++ u64 va_pages; ++ u64 commit_pages; ++ u64 extension; ++ u64 flags; ++ } in; ++ struct { ++ union kbase_ioctl_header header; ++ u64 pad[3]; ++ u64 flags; ++ mali_ptr gpu_va; ++ u16 va_alignment; ++ } out; ++ u64 pad[7]; ++} __attribute__((packed)); ++ ++#define KBASE_IOCTL_TYPE_COUNT (KBASE_IOCTL_TYPE_MAX - KBASE_IOCTL_TYPE_BASE + 1) ++ ++#define KBASE_IOCTL_GET_VERSION (_IOWR(0x80, 0, struct kbase_ioctl_get_version)) ++#define KBASE_IOCTL_MEM_ALLOC (_IOWR(0x82, 0, union kbase_ioctl_mem_alloc)) ++#define KBASE_IOCTL_MEM_IMPORT (_IOWR(0x82, 1, union kbase_ioctl_mem_import)) ++#define KBASE_IOCTL_MEM_COMMIT (_IOWR(0x82, 2, struct kbase_ioctl_mem_commit)) ++#define KBASE_IOCTL_MEM_QUERY (_IOWR(0x82, 3, struct kbase_ioctl_mem_query)) ++#define KBASE_IOCTL_MEM_FREE (_IOWR(0x82, 4, struct kbase_ioctl_mem_free)) ++#define KBASE_IOCTL_MEM_FLAGS_CHANGE (_IOWR(0x82, 5, struct kbase_ioctl_mem_flags_change)) ++#define KBASE_IOCTL_MEM_ALIAS (_IOWR(0x82, 6, struct kbase_ioctl_mem_alias)) ++#define KBASE_IOCTL_MEM_SYNC (_IOWR(0x82, 8, struct kbase_ioctl_mem_sync)) ++#define KBASE_IOCTL_POST_TERM (_IOWR(0x82, 9, __ioctl_placeholder)) ++#define KBASE_IOCTL_HWCNT_SETUP (_IOWR(0x82, 10, __ioctl_placeholder)) ++#define KBASE_IOCTL_HWCNT_DUMP (_IOWR(0x82, 11, __ioctl_placeholder)) ++#define KBASE_IOCTL_HWCNT_CLEAR (_IOWR(0x82, 12, __ioctl_placeholder)) ++#define KBASE_IOCTL_GPU_PROPS_REG_DUMP (_IOWR(0x82, 14, struct kbase_ioctl_gpu_props_reg_dump)) ++#define KBASE_IOCTL_FIND_CPU_OFFSET (_IOWR(0x82, 15, __ioctl_placeholder)) ++#define KBASE_IOCTL_GET_VERSION_NEW (_IOWR(0x82, 16, struct kbase_ioctl_get_version)) ++#define KBASE_IOCTL_SET_FLAGS (_IOWR(0x82, 18, struct kbase_ioctl_set_flags)) ++#define KBASE_IOCTL_SET_TEST_DATA (_IOWR(0x82, 19, __ioctl_placeholder)) ++#define KBASE_IOCTL_INJECT_ERROR (_IOWR(0x82, 20, __ioctl_placeholder)) ++#define KBASE_IOCTL_MODEL_CONTROL (_IOWR(0x82, 21, __ioctl_placeholder)) ++#define KBASE_IOCTL_KEEP_GPU_POWERED (_IOWR(0x82, 22, __ioctl_placeholder)) ++#define KBASE_IOCTL_FENCE_VALIDATE (_IOWR(0x82, 23, __ioctl_placeholder)) ++#define KBASE_IOCTL_STREAM_CREATE (_IOWR(0x82, 24, struct kbase_ioctl_stream_create)) ++#define KBASE_IOCTL_GET_PROFILING_CONTROLS (_IOWR(0x82, 25, __ioctl_placeholder)) ++#define KBASE_IOCTL_SET_PROFILING_CONTROLS (_IOWR(0x82, 26, __ioctl_placeholder)) ++#define KBASE_IOCTL_DEBUGFS_MEM_PROFILE_ADD (_IOWR(0x82, 27, __ioctl_placeholder)) ++#define KBASE_IOCTL_JOB_SUBMIT (_IOWR(0x82, 28, struct kbase_ioctl_job_submit)) ++#define KBASE_IOCTL_DISJOINT_QUERY (_IOWR(0x82, 29, __ioctl_placeholder)) ++#define KBASE_IOCTL_GET_CONTEXT_ID (_IOWR(0x82, 31, struct kbase_ioctl_get_context_id)) ++#define KBASE_IOCTL_TLSTREAM_ACQUIRE_V10_4 (_IOWR(0x82, 32, __ioctl_placeholder)) ++#define KBASE_IOCTL_TLSTREAM_TEST (_IOWR(0x82, 33, __ioctl_placeholder)) ++#define KBASE_IOCTL_TLSTREAM_STATS (_IOWR(0x82, 34, __ioctl_placeholder)) ++#define KBASE_IOCTL_TLSTREAM_FLUSH (_IOWR(0x82, 35, __ioctl_placeholder)) ++#define KBASE_IOCTL_HWCNT_READER_SETUP (_IOWR(0x82, 36, __ioctl_placeholder)) ++#define KBASE_IOCTL_SET_PRFCNT_VALUES (_IOWR(0x82, 37, __ioctl_placeholder)) ++#define KBASE_IOCTL_SOFT_EVENT_UPDATE (_IOWR(0x82, 38, __ioctl_placeholder)) ++#define KBASE_IOCTL_MEM_JIT_INIT (_IOWR(0x82, 39, __ioctl_placeholder)) ++#define 
KBASE_IOCTL_TLSTREAM_ACQUIRE (_IOWR(0x82, 40, __ioctl_placeholder)) ++ ++#endif /* __KBASE_IOCTL_MIDGARD_H__ */ +diff --git a/src/panfrost/base/include/old/mali-ioctl.h b/src/panfrost/base/include/old/mali-ioctl.h +new file mode 100644 +index 00000000000..5c76f2dc8e5 +--- /dev/null ++++ b/src/panfrost/base/include/old/mali-ioctl.h +@@ -0,0 +1,743 @@ ++/* ++ * © Copyright 2017-2018 The Panfrost Community ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * A copy of the licence is included with the program, and can also be obtained ++ * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ++ * Boston, MA 02110-1301, USA. ++ * ++ */ ++ ++/** ++ * Definitions for all of the ioctls for the original open source bifrost GPU ++ * kernel driver, written by ARM. ++ */ ++ ++#ifndef __KBASE_IOCTL_H__ ++#define __KBASE_IOCTL_H__ ++ ++typedef uint8_t u8; ++typedef uint16_t u16; ++typedef uint32_t u32; ++typedef uint64_t u64; ++ ++typedef int32_t s32; ++typedef int64_t s64; ++ ++ ++typedef u8 mali_atom_id; ++ ++/** ++ * Since these structs are passed to and from the kernel we need to make sure ++ * that we get the size of each struct to match exactly what the kernel is ++ * expecting. So, when editing this file make sure to add static asserts that ++ * check each struct's size against the arg length you see in strace. ++ */ ++ ++enum kbase_ioctl_mem_flags { ++ /* IN */ ++ BASE_MEM_PROT_CPU_RD = (1U << 0), /**< Read access CPU side */ ++ BASE_MEM_PROT_CPU_WR = (1U << 1), /**< Write access CPU side */ ++ BASE_MEM_PROT_GPU_RD = (1U << 2), /**< Read access GPU side */ ++ BASE_MEM_PROT_GPU_WR = (1U << 3), /**< Write access GPU side */ ++ BASE_MEM_PROT_GPU_EX = (1U << 4), /**< Execute allowed on the GPU ++ side */ ++ ++ BASE_MEM_GROW_ON_GPF = (1U << 9), /**< Grow backing store on GPU ++ Page Fault */ ++ ++ BASE_MEM_COHERENT_SYSTEM = (1U << 10), /**< Page coherence Outer ++ shareable, if available */ ++ BASE_MEM_COHERENT_LOCAL = (1U << 11), /**< Page coherence Inner ++ shareable */ ++ BASE_MEM_CACHED_CPU = (1U << 12), /**< Should be cached on the ++ CPU */ ++ ++ /* IN/OUT */ ++ BASE_MEM_SAME_VA = (1U << 13), /**< Must have same VA on both the GPU ++ and the CPU */ ++ /* OUT */ ++ BASE_MEM_NEED_MMAP = (1U << 14), /**< Must call mmap to acquire a GPU ++ address for the alloc */ ++ /* IN */ ++ BASE_MEM_COHERENT_SYSTEM_REQUIRED = (1U << 15), /**< Page coherence ++ Outer shareable, required. 
*/ ++ BASE_MEM_SECURE = (1U << 16), /**< Secure memory */ ++ BASE_MEM_DONT_NEED = (1U << 17), /**< Not needed physical ++ memory */ ++ BASE_MEM_IMPORT_SHARED = (1U << 18), /**< Must use shared CPU/GPU zone ++ (SAME_VA zone) but doesn't ++ require the addresses to ++ be the same */ ++}; ++ ++#define KBASE_IOCTL_MEM_FLAGS_IN_MASK \ ++ (BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | \ ++ BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_GPU_EX | \ ++ BASE_MEM_GROW_ON_GPF | \ ++ BASE_MEM_COHERENT_SYSTEM | BASE_MEM_COHERENT_LOCAL | \ ++ BASE_MEM_CACHED_CPU | \ ++ BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_SECURE | \ ++ BASE_MEM_DONT_NEED | BASE_MEM_IMPORT_SHARED) ++#define BASE_MEM_MAP_TRACKING_HANDLE (3ull << 12) ++ ++enum kbase_ioctl_coherency_mode { ++ COHERENCY_ACE_LITE = 0, ++ COHERENCY_ACE = 1, ++ COHERENCY_NONE = 31 ++}; ++ ++/* ++ * Mali Atom priority ++ * ++ * Only certain priority levels are actually implemented, as specified by the ++ * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority ++ * level that is not one of those defined below. ++ * ++ * Priority levels only affect scheduling between atoms of the same type within ++ * a mali context, and only after the atoms have had dependencies resolved. ++ * Fragment atoms does not affect non-frament atoms with lower priorities, and ++ * the other way around. For example, a low priority atom that has had its ++ * dependencies resolved might run before a higher priority atom that has not ++ * had its dependencies resolved. ++ * ++ * The scheduling between mali contexts/processes and between atoms from ++ * different mali contexts/processes is unaffected by atom priority. ++ * ++ * The atoms are scheduled as follows with respect to their priorities: ++ * - Let atoms 'X' and 'Y' be for the same job slot who have dependencies ++ * resolved, and atom 'X' has a higher priority than atom 'Y' ++ * - If atom 'Y' is currently running on the HW, then it is interrupted to ++ * allow atom 'X' to run soon after ++ * - If instead neither atom 'Y' nor atom 'X' are running, then when choosing ++ * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' ++ * - Any two atoms that have the same priority could run in any order with ++ * respect to each other. That is, there is no ordering constraint between ++ * atoms of the same priority. ++ */ ++typedef u8 mali_jd_prio; ++#define BASE_JD_PRIO_MEDIUM ((mali_jd_prio)0) ++#define BASE_JD_PRIO_HIGH ((mali_jd_prio)1) ++#define BASE_JD_PRIO_LOW ((mali_jd_prio)2) ++ ++/** ++ * @brief Job dependency type. ++ * ++ * A flags field will be inserted into the atom structure to specify whether a ++ * dependency is a data or ordering dependency (by putting it before/after ++ * 'core_req' in the structure it should be possible to add without changing ++ * the structure size). When the flag is set for a particular dependency to ++ * signal that it is an ordering only dependency then errors will not be ++ * propagated. ++ */ ++typedef u8 mali_jd_dep_type; ++#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ ++#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ ++#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ ++ ++/** ++ * @brief Job chain hardware requirements. ++ * ++ * A job chain must specify what GPU features it needs to allow the ++ * driver to schedule the job correctly. By not specifying the ++ * correct settings can/will cause an early job termination. 
Multiple ++ * values can be ORed together to specify multiple requirements. ++ * Special case is ::BASE_JD_REQ_DEP, which is used to express complex ++ * dependencies, and that doesn't execute anything on the hardware. ++ */ ++typedef u32 mali_jd_core_req; ++ ++/* Requirements that come from the HW */ ++ ++/** ++ * No requirement, dependency only ++ */ ++#define BASE_JD_REQ_DEP ((mali_jd_core_req)0) ++ ++/** ++ * Requires fragment shaders ++ */ ++#define BASE_JD_REQ_FS ((mali_jd_core_req)1 << 0) ++ ++/** ++ * Requires compute shaders ++ * This covers any of the following Midgard Job types: ++ * - Vertex Shader Job ++ * - Geometry Shader Job ++ * - An actual Compute Shader Job ++ * ++ * Compare this with @ref BASE_JD_REQ_ONLY_COMPUTE, which specifies that the ++ * job is specifically just the "Compute Shader" job type, and not the "Vertex ++ * Shader" nor the "Geometry Shader" job type. ++ */ ++#define BASE_JD_REQ_CS ((mali_jd_core_req)1 << 1) ++#define BASE_JD_REQ_T ((mali_jd_core_req)1 << 2) /**< Requires tiling */ ++#define BASE_JD_REQ_CF ((mali_jd_core_req)1 << 3) /**< Requires cache flushes */ ++#define BASE_JD_REQ_V ((mali_jd_core_req)1 << 4) /**< Requires value writeback */ ++ ++/* SW-only requirements - the HW does not expose these as part of the job slot ++ * capabilities */ ++ ++/* Requires fragment job with AFBC encoding */ ++#define BASE_JD_REQ_FS_AFBC ((mali_jd_core_req)1 << 13) ++ ++/** ++ * SW-only requirement: coalesce completion events. ++ * If this bit is set then completion of this atom will not cause an event to ++ * be sent to userspace, whether successful or not; completion events will be ++ * deferred until an atom completes which does not have this bit set. ++ * ++ * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. ++ */ ++#define BASE_JD_REQ_EVENT_COALESCE ((mali_jd_core_req)1 << 5) ++ ++/** ++ * SW Only requirement: the job chain requires a coherent core group. We don't ++ * mind which coherent core group is used. ++ */ ++#define BASE_JD_REQ_COHERENT_GROUP ((mali_jd_core_req)1 << 6) ++ ++/** ++ * SW Only requirement: The performance counters should be enabled only when ++ * they are needed, to reduce power consumption. ++ */ ++ ++#define BASE_JD_REQ_PERMON ((mali_jd_core_req)1 << 7) ++ ++/** ++ * SW Only requirement: External resources are referenced by this atom. When ++ * external resources are referenced no syncsets can be bundled with the atom ++ * but should instead be part of a NULL jobs inserted into the dependency ++ * tree. The first pre_dep object must be configured for the external ++ * resouces to use, the second pre_dep object can be used to create other ++ * dependencies. ++ * ++ * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE. ++ */ ++#define BASE_JD_REQ_EXTERNAL_RESOURCES ((mali_jd_core_req)1 << 8) ++ ++/** ++ * SW Only requirement: Software defined job. Jobs with this bit set will not ++ * be submitted to the hardware but will cause some action to happen within ++ * the driver ++ */ ++#define BASE_JD_REQ_SOFT_JOB ((mali_jd_core_req)1 << 9) ++ ++#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) ++#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) ++#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) ++ ++/** ++ * SW Only requirement : Replay job. ++ * ++ * If the preceding job fails, the replay job will cause the jobs specified in ++ * the list of mali_jd_replay_payload pointed to by the jc pointer to be ++ * replayed. 
++ * ++ * A replay job will only cause jobs to be replayed up to MALIP_JD_REPLAY_LIMIT ++ * times. If a job fails more than MALIP_JD_REPLAY_LIMIT times then the replay ++ * job is failed, as well as any following dependencies. ++ * ++ * The replayed jobs will require a number of atom IDs. If there are not enough ++ * free atom IDs then the replay job will fail. ++ * ++ * If the preceding job does not fail, then the replay job is returned as ++ * completed. ++ * ++ * The replayed jobs will never be returned to userspace. The preceding failed ++ * job will be returned to userspace as failed; the status of this job should ++ * be ignored. Completion should be determined by the status of the replay soft ++ * job. ++ * ++ * In order for the jobs to be replayed, the job headers will have to be ++ * modified. The Status field will be reset to NOT_STARTED. If the Job Type ++ * field indicates a Vertex Shader Job then it will be changed to Null Job. ++ * ++ * The replayed jobs have the following assumptions : ++ * ++ * - No external resources. Any required external resources will be held by the ++ * replay atom. ++ * - Pre-dependencies are created based on job order. ++ * - Atom numbers are automatically assigned. ++ * - device_nr is set to 0. This is not relevant as ++ * BASE_JD_REQ_SPECIFIC_COHERENT_GROUP should not be set. ++ * - Priority is inherited from the replay job. ++ */ ++#define BASE_JD_REQ_SOFT_REPLAY (BASE_JD_REQ_SOFT_JOB | 0x4) ++/** ++ * SW only requirement: event wait/trigger job. ++ * ++ * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. ++ * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the ++ * other waiting jobs. It completes immediately. ++ * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it ++ * possible for other jobs to wait upon. It completes immediately. ++ */ ++#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) ++#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) ++#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) ++ ++#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) ++ ++/** ++ * SW only requirement: Just In Time allocation ++ * ++ * This job requests a JIT allocation based on the request in the ++ * @base_jit_alloc_info structure which is passed via the jc element of ++ * the atom. ++ * ++ * It should be noted that the id entry in @base_jit_alloc_info must not ++ * be reused until it has been released via @BASE_JD_REQ_SOFT_JIT_FREE. ++ * ++ * Should this soft job fail it is expected that a @BASE_JD_REQ_SOFT_JIT_FREE ++ * soft job to free the JIT allocation is still made. ++ * ++ * The job will complete immediately. ++ */ ++#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) ++/** ++ * SW only requirement: Just In Time free ++ * ++ * This job requests a JIT allocation created by @BASE_JD_REQ_SOFT_JIT_ALLOC ++ * to be freed. The ID of the JIT allocation is passed via the jc element of ++ * the atom. ++ * ++ * The job will complete immediately. ++ */ ++#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) ++ ++/** ++ * SW only requirement: Map external resource ++ * ++ * This job requests external resource(s) are mapped once the dependencies ++ * of the job have been satisfied. The list of external resources are ++ * passed via the jc element of the atom which is a pointer to a ++ * @base_external_resource_list. 
++ */ ++#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) ++/** ++ * SW only requirement: Unmap external resource ++ * ++ * This job requests external resource(s) are unmapped once the dependencies ++ * of the job has been satisfied. The list of external resources are ++ * passed via the jc element of the atom which is a pointer to a ++ * @base_external_resource_list. ++ */ ++#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) ++ ++/** ++ * HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) ++ * ++ * This indicates that the Job Chain contains Midgard Jobs of the 'Compute ++ * Shaders' type. ++ * ++ * In contrast to @ref BASE_JD_REQ_CS, this does \b not indicate that the Job ++ * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. ++ */ ++#define BASE_JD_REQ_ONLY_COMPUTE ((mali_jd_core_req)1 << 10) ++ ++/** ++ * HW Requirement: Use the mali_jd_atom::device_nr field to specify a ++ * particular core group ++ * ++ * If both @ref BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag ++ * takes priority ++ * ++ * This is only guaranteed to work for @ref BASE_JD_REQ_ONLY_COMPUTE atoms. ++ * ++ * If the core availability policy is keeping the required core group turned ++ * off, then the job will fail with a @ref BASE_JD_EVENT_PM_EVENT error code. ++ */ ++#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((mali_jd_core_req)1 << 11) ++ ++/** ++ * SW Flag: If this bit is set then the successful completion of this atom ++ * will not cause an event to be sent to userspace ++ */ ++#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((mali_jd_core_req)1 << 12) ++ ++/** ++ * SW Flag: If this bit is set then completion of this atom will not cause an ++ * event to be sent to userspace, whether successful or not. ++ */ ++#define BASE_JD_REQ_EVENT_NEVER ((mali_jd_core_req)1 << 14) ++ ++/** ++ * SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. ++ * ++ * If this bit is set then the GPU's cache will not be cleaned and invalidated ++ * until a GPU job starts which does not have this bit set or a job completes ++ * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use if ++ * the CPU may have written to memory addressed by the job since the last job ++ * without this bit set was submitted. ++ */ ++#define BASE_JD_REQ_SKIP_CACHE_START ((mali_jd_core_req)1 << 15) ++ ++/** ++ * SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. ++ * ++ * If this bit is set then the GPU's cache will not be cleaned and invalidated ++ * until a GPU job completes which does not have this bit set or a job starts ++ * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_START bti set. Do not ++ * use if the CPU may read from or partially overwrite memory addressed by the ++ * job before the next job without this bit set completes. ++ */ ++#define BASE_JD_REQ_SKIP_CACHE_END ((mali_jd_core_req)1 << 16) ++ ++/** ++ * These requirement bits are currently unused in mali_jd_core_req ++ */ ++#define MALIP_JD_REQ_RESERVED \ ++ (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ ++ BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | MALIP_JD_REQ_EVENT_NEVER | \ ++ BASE_JD_REQ_EVENT_COALESCE | \ ++ BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ ++ BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ ++ BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END)) ++ ++/** ++ * Mask of all bits in mali_jd_core_req that control the type of the atom. 
++ * ++ * This allows dependency only atoms to have flags set ++ */ ++#define BASE_JD_REQ_ATOM_TYPE \ ++ (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ ++ BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) ++ ++/** ++ * Mask of all bits in mali_jd_core_req that control the type of a soft job. ++ */ ++#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) ++ ++/* ++ * Returns non-zero value if core requirements passed define a soft job or ++ * a dependency only job. ++ */ ++#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ ++ ((core_req & BASE_JD_REQ_SOFT_JOB) || \ ++ (core_req & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) ++ ++/** ++ * @brief The payload for a replay job. This must be in GPU memory. ++ */ ++struct mali_jd_replay_payload { ++ /** ++ * Pointer to the first entry in the mali_jd_replay_jc list. These ++ * will be replayed in @b reverse order (so that extra ones can be added ++ * to the head in future soft jobs without affecting this soft job) ++ */ ++ u64 tiler_jc_list; ++ ++ /** ++ * Pointer to the fragment job chain. ++ */ ++ u64 fragment_jc; ++ ++ /** ++ * Pointer to the tiler heap free FBD field to be modified. ++ */ ++ u64 tiler_heap_free; ++ ++ /** ++ * Hierarchy mask for the replayed fragment jobs. May be zero. ++ */ ++ u16 fragment_hierarchy_mask; ++ ++ /** ++ * Hierarchy mask for the replayed tiler jobs. May be zero. ++ */ ++ u16 tiler_hierarchy_mask; ++ ++ /** ++ * Default weight to be used for hierarchy levels not in the original ++ * mask. ++ */ ++ u32 hierarchy_default_weight; ++ ++ /** ++ * Core requirements for the tiler job chain ++ */ ++ mali_jd_core_req tiler_core_req; ++ ++ /** ++ * Core requirements for the fragment job chain ++ */ ++ mali_jd_core_req fragment_core_req; ++}; ++ ++/** ++ * @brief An entry in the linked list of job chains to be replayed. This must ++ * be in GPU memory. ++ */ ++struct mali_jd_replay_jc { ++ /** ++ * Pointer to next entry in the list. A setting of NULL indicates the ++ * end of the list. ++ */ ++ u64 next; ++ ++ /** ++ * Pointer to the job chain. ++ */ ++ u64 jc; ++}; ++ ++typedef u64 mali_ptr; ++ ++#define MALI_PTR_FMT "0x%" PRIx64 ++#define MALI_SHORT_PTR_FMT "0x%" PRIxPTR ++ ++#ifdef __LP64__ ++#define PAD_CPU_PTR(p) p ++#else ++#define PAD_CPU_PTR(p) p; u32 :32; ++#endif ++ ++/* FIXME: Again, they don't specify any of these as packed structs. However, ++ * looking at these structs I'm worried that there is already spots where the ++ * compiler is potentially sticking in padding... ++ * Going to try something a little crazy, and just hope that our compiler ++ * happens to add the same kind of offsets since we can't really compare sizes ++ */ ++ ++/* ++ * Blob provided by the driver to store callback driver, not actually modified ++ * by the driver itself ++ */ ++struct mali_jd_udata { ++ u64 blob[2]; ++}; ++ ++struct mali_jd_dependency { ++ mali_atom_id atom_id; /**< An atom number */ ++ mali_jd_dep_type dependency_type; /**< Dependency type */ ++}; ++ ++#define MALI_EXT_RES_MAX 10 ++ ++/* The original header never explicitly defines any values for these. 
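BASE_JD_REQ_ATOM_TYPE and BASE_JD_REQ_SOFT_JOB_TYPE above let a submitter classify a core_req word, and BASE_JD_REQ_SOFT_JOB_OR_DEP folds the two "no hardware needed" cases into one test. A small illustrative helper (the category names are ours, not the driver's):

    /* Classify a legacy core_req word using the masks defined above. */
    #include <stdint.h>

    #include "mali-ioctl.h"

    enum atom_kind { ATOM_DEP_ONLY, ATOM_SOFT, ATOM_HW };

    static enum atom_kind classify_core_req(mali_jd_core_req req)
    {
       if ((req & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP)
          return ATOM_DEP_ONLY;         /* nothing runs on the GPU */
       if (req & BASE_JD_REQ_SOFT_JOB)
          return ATOM_SOFT;             /* handled inside the kernel driver */
       return ATOM_HW;                  /* FS/CS/tiler/... hardware work */
    }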
In C, ++ * this -should- expand to SHARED == 0 and EXCLUSIVE == 1, so the only flag we ++ * actually need to decode here is EXCLUSIVE ++ */ ++enum mali_external_resource_access { ++ MALI_EXT_RES_ACCESS_SHARED, ++ MALI_EXT_RES_ACCESS_EXCLUSIVE, ++}; ++ ++/* An aligned address to the resource | mali_external_resource_access */ ++typedef u64 mali_external_resource; ++ ++struct base_jd_atom_v2 { ++ mali_ptr jc; /**< job-chain GPU address */ ++ struct mali_jd_udata udata; /**< user data */ ++ u64 extres_list; /**< list of external resources */ ++ u16 nr_extres; /**< nr of external resources */ ++ u16 compat_core_req; /**< core requirements which ++ correspond to the legacy support ++ for UK 10.2 */ ++ struct mali_jd_dependency pre_dep[2]; /**< pre-dependencies, one need to ++ use SETTER function to assign ++ this field, this is done in ++ order to reduce possibility of ++ improper assigment of a ++ dependency field */ ++ mali_atom_id atom_number; /**< unique number to identify the ++ atom */ ++ mali_jd_prio prio; /**< Atom priority. Refer to @ref ++ mali_jd_prio for more details */ ++ u8 device_nr; /**< coregroup when ++ BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ++ specified */ ++ u8 :8; ++ mali_jd_core_req core_req; /**< core requirements */ ++} __attribute__((packed)); ++ ++/** ++ * enum mali_error - Mali error codes shared with userspace ++ * ++ * This is subset of those common Mali errors that can be returned to userspace. ++ * Values of matching user and kernel space enumerators MUST be the same. ++ * MALI_ERROR_NONE is guaranteed to be 0. ++ * ++ * @MALI_ERROR_NONE: Success ++ * @MALI_ERROR_OUT_OF_GPU_MEMORY: Not used in the kernel driver ++ * @MALI_ERROR_OUT_OF_MEMORY: Memory allocation failure ++ * @MALI_ERROR_FUNCTION_FAILED: Generic error code ++ */ ++enum mali_error { ++ MALI_ERROR_NONE = 0, ++ MALI_ERROR_OUT_OF_GPU_MEMORY, ++ MALI_ERROR_OUT_OF_MEMORY, ++ MALI_ERROR_FUNCTION_FAILED, ++}; ++ ++/** ++ * Header used by all ioctls ++ */ ++union kbase_ioctl_header { ++#ifdef dvalin ++ u32 pad[0]; ++#else ++ /* [in] The ID of the UK function being called */ ++ u32 id :32; ++ /* [out] The return value of the UK function that was called */ ++ enum mali_error rc :32; ++ ++ u64 :64; ++#endif ++} __attribute__((packed)); ++ ++struct kbase_ioctl_get_version { ++ union kbase_ioctl_header header; ++ u16 major; /* [out] */ ++ u16 minor; /* [out] */ ++ u32 :32; ++} __attribute__((packed)); ++ ++struct mali_mem_import_user_buffer { ++ u64 ptr; ++ u64 length; ++}; ++ ++union kbase_ioctl_mem_import { ++ struct { ++ union kbase_ioctl_header header; ++ u64 phandle; ++ enum { ++ BASE_MEM_IMPORT_TYPE_INVALID = 0, ++ BASE_MEM_IMPORT_TYPE_UMP = 1, ++ BASE_MEM_IMPORT_TYPE_UMM = 2, ++ BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3, ++ } type :32; ++ u32 :32; ++ u64 flags; ++ } in; ++ struct { ++ union kbase_ioctl_header header; ++ u64 pad[2]; ++ u64 flags; ++ u64 gpu_va; ++ u64 va_pages; ++ } out; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_mem_commit { ++ union kbase_ioctl_header header; ++ /* [in] */ ++ mali_ptr gpu_addr; ++ u64 pages; ++ /* [out] */ ++ u32 result_subcode; ++ u32 :32; ++} __attribute__((packed)); ++ ++enum kbase_ioctl_mem_query_type { ++ BASE_MEM_QUERY_COMMIT_SIZE = 1, ++ BASE_MEM_QUERY_VA_SIZE = 2, ++ BASE_MEM_QUERY_FLAGS = 3 ++}; ++ ++struct kbase_ioctl_mem_query { ++ union kbase_ioctl_header header; ++ /* [in] */ ++ mali_ptr gpu_addr; ++ enum kbase_ioctl_mem_query_type query : 32; ++ u32 :32; ++ /* [out] */ ++ u64 value; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_mem_free { ++ union 
kbase_ioctl_header header; ++ mali_ptr gpu_addr; /* [in] */ ++} __attribute__((packed)); ++/* FIXME: Size unconfirmed (haven't seen in a trace yet) */ ++ ++struct kbase_ioctl_mem_flags_change { ++ union kbase_ioctl_header header; ++ /* [in] */ ++ mali_ptr gpu_va; ++ u64 flags; ++ u64 mask; ++} __attribute__((packed)); ++/* FIXME: Size unconfirmed (haven't seen in a trace yet) */ ++ ++struct kbase_ioctl_mem_alias { ++ union kbase_ioctl_header header; ++ /* [in/out] */ ++ u64 flags; ++ /* [in] */ ++ u64 stride; ++ u64 nents; ++ u64 ai; ++ /* [out] */ ++ mali_ptr gpu_va; ++ u64 va_pages; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_mem_sync { ++ union kbase_ioctl_header header; ++ mali_ptr handle; ++ u64 user_addr; ++ u64 size; ++ enum { ++ MALI_SYNC_TO_DEVICE = 1, ++ MALI_SYNC_TO_CPU = 2, ++ } type :8; ++ u64 :56; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_set_flags { ++ union kbase_ioctl_header header; ++ u32 create_flags; /* [in] */ ++ u32 :32; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_stream_create { ++ union kbase_ioctl_header header; ++ /* [in] */ ++ char name[32]; ++ /* [out] */ ++ s32 fd; ++ u32 :32; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_job_submit { ++ union kbase_ioctl_header header; ++ /* [in] */ ++ u64 addr; ++ u32 nr_atoms; ++ u32 stride; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_get_context_id { ++ union kbase_ioctl_header header; ++ /* [out] */ ++ s64 id; ++} __attribute__((packed)); ++ ++#undef PAD_CPU_PTR ++ ++enum base_jd_event_code { ++ BASE_JD_EVENT_DONE = 1, ++}; ++ ++struct base_jd_event_v2 { ++ enum base_jd_event_code event_code; ++ mali_atom_id atom_number; ++ struct mali_jd_udata udata; ++}; ++ ++/* Defined in mali-props.h */ ++struct kbase_ioctl_gpu_props_reg_dump; ++ ++/* For ioctl's we haven't written decoding stuff for yet */ ++typedef struct { ++ union kbase_ioctl_header header; ++} __ioctl_placeholder; ++ ++#endif /* __KBASE_IOCTL_H__ */ +diff --git a/src/panfrost/base/include/old/mali-props.h b/src/panfrost/base/include/old/mali-props.h +new file mode 100644 +index 00000000000..5b9d8723600 +--- /dev/null ++++ b/src/panfrost/base/include/old/mali-props.h +@@ -0,0 +1,262 @@ ++/* ++ * © Copyright 2017-2018 The Panfrost Community ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * A copy of the licence is included with the program, and can also be obtained ++ * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ++ * Boston, MA 02110-1301, USA. ++ * ++ */ ++ ++#ifndef __MALI_PROPS_H__ ++#define __MALI_PROPS_H__ ++ ++#include "mali-ioctl.h" ++ ++#define MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS 3 ++#define MALI_GPU_MAX_JOB_SLOTS 16 ++#define MALI_MAX_COHERENT_GROUPS 16 ++ ++/* Capabilities of a job slot as reported by JS_FEATURES registers */ ++ ++#define JS_FEATURE_NULL_JOB (1u << 1) ++#define JS_FEATURE_SET_VALUE_JOB (1u << 2) ++#define JS_FEATURE_CACHE_FLUSH_JOB (1u << 3) ++#define JS_FEATURE_COMPUTE_JOB (1u << 4) ++#define JS_FEATURE_VERTEX_JOB (1u << 5) ++#define JS_FEATURE_GEOMETRY_JOB (1u << 6) ++#define JS_FEATURE_TILER_JOB (1u << 7) ++#define JS_FEATURE_FUSED_JOB (1u << 8) ++#define JS_FEATURE_FRAGMENT_JOB (1u << 9) ++ ++struct mali_gpu_core_props { ++ /** ++ * Product specific value. ++ */ ++ u32 product_id; ++ ++ /** ++ * Status of the GPU release. 
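With the legacy interface, atoms are packed into a user-space array and handed to the kernel through KBASE_IOCTL_JOB_SUBMIT together with the element stride. The sketch below submits a single dependency-only atom; it is illustrative only and assumes the context set-up ioctls (version check, set-flags) have already run on fd:

    /* Sketch of submitting one dependency-only atom over the legacy
     * job-submit ioctl declared in mali-ioctl-midgard.h. */
    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>

    #include "mali-ioctl.h"
    #include "mali-ioctl-midgard.h"

    static int submit_dep_only_atom(int fd, mali_atom_id id)
    {
       struct base_jd_atom_v2 atom;
       memset(&atom, 0, sizeof(atom));

       atom.atom_number = id;
       atom.prio = BASE_JD_PRIO_MEDIUM;
       atom.core_req = BASE_JD_REQ_DEP;   /* no hardware work, dependencies only */

       struct kbase_ioctl_job_submit submit = {
          .addr = (uintptr_t)&atom,
          .nr_atoms = 1,
          .stride = sizeof(atom),
       };
       return ioctl(fd, KBASE_IOCTL_JOB_SUBMIT, &submit);
    }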
++ * No defined values, but starts at 0 and increases by one for each ++ * release status (alpha, beta, EAC, etc.). ++ * 4 bit values (0-15). ++ */ ++ u16 version_status; ++ ++ /** ++ * Minor release number of the GPU. "P" part of an "RnPn" release ++ * number. ++ * 8 bit values (0-255). ++ */ ++ u16 minor_revision; ++ ++ /** ++ * Major release number of the GPU. "R" part of an "RnPn" release ++ * number. ++ * 4 bit values (0-15). ++ */ ++ u16 major_revision; ++ ++ u16 :16; ++ ++ /** ++ * @usecase GPU clock speed is not specified in the Midgard ++ * Architecture, but is necessary for OpenCL's clGetDeviceInfo() ++ * function. ++ */ ++ u32 gpu_speed_mhz; ++ ++ /** ++ * @usecase GPU clock max/min speed is required for computing ++ * best/worst case in tasks as job scheduling ant irq_throttling. (It ++ * is not specified in the Midgard Architecture). ++ */ ++ u32 gpu_freq_khz_max; ++ u32 gpu_freq_khz_min; ++ ++ /** ++ * Size of the shader program counter, in bits. ++ */ ++ u32 log2_program_counter_size; ++ ++ /** ++ * TEXTURE_FEATURES_x registers, as exposed by the GPU. This is a ++ * bitpattern where a set bit indicates that the format is supported. ++ * ++ * Before using a texture format, it is recommended that the ++ * corresponding bit be checked. ++ */ ++ u32 texture_features[MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ ++ /** ++ * Theoretical maximum memory available to the GPU. It is unlikely ++ * that a client will be able to allocate all of this memory for their ++ * own purposes, but this at least provides an upper bound on the ++ * memory available to the GPU. ++ * ++ * This is required for OpenCL's clGetDeviceInfo() call when ++ * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The ++ * client will not be expecting to allocate anywhere near this value. ++ */ ++ u64 gpu_available_memory_size; ++}; ++ ++struct mali_gpu_l2_cache_props { ++ u8 log2_line_size; ++ u8 log2_cache_size; ++ u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ ++ u64 :40; ++}; ++ ++struct mali_gpu_tiler_props { ++ u32 bin_size_bytes; /* Max is 4*2^15 */ ++ u32 max_active_levels; /* Max is 2^15 */ ++}; ++ ++struct mali_gpu_thread_props { ++ u32 max_threads; /* Max. number of threads per core */ ++ u32 max_workgroup_size; /* Max. number of threads per workgroup */ ++ u32 max_barrier_size; /* Max. number of threads that can ++ synchronize on a simple barrier */ ++ u16 max_registers; /* Total size [1..65535] of the register ++ file available per core. */ ++ u8 max_task_queue; /* Max. tasks [1..255] which may be sent ++ to a core before it becomes blocked. */ ++ u8 max_thread_group_split; /* Max. allowed value [1..15] of the ++ Thread Group Split field. */ ++ enum { ++ MALI_GPU_IMPLEMENTATION_UNKNOWN = 0, ++ MALI_GPU_IMPLEMENTATION_SILICON = 1, ++ MALI_GPU_IMPLEMENTATION_FPGA = 2, ++ MALI_GPU_IMPLEMENTATION_SW = 3, ++ } impl_tech :8; ++ u64 :56; ++}; ++ ++/** ++ * @brief descriptor for a coherent group ++ * ++ * \c core_mask exposes all cores in that coherent group, and \c num_cores ++ * provides a cached population-count for that mask. ++ * ++ * @note Whilst all cores are exposed in the mask, not all may be available to ++ * the application, depending on the Kernel Power policy. ++ * ++ * @note if u64s must be 8-byte aligned, then this structure has 32-bits of ++ * wastage. 
++ */ ++struct mali_ioctl_gpu_coherent_group { ++ u64 core_mask; /**< Core restriction mask required for the ++ group */ ++ u16 num_cores; /**< Number of cores in the group */ ++ u64 :48; ++}; ++ ++/** ++ * @brief Coherency group information ++ * ++ * Note that the sizes of the members could be reduced. However, the \c group ++ * member might be 8-byte aligned to ensure the u64 core_mask is 8-byte ++ * aligned, thus leading to wastage if the other members sizes were reduced. ++ * ++ * The groups are sorted by core mask. The core masks are non-repeating and do ++ * not intersect. ++ */ ++struct mali_gpu_coherent_group_info { ++ u32 num_groups; ++ ++ /** ++ * Number of core groups (coherent or not) in the GPU. Equivalent to ++ * the number of L2 Caches. ++ * ++ * The GPU Counter dumping writes 2048 bytes per core group, ++ * regardless of whether the core groups are coherent or not. Hence ++ * this member is needed to calculate how much memory is required for ++ * dumping. ++ * ++ * @note Do not use it to work out how many valid elements are in the ++ * group[] member. Use num_groups instead. ++ */ ++ u32 num_core_groups; ++ ++ /** ++ * Coherency features of the memory, accessed by @ref gpu_mem_features ++ * methods ++ */ ++ u32 coherency; ++ ++ u32 :32; ++ ++ /** ++ * Descriptors of coherent groups ++ */ ++ struct mali_ioctl_gpu_coherent_group group[MALI_MAX_COHERENT_GROUPS]; ++}; ++ ++/** ++ * A complete description of the GPU's Hardware Configuration Discovery ++ * registers. ++ * ++ * The information is presented inefficiently for access. For frequent access, ++ * the values should be better expressed in an unpacked form in the ++ * base_gpu_props structure. ++ * ++ * @usecase The raw properties in @ref gpu_raw_gpu_props are necessary to ++ * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device ++ * behaving differently?". In this case, all information about the ++ * configuration is potentially useful, but it does not need to be processed ++ * by the driver. Instead, the raw registers can be processed by the Mali ++ * Tools software on the host PC. ++ * ++ */ ++struct mali_gpu_raw_props { ++ u64 shader_present; ++ u64 tiler_present; ++ u64 l2_present; ++ u64 stack_present; ++ ++ u32 l2_features; ++ u32 suspend_size; /* API 8.2+ */ ++ u32 mem_features; ++ u32 mmu_features; ++ ++ u32 as_present; ++ ++ u32 js_present; ++ u32 js_features[MALI_GPU_MAX_JOB_SLOTS]; ++ u32 tiler_features; ++ u32 texture_features[3]; ++ ++ u32 gpu_id; ++ ++ u32 thread_max_threads; ++ u32 thread_max_workgroup_size; ++ u32 thread_max_barrier_size; ++ u32 thread_features; ++ ++ /* ++ * Note: This is the _selected_ coherency mode rather than the ++ * available modes as exposed in the coherency_features register. 
++ */ ++ u32 coherency_mode; ++}; ++ ++struct kbase_ioctl_gpu_props_reg_dump { ++ union kbase_ioctl_header header; ++ struct mali_gpu_core_props core; ++ struct mali_gpu_l2_cache_props l2; ++ u64 :64; ++ struct mali_gpu_tiler_props tiler; ++ struct mali_gpu_thread_props thread; ++ ++ struct mali_gpu_raw_props raw; ++ ++ /** This must be last member of the structure */ ++ struct mali_gpu_coherent_group_info coherency_info; ++} __attribute__((packed)); ++ ++#endif +diff --git a/src/panfrost/base/meson.build b/src/panfrost/base/meson.build +new file mode 100644 +index 00000000000..5d7b9f1dff9 +--- /dev/null ++++ b/src/panfrost/base/meson.build +@@ -0,0 +1,55 @@ ++# Copyright © 2018 Rob Clark ++# Copyright © 2019 Collabora ++# Copyright © 2022 Icecream95 ++ ++# Permission is hereby granted, free of charge, to any person obtaining a copy ++# of this software and associated documentation files (the "Software"), to deal ++# in the Software without restriction, including without limitation the rights ++# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++# copies of the Software, and to permit persons to whom the Software is ++# furnished to do so, subject to the following conditions: ++ ++# The above copyright notice and this permission notice shall be included in ++# all copies or substantial portions of the Software. ++ ++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++# SOFTWARE. 
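The legacy property query above returns everything in one packed blob: zero the struct, issue KBASE_IOCTL_GPU_PROPS_REG_DUMP, and read the sub-structures directly. A hedged sketch that prints a few fields, assuming an already-initialised legacy kbase fd:

    /* Illustrative dump of a few legacy GPU properties. */
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    #include "mali-ioctl.h"
    #include "mali-ioctl-midgard.h"
    #include "mali-props.h"

    static void print_gpu_props(int fd)
    {
       struct kbase_ioctl_gpu_props_reg_dump dump = { 0 };

       if (ioctl(fd, KBASE_IOCTL_GPU_PROPS_REG_DUMP, &dump) < 0)
          return;

       printf("gpu_id: 0x%x, r%up%u\n", dump.raw.gpu_id,
              dump.core.major_revision, dump.core.minor_revision);
       printf("L2 slices: %u, line size: %u bytes\n",
              dump.l2.num_l2_slices, 1u << dump.l2.log2_line_size);
       printf("coherent groups: %u\n", dump.coherency_info.num_groups);
    }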
++ ++libpanfrost_base_versions = ['0', '1', '2', '258'] ++libpanfrost_base_per_arch = [] ++ ++foreach ver : libpanfrost_base_versions ++ libpanfrost_base_per_arch += static_library( ++ 'pan-base-v' + ver, ++ 'pan_vX_base.c', ++ include_directories : [ ++ inc_src, inc_include, inc_gallium, inc_mesa, inc_gallium_aux, ++ include_directories('include'), ++ ], ++ c_args : ['-DPAN_BASE_VER=' + ver], ++ gnu_symbol_visibility : 'hidden', ++ dependencies: [dep_valgrind], ++) ++endforeach ++ ++libpanfrost_base = static_library( ++ 'panfrost_base', ++ 'pan_base.c', ++ include_directories : [ ++ inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw, ++ include_directories('include'), ++ ], ++ gnu_symbol_visibility : 'hidden', ++ build_by_default : false, ++ link_with: [libpanfrost_base_per_arch], ++) ++ ++libpanfrost_base_dep = declare_dependency( ++ link_with: [libpanfrost_base_per_arch, libpanfrost_base], ++ include_directories: [include_directories('.')], ++) +diff --git a/src/panfrost/base/pan_base.c b/src/panfrost/base/pan_base.c +new file mode 100644 +index 00000000000..22dc09cfb52 +--- /dev/null ++++ b/src/panfrost/base/pan_base.c +@@ -0,0 +1,301 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "util/macros.h" ++#include "pan_base.h" ++ ++#include "mali_kbase_ioctl.h" ++ ++bool ++kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose) ++{ ++ *k = (struct kbase_) {0}; ++ k->fd = fd; ++ k->cs_queue_count = cs_queue_count; ++ k->page_size = sysconf(_SC_PAGE_SIZE); ++ k->verbose = verbose; ++ ++ if (k->fd == -1) ++ return kbase_open_csf_noop(k); ++ ++ struct kbase_ioctl_version_check ver = { 0 }; ++ ++ if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK_RESERVED, &ver) == 0) { ++ return kbase_open_csf(k); ++ } else if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK, &ver) == 0) { ++ if (ver.major == 3) ++ return kbase_open_old(k); ++ else ++ return kbase_open_new(k); ++ } ++ ++ return false; ++} ++ ++/* If fd != -1, ownership is passed in */ ++int ++kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd) ++{ ++ kbase_handle h = { ++ .va = va, ++ .fd = fd ++ }; ++ ++ unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); ++ ++ kbase_handle *handles = util_dynarray_begin(&k->gem_handles); ++ ++ for (unsigned i = 0; i < size; ++i) { ++ if (handles[i].fd == -2) { ++ handles[i] = h; ++ return i; ++ } ++ } ++ ++ util_dynarray_append(&k->gem_handles, kbase_handle, h); ++ ++ return size; ++} ++ ++int ++kbase_alloc_gem_handle(kbase k, base_va va, int fd) ++{ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ int ret = kbase_alloc_gem_handle_locked(k, va, fd); ++ ++ pthread_mutex_unlock(&k->handle_lock); ++ ++ return ret; ++} ++ ++void ++kbase_free_gem_handle(kbase k, int handle) ++{ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); ++ ++ int fd; ++ ++ if (handle >= size) { ++ pthread_mutex_unlock(&k->handle_lock); ++ return; ++ } ++ ++ if (handle + 1 < size) { ++ kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); ++ fd = ptr->fd; ++ ptr->fd = -2; ++ } else { ++ fd = (util_dynarray_pop(&k->gem_handles, kbase_handle)).fd; ++ } ++ ++ if (fd != -1) ++ close(fd); ++ ++ pthread_mutex_unlock(&k->handle_lock); ++} ++ ++kbase_handle ++kbase_gem_handle_get(kbase k, int handle) ++{ ++ kbase_handle h = { .fd = -1 }; ++ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); ++ ++ if (handle < size) ++ h = *util_dynarray_element(&k->gem_handles, kbase_handle, handle); ++ ++ pthread_mutex_unlock(&k->handle_lock); ++ ++ return h; ++} ++ ++int ++kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers) ++{ ++ struct kbase_wait_ctx wait = kbase_wait_init(k, timeout_ns); ++ ++ while (kbase_wait_for_event(&wait)) { ++ pthread_mutex_lock(&k->handle_lock); ++ if (handle >= util_dynarray_num_elements(&k->gem_handles, kbase_handle)) { ++ pthread_mutex_unlock(&k->handle_lock); ++ kbase_wait_fini(wait); ++ errno = EINVAL; ++ return -1; ++ } ++ kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); ++ if (!ptr->use_count) { ++ pthread_mutex_unlock(&k->handle_lock); ++ kbase_wait_fini(wait); ++ return 0; ++ } ++ pthread_mutex_unlock(&k->handle_lock); ++ } ++ ++ kbase_wait_fini(wait); ++ errno = ETIMEDOUT; ++ return -1; ++} ++ ++static void ++adjust_time(struct timespec *tp, int64_t ns) ++{ ++ ns += tp->tv_nsec; ++ tp->tv_nsec = ns % 1000000000; ++ tp->tv_sec += ns / 1000000000; ++} ++ ++static int64_t ++ns_until(struct timespec 
tp) ++{ ++ struct timespec now; ++ clock_gettime(CLOCK_MONOTONIC, &now); ++ ++ int64_t sec = (tp.tv_sec - now.tv_sec) * 1000000000; ++ int64_t ns = tp.tv_nsec - now.tv_nsec; ++ ++ /* Clamp the value to zero to avoid errors from ppoll */ ++ return MAX2(sec + ns, 0); ++} ++ ++static void ++kbase_wait_signal(kbase k) ++{ ++ /* We must acquire the event condition lock, otherwise another ++ * thread could be between the trylock and the cond_wait, and ++ * not notice the broadcast. */ ++ pthread_mutex_lock(&k->event_cnd_lock); ++ pthread_cond_broadcast(&k->event_cnd); ++ pthread_mutex_unlock(&k->event_cnd_lock); ++} ++ ++struct kbase_wait_ctx ++kbase_wait_init(kbase k, int64_t timeout_ns) ++{ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC, &tp); ++ ++ adjust_time(&tp, timeout_ns); ++ ++ return (struct kbase_wait_ctx) { ++ .k = k, ++ .until = tp, ++ }; ++} ++ ++bool ++kbase_wait_for_event(struct kbase_wait_ctx *ctx) ++{ ++ kbase k = ctx->k; ++ ++ /* Return instantly the first time so that a check outside the ++ * wait_for_Event loop is not required */ ++ if (!ctx->has_cnd_lock) { ++ pthread_mutex_lock(&k->event_cnd_lock); ++ ctx->has_cnd_lock = true; ++ return true; ++ } ++ ++ if (!ctx->has_lock) { ++ if (pthread_mutex_trylock(&k->event_read_lock) == 0) { ++ ctx->has_lock = true; ++ pthread_mutex_unlock(&k->event_cnd_lock); ++ } else { ++ int ret = pthread_cond_timedwait(&k->event_cnd, ++ &k->event_cnd_lock, &ctx->until); ++ return ret != ETIMEDOUT; ++ } ++ } ++ ++ bool event = k->poll_event(k, ns_until(ctx->until)); ++ k->handle_events(k); ++ kbase_wait_signal(k); ++ return event; ++} ++ ++void ++kbase_wait_fini(struct kbase_wait_ctx ctx) ++{ ++ kbase k = ctx.k; ++ ++ if (ctx.has_lock) { ++ pthread_mutex_unlock(&k->event_read_lock); ++ kbase_wait_signal(k); ++ } else if (ctx.has_cnd_lock) { ++ pthread_mutex_unlock(&k->event_cnd_lock); ++ } ++} ++ ++void ++kbase_ensure_handle_events(kbase k) ++{ ++ /* If we don't manage to take the lock, then events have recently/will ++ * soon be handled, there is no need to do anything. */ ++ if (pthread_mutex_trylock(&k->event_read_lock) == 0) { ++ k->handle_events(k); ++ pthread_mutex_unlock(&k->event_read_lock); ++ kbase_wait_signal(k); ++ } ++} ++ ++bool ++kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp) ++{ ++ struct pollfd pfd = { ++ .fd = fd, ++ .events = wait_shared ? 
POLLOUT : POLLIN, ++ }; ++ ++ uint64_t timeout = ns_until(tp); ++ ++ struct timespec t = { ++ .tv_sec = timeout / 1000000000, ++ .tv_nsec = timeout % 1000000000, ++ }; ++ ++ int ret = ppoll(&pfd, 1, &t, NULL); ++ ++ if (ret == -1 && errno != EINTR) ++ perror("kbase_poll_fd_until"); ++ ++ return ret != 0; ++} +diff --git a/src/panfrost/base/pan_base.h b/src/panfrost/base/pan_base.h +new file mode 100644 +index 00000000000..878f7468433 +--- /dev/null ++++ b/src/panfrost/base/pan_base.h +@@ -0,0 +1,234 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++/* Library for interfacing with kbase */ ++#ifndef PAN_BASE_H ++#define PAN_BASE_H ++ ++#include "util/u_dynarray.h" ++#include "util/list.h" ++ ++#define PAN_EVENT_SIZE 16 ++ ++typedef uint64_t base_va; ++struct base_ptr { ++ void *cpu; ++ base_va gpu; ++}; ++ ++struct kbase_syncobj; ++ ++/* The job is done when the queue seqnum > seqnum */ ++struct kbase_sync_link { ++ struct kbase_sync_link *next; /* must be first */ ++ uint64_t seqnum; ++ void (*callback)(void *); ++ void *data; ++}; ++ ++struct kbase_event_slot { ++ struct kbase_sync_link *syncobjs; ++ struct kbase_sync_link **back; ++ uint64_t last_submit; ++ uint64_t last; ++}; ++ ++struct kbase_context { ++ uint8_t csg_handle; ++ uint8_t kcpu_queue; ++ bool kcpu_init; // TODO: Always create a queue? ++ uint32_t csg_uid; ++ unsigned num_csi; ++ ++ unsigned tiler_heap_chunk_size; ++ base_va tiler_heap_va; ++ base_va tiler_heap_header; ++}; ++ ++struct kbase_cs { ++ struct kbase_context *ctx; ++ void *user_io; ++ base_va va; ++ unsigned size; ++ unsigned event_mem_offset; ++ unsigned csi; ++ ++ uint64_t last_insert; ++ ++ // TODO: This is only here because it's convenient for emit_csf_queue ++ uint32_t *latest_flush; ++}; ++ ++#define KBASE_SLOT_COUNT 2 ++ ++typedef struct { ++ base_va va; ++ int fd; ++ uint8_t use_count; ++ /* For emulating implicit sync. TODO make this work on v10 */ ++ uint8_t last_access[KBASE_SLOT_COUNT]; ++} kbase_handle; ++ ++struct kbase_; ++typedef struct kbase_ *kbase; ++ ++struct kbase_ { ++ unsigned setup_state; ++ bool verbose; ++ ++ int fd; ++ unsigned api; ++ unsigned page_size; ++ // TODO: Actually we may want to try to pack multiple contexts / queue ++ // "sets" into a single group... 
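++   /* Number of command-stream queues requested per CSF queue group:
++    * cs_group_create() forwards this value as cs_min, so each context's
++    * queue group is created with this many command streams available for
++    * kbase_cs_bind() to bind queues to. */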
++ unsigned cs_queue_count; ++ ++ /* Must not hold handle_lock while acquiring event_read_lock */ ++ pthread_mutex_t handle_lock; ++ pthread_mutex_t event_read_lock; ++ pthread_mutex_t event_cnd_lock; ++ pthread_cond_t event_cnd; ++ /* TODO: Per-context/queue locks? */ ++ pthread_mutex_t queue_lock; ++ ++ struct list_head syncobjs; ++ ++ unsigned gpuprops_size; ++ void *gpuprops; ++ ++ void *tracking_region; ++ void *csf_user_reg; ++ struct base_ptr event_mem; ++ struct base_ptr kcpu_event_mem; ++ // TODO: dynamically size ++ struct kbase_event_slot event_slots[256]; ++ // TODO: USe a bitset? ++ unsigned event_slot_usage; ++ ++ uint8_t atom_number; ++ ++ struct util_dynarray gem_handles; ++ struct util_dynarray atom_bos[256]; ++ uint64_t job_seq; ++ ++ void (*close)(kbase k); ++ ++ bool (*get_pan_gpuprop)(kbase k, unsigned name, uint64_t *value); ++ bool (*get_mali_gpuprop)(kbase k, unsigned name, uint64_t *value); ++ ++ struct base_ptr (*alloc)(kbase k, size_t size, ++ unsigned pan_flags, ++ unsigned mali_flags); ++ void (*free)(kbase k, base_va va); ++ ++ int (*import_dmabuf)(kbase k, int fd); ++ void *(*mmap_import)(kbase k, base_va va, size_t size); ++ ++ void (*cache_clean)(void *ptr, size_t size); ++ void (*cache_invalidate)(void *ptr, size_t size); ++ ++ /* Returns false on timeout */ ++ bool (*poll_event)(kbase k, int64_t timeout_ns); ++ bool (*handle_events)(kbase k); ++ ++ /* <= v9 GPUs */ ++ int (*submit)(kbase k, uint64_t va, unsigned req, ++ struct kbase_syncobj *o, ++ int32_t *handles, unsigned num_handles); ++ ++ /* >= v10 GPUs */ ++ struct kbase_context *(*context_create)(kbase k); ++ void (*context_destroy)(kbase k, struct kbase_context *ctx); ++ bool (*context_recreate)(kbase k, struct kbase_context *ctx); ++ ++ // TODO: Pass in a priority? ++ struct kbase_cs (*cs_bind)(kbase k, struct kbase_context *ctx, ++ base_va va, unsigned size); ++ void (*cs_term)(kbase k, struct kbase_cs *cs); ++ void (*cs_rebind)(kbase k, struct kbase_cs *cs); ++ ++ bool (*cs_submit)(kbase k, struct kbase_cs *cs, uint64_t insert_offset, ++ struct kbase_syncobj *o, uint64_t seqnum); ++ bool (*cs_wait)(kbase k, struct kbase_cs *cs, uint64_t extract_offset, ++ struct kbase_syncobj *o); ++ ++ int (*kcpu_fence_export)(kbase k, struct kbase_context *ctx); ++ bool (*kcpu_fence_import)(kbase k, struct kbase_context *ctx, int fd); ++ ++ bool (*kcpu_cqs_set)(kbase k, struct kbase_context *ctx, ++ base_va addr, uint64_t value); ++ bool (*kcpu_cqs_wait)(kbase k, struct kbase_context *ctx, ++ base_va addr, uint64_t value); ++ ++ /* syncobj functions */ ++ struct kbase_syncobj *(*syncobj_create)(kbase k); ++ void (*syncobj_destroy)(kbase k, struct kbase_syncobj *o); ++ struct kbase_syncobj *(*syncobj_dup)(kbase k, struct kbase_syncobj *o); ++ /* TODO: timeout? 
(and for cs_wait) */ ++ bool (*syncobj_wait)(kbase k, struct kbase_syncobj *o); ++ ++ /* Returns false if there are no active queues */ ++ bool (*callback_all_queues)(kbase k, int32_t *count, ++ void (*callback)(void *), void *data); ++ ++ void (*mem_sync)(kbase k, base_va gpu, void *cpu, size_t size, ++ bool invalidate); ++}; ++ ++bool kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose); ++ ++/* Called from kbase_open */ ++bool kbase_open_old(kbase k); ++bool kbase_open_new(kbase k); ++bool kbase_open_csf(kbase k); ++bool kbase_open_csf_noop(kbase k); ++ ++/* BO management */ ++int kbase_alloc_gem_handle(kbase k, base_va va, int fd); ++int kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd); ++void kbase_free_gem_handle(kbase k, int handle); ++kbase_handle kbase_gem_handle_get(kbase k, int handle); ++int kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers); ++ ++/* Event waiting */ ++struct kbase_wait_ctx { ++ kbase k; ++ struct timespec until; ++ bool has_lock; ++ bool has_cnd_lock; ++}; ++ ++struct kbase_wait_ctx kbase_wait_init(kbase k, int64_t timeout_ns); ++/* Returns false on timeout, kbase_wait_fini must still be called */ ++bool kbase_wait_for_event(struct kbase_wait_ctx *ctx); ++void kbase_wait_fini(struct kbase_wait_ctx ctx); ++ ++void kbase_ensure_handle_events(kbase k); ++ ++bool kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp); ++ ++/* Must not conflict with PANFROST_BO_* flags */ ++#define MALI_BO_CACHED_CPU (1 << 16) ++#define MALI_BO_UNCACHED_GPU (1 << 17) ++ ++#endif +diff --git a/src/panfrost/base/pan_base_noop.h b/src/panfrost/base/pan_base_noop.h +new file mode 100644 +index 00000000000..750a445a995 +--- /dev/null ++++ b/src/panfrost/base/pan_base_noop.h +@@ -0,0 +1,152 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#ifndef PAN_BASE_NOOP_H ++#define PAN_BASE_NOOP_H ++ ++/* For Mali-G610 as used in RK3588 */ ++#define PROP(name, value) ((name << 2) | 2), value ++static const uint32_t gpu_props[] = { ++ PROP(KBASE_GPUPROP_RAW_GPU_ID, 0xa8670000), ++ PROP(KBASE_GPUPROP_PRODUCT_ID, 0xa867), ++ PROP(KBASE_GPUPROP_RAW_SHADER_PRESENT, 0x50005), ++ PROP(KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, 0xc1ffff9e), ++ PROP(KBASE_GPUPROP_TLS_ALLOC, 0x800), ++ PROP(KBASE_GPUPROP_RAW_TILER_FEATURES, 0x809), ++}; ++#undef PROP ++ ++#define NOOP_COOKIE_ALLOC 0x41000 ++#define NOOP_COOKIE_USER_IO 0x42000 ++#define NOOP_COOKIE_MEM_ALLOC 0x43000 ++ ++static int ++kbase_ioctl(int fd, unsigned long request, ...) ++{ ++ int ret = 0; ++ ++ va_list args; ++ ++ va_start(args, request); ++ void *ptr = va_arg(args, void *); ++ va_end(args); ++ ++ switch (request) { ++ case KBASE_IOCTL_GET_GPUPROPS: { ++ struct kbase_ioctl_get_gpuprops *props = ptr; ++ ++ if (props->size) ++ memcpy((void *)(uintptr_t) props->buffer, ++ gpu_props, MIN2(props->size, sizeof(gpu_props))); ++ ++ ret = sizeof(gpu_props); ++ break; ++ } ++ ++ case KBASE_IOCTL_MEM_ALLOC: { ++ union kbase_ioctl_mem_alloc *alloc = ptr; ++ ++ alloc->out.gpu_va = NOOP_COOKIE_ALLOC; ++ alloc->out.flags = BASE_MEM_SAME_VA; ++ break; ++ } ++ ++ case KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6: { ++ union kbase_ioctl_cs_queue_group_create_1_6 *create = ptr; ++ ++ // TODO: Don't return duplicates? ++ create->out.group_handle = 0; ++ create->out.group_uid = 1; ++ break; ++ } ++ ++ case KBASE_IOCTL_CS_TILER_HEAP_INIT: { ++ union kbase_ioctl_cs_tiler_heap_init *init = ptr; ++ ++ /* The values don't really matter, the CPU has no business in accessing ++ * these. */ ++ init->out.gpu_heap_va = 0x60000; ++ init->out.first_chunk_va = 0x61000; ++ break; ++ } ++ ++ case KBASE_IOCTL_CS_QUEUE_BIND: { ++ union kbase_ioctl_cs_queue_bind *bind = ptr; ++ bind->out.mmap_handle = NOOP_COOKIE_USER_IO; ++ break; ++ } ++ ++ case KBASE_IOCTL_MEM_IMPORT: { ++ union kbase_ioctl_mem_import *import = ptr; ++ ++ if (import->in.type != BASE_MEM_IMPORT_TYPE_UMM) { ++ ret = -1; ++ errno = EINVAL; ++ break; ++ } ++ ++ int *fd = (int *)(uintptr_t) import->in.phandle; ++ ++ off_t size = lseek(*fd, 0, SEEK_END); ++ ++ import->out.flags = BASE_MEM_NEED_MMAP; ++ import->out.gpu_va = NOOP_COOKIE_MEM_ALLOC; ++ import->out.va_pages = DIV_ROUND_UP(size, 4096); ++ } ++ ++ case KBASE_IOCTL_SET_FLAGS: ++ case KBASE_IOCTL_MEM_EXEC_INIT: ++ case KBASE_IOCTL_MEM_JIT_INIT: ++ case KBASE_IOCTL_CS_QUEUE_REGISTER: ++ case KBASE_IOCTL_CS_QUEUE_KICK: ++ case KBASE_IOCTL_CS_TILER_HEAP_TERM: ++ case KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE: ++ case KBASE_IOCTL_MEM_SYNC: ++ break; ++ ++ default: ++ ret = -1; ++ errno = ENOSYS; ++ } ++ ++ return ret; ++} ++ ++static void * ++kbase_mmap(void *addr, size_t length, int prot, int flags, ++ int fd, off_t offset) ++{ ++ switch (offset) { ++ case BASE_MEM_MAP_TRACKING_HANDLE: ++ case BASEP_MEM_CSF_USER_REG_PAGE_HANDLE: ++ case NOOP_COOKIE_ALLOC: ++ case NOOP_COOKIE_USER_IO: ++ case NOOP_COOKIE_MEM_ALLOC: ++ return mmap(NULL, length, prot, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ ++ default: ++ errno = ENOSYS; ++ return MAP_FAILED; ++ } ++} ++#endif +diff --git a/src/panfrost/base/pan_cache.h b/src/panfrost/base/pan_cache.h +new file mode 100644 +index 00000000000..ad5af0c7098 +--- /dev/null ++++ b/src/panfrost/base/pan_cache.h +@@ -0,0 +1,95 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and 
associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef PAN_CACHE_H ++#define PAN_CACHE_H ++ ++#ifdef __aarch64__ ++ ++static void ++cache_clean(volatile void *addr) ++{ ++ __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); ++} ++ ++static void ++cache_invalidate(volatile void *addr) ++{ ++ __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); ++} ++ ++typedef void (*cacheline_op)(volatile void *addr); ++ ++#define CACHELINE_SIZE 64 ++ ++static void ++cacheline_op_range(volatile void *start, size_t length, cacheline_op op) ++{ ++ volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); ++ volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); ++ for (; ptr < end; ptr += CACHELINE_SIZE) ++ op(ptr); ++} ++ ++static void ++cache_clean_range(volatile void *start, size_t length) ++{ ++ /* TODO: Do an invalidate at the start of the range? */ ++ cacheline_op_range(start, length, cache_clean); ++} ++ ++static void ++cache_invalidate_range(volatile void *start, size_t length) ++{ ++ cacheline_op_range(start, length, cache_invalidate); ++} ++ ++#endif /* __aarch64__ */ ++ ++/* The #ifdef covers both 32-bit and 64-bit ARM */ ++#ifdef __ARM_ARCH ++static void ++cache_barrier(void) ++{ ++ __asm__ volatile ("dsb sy" ::: "memory"); ++} ++ ++static void ++memory_barrier(void) ++{ ++ __asm__ volatile ("dmb sy" ::: "memory"); ++} ++#else ++ ++/* TODO: How to do cache barriers when emulated? */ ++static void ++cache_barrier(void) ++{ ++} ++ ++static void ++memory_barrier(void) ++{ ++} ++#endif ++#endif +diff --git a/src/panfrost/base/pan_vX_base.c b/src/panfrost/base/pan_vX_base.c +new file mode 100644 +index 00000000000..99bd356c536 +--- /dev/null ++++ b/src/panfrost/base/pan_vX_base.c +@@ -0,0 +1,1825 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef HAVE_VALGRIND ++#include ++#else ++#define RUNNING_ON_VALGRIND 0 ++#endif ++ ++#include "util/macros.h" ++#include "util/list.h" ++#include "util/u_atomic.h" ++#include "util/os_file.h" ++ ++#include "pan_base.h" ++#include "pan_cache.h" ++ ++#include "drm-uapi/panfrost_drm.h" ++ ++#define PAN_BASE_API (PAN_BASE_VER & 0xff) ++#if (PAN_BASE_VER & 0x100) == 0x100 ++#define PAN_BASE_NOOP ++#endif ++ ++#if PAN_BASE_API >= 2 ++#include "csf/mali_gpu_csf_registers.h" ++ ++#define MALI_USE_CSF 1 ++#endif ++ ++#include "mali_kbase_gpuprops.h" ++ ++#ifndef PAN_BASE_NOOP ++#define kbase_mmap mmap ++#endif ++ ++#if PAN_BASE_API >= 1 ++#include "mali_base_kernel.h" ++#include "mali_kbase_ioctl.h" ++ ++#ifdef PAN_BASE_NOOP ++#include "pan_base_noop.h" ++#else ++#define kbase_ioctl ioctl ++#endif ++#else ++ ++#include "old/mali-ioctl.h" ++#include "old/mali-ioctl-midgard.h" ++#include "old/mali-props.h" ++#endif ++ ++#define LOG(fmt, ...) do { \ ++ if (k->verbose) { \ ++ struct timespec tp; \ ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); \ ++ printf("%"PRIu64".%09li\t" fmt, (uint64_t) tp.tv_sec, tp.tv_nsec __VA_OPT__(,) __VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#if PAN_BASE_API == 0 ++static int ++kbase_ioctl(int fd, unsigned long request, ...) 
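++/* Old-style kbase ioctls (PAN_BASE_API == 0) report their status in the
++ * first word of the argument struct rather than through the ioctl return
++ * value: this wrapper fills that word with an id derived from the request
++ * number and maps the returned MALI_ERROR_* code onto errno. */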
++{ ++ int ioc_size = _IOC_SIZE(request); ++ ++ assert(ioc_size); ++ ++ va_list args; ++ ++ va_start(args, request); ++ int *ptr = va_arg(args, void *); ++ va_end(args); ++ ++ *ptr = (_IOC_TYPE(request) - 0x80) * 256 + _IOC_NR(request); ++ ++ int ret = ioctl(fd, request, ptr); ++ if (ret) ++ return ret; ++ ++ int r = *ptr; ++ switch (r) { ++ case MALI_ERROR_OUT_OF_GPU_MEMORY: ++ errno = ENOSPC; ++ return -1; ++ case MALI_ERROR_OUT_OF_MEMORY: ++ errno = ENOMEM; ++ return -1; ++ case MALI_ERROR_FUNCTION_FAILED: ++ errno = EINVAL; ++ return -1; ++ default: ++ return 0; ++ } ++} ++#endif ++ ++#if PAN_BASE_API >= 1 ++static bool ++kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) ++{ ++ int i = 0; ++ uint64_t x = 0; ++ while (i < k->gpuprops_size) { ++ x = 0; ++ memcpy(&x, k->gpuprops + i, 4); ++ i += 4; ++ ++ int size = 1 << (x & 3); ++ int this_name = x >> 2; ++ ++ x = 0; ++ memcpy(&x, k->gpuprops + i, size); ++ i += size; ++ ++ if (this_name == name) { ++ *value = x; ++ return true; ++ } ++ } ++ ++ return false; ++} ++#else ++static bool ++kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) ++{ ++ struct kbase_ioctl_gpu_props_reg_dump *props = k->gpuprops; ++ ++ switch (name) { ++ case KBASE_GPUPROP_PRODUCT_ID: ++ *value = props->core.product_id; ++ return true; ++ case KBASE_GPUPROP_RAW_SHADER_PRESENT: ++ *value = props->raw.shader_present; ++ return true; ++ case KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0: ++ *value = props->raw.texture_features[0]; ++ return true; ++ case KBASE_GPUPROP_RAW_TILER_FEATURES: ++ *value = props->raw.tiler_features; ++ return true; ++ case KBASE_GPUPROP_RAW_GPU_ID: ++ *value = props->raw.gpu_id; ++ return true; ++ default: ++ return false; ++ } ++} ++#endif ++ ++static bool ++alloc_handles(kbase k) ++{ ++ util_dynarray_init(&k->gem_handles, NULL); ++ return true; ++} ++ ++static bool ++free_handles(kbase k) ++{ ++ util_dynarray_fini(&k->gem_handles); ++ return true; ++} ++ ++static bool ++set_flags(kbase k) ++{ ++ struct kbase_ioctl_set_flags flags = { ++ .create_flags = 0 ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_SET_FLAGS, &flags); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++mmap_tracking(kbase k) ++{ ++ k->tracking_region = kbase_mmap(NULL, k->page_size, PROT_NONE, ++ MAP_SHARED, k->fd, ++ BASE_MEM_MAP_TRACKING_HANDLE); ++ ++ if (k->tracking_region == MAP_FAILED) { ++ perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); ++ k->tracking_region = NULL; ++ return false; ++ } ++ return true; ++} ++ ++static bool ++munmap_tracking(kbase k) ++{ ++ if (k->tracking_region) ++ return munmap(k->tracking_region, k->page_size) == 0; ++ return true; ++} ++ ++#if PAN_BASE_API >= 1 ++static bool ++get_gpuprops(kbase k) ++{ ++ struct kbase_ioctl_get_gpuprops props = { 0 }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); ++ return false; ++ } else if (!ret) { ++ fprintf(stderr, "GET_GPUPROPS returned zero size\n"); ++ return false; ++ } ++ ++ k->gpuprops_size = ret; ++ k->gpuprops = calloc(k->gpuprops_size, 1); ++ ++ props.size = k->gpuprops_size; ++ props.buffer = (uint64_t)(uintptr_t) k->gpuprops; ++ ++ ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); ++ return false; ++ } ++ ++ return true; ++} ++#else ++static bool ++get_gpuprops(kbase k) ++{ ++ k->gpuprops = calloc(1, sizeof(struct 
kbase_ioctl_gpu_props_reg_dump)); ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GPU_PROPS_REG_DUMP, k->gpuprops); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_GPU_PROPS_REG_DUMP)"); ++ return false; ++ } ++ ++ return true; ++} ++#endif ++ ++static bool ++free_gpuprops(kbase k) ++{ ++ free(k->gpuprops); ++ return true; ++} ++ ++#if PAN_BASE_API >= 2 ++static bool ++mmap_user_reg(kbase k) ++{ ++ k->csf_user_reg = kbase_mmap(NULL, k->page_size, PROT_READ, ++ MAP_SHARED, k->fd, ++ BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); ++ ++ if (k->csf_user_reg == MAP_FAILED) { ++ perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); ++ k->csf_user_reg = NULL; ++ return false; ++ } ++ return true; ++} ++ ++static bool ++munmap_user_reg(kbase k) ++{ ++ if (k->csf_user_reg) ++ return munmap(k->csf_user_reg, k->page_size) == 0; ++ return true; ++} ++#endif ++ ++#if PAN_BASE_API >= 1 ++static bool ++init_mem_exec(kbase k) ++{ ++ struct kbase_ioctl_mem_exec_init init = { ++ .va_pages = 0x100000, ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++init_mem_jit(kbase k) ++{ ++ struct kbase_ioctl_mem_jit_init init = { ++ .va_pages = 1 << 25, ++ .max_allocations = 255, ++ .phys_pages = 1 << 25, ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_JIT_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); ++ return false; ++ } ++ return true; ++} ++#endif ++ ++#if PAN_BASE_API >= 2 ++static struct base_ptr ++kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags); ++ ++static bool ++alloc_event_mem(kbase k) ++{ ++ k->event_mem = kbase_alloc(k, k->page_size * 2, ++ PANFROST_BO_NOEXEC, ++ BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | ++ BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | ++ BASE_MEM_SAME_VA | BASE_MEM_CSF_EVENT); ++ k->kcpu_event_mem = (struct base_ptr) { ++ .cpu = k->event_mem.cpu + k->page_size, ++ .gpu = k->event_mem.gpu + k->page_size, ++ }; ++ return k->event_mem.cpu; ++} ++ ++static bool ++free_event_mem(kbase k) ++{ ++ if (k->event_mem.cpu) ++ return munmap(k->event_mem.cpu, k->page_size * 2) == 0; ++ return true; ++} ++#endif ++ ++#if PAN_BASE_API >= 2 ++static bool ++cs_group_create(kbase k, struct kbase_context *c) ++{ ++ /* TODO: What about compute-only contexts? 
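++    *
++    * One queue group is created per context: all shader cores are enabled
++    * in the fragment and compute masks, a single tiler is requested, and
++    * cs_min asks for cs_queue_count command streams. The group_handle
++    * returned here is what kbase_cs_bind_noevent() and cs_group_term()
++    * later pass back to the kernel.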
*/ ++ union kbase_ioctl_cs_queue_group_create_1_6 create = { ++ .in = { ++ /* Mali *still* only supports a single tiler unit */ ++ .tiler_mask = 1, ++ .fragment_mask = ~0ULL, ++ .compute_mask = ~0ULL, ++ ++ .cs_min = k->cs_queue_count, ++ ++ .priority = 1, ++ .tiler_max = 1, ++ .fragment_max = 64, ++ .compute_max = 64, ++ } ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); ++ return false; ++ } ++ ++ c->csg_handle = create.out.group_handle; ++ c->csg_uid = create.out.group_uid; ++ ++ /* Should be at least 1 */ ++ assert(c->csg_uid); ++ ++ return true; ++} ++ ++static bool ++cs_group_term(kbase k, struct kbase_context *c) ++{ ++ if (!c->csg_uid) ++ return true; ++ ++ struct kbase_ioctl_cs_queue_group_term term = { ++ .group_handle = c->csg_handle ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); ++ return false; ++ } ++ return true; ++} ++#endif ++ ++#if PAN_BASE_API >= 2 ++static bool ++tiler_heap_create(kbase k, struct kbase_context *c) ++{ ++ c->tiler_heap_chunk_size = 1 << 21; /* 2 MB */ ++ ++ union kbase_ioctl_cs_tiler_heap_init init = { ++ .in = { ++ .chunk_size = c->tiler_heap_chunk_size, ++ .initial_chunks = 5, ++ .max_chunks = 200, ++ .target_in_flight = 65535, ++ } ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); ++ return false; ++ } ++ ++ c->tiler_heap_va = init.out.gpu_heap_va; ++ c->tiler_heap_header = init.out.first_chunk_va; ++ ++ return true; ++} ++ ++static bool ++tiler_heap_term(kbase k, struct kbase_context *c) ++{ ++ if (!c->tiler_heap_va) ++ return true; ++ ++ struct kbase_ioctl_cs_tiler_heap_term term = { ++ .gpu_heap_va = c->tiler_heap_va ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); ++ return false; ++ } ++ return true; ++} ++#endif ++ ++typedef bool (* kbase_func)(kbase k); ++ ++struct kbase_op { ++ kbase_func part; ++ kbase_func cleanup; ++ const char *label; ++}; ++ ++static struct kbase_op kbase_main[] = { ++ { alloc_handles, free_handles, "Allocate handle array" }, ++#if PAN_BASE_API >= 1 ++ { set_flags, NULL, "Set flags" }, ++#endif ++ { mmap_tracking, munmap_tracking, "Map tracking handle" }, ++#if PAN_BASE_API == 0 ++ { set_flags, NULL, "Set flags" }, ++#endif ++ { get_gpuprops, free_gpuprops, "Get GPU properties" }, ++#if PAN_BASE_API >= 2 ++ { mmap_user_reg, munmap_user_reg, "Map user register page" }, ++#endif ++#if PAN_BASE_API >= 1 ++ { init_mem_exec, NULL, "Initialise EXEC_VA zone" }, ++ { init_mem_jit, NULL, "Initialise JIT allocator" }, ++#endif ++#if PAN_BASE_API >= 2 ++ { alloc_event_mem, free_event_mem, "Allocate event memory" }, ++#endif ++}; ++ ++static void ++kbase_close(kbase k) ++{ ++ while (k->setup_state) { ++ unsigned i = k->setup_state - 1; ++ if (kbase_main[i].cleanup) ++ kbase_main[i].cleanup(k); ++ --k->setup_state; ++ } ++ ++ pthread_mutex_destroy(&k->handle_lock); ++ pthread_mutex_destroy(&k->event_read_lock); ++ pthread_mutex_destroy(&k->event_cnd_lock); ++ pthread_mutex_destroy(&k->queue_lock); ++ pthread_cond_destroy(&k->event_cnd); ++ ++ close(k->fd); ++} ++ ++static bool ++kbase_get_pan_gpuprop(kbase k, unsigned name, uint64_t *value) ++{ ++ unsigned conv[] = { ++ 
[DRM_PANFROST_PARAM_GPU_PROD_ID] = KBASE_GPUPROP_PRODUCT_ID, ++ [DRM_PANFROST_PARAM_SHADER_PRESENT] = KBASE_GPUPROP_RAW_SHADER_PRESENT, ++ [DRM_PANFROST_PARAM_TEXTURE_FEATURES0] = KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, ++ [DRM_PANFROST_PARAM_THREAD_TLS_ALLOC] = KBASE_GPUPROP_TLS_ALLOC, ++ [DRM_PANFROST_PARAM_TILER_FEATURES] = KBASE_GPUPROP_RAW_TILER_FEATURES, ++ }; ++ ++ if (name < ARRAY_SIZE(conv) && conv[name]) ++ return kbase_get_mali_gpuprop(k, conv[name], value); ++ ++ switch (name) { ++ case DRM_PANFROST_PARAM_AFBC_FEATURES: ++ *value = 0; ++ return true; ++ case DRM_PANFROST_PARAM_GPU_REVISION: { ++ if (!kbase_get_mali_gpuprop(k, KBASE_GPUPROP_RAW_GPU_ID, value)) ++ return false; ++ *value &= 0xffff; ++ return true; ++ } ++ default: ++ return false; ++ } ++} ++ ++static void ++kbase_free(kbase k, base_va va) ++{ ++ struct kbase_ioctl_mem_free f = { ++ .gpu_addr = va ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_FREE, &f); ++ ++ if (ret == -1) ++ perror("ioctl(KBASE_IOCTL_MEM_FREE)"); ++} ++ ++static struct base_ptr ++kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags) ++{ ++ struct base_ptr r = {0}; ++ ++ unsigned pages = DIV_ROUND_UP(size, k->page_size); ++ ++ union kbase_ioctl_mem_alloc a = { ++ .in = { ++ .va_pages = pages, ++ .commit_pages = pages, ++ } ++ }; ++ ++ size_t alloc_size = size; ++ unsigned flags = mali_flags; ++ bool exec_align = false; ++ ++ if (!flags) { ++ flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | ++ BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | ++ BASE_MEM_SAME_VA; ++ ++ /* Add COHERENT_LOCAL to keep GPU cores coherent with each ++ * other. */ ++ if (PAN_BASE_API >= 1) ++ flags |= BASE_MEM_COHERENT_LOCAL; ++ } ++ ++ if (pan_flags & PANFROST_BO_HEAP) { ++ size_t align_size = 2 * 1024 * 1024 / k->page_size; /* 2 MB */ ++ ++ a.in.va_pages = ALIGN_POT(a.in.va_pages, align_size); ++ a.in.commit_pages = 0; ++ a.in.extension = align_size; ++ flags |= BASE_MEM_GROW_ON_GPF; ++ } ++ ++#if PAN_BASE_API >= 1 ++ if (pan_flags & MALI_BO_CACHED_CPU) ++ flags |= BASE_MEM_CACHED_CPU; ++#endif ++ ++#if PAN_BASE_API >= 2 ++ if (pan_flags & MALI_BO_UNCACHED_GPU) ++ flags |= BASE_MEM_UNCACHED_GPU; ++#endif ++ ++ if (!(pan_flags & PANFROST_BO_NOEXEC)) { ++ /* Using SAME_VA for executable BOs would make it too likely ++ * for a blend shader to end up on the wrong side of a 4 GB ++ * boundary. */ ++ flags |= BASE_MEM_PROT_GPU_EX; ++ flags &= ~(BASE_MEM_PROT_GPU_WR | BASE_MEM_SAME_VA); ++ ++ if (PAN_BASE_API == 0) { ++ /* Assume 4K pages */ ++ a.in.va_pages = 0x1000; /* Align shader BOs to 16 MB */ ++ size = 1 << 26; /* Four times the alignment */ ++ exec_align = true; ++ } ++ } ++ ++ a.in.flags = flags; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_ALLOC, &a); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); ++ return r; ++ } ++ ++ // TODO: Is this always true, even in the face of multithreading? ++ if (PAN_BASE_API == 0) ++ a.out.gpu_va = 0x41000; ++ ++ if ((flags & BASE_MEM_SAME_VA) && ++ !((a.out.flags & BASE_MEM_SAME_VA) && ++ a.out.gpu_va < 0x80000)) { ++ ++ fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", ++ (uint64_t) a.out.flags, (uint64_t) a.out.gpu_va); ++ errno = EINVAL; ++ return r; ++ } ++ ++ void *ptr = kbase_mmap(NULL, size, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ k->fd, a.out.gpu_va); ++ ++ if (ptr == MAP_FAILED) { ++ perror("mmap(GPU BO)"); ++ kbase_free(k, a.out.gpu_va); ++ return r; ++ } ++ ++ uint64_t gpu_va = (a.out.flags & BASE_MEM_SAME_VA) ? 
++ (uintptr_t) ptr : a.out.gpu_va; ++ ++ if (exec_align) { ++ gpu_va = ALIGN_POT(gpu_va, 1 << 24); ++ ++ ptr = kbase_mmap(NULL, alloc_size, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ k->fd, gpu_va); ++ ++ if (ptr == MAP_FAILED) { ++ perror("mmap(GPU EXEC BO)"); ++ kbase_free(k, gpu_va); ++ return r; ++ } ++ } ++ ++ r.cpu = ptr; ++ r.gpu = gpu_va; ++ ++ return r; ++} ++ ++static int ++kbase_import_dmabuf(kbase k, int fd) ++{ ++ int ret; ++ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); ++ ++ kbase_handle *handles = util_dynarray_begin(&k->gem_handles); ++ ++ for (unsigned i = 0; i < size; ++i) { ++ kbase_handle h = handles[i]; ++ ++ if (h.fd < 0) ++ continue; ++ ++ ret = os_same_file_description(h.fd, fd); ++ ++ if (ret == 0) { ++ pthread_mutex_unlock(&k->handle_lock); ++ return i; ++ } else if (ret < 0) { ++ printf("error in os_same_file_description(%i, %i)\n", h.fd, fd); ++ } ++ } ++ ++ int dup = os_dupfd_cloexec(fd); ++ ++ union kbase_ioctl_mem_import import = { ++ .in = { ++ .phandle = (uintptr_t) &dup, ++ .type = BASE_MEM_IMPORT_TYPE_UMM, ++ /* Usage flags: CPU/GPU reads/writes */ ++ .flags = 0xf, ++ } ++ }; ++ ++ ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_IMPORT, &import); ++ ++ int handle; ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_IMPORT)"); ++ handle = -1; ++ } else if (import.out.flags & BASE_MEM_NEED_MMAP) { ++ uint64_t va = (uintptr_t) kbase_mmap(NULL, import.out.va_pages * k->page_size, ++ PROT_READ | PROT_WRITE, ++ MAP_SHARED, k->fd, import.out.gpu_va); ++ ++ if (va == (uintptr_t) MAP_FAILED) { ++ perror("mmap(IMPORTED BO)"); ++ handle = -1; ++ } else { ++ handle = kbase_alloc_gem_handle_locked(k, va, dup); ++ } ++ } else { ++ handle = kbase_alloc_gem_handle_locked(k, import.out.gpu_va, dup); ++ } ++ ++ pthread_mutex_unlock(&k->handle_lock); ++ ++ return handle; ++} ++ ++static void * ++kbase_mmap_import(kbase k, base_va va, size_t size) ++{ ++ return kbase_mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, k->fd, va); ++} ++ ++struct kbase_fence { ++ struct list_head link; ++ ++ unsigned slot; ++ uint64_t value; ++}; ++ ++struct kbase_syncobj { ++ struct list_head link; ++ ++ struct list_head fences; ++}; ++ ++static struct kbase_syncobj * ++kbase_syncobj_create(kbase k) ++{ ++ struct kbase_syncobj *o = calloc(1, sizeof(*o)); ++ list_inithead(&o->fences); ++ pthread_mutex_lock(&k->queue_lock); ++ list_add(&o->link, &k->syncobjs); ++ pthread_mutex_unlock(&k->queue_lock); ++ return o; ++} ++ ++static void ++kbase_syncobj_destroy(kbase k, struct kbase_syncobj *o) ++{ ++ pthread_mutex_lock(&k->queue_lock); ++ list_del(&o->link); ++ pthread_mutex_unlock(&k->queue_lock); ++ ++ list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { ++ list_del(&fence->link); ++ free(fence); ++ } ++ ++ free(o); ++} ++ ++static void ++kbase_syncobj_add_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) ++{ ++ struct kbase_fence *fence = calloc(1, sizeof(*fence)); ++ ++ fence->slot = slot; ++ fence->value = value; ++ ++ list_add(&fence->link, &o->fences); ++} ++ ++static void ++kbase_syncobj_update_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) ++{ ++ list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { ++ if (fence->slot == slot) { ++ if (value > fence->value) ++ fence->value = value; ++ ++ return; ++ } ++ } ++ ++ kbase_syncobj_add_fence(o, slot, value); ++} ++ ++static struct kbase_syncobj * ++kbase_syncobj_dup(kbase k, struct kbase_syncobj *o) ++{ ++ struct 
kbase_syncobj *dup = kbase_syncobj_create(k); ++ ++ pthread_mutex_lock(&k->queue_lock); ++ ++ list_for_each_entry(struct kbase_fence, fence, &o->fences, link) ++ kbase_syncobj_add_fence(dup, fence->slot, fence->value); ++ ++ pthread_mutex_unlock(&k->queue_lock); ++ ++ return dup; ++} ++ ++static void ++kbase_syncobj_update(kbase k, struct kbase_syncobj *o) ++{ ++ list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { ++ uint64_t value = k->event_slots[fence->slot].last; ++ ++ if (value > fence->value) { ++ LOG("syncobj %p slot %u value %"PRIu64" vs %"PRIu64"\n", ++ o, fence->slot, fence->value, value); ++ ++ list_del(&fence->link); ++ free(fence); ++ } ++ } ++} ++ ++static bool ++kbase_syncobj_wait(kbase k, struct kbase_syncobj *o) ++{ ++ if (list_is_empty(&o->fences)) { ++ LOG("syncobj has no fences\n"); ++ return true; ++ } ++ ++ struct kbase_wait_ctx wait = kbase_wait_init(k, 1 * 1000000000LL); ++ ++ while (kbase_wait_for_event(&wait)) { ++ kbase_syncobj_update(k, o); ++ ++ if (list_is_empty(&o->fences)) { ++ kbase_wait_fini(wait); ++ return true; ++ } ++ } ++ ++ kbase_wait_fini(wait); ++ ++ fprintf(stderr, "syncobj %p wait timeout\n", o); ++ return false; ++} ++ ++static bool ++kbase_poll_event(kbase k, int64_t timeout_ns) ++{ ++ struct pollfd pfd = { ++ .fd = k->fd, ++ .events = POLLIN, ++ }; ++ ++ struct timespec t = { ++ .tv_sec = timeout_ns / 1000000000, ++ .tv_nsec = timeout_ns % 1000000000, ++ }; ++ ++ int ret = ppoll(&pfd, 1, &t, NULL); ++ ++ if (ret == -1 && errno != EINTR) ++ perror("poll(mali fd)"); ++ ++ LOG("poll returned %i\n", pfd.revents); ++ ++ return ret != 0; ++} ++ ++#if PAN_BASE_API < 2 ++static bool ++kbase_handle_events(kbase k) ++{ ++ struct base_jd_event_v2 event; ++ bool ret = true; ++ ++ for (;;) { ++ int ret = read(k->fd, &event, sizeof(event)); ++ ++ if (ret == -1) { ++ if (errno == EAGAIN) { ++ return true; ++ } else { ++ perror("read(mali fd)"); ++ return false; ++ } ++ } ++ ++ if (event.event_code != BASE_JD_EVENT_DONE) { ++ fprintf(stderr, "Atom %i reported event 0x%x!\n", ++ event.atom_number, event.event_code); ++ ret = false; ++ } ++ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ k->event_slots[event.atom_number].last = event.udata.blob[0]; ++ ++ unsigned size = util_dynarray_num_elements(&k->gem_handles, ++ kbase_handle); ++ kbase_handle *handle_data = util_dynarray_begin(&k->gem_handles); ++ ++ struct util_dynarray *handles = k->atom_bos + event.atom_number; ++ ++ util_dynarray_foreach(handles, int32_t, h) { ++ if (*h >= size) ++ continue; ++ assert(handle_data[*h].use_count); ++ --handle_data[*h].use_count; ++ } ++ util_dynarray_fini(handles); ++ ++ pthread_mutex_unlock(&k->handle_lock); ++ } ++ ++ return ret; ++} ++ ++#else ++ ++static bool ++kbase_read_event(kbase k) ++{ ++ struct base_csf_notification event; ++ int ret = read(k->fd, &event, sizeof(event)); ++ ++ if (ret == -1) { ++ if (errno == EAGAIN) { ++ return true; ++ } else { ++ perror("read(mali_fd)"); ++ return false; ++ } ++ } ++ ++ if (ret != sizeof(event)) { ++ fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", ++ ret, (int) sizeof(event)); ++ return false; ++ } ++ ++ switch (event.type) { ++ case BASE_CSF_NOTIFICATION_EVENT: ++ LOG("Notification event!\n"); ++ return true; ++ ++ case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: ++ break; ++ ++ case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: ++ fprintf(stderr, "No event from mali_fd!\n"); ++ return true; ++ ++ default: ++ fprintf(stderr, "Unknown event type!\n"); ++ return true; ++ } ++ ++ struct 
base_gpu_queue_group_error e = event.payload.csg_error.error; ++ ++ switch (e.error_type) { ++ case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { ++ // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h ++ fprintf(stderr, "Queue group error: status 0x%x " ++ "sideband 0x%"PRIx64"\n", ++ e.payload.fatal_group.status, ++ (uint64_t) e.payload.fatal_group.sideband); ++ break; ++ } ++ case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { ++ unsigned queue = e.payload.fatal_queue.csi_index; ++ ++ // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h ++ fprintf(stderr, "Queue %i error: status 0x%x " ++ "sideband 0x%"PRIx64"\n", ++ queue, e.payload.fatal_queue.status, ++ (uint64_t) e.payload.fatal_queue.sideband); ++ ++ /* TODO: Decode the instruct that it got stuck at */ ++ ++ break; ++ } ++ ++ case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: ++ fprintf(stderr, "Command stream timeout!\n"); ++ break; ++ case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: ++ fprintf(stderr, "Command stream OOM!\n"); ++ break; ++ default: ++ fprintf(stderr, "Unknown error type!\n"); ++ } ++ ++ return false; ++} ++ ++static void ++kbase_update_queue_callbacks(kbase k, ++ struct kbase_event_slot *slot, ++ uint64_t seqnum) ++{ ++ struct kbase_sync_link **list = &slot->syncobjs; ++ struct kbase_sync_link **back = slot->back; ++ ++ while (*list) { ++ struct kbase_sync_link *link = *list; ++ ++ LOG("seq %"PRIu64" %"PRIu64"\n", seqnum, link->seqnum); ++ ++ /* Items in the list should be in order, there is no need to ++ * check any more if we can't process this link yet. */ ++ if (seqnum <= link->seqnum) ++ break; ++ ++ LOG("done, calling %p(%p)\n", link->callback, link->data); ++ link->callback(link->data); ++ *list = link->next; ++ if (&link->next == back) ++ slot->back = list; ++ free(link); ++ } ++} ++ ++static bool ++kbase_handle_events(kbase k) ++{ ++#ifdef PAN_BASE_NOOP ++ return true; ++#endif ++ ++ /* This will clear the event count, so there's no need to do it in a ++ * loop. */ ++ bool ret = kbase_read_event(k); ++ ++ uint64_t *event_mem = k->event_mem.cpu; ++ ++ pthread_mutex_lock(&k->queue_lock); ++ ++ for (unsigned i = 0; i < k->event_slot_usage; ++i) { ++ uint64_t seqnum = event_mem[i * 2]; ++ uint64_t cmp = k->event_slots[i].last; ++ ++ LOG("MAIN SEQ %"PRIu64" > %"PRIu64"?\n", seqnum, cmp); ++ ++ if (seqnum < cmp) { ++ if (false) ++ fprintf(stderr, "seqnum at offset %i went backward " ++ "from %"PRIu64" to %"PRIu64"!\n", ++ i, cmp, seqnum); ++ } else /*if (seqnum > cmp)*/ { ++ kbase_update_queue_callbacks(k, &k->event_slots[i], ++ seqnum); ++ } ++ ++ /* TODO: Atomic operations? */ ++ k->event_slots[i].last = seqnum; ++ } ++ ++ pthread_mutex_unlock(&k->queue_lock); ++ ++ return ret; ++} ++ ++#endif ++ ++#if PAN_BASE_API < 2 ++static uint8_t ++kbase_latest_slot(uint8_t a, uint8_t b, uint8_t newest) ++{ ++ /* If a == 4 and newest == 5, a will become 255 */ ++ a -= newest; ++ b -= newest; ++ a = MAX2(a, b); ++ a += newest; ++ return a; ++} ++ ++static int ++kbase_submit(kbase k, uint64_t va, unsigned req, ++ struct kbase_syncobj *o, ++ int32_t *handles, unsigned num_handles) ++{ ++ struct util_dynarray buf; ++ util_dynarray_init(&buf, NULL); ++ ++ memcpy(util_dynarray_resize(&buf, int32_t, num_handles), ++ handles, num_handles * sizeof(int32_t)); ++ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ unsigned slot = (req & PANFROST_JD_REQ_FS) ? 
0 : 1; ++ unsigned dep_slots[KBASE_SLOT_COUNT]; ++ ++ uint8_t nr = k->atom_number++; ++ ++ struct base_jd_atom_v2 atom = { ++ .jc = va, ++ .atom_number = nr, ++ .udata.blob[0] = k->job_seq++, ++ }; ++ ++ for (unsigned i = 0; i < KBASE_SLOT_COUNT; ++i) ++ dep_slots[i] = nr; ++ ++ /* Make sure that we haven't taken an atom that's already in use. */ ++ assert(!k->atom_bos[nr].data); ++ k->atom_bos[atom.atom_number] = buf; ++ ++ unsigned handle_buf_size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); ++ kbase_handle *handle_buf = util_dynarray_begin(&k->gem_handles); ++ ++ struct util_dynarray extres; ++ util_dynarray_init(&extres, NULL); ++ ++ /* Mark the BOs as in use */ ++ for (unsigned i = 0; i < num_handles; ++i) { ++ int32_t h = handles[i]; ++ assert(h < handle_buf_size); ++ assert(handle_buf[h].use_count < 255); ++ ++ /* Implicit sync */ ++ if (handle_buf[h].use_count) ++ for (unsigned s = 0; s < KBASE_SLOT_COUNT; ++s) ++ dep_slots[s] = ++ kbase_latest_slot(dep_slots[s], ++ handle_buf[h].last_access[s], ++ nr); ++ ++ handle_buf[h].last_access[slot] = nr; ++ ++handle_buf[h].use_count; ++ ++ if (handle_buf[h].fd != -1) ++ util_dynarray_append(&extres, base_va, handle_buf[h].va); ++ } ++ ++ pthread_mutex_unlock(&k->handle_lock); ++ ++ /* TODO: Better work out the difference between handle_lock and ++ * queue_lock. */ ++ if (o) { ++ pthread_mutex_lock(&k->queue_lock); ++ kbase_syncobj_update_fence(o, nr, atom.udata.blob[0]); ++ pthread_mutex_unlock(&k->queue_lock); ++ } ++ ++ assert(KBASE_SLOT_COUNT == 2); ++ if (dep_slots[0] != nr) { ++ atom.pre_dep[0].atom_id = dep_slots[0]; ++ /* TODO: Use data dependencies? */ ++ atom.pre_dep[0].dependency_type = BASE_JD_DEP_TYPE_ORDER; ++ } ++ if (dep_slots[1] != nr) { ++ atom.pre_dep[1].atom_id = dep_slots[1]; ++ atom.pre_dep[1].dependency_type = BASE_JD_DEP_TYPE_ORDER; ++ } ++ ++ if (extres.size) { ++ atom.core_req |= BASE_JD_REQ_EXTERNAL_RESOURCES; ++ atom.nr_extres = util_dynarray_num_elements(&extres, base_va); ++ atom.extres_list = (uintptr_t) util_dynarray_begin(&extres); ++ } ++ ++ if (req & PANFROST_JD_REQ_FS) ++ atom.core_req |= BASE_JD_REQ_FS; ++ else ++ atom.core_req |= BASE_JD_REQ_CS | BASE_JD_REQ_T; ++ ++ struct kbase_ioctl_job_submit submit = { ++ .nr_atoms = 1, ++ .stride = sizeof(atom), ++ .addr = (uintptr_t) &atom, ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_JOB_SUBMIT, &submit); ++ ++ util_dynarray_fini(&extres); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_JOB_SUBMIT)"); ++ return -1; ++ } ++ ++ return atom.atom_number; ++} ++ ++#else ++static struct kbase_context * ++kbase_context_create(kbase k) ++{ ++ struct kbase_context *c = calloc(1, sizeof(*c)); ++ ++ if (!cs_group_create(k, c)) { ++ free(c); ++ return NULL; ++ } ++ ++ if (!tiler_heap_create(k, c)) { ++ cs_group_term(k, c); ++ free(c); ++ return NULL; ++ } ++ ++ return c; ++} ++ ++static void ++kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx); ++ ++static void ++kbase_context_destroy(kbase k, struct kbase_context *ctx) ++{ ++ kbase_kcpu_queue_destroy(k, ctx); ++ tiler_heap_term(k, ctx); ++ cs_group_term(k, ctx); ++ free(ctx); ++} ++ ++static bool ++kbase_context_recreate(kbase k, struct kbase_context *ctx) ++{ ++ kbase_kcpu_queue_destroy(k, ctx); ++ tiler_heap_term(k, ctx); ++ cs_group_term(k, ctx); ++ ++ if (!cs_group_create(k, ctx)) { ++ free(ctx); ++ return false; ++ } ++ ++ if (!tiler_heap_create(k, ctx)) { ++ free(ctx); ++ return false; ++ } ++ ++ return true; ++} ++ ++static struct kbase_cs ++kbase_cs_bind_noevent(kbase k, struct 
kbase_context *ctx, ++ base_va va, unsigned size, unsigned csi) ++{ ++ struct kbase_cs cs = { ++ .ctx = ctx, ++ .va = va, ++ .size = size, ++ .csi = csi, ++ .latest_flush = (uint32_t *)k->csf_user_reg, ++ }; ++ ++ struct kbase_ioctl_cs_queue_register reg = { ++ .buffer_gpu_addr = va, ++ .buffer_size = size, ++ .priority = 1, ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_REGISTER, ®); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)"); ++ return cs; ++ } ++ ++ union kbase_ioctl_cs_queue_bind bind = { ++ .in = { ++ .buffer_gpu_addr = va, ++ .group_handle = ctx->csg_handle, ++ .csi_index = csi, ++ } ++ }; ++ ++ ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)"); ++ // hack ++ cs.user_io = (void *)1; ++ return cs; ++ } ++ ++ cs.user_io = ++ kbase_mmap(NULL, ++ k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ k->fd, bind.out.mmap_handle); ++ ++ if (cs.user_io == MAP_FAILED) { ++ perror("mmap(CS USER IO)"); ++ cs.user_io = NULL; ++ } ++ ++ return cs; ++} ++ ++static struct kbase_cs ++kbase_cs_bind(kbase k, struct kbase_context *ctx, ++ base_va va, unsigned size) ++{ ++ struct kbase_cs cs = kbase_cs_bind_noevent(k, ctx, va, size, ctx->num_csi++); ++ ++ // TODO: Fix this problem properly ++ if (k->event_slot_usage >= 256) { ++ fprintf(stderr, "error: Too many contexts created!\n"); ++ ++ /* *very* dangerous, but might just work */ ++ --k->event_slot_usage; ++ } ++ ++ // TODO: This is a misnomer... it isn't a byte offset ++ cs.event_mem_offset = k->event_slot_usage++; ++ k->event_slots[cs.event_mem_offset].back = ++ &k->event_slots[cs.event_mem_offset].syncobjs; ++ ++ uint64_t *event_data = k->event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; ++ ++ /* We use the "Higher" wait condition, so initialise to 1 to allow ++ * waiting before writing... */ ++ event_data[0] = 1; ++ /* And reset the error field to 0, to avoid INHERITing faults */ ++ event_data[1] = 0; ++ ++ /* Just a zero-init is fine... 
reads and writes are always paired */ ++ uint64_t *kcpu_data = k->kcpu_event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; ++ kcpu_data[0] = 0; ++ kcpu_data[1] = 0; ++ ++ /* To match the event data */ ++ k->event_slots[cs.event_mem_offset].last = 1; ++ k->event_slots[cs.event_mem_offset].last_submit = 1; ++ ++ return cs; ++} ++ ++static void ++kbase_cs_term(kbase k, struct kbase_cs *cs) ++{ ++ if (cs->user_io) { ++ LOG("unmapping %p user_io %p\n", cs, cs->user_io); ++ munmap(cs->user_io, ++ k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES); ++ } ++ ++ struct kbase_ioctl_cs_queue_terminate term = { ++ .buffer_gpu_addr = cs->va, ++ }; ++ ++ kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_TERMINATE, &term); ++ ++ pthread_mutex_lock(&k->queue_lock); ++ kbase_update_queue_callbacks(k, &k->event_slots[cs->event_mem_offset], ++ ~0ULL); ++ ++ k->event_slots[cs->event_mem_offset].last = ~0ULL; ++ ++ /* Make sure that no syncobjs are referencing this CS */ ++ list_for_each_entry(struct kbase_syncobj, o, &k->syncobjs, link) ++ kbase_syncobj_update(k, o); ++ ++ ++ k->event_slots[cs->event_mem_offset].last = 0; ++ pthread_mutex_unlock(&k->queue_lock); ++} ++ ++static void ++kbase_cs_rebind(kbase k, struct kbase_cs *cs) ++{ ++ struct kbase_cs new; ++ new = kbase_cs_bind_noevent(k, cs->ctx, cs->va, cs->size, cs->csi); ++ ++ cs->user_io = new.user_io; ++ LOG("remapping %p user_io %p\n", cs, cs->user_io); ++ ++ fprintf(stderr, "bound csi %i again\n", cs->csi); ++} ++ ++static bool ++kbase_cs_kick(kbase k, struct kbase_cs *cs) ++{ ++ struct kbase_ioctl_cs_queue_kick kick = { ++ .buffer_gpu_addr = cs->va, ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); ++ return false; ++ } ++ ++ return true; ++} ++ ++#define CS_RING_DOORBELL(cs) \ ++ *((uint32_t *)(cs->user_io)) = 1 ++ ++#define CS_READ_REGISTER(cs, r) \ ++ *((uint64_t *)(cs->user_io + 4096 * 2 + r)) ++ ++#define CS_WRITE_REGISTER(cs, r, v) \ ++ *((uint64_t *)(cs->user_io + 4096 + r)) = v ++ ++static bool ++kbase_cs_submit(kbase k, struct kbase_cs *cs, uint64_t insert_offset, ++ struct kbase_syncobj *o, uint64_t seqnum) ++{ ++ LOG("submit %p, seq %"PRIu64", insert %"PRIu64" -> %"PRIu64"\n", ++ cs, seqnum, cs->last_insert, insert_offset); ++ ++ if (!cs->user_io) ++ return false; ++ ++ if (insert_offset == cs->last_insert) ++ return true; ++ ++#ifndef PAN_BASE_NOOP ++ struct kbase_event_slot *slot = ++ &k->event_slots[cs->event_mem_offset]; ++ ++ pthread_mutex_lock(&k->queue_lock); ++ slot->last_submit = seqnum + 1; ++ ++ if (o) ++ kbase_syncobj_update_fence(o, cs->event_mem_offset, seqnum); ++ pthread_mutex_unlock(&k->queue_lock); ++#endif ++ ++ memory_barrier(); ++ ++ bool active = CS_READ_REGISTER(cs, CS_ACTIVE); ++ LOG("active is %i\n", active); ++ ++ CS_WRITE_REGISTER(cs, CS_INSERT, insert_offset); ++ cs->last_insert = insert_offset; ++ ++ if (false /*active*/) { ++ memory_barrier(); ++ CS_RING_DOORBELL(cs); ++ memory_barrier(); ++ ++ active = CS_READ_REGISTER(cs, CS_ACTIVE); ++ LOG("active is now %i\n", active); ++ } else { ++ kbase_cs_kick(k, cs); ++ } ++ ++ return true; ++} ++ ++static bool ++kbase_cs_wait(kbase k, struct kbase_cs *cs, uint64_t extract_offset, ++ struct kbase_syncobj *o) ++{ ++ if (!cs->user_io) ++ return false; ++ ++ if (kbase_syncobj_wait(k, o)) ++ return true; ++ ++ uint64_t e = CS_READ_REGISTER(cs, CS_EXTRACT); ++ unsigned a = CS_READ_REGISTER(cs, CS_ACTIVE); ++ ++ fprintf(stderr, "CSI %i CS_EXTRACT (%"PRIu64") != %"PRIu64", " ++ "CS_ACTIVE 
(%i)\n", ++ cs->csi, e, extract_offset, a); ++ ++ fprintf(stderr, "fences:\n"); ++ list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { ++ fprintf(stderr, " slot %i: seqnum %"PRIu64"\n", ++ fence->slot, fence->value); ++ } ++ ++ return false; ++} ++ ++static bool ++kbase_kcpu_queue_create(kbase k, struct kbase_context *ctx) ++{ ++#ifdef PAN_BASE_NOOP ++ return false; ++#endif ++ ++ if (ctx->kcpu_init) ++ return true; ++ ++ struct kbase_ioctl_kcpu_queue_new create = {0}; ++ ++ int ret; ++ ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_CREATE, &create); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_CREATE)"); ++ return false; ++ } ++ ++ ctx->kcpu_queue = create.id; ++ ctx->kcpu_init = true; ++ return true; ++} ++ ++static void ++kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx) ++{ ++ if (!ctx->kcpu_init) ++ return; ++ ++ struct kbase_ioctl_kcpu_queue_delete destroy = { ++ .id = ctx->kcpu_queue, ++ }; ++ ++ int ret; ++ ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_DELETE, &destroy); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_DELETE)"); ++ } ++ ++ ctx->kcpu_init = false; ++} ++ ++static bool ++kbase_kcpu_command(kbase k, struct kbase_context *ctx, struct base_kcpu_command *cmd) ++{ ++ int err; ++ bool ret = true; ++ ++ if (!kbase_kcpu_queue_create(k, ctx)) ++ return false; ++ ++ struct kbase_ioctl_kcpu_queue_enqueue enqueue = { ++ .addr = (uintptr_t) cmd, ++ .nr_commands = 1, ++ .id = ctx->kcpu_queue, ++ }; ++ ++ err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); ++ if (err != -1) ++ return ret; ++ ++ /* If the enqueue failed, probably we hit the limit of enqueued ++ * commands (256), wait a bit and try again. ++ */ ++ ++ struct kbase_wait_ctx wait = kbase_wait_init(k, 1000000000); ++ while (kbase_wait_for_event(&wait)) { ++ err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); ++ if (err != -1) ++ break; ++ ++ if (errno != EBUSY) { ++ ret = false; ++ perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_ENQUEUE"); ++ break; ++ } ++ } ++ kbase_wait_fini(wait); ++ ++ return ret; ++} ++ ++static int ++kbase_kcpu_fence_export(kbase k, struct kbase_context *ctx) ++{ ++ struct base_fence fence = { ++ .basep.fd = -1, ++ }; ++ ++ struct base_kcpu_command fence_cmd = { ++ .type = BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, ++ .info.fence.fence = (uintptr_t) &fence, ++ }; ++ ++ return kbase_kcpu_command(k, ctx, &fence_cmd) ? 
fence.basep.fd : -1; ++} ++ ++static bool ++kbase_kcpu_fence_import(kbase k, struct kbase_context *ctx, int fd) ++{ ++ struct base_kcpu_command fence_cmd = { ++ .type = BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, ++ .info.fence.fence = (uintptr_t) &(struct base_fence) { ++ .basep.fd = fd, ++ }, ++ }; ++ ++ return kbase_kcpu_command(k, ctx, &fence_cmd); ++} ++ ++static bool ++kbase_kcpu_cqs_set(kbase k, struct kbase_context *ctx, ++ base_va addr, uint64_t value) ++{ ++ struct base_kcpu_command set_cmd = { ++ .type = BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, ++ .info.cqs_set_operation = { ++ .objs = (uintptr_t) &(struct base_cqs_set_operation_info) { ++ .addr = addr, ++ .val = value, ++ .operation = BASEP_CQS_SET_OPERATION_SET, ++ .data_type = BASEP_CQS_DATA_TYPE_U64, ++ }, ++ .nr_objs = 1, ++ }, ++ }; ++ ++ return kbase_kcpu_command(k, ctx, &set_cmd); ++} ++ ++static bool ++kbase_kcpu_cqs_wait(kbase k, struct kbase_context *ctx, ++ base_va addr, uint64_t value) ++{ ++ struct base_kcpu_command wait_cmd = { ++ .type = BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, ++ .info.cqs_wait_operation = { ++ .objs = (uintptr_t) &(struct base_cqs_wait_operation_info) { ++ .addr = addr, ++ .val = value, ++ .operation = BASEP_CQS_WAIT_OPERATION_GT, ++ .data_type = BASEP_CQS_DATA_TYPE_U64, ++ }, ++ .nr_objs = 1, ++ .inherit_err_flags = 0, ++ }, ++ }; ++ ++ return kbase_kcpu_command(k, ctx, &wait_cmd); ++} ++#endif ++ ++// TODO: Only define for CSF kbases? ++static bool ++kbase_callback_all_queues(kbase k, int32_t *count, ++ void (*callback)(void *), void *data) ++{ ++ pthread_mutex_lock(&k->queue_lock); ++ ++ int32_t queue_count = 0; ++ ++ for (unsigned i = 0; i < k->event_slot_usage; ++i) { ++ struct kbase_event_slot *slot = &k->event_slots[i]; ++ ++ /* There is no need to do anything for idle slots */ ++ if (slot->last == slot->last_submit) ++ continue; ++ ++ struct kbase_sync_link *link = malloc(sizeof(*link)); ++ *link = (struct kbase_sync_link) { ++ .next = NULL, ++ .seqnum = slot->last_submit, ++ .callback = callback, ++ .data = data, ++ }; ++ ++ // TODO: Put insertion code into its own function ++ struct kbase_sync_link **list = slot->back; ++ slot->back = &link->next; ++ assert(!*list); ++ *list = link; ++ ++ ++queue_count; ++ } ++ ++ p_atomic_add(count, queue_count); ++ ++ pthread_mutex_unlock(&k->queue_lock); ++ ++ return queue_count != 0; ++} ++ ++static void ++kbase_mem_sync(kbase k, base_va gpu, void *cpu, size_t size, ++ bool invalidate) ++{ ++#ifdef __aarch64__ ++ /* Valgrind replaces the operations with DC CVAU, which is not enough ++ * for CPU<->GPU coherency. The ioctl can be used instead. */ ++ if (!RUNNING_ON_VALGRIND) { ++ /* I don't that memory barriers are needed here... having the ++ * DMB SY before submit should be enough. TODO what about ++ * dma-bufs? */ ++ if (invalidate) ++ cache_invalidate_range(cpu, size); ++ else ++ cache_clean_range(cpu, size); ++ return; ++ } ++#endif ++ ++ struct kbase_ioctl_mem_sync sync = { ++ .handle = gpu, ++ .user_addr = (uintptr_t) cpu, ++ .size = size, ++ .type = invalidate + (PAN_BASE_API == 0 ? 
0 : 1), ++ }; ++ ++ int ret; ++ ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_SYNC, &sync); ++ if (ret == -1) ++ perror("ioctl(KBASE_IOCTL_MEM_SYNC)"); ++} ++ ++bool ++#if defined(PAN_BASE_NOOP) ++kbase_open_csf_noop ++#elif PAN_BASE_API == 0 ++kbase_open_old ++#elif PAN_BASE_API == 1 ++kbase_open_new ++#elif PAN_BASE_API == 2 ++kbase_open_csf ++#endif ++(kbase k) ++{ ++ k->api = PAN_BASE_API; ++ ++ pthread_mutex_init(&k->handle_lock, NULL); ++ pthread_mutex_init(&k->event_read_lock, NULL); ++ pthread_mutex_init(&k->event_cnd_lock, NULL); ++ pthread_mutex_init(&k->queue_lock, NULL); ++ ++ pthread_condattr_t attr; ++ pthread_condattr_init(&attr); ++ pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); ++ pthread_cond_init(&k->event_cnd, &attr); ++ pthread_condattr_destroy(&attr); ++ ++ list_inithead(&k->syncobjs); ++ ++ /* For later APIs, we've already checked the version in pan_base.c */ ++#if PAN_BASE_API == 0 ++ struct kbase_ioctl_get_version ver = { 0 }; ++ kbase_ioctl(k->fd, KBASE_IOCTL_GET_VERSION, &ver); ++#endif ++ ++ k->close = kbase_close; ++ ++ k->get_pan_gpuprop = kbase_get_pan_gpuprop; ++ k->get_mali_gpuprop = kbase_get_mali_gpuprop; ++ ++ k->alloc = kbase_alloc; ++ k->free = kbase_free; ++ k->import_dmabuf = kbase_import_dmabuf; ++ k->mmap_import = kbase_mmap_import; ++ ++ k->poll_event = kbase_poll_event; ++ k->handle_events = kbase_handle_events; ++ ++#if PAN_BASE_API < 2 ++ k->submit = kbase_submit; ++#else ++ k->context_create = kbase_context_create; ++ k->context_destroy = kbase_context_destroy; ++ k->context_recreate = kbase_context_recreate; ++ ++ k->cs_bind = kbase_cs_bind; ++ k->cs_term = kbase_cs_term; ++ k->cs_rebind = kbase_cs_rebind; ++ k->cs_submit = kbase_cs_submit; ++ k->cs_wait = kbase_cs_wait; ++ ++ k->kcpu_fence_export = kbase_kcpu_fence_export; ++ k->kcpu_fence_import = kbase_kcpu_fence_import; ++ k->kcpu_cqs_set = kbase_kcpu_cqs_set; ++ k->kcpu_cqs_wait = kbase_kcpu_cqs_wait; ++#endif ++ ++ k->syncobj_create = kbase_syncobj_create; ++ k->syncobj_destroy = kbase_syncobj_destroy; ++ k->syncobj_dup = kbase_syncobj_dup; ++ k->syncobj_wait = kbase_syncobj_wait; ++ ++ k->callback_all_queues = kbase_callback_all_queues; ++ ++ k->mem_sync = kbase_mem_sync; ++ ++ for (unsigned i = 0; i < ARRAY_SIZE(kbase_main); ++i) { ++ ++k->setup_state; ++ if (!kbase_main[i].part(k)) { ++ k->close(k); ++ return false; ++ } ++ } ++ return true; ++} +diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c +index e8b6cf73e82..ec6ddb1837d 100644 +--- a/src/panfrost/bifrost/bifrost_compile.c ++++ b/src/panfrost/bifrost/bifrost_compile.c +@@ -404,6 +404,24 @@ bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp) + srcs, channels, nr, nir_dest_bit_size(instr->dest)); + } + ++static bi_index ++bi_load_sysval(bi_builder *b, int sysval, ++ unsigned nr_components, unsigned offset); ++ ++static bi_index ++bi_vertex_id_offset(bi_builder *b, bool offset) ++{ ++ bi_index vtx = bi_vertex_id(b); ++ ++ if (!offset) ++ return vtx; ++ ++ bi_index first = ++ bi_load_sysval(b, PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS, 1, 0); ++ ++ return bi_iadd_u32(b, vtx, first, false); ++} ++ + static void + bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) + { +@@ -419,8 +437,15 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) + bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); + bi_instr *I; + ++ /* The attribute offset field was removed from the compute job payload ++ * in v10. 
*/ ++ bool needs_offset = b->shader->arch >= 10 && ++ b->shader->nir->info.has_transform_feedback_varyings; ++ ++ bi_index vertex_id = bi_vertex_id_offset(b, needs_offset); ++ + if (immediate) { +- I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), ++ I = bi_ld_attr_imm_to(b, dest, vertex_id, + bi_instance_id(b), regfmt, vecsize, + imm_index); + } else { +@@ -431,7 +456,7 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) + else if (base != 0) + idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); + +- I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), ++ I = bi_ld_attr_to(b, dest, vertex_id, bi_instance_id(b), + idx, regfmt, vecsize); + } + +@@ -1878,16 +1903,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) + * and lower here if needed. + */ + case nir_intrinsic_load_vertex_id: +- if (b->shader->malloc_idvs) { +- bi_mov_i32_to(b, dst, bi_vertex_id(b)); +- } else { +- bi_index first = bi_load_sysval(b, +- PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS, +- 1, 0); +- +- bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false); +- } +- ++ bi_mov_i32_to(b, dst, bi_vertex_id_offset(b, !b->shader->malloc_idvs)); + break; + + /* We only use in our transform feedback lowering */ +@@ -2884,7 +2900,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) + break; + + case nir_op_i2i16: +- assert(src_sz == 8 || src_sz == 32); ++ assert(src_sz == 32 || src_sz == 16 || src_sz == 8); + + if (src_sz == 8) + bi_v2s8_to_v2s16_to(b, dst, s0); +@@ -2893,7 +2909,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) + break; + + case nir_op_u2u16: +- assert(src_sz == 8 || src_sz == 32); ++ assert(src_sz == 32 || src_sz == 16 || src_sz == 8); + + if (src_sz == 8) + bi_v2u8_to_v2u16_to(b, dst, s0); +@@ -4976,6 +4992,8 @@ bi_compile_variant_nir(nir_shader *nir, + + bi_validate(ctx, "NIR -> BIR"); + ++ _mesa_hash_table_u64_destroy(ctx->allocated_vec); ++ + /* If the shader doesn't write any colour or depth outputs, it may + * still need an ATEST at the very end! 
*/ + bool need_dummy_atest = +diff --git a/src/panfrost/bifrost/cmdline.c b/src/panfrost/bifrost/cmdline.c +index 2a11486cbed..9a39159e7d4 100644 +--- a/src/panfrost/bifrost/cmdline.c ++++ b/src/panfrost/bifrost/cmdline.c +@@ -130,7 +130,7 @@ compile_shader(int stages, char **files) + shader_types[i] = filename_to_stage(files[i]); + + struct standalone_options options = { +- .glsl_version = 300, /* ES - needed for precision */ ++ .glsl_version = 460, + .do_link = true, + .lower_precision = true + }; +diff --git a/src/panfrost/ci/deqp-panfrost-g610.toml b/src/panfrost/ci/deqp-panfrost-g610.toml +new file mode 100644 +index 00000000000..6bad2fb44de +--- /dev/null ++++ b/src/panfrost/ci/deqp-panfrost-g610.toml +@@ -0,0 +1,11 @@ ++# Basic test set ++[[deqp]] ++deqp = "/deqp/modules/gles2/deqp-gles2" ++caselists = ["/deqp/mustpass/gles2-master.txt"] ++deqp_args = [ ++ "--deqp-surface-width=256", "--deqp-surface-height=256", ++ "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", ++ "--deqp-gl-config-name=rgba8888d24s8ms0", ++] ++version_check = "GL ES 3.1.*git" ++renderer_check = "Mali-G610" +diff --git a/src/panfrost/csf_test/interpret.py b/src/panfrost/csf_test/interpret.py +new file mode 100755 +index 00000000000..081d32d94c9 +--- /dev/null ++++ b/src/panfrost/csf_test/interpret.py +@@ -0,0 +1,1820 @@ ++#!/usr/bin/env python3 ++ ++import os ++import re ++import struct ++import subprocess ++import sys ++ ++try: ++ py_path = os.path.dirname(os.path.realpath(__file__)) + "/../bifrost/valhall" ++except: ++ py_path = "../bifrost/valhall" ++ ++if py_path not in sys.path: ++ sys.path.insert(0, py_path) ++ ++import asm ++import struct ++ ++def ff(val): ++ return struct.unpack("=f", struct.pack("=I", val))[0] ++ ++def ii(val): ++ return struct.unpack("=I", struct.pack("=f", val))[0] ++ ++shaders = { ++ "atomic": """ ++IADD_IMM.i32.reconverge r0, 0x0, #0x0 ++NOP.wait0 ++ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 ++BRANCHZ.eq.reconverge ^r1.h0, offset:1 ++BRANCHZ.eq 0x0, offset:3 ++ATOM1_RETURN.i32.slot0.ainc @r1, u0, offset:0x0 ++IADD_IMM.i32 r0, ^r0, #0x1 ++BRANCHZ.eq.reconverge 0x0, offset:-7 ++NOP.end ++""", ++ "rmw": """ ++IADD_IMM.i32.reconverge r0, 0x0, #0x0 ++ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 ++BRANCHZ.eq.reconverge r1.h0, offset:1 ++BRANCHZ.eq 0x0, offset:6 ++NOP.wait1 ++LOAD.i32.unsigned.slot0.wait0 @r1, u0, offset:0 ++IADD_IMM.i32 r1, ^r1, #0x1 ++STORE.i32.slot1 @r1, u0, offset:0 ++IADD_IMM.i32 r0, ^r0, #0x1 ++BRANCHZ.eq.reconverge 0x0, offset:-9 ++NOP.end ++""", ++ "global_invocation": """ ++IADD_IMM.i32 r0, ^r60, #0x1 ++STORE.i32.slot0.end @r0, u0, offset:0 ++""", ++ "invoc_offset": """ ++LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 ++IADD.s32 r0, u0, ^r0 ++ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 ++IADD.s32 r1, ^r1, u1 ++MOV.i32 r2, u2 ++STORE.i32.slot0.end @r2, ^r0, offset:0 ++""", ++ "invoc_rmw": """ ++LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 ++IADD.s32 r0, u0, ^r0 ++ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 ++IADD.s32 r1, ^r1, u1 ++LOAD.i32.unsigned.slot0.wait0 @r2, r0, offset:0 ++IADD.s32 r2, ^r2, u2 ++STORE.i32.slot1.end @r2, ^r0, offset:0 ++""", ++ ++ "preframe": """ ++U16_TO_U32.discard r0, r59.h00 ++U16_TO_U32 r1, ^r59.h10 ++IADD_IMM.i32 r2, 0x0, #0x1 ++IADD_IMM.i32 r3, 0x0, #0x0 ++TEX_FETCH.slot0.skip.f.32.2d.wait @r4:r5:r6:r7, @r0:r1, ^r2 ++FADD.f32 r4, ^r4, 0x40490FDB ++FADD.f32 r5, ^r5, 0x40490FDB ++BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 ++""", ++ ++ ++ "position": """ ++LEA_BUF_IMM.slot0.wait0 @r4:r5, r59, table:0xD, index:0x0 ++#BRANCHZI.absolute 
0x1000000, ^r4 ++# position of 16384 ++IADD_IMM.i32 r2, 0x0, #0x0e ++# position of 16 ++IADD_IMM.i32 r2, 0x0, #0x04 ++LSHIFT_OR.i32 r0, 0x03020100.b1, r2, 0x0 ++LSHIFT_AND.i32 r0, r60, r2, ^r0 ++IADD_IMM.i32 r1, 0x0, #0x01 ++RSHIFT_AND.i32 r1, r60, 0x03020100.b11, ^r1 ++LSHIFT_OR.i32 r1, ^r1, ^r2, 0x0 ++S32_TO_F32 r0, ^r0 ++S32_TO_F32 r1, ^r1 ++ ++RSHIFT_OR.i32 r2, ^r60, 0x03020100.b22, 0x0 ++S32_TO_F32 r2, ^r2 ++FADD.f32 r0, ^r0, r2.neg ++#FADD.f32 r1, ^r1, ^r2 ++S32_TO_F32 r2, ^r60 ++#MOV.i32 r1, 0x0 ++ ++FADD.f32 r0, ^r0, 0x40490FDB ++FADD.f32 r1, ^r1, 0x40490FDB ++#FMA.f32 r2, ^r2, 0x3DCCCCCD, 0x0 ++MOV.i32 r2, 0x3DCCCCCD ++MOV.i32 r3, 0x0 ++ ++#STORE.i128.slot0 @r0:r1:r2:r3, thread_local_pointer, offset:0 ++ ++IADD_IMM.i32 r8, 0x0, #0x00004000 ++STORE.i16.istream.slot0 @r8, r4, offset:64 ++ ++STORE.i128.istream.slot0 @r0:r1:r2:r3, r4, offset:0 ++STORE.i128.slot0.end @r0:r1:r2:r3, ^r4, offset:0x7000 ++""", ++ ++ "fragment": """ ++ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, u0, offset:0 ++IADD_IMM.i32 r1, 0x0, #0x1ff ++LSHIFT_AND.i32 r0, ^r0, 0x0, ^r1 ++SHADDX.u64 r2, u2, ^r0.w0, shift:0x2 ++STORE.i32.slot0.wait0 @r59, ^r2, offset:0 ++ ++IADD_IMM.i32 r4, 0x0, #0x3f100000 ++IADD_IMM.i32 r5, 0x0, #0x3f400000 ++IADD_IMM.i32 r6, 0x0, #0x3f300000 ++IADD_IMM.i32 r7, 0x0, #0x32cccccd ++BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 ++""", ++ ++} ++ ++flg = 0xf ++#flg = 0x20000f # Uncached! ++ ++HEAP_SIZE = 1024 * 1024 ++ ++memory = { ++ "ev": (8192, 0x8200f), ++ "x": 1024 * 1024, ++ "y": 4096, ++ "ls_alloc": 4096, ++ "occlusion": 4096, ++ ++ "ssbo": 4096, ++ "tls": 4096, ++ ++ #"plane_0": (256 * 256 * 32, 0x380f), # 2 MB ++ "plane_0": (256 * 256 * 32, 0x280f), # 2 MB ++ ++ "idk": HEAP_SIZE, ++ "heap": HEAP_SIZE, ++} ++ ++w = 0xffffffff ++ ++# Words are 32-bit, apart from address references ++descriptors = { ++ "shader": [0x118, 1 << 12, "invoc_rmw"], ++ "ls": [3, 31, "ls_alloc"], ++ "fau": [("ssbo", 0), ("ssbo", 16)], ++ "fau2": [("ev", 8 + (0 << 34)), 7, 0], ++ ++ "tiler_heap": [ ++ 0x029, 1 << 21, #HEAP_SIZE, ++ 0x1000, 0x60, 0x1040, 0x60, 0x1000 + (1 << 21), 0x60 ++ #"heap", ("heap", 64), ("heap", HEAP_SIZE), ++ ], ++ ++} | { ++ x: [ ++ 0, 0, ++ # Hierarchy mask, ++ # Single-sampled ++ # Last provoking vertex ++ 0x6 | (0 << 18), ++ 0x00ff00ff, ++ # Layer ++ 0, 0, ++ "tiler_heap", ++ ("idk", 0x10), ++ #("tiler_heap", -0xfff0), ++ # "Weights" ++ ] + ([0] * (32 - 10)) + [ ++ # "State" ++ 0, ++ 31, ++ 0, ++ 0x10000000, ++ ] for x in ("tiler_ctx", "tiler_ctx2", "tiler_ctx3") ++} | { ++ ++ "thread_storage": [ ++ 1, 31, ++ "tls", ++ 0, 0, ++ ], ++ ++ # Preload r59/r60 ++ "preframe_shader": [0x128, 3 << 11, "preframe"], ++ "position_shader": [0x138, 3 << 11, "position"], ++ "fragment_shader": [0x128, 3 << 11, "fragment"], ++ ++ "idvs_zs": [ ++ 0x70077, # Depth/stencil type, Always for stencil tests ++ 0, 0, # Stencil state ++ 0, # unk ++ # Depth source minimum, write disabled ++ # [0, 1] Depth clamp ++ # Depth function: Always ++ (1 << 23) | (7 << 29), ++ 0, # Depth units ++ 0, # Depth factor ++ 0, # Depth bias clamp ++ ], ++ ++ "preframe_zs": [ ++ 0x70077, # Depth/stencil type, Always for stencil tests ++ 0, 0, # Stencil state ++ 0, # unk ++ # Depth source minimum, write disabled ++ # [0, 1] Depth clamp ++ # Depth function: Always ++ (1 << 23) | (7 << 29), ++ 0, # Depth units ++ 0, # Depth factor ++ 0, # Depth bias clamp ++ ], ++ ++ "idvs_blend": [ ++ # Load dest, enable ++ 1 | (1 << 9), ++ # RGB/Alpha: Src + Zero * Src ++ # All channels ++ ((2 | (2 << 4) | (1 << 8)) * 
0x1001) | (0xf << 28), ++ # Fixed function blending, four components ++ 2 | (3 << 3), ++ # RGBA8 TB pixel format / F32 register format ++ 0 | (237 << 12) | (0 << 22) | (1 << 24), ++ ], ++ ++ "preframe_blend": [ ++ # Load dest, enable ++ 1 | (1 << 9), ++ # RGB/Alpha: Src + Zero * Src ++ # All channels ++ ((2 | (2 << 4) | (1 << 8)) * 0x1001) | (0xf << 28), ++ # Fixed function blending, four components ++ 2 | (3 << 3), ++ # RGBA8 TB pixel format / F32 register format ++ 0 | (237 << 12) | (0 << 22) | (1 << 24), ++ ], ++ ++ "preframe_surface": [ ++ # Plane descriptor, generic, tiled, RAW32 clump format ++ 10 | (1 << 4) | (1 << 8) | (2 << 24), ++ 256 * 256 * 4, ++ "plane_0", ++ 0, ++ 0, 0, ++ 0, # was 15, ++ ], ++ ++ "preframe_table": [ ++ # Texture descriptor, 2D, format ++ 2 | (2 << 4) | (187 << (10 + 12)), ++ # Width, height ++ 255 | (255 << 16), ++ # Swizzle, interleave ++ 1672 | (1 << 12), ++ 0, ++ "preframe_surface", ++ 0, 0, ++ ++ # Sampler descriptor, clamp to edge ++ 1 | (9 << 8) | (9 << 12) | (9 << 16), ++ 0, 0, 0, 0, 0, 0, 0, ++ ], ++ ++ "preframe_resources": [ ++ ("preframe_table", (1 << (32 + 24))), 0x40, 0, ++ ], ++ ++ "dcds": [ ++ # Clean fragment write, primitive barrier ++ (1 << 9) | (1 << 10), ++ # Sample mask of 0xffff, RT mask of 1 ++ 0x1ffff, ++ 0, 0, # vertex array ++ 0, 0, # unk ++ 0, 0x3f800000, # min/max depth ++ 0, 0, # unk ++ "preframe_zs", # depth/stencil ++ ("preframe_blend", 1), # blend (count == 1) ++ 0, 0, # occlusion ++ ++ # Shader environment: ++ 0, # Attribute offset ++ 2, # FAU count ++ 0, 0, 0, 0, 0, 0, # unk ++ ("preframe_resources", 1), # Resources ++ "preframe_shader", # Shader ++ 0, 0, # Thread storage ++ "fau", # FAU ++ ], ++ ++ "framebuffer": [ ++ 1, 0, # Pre/post, downscale, layer index ++ 0x10000, 0, # Argument ++ "ls_alloc", # Sample locations ++ "dcds", # DCDs ++ 0x00ff00ff, # width / height ++ 0, 0x00ff00ff, # bound min/max ++ # 32x32 tile size ++ # 4096 byte buffer allocation (maybe?) ++ (10 << 9) | (4 << 24), ++ 0, # Disable S, ZS/CRC, Empty Tile, CRC ++ 0, # Z Clear ++ "tiler_ctx", # Tiler ++ ++ # Framebuffer padding ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ ++ # Render target ++ # R8G8B8A8 internal format ++ (1 << 26), ++ # Write Enable ++ # R8G8B8A8 colour format ++ # Linear block format ++ # 0123 swizzle ++ # Clean pixel write enable ++ 1 | (19 << 3) | (1 << 8) | (0o3210 << 16) | (1 << 31), ++ ++ # AFBC overlay ++ # No YTR, no split, no wide, no reverse, no front, no alpha ++ # RGBA8 compression mode ++ 0 | (10 << 10), ++ 0, 0, 0, 0, 0, ++ ++ # RT Buffer ++ "plane_0", ++ 256 * 4 * 16, # Row stride (for tiling) ++ 0x400, # Surface stride / Body offset ++ ++ # RT Clear ++ 0x2e234589, 0, 0, 0, ++ ], ++ ++ "index_buffer": [ ++ 0, 1, 2, ++ 0, 2, 1, ++ 1, 0, 2, ++ 1, 2, 0, ++ 2, 0, 1, ++ 2, 1, 0, ++ ++ #63, 64, 65, ++ 1, 2, 3, ++ 4, 5, 6, ++ 12, 13, 14, ++ 0, 1, 2, ++ 4, 5, 6, ++ 8, 9, 10, ++ 3, 4, 5, ++ ], ++ ++ "point_index": [x * 4 for x in range(32)] + [ ++ 0, 64, 440, 0, ++ ], ++ ++ "position_data": [ ++ ii(10.0), ii(10.0), ii(1.0), ii(1.0), ++ ], ++} ++ ++# TODO: Use mako? Or just change the syntax for "LDM/STM" ++# and use f-strings again? 
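++
++# Editor's note: illustrative sketch, not part of the original patch. The
++# descriptor tables above are plain lists of 32-bit words; string entries
++# (or (name, offset) tuples) turn into relocations that get patched in
++# later by Context.add_descriptors(), which pairs consecutive words into
++# 64-bit buffer entries roughly as the hypothetical helper below does
++# (it is never called anywhere).
++def _pack_words_example(words):
++    """Pair 32-bit words into little-endian 64-bit buffer entries."""
++    assert len(words) % 2 == 0
++    it = iter(words)
++    return [lo | (hi << 32) for lo, hi in zip(it, it)]
++
++# For example, the first two words of the "shader" descriptor pack as:
++#   _pack_words_example([0x118, 1 << 12]) == [0x100000000118]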
++ ++cmds = """ ++!cs 0 ++resources fragment ++ ++@ Bound min ++mov w2a, i16:0,0 ++@ Bound max ++mov w2b, i16:255,255 ++mov x28, $framebuffer+1 ++ ++slot 2 ++ ++fragment ++ ++mov w4a, #0x0 ++UNK 02 24, #0x4a0000ff0211 ++wait 1 ++ ++mov x50, $ev ++evstr w5f, [x50], unk 0xfd, irq ++ ++!raw sleep 20 ++!memset plane_0 0 0 262144 ++!raw sleep 200 ++!dump plane_0 0 12 ++!heatmap plane_0 0 262144 gran 4096 len 32768 stride 32768 ++""" ++ ++altcmds = """ ++!cs 0 ++ ++@ Some time is required for the change to become active ++@ Just submitting a second job appears to be enough ++resources compute fragment tiler idvs ++mov x48, #0x6000000000 ++heapctx x48 ++ ++!cs 0 ++ ++slot 3 ++wait 3 ++heapinc vt_start ++ ++@ Base vertex count ++mov w24, 0 ++@ Instance count ++mov w22, 1 ++ ++@ Vertex attribute stride ++mov x30, 0 ++ ++@ Primitive ++mov w38, 0x430000 ++@@ Draw ++@ Pixel kill etc. ++@ Enable occlusion query ++@mov w39, 0xc000 ++mov w39, 0 ++@ Unk... ++mov w26, 0x1000 ++@ Sample mask / render target mask ++mov w3a, 0x1ffff ++@ Min/max Z ++mov w2c, float:0 ++mov w2d, float:1.0 ++@ Depth/stencil ++mov x34, $idvs_zs ++@ Blend ++mov x32, $idvs_blend+1 ++@ Occlusion ++mov x2e, $occlusion ++ ++@ Primitive size ++mov x3c, float:3.75 ++@ Fragment shader environment ++mov x14, $fragment_shader ++@ FAU count == 2 ++movp x0c, $fau+0x0200000000000000 ++ ++@ Position shader environment ++mov x10, $position_shader ++ ++mov x18, $thread_storage ++ ++@ is this right?! "Vertex attribute stride" apparently? ++@ that was for pure tiler jobs, for idvs it messes up points/lines ++@ for some reason ++@mov x30, $position_data ++ ++@ Tiler ++mov x28, $tiler_ctx ++ ++@ Scissor min ++mov w2a, i16:0,0 ++@ Scissor max ++mov w2b, i16:255,255 ++ ++mov w21, 18 ++mov w27, 4096 ++mov x36, $index_buffer ++ ++idvs 0x4002, mode triangles, index uint32 ++ ++mov w21, 1 @36 ++mov w27, 4096 ++mov x36, $point_index ++ ++@idvs 0x4a42, mode points, index uint32 ++ ++mov w21, 400000 ++mov w21, 18 ++@idvs 0x4a42, mode triangles, index none ++ ++@idvs 0x4a42, mode points, index none ++@idvs 0x4a42, mode line-loop, index none ++ ++flush_tiler ++wait 3 ++heapinc vt_end ++ ++mov x50, $ev ++evstr w5f, [x50], unk 0xfd, irq ++ ++UNK 00 24, #0x5f0000000233 ++wait all ++ ++!dump64 tiler_heap 0 4096 ++@!dump idk 0 1048576 ++@!dump position_data 0 4096 ++ ++!cs 0 ++ ++UNK 00 24, #0x5f0000000233 ++wait all ++ ++slot 4 ++wait 4 ++heapinc vt_start ++ ++mov x28, $tiler_ctx2 ++idvs 0x4002, mode triangles, index none ++flush_tiler ++wait 4 ++heapinc vt_end ++ ++UNK 00 24, #0x5f0000000233 ++wait all ++ ++mov x50, $ev ++evstr w5f, [x50], unk 0xfd, irq ++ ++!dump64 tiler_heap 0 4096 ++ ++!cs 0 ++ ++mov x50, $ev ++ ++@ Bound min ++mov w2a, i16:0,0 ++@ Bound max ++mov w2b, i16:255,255 ++mov x28, $framebuffer+1 ++@ Tile enable map ++mov x2c, $x ++mov x2e, 64 ++ ++mov w40, 1 ++str w40, [x2c] ++@str w40, [x2c, 128] ++ ++@ Use tile enable map ++@fragment tem 1 ++ ++fragment ++ ++@ Does this actually do anytihng? 
++mov x48, $tiler_ctx ++ldr x4a, [x48, 40] ++ldr x4c, [x48, 48] ++wait 0,4 ++UNK 02 0b, 0x4a4c00100001 ++ ++mov x48, $tiler_ctx2 ++ldr x4a, [x48, 40] ++ldr x4c, [x48, 48] ++wait 0,4 ++UNK 02 0b, 0x4a4c00100001 ++ ++UNK 02 24, #0x5f0000f80211 ++@UNK 00 24, #0x5f0000000233 ++wait 1 ++ ++mov x54, $plane_0 ++ldr x56, [x54] ++wait 0 ++ ++mov x52, $y ++str x56, [x52] ++ ++evstr w5f, [x50], unk 0xfd, irq ++ ++!raw td ++!fdump heap 0 1048576 ++!tiler heap 0 1048576 ++ ++ ++@!dump rt_buffer 0 4096 ++!dump y 0 4096 ++@!dump plane_0 0 524288 ++@!heatmap plane_0 0 524288 gran 0x80 len 0x200 stride 0x4000 ++!heatmap plane_0 0 8192 gran 0x04 len 0x20 stride 0x400 ++!dump occlusion 0 4096 ++@!dump ssbo 0 4096 ++ ++!dump64 tiler_heap 0 4096 ++!dump tiler_ctx 0 4096 ++!dump tiler_ctx2 0 4096 ++ ++@!fdump heap 0 1048576 ++ ++!cs 0 ++ ++slot 3 ++wait 3 ++heapinc vt_start ++ ++mov x28, $tiler_ctx3 ++mov w2c, float:0 ++mov w2d, float:1.0 ++mov x2e, $occlusion ++ ++idvs 0x4002, mode triangles, index none ++flush_tiler ++wait 3 ++heapinc vt_end ++ ++UNK 00 24, #0x5f0000000233 ++wait all ++ ++mov x50, $ev ++evstr w5f, [x50], unk 0xfd, irq ++ ++!dump64 tiler_heap 0 4096 ++!dump tiler_ctx 0 4096 ++!raw td ++ ++""" ++ ++docopy = """ ++ldr {w00-w0f}, [x52] ++ldr {w10-w1f}, [x52, 64] ++ldr {w20-w2f}, [x52, 128] ++ldr {w30-w3f}, [x52, 192] ++add x52, x52, 256 ++ ++loop: ++wait 0 ++ ++str {w00-w0f}, [x54] ++ldr {w00-w0f}, [x52] ++str {w10-w1f}, [x54, 64] ++ldr {w10-w1f}, [x52, 64] ++str {w20-w2f}, [x54, 128] ++ldr {w20-w2f}, [x52, 128] ++str {w30-w3f}, [x54, 192] ++ldr {w30-w3f}, [x52, 192] ++ ++add x54, x54, 256 ++add x52, x52, 256 ++add x50, x50, -256 ++ ++b.ne w50, loop ++b.ne w51, loop ++""" ++ ++oldcmds = f""" ++!cs 0 ++ ++mov x50, 0x8000000 ++ ++mov x52, $from ++mov x54, $to ++mov x56, $x ++mov x58, $ev ++mov x5a, $y ++ ++str cycles, [x56] ++{docopy} ++str cycles, [x56, 8] ++ ++UNK 00 24, #0x5f0000000233 ++evstr w5f, [x58], unk 0xfd, irq ++ ++!cs 1 ++ ++mov x50, 0x8000000 ++ ++mov x52, $from ++mov x54, $to ++mov x56, $x ++mov x58, $ev ++mov x5a, $y ++ ++add x52, x52, 0x8000000 ++add x54, x54, 0x8000000 ++add x56, x56, 32 ++ ++nop ++nop ++ ++str cycles, [x56] ++{docopy} ++str cycles, [x56, 8] ++ ++UNK 00 24, #0x5f0000000233 ++evstr w5f, [x58], unk 0xfd, irq ++ ++!delta x 0 4096 ++""" ++ ++oldcmds = """ ++!cs 0 ++endpt compute ++!cs 0 ++ ++@ Workgroup size 1x1x1, merging allowed ++mov w21, 0x80000000 ++ ++@ Workgroup count 1x1x1 ++mov w25, 1 ++mov w26, 1 ++mov w27, 1 ++ ++@ Offset 0,0,0 ++mov w22, 0 ++mov w23, 0 ++mov w24, 0 ++ ++@ TODO: offset x/y/z ++ ++@ Resources ++mov x06, 0 ++ ++@ Shader ++mov x16, $shader ++ ++@ Local storage ++mov x1e, $ls ++ ++@ FAU ++movp x0e, $fau+0x0200000000000000 ++ ++slot 2 ++wait 2 ++ ++UNK 0400000000008200 ++ ++mov x58, $fau ++ldr x56, [x58] ++wait 0 ++ ++@mov w4a, 0 ++ ++@slot 6 ++@mov x54, $x ++@UNK 02 24, #0x4a0000f80211 ++@ldr x52, [x56] ++@wait 0,1 ++@str x52, [x54] ++ ++mov w40, 60 ++1: add w40, w40, -1 ++ ++@mov w4a, #0x0 ++@UNK 02 24, #0x4a0000f80211 ++@wait 1 ++ ++@mov w54, #0 ++@UNK 00 24, #0x540000000233 ++@wait all ++ ++slot 2 ++wait 2 ++ ++add w22, w22, 1 ++@UNK 0400ff0000008200 ++ ++@b.ne w40, 1b ++ ++!dump x 0 4096 ++!dump y 0 4096 ++!dump ev 0 4096 ++""" ++ ++oldcmds = """ ++!cs 0 ++ ++mov x48, $x ++ ++mov w21, 0x80000000 ++mov w25, 1 ++mov w26, 1 ++mov w27, 1 ++ ++movp x0e, $fau+0x0200000000000000 ++ ++@ Write FAUs ++@add x0e, x48, 64 ++@mov x50, $ev ++@str x50, [x0e] ++@mov x30, 10 ++@str x30, [x0e, 8] ++@add w0f, w0f, 0x02000000 ++ ++@ Write shader descriptor 
++@add x16, x48, 128 ++@mov x30, 0x118 ++@str x30, [x16] ++@mov x30, $compute ++@str x30, [x16, 8] ++ ++wait 0 ++ ++add x1e, x48, 192 ++ ++mov x30, $y ++@regdump x30 ++@mov x30, 0 ++ ++resources compute ++slot 2 ++mov w54, #0xffffe0 ++UNK 00 24, #0x540000000233 ++ ++wait all ++ ++mov x54, 0 ++mov w56, 0 ++mov w5d, 1 ++ ++slot 2 ++wait 2 ++wait 2 ++regdump x30 ++UNK 0400ff0000008200 ++add x30, x30, 0x200 ++regdump x30 ++slot 2 ++wait 2 ++ ++mov w40, 1000 ++1: add w40, w40, -1 ++str cycles, [x50, 32] ++b.ne w40, 1b ++ ++wait 0 ++wait all ++ ++@ 6 / 10 / 14 ++mov w40, 1 ++1: add w40, w40, -1 ++UNK 0400ff0000000200 ++b.ne w40, 1b ++ ++mov w40, 1000 ++1: add w40, w40, -1 ++str cycles, [x50, 32] ++b.ne w40, 1b ++ ++mov w42, 200 ++mov w40, 100 ++1: add w40, w40, -1 ++@wait all ++@UNK 0400ff0000008001 @ compute ++ ++@UNK 0400ff0000000001 ++@UNK 2501504200000004 @ evadd ++@UNK 3 24, #0x4a0000000211 ++ ++@wait all ++b.ne w40, 1b ++ ++@UNK 2601504200000004 ++ ++str cycles, [x50, 40] ++str cycles, [x50, 48] ++UNK 02 24, #0x4a0000000211 ++wait 0 ++ ++add x5c, x50, 64 ++evadd w5e, [x5c], unk 0xfd ++evadd w5e, [x5c], unk 0xfd, irq, unk0 ++ ++!dump x 0 4096 ++!dump y 0 4096 ++!delta ev 0 4096 ++""" ++ ++altcmds = """ ++!cs 0 ++!alloc x 4096 ++!alloc ev 4096 0x8200f ++!alloc ev2 4096 0x8200f ++ ++mov x10, $x ++UNK 00 30, #0x100000000000 ++add x12, x10, 256 ++str cycles, [x12] ++mov x5a, $ev2 ++mov x48, 0 ++mov w4a, 0 ++slot 3 ++wait 3 ++UNK 00 31, 0 ++mov x48, $ev ++mov w4a, 0x4321 ++add x46, x48, 64 ++mov w42, 0 ++ ++str cycles, [x12, 8] ++UNK 01 26, 0x484a00000005 ++str cycles, [x12, 16] ++UNK 01 26, 0x484a00000005 ++str cycles, [x12, 24] ++ ++nop ++ ++mov w10, 10000 ++1: ++UNK 01 26, 0x484a00000005 ++add w10, w10, -1 ++b.ne w10, 1b ++str cycles, [x12, 32] ++ ++mov w10, 10000 ++1: ++UNK 01 26, 0x484a00000005 ++@UNK 02 24, #0x420000000211 ++add w10, w10, -1 ++b.ne w10, 1b ++str cycles, [x12, 40] ++ ++ldr x16, [x48, 0] ++wait 0 ++str x16, [x48, 16] ++ ++UNK 00 31, 0x100000000 ++ ++mov w4a, #0x0 ++UNK 02 24, #0x4a0000000211 ++ ++mov w5e, 1 ++add x5c, x5a, 0x100 ++UNK 01 25, 0x5c5e00f80001 ++ ++!delta x 0 4096 ++!dump ev 0 4096 ++!dump ev2 0 4096 ++""" ++ ++altcmds = """ ++!cs 0 ++!alloc x 4096 ++!alloc ev 4096 0x8200f ++ ++iter vertex ++slot 2 ++ ++mov x40, $x ++mov w10, 1 ++mov x48, 0 ++mov w4a, 0 ++call w4a, x48 ++ nop ++ nop ++ nop ++ mov x20, $. 
++@ movp x22, 0x0126000011223344 ++ movp x22, 0x1600000060000001 ++ str x22, [x20, 56] ++ 1: nop ++ b 1b ++ nop ++ add x40, x40, #256 ++ regdump x40 ++ ++mov x5a, #0x5ff7fd6000 ++mov x48, $ev ++mov x40, #0x5ff7fd6000 ++mov w54, #0x1 ++UNK 00 24, #0x540000000233 ++wait 0 ++slot 6 ++@UNK 00 31, #0x0 ++UNK 00 09, #0x0 ++wait 6 ++@UNK 00 31, #0x100000000 ++mov x4a, x40 ++UNK 01 26, 0x484a00040001 ++ ++!dump x 0 4096 ++@!dump ev 0 4096 ++@!delta x 0 4096 ++""" ++ ++cycletest = """ ++mov w10, 10 ++1: ++str cycles, [x5c] ++add x5c, x5c, 8 ++add w10, w10, -1 ++mov w11, 100000 ++ ++inner: ++add w11, w11, -1 ++b.ne w11, inner ++ ++b.ne w10, 1b ++""" ++ ++def get_cmds(cmd): ++ return cmds.replace("{cmd}", str(cmd)) ++ ++def assemble_shader(text): ++ lines = text.strip().split("\n") ++ lines = [l for l in lines if len(l) > 0 and l[0] not in "#@"] ++ return [asm.parse_asm(ln) for ln in lines] ++ ++class Buffer: ++ id = 0 ++ ++ def __init__(self): ++ self.id = Buffer.id ++ Buffer.id += 1 ++ ++def resolve_rel(to, branch): ++ return (to - branch) // 8 - 1 ++ ++def to_int16(value): ++ assert(value < 36768) ++ assert(value >= -32768) ++ return value & 0xffff ++ ++class Level(Buffer): ++ def __init__(self, indent): ++ super().__init__() ++ ++ self.indent = indent ++ self.buffer = [] ++ self.call_addr_offset = None ++ self.call_len_offset = None ++ ++ self.labels = {} ++ self.label_refs = [] ++ # Numeric labels can be reused, so have to be handled specially. ++ self.num_labels = {} ++ self.num_refs = {} ++ ++ def offset(self): ++ return len(self.buffer) * 8 ++ ++ def __repr__(self): ++ buf = " ".join(hex(x) for x in self.buffer) ++ return f"buffer {self.id} {self.offset()} 0x200f {buf}" ++ ++ def buffer_add_value(self, offset, value): ++ self.buffer[offset // 8] += value ++ ++ def process_relocs(self, refs, to=None): ++ for ref, offset, type_ in refs: ++ assert(type_ == "rel") ++ ++ if to is None: ++ goto = self.labels[ref] ++ else: ++ goto = to ++ ++ value = to_int16(resolve_rel(goto, offset)) ++ self.buffer_add_value(offset, value) ++ ++ def finish(self): ++ self.process_relocs(self.label_refs) ++ ++class Alloc(Buffer): ++ def __init__(self, size, flags=0x280f): ++ super().__init__() ++ ++ self.size = size ++ self.flags = flags ++ self.buffer = [] ++ ++ def __repr__(self): ++ buf = " ".join(hex(x) for x in self.buffer) ++ return f"buffer {self.id} {self.size} {hex(self.flags)} {buf}" ++ ++def fmt_reloc(r, name="reloc"): ++ dst, offset, src, src_offset = r ++ return f"{name} {dst}+{offset} {src}+{src_offset}" ++ ++def fmt_exe(e): ++ return " ".join(str(x) for x in e) ++ ++class Context: ++ def __init__(self): ++ self.levels = [] ++ self.l = None ++ ++ self.allocs = {} ++ self.completed = [] ++ self.reloc = [] ++ self.reloc_split = [] ++ ++ self.exe = [] ++ self.last_exe = None ++ ++ self.is_call = False ++ ++ def set_l(self): ++ if len(self.levels): ++ self.l = self.levels[-1] ++ ++ def pop_until(self, indent): ++ while self.l.indent != indent: ++ l = self.levels.pop() ++ self.completed.append(l) ++ ++ self.set_l() ++ if not len(self.levels): ++ return ++ ++ buf_len = l.offset() ++ ++ r = self.l ++ self.reloc.append((r.id, r.call_addr_offset * 8, l.id, 0)) ++ r.buffer[r.call_len_offset] = ( ++ (r.buffer[r.call_len_offset] & (0xffff << 48)) + ++ buf_len) ++ r.buffer[r.call_addr_offset] &= (0xffff << 48) ++ ++ r.call_addr_offset = None ++ r.call_len_offset = None ++ ++ def flush_exe(self): ++ ind = self.levels[0].indent ++ ++ self.pop_until(ind) ++ if len(self.levels[0].buffer): ++ l = self.levels.pop() ++ 
l.finish() ++ self.completed.append(l) ++ ++ self.levels.append(Level(ind)) ++ self.set_l() ++ ++ if not len(self.exe): ++ return ++ ++ if self.last_exe is None: ++ print("# Trying to add multiple CSs to an exe line, becoming confused") ++ return ++ ++ if len(self.completed): ++ p = self.completed[-1] ++ assert(p.indent == ind) ++ ++ self.exe[self.last_exe] += [p.id, p.offset()] ++ ++ self.last_exe = None ++ ++ def add_shaders(self, shaders): ++ for sh in shaders: ++ qwords = assemble_shader(shaders[sh]) ++ sh = sh.lower() ++ ++ a = Alloc(len(qwords) * 8, flags=0x2017) ++ a.buffer = qwords ++ self.allocs[sh] = a ++ ++ def add_memory(self, memory): ++ for m in memory: ++ f = memory[m] ++ if isinstance(f, int): ++ size, flags = f, 0x280f ++ else: ++ size, flags = f ++ self.allocs[m] = Alloc(size, flags) ++ ++ def add_descriptors(self, descriptors): ++ for d in descriptors: ++ words = descriptors[d] ++ a = Alloc(0) ++ ++ buf = [] ++ for w in words: ++ if isinstance(w, int): ++ buf.append(w) ++ else: ++ if isinstance(w, str): ++ alloc, offset = w, 0 ++ else: ++ alloc, offset = w ++ ref = self.allocs[alloc] ++ self.reloc.append((a.id, len(buf) * 4, ++ ref.id, offset)) ++ buf.append(0) ++ buf.append(0) ++ ++ it = iter(buf) ++ a.buffer = [x | (y << 32) for x, y in zip(it, it)] ++ a.size = len(a.buffer) * 8 ++ self.allocs[d] = a ++ ++ def interpret(self, text): ++ text = text.split("\n") ++ ++ old_indent = None ++ ++ for orig_line in text: ++ #print(orig_line, file=sys.stderr) ++ ++ line = orig_line.split("@")[0].expandtabs().rstrip().lower() ++ if not line: ++ continue ++ ++ indent = len(line) - len(line.lstrip()) ++ line = line.lstrip() ++ ++ if old_indent is None: ++ self.levels.append(Level(indent)) ++ elif indent != old_indent: ++ if indent > old_indent: ++ assert(self.is_call) ++ ++ self.levels.append(Level(indent)) ++ else: ++ self.pop_until(indent) ++ ++ self.set_l() ++ ++ old_indent = indent ++ self.is_call = False ++ ++ given_code = None ++ ++ # TODO: Check against this to test the disassembler? 
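++            # Editor's note (illustrative, not from the original patch):
++            # every assembled CS instruction is one 64-bit word, built
++            # below as code = (cmd << 56) | (addr << 48) | value, i.e.
++            # opcode in bits 63:56, a register/selector byte in bits
++            # 55:48 and a 48-bit immediate. A line beginning with 16 hex
++            # digits supplies the expected encoding, which is checked
++            # against the assembled word (the "Mismatch!" print below).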
++ if re.match(r"[0-9a-f]{16} ", line): ++ given_code = int(line[:16], 16) ++ line = line[16:].lstrip() ++ ++ s = [x.strip(",") for x in line.split()] ++ ++ if s[0].endswith(":") or (len(s) == 1 and is_num(s[0])): ++ label = s[0] ++ if s[0].endswith(":"): ++ label = label[:-1] ++ ++ if is_num(label): ++ label = int(label) ++ if label in self.l.num_refs: ++ self.l.process_relocs(self.l.num_refs[label], self.l.offset()) ++ del self.l.num_refs[label] ++ self.l.num_labels[label] = self.l.offset() ++ else: ++ if label in self.l.labels: ++ print("Label reuse is not supported for non-numeric labels") ++ self.l.labels[label] = self.l.offset() ++ ++ s = s[1:] ++ if not len(s): ++ continue ++ ++ for i in range(len(s)): ++ if s[i].startswith("$"): ++ name, *offset = s[i][1:].split("+") ++ if name == ".": ++ buf = self.l ++ else: ++ buf = self.allocs[name] ++ if len(offset): ++ assert(len(offset) == 1) ++ offset = int(offset[0], 0) ++ else: ++ offset = 0 ++ ++ if s[0] == "movp": ++ rels = self.reloc_split ++ else: ++ rels = self.reloc ++ ++ rels.append((self.l.id, self.l.offset(), ++ buf.id, offset)) ++ s[i] = "#0x0" ++ ++ def is_num(str): ++ return re.fullmatch(r"[0-9]+", str) ++ ++ def hx(word): ++ return int(word, 16) ++ ++ def reg(word): ++ return hx(word[1:]) ++ ++ def val(word): ++ if word.startswith("float:"): ++ return ii(float(word.split(":")[1])) ++ elif word.startswith("i16:"): ++ lo, hi = word.split(":")[1].split(",") ++ lo, hi = val(lo), val(hi) ++ assert(lo < (1 << 16)) ++ assert(hi < (1 << 16)) ++ return (lo & 0xffff) | (hi << 16) ++ ++ value = int(word.strip("#"), 0) ++ assert(value < (1 << 48)) ++ return value ++ ++ sk = True ++ ++ if s[0] == "!cs": ++ assert(len(s) == 2) ++ self.flush_exe() ++ self.last_exe = len(self.exe) ++ self.exe.append(["exe", int(s[1])]) ++ continue ++ elif s[0] == "!parallel": ++ assert(len(s) == 2) ++ self.flush_exe() ++ self.last_exe = len(self.exe) - 1 ++ self.exe[-1] += [int(s[1])] ++ continue ++ elif s[0] == "!alloc": ++ assert(len(s) == 3 or len(s) == 4) ++ alloc_id = s[1] ++ size = int(s[2]) ++ flags = val(s[3]) if len(s) == 4 else 0x280f ++ self.allocs[alloc_id] = Alloc(size, flags) ++ continue ++ elif s[0] in ("!dump", "!dump64", "!fdump", "!delta", "!tiler"): ++ assert(len(s) == 4) ++ alloc_id = s[1] ++ offset = val(s[2]) ++ size = val(s[3]) ++ mode = { ++ "!dump": "hex", ++ "!dump64": "hex64", ++ "!fdump": "filehex", ++ "!delta": "delta", ++ "!tiler": "tiler", ++ }[s[0]] ++ self.exe.append(("dump", self.allocs[alloc_id].id, ++ offset, size, mode)) ++ continue ++ elif s[0] == "!heatmap": ++ assert(len(s) == 10) ++ assert(s[4] == "gran") ++ assert(s[6] == "len") ++ assert(s[8] == "stride") ++ alloc_id = s[1] ++ offset = val(s[2]) ++ size = val(s[3]) ++ granularity = val(s[5]) ++ length = val(s[7]) ++ stride = val(s[9]) ++ mode = "heatmap" ++ self.exe.append(("heatmap", self.allocs[alloc_id].id, ++ offset, size, granularity, length, stride)) ++ continue ++ elif s[0] == "!memset": ++ assert(len(s) == 5) ++ alloc_id = s[1] ++ offset = val(s[2]) ++ value = val(s[3]) ++ size = val(s[4]) ++ self.exe.append(("memset", self.allocs[alloc_id].id, ++ offset, value, size)) ++ continue ++ elif s[0] == "!raw": ++ self.exe.append(s[1:]) ++ continue ++ elif s[0] == "movp": ++ assert(len(s) == 3) ++ assert(s[1][0] == "x") ++ addr = reg(s[1]) ++ # Can't use val() as that has a max of 48 bits ++ value = int(s[2].strip("#"), 0) ++ ++ self.l.buffer.append((2 << 56) | (addr << 48) | (value & 0xffffffff)) ++ self.l.buffer.append((2 << 56) | ((addr + 1) << 48) ++ | ((value 
>> 32) & 0xffffffff)) ++ continue ++ elif s[0] == "regdump": ++ assert(len(s) == 2) ++ assert(s[1][0] == "x") ++ dest = reg(s[1]) ++ ++ # Number of registers to write per instruction ++ regs = 16 ++ ++ cmd = 21 ++ value = (dest << 40) | (((1 << regs) - 1) << 16) ++ ++ for i in range(0, 0x60, regs): ++ code = (cmd << 56) | (i << 48) | value | (i << 2) ++ self.l.buffer.append(code) ++ ++ del cmd, value ++ continue ++ ++ elif s[0] == "unk": ++ if len(s) == 2: ++ h = hx(s[1]) ++ cmd = h >> 56 ++ addr = (h >> 48) & 0xff ++ value = h & 0xffffffffffff ++ else: ++ assert(len(s) == 4) ++ cmd = hx(s[2]) ++ addr = hx(s[1]) ++ value = val(s[3]) ++ elif s[0] == "nop": ++ if len(s) == 1: ++ addr = 0 ++ value = 0 ++ cmd = 0 ++ else: ++ assert(len(s) == 3) ++ addr = hx(s[1]) ++ value = val(s[2]) ++ cmd = 0 ++ elif s[0] == "mov" and s[2][0] in "xw": ++ # This is actually an addition command ++ assert(len(s) == 3) ++ assert(s[1][0] == s[2][0]) ++ cmd = { "x": 17, "w": 16 }[s[1][0]] ++ addr = reg(s[1]) ++ value = reg(s[2]) << 40 ++ elif s[0] == "mov": ++ assert(len(s) == 3) ++ cmd = { "x": 1, "w": 2 }[s[1][0]] ++ addr = reg(s[1]) ++ value = val(s[2]) ++ elif s[0] == "add": ++ assert(len(s) == 4) ++ assert(s[1][0] == s[2][0]) ++ assert(s[1][0] in "wx") ++ cmd = 16 if s[1][0] == "w" else 17 ++ addr = reg(s[1]) ++ value = (reg(s[2]) << 40) | (val(s[3]) & 0xffffffff) ++ elif s[0] == "resources": ++ assert(len(s) >= 2) ++ types = ["compute", "fragment", "tiler", "idvs"] ++ cmd = 34 ++ addr = 0 ++ value = 0 ++ for t in s[1:]: ++ if t in types: ++ value |= 1 << types.index(t) ++ else: ++ value |= int(t, 0) ++ elif s[0] == "fragment": ++ cmd = 7 ++ addr = 0 ++ value = 0 ++ if len(s) != 1: ++ arg_map = { ++ "tem": {"0": 0, "1": 1}, ++ "render": { ++ "z_order": 0, ++ "horizontal": 0x10, ++ "vertical": 0x20, ++ "reverse_horizontal": 0x50, ++ "reverse_vertical": 0x60, ++ }, ++ "unk": {"0": 0, "1": 1 << 32}, ++ } ++ for arg, val in zip(s[1::2], s[2::2]): ++ value |= arg_map[arg][val] ++ elif s[0] == "wait": ++ assert(len(s) == 2) ++ cmd = 3 ++ addr = 0 ++ if s[1] == "all": ++ value = 255 ++ else: ++ value = sum(1 << int(x) for x in s[1].split(",")) ++ value <<= 16 ++ elif s[0] == "slot": ++ assert(len(s) == 2) ++ cmd = 23 ++ addr = 0 ++ value = int(s[1], 0) ++ elif s[0] == "add": ++ # TODO: unk variant ++ assert(len(s) == 4) ++ assert(s[1][0] == "x") ++ assert(s[2][0] == "x") ++ cmd = 17 ++ addr = reg(s[1]) ++ v = val(s[3]) ++ assert(v < (1 << 32)) ++ assert(v >= (-1 << 31)) ++ value = (reg(s[2]) << 40) | (v & 0xffffffff) ++ elif s[0] == "idvs": ++ assert(len(s) == 6) ++ unk = val(s[1]) ++ assert(s[2] == "mode") ++ modes = { ++ "none": 0, ++ "points": 1, ++ "lines": 2, ++ "line-strip": 4, ++ "line-loop": 6, ++ "triangles": 8, ++ "triangle-strip": 10, ++ "triangle-fan": 12, ++ "polygon": 13, ++ "quads": 14, ++ } ++ if s[3] in modes: ++ mode = modes[s[3]] ++ else: ++ mode = int(s[3]) ++ assert(s[4] == "index") ++ itypes = { ++ "none": 0, ++ "uint8": 1, ++ "uint16": 2, ++ "uint32": 3, ++ } ++ if s[5] in itypes: ++ index = itypes[s[5]] ++ else: ++ index = int(s[5]) ++ ++ cmd = 6 ++ addr = 0 ++ value = (unk << 32) | (index << 8) | mode ++ elif s[0] == "flush_tiler": ++ assert(len(s) == 1) ++ cmd = 9 ++ addr = 0 ++ value = 0 ++ elif s[0] == "str" and s[1] in ("cycles", "timestamp"): ++ assert(len(s) == 3 or len(s) == 4) ++ assert(s[2][0] == "[") ++ assert(s[-1][-1] == "]") ++ s = [x.strip("[]") for x in s] ++ assert(s[2][0] == "x") ++ ++ type_ = 1 if s[1] == "cycles" else 0 ++ dest = reg(s[2]) ++ if len(s) == 4: ++ offset = 
val(s[3]) ++ else: ++ offset = 0 ++ ++ cmd = 40 ++ addr = 0 ++ value = (dest << 40) | (type_ << 32) | to_int16(offset) ++ elif s[0] in ("ldr", "str"): ++ reglist = s[1] ++ if reglist[0] == "{": ++ end = [x[-1] for x in s].index("}") ++ reglist = s[1:end + 1] ++ s = s[:1] + s[end:] ++ ++ assert(len(s) == 3 or len(s) == 4) ++ assert(s[2][0] == "[") ++ assert(s[-1][-1] == "]") ++ s = [x.strip("[]") for x in s] ++ assert(s[2][0] == "x") ++ ++ if isinstance(reglist, str): ++ assert(reglist[0] in "xw") ++ src = reg(reglist) ++ mask = 3 if reglist[0] == "x" else 1 ++ else: ++ src = None ++ mask = 0 ++ ++ for r in ",".join(reglist).strip("{}").split(","): ++ r = r.split("-") ++ assert(len(r) in (1, 2)) ++ regno = [reg(x) for x in r] ++ ++ if src is None: ++ src = regno[0] ++ ++ if len(r) == 1: ++ assert(r[0][0] in "xw") ++ new = 3 if r[0][0] == "x" else 1 ++ new = (new << regno[0]) >> src ++ else: ++ assert(regno[1] > regno[0]) ++ new = ((2 << regno[1]) - (1 << regno[0])) >> src ++ ++ assert(new < (1 << 16)) ++ assert(mask & new == 0) ++ mask |= new ++ ++ # Name is correct for str, but inverted for ldr ++ # (The same holds for src above) ++ dest = reg(s[2]) ++ if len(s) == 4: ++ offset = val(s[3]) ++ else: ++ offset = 0 ++ ++ cmd = 20 if s[0] == "ldr" else 21 ++ addr = src ++ value = (dest << 40) | (mask << 16) | to_int16(offset) ++ elif s[0] == "b" or s[0].startswith("b."): ++ # For unconditional jumps, use w00 as a source register if it ++ # is not specified ++ if s[0] == "b" and (len(s) == 2 or ++ (len(s) == 3 and ++ s[1] in ("back", "skip"))): ++ s = [s[0], "w00", *s[1:]] ++ ++ assert(len(s) == 3 or (len(s) == 4 and s[2] in ("back", "skip"))) ++ assert(s[1][0] == "w") ++ ++ ops = { ++ "b.le": 0, "b.gt": 1, ++ "b.eq": 2, "b.ne": 3, ++ "b.lt": 4, "b.ge": 5, ++ "b": 6, "b.al": 6, ++ } ++ ++ src = reg(s[1]) ++ if len(s) == 4: ++ offset = val(s[3]) ++ if s[2] == "back": ++ offset = -1 - offset ++ else: ++ label = s[2] ++ if re.fullmatch(r"[0-9]+b", label): ++ label = int(label[:-1]) ++ assert(label in self.l.num_labels) ++ offset = resolve_rel(self.l.num_labels[label], ++ self.l.offset()) ++ elif re.fullmatch(r"[0-9]+f", label): ++ label = int(label[:-1]) ++ if label not in self.l.num_refs: ++ self.l.num_refs[label] = [] ++ self.l.num_refs[label].append((label, self.l.offset(), "rel")) ++ offset = 0 ++ else: ++ assert(not re.fullmatch(r"[0-9]+", label)) ++ self.l.label_refs.append((label, self.l.offset(), "rel")) ++ offset = 0 ++ ++ cmd = 22 ++ addr = 0 ++ value = (src << 40) | (ops[s[0]] << 28) | to_int16(offset) ++ ++ elif s[0] in ("evadd", "evstr"): ++ assert(len(s) in range(5, 8)) ++ assert(s[1][0] in "wx") ++ assert(s[2].startswith("[x")) ++ assert(s[2][-1] == "]") ++ assert(s[3] == "unk") ++ s = [x.strip("[]()") for x in s] ++ ++ val = reg(s[1]) ++ dst = reg(s[2]) ++ mask = hx(s[4]) ++ irq = "irq" not in s ++ unk0 = "unk0" in s ++ ++ if s[1][0] == "w": ++ cmd = 37 if s[0] == "evadd" else 38 ++ else: ++ cmd = 51 if s[0] == "evadd" else 52 ++ addr = 1 ++ value = ((dst << 40) | (val << 32) | (mask << 16) | ++ (irq << 2) | unk0) ++ elif s[0].split(".")[0] == "evwait": ++ for mod in s[0].split(".")[1:]: ++ assert(mod in {"lo", "hi", "inherit", "no_error"}) ++ assert(len(s) == 3) ++ assert(s[1][0] in "wx") ++ assert(s[2][0] == "[") ++ assert(s[-1][-1] == "]") ++ s = [x.strip("[]()") for x in s] ++ src = reg(s[2]) ++ val = reg(s[1]) ++ cond = 1 if ".hi" in s[0] else 0 ++ error = 1 if ".no_error" in s[0] else 0 ++ ++ cmd = 53 if s[1][0] == "x" else 39 ++ addr = 0 ++ value = (src << 40) | (val << 32) 
| (cond << 28) | error ++ elif s[0] in ("call", "tailcall"): ++ ss = [x for x in s if x.find('(') == -1 and x.find(')') == -1] ++ assert(len(ss) == 3) ++ assert(ss[1][0] == "w") ++ assert(ss[2][0] == "x") ++ cmd = { "call": 32, "tailcall": 33 }[s[0]] ++ addr = 0 ++ num = reg(ss[1]) ++ target = reg(ss[2]) ++ value = (num << 32) | (target << 40) ++ ++ l = self.l ++ ++ cur = len(l.buffer) ++ for ofs in range(cur - 2, cur): ++ if l.buffer[ofs] >> 48 == 0x100 + target: ++ l.call_addr_offset = ofs ++ if l.buffer[ofs] >> 48 == 0x200 + num: ++ l.call_len_offset = ofs ++ assert(l.call_addr_offset is not None) ++ assert(l.call_len_offset is not None) ++ ++ self.is_call = True ++ elif s[0] == "heapctx": ++ assert(len(s) == 2) ++ assert(s[1][0] == "x") ++ cmd = 48 ++ addr = 0 ++ value = reg(s[1]) << 40 ++ elif s[0] == "heapinc": ++ assert(len(s) == 2) ++ modes = { ++ "vt_start": 0, ++ "vt_end": 1, ++ "frag_end": 3, ++ } ++ if s[1] in modes: ++ mode = modes[s[1]] ++ else: ++ mode = int(s[1]) ++ cmd = 49 ++ addr = 0 ++ value = mode << 32 ++ else: ++ print("Unknown command:", orig_line, file=sys.stderr) ++ # TODO remove ++ cmd = 0 ++ addr = 0 ++ value = 0 ++ sk = False ++ pass ++ ++ code = (cmd << 56) | (addr << 48) | value ++ ++ if given_code and code != given_code: ++ print(f"Mismatch! {hex(code)} != {hex(given_code)}, {orig_line}") ++ ++ self.l.buffer.append(code) ++ ++ del cmd, addr, value ++ ++ if False and not sk: ++ print(orig_line, file=sys.stderr) ++ print(indent, s, hex(code) if sk else "", file=sys.stderr) ++ ++ self.pop_until(self.levels[0].indent) ++ self.flush_exe() ++ ++ def __repr__(self): ++ r = [] ++ r += [str(self.allocs[x]) for x in self.allocs] ++ r += [str(x) for x in self.completed] ++ r += [fmt_reloc(x) for x in self.reloc] ++ r += [fmt_reloc(x, name="relsplit") for x in self.reloc_split] ++ r += [fmt_exe(x) for x in self.exe] ++ return "\n".join(r) ++ ++def interpret(text): ++ c = Context() ++ c.add_shaders(shaders) ++ c.add_memory(memory) ++ c.add_descriptors(descriptors) ++ c.interpret(text) ++ #print(str(c)) ++ return str(c) ++ ++def run(text, capture=False): ++ if capture: ++ cap = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT} ++ else: ++ cap = {} ++ ++ i = interpret(text) + "\n" ++ ++ with open("/tmp/csf.cmds", "w") as f: ++ f.write(i) ++ ++ # TODO: Keep seperate or merge stdout/stderr? ++ ret = subprocess.run(["csf_test", "/dev/stdin"], ++ input=i, text=True, **cap) ++ if ret.stderr is None: ++ ret.stderr = "" ++ if ret.stdout is None: ++ ret.stdout = "" ++ return ret.stderr + ret.stdout ++ ++def rebuild(): ++ try: ++ p = subprocess.run(["rebuild-mesa"]) ++ if p.returncode != 0: ++ return False ++ except FileNotFoundError: ++ pass ++ return True ++ ++def go(text): ++ #print(interpret(text)) ++ #return ++ ++ if not rebuild(): ++ return ++ ++ print(run(text)) ++ #subprocess.run("ls /tmp/fdump.????? 
| tail -n2 | xargs diff -U3 -s", ++ # shell=True) ++ ++os.environ["CSF_QUIET"] = "1" ++ ++go(get_cmds("")) ++ ++#for c in range(1, 64): ++# val = c ++# ret = run(get_cmds(ii(val))) ++# print(str(val) + '\t' + [x for x in ret.split("\n") if x.startswith("0FFF10")][0]) ++ ++#rebuild() ++#for c in range(256): ++# print(c, end=":") ++# sys.stdout.flush() ++# cmd = f"UNK 00 {hex(c)[2:]} 0x00000000" ++# run(get_cmds(cmd)) ++ ++#interpret(cmds) ++#go(cmds) +diff --git a/src/panfrost/csf_test/mali_base_csf_kernel.h b/src/panfrost/csf_test/mali_base_csf_kernel.h +new file mode 100644 +index 00000000000..f5f859eb9ad +--- /dev/null ++++ b/src/panfrost/csf_test/mali_base_csf_kernel.h +@@ -0,0 +1,721 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_BASE_CSF_KERNEL_H_ ++#define _UAPI_BASE_CSF_KERNEL_H_ ++ ++#include ++ ++/* Memory allocation, access/hint flags. ++ * ++ * See base_mem_alloc_flags. ++ */ ++ ++/* IN */ ++/* Read access CPU side ++ */ ++#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) ++ ++/* Write access CPU side ++ */ ++#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) ++ ++/* Read access GPU side ++ */ ++#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) ++ ++/* Write access GPU side ++ */ ++#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) ++ ++/* Execute allowed on the GPU side ++ */ ++#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) ++ ++/* Will be permanently mapped in kernel space. ++ * Flag is only allowed on allocations originating from kbase. ++ */ ++#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) ++ ++/* The allocation will completely reside within the same 4GB chunk in the GPU ++ * virtual space. ++ * Since this flag is primarily required only for the TLS memory which will ++ * not be used to contain executable code and also not used for Tiler heap, ++ * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. ++ */ ++#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) ++ ++/* Userspace is not allowed to free this memory. ++ * Flag is only allowed on allocations originating from kbase. 
++ */ ++#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) ++ ++#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) ++ ++/* Grow backing store on GPU Page Fault ++ */ ++#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) ++ ++/* Page coherence Outer shareable, if available ++ */ ++#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) ++ ++/* Page coherence Inner shareable ++ */ ++#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) ++ ++/* IN/OUT */ ++/* Should be cached on the CPU, returned if actually cached ++ */ ++#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) ++ ++/* IN/OUT */ ++/* Must have same VA on both the GPU and the CPU ++ */ ++#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) ++ ++/* OUT */ ++/* Must call mmap to acquire a GPU address for the alloc ++ */ ++#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) ++ ++/* IN */ ++/* Page coherence Outer shareable, required. ++ */ ++#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) ++ ++/* Protected memory ++ */ ++#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) ++ ++/* Not needed physical memory ++ */ ++#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) ++ ++/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the ++ * addresses to be the same ++ */ ++#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) ++ ++/* CSF event memory ++ * ++ * If Outer shareable coherence is not specified or not available, then on ++ * allocation kbase will automatically use the uncached GPU mapping. ++ * There is no need for the client to specify BASE_MEM_UNCACHED_GPU ++ * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. ++ * ++ * This memory requires a permanent mapping ++ * ++ * See also kbase_reg_needs_kernel_mapping() ++ */ ++#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) ++ ++#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) ++ ++/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu ++ * mode. Some components within the GPU might only be able to access memory ++ * that is GPU cacheable. Refer to the specific GPU implementation for more ++ * details. The 3 shareability flags will be ignored for GPU uncached memory. ++ * If used while importing USER_BUFFER type memory, then the import will fail ++ * if the memory is not aligned to GPU and CPU cache line width. ++ */ ++#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) ++ ++/* ++ * Bits [22:25] for group_id (0~15). ++ * ++ * base_mem_group_id_set() should be used to pack a memory group ID into a ++ * base_mem_alloc_flags value instead of accessing the bits directly. ++ * base_mem_group_id_get() should be used to extract the memory group ID from ++ * a base_mem_alloc_flags value. ++ */ ++#define BASEP_MEM_GROUP_ID_SHIFT 22 ++#define BASE_MEM_GROUP_ID_MASK \ ++ ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) ++ ++/* Must do CPU cache maintenance when imported memory is mapped/unmapped ++ * on GPU. Currently applicable to dma-buf type only. 
++ */ ++#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) ++ ++/* OUT */ ++/* Kernel side cache sync ops required */ ++#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) ++ ++/* Number of bits used as flags for base memory management ++ * ++ * Must be kept in sync with the base_mem_alloc_flags flags ++ */ ++#define BASE_MEM_FLAGS_NR_BITS 29 ++ ++/* A mask of all the flags which are only valid for allocations within kbase, ++ * and may not be passed from user space. ++ */ ++#define BASEP_MEM_FLAGS_KERNEL_ONLY \ ++ (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) ++ ++/* A mask for all output bits, excluding IN/OUT bits. ++ */ ++#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP ++ ++/* A mask for all input bits, including IN/OUT bits. ++ */ ++#define BASE_MEM_FLAGS_INPUT_MASK \ ++ (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) ++ ++/* A mask of all currently reserved flags ++ */ ++#define BASE_MEM_FLAGS_RESERVED \ ++ BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_20 ++ ++#define BASEP_MEM_INVALID_HANDLE (0ul) ++#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) ++#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) ++/* reserved handles ..-47< for future special handles */ ++#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) ++#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_FIRST_FREE_ADDRESS \ ++ ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) ++ ++#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ ++ ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ ++ LOCAL_PAGE_SHIFT) ++ ++/** ++ * Valid set of just-in-time memory allocation flags ++ */ ++#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) ++ ++/* Flags to pass to ::base_context_init. ++ * Flags can be ORed together to enable multiple things. ++ * ++ * These share the same space as BASEP_CONTEXT_FLAG_*, and so must ++ * not collide with them. ++ */ ++typedef __u32 base_context_create_flags; ++ ++/* No flags set */ ++#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) ++ ++/* Base context is embedded in a cctx object (flag used for CINSTR ++ * software counter macros) ++ */ ++#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) ++ ++/* Base context is a 'System Monitor' context for Hardware counters. ++ * ++ * One important side effect of this is that job submission is disabled. ++ */ ++#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED \ ++ ((base_context_create_flags)1 << 1) ++ ++/* Base context creates a CSF event notification thread. ++ * ++ * The creation of a CSF event notification thread is conditional but ++ * mandatory for the handling of CSF events. 
++ */ ++#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) ++ ++/* Bit-shift used to encode a memory group ID in base_context_create_flags ++ */ ++#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) ++ ++/* Bitmask used to encode a memory group ID in base_context_create_flags ++ */ ++#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ ++ ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) ++ ++/* Bitpattern describing the base_context_create_flags that can be ++ * passed to the kernel ++ */ ++#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ ++ (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | \ ++ BASEP_CONTEXT_MMU_GROUP_ID_MASK) ++ ++/* Bitpattern describing the ::base_context_create_flags that can be ++ * passed to base_context_init() ++ */ ++#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ ++ (BASE_CONTEXT_CCTX_EMBEDDED | \ ++ BASE_CONTEXT_CSF_EVENT_THREAD | \ ++ BASEP_CONTEXT_CREATE_KERNEL_FLAGS) ++ ++/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, ++ * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) ++ */ ++#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) ++ ++/* Indicate that job dumping is enabled. This could affect certain timers ++ * to account for the performance impact. ++ */ ++#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) ++ ++/* Enable KBase tracepoints for CSF builds */ ++#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) ++ ++/* Enable additional CSF Firmware side tracepoints */ ++#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) ++ ++#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ ++ BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ ++ BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ ++ BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) ++ ++/* Number of pages mapped into the process address space for a bound GPU ++ * command queue. A pair of input/output pages and a Hw doorbell page ++ * are mapped to enable direct submission of commands to Hw. ++ */ ++#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) ++ ++#define BASE_QUEUE_MAX_PRIORITY (15U) ++ ++/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ ++#define BASEP_EVENT_VAL_INDEX (0U) ++#define BASEP_EVENT_ERR_INDEX (1U) ++ ++/* The upper limit for number of objects that could be waited/set per command. ++ * This limit is now enforced as internally the error inherit inputs are ++ * converted to 32-bit flags in a __u32 variable occupying a previously padding ++ * field. ++ */ ++#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) ++ ++/** ++ * enum base_kcpu_command_type - Kernel CPU queue command type. 
++ * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, ++ * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, ++ * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, ++ * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, ++ * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, ++ * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, ++ * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, ++ * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, ++ * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, ++ */ ++enum base_kcpu_command_type { ++ BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, ++ BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, ++ BASE_KCPU_COMMAND_TYPE_CQS_WAIT, ++ BASE_KCPU_COMMAND_TYPE_CQS_SET, ++ BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, ++ BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, ++ BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, ++ BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, ++ BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, ++ BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, ++ BASE_KCPU_COMMAND_TYPE_JIT_FREE, ++ BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, ++ BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER ++}; ++ ++/** ++ * enum base_queue_group_priority - Priority of a GPU Command Queue Group. ++ * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group ++ * priority levels. ++ * ++ * Currently this is in order of highest to lowest, but if new levels are added ++ * then those new levels may be out of order to preserve the ABI compatibility ++ * with previous releases. At that point, ensure assignment to ++ * the 'priority' member in &kbase_queue_group is updated to ensure it remains ++ * a linear ordering. ++ * ++ * There should be no gaps in the enum, otherwise use of ++ * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. 
++ */ ++enum base_queue_group_priority { ++ BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, ++ BASE_QUEUE_GROUP_PRIORITY_MEDIUM, ++ BASE_QUEUE_GROUP_PRIORITY_LOW, ++ BASE_QUEUE_GROUP_PRIORITY_REALTIME, ++ BASE_QUEUE_GROUP_PRIORITY_COUNT ++}; ++ ++struct base_kcpu_command_fence_info { ++ __u64 fence; ++}; ++ ++struct base_cqs_wait_info { ++ __u64 addr; ++ __u32 val; ++ __u32 padding; ++}; ++ ++struct base_kcpu_command_cqs_wait_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 inherit_err_flags; ++}; ++ ++struct base_cqs_set { ++ __u64 addr; ++}; ++ ++struct base_kcpu_command_cqs_set_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 padding; ++}; ++ ++/** ++ * typedef basep_cqs_data_type - Enumeration of CQS Data Types ++ * ++ * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value ++ * is an unsigned 32-bit integer ++ * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value ++ * is an unsigned 64-bit integer ++ */ ++typedef enum PACKED { ++ BASEP_CQS_DATA_TYPE_U32 = 0, ++ BASEP_CQS_DATA_TYPE_U64 = 1, ++} basep_cqs_data_type; ++ ++/** ++ * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait ++ * Operation conditions ++ * ++ * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a ++ * wait will be satisfied when a CQS Object's ++ * value is Less than or Equal to ++ * the Wait Operation value ++ * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a ++ * wait will be satisfied when a CQS Object's ++ * value is Greater than the Wait Operation value ++ */ ++typedef enum { ++ BASEP_CQS_WAIT_OPERATION_LE = 0, ++ BASEP_CQS_WAIT_OPERATION_GT = 1, ++} basep_cqs_wait_operation_op; ++ ++struct base_cqs_wait_operation_info { ++ __u64 addr; ++ __u64 val; ++ __u8 operation; ++ __u8 data_type; ++ __u8 padding[6]; ++}; ++ ++/** ++ * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information ++ * about the Timeline CQS wait objects ++ * ++ * @objs: An array of Timeline CQS waits. ++ * @nr_objs: Number of Timeline CQS waits in the array. ++ * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field ++ * to be served as the source for importing into the ++ * queue's error-state. ++ */ ++struct base_kcpu_command_cqs_wait_operation_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 inherit_err_flags; ++}; ++ ++/** ++ * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations ++ * ++ * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value ++ * to a synchronization object ++ * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value ++ * of a synchronization object ++ */ ++typedef enum { ++ BASEP_CQS_SET_OPERATION_ADD = 0, ++ BASEP_CQS_SET_OPERATION_SET = 1, ++} basep_cqs_set_operation_op; ++ ++struct base_cqs_set_operation_info { ++ __u64 addr; ++ __u64 val; ++ __u8 operation; ++ __u8 data_type; ++ __u8 padding[6]; ++}; ++ ++/** ++ * struct base_kcpu_command_cqs_set_operation_info - structure which contains information ++ * about the Timeline CQS set objects ++ * ++ * @objs: An array of Timeline CQS sets. ++ * @nr_objs: Number of Timeline CQS sets in the array. ++ * @padding: Structure padding, unused bytes. ++ */ ++struct base_kcpu_command_cqs_set_operation_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_kcpu_command_import_info - structure which contains information ++ * about the imported buffer. ++ * ++ * @handle: Address of imported user buffer. 
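For the wait-operation variant defined above, a rough sketch of filling one entry (the helper name and the 64-bit threshold are made up for the example; the header name matches the file this hunk adds):

#include <string.h>
#include "mali_base_csf_kernel.h"

/* Describe "wait until the u64 value at addr is greater than threshold". */
static void example_cqs_wait_gt_u64(struct base_cqs_wait_operation_info *obj,
                                    __u64 addr, __u64 threshold)
{
        memset(obj, 0, sizeof(*obj));   /* clear the padding bytes */
        obj->addr = addr;
        obj->val = threshold;
        obj->operation = BASEP_CQS_WAIT_OPERATION_GT;
        obj->data_type = BASEP_CQS_DATA_TYPE_U64;
}
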
++ */ ++struct base_kcpu_command_import_info { ++ __u64 handle; ++}; ++ ++/** ++ * struct base_kcpu_command_jit_alloc_info - structure which contains ++ * information about jit memory allocation. ++ * ++ * @info: An array of elements of the ++ * struct base_jit_alloc_info type. ++ * @count: The number of elements in the info array. ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_jit_alloc_info { ++ __u64 info; ++ __u8 count; ++ __u8 padding[7]; ++}; ++ ++/** ++ * struct base_kcpu_command_jit_free_info - structure which contains ++ * information about jit memory which is to be freed. ++ * ++ * @ids: An array containing the JIT IDs to free. ++ * @count: The number of elements in the ids array. ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_jit_free_info { ++ __u64 ids; ++ __u8 count; ++ __u8 padding[7]; ++}; ++ ++/** ++ * struct base_kcpu_command_group_suspend_info - structure which contains ++ * suspend buffer data captured for a suspended queue group. ++ * ++ * @buffer: Pointer to an array of elements of the type char. ++ * @size: Number of elements in the @buffer array. ++ * @group_handle: Handle to the mapping of CSG. ++ * @padding: padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_group_suspend_info { ++ __u64 buffer; ++ __u32 size; ++ __u8 group_handle; ++ __u8 padding[3]; ++}; ++ ++ ++/** ++ * struct base_kcpu_command - kcpu command. ++ * @type: type of the kcpu command, one enum base_kcpu_command_type ++ * @padding: padding to a multiple of 64 bits ++ * @info: structure which contains information about the kcpu command; ++ * actual type is determined by @p type ++ * @info.fence: Fence ++ * @info.cqs_wait: CQS wait ++ * @info.cqs_set: CQS set ++ * @info.import: import ++ * @info.jit_alloc: jit allocation ++ * @info.jit_free: jit deallocation ++ * @info.suspend_buf_copy: suspend buffer copy ++ * @info.sample_time: sample time ++ * @info.padding: padding ++ */ ++struct base_kcpu_command { ++ __u8 type; ++ __u8 padding[sizeof(__u64) - sizeof(__u8)]; ++ union { ++ struct base_kcpu_command_fence_info fence; ++ struct base_kcpu_command_cqs_wait_info cqs_wait; ++ struct base_kcpu_command_cqs_set_info cqs_set; ++ struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; ++ struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; ++ struct base_kcpu_command_import_info import; ++ struct base_kcpu_command_jit_alloc_info jit_alloc; ++ struct base_kcpu_command_jit_free_info jit_free; ++ struct base_kcpu_command_group_suspend_info suspend_buf_copy; ++ __u64 padding[2]; /* No sub-struct should be larger */ ++ } info; ++}; ++ ++/** ++ * struct basep_cs_stream_control - CSI capabilities. ++ * ++ * @features: Features of this stream ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct basep_cs_stream_control { ++ __u32 features; ++ __u32 padding; ++}; ++ ++/** ++ * struct basep_cs_group_control - CSG interface capabilities. ++ * ++ * @features: Features of this group ++ * @stream_num: Number of streams in this group ++ * @suspend_size: Size in bytes of the suspend buffer for this group ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct basep_cs_group_control { ++ __u32 features; ++ __u32 stream_num; ++ __u32 suspend_size; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault ++ * error information associated with GPU command queue group. ++ * ++ * @sideband: Additional information of the unrecoverable fault. 
++ * @status: Unrecoverable fault information. ++ * This consists of exception type (least significant byte) and ++ * data (remaining bytes). One example of exception type is ++ * CS_INVALID_INSTRUCTION (0x49). ++ * @padding: Padding to make multiple of 64bits ++ */ ++struct base_gpu_queue_group_error_fatal_payload { ++ __u64 sideband; ++ __u32 status; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault ++ * error information related to GPU command queue. ++ * ++ * @sideband: Additional information about this unrecoverable fault. ++ * @status: Unrecoverable fault information. ++ * This consists of exception type (least significant byte) and ++ * data (remaining bytes). One example of exception type is ++ * CS_INVALID_INSTRUCTION (0x49). ++ * @csi_index: Index of the CSF interface the queue is bound to. ++ * @padding: Padding to make multiple of 64bits ++ */ ++struct base_gpu_queue_error_fatal_payload { ++ __u64 sideband; ++ __u32 status; ++ __u8 csi_index; ++ __u8 padding[3]; ++}; ++ ++/** ++ * enum base_gpu_queue_group_error_type - GPU Fatal error type. ++ * ++ * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU ++ * command queue group. ++ * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU ++ * command queue. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with ++ * progress timeout. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out ++ * of tiler heap memory. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types ++ * ++ * This type is used for &struct_base_gpu_queue_group_error.error_type. ++ */ ++enum base_gpu_queue_group_error_type { ++ BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, ++ BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, ++ BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, ++ BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, ++ BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT ++}; ++ ++/** ++ * struct base_gpu_queue_group_error - Unrecoverable fault information ++ * @error_type: Error type of @base_gpu_queue_group_error_type ++ * indicating which field in union payload is filled ++ * @padding: Unused bytes for 64bit boundary ++ * @payload: Input Payload ++ * @payload.fatal_group: Unrecoverable fault error associated with ++ * GPU command queue group ++ * @payload.fatal_queue: Unrecoverable fault error associated with command queue ++ */ ++struct base_gpu_queue_group_error { ++ __u8 error_type; ++ __u8 padding[7]; ++ union { ++ struct base_gpu_queue_group_error_fatal_payload fatal_group; ++ struct base_gpu_queue_error_fatal_payload fatal_queue; ++ } payload; ++}; ++ ++/** ++ * enum base_csf_notification_type - Notification type ++ * ++ * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event ++ * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal ++ * error ++ * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu ++ * queue ++ * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type ++ * ++ * This type is used for &struct_base_csf_notification.type. 
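Since the notification payload below wraps the error record above, here is a sketch of how user space might pick one apart (the function name and printf strings are illustrative only):

#include <stdio.h>
#include "mali_base_csf_kernel.h"

static void example_report_group_error(const struct base_gpu_queue_group_error *e)
{
        switch (e->error_type) {
        case BASE_GPU_QUEUE_GROUP_ERROR_FATAL:
                /* Exception type lives in the least significant byte of status. */
                printf("group fatal: exception 0x%02x, sideband 0x%llx\n",
                       e->payload.fatal_group.status & 0xff,
                       (unsigned long long)e->payload.fatal_group.sideband);
                break;
        case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL:
                printf("queue fatal on CSI %u: exception 0x%02x\n",
                       e->payload.fatal_queue.csi_index,
                       e->payload.fatal_queue.status & 0xff);
                break;
        case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT:
                printf("progress timeout\n");
                break;
        case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM:
                printf("tiler heap out of memory\n");
                break;
        }
}
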
++ */ ++enum base_csf_notification_type { ++ BASE_CSF_NOTIFICATION_EVENT = 0, ++ BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, ++ BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, ++ BASE_CSF_NOTIFICATION_COUNT ++}; ++ ++/** ++ * struct base_csf_notification - Event or error notification ++ * ++ * @type: Notification type of @base_csf_notification_type ++ * @padding: Padding for 64bit boundary ++ * @payload: Input Payload ++ * @payload.align: To fit the struct into a 64-byte cache line ++ * @payload.csg_error: CSG error ++ * @payload.csg_error.handle: Handle of GPU command queue group associated with ++ * fatal error ++ * @payload.csg_error.padding: Padding ++ * @payload.csg_error.error: Unrecoverable fault error ++ * ++ */ ++struct base_csf_notification { ++ __u8 type; ++ __u8 padding[7]; ++ union { ++ struct { ++ __u8 handle; ++ __u8 padding[7]; ++ struct base_gpu_queue_group_error error; ++ } csg_error; ++ ++ __u8 align[56]; ++ } payload; ++}; ++ ++#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ +diff --git a/src/panfrost/csf_test/mali_base_kernel.h b/src/panfrost/csf_test/mali_base_kernel.h +new file mode 100644 +index 00000000000..305956f341a +--- /dev/null ++++ b/src/panfrost/csf_test/mali_base_kernel.h +@@ -0,0 +1,746 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++/* ++ * Base structures shared with the kernel. ++ */ ++ ++#ifndef _UAPI_BASE_KERNEL_H_ ++#define _UAPI_BASE_KERNEL_H_ ++ ++#include ++ ++struct base_mem_handle { ++ struct { ++ __u64 handle; ++ } basep; ++}; ++ ++#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 ++ ++#define BASE_MAX_COHERENT_GROUPS 16 ++ ++#if defined(PAGE_MASK) && defined(PAGE_SHIFT) ++#define LOCAL_PAGE_SHIFT PAGE_SHIFT ++#define LOCAL_PAGE_LSB ~PAGE_MASK ++#else ++#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 ++#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 ++#endif ++ ++#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) ++#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 ++#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) ++#else ++#error Failed to find page size ++#endif ++#endif ++ ++/* Physical memory group ID for normal usage. ++ */ ++#define BASE_MEM_GROUP_DEFAULT (0) ++ ++/* Number of physical memory groups. ++ */ ++#define BASE_MEM_GROUP_COUNT (16) ++ ++/** ++ * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. ++ * ++ * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator ++ * in order to determine the best cache policy. Some combinations are ++ * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), ++ * which defines a write-only region on the CPU side, which is ++ * heavily read by the CPU... ++ * Other flags are only meaningful to a particular allocator. 
++ * More flags can be added to this list, as long as they don't clash ++ * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). ++ */ ++typedef __u32 base_mem_alloc_flags; ++ ++/* A mask for all the flags which are modifiable via the base_mem_set_flags ++ * interface. ++ */ ++#define BASE_MEM_FLAGS_MODIFIABLE \ ++ (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ ++ BASE_MEM_COHERENT_LOCAL) ++ ++/* A mask of all the flags that can be returned via the base_mem_get_flags() ++ * interface. ++ */ ++#define BASE_MEM_FLAGS_QUERYABLE \ ++ (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ ++ BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ ++ BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ ++ BASEP_MEM_FLAGS_KERNEL_ONLY)) ++ ++/** ++ * enum base_mem_import_type - Memory types supported by @a base_mem_import ++ * ++ * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type ++ * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) ++ * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a ++ * base_mem_import_user_buffer ++ * ++ * Each type defines what the supported handle type is. ++ * ++ * If any new type is added here ARM must be contacted ++ * to allocate a numeric value for it. ++ * Do not just add a new type without synchronizing with ARM ++ * as future releases from ARM might include other new types ++ * which could clash with your custom types. ++ */ ++enum base_mem_import_type { ++ BASE_MEM_IMPORT_TYPE_INVALID = 0, ++ /* ++ * Import type with value 1 is deprecated. ++ */ ++ BASE_MEM_IMPORT_TYPE_UMM = 2, ++ BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 ++}; ++ ++/** ++ * struct base_mem_import_user_buffer - Handle of an imported user buffer ++ * ++ * @ptr: address of imported user buffer ++ * @length: length of imported user buffer in bytes ++ * ++ * This structure is used to represent a handle of an imported user buffer. ++ */ ++ ++struct base_mem_import_user_buffer { ++ __u64 ptr; ++ __u64 length; ++}; ++ ++/* Mask to detect 4GB boundary alignment */ ++#define BASE_MEM_MASK_4GB 0xfffff000UL ++/* Mask to detect 4GB boundary (in page units) alignment */ ++#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) ++ ++/* Limit on the 'extension' parameter for an allocation with the ++ * BASE_MEM_TILER_ALIGN_TOP flag set ++ * ++ * This is the same as the maximum limit for a Buffer Descriptor's chunk size ++ */ ++#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ ++ (21u - (LOCAL_PAGE_SHIFT)) ++#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ ++ (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) ++ ++/* Bit mask of cookies used for for memory allocation setup */ ++#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ ++ ++/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ ++#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ ++ ++/* ++ * struct base_fence - Cross-device synchronisation fence. ++ * ++ * A fence is used to signal when the GPU has finished accessing a resource that ++ * may be shared with other devices, and also to delay work done asynchronously ++ * by the GPU until other devices have finished accessing a shared resource. ++ */ ++struct base_fence { ++ struct { ++ int fd; ++ int stream_fd; ++ } basep; ++}; ++ ++/** ++ * struct base_mem_aliasing_info - Memory aliasing info ++ * ++ * Describes a memory handle to be aliased. ++ * A subset of the handle can be chosen for aliasing, given an offset and a ++ * length. 
++ * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a ++ * region where a special page is mapped with a write-alloc cache setup, ++ * typically used when the write result of the GPU isn't needed, but the GPU ++ * must write anyway. ++ * ++ * Offset and length are specified in pages. ++ * Offset must be within the size of the handle. ++ * Offset+length must not overrun the size of the handle. ++ * ++ * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE ++ * @offset: Offset within the handle to start aliasing from, in pages. ++ * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. ++ * @length: Length to alias, in pages. For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE ++ * specifies the number of times the special page is needed. ++ */ ++struct base_mem_aliasing_info { ++ struct base_mem_handle handle; ++ __u64 offset; ++ __u64 length; ++}; ++ ++/* Maximum percentage of just-in-time memory allocation trimming to perform ++ * on free. ++ */ ++#define BASE_JIT_MAX_TRIM_LEVEL (100) ++ ++/* Maximum number of concurrent just-in-time memory allocations. ++ */ ++#define BASE_JIT_ALLOC_COUNT (255) ++ ++/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 ++ * ++ * jit_version is 1 ++ * ++ * Due to the lack of padding specified, user clients between 32 and 64-bit ++ * may have assumed a different size of the struct ++ * ++ * An array of structures was not supported ++ */ ++struct base_jit_alloc_info_10_2 { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++}; ++ ++/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up ++ * to 11.19 ++ * ++ * This structure had a number of modifications during and after kernel driver ++ * version 11.5, but remains size-compatible throughout its version history, and ++ * with earlier variants compatible with future variants by requiring ++ * zero-initialization to the unused space in the structure. ++ * ++ * jit_version is 2 ++ * ++ * Kernel driver version history: ++ * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes ++ * must be zero. Kbase minor version was not incremented, so some ++ * versions of 11.5 do not have this change. ++ * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase ++ * minor version not incremented) ++ * 11.6: Added 'flags', replacing 1 padding byte ++ * 11.10: Arrays of this structure are supported ++ */ ++struct base_jit_alloc_info_11_5 { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++ __u8 bin_id; ++ __u8 max_allocations; ++ __u8 flags; ++ __u8 padding[2]; ++ __u16 usage_id; ++}; ++ ++/** ++ * struct base_jit_alloc_info - Structure which describes a JIT allocation ++ * request. ++ * @gpu_alloc_addr: The GPU virtual address to write the JIT ++ * allocated GPU virtual address to. ++ * @va_pages: The minimum number of virtual pages required. ++ * @commit_pages: The minimum number of physical pages which ++ * should back the allocation. ++ * @extension: Granularity of physical pages to grow the ++ * allocation by during a fault. ++ * @id: Unique ID provided by the caller, this is used ++ * to pair allocation and free requests. ++ * Zero is not a valid value. ++ * @bin_id: The JIT allocation bin, used in conjunction with ++ * @max_allocations to limit the number of each ++ * type of JIT allocation. ++ * @max_allocations: The maximum number of allocations allowed within ++ * the bin specified by @bin_id. 
Should be the same ++ * for all allocations within the same bin. ++ * @flags: flags specifying the special requirements for ++ * the JIT allocation, see ++ * %BASE_JIT_ALLOC_VALID_FLAGS ++ * @padding: Expansion space - should be initialised to zero ++ * @usage_id: A hint about which allocation should be reused. ++ * The kernel should attempt to use a previous ++ * allocation with the same usage_id ++ * @heap_info_gpu_addr: Pointer to an object in GPU memory describing ++ * the actual usage of the region. ++ * ++ * jit_version is 3. ++ * ++ * When modifications are made to this structure, it is still compatible with ++ * jit_version 3 when: a) the size is unchanged, and b) new members only ++ * replace the padding bytes. ++ * ++ * Previous jit_version history: ++ * jit_version == 1, refer to &base_jit_alloc_info_10_2 ++ * jit_version == 2, refer to &base_jit_alloc_info_11_5 ++ * ++ * Kbase version history: ++ * 11.20: added @heap_info_gpu_addr ++ */ ++struct base_jit_alloc_info { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++ __u8 bin_id; ++ __u8 max_allocations; ++ __u8 flags; ++ __u8 padding[2]; ++ __u16 usage_id; ++ __u64 heap_info_gpu_addr; ++}; ++ ++enum base_external_resource_access { ++ BASE_EXT_RES_ACCESS_SHARED, ++ BASE_EXT_RES_ACCESS_EXCLUSIVE ++}; ++ ++struct base_external_resource { ++ __u64 ext_resource; ++}; ++ ++ ++/** ++ * The maximum number of external resources which can be mapped/unmapped ++ * in a single request. ++ */ ++#define BASE_EXT_RES_COUNT_MAX 10 ++ ++/** ++ * struct base_external_resource_list - Structure which describes a list of ++ * external resources. ++ * @count: The number of resources. ++ * @ext_res: Array of external resources which is ++ * sized at allocation time. ++ */ ++struct base_external_resource_list { ++ __u64 count; ++ struct base_external_resource ext_res[1]; ++}; ++ ++struct base_jd_debug_copy_buffer { ++ __u64 address; ++ __u64 size; ++ struct base_external_resource extres; ++}; ++ ++#define GPU_MAX_JOB_SLOTS 16 ++ ++/** ++ * User-side Base GPU Property Queries ++ * ++ * The User-side Base GPU Property Query interface encapsulates two ++ * sub-modules: ++ * ++ * - "Dynamic GPU Properties" ++ * - "Base Platform Config GPU Properties" ++ * ++ * Base only deals with properties that vary between different GPU ++ * implementations - the Dynamic GPU properties and the Platform Config ++ * properties. ++ * ++ * For properties that are constant for the GPU Architecture, refer to the ++ * GPU module. However, we will discuss their relevance here just to ++ * provide background information. ++ * ++ * About the GPU Properties in Base and GPU modules ++ * ++ * The compile-time properties (Platform Config, GPU Compile-time ++ * properties) are exposed as pre-processor macros. ++ * ++ * Complementing the compile-time properties are the Dynamic GPU ++ * Properties, which act as a conduit for the GPU Configuration ++ * Discovery. ++ * ++ * In general, the dynamic properties are present to verify that the platform ++ * has been configured correctly with the right set of Platform Config ++ * Compile-time Properties. ++ * ++ * As a consistent guide across the entire DDK, the choice for dynamic or ++ * compile-time should consider the following, in order: ++ * 1. Can the code be written so that it doesn't need to know the ++ * implementation limits at all? ++ * 2. If you need the limits, get the information from the Dynamic Property ++ * lookup. 
This should be done once as you fetch the context, and then cached ++ * as part of the context data structure, so it's cheap to access. ++ * 3. If there's a clear and arguable inefficiency in using Dynamic Properties, ++ * then use a Compile-Time Property (Platform Config, or GPU Compile-time ++ * property). Examples of where this might be sensible follow: ++ * - Part of a critical inner-loop ++ * - Frequent re-use throughout the driver, causing significant extra load ++ * instructions or control flow that would be worthwhile optimizing out. ++ * ++ * We cannot provide an exhaustive set of examples, neither can we provide a ++ * rule for every possible situation. Use common sense, and think about: what ++ * the rest of the driver will be doing; how the compiler might represent the ++ * value if it is a compile-time constant; whether an OEM shipping multiple ++ * devices would benefit much more from a single DDK binary, instead of ++ * insignificant micro-optimizations. ++ * ++ * Dynamic GPU Properties ++ * ++ * Dynamic GPU properties are presented in two sets: ++ * 1. the commonly used properties in @ref base_gpu_props, which have been ++ * unpacked from GPU register bitfields. ++ * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props ++ * (also a member of base_gpu_props). All of these are presented in ++ * the packed form, as presented by the GPU registers themselves. ++ * ++ * The raw properties in gpu_raw_gpu_props are necessary to ++ * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device ++ * behaving differently?". In this case, all information about the ++ * configuration is potentially useful, but it does not need to be processed ++ * by the driver. Instead, the raw registers can be processed by the Mali ++ * Tools software on the host PC. ++ * ++ * The properties returned extend the GPU Configuration Discovery ++ * registers. For example, GPU clock speed is not specified in the GPU ++ * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. ++ * ++ * The GPU properties are obtained by a call to ++ * base_get_gpu_props(). This simply returns a pointer to a const ++ * base_gpu_props structure. It is constant for the life of a base ++ * context. Multiple calls to base_get_gpu_props() to a base context ++ * return the same pointer to a constant structure. This avoids cache pollution ++ * of the common data. ++ * ++ * This pointer must not be freed, because it does not point to the start of a ++ * region allocated by the memory allocator; instead, just close the @ref ++ * base_context. ++ * ++ * ++ * Kernel Operation ++ * ++ * During Base Context Create time, user-side makes a single kernel call: ++ * - A call to fill user memory with GPU information structures ++ * ++ * The kernel-side will fill the provided the entire processed base_gpu_props ++ * structure, because this information is required in both ++ * user and kernel side; it does not make sense to decode it twice. ++ * ++ * Coherency groups must be derived from the bitmasks, but this can be done ++ * kernel side, and just once at kernel startup: Coherency groups must already ++ * be known kernel-side, to support chains that specify a 'Only Coherent Group' ++ * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. ++ * ++ * Coherency Group calculation ++ * ++ * Creation of the coherent group data is done at device-driver startup, and so ++ * is one-time. 
This will most likely involve a loop with CLZ, shifting, and ++ * bit clearing on the L2_PRESENT mask, depending on whether the ++ * system is L2 Coherent. The number of shader cores is done by a ++ * population count, since faulty cores may be disabled during production, ++ * producing a non-contiguous mask. ++ * ++ * The memory requirements for this algorithm can be determined either by a __u64 ++ * population count on the L2_PRESENT mask (a LUT helper already is ++ * required for the above), or simple assumption that there can be no more than ++ * 16 coherent groups, since core groups are typically 4 cores. ++ */ ++ ++#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 ++ ++#define BASE_MAX_COHERENT_GROUPS 16 ++/** ++ * struct mali_base_gpu_core_props - GPU core props info ++ * @product_id: Pro specific value. ++ * @version_status: Status of the GPU release. No defined values, but starts at ++ * 0 and increases by one for each release status (alpha, beta, EAC, etc.). ++ * 4 bit values (0-15). ++ * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" ++ * release number. ++ * 8 bit values (0-255). ++ * @major_revision: Major release number of the GPU. "R" part of an "RnPn" ++ * release number. ++ * 4 bit values (0-15). ++ * @padding: padding to allign to 8-byte ++ * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by ++ * clGetDeviceInfo() ++ * @log2_program_counter_size: Size of the shader program counter, in bits. ++ * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This ++ * is a bitpattern where a set bit indicates that the format is supported. ++ * Before using a texture format, it is recommended that the corresponding ++ * bit be checked. ++ * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. ++ * It is unlikely that a client will be able to allocate all of this memory ++ * for their own purposes, but this at least provides an upper bound on the ++ * memory available to the GPU. ++ * This is required for OpenCL's clGetDeviceInfo() call when ++ * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The ++ * client will not be expecting to allocate anywhere near this value. ++ * @num_exec_engines: The number of execution engines. ++ */ ++struct mali_base_gpu_core_props { ++ __u32 product_id; ++ __u16 version_status; ++ __u16 minor_revision; ++ __u16 major_revision; ++ __u16 padding; ++ __u32 gpu_freq_khz_max; ++ __u32 log2_program_counter_size; ++ __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ __u64 gpu_available_memory_size; ++ __u8 num_exec_engines; ++}; ++ ++/* ++ * More information is possible - but associativity and bus width are not ++ * required by upper-level apis. ++ */ ++struct mali_base_gpu_l2_cache_props { ++ __u8 log2_line_size; ++ __u8 log2_cache_size; ++ __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ ++ __u8 padding[5]; ++}; ++ ++struct mali_base_gpu_tiler_props { ++ __u32 bin_size_bytes; /* Max is 4*2^15 */ ++ __u32 max_active_levels; /* Max is 2^15 */ ++}; ++ ++/** ++ * struct mali_base_gpu_thread_props - GPU threading system details. ++ * @max_threads: Max. number of threads per core ++ * @max_workgroup_size: Max. number of threads per workgroup ++ * @max_barrier_size: Max. number of threads that can synchronize on a ++ * simple barrier ++ * @max_registers: Total size [1..65535] of the register file available ++ * per core. ++ * @max_task_queue: Max. tasks [1..255] which may be sent to a core ++ * before it becomes blocked. 
++ * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split ++ * field. ++ * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, ++ * 3 = SW Model/Emulation ++ * @padding: padding to allign to 8-byte ++ * @tls_alloc: Number of threads per core that TLS must be ++ * allocated for ++ */ ++struct mali_base_gpu_thread_props { ++ __u32 max_threads; ++ __u32 max_workgroup_size; ++ __u32 max_barrier_size; ++ __u16 max_registers; ++ __u8 max_task_queue; ++ __u8 max_thread_group_split; ++ __u8 impl_tech; ++ __u8 padding[3]; ++ __u32 tls_alloc; ++}; ++ ++/** ++ * struct mali_base_gpu_coherent_group - descriptor for a coherent group ++ * @core_mask: Core restriction mask required for the group ++ * @num_cores: Number of cores in the group ++ * @padding: padding to allign to 8-byte ++ * ++ * \c core_mask exposes all cores in that coherent group, and \c num_cores ++ * provides a cached population-count for that mask. ++ * ++ * @note Whilst all cores are exposed in the mask, not all may be available to ++ * the application, depending on the Kernel Power policy. ++ * ++ * @note if u64s must be 8-byte aligned, then this structure has 32-bits of ++ * wastage. ++ */ ++struct mali_base_gpu_coherent_group { ++ __u64 core_mask; ++ __u16 num_cores; ++ __u16 padding[3]; ++}; ++ ++/** ++ * struct mali_base_gpu_coherent_group_info - Coherency group information ++ * @num_groups: Number of coherent groups in the GPU. ++ * @num_core_groups: Number of core groups (coherent or not) in the GPU. ++ * Equivalent to the number of L2 Caches. ++ * The GPU Counter dumping writes 2048 bytes per core group, regardless ++ * of whether the core groups are coherent or not. Hence this member is ++ * needed to calculate how much memory is required for dumping. ++ * @note Do not use it to work out how many valid elements are in the ++ * group[] member. Use num_groups instead. ++ * @coherency: Coherency features of the memory, accessed by gpu_mem_features ++ * methods ++ * @padding: padding to allign to 8-byte ++ * @group: Descriptors of coherent groups ++ * ++ * Note that the sizes of the members could be reduced. However, the \c group ++ * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte ++ * aligned, thus leading to wastage if the other members sizes were reduced. ++ * ++ * The groups are sorted by core mask. The core masks are non-repeating and do ++ * not intersect. ++ */ ++struct mali_base_gpu_coherent_group_info { ++ __u32 num_groups; ++ __u32 num_core_groups; ++ __u32 coherency; ++ __u32 padding; ++ struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; ++}; ++ ++/** ++ * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware ++ * Configuration Discovery registers. ++ * @shader_present: Shader core present bitmap ++ * @tiler_present: Tiler core present bitmap ++ * @l2_present: Level 2 cache present bitmap ++ * @stack_present: Core stack present bitmap ++ * @l2_features: L2 features ++ * @core_features: Core features ++ * @mem_features: Mem features ++ * @mmu_features: Mmu features ++ * @as_present: Bitmap of address spaces present ++ * @js_present: Job slots present ++ * @js_features: Array of job slot features. 
++ * @tiler_features: Tiler features ++ * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU ++ * @gpu_id: GPU and revision identifier ++ * @thread_max_threads: Maximum number of threads per core ++ * @thread_max_workgroup_size: Maximum number of threads per workgroup ++ * @thread_max_barrier_size: Maximum number of threads per barrier ++ * @thread_features: Thread features ++ * @coherency_mode: Note: This is the _selected_ coherency mode rather than the ++ * available modes as exposed in the coherency_features register ++ * @thread_tls_alloc: Number of threads per core that TLS must be allocated for ++ * @gpu_features: GPU features ++ * ++ * The information is presented inefficiently for access. For frequent access, ++ * the values should be better expressed in an unpacked form in the ++ * base_gpu_props structure. ++ * ++ * The raw properties in gpu_raw_gpu_props are necessary to ++ * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device ++ * behaving differently?". In this case, all information about the ++ * configuration is potentially useful, but it does not need to be processed ++ * by the driver. Instead, the raw registers can be processed by the Mali ++ * Tools software on the host PC. ++ * ++ */ ++struct gpu_raw_gpu_props { ++ __u64 shader_present; ++ __u64 tiler_present; ++ __u64 l2_present; ++ __u64 stack_present; ++ __u32 l2_features; ++ __u32 core_features; ++ __u32 mem_features; ++ __u32 mmu_features; ++ ++ __u32 as_present; ++ ++ __u32 js_present; ++ __u32 js_features[GPU_MAX_JOB_SLOTS]; ++ __u32 tiler_features; ++ __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ ++ __u32 gpu_id; ++ ++ __u32 thread_max_threads; ++ __u32 thread_max_workgroup_size; ++ __u32 thread_max_barrier_size; ++ __u32 thread_features; ++ ++ /* ++ * Note: This is the _selected_ coherency mode rather than the ++ * available modes as exposed in the coherency_features register. ++ */ ++ __u32 coherency_mode; ++ ++ __u32 thread_tls_alloc; ++ __u64 gpu_features; ++}; ++ ++/** ++ * struct base_gpu_props - Return structure for base_get_gpu_props(). ++ * @core_props: Core props. ++ * @l2_props: L2 props. ++ * @unused_1: Keep for backwards compatibility. ++ * @tiler_props: Tiler props. ++ * @thread_props: Thread props. ++ * @raw_props: This member is large, likely to be 128 bytes. ++ * @coherency_info: This must be last member of the structure. ++ * ++ * NOTE: the raw_props member in this data structure contains the register ++ * values from which the value of the other members are derived. The derived ++ * members exist to allow for efficient access and/or shielding the details ++ * of the layout of the registers. ++ */ ++struct base_gpu_props { ++ struct mali_base_gpu_core_props core_props; ++ struct mali_base_gpu_l2_cache_props l2_props; ++ __u64 unused_1; ++ struct mali_base_gpu_tiler_props tiler_props; ++ struct mali_base_gpu_thread_props thread_props; ++ struct gpu_raw_gpu_props raw_props; ++ struct mali_base_gpu_coherent_group_info coherency_info; ++}; ++ ++#define BASE_MEM_GROUP_ID_GET(flags) \ ++ ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) ++ ++#define BASE_MEM_GROUP_ID_SET(id) \ ++ (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? 
\ ++ BASE_MEM_GROUP_DEFAULT : \ ++ id) \ ++ << BASEP_MEM_GROUP_ID_SHIFT) & \ ++ BASE_MEM_GROUP_ID_MASK) ++ ++#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ ++ (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ ++ ((base_context_create_flags)(group_id) \ ++ << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) ++ ++#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ ++ ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ ++ BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) ++ ++/* ++ * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These ++ * flags are also used, where applicable, for specifying which fields ++ * are valid following the request operation. ++ */ ++ ++/* For monotonic (counter) timefield */ ++#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) ++/* For system wide timestamp */ ++#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) ++/* For GPU cycle counter */ ++#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) ++/* Specify kernel GPU register timestamp */ ++#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) ++/* Specify userspace cntvct_el0 timestamp source */ ++#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) ++ ++#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ ++ BASE_TIMEINFO_MONOTONIC_FLAG | \ ++ BASE_TIMEINFO_TIMESTAMP_FLAG | \ ++ BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ ++ BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ ++ BASE_TIMEINFO_USER_SOURCE_FLAG) ++ ++/* Maximum number of source allocations allowed to create an alias allocation. ++ * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array ++ * layers, since each cube map in the array will have 6 faces. ++ */ ++#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) ++ ++#endif /* _UAPI_BASE_KERNEL_H_ */ +diff --git a/src/panfrost/csf_test/mali_gpu_csf_registers.h b/src/panfrost/csf_test/mali_gpu_csf_registers.h +new file mode 100644 +index 00000000000..17e338cb238 +--- /dev/null ++++ b/src/panfrost/csf_test/mali_gpu_csf_registers.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++/* ++ * This header was originally autogenerated, but it is now ok (and ++ * expected) to have to add to it. ++ */ ++ ++#ifndef _UAPI_GPU_CSF_REGISTERS_H_ ++#define _UAPI_GPU_CSF_REGISTERS_H_ ++ ++/* Only user block defines are included. 
HI words have been removed */ ++ ++/* CS_USER_INPUT_BLOCK register offsets */ ++#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ ++#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ ++ ++/* CS_USER_OUTPUT_BLOCK register offsets */ ++#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ ++#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ ++ ++/* USER register offsets */ ++#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ ++ ++#endif +diff --git a/src/panfrost/csf_test/mali_kbase_csf_ioctl.h b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h +new file mode 100644 +index 00000000000..3df8a01699f +--- /dev/null ++++ b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h +@@ -0,0 +1,483 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_KBASE_CSF_IOCTL_H_ ++#define _UAPI_KBASE_CSF_IOCTL_H_ ++ ++#include ++#include ++ ++/* ++ * 1.0: ++ * - CSF IOCTL header separated from JM ++ * 1.1: ++ * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME ++ * - Add ioctl 54: This controls the priority setting. ++ * 1.2: ++ * - Add new CSF GPU_FEATURES register into the property structure ++ * returned by KBASE_IOCTL_GET_GPUPROPS ++ * 1.3: ++ * - Add __u32 group_uid member to ++ * &struct_kbase_ioctl_cs_queue_group_create.out ++ * 1.4: ++ * - Replace padding in kbase_ioctl_cs_get_glb_iface with ++ * instr_features member of same size ++ * 1.5: ++ * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new ++ * queue registration call with extended format for supporting CS ++ * trace configurations with CSF trace_command. ++ * 1.6: ++ * - Added new HW performance counters interface to all GPUs. 
++ * 1.7: ++ * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use ++ * 1.8: ++ * - Removed Kernel legacy HWC interface ++ */ ++ ++#define BASE_UK_VERSION_MAJOR 1 ++#define BASE_UK_VERSION_MINOR 8 ++ ++/** ++ * struct kbase_ioctl_version_check - Check version compatibility between ++ * kernel and userspace ++ * ++ * @major: Major version number ++ * @minor: Minor version number ++ */ ++struct kbase_ioctl_version_check { ++ __u16 major; ++ __u16 minor; ++}; ++ ++#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ ++ _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) ++ ++ ++/** ++ * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the ++ * base back-end ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @buffer_size: Size of the buffer in bytes ++ * @priority: Priority of the queue within a group when run within a process ++ * @padding: Currently unused, must be zero ++ * ++ * @Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. ++ * Any change of this struct should also be mirrored to the latter. ++ */ ++struct kbase_ioctl_cs_queue_register { ++ __u64 buffer_gpu_addr; ++ __u32 buffer_size; ++ __u8 priority; ++ __u8 padding[3]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_REGISTER \ ++ _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) ++ ++/** ++ * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler ++ * to notify that a queue has been updated ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ */ ++struct kbase_ioctl_cs_queue_kick { ++ __u64 buffer_gpu_addr; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_KICK \ ++ _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) ++ ++/** ++ * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group ++ * ++ * @in: Input parameters ++ * @in.buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @in.group_handle: Handle of the group to which the queue should be bound ++ * @in.csi_index: Index of the CSF interface the queue should be bound to ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.mmap_handle: Handle to be used for creating the mapping of CS ++ * input/output pages ++ */ ++union kbase_ioctl_cs_queue_bind { ++ struct { ++ __u64 buffer_gpu_addr; ++ __u8 group_handle; ++ __u8 csi_index; ++ __u8 padding[6]; ++ } in; ++ struct { ++ __u64 mmap_handle; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_BIND \ ++ _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) ++ ++/** ++ * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the ++ * base back-end in extended format, ++ * involving trace buffer configuration ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @buffer_size: Size of the buffer in bytes ++ * @priority: Priority of the queue within a group when run within a process ++ * @padding: Currently unused, must be zero ++ * @ex_offset_var_addr: GPU address of the trace buffer write offset variable ++ * @ex_buffer_base: Trace buffer GPU base address for the queue ++ * @ex_buffer_size: Size of the trace buffer in bytes ++ * @ex_event_size: Trace event write size, in log2 designation ++ * @ex_event_state: Trace event states configuration ++ * @ex_padding: Currently unused, must be zero ++ * ++ * @Note: There is an identical sub-section at the start of this struct to that ++ * of @ref kbase_ioctl_cs_queue_register. Any change of this sub-section ++ * must also be mirrored to the latter. 
Following the said sub-section, ++ * the remaining fields forms the extension, marked with ex_*. ++ */ ++struct kbase_ioctl_cs_queue_register_ex { ++ __u64 buffer_gpu_addr; ++ __u32 buffer_size; ++ __u8 priority; ++ __u8 padding[3]; ++ __u64 ex_offset_var_addr; ++ __u64 ex_buffer_base; ++ __u32 ex_buffer_size; ++ __u8 ex_event_size; ++ __u8 ex_event_state; ++ __u8 ex_padding[2]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ ++ _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) ++ ++/** ++ * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ */ ++struct kbase_ioctl_cs_queue_terminate { ++ __u64 buffer_gpu_addr; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ ++ _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) ++ ++/** ++ * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue ++ * group ++ * @in: Input parameters ++ * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. ++ * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. ++ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. ++ * @in.cs_min: Minimum number of CSs required. ++ * @in.priority: Queue group's priority within a process. ++ * @in.tiler_max: Maximum number of tiler endpoints the group is allowed ++ * to use. ++ * @in.fragment_max: Maximum number of fragment endpoints the group is ++ * allowed to use. ++ * @in.compute_max: Maximum number of compute endpoints the group is allowed ++ * to use. ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.group_handle: Handle of a newly created queue group. ++ * @out.padding: Currently unused, must be zero ++ * @out.group_uid: UID of the queue group available to base. ++ */ ++union kbase_ioctl_cs_queue_group_create_1_6 { ++ struct { ++ __u64 tiler_mask; ++ __u64 fragment_mask; ++ __u64 compute_mask; ++ __u8 cs_min; ++ __u8 priority; ++ __u8 tiler_max; ++ __u8 fragment_max; ++ __u8 compute_max; ++ __u8 padding[3]; ++ ++ } in; ++ struct { ++ __u8 group_handle; ++ __u8 padding[3]; ++ __u32 group_uid; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ ++ _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) ++ ++/** ++ * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group ++ * @in: Input parameters ++ * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. ++ * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. ++ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. ++ * @in.cs_min: Minimum number of CSs required. ++ * @in.priority: Queue group's priority within a process. ++ * @in.tiler_max: Maximum number of tiler endpoints the group is allowed ++ * to use. ++ * @in.fragment_max: Maximum number of fragment endpoints the group is ++ * allowed to use. ++ * @in.compute_max: Maximum number of compute endpoints the group is allowed ++ * to use. ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.group_handle: Handle of a newly created queue group. ++ * @out.padding: Currently unused, must be zero ++ * @out.group_uid: UID of the queue group available to base. 
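Pulling the queue ioctls together, a rough sketch of the expected user-space sequence: create a group (the union body follows just below), then register and bind one queue to it. The device fd, ring-buffer address, endpoint masks and priority are placeholder assumptions, not values mandated by the interface:

#include <sys/ioctl.h>
#include "mali_kbase_ioctl.h"
#include "mali_kbase_csf_ioctl.h"
#include "mali_base_csf_kernel.h"

static int example_setup_queue(int fd, __u64 ring_va, __u32 ring_size)
{
        union kbase_ioctl_cs_queue_group_create create = { 0 };
        struct kbase_ioctl_cs_queue_register reg = { 0 };
        union kbase_ioctl_cs_queue_bind bind = { 0 };

        create.in.tiler_mask = create.in.fragment_mask = create.in.compute_mask = ~0ULL;
        create.in.cs_min = 1;
        create.in.priority = BASE_QUEUE_GROUP_PRIORITY_MEDIUM;
        create.in.tiler_max = create.in.fragment_max = create.in.compute_max = 64;
        if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE, &create))
                return -1;

        reg.buffer_gpu_addr = ring_va;
        reg.buffer_size = ring_size;
        if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_REGISTER, &reg))
                return -1;

        bind.in.buffer_gpu_addr = ring_va;
        bind.in.group_handle = create.out.group_handle;
        bind.in.csi_index = 0;
        if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind))
                return -1;

        /* bind.out.mmap_handle is then used with mmap() to map the
         * BASEP_QUEUE_NR_MMAP_USER_PAGES input/output and doorbell pages. */
        return 0;
}
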
++ */ ++union kbase_ioctl_cs_queue_group_create { ++ struct { ++ __u64 tiler_mask; ++ __u64 fragment_mask; ++ __u64 compute_mask; ++ __u8 cs_min; ++ __u8 priority; ++ __u8 tiler_max; ++ __u8 fragment_max; ++ __u8 compute_max; ++ __u8 padding[3]; ++ __u64 reserved; ++ } in; ++ struct { ++ __u8 group_handle; ++ __u8 padding[3]; ++ __u32 group_uid; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ ++ _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) ++ ++/** ++ * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group ++ * ++ * @group_handle: Handle of the queue group to be terminated ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_cs_queue_group_term { ++ __u8 group_handle; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ ++ _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) ++ ++#define KBASE_IOCTL_CS_EVENT_SIGNAL \ ++ _IO(KBASE_IOCTL_TYPE, 44) ++ ++typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue ++ * ++ * @id: ID of the new command queue returned by the kernel ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_new { ++ base_kcpu_queue_id id; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ ++ _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue ++ * ++ * @id: ID of the command queue to be destroyed ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_delete { ++ base_kcpu_queue_id id; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ ++ _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue ++ * ++ * @addr: Memory address of an array of struct base_kcpu_queue_command ++ * @nr_commands: Number of commands in the array ++ * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_enqueue { ++ __u64 addr; ++ __u32 nr_commands; ++ base_kcpu_queue_id id; ++ __u8 padding[3]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ ++ _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) ++ ++/** ++ * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap ++ * @in: Input parameters ++ * @in.chunk_size: Size of each chunk. ++ * @in.initial_chunks: Initial number of chunks that heap will be created with. ++ * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. ++ * @in.target_in_flight: Number of render-passes that the driver should attempt to ++ * keep in flight for which allocation of new chunks is ++ * allowed. ++ * @in.group_id: Group ID to be used for physical allocations. ++ * @in.padding: Padding ++ * @out: Output parameters ++ * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up ++ * for the heap. ++ * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, ++ * actually points to the header of heap chunk and not to ++ * the low address of free memory in the chunk. 
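Before the tiler-heap interface below, a companion sketch for the KCPU queue ioctls above: create a software queue, then enqueue a single CQS_SET command built from the structures in mali_base_csf_kernel.h. The GPU address is a placeholder and error handling is minimal:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include "mali_kbase_ioctl.h"
#include "mali_kbase_csf_ioctl.h"
#include "mali_base_csf_kernel.h"

static int example_kcpu_cqs_set(int fd, __u64 cqs_gpu_va)
{
        struct kbase_ioctl_kcpu_queue_new queue = { 0 };
        struct base_cqs_set obj = { .addr = cqs_gpu_va };
        struct base_kcpu_command cmd;
        struct kbase_ioctl_kcpu_queue_enqueue enq = { 0 };

        if (ioctl(fd, KBASE_IOCTL_KCPU_QUEUE_CREATE, &queue))
                return -1;

        memset(&cmd, 0, sizeof(cmd));   /* padding must be zero */
        cmd.type = BASE_KCPU_COMMAND_TYPE_CQS_SET;
        cmd.info.cqs_set.objs = (__u64)(uintptr_t)&obj;  /* user pointer as u64 */
        cmd.info.cqs_set.nr_objs = 1;

        enq.addr = (__u64)(uintptr_t)&cmd;
        enq.nr_commands = 1;
        enq.id = queue.id;
        return ioctl(fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enq);
}
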
++ */ ++union kbase_ioctl_cs_tiler_heap_init { ++ struct { ++ __u32 chunk_size; ++ __u32 initial_chunks; ++ __u32 max_chunks; ++ __u16 target_in_flight; ++ __u8 group_id; ++ __u8 padding; ++ } in; ++ struct { ++ __u64 gpu_heap_va; ++ __u64 first_chunk_va; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ ++ _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) ++ ++/** ++ * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap ++ * instance ++ * ++ * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. ++ */ ++struct kbase_ioctl_cs_tiler_heap_term { ++ __u64 gpu_heap_va; ++}; ++ ++#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ ++ _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) ++ ++/** ++ * union kbase_ioctl_cs_get_glb_iface - Request the global control block ++ * of CSF interface capabilities ++ * ++ * @in: Input parameters ++ * @in.max_group_num: The maximum number of groups to be read. Can be 0, in ++ * which case groups_ptr is unused. ++ * @in.max_total_stream _num: The maximum number of CSs to be read. Can be 0, in ++ * which case streams_ptr is unused. ++ * @in.groups_ptr: Pointer where to store all the group data (sequentially). ++ * @in.streams_ptr: Pointer where to store all the CS data (sequentially). ++ * @out: Output parameters ++ * @out.glb_version: Global interface version. ++ * @out.features: Bit mask of features (e.g. whether certain types of job ++ * can be suspended). ++ * @out.group_num: Number of CSGs supported. ++ * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 ++ * hold the size of firmware performance counter data ++ * and 15:0 hold the size of hardware performance counter ++ * data. ++ * @out.total_stream_num: Total number of CSs, summed across all groups. ++ * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum ++ * size of events. Bits 3:0 hold the offset update rate. 
++ * (csf >= 1.1.0) ++ * ++ */ ++union kbase_ioctl_cs_get_glb_iface { ++ struct { ++ __u32 max_group_num; ++ __u32 max_total_stream_num; ++ __u64 groups_ptr; ++ __u64 streams_ptr; ++ } in; ++ struct { ++ __u32 glb_version; ++ __u32 features; ++ __u32 group_num; ++ __u32 prfcnt_size; ++ __u32 total_stream_num; ++ __u32 instr_features; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_GET_GLB_IFACE \ ++ _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) ++ ++struct kbase_ioctl_cs_cpu_queue_info { ++ __u64 buffer; ++ __u64 size; ++}; ++ ++#define KBASE_IOCTL_VERSION_CHECK \ ++ _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) ++ ++#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ ++ _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) ++ ++/*************** ++ * test ioctls * ++ ***************/ ++#if MALI_UNIT_TEST ++/* These ioctls are purely for test purposes and are not used in the production ++ * driver, they therefore may change without notice ++ */ ++ ++/** ++ * struct kbase_ioctl_cs_event_memory_write - Write an event memory address ++ * @cpu_addr: Memory address to write ++ * @value: Value to write ++ * @padding: Currently unused, must be zero ++ */ ++struct kbase_ioctl_cs_event_memory_write { ++ __u64 cpu_addr; ++ __u8 value; ++ __u8 padding[7]; ++}; ++ ++/** ++ * union kbase_ioctl_cs_event_memory_read - Read an event memory address ++ * @in: Input parameters ++ * @in.cpu_addr: Memory address to read ++ * @out: Output parameters ++ * @out.value: Value read ++ * @out.padding: Currently unused, must be zero ++ */ ++union kbase_ioctl_cs_event_memory_read { ++ struct { ++ __u64 cpu_addr; ++ } in; ++ struct { ++ __u8 value; ++ __u8 padding[7]; ++ } out; ++}; ++ ++#endif /* MALI_UNIT_TEST */ ++ ++#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ +diff --git a/src/panfrost/csf_test/mali_kbase_ioctl.h b/src/panfrost/csf_test/mali_kbase_ioctl.h +new file mode 100644 +index 00000000000..fc81b71b46a +--- /dev/null ++++ b/src/panfrost/csf_test/mali_kbase_ioctl.h +@@ -0,0 +1,854 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2017-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. 
++ * ++ */ ++ ++#ifndef _UAPI_KBASE_IOCTL_H_ ++#define _UAPI_KBASE_IOCTL_H_ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#include <asm/ioctl.h> ++#include <linux/types.h> ++ ++#define KBASE_IOCTL_TYPE 0x80 ++ ++/** ++ * struct kbase_ioctl_set_flags - Set kernel context creation flags ++ * ++ * @create_flags: Flags - see base_context_create_flags ++ */ ++struct kbase_ioctl_set_flags { ++ __u32 create_flags; ++}; ++ ++#define KBASE_IOCTL_SET_FLAGS \ ++ _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) ++ ++/** ++ * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel ++ * ++ * @buffer: Pointer to the buffer to store properties into ++ * @size: Size of the buffer ++ * @flags: Flags - must be zero for now ++ * ++ * The ioctl will return the number of bytes stored into @buffer or an error ++ * on failure (e.g. @size is too small). If @size is specified as 0 then no ++ * data will be written but the return value will be the number of bytes needed ++ * for all the properties. ++ * ++ * @flags may be used in the future to request a different format for the ++ * buffer. With @flags == 0 the following format is used. ++ * ++ * The buffer will be filled with pairs of values, a __u32 key identifying the ++ * property followed by the value. The size of the value is identified using ++ * the bottom bits of the key. The value then immediately follows the key and ++ * is tightly packed (there is no padding). All keys and values are ++ * little-endian. ++ * ++ * 00 = __u8 ++ * 01 = __u16 ++ * 10 = __u32 ++ * 11 = __u64 ++ */ ++struct kbase_ioctl_get_gpuprops { ++ __u64 buffer; ++ __u32 size; ++ __u32 flags; ++}; ++ ++#define KBASE_IOCTL_GET_GPUPROPS \ ++ _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) ++ ++/** ++ * union kbase_ioctl_mem_alloc - Allocate memory on the GPU ++ * @in: Input parameters ++ * @in.va_pages: The number of pages of virtual address space to reserve ++ * @in.commit_pages: The number of physical pages to allocate ++ * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region ++ * @in.flags: Flags ++ * @out: Output parameters ++ * @out.flags: Flags ++ * @out.gpu_va: The GPU virtual address which is allocated ++ */ ++union kbase_ioctl_mem_alloc { ++ struct { ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u64 flags; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_ALLOC \ ++ _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) ++ ++/** ++ * union kbase_ioctl_mem_query - Query properties of a GPU memory region ++ * @in: Input parameters ++ * @in.gpu_addr: A GPU address contained within the region ++ * @in.query: The type of query ++ * @out: Output parameters ++ * @out.value: The result of the query ++ * ++ * Use a %KBASE_MEM_QUERY_xxx flag as input for @query.
++ */ ++union kbase_ioctl_mem_query { ++ struct { ++ __u64 gpu_addr; ++ __u64 query; ++ } in; ++ struct { ++ __u64 value; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_QUERY \ ++ _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) ++ ++#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) ++#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) ++#define KBASE_MEM_QUERY_FLAGS ((__u64)3) ++ ++/** ++ * struct kbase_ioctl_mem_free - Free a memory region ++ * @gpu_addr: Handle to the region to free ++ */ ++struct kbase_ioctl_mem_free { ++ __u64 gpu_addr; ++}; ++ ++#define KBASE_IOCTL_MEM_FREE \ ++ _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) ++ ++/** ++ * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader ++ * @buffer_count: requested number of dumping buffers ++ * @fe_bm: counters selection bitmask (Front end) ++ * @shader_bm: counters selection bitmask (Shader) ++ * @tiler_bm: counters selection bitmask (Tiler) ++ * @mmu_l2_bm: counters selection bitmask (MMU_L2) ++ * ++ * A fd is returned from the ioctl if successful, or a negative value on error ++ */ ++struct kbase_ioctl_hwcnt_reader_setup { ++ __u32 buffer_count; ++ __u32 fe_bm; ++ __u32 shader_bm; ++ __u32 tiler_bm; ++ __u32 mmu_l2_bm; ++}; ++ ++#define KBASE_IOCTL_HWCNT_READER_SETUP \ ++ _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) ++ ++/** ++ * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. ++ * @data: Counter samples for the dummy model. ++ * @size: Size of the counter sample data. ++ * @padding: Padding. ++ */ ++struct kbase_ioctl_hwcnt_values { ++ __u64 data; ++ __u32 size; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_HWCNT_SET \ ++ _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) ++ ++/** ++ * struct kbase_ioctl_disjoint_query - Query the disjoint counter ++ * @counter: A counter of disjoint events in the kernel ++ */ ++struct kbase_ioctl_disjoint_query { ++ __u32 counter; ++}; ++ ++#define KBASE_IOCTL_DISJOINT_QUERY \ ++ _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) ++ ++/** ++ * struct kbase_ioctl_get_ddk_version - Query the kernel version ++ * @version_buffer: Buffer to receive the kernel version string ++ * @size: Size of the buffer ++ * @padding: Padding ++ * ++ * The ioctl will return the number of bytes written into version_buffer ++ * (which includes a NULL byte) or a negative error code ++ * ++ * The ioctl request code has to be _IOW because the data in ioctl struct is ++ * being copied to the kernel, even though the kernel then writes out the ++ * version info to the buffer specified in the ioctl. ++ */ ++struct kbase_ioctl_get_ddk_version { ++ __u64 version_buffer; ++ __u32 size; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_GET_DDK_VERSION \ ++ _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory ++ * allocator (between kernel driver ++ * version 10.2--11.4) ++ * @va_pages: Number of VA pages to reserve for JIT ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. ++ * ++ * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for ++ * backwards compatibility. 
++ */ ++struct kbase_ioctl_mem_jit_init_10_2 { ++ __u64 va_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory ++ * allocator (between kernel driver ++ * version 11.5--11.19) ++ * @va_pages: Number of VA pages to reserve for JIT ++ * @max_allocations: Maximum number of concurrent allocations ++ * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) ++ * @group_id: Group ID to be used for physical allocations ++ * @padding: Currently unused, must be zero ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. ++ * ++ * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for ++ * backwards compatibility. ++ */ ++struct kbase_ioctl_mem_jit_init_11_5 { ++ __u64 va_pages; ++ __u8 max_allocations; ++ __u8 trim_level; ++ __u8 group_id; ++ __u8 padding[5]; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory ++ * allocator ++ * @va_pages: Number of GPU virtual address pages to reserve for just-in-time ++ * memory allocations ++ * @max_allocations: Maximum number of concurrent allocations ++ * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) ++ * @group_id: Group ID to be used for physical allocations ++ * @padding: Currently unused, must be zero ++ * @phys_pages: Maximum number of physical pages to allocate just-in-time ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. ++ */ ++struct kbase_ioctl_mem_jit_init { ++ __u64 va_pages; ++ __u8 max_allocations; ++ __u8 trim_level; ++ __u8 group_id; ++ __u8 padding[5]; ++ __u64 phys_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) ++ ++/** ++ * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory ++ * ++ * @handle: GPU memory handle (GPU VA) ++ * @user_addr: The address where it is mapped in user space ++ * @size: The number of bytes to synchronise ++ * @type: The direction to synchronise: 0 is sync to memory (clean), ++ * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. 
++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_mem_sync { ++ __u64 handle; ++ __u64 user_addr; ++ __u64 size; ++ __u8 type; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_MEM_SYNC \ ++ _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) ++ ++/** ++ * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer ++ * ++ * @in: Input parameters ++ * @in.gpu_addr: The GPU address of the memory region ++ * @in.cpu_addr: The CPU address to locate ++ * @in.size: A size in bytes to validate is contained within the region ++ * @out: Output parameters ++ * @out.offset: The offset from the start of the memory region to @cpu_addr ++ */ ++union kbase_ioctl_mem_find_cpu_offset { ++ struct { ++ __u64 gpu_addr; ++ __u64 cpu_addr; ++ __u64 size; ++ } in; ++ struct { ++ __u64 offset; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ ++ _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) ++ ++/** ++ * struct kbase_ioctl_get_context_id - Get the kernel context ID ++ * ++ * @id: The kernel context ID ++ */ ++struct kbase_ioctl_get_context_id { ++ __u32 id; ++}; ++ ++#define KBASE_IOCTL_GET_CONTEXT_ID \ ++ _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) ++ ++/** ++ * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd ++ * ++ * @flags: Flags ++ * ++ * The ioctl returns a file descriptor when successful ++ */ ++struct kbase_ioctl_tlstream_acquire { ++ __u32 flags; ++}; ++ ++#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ ++ _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) ++ ++#define KBASE_IOCTL_TLSTREAM_FLUSH \ ++ _IO(KBASE_IOCTL_TYPE, 19) ++ ++/** ++ * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region ++ * ++ * @gpu_addr: The memory region to modify ++ * @pages: The number of physical pages that should be present ++ * ++ * The ioctl may return on the following error codes or 0 for success: ++ * -ENOMEM: Out of memory ++ * -EINVAL: Invalid arguments ++ */ ++struct kbase_ioctl_mem_commit { ++ __u64 gpu_addr; ++ __u64 pages; ++}; ++ ++#define KBASE_IOCTL_MEM_COMMIT \ ++ _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) ++ ++/** ++ * union kbase_ioctl_mem_alias - Create an alias of memory regions ++ * @in: Input parameters ++ * @in.flags: Flags, see BASE_MEM_xxx ++ * @in.stride: Bytes between start of each memory region ++ * @in.nents: The number of regions to pack together into the alias ++ * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info ++ * @out: Output parameters ++ * @out.flags: Flags, see BASE_MEM_xxx ++ * @out.gpu_va: Address of the new alias ++ * @out.va_pages: Size of the new alias ++ */ ++union kbase_ioctl_mem_alias { ++ struct { ++ __u64 flags; ++ __u64 stride; ++ __u64 nents; ++ __u64 aliasing_info; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ __u64 va_pages; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_ALIAS \ ++ _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) ++ ++/** ++ * union kbase_ioctl_mem_import - Import memory for use by the GPU ++ * @in: Input parameters ++ * @in.flags: Flags, see BASE_MEM_xxx ++ * @in.phandle: Handle to the external memory ++ * @in.type: Type of external memory, see base_mem_import_type ++ * @in.padding: Amount of extra VA pages to append to the imported buffer ++ * @out: Output parameters ++ * @out.flags: Flags, see BASE_MEM_xxx ++ * @out.gpu_va: Address of the new alias ++ * @out.va_pages: Size of the new alias ++ */ ++union kbase_ioctl_mem_import { ++ struct { ++ 
__u64 flags; ++ __u64 phandle; ++ __u32 type; ++ __u32 padding; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ __u64 va_pages; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_IMPORT \ ++ _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) ++ ++/** ++ * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region ++ * @gpu_va: The GPU region to modify ++ * @flags: The new flags to set ++ * @mask: Mask of the flags to modify ++ */ ++struct kbase_ioctl_mem_flags_change { ++ __u64 gpu_va; ++ __u64 flags; ++ __u64 mask; ++}; ++ ++#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ ++ _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) ++ ++/** ++ * struct kbase_ioctl_stream_create - Create a synchronisation stream ++ * @name: A name to identify this stream. Must be NULL-terminated. ++ * ++ * Note that this is also called a "timeline", but is named stream to avoid ++ * confusion with other uses of the word. ++ * ++ * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. ++ * ++ * The ioctl returns a file descriptor. ++ */ ++struct kbase_ioctl_stream_create { ++ char name[32]; ++}; ++ ++#define KBASE_IOCTL_STREAM_CREATE \ ++ _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) ++ ++/** ++ * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence ++ * @fd: The file descriptor to validate ++ */ ++struct kbase_ioctl_fence_validate { ++ int fd; ++}; ++ ++#define KBASE_IOCTL_FENCE_VALIDATE \ ++ _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) ++ ++/** ++ * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel ++ * @buffer: Pointer to the information ++ * @len: Length ++ * @padding: Padding ++ * ++ * The data provided is accessible through a debugfs file ++ */ ++struct kbase_ioctl_mem_profile_add { ++ __u64 buffer; ++ __u32 len; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_MEM_PROFILE_ADD \ ++ _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) ++ ++/** ++ * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource ++ * @count: Number of resources ++ * @address: Array of __u64 GPU addresses of the external resources to map ++ */ ++struct kbase_ioctl_sticky_resource_map { ++ __u64 count; ++ __u64 address; ++}; ++ ++#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ ++ _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) ++ ++/** ++ * struct kbase_ioctl_sticky_resource_map - Unmap a resource mapped which was ++ * previously permanently mapped ++ * @count: Number of resources ++ * @address: Array of __u64 GPU addresses of the external resources to unmap ++ */ ++struct kbase_ioctl_sticky_resource_unmap { ++ __u64 count; ++ __u64 address; ++}; ++ ++#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ ++ _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) ++ ++/** ++ * union kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of ++ * the GPU memory region for ++ * the given gpu address and ++ * the offset of that address ++ * into the region ++ * @in: Input parameters ++ * @in.gpu_addr: GPU virtual address ++ * @in.size: Size in bytes within the region ++ * @out: Output parameters ++ * @out.start: Address of the beginning of the memory region enclosing @gpu_addr ++ * for the length of @offset bytes ++ * @out.offset: The offset from the start of the memory region to @gpu_addr ++ */ ++union kbase_ioctl_mem_find_gpu_start_and_offset { ++ struct { ++ __u64 gpu_addr; ++ __u64 size; ++ } in; ++ struct { ++ __u64 start; ++ __u64 offset; ++ } out; ++}; 
++ ++#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ ++ _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) ++ ++#define KBASE_IOCTL_CINSTR_GWT_START \ ++ _IO(KBASE_IOCTL_TYPE, 33) ++ ++#define KBASE_IOCTL_CINSTR_GWT_STOP \ ++ _IO(KBASE_IOCTL_TYPE, 34) ++ ++/** ++ * union kbase_ioctl_gwt_dump - Used to collect all GPU write fault addresses. ++ * @in: Input parameters ++ * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. ++ * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) ++ * @in.len: Number of addresses the buffers can hold. ++ * @in.padding: padding ++ * @out: Output parameters ++ * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. ++ * @out.more_data_available: Status indicating if more addresses are available. ++ * @out.padding: padding ++ * ++ * This structure is used when performing a call to dump GPU write fault ++ * addresses. ++ */ ++union kbase_ioctl_cinstr_gwt_dump { ++ struct { ++ __u64 addr_buffer; ++ __u64 size_buffer; ++ __u32 len; ++ __u32 padding; ++ ++ } in; ++ struct { ++ __u32 no_of_addr_collected; ++ __u8 more_data_available; ++ __u8 padding[27]; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CINSTR_GWT_DUMP \ ++ _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) ++ ++/** ++ * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone ++ * ++ * @va_pages: Number of VA pages to reserve for EXEC_VA ++ */ ++struct kbase_ioctl_mem_exec_init { ++ __u64 va_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_EXEC_INIT \ ++ _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) ++ ++/** ++ * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of ++ * cpu/gpu time (counter values) ++ * @in: Input parameters ++ * @in.request_flags: Bit-flags indicating the requested types. ++ * @in.paddings: Unused, size alignment matching the out. ++ * @out: Output parameters ++ * @out.sec: Integer field of the monotonic time, unit in seconds. ++ * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. ++ * @out.padding: Unused, for __u64 alignment ++ * @out.timestamp: System wide timestamp (counter) value. ++ * @out.cycle_counter: GPU cycle counter value. ++ */ ++union kbase_ioctl_get_cpu_gpu_timeinfo { ++ struct { ++ __u32 request_flags; ++ __u32 paddings[7]; ++ } in; ++ struct { ++ __u64 sec; ++ __u32 nsec; ++ __u32 padding; ++ __u64 timestamp; ++ __u64 cycle_counter; ++ } out; ++}; ++ ++#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ ++ _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) ++ ++/** ++ * struct kbase_ioctl_context_priority_check - Check the max possible priority ++ * @priority: Input priority & output priority ++ */ ++ ++struct kbase_ioctl_context_priority_check { ++ __u8 priority; ++}; ++ ++#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ ++ _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) ++ ++/** ++ * struct kbase_ioctl_set_limited_core_count - Set the limited core count. ++ * ++ * @max_core_count: Maximum core count ++ */ ++struct kbase_ioctl_set_limited_core_count { ++ __u8 max_core_count; ++}; ++ ++#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ ++ _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) ++ ++/** ++ * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter ++ * information ++ * @info_item_size: Performance counter item size in bytes. ++ * @info_item_count: Performance counter item count in the info_list_ptr. 
++ * @info_list_ptr: Performance counter item list pointer which points to a ++ * list with info_item_count of items. ++ * ++ * On success: returns info_item_size and info_item_count if info_list_ptr is ++ * NULL, returns performance counter information if info_list_ptr is not NULL. ++ * On error: returns a negative error code. ++ */ ++struct kbase_ioctl_kinstr_prfcnt_enum_info { ++ __u32 info_item_size; ++ __u32 info_item_count; ++ __u64 info_list_ptr; ++}; ++ ++#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ ++ _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) ++ ++/** ++ * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader ++ * @in: input parameters. ++ * @in.request_item_count: Number of requests in the requests array. ++ * @in.request_item_size: Size in bytes of each request in the requests array. ++ * @in.requests_ptr: Pointer to the requests array. ++ * @out: output parameters. ++ * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for ++ * each sample. ++ * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap ++ * for reading performance counter samples. ++ * ++ * A fd is returned from the ioctl if successful, or a negative value on error. ++ */ ++union kbase_ioctl_kinstr_prfcnt_setup { ++ struct { ++ __u32 request_item_count; ++ __u32 request_item_size; ++ __u64 requests_ptr; ++ } in; ++ struct { ++ __u32 prfcnt_metadata_item_size; ++ __u32 prfcnt_mmap_size_bytes; ++ } out; ++}; ++ ++#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ ++ _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) ++ ++/*************** ++ * test ioctls * ++ ***************/ ++#if MALI_UNIT_TEST ++/* These ioctls are purely for test purposes and are not used in the production ++ * driver, they therefore may change without notice ++ */ ++ ++#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) ++ ++ ++/** ++ * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes ++ * @bytes_collected: number of bytes read by user ++ * @bytes_generated: number of bytes generated by tracepoints ++ */ ++struct kbase_ioctl_tlstream_stats { ++ __u32 bytes_collected; ++ __u32 bytes_generated; ++}; ++ ++#define KBASE_IOCTL_TLSTREAM_STATS \ ++ _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) ++ ++#endif /* MALI_UNIT_TEST */ ++ ++/* Customer extension range */ ++#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) ++ ++/* If the integration needs extra ioctl add them there ++ * like this: ++ * ++ * struct my_ioctl_args { ++ * .... 
++ * } ++ * ++ * #define KBASE_IOCTL_MY_IOCTL \ ++ * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) ++ */ ++ ++ ++/********************************** ++ * Definitions for GPU properties * ++ **********************************/ ++#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) ++#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) ++#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) ++#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) ++ ++#define KBASE_GPUPROP_PRODUCT_ID 1 ++#define KBASE_GPUPROP_VERSION_STATUS 2 ++#define KBASE_GPUPROP_MINOR_REVISION 3 ++#define KBASE_GPUPROP_MAJOR_REVISION 4 ++/* 5 previously used for GPU speed */ ++#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 ++/* 7 previously used for minimum GPU speed */ ++#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 ++#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 ++ ++#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 ++#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 ++#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 ++ ++#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 ++#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 ++ ++#define KBASE_GPUPROP_MAX_THREADS 18 ++#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 ++#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 ++#define KBASE_GPUPROP_MAX_REGISTERS 21 ++#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 ++#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 ++#define KBASE_GPUPROP_IMPL_TECH 24 ++ ++#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 ++#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 ++#define KBASE_GPUPROP_RAW_L2_PRESENT 27 ++#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 ++#define KBASE_GPUPROP_RAW_L2_FEATURES 29 ++#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 ++#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 ++#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 ++#define KBASE_GPUPROP_RAW_AS_PRESENT 33 ++#define KBASE_GPUPROP_RAW_JS_PRESENT 34 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 ++#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 ++#define KBASE_GPUPROP_RAW_GPU_ID 55 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 ++#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 ++#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 ++ ++#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 ++#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 ++#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 ++#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 ++#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 ++#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 ++#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 ++#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 ++#define 
KBASE_GPUPROP_COHERENCY_GROUP_5 69 ++#define KBASE_GPUPROP_COHERENCY_GROUP_6 70 ++#define KBASE_GPUPROP_COHERENCY_GROUP_7 71 ++#define KBASE_GPUPROP_COHERENCY_GROUP_8 72 ++#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 ++#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 ++#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 ++#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 ++#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 ++#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 ++#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 ++ ++#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 ++ ++#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 ++ ++#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 ++#define KBASE_GPUPROP_TLS_ALLOC 84 ++#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _UAPI_KBASE_IOCTL_H_ */ +diff --git a/src/panfrost/csf_test/test.c b/src/panfrost/csf_test/test.c +new file mode 100644 +index 00000000000..cb9ff398314 +--- /dev/null ++++ b/src/panfrost/csf_test/test.c +@@ -0,0 +1,1903 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE.
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "util/macros.h" ++ ++#include "mali_kbase_csf_ioctl.h" ++#include "mali_kbase_ioctl.h" ++#include "mali_base_kernel.h" ++#include "mali_base_csf_kernel.h" ++#include "mali_gpu_csf_registers.h" ++ ++#define PAN_ARCH 10 ++#include "genxml/gen_macros.h" ++ ++#include "wrap.h" ++#include "decode.h" ++ ++#include "pan_shader.h" ++#include "compiler/nir/nir_builder.h" ++#include "bifrost/valhall/disassemble.h" ++ ++#define CS_EVENT_REGISTER 0x5A ++ ++static bool pr = true; ++static bool colour_term = true; ++ ++static void ++dump_start(FILE *f) ++{ ++ if (colour_term) ++ fprintf(f, "\x1b[90m"); ++} ++ ++static void ++dump_end(FILE *f) ++{ ++ if (colour_term) ++ fprintf(f, "\x1b[39m"); ++} ++ ++/* TODO: Use KBASE_IOCTL_MEM_SYNC for 32-bit systems */ ++static void ++cache_clean(volatile void *addr) ++{ ++#ifdef __aarch64__ ++ __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); ++#endif ++} ++ ++static void ++cache_invalidate(volatile void *addr) ++{ ++#ifdef __aarch64__ ++ __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); ++#endif ++} ++ ++static void ++cache_barrier(void) ++{ ++#ifdef __ARM_ARCH ++ __asm__ volatile ("dsb sy" ::: "memory"); ++#endif ++} ++ ++static void ++memory_barrier(void) ++{ ++#ifdef __ARM_ARCH ++ __asm__ volatile ("dmb sy" ::: "memory"); ++#endif ++} ++ ++typedef void (*cacheline_op)(volatile void *addr); ++ ++#define CACHELINE_SIZE 64 ++ ++static void ++cacheline_op_range(volatile void *start, unsigned length, cacheline_op op) ++{ ++ volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); ++ volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); ++ for (; ptr < end; ptr += CACHELINE_SIZE) ++ op(ptr); ++} ++ ++static void ++cache_clean_range(volatile void *start, unsigned length) ++{ ++ cacheline_op_range(start, length, cache_clean); ++} ++ ++static void ++cache_invalidate_range(volatile void *start, unsigned length) ++{ ++ cacheline_op_range(start, length, cache_invalidate); ++} ++ ++struct state; ++struct test; ++ ++typedef bool (* section)(struct state *s, struct test *t); ++ ++#define CS_QUEUE_COUNT 4 /* compute / vertex / fragment / other */ ++#define CS_QUEUE_SIZE 65536 ++ ++struct state { ++ int page_size; ++ int argc; ++ char **argv; ++ ++ int mali_fd; ++ int tl_fd; ++ void *tracking_region; ++ void *csf_user_reg; ++ ++ uint8_t *gpuprops; ++ unsigned gpuprops_size; ++ uint32_t gpu_id; ++ ++ struct { ++ struct panfrost_ptr normal, exec, coherent, cached, event, ev2; ++ } allocations; ++ ++ uint64_t tiler_heap_va; ++ uint64_t tiler_heap_header; ++ ++ uint8_t csg_handle; ++ uint32_t csg_uid; ++ ++ struct panfrost_ptr cs_mem[CS_QUEUE_COUNT]; ++ void *cs_user_io[CS_QUEUE_COUNT]; ++ unsigned cs_last_submit[CS_QUEUE_COUNT]; ++ struct pan_command_stream cs[CS_QUEUE_COUNT]; ++ ++ unsigned shader_alloc_offset; ++ mali_ptr compute_shader; ++}; ++ ++struct test { ++ section part; ++ section cleanup; ++ const char *label; ++ ++ struct test *subtests; ++ unsigned sub_length; ++ ++ /* for allocation tests */ ++ unsigned offset; ++ unsigned flags; ++ ++ bool add; ++ bool invalid; ++ bool blit; ++ bool vertex; ++}; ++ ++/* See STATE and ALLOC macros below */ ++#define DEREF_STATE(s, offset) ((void*) s + offset) ++ ++static uint64_t ++pan_get_gpuprop(struct state *s, int name) ++{ ++ int i = 0; ++ uint64_t x = 0; ++ while (i < s->gpuprops_size) { ++ 
x = 0; ++ memcpy(&x, s->gpuprops + i, 4); ++ i += 4; ++ ++ int size = 1 << (x & 3); ++ int this_name = x >> 2; ++ ++ x = 0; ++ memcpy(&x, s->gpuprops + i, size); ++ i += size; ++ ++ if (this_name == name) ++ return x; ++ } ++ ++ fprintf(stderr, "Unknown prop %i\n", name); ++ return 0; ++} ++ ++static bool ++open_kbase(struct state *s, struct test *t) ++{ ++ s->mali_fd = open("/dev/mali0", O_RDWR); ++ if (s->mali_fd != -1) ++ return true; ++ ++ perror("open(\"/dev/mali0\")"); ++ return false; ++} ++ ++static bool ++close_kbase(struct state *s, struct test *t) ++{ ++ if (getenv("TEST_CHECK_LEAKS")) { ++ int pid = getpid(); ++ char cmd_buffer[64] = {0}; ++ sprintf(cmd_buffer, "grep /dev/mali /proc/%i/maps", pid); ++ system(cmd_buffer); ++ sprintf(cmd_buffer, "ls -l /proc/%i/fd", pid); ++ system(cmd_buffer); ++ } ++ ++ if (s->mali_fd > 0) ++ return close(s->mali_fd) == 0; ++ return true; ++} ++ ++static bool ++get_version(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_version_check ver = { 0 }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_VERSION_CHECK, &ver); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_VERSION_CHECK)"); ++ return false; ++ } ++ ++ if (pr) ++ printf("Major %i Minor %i: ", ver.major, ver.minor); ++ return true; ++} ++ ++static bool ++set_flags(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_set_flags flags = { ++ .create_flags = 0 ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_SET_FLAGS, &flags); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++mmap_tracking(struct state *s, struct test *t) ++{ ++ s->tracking_region = mmap(NULL, s->page_size, PROT_NONE, ++ MAP_SHARED, s->mali_fd, ++ BASE_MEM_MAP_TRACKING_HANDLE); ++ ++ if (s->tracking_region == MAP_FAILED) { ++ perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); ++ s->tracking_region = NULL; ++ return false; ++ } ++ return true; ++} ++ ++static bool ++munmap_tracking(struct state *s, struct test *t) ++{ ++ if (s->tracking_region) ++ return munmap(s->tracking_region, s->page_size) == 0; ++ return true; ++} ++ ++static bool ++get_gpuprops(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_get_gpuprops props = { 0 }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); ++ return false; ++ } else if (!ret) { ++ fprintf(stderr, "GET_GPUPROPS returned zero size\n"); ++ return false; ++ } ++ ++ s->gpuprops_size = ret; ++ s->gpuprops = calloc(s->gpuprops_size, 1); ++ ++ props.size = s->gpuprops_size; ++ props.buffer = (uint64_t)(uintptr_t) s->gpuprops; ++ ++ ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++free_gpuprops(struct state *s, struct test *t) ++{ ++ free(s->gpuprops); ++ return true; ++} ++ ++static bool ++get_gpu_id(struct state *s, struct test *t) ++{ ++ uint64_t gpu_id = pan_get_gpuprop(s, KBASE_GPUPROP_PRODUCT_ID); ++ if (!gpu_id) ++ return false; ++ s->gpu_id = gpu_id; ++ ++ uint16_t maj = gpu_id >> 12; ++ uint16_t min = (gpu_id >> 8) & 0xf; ++ uint16_t rev = (gpu_id >> 4) & 0xf; ++ ++ uint16_t product = gpu_id & 0xf; ++ uint16_t prod = product | ((maj & 1) << 4); ++ ++ const char *names[] = { ++ [1] = "TDUX", ++ [2] = "G710", ++ [3] = "G510", ++ [4] = "G310", ++ [7] = "G610", ++ [16 + 2] = "G715", /* TODO: Immortalis instead of Mali? 
*/ ++ [16 + 3] = "G615", ++ }; ++ const char *name = (prod < ARRAY_SIZE(names)) ? names[prod] : NULL; ++ if (!name) ++ name = "unknown"; ++ ++ if (pr) ++ printf("v%i.%i.%i Mali-%s (%i): ", maj, min, rev, name, product); ++ ++ if (maj < 10) { ++ printf("not v10 or later: "); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++get_coherency_mode(struct state *s, struct test *t) ++{ ++ uint64_t mode = pan_get_gpuprop(s, KBASE_GPUPROP_RAW_COHERENCY_MODE); ++ ++ const char *modes[] = { ++ [0] = "ACE-Lite", ++ [1] = "ACE", ++ [31] = "None", ++ }; ++ const char *name = (mode < ARRAY_SIZE(modes)) ? modes[mode] : NULL; ++ if (!name) ++ name = "Unknown"; ++ ++ if (pr) ++ printf("0x%"PRIx64" (%s): ", mode, name); ++ return true; ++} ++ ++static bool ++get_csf_caps(struct state *s, struct test *t) ++{ ++ union kbase_ioctl_cs_get_glb_iface iface = { 0 }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(0))"); ++ return false; ++ } ++ ++ int ver_maj = iface.out.glb_version >> 24; ++ int ver_min = (iface.out.glb_version >> 16) & 0xff; ++ int ver_rev = iface.out.glb_version & 0xffff; ++ ++ if (pr) ++ printf("v%i.%i.%i: feature mask 0x%x, %i groups, %i total: ", ++ ver_maj, ver_min, ver_rev, iface.out.features, ++ iface.out.group_num, iface.out.total_stream_num); ++ ++ unsigned group_num = iface.out.group_num; ++ unsigned stream_num = iface.out.total_stream_num; ++ ++ struct basep_cs_group_control *group_data = ++ calloc(group_num, sizeof(*group_data)); ++ ++ struct basep_cs_stream_control *stream_data = ++ calloc(stream_num, sizeof(*stream_data)); ++ ++ iface = (union kbase_ioctl_cs_get_glb_iface) { ++ .in = { ++ .max_group_num = group_num, ++ .max_total_stream_num = stream_num, ++ .groups_ptr = (uintptr_t) group_data, ++ .streams_ptr = (uintptr_t) stream_data, ++ } ++ }; ++ ++ ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(size))"); ++ ++ free(group_data); ++ free(stream_data); ++ ++ return false; ++ } ++ ++ unsigned print_groups = pr ? group_num : 0; ++ unsigned print_streams = pr ? 
stream_num : 0; ++ ++ for (unsigned i = 0; i < print_groups; ++i) { ++ if (i && !memcmp(group_data + i, group_data + i - 1, sizeof(*group_data))) ++ continue; ++ ++ fprintf(stderr, "Group %i-: feature mask 0x%x, %i streams\n", ++ i, group_data[i].features, group_data[i].stream_num); ++ } ++ ++ for (unsigned i = 0; i < print_streams; ++i) { ++ if (i && !memcmp(stream_data + i, stream_data + i - 1, sizeof(*stream_data))) ++ continue; ++ ++ unsigned reg = stream_data[i].features & 0xff; ++ unsigned score = (stream_data[i].features >> 8) & 0xff; ++ unsigned feat = stream_data[i].features >> 16; ++ ++ fprintf(stderr, "Stream %i-: 0x%x work registers, %i scoreboards, iterator mask: 0x%x\n", ++ i, reg, score, feat); ++ } ++ ++ free(group_data); ++ free(stream_data); ++ ++ return true; ++} ++ ++static bool ++mmap_user_reg(struct state *s, struct test *t) ++{ ++ s->csf_user_reg = mmap(NULL, s->page_size, PROT_READ, ++ MAP_SHARED, s->mali_fd, ++ BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); ++ ++ if (s->csf_user_reg == MAP_FAILED) { ++ perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); ++ s->csf_user_reg = NULL; ++ return false; ++ } ++ return true; ++} ++ ++static bool ++munmap_user_reg(struct state *s, struct test *t) ++{ ++ if (s->csf_user_reg) ++ return munmap(s->csf_user_reg, s->page_size) == 0; ++ return true; ++} ++ ++static bool ++init_mem_exec(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_mem_exec_init init = { ++ .va_pages = 0x100000, ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++init_mem_jit(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_mem_jit_init init = { ++ .va_pages = 1 << 25, ++ .max_allocations = 255, ++ .phys_pages = 1 << 25, ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_JIT_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++stream_create(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_stream_create stream = { ++ .name = "stream" ++ }; ++ ++ s->tl_fd = ioctl(s->mali_fd, KBASE_IOCTL_STREAM_CREATE, &stream); ++ ++ if (s->tl_fd == -1) { ++ perror("ioctl(KBASE_IOCTL_STREAM_CREATE)"); ++ return false; ++ } ++ return true; ++ ++} ++ ++static bool ++stream_destroy(struct state *s, struct test *t) ++{ ++ if (s->tl_fd > 0) ++ return close(s->tl_fd) == 0; ++ return true; ++} ++ ++static bool ++tiler_heap_create(struct state *s, struct test *t) ++{ ++ union kbase_ioctl_cs_tiler_heap_init init = { ++ .in = { ++ .chunk_size = 1 << 21, ++ .initial_chunks = 5, ++ .max_chunks = 200, ++ .target_in_flight = 65535, ++ } ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); ++ return false; ++ } ++ ++ s->tiler_heap_va = init.out.gpu_heap_va; ++ s->tiler_heap_header = init.out.first_chunk_va; ++ printf("heap va: %"PRIx64", heap header: %"PRIx64"\n", ++ s->tiler_heap_va, s->tiler_heap_header); ++ ++ return true; ++} ++ ++static bool ++tiler_heap_term(struct state *s, struct test *t) ++{ ++ if (!s->tiler_heap_va) ++ return true; ++ ++ struct kbase_ioctl_cs_tiler_heap_term term = { ++ .gpu_heap_va = s->tiler_heap_va ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); ++ return false; ++ } ++ return true; ++} ++ 
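
The helpers above walk through kbase CSF bring-up in the order a userspace client needs it: open /dev/mali0, handshake with KBASE_IOCTL_VERSION_CHECK, set the context creation flags, map the tracking page, fetch and decode the GPU property blob, map the CSF user-register page, initialise the EXEC_VA and JIT zones, create a timeline stream, and finally create a chunked tiler heap. A minimal bring-up sketch chaining a representative subset of those helpers, in the order they are defined above; the csf_bringup() wrapper itself is hypothetical, and the unused struct test argument is simply passed as NULL:

/* Hypothetical wrapper: chain a subset of the bring-up helpers above.
 * Each of these helpers ignores its struct test argument, so NULL is
 * passed; a real caller would also run the matching munmap_ and _term
 * helpers on teardown. */
static bool
csf_bringup(struct state *s)
{
        static const section steps[] = {
                open_kbase, get_version, set_flags, mmap_tracking,
                get_gpuprops, get_gpu_id, mmap_user_reg,
                init_mem_exec, init_mem_jit, stream_create,
                tiler_heap_create,
        };

        for (unsigned i = 0; i < ARRAY_SIZE(steps); ++i) {
                if (!steps[i](s, NULL))
                        return false;
        }
        return true;
}
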
++static bool ++cs_group_create(struct state *s, struct test *t) ++{ ++ union kbase_ioctl_cs_queue_group_create_1_6 create = { ++ .in = { ++ /* Mali *still* only supports a single tiler unit */ ++ .tiler_mask = 1, ++ .fragment_mask = ~0ULL, ++ .compute_mask = ~0ULL, ++ ++ .cs_min = CS_QUEUE_COUNT, ++ ++ .priority = 1, ++ .tiler_max = 1, ++ .fragment_max = 64, ++ .compute_max = 64, ++ } ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); ++ return false; ++ } ++ ++ s->csg_handle = create.out.group_handle; ++ s->csg_uid = create.out.group_uid; ++ ++ if (pr) ++ printf("CSG handle: %i UID: %i: ", s->csg_handle, s->csg_uid); ++ ++ /* Should be at least 1 */ ++ if (!s->csg_uid) ++ abort(); ++ ++ return true; ++} ++ ++static bool ++cs_group_term(struct state *s, struct test *t) ++{ ++ if (!s->csg_uid) ++ return true; ++ ++ struct kbase_ioctl_cs_queue_group_term term = { ++ .group_handle = s->csg_handle ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++decode_init(struct state *s, struct test *t) ++{ ++ pandecode_initialize(true); ++ return true; ++} ++ ++static bool ++decode_close(struct state *s, struct test *t) ++{ ++ pandecode_close(); ++ return true; ++} ++ ++static struct panfrost_ptr ++alloc_ioctl(struct state *s, union kbase_ioctl_mem_alloc *a) ++{ ++ struct panfrost_ptr p = {0}; ++ ++ uint64_t va_pages = a->in.va_pages; ++ uint64_t flags = a->in.flags; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_ALLOC, a); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); ++ return p; ++ } ++ ++ if ((flags & BASE_MEM_SAME_VA) && ++ (!(a->out.flags & BASE_MEM_SAME_VA) || ++ a->out.gpu_va != 0x41000)) { ++ ++ fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", ++ (uint64_t) a->out.flags, (uint64_t) a->out.gpu_va); ++ return p; ++ } ++ ++ void *ptr = mmap(NULL, s->page_size * va_pages, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ s->mali_fd, a->out.gpu_va); ++ ++ if (ptr == MAP_FAILED) { ++ perror("mmap(GPU BO)"); ++ return p; ++ } ++ ++ uint64_t gpu_va = (a->out.flags & BASE_MEM_SAME_VA) ? 
++ (uintptr_t) ptr : a->out.gpu_va; ++ ++ pandecode_inject_mmap(gpu_va, ptr, s->page_size * va_pages, NULL); ++ ++ p.cpu = ptr; ++ p.gpu = gpu_va; ++ ++ memset(p.cpu, 0, s->page_size * va_pages); ++ ++ return p; ++} ++ ++static struct panfrost_ptr ++alloc_mem(struct state *s, uint64_t size, uint64_t flags) ++{ ++ unsigned pages = size / s->page_size; ++ ++ union kbase_ioctl_mem_alloc a = { ++ .in = { ++ .va_pages = pages, ++ .commit_pages = pages, ++ .extension = 0, ++ .flags = flags, ++ } ++ }; ++ ++ return alloc_ioctl(s, &a); ++} ++ ++static void ++alloc_redzone(struct state *s, struct panfrost_ptr p, uint64_t alloc_size) ++{ ++ mmap(p.cpu - s->page_size, 1, ++ PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, ++ -1, 0); ++ ++ mmap(p.cpu + alloc_size, 1, ++ PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, ++ -1, 0); ++} ++ ++static bool ++alloc(struct state *s, struct test *t) ++{ ++ struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset); ++ ++ *ptr = alloc_mem(s, s->page_size, t->flags); ++ ++ volatile int *p = (volatile int *) ptr->cpu; ++ *p = 0x12345; ++ if (*p != 0x12345) { ++ printf("Error reading from allocated memory at %p\n", p); ++ return false; ++ } ++ *p = 0; ++ cache_clean(p); ++ ++ return true; ++} ++ ++static bool ++dealloc(struct state *s, struct test *t) ++{ ++ struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset); ++ ++ if (ptr->cpu) ++ return munmap(ptr->cpu, s->page_size) == 0; ++ return true; ++} ++ ++static bool ++cs_queue_create(struct state *s, struct test *t) ++{ ++ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { ++ ++ /* Read/write from CPU/GPU, nothing special ++ * like coherency */ ++ s->cs_mem[i] = alloc_mem(s, CS_QUEUE_SIZE, 0x200f); ++ s->cs[i].ptr = s->cs_mem[i].cpu; ++ ++ if (!s->cs_mem[i].cpu) ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++cs_queue_free(struct state *s, struct test *t) ++{ ++ bool pass = true; ++ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { ++ if (s->cs_mem[i].cpu && munmap(s->cs_mem[i].cpu, CS_QUEUE_SIZE)) ++ pass = false; ++ } ++ return pass; ++} ++ ++static bool ++cs_queue_register(struct state *s, struct test *t) ++{ ++ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { ++ struct kbase_ioctl_cs_queue_register reg = { ++ .buffer_gpu_addr = s->cs_mem[i].gpu, ++ .buffer_size = CS_QUEUE_SIZE, ++ .priority = 1, ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_REGISTER, ®); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)"); ++ return false; ++ } ++ ++ union kbase_ioctl_cs_queue_bind bind = { ++ .in = { ++ .buffer_gpu_addr = s->cs_mem[i].gpu, ++ .group_handle = s->csg_handle, ++ .csi_index = i, ++ } ++ }; ++ ++ ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)"); ++ } ++ ++ s->cs_user_io[i] = ++ mmap(NULL, ++ s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ s->mali_fd, bind.out.mmap_handle); ++ ++ if (s->cs_user_io[i] == MAP_FAILED) { ++ perror("mmap(CS USER IO)"); ++ s->cs_user_io[i] = NULL; ++ return false; ++ } ++ } ++ return true; ++} ++ ++static bool ++cs_queue_term(struct state *s, struct test *t) ++{ ++ bool pass = true; ++ ++ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { ++ if (s->cs_user_io[i] && ++ munmap(s->cs_user_io[i], ++ s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES)) ++ pass = false; ++ ++ struct kbase_ioctl_cs_queue_terminate term = { ++ .buffer_gpu_addr = s->cs_mem[i].gpu, ++ }; ++ ++ int ret = ioctl(s->mali_fd, 
KBASE_IOCTL_CS_QUEUE_TERMINATE, ++ &term); ++ ++ if (ret == -1) ++ pass = false; ++ } ++ return pass; ++} ++ ++#define CS_RING_DOORBELL(s, i) \ ++ *((uint32_t *)(s->cs_user_io[i])) = 1 ++ ++#define CS_READ_REGISTER(s, i, r) \ ++ *((uint64_t *)(s->cs_user_io[i] + s->page_size * 2 + r)) ++ ++#define CS_WRITE_REGISTER(s, i, r, v) \ ++ *((uint64_t *)(s->cs_user_io[i] + s->page_size + r)) = v ++ ++static void ++submit_cs(struct state *s, unsigned i) ++{ ++ uintptr_t p = (uintptr_t) s->cs[i].ptr; ++ unsigned pad = (-p) & 63; ++ memset(s->cs[i].ptr, 0, pad); ++ ++ unsigned last_offset = s->cs_last_submit[i]; ++ ++ unsigned insert_offset = p + pad - (uintptr_t) s->cs_mem[i].cpu; ++ insert_offset %= CS_QUEUE_SIZE; ++ ++ for (unsigned o = last_offset; o != insert_offset; ++ o = (o + 64) % CS_QUEUE_SIZE) ++ cache_clean(s->cs_mem[i].cpu + o); ++ ++ // TODO: Handle wraparound ++ // TODO: Provide a persistent buffer for pandecode to use? ++ if (pr) { ++ dump_start(stderr); ++ pandecode_cs(s->cs_mem[i].gpu + last_offset, ++ insert_offset - last_offset, s->gpu_id); ++ dump_end(stderr); ++ } ++ ++ cache_barrier(); ++ ++ CS_WRITE_REGISTER(s, i, CS_INSERT, insert_offset); ++ s->cs[i].ptr = s->cs_mem[i].cpu + insert_offset; ++ ++ memory_barrier(); ++ CS_RING_DOORBELL(s, i); ++ memory_barrier(); ++ ++ s->cs_last_submit[i] = insert_offset; ++} ++ ++/* Returns true if there was a timeout */ ++static bool ++wait_event(struct state *s, unsigned timeout_ms) ++{ ++ struct pollfd fd = { ++ .fd = s->mali_fd, ++ .events = POLLIN, ++ }; ++ ++ int ret = poll(&fd, 1, timeout_ms); ++ ++ if (ret == -1) { ++ perror("poll(mali_fd)"); ++ return true; ++ } ++ ++ /* Timeout */ ++ if (ret == 0) ++ return true; ++ ++ struct base_csf_notification event; ++ ret = read(s->mali_fd, &event, sizeof(event)); ++ ++ if (ret == -1) { ++ perror("read(mali_fd)"); ++ return true; ++ } ++ ++ if (ret != sizeof(event)) { ++ fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", ++ ret, (int) sizeof(event)); ++ return false; ++ } ++ ++ switch (event.type) { ++ case BASE_CSF_NOTIFICATION_EVENT: ++ fprintf(stderr, "Notification event!\n"); ++ return false; ++ ++ case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: ++ break; ++ ++ case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: ++ fprintf(stderr, "No event from mali_fd!\n"); ++ return false; ++ ++ default: ++ fprintf(stderr, "Unknown event type!\n"); ++ return false; ++ } ++ ++ struct base_gpu_queue_group_error e = event.payload.csg_error.error; ++ ++ switch (e.error_type) { ++ case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { ++ // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h ++ fprintf(stderr, "Queue group error: status 0x%x " ++ "sideband 0x%"PRIx64"\n", ++ e.payload.fatal_group.status, ++ (uint64_t) e.payload.fatal_group.sideband); ++ break; ++ } ++ case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { ++ unsigned queue = e.payload.fatal_queue.csi_index; ++ ++ // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h ++ fprintf(stderr, "Queue %i error: status 0x%x " ++ "sideband 0x%"PRIx64":", ++ queue, e.payload.fatal_queue.status, ++ (uint64_t) e.payload.fatal_queue.sideband); ++ ++ unsigned e = CS_READ_REGISTER(s, queue, CS_EXTRACT); ++ pandecode_cs(s->cs_mem[queue].gpu + e, 8, s->gpu_id); ++ ++ break; ++ } ++ ++ case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: ++ fprintf(stderr, "Command stream timeout!\n"); ++ break; ++ case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: ++ fprintf(stderr, "Command stream OOM!\n"); ++ break; ++ default: ++ fprintf(stderr, "Unknown error type!\n"); ++ } ++ ++ return false; ++} ++ 
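
The CS_* macros above spell out the layout of the per-queue user I/O mapping obtained from KBASE_IOCTL_CS_QUEUE_BIND: the first mapped page is the doorbell page, the second holds the CPU-written input registers (CS_INSERT), and the third holds the GPU-updated output registers (CS_EXTRACT, CS_ACTIVE). submit_cs() pads the write pointer to a 64-byte boundary, cleans the dirty cache lines, publishes the new CS_INSERT value and rings the doorbell; wait_event() then blocks on the kbase fd for a base_csf_notification. As an illustrative sketch of how a single register write goes through queue 0: the run_register_write() wrapper is hypothetical, but everything it calls is defined in this file (some of it just below).

/* Hypothetical wrapper: write one value into CS register 0x48 on queue 0
 * (the same operation cs_simple() performs below), then submit, kick and
 * wait for CS_EXTRACT to catch up with CS_INSERT. */
static bool
run_register_write(struct state *s)
{
        pan_command_stream *c = s->cs;     /* queue 0 */

        pan_emit_cs_32(c, 0x48, 0x1234);   /* load 0x1234 into CS register 0x48 */
        pan_cs_evadd(c, 0, 1);             /* EVADD on the event page set up by cs_init() */

        submit_cs(s, 0);                   /* clean cache, bump CS_INSERT, ring doorbell */
        if (!kick_queue(s, 0))             /* KBASE_IOCTL_CS_QUEUE_KICK */
                return false;

        return wait_cs(s, 0);              /* poll until CS_EXTRACT reaches CS_INSERT */
}
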
++static bool ++kick_queue(struct state *s, unsigned i) ++{ ++ struct kbase_ioctl_cs_queue_kick kick = { ++ .buffer_gpu_addr = s->cs_mem[i].gpu ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++wait_cs(struct state *s, unsigned i) ++{ ++ unsigned extract_offset = (void *) s->cs[i].ptr - s->cs_mem[i].cpu; ++ ++ unsigned timeout_ms = 500; ++ ++ bool done_kick = false; ++ ++ while (CS_READ_REGISTER(s, i, CS_EXTRACT) != extract_offset) { ++ if (wait_event(s, timeout_ms)) { ++ if (pr) ++ fprintf(stderr, "Event wait timeout!\n"); ++ ++ unsigned e = CS_READ_REGISTER(s, i, CS_EXTRACT); ++ unsigned a = CS_READ_REGISTER(s, i, CS_ACTIVE); ++ ++ if (e != extract_offset) { ++ fprintf(stderr, "CS_EXTRACT (%i) != %i, " ++ "CS_ACTIVE (%i) on queue %i:", ++ e, extract_offset, a, i); ++ /* Decode two instructions instead? */ ++ pandecode_cs(s->cs_mem[i].gpu + e, 8, 1); ++ ++ if (done_kick) { ++ cache_barrier(); ++ return false; ++ } else { ++ fprintf(stderr, "Kicking queue\n"); ++ kick_queue(s, i); ++ done_kick = true; ++ } ++ } ++ } ++ } ++ ++ cache_barrier(); ++ ++ return true; ++} ++ ++static bool ++cs_init(struct state *s, struct test *t) ++{ ++ uint64_t event_init[] = { 1, 1, 1 }; ++ memcpy(s->allocations.event.cpu, event_init, sizeof(event_init)); ++ ++ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { ++ CS_WRITE_REGISTER(s, i, CS_INSERT, 0); ++ pan_pack_ins(s->cs + i, CS_RESOURCES, cfg) { ++ switch (i) { ++ case 0: cfg.compute = true; break; ++ case 1: cfg.compute = true; cfg.fragment = true; break; ++ case 2: cfg.compute = true; cfg.tiler = true; cfg.idvs = true; break; ++ case 3: cfg.fragment = true; break; ++ } ++ } ++ pan_pack_ins(s->cs + i, CS_SLOT, cfg) { ++ cfg.index = 2; ++ } ++ pan_emit_cs_48(s->cs + i, CS_EVENT_REGISTER, ++ s->allocations.event.gpu); ++ submit_cs(s, i); ++ ++ if (!kick_queue(s, i)) ++ return false; ++ } ++ ++ return true; ++} ++ ++static struct panfrost_ptr * ++buffers_elem(struct util_dynarray *buffers, unsigned index) ++{ ++ unsigned size = util_dynarray_num_elements(buffers, ++ struct panfrost_ptr); ++ ++ if (index >= size) { ++ unsigned grow = index + 1 - size; ++ ++ memset(util_dynarray_grow(buffers, struct panfrost_ptr, grow), ++ 0, grow * sizeof(struct panfrost_ptr)); ++ } ++ ++ return util_dynarray_element(buffers, struct panfrost_ptr, index); ++} ++ ++static void ++dump_hex64(FILE *fp, uint64_t *values, unsigned size) ++{ ++ bool zero = false; ++ for (unsigned i = 0; i < size / 8; i += 2) { ++ uint64_t a = values[i]; ++ uint64_t b = values[i + 1]; ++ ++ if (!a && !b) { ++ if (!zero) ++ fprintf(fp, "%06X *\n", i * 8); ++ zero = true; ++ continue; ++ } ++ ++ zero = false; ++ ++ fprintf(fp, "%06X %16"PRIx64" %16"PRIx64"\n", ++ i * 8, a, b); ++ } ++ ++ fprintf(fp, "\n"); ++} ++ ++static void ++dump_delta(FILE *fp, uint64_t *values, unsigned size) ++{ ++ uint64_t old = 0; ++ bool zero = false; ++ bool el = false; ++ for (unsigned i = 0; i < size / 8; ++i) { ++ uint64_t val = values[i]; ++ int64_t delta = val - old; ++ ++ if (!zero || delta) { ++ fprintf(fp, "%"PRIi64"\n", delta); ++ el = false; ++ } else if (!el) { ++ fprintf(fp, "...\n"); ++ el = true; ++ } ++ ++ old = val; ++ zero = (delta == 0); ++ } ++} ++ ++static void ++dump_tiler(FILE *fp, uint8_t *values, unsigned size) ++{ ++ fflush(stdout); ++ FILE *stream = popen("tiler-hex-read", "w"); ++ // TODO! 
++ fprintf(stream, "width %i\nheight %i\nmask %i\nvaheap %p\nsize %i\n", ++ 256, 256, 6, values, size); ++ pan_hexdump(stream, values, size, false); ++ pclose(stream); ++} ++ ++/* TODO: Pass in a filename? */ ++static void ++dump_filehex(uint8_t *values, unsigned size) ++{ ++ char buf[1024] = {0}; ++ ++ for (unsigned i = 0; i < 10000; ++i) { ++ snprintf(buf, 1024, "/tmp/fdump.%05i", i); ++ ++ int fd = open(buf, O_WRONLY | O_CREAT | O_EXCL, 0666); ++ if (fd == -1) ++ continue; ++ ++ FILE *fp = fdopen(fd, "w"); ++ ++ fprintf(fp, "%p, %u:\n", values, size); ++ pan_hexdump(fp, values, size, false); ++ ++ fclose(fp); /* will close fd */ ++ break; ++ } ++} ++ ++static void ++dump_heatmap(FILE *fp, uint8_t *values, unsigned size, ++ unsigned gran, unsigned length, unsigned stride) ++{ ++ unsigned sum = 0; ++ unsigned gr = 0; ++ unsigned st = 0; ++ unsigned ll = 0; ++ ++ while (size && !values[size - 1]) ++ --size; ++ ++ for (unsigned i = 0; i < size; ++i) { ++ sum += values[i]; ++ ++ if (++gr == gran) { ++ fprintf(fp, " %02x", sum & 0xff); ++ gr = 0; ++ sum = 0; ++ } ++ ++ if (++ll == length) { ++ i += stride - length; ++ fprintf(fp, "\n"); ++ st = 0; ++ ll = 0; ++ } else if (++st == stride) { ++ fprintf(fp, "\n"); ++ st = 0; ++ } ++ } ++ fprintf(fp, " %02x\n", sum & 0xff); ++} ++ ++static bool ++cs_test(struct state *s, struct test *t) ++{ ++ if (s->argc < 2) ++ return true; ++ ++ FILE *f = fopen(s->argv[1], "r"); ++ ++ struct util_dynarray buffers; ++ util_dynarray_init(&buffers, NULL); ++ ++ for (;;) { ++ char *line = NULL; ++ size_t sz = 0; ++ if (getline(&line, &sz, f) == -1) ++ break; ++ ++ unsigned long src, dst, offset, src_offset, size, iter, flags; ++ unsigned long gran, stride, length; ++ int read; ++ char *mode; ++ ++ if (sscanf(line, "rel%ms %lu+%lu %lu+%lu", ++ &mode, &dst, &offset, &src, &src_offset) == 5) { ++ ++ if (strcmp(mode, "oc") && strcmp(mode, "split")) { ++ fprintf(stderr, "Unknown relocation mode 'rel%s'\n", mode); ++ } ++ bool split = (mode[0] == 's'); ++ free(mode); ++ ++ struct panfrost_ptr *s = buffers_elem(&buffers, src); ++ struct panfrost_ptr *d = buffers_elem(&buffers, dst); ++ ++ if (!s->gpu || !d->gpu) { ++ fprintf(stderr, "relocating to buffer that doesn't exist!\n"); ++ } ++ ++ uint64_t *dest = d->cpu + offset; ++ uint64_t value = s->gpu + src_offset; ++ if (split) { ++ dest[0] |= (uint32_t) value; ++ dest[1] |= (uint32_t) (value >> 32); ++ } else { ++ *dest |= value; ++ } ++ ++ } else if (sscanf(line, "buffer %lu %lu %lx %n", ++ &dst, &size, &flags, &read) == 3) { ++ line += read; ++ ++ struct panfrost_ptr buffer = ++ alloc_mem(s, ALIGN_POT(size, s->page_size), ++ flags); ++ ++ alloc_redzone(s, buffer, ALIGN_POT(size, s->page_size)); ++ ++ *buffers_elem(&buffers, dst) = buffer; ++ ++ //printf("buffer %lu == 0x%lx\n", dst, buffer.gpu); ++ ++ uint64_t *fill = buffer.cpu; ++ ++ for (unsigned i = 0; i < size / 8; ++i) { ++ read = 0; ++ unsigned long long val = 0; ++ if (sscanf(line, "%Lx %n", &val, &read) != 1) ++ break; ++ line += read; ++ fill[i] = val; ++ } ++ ++ cache_clean_range(buffer.cpu, size); ++ ++ } else if (sscanf(line, "exe %n %lu %lu %lu", ++ &read, &iter, &dst, &size) == 3) { ++ line += read; ++ ++ unsigned iter_mask = 0; ++ ++ for (;;) { ++ read = 0; ++ if (sscanf(line, "%lu %lu %lu %n", ++ &iter, &dst, &size, &read) != 3) ++ break; ++ line += read; ++ ++ struct panfrost_ptr *d = ++ buffers_elem(&buffers, dst); ++ ++ /* TODO: Check 'size' against buffer size */ ++ ++ pandecode_cs(d->gpu, size, s->gpu_id); ++ ++ if (iter > 3) { ++ fprintf(stderr, 
++ "execute on out-of-bounds " ++ "iterator\n"); ++ continue; ++ } ++ ++ memcpy(s->cs[iter].ptr, d->cpu, size); ++ s->cs[iter].ptr += size / 8; ++ ++ iter_mask |= (1 << iter); ++ } ++ ++ u_foreach_bit(i, iter_mask) ++ submit_cs(s, i); ++ ++ u_foreach_bit(i, iter_mask) ++ kick_queue(s, i); ++ ++ u_foreach_bit(i, iter_mask) ++ wait_cs(s, i); ++ ++ } else if (sscanf(line, "dump %lu %lu %lu %ms", ++ &src, &offset, &size, &mode) == 4) { ++ ++ struct panfrost_ptr *s = buffers_elem(&buffers, src); ++ ++ if (!s->gpu) ++ fprintf(stderr, "dumping buffer that doesn't exist!\n"); ++ ++ cache_invalidate_range(s->cpu + offset, size); ++ ++ if (!strcmp(mode, "hex")) ++ pan_hexdump(stdout, s->cpu + offset, size, true); ++ else if (!strcmp(mode, "hex64")) ++ dump_hex64(stdout, s->cpu + offset, size); ++ else if (!strcmp(mode, "delta")) ++ dump_delta(stdout, s->cpu + offset, size); ++ else if (!strcmp(mode, "tiler")) ++ dump_tiler(stdout, s->cpu + offset, size); ++ else if (!strcmp(mode, "filehex")) ++ dump_filehex(s->cpu + offset, size); ++ ++ free(mode); ++ ++ } else if (sscanf(line, "heatmap %lu %lu %lu %lu %lu %lu", ++ &src, &offset, &size, ++ &gran, &length, &stride) == 6) { ++ ++ struct panfrost_ptr *s = buffers_elem(&buffers, src); ++ ++ if (!s->gpu) ++ fprintf(stderr, "dumping buffer that doesn't exist!\n"); ++ ++ cache_invalidate_range(s->cpu + offset, size); ++ ++ dump_heatmap(stdout, s->cpu + offset, size, ++ gran, length, stride); ++ ++ } else if (sscanf(line, "memset %lu %lu %lu %lu", ++ &src, &offset, &gran, &size) == 4) { ++ ++ struct panfrost_ptr *s = buffers_elem(&buffers, src); ++ ++ if (!s->gpu) ++ fprintf(stderr, "memset on buffer that doesn't exist!\n"); ++ ++ memset(s->cpu + offset, gran, size); ++ cache_clean_range(s->cpu + offset, size); ++ ++ } else if (sscanf(line, "sleep %lu", &size) == 1) { ++ ++ usleep(size * 1000); ++ ++ } else if (strcmp(line, "td\n") == 0 || strcmp(line, "td") == 0) { ++ ++ void *ptr; ++ ++ ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, ++ s->tiler_heap_header); ++ pan_hexdump(stdout, ptr, 4096, false); ++ pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); ++ munmap(ptr, 1 << 21); ++ ++ ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, ++ s->tiler_heap_header + (1 << 21)); ++ pan_hexdump(stdout, ptr, 4096, false); ++ pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); ++ munmap(ptr, 1 << 21); ++ ++ } else { ++ fprintf(stderr, "unknown command '%s'\n", line); ++ } ++ } ++ ++ /* Skip following tests */ ++ return false; ++} ++ ++static void ++pan_cs_evadd(pan_command_stream *c, unsigned offset, unsigned value) ++{ ++ pan_emit_cs_32(c, 0x5e, value); ++ pan_pack_ins(c, CS_ADD_IMM, cfg) { ++ cfg.value = offset; ++ cfg.src = 0x5a; ++ cfg.dest = 0x5c; ++ } ++ pan_pack_ins(c, CS_EVADD, cfg) { ++ cfg.value = 0x5e; ++ cfg.addr = 0x5c; ++ } ++} ++ ++static bool ++cs_simple(struct state *s, struct test *t) ++{ ++ unsigned queue = t->vertex ? 2 : 0; ++ ++ pan_command_stream *c = s->cs + queue; ++ ++ unsigned dest = t->invalid ? 
0x65 : 0x48; ++ ++ pan_emit_cs_32(c, dest, 0x1234); ++ pan_cs_evadd(c, 0, 1); ++ ++ submit_cs(s, queue); ++ return wait_cs(s, queue); ++} ++ ++static bool ++cs_store(struct state *s, struct test *t) ++{ ++ pan_command_stream *c = s->cs; ++ ++ uint32_t *dest = s->allocations.ev2.cpu + 240; ++ mali_ptr dest_va = s->allocations.ev2.gpu + 240; ++ uint32_t value = 1234; ++ uint32_t add = 4320000; ++ ++ *dest = 0; ++ cache_clean(dest); ++ ++ unsigned addr_reg = 0x48; ++ unsigned value_reg = 0x4a; ++ ++ if (t->invalid) ++ dest_va = 0xfdcba9876543; ++ ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = (1 << 1); } ++ pan_emit_cs_48(c, addr_reg, dest_va); ++ pan_emit_cs_32(c, value_reg, value); ++ ++ if (t->add) { ++ pan_pack_ins(c, CS_ADD_IMM, cfg) { ++ cfg.value = add; ++ cfg.src = value_reg; ++ cfg.dest = value_reg; ++ } ++ value += add; ++ } ++ ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.addr = addr_reg; ++ cfg.register_base = value_reg; ++ cfg.register_mask = 1; ++ } ++ pan_cs_evadd(c, 0, 1); ++ ++ submit_cs(s, 0); ++ wait_cs(s, 0); ++ ++ cache_invalidate(dest); ++ cache_barrier(); /* Just in case it's needed */ ++ uint32_t result = *dest; ++ ++ if (t->invalid && result == value) { ++ printf("Got %i, did not expect %i: ", result, value); ++ return false; ++ } else if (result != value) { ++ printf("Got %i, expected %i: ", result, value); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void ++emit_cs_call(pan_command_stream *c, mali_ptr va, void *start, void *end) ++{ ++ cache_clean_range(start, end - start); ++ ++ pan_emit_cs_48(c, 0x48, va); ++ pan_emit_cs_32(c, 0x4a, end - start); ++ pan_pack_ins(c, CS_CALL, cfg) { ++ cfg.address = 0x48; ++ cfg.length = 0x4a; ++ } ++} ++ ++static bool ++cs_sub(struct state *s, struct test *t) ++{ ++ pan_command_stream *c = s->cs; ++ pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; ++ mali_ptr cs_va = s->allocations.cached.gpu; ++ ++ uint32_t *dest = s->allocations.normal.cpu; ++ mali_ptr dest_va = s->allocations.normal.gpu; ++ uint32_t value = 4321; ++ ++ *dest = 0; ++ cache_clean(dest); ++ ++ unsigned addr_reg = 0x48; ++ unsigned value_reg = 0x4a; ++ ++ void *start = i->ptr; ++ ++ pan_emit_cs_ins(c, 0x30, 0x5a0000000000); ++ ++ pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } ++ pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = (1 << 3); } ++ //pan_emit_cs_ins(i, 0x31, 0); ++ ++ pan_emit_cs_48(i, addr_reg, dest_va); ++ pan_emit_cs_32(i, value_reg, value); ++ //pan_emit_cs_ins(i, 0x25, 0x01484a00000005ULL); ++ pan_pack_ins(i, CS_STR, cfg) { ++ cfg.addr = addr_reg; ++ cfg.register_base = value_reg; ++ cfg.register_mask = 1; ++ } ++ //pan_emit_cs_ins(i, 0x09, 0); ++ //pan_emit_cs_ins(i, 0x31, 0x100000000); ++ ++ //pan_emit_cs_ins(i, 0x24, 0x024a0000f80211ULL); ++ ++ /* ++ pan_pack_ins(i, CS_STR_32, cfg) { ++ cfg.unk_1 = 1; ++ cfg.unk_2 = 4; ++ cfg.unk_3 = 1; ++ cfg.addr = addr_reg; ++ cfg.value = value_reg; ++ }*/ ++ ++ emit_cs_call(c, cs_va, start, i->ptr); ++ pan_cs_evadd(c, 0, 1); ++ ++ submit_cs(s, 0); ++ wait_cs(s, 0); ++ ++ cache_invalidate(dest); ++ cache_barrier(); /* Just in case it's needed */ ++ uint32_t result = *dest; ++ ++ if (result != value) { ++ printf("Got %i, expected %i: ", result, value); ++ return false; ++ } ++ ++ return true; ++} ++ ++static mali_ptr ++upload_shader(struct state *s, struct util_dynarray binary) ++{ ++ assert(s->shader_alloc_offset + binary.size < s->page_size); ++ ++ mali_ptr va = s->allocations.exec.gpu + s->shader_alloc_offset; ++ ++ memcpy(s->allocations.exec.cpu, binary.data, binary.size); ++ ++ /* 
Shouldn't be needed, but just in case... */ ++ cache_clean_range(s->allocations.exec.cpu, binary.size); ++ ++ s->shader_alloc_offset += binary.size; ++ ++ return va; ++} ++ ++static bool ++compute_compile(struct state *s, struct test *t) ++{ ++ nir_builder _b = ++ nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, ++ GENX(pan_shader_get_compiler_options)(), ++ "mem_store"), *b = &_b; ++ ++ nir_ssa_def *ptr = ++ nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0)); ++ ++ nir_ssa_def *value = nir_imm_int(b, 123); ++ ++ nir_store_global(b, ptr, 8, value, 1); ++ ++ struct panfrost_compile_inputs inputs = { ++ .gpu_id = s->gpu_id, ++ .no_ubo_to_push = true, ++ }; ++ ++ struct util_dynarray binary = {0}; ++ struct pan_shader_info shader_info = {0}; ++ ++ GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info); ++ ++ dump_start(stderr); ++ disassemble_valhall(stderr, binary.data, binary.size, true); ++ dump_end(stderr); ++ ++ s->compute_shader = upload_shader(s, binary); ++ ++ util_dynarray_fini(&binary); ++ ralloc_free(b->shader); ++ ++ return true; ++} ++ ++static struct panfrost_ptr ++mem_offset(struct panfrost_ptr ptr, unsigned offset) ++{ ++ ptr.cpu += offset; ++ ptr.gpu += offset; ++ return ptr; ++} ++ ++static bool ++compute_execute(struct state *s, struct test *t) ++{ ++ unsigned queue = t->blit ? 1 : 0; ++ ++ pan_command_stream *c = s->cs + queue; ++ pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; ++ mali_ptr cs_va = s->allocations.cached.gpu; ++ ++ struct panfrost_ptr dest = s->allocations.normal; ++ uint32_t value = 123; ++ ++ *(uint32_t *) dest.cpu = 0; ++ cache_clean(dest.cpu); ++ ++ struct panfrost_ptr fau = mem_offset(dest, 128); ++ *(uint64_t *) fau.cpu = dest.gpu; ++ cache_clean(fau.cpu); ++ ++ struct panfrost_ptr local_storage = mem_offset(dest, 192); ++ pan_pack(local_storage.cpu, LOCAL_STORAGE, _); ++ cache_clean(local_storage.cpu); ++ ++ struct panfrost_ptr shader_program = mem_offset(dest, 256); ++ pan_pack(shader_program.cpu, SHADER_PROGRAM, cfg) { ++ cfg.stage = MALI_SHADER_STAGE_COMPUTE; ++ cfg.primary_shader = true; ++ cfg.register_allocation = ++ MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; ++ cfg.binary = s->compute_shader; ++ } ++ cache_clean(shader_program.cpu); ++ ++ void *start = i->ptr; ++ ++ pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } ++ //pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = 1 << 3; } ++ ++ pan_pack_cs(i, COMPUTE_PAYLOAD, cfg) { ++ cfg.workgroup_size_x = 1; ++ cfg.workgroup_size_y = 1; ++ cfg.workgroup_size_z = 1; ++ ++ cfg.workgroup_count_x = 1; ++ cfg.workgroup_count_y = 1; ++ cfg.workgroup_count_z = 1; ++ ++ cfg.compute.shader = shader_program.gpu; ++ cfg.compute.thread_storage = local_storage.gpu; ++ ++ cfg.compute.fau = fau.gpu; ++ cfg.compute.fau_count = 1; ++ } ++ ++ pan_pack_ins(i, COMPUTE_LAUNCH, _); ++ ++ //pan_emit_cs_32(c, 0x54, 1); ++ //pan_emit_cs_ins(c, 0x24, 0x540000000233); ++ emit_cs_call(c, cs_va, start, i->ptr); ++ ++ pan_emit_cs_32(c, 0x4a, 0); ++ pan_emit_cs_ins(c, 0x24, 0x024a0000000211ULL); ++ ++ pan_emit_cs_48(c, 0x48, dest.gpu); ++ pan_pack_ins(c, CS_LDR, cfg) { ++ cfg.offset = 0; ++ cfg.register_mask = 1; ++ cfg.addr = 0x48; ++ cfg.register_base = 0x20; ++ } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1; } ++ pan_pack_ins(c, CS_ADD_IMM, cfg) { ++ cfg.value = 1; ++ cfg.src = 0x20; ++ cfg.dest = 0x20; ++ } ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.offset = 64; ++ cfg.register_mask = 1; ++ cfg.addr = 0x48; ++ cfg.register_base = 0x20; ++ } ++ ++ pan_cs_evadd(c, 0, 1); ++ ++ submit_cs(s, 
queue); ++ wait_cs(s, queue); ++ ++ cache_invalidate(dest.cpu); ++ cache_barrier(); /* Just in case it's needed */ ++ uint32_t result = ((uint32_t *)dest.cpu)[0]; ++ uint32_t result2 = ((uint32_t *)dest.cpu)[16]; ++ ++ if (result != value) { ++ printf("Got %i, %i, expected %i: ", result, result2, value); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++mmu_dump(struct state *s, struct test *t) ++{ ++ unsigned size = 1024 * 1024; ++ ++ void *mem = mmap(NULL, size, PROT_READ, MAP_SHARED, ++ s->mali_fd, BASE_MEM_MMU_DUMP_HANDLE); ++ if (mem == MAP_FAILED) { ++ perror("mmap(BASE_MEM_MMU_DUMP_HANDLE)"); ++ return false; ++ } ++ ++ pan_hexdump(stdout, mem, size, true); ++ ++ return true; ++} ++ ++#define SUBTEST(s) { .label = #s, .subtests = s, .sub_length = ARRAY_SIZE(s) } ++ ++#define STATE(item) .offset = offsetof(struct state, item) ++ ++#define ALLOC(item) .offset = offsetof(struct state, allocations.item) ++#define ALLOC_TEST(label, item, f) { alloc, dealloc, label, ALLOC(item), .flags = f } ++ ++struct test kbase_main[] = { ++ { open_kbase, close_kbase, "Open kbase device" }, ++ { get_version, NULL, "Check version" }, ++ { set_flags, NULL, "Set flags" }, ++ { mmap_tracking, munmap_tracking, "Map tracking handle" }, ++ { get_gpuprops, free_gpuprops, "Get GPU properties" }, ++ { get_gpu_id, NULL, "GPU ID" }, ++ { get_coherency_mode, NULL, "Coherency mode" }, ++ { get_csf_caps, NULL, "CSF caps" }, ++ { mmap_user_reg, munmap_user_reg, "Map user register page" }, ++ { init_mem_exec, NULL, "Initialise EXEC_VA zone" }, ++ { init_mem_jit, NULL, "Initialise JIT allocator" }, ++ { stream_create, stream_destroy, "Create synchronisation stream" }, ++ { tiler_heap_create, tiler_heap_term, "Create chunked tiler heap" }, ++ { cs_group_create, cs_group_term, "Create command stream group" }, ++ { decode_init, decode_close, "Initialize pandecode" }, ++ ++ /* Flags are named in mali_base_csf_kernel.h, omitted for brevity */ ++ ALLOC_TEST("Allocate normal memory", normal, 0x200f), ++ ALLOC_TEST("Allocate exectuable memory", exec, 0x2017), ++ ALLOC_TEST("Allocate coherent memory", coherent, 0x280f), ++ ALLOC_TEST("Allocate cached memory", cached, 0x380f), ++ ALLOC_TEST("Allocate CSF event memory", event, 0x8200f), ++ ALLOC_TEST("Allocate CSF event memory 2", ev2, 0x8200f), ++ ++ /* These three tests are run for every queue, but later ones are not */ ++ { cs_queue_create, cs_queue_free, "Create command stream queues" }, ++ { cs_queue_register, cs_queue_term, "Register command stream queues" }, ++ ++ { cs_test, NULL, "Test command stream" }, ++ ++ { cs_init, NULL, "Initialise and start command stream queues" }, ++ { cs_simple, NULL, "Execute MOV command" }, ++ { cs_simple, NULL, "Execute MOV command (again)" }, ++ { cs_simple, NULL, "Execute MOV command (vertex)", .vertex = true }, ++ //{ cs_simple, NULL, "Execute MOV command (vertex, invalid)", .invalid = true, .vertex = true }, ++ { cs_simple, NULL, "Execute MOV command (vertex, again)", .vertex = true }, ++ { cs_store, NULL, "Execute STR command" }, ++ //{ cs_store, NULL, "Execute STR command to invalid address", .invalid = true }, ++ { cs_store, NULL, "Execute ADD command", .add = true }, ++ { cs_sub, NULL, "Execute STR on iterator" }, ++ ++ { compute_compile, NULL, "Compile a compute shader" }, ++ { compute_execute, NULL, "Execute a compute shader" }, ++ { compute_execute, NULL, "Execute compute on blit queue", .blit = true }, ++ ++ //{ mmu_dump, NULL, "Dump MMU pagetables" }, ++}; ++ ++static void ++do_test_list(struct state *s, struct test 
*tests, unsigned length); ++ ++static void ++cleanup_test_list(struct state *s, struct test *tests, unsigned length) ++{ ++ for (unsigned i = length; i > 0; --i) { ++ unsigned n = i - 1; ++ ++ struct test *t = &tests[n]; ++ if (!t->cleanup) ++ continue; ++ ++ if (pr) ++ printf("[CLEANUP %i] %s: ", n, t->label); ++ if (t->cleanup(s, t)) { ++ if (pr) ++ printf("PASS\n"); ++ } else { ++ if (pr) ++ printf("FAIL\n"); ++ } ++ } ++} ++ ++static unsigned ++interpret_test_list(struct state *s, struct test *tests, unsigned length) ++{ ++ for (unsigned i = 0; i < length; ++i) { ++ struct test *t = &tests[i]; ++ ++ if (pr) ++ printf("[TEST %i] %s: ", i, t->label); ++ if (t->part) { ++ if (t->part(s, t)) { ++ if (pr) ++ printf("PASS\n"); ++ } else { ++ if (pr) ++ printf("FAIL\n"); ++ if (!getenv("TEST_KEEP_GOING")) ++ return i + 1; ++ } ++ } ++ if (t->subtests) ++ do_test_list(s, t->subtests, t->sub_length); ++ } ++ ++ return length; ++} ++ ++static void ++do_test_list(struct state *s, struct test *tests, unsigned length) ++{ ++ unsigned ran = interpret_test_list(s, tests, length); ++ cleanup_test_list(s, tests, ran); ++} ++ ++int ++main(int argc, char *argv[]) ++{ ++ struct state s = { ++ .page_size = sysconf(_SC_PAGE_SIZE), ++ .argc = argc, ++ .argv = argv, ++ }; ++ ++ if (getenv("CSF_QUIET")) ++ pr = false; ++ ++ if (!strcmp(getenv("TERM"), "dumb")) ++ colour_term = false; ++ ++ if (pr) ++ printf("Running Valhall CSF tests\n"); ++ ++ do_test_list(&s, kbase_main, ARRAY_SIZE(kbase_main)); ++} +diff --git a/src/panfrost/lib/genxml/common.xml b/src/panfrost/lib/genxml/common.xml +index d4b5240fb01..d75baaba208 100644 +--- a/src/panfrost/lib/genxml/common.xml ++++ b/src/panfrost/lib/genxml/common.xml +@@ -46,7 +46,7 @@ + + + +- ++ + + + +diff --git a/src/panfrost/lib/genxml/decode.c b/src/panfrost/lib/genxml/decode.c +index ae214e8d7ec..86298fa5d42 100644 +--- a/src/panfrost/lib/genxml/decode.c ++++ b/src/panfrost/lib/genxml/decode.c +@@ -54,6 +54,12 @@ + pan_section_print(pandecode_dump_stream, A, S, temp, (pandecode_indent + 1) * 2); \ + } + ++#define DUMP_SECTION_CS_V10(A, S, cl, buf, buf_unk, ...) 
{ \ ++ pan_section_unpack_cs_v10(cl, buf, buf_unk, A, S, temp); \ ++ pandecode_log(__VA_ARGS__); \ ++ pan_section_print(pandecode_dump_stream, A, S, temp, (pandecode_indent + 1) * 2); \ ++} ++ + #define MAP_ADDR(T, addr, cl) \ + const uint8_t *cl = pandecode_fetch_gpu_mem(addr, pan_size(T)); + +@@ -158,7 +164,7 @@ pandecode_midgard_tiler_descriptor( + if (nonzero_weights) + DUMP_UNPACKED(TILER_WEIGHTS, w, "Tiler Weights:\n"); + } +-#endif ++#endif /* PAN_ARCH <= 5 */ + + #if PAN_ARCH >= 5 + static void +@@ -184,7 +190,7 @@ pandecode_render_target(uint64_t gpu_va, unsigned gpu_id, + pandecode_indent--; + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH >= 5 */ + + #if PAN_ARCH >= 6 + static void +@@ -201,7 +207,7 @@ pandecode_sample_locations(const void *fb) + samples[2 * i + 1] - 128); + } + } +-#endif ++#endif /* PAN_ARCH >= 6 */ + + static void + pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, +@@ -228,29 +234,29 @@ pandecode_fbd(uint64_t gpu_va, bool is_fragment, unsigned gpu_id) + #if PAN_ARCH >= 6 + pandecode_sample_locations(fb); + +- unsigned dcd_size = pan_size(DRAW); ++ unsigned dcd_size = pan_size(DRAW_NO_CS); + + if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (0 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Pre frame 0:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } + + if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (1 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Pre frame 1:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } + + if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (2 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Post frame:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } +-#else ++#else /* PAN_ARCH < 6 */ + DUMP_SECTION(FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n"); + + const void *t = pan_section_ptr(fb, FRAMEBUFFER, TILER); +@@ -284,7 +290,7 @@ pandecode_fbd(uint64_t gpu_va, bool is_fragment, unsigned gpu_id) + .rt_count = params.render_target_count, + .has_extra = params.has_zs_crc_extension + }; +-#else ++#else /* PAN_ARCH < 5 */ + /* Dummy unpack of the padding section to make sure all words are 0. + * No need to call print here since the section is supposed to be empty. + */ +@@ -341,7 +347,7 @@ pandecode_attributes(mali_ptr addr, int count, + } + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH >= 5 + static mali_ptr +@@ -358,7 +364,7 @@ pandecode_blend(void *descs, int rt_no, mali_ptr frag_shader) + return b.blend_shader ? 
(b.shader_pc & ~0xf) : 0; + #endif + } +-#endif ++#endif /* PAN_ARCH >= 6 || PAN_ARCH == 5 */ + + #if PAN_ARCH <= 7 + static unsigned +@@ -412,8 +418,9 @@ pandecode_invocation(const void *i) + + DUMP_UNPACKED(INVOCATION, invocation, "Invocation:\n") + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + ++#if PAN_ARCH < 10 + static void + pandecode_primitive(const void *p) + { +@@ -439,7 +446,7 @@ pandecode_primitive(const void *p) + pandecode_validate_buffer(primitive.indices, primitive.index_count * size); + } else if (primitive.index_type) + pandecode_log("// XXX: unexpected index size\n"); +-#endif ++#endif /* PAN_ARCH <= 7 */ + } + + static void +@@ -451,6 +458,7 @@ pandecode_primitive_size(const void *s, bool constant) + + DUMP_UNPACKED(PRIMITIVE_SIZE, ps, "Primitive Size:\n") + } ++#endif /* PAN_ARCH < 10 */ + + #if PAN_ARCH <= 7 + static void +@@ -482,7 +490,7 @@ pandecode_uniforms(mali_ptr uniforms, unsigned uniform_count) + free(ptr); + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + static void + pandecode_shader_disassemble(mali_ptr shader_ptr, int type, unsigned gpu_id) +@@ -566,7 +574,7 @@ pandecode_texture_payload(mali_ptr payload, + pandecode_indent--; + pandecode_log("},\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH <= 5 + static void +@@ -585,7 +593,7 @@ pandecode_texture(mali_ptr u, unsigned tex) + temp.levels, nr_samples, temp.array_size); + pandecode_indent--; + } +-#else ++#else /* PAN_ARCH > 5 */ + static void + pandecode_texture(const void *cl, unsigned tex) + { +@@ -603,7 +611,7 @@ pandecode_texture(const void *cl, unsigned tex) + + for (unsigned i = 0; i < plane_count; ++i) + DUMP_ADDR(PLANE, temp.surfaces + i * pan_size(PLANE), "Plane %u:\n", i); +-#else ++#else /* PAN_ARCH < 9 */ + unsigned nr_samples = temp.dimension == MALI_TEXTURE_DIMENSION_3D ? 
+ 1 : temp.sample_count; + +@@ -630,7 +638,7 @@ pandecode_textures(mali_ptr textures, unsigned texture_count) + + for (unsigned tex = 0; tex < texture_count; ++tex) + pandecode_texture(cl + pan_size(TEXTURE) * tex, tex); +-#else ++#else /* PAN_ARCH < 6 */ + mali_ptr *PANDECODE_PTR_VAR(u, textures); + + for (int tex = 0; tex < texture_count; ++tex) { +@@ -741,7 +749,7 @@ pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + gpu_id); + } + } +-#endif ++#endif /* PAN_ARCH >= 5 */ + } else + pandecode_log("// XXX: missing shader descriptor\n"); + +@@ -807,7 +815,7 @@ pandecode_vertex_compute_geometry_job(const struct MALI_JOB_HEADER *h, + pandecode_indent--; + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH >= 6 + static void +@@ -823,6 +831,10 @@ pandecode_tiler(mali_ptr gpu_va) + DUMP_UNPACKED(TILER_CONTEXT, t, "Tiler:\n"); + } + ++#endif /* PAN_ARCH >= 6 */ ++ ++#if PAN_ARCH < 10 ++#if PAN_ARCH >= 6 + #if PAN_ARCH <= 7 + static void + pandecode_indexed_vertex_job(const struct MALI_JOB_HEADER *h, +@@ -854,8 +866,8 @@ pandecode_indexed_vertex_job(const struct MALI_JOB_HEADER *h, + + pan_section_unpack(p, INDEXED_VERTEX_JOB, PADDING, padding); + } +-#endif +-#endif ++#endif /* PAN_ARCH <= 7 */ ++#endif /* PAN_ARCH >= 6 */ + + static void + pandecode_tiler_job(const struct MALI_JOB_HEADER *h, +@@ -890,7 +902,7 @@ pandecode_tiler_job(const struct MALI_JOB_HEADER *h, + pan_section_unpack(p, TILER_JOB, PADDING, padding); + #endif + +-#else ++#else /* PAN_ARCH < 6 */ + pan_section_unpack(p, TILER_JOB, PRIMITIVE, primitive); + pandecode_primitive_size(pan_section_ptr(p, TILER_JOB, PRIMITIVE_SIZE), + primitive.point_size_array_format == MALI_POINT_SIZE_ARRAY_FORMAT_NONE); +@@ -898,12 +910,17 @@ pandecode_tiler_job(const struct MALI_JOB_HEADER *h, + pandecode_indent--; + pandecode_log("\n"); + } ++#endif /* PAN_ARCH < 10 */ + + static void +-pandecode_fragment_job(mali_ptr job, unsigned gpu_id) ++pandecode_fragment_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_fragment_job_packed *PANDECODE_PTR_VAR(p, job); +- pan_section_unpack(p, FRAGMENT_JOB, PAYLOAD, s); ++#endif ++ ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, FRAGMENT_JOB, PAYLOAD, s); + + UNUSED struct pandecode_fbd info = pandecode_fbd(s.framebuffer, true, gpu_id); + +@@ -920,7 +937,7 @@ pandecode_fragment_job(mali_ptr job, unsigned gpu_id) + expected_tag |= MALI_FBD_TAG_HAS_ZS_RT; + + expected_tag |= MALI_FBD_TAG_IS_MFBD | (MALI_POSITIVE(info.rt_count) << 2); +-#endif ++#endif /* PAN_ARCH >= 5 */ + + DUMP_UNPACKED(FRAGMENT_JOB_PAYLOAD, s, "Fragment Job Payload:\n"); + +@@ -936,6 +953,8 @@ pandecode_fragment_job(mali_ptr job, unsigned gpu_id) + pandecode_log("\n"); + } + ++#if PAN_ARCH < 10 ++// TODO: Use the same model as for malloc_vertex jobs? 
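For reference, a sketch (based on the pan_section_unpack_cs_v10 definitions added to gen_pack.py later in this patch) of how the dual-mode unpack used by pandecode_fragment_job() above resolves on each architecture:

/*
 *   pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, FRAGMENT_JOB, PAYLOAD, s);
 *
 * PAN_ARCH < 10 expands to the existing memory-based path:
 *     pan_section_unpack(p, FRAGMENT_JOB, PAYLOAD, s);
 *
 * PAN_ARCH == 10 instead unpacks from the 256-entry CS register snapshot:
 *     MALI_FRAGMENT_JOB_SECTION_PAYLOAD_TYPE s;
 *     MALI_FRAGMENT_JOB_SECTION_PAYLOAD_unpack(cs_buf, cs_buf_unk, &s);
 * where the generated _unpack() also calls __gen_clear_value() on cs_buf_unk,
 * so pandecode_cs_dump_state() can later report only the undecoded registers.
 */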
+ static void + pandecode_write_value_job(mali_ptr job) + { +@@ -953,6 +972,7 @@ pandecode_cache_flush_job(mali_ptr job) + DUMP_SECTION(CACHE_FLUSH_JOB, PAYLOAD, p, "Cache Flush Payload:\n"); + pandecode_log("\n"); + } ++#endif /* PAN_ARCH < 10 */ + + #if PAN_ARCH >= 9 + static void +@@ -1034,6 +1054,9 @@ pandecode_resource_tables(mali_ptr addr, const char *label) + static void + pandecode_depth_stencil(mali_ptr addr) + { ++ if (!addr) ++ return; ++ + MAP_ADDR(DEPTH_STENCIL, addr, cl); + pan_unpack(cl, DEPTH_STENCIL, desc); + DUMP_UNPACKED(DEPTH_STENCIL, desc, "Depth/stencil"); +@@ -1060,14 +1083,15 @@ static void + pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + unsigned gpu_id) + { +- mali_ptr frag_shader = 0; +- + pandecode_depth_stencil(p->depth_stencil); + + for (unsigned i = 0; i < p->blend_count; ++i) { ++ MAP_ADDR(SHADER_PROGRAM, p->shader.shader, cl); ++ pan_unpack(cl, SHADER_PROGRAM, desc); ++ + struct mali_blend_packed *PANDECODE_PTR_VAR(blend_descs, p->blend); + +- mali_ptr blend_shader = pandecode_blend(blend_descs, i, frag_shader); ++ mali_ptr blend_shader = pandecode_blend(blend_descs, i, desc.binary); + if (blend_shader) { + fprintf(pandecode_dump_stream, "Blend shader %u", i); + pandecode_shader_disassemble(blend_shader, 0, gpu_id); +@@ -1079,21 +1103,26 @@ pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + } + + static void +-pandecode_malloc_vertex_job(mali_ptr job, unsigned gpu_id) ++pandecode_malloc_vertex_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_malloc_vertex_job_packed *PANDECODE_PTR_VAR(p, job); ++#endif + +- DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE, p, "Primitive:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, INSTANCE_COUNT, p, "Instance count:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, PRIMITIVE, p, cs_buf, cs_buf_unk, "Primitive:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, INSTANCE_COUNT, p, cs_buf, cs_buf_unk, "Instance count:\n"); ++#if PAN_ARCH < 10 + DUMP_SECTION(MALLOC_VERTEX_JOB, ALLOCATION, p, "Allocation:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, TILER, p, "Tiler:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, SCISSOR, p, "Scissor:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE_SIZE, p, "Primitive Size:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, INDICES, p, "Indices:\n"); ++#endif ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, TILER, p, cs_buf, cs_buf_unk, "Tiler:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, SCISSOR, p, cs_buf, cs_buf_unk, "Scissor:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, PRIMITIVE_SIZE, p, cs_buf, cs_buf_unk, "Primitive Size:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, INDICES, p, cs_buf, cs_buf_unk, "Indices:\n"); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, DRAW, dcd); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, DRAW, dcd); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, TILER, tiler_ptr); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, TILER, tiler_ptr); + pandecode_log("Tiler Job Payload:\n"); + pandecode_indent++; + if (tiler_ptr.address) +@@ -1104,17 +1133,20 @@ pandecode_malloc_vertex_job(mali_ptr job, unsigned gpu_id) + + pandecode_dcd(&dcd, 0, gpu_id); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, POSITION, position); +- pan_section_unpack(p, MALLOC_VERTEX_JOB, VARYING, varying); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, POSITION, position); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, VARYING, varying); + 
pandecode_shader_environment(&position, gpu_id); + pandecode_shader_environment(&varying, gpu_id); + } + + static void +-pandecode_compute_job(mali_ptr job, unsigned gpu_id) ++pandecode_compute_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_compute_job_packed *PANDECODE_PTR_VAR(p, job); +- pan_section_unpack(p, COMPUTE_JOB, PAYLOAD, payload); ++#endif ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, COMPUTE_JOB, PAYLOAD, payload); + + pandecode_shader(payload.compute.shader, "Shader", gpu_id); + if (payload.compute.thread_storage) +@@ -1126,8 +1158,9 @@ pandecode_compute_job(mali_ptr job, unsigned gpu_id) + + DUMP_UNPACKED(COMPUTE_PAYLOAD, payload, "Compute:\n"); + } +-#endif ++#endif /* PAN_ARCH >= 9 */ + ++#if PAN_ARCH < 10 + /* Entrypoint to start tracing. jc_gpu_va is the GPU address for the first job + * in the chain; later jobs are found by walking the chain. GPU ID is the + * more finegrained ID because some details are model-specific even within a +@@ -1183,18 +1216,18 @@ GENX(pandecode_jc)(mali_ptr jc_gpu_va, unsigned gpu_id) + pandecode_indexed_vertex_job(&h, jc_gpu_va, gpu_id); + break; + #endif +-#else ++#else /* PAN_ARCH > 7 */ + case MALI_JOB_TYPE_COMPUTE: +- pandecode_compute_job(jc_gpu_va, gpu_id); ++ pandecode_compute_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + + case MALI_JOB_TYPE_MALLOC_VERTEX: +- pandecode_malloc_vertex_job(jc_gpu_va, gpu_id); ++ pandecode_malloc_vertex_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + #endif + + case MALI_JOB_TYPE_FRAGMENT: +- pandecode_fragment_job(jc_gpu_va, gpu_id); ++ pandecode_fragment_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + + default: +@@ -1232,3 +1265,544 @@ GENX(pandecode_abort_on_fault)(mali_ptr jc_gpu_va) + + pandecode_map_read_write(); + } ++#endif ++ ++#if PAN_ARCH >= 10 ++static void ++pandecode_cs_dump_state(uint32_t *state) ++{ ++ uint64_t *st_64 = (uint64_t *)state; ++ /* Only registers below 0x40 seem to be actually be used by jobs */ ++ for (unsigned i = 0; i < 0x40 / 4; ++i) { ++ uint64_t v1 = st_64[i * 2]; ++ uint64_t v2 = st_64[i * 2 + 1]; ++ ++ if (!v1 && !v2) ++ continue; ++ ++ pandecode_log("0x%2x: 0x%16"PRIx64" 0x%16"PRIx64"\n", ++ i * 4, v1, v2); ++ } ++} ++ ++/* Assumes eight scoreboards */ ++static void ++pandecode_scoreboard_mask(unsigned mask) ++{ ++ if (mask == 0xff) { ++ pandecode_log_cont("all"); ++ return; ++ } else if (!mask) { ++ pandecode_log_cont("none"); ++ return; ++ } ++ ++ const char *comma = ""; ++ for (unsigned i = 0; i < 8; ++i) { ++ if (mask & (1 << i)) { ++ pandecode_log_cont("%s%i", comma, i); ++ comma = ","; ++ } ++ } ++} ++ ++static void ++pandecode_regmask(unsigned base, unsigned mask) ++{ ++ switch (mask) { ++ case 0: ++ pandecode_log_cont("(invalid: %02x mask 0)", base); ++ return; ++ case 1: ++ pandecode_log_cont("w%02x", base); ++ return; ++ case 3: ++ pandecode_log_cont("x%02x", base); ++ return; ++ default: ++ break; ++ } ++ ++ unsigned first = ffs(mask) - 1; ++ if (first) ++ pandecode_log_cont("{(+%i) ", first); ++ else ++ pandecode_log_cont("{"); ++ ++ unsigned edges = mask ^ (mask << 1); ++ ++ const char *comma = ""; ++ ++ bool outside = true; ++ unsigned start; ++ u_foreach_bit(i, edges) { ++ if (outside) ++ start = i; ++ else if (i == start + 1) ++ pandecode_log_cont("%sw%02x", comma, ++ base + start); ++ else if (i == start + 2) ++ pandecode_log_cont("%sx%02x", comma, ++ base + start); ++ else ++ pandecode_log_cont("%sw%02x-w%02x", comma, ++ base + start, ++ base + i - 1); ++ outside = !outside; 
++ ++ if (outside) ++ comma = ", "; ++ } ++ ++ pandecode_log_cont("}"); ++} ++ ++static void ++pandecode_cs_buffer(uint64_t *commands, unsigned size, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id, mali_ptr va); ++ ++// Hack hack hackity hack: gpu_id == 1 means "don't decode" (only disassemble) ++static void ++pandecode_cs_command(uint64_t command, mali_ptr va, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id) ++{ ++ uint8_t op = command >> 56; ++ uint8_t addr = (command >> 48) & 0xff; ++ uint64_t value = command & 0xffffffffffffULL; ++ ++ uint32_t h = value >> 32; ++ uint32_t l = value; ++ ++ uint8_t arg1 = h & 0xff; ++ uint8_t arg2 = h >> 8; ++ ++ if (command) ++ pandecode_log("%"PRIx64" %016"PRIx64" ", va, command); ++ ++ switch (op) { ++ case 0: ++ if (addr || value) ++ pandecode_log("nop %02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 1: ++ buffer_unk[addr] = buffer[addr] = l; ++ buffer_unk[addr + 1] = buffer[addr + 1] = h; ++ pandecode_log("mov x%02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 2: ++ buffer_unk[addr] = buffer[addr] = l; ++ pandecode_log("mov w%02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 3: ++ if (l & 0xff00ffff || h || addr) { ++ pandecode_log("wait (unk %02x), (unk %04x), " ++ "%i, (unk %04x)\n", addr, h, l >> 16, l); ++ } else { ++ pandecode_log("wait "); ++ pandecode_scoreboard_mask(l >> 16); ++ pandecode_log_cont("\n"); ++ } ++ break; ++ case 4: { ++ uint32_t masked = l & 0xffff0000; ++ unsigned task_increment = l & 0x3fff; ++ unsigned task_axis = (l >> 14) & 3; ++ if (h != 0xff00 || addr || masked) ++ pandecode_log("compute (unk %02x), (unk %04x), " ++ "(unk %x), inc %i, axis %i\n\n", addr, h, masked, task_increment, task_axis); ++ else ++ pandecode_log("compute inc %i, axis %i\n\n", task_increment, task_axis); ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_compute_job(0, buffer, buffer_unk, gpu_id); ++ ++ /* The gallium driver emits this even for compute jobs, clear ++ * it from unknown state */ ++ pan_unpack_cs(buffer, buffer_unk, SCISSOR, unused_scissor); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ case 6: { ++ /* The meaning of the first argument (in h) is unknown, but it ++ * appears that the second bit must be set. 
*/ ++ uint32_t masked = l & 0xfffff8f0; ++ uint8_t mode = l & 0xf; ++ uint8_t index = (l >> 8) & 7; ++ if (addr || masked) ++ pandecode_log("idvs (unk %02x), 0x%04x, (unk %x), " ++ "mode %i index %i\n\n", ++ addr, h, masked, mode, index); ++ else ++ pandecode_log("idvs 0x%04x, mode %i index %i\n\n", ++ h, mode, index); ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_malloc_vertex_job(0, buffer, buffer_unk, gpu_id); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ case 7: { ++ uint64_t masked = value & ~0x000100000071; ++ bool tem = value & 1; ++ bool unk = (value >> 32) & 1; ++ ++ const char *order = (const char *[]){ ++ "z_order", ++ "horizontal", ++ "vertical", ++ "invalid_3", ++ "invalid_4", ++ "reverse_horizontal", ++ "reverse_vertical", ++ "invalid_7", ++ }[(value >> 4) & 7]; ++ ++ if (addr || masked) { ++ pandecode_log("fragment (unk %02x), (unk %"PRIx64")\n\n", ++ addr, value); ++ } else if (value) { ++ pandecode_log("fragment tem %i, render %s, unk %i\n\n", ++ tem, order, unk); ++ } else { ++ pandecode_log("fragment\n\n"); ++ } ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_fragment_job(0, buffer, buffer_unk, gpu_id); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ ++ case 9: { ++ if (addr || l || h > 1) ++ pandecode_log("flush_tiler (unk %02x), (unk %"PRIx64")\n", ++ addr, value); ++ else if (h) ++ pandecode_log("flush_tiler unk\n"); ++ else ++ pandecode_log("flush_tiler\n"); ++ break; ++ } ++ ++ case 16: case 17: { ++ char wid = (op == 16) ? 'w' : 'x'; ++ ++ if (op == 16) { ++ buffer_unk[addr] = buffer[addr] = buffer[arg2] + l; ++ } else { ++ uint64_t r = buffer[arg2] + ((uint64_t)buffer[arg2 + 1] << 32) + l; ++ buffer_unk[addr] = buffer[addr] = r; ++ buffer_unk[addr + 1] = buffer[addr + 1] = r >> 32; ++ } ++ ++ if (arg1) ++ pandecode_log("add %c%02x, (unk %x), %c%02x, #0x%x\n", ++ wid, addr, arg1, wid, arg2, l); ++ else if ((int32_t) l < 0) ++ pandecode_log("add %c%02x, %c%02x, %i\n", ++ wid, addr, wid, arg2, (int32_t) l); ++ else if (l) ++ pandecode_log("add %c%02x, %c%02x, #0x%x\n", ++ wid, addr, wid, arg2, l); ++ else ++ pandecode_log("mov %c%02x, %c%02x\n", ++ wid, addr, wid, arg2); ++ ++ break; ++ } ++ ++ case 20: case 21: { ++ const char *name = (op == 20) ? "ldr" : "str"; ++ ++ /* The immediate offset must be 4-aligned (though if the ++ * address itself is unaligned, the bits will silently be ++ * masked off). ++ * ++ * Up to 16 32-bit registers can be read or written in a ++ * single instruction, behaviour is similar to LDM or STM ++ * except that a base register is specified. ++ * ++ * These instructions are high latency. Use WAIT 0 to wait for ++ * the result of an LDR, or for a STR to finish. ++ * ++ * For LDR, it is an error for the address register to be ++ * included in the destination register set. ++ */ ++ ++ if (arg1) { ++ pandecode_log("%s (unk %02x), x%02x, (mask %x), [x%02x, %i]\n", ++ name, arg1, addr, l >> 16, arg2, (int16_t) l); ++ } else { ++ pandecode_log("%s ", name); ++ pandecode_regmask(addr, l >> 16); ++ pandecode_log_cont(", [x%02x, %i]\n", arg2, (int16_t) l); ++ } ++ break; ++ } ++ ++ case 22: { ++ /* The signed 32-bit source register is compared against zero ++ * for these comparisons. For example, .GT means that the ++ * branch is taken if the signed register value is greater ++ * than zero. 
*/ ++ const char *comparisons[] = { ++ ".le", ".gt", ++ ".eq", ".ne", ++ ".lt", ".ge", ++ "" /* always */, ".(invalid: never)", ++ }; ++ ++ const char *m = comparisons[(l >> 28) & 7]; ++ ++ int16_t offset = l; ++ ++ bool forward = (offset >= 0); ++ if (!forward) ++ offset = -1 - offset; ++ ++ if (addr || arg1 || l & 0x8fff0000) { ++ pandecode_log("b%s (unk %02x), w%02x, (unk %02x), " ++ "(unk 0x%x), %s %i\n", ++ m, addr, arg2, arg1, l & 0x8fff0000, ++ forward ? "skip" : "back", ++ offset); ++ } else { ++ pandecode_log("b%s w%02x, %s %i\n", ++ m, arg2, ++ forward ? "skip" : "back", ++ offset); ++ } ++ ++ break; ++ } ++ ++ case 23: { ++ if (value >> 3 || addr) ++ pandecode_log("slot (unk %02x), (unk %"PRIx64"), " ++ "%i\n", addr, value >> 3, l & 7); ++ else ++ pandecode_log("slot %i\n", l); ++ break; ++ } ++ ++ case 32: case 33: { ++ /* A tail call is similar to a normal call, but reuses the ++ * current stack entry so that execution returns directly to ++ * the parent, rather than pushing a new entry and returning ++ * to the instruction after the call. Using tail calls avoids ++ * the possibility of stack overflow. ++ */ ++ const char *name = (op == 32) ? "call" : "tailcall"; ++ ++ unsigned length = buffer[arg1]; ++ uint64_t target = (((uint64_t)buffer[arg2 + 1]) << 32) | buffer[arg2]; ++ ++ assert(!(length & 7)); ++ unsigned instrs = length / 8; ++ ++ if (addr || l) ++ pandecode_log("%s (unk %02x), w%02x (%i instructions), x%02x (0x%"PRIx64"), (unk %x)\n", ++ name, addr, arg1, instrs, arg2, target, l); ++ else ++ pandecode_log("%s w%02x (%i instructions), x%02x (0x%"PRIx64")\n", ++ name, arg1, instrs, arg2, target); ++ ++ if (!target || !length) ++ break; ++ ++ uint64_t *t = pandecode_fetch_gpu_mem(target, length); ++ pandecode_indent++; ++ pandecode_cs_buffer(t, length, buffer, buffer_unk, gpu_id, ++ target); ++ pandecode_indent--; ++ break; ++ } ++ ++ case 34: { ++ /* idvs implies tiler */ ++ if (l & ~0xf) ++ pandecode_log("resources 0x%x\n", l); ++ else ++ pandecode_log("resources%s%s%s%s\n", ++ (l & 1) ? " compute" : "", ++ (l & 2) ? " fragment" : "", ++ (l & 4) ? " tiler" : "", ++ (l & 8) ? " idvs" : ""); ++ break; ++ } ++ ++ case 37: case 38: case 51: case 52: { ++ /* ++ * 0b 00100101 / 00100110 -- opcode ++ * ????0??? -- unk. usually 1, faults if "0" bit set ++ * aaaaaaaa -- address register ++ * vvvvvvvv -- 32-bit value register ++ * 00000000 -- seems to act as NOP if nonzero ++ * mmmmmmmm -- some sort of mask, unknown purpose ++ * ???????? -- seems to have no effect ++ * ?????s0u -- 's' disables signal to CPU, ++ * 'u' has unknown purpose (disable GPU signal?) ++ * ++ * The difference between the two opcodes is unknown. ++ * ++ * That the 'mmmmmmmm' byte is somehow a scoreboard mask is ++ * a possibility. ++ */ ++ ++ const char *name = (op & 1) ? "evadd" : "evstr"; ++ const char *type = (op > 50) ? "x" : "w"; ++ ++ if (addr != 1 || l & 0xff00fffa) { ++ pandecode_log("%s (unk %02x), %s%02x, [x%02x], " ++ "unk 0x%x, flags 0x%x\n", ++ name, addr, type, arg1, arg2, ++ l >> 16, (uint16_t) l); ++ } else { ++ pandecode_log("%s %s%02x, [x%02x], unk 0x%x%s%s\n", ++ name, type, arg1, arg2, l >> 16, ++ l & 0x4 ? "" : ", irq", ++ l & 0x1 ? ", unk0" : ""); ++ } ++ ++ break; ++ } ++ ++ case 39: case 53: { ++ const char *m = (const char *[]){ ++ ".ls", ++ ".hi", ++ }[(l >> 28) & 1]; ++ const char *e = (const char *[]){ ++ ".inherit", ++ ".no_error", ++ }[l & 1]; ++ const char *type = (op > 50) ? 
"x" : "w"; ++ ++ /* Wait until the value in the destination register is changed ++ * to pass the comparison. For example, with .LS the value ++ * in memory must be less than or same as the reference to ++ * continue execution. */ ++ if (addr || l & ~((1 << 28) | (1 << 0))) ++ pandecode_log("evwait%s%s (unk %02x), %s%02x, " ++ "[x%02x, unk %x]\n", ++ m, e, addr, type, arg1, arg2, l); ++ else ++ pandecode_log("evwait%s%s %s%02x, [x%02x]\n", ++ m, e, type, arg1, arg2); ++ break; ++ } ++ ++ case 40: { ++ if (addr || l >> 16 || arg1 > 1) { ++ pandecode_log("str type %02x, (unk %02x), " ++ "(unk %x), [x%02x, %i]\n", ++ addr, arg1, ++ l >> 16, arg2, (int16_t) l); ++ } else { ++ const char *type = (const char *[]) { ++ "timestamp", ++ "cycles", ++ }[arg1]; ++ ++ pandecode_log("str %s, [x%02x, %i]\n", ++ type, arg2, (int16_t) l); ++ } ++ break; ++ } ++ ++ case 48: { ++ if (addr || arg1 || l) ++ pandecode_log("heapctx (unk %02x), " ++ "x%02x, (unk %02x), (unk %x)\n", ++ addr, arg2, arg1, l); ++ else ++ pandecode_log("heapctx x%02x\n", arg2); ++ break; ++ } ++ ++ case 49: { ++ const char *m = (const char *[]){ ++ "vt_start", ++ "vt_end", ++ "unk", ++ "frag_end", ++ }[arg1 & 3]; ++ ++ if (addr || arg2 || arg1 > 3 || l) ++ pandecode_log("heapinc (unk %02x), " ++ "(unk %02x), %02x, (unk %x)\n", ++ addr, arg2, arg1, l); ++ else ++ pandecode_log("heapinc %s\n", m); ++ break; ++ } ++ ++ default: ++ /* ++ * UNK 00 30, #0x480000000000 -- takes an eight-byte aligned ++ * memory address. ++ */ ++ ++ pandecode_log("UNK %02x %02x, #0x%"PRIx64"\n", addr, op, value); ++ break; ++ } ++} ++ ++// TODO: reorder args ++static void ++pandecode_cs_buffer(uint64_t *commands, unsigned size, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id, mali_ptr va) ++{ ++ uint64_t *end = (uint64_t *)((uint8_t *) commands + size); ++ ++ for (uint64_t c = *commands; commands < end; c = *(++commands)) { ++ pandecode_cs_command(c, va, buffer, buffer_unk, gpu_id); ++ va += 8; ++ } ++} ++ ++// TODO: Does it make sense to pass in the length? 
++void ++GENX(pandecode_cs)(mali_ptr cs_gpu_va, unsigned size, unsigned gpu_id) ++{ ++ pandecode_dump_file_open(); ++ ++ // TODO: Pass down the buffer during recursion ++ uint32_t buffer[256] = {0}; ++ uint32_t buffer_unk[256] = {0}; ++ ++ uint64_t *commands = pandecode_fetch_gpu_mem(cs_gpu_va, 1); ++ ++ pandecode_log("\n"); ++ ++ pandecode_cs_buffer(commands, size, buffer, buffer_unk, gpu_id, ++ cs_gpu_va); ++ ++ fflush(pandecode_dump_stream); ++ pandecode_map_read_write(); ++} ++#endif +diff --git a/src/panfrost/lib/genxml/decode.h b/src/panfrost/lib/genxml/decode.h +index 6fa6014eb0e..4f175adfb2e 100644 +--- a/src/panfrost/lib/genxml/decode.h ++++ b/src/panfrost/lib/genxml/decode.h +@@ -50,8 +50,6 @@ struct pandecode_mapped_memory *pandecode_find_mapped_gpu_mem_containing(uint64_ + + void pandecode_map_read_write(void); + +-void pandecode_dump_mappings(void); +- + static inline void * + __pandecode_fetch_gpu_mem(uint64_t gpu_va, size_t size, + int line, const char *filename) +@@ -98,6 +96,8 @@ void pandecode_abort_on_fault_v6(mali_ptr jc_gpu_va); + void pandecode_abort_on_fault_v7(mali_ptr jc_gpu_va); + void pandecode_abort_on_fault_v9(mali_ptr jc_gpu_va); + ++void pandecode_cs_v10(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ + static inline void + pan_hexdump(FILE *fp, const uint8_t *hex, size_t cnt, bool with_strings) + { +@@ -130,7 +130,7 @@ pan_hexdump(FILE *fp, const uint8_t *hex, size_t cnt, bool with_strings) + fprintf(fp, " | "); + for (unsigned j = i & ~0xF; j <= i; ++j) { + uint8_t c = hex[j]; +- fputc((c < 32 || c > 128) ? '.' : c, fp); ++ fputc((c < 32 || c > 126) ? '.' : c, fp); + } + } + +diff --git a/src/panfrost/lib/genxml/decode_common.c b/src/panfrost/lib/genxml/decode_common.c +index ecc02387175..41c63b290c7 100644 +--- a/src/panfrost/lib/genxml/decode_common.c ++++ b/src/panfrost/lib/genxml/decode_common.c +@@ -202,7 +202,7 @@ pointer_as_memory_reference(uint64_t ptr) + + static int pandecode_dump_frame_count = 0; + +-static bool force_stderr = false; ++bool force_stderr = false; + + void + pandecode_dump_file_open(void) +@@ -230,7 +230,7 @@ pandecode_dump_file_open(void) + } + } + +-static void ++void + pandecode_dump_file_close(void) + { + simple_mtx_assert_locked(&pandecode_lock); +@@ -289,8 +289,9 @@ pandecode_dump_mappings(void) + if (!it->addr || !it->length) + continue; + +- fprintf(pandecode_dump_stream, "Buffer: %s gpu %" PRIx64 "\n\n", +- it->name, it->gpu_va); ++ fprintf(pandecode_dump_stream, "Buffer: %s gpu %" PRIx64 ++ " length %zu\n\n", ++ it->name, it->gpu_va, it->length); + + pan_hexdump(pandecode_dump_stream, it->addr, it->length, false); + fprintf(pandecode_dump_stream, "\n"); +@@ -333,3 +334,20 @@ pandecode_jc(mali_ptr jc_gpu_va, unsigned gpu_id) + + simple_mtx_unlock(&pandecode_lock); + } ++ ++void pandecode_cs_v10(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ ++void ++pandecode_cs(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id) ++{ ++ simple_mtx_lock(&pandecode_lock); ++ ++ switch (pan_arch(gpu_id)) { ++ // Hack hack hackity hack: gpu_id == 1 means "don't decode" (only ++ // disassemble) ++ case 0: case 10: pandecode_cs_v10(cs_gpu_va, cs_size, gpu_id); break; ++ default: unreachable("Unsupported architecture"); ++ } ++ ++ simple_mtx_unlock(&pandecode_lock); ++} +diff --git a/src/panfrost/lib/genxml/gen_macros.h b/src/panfrost/lib/genxml/gen_macros.h +index 1ef4b53a508..24072634fdc 100644 +--- a/src/panfrost/lib/genxml/gen_macros.h ++++ b/src/panfrost/lib/genxml/gen_macros.h +@@ -93,6 +93,9 @@ pan_arch(unsigned 
gpu_id) + #elif (PAN_ARCH == 9) + # define GENX(X) X##_v9 + # include "genxml/v9_pack.h" ++#elif (PAN_ARCH == 10) ++# define GENX(X) X##_v10 ++# include "genxml/v10_pack.h" + #else + # error "Need to add suffixing macro for this architecture" + #endif +diff --git a/src/panfrost/lib/genxml/gen_pack.py b/src/panfrost/lib/genxml/gen_pack.py +index 434a228c514..bd6343f5908 100644 +--- a/src/panfrost/lib/genxml/gen_pack.py ++++ b/src/panfrost/lib/genxml/gen_pack.py +@@ -46,6 +46,18 @@ + + #include "util/bitpack_helpers.h" + ++/* Most functions assume the caller has done bounds checking */ ++typedef struct pan_command_stream { ++ uint64_t *ptr; ++ uint64_t *begin; ++ uint64_t *end; ++ uint64_t gpu; ++} pan_command_stream; ++ ++struct pan_command_stream_decoded { ++ uint32_t values[256]; ++}; ++ + #define __gen_unpack_float(x, y, z) uif(__gen_unpack_uint(x, y, z)) + + static inline uint32_t +@@ -98,6 +110,20 @@ + return (2*odd + 1) << shift; + } + ++static inline void ++__gen_clear_value(uint8_t *restrict cl, uint32_t start, uint32_t end) ++{ ++ for (uint32_t byte = start / 8; byte <= end / 8; byte++) { ++ uint8_t m = 0; ++ if (byte == start / 8) ++ m |= 0xff >> (8 - start % 8); ++ if (byte == end / 8) ++ m |= 0xff << (1 + end % 8); ++ ++ cl[byte] &= m; ++ } ++} ++ + #define PREFIX1(A) MALI_ ## A + #define PREFIX2(A, B) MALI_ ## A ## _ ## B + #define PREFIX4(A, B, C, D) MALI_ ## A ## _ ## B ## _ ## C ## _ ## D +@@ -183,6 +209,96 @@ + + """ + ++no_cs = "".join([f""" ++#define MALI_{y} MALI_{x} ++#define MALI_{y}_header MALI_{x}_header ++#define MALI_{y}_pack MALI_{x}_pack ++#define MALI_{y}_LENGTH MALI_{x}_LENGTH ++#define MALI_{y}_ALIGN MALI_{x}_ALIGN ++#define mali_{y.lower()}_packed mali_{x.lower()}_packed ++#define MALI_{y}_unpack MALI_{x}_unpack ++#define MALI_{y}_print MALI_{x}_print ++""" for x, y in (("DRAW", "DRAW_NO_CS"), )]) + """ ++ ++#define pan_pack_cs_v10(dst, _, T, name) pan_pack(dst, T, name) ++ ++#define pan_section_pack_cs_v10(dst, _, A, S, name) pan_section_pack(dst, A, S, name) ++ ++#define pan_unpack_cs_v10(dst, _, __, T, name) pan_unpack(dst, T, name) ++ ++#define pan_section_unpack_cs_v10(src, _, __, A, S, name) pan_section_unpack(src, A, S, name) ++""" ++ ++with_cs = """ ++#define pan_pack_cs(dst, T, name) \\ ++ for (struct PREFIX1(T) name = { PREFIX2(T, header) }, \\ ++ *_loop_terminate = (void *) (dst); \\ ++ __builtin_expect(_loop_terminate != NULL, 1); \\ ++ ({ PREFIX2(T, pack_cs)(dst, &name); \\ ++ _loop_terminate = NULL; })) ++ ++#define pan_section_pack_cs(dst, A, S, name) \\ ++ for (PREFIX4(A, SECTION, S, TYPE) name = { PREFIX4(A, SECTION, S, header) }, \\ ++ *_loop_terminate = (void *) (dst); \\ ++ __builtin_expect(_loop_terminate != NULL, 1); \\ ++ ({ PREFIX4(A, SECTION, S, pack_cs) (dst, &name); \\ ++ _loop_terminate = NULL; })) ++ ++#define pan_section_pack_cs_v10(_, dst, A, S, name) pan_section_pack_cs(dst, A, S, name) ++ ++// TODO: assert that the first argument is NULL ++#define pan_pack_cs_v10(_, dst, T, name) pan_pack_cs(dst, T, name) ++ ++#define pan_pack_ins(dst, T, name) \\ ++ for (struct PREFIX1(T) name = { PREFIX2(T, header) }, \\ ++ *_loop_terminate = (void *) (dst); \\ ++ __builtin_expect(_loop_terminate != NULL, 1); \\ ++ ({ PREFIX2(T, pack_ins)(dst, &name); \\ ++ _loop_terminate = NULL; })) ++ ++#define pan_unpack_cs(buf, buf_unk, T, name) \\ ++ struct PREFIX1(T) name; \\ ++ PREFIX2(T, unpack)(buf, buf_unk, &name) ++ ++#define pan_unpack_cs_v10(_, buf, buf_unk, T, name) pan_unpack_cs(buf, buf_unk, T, name) ++ ++#define pan_section_unpack_cs_v10(_, 
buf, buf_unk, A, S, name) \\ ++ PREFIX4(A, SECTION, S, TYPE) name; \\ ++ PREFIX4(A, SECTION, S, unpack)(buf, buf_unk, &name) ++ ++static inline void ++pan_emit_cs_ins(pan_command_stream *s, uint8_t op, uint64_t instr) ++{ ++ assert(instr < (1ULL << 56)); ++ instr |= ((uint64_t)op << 56); ++ *((s->ptr)++) = instr; ++} ++ ++static inline void ++pan_emit_cs_32(pan_command_stream *s, uint8_t reg, uint32_t value) ++{ ++ pan_emit_cs_ins(s, 2, ((uint64_t) reg << 48) | value); ++} ++ ++static inline void ++pan_emit_cs_48(pan_command_stream *s, uint8_t reg, uint64_t value) ++{ ++ assert(value < (1ULL << 48)); ++ pan_emit_cs_ins(s, 1, ((uint64_t) reg << 48) | value); ++} ++ ++static inline void ++pan_emit_cs_64(pan_command_stream *s, uint8_t reg, uint64_t value) ++{ ++ if (value < (1ULL << 48)) { ++ pan_emit_cs_48(s, reg, value); ++ } else { ++ pan_emit_cs_32(s, reg, value); ++ pan_emit_cs_32(s, reg + 1, value >> 32); ++ } ++} ++""" ++ + def to_alphanum(name): + substitutions = { + ' ': '_', +@@ -297,7 +413,7 @@ def __init__(self, parser, attrs): + + if ":" in str(attrs["start"]): + (word, bit) = attrs["start"].split(":") +- self.start = (int(word) * 32) + int(bit) ++ self.start = (int(word, 0) * 32) + int(bit) + else: + self.start = int(attrs["start"]) + +@@ -331,7 +447,8 @@ def emit_template_struct(self, dim): + type = 'uint64_t' + elif self.type == 'int': + type = 'int32_t' +- elif self.type in ['uint', 'hex', 'uint/float', 'padded', 'Pixel Format']: ++ # TODO: Convert to tuple ++ elif self.type in ['uint', 'hex', 'register', 'uint/float', 'padded', 'Pixel Format']: + type = 'uint32_t' + elif self.type in self.parser.structs: + type = 'struct ' + self.parser.gen_prefix(safe_name(self.type.upper())) +@@ -385,8 +502,8 @@ def emit_template_struct(self, dim): + field.emit_template_struct(dim) + + class Word: +- def __init__(self): +- self.size = 32 ++ def __init__(self, size=32): ++ self.size = size + self.contributors = [] + + class FieldRef: +@@ -410,7 +527,7 @@ def collect_fields(self, fields, offset, path, all_fields): + end = offset + field.end + all_fields.append(self.FieldRef(field, field_path, start, end)) + +- def collect_words(self, fields, offset, path, words): ++ def collect_words(self, fields, offset, path, words, ins=False): + for field in fields: + field_path = '{}{}'.format(path, field.name) + start = offset + field.start +@@ -424,16 +541,27 @@ def collect_words(self, fields, offset, path, words): + contributor = self.FieldRef(field, field_path, start, end) + first_word = contributor.start // 32 + last_word = contributor.end // 32 ++ if ins: ++ assert(last_word < 2) ++ first_word = last_word = 0 ++ + for b in range(first_word, last_word + 1): + if not b in words: +- words[b] = self.Word() ++ words[b] = self.Word(size=64 if ins else 32) ++ + words[b].contributors.append(contributor) + +- def emit_pack_function(self): +- self.get_length() ++ return ++ ++ def emit_pack_function(self, csf=False, ins=False): ++ if csf: ++ self.length = 256 * 4 ++ else: ++ self.get_length() ++ assert(not ins) + + words = {} +- self.collect_words(self.fields, 0, '', words) ++ self.collect_words(self.fields, 0, '', words, ins=ins) + + # Validate the modifier is lossless + for field in self.fields: +@@ -449,25 +577,52 @@ def emit_pack_function(self): + elif field.modifier[0] == "log2": + print(" assert(util_is_power_of_two_nonzero(values->{}));".format(field.name)) + +- for index in range(self.length // 4): ++ if ins: ++ index_list = (0, ) ++ elif csf: ++ index_list = sorted(words) ++ else: ++ index_list = 
range(self.length // 4) ++ ++ for index in index_list: + # Handle MBZ words + if not index in words: +- print(" cl[%2d] = 0;" % index) ++ if ins: ++ print(" pan_emit_cs_ins(s, 0x%02x, 0);" % self.op) ++ elif not csf: ++ print(" cl[%2d] = 0;" % index) + continue + + word = words[index] + + word_start = index * 32 + ++ size = 32 ++ # Can we move all fields from the next index here? ++ if csf and index % 2 == 0 and index + 1 in words: ++ word_next = words[index + 1] ++ end = max(c.end for c in word_next.contributors) ++ if end - word_start < 48: ++ size = 48 ++ word.contributors += [x for x in word_next.contributors if not x in word.contributors] ++ del words[index + 1] ++ + v = None +- prefix = " cl[%2d] =" % index ++ if ins: ++ prefix = " pan_emit_cs_ins(s, 0x%02x," % self.op ++ elif size == 48: ++ prefix = " pan_emit_cs_48(s, 0x%02x," % index ++ elif csf: ++ prefix = " pan_emit_cs_32(s, 0x%02x," % index ++ else: ++ prefix = " cl[%2d] = (" % index + + for contributor in word.contributors: + field = contributor.field + name = field.name + start = contributor.start + end = contributor.end +- contrib_word_start = (start // 32) * 32 ++ contrib_word_start = (start // word.size) * word.size + start -= contrib_word_start + end -= contrib_word_start + +@@ -482,7 +637,7 @@ def emit_pack_function(self): + elif field.modifier[0] == "log2": + value = "util_logbase2({})".format(value) + +- if field.type in ["uint", "hex", "uint/float", "address", "Pixel Format"]: ++ if field.type in ["uint", "hex", "uint/float", "address", "register", "Pixel Format"]: + s = "util_bitpack_uint(%s, %d, %d)" % \ + (value, start, end) + elif field.type == "padded": +@@ -505,11 +660,13 @@ def emit_pack_function(self): + + if not s == None: + shift = word_start - contrib_word_start +- if shift: ++ if shift > 0: + s = "%s >> %d" % (s, shift) ++ elif shift < 0: ++ s = "%s << %d" % (s, -shift) + + if contributor == word.contributors[-1]: +- print("%s %s;" % (prefix, s)) ++ print("%s %s);" % (prefix, s)) + else: + print("%s %s |" % (prefix, s)) + prefix = " " +@@ -528,22 +685,23 @@ def mask_for_word(self, index, start, end): + count = (end - start + 1) + return (((1 << count) - 1) << start) + +- def emit_unpack_function(self): ++ def emit_unpack_function(self, csf=False): + # First, verify there is no garbage in unused bits + words = {} + self.collect_words(self.fields, 0, '', words) + +- for index in range(self.length // 4): +- base = index * 32 +- word = words.get(index, self.Word()) +- masks = [self.mask_for_word(index, c.start, c.end) for c in word.contributors] +- mask = reduce(lambda x,y: x | y, masks, 0) ++ if not csf: ++ for index in range(self.length // 4): ++ base = index * 32 ++ word = words.get(index, self.Word()) ++ masks = [self.mask_for_word(index, c.start, c.end) for c in word.contributors] ++ mask = reduce(lambda x,y: x | y, masks, 0) + +- ALL_ONES = 0xffffffff ++ ALL_ONES = 0xffffffff + +- if mask != ALL_ONES: +- TMPL = ' if (((const uint32_t *) cl)[{}] & {}) fprintf(stderr, "XXX: Invalid field of {} unpacked at word {}\\n");' +- print(TMPL.format(index, hex(mask ^ ALL_ONES), self.label, index)) ++ if mask != ALL_ONES: ++ TMPL = ' if (((const uint32_t *) cl)[{}] & {}) fprintf(stderr, "XXX: Invalid field of {} unpacked at word {}\\n");' ++ print(TMPL.format(index, hex(mask ^ ALL_ONES), self.label, index)) + + fieldrefs = [] + self.collect_fields(self.fields, 0, '', fieldrefs) +@@ -556,7 +714,7 @@ def emit_unpack_function(self): + args.append(str(fieldref.start)) + args.append(str(fieldref.end)) + +- if field.type 
in set(["uint", "hex", "uint/float", "address", "Pixel Format"]): ++ if field.type in set(["uint", "hex", "uint/float", "address", "register", "Pixel Format"]): + convert = "__gen_unpack_uint" + elif field.type in self.parser.enums: + convert = "(enum %s)__gen_unpack_uint" % enum_name(field.type) +@@ -588,6 +746,9 @@ def emit_unpack_function(self): + mask = hex(field.modifier[1] - 1) + print(' assert(!(values->{} & {}));'.format(fieldref.path, mask)) + ++ if csf: ++ print(' __gen_clear_value({});'.format(', '.join(['cl_unk'] + args[1:]))) ++ + def emit_print_function(self): + for field in self.fields: + convert = None +@@ -610,7 +771,7 @@ def emit_print_function(self): + print(' fprintf(fp, "%*s{}: %f\\n", indent, "", {});'.format(name, val)) + elif field.type in ["uint", "hex"] and (field.end - field.start) >= 32: + print(' fprintf(fp, "%*s{}: 0x%" PRIx64 "\\n", indent, "", {});'.format(name, val)) +- elif field.type == "hex": ++ elif field.type in ("hex", "register"): + print(' fprintf(fp, "%*s{}: 0x%x\\n", indent, "", {});'.format(name, val)) + elif field.type == "uint/float": + print(' fprintf(fp, "%*s{}: 0x%X (%f)\\n", indent, "", {}, uif({}));'.format(name, val, val)) +@@ -649,9 +810,13 @@ def start_element(self, name, attrs): + print(v6_format_printer) + else: + print(v7_format_printer) ++ if arch < 10: ++ print(no_cs) ++ else: ++ print(with_cs) + elif name == "struct": + name = attrs["name"] +- self.no_direct_packing = attrs.get("no-direct-packing", False) ++ self.layout = attrs.get("layout", "struct") + object_name = self.gen_prefix(safe_name(name.upper())) + self.struct = object_name + +@@ -659,10 +824,16 @@ def start_element(self, name, attrs): + if "size" in attrs: + self.group.length = int(attrs["size"]) * 4 + self.group.align = int(attrs["align"]) if "align" in attrs else None ++ self.group.op = int(attrs["op"]) if "op" in attrs else None + self.structs[attrs["name"]] = self.group ++ self.unpacked_alias = self.gen_prefix(safe_name(attrs["unpacked"].upper())) if "unpacked" in attrs else None + elif name == "field": +- self.group.fields.append(Field(self, attrs)) + self.values = [] ++ self.skip_field = self.layout == "cs" and not attrs["start"].startswith("0x") ++ if self.skip_field: ++ #print(f"#warning Skipping non-CS field {attrs['name']}") ++ return ++ self.group.fields.append(Field(self, attrs)) + elif name == "enum": + self.values = [] + self.enum = safe_name(attrs["name"]) +@@ -675,6 +846,8 @@ def start_element(self, name, attrs): + self.values.append(Value(attrs)) + elif name == "aggregate": + aggregate_name = self.gen_prefix(safe_name(attrs["name"].upper())) ++ # TODO: Make .layout less "global"? 
++ self.layout = attrs.get("layout", "struct") + self.aggregate = Aggregate(self, aggregate_name, attrs) + self.aggregates[attrs['name']] = self.aggregate + elif name == "section": +@@ -687,7 +860,8 @@ def end_element(self, name): + self.struct = None + self.group = None + elif name == "field": +- self.group.fields[-1].values = self.values ++ if not self.skip_field: ++ self.group.fields[-1].values = self.values + elif name == "enum": + self.emit_enum() + self.enum = None +@@ -717,22 +891,33 @@ def emit_header(self, name): + print('') + + def emit_template_struct(self, name, group): +- print("struct %s {" % name) +- group.emit_template_struct("") +- print("};\n") ++ if self.unpacked_alias: ++ # TODO: Check the fields match ++ print("#define %s %s" % (name, self.unpacked_alias)) ++ else: ++ print("struct %s {" % name) ++ group.emit_template_struct("") ++ print("};\n") + + def emit_aggregate(self): + aggregate = self.aggregate +- print("struct %s_packed {" % aggregate.name.lower()) +- print(" uint32_t opaque[{}];".format(aggregate.get_size() // 4)) +- print("};\n") +- print('#define {}_LENGTH {}'.format(aggregate.name.upper(), aggregate.size)) ++ ++ if self.layout == "struct": ++ print("struct %s_packed {" % aggregate.name.lower()) ++ print(" uint32_t opaque[{}];".format(aggregate.get_size() // 4)) ++ print("};\n") ++ print('#define {}_LENGTH {}'.format(aggregate.name.upper(), aggregate.size)) ++ else: ++ assert(self.layout == "cs") ++ + if aggregate.align != None: + print('#define {}_ALIGN {}'.format(aggregate.name.upper(), aggregate.align)) + for section in aggregate.sections: + print('#define {}_SECTION_{}_TYPE struct {}'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + print('#define {}_SECTION_{}_header {}_header'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + print('#define {}_SECTION_{}_pack {}_pack'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) ++ # TODO: Only when req'd ++ print('#define {}_SECTION_{}_pack_cs {}_pack_cs'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + print('#define {}_SECTION_{}_unpack {}_unpack'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + print('#define {}_SECTION_{}_print {}_print'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + print('#define {}_SECTION_{}_OFFSET {}'.format(aggregate.name.upper(), section.name.upper(), section.offset)) +@@ -747,12 +932,32 @@ def emit_pack_function(self, name, group): + print("}\n\n") + + # Should be a whole number of words +- assert((self.group.length % 4) == 0) ++ assert((group.length % 4) == 0) ++ ++ print('#define {} {}'.format (name + "_LENGTH", group.length)) ++ if group.align != None: ++ print('#define {} {}'.format (name + "_ALIGN", group.align)) ++ print('struct {}_packed {{ uint32_t opaque[{}]; }};'.format(name.lower(), group.length // 4)) ++ ++ def emit_cs_pack_function(self, name, group): ++ print("static inline void\n%s_pack_cs(pan_command_stream * restrict s,\n%sconst struct %s * restrict values)\n{\n" % ++ (name, ' ' * (len(name) + 6), name)) ++ ++ group.emit_pack_function(csf=True) + +- print('#define {} {}'.format (name + "_LENGTH", self.group.length)) +- if self.group.align != None: +- print('#define {} {}'.format (name + "_ALIGN", self.group.align)) +- print('struct {}_packed {{ uint32_t opaque[{}]; }};'.format(name.lower(), self.group.length // 4)) ++ print("}\n\n") ++ ++ assert(group.length == 256 * 4) ++ ++ def emit_ins_pack_function(self, name, 
group): ++ print("static inline void\n%s_pack_ins(pan_command_stream * restrict s,\n%sconst struct %s * restrict values)\n{" % ++ (name, ' ' * (len(name) + 6), name)) ++ ++ group.emit_pack_function(csf=True, ins=True) ++ ++ print("}\n\n") ++ ++ assert(group.length == 256 * 4) + + def emit_unpack_function(self, name, group): + print("static inline void") +@@ -763,6 +968,18 @@ def emit_unpack_function(self, name, group): + + print("}\n") + ++ def emit_cs_unpack_function(self, name, group): ++ print("static inline void") ++ print("%s_unpack(const uint32_t * restrict buffer, uint32_t * restrict buffer_unk,\n" ++ "%sstruct %s * restrict values)\n{" ++ " const uint8_t *cl = (uint8_t *)buffer;\n" ++ " uint8_t *cl_unk = (uint8_t *)buffer_unk;\n" % ++ (name.upper(), ' ' * (len(name) + 8), name)) ++ ++ group.emit_unpack_function(csf=True) ++ ++ print("}\n") ++ + def emit_print_function(self, name, group): + print("static inline void") + print("{}_print(FILE *fp, const struct {} * values, unsigned indent)\n{{".format(name.upper(), name)) +@@ -776,14 +993,20 @@ def emit_struct(self): + + self.emit_template_struct(self.struct, self.group) + self.emit_header(name) +- if self.no_direct_packing == False: ++ if self.layout == "struct": + self.emit_pack_function(self.struct, self.group) + self.emit_unpack_function(self.struct, self.group) ++ elif self.layout == "cs": ++ self.emit_cs_pack_function(self.struct, self.group) ++ self.emit_cs_unpack_function(self.struct, self.group) ++ elif self.layout == "ins": ++ # TODO: I don't think that the current unpack emit functions would ++ # work ++ self.emit_ins_pack_function(self.struct, self.group) ++ else: ++ assert(self.layout == "none") + self.emit_print_function(self.struct, self.group) + +- def enum_prefix(self, name): +- return +- + def emit_enum(self): + e_name = enum_name(self.enum) + prefix = e_name if self.enum != 'Format' else global_prefix +diff --git a/src/panfrost/lib/genxml/meson.build b/src/panfrost/lib/genxml/meson.build +index 61041168ab0..191a970ff63 100644 +--- a/src/panfrost/lib/genxml/meson.build ++++ b/src/panfrost/lib/genxml/meson.build +@@ -20,7 +20,7 @@ + # SOFTWARE. 
+ + pan_packers = [] +-foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9'] ++foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10'] + pan_packers += custom_target( + packer + '_pack.h', + input : ['gen_pack.py', packer + '.xml'], +@@ -37,7 +37,7 @@ idep_pan_packers = declare_dependency( + + libpanfrost_decode_per_arch = [] + +-foreach ver : ['4', '5', '6', '7', '9'] ++foreach ver : ['4', '5', '6', '7', '9', '10'] + libpanfrost_decode_per_arch += static_library( + 'pandecode-arch-v' + ver, + ['decode.c', pan_packers], +diff --git a/src/panfrost/lib/genxml/v10.xml b/src/panfrost/lib/genxml/v10.xml +new file mode 100644 +index 00000000000..d1f104f4e62 +--- /dev/null ++++ b/src/panfrost/lib/genxml/v10.xml +@@ -0,0 +1,1668 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ 
++ [v10.xml: ~1,668 lines of v10 (CSF) descriptor definitions; the XML element content is not legible here]
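For a descriptor declared with layout="cs" in v10.xml, the gen_pack.py changes above emit a _pack_cs() helper (and, for instruction-style descriptors, _pack_ins()) that writes fields into a pan_command_stream instead of byte-packing a fixed-size opaque buffer; unlike the "struct" layout, no ..._packed struct or ..._LENGTH define is printed for this path. A minimal sketch of the generated shape, using a hypothetical MALI_EXAMPLE descriptor name (illustrative only; the real names come from v10.xml):

static inline void
MALI_EXAMPLE_pack_cs(pan_command_stream * restrict s,
                     const struct MALI_EXAMPLE * restrict values)
{
   /* Sketch only: the real body is emitted by group.emit_pack_function(csf=True)
    * and performs one command-stream write per field rather than packing the
    * fields into an opaque uint32_t array. */
}

Callers pair these generated helpers with the pan_pack_ins()/pan_section_pack_cs_v10() wrappers used later in this patch (see the FRAGMENT_JOB payload and FRAGMENT_LAUNCH emission in pan_cs.c).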
++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/src/panfrost/lib/genxml/v4.xml b/src/panfrost/lib/genxml/v4.xml +index b72fc3e28ef..a4ee54c2bac 100644 +--- a/src/panfrost/lib/genxml/v4.xml ++++ b/src/panfrost/lib/genxml/v4.xml +@@ -446,7 +446,7 @@ + + + +- ++ + + + +diff --git a/src/panfrost/lib/genxml/v5.xml b/src/panfrost/lib/genxml/v5.xml +index f9fc44e89f3..2feb8909609 100644 +--- a/src/panfrost/lib/genxml/v5.xml ++++ b/src/panfrost/lib/genxml/v5.xml +@@ -467,7 +467,7 @@ + + + +- ++ + + + +diff --git a/src/panfrost/lib/genxml/v6.xml b/src/panfrost/lib/genxml/v6.xml +index 042f1e694d4..321ab524eaf 100644 +--- a/src/panfrost/lib/genxml/v6.xml ++++ b/src/panfrost/lib/genxml/v6.xml +@@ -467,7 +467,7 @@ + + + +- ++ + + + +@@ -689,7 +689,7 @@ + + + +- ++ + + + +@@ -708,7 +708,7 @@ + + + +- ++ + + + +@@ -717,7 +717,7 @@ + + + +- ++ + + + +diff --git a/src/panfrost/lib/genxml/v7.xml b/src/panfrost/lib/genxml/v7.xml +index 3440ee70613..b084ef6b3bf 100644 +--- a/src/panfrost/lib/genxml/v7.xml ++++ b/src/panfrost/lib/genxml/v7.xml +@@ -512,7 +512,7 @@ + + + +- ++ + + + +@@ -754,7 +754,7 @@ + + + +- ++ + + + +@@ -773,7 +773,7 @@ + + + +- ++ + + + +@@ -782,7 +782,7 @@ + + + +- ++ + + + +@@ -846,13 +846,13 @@ + + + +- ++ + + + + + +- ++ + + + +diff --git a/src/panfrost/lib/genxml/v9.xml b/src/panfrost/lib/genxml/v9.xml +index 43d461077d6..b650bb2002a 100644 +--- a/src/panfrost/lib/genxml/v9.xml ++++ b/src/panfrost/lib/genxml/v9.xml +@@ -526,7 +526,7 @@ + + + +- ++ + + + +@@ -599,12 +599,6 @@ + + + +- +- +- +- +- +- + + + +@@ -612,10 +606,10 @@ + + + +- +- ++ ++ + +- ++ + + + +@@ -1309,28 +1303,28 @@ + + + +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + + + +- +- +- +- +- +- +- +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1340,9 +1334,9 @@ + + + +- +- +- ++ ++ ++ + + + +@@ -1363,8 +1357,8 @@ + + + +- +- ++ ++ + + + +@@ -1374,6 +1368,7 @@ + + + ++ + + + +@@ -1391,7 +1386,7 @@ + + + +- ++ + + + +@@ -1407,24 +1402,24 @@ + + + +- +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ ++ + +- ++ + + + +- ++ + + + +- +- ++ ++ + + + +diff --git a/src/panfrost/lib/meson.build b/src/panfrost/lib/meson.build +index f8c34cb5a82..31dab70e304 100644 +--- a/src/panfrost/lib/meson.build ++++ b/src/panfrost/lib/meson.build +@@ -39,7 +39,7 @@ endforeach + + libpanfrost_per_arch = [] + +-foreach ver : ['4', '5', '6', '7', '9'] ++foreach ver : ['4', '5', '6', '7', '9', '10'] + libpanfrost_per_arch += static_library( + 'pan-arch-v' + ver, + [ +@@ -93,7 +93,7 @@ libpanfrost_lib = static_library( + include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw], + c_args : [no_override_init_args], + gnu_symbol_visibility : 'hidden', +- dependencies: [dep_libdrm, idep_nir], ++ dependencies: [dep_libdrm, idep_nir, libpanfrost_base_dep], + build_by_default : false, + link_with: [libpanfrost_pixel_format, libpanfrost_per_arch], + ) +diff --git a/src/panfrost/lib/pan_afbc.c b/src/panfrost/lib/pan_afbc.c +index 7a524e53f66..31d9612b9e7 100644 +--- a/src/panfrost/lib/pan_afbc.c ++++ b/src/panfrost/lib/pan_afbc.c +@@ -125,10 +125,6 @@ panfrost_afbc_format(unsigned arch, enum pipe_format format) + */ + format = 
util_format_linear(format); + +- /* Don't allow swizzled formats on v7+ */ +- if (arch >= 7 && format != unswizzled_format(format)) +- return PIPE_FORMAT_NONE; +- + /* Otherwise swizzling doesn't affect AFBC */ + format = unswizzled_format(format); + +@@ -189,3 +185,12 @@ panfrost_afbc_can_tile(const struct panfrost_device *dev) + { + return (dev->arch >= 7); + } ++ ++/* ++ * Can this format only be used with AFBC_FORMAT_MOD_NATIVE_SWIZZLE? ++ */ ++bool ++panfrost_afbc_only_native(unsigned arch, enum pipe_format format) ++{ ++ return (arch >= 7 && format != unswizzled_format(format)); ++} +diff --git a/src/panfrost/lib/pan_blend.c b/src/panfrost/lib/pan_blend.c +index f6e6bf671b0..b8b84ca7f8d 100644 +--- a/src/panfrost/lib/pan_blend.c ++++ b/src/panfrost/lib/pan_blend.c +@@ -800,7 +800,7 @@ GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev, + }; + + /* Blend shaders should only be used for blending on Bifrost onwards */ +- assert(dev->arch <= 5 || !pan_blend_is_opaque(state->rts[rt].equation)); ++ assert(dev->arch <= 5 || state->logicop_enable || !pan_blend_is_opaque(state->rts[rt].equation)); + assert(state->rts[rt].equation.color_mask != 0); + + struct hash_entry *he = _mesa_hash_table_search(dev->blend_shaders.shaders, &key); +diff --git a/src/panfrost/lib/pan_blitter.c b/src/panfrost/lib/pan_blitter.c +index e2e2342b5e4..e6b0e2ce333 100644 +--- a/src/panfrost/lib/pan_blitter.c ++++ b/src/panfrost/lib/pan_blitter.c +@@ -1150,7 +1150,7 @@ pan_preload_emit_dcd(struct pan_pool *pool, + blend.cpu); + } + +- pan_pack(out, DRAW, cfg) { ++ pan_pack(out, DRAW_NO_CS, cfg) { + if (zs) { + /* ZS_EMIT requires late update/kill */ + cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; +@@ -1225,7 +1225,7 @@ pan_preload_fb_alloc_pre_post_dcds(struct pan_pool *desc_pool, + return; + + fb->bifrost.pre_post.dcds = +- pan_pool_alloc_desc_array(desc_pool, 3, DRAW); ++ pan_pool_alloc_desc_array(desc_pool, 3, DRAW_NO_CS); + } + + static void +@@ -1237,7 +1237,7 @@ pan_preload_emit_pre_frame_dcd(struct pan_pool *desc_pool, + pan_preload_fb_alloc_pre_post_dcds(desc_pool, fb); + assert(fb->bifrost.pre_post.dcds.cpu); + void *dcd = fb->bifrost.pre_post.dcds.cpu + +- (dcd_idx * pan_size(DRAW)); ++ (dcd_idx * pan_size(DRAW_NO_CS)); + + /* We only use crc_rt to determine whether to force writes for updating + * the CRCs, so use a conservative tile size (16x16). +diff --git a/src/panfrost/lib/pan_bo.c b/src/panfrost/lib/pan_bo.c +index b606d1b0359..708fd38354a 100644 +--- a/src/panfrost/lib/pan_bo.c ++++ b/src/panfrost/lib/pan_bo.c +@@ -39,6 +39,7 @@ + + #include "util/u_inlines.h" + #include "util/u_math.h" ++#include "util/os_file.h" + + /* This file implements a userspace BO cache. Allocating and freeing + * GPU-visible buffers is very expensive, and even the extra kernel roundtrips +@@ -71,7 +72,38 @@ panfrost_bo_alloc(struct panfrost_device *dev, size_t size, + create_bo.flags |= PANFROST_BO_NOEXEC; + } + +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); ++ void *cpu = NULL; ++ ++ bool cached = false; ++ ++ if (dev->kbase) { ++ if (flags & PAN_BO_CACHEABLE) { ++ if (!(dev->debug & PAN_DBG_UNCACHED_CPU)) { ++ create_bo.flags |= MALI_BO_CACHED_CPU; ++ /* TODO: What if kbase decides not to cache it? */ ++ cached = true; ++ } ++ if (dev->debug & PAN_DBG_UNCACHED_GPU) ++ create_bo.flags |= MALI_BO_UNCACHED_GPU; ++ } ++ ++ unsigned mali_flags = (flags & PAN_BO_EVENT) ? 
0x8200f : 0; ++ ++ struct base_ptr p = dev->mali.alloc(&dev->mali, size, create_bo.flags, mali_flags); ++ ++ if (p.gpu) { ++ cpu = p.cpu; ++ create_bo.offset = p.gpu; ++ create_bo.handle = kbase_alloc_gem_handle(&dev->mali, p.gpu, -1); ++ if (!cpu) ++ abort(); ++ ret = 0; ++ } else { ++ ret = -1; ++ } ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); ++ } + if (ret) { + fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n"); + return NULL; +@@ -82,29 +114,99 @@ panfrost_bo_alloc(struct panfrost_device *dev, size_t size, + + bo->size = create_bo.size; + bo->ptr.gpu = create_bo.offset; ++ bo->ptr.cpu = cpu; ++ if ((uintptr_t) bo->ptr.cpu != bo->ptr.gpu) ++ bo->free_ioctl = true; + bo->gem_handle = create_bo.handle; + bo->flags = flags; + bo->dev = dev; + bo->label = label; ++ bo->cached = cached; ++ bo->dmabuf_fd = -1; + return bo; + } + + static void + panfrost_bo_free(struct panfrost_bo *bo) + { ++ struct panfrost_device *dev = bo->dev; + struct drm_gem_close gem_close = { .handle = bo->gem_handle }; + int ret; + +- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li memfree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ if (dev->kbase) { ++ os_munmap(bo->ptr.cpu, bo->size); ++ if (bo->munmap_ptr) ++ os_munmap(bo->munmap_ptr, bo->size); ++ if (bo->free_ioctl) ++ dev->mali.free(&dev->mali, bo->ptr.gpu); ++ kbase_free_gem_handle(&dev->mali, bo->gem_handle); ++ ret = 0; ++ } else { ++ ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); ++ } + if (ret) { + fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n"); + assert(0); + } + +- /* BO will be freed with the sparse array, but zero to indicate free */ ++ /* BO will be freed with the stable_array, but zero to indicate free */ + memset(bo, 0, sizeof(*bo)); + } + ++static bool ++panfrost_bo_usage_finished(struct panfrost_bo *bo, bool readers) ++{ ++ struct panfrost_device *dev = bo->dev; ++ kbase k = &dev->mali; ++ ++ bool ret = true; ++ ++ pthread_mutex_lock(&dev->bo_usage_lock); ++ pthread_mutex_lock(&dev->mali.queue_lock); ++ ++ util_dynarray_foreach(&bo->usage, struct panfrost_usage, u) { ++ /* Skip if we are only waiting for writers */ ++ if (!u->write && !readers) ++ continue; ++ ++ /* Usages are ordered, so everything else is also invalid */ ++ if (u->queue >= k->event_slot_usage) ++ break; ++ ++ struct kbase_event_slot *slot = &k->event_slots[u->queue]; ++ uint64_t seqnum = u->seqnum; ++ ++ /* There is a race condition, where we can depend on an ++ * unsubmitted batch. In that cade, decrease the seqnum. ++ * Otherwise, skip invalid dependencies. TODO: do GC? */ ++ if (slot->last_submit == seqnum) ++ --seqnum; ++ else if (slot->last_submit < seqnum) ++ continue; ++ ++ if (slot->last <= seqnum) { ++ ret = false; ++ break; ++ } ++ } ++ ++ pthread_mutex_unlock(&dev->mali.queue_lock); ++ pthread_mutex_unlock(&dev->bo_usage_lock); ++ ++ return ret; ++} ++ + /* Returns true if the BO is ready, false otherwise. + * access_type is encoding the type of access one wants to ensure is done. 
+ * Waiting is always done for writers, but if wait_readers is set then readers +@@ -113,12 +215,15 @@ panfrost_bo_free(struct panfrost_bo *bo) + bool + panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + { ++ struct panfrost_device *dev = bo->dev; + struct drm_panfrost_wait_bo req = { + .handle = bo->gem_handle, + .timeout_ns = timeout_ns, + }; + int ret; + ++ /* TODO: With driver-handled sync, is gpu_access even worth it? */ ++ + /* If the BO has been exported or imported we can't rely on the cached + * state, we need to call the WAIT_BO ioctl. + */ +@@ -134,10 +239,31 @@ panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + return true; + } + ++ if (dev->kbase && (dev->arch >= 10)) { ++ struct kbase_wait_ctx wait = kbase_wait_init(&dev->mali, timeout_ns); ++ while (kbase_wait_for_event(&wait)) { ++ if (panfrost_bo_usage_finished(bo, wait_readers)) ++ break; ++ } ++ kbase_wait_fini(wait); ++ ++ bool ret = panfrost_bo_usage_finished(bo, wait_readers); ++ if (bo->flags & PAN_BO_SHARED) ++ ret &= kbase_poll_fd_until(bo->dmabuf_fd, wait_readers, wait.until); ++ ++ if (ret) ++ bo->gpu_access &= (wait_readers ? 0 : PAN_BO_ACCESS_READ); ++ return ret; ++ } ++ + /* The ioctl returns >= 0 value when the BO we are waiting for is ready + * -1 otherwise. + */ +- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); ++ if (dev->kbase) ++ ret = kbase_wait_bo(&dev->mali, bo->gem_handle, timeout_ns, ++ wait_readers); ++ else ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); + if (ret != -1) { + /* Set gpu_access to 0 so that the next call to bo_wait() + * doesn't have to call the WAIT_BO ioctl. +@@ -153,6 +279,32 @@ panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + return false; + } + ++static void ++panfrost_bo_mem_op(struct panfrost_bo *bo, size_t offset, size_t length, bool invalidate) ++{ ++ struct panfrost_device *dev = bo->dev; ++ ++ assert(offset + length <= bo->size); ++ ++ if (!bo->cached) ++ return; ++ ++ dev->mali.mem_sync(&dev->mali, bo->ptr.gpu, bo->ptr.cpu + offset, length, ++ invalidate); ++} ++ ++void ++panfrost_bo_mem_invalidate(struct panfrost_bo *bo, size_t offset, size_t length) ++{ ++ panfrost_bo_mem_op(bo, offset, length, true); ++} ++ ++void ++panfrost_bo_mem_clean(struct panfrost_bo *bo, size_t offset, size_t length) ++{ ++ panfrost_bo_mem_op(bo, offset, length, false); ++} ++ + /* Helper to calculate the bucket index of a BO */ + + static unsigned +@@ -200,21 +352,31 @@ panfrost_bo_cache_fetch(struct panfrost_device *dev, + + /* If the oldest BO in the cache is busy, likely so is + * everything newer, so bail. */ +- if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, +- PAN_BO_ACCESS_RW)) +- break; ++ ++ /* For kbase, BOs are not added to the cache until the GPU is ++ * done with them, so there is no need to wait. */ ++ if (!dev->kbase) { ++ if (!panfrost_bo_wait(entry, dontwait ? 
0 : INT64_MAX, ++ PAN_BO_ACCESS_RW)) ++ break; ++ } + + struct drm_panfrost_madvise madv = { + .handle = entry->gem_handle, + .madv = PANFROST_MADV_WILLNEED, + }; +- int ret; ++ int ret = 0; + + /* This one works, splice it out of the cache */ + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ if (dev->kbase) { ++ /* With kbase, BOs are never freed from the cache */ ++ madv.retained = true; ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ } + if (!ret && !madv.retained) { + panfrost_bo_free(entry); + continue; +@@ -276,7 +438,10 @@ panfrost_bo_cache_put(struct panfrost_bo *bo) + madv.madv = PANFROST_MADV_DONTNEED; + madv.retained = 0; + +- drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ // TODO: Allow freeing madvise'd BOs with kbase... not that it really ++ // matters for boards with 16 GB RAM ++ if (!dev->kbase) ++ drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); + + /* Add us to the bucket */ + list_addtail(&bo->bucket_link, bucket); +@@ -286,6 +451,10 @@ panfrost_bo_cache_put(struct panfrost_bo *bo) + clock_gettime(CLOCK_MONOTONIC, &time); + bo->last_used = time.tv_sec; + ++ /* For kbase, the GPU can't be accessing this BO any more */ ++ if (dev->kbase) ++ bo->gpu_access = 0; ++ + /* Let's do some cleanup in the BO cache while we hold the + * lock. + */ +@@ -352,10 +521,15 @@ panfrost_bo_mmap(struct panfrost_bo *bo) + static void + panfrost_bo_munmap(struct panfrost_bo *bo) + { ++ /* We can't munmap BOs when using kbase, as that frees the storage and ++ * the GPU might still be using the BO. */ ++ if (bo->dev->kbase) ++ return; ++ + if (!bo->ptr.cpu) + return; + +- if (os_munmap((void *) (uintptr_t)bo->ptr.cpu, bo->size)) { ++ if (os_munmap(bo->ptr.cpu, bo->size)) { + perror("munmap"); + abort(); + } +@@ -390,8 +564,15 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + if (!bo) + bo = panfrost_bo_cache_fetch(dev, size, flags, label, false); + if (!bo) { +- panfrost_bo_cache_evict_all(dev); +- bo = panfrost_bo_alloc(dev, size, flags, label); ++ for (unsigned i = 0; i < 5; ++i) { ++ usleep(20 * 1000 * i * i); ++ if (dev->kbase) ++ kbase_ensure_handle_events(&dev->mali); ++ panfrost_bo_cache_evict_all(dev); ++ bo = panfrost_bo_alloc(dev, size, flags, label); ++ if (bo) ++ break; ++ } + } + + if (!bo) { +@@ -406,8 +587,15 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP))) + panfrost_bo_mmap(bo); + ++ if ((dev->debug & PAN_DBG_BO_CLEAR) && !(flags & PAN_BO_INVISIBLE)) { ++ memset(bo->ptr.cpu, 0, bo->size); ++ panfrost_bo_mem_clean(bo, 0, bo->size); ++ } ++ + p_atomic_set(&bo->refcnt, 1); + ++ util_dynarray_init(&bo->usage, NULL); ++ + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + if (flags & PAN_BO_INVISIBLE) + pandecode_inject_mmap(bo->ptr.gpu, NULL, bo->size, NULL); +@@ -415,6 +603,14 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + pandecode_inject_mmap(bo->ptr.gpu, bo->ptr.cpu, bo->size, NULL); + } + ++ if (dev->bo_log) { ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li alloc %"PRIx64" to %"PRIx64" size %zu label %s\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label); ++ fflush(NULL); ++ } ++ + return bo; + } + +@@ -427,6 +623,60 @@ panfrost_bo_reference(struct panfrost_bo *bo) + } + } + ++static void ++panfrost_bo_fini(struct panfrost_bo *bo) ++{ ++ 
struct panfrost_device *dev = bo->dev; ++ ++ /* When the reference count goes to zero, we need to cleanup */ ++ panfrost_bo_munmap(bo); ++ ++ if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) ++ pandecode_inject_free(bo->ptr.gpu, bo->size); ++ ++ /* Rather than freeing the BO now, we'll cache the BO for later ++ * allocations if we're allowed to. ++ */ ++ if (!panfrost_bo_cache_put(bo)) ++ panfrost_bo_free(bo); ++} ++ ++static void ++panfrost_bo_free_gpu(void *data) ++{ ++ struct panfrost_bo *bo = data; ++ struct panfrost_device *dev = bo->dev; ++ ++ /* Don't free if there are still references */ ++ if (p_atomic_dec_return(&bo->gpu_refcnt)) ++ return; ++ ++ pthread_mutex_lock(&dev->bo_map_lock); ++ ++ /* Someone might have imported this BO while we were waiting for the ++ * lock, let's make sure it's still not referenced before freeing it. ++ */ ++ if (p_atomic_read(&bo->refcnt) != 0) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } ++ ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li gpufree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ panfrost_bo_fini(bo); ++ ++ pthread_mutex_unlock(&dev->bo_map_lock); ++} ++ + void + panfrost_bo_unreference(struct panfrost_bo *bo) + { +@@ -439,25 +689,57 @@ panfrost_bo_unreference(struct panfrost_bo *bo) + + struct panfrost_device *dev = bo->dev; + ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li free %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ + pthread_mutex_lock(&dev->bo_map_lock); + + /* Someone might have imported this BO while we were waiting for the + * lock, let's make sure it's still not referenced before freeing it. + */ +- if (p_atomic_read(&bo->refcnt) == 0) { +- /* When the reference count goes to zero, we need to cleanup */ +- panfrost_bo_munmap(bo); ++ if (p_atomic_read(&bo->refcnt) != 0) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } + +- if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) +- pandecode_inject_free(bo->ptr.gpu, bo->size); ++ util_dynarray_fini(&bo->usage); + +- /* Rather than freeing the BO now, we'll cache the BO for later +- * allocations if we're allowed to. ++ if (dev->kbase) { ++ /* Assume that all queues are using this BO, and so free the ++ * BO only after all currently-submitted jobs have finished. ++ * This could eventually be optimised to only wait on a subset ++ * of queues. 
+ */ +- if (!panfrost_bo_cache_put(bo)) +- panfrost_bo_free(bo); ++ bool added = dev->mali.callback_all_queues(&dev->mali, ++ &bo->gpu_refcnt, panfrost_bo_free_gpu, bo); + ++ if (added) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } + } ++ ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li immfree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ panfrost_bo_fini(bo); ++ + pthread_mutex_unlock(&dev->bo_map_lock); + } + +@@ -467,22 +749,42 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + struct panfrost_bo *bo; + struct drm_panfrost_get_bo_offset get_bo_offset = {0,}; + ASSERTED int ret; ++ kbase_handle handle = { .fd = -1 }; + unsigned gem_handle; + +- ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); +- assert(!ret); ++ if (dev->kbase) { ++ gem_handle = dev->mali.import_dmabuf(&dev->mali, fd); ++ if (gem_handle == -1) ++ return NULL; ++ } else { ++ ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); ++ assert(!ret); ++ } + + pthread_mutex_lock(&dev->bo_map_lock); + bo = pan_lookup_bo(dev, gem_handle); + ++ bool found = false; ++ + if (!bo->dev) { + get_bo_offset.handle = gem_handle; +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); +- assert(!ret); ++ if (dev->kbase) { ++ handle = kbase_gem_handle_get(&dev->mali, gem_handle); ++ get_bo_offset.offset = handle.va; ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); ++ assert(!ret); ++ } + + bo->dev = dev; +- bo->ptr.gpu = (mali_ptr) get_bo_offset.offset; + bo->size = lseek(fd, 0, SEEK_END); ++ bo->ptr.gpu = (mali_ptr) get_bo_offset.offset; ++ if (dev->kbase && (sizeof(void *) > 4 || get_bo_offset.offset < (1LL << 32))) { ++ bo->ptr.cpu = (void *)(uintptr_t) get_bo_offset.offset; ++ } else if (dev->kbase) { ++ bo->ptr.cpu = dev->mali.mmap_import(&dev->mali, bo->ptr.gpu, bo->size); ++ bo->free_ioctl = true; ++ } + /* Sometimes this can fail and return -1. size of -1 is not + * a nice thing for mmap to try mmap. Be more robust also + * for zero sized maps and fail nicely too +@@ -493,8 +795,21 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + } + bo->flags = PAN_BO_SHARED; + bo->gem_handle = gem_handle; ++ util_dynarray_init(&bo->usage, NULL); ++ if (dev->kbase) { ++ /* kbase always maps dma-bufs with caching */ ++ bo->cached = true; ++ ++ /* Importing duplicates the FD, so we cache the FD ++ * from the handle */ ++ bo->dmabuf_fd = handle.fd; ++ } else { ++ bo->dmabuf_fd = -1; ++ } + p_atomic_set(&bo->refcnt, 1); + } else { ++ found = true; ++ + /* bo->refcnt == 0 can happen if the BO + * was being released but panfrost_bo_import() acquired the + * lock before panfrost_bo_unreference(). 
In that case, refcnt +@@ -512,12 +827,34 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + } + pthread_mutex_unlock(&dev->bo_map_lock); + ++ if (dev->bo_log) { ++ int new_fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li import %"PRIx64" to %"PRIx64" size %zu fd %i new %i handle %i found %i\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, ++ fd, new_fd, gem_handle, found); ++ fflush(NULL); ++ } ++ + return bo; + } + + int + panfrost_bo_export(struct panfrost_bo *bo) + { ++ struct panfrost_device *dev = bo->dev; ++ ++ if (bo->dmabuf_fd != -1) { ++ assert(bo->flags & PAN_BO_SHARED); ++ ++ return os_dupfd_cloexec(bo->dmabuf_fd); ++ } ++ ++ if (dev->kbase) ++ return -1; ++ + struct drm_prime_handle args = { + .handle = bo->gem_handle, + .flags = DRM_CLOEXEC, +diff --git a/src/panfrost/lib/pan_bo.h b/src/panfrost/lib/pan_bo.h +index 7d19fba9dfc..1b817191734 100644 +--- a/src/panfrost/lib/pan_bo.h ++++ b/src/panfrost/lib/pan_bo.h +@@ -27,6 +27,7 @@ + #define __PAN_BO_H__ + + #include "util/list.h" ++#include "util/u_dynarray.h" + #include "panfrost-job.h" + #include + +@@ -50,6 +51,12 @@ + * cached locally */ + #define PAN_BO_SHARED (1 << 4) + ++/* Use event memory, required for CSF events to be signaled to the kernel */ ++#define PAN_BO_EVENT (1 << 5) ++ ++/* Use the caching policy for resource BOs */ ++#define PAN_BO_CACHEABLE (1 << 6) ++ + /* GPU access flags */ + + /* BO is either shared (can be accessed by more than one GPU batch) or private +@@ -80,6 +87,12 @@ struct panfrost_ptr { + mali_ptr gpu; + }; + ++struct panfrost_usage { ++ uint32_t queue; ++ bool write; ++ uint64_t seqnum; ++}; ++ + struct panfrost_bo { + /* Must be first for casting */ + struct list_head bucket_link; +@@ -95,11 +108,16 @@ struct panfrost_bo { + /* Atomic reference count */ + int32_t refcnt; + ++ /* Reference count for GPU jobs */ ++ int32_t gpu_refcnt; ++ + struct panfrost_device *dev; + + /* Mapping for the entire object (all levels) */ + struct panfrost_ptr ptr; + ++ struct util_dynarray usage; ++ + /* Size of all entire trees */ + size_t size; + +@@ -115,11 +133,31 @@ struct panfrost_bo { + + /* Human readable description of the BO for debugging. */ + const char *label; ++ ++ /* Sometimes we don't access the BO through kbase's mapping of the ++ * memory, in that case we need to save the pointer to pass to ++ * munmap to avoid leaking memory. */ ++ void *munmap_ptr; ++ ++ /* For 32-bit applications we may not even be able to that, because ++ * the VA may be too high for kbase to map to an equivalent CPU ++ * address, in which case we must use the memory free icotl. */ ++ bool free_ioctl; ++ ++ /* Is the BO cached CPU-side? 
*/ ++ bool cached; ++ ++ /* File descriptor for the dma-buf */ ++ int dmabuf_fd; + }; + + bool + panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers); + void ++panfrost_bo_mem_invalidate(struct panfrost_bo *bo, size_t offset, size_t length); ++void ++panfrost_bo_mem_clean(struct panfrost_bo *bo, size_t offset, size_t length); ++void + panfrost_bo_reference(struct panfrost_bo *bo); + void + panfrost_bo_unreference(struct panfrost_bo *bo); +diff --git a/src/panfrost/lib/pan_cs.c b/src/panfrost/lib/pan_cs.c +index 986eb1e158d..7b24ec6586a 100644 +--- a/src/panfrost/lib/pan_cs.c ++++ b/src/panfrost/lib/pan_cs.c +@@ -282,9 +282,15 @@ pan_prepare_crc(const struct pan_fb_info *fb, int rt_crc, + ext->crc_render_target = rt_crc; + + if (fb->rts[rt_crc].clear) { ++#if PAN_ARCH < 10 ++ // todo v10 + uint32_t clear_val = fb->rts[rt_crc].clear_value[0]; + ext->crc_clear_color = clear_val | 0xc000000000000000 | + (((uint64_t)clear_val & 0xffff) << 32); ++#else ++ // TODO: Is this correct? ++ ext->crc_unk = 0x1f; ++#endif + } + #endif + } +@@ -420,7 +426,8 @@ pan_rt_init_format(const struct pan_image_view *rt, + cfg->swizzle = panfrost_translate_swizzle_4(swizzle); + } + +-#if PAN_ARCH >= 9 ++/* Don't define for later gens as this is not a GENX function */ ++#if PAN_ARCH == 9 + enum mali_afbc_compression_mode + pan_afbc_compression_mode(enum pipe_format format) + { +@@ -438,14 +445,21 @@ pan_afbc_compression_mode(enum pipe_format format) + case PIPE_FORMAT_R8_UNORM: return MALI_AFBC_COMPRESSION_MODE_R8; + case PIPE_FORMAT_R8G8_UNORM: return MALI_AFBC_COMPRESSION_MODE_R8G8; + case PIPE_FORMAT_R5G6B5_UNORM: return MALI_AFBC_COMPRESSION_MODE_R5G6B5; ++ case PIPE_FORMAT_R5G5B5A1_UNORM: return MALI_AFBC_COMPRESSION_MODE_R5G5B5A1; + case PIPE_FORMAT_R4G4B4A4_UNORM: return MALI_AFBC_COMPRESSION_MODE_R4G4B4A4; + case PIPE_FORMAT_R8G8B8_UNORM: return MALI_AFBC_COMPRESSION_MODE_R8G8B8; + case PIPE_FORMAT_R8G8B8A8_UNORM: return MALI_AFBC_COMPRESSION_MODE_R8G8B8A8; + case PIPE_FORMAT_R10G10B10A2_UNORM: return MALI_AFBC_COMPRESSION_MODE_R10G10B10A2; + case PIPE_FORMAT_R11G11B10_FLOAT: return MALI_AFBC_COMPRESSION_MODE_R11G11B10; + case PIPE_FORMAT_S8_UINT: return MALI_AFBC_COMPRESSION_MODE_S8; +- case PIPE_FORMAT_NONE: unreachable("invalid format for AFBC"); +- default: unreachable("unknown canonical AFBC format"); ++ case PIPE_FORMAT_NONE: ++ fprintf(stderr, "invalid format for AFBC: %s\n", util_format_name(format)); ++ fflush(NULL); ++ abort(); ++ default: ++ fprintf(stderr, "unknown canonical AFBC format: %s\n", util_format_name(format)); ++ fflush(NULL); ++ abort(); + } + } + #endif +@@ -558,6 +572,7 @@ GENX(pan_emit_tls)(const struct pan_tls_info *info, + */ + cfg.tls_address_mode = MALI_ADDRESS_MODE_PACKED; + ++ /* The shift is only used for packed mode */ + assert((info->tls.ptr & 4095) == 0); + cfg.tls_base_pointer = info->tls.ptr >> 8; + #else +@@ -731,6 +746,9 @@ GENX(pan_emit_fbd)(const struct panfrost_device *dev, + #if PAN_ARCH >= 6 + bool force_clean_write = pan_force_clean_write(fb, tile_size); + ++#if PAN_ARCH >= 9 ++ cfg.frame_argument = 0x10000; ++#endif + cfg.sample_locations = + panfrost_sample_positions(dev, pan_sample_pattern(fb->nr_samples)); + cfg.pre_frame_0 = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[0], force_clean_write); +@@ -940,7 +958,7 @@ GENX(pan_emit_tiler_heap)(const struct panfrost_device *dev, + pan_pack(out, TILER_HEAP, heap) { + heap.size = dev->tiler_heap->size; + heap.base = dev->tiler_heap->ptr.gpu; +- heap.bottom = 
dev->tiler_heap->ptr.gpu; ++ heap.bottom = dev->tiler_heap->ptr.gpu + 64; + heap.top = dev->tiler_heap->ptr.gpu + dev->tiler_heap->size; + } + } +@@ -951,30 +969,39 @@ GENX(pan_emit_tiler_ctx)(const struct panfrost_device *dev, + unsigned nr_samples, + bool first_provoking_vertex, + mali_ptr heap, ++ mali_ptr scratch, + void *out) + { + unsigned max_levels = dev->tiler_features.max_levels; + assert(max_levels >= 2); + + pan_pack(out, TILER_CONTEXT, tiler) { +- /* TODO: Select hierarchy mask more effectively */ +- tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28; +- +- /* For large framebuffers, disable the smallest bin size to +- * avoid pathological tiler memory usage. Required to avoid OOM +- * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on +- * Mali-G57. ++ /* TODO: Select hierarchy mask more effectively. */ ++ ++ /* Disable the smallest hierarchy level. This is required to ++ * use 32x32 tiles on v10, and helps reduce tiler heap memory ++ * usage for other GPUs. The rasteriser can efficiently skip ++ * primitives not entering the current quadrant of a tile, so ++ * this should not hurt performance much. ++ * Even for GPUs earlier than v10, cores get fed tiles in ++ * 32x32 pixel blocks, so making all of the tiles use the same ++ * set of primitive lists could help with performance. ++ * Maybe then v10 should disable two levels? + */ +- if (MAX2(fb_width, fb_height) >= 4096) +- tiler.hierarchy_mask &= ~1; ++ tiler.hierarchy_mask = (max_levels >= 8) ? 0xFE : 0x28; + + tiler.fb_width = fb_width; + tiler.fb_height = fb_height; + tiler.heap = heap; ++#if PAN_ARCH >= 10 ++ tiler.scratch = scratch; ++#endif + tiler.sample_pattern = pan_sample_pattern(nr_samples); + #if PAN_ARCH >= 9 + tiler.first_provoking_vertex = first_provoking_vertex; + #endif ++ tiler.state.word1 = 31; ++ tiler.state.word3 = 0x10000000; + } + } + #endif +@@ -984,24 +1011,43 @@ GENX(pan_emit_fragment_job)(const struct pan_fb_info *fb, + mali_ptr fbd, + void *out) + { ++#if PAN_ARCH < 10 + pan_section_pack(out, FRAGMENT_JOB, HEADER, header) { + header.type = MALI_JOB_TYPE_FRAGMENT; + header.index = 1; + } ++#endif + +- pan_section_pack(out, FRAGMENT_JOB, PAYLOAD, payload) { +- payload.bound_min_x = fb->extent.minx >> MALI_TILE_SHIFT; +- payload.bound_min_y = fb->extent.miny >> MALI_TILE_SHIFT; +- payload.bound_max_x = fb->extent.maxx >> MALI_TILE_SHIFT; +- payload.bound_max_y = fb->extent.maxy >> MALI_TILE_SHIFT; ++#if PAN_ARCH < 10 ++#define BOUND_SHIFT MALI_TILE_SHIFT ++#else ++#define BOUND_SHIFT 0 ++#endif ++ ++ pan_section_pack_cs_v10(out, fb->cs_fragment, FRAGMENT_JOB, PAYLOAD, payload) { ++ payload.bound_min_x = fb->extent.minx >> BOUND_SHIFT; ++ payload.bound_min_y = fb->extent.miny >> BOUND_SHIFT; ++ payload.bound_max_x = fb->extent.maxx >> BOUND_SHIFT; ++ payload.bound_max_y = fb->extent.maxy >> BOUND_SHIFT; + payload.framebuffer = fbd; + + #if PAN_ARCH >= 5 + if (fb->tile_map.base) { ++#if PAN_ARCH < 0 + payload.has_tile_enable_map = true; ++#endif + payload.tile_enable_map = fb->tile_map.base; + payload.tile_enable_map_row_stride = fb->tile_map.stride; + } ++#else ++ assert(!fb->tile_map.base); + #endif + } ++ ++#if PAN_ARCH >= 10 ++ /* TODO: Do this here? 
*/ ++ pan_pack_ins(fb->cs_fragment, FRAGMENT_LAUNCH, launch) { ++ launch.has_tile_enable_map = !!fb->tile_map.base; ++ } ++#endif + } +diff --git a/src/panfrost/lib/pan_cs.h b/src/panfrost/lib/pan_cs.h +index 8186102e5c0..5c5e29cb6d6 100644 +--- a/src/panfrost/lib/pan_cs.h ++++ b/src/panfrost/lib/pan_cs.h +@@ -121,6 +121,8 @@ struct pan_fb_info { + /* Only used on Valhall */ + bool sprite_coord_origin; + bool first_provoking_vertex; ++ ++ pan_command_stream *cs_fragment; + }; + + static inline unsigned +@@ -171,7 +173,7 @@ void + GENX(pan_emit_tiler_ctx)(const struct panfrost_device *dev, + unsigned fb_width, unsigned fb_height, + unsigned nr_samples, bool first_provoking_vertex, +- mali_ptr heap, ++ mali_ptr heap, mali_ptr scratch, + void *out); + #endif + +diff --git a/src/panfrost/lib/pan_device.h b/src/panfrost/lib/pan_device.h +index ad18d154a2c..acb46620968 100644 +--- a/src/panfrost/lib/pan_device.h ++++ b/src/panfrost/lib/pan_device.h +@@ -35,11 +35,12 @@ + #include "util/u_dynarray.h" + #include "util/bitset.h" + #include "util/list.h" +-#include "util/sparse_array.h" ++#include "util/stable_array.h" + + #include "panfrost/util/pan_ir.h" + #include "pan_pool.h" + #include "pan_util.h" ++#include "pan_base.h" + + #include + +@@ -182,6 +183,7 @@ struct panfrost_device { + void *memctx; + + int fd; ++ bool kbase; + + /* Properties of the GPU in use */ + unsigned arch; +@@ -204,6 +206,9 @@ struct panfrost_device { + const struct panfrost_model *model; + bool has_afbc; + ++ /* Does the kernel support dma-buf fence import/export? */ ++ bool has_dmabuf_fence; ++ + /* Table of formats, indexed by a PIPE format */ + const struct panfrost_format *formats; + +@@ -217,8 +222,11 @@ struct panfrost_device { + + struct renderonly *ro; + ++ /* Hold this while updating usage field of BOs */ ++ pthread_mutex_t bo_usage_lock; ++ + pthread_mutex_t bo_map_lock; +- struct util_sparse_array bo_map; ++ struct stable_array bo_map; + + struct { + pthread_mutex_t lock; +@@ -263,6 +271,10 @@ struct panfrost_device { + * unconditionally on Bifrost, and useful for sharing with Midgard */ + + struct panfrost_bo *sample_positions; ++ ++ struct kbase_ mali; ++ ++ FILE *bo_log; + }; + + void +@@ -271,6 +283,9 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev); + void + panfrost_close_device(struct panfrost_device *dev); + ++bool ++panfrost_check_dmabuf_fence(struct panfrost_device *dev); ++ + bool + panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt); + +@@ -287,12 +302,18 @@ panfrost_query_sample_position( + float *out); + + unsigned +-panfrost_query_l2_slices(const struct panfrost_device *dev); ++panfrost_query_l2_slices(struct panfrost_device *dev); + + static inline struct panfrost_bo * + pan_lookup_bo(struct panfrost_device *dev, uint32_t gem_handle) + { +- return (struct panfrost_bo *)util_sparse_array_get(&dev->bo_map, gem_handle); ++ return stable_array_get(&dev->bo_map, struct panfrost_bo, gem_handle); ++} ++ ++static inline struct panfrost_bo * ++pan_lookup_bo_existing(struct panfrost_device *dev, uint32_t gem_handle) ++{ ++ return stable_array_get_existing(&dev->bo_map, struct panfrost_bo, gem_handle); + } + + static inline bool +diff --git a/src/panfrost/lib/pan_layout.c b/src/panfrost/lib/pan_layout.c +index b64a2d7a6e5..96940438f54 100644 +--- a/src/panfrost/lib/pan_layout.c ++++ b/src/panfrost/lib/pan_layout.c +@@ -32,6 +32,14 @@ + * enabling the YUV-like transform is typically a win where possible. 
*/ + + uint64_t pan_best_modifiers[PAN_MODIFIER_COUNT] = { ++ DRM_FORMAT_MOD_ARM_AFBC( ++ AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | ++ AFBC_FORMAT_MOD_TILED | ++ AFBC_FORMAT_MOD_SC | ++ AFBC_FORMAT_MOD_SPARSE | ++ AFBC_FORMAT_MOD_YTR | ++ AFBC_FORMAT_MOD_NATIVE_SWIZZLE), ++ + DRM_FORMAT_MOD_ARM_AFBC( + AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | + AFBC_FORMAT_MOD_TILED | +@@ -201,18 +209,17 @@ pan_afbc_body_align(uint64_t modifier) + #define CHECKSUM_TILE_HEIGHT 16 + #define CHECKSUM_BYTES_PER_TILE 8 + +-unsigned +-panfrost_compute_checksum_size( +- struct pan_image_slice_layout *slice, +- unsigned width, +- unsigned height) ++struct pan_image_slice_crc ++panfrost_compute_checksum_size(unsigned width, unsigned height) + { + unsigned tile_count_x = DIV_ROUND_UP(width, CHECKSUM_TILE_WIDTH); + unsigned tile_count_y = DIV_ROUND_UP(height, CHECKSUM_TILE_HEIGHT); + +- slice->crc.stride = tile_count_x * CHECKSUM_BYTES_PER_TILE; +- +- return slice->crc.stride * tile_count_y; ++ struct pan_image_slice_crc ret = { ++ .stride = tile_count_x * CHECKSUM_BYTES_PER_TILE, ++ .size = ret.stride * tile_count_y, ++ }; ++ return ret; + } + + unsigned +@@ -236,8 +243,11 @@ panfrost_get_legacy_stride(const struct pan_image_layout *layout, + panfrost_block_size(layout->modifier, layout->format); + + if (drm_is_afbc(layout->modifier)) { ++ unsigned align_w = block_size.width * ++ pan_afbc_tile_size(layout->modifier); ++ + unsigned width = u_minify(layout->width, level); +- width = ALIGN_POT(width, block_size.width); ++ width = ALIGN_POT(width, align_w); + + return width * util_format_get_blocksize(layout->format); + } else { +@@ -392,9 +402,7 @@ pan_image_layout_init(struct pan_image_layout *layout, + + /* Add a checksum region if necessary */ + if (layout->crc) { +- slice->crc.size = +- panfrost_compute_checksum_size(slice, width, height); +- ++ slice->crc = panfrost_compute_checksum_size(width, height); + slice->crc.offset = offset; + offset += slice->crc.size; + slice->size += slice->crc.size; +diff --git a/src/panfrost/lib/pan_pool.h b/src/panfrost/lib/pan_pool.h +index 01c8348c41d..e332adff362 100644 +--- a/src/panfrost/lib/pan_pool.h ++++ b/src/panfrost/lib/pan_pool.h +@@ -130,4 +130,17 @@ pan_pool_alloc_descs(struct pan_pool *pool, + #define pan_pool_alloc_desc_aggregate(pool, ...) 
\ + pan_pool_alloc_descs(pool, PAN_DESC_AGGREGATE(__VA_ARGS__)) + ++#ifdef PAN_ARCH ++#if PAN_ARCH < 10 ++ ++#define pan_pool_alloc_desc_cs_v10(pool, name) \ ++ pan_pool_alloc_desc(pool, name) ++ ++#else /* PAN_ARCH >= 10 */ ++ ++#define pan_pool_alloc_desc_cs_v10(pool, name) ((struct panfrost_ptr) {0}) ++ ++#endif ++#endif /* PAN_ARCH */ ++ + #endif +diff --git a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c +index 048954b4c4d..57188c24f52 100644 +--- a/src/panfrost/lib/pan_props.c ++++ b/src/panfrost/lib/pan_props.c +@@ -24,6 +24,7 @@ + * Alyssa Rosenzweig + */ + ++#include + #include + + #include "util/u_math.h" +@@ -31,12 +32,14 @@ + #include "util/hash_table.h" + #include "util/u_thread.h" + #include "drm-uapi/panfrost_drm.h" ++#include "dma-uapi/dma-buf.h" + #include "pan_encoder.h" + #include "pan_device.h" + #include "pan_bo.h" + #include "pan_texture.h" + #include "wrap.h" + #include "pan_util.h" ++#include "pan_base.h" + + /* Fixed "minimum revisions" */ + #define NO_ANISO (~0) +@@ -70,6 +73,18 @@ const struct panfrost_model panfrost_model_list[] = { + MODEL(0x7212, "G52", "TGOx", HAS_ANISO, 16384, {}), + MODEL(0x7402, "G52 r1", "TGOx", HAS_ANISO, 16384, {}), + MODEL(0x9093, "G57", "TNAx", HAS_ANISO, 16384, {}), ++ MODEL(0xa867, "G610", "LODx", HAS_ANISO, 65536, {}), ++ /* Matching the kbase dummy model, probably not real GPUs */ ++ MODEL(0xa802, "G710", "TODx", HAS_ANISO, 65536, {}), ++}; ++ ++const struct panfrost_model panfrost_unknown_model = { ++ .gpu_id = 0, ++ .name = "Unknowm Mali device (Panfrost)", ++ .performance_counters = "AAAA", ++ .min_rev_anisotropic = NO_ANISO, ++ .tilebuffer_size = 8192, ++ .quirks = {}, + }; + + #undef NO_ANISO +@@ -83,12 +98,13 @@ const struct panfrost_model panfrost_model_list[] = { + const struct panfrost_model * + panfrost_get_model(uint32_t gpu_id) + { ++ + for (unsigned i = 0; i < ARRAY_SIZE(panfrost_model_list); ++i) { + if (panfrost_model_list[i].gpu_id == gpu_id) + return &panfrost_model_list[i]; + } + +- return NULL; ++ return &panfrost_unknown_model; + } + + /* Abstraction over the raw drm_panfrost_get_param ioctl for fetching +@@ -96,16 +112,27 @@ panfrost_get_model(uint32_t gpu_id) + + static __u64 + panfrost_query_raw( +- int fd, ++ struct panfrost_device *dev, + enum drm_panfrost_param param, + bool required, + unsigned default_value) + { ++ if (dev->kbase) { ++ uint64_t value; ++ bool ret = dev->mali.get_pan_gpuprop(&dev->mali, param, &value); ++ if (ret) { ++ return value; ++ } else { ++ assert(!required); ++ return default_value; ++ } ++ } ++ + struct drm_panfrost_get_param get_param = {0,}; + ASSERTED int ret; + + get_param.param = param; +- ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); + + if (ret) { + assert(!required); +@@ -116,23 +143,23 @@ panfrost_query_raw( + } + + static unsigned +-panfrost_query_gpu_version(int fd) ++panfrost_query_gpu_version(struct panfrost_device *dev) + { +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); + } + + static unsigned +-panfrost_query_gpu_revision(int fd) ++panfrost_query_gpu_revision(struct panfrost_device *dev) + { +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_REVISION, true, 0); ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_GPU_REVISION, true, 0); + } + + unsigned +-panfrost_query_l2_slices(const struct panfrost_device *dev) ++panfrost_query_l2_slices(struct 
panfrost_device *dev) + { + /* Query MEM_FEATURES register */ + uint32_t mem_features = +- panfrost_query_raw(dev->fd, DRM_PANFROST_PARAM_MEM_FEATURES, ++ panfrost_query_raw(dev, DRM_PANFROST_PARAM_MEM_FEATURES, + true, 0); + + /* L2_SLICES is MEM_FEATURES[11:8] minus(1) */ +@@ -140,10 +167,10 @@ panfrost_query_l2_slices(const struct panfrost_device *dev) + } + + static struct panfrost_tiler_features +-panfrost_query_tiler_features(int fd) ++panfrost_query_tiler_features(struct panfrost_device *dev) + { + /* Default value (2^9 bytes and 8 levels) to match old behaviour */ +- uint32_t raw = panfrost_query_raw(fd, DRM_PANFROST_PARAM_TILER_FEATURES, ++ uint32_t raw = panfrost_query_raw(dev, DRM_PANFROST_PARAM_TILER_FEATURES, + false, 0x809); + + /* Bin size is log2 in the first byte, max levels in the second byte */ +@@ -154,11 +181,11 @@ panfrost_query_tiler_features(int fd) + } + + static unsigned +-panfrost_query_core_count(int fd, unsigned *core_id_range) ++panfrost_query_core_count(struct panfrost_device *dev, unsigned *core_id_range) + { + /* On older kernels, worst-case to 16 cores */ + +- unsigned mask = panfrost_query_raw(fd, ++ unsigned mask = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff); + + /* Some cores might be absent. In some cases, we care +@@ -199,16 +226,16 @@ panfrost_max_thread_count(unsigned arch) + } + + static unsigned +-panfrost_query_thread_tls_alloc(int fd, unsigned major) ++panfrost_query_thread_tls_alloc(struct panfrost_device *dev, unsigned major) + { +- unsigned tls = panfrost_query_raw(fd, ++ unsigned tls = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 0); + + return (tls > 0) ? tls : panfrost_max_thread_count(major); + } + + static uint32_t +-panfrost_query_compressed_formats(int fd) ++panfrost_query_compressed_formats(struct panfrost_device *dev) + { + /* If unspecified, assume ASTC/ETC only. Factory default for Juno, and + * should exist on any Mali configuration. All hardware should report +@@ -227,7 +254,7 @@ panfrost_query_compressed_formats(int fd) + (1 << MALI_ASTC_2D_LDR) | + (1 << MALI_ASTC_2D_HDR); + +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, + false, default_set); + } + +@@ -250,9 +277,9 @@ panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt) + * may omit it, signaled as a nonzero value in the AFBC_FEATURES property. 
*/ + + static bool +-panfrost_query_afbc(int fd, unsigned arch) ++panfrost_query_afbc(struct panfrost_device *dev, unsigned arch) + { +- unsigned reg = panfrost_query_raw(fd, ++ unsigned reg = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_AFBC_FEATURES, + false, 0); + +@@ -281,24 +308,40 @@ panfrost_query_optimal_tib_size(const struct panfrost_device *dev) + void + panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + { ++ if (kbase_open(&dev->mali, fd, 4, (dev->debug & PAN_DBG_LOG))) { ++ dev->kbase = true; ++ fd = -1; ++ } ++ + dev->fd = fd; + dev->memctx = memctx; +- dev->gpu_id = panfrost_query_gpu_version(fd); ++ dev->gpu_id = panfrost_query_gpu_version(dev); + dev->arch = pan_arch(dev->gpu_id); +- dev->kernel_version = drmGetVersion(fd); +- dev->revision = panfrost_query_gpu_revision(fd); ++ if (dev->kbase) { ++ dev->kernel_version = calloc(1, sizeof(drmVersion)); ++ *dev->kernel_version = (drmVersion) { ++ .version_major = 1, ++ .version_minor = 999, ++ }; ++ } else { ++ dev->kernel_version = drmGetVersion(fd); ++ } ++ dev->revision = panfrost_query_gpu_revision(dev); + dev->model = panfrost_get_model(dev->gpu_id); + + /* If we don't recognize the model, bail early */ + if (!dev->model) + return; + +- dev->core_count = panfrost_query_core_count(fd, &dev->core_id_range); +- dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd, dev->arch); ++ if (dev->debug & PAN_DBG_BO_LOG) ++ dev->bo_log = fopen("/tmp/bo_log", "w"); ++ ++ dev->core_count = panfrost_query_core_count(dev, &dev->core_id_range); ++ dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(dev, dev->arch); + dev->optimal_tib_size = panfrost_query_optimal_tib_size(dev); +- dev->compressed_formats = panfrost_query_compressed_formats(fd); +- dev->tiler_features = panfrost_query_tiler_features(fd); +- dev->has_afbc = panfrost_query_afbc(fd, dev->arch); ++ dev->compressed_formats = panfrost_query_compressed_formats(dev); ++ dev->tiler_features = panfrost_query_tiler_features(dev); ++ dev->has_afbc = panfrost_query_afbc(dev, dev->arch); + + if (dev->arch <= 6) + dev->formats = panfrost_pipe_format_v6; +@@ -307,8 +350,10 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + else + dev->formats = panfrost_pipe_format_v9; + +- util_sparse_array_init(&dev->bo_map, sizeof(struct panfrost_bo), 512); ++ stable_array_init(&dev->bo_map, struct panfrost_bo); + ++ pthread_mutex_init(&dev->bo_usage_lock, NULL); ++ pthread_mutex_init(&dev->bo_map_lock, NULL); + pthread_mutex_init(&dev->bo_cache.lock, NULL); + list_inithead(&dev->bo_cache.lru); + +@@ -323,8 +368,9 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + * active for a single job chain at once, so a single heap can be + * shared across batches/contextes */ + +- dev->tiler_heap = panfrost_bo_create(dev, 128 * 1024 * 1024, +- PAN_BO_INVISIBLE | PAN_BO_GROWABLE, "Tiler heap"); ++ if (dev->arch < 10) ++ dev->tiler_heap = panfrost_bo_create(dev, 128 * 1024 * 1024, ++ PAN_BO_INVISIBLE | PAN_BO_GROWABLE, "Tiler heap"); + + pthread_mutex_init(&dev->submit_lock, NULL); + +@@ -341,11 +387,102 @@ panfrost_close_device(struct panfrost_device *dev) + if (dev->model) { + pthread_mutex_destroy(&dev->submit_lock); + panfrost_bo_unreference(dev->tiler_heap); ++ panfrost_bo_unreference(dev->sample_positions); + panfrost_bo_cache_evict_all(dev); + pthread_mutex_destroy(&dev->bo_cache.lock); +- util_sparse_array_finish(&dev->bo_map); ++ pthread_mutex_destroy(&dev->bo_map_lock); ++ pthread_mutex_destroy(&dev->bo_usage_lock); ++ 
stable_array_fini(&dev->bo_map); ++ } ++ ++ if (dev->kbase) ++ free(dev->kernel_version); ++ else ++ drmFreeVersion(dev->kernel_version); ++ if (dev->kbase) ++ dev->mali.close(&dev->mali); ++ else ++ close(dev->fd); ++} ++ ++bool ++panfrost_check_dmabuf_fence(struct panfrost_device *dev) ++{ ++ bool ret = false; ++ int err; ++ ++ /* This function is only useful for kbase, where we can't create ++ * dma-bufs from the kbase FD. */ ++ if (!dev->ro) ++ goto out; ++ ++ struct drm_mode_create_dumb create_dumb = { ++ .width = 16, ++ .height = 16, ++ .bpp = 32, ++ }; ++ ++ err = drmIoctl(dev->ro->kms_fd, DRM_IOCTL_MODE_CREATE_DUMB, &create_dumb); ++ if (err < 0) { ++ fprintf(stderr, "DRM_IOCTL_MODE_CREATE_DUMB failed " ++ "for fence check: %s\n", ++ strerror(errno)); ++ goto out; ++ } ++ ++ int fd; ++ err = drmPrimeHandleToFD(dev->ro->kms_fd, create_dumb.handle, O_CLOEXEC, ++ &fd); ++ if (err < 0) { ++ fprintf(stderr, "failed to export buffer for fence check: %s\n", ++ strerror(errno)); ++ goto free_dumb; + } + +- drmFreeVersion(dev->kernel_version); +- close(dev->fd); ++ struct dma_buf_export_sync_file export = { ++ .flags = DMA_BUF_SYNC_RW, ++ }; ++ ++ /* ENOTTY is returned if the ioctl is unsupported */ ++ ++ err = drmIoctl(fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &export); ++ if (err < 0) { ++ if (errno != ENOTTY) ++ fprintf(stderr, "failed to export fence: %s\n", ++ strerror(errno)); ++ goto free_fd; ++ } ++ ++ struct dma_buf_import_sync_file import = { ++ .flags = DMA_BUF_SYNC_RW, ++ .fd = export.fd, ++ }; ++ ++ err = drmIoctl(fd, DMA_BUF_IOCTL_IMPORT_SYNC_FILE, &import); ++ if (err < 0) { ++ if (errno != ENOTTY) ++ fprintf(stderr, "failed to import fence: %s\n", ++ strerror(errno)); ++ goto free_sync; ++ } ++ ++ /* We made it this far, the kernel must support the ioctls */ ++ ret = true; ++ ++free_sync: ++ close(export.fd); ++ ++free_fd: ++ close(fd); ++ ++ /* Some compilers don't like goto to a declaration */ ++ struct drm_mode_destroy_dumb destroy_dumb; ++free_dumb: ++ destroy_dumb = (struct drm_mode_destroy_dumb) { ++ .handle = create_dumb.handle, ++ }; ++ drmIoctl(dev->ro->kms_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy_dumb); ++ ++out: ++ return ret; + } +diff --git a/src/panfrost/lib/pan_scoreboard.h b/src/panfrost/lib/pan_scoreboard.h +index f6476c66651..11820ca5432 100644 +--- a/src/panfrost/lib/pan_scoreboard.h ++++ b/src/panfrost/lib/pan_scoreboard.h +@@ -55,6 +55,7 @@ struct pan_scoreboard { + }; + + #ifdef PAN_ARCH ++#if PAN_ARCH < 10 + /* + * There are various types of Mali jobs: + * +@@ -266,6 +267,7 @@ panfrost_scoreboard_initialize_tiler(struct pan_pool *pool, + scoreboard->first_job = transfer.gpu; + return transfer; + } ++#endif /* PAN_ARCH < 10 */ + #endif /* PAN_ARCH */ + + #endif +diff --git a/src/panfrost/lib/pan_texture.h b/src/panfrost/lib/pan_texture.h +index 58dcef725b6..1780ad28ec2 100644 +--- a/src/panfrost/lib/pan_texture.h ++++ b/src/panfrost/lib/pan_texture.h +@@ -44,9 +44,15 @@ + extern "C" { + #endif + +-#define PAN_MODIFIER_COUNT 6 ++#define PAN_MODIFIER_COUNT 7 + extern uint64_t pan_best_modifiers[PAN_MODIFIER_COUNT]; + ++struct pan_image_slice_crc { ++ unsigned offset; ++ unsigned stride; ++ unsigned size; ++}; ++ + struct pan_image_slice_layout { + unsigned offset; + +@@ -80,11 +86,7 @@ struct pan_image_slice_layout { + + /* If checksumming is enabled following the slice, what + * is its offset/stride? 
*/ +- struct { +- unsigned offset; +- unsigned stride; +- unsigned size; +- } crc; ++ struct pan_image_slice_crc crc; + + unsigned size; + }; +@@ -141,11 +143,8 @@ struct pan_image_view { + } buf; + }; + +-unsigned +-panfrost_compute_checksum_size( +- struct pan_image_slice_layout *slice, +- unsigned width, +- unsigned height); ++struct pan_image_slice_crc ++panfrost_compute_checksum_size(unsigned width, unsigned height); + + /* AFBC */ + +@@ -164,6 +163,9 @@ panfrost_afbc_can_ytr(enum pipe_format format); + bool + panfrost_afbc_can_tile(const struct panfrost_device *dev); + ++bool ++panfrost_afbc_only_native(unsigned arch, enum pipe_format format); ++ + /* + * Represents the block size of a single plane. For AFBC, this represents the + * superblock size. For u-interleaving, this represents the tile size. +diff --git a/src/panfrost/lib/pan_util.h b/src/panfrost/lib/pan_util.h +index c2f883737c3..eb6b34e1566 100644 +--- a/src/panfrost/lib/pan_util.h ++++ b/src/panfrost/lib/pan_util.h +@@ -47,10 +47,16 @@ + #define PAN_DBG_LINEAR 0x1000 + #define PAN_DBG_NO_CACHE 0x2000 + #define PAN_DBG_DUMP 0x4000 +- + #ifndef NDEBUG + #define PAN_DBG_OVERFLOW 0x8000 + #endif ++#define PAN_DBG_TILER 0x010000 ++#define PAN_DBG_BO_LOG 0x020000 ++#define PAN_DBG_BO_CLEAR 0x040000 ++#define PAN_DBG_UNCACHED_GPU 0x100000 ++#define PAN_DBG_UNCACHED_CPU 0x200000 ++#define PAN_DBG_LOG 0x400000 ++#define PAN_DBG_GOFASTER 0x800000 + + struct panfrost_device; + +diff --git a/src/panfrost/lib/wrap.h b/src/panfrost/lib/wrap.h +index 56bb0f48aed..c706cc65308 100644 +--- a/src/panfrost/lib/wrap.h ++++ b/src/panfrost/lib/wrap.h +@@ -46,6 +46,8 @@ void pandecode_initialize(bool to_stderr); + + void pandecode_next_frame(void); + ++void pandecode_dump_file_close(void); ++ + void pandecode_close(void); + + void +@@ -55,6 +57,10 @@ void pandecode_inject_free(uint64_t gpu_va, unsigned sz); + + void pandecode_jc(uint64_t jc_gpu_va, unsigned gpu_id); + ++void pandecode_cs(uint64_t cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ ++void pandecode_dump_mappings(void); ++ + void + pandecode_abort_on_fault(uint64_t jc_gpu_va, unsigned gpu_id); + +diff --git a/src/panfrost/meson.build b/src/panfrost/meson.build +index 66847f64569..7cbd81927b4 100644 +--- a/src/panfrost/meson.build ++++ b/src/panfrost/meson.build +@@ -20,7 +20,7 @@ + # SOFTWARE. 
+ + inc_panfrost_hw = include_directories([ +- 'include' ++ 'include', 'base' + ]) + + inc_panfrost = include_directories([ +@@ -36,6 +36,8 @@ subdir('util') + subdir('midgard') + subdir('bifrost') + ++subdir('base') ++ + if with_gallium_panfrost or with_panfrost_vk + subdir('lib') + subdir('perf') +@@ -71,6 +73,46 @@ bifrost_compiler = executable( + build_by_default : with_tools.contains('panfrost') + ) + ++csf_test = executable( ++ 'csf_test', ++ ['csf_test/test.c'], ++ include_directories : [ ++ inc_mapi, ++ inc_mesa, ++ inc_gallium, ++ inc_gallium_aux, ++ inc_include, ++ inc_src, ++ inc_panfrost, ++ inc_panfrost_hw, ++ ], ++ dependencies : [ ++ idep_nir, ++ idep_mesautil, ++ idep_bi_opcodes_h, ++ dep_libdrm, ++ libpanfrost_dep, ++ ], ++ build_by_default : true ++) ++ ++custom_target( ++ 'panfrost_panloader', ++ output: ['panfrost_panloader.txt'], ++ depends : [ ++ libpanfrost_lib, ++ libpanfrost_util, ++ _libmesa_util, ++ libpanfrost_decode, ++ libpanfrost_decode_per_arch, ++ libpanfrost_midgard_disasm, ++ libpanfrost_bifrost_disasm, ++ libpanfrost_valhall_disasm, ++ ], ++ command: ['touch', '@OUTPUT@'], ++ build_by_default : false, ++) ++ + if with_panfrost_vk + subdir('vulkan') + endif +diff --git a/src/panfrost/midgard/disassemble.c b/src/panfrost/midgard/disassemble.c +index b47902a9ce3..3643e5a6029 100644 +--- a/src/panfrost/midgard/disassemble.c ++++ b/src/panfrost/midgard/disassemble.c +@@ -1242,7 +1242,9 @@ print_alu_word(disassemble_context *ctx, FILE *fp, uint32_t *words, + UNUSED static void + print_varying_parameters(FILE *fp, midgard_load_store_word *word) + { +- midgard_varying_params p = midgard_unpack_varying_params(*word); ++ unsigned params = word->signed_offset & 0x1FF; ++ midgard_varying_params p; ++ memcpy(&p, ¶ms, sizeof(p)); + + /* If a varying, there are qualifiers */ + if (p.flat_shading) +diff --git a/src/panfrost/tiler/tiler-hex-read b/src/panfrost/tiler/tiler-hex-read +new file mode 100755 +index 00000000000..1c188e38ec1 +--- /dev/null ++++ b/src/panfrost/tiler/tiler-hex-read +@@ -0,0 +1,400 @@ ++#!/usr/bin/env python3 ++ ++import sys ++import struct ++ ++FLIP_Y = False ++ ++data = b'' ++ ++fb_width = 160 ++fb_height = 160 ++hierarchy_mask = 0xffff ++ ++HEAP_OFS = 0x8000 ++ ++base_ptr = 0 ++heap_ptr = 0 ++midgard = False ++bifrost = True ++valhall = False ++size = None ++ ++bak_data = b'' ++ ++cur_data = b'' ++ ++# TODO: More robust looping.. 
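++# Example input, derived from the keyword checks in the loop below; the
++# addresses and sizes here are made up:
++#   width 160
++#   height 160
++#   mask 0xffff
++#   addr 12345000         (Midgard base address; use "vaheap <addr>" on Valhall)
++#   heap 12346000         (optional: hex lines after this key form the heap dump)
++#   size 0x10000
++#   8000 00 01 02 03 ...  (hex offset, then byte values; "*" tokens are skipped)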
++for line in sys.stdin.read().split("\n"): ++ print(line) ++ split = line.split(" ") ++ if not len(split) or split[0] == "": ++ continue ++ if split[0] == "width": ++ fb_width = int(split[1]) ++ continue ++ if split[0] == "height": ++ fb_height = int(split[1]) ++ continue ++ if split[0] == "mask": ++ hierarchy_mask = int(split[1], 0) ++ continue ++ if split[0] == "vaheap": ++ base_ptr = int(split[1], 16) ++ bifrost = False ++ valhall = True ++ continue ++ if split[0] == "addr": ++ base_ptr = int(split[1], 16) ++ bifrost = False ++ midgard = True ++ HEAP_OFS = 0x40 ++ continue ++ if split[0] == "heap": ++ heap_ptr = int(split[1], 16) ++ data += cur_data ++ cur_data = b'' ++ bak_data = data ++ data = b'' ++ continue ++ if split[0] == "size": ++ size = int(split[1], 0) ++ continue ++ offset = int(split[0], 16) ++ if offset > len(data): ++ data += cur_data ++ cur_data = b'' ++ data += b'\0' * (offset - len(data)) ++ for d in split[1:]: ++ if d == "" or d == "*": ++ continue ++ cur_data += bytes([int(d, 16)]) ++ ++data += cur_data ++ ++if heap_ptr: ++ data, heap_data = bak_data, data ++ ++if size == None: ++ size = len(data) ++ ++def int7(val, signed=True): ++ val = val & 0x7f ++ if signed and val >= 0x40: ++ return val - 0x80 ++ else: ++ return val ++ ++def int8(val, signed=True): ++ val = val & 0xff ++ if signed and val >= 0x80: ++ return val - 0x100 ++ else: ++ return val ++ ++def fetch(ptr, size): ++ if midgard: ++ if ptr >= base_ptr and ptr < base_ptr + len(data): ++ base = ptr - base_ptr ++ return data[base:base+size] ++ elif ptr >= heap_ptr and ptr < heap_ptr + len(heap_data): ++ base = ptr - heap_ptr ++ return heap_data[base:base+size] ++ else: ++ if valhall: ++ ptr -= base_ptr ++ if ptr < 0: ++ return b"" ++ return data[ptr:ptr+size] ++ ++def print_draw(ptr): ++ draw = fetch(ptr, 128) ++ if len(draw) < 128: ++ print(" couldn't fetch draw struct") ++ return ++ decoded = struct.unpack("=16Q", draw) ++ coverage = [0 for x in decoded] ++ ++ fields = ( ++ ("Allow forward pixel to kill", 1, "0:0", "bool"), ++ ("Allow forward pixel to be killed", 1, "0:1", "bool"), ++ ("Pixel kill operation", 2, "0:2", "Pixel Kill"), ++ ("ZS update operation", 2, "0:4", "Pixel Kill"), ++ ("Allow primitive reorder", 1, "0:6", "bool"), ++ ("Overdraw alpha0", 1, "0:7", "bool"), ++ ("Overdraw alpha1", 1, "0:8", "bool"), ++ ("Clean Fragment Write", 1, "0:9", "bool"), ++ ("Primitive Barrier", 1, "0:10", "bool"), ++ ("Evaluate per-sample", 1, "0:11", "bool"), ++ ("Single-sampled lines", 1, "0:13", "bool"), ++ ("Occlusion query", 2, "0:14", "Occlusion Mode"), ++ ("Front face CCW", 1, "0:16", "bool"), ++ ("Cull front face", 1, "0:17", "bool"), ++ ("Cull back face", 1, "0:18", "bool"), ++ ("Multisample enable", 1, "0:19", "bool"), ++ ("Shader modifies coverage", 1, "0:20", "bool"), ++ ("Alpha-to-coverage Invert", 1, "0:21", "bool"), ++ ("Alpha-to-coverage", 1, "0:22", "bool"), ++ ("Scissor to bounding box", 1, "0:23", "bool"), ++ ("Sample mask", 16, "1:0", "uint"), ++ ("Render target mask", 8, "1:16", "hex"), ++ ++ ("Packet", 1, "2:0", "bool"), ++ # TODO: shr modifier ++ ("Vertex array", 64, "2:0", "address"), ++ ("Vertex packet stride", 16, "4:0", "uint"), ++ ("Vertex attribute stride", 16, "4:16", "uint"), ++ ("Unk", 16, "5:0", "uint"), ++ ++ ("Minimum Z", 32, "6:0", "float"), ++ ("Maximum Z", 32, "7:0", "float"), ++ ("Depth/stencil", 64, "10:0", "address"), ++ ("Blend count", 4, "12:0", "uint"), ++ ("Blend", 60, "12:4", "address"), ++ ("Occlusion", 64, "14:0", "address"), ++ ++ ("Attribute offset", 32, "16:0", 
"uint"), ++ ("FAU count", 8, "17:0", "uint"), ++ ("Resources", 48, "24:0", "address"), ++ ("Shader", 48, "26:0", "address"), ++ ("Thread storage", 48, "28:0", "address"), ++ ("FAU", 64, "30:0", "address"), ++ ) ++ ++ for f in fields: ++ name, size, start, type = f ++ word, bit = [int(x) for x in start.split(":")] ++ if word & 1: ++ bit += 32 ++ word >>= 1 ++ ++ mask = (1 << size) - 1 ++ data = (decoded[word] >> bit) & mask ++ coverage[word] |= mask << bit ++ if type == "float": ++ data = struct.unpack("=f", struct.pack("=I", data))[0] ++ else: ++ data = hex(data) ++ print(f" {name}: {data}") ++ ++ for i, (d, c) in enumerate(zip(decoded, coverage)): ++ ci = c ^ ((1 << 64) - 1) ++ if d & ci: ++ print(f" unk at 64-bit word {i}: {hex(d)} (known mask {hex(c)})") ++ ++def print_vertex(ptr, positions): ++ for p in positions: ++ addr = ptr + p * 16 ++ data = fetch(addr, 16) ++ if len(data) < 16: ++ print(f" ") ++ continue ++ x, y, z, w = struct.unpack("=4f", data) ++ print(f" <{x} {y} {z} {w}>") ++ ++DRAW_TYPES = [ ++ "unk", ++ "points", ++ "lines", ++ "tris", ++] ++ ++def heap_interpret(start, end): ++ print(f"interpreting from {hex(start)} to {hex(end)}") ++ ++ struct_count = 0 ++ ++ signed = True ++ ++ base = 0 ++ a = 0 ++ b = 0 ++ c = 0 ++ ++ num_vert = 3 ++ ++ draw_ptr = 0 ++ pos_ptr = 0 ++ ++ while start != end: ++ if midgard and start & 0x1ff == 0x1f8: ++ jump = struct.unpack("=Q", fetch(start, 8))[0] ++ print(f"jump mdg: {hex(jump)}") ++ start = jump ++ continue ++ ++ dat = fetch(start, 4) ++ if dat[3] & 0xe0 == 0x80: ++ struct_count += 1 ++ ++ print(f"{struct_count}:", " ".join([f"{hex(x)[2:].upper():>02}" for x in dat]), end=" ") ++ ++ masked_op = dat[3] & ~3 ++ ++ up = struct.unpack("=I", dat)[0] ++ ++ if valhall: ++ tri0 = tri0_7 = int7(up >> 15, signed) ++ tri1 = int7(up >> 8, signed) ++ tri2 = int7(up >> 1, signed) ++ else: ++ tri0 = int8(up >> 14, signed) ++ tri0_7 = int7(up >> 14, signed) ++ tri1 = int7(up >> 7, signed) ++ tri2 = int7(up, signed) ++ ++ signed = True ++ ++ if dat[3] & 0xe0 == 0x80: ++ res = "" ++ if valhall: ++ address = (up & 0x7ffffff) * 32 ++ num_vert = (dat[3] >> 3) & 0x3 ++ else: ++ address = (up & 0xffffff) * 64 ++ num_vert = (dat[3] >> 2) & 0x3 ++ if dat[3] & 0x10: ++ a = 0 ++ res = " reset" ++ draw_ptr = address ++ if valhall: ++ pos_ptr = address + 128 ++ print(f"draw {DRAW_TYPES[num_vert]}{res}: {hex(address)}") ++ elif valhall and dat[3] >> 4 == 12: ++ unk1 = up & 0x3f ++ address = (up >> 6) & 0xffff ++ unk2 = up >> 22 ++ draw_ptr += address << 32 ++ pos_ptr += address << 32 ++ print(f"draw offset: {hex(address)}, unk {hex(unk1)}, {hex(unk2)}") ++ ++ print_draw(draw_ptr) ++ elif dat[3] >> 6 == 1: ++ # TODO: handle two of these in a row ++ res = "" ++ if valhall: ++ # TOOD: Is the mask correct? 
++ pf = (up >> 22) & 0x7f ++ shift = 7 ++ if dat[3] & 0x20: ++ a = 0 ++ res = " reset" ++ else: ++ pf = (up >> 21) & 0x7f ++ shift = 8 ++ ++ a += tri0_7 << shift ++ b += tri1 << 7 ++ c += tri2 << 7 ++ print(f"primitive offset{res}: {hex(pf << 4)} | +{tri0_7 << shift} {tri1 << 7} {tri2 << 7}") ++ signed = False ++ # TODO: Jumps are located based on position, not opcode ++ elif dat[3] == 0xff: ++ up64 = struct.unpack("=Q", fetch(start, 8))[0] ++ assert((up64 & 3) == 3) ++ print(f"jump (from {hex(start+8)}-8): {hex(up64 - 3)}") ++ start = up64 - 7 ++ elif dat[3] == 0x00: ++ assert((up & 3) == 3) ++ print(f"jump (from {hex(start+4)}-4): {hex(up - 3)}, {hex(HEAP_OFS + up - 3)}") ++ start = HEAP_OFS + up - 7 ++ elif (masked_op & 0xc0) == 0: ++ mode = hex(dat[3] >> 2) ++ ++ pre_offset = (up >> 22) & 0xf ++ ++ unk = "" ++ if valhall and up & 1: ++ unk = ", unk 1" ++ ++ a += base + tri0 ++ b += a + tri1 ++ c += a + tri2 ++ base = a ++ ++ print(f"{mode} draw: {hex(pre_offset)} | +{tri0} {tri1} {tri2}{unk}") ++ ++ print_vertex(pos_ptr, [a, b, c][:num_vert]) ++ ++ a = b = c = 0 ++ ++ else: ++ print(f"Unknown opcode {hex(dat[3])}") ++ ++ start += 4 ++ ++def level_list(): ++ levels = [] ++ size = 16 ++ anylevel = False ++ ++ # TODO: Does this miss the largest level? ++ while anylevel == False or size // 2 < min(fb_width, fb_height): ++ if (hierarchy_mask << 4) & size != 0: ++ anylevel = True ++ levels.append(size) ++ ++ size *= 2 ++ ++ return levels ++ ++def div_round_up(x, y): ++ return (x + y - 1) // y ++ ++def align(x, y): ++ return div_round_up(x, y) * y ++ ++def tile_count(alignment=4): ++ return sum(align(div_round_up(fb_width, size) * div_round_up(fb_height, size), 4) ++ for size in level_list()) ++ ++if midgard: ++ unpacked_header = list(struct.unpack("=16i", data[0:64])) ++ # Is this really big endian? ++ unpacked_header[5:7] = struct.unpack(">2i", data[20:28]) ++ print(f"header: {' '.join([str(x) for x in unpacked_header])}") ++ ++ # Extra is because of HEAP_OFS ++ header_size = align(tile_count() + 8, 64) ++elif valhall: ++ # TODO: Does this figure need alignment? 
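++    # Assumed layout, inferred from the footer fetch further down: the dump
++    # ends with two per-tile tables of 8-byte entries (headers, then footers
++    # one HEAP_STRIDE later), hence the offset of size - 2 * HEAP_STRIDE.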
++ HEAP_STRIDE = tile_count() * 8 ++ HEAP_OFS = size - HEAP_STRIDE * 2 ++ ++pos = base_ptr + HEAP_OFS ++ ++for size in level_list(): ++ for y in range((fb_height + size - 1) // size): ++ for x in range((fb_width + size - 1) // size): ++ header = fetch(pos, 8) ++ if len(header) == 0: ++ break ++ ++ if midgard: ++ end = struct.unpack("=Q", header)[0] ++ use = bool(end) ++ end += 4 ++ start = base_ptr + header_size * 8 + (pos - base_ptr - HEAP_OFS) * 64 ++ elif bifrost: ++ end, start = struct.unpack("=II", header) ++ use = bool(end) ++ start += HEAP_OFS ++ end += HEAP_OFS + 4 ++ end &= ~3 ++ else: ++ footer = fetch(pos + HEAP_STRIDE, 8) ++ if len(footer) == 0: ++ break ++ start, end = struct.unpack("=QQ", header + footer) ++ use = bool(end) ++ # The upper bits are used for jump metadata ++ end &= (1 << 48) - 1 ++ end += 4 ++ if use: ++ if FLIP_Y: ++ print([x * size, fb_height - (y + 1) * size], ((x + 1) * size, fb_height - y * size)) ++ else: ++ print([x * size, y * size], ((x + 1) * size, (y + 1) * size)) ++ heap_interpret(start, end) ++ ++ pos += 8 +diff --git a/src/util/os_misc.c b/src/util/os_misc.c +index 13963afdffe..e5ade02e70b 100644 +--- a/src/util/os_misc.c ++++ b/src/util/os_misc.c +@@ -53,7 +53,6 @@ + # define LOG_TAG "MESA" + # include + # include +-# include + #elif DETECT_OS_LINUX || DETECT_OS_CYGWIN || DETECT_OS_SOLARIS || DETECT_OS_HURD + # include + #elif DETECT_OS_OPENBSD || DETECT_OS_FREEBSD +@@ -123,93 +122,10 @@ os_log_message(const char *message) + #endif + } + +-#if DETECT_OS_ANDROID +-# include +-# include "hash_table.h" +-# include "ralloc.h" +-# include "simple_mtx.h" +- +-static struct hash_table *options_tbl; +- +-static void +-options_tbl_fini(void) +-{ +- _mesa_hash_table_destroy(options_tbl, NULL); +-} +- +-/** +- * Get an option value from android's property system, as a fallback to +- * getenv() (which is generally less useful on android due to processes +- * typically being forked from the zygote. +- * +- * The option name used for getenv is translated into a property name +- * by: +- * +- * 1) convert to lowercase +- * 2) replace '_' with '.' +- * 3) if necessary, prepend "mesa." +- * +- * For example: +- * - MESA_EXTENSION_OVERRIDE -> mesa.extension.override +- * - GALLIUM_HUD -> mesa.gallium.hud +- * +- * Note that we use a hashtable for two purposes: +- * 1) Avoid re-translating the option name on subsequent lookups +- * 2) Avoid leaking memory. Because property_get() returns the +- * property value into a user allocated buffer, we cannot return +- * that directly to the caller, so we need to strdup(). With the +- * hashtable, subsquent lookups can return the existing string. +- */ +-static const char * +-os_get_android_option(const char *name) +-{ +- if (!options_tbl) { +- options_tbl = _mesa_hash_table_create(NULL, _mesa_hash_string, +- _mesa_key_string_equal); +- atexit(options_tbl_fini); +- } +- +- struct hash_entry *entry = _mesa_hash_table_search(options_tbl, name); +- if (entry) { +- return entry->data; +- } +- +- char value[PROPERTY_VALUE_MAX]; +- char key[PROPERTY_KEY_MAX]; +- char *p = key, *end = key + PROPERTY_KEY_MAX; +- /* add "mesa." 
prefix if necessary: */ +- if (strstr(name, "MESA_") != name) +- p += strlcpy(p, "mesa.", end - p); +- p += strlcpy(p, name, end - p); +- for (int i = 0; key[i]; i++) { +- if (key[i] == '_') { +- key[i] = '.'; +- } else { +- key[i] = tolower(key[i]); +- } +- } +- +- const char *opt = NULL; +- int len = property_get(key, value, NULL); +- if (len > 1) { +- opt = ralloc_strdup(options_tbl, value); +- } +- +- _mesa_hash_table_insert(options_tbl, name, (void *)opt); +- +- return opt; +-} +-#endif +- + const char * + os_get_option(const char *name) + { + const char *opt = getenv(name); +-#if DETECT_OS_ANDROID +- if (!opt) { +- opt = os_get_android_option(name); +- } +-#endif + return opt; + } + +diff --git a/src/util/perf/cpu_trace.h b/src/util/perf/cpu_trace.h +index c13a3821158..e8423d40407 100644 +--- a/src/util/perf/cpu_trace.h ++++ b/src/util/perf/cpu_trace.h +@@ -27,19 +27,6 @@ + util_perfetto_trace_end(category); \ + } while (0) + +-/* NOTE: for now disable atrace for C++ to workaround a ndk bug with ordering +- * between stdatomic.h and atomic.h. See: +- * +- * https://github.com/android/ndk/issues/1178 +- */ +-#elif defined(ANDROID) && !defined(__cplusplus) +- +-#include +- +-#define _MESA_TRACE_BEGIN(category, name) \ +- atrace_begin(ATRACE_TAG_GRAPHICS, name) +-#define _MESA_TRACE_END(category) atrace_end(ATRACE_TAG_GRAPHICS) +- + #else + + #define _MESA_TRACE_BEGIN(category, name) +diff --git a/src/util/stable_array.h b/src/util/stable_array.h +new file mode 100644 +index 00000000000..a590aa48a50 +--- /dev/null ++++ b/src/util/stable_array.h +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef STABLE_ARRAY_H ++#define STABLE_ARRAY_H ++ ++#include "util/simple_mtx.h" ++#include "util/u_math.h" ++ ++/* A thread-safe automatically growing array where elements have stable locations ++ * ++ * This data structure has these properties: ++ * ++ * 1. Accessing an element is constant time (if allocation is not required). ++ * ++ * 2. Elements are not moved in memory, so it is safe to store a pointer to ++ * something in a stable_array. ++ * ++ * 3. The data structure is thread-safe. To improve performance, there is ++ * also a fast path that does not require atomics. ++ * ++ * 4. 
Although the data structure is not lock-free, there is a limit on the ++ * number of times that a lock is ever acquired--a maximum of 32 times the ++ * number of accessing threads. In practice, contention will never be an ++ * issue for long-lived stable_arrays. ++ * ++ * 5. Memory usage is similar to util_dynarray, with each allocation being ++ * twice as large as the last. Freeing buckets is currently never done. ++ * ++ * The data structure is faster than util_sparse_array, but is not sparse. ++ */ ++ ++struct stable_array ++{ ++ uint8_t *buckets[32]; ++ simple_mtx_t lock; ++ size_t eltsize; ++}; ++ ++static inline void ++stable_array_init_bytes(struct stable_array *buf, size_t eltsize) ++{ ++ memset(buf, 0, sizeof(*buf)); ++ buf->eltsize = eltsize; ++ simple_mtx_init(&buf->lock, mtx_plain); ++} ++ ++static inline void ++stable_array_fini(struct stable_array *buf) ++{ ++ simple_mtx_destroy(&buf->lock); ++ for (unsigned i = 0; i < ARRAY_SIZE(buf->buckets); ++i) { ++ if (buf->buckets[i]) ++ free(buf->buckets[i]); ++ } ++} ++ ++struct stable_array_index ++{ ++ unsigned bucket; ++ unsigned idx; ++}; ++ ++static inline struct stable_array_index ++stable_array_get_index(unsigned idx) ++{ ++ struct stable_array_index i = {0}; ++ i.bucket = util_logbase2(idx); ++ i.idx = i.bucket ? (idx -= (1 << i.bucket)) : idx; ++ return i; ++} ++ ++static inline void * ++stable_array_get_bytes(struct stable_array *buf, unsigned idx, size_t eltsize) ++{ ++ assert(eltsize == buf->eltsize); ++ ++ struct stable_array_index i = stable_array_get_index(idx); ++ ++ uint8_t *bucket = p_atomic_read(&buf->buckets[i.bucket]); ++ ++ if (!bucket) { ++ simple_mtx_lock(&buf->lock); ++ bucket = buf->buckets[i.bucket]; ++ ++ if (!bucket) { ++ /* The first two buckets both have two elements */ ++ bucket = (uint8_t *)calloc(1U << MAX2(i.bucket, 1), eltsize); ++ ++ p_atomic_set(&buf->buckets[i.bucket], bucket); ++ } ++ simple_mtx_unlock(&buf->lock); ++ } ++ ++ return bucket + eltsize * i.idx; ++} ++ ++static inline void * ++stable_array_get_existing_bytes(struct stable_array *buf, unsigned idx, size_t eltsize) ++{ ++ assert(eltsize == buf->eltsize); ++ ++ struct stable_array_index i = stable_array_get_index(idx); ++ ++ return buf->buckets[i.bucket] + eltsize * i.idx; ++} ++ ++#define stable_array_init(buf, type) stable_array_init_bytes((buf), sizeof(type)) ++#define stable_array_get(buf, type, idx) ((type*)stable_array_get_bytes((buf), (idx), sizeof(type))) ++#define stable_array_get_existing(buf, type, idx) ((type*)stable_array_get_existing_bytes((buf), (idx), sizeof(type))) ++ ++#endif +diff --git a/src/util/u_debug_stack_android.cpp b/src/util/u_debug_stack_android.cpp +index 2c7b2d53676..f31389752bd 100644 +--- a/src/util/u_debug_stack_android.cpp ++++ b/src/util/u_debug_stack_android.cpp +@@ -21,7 +21,6 @@ + * IN THE SOFTWARE. + */ + +-#include + + #include "util/simple_mtx.h" + #include "util/u_debug.h" +@@ -52,56 +51,14 @@ debug_backtrace_capture(debug_stack_frame *backtrace, + unsigned start_frame, + unsigned nr_frames) + { +- Backtrace *bt; + +- if (!nr_frames) +- return; +- +- bt = Backtrace::Create(BACKTRACE_CURRENT_PROCESS, +- BACKTRACE_CURRENT_THREAD); +- if (bt == NULL) { +- for (unsigned i = 0; i < nr_frames; i++) +- backtrace[i].procname = NULL; +- return; +- } +- +- /* Add one to exclude this call. Unwind already ignores itself. 
*/ +- bt->Unwind(start_frame + 1); +- +- simple_mtx_lock(&table_mutex); +- +- for (unsigned i = 0; i < nr_frames; i++) { +- const backtrace_frame_data_t* frame = bt->GetFrame(i); +- if (frame) { +- backtrace[i].procname = intern_symbol(frame->func_name.c_str()); +- backtrace[i].start_ip = frame->pc; +- backtrace[i].off = frame->func_offset; +- backtrace[i].map = intern_symbol(frame->map.Name().c_str()); +- backtrace[i].map_off = frame->rel_pc; +- } else { +- backtrace[i].procname = NULL; +- } +- } +- +- simple_mtx_unlock(&table_mutex); +- +- delete bt; + } + + void + debug_backtrace_dump(const debug_stack_frame *backtrace, + unsigned nr_frames) + { +- for (unsigned i = 0; i < nr_frames; i++) { +- if (backtrace[i].procname) +- debug_printf( +- "%s(+0x%x)\t%012" PRIx64 ": %s+0x%x\n", +- backtrace[i].map, +- backtrace[i].map_off, +- backtrace[i].start_ip, +- backtrace[i].procname, +- backtrace[i].off); +- } ++ + } + + void +@@ -109,14 +66,5 @@ debug_backtrace_print(FILE *f, + const debug_stack_frame *backtrace, + unsigned nr_frames) + { +- for (unsigned i = 0; i < nr_frames; i++) { +- if (backtrace[i].procname) +- fprintf(f, +- "%s(+0x%x)\t%012" PRIx64 ": %s+0x%x\n", +- backtrace[i].map, +- backtrace[i].map_off, +- backtrace[i].start_ip, +- backtrace[i].procname, +- backtrace[i].off); +- } ++ + } diff --git a/src/amd/vulkan/radv_buffer_view.c b/src/amd/vulkan/radv_buffer_view.c new file mode 100644 index 00000000000..f1e09d49dfe --- /dev/null +++ b/src/amd/vulkan/radv_buffer_view.c @@ -0,0 +1,149 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "gfx10_format_table.h" + +#include "radv_private.h" + +void +radv_make_texel_buffer_descriptor(struct radv_device *device, uint64_t va, VkFormat vk_format, unsigned offset, + unsigned range, uint32_t *state) +{ + const struct util_format_description *desc; + unsigned stride; + unsigned num_format, data_format; + int first_non_void; + enum pipe_swizzle swizzle[4]; + unsigned rsrc_word3; + + desc = vk_format_description(vk_format); + first_non_void = vk_format_get_first_non_void_channel(vk_format); + stride = desc->block.bits / 8; + + radv_compose_swizzle(desc, NULL, swizzle); + + va += offset; + + if (device->physical_device->rad_info.gfx_level != GFX8 && stride) { + range /= stride; + } + + rsrc_word3 = S_008F0C_DST_SEL_X(radv_map_swizzle(swizzle[0])) | S_008F0C_DST_SEL_Y(radv_map_swizzle(swizzle[1])) | + S_008F0C_DST_SEL_Z(radv_map_swizzle(swizzle[2])) | S_008F0C_DST_SEL_W(radv_map_swizzle(swizzle[3])); + + if (device->physical_device->rad_info.gfx_level >= GFX10) { + const struct gfx10_format *fmt = + &ac_get_gfx10_format_table(&device->physical_device->rad_info)[vk_format_to_pipe_format(vk_format)]; + + /* OOB_SELECT chooses the out-of-bounds check. + * + * GFX10: + * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE) + * - 1: index >= NUM_RECORDS + * - 2: NUM_RECORDS == 0 + * - 3: if SWIZZLE_ENABLE: + * swizzle_address >= NUM_RECORDS + * else: + * offset >= NUM_RECORDS + * + * GFX11: + * - 0: (index >= NUM_RECORDS) || (offset+payload > STRIDE) + * - 1: index >= NUM_RECORDS + * - 2: NUM_RECORDS == 0 + * - 3: if SWIZZLE_ENABLE && STRIDE: + * (index >= NUM_RECORDS) || ( offset+payload > STRIDE) + * else: + * offset+payload > NUM_RECORDS + */ + rsrc_word3 |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | + S_008F0C_RESOURCE_LEVEL(device->physical_device->rad_info.gfx_level < GFX11); + } else { + num_format = radv_translate_buffer_numformat(desc, first_non_void); + data_format = radv_translate_buffer_dataformat(desc, first_non_void); + + assert(data_format != V_008F0C_BUF_DATA_FORMAT_INVALID); + assert(num_format != ~0); + + rsrc_word3 |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format); + } + + state[0] = va; + state[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); + state[2] = range; + state[3] = rsrc_word3; +} + +void +radv_buffer_view_init(struct radv_buffer_view *view, struct radv_device *device, + const VkBufferViewCreateInfo *pCreateInfo) +{ + RADV_FROM_HANDLE(radv_buffer, buffer, pCreateInfo->buffer); + uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset; + + vk_buffer_view_init(&device->vk, &view->vk, pCreateInfo); + + view->bo = buffer->bo; + + radv_make_texel_buffer_descriptor(device, va, view->vk.format, view->vk.offset, view->vk.range, view->state); +} + +void +radv_buffer_view_finish(struct radv_buffer_view *view) +{ + vk_buffer_view_finish(&view->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +radv_CreateBufferView(VkDevice _device, const VkBufferViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkBufferView *pView) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_buffer_view *view; + + view = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*view), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!view) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + radv_buffer_view_init(view, device, pCreateInfo); + + *pView = radv_buffer_view_to_handle(view); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL 
+radv_DestroyBufferView(VkDevice _device, VkBufferView bufferView, const VkAllocationCallbacks *pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_buffer_view, view, bufferView); + + if (!view) + return; + + radv_buffer_view_finish(view); + vk_free2(&device->vk.alloc, pAllocator, view); +} diff --git a/src/amd/vulkan/radv_image_view.c b/src/amd/vulkan/radv_image_view.c new file mode 100644 index 00000000000..463ababade4 --- /dev/null +++ b/src/amd/vulkan/radv_image_view.c @@ -0,0 +1,945 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "radv_private.h" + +#include "gfx10_format_table.h" + +static unsigned +gfx9_border_color_swizzle(const struct util_format_description *desc) +{ + unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; + + if (desc->format == PIPE_FORMAT_S8_UINT) { + /* Swizzle of 8-bit stencil format is defined as _x__ but the hw expects XYZW. */ + assert(desc->swizzle[1] == PIPE_SWIZZLE_X); + return bc_swizzle; + } + + if (desc->swizzle[3] == PIPE_SWIZZLE_X) { + /* For the pre-defined border color values (white, opaque + * black, transparent black), the only thing that matters is + * that the alpha channel winds up in the correct place + * (because the RGB channels are all the same) so either of + * these enumerations will work. + */ + if (desc->swizzle[2] == PIPE_SWIZZLE_Y) + bc_swizzle = V_008F20_BC_SWIZZLE_WZYX; + else + bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ; + } else if (desc->swizzle[0] == PIPE_SWIZZLE_X) { + if (desc->swizzle[1] == PIPE_SWIZZLE_Y) + bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; + else + bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ; + } else if (desc->swizzle[1] == PIPE_SWIZZLE_X) { + bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ; + } else if (desc->swizzle[2] == PIPE_SWIZZLE_X) { + bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW; + } + + return bc_swizzle; +} + +static unsigned +radv_tex_dim(VkImageType image_type, VkImageViewType view_type, unsigned nr_layers, unsigned nr_samples, + bool is_storage_image, bool gfx9) +{ + if (view_type == VK_IMAGE_VIEW_TYPE_CUBE || view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) + return is_storage_image ? V_008F1C_SQ_RSRC_IMG_2D_ARRAY : V_008F1C_SQ_RSRC_IMG_CUBE; + + /* GFX9 allocates 1D textures as 2D. 
*/ + if (gfx9 && image_type == VK_IMAGE_TYPE_1D) + image_type = VK_IMAGE_TYPE_2D; + switch (image_type) { + case VK_IMAGE_TYPE_1D: + return nr_layers > 1 ? V_008F1C_SQ_RSRC_IMG_1D_ARRAY : V_008F1C_SQ_RSRC_IMG_1D; + case VK_IMAGE_TYPE_2D: + if (nr_samples > 1) + return nr_layers > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : V_008F1C_SQ_RSRC_IMG_2D_MSAA; + else + return nr_layers > 1 ? V_008F1C_SQ_RSRC_IMG_2D_ARRAY : V_008F1C_SQ_RSRC_IMG_2D; + case VK_IMAGE_TYPE_3D: + if (view_type == VK_IMAGE_VIEW_TYPE_3D) + return V_008F1C_SQ_RSRC_IMG_3D; + else + return V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + default: + unreachable("illegal image type"); + } +} + +static inline unsigned +si_tile_mode_index(const struct radv_image_plane *plane, unsigned level, bool stencil) +{ + if (stencil) + return plane->surface.u.legacy.zs.stencil_tiling_index[level]; + else + return plane->surface.u.legacy.tiling_index[level]; +} + +void +si_set_mutable_tex_desc_fields(struct radv_device *device, struct radv_image *image, + const struct legacy_surf_level *base_level_info, unsigned plane_id, unsigned base_level, + unsigned first_level, unsigned block_width, bool is_stencil, bool is_storage_image, + bool disable_compression, bool enable_write_compression, uint32_t *state, + const struct ac_surf_nbc_view *nbc_view) +{ + struct radv_image_plane *plane = &image->planes[plane_id]; + struct radv_image_binding *binding = image->disjoint ? &image->bindings[plane_id] : &image->bindings[0]; + uint64_t gpu_address = binding->bo ? radv_buffer_get_va(binding->bo) + binding->offset : 0; + uint64_t va = gpu_address; + uint8_t swizzle = plane->surface.tile_swizzle; + enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level; + uint64_t meta_va = 0; + if (gfx_level >= GFX9) { + if (is_stencil) + va += plane->surface.u.gfx9.zs.stencil_offset; + else + va += plane->surface.u.gfx9.surf_offset; + if (nbc_view && nbc_view->valid) { + va += nbc_view->base_address_offset; + swizzle = nbc_view->tile_swizzle; + } + } else + va += (uint64_t)base_level_info->offset_256B * 256; + + state[0] = va >> 8; + if (gfx_level >= GFX9 || base_level_info->mode == RADEON_SURF_MODE_2D) + state[0] |= swizzle; + state[1] &= C_008F14_BASE_ADDRESS_HI; + state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); + + if (gfx_level >= GFX8) { + state[6] &= C_008F28_COMPRESSION_EN; + state[7] = 0; + if (!disable_compression && radv_dcc_enabled(image, first_level)) { + meta_va = gpu_address + plane->surface.meta_offset; + if (gfx_level <= GFX8) + meta_va += plane->surface.u.legacy.color.dcc_level[base_level].dcc_offset; + + unsigned dcc_tile_swizzle = swizzle << 8; + dcc_tile_swizzle &= (1 << plane->surface.meta_alignment_log2) - 1; + meta_va |= dcc_tile_swizzle; + } else if (!disable_compression && radv_image_is_tc_compat_htile(image)) { + meta_va = gpu_address + plane->surface.meta_offset; + } + + if (meta_va) { + state[6] |= S_008F28_COMPRESSION_EN(1); + if (gfx_level <= GFX9) + state[7] = meta_va >> 8; + } + } + + /* GFX10.3+ can set a custom pitch for 1D and 2D non-array, but it must be a multiple + * of 256B. + * + * If an imported image is used with VK_IMAGE_VIEW_TYPE_2D_ARRAY, it may hang due to VM faults + * because DEPTH means pitch with 2D, but it means depth with 2D array. 
+ */ + if (device->physical_device->rad_info.gfx_level >= GFX10_3 && plane->surface.u.gfx9.uses_custom_pitch) { + assert((plane->surface.u.gfx9.surf_pitch * plane->surface.bpe) % 256 == 0); + assert(image->vk.image_type == VK_IMAGE_TYPE_2D); + assert(plane->surface.is_linear); + assert(G_00A00C_TYPE(state[3]) == V_008F1C_SQ_RSRC_IMG_2D); + unsigned pitch = plane->surface.u.gfx9.surf_pitch; + + /* Subsampled images have the pitch in the units of blocks. */ + if (plane->surface.blk_w == 2) + pitch *= 2; + + state[4] &= C_00A010_DEPTH & C_00A010_PITCH_MSB; + state[4] |= S_00A010_DEPTH(pitch - 1) | /* DEPTH contains low bits of PITCH. */ + S_00A010_PITCH_MSB((pitch - 1) >> 13); + } + + if (gfx_level >= GFX10) { + state[3] &= C_00A00C_SW_MODE; + + if (is_stencil) { + state[3] |= S_00A00C_SW_MODE(plane->surface.u.gfx9.zs.stencil_swizzle_mode); + } else { + state[3] |= S_00A00C_SW_MODE(plane->surface.u.gfx9.swizzle_mode); + } + + state[6] &= C_00A018_META_DATA_ADDRESS_LO & C_00A018_META_PIPE_ALIGNED; + + if (meta_va) { + struct gfx9_surf_meta_flags meta = { + .rb_aligned = 1, + .pipe_aligned = 1, + }; + + if (!(plane->surface.flags & RADEON_SURF_Z_OR_SBUFFER)) + meta = plane->surface.u.gfx9.color.dcc; + + if (radv_dcc_enabled(image, first_level) && is_storage_image && enable_write_compression) + state[6] |= S_00A018_WRITE_COMPRESS_ENABLE(1); + + state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8); + } + + state[7] = meta_va >> 16; + } else if (gfx_level == GFX9) { + state[3] &= C_008F1C_SW_MODE; + state[4] &= C_008F20_PITCH; + + if (is_stencil) { + state[3] |= S_008F1C_SW_MODE(plane->surface.u.gfx9.zs.stencil_swizzle_mode); + state[4] |= S_008F20_PITCH(plane->surface.u.gfx9.zs.stencil_epitch); + } else { + state[3] |= S_008F1C_SW_MODE(plane->surface.u.gfx9.swizzle_mode); + state[4] |= S_008F20_PITCH(plane->surface.u.gfx9.epitch); + } + + state[5] &= C_008F24_META_DATA_ADDRESS & C_008F24_META_PIPE_ALIGNED & C_008F24_META_RB_ALIGNED; + if (meta_va) { + struct gfx9_surf_meta_flags meta = { + .rb_aligned = 1, + .pipe_aligned = 1, + }; + + if (!(plane->surface.flags & RADEON_SURF_Z_OR_SBUFFER)) + meta = plane->surface.u.gfx9.color.dcc; + + state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) | S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) | + S_008F24_META_RB_ALIGNED(meta.rb_aligned); + } + } else { + /* GFX6-GFX8 */ + unsigned pitch = base_level_info->nblk_x * block_width; + unsigned index = si_tile_mode_index(plane, base_level, is_stencil); + + state[3] &= C_008F1C_TILING_INDEX; + state[3] |= S_008F1C_TILING_INDEX(index); + state[4] &= C_008F20_PITCH; + state[4] |= S_008F20_PITCH(pitch - 1); + } +} + +/** + * Build the sampler view descriptor for a texture (GFX10). 
+ */ +static void +gfx10_make_texture_descriptor(struct radv_device *device, struct radv_image *image, bool is_storage_image, + VkImageViewType view_type, VkFormat vk_format, const VkComponentMapping *mapping, + unsigned first_level, unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, float min_lod, uint32_t *state, + uint32_t *fmask_state, VkImageCreateFlags img_create_flags, + const struct ac_surf_nbc_view *nbc_view, const VkImageViewSlicedCreateInfoEXT *sliced_3d) +{ + const struct util_format_description *desc; + enum pipe_swizzle swizzle[4]; + unsigned img_format; + unsigned type; + + desc = vk_format_description(vk_format); + + /* For emulated ETC2 without alpha we need to override the format to a 3-componenent format, so + * that border colors work correctly (alpha forced to 1). Since Vulkan has no such format, + * this uses the Gallium formats to set the description. */ + if (image->vk.format == VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK && vk_format == VK_FORMAT_R8G8B8A8_UNORM) { + desc = util_format_description(PIPE_FORMAT_R8G8B8X8_UNORM); + } else if (image->vk.format == VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK && vk_format == VK_FORMAT_R8G8B8A8_SRGB) { + desc = util_format_description(PIPE_FORMAT_R8G8B8X8_SRGB); + } + + img_format = + ac_get_gfx10_format_table(&device->physical_device->rad_info)[vk_format_to_pipe_format(vk_format)].img_format; + + radv_compose_swizzle(desc, mapping, swizzle); + + if (img_create_flags & VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT) { + assert(image->vk.image_type == VK_IMAGE_TYPE_3D); + type = V_008F1C_SQ_RSRC_IMG_3D; + } else { + type = radv_tex_dim(image->vk.image_type, view_type, image->vk.array_layers, image->vk.samples, is_storage_image, + device->physical_device->rad_info.gfx_level == GFX9); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = image->vk.array_layers; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (view_type != VK_IMAGE_VIEW_TYPE_3D) + depth = image->vk.array_layers; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = image->vk.array_layers / 6; + + state[0] = 0; + state[1] = S_00A004_FORMAT(img_format) | S_00A004_WIDTH_LO(width - 1); + state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) | + S_00A008_RESOURCE_LEVEL(device->physical_device->rad_info.gfx_level < GFX11); + state[3] = S_00A00C_DST_SEL_X(radv_map_swizzle(swizzle[0])) | S_00A00C_DST_SEL_Y(radv_map_swizzle(swizzle[1])) | + S_00A00C_DST_SEL_Z(radv_map_swizzle(swizzle[2])) | S_00A00C_DST_SEL_W(radv_map_swizzle(swizzle[3])) | + S_00A00C_BASE_LEVEL(image->vk.samples > 1 ? 0 : first_level) | + S_00A00C_LAST_LEVEL(image->vk.samples > 1 ? util_logbase2(image->vk.samples) : last_level) | + S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc)) | S_00A00C_TYPE(type); + /* Depth is the the last accessible layer on gfx9+. The hw doesn't need + * to know the total number of layers. + */ + state[4] = + S_00A010_DEPTH(type == V_008F1C_SQ_RSRC_IMG_3D ? depth - 1 : last_layer) | S_00A010_BASE_ARRAY(first_layer); + state[5] = S_00A014_ARRAY_PITCH(0) | S_00A014_PERF_MOD(4); + state[6] = 0; + state[7] = 0; + + if (img_create_flags & VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT) { + assert(type == V_008F1C_SQ_RSRC_IMG_3D); + + /* ARRAY_PITCH is only meaningful for 3D images, 0 means SRV, 1 means UAV. + * In SRV mode, BASE_ARRAY is ignored and DEPTH is the last slice of mipmap level 0. 
+ * In UAV mode, BASE_ARRAY is the first slice and DEPTH is the last slice of the bound level. + */ + state[4] &= C_00A010_DEPTH; + state[4] |= S_00A010_DEPTH(!is_storage_image ? depth - 1 : u_minify(depth, first_level) - 1); + state[5] |= S_00A014_ARRAY_PITCH(is_storage_image); + } else if (sliced_3d) { + unsigned total = u_minify(depth, first_level); + + assert(type == V_008F1C_SQ_RSRC_IMG_3D && is_storage_image); + + unsigned first_slice = sliced_3d->sliceOffset; + unsigned slice_count = sliced_3d->sliceCount == VK_REMAINING_3D_SLICES_EXT + ? MAX2(1, total - sliced_3d->sliceOffset) + : sliced_3d->sliceCount; + unsigned last_slice = first_slice + slice_count - 1; + + state[4] = 0; + state[4] |= S_00A010_DEPTH(last_slice) | S_00A010_BASE_ARRAY(first_slice); + state[5] |= S_00A014_ARRAY_PITCH(1); + } + + unsigned max_mip = image->vk.samples > 1 ? util_logbase2(image->vk.samples) : image->vk.mip_levels - 1; + if (nbc_view && nbc_view->valid) + max_mip = nbc_view->num_levels - 1; + + unsigned min_lod_clamped = radv_float_to_ufixed(CLAMP(min_lod, 0, 15), 8); + if (device->physical_device->rad_info.gfx_level >= GFX11) { + state[1] |= S_00A004_MAX_MIP(max_mip); + state[5] |= S_00A014_MIN_LOD_LO(min_lod_clamped); + state[6] |= S_00A018_MIN_LOD_HI(min_lod_clamped >> 5); + } else { + state[1] |= S_00A004_MIN_LOD(min_lod_clamped); + state[5] |= S_00A014_MAX_MIP(max_mip); + } + + if (radv_dcc_enabled(image, first_level)) { + state[6] |= + S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_00A018_MAX_COMPRESSED_BLOCK_SIZE(image->planes[0].surface.u.gfx9.color.dcc.max_compressed_block_size) | + S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(device, vk_format)); + } + + if (radv_image_get_iterate256(device, image)) { + state[6] |= S_00A018_ITERATE_256(1); + } + + /* Initialize the sampler view for FMASK. 
*/ + if (fmask_state) { + if (radv_image_has_fmask(image)) { + uint64_t gpu_address = radv_buffer_get_va(image->bindings[0].bo); + uint32_t format; + uint64_t va; + + assert(image->plane_count == 1); + + va = gpu_address + image->bindings[0].offset + image->planes[0].surface.fmask_offset; + + switch (image->vk.samples) { + case 2: + format = V_008F0C_GFX10_FORMAT_FMASK8_S2_F2; + break; + case 4: + format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F4; + break; + case 8: + format = V_008F0C_GFX10_FORMAT_FMASK32_S8_F8; + break; + default: + unreachable("invalid nr_samples"); + } + + fmask_state[0] = (va >> 8) | image->planes[0].surface.fmask_tile_swizzle; + fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | S_00A004_FORMAT(format) | S_00A004_WIDTH_LO(width - 1); + fmask_state[2] = + S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) | S_00A008_RESOURCE_LEVEL(1); + fmask_state[3] = + S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_00A00C_SW_MODE(image->planes[0].surface.u.gfx9.color.fmask_swizzle_mode) | + S_00A00C_TYPE(radv_tex_dim(image->vk.image_type, view_type, image->vk.array_layers, 0, false, false)); + fmask_state[4] = S_00A010_DEPTH(last_layer) | S_00A010_BASE_ARRAY(first_layer); + fmask_state[5] = 0; + fmask_state[6] = S_00A018_META_PIPE_ALIGNED(1); + fmask_state[7] = 0; + + if (radv_image_is_tc_compat_cmask(image)) { + va = gpu_address + image->bindings[0].offset + image->planes[0].surface.cmask_offset; + + fmask_state[6] |= S_00A018_COMPRESSION_EN(1); + fmask_state[6] |= S_00A018_META_DATA_ADDRESS_LO(va >> 8); + fmask_state[7] |= va >> 16; + } + } else + memset(fmask_state, 0, 8 * 4); + } +} + +/** + * Build the sampler view descriptor for a texture (SI-GFX9) + */ +static void +si_make_texture_descriptor(struct radv_device *device, struct radv_image *image, bool is_storage_image, + VkImageViewType view_type, VkFormat vk_format, const VkComponentMapping *mapping, + unsigned first_level, unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, float min_lod, uint32_t *state, + uint32_t *fmask_state, VkImageCreateFlags img_create_flags) +{ + const struct util_format_description *desc; + enum pipe_swizzle swizzle[4]; + int first_non_void; + unsigned num_format, data_format, type; + + desc = vk_format_description(vk_format); + + /* For emulated ETC2 without alpha we need to override the format to a 3-componenent format, so + * that border colors work correctly (alpha forced to 1). Since Vulkan has no such format, + * this uses the Gallium formats to set the description. */ + if (image->vk.format == VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK && vk_format == VK_FORMAT_R8G8B8A8_UNORM) { + desc = util_format_description(PIPE_FORMAT_R8G8B8X8_UNORM); + } else if (image->vk.format == VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK && vk_format == VK_FORMAT_R8G8B8A8_SRGB) { + desc = util_format_description(PIPE_FORMAT_R8G8B8X8_SRGB); + } + + radv_compose_swizzle(desc, mapping, swizzle); + + first_non_void = vk_format_get_first_non_void_channel(vk_format); + + num_format = radv_translate_tex_numformat(vk_format, desc, first_non_void); + if (num_format == ~0) { + num_format = 0; + } + + data_format = radv_translate_tex_dataformat(vk_format, desc, first_non_void); + if (data_format == ~0) { + data_format = 0; + } + + /* S8 with either Z16 or Z32 HTILE need a special format. 
*/ + if (device->physical_device->rad_info.gfx_level == GFX9 && vk_format == VK_FORMAT_S8_UINT && + radv_image_is_tc_compat_htile(image)) { + if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) + data_format = V_008F14_IMG_DATA_FORMAT_S8_32; + else if (image->vk.format == VK_FORMAT_D16_UNORM_S8_UINT) + data_format = V_008F14_IMG_DATA_FORMAT_S8_16; + } + + if (device->physical_device->rad_info.gfx_level == GFX9 && + img_create_flags & VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT) { + assert(image->vk.image_type == VK_IMAGE_TYPE_3D); + type = V_008F1C_SQ_RSRC_IMG_3D; + } else { + type = radv_tex_dim(image->vk.image_type, view_type, image->vk.array_layers, image->vk.samples, is_storage_image, + device->physical_device->rad_info.gfx_level == GFX9); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = image->vk.array_layers; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (view_type != VK_IMAGE_VIEW_TYPE_3D) + depth = image->vk.array_layers; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = image->vk.array_layers / 6; + + state[0] = 0; + state[1] = (S_008F14_MIN_LOD(radv_float_to_ufixed(CLAMP(min_lod, 0, 15), 8)) | S_008F14_DATA_FORMAT(data_format) | + S_008F14_NUM_FORMAT(num_format)); + state[2] = (S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1) | S_008F18_PERF_MOD(4)); + state[3] = (S_008F1C_DST_SEL_X(radv_map_swizzle(swizzle[0])) | S_008F1C_DST_SEL_Y(radv_map_swizzle(swizzle[1])) | + S_008F1C_DST_SEL_Z(radv_map_swizzle(swizzle[2])) | S_008F1C_DST_SEL_W(radv_map_swizzle(swizzle[3])) | + S_008F1C_BASE_LEVEL(image->vk.samples > 1 ? 0 : first_level) | + S_008F1C_LAST_LEVEL(image->vk.samples > 1 ? util_logbase2(image->vk.samples) : last_level) | + S_008F1C_TYPE(type)); + state[4] = 0; + state[5] = S_008F24_BASE_ARRAY(first_layer); + state[6] = 0; + state[7] = 0; + + if (device->physical_device->rad_info.gfx_level == GFX9) { + unsigned bc_swizzle = gfx9_border_color_swizzle(desc); + + /* Depth is the last accessible layer on Gfx9. + * The hw doesn't need to know the total number of layers. + */ + if (type == V_008F1C_SQ_RSRC_IMG_3D) + state[4] |= S_008F20_DEPTH(depth - 1); + else + state[4] |= S_008F20_DEPTH(last_layer); + + state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle); + state[5] |= S_008F24_MAX_MIP(image->vk.samples > 1 ? util_logbase2(image->vk.samples) : image->vk.mip_levels - 1); + } else { + state[3] |= S_008F1C_POW2_PAD(image->vk.mip_levels > 1); + state[4] |= S_008F20_DEPTH(depth - 1); + state[5] |= S_008F24_LAST_ARRAY(last_layer); + } + if (!(image->planes[0].surface.flags & RADEON_SURF_Z_OR_SBUFFER) && image->planes[0].surface.meta_offset) { + state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(device, vk_format)); + } else { + if (device->instance->disable_aniso_single_level) { + /* The last dword is unused by hw. The shader uses it to clear + * bits in the first dword of sampler state. + */ + if (device->physical_device->rad_info.gfx_level <= GFX7 && image->vk.samples <= 1) { + if (first_level == last_level) + state[7] = C_008F30_MAX_ANISO_RATIO; + else + state[7] = 0xffffffff; + } + } + } + + /* Initialize the sampler view for FMASK. 
*/ + if (fmask_state) { + if (radv_image_has_fmask(image)) { + uint32_t fmask_format; + uint64_t gpu_address = radv_buffer_get_va(image->bindings[0].bo); + uint64_t va; + + assert(image->plane_count == 1); + + va = gpu_address + image->bindings[0].offset + image->planes[0].surface.fmask_offset; + + if (device->physical_device->rad_info.gfx_level == GFX9) { + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK; + switch (image->vk.samples) { + case 2: + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_2_2; + break; + case 4: + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_4; + break; + case 8: + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_8_8; + break; + default: + unreachable("invalid nr_samples"); + } + } else { + switch (image->vk.samples) { + case 2: + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; + break; + case 4: + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4; + break; + case 8: + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8; + break; + default: + assert(0); + fmask_format = V_008F14_IMG_DATA_FORMAT_INVALID; + } + num_format = V_008F14_IMG_NUM_FORMAT_UINT; + } + + fmask_state[0] = va >> 8; + fmask_state[0] |= image->planes[0].surface.fmask_tile_swizzle; + fmask_state[1] = + S_008F14_BASE_ADDRESS_HI(va >> 40) | S_008F14_DATA_FORMAT(fmask_format) | S_008F14_NUM_FORMAT(num_format); + fmask_state[2] = S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1); + fmask_state[3] = + S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_008F1C_TYPE(radv_tex_dim(image->vk.image_type, view_type, image->vk.array_layers, 0, false, false)); + fmask_state[4] = 0; + fmask_state[5] = S_008F24_BASE_ARRAY(first_layer); + fmask_state[6] = 0; + fmask_state[7] = 0; + + if (device->physical_device->rad_info.gfx_level == GFX9) { + fmask_state[3] |= S_008F1C_SW_MODE(image->planes[0].surface.u.gfx9.color.fmask_swizzle_mode); + fmask_state[4] |= + S_008F20_DEPTH(last_layer) | S_008F20_PITCH(image->planes[0].surface.u.gfx9.color.fmask_epitch); + fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(1) | S_008F24_META_RB_ALIGNED(1); + + if (radv_image_is_tc_compat_cmask(image)) { + va = gpu_address + image->bindings[0].offset + image->planes[0].surface.cmask_offset; + + fmask_state[5] |= S_008F24_META_DATA_ADDRESS(va >> 40); + fmask_state[6] |= S_008F28_COMPRESSION_EN(1); + fmask_state[7] |= va >> 8; + } + } else { + fmask_state[3] |= S_008F1C_TILING_INDEX(image->planes[0].surface.u.legacy.color.fmask.tiling_index); + fmask_state[4] |= S_008F20_DEPTH(depth - 1) | + S_008F20_PITCH(image->planes[0].surface.u.legacy.color.fmask.pitch_in_pixels - 1); + fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer); + + if (radv_image_is_tc_compat_cmask(image)) { + va = gpu_address + image->bindings[0].offset + image->planes[0].surface.cmask_offset; + + fmask_state[6] |= S_008F28_COMPRESSION_EN(1); + fmask_state[7] |= va >> 8; + } + } + } else + memset(fmask_state, 0, 8 * 4); + } +} + +void +radv_make_texture_descriptor(struct radv_device *device, struct radv_image *image, bool is_storage_image, + VkImageViewType view_type, VkFormat vk_format, const VkComponentMapping *mapping, + unsigned first_level, unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, float min_lod, uint32_t *state, + uint32_t *fmask_state, VkImageCreateFlags img_create_flags, + const struct ac_surf_nbc_view *nbc_view, const VkImageViewSlicedCreateInfoEXT *sliced_3d) +{ + if 
(device->physical_device->rad_info.gfx_level >= GFX10) { + gfx10_make_texture_descriptor(device, image, is_storage_image, view_type, vk_format, mapping, first_level, + last_level, first_layer, last_layer, width, height, depth, min_lod, state, + fmask_state, img_create_flags, nbc_view, sliced_3d); + } else { + si_make_texture_descriptor(device, image, is_storage_image, view_type, vk_format, mapping, first_level, + last_level, first_layer, last_layer, width, height, depth, min_lod, state, fmask_state, + img_create_flags); + } +} + +static inline void +compute_non_block_compressed_view(struct radv_device *device, const struct radv_image_view *iview, + struct ac_surf_nbc_view *nbc_view) +{ + const struct radv_image *image = iview->image; + const struct radeon_surf *surf = &image->planes[0].surface; + struct ac_addrlib *addrlib = device->ws->get_addrlib(device->ws); + struct ac_surf_info surf_info = radv_get_ac_surf_info(device, image); + + ac_surface_compute_nbc_view(addrlib, &device->physical_device->rad_info, surf, &surf_info, iview->vk.base_mip_level, + iview->vk.base_array_layer, nbc_view); +} + +static void +radv_image_view_make_descriptor(struct radv_image_view *iview, struct radv_device *device, VkFormat vk_format, + const VkComponentMapping *components, float min_lod, bool is_storage_image, + bool disable_compression, bool enable_compression, unsigned plane_id, + unsigned descriptor_plane_id, VkImageCreateFlags img_create_flags, + const struct ac_surf_nbc_view *nbc_view, + const VkImageViewSlicedCreateInfoEXT *sliced_3d) +{ + struct radv_image *image = iview->image; + struct radv_image_plane *plane = &image->planes[plane_id]; + bool is_stencil = iview->vk.aspects == VK_IMAGE_ASPECT_STENCIL_BIT; + unsigned first_layer = iview->vk.base_array_layer; + uint32_t blk_w; + union radv_descriptor *descriptor; + uint32_t hw_level = 0; + + if (is_storage_image) { + descriptor = &iview->storage_descriptor; + } else { + descriptor = &iview->descriptor; + } + + assert(vk_format_get_plane_count(vk_format) == 1); + assert(plane->surface.blk_w % vk_format_get_blockwidth(plane->format) == 0); + blk_w = plane->surface.blk_w / vk_format_get_blockwidth(plane->format) * vk_format_get_blockwidth(vk_format); + + if (device->physical_device->rad_info.gfx_level >= GFX9) { + hw_level = iview->vk.base_mip_level; + if (nbc_view->valid) { + hw_level = nbc_view->level; + iview->extent.width = nbc_view->width; + iview->extent.height = nbc_view->height; + + /* Clear the base array layer because addrlib adds it as part of the base addr offset. */ + first_layer = 0; + } + } + + radv_make_texture_descriptor(device, image, is_storage_image, iview->vk.view_type, vk_format, components, hw_level, + hw_level + iview->vk.level_count - 1, first_layer, + iview->vk.base_array_layer + iview->vk.layer_count - 1, + vk_format_get_plane_width(image->vk.format, plane_id, iview->extent.width), + vk_format_get_plane_height(image->vk.format, plane_id, iview->extent.height), + iview->extent.depth, min_lod, descriptor->plane_descriptors[descriptor_plane_id], + descriptor_plane_id || is_storage_image ? 
NULL : descriptor->fmask_descriptor, + img_create_flags, nbc_view, sliced_3d); + + const struct legacy_surf_level *base_level_info = NULL; + if (device->physical_device->rad_info.gfx_level <= GFX8) { + if (is_stencil) + base_level_info = &plane->surface.u.legacy.zs.stencil_level[iview->vk.base_mip_level]; + else + base_level_info = &plane->surface.u.legacy.level[iview->vk.base_mip_level]; + } + + bool enable_write_compression = radv_image_use_dcc_image_stores(device, image); + if (is_storage_image && !(enable_write_compression || enable_compression)) + disable_compression = true; + si_set_mutable_tex_desc_fields(device, image, base_level_info, plane_id, iview->vk.base_mip_level, + iview->vk.base_mip_level, blk_w, is_stencil, is_storage_image, disable_compression, + enable_write_compression, descriptor->plane_descriptors[descriptor_plane_id], + nbc_view); +} + +/** + * Determine if the given image view can be fast cleared. + */ +static bool +radv_image_view_can_fast_clear(const struct radv_device *device, const struct radv_image_view *iview) +{ + struct radv_image *image; + + if (!iview) + return false; + image = iview->image; + + /* Only fast clear if the image itself can be fast cleared. */ + if (!radv_image_can_fast_clear(device, image)) + return false; + + /* Only fast clear if all layers are bound. */ + if (iview->vk.base_array_layer > 0 || iview->vk.layer_count != image->vk.array_layers) + return false; + + /* Only fast clear if the view covers the whole image. */ + if (!radv_image_extent_compare(image, &iview->extent)) + return false; + + return true; +} + +void +radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + const VkImageViewCreateInfo *pCreateInfo, VkImageCreateFlags img_create_flags, + const struct radv_image_view_extra_create_info *extra_create_info) +{ + RADV_FROM_HANDLE(radv_image, image, pCreateInfo->image); + const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; + uint32_t plane_count = 1; + float min_lod = 0.0f; + + const struct VkImageViewMinLodCreateInfoEXT *min_lod_info = + vk_find_struct_const(pCreateInfo->pNext, IMAGE_VIEW_MIN_LOD_CREATE_INFO_EXT); + + if (min_lod_info) + min_lod = min_lod_info->minLod; + + const struct VkImageViewSlicedCreateInfoEXT *sliced_3d = + vk_find_struct_const(pCreateInfo->pNext, IMAGE_VIEW_SLICED_CREATE_INFO_EXT); + + bool from_client = extra_create_info && extra_create_info->from_client; + vk_image_view_init(&device->vk, &iview->vk, !from_client, pCreateInfo); + + switch (image->vk.image_type) { + case VK_IMAGE_TYPE_1D: + case VK_IMAGE_TYPE_2D: + assert(range->baseArrayLayer + vk_image_subresource_layer_count(&image->vk, range) - 1 <= image->vk.array_layers); + break; + case VK_IMAGE_TYPE_3D: + assert(range->baseArrayLayer + vk_image_subresource_layer_count(&image->vk, range) - 1 <= + radv_minify(image->vk.extent.depth, range->baseMipLevel)); + break; + default: + unreachable("bad VkImageType"); + } + iview->image = image; + iview->plane_id = radv_plane_from_aspect(pCreateInfo->subresourceRange.aspectMask); + iview->nbc_view.valid = false; + + /* If the image has an Android external format, pCreateInfo->format will be + * VK_FORMAT_UNDEFINED. */ + if (iview->vk.format == VK_FORMAT_UNDEFINED) { + iview->vk.format = image->vk.format; + iview->vk.view_format = image->vk.format; + } + + /* Split out the right aspect. Note that for internal meta code we sometimes + * use an equivalent color format for the aspect so we first have to check + * if we actually got depth/stencil formats. 
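+ * As a worked example (assuming the stock vk_format helpers): a D24S8 image
+ * viewed through VK_IMAGE_ASPECT_STENCIL_BIT ends up with VK_FORMAT_S8_UINT
+ * as its view format, the depth aspect maps it to
+ * VK_FORMAT_X8_D24_UNORM_PACK32, and a meta view that already carries an
+ * equivalent color format is left untouched by the checks below.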
*/ + if (iview->vk.aspects == VK_IMAGE_ASPECT_STENCIL_BIT) { + if (vk_format_has_stencil(iview->vk.view_format)) + iview->vk.view_format = vk_format_stencil_only(iview->vk.view_format); + } else if (iview->vk.aspects == VK_IMAGE_ASPECT_DEPTH_BIT) { + if (vk_format_has_depth(iview->vk.view_format)) + iview->vk.view_format = vk_format_depth_only(iview->vk.view_format); + } + + if (vk_format_get_plane_count(image->vk.format) > 1 && + pCreateInfo->subresourceRange.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT) { + plane_count = vk_format_get_plane_count(iview->vk.format); + } + + /* when the view format is emulated, redirect the view to the hidden plane 1 */ + if (radv_is_format_emulated(device->physical_device, iview->vk.format)) { + assert(radv_is_format_emulated(device->physical_device, image->vk.format)); + iview->plane_id = 1; + iview->vk.view_format = image->planes[iview->plane_id].format; + iview->vk.format = image->planes[iview->plane_id].format; + plane_count = 1; + } + + if (device->physical_device->rad_info.gfx_level >= GFX9) { + iview->extent = (VkExtent3D){ + .width = image->vk.extent.width, + .height = image->vk.extent.height, + .depth = image->vk.extent.depth, + }; + } else { + iview->extent = iview->vk.extent; + } + + if (iview->vk.format != image->planes[iview->plane_id].format) { + const struct radv_image_plane *plane = &image->planes[iview->plane_id]; + unsigned view_bw = vk_format_get_blockwidth(iview->vk.format); + unsigned view_bh = vk_format_get_blockheight(iview->vk.format); + unsigned plane_bw = vk_format_get_blockwidth(plane->format); + unsigned plane_bh = vk_format_get_blockheight(plane->format); + + iview->extent.width = DIV_ROUND_UP(iview->extent.width * view_bw, plane_bw); + iview->extent.height = DIV_ROUND_UP(iview->extent.height * view_bh, plane_bh); + + /* Comment ported from amdvlk - + * If we have the following image: + * Uncompressed pixels Compressed block sizes (4x4) + * mip0: 22 x 22 6 x 6 + * mip1: 11 x 11 3 x 3 + * mip2: 5 x 5 2 x 2 + * mip3: 2 x 2 1 x 1 + * mip4: 1 x 1 1 x 1 + * + * On GFX9 the descriptor is always programmed with the WIDTH and HEIGHT of the base level and + * the HW is calculating the degradation of the block sizes down the mip-chain as follows + * (straight-up divide-by-two integer math): mip0: 6x6 mip1: 3x3 mip2: 1x1 mip3: 1x1 + * + * This means that mip2 will be missing texels. + * + * Fix this by calculating the base mip's width and height, then convert + * that, and round it back up to get the level 0 size. Clamp the + * converted size between the original values, and the physical extent + * of the base mipmap. + * + * On GFX10 we have to take care to not go over the physical extent + * of the base mipmap as otherwise the GPU computes a different layout. + * Note that the GPU does use the same base-mip dimensions for both a + * block compatible format and the compressed format, so even if we take + * the plain converted dimensions the physical layout is correct. + */ + if (device->physical_device->rad_info.gfx_level >= GFX9 && vk_format_is_block_compressed(plane->format) && + !vk_format_is_block_compressed(iview->vk.format)) { + /* If we have multiple levels in the view we should ideally take the last level, + * but the mip calculation has a max(..., 1) so walking back to the base mip in an + * useful way is hard. 
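+ * Worked example for the 22x22 case above, viewing only mip2 through an
+ * uncompressed single-texel-block format (view_bw = 1, plane_bw = 4):
+ *   converted base width  = DIV_ROUND_UP(22 * 1, 4)          = 6
+ *   mip2 width in texels  = radv_minify(22, 2)               = 5
+ *   mip2 width converted  = DIV_ROUND_UP(5 * 1, 4)           = 2
+ *   new base width        = CLAMP(2 << 2, 6, base_mip_width) = 8
+ * so the hardware now derives mip2 as 8 >> 2 = 2 blocks, matching the real
+ * 2x2 (assuming the padded base_mip_width is at least 8; if it is not, the
+ * GFX10 addrlib fallback below takes over).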
*/ + if (iview->vk.level_count > 1) { + iview->extent.width = plane->surface.u.gfx9.base_mip_width; + iview->extent.height = plane->surface.u.gfx9.base_mip_height; + } else { + unsigned lvl_width = radv_minify(image->vk.extent.width, range->baseMipLevel); + unsigned lvl_height = radv_minify(image->vk.extent.height, range->baseMipLevel); + + lvl_width = DIV_ROUND_UP(lvl_width * view_bw, plane_bw); + lvl_height = DIV_ROUND_UP(lvl_height * view_bh, plane_bh); + + iview->extent.width = + CLAMP(lvl_width << range->baseMipLevel, iview->extent.width, plane->surface.u.gfx9.base_mip_width); + iview->extent.height = + CLAMP(lvl_height << range->baseMipLevel, iview->extent.height, plane->surface.u.gfx9.base_mip_height); + + /* If the hardware-computed extent is still be too small, on GFX10 + * we can attempt another workaround provided by addrlib that + * changes the descriptor's base level, and adjusts the address and + * extents accordingly. + */ + if (device->physical_device->rad_info.gfx_level >= GFX10 && + (radv_minify(iview->extent.width, range->baseMipLevel) < lvl_width || + radv_minify(iview->extent.height, range->baseMipLevel) < lvl_height) && + iview->vk.layer_count == 1) { + compute_non_block_compressed_view(device, iview, &iview->nbc_view); + } + } + } + } + + iview->support_fast_clear = radv_image_view_can_fast_clear(device, iview); + iview->disable_dcc_mrt = extra_create_info ? extra_create_info->disable_dcc_mrt : false; + + bool disable_compression = extra_create_info ? extra_create_info->disable_compression : false; + bool enable_compression = extra_create_info ? extra_create_info->enable_compression : false; + for (unsigned i = 0; i < plane_count; ++i) { + VkFormat format = vk_format_get_plane_format(iview->vk.view_format, i); + radv_image_view_make_descriptor(iview, device, format, &pCreateInfo->components, min_lod, false, + disable_compression, enable_compression, iview->plane_id + i, i, img_create_flags, + &iview->nbc_view, NULL); + radv_image_view_make_descriptor(iview, device, format, &pCreateInfo->components, min_lod, true, + disable_compression, enable_compression, iview->plane_id + i, i, img_create_flags, + &iview->nbc_view, sliced_3d); + } +} + +void +radv_image_view_finish(struct radv_image_view *iview) +{ + vk_image_view_finish(&iview->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +radv_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkImageView *pView) +{ + RADV_FROM_HANDLE(radv_image, image, pCreateInfo->image); + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_image_view *view; + + view = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*view), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (view == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + radv_image_view_init(view, device, pCreateInfo, image->vk.create_flags, + &(struct radv_image_view_extra_create_info){.from_client = true}); + + *pView = radv_image_view_to_handle(view); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +radv_DestroyImageView(VkDevice _device, VkImageView _iview, const VkAllocationCallbacks *pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_image_view, iview, _iview); + + if (!iview) + return; + + radv_image_view_finish(iview); + vk_free2(&device->vk.alloc, pAllocator, iview); +} diff --git a/src/amd/vulkan/radv_sdma.h b/src/amd/vulkan/radv_sdma.h new file mode 100644 index 00000000000..5f5a701e6f1 --- /dev/null +++ b/src/amd/vulkan/radv_sdma.h @@ -0,0 
+1,93 @@ +/* + * Copyright © 2023 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef RADV_SDMA_H +#define RADV_SDMA_H + +#include "radv_private.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct radv_sdma_surf { + VkExtent3D extent; /* Image extent. */ + VkOffset3D offset; /* Image offset. */ + uint64_t va; /* Virtual address of image data. */ + unsigned bpp; /* Bytes per pixel. */ + unsigned blk_w; /* Image format block width in pixels. */ + unsigned blk_h; /* Image format block height in pixels. */ + bool is_linear; /* Whether the image is linear. */ + + union { + /* linear images only */ + struct { + unsigned pitch; /* Row pitch in bytes. */ + unsigned slice_pitch; /* Slice pitch in bytes. */ + }; + /* tiled images only */ + struct { + uint64_t meta_va; /* Virtual address of metadata. */ + uint32_t meta_config; /* Metadata configuration DWORD. */ + uint32_t header_dword; /* Extra bits for the copy packet header. */ + uint32_t info_dword; /* Image information DWORD. 
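+                                  (As with the linear fields, these are only
+                                  meaningful for the union half selected by
+                                  is_linear above.)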
*/ + }; + }; +}; + +ALWAYS_INLINE static VkExtent3D +radv_sdma_get_copy_extent(const struct radv_image *const image, const VkImageSubresourceLayers subresource, + VkExtent3D extent) +{ + if (image->vk.image_type != VK_IMAGE_TYPE_3D) + extent.depth = vk_image_subresource_layer_count(&image->vk, &subresource); + + return extent; +} + +struct radv_sdma_surf radv_sdma_get_buf_surf(const struct radv_buffer *const buffer, + const struct radv_image *const image, + const VkBufferImageCopy2 *const region, + const VkImageAspectFlags aspect_mask); +struct radv_sdma_surf radv_sdma_get_surf(const struct radv_device *const device, const struct radv_image *const image, + const VkImageSubresourceLayers subresource, const VkOffset3D offset, + const VkImageAspectFlags aspect_mask); +void radv_sdma_copy_buffer_image(const struct radv_device *device, struct radeon_cmdbuf *cs, + const struct radv_sdma_surf *buf, const struct radv_sdma_surf *img, + const VkExtent3D extent, bool to_image); +bool radv_sdma_use_unaligned_buffer_image_copy(const struct radv_device *device, const struct radv_sdma_surf *buf, + const struct radv_sdma_surf *img, const VkExtent3D ext); +void radv_sdma_copy_buffer_image_unaligned(const struct radv_device *device, struct radeon_cmdbuf *cs, + const struct radv_sdma_surf *buf, const struct radv_sdma_surf *img_in, + const VkExtent3D copy_extent, struct radeon_winsys_bo *temp_bo, + bool to_image); +void radv_sdma_copy_buffer(const struct radv_device *device, struct radeon_cmdbuf *cs, uint64_t src_va, uint64_t dst_va, + uint64_t size); +void radv_sdma_fill_buffer(const struct radv_device *device, struct radeon_cmdbuf *cs, const uint64_t va, + const uint64_t size, const uint32_t value); + +#ifdef __cplusplus +} +#endif + +#endif /* RADV_SDMA_H */ diff --git a/src/android_stub/meson.build.rej b/src/android_stub/meson.build.rej new file mode 100644 index 00000000000..b9a9ddad68a --- /dev/null +++ b/src/android_stub/meson.build.rej @@ -0,0 +1,10 @@ +diff a/src/android_stub/meson.build b/src/android_stub/meson.build (rejected hunks) +@@ -1,7 +1,7 @@ + if with_android_stub + stub_libs = [] + +- foreach lib : ['backtrace', 'cutils', 'hardware', 'log', 'nativewindow', 'sync'] ++ foreach lib : ['hardware', 'log', 'nativewindow'] + stub_libs += shared_library( + lib, + files(lib + '_stub.cpp'), diff --git a/src/compiler/glsl/glsl_to_nir.cpp.rej b/src/compiler/glsl/glsl_to_nir.cpp.rej new file mode 100644 index 00000000000..5d3805f6c89 --- /dev/null +++ b/src/compiler/glsl/glsl_to_nir.cpp.rej @@ -0,0 +1,39 @@ +diff a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp (rejected hunks) +@@ -81,9 +81,6 @@ class nir_visitor : public ir_visitor + + void create_function(ir_function_signature *ir); + +- /* True if we have any output rvalues */ +- bool has_output_rvalue; +- + private: + void add_instr(nir_instr *instr, unsigned num_components, unsigned bit_size); + nir_ssa_def *evaluate_rvalue(ir_rvalue *ir); +@@ -274,9 +271,6 @@ glsl_to_nir(const struct gl_constants *consts, + if (var->data.mode == nir_var_shader_in && var->data.sample) + shader->info.fs.uses_sample_shading = true; + } +- +- if (v1.has_output_rvalue) +- shader->info.fs.uses_sample_shading = true; + } + + return shader; +@@ -287,7 +281,6 @@ nir_visitor::nir_visitor(const struct gl_constants *consts, nir_shader *shader) + this->supports_std430 = consts->UseSTD430AsDefaultPacking; + this->shader = shader; + this->is_global = true; +- this->has_output_rvalue = false; + this->var_table = _mesa_pointer_hash_table_create(NULL); + 
this->overload_table = _mesa_pointer_hash_table_create(NULL); + this->sparse_variable_set = _mesa_pointer_set_create(NULL); +@@ -1826,9 +1819,6 @@ nir_visitor::evaluate_rvalue(ir_rvalue* ir) + + enum gl_access_qualifier access = deref_get_qualifier(this->deref); + this->result = nir_load_deref_with_access(&b, this->deref, access); +- +- if (nir_deref_mode_is(this->deref, nir_var_shader_out)) +- this->has_output_rvalue = true; + } + + return this->result; diff --git a/src/compiler/glsl/standalone_scaffolding.cpp b/src/compiler/glsl/standalone_scaffolding.cpp index d9ec60ad21f..c48fdd5c13e 100644 --- a/src/compiler/glsl/standalone_scaffolding.cpp +++ b/src/compiler/glsl/standalone_scaffolding.cpp @@ -279,6 +279,9 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api) ctx->Const.Program[MESA_SHADER_COMPUTE].MaxUniformComponents = 1024; ctx->Const.Program[MESA_SHADER_COMPUTE].MaxInputComponents = 0; /* not used */ ctx->Const.Program[MESA_SHADER_COMPUTE].MaxOutputComponents = 0; /* not used */ + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicBuffers = 16; + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicCounters = 16; + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxShaderStorageBlocks = 16; /* Set up default shader compiler options. */ struct gl_shader_compiler_options options; diff --git a/src/drm-shim/device.c b/src/drm-shim/device.c index 345d72aa653..6a3321e5cd2 100644 --- a/src/drm-shim/device.c +++ b/src/drm-shim/device.c @@ -296,6 +296,10 @@ drm_shim_ioctl(int fd, unsigned long request, void *arg) ASSERTED int type = _IOC_TYPE(request); int nr = _IOC_NR(request); + /* Used by kbase; do not claim to be a kbase FD */ + if (type == 0x80) + return -EINVAL; + assert(type == DRM_IOCTL_BASE); if (nr >= DRM_COMMAND_BASE && nr < DRM_COMMAND_END) { diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c index bb75273ead9..6d3fb97f9c7 100644 --- a/src/egl/drivers/dri2/egl_dri2.c +++ b/src/egl/drivers/dri2/egl_dri2.c @@ -3322,6 +3322,8 @@ dri2_query_wayland_buffer_wl(_EGLDisplay *disp, const struct wl_drm_components_descriptor *format; buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_drm, buffer_resource); + if (!buffer) + buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_mali, buffer_resource); if (!buffer) return EGL_FALSE; diff --git a/src/egl/drivers/dri2/egl_dri2.c.rej b/src/egl/drivers/dri2/egl_dri2.c.rej new file mode 100644 index 00000000000..6b841557268 --- /dev/null +++ b/src/egl/drivers/dri2/egl_dri2.c.rej @@ -0,0 +1,60 @@ +diff a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c (rejected hunks) +@@ -52,6 +52,8 @@ + #include + #include "wayland-drm.h" + #include "wayland-drm-client-protocol.h" ++#include "mali-buffer-sharing.h" ++#include "mali-buffer-sharing-client-protocol.h" + #include "linux-dmabuf-unstable-v1-client-protocol.h" + #endif + +@@ -2259,6 +2261,9 @@ dri2_create_image_wayland_wl_buffer(_EGLDisplay *disp, _EGLContext *ctx, + + buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_drm, + (struct wl_resource *) _buffer); ++ if (!buffer) ++ buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_mali, ++ (struct wl_resource *) _buffer); + if (!buffer) + return NULL; + +@@ -3256,11 +3261,27 @@ dri2_bind_wayland_display_wl(_EGLDisplay *disp, struct wl_display *wl_dpy) + wayland_drm_init(wl_dpy, device_name, + &wl_drm_callbacks, disp, flags); + ++ drmSetVersion sv = { ++ .drm_di_major = 1, ++ .drm_di_minor = 4, ++ .drm_dd_major = -1, ++ .drm_dd_minor = -1, ++ }; ++ drmSetInterfaceVersion(dri2_dpy->fd, &sv); ++ ++ char 
*busid = drmGetBusid(dri2_dpy->fd); ++ dri2_dpy->wl_server_mali = ++ mali_buffer_sharing_init(wl_dpy, busid, ++ &wl_drm_callbacks, ++ disp); ++ drmFreeBusid(busid); ++ + free(device_name); + + if (!dri2_dpy->wl_server_drm) + goto fail; + ++ // TODO: Do this for mali_buffer_sharing + #ifdef HAVE_DRM_PLATFORM + /* We have to share the wl_drm instance with gbm, so gbm can convert + * wl_buffers to gbm bos. */ +@@ -3281,6 +3302,11 @@ dri2_unbind_wayland_display_wl(_EGLDisplay *disp, struct wl_display *wl_dpy) + { + struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); + ++ if (dri2_dpy->wl_server_mali) { ++ wayland_drm_uninit(dri2_dpy->wl_server_mali); ++ dri2_dpy->wl_server_mali = NULL; ++ } ++ + if (!dri2_dpy->wl_server_drm) + return EGL_FALSE; + diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h index 5edd810f476..37ac1684f26 100644 --- a/src/egl/drivers/dri2/egl_dri2.h +++ b/src/egl/drivers/dri2/egl_dri2.h @@ -296,8 +296,11 @@ struct dri2_egl_display { struct wl_display *wl_dpy_wrapper; struct wl_registry *wl_registry; struct wl_drm *wl_server_drm; + struct wl_drm *wl_server_mali; struct wl_drm *wl_drm; + struct wl_drm *wl_mali; uint32_t wl_drm_version, wl_drm_name; + uint32_t wl_mali_version, wl_mali_name; struct wl_shm *wl_shm; struct wl_event_queue *wl_queue; struct zwp_linux_dmabuf_v1 *wl_dmabuf; @@ -345,6 +348,7 @@ struct dri2_egl_surface { struct wl_surface *wl_surface_wrapper; struct wl_display *wl_dpy_wrapper; struct wl_drm *wl_drm_wrapper; + struct wl_drm *wl_mali_wrapper; struct wl_callback *throttle_callback; struct zwp_linux_dmabuf_feedback_v1 *wl_dmabuf_feedback; struct dmabuf_feedback dmabuf_feedback, pending_dmabuf_feedback; diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c index 7e85dd682a4..78e3009c421 100644 --- a/src/egl/drivers/dri2/platform_wayland.c +++ b/src/egl/drivers/dri2/platform_wayland.c @@ -748,7 +748,7 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, dri2_surf->base.PresentOpaque); assert(visual_idx != -1); - if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm) { + if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm || dri2_dpy->wl_mali) { dri2_surf->format = dri2_wl_visuals[visual_idx].wl_drm_format; } else { assert(dri2_dpy->wl_shm); @@ -771,6 +771,16 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, dri2_surf->wl_queue); } + if (dri2_dpy->wl_mali) { + dri2_surf->wl_mali_wrapper = wl_proxy_create_wrapper(dri2_dpy->wl_mali); + if (!dri2_surf->wl_mali_wrapper) { + _eglError(EGL_BAD_ALLOC, "dri2_create_surface"); + goto cleanup_queue; + } + wl_proxy_set_queue((struct wl_proxy *)dri2_surf->wl_mali_wrapper, + dri2_surf->wl_queue); + } + dri2_surf->wl_dpy_wrapper = wl_proxy_create_wrapper(dri2_dpy->wl_dpy); if (!dri2_surf->wl_dpy_wrapper) { _eglError(EGL_BAD_ALLOC, "dri2_create_surface"); @@ -907,6 +917,8 @@ dri2_wl_destroy_surface(_EGLDisplay *disp, _EGLSurface *surf) wl_proxy_wrapper_destroy(dri2_surf->wl_dpy_wrapper); if (dri2_surf->wl_drm_wrapper) wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper); + if (dri2_surf->wl_mali_wrapper) + wl_proxy_wrapper_destroy(dri2_surf->wl_mali_wrapper); if (dri2_surf->wl_dmabuf_feedback) { zwp_linux_dmabuf_feedback_v1_destroy(dri2_surf->wl_dmabuf_feedback); dmabuf_feedback_fini(&dri2_surf->dmabuf_feedback); @@ -1503,6 +1515,26 @@ create_wl_buffer(struct dri2_egl_display *dri2_dpy, ret = zwp_linux_buffer_params_v1_create_immed(params, width, height, fourcc, 0); zwp_linux_buffer_params_v1_destroy(params); + } else if 
(dri2_surf->wl_mali_wrapper || dri2_dpy->wl_mali) { + struct wl_drm *wl_mali = + dri2_surf ? dri2_surf->wl_mali_wrapper : dri2_dpy->wl_mali; + int fd = -1, stride; + + if (num_planes > 1) + return NULL; + + query = dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FD, &fd); + query &= dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, &stride); + if (!query) { + if (fd >= 0) + close(fd); + return NULL; + } + + ret = mali_buffer_sharing_create_buffer((void *)wl_mali, + width, height, stride, + fourcc, 0, 0, fd); + close(fd); } else { struct wl_drm *wl_drm = dri2_surf ? dri2_surf->wl_drm_wrapper : dri2_dpy->wl_drm; @@ -1776,6 +1808,62 @@ drm_handle_device(void *data, struct wl_drm *drm, const char *device) } } +static void +mali_handle_device(void *data, struct mali_buffer_sharing *drm, const char *device) +{ + struct dri2_egl_display *dri2_dpy = data; + drm_magic_t magic; + + // hack + //printf("device '%s'\n", device); + dri2_dpy->device_name = strdup("/dev/dri/card0"); + + dri2_dpy->fd = loader_open_device(dri2_dpy->device_name); + if (dri2_dpy->fd == -1) { + _eglLog(_EGL_WARNING, "wayland-egl: could not open %s (%s)", + dri2_dpy->device_name, strerror(errno)); + free(dri2_dpy->device_name); + dri2_dpy->device_name = NULL; + return; + } + + if (drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER) { + dri2_dpy->authenticated = true; + } else { + roundtrip(dri2_dpy); + if (drmGetMagic(dri2_dpy->fd, &magic)) { + close(dri2_dpy->fd); + dri2_dpy->fd = -1; + free(dri2_dpy->device_name); + dri2_dpy->device_name = NULL; + _eglLog(_EGL_WARNING, "wayland-egl: drmGetMagic failed"); + return; + } + + mali_buffer_sharing_auth((void *)dri2_dpy->wl_mali, magic); + roundtrip(dri2_dpy); + // We don't get a callback + dri2_dpy->authenticated = true; + } + + int supported_fourcc[] = { + WL_DRM_FORMAT_ABGR16F, + WL_DRM_FORMAT_ABGR2101010, + WL_DRM_FORMAT_XRGB8888, + WL_DRM_FORMAT_ARGB8888, + WL_DRM_FORMAT_ABGR8888, + WL_DRM_FORMAT_XBGR8888, + WL_DRM_FORMAT_RGB565, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(supported_fourcc); ++i) { + int visual_idx = dri2_wl_visual_idx_from_fourcc(supported_fourcc[i]); + assert(visual_idx != -1); + + BITSET_SET(dri2_dpy->formats.formats_bitmap, visual_idx); + } +} + static void drm_handle_format(void *data, struct wl_drm *drm, uint32_t format) { @@ -2836,6 +2924,8 @@ dri2_teardown_wayland(struct dri2_egl_display *dri2_dpy) dri2_wl_formats_fini(&dri2_dpy->formats); if (dri2_dpy->wl_drm) wl_drm_destroy(dri2_dpy->wl_drm); + if (dri2_dpy->wl_mali) + wl_drm_destroy(dri2_dpy->wl_mali); if (dri2_dpy->wl_dmabuf) zwp_linux_dmabuf_v1_destroy(dri2_dpy->wl_dmabuf); if (dri2_dpy->wl_shm) diff --git a/src/egl/drivers/dri2/platform_wayland.c.rej b/src/egl/drivers/dri2/platform_wayland.c.rej new file mode 100644 index 00000000000..b20afec38c3 --- /dev/null +++ b/src/egl/drivers/dri2/platform_wayland.c.rej @@ -0,0 +1,89 @@ +diff a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c (rejected hunks) +@@ -51,6 +51,7 @@ + #include + #include + #include "wayland-drm-client-protocol.h" ++#include "mali-buffer-sharing-client-protocol.h" + #include "linux-dmabuf-unstable-v1-client-protocol.h" + + /* +@@ -765,6 +776,8 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, + cleanup_drm: + if (dri2_surf->wl_drm_wrapper) + wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper); ++ if (dri2_surf->wl_mali_wrapper) ++ wl_proxy_wrapper_destroy(dri2_surf->wl_mali_wrapper); + cleanup_queue: + wl_event_queue_destroy(dri2_surf->wl_queue); + 
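
(Illustrative sketch, not part of the rejected hunk above: the authentication
performed by mali_handle_device() earlier in this file reduces to the usual
wl_drm handshake, except that this protocol sends no "authenticated" event, so
the client assumes success after a roundtrip.  fd, wl_mali and authenticated
stand in for the dri2_dpy fields used there.)

   drm_magic_t magic;

   if (drmGetNodeTypeFromFd(fd) == DRM_NODE_RENDER) {
      authenticated = true;                 /* render nodes need no auth */
   } else if (drmGetMagic(fd, &magic) == 0) {
      mali_buffer_sharing_auth((void *)wl_mali, magic);
      roundtrip(dri2_dpy);                  /* no ack event; assume success */
      authenticated = true;
   }
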
cleanup_surf: +@@ -1768,6 +1859,10 @@ static const struct wl_drm_listener drm_listener = { + .capabilities = drm_handle_capabilities + }; + ++static const struct mali_buffer_sharing_listener mali_listener = { ++ .alloc_device = mali_handle_device, ++}; ++ + static void + dmabuf_ignore_format(void *data, struct zwp_linux_dmabuf_v1 *dmabuf, + uint32_t format) +@@ -1813,6 +1908,14 @@ wl_drm_bind(struct dri2_egl_display *dri2_dpy) + wl_drm_add_listener(dri2_dpy->wl_drm, &drm_listener, dri2_dpy); + } + ++static void ++wl_mali_bind(struct dri2_egl_display *dri2_dpy) ++{ ++ dri2_dpy->wl_mali = wl_registry_bind(dri2_dpy->wl_registry, dri2_dpy->wl_mali_name, ++ &mali_buffer_sharing_interface, dri2_dpy->wl_mali_version); ++ mali_buffer_sharing_add_listener((void *)dri2_dpy->wl_mali, &mali_listener, dri2_dpy); ++} ++ + static void + default_dmabuf_feedback_format_table(void *data, + struct zwp_linux_dmabuf_feedback_v1 *zwp_linux_dmabuf_feedback_v1, +@@ -1943,6 +2046,9 @@ registry_handle_global_drm(void *data, struct wl_registry *registry, + if (strcmp(interface, wl_drm_interface.name) == 0) { + dri2_dpy->wl_drm_version = MIN2(version, 2); + dri2_dpy->wl_drm_name = name; ++ } else if (strcmp(interface, mali_buffer_sharing_interface.name) == 0) { ++ dri2_dpy->wl_mali_version = MIN2(version, 5); ++ dri2_dpy->wl_mali_name = name; + } else if (strcmp(interface, zwp_linux_dmabuf_v1_interface.name) == 0 && version >= 3) { + dri2_dpy->wl_dmabuf = + wl_registry_bind(registry, name, &zwp_linux_dmabuf_v1_interface, +@@ -2145,10 +2251,7 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) + + /* We couldn't retrieve a render node from the dma-buf feedback (or the + * feedback was not advertised at all), so we must fallback to wl_drm. */ +- if (dri2_dpy->fd == -1) { +- /* wl_drm not advertised by compositor, so can't continue */ +- if (dri2_dpy->wl_drm_name == 0) +- goto cleanup; ++ if (dri2_dpy->fd == -1 && dri2_dpy->wl_drm_name) { + wl_drm_bind(dri2_dpy); + + if (dri2_dpy->wl_drm == NULL) +@@ -2161,6 +2264,22 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) + goto cleanup; + } + ++ if (dri2_dpy->fd == -1 && dri2_dpy->wl_mali_name) { ++ wl_mali_bind(dri2_dpy); ++ ++ if (dri2_dpy->wl_mali == NULL) ++ goto cleanup; ++ if (roundtrip(dri2_dpy) < 0 || dri2_dpy->fd == -1) ++ goto cleanup; ++ ++ if (!dri2_dpy->authenticated && ++ (roundtrip(dri2_dpy) < 0 || !dri2_dpy->authenticated)) ++ goto cleanup; ++ } ++ ++ if (dri2_dpy->fd == -1) ++ goto cleanup; ++ + dri2_dpy->fd = loader_get_user_preferred_fd(dri2_dpy->fd, + &dri2_dpy->is_different_gpu); + dev = _eglAddDevice(dri2_dpy->fd, false); diff --git a/src/egl/meson.build.rej b/src/egl/meson.build.rej new file mode 100644 index 00000000000..1056b3fe25d --- /dev/null +++ b/src/egl/meson.build.rej @@ -0,0 +1,19 @@ +diff a/src/egl/meson.build b/src/egl/meson.build (rejected hunks) +@@ -122,14 +122,16 @@ if with_dri2 + endif + if with_platform_wayland + deps_for_egl += [dep_wayland_client, dep_wayland_server, dep_wayland_egl_headers] +- link_for_egl += libwayland_drm ++ link_for_egl += [libwayland_drm, libmali_buffer_sharing] + files_egl += files('drivers/dri2/platform_wayland.c') + files_egl += [ + linux_dmabuf_unstable_v1_protocol_c, + linux_dmabuf_unstable_v1_client_protocol_h, + wayland_drm_client_protocol_h, ++ mali_buffer_sharing_client_protocol_h, + ] + incs_for_egl += include_directories('wayland/wayland-drm') ++ incs_for_egl += include_directories('wayland/mali-buffer-sharing') + endif + if with_platform_android + deps_for_egl += dep_android diff --git 
a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c new file mode 100644 index 00000000000..d3c9a6f0dd2 --- /dev/null +++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c @@ -0,0 +1,170 @@ +/* + * Copyright © 2022 Icecream95 + * Copyright © 2011 Kristian Høgsberg + * Copyright © 2011 Benjamin Franzke + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Kristian Høgsberg + * Benjamin Franzke + */ + +#include +#include +#include +#include +#include + +#include +#include "mali-buffer-sharing.h" +#include "mali-buffer-sharing-server-protocol.h" +#include "wayland-drm-client-protocol.h" + +#define MIN(x,y) (((x)<(y))?(x):(y)) + +static void +destroy_buffer(struct wl_resource *resource) +{ + struct wl_drm_buffer *buffer = wl_resource_get_user_data(resource); + struct wl_drm *drm = buffer->drm; + + drm->callbacks.release_buffer(drm->user_data, buffer); + free(buffer); +} + +static void +buffer_destroy(struct wl_client *client, struct wl_resource *resource) +{ + wl_resource_destroy(resource); +} + +static void +create_buffer(struct wl_client *client, struct wl_resource *resource, + uint32_t id, uint32_t name, int fd, + int32_t width, int32_t height, + uint32_t format, + int32_t offset, int32_t stride) +{ + struct wl_drm *drm = wl_resource_get_user_data(resource); + struct wl_drm_buffer *buffer; + + buffer = calloc(1, sizeof *buffer); + if (buffer == NULL) { + wl_resource_post_no_memory(resource); + return; + } + + buffer->drm = drm; + buffer->width = width; + buffer->height = height; + buffer->format = format; + buffer->offset[0] = offset; + buffer->stride[0] = stride; + + drm->callbacks.reference_buffer(drm->user_data, name, fd, buffer); + if (buffer->driver_buffer == NULL) { + // TODO: We should return an error + return; + } + + buffer->resource = + wl_resource_create(client, &wl_buffer_interface, 1, id); + if (!buffer->resource) { + wl_resource_post_no_memory(resource); + free(buffer); + return; + } + + wl_resource_set_implementation(buffer->resource, + (void (**)(void)) &drm->buffer_interface, + buffer, destroy_buffer); +} + +static void +mali_create_buffer(struct wl_client *client, + struct wl_resource *resource, + uint32_t id, + int32_t width, int32_t height, uint32_t stride, + enum wl_drm_format format, uint32_t unk1, uint32_t unk2, + int fd) +{ + create_buffer(client, resource, id, 0, fd, width, height, format, + 0, stride); + 
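+   /* create_buffer() has already passed the fd to the reference_buffer()
+    * callback, which imports the dma-buf into a driver image without taking
+    * ownership, so the server-side copy is closed below; this mirrors
+    * create_prime_buffer() in wayland-drm.c. */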
close(fd); +} + +static void +mali_auth(struct wl_client *client, + struct wl_resource *resource, uint32_t id) +{ + struct wl_drm *drm = wl_resource_get_user_data(resource); + + drm->callbacks.authenticate(drm->user_data, id); +} + +static const struct mali_buffer_sharing_interface mali_interface = { + mali_create_buffer, + mali_auth, +}; + +static void +bind_mali(struct wl_client *client, void *data, uint32_t version, uint32_t id) +{ + struct wl_drm *drm = data; + struct wl_resource *resource; + + resource = wl_resource_create(client, &mali_buffer_sharing_interface, + MIN(version, 4), id); + if (!resource) { + wl_client_post_no_memory(client); + return; + } + + wl_resource_set_implementation(resource, &mali_interface, data, NULL); + + mali_buffer_sharing_send_alloc_device(resource, drm->device_name); +} + +struct wl_drm * +mali_buffer_sharing_init(struct wl_display *display, char *device_name, + const struct wayland_drm_callbacks *callbacks, void *user_data) +{ + struct wl_drm *drm; + + drm = malloc(sizeof *drm); + if (!drm) + return NULL; + + drm->display = display; + drm->device_name = strdup(device_name ?: ""); + drm->callbacks = *callbacks; + drm->user_data = user_data; + drm->flags = 1; + + drm->buffer_interface.destroy = buffer_destroy; + + drm->wl_drm_global = + wl_global_create(display, &mali_buffer_sharing_interface, 5, + drm, bind_mali); + + return drm; +} diff --git a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h new file mode 100644 index 00000000000..f7f2c4811df --- /dev/null +++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h @@ -0,0 +1,12 @@ +#ifndef MALI_BUFFER_H +#define MALI_BUFFER_H + +#include + +#include "wayland-drm.h" + +struct wl_drm * +mali_buffer_sharing_init(struct wl_display *display, char *device_name, + const struct wayland_drm_callbacks *callbacks, void *user_data); + +#endif diff --git a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml new file mode 100644 index 00000000000..0ad02488118 --- /dev/null +++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml @@ -0,0 +1,50 @@ + + + + + Copyright © 2022 Icecream95 + + Permission to use, copy, modify, distribute, and sell this + software and its documentation for any purpose is hereby granted + without fee, provided that\n the above copyright notice appear in + all copies and that both that copyright notice and this permission + notice appear in supporting documentation, and that the name of + the copyright holders not be used in advertising or publicity + pertaining to distribution of the software without specific, + written prior permission. The copyright holders make no + representations about the suitability of this software for any + purpose. It is provided "as is" without express or implied + warranty. + + THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, + ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF + THIS SOFTWARE. 
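
(For reference, the interface body of this protocol file, as exercised by the
C code elsewhere in this patch, declares a "mali_buffer_sharing" global
advertised at version 5 by the server, with: a create_buffer request taking a
new wl_buffer id, width, height, stride, a wl_drm format, two unknown words
and a dma-buf fd; an auth request taking a DRM magic; and an alloc_device
event carrying the device name.  The exact XML argument names and types are
assumptions.)
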
+ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/egl/wayland/mali-buffer-sharing/meson.build b/src/egl/wayland/mali-buffer-sharing/meson.build new file mode 100644 index 00000000000..0693bf8668c --- /dev/null +++ b/src/egl/wayland/mali-buffer-sharing/meson.build @@ -0,0 +1,51 @@ +# Copyright © 2017 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +inc_mali_buffer_sharing = include_directories('.') + +mali_buffer_sharing_protocol_c = custom_target( + 'mali-buffer-sharing-protocol.c', + input : 'mali-buffer-sharing.xml', + output : 'mali-buffer-sharing-protocol.c', + command : [prog_wl_scanner, wl_scanner_arg, '@INPUT@', '@OUTPUT@'], +) + +mali_buffer_sharing_client_protocol_h = custom_target( + 'mali-buffer-sharing-client-protocol.h', + input : 'mali-buffer-sharing.xml', + output : 'mali-buffer-sharing-client-protocol.h', + command : [prog_wl_scanner, 'client-header', '@INPUT@', '@OUTPUT@'], +) + +mali_buffer_sharing_server_protocol_h = custom_target( + 'mali-buffer-sharing-server-protocol.h', + input : 'mali-buffer-sharing.xml', + output : 'mali-buffer-sharing-server-protocol.h', + command : [prog_wl_scanner, 'server-header', '@INPUT@', '@OUTPUT@'], +) + +libmali_buffer_sharing = static_library( + 'mali-buffer-sharing', + ['mali-buffer-sharing.c', mali_buffer_sharing_protocol_c, mali_buffer_sharing_server_protocol_h, wayland_drm_client_protocol_h], + include_directories : inc_wayland_drm, + gnu_symbol_visibility : 'hidden', + dependencies : [dep_wayland_server], + build_by_default : false, +) diff --git a/src/egl/wayland/wayland-drm/wayland-drm.c.rej b/src/egl/wayland/wayland-drm/wayland-drm.c.rej new file mode 100644 index 00000000000..9016c1f2638 --- /dev/null +++ b/src/egl/wayland/wayland-drm/wayland-drm.c.rej @@ -0,0 +1,10 @@ +diff a/src/egl/wayland/wayland-drm/wayland-drm.c b/src/egl/wayland/wayland-drm/wayland-drm.c (rejected hunks) +@@ -212,7 +212,7 @@ bind_drm(struct wl_client *client, void *data, uint32_t version, uint32_t id) + + wl_resource_set_implementation(resource, &drm_interface, data, NULL); + +- wl_resource_post_event(resource, WL_DRM_DEVICE, drm->device_name); ++ wl_resource_post_event(resource, WL_DRM_DEVICE, "/dev/dri/card0"); + + if (drm->callbacks.is_format_supported(drm->user_data, + WL_DRM_FORMAT_ARGB2101010)) { diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c index fd24be07d2d..6713a04557d 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.c +++ 
b/src/gallium/auxiliary/cso_cache/cso_context.c @@ -1446,6 +1446,11 @@ cso_single_sampler(struct cso_context *cso, enum pipe_shader_type shader_stage, } } +void +cso_set_max_sampler(struct cso_context *ctx, int max_sampler_seen) +{ + ctx->max_sampler_seen = max_sampler_seen; +} /** * Send staged sampler state to the driver. diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h index f796310d39b..87a27597097 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.h +++ b/src/gallium/auxiliary/cso_cache/cso_context.h @@ -84,6 +84,9 @@ void cso_single_sampler(struct cso_context *cso, enum pipe_shader_type shader_stage, unsigned idx, const struct pipe_sampler_state *states); +void +cso_set_max_sampler(struct cso_context *ctx, int max_sampler_seen); + void cso_single_sampler_done(struct cso_context *cso, enum pipe_shader_type shader_stage); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c.rej b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c.rej new file mode 100644 index 00000000000..5c8f600a0b7 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c.rej @@ -0,0 +1,19 @@ +diff a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c (rejected hunks) +@@ -1027,7 +1027,7 @@ static void emit_atomic_global(struct lp_build_nir_context *bld_base, + case nir_intrinsic_global_atomic_fadd: + op = LLVMAtomicRMWBinOpFAdd; + break; +-#if LLVM_VERSION_MAJOR >= 15 ++#if LLVM_VERSION_MAJOR >= 16 + case nir_intrinsic_global_atomic_fmin: + op = LLVMAtomicRMWBinOpFMin; + break; +@@ -1542,7 +1542,7 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base, + case nir_intrinsic_ssbo_atomic_fadd: + op = LLVMAtomicRMWBinOpFAdd; + break; +-#if LLVM_VERSION_MAJOR >= 15 ++#if LLVM_VERSION_MAJOR >= 16 + case nir_intrinsic_shared_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmin: + op = LLVMAtomicRMWBinOpFMin; diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c.rej b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c.rej new file mode 100644 index 00000000000..81d28aa4e6b --- /dev/null +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c.rej @@ -0,0 +1,10 @@ +diff a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c (rejected hunks) +@@ -189,7 +189,7 @@ pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd) + int new_fd; + + if (fd < 0 || (new_fd = os_dupfd_cloexec(fd)) < 0) +- return false; ++ return false; + + ret = pipe_loader_drm_probe_fd_nodup(dev, new_fd); + if (!ret) diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h.rej b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h.rej new file mode 100644 index 00000000000..fd803af37c0 --- /dev/null +++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h.rej @@ -0,0 +1,43 @@ +diff a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h (rejected hunks) +@@ -8,6 +8,10 @@ + #include "frontend/sw_winsys.h" + #include "target-helpers/inline_debug_helper.h" + ++#include ++#include ++#include ++ + /* Helper function to choose and instantiate one of the software rasterizers: + * llvmpipe, softpipe. 
+ */ +@@ -33,6 +37,10 @@ + #include "asahi/agx_public.h" + #endif + ++#if defined(GALLIUM_PANFROST) ++#include "panfrost/pan_public.h" ++#endif ++ + static inline struct pipe_screen * + sw_screen_create_named(struct sw_winsys *winsys, const char *driver) + { +@@ -71,6 +79,19 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver) + screen = agx_screen_create(0, NULL, winsys); + #endif + ++#if defined(GALLIUM_PANFROST) ++ if(screen == NULL && strcmp(driver, "panfrost") == 0) { ++ int kbase_device_fd = open("/dev/mali0", O_RDWR | O_CLOEXEC | O_NONBLOCK); ++ if(kbase_device_fd == -1) { ++ printf("PAN_OSMESA: Failed to open kbase device: %s", strerror(errno)); ++ }else { ++ screen = panfrost_create_screen(kbase_device_fd, NULL); ++ } ++ } ++#else ++#error You forgot to include Panfrost ++#endif ++ + return screen ? debug_screen_wrap(screen) : NULL; + } + diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index 987475c76df..aa0db97ea65 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -47,6 +47,7 @@ panfrost_includes = [ inc_include, inc_src, inc_panfrost, + inc_panfrost_hw, ] compile_args_panfrost = [ diff --git a/src/gallium/drivers/panfrost/meson.build.rej b/src/gallium/drivers/panfrost/meson.build.rej new file mode 100644 index 00000000000..508ebed2cd9 --- /dev/null +++ b/src/gallium/drivers/panfrost/meson.build.rej @@ -0,0 +1,10 @@ +diff a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build (rejected hunks) +@@ -51,7 +52,7 @@ compile_args_panfrost = [ + '-Wno-pointer-arith' + ] + +-panfrost_versions = ['4', '5', '6', '7', '9'] ++panfrost_versions = ['4', '5', '6', '7', '9', '10'] + libpanfrost_versions = [] + + foreach ver : panfrost_versions diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c.rej b/src/gallium/drivers/panfrost/pan_cmdstream.c.rej new file mode 100644 index 00000000000..fd0f475f81b --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c.rej @@ -0,0 +1,1186 @@ +diff a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c (rejected hunks) +@@ -23,12 +23,15 @@ + * SOFTWARE. + */ + ++#include "dma-uapi/dma-buf.h" ++ + #include "util/macros.h" + #include "util/u_prim.h" + #include "util/u_vbuf.h" + #include "util/u_helpers.h" + #include "util/u_draw.h" + #include "util/u_memory.h" ++#include "util/u_viewport.h" + #include "pipe/p_defines.h" + #include "pipe/p_state.h" + #include "gallium/auxiliary/util/u_blend.h" +@@ -749,8 +752,8 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]); + float vp_miny = vp->translate[1] - fabsf(vp->scale[1]); + float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]); +- float minz = (vp->translate[2] - fabsf(vp->scale[2])); +- float maxz = (vp->translate[2] + fabsf(vp->scale[2])); ++ float minz, maxz; ++ util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz); + + /* Scissor to the intersection of viewport and to the scissor, clamped + * to the framebuffer */ +@@ -778,10 +781,16 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + maxx--; + maxy--; + +- batch->minimum_z = rast->depth_clip_near ? minz : -INFINITY; +- batch->maximum_z = rast->depth_clip_far ? maxz : +INFINITY; +- + #if PAN_ARCH <= 7 ++ /* Proper depth clamp support was only introduced in v9, before then ++ * all that can be done is disabling clipping by adjusting the ++ * viewport. 
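++ * (Concretely, the code below pushes minz/maxz to -/+INFINITY when
++ * depth_clip_near/far are disabled, so fragments are kept but their depth
++ * is never clamped.)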
This means that the result will be wrong for float depth ++ * buffers or non-[0, 1] depth range. */ ++ if (!rast->depth_clip_near) ++ minz = -INFINITY; ++ if (!rast->depth_clip_far) ++ maxz = +INFINITY; ++ + struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT); + + pan_pack(T.cpu, VIEWPORT, cfg) { +@@ -790,19 +799,22 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + +- cfg.minimum_z = batch->minimum_z; +- cfg.maximum_z = batch->maximum_z; ++ cfg.minimum_z = minz; ++ cfg.maximum_z = maxz; + } + + return T.gpu; + #else +- pan_pack(&batch->scissor, SCISSOR, cfg) { ++ pan_pack_cs_v10(&batch->scissor, &batch->cs_vertex, SCISSOR, cfg) { + cfg.scissor_minimum_x = minx; + cfg.scissor_minimum_y = miny; + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + } + ++ batch->minimum_z = minz; ++ batch->maximum_z = maxz; ++ + return 0; + #endif + } +@@ -838,6 +850,14 @@ panfrost_emit_depth_stencil(struct panfrost_batch *batch) + cfg.depth_units = rast->base.offset_units * 2.0f; + cfg.depth_factor = rast->base.offset_scale; + cfg.depth_bias_clamp = rast->base.offset_clamp; ++ ++ if (rast->base.depth_clip_near && rast->base.depth_clip_far) { ++ cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_0_1; ++ cfg.depth_cull_enable = true; ++ } else { ++ cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_BOUNDS; ++ cfg.depth_cull_enable = false; ++ } + } + + pan_merge(dynamic, zsa->desc, DEPTH_STENCIL); +@@ -1482,9 +1502,17 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count; + struct panfrost_ptr transfer = + pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16); ++ void *sys_cpu = malloc(sys_size); ++ ++ /* Write to a shadow buffer to make pushing cheaper */ ++ struct panfrost_ptr sys_shadow = { ++ .cpu = sys_cpu, ++ .gpu = transfer.gpu, ++ }; + + /* Upload sysvals requested by the shader */ +- panfrost_upload_sysvals(batch, &transfer, ss, stage); ++ panfrost_upload_sysvals(batch, &sys_shadow, ss, stage); ++ memcpy(transfer.cpu, sys_cpu, sys_size); + + /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */ + struct panfrost_compiled_shader *shader = ctx->prog[stage]; +@@ -1527,8 +1555,10 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + if (pushed_words) + *pushed_words = ss->info.push.count; + +- if (ss->info.push.count == 0) ++ if (ss->info.push.count == 0) { ++ free(sys_cpu); + return ubos.gpu; ++ } + + /* Copy push constants required by the shader */ + struct panfrost_ptr push_transfer = +@@ -1580,13 +1610,15 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + * off to upload sysvals to a staging buffer on the CPU on the + * assumption sysvals will get pushed (TODO) */ + +- const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu : ++ const void *mapped_ubo = (src.ubo == sysval_ubo) ? 
sys_cpu : + panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); + + /* TODO: Is there any benefit to combining ranges */ + memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4); + } + ++ free(sys_cpu); ++ + return ubos.gpu; + } + +@@ -2777,6 +2809,385 @@ emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb) + return transfer.gpu; + } + ++#if PAN_ARCH >= 10 ++ ++static int ++panfrost_export_dmabuf_fence(int dmabuf) ++{ ++ struct dma_buf_export_sync_file export = { ++ .flags = DMA_BUF_SYNC_RW, ++ }; ++ ++ int err = drmIoctl(dmabuf, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &export); ++ if (err < 0) { ++ fprintf(stderr, "failed to export fence: %s\n", ++ strerror(errno)); ++ return -1; ++ } ++ ++ return export.fd; ++} ++ ++static bool ++panfrost_import_dmabuf_fence(int dmabuf, int fence) ++{ ++ struct dma_buf_import_sync_file import = { ++ .flags = DMA_BUF_SYNC_RW, ++ .fd = fence, ++ }; ++ ++ int err = drmIoctl(dmabuf, DMA_BUF_IOCTL_IMPORT_SYNC_FILE, &import); ++ if (err < 0) { ++ fprintf(stderr, "failed to import fence: %s\n", ++ strerror(errno)); ++ return false; ++ } ++ ++ return true; ++} ++ ++static uint64_t * ++panfrost_cs_ring_allocate_instrs(struct panfrost_cs *cs, unsigned count) ++{ ++ pan_command_stream c = cs->cs; ++ ++ if (c.ptr + count > c.end) { ++ assert(c.ptr <= c.end); ++ assert(c.begin + count <= c.ptr); ++ ++ /* Instructions are in a ring buffer, simply NOP out the end ++ * and start back from the start. Possibly, doing a TAILCALL ++ * straight to the start could also work. */ ++ memset(c.ptr, 0, (c.end - c.ptr) * 8); ++ c.ptr = c.begin; ++ ++ cs->offset += cs->base.size; ++ cs->cs = c; ++ } ++ ++ /* TODO: Check against the extract offset */ ++ return c.ptr + count; ++} ++ ++// TODO: Rewrite this! ++static void ++emit_csf_queue(struct panfrost_batch *batch, struct panfrost_cs *cs, ++ pan_command_stream s, struct util_dynarray *deps, ++ bool first, bool last) ++{ ++ struct panfrost_device *dev = pan_device(batch->ctx->base.screen); ++ ++ assert(s.ptr <= s.end); ++ ++ bool fragment = (cs->hw_resources & 2); ++ bool vertex = (cs->hw_resources & 12); /* TILER | IDVS */ ++ ++ uint64_t *limit = panfrost_cs_ring_allocate_instrs(cs, ++ 128 + util_dynarray_num_elements(deps, struct panfrost_usage) * 4); ++ ++ pan_command_stream *c = &cs->cs; ++ ++ /* First, do some waiting at the start of the job */ ++ ++ pan_emit_cs_32(c, 0x54, *cs->base.latest_flush); ++ // TODO genxmlify ++ pan_emit_cs_ins(c, 0x24, 0x540000000233ULL); ++ // TODO: What does this need to be? ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 0xff; } ++ ++ /* For the first job in the batch, wait on dependencies */ ++ // TODO: Usually the vertex job shouldn't have to wait for dmabufs! ++ if (first) { ++ mali_ptr seqnum_ptr_base = dev->mali.event_mem.gpu; ++ ++ util_dynarray_foreach(deps, struct panfrost_usage, u) { ++ /* Note the multiplication in the call to ++ * cs_ring_allocate_instrs. pan_emit_cs_64 might be ++ * split, so the total is four instructions. */ ++ pan_emit_cs_48(c, 0x42, seqnum_ptr_base + ++ u->queue * PAN_EVENT_SIZE); ++ pan_emit_cs_64(c, 0x40, u->seqnum); ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.no_error = true; ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ } ++ ++ uint64_t kcpu_seqnum = ++cs->kcpu_seqnum; ++ ++ util_dynarray_foreach(&batch->dmabufs, int, fd) { ++ int fence = panfrost_export_dmabuf_fence(*fd); ++ ++ /* TODO: poll on the dma-buf? 
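++ * (Polling the dma-buf fd with poll(2) would block on the same implicit
++ * fences from the CPU side, which could serve as a fallback when the
++ * sync-file export above fails.)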
*/ ++ if (fence == -1) ++ continue; ++ ++ // TODO: What if we reach the limit for number of KCPU ++ // commands in a queue? It's pretty low (256) ++ dev->mali.kcpu_fence_import(&dev->mali, cs->base.ctx, ++ fence); ++ ++ close(fence); ++ } ++ ++ bool ret = dev->mali.kcpu_cqs_set(&dev->mali, cs->base.ctx, ++ cs->kcpu_event_ptr, kcpu_seqnum + 1); ++ ++ if (ret) { ++ /* If we don't set no_error, kbase might decide to ++ * pass on errors from waiting for fences. */ ++ pan_emit_cs_48(c, 0x42, cs->kcpu_event_ptr); ++ pan_emit_cs_64(c, 0x40, kcpu_seqnum); ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.no_error = true; ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ } ++ } ++ ++ /* Fragment jobs need to wait for the vertex job */ ++ if (fragment && !first) { ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x4e; ++ cfg.addr = 0x4c; ++ } ++ } ++ ++ if (vertex) { ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 3; } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 3; } ++ pan_pack_ins(c, CS_HEAPINC, cfg) { ++ cfg.type = MALI_HEAP_STATISTIC_V_T_START; ++ } ++ } else if (fragment) { ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 4; } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 4; } ++ } ++ ++ // copying to the main buffer can make debugging easier. ++ // TODO: This needs to be more reliable. ++#if 0 ++ unsigned length = (s.ptr - s.begin) * 8; ++ unsigned clamped = MIN2(length, cs->bo->ptr.cpu + cs->bo->size - (void *)c->ptr); ++ memcpy(c->ptr, s->begin, clamped); ++ c->ptr += clamped / 8; ++ ++ if (clamped != length) { ++ unsigned rest = length - clamped; ++ c->ptr = cs->bo->ptr.cpu; ++ memcpy(c->ptr, s->begin, rest); ++ c->ptr += rest / 8; ++ ++ cs->offset += cs->bo->size; ++ } ++#else ++ ++ pan_emit_cs_48(c, 0x48, s.gpu); ++ pan_emit_cs_32(c, 0x4a, (s.ptr - s.begin) * 8); ++ pan_pack_ins(c, CS_CALL, cfg) { cfg.address = 0x48; cfg.length = 0x4a; } ++#endif ++ ++ if (vertex) { ++ pan_pack_ins(c, CS_FLUSH_TILER, _) { } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 3; } ++ pan_pack_ins(c, CS_HEAPINC, cfg) { ++ cfg.type = MALI_HEAP_STATISTIC_V_T_END; ++ } ++ } ++ ++ if (fragment) { ++ /* Skip the next operation if the batch doesn't use a tiler ++ * heap (i.e. 
it's just a blit) */ ++ pan_emit_cs_ins(c, 22, 0x560030000001); /* b.ne w56, skip 1 */ ++ pan_emit_cs_ins(c, 22, 0x570020000007); /* b.eq w57, skip 7 */ ++ ++ pan_pack_ins(c, CS_LDR, cfg) { ++ cfg.offset = 4 * 10; /* Heap Start */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ pan_pack_ins(c, CS_LDR, cfg) { ++ cfg.offset = 4 * 12; /* Heap End */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4c; ++ } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = (1 << 0) | (1 << 3); } ++ ++ pan_pack_ins(c, CS_HEAPCLEAR, cfg) { ++ cfg.start = 0x4a; ++ cfg.end = 0x4c; ++ cfg.slots = 1 << 3; ++ } ++ ++ /* Reset the fields so that the clear operation isn't done again */ ++ pan_emit_cs_48(c, 0x4a, 0); ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.offset = 4 * 10; /* Heap Start */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.offset = 4 * 12; /* Heap End */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ ++ /* Branch target for above branch */ ++ ++ // This seems to be done by the HEAPCLEAR ++ //pan_pack_ins(c, CS_HEAPINC, cfg) { ++ // cfg.type = MALI_HEAP_STATISTIC_FRAGMENT_END; ++ //} ++ } ++ ++ if (fragment) { ++ pan_emit_cs_32(c, 0x54, 0); ++ pan_emit_cs_ins(c, 0x24, 0x2540000f80211); ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 1; } ++ } ++ ++ { ++ // This could I think be optimised to 0xf80211 rather than 0x233 ++ // TODO: Does this need to run for vertex jobs? ++ // What about when doing transform feedback? ++ // I think we at least need it for compute? ++ ++ //pan_emit_cs_32(c, 0x54, 0); ++ //pan_emit_cs_ins(c, 0x24, 0x540000000233ULL); ++ } ++ ++ if (last) { ++ uint64_t kcpu_seqnum = ++cs->kcpu_seqnum; ++ ++ pan_emit_cs_64(c, 0x40, kcpu_seqnum + 1); ++ pan_emit_cs_48(c, 0x42, cs->kcpu_event_ptr); ++ pan_pack_ins(c, CS_EVSTR_64, cfg) { ++ /* This is the scoreboard mask, right?.. */ ++ cfg.unk_2 = (3 << 3); ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ ++ dev->mali.kcpu_cqs_wait(&dev->mali, cs->base.ctx, ++ cs->kcpu_event_ptr, kcpu_seqnum); ++ ++ int fence = dev->mali.kcpu_fence_export(&dev->mali, cs->base.ctx); ++ ++ if (fence != -1) { ++ util_dynarray_foreach(&batch->dmabufs, int, fd) { ++ panfrost_import_dmabuf_fence(*fd, fence); ++ } ++ } ++ ++ close(fence); ++ } ++ ++ pan_emit_cs_48(c, 0x48, cs->event_ptr); ++ pan_emit_cs_64(c, 0x4a, cs->seqnum + 1); ++ pan_pack_ins(c, CS_EVSTR_64, cfg) { ++ /* This is the scoreboard mask, right?.. */ ++ cfg.unk_2 = (3 << 3); ++ cfg.value = 0x4a; ++ cfg.addr = 0x48; ++ } ++ ++ // TODO: is this just a weird ddk thing, or is it required? ++ // Probably it just lessens the WC impact ++ while ((uintptr_t)c->ptr & 63) ++ pan_emit_cs_ins(c, 0, 0); ++ ++ assert(c->ptr <= limit); ++} ++ ++static void ++emit_csf_toplevel(struct panfrost_batch *batch) ++{ ++ pan_command_stream *cv = &batch->ctx->kbase_cs_vertex.cs; ++ pan_command_stream *cf = &batch->ctx->kbase_cs_fragment.cs; ++ ++ pan_command_stream v = batch->cs_vertex; ++ pan_command_stream f = batch->cs_fragment; ++ ++ if (batch->cs_vertex_last_size) { ++ assert(v.ptr <= v.end); ++ *batch->cs_vertex_last_size = (v.ptr - v.begin) * 8; ++ v = batch->cs_vertex_first; ++ } ++ ++ bool vert = (v.ptr != v.begin); ++ bool frag = (f.ptr != f.begin); ++ ++ // TODO: Clean up control-flow? 
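++ /* Flow of the two queues below, as read from the code around this point:
++  * the vertex/tiler stream is emitted first and stores its bumped event
++  * seqnum when it completes; the fragment stream is handed that seqnum and
++  * its address in registers 0x4e/0x4c, which the CS_EVWAIT_64 in
++  * emit_csf_queue() uses to wait for the vertex work before any fragment
++  * work starts.  A vertex-only batch skips the fragment queue entirely, and
++  * a fragment-only batch (e.g. a blit) skips the wait. */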
++ ++ if (vert) { ++ pan_emit_cs_48(cv, 0x48, batch->ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(cv, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ ++ emit_csf_queue(batch, &batch->ctx->kbase_cs_vertex, v, ++ &batch->vert_deps, true, !frag); ++ } ++ ++ if (!frag) ++ return; ++ ++ pan_emit_cs_48(cf, 0x48, batch->ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(cf, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ ++ uint64_t vertex_seqnum = batch->ctx->kbase_cs_vertex.seqnum; ++ // TODO: this assumes SAME_VA ++ mali_ptr seqnum_ptr = (uintptr_t) batch->ctx->kbase_cs_vertex.event_ptr; ++ ++ pan_emit_cs_48(cf, 0x4c, seqnum_ptr); ++ pan_emit_cs_64(cf, 0x4e, vertex_seqnum); ++ ++ // What does this instruction do? ++ //pan_emit_cs_32(cf, 0x54, 0); ++ //pan_emit_cs_ins(cf, 0x24, 0x540000000200); ++ ++ assert(vert || batch->tiler_ctx.bifrost == 0); ++ pan_emit_cs_48(cf, 0x56, batch->tiler_ctx.bifrost); ++ ++ emit_csf_queue(batch, &batch->ctx->kbase_cs_fragment, f, ++ &batch->frag_deps, !vert, true); ++} ++ ++static void ++init_cs(struct panfrost_context *ctx, struct panfrost_cs *cs) ++{ ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ pan_command_stream *c = &cs->cs; ++ ++ cs->seqnum = 0; ++ ++ cs->offset = 0; ++ c->ptr = cs->bo->ptr.cpu; ++ c->begin = cs->bo->ptr.cpu; ++ c->end = cs->bo->ptr.cpu + cs->base.size; ++ c->gpu = cs->bo->ptr.gpu; ++ ++ // eight instructions == 64 bytes ++ pan_pack_ins(c, CS_RESOURCES, cfg) { cfg.mask = cs->hw_resources; } ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 2; } ++ pan_emit_cs_48(c, 0x48, ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(c, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ for (unsigned i = 0; i < 4; ++i) ++ pan_pack_ins(c, CS_NOP, _); ++ ++ dev->mali.cs_submit(&dev->mali, &cs->base, 64, NULL, 0); ++ //dev->mali.cs_wait(&dev->mali, &cs->base, 64); ++} ++ ++#endif ++ + #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c; + + static uint8_t +@@ -2904,14 +3315,14 @@ panfrost_draw_emit_vertex(struct panfrost_batch *batch, + #endif + + static void +-panfrost_emit_primitive_size(struct panfrost_context *ctx, ++panfrost_emit_primitive_size(struct panfrost_batch *batch, + bool points, mali_ptr size_array, + void *prim_size) + { +- struct panfrost_rasterizer *rast = ctx->rasterizer; ++ struct panfrost_rasterizer *rast = batch->ctx->rasterizer; + +- pan_pack(prim_size, PRIMITIVE_SIZE, cfg) { +- if (panfrost_writes_point_size(ctx)) { ++ pan_pack_cs_v10(prim_size, &batch->cs_vertex, PRIMITIVE_SIZE, cfg) { ++ if (panfrost_writes_point_size(batch->ctx)) { + cfg.size_array = size_array; + } else { + cfg.constant = points ? 
+@@ -3037,6 +3448,43 @@ panfrost_update_state_3d(struct panfrost_batch *batch) + } + + #if PAN_ARCH >= 6 ++ ++#if PAN_ARCH >= 10 ++static mali_ptr ++panfrost_get_tiler_heap_desc(struct panfrost_batch *batch) ++{ ++ struct panfrost_context *ctx = batch->ctx; ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ ++ if (ctx->tiler_heap_desc) ++ return ctx->tiler_heap_desc->ptr.gpu; ++ ++ ctx->tiler_heap_desc = panfrost_bo_create(dev, 4096, 0, "Tiler heap descriptor"); ++ ++ pan_pack(ctx->tiler_heap_desc->ptr.cpu, TILER_HEAP, heap) { ++ heap.size = ctx->kbase_ctx->tiler_heap_chunk_size; ++ heap.base = ctx->kbase_ctx->tiler_heap_header; ++ heap.bottom = heap.base + 64; ++ heap.top = heap.base + heap.size; ++ } ++ ++ return ctx->tiler_heap_desc->ptr.gpu; ++} ++#else ++static mali_ptr ++panfrost_get_tiler_heap_desc(struct panfrost_batch *batch) ++{ ++ struct panfrost_device *dev = pan_device(batch->ctx->base.screen); ++ ++ struct panfrost_ptr t = ++ pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); ++ ++ GENX(pan_emit_tiler_heap)(dev, t.cpu); ++ ++ return t.gpu; ++} ++#endif ++ + static mali_ptr + panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count) + { +@@ -3048,18 +3496,32 @@ panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_c + if (batch->tiler_ctx.bifrost) + return batch->tiler_ctx.bifrost; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); ++ mali_ptr heap = panfrost_get_tiler_heap_desc(batch); + +- GENX(pan_emit_tiler_heap)(dev, t.cpu); ++ mali_ptr scratch = 0; ++ ++#if PAN_ARCH >= 10 ++ // TODO: Dynamically size? ++ unsigned scratch_bits = 16; ++ ++ /* Allocate scratch space for vertex positions / point sizes */ ++ // TODO: Should this be shared? ++ struct panfrost_ptr sc = ++ pan_pool_alloc_aligned(&batch->pool.base, 1 << scratch_bits, 4096); ++ ++ /* I think the scratch size is passed in the low bits of the ++ * pointer... but trying to go above 16 gives a CS_INHERIT_FAULT. 
++ */ ++ scratch = sc.gpu + scratch_bits; ++#endif + +- mali_ptr heap = t.gpu; ++ struct panfrost_ptr t = ++ pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); + +- t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); + GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height, + util_framebuffer_get_num_samples(&batch->key), + pan_tristate_get(batch->first_provoking_vertex), +- heap, t.cpu); ++ heap, scratch, t.cpu); + + batch->tiler_ctx.bifrost = t.gpu; + return batch->tiler_ctx.bifrost; +@@ -3070,18 +3532,19 @@ panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_c + * jobs and Valhall IDVS jobs + */ + static void +-panfrost_emit_primitive(struct panfrost_context *ctx, ++panfrost_emit_primitive(struct panfrost_batch *batch, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw, + mali_ptr indices, bool secondary_shader, void *out) + { +- UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; ++ struct panfrost_context *ctx = batch->ctx; ++ struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + + bool lines = (info->mode == PIPE_PRIM_LINES || + info->mode == PIPE_PRIM_LINE_LOOP || + info->mode == PIPE_PRIM_LINE_STRIP); + +- pan_pack(out, PRIMITIVE, cfg) { ++ pan_pack_cs_v10(out, &batch->cs_vertex, PRIMITIVE, cfg) { + cfg.draw_mode = pan_draw_mode(info->mode); + if (panfrost_writes_point_size(ctx)) + cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16; +@@ -3113,12 +3576,20 @@ panfrost_emit_primitive(struct panfrost_context *ctx, + + /* Non-fixed restart indices should have been lowered */ + assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info)); ++ ++ /* TODO: This is in a hot function, optimise? */ ++ if (ctx->pipe_viewport.scale[2] > 0) { ++ cfg.low_depth_cull = rast->depth_clip_near; ++ cfg.high_depth_cull = rast->depth_clip_far; ++ } else { ++ cfg.low_depth_cull = rast->depth_clip_far; ++ cfg.high_depth_cull = rast->depth_clip_near; ++ } + #endif + + cfg.index_count = ctx->indirect_draw ? 1 : draw->count; + cfg.index_type = panfrost_translate_index_size(info->index_size); + +- + if (PAN_ARCH >= 9) { + /* Base vertex offset on Valhall is used for both + * indexed and non-indexed draws, in a simple way for +@@ -3240,7 +3711,7 @@ panfrost_emit_draw(void *out, + struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + bool polygon = (prim == PIPE_PRIM_TRIANGLES); + +- pan_pack(out, DRAW, cfg) { ++ pan_pack_cs_v10(out, &batch->cs_vertex, DRAW, cfg) { + /* + * From the Gallium documentation, + * pipe_rasterizer_state::cull_face "indicates which faces of +@@ -3270,6 +3741,7 @@ panfrost_emit_draw(void *out, + ctx->prog[PIPE_SHADER_FRAGMENT]; + + cfg.multisample_enable = rast->multisample; ++ + cfg.sample_mask = rast->multisample ? 
ctx->sample_mask : 0xFFFF; + + /* Use per-sample shading if required by API Also use it when a +@@ -3283,7 +3755,10 @@ panfrost_emit_draw(void *out, + + cfg.single_sampled_lines = !rast->multisample; + ++ /* This is filled in by hardware on v10 */ ++#if PAN_ARCH < 10 + cfg.vertex_array.packet = true; ++#endif + + cfg.minimum_z = batch->minimum_z; + cfg.maximum_z = batch->maximum_z; +@@ -3411,14 +3886,18 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + */ + secondary_shader &= fs_required; + +- panfrost_emit_primitive(ctx, info, draw, 0, secondary_shader, ++#if PAN_ARCH < 10 ++ panfrost_emit_primitive(batch, info, draw, 0, secondary_shader, + pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE)); ++#else ++ panfrost_emit_primitive(batch, info, draw, 0, secondary_shader, job); ++#endif + +- pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { + cfg.count = info->instance_count; + } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { + if (secondary_shader) { + unsigned v = vs->info.varyings.output_count; + unsigned f = fs->info.varyings.input_count; +@@ -3427,34 +3906,45 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + unsigned size = slots * 16; + + /* Assumes 16 byte slots. We could do better. */ ++#if PAN_ARCH < 10 + cfg.vertex_packet_stride = size + 16; ++#endif + cfg.vertex_attribute_stride = size; + } else { + /* Hardware requirement for "no varyings" */ ++#if PAN_ARCH < 10 + cfg.vertex_packet_stride = 16; ++#endif + cfg.vertex_attribute_stride = 0; + } + } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, TILER, cfg) { + cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); + } + ++ /* For v10, the scissor is emitted directly by ++ * panfrost_emit_viewport */ ++#if PAN_ARCH < 10 + STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR)); + memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), + &batch->scissor, pan_size(SCISSOR)); ++#endif + +- panfrost_emit_primitive_size(ctx, info->mode == PIPE_PRIM_POINTS, 0, ++ panfrost_emit_primitive_size(batch, info->mode == PIPE_PRIM_POINTS, 0, + pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE)); + +- pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, INDICES, cfg) { + cfg.address = indices; ++#if PAN_ARCH >= 10 ++ cfg.size = draw->count * info->index_size; ++#endif + } + + panfrost_emit_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), + batch, fs_required, u_reduced_prim(info->mode), 0, 0, 0); + +- pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, POSITION, cfg) { + /* IDVS/points vertex shader */ + mali_ptr vs_ptr = batch->rsd[PIPE_SHADER_VERTEX]; + +@@ -3464,20 +3954,21 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + + panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, vs_ptr, + batch->tls.gpu); +- } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) { +- /* If a varying shader is used, we configure it with the same +- * state as the position shader for backwards compatible +- * behaviour with Bifrost. This could be optimized. 
+- */ +- if (!secondary_shader) continue; ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, VARYING, vary) { ++ /* If a varying shader is used, we configure it with the same ++ * state as the position shader for backwards compatible ++ * behaviour with Bifrost. This could be optimized. ++ */ ++ if (!secondary_shader) continue; + +- mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] + ++ mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] + + (2 * pan_size(SHADER_PROGRAM)); + +- panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, +- ptr, batch->tls.gpu); ++ vary.shader = ptr; ++ ++ // TODO: Fix this function for v9! ++ } + } + } + #endif +@@ -3492,12 +3983,10 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch, + mali_ptr pos, mali_ptr psiz, bool secondary_shader, + void *job) + { +- struct panfrost_context *ctx = batch->ctx; +- + void *section = pan_section_ptr(job, TILER_JOB, INVOCATION); + memcpy(section, invocation_template, pan_size(INVOCATION)); + +- panfrost_emit_primitive(ctx, info, draw, indices, secondary_shader, ++ panfrost_emit_primitive(batch, info, draw, indices, secondary_shader, + pan_section_ptr(job, TILER_JOB, PRIMITIVE)); + + void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE); +@@ -3514,7 +4003,7 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch, + panfrost_emit_draw(pan_section_ptr(job, TILER_JOB, DRAW), + batch, true, prim, pos, fs_vary, varyings); + +- panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size); ++ panfrost_emit_primitive_size(batch, prim == PIPE_PRIM_POINTS, psiz, prim_size); + } + #endif + +@@ -3526,8 +4015,8 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + { + struct panfrost_context *ctx = batch->ctx; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); ++ UNUSED struct panfrost_ptr t = ++ pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); + + /* Nothing to do */ + if (batch->ctx->streamout.num_targets == 0) +@@ -3556,7 +4045,7 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->rsd[PIPE_SHADER_VERTEX] = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX); + + #if PAN_ARCH >= 9 +- pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { ++ pan_section_pack_cs_v10(t.cpu, &batch->cs_vertex, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; +@@ -3569,15 +4058,20 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->rsd[PIPE_SHADER_VERTEX], + batch->tls.gpu); + ++#if PAN_ARCH < 10 + /* TODO: Indexing. Also, this is a legacy feature... */ + cfg.compute.attribute_offset = batch->ctx->offset_start; ++#endif + + /* Transform feedback shaders do not use barriers or shared + * memory, so we may merge workgroups. + */ + cfg.allow_merging_workgroups = true; ++ ++#if PAN_ARCH < 10 + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; ++#endif + } + #else + struct mali_invocation_packed invocation; +@@ -3593,12 +4087,20 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0, + attribs, attrib_bufs, t.cpu); + #endif ++#if PAN_ARCH >= 10 ++ // TODO: Use a seperate compute queue? 
++ pan_pack_ins(&batch->cs_vertex, COMPUTE_LAUNCH, cfg) { ++ // TODO v10: Set parameters ++ } ++ batch->scoreboard.first_job = 1; ++#else + enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE; + #if PAN_ARCH <= 5 + job_type = MALI_JOB_TYPE_VERTEX; + #endif + panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type, + true, false, 0, 0, &t, false); ++#endif + + ctx->uncompiled[PIPE_SHADER_VERTEX] = vs_uncompiled; + ctx->prog[PIPE_SHADER_VERTEX] = vs; +@@ -3607,6 +4109,54 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push; + } + ++#if PAN_ARCH >= 10 ++static pan_command_stream ++panfrost_batch_create_cs(struct panfrost_batch *batch, unsigned count) ++{ ++ struct panfrost_ptr cs = pan_pool_alloc_aligned(&batch->pool.base, count * 8, 64); ++ ++ return (pan_command_stream) { ++ .ptr = cs.cpu, ++ .begin = cs.cpu, ++ .end = cs.cpu + count, ++ .gpu = cs.gpu, ++ }; ++} ++ ++static uint64_t * ++panfrost_cs_vertex_allocate_instrs(struct panfrost_batch *batch, unsigned count) ++{ ++ /* Doing a tail call to another buffer takes three instructions */ ++ count += 3; ++ ++ pan_command_stream v = batch->cs_vertex; ++ ++ if (v.ptr + count > v.end) { ++ batch->cs_vertex = panfrost_batch_create_cs(batch, MAX2(count, 1 << 13)); ++ ++ /* The size will be filled in later. */ ++ uint32_t *last_size = (uint32_t *)v.ptr; ++ pan_emit_cs_32(&v, 0x5e, 0); ++ ++ pan_emit_cs_48(&v, 0x5c, batch->cs_vertex.gpu); ++ pan_pack_ins(&v, CS_TAILCALL, cfg) { cfg.address = 0x5c; cfg.length = 0x5e; } ++ ++ assert(v.ptr <= v.end); ++ ++ /* This is not strictly required, but makes disassembly look ++ * nicer */ ++ if (batch->cs_vertex_last_size) ++ *batch->cs_vertex_last_size = (v.ptr - v.begin) * 8; ++ ++ batch->cs_vertex_last_size = last_size; ++ if (!batch->cs_vertex_first.gpu) ++ batch->cs_vertex_first = v; ++ } ++ ++ return batch->cs_vertex.ptr + count; ++} ++#endif ++ + static void + panfrost_direct_draw(struct panfrost_batch *batch, + const struct pipe_draw_info *info, +@@ -3618,6 +4168,11 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + struct panfrost_context *ctx = batch->ctx; + ++#if PAN_ARCH >= 10 ++ /* TODO: We don't need quite so much space */ ++ uint64_t *limit = panfrost_cs_vertex_allocate_instrs(batch, 64); ++#endif ++ + /* If we change whether we're drawing points, or whether point sprites + * are enabled (specified in the rasterizer), we may need to rebind + * shaders accordingly. 
This implicitly covers the case of rebinding +@@ -3647,18 +4202,19 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + UNUSED struct panfrost_ptr tiler, vertex; + +- if (idvs) { + #if PAN_ARCH >= 9 +- tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB); +-#elif PAN_ARCH >= 6 ++ tiler = pan_pool_alloc_desc_cs_v10(&batch->pool.base, MALLOC_VERTEX_JOB); ++#else /* PAN_ARCH < 9 */ ++ if (idvs) { ++#if PAN_ARCH >= 6 + tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB); +-#else +- unreachable("IDVS is unsupported on Midgard"); + #endif ++ unreachable("IDVS is unsupported on Midgard"); + } else { +- vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); +- tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB); ++ vertex = pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); ++ tiler = pan_pool_alloc_desc_cs_v10(&batch->pool.base, TILER_JOB); + } ++#endif /* PAN_ARCH */ + + unsigned vertex_count = ctx->vertex_count; + +@@ -3726,7 +4282,7 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + mali_ptr attribs, attrib_bufs; + attribs = panfrost_emit_vertex_data(batch, &attrib_bufs); +-#endif ++#endif /* PAN_ARCH <= 7 */ + + panfrost_update_state_3d(batch); + panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX); +@@ -3752,13 +4308,25 @@ panfrost_direct_draw(struct panfrost_batch *batch, + #if PAN_ARCH >= 9 + assert(idvs && "Memory allocated IDVS required on Valhall"); + +- panfrost_emit_malloc_vertex(batch, info, draw, indices, +- secondary_shader, tiler.cpu); ++ panfrost_emit_malloc_vertex(batch, info, draw, indices, secondary_shader, tiler.cpu); + ++#if PAN_ARCH >= 10 ++ pan_pack_ins(&batch->cs_vertex, IDVS_LAUNCH, _); ++ /* TODO: Find a better way to specify that there were jobs */ ++ batch->scoreboard.first_job = 1; ++ batch->scoreboard.first_tiler = NULL + 1; ++ ++ /* Make sure we didn't use more CS instructions than we allocated ++ * space for */ ++ assert(batch->cs_vertex.ptr <= limit); ++ ++#else /* PAN_ARCH < 10 */ + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0, + 0, &tiler, false); +-#else ++#endif ++#else /* PAN_ARCH < 9 */ ++ + /* Fire off the draw itself */ + panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices, + fs_vary, varyings, pos, psiz, secondary_shader, +@@ -3773,7 +4341,7 @@ panfrost_direct_draw(struct panfrost_batch *batch, + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_INDEXED_VERTEX, false, false, + 0, 0, &tiler, false); +-#endif ++#endif /* PAN_ARCH < 6 */ + } else { + panfrost_draw_emit_vertex(batch, info, &invocation, + vs_vary, varyings, attribs, attrib_bufs, vertex.cpu); +@@ -4102,8 +4670,8 @@ panfrost_launch_grid(struct pipe_context *pipe, + + ctx->compute_grid = info; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); ++ UNUSED struct panfrost_ptr t = ++ pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); + + /* Invoke according to the grid info */ + +@@ -4143,7 +4711,7 @@ panfrost_launch_grid(struct pipe_context *pipe, + #else + struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE]; + +- pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { ++ pan_section_pack_cs_v10(t.cpu, &batch->cs_vertex, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = info->block[0]; + cfg.workgroup_size_y = info->block[1]; + cfg.workgroup_size_z = info->block[2]; +@@ -4166,12 +4734,14 @@ panfrost_launch_grid(struct pipe_context *pipe, + cs->info.cs.allow_merging_workgroups && + 
(info->variable_shared_mem == 0); + ++#if PAN_ARCH < 10 + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; ++#endif + } + #endif + +- unsigned indirect_dep = 0; ++ UNUSED unsigned indirect_dep = 0; // TODO v10 (unused) + #if PAN_GPU_INDIRECTS + if (info->indirect) { + struct pan_indirect_dispatch_info indirect = { +@@ -4191,9 +4761,17 @@ panfrost_launch_grid(struct pipe_context *pipe, + } + #endif + ++#if PAN_ARCH >= 10 ++ pan_pack_ins(&batch->cs_vertex, COMPUTE_LAUNCH, cfg) { ++ /* TODO: Change this as needed */ ++ cfg.unk_1 = 512; ++ } ++ batch->scoreboard.first_job = 1; ++#else + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_COMPUTE, true, false, + indirect_dep, 0, &t, false); ++#endif + panfrost_flush_all_batches(ctx, "Launch grid post-barrier"); + } + +@@ -4453,6 +5031,30 @@ panfrost_create_sampler_view( + return (struct pipe_sampler_view *) so; + } + ++static void ++panfrost_init_logicop_blend_state(struct panfrost_blend_state *so) ++{ ++ for (unsigned c = 0; c < so->pan.rt_count; ++c) { ++ unsigned g = so->base.independent_blend_enable ? c : 0; ++ const struct pipe_rt_blend_state pipe = so->base.rt[g]; ++ ++ struct pan_blend_equation equation = {0}; ++ ++ equation.color_mask = pipe.colormask; ++ equation.blend_enable = false; ++ ++ so->info[c] = (struct pan_blend_info) { ++ .enabled = (pipe.colormask != 0), ++ .load_dest = true, ++ .fixed_function = false, ++ }; ++ ++ so->pan.rts[c].equation = equation; ++ ++ so->load_dest_mask |= BITFIELD_BIT(c); ++ } ++} ++ + /* A given Gallium blend state can be encoded to the hardware in numerous, + * dramatically divergent ways due to the interactions of blending with + * framebuffer formats. Conceptually, there are two modes: +@@ -4492,6 +5094,11 @@ panfrost_create_blend_state(struct pipe_context *pipe, + so->pan.logicop_func = blend->logicop_func; + so->pan.rt_count = blend->max_rt + 1; + ++ if (blend->logicop_enable) { ++ panfrost_init_logicop_blend_state(so); ++ return so; ++ } ++ + for (unsigned c = 0; c < so->pan.rt_count; ++c) { + unsigned g = blend->independent_blend_enable ? c : 0; + const struct pipe_rt_blend_state pipe = blend->rt[g]; +@@ -4521,12 +5128,10 @@ panfrost_create_blend_state(struct pipe_context *pipe, + .opaque = pan_blend_is_opaque(equation), + .constant_mask = constant_mask, + +- /* TODO: check the dest for the logicop */ +- .load_dest = blend->logicop_enable || +- pan_blend_reads_dest(equation), ++ .load_dest = pan_blend_reads_dest(equation), + + /* Could this possibly be fixed-function? */ +- .fixed_function = !blend->logicop_enable && ++ .fixed_function = + pan_blend_can_fixed_function(equation, + supports_2src) && + (!constant_mask || +@@ -4612,10 +5217,12 @@ prepare_shader(struct panfrost_compiled_shader *state, + + state->state = panfrost_pool_take_ref(pool, ptr.gpu); + ++ // TODO: Why set primary_shader to false again? 
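/* Side note (illustrative only): the preload.r48_r63 field packed below
 * appears to be just the top 16 bits of the 64-bit register-preload mask,
 * one bit per register r48..r63. Standalone sketch, not project code;
 * requires <stdint.h>. */
static inline uint16_t
preload_r48_r63(uint64_t preload_mask)
{
   return (uint16_t)(preload_mask >> 48); /* bits 48..63 -> r48..r63 */
}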
++ + /* Generic, or IDVS/points */ + pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) { + cfg.stage = pan_shader_stage(&state->info); +- cfg.primary_shader = true; ++ cfg.primary_shader = false; + cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); + cfg.binary = state->bin.gpu; + cfg.preload.r48_r63 = (state->info.preload >> 48); +@@ -4631,7 +5238,7 @@ prepare_shader(struct panfrost_compiled_shader *state, + /* IDVS/triangles */ + pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) { + cfg.stage = pan_shader_stage(&state->info); +- cfg.primary_shader = true; ++ cfg.primary_shader = false; + cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); + cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset; + cfg.preload.r48_r63 = (state->info.preload >> 48); +@@ -4707,6 +5314,11 @@ init_batch(struct panfrost_batch *batch) + /* On Midgard, the TLS is embedded in the FB descriptor */ + batch->tls = batch->framebuffer; + #endif ++ ++#if PAN_ARCH >= 10 ++ batch->cs_vertex = panfrost_batch_create_cs(batch, 1 << 13); ++ batch->cs_fragment = panfrost_batch_create_cs(batch, 1 << 9); ++#endif + } + + static void +@@ -4821,6 +5433,10 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen) + screen->vtbl.init_polygon_list = init_polygon_list; + screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options); + screen->vtbl.compile_shader = GENX(pan_shader_compile); ++#if PAN_ARCH >= 10 ++ screen->vtbl.emit_csf_toplevel = emit_csf_toplevel; ++ screen->vtbl.init_cs = init_cs; ++#endif + + GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base, + &screen->blitter.desc_pool.base); diff --git a/src/gallium/drivers/panfrost/pan_context.c.rej b/src/gallium/drivers/panfrost/pan_context.c.rej new file mode 100644 index 00000000000..8005be06871 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_context.c.rej @@ -0,0 +1,178 @@ +diff a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c (rejected hunks) +@@ -34,7 +34,6 @@ + + #include "util/macros.h" + #include "util/format/u_format.h" +-#include "util/libsync.h" + #include "util/u_inlines.h" + #include "util/u_upload_mgr.h" + #include "util/u_memory.h" +@@ -571,6 +570,19 @@ panfrost_destroy(struct pipe_context *pipe) + struct panfrost_context *panfrost = pan_context(pipe); + struct panfrost_device *dev = pan_device(pipe->screen); + ++ if (dev->kbase && dev->mali.context_create) { ++ dev->mali.cs_term(&dev->mali, &panfrost->kbase_cs_vertex.base); ++ dev->mali.cs_term(&dev->mali, &panfrost->kbase_cs_fragment.base); ++ ++ dev->mali.context_destroy(&dev->mali, panfrost->kbase_ctx); ++ ++ panfrost_bo_unreference(panfrost->kbase_cs_vertex.bo); ++ panfrost_bo_unreference(panfrost->kbase_cs_fragment.bo); ++ } ++ ++ if (panfrost->tiler_heap_desc) ++ panfrost_bo_unreference(panfrost->tiler_heap_desc); ++ + _mesa_hash_table_destroy(panfrost->writers, NULL); + + if (panfrost->blitter) +@@ -582,11 +594,15 @@ panfrost_destroy(struct pipe_context *pipe) + panfrost_pool_cleanup(&panfrost->descs); + panfrost_pool_cleanup(&panfrost->shaders); + +- drmSyncobjDestroy(dev->fd, panfrost->in_sync_obj); +- if (panfrost->in_sync_fd != -1) +- close(panfrost->in_sync_fd); ++ if (dev->kbase) { ++ dev->mali.syncobj_destroy(&dev->mali, panfrost->syncobj_kbase); ++ } else { ++ drmSyncobjDestroy(dev->fd, panfrost->in_sync_obj); ++ if (panfrost->in_sync_fd != -1) ++ close(panfrost->in_sync_fd); + +- drmSyncobjDestroy(dev->fd, panfrost->syncobj); ++ 
drmSyncobjDestroy(dev->fd, panfrost->syncobj); ++ } + ralloc_free(pipe); + } + +@@ -873,6 +889,58 @@ panfrost_create_fence_fd(struct pipe_context *pctx, + *pfence = panfrost_fence_from_fd(pan_context(pctx), fd, type); + } + ++struct sync_merge_data { ++ char name[32]; ++ int32_t fd2; ++ int32_t fence; ++ uint32_t flags; ++ uint32_t pad; ++}; ++ ++#define SYNC_IOC_MAGIC '>' ++#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data) ++ ++static inline int sync_merge(const char *name, int fd1, int fd2) ++{ ++ struct sync_merge_data data = {{0}}; ++ int ret; ++ ++ data.fd2 = fd2; ++ strncpy(data.name, name, sizeof(data.name)); ++ ++ do { ++ ret = ioctl(fd1, SYNC_IOC_MERGE, &data); ++ } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); ++ ++ if (ret < 0) ++ return ret; ++ ++ return data.fence; ++} ++ ++static inline int sync_accumulate(const char *name, int *fd1, int fd2) ++{ ++ int ret; ++ ++ assert(fd2 >= 0); ++ ++ if (*fd1 < 0) { ++ *fd1 = dup(fd2); ++ return 0; ++ } ++ ++ ret = sync_merge(name, *fd1, fd2); ++ if (ret < 0) { ++ /* leave *fd1 as it is */ ++ return ret; ++ } ++ ++ close(*fd1); ++ *fd1 = ret; ++ ++ return 0; ++} ++ + static void + panfrost_fence_server_sync(struct pipe_context *pctx, + struct pipe_fence_handle *f) +@@ -888,6 +956,28 @@ panfrost_fence_server_sync(struct pipe_context *pctx, + close(fd); + } + ++static struct panfrost_cs ++panfrost_cs_create(struct panfrost_context *ctx, unsigned size, unsigned mask) ++{ ++ struct panfrost_screen *screen = pan_screen(ctx->base.screen); ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ struct kbase_context *kctx = ctx->kbase_ctx; ++ ++ struct panfrost_cs c = {0}; ++ ++ c.bo = panfrost_bo_create(dev, size, 0, "Command stream"); ++ ++ c.base = dev->mali.cs_bind(&dev->mali, kctx, c.bo->ptr.gpu, size); ++ ++ c.event_ptr = dev->mali.event_mem.gpu + c.base.event_mem_offset * PAN_EVENT_SIZE; ++ c.kcpu_event_ptr = dev->mali.kcpu_event_mem.gpu + c.base.event_mem_offset * PAN_EVENT_SIZE; ++ ++ c.hw_resources = mask; ++ screen->vtbl.init_cs(ctx, &c); ++ ++ return c; ++} ++ + struct pipe_context * + panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + { +@@ -981,6 +1071,14 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + + assert(ctx->blitter); + ++ if (dev->kbase && dev->mali.context_create) ++ ctx->kbase_ctx = dev->mali.context_create(&dev->mali); ++ ++ if (dev->arch >= 10) { ++ ctx->kbase_cs_vertex = panfrost_cs_create(ctx, 65536, 13); ++ ctx->kbase_cs_fragment = panfrost_cs_create(ctx, 65536, 2); ++ } ++ + /* Prepare for render! */ + + /* By default mask everything on */ +@@ -992,13 +1090,18 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + /* Create a syncobj in a signaled state. Will be updated to point to the + * last queued job out_sync every time we submit a new job. + */ +- ret = drmSyncobjCreate(dev->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &ctx->syncobj); +- assert(!ret && ctx->syncobj); +- +- /* Sync object/FD used for NATIVE_FENCE_FD. */ +- ctx->in_sync_fd = -1; +- ret = drmSyncobjCreate(dev->fd, 0, &ctx->in_sync_obj); +- assert(!ret); ++ if (dev->kbase) { ++ ctx->syncobj_kbase = dev->mali.syncobj_create(&dev->mali); ++ ctx->in_sync_fd = -1; ++ } else { ++ ret = drmSyncobjCreate(dev->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &ctx->syncobj); ++ assert(!ret && ctx->syncobj); ++ ++ /* Sync object/FD used for NATIVE_FENCE_FD. 
*/ ++ ctx->in_sync_fd = -1; ++ ret = drmSyncobjCreate(dev->fd, 0, &ctx->in_sync_obj); ++ assert(!ret); ++ } + + return gallium; + } diff --git a/src/gallium/drivers/panfrost/pan_context.h.rej b/src/gallium/drivers/panfrost/pan_context.h.rej new file mode 100644 index 00000000000..1f8d89a2948 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_context.h.rej @@ -0,0 +1,42 @@ +diff a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h (rejected hunks) +@@ -117,6 +117,19 @@ struct panfrost_streamout { + unsigned num_targets; + }; + ++// TODO: This struct is a mess ++struct panfrost_cs { ++ struct kbase_cs base; ++ struct panfrost_bo *bo; ++ pan_command_stream cs; ++ mali_ptr event_ptr; ++ uint64_t seqnum; ++ mali_ptr kcpu_event_ptr; ++ uint64_t kcpu_seqnum; ++ uint64_t offset; ++ unsigned hw_resources; ++}; ++ + struct panfrost_context { + /* Gallium context */ + struct pipe_context base; +@@ -132,6 +145,7 @@ struct panfrost_context { + + /* Sync obj used to keep track of in-flight jobs. */ + uint32_t syncobj; ++ struct kbase_syncobj *syncobj_kbase; + + /* Set of 32 batches. When the set is full, the LRU entry (the batch + * with the smallest seqnum) is flushed to free a slot. +@@ -229,6 +243,12 @@ struct panfrost_context { + + int in_sync_fd; + uint32_t in_sync_obj; ++ ++ struct kbase_context *kbase_ctx; ++ struct panfrost_bo *event_bo; ++ struct panfrost_cs kbase_cs_vertex; ++ struct panfrost_cs kbase_cs_fragment; ++ struct panfrost_bo *tiler_heap_desc; + }; + + /* Corresponds to the CSO */ diff --git a/src/gallium/drivers/panfrost/pan_disk_cache.c b/src/gallium/drivers/panfrost/pan_disk_cache.c index 5d8e4bc499d..c25269303cf 100644 --- a/src/gallium/drivers/panfrost/pan_disk_cache.c +++ b/src/gallium/drivers/panfrost/pan_disk_cache.c @@ -34,7 +34,9 @@ #include "pan_context.h" +#ifdef ENABLE_SHADER_CACHE static bool debug = false; +#endif extern int midgard_debug; extern int bifrost_debug; diff --git a/src/gallium/drivers/panfrost/pan_disk_cache.c.rej b/src/gallium/drivers/panfrost/pan_disk_cache.c.rej new file mode 100644 index 00000000000..cd344c1d62d --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_disk_cache.c.rej @@ -0,0 +1,23 @@ +diff a/src/gallium/drivers/panfrost/pan_disk_cache.c b/src/gallium/drivers/panfrost/pan_disk_cache.c (rejected hunks) +@@ -141,6 +143,8 @@ panfrost_disk_cache_retrieve(struct disk_cache *cache, + blob_copy_bytes(&blob, ptr, binary_size); + blob_copy_bytes(&blob, &binary->info, sizeof(binary->info)); + ++ free(buffer); ++ + return true; + #else + return false; +@@ -156,11 +160,7 @@ panfrost_disk_cache_init(struct panfrost_screen *screen) + #ifdef ENABLE_SHADER_CACHE + const char *renderer = screen->base.get_name(&screen->base); + +- const struct build_id_note *note = +- build_id_find_nhdr_for_addr(panfrost_disk_cache_init); +- assert(note && build_id_length(note) == 20); /* sha1 */ +- +- const uint8_t *id_sha1 = build_id_data(note); ++ const uint8_t *id_sha1 = "1"; + assert(id_sha1); + + char timestamp[41]; diff --git a/src/gallium/drivers/panfrost/pan_fence.c.rej b/src/gallium/drivers/panfrost/pan_fence.c.rej new file mode 100644 index 00000000000..280c30c29a3 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_fence.c.rej @@ -0,0 +1,66 @@ +diff a/src/gallium/drivers/panfrost/pan_fence.c b/src/gallium/drivers/panfrost/pan_fence.c (rejected hunks) +@@ -42,7 +42,10 @@ panfrost_fence_reference(struct pipe_screen *pscreen, + struct pipe_fence_handle *old = *ptr; + + if (pipe_reference(&old->reference, &fence->reference)) { 
+- drmSyncobjDestroy(dev->fd, old->syncobj); ++ if (dev->kbase) ++ dev->mali.syncobj_destroy(&dev->mali, old->kbase); ++ else ++ drmSyncobjDestroy(dev->fd, old->syncobj); + free(old); + } + +@@ -65,6 +68,13 @@ panfrost_fence_finish(struct pipe_screen *pscreen, + if (abs_timeout == OS_TIMEOUT_INFINITE) + abs_timeout = INT64_MAX; + ++ if (dev->kbase) { ++ /* TODO: Use the timeout */ ++ bool ret = dev->mali.syncobj_wait(&dev->mali, fence->kbase); ++ fence->signaled = ret; ++ return ret; ++ } ++ + ret = drmSyncobjWait(dev->fd, &fence->syncobj, + 1, + abs_timeout, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, +@@ -81,6 +91,10 @@ panfrost_fence_get_fd(struct pipe_screen *screen, + struct panfrost_device *dev = pan_device(screen); + int fd = -1; + ++ /* TODO: Export a sync file using KCPU */ ++ if (dev->kbase) ++ return fd; ++ + drmSyncobjExportSyncFile(dev->fd, f->syncobj, &fd); + return fd; + } +@@ -92,6 +106,10 @@ panfrost_fence_from_fd(struct panfrost_context *ctx, int fd, + struct panfrost_device *dev = pan_device(ctx->base.screen); + int ret; + ++ /* TODO: Implement this for kbase */ ++ if (dev->kbase) ++ return NULL; ++ + struct pipe_fence_handle *f = calloc(1, sizeof(*f)); + if (!f) + return NULL; +@@ -134,6 +152,16 @@ panfrost_fence_create(struct panfrost_context *ctx) + struct panfrost_device *dev = pan_device(ctx->base.screen); + int fd = -1, ret; + ++ if (dev->kbase) { ++ struct pipe_fence_handle *f = calloc(1, sizeof(*f)); ++ if (!f) ++ return NULL; ++ ++ f->kbase = dev->mali.syncobj_dup(&dev->mali, ctx->syncobj_kbase); ++ pipe_reference_init(&f->reference, 1); ++ return f; ++ } ++ + /* Snapshot the last rendering out fence. We'd rather have another + * syncobj instead of a sync file, but this is all we get. + * (HandleToFD/FDToHandle just gives you another syncobj ID for the diff --git a/src/gallium/drivers/panfrost/pan_fence.h.rej b/src/gallium/drivers/panfrost/pan_fence.h.rej new file mode 100644 index 00000000000..49caf91fe93 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_fence.h.rej @@ -0,0 +1,9 @@ +diff a/src/gallium/drivers/panfrost/pan_fence.h b/src/gallium/drivers/panfrost/pan_fence.h (rejected hunks) +@@ -32,6 +32,7 @@ struct panfrost_context; + struct pipe_fence_handle { + struct pipe_reference reference; + uint32_t syncobj; ++ struct kbase_syncobj *kbase; + bool signaled; + }; + diff --git a/src/gallium/drivers/panfrost/pan_job.c.rej b/src/gallium/drivers/panfrost/pan_job.c.rej new file mode 100644 index 00000000000..a9a26176e91 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_job.c.rej @@ -0,0 +1,596 @@ +diff a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c (rejected hunks) +@@ -25,6 +25,7 @@ + */ + + #include ++#include + + #include "drm-uapi/panfrost_drm.h" + +@@ -81,6 +82,14 @@ panfrost_batch_init(struct panfrost_context *ctx, + batch->resources =_mesa_set_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) ++ util_dynarray_init(&batch->resource_bos[i], NULL); ++ ++ util_dynarray_init(&batch->vert_deps, NULL); ++ util_dynarray_init(&batch->frag_deps, NULL); ++ ++ util_dynarray_init(&batch->dmabufs, NULL); ++ + /* Preallocate the main pool, since every batch has at least one job + * structure so it will be used */ + panfrost_pool_init(&batch->pool, NULL, dev, 0, 65536, "Batch pool", true, true); +@@ -96,6 +105,9 @@ panfrost_batch_init(struct panfrost_context *ctx, + + panfrost_batch_add_surface(batch, batch->key.zsbuf); + ++ if ((dev->debug & PAN_DBG_SYNC) || !(dev->debug & 
PAN_DBG_GOFASTER)) ++ batch->needs_sync = true; ++ + screen->vtbl.init_batch(batch); + } + +@@ -115,15 +127,30 @@ static void + panfrost_batch_add_resource(struct panfrost_batch *batch, + struct panfrost_resource *rsrc) + { ++ struct panfrost_context *ctx = batch->ctx; ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ + bool found = false; + _mesa_set_search_or_add(batch->resources, rsrc, &found); + +- if (!found) { +- /* Cache number of batches accessing a resource */ +- rsrc->track.nr_users++; ++ /* Nothing to do if we already have the resource */ ++ if (found) ++ return; ++ ++ /* Cache number of batches accessing a resource */ ++ rsrc->track.nr_users++; ++ ++ /* Reference the resource on the batch */ ++ pipe_reference(NULL, &rsrc->base.reference); + +- /* Reference the resource on the batch */ +- pipe_reference(NULL, &rsrc->base.reference); ++ if (rsrc->scanout) { ++ if (dev->has_dmabuf_fence) { ++ int fd = rsrc->image.data.bo->dmabuf_fd; ++ util_dynarray_append(&batch->dmabufs, int, fd); ++ } else { ++ perf_debug_ctx(ctx, "Forcing sync on batch"); ++ batch->needs_sync = true; ++ } + } + } + +@@ -172,6 +199,10 @@ panfrost_batch_cleanup(struct panfrost_context *ctx, struct panfrost_batch *batc + { + struct panfrost_device *dev = pan_device(ctx->base.screen); + ++ /* Make sure we keep handling events, to free old BOs */ ++ if (dev->kbase) ++ kbase_ensure_handle_events(&dev->mali); ++ + assert(batch->seqnum); + + if (ctx->batch == batch) +@@ -186,10 +217,18 @@ panfrost_batch_cleanup(struct panfrost_context *ctx, struct panfrost_batch *batc + if (!flags[i]) + continue; + +- struct panfrost_bo *bo = pan_lookup_bo(dev, i); ++ struct panfrost_bo *bo = pan_lookup_bo_existing(dev, i); + panfrost_bo_unreference(bo); + } + ++ util_dynarray_fini(&batch->dmabufs); ++ ++ util_dynarray_fini(&batch->vert_deps); ++ util_dynarray_fini(&batch->frag_deps); ++ ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) ++ util_dynarray_fini(&batch->resource_bos[i]); ++ + panfrost_batch_destroy_resources(ctx, batch); + panfrost_pool_cleanup(&batch->pool); + panfrost_pool_cleanup(&batch->invisible_pool); +@@ -313,7 +352,7 @@ panfrost_batch_update_access(struct panfrost_batch *batch, + } + } + +- if (writes) { ++ if (writes && (writer != batch)) { + _mesa_hash_table_insert(ctx->writers, rsrc, batch); + rsrc->track.nr_writers++; + } +@@ -380,6 +419,12 @@ panfrost_batch_read_rsrc(struct panfrost_batch *batch, + uint32_t access = PAN_BO_ACCESS_READ | + panfrost_access_for_stage(stage); + ++ enum panfrost_usage_type type = (stage == MESA_SHADER_FRAGMENT) ? ++ PAN_USAGE_READ_FRAGMENT : PAN_USAGE_READ_VERTEX; ++ ++ util_dynarray_append(&batch->resource_bos[type], struct panfrost_bo *, ++ rsrc->image.data.bo); ++ + panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); + + if (rsrc->separate_stencil) +@@ -396,6 +441,12 @@ panfrost_batch_write_rsrc(struct panfrost_batch *batch, + uint32_t access = PAN_BO_ACCESS_WRITE | + panfrost_access_for_stage(stage); + ++ enum panfrost_usage_type type = (stage == MESA_SHADER_FRAGMENT) ? 
++ PAN_USAGE_WRITE_FRAGMENT : PAN_USAGE_WRITE_VERTEX; ++ ++ util_dynarray_append(&batch->resource_bos[type], struct panfrost_bo *, ++ rsrc->image.data.bo); ++ + panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); + + if (rsrc->separate_stencil) +@@ -489,7 +540,7 @@ panfrost_batch_get_shared_memory(struct panfrost_batch *batch, + } + + static void +-panfrost_batch_to_fb_info(const struct panfrost_batch *batch, ++panfrost_batch_to_fb_info(struct panfrost_batch *batch, + struct pan_fb_info *fb, + struct pan_image_view *rts, + struct pan_image_view *zs, +@@ -511,6 +562,7 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + fb->rt_count = batch->key.nr_cbufs; + fb->sprite_coord_origin = pan_tristate_get(batch->sprite_coord_origin); + fb->first_provoking_vertex = pan_tristate_get(batch->first_provoking_vertex); ++ fb->cs_fragment = &batch->cs_fragment; + + static const unsigned char id_swz[] = { + PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W, +@@ -604,22 +656,22 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + fb->zs.discard.z = !reserve && !(batch->resolve & PIPE_CLEAR_DEPTH); + fb->zs.discard.s = !reserve && !(batch->resolve & PIPE_CLEAR_STENCIL); + +- if (!fb->zs.clear.z && ++ if (!fb->zs.clear.z && z_rsrc && + ((batch->read & PIPE_CLEAR_DEPTH) || + ((batch->draws & PIPE_CLEAR_DEPTH) && +- z_rsrc && BITSET_TEST(z_rsrc->valid.data, z_view->first_level)))) ++ BITSET_TEST(z_rsrc->valid.data, z_view->first_level)))) + fb->zs.preload.z = true; + +- if (!fb->zs.clear.s && ++ if (!fb->zs.clear.s && s_rsrc && + ((batch->read & PIPE_CLEAR_STENCIL) || + ((batch->draws & PIPE_CLEAR_STENCIL) && +- s_rsrc && BITSET_TEST(s_rsrc->valid.data, s_view->first_level)))) ++ BITSET_TEST(s_rsrc->valid.data, s_view->first_level)))) + fb->zs.preload.s = true; + + /* Preserve both component if we have a combined ZS view and + * one component needs to be preserved. + */ +- if (s_view == z_view && fb->zs.discard.z != fb->zs.discard.s) { ++ if (z_view && s_view == z_view && fb->zs.discard.z != fb->zs.discard.s) { + bool valid = BITSET_TEST(z_rsrc->valid.data, z_view->first_level); + + fb->zs.discard.z = false; +@@ -629,6 +681,28 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + } + } + ++static int ++panfrost_batch_submit_kbase(struct panfrost_device *dev, ++ struct drm_panfrost_submit *submit, ++ struct kbase_syncobj *syncobj) ++{ ++ dev->mali.handle_events(&dev->mali); ++ ++ int atom = dev->mali.submit(&dev->mali, ++ submit->jc, ++ submit->requirements, ++ syncobj, ++ (int32_t *)(uintptr_t) submit->bo_handles, ++ submit->bo_handle_count); ++ ++ if (atom == -1) { ++ errno = EINVAL; ++ return -1; ++ } ++ ++ return 0; ++} ++ + static int + panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + mali_ptr first_job_desc, +@@ -695,7 +769,7 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + * We also preserve existing flags as this batch might not + * be the first one to access the BO. 
+ */ +- struct panfrost_bo *bo = pan_lookup_bo(dev, i); ++ struct panfrost_bo *bo = pan_lookup_bo_existing(dev, i); + + bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW); + } +@@ -718,6 +792,8 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + submit.bo_handles = (u64) (uintptr_t) bo_handles; + if (ctx->is_noop) + ret = 0; ++ else if (dev->kbase) ++ ret = panfrost_batch_submit_kbase(dev, &submit, ctx->syncobj_kbase); + else + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit); + free(bo_handles); +@@ -728,8 +804,11 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + /* Trace the job if we're doing that */ + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + /* Wait so we can get errors reported back */ +- drmSyncobjWait(dev->fd, &out_sync, 1, +- INT64_MAX, 0, NULL); ++ if (dev->kbase) ++ dev->mali.syncobj_wait(&dev->mali, ctx->syncobj_kbase); ++ else ++ drmSyncobjWait(dev->fd, &out_sync, 1, ++ INT64_MAX, 0, NULL); + + if (dev->debug & PAN_DBG_TRACE) + pandecode_jc(submit.jc, dev->gpu_id); +@@ -799,6 +878,323 @@ panfrost_batch_submit_jobs(struct panfrost_batch *batch, + return ret; + } + ++#define BASE_MEM_MMU_DUMP_HANDLE (1 << 12) ++ ++static void ++mmu_dump(struct panfrost_device *dev) ++{ ++ unsigned size = 16 * 1024 * 1024; ++ ++ fprintf(stderr, "dumping MMU tables\n"); ++ sleep(3); ++ ++ void *mem = mmap(NULL, size, PROT_READ, MAP_SHARED, ++ dev->mali.fd, BASE_MEM_MMU_DUMP_HANDLE); ++ if (mem == MAP_FAILED) { ++ perror("mmap(BASE_MEM_MMU_DUMP_HANDLE)"); ++ return;; ++ } ++ ++ fprintf(stderr, "writing to file\n"); ++ sleep(1); ++ ++ char template[] = {"/tmp/mmu-dump.XXXXXX"}; ++ int fd = mkstemp(template); ++ if (fd == -1) { ++ perror("mkstemp(/tmp/mmu-dump.XXXXXX)"); ++ goto unmap; ++ } ++ ++ write(fd, mem, size); ++ close(fd); ++ ++unmap: ++ munmap(mem, size); ++} ++ ++static void ++reset_context(struct panfrost_context *ctx) ++{ ++ struct pipe_screen *pscreen = ctx->base.screen; ++ struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); ++ ++ /* Don't recover from the fault if PAN_MESA_DEBUG=sync is specified, ++ * to somewhat mimic behaviour with JM GPUs. TODO: Just abort? 
*/ ++ bool recover = !(dev->debug & PAN_DBG_SYNC); ++ ++ mesa_loge("Context reset"); ++ ++ dev->mali.cs_term(&dev->mali, &ctx->kbase_cs_vertex.base); ++ dev->mali.cs_term(&dev->mali, &ctx->kbase_cs_fragment.base); ++ ++ dev->mali.context_recreate(&dev->mali, ctx->kbase_ctx); ++ ++ //mmu_dump(dev); ++ ++ if (recover) { ++ dev->mali.cs_rebind(&dev->mali, &ctx->kbase_cs_vertex.base); ++ dev->mali.cs_rebind(&dev->mali, &ctx->kbase_cs_fragment.base); ++ } else { ++ ctx->kbase_cs_vertex.base.user_io = NULL; ++ ctx->kbase_cs_fragment.base.user_io = NULL; ++ } ++ ++ ctx->kbase_cs_vertex.base.last_insert = 0; ++ ctx->kbase_cs_fragment.base.last_insert = 0; ++ ++ screen->vtbl.init_cs(ctx, &ctx->kbase_cs_vertex); ++ screen->vtbl.init_cs(ctx, &ctx->kbase_cs_fragment); ++ ++ /* TODO: this leaks memory */ ++ ctx->tiler_heap_desc = 0; ++} ++ ++static void ++pandecode_cs_ring(struct panfrost_device *dev, struct panfrost_cs *cs, ++ uint64_t insert) ++{ ++ insert %= cs->base.size; ++ uint64_t start = cs->base.last_insert % cs->base.size; ++ ++ if (insert < start) { ++ pandecode_cs(cs->base.va + start, cs->base.size - start, dev->gpu_id); ++ start = 0; ++ } ++ ++ pandecode_cs(cs->base.va + start, insert - start, dev->gpu_id); ++} ++ ++static unsigned ++panfrost_add_dep_after(struct util_dynarray *deps, ++ struct panfrost_usage u, ++ unsigned index) ++{ ++ unsigned size = util_dynarray_num_elements(deps, struct panfrost_usage); ++ ++ for (unsigned i = index; i < size; ++i) { ++ struct panfrost_usage *d = ++ util_dynarray_element(deps, struct panfrost_usage, i); ++ ++ /* TODO: Remove d if it is an invalid entry? */ ++ ++ if ((d->queue == u.queue) && (d->write == u.write)) { ++ d->seqnum = MAX2(d->seqnum, u.seqnum); ++ return i; ++ ++ } else if (d->queue > u.queue) { ++ void *p = util_dynarray_grow(deps, struct panfrost_usage, 1); ++ assert(p); ++ memmove(util_dynarray_element(deps, struct panfrost_usage, i + 1), ++ util_dynarray_element(deps, struct panfrost_usage, i), ++ (size - i) * sizeof(struct panfrost_usage)); ++ ++ *util_dynarray_element(deps, struct panfrost_usage, i) = u; ++ return i; ++ } ++ } ++ ++ util_dynarray_append(deps, struct panfrost_usage, u); ++ return size; ++} ++ ++static void ++panfrost_update_deps(struct util_dynarray *deps, struct panfrost_bo *bo, bool write) ++{ ++ /* Both lists should be sorted, so each dependency is at a higher ++ * index than the last */ ++ unsigned index = 0; ++ util_dynarray_foreach(&bo->usage, struct panfrost_usage, u) { ++ /* read->read access does not require a dependency */ ++ if (!write && !u->write) ++ continue; ++ ++ index = panfrost_add_dep_after(deps, *u, index); ++ } ++} ++ ++static inline bool ++panfrost_usage_writes(enum panfrost_usage_type usage) ++{ ++ return (usage == PAN_USAGE_WRITE_VERTEX) || (usage == PAN_USAGE_WRITE_FRAGMENT); ++} ++ ++static inline bool ++panfrost_usage_fragment(enum panfrost_usage_type usage) ++{ ++ return (usage == PAN_USAGE_READ_FRAGMENT) || (usage == PAN_USAGE_WRITE_FRAGMENT); ++} ++ ++/* Removes invalid dependencies from deps */ ++static void ++panfrost_clean_deps(struct panfrost_device *dev, struct util_dynarray *deps) ++{ ++ kbase k = &dev->mali; ++ ++ struct panfrost_usage *rebuild = util_dynarray_begin(deps); ++ unsigned index = 0; ++ ++ util_dynarray_foreach(deps, struct panfrost_usage, u) { ++ /* Usages are ordered, so we can break here */ ++ if (u->queue >= k->event_slot_usage) ++ break; ++ ++ struct kbase_event_slot *slot = &k->event_slots[u->queue]; ++ uint64_t seqnum = u->seqnum; ++ ++ /* There is a race 
condition, where we can depend on an ++ * unsubmitted batch. In that cade, decrease the seqnum. ++ * Otherwise, skip invalid dependencies. */ ++ if (slot->last_submit == seqnum) ++ --seqnum; ++ else if (slot->last_submit < seqnum) ++ continue; ++ ++ /* This usage is valid, add it to the returned list */ ++ rebuild[index++] = (struct panfrost_usage) { ++ .queue = u->queue, ++ .write = u->write, ++ .seqnum = seqnum, ++ }; ++ } ++ ++ /* No need to check the return value, it can only shrink */ ++ (void)! util_dynarray_resize(deps, struct panfrost_usage, index); ++} ++ ++static int ++panfrost_batch_submit_csf(struct panfrost_batch *batch, ++ const struct pan_fb_info *fb) ++{ ++ struct panfrost_context *ctx = batch->ctx; ++ struct pipe_screen *pscreen = ctx->base.screen; ++ struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); ++ ++ ++ctx->kbase_cs_vertex.seqnum; ++ ++ if (panfrost_has_fragment_job(batch)) { ++ screen->vtbl.emit_fragment_job(batch, fb); ++ ++ctx->kbase_cs_fragment.seqnum; ++ } ++ ++ pthread_mutex_lock(&dev->bo_usage_lock); ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) { ++ ++ bool write = panfrost_usage_writes(i); ++ pan_bo_access access = write ? PAN_BO_ACCESS_RW : PAN_BO_ACCESS_READ; ++ struct util_dynarray *deps; ++ unsigned queue; ++ uint64_t seqnum; ++ ++ if (panfrost_usage_fragment(i)) { ++ deps = &batch->frag_deps; ++ queue = ctx->kbase_cs_fragment.base.event_mem_offset; ++ seqnum = ctx->kbase_cs_fragment.seqnum; ++ } else { ++ deps = &batch->vert_deps; ++ queue = ctx->kbase_cs_vertex.base.event_mem_offset; ++ seqnum = ctx->kbase_cs_vertex.seqnum; ++ } ++ ++ util_dynarray_foreach(&batch->resource_bos[i], struct panfrost_bo *, bo) { ++ panfrost_update_deps(deps, *bo, write); ++ struct panfrost_usage u = { ++ .queue = queue, ++ .write = write, ++ .seqnum = seqnum, ++ }; ++ ++ panfrost_add_dep_after(&(*bo)->usage, u, 0); ++ (*bo)->gpu_access |= access; ++ } ++ } ++ pthread_mutex_unlock(&dev->bo_usage_lock); ++ ++ /* For now, only a single batch can use each tiler heap at once */ ++ if (ctx->tiler_heap_desc) { ++ panfrost_update_deps(&batch->vert_deps, ctx->tiler_heap_desc, true); ++ ++ struct panfrost_usage u = { ++ .queue = ctx->kbase_cs_fragment.base.event_mem_offset, ++ .write = true, ++ .seqnum = ctx->kbase_cs_fragment.seqnum, ++ }; ++ panfrost_add_dep_after(&ctx->tiler_heap_desc->usage, u, 0); ++ } ++ ++ /* TODO: Use atomics in kbase code to avoid lock? */ ++ pthread_mutex_lock(&dev->mali.queue_lock); ++ ++ panfrost_clean_deps(dev, &batch->vert_deps); ++ panfrost_clean_deps(dev, &batch->frag_deps); ++ ++ pthread_mutex_unlock(&dev->mali.queue_lock); ++ ++ screen->vtbl.emit_csf_toplevel(batch); ++ ++ uint64_t vs_offset = ctx->kbase_cs_vertex.offset + ++ (void *)ctx->kbase_cs_vertex.cs.ptr - ctx->kbase_cs_vertex.bo->ptr.cpu; ++ uint64_t fs_offset = ctx->kbase_cs_fragment.offset + ++ (void *)ctx->kbase_cs_fragment.cs.ptr - ctx->kbase_cs_fragment.bo->ptr.cpu; ++ ++ if (dev->debug & PAN_DBG_TRACE) { ++ pandecode_cs_ring(dev, &ctx->kbase_cs_vertex, vs_offset); ++ pandecode_cs_ring(dev, &ctx->kbase_cs_fragment, fs_offset); ++ } ++ ++ bool log = (dev->debug & PAN_DBG_LOG); ++ ++ // TODO: We need better synchronisation than a single fake syncobj! 
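/* A reduced, standalone illustration of the dependency bookkeeping done
 * earlier in this function: each BO carries (queue, write, seqnum) usage
 * records, and the batch folds them into at most one entry per
 * queue/direction, keeping only the highest seqnum it must wait for. This
 * sketch drops the sorted-insert optimisation of panfrost_add_dep_after()
 * and assumes the caller sized 'deps' large enough; names are placeholders,
 * requires <stdint.h> and <stdbool.h>. */
struct example_dep { unsigned queue; bool write; uint64_t seqnum; };

static void
example_merge_dep(struct example_dep *deps, unsigned *count,
                  struct example_dep u)
{
   for (unsigned i = 0; i < *count; ++i) {
      if (deps[i].queue == u.queue && deps[i].write == u.write) {
         if (u.seqnum > deps[i].seqnum)
            deps[i].seqnum = u.seqnum; /* only the latest use matters */
         return;
      }
   }

   deps[(*count)++] = u; /* first use seen for this queue/direction */
}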
++ ++ if (log) ++ printf("About to submit\n"); ++ ++ dev->mali.cs_submit(&dev->mali, &ctx->kbase_cs_vertex.base, vs_offset, ++ ctx->syncobj_kbase, ctx->kbase_cs_vertex.seqnum); ++ ++ dev->mali.cs_submit(&dev->mali, &ctx->kbase_cs_fragment.base, fs_offset, ++ ctx->syncobj_kbase, ctx->kbase_cs_fragment.seqnum); ++ ++ bool reset = false; ++ ++ // TODO: How will we know to reset a CS when waiting is not done? ++ if (batch->needs_sync) { ++ if (!dev->mali.cs_wait(&dev->mali, &ctx->kbase_cs_vertex.base, vs_offset, ctx->syncobj_kbase)) ++ reset = true; ++ ++ if (!dev->mali.cs_wait(&dev->mali, &ctx->kbase_cs_fragment.base, fs_offset, ctx->syncobj_kbase)) ++ reset = true; ++ } ++ ++ if (dev->debug & PAN_DBG_TILER) { ++ fflush(stdout); ++ FILE *stream = popen("tiler-hex-read", "w"); ++ ++ /* TODO: Dump more than just the first chunk */ ++ unsigned size = batch->ctx->kbase_ctx->tiler_heap_chunk_size; ++ uint64_t va = batch->ctx->kbase_ctx->tiler_heap_header; ++ ++ fprintf(stream, "width %i\n" "height %i\n" "mask %i\n" ++ "vaheap 0x%"PRIx64"\n" "size %i\n", ++ batch->key.width, batch->key.height, 0xfe, va, size); ++ ++ void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, ++ MAP_SHARED, dev->mali.fd, va); ++ ++ pan_hexdump(stream, ptr, size, false); ++ //memset(ptr, 0, size); ++ munmap(ptr, size); ++ ++ pclose(stream); ++ } ++ ++ if (reset) ++ reset_context(ctx); ++ ++ return 0; ++} ++ + static void + panfrost_emit_tile_map(struct panfrost_batch *batch, struct pan_fb_info *fb) + { +@@ -824,6 +1220,7 @@ panfrost_batch_submit(struct panfrost_context *ctx, + { + struct pipe_screen *pscreen = ctx->base.screen; + struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); + int ret; + + /* Nothing to do! */ +@@ -867,7 +1264,11 @@ panfrost_batch_submit(struct panfrost_context *ctx, + if (batch->scoreboard.first_tiler || batch->clear) + screen->vtbl.emit_fbd(batch, &fb); + +- ret = panfrost_batch_submit_jobs(batch, &fb, 0, ctx->syncobj); ++ /* TODO: Don't hardcode the arch number */ ++ if (dev->arch < 10) ++ ret = panfrost_batch_submit_jobs(batch, &fb, 0, ctx->syncobj); ++ else ++ ret = panfrost_batch_submit_csf(batch, &fb); + + if (ret) + fprintf(stderr, "panfrost_batch_submit failed: %d\n", ret); +@@ -969,6 +1370,8 @@ panfrost_batch_clear(struct panfrost_batch *batch, + for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) { + if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) + continue; ++ if (!ctx->pipe_framebuffer.cbufs[i]) ++ continue; + + enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format; + pan_pack_color(batch->clear_color[i], color, format, false); diff --git a/src/gallium/drivers/panfrost/pan_job.h.rej b/src/gallium/drivers/panfrost/pan_job.h.rej new file mode 100644 index 00000000000..69ea4d72b11 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_job.h.rej @@ -0,0 +1,42 @@ +diff a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h (rejected hunks) +@@ -79,6 +79,14 @@ pan_tristate_get(struct pan_tristate state) + return (state.v == PAN_TRISTATE_TRUE); + } + ++enum panfrost_usage_type { ++ PAN_USAGE_READ_VERTEX, ++ PAN_USAGE_WRITE_VERTEX, ++ PAN_USAGE_READ_FRAGMENT, ++ PAN_USAGE_WRITE_FRAGMENT, ++ PAN_USAGE_COUNT, ++}; ++ + /* A panfrost_batch corresponds to a bound FBO we're rendering to, + * collecting over multiple draws. */ + +@@ -194,6 +202,25 @@ struct panfrost_batch { + + /* Referenced resources, holds a pipe_reference. 
*/ + struct set *resources; ++ ++ struct util_dynarray resource_bos[PAN_USAGE_COUNT]; ++ ++ /* struct panfrost_usage */ ++ struct util_dynarray vert_deps; ++ struct util_dynarray frag_deps; ++ ++ /* Referenced dma-bufs FDs, for emitting synchronisation commands. */ ++ struct util_dynarray dmabufs; ++ ++ /* Command stream pointers for CSF Valhall. Vertex CS tracking is more ++ * complicated as there may be multiple buffers. */ ++ pan_command_stream cs_vertex; ++ uint32_t *cs_vertex_last_size; ++ pan_command_stream cs_vertex_first; ++ ++ pan_command_stream cs_fragment; ++ ++ bool needs_sync; + }; + + /* Functions for managing the above */ diff --git a/src/gallium/drivers/panfrost/pan_resource.c.rej b/src/gallium/drivers/panfrost/pan_resource.c.rej new file mode 100644 index 00000000000..e989ad133b2 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_resource.c.rej @@ -0,0 +1,426 @@ +diff a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c (rejected hunks) +@@ -33,6 +33,7 @@ + #include + #include + #include "drm-uapi/drm_fourcc.h" ++#include "drm-uapi/drm.h" + + #include "frontend/winsys_handle.h" + #include "util/format/u_format.h" +@@ -51,6 +52,46 @@ + #include "pan_tiling.h" + #include "decode.h" + ++/* The kbase kernel driver always maps imported BOs with caching. When we ++ * don't want that, instead do mmap from the display driver side to get a ++ * write-combine mapping. ++ */ ++static void ++panfrost_bo_mmap_scanout(struct panfrost_bo *bo, ++ struct renderonly *ro, ++ struct renderonly_scanout *scanout) ++{ ++ struct panfrost_device *dev = bo->dev; ++ ++ /* If we are fine with a cached mapping, just return */ ++ if (!(dev->debug & PAN_DBG_UNCACHED_CPU)) ++ return; ++ ++ struct drm_mode_map_dumb map_dumb = { ++ .handle = scanout->handle, ++ }; ++ ++ int err = drmIoctl(ro->kms_fd, DRM_IOCTL_MODE_MAP_DUMB, &map_dumb); ++ if (err < 0) { ++ fprintf(stderr, "DRM_IOCTL_MODE_MAP_DUMB failed: %s\n", ++ strerror(errno)); ++ return; ++ } ++ ++ void *addr = mmap(NULL, bo->size, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ ro->kms_fd, map_dumb.offset); ++ if (addr == MAP_FAILED) { ++ fprintf(stderr, "kms_fd mmap failed: %s\n", ++ strerror(errno)); ++ return; ++ } ++ ++ bo->munmap_ptr = bo->ptr.cpu; ++ bo->ptr.cpu = addr; ++ bo->cached = false; ++} ++ + static struct pipe_resource * + panfrost_resource_from_handle(struct pipe_screen *pscreen, + const struct pipe_resource *templat, +@@ -102,15 +143,17 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, + return NULL; + } + +- rsc->image.data.bo = panfrost_bo_import(dev, whandle->handle); ++ struct panfrost_bo *bo = panfrost_bo_import(dev, whandle->handle); + /* Sometimes an import can fail e.g. on an invalid buffer fd, out of + * memory space to mmap it etc. + */ +- if (!rsc->image.data.bo) { ++ if (!bo) { + FREE(rsc); + return NULL; + } + ++ rsc->image.data.bo = bo; ++ + rsc->modifier_constant = true; + + BITSET_SET(rsc->valid.data, 0); +@@ -122,6 +165,9 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, + /* failure is expected in some cases.. 
*/ + } + ++ if (rsc->scanout) ++ panfrost_bo_mmap_scanout(bo, dev->ro, rsc->scanout); ++ + return prsc; + } + +@@ -473,7 +519,9 @@ panfrost_resource_setup(struct panfrost_device *dev, + static void + panfrost_resource_init_afbc_headers(struct panfrost_resource *pres) + { +- panfrost_bo_mmap(pres->image.data.bo); ++ struct panfrost_bo *bo = pres->image.data.bo; ++ ++ panfrost_bo_mmap(bo); + + unsigned nr_samples = MAX2(pres->base.nr_samples, 1); + +@@ -482,16 +530,16 @@ panfrost_resource_init_afbc_headers(struct panfrost_resource *pres) + struct pan_image_slice_layout *slice = &pres->image.layout.slices[l]; + + for (unsigned s = 0; s < nr_samples; ++s) { +- void *ptr = pres->image.data.bo->ptr.cpu + +- (i * pres->image.layout.array_stride) + +- slice->offset + +- (s * slice->afbc.surface_stride); ++ size_t offset = (i * pres->image.layout.array_stride) + ++ slice->offset + ++ (s * slice->afbc.surface_stride); + + /* Zero-ed AFBC headers seem to encode a plain + * black. Let's use this pattern to keep the + * initialization simple. + */ +- memset(ptr, 0, slice->afbc.header_size); ++ memset(bo->ptr.cpu + offset, 0, slice->afbc.header_size); ++ panfrost_bo_mem_clean(bo, offset, slice->afbc.header_size); + } + } + } +@@ -643,7 +691,9 @@ panfrost_resource_create_with_modifier(struct pipe_screen *screen, + (bind & PIPE_BIND_SHADER_IMAGE) ? "Shader image" : + "Other resource"; + +- if (dev->ro && (template->bind & PIPE_BIND_SCANOUT)) { ++ /* Revert to doing a kmsro allocation for any shared BO, because kbase ++ * cannot do export */ ++ if (dev->ro && (template->bind & PAN_BIND_SHARED_MASK)) { + struct winsys_handle handle; + struct pan_block_size blocksize = panfrost_block_size(modifier, template->format); + +@@ -702,12 +752,21 @@ panfrost_resource_create_with_modifier(struct pipe_screen *screen, + free(so); + return NULL; + } ++ ++ panfrost_bo_mmap_scanout(so->image.data.bo, dev->ro, so->scanout); + } else { + /* We create a BO immediately but don't bother mapping, since we don't + * care to map e.g. FBOs which the CPU probably won't touch */ + ++ /* For now, don't cache buffers as syncing can be slow when ++ * too much memory is mapped. TODO: dynamically switch, or use ++ * the STREAM_READ etc. hints? */ ++ bool buffer = (template->target == PIPE_BUFFER); ++ unsigned cache_flag = buffer ? 0 : PAN_BO_CACHEABLE; ++ + so->image.data.bo = +- panfrost_bo_create(dev, so->image.layout.data_size, PAN_BO_DELAY_MMAP, label); ++ panfrost_bo_create(dev, so->image.layout.data_size, ++ PAN_BO_DELAY_MMAP | cache_flag, label); + + so->constant_stencil = true; + } +@@ -741,10 +800,22 @@ panfrost_resource_create_with_modifiers(struct pipe_screen *screen, + const struct pipe_resource *template, + const uint64_t *modifiers, int count) + { ++ struct panfrost_device *dev = pan_device(screen); ++ + for (unsigned i = 0; i < PAN_MODIFIER_COUNT; ++i) { +- if (drm_find_modifier(pan_best_modifiers[i], modifiers, count)) { +- return panfrost_resource_create_with_modifier(screen, template, +- pan_best_modifiers[i]); ++ uint64_t mod = pan_best_modifiers[i]; ++ ++ if (drm_is_afbc(mod) && !dev->has_afbc) ++ continue; ++ ++ if (mod != DRM_FORMAT_MOD_LINEAR && (dev->debug & PAN_DBG_LINEAR)) ++ continue; ++ ++ /* TODO: What if mod is an unsupported AFBC variant for this ++ * format? 
*/ ++ ++ if (drm_find_modifier(mod, modifiers, count)) { ++ return panfrost_resource_create_with_modifier(screen, template, mod); + } + } + +@@ -773,6 +844,71 @@ panfrost_resource_destroy(struct pipe_screen *screen, + free(rsrc); + } + ++static void ++panfrost_clear_render_target(struct pipe_context *pipe, ++ struct pipe_surface *dst, ++ const union pipe_color_union *color, ++ unsigned dstx, unsigned dsty, ++ unsigned width, unsigned height, ++ bool render_condition_enabled) ++{ ++ struct panfrost_context *ctx = pan_context(pipe); ++ ++ /* TODO: dstx, etc. */ ++ ++ struct pipe_framebuffer_state tmp = {0}; ++ util_copy_framebuffer_state(&tmp, &ctx->pipe_framebuffer); ++ ++ struct pipe_framebuffer_state fb = { ++ .width = dst->width, ++ .height = dst->height, ++ .layers = 1, ++ .samples = 1, ++ .nr_cbufs = 1, ++ .cbufs[0] = dst, ++ }; ++ pipe->set_framebuffer_state(pipe, &fb); ++ ++ struct panfrost_batch *batch = panfrost_get_fresh_batch_for_fbo(ctx, "Clear render target"); ++ panfrost_batch_clear(batch, PIPE_CLEAR_COLOR0, color, 0, 0); ++ ++ pipe->set_framebuffer_state(pipe, &tmp); ++ util_unreference_framebuffer_state(&tmp); ++} ++ ++static void ++panfrost_clear_depth_stencil(struct pipe_context *pipe, ++ struct pipe_surface *dst, ++ unsigned clear_flags, ++ double depth, unsigned stencil, ++ unsigned dstx, unsigned dsty, ++ unsigned width, unsigned height, ++ bool render_condition_enabled) ++{ ++ struct panfrost_context *ctx = pan_context(pipe); ++ ++ /* TODO: dstx, etc. */ ++ ++ struct pipe_framebuffer_state tmp = {0}; ++ util_copy_framebuffer_state(&tmp, &ctx->pipe_framebuffer); ++ ++ struct pipe_framebuffer_state fb = { ++ .width = dst->width, ++ .height = dst->height, ++ .layers = 1, ++ .samples = 1, ++ .nr_cbufs = 0, ++ .zsbuf = dst, ++ }; ++ pipe->set_framebuffer_state(pipe, &fb); ++ ++ struct panfrost_batch *batch = panfrost_get_fresh_batch_for_fbo(ctx, "Clear depth/stencil"); ++ panfrost_batch_clear(batch, clear_flags, NULL, depth, stencil); ++ ++ pipe->set_framebuffer_state(pipe, &tmp); ++ util_unreference_framebuffer_state(&tmp); ++} ++ + /* Most of the time we can do CPU-side transfers, but sometimes we need to use + * the 3D pipe for this. Let's wrap u_blitter to blit to/from staging textures. + * Code adapted from freedreno */ +@@ -968,6 +1104,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + struct panfrost_resource *staging = pan_alloc_staging(ctx, rsrc, level, box); + assert(staging); + ++ panfrost_bo_mmap(staging->image.data.bo); ++ + /* Staging resources have one LOD: level 0. Query the strides + * on this LOD. 
+ */ +@@ -990,9 +1128,11 @@ panfrost_ptr_map(struct pipe_context *pctx, + pan_blit_to_staging(pctx, transfer); + panfrost_flush_writer(ctx, staging, "AFBC read staging blit"); + panfrost_bo_wait(staging->image.data.bo, INT64_MAX, false); ++ ++ panfrost_bo_mem_invalidate(staging->image.data.bo, 0, ++ staging->image.data.bo->size); + } + +- panfrost_bo_mmap(staging->image.data.bo); + return staging->image.data.bo->ptr.cpu; + } + +@@ -1029,7 +1169,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + !(usage & PIPE_MAP_UNSYNCHRONIZED) && + !(resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && + (usage & PIPE_MAP_WRITE) && +- rsrc->track.nr_users > 0) { ++ rsrc->track.nr_users > 0 && ++ bo->size < 16 * 1024 * 1024) { + + /* When a resource to be modified is already being used by a + * pending batch, it is often faster to copy the whole BO than +@@ -1051,6 +1192,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + copy_resource = false; + } + ++ bool cache_inval = true; ++ + if (create_new_bo) { + /* Make sure we re-emit any descriptors using this resource */ + panfrost_dirty_state_all(ctx); +@@ -1075,12 +1218,14 @@ panfrost_ptr_map(struct pipe_context *pctx, + flags, bo->label); + + if (newbo) { +- if (copy_resource) +- memcpy(newbo->ptr.cpu, rsrc->image.data.bo->ptr.cpu, bo->size); ++ if (copy_resource) { ++ panfrost_bo_mem_invalidate(bo, 0, bo->size); ++ memcpy(newbo->ptr.cpu, bo->ptr.cpu, bo->size); ++ } + + panfrost_resource_swap_bo(ctx, rsrc, newbo); + +- if (!copy_resource && ++ if (!copy_resource && + drm_is_afbc(rsrc->image.layout.modifier)) + panfrost_resource_init_afbc_headers(rsrc); + +@@ -1102,6 +1247,22 @@ panfrost_ptr_map(struct pipe_context *pctx, + panfrost_flush_writer(ctx, rsrc, "Synchronized read"); + panfrost_bo_wait(bo, INT64_MAX, false); + } ++ } else { ++ /* No flush for writes to uninitialized */ ++ cache_inval = false; ++ } ++ ++ /* TODO: Only the accessed region for textures */ ++ if (cache_inval) { ++ size_t offset = 0; ++ size_t size = bo->size; ++ ++ if (resource->target == PIPE_BUFFER) { ++ offset = box->x * (size_t) bytes_per_block; ++ size = box->width * (size_t) bytes_per_block; ++ } ++ ++ panfrost_bo_mem_invalidate(bo, offset, size); + } + + /* For access to compressed textures, we want the (x, y, w, h) +@@ -1128,6 +1289,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + * caching... 
I don't know if this is actually possible but we + * should still get it right */ + ++ // TODO: Fix this for cached BOs ++ + unsigned dpw = PIPE_MAP_DIRECTLY | PIPE_MAP_WRITE | PIPE_MAP_PERSISTENT; + + if ((usage & dpw) == dpw && rsrc->index_cache) +@@ -1281,8 +1444,15 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + * reloads that can cascade into DATA_INVALID_FAULTs due to reading + * malformed AFBC data if uninitialized */ + +- if (trans->staging.rsrc) { ++ bool afbc = trans->staging.rsrc; ++ ++ if (afbc) { + if (transfer->usage & PIPE_MAP_WRITE) { ++ struct panfrost_resource *trans_rsrc = pan_resource(trans->staging.rsrc); ++ struct panfrost_bo *trans_bo = trans_rsrc->image.data.bo; ++ ++ panfrost_bo_mem_clean(trans_bo, 0, trans_bo->size); ++ + if (panfrost_should_linear_convert(dev, prsrc, transfer)) { + + panfrost_bo_unreference(prsrc->image.data.bo); +@@ -1290,7 +1460,7 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, + prsrc->image.layout.format); + +- prsrc->image.data.bo = pan_resource(trans->staging.rsrc)->image.data.bo; ++ prsrc->image.data.bo = trans_bo; + panfrost_bo_reference(prsrc->image.data.bo); + } else { + pan_blit_from_staging(pctx, trans); +@@ -1315,10 +1485,13 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, + prsrc->image.layout.format); + if (prsrc->image.layout.data_size > bo->size) { ++ /* We want the BO to be MMAPed. */ ++ uint32_t flags = bo->flags & ~PAN_BO_DELAY_MMAP; + const char *label = bo->label; ++ + panfrost_bo_unreference(bo); + bo = prsrc->image.data.bo = +- panfrost_bo_create(dev, prsrc->image.layout.data_size, 0, label); ++ panfrost_bo_create(dev, prsrc->image.layout.data_size, flags, label); + assert(bo); + } + +@@ -1339,6 +1512,25 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + } + } + ++ /* TODO: Only the accessed region */ ++ /* It is important to not do this for AFBC resources, or else the ++ * clean might overwrite the result of the blit. */ ++ if (!afbc && (transfer->usage & PIPE_MAP_WRITE)) { ++ size_t offset = 0; ++ size_t size = prsrc->image.data.bo->size; ++ ++ /* TODO: Don't recalculate */ ++ if (prsrc->base.target == PIPE_BUFFER) { ++ enum pipe_format format = prsrc->image.layout.format; ++ int bytes_per_block = util_format_get_blocksize(format); ++ ++ offset = transfer->box.x * (size_t) bytes_per_block; ++ size = transfer->box.width * (size_t) bytes_per_block; ++ } ++ ++ panfrost_bo_mem_clean(prsrc->image.data.bo, ++ offset, size); ++ } + + util_range_add(&prsrc->base, &prsrc->valid_buffer_range, + transfer->box.x, +@@ -1353,6 +1545,7 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + ralloc_free(transfer); + } + ++// TODO: does this need to be changed for cached resources? 
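For reference, the cache-maintenance calls threaded through the transfer paths above follow one rule: panfrost_bo_mem_invalidate() before the CPU reads data the GPU may have written, panfrost_bo_mem_clean() after the CPU writes data the GPU will read. A condensed sketch of that pairing, using the (bo, offset, size) signatures seen in the hunks; the wrapper itself is illustrative only:

/* Illustrative wrapper for CPU access to a cacheable BO region. */
static void
cpu_access_region(struct panfrost_bo *bo, size_t offset, size_t size,
                  bool gpu_wrote, bool cpu_writes)
{
   if (gpu_wrote)
      panfrost_bo_mem_invalidate(bo, offset, size); /* pull GPU writes into CPU view */

   /* ... CPU reads and/or writes bo->ptr.cpu + offset ... */

   if (cpu_writes)
      panfrost_bo_mem_clean(bo, offset, size);      /* push CPU writes out for the GPU */
}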
+ static void + panfrost_ptr_flush_region(struct pipe_context *pctx, + struct pipe_transfer *transfer, +@@ -1486,6 +1679,8 @@ panfrost_resource_context_init(struct pipe_context *pctx) + pctx->texture_unmap = u_transfer_helper_transfer_unmap; + pctx->create_surface = panfrost_create_surface; + pctx->surface_destroy = panfrost_surface_destroy; ++ pctx->clear_render_target = panfrost_clear_render_target; ++ pctx->clear_depth_stencil = panfrost_clear_depth_stencil; + pctx->resource_copy_region = util_resource_copy_region; + pctx->blit = panfrost_blit; + pctx->generate_mipmap = panfrost_generate_mipmap; diff --git a/src/gallium/drivers/panfrost/pan_screen.c.rej b/src/gallium/drivers/panfrost/pan_screen.c.rej new file mode 100644 index 00000000000..6d6ff33b6bd --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_screen.c.rej @@ -0,0 +1,87 @@ +diff a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c (rejected hunks) +@@ -56,7 +56,7 @@ + + static const struct debug_named_value panfrost_debug_options[] = { + {"perf", PAN_DBG_PERF, "Enable performance warnings"}, +- {"trace", PAN_DBG_TRACE, "Trace the command stream"}, ++ {"trace", PAN_DBG_TRACE | PAN_DBG_BO_CLEAR, "Trace the command stream"}, + {"deqp", PAN_DBG_DEQP, "Hacks for dEQP"}, + {"dirty", PAN_DBG_DIRTY, "Always re-emit all state"}, + {"sync", PAN_DBG_SYNC, "Wait for each job's completion and abort on GPU faults"}, +@@ -72,6 +72,13 @@ static const struct debug_named_value panfrost_debug_options[] = { + #ifdef PAN_DBG_OVERFLOW + {"overflow", PAN_DBG_OVERFLOW, "Check for buffer overflows in pool uploads"}, + #endif ++ {"tiler", PAN_DBG_TILER, "Decode the tiler heap"}, ++ {"bolog", PAN_DBG_BO_LOG, "Log BO allocations/deallocations"}, ++ {"boclear", PAN_DBG_BO_CLEAR, "Clear BOs on allocation"}, ++ {"nogpuc", PAN_DBG_UNCACHED_GPU, "Use uncached GPU memory for textures"}, ++ {"nocpuc", PAN_DBG_UNCACHED_CPU, "Use uncached CPU mappings for textures"}, ++ {"log", PAN_DBG_LOG, "Log job submission etc."}, ++ {"gofaster", PAN_DBG_GOFASTER, "Experimental performance improvements"}, + DEBUG_NAMED_VALUE_END + }; + +@@ -122,6 +129,7 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + case PIPE_CAP_SHADER_PACK_HALF_FLOAT: ++ case PIPE_CAP_CLIP_HALFZ: + return 1; + + case PIPE_CAP_MAX_RENDER_TARGETS: +@@ -300,7 +308,7 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) + * still supported as it is core GLES3.0 functionality + */ + case PIPE_CAP_PRIMITIVE_RESTART: +- return dev->arch <= 7; ++ return is_gl3 || dev->arch <= 7; + + case PIPE_CAP_FLATSHADE: + case PIPE_CAP_TWO_SIDED_COLOR: +@@ -606,6 +614,7 @@ panfrost_walk_dmabuf_modifiers(struct pipe_screen *screen, + bool afbc = dev->has_afbc && panfrost_format_supports_afbc(dev, format); + bool ytr = panfrost_afbc_can_ytr(format); + bool tiled_afbc = panfrost_afbc_can_tile(dev); ++ bool native = panfrost_afbc_only_native(dev->arch, format); + + unsigned count = 0; + +@@ -619,6 +628,9 @@ panfrost_walk_dmabuf_modifiers(struct pipe_screen *screen, + if ((pan_best_modifiers[i] & AFBC_FORMAT_MOD_TILED) && !tiled_afbc) + continue; + ++ if (drm_is_afbc(pan_best_modifiers[i]) && !(pan_best_modifiers[i] & AFBC_FORMAT_MOD_NATIVE_SWIZZLE) && native) ++ continue; ++ + if (test_modifier != DRM_FORMAT_MOD_INVALID && + test_modifier != pan_best_modifiers[i]) + continue; +@@ -822,13 +834,17 @@ panfrost_create_screen(int fd, struct renderonly *ro) + + 
/* Bail early on unsupported hardware */ + if (dev->model == NULL) { +- debug_printf("panfrost: Unsupported model %X", dev->gpu_id); ++ debug_printf("panfrost: Unsupported model %X\n", dev->gpu_id); + panfrost_destroy_screen(&(screen->base)); + return NULL; + } + + dev->ro = ro; + ++ /* The functionality is only useful with kbase */ ++ if (dev->kbase) ++ dev->has_dmabuf_fence = panfrost_check_dmabuf_fence(dev); ++ + screen->base.destroy = panfrost_destroy_screen; + + screen->base.get_name = panfrost_get_name; +@@ -874,6 +890,8 @@ panfrost_create_screen(int fd, struct renderonly *ro) + panfrost_cmdstream_screen_init_v7(screen); + else if (dev->arch == 9) + panfrost_cmdstream_screen_init_v9(screen); ++ else if (dev->arch == 10) ++ panfrost_cmdstream_screen_init_v10(screen); + else + unreachable("Unhandled architecture major"); + diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h index 3400c0a6cbf..3ef30cf17cf 100644 --- a/src/gallium/drivers/panfrost/pan_screen.h +++ b/src/gallium/drivers/panfrost/pan_screen.h @@ -51,6 +51,7 @@ static const struct pipe_driver_query_info panfrost_driver_query_list[] = { struct panfrost_batch; struct panfrost_context; +struct panfrost_cs; struct panfrost_resource; struct panfrost_compiled_shader; struct pan_fb_info; diff --git a/src/gallium/drivers/panfrost/pan_screen.h.rej b/src/gallium/drivers/panfrost/pan_screen.h.rej new file mode 100644 index 00000000000..0d7d3ea9803 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_screen.h.rej @@ -0,0 +1,28 @@ +diff a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h (rejected hunks) +@@ -57,6 +58,7 @@ struct pan_blend_state; + + /* Virtual table of per-generation (GenXML) functions */ + ++ + struct panfrost_vtable { + /* Prepares the renderer state descriptor or shader program descriptor + * for a given compiled shader, and if desired uploads it as well */ +@@ -100,6 +102,10 @@ struct panfrost_vtable { + struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); ++ ++ void (*emit_csf_toplevel)(struct panfrost_batch *); ++ ++ void (*init_cs)(struct panfrost_context *ctx, struct panfrost_cs *cs); + }; + + struct panfrost_screen { +@@ -138,6 +144,7 @@ void panfrost_cmdstream_screen_init_v5(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v6(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v7(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen); ++void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen); + + #define perf_debug(dev, ...) 
\ + do { \ diff --git a/src/gallium/frontends/nine/nine_ff.c b/src/gallium/frontends/nine/nine_ff.c index a5182fbd0a8..bae01856b57 100644 --- a/src/gallium/frontends/nine/nine_ff.c +++ b/src/gallium/frontends/nine/nine_ff.c @@ -1442,7 +1442,7 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key) struct ureg_src texture_coord = ps.vT[s]; struct ureg_dst delta; switch (key->ts[s].textarget) { - case 0: target = TGSI_TEXTURE_1D; break; + case 0: target = TGSI_TEXTURE_2D; break; case 1: target = TGSI_TEXTURE_2D; break; case 2: target = TGSI_TEXTURE_3D; break; case 3: target = TGSI_TEXTURE_CUBE; break; diff --git a/src/gallium/frontends/nine/nine_shader.c b/src/gallium/frontends/nine/nine_shader.c index eff7a0f5de8..432d201786f 100644 --- a/src/gallium/frontends/nine/nine_shader.c +++ b/src/gallium/frontends/nine/nine_shader.c @@ -2198,7 +2198,7 @@ static inline unsigned d3dstt_to_tgsi_tex(BYTE sampler_type) { switch (sampler_type) { - case NINED3DSTT_1D: return TGSI_TEXTURE_1D; + case NINED3DSTT_1D: return TGSI_TEXTURE_2D; case NINED3DSTT_2D: return TGSI_TEXTURE_2D; case NINED3DSTT_VOLUME: return TGSI_TEXTURE_3D; case NINED3DSTT_CUBE: return TGSI_TEXTURE_CUBE; @@ -2211,7 +2211,7 @@ static inline unsigned d3dstt_to_tgsi_tex_shadow(BYTE sampler_type) { switch (sampler_type) { - case NINED3DSTT_1D: return TGSI_TEXTURE_SHADOW1D; + case NINED3DSTT_1D: return TGSI_TEXTURE_SHADOW2D; case NINED3DSTT_2D: return TGSI_TEXTURE_SHADOW2D; case NINED3DSTT_VOLUME: case NINED3DSTT_CUBE: diff --git a/src/gallium/frontends/nine/nine_shader.c.rej b/src/gallium/frontends/nine/nine_shader.c.rej new file mode 100644 index 00000000000..b6441552e6a --- /dev/null +++ b/src/gallium/frontends/nine/nine_shader.c.rej @@ -0,0 +1,10 @@ +diff a/src/gallium/frontends/nine/nine_shader.c b/src/gallium/frontends/nine/nine_shader.c (rejected hunks) +@@ -2186,7 +2186,7 @@ ps1x_sampler_type(const struct nine_shader_info *info, unsigned stage) + { + boolean shadow = !!(info->sampler_mask_shadow & (1 << stage)); + switch ((info->sampler_ps1xtypes >> (stage * 2)) & 0x3) { +- case 1: return shadow ? TGSI_TEXTURE_SHADOW1D : TGSI_TEXTURE_1D; ++ case 1: return shadow ? TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D; + case 0: return shadow ? 
TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D; + case 3: return TGSI_TEXTURE_3D; + default: diff --git a/src/gallium/frontends/nine/nine_state.c b/src/gallium/frontends/nine/nine_state.c index fb7caba7900..3e4de025e41 100644 --- a/src/gallium/frontends/nine/nine_state.c +++ b/src/gallium/frontends/nine/nine_state.c @@ -1121,8 +1121,10 @@ update_textures_and_samplers(struct NineDevice9 *device) false, view); context->enabled_sampler_count_vs = num_textures; - if (commit_samplers) + if (commit_samplers) { + cso_set_max_sampler(context->cso, num_textures - 1); cso_single_sampler_done(context->cso, PIPE_SHADER_VERTEX); + } } /* State commit only */ diff --git a/src/gallium/frontends/nine/nine_state.c.rej b/src/gallium/frontends/nine/nine_state.c.rej new file mode 100644 index 00000000000..cae533928d5 --- /dev/null +++ b/src/gallium/frontends/nine/nine_state.c.rej @@ -0,0 +1,13 @@ +diff a/src/gallium/frontends/nine/nine_state.c b/src/gallium/frontends/nine/nine_state.c (rejected hunks) +@@ -1039,8 +1039,10 @@ update_textures_and_samplers(struct NineDevice9 *device) + false, view); + context->enabled_sampler_count_ps = num_textures; + +- if (commit_samplers) ++ if (commit_samplers) { ++ cso_set_max_sampler(context->cso, num_textures - 1); + cso_single_sampler_done(context->cso, PIPE_SHADER_FRAGMENT); ++ } + + commit_samplers = FALSE; + sampler_mask = context->programmable_vs ? context->vs->sampler_mask : 0; diff --git a/src/gallium/targets/d3dadapter9/meson.build.rej b/src/gallium/targets/d3dadapter9/meson.build.rej new file mode 100644 index 00000000000..89bfd12debe --- /dev/null +++ b/src/gallium/targets/d3dadapter9/meson.build.rej @@ -0,0 +1,11 @@ +diff a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build (rejected hunks) +@@ -64,7 +64,8 @@ libgallium_nine = shared_library( + dep_selinux, dep_libdrm, dep_llvm, dep_thread, + idep_xmlconfig, idep_mesautil, idep_nir, + driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau, +- driver_i915, driver_svga, driver_iris, driver_crocus, driver_zink, driver_freedreno ++ driver_i915, driver_svga, driver_iris, driver_crocus, driver_zink, driver_freedreno, ++ driver_panfrost, driver_kmsro, + ], + name_prefix : '', + version : '.'.join(nine_version), diff --git a/src/gallium/targets/osmesa/meson.build.rej b/src/gallium/targets/osmesa/meson.build.rej new file mode 100644 index 00000000000..05104104856 --- /dev/null +++ b/src/gallium/targets/osmesa/meson.build.rej @@ -0,0 +1,14 @@ +diff a/src/gallium/targets/osmesa/meson.build b/src/gallium/targets/osmesa/meson.build (rejected hunks) +@@ -55,10 +55,10 @@ libosmesa = shared_library( + libmesa, libgallium, libws_null, osmesa_link_with, + ], + dependencies : [ +- dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind, driver_swrast ++ dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind, driver_swrast, driver_panfrost, dep_libdrm + ], + name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libosmesa.dll +- soversion : host_machine.system() == 'windows' ? 
'' : '8', ++ soversion : '', + version : '8.0.0', + darwin_versions : '9.0.0', + install : true, diff --git a/src/gallium/targets/rusticl/meson.build.rej b/src/gallium/targets/rusticl/meson.build.rej new file mode 100644 index 00000000000..32064a34bd4 --- /dev/null +++ b/src/gallium/targets/rusticl/meson.build.rej @@ -0,0 +1,9 @@ +diff a/src/gallium/targets/rusticl/meson.build b/src/gallium/targets/rusticl/meson.build (rejected hunks) +@@ -43,6 +43,7 @@ librusticl = shared_library( + ], + dependencies : [ + driver_iris, ++ driver_kmsro, + driver_nouveau, + driver_panfrost, + driver_swrast, diff --git a/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c.rej b/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c.rej new file mode 100644 index 00000000000..5a81dda1e0d --- /dev/null +++ b/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c.rej @@ -0,0 +1,19 @@ +diff a/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c b/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c (rejected hunks) +@@ -101,9 +101,15 @@ struct pipe_screen *kmsro_drm_screen_create(int fd, + #endif + + #if defined(GALLIUM_PANFROST) +- ro->gpu_fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); ++ bool noop = getenv("KBASE_NOOP"); + +- if (ro->gpu_fd >= 0) { ++ if (!noop) { ++ ro->gpu_fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); ++ if (ro->gpu_fd < 0) ++ ro->gpu_fd = open("/dev/mali0", O_RDWR | O_CLOEXEC | O_NONBLOCK); ++ } ++ ++ if ((ro->gpu_fd >= 0) || noop) { + ro->create_for_resource = renderonly_create_kms_dumb_buffer_for_resource; + screen = panfrost_drm_screen_create_renderonly(ro); + if (!screen) diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index 1b188a4c800..4585cbb6ef4 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -118,7 +118,6 @@ static char* load_shader_replacement(struct _shader_replacement *repl) return NULL; } #endif -#endif /** * Return mask of GLSL_x flags by examining the MESA_GLSL env var. @@ -1933,8 +1932,6 @@ _mesa_LinkProgram(GLuint programObj) link_program_error(ctx, shProg); } -#ifdef ENABLE_SHADER_CACHE - /** * Construct a full path for shader replacement functionality using * following format: @@ -2067,8 +2064,6 @@ _mesa_read_shader_source(const gl_shader_stage stage, const char *source, return buffer; } -#endif /* ENABLE_SHADER_CACHE */ - /** * Called via glShaderSource() and glShaderSourceARB() API functions. 
* Basically, concatenate the source code strings into one long string @@ -2150,7 +2145,6 @@ shader_source(struct gl_context *ctx, GLuint shaderObj, GLsizei count, uint8_t original_sha1[SHA1_DIGEST_LENGTH]; _mesa_sha1_compute(source, strlen(source), original_sha1); -#ifdef ENABLE_SHADER_CACHE GLcharARB *replacement; /* Dump original shader source to MESA_SHADER_DUMP_PATH and replace @@ -2163,7 +2157,6 @@ shader_source(struct gl_context *ctx, GLuint shaderObj, GLsizei count, free(source); source = replacement; } -#endif /* ENABLE_SHADER_CACHE */ set_shader_source(sh, source, original_sha1); diff --git a/src/mesa/main/shaderapi.c.rej b/src/mesa/main/shaderapi.c.rej new file mode 100644 index 00000000000..52e1b756b51 --- /dev/null +++ b/src/mesa/main/shaderapi.c.rej @@ -0,0 +1,9 @@ +diff a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c (rejected hunks) +@@ -70,7 +70,6 @@ + #include "state_tracker/st_context.h" + #include "state_tracker/st_program.h" + +-#ifdef ENABLE_SHADER_CACHE + #if CUSTOM_SHADER_REPLACEMENT + #include "shader_replacement.h" + /* shader_replacement.h must declare a variable like this: diff --git a/src/meson.build b/src/meson.build index 1293538b8f6..1f04b7860cc 100644 --- a/src/meson.build +++ b/src/meson.build @@ -75,6 +75,7 @@ if with_imgui endif if with_platform_wayland subdir('egl/wayland/wayland-drm') + subdir('egl/wayland/mali-buffer-sharing') endif if with_any_vk or with_gallium_zink subdir('vulkan') diff --git a/src/panfrost/base/include/csf/mali_base_csf_kernel.h b/src/panfrost/base/include/csf/mali_base_csf_kernel.h new file mode 100644 index 00000000000..3b02350c08b --- /dev/null +++ b/src/panfrost/base/include/csf/mali_base_csf_kernel.h @@ -0,0 +1,596 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_CSF_KERNEL_H_ +#define _UAPI_BASE_CSF_KERNEL_H_ + +#include +#include "../mali_base_common_kernel.h" + +/* Memory allocation, access/hint flags & mask specific to CSF GPU. + * + * See base_mem_alloc_flags. + */ + +/* Must be FIXED memory. */ +#define BASE_MEM_FIXED ((base_mem_alloc_flags)1 << 8) + +/* CSF event memory + * + * If Outer shareable coherence is not specified or not available, then on + * allocation kbase will automatically use the uncached GPU mapping. + * There is no need for the client to specify BASE_MEM_UNCACHED_GPU + * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. + * + * This memory requires a permanent mapping + * + * See also kbase_reg_needs_kernel_mapping() + */ +#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) + +#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) + + +/* Must be FIXABLE memory: its GPU VA will be determined at a later point, + * at which time it will be at a fixed GPU VA. 
+ */ +#define BASE_MEM_FIXABLE ((base_mem_alloc_flags)1 << 29) + +/* Note that the number of bits used for base_mem_alloc_flags + * must be less than BASE_MEM_FLAGS_NR_BITS !!! + */ + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED BASE_MEM_RESERVED_BIT_20 + +/* Special base mem handles specific to CSF. + */ +#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) + +#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ + ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ + LOCAL_PAGE_SHIFT) + +/* Valid set of just-in-time memory allocation flags */ +#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) + +/* flags for base context specific to CSF */ + +/* Base context creates a CSF event notification thread. + * + * The creation of a CSF event notification thread is conditional but + * mandatory for the handling of CSF events. + */ +#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | \ + BASE_CONTEXT_CSF_EVENT_THREAD | \ + BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* Flags for base tracepoint specific to CSF */ + +/* Enable KBase tracepoints for CSF builds */ +#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) + +/* Enable additional CSF Firmware side tracepoints */ +#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) + +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ + BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ + BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) + +/* Number of pages mapped into the process address space for a bound GPU + * command queue. A pair of input/output pages and a Hw doorbell page + * are mapped to enable direct submission of commands to Hw. + */ +#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) + +#define BASE_QUEUE_MAX_PRIORITY (15U) + +/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ +#define BASEP_EVENT_VAL_INDEX (0U) +#define BASEP_EVENT_ERR_INDEX (1U) + +/* The upper limit for number of objects that could be waited/set per command. + * This limit is now enforced as internally the error inherit inputs are + * converted to 32-bit flags in a __u32 variable occupying a previously padding + * field. + */ +#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) + +/* CSF CSI EXCEPTION_HANDLER_FLAGS */ +#define BASE_CSF_TILER_OOM_EXCEPTION_FLAG (1u << 0) +#define BASE_CSF_EXCEPTION_HANDLER_FLAGS_MASK (BASE_CSF_TILER_OOM_EXCEPTION_FLAG) + +/** + * enum base_kcpu_command_type - Kernel CPU queue command type. 
+ * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, + * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, + * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, + * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, + * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, + * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, + * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, + */ +enum base_kcpu_command_type { + BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_SET, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, + BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, + BASE_KCPU_COMMAND_TYPE_JIT_FREE, + BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER +}; + +/** + * enum base_queue_group_priority - Priority of a GPU Command Queue Group. + * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group + * priority levels. + * + * Currently this is in order of highest to lowest, but if new levels are added + * then those new levels may be out of order to preserve the ABI compatibility + * with previous releases. At that point, ensure assignment to + * the 'priority' member in &kbase_queue_group is updated to ensure it remains + * a linear ordering. + * + * There should be no gaps in the enum, otherwise use of + * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. 
+ */ +enum base_queue_group_priority { + BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, + BASE_QUEUE_GROUP_PRIORITY_MEDIUM, + BASE_QUEUE_GROUP_PRIORITY_LOW, + BASE_QUEUE_GROUP_PRIORITY_REALTIME, + BASE_QUEUE_GROUP_PRIORITY_COUNT +}; + +struct base_kcpu_command_fence_info { + __u64 fence; +}; + +struct base_cqs_wait_info { + __u64 addr; + __u32 val; + __u32 padding; +}; + +struct base_kcpu_command_cqs_wait_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +struct base_cqs_set { + __u64 addr; +}; + +struct base_kcpu_command_cqs_set_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * typedef basep_cqs_data_type - Enumeration of CQS Data Types + * + * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value + * is an unsigned 32-bit integer + * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value + * is an unsigned 64-bit integer + */ +typedef enum PACKED { + BASEP_CQS_DATA_TYPE_U32 = 0, + BASEP_CQS_DATA_TYPE_U64 = 1, +} basep_cqs_data_type; + +/** + * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait + * Operation conditions + * + * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Less than or Equal to + * the Wait Operation value + * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Greater than the Wait Operation value + */ +typedef enum { + BASEP_CQS_WAIT_OPERATION_LE = 0, + BASEP_CQS_WAIT_OPERATION_GT = 1, +} basep_cqs_wait_operation_op; + +struct base_cqs_wait_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information + * about the Timeline CQS wait objects + * + * @objs: An array of Timeline CQS waits. + * @nr_objs: Number of Timeline CQS waits in the array. + * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field + * to be served as the source for importing into the + * queue's error-state. + */ +struct base_kcpu_command_cqs_wait_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +/** + * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations + * + * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value + * to a synchronization object + * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value + * of a synchronization object + */ +typedef enum { + BASEP_CQS_SET_OPERATION_ADD = 0, + BASEP_CQS_SET_OPERATION_SET = 1, +} basep_cqs_set_operation_op; + +struct base_cqs_set_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_set_operation_info - structure which contains information + * about the Timeline CQS set objects + * + * @objs: An array of Timeline CQS sets. + * @nr_objs: Number of Timeline CQS sets in the array. + * @padding: Structure padding, unused bytes. + */ +struct base_kcpu_command_cqs_set_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * struct base_kcpu_command_import_info - structure which contains information + * about the imported buffer. + * + * @handle: Address of imported user buffer. + */ +struct base_kcpu_command_import_info { + __u64 handle; +}; + +/** + * struct base_kcpu_command_jit_alloc_info - structure which contains + * information about jit memory allocation. 
+ * + * @info: An array of elements of the + * struct base_jit_alloc_info type. + * @count: The number of elements in the info array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_alloc_info { + __u64 info; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_jit_free_info - structure which contains + * information about jit memory which is to be freed. + * + * @ids: An array containing the JIT IDs to free. + * @count: The number of elements in the ids array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_free_info { + __u64 ids; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_group_suspend_info - structure which contains + * suspend buffer data captured for a suspended queue group. + * + * @buffer: Pointer to an array of elements of the type char. + * @size: Number of elements in the @buffer array. + * @group_handle: Handle to the mapping of CSG. + * @padding: padding to a multiple of 64 bits. + */ +struct base_kcpu_command_group_suspend_info { + __u64 buffer; + __u32 size; + __u8 group_handle; + __u8 padding[3]; +}; + + +/** + * struct base_kcpu_command - kcpu command. + * @type: type of the kcpu command, one enum base_kcpu_command_type + * @padding: padding to a multiple of 64 bits + * @info: structure which contains information about the kcpu command; + * actual type is determined by @p type + * @info.fence: Fence + * @info.cqs_wait: CQS wait + * @info.cqs_set: CQS set + * @info.cqs_wait_operation: CQS wait operation + * @info.cqs_set_operation: CQS set operation + * @info.import: import + * @info.jit_alloc: JIT allocation + * @info.jit_free: JIT deallocation + * @info.suspend_buf_copy: suspend buffer copy + * @info.sample_time: sample time + * @info.padding: padding + */ +struct base_kcpu_command { + __u8 type; + __u8 padding[sizeof(__u64) - sizeof(__u8)]; + union { + struct base_kcpu_command_fence_info fence; + struct base_kcpu_command_cqs_wait_info cqs_wait; + struct base_kcpu_command_cqs_set_info cqs_set; + struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; + struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; + struct base_kcpu_command_import_info import; + struct base_kcpu_command_jit_alloc_info jit_alloc; + struct base_kcpu_command_jit_free_info jit_free; + struct base_kcpu_command_group_suspend_info suspend_buf_copy; + __u64 padding[2]; /* No sub-struct should be larger */ + } info; +}; + +/** + * struct basep_cs_stream_control - CSI capabilities. + * + * @features: Features of this stream + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_stream_control { + __u32 features; + __u32 padding; +}; + +/** + * struct basep_cs_group_control - CSG interface capabilities. + * + * @features: Features of this group + * @stream_num: Number of streams in this group + * @suspend_size: Size in bytes of the suspend buffer for this group + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_group_control { + __u32 features; + __u32 stream_num; + __u32 suspend_size; + __u32 padding; +}; + +/** + * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault + * error information associated with GPU command queue group. + * + * @sideband: Additional information of the unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). 
+ * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_group_error_fatal_payload { + __u64 sideband; + __u32 status; + __u32 padding; +}; + +/** + * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault + * error information related to GPU command queue. + * + * @sideband: Additional information about this unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). + * @csi_index: Index of the CSF interface the queue is bound to. + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_error_fatal_payload { + __u64 sideband; + __u32 status; + __u8 csi_index; + __u8 padding[3]; +}; + +/** + * enum base_gpu_queue_group_error_type - GPU Fatal error type. + * + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU + * command queue group. + * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU + * command queue. + * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with + * progress timeout. + * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out + * of tiler heap memory. + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types + * + * This type is used for &struct_base_gpu_queue_group_error.error_type. + */ +enum base_gpu_queue_group_error_type { + BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, + BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, + BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT +}; + +/** + * struct base_gpu_queue_group_error - Unrecoverable fault information + * @error_type: Error type of @base_gpu_queue_group_error_type + * indicating which field in union payload is filled + * @padding: Unused bytes for 64bit boundary + * @payload: Input Payload + * @payload.fatal_group: Unrecoverable fault error associated with + * GPU command queue group + * @payload.fatal_queue: Unrecoverable fault error associated with command queue + */ +struct base_gpu_queue_group_error { + __u8 error_type; + __u8 padding[7]; + union { + struct base_gpu_queue_group_error_fatal_payload fatal_group; + struct base_gpu_queue_error_fatal_payload fatal_queue; + } payload; +}; + +/** + * enum base_csf_notification_type - Notification type + * + * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event + * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal + * error + * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu + * queue + * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type + * + * This type is used for &struct_base_csf_notification.type. 
+ */ +enum base_csf_notification_type { + BASE_CSF_NOTIFICATION_EVENT = 0, + BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, + BASE_CSF_NOTIFICATION_COUNT +}; + +/** + * struct base_csf_notification - Event or error notification + * + * @type: Notification type of @base_csf_notification_type + * @padding: Padding for 64bit boundary + * @payload: Input Payload + * @payload.align: To fit the struct into a 64-byte cache line + * @payload.csg_error: CSG error + * @payload.csg_error.handle: Handle of GPU command queue group associated with + * fatal error + * @payload.csg_error.padding: Padding + * @payload.csg_error.error: Unrecoverable fault error + * + */ +struct base_csf_notification { + __u8 type; + __u8 padding[7]; + union { + struct { + __u8 handle; + __u8 padding[7]; + struct base_gpu_queue_group_error error; + } csg_error; + + __u8 align[56]; + } payload; +}; + +/** + * struct mali_base_gpu_core_props - GPU core props info + * + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to align to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + */ +struct mali_base_gpu_core_props { + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; +}; + +#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ diff --git a/src/panfrost/base/include/csf/mali_gpu_csf_registers.h b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h new file mode 100644 index 00000000000..17e338cb238 --- /dev/null +++ b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * This header was originally autogenerated, but it is now ok (and + * expected) to have to add to it. + */ + +#ifndef _UAPI_GPU_CSF_REGISTERS_H_ +#define _UAPI_GPU_CSF_REGISTERS_H_ + +/* Only user block defines are included. HI words have been removed */ + +/* CS_USER_INPUT_BLOCK register offsets */ +#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ +#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ + +/* CS_USER_OUTPUT_BLOCK register offsets */ +#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ +#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ + +/* USER register offsets */ +#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ + +#endif diff --git a/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h new file mode 100644 index 00000000000..db7252605f0 --- /dev/null +++ b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h @@ -0,0 +1,530 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_CSF_IOCTL_H_ +#define _UAPI_KBASE_CSF_IOCTL_H_ + +#include +#include + +/* + * 1.0: + * - CSF IOCTL header separated from JM + * 1.1: + * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 1.2: + * - Add new CSF GPU_FEATURES register into the property structure + * returned by KBASE_IOCTL_GET_GPUPROPS + * 1.3: + * - Add __u32 group_uid member to + * &struct_kbase_ioctl_cs_queue_group_create.out + * 1.4: + * - Replace padding in kbase_ioctl_cs_get_glb_iface with + * instr_features member of same size + * 1.5: + * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new + * queue registration call with extended format for supporting CS + * trace configurations with CSF trace_command. + * 1.6: + * - Added new HW performance counters interface to all GPUs. + * 1.7: + * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use + * 1.8: + * - Removed Kernel legacy HWC interface + * 1.9: + * - Reorganization of GPU-VA memory zones, including addition of + * FIXED_VA zone and auto-initialization of EXEC_VA zone. 
+ * - Added new Base memory allocation interface + * 1.10: + * - First release of new HW performance counters interface. + * 1.11: + * - Dummy model (no mali) backend will now clear HWC values after each sample + * 1.12: + * - Added support for incremental rendering flag in CSG create call + */ + +#define BASE_UK_VERSION_MAJOR 1 +#define BASE_UK_VERSION_MINOR 12 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + +/** + * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the + * base back-end + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * + * Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. + * Any change of this struct should also be mirrored to the latter. + */ +struct kbase_ioctl_cs_queue_register { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER \ + _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) + +/** + * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler + * to notify that a queue has been updated + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_kick { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_KICK \ + _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) + +/** + * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group + * + * @in: Input parameters + * @in.buffer_gpu_addr: GPU address of the buffer backing the queue + * @in.group_handle: Handle of the group to which the queue should be bound + * @in.csi_index: Index of the CSF interface the queue should be bound to + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.mmap_handle: Handle to be used for creating the mapping of CS + * input/output pages + */ +union kbase_ioctl_cs_queue_bind { + struct { + __u64 buffer_gpu_addr; + __u8 group_handle; + __u8 csi_index; + __u8 padding[6]; + } in; + struct { + __u64 mmap_handle; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_BIND \ + _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) + +/** + * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the + * base back-end in extended format, + * involving trace buffer configuration + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * @ex_offset_var_addr: GPU address of the trace buffer write offset variable + * @ex_buffer_base: Trace buffer GPU base address for the queue + * @ex_buffer_size: Size of the trace buffer in bytes + * @ex_event_size: Trace event write size, in log2 designation + * @ex_event_state: Trace event states configuration + * @ex_padding: Currently unused, must be zero + * + * Note: There is an identical sub-section at the start of this struct to that + * of @ref kbase_ioctl_cs_queue_register. 
Any change of this sub-section + * must also be mirrored to the latter. Following the said sub-section, + * the remaining fields forms the extension, marked with ex_*. + */ +struct kbase_ioctl_cs_queue_register_ex { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; + __u64 ex_offset_var_addr; + __u64 ex_buffer_base; + __u32 ex_buffer_size; + __u8 ex_event_size; + __u8 ex_event_state; + __u8 ex_padding[2]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ + _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) + +/** + * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_terminate { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) + +/** + * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue + * group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. + */ +union kbase_ioctl_cs_queue_group_create_1_6 { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ + _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) + +/** + * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.csi_handlers: Flags to signal that the application intends to use CSI + * exception handlers in some linear buffers to deal with + * the given exception types. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. 
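Taken together, the queue ioctls above give the userspace lifecycle: register a ring buffer, bind it to a queue group, map the returned handle to reach the CS user input/output pages, then advance CS_INSERT and kick. A hedged sketch under these assumptions: plain ioctl() on an open kbase fd, KBASE_IOCTL_TYPE coming from mali_kbase_ioctl.h, the input block being the first mapped page, and ring-buffer allocation handled elsewhere; setup_queue() itself is hypothetical and error handling is omitted:

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

/* Hypothetical bring-up of one GPU command queue. */
static int
setup_queue(int fd, uint64_t ring_va, uint32_t ring_size, uint8_t group_handle)
{
   struct kbase_ioctl_cs_queue_register reg = {
      .buffer_gpu_addr = ring_va,
      .buffer_size = ring_size,
      .priority = 1,
   };
   if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_REGISTER, &reg))
      return -1;

   union kbase_ioctl_cs_queue_bind bind = { 0 };
   bind.in.buffer_gpu_addr = ring_va;
   bind.in.group_handle = group_handle;
   bind.in.csi_index = 0;
   if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind))
      return -1;

   /* Map the input/output/doorbell pages through the returned handle.
    * Page ordering (input block first) is assumed here. */
   size_t page = (size_t)sysconf(_SC_PAGESIZE);
   uint8_t *user_io = mmap(NULL, BASEP_QUEUE_NR_MMAP_USER_PAGES * page,
                           PROT_READ | PROT_WRITE, MAP_SHARED,
                           fd, bind.out.mmap_handle);
   if (user_io == MAP_FAILED)
      return -1;

   /* After writing commands into the ring: publish the new insert offset
    * (LO word at CS_INSERT; the HI word is assumed to follow), then kick. */
   *(volatile uint64_t *)(user_io + CS_INSERT) = 0 /* new insert offset */;

   struct kbase_ioctl_cs_queue_kick kick = { .buffer_gpu_addr = ring_va };
   return ioctl(fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick);
}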
+ */ +union kbase_ioctl_cs_queue_group_create { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 csi_handlers; + __u8 padding[2]; + /** + * @in.reserved: Reserved + */ + __u64 reserved; + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ + _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) + +/** + * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group + * + * @group_handle: Handle of the queue group to be terminated + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_cs_queue_group_term { + __u8 group_handle; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) + +#define KBASE_IOCTL_CS_EVENT_SIGNAL \ + _IO(KBASE_IOCTL_TYPE, 44) + +typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ + +/** + * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue + * + * @id: ID of the new command queue returned by the kernel + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_new { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ + _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) + +/** + * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue + * + * @id: ID of the command queue to be destroyed + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_delete { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ + _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) + +/** + * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue + * + * @addr: Memory address of an array of struct base_kcpu_queue_command + * @nr_commands: Number of commands in the array + * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_enqueue { + __u64 addr; + __u32 nr_commands; + base_kcpu_queue_id id; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ + _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) + +/** + * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap + * @in: Input parameters + * @in.chunk_size: Size of each chunk. + * @in.initial_chunks: Initial number of chunks that heap will be created with. + * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. + * @in.target_in_flight: Number of render-passes that the driver should attempt to + * keep in flight for which allocation of new chunks is + * allowed. + * @in.group_id: Group ID to be used for physical allocations. + * @in.padding: Padding + * @out: Output parameters + * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up + * for the heap. + * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, + * actually points to the header of heap chunk and not to + * the low address of free memory in the chunk. 
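+ *
+ * An illustrative, non-normative usage sketch, assuming an initialised
+ * kbase file descriptor mali_fd; the chunk size and counts are example
+ * values only:
+ *
+ *   union kbase_ioctl_cs_tiler_heap_init heap = { 0 };
+ *   heap.in.chunk_size       = 2 * 1024 * 1024;
+ *   heap.in.initial_chunks   = 5;
+ *   heap.in.max_chunks       = 200;
+ *   heap.in.target_in_flight = 65535;
+ *   if (ioctl(mali_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &heap) == 0)
+ *           heap_ctx_va = heap.out.gpu_heap_va;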
+ */ +union kbase_ioctl_cs_tiler_heap_init { + struct { + __u32 chunk_size; + __u32 initial_chunks; + __u32 max_chunks; + __u16 target_in_flight; + __u8 group_id; + __u8 padding; + } in; + struct { + __u64 gpu_heap_va; + __u64 first_chunk_va; + } out; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ + _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) + +/** + * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap + * instance + * + * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. + */ +struct kbase_ioctl_cs_tiler_heap_term { + __u64 gpu_heap_va; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ + _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) + +/** + * union kbase_ioctl_cs_get_glb_iface - Request the global control block + * of CSF interface capabilities + * + * @in: Input parameters + * @in.max_group_num: The maximum number of groups to be read. Can be 0, in + * which case groups_ptr is unused. + * @in.max_total_stream_num: The maximum number of CSs to be read. Can be 0, in + * which case streams_ptr is unused. + * @in.groups_ptr: Pointer where to store all the group data (sequentially). + * @in.streams_ptr: Pointer where to store all the CS data (sequentially). + * @out: Output parameters + * @out.glb_version: Global interface version. + * @out.features: Bit mask of features (e.g. whether certain types of job + * can be suspended). + * @out.group_num: Number of CSGs supported. + * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 + * hold the size of firmware performance counter data + * and 15:0 hold the size of hardware performance counter + * data. + * @out.total_stream_num: Total number of CSs, summed across all groups. + * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum + * size of events. Bits 3:0 hold the offset update rate. + * (csf >= 1.1.0) + * + */ +union kbase_ioctl_cs_get_glb_iface { + struct { + __u32 max_group_num; + __u32 max_total_stream_num; + __u64 groups_ptr; + __u64 streams_ptr; + } in; + struct { + __u32 glb_version; + __u32 features; + __u32 group_num; + __u32 prfcnt_size; + __u32 total_stream_num; + __u32 instr_features; + } out; +}; + +#define KBASE_IOCTL_CS_GET_GLB_IFACE \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) + +struct kbase_ioctl_cs_cpu_queue_info { + __u64 buffer; + __u64 size; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ + _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) + +/** + * union kbase_ioctl_mem_alloc_ex - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @in.fixed_address: The GPU virtual address requested for the allocation, + * if the allocation is using the BASE_MEM_FIXED flag. + * @in.extra: Space for extra parameters that may be added in the future. 
+ * @out: Output parameters
+ * @out.flags: Flags
+ * @out.gpu_va: The GPU virtual address which is allocated
+ */
+union kbase_ioctl_mem_alloc_ex {
+        struct {
+                __u64 va_pages;
+                __u64 commit_pages;
+                __u64 extension;
+                __u64 flags;
+                __u64 fixed_address;
+                __u64 extra[3];
+        } in;
+        struct {
+                __u64 flags;
+                __u64 gpu_va;
+        } out;
+};
+
+#define KBASE_IOCTL_MEM_ALLOC_EX _IOWR(KBASE_IOCTL_TYPE, 59, union kbase_ioctl_mem_alloc_ex)
+
+/***************
+ * test ioctls *
+ ***************/
+#if MALI_UNIT_TEST
+/* These ioctls are purely for test purposes and are not used in the production
+ * driver, they therefore may change without notice
+ */
+
+/**
+ * struct kbase_ioctl_cs_event_memory_write - Write an event memory address
+ * @cpu_addr: Memory address to write
+ * @value: Value to write
+ * @padding: Currently unused, must be zero
+ */
+struct kbase_ioctl_cs_event_memory_write {
+        __u64 cpu_addr;
+        __u8 value;
+        __u8 padding[7];
+};
+
+/**
+ * union kbase_ioctl_cs_event_memory_read - Read an event memory address
+ * @in: Input parameters
+ * @in.cpu_addr: Memory address to read
+ * @out: Output parameters
+ * @out.value: Value read
+ * @out.padding: Currently unused, must be zero
+ */
+union kbase_ioctl_cs_event_memory_read {
+        struct {
+                __u64 cpu_addr;
+        } in;
+        struct {
+                __u8 value;
+                __u8 padding[7];
+        } out;
+};
+
+#endif /* MALI_UNIT_TEST */
+
+#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */
diff --git a/src/panfrost/base/include/jm/mali_base_jm_kernel.h b/src/panfrost/base/include/jm/mali_base_jm_kernel.h
new file mode 100644
index 00000000000..ae43908b936
--- /dev/null
+++ b/src/panfrost/base/include/jm/mali_base_jm_kernel.h
@@ -0,0 +1,1051 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#ifndef _UAPI_BASE_JM_KERNEL_H_
+#define _UAPI_BASE_JM_KERNEL_H_
+
+#include <linux/types.h>
+#include "../mali_base_common_kernel.h"
+
+/* Memory allocation, access/hint flags & mask specific to JM GPU.
+ *
+ * See base_mem_alloc_flags.
+ */
+
+/* Used as BASE_MEM_FIXED in other backends */
+#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8)
+
+/**
+ * BASE_MEM_RESERVED_BIT_19 - Bit 19 is reserved.
+ * + * Do not remove, use the next unreserved bit for new flags + */ +#define BASE_MEM_RESERVED_BIT_19 ((base_mem_alloc_flags)1 << 19) + +/** + * BASE_MEM_TILER_ALIGN_TOP - Memory starting from the end of the initial commit is aligned + * to 'extension' pages, where 'extension' must be a power of 2 and no more than + * BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES + */ +#define BASE_MEM_TILER_ALIGN_TOP ((base_mem_alloc_flags)1 << 20) + +/* Use the GPU VA chosen by the kernel client */ +#define BASE_MEM_FLAG_MAP_FIXED ((base_mem_alloc_flags)1 << 27) + +/* Force trimming of JIT allocations when creating a new allocation */ +#define BASEP_MEM_PERFORM_JIT_TRIM ((base_mem_alloc_flags)1 << 29) + +/* Note that the number of bits used for base_mem_alloc_flags + * must be less than BASE_MEM_FLAGS_NR_BITS !!! + */ + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE | \ + BASE_MEM_FLAG_MAP_FIXED | BASEP_MEM_PERFORM_JIT_TRIM) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED \ + (BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_19) + + +/* Similar to BASE_MEM_TILER_ALIGN_TOP, memory starting from the end of the + * initial commit is aligned to 'extension' pages, where 'extension' must be a power + * of 2 and no more than BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES + */ +#define BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP (1 << 0) + +/** + * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE - If set, the heap info address points + * to a __u32 holding the used size in bytes; + * otherwise it points to a __u64 holding the lowest address of unused memory. + */ +#define BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE (1 << 1) + +/** + * BASE_JIT_ALLOC_VALID_FLAGS - Valid set of just-in-time memory allocation flags + * + * Note: BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE cannot be set if heap_info_gpu_addr + * in %base_jit_alloc_info is 0 (atom with BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE set + * and heap_info_gpu_addr being 0 will be rejected). + */ +#define BASE_JIT_ALLOC_VALID_FLAGS \ + (BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP | BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* + * Private flags used on the base context + * + * These start at bit 31, and run down to zero. + * + * They share the same space as base_context_create_flags, and so must + * not collide with them. + */ + +/* Private flag tracking whether job descriptor dumping is disabled */ +#define BASEP_CONTEXT_FLAG_JOB_DUMP_DISABLED \ + ((base_context_create_flags)(1 << 31)) + +/* Flags for base tracepoint specific to JM */ +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED) +/* + * Dependency stuff, keep it private for now. May want to expose it if + * we decide to make the number of semaphores a configurable + * option. + */ +#define BASE_JD_ATOM_COUNT 256 + +/* Maximum number of concurrent render passes. 
+ */ +#define BASE_JD_RP_COUNT (256) + +/* Set/reset values for a software event */ +#define BASE_JD_SOFT_EVENT_SET ((unsigned char)1) +#define BASE_JD_SOFT_EVENT_RESET ((unsigned char)0) + +/** + * struct base_jd_udata - Per-job data + * + * @blob: per-job data array + * + * This structure is used to store per-job data, and is completely unused + * by the Base driver. It can be used to store things such as callback + * function pointer, data to handle job completion. It is guaranteed to be + * untouched by the Base driver. + */ +struct base_jd_udata { + __u64 blob[2]; +}; + +/** + * typedef base_jd_dep_type - Job dependency type. + * + * A flags field will be inserted into the atom structure to specify whether a + * dependency is a data or ordering dependency (by putting it before/after + * 'core_req' in the structure it should be possible to add without changing + * the structure size). + * When the flag is set for a particular dependency to signal that it is an + * ordering only dependency then errors will not be propagated. + */ +typedef __u8 base_jd_dep_type; + +#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ +#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ +#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ + +/** + * typedef base_jd_core_req - Job chain hardware requirements. + * + * A job chain must specify what GPU features it needs to allow the + * driver to schedule the job correctly. By not specifying the + * correct settings can/will cause an early job termination. Multiple + * values can be ORed together to specify multiple requirements. + * Special case is ::BASE_JD_REQ_DEP, which is used to express complex + * dependencies, and that doesn't execute anything on the hardware. + */ +typedef __u32 base_jd_core_req; + +/* Requirements that come from the HW */ + +/* No requirement, dependency only + */ +#define BASE_JD_REQ_DEP ((base_jd_core_req)0) + +/* Requires fragment shaders + */ +#define BASE_JD_REQ_FS ((base_jd_core_req)1 << 0) + +/* Requires compute shaders + * + * This covers any of the following GPU job types: + * - Vertex Shader Job + * - Geometry Shader Job + * - An actual Compute Shader Job + * + * Compare this with BASE_JD_REQ_ONLY_COMPUTE, which specifies that the + * job is specifically just the "Compute Shader" job type, and not the "Vertex + * Shader" nor the "Geometry Shader" job type. + */ +#define BASE_JD_REQ_CS ((base_jd_core_req)1 << 1) + +/* Requires tiling */ +#define BASE_JD_REQ_T ((base_jd_core_req)1 << 2) + +/* Requires cache flushes */ +#define BASE_JD_REQ_CF ((base_jd_core_req)1 << 3) + +/* Requires value writeback */ +#define BASE_JD_REQ_V ((base_jd_core_req)1 << 4) + +/* SW-only requirements - the HW does not expose these as part of the job slot + * capabilities + */ + +/* Requires fragment job with AFBC encoding */ +#define BASE_JD_REQ_FS_AFBC ((base_jd_core_req)1 << 13) + +/* SW-only requirement: coalesce completion events. + * If this bit is set then completion of this atom will not cause an event to + * be sent to userspace, whether successful or not; completion events will be + * deferred until an atom completes which does not have this bit set. + * + * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. + */ +#define BASE_JD_REQ_EVENT_COALESCE ((base_jd_core_req)1 << 5) + +/* SW Only requirement: the job chain requires a coherent core group. We don't + * mind which coherent core group is used. 
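+ *
+ * As with the other BASE_JD_REQ_* bits, this flag is ORed into an atom's
+ * base_jd_core_req; for example (illustrative only), a vertex/tiler job
+ * chain that wants a coherent core group could request:
+ *
+ *   base_jd_core_req req = BASE_JD_REQ_CS | BASE_JD_REQ_T |
+ *                          BASE_JD_REQ_COHERENT_GROUP;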
+ */ +#define BASE_JD_REQ_COHERENT_GROUP ((base_jd_core_req)1 << 6) + +/* SW Only requirement: The performance counters should be enabled only when + * they are needed, to reduce power consumption. + */ +#define BASE_JD_REQ_PERMON ((base_jd_core_req)1 << 7) + +/* SW Only requirement: External resources are referenced by this atom. + * + * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE and + * BASE_JD_REQ_SOFT_EVENT_WAIT. + */ +#define BASE_JD_REQ_EXTERNAL_RESOURCES ((base_jd_core_req)1 << 8) + +/* SW Only requirement: Software defined job. Jobs with this bit set will not be + * submitted to the hardware but will cause some action to happen within the + * driver + */ +#define BASE_JD_REQ_SOFT_JOB ((base_jd_core_req)1 << 9) + +#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) +#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) +#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) + +/* 0x4 RESERVED for now */ + +/* SW only requirement: event wait/trigger job. + * + * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. + * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the + * other waiting jobs. It completes immediately. + * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it + * possible for other jobs to wait upon. It completes immediately. + */ +#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) +#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) +#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) + +#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) + +/* SW only requirement: Just In Time allocation + * + * This job requests a single or multiple just-in-time allocations through a + * list of base_jit_alloc_info structure which is passed via the jc element of + * the atom. The number of base_jit_alloc_info structures present in the + * list is passed via the nr_extres element of the atom + * + * It should be noted that the id entry in base_jit_alloc_info must not + * be reused until it has been released via BASE_JD_REQ_SOFT_JIT_FREE. + * + * Should this soft job fail it is expected that a BASE_JD_REQ_SOFT_JIT_FREE + * soft job to free the JIT allocation is still made. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) + +/* SW only requirement: Just In Time free + * + * This job requests a single or multiple just-in-time allocations created by + * BASE_JD_REQ_SOFT_JIT_ALLOC to be freed. The ID list of the just-in-time + * allocations is passed via the jc element of the atom. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) + +/* SW only requirement: Map external resource + * + * This job requests external resource(s) are mapped once the dependencies + * of the job have been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) + +/* SW only requirement: Unmap external resource + * + * This job requests external resource(s) are unmapped once the dependencies + * of the job has been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * base_external_resource_list. 
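+ *
+ * Illustrative, non-normative use: the soft job is submitted like any
+ * other atom, with jc pointing at a populated base_external_resource_list
+ * (declared elsewhere), e.g.:
+ *
+ *   atom.core_req = BASE_JD_REQ_SOFT_EXT_RES_UNMAP;
+ *   atom.jc       = (__u64)(uintptr_t)&res_list;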
+ */ +#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) + +/* HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) + * + * This indicates that the Job Chain contains GPU jobs of the 'Compute + * Shaders' type. + * + * In contrast to BASE_JD_REQ_CS, this does not indicate that the Job + * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. + */ +#define BASE_JD_REQ_ONLY_COMPUTE ((base_jd_core_req)1 << 10) + +/* HW Requirement: Use the base_jd_atom::device_nr field to specify a + * particular core group + * + * If both BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag + * takes priority + * + * This is only guaranteed to work for BASE_JD_REQ_ONLY_COMPUTE atoms. + */ +#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((base_jd_core_req)1 << 11) + +/* SW Flag: If this bit is set then the successful completion of this atom + * will not cause an event to be sent to userspace + */ +#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((base_jd_core_req)1 << 12) + +/* SW Flag: If this bit is set then completion of this atom will not cause an + * event to be sent to userspace, whether successful or not. + */ +#define BASEP_JD_REQ_EVENT_NEVER ((base_jd_core_req)1 << 14) + +/* SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job starts which does not have this bit set or a job completes + * which does not have the BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use + * if the CPU may have written to memory addressed by the job since the last job + * without this bit set was submitted. + */ +#define BASE_JD_REQ_SKIP_CACHE_START ((base_jd_core_req)1 << 15) + +/* SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job completes which does not have this bit set or a job starts + * which does not have the BASE_JD_REQ_SKIP_CACHE_START bit set. Do not use + * if the CPU may read from or partially overwrite memory addressed by the job + * before the next job without this bit set completes. + */ +#define BASE_JD_REQ_SKIP_CACHE_END ((base_jd_core_req)1 << 16) + +/* Request the atom be executed on a specific job slot. + * + * When this flag is specified, it takes precedence over any existing job slot + * selection logic. + */ +#define BASE_JD_REQ_JOB_SLOT ((base_jd_core_req)1 << 17) + +/* SW-only requirement: The atom is the start of a renderpass. + * + * If this bit is set then the job chain will be soft-stopped if it causes the + * GPU to write beyond the end of the physical pages backing the tiler heap, and + * committing more memory to the heap would exceed an internal threshold. It may + * be resumed after running one of the job chains attached to an atom with + * BASE_JD_REQ_END_RENDERPASS set and the same renderpass ID. It may be + * resumed multiple times until it completes without memory usage exceeding the + * threshold. + * + * Usually used with BASE_JD_REQ_T. + */ +#define BASE_JD_REQ_START_RENDERPASS ((base_jd_core_req)1 << 18) + +/* SW-only requirement: The atom is the end of a renderpass. + * + * If this bit is set then the atom incorporates the CPU address of a + * base_jd_fragment object instead of the GPU address of a job chain. 
+ * + * Which job chain is run depends upon whether the atom with the same renderpass + * ID and the BASE_JD_REQ_START_RENDERPASS bit set completed normally or + * was soft-stopped when it exceeded an upper threshold for tiler heap memory + * usage. + * + * It also depends upon whether one of the job chains attached to the atom has + * already been run as part of the same renderpass (in which case it would have + * written unresolved multisampled and otherwise-discarded output to temporary + * buffers that need to be read back). The job chain for doing a forced read and + * forced write (from/to temporary buffers) is run as many times as necessary. + * + * Usually used with BASE_JD_REQ_FS. + */ +#define BASE_JD_REQ_END_RENDERPASS ((base_jd_core_req)1 << 19) + +/* SW-only requirement: The atom needs to run on a limited core mask affinity. + * + * If this bit is set then the kbase_context.limited_core_mask will be applied + * to the affinity. + */ +#define BASE_JD_REQ_LIMITED_CORE_MASK ((base_jd_core_req)1 << 20) + +/* These requirement bits are currently unused in base_jd_core_req + */ +#define BASEP_JD_REQ_RESERVED \ + (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ + BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | BASEP_JD_REQ_EVENT_NEVER | \ + BASE_JD_REQ_EVENT_COALESCE | \ + BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ + BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ + BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END | \ + BASE_JD_REQ_JOB_SLOT | BASE_JD_REQ_START_RENDERPASS | \ + BASE_JD_REQ_END_RENDERPASS | BASE_JD_REQ_LIMITED_CORE_MASK)) + +/* Mask of all bits in base_jd_core_req that control the type of the atom. + * + * This allows dependency only atoms to have flags set + */ +#define BASE_JD_REQ_ATOM_TYPE \ + (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ + BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) + +/** + * BASE_JD_REQ_SOFT_JOB_TYPE - Mask of all bits in base_jd_core_req that + * controls the type of a soft job. + */ +#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) + +/* Returns non-zero value if core requirements passed define a soft job or + * a dependency only job. + */ +#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ + (((core_req) & BASE_JD_REQ_SOFT_JOB) || \ + ((core_req) & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) + +/** + * enum kbase_jd_atom_state - Atom states + * + * @KBASE_JD_ATOM_STATE_UNUSED: Atom is not used. + * @KBASE_JD_ATOM_STATE_QUEUED: Atom is queued in JD. + * @KBASE_JD_ATOM_STATE_IN_JS: Atom has been given to JS (is runnable/running). + * @KBASE_JD_ATOM_STATE_HW_COMPLETED: Atom has been completed, but not yet + * handed back to job dispatcher for + * dependency resolution. + * @KBASE_JD_ATOM_STATE_COMPLETED: Atom has been completed, but not yet handed + * back to userspace. + */ +enum kbase_jd_atom_state { + KBASE_JD_ATOM_STATE_UNUSED, + KBASE_JD_ATOM_STATE_QUEUED, + KBASE_JD_ATOM_STATE_IN_JS, + KBASE_JD_ATOM_STATE_HW_COMPLETED, + KBASE_JD_ATOM_STATE_COMPLETED +}; + +/** + * typedef base_atom_id - Type big enough to store an atom number in. + */ +typedef __u8 base_atom_id; + +/** + * struct base_dependency - base dependency + * + * @atom_id: An atom number + * @dependency_type: Dependency type + */ +struct base_dependency { + base_atom_id atom_id; + base_jd_dep_type dependency_type; +}; + +/** + * struct base_jd_fragment - Set of GPU fragment job chains used for rendering. + * + * @norm_read_norm_write: Job chain for full rendering. 
+ * GPU address of a fragment job chain to render in the + * circumstance where the tiler job chain did not exceed + * its memory usage threshold and no fragment job chain + * was previously run for the same renderpass. + * It is used no more than once per renderpass. + * @norm_read_forced_write: Job chain for starting incremental + * rendering. + * GPU address of a fragment job chain to render in + * the circumstance where the tiler job chain exceeded + * its memory usage threshold for the first time and + * no fragment job chain was previously run for the + * same renderpass. + * Writes unresolved multisampled and normally- + * discarded output to temporary buffers that must be + * read back by a subsequent forced_read job chain + * before the renderpass is complete. + * It is used no more than once per renderpass. + * @forced_read_forced_write: Job chain for continuing incremental + * rendering. + * GPU address of a fragment job chain to render in + * the circumstance where the tiler job chain + * exceeded its memory usage threshold again + * and a fragment job chain was previously run for + * the same renderpass. + * Reads unresolved multisampled and + * normally-discarded output from temporary buffers + * written by a previous forced_write job chain and + * writes the same to temporary buffers again. + * It is used as many times as required until + * rendering completes. + * @forced_read_norm_write: Job chain for ending incremental rendering. + * GPU address of a fragment job chain to render in the + * circumstance where the tiler job chain did not + * exceed its memory usage threshold this time and a + * fragment job chain was previously run for the same + * renderpass. + * Reads unresolved multisampled and normally-discarded + * output from temporary buffers written by a previous + * forced_write job chain in order to complete a + * renderpass. + * It is used no more than once per renderpass. + * + * This structure is referenced by the main atom structure if + * BASE_JD_REQ_END_RENDERPASS is set in the base_jd_core_req. + */ +struct base_jd_fragment { + __u64 norm_read_norm_write; + __u64 norm_read_forced_write; + __u64 forced_read_forced_write; + __u64 forced_read_norm_write; +}; + +/** + * typedef base_jd_prio - Base Atom priority. + * + * Only certain priority levels are actually implemented, as specified by the + * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority + * level that is not one of those defined below. + * + * Priority levels only affect scheduling after the atoms have had dependencies + * resolved. For example, a low priority atom that has had its dependencies + * resolved might run before a higher priority atom that has not had its + * dependencies resolved. + * + * In general, fragment atoms do not affect non-fragment atoms with + * lower priorities, and vice versa. One exception is that there is only one + * priority value for each context. So a high-priority (e.g.) fragment atom + * could increase its context priority, causing its non-fragment atoms to also + * be scheduled sooner. 
+ * + * The atoms are scheduled as follows with respect to their priorities: + * * Let atoms 'X' and 'Y' be for the same job slot who have dependencies + * resolved, and atom 'X' has a higher priority than atom 'Y' + * * If atom 'Y' is currently running on the HW, then it is interrupted to + * allow atom 'X' to run soon after + * * If instead neither atom 'Y' nor atom 'X' are running, then when choosing + * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' + * * Any two atoms that have the same priority could run in any order with + * respect to each other. That is, there is no ordering constraint between + * atoms of the same priority. + * + * The sysfs file 'js_ctx_scheduling_mode' is used to control how atoms are + * scheduled between contexts. The default value, 0, will cause higher-priority + * atoms to be scheduled first, regardless of their context. The value 1 will + * use a round-robin algorithm when deciding which context's atoms to schedule + * next, so higher-priority atoms can only preempt lower priority atoms within + * the same context. See KBASE_JS_SYSTEM_PRIORITY_MODE and + * KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE for more details. + */ +typedef __u8 base_jd_prio; + +/* Medium atom priority. This is a priority higher than BASE_JD_PRIO_LOW */ +#define BASE_JD_PRIO_MEDIUM ((base_jd_prio)0) +/* High atom priority. This is a priority higher than BASE_JD_PRIO_MEDIUM and + * BASE_JD_PRIO_LOW + */ +#define BASE_JD_PRIO_HIGH ((base_jd_prio)1) +/* Low atom priority. */ +#define BASE_JD_PRIO_LOW ((base_jd_prio)2) +/* Real-Time atom priority. This is a priority higher than BASE_JD_PRIO_HIGH, + * BASE_JD_PRIO_MEDIUM, and BASE_JD_PRIO_LOW + */ +#define BASE_JD_PRIO_REALTIME ((base_jd_prio)3) + +/* Invalid atom priority (max uint8_t value) */ +#define BASE_JD_PRIO_INVALID ((base_jd_prio)255) + +/* Count of the number of priority levels. This itself is not a valid + * base_jd_prio setting + */ +#define BASE_JD_NR_PRIO_LEVELS 4 + +/** + * struct base_jd_atom_v2 - Node of a dependency graph used to submit a + * GPU job chain or soft-job to the kernel driver. + * + * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS + * is set in the base_jd_core_req) the CPU address of a + * base_jd_fragment object. + * @udata: User data. + * @extres_list: List of external resources. + * @nr_extres: Number of external resources or JIT allocations. + * @jit_id: Zero-terminated array of IDs of just-in-time memory + * allocations written to by the atom. When the atom + * completes, the value stored at the + * &struct_base_jit_alloc_info.heap_info_gpu_addr of + * each allocation is read in order to enforce an + * overall physical memory usage limit. + * @pre_dep: Pre-dependencies. One need to use SETTER function to assign + * this field; this is done in order to reduce possibility of + * improper assignment of a dependency field. + * @atom_number: Unique number to identify the atom. + * @prio: Atom priority. Refer to base_jd_prio for more details. + * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + * specified. + * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. + * @core_req: Core requirements. + * @renderpass_id: Renderpass identifier used to associate an atom that has + * BASE_JD_REQ_START_RENDERPASS set in its core requirements + * with an atom that has BASE_JD_REQ_END_RENDERPASS set. + * @padding: Unused. Must be zero. + * + * This structure has changed since UK 10.2 for which base_jd_core_req was a + * __u16 value. 
+ * + * In UK 10.3 a core_req field of a __u32 type was added to the end of the + * structure, and the place in the structure previously occupied by __u16 + * core_req was kept but renamed to compat_core_req. + * + * From UK 11.20 - compat_core_req is now occupied by __u8 jit_id[2]. + * Compatibility with UK 10.x from UK 11.y is not handled because + * the major version increase prevents this. + * + * For UK 11.20 jit_id[2] must be initialized to zero. + */ +struct base_jd_atom_v2 { + __u64 jc; + struct base_jd_udata udata; + __u64 extres_list; + __u16 nr_extres; + __u8 jit_id[2]; + struct base_dependency pre_dep[2]; + base_atom_id atom_number; + base_jd_prio prio; + __u8 device_nr; + __u8 jobslot; + base_jd_core_req core_req; + __u8 renderpass_id; + __u8 padding[7]; +}; + +/** + * struct base_jd_atom - Same as base_jd_atom_v2, but has an extra seq_nr + * at the beginning. + * + * @seq_nr: Sequence number of logical grouping of atoms. + * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS + * is set in the base_jd_core_req) the CPU address of a + * base_jd_fragment object. + * @udata: User data. + * @extres_list: List of external resources. + * @nr_extres: Number of external resources or JIT allocations. + * @jit_id: Zero-terminated array of IDs of just-in-time memory + * allocations written to by the atom. When the atom + * completes, the value stored at the + * &struct_base_jit_alloc_info.heap_info_gpu_addr of + * each allocation is read in order to enforce an + * overall physical memory usage limit. + * @pre_dep: Pre-dependencies. One need to use SETTER function to assign + * this field; this is done in order to reduce possibility of + * improper assignment of a dependency field. + * @atom_number: Unique number to identify the atom. + * @prio: Atom priority. Refer to base_jd_prio for more details. + * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + * specified. + * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. + * @core_req: Core requirements. + * @renderpass_id: Renderpass identifier used to associate an atom that has + * BASE_JD_REQ_START_RENDERPASS set in its core requirements + * with an atom that has BASE_JD_REQ_END_RENDERPASS set. + * @padding: Unused. Must be zero. + */ +typedef struct base_jd_atom { + __u64 seq_nr; + __u64 jc; + struct base_jd_udata udata; + __u64 extres_list; + __u16 nr_extres; + __u8 jit_id[2]; + struct base_dependency pre_dep[2]; + base_atom_id atom_number; + base_jd_prio prio; + __u8 device_nr; + __u8 jobslot; + base_jd_core_req core_req; + __u8 renderpass_id; + __u8 padding[7]; +} base_jd_atom; + +/* Job chain event code bits + * Defines the bits used to create ::base_jd_event_code + */ +enum { + BASE_JD_SW_EVENT_KERNEL = (1u << 15), /* Kernel side event */ + BASE_JD_SW_EVENT = (1u << 14), /* SW defined event */ + /* Event indicates success (SW events only) */ + BASE_JD_SW_EVENT_SUCCESS = (1u << 13), + BASE_JD_SW_EVENT_JOB = (0u << 11), /* Job related event */ + BASE_JD_SW_EVENT_BAG = (1u << 11), /* Bag related event */ + BASE_JD_SW_EVENT_INFO = (2u << 11), /* Misc/info event */ + BASE_JD_SW_EVENT_RESERVED = (3u << 11), /* Reserved event type */ + /* Mask to extract the type from an event code */ + BASE_JD_SW_EVENT_TYPE_MASK = (3u << 11) +}; + +/** + * enum base_jd_event_code - Job chain event codes + * + * @BASE_JD_EVENT_RANGE_HW_NONFAULT_START: Start of hardware non-fault status + * codes. + * Obscurely, BASE_JD_EVENT_TERMINATED + * indicates a real fault, because the + * job was hard-stopped. 
+ * @BASE_JD_EVENT_NOT_STARTED: Can't be seen by userspace, treated as + * 'previous job done'. + * @BASE_JD_EVENT_STOPPED: Can't be seen by userspace, becomes + * TERMINATED, DONE or JOB_CANCELLED. + * @BASE_JD_EVENT_TERMINATED: This is actually a fault status code - the job + * was hard stopped. + * @BASE_JD_EVENT_ACTIVE: Can't be seen by userspace, jobs only returned on + * complete/fail/cancel. + * @BASE_JD_EVENT_RANGE_HW_NONFAULT_END: End of hardware non-fault status codes. + * Obscurely, BASE_JD_EVENT_TERMINATED + * indicates a real fault, + * because the job was hard-stopped. + * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START: Start of hardware fault and + * software error status codes. + * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END: End of hardware fault and + * software error status codes. + * @BASE_JD_EVENT_RANGE_SW_SUCCESS_START: Start of software success status + * codes. + * @BASE_JD_EVENT_RANGE_SW_SUCCESS_END: End of software success status codes. + * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_START: Start of kernel-only status codes. + * Such codes are never returned to + * user-space. + * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_END: End of kernel-only status codes. + * @BASE_JD_EVENT_DONE: atom has completed successfull + * @BASE_JD_EVENT_JOB_CONFIG_FAULT: Atom dependencies configuration error which + * shall result in a failed atom + * @BASE_JD_EVENT_JOB_POWER_FAULT: The job could not be executed because the + * part of the memory system required to access + * job descriptors was not powered on + * @BASE_JD_EVENT_JOB_READ_FAULT: Reading a job descriptor into the Job + * manager failed + * @BASE_JD_EVENT_JOB_WRITE_FAULT: Writing a job descriptor from the Job + * manager failed + * @BASE_JD_EVENT_JOB_AFFINITY_FAULT: The job could not be executed because the + * specified affinity mask does not intersect + * any available cores + * @BASE_JD_EVENT_JOB_BUS_FAULT: A bus access failed while executing a job + * @BASE_JD_EVENT_INSTR_INVALID_PC: A shader instruction with an illegal program + * counter was executed. + * @BASE_JD_EVENT_INSTR_INVALID_ENC: A shader instruction with an illegal + * encoding was executed. + * @BASE_JD_EVENT_INSTR_TYPE_MISMATCH: A shader instruction was executed where + * the instruction encoding did not match the + * instruction type encoded in the program + * counter. + * @BASE_JD_EVENT_INSTR_OPERAND_FAULT: A shader instruction was executed that + * contained invalid combinations of operands. + * @BASE_JD_EVENT_INSTR_TLS_FAULT: A shader instruction was executed that tried + * to access the thread local storage section + * of another thread. + * @BASE_JD_EVENT_INSTR_ALIGN_FAULT: A shader instruction was executed that + * tried to do an unsupported unaligned memory + * access. + * @BASE_JD_EVENT_INSTR_BARRIER_FAULT: A shader instruction was executed that + * failed to complete an instruction barrier. + * @BASE_JD_EVENT_DATA_INVALID_FAULT: Any data structure read as part of the job + * contains invalid combinations of data. + * @BASE_JD_EVENT_TILE_RANGE_FAULT: Tile or fragment shading was asked to + * process a tile that is entirely outside the + * bounding box of the frame. + * @BASE_JD_EVENT_STATE_FAULT: Matches ADDR_RANGE_FAULT. A virtual address + * has been found that exceeds the virtual + * address range. + * @BASE_JD_EVENT_OUT_OF_MEMORY: The tiler ran out of memory when executing a job. + * @BASE_JD_EVENT_UNKNOWN: If multiple jobs in a job chain fail, only + * the first one the reports an error will set + * and return full error information. 
+ * Subsequent failing jobs will not update the + * error status registers, and may write an + * error status of UNKNOWN. + * @BASE_JD_EVENT_DELAYED_BUS_FAULT: The GPU received a bus fault for access to + * physical memory where the original virtual + * address is no longer available. + * @BASE_JD_EVENT_SHAREABILITY_FAULT: Matches GPU_SHAREABILITY_FAULT. A cache + * has detected that the same line has been + * accessed as both shareable and non-shareable + * memory from inside the GPU. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1: A memory access hit an invalid table + * entry at level 1 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2: A memory access hit an invalid table + * entry at level 2 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3: A memory access hit an invalid table + * entry at level 3 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4: A memory access hit an invalid table + * entry at level 4 of the translation table. + * @BASE_JD_EVENT_PERMISSION_FAULT: A memory access could not be allowed due to + * the permission flags set in translation + * table + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1: A bus fault occurred while reading + * level 0 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2: A bus fault occurred while reading + * level 1 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3: A bus fault occurred while reading + * level 2 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4: A bus fault occurred while reading + * level 3 of the translation tables. + * @BASE_JD_EVENT_ACCESS_FLAG: Matches ACCESS_FLAG_0. A memory access hit a + * translation table entry with the ACCESS_FLAG + * bit set to zero in level 0 of the + * page table, and the DISABLE_AF_FAULT flag + * was not set. + * @BASE_JD_EVENT_MEM_GROWTH_FAILED: raised for JIT_ALLOC atoms that failed to + * grow memory on demand + * @BASE_JD_EVENT_JOB_CANCELLED: raised when this atom was hard-stopped or its + * dependencies failed + * @BASE_JD_EVENT_JOB_INVALID: raised for many reasons, including invalid data + * in the atom which overlaps with + * BASE_JD_EVENT_JOB_CONFIG_FAULT, or if the + * platform doesn't support the feature specified in + * the atom. + * @BASE_JD_EVENT_DRV_TERMINATED: this is a special event generated to indicate + * to userspace that the KBase context has been + * destroyed and Base should stop listening for + * further events + * @BASE_JD_EVENT_REMOVED_FROM_NEXT: raised when an atom that was configured in + * the GPU has to be retried (but it has not + * started) due to e.g., GPU reset + * @BASE_JD_EVENT_END_RP_DONE: this is used for incremental rendering to signal + * the completion of a renderpass. This value + * shouldn't be returned to userspace but I haven't + * seen where it is reset back to JD_EVENT_DONE. + * + * HW and low-level SW events are represented by event codes. + * The status of jobs which succeeded are also represented by + * an event code (see @BASE_JD_EVENT_DONE). + * Events are usually reported as part of a &struct base_jd_event. 
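+ *
+ * For example (illustrative only), given the bit layout described below,
+ * a reported code can be classified with:
+ *
+ *   bool is_sw_event = (code & BASE_JD_SW_EVENT) != 0;
+ *   bool is_success  = is_sw_event && (code & BASE_JD_SW_EVENT_SUCCESS);
+ *   __u32 sw_type    = code & BASE_JD_SW_EVENT_TYPE_MASK;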
+ * + * The event codes are encoded in the following way: + * * 10:0 - subtype + * * 12:11 - type + * * 13 - SW success (only valid if the SW bit is set) + * * 14 - SW event (HW event if not set) + * * 15 - Kernel event (should never be seen in userspace) + * + * Events are split up into ranges as follows: + * * BASE_JD_EVENT_RANGE__START + * * BASE_JD_EVENT_RANGE__END + * + * code is in 's range when: + * BASE_JD_EVENT_RANGE__START <= code < + * BASE_JD_EVENT_RANGE__END + * + * Ranges can be asserted for adjacency by testing that the END of the previous + * is equal to the START of the next. This is useful for optimizing some tests + * for range. + * + * A limitation is that the last member of this enum must explicitly be handled + * (with an assert-unreachable statement) in switch statements that use + * variables of this type. Otherwise, the compiler warns that we have not + * handled that enum value. + */ +enum base_jd_event_code { + /* HW defined exceptions */ + BASE_JD_EVENT_RANGE_HW_NONFAULT_START = 0, + + /* non-fatal exceptions */ + BASE_JD_EVENT_NOT_STARTED = 0x00, + BASE_JD_EVENT_DONE = 0x01, + BASE_JD_EVENT_STOPPED = 0x03, + BASE_JD_EVENT_TERMINATED = 0x04, + BASE_JD_EVENT_ACTIVE = 0x08, + + BASE_JD_EVENT_RANGE_HW_NONFAULT_END = 0x40, + BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START = 0x40, + + /* job exceptions */ + BASE_JD_EVENT_JOB_CONFIG_FAULT = 0x40, + BASE_JD_EVENT_JOB_POWER_FAULT = 0x41, + BASE_JD_EVENT_JOB_READ_FAULT = 0x42, + BASE_JD_EVENT_JOB_WRITE_FAULT = 0x43, + BASE_JD_EVENT_JOB_AFFINITY_FAULT = 0x44, + BASE_JD_EVENT_JOB_BUS_FAULT = 0x48, + BASE_JD_EVENT_INSTR_INVALID_PC = 0x50, + BASE_JD_EVENT_INSTR_INVALID_ENC = 0x51, + BASE_JD_EVENT_INSTR_TYPE_MISMATCH = 0x52, + BASE_JD_EVENT_INSTR_OPERAND_FAULT = 0x53, + BASE_JD_EVENT_INSTR_TLS_FAULT = 0x54, + BASE_JD_EVENT_INSTR_BARRIER_FAULT = 0x55, + BASE_JD_EVENT_INSTR_ALIGN_FAULT = 0x56, + BASE_JD_EVENT_DATA_INVALID_FAULT = 0x58, + BASE_JD_EVENT_TILE_RANGE_FAULT = 0x59, + BASE_JD_EVENT_STATE_FAULT = 0x5A, + BASE_JD_EVENT_OUT_OF_MEMORY = 0x60, + BASE_JD_EVENT_UNKNOWN = 0x7F, + + /* GPU exceptions */ + BASE_JD_EVENT_DELAYED_BUS_FAULT = 0x80, + BASE_JD_EVENT_SHAREABILITY_FAULT = 0x88, + + /* MMU exceptions */ + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1 = 0xC1, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2 = 0xC2, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3 = 0xC3, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4 = 0xC4, + BASE_JD_EVENT_PERMISSION_FAULT = 0xC8, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1 = 0xD1, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2 = 0xD2, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3 = 0xD3, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4 = 0xD4, + BASE_JD_EVENT_ACCESS_FLAG = 0xD8, + + /* SW defined exceptions */ + BASE_JD_EVENT_MEM_GROWTH_FAILED = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x000, + BASE_JD_EVENT_JOB_CANCELLED = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x002, + BASE_JD_EVENT_JOB_INVALID = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x003, + + BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_RESERVED | 0x3FF, + + BASE_JD_EVENT_RANGE_SW_SUCCESS_START = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | 0x000, + + BASE_JD_EVENT_DRV_TERMINATED = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_INFO | 0x000, + + BASE_JD_EVENT_RANGE_SW_SUCCESS_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_RESERVED | 0x3FF, + + BASE_JD_EVENT_RANGE_KERNEL_ONLY_START = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | 0x000, + BASE_JD_EVENT_REMOVED_FROM_NEXT = 
BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x000, + BASE_JD_EVENT_END_RP_DONE = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x001, + + BASE_JD_EVENT_RANGE_KERNEL_ONLY_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_RESERVED | 0x3FF +}; + +/** + * struct base_jd_event_v2 - Event reporting structure + * + * @event_code: event code of type @ref base_jd_event_code. + * @atom_number: the atom number that has completed. + * @padding: padding. + * @udata: user data. + * + * This structure is used by the kernel driver to report information + * about GPU events. They can either be HW-specific events or low-level + * SW events, such as job-chain completion. + * + * The event code contains an event type field which can be extracted + * by ANDing with BASE_JD_SW_EVENT_TYPE_MASK. + */ +struct base_jd_event_v2 { + __u32 event_code; + base_atom_id atom_number; + __u8 padding[3]; + struct base_jd_udata udata; +}; + +/** + * struct base_dump_cpu_gpu_counters - Structure for + * BASE_JD_REQ_SOFT_DUMP_CPU_GPU_COUNTERS + * jobs. + * @system_time: gpu timestamp + * @cycle_counter: gpu cycle count + * @sec: cpu time(sec) + * @usec: cpu time(usec) + * @padding: padding + * + * This structure is stored into the memory pointed to by the @jc field + * of &struct base_jd_atom. + * + * It must not occupy the same CPU cache line(s) as any neighboring data. + * This is to avoid cases where access to pages containing the structure + * is shared between cached and un-cached memory regions, which would + * cause memory corruption. + */ + +struct base_dump_cpu_gpu_counters { + __u64 system_time; + __u64 cycle_counter; + __u64 sec; + __u32 usec; + __u8 padding[36]; +}; + +/** + * struct mali_base_gpu_core_props - GPU core props info + * + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to align to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + * @num_exec_engines: The number of execution engines. Only valid for tGOX + * (Bifrost) GPUs, where GPU_HAS_REG_CORE_FEATURES is defined. Otherwise, + * this is always 0. 
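+ *
+ * For instance (illustrative only), given a struct mali_base_gpu_core_props
+ * props returned by the kernel, support for the texture format whose
+ * TEXTURE_FEATURES bit index is fmt_bit can be tested as:
+ *
+ *   __u32 word = props.texture_features[fmt_bit / 32];
+ *   bool supported = (word >> (fmt_bit % 32)) & 1;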
+ */
+struct mali_base_gpu_core_props {
+        __u32 product_id;
+        __u16 version_status;
+        __u16 minor_revision;
+        __u16 major_revision;
+        __u16 padding;
+        __u32 gpu_freq_khz_max;
+        __u32 log2_program_counter_size;
+        __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS];
+        __u64 gpu_available_memory_size;
+        __u8 num_exec_engines;
+};
+
+#endif /* _UAPI_BASE_JM_KERNEL_H_ */
diff --git a/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h
new file mode 100644
index 00000000000..20d931adc9b
--- /dev/null
+++ b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h
@@ -0,0 +1,231 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#ifndef _UAPI_KBASE_JM_IOCTL_H_
+#define _UAPI_KBASE_JM_IOCTL_H_
+
+#include <asm-generic/ioctl.h>
+#include <linux/types.h>
+
+/*
+ * 11.1:
+ * - Add BASE_MEM_TILER_ALIGN_TOP under base_mem_alloc_flags
+ * 11.2:
+ * - KBASE_MEM_QUERY_FLAGS can return KBASE_REG_PF_GROW and KBASE_REG_PROTECTED,
+ *   which some user-side clients prior to 11.2 might fault if they received
+ *   them
+ * 11.3:
+ * - New ioctls KBASE_IOCTL_STICKY_RESOURCE_MAP and
+ *   KBASE_IOCTL_STICKY_RESOURCE_UNMAP
+ * 11.4:
+ * - New ioctl KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET
+ * 11.5:
+ * - New ioctl: KBASE_IOCTL_MEM_JIT_INIT (old ioctl renamed to _OLD)
+ * 11.6:
+ * - Added flags field to base_jit_alloc_info structure, which can be used to
+ *   specify pseudo chunked tiler alignment for JIT allocations.
+ * 11.7:
+ * - Removed UMP support
+ * 11.8:
+ * - Added BASE_MEM_UNCACHED_GPU under base_mem_alloc_flags
+ * 11.9:
+ * - Added BASE_MEM_PERMANENT_KERNEL_MAPPING and BASE_MEM_FLAGS_KERNEL_ONLY
+ *   under base_mem_alloc_flags
+ * 11.10:
+ * - Enabled the use of nr_extres field of base_jd_atom_v2 structure for
+ *   JIT_ALLOC and JIT_FREE type softjobs to enable multiple JIT allocations
+ *   with one softjob.
+ * 11.11:
+ * - Added BASE_MEM_GPU_VA_SAME_4GB_PAGE under base_mem_alloc_flags
+ * 11.12:
+ * - Removed ioctl: KBASE_IOCTL_GET_PROFILING_CONTROLS
+ * 11.13:
+ * - New ioctl: KBASE_IOCTL_MEM_EXEC_INIT
+ * 11.14:
+ * - Add BASE_MEM_GROUP_ID_MASK, base_mem_group_id_get, base_mem_group_id_set
+ *   under base_mem_alloc_flags
+ * 11.15:
+ * - Added BASEP_CONTEXT_MMU_GROUP_ID_MASK under base_context_create_flags.
+ * - Require KBASE_IOCTL_SET_FLAGS before BASE_MEM_MAP_TRACKING_HANDLE can be
+ *   passed to mmap().
+ * 11.16:
+ * - Extended ioctl KBASE_IOCTL_MEM_SYNC to accept imported dma-buf.
+ * - Modified (backwards compatible) ioctl KBASE_IOCTL_MEM_IMPORT behavior for
+ *   dma-buf. Now, buffers are mapped on GPU when first imported, no longer
+ *   requiring external resource or sticky resource tracking. UNLESS,
+ *   CONFIG_MALI_DMA_BUF_MAP_ON_DEMAND is enabled.
+ * 11.17:
+ * - Added BASE_JD_REQ_JOB_SLOT.
+ * - Reused padding field in base_jd_atom_v2 to pass job slot number. + * - New ioctl: KBASE_IOCTL_GET_CPU_GPU_TIMEINFO + * 11.18: + * - Added BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP under base_mem_alloc_flags + * 11.19: + * - Extended base_jd_atom_v2 to allow a renderpass ID to be specified. + * 11.20: + * - Added new phys_pages member to kbase_ioctl_mem_jit_init for + * KBASE_IOCTL_MEM_JIT_INIT, previous variants of this renamed to use _10_2 + * (replacing '_OLD') and _11_5 suffixes + * - Replaced compat_core_req (deprecated in 10.3) with jit_id[2] in + * base_jd_atom_v2. It must currently be initialized to zero. + * - Added heap_info_gpu_addr to base_jit_alloc_info, and + * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE allowable in base_jit_alloc_info's + * flags member. Previous variants of this structure are kept and given _10_2 + * and _11_5 suffixes. + * - The above changes are checked for safe values in usual builds + * 11.21: + * - v2.0 of mali_trace debugfs file, which now versions the file separately + * 11.22: + * - Added base_jd_atom (v3), which is seq_nr + base_jd_atom_v2. + * KBASE_IOCTL_JOB_SUBMIT supports both in parallel. + * 11.23: + * - Modified KBASE_IOCTL_MEM_COMMIT behavior to reject requests to modify + * the physical memory backing of JIT allocations. This was not supposed + * to be a valid use case, but it was allowed by the previous implementation. + * 11.24: + * - Added a sysfs file 'serialize_jobs' inside a new sub-directory + * 'scheduling'. + * 11.25: + * - Enabled JIT pressure limit in base/kbase by default + * 11.26 + * - Added kinstr_jm API + * 11.27 + * - Backwards compatible extension to HWC ioctl. + * 11.28: + * - Added kernel side cache ops needed hint + * 11.29: + * - Reserve ioctl 52 + * 11.30: + * - Add a new priority level BASE_JD_PRIO_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 11.31: + * - Added BASE_JD_REQ_LIMITED_CORE_MASK. + * - Added ioctl 55: set_limited_core_count. + * 11.32: + * - Added new HW performance counters interface to all GPUs. + * 11.33: + * - Removed Kernel legacy HWC interface + * 11.34: + * - First release of new HW performance counters interface. 
+ * 11.35: + * - Dummy model (no mali) backend will now clear HWC values after each sample + */ +#define BASE_UK_VERSION_MAJOR 11 +#define BASE_UK_VERSION_MINOR 35 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + + +/** + * struct kbase_ioctl_job_submit - Submit jobs/atoms to the kernel + * + * @addr: Memory address of an array of struct base_jd_atom_v2 or v3 + * @nr_atoms: Number of entries in the array + * @stride: sizeof(struct base_jd_atom_v2) or sizeof(struct base_jd_atom) + */ +struct kbase_ioctl_job_submit { + __u64 addr; + __u32 nr_atoms; + __u32 stride; +}; + +#define KBASE_IOCTL_JOB_SUBMIT \ + _IOW(KBASE_IOCTL_TYPE, 2, struct kbase_ioctl_job_submit) + +#define KBASE_IOCTL_POST_TERM \ + _IO(KBASE_IOCTL_TYPE, 4) + +/** + * struct kbase_ioctl_soft_event_update - Update the status of a soft-event + * @event: GPU address of the event which has been updated + * @new_status: The new status to set + * @flags: Flags for future expansion + */ +struct kbase_ioctl_soft_event_update { + __u64 event; + __u32 new_status; + __u32 flags; +}; + +#define KBASE_IOCTL_SOFT_EVENT_UPDATE \ + _IOW(KBASE_IOCTL_TYPE, 28, struct kbase_ioctl_soft_event_update) + +/** + * struct kbase_kinstr_jm_fd_out - Explains the compatibility information for + * the `struct kbase_kinstr_jm_atom_state_change` structure returned from the + * kernel + * + * @size: The size of the `struct kbase_kinstr_jm_atom_state_change` + * @version: Represents a breaking change in the + * `struct kbase_kinstr_jm_atom_state_change` + * @padding: Explicit padding to get the structure up to 64bits. See + * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst + * + * The `struct kbase_kinstr_jm_atom_state_change` may have extra members at the + * end of the structure that older user space might not understand. If the + * `version` is the same, the structure is still compatible with newer kernels. + * The `size` can be used to cast the opaque memory returned from the kernel. + */ +struct kbase_kinstr_jm_fd_out { + __u16 size; + __u8 version; + __u8 padding[5]; +}; + +/** + * struct kbase_kinstr_jm_fd_in - Options when creating the file descriptor + * + * @count: Number of atom states that can be stored in the kernel circular + * buffer. Must be a power of two + * @padding: Explicit padding to get the structure up to 64bits. See + * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst + */ +struct kbase_kinstr_jm_fd_in { + __u16 count; + __u8 padding[6]; +}; + +union kbase_kinstr_jm_fd { + struct kbase_kinstr_jm_fd_in in; + struct kbase_kinstr_jm_fd_out out; +}; + +#define KBASE_IOCTL_KINSTR_JM_FD \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_kinstr_jm_fd) + + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#endif /* _UAPI_KBASE_JM_IOCTL_H_ */ diff --git a/src/panfrost/base/include/mali_base_common_kernel.h b/src/panfrost/base/include/mali_base_common_kernel.h new file mode 100644 index 00000000000..f8378146ace --- /dev/null +++ b/src/panfrost/base/include/mali_base_common_kernel.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. 
+ * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_COMMON_KERNEL_H_ +#define _UAPI_BASE_COMMON_KERNEL_H_ + +#include + +struct base_mem_handle { + struct { + __u64 handle; + } basep; +}; + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +/* Memory allocation, access/hint flags & mask. + * + * See base_mem_alloc_flags. + */ + +/* IN */ +/* Read access CPU side + */ +#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) + +/* Write access CPU side + */ +#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) + +/* Read access GPU side + */ +#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) + +/* Write access GPU side + */ +#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) + +/* Execute allowed on the GPU side + */ +#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) + +/* Will be permanently mapped in kernel space. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) + +/* The allocation will completely reside within the same 4GB chunk in the GPU + * virtual space. + * Since this flag is primarily required only for the TLS memory which will + * not be used to contain executable code and also not used for Tiler heap, + * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. + */ +#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) + +/* Userspace is not allowed to free this memory. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) + +/* Grow backing store on GPU Page Fault + */ +#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) + +/* Page coherence Outer shareable, if available + */ +#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) + +/* Page coherence Inner shareable + */ +#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) + +/* IN/OUT */ +/* Should be cached on the CPU, returned if actually cached + */ +#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) + +/* IN/OUT */ +/* Must have same VA on both the GPU and the CPU + */ +#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) + +/* OUT */ +/* Must call mmap to acquire a GPU address for the allocation + */ +#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) + +/* IN */ +/* Page coherence Outer shareable, required. 
+ */ +#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) + +/* Protected memory + */ +#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) + +/* Not needed physical memory + */ +#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) + +/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the + * addresses to be the same + */ +#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) + +/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu + * mode. Some components within the GPU might only be able to access memory + * that is GPU cacheable. Refer to the specific GPU implementation for more + * details. The 3 shareability flags will be ignored for GPU uncached memory. + * If used while importing USER_BUFFER type memory, then the import will fail + * if the memory is not aligned to GPU and CPU cache line width. + */ +#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) + +/* + * Bits [22:25] for group_id (0~15). + * + * base_mem_group_id_set() should be used to pack a memory group ID into a + * base_mem_alloc_flags value instead of accessing the bits directly. + * base_mem_group_id_get() should be used to extract the memory group ID from + * a base_mem_alloc_flags value. + */ +#define BASEP_MEM_GROUP_ID_SHIFT 22 +#define BASE_MEM_GROUP_ID_MASK ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) + +/* Must do CPU cache maintenance when imported memory is mapped/unmapped + * on GPU. Currently applicable to dma-buf type only. + */ +#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) + +/* OUT */ +/* Kernel side cache sync ops required */ +#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) + +/* Number of bits used as flags for base memory management + * + * Must be kept in sync with the base_mem_alloc_flags flags + */ +#define BASE_MEM_FLAGS_NR_BITS 30 + +/* A mask for all output bits, excluding IN/OUT bits. + */ +#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP + +/* A mask for all input bits, including IN/OUT bits. + */ +#define BASE_MEM_FLAGS_INPUT_MASK \ + (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) + +/* Special base mem handles. + */ +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) +/* reserved handles ..-47< for future special handles */ +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_FIRST_FREE_ADDRESS ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) + +/* Flags to pass to ::base_context_init. + * Flags can be ORed together to enable multiple things. + * + * These share the same space as BASEP_CONTEXT_FLAG_*, and so must + * not collide with them. + */ +typedef __u32 base_context_create_flags; + +/* Flags for base context */ + +/* No flags set */ +#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) + +/* Base context is embedded in a cctx object (flag used for CINSTR + * software counter macros) + */ +#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) + +/* Base context is a 'System Monitor' context for Hardware counters. + * + * One important side effect of this is that job submission is disabled. 
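To make the bit layout above concrete, here is a small sketch (not part of the patch; the helper names are made up) that packs a group ID the way base_mem_group_id_set() is described as doing and combines it with the access flags. The base_mem_alloc_flags typedef itself is introduced a little further down, in mali_base_kernel.h.

/* Hypothetical helper mirroring the documented packing rule: shift the group
 * ID into bits 22:25 and mask it. */
static inline base_mem_alloc_flags
example_mem_group_id_set(base_mem_alloc_flags flags, unsigned int group_id)
{
        return flags |
               (((base_mem_alloc_flags)group_id << BASEP_MEM_GROUP_ID_SHIFT) &
                BASE_MEM_GROUP_ID_MASK);
}

static base_mem_alloc_flags example_shared_rw_flags(void)
{
        /* CPU- and GPU-readable/writable, CPU-cached, shared-VA allocation
         * placed in memory group 1 (the group number is arbitrary). */
        return example_mem_group_id_set(BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR |
                                        BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |
                                        BASE_MEM_CACHED_CPU | BASE_MEM_SAME_VA, 1);
}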
+ */ +#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED ((base_context_create_flags)1 << 1) + +/* Bit-shift used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) + +/* Bitmask used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ + ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* Bitpattern describing the base_context_create_flags that can be + * passed to the kernel + */ +#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ + (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | BASEP_CONTEXT_MMU_GROUP_ID_MASK) + +/* Flags for base tracepoint + */ + +/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, + * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) + */ +#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) + +/* Indicate that job dumping is enabled. This could affect certain timers + * to account for the performance impact. + */ +#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) + +#endif /* _UAPI_BASE_COMMON_KERNEL_H_ */ diff --git a/src/panfrost/base/include/mali_base_kernel.h b/src/panfrost/base/include/mali_base_kernel.h new file mode 100644 index 00000000000..3d826c720b2 --- /dev/null +++ b/src/panfrost/base/include/mali_base_kernel.h @@ -0,0 +1,700 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * Base structures shared with the kernel. + */ + +#ifndef _UAPI_BASE_KERNEL_H_ +#define _UAPI_BASE_KERNEL_H_ + +#include +#include "mali_base_common_kernel.h" + +#define BASE_MAX_COHERENT_GROUPS 16 + +#if defined(PAGE_MASK) && defined(PAGE_SHIFT) +#define LOCAL_PAGE_SHIFT PAGE_SHIFT +#define LOCAL_PAGE_LSB ~PAGE_MASK +#else +#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 +#endif + +#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) +#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) +#else +#error Failed to find page size +#endif +#endif + +/* Physical memory group ID for normal usage. + */ +#define BASE_MEM_GROUP_DEFAULT (0) + +/* Number of physical memory groups. + */ +#define BASE_MEM_GROUP_COUNT (16) + +/** + * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. + * + * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator + * in order to determine the best cache policy. Some combinations are + * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), + * which defines a write-only region on the CPU side, which is + * heavily read by the CPU... + * Other flags are only meaningful to a particular allocator. 
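Most kbase ioctls declared later in this series take sizes as page counts (va_pages, commit_pages and so on), so byte sizes get rounded up with the page definitions just above. A minimal sketch with a made-up helper name, not part of the patch; in both branches LOCAL_PAGE_LSB works out to page_size - 1, so the rounding is the same either way.

/* Round a byte count up to whole pages using the LOCAL_PAGE_* definitions. */
static inline __u64 bytes_to_pages(__u64 bytes)
{
        return (bytes + LOCAL_PAGE_LSB) >> LOCAL_PAGE_SHIFT;
}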
+ * More flags can be added to this list, as long as they don't clash + * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). + */ +typedef __u32 base_mem_alloc_flags; + +/* A mask for all the flags which are modifiable via the base_mem_set_flags + * interface. + */ +#define BASE_MEM_FLAGS_MODIFIABLE \ + (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ + BASE_MEM_COHERENT_LOCAL) + +/* A mask of all the flags that can be returned via the base_mem_get_flags() + * interface. + */ +#define BASE_MEM_FLAGS_QUERYABLE \ + (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ + BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ + BASEP_MEM_FLAGS_KERNEL_ONLY)) + +/** + * enum base_mem_import_type - Memory types supported by @a base_mem_import + * + * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type + * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) + * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a + * base_mem_import_user_buffer + * + * Each type defines what the supported handle type is. + * + * If any new type is added here ARM must be contacted + * to allocate a numeric value for it. + * Do not just add a new type without synchronizing with ARM + * as future releases from ARM might include other new types + * which could clash with your custom types. + */ +enum base_mem_import_type { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + /* + * Import type with value 1 is deprecated. + */ + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 +}; + +/** + * struct base_mem_import_user_buffer - Handle of an imported user buffer + * + * @ptr: address of imported user buffer + * @length: length of imported user buffer in bytes + * + * This structure is used to represent a handle of an imported user buffer. + */ + +struct base_mem_import_user_buffer { + __u64 ptr; + __u64 length; +}; + +/* Mask to detect 4GB boundary alignment */ +#define BASE_MEM_MASK_4GB 0xfffff000UL +/* Mask to detect 4GB boundary (in page units) alignment */ +#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) + +/* Limit on the 'extension' parameter for an allocation with the + * BASE_MEM_TILER_ALIGN_TOP flag set + * + * This is the same as the maximum limit for a Buffer Descriptor's chunk size + */ +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ + (21u - (LOCAL_PAGE_SHIFT)) +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ + (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) + +/* Bit mask of cookies used for memory allocation setup */ +#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ + +/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ +#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ + +/* + * struct base_fence - Cross-device synchronisation fence. + * + * A fence is used to signal when the GPU has finished accessing a resource that + * may be shared with other devices, and also to delay work done asynchronously + * by the GPU until other devices have finished accessing a shared resource. + */ +struct base_fence { + struct { + int fd; + int stream_fd; + } basep; +}; + +/** + * struct base_mem_aliasing_info - Memory aliasing info + * + * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * @offset: Offset within the handle to start aliasing from, in pages. + * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. + * @length: Length to alias, in pages. 
For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * specifies the number of times the special page is needed. + * + * Describes a memory handle to be aliased. + * A subset of the handle can be chosen for aliasing, given an offset and a + * length. + * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a + * region where a special page is mapped with a write-alloc cache setup, + * typically used when the write result of the GPU isn't needed, but the GPU + * must write anyway. + * + * Offset and length are specified in pages. + * Offset must be within the size of the handle. + * Offset+length must not overrun the size of the handle. + */ +struct base_mem_aliasing_info { + struct base_mem_handle handle; + __u64 offset; + __u64 length; +}; + +/* Maximum percentage of just-in-time memory allocation trimming to perform + * on free. + */ +#define BASE_JIT_MAX_TRIM_LEVEL (100) + +/* Maximum number of concurrent just-in-time memory allocations. + */ +#define BASE_JIT_ALLOC_COUNT (255) + +/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 + * + * jit_version is 1 + * + * Due to the lack of padding specified, user clients between 32 and 64-bit + * may have assumed a different size of the struct + * + * An array of structures was not supported + */ +struct base_jit_alloc_info_10_2 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; +}; + +/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up + * to 11.19 + * + * This structure had a number of modifications during and after kernel driver + * version 11.5, but remains size-compatible throughout its version history, and + * with earlier variants compatible with future variants by requiring + * zero-initialization to the unused space in the structure. + * + * jit_version is 2 + * + * Kernel driver version history: + * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes + * must be zero. Kbase minor version was not incremented, so some + * versions of 11.5 do not have this change. + * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase + * minor version not incremented) + * 11.6: Added 'flags', replacing 1 padding byte + * 11.10: Arrays of this structure are supported + */ +struct base_jit_alloc_info_11_5 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; +}; + +/** + * struct base_jit_alloc_info - Structure which describes a JIT allocation + * request. + * @gpu_alloc_addr: The GPU virtual address to write the JIT + * allocated GPU virtual address to. + * @va_pages: The minimum number of virtual pages required. + * @commit_pages: The minimum number of physical pages which + * should back the allocation. + * @extension: Granularity of physical pages to grow the + * allocation by during a fault. + * @id: Unique ID provided by the caller, this is used + * to pair allocation and free requests. + * Zero is not a valid value. + * @bin_id: The JIT allocation bin, used in conjunction with + * @max_allocations to limit the number of each + * type of JIT allocation. + * @max_allocations: The maximum number of allocations allowed within + * the bin specified by @bin_id. Should be the same + * for all allocations within the same bin. 
+ * @flags: flags specifying the special requirements for + * the JIT allocation, see + * %BASE_JIT_ALLOC_VALID_FLAGS + * @padding: Expansion space - should be initialised to zero + * @usage_id: A hint about which allocation should be reused. + * The kernel should attempt to use a previous + * allocation with the same usage_id + * @heap_info_gpu_addr: Pointer to an object in GPU memory describing + * the actual usage of the region. + * + * jit_version is 3. + * + * When modifications are made to this structure, it is still compatible with + * jit_version 3 when: a) the size is unchanged, and b) new members only + * replace the padding bytes. + * + * Previous jit_version history: + * jit_version == 1, refer to &base_jit_alloc_info_10_2 + * jit_version == 2, refer to &base_jit_alloc_info_11_5 + * + * Kbase version history: + * 11.20: added @heap_info_gpu_addr + */ +struct base_jit_alloc_info { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; + __u64 heap_info_gpu_addr; +}; + +enum base_external_resource_access { + BASE_EXT_RES_ACCESS_SHARED, + BASE_EXT_RES_ACCESS_EXCLUSIVE +}; + +struct base_external_resource { + __u64 ext_resource; +}; + +/** + * BASE_EXT_RES_COUNT_MAX - The maximum number of external resources + * which can be mapped/unmapped in a single request. + */ +#define BASE_EXT_RES_COUNT_MAX 10 + +/** + * struct base_external_resource_list - Structure which describes a list of + * external resources. + * @count: The number of resources. + * @ext_res: Array of external resources which is + * sized at allocation time. + */ +struct base_external_resource_list { + __u64 count; + struct base_external_resource ext_res[1]; +}; + +struct base_jd_debug_copy_buffer { + __u64 address; + __u64 size; + struct base_external_resource extres; +}; + +#define GPU_MAX_JOB_SLOTS 16 + +/** + * DOC: User-side Base GPU Property Queries + * + * The User-side Base GPU Property Query interface encapsulates two + * sub-modules: + * + * - "Dynamic GPU Properties" + * - "Base Platform Config GPU Properties" + * + * Base only deals with properties that vary between different GPU + * implementations - the Dynamic GPU properties and the Platform Config + * properties. + * + * For properties that are constant for the GPU Architecture, refer to the + * GPU module. However, we will discuss their relevance here just to + * provide background information. + * + * About the GPU Properties in Base and GPU modules + * + * The compile-time properties (Platform Config, GPU Compile-time + * properties) are exposed as pre-processor macros. + * + * Complementing the compile-time properties are the Dynamic GPU + * Properties, which act as a conduit for the GPU Configuration + * Discovery. + * + * In general, the dynamic properties are present to verify that the platform + * has been configured correctly with the right set of Platform Config + * Compile-time Properties. + * + * As a consistent guide across the entire DDK, the choice for dynamic or + * compile-time should consider the following, in order: + * 1. Can the code be written so that it doesn't need to know the + * implementation limits at all? + * 2. If you need the limits, get the information from the Dynamic Property + * lookup. This should be done once as you fetch the context, and then cached + * as part of the context data structure, so it's cheap to access. + * 3. 
If there's a clear and arguable inefficiency in using Dynamic Properties, + * then use a Compile-Time Property (Platform Config, or GPU Compile-time + * property). Examples of where this might be sensible follow: + * - Part of a critical inner-loop + * - Frequent re-use throughout the driver, causing significant extra load + * instructions or control flow that would be worthwhile optimizing out. + * + * We cannot provide an exhaustive set of examples, neither can we provide a + * rule for every possible situation. Use common sense, and think about: what + * the rest of the driver will be doing; how the compiler might represent the + * value if it is a compile-time constant; whether an OEM shipping multiple + * devices would benefit much more from a single DDK binary, instead of + * insignificant micro-optimizations. + * + * Dynamic GPU Properties + * + * Dynamic GPU properties are presented in two sets: + * 1. the commonly used properties in @ref base_gpu_props, which have been + * unpacked from GPU register bitfields. + * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props + * (also a member of base_gpu_props). All of these are presented in + * the packed form, as presented by the GPU registers themselves. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + * The properties returned extend the GPU Configuration Discovery + * registers. For example, GPU clock speed is not specified in the GPU + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. + * + * The GPU properties are obtained by a call to + * base_get_gpu_props(). This simply returns a pointer to a const + * base_gpu_props structure. It is constant for the life of a base + * context. Multiple calls to base_get_gpu_props() to a base context + * return the same pointer to a constant structure. This avoids cache pollution + * of the common data. + * + * This pointer must not be freed, because it does not point to the start of a + * region allocated by the memory allocator; instead, just close the @ref + * base_context. + * + * + * Kernel Operation + * + * During Base Context Create time, user-side makes a single kernel call: + * - A call to fill user memory with GPU information structures + * + * The kernel-side will fill the provided the entire processed base_gpu_props + * structure, because this information is required in both + * user and kernel side; it does not make sense to decode it twice. + * + * Coherency groups must be derived from the bitmasks, but this can be done + * kernel side, and just once at kernel startup: Coherency groups must already + * be known kernel-side, to support chains that specify a 'Only Coherent Group' + * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. + * + * Coherency Group calculation + * + * Creation of the coherent group data is done at device-driver startup, and so + * is one-time. This will most likely involve a loop with CLZ, shifting, and + * bit clearing on the L2_PRESENT mask, depending on whether the + * system is L2 Coherent. The number of shader cores is done by a + * population count, since faulty cores may be disabled during production, + * producing a non-contiguous mask. 
+ * + * The memory requirements for this algorithm can be determined either by a __u64 + * population count on the L2_PRESENT mask (a LUT helper already is + * required for the above), or simple assumption that there can be no more than + * 16 coherent groups, since core groups are typically 4 cores. + */ + +/* + * More information is possible - but associativity and bus width are not + * required by upper-level apis. + */ +struct mali_base_gpu_l2_cache_props { + __u8 log2_line_size; + __u8 log2_cache_size; + __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + __u8 padding[5]; +}; + +struct mali_base_gpu_tiler_props { + __u32 bin_size_bytes; /* Max is 4*2^15 */ + __u32 max_active_levels; /* Max is 2^15 */ +}; + +/** + * struct mali_base_gpu_thread_props - GPU threading system details. + * @max_threads: Max. number of threads per core + * @max_workgroup_size: Max. number of threads per workgroup + * @max_barrier_size: Max. number of threads that can synchronize on a + * simple barrier + * @max_registers: Total size [1..65535] of the register file available + * per core. + * @max_task_queue: Max. tasks [1..255] which may be sent to a core + * before it becomes blocked. + * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split + * field. + * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, + * 3 = SW Model/Emulation + * @padding: padding to align to 8-byte + * @tls_alloc: Number of threads per core that TLS must be + * allocated for + */ +struct mali_base_gpu_thread_props { + __u32 max_threads; + __u32 max_workgroup_size; + __u32 max_barrier_size; + __u16 max_registers; + __u8 max_task_queue; + __u8 max_thread_group_split; + __u8 impl_tech; + __u8 padding[3]; + __u32 tls_alloc; +}; + +/** + * struct mali_base_gpu_coherent_group - descriptor for a coherent group + * @core_mask: Core restriction mask required for the group + * @num_cores: Number of cores in the group + * @padding: padding to align to 8-byte + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_base_gpu_coherent_group { + __u64 core_mask; + __u16 num_cores; + __u16 padding[3]; +}; + +/** + * struct mali_base_gpu_coherent_group_info - Coherency group information + * @num_groups: Number of coherent groups in the GPU. + * @num_core_groups: Number of core groups (coherent or not) in the GPU. + * Equivalent to the number of L2 Caches. + * The GPU Counter dumping writes 2048 bytes per core group, + * regardless of whether the core groups are coherent or not. + * Hence this member is needed to calculate how much memory + * is required for dumping. + * @note Do not use it to work out how many valid elements + * are in the group[] member. Use num_groups instead. + * @coherency: Coherency features of the memory, accessed by gpu_mem_features + * methods + * @padding: padding to align to 8-byte + * @group: Descriptors of coherent groups + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. 
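As a concrete illustration of the population-count remark above (not part of the patch; __builtin_popcountll is a GCC/Clang builtin and the helper name is invented), user space can cross-check a coherent group descriptor against its core mask:

/* num_cores is documented as a cached popcount of core_mask; the mask can be
 * non-contiguous because faulty cores may be fused off in production. */
static inline int
coherent_group_is_consistent(const struct mali_base_gpu_coherent_group *g)
{
        return __builtin_popcountll(g->core_mask) == g->num_cores;
}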
+ */ +struct mali_base_gpu_coherent_group_info { + __u32 num_groups; + __u32 num_core_groups; + __u32 coherency; + __u32 padding; + struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; +}; + +#if MALI_USE_CSF +#include "csf/mali_base_csf_kernel.h" +#else +#include "jm/mali_base_jm_kernel.h" +#endif + +/** + * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware + * Configuration Discovery registers. + * @shader_present: Shader core present bitmap + * @tiler_present: Tiler core present bitmap + * @l2_present: Level 2 cache present bitmap + * @stack_present: Core stack present bitmap + * @l2_features: L2 features + * @core_features: Core features + * @mem_features: Mem features + * @mmu_features: Mmu features + * @as_present: Bitmap of address spaces present + * @js_present: Job slots present + * @js_features: Array of job slot features. + * @tiler_features: Tiler features + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU + * @gpu_id: GPU and revision identifier + * @thread_max_threads: Maximum number of threads per core + * @thread_max_workgroup_size: Maximum number of threads per workgroup + * @thread_max_barrier_size: Maximum number of threads per barrier + * @thread_features: Thread features + * @coherency_mode: Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register + * @thread_tls_alloc: Number of threads per core that TLS must be allocated for + * @gpu_features: GPU features + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct gpu_raw_gpu_props { + __u64 shader_present; + __u64 tiler_present; + __u64 l2_present; + __u64 stack_present; + __u32 l2_features; + __u32 core_features; + __u32 mem_features; + __u32 mmu_features; + + __u32 as_present; + + __u32 js_present; + __u32 js_features[GPU_MAX_JOB_SLOTS]; + __u32 tiler_features; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + __u32 gpu_id; + + __u32 thread_max_threads; + __u32 thread_max_workgroup_size; + __u32 thread_max_barrier_size; + __u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + __u32 coherency_mode; + + __u32 thread_tls_alloc; + __u64 gpu_features; +}; + +/** + * struct base_gpu_props - Return structure for base_get_gpu_props(). + * @core_props: Core props. + * @l2_props: L2 props. + * @unused_1: Keep for backwards compatibility. + * @tiler_props: Tiler props. + * @thread_props: Thread props. + * @raw_props: This member is large, likely to be 128 bytes. + * @coherency_info: This must be last member of the structure. + * + * NOTE: the raw_props member in this data structure contains the register + * values from which the value of the other members are derived. The derived + * members exist to allow for efficient access and/or shielding the details + * of the layout of the registers. 
+ */ +struct base_gpu_props { + struct mali_base_gpu_core_props core_props; + struct mali_base_gpu_l2_cache_props l2_props; + __u64 unused_1; + struct mali_base_gpu_tiler_props tiler_props; + struct mali_base_gpu_thread_props thread_props; + struct gpu_raw_gpu_props raw_props; + struct mali_base_gpu_coherent_group_info coherency_info; +}; + +#define BASE_MEM_GROUP_ID_GET(flags) \ + ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) + +#define BASE_MEM_GROUP_ID_SET(id) \ + (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? \ + BASE_MEM_GROUP_DEFAULT : \ + id) \ + << BASEP_MEM_GROUP_ID_SHIFT) & \ + BASE_MEM_GROUP_ID_MASK) + +#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ + (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ + ((base_context_create_flags)(group_id) \ + << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) + +#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ + ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ + BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* + * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These + * flags are also used, where applicable, for specifying which fields + * are valid following the request operation. + */ + +/* For monotonic (counter) timefield */ +#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) +/* For system wide timestamp */ +#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) +/* For GPU cycle counter */ +#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) +/* Specify kernel GPU register timestamp */ +#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) +/* Specify userspace cntvct_el0 timestamp source */ +#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) + +#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ + BASE_TIMEINFO_MONOTONIC_FLAG | \ + BASE_TIMEINFO_TIMESTAMP_FLAG | \ + BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ + BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ + BASE_TIMEINFO_USER_SOURCE_FLAG) + +/* Maximum number of source allocations allowed to create an alias allocation. + * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array + * layers, since each cube map in the array will have 6 faces. + */ +#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) + +#endif /* _UAPI_BASE_KERNEL_H_ */ diff --git a/src/panfrost/base/include/mali_kbase_gpuprops.h b/src/panfrost/base/include/mali_kbase_gpuprops.h new file mode 100644 index 00000000000..b250feca022 --- /dev/null +++ b/src/panfrost/base/include/mali_kbase_gpuprops.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
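The context-creation flag macros above pair with KBASE_IOCTL_SET_FLAGS, which is declared further down in mali_kbase_ioctl.h; a hedged sketch of how they combine, not part of the patch. fd is assumed to be an already-open kbase device and the group ID is an arbitrary example.

#include <sys/ioctl.h>

/* Build context-creation flags with an MMU memory group packed in, reject
 * anything outside the documented kernel-passable set, then apply them. */
static int kbase_set_context_flags(int fd, unsigned int mmu_group_id)
{
        base_context_create_flags cflags =
                BASE_CONTEXT_CREATE_FLAG_NONE |
                BASE_CONTEXT_MMU_GROUP_ID_SET(mmu_group_id);

        if (cflags & ~BASEP_CONTEXT_CREATE_KERNEL_FLAGS)
                return -1;

        struct kbase_ioctl_set_flags args = { .create_flags = cflags };
        return ioctl(fd, KBASE_IOCTL_SET_FLAGS, &args);
}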
+ * + */ + +#ifndef _UAPI_KBASE_GPUPROP_H_ +#define _UAPI_KBASE_GPUPROP_H_ + +/********************************** + * Definitions for GPU properties * + **********************************/ +#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) +#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) +#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) +#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) + +#define KBASE_GPUPROP_PRODUCT_ID 1 +#define KBASE_GPUPROP_VERSION_STATUS 2 +#define KBASE_GPUPROP_MINOR_REVISION 3 +#define KBASE_GPUPROP_MAJOR_REVISION 4 +/* 5 previously used for GPU speed */ +#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 +/* 7 previously used for minimum GPU speed */ +#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 +#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 +#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 +#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 +#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 + +#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 +#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 +#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 + +#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 +#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 + +#define KBASE_GPUPROP_MAX_THREADS 18 +#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 +#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 +#define KBASE_GPUPROP_MAX_REGISTERS 21 +#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 +#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 +#define KBASE_GPUPROP_IMPL_TECH 24 + +#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 +#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 +#define KBASE_GPUPROP_RAW_L2_PRESENT 27 +#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 +#define KBASE_GPUPROP_RAW_L2_FEATURES 29 +#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 +#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 +#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 +#define KBASE_GPUPROP_RAW_AS_PRESENT 33 +#define KBASE_GPUPROP_RAW_JS_PRESENT 34 +#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 +#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 +#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 +#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 +#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 +#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 +#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 +#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 +#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 +#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 +#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 +#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 +#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 +#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 +#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 +#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 +#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 +#define KBASE_GPUPROP_RAW_GPU_ID 55 +#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 +#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 +#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 +#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 +#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 + +#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 +#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 +#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 +#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 +#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 +#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 +#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 +#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 +#define KBASE_GPUPROP_COHERENCY_GROUP_5 69 +#define KBASE_GPUPROP_COHERENCY_GROUP_6 70 +#define KBASE_GPUPROP_COHERENCY_GROUP_7 71 +#define 
KBASE_GPUPROP_COHERENCY_GROUP_8 72 +#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 +#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 +#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 +#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 +#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 +#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 +#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 + +#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 + +#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 + +#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 +#define KBASE_GPUPROP_TLS_ALLOC 84 +#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 + +#endif diff --git a/src/panfrost/base/include/mali_kbase_ioctl.h b/src/panfrost/base/include/mali_kbase_ioctl.h new file mode 100644 index 00000000000..96f606af5f8 --- /dev/null +++ b/src/panfrost/base/include/mali_kbase_ioctl.h @@ -0,0 +1,759 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_IOCTL_H_ +#define _UAPI_KBASE_IOCTL_H_ + +#ifdef __cpluscplus +extern "C" { +#endif + +#include +#include + +#if MALI_USE_CSF +#include "csf/mali_kbase_csf_ioctl.h" +#else +#include "jm/mali_kbase_jm_ioctl.h" +#endif /* MALI_USE_CSF */ + +#define KBASE_IOCTL_TYPE 0x80 + +/** + * struct kbase_ioctl_set_flags - Set kernel context creation flags + * + * @create_flags: Flags - see base_context_create_flags + */ +struct kbase_ioctl_set_flags { + __u32 create_flags; +}; + +#define KBASE_IOCTL_SET_FLAGS \ + _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) + +/** + * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel + * + * @buffer: Pointer to the buffer to store properties into + * @size: Size of the buffer + * @flags: Flags - must be zero for now + * + * The ioctl will return the number of bytes stored into @buffer or an error + * on failure (e.g. @size is too small). If @size is specified as 0 then no + * data will be written but the return value will be the number of bytes needed + * for all the properties. + * + * @flags may be used in the future to request a different format for the + * buffer. With @flags == 0 the following format is used. + * + * The buffer will be filled with pairs of values, a __u32 key identifying the + * property followed by the value. The size of the value is identified using + * the bottom bits of the key. The value then immediately followed the key and + * is tightly packed (there is no padding). All keys and values are + * little-endian. 
+ * + * 00 = __u8 + * 01 = __u16 + * 10 = __u32 + * 11 = __u64 + */ +struct kbase_ioctl_get_gpuprops { + __u64 buffer; + __u32 size; + __u32 flags; +}; + +#define KBASE_IOCTL_GET_GPUPROPS \ + _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) + +/** + * union kbase_ioctl_mem_alloc - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @out: Output parameters + * @out.flags: Flags + * @out.gpu_va: The GPU virtual address which is allocated + */ +union kbase_ioctl_mem_alloc { + struct { + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u64 flags; + } in; + struct { + __u64 flags; + __u64 gpu_va; + } out; +}; + +#define KBASE_IOCTL_MEM_ALLOC \ + _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) + +/** + * struct kbase_ioctl_mem_query - Query properties of a GPU memory region + * @in: Input parameters + * @in.gpu_addr: A GPU address contained within the region + * @in.query: The type of query + * @out: Output parameters + * @out.value: The result of the query + * + * Use a %KBASE_MEM_QUERY_xxx flag as input for @query. + */ +union kbase_ioctl_mem_query { + struct { + __u64 gpu_addr; + __u64 query; + } in; + struct { + __u64 value; + } out; +}; + +#define KBASE_IOCTL_MEM_QUERY \ + _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) + +#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) +#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) +#define KBASE_MEM_QUERY_FLAGS ((__u64)3) + +/** + * struct kbase_ioctl_mem_free - Free a memory region + * @gpu_addr: Handle to the region to free + */ +struct kbase_ioctl_mem_free { + __u64 gpu_addr; +}; + +#define KBASE_IOCTL_MEM_FREE \ + _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @buffer_count: requested number of dumping buffers + * @fe_bm: counters selection bitmask (Front end) + * @shader_bm: counters selection bitmask (Shader) + * @tiler_bm: counters selection bitmask (Tiler) + * @mmu_l2_bm: counters selection bitmask (MMU_L2) + * + * A fd is returned from the ioctl if successful, or a negative value on error + */ +struct kbase_ioctl_hwcnt_reader_setup { + __u32 buffer_count; + __u32 fe_bm; + __u32 shader_bm; + __u32 tiler_bm; + __u32 mmu_l2_bm; +}; + +#define KBASE_IOCTL_HWCNT_READER_SETUP \ + _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) + +/** + * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. + * @data: Counter samples for the dummy model. + * @size: Size of the counter sample data. + * @padding: Padding. 
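The buffer layout documented above lends itself to a small decoder: ask for the required size with @size == 0, fetch the buffer, then walk the key/value pairs, using the two low bits of each key as the value width. A hedged sketch, not part of the patch; it assumes a little-endian host, that the property ID occupies the key bits above the two size bits, and that fd is an already-open kbase device. The callback is hypothetical.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

/* Query the buffer size, fetch the properties, then walk the packed pairs. */
static int kbase_dump_gpuprops(int fd, void (*cb)(__u32 prop, __u64 value))
{
        struct kbase_ioctl_get_gpuprops req = { .buffer = 0, .size = 0, .flags = 0 };
        int total = ioctl(fd, KBASE_IOCTL_GET_GPUPROPS, &req);
        if (total <= 0)
                return -1;

        __u8 *buf = malloc(total);
        if (!buf)
                return -1;

        req.buffer = (__u64)(uintptr_t)buf;
        req.size = total;
        if (ioctl(fd, KBASE_IOCTL_GET_GPUPROPS, &req) < 0) {
                free(buf);
                return -1;
        }

        for (int pos = 0; pos + (int)sizeof(__u32) <= total;) {
                __u32 key;
                __u64 value = 0;

                memcpy(&key, buf + pos, sizeof(key));
                pos += sizeof(key);

                int width = 1 << (key & 3);        /* 1, 2, 4 or 8 bytes */
                memcpy(&value, buf + pos, width);  /* little-endian host assumed */
                pos += width;

                cb(key >> 2, value);               /* assumed property-ID position */
        }

        free(buf);
        return 0;
}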
+ */ +struct kbase_ioctl_hwcnt_values { + __u64 data; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_HWCNT_SET \ + _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) + +/** + * struct kbase_ioctl_disjoint_query - Query the disjoint counter + * @counter: A counter of disjoint events in the kernel + */ +struct kbase_ioctl_disjoint_query { + __u32 counter; +}; + +#define KBASE_IOCTL_DISJOINT_QUERY \ + _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) + +/** + * struct kbase_ioctl_get_ddk_version - Query the kernel version + * @version_buffer: Buffer to receive the kernel version string + * @size: Size of the buffer + * @padding: Padding + * + * The ioctl will return the number of bytes written into version_buffer + * (which includes a NULL byte) or a negative error code + * + * The ioctl request code has to be _IOW because the data in ioctl struct is + * being copied to the kernel, even though the kernel then writes out the + * version info to the buffer specified in the ioctl. + */ +struct kbase_ioctl_get_ddk_version { + __u64 version_buffer; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_GET_DDK_VERSION \ + _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) + +/** + * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 10.2--11.4) + * @va_pages: Number of VA pages to reserve for JIT + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_10_2 { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) + +/** + * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 11.5--11.19) + * @va_pages: Number of VA pages to reserve for JIT + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_11_5 { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) + +/** + * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory + * allocator + * @va_pages: Number of GPU virtual address pages to reserve for just-in-time + * memory allocations + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * @phys_pages: Maximum number of physical pages to allocate just-in-time + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. 
+ */ +struct kbase_ioctl_mem_jit_init { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; + __u64 phys_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) + +/** + * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory + * + * @handle: GPU memory handle (GPU VA) + * @user_addr: The address where it is mapped in user space + * @size: The number of bytes to synchronise + * @type: The direction to synchronise: 0 is sync to memory (clean), + * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_mem_sync { + __u64 handle; + __u64 user_addr; + __u64 size; + __u8 type; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_MEM_SYNC \ + _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) + +/** + * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer + * + * @in: Input parameters + * @in.gpu_addr: The GPU address of the memory region + * @in.cpu_addr: The CPU address to locate + * @in.size: A size in bytes to validate is contained within the region + * @out: Output parameters + * @out.offset: The offset from the start of the memory region to @cpu_addr + */ +union kbase_ioctl_mem_find_cpu_offset { + struct { + __u64 gpu_addr; + __u64 cpu_addr; + __u64 size; + } in; + struct { + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) + +/** + * struct kbase_ioctl_get_context_id - Get the kernel context ID + * + * @id: The kernel context ID + */ +struct kbase_ioctl_get_context_id { + __u32 id; +}; + +#define KBASE_IOCTL_GET_CONTEXT_ID \ + _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) + +/** + * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd + * + * @flags: Flags + * + * The ioctl returns a file descriptor when successful + */ +struct kbase_ioctl_tlstream_acquire { + __u32 flags; +}; + +#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ + _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) + +#define KBASE_IOCTL_TLSTREAM_FLUSH \ + _IO(KBASE_IOCTL_TYPE, 19) + +/** + * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region + * + * @gpu_addr: The memory region to modify + * @pages: The number of physical pages that should be present + * + * The ioctl may return on the following error codes or 0 for success: + * -ENOMEM: Out of memory + * -EINVAL: Invalid arguments + */ +struct kbase_ioctl_mem_commit { + __u64 gpu_addr; + __u64 pages; +}; + +#define KBASE_IOCTL_MEM_COMMIT \ + _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) + +/** + * union kbase_ioctl_mem_alias - Create an alias of memory regions + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.stride: Bytes between start of each memory region + * @in.nents: The number of regions to pack together into the alias + * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_alias { + struct { + __u64 flags; + __u64 stride; + __u64 nents; + __u64 aliasing_info; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_ALIAS \ + _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) + +/** + * union 
kbase_ioctl_mem_import - Import memory for use by the GPU + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.phandle: Handle to the external memory + * @in.type: Type of external memory, see base_mem_import_type + * @in.padding: Amount of extra VA pages to append to the imported buffer + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_import { + struct { + __u64 flags; + __u64 phandle; + __u32 type; + __u32 padding; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_IMPORT \ + _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) + +/** + * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region + * @gpu_va: The GPU region to modify + * @flags: The new flags to set + * @mask: Mask of the flags to modify + */ +struct kbase_ioctl_mem_flags_change { + __u64 gpu_va; + __u64 flags; + __u64 mask; +}; + +#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ + _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) + +/** + * struct kbase_ioctl_stream_create - Create a synchronisation stream + * @name: A name to identify this stream. Must be NULL-terminated. + * + * Note that this is also called a "timeline", but is named stream to avoid + * confusion with other uses of the word. + * + * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. + * + * The ioctl returns a file descriptor. + */ +struct kbase_ioctl_stream_create { + char name[32]; +}; + +#define KBASE_IOCTL_STREAM_CREATE \ + _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) + +/** + * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence + * @fd: The file descriptor to validate + */ +struct kbase_ioctl_fence_validate { + int fd; +}; + +#define KBASE_IOCTL_FENCE_VALIDATE \ + _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) + +/** + * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel + * @buffer: Pointer to the information + * @len: Length + * @padding: Padding + * + * The data provided is accessible through a debugfs file + */ +struct kbase_ioctl_mem_profile_add { + __u64 buffer; + __u32 len; + __u32 padding; +}; + +#define KBASE_IOCTL_MEM_PROFILE_ADD \ + _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) + +/** + * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to map + */ +struct kbase_ioctl_sticky_resource_map { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ + _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) + +/** + * struct kbase_ioctl_sticky_resource_unmap - Unmap a resource mapped which was + * previously permanently mapped + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to unmap + */ +struct kbase_ioctl_sticky_resource_unmap { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ + _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) + +/** + * union kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of + * the GPU memory region for + * the given gpu address and + * the offset of that address + * into the region + * @in: Input parameters + * @in.gpu_addr: GPU virtual address + * @in.size: Size in bytes within the 
region + * @out: Output parameters + * @out.start: Address of the beginning of the memory region enclosing @gpu_addr + * for the length of @offset bytes + * @out.offset: The offset from the start of the memory region to @gpu_addr + */ +union kbase_ioctl_mem_find_gpu_start_and_offset { + struct { + __u64 gpu_addr; + __u64 size; + } in; + struct { + __u64 start; + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) + +#define KBASE_IOCTL_CINSTR_GWT_START \ + _IO(KBASE_IOCTL_TYPE, 33) + +#define KBASE_IOCTL_CINSTR_GWT_STOP \ + _IO(KBASE_IOCTL_TYPE, 34) + +/** + * union kbase_ioctl_cinstr_gwt_dump - Used to collect all GPU write fault + * addresses. + * @in: Input parameters + * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. + * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) + * @in.len: Number of addresses the buffers can hold. + * @in.padding: padding + * @out: Output parameters + * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. + * @out.more_data_available: Status indicating if more addresses are available. + * @out.padding: padding + * + * This structure is used when performing a call to dump GPU write fault + * addresses. + */ +union kbase_ioctl_cinstr_gwt_dump { + struct { + __u64 addr_buffer; + __u64 size_buffer; + __u32 len; + __u32 padding; + + } in; + struct { + __u32 no_of_addr_collected; + __u8 more_data_available; + __u8 padding[27]; + } out; +}; + +#define KBASE_IOCTL_CINSTR_GWT_DUMP \ + _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) + +/** + * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone + * + * @va_pages: Number of VA pages to reserve for EXEC_VA + */ +struct kbase_ioctl_mem_exec_init { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_EXEC_INIT \ + _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) + +/** + * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of + * cpu/gpu time (counter values) + * @in: Input parameters + * @in.request_flags: Bit-flags indicating the requested types. + * @in.paddings: Unused, size alignment matching the out. + * @out: Output parameters + * @out.sec: Integer field of the monotonic time, unit in seconds. + * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. + * @out.padding: Unused, for __u64 alignment + * @out.timestamp: System wide timestamp (counter) value. + * @out.cycle_counter: GPU cycle counter value. + */ +union kbase_ioctl_get_cpu_gpu_timeinfo { + struct { + __u32 request_flags; + __u32 paddings[7]; + } in; + struct { + __u64 sec; + __u32 nsec; + __u32 padding; + __u64 timestamp; + __u64 cycle_counter; + } out; +}; + +#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ + _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) + +/** + * struct kbase_ioctl_context_priority_check - Check the max possible priority + * @priority: Input priority & output priority + */ + +struct kbase_ioctl_context_priority_check { + __u8 priority; +}; + +#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) + +/** + * struct kbase_ioctl_set_limited_core_count - Set the limited core count. 
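Looping back to the time-info request above: a hedged usage sketch, not part of the patch. fd is an already-open kbase device, error handling is trimmed, and the BASE_TIMEINFO_* request flags come from mali_base_kernel.h earlier in this patch; as documented, only the output fields matching the requested flags are meaningful.

#include <sys/ioctl.h>

/* Request the monotonic time, system timestamp and GPU cycle counter at once. */
static int kbase_query_time(int fd, __u64 *timestamp, __u64 *cycles)
{
        union kbase_ioctl_get_cpu_gpu_timeinfo info = {
                .in.request_flags = BASE_TIMEINFO_MONOTONIC_FLAG |
                                    BASE_TIMEINFO_TIMESTAMP_FLAG |
                                    BASE_TIMEINFO_CYCLE_COUNTER_FLAG,
        };

        if (ioctl(fd, KBASE_IOCTL_GET_CPU_GPU_TIMEINFO, &info) < 0)
                return -1;

        *timestamp = info.out.timestamp;
        *cycles    = info.out.cycle_counter;
        return 0;
}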
+ * + * @max_core_count: Maximum core count + */ +struct kbase_ioctl_set_limited_core_count { + __u8 max_core_count; +}; + +#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ + _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) + +/** + * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter + * information + * @info_item_size: Performance counter item size in bytes. + * @info_item_count: Performance counter item count in the info_list_ptr. + * @info_list_ptr: Performance counter item list pointer which points to a + * list with info_item_count of items. + * + * On success: returns info_item_size and info_item_count if info_list_ptr is + * NULL, returns performance counter information if info_list_ptr is not NULL. + * On error: returns a negative error code. + */ +struct kbase_ioctl_kinstr_prfcnt_enum_info { + __u32 info_item_size; + __u32 info_item_count; + __u64 info_list_ptr; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ + _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) + +/** + * struct kbase_ioctl_kinstr_prfcnt_setup - Setup HWC dumper/reader + * @in: input parameters. + * @in.request_item_count: Number of requests in the requests array. + * @in.request_item_size: Size in bytes of each request in the requests array. + * @in.requests_ptr: Pointer to the requests array. + * @out: output parameters. + * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for + * each sample. + * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap + * for reading performance counter samples. + * + * A fd is returned from the ioctl if successful, or a negative value on error. + */ +union kbase_ioctl_kinstr_prfcnt_setup { + struct { + __u32 request_item_count; + __u32 request_item_size; + __u64 requests_ptr; + } in; + struct { + __u32 prfcnt_metadata_item_size; + __u32 prfcnt_mmap_size_bytes; + } out; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ + _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) + + +/** + * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes + * @bytes_collected: number of bytes read by user + * @bytes_generated: number of bytes generated by tracepoints + */ +struct kbase_ioctl_tlstream_stats { + __u32 bytes_collected; + __u32 bytes_generated; +}; + +#define KBASE_IOCTL_TLSTREAM_STATS \ + _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) + +#endif /* MALI_UNIT_TEST */ + +/* Customer extension range */ +#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) + +/* If the integration needs extra ioctl add them there + * like this: + * + * struct my_ioctl_args { + * .... 
+ * } + * + * #define KBASE_IOCTL_MY_IOCTL \ + * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) + */ + +#ifdef __cpluscplus +} +#endif + +#endif /* _UAPI_KBASE_IOCTL_H_ */ diff --git a/src/panfrost/base/include/old/mali-ioctl-midgard.h b/src/panfrost/base/include/old/mali-ioctl-midgard.h new file mode 100644 index 00000000000..5f33f5c4c4b --- /dev/null +++ b/src/panfrost/base/include/old/mali-ioctl-midgard.h @@ -0,0 +1,80 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef __KBASE_IOCTL_MIDGARD_H__ +#define __KBASE_IOCTL_MIDGARD_H__ + +#define KBASE_IOCTL_TYPE_BASE 0x80 +#define KBASE_IOCTL_TYPE_MAX 0x82 + +union kbase_ioctl_mem_alloc { + struct { + union kbase_ioctl_header header; + u64 va_pages; + u64 commit_pages; + u64 extension; + u64 flags; + } in; + struct { + union kbase_ioctl_header header; + u64 pad[3]; + u64 flags; + mali_ptr gpu_va; + u16 va_alignment; + } out; + u64 pad[7]; +} __attribute__((packed)); + +#define KBASE_IOCTL_TYPE_COUNT (KBASE_IOCTL_TYPE_MAX - KBASE_IOCTL_TYPE_BASE + 1) + +#define KBASE_IOCTL_GET_VERSION (_IOWR(0x80, 0, struct kbase_ioctl_get_version)) +#define KBASE_IOCTL_MEM_ALLOC (_IOWR(0x82, 0, union kbase_ioctl_mem_alloc)) +#define KBASE_IOCTL_MEM_IMPORT (_IOWR(0x82, 1, union kbase_ioctl_mem_import)) +#define KBASE_IOCTL_MEM_COMMIT (_IOWR(0x82, 2, struct kbase_ioctl_mem_commit)) +#define KBASE_IOCTL_MEM_QUERY (_IOWR(0x82, 3, struct kbase_ioctl_mem_query)) +#define KBASE_IOCTL_MEM_FREE (_IOWR(0x82, 4, struct kbase_ioctl_mem_free)) +#define KBASE_IOCTL_MEM_FLAGS_CHANGE (_IOWR(0x82, 5, struct kbase_ioctl_mem_flags_change)) +#define KBASE_IOCTL_MEM_ALIAS (_IOWR(0x82, 6, struct kbase_ioctl_mem_alias)) +#define KBASE_IOCTL_MEM_SYNC (_IOWR(0x82, 8, struct kbase_ioctl_mem_sync)) +#define KBASE_IOCTL_POST_TERM (_IOWR(0x82, 9, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_SETUP (_IOWR(0x82, 10, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_DUMP (_IOWR(0x82, 11, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_CLEAR (_IOWR(0x82, 12, __ioctl_placeholder)) +#define KBASE_IOCTL_GPU_PROPS_REG_DUMP (_IOWR(0x82, 14, struct kbase_ioctl_gpu_props_reg_dump)) +#define KBASE_IOCTL_FIND_CPU_OFFSET (_IOWR(0x82, 15, __ioctl_placeholder)) +#define KBASE_IOCTL_GET_VERSION_NEW (_IOWR(0x82, 16, struct kbase_ioctl_get_version)) +#define KBASE_IOCTL_SET_FLAGS (_IOWR(0x82, 18, struct kbase_ioctl_set_flags)) +#define KBASE_IOCTL_SET_TEST_DATA (_IOWR(0x82, 19, __ioctl_placeholder)) +#define KBASE_IOCTL_INJECT_ERROR (_IOWR(0x82, 20, __ioctl_placeholder)) +#define KBASE_IOCTL_MODEL_CONTROL (_IOWR(0x82, 21, __ioctl_placeholder)) +#define KBASE_IOCTL_KEEP_GPU_POWERED (_IOWR(0x82, 22, __ioctl_placeholder)) +#define KBASE_IOCTL_FENCE_VALIDATE (_IOWR(0x82, 23, __ioctl_placeholder)) +#define KBASE_IOCTL_STREAM_CREATE (_IOWR(0x82, 24, struct kbase_ioctl_stream_create)) +#define KBASE_IOCTL_GET_PROFILING_CONTROLS (_IOWR(0x82, 25, __ioctl_placeholder)) +#define KBASE_IOCTL_SET_PROFILING_CONTROLS (_IOWR(0x82, 26, __ioctl_placeholder)) +#define KBASE_IOCTL_DEBUGFS_MEM_PROFILE_ADD (_IOWR(0x82, 27, __ioctl_placeholder)) 
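For reference, a hedged sketch of driving one of the legacy Midgard ioctls listed here from userspace. Unlike the modern interface, every legacy call carries a union kbase_ioctl_header and reports its result in-band through header.rc; depending on the kernel version the caller may also have to seed header.id, which this sketch does not attempt. The file descriptor is assumed to be an already-open Mali device node.

#include <string.h>
#include <sys/ioctl.h>
#include "mali-ioctl.h"
#include "mali-ioctl-midgard.h"

static int
legacy_get_version(int fd, u16 *major, u16 *minor)
{
        struct kbase_ioctl_get_version ver;
        memset(&ver, 0, sizeof(ver));

        if (ioctl(fd, KBASE_IOCTL_GET_VERSION, &ver) < 0)
                return -1;

        /* The legacy UK interface reports failure through header.rc. */
        if (ver.header.rc != MALI_ERROR_NONE)
                return -1;

        *major = ver.major;
        *minor = ver.minor;
        return 0;
}

Note that mali-ioctl-midgard.h only defines the ioctl numbers; the structures and the header/type definitions come from mali-ioctl.h, which therefore has to be included first.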
+#define KBASE_IOCTL_JOB_SUBMIT (_IOWR(0x82, 28, struct kbase_ioctl_job_submit)) +#define KBASE_IOCTL_DISJOINT_QUERY (_IOWR(0x82, 29, __ioctl_placeholder)) +#define KBASE_IOCTL_GET_CONTEXT_ID (_IOWR(0x82, 31, struct kbase_ioctl_get_context_id)) +#define KBASE_IOCTL_TLSTREAM_ACQUIRE_V10_4 (_IOWR(0x82, 32, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_TEST (_IOWR(0x82, 33, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_STATS (_IOWR(0x82, 34, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_FLUSH (_IOWR(0x82, 35, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_READER_SETUP (_IOWR(0x82, 36, __ioctl_placeholder)) +#define KBASE_IOCTL_SET_PRFCNT_VALUES (_IOWR(0x82, 37, __ioctl_placeholder)) +#define KBASE_IOCTL_SOFT_EVENT_UPDATE (_IOWR(0x82, 38, __ioctl_placeholder)) +#define KBASE_IOCTL_MEM_JIT_INIT (_IOWR(0x82, 39, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_ACQUIRE (_IOWR(0x82, 40, __ioctl_placeholder)) + +#endif /* __KBASE_IOCTL_MIDGARD_H__ */ diff --git a/src/panfrost/base/include/old/mali-ioctl.h b/src/panfrost/base/include/old/mali-ioctl.h new file mode 100644 index 00000000000..5c76f2dc8e5 --- /dev/null +++ b/src/panfrost/base/include/old/mali-ioctl.h @@ -0,0 +1,743 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +/** + * Definitions for all of the ioctls for the original open source bifrost GPU + * kernel driver, written by ARM. + */ + +#ifndef __KBASE_IOCTL_H__ +#define __KBASE_IOCTL_H__ + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int32_t s32; +typedef int64_t s64; + + +typedef u8 mali_atom_id; + +/** + * Since these structs are passed to and from the kernel we need to make sure + * that we get the size of each struct to match exactly what the kernel is + * expecting. So, when editing this file make sure to add static asserts that + * check each struct's size against the arg length you see in strace. + */ + +enum kbase_ioctl_mem_flags { + /* IN */ + BASE_MEM_PROT_CPU_RD = (1U << 0), /**< Read access CPU side */ + BASE_MEM_PROT_CPU_WR = (1U << 1), /**< Write access CPU side */ + BASE_MEM_PROT_GPU_RD = (1U << 2), /**< Read access GPU side */ + BASE_MEM_PROT_GPU_WR = (1U << 3), /**< Write access GPU side */ + BASE_MEM_PROT_GPU_EX = (1U << 4), /**< Execute allowed on the GPU + side */ + + BASE_MEM_GROW_ON_GPF = (1U << 9), /**< Grow backing store on GPU + Page Fault */ + + BASE_MEM_COHERENT_SYSTEM = (1U << 10), /**< Page coherence Outer + shareable, if available */ + BASE_MEM_COHERENT_LOCAL = (1U << 11), /**< Page coherence Inner + shareable */ + BASE_MEM_CACHED_CPU = (1U << 12), /**< Should be cached on the + CPU */ + + /* IN/OUT */ + BASE_MEM_SAME_VA = (1U << 13), /**< Must have same VA on both the GPU + and the CPU */ + /* OUT */ + BASE_MEM_NEED_MMAP = (1U << 14), /**< Must call mmap to acquire a GPU + address for the alloc */ + /* IN */ + BASE_MEM_COHERENT_SYSTEM_REQUIRED = (1U << 15), /**< Page coherence + Outer shareable, required. 
*/ + BASE_MEM_SECURE = (1U << 16), /**< Secure memory */ + BASE_MEM_DONT_NEED = (1U << 17), /**< Not needed physical + memory */ + BASE_MEM_IMPORT_SHARED = (1U << 18), /**< Must use shared CPU/GPU zone + (SAME_VA zone) but doesn't + require the addresses to + be the same */ +}; + +#define KBASE_IOCTL_MEM_FLAGS_IN_MASK \ + (BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | \ + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_GPU_EX | \ + BASE_MEM_GROW_ON_GPF | \ + BASE_MEM_COHERENT_SYSTEM | BASE_MEM_COHERENT_LOCAL | \ + BASE_MEM_CACHED_CPU | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_SECURE | \ + BASE_MEM_DONT_NEED | BASE_MEM_IMPORT_SHARED) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ull << 12) + +enum kbase_ioctl_coherency_mode { + COHERENCY_ACE_LITE = 0, + COHERENCY_ACE = 1, + COHERENCY_NONE = 31 +}; + +/* + * Mali Atom priority + * + * Only certain priority levels are actually implemented, as specified by the + * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority + * level that is not one of those defined below. + * + * Priority levels only affect scheduling between atoms of the same type within + * a mali context, and only after the atoms have had dependencies resolved. + * Fragment atoms does not affect non-frament atoms with lower priorities, and + * the other way around. For example, a low priority atom that has had its + * dependencies resolved might run before a higher priority atom that has not + * had its dependencies resolved. + * + * The scheduling between mali contexts/processes and between atoms from + * different mali contexts/processes is unaffected by atom priority. + * + * The atoms are scheduled as follows with respect to their priorities: + * - Let atoms 'X' and 'Y' be for the same job slot who have dependencies + * resolved, and atom 'X' has a higher priority than atom 'Y' + * - If atom 'Y' is currently running on the HW, then it is interrupted to + * allow atom 'X' to run soon after + * - If instead neither atom 'Y' nor atom 'X' are running, then when choosing + * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' + * - Any two atoms that have the same priority could run in any order with + * respect to each other. That is, there is no ordering constraint between + * atoms of the same priority. + */ +typedef u8 mali_jd_prio; +#define BASE_JD_PRIO_MEDIUM ((mali_jd_prio)0) +#define BASE_JD_PRIO_HIGH ((mali_jd_prio)1) +#define BASE_JD_PRIO_LOW ((mali_jd_prio)2) + +/** + * @brief Job dependency type. + * + * A flags field will be inserted into the atom structure to specify whether a + * dependency is a data or ordering dependency (by putting it before/after + * 'core_req' in the structure it should be possible to add without changing + * the structure size). When the flag is set for a particular dependency to + * signal that it is an ordering only dependency then errors will not be + * propagated. + */ +typedef u8 mali_jd_dep_type; +#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ +#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ +#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ + +/** + * @brief Job chain hardware requirements. + * + * A job chain must specify what GPU features it needs to allow the + * driver to schedule the job correctly. By not specifying the + * correct settings can/will cause an early job termination. Multiple + * values can be ORed together to specify multiple requirements. 
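As a concrete illustration of the memory flags defined above, here is a hedged sketch that composes a typical CPU-visible, GPU-read/write request; the particular combination is arbitrary rather than what any specific driver asks for.

#include <assert.h>
#include "mali-ioctl.h"

static u64
example_alloc_flags(void)
{
        u64 flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR |
                    BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |
                    BASE_MEM_CACHED_CPU | BASE_MEM_SAME_VA;

        /* BASE_MEM_SAME_VA is marked IN/OUT above, so it is the one bit in
         * this set that is not covered by KBASE_IOCTL_MEM_FLAGS_IN_MASK. */
        assert((flags & ~(u64)(KBASE_IOCTL_MEM_FLAGS_IN_MASK | BASE_MEM_SAME_VA)) == 0);

        return flags;
}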
+ * Special case is ::BASE_JD_REQ_DEP, which is used to express complex + * dependencies, and that doesn't execute anything on the hardware. + */ +typedef u32 mali_jd_core_req; + +/* Requirements that come from the HW */ + +/** + * No requirement, dependency only + */ +#define BASE_JD_REQ_DEP ((mali_jd_core_req)0) + +/** + * Requires fragment shaders + */ +#define BASE_JD_REQ_FS ((mali_jd_core_req)1 << 0) + +/** + * Requires compute shaders + * This covers any of the following Midgard Job types: + * - Vertex Shader Job + * - Geometry Shader Job + * - An actual Compute Shader Job + * + * Compare this with @ref BASE_JD_REQ_ONLY_COMPUTE, which specifies that the + * job is specifically just the "Compute Shader" job type, and not the "Vertex + * Shader" nor the "Geometry Shader" job type. + */ +#define BASE_JD_REQ_CS ((mali_jd_core_req)1 << 1) +#define BASE_JD_REQ_T ((mali_jd_core_req)1 << 2) /**< Requires tiling */ +#define BASE_JD_REQ_CF ((mali_jd_core_req)1 << 3) /**< Requires cache flushes */ +#define BASE_JD_REQ_V ((mali_jd_core_req)1 << 4) /**< Requires value writeback */ + +/* SW-only requirements - the HW does not expose these as part of the job slot + * capabilities */ + +/* Requires fragment job with AFBC encoding */ +#define BASE_JD_REQ_FS_AFBC ((mali_jd_core_req)1 << 13) + +/** + * SW-only requirement: coalesce completion events. + * If this bit is set then completion of this atom will not cause an event to + * be sent to userspace, whether successful or not; completion events will be + * deferred until an atom completes which does not have this bit set. + * + * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. + */ +#define BASE_JD_REQ_EVENT_COALESCE ((mali_jd_core_req)1 << 5) + +/** + * SW Only requirement: the job chain requires a coherent core group. We don't + * mind which coherent core group is used. + */ +#define BASE_JD_REQ_COHERENT_GROUP ((mali_jd_core_req)1 << 6) + +/** + * SW Only requirement: The performance counters should be enabled only when + * they are needed, to reduce power consumption. + */ + +#define BASE_JD_REQ_PERMON ((mali_jd_core_req)1 << 7) + +/** + * SW Only requirement: External resources are referenced by this atom. When + * external resources are referenced no syncsets can be bundled with the atom + * but should instead be part of a NULL jobs inserted into the dependency + * tree. The first pre_dep object must be configured for the external + * resouces to use, the second pre_dep object can be used to create other + * dependencies. + * + * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE. + */ +#define BASE_JD_REQ_EXTERNAL_RESOURCES ((mali_jd_core_req)1 << 8) + +/** + * SW Only requirement: Software defined job. Jobs with this bit set will not + * be submitted to the hardware but will cause some action to happen within + * the driver + */ +#define BASE_JD_REQ_SOFT_JOB ((mali_jd_core_req)1 << 9) + +#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) +#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) +#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) + +/** + * SW Only requirement : Replay job. + * + * If the preceding job fails, the replay job will cause the jobs specified in + * the list of mali_jd_replay_payload pointed to by the jc pointer to be + * replayed. + * + * A replay job will only cause jobs to be replayed up to MALIP_JD_REPLAY_LIMIT + * times. 
If a job fails more than MALIP_JD_REPLAY_LIMIT times then the replay + * job is failed, as well as any following dependencies. + * + * The replayed jobs will require a number of atom IDs. If there are not enough + * free atom IDs then the replay job will fail. + * + * If the preceding job does not fail, then the replay job is returned as + * completed. + * + * The replayed jobs will never be returned to userspace. The preceding failed + * job will be returned to userspace as failed; the status of this job should + * be ignored. Completion should be determined by the status of the replay soft + * job. + * + * In order for the jobs to be replayed, the job headers will have to be + * modified. The Status field will be reset to NOT_STARTED. If the Job Type + * field indicates a Vertex Shader Job then it will be changed to Null Job. + * + * The replayed jobs have the following assumptions : + * + * - No external resources. Any required external resources will be held by the + * replay atom. + * - Pre-dependencies are created based on job order. + * - Atom numbers are automatically assigned. + * - device_nr is set to 0. This is not relevant as + * BASE_JD_REQ_SPECIFIC_COHERENT_GROUP should not be set. + * - Priority is inherited from the replay job. + */ +#define BASE_JD_REQ_SOFT_REPLAY (BASE_JD_REQ_SOFT_JOB | 0x4) +/** + * SW only requirement: event wait/trigger job. + * + * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. + * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the + * other waiting jobs. It completes immediately. + * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it + * possible for other jobs to wait upon. It completes immediately. + */ +#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) +#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) +#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) + +#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) + +/** + * SW only requirement: Just In Time allocation + * + * This job requests a JIT allocation based on the request in the + * @base_jit_alloc_info structure which is passed via the jc element of + * the atom. + * + * It should be noted that the id entry in @base_jit_alloc_info must not + * be reused until it has been released via @BASE_JD_REQ_SOFT_JIT_FREE. + * + * Should this soft job fail it is expected that a @BASE_JD_REQ_SOFT_JIT_FREE + * soft job to free the JIT allocation is still made. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) +/** + * SW only requirement: Just In Time free + * + * This job requests a JIT allocation created by @BASE_JD_REQ_SOFT_JIT_ALLOC + * to be freed. The ID of the JIT allocation is passed via the jc element of + * the atom. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) + +/** + * SW only requirement: Map external resource + * + * This job requests external resource(s) are mapped once the dependencies + * of the job have been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * @base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) +/** + * SW only requirement: Unmap external resource + * + * This job requests external resource(s) are unmapped once the dependencies + * of the job has been satisfied. 
The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * @base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) + +/** + * HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) + * + * This indicates that the Job Chain contains Midgard Jobs of the 'Compute + * Shaders' type. + * + * In contrast to @ref BASE_JD_REQ_CS, this does \b not indicate that the Job + * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. + */ +#define BASE_JD_REQ_ONLY_COMPUTE ((mali_jd_core_req)1 << 10) + +/** + * HW Requirement: Use the mali_jd_atom::device_nr field to specify a + * particular core group + * + * If both @ref BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag + * takes priority + * + * This is only guaranteed to work for @ref BASE_JD_REQ_ONLY_COMPUTE atoms. + * + * If the core availability policy is keeping the required core group turned + * off, then the job will fail with a @ref BASE_JD_EVENT_PM_EVENT error code. + */ +#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((mali_jd_core_req)1 << 11) + +/** + * SW Flag: If this bit is set then the successful completion of this atom + * will not cause an event to be sent to userspace + */ +#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((mali_jd_core_req)1 << 12) + +/** + * SW Flag: If this bit is set then completion of this atom will not cause an + * event to be sent to userspace, whether successful or not. + */ +#define BASE_JD_REQ_EVENT_NEVER ((mali_jd_core_req)1 << 14) + +/** + * SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job starts which does not have this bit set or a job completes + * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use if + * the CPU may have written to memory addressed by the job since the last job + * without this bit set was submitted. + */ +#define BASE_JD_REQ_SKIP_CACHE_START ((mali_jd_core_req)1 << 15) + +/** + * SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job completes which does not have this bit set or a job starts + * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_START bti set. Do not + * use if the CPU may read from or partially overwrite memory addressed by the + * job before the next job without this bit set completes. + */ +#define BASE_JD_REQ_SKIP_CACHE_END ((mali_jd_core_req)1 << 16) + +/** + * These requirement bits are currently unused in mali_jd_core_req + */ +#define MALIP_JD_REQ_RESERVED \ + (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ + BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | MALIP_JD_REQ_EVENT_NEVER | \ + BASE_JD_REQ_EVENT_COALESCE | \ + BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ + BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ + BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END)) + +/** + * Mask of all bits in mali_jd_core_req that control the type of the atom. + * + * This allows dependency only atoms to have flags set + */ +#define BASE_JD_REQ_ATOM_TYPE \ + (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ + BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) + +/** + * Mask of all bits in mali_jd_core_req that control the type of a soft job. 
+ */ +#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) + +/* + * Returns non-zero value if core requirements passed define a soft job or + * a dependency only job. + */ +#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ + ((core_req & BASE_JD_REQ_SOFT_JOB) || \ + (core_req & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) + +/** + * @brief The payload for a replay job. This must be in GPU memory. + */ +struct mali_jd_replay_payload { + /** + * Pointer to the first entry in the mali_jd_replay_jc list. These + * will be replayed in @b reverse order (so that extra ones can be added + * to the head in future soft jobs without affecting this soft job) + */ + u64 tiler_jc_list; + + /** + * Pointer to the fragment job chain. + */ + u64 fragment_jc; + + /** + * Pointer to the tiler heap free FBD field to be modified. + */ + u64 tiler_heap_free; + + /** + * Hierarchy mask for the replayed fragment jobs. May be zero. + */ + u16 fragment_hierarchy_mask; + + /** + * Hierarchy mask for the replayed tiler jobs. May be zero. + */ + u16 tiler_hierarchy_mask; + + /** + * Default weight to be used for hierarchy levels not in the original + * mask. + */ + u32 hierarchy_default_weight; + + /** + * Core requirements for the tiler job chain + */ + mali_jd_core_req tiler_core_req; + + /** + * Core requirements for the fragment job chain + */ + mali_jd_core_req fragment_core_req; +}; + +/** + * @brief An entry in the linked list of job chains to be replayed. This must + * be in GPU memory. + */ +struct mali_jd_replay_jc { + /** + * Pointer to next entry in the list. A setting of NULL indicates the + * end of the list. + */ + u64 next; + + /** + * Pointer to the job chain. + */ + u64 jc; +}; + +typedef u64 mali_ptr; + +#define MALI_PTR_FMT "0x%" PRIx64 +#define MALI_SHORT_PTR_FMT "0x%" PRIxPTR + +#ifdef __LP64__ +#define PAD_CPU_PTR(p) p +#else +#define PAD_CPU_PTR(p) p; u32 :32; +#endif + +/* FIXME: Again, they don't specify any of these as packed structs. However, + * looking at these structs I'm worried that there is already spots where the + * compiler is potentially sticking in padding... + * Going to try something a little crazy, and just hope that our compiler + * happens to add the same kind of offsets since we can't really compare sizes + */ + +/* + * Blob provided by the driver to store callback driver, not actually modified + * by the driver itself + */ +struct mali_jd_udata { + u64 blob[2]; +}; + +struct mali_jd_dependency { + mali_atom_id atom_id; /**< An atom number */ + mali_jd_dep_type dependency_type; /**< Dependency type */ +}; + +#define MALI_EXT_RES_MAX 10 + +/* The original header never explicitly defines any values for these. 
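The two helpers just defined, BASE_JD_REQ_SOFT_JOB_TYPE and BASE_JD_REQ_SOFT_JOB_OR_DEP(), are easiest to see with concrete values. A small sketch with arbitrary example requirements:

#include <assert.h>
#include "mali-ioctl.h"

static void
classify_core_req_examples(void)
{
        mali_jd_core_req fragment = BASE_JD_REQ_FS | BASE_JD_REQ_SKIP_CACHE_START;
        mali_jd_core_req fence_wait = BASE_JD_REQ_SOFT_FENCE_WAIT;
        mali_jd_core_req dep_only = BASE_JD_REQ_DEP;

        /* Hardware atoms are neither soft jobs nor dependency-only atoms. */
        assert(!BASE_JD_REQ_SOFT_JOB_OR_DEP(fragment));
        assert(BASE_JD_REQ_SOFT_JOB_OR_DEP(fence_wait));
        assert(BASE_JD_REQ_SOFT_JOB_OR_DEP(dep_only));

        /* The soft-job subtype is recovered by masking with
         * BASE_JD_REQ_SOFT_JOB_TYPE. */
        assert((fence_wait & BASE_JD_REQ_SOFT_JOB_TYPE) == BASE_JD_REQ_SOFT_FENCE_WAIT);
}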
In C, + * this -should- expand to SHARED == 0 and EXCLUSIVE == 1, so the only flag we + * actually need to decode here is EXCLUSIVE + */ +enum mali_external_resource_access { + MALI_EXT_RES_ACCESS_SHARED, + MALI_EXT_RES_ACCESS_EXCLUSIVE, +}; + +/* An aligned address to the resource | mali_external_resource_access */ +typedef u64 mali_external_resource; + +struct base_jd_atom_v2 { + mali_ptr jc; /**< job-chain GPU address */ + struct mali_jd_udata udata; /**< user data */ + u64 extres_list; /**< list of external resources */ + u16 nr_extres; /**< nr of external resources */ + u16 compat_core_req; /**< core requirements which + correspond to the legacy support + for UK 10.2 */ + struct mali_jd_dependency pre_dep[2]; /**< pre-dependencies, one need to + use SETTER function to assign + this field, this is done in + order to reduce possibility of + improper assigment of a + dependency field */ + mali_atom_id atom_number; /**< unique number to identify the + atom */ + mali_jd_prio prio; /**< Atom priority. Refer to @ref + mali_jd_prio for more details */ + u8 device_nr; /**< coregroup when + BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + specified */ + u8 :8; + mali_jd_core_req core_req; /**< core requirements */ +} __attribute__((packed)); + +/** + * enum mali_error - Mali error codes shared with userspace + * + * This is subset of those common Mali errors that can be returned to userspace. + * Values of matching user and kernel space enumerators MUST be the same. + * MALI_ERROR_NONE is guaranteed to be 0. + * + * @MALI_ERROR_NONE: Success + * @MALI_ERROR_OUT_OF_GPU_MEMORY: Not used in the kernel driver + * @MALI_ERROR_OUT_OF_MEMORY: Memory allocation failure + * @MALI_ERROR_FUNCTION_FAILED: Generic error code + */ +enum mali_error { + MALI_ERROR_NONE = 0, + MALI_ERROR_OUT_OF_GPU_MEMORY, + MALI_ERROR_OUT_OF_MEMORY, + MALI_ERROR_FUNCTION_FAILED, +}; + +/** + * Header used by all ioctls + */ +union kbase_ioctl_header { +#ifdef dvalin + u32 pad[0]; +#else + /* [in] The ID of the UK function being called */ + u32 id :32; + /* [out] The return value of the UK function that was called */ + enum mali_error rc :32; + + u64 :64; +#endif +} __attribute__((packed)); + +struct kbase_ioctl_get_version { + union kbase_ioctl_header header; + u16 major; /* [out] */ + u16 minor; /* [out] */ + u32 :32; +} __attribute__((packed)); + +struct mali_mem_import_user_buffer { + u64 ptr; + u64 length; +}; + +union kbase_ioctl_mem_import { + struct { + union kbase_ioctl_header header; + u64 phandle; + enum { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + BASE_MEM_IMPORT_TYPE_UMP = 1, + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3, + } type :32; + u32 :32; + u64 flags; + } in; + struct { + union kbase_ioctl_header header; + u64 pad[2]; + u64 flags; + u64 gpu_va; + u64 va_pages; + } out; +} __attribute__((packed)); + +struct kbase_ioctl_mem_commit { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_addr; + u64 pages; + /* [out] */ + u32 result_subcode; + u32 :32; +} __attribute__((packed)); + +enum kbase_ioctl_mem_query_type { + BASE_MEM_QUERY_COMMIT_SIZE = 1, + BASE_MEM_QUERY_VA_SIZE = 2, + BASE_MEM_QUERY_FLAGS = 3 +}; + +struct kbase_ioctl_mem_query { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_addr; + enum kbase_ioctl_mem_query_type query : 32; + u32 :32; + /* [out] */ + u64 value; +} __attribute__((packed)); + +struct kbase_ioctl_mem_free { + union kbase_ioctl_header header; + mali_ptr gpu_addr; /* [in] */ +} __attribute__((packed)); +/* FIXME: Size unconfirmed (haven't seen in a 
trace yet) */ + +struct kbase_ioctl_mem_flags_change { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_va; + u64 flags; + u64 mask; +} __attribute__((packed)); +/* FIXME: Size unconfirmed (haven't seen in a trace yet) */ + +struct kbase_ioctl_mem_alias { + union kbase_ioctl_header header; + /* [in/out] */ + u64 flags; + /* [in] */ + u64 stride; + u64 nents; + u64 ai; + /* [out] */ + mali_ptr gpu_va; + u64 va_pages; +} __attribute__((packed)); + +struct kbase_ioctl_mem_sync { + union kbase_ioctl_header header; + mali_ptr handle; + u64 user_addr; + u64 size; + enum { + MALI_SYNC_TO_DEVICE = 1, + MALI_SYNC_TO_CPU = 2, + } type :8; + u64 :56; +} __attribute__((packed)); + +struct kbase_ioctl_set_flags { + union kbase_ioctl_header header; + u32 create_flags; /* [in] */ + u32 :32; +} __attribute__((packed)); + +struct kbase_ioctl_stream_create { + union kbase_ioctl_header header; + /* [in] */ + char name[32]; + /* [out] */ + s32 fd; + u32 :32; +} __attribute__((packed)); + +struct kbase_ioctl_job_submit { + union kbase_ioctl_header header; + /* [in] */ + u64 addr; + u32 nr_atoms; + u32 stride; +} __attribute__((packed)); + +struct kbase_ioctl_get_context_id { + union kbase_ioctl_header header; + /* [out] */ + s64 id; +} __attribute__((packed)); + +#undef PAD_CPU_PTR + +enum base_jd_event_code { + BASE_JD_EVENT_DONE = 1, +}; + +struct base_jd_event_v2 { + enum base_jd_event_code event_code; + mali_atom_id atom_number; + struct mali_jd_udata udata; +}; + +/* Defined in mali-props.h */ +struct kbase_ioctl_gpu_props_reg_dump; + +/* For ioctl's we haven't written decoding stuff for yet */ +typedef struct { + union kbase_ioctl_header header; +} __ioctl_placeholder; + +#endif /* __KBASE_IOCTL_H__ */ diff --git a/src/panfrost/base/include/old/mali-props.h b/src/panfrost/base/include/old/mali-props.h new file mode 100644 index 00000000000..5b9d8723600 --- /dev/null +++ b/src/panfrost/base/include/old/mali-props.h @@ -0,0 +1,262 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef __MALI_PROPS_H__ +#define __MALI_PROPS_H__ + +#include "mali-ioctl.h" + +#define MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS 3 +#define MALI_GPU_MAX_JOB_SLOTS 16 +#define MALI_MAX_COHERENT_GROUPS 16 + +/* Capabilities of a job slot as reported by JS_FEATURES registers */ + +#define JS_FEATURE_NULL_JOB (1u << 1) +#define JS_FEATURE_SET_VALUE_JOB (1u << 2) +#define JS_FEATURE_CACHE_FLUSH_JOB (1u << 3) +#define JS_FEATURE_COMPUTE_JOB (1u << 4) +#define JS_FEATURE_VERTEX_JOB (1u << 5) +#define JS_FEATURE_GEOMETRY_JOB (1u << 6) +#define JS_FEATURE_TILER_JOB (1u << 7) +#define JS_FEATURE_FUSED_JOB (1u << 8) +#define JS_FEATURE_FRAGMENT_JOB (1u << 9) + +struct mali_gpu_core_props { + /** + * Product specific value. + */ + u32 product_id; + + /** + * Status of the GPU release. + * No defined values, but starts at 0 and increases by one for each + * release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + */ + u16 version_status; + + /** + * Minor release number of the GPU. "P" part of an "RnPn" release + * number. + * 8 bit values (0-255). 
+ */ + u16 minor_revision; + + /** + * Major release number of the GPU. "R" part of an "RnPn" release + * number. + * 4 bit values (0-15). + */ + u16 major_revision; + + u16 :16; + + /** + * @usecase GPU clock speed is not specified in the Midgard + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() + * function. + */ + u32 gpu_speed_mhz; + + /** + * @usecase GPU clock max/min speed is required for computing + * best/worst case in tasks as job scheduling ant irq_throttling. (It + * is not specified in the Midgard Architecture). + */ + u32 gpu_freq_khz_max; + u32 gpu_freq_khz_min; + + /** + * Size of the shader program counter, in bits. + */ + u32 log2_program_counter_size; + + /** + * TEXTURE_FEATURES_x registers, as exposed by the GPU. This is a + * bitpattern where a set bit indicates that the format is supported. + * + * Before using a texture format, it is recommended that the + * corresponding bit be checked. + */ + u32 texture_features[MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + /** + * Theoretical maximum memory available to the GPU. It is unlikely + * that a client will be able to allocate all of this memory for their + * own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + */ + u64 gpu_available_memory_size; +}; + +struct mali_gpu_l2_cache_props { + u8 log2_line_size; + u8 log2_cache_size; + u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + u64 :40; +}; + +struct mali_gpu_tiler_props { + u32 bin_size_bytes; /* Max is 4*2^15 */ + u32 max_active_levels; /* Max is 2^15 */ +}; + +struct mali_gpu_thread_props { + u32 max_threads; /* Max. number of threads per core */ + u32 max_workgroup_size; /* Max. number of threads per workgroup */ + u32 max_barrier_size; /* Max. number of threads that can + synchronize on a simple barrier */ + u16 max_registers; /* Total size [1..65535] of the register + file available per core. */ + u8 max_task_queue; /* Max. tasks [1..255] which may be sent + to a core before it becomes blocked. */ + u8 max_thread_group_split; /* Max. allowed value [1..15] of the + Thread Group Split field. */ + enum { + MALI_GPU_IMPLEMENTATION_UNKNOWN = 0, + MALI_GPU_IMPLEMENTATION_SILICON = 1, + MALI_GPU_IMPLEMENTATION_FPGA = 2, + MALI_GPU_IMPLEMENTATION_SW = 3, + } impl_tech :8; + u64 :56; +}; + +/** + * @brief descriptor for a coherent group + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_ioctl_gpu_coherent_group { + u64 core_mask; /**< Core restriction mask required for the + group */ + u16 num_cores; /**< Number of cores in the group */ + u64 :48; +}; + +/** + * @brief Coherency group information + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. 
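A hedged sketch of walking the coherent-group descriptors defined just above; the groups pointer and count are assumed to come from the coherency information block that a KBASE_IOCTL_GPU_PROPS_REG_DUMP call fills in.

#include <stdio.h>
#include "mali-props.h"

static void
print_coherent_groups(const struct mali_ioctl_gpu_coherent_group *groups,
                      u32 num_groups)
{
        for (u32 i = 0; i < num_groups && i < MALI_MAX_COHERENT_GROUPS; i++)
                printf("group %u: core mask 0x%016llx, %u cores\n",
                       i, (unsigned long long)groups[i].core_mask,
                       (unsigned)groups[i].num_cores);
}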
+ */ +struct mali_gpu_coherent_group_info { + u32 num_groups; + + /** + * Number of core groups (coherent or not) in the GPU. Equivalent to + * the number of L2 Caches. + * + * The GPU Counter dumping writes 2048 bytes per core group, + * regardless of whether the core groups are coherent or not. Hence + * this member is needed to calculate how much memory is required for + * dumping. + * + * @note Do not use it to work out how many valid elements are in the + * group[] member. Use num_groups instead. + */ + u32 num_core_groups; + + /** + * Coherency features of the memory, accessed by @ref gpu_mem_features + * methods + */ + u32 coherency; + + u32 :32; + + /** + * Descriptors of coherent groups + */ + struct mali_ioctl_gpu_coherent_group group[MALI_MAX_COHERENT_GROUPS]; +}; + +/** + * A complete description of the GPU's Hardware Configuration Discovery + * registers. + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * @usecase The raw properties in @ref gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct mali_gpu_raw_props { + u64 shader_present; + u64 tiler_present; + u64 l2_present; + u64 stack_present; + + u32 l2_features; + u32 suspend_size; /* API 8.2+ */ + u32 mem_features; + u32 mmu_features; + + u32 as_present; + + u32 js_present; + u32 js_features[MALI_GPU_MAX_JOB_SLOTS]; + u32 tiler_features; + u32 texture_features[3]; + + u32 gpu_id; + + u32 thread_max_threads; + u32 thread_max_workgroup_size; + u32 thread_max_barrier_size; + u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + u32 coherency_mode; +}; + +struct kbase_ioctl_gpu_props_reg_dump { + union kbase_ioctl_header header; + struct mali_gpu_core_props core; + struct mali_gpu_l2_cache_props l2; + u64 :64; + struct mali_gpu_tiler_props tiler; + struct mali_gpu_thread_props thread; + + struct mali_gpu_raw_props raw; + + /** This must be last member of the structure */ + struct mali_gpu_coherent_group_info coherency_info; +} __attribute__((packed)); + +#endif diff --git a/src/panfrost/base/meson.build b/src/panfrost/base/meson.build new file mode 100644 index 00000000000..5d7b9f1dff9 --- /dev/null +++ b/src/panfrost/base/meson.build @@ -0,0 +1,55 @@ +# Copyright © 2018 Rob Clark +# Copyright © 2019 Collabora +# Copyright © 2022 Icecream95 + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libpanfrost_base_versions = ['0', '1', '2', '258'] +libpanfrost_base_per_arch = [] + +foreach ver : libpanfrost_base_versions + libpanfrost_base_per_arch += static_library( + 'pan-base-v' + ver, + 'pan_vX_base.c', + include_directories : [ + inc_src, inc_include, inc_gallium, inc_mesa, inc_gallium_aux, + include_directories('include'), + ], + c_args : ['-DPAN_BASE_VER=' + ver], + gnu_symbol_visibility : 'hidden', + dependencies: [dep_valgrind], +) +endforeach + +libpanfrost_base = static_library( + 'panfrost_base', + 'pan_base.c', + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw, + include_directories('include'), + ], + gnu_symbol_visibility : 'hidden', + build_by_default : false, + link_with: [libpanfrost_base_per_arch], +) + +libpanfrost_base_dep = declare_dependency( + link_with: [libpanfrost_base_per_arch, libpanfrost_base], + include_directories: [include_directories('.')], +) diff --git a/src/panfrost/base/pan_base.c b/src/panfrost/base/pan_base.c new file mode 100644 index 00000000000..22dc09cfb52 --- /dev/null +++ b/src/panfrost/base/pan_base.c @@ -0,0 +1,301 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/macros.h" +#include "pan_base.h" + +#include "mali_kbase_ioctl.h" + +bool +kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose) +{ + *k = (struct kbase_) {0}; + k->fd = fd; + k->cs_queue_count = cs_queue_count; + k->page_size = sysconf(_SC_PAGE_SIZE); + k->verbose = verbose; + + if (k->fd == -1) + return kbase_open_csf_noop(k); + + struct kbase_ioctl_version_check ver = { 0 }; + + if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK_RESERVED, &ver) == 0) { + return kbase_open_csf(k); + } else if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK, &ver) == 0) { + if (ver.major == 3) + return kbase_open_old(k); + else + return kbase_open_new(k); + } + + return false; +} + +/* If fd != -1, ownership is passed in */ +int +kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd) +{ + kbase_handle h = { + .va = va, + .fd = fd + }; + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + kbase_handle *handles = util_dynarray_begin(&k->gem_handles); + + for (unsigned i = 0; i < size; ++i) { + if (handles[i].fd == -2) { + handles[i] = h; + return i; + } + } + + util_dynarray_append(&k->gem_handles, kbase_handle, h); + + return size; +} + +int +kbase_alloc_gem_handle(kbase k, base_va va, int fd) +{ + pthread_mutex_lock(&k->handle_lock); + + int ret = kbase_alloc_gem_handle_locked(k, va, fd); + + pthread_mutex_unlock(&k->handle_lock); + + return ret; +} + +void +kbase_free_gem_handle(kbase k, int handle) +{ + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + int fd; + + if (handle >= size) { + pthread_mutex_unlock(&k->handle_lock); + return; + } + + if (handle + 1 < size) { + kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); + fd = ptr->fd; + ptr->fd = -2; + } else { + fd = (util_dynarray_pop(&k->gem_handles, kbase_handle)).fd; + } + + if (fd != -1) + close(fd); + + pthread_mutex_unlock(&k->handle_lock); +} + +kbase_handle +kbase_gem_handle_get(kbase k, int handle) +{ + kbase_handle h = { .fd = -1 }; + + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + if (handle < size) + h = *util_dynarray_element(&k->gem_handles, kbase_handle, handle); + + pthread_mutex_unlock(&k->handle_lock); + + return h; +} + +int +kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers) +{ + struct kbase_wait_ctx wait = kbase_wait_init(k, timeout_ns); + + while (kbase_wait_for_event(&wait)) { + pthread_mutex_lock(&k->handle_lock); + if (handle >= util_dynarray_num_elements(&k->gem_handles, kbase_handle)) { + pthread_mutex_unlock(&k->handle_lock); + kbase_wait_fini(wait); + errno = EINVAL; + return -1; + } + kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); + if (!ptr->use_count) { + pthread_mutex_unlock(&k->handle_lock); + kbase_wait_fini(wait); + return 0; + } + pthread_mutex_unlock(&k->handle_lock); + } + + kbase_wait_fini(wait); + errno = ETIMEDOUT; + return -1; +} + +static void +adjust_time(struct timespec *tp, int64_t ns) +{ + ns += tp->tv_nsec; + tp->tv_nsec = ns % 1000000000; + tp->tv_sec += ns / 1000000000; +} + +static int64_t +ns_until(struct timespec tp) +{ + struct timespec now; + clock_gettime(CLOCK_MONOTONIC, &now); + + int64_t sec = (tp.tv_sec - now.tv_sec) * 1000000000; + int64_t ns = tp.tv_nsec - 
now.tv_nsec; + + /* Clamp the value to zero to avoid errors from ppoll */ + return MAX2(sec + ns, 0); +} + +static void +kbase_wait_signal(kbase k) +{ + /* We must acquire the event condition lock, otherwise another + * thread could be between the trylock and the cond_wait, and + * not notice the broadcast. */ + pthread_mutex_lock(&k->event_cnd_lock); + pthread_cond_broadcast(&k->event_cnd); + pthread_mutex_unlock(&k->event_cnd_lock); +} + +struct kbase_wait_ctx +kbase_wait_init(kbase k, int64_t timeout_ns) +{ + struct timespec tp; + clock_gettime(CLOCK_MONOTONIC, &tp); + + adjust_time(&tp, timeout_ns); + + return (struct kbase_wait_ctx) { + .k = k, + .until = tp, + }; +} + +bool +kbase_wait_for_event(struct kbase_wait_ctx *ctx) +{ + kbase k = ctx->k; + + /* Return instantly the first time so that a check outside the + * wait_for_Event loop is not required */ + if (!ctx->has_cnd_lock) { + pthread_mutex_lock(&k->event_cnd_lock); + ctx->has_cnd_lock = true; + return true; + } + + if (!ctx->has_lock) { + if (pthread_mutex_trylock(&k->event_read_lock) == 0) { + ctx->has_lock = true; + pthread_mutex_unlock(&k->event_cnd_lock); + } else { + int ret = pthread_cond_timedwait(&k->event_cnd, + &k->event_cnd_lock, &ctx->until); + return ret != ETIMEDOUT; + } + } + + bool event = k->poll_event(k, ns_until(ctx->until)); + k->handle_events(k); + kbase_wait_signal(k); + return event; +} + +void +kbase_wait_fini(struct kbase_wait_ctx ctx) +{ + kbase k = ctx.k; + + if (ctx.has_lock) { + pthread_mutex_unlock(&k->event_read_lock); + kbase_wait_signal(k); + } else if (ctx.has_cnd_lock) { + pthread_mutex_unlock(&k->event_cnd_lock); + } +} + +void +kbase_ensure_handle_events(kbase k) +{ + /* If we don't manage to take the lock, then events have recently/will + * soon be handled, there is no need to do anything. */ + if (pthread_mutex_trylock(&k->event_read_lock) == 0) { + k->handle_events(k); + pthread_mutex_unlock(&k->event_read_lock); + kbase_wait_signal(k); + } +} + +bool +kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp) +{ + struct pollfd pfd = { + .fd = fd, + .events = wait_shared ? POLLOUT : POLLIN, + }; + + uint64_t timeout = ns_until(tp); + + struct timespec t = { + .tv_sec = timeout / 1000000000, + .tv_nsec = timeout % 1000000000, + }; + + int ret = ppoll(&pfd, 1, &t, NULL); + + if (ret == -1 && errno != EINTR) + perror("kbase_poll_fd_until"); + + return ret != 0; +} diff --git a/src/panfrost/base/pan_base.h b/src/panfrost/base/pan_base.h new file mode 100644 index 00000000000..878f7468433 --- /dev/null +++ b/src/panfrost/base/pan_base.h @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Library for interfacing with kbase */ +#ifndef PAN_BASE_H +#define PAN_BASE_H + +#include "util/u_dynarray.h" +#include "util/list.h" + +#define PAN_EVENT_SIZE 16 + +typedef uint64_t base_va; +struct base_ptr { + void *cpu; + base_va gpu; +}; + +struct kbase_syncobj; + +/* The job is done when the queue seqnum > seqnum */ +struct kbase_sync_link { + struct kbase_sync_link *next; /* must be first */ + uint64_t seqnum; + void (*callback)(void *); + void *data; +}; + +struct kbase_event_slot { + struct kbase_sync_link *syncobjs; + struct kbase_sync_link **back; + uint64_t last_submit; + uint64_t last; +}; + +struct kbase_context { + uint8_t csg_handle; + uint8_t kcpu_queue; + bool kcpu_init; // TODO: Always create a queue? + uint32_t csg_uid; + unsigned num_csi; + + unsigned tiler_heap_chunk_size; + base_va tiler_heap_va; + base_va tiler_heap_header; +}; + +struct kbase_cs { + struct kbase_context *ctx; + void *user_io; + base_va va; + unsigned size; + unsigned event_mem_offset; + unsigned csi; + + uint64_t last_insert; + + // TODO: This is only here because it's convenient for emit_csf_queue + uint32_t *latest_flush; +}; + +#define KBASE_SLOT_COUNT 2 + +typedef struct { + base_va va; + int fd; + uint8_t use_count; + /* For emulating implicit sync. TODO make this work on v10 */ + uint8_t last_access[KBASE_SLOT_COUNT]; +} kbase_handle; + +struct kbase_; +typedef struct kbase_ *kbase; + +struct kbase_ { + unsigned setup_state; + bool verbose; + + int fd; + unsigned api; + unsigned page_size; + // TODO: Actually we may want to try to pack multiple contexts / queue + // "sets" into a single group... + unsigned cs_queue_count; + + /* Must not hold handle_lock while acquiring event_read_lock */ + pthread_mutex_t handle_lock; + pthread_mutex_t event_read_lock; + pthread_mutex_t event_cnd_lock; + pthread_cond_t event_cnd; + /* TODO: Per-context/queue locks? */ + pthread_mutex_t queue_lock; + + struct list_head syncobjs; + + unsigned gpuprops_size; + void *gpuprops; + + void *tracking_region; + void *csf_user_reg; + struct base_ptr event_mem; + struct base_ptr kcpu_event_mem; + // TODO: dynamically size + struct kbase_event_slot event_slots[256]; + // TODO: USe a bitset? 
+ unsigned event_slot_usage; + + uint8_t atom_number; + + struct util_dynarray gem_handles; + struct util_dynarray atom_bos[256]; + uint64_t job_seq; + + void (*close)(kbase k); + + bool (*get_pan_gpuprop)(kbase k, unsigned name, uint64_t *value); + bool (*get_mali_gpuprop)(kbase k, unsigned name, uint64_t *value); + + struct base_ptr (*alloc)(kbase k, size_t size, + unsigned pan_flags, + unsigned mali_flags); + void (*free)(kbase k, base_va va); + + int (*import_dmabuf)(kbase k, int fd); + void *(*mmap_import)(kbase k, base_va va, size_t size); + + void (*cache_clean)(void *ptr, size_t size); + void (*cache_invalidate)(void *ptr, size_t size); + + /* Returns false on timeout */ + bool (*poll_event)(kbase k, int64_t timeout_ns); + bool (*handle_events)(kbase k); + + /* <= v9 GPUs */ + int (*submit)(kbase k, uint64_t va, unsigned req, + struct kbase_syncobj *o, + int32_t *handles, unsigned num_handles); + + /* >= v10 GPUs */ + struct kbase_context *(*context_create)(kbase k); + void (*context_destroy)(kbase k, struct kbase_context *ctx); + bool (*context_recreate)(kbase k, struct kbase_context *ctx); + + // TODO: Pass in a priority? + struct kbase_cs (*cs_bind)(kbase k, struct kbase_context *ctx, + base_va va, unsigned size); + void (*cs_term)(kbase k, struct kbase_cs *cs); + void (*cs_rebind)(kbase k, struct kbase_cs *cs); + + bool (*cs_submit)(kbase k, struct kbase_cs *cs, uint64_t insert_offset, + struct kbase_syncobj *o, uint64_t seqnum); + bool (*cs_wait)(kbase k, struct kbase_cs *cs, uint64_t extract_offset, + struct kbase_syncobj *o); + + int (*kcpu_fence_export)(kbase k, struct kbase_context *ctx); + bool (*kcpu_fence_import)(kbase k, struct kbase_context *ctx, int fd); + + bool (*kcpu_cqs_set)(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value); + bool (*kcpu_cqs_wait)(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value); + + /* syncobj functions */ + struct kbase_syncobj *(*syncobj_create)(kbase k); + void (*syncobj_destroy)(kbase k, struct kbase_syncobj *o); + struct kbase_syncobj *(*syncobj_dup)(kbase k, struct kbase_syncobj *o); + /* TODO: timeout? 
(and for cs_wait) */ + bool (*syncobj_wait)(kbase k, struct kbase_syncobj *o); + + /* Returns false if there are no active queues */ + bool (*callback_all_queues)(kbase k, int32_t *count, + void (*callback)(void *), void *data); + + void (*mem_sync)(kbase k, base_va gpu, void *cpu, size_t size, + bool invalidate); +}; + +bool kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose); + +/* Called from kbase_open */ +bool kbase_open_old(kbase k); +bool kbase_open_new(kbase k); +bool kbase_open_csf(kbase k); +bool kbase_open_csf_noop(kbase k); + +/* BO management */ +int kbase_alloc_gem_handle(kbase k, base_va va, int fd); +int kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd); +void kbase_free_gem_handle(kbase k, int handle); +kbase_handle kbase_gem_handle_get(kbase k, int handle); +int kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers); + +/* Event waiting */ +struct kbase_wait_ctx { + kbase k; + struct timespec until; + bool has_lock; + bool has_cnd_lock; +}; + +struct kbase_wait_ctx kbase_wait_init(kbase k, int64_t timeout_ns); +/* Returns false on timeout, kbase_wait_fini must still be called */ +bool kbase_wait_for_event(struct kbase_wait_ctx *ctx); +void kbase_wait_fini(struct kbase_wait_ctx ctx); + +void kbase_ensure_handle_events(kbase k); + +bool kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp); + +/* Must not conflict with PANFROST_BO_* flags */ +#define MALI_BO_CACHED_CPU (1 << 16) +#define MALI_BO_UNCACHED_GPU (1 << 17) + +#endif diff --git a/src/panfrost/base/pan_base_noop.h b/src/panfrost/base/pan_base_noop.h new file mode 100644 index 00000000000..750a445a995 --- /dev/null +++ b/src/panfrost/base/pan_base_noop.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
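The event-wait helpers declared above follow a specific calling pattern: kbase_wait_for_event() returns true immediately on its first call so the caller's condition is checked at least once, it returns false on timeout, and kbase_wait_fini() must run on every exit path. A hedged sketch of a caller, with check_done() standing in for whatever condition is being polled:

#include <stdbool.h>
#include <stdint.h>
#include "pan_base.h"

static bool
wait_until_done(kbase k, bool (*check_done)(void *), void *data,
                int64_t timeout_ns)
{
        struct kbase_wait_ctx wait = kbase_wait_init(k, timeout_ns);
        bool done = false;

        /* kbase_wait_for_event() returns false once timeout_ns has elapsed. */
        while (!done && kbase_wait_for_event(&wait))
                done = check_done(data);

        kbase_wait_fini(wait);
        return done;
}

kbase_wait_bo() earlier in pan_base.c is the in-tree user of this pattern; the sketch merely generalises the condition being polled.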
+ */ + +#ifndef PAN_BASE_NOOP_H +#define PAN_BASE_NOOP_H + +/* For Mali-G610 as used in RK3588 */ +#define PROP(name, value) ((name << 2) | 2), value +static const uint32_t gpu_props[] = { + PROP(KBASE_GPUPROP_RAW_GPU_ID, 0xa8670000), + PROP(KBASE_GPUPROP_PRODUCT_ID, 0xa867), + PROP(KBASE_GPUPROP_RAW_SHADER_PRESENT, 0x50005), + PROP(KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, 0xc1ffff9e), + PROP(KBASE_GPUPROP_TLS_ALLOC, 0x800), + PROP(KBASE_GPUPROP_RAW_TILER_FEATURES, 0x809), +}; +#undef PROP + +#define NOOP_COOKIE_ALLOC 0x41000 +#define NOOP_COOKIE_USER_IO 0x42000 +#define NOOP_COOKIE_MEM_ALLOC 0x43000 + +static int +kbase_ioctl(int fd, unsigned long request, ...) +{ + int ret = 0; + + va_list args; + + va_start(args, request); + void *ptr = va_arg(args, void *); + va_end(args); + + switch (request) { + case KBASE_IOCTL_GET_GPUPROPS: { + struct kbase_ioctl_get_gpuprops *props = ptr; + + if (props->size) + memcpy((void *)(uintptr_t) props->buffer, + gpu_props, MIN2(props->size, sizeof(gpu_props))); + + ret = sizeof(gpu_props); + break; + } + + case KBASE_IOCTL_MEM_ALLOC: { + union kbase_ioctl_mem_alloc *alloc = ptr; + + alloc->out.gpu_va = NOOP_COOKIE_ALLOC; + alloc->out.flags = BASE_MEM_SAME_VA; + break; + } + + case KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6: { + union kbase_ioctl_cs_queue_group_create_1_6 *create = ptr; + + // TODO: Don't return duplicates? + create->out.group_handle = 0; + create->out.group_uid = 1; + break; + } + + case KBASE_IOCTL_CS_TILER_HEAP_INIT: { + union kbase_ioctl_cs_tiler_heap_init *init = ptr; + + /* The values don't really matter, the CPU has no business in accessing + * these. */ + init->out.gpu_heap_va = 0x60000; + init->out.first_chunk_va = 0x61000; + break; + } + + case KBASE_IOCTL_CS_QUEUE_BIND: { + union kbase_ioctl_cs_queue_bind *bind = ptr; + bind->out.mmap_handle = NOOP_COOKIE_USER_IO; + break; + } + + case KBASE_IOCTL_MEM_IMPORT: { + union kbase_ioctl_mem_import *import = ptr; + + if (import->in.type != BASE_MEM_IMPORT_TYPE_UMM) { + ret = -1; + errno = EINVAL; + break; + } + + int *fd = (int *)(uintptr_t) import->in.phandle; + + off_t size = lseek(*fd, 0, SEEK_END); + + import->out.flags = BASE_MEM_NEED_MMAP; + import->out.gpu_va = NOOP_COOKIE_MEM_ALLOC; + import->out.va_pages = DIV_ROUND_UP(size, 4096); + } + + case KBASE_IOCTL_SET_FLAGS: + case KBASE_IOCTL_MEM_EXEC_INIT: + case KBASE_IOCTL_MEM_JIT_INIT: + case KBASE_IOCTL_CS_QUEUE_REGISTER: + case KBASE_IOCTL_CS_QUEUE_KICK: + case KBASE_IOCTL_CS_TILER_HEAP_TERM: + case KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE: + case KBASE_IOCTL_MEM_SYNC: + break; + + default: + ret = -1; + errno = ENOSYS; + } + + return ret; +} + +static void * +kbase_mmap(void *addr, size_t length, int prot, int flags, + int fd, off_t offset) +{ + switch (offset) { + case BASE_MEM_MAP_TRACKING_HANDLE: + case BASEP_MEM_CSF_USER_REG_PAGE_HANDLE: + case NOOP_COOKIE_ALLOC: + case NOOP_COOKIE_USER_IO: + case NOOP_COOKIE_MEM_ALLOC: + return mmap(NULL, length, prot, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + default: + errno = ENOSYS; + return MAP_FAILED; + } +} +#endif diff --git a/src/panfrost/base/pan_cache.h b/src/panfrost/base/pan_cache.h new file mode 100644 index 00000000000..ad5af0c7098 --- /dev/null +++ b/src/panfrost/base/pan_cache.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to 
use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef PAN_CACHE_H +#define PAN_CACHE_H + +#ifdef __aarch64__ + +static void +cache_clean(volatile void *addr) +{ + __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); +} + +static void +cache_invalidate(volatile void *addr) +{ + __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); +} + +typedef void (*cacheline_op)(volatile void *addr); + +#define CACHELINE_SIZE 64 + +static void +cacheline_op_range(volatile void *start, size_t length, cacheline_op op) +{ + volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); + volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); + for (; ptr < end; ptr += CACHELINE_SIZE) + op(ptr); +} + +static void +cache_clean_range(volatile void *start, size_t length) +{ + /* TODO: Do an invalidate at the start of the range? */ + cacheline_op_range(start, length, cache_clean); +} + +static void +cache_invalidate_range(volatile void *start, size_t length) +{ + cacheline_op_range(start, length, cache_invalidate); +} + +#endif /* __aarch64__ */ + +/* The #ifdef covers both 32-bit and 64-bit ARM */ +#ifdef __ARM_ARCH +static void +cache_barrier(void) +{ + __asm__ volatile ("dsb sy" ::: "memory"); +} + +static void +memory_barrier(void) +{ + __asm__ volatile ("dmb sy" ::: "memory"); +} +#else + +/* TODO: How to do cache barriers when emulated? */ +static void +cache_barrier(void) +{ +} + +static void +memory_barrier(void) +{ +} +#endif +#endif diff --git a/src/panfrost/base/pan_vX_base.c b/src/panfrost/base/pan_vX_base.c new file mode 100644 index 00000000000..99bd356c536 --- /dev/null +++ b/src/panfrost/base/pan_vX_base.c @@ -0,0 +1,1825 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_VALGRIND +#include +#else +#define RUNNING_ON_VALGRIND 0 +#endif + +#include "util/macros.h" +#include "util/list.h" +#include "util/u_atomic.h" +#include "util/os_file.h" + +#include "pan_base.h" +#include "pan_cache.h" + +#include "drm-uapi/panfrost_drm.h" + +#define PAN_BASE_API (PAN_BASE_VER & 0xff) +#if (PAN_BASE_VER & 0x100) == 0x100 +#define PAN_BASE_NOOP +#endif + +#if PAN_BASE_API >= 2 +#include "csf/mali_gpu_csf_registers.h" + +#define MALI_USE_CSF 1 +#endif + +#include "mali_kbase_gpuprops.h" + +#ifndef PAN_BASE_NOOP +#define kbase_mmap mmap +#endif + +#if PAN_BASE_API >= 1 +#include "mali_base_kernel.h" +#include "mali_kbase_ioctl.h" + +#ifdef PAN_BASE_NOOP +#include "pan_base_noop.h" +#else +#define kbase_ioctl ioctl +#endif +#else + +#include "old/mali-ioctl.h" +#include "old/mali-ioctl-midgard.h" +#include "old/mali-props.h" +#endif + +#define LOG(fmt, ...) do { \ + if (k->verbose) { \ + struct timespec tp; \ + clock_gettime(CLOCK_MONOTONIC_RAW, &tp); \ + printf("%"PRIu64".%09li\t" fmt, (uint64_t) tp.tv_sec, tp.tv_nsec __VA_OPT__(,) __VA_ARGS__); \ + } \ + } while (0) + +#if PAN_BASE_API == 0 +static int +kbase_ioctl(int fd, unsigned long request, ...) +{ + int ioc_size = _IOC_SIZE(request); + + assert(ioc_size); + + va_list args; + + va_start(args, request); + int *ptr = va_arg(args, void *); + va_end(args); + + *ptr = (_IOC_TYPE(request) - 0x80) * 256 + _IOC_NR(request); + + int ret = ioctl(fd, request, ptr); + if (ret) + return ret; + + int r = *ptr; + switch (r) { + case MALI_ERROR_OUT_OF_GPU_MEMORY: + errno = ENOSPC; + return -1; + case MALI_ERROR_OUT_OF_MEMORY: + errno = ENOMEM; + return -1; + case MALI_ERROR_FUNCTION_FAILED: + errno = EINVAL; + return -1; + default: + return 0; + } +} +#endif + +#if PAN_BASE_API >= 1 +static bool +kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + int i = 0; + uint64_t x = 0; + while (i < k->gpuprops_size) { + x = 0; + memcpy(&x, k->gpuprops + i, 4); + i += 4; + + int size = 1 << (x & 3); + int this_name = x >> 2; + + x = 0; + memcpy(&x, k->gpuprops + i, size); + i += size; + + if (this_name == name) { + *value = x; + return true; + } + } + + return false; +} +#else +static bool +kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + struct kbase_ioctl_gpu_props_reg_dump *props = k->gpuprops; + + switch (name) { + case KBASE_GPUPROP_PRODUCT_ID: + *value = props->core.product_id; + return true; + case KBASE_GPUPROP_RAW_SHADER_PRESENT: + *value = props->raw.shader_present; + return true; + case KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0: + *value = props->raw.texture_features[0]; + return true; + case KBASE_GPUPROP_RAW_TILER_FEATURES: + *value = props->raw.tiler_features; + return true; + case KBASE_GPUPROP_RAW_GPU_ID: + *value = props->raw.gpu_id; + return true; + default: + return false; + } +} +#endif + +static bool +alloc_handles(kbase k) +{ + util_dynarray_init(&k->gem_handles, NULL); + return true; +} + +static bool +free_handles(kbase k) +{ + util_dynarray_fini(&k->gem_handles); + return true; +} + +static bool +set_flags(kbase k) +{ + struct kbase_ioctl_set_flags 
flags = { + .create_flags = 0 + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_SET_FLAGS, &flags); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); + return false; + } + return true; +} + +static bool +mmap_tracking(kbase k) +{ + k->tracking_region = kbase_mmap(NULL, k->page_size, PROT_NONE, + MAP_SHARED, k->fd, + BASE_MEM_MAP_TRACKING_HANDLE); + + if (k->tracking_region == MAP_FAILED) { + perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); + k->tracking_region = NULL; + return false; + } + return true; +} + +static bool +munmap_tracking(kbase k) +{ + if (k->tracking_region) + return munmap(k->tracking_region, k->page_size) == 0; + return true; +} + +#if PAN_BASE_API >= 1 +static bool +get_gpuprops(kbase k) +{ + struct kbase_ioctl_get_gpuprops props = { 0 }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); + return false; + } else if (!ret) { + fprintf(stderr, "GET_GPUPROPS returned zero size\n"); + return false; + } + + k->gpuprops_size = ret; + k->gpuprops = calloc(k->gpuprops_size, 1); + + props.size = k->gpuprops_size; + props.buffer = (uint64_t)(uintptr_t) k->gpuprops; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); + return false; + } + + return true; +} +#else +static bool +get_gpuprops(kbase k) +{ + k->gpuprops = calloc(1, sizeof(struct kbase_ioctl_gpu_props_reg_dump)); + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GPU_PROPS_REG_DUMP, k->gpuprops); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GPU_PROPS_REG_DUMP)"); + return false; + } + + return true; +} +#endif + +static bool +free_gpuprops(kbase k) +{ + free(k->gpuprops); + return true; +} + +#if PAN_BASE_API >= 2 +static bool +mmap_user_reg(kbase k) +{ + k->csf_user_reg = kbase_mmap(NULL, k->page_size, PROT_READ, + MAP_SHARED, k->fd, + BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); + + if (k->csf_user_reg == MAP_FAILED) { + perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); + k->csf_user_reg = NULL; + return false; + } + return true; +} + +static bool +munmap_user_reg(kbase k) +{ + if (k->csf_user_reg) + return munmap(k->csf_user_reg, k->page_size) == 0; + return true; +} +#endif + +#if PAN_BASE_API >= 1 +static bool +init_mem_exec(kbase k) +{ + struct kbase_ioctl_mem_exec_init init = { + .va_pages = 0x100000, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); + return false; + } + return true; +} + +static bool +init_mem_jit(kbase k) +{ + struct kbase_ioctl_mem_jit_init init = { + .va_pages = 1 << 25, + .max_allocations = 255, + .phys_pages = 1 << 25, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_JIT_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); + return false; + } + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static struct base_ptr +kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags); + +static bool +alloc_event_mem(kbase k) +{ + k->event_mem = kbase_alloc(k, k->page_size * 2, + PANFROST_BO_NOEXEC, + BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | + BASE_MEM_SAME_VA | BASE_MEM_CSF_EVENT); + k->kcpu_event_mem = (struct base_ptr) { + .cpu = k->event_mem.cpu + k->page_size, + .gpu = k->event_mem.gpu + k->page_size, + }; + return k->event_mem.cpu; +} + +static bool +free_event_mem(kbase k) +{ + if (k->event_mem.cpu) + return munmap(k->event_mem.cpu, 
k->page_size * 2) == 0; + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static bool +cs_group_create(kbase k, struct kbase_context *c) +{ + /* TODO: What about compute-only contexts? */ + union kbase_ioctl_cs_queue_group_create_1_6 create = { + .in = { + /* Mali *still* only supports a single tiler unit */ + .tiler_mask = 1, + .fragment_mask = ~0ULL, + .compute_mask = ~0ULL, + + .cs_min = k->cs_queue_count, + + .priority = 1, + .tiler_max = 1, + .fragment_max = 64, + .compute_max = 64, + } + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); + return false; + } + + c->csg_handle = create.out.group_handle; + c->csg_uid = create.out.group_uid; + + /* Should be at least 1 */ + assert(c->csg_uid); + + return true; +} + +static bool +cs_group_term(kbase k, struct kbase_context *c) +{ + if (!c->csg_uid) + return true; + + struct kbase_ioctl_cs_queue_group_term term = { + .group_handle = c->csg_handle + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); + return false; + } + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static bool +tiler_heap_create(kbase k, struct kbase_context *c) +{ + c->tiler_heap_chunk_size = 1 << 21; /* 2 MB */ + + union kbase_ioctl_cs_tiler_heap_init init = { + .in = { + .chunk_size = c->tiler_heap_chunk_size, + .initial_chunks = 5, + .max_chunks = 200, + .target_in_flight = 65535, + } + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); + return false; + } + + c->tiler_heap_va = init.out.gpu_heap_va; + c->tiler_heap_header = init.out.first_chunk_va; + + return true; +} + +static bool +tiler_heap_term(kbase k, struct kbase_context *c) +{ + if (!c->tiler_heap_va) + return true; + + struct kbase_ioctl_cs_tiler_heap_term term = { + .gpu_heap_va = c->tiler_heap_va + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); + return false; + } + return true; +} +#endif + +typedef bool (* kbase_func)(kbase k); + +struct kbase_op { + kbase_func part; + kbase_func cleanup; + const char *label; +}; + +static struct kbase_op kbase_main[] = { + { alloc_handles, free_handles, "Allocate handle array" }, +#if PAN_BASE_API >= 1 + { set_flags, NULL, "Set flags" }, +#endif + { mmap_tracking, munmap_tracking, "Map tracking handle" }, +#if PAN_BASE_API == 0 + { set_flags, NULL, "Set flags" }, +#endif + { get_gpuprops, free_gpuprops, "Get GPU properties" }, +#if PAN_BASE_API >= 2 + { mmap_user_reg, munmap_user_reg, "Map user register page" }, +#endif +#if PAN_BASE_API >= 1 + { init_mem_exec, NULL, "Initialise EXEC_VA zone" }, + { init_mem_jit, NULL, "Initialise JIT allocator" }, +#endif +#if PAN_BASE_API >= 2 + { alloc_event_mem, free_event_mem, "Allocate event memory" }, +#endif +}; + +static void +kbase_close(kbase k) +{ + while (k->setup_state) { + unsigned i = k->setup_state - 1; + if (kbase_main[i].cleanup) + kbase_main[i].cleanup(k); + --k->setup_state; + } + + pthread_mutex_destroy(&k->handle_lock); + pthread_mutex_destroy(&k->event_read_lock); + pthread_mutex_destroy(&k->event_cnd_lock); + pthread_mutex_destroy(&k->queue_lock); + pthread_cond_destroy(&k->event_cnd); + + close(k->fd); +} + +static bool +kbase_get_pan_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + 
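+   /* Map the DRM_PANFROST_PARAM_* queries onto KBASE_GPUPROP_* names via a
+    * small lookup table; parameters without a table entry (AFBC features,
+    * GPU revision) are handled as special cases in the switch below. */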
unsigned conv[] = { + [DRM_PANFROST_PARAM_GPU_PROD_ID] = KBASE_GPUPROP_PRODUCT_ID, + [DRM_PANFROST_PARAM_SHADER_PRESENT] = KBASE_GPUPROP_RAW_SHADER_PRESENT, + [DRM_PANFROST_PARAM_TEXTURE_FEATURES0] = KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, + [DRM_PANFROST_PARAM_THREAD_TLS_ALLOC] = KBASE_GPUPROP_TLS_ALLOC, + [DRM_PANFROST_PARAM_TILER_FEATURES] = KBASE_GPUPROP_RAW_TILER_FEATURES, + }; + + if (name < ARRAY_SIZE(conv) && conv[name]) + return kbase_get_mali_gpuprop(k, conv[name], value); + + switch (name) { + case DRM_PANFROST_PARAM_AFBC_FEATURES: + *value = 0; + return true; + case DRM_PANFROST_PARAM_GPU_REVISION: { + if (!kbase_get_mali_gpuprop(k, KBASE_GPUPROP_RAW_GPU_ID, value)) + return false; + *value &= 0xffff; + return true; + } + default: + return false; + } +} + +static void +kbase_free(kbase k, base_va va) +{ + struct kbase_ioctl_mem_free f = { + .gpu_addr = va + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_FREE, &f); + + if (ret == -1) + perror("ioctl(KBASE_IOCTL_MEM_FREE)"); +} + +static struct base_ptr +kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags) +{ + struct base_ptr r = {0}; + + unsigned pages = DIV_ROUND_UP(size, k->page_size); + + union kbase_ioctl_mem_alloc a = { + .in = { + .va_pages = pages, + .commit_pages = pages, + } + }; + + size_t alloc_size = size; + unsigned flags = mali_flags; + bool exec_align = false; + + if (!flags) { + flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | + BASE_MEM_SAME_VA; + + /* Add COHERENT_LOCAL to keep GPU cores coherent with each + * other. */ + if (PAN_BASE_API >= 1) + flags |= BASE_MEM_COHERENT_LOCAL; + } + + if (pan_flags & PANFROST_BO_HEAP) { + size_t align_size = 2 * 1024 * 1024 / k->page_size; /* 2 MB */ + + a.in.va_pages = ALIGN_POT(a.in.va_pages, align_size); + a.in.commit_pages = 0; + a.in.extension = align_size; + flags |= BASE_MEM_GROW_ON_GPF; + } + +#if PAN_BASE_API >= 1 + if (pan_flags & MALI_BO_CACHED_CPU) + flags |= BASE_MEM_CACHED_CPU; +#endif + +#if PAN_BASE_API >= 2 + if (pan_flags & MALI_BO_UNCACHED_GPU) + flags |= BASE_MEM_UNCACHED_GPU; +#endif + + if (!(pan_flags & PANFROST_BO_NOEXEC)) { + /* Using SAME_VA for executable BOs would make it too likely + * for a blend shader to end up on the wrong side of a 4 GB + * boundary. */ + flags |= BASE_MEM_PROT_GPU_EX; + flags &= ~(BASE_MEM_PROT_GPU_WR | BASE_MEM_SAME_VA); + + if (PAN_BASE_API == 0) { + /* Assume 4K pages */ + a.in.va_pages = 0x1000; /* Align shader BOs to 16 MB */ + size = 1 << 26; /* Four times the alignment */ + exec_align = true; + } + } + + a.in.flags = flags; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_ALLOC, &a); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); + return r; + } + + // TODO: Is this always true, even in the face of multithreading? + if (PAN_BASE_API == 0) + a.out.gpu_va = 0x41000; + + if ((flags & BASE_MEM_SAME_VA) && + !((a.out.flags & BASE_MEM_SAME_VA) && + a.out.gpu_va < 0x80000)) { + + fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", + (uint64_t) a.out.flags, (uint64_t) a.out.gpu_va); + errno = EINVAL; + return r; + } + + void *ptr = kbase_mmap(NULL, size, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, a.out.gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU BO)"); + kbase_free(k, a.out.gpu_va); + return r; + } + + uint64_t gpu_va = (a.out.flags & BASE_MEM_SAME_VA) ? 
+ (uintptr_t) ptr : a.out.gpu_va; + + if (exec_align) { + gpu_va = ALIGN_POT(gpu_va, 1 << 24); + + ptr = kbase_mmap(NULL, alloc_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU EXEC BO)"); + kbase_free(k, gpu_va); + return r; + } + } + + r.cpu = ptr; + r.gpu = gpu_va; + + return r; +} + +static int +kbase_import_dmabuf(kbase k, int fd) +{ + int ret; + + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + kbase_handle *handles = util_dynarray_begin(&k->gem_handles); + + for (unsigned i = 0; i < size; ++i) { + kbase_handle h = handles[i]; + + if (h.fd < 0) + continue; + + ret = os_same_file_description(h.fd, fd); + + if (ret == 0) { + pthread_mutex_unlock(&k->handle_lock); + return i; + } else if (ret < 0) { + printf("error in os_same_file_description(%i, %i)\n", h.fd, fd); + } + } + + int dup = os_dupfd_cloexec(fd); + + union kbase_ioctl_mem_import import = { + .in = { + .phandle = (uintptr_t) &dup, + .type = BASE_MEM_IMPORT_TYPE_UMM, + /* Usage flags: CPU/GPU reads/writes */ + .flags = 0xf, + } + }; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_IMPORT, &import); + + int handle; + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_IMPORT)"); + handle = -1; + } else if (import.out.flags & BASE_MEM_NEED_MMAP) { + uint64_t va = (uintptr_t) kbase_mmap(NULL, import.out.va_pages * k->page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, k->fd, import.out.gpu_va); + + if (va == (uintptr_t) MAP_FAILED) { + perror("mmap(IMPORTED BO)"); + handle = -1; + } else { + handle = kbase_alloc_gem_handle_locked(k, va, dup); + } + } else { + handle = kbase_alloc_gem_handle_locked(k, import.out.gpu_va, dup); + } + + pthread_mutex_unlock(&k->handle_lock); + + return handle; +} + +static void * +kbase_mmap_import(kbase k, base_va va, size_t size) +{ + return kbase_mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, k->fd, va); +} + +struct kbase_fence { + struct list_head link; + + unsigned slot; + uint64_t value; +}; + +struct kbase_syncobj { + struct list_head link; + + struct list_head fences; +}; + +static struct kbase_syncobj * +kbase_syncobj_create(kbase k) +{ + struct kbase_syncobj *o = calloc(1, sizeof(*o)); + list_inithead(&o->fences); + pthread_mutex_lock(&k->queue_lock); + list_add(&o->link, &k->syncobjs); + pthread_mutex_unlock(&k->queue_lock); + return o; +} + +static void +kbase_syncobj_destroy(kbase k, struct kbase_syncobj *o) +{ + pthread_mutex_lock(&k->queue_lock); + list_del(&o->link); + pthread_mutex_unlock(&k->queue_lock); + + list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { + list_del(&fence->link); + free(fence); + } + + free(o); +} + +static void +kbase_syncobj_add_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) +{ + struct kbase_fence *fence = calloc(1, sizeof(*fence)); + + fence->slot = slot; + fence->value = value; + + list_add(&fence->link, &o->fences); +} + +static void +kbase_syncobj_update_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) +{ + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { + if (fence->slot == slot) { + if (value > fence->value) + fence->value = value; + + return; + } + } + + kbase_syncobj_add_fence(o, slot, value); +} + +static struct kbase_syncobj * +kbase_syncobj_dup(kbase k, struct kbase_syncobj *o) +{ + struct kbase_syncobj *dup = kbase_syncobj_create(k); + + pthread_mutex_lock(&k->queue_lock); + + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) + 
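+      /* Copy every (slot, seqnum) fence from the source syncobj into the
+       * duplicate while queue_lock is held. */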
kbase_syncobj_add_fence(dup, fence->slot, fence->value); + + pthread_mutex_unlock(&k->queue_lock); + + return dup; +} + +static void +kbase_syncobj_update(kbase k, struct kbase_syncobj *o) +{ + list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { + uint64_t value = k->event_slots[fence->slot].last; + + if (value > fence->value) { + LOG("syncobj %p slot %u value %"PRIu64" vs %"PRIu64"\n", + o, fence->slot, fence->value, value); + + list_del(&fence->link); + free(fence); + } + } +} + +static bool +kbase_syncobj_wait(kbase k, struct kbase_syncobj *o) +{ + if (list_is_empty(&o->fences)) { + LOG("syncobj has no fences\n"); + return true; + } + + struct kbase_wait_ctx wait = kbase_wait_init(k, 1 * 1000000000LL); + + while (kbase_wait_for_event(&wait)) { + kbase_syncobj_update(k, o); + + if (list_is_empty(&o->fences)) { + kbase_wait_fini(wait); + return true; + } + } + + kbase_wait_fini(wait); + + fprintf(stderr, "syncobj %p wait timeout\n", o); + return false; +} + +static bool +kbase_poll_event(kbase k, int64_t timeout_ns) +{ + struct pollfd pfd = { + .fd = k->fd, + .events = POLLIN, + }; + + struct timespec t = { + .tv_sec = timeout_ns / 1000000000, + .tv_nsec = timeout_ns % 1000000000, + }; + + int ret = ppoll(&pfd, 1, &t, NULL); + + if (ret == -1 && errno != EINTR) + perror("poll(mali fd)"); + + LOG("poll returned %i\n", pfd.revents); + + return ret != 0; +} + +#if PAN_BASE_API < 2 +static bool +kbase_handle_events(kbase k) +{ + struct base_jd_event_v2 event; + bool ret = true; + + for (;;) { + int ret = read(k->fd, &event, sizeof(event)); + + if (ret == -1) { + if (errno == EAGAIN) { + return true; + } else { + perror("read(mali fd)"); + return false; + } + } + + if (event.event_code != BASE_JD_EVENT_DONE) { + fprintf(stderr, "Atom %i reported event 0x%x!\n", + event.atom_number, event.event_code); + ret = false; + } + + pthread_mutex_lock(&k->handle_lock); + + k->event_slots[event.atom_number].last = event.udata.blob[0]; + + unsigned size = util_dynarray_num_elements(&k->gem_handles, + kbase_handle); + kbase_handle *handle_data = util_dynarray_begin(&k->gem_handles); + + struct util_dynarray *handles = k->atom_bos + event.atom_number; + + util_dynarray_foreach(handles, int32_t, h) { + if (*h >= size) + continue; + assert(handle_data[*h].use_count); + --handle_data[*h].use_count; + } + util_dynarray_fini(handles); + + pthread_mutex_unlock(&k->handle_lock); + } + + return ret; +} + +#else + +static bool +kbase_read_event(kbase k) +{ + struct base_csf_notification event; + int ret = read(k->fd, &event, sizeof(event)); + + if (ret == -1) { + if (errno == EAGAIN) { + return true; + } else { + perror("read(mali_fd)"); + return false; + } + } + + if (ret != sizeof(event)) { + fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", + ret, (int) sizeof(event)); + return false; + } + + switch (event.type) { + case BASE_CSF_NOTIFICATION_EVENT: + LOG("Notification event!\n"); + return true; + + case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: + break; + + case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: + fprintf(stderr, "No event from mali_fd!\n"); + return true; + + default: + fprintf(stderr, "Unknown event type!\n"); + return true; + } + + struct base_gpu_queue_group_error e = event.payload.csg_error.error; + + switch (e.error_type) { + case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue group error: status 0x%x " + "sideband 0x%"PRIx64"\n", + e.payload.fatal_group.status, + (uint64_t) 
e.payload.fatal_group.sideband); + break; + } + case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { + unsigned queue = e.payload.fatal_queue.csi_index; + + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue %i error: status 0x%x " + "sideband 0x%"PRIx64"\n", + queue, e.payload.fatal_queue.status, + (uint64_t) e.payload.fatal_queue.sideband); + + /* TODO: Decode the instruct that it got stuck at */ + + break; + } + + case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: + fprintf(stderr, "Command stream timeout!\n"); + break; + case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: + fprintf(stderr, "Command stream OOM!\n"); + break; + default: + fprintf(stderr, "Unknown error type!\n"); + } + + return false; +} + +static void +kbase_update_queue_callbacks(kbase k, + struct kbase_event_slot *slot, + uint64_t seqnum) +{ + struct kbase_sync_link **list = &slot->syncobjs; + struct kbase_sync_link **back = slot->back; + + while (*list) { + struct kbase_sync_link *link = *list; + + LOG("seq %"PRIu64" %"PRIu64"\n", seqnum, link->seqnum); + + /* Items in the list should be in order, there is no need to + * check any more if we can't process this link yet. */ + if (seqnum <= link->seqnum) + break; + + LOG("done, calling %p(%p)\n", link->callback, link->data); + link->callback(link->data); + *list = link->next; + if (&link->next == back) + slot->back = list; + free(link); + } +} + +static bool +kbase_handle_events(kbase k) +{ +#ifdef PAN_BASE_NOOP + return true; +#endif + + /* This will clear the event count, so there's no need to do it in a + * loop. */ + bool ret = kbase_read_event(k); + + uint64_t *event_mem = k->event_mem.cpu; + + pthread_mutex_lock(&k->queue_lock); + + for (unsigned i = 0; i < k->event_slot_usage; ++i) { + uint64_t seqnum = event_mem[i * 2]; + uint64_t cmp = k->event_slots[i].last; + + LOG("MAIN SEQ %"PRIu64" > %"PRIu64"?\n", seqnum, cmp); + + if (seqnum < cmp) { + if (false) + fprintf(stderr, "seqnum at offset %i went backward " + "from %"PRIu64" to %"PRIu64"!\n", + i, cmp, seqnum); + } else /*if (seqnum > cmp)*/ { + kbase_update_queue_callbacks(k, &k->event_slots[i], + seqnum); + } + + /* TODO: Atomic operations? */ + k->event_slots[i].last = seqnum; + } + + pthread_mutex_unlock(&k->queue_lock); + + return ret; +} + +#endif + +#if PAN_BASE_API < 2 +static uint8_t +kbase_latest_slot(uint8_t a, uint8_t b, uint8_t newest) +{ + /* If a == 4 and newest == 5, a will become 255 */ + a -= newest; + b -= newest; + a = MAX2(a, b); + a += newest; + return a; +} + +static int +kbase_submit(kbase k, uint64_t va, unsigned req, + struct kbase_syncobj *o, + int32_t *handles, unsigned num_handles) +{ + struct util_dynarray buf; + util_dynarray_init(&buf, NULL); + + memcpy(util_dynarray_resize(&buf, int32_t, num_handles), + handles, num_handles * sizeof(int32_t)); + + pthread_mutex_lock(&k->handle_lock); + + unsigned slot = (req & PANFROST_JD_REQ_FS) ? 0 : 1; + unsigned dep_slots[KBASE_SLOT_COUNT]; + + uint8_t nr = k->atom_number++; + + struct base_jd_atom_v2 atom = { + .jc = va, + .atom_number = nr, + .udata.blob[0] = k->job_seq++, + }; + + for (unsigned i = 0; i < KBASE_SLOT_COUNT; ++i) + dep_slots[i] = nr; + + /* Make sure that we haven't taken an atom that's already in use. 
*/ + assert(!k->atom_bos[nr].data); + k->atom_bos[atom.atom_number] = buf; + + unsigned handle_buf_size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + kbase_handle *handle_buf = util_dynarray_begin(&k->gem_handles); + + struct util_dynarray extres; + util_dynarray_init(&extres, NULL); + + /* Mark the BOs as in use */ + for (unsigned i = 0; i < num_handles; ++i) { + int32_t h = handles[i]; + assert(h < handle_buf_size); + assert(handle_buf[h].use_count < 255); + + /* Implicit sync */ + if (handle_buf[h].use_count) + for (unsigned s = 0; s < KBASE_SLOT_COUNT; ++s) + dep_slots[s] = + kbase_latest_slot(dep_slots[s], + handle_buf[h].last_access[s], + nr); + + handle_buf[h].last_access[slot] = nr; + ++handle_buf[h].use_count; + + if (handle_buf[h].fd != -1) + util_dynarray_append(&extres, base_va, handle_buf[h].va); + } + + pthread_mutex_unlock(&k->handle_lock); + + /* TODO: Better work out the difference between handle_lock and + * queue_lock. */ + if (o) { + pthread_mutex_lock(&k->queue_lock); + kbase_syncobj_update_fence(o, nr, atom.udata.blob[0]); + pthread_mutex_unlock(&k->queue_lock); + } + + assert(KBASE_SLOT_COUNT == 2); + if (dep_slots[0] != nr) { + atom.pre_dep[0].atom_id = dep_slots[0]; + /* TODO: Use data dependencies? */ + atom.pre_dep[0].dependency_type = BASE_JD_DEP_TYPE_ORDER; + } + if (dep_slots[1] != nr) { + atom.pre_dep[1].atom_id = dep_slots[1]; + atom.pre_dep[1].dependency_type = BASE_JD_DEP_TYPE_ORDER; + } + + if (extres.size) { + atom.core_req |= BASE_JD_REQ_EXTERNAL_RESOURCES; + atom.nr_extres = util_dynarray_num_elements(&extres, base_va); + atom.extres_list = (uintptr_t) util_dynarray_begin(&extres); + } + + if (req & PANFROST_JD_REQ_FS) + atom.core_req |= BASE_JD_REQ_FS; + else + atom.core_req |= BASE_JD_REQ_CS | BASE_JD_REQ_T; + + struct kbase_ioctl_job_submit submit = { + .nr_atoms = 1, + .stride = sizeof(atom), + .addr = (uintptr_t) &atom, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_JOB_SUBMIT, &submit); + + util_dynarray_fini(&extres); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_JOB_SUBMIT)"); + return -1; + } + + return atom.atom_number; +} + +#else +static struct kbase_context * +kbase_context_create(kbase k) +{ + struct kbase_context *c = calloc(1, sizeof(*c)); + + if (!cs_group_create(k, c)) { + free(c); + return NULL; + } + + if (!tiler_heap_create(k, c)) { + cs_group_term(k, c); + free(c); + return NULL; + } + + return c; +} + +static void +kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx); + +static void +kbase_context_destroy(kbase k, struct kbase_context *ctx) +{ + kbase_kcpu_queue_destroy(k, ctx); + tiler_heap_term(k, ctx); + cs_group_term(k, ctx); + free(ctx); +} + +static bool +kbase_context_recreate(kbase k, struct kbase_context *ctx) +{ + kbase_kcpu_queue_destroy(k, ctx); + tiler_heap_term(k, ctx); + cs_group_term(k, ctx); + + if (!cs_group_create(k, ctx)) { + free(ctx); + return false; + } + + if (!tiler_heap_create(k, ctx)) { + free(ctx); + return false; + } + + return true; +} + +static struct kbase_cs +kbase_cs_bind_noevent(kbase k, struct kbase_context *ctx, + base_va va, unsigned size, unsigned csi) +{ + struct kbase_cs cs = { + .ctx = ctx, + .va = va, + .size = size, + .csi = csi, + .latest_flush = (uint32_t *)k->csf_user_reg, + }; + + struct kbase_ioctl_cs_queue_register reg = { + .buffer_gpu_addr = va, + .buffer_size = size, + .priority = 1, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_REGISTER, ®); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)"); + return cs; + } + + 
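+   /* The ring buffer is registered; now bind it to the queue group. The
+    * returned mmap_handle is used below as the mmap offset for the user
+    * I/O pages (CS_INSERT/CS_EXTRACT/CS_ACTIVE). */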
union kbase_ioctl_cs_queue_bind bind = { + .in = { + .buffer_gpu_addr = va, + .group_handle = ctx->csg_handle, + .csi_index = csi, + } + }; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)"); + // hack + cs.user_io = (void *)1; + return cs; + } + + cs.user_io = + kbase_mmap(NULL, + k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, bind.out.mmap_handle); + + if (cs.user_io == MAP_FAILED) { + perror("mmap(CS USER IO)"); + cs.user_io = NULL; + } + + return cs; +} + +static struct kbase_cs +kbase_cs_bind(kbase k, struct kbase_context *ctx, + base_va va, unsigned size) +{ + struct kbase_cs cs = kbase_cs_bind_noevent(k, ctx, va, size, ctx->num_csi++); + + // TODO: Fix this problem properly + if (k->event_slot_usage >= 256) { + fprintf(stderr, "error: Too many contexts created!\n"); + + /* *very* dangerous, but might just work */ + --k->event_slot_usage; + } + + // TODO: This is a misnomer... it isn't a byte offset + cs.event_mem_offset = k->event_slot_usage++; + k->event_slots[cs.event_mem_offset].back = + &k->event_slots[cs.event_mem_offset].syncobjs; + + uint64_t *event_data = k->event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; + + /* We use the "Higher" wait condition, so initialise to 1 to allow + * waiting before writing... */ + event_data[0] = 1; + /* And reset the error field to 0, to avoid INHERITing faults */ + event_data[1] = 0; + + /* Just a zero-init is fine... reads and writes are always paired */ + uint64_t *kcpu_data = k->kcpu_event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; + kcpu_data[0] = 0; + kcpu_data[1] = 0; + + /* To match the event data */ + k->event_slots[cs.event_mem_offset].last = 1; + k->event_slots[cs.event_mem_offset].last_submit = 1; + + return cs; +} + +static void +kbase_cs_term(kbase k, struct kbase_cs *cs) +{ + if (cs->user_io) { + LOG("unmapping %p user_io %p\n", cs, cs->user_io); + munmap(cs->user_io, + k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES); + } + + struct kbase_ioctl_cs_queue_terminate term = { + .buffer_gpu_addr = cs->va, + }; + + kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_TERMINATE, &term); + + pthread_mutex_lock(&k->queue_lock); + kbase_update_queue_callbacks(k, &k->event_slots[cs->event_mem_offset], + ~0ULL); + + k->event_slots[cs->event_mem_offset].last = ~0ULL; + + /* Make sure that no syncobjs are referencing this CS */ + list_for_each_entry(struct kbase_syncobj, o, &k->syncobjs, link) + kbase_syncobj_update(k, o); + + + k->event_slots[cs->event_mem_offset].last = 0; + pthread_mutex_unlock(&k->queue_lock); +} + +static void +kbase_cs_rebind(kbase k, struct kbase_cs *cs) +{ + struct kbase_cs new; + new = kbase_cs_bind_noevent(k, cs->ctx, cs->va, cs->size, cs->csi); + + cs->user_io = new.user_io; + LOG("remapping %p user_io %p\n", cs, cs->user_io); + + fprintf(stderr, "bound csi %i again\n", cs->csi); +} + +static bool +kbase_cs_kick(kbase k, struct kbase_cs *cs) +{ + struct kbase_ioctl_cs_queue_kick kick = { + .buffer_gpu_addr = cs->va, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); + return false; + } + + return true; +} + +#define CS_RING_DOORBELL(cs) \ + *((uint32_t *)(cs->user_io)) = 1 + +#define CS_READ_REGISTER(cs, r) \ + *((uint64_t *)(cs->user_io + 4096 * 2 + r)) + +#define CS_WRITE_REGISTER(cs, r, v) \ + *((uint64_t *)(cs->user_io + 4096 + r)) = v + +static bool +kbase_cs_submit(kbase k, struct kbase_cs *cs, uint64_t 
insert_offset, + struct kbase_syncobj *o, uint64_t seqnum) +{ + LOG("submit %p, seq %"PRIu64", insert %"PRIu64" -> %"PRIu64"\n", + cs, seqnum, cs->last_insert, insert_offset); + + if (!cs->user_io) + return false; + + if (insert_offset == cs->last_insert) + return true; + +#ifndef PAN_BASE_NOOP + struct kbase_event_slot *slot = + &k->event_slots[cs->event_mem_offset]; + + pthread_mutex_lock(&k->queue_lock); + slot->last_submit = seqnum + 1; + + if (o) + kbase_syncobj_update_fence(o, cs->event_mem_offset, seqnum); + pthread_mutex_unlock(&k->queue_lock); +#endif + + memory_barrier(); + + bool active = CS_READ_REGISTER(cs, CS_ACTIVE); + LOG("active is %i\n", active); + + CS_WRITE_REGISTER(cs, CS_INSERT, insert_offset); + cs->last_insert = insert_offset; + + if (false /*active*/) { + memory_barrier(); + CS_RING_DOORBELL(cs); + memory_barrier(); + + active = CS_READ_REGISTER(cs, CS_ACTIVE); + LOG("active is now %i\n", active); + } else { + kbase_cs_kick(k, cs); + } + + return true; +} + +static bool +kbase_cs_wait(kbase k, struct kbase_cs *cs, uint64_t extract_offset, + struct kbase_syncobj *o) +{ + if (!cs->user_io) + return false; + + if (kbase_syncobj_wait(k, o)) + return true; + + uint64_t e = CS_READ_REGISTER(cs, CS_EXTRACT); + unsigned a = CS_READ_REGISTER(cs, CS_ACTIVE); + + fprintf(stderr, "CSI %i CS_EXTRACT (%"PRIu64") != %"PRIu64", " + "CS_ACTIVE (%i)\n", + cs->csi, e, extract_offset, a); + + fprintf(stderr, "fences:\n"); + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { + fprintf(stderr, " slot %i: seqnum %"PRIu64"\n", + fence->slot, fence->value); + } + + return false; +} + +static bool +kbase_kcpu_queue_create(kbase k, struct kbase_context *ctx) +{ +#ifdef PAN_BASE_NOOP + return false; +#endif + + if (ctx->kcpu_init) + return true; + + struct kbase_ioctl_kcpu_queue_new create = {0}; + + int ret; + ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_CREATE, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_CREATE)"); + return false; + } + + ctx->kcpu_queue = create.id; + ctx->kcpu_init = true; + return true; +} + +static void +kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx) +{ + if (!ctx->kcpu_init) + return; + + struct kbase_ioctl_kcpu_queue_delete destroy = { + .id = ctx->kcpu_queue, + }; + + int ret; + ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_DELETE, &destroy); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_DELETE)"); + } + + ctx->kcpu_init = false; +} + +static bool +kbase_kcpu_command(kbase k, struct kbase_context *ctx, struct base_kcpu_command *cmd) +{ + int err; + bool ret = true; + + if (!kbase_kcpu_queue_create(k, ctx)) + return false; + + struct kbase_ioctl_kcpu_queue_enqueue enqueue = { + .addr = (uintptr_t) cmd, + .nr_commands = 1, + .id = ctx->kcpu_queue, + }; + + err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); + if (err != -1) + return ret; + + /* If the enqueue failed, probably we hit the limit of enqueued + * commands (256), wait a bit and try again. 
+ */ + + struct kbase_wait_ctx wait = kbase_wait_init(k, 1000000000); + while (kbase_wait_for_event(&wait)) { + err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); + if (err != -1) + break; + + if (errno != EBUSY) { + ret = false; + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_ENQUEUE"); + break; + } + } + kbase_wait_fini(wait); + + return ret; +} + +static int +kbase_kcpu_fence_export(kbase k, struct kbase_context *ctx) +{ + struct base_fence fence = { + .basep.fd = -1, + }; + + struct base_kcpu_command fence_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + .info.fence.fence = (uintptr_t) &fence, + }; + + return kbase_kcpu_command(k, ctx, &fence_cmd) ? fence.basep.fd : -1; +} + +static bool +kbase_kcpu_fence_import(kbase k, struct kbase_context *ctx, int fd) +{ + struct base_kcpu_command fence_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + .info.fence.fence = (uintptr_t) &(struct base_fence) { + .basep.fd = fd, + }, + }; + + return kbase_kcpu_command(k, ctx, &fence_cmd); +} + +static bool +kbase_kcpu_cqs_set(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value) +{ + struct base_kcpu_command set_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + .info.cqs_set_operation = { + .objs = (uintptr_t) &(struct base_cqs_set_operation_info) { + .addr = addr, + .val = value, + .operation = BASEP_CQS_SET_OPERATION_SET, + .data_type = BASEP_CQS_DATA_TYPE_U64, + }, + .nr_objs = 1, + }, + }; + + return kbase_kcpu_command(k, ctx, &set_cmd); +} + +static bool +kbase_kcpu_cqs_wait(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value) +{ + struct base_kcpu_command wait_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + .info.cqs_wait_operation = { + .objs = (uintptr_t) &(struct base_cqs_wait_operation_info) { + .addr = addr, + .val = value, + .operation = BASEP_CQS_WAIT_OPERATION_GT, + .data_type = BASEP_CQS_DATA_TYPE_U64, + }, + .nr_objs = 1, + .inherit_err_flags = 0, + }, + }; + + return kbase_kcpu_command(k, ctx, &wait_cmd); +} +#endif + +// TODO: Only define for CSF kbases? +static bool +kbase_callback_all_queues(kbase k, int32_t *count, + void (*callback)(void *), void *data) +{ + pthread_mutex_lock(&k->queue_lock); + + int32_t queue_count = 0; + + for (unsigned i = 0; i < k->event_slot_usage; ++i) { + struct kbase_event_slot *slot = &k->event_slots[i]; + + /* There is no need to do anything for idle slots */ + if (slot->last == slot->last_submit) + continue; + + struct kbase_sync_link *link = malloc(sizeof(*link)); + *link = (struct kbase_sync_link) { + .next = NULL, + .seqnum = slot->last_submit, + .callback = callback, + .data = data, + }; + + // TODO: Put insertion code into its own function + struct kbase_sync_link **list = slot->back; + slot->back = &link->next; + assert(!*list); + *list = link; + + ++queue_count; + } + + p_atomic_add(count, queue_count); + + pthread_mutex_unlock(&k->queue_lock); + + return queue_count != 0; +} + +static void +kbase_mem_sync(kbase k, base_va gpu, void *cpu, size_t size, + bool invalidate) +{ +#ifdef __aarch64__ + /* Valgrind replaces the operations with DC CVAU, which is not enough + * for CPU<->GPU coherency. The ioctl can be used instead. */ + if (!RUNNING_ON_VALGRIND) { + /* I don't that memory barriers are needed here... having the + * DMB SY before submit should be enough. TODO what about + * dma-bufs? 
*/ + if (invalidate) + cache_invalidate_range(cpu, size); + else + cache_clean_range(cpu, size); + return; + } +#endif + + struct kbase_ioctl_mem_sync sync = { + .handle = gpu, + .user_addr = (uintptr_t) cpu, + .size = size, + .type = invalidate + (PAN_BASE_API == 0 ? 0 : 1), + }; + + int ret; + ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_SYNC, &sync); + if (ret == -1) + perror("ioctl(KBASE_IOCTL_MEM_SYNC)"); +} + +bool +#if defined(PAN_BASE_NOOP) +kbase_open_csf_noop +#elif PAN_BASE_API == 0 +kbase_open_old +#elif PAN_BASE_API == 1 +kbase_open_new +#elif PAN_BASE_API == 2 +kbase_open_csf +#endif +(kbase k) +{ + k->api = PAN_BASE_API; + + pthread_mutex_init(&k->handle_lock, NULL); + pthread_mutex_init(&k->event_read_lock, NULL); + pthread_mutex_init(&k->event_cnd_lock, NULL); + pthread_mutex_init(&k->queue_lock, NULL); + + pthread_condattr_t attr; + pthread_condattr_init(&attr); + pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + pthread_cond_init(&k->event_cnd, &attr); + pthread_condattr_destroy(&attr); + + list_inithead(&k->syncobjs); + + /* For later APIs, we've already checked the version in pan_base.c */ +#if PAN_BASE_API == 0 + struct kbase_ioctl_get_version ver = { 0 }; + kbase_ioctl(k->fd, KBASE_IOCTL_GET_VERSION, &ver); +#endif + + k->close = kbase_close; + + k->get_pan_gpuprop = kbase_get_pan_gpuprop; + k->get_mali_gpuprop = kbase_get_mali_gpuprop; + + k->alloc = kbase_alloc; + k->free = kbase_free; + k->import_dmabuf = kbase_import_dmabuf; + k->mmap_import = kbase_mmap_import; + + k->poll_event = kbase_poll_event; + k->handle_events = kbase_handle_events; + +#if PAN_BASE_API < 2 + k->submit = kbase_submit; +#else + k->context_create = kbase_context_create; + k->context_destroy = kbase_context_destroy; + k->context_recreate = kbase_context_recreate; + + k->cs_bind = kbase_cs_bind; + k->cs_term = kbase_cs_term; + k->cs_rebind = kbase_cs_rebind; + k->cs_submit = kbase_cs_submit; + k->cs_wait = kbase_cs_wait; + + k->kcpu_fence_export = kbase_kcpu_fence_export; + k->kcpu_fence_import = kbase_kcpu_fence_import; + k->kcpu_cqs_set = kbase_kcpu_cqs_set; + k->kcpu_cqs_wait = kbase_kcpu_cqs_wait; +#endif + + k->syncobj_create = kbase_syncobj_create; + k->syncobj_destroy = kbase_syncobj_destroy; + k->syncobj_dup = kbase_syncobj_dup; + k->syncobj_wait = kbase_syncobj_wait; + + k->callback_all_queues = kbase_callback_all_queues; + + k->mem_sync = kbase_mem_sync; + + for (unsigned i = 0; i < ARRAY_SIZE(kbase_main); ++i) { + ++k->setup_state; + if (!kbase_main[i].part(k)) { + k->close(k); + return false; + } + } + return true; +} diff --git a/src/panfrost/ci/deqp-panfrost-g610.toml b/src/panfrost/ci/deqp-panfrost-g610.toml new file mode 100644 index 00000000000..6bad2fb44de --- /dev/null +++ b/src/panfrost/ci/deqp-panfrost-g610.toml @@ -0,0 +1,11 @@ +# Basic test set +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-master.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +version_check = "GL ES 3.1.*git" +renderer_check = "Mali-G610" diff --git a/src/panfrost/csf_test/interpret.py b/src/panfrost/csf_test/interpret.py new file mode 100755 index 00000000000..081d32d94c9 --- /dev/null +++ b/src/panfrost/csf_test/interpret.py @@ -0,0 +1,1820 @@ +#!/usr/bin/env python3 + +import os +import re +import struct +import subprocess +import sys + +try: + py_path = os.path.dirname(os.path.realpath(__file__)) + "/../bifrost/valhall" 
+except: + py_path = "../bifrost/valhall" + +if py_path not in sys.path: + sys.path.insert(0, py_path) + +import asm +import struct + +def ff(val): + return struct.unpack("=f", struct.pack("=I", val))[0] + +def ii(val): + return struct.unpack("=I", struct.pack("=f", val))[0] + +shaders = { + "atomic": """ +IADD_IMM.i32.reconverge r0, 0x0, #0x0 +NOP.wait0 +ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 +BRANCHZ.eq.reconverge ^r1.h0, offset:1 +BRANCHZ.eq 0x0, offset:3 +ATOM1_RETURN.i32.slot0.ainc @r1, u0, offset:0x0 +IADD_IMM.i32 r0, ^r0, #0x1 +BRANCHZ.eq.reconverge 0x0, offset:-7 +NOP.end +""", + "rmw": """ +IADD_IMM.i32.reconverge r0, 0x0, #0x0 +ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 +BRANCHZ.eq.reconverge r1.h0, offset:1 +BRANCHZ.eq 0x0, offset:6 +NOP.wait1 +LOAD.i32.unsigned.slot0.wait0 @r1, u0, offset:0 +IADD_IMM.i32 r1, ^r1, #0x1 +STORE.i32.slot1 @r1, u0, offset:0 +IADD_IMM.i32 r0, ^r0, #0x1 +BRANCHZ.eq.reconverge 0x0, offset:-9 +NOP.end +""", + "global_invocation": """ +IADD_IMM.i32 r0, ^r60, #0x1 +STORE.i32.slot0.end @r0, u0, offset:0 +""", + "invoc_offset": """ +LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 +IADD.s32 r0, u0, ^r0 +ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 +IADD.s32 r1, ^r1, u1 +MOV.i32 r2, u2 +STORE.i32.slot0.end @r2, ^r0, offset:0 +""", + "invoc_rmw": """ +LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 +IADD.s32 r0, u0, ^r0 +ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 +IADD.s32 r1, ^r1, u1 +LOAD.i32.unsigned.slot0.wait0 @r2, r0, offset:0 +IADD.s32 r2, ^r2, u2 +STORE.i32.slot1.end @r2, ^r0, offset:0 +""", + + "preframe": """ +U16_TO_U32.discard r0, r59.h00 +U16_TO_U32 r1, ^r59.h10 +IADD_IMM.i32 r2, 0x0, #0x1 +IADD_IMM.i32 r3, 0x0, #0x0 +TEX_FETCH.slot0.skip.f.32.2d.wait @r4:r5:r6:r7, @r0:r1, ^r2 +FADD.f32 r4, ^r4, 0x40490FDB +FADD.f32 r5, ^r5, 0x40490FDB +BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 +""", + + + "position": """ +LEA_BUF_IMM.slot0.wait0 @r4:r5, r59, table:0xD, index:0x0 +#BRANCHZI.absolute 0x1000000, ^r4 +# position of 16384 +IADD_IMM.i32 r2, 0x0, #0x0e +# position of 16 +IADD_IMM.i32 r2, 0x0, #0x04 +LSHIFT_OR.i32 r0, 0x03020100.b1, r2, 0x0 +LSHIFT_AND.i32 r0, r60, r2, ^r0 +IADD_IMM.i32 r1, 0x0, #0x01 +RSHIFT_AND.i32 r1, r60, 0x03020100.b11, ^r1 +LSHIFT_OR.i32 r1, ^r1, ^r2, 0x0 +S32_TO_F32 r0, ^r0 +S32_TO_F32 r1, ^r1 + +RSHIFT_OR.i32 r2, ^r60, 0x03020100.b22, 0x0 +S32_TO_F32 r2, ^r2 +FADD.f32 r0, ^r0, r2.neg +#FADD.f32 r1, ^r1, ^r2 +S32_TO_F32 r2, ^r60 +#MOV.i32 r1, 0x0 + +FADD.f32 r0, ^r0, 0x40490FDB +FADD.f32 r1, ^r1, 0x40490FDB +#FMA.f32 r2, ^r2, 0x3DCCCCCD, 0x0 +MOV.i32 r2, 0x3DCCCCCD +MOV.i32 r3, 0x0 + +#STORE.i128.slot0 @r0:r1:r2:r3, thread_local_pointer, offset:0 + +IADD_IMM.i32 r8, 0x0, #0x00004000 +STORE.i16.istream.slot0 @r8, r4, offset:64 + +STORE.i128.istream.slot0 @r0:r1:r2:r3, r4, offset:0 +STORE.i128.slot0.end @r0:r1:r2:r3, ^r4, offset:0x7000 +""", + + "fragment": """ +ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, u0, offset:0 +IADD_IMM.i32 r1, 0x0, #0x1ff +LSHIFT_AND.i32 r0, ^r0, 0x0, ^r1 +SHADDX.u64 r2, u2, ^r0.w0, shift:0x2 +STORE.i32.slot0.wait0 @r59, ^r2, offset:0 + +IADD_IMM.i32 r4, 0x0, #0x3f100000 +IADD_IMM.i32 r5, 0x0, #0x3f400000 +IADD_IMM.i32 r6, 0x0, #0x3f300000 +IADD_IMM.i32 r7, 0x0, #0x32cccccd +BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 +""", + +} + +flg = 0xf +#flg = 0x20000f # Uncached! 
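+# Buffers allocated for the test: each entry in `memory` below is either a
+# plain size in bytes or a (size, flags) tuple for that allocation.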
+ +HEAP_SIZE = 1024 * 1024 + +memory = { + "ev": (8192, 0x8200f), + "x": 1024 * 1024, + "y": 4096, + "ls_alloc": 4096, + "occlusion": 4096, + + "ssbo": 4096, + "tls": 4096, + + #"plane_0": (256 * 256 * 32, 0x380f), # 2 MB + "plane_0": (256 * 256 * 32, 0x280f), # 2 MB + + "idk": HEAP_SIZE, + "heap": HEAP_SIZE, +} + +w = 0xffffffff + +# Words are 32-bit, apart from address references +descriptors = { + "shader": [0x118, 1 << 12, "invoc_rmw"], + "ls": [3, 31, "ls_alloc"], + "fau": [("ssbo", 0), ("ssbo", 16)], + "fau2": [("ev", 8 + (0 << 34)), 7, 0], + + "tiler_heap": [ + 0x029, 1 << 21, #HEAP_SIZE, + 0x1000, 0x60, 0x1040, 0x60, 0x1000 + (1 << 21), 0x60 + #"heap", ("heap", 64), ("heap", HEAP_SIZE), + ], + +} | { + x: [ + 0, 0, + # Hierarchy mask, + # Single-sampled + # Last provoking vertex + 0x6 | (0 << 18), + 0x00ff00ff, + # Layer + 0, 0, + "tiler_heap", + ("idk", 0x10), + #("tiler_heap", -0xfff0), + # "Weights" + ] + ([0] * (32 - 10)) + [ + # "State" + 0, + 31, + 0, + 0x10000000, + ] for x in ("tiler_ctx", "tiler_ctx2", "tiler_ctx3") +} | { + + "thread_storage": [ + 1, 31, + "tls", + 0, 0, + ], + + # Preload r59/r60 + "preframe_shader": [0x128, 3 << 11, "preframe"], + "position_shader": [0x138, 3 << 11, "position"], + "fragment_shader": [0x128, 3 << 11, "fragment"], + + "idvs_zs": [ + 0x70077, # Depth/stencil type, Always for stencil tests + 0, 0, # Stencil state + 0, # unk + # Depth source minimum, write disabled + # [0, 1] Depth clamp + # Depth function: Always + (1 << 23) | (7 << 29), + 0, # Depth units + 0, # Depth factor + 0, # Depth bias clamp + ], + + "preframe_zs": [ + 0x70077, # Depth/stencil type, Always for stencil tests + 0, 0, # Stencil state + 0, # unk + # Depth source minimum, write disabled + # [0, 1] Depth clamp + # Depth function: Always + (1 << 23) | (7 << 29), + 0, # Depth units + 0, # Depth factor + 0, # Depth bias clamp + ], + + "idvs_blend": [ + # Load dest, enable + 1 | (1 << 9), + # RGB/Alpha: Src + Zero * Src + # All channels + ((2 | (2 << 4) | (1 << 8)) * 0x1001) | (0xf << 28), + # Fixed function blending, four components + 2 | (3 << 3), + # RGBA8 TB pixel format / F32 register format + 0 | (237 << 12) | (0 << 22) | (1 << 24), + ], + + "preframe_blend": [ + # Load dest, enable + 1 | (1 << 9), + # RGB/Alpha: Src + Zero * Src + # All channels + ((2 | (2 << 4) | (1 << 8)) * 0x1001) | (0xf << 28), + # Fixed function blending, four components + 2 | (3 << 3), + # RGBA8 TB pixel format / F32 register format + 0 | (237 << 12) | (0 << 22) | (1 << 24), + ], + + "preframe_surface": [ + # Plane descriptor, generic, tiled, RAW32 clump format + 10 | (1 << 4) | (1 << 8) | (2 << 24), + 256 * 256 * 4, + "plane_0", + 0, + 0, 0, + 0, # was 15, + ], + + "preframe_table": [ + # Texture descriptor, 2D, format + 2 | (2 << 4) | (187 << (10 + 12)), + # Width, height + 255 | (255 << 16), + # Swizzle, interleave + 1672 | (1 << 12), + 0, + "preframe_surface", + 0, 0, + + # Sampler descriptor, clamp to edge + 1 | (9 << 8) | (9 << 12) | (9 << 16), + 0, 0, 0, 0, 0, 0, 0, + ], + + "preframe_resources": [ + ("preframe_table", (1 << (32 + 24))), 0x40, 0, + ], + + "dcds": [ + # Clean fragment write, primitive barrier + (1 << 9) | (1 << 10), + # Sample mask of 0xffff, RT mask of 1 + 0x1ffff, + 0, 0, # vertex array + 0, 0, # unk + 0, 0x3f800000, # min/max depth + 0, 0, # unk + "preframe_zs", # depth/stencil + ("preframe_blend", 1), # blend (count == 1) + 0, 0, # occlusion + + # Shader environment: + 0, # Attribute offset + 2, # FAU count + 0, 0, 0, 0, 0, 0, # unk + ("preframe_resources", 1), # 
Resources + "preframe_shader", # Shader + 0, 0, # Thread storage + "fau", # FAU + ], + + "framebuffer": [ + 1, 0, # Pre/post, downscale, layer index + 0x10000, 0, # Argument + "ls_alloc", # Sample locations + "dcds", # DCDs + 0x00ff00ff, # width / height + 0, 0x00ff00ff, # bound min/max + # 32x32 tile size + # 4096 byte buffer allocation (maybe?) + (10 << 9) | (4 << 24), + 0, # Disable S, ZS/CRC, Empty Tile, CRC + 0, # Z Clear + "tiler_ctx", # Tiler + + # Framebuffer padding + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + # Render target + # R8G8B8A8 internal format + (1 << 26), + # Write Enable + # R8G8B8A8 colour format + # Linear block format + # 0123 swizzle + # Clean pixel write enable + 1 | (19 << 3) | (1 << 8) | (0o3210 << 16) | (1 << 31), + + # AFBC overlay + # No YTR, no split, no wide, no reverse, no front, no alpha + # RGBA8 compression mode + 0 | (10 << 10), + 0, 0, 0, 0, 0, + + # RT Buffer + "plane_0", + 256 * 4 * 16, # Row stride (for tiling) + 0x400, # Surface stride / Body offset + + # RT Clear + 0x2e234589, 0, 0, 0, + ], + + "index_buffer": [ + 0, 1, 2, + 0, 2, 1, + 1, 0, 2, + 1, 2, 0, + 2, 0, 1, + 2, 1, 0, + + #63, 64, 65, + 1, 2, 3, + 4, 5, 6, + 12, 13, 14, + 0, 1, 2, + 4, 5, 6, + 8, 9, 10, + 3, 4, 5, + ], + + "point_index": [x * 4 for x in range(32)] + [ + 0, 64, 440, 0, + ], + + "position_data": [ + ii(10.0), ii(10.0), ii(1.0), ii(1.0), + ], +} + +# TODO: Use mako? Or just change the syntax for "LDM/STM" +# and use f-strings again? + +cmds = """ +!cs 0 +resources fragment + +@ Bound min +mov w2a, i16:0,0 +@ Bound max +mov w2b, i16:255,255 +mov x28, $framebuffer+1 + +slot 2 + +fragment + +mov w4a, #0x0 +UNK 02 24, #0x4a0000ff0211 +wait 1 + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!raw sleep 20 +!memset plane_0 0 0 262144 +!raw sleep 200 +!dump plane_0 0 12 +!heatmap plane_0 0 262144 gran 4096 len 32768 stride 32768 +""" + +altcmds = """ +!cs 0 + +@ Some time is required for the change to become active +@ Just submitting a second job appears to be enough +resources compute fragment tiler idvs +mov x48, #0x6000000000 +heapctx x48 + +!cs 0 + +slot 3 +wait 3 +heapinc vt_start + +@ Base vertex count +mov w24, 0 +@ Instance count +mov w22, 1 + +@ Vertex attribute stride +mov x30, 0 + +@ Primitive +mov w38, 0x430000 +@@ Draw +@ Pixel kill etc. +@ Enable occlusion query +@mov w39, 0xc000 +mov w39, 0 +@ Unk... +mov w26, 0x1000 +@ Sample mask / render target mask +mov w3a, 0x1ffff +@ Min/max Z +mov w2c, float:0 +mov w2d, float:1.0 +@ Depth/stencil +mov x34, $idvs_zs +@ Blend +mov x32, $idvs_blend+1 +@ Occlusion +mov x2e, $occlusion + +@ Primitive size +mov x3c, float:3.75 +@ Fragment shader environment +mov x14, $fragment_shader +@ FAU count == 2 +movp x0c, $fau+0x0200000000000000 + +@ Position shader environment +mov x10, $position_shader + +mov x18, $thread_storage + +@ is this right?! "Vertex attribute stride" apparently? 
+@ that was for pure tiler jobs, for idvs it messes up points/lines +@ for some reason +@mov x30, $position_data + +@ Tiler +mov x28, $tiler_ctx + +@ Scissor min +mov w2a, i16:0,0 +@ Scissor max +mov w2b, i16:255,255 + +mov w21, 18 +mov w27, 4096 +mov x36, $index_buffer + +idvs 0x4002, mode triangles, index uint32 + +mov w21, 1 @36 +mov w27, 4096 +mov x36, $point_index + +@idvs 0x4a42, mode points, index uint32 + +mov w21, 400000 +mov w21, 18 +@idvs 0x4a42, mode triangles, index none + +@idvs 0x4a42, mode points, index none +@idvs 0x4a42, mode line-loop, index none + +flush_tiler +wait 3 +heapinc vt_end + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +UNK 00 24, #0x5f0000000233 +wait all + +!dump64 tiler_heap 0 4096 +@!dump idk 0 1048576 +@!dump position_data 0 4096 + +!cs 0 + +UNK 00 24, #0x5f0000000233 +wait all + +slot 4 +wait 4 +heapinc vt_start + +mov x28, $tiler_ctx2 +idvs 0x4002, mode triangles, index none +flush_tiler +wait 4 +heapinc vt_end + +UNK 00 24, #0x5f0000000233 +wait all + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!dump64 tiler_heap 0 4096 + +!cs 0 + +mov x50, $ev + +@ Bound min +mov w2a, i16:0,0 +@ Bound max +mov w2b, i16:255,255 +mov x28, $framebuffer+1 +@ Tile enable map +mov x2c, $x +mov x2e, 64 + +mov w40, 1 +str w40, [x2c] +@str w40, [x2c, 128] + +@ Use tile enable map +@fragment tem 1 + +fragment + +@ Does this actually do anytihng? +mov x48, $tiler_ctx +ldr x4a, [x48, 40] +ldr x4c, [x48, 48] +wait 0,4 +UNK 02 0b, 0x4a4c00100001 + +mov x48, $tiler_ctx2 +ldr x4a, [x48, 40] +ldr x4c, [x48, 48] +wait 0,4 +UNK 02 0b, 0x4a4c00100001 + +UNK 02 24, #0x5f0000f80211 +@UNK 00 24, #0x5f0000000233 +wait 1 + +mov x54, $plane_0 +ldr x56, [x54] +wait 0 + +mov x52, $y +str x56, [x52] + +evstr w5f, [x50], unk 0xfd, irq + +!raw td +!fdump heap 0 1048576 +!tiler heap 0 1048576 + + +@!dump rt_buffer 0 4096 +!dump y 0 4096 +@!dump plane_0 0 524288 +@!heatmap plane_0 0 524288 gran 0x80 len 0x200 stride 0x4000 +!heatmap plane_0 0 8192 gran 0x04 len 0x20 stride 0x400 +!dump occlusion 0 4096 +@!dump ssbo 0 4096 + +!dump64 tiler_heap 0 4096 +!dump tiler_ctx 0 4096 +!dump tiler_ctx2 0 4096 + +@!fdump heap 0 1048576 + +!cs 0 + +slot 3 +wait 3 +heapinc vt_start + +mov x28, $tiler_ctx3 +mov w2c, float:0 +mov w2d, float:1.0 +mov x2e, $occlusion + +idvs 0x4002, mode triangles, index none +flush_tiler +wait 3 +heapinc vt_end + +UNK 00 24, #0x5f0000000233 +wait all + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!dump64 tiler_heap 0 4096 +!dump tiler_ctx 0 4096 +!raw td + +""" + +docopy = """ +ldr {w00-w0f}, [x52] +ldr {w10-w1f}, [x52, 64] +ldr {w20-w2f}, [x52, 128] +ldr {w30-w3f}, [x52, 192] +add x52, x52, 256 + +loop: +wait 0 + +str {w00-w0f}, [x54] +ldr {w00-w0f}, [x52] +str {w10-w1f}, [x54, 64] +ldr {w10-w1f}, [x52, 64] +str {w20-w2f}, [x54, 128] +ldr {w20-w2f}, [x52, 128] +str {w30-w3f}, [x54, 192] +ldr {w30-w3f}, [x52, 192] + +add x54, x54, 256 +add x52, x52, 256 +add x50, x50, -256 + +b.ne w50, loop +b.ne w51, loop +""" + +oldcmds = f""" +!cs 0 + +mov x50, 0x8000000 + +mov x52, $from +mov x54, $to +mov x56, $x +mov x58, $ev +mov x5a, $y + +str cycles, [x56] +{docopy} +str cycles, [x56, 8] + +UNK 00 24, #0x5f0000000233 +evstr w5f, [x58], unk 0xfd, irq + +!cs 1 + +mov x50, 0x8000000 + +mov x52, $from +mov x54, $to +mov x56, $x +mov x58, $ev +mov x5a, $y + +add x52, x52, 0x8000000 +add x54, x54, 0x8000000 +add x56, x56, 32 + +nop +nop + +str cycles, [x56] +{docopy} +str cycles, [x56, 8] + +UNK 00 24, #0x5f0000000233 +evstr w5f, [x58], unk 0xfd, irq + +!delta x 0 4096 +""" + 
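+# Illustrative sketch (not from the original patch): how the "float:" and
+# "i16:lo,hi" immediates used throughout the command strings above are
+# expected to pack into 32-bit register values. The authoritative parsing is
+# the val() helper inside Context.interpret() further down; ii() is assumed
+# here to reinterpret a float's IEEE-754 bits as an integer, matching its use
+# for position_data.
+import struct
+
+def _example_ii(x):
+    # Assumed equivalent of ii(): bit-cast a Python float to its 32-bit
+    # IEEE-754 representation.
+    return struct.unpack("<I", struct.pack("<f", x))[0]
+
+def _example_imm(word):
+    # Mirrors the immediate forms accepted by val(), for illustration only.
+    if word.startswith("float:"):
+        return _example_ii(float(word.split(":", 1)[1]))
+    if word.startswith("i16:"):
+        lo, hi = (int(v, 0) for v in word.split(":", 1)[1].split(","))
+        return (lo & 0xffff) | ((hi & 0xffff) << 16)
+    return int(word.strip("#"), 0)
+
+assert _example_imm("i16:255,255") == 0x00ff00ff  # scissor/bound max above
+assert _example_imm("float:1.0") == 0x3f800000    # max depth in the DCD
+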
+oldcmds = """ +!cs 0 +endpt compute +!cs 0 + +@ Workgroup size 1x1x1, merging allowed +mov w21, 0x80000000 + +@ Workgroup count 1x1x1 +mov w25, 1 +mov w26, 1 +mov w27, 1 + +@ Offset 0,0,0 +mov w22, 0 +mov w23, 0 +mov w24, 0 + +@ TODO: offset x/y/z + +@ Resources +mov x06, 0 + +@ Shader +mov x16, $shader + +@ Local storage +mov x1e, $ls + +@ FAU +movp x0e, $fau+0x0200000000000000 + +slot 2 +wait 2 + +UNK 0400000000008200 + +mov x58, $fau +ldr x56, [x58] +wait 0 + +@mov w4a, 0 + +@slot 6 +@mov x54, $x +@UNK 02 24, #0x4a0000f80211 +@ldr x52, [x56] +@wait 0,1 +@str x52, [x54] + +mov w40, 60 +1: add w40, w40, -1 + +@mov w4a, #0x0 +@UNK 02 24, #0x4a0000f80211 +@wait 1 + +@mov w54, #0 +@UNK 00 24, #0x540000000233 +@wait all + +slot 2 +wait 2 + +add w22, w22, 1 +@UNK 0400ff0000008200 + +@b.ne w40, 1b + +!dump x 0 4096 +!dump y 0 4096 +!dump ev 0 4096 +""" + +oldcmds = """ +!cs 0 + +mov x48, $x + +mov w21, 0x80000000 +mov w25, 1 +mov w26, 1 +mov w27, 1 + +movp x0e, $fau+0x0200000000000000 + +@ Write FAUs +@add x0e, x48, 64 +@mov x50, $ev +@str x50, [x0e] +@mov x30, 10 +@str x30, [x0e, 8] +@add w0f, w0f, 0x02000000 + +@ Write shader descriptor +@add x16, x48, 128 +@mov x30, 0x118 +@str x30, [x16] +@mov x30, $compute +@str x30, [x16, 8] + +wait 0 + +add x1e, x48, 192 + +mov x30, $y +@regdump x30 +@mov x30, 0 + +resources compute +slot 2 +mov w54, #0xffffe0 +UNK 00 24, #0x540000000233 + +wait all + +mov x54, 0 +mov w56, 0 +mov w5d, 1 + +slot 2 +wait 2 +wait 2 +regdump x30 +UNK 0400ff0000008200 +add x30, x30, 0x200 +regdump x30 +slot 2 +wait 2 + +mov w40, 1000 +1: add w40, w40, -1 +str cycles, [x50, 32] +b.ne w40, 1b + +wait 0 +wait all + +@ 6 / 10 / 14 +mov w40, 1 +1: add w40, w40, -1 +UNK 0400ff0000000200 +b.ne w40, 1b + +mov w40, 1000 +1: add w40, w40, -1 +str cycles, [x50, 32] +b.ne w40, 1b + +mov w42, 200 +mov w40, 100 +1: add w40, w40, -1 +@wait all +@UNK 0400ff0000008001 @ compute + +@UNK 0400ff0000000001 +@UNK 2501504200000004 @ evadd +@UNK 3 24, #0x4a0000000211 + +@wait all +b.ne w40, 1b + +@UNK 2601504200000004 + +str cycles, [x50, 40] +str cycles, [x50, 48] +UNK 02 24, #0x4a0000000211 +wait 0 + +add x5c, x50, 64 +evadd w5e, [x5c], unk 0xfd +evadd w5e, [x5c], unk 0xfd, irq, unk0 + +!dump x 0 4096 +!dump y 0 4096 +!delta ev 0 4096 +""" + +altcmds = """ +!cs 0 +!alloc x 4096 +!alloc ev 4096 0x8200f +!alloc ev2 4096 0x8200f + +mov x10, $x +UNK 00 30, #0x100000000000 +add x12, x10, 256 +str cycles, [x12] +mov x5a, $ev2 +mov x48, 0 +mov w4a, 0 +slot 3 +wait 3 +UNK 00 31, 0 +mov x48, $ev +mov w4a, 0x4321 +add x46, x48, 64 +mov w42, 0 + +str cycles, [x12, 8] +UNK 01 26, 0x484a00000005 +str cycles, [x12, 16] +UNK 01 26, 0x484a00000005 +str cycles, [x12, 24] + +nop + +mov w10, 10000 +1: +UNK 01 26, 0x484a00000005 +add w10, w10, -1 +b.ne w10, 1b +str cycles, [x12, 32] + +mov w10, 10000 +1: +UNK 01 26, 0x484a00000005 +@UNK 02 24, #0x420000000211 +add w10, w10, -1 +b.ne w10, 1b +str cycles, [x12, 40] + +ldr x16, [x48, 0] +wait 0 +str x16, [x48, 16] + +UNK 00 31, 0x100000000 + +mov w4a, #0x0 +UNK 02 24, #0x4a0000000211 + +mov w5e, 1 +add x5c, x5a, 0x100 +UNK 01 25, 0x5c5e00f80001 + +!delta x 0 4096 +!dump ev 0 4096 +!dump ev2 0 4096 +""" + +altcmds = """ +!cs 0 +!alloc x 4096 +!alloc ev 4096 0x8200f + +iter vertex +slot 2 + +mov x40, $x +mov w10, 1 +mov x48, 0 +mov w4a, 0 +call w4a, x48 + nop + nop + nop + mov x20, $. 
+@ movp x22, 0x0126000011223344 + movp x22, 0x1600000060000001 + str x22, [x20, 56] + 1: nop + b 1b + nop + add x40, x40, #256 + regdump x40 + +mov x5a, #0x5ff7fd6000 +mov x48, $ev +mov x40, #0x5ff7fd6000 +mov w54, #0x1 +UNK 00 24, #0x540000000233 +wait 0 +slot 6 +@UNK 00 31, #0x0 +UNK 00 09, #0x0 +wait 6 +@UNK 00 31, #0x100000000 +mov x4a, x40 +UNK 01 26, 0x484a00040001 + +!dump x 0 4096 +@!dump ev 0 4096 +@!delta x 0 4096 +""" + +cycletest = """ +mov w10, 10 +1: +str cycles, [x5c] +add x5c, x5c, 8 +add w10, w10, -1 +mov w11, 100000 + +inner: +add w11, w11, -1 +b.ne w11, inner + +b.ne w10, 1b +""" + +def get_cmds(cmd): + return cmds.replace("{cmd}", str(cmd)) + +def assemble_shader(text): + lines = text.strip().split("\n") + lines = [l for l in lines if len(l) > 0 and l[0] not in "#@"] + return [asm.parse_asm(ln) for ln in lines] + +class Buffer: + id = 0 + + def __init__(self): + self.id = Buffer.id + Buffer.id += 1 + +def resolve_rel(to, branch): + return (to - branch) // 8 - 1 + +def to_int16(value): + assert(value < 36768) + assert(value >= -32768) + return value & 0xffff + +class Level(Buffer): + def __init__(self, indent): + super().__init__() + + self.indent = indent + self.buffer = [] + self.call_addr_offset = None + self.call_len_offset = None + + self.labels = {} + self.label_refs = [] + # Numeric labels can be reused, so have to be handled specially. + self.num_labels = {} + self.num_refs = {} + + def offset(self): + return len(self.buffer) * 8 + + def __repr__(self): + buf = " ".join(hex(x) for x in self.buffer) + return f"buffer {self.id} {self.offset()} 0x200f {buf}" + + def buffer_add_value(self, offset, value): + self.buffer[offset // 8] += value + + def process_relocs(self, refs, to=None): + for ref, offset, type_ in refs: + assert(type_ == "rel") + + if to is None: + goto = self.labels[ref] + else: + goto = to + + value = to_int16(resolve_rel(goto, offset)) + self.buffer_add_value(offset, value) + + def finish(self): + self.process_relocs(self.label_refs) + +class Alloc(Buffer): + def __init__(self, size, flags=0x280f): + super().__init__() + + self.size = size + self.flags = flags + self.buffer = [] + + def __repr__(self): + buf = " ".join(hex(x) for x in self.buffer) + return f"buffer {self.id} {self.size} {hex(self.flags)} {buf}" + +def fmt_reloc(r, name="reloc"): + dst, offset, src, src_offset = r + return f"{name} {dst}+{offset} {src}+{src_offset}" + +def fmt_exe(e): + return " ".join(str(x) for x in e) + +class Context: + def __init__(self): + self.levels = [] + self.l = None + + self.allocs = {} + self.completed = [] + self.reloc = [] + self.reloc_split = [] + + self.exe = [] + self.last_exe = None + + self.is_call = False + + def set_l(self): + if len(self.levels): + self.l = self.levels[-1] + + def pop_until(self, indent): + while self.l.indent != indent: + l = self.levels.pop() + self.completed.append(l) + + self.set_l() + if not len(self.levels): + return + + buf_len = l.offset() + + r = self.l + self.reloc.append((r.id, r.call_addr_offset * 8, l.id, 0)) + r.buffer[r.call_len_offset] = ( + (r.buffer[r.call_len_offset] & (0xffff << 48)) + + buf_len) + r.buffer[r.call_addr_offset] &= (0xffff << 48) + + r.call_addr_offset = None + r.call_len_offset = None + + def flush_exe(self): + ind = self.levels[0].indent + + self.pop_until(ind) + if len(self.levels[0].buffer): + l = self.levels.pop() + l.finish() + self.completed.append(l) + + self.levels.append(Level(ind)) + self.set_l() + + if not len(self.exe): + return + + if self.last_exe is None: + print("# Trying to 
add multiple CSs to an exe line, becoming confused") + return + + if len(self.completed): + p = self.completed[-1] + assert(p.indent == ind) + + self.exe[self.last_exe] += [p.id, p.offset()] + + self.last_exe = None + + def add_shaders(self, shaders): + for sh in shaders: + qwords = assemble_shader(shaders[sh]) + sh = sh.lower() + + a = Alloc(len(qwords) * 8, flags=0x2017) + a.buffer = qwords + self.allocs[sh] = a + + def add_memory(self, memory): + for m in memory: + f = memory[m] + if isinstance(f, int): + size, flags = f, 0x280f + else: + size, flags = f + self.allocs[m] = Alloc(size, flags) + + def add_descriptors(self, descriptors): + for d in descriptors: + words = descriptors[d] + a = Alloc(0) + + buf = [] + for w in words: + if isinstance(w, int): + buf.append(w) + else: + if isinstance(w, str): + alloc, offset = w, 0 + else: + alloc, offset = w + ref = self.allocs[alloc] + self.reloc.append((a.id, len(buf) * 4, + ref.id, offset)) + buf.append(0) + buf.append(0) + + it = iter(buf) + a.buffer = [x | (y << 32) for x, y in zip(it, it)] + a.size = len(a.buffer) * 8 + self.allocs[d] = a + + def interpret(self, text): + text = text.split("\n") + + old_indent = None + + for orig_line in text: + #print(orig_line, file=sys.stderr) + + line = orig_line.split("@")[0].expandtabs().rstrip().lower() + if not line: + continue + + indent = len(line) - len(line.lstrip()) + line = line.lstrip() + + if old_indent is None: + self.levels.append(Level(indent)) + elif indent != old_indent: + if indent > old_indent: + assert(self.is_call) + + self.levels.append(Level(indent)) + else: + self.pop_until(indent) + + self.set_l() + + old_indent = indent + self.is_call = False + + given_code = None + + # TODO: Check against this to test the disassembler? + if re.match(r"[0-9a-f]{16} ", line): + given_code = int(line[:16], 16) + line = line[16:].lstrip() + + s = [x.strip(",") for x in line.split()] + + if s[0].endswith(":") or (len(s) == 1 and is_num(s[0])): + label = s[0] + if s[0].endswith(":"): + label = label[:-1] + + if is_num(label): + label = int(label) + if label in self.l.num_refs: + self.l.process_relocs(self.l.num_refs[label], self.l.offset()) + del self.l.num_refs[label] + self.l.num_labels[label] = self.l.offset() + else: + if label in self.l.labels: + print("Label reuse is not supported for non-numeric labels") + self.l.labels[label] = self.l.offset() + + s = s[1:] + if not len(s): + continue + + for i in range(len(s)): + if s[i].startswith("$"): + name, *offset = s[i][1:].split("+") + if name == ".": + buf = self.l + else: + buf = self.allocs[name] + if len(offset): + assert(len(offset) == 1) + offset = int(offset[0], 0) + else: + offset = 0 + + if s[0] == "movp": + rels = self.reloc_split + else: + rels = self.reloc + + rels.append((self.l.id, self.l.offset(), + buf.id, offset)) + s[i] = "#0x0" + + def is_num(str): + return re.fullmatch(r"[0-9]+", str) + + def hx(word): + return int(word, 16) + + def reg(word): + return hx(word[1:]) + + def val(word): + if word.startswith("float:"): + return ii(float(word.split(":")[1])) + elif word.startswith("i16:"): + lo, hi = word.split(":")[1].split(",") + lo, hi = val(lo), val(hi) + assert(lo < (1 << 16)) + assert(hi < (1 << 16)) + return (lo & 0xffff) | (hi << 16) + + value = int(word.strip("#"), 0) + assert(value < (1 << 48)) + return value + + sk = True + + if s[0] == "!cs": + assert(len(s) == 2) + self.flush_exe() + self.last_exe = len(self.exe) + self.exe.append(["exe", int(s[1])]) + continue + elif s[0] == "!parallel": + assert(len(s) == 2) + 
self.flush_exe() + self.last_exe = len(self.exe) - 1 + self.exe[-1] += [int(s[1])] + continue + elif s[0] == "!alloc": + assert(len(s) == 3 or len(s) == 4) + alloc_id = s[1] + size = int(s[2]) + flags = val(s[3]) if len(s) == 4 else 0x280f + self.allocs[alloc_id] = Alloc(size, flags) + continue + elif s[0] in ("!dump", "!dump64", "!fdump", "!delta", "!tiler"): + assert(len(s) == 4) + alloc_id = s[1] + offset = val(s[2]) + size = val(s[3]) + mode = { + "!dump": "hex", + "!dump64": "hex64", + "!fdump": "filehex", + "!delta": "delta", + "!tiler": "tiler", + }[s[0]] + self.exe.append(("dump", self.allocs[alloc_id].id, + offset, size, mode)) + continue + elif s[0] == "!heatmap": + assert(len(s) == 10) + assert(s[4] == "gran") + assert(s[6] == "len") + assert(s[8] == "stride") + alloc_id = s[1] + offset = val(s[2]) + size = val(s[3]) + granularity = val(s[5]) + length = val(s[7]) + stride = val(s[9]) + mode = "heatmap" + self.exe.append(("heatmap", self.allocs[alloc_id].id, + offset, size, granularity, length, stride)) + continue + elif s[0] == "!memset": + assert(len(s) == 5) + alloc_id = s[1] + offset = val(s[2]) + value = val(s[3]) + size = val(s[4]) + self.exe.append(("memset", self.allocs[alloc_id].id, + offset, value, size)) + continue + elif s[0] == "!raw": + self.exe.append(s[1:]) + continue + elif s[0] == "movp": + assert(len(s) == 3) + assert(s[1][0] == "x") + addr = reg(s[1]) + # Can't use val() as that has a max of 48 bits + value = int(s[2].strip("#"), 0) + + self.l.buffer.append((2 << 56) | (addr << 48) | (value & 0xffffffff)) + self.l.buffer.append((2 << 56) | ((addr + 1) << 48) + | ((value >> 32) & 0xffffffff)) + continue + elif s[0] == "regdump": + assert(len(s) == 2) + assert(s[1][0] == "x") + dest = reg(s[1]) + + # Number of registers to write per instruction + regs = 16 + + cmd = 21 + value = (dest << 40) | (((1 << regs) - 1) << 16) + + for i in range(0, 0x60, regs): + code = (cmd << 56) | (i << 48) | value | (i << 2) + self.l.buffer.append(code) + + del cmd, value + continue + + elif s[0] == "unk": + if len(s) == 2: + h = hx(s[1]) + cmd = h >> 56 + addr = (h >> 48) & 0xff + value = h & 0xffffffffffff + else: + assert(len(s) == 4) + cmd = hx(s[2]) + addr = hx(s[1]) + value = val(s[3]) + elif s[0] == "nop": + if len(s) == 1: + addr = 0 + value = 0 + cmd = 0 + else: + assert(len(s) == 3) + addr = hx(s[1]) + value = val(s[2]) + cmd = 0 + elif s[0] == "mov" and s[2][0] in "xw": + # This is actually an addition command + assert(len(s) == 3) + assert(s[1][0] == s[2][0]) + cmd = { "x": 17, "w": 16 }[s[1][0]] + addr = reg(s[1]) + value = reg(s[2]) << 40 + elif s[0] == "mov": + assert(len(s) == 3) + cmd = { "x": 1, "w": 2 }[s[1][0]] + addr = reg(s[1]) + value = val(s[2]) + elif s[0] == "add": + assert(len(s) == 4) + assert(s[1][0] == s[2][0]) + assert(s[1][0] in "wx") + cmd = 16 if s[1][0] == "w" else 17 + addr = reg(s[1]) + value = (reg(s[2]) << 40) | (val(s[3]) & 0xffffffff) + elif s[0] == "resources": + assert(len(s) >= 2) + types = ["compute", "fragment", "tiler", "idvs"] + cmd = 34 + addr = 0 + value = 0 + for t in s[1:]: + if t in types: + value |= 1 << types.index(t) + else: + value |= int(t, 0) + elif s[0] == "fragment": + cmd = 7 + addr = 0 + value = 0 + if len(s) != 1: + arg_map = { + "tem": {"0": 0, "1": 1}, + "render": { + "z_order": 0, + "horizontal": 0x10, + "vertical": 0x20, + "reverse_horizontal": 0x50, + "reverse_vertical": 0x60, + }, + "unk": {"0": 0, "1": 1 << 32}, + } + for arg, val in zip(s[1::2], s[2::2]): + value |= arg_map[arg][val] + elif s[0] == "wait": + 
assert(len(s) == 2) + cmd = 3 + addr = 0 + if s[1] == "all": + value = 255 + else: + value = sum(1 << int(x) for x in s[1].split(",")) + value <<= 16 + elif s[0] == "slot": + assert(len(s) == 2) + cmd = 23 + addr = 0 + value = int(s[1], 0) + elif s[0] == "add": + # TODO: unk variant + assert(len(s) == 4) + assert(s[1][0] == "x") + assert(s[2][0] == "x") + cmd = 17 + addr = reg(s[1]) + v = val(s[3]) + assert(v < (1 << 32)) + assert(v >= (-1 << 31)) + value = (reg(s[2]) << 40) | (v & 0xffffffff) + elif s[0] == "idvs": + assert(len(s) == 6) + unk = val(s[1]) + assert(s[2] == "mode") + modes = { + "none": 0, + "points": 1, + "lines": 2, + "line-strip": 4, + "line-loop": 6, + "triangles": 8, + "triangle-strip": 10, + "triangle-fan": 12, + "polygon": 13, + "quads": 14, + } + if s[3] in modes: + mode = modes[s[3]] + else: + mode = int(s[3]) + assert(s[4] == "index") + itypes = { + "none": 0, + "uint8": 1, + "uint16": 2, + "uint32": 3, + } + if s[5] in itypes: + index = itypes[s[5]] + else: + index = int(s[5]) + + cmd = 6 + addr = 0 + value = (unk << 32) | (index << 8) | mode + elif s[0] == "flush_tiler": + assert(len(s) == 1) + cmd = 9 + addr = 0 + value = 0 + elif s[0] == "str" and s[1] in ("cycles", "timestamp"): + assert(len(s) == 3 or len(s) == 4) + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]") for x in s] + assert(s[2][0] == "x") + + type_ = 1 if s[1] == "cycles" else 0 + dest = reg(s[2]) + if len(s) == 4: + offset = val(s[3]) + else: + offset = 0 + + cmd = 40 + addr = 0 + value = (dest << 40) | (type_ << 32) | to_int16(offset) + elif s[0] in ("ldr", "str"): + reglist = s[1] + if reglist[0] == "{": + end = [x[-1] for x in s].index("}") + reglist = s[1:end + 1] + s = s[:1] + s[end:] + + assert(len(s) == 3 or len(s) == 4) + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]") for x in s] + assert(s[2][0] == "x") + + if isinstance(reglist, str): + assert(reglist[0] in "xw") + src = reg(reglist) + mask = 3 if reglist[0] == "x" else 1 + else: + src = None + mask = 0 + + for r in ",".join(reglist).strip("{}").split(","): + r = r.split("-") + assert(len(r) in (1, 2)) + regno = [reg(x) for x in r] + + if src is None: + src = regno[0] + + if len(r) == 1: + assert(r[0][0] in "xw") + new = 3 if r[0][0] == "x" else 1 + new = (new << regno[0]) >> src + else: + assert(regno[1] > regno[0]) + new = ((2 << regno[1]) - (1 << regno[0])) >> src + + assert(new < (1 << 16)) + assert(mask & new == 0) + mask |= new + + # Name is correct for str, but inverted for ldr + # (The same holds for src above) + dest = reg(s[2]) + if len(s) == 4: + offset = val(s[3]) + else: + offset = 0 + + cmd = 20 if s[0] == "ldr" else 21 + addr = src + value = (dest << 40) | (mask << 16) | to_int16(offset) + elif s[0] == "b" or s[0].startswith("b."): + # For unconditional jumps, use w00 as a source register if it + # is not specified + if s[0] == "b" and (len(s) == 2 or + (len(s) == 3 and + s[1] in ("back", "skip"))): + s = [s[0], "w00", *s[1:]] + + assert(len(s) == 3 or (len(s) == 4 and s[2] in ("back", "skip"))) + assert(s[1][0] == "w") + + ops = { + "b.le": 0, "b.gt": 1, + "b.eq": 2, "b.ne": 3, + "b.lt": 4, "b.ge": 5, + "b": 6, "b.al": 6, + } + + src = reg(s[1]) + if len(s) == 4: + offset = val(s[3]) + if s[2] == "back": + offset = -1 - offset + else: + label = s[2] + if re.fullmatch(r"[0-9]+b", label): + label = int(label[:-1]) + assert(label in self.l.num_labels) + offset = resolve_rel(self.l.num_labels[label], + self.l.offset()) + elif re.fullmatch(r"[0-9]+f", label): + label = 
int(label[:-1]) + if label not in self.l.num_refs: + self.l.num_refs[label] = [] + self.l.num_refs[label].append((label, self.l.offset(), "rel")) + offset = 0 + else: + assert(not re.fullmatch(r"[0-9]+", label)) + self.l.label_refs.append((label, self.l.offset(), "rel")) + offset = 0 + + cmd = 22 + addr = 0 + value = (src << 40) | (ops[s[0]] << 28) | to_int16(offset) + + elif s[0] in ("evadd", "evstr"): + assert(len(s) in range(5, 8)) + assert(s[1][0] in "wx") + assert(s[2].startswith("[x")) + assert(s[2][-1] == "]") + assert(s[3] == "unk") + s = [x.strip("[]()") for x in s] + + val = reg(s[1]) + dst = reg(s[2]) + mask = hx(s[4]) + irq = "irq" not in s + unk0 = "unk0" in s + + if s[1][0] == "w": + cmd = 37 if s[0] == "evadd" else 38 + else: + cmd = 51 if s[0] == "evadd" else 52 + addr = 1 + value = ((dst << 40) | (val << 32) | (mask << 16) | + (irq << 2) | unk0) + elif s[0].split(".")[0] == "evwait": + for mod in s[0].split(".")[1:]: + assert(mod in {"lo", "hi", "inherit", "no_error"}) + assert(len(s) == 3) + assert(s[1][0] in "wx") + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]()") for x in s] + src = reg(s[2]) + val = reg(s[1]) + cond = 1 if ".hi" in s[0] else 0 + error = 1 if ".no_error" in s[0] else 0 + + cmd = 53 if s[1][0] == "x" else 39 + addr = 0 + value = (src << 40) | (val << 32) | (cond << 28) | error + elif s[0] in ("call", "tailcall"): + ss = [x for x in s if x.find('(') == -1 and x.find(')') == -1] + assert(len(ss) == 3) + assert(ss[1][0] == "w") + assert(ss[2][0] == "x") + cmd = { "call": 32, "tailcall": 33 }[s[0]] + addr = 0 + num = reg(ss[1]) + target = reg(ss[2]) + value = (num << 32) | (target << 40) + + l = self.l + + cur = len(l.buffer) + for ofs in range(cur - 2, cur): + if l.buffer[ofs] >> 48 == 0x100 + target: + l.call_addr_offset = ofs + if l.buffer[ofs] >> 48 == 0x200 + num: + l.call_len_offset = ofs + assert(l.call_addr_offset is not None) + assert(l.call_len_offset is not None) + + self.is_call = True + elif s[0] == "heapctx": + assert(len(s) == 2) + assert(s[1][0] == "x") + cmd = 48 + addr = 0 + value = reg(s[1]) << 40 + elif s[0] == "heapinc": + assert(len(s) == 2) + modes = { + "vt_start": 0, + "vt_end": 1, + "frag_end": 3, + } + if s[1] in modes: + mode = modes[s[1]] + else: + mode = int(s[1]) + cmd = 49 + addr = 0 + value = mode << 32 + else: + print("Unknown command:", orig_line, file=sys.stderr) + # TODO remove + cmd = 0 + addr = 0 + value = 0 + sk = False + pass + + code = (cmd << 56) | (addr << 48) | value + + if given_code and code != given_code: + print(f"Mismatch! 
{hex(code)} != {hex(given_code)}, {orig_line}") + + self.l.buffer.append(code) + + del cmd, addr, value + + if False and not sk: + print(orig_line, file=sys.stderr) + print(indent, s, hex(code) if sk else "", file=sys.stderr) + + self.pop_until(self.levels[0].indent) + self.flush_exe() + + def __repr__(self): + r = [] + r += [str(self.allocs[x]) for x in self.allocs] + r += [str(x) for x in self.completed] + r += [fmt_reloc(x) for x in self.reloc] + r += [fmt_reloc(x, name="relsplit") for x in self.reloc_split] + r += [fmt_exe(x) for x in self.exe] + return "\n".join(r) + +def interpret(text): + c = Context() + c.add_shaders(shaders) + c.add_memory(memory) + c.add_descriptors(descriptors) + c.interpret(text) + #print(str(c)) + return str(c) + +def run(text, capture=False): + if capture: + cap = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT} + else: + cap = {} + + i = interpret(text) + "\n" + + with open("/tmp/csf.cmds", "w") as f: + f.write(i) + + # TODO: Keep seperate or merge stdout/stderr? + ret = subprocess.run(["csf_test", "/dev/stdin"], + input=i, text=True, **cap) + if ret.stderr is None: + ret.stderr = "" + if ret.stdout is None: + ret.stdout = "" + return ret.stderr + ret.stdout + +def rebuild(): + try: + p = subprocess.run(["rebuild-mesa"]) + if p.returncode != 0: + return False + except FileNotFoundError: + pass + return True + +def go(text): + #print(interpret(text)) + #return + + if not rebuild(): + return + + print(run(text)) + #subprocess.run("ls /tmp/fdump.????? | tail -n2 | xargs diff -U3 -s", + # shell=True) + +os.environ["CSF_QUIET"] = "1" + +go(get_cmds("")) + +#for c in range(1, 64): +# val = c +# ret = run(get_cmds(ii(val))) +# print(str(val) + '\t' + [x for x in ret.split("\n") if x.startswith("0FFF10")][0]) + +#rebuild() +#for c in range(256): +# print(c, end=":") +# sys.stdout.flush() +# cmd = f"UNK 00 {hex(c)[2:]} 0x00000000" +# run(get_cmds(cmd)) + +#interpret(cmds) +#go(cmds) diff --git a/src/panfrost/csf_test/mali_base_csf_kernel.h b/src/panfrost/csf_test/mali_base_csf_kernel.h new file mode 100644 index 00000000000..f5f859eb9ad --- /dev/null +++ b/src/panfrost/csf_test/mali_base_csf_kernel.h @@ -0,0 +1,721 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_CSF_KERNEL_H_ +#define _UAPI_BASE_CSF_KERNEL_H_ + +#include + +/* Memory allocation, access/hint flags. + * + * See base_mem_alloc_flags. 
+ */ + +/* IN */ +/* Read access CPU side + */ +#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) + +/* Write access CPU side + */ +#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) + +/* Read access GPU side + */ +#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) + +/* Write access GPU side + */ +#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) + +/* Execute allowed on the GPU side + */ +#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) + +/* Will be permanently mapped in kernel space. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) + +/* The allocation will completely reside within the same 4GB chunk in the GPU + * virtual space. + * Since this flag is primarily required only for the TLS memory which will + * not be used to contain executable code and also not used for Tiler heap, + * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. + */ +#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) + +/* Userspace is not allowed to free this memory. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) + +#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) + +/* Grow backing store on GPU Page Fault + */ +#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) + +/* Page coherence Outer shareable, if available + */ +#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) + +/* Page coherence Inner shareable + */ +#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) + +/* IN/OUT */ +/* Should be cached on the CPU, returned if actually cached + */ +#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) + +/* IN/OUT */ +/* Must have same VA on both the GPU and the CPU + */ +#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) + +/* OUT */ +/* Must call mmap to acquire a GPU address for the alloc + */ +#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) + +/* IN */ +/* Page coherence Outer shareable, required. + */ +#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) + +/* Protected memory + */ +#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) + +/* Not needed physical memory + */ +#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) + +/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the + * addresses to be the same + */ +#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) + +/* CSF event memory + * + * If Outer shareable coherence is not specified or not available, then on + * allocation kbase will automatically use the uncached GPU mapping. + * There is no need for the client to specify BASE_MEM_UNCACHED_GPU + * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. + * + * This memory requires a permanent mapping + * + * See also kbase_reg_needs_kernel_mapping() + */ +#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) + +#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) + +/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu + * mode. Some components within the GPU might only be able to access memory + * that is GPU cacheable. Refer to the specific GPU implementation for more + * details. The 3 shareability flags will be ignored for GPU uncached memory. 
+ * If used while importing USER_BUFFER type memory, then the import will fail + * if the memory is not aligned to GPU and CPU cache line width. + */ +#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) + +/* + * Bits [22:25] for group_id (0~15). + * + * base_mem_group_id_set() should be used to pack a memory group ID into a + * base_mem_alloc_flags value instead of accessing the bits directly. + * base_mem_group_id_get() should be used to extract the memory group ID from + * a base_mem_alloc_flags value. + */ +#define BASEP_MEM_GROUP_ID_SHIFT 22 +#define BASE_MEM_GROUP_ID_MASK \ + ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) + +/* Must do CPU cache maintenance when imported memory is mapped/unmapped + * on GPU. Currently applicable to dma-buf type only. + */ +#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) + +/* OUT */ +/* Kernel side cache sync ops required */ +#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) + +/* Number of bits used as flags for base memory management + * + * Must be kept in sync with the base_mem_alloc_flags flags + */ +#define BASE_MEM_FLAGS_NR_BITS 29 + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) + +/* A mask for all output bits, excluding IN/OUT bits. + */ +#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP + +/* A mask for all input bits, including IN/OUT bits. + */ +#define BASE_MEM_FLAGS_INPUT_MASK \ + (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED \ + BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_20 + +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) +/* reserved handles ..-47< for future special handles */ +#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_FIRST_FREE_ADDRESS \ + ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) + +#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ + ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ + LOCAL_PAGE_SHIFT) + +/** + * Valid set of just-in-time memory allocation flags + */ +#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) + +/* Flags to pass to ::base_context_init. + * Flags can be ORed together to enable multiple things. + * + * These share the same space as BASEP_CONTEXT_FLAG_*, and so must + * not collide with them. + */ +typedef __u32 base_context_create_flags; + +/* No flags set */ +#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) + +/* Base context is embedded in a cctx object (flag used for CINSTR + * software counter macros) + */ +#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) + +/* Base context is a 'System Monitor' context for Hardware counters. + * + * One important side effect of this is that job submission is disabled. + */ +#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED \ + ((base_context_create_flags)1 << 1) + +/* Base context creates a CSF event notification thread. 
+ * + * The creation of a CSF event notification thread is conditional but + * mandatory for the handling of CSF events. + */ +#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) + +/* Bit-shift used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) + +/* Bitmask used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ + ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* Bitpattern describing the base_context_create_flags that can be + * passed to the kernel + */ +#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ + (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | \ + BASEP_CONTEXT_MMU_GROUP_ID_MASK) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | \ + BASE_CONTEXT_CSF_EVENT_THREAD | \ + BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, + * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) + */ +#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) + +/* Indicate that job dumping is enabled. This could affect certain timers + * to account for the performance impact. + */ +#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) + +/* Enable KBase tracepoints for CSF builds */ +#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) + +/* Enable additional CSF Firmware side tracepoints */ +#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) + +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ + BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ + BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) + +/* Number of pages mapped into the process address space for a bound GPU + * command queue. A pair of input/output pages and a Hw doorbell page + * are mapped to enable direct submission of commands to Hw. + */ +#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) + +#define BASE_QUEUE_MAX_PRIORITY (15U) + +/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ +#define BASEP_EVENT_VAL_INDEX (0U) +#define BASEP_EVENT_ERR_INDEX (1U) + +/* The upper limit for number of objects that could be waited/set per command. + * This limit is now enforced as internally the error inherit inputs are + * converted to 32-bit flags in a __u32 variable occupying a previously padding + * field. + */ +#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) + +/** + * enum base_kcpu_command_type - Kernel CPU queue command type. 
+ * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, + * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, + * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, + * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, + * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, + * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, + * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, + */ +enum base_kcpu_command_type { + BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_SET, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, + BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, + BASE_KCPU_COMMAND_TYPE_JIT_FREE, + BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER +}; + +/** + * enum base_queue_group_priority - Priority of a GPU Command Queue Group. + * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group + * priority levels. + * + * Currently this is in order of highest to lowest, but if new levels are added + * then those new levels may be out of order to preserve the ABI compatibility + * with previous releases. At that point, ensure assignment to + * the 'priority' member in &kbase_queue_group is updated to ensure it remains + * a linear ordering. + * + * There should be no gaps in the enum, otherwise use of + * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. 
+ */ +enum base_queue_group_priority { + BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, + BASE_QUEUE_GROUP_PRIORITY_MEDIUM, + BASE_QUEUE_GROUP_PRIORITY_LOW, + BASE_QUEUE_GROUP_PRIORITY_REALTIME, + BASE_QUEUE_GROUP_PRIORITY_COUNT +}; + +struct base_kcpu_command_fence_info { + __u64 fence; +}; + +struct base_cqs_wait_info { + __u64 addr; + __u32 val; + __u32 padding; +}; + +struct base_kcpu_command_cqs_wait_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +struct base_cqs_set { + __u64 addr; +}; + +struct base_kcpu_command_cqs_set_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * typedef basep_cqs_data_type - Enumeration of CQS Data Types + * + * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value + * is an unsigned 32-bit integer + * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value + * is an unsigned 64-bit integer + */ +typedef enum PACKED { + BASEP_CQS_DATA_TYPE_U32 = 0, + BASEP_CQS_DATA_TYPE_U64 = 1, +} basep_cqs_data_type; + +/** + * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait + * Operation conditions + * + * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Less than or Equal to + * the Wait Operation value + * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Greater than the Wait Operation value + */ +typedef enum { + BASEP_CQS_WAIT_OPERATION_LE = 0, + BASEP_CQS_WAIT_OPERATION_GT = 1, +} basep_cqs_wait_operation_op; + +struct base_cqs_wait_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information + * about the Timeline CQS wait objects + * + * @objs: An array of Timeline CQS waits. + * @nr_objs: Number of Timeline CQS waits in the array. + * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field + * to be served as the source for importing into the + * queue's error-state. + */ +struct base_kcpu_command_cqs_wait_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +/** + * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations + * + * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value + * to a synchronization object + * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value + * of a synchronization object + */ +typedef enum { + BASEP_CQS_SET_OPERATION_ADD = 0, + BASEP_CQS_SET_OPERATION_SET = 1, +} basep_cqs_set_operation_op; + +struct base_cqs_set_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_set_operation_info - structure which contains information + * about the Timeline CQS set objects + * + * @objs: An array of Timeline CQS sets. + * @nr_objs: Number of Timeline CQS sets in the array. + * @padding: Structure padding, unused bytes. + */ +struct base_kcpu_command_cqs_set_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * struct base_kcpu_command_import_info - structure which contains information + * about the imported buffer. + * + * @handle: Address of imported user buffer. + */ +struct base_kcpu_command_import_info { + __u64 handle; +}; + +/** + * struct base_kcpu_command_jit_alloc_info - structure which contains + * information about jit memory allocation. 
+ * + * @info: An array of elements of the + * struct base_jit_alloc_info type. + * @count: The number of elements in the info array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_alloc_info { + __u64 info; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_jit_free_info - structure which contains + * information about jit memory which is to be freed. + * + * @ids: An array containing the JIT IDs to free. + * @count: The number of elements in the ids array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_free_info { + __u64 ids; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_group_suspend_info - structure which contains + * suspend buffer data captured for a suspended queue group. + * + * @buffer: Pointer to an array of elements of the type char. + * @size: Number of elements in the @buffer array. + * @group_handle: Handle to the mapping of CSG. + * @padding: padding to a multiple of 64 bits. + */ +struct base_kcpu_command_group_suspend_info { + __u64 buffer; + __u32 size; + __u8 group_handle; + __u8 padding[3]; +}; + + +/** + * struct base_kcpu_command - kcpu command. + * @type: type of the kcpu command, one enum base_kcpu_command_type + * @padding: padding to a multiple of 64 bits + * @info: structure which contains information about the kcpu command; + * actual type is determined by @p type + * @info.fence: Fence + * @info.cqs_wait: CQS wait + * @info.cqs_set: CQS set + * @info.import: import + * @info.jit_alloc: jit allocation + * @info.jit_free: jit deallocation + * @info.suspend_buf_copy: suspend buffer copy + * @info.sample_time: sample time + * @info.padding: padding + */ +struct base_kcpu_command { + __u8 type; + __u8 padding[sizeof(__u64) - sizeof(__u8)]; + union { + struct base_kcpu_command_fence_info fence; + struct base_kcpu_command_cqs_wait_info cqs_wait; + struct base_kcpu_command_cqs_set_info cqs_set; + struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; + struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; + struct base_kcpu_command_import_info import; + struct base_kcpu_command_jit_alloc_info jit_alloc; + struct base_kcpu_command_jit_free_info jit_free; + struct base_kcpu_command_group_suspend_info suspend_buf_copy; + __u64 padding[2]; /* No sub-struct should be larger */ + } info; +}; + +/** + * struct basep_cs_stream_control - CSI capabilities. + * + * @features: Features of this stream + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_stream_control { + __u32 features; + __u32 padding; +}; + +/** + * struct basep_cs_group_control - CSG interface capabilities. + * + * @features: Features of this group + * @stream_num: Number of streams in this group + * @suspend_size: Size in bytes of the suspend buffer for this group + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_group_control { + __u32 features; + __u32 stream_num; + __u32 suspend_size; + __u32 padding; +}; + +/** + * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault + * error information associated with GPU command queue group. + * + * @sideband: Additional information of the unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). 
+ * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_group_error_fatal_payload { + __u64 sideband; + __u32 status; + __u32 padding; +}; + +/** + * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault + * error information related to GPU command queue. + * + * @sideband: Additional information about this unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). + * @csi_index: Index of the CSF interface the queue is bound to. + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_error_fatal_payload { + __u64 sideband; + __u32 status; + __u8 csi_index; + __u8 padding[3]; +}; + +/** + * enum base_gpu_queue_group_error_type - GPU Fatal error type. + * + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU + * command queue group. + * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU + * command queue. + * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with + * progress timeout. + * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out + * of tiler heap memory. + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types + * + * This type is used for &struct_base_gpu_queue_group_error.error_type. + */ +enum base_gpu_queue_group_error_type { + BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, + BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, + BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT +}; + +/** + * struct base_gpu_queue_group_error - Unrecoverable fault information + * @error_type: Error type of @base_gpu_queue_group_error_type + * indicating which field in union payload is filled + * @padding: Unused bytes for 64bit boundary + * @payload: Input Payload + * @payload.fatal_group: Unrecoverable fault error associated with + * GPU command queue group + * @payload.fatal_queue: Unrecoverable fault error associated with command queue + */ +struct base_gpu_queue_group_error { + __u8 error_type; + __u8 padding[7]; + union { + struct base_gpu_queue_group_error_fatal_payload fatal_group; + struct base_gpu_queue_error_fatal_payload fatal_queue; + } payload; +}; + +/** + * enum base_csf_notification_type - Notification type + * + * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event + * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal + * error + * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu + * queue + * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type + * + * This type is used for &struct_base_csf_notification.type. 
+ */ +enum base_csf_notification_type { + BASE_CSF_NOTIFICATION_EVENT = 0, + BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, + BASE_CSF_NOTIFICATION_COUNT +}; + +/** + * struct base_csf_notification - Event or error notification + * + * @type: Notification type of @base_csf_notification_type + * @padding: Padding for 64bit boundary + * @payload: Input Payload + * @payload.align: To fit the struct into a 64-byte cache line + * @payload.csg_error: CSG error + * @payload.csg_error.handle: Handle of GPU command queue group associated with + * fatal error + * @payload.csg_error.padding: Padding + * @payload.csg_error.error: Unrecoverable fault error + * + */ +struct base_csf_notification { + __u8 type; + __u8 padding[7]; + union { + struct { + __u8 handle; + __u8 padding[7]; + struct base_gpu_queue_group_error error; + } csg_error; + + __u8 align[56]; + } payload; +}; + +#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ diff --git a/src/panfrost/csf_test/mali_base_kernel.h b/src/panfrost/csf_test/mali_base_kernel.h new file mode 100644 index 00000000000..305956f341a --- /dev/null +++ b/src/panfrost/csf_test/mali_base_kernel.h @@ -0,0 +1,746 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * Base structures shared with the kernel. + */ + +#ifndef _UAPI_BASE_KERNEL_H_ +#define _UAPI_BASE_KERNEL_H_ + +#include + +struct base_mem_handle { + struct { + __u64 handle; + } basep; +}; + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +#define BASE_MAX_COHERENT_GROUPS 16 + +#if defined(PAGE_MASK) && defined(PAGE_SHIFT) +#define LOCAL_PAGE_SHIFT PAGE_SHIFT +#define LOCAL_PAGE_LSB ~PAGE_MASK +#else +#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 +#endif + +#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) +#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) +#else +#error Failed to find page size +#endif +#endif + +/* Physical memory group ID for normal usage. + */ +#define BASE_MEM_GROUP_DEFAULT (0) + +/* Number of physical memory groups. + */ +#define BASE_MEM_GROUP_COUNT (16) + +/** + * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. + * + * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator + * in order to determine the best cache policy. Some combinations are + * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), + * which defines a write-only region on the CPU side, which is + * heavily read by the CPU... + * Other flags are only meaningful to a particular allocator. + * More flags can be added to this list, as long as they don't clash + * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). 
+ */ +typedef __u32 base_mem_alloc_flags; + +/* A mask for all the flags which are modifiable via the base_mem_set_flags + * interface. + */ +#define BASE_MEM_FLAGS_MODIFIABLE \ + (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ + BASE_MEM_COHERENT_LOCAL) + +/* A mask of all the flags that can be returned via the base_mem_get_flags() + * interface. + */ +#define BASE_MEM_FLAGS_QUERYABLE \ + (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ + BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ + BASEP_MEM_FLAGS_KERNEL_ONLY)) + +/** + * enum base_mem_import_type - Memory types supported by @a base_mem_import + * + * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type + * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) + * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a + * base_mem_import_user_buffer + * + * Each type defines what the supported handle type is. + * + * If any new type is added here ARM must be contacted + * to allocate a numeric value for it. + * Do not just add a new type without synchronizing with ARM + * as future releases from ARM might include other new types + * which could clash with your custom types. + */ +enum base_mem_import_type { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + /* + * Import type with value 1 is deprecated. + */ + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 +}; + +/** + * struct base_mem_import_user_buffer - Handle of an imported user buffer + * + * @ptr: address of imported user buffer + * @length: length of imported user buffer in bytes + * + * This structure is used to represent a handle of an imported user buffer. + */ + +struct base_mem_import_user_buffer { + __u64 ptr; + __u64 length; +}; + +/* Mask to detect 4GB boundary alignment */ +#define BASE_MEM_MASK_4GB 0xfffff000UL +/* Mask to detect 4GB boundary (in page units) alignment */ +#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) + +/* Limit on the 'extension' parameter for an allocation with the + * BASE_MEM_TILER_ALIGN_TOP flag set + * + * This is the same as the maximum limit for a Buffer Descriptor's chunk size + */ +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ + (21u - (LOCAL_PAGE_SHIFT)) +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ + (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) + +/* Bit mask of cookies used for for memory allocation setup */ +#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ + +/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ +#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ + +/* + * struct base_fence - Cross-device synchronisation fence. + * + * A fence is used to signal when the GPU has finished accessing a resource that + * may be shared with other devices, and also to delay work done asynchronously + * by the GPU until other devices have finished accessing a shared resource. + */ +struct base_fence { + struct { + int fd; + int stream_fd; + } basep; +}; + +/** + * struct base_mem_aliasing_info - Memory aliasing info + * + * Describes a memory handle to be aliased. + * A subset of the handle can be chosen for aliasing, given an offset and a + * length. + * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a + * region where a special page is mapped with a write-alloc cache setup, + * typically used when the write result of the GPU isn't needed, but the GPU + * must write anyway. 
+ * + * Offset and length are specified in pages. + * Offset must be within the size of the handle. + * Offset+length must not overrun the size of the handle. + * + * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * @offset: Offset within the handle to start aliasing from, in pages. + * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. + * @length: Length to alias, in pages. For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * specifies the number of times the special page is needed. + */ +struct base_mem_aliasing_info { + struct base_mem_handle handle; + __u64 offset; + __u64 length; +}; + +/* Maximum percentage of just-in-time memory allocation trimming to perform + * on free. + */ +#define BASE_JIT_MAX_TRIM_LEVEL (100) + +/* Maximum number of concurrent just-in-time memory allocations. + */ +#define BASE_JIT_ALLOC_COUNT (255) + +/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 + * + * jit_version is 1 + * + * Due to the lack of padding specified, user clients between 32 and 64-bit + * may have assumed a different size of the struct + * + * An array of structures was not supported + */ +struct base_jit_alloc_info_10_2 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; +}; + +/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up + * to 11.19 + * + * This structure had a number of modifications during and after kernel driver + * version 11.5, but remains size-compatible throughout its version history, and + * with earlier variants compatible with future variants by requiring + * zero-initialization to the unused space in the structure. + * + * jit_version is 2 + * + * Kernel driver version history: + * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes + * must be zero. Kbase minor version was not incremented, so some + * versions of 11.5 do not have this change. + * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase + * minor version not incremented) + * 11.6: Added 'flags', replacing 1 padding byte + * 11.10: Arrays of this structure are supported + */ +struct base_jit_alloc_info_11_5 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; +}; + +/** + * struct base_jit_alloc_info - Structure which describes a JIT allocation + * request. + * @gpu_alloc_addr: The GPU virtual address to write the JIT + * allocated GPU virtual address to. + * @va_pages: The minimum number of virtual pages required. + * @commit_pages: The minimum number of physical pages which + * should back the allocation. + * @extension: Granularity of physical pages to grow the + * allocation by during a fault. + * @id: Unique ID provided by the caller, this is used + * to pair allocation and free requests. + * Zero is not a valid value. + * @bin_id: The JIT allocation bin, used in conjunction with + * @max_allocations to limit the number of each + * type of JIT allocation. + * @max_allocations: The maximum number of allocations allowed within + * the bin specified by @bin_id. Should be the same + * for all allocations within the same bin. + * @flags: flags specifying the special requirements for + * the JIT allocation, see + * %BASE_JIT_ALLOC_VALID_FLAGS + * @padding: Expansion space - should be initialised to zero + * @usage_id: A hint about which allocation should be reused. 
+ * The kernel should attempt to use a previous + * allocation with the same usage_id + * @heap_info_gpu_addr: Pointer to an object in GPU memory describing + * the actual usage of the region. + * + * jit_version is 3. + * + * When modifications are made to this structure, it is still compatible with + * jit_version 3 when: a) the size is unchanged, and b) new members only + * replace the padding bytes. + * + * Previous jit_version history: + * jit_version == 1, refer to &base_jit_alloc_info_10_2 + * jit_version == 2, refer to &base_jit_alloc_info_11_5 + * + * Kbase version history: + * 11.20: added @heap_info_gpu_addr + */ +struct base_jit_alloc_info { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; + __u64 heap_info_gpu_addr; +}; + +enum base_external_resource_access { + BASE_EXT_RES_ACCESS_SHARED, + BASE_EXT_RES_ACCESS_EXCLUSIVE +}; + +struct base_external_resource { + __u64 ext_resource; +}; + + +/** + * The maximum number of external resources which can be mapped/unmapped + * in a single request. + */ +#define BASE_EXT_RES_COUNT_MAX 10 + +/** + * struct base_external_resource_list - Structure which describes a list of + * external resources. + * @count: The number of resources. + * @ext_res: Array of external resources which is + * sized at allocation time. + */ +struct base_external_resource_list { + __u64 count; + struct base_external_resource ext_res[1]; +}; + +struct base_jd_debug_copy_buffer { + __u64 address; + __u64 size; + struct base_external_resource extres; +}; + +#define GPU_MAX_JOB_SLOTS 16 + +/** + * User-side Base GPU Property Queries + * + * The User-side Base GPU Property Query interface encapsulates two + * sub-modules: + * + * - "Dynamic GPU Properties" + * - "Base Platform Config GPU Properties" + * + * Base only deals with properties that vary between different GPU + * implementations - the Dynamic GPU properties and the Platform Config + * properties. + * + * For properties that are constant for the GPU Architecture, refer to the + * GPU module. However, we will discuss their relevance here just to + * provide background information. + * + * About the GPU Properties in Base and GPU modules + * + * The compile-time properties (Platform Config, GPU Compile-time + * properties) are exposed as pre-processor macros. + * + * Complementing the compile-time properties are the Dynamic GPU + * Properties, which act as a conduit for the GPU Configuration + * Discovery. + * + * In general, the dynamic properties are present to verify that the platform + * has been configured correctly with the right set of Platform Config + * Compile-time Properties. + * + * As a consistent guide across the entire DDK, the choice for dynamic or + * compile-time should consider the following, in order: + * 1. Can the code be written so that it doesn't need to know the + * implementation limits at all? + * 2. If you need the limits, get the information from the Dynamic Property + * lookup. This should be done once as you fetch the context, and then cached + * as part of the context data structure, so it's cheap to access. + * 3. If there's a clear and arguable inefficiency in using Dynamic Properties, + * then use a Compile-Time Property (Platform Config, or GPU Compile-time + * property). 
Examples of where this might be sensible follow: + * - Part of a critical inner-loop + * - Frequent re-use throughout the driver, causing significant extra load + * instructions or control flow that would be worthwhile optimizing out. + * + * We cannot provide an exhaustive set of examples, neither can we provide a + * rule for every possible situation. Use common sense, and think about: what + * the rest of the driver will be doing; how the compiler might represent the + * value if it is a compile-time constant; whether an OEM shipping multiple + * devices would benefit much more from a single DDK binary, instead of + * insignificant micro-optimizations. + * + * Dynamic GPU Properties + * + * Dynamic GPU properties are presented in two sets: + * 1. the commonly used properties in @ref base_gpu_props, which have been + * unpacked from GPU register bitfields. + * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props + * (also a member of base_gpu_props). All of these are presented in + * the packed form, as presented by the GPU registers themselves. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + * The properties returned extend the GPU Configuration Discovery + * registers. For example, GPU clock speed is not specified in the GPU + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. + * + * The GPU properties are obtained by a call to + * base_get_gpu_props(). This simply returns a pointer to a const + * base_gpu_props structure. It is constant for the life of a base + * context. Multiple calls to base_get_gpu_props() to a base context + * return the same pointer to a constant structure. This avoids cache pollution + * of the common data. + * + * This pointer must not be freed, because it does not point to the start of a + * region allocated by the memory allocator; instead, just close the @ref + * base_context. + * + * + * Kernel Operation + * + * During Base Context Create time, user-side makes a single kernel call: + * - A call to fill user memory with GPU information structures + * + * The kernel-side will fill the provided the entire processed base_gpu_props + * structure, because this information is required in both + * user and kernel side; it does not make sense to decode it twice. + * + * Coherency groups must be derived from the bitmasks, but this can be done + * kernel side, and just once at kernel startup: Coherency groups must already + * be known kernel-side, to support chains that specify a 'Only Coherent Group' + * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. + * + * Coherency Group calculation + * + * Creation of the coherent group data is done at device-driver startup, and so + * is one-time. This will most likely involve a loop with CLZ, shifting, and + * bit clearing on the L2_PRESENT mask, depending on whether the + * system is L2 Coherent. The number of shader cores is done by a + * population count, since faulty cores may be disabled during production, + * producing a non-contiguous mask. 
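The population-count step described above can be sketched as follows; this is not code from the patch, only an illustration of why counting bits, rather than taking the highest set bit, is needed once faulty cores have been fused off.

#include <stdint.h>

/* Count shader cores from a possibly non-contiguous present mask. */
static unsigned int shader_core_count(uint64_t shader_present)
{
        return (unsigned int)__builtin_popcountll(shader_present);
}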
+ * + * The memory requirements for this algorithm can be determined either by a __u64 + * population count on the L2_PRESENT mask (a LUT helper already is + * required for the above), or simple assumption that there can be no more than + * 16 coherent groups, since core groups are typically 4 cores. + */ + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +#define BASE_MAX_COHERENT_GROUPS 16 +/** + * struct mali_base_gpu_core_props - GPU core props info + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to allign to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + * @num_exec_engines: The number of execution engines. + */ +struct mali_base_gpu_core_props { + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; + __u8 num_exec_engines; +}; + +/* + * More information is possible - but associativity and bus width are not + * required by upper-level apis. + */ +struct mali_base_gpu_l2_cache_props { + __u8 log2_line_size; + __u8 log2_cache_size; + __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + __u8 padding[5]; +}; + +struct mali_base_gpu_tiler_props { + __u32 bin_size_bytes; /* Max is 4*2^15 */ + __u32 max_active_levels; /* Max is 2^15 */ +}; + +/** + * struct mali_base_gpu_thread_props - GPU threading system details. + * @max_threads: Max. number of threads per core + * @max_workgroup_size: Max. number of threads per workgroup + * @max_barrier_size: Max. number of threads that can synchronize on a + * simple barrier + * @max_registers: Total size [1..65535] of the register file available + * per core. + * @max_task_queue: Max. tasks [1..255] which may be sent to a core + * before it becomes blocked. + * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split + * field. 
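Since the @texture_features comment above recommends checking the relevant bit before using a format, here is a small sketch of such a check. Treating the four TEXTURE_FEATURES registers as one contiguous bit field, and the format index 'fmt' itself, are assumptions of this sketch.

#include <stdbool.h>

static bool format_supported(const struct mali_base_gpu_core_props *props,
                             unsigned int fmt)
{
        /* 32 format bits per TEXTURE_FEATURES register */
        return props->texture_features[fmt / 32] & (1u << (fmt % 32));
}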
+ * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, + * 3 = SW Model/Emulation + * @padding: padding to allign to 8-byte + * @tls_alloc: Number of threads per core that TLS must be + * allocated for + */ +struct mali_base_gpu_thread_props { + __u32 max_threads; + __u32 max_workgroup_size; + __u32 max_barrier_size; + __u16 max_registers; + __u8 max_task_queue; + __u8 max_thread_group_split; + __u8 impl_tech; + __u8 padding[3]; + __u32 tls_alloc; +}; + +/** + * struct mali_base_gpu_coherent_group - descriptor for a coherent group + * @core_mask: Core restriction mask required for the group + * @num_cores: Number of cores in the group + * @padding: padding to allign to 8-byte + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_base_gpu_coherent_group { + __u64 core_mask; + __u16 num_cores; + __u16 padding[3]; +}; + +/** + * struct mali_base_gpu_coherent_group_info - Coherency group information + * @num_groups: Number of coherent groups in the GPU. + * @num_core_groups: Number of core groups (coherent or not) in the GPU. + * Equivalent to the number of L2 Caches. + * The GPU Counter dumping writes 2048 bytes per core group, regardless + * of whether the core groups are coherent or not. Hence this member is + * needed to calculate how much memory is required for dumping. + * @note Do not use it to work out how many valid elements are in the + * group[] member. Use num_groups instead. + * @coherency: Coherency features of the memory, accessed by gpu_mem_features + * methods + * @padding: padding to allign to 8-byte + * @group: Descriptors of coherent groups + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. + */ +struct mali_base_gpu_coherent_group_info { + __u32 num_groups; + __u32 num_core_groups; + __u32 coherency; + __u32 padding; + struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; +}; + +/** + * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware + * Configuration Discovery registers. + * @shader_present: Shader core present bitmap + * @tiler_present: Tiler core present bitmap + * @l2_present: Level 2 cache present bitmap + * @stack_present: Core stack present bitmap + * @l2_features: L2 features + * @core_features: Core features + * @mem_features: Mem features + * @mmu_features: Mmu features + * @as_present: Bitmap of address spaces present + * @js_present: Job slots present + * @js_features: Array of job slot features. 
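A short sketch of consuming the coherency information defined above: per the note on @num_core_groups, only num_groups entries of group[] are valid, and num_cores is already a cached population count of each core_mask.

/* Sum the cached per-group core counts (illustrative only). */
static unsigned int coherent_core_total(
        const struct mali_base_gpu_coherent_group_info *info)
{
        unsigned int i, total = 0;

        for (i = 0; i < info->num_groups; i++)
                total += info->group[i].num_cores;

        return total;
}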
+ * @tiler_features: Tiler features + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU + * @gpu_id: GPU and revision identifier + * @thread_max_threads: Maximum number of threads per core + * @thread_max_workgroup_size: Maximum number of threads per workgroup + * @thread_max_barrier_size: Maximum number of threads per barrier + * @thread_features: Thread features + * @coherency_mode: Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register + * @thread_tls_alloc: Number of threads per core that TLS must be allocated for + * @gpu_features: GPU features + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct gpu_raw_gpu_props { + __u64 shader_present; + __u64 tiler_present; + __u64 l2_present; + __u64 stack_present; + __u32 l2_features; + __u32 core_features; + __u32 mem_features; + __u32 mmu_features; + + __u32 as_present; + + __u32 js_present; + __u32 js_features[GPU_MAX_JOB_SLOTS]; + __u32 tiler_features; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + __u32 gpu_id; + + __u32 thread_max_threads; + __u32 thread_max_workgroup_size; + __u32 thread_max_barrier_size; + __u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + __u32 coherency_mode; + + __u32 thread_tls_alloc; + __u64 gpu_features; +}; + +/** + * struct base_gpu_props - Return structure for base_get_gpu_props(). + * @core_props: Core props. + * @l2_props: L2 props. + * @unused_1: Keep for backwards compatibility. + * @tiler_props: Tiler props. + * @thread_props: Thread props. + * @raw_props: This member is large, likely to be 128 bytes. + * @coherency_info: This must be last member of the structure. + * + * NOTE: the raw_props member in this data structure contains the register + * values from which the value of the other members are derived. The derived + * members exist to allow for efficient access and/or shielding the details + * of the layout of the registers. + */ +struct base_gpu_props { + struct mali_base_gpu_core_props core_props; + struct mali_base_gpu_l2_cache_props l2_props; + __u64 unused_1; + struct mali_base_gpu_tiler_props tiler_props; + struct mali_base_gpu_thread_props thread_props; + struct gpu_raw_gpu_props raw_props; + struct mali_base_gpu_coherent_group_info coherency_info; +}; + +#define BASE_MEM_GROUP_ID_GET(flags) \ + ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) + +#define BASE_MEM_GROUP_ID_SET(id) \ + (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? 
\ + BASE_MEM_GROUP_DEFAULT : \ + id) \ + << BASEP_MEM_GROUP_ID_SHIFT) & \ + BASE_MEM_GROUP_ID_MASK) + +#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ + (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ + ((base_context_create_flags)(group_id) \ + << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) + +#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ + ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ + BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* + * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These + * flags are also used, where applicable, for specifying which fields + * are valid following the request operation. + */ + +/* For monotonic (counter) timefield */ +#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) +/* For system wide timestamp */ +#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) +/* For GPU cycle counter */ +#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) +/* Specify kernel GPU register timestamp */ +#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) +/* Specify userspace cntvct_el0 timestamp source */ +#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) + +#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ + BASE_TIMEINFO_MONOTONIC_FLAG | \ + BASE_TIMEINFO_TIMESTAMP_FLAG | \ + BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ + BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ + BASE_TIMEINFO_USER_SOURCE_FLAG) + +/* Maximum number of source allocations allowed to create an alias allocation. + * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array + * layers, since each cube map in the array will have 6 faces. + */ +#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) + +#endif /* _UAPI_BASE_KERNEL_H_ */ diff --git a/src/panfrost/csf_test/mali_gpu_csf_registers.h b/src/panfrost/csf_test/mali_gpu_csf_registers.h new file mode 100644 index 00000000000..17e338cb238 --- /dev/null +++ b/src/panfrost/csf_test/mali_gpu_csf_registers.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * This header was originally autogenerated, but it is now ok (and + * expected) to have to add to it. + */ + +#ifndef _UAPI_GPU_CSF_REGISTERS_H_ +#define _UAPI_GPU_CSF_REGISTERS_H_ + +/* Only user block defines are included. 
HI words have been removed */ + +/* CS_USER_INPUT_BLOCK register offsets */ +#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ +#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ + +/* CS_USER_OUTPUT_BLOCK register offsets */ +#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ +#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ + +/* USER register offsets */ +#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ + +#endif diff --git a/src/panfrost/csf_test/mali_kbase_csf_ioctl.h b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h new file mode 100644 index 00000000000..3df8a01699f --- /dev/null +++ b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h @@ -0,0 +1,483 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_CSF_IOCTL_H_ +#define _UAPI_KBASE_CSF_IOCTL_H_ + +#include +#include + +/* + * 1.0: + * - CSF IOCTL header separated from JM + * 1.1: + * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 1.2: + * - Add new CSF GPU_FEATURES register into the property structure + * returned by KBASE_IOCTL_GET_GPUPROPS + * 1.3: + * - Add __u32 group_uid member to + * &struct_kbase_ioctl_cs_queue_group_create.out + * 1.4: + * - Replace padding in kbase_ioctl_cs_get_glb_iface with + * instr_features member of same size + * 1.5: + * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new + * queue registration call with extended format for supporting CS + * trace configurations with CSF trace_command. + * 1.6: + * - Added new HW performance counters interface to all GPUs. + * 1.7: + * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use + * 1.8: + * - Removed Kernel legacy HWC interface + */ + +#define BASE_UK_VERSION_MAJOR 1 +#define BASE_UK_VERSION_MINOR 8 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + + +/** + * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the + * base back-end + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * + * @Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. 
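Referring back to the CS_INSERT / CS_EXTRACT user-block offsets defined just above: once a queue has been bound (see the bind ioctl below) and its input/output pages mapped, userspace typically publishes new work by advancing the insert offset and polls the extract offset for progress. The page pointers and the 64-bit accesses here are assumptions of this sketch, not something this header specifies.

#include <stdint.h>

static void cs_ring_update(volatile uint8_t *input_page,
                           const volatile uint8_t *output_page,
                           uint64_t insert_offset, uint64_t *extract_offset)
{
        /* Tell the CS how far commands have been written... */
        *(volatile uint64_t *)(input_page + CS_INSERT) = insert_offset;
        /* ...and read back how far it has consumed them. */
        *extract_offset = *(const volatile uint64_t *)(output_page + CS_EXTRACT);
}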
+ * Any change of this struct should also be mirrored to the latter. + */ +struct kbase_ioctl_cs_queue_register { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER \ + _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) + +/** + * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler + * to notify that a queue has been updated + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_kick { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_KICK \ + _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) + +/** + * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group + * + * @in: Input parameters + * @in.buffer_gpu_addr: GPU address of the buffer backing the queue + * @in.group_handle: Handle of the group to which the queue should be bound + * @in.csi_index: Index of the CSF interface the queue should be bound to + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.mmap_handle: Handle to be used for creating the mapping of CS + * input/output pages + */ +union kbase_ioctl_cs_queue_bind { + struct { + __u64 buffer_gpu_addr; + __u8 group_handle; + __u8 csi_index; + __u8 padding[6]; + } in; + struct { + __u64 mmap_handle; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_BIND \ + _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) + +/** + * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the + * base back-end in extended format, + * involving trace buffer configuration + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * @ex_offset_var_addr: GPU address of the trace buffer write offset variable + * @ex_buffer_base: Trace buffer GPU base address for the queue + * @ex_buffer_size: Size of the trace buffer in bytes + * @ex_event_size: Trace event write size, in log2 designation + * @ex_event_state: Trace event states configuration + * @ex_padding: Currently unused, must be zero + * + * @Note: There is an identical sub-section at the start of this struct to that + * of @ref kbase_ioctl_cs_queue_register. Any change of this sub-section + * must also be mirrored to the latter. Following the said sub-section, + * the remaining fields forms the extension, marked with ex_*. + */ +struct kbase_ioctl_cs_queue_register_ex { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; + __u64 ex_offset_var_addr; + __u64 ex_buffer_base; + __u32 ex_buffer_size; + __u8 ex_event_size; + __u8 ex_event_state; + __u8 ex_padding[2]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ + _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) + +/** + * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_terminate { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) + +/** + * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue + * group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. 
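To show how the three queue ioctls above fit together, a hedged sketch of the register → bind → kick sequence. Error handling is minimal, the mmap of the returned handle is elided, and 'dev_fd' is simply the opened kbase device node.

#include <stdint.h>
#include <sys/ioctl.h>

static int queue_register_bind_kick(int dev_fd, uint64_t ringbuf_gpu_va,
                                    uint32_t ringbuf_size,
                                    uint8_t group_handle, uint8_t csi_index)
{
        struct kbase_ioctl_cs_queue_register reg = {
                .buffer_gpu_addr = ringbuf_gpu_va,
                .buffer_size = ringbuf_size,
        };
        union kbase_ioctl_cs_queue_bind bind = { .in = {
                .buffer_gpu_addr = ringbuf_gpu_va,
                .group_handle = group_handle,
                .csi_index = csi_index,
        } };
        struct kbase_ioctl_cs_queue_kick kick = {
                .buffer_gpu_addr = ringbuf_gpu_va,
        };

        if (ioctl(dev_fd, KBASE_IOCTL_CS_QUEUE_REGISTER, &reg) ||
            ioctl(dev_fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind))
                return -1;
        /* ... mmap bind.out.mmap_handle, write commands, bump CS_INSERT ... */
        return ioctl(dev_fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick);
}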
+ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. + */ +union kbase_ioctl_cs_queue_group_create_1_6 { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ + _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) + +/** + * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. 
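A sketch of creating a queue group with the 1.6 layout just defined. Every value is illustrative, and whether this variant or the later one (with the extra reserved field, below) applies depends on the interface version negotiated with the kernel.

#include <stdint.h>
#include <sys/ioctl.h>

static int group_create_example(int dev_fd, uint8_t *handle_out)
{
        union kbase_ioctl_cs_queue_group_create_1_6 create = { .in = {
                .tiler_mask = ~0ULL,     /* allow every tiler endpoint */
                .fragment_mask = ~0ULL,
                .compute_mask = ~0ULL,
                .cs_min = 8,             /* illustrative */
                .priority = 1,           /* illustrative */
                .tiler_max = 64,
                .fragment_max = 64,
                .compute_max = 64,
        } };

        if (ioctl(dev_fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create))
                return -1;
        *handle_out = create.out.group_handle;
        return 0;
}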
+ */ +union kbase_ioctl_cs_queue_group_create { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + __u64 reserved; + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ + _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) + +/** + * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group + * + * @group_handle: Handle of the queue group to be terminated + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_cs_queue_group_term { + __u8 group_handle; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) + +#define KBASE_IOCTL_CS_EVENT_SIGNAL \ + _IO(KBASE_IOCTL_TYPE, 44) + +typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ + +/** + * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue + * + * @id: ID of the new command queue returned by the kernel + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_new { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ + _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) + +/** + * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue + * + * @id: ID of the command queue to be destroyed + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_delete { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ + _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) + +/** + * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue + * + * @addr: Memory address of an array of struct base_kcpu_queue_command + * @nr_commands: Number of commands in the array + * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_enqueue { + __u64 addr; + __u32 nr_commands; + base_kcpu_queue_id id; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ + _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) + +/** + * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap + * @in: Input parameters + * @in.chunk_size: Size of each chunk. + * @in.initial_chunks: Initial number of chunks that heap will be created with. + * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. + * @in.target_in_flight: Number of render-passes that the driver should attempt to + * keep in flight for which allocation of new chunks is + * allowed. + * @in.group_id: Group ID to be used for physical allocations. + * @in.padding: Padding + * @out: Output parameters + * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up + * for the heap. + * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, + * actually points to the header of heap chunk and not to + * the low address of free memory in the chunk. 
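A sketch of the KCPU queue lifecycle implied by the ioctls above: create a queue, then enqueue an array of commands against the returned ID. The command array address and count are placeholders, and the layout of struct base_kcpu_queue_command lives elsewhere in the kbase headers rather than in this hunk.

#include <stdint.h>
#include <sys/ioctl.h>

static int kcpu_submit(int dev_fd, uint64_t commands_addr, uint32_t n_commands)
{
        struct kbase_ioctl_kcpu_queue_new queue = {0};

        if (ioctl(dev_fd, KBASE_IOCTL_KCPU_QUEUE_CREATE, &queue))
                return -1;

        struct kbase_ioctl_kcpu_queue_enqueue enqueue = {
                .addr = commands_addr,   /* array of base_kcpu_queue_command */
                .nr_commands = n_commands,
                .id = queue.id,
        };
        return ioctl(dev_fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue);
}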
+ */ +union kbase_ioctl_cs_tiler_heap_init { + struct { + __u32 chunk_size; + __u32 initial_chunks; + __u32 max_chunks; + __u16 target_in_flight; + __u8 group_id; + __u8 padding; + } in; + struct { + __u64 gpu_heap_va; + __u64 first_chunk_va; + } out; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ + _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) + +/** + * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap + * instance + * + * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. + */ +struct kbase_ioctl_cs_tiler_heap_term { + __u64 gpu_heap_va; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ + _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) + +/** + * union kbase_ioctl_cs_get_glb_iface - Request the global control block + * of CSF interface capabilities + * + * @in: Input parameters + * @in.max_group_num: The maximum number of groups to be read. Can be 0, in + * which case groups_ptr is unused. + * @in.max_total_stream _num: The maximum number of CSs to be read. Can be 0, in + * which case streams_ptr is unused. + * @in.groups_ptr: Pointer where to store all the group data (sequentially). + * @in.streams_ptr: Pointer where to store all the CS data (sequentially). + * @out: Output parameters + * @out.glb_version: Global interface version. + * @out.features: Bit mask of features (e.g. whether certain types of job + * can be suspended). + * @out.group_num: Number of CSGs supported. + * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 + * hold the size of firmware performance counter data + * and 15:0 hold the size of hardware performance counter + * data. + * @out.total_stream_num: Total number of CSs, summed across all groups. + * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum + * size of events. Bits 3:0 hold the offset update rate. 
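A sketch of initialising a chunked tiler heap through the union defined above; the sizes are illustrative, and the GPU VAs of the heap context and first chunk come back in the out part.

#include <stdint.h>
#include <sys/ioctl.h>

static int tiler_heap_example(int dev_fd, uint64_t *heap_ctx_va)
{
        union kbase_ioctl_cs_tiler_heap_init heap = { .in = {
                .chunk_size = 1u << 21,   /* 2 MiB chunks, illustrative */
                .initial_chunks = 5,
                .max_chunks = 200,
                .target_in_flight = 2,
        } };

        if (ioctl(dev_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &heap))
                return -1;
        *heap_ctx_va = heap.out.gpu_heap_va;
        return 0;
}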
+ * (csf >= 1.1.0) + * + */ +union kbase_ioctl_cs_get_glb_iface { + struct { + __u32 max_group_num; + __u32 max_total_stream_num; + __u64 groups_ptr; + __u64 streams_ptr; + } in; + struct { + __u32 glb_version; + __u32 features; + __u32 group_num; + __u32 prfcnt_size; + __u32 total_stream_num; + __u32 instr_features; + } out; +}; + +#define KBASE_IOCTL_CS_GET_GLB_IFACE \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) + +struct kbase_ioctl_cs_cpu_queue_info { + __u64 buffer; + __u64 size; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ + _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +/** + * struct kbase_ioctl_cs_event_memory_write - Write an event memory address + * @cpu_addr: Memory address to write + * @value: Value to write + * @padding: Currently unused, must be zero + */ +struct kbase_ioctl_cs_event_memory_write { + __u64 cpu_addr; + __u8 value; + __u8 padding[7]; +}; + +/** + * union kbase_ioctl_cs_event_memory_read - Read an event memory address + * @in: Input parameters + * @in.cpu_addr: Memory address to read + * @out: Output parameters + * @out.value: Value read + * @out.padding: Currently unused, must be zero + */ +union kbase_ioctl_cs_event_memory_read { + struct { + __u64 cpu_addr; + } in; + struct { + __u8 value; + __u8 padding[7]; + } out; +}; + +#endif /* MALI_UNIT_TEST */ + +#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ diff --git a/src/panfrost/csf_test/mali_kbase_ioctl.h b/src/panfrost/csf_test/mali_kbase_ioctl.h new file mode 100644 index 00000000000..fc81b71b46a --- /dev/null +++ b/src/panfrost/csf_test/mali_kbase_ioctl.h @@ -0,0 +1,854 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_IOCTL_H_ +#define _UAPI_KBASE_IOCTL_H_ + +#ifdef __cpluscplus +extern "C" { +#endif + +#include +#include + +#define KBASE_IOCTL_TYPE 0x80 + +/** + * struct kbase_ioctl_set_flags - Set kernel context creation flags + * + * @create_flags: Flags - see base_context_create_flags + */ +struct kbase_ioctl_set_flags { + __u32 create_flags; +}; + +#define KBASE_IOCTL_SET_FLAGS \ + _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) + +/** + * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel + * + * @buffer: Pointer to the buffer to store properties into + * @size: Size of the buffer + * @flags: Flags - must be zero for now + * + * The ioctl will return the number of bytes stored into @buffer or an error + * on failure (e.g. 
@size is too small). If @size is specified as 0 then no + * data will be written but the return value will be the number of bytes needed + * for all the properties. + * + * @flags may be used in the future to request a different format for the + * buffer. With @flags == 0 the following format is used. + * + * The buffer will be filled with pairs of values, a __u32 key identifying the + * property followed by the value. The size of the value is identified using + * the bottom bits of the key. The value then immediately followed the key and + * is tightly packed (there is no padding). All keys and values are + * little-endian. + * + * 00 = __u8 + * 01 = __u16 + * 10 = __u32 + * 11 = __u64 + */ +struct kbase_ioctl_get_gpuprops { + __u64 buffer; + __u32 size; + __u32 flags; +}; + +#define KBASE_IOCTL_GET_GPUPROPS \ + _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) + +/** + * union kbase_ioctl_mem_alloc - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @out: Output parameters + * @out.flags: Flags + * @out.gpu_va: The GPU virtual address which is allocated + */ +union kbase_ioctl_mem_alloc { + struct { + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u64 flags; + } in; + struct { + __u64 flags; + __u64 gpu_va; + } out; +}; + +#define KBASE_IOCTL_MEM_ALLOC \ + _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) + +/** + * struct kbase_ioctl_mem_query - Query properties of a GPU memory region + * @in: Input parameters + * @in.gpu_addr: A GPU address contained within the region + * @in.query: The type of query + * @out: Output parameters + * @out.value: The result of the query + * + * Use a %KBASE_MEM_QUERY_xxx flag as input for @query. + */ +union kbase_ioctl_mem_query { + struct { + __u64 gpu_addr; + __u64 query; + } in; + struct { + __u64 value; + } out; +}; + +#define KBASE_IOCTL_MEM_QUERY \ + _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) + +#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) +#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) +#define KBASE_MEM_QUERY_FLAGS ((__u64)3) + +/** + * struct kbase_ioctl_mem_free - Free a memory region + * @gpu_addr: Handle to the region to free + */ +struct kbase_ioctl_mem_free { + __u64 gpu_addr; +}; + +#define KBASE_IOCTL_MEM_FREE \ + _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @buffer_count: requested number of dumping buffers + * @fe_bm: counters selection bitmask (Front end) + * @shader_bm: counters selection bitmask (Shader) + * @tiler_bm: counters selection bitmask (Tiler) + * @mmu_l2_bm: counters selection bitmask (MMU_L2) + * + * A fd is returned from the ioctl if successful, or a negative value on error + */ +struct kbase_ioctl_hwcnt_reader_setup { + __u32 buffer_count; + __u32 fe_bm; + __u32 shader_bm; + __u32 tiler_bm; + __u32 mmu_l2_bm; +}; + +#define KBASE_IOCTL_HWCNT_READER_SETUP \ + _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) + +/** + * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. + * @data: Counter samples for the dummy model. + * @size: Size of the counter sample data. + * @padding: Padding. 
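Returning to the KBASE_IOCTL_GET_GPUPROPS encoding described a little earlier (a __u32 key whose bottom two bits give the value size, immediately followed by the packed little-endian value), here is a small decoder sketch. Treating the remaining key bits as the property ID matches how the rest of this patch consumes the buffer, but is stated here as an assumption.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Walk the packed key/value pairs returned by KBASE_IOCTL_GET_GPUPROPS.
 * Bounds checking is minimal; this is a sketch, not production code. */
static void gpuprops_dump(const uint8_t *buf, size_t len)
{
        size_t pos = 0;

        while (pos + 4 <= len) {
                uint32_t key;
                memcpy(&key, buf + pos, sizeof(key));
                pos += sizeof(key);

                size_t value_size = (size_t)1 << (key & 3); /* 1, 2, 4 or 8 bytes */
                uint64_t value = 0;
                if (pos + value_size > len)
                        break;
                memcpy(&value, buf + pos, value_size);      /* little-endian host assumed */
                pos += value_size;

                printf("property %u = %llu\n", key >> 2,
                       (unsigned long long)value);
        }
}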
+ */ +struct kbase_ioctl_hwcnt_values { + __u64 data; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_HWCNT_SET \ + _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) + +/** + * struct kbase_ioctl_disjoint_query - Query the disjoint counter + * @counter: A counter of disjoint events in the kernel + */ +struct kbase_ioctl_disjoint_query { + __u32 counter; +}; + +#define KBASE_IOCTL_DISJOINT_QUERY \ + _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) + +/** + * struct kbase_ioctl_get_ddk_version - Query the kernel version + * @version_buffer: Buffer to receive the kernel version string + * @size: Size of the buffer + * @padding: Padding + * + * The ioctl will return the number of bytes written into version_buffer + * (which includes a NULL byte) or a negative error code + * + * The ioctl request code has to be _IOW because the data in ioctl struct is + * being copied to the kernel, even though the kernel then writes out the + * version info to the buffer specified in the ioctl. + */ +struct kbase_ioctl_get_ddk_version { + __u64 version_buffer; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_GET_DDK_VERSION \ + _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) + +/** + * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 10.2--11.4) + * @va_pages: Number of VA pages to reserve for JIT + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_10_2 { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) + +/** + * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 11.5--11.19) + * @va_pages: Number of VA pages to reserve for JIT + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_11_5 { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) + +/** + * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory + * allocator + * @va_pages: Number of GPU virtual address pages to reserve for just-in-time + * memory allocations + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * @phys_pages: Maximum number of physical pages to allocate just-in-time + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. 
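A sketch of the version query documented a little above; the buffer size is arbitrary, and the return-value convention (number of bytes written, including the NUL byte) is taken from that comment.

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>

static int print_ddk_version(int dev_fd)
{
        char buf[64];
        struct kbase_ioctl_get_ddk_version ver = {
                .version_buffer = (__u64)(uintptr_t)buf,
                .size = sizeof(buf),
        };
        int written = ioctl(dev_fd, KBASE_IOCTL_GET_DDK_VERSION, &ver);

        return written < 0 ? written : puts(buf);
}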
+ */ +struct kbase_ioctl_mem_jit_init { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; + __u64 phys_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) + +/** + * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory + * + * @handle: GPU memory handle (GPU VA) + * @user_addr: The address where it is mapped in user space + * @size: The number of bytes to synchronise + * @type: The direction to synchronise: 0 is sync to memory (clean), + * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_mem_sync { + __u64 handle; + __u64 user_addr; + __u64 size; + __u8 type; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_MEM_SYNC \ + _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) + +/** + * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer + * + * @in: Input parameters + * @in.gpu_addr: The GPU address of the memory region + * @in.cpu_addr: The CPU address to locate + * @in.size: A size in bytes to validate is contained within the region + * @out: Output parameters + * @out.offset: The offset from the start of the memory region to @cpu_addr + */ +union kbase_ioctl_mem_find_cpu_offset { + struct { + __u64 gpu_addr; + __u64 cpu_addr; + __u64 size; + } in; + struct { + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) + +/** + * struct kbase_ioctl_get_context_id - Get the kernel context ID + * + * @id: The kernel context ID + */ +struct kbase_ioctl_get_context_id { + __u32 id; +}; + +#define KBASE_IOCTL_GET_CONTEXT_ID \ + _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) + +/** + * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd + * + * @flags: Flags + * + * The ioctl returns a file descriptor when successful + */ +struct kbase_ioctl_tlstream_acquire { + __u32 flags; +}; + +#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ + _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) + +#define KBASE_IOCTL_TLSTREAM_FLUSH \ + _IO(KBASE_IOCTL_TYPE, 19) + +/** + * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region + * + * @gpu_addr: The memory region to modify + * @pages: The number of physical pages that should be present + * + * The ioctl may return on the following error codes or 0 for success: + * -ENOMEM: Out of memory + * -EINVAL: Invalid arguments + */ +struct kbase_ioctl_mem_commit { + __u64 gpu_addr; + __u64 pages; +}; + +#define KBASE_IOCTL_MEM_COMMIT \ + _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) + +/** + * union kbase_ioctl_mem_alias - Create an alias of memory regions + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.stride: Bytes between start of each memory region + * @in.nents: The number of regions to pack together into the alias + * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_alias { + struct { + __u64 flags; + __u64 stride; + __u64 nents; + __u64 aliasing_info; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_ALIAS \ + _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) + +/** + * union 
kbase_ioctl_mem_import - Import memory for use by the GPU + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.phandle: Handle to the external memory + * @in.type: Type of external memory, see base_mem_import_type + * @in.padding: Amount of extra VA pages to append to the imported buffer + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_import { + struct { + __u64 flags; + __u64 phandle; + __u32 type; + __u32 padding; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_IMPORT \ + _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) + +/** + * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region + * @gpu_va: The GPU region to modify + * @flags: The new flags to set + * @mask: Mask of the flags to modify + */ +struct kbase_ioctl_mem_flags_change { + __u64 gpu_va; + __u64 flags; + __u64 mask; +}; + +#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ + _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) + +/** + * struct kbase_ioctl_stream_create - Create a synchronisation stream + * @name: A name to identify this stream. Must be NULL-terminated. + * + * Note that this is also called a "timeline", but is named stream to avoid + * confusion with other uses of the word. + * + * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. + * + * The ioctl returns a file descriptor. + */ +struct kbase_ioctl_stream_create { + char name[32]; +}; + +#define KBASE_IOCTL_STREAM_CREATE \ + _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) + +/** + * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence + * @fd: The file descriptor to validate + */ +struct kbase_ioctl_fence_validate { + int fd; +}; + +#define KBASE_IOCTL_FENCE_VALIDATE \ + _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) + +/** + * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel + * @buffer: Pointer to the information + * @len: Length + * @padding: Padding + * + * The data provided is accessible through a debugfs file + */ +struct kbase_ioctl_mem_profile_add { + __u64 buffer; + __u32 len; + __u32 padding; +}; + +#define KBASE_IOCTL_MEM_PROFILE_ADD \ + _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) + +/** + * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to map + */ +struct kbase_ioctl_sticky_resource_map { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ + _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) + +/** + * struct kbase_ioctl_sticky_resource_map - Unmap a resource mapped which was + * previously permanently mapped + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to unmap + */ +struct kbase_ioctl_sticky_resource_unmap { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ + _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) + +/** + * union kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of + * the GPU memory region for + * the given gpu address and + * the offset of that address + * into the region + * @in: Input parameters + * @in.gpu_addr: GPU virtual address + * @in.size: Size in bytes within the region 
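A sketch of a dma-buf import through the union above. On the kernels this patch targets, a UMM import passes the handle by pointer to an int file descriptor; that convention is not spelled out in this header, so treat it as an assumption of the sketch, and the BASE_MEM_* flags are left to the caller.

#include <stdint.h>
#include <sys/ioctl.h>

static int import_dmabuf(int dev_fd, int dmabuf_fd, uint64_t flags,
                         uint64_t *gpu_va_out)
{
        union kbase_ioctl_mem_import import = { .in = {
                .flags = flags,                    /* BASE_MEM_* flags */
                .phandle = (uintptr_t)&dmabuf_fd,  /* pointer to the fd (assumed) */
                .type = BASE_MEM_IMPORT_TYPE_UMM,
        } };

        if (ioctl(dev_fd, KBASE_IOCTL_MEM_IMPORT, &import))
                return -1;
        *gpu_va_out = import.out.gpu_va;
        return 0;
}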
+ * @out: Output parameters + * @out.start: Address of the beginning of the memory region enclosing @gpu_addr + * for the length of @offset bytes + * @out.offset: The offset from the start of the memory region to @gpu_addr + */ +union kbase_ioctl_mem_find_gpu_start_and_offset { + struct { + __u64 gpu_addr; + __u64 size; + } in; + struct { + __u64 start; + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) + +#define KBASE_IOCTL_CINSTR_GWT_START \ + _IO(KBASE_IOCTL_TYPE, 33) + +#define KBASE_IOCTL_CINSTR_GWT_STOP \ + _IO(KBASE_IOCTL_TYPE, 34) + +/** + * union kbase_ioctl_gwt_dump - Used to collect all GPU write fault addresses. + * @in: Input parameters + * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. + * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) + * @in.len: Number of addresses the buffers can hold. + * @in.padding: padding + * @out: Output parameters + * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. + * @out.more_data_available: Status indicating if more addresses are available. + * @out.padding: padding + * + * This structure is used when performing a call to dump GPU write fault + * addresses. + */ +union kbase_ioctl_cinstr_gwt_dump { + struct { + __u64 addr_buffer; + __u64 size_buffer; + __u32 len; + __u32 padding; + + } in; + struct { + __u32 no_of_addr_collected; + __u8 more_data_available; + __u8 padding[27]; + } out; +}; + +#define KBASE_IOCTL_CINSTR_GWT_DUMP \ + _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) + +/** + * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone + * + * @va_pages: Number of VA pages to reserve for EXEC_VA + */ +struct kbase_ioctl_mem_exec_init { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_EXEC_INIT \ + _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) + +/** + * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of + * cpu/gpu time (counter values) + * @in: Input parameters + * @in.request_flags: Bit-flags indicating the requested types. + * @in.paddings: Unused, size alignment matching the out. + * @out: Output parameters + * @out.sec: Integer field of the monotonic time, unit in seconds. + * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. + * @out.padding: Unused, for __u64 alignment + * @out.timestamp: System wide timestamp (counter) value. + * @out.cycle_counter: GPU cycle counter value. + */ +union kbase_ioctl_get_cpu_gpu_timeinfo { + struct { + __u32 request_flags; + __u32 paddings[7]; + } in; + struct { + __u64 sec; + __u32 nsec; + __u32 padding; + __u64 timestamp; + __u64 cycle_counter; + } out; +}; + +#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ + _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) + +/** + * struct kbase_ioctl_context_priority_check - Check the max possible priority + * @priority: Input priority & output priority + */ + +struct kbase_ioctl_context_priority_check { + __u8 priority; +}; + +#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) + +/** + * struct kbase_ioctl_set_limited_core_count - Set the limited core count. 
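A sketch tying the BASE_TIMEINFO_* request flags from earlier in the patch to the time-info union above; which out fields are valid mirrors the flags that were requested.

#include <stdint.h>
#include <sys/ioctl.h>

static int sample_gpu_time(int dev_fd, uint64_t *cycles_out)
{
        union kbase_ioctl_get_cpu_gpu_timeinfo info = { .in = {
                .request_flags = BASE_TIMEINFO_MONOTONIC_FLAG |
                                 BASE_TIMEINFO_CYCLE_COUNTER_FLAG,
        } };

        if (ioctl(dev_fd, KBASE_IOCTL_GET_CPU_GPU_TIMEINFO, &info))
                return -1;
        /* info.out.sec / info.out.nsec hold the monotonic time */
        *cycles_out = info.out.cycle_counter;
        return 0;
}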
+ * + * @max_core_count: Maximum core count + */ +struct kbase_ioctl_set_limited_core_count { + __u8 max_core_count; +}; + +#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ + _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) + +/** + * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter + * information + * @info_item_size: Performance counter item size in bytes. + * @info_item_count: Performance counter item count in the info_list_ptr. + * @info_list_ptr: Performance counter item list pointer which points to a + * list with info_item_count of items. + * + * On success: returns info_item_size and info_item_count if info_list_ptr is + * NULL, returns performance counter information if info_list_ptr is not NULL. + * On error: returns a negative error code. + */ +struct kbase_ioctl_kinstr_prfcnt_enum_info { + __u32 info_item_size; + __u32 info_item_count; + __u64 info_list_ptr; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ + _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @in: input parameters. + * @in.request_item_count: Number of requests in the requests array. + * @in.request_item_size: Size in bytes of each request in the requests array. + * @in.requests_ptr: Pointer to the requests array. + * @out: output parameters. + * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for + * each sample. + * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap + * for reading performance counter samples. + * + * A fd is returned from the ioctl if successful, or a negative value on error. + */ +union kbase_ioctl_kinstr_prfcnt_setup { + struct { + __u32 request_item_count; + __u32 request_item_size; + __u64 requests_ptr; + } in; + struct { + __u32 prfcnt_metadata_item_size; + __u32 prfcnt_mmap_size_bytes; + } out; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ + _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) + + +/** + * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes + * @bytes_collected: number of bytes read by user + * @bytes_generated: number of bytes generated by tracepoints + */ +struct kbase_ioctl_tlstream_stats { + __u32 bytes_collected; + __u32 bytes_generated; +}; + +#define KBASE_IOCTL_TLSTREAM_STATS \ + _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) + +#endif /* MALI_UNIT_TEST */ + +/* Customer extension range */ +#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) + +/* If the integration needs extra ioctl add them there + * like this: + * + * struct my_ioctl_args { + * .... 
+ * } + * + * #define KBASE_IOCTL_MY_IOCTL \ + * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) + */ + + +/********************************** + * Definitions for GPU properties * + **********************************/ +#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) +#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) +#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) +#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) + +#define KBASE_GPUPROP_PRODUCT_ID 1 +#define KBASE_GPUPROP_VERSION_STATUS 2 +#define KBASE_GPUPROP_MINOR_REVISION 3 +#define KBASE_GPUPROP_MAJOR_REVISION 4 +/* 5 previously used for GPU speed */ +#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 +/* 7 previously used for minimum GPU speed */ +#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 +#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 +#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 +#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 +#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 + +#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 +#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 +#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 + +#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 +#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 + +#define KBASE_GPUPROP_MAX_THREADS 18 +#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 +#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 +#define KBASE_GPUPROP_MAX_REGISTERS 21 +#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 +#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 +#define KBASE_GPUPROP_IMPL_TECH 24 + +#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 +#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 +#define KBASE_GPUPROP_RAW_L2_PRESENT 27 +#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 +#define KBASE_GPUPROP_RAW_L2_FEATURES 29 +#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 +#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 +#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 +#define KBASE_GPUPROP_RAW_AS_PRESENT 33 +#define KBASE_GPUPROP_RAW_JS_PRESENT 34 +#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 +#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 +#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 +#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 +#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 +#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 +#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 +#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 +#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 +#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 +#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 +#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 +#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 +#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 +#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 +#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 +#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 +#define KBASE_GPUPROP_RAW_GPU_ID 55 +#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 +#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 +#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 +#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 +#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 + +#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 +#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 +#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 +#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 +#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 +#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 +#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 +#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 +#define KBASE_GPUPROP_COHERENCY_GROUP_5 69 +#define KBASE_GPUPROP_COHERENCY_GROUP_6 70 +#define 
KBASE_GPUPROP_COHERENCY_GROUP_7 71 +#define KBASE_GPUPROP_COHERENCY_GROUP_8 72 +#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 +#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 +#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 +#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 +#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 +#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 +#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 + +#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 + +#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 + +#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 +#define KBASE_GPUPROP_TLS_ALLOC 84 +#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 +#ifdef __cpluscplus +} +#endif + +#endif /* _UAPI_KBASE_IOCTL_H_ */ diff --git a/src/panfrost/csf_test/test.c b/src/panfrost/csf_test/test.c new file mode 100644 index 00000000000..cb9ff398314 --- /dev/null +++ b/src/panfrost/csf_test/test.c @@ -0,0 +1,1903 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/macros.h" + +#include "mali_kbase_csf_ioctl.h" +#include "mali_kbase_ioctl.h" +#include "mali_base_kernel.h" +#include "mali_base_csf_kernel.h" +#include "mali_gpu_csf_registers.h" + +#define PAN_ARCH 10 +#include "genxml/gen_macros.h" + +#include "wrap.h" +#include "decode.h" + +#include "pan_shader.h" +#include "compiler/nir/nir_builder.h" +#include "bifrost/valhall/disassemble.h" + +#define CS_EVENT_REGISTER 0x5A + +static bool pr = true; +static bool colour_term = true; + +static void +dump_start(FILE *f) +{ + if (colour_term) + fprintf(f, "\x1b[90m"); +} + +static void +dump_end(FILE *f) +{ + if (colour_term) + fprintf(f, "\x1b[39m"); +} + +/* TODO: Use KBASE_IOCTL_MEM_SYNC for 32-bit systems */ +static void +cache_clean(volatile void *addr) +{ +#ifdef __aarch64__ + __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); +#endif +} + +static void +cache_invalidate(volatile void *addr) +{ +#ifdef __aarch64__ + __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); +#endif +} + +static void +cache_barrier(void) +{ +#ifdef __ARM_ARCH + __asm__ volatile ("dsb sy" ::: "memory"); +#endif +} + +static void +memory_barrier(void) +{ +#ifdef __ARM_ARCH + __asm__ volatile ("dmb sy" ::: "memory"); +#endif +} + +typedef void (*cacheline_op)(volatile void *addr); + +#define CACHELINE_SIZE 64 + +static void +cacheline_op_range(volatile void *start, unsigned length, cacheline_op op) +{ + volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); + volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); + for (; ptr < end; ptr += CACHELINE_SIZE) + op(ptr); +} + +static void +cache_clean_range(volatile void *start, unsigned length) +{ + cacheline_op_range(start, length, cache_clean); +} + +static void +cache_invalidate_range(volatile void *start, unsigned length) +{ + cacheline_op_range(start, length, cache_invalidate); +} + +struct state; +struct test; + +typedef bool (* section)(struct state *s, struct test *t); + +#define CS_QUEUE_COUNT 4 /* compute / vertex / fragment / other */ +#define CS_QUEUE_SIZE 65536 + +struct state { + int page_size; + int argc; + char **argv; + + int mali_fd; + int tl_fd; + void *tracking_region; + void *csf_user_reg; + + uint8_t *gpuprops; + unsigned gpuprops_size; + uint32_t gpu_id; + + struct { + struct panfrost_ptr normal, exec, coherent, cached, event, ev2; + } allocations; + + uint64_t tiler_heap_va; + uint64_t tiler_heap_header; + + uint8_t csg_handle; + uint32_t csg_uid; + + struct panfrost_ptr cs_mem[CS_QUEUE_COUNT]; + void *cs_user_io[CS_QUEUE_COUNT]; + unsigned cs_last_submit[CS_QUEUE_COUNT]; + struct pan_command_stream cs[CS_QUEUE_COUNT]; + + unsigned shader_alloc_offset; + mali_ptr compute_shader; +}; + +struct test { + section part; + section cleanup; + const char *label; + + struct test *subtests; + unsigned sub_length; + + /* for allocation tests */ + unsigned offset; + unsigned flags; + + bool add; + bool invalid; + bool blit; + bool vertex; +}; + +/* See STATE and ALLOC macros below */ +#define DEREF_STATE(s, offset) ((void*) s + offset) + +static uint64_t +pan_get_gpuprop(struct state *s, int name) +{ + int i = 0; + uint64_t x = 0; + while (i < s->gpuprops_size) { + x = 0; + memcpy(&x, s->gpuprops + i, 4); + i += 4; + + int size = 1 << (x & 3); + int this_name = x >> 2; + + x = 0; + memcpy(&x, s->gpuprops + i, size); + i += size; + + if 
(this_name == name) + return x; + } + + fprintf(stderr, "Unknown prop %i\n", name); + return 0; +} + +static bool +open_kbase(struct state *s, struct test *t) +{ + s->mali_fd = open("/dev/mali0", O_RDWR); + if (s->mali_fd != -1) + return true; + + perror("open(\"/dev/mali0\")"); + return false; +} + +static bool +close_kbase(struct state *s, struct test *t) +{ + if (getenv("TEST_CHECK_LEAKS")) { + int pid = getpid(); + char cmd_buffer[64] = {0}; + sprintf(cmd_buffer, "grep /dev/mali /proc/%i/maps", pid); + system(cmd_buffer); + sprintf(cmd_buffer, "ls -l /proc/%i/fd", pid); + system(cmd_buffer); + } + + if (s->mali_fd > 0) + return close(s->mali_fd) == 0; + return true; +} + +static bool +get_version(struct state *s, struct test *t) +{ + struct kbase_ioctl_version_check ver = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_VERSION_CHECK, &ver); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_VERSION_CHECK)"); + return false; + } + + if (pr) + printf("Major %i Minor %i: ", ver.major, ver.minor); + return true; +} + +static bool +set_flags(struct state *s, struct test *t) +{ + struct kbase_ioctl_set_flags flags = { + .create_flags = 0 + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_SET_FLAGS, &flags); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); + return false; + } + return true; +} + +static bool +mmap_tracking(struct state *s, struct test *t) +{ + s->tracking_region = mmap(NULL, s->page_size, PROT_NONE, + MAP_SHARED, s->mali_fd, + BASE_MEM_MAP_TRACKING_HANDLE); + + if (s->tracking_region == MAP_FAILED) { + perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); + s->tracking_region = NULL; + return false; + } + return true; +} + +static bool +munmap_tracking(struct state *s, struct test *t) +{ + if (s->tracking_region) + return munmap(s->tracking_region, s->page_size) == 0; + return true; +} + +static bool +get_gpuprops(struct state *s, struct test *t) +{ + struct kbase_ioctl_get_gpuprops props = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); + return false; + } else if (!ret) { + fprintf(stderr, "GET_GPUPROPS returned zero size\n"); + return false; + } + + s->gpuprops_size = ret; + s->gpuprops = calloc(s->gpuprops_size, 1); + + props.size = s->gpuprops_size; + props.buffer = (uint64_t)(uintptr_t) s->gpuprops; + + ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); + return false; + } + + return true; +} + +static bool +free_gpuprops(struct state *s, struct test *t) +{ + free(s->gpuprops); + return true; +} + +static bool +get_gpu_id(struct state *s, struct test *t) +{ + uint64_t gpu_id = pan_get_gpuprop(s, KBASE_GPUPROP_PRODUCT_ID); + if (!gpu_id) + return false; + s->gpu_id = gpu_id; + + uint16_t maj = gpu_id >> 12; + uint16_t min = (gpu_id >> 8) & 0xf; + uint16_t rev = (gpu_id >> 4) & 0xf; + + uint16_t product = gpu_id & 0xf; + uint16_t prod = product | ((maj & 1) << 4); + + const char *names[] = { + [1] = "TDUX", + [2] = "G710", + [3] = "G510", + [4] = "G310", + [7] = "G610", + [16 + 2] = "G715", /* TODO: Immortalis instead of Mali? */ + [16 + 3] = "G615", + }; + const char *name = (prod < ARRAY_SIZE(names)) ? 
names[prod] : NULL; + if (!name) + name = "unknown"; + + if (pr) + printf("v%i.%i.%i Mali-%s (%i): ", maj, min, rev, name, product); + + if (maj < 10) { + printf("not v10 or later: "); + return false; + } + + return true; +} + +static bool +get_coherency_mode(struct state *s, struct test *t) +{ + uint64_t mode = pan_get_gpuprop(s, KBASE_GPUPROP_RAW_COHERENCY_MODE); + + const char *modes[] = { + [0] = "ACE-Lite", + [1] = "ACE", + [31] = "None", + }; + const char *name = (mode < ARRAY_SIZE(modes)) ? modes[mode] : NULL; + if (!name) + name = "Unknown"; + + if (pr) + printf("0x%"PRIx64" (%s): ", mode, name); + return true; +} + +static bool +get_csf_caps(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_get_glb_iface iface = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(0))"); + return false; + } + + int ver_maj = iface.out.glb_version >> 24; + int ver_min = (iface.out.glb_version >> 16) & 0xff; + int ver_rev = iface.out.glb_version & 0xffff; + + if (pr) + printf("v%i.%i.%i: feature mask 0x%x, %i groups, %i total: ", + ver_maj, ver_min, ver_rev, iface.out.features, + iface.out.group_num, iface.out.total_stream_num); + + unsigned group_num = iface.out.group_num; + unsigned stream_num = iface.out.total_stream_num; + + struct basep_cs_group_control *group_data = + calloc(group_num, sizeof(*group_data)); + + struct basep_cs_stream_control *stream_data = + calloc(stream_num, sizeof(*stream_data)); + + iface = (union kbase_ioctl_cs_get_glb_iface) { + .in = { + .max_group_num = group_num, + .max_total_stream_num = stream_num, + .groups_ptr = (uintptr_t) group_data, + .streams_ptr = (uintptr_t) stream_data, + } + }; + + ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(size))"); + + free(group_data); + free(stream_data); + + return false; + } + + unsigned print_groups = pr ? group_num : 0; + unsigned print_streams = pr ? 
stream_num : 0; + + for (unsigned i = 0; i < print_groups; ++i) { + if (i && !memcmp(group_data + i, group_data + i - 1, sizeof(*group_data))) + continue; + + fprintf(stderr, "Group %i-: feature mask 0x%x, %i streams\n", + i, group_data[i].features, group_data[i].stream_num); + } + + for (unsigned i = 0; i < print_streams; ++i) { + if (i && !memcmp(stream_data + i, stream_data + i - 1, sizeof(*stream_data))) + continue; + + unsigned reg = stream_data[i].features & 0xff; + unsigned score = (stream_data[i].features >> 8) & 0xff; + unsigned feat = stream_data[i].features >> 16; + + fprintf(stderr, "Stream %i-: 0x%x work registers, %i scoreboards, iterator mask: 0x%x\n", + i, reg, score, feat); + } + + free(group_data); + free(stream_data); + + return true; +} + +static bool +mmap_user_reg(struct state *s, struct test *t) +{ + s->csf_user_reg = mmap(NULL, s->page_size, PROT_READ, + MAP_SHARED, s->mali_fd, + BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); + + if (s->csf_user_reg == MAP_FAILED) { + perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); + s->csf_user_reg = NULL; + return false; + } + return true; +} + +static bool +munmap_user_reg(struct state *s, struct test *t) +{ + if (s->csf_user_reg) + return munmap(s->csf_user_reg, s->page_size) == 0; + return true; +} + +static bool +init_mem_exec(struct state *s, struct test *t) +{ + struct kbase_ioctl_mem_exec_init init = { + .va_pages = 0x100000, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); + return false; + } + return true; +} + +static bool +init_mem_jit(struct state *s, struct test *t) +{ + struct kbase_ioctl_mem_jit_init init = { + .va_pages = 1 << 25, + .max_allocations = 255, + .phys_pages = 1 << 25, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_JIT_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); + return false; + } + return true; +} + +static bool +stream_create(struct state *s, struct test *t) +{ + struct kbase_ioctl_stream_create stream = { + .name = "stream" + }; + + s->tl_fd = ioctl(s->mali_fd, KBASE_IOCTL_STREAM_CREATE, &stream); + + if (s->tl_fd == -1) { + perror("ioctl(KBASE_IOCTL_STREAM_CREATE)"); + return false; + } + return true; + +} + +static bool +stream_destroy(struct state *s, struct test *t) +{ + if (s->tl_fd > 0) + return close(s->tl_fd) == 0; + return true; +} + +static bool +tiler_heap_create(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_tiler_heap_init init = { + .in = { + .chunk_size = 1 << 21, + .initial_chunks = 5, + .max_chunks = 200, + .target_in_flight = 65535, + } + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); + return false; + } + + s->tiler_heap_va = init.out.gpu_heap_va; + s->tiler_heap_header = init.out.first_chunk_va; + printf("heap va: %"PRIx64", heap header: %"PRIx64"\n", + s->tiler_heap_va, s->tiler_heap_header); + + return true; +} + +static bool +tiler_heap_term(struct state *s, struct test *t) +{ + if (!s->tiler_heap_va) + return true; + + struct kbase_ioctl_cs_tiler_heap_term term = { + .gpu_heap_va = s->tiler_heap_va + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); + return false; + } + return true; +} + +static bool +cs_group_create(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_queue_group_create_1_6 create = { + .in = { + /* Mali *still* only 
supports a single tiler unit */ + .tiler_mask = 1, + .fragment_mask = ~0ULL, + .compute_mask = ~0ULL, + + .cs_min = CS_QUEUE_COUNT, + + .priority = 1, + .tiler_max = 1, + .fragment_max = 64, + .compute_max = 64, + } + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); + return false; + } + + s->csg_handle = create.out.group_handle; + s->csg_uid = create.out.group_uid; + + if (pr) + printf("CSG handle: %i UID: %i: ", s->csg_handle, s->csg_uid); + + /* Should be at least 1 */ + if (!s->csg_uid) + abort(); + + return true; +} + +static bool +cs_group_term(struct state *s, struct test *t) +{ + if (!s->csg_uid) + return true; + + struct kbase_ioctl_cs_queue_group_term term = { + .group_handle = s->csg_handle + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); + return false; + } + return true; +} + +static bool +decode_init(struct state *s, struct test *t) +{ + pandecode_initialize(true); + return true; +} + +static bool +decode_close(struct state *s, struct test *t) +{ + pandecode_close(); + return true; +} + +static struct panfrost_ptr +alloc_ioctl(struct state *s, union kbase_ioctl_mem_alloc *a) +{ + struct panfrost_ptr p = {0}; + + uint64_t va_pages = a->in.va_pages; + uint64_t flags = a->in.flags; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_ALLOC, a); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); + return p; + } + + if ((flags & BASE_MEM_SAME_VA) && + (!(a->out.flags & BASE_MEM_SAME_VA) || + a->out.gpu_va != 0x41000)) { + + fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", + (uint64_t) a->out.flags, (uint64_t) a->out.gpu_va); + return p; + } + + void *ptr = mmap(NULL, s->page_size * va_pages, + PROT_READ | PROT_WRITE, MAP_SHARED, + s->mali_fd, a->out.gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU BO)"); + return p; + } + + uint64_t gpu_va = (a->out.flags & BASE_MEM_SAME_VA) ? 
+ (uintptr_t) ptr : a->out.gpu_va; + + pandecode_inject_mmap(gpu_va, ptr, s->page_size * va_pages, NULL); + + p.cpu = ptr; + p.gpu = gpu_va; + + memset(p.cpu, 0, s->page_size * va_pages); + + return p; +} + +static struct panfrost_ptr +alloc_mem(struct state *s, uint64_t size, uint64_t flags) +{ + unsigned pages = size / s->page_size; + + union kbase_ioctl_mem_alloc a = { + .in = { + .va_pages = pages, + .commit_pages = pages, + .extension = 0, + .flags = flags, + } + }; + + return alloc_ioctl(s, &a); +} + +static void +alloc_redzone(struct state *s, struct panfrost_ptr p, uint64_t alloc_size) +{ + mmap(p.cpu - s->page_size, 1, + PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, + -1, 0); + + mmap(p.cpu + alloc_size, 1, + PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, + -1, 0); +} + +static bool +alloc(struct state *s, struct test *t) +{ + struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset); + + *ptr = alloc_mem(s, s->page_size, t->flags); + + volatile int *p = (volatile int *) ptr->cpu; + *p = 0x12345; + if (*p != 0x12345) { + printf("Error reading from allocated memory at %p\n", p); + return false; + } + *p = 0; + cache_clean(p); + + return true; +} + +static bool +dealloc(struct state *s, struct test *t) +{ + struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset); + + if (ptr->cpu) + return munmap(ptr->cpu, s->page_size) == 0; + return true; +} + +static bool +cs_queue_create(struct state *s, struct test *t) +{ + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + + /* Read/write from CPU/GPU, nothing special + * like coherency */ + s->cs_mem[i] = alloc_mem(s, CS_QUEUE_SIZE, 0x200f); + s->cs[i].ptr = s->cs_mem[i].cpu; + + if (!s->cs_mem[i].cpu) + return false; + } + + return true; +} + +static bool +cs_queue_free(struct state *s, struct test *t) +{ + bool pass = true; + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + if (s->cs_mem[i].cpu && munmap(s->cs_mem[i].cpu, CS_QUEUE_SIZE)) + pass = false; + } + return pass; +} + +static bool +cs_queue_register(struct state *s, struct test *t) +{ + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + struct kbase_ioctl_cs_queue_register reg = { + .buffer_gpu_addr = s->cs_mem[i].gpu, + .buffer_size = CS_QUEUE_SIZE, + .priority = 1, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_REGISTER, ®); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)"); + return false; + } + + union kbase_ioctl_cs_queue_bind bind = { + .in = { + .buffer_gpu_addr = s->cs_mem[i].gpu, + .group_handle = s->csg_handle, + .csi_index = i, + } + }; + + ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)"); + } + + s->cs_user_io[i] = + mmap(NULL, + s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES, + PROT_READ | PROT_WRITE, MAP_SHARED, + s->mali_fd, bind.out.mmap_handle); + + if (s->cs_user_io[i] == MAP_FAILED) { + perror("mmap(CS USER IO)"); + s->cs_user_io[i] = NULL; + return false; + } + } + return true; +} + +static bool +cs_queue_term(struct state *s, struct test *t) +{ + bool pass = true; + + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + if (s->cs_user_io[i] && + munmap(s->cs_user_io[i], + s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES)) + pass = false; + + struct kbase_ioctl_cs_queue_terminate term = { + .buffer_gpu_addr = s->cs_mem[i].gpu, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_TERMINATE, + &term); + + if (ret == -1) + pass = false; + } + return pass; +} + +#define CS_RING_DOORBELL(s, i) \ + *((uint32_t *)(s->cs_user_io[i])) = 1 + 
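For orientation, the doorbell macro above and the two register macros that follow it hard-code the layout of the per-queue user I/O mapping obtained from KBASE_IOCTL_CS_QUEUE_BIND in cs_queue_register(). A minimal sketch of that assumed layout, restating the offsets the macros use (the page roles are an interpretation of this test code, not confirmed elsewhere in the patch):

/* Sketch only: the three pages mapped per queue, matching
 * CS_RING_DOORBELL (page 0), CS_WRITE_REGISTER (page 1) and
 * CS_READ_REGISTER (page 2). BASEP_QUEUE_NR_MMAP_USER_PAGES is assumed
 * to be 3, as in cs_queue_register()/cs_queue_term(). */
enum cs_user_io_page {
   CS_USER_IO_DOORBELL = 0, /* write 1 here to ring the queue doorbell  */
   CS_USER_IO_INPUT    = 1, /* user-written registers such as CS_INSERT */
   CS_USER_IO_OUTPUT   = 2, /* firmware-updated registers such as
                             * CS_EXTRACT and CS_ACTIVE                 */
};

/* Equivalent of CS_WRITE_REGISTER(s, i, r, v) under that layout: */
static inline void
cs_user_write(struct state *s, unsigned i, unsigned r, uint64_t v)
{
   *(volatile uint64_t *)((uint8_t *)s->cs_user_io[i] +
                          s->page_size * CS_USER_IO_INPUT + r) = v;
}
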
+#define CS_READ_REGISTER(s, i, r) \ + *((uint64_t *)(s->cs_user_io[i] + s->page_size * 2 + r)) + +#define CS_WRITE_REGISTER(s, i, r, v) \ + *((uint64_t *)(s->cs_user_io[i] + s->page_size + r)) = v + +static void +submit_cs(struct state *s, unsigned i) +{ + uintptr_t p = (uintptr_t) s->cs[i].ptr; + unsigned pad = (-p) & 63; + memset(s->cs[i].ptr, 0, pad); + + unsigned last_offset = s->cs_last_submit[i]; + + unsigned insert_offset = p + pad - (uintptr_t) s->cs_mem[i].cpu; + insert_offset %= CS_QUEUE_SIZE; + + for (unsigned o = last_offset; o != insert_offset; + o = (o + 64) % CS_QUEUE_SIZE) + cache_clean(s->cs_mem[i].cpu + o); + + // TODO: Handle wraparound + // TODO: Provide a persistent buffer for pandecode to use? + if (pr) { + dump_start(stderr); + pandecode_cs(s->cs_mem[i].gpu + last_offset, + insert_offset - last_offset, s->gpu_id); + dump_end(stderr); + } + + cache_barrier(); + + CS_WRITE_REGISTER(s, i, CS_INSERT, insert_offset); + s->cs[i].ptr = s->cs_mem[i].cpu + insert_offset; + + memory_barrier(); + CS_RING_DOORBELL(s, i); + memory_barrier(); + + s->cs_last_submit[i] = insert_offset; +} + +/* Returns true if there was a timeout */ +static bool +wait_event(struct state *s, unsigned timeout_ms) +{ + struct pollfd fd = { + .fd = s->mali_fd, + .events = POLLIN, + }; + + int ret = poll(&fd, 1, timeout_ms); + + if (ret == -1) { + perror("poll(mali_fd)"); + return true; + } + + /* Timeout */ + if (ret == 0) + return true; + + struct base_csf_notification event; + ret = read(s->mali_fd, &event, sizeof(event)); + + if (ret == -1) { + perror("read(mali_fd)"); + return true; + } + + if (ret != sizeof(event)) { + fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", + ret, (int) sizeof(event)); + return false; + } + + switch (event.type) { + case BASE_CSF_NOTIFICATION_EVENT: + fprintf(stderr, "Notification event!\n"); + return false; + + case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: + break; + + case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: + fprintf(stderr, "No event from mali_fd!\n"); + return false; + + default: + fprintf(stderr, "Unknown event type!\n"); + return false; + } + + struct base_gpu_queue_group_error e = event.payload.csg_error.error; + + switch (e.error_type) { + case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue group error: status 0x%x " + "sideband 0x%"PRIx64"\n", + e.payload.fatal_group.status, + (uint64_t) e.payload.fatal_group.sideband); + break; + } + case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { + unsigned queue = e.payload.fatal_queue.csi_index; + + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue %i error: status 0x%x " + "sideband 0x%"PRIx64":", + queue, e.payload.fatal_queue.status, + (uint64_t) e.payload.fatal_queue.sideband); + + unsigned e = CS_READ_REGISTER(s, queue, CS_EXTRACT); + pandecode_cs(s->cs_mem[queue].gpu + e, 8, s->gpu_id); + + break; + } + + case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: + fprintf(stderr, "Command stream timeout!\n"); + break; + case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: + fprintf(stderr, "Command stream OOM!\n"); + break; + default: + fprintf(stderr, "Unknown error type!\n"); + } + + return false; +} + +static bool +kick_queue(struct state *s, unsigned i) +{ + struct kbase_ioctl_cs_queue_kick kick = { + .buffer_gpu_addr = s->cs_mem[i].gpu + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); + return false; + } + + return true; +} + 
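Taken together, submit_cs(), kick_queue() and wait_cs() (defined just below) form the whole submission path the tests rely on. A hypothetical minimal caller, assuming the queues have already been created, registered and started as the tests further down do; the function name and the choice of scratch register 0x48 are illustrative only:

static bool wait_cs(struct state *s, unsigned i); /* defined below */

/* Sketch only: emit one MOV into queue 0, publish it, and wait for the
 * firmware to consume it. */
static bool
example_run_mov(struct state *s)
{
   /* Move an immediate into scratch register 0x48, as cs_simple() does */
   pan_emit_cs_32(s->cs, 0x48, 0x1234);

   /* Pad to a cache line, clean the ring buffer, advance CS_INSERT and
    * ring the doorbell */
   submit_cs(s, 0);

   /* Tell kbase that the queue has new work */
   if (!kick_queue(s, 0))
      return false;

   /* Poll CS_EXTRACT until it reaches CS_INSERT, handling queue events */
   return wait_cs(s, 0);
}
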
+static bool +wait_cs(struct state *s, unsigned i) +{ + unsigned extract_offset = (void *) s->cs[i].ptr - s->cs_mem[i].cpu; + + unsigned timeout_ms = 500; + + bool done_kick = false; + + while (CS_READ_REGISTER(s, i, CS_EXTRACT) != extract_offset) { + if (wait_event(s, timeout_ms)) { + if (pr) + fprintf(stderr, "Event wait timeout!\n"); + + unsigned e = CS_READ_REGISTER(s, i, CS_EXTRACT); + unsigned a = CS_READ_REGISTER(s, i, CS_ACTIVE); + + if (e != extract_offset) { + fprintf(stderr, "CS_EXTRACT (%i) != %i, " + "CS_ACTIVE (%i) on queue %i:", + e, extract_offset, a, i); + /* Decode two instructions instead? */ + pandecode_cs(s->cs_mem[i].gpu + e, 8, 1); + + if (done_kick) { + cache_barrier(); + return false; + } else { + fprintf(stderr, "Kicking queue\n"); + kick_queue(s, i); + done_kick = true; + } + } + } + } + + cache_barrier(); + + return true; +} + +static bool +cs_init(struct state *s, struct test *t) +{ + uint64_t event_init[] = { 1, 1, 1 }; + memcpy(s->allocations.event.cpu, event_init, sizeof(event_init)); + + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + CS_WRITE_REGISTER(s, i, CS_INSERT, 0); + pan_pack_ins(s->cs + i, CS_RESOURCES, cfg) { + switch (i) { + case 0: cfg.compute = true; break; + case 1: cfg.compute = true; cfg.fragment = true; break; + case 2: cfg.compute = true; cfg.tiler = true; cfg.idvs = true; break; + case 3: cfg.fragment = true; break; + } + } + pan_pack_ins(s->cs + i, CS_SLOT, cfg) { + cfg.index = 2; + } + pan_emit_cs_48(s->cs + i, CS_EVENT_REGISTER, + s->allocations.event.gpu); + submit_cs(s, i); + + if (!kick_queue(s, i)) + return false; + } + + return true; +} + +static struct panfrost_ptr * +buffers_elem(struct util_dynarray *buffers, unsigned index) +{ + unsigned size = util_dynarray_num_elements(buffers, + struct panfrost_ptr); + + if (index >= size) { + unsigned grow = index + 1 - size; + + memset(util_dynarray_grow(buffers, struct panfrost_ptr, grow), + 0, grow * sizeof(struct panfrost_ptr)); + } + + return util_dynarray_element(buffers, struct panfrost_ptr, index); +} + +static void +dump_hex64(FILE *fp, uint64_t *values, unsigned size) +{ + bool zero = false; + for (unsigned i = 0; i < size / 8; i += 2) { + uint64_t a = values[i]; + uint64_t b = values[i + 1]; + + if (!a && !b) { + if (!zero) + fprintf(fp, "%06X *\n", i * 8); + zero = true; + continue; + } + + zero = false; + + fprintf(fp, "%06X %16"PRIx64" %16"PRIx64"\n", + i * 8, a, b); + } + + fprintf(fp, "\n"); +} + +static void +dump_delta(FILE *fp, uint64_t *values, unsigned size) +{ + uint64_t old = 0; + bool zero = false; + bool el = false; + for (unsigned i = 0; i < size / 8; ++i) { + uint64_t val = values[i]; + int64_t delta = val - old; + + if (!zero || delta) { + fprintf(fp, "%"PRIi64"\n", delta); + el = false; + } else if (!el) { + fprintf(fp, "...\n"); + el = true; + } + + old = val; + zero = (delta == 0); + } +} + +static void +dump_tiler(FILE *fp, uint8_t *values, unsigned size) +{ + fflush(stdout); + FILE *stream = popen("tiler-hex-read", "w"); + // TODO! + fprintf(stream, "width %i\nheight %i\nmask %i\nvaheap %p\nsize %i\n", + 256, 256, 6, values, size); + pan_hexdump(stream, values, size, false); + pclose(stream); +} + +/* TODO: Pass in a filename? 
*/ +static void +dump_filehex(uint8_t *values, unsigned size) +{ + char buf[1024] = {0}; + + for (unsigned i = 0; i < 10000; ++i) { + snprintf(buf, 1024, "/tmp/fdump.%05i", i); + + int fd = open(buf, O_WRONLY | O_CREAT | O_EXCL, 0666); + if (fd == -1) + continue; + + FILE *fp = fdopen(fd, "w"); + + fprintf(fp, "%p, %u:\n", values, size); + pan_hexdump(fp, values, size, false); + + fclose(fp); /* will close fd */ + break; + } +} + +static void +dump_heatmap(FILE *fp, uint8_t *values, unsigned size, + unsigned gran, unsigned length, unsigned stride) +{ + unsigned sum = 0; + unsigned gr = 0; + unsigned st = 0; + unsigned ll = 0; + + while (size && !values[size - 1]) + --size; + + for (unsigned i = 0; i < size; ++i) { + sum += values[i]; + + if (++gr == gran) { + fprintf(fp, " %02x", sum & 0xff); + gr = 0; + sum = 0; + } + + if (++ll == length) { + i += stride - length; + fprintf(fp, "\n"); + st = 0; + ll = 0; + } else if (++st == stride) { + fprintf(fp, "\n"); + st = 0; + } + } + fprintf(fp, " %02x\n", sum & 0xff); +} + +static bool +cs_test(struct state *s, struct test *t) +{ + if (s->argc < 2) + return true; + + FILE *f = fopen(s->argv[1], "r"); + + struct util_dynarray buffers; + util_dynarray_init(&buffers, NULL); + + for (;;) { + char *line = NULL; + size_t sz = 0; + if (getline(&line, &sz, f) == -1) + break; + + unsigned long src, dst, offset, src_offset, size, iter, flags; + unsigned long gran, stride, length; + int read; + char *mode; + + if (sscanf(line, "rel%ms %lu+%lu %lu+%lu", + &mode, &dst, &offset, &src, &src_offset) == 5) { + + if (strcmp(mode, "oc") && strcmp(mode, "split")) { + fprintf(stderr, "Unknown relocation mode 'rel%s'\n", mode); + } + bool split = (mode[0] == 's'); + free(mode); + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + struct panfrost_ptr *d = buffers_elem(&buffers, dst); + + if (!s->gpu || !d->gpu) { + fprintf(stderr, "relocating to buffer that doesn't exist!\n"); + } + + uint64_t *dest = d->cpu + offset; + uint64_t value = s->gpu + src_offset; + if (split) { + dest[0] |= (uint32_t) value; + dest[1] |= (uint32_t) (value >> 32); + } else { + *dest |= value; + } + + } else if (sscanf(line, "buffer %lu %lu %lx %n", + &dst, &size, &flags, &read) == 3) { + line += read; + + struct panfrost_ptr buffer = + alloc_mem(s, ALIGN_POT(size, s->page_size), + flags); + + alloc_redzone(s, buffer, ALIGN_POT(size, s->page_size)); + + *buffers_elem(&buffers, dst) = buffer; + + //printf("buffer %lu == 0x%lx\n", dst, buffer.gpu); + + uint64_t *fill = buffer.cpu; + + for (unsigned i = 0; i < size / 8; ++i) { + read = 0; + unsigned long long val = 0; + if (sscanf(line, "%Lx %n", &val, &read) != 1) + break; + line += read; + fill[i] = val; + } + + cache_clean_range(buffer.cpu, size); + + } else if (sscanf(line, "exe %n %lu %lu %lu", + &read, &iter, &dst, &size) == 3) { + line += read; + + unsigned iter_mask = 0; + + for (;;) { + read = 0; + if (sscanf(line, "%lu %lu %lu %n", + &iter, &dst, &size, &read) != 3) + break; + line += read; + + struct panfrost_ptr *d = + buffers_elem(&buffers, dst); + + /* TODO: Check 'size' against buffer size */ + + pandecode_cs(d->gpu, size, s->gpu_id); + + if (iter > 3) { + fprintf(stderr, + "execute on out-of-bounds " + "iterator\n"); + continue; + } + + memcpy(s->cs[iter].ptr, d->cpu, size); + s->cs[iter].ptr += size / 8; + + iter_mask |= (1 << iter); + } + + u_foreach_bit(i, iter_mask) + submit_cs(s, i); + + u_foreach_bit(i, iter_mask) + kick_queue(s, i); + + u_foreach_bit(i, iter_mask) + wait_cs(s, i); + + } else if (sscanf(line, "dump 
%lu %lu %lu %ms", + &src, &offset, &size, &mode) == 4) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "dumping buffer that doesn't exist!\n"); + + cache_invalidate_range(s->cpu + offset, size); + + if (!strcmp(mode, "hex")) + pan_hexdump(stdout, s->cpu + offset, size, true); + else if (!strcmp(mode, "hex64")) + dump_hex64(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "delta")) + dump_delta(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "tiler")) + dump_tiler(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "filehex")) + dump_filehex(s->cpu + offset, size); + + free(mode); + + } else if (sscanf(line, "heatmap %lu %lu %lu %lu %lu %lu", + &src, &offset, &size, + &gran, &length, &stride) == 6) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "dumping buffer that doesn't exist!\n"); + + cache_invalidate_range(s->cpu + offset, size); + + dump_heatmap(stdout, s->cpu + offset, size, + gran, length, stride); + + } else if (sscanf(line, "memset %lu %lu %lu %lu", + &src, &offset, &gran, &size) == 4) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "memset on buffer that doesn't exist!\n"); + + memset(s->cpu + offset, gran, size); + cache_clean_range(s->cpu + offset, size); + + } else if (sscanf(line, "sleep %lu", &size) == 1) { + + usleep(size * 1000); + + } else if (strcmp(line, "td\n") == 0 || strcmp(line, "td") == 0) { + + void *ptr; + + ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, + s->tiler_heap_header); + pan_hexdump(stdout, ptr, 4096, false); + pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); + munmap(ptr, 1 << 21); + + ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, + s->tiler_heap_header + (1 << 21)); + pan_hexdump(stdout, ptr, 4096, false); + pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); + munmap(ptr, 1 << 21); + + } else { + fprintf(stderr, "unknown command '%s'\n", line); + } + } + + /* Skip following tests */ + return false; +} + +static void +pan_cs_evadd(pan_command_stream *c, unsigned offset, unsigned value) +{ + pan_emit_cs_32(c, 0x5e, value); + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = offset; + cfg.src = 0x5a; + cfg.dest = 0x5c; + } + pan_pack_ins(c, CS_EVADD, cfg) { + cfg.value = 0x5e; + cfg.addr = 0x5c; + } +} + +static bool +cs_simple(struct state *s, struct test *t) +{ + unsigned queue = t->vertex ? 2 : 0; + + pan_command_stream *c = s->cs + queue; + + unsigned dest = t->invalid ? 
0x65 : 0x48; + + pan_emit_cs_32(c, dest, 0x1234); + pan_cs_evadd(c, 0, 1); + + submit_cs(s, queue); + return wait_cs(s, queue); +} + +static bool +cs_store(struct state *s, struct test *t) +{ + pan_command_stream *c = s->cs; + + uint32_t *dest = s->allocations.ev2.cpu + 240; + mali_ptr dest_va = s->allocations.ev2.gpu + 240; + uint32_t value = 1234; + uint32_t add = 4320000; + + *dest = 0; + cache_clean(dest); + + unsigned addr_reg = 0x48; + unsigned value_reg = 0x4a; + + if (t->invalid) + dest_va = 0xfdcba9876543; + + pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = (1 << 1); } + pan_emit_cs_48(c, addr_reg, dest_va); + pan_emit_cs_32(c, value_reg, value); + + if (t->add) { + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = add; + cfg.src = value_reg; + cfg.dest = value_reg; + } + value += add; + } + + pan_pack_ins(c, CS_STR, cfg) { + cfg.addr = addr_reg; + cfg.register_base = value_reg; + cfg.register_mask = 1; + } + pan_cs_evadd(c, 0, 1); + + submit_cs(s, 0); + wait_cs(s, 0); + + cache_invalidate(dest); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = *dest; + + if (t->invalid && result == value) { + printf("Got %i, did not expect %i: ", result, value); + return false; + } else if (result != value) { + printf("Got %i, expected %i: ", result, value); + return false; + } + + return true; +} + +static void +emit_cs_call(pan_command_stream *c, mali_ptr va, void *start, void *end) +{ + cache_clean_range(start, end - start); + + pan_emit_cs_48(c, 0x48, va); + pan_emit_cs_32(c, 0x4a, end - start); + pan_pack_ins(c, CS_CALL, cfg) { + cfg.address = 0x48; + cfg.length = 0x4a; + } +} + +static bool +cs_sub(struct state *s, struct test *t) +{ + pan_command_stream *c = s->cs; + pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; + mali_ptr cs_va = s->allocations.cached.gpu; + + uint32_t *dest = s->allocations.normal.cpu; + mali_ptr dest_va = s->allocations.normal.gpu; + uint32_t value = 4321; + + *dest = 0; + cache_clean(dest); + + unsigned addr_reg = 0x48; + unsigned value_reg = 0x4a; + + void *start = i->ptr; + + pan_emit_cs_ins(c, 0x30, 0x5a0000000000); + + pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } + pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = (1 << 3); } + //pan_emit_cs_ins(i, 0x31, 0); + + pan_emit_cs_48(i, addr_reg, dest_va); + pan_emit_cs_32(i, value_reg, value); + //pan_emit_cs_ins(i, 0x25, 0x01484a00000005ULL); + pan_pack_ins(i, CS_STR, cfg) { + cfg.addr = addr_reg; + cfg.register_base = value_reg; + cfg.register_mask = 1; + } + //pan_emit_cs_ins(i, 0x09, 0); + //pan_emit_cs_ins(i, 0x31, 0x100000000); + + //pan_emit_cs_ins(i, 0x24, 0x024a0000f80211ULL); + + /* + pan_pack_ins(i, CS_STR_32, cfg) { + cfg.unk_1 = 1; + cfg.unk_2 = 4; + cfg.unk_3 = 1; + cfg.addr = addr_reg; + cfg.value = value_reg; + }*/ + + emit_cs_call(c, cs_va, start, i->ptr); + pan_cs_evadd(c, 0, 1); + + submit_cs(s, 0); + wait_cs(s, 0); + + cache_invalidate(dest); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = *dest; + + if (result != value) { + printf("Got %i, expected %i: ", result, value); + return false; + } + + return true; +} + +static mali_ptr +upload_shader(struct state *s, struct util_dynarray binary) +{ + assert(s->shader_alloc_offset + binary.size < s->page_size); + + mali_ptr va = s->allocations.exec.gpu + s->shader_alloc_offset; + + memcpy(s->allocations.exec.cpu, binary.data, binary.size); + + /* Shouldn't be needed, but just in case... 
*/ + cache_clean_range(s->allocations.exec.cpu, binary.size); + + s->shader_alloc_offset += binary.size; + + return va; +} + +static bool +compute_compile(struct state *s, struct test *t) +{ + nir_builder _b = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, + GENX(pan_shader_get_compiler_options)(), + "mem_store"), *b = &_b; + + nir_ssa_def *ptr = + nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0)); + + nir_ssa_def *value = nir_imm_int(b, 123); + + nir_store_global(b, ptr, 8, value, 1); + + struct panfrost_compile_inputs inputs = { + .gpu_id = s->gpu_id, + .no_ubo_to_push = true, + }; + + struct util_dynarray binary = {0}; + struct pan_shader_info shader_info = {0}; + + GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info); + + dump_start(stderr); + disassemble_valhall(stderr, binary.data, binary.size, true); + dump_end(stderr); + + s->compute_shader = upload_shader(s, binary); + + util_dynarray_fini(&binary); + ralloc_free(b->shader); + + return true; +} + +static struct panfrost_ptr +mem_offset(struct panfrost_ptr ptr, unsigned offset) +{ + ptr.cpu += offset; + ptr.gpu += offset; + return ptr; +} + +static bool +compute_execute(struct state *s, struct test *t) +{ + unsigned queue = t->blit ? 1 : 0; + + pan_command_stream *c = s->cs + queue; + pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; + mali_ptr cs_va = s->allocations.cached.gpu; + + struct panfrost_ptr dest = s->allocations.normal; + uint32_t value = 123; + + *(uint32_t *) dest.cpu = 0; + cache_clean(dest.cpu); + + struct panfrost_ptr fau = mem_offset(dest, 128); + *(uint64_t *) fau.cpu = dest.gpu; + cache_clean(fau.cpu); + + struct panfrost_ptr local_storage = mem_offset(dest, 192); + pan_pack(local_storage.cpu, LOCAL_STORAGE, _); + cache_clean(local_storage.cpu); + + struct panfrost_ptr shader_program = mem_offset(dest, 256); + pan_pack(shader_program.cpu, SHADER_PROGRAM, cfg) { + cfg.stage = MALI_SHADER_STAGE_COMPUTE; + cfg.primary_shader = true; + cfg.register_allocation = + MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; + cfg.binary = s->compute_shader; + } + cache_clean(shader_program.cpu); + + void *start = i->ptr; + + pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } + //pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = 1 << 3; } + + pan_pack_cs(i, COMPUTE_PAYLOAD, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; + + cfg.workgroup_count_x = 1; + cfg.workgroup_count_y = 1; + cfg.workgroup_count_z = 1; + + cfg.compute.shader = shader_program.gpu; + cfg.compute.thread_storage = local_storage.gpu; + + cfg.compute.fau = fau.gpu; + cfg.compute.fau_count = 1; + } + + pan_pack_ins(i, COMPUTE_LAUNCH, _); + + //pan_emit_cs_32(c, 0x54, 1); + //pan_emit_cs_ins(c, 0x24, 0x540000000233); + emit_cs_call(c, cs_va, start, i->ptr); + + pan_emit_cs_32(c, 0x4a, 0); + pan_emit_cs_ins(c, 0x24, 0x024a0000000211ULL); + + pan_emit_cs_48(c, 0x48, dest.gpu); + pan_pack_ins(c, CS_LDR, cfg) { + cfg.offset = 0; + cfg.register_mask = 1; + cfg.addr = 0x48; + cfg.register_base = 0x20; + } + pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1; } + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = 1; + cfg.src = 0x20; + cfg.dest = 0x20; + } + pan_pack_ins(c, CS_STR, cfg) { + cfg.offset = 64; + cfg.register_mask = 1; + cfg.addr = 0x48; + cfg.register_base = 0x20; + } + + pan_cs_evadd(c, 0, 1); + + submit_cs(s, queue); + wait_cs(s, queue); + + cache_invalidate(dest.cpu); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = ((uint32_t *)dest.cpu)[0]; + uint32_t result2 = 
((uint32_t *)dest.cpu)[16]; + + if (result != value) { + printf("Got %i, %i, expected %i: ", result, result2, value); + return false; + } + + return true; +} + +static bool +mmu_dump(struct state *s, struct test *t) +{ + unsigned size = 1024 * 1024; + + void *mem = mmap(NULL, size, PROT_READ, MAP_SHARED, + s->mali_fd, BASE_MEM_MMU_DUMP_HANDLE); + if (mem == MAP_FAILED) { + perror("mmap(BASE_MEM_MMU_DUMP_HANDLE)"); + return false; + } + + pan_hexdump(stdout, mem, size, true); + + return true; +} + +#define SUBTEST(s) { .label = #s, .subtests = s, .sub_length = ARRAY_SIZE(s) } + +#define STATE(item) .offset = offsetof(struct state, item) + +#define ALLOC(item) .offset = offsetof(struct state, allocations.item) +#define ALLOC_TEST(label, item, f) { alloc, dealloc, label, ALLOC(item), .flags = f } + +struct test kbase_main[] = { + { open_kbase, close_kbase, "Open kbase device" }, + { get_version, NULL, "Check version" }, + { set_flags, NULL, "Set flags" }, + { mmap_tracking, munmap_tracking, "Map tracking handle" }, + { get_gpuprops, free_gpuprops, "Get GPU properties" }, + { get_gpu_id, NULL, "GPU ID" }, + { get_coherency_mode, NULL, "Coherency mode" }, + { get_csf_caps, NULL, "CSF caps" }, + { mmap_user_reg, munmap_user_reg, "Map user register page" }, + { init_mem_exec, NULL, "Initialise EXEC_VA zone" }, + { init_mem_jit, NULL, "Initialise JIT allocator" }, + { stream_create, stream_destroy, "Create synchronisation stream" }, + { tiler_heap_create, tiler_heap_term, "Create chunked tiler heap" }, + { cs_group_create, cs_group_term, "Create command stream group" }, + { decode_init, decode_close, "Initialize pandecode" }, + + /* Flags are named in mali_base_csf_kernel.h, omitted for brevity */ + ALLOC_TEST("Allocate normal memory", normal, 0x200f), + ALLOC_TEST("Allocate exectuable memory", exec, 0x2017), + ALLOC_TEST("Allocate coherent memory", coherent, 0x280f), + ALLOC_TEST("Allocate cached memory", cached, 0x380f), + ALLOC_TEST("Allocate CSF event memory", event, 0x8200f), + ALLOC_TEST("Allocate CSF event memory 2", ev2, 0x8200f), + + /* These three tests are run for every queue, but later ones are not */ + { cs_queue_create, cs_queue_free, "Create command stream queues" }, + { cs_queue_register, cs_queue_term, "Register command stream queues" }, + + { cs_test, NULL, "Test command stream" }, + + { cs_init, NULL, "Initialise and start command stream queues" }, + { cs_simple, NULL, "Execute MOV command" }, + { cs_simple, NULL, "Execute MOV command (again)" }, + { cs_simple, NULL, "Execute MOV command (vertex)", .vertex = true }, + //{ cs_simple, NULL, "Execute MOV command (vertex, invalid)", .invalid = true, .vertex = true }, + { cs_simple, NULL, "Execute MOV command (vertex, again)", .vertex = true }, + { cs_store, NULL, "Execute STR command" }, + //{ cs_store, NULL, "Execute STR command to invalid address", .invalid = true }, + { cs_store, NULL, "Execute ADD command", .add = true }, + { cs_sub, NULL, "Execute STR on iterator" }, + + { compute_compile, NULL, "Compile a compute shader" }, + { compute_execute, NULL, "Execute a compute shader" }, + { compute_execute, NULL, "Execute compute on blit queue", .blit = true }, + + //{ mmu_dump, NULL, "Dump MMU pagetables" }, +}; + +static void +do_test_list(struct state *s, struct test *tests, unsigned length); + +static void +cleanup_test_list(struct state *s, struct test *tests, unsigned length) +{ + for (unsigned i = length; i > 0; --i) { + unsigned n = i - 1; + + struct test *t = &tests[n]; + if (!t->cleanup) + continue; + + if (pr) + 
printf("[CLEANUP %i] %s: ", n, t->label); + if (t->cleanup(s, t)) { + if (pr) + printf("PASS\n"); + } else { + if (pr) + printf("FAIL\n"); + } + } +} + +static unsigned +interpret_test_list(struct state *s, struct test *tests, unsigned length) +{ + for (unsigned i = 0; i < length; ++i) { + struct test *t = &tests[i]; + + if (pr) + printf("[TEST %i] %s: ", i, t->label); + if (t->part) { + if (t->part(s, t)) { + if (pr) + printf("PASS\n"); + } else { + if (pr) + printf("FAIL\n"); + if (!getenv("TEST_KEEP_GOING")) + return i + 1; + } + } + if (t->subtests) + do_test_list(s, t->subtests, t->sub_length); + } + + return length; +} + +static void +do_test_list(struct state *s, struct test *tests, unsigned length) +{ + unsigned ran = interpret_test_list(s, tests, length); + cleanup_test_list(s, tests, ran); +} + +int +main(int argc, char *argv[]) +{ + struct state s = { + .page_size = sysconf(_SC_PAGE_SIZE), + .argc = argc, + .argv = argv, + }; + + if (getenv("CSF_QUIET")) + pr = false; + + if (!strcmp(getenv("TERM"), "dumb")) + colour_term = false; + + if (pr) + printf("Running Valhall CSF tests\n"); + + do_test_list(&s, kbase_main, ARRAY_SIZE(kbase_main)); +} diff --git a/src/panfrost/lib/genxml/common.xml b/src/panfrost/lib/genxml/common.xml index d4b5240fb01..d75baaba208 100644 --- a/src/panfrost/lib/genxml/common.xml +++ b/src/panfrost/lib/genxml/common.xml @@ -46,7 +46,7 @@ - + diff --git a/src/panfrost/lib/genxml/decode.c.rej b/src/panfrost/lib/genxml/decode.c.rej new file mode 100644 index 00000000000..946a9fb8bfb --- /dev/null +++ b/src/panfrost/lib/genxml/decode.c.rej @@ -0,0 +1,940 @@ +diff a/src/panfrost/lib/genxml/decode.c b/src/panfrost/lib/genxml/decode.c (rejected hunks) +@@ -54,6 +54,12 @@ + pan_section_print(pandecode_dump_stream, A, S, temp, (pandecode_indent + 1) * 2); \ + } + ++#define DUMP_SECTION_CS_V10(A, S, cl, buf, buf_unk, ...) 
{ \ ++ pan_section_unpack_cs_v10(cl, buf, buf_unk, A, S, temp); \ ++ pandecode_log(__VA_ARGS__); \ ++ pan_section_print(pandecode_dump_stream, A, S, temp, (pandecode_indent + 1) * 2); \ ++} ++ + #define MAP_ADDR(T, addr, cl) \ + const uint8_t *cl = pandecode_fetch_gpu_mem(addr, pan_size(T)); + +@@ -158,7 +164,7 @@ pandecode_midgard_tiler_descriptor( + if (nonzero_weights) + DUMP_UNPACKED(TILER_WEIGHTS, w, "Tiler Weights:\n"); + } +-#endif ++#endif /* PAN_ARCH <= 5 */ + + #if PAN_ARCH >= 5 + static void +@@ -184,7 +190,7 @@ pandecode_render_target(uint64_t gpu_va, unsigned gpu_id, + pandecode_indent--; + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH >= 5 */ + + #if PAN_ARCH >= 6 + static void +@@ -201,7 +207,7 @@ pandecode_sample_locations(const void *fb) + samples[2 * i + 1] - 128); + } + } +-#endif ++#endif /* PAN_ARCH >= 6 */ + + static void + pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, +@@ -228,29 +234,29 @@ pandecode_fbd(uint64_t gpu_va, bool is_fragment, unsigned gpu_id) + #if PAN_ARCH >= 6 + pandecode_sample_locations(fb); + +- unsigned dcd_size = pan_size(DRAW); ++ unsigned dcd_size = pan_size(DRAW_NO_CS); + + if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (0 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Pre frame 0:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } + + if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (1 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Pre frame 1:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } + + if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (2 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Post frame:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } +-#else ++#else /* PAN_ARCH < 6 */ + DUMP_SECTION(FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n"); + + const void *t = pan_section_ptr(fb, FRAMEBUFFER, TILER); +@@ -284,7 +290,7 @@ pandecode_fbd(uint64_t gpu_va, bool is_fragment, unsigned gpu_id) + .rt_count = params.render_target_count, + .has_extra = params.has_zs_crc_extension + }; +-#else ++#else /* PAN_ARCH < 5 */ + /* Dummy unpack of the padding section to make sure all words are 0. + * No need to call print here since the section is supposed to be empty. + */ +@@ -341,7 +347,7 @@ pandecode_attributes(mali_ptr addr, int count, + } + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH >= 5 + static mali_ptr +@@ -358,7 +364,7 @@ pandecode_blend(void *descs, int rt_no, mali_ptr frag_shader) + return b.blend_shader ? 
(b.shader_pc & ~0xf) : 0; + #endif + } +-#endif ++#endif /* PAN_ARCH >= 6 || PAN_ARCH == 5 */ + + #if PAN_ARCH <= 7 + static unsigned +@@ -412,8 +418,9 @@ pandecode_invocation(const void *i) + + DUMP_UNPACKED(INVOCATION, invocation, "Invocation:\n") + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + ++#if PAN_ARCH < 10 + static void + pandecode_primitive(const void *p) + { +@@ -439,7 +446,7 @@ pandecode_primitive(const void *p) + pandecode_validate_buffer(primitive.indices, primitive.index_count * size); + } else if (primitive.index_type) + pandecode_log("// XXX: unexpected index size\n"); +-#endif ++#endif /* PAN_ARCH <= 7 */ + } + + static void +@@ -451,6 +458,7 @@ pandecode_primitive_size(const void *s, bool constant) + + DUMP_UNPACKED(PRIMITIVE_SIZE, ps, "Primitive Size:\n") + } ++#endif /* PAN_ARCH < 10 */ + + #if PAN_ARCH <= 7 + static void +@@ -482,7 +490,7 @@ pandecode_uniforms(mali_ptr uniforms, unsigned uniform_count) + free(ptr); + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + static void + pandecode_shader_disassemble(mali_ptr shader_ptr, int type, unsigned gpu_id) +@@ -566,7 +574,7 @@ pandecode_texture_payload(mali_ptr payload, + pandecode_indent--; + pandecode_log("},\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH <= 5 + static void +@@ -585,7 +593,7 @@ pandecode_texture(mali_ptr u, unsigned tex) + temp.levels, nr_samples, temp.array_size); + pandecode_indent--; + } +-#else ++#else /* PAN_ARCH > 5 */ + static void + pandecode_texture(const void *cl, unsigned tex) + { +@@ -603,7 +611,7 @@ pandecode_texture(const void *cl, unsigned tex) + + for (unsigned i = 0; i < plane_count; ++i) + DUMP_ADDR(PLANE, temp.surfaces + i * pan_size(PLANE), "Plane %u:\n", i); +-#else ++#else /* PAN_ARCH < 9 */ + unsigned nr_samples = temp.dimension == MALI_TEXTURE_DIMENSION_3D ? 
+ 1 : temp.sample_count; + +@@ -630,7 +638,7 @@ pandecode_textures(mali_ptr textures, unsigned texture_count) + + for (unsigned tex = 0; tex < texture_count; ++tex) + pandecode_texture(cl + pan_size(TEXTURE) * tex, tex); +-#else ++#else /* PAN_ARCH < 6 */ + mali_ptr *PANDECODE_PTR_VAR(u, textures); + + for (int tex = 0; tex < texture_count; ++tex) { +@@ -741,7 +749,7 @@ pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + gpu_id); + } + } +-#endif ++#endif /* PAN_ARCH >= 5 */ + } else + pandecode_log("// XXX: missing shader descriptor\n"); + +@@ -807,7 +815,7 @@ pandecode_vertex_compute_geometry_job(const struct MALI_JOB_HEADER *h, + pandecode_indent--; + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH >= 6 + static void +@@ -823,6 +831,10 @@ pandecode_tiler(mali_ptr gpu_va) + DUMP_UNPACKED(TILER_CONTEXT, t, "Tiler:\n"); + } + ++#endif /* PAN_ARCH >= 6 */ ++ ++#if PAN_ARCH < 10 ++#if PAN_ARCH >= 6 + #if PAN_ARCH <= 7 + static void + pandecode_indexed_vertex_job(const struct MALI_JOB_HEADER *h, +@@ -854,8 +866,8 @@ pandecode_indexed_vertex_job(const struct MALI_JOB_HEADER *h, + + pan_section_unpack(p, INDEXED_VERTEX_JOB, PADDING, padding); + } +-#endif +-#endif ++#endif /* PAN_ARCH <= 7 */ ++#endif /* PAN_ARCH >= 6 */ + + static void + pandecode_tiler_job(const struct MALI_JOB_HEADER *h, +@@ -890,7 +902,7 @@ pandecode_tiler_job(const struct MALI_JOB_HEADER *h, + pan_section_unpack(p, TILER_JOB, PADDING, padding); + #endif + +-#else ++#else /* PAN_ARCH < 6 */ + pan_section_unpack(p, TILER_JOB, PRIMITIVE, primitive); + pandecode_primitive_size(pan_section_ptr(p, TILER_JOB, PRIMITIVE_SIZE), + primitive.point_size_array_format == MALI_POINT_SIZE_ARRAY_FORMAT_NONE); +@@ -898,12 +910,17 @@ pandecode_tiler_job(const struct MALI_JOB_HEADER *h, + pandecode_indent--; + pandecode_log("\n"); + } ++#endif /* PAN_ARCH < 10 */ + + static void +-pandecode_fragment_job(mali_ptr job, unsigned gpu_id) ++pandecode_fragment_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_fragment_job_packed *PANDECODE_PTR_VAR(p, job); +- pan_section_unpack(p, FRAGMENT_JOB, PAYLOAD, s); ++#endif ++ ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, FRAGMENT_JOB, PAYLOAD, s); + + UNUSED struct pandecode_fbd info = pandecode_fbd(s.framebuffer, true, gpu_id); + +@@ -920,7 +937,7 @@ pandecode_fragment_job(mali_ptr job, unsigned gpu_id) + expected_tag |= MALI_FBD_TAG_HAS_ZS_RT; + + expected_tag |= MALI_FBD_TAG_IS_MFBD | (MALI_POSITIVE(info.rt_count) << 2); +-#endif ++#endif /* PAN_ARCH >= 5 */ + + DUMP_UNPACKED(FRAGMENT_JOB_PAYLOAD, s, "Fragment Job Payload:\n"); + +@@ -936,6 +953,8 @@ pandecode_fragment_job(mali_ptr job, unsigned gpu_id) + pandecode_log("\n"); + } + ++#if PAN_ARCH < 10 ++// TODO: Use the same model as for malloc_vertex jobs? 
+ static void + pandecode_write_value_job(mali_ptr job) + { +@@ -953,6 +972,7 @@ pandecode_cache_flush_job(mali_ptr job) + DUMP_SECTION(CACHE_FLUSH_JOB, PAYLOAD, p, "Cache Flush Payload:\n"); + pandecode_log("\n"); + } ++#endif /* PAN_ARCH < 10 */ + + #if PAN_ARCH >= 9 + static void +@@ -1034,6 +1054,9 @@ pandecode_resource_tables(mali_ptr addr, const char *label) + static void + pandecode_depth_stencil(mali_ptr addr) + { ++ if (!addr) ++ return; ++ + MAP_ADDR(DEPTH_STENCIL, addr, cl); + pan_unpack(cl, DEPTH_STENCIL, desc); + DUMP_UNPACKED(DEPTH_STENCIL, desc, "Depth/stencil"); +@@ -1060,14 +1083,15 @@ static void + pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + unsigned gpu_id) + { +- mali_ptr frag_shader = 0; +- + pandecode_depth_stencil(p->depth_stencil); + + for (unsigned i = 0; i < p->blend_count; ++i) { ++ MAP_ADDR(SHADER_PROGRAM, p->shader.shader, cl); ++ pan_unpack(cl, SHADER_PROGRAM, desc); ++ + struct mali_blend_packed *PANDECODE_PTR_VAR(blend_descs, p->blend); + +- mali_ptr blend_shader = pandecode_blend(blend_descs, i, frag_shader); ++ mali_ptr blend_shader = pandecode_blend(blend_descs, i, desc.binary); + if (blend_shader) { + fprintf(pandecode_dump_stream, "Blend shader %u", i); + pandecode_shader_disassemble(blend_shader, 0, gpu_id); +@@ -1079,21 +1103,26 @@ pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + } + + static void +-pandecode_malloc_vertex_job(mali_ptr job, unsigned gpu_id) ++pandecode_malloc_vertex_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_malloc_vertex_job_packed *PANDECODE_PTR_VAR(p, job); ++#endif + +- DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE, p, "Primitive:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, INSTANCE_COUNT, p, "Instance count:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, PRIMITIVE, p, cs_buf, cs_buf_unk, "Primitive:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, INSTANCE_COUNT, p, cs_buf, cs_buf_unk, "Instance count:\n"); ++#if PAN_ARCH < 10 + DUMP_SECTION(MALLOC_VERTEX_JOB, ALLOCATION, p, "Allocation:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, TILER, p, "Tiler:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, SCISSOR, p, "Scissor:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE_SIZE, p, "Primitive Size:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, INDICES, p, "Indices:\n"); ++#endif ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, TILER, p, cs_buf, cs_buf_unk, "Tiler:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, SCISSOR, p, cs_buf, cs_buf_unk, "Scissor:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, PRIMITIVE_SIZE, p, cs_buf, cs_buf_unk, "Primitive Size:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, INDICES, p, cs_buf, cs_buf_unk, "Indices:\n"); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, DRAW, dcd); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, DRAW, dcd); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, TILER, tiler_ptr); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, TILER, tiler_ptr); + pandecode_log("Tiler Job Payload:\n"); + pandecode_indent++; + if (tiler_ptr.address) +@@ -1104,17 +1133,20 @@ pandecode_malloc_vertex_job(mali_ptr job, unsigned gpu_id) + + pandecode_dcd(&dcd, 0, gpu_id); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, POSITION, position); +- pan_section_unpack(p, MALLOC_VERTEX_JOB, VARYING, varying); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, POSITION, position); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, VARYING, varying); + 
pandecode_shader_environment(&position, gpu_id); + pandecode_shader_environment(&varying, gpu_id); + } + + static void +-pandecode_compute_job(mali_ptr job, unsigned gpu_id) ++pandecode_compute_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_compute_job_packed *PANDECODE_PTR_VAR(p, job); +- pan_section_unpack(p, COMPUTE_JOB, PAYLOAD, payload); ++#endif ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, COMPUTE_JOB, PAYLOAD, payload); + + pandecode_shader(payload.compute.shader, "Shader", gpu_id); + if (payload.compute.thread_storage) +@@ -1126,8 +1158,9 @@ pandecode_compute_job(mali_ptr job, unsigned gpu_id) + + DUMP_UNPACKED(COMPUTE_PAYLOAD, payload, "Compute:\n"); + } +-#endif ++#endif /* PAN_ARCH >= 9 */ + ++#if PAN_ARCH < 10 + /* Entrypoint to start tracing. jc_gpu_va is the GPU address for the first job + * in the chain; later jobs are found by walking the chain. GPU ID is the + * more finegrained ID because some details are model-specific even within a +@@ -1183,18 +1216,18 @@ GENX(pandecode_jc)(mali_ptr jc_gpu_va, unsigned gpu_id) + pandecode_indexed_vertex_job(&h, jc_gpu_va, gpu_id); + break; + #endif +-#else ++#else /* PAN_ARCH > 7 */ + case MALI_JOB_TYPE_COMPUTE: +- pandecode_compute_job(jc_gpu_va, gpu_id); ++ pandecode_compute_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + + case MALI_JOB_TYPE_MALLOC_VERTEX: +- pandecode_malloc_vertex_job(jc_gpu_va, gpu_id); ++ pandecode_malloc_vertex_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + #endif + + case MALI_JOB_TYPE_FRAGMENT: +- pandecode_fragment_job(jc_gpu_va, gpu_id); ++ pandecode_fragment_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + + default: +@@ -1232,3 +1265,544 @@ GENX(pandecode_abort_on_fault)(mali_ptr jc_gpu_va) + + pandecode_map_read_write(); + } ++#endif ++ ++#if PAN_ARCH >= 10 ++static void ++pandecode_cs_dump_state(uint32_t *state) ++{ ++ uint64_t *st_64 = (uint64_t *)state; ++ /* Only registers below 0x40 seem to be actually be used by jobs */ ++ for (unsigned i = 0; i < 0x40 / 4; ++i) { ++ uint64_t v1 = st_64[i * 2]; ++ uint64_t v2 = st_64[i * 2 + 1]; ++ ++ if (!v1 && !v2) ++ continue; ++ ++ pandecode_log("0x%2x: 0x%16"PRIx64" 0x%16"PRIx64"\n", ++ i * 4, v1, v2); ++ } ++} ++ ++/* Assumes eight scoreboards */ ++static void ++pandecode_scoreboard_mask(unsigned mask) ++{ ++ if (mask == 0xff) { ++ pandecode_log_cont("all"); ++ return; ++ } else if (!mask) { ++ pandecode_log_cont("none"); ++ return; ++ } ++ ++ const char *comma = ""; ++ for (unsigned i = 0; i < 8; ++i) { ++ if (mask & (1 << i)) { ++ pandecode_log_cont("%s%i", comma, i); ++ comma = ","; ++ } ++ } ++} ++ ++static void ++pandecode_regmask(unsigned base, unsigned mask) ++{ ++ switch (mask) { ++ case 0: ++ pandecode_log_cont("(invalid: %02x mask 0)", base); ++ return; ++ case 1: ++ pandecode_log_cont("w%02x", base); ++ return; ++ case 3: ++ pandecode_log_cont("x%02x", base); ++ return; ++ default: ++ break; ++ } ++ ++ unsigned first = ffs(mask) - 1; ++ if (first) ++ pandecode_log_cont("{(+%i) ", first); ++ else ++ pandecode_log_cont("{"); ++ ++ unsigned edges = mask ^ (mask << 1); ++ ++ const char *comma = ""; ++ ++ bool outside = true; ++ unsigned start; ++ u_foreach_bit(i, edges) { ++ if (outside) ++ start = i; ++ else if (i == start + 1) ++ pandecode_log_cont("%sw%02x", comma, ++ base + start); ++ else if (i == start + 2) ++ pandecode_log_cont("%sx%02x", comma, ++ base + start); ++ else ++ pandecode_log_cont("%sw%02x-w%02x", comma, ++ base + start, ++ base + i - 1); ++ outside = !outside; 
++ ++ if (outside) ++ comma = ", "; ++ } ++ ++ pandecode_log_cont("}"); ++} ++ ++static void ++pandecode_cs_buffer(uint64_t *commands, unsigned size, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id, mali_ptr va); ++ ++// Hack hack hackity hack: gpu_id == 1 means "don't decode" (only disassemble) ++static void ++pandecode_cs_command(uint64_t command, mali_ptr va, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id) ++{ ++ uint8_t op = command >> 56; ++ uint8_t addr = (command >> 48) & 0xff; ++ uint64_t value = command & 0xffffffffffffULL; ++ ++ uint32_t h = value >> 32; ++ uint32_t l = value; ++ ++ uint8_t arg1 = h & 0xff; ++ uint8_t arg2 = h >> 8; ++ ++ if (command) ++ pandecode_log("%"PRIx64" %016"PRIx64" ", va, command); ++ ++ switch (op) { ++ case 0: ++ if (addr || value) ++ pandecode_log("nop %02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 1: ++ buffer_unk[addr] = buffer[addr] = l; ++ buffer_unk[addr + 1] = buffer[addr + 1] = h; ++ pandecode_log("mov x%02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 2: ++ buffer_unk[addr] = buffer[addr] = l; ++ pandecode_log("mov w%02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 3: ++ if (l & 0xff00ffff || h || addr) { ++ pandecode_log("wait (unk %02x), (unk %04x), " ++ "%i, (unk %04x)\n", addr, h, l >> 16, l); ++ } else { ++ pandecode_log("wait "); ++ pandecode_scoreboard_mask(l >> 16); ++ pandecode_log_cont("\n"); ++ } ++ break; ++ case 4: { ++ uint32_t masked = l & 0xffff0000; ++ unsigned task_increment = l & 0x3fff; ++ unsigned task_axis = (l >> 14) & 3; ++ if (h != 0xff00 || addr || masked) ++ pandecode_log("compute (unk %02x), (unk %04x), " ++ "(unk %x), inc %i, axis %i\n\n", addr, h, masked, task_increment, task_axis); ++ else ++ pandecode_log("compute inc %i, axis %i\n\n", task_increment, task_axis); ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_compute_job(0, buffer, buffer_unk, gpu_id); ++ ++ /* The gallium driver emits this even for compute jobs, clear ++ * it from unknown state */ ++ pan_unpack_cs(buffer, buffer_unk, SCISSOR, unused_scissor); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ case 6: { ++ /* The meaning of the first argument (in h) is unknown, but it ++ * appears that the second bit must be set. 
*/ ++ uint32_t masked = l & 0xfffff8f0; ++ uint8_t mode = l & 0xf; ++ uint8_t index = (l >> 8) & 7; ++ if (addr || masked) ++ pandecode_log("idvs (unk %02x), 0x%04x, (unk %x), " ++ "mode %i index %i\n\n", ++ addr, h, masked, mode, index); ++ else ++ pandecode_log("idvs 0x%04x, mode %i index %i\n\n", ++ h, mode, index); ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_malloc_vertex_job(0, buffer, buffer_unk, gpu_id); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ case 7: { ++ uint64_t masked = value & ~0x000100000071; ++ bool tem = value & 1; ++ bool unk = (value >> 32) & 1; ++ ++ const char *order = (const char *[]){ ++ "z_order", ++ "horizontal", ++ "vertical", ++ "invalid_3", ++ "invalid_4", ++ "reverse_horizontal", ++ "reverse_vertical", ++ "invalid_7", ++ }[(value >> 4) & 7]; ++ ++ if (addr || masked) { ++ pandecode_log("fragment (unk %02x), (unk %"PRIx64")\n\n", ++ addr, value); ++ } else if (value) { ++ pandecode_log("fragment tem %i, render %s, unk %i\n\n", ++ tem, order, unk); ++ } else { ++ pandecode_log("fragment\n\n"); ++ } ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_fragment_job(0, buffer, buffer_unk, gpu_id); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ ++ case 9: { ++ if (addr || l || h > 1) ++ pandecode_log("flush_tiler (unk %02x), (unk %"PRIx64")\n", ++ addr, value); ++ else if (h) ++ pandecode_log("flush_tiler unk\n"); ++ else ++ pandecode_log("flush_tiler\n"); ++ break; ++ } ++ ++ case 16: case 17: { ++ char wid = (op == 16) ? 'w' : 'x'; ++ ++ if (op == 16) { ++ buffer_unk[addr] = buffer[addr] = buffer[arg2] + l; ++ } else { ++ uint64_t r = buffer[arg2] + ((uint64_t)buffer[arg2 + 1] << 32) + l; ++ buffer_unk[addr] = buffer[addr] = r; ++ buffer_unk[addr + 1] = buffer[addr + 1] = r >> 32; ++ } ++ ++ if (arg1) ++ pandecode_log("add %c%02x, (unk %x), %c%02x, #0x%x\n", ++ wid, addr, arg1, wid, arg2, l); ++ else if ((int32_t) l < 0) ++ pandecode_log("add %c%02x, %c%02x, %i\n", ++ wid, addr, wid, arg2, (int32_t) l); ++ else if (l) ++ pandecode_log("add %c%02x, %c%02x, #0x%x\n", ++ wid, addr, wid, arg2, l); ++ else ++ pandecode_log("mov %c%02x, %c%02x\n", ++ wid, addr, wid, arg2); ++ ++ break; ++ } ++ ++ case 20: case 21: { ++ const char *name = (op == 20) ? "ldr" : "str"; ++ ++ /* The immediate offset must be 4-aligned (though if the ++ * address itself is unaligned, the bits will silently be ++ * masked off). ++ * ++ * Up to 16 32-bit registers can be read or written in a ++ * single instruction, behaviour is similar to LDM or STM ++ * except that a base register is specified. ++ * ++ * These instructions are high latency. Use WAIT 0 to wait for ++ * the result of an LDR, or for a STR to finish. ++ * ++ * For LDR, it is an error for the address register to be ++ * included in the destination register set. ++ */ ++ ++ if (arg1) { ++ pandecode_log("%s (unk %02x), x%02x, (mask %x), [x%02x, %i]\n", ++ name, arg1, addr, l >> 16, arg2, (int16_t) l); ++ } else { ++ pandecode_log("%s ", name); ++ pandecode_regmask(addr, l >> 16); ++ pandecode_log_cont(", [x%02x, %i]\n", arg2, (int16_t) l); ++ } ++ break; ++ } ++ ++ case 22: { ++ /* The signed 32-bit source register is compared against zero ++ * for these comparisons. For example, .GT means that the ++ * branch is taken if the signed register value is greater ++ * than zero. 
*/ ++ const char *comparisons[] = { ++ ".le", ".gt", ++ ".eq", ".ne", ++ ".lt", ".ge", ++ "" /* always */, ".(invalid: never)", ++ }; ++ ++ const char *m = comparisons[(l >> 28) & 7]; ++ ++ int16_t offset = l; ++ ++ bool forward = (offset >= 0); ++ if (!forward) ++ offset = -1 - offset; ++ ++ if (addr || arg1 || l & 0x8fff0000) { ++ pandecode_log("b%s (unk %02x), w%02x, (unk %02x), " ++ "(unk 0x%x), %s %i\n", ++ m, addr, arg2, arg1, l & 0x8fff0000, ++ forward ? "skip" : "back", ++ offset); ++ } else { ++ pandecode_log("b%s w%02x, %s %i\n", ++ m, arg2, ++ forward ? "skip" : "back", ++ offset); ++ } ++ ++ break; ++ } ++ ++ case 23: { ++ if (value >> 3 || addr) ++ pandecode_log("slot (unk %02x), (unk %"PRIx64"), " ++ "%i\n", addr, value >> 3, l & 7); ++ else ++ pandecode_log("slot %i\n", l); ++ break; ++ } ++ ++ case 32: case 33: { ++ /* A tail call is similar to a normal call, but reuses the ++ * current stack entry so that execution returns directly to ++ * the parent, rather than pushing a new entry and returning ++ * to the instruction after the call. Using tail calls avoids ++ * the possibility of stack overflow. ++ */ ++ const char *name = (op == 32) ? "call" : "tailcall"; ++ ++ unsigned length = buffer[arg1]; ++ uint64_t target = (((uint64_t)buffer[arg2 + 1]) << 32) | buffer[arg2]; ++ ++ assert(!(length & 7)); ++ unsigned instrs = length / 8; ++ ++ if (addr || l) ++ pandecode_log("%s (unk %02x), w%02x (%i instructions), x%02x (0x%"PRIx64"), (unk %x)\n", ++ name, addr, arg1, instrs, arg2, target, l); ++ else ++ pandecode_log("%s w%02x (%i instructions), x%02x (0x%"PRIx64")\n", ++ name, arg1, instrs, arg2, target); ++ ++ if (!target || !length) ++ break; ++ ++ uint64_t *t = pandecode_fetch_gpu_mem(target, length); ++ pandecode_indent++; ++ pandecode_cs_buffer(t, length, buffer, buffer_unk, gpu_id, ++ target); ++ pandecode_indent--; ++ break; ++ } ++ ++ case 34: { ++ /* idvs implies tiler */ ++ if (l & ~0xf) ++ pandecode_log("resources 0x%x\n", l); ++ else ++ pandecode_log("resources%s%s%s%s\n", ++ (l & 1) ? " compute" : "", ++ (l & 2) ? " fragment" : "", ++ (l & 4) ? " tiler" : "", ++ (l & 8) ? " idvs" : ""); ++ break; ++ } ++ ++ case 37: case 38: case 51: case 52: { ++ /* ++ * 0b 00100101 / 00100110 -- opcode ++ * ????0??? -- unk. usually 1, faults if "0" bit set ++ * aaaaaaaa -- address register ++ * vvvvvvvv -- 32-bit value register ++ * 00000000 -- seems to act as NOP if nonzero ++ * mmmmmmmm -- some sort of mask, unknown purpose ++ * ???????? -- seems to have no effect ++ * ?????s0u -- 's' disables signal to CPU, ++ * 'u' has unknown purpose (disable GPU signal?) ++ * ++ * The difference between the two opcodes is unknown. ++ * ++ * That the 'mmmmmmmm' byte is somehow a scoreboard mask is ++ * a possibility. ++ */ ++ ++ const char *name = (op & 1) ? "evadd" : "evstr"; ++ const char *type = (op > 50) ? "x" : "w"; ++ ++ if (addr != 1 || l & 0xff00fffa) { ++ pandecode_log("%s (unk %02x), %s%02x, [x%02x], " ++ "unk 0x%x, flags 0x%x\n", ++ name, addr, type, arg1, arg2, ++ l >> 16, (uint16_t) l); ++ } else { ++ pandecode_log("%s %s%02x, [x%02x], unk 0x%x%s%s\n", ++ name, type, arg1, arg2, l >> 16, ++ l & 0x4 ? "" : ", irq", ++ l & 0x1 ? ", unk0" : ""); ++ } ++ ++ break; ++ } ++ ++ case 39: case 53: { ++ const char *m = (const char *[]){ ++ ".ls", ++ ".hi", ++ }[(l >> 28) & 1]; ++ const char *e = (const char *[]){ ++ ".inherit", ++ ".no_error", ++ }[l & 1]; ++ const char *type = (op > 50) ? 
"x" : "w"; ++ ++ /* Wait until the value in the destination register is changed ++ * to pass the comparison. For example, with .LS the value ++ * in memory must be less than or same as the reference to ++ * continue execution. */ ++ if (addr || l & ~((1 << 28) | (1 << 0))) ++ pandecode_log("evwait%s%s (unk %02x), %s%02x, " ++ "[x%02x, unk %x]\n", ++ m, e, addr, type, arg1, arg2, l); ++ else ++ pandecode_log("evwait%s%s %s%02x, [x%02x]\n", ++ m, e, type, arg1, arg2); ++ break; ++ } ++ ++ case 40: { ++ if (addr || l >> 16 || arg1 > 1) { ++ pandecode_log("str type %02x, (unk %02x), " ++ "(unk %x), [x%02x, %i]\n", ++ addr, arg1, ++ l >> 16, arg2, (int16_t) l); ++ } else { ++ const char *type = (const char *[]) { ++ "timestamp", ++ "cycles", ++ }[arg1]; ++ ++ pandecode_log("str %s, [x%02x, %i]\n", ++ type, arg2, (int16_t) l); ++ } ++ break; ++ } ++ ++ case 48: { ++ if (addr || arg1 || l) ++ pandecode_log("heapctx (unk %02x), " ++ "x%02x, (unk %02x), (unk %x)\n", ++ addr, arg2, arg1, l); ++ else ++ pandecode_log("heapctx x%02x\n", arg2); ++ break; ++ } ++ ++ case 49: { ++ const char *m = (const char *[]){ ++ "vt_start", ++ "vt_end", ++ "unk", ++ "frag_end", ++ }[arg1 & 3]; ++ ++ if (addr || arg2 || arg1 > 3 || l) ++ pandecode_log("heapinc (unk %02x), " ++ "(unk %02x), %02x, (unk %x)\n", ++ addr, arg2, arg1, l); ++ else ++ pandecode_log("heapinc %s\n", m); ++ break; ++ } ++ ++ default: ++ /* ++ * UNK 00 30, #0x480000000000 -- takes an eight-byte aligned ++ * memory address. ++ */ ++ ++ pandecode_log("UNK %02x %02x, #0x%"PRIx64"\n", addr, op, value); ++ break; ++ } ++} ++ ++// TODO: reorder args ++static void ++pandecode_cs_buffer(uint64_t *commands, unsigned size, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id, mali_ptr va) ++{ ++ uint64_t *end = (uint64_t *)((uint8_t *) commands + size); ++ ++ for (uint64_t c = *commands; commands < end; c = *(++commands)) { ++ pandecode_cs_command(c, va, buffer, buffer_unk, gpu_id); ++ va += 8; ++ } ++} ++ ++// TODO: Does it make sense to pass in the length? 
++void ++GENX(pandecode_cs)(mali_ptr cs_gpu_va, unsigned size, unsigned gpu_id) ++{ ++ pandecode_dump_file_open(); ++ ++ // TODO: Pass down the buffer during recursion ++ uint32_t buffer[256] = {0}; ++ uint32_t buffer_unk[256] = {0}; ++ ++ uint64_t *commands = pandecode_fetch_gpu_mem(cs_gpu_va, 1); ++ ++ pandecode_log("\n"); ++ ++ pandecode_cs_buffer(commands, size, buffer, buffer_unk, gpu_id, ++ cs_gpu_va); ++ ++ fflush(pandecode_dump_stream); ++ pandecode_map_read_write(); ++} ++#endif diff --git a/src/panfrost/lib/genxml/decode.h.rej b/src/panfrost/lib/genxml/decode.h.rej new file mode 100644 index 00000000000..d3673d771d1 --- /dev/null +++ b/src/panfrost/lib/genxml/decode.h.rej @@ -0,0 +1,28 @@ +diff a/src/panfrost/lib/genxml/decode.h b/src/panfrost/lib/genxml/decode.h (rejected hunks) +@@ -50,8 +50,6 @@ struct pandecode_mapped_memory *pandecode_find_mapped_gpu_mem_containing(uint64_ + + void pandecode_map_read_write(void); + +-void pandecode_dump_mappings(void); +- + static inline void * + __pandecode_fetch_gpu_mem(uint64_t gpu_va, size_t size, + int line, const char *filename) +@@ -98,6 +96,8 @@ void pandecode_abort_on_fault_v6(mali_ptr jc_gpu_va); + void pandecode_abort_on_fault_v7(mali_ptr jc_gpu_va); + void pandecode_abort_on_fault_v9(mali_ptr jc_gpu_va); + ++void pandecode_cs_v10(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ + static inline void + pan_hexdump(FILE *fp, const uint8_t *hex, size_t cnt, bool with_strings) + { +@@ -130,7 +130,7 @@ pan_hexdump(FILE *fp, const uint8_t *hex, size_t cnt, bool with_strings) + fprintf(fp, " | "); + for (unsigned j = i & ~0xF; j <= i; ++j) { + uint8_t c = hex[j]; +- fputc((c < 32 || c > 128) ? '.' : c, fp); ++ fputc((c < 32 || c > 126) ? '.' : c, fp); + } + } + diff --git a/src/panfrost/lib/genxml/decode_common.c.rej b/src/panfrost/lib/genxml/decode_common.c.rej new file mode 100644 index 00000000000..127dcd7a4f7 --- /dev/null +++ b/src/panfrost/lib/genxml/decode_common.c.rej @@ -0,0 +1,52 @@ +diff a/src/panfrost/lib/genxml/decode_common.c b/src/panfrost/lib/genxml/decode_common.c (rejected hunks) +@@ -202,7 +202,7 @@ pointer_as_memory_reference(uint64_t ptr) + + static int pandecode_dump_frame_count = 0; + +-static bool force_stderr = false; ++bool force_stderr = false; + + void + pandecode_dump_file_open(void) +@@ -230,7 +230,7 @@ pandecode_dump_file_open(void) + } + } + +-static void ++void + pandecode_dump_file_close(void) + { + simple_mtx_assert_locked(&pandecode_lock); +@@ -289,8 +289,9 @@ pandecode_dump_mappings(void) + if (!it->addr || !it->length) + continue; + +- fprintf(pandecode_dump_stream, "Buffer: %s gpu %" PRIx64 "\n\n", +- it->name, it->gpu_va); ++ fprintf(pandecode_dump_stream, "Buffer: %s gpu %" PRIx64 ++ " length %zu\n\n", ++ it->name, it->gpu_va, it->length); + + pan_hexdump(pandecode_dump_stream, it->addr, it->length, false); + fprintf(pandecode_dump_stream, "\n"); +@@ -333,3 +334,20 @@ pandecode_jc(mali_ptr jc_gpu_va, unsigned gpu_id) + + simple_mtx_unlock(&pandecode_lock); + } ++ ++void pandecode_cs_v10(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ ++void ++pandecode_cs(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id) ++{ ++ simple_mtx_lock(&pandecode_lock); ++ ++ switch (pan_arch(gpu_id)) { ++ // Hack hack hackity hack: gpu_id == 1 means "don't decode" (only ++ // disassemble) ++ case 0: case 10: pandecode_cs_v10(cs_gpu_va, cs_size, gpu_id); break; ++ default: unreachable("Unsupported architecture"); ++ } ++ ++ simple_mtx_unlock(&pandecode_lock); ++} diff --git 
a/src/panfrost/lib/genxml/gen_macros.h.rej b/src/panfrost/lib/genxml/gen_macros.h.rej new file mode 100644 index 00000000000..0b1a6a9070a --- /dev/null +++ b/src/panfrost/lib/genxml/gen_macros.h.rej @@ -0,0 +1,11 @@ +diff a/src/panfrost/lib/genxml/gen_macros.h b/src/panfrost/lib/genxml/gen_macros.h (rejected hunks) +@@ -93,6 +93,9 @@ pan_arch(unsigned gpu_id) + #elif (PAN_ARCH == 9) + # define GENX(X) X##_v9 + # include "genxml/v9_pack.h" ++#elif (PAN_ARCH == 10) ++# define GENX(X) X##_v10 ++# include "genxml/v10_pack.h" + #else + # error "Need to add suffixing macro for this architecture" + #endif diff --git a/src/panfrost/lib/genxml/gen_pack.py b/src/panfrost/lib/genxml/gen_pack.py index 8d1df522ca0..cbcde745cf6 100644 --- a/src/panfrost/lib/genxml/gen_pack.py +++ b/src/panfrost/lib/genxml/gen_pack.py @@ -46,6 +46,18 @@ #include "util/bitpack_helpers.h" +/* Most functions assume the caller has done bounds checking */ +typedef struct pan_command_stream { + uint64_t *ptr; + uint64_t *begin; + uint64_t *end; + uint64_t gpu; +} pan_command_stream; + +struct pan_command_stream_decoded { + uint32_t values[256]; +}; + #define __gen_unpack_float(x, y, z) uif(__gen_unpack_uint(x, y, z)) static inline uint32_t @@ -114,6 +126,20 @@ return (2*odd + 1) << shift; } +static inline void +__gen_clear_value(uint8_t *restrict cl, uint32_t start, uint32_t end) +{ + for (uint32_t byte = start / 8; byte <= end / 8; byte++) { + uint8_t m = 0; + if (byte == start / 8) + m |= 0xff >> (8 - start % 8); + if (byte == end / 8) + m |= 0xff << (1 + end % 8); + + cl[byte] &= m; + } +} + #define PREFIX1(A) MALI_ ## A #define PREFIX2(A, B) MALI_ ## A ## _ ## B #define PREFIX4(A, B, C, D) MALI_ ## A ## _ ## B ## _ ## C ## _ ## D @@ -199,6 +225,96 @@ """ +no_cs = "".join([f""" +#define MALI_{y} MALI_{x} +#define MALI_{y}_header MALI_{x}_header +#define MALI_{y}_pack MALI_{x}_pack +#define MALI_{y}_LENGTH MALI_{x}_LENGTH +#define MALI_{y}_ALIGN MALI_{x}_ALIGN +#define mali_{y.lower()}_packed mali_{x.lower()}_packed +#define MALI_{y}_unpack MALI_{x}_unpack +#define MALI_{y}_print MALI_{x}_print +""" for x, y in (("DRAW", "DRAW_NO_CS"), )]) + """ + +#define pan_pack_cs_v10(dst, _, T, name) pan_pack(dst, T, name) + +#define pan_section_pack_cs_v10(dst, _, A, S, name) pan_section_pack(dst, A, S, name) + +#define pan_unpack_cs_v10(dst, _, __, T, name) pan_unpack(dst, T, name) + +#define pan_section_unpack_cs_v10(src, _, __, A, S, name) pan_section_unpack(src, A, S, name) +""" + +with_cs = """ +#define pan_pack_cs(dst, T, name) \\ + for (struct PREFIX1(T) name = { PREFIX2(T, header) }, \\ + *_loop_terminate = (void *) (dst); \\ + __builtin_expect(_loop_terminate != NULL, 1); \\ + ({ PREFIX2(T, pack_cs)(dst, &name); \\ + _loop_terminate = NULL; })) + +#define pan_section_pack_cs(dst, A, S, name) \\ + for (PREFIX4(A, SECTION, S, TYPE) name = { PREFIX4(A, SECTION, S, header) }, \\ + *_loop_terminate = (void *) (dst); \\ + __builtin_expect(_loop_terminate != NULL, 1); \\ + ({ PREFIX4(A, SECTION, S, pack_cs) (dst, &name); \\ + _loop_terminate = NULL; })) + +#define pan_section_pack_cs_v10(_, dst, A, S, name) pan_section_pack_cs(dst, A, S, name) + +// TODO: assert that the first argument is NULL +#define pan_pack_cs_v10(_, dst, T, name) pan_pack_cs(dst, T, name) + +#define pan_pack_ins(dst, T, name) \\ + for (struct PREFIX1(T) name = { PREFIX2(T, header) }, \\ + *_loop_terminate = (void *) (dst); \\ + __builtin_expect(_loop_terminate != NULL, 1); \\ + ({ PREFIX2(T, pack_ins)(dst, &name); \\ + _loop_terminate = NULL; })) + +#define 
pan_unpack_cs(buf, buf_unk, T, name) \\ + struct PREFIX1(T) name; \\ + PREFIX2(T, unpack)(buf, buf_unk, &name) + +#define pan_unpack_cs_v10(_, buf, buf_unk, T, name) pan_unpack_cs(buf, buf_unk, T, name) + +#define pan_section_unpack_cs_v10(_, buf, buf_unk, A, S, name) \\ + PREFIX4(A, SECTION, S, TYPE) name; \\ + PREFIX4(A, SECTION, S, unpack)(buf, buf_unk, &name) + +static inline void +pan_emit_cs_ins(pan_command_stream *s, uint8_t op, uint64_t instr) +{ + assert(instr < (1ULL << 56)); + instr |= ((uint64_t)op << 56); + *((s->ptr)++) = instr; +} + +static inline void +pan_emit_cs_32(pan_command_stream *s, uint8_t reg, uint32_t value) +{ + pan_emit_cs_ins(s, 2, ((uint64_t) reg << 48) | value); +} + +static inline void +pan_emit_cs_48(pan_command_stream *s, uint8_t reg, uint64_t value) +{ + assert(value < (1ULL << 48)); + pan_emit_cs_ins(s, 1, ((uint64_t) reg << 48) | value); +} + +static inline void +pan_emit_cs_64(pan_command_stream *s, uint8_t reg, uint64_t value) +{ + if (value < (1ULL << 48)) { + pan_emit_cs_48(s, reg, value); + } else { + pan_emit_cs_32(s, reg, value); + pan_emit_cs_32(s, reg + 1, value >> 32); + } +} +""" + def to_alphanum(name): substitutions = { ' ': '_', @@ -313,7 +429,7 @@ def __init__(self, parser, attrs): if ":" in str(attrs["start"]): (word, bit) = attrs["start"].split(":") - self.start = (int(word) * 32) + int(bit) + self.start = (int(word, 0) * 32) + int(bit) else: self.start = int(attrs["start"]) @@ -347,7 +463,8 @@ def emit_template_struct(self, dim): type = 'uint64_t' elif self.type == 'int': type = 'int32_t' - elif self.type in ['uint', 'hex', 'uint/float', 'padded', 'Pixel Format']: + # TODO: Convert to tuple + elif self.type in ['uint', 'hex', 'register', 'uint/float', 'padded', 'Pixel Format']: type = 'uint32_t' elif self.type in self.parser.structs: type = 'struct ' + self.parser.gen_prefix(safe_name(self.type.upper())) @@ -401,8 +518,8 @@ def emit_template_struct(self, dim): field.emit_template_struct(dim) class Word: - def __init__(self): - self.size = 32 + def __init__(self, size=32): + self.size = size self.contributors = [] class FieldRef: @@ -426,7 +543,7 @@ def collect_fields(self, fields, offset, path, all_fields): end = offset + field.end all_fields.append(self.FieldRef(field, field_path, start, end)) - def collect_words(self, fields, offset, path, words): + def collect_words(self, fields, offset, path, words, ins=False): for field in fields: field_path = '{}{}'.format(path, field.name) start = offset + field.start @@ -440,16 +557,27 @@ def collect_words(self, fields, offset, path, words): contributor = self.FieldRef(field, field_path, start, end) first_word = contributor.start // 32 last_word = contributor.end // 32 + if ins: + assert(last_word < 2) + first_word = last_word = 0 + for b in range(first_word, last_word + 1): if not b in words: - words[b] = self.Word() + words[b] = self.Word(size=64 if ins else 32) + words[b].contributors.append(contributor) - def emit_pack_function(self): - self.get_length() + return + + def emit_pack_function(self, csf=False, ins=False): + if csf: + self.length = 256 * 4 + else: + self.get_length() + assert(not ins) words = {} - self.collect_words(self.fields, 0, '', words) + self.collect_words(self.fields, 0, '', words, ins=ins) # Validate the modifier is lossless for field in self.fields: @@ -465,25 +593,52 @@ def emit_pack_function(self): elif field.modifier[0] == "log2": print(" assert(util_is_power_of_two_nonzero(values->{}));".format(field.name)) - for index in range(self.length // 4): + if ins: + 
index_list = (0, ) + elif csf: + index_list = sorted(words) + else: + index_list = range(self.length // 4) + + for index in index_list: # Handle MBZ words if not index in words: - print(" cl[%2d] = 0;" % index) + if ins: + print(" pan_emit_cs_ins(s, 0x%02x, 0);" % self.op) + elif not csf: + print(" cl[%2d] = 0;" % index) continue word = words[index] word_start = index * 32 + size = 32 + # Can we move all fields from the next index here? + if csf and index % 2 == 0 and index + 1 in words: + word_next = words[index + 1] + end = max(c.end for c in word_next.contributors) + if end - word_start < 48: + size = 48 + word.contributors += [x for x in word_next.contributors if not x in word.contributors] + del words[index + 1] + v = None - prefix = " cl[%2d] =" % index + if ins: + prefix = " pan_emit_cs_ins(s, 0x%02x," % self.op + elif size == 48: + prefix = " pan_emit_cs_48(s, 0x%02x," % index + elif csf: + prefix = " pan_emit_cs_32(s, 0x%02x," % index + else: + prefix = " cl[%2d] = (" % index for contributor in word.contributors: field = contributor.field name = field.name start = contributor.start end = contributor.end - contrib_word_start = (start // 32) * 32 + contrib_word_start = (start // word.size) * word.size start -= contrib_word_start end -= contrib_word_start @@ -498,7 +653,7 @@ def emit_pack_function(self): elif field.modifier[0] == "log2": value = "util_logbase2({})".format(value) - if field.type in ["uint", "hex", "uint/float", "address", "Pixel Format"]: + if field.type in ["uint", "hex", "uint/float", "address", "register", "Pixel Format"]: s = "util_bitpack_uint(%s, %d, %d)" % \ (value, start, end) elif field.type == "padded": @@ -529,11 +684,13 @@ def emit_pack_function(self): if not s == None: shift = word_start - contrib_word_start - if shift: + if shift > 0: s = "%s >> %d" % (s, shift) + elif shift < 0: + s = "%s << %d" % (s, -shift) if contributor == word.contributors[-1]: - print("%s %s;" % (prefix, s)) + print("%s %s);" % (prefix, s)) else: print("%s %s |" % (prefix, s)) prefix = " " @@ -552,22 +709,23 @@ def mask_for_word(self, index, start, end): count = (end - start + 1) return (((1 << count) - 1) << start) - def emit_unpack_function(self): + def emit_unpack_function(self, csf=False): # First, verify there is no garbage in unused bits words = {} self.collect_words(self.fields, 0, '', words) - for index in range(self.length // 4): - base = index * 32 - word = words.get(index, self.Word()) - masks = [self.mask_for_word(index, c.start, c.end) for c in word.contributors] - mask = reduce(lambda x,y: x | y, masks, 0) + if not csf: + for index in range(self.length // 4): + base = index * 32 + word = words.get(index, self.Word()) + masks = [self.mask_for_word(index, c.start, c.end) for c in word.contributors] + mask = reduce(lambda x,y: x | y, masks, 0) - ALL_ONES = 0xffffffff + ALL_ONES = 0xffffffff - if mask != ALL_ONES: - TMPL = ' if (((const uint32_t *) cl)[{}] & {}) fprintf(stderr, "XXX: Invalid field of {} unpacked at word {}\\n");' - print(TMPL.format(index, hex(mask ^ ALL_ONES), self.label, index)) + if mask != ALL_ONES: + TMPL = ' if (((const uint32_t *) cl)[{}] & {}) fprintf(stderr, "XXX: Invalid field of {} unpacked at word {}\\n");' + print(TMPL.format(index, hex(mask ^ ALL_ONES), self.label, index)) fieldrefs = [] self.collect_fields(self.fields, 0, '', fieldrefs) @@ -580,7 +738,7 @@ def emit_unpack_function(self): args.append(str(fieldref.start)) args.append(str(fieldref.end)) - if field.type in set(["uint", "hex", "uint/float", "address", "Pixel Format"]): + if 
field.type in set(["uint", "hex", "uint/float", "address", "register", "Pixel Format"]): convert = "__gen_unpack_uint" elif field.type in self.parser.enums: convert = "(enum %s)__gen_unpack_uint" % enum_name(field.type) @@ -616,6 +774,9 @@ def emit_unpack_function(self): mask = hex(field.modifier[1] - 1) print(' assert(!(values->{} & {}));'.format(fieldref.path, mask)) + if csf: + print(' __gen_clear_value({});'.format(', '.join(['cl_unk'] + args[1:]))) + def emit_print_function(self): for field in self.fields: convert = None @@ -638,7 +799,7 @@ def emit_print_function(self): print(' fprintf(fp, "%*s{}: %f\\n", indent, "", {});'.format(name, val)) elif field.type in ["uint", "hex"] and (field.end - field.start) >= 32: print(' fprintf(fp, "%*s{}: 0x%" PRIx64 "\\n", indent, "", {});'.format(name, val)) - elif field.type == "hex": + elif field.type in ("hex", "register"): print(' fprintf(fp, "%*s{}: 0x%x\\n", indent, "", {});'.format(name, val)) elif field.type == "uint/float": print(' fprintf(fp, "%*s{}: 0x%X (%f)\\n", indent, "", {}, uif({}));'.format(name, val, val)) @@ -677,9 +838,13 @@ def start_element(self, name, attrs): print(v6_format_printer) else: print(v7_format_printer) + if arch < 10: + print(no_cs) + else: + print(with_cs) elif name == "struct": name = attrs["name"] - self.no_direct_packing = attrs.get("no-direct-packing", False) + self.layout = attrs.get("layout", "struct") object_name = self.gen_prefix(safe_name(name.upper())) self.struct = object_name @@ -687,10 +852,16 @@ def start_element(self, name, attrs): if "size" in attrs: self.group.length = int(attrs["size"]) * 4 self.group.align = int(attrs["align"]) if "align" in attrs else None + self.group.op = int(attrs["op"]) if "op" in attrs else None self.structs[attrs["name"]] = self.group + self.unpacked_alias = self.gen_prefix(safe_name(attrs["unpacked"].upper())) if "unpacked" in attrs else None elif name == "field": - self.group.fields.append(Field(self, attrs)) self.values = [] + self.skip_field = self.layout == "cs" and not attrs["start"].startswith("0x") + if self.skip_field: + #print(f"#warning Skipping non-CS field {attrs['name']}") + return + self.group.fields.append(Field(self, attrs)) elif name == "enum": self.values = [] self.enum = safe_name(attrs["name"]) @@ -703,6 +874,8 @@ def start_element(self, name, attrs): self.values.append(Value(attrs)) elif name == "aggregate": aggregate_name = self.gen_prefix(safe_name(attrs["name"].upper())) + # TODO: Make .layout less "global"? 
+ self.layout = attrs.get("layout", "struct") self.aggregate = Aggregate(self, aggregate_name, attrs) self.aggregates[attrs['name']] = self.aggregate elif name == "section": @@ -715,7 +888,8 @@ def end_element(self, name): self.struct = None self.group = None elif name == "field": - self.group.fields[-1].values = self.values + if not self.skip_field: + self.group.fields[-1].values = self.values elif name == "enum": self.emit_enum() self.enum = None @@ -745,22 +919,33 @@ def emit_header(self, name): print('') def emit_template_struct(self, name, group): - print("struct %s {" % name) - group.emit_template_struct("") - print("};\n") + if self.unpacked_alias: + # TODO: Check the fields match + print("#define %s %s" % (name, self.unpacked_alias)) + else: + print("struct %s {" % name) + group.emit_template_struct("") + print("};\n") def emit_aggregate(self): aggregate = self.aggregate - print("struct %s_packed {" % aggregate.name.lower()) - print(" uint32_t opaque[{}];".format(aggregate.get_size() // 4)) - print("};\n") - print('#define {}_LENGTH {}'.format(aggregate.name.upper(), aggregate.size)) + + if self.layout == "struct": + print("struct %s_packed {" % aggregate.name.lower()) + print(" uint32_t opaque[{}];".format(aggregate.get_size() // 4)) + print("};\n") + print('#define {}_LENGTH {}'.format(aggregate.name.upper(), aggregate.size)) + else: + assert(self.layout == "cs") + if aggregate.align != None: print('#define {}_ALIGN {}'.format(aggregate.name.upper(), aggregate.align)) for section in aggregate.sections: print('#define {}_SECTION_{}_TYPE struct {}'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) print('#define {}_SECTION_{}_header {}_header'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) print('#define {}_SECTION_{}_pack {}_pack'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + # TODO: Only when req'd + print('#define {}_SECTION_{}_pack_cs {}_pack_cs'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) print('#define {}_SECTION_{}_unpack {}_unpack'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) print('#define {}_SECTION_{}_print {}_print'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) print('#define {}_SECTION_{}_OFFSET {}'.format(aggregate.name.upper(), section.name.upper(), section.offset)) @@ -775,12 +960,32 @@ def emit_pack_function(self, name, group): print("}\n\n") # Should be a whole number of words - assert((self.group.length % 4) == 0) + assert((group.length % 4) == 0) + + print('#define {} {}'.format (name + "_LENGTH", group.length)) + if group.align != None: + print('#define {} {}'.format (name + "_ALIGN", group.align)) + print('struct {}_packed {{ uint32_t opaque[{}]; }};'.format(name.lower(), group.length // 4)) + + def emit_cs_pack_function(self, name, group): + print("static inline void\n%s_pack_cs(pan_command_stream * restrict s,\n%sconst struct %s * restrict values)\n{\n" % + (name, ' ' * (len(name) + 6), name)) + + group.emit_pack_function(csf=True) - print('#define {} {}'.format (name + "_LENGTH", self.group.length)) - if self.group.align != None: - print('#define {} {}'.format (name + "_ALIGN", self.group.align)) - print('struct {}_packed {{ uint32_t opaque[{}]; }};'.format(name.lower(), self.group.length // 4)) + print("}\n\n") + + assert(group.length == 256 * 4) + + def emit_ins_pack_function(self, name, group): + print("static inline void\n%s_pack_ins(pan_command_stream * restrict s,\n%sconst struct %s * 
restrict values)\n{" % + (name, ' ' * (len(name) + 6), name)) + + group.emit_pack_function(csf=True, ins=True) + + print("}\n\n") + + assert(group.length == 256 * 4) def emit_unpack_function(self, name, group): print("static inline void") @@ -791,6 +996,18 @@ def emit_unpack_function(self, name, group): print("}\n") + def emit_cs_unpack_function(self, name, group): + print("static inline void") + print("%s_unpack(const uint32_t * restrict buffer, uint32_t * restrict buffer_unk,\n" + "%sstruct %s * restrict values)\n{" + " const uint8_t *cl = (uint8_t *)buffer;\n" + " uint8_t *cl_unk = (uint8_t *)buffer_unk;\n" % + (name.upper(), ' ' * (len(name) + 8), name)) + + group.emit_unpack_function(csf=True) + + print("}\n") + def emit_print_function(self, name, group): print("static inline void") print("{}_print(FILE *fp, const struct {} * values, unsigned indent)\n{{".format(name.upper(), name)) @@ -804,14 +1021,20 @@ def emit_struct(self): self.emit_template_struct(self.struct, self.group) self.emit_header(name) - if self.no_direct_packing == False: + if self.layout == "struct": self.emit_pack_function(self.struct, self.group) self.emit_unpack_function(self.struct, self.group) + elif self.layout == "cs": + self.emit_cs_pack_function(self.struct, self.group) + self.emit_cs_unpack_function(self.struct, self.group) + elif self.layout == "ins": + # TODO: I don't think that the current unpack emit functions would + # work + self.emit_ins_pack_function(self.struct, self.group) + else: + assert(self.layout == "none") self.emit_print_function(self.struct, self.group) - def enum_prefix(self, name): - return - def emit_enum(self): e_name = enum_name(self.enum) prefix = e_name if self.enum != 'Format' else global_prefix diff --git a/src/panfrost/lib/genxml/meson.build.rej b/src/panfrost/lib/genxml/meson.build.rej new file mode 100644 index 00000000000..75405947ded --- /dev/null +++ b/src/panfrost/lib/genxml/meson.build.rej @@ -0,0 +1,19 @@ +diff a/src/panfrost/lib/genxml/meson.build b/src/panfrost/lib/genxml/meson.build (rejected hunks) +@@ -20,7 +20,7 @@ + # SOFTWARE. 
+ + pan_packers = [] +-foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9'] ++foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10'] + pan_packers += custom_target( + packer + '_pack.h', + input : ['gen_pack.py', packer + '.xml'], +@@ -37,7 +37,7 @@ idep_pan_packers = declare_dependency( + + libpanfrost_decode_per_arch = [] + +-foreach ver : ['4', '5', '6', '7', '9'] ++foreach ver : ['4', '5', '6', '7', '9', '10'] + libpanfrost_decode_per_arch += static_library( + 'pandecode-arch-v' + ver, + ['decode.c', pan_packers], diff --git a/src/panfrost/lib/genxml/v4.xml b/src/panfrost/lib/genxml/v4.xml index 63b7f7f57ac..4f8dd3f2e13 100644 --- a/src/panfrost/lib/genxml/v4.xml +++ b/src/panfrost/lib/genxml/v4.xml @@ -446,7 +446,7 @@ - + diff --git a/src/panfrost/lib/genxml/v5.xml b/src/panfrost/lib/genxml/v5.xml index 6c53dac00e6..eacd75501f5 100644 --- a/src/panfrost/lib/genxml/v5.xml +++ b/src/panfrost/lib/genxml/v5.xml @@ -467,7 +467,7 @@ - + diff --git a/src/panfrost/lib/genxml/v6.xml b/src/panfrost/lib/genxml/v6.xml index 9d042c4db93..701c204d04b 100644 --- a/src/panfrost/lib/genxml/v6.xml +++ b/src/panfrost/lib/genxml/v6.xml @@ -467,7 +467,7 @@ - + @@ -689,7 +689,7 @@ - + @@ -708,7 +708,7 @@ - + @@ -717,7 +717,7 @@ - + diff --git a/src/panfrost/lib/genxml/v7.xml b/src/panfrost/lib/genxml/v7.xml index 7e0b794ec85..ec0bad1f0c3 100644 --- a/src/panfrost/lib/genxml/v7.xml +++ b/src/panfrost/lib/genxml/v7.xml @@ -512,7 +512,7 @@ - + @@ -762,7 +762,7 @@ - + @@ -781,7 +781,7 @@ - + @@ -790,7 +790,7 @@ - + @@ -854,13 +854,13 @@ - + - + diff --git a/src/panfrost/lib/genxml/v9.xml b/src/panfrost/lib/genxml/v9.xml index c08d49e2025..0818e5128a6 100644 --- a/src/panfrost/lib/genxml/v9.xml +++ b/src/panfrost/lib/genxml/v9.xml @@ -526,7 +526,7 @@ - + @@ -1322,28 +1322,28 @@ - - - - - - + + + + + + - - - - - - - - - - - - - + + + + + + + + + + + + + @@ -1353,9 +1353,9 @@ - - - + + + @@ -1376,8 +1376,8 @@ - - + + @@ -1387,6 +1387,7 @@ + @@ -1404,7 +1405,7 @@ - + @@ -1420,24 +1421,24 @@ - - - - - - - + + + + + + + - + - + - - + + diff --git a/src/panfrost/lib/genxml/v9.xml.rej b/src/panfrost/lib/genxml/v9.xml.rej new file mode 100644 index 00000000000..2594d849f1d --- /dev/null +++ b/src/panfrost/lib/genxml/v9.xml.rej @@ -0,0 +1,28 @@ +diff a/src/panfrost/lib/genxml/v9.xml b/src/panfrost/lib/genxml/v9.xml (rejected hunks) +@@ -599,12 +599,6 @@ + + + +- +- +- +- +- +- + + + +@@ -612,10 +606,10 @@ + + + +- +- ++ ++ + +- ++ + + + diff --git a/src/panfrost/lib/meson.build b/src/panfrost/lib/meson.build index 12b927a2973..344d5299deb 100644 --- a/src/panfrost/lib/meson.build +++ b/src/panfrost/lib/meson.build @@ -40,7 +40,7 @@ endforeach libpanfrost_per_arch = [] -foreach ver : ['4', '5', '6', '7', '9'] +foreach ver : ['4', '5', '6', '7', '9', '10'] libpanfrost_per_arch += static_library( 'pan-arch-v' + ver, [ diff --git a/src/panfrost/lib/meson.build.rej b/src/panfrost/lib/meson.build.rej new file mode 100644 index 00000000000..775ed402e1d --- /dev/null +++ b/src/panfrost/lib/meson.build.rej @@ -0,0 +1,10 @@ +diff a/src/panfrost/lib/meson.build b/src/panfrost/lib/meson.build (rejected hunks) +@@ -93,7 +93,7 @@ libpanfrost_lib = static_library( + include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw], + c_args : [no_override_init_args], + gnu_symbol_visibility : 'hidden', +- dependencies: [dep_libdrm, idep_nir], ++ dependencies: [dep_libdrm, idep_nir, libpanfrost_base_dep], + build_by_default : false, + link_with: [libpanfrost_pixel_format, 
libpanfrost_per_arch], + ) diff --git a/src/panfrost/lib/pan_afbc.c.rej b/src/panfrost/lib/pan_afbc.c.rej new file mode 100644 index 00000000000..92be882a371 --- /dev/null +++ b/src/panfrost/lib/pan_afbc.c.rej @@ -0,0 +1,25 @@ +diff a/src/panfrost/lib/pan_afbc.c b/src/panfrost/lib/pan_afbc.c (rejected hunks) +@@ -125,10 +125,6 @@ panfrost_afbc_format(unsigned arch, enum pipe_format format) + */ + format = util_format_linear(format); + +- /* Don't allow swizzled formats on v7+ */ +- if (arch >= 7 && format != unswizzled_format(format)) +- return PIPE_FORMAT_NONE; +- + /* Otherwise swizzling doesn't affect AFBC */ + format = unswizzled_format(format); + +@@ -189,3 +185,12 @@ panfrost_afbc_can_tile(const struct panfrost_device *dev) + { + return (dev->arch >= 7); + } ++ ++/* ++ * Can this format only be used with AFBC_FORMAT_MOD_NATIVE_SWIZZLE? ++ */ ++bool ++panfrost_afbc_only_native(unsigned arch, enum pipe_format format) ++{ ++ return (arch >= 7 && format != unswizzled_format(format)); ++} diff --git a/src/panfrost/lib/pan_blend.c.rej b/src/panfrost/lib/pan_blend.c.rej new file mode 100644 index 00000000000..4c4b12a9cc8 --- /dev/null +++ b/src/panfrost/lib/pan_blend.c.rej @@ -0,0 +1,10 @@ +diff a/src/panfrost/lib/pan_blend.c b/src/panfrost/lib/pan_blend.c (rejected hunks) +@@ -800,7 +800,7 @@ GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev, + }; + + /* Blend shaders should only be used for blending on Bifrost onwards */ +- assert(dev->arch <= 5 || !pan_blend_is_opaque(state->rts[rt].equation)); ++ assert(dev->arch <= 5 || state->logicop_enable || !pan_blend_is_opaque(state->rts[rt].equation)); + assert(state->rts[rt].equation.color_mask != 0); + + struct hash_entry *he = _mesa_hash_table_search(dev->blend_shaders.shaders, &key); diff --git a/src/panfrost/lib/pan_blitter.c.rej b/src/panfrost/lib/pan_blitter.c.rej new file mode 100644 index 00000000000..ea98fc10c8a --- /dev/null +++ b/src/panfrost/lib/pan_blitter.c.rej @@ -0,0 +1,28 @@ +diff a/src/panfrost/lib/pan_blitter.c b/src/panfrost/lib/pan_blitter.c (rejected hunks) +@@ -1150,7 +1150,7 @@ pan_preload_emit_dcd(struct pan_pool *pool, + blend.cpu); + } + +- pan_pack(out, DRAW, cfg) { ++ pan_pack(out, DRAW_NO_CS, cfg) { + if (zs) { + /* ZS_EMIT requires late update/kill */ + cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; +@@ -1225,7 +1225,7 @@ pan_preload_fb_alloc_pre_post_dcds(struct pan_pool *desc_pool, + return; + + fb->bifrost.pre_post.dcds = +- pan_pool_alloc_desc_array(desc_pool, 3, DRAW); ++ pan_pool_alloc_desc_array(desc_pool, 3, DRAW_NO_CS); + } + + static void +@@ -1237,7 +1237,7 @@ pan_preload_emit_pre_frame_dcd(struct pan_pool *desc_pool, + pan_preload_fb_alloc_pre_post_dcds(desc_pool, fb); + assert(fb->bifrost.pre_post.dcds.cpu); + void *dcd = fb->bifrost.pre_post.dcds.cpu + +- (dcd_idx * pan_size(DRAW)); ++ (dcd_idx * pan_size(DRAW_NO_CS)); + + /* We only use crc_rt to determine whether to force writes for updating + * the CRCs, so use a conservative tile size (16x16). diff --git a/src/panfrost/lib/pan_bo.c b/src/panfrost/lib/pan_bo.c index 77d59a2719a..eebc2266b52 100644 --- a/src/panfrost/lib/pan_bo.c +++ b/src/panfrost/lib/pan_bo.c @@ -38,6 +38,7 @@ #include "util/u_inlines.h" #include "util/u_math.h" +#include "util/os_file.h" /* This file implements a userspace BO cache. 
Allocating and freeing * GPU-visible buffers is very expensive, and even the extra kernel roundtrips diff --git a/src/panfrost/lib/pan_bo.c.rej b/src/panfrost/lib/pan_bo.c.rej new file mode 100644 index 00000000000..ca2610eef42 --- /dev/null +++ b/src/panfrost/lib/pan_bo.c.rej @@ -0,0 +1,584 @@ +diff a/src/panfrost/lib/pan_bo.c b/src/panfrost/lib/pan_bo.c (rejected hunks) +@@ -71,7 +72,38 @@ panfrost_bo_alloc(struct panfrost_device *dev, size_t size, + create_bo.flags |= PANFROST_BO_NOEXEC; + } + +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); ++ void *cpu = NULL; ++ ++ bool cached = false; ++ ++ if (dev->kbase) { ++ if (flags & PAN_BO_CACHEABLE) { ++ if (!(dev->debug & PAN_DBG_UNCACHED_CPU)) { ++ create_bo.flags |= MALI_BO_CACHED_CPU; ++ /* TODO: What if kbase decides not to cache it? */ ++ cached = true; ++ } ++ if (dev->debug & PAN_DBG_UNCACHED_GPU) ++ create_bo.flags |= MALI_BO_UNCACHED_GPU; ++ } ++ ++ unsigned mali_flags = (flags & PAN_BO_EVENT) ? 0x8200f : 0; ++ ++ struct base_ptr p = dev->mali.alloc(&dev->mali, size, create_bo.flags, mali_flags); ++ ++ if (p.gpu) { ++ cpu = p.cpu; ++ create_bo.offset = p.gpu; ++ create_bo.handle = kbase_alloc_gem_handle(&dev->mali, p.gpu, -1); ++ if (!cpu) ++ abort(); ++ ret = 0; ++ } else { ++ ret = -1; ++ } ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); ++ } + if (ret) { + fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n"); + return NULL; +@@ -82,29 +114,99 @@ panfrost_bo_alloc(struct panfrost_device *dev, size_t size, + + bo->size = create_bo.size; + bo->ptr.gpu = create_bo.offset; ++ bo->ptr.cpu = cpu; ++ if ((uintptr_t) bo->ptr.cpu != bo->ptr.gpu) ++ bo->free_ioctl = true; + bo->gem_handle = create_bo.handle; + bo->flags = flags; + bo->dev = dev; + bo->label = label; ++ bo->cached = cached; ++ bo->dmabuf_fd = -1; + return bo; + } + + static void + panfrost_bo_free(struct panfrost_bo *bo) + { ++ struct panfrost_device *dev = bo->dev; + struct drm_gem_close gem_close = { .handle = bo->gem_handle }; + int ret; + +- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li memfree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ if (dev->kbase) { ++ os_munmap(bo->ptr.cpu, bo->size); ++ if (bo->munmap_ptr) ++ os_munmap(bo->munmap_ptr, bo->size); ++ if (bo->free_ioctl) ++ dev->mali.free(&dev->mali, bo->ptr.gpu); ++ kbase_free_gem_handle(&dev->mali, bo->gem_handle); ++ ret = 0; ++ } else { ++ ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); ++ } + if (ret) { + fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n"); + assert(0); + } + +- /* BO will be freed with the sparse array, but zero to indicate free */ ++ /* BO will be freed with the stable_array, but zero to indicate free */ + memset(bo, 0, sizeof(*bo)); + } + ++static bool ++panfrost_bo_usage_finished(struct panfrost_bo *bo, bool readers) ++{ ++ struct panfrost_device *dev = bo->dev; ++ kbase k = &dev->mali; ++ ++ bool ret = true; ++ ++ pthread_mutex_lock(&dev->bo_usage_lock); ++ pthread_mutex_lock(&dev->mali.queue_lock); ++ ++ util_dynarray_foreach(&bo->usage, struct panfrost_usage, u) { ++ /* Skip if we are only waiting for writers */ ++ if (!u->write && 
!readers) ++ continue; ++ ++ /* Usages are ordered, so everything else is also invalid */ ++ if (u->queue >= k->event_slot_usage) ++ break; ++ ++ struct kbase_event_slot *slot = &k->event_slots[u->queue]; ++ uint64_t seqnum = u->seqnum; ++ ++ /* There is a race condition, where we can depend on an ++ * unsubmitted batch. In that cade, decrease the seqnum. ++ * Otherwise, skip invalid dependencies. TODO: do GC? */ ++ if (slot->last_submit == seqnum) ++ --seqnum; ++ else if (slot->last_submit < seqnum) ++ continue; ++ ++ if (slot->last <= seqnum) { ++ ret = false; ++ break; ++ } ++ } ++ ++ pthread_mutex_unlock(&dev->mali.queue_lock); ++ pthread_mutex_unlock(&dev->bo_usage_lock); ++ ++ return ret; ++} ++ + /* Returns true if the BO is ready, false otherwise. + * access_type is encoding the type of access one wants to ensure is done. + * Waiting is always done for writers, but if wait_readers is set then readers +@@ -113,12 +215,15 @@ panfrost_bo_free(struct panfrost_bo *bo) + bool + panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + { ++ struct panfrost_device *dev = bo->dev; + struct drm_panfrost_wait_bo req = { + .handle = bo->gem_handle, + .timeout_ns = timeout_ns, + }; + int ret; + ++ /* TODO: With driver-handled sync, is gpu_access even worth it? */ ++ + /* If the BO has been exported or imported we can't rely on the cached + * state, we need to call the WAIT_BO ioctl. + */ +@@ -134,10 +239,31 @@ panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + return true; + } + ++ if (dev->kbase && (dev->arch >= 10)) { ++ struct kbase_wait_ctx wait = kbase_wait_init(&dev->mali, timeout_ns); ++ while (kbase_wait_for_event(&wait)) { ++ if (panfrost_bo_usage_finished(bo, wait_readers)) ++ break; ++ } ++ kbase_wait_fini(wait); ++ ++ bool ret = panfrost_bo_usage_finished(bo, wait_readers); ++ if (bo->flags & PAN_BO_SHARED) ++ ret &= kbase_poll_fd_until(bo->dmabuf_fd, wait_readers, wait.until); ++ ++ if (ret) ++ bo->gpu_access &= (wait_readers ? 0 : PAN_BO_ACCESS_READ); ++ return ret; ++ } ++ + /* The ioctl returns >= 0 value when the BO we are waiting for is ready + * -1 otherwise. + */ +- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); ++ if (dev->kbase) ++ ret = kbase_wait_bo(&dev->mali, bo->gem_handle, timeout_ns, ++ wait_readers); ++ else ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); + if (ret != -1) { + /* Set gpu_access to 0 so that the next call to bo_wait() + * doesn't have to call the WAIT_BO ioctl. +@@ -153,6 +279,32 @@ panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + return false; + } + ++static void ++panfrost_bo_mem_op(struct panfrost_bo *bo, size_t offset, size_t length, bool invalidate) ++{ ++ struct panfrost_device *dev = bo->dev; ++ ++ assert(offset + length <= bo->size); ++ ++ if (!bo->cached) ++ return; ++ ++ dev->mali.mem_sync(&dev->mali, bo->ptr.gpu, bo->ptr.cpu + offset, length, ++ invalidate); ++} ++ ++void ++panfrost_bo_mem_invalidate(struct panfrost_bo *bo, size_t offset, size_t length) ++{ ++ panfrost_bo_mem_op(bo, offset, length, true); ++} ++ ++void ++panfrost_bo_mem_clean(struct panfrost_bo *bo, size_t offset, size_t length) ++{ ++ panfrost_bo_mem_op(bo, offset, length, false); ++} ++ + /* Helper to calculate the bucket index of a BO */ + + static unsigned +@@ -200,21 +352,31 @@ panfrost_bo_cache_fetch(struct panfrost_device *dev, + + /* If the oldest BO in the cache is busy, likely so is + * everything newer, so bail. 
*/ +- if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, +- PAN_BO_ACCESS_RW)) +- break; ++ ++ /* For kbase, BOs are not added to the cache until the GPU is ++ * done with them, so there is no need to wait. */ ++ if (!dev->kbase) { ++ if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, ++ PAN_BO_ACCESS_RW)) ++ break; ++ } + + struct drm_panfrost_madvise madv = { + .handle = entry->gem_handle, + .madv = PANFROST_MADV_WILLNEED, + }; +- int ret; ++ int ret = 0; + + /* This one works, splice it out of the cache */ + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ if (dev->kbase) { ++ /* With kbase, BOs are never freed from the cache */ ++ madv.retained = true; ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ } + if (!ret && !madv.retained) { + panfrost_bo_free(entry); + continue; +@@ -276,7 +438,10 @@ panfrost_bo_cache_put(struct panfrost_bo *bo) + madv.madv = PANFROST_MADV_DONTNEED; + madv.retained = 0; + +- drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ // TODO: Allow freeing madvise'd BOs with kbase... not that it really ++ // matters for boards with 16 GB RAM ++ if (!dev->kbase) ++ drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); + + /* Add us to the bucket */ + list_addtail(&bo->bucket_link, bucket); +@@ -286,6 +451,10 @@ panfrost_bo_cache_put(struct panfrost_bo *bo) + clock_gettime(CLOCK_MONOTONIC, &time); + bo->last_used = time.tv_sec; + ++ /* For kbase, the GPU can't be accessing this BO any more */ ++ if (dev->kbase) ++ bo->gpu_access = 0; ++ + /* Let's do some cleanup in the BO cache while we hold the + * lock. + */ +@@ -352,10 +521,15 @@ panfrost_bo_mmap(struct panfrost_bo *bo) + static void + panfrost_bo_munmap(struct panfrost_bo *bo) + { ++ /* We can't munmap BOs when using kbase, as that frees the storage and ++ * the GPU might still be using the BO. 
*/ ++ if (bo->dev->kbase) ++ return; ++ + if (!bo->ptr.cpu) + return; + +- if (os_munmap((void *) (uintptr_t)bo->ptr.cpu, bo->size)) { ++ if (os_munmap(bo->ptr.cpu, bo->size)) { + perror("munmap"); + abort(); + } +@@ -390,8 +564,15 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + if (!bo) + bo = panfrost_bo_cache_fetch(dev, size, flags, label, false); + if (!bo) { +- panfrost_bo_cache_evict_all(dev); +- bo = panfrost_bo_alloc(dev, size, flags, label); ++ for (unsigned i = 0; i < 5; ++i) { ++ usleep(20 * 1000 * i * i); ++ if (dev->kbase) ++ kbase_ensure_handle_events(&dev->mali); ++ panfrost_bo_cache_evict_all(dev); ++ bo = panfrost_bo_alloc(dev, size, flags, label); ++ if (bo) ++ break; ++ } + } + + if (!bo) { +@@ -406,8 +587,15 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP))) + panfrost_bo_mmap(bo); + ++ if ((dev->debug & PAN_DBG_BO_CLEAR) && !(flags & PAN_BO_INVISIBLE)) { ++ memset(bo->ptr.cpu, 0, bo->size); ++ panfrost_bo_mem_clean(bo, 0, bo->size); ++ } ++ + p_atomic_set(&bo->refcnt, 1); + ++ util_dynarray_init(&bo->usage, NULL); ++ + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + if (flags & PAN_BO_INVISIBLE) + pandecode_inject_mmap(bo->ptr.gpu, NULL, bo->size, NULL); +@@ -415,6 +603,14 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + pandecode_inject_mmap(bo->ptr.gpu, bo->ptr.cpu, bo->size, NULL); + } + ++ if (dev->bo_log) { ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li alloc %"PRIx64" to %"PRIx64" size %zu label %s\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label); ++ fflush(NULL); ++ } ++ + return bo; + } + +@@ -427,6 +623,60 @@ panfrost_bo_reference(struct panfrost_bo *bo) + } + } + ++static void ++panfrost_bo_fini(struct panfrost_bo *bo) ++{ ++ struct panfrost_device *dev = bo->dev; ++ ++ /* When the reference count goes to zero, we need to cleanup */ ++ panfrost_bo_munmap(bo); ++ ++ if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) ++ pandecode_inject_free(bo->ptr.gpu, bo->size); ++ ++ /* Rather than freeing the BO now, we'll cache the BO for later ++ * allocations if we're allowed to. ++ */ ++ if (!panfrost_bo_cache_put(bo)) ++ panfrost_bo_free(bo); ++} ++ ++static void ++panfrost_bo_free_gpu(void *data) ++{ ++ struct panfrost_bo *bo = data; ++ struct panfrost_device *dev = bo->dev; ++ ++ /* Don't free if there are still references */ ++ if (p_atomic_dec_return(&bo->gpu_refcnt)) ++ return; ++ ++ pthread_mutex_lock(&dev->bo_map_lock); ++ ++ /* Someone might have imported this BO while we were waiting for the ++ * lock, let's make sure it's still not referenced before freeing it. 
++ */ ++ if (p_atomic_read(&bo->refcnt) != 0) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } ++ ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li gpufree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ panfrost_bo_fini(bo); ++ ++ pthread_mutex_unlock(&dev->bo_map_lock); ++} ++ + void + panfrost_bo_unreference(struct panfrost_bo *bo) + { +@@ -439,25 +689,57 @@ panfrost_bo_unreference(struct panfrost_bo *bo) + + struct panfrost_device *dev = bo->dev; + ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li free %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ + pthread_mutex_lock(&dev->bo_map_lock); + + /* Someone might have imported this BO while we were waiting for the + * lock, let's make sure it's still not referenced before freeing it. + */ +- if (p_atomic_read(&bo->refcnt) == 0) { +- /* When the reference count goes to zero, we need to cleanup */ +- panfrost_bo_munmap(bo); ++ if (p_atomic_read(&bo->refcnt) != 0) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } + +- if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) +- pandecode_inject_free(bo->ptr.gpu, bo->size); ++ util_dynarray_fini(&bo->usage); + +- /* Rather than freeing the BO now, we'll cache the BO for later +- * allocations if we're allowed to. ++ if (dev->kbase) { ++ /* Assume that all queues are using this BO, and so free the ++ * BO only after all currently-submitted jobs have finished. ++ * This could eventually be optimised to only wait on a subset ++ * of queues. 
+ */ +- if (!panfrost_bo_cache_put(bo)) +- panfrost_bo_free(bo); ++ bool added = dev->mali.callback_all_queues(&dev->mali, ++ &bo->gpu_refcnt, panfrost_bo_free_gpu, bo); + ++ if (added) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } + } ++ ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li immfree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ panfrost_bo_fini(bo); ++ + pthread_mutex_unlock(&dev->bo_map_lock); + } + +@@ -467,22 +749,42 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + struct panfrost_bo *bo; + struct drm_panfrost_get_bo_offset get_bo_offset = {0,}; + ASSERTED int ret; ++ kbase_handle handle = { .fd = -1 }; + unsigned gem_handle; + +- ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); +- assert(!ret); ++ if (dev->kbase) { ++ gem_handle = dev->mali.import_dmabuf(&dev->mali, fd); ++ if (gem_handle == -1) ++ return NULL; ++ } else { ++ ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); ++ assert(!ret); ++ } + + pthread_mutex_lock(&dev->bo_map_lock); + bo = pan_lookup_bo(dev, gem_handle); + ++ bool found = false; ++ + if (!bo->dev) { + get_bo_offset.handle = gem_handle; +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); +- assert(!ret); ++ if (dev->kbase) { ++ handle = kbase_gem_handle_get(&dev->mali, gem_handle); ++ get_bo_offset.offset = handle.va; ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); ++ assert(!ret); ++ } + + bo->dev = dev; +- bo->ptr.gpu = (mali_ptr) get_bo_offset.offset; + bo->size = lseek(fd, 0, SEEK_END); ++ bo->ptr.gpu = (mali_ptr) get_bo_offset.offset; ++ if (dev->kbase && (sizeof(void *) > 4 || get_bo_offset.offset < (1LL << 32))) { ++ bo->ptr.cpu = (void *)(uintptr_t) get_bo_offset.offset; ++ } else if (dev->kbase) { ++ bo->ptr.cpu = dev->mali.mmap_import(&dev->mali, bo->ptr.gpu, bo->size); ++ bo->free_ioctl = true; ++ } + /* Sometimes this can fail and return -1. size of -1 is not + * a nice thing for mmap to try mmap. Be more robust also + * for zero sized maps and fail nicely too +@@ -493,8 +795,21 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + } + bo->flags = PAN_BO_SHARED; + bo->gem_handle = gem_handle; ++ util_dynarray_init(&bo->usage, NULL); ++ if (dev->kbase) { ++ /* kbase always maps dma-bufs with caching */ ++ bo->cached = true; ++ ++ /* Importing duplicates the FD, so we cache the FD ++ * from the handle */ ++ bo->dmabuf_fd = handle.fd; ++ } else { ++ bo->dmabuf_fd = -1; ++ } + p_atomic_set(&bo->refcnt, 1); + } else { ++ found = true; ++ + /* bo->refcnt == 0 can happen if the BO + * was being released but panfrost_bo_import() acquired the + * lock before panfrost_bo_unreference(). 
In that case, refcnt +@@ -512,12 +827,34 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + } + pthread_mutex_unlock(&dev->bo_map_lock); + ++ if (dev->bo_log) { ++ int new_fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li import %"PRIx64" to %"PRIx64" size %zu fd %i new %i handle %i found %i\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, ++ fd, new_fd, gem_handle, found); ++ fflush(NULL); ++ } ++ + return bo; + } + + int + panfrost_bo_export(struct panfrost_bo *bo) + { ++ struct panfrost_device *dev = bo->dev; ++ ++ if (bo->dmabuf_fd != -1) { ++ assert(bo->flags & PAN_BO_SHARED); ++ ++ return os_dupfd_cloexec(bo->dmabuf_fd); ++ } ++ ++ if (dev->kbase) ++ return -1; ++ + struct drm_prime_handle args = { + .handle = bo->gem_handle, + .flags = DRM_CLOEXEC, diff --git a/src/panfrost/lib/pan_bo.h.rej b/src/panfrost/lib/pan_bo.h.rej new file mode 100644 index 00000000000..b7833465c45 --- /dev/null +++ b/src/panfrost/lib/pan_bo.h.rej @@ -0,0 +1,84 @@ +diff a/src/panfrost/lib/pan_bo.h b/src/panfrost/lib/pan_bo.h (rejected hunks) +@@ -27,6 +27,7 @@ + #define __PAN_BO_H__ + + #include "util/list.h" ++#include "util/u_dynarray.h" + #include "panfrost-job.h" + #include + +@@ -50,6 +51,12 @@ + * cached locally */ + #define PAN_BO_SHARED (1 << 4) + ++/* Use event memory, required for CSF events to be signaled to the kernel */ ++#define PAN_BO_EVENT (1 << 5) ++ ++/* Use the caching policy for resource BOs */ ++#define PAN_BO_CACHEABLE (1 << 6) ++ + /* GPU access flags */ + + /* BO is either shared (can be accessed by more than one GPU batch) or private +@@ -80,6 +87,12 @@ struct panfrost_ptr { + mali_ptr gpu; + }; + ++struct panfrost_usage { ++ uint32_t queue; ++ bool write; ++ uint64_t seqnum; ++}; ++ + struct panfrost_bo { + /* Must be first for casting */ + struct list_head bucket_link; +@@ -95,11 +108,16 @@ struct panfrost_bo { + /* Atomic reference count */ + int32_t refcnt; + ++ /* Reference count for GPU jobs */ ++ int32_t gpu_refcnt; ++ + struct panfrost_device *dev; + + /* Mapping for the entire object (all levels) */ + struct panfrost_ptr ptr; + ++ struct util_dynarray usage; ++ + /* Size of all entire trees */ + size_t size; + +@@ -115,11 +133,31 @@ struct panfrost_bo { + + /* Human readable description of the BO for debugging. */ + const char *label; ++ ++ /* Sometimes we don't access the BO through kbase's mapping of the ++ * memory, in that case we need to save the pointer to pass to ++ * munmap to avoid leaking memory. */ ++ void *munmap_ptr; ++ ++ /* For 32-bit applications we may not even be able to that, because ++ * the VA may be too high for kbase to map to an equivalent CPU ++ * address, in which case we must use the memory free icotl. */ ++ bool free_ioctl; ++ ++ /* Is the BO cached CPU-side? 
*/ ++ bool cached; ++ ++ /* File descriptor for the dma-buf */ ++ int dmabuf_fd; + }; + + bool + panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers); + void ++panfrost_bo_mem_invalidate(struct panfrost_bo *bo, size_t offset, size_t length); ++void ++panfrost_bo_mem_clean(struct panfrost_bo *bo, size_t offset, size_t length); ++void + panfrost_bo_reference(struct panfrost_bo *bo); + void + panfrost_bo_unreference(struct panfrost_bo *bo); diff --git a/src/panfrost/lib/pan_device.h.rej b/src/panfrost/lib/pan_device.h.rej new file mode 100644 index 00000000000..5ff078535fe --- /dev/null +++ b/src/panfrost/lib/pan_device.h.rej @@ -0,0 +1,88 @@ +diff a/src/panfrost/lib/pan_device.h b/src/panfrost/lib/pan_device.h (rejected hunks) +@@ -35,11 +35,12 @@ + #include "util/u_dynarray.h" + #include "util/bitset.h" + #include "util/list.h" +-#include "util/sparse_array.h" ++#include "util/stable_array.h" + + #include "panfrost/util/pan_ir.h" + #include "pan_pool.h" + #include "pan_util.h" ++#include "pan_base.h" + + #include + +@@ -182,6 +183,7 @@ struct panfrost_device { + void *memctx; + + int fd; ++ bool kbase; + + /* Properties of the GPU in use */ + unsigned arch; +@@ -204,6 +206,9 @@ struct panfrost_device { + const struct panfrost_model *model; + bool has_afbc; + ++ /* Does the kernel support dma-buf fence import/export? */ ++ bool has_dmabuf_fence; ++ + /* Table of formats, indexed by a PIPE format */ + const struct panfrost_format *formats; + +@@ -217,8 +222,11 @@ struct panfrost_device { + + struct renderonly *ro; + ++ /* Hold this while updating usage field of BOs */ ++ pthread_mutex_t bo_usage_lock; ++ + pthread_mutex_t bo_map_lock; +- struct util_sparse_array bo_map; ++ struct stable_array bo_map; + + struct { + pthread_mutex_t lock; +@@ -263,6 +271,10 @@ struct panfrost_device { + * unconditionally on Bifrost, and useful for sharing with Midgard */ + + struct panfrost_bo *sample_positions; ++ ++ struct kbase_ mali; ++ ++ FILE *bo_log; + }; + + void +@@ -271,6 +283,9 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev); + void + panfrost_close_device(struct panfrost_device *dev); + ++bool ++panfrost_check_dmabuf_fence(struct panfrost_device *dev); ++ + bool + panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt); + +@@ -287,12 +302,18 @@ panfrost_query_sample_position( + float *out); + + unsigned +-panfrost_query_l2_slices(const struct panfrost_device *dev); ++panfrost_query_l2_slices(struct panfrost_device *dev); + + static inline struct panfrost_bo * + pan_lookup_bo(struct panfrost_device *dev, uint32_t gem_handle) + { +- return (struct panfrost_bo *)util_sparse_array_get(&dev->bo_map, gem_handle); ++ return stable_array_get(&dev->bo_map, struct panfrost_bo, gem_handle); ++} ++ ++static inline struct panfrost_bo * ++pan_lookup_bo_existing(struct panfrost_device *dev, uint32_t gem_handle) ++{ ++ return stable_array_get_existing(&dev->bo_map, struct panfrost_bo, gem_handle); + } + + static inline bool diff --git a/src/panfrost/lib/pan_layout.c.rej b/src/panfrost/lib/pan_layout.c.rej new file mode 100644 index 00000000000..d37ee10f41d --- /dev/null +++ b/src/panfrost/lib/pan_layout.c.rej @@ -0,0 +1,66 @@ +diff a/src/panfrost/lib/pan_layout.c b/src/panfrost/lib/pan_layout.c (rejected hunks) +@@ -32,6 +32,14 @@ + * enabling the YUV-like transform is typically a win where possible. 
*/ + + uint64_t pan_best_modifiers[PAN_MODIFIER_COUNT] = { ++ DRM_FORMAT_MOD_ARM_AFBC( ++ AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | ++ AFBC_FORMAT_MOD_TILED | ++ AFBC_FORMAT_MOD_SC | ++ AFBC_FORMAT_MOD_SPARSE | ++ AFBC_FORMAT_MOD_YTR | ++ AFBC_FORMAT_MOD_NATIVE_SWIZZLE), ++ + DRM_FORMAT_MOD_ARM_AFBC( + AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | + AFBC_FORMAT_MOD_TILED | +@@ -201,18 +209,17 @@ pan_afbc_body_align(uint64_t modifier) + #define CHECKSUM_TILE_HEIGHT 16 + #define CHECKSUM_BYTES_PER_TILE 8 + +-unsigned +-panfrost_compute_checksum_size( +- struct pan_image_slice_layout *slice, +- unsigned width, +- unsigned height) ++struct pan_image_slice_crc ++panfrost_compute_checksum_size(unsigned width, unsigned height) + { + unsigned tile_count_x = DIV_ROUND_UP(width, CHECKSUM_TILE_WIDTH); + unsigned tile_count_y = DIV_ROUND_UP(height, CHECKSUM_TILE_HEIGHT); + +- slice->crc.stride = tile_count_x * CHECKSUM_BYTES_PER_TILE; +- +- return slice->crc.stride * tile_count_y; ++ struct pan_image_slice_crc ret = { ++ .stride = tile_count_x * CHECKSUM_BYTES_PER_TILE, ++ .size = ret.stride * tile_count_y, ++ }; ++ return ret; + } + + unsigned +@@ -236,8 +243,11 @@ panfrost_get_legacy_stride(const struct pan_image_layout *layout, + panfrost_block_size(layout->modifier, layout->format); + + if (drm_is_afbc(layout->modifier)) { ++ unsigned align_w = block_size.width * ++ pan_afbc_tile_size(layout->modifier); ++ + unsigned width = u_minify(layout->width, level); +- width = ALIGN_POT(width, block_size.width); ++ width = ALIGN_POT(width, align_w); + + return width * util_format_get_blocksize(layout->format); + } else { +@@ -392,9 +402,7 @@ pan_image_layout_init(struct pan_image_layout *layout, + + /* Add a checksum region if necessary */ + if (layout->crc) { +- slice->crc.size = +- panfrost_compute_checksum_size(slice, width, height); +- ++ slice->crc = panfrost_compute_checksum_size(width, height); + slice->crc.offset = offset; + offset += slice->crc.size; + slice->size += slice->crc.size; diff --git a/src/panfrost/lib/pan_pool.h.rej b/src/panfrost/lib/pan_pool.h.rej new file mode 100644 index 00000000000..c7ee5984d5a --- /dev/null +++ b/src/panfrost/lib/pan_pool.h.rej @@ -0,0 +1,19 @@ +diff a/src/panfrost/lib/pan_pool.h b/src/panfrost/lib/pan_pool.h (rejected hunks) +@@ -130,4 +130,17 @@ pan_pool_alloc_descs(struct pan_pool *pool, + #define pan_pool_alloc_desc_aggregate(pool, ...) 
\ + pan_pool_alloc_descs(pool, PAN_DESC_AGGREGATE(__VA_ARGS__)) + ++#ifdef PAN_ARCH ++#if PAN_ARCH < 10 ++ ++#define pan_pool_alloc_desc_cs_v10(pool, name) \ ++ pan_pool_alloc_desc(pool, name) ++ ++#else /* PAN_ARCH >= 10 */ ++ ++#define pan_pool_alloc_desc_cs_v10(pool, name) ((struct panfrost_ptr) {0}) ++ ++#endif ++#endif /* PAN_ARCH */ ++ + #endif diff --git a/src/panfrost/lib/pan_props.c.rej b/src/panfrost/lib/pan_props.c.rej new file mode 100644 index 00000000000..af28edb15b2 --- /dev/null +++ b/src/panfrost/lib/pan_props.c.rej @@ -0,0 +1,365 @@ +diff a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c (rejected hunks) +@@ -24,6 +24,7 @@ + * Alyssa Rosenzweig + */ + ++#include + #include + + #include "util/u_math.h" +@@ -31,12 +32,14 @@ + #include "util/hash_table.h" + #include "util/u_thread.h" + #include "drm-uapi/panfrost_drm.h" ++#include "dma-uapi/dma-buf.h" + #include "pan_encoder.h" + #include "pan_device.h" + #include "pan_bo.h" + #include "pan_texture.h" + #include "wrap.h" + #include "pan_util.h" ++#include "pan_base.h" + + /* Fixed "minimum revisions" */ + #define NO_ANISO (~0) +@@ -70,6 +73,18 @@ const struct panfrost_model panfrost_model_list[] = { + MODEL(0x7212, "G52", "TGOx", HAS_ANISO, 16384, {}), + MODEL(0x7402, "G52 r1", "TGOx", HAS_ANISO, 16384, {}), + MODEL(0x9093, "G57", "TNAx", HAS_ANISO, 16384, {}), ++ MODEL(0xa867, "G610", "LODx", HAS_ANISO, 65536, {}), ++ /* Matching the kbase dummy model, probably not real GPUs */ ++ MODEL(0xa802, "G710", "TODx", HAS_ANISO, 65536, {}), ++}; ++ ++const struct panfrost_model panfrost_unknown_model = { ++ .gpu_id = 0, ++ .name = "Unknowm Mali device (Panfrost)", ++ .performance_counters = "AAAA", ++ .min_rev_anisotropic = NO_ANISO, ++ .tilebuffer_size = 8192, ++ .quirks = {}, + }; + + #undef NO_ANISO +@@ -83,12 +98,13 @@ const struct panfrost_model panfrost_model_list[] = { + const struct panfrost_model * + panfrost_get_model(uint32_t gpu_id) + { ++ + for (unsigned i = 0; i < ARRAY_SIZE(panfrost_model_list); ++i) { + if (panfrost_model_list[i].gpu_id == gpu_id) + return &panfrost_model_list[i]; + } + +- return NULL; ++ return &panfrost_unknown_model; + } + + /* Abstraction over the raw drm_panfrost_get_param ioctl for fetching +@@ -96,16 +112,27 @@ panfrost_get_model(uint32_t gpu_id) + + static __u64 + panfrost_query_raw( +- int fd, ++ struct panfrost_device *dev, + enum drm_panfrost_param param, + bool required, + unsigned default_value) + { ++ if (dev->kbase) { ++ uint64_t value; ++ bool ret = dev->mali.get_pan_gpuprop(&dev->mali, param, &value); ++ if (ret) { ++ return value; ++ } else { ++ assert(!required); ++ return default_value; ++ } ++ } ++ + struct drm_panfrost_get_param get_param = {0,}; + ASSERTED int ret; + + get_param.param = param; +- ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); + + if (ret) { + assert(!required); +@@ -116,23 +143,23 @@ panfrost_query_raw( + } + + static unsigned +-panfrost_query_gpu_version(int fd) ++panfrost_query_gpu_version(struct panfrost_device *dev) + { +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); + } + + static unsigned +-panfrost_query_gpu_revision(int fd) ++panfrost_query_gpu_revision(struct panfrost_device *dev) + { +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_REVISION, true, 0); ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_GPU_REVISION, true, 0); + } + + unsigned 
+-panfrost_query_l2_slices(const struct panfrost_device *dev) ++panfrost_query_l2_slices(struct panfrost_device *dev) + { + /* Query MEM_FEATURES register */ + uint32_t mem_features = +- panfrost_query_raw(dev->fd, DRM_PANFROST_PARAM_MEM_FEATURES, ++ panfrost_query_raw(dev, DRM_PANFROST_PARAM_MEM_FEATURES, + true, 0); + + /* L2_SLICES is MEM_FEATURES[11:8] minus(1) */ +@@ -140,10 +167,10 @@ panfrost_query_l2_slices(const struct panfrost_device *dev) + } + + static struct panfrost_tiler_features +-panfrost_query_tiler_features(int fd) ++panfrost_query_tiler_features(struct panfrost_device *dev) + { + /* Default value (2^9 bytes and 8 levels) to match old behaviour */ +- uint32_t raw = panfrost_query_raw(fd, DRM_PANFROST_PARAM_TILER_FEATURES, ++ uint32_t raw = panfrost_query_raw(dev, DRM_PANFROST_PARAM_TILER_FEATURES, + false, 0x809); + + /* Bin size is log2 in the first byte, max levels in the second byte */ +@@ -154,11 +181,11 @@ panfrost_query_tiler_features(int fd) + } + + static unsigned +-panfrost_query_core_count(int fd, unsigned *core_id_range) ++panfrost_query_core_count(struct panfrost_device *dev, unsigned *core_id_range) + { + /* On older kernels, worst-case to 16 cores */ + +- unsigned mask = panfrost_query_raw(fd, ++ unsigned mask = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff); + + /* Some cores might be absent. In some cases, we care +@@ -199,16 +226,16 @@ panfrost_max_thread_count(unsigned arch) + } + + static unsigned +-panfrost_query_thread_tls_alloc(int fd, unsigned major) ++panfrost_query_thread_tls_alloc(struct panfrost_device *dev, unsigned major) + { +- unsigned tls = panfrost_query_raw(fd, ++ unsigned tls = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 0); + + return (tls > 0) ? tls : panfrost_max_thread_count(major); + } + + static uint32_t +-panfrost_query_compressed_formats(int fd) ++panfrost_query_compressed_formats(struct panfrost_device *dev) + { + /* If unspecified, assume ASTC/ETC only. Factory default for Juno, and + * should exist on any Mali configuration. All hardware should report +@@ -227,7 +254,7 @@ panfrost_query_compressed_formats(int fd) + (1 << MALI_ASTC_2D_LDR) | + (1 << MALI_ASTC_2D_HDR); + +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, + false, default_set); + } + +@@ -250,9 +277,9 @@ panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt) + * may omit it, signaled as a nonzero value in the AFBC_FEATURES property. 
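panfrost_query_tiler_features() above falls back to a raw value of 0x809 when the query is unavailable and then unpacks it; per the comment in the hunk, the encoding is log2(bin size) in bits 7:0 and the maximum hierarchy level count in bits 15:8. A small sketch of that decode (struct and function names are illustrative, not the driver's):

#include <stdint.h>

struct tiler_features_example {
   unsigned bin_size;   /* bytes per tiler bin */
   unsigned max_levels; /* hierarchy levels supported by the tiler */
};

static struct tiler_features_example
decode_tiler_features(uint32_t raw)
{
   struct tiler_features_example f = {
      .bin_size   = 1u << (raw & 0xff),   /* stored as log2 */
      .max_levels = (raw >> 8) & 0xff,
   };
   return f;
}

With the 0x809 default this yields a 512-byte bin size and 8 levels, matching the "2^9 bytes and 8 levels" comment.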
*/ + + static bool +-panfrost_query_afbc(int fd, unsigned arch) ++panfrost_query_afbc(struct panfrost_device *dev, unsigned arch) + { +- unsigned reg = panfrost_query_raw(fd, ++ unsigned reg = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_AFBC_FEATURES, + false, 0); + +@@ -281,24 +308,40 @@ panfrost_query_optimal_tib_size(const struct panfrost_device *dev) + void + panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + { ++ if (kbase_open(&dev->mali, fd, 4, (dev->debug & PAN_DBG_LOG))) { ++ dev->kbase = true; ++ fd = -1; ++ } ++ + dev->fd = fd; + dev->memctx = memctx; +- dev->gpu_id = panfrost_query_gpu_version(fd); ++ dev->gpu_id = panfrost_query_gpu_version(dev); + dev->arch = pan_arch(dev->gpu_id); +- dev->kernel_version = drmGetVersion(fd); +- dev->revision = panfrost_query_gpu_revision(fd); ++ if (dev->kbase) { ++ dev->kernel_version = calloc(1, sizeof(drmVersion)); ++ *dev->kernel_version = (drmVersion) { ++ .version_major = 1, ++ .version_minor = 999, ++ }; ++ } else { ++ dev->kernel_version = drmGetVersion(fd); ++ } ++ dev->revision = panfrost_query_gpu_revision(dev); + dev->model = panfrost_get_model(dev->gpu_id); + + /* If we don't recognize the model, bail early */ + if (!dev->model) + return; + +- dev->core_count = panfrost_query_core_count(fd, &dev->core_id_range); +- dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd, dev->arch); ++ if (dev->debug & PAN_DBG_BO_LOG) ++ dev->bo_log = fopen("/tmp/bo_log", "w"); ++ ++ dev->core_count = panfrost_query_core_count(dev, &dev->core_id_range); ++ dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(dev, dev->arch); + dev->optimal_tib_size = panfrost_query_optimal_tib_size(dev); +- dev->compressed_formats = panfrost_query_compressed_formats(fd); +- dev->tiler_features = panfrost_query_tiler_features(fd); +- dev->has_afbc = panfrost_query_afbc(fd, dev->arch); ++ dev->compressed_formats = panfrost_query_compressed_formats(dev); ++ dev->tiler_features = panfrost_query_tiler_features(dev); ++ dev->has_afbc = panfrost_query_afbc(dev, dev->arch); + + if (dev->arch <= 6) + dev->formats = panfrost_pipe_format_v6; +@@ -307,8 +350,10 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + else + dev->formats = panfrost_pipe_format_v9; + +- util_sparse_array_init(&dev->bo_map, sizeof(struct panfrost_bo), 512); ++ stable_array_init(&dev->bo_map, struct panfrost_bo); + ++ pthread_mutex_init(&dev->bo_usage_lock, NULL); ++ pthread_mutex_init(&dev->bo_map_lock, NULL); + pthread_mutex_init(&dev->bo_cache.lock, NULL); + list_inithead(&dev->bo_cache.lru); + +@@ -323,8 +368,9 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + * active for a single job chain at once, so a single heap can be + * shared across batches/contextes */ + +- dev->tiler_heap = panfrost_bo_create(dev, 128 * 1024 * 1024, +- PAN_BO_INVISIBLE | PAN_BO_GROWABLE, "Tiler heap"); ++ if (dev->arch < 10) ++ dev->tiler_heap = panfrost_bo_create(dev, 128 * 1024 * 1024, ++ PAN_BO_INVISIBLE | PAN_BO_GROWABLE, "Tiler heap"); + + pthread_mutex_init(&dev->submit_lock, NULL); + +@@ -341,11 +387,102 @@ panfrost_close_device(struct panfrost_device *dev) + if (dev->model) { + pthread_mutex_destroy(&dev->submit_lock); + panfrost_bo_unreference(dev->tiler_heap); ++ panfrost_bo_unreference(dev->sample_positions); + panfrost_bo_cache_evict_all(dev); + pthread_mutex_destroy(&dev->bo_cache.lock); +- util_sparse_array_finish(&dev->bo_map); ++ pthread_mutex_destroy(&dev->bo_map_lock); ++ pthread_mutex_destroy(&dev->bo_usage_lock); ++ 
stable_array_fini(&dev->bo_map); ++ } ++ ++ if (dev->kbase) ++ free(dev->kernel_version); ++ else ++ drmFreeVersion(dev->kernel_version); ++ if (dev->kbase) ++ dev->mali.close(&dev->mali); ++ else ++ close(dev->fd); ++} ++ ++bool ++panfrost_check_dmabuf_fence(struct panfrost_device *dev) ++{ ++ bool ret = false; ++ int err; ++ ++ /* This function is only useful for kbase, where we can't create ++ * dma-bufs from the kbase FD. */ ++ if (!dev->ro) ++ goto out; ++ ++ struct drm_mode_create_dumb create_dumb = { ++ .width = 16, ++ .height = 16, ++ .bpp = 32, ++ }; ++ ++ err = drmIoctl(dev->ro->kms_fd, DRM_IOCTL_MODE_CREATE_DUMB, &create_dumb); ++ if (err < 0) { ++ fprintf(stderr, "DRM_IOCTL_MODE_CREATE_DUMB failed " ++ "for fence check: %s\n", ++ strerror(errno)); ++ goto out; ++ } ++ ++ int fd; ++ err = drmPrimeHandleToFD(dev->ro->kms_fd, create_dumb.handle, O_CLOEXEC, ++ &fd); ++ if (err < 0) { ++ fprintf(stderr, "failed to export buffer for fence check: %s\n", ++ strerror(errno)); ++ goto free_dumb; + } + +- drmFreeVersion(dev->kernel_version); +- close(dev->fd); ++ struct dma_buf_export_sync_file export = { ++ .flags = DMA_BUF_SYNC_RW, ++ }; ++ ++ /* ENOTTY is returned if the ioctl is unsupported */ ++ ++ err = drmIoctl(fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &export); ++ if (err < 0) { ++ if (errno != ENOTTY) ++ fprintf(stderr, "failed to export fence: %s\n", ++ strerror(errno)); ++ goto free_fd; ++ } ++ ++ struct dma_buf_import_sync_file import = { ++ .flags = DMA_BUF_SYNC_RW, ++ .fd = export.fd, ++ }; ++ ++ err = drmIoctl(fd, DMA_BUF_IOCTL_IMPORT_SYNC_FILE, &import); ++ if (err < 0) { ++ if (errno != ENOTTY) ++ fprintf(stderr, "failed to import fence: %s\n", ++ strerror(errno)); ++ goto free_sync; ++ } ++ ++ /* We made it this far, the kernel must support the ioctls */ ++ ret = true; ++ ++free_sync: ++ close(export.fd); ++ ++free_fd: ++ close(fd); ++ ++ /* Some compilers don't like goto to a declaration */ ++ struct drm_mode_destroy_dumb destroy_dumb; ++free_dumb: ++ destroy_dumb = (struct drm_mode_destroy_dumb) { ++ .handle = create_dumb.handle, ++ }; ++ drmIoctl(dev->ro->kms_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy_dumb); ++ ++out: ++ return ret; + } diff --git a/src/panfrost/lib/pan_texture.h.rej b/src/panfrost/lib/pan_texture.h.rej new file mode 100644 index 00000000000..7a7f33572de --- /dev/null +++ b/src/panfrost/lib/pan_texture.h.rej @@ -0,0 +1,55 @@ +diff a/src/panfrost/lib/pan_texture.h b/src/panfrost/lib/pan_texture.h (rejected hunks) +@@ -44,9 +44,15 @@ + extern "C" { + #endif + +-#define PAN_MODIFIER_COUNT 6 ++#define PAN_MODIFIER_COUNT 7 + extern uint64_t pan_best_modifiers[PAN_MODIFIER_COUNT]; + ++struct pan_image_slice_crc { ++ unsigned offset; ++ unsigned stride; ++ unsigned size; ++}; ++ + struct pan_image_slice_layout { + unsigned offset; + +@@ -80,11 +86,7 @@ struct pan_image_slice_layout { + + /* If checksumming is enabled following the slice, what + * is its offset/stride? 
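panfrost_check_dmabuf_fence() above probes for kernel support by creating a dumb KMS buffer, exporting it as a dma-buf and then attempting the sync-file export/import ioctls, treating ENOTTY as "not implemented". A trimmed-down sketch of that probe for an already-open dma-buf fd, assuming kernel headers that provide DMA_BUF_IOCTL_EXPORT_SYNC_FILE (the patch ships its own copy of the UAPI under include/dma-uapi/dma-buf.h):

#include <errno.h>
#include <stdbool.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dma-buf.h>

/* Returns true if the kernel implements the dma-buf sync-file export ioctl.
 * ENOTTY is the only errno that means "ioctl unknown"; any other outcome
 * still proves the ioctl exists. */
static bool
dmabuf_has_sync_file_ioctls(int dmabuf_fd)
{
   struct dma_buf_export_sync_file args = { .flags = DMA_BUF_SYNC_RW };

   if (ioctl(dmabuf_fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &args) == 0) {
      close(args.fd);   /* only needed to confirm the ioctl works */
      return true;
   }
   return errno != ENOTTY;
}

The real check goes one step further and imports the exported sync file back with DMA_BUF_IOCTL_IMPORT_SYNC_FILE before declaring support.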
*/ +- struct { +- unsigned offset; +- unsigned stride; +- unsigned size; +- } crc; ++ struct pan_image_slice_crc crc; + + unsigned size; + }; +@@ -141,11 +143,8 @@ struct pan_image_view { + } buf; + }; + +-unsigned +-panfrost_compute_checksum_size( +- struct pan_image_slice_layout *slice, +- unsigned width, +- unsigned height); ++struct pan_image_slice_crc ++panfrost_compute_checksum_size(unsigned width, unsigned height); + + /* AFBC */ + +@@ -164,6 +163,9 @@ panfrost_afbc_can_ytr(enum pipe_format format); + bool + panfrost_afbc_can_tile(const struct panfrost_device *dev); + ++bool ++panfrost_afbc_only_native(unsigned arch, enum pipe_format format); ++ + /* + * Represents the block size of a single plane. For AFBC, this represents the + * superblock size. For u-interleaving, this represents the tile size. diff --git a/src/panfrost/lib/pan_util.h.rej b/src/panfrost/lib/pan_util.h.rej new file mode 100644 index 00000000000..eb65d19d46e --- /dev/null +++ b/src/panfrost/lib/pan_util.h.rej @@ -0,0 +1,19 @@ +diff a/src/panfrost/lib/pan_util.h b/src/panfrost/lib/pan_util.h (rejected hunks) +@@ -47,10 +47,16 @@ + #define PAN_DBG_LINEAR 0x1000 + #define PAN_DBG_NO_CACHE 0x2000 + #define PAN_DBG_DUMP 0x4000 +- + #ifndef NDEBUG + #define PAN_DBG_OVERFLOW 0x8000 + #endif ++#define PAN_DBG_TILER 0x010000 ++#define PAN_DBG_BO_LOG 0x020000 ++#define PAN_DBG_BO_CLEAR 0x040000 ++#define PAN_DBG_UNCACHED_GPU 0x100000 ++#define PAN_DBG_UNCACHED_CPU 0x200000 ++#define PAN_DBG_LOG 0x400000 ++#define PAN_DBG_GOFASTER 0x800000 + + struct panfrost_device; + diff --git a/src/panfrost/lib/wrap.h.rej b/src/panfrost/lib/wrap.h.rej new file mode 100644 index 00000000000..f645b59013b --- /dev/null +++ b/src/panfrost/lib/wrap.h.rej @@ -0,0 +1,21 @@ +diff a/src/panfrost/lib/wrap.h b/src/panfrost/lib/wrap.h (rejected hunks) +@@ -46,6 +46,8 @@ void pandecode_initialize(bool to_stderr); + + void pandecode_next_frame(void); + ++void pandecode_dump_file_close(void); ++ + void pandecode_close(void); + + void +@@ -55,6 +57,10 @@ void pandecode_inject_free(uint64_t gpu_va, unsigned sz); + + void pandecode_jc(uint64_t jc_gpu_va, unsigned gpu_id); + ++void pandecode_cs(uint64_t cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ ++void pandecode_dump_mappings(void); ++ + void + pandecode_abort_on_fault(uint64_t jc_gpu_va, unsigned gpu_id); + diff --git a/src/panfrost/meson.build b/src/panfrost/meson.build index aa393d44fe5..97773fe970f 100644 --- a/src/panfrost/meson.build +++ b/src/panfrost/meson.build @@ -20,7 +20,7 @@ # SOFTWARE. 
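The pan_layout.c and pan_texture.h hunks above turn the CRC sidecar into a small value struct computed purely from the slice dimensions: one 8-byte checksum per 16x16 pixel tile, with the stride covering a full row of tiles. A self-contained sketch of that arithmetic (names are illustrative):

#define CHECKSUM_TILE_WIDTH     16
#define CHECKSUM_TILE_HEIGHT    16
#define CHECKSUM_BYTES_PER_TILE 8

struct slice_crc_example {
   unsigned stride; /* bytes of CRC data per row of tiles */
   unsigned size;   /* total bytes of CRC data for the slice */
};

static unsigned
div_round_up(unsigned x, unsigned y)
{
   return (x + y - 1) / y;
}

/* Partial tiles at the right and bottom edges still get a full checksum. */
static struct slice_crc_example
slice_crc_layout(unsigned width, unsigned height)
{
   unsigned tiles_x = div_round_up(width, CHECKSUM_TILE_WIDTH);
   unsigned tiles_y = div_round_up(height, CHECKSUM_TILE_HEIGHT);
   struct slice_crc_example crc;

   crc.stride = tiles_x * CHECKSUM_BYTES_PER_TILE;
   crc.size   = crc.stride * tiles_y;
   return crc;
}

For a 160x160 slice that is 10 tiles per row, an 80-byte stride and 800 bytes in total. Computing the fields in separate statements also sidesteps the self-referencing initializer used in the rejected hunk (.size = ret.stride * tile_count_y), whose evaluation order the C standard leaves indeterminately sequenced.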
inc_panfrost_hw = include_directories([ - 'include' + 'include', 'base' ]) inc_panfrost = include_directories([ @@ -70,6 +70,46 @@ bifrost_compiler = executable( build_by_default : with_tools.contains('panfrost') ) +csf_test = executable( + 'csf_test', + ['csf_test/test.c'], + include_directories : [ + inc_mapi, + inc_mesa, + inc_gallium, + inc_gallium_aux, + inc_include, + inc_src, + inc_panfrost, + inc_panfrost_hw, + ], + dependencies : [ + idep_nir, + idep_mesautil, + idep_bi_opcodes_h, + dep_libdrm, + libpanfrost_dep, + ], + build_by_default : true +) + +custom_target( + 'panfrost_panloader', + output: ['panfrost_panloader.txt'], + depends : [ + libpanfrost_lib, + libpanfrost_util, + _libmesa_util, + libpanfrost_decode, + libpanfrost_decode_per_arch, + libpanfrost_midgard_disasm, + libpanfrost_bifrost_disasm, + libpanfrost_valhall_disasm, + ], + command: ['touch', '@OUTPUT@'], + build_by_default : false, +) + if with_panfrost_vk subdir('vulkan') endif diff --git a/src/panfrost/meson.build.rej b/src/panfrost/meson.build.rej new file mode 100644 index 00000000000..7e2b30f869f --- /dev/null +++ b/src/panfrost/meson.build.rej @@ -0,0 +1,10 @@ +diff a/src/panfrost/meson.build b/src/panfrost/meson.build (rejected hunks) +@@ -36,6 +36,8 @@ subdir('util') + subdir('midgard') + subdir('bifrost') + ++subdir('base') ++ + if with_gallium_panfrost or with_panfrost_vk + subdir('lib') + subdir('perf') diff --git a/src/panfrost/midgard/disassemble.c.rej b/src/panfrost/midgard/disassemble.c.rej new file mode 100644 index 00000000000..84b6a93ef56 --- /dev/null +++ b/src/panfrost/midgard/disassemble.c.rej @@ -0,0 +1,12 @@ +diff a/src/panfrost/midgard/disassemble.c b/src/panfrost/midgard/disassemble.c (rejected hunks) +@@ -1242,7 +1242,9 @@ print_alu_word(disassemble_context *ctx, FILE *fp, uint32_t *words, + UNUSED static void + print_varying_parameters(FILE *fp, midgard_load_store_word *word) + { +- midgard_varying_params p = midgard_unpack_varying_params(*word); ++ unsigned params = word->signed_offset & 0x1FF; ++ midgard_varying_params p; ++ memcpy(&p, ¶ms, sizeof(p)); + + /* If a varying, there are qualifiers */ + if (p.flat_shading) diff --git a/src/panfrost/tiler/tiler-hex-read b/src/panfrost/tiler/tiler-hex-read new file mode 100755 index 00000000000..1c188e38ec1 --- /dev/null +++ b/src/panfrost/tiler/tiler-hex-read @@ -0,0 +1,400 @@ +#!/usr/bin/env python3 + +import sys +import struct + +FLIP_Y = False + +data = b'' + +fb_width = 160 +fb_height = 160 +hierarchy_mask = 0xffff + +HEAP_OFS = 0x8000 + +base_ptr = 0 +heap_ptr = 0 +midgard = False +bifrost = True +valhall = False +size = None + +bak_data = b'' + +cur_data = b'' + +# TODO: More robust looping.. 
+for line in sys.stdin.read().split("\n"): + print(line) + split = line.split(" ") + if not len(split) or split[0] == "": + continue + if split[0] == "width": + fb_width = int(split[1]) + continue + if split[0] == "height": + fb_height = int(split[1]) + continue + if split[0] == "mask": + hierarchy_mask = int(split[1], 0) + continue + if split[0] == "vaheap": + base_ptr = int(split[1], 16) + bifrost = False + valhall = True + continue + if split[0] == "addr": + base_ptr = int(split[1], 16) + bifrost = False + midgard = True + HEAP_OFS = 0x40 + continue + if split[0] == "heap": + heap_ptr = int(split[1], 16) + data += cur_data + cur_data = b'' + bak_data = data + data = b'' + continue + if split[0] == "size": + size = int(split[1], 0) + continue + offset = int(split[0], 16) + if offset > len(data): + data += cur_data + cur_data = b'' + data += b'\0' * (offset - len(data)) + for d in split[1:]: + if d == "" or d == "*": + continue + cur_data += bytes([int(d, 16)]) + +data += cur_data + +if heap_ptr: + data, heap_data = bak_data, data + +if size == None: + size = len(data) + +def int7(val, signed=True): + val = val & 0x7f + if signed and val >= 0x40: + return val - 0x80 + else: + return val + +def int8(val, signed=True): + val = val & 0xff + if signed and val >= 0x80: + return val - 0x100 + else: + return val + +def fetch(ptr, size): + if midgard: + if ptr >= base_ptr and ptr < base_ptr + len(data): + base = ptr - base_ptr + return data[base:base+size] + elif ptr >= heap_ptr and ptr < heap_ptr + len(heap_data): + base = ptr - heap_ptr + return heap_data[base:base+size] + else: + if valhall: + ptr -= base_ptr + if ptr < 0: + return b"" + return data[ptr:ptr+size] + +def print_draw(ptr): + draw = fetch(ptr, 128) + if len(draw) < 128: + print(" couldn't fetch draw struct") + return + decoded = struct.unpack("=16Q", draw) + coverage = [0 for x in decoded] + + fields = ( + ("Allow forward pixel to kill", 1, "0:0", "bool"), + ("Allow forward pixel to be killed", 1, "0:1", "bool"), + ("Pixel kill operation", 2, "0:2", "Pixel Kill"), + ("ZS update operation", 2, "0:4", "Pixel Kill"), + ("Allow primitive reorder", 1, "0:6", "bool"), + ("Overdraw alpha0", 1, "0:7", "bool"), + ("Overdraw alpha1", 1, "0:8", "bool"), + ("Clean Fragment Write", 1, "0:9", "bool"), + ("Primitive Barrier", 1, "0:10", "bool"), + ("Evaluate per-sample", 1, "0:11", "bool"), + ("Single-sampled lines", 1, "0:13", "bool"), + ("Occlusion query", 2, "0:14", "Occlusion Mode"), + ("Front face CCW", 1, "0:16", "bool"), + ("Cull front face", 1, "0:17", "bool"), + ("Cull back face", 1, "0:18", "bool"), + ("Multisample enable", 1, "0:19", "bool"), + ("Shader modifies coverage", 1, "0:20", "bool"), + ("Alpha-to-coverage Invert", 1, "0:21", "bool"), + ("Alpha-to-coverage", 1, "0:22", "bool"), + ("Scissor to bounding box", 1, "0:23", "bool"), + ("Sample mask", 16, "1:0", "uint"), + ("Render target mask", 8, "1:16", "hex"), + + ("Packet", 1, "2:0", "bool"), + # TODO: shr modifier + ("Vertex array", 64, "2:0", "address"), + ("Vertex packet stride", 16, "4:0", "uint"), + ("Vertex attribute stride", 16, "4:16", "uint"), + ("Unk", 16, "5:0", "uint"), + + ("Minimum Z", 32, "6:0", "float"), + ("Maximum Z", 32, "7:0", "float"), + ("Depth/stencil", 64, "10:0", "address"), + ("Blend count", 4, "12:0", "uint"), + ("Blend", 60, "12:4", "address"), + ("Occlusion", 64, "14:0", "address"), + + ("Attribute offset", 32, "16:0", "uint"), + ("FAU count", 8, "17:0", "uint"), + ("Resources", 48, "24:0", "address"), + ("Shader", 48, "26:0", "address"), + ("Thread 
storage", 48, "28:0", "address"), + ("FAU", 64, "30:0", "address"), + ) + + for f in fields: + name, size, start, type = f + word, bit = [int(x) for x in start.split(":")] + if word & 1: + bit += 32 + word >>= 1 + + mask = (1 << size) - 1 + data = (decoded[word] >> bit) & mask + coverage[word] |= mask << bit + if type == "float": + data = struct.unpack("=f", struct.pack("=I", data))[0] + else: + data = hex(data) + print(f" {name}: {data}") + + for i, (d, c) in enumerate(zip(decoded, coverage)): + ci = c ^ ((1 << 64) - 1) + if d & ci: + print(f" unk at 64-bit word {i}: {hex(d)} (known mask {hex(c)})") + +def print_vertex(ptr, positions): + for p in positions: + addr = ptr + p * 16 + data = fetch(addr, 16) + if len(data) < 16: + print(f" ") + continue + x, y, z, w = struct.unpack("=4f", data) + print(f" <{x} {y} {z} {w}>") + +DRAW_TYPES = [ + "unk", + "points", + "lines", + "tris", +] + +def heap_interpret(start, end): + print(f"interpreting from {hex(start)} to {hex(end)}") + + struct_count = 0 + + signed = True + + base = 0 + a = 0 + b = 0 + c = 0 + + num_vert = 3 + + draw_ptr = 0 + pos_ptr = 0 + + while start != end: + if midgard and start & 0x1ff == 0x1f8: + jump = struct.unpack("=Q", fetch(start, 8))[0] + print(f"jump mdg: {hex(jump)}") + start = jump + continue + + dat = fetch(start, 4) + if dat[3] & 0xe0 == 0x80: + struct_count += 1 + + print(f"{struct_count}:", " ".join([f"{hex(x)[2:].upper():>02}" for x in dat]), end=" ") + + masked_op = dat[3] & ~3 + + up = struct.unpack("=I", dat)[0] + + if valhall: + tri0 = tri0_7 = int7(up >> 15, signed) + tri1 = int7(up >> 8, signed) + tri2 = int7(up >> 1, signed) + else: + tri0 = int8(up >> 14, signed) + tri0_7 = int7(up >> 14, signed) + tri1 = int7(up >> 7, signed) + tri2 = int7(up, signed) + + signed = True + + if dat[3] & 0xe0 == 0x80: + res = "" + if valhall: + address = (up & 0x7ffffff) * 32 + num_vert = (dat[3] >> 3) & 0x3 + else: + address = (up & 0xffffff) * 64 + num_vert = (dat[3] >> 2) & 0x3 + if dat[3] & 0x10: + a = 0 + res = " reset" + draw_ptr = address + if valhall: + pos_ptr = address + 128 + print(f"draw {DRAW_TYPES[num_vert]}{res}: {hex(address)}") + elif valhall and dat[3] >> 4 == 12: + unk1 = up & 0x3f + address = (up >> 6) & 0xffff + unk2 = up >> 22 + draw_ptr += address << 32 + pos_ptr += address << 32 + print(f"draw offset: {hex(address)}, unk {hex(unk1)}, {hex(unk2)}") + + print_draw(draw_ptr) + elif dat[3] >> 6 == 1: + # TODO: handle two of these in a row + res = "" + if valhall: + # TOOD: Is the mask correct? 
+ pf = (up >> 22) & 0x7f + shift = 7 + if dat[3] & 0x20: + a = 0 + res = " reset" + else: + pf = (up >> 21) & 0x7f + shift = 8 + + a += tri0_7 << shift + b += tri1 << 7 + c += tri2 << 7 + print(f"primitive offset{res}: {hex(pf << 4)} | +{tri0_7 << shift} {tri1 << 7} {tri2 << 7}") + signed = False + # TODO: Jumps are located based on position, not opcode + elif dat[3] == 0xff: + up64 = struct.unpack("=Q", fetch(start, 8))[0] + assert((up64 & 3) == 3) + print(f"jump (from {hex(start+8)}-8): {hex(up64 - 3)}") + start = up64 - 7 + elif dat[3] == 0x00: + assert((up & 3) == 3) + print(f"jump (from {hex(start+4)}-4): {hex(up - 3)}, {hex(HEAP_OFS + up - 3)}") + start = HEAP_OFS + up - 7 + elif (masked_op & 0xc0) == 0: + mode = hex(dat[3] >> 2) + + pre_offset = (up >> 22) & 0xf + + unk = "" + if valhall and up & 1: + unk = ", unk 1" + + a += base + tri0 + b += a + tri1 + c += a + tri2 + base = a + + print(f"{mode} draw: {hex(pre_offset)} | +{tri0} {tri1} {tri2}{unk}") + + print_vertex(pos_ptr, [a, b, c][:num_vert]) + + a = b = c = 0 + + else: + print(f"Unknown opcode {hex(dat[3])}") + + start += 4 + +def level_list(): + levels = [] + size = 16 + anylevel = False + + # TODO: Does this miss the largest level? + while anylevel == False or size // 2 < min(fb_width, fb_height): + if (hierarchy_mask << 4) & size != 0: + anylevel = True + levels.append(size) + + size *= 2 + + return levels + +def div_round_up(x, y): + return (x + y - 1) // y + +def align(x, y): + return div_round_up(x, y) * y + +def tile_count(alignment=4): + return sum(align(div_round_up(fb_width, size) * div_round_up(fb_height, size), 4) + for size in level_list()) + +if midgard: + unpacked_header = list(struct.unpack("=16i", data[0:64])) + # Is this really big endian? + unpacked_header[5:7] = struct.unpack(">2i", data[20:28]) + print(f"header: {' '.join([str(x) for x in unpacked_header])}") + + # Extra is because of HEAP_OFS + header_size = align(tile_count() + 8, 64) +elif valhall: + # TODO: Does this figure need alignment? 
+ HEAP_STRIDE = tile_count() * 8 + HEAP_OFS = size - HEAP_STRIDE * 2 + +pos = base_ptr + HEAP_OFS + +for size in level_list(): + for y in range((fb_height + size - 1) // size): + for x in range((fb_width + size - 1) // size): + header = fetch(pos, 8) + if len(header) == 0: + break + + if midgard: + end = struct.unpack("=Q", header)[0] + use = bool(end) + end += 4 + start = base_ptr + header_size * 8 + (pos - base_ptr - HEAP_OFS) * 64 + elif bifrost: + end, start = struct.unpack("=II", header) + use = bool(end) + start += HEAP_OFS + end += HEAP_OFS + 4 + end &= ~3 + else: + footer = fetch(pos + HEAP_STRIDE, 8) + if len(footer) == 0: + break + start, end = struct.unpack("=QQ", header + footer) + use = bool(end) + # The upper bits are used for jump metadata + end &= (1 << 48) - 1 + end += 4 + if use: + if FLIP_Y: + print([x * size, fb_height - (y + 1) * size], ((x + 1) * size, fb_height - y * size)) + else: + print([x * size, y * size], ((x + 1) * size, (y + 1) * size)) + heap_interpret(start, end) + + pos += 8 diff --git a/src/util/os_misc.c.rej b/src/util/os_misc.c.rej new file mode 100644 index 00000000000..261ce7607cd --- /dev/null +++ b/src/util/os_misc.c.rej @@ -0,0 +1,103 @@ +diff a/src/util/os_misc.c b/src/util/os_misc.c (rejected hunks) +@@ -53,7 +53,6 @@ + # define LOG_TAG "MESA" + # include + # include +-# include + #elif DETECT_OS_LINUX || DETECT_OS_CYGWIN || DETECT_OS_SOLARIS || DETECT_OS_HURD + # include + #elif DETECT_OS_OPENBSD || DETECT_OS_FREEBSD +@@ -123,93 +122,10 @@ os_log_message(const char *message) + #endif + } + +-#if DETECT_OS_ANDROID +-# include +-# include "hash_table.h" +-# include "ralloc.h" +-# include "simple_mtx.h" +- +-static struct hash_table *options_tbl; +- +-static void +-options_tbl_fini(void) +-{ +- _mesa_hash_table_destroy(options_tbl, NULL); +-} +- +-/** +- * Get an option value from android's property system, as a fallback to +- * getenv() (which is generally less useful on android due to processes +- * typically being forked from the zygote. +- * +- * The option name used for getenv is translated into a property name +- * by: +- * +- * 1) convert to lowercase +- * 2) replace '_' with '.' +- * 3) if necessary, prepend "mesa." +- * +- * For example: +- * - MESA_EXTENSION_OVERRIDE -> mesa.extension.override +- * - GALLIUM_HUD -> mesa.gallium.hud +- * +- * Note that we use a hashtable for two purposes: +- * 1) Avoid re-translating the option name on subsequent lookups +- * 2) Avoid leaking memory. Because property_get() returns the +- * property value into a user allocated buffer, we cannot return +- * that directly to the caller, so we need to strdup(). With the +- * hashtable, subsquent lookups can return the existing string. +- */ +-static const char * +-os_get_android_option(const char *name) +-{ +- if (!options_tbl) { +- options_tbl = _mesa_hash_table_create(NULL, _mesa_hash_string, +- _mesa_key_string_equal); +- atexit(options_tbl_fini); +- } +- +- struct hash_entry *entry = _mesa_hash_table_search(options_tbl, name); +- if (entry) { +- return entry->data; +- } +- +- char value[PROPERTY_VALUE_MAX]; +- char key[PROPERTY_KEY_MAX]; +- char *p = key, *end = key + PROPERTY_KEY_MAX; +- /* add "mesa." 
prefix if necessary: */ +- if (strstr(name, "MESA_") != name) +- p += strlcpy(p, "mesa.", end - p); +- p += strlcpy(p, name, end - p); +- for (int i = 0; key[i]; i++) { +- if (key[i] == '_') { +- key[i] = '.'; +- } else { +- key[i] = tolower(key[i]); +- } +- } +- +- const char *opt = NULL; +- int len = property_get(key, value, NULL); +- if (len > 1) { +- opt = ralloc_strdup(options_tbl, value); +- } +- +- _mesa_hash_table_insert(options_tbl, name, (void *)opt); +- +- return opt; +-} +-#endif +- + const char * + os_get_option(const char *name) + { + const char *opt = getenv(name); +-#if DETECT_OS_ANDROID +- if (!opt) { +- opt = os_get_android_option(name); +- } +-#endif + return opt; + } + diff --git a/src/util/perf/cpu_trace.h.rej b/src/util/perf/cpu_trace.h.rej new file mode 100644 index 00000000000..f1e688f3f4d --- /dev/null +++ b/src/util/perf/cpu_trace.h.rej @@ -0,0 +1,21 @@ +diff a/src/util/perf/cpu_trace.h b/src/util/perf/cpu_trace.h (rejected hunks) +@@ -27,19 +27,6 @@ + util_perfetto_trace_end(category); \ + } while (0) + +-/* NOTE: for now disable atrace for C++ to workaround a ndk bug with ordering +- * between stdatomic.h and atomic.h. See: +- * +- * https://github.com/android/ndk/issues/1178 +- */ +-#elif defined(ANDROID) && !defined(__cplusplus) +- +-#include +- +-#define _MESA_TRACE_BEGIN(category, name) \ +- atrace_begin(ATRACE_TAG_GRAPHICS, name) +-#define _MESA_TRACE_END(category) atrace_end(ATRACE_TAG_GRAPHICS) +- + #else + + #define _MESA_TRACE_BEGIN(category, name) diff --git a/src/util/stable_array.h b/src/util/stable_array.h new file mode 100644 index 00000000000..a590aa48a50 --- /dev/null +++ b/src/util/stable_array.h @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef STABLE_ARRAY_H +#define STABLE_ARRAY_H + +#include "util/simple_mtx.h" +#include "util/u_math.h" + +/* A thread-safe automatically growing array where elements have stable locations + * + * This data structure has these properties: + * + * 1. Accessing an element is constant time (if allocation is not required). + * + * 2. Elements are not moved in memory, so it is safe to store a pointer to + * something in a stable_array. + * + * 3. The data structure is thread-safe. To improve performance, there is + * also a fast path that does not require atomics. + * + * 4. 
Although the data structure is not lock-free, there is a limit on the + * number of times that a lock is ever acquired--a maximum of 32 times the + * number of accessing threads. In practice, contention will never be an + * issue for long-lived stable_arrays. + * + * 5. Memory usage is similar to util_dynarray, with each allocation being + * twice as large as the last. Freeing buckets is currently never done. + * + * The data structure is faster than util_sparse_array, but is not sparse. + */ + +struct stable_array +{ + uint8_t *buckets[32]; + simple_mtx_t lock; + size_t eltsize; +}; + +static inline void +stable_array_init_bytes(struct stable_array *buf, size_t eltsize) +{ + memset(buf, 0, sizeof(*buf)); + buf->eltsize = eltsize; + simple_mtx_init(&buf->lock, mtx_plain); +} + +static inline void +stable_array_fini(struct stable_array *buf) +{ + simple_mtx_destroy(&buf->lock); + for (unsigned i = 0; i < ARRAY_SIZE(buf->buckets); ++i) { + if (buf->buckets[i]) + free(buf->buckets[i]); + } +} + +struct stable_array_index +{ + unsigned bucket; + unsigned idx; +}; + +static inline struct stable_array_index +stable_array_get_index(unsigned idx) +{ + struct stable_array_index i = {0}; + i.bucket = util_logbase2(idx); + i.idx = i.bucket ? (idx -= (1 << i.bucket)) : idx; + return i; +} + +static inline void * +stable_array_get_bytes(struct stable_array *buf, unsigned idx, size_t eltsize) +{ + assert(eltsize == buf->eltsize); + + struct stable_array_index i = stable_array_get_index(idx); + + uint8_t *bucket = p_atomic_read(&buf->buckets[i.bucket]); + + if (!bucket) { + simple_mtx_lock(&buf->lock); + bucket = buf->buckets[i.bucket]; + + if (!bucket) { + /* The first two buckets both have two elements */ + bucket = (uint8_t *)calloc(1U << MAX2(i.bucket, 1), eltsize); + + p_atomic_set(&buf->buckets[i.bucket], bucket); + } + simple_mtx_unlock(&buf->lock); + } + + return bucket + eltsize * i.idx; +} + +static inline void * +stable_array_get_existing_bytes(struct stable_array *buf, unsigned idx, size_t eltsize) +{ + assert(eltsize == buf->eltsize); + + struct stable_array_index i = stable_array_get_index(idx); + + return buf->buckets[i.bucket] + eltsize * i.idx; +} + +#define stable_array_init(buf, type) stable_array_init_bytes((buf), sizeof(type)) +#define stable_array_get(buf, type, idx) ((type*)stable_array_get_bytes((buf), (idx), sizeof(type))) +#define stable_array_get_existing(buf, type, idx) ((type*)stable_array_get_existing_bytes((buf), (idx), sizeof(type))) + +#endif diff --git a/src/util/u_debug_stack_android.cpp.rej b/src/util/u_debug_stack_android.cpp.rej new file mode 100644 index 00000000000..ce8ce1ef853 --- /dev/null +++ b/src/util/u_debug_stack_android.cpp.rej @@ -0,0 +1,83 @@ +diff a/src/util/u_debug_stack_android.cpp b/src/util/u_debug_stack_android.cpp (rejected hunks) +@@ -21,7 +21,6 @@ + * IN THE SOFTWARE. + */ + +-#include + + #include "util/simple_mtx.h" + #include "util/u_debug.h" +@@ -52,56 +51,14 @@ debug_backtrace_capture(debug_stack_frame *backtrace, + unsigned start_frame, + unsigned nr_frames) + { +- Backtrace *bt; + +- if (!nr_frames) +- return; +- +- bt = Backtrace::Create(BACKTRACE_CURRENT_PROCESS, +- BACKTRACE_CURRENT_THREAD); +- if (bt == NULL) { +- for (unsigned i = 0; i < nr_frames; i++) +- backtrace[i].procname = NULL; +- return; +- } +- +- /* Add one to exclude this call. Unwind already ignores itself. 
*/ +- bt->Unwind(start_frame + 1); +- +- simple_mtx_lock(&table_mutex); +- +- for (unsigned i = 0; i < nr_frames; i++) { +- const backtrace_frame_data_t* frame = bt->GetFrame(i); +- if (frame) { +- backtrace[i].procname = intern_symbol(frame->func_name.c_str()); +- backtrace[i].start_ip = frame->pc; +- backtrace[i].off = frame->func_offset; +- backtrace[i].map = intern_symbol(frame->map.Name().c_str()); +- backtrace[i].map_off = frame->rel_pc; +- } else { +- backtrace[i].procname = NULL; +- } +- } +- +- simple_mtx_unlock(&table_mutex); +- +- delete bt; + } + + void + debug_backtrace_dump(const debug_stack_frame *backtrace, + unsigned nr_frames) + { +- for (unsigned i = 0; i < nr_frames; i++) { +- if (backtrace[i].procname) +- debug_printf( +- "%s(+0x%x)\t%012" PRIx64 ": %s+0x%x\n", +- backtrace[i].map, +- backtrace[i].map_off, +- backtrace[i].start_ip, +- backtrace[i].procname, +- backtrace[i].off); +- } ++ + } + + void +@@ -109,14 +66,5 @@ debug_backtrace_print(FILE *f, + const debug_stack_frame *backtrace, + unsigned nr_frames) + { +- for (unsigned i = 0; i < nr_frames; i++) { +- if (backtrace[i].procname) +- fprintf(f, +- "%s(+0x%x)\t%012" PRIx64 ": %s+0x%x\n", +- backtrace[i].map, +- backtrace[i].map_off, +- backtrace[i].start_ip, +- backtrace[i].procname, +- backtrace[i].off); +- } ++ + }
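Since several hunks above (pan_device.h, pan_props.c, pan_bo.c) switch the BO table from util_sparse_array to the new stable_array, here is a minimal usage sketch of the util/stable_array.h API added by this patch; it assumes the Mesa build environment that provides the simple_mtx and p_atomic helpers the header depends on:

#include "util/stable_array.h"

struct bo_record {
   int handle;
   const char *label;
};

static void
stable_array_example(void)
{
   struct stable_array table;
   stable_array_init(&table, struct bo_record);

   /* Elements are allocated zero-initialized on demand and never move,
    * so this pointer stays valid for the lifetime of the array. */
   struct bo_record *rec = stable_array_get(&table, struct bo_record, 42);
   rec->handle = 42;
   rec->label = "scratch";

   /* Fast path that skips the lock once the element is known to exist. */
   struct bo_record *again = stable_array_get_existing(&table, struct bo_record, 42);
   (void)again;

   stable_array_fini(&table);
}

The lock is only taken when a bucket has to be allocated, which is what keeps pan_lookup_bo() cheap on the common path while still giving BOs stable addresses.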