From 8bf5c3113bbc15ac5790f551791e04dd0c9e1f52 Mon Sep 17 00:00:00 2001 From: SolDev69 Date: Thu, 21 Dec 2023 18:15:08 -0500 Subject: [PATCH] more panfrost patches --- .../base/include/csf/mali_base_csf_kernel.h | 596 ++++++ .../base/include/csf/mali_gpu_csf_registers.h | 43 + .../base/include/csf/mali_kbase_csf_ioctl.h | 530 +++++ .../base/include/jm/mali_base_jm_kernel.h | 1051 +++++++++ .../base/include/jm/mali_kbase_jm_ioctl.h | 231 ++ .../base/include/mali_base_common_kernel.h | 231 ++ src/panfrost/base/include/mali_base_kernel.h | 700 ++++++ .../base/include/mali_kbase_gpuprops.h | 127 ++ src/panfrost/base/include/mali_kbase_ioctl.h | 759 +++++++ .../base/include/old/mali-ioctl-midgard.h | 80 + src/panfrost/base/include/old/mali-ioctl.h | 743 +++++++ src/panfrost/base/include/old/mali-props.h | 262 +++ src/panfrost/base/meson.build | 55 + src/panfrost/base/pan_base.c | 301 +++ src/panfrost/base/pan_base.h | 234 ++ src/panfrost/base/pan_base_noop.h | 152 ++ src/panfrost/base/pan_cache.h | 95 + src/panfrost/base/pan_vX_base.c | 1825 ++++++++++++++++ src/panfrost/csf_test/interpret.py | 1820 ++++++++++++++++ src/panfrost/csf_test/mali_base_csf_kernel.h | 721 +++++++ src/panfrost/csf_test/mali_base_kernel.h | 746 +++++++ .../csf_test/mali_gpu_csf_registers.h | 43 + src/panfrost/csf_test/mali_kbase_csf_ioctl.h | 483 +++++ src/panfrost/csf_test/mali_kbase_ioctl.h | 854 ++++++++ src/panfrost/csf_test/test.c | 1903 +++++++++++++++++ src/panfrost/lib/wrap.h | 7 +- src/panfrost/meson.build | 43 +- src/panfrost/midgard/disassemble.c | 5 +- 28 files changed, 14636 insertions(+), 4 deletions(-) create mode 100644 src/panfrost/base/include/csf/mali_base_csf_kernel.h create mode 100644 src/panfrost/base/include/csf/mali_gpu_csf_registers.h create mode 100644 src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h create mode 100644 src/panfrost/base/include/jm/mali_base_jm_kernel.h create mode 100644 src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h create mode 100644 src/panfrost/base/include/mali_base_common_kernel.h create mode 100644 src/panfrost/base/include/mali_base_kernel.h create mode 100644 src/panfrost/base/include/mali_kbase_gpuprops.h create mode 100644 src/panfrost/base/include/mali_kbase_ioctl.h create mode 100644 src/panfrost/base/include/old/mali-ioctl-midgard.h create mode 100644 src/panfrost/base/include/old/mali-ioctl.h create mode 100644 src/panfrost/base/include/old/mali-props.h create mode 100644 src/panfrost/base/meson.build create mode 100644 src/panfrost/base/pan_base.c create mode 100644 src/panfrost/base/pan_base.h create mode 100644 src/panfrost/base/pan_base_noop.h create mode 100644 src/panfrost/base/pan_cache.h create mode 100644 src/panfrost/base/pan_vX_base.c create mode 100644 src/panfrost/csf_test/interpret.py create mode 100644 src/panfrost/csf_test/mali_base_csf_kernel.h create mode 100644 src/panfrost/csf_test/mali_base_kernel.h create mode 100644 src/panfrost/csf_test/mali_gpu_csf_registers.h create mode 100644 src/panfrost/csf_test/mali_kbase_csf_ioctl.h create mode 100644 src/panfrost/csf_test/mali_kbase_ioctl.h create mode 100644 src/panfrost/csf_test/test.c diff --git a/src/panfrost/base/include/csf/mali_base_csf_kernel.h b/src/panfrost/base/include/csf/mali_base_csf_kernel.h new file mode 100644 index 00000000000..3b02350c08b --- /dev/null +++ b/src/panfrost/base/include/csf/mali_base_csf_kernel.h @@ -0,0 +1,596 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. 
+ * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_CSF_KERNEL_H_ +#define _UAPI_BASE_CSF_KERNEL_H_ + +#include +#include "../mali_base_common_kernel.h" + +/* Memory allocation, access/hint flags & mask specific to CSF GPU. + * + * See base_mem_alloc_flags. + */ + +/* Must be FIXED memory. */ +#define BASE_MEM_FIXED ((base_mem_alloc_flags)1 << 8) + +/* CSF event memory + * + * If Outer shareable coherence is not specified or not available, then on + * allocation kbase will automatically use the uncached GPU mapping. + * There is no need for the client to specify BASE_MEM_UNCACHED_GPU + * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. + * + * This memory requires a permanent mapping + * + * See also kbase_reg_needs_kernel_mapping() + */ +#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) + +#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) + + +/* Must be FIXABLE memory: its GPU VA will be determined at a later point, + * at which time it will be at a fixed GPU VA. + */ +#define BASE_MEM_FIXABLE ((base_mem_alloc_flags)1 << 29) + +/* Note that the number of bits used for base_mem_alloc_flags + * must be less than BASE_MEM_FLAGS_NR_BITS !!! + */ + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED BASE_MEM_RESERVED_BIT_20 + +/* Special base mem handles specific to CSF. + */ +#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) + +#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ + ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ + LOCAL_PAGE_SHIFT) + +/* Valid set of just-in-time memory allocation flags */ +#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) + +/* flags for base context specific to CSF */ + +/* Base context creates a CSF event notification thread. + * + * The creation of a CSF event notification thread is conditional but + * mandatory for the handling of CSF events. 
+ */ +#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | \ + BASE_CONTEXT_CSF_EVENT_THREAD | \ + BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* Flags for base tracepoint specific to CSF */ + +/* Enable KBase tracepoints for CSF builds */ +#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) + +/* Enable additional CSF Firmware side tracepoints */ +#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) + +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ + BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ + BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) + +/* Number of pages mapped into the process address space for a bound GPU + * command queue. A pair of input/output pages and a Hw doorbell page + * are mapped to enable direct submission of commands to Hw. + */ +#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) + +#define BASE_QUEUE_MAX_PRIORITY (15U) + +/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ +#define BASEP_EVENT_VAL_INDEX (0U) +#define BASEP_EVENT_ERR_INDEX (1U) + +/* The upper limit for number of objects that could be waited/set per command. + * This limit is now enforced as internally the error inherit inputs are + * converted to 32-bit flags in a __u32 variable occupying a previously padding + * field. + */ +#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) + +/* CSF CSI EXCEPTION_HANDLER_FLAGS */ +#define BASE_CSF_TILER_OOM_EXCEPTION_FLAG (1u << 0) +#define BASE_CSF_EXCEPTION_HANDLER_FLAGS_MASK (BASE_CSF_TILER_OOM_EXCEPTION_FLAG) + +/** + * enum base_kcpu_command_type - Kernel CPU queue command type. + * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, + * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, + * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, + * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, + * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, + * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, + * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, + */ +enum base_kcpu_command_type { + BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_SET, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, + BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, + BASE_KCPU_COMMAND_TYPE_JIT_FREE, + BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER +}; + +/** + * enum base_queue_group_priority - Priority of a GPU Command Queue Group. + * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time + * priority. 
+ * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group + * priority levels. + * + * Currently this is in order of highest to lowest, but if new levels are added + * then those new levels may be out of order to preserve the ABI compatibility + * with previous releases. At that point, ensure assignment to + * the 'priority' member in &kbase_queue_group is updated to ensure it remains + * a linear ordering. + * + * There should be no gaps in the enum, otherwise use of + * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. + */ +enum base_queue_group_priority { + BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, + BASE_QUEUE_GROUP_PRIORITY_MEDIUM, + BASE_QUEUE_GROUP_PRIORITY_LOW, + BASE_QUEUE_GROUP_PRIORITY_REALTIME, + BASE_QUEUE_GROUP_PRIORITY_COUNT +}; + +struct base_kcpu_command_fence_info { + __u64 fence; +}; + +struct base_cqs_wait_info { + __u64 addr; + __u32 val; + __u32 padding; +}; + +struct base_kcpu_command_cqs_wait_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +struct base_cqs_set { + __u64 addr; +}; + +struct base_kcpu_command_cqs_set_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * typedef basep_cqs_data_type - Enumeration of CQS Data Types + * + * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value + * is an unsigned 32-bit integer + * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value + * is an unsigned 64-bit integer + */ +typedef enum PACKED { + BASEP_CQS_DATA_TYPE_U32 = 0, + BASEP_CQS_DATA_TYPE_U64 = 1, +} basep_cqs_data_type; + +/** + * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait + * Operation conditions + * + * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Less than or Equal to + * the Wait Operation value + * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Greater than the Wait Operation value + */ +typedef enum { + BASEP_CQS_WAIT_OPERATION_LE = 0, + BASEP_CQS_WAIT_OPERATION_GT = 1, +} basep_cqs_wait_operation_op; + +struct base_cqs_wait_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information + * about the Timeline CQS wait objects + * + * @objs: An array of Timeline CQS waits. + * @nr_objs: Number of Timeline CQS waits in the array. + * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field + * to be served as the source for importing into the + * queue's error-state. + */ +struct base_kcpu_command_cqs_wait_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +/** + * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations + * + * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value + * to a synchronization object + * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value + * of a synchronization object + */ +typedef enum { + BASEP_CQS_SET_OPERATION_ADD = 0, + BASEP_CQS_SET_OPERATION_SET = 1, +} basep_cqs_set_operation_op; + +struct base_cqs_set_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_set_operation_info - structure which contains information + * about the Timeline CQS set objects + * + * @objs: An array of Timeline CQS sets. 
+ * @nr_objs: Number of Timeline CQS sets in the array. + * @padding: Structure padding, unused bytes. + */ +struct base_kcpu_command_cqs_set_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * struct base_kcpu_command_import_info - structure which contains information + * about the imported buffer. + * + * @handle: Address of imported user buffer. + */ +struct base_kcpu_command_import_info { + __u64 handle; +}; + +/** + * struct base_kcpu_command_jit_alloc_info - structure which contains + * information about jit memory allocation. + * + * @info: An array of elements of the + * struct base_jit_alloc_info type. + * @count: The number of elements in the info array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_alloc_info { + __u64 info; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_jit_free_info - structure which contains + * information about jit memory which is to be freed. + * + * @ids: An array containing the JIT IDs to free. + * @count: The number of elements in the ids array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_free_info { + __u64 ids; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_group_suspend_info - structure which contains + * suspend buffer data captured for a suspended queue group. + * + * @buffer: Pointer to an array of elements of the type char. + * @size: Number of elements in the @buffer array. + * @group_handle: Handle to the mapping of CSG. + * @padding: padding to a multiple of 64 bits. + */ +struct base_kcpu_command_group_suspend_info { + __u64 buffer; + __u32 size; + __u8 group_handle; + __u8 padding[3]; +}; + + +/** + * struct base_kcpu_command - kcpu command. + * @type: type of the kcpu command, one enum base_kcpu_command_type + * @padding: padding to a multiple of 64 bits + * @info: structure which contains information about the kcpu command; + * actual type is determined by @p type + * @info.fence: Fence + * @info.cqs_wait: CQS wait + * @info.cqs_set: CQS set + * @info.cqs_wait_operation: CQS wait operation + * @info.cqs_set_operation: CQS set operation + * @info.import: import + * @info.jit_alloc: JIT allocation + * @info.jit_free: JIT deallocation + * @info.suspend_buf_copy: suspend buffer copy + * @info.sample_time: sample time + * @info.padding: padding + */ +struct base_kcpu_command { + __u8 type; + __u8 padding[sizeof(__u64) - sizeof(__u8)]; + union { + struct base_kcpu_command_fence_info fence; + struct base_kcpu_command_cqs_wait_info cqs_wait; + struct base_kcpu_command_cqs_set_info cqs_set; + struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; + struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; + struct base_kcpu_command_import_info import; + struct base_kcpu_command_jit_alloc_info jit_alloc; + struct base_kcpu_command_jit_free_info jit_free; + struct base_kcpu_command_group_suspend_info suspend_buf_copy; + __u64 padding[2]; /* No sub-struct should be larger */ + } info; +}; + +/** + * struct basep_cs_stream_control - CSI capabilities. + * + * @features: Features of this stream + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_stream_control { + __u32 features; + __u32 padding; +}; + +/** + * struct basep_cs_group_control - CSG interface capabilities. 
+ * + * @features: Features of this group + * @stream_num: Number of streams in this group + * @suspend_size: Size in bytes of the suspend buffer for this group + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_group_control { + __u32 features; + __u32 stream_num; + __u32 suspend_size; + __u32 padding; +}; + +/** + * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault + * error information associated with GPU command queue group. + * + * @sideband: Additional information of the unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_group_error_fatal_payload { + __u64 sideband; + __u32 status; + __u32 padding; +}; + +/** + * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault + * error information related to GPU command queue. + * + * @sideband: Additional information about this unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). + * @csi_index: Index of the CSF interface the queue is bound to. + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_error_fatal_payload { + __u64 sideband; + __u32 status; + __u8 csi_index; + __u8 padding[3]; +}; + +/** + * enum base_gpu_queue_group_error_type - GPU Fatal error type. + * + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU + * command queue group. + * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU + * command queue. + * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with + * progress timeout. + * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out + * of tiler heap memory. + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types + * + * This type is used for &struct_base_gpu_queue_group_error.error_type. + */ +enum base_gpu_queue_group_error_type { + BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, + BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, + BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT +}; + +/** + * struct base_gpu_queue_group_error - Unrecoverable fault information + * @error_type: Error type of @base_gpu_queue_group_error_type + * indicating which field in union payload is filled + * @padding: Unused bytes for 64bit boundary + * @payload: Input Payload + * @payload.fatal_group: Unrecoverable fault error associated with + * GPU command queue group + * @payload.fatal_queue: Unrecoverable fault error associated with command queue + */ +struct base_gpu_queue_group_error { + __u8 error_type; + __u8 padding[7]; + union { + struct base_gpu_queue_group_error_fatal_payload fatal_group; + struct base_gpu_queue_error_fatal_payload fatal_queue; + } payload; +}; + +/** + * enum base_csf_notification_type - Notification type + * + * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event + * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal + * error + * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu + * queue + * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type + * + * This type is used for &struct_base_csf_notification.type. 
+ */ +enum base_csf_notification_type { + BASE_CSF_NOTIFICATION_EVENT = 0, + BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, + BASE_CSF_NOTIFICATION_COUNT +}; + +/** + * struct base_csf_notification - Event or error notification + * + * @type: Notification type of @base_csf_notification_type + * @padding: Padding for 64bit boundary + * @payload: Input Payload + * @payload.align: To fit the struct into a 64-byte cache line + * @payload.csg_error: CSG error + * @payload.csg_error.handle: Handle of GPU command queue group associated with + * fatal error + * @payload.csg_error.padding: Padding + * @payload.csg_error.error: Unrecoverable fault error + * + */ +struct base_csf_notification { + __u8 type; + __u8 padding[7]; + union { + struct { + __u8 handle; + __u8 padding[7]; + struct base_gpu_queue_group_error error; + } csg_error; + + __u8 align[56]; + } payload; +}; + +/** + * struct mali_base_gpu_core_props - GPU core props info + * + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to align to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + */ +struct mali_base_gpu_core_props { + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; +}; + +#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ diff --git a/src/panfrost/base/include/csf/mali_gpu_csf_registers.h b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h new file mode 100644 index 00000000000..17e338cb238 --- /dev/null +++ b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * This header was originally autogenerated, but it is now ok (and + * expected) to have to add to it. + */ + +#ifndef _UAPI_GPU_CSF_REGISTERS_H_ +#define _UAPI_GPU_CSF_REGISTERS_H_ + +/* Only user block defines are included. HI words have been removed */ + +/* CS_USER_INPUT_BLOCK register offsets */ +#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ +#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ + +/* CS_USER_OUTPUT_BLOCK register offsets */ +#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ +#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ + +/* USER register offsets */ +#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ + +#endif diff --git a/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h new file mode 100644 index 00000000000..db7252605f0 --- /dev/null +++ b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h @@ -0,0 +1,530 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_CSF_IOCTL_H_ +#define _UAPI_KBASE_CSF_IOCTL_H_ + +#include +#include + +/* + * 1.0: + * - CSF IOCTL header separated from JM + * 1.1: + * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 1.2: + * - Add new CSF GPU_FEATURES register into the property structure + * returned by KBASE_IOCTL_GET_GPUPROPS + * 1.3: + * - Add __u32 group_uid member to + * &struct_kbase_ioctl_cs_queue_group_create.out + * 1.4: + * - Replace padding in kbase_ioctl_cs_get_glb_iface with + * instr_features member of same size + * 1.5: + * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new + * queue registration call with extended format for supporting CS + * trace configurations with CSF trace_command. + * 1.6: + * - Added new HW performance counters interface to all GPUs. + * 1.7: + * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use + * 1.8: + * - Removed Kernel legacy HWC interface + * 1.9: + * - Reorganization of GPU-VA memory zones, including addition of + * FIXED_VA zone and auto-initialization of EXEC_VA zone. 
+ * - Added new Base memory allocation interface + * 1.10: + * - First release of new HW performance counters interface. + * 1.11: + * - Dummy model (no mali) backend will now clear HWC values after each sample + * 1.12: + * - Added support for incremental rendering flag in CSG create call + */ + +#define BASE_UK_VERSION_MAJOR 1 +#define BASE_UK_VERSION_MINOR 12 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + +/** + * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the + * base back-end + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * + * Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. + * Any change of this struct should also be mirrored to the latter. + */ +struct kbase_ioctl_cs_queue_register { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER \ + _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) + +/** + * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler + * to notify that a queue has been updated + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_kick { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_KICK \ + _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) + +/** + * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group + * + * @in: Input parameters + * @in.buffer_gpu_addr: GPU address of the buffer backing the queue + * @in.group_handle: Handle of the group to which the queue should be bound + * @in.csi_index: Index of the CSF interface the queue should be bound to + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.mmap_handle: Handle to be used for creating the mapping of CS + * input/output pages + */ +union kbase_ioctl_cs_queue_bind { + struct { + __u64 buffer_gpu_addr; + __u8 group_handle; + __u8 csi_index; + __u8 padding[6]; + } in; + struct { + __u64 mmap_handle; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_BIND \ + _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) + +/** + * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the + * base back-end in extended format, + * involving trace buffer configuration + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * @ex_offset_var_addr: GPU address of the trace buffer write offset variable + * @ex_buffer_base: Trace buffer GPU base address for the queue + * @ex_buffer_size: Size of the trace buffer in bytes + * @ex_event_size: Trace event write size, in log2 designation + * @ex_event_state: Trace event states configuration + * @ex_padding: Currently unused, must be zero + * + * Note: There is an identical sub-section at the start of this struct to that + * of @ref kbase_ioctl_cs_queue_register. 
Any change of this sub-section + * must also be mirrored to the latter. Following the said sub-section, + * the remaining fields forms the extension, marked with ex_*. + */ +struct kbase_ioctl_cs_queue_register_ex { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; + __u64 ex_offset_var_addr; + __u64 ex_buffer_base; + __u32 ex_buffer_size; + __u8 ex_event_size; + __u8 ex_event_state; + __u8 ex_padding[2]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ + _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) + +/** + * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_terminate { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) + +/** + * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue + * group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. + */ +union kbase_ioctl_cs_queue_group_create_1_6 { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ + _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) + +/** + * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.csi_handlers: Flags to signal that the application intends to use CSI + * exception handlers in some linear buffers to deal with + * the given exception types. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. 
+ */ +union kbase_ioctl_cs_queue_group_create { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 csi_handlers; + __u8 padding[2]; + /** + * @in.reserved: Reserved + */ + __u64 reserved; + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ + _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) + +/** + * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group + * + * @group_handle: Handle of the queue group to be terminated + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_cs_queue_group_term { + __u8 group_handle; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) + +#define KBASE_IOCTL_CS_EVENT_SIGNAL \ + _IO(KBASE_IOCTL_TYPE, 44) + +typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ + +/** + * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue + * + * @id: ID of the new command queue returned by the kernel + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_new { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ + _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) + +/** + * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue + * + * @id: ID of the command queue to be destroyed + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_delete { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ + _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) + +/** + * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue + * + * @addr: Memory address of an array of struct base_kcpu_queue_command + * @nr_commands: Number of commands in the array + * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_enqueue { + __u64 addr; + __u32 nr_commands; + base_kcpu_queue_id id; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ + _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) + +/** + * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap + * @in: Input parameters + * @in.chunk_size: Size of each chunk. + * @in.initial_chunks: Initial number of chunks that heap will be created with. + * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. + * @in.target_in_flight: Number of render-passes that the driver should attempt to + * keep in flight for which allocation of new chunks is + * allowed. + * @in.group_id: Group ID to be used for physical allocations. + * @in.padding: Padding + * @out: Output parameters + * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up + * for the heap. + * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, + * actually points to the header of heap chunk and not to + * the low address of free memory in the chunk. 
+ */ +union kbase_ioctl_cs_tiler_heap_init { + struct { + __u32 chunk_size; + __u32 initial_chunks; + __u32 max_chunks; + __u16 target_in_flight; + __u8 group_id; + __u8 padding; + } in; + struct { + __u64 gpu_heap_va; + __u64 first_chunk_va; + } out; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ + _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) + +/** + * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap + * instance + * + * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. + */ +struct kbase_ioctl_cs_tiler_heap_term { + __u64 gpu_heap_va; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ + _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) + +/** + * union kbase_ioctl_cs_get_glb_iface - Request the global control block + * of CSF interface capabilities + * + * @in: Input parameters + * @in.max_group_num: The maximum number of groups to be read. Can be 0, in + * which case groups_ptr is unused. + * @in.max_total_stream_num: The maximum number of CSs to be read. Can be 0, in + * which case streams_ptr is unused. + * @in.groups_ptr: Pointer where to store all the group data (sequentially). + * @in.streams_ptr: Pointer where to store all the CS data (sequentially). + * @out: Output parameters + * @out.glb_version: Global interface version. + * @out.features: Bit mask of features (e.g. whether certain types of job + * can be suspended). + * @out.group_num: Number of CSGs supported. + * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 + * hold the size of firmware performance counter data + * and 15:0 hold the size of hardware performance counter + * data. + * @out.total_stream_num: Total number of CSs, summed across all groups. + * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum + * size of events. Bits 3:0 hold the offset update rate. + * (csf >= 1.1.0) + * + */ +union kbase_ioctl_cs_get_glb_iface { + struct { + __u32 max_group_num; + __u32 max_total_stream_num; + __u64 groups_ptr; + __u64 streams_ptr; + } in; + struct { + __u32 glb_version; + __u32 features; + __u32 group_num; + __u32 prfcnt_size; + __u32 total_stream_num; + __u32 instr_features; + } out; +}; + +#define KBASE_IOCTL_CS_GET_GLB_IFACE \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) + +struct kbase_ioctl_cs_cpu_queue_info { + __u64 buffer; + __u64 size; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ + _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) + +/** + * union kbase_ioctl_mem_alloc_ex - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @in.fixed_address: The GPU virtual address requested for the allocation, + * if the allocation is using the BASE_MEM_FIXED flag. + * @in.extra: Space for extra parameters that may be added in the future. 
+ * @out: Output parameters + * @out.flags: Flags + * @out.gpu_va: The GPU virtual address which is allocated + */ +union kbase_ioctl_mem_alloc_ex { + struct { + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u64 flags; + __u64 fixed_address; + __u64 extra[3]; + } in; + struct { + __u64 flags; + __u64 gpu_va; + } out; +}; + +#define KBASE_IOCTL_MEM_ALLOC_EX _IOWR(KBASE_IOCTL_TYPE, 59, union kbase_ioctl_mem_alloc_ex) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +/** + * struct kbase_ioctl_cs_event_memory_write - Write an event memory address + * @cpu_addr: Memory address to write + * @value: Value to write + * @padding: Currently unused, must be zero + */ +struct kbase_ioctl_cs_event_memory_write { + __u64 cpu_addr; + __u8 value; + __u8 padding[7]; +}; + +/** + * union kbase_ioctl_cs_event_memory_read - Read an event memory address + * @in: Input parameters + * @in.cpu_addr: Memory address to read + * @out: Output parameters + * @out.value: Value read + * @out.padding: Currently unused, must be zero + */ +union kbase_ioctl_cs_event_memory_read { + struct { + __u64 cpu_addr; + } in; + struct { + __u8 value; + __u8 padding[7]; + } out; +}; + +#endif /* MALI_UNIT_TEST */ + +#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ diff --git a/src/panfrost/base/include/jm/mali_base_jm_kernel.h b/src/panfrost/base/include/jm/mali_base_jm_kernel.h new file mode 100644 index 00000000000..ae43908b936 --- /dev/null +++ b/src/panfrost/base/include/jm/mali_base_jm_kernel.h @@ -0,0 +1,1051 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_JM_KERNEL_H_ +#define _UAPI_BASE_JM_KERNEL_H_ + +#include +#include "../mali_base_common_kernel.h" + +/* Memory allocation, access/hint flags & mask specific to JM GPU. + * + * See base_mem_alloc_flags. + */ + +/* Used as BASE_MEM_FIXED in other backends */ +#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) + +/** + * BASE_MEM_RESERVED_BIT_19 - Bit 19 is reserved. 
+ * + * Do not remove, use the next unreserved bit for new flags + */ +#define BASE_MEM_RESERVED_BIT_19 ((base_mem_alloc_flags)1 << 19) + +/** + * BASE_MEM_TILER_ALIGN_TOP - Memory starting from the end of the initial commit is aligned + * to 'extension' pages, where 'extension' must be a power of 2 and no more than + * BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES + */ +#define BASE_MEM_TILER_ALIGN_TOP ((base_mem_alloc_flags)1 << 20) + +/* Use the GPU VA chosen by the kernel client */ +#define BASE_MEM_FLAG_MAP_FIXED ((base_mem_alloc_flags)1 << 27) + +/* Force trimming of JIT allocations when creating a new allocation */ +#define BASEP_MEM_PERFORM_JIT_TRIM ((base_mem_alloc_flags)1 << 29) + +/* Note that the number of bits used for base_mem_alloc_flags + * must be less than BASE_MEM_FLAGS_NR_BITS !!! + */ + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE | \ + BASE_MEM_FLAG_MAP_FIXED | BASEP_MEM_PERFORM_JIT_TRIM) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED \ + (BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_19) + + +/* Similar to BASE_MEM_TILER_ALIGN_TOP, memory starting from the end of the + * initial commit is aligned to 'extension' pages, where 'extension' must be a power + * of 2 and no more than BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES + */ +#define BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP (1 << 0) + +/** + * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE - If set, the heap info address points + * to a __u32 holding the used size in bytes; + * otherwise it points to a __u64 holding the lowest address of unused memory. + */ +#define BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE (1 << 1) + +/** + * BASE_JIT_ALLOC_VALID_FLAGS - Valid set of just-in-time memory allocation flags + * + * Note: BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE cannot be set if heap_info_gpu_addr + * in %base_jit_alloc_info is 0 (atom with BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE set + * and heap_info_gpu_addr being 0 will be rejected). + */ +#define BASE_JIT_ALLOC_VALID_FLAGS \ + (BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP | BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* + * Private flags used on the base context + * + * These start at bit 31, and run down to zero. + * + * They share the same space as base_context_create_flags, and so must + * not collide with them. + */ + +/* Private flag tracking whether job descriptor dumping is disabled */ +#define BASEP_CONTEXT_FLAG_JOB_DUMP_DISABLED \ + ((base_context_create_flags)(1 << 31)) + +/* Flags for base tracepoint specific to JM */ +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED) +/* + * Dependency stuff, keep it private for now. May want to expose it if + * we decide to make the number of semaphores a configurable + * option. + */ +#define BASE_JD_ATOM_COUNT 256 + +/* Maximum number of concurrent render passes. 
+ */ +#define BASE_JD_RP_COUNT (256) + +/* Set/reset values for a software event */ +#define BASE_JD_SOFT_EVENT_SET ((unsigned char)1) +#define BASE_JD_SOFT_EVENT_RESET ((unsigned char)0) + +/** + * struct base_jd_udata - Per-job data + * + * @blob: per-job data array + * + * This structure is used to store per-job data, and is completely unused + * by the Base driver. It can be used to store things such as callback + * function pointer, data to handle job completion. It is guaranteed to be + * untouched by the Base driver. + */ +struct base_jd_udata { + __u64 blob[2]; +}; + +/** + * typedef base_jd_dep_type - Job dependency type. + * + * A flags field will be inserted into the atom structure to specify whether a + * dependency is a data or ordering dependency (by putting it before/after + * 'core_req' in the structure it should be possible to add without changing + * the structure size). + * When the flag is set for a particular dependency to signal that it is an + * ordering only dependency then errors will not be propagated. + */ +typedef __u8 base_jd_dep_type; + +#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ +#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ +#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ + +/** + * typedef base_jd_core_req - Job chain hardware requirements. + * + * A job chain must specify what GPU features it needs to allow the + * driver to schedule the job correctly. By not specifying the + * correct settings can/will cause an early job termination. Multiple + * values can be ORed together to specify multiple requirements. + * Special case is ::BASE_JD_REQ_DEP, which is used to express complex + * dependencies, and that doesn't execute anything on the hardware. + */ +typedef __u32 base_jd_core_req; + +/* Requirements that come from the HW */ + +/* No requirement, dependency only + */ +#define BASE_JD_REQ_DEP ((base_jd_core_req)0) + +/* Requires fragment shaders + */ +#define BASE_JD_REQ_FS ((base_jd_core_req)1 << 0) + +/* Requires compute shaders + * + * This covers any of the following GPU job types: + * - Vertex Shader Job + * - Geometry Shader Job + * - An actual Compute Shader Job + * + * Compare this with BASE_JD_REQ_ONLY_COMPUTE, which specifies that the + * job is specifically just the "Compute Shader" job type, and not the "Vertex + * Shader" nor the "Geometry Shader" job type. + */ +#define BASE_JD_REQ_CS ((base_jd_core_req)1 << 1) + +/* Requires tiling */ +#define BASE_JD_REQ_T ((base_jd_core_req)1 << 2) + +/* Requires cache flushes */ +#define BASE_JD_REQ_CF ((base_jd_core_req)1 << 3) + +/* Requires value writeback */ +#define BASE_JD_REQ_V ((base_jd_core_req)1 << 4) + +/* SW-only requirements - the HW does not expose these as part of the job slot + * capabilities + */ + +/* Requires fragment job with AFBC encoding */ +#define BASE_JD_REQ_FS_AFBC ((base_jd_core_req)1 << 13) + +/* SW-only requirement: coalesce completion events. + * If this bit is set then completion of this atom will not cause an event to + * be sent to userspace, whether successful or not; completion events will be + * deferred until an atom completes which does not have this bit set. + * + * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. + */ +#define BASE_JD_REQ_EVENT_COALESCE ((base_jd_core_req)1 << 5) + +/* SW Only requirement: the job chain requires a coherent core group. We don't + * mind which coherent core group is used. 
+ */ +#define BASE_JD_REQ_COHERENT_GROUP ((base_jd_core_req)1 << 6) + +/* SW Only requirement: The performance counters should be enabled only when + * they are needed, to reduce power consumption. + */ +#define BASE_JD_REQ_PERMON ((base_jd_core_req)1 << 7) + +/* SW Only requirement: External resources are referenced by this atom. + * + * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE and + * BASE_JD_REQ_SOFT_EVENT_WAIT. + */ +#define BASE_JD_REQ_EXTERNAL_RESOURCES ((base_jd_core_req)1 << 8) + +/* SW Only requirement: Software defined job. Jobs with this bit set will not be + * submitted to the hardware but will cause some action to happen within the + * driver + */ +#define BASE_JD_REQ_SOFT_JOB ((base_jd_core_req)1 << 9) + +#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) +#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) +#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) + +/* 0x4 RESERVED for now */ + +/* SW only requirement: event wait/trigger job. + * + * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. + * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the + * other waiting jobs. It completes immediately. + * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it + * possible for other jobs to wait upon. It completes immediately. + */ +#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) +#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) +#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) + +#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) + +/* SW only requirement: Just In Time allocation + * + * This job requests a single or multiple just-in-time allocations through a + * list of base_jit_alloc_info structure which is passed via the jc element of + * the atom. The number of base_jit_alloc_info structures present in the + * list is passed via the nr_extres element of the atom + * + * It should be noted that the id entry in base_jit_alloc_info must not + * be reused until it has been released via BASE_JD_REQ_SOFT_JIT_FREE. + * + * Should this soft job fail it is expected that a BASE_JD_REQ_SOFT_JIT_FREE + * soft job to free the JIT allocation is still made. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) + +/* SW only requirement: Just In Time free + * + * This job requests a single or multiple just-in-time allocations created by + * BASE_JD_REQ_SOFT_JIT_ALLOC to be freed. The ID list of the just-in-time + * allocations is passed via the jc element of the atom. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) + +/* SW only requirement: Map external resource + * + * This job requests external resource(s) are mapped once the dependencies + * of the job have been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) + +/* SW only requirement: Unmap external resource + * + * This job requests external resource(s) are unmapped once the dependencies + * of the job has been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * base_external_resource_list. 
+ */ +#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) + +/* HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) + * + * This indicates that the Job Chain contains GPU jobs of the 'Compute + * Shaders' type. + * + * In contrast to BASE_JD_REQ_CS, this does not indicate that the Job + * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. + */ +#define BASE_JD_REQ_ONLY_COMPUTE ((base_jd_core_req)1 << 10) + +/* HW Requirement: Use the base_jd_atom::device_nr field to specify a + * particular core group + * + * If both BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag + * takes priority + * + * This is only guaranteed to work for BASE_JD_REQ_ONLY_COMPUTE atoms. + */ +#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((base_jd_core_req)1 << 11) + +/* SW Flag: If this bit is set then the successful completion of this atom + * will not cause an event to be sent to userspace + */ +#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((base_jd_core_req)1 << 12) + +/* SW Flag: If this bit is set then completion of this atom will not cause an + * event to be sent to userspace, whether successful or not. + */ +#define BASEP_JD_REQ_EVENT_NEVER ((base_jd_core_req)1 << 14) + +/* SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job starts which does not have this bit set or a job completes + * which does not have the BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use + * if the CPU may have written to memory addressed by the job since the last job + * without this bit set was submitted. + */ +#define BASE_JD_REQ_SKIP_CACHE_START ((base_jd_core_req)1 << 15) + +/* SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job completes which does not have this bit set or a job starts + * which does not have the BASE_JD_REQ_SKIP_CACHE_START bit set. Do not use + * if the CPU may read from or partially overwrite memory addressed by the job + * before the next job without this bit set completes. + */ +#define BASE_JD_REQ_SKIP_CACHE_END ((base_jd_core_req)1 << 16) + +/* Request the atom be executed on a specific job slot. + * + * When this flag is specified, it takes precedence over any existing job slot + * selection logic. + */ +#define BASE_JD_REQ_JOB_SLOT ((base_jd_core_req)1 << 17) + +/* SW-only requirement: The atom is the start of a renderpass. + * + * If this bit is set then the job chain will be soft-stopped if it causes the + * GPU to write beyond the end of the physical pages backing the tiler heap, and + * committing more memory to the heap would exceed an internal threshold. It may + * be resumed after running one of the job chains attached to an atom with + * BASE_JD_REQ_END_RENDERPASS set and the same renderpass ID. It may be + * resumed multiple times until it completes without memory usage exceeding the + * threshold. + * + * Usually used with BASE_JD_REQ_T. + */ +#define BASE_JD_REQ_START_RENDERPASS ((base_jd_core_req)1 << 18) + +/* SW-only requirement: The atom is the end of a renderpass. + * + * If this bit is set then the atom incorporates the CPU address of a + * base_jd_fragment object instead of the GPU address of a job chain. 
+ * + * Which job chain is run depends upon whether the atom with the same renderpass + * ID and the BASE_JD_REQ_START_RENDERPASS bit set completed normally or + * was soft-stopped when it exceeded an upper threshold for tiler heap memory + * usage. + * + * It also depends upon whether one of the job chains attached to the atom has + * already been run as part of the same renderpass (in which case it would have + * written unresolved multisampled and otherwise-discarded output to temporary + * buffers that need to be read back). The job chain for doing a forced read and + * forced write (from/to temporary buffers) is run as many times as necessary. + * + * Usually used with BASE_JD_REQ_FS. + */ +#define BASE_JD_REQ_END_RENDERPASS ((base_jd_core_req)1 << 19) + +/* SW-only requirement: The atom needs to run on a limited core mask affinity. + * + * If this bit is set then the kbase_context.limited_core_mask will be applied + * to the affinity. + */ +#define BASE_JD_REQ_LIMITED_CORE_MASK ((base_jd_core_req)1 << 20) + +/* These requirement bits are currently unused in base_jd_core_req + */ +#define BASEP_JD_REQ_RESERVED \ + (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ + BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | BASEP_JD_REQ_EVENT_NEVER | \ + BASE_JD_REQ_EVENT_COALESCE | \ + BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ + BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ + BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END | \ + BASE_JD_REQ_JOB_SLOT | BASE_JD_REQ_START_RENDERPASS | \ + BASE_JD_REQ_END_RENDERPASS | BASE_JD_REQ_LIMITED_CORE_MASK)) + +/* Mask of all bits in base_jd_core_req that control the type of the atom. + * + * This allows dependency only atoms to have flags set + */ +#define BASE_JD_REQ_ATOM_TYPE \ + (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ + BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) + +/** + * BASE_JD_REQ_SOFT_JOB_TYPE - Mask of all bits in base_jd_core_req that + * controls the type of a soft job. + */ +#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) + +/* Returns non-zero value if core requirements passed define a soft job or + * a dependency only job. + */ +#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ + (((core_req) & BASE_JD_REQ_SOFT_JOB) || \ + ((core_req) & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) + +/** + * enum kbase_jd_atom_state - Atom states + * + * @KBASE_JD_ATOM_STATE_UNUSED: Atom is not used. + * @KBASE_JD_ATOM_STATE_QUEUED: Atom is queued in JD. + * @KBASE_JD_ATOM_STATE_IN_JS: Atom has been given to JS (is runnable/running). + * @KBASE_JD_ATOM_STATE_HW_COMPLETED: Atom has been completed, but not yet + * handed back to job dispatcher for + * dependency resolution. + * @KBASE_JD_ATOM_STATE_COMPLETED: Atom has been completed, but not yet handed + * back to userspace. + */ +enum kbase_jd_atom_state { + KBASE_JD_ATOM_STATE_UNUSED, + KBASE_JD_ATOM_STATE_QUEUED, + KBASE_JD_ATOM_STATE_IN_JS, + KBASE_JD_ATOM_STATE_HW_COMPLETED, + KBASE_JD_ATOM_STATE_COMPLETED +}; + +/** + * typedef base_atom_id - Type big enough to store an atom number in. + */ +typedef __u8 base_atom_id; + +/** + * struct base_dependency - base dependency + * + * @atom_id: An atom number + * @dependency_type: Dependency type + */ +struct base_dependency { + base_atom_id atom_id; + base_jd_dep_type dependency_type; +}; + +/** + * struct base_jd_fragment - Set of GPU fragment job chains used for rendering. + * + * @norm_read_norm_write: Job chain for full rendering. 
+ * GPU address of a fragment job chain to render in the + * circumstance where the tiler job chain did not exceed + * its memory usage threshold and no fragment job chain + * was previously run for the same renderpass. + * It is used no more than once per renderpass. + * @norm_read_forced_write: Job chain for starting incremental + * rendering. + * GPU address of a fragment job chain to render in + * the circumstance where the tiler job chain exceeded + * its memory usage threshold for the first time and + * no fragment job chain was previously run for the + * same renderpass. + * Writes unresolved multisampled and normally- + * discarded output to temporary buffers that must be + * read back by a subsequent forced_read job chain + * before the renderpass is complete. + * It is used no more than once per renderpass. + * @forced_read_forced_write: Job chain for continuing incremental + * rendering. + * GPU address of a fragment job chain to render in + * the circumstance where the tiler job chain + * exceeded its memory usage threshold again + * and a fragment job chain was previously run for + * the same renderpass. + * Reads unresolved multisampled and + * normally-discarded output from temporary buffers + * written by a previous forced_write job chain and + * writes the same to temporary buffers again. + * It is used as many times as required until + * rendering completes. + * @forced_read_norm_write: Job chain for ending incremental rendering. + * GPU address of a fragment job chain to render in the + * circumstance where the tiler job chain did not + * exceed its memory usage threshold this time and a + * fragment job chain was previously run for the same + * renderpass. + * Reads unresolved multisampled and normally-discarded + * output from temporary buffers written by a previous + * forced_write job chain in order to complete a + * renderpass. + * It is used no more than once per renderpass. + * + * This structure is referenced by the main atom structure if + * BASE_JD_REQ_END_RENDERPASS is set in the base_jd_core_req. + */ +struct base_jd_fragment { + __u64 norm_read_norm_write; + __u64 norm_read_forced_write; + __u64 forced_read_forced_write; + __u64 forced_read_norm_write; +}; + +/** + * typedef base_jd_prio - Base Atom priority. + * + * Only certain priority levels are actually implemented, as specified by the + * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority + * level that is not one of those defined below. + * + * Priority levels only affect scheduling after the atoms have had dependencies + * resolved. For example, a low priority atom that has had its dependencies + * resolved might run before a higher priority atom that has not had its + * dependencies resolved. + * + * In general, fragment atoms do not affect non-fragment atoms with + * lower priorities, and vice versa. One exception is that there is only one + * priority value for each context. So a high-priority (e.g.) fragment atom + * could increase its context priority, causing its non-fragment atoms to also + * be scheduled sooner. 
+ * + * The atoms are scheduled as follows with respect to their priorities: + * * Let atoms 'X' and 'Y' be for the same job slot who have dependencies + * resolved, and atom 'X' has a higher priority than atom 'Y' + * * If atom 'Y' is currently running on the HW, then it is interrupted to + * allow atom 'X' to run soon after + * * If instead neither atom 'Y' nor atom 'X' are running, then when choosing + * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' + * * Any two atoms that have the same priority could run in any order with + * respect to each other. That is, there is no ordering constraint between + * atoms of the same priority. + * + * The sysfs file 'js_ctx_scheduling_mode' is used to control how atoms are + * scheduled between contexts. The default value, 0, will cause higher-priority + * atoms to be scheduled first, regardless of their context. The value 1 will + * use a round-robin algorithm when deciding which context's atoms to schedule + * next, so higher-priority atoms can only preempt lower priority atoms within + * the same context. See KBASE_JS_SYSTEM_PRIORITY_MODE and + * KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE for more details. + */ +typedef __u8 base_jd_prio; + +/* Medium atom priority. This is a priority higher than BASE_JD_PRIO_LOW */ +#define BASE_JD_PRIO_MEDIUM ((base_jd_prio)0) +/* High atom priority. This is a priority higher than BASE_JD_PRIO_MEDIUM and + * BASE_JD_PRIO_LOW + */ +#define BASE_JD_PRIO_HIGH ((base_jd_prio)1) +/* Low atom priority. */ +#define BASE_JD_PRIO_LOW ((base_jd_prio)2) +/* Real-Time atom priority. This is a priority higher than BASE_JD_PRIO_HIGH, + * BASE_JD_PRIO_MEDIUM, and BASE_JD_PRIO_LOW + */ +#define BASE_JD_PRIO_REALTIME ((base_jd_prio)3) + +/* Invalid atom priority (max uint8_t value) */ +#define BASE_JD_PRIO_INVALID ((base_jd_prio)255) + +/* Count of the number of priority levels. This itself is not a valid + * base_jd_prio setting + */ +#define BASE_JD_NR_PRIO_LEVELS 4 + +/** + * struct base_jd_atom_v2 - Node of a dependency graph used to submit a + * GPU job chain or soft-job to the kernel driver. + * + * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS + * is set in the base_jd_core_req) the CPU address of a + * base_jd_fragment object. + * @udata: User data. + * @extres_list: List of external resources. + * @nr_extres: Number of external resources or JIT allocations. + * @jit_id: Zero-terminated array of IDs of just-in-time memory + * allocations written to by the atom. When the atom + * completes, the value stored at the + * &struct_base_jit_alloc_info.heap_info_gpu_addr of + * each allocation is read in order to enforce an + * overall physical memory usage limit. + * @pre_dep: Pre-dependencies. One need to use SETTER function to assign + * this field; this is done in order to reduce possibility of + * improper assignment of a dependency field. + * @atom_number: Unique number to identify the atom. + * @prio: Atom priority. Refer to base_jd_prio for more details. + * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + * specified. + * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. + * @core_req: Core requirements. + * @renderpass_id: Renderpass identifier used to associate an atom that has + * BASE_JD_REQ_START_RENDERPASS set in its core requirements + * with an atom that has BASE_JD_REQ_END_RENDERPASS set. + * @padding: Unused. Must be zero. + * + * This structure has changed since UK 10.2 for which base_jd_core_req was a + * __u16 value. 
+ * + * In UK 10.3 a core_req field of a __u32 type was added to the end of the + * structure, and the place in the structure previously occupied by __u16 + * core_req was kept but renamed to compat_core_req. + * + * From UK 11.20 - compat_core_req is now occupied by __u8 jit_id[2]. + * Compatibility with UK 10.x from UK 11.y is not handled because + * the major version increase prevents this. + * + * For UK 11.20 jit_id[2] must be initialized to zero. + */ +struct base_jd_atom_v2 { + __u64 jc; + struct base_jd_udata udata; + __u64 extres_list; + __u16 nr_extres; + __u8 jit_id[2]; + struct base_dependency pre_dep[2]; + base_atom_id atom_number; + base_jd_prio prio; + __u8 device_nr; + __u8 jobslot; + base_jd_core_req core_req; + __u8 renderpass_id; + __u8 padding[7]; +}; + +/** + * struct base_jd_atom - Same as base_jd_atom_v2, but has an extra seq_nr + * at the beginning. + * + * @seq_nr: Sequence number of logical grouping of atoms. + * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS + * is set in the base_jd_core_req) the CPU address of a + * base_jd_fragment object. + * @udata: User data. + * @extres_list: List of external resources. + * @nr_extres: Number of external resources or JIT allocations. + * @jit_id: Zero-terminated array of IDs of just-in-time memory + * allocations written to by the atom. When the atom + * completes, the value stored at the + * &struct_base_jit_alloc_info.heap_info_gpu_addr of + * each allocation is read in order to enforce an + * overall physical memory usage limit. + * @pre_dep: Pre-dependencies. One need to use SETTER function to assign + * this field; this is done in order to reduce possibility of + * improper assignment of a dependency field. + * @atom_number: Unique number to identify the atom. + * @prio: Atom priority. Refer to base_jd_prio for more details. + * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + * specified. + * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. + * @core_req: Core requirements. + * @renderpass_id: Renderpass identifier used to associate an atom that has + * BASE_JD_REQ_START_RENDERPASS set in its core requirements + * with an atom that has BASE_JD_REQ_END_RENDERPASS set. + * @padding: Unused. Must be zero. + */ +typedef struct base_jd_atom { + __u64 seq_nr; + __u64 jc; + struct base_jd_udata udata; + __u64 extres_list; + __u16 nr_extres; + __u8 jit_id[2]; + struct base_dependency pre_dep[2]; + base_atom_id atom_number; + base_jd_prio prio; + __u8 device_nr; + __u8 jobslot; + base_jd_core_req core_req; + __u8 renderpass_id; + __u8 padding[7]; +} base_jd_atom; + +/* Job chain event code bits + * Defines the bits used to create ::base_jd_event_code + */ +enum { + BASE_JD_SW_EVENT_KERNEL = (1u << 15), /* Kernel side event */ + BASE_JD_SW_EVENT = (1u << 14), /* SW defined event */ + /* Event indicates success (SW events only) */ + BASE_JD_SW_EVENT_SUCCESS = (1u << 13), + BASE_JD_SW_EVENT_JOB = (0u << 11), /* Job related event */ + BASE_JD_SW_EVENT_BAG = (1u << 11), /* Bag related event */ + BASE_JD_SW_EVENT_INFO = (2u << 11), /* Misc/info event */ + BASE_JD_SW_EVENT_RESERVED = (3u << 11), /* Reserved event type */ + /* Mask to extract the type from an event code */ + BASE_JD_SW_EVENT_TYPE_MASK = (3u << 11) +}; + +/** + * enum base_jd_event_code - Job chain event codes + * + * @BASE_JD_EVENT_RANGE_HW_NONFAULT_START: Start of hardware non-fault status + * codes. + * Obscurely, BASE_JD_EVENT_TERMINATED + * indicates a real fault, because the + * job was hard-stopped. 
+ * @BASE_JD_EVENT_NOT_STARTED: Can't be seen by userspace, treated as + * 'previous job done'. + * @BASE_JD_EVENT_STOPPED: Can't be seen by userspace, becomes + * TERMINATED, DONE or JOB_CANCELLED. + * @BASE_JD_EVENT_TERMINATED: This is actually a fault status code - the job + * was hard stopped. + * @BASE_JD_EVENT_ACTIVE: Can't be seen by userspace, jobs only returned on + * complete/fail/cancel. + * @BASE_JD_EVENT_RANGE_HW_NONFAULT_END: End of hardware non-fault status codes. + * Obscurely, BASE_JD_EVENT_TERMINATED + * indicates a real fault, + * because the job was hard-stopped. + * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START: Start of hardware fault and + * software error status codes. + * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END: End of hardware fault and + * software error status codes. + * @BASE_JD_EVENT_RANGE_SW_SUCCESS_START: Start of software success status + * codes. + * @BASE_JD_EVENT_RANGE_SW_SUCCESS_END: End of software success status codes. + * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_START: Start of kernel-only status codes. + * Such codes are never returned to + * user-space. + * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_END: End of kernel-only status codes. + * @BASE_JD_EVENT_DONE: atom has completed successfully + * @BASE_JD_EVENT_JOB_CONFIG_FAULT: Atom dependencies configuration error which + * shall result in a failed atom + * @BASE_JD_EVENT_JOB_POWER_FAULT: The job could not be executed because the + * part of the memory system required to access + * job descriptors was not powered on + * @BASE_JD_EVENT_JOB_READ_FAULT: Reading a job descriptor into the Job + * manager failed + * @BASE_JD_EVENT_JOB_WRITE_FAULT: Writing a job descriptor from the Job + * manager failed + * @BASE_JD_EVENT_JOB_AFFINITY_FAULT: The job could not be executed because the + * specified affinity mask does not intersect + * any available cores + * @BASE_JD_EVENT_JOB_BUS_FAULT: A bus access failed while executing a job + * @BASE_JD_EVENT_INSTR_INVALID_PC: A shader instruction with an illegal program + * counter was executed. + * @BASE_JD_EVENT_INSTR_INVALID_ENC: A shader instruction with an illegal + * encoding was executed. + * @BASE_JD_EVENT_INSTR_TYPE_MISMATCH: A shader instruction was executed where + * the instruction encoding did not match the + * instruction type encoded in the program + * counter. + * @BASE_JD_EVENT_INSTR_OPERAND_FAULT: A shader instruction was executed that + * contained invalid combinations of operands. + * @BASE_JD_EVENT_INSTR_TLS_FAULT: A shader instruction was executed that tried + * to access the thread local storage section + * of another thread. + * @BASE_JD_EVENT_INSTR_ALIGN_FAULT: A shader instruction was executed that + * tried to do an unsupported unaligned memory + * access. + * @BASE_JD_EVENT_INSTR_BARRIER_FAULT: A shader instruction was executed that + * failed to complete an instruction barrier. + * @BASE_JD_EVENT_DATA_INVALID_FAULT: Any data structure read as part of the job + * contains invalid combinations of data. + * @BASE_JD_EVENT_TILE_RANGE_FAULT: Tile or fragment shading was asked to + * process a tile that is entirely outside the + * bounding box of the frame. + * @BASE_JD_EVENT_STATE_FAULT: Matches ADDR_RANGE_FAULT. A virtual address + * has been found that exceeds the virtual + * address range. + * @BASE_JD_EVENT_OUT_OF_MEMORY: The tiler ran out of memory when executing a job. + * @BASE_JD_EVENT_UNKNOWN: If multiple jobs in a job chain fail, only + * the first one that reports an error will set + * and return full error information.
+ * Subsequent failing jobs will not update the + * error status registers, and may write an + * error status of UNKNOWN. + * @BASE_JD_EVENT_DELAYED_BUS_FAULT: The GPU received a bus fault for access to + * physical memory where the original virtual + * address is no longer available. + * @BASE_JD_EVENT_SHAREABILITY_FAULT: Matches GPU_SHAREABILITY_FAULT. A cache + * has detected that the same line has been + * accessed as both shareable and non-shareable + * memory from inside the GPU. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1: A memory access hit an invalid table + * entry at level 1 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2: A memory access hit an invalid table + * entry at level 2 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3: A memory access hit an invalid table + * entry at level 3 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4: A memory access hit an invalid table + * entry at level 4 of the translation table. + * @BASE_JD_EVENT_PERMISSION_FAULT: A memory access could not be allowed due to + * the permission flags set in translation + * table + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1: A bus fault occurred while reading + * level 0 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2: A bus fault occurred while reading + * level 1 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3: A bus fault occurred while reading + * level 2 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4: A bus fault occurred while reading + * level 3 of the translation tables. + * @BASE_JD_EVENT_ACCESS_FLAG: Matches ACCESS_FLAG_0. A memory access hit a + * translation table entry with the ACCESS_FLAG + * bit set to zero in level 0 of the + * page table, and the DISABLE_AF_FAULT flag + * was not set. + * @BASE_JD_EVENT_MEM_GROWTH_FAILED: raised for JIT_ALLOC atoms that failed to + * grow memory on demand + * @BASE_JD_EVENT_JOB_CANCELLED: raised when this atom was hard-stopped or its + * dependencies failed + * @BASE_JD_EVENT_JOB_INVALID: raised for many reasons, including invalid data + * in the atom which overlaps with + * BASE_JD_EVENT_JOB_CONFIG_FAULT, or if the + * platform doesn't support the feature specified in + * the atom. + * @BASE_JD_EVENT_DRV_TERMINATED: this is a special event generated to indicate + * to userspace that the KBase context has been + * destroyed and Base should stop listening for + * further events + * @BASE_JD_EVENT_REMOVED_FROM_NEXT: raised when an atom that was configured in + * the GPU has to be retried (but it has not + * started) due to e.g., GPU reset + * @BASE_JD_EVENT_END_RP_DONE: this is used for incremental rendering to signal + * the completion of a renderpass. This value + * shouldn't be returned to userspace but I haven't + * seen where it is reset back to JD_EVENT_DONE. + * + * HW and low-level SW events are represented by event codes. + * The status of jobs which succeeded are also represented by + * an event code (see @BASE_JD_EVENT_DONE). + * Events are usually reported as part of a &struct base_jd_event. 
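+ *
+ * For instance, user space reading back a &struct base_jd_event_v2 (named
+ * ev below purely for illustration) might classify the code roughly as:
+ *
+ *   if (ev.event_code == BASE_JD_EVENT_DONE) {
+ *           // success
+ *   } else if (ev.event_code & BASE_JD_SW_EVENT) {
+ *           __u32 type = ev.event_code & BASE_JD_SW_EVENT_TYPE_MASK;
+ *           // BASE_JD_SW_EVENT_JOB, _BAG, _INFO or _RESERVED
+ *   } else {
+ *           // HW fault or error code, e.g. BASE_JD_EVENT_JOB_BUS_FAULT
+ *   }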
+ * + * The event codes are encoded in the following way: + * * 10:0 - subtype + * * 12:11 - type + * * 13 - SW success (only valid if the SW bit is set) + * * 14 - SW event (HW event if not set) + * * 15 - Kernel event (should never be seen in userspace) + * + * Events are split up into ranges as follows: + * * BASE_JD_EVENT_RANGE_<description>_START + * * BASE_JD_EVENT_RANGE_<description>_END + * + * code is in <description>'s range when: + * BASE_JD_EVENT_RANGE_<description>_START <= code < + * BASE_JD_EVENT_RANGE_<description>_END + * + * Ranges can be asserted for adjacency by testing that the END of the previous + * is equal to the START of the next. This is useful for optimizing some tests + * for range. + * + * A limitation is that the last member of this enum must explicitly be handled + * (with an assert-unreachable statement) in switch statements that use + * variables of this type. Otherwise, the compiler warns that we have not + * handled that enum value. + */ +enum base_jd_event_code { + /* HW defined exceptions */ + BASE_JD_EVENT_RANGE_HW_NONFAULT_START = 0, + + /* non-fatal exceptions */ + BASE_JD_EVENT_NOT_STARTED = 0x00, + BASE_JD_EVENT_DONE = 0x01, + BASE_JD_EVENT_STOPPED = 0x03, + BASE_JD_EVENT_TERMINATED = 0x04, + BASE_JD_EVENT_ACTIVE = 0x08, + + BASE_JD_EVENT_RANGE_HW_NONFAULT_END = 0x40, + BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START = 0x40, + + /* job exceptions */ + BASE_JD_EVENT_JOB_CONFIG_FAULT = 0x40, + BASE_JD_EVENT_JOB_POWER_FAULT = 0x41, + BASE_JD_EVENT_JOB_READ_FAULT = 0x42, + BASE_JD_EVENT_JOB_WRITE_FAULT = 0x43, + BASE_JD_EVENT_JOB_AFFINITY_FAULT = 0x44, + BASE_JD_EVENT_JOB_BUS_FAULT = 0x48, + BASE_JD_EVENT_INSTR_INVALID_PC = 0x50, + BASE_JD_EVENT_INSTR_INVALID_ENC = 0x51, + BASE_JD_EVENT_INSTR_TYPE_MISMATCH = 0x52, + BASE_JD_EVENT_INSTR_OPERAND_FAULT = 0x53, + BASE_JD_EVENT_INSTR_TLS_FAULT = 0x54, + BASE_JD_EVENT_INSTR_BARRIER_FAULT = 0x55, + BASE_JD_EVENT_INSTR_ALIGN_FAULT = 0x56, + BASE_JD_EVENT_DATA_INVALID_FAULT = 0x58, + BASE_JD_EVENT_TILE_RANGE_FAULT = 0x59, + BASE_JD_EVENT_STATE_FAULT = 0x5A, + BASE_JD_EVENT_OUT_OF_MEMORY = 0x60, + BASE_JD_EVENT_UNKNOWN = 0x7F, + + /* GPU exceptions */ + BASE_JD_EVENT_DELAYED_BUS_FAULT = 0x80, + BASE_JD_EVENT_SHAREABILITY_FAULT = 0x88, + + /* MMU exceptions */ + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1 = 0xC1, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2 = 0xC2, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3 = 0xC3, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4 = 0xC4, + BASE_JD_EVENT_PERMISSION_FAULT = 0xC8, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1 = 0xD1, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2 = 0xD2, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3 = 0xD3, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4 = 0xD4, + BASE_JD_EVENT_ACCESS_FLAG = 0xD8, + + /* SW defined exceptions */ + BASE_JD_EVENT_MEM_GROWTH_FAILED = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x000, + BASE_JD_EVENT_JOB_CANCELLED = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x002, + BASE_JD_EVENT_JOB_INVALID = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x003, + + BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_RESERVED | 0x3FF, + + BASE_JD_EVENT_RANGE_SW_SUCCESS_START = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | 0x000, + + BASE_JD_EVENT_DRV_TERMINATED = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_INFO | 0x000, + + BASE_JD_EVENT_RANGE_SW_SUCCESS_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_RESERVED | 0x3FF, + + BASE_JD_EVENT_RANGE_KERNEL_ONLY_START = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | 0x000, + BASE_JD_EVENT_REMOVED_FROM_NEXT =
BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x000, + BASE_JD_EVENT_END_RP_DONE = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x001, + + BASE_JD_EVENT_RANGE_KERNEL_ONLY_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_RESERVED | 0x3FF +}; + +/** + * struct base_jd_event_v2 - Event reporting structure + * + * @event_code: event code of type @ref base_jd_event_code. + * @atom_number: the atom number that has completed. + * @padding: padding. + * @udata: user data. + * + * This structure is used by the kernel driver to report information + * about GPU events. They can either be HW-specific events or low-level + * SW events, such as job-chain completion. + * + * The event code contains an event type field which can be extracted + * by ANDing with BASE_JD_SW_EVENT_TYPE_MASK. + */ +struct base_jd_event_v2 { + __u32 event_code; + base_atom_id atom_number; + __u8 padding[3]; + struct base_jd_udata udata; +}; + +/** + * struct base_dump_cpu_gpu_counters - Structure for + * BASE_JD_REQ_SOFT_DUMP_CPU_GPU_COUNTERS + * jobs. + * @system_time: gpu timestamp + * @cycle_counter: gpu cycle count + * @sec: cpu time(sec) + * @usec: cpu time(usec) + * @padding: padding + * + * This structure is stored into the memory pointed to by the @jc field + * of &struct base_jd_atom. + * + * It must not occupy the same CPU cache line(s) as any neighboring data. + * This is to avoid cases where access to pages containing the structure + * is shared between cached and un-cached memory regions, which would + * cause memory corruption. + */ + +struct base_dump_cpu_gpu_counters { + __u64 system_time; + __u64 cycle_counter; + __u64 sec; + __u32 usec; + __u8 padding[36]; +}; + +/** + * struct mali_base_gpu_core_props - GPU core props info + * + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to align to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + * @num_exec_engines: The number of execution engines. Only valid for tGOX + * (Bifrost) GPUs, where GPU_HAS_REG_CORE_FEATURES is defined. Otherwise, + * this is always 0. 
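+ *
+ * For example, a client that wants to know whether bit b of
+ * TEXTURE_FEATURES_0 is set (props and b being placeholder names) might
+ * simply test:
+ *
+ *   bool supported = props->texture_features[0] & (1u << b);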
+ */ +struct mali_base_gpu_core_props { + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; + __u8 num_exec_engines; +}; + +#endif /* _UAPI_BASE_JM_KERNEL_H_ */ diff --git a/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h new file mode 100644 index 00000000000..20d931adc9b --- /dev/null +++ b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_JM_IOCTL_H_ +#define _UAPI_KBASE_JM_IOCTL_H_ + +#include +#include + +/* + * 11.1: + * - Add BASE_MEM_TILER_ALIGN_TOP under base_mem_alloc_flags + * 11.2: + * - KBASE_MEM_QUERY_FLAGS can return KBASE_REG_PF_GROW and KBASE_REG_PROTECTED, + * which some user-side clients prior to 11.2 might fault if they received + * them + * 11.3: + * - New ioctls KBASE_IOCTL_STICKY_RESOURCE_MAP and + * KBASE_IOCTL_STICKY_RESOURCE_UNMAP + * 11.4: + * - New ioctl KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET + * 11.5: + * - New ioctl: KBASE_IOCTL_MEM_JIT_INIT (old ioctl renamed to _OLD) + * 11.6: + * - Added flags field to base_jit_alloc_info structure, which can be used to + * specify pseudo chunked tiler alignment for JIT allocations. + * 11.7: + * - Removed UMP support + * 11.8: + * - Added BASE_MEM_UNCACHED_GPU under base_mem_alloc_flags + * 11.9: + * - Added BASE_MEM_PERMANENT_KERNEL_MAPPING and BASE_MEM_FLAGS_KERNEL_ONLY + * under base_mem_alloc_flags + * 11.10: + * - Enabled the use of nr_extres field of base_jd_atom_v2 structure for + * JIT_ALLOC and JIT_FREE type softjobs to enable multiple JIT allocations + * with one softjob. + * 11.11: + * - Added BASE_MEM_GPU_VA_SAME_4GB_PAGE under base_mem_alloc_flags + * 11.12: + * - Removed ioctl: KBASE_IOCTL_GET_PROFILING_CONTROLS + * 11.13: + * - New ioctl: KBASE_IOCTL_MEM_EXEC_INIT + * 11.14: + * - Add BASE_MEM_GROUP_ID_MASK, base_mem_group_id_get, base_mem_group_id_set + * under base_mem_alloc_flags + * 11.15: + * - Added BASEP_CONTEXT_MMU_GROUP_ID_MASK under base_context_create_flags. + * - Require KBASE_IOCTL_SET_FLAGS before BASE_MEM_MAP_TRACKING_HANDLE can be + * passed to mmap(). + * 11.16: + * - Extended ioctl KBASE_IOCTL_MEM_SYNC to accept imported dma-buf. + * - Modified (backwards compatible) ioctl KBASE_IOCTL_MEM_IMPORT behavior for + * dma-buf. Now, buffers are mapped on GPU when first imported, no longer + * requiring external resource or sticky resource tracking. UNLESS, + * CONFIG_MALI_DMA_BUF_MAP_ON_DEMAND is enabled. + * 11.17: + * - Added BASE_JD_REQ_JOB_SLOT. 
+ * - Reused padding field in base_jd_atom_v2 to pass job slot number. + * - New ioctl: KBASE_IOCTL_GET_CPU_GPU_TIMEINFO + * 11.18: + * - Added BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP under base_mem_alloc_flags + * 11.19: + * - Extended base_jd_atom_v2 to allow a renderpass ID to be specified. + * 11.20: + * - Added new phys_pages member to kbase_ioctl_mem_jit_init for + * KBASE_IOCTL_MEM_JIT_INIT, previous variants of this renamed to use _10_2 + * (replacing '_OLD') and _11_5 suffixes + * - Replaced compat_core_req (deprecated in 10.3) with jit_id[2] in + * base_jd_atom_v2. It must currently be initialized to zero. + * - Added heap_info_gpu_addr to base_jit_alloc_info, and + * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE allowable in base_jit_alloc_info's + * flags member. Previous variants of this structure are kept and given _10_2 + * and _11_5 suffixes. + * - The above changes are checked for safe values in usual builds + * 11.21: + * - v2.0 of mali_trace debugfs file, which now versions the file separately + * 11.22: + * - Added base_jd_atom (v3), which is seq_nr + base_jd_atom_v2. + * KBASE_IOCTL_JOB_SUBMIT supports both in parallel. + * 11.23: + * - Modified KBASE_IOCTL_MEM_COMMIT behavior to reject requests to modify + * the physical memory backing of JIT allocations. This was not supposed + * to be a valid use case, but it was allowed by the previous implementation. + * 11.24: + * - Added a sysfs file 'serialize_jobs' inside a new sub-directory + * 'scheduling'. + * 11.25: + * - Enabled JIT pressure limit in base/kbase by default + * 11.26 + * - Added kinstr_jm API + * 11.27 + * - Backwards compatible extension to HWC ioctl. + * 11.28: + * - Added kernel side cache ops needed hint + * 11.29: + * - Reserve ioctl 52 + * 11.30: + * - Add a new priority level BASE_JD_PRIO_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 11.31: + * - Added BASE_JD_REQ_LIMITED_CORE_MASK. + * - Added ioctl 55: set_limited_core_count. + * 11.32: + * - Added new HW performance counters interface to all GPUs. + * 11.33: + * - Removed Kernel legacy HWC interface + * 11.34: + * - First release of new HW performance counters interface. 
+ * 11.35: + * - Dummy model (no mali) backend will now clear HWC values after each sample + */ +#define BASE_UK_VERSION_MAJOR 11 +#define BASE_UK_VERSION_MINOR 35 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + + +/** + * struct kbase_ioctl_job_submit - Submit jobs/atoms to the kernel + * + * @addr: Memory address of an array of struct base_jd_atom_v2 or v3 + * @nr_atoms: Number of entries in the array + * @stride: sizeof(struct base_jd_atom_v2) or sizeof(struct base_jd_atom) + */ +struct kbase_ioctl_job_submit { + __u64 addr; + __u32 nr_atoms; + __u32 stride; +}; + +#define KBASE_IOCTL_JOB_SUBMIT \ + _IOW(KBASE_IOCTL_TYPE, 2, struct kbase_ioctl_job_submit) + +#define KBASE_IOCTL_POST_TERM \ + _IO(KBASE_IOCTL_TYPE, 4) + +/** + * struct kbase_ioctl_soft_event_update - Update the status of a soft-event + * @event: GPU address of the event which has been updated + * @new_status: The new status to set + * @flags: Flags for future expansion + */ +struct kbase_ioctl_soft_event_update { + __u64 event; + __u32 new_status; + __u32 flags; +}; + +#define KBASE_IOCTL_SOFT_EVENT_UPDATE \ + _IOW(KBASE_IOCTL_TYPE, 28, struct kbase_ioctl_soft_event_update) + +/** + * struct kbase_kinstr_jm_fd_out - Explains the compatibility information for + * the `struct kbase_kinstr_jm_atom_state_change` structure returned from the + * kernel + * + * @size: The size of the `struct kbase_kinstr_jm_atom_state_change` + * @version: Represents a breaking change in the + * `struct kbase_kinstr_jm_atom_state_change` + * @padding: Explicit padding to get the structure up to 64bits. See + * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst + * + * The `struct kbase_kinstr_jm_atom_state_change` may have extra members at the + * end of the structure that older user space might not understand. If the + * `version` is the same, the structure is still compatible with newer kernels. + * The `size` can be used to cast the opaque memory returned from the kernel. + */ +struct kbase_kinstr_jm_fd_out { + __u16 size; + __u8 version; + __u8 padding[5]; +}; + +/** + * struct kbase_kinstr_jm_fd_in - Options when creating the file descriptor + * + * @count: Number of atom states that can be stored in the kernel circular + * buffer. Must be a power of two + * @padding: Explicit padding to get the structure up to 64bits. See + * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst + */ +struct kbase_kinstr_jm_fd_in { + __u16 count; + __u8 padding[6]; +}; + +union kbase_kinstr_jm_fd { + struct kbase_kinstr_jm_fd_in in; + struct kbase_kinstr_jm_fd_out out; +}; + +#define KBASE_IOCTL_KINSTR_JM_FD \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_kinstr_jm_fd) + + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#endif /* _UAPI_KBASE_JM_IOCTL_H_ */ diff --git a/src/panfrost/base/include/mali_base_common_kernel.h b/src/panfrost/base/include/mali_base_common_kernel.h new file mode 100644 index 00000000000..f8378146ace --- /dev/null +++ b/src/panfrost/base/include/mali_base_common_kernel.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. 
+ * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_COMMON_KERNEL_H_ +#define _UAPI_BASE_COMMON_KERNEL_H_ + +#include + +struct base_mem_handle { + struct { + __u64 handle; + } basep; +}; + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +/* Memory allocation, access/hint flags & mask. + * + * See base_mem_alloc_flags. + */ + +/* IN */ +/* Read access CPU side + */ +#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) + +/* Write access CPU side + */ +#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) + +/* Read access GPU side + */ +#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) + +/* Write access GPU side + */ +#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) + +/* Execute allowed on the GPU side + */ +#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) + +/* Will be permanently mapped in kernel space. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) + +/* The allocation will completely reside within the same 4GB chunk in the GPU + * virtual space. + * Since this flag is primarily required only for the TLS memory which will + * not be used to contain executable code and also not used for Tiler heap, + * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. + */ +#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) + +/* Userspace is not allowed to free this memory. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) + +/* Grow backing store on GPU Page Fault + */ +#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) + +/* Page coherence Outer shareable, if available + */ +#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) + +/* Page coherence Inner shareable + */ +#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) + +/* IN/OUT */ +/* Should be cached on the CPU, returned if actually cached + */ +#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) + +/* IN/OUT */ +/* Must have same VA on both the GPU and the CPU + */ +#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) + +/* OUT */ +/* Must call mmap to acquire a GPU address for the allocation + */ +#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) + +/* IN */ +/* Page coherence Outer shareable, required. 
+ */ +#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) + +/* Protected memory + */ +#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) + +/* Not needed physical memory + */ +#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) + +/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the + * addresses to be the same + */ +#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) + +/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu + * mode. Some components within the GPU might only be able to access memory + * that is GPU cacheable. Refer to the specific GPU implementation for more + * details. The 3 shareability flags will be ignored for GPU uncached memory. + * If used while importing USER_BUFFER type memory, then the import will fail + * if the memory is not aligned to GPU and CPU cache line width. + */ +#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) + +/* + * Bits [22:25] for group_id (0~15). + * + * base_mem_group_id_set() should be used to pack a memory group ID into a + * base_mem_alloc_flags value instead of accessing the bits directly. + * base_mem_group_id_get() should be used to extract the memory group ID from + * a base_mem_alloc_flags value. + */ +#define BASEP_MEM_GROUP_ID_SHIFT 22 +#define BASE_MEM_GROUP_ID_MASK ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) + +/* Must do CPU cache maintenance when imported memory is mapped/unmapped + * on GPU. Currently applicable to dma-buf type only. + */ +#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) + +/* OUT */ +/* Kernel side cache sync ops required */ +#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) + +/* Number of bits used as flags for base memory management + * + * Must be kept in sync with the base_mem_alloc_flags flags + */ +#define BASE_MEM_FLAGS_NR_BITS 30 + +/* A mask for all output bits, excluding IN/OUT bits. + */ +#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP + +/* A mask for all input bits, including IN/OUT bits. + */ +#define BASE_MEM_FLAGS_INPUT_MASK \ + (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) + +/* Special base mem handles. + */ +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) +/* reserved handles ..-47< for future special handles */ +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_FIRST_FREE_ADDRESS ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) + +/* Flags to pass to ::base_context_init. + * Flags can be ORed together to enable multiple things. + * + * These share the same space as BASEP_CONTEXT_FLAG_*, and so must + * not collide with them. + */ +typedef __u32 base_context_create_flags; + +/* Flags for base context */ + +/* No flags set */ +#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) + +/* Base context is embedded in a cctx object (flag used for CINSTR + * software counter macros) + */ +#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) + +/* Base context is a 'System Monitor' context for Hardware counters. + * + * One important side effect of this is that job submission is disabled. 
+ */ +#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED ((base_context_create_flags)1 << 1) + +/* Bit-shift used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) + +/* Bitmask used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ + ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* Bitpattern describing the base_context_create_flags that can be + * passed to the kernel + */ +#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ + (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | BASEP_CONTEXT_MMU_GROUP_ID_MASK) + +/* Flags for base tracepoint + */ + +/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, + * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) + */ +#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) + +/* Indicate that job dumping is enabled. This could affect certain timers + * to account for the performance impact. + */ +#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) + +#endif /* _UAPI_BASE_COMMON_KERNEL_H_ */ diff --git a/src/panfrost/base/include/mali_base_kernel.h b/src/panfrost/base/include/mali_base_kernel.h new file mode 100644 index 00000000000..3d826c720b2 --- /dev/null +++ b/src/panfrost/base/include/mali_base_kernel.h @@ -0,0 +1,700 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * Base structures shared with the kernel. + */ + +#ifndef _UAPI_BASE_KERNEL_H_ +#define _UAPI_BASE_KERNEL_H_ + +#include +#include "mali_base_common_kernel.h" + +#define BASE_MAX_COHERENT_GROUPS 16 + +#if defined(PAGE_MASK) && defined(PAGE_SHIFT) +#define LOCAL_PAGE_SHIFT PAGE_SHIFT +#define LOCAL_PAGE_LSB ~PAGE_MASK +#else +#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 +#endif + +#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) +#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) +#else +#error Failed to find page size +#endif +#endif + +/* Physical memory group ID for normal usage. + */ +#define BASE_MEM_GROUP_DEFAULT (0) + +/* Number of physical memory groups. + */ +#define BASE_MEM_GROUP_COUNT (16) + +/** + * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. + * + * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator + * in order to determine the best cache policy. Some combinations are + * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), + * which defines a write-only region on the CPU side, which is + * heavily read by the CPU... + * Other flags are only meaningful to a particular allocator. 
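+ *
+ * As a loose example, a buffer that is read and written by both CPU and
+ * GPU, with a shared virtual address, might be requested with:
+ *
+ *   base_mem_alloc_flags flags =
+ *           BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR |
+ *           BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |
+ *           BASE_MEM_SAME_VA;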
+ * More flags can be added to this list, as long as they don't clash + * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). + */ +typedef __u32 base_mem_alloc_flags; + +/* A mask for all the flags which are modifiable via the base_mem_set_flags + * interface. + */ +#define BASE_MEM_FLAGS_MODIFIABLE \ + (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ + BASE_MEM_COHERENT_LOCAL) + +/* A mask of all the flags that can be returned via the base_mem_get_flags() + * interface. + */ +#define BASE_MEM_FLAGS_QUERYABLE \ + (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ + BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ + BASEP_MEM_FLAGS_KERNEL_ONLY)) + +/** + * enum base_mem_import_type - Memory types supported by @a base_mem_import + * + * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type + * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) + * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a + * base_mem_import_user_buffer + * + * Each type defines what the supported handle type is. + * + * If any new type is added here ARM must be contacted + * to allocate a numeric value for it. + * Do not just add a new type without synchronizing with ARM + * as future releases from ARM might include other new types + * which could clash with your custom types. + */ +enum base_mem_import_type { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + /* + * Import type with value 1 is deprecated. + */ + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 +}; + +/** + * struct base_mem_import_user_buffer - Handle of an imported user buffer + * + * @ptr: address of imported user buffer + * @length: length of imported user buffer in bytes + * + * This structure is used to represent a handle of an imported user buffer. + */ + +struct base_mem_import_user_buffer { + __u64 ptr; + __u64 length; +}; + +/* Mask to detect 4GB boundary alignment */ +#define BASE_MEM_MASK_4GB 0xfffff000UL +/* Mask to detect 4GB boundary (in page units) alignment */ +#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) + +/* Limit on the 'extension' parameter for an allocation with the + * BASE_MEM_TILER_ALIGN_TOP flag set + * + * This is the same as the maximum limit for a Buffer Descriptor's chunk size + */ +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ + (21u - (LOCAL_PAGE_SHIFT)) +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ + (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) + +/* Bit mask of cookies used for memory allocation setup */ +#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ + +/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ +#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ + +/* + * struct base_fence - Cross-device synchronisation fence. + * + * A fence is used to signal when the GPU has finished accessing a resource that + * may be shared with other devices, and also to delay work done asynchronously + * by the GPU until other devices have finished accessing a shared resource. + */ +struct base_fence { + struct { + int fd; + int stream_fd; + } basep; +}; + +/** + * struct base_mem_aliasing_info - Memory aliasing info + * + * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * @offset: Offset within the handle to start aliasing from, in pages. + * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. + * @length: Length to alias, in pages. 
For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * specifies the number of times the special page is needed. + * + * Describes a memory handle to be aliased. + * A subset of the handle can be chosen for aliasing, given an offset and a + * length. + * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a + * region where a special page is mapped with a write-alloc cache setup, + * typically used when the write result of the GPU isn't needed, but the GPU + * must write anyway. + * + * Offset and length are specified in pages. + * Offset must be within the size of the handle. + * Offset+length must not overrun the size of the handle. + */ +struct base_mem_aliasing_info { + struct base_mem_handle handle; + __u64 offset; + __u64 length; +}; + +/* Maximum percentage of just-in-time memory allocation trimming to perform + * on free. + */ +#define BASE_JIT_MAX_TRIM_LEVEL (100) + +/* Maximum number of concurrent just-in-time memory allocations. + */ +#define BASE_JIT_ALLOC_COUNT (255) + +/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 + * + * jit_version is 1 + * + * Due to the lack of padding specified, user clients between 32 and 64-bit + * may have assumed a different size of the struct + * + * An array of structures was not supported + */ +struct base_jit_alloc_info_10_2 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; +}; + +/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up + * to 11.19 + * + * This structure had a number of modifications during and after kernel driver + * version 11.5, but remains size-compatible throughout its version history, and + * with earlier variants compatible with future variants by requiring + * zero-initialization to the unused space in the structure. + * + * jit_version is 2 + * + * Kernel driver version history: + * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes + * must be zero. Kbase minor version was not incremented, so some + * versions of 11.5 do not have this change. + * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase + * minor version not incremented) + * 11.6: Added 'flags', replacing 1 padding byte + * 11.10: Arrays of this structure are supported + */ +struct base_jit_alloc_info_11_5 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; +}; + +/** + * struct base_jit_alloc_info - Structure which describes a JIT allocation + * request. + * @gpu_alloc_addr: The GPU virtual address to write the JIT + * allocated GPU virtual address to. + * @va_pages: The minimum number of virtual pages required. + * @commit_pages: The minimum number of physical pages which + * should back the allocation. + * @extension: Granularity of physical pages to grow the + * allocation by during a fault. + * @id: Unique ID provided by the caller, this is used + * to pair allocation and free requests. + * Zero is not a valid value. + * @bin_id: The JIT allocation bin, used in conjunction with + * @max_allocations to limit the number of each + * type of JIT allocation. + * @max_allocations: The maximum number of allocations allowed within + * the bin specified by @bin_id. Should be the same + * for all allocations within the same bin. 
+ * @flags: flags specifying the special requirements for + * the JIT allocation, see + * %BASE_JIT_ALLOC_VALID_FLAGS + * @padding: Expansion space - should be initialised to zero + * @usage_id: A hint about which allocation should be reused. + * The kernel should attempt to use a previous + * allocation with the same usage_id + * @heap_info_gpu_addr: Pointer to an object in GPU memory describing + * the actual usage of the region. + * + * jit_version is 3. + * + * When modifications are made to this structure, it is still compatible with + * jit_version 3 when: a) the size is unchanged, and b) new members only + * replace the padding bytes. + * + * Previous jit_version history: + * jit_version == 1, refer to &base_jit_alloc_info_10_2 + * jit_version == 2, refer to &base_jit_alloc_info_11_5 + * + * Kbase version history: + * 11.20: added @heap_info_gpu_addr + */ +struct base_jit_alloc_info { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; + __u64 heap_info_gpu_addr; +}; + +enum base_external_resource_access { + BASE_EXT_RES_ACCESS_SHARED, + BASE_EXT_RES_ACCESS_EXCLUSIVE +}; + +struct base_external_resource { + __u64 ext_resource; +}; + +/** + * BASE_EXT_RES_COUNT_MAX - The maximum number of external resources + * which can be mapped/unmapped in a single request. + */ +#define BASE_EXT_RES_COUNT_MAX 10 + +/** + * struct base_external_resource_list - Structure which describes a list of + * external resources. + * @count: The number of resources. + * @ext_res: Array of external resources which is + * sized at allocation time. + */ +struct base_external_resource_list { + __u64 count; + struct base_external_resource ext_res[1]; +}; + +struct base_jd_debug_copy_buffer { + __u64 address; + __u64 size; + struct base_external_resource extres; +}; + +#define GPU_MAX_JOB_SLOTS 16 + +/** + * DOC: User-side Base GPU Property Queries + * + * The User-side Base GPU Property Query interface encapsulates two + * sub-modules: + * + * - "Dynamic GPU Properties" + * - "Base Platform Config GPU Properties" + * + * Base only deals with properties that vary between different GPU + * implementations - the Dynamic GPU properties and the Platform Config + * properties. + * + * For properties that are constant for the GPU Architecture, refer to the + * GPU module. However, we will discuss their relevance here just to + * provide background information. + * + * About the GPU Properties in Base and GPU modules + * + * The compile-time properties (Platform Config, GPU Compile-time + * properties) are exposed as pre-processor macros. + * + * Complementing the compile-time properties are the Dynamic GPU + * Properties, which act as a conduit for the GPU Configuration + * Discovery. + * + * In general, the dynamic properties are present to verify that the platform + * has been configured correctly with the right set of Platform Config + * Compile-time Properties. + * + * As a consistent guide across the entire DDK, the choice for dynamic or + * compile-time should consider the following, in order: + * 1. Can the code be written so that it doesn't need to know the + * implementation limits at all? + * 2. If you need the limits, get the information from the Dynamic Property + * lookup. This should be done once as you fetch the context, and then cached + * as part of the context data structure, so it's cheap to access. + * 3. 
If there's a clear and arguable inefficiency in using Dynamic Properties, + * then use a Compile-Time Property (Platform Config, or GPU Compile-time + * property). Examples of where this might be sensible follow: + * - Part of a critical inner-loop + * - Frequent re-use throughout the driver, causing significant extra load + * instructions or control flow that would be worthwhile optimizing out. + * + * We cannot provide an exhaustive set of examples, neither can we provide a + * rule for every possible situation. Use common sense, and think about: what + * the rest of the driver will be doing; how the compiler might represent the + * value if it is a compile-time constant; whether an OEM shipping multiple + * devices would benefit much more from a single DDK binary, instead of + * insignificant micro-optimizations. + * + * Dynamic GPU Properties + * + * Dynamic GPU properties are presented in two sets: + * 1. the commonly used properties in @ref base_gpu_props, which have been + * unpacked from GPU register bitfields. + * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props + * (also a member of base_gpu_props). All of these are presented in + * the packed form, as presented by the GPU registers themselves. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + * The properties returned extend the GPU Configuration Discovery + * registers. For example, GPU clock speed is not specified in the GPU + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. + * + * The GPU properties are obtained by a call to + * base_get_gpu_props(). This simply returns a pointer to a const + * base_gpu_props structure. It is constant for the life of a base + * context. Multiple calls to base_get_gpu_props() to a base context + * return the same pointer to a constant structure. This avoids cache pollution + * of the common data. + * + * This pointer must not be freed, because it does not point to the start of a + * region allocated by the memory allocator; instead, just close the @ref + * base_context. + * + * + * Kernel Operation + * + * During Base Context Create time, user-side makes a single kernel call: + * - A call to fill user memory with GPU information structures + * + * The kernel-side will fill the provided the entire processed base_gpu_props + * structure, because this information is required in both + * user and kernel side; it does not make sense to decode it twice. + * + * Coherency groups must be derived from the bitmasks, but this can be done + * kernel side, and just once at kernel startup: Coherency groups must already + * be known kernel-side, to support chains that specify a 'Only Coherent Group' + * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. + * + * Coherency Group calculation + * + * Creation of the coherent group data is done at device-driver startup, and so + * is one-time. This will most likely involve a loop with CLZ, shifting, and + * bit clearing on the L2_PRESENT mask, depending on whether the + * system is L2 Coherent. The number of shader cores is done by a + * population count, since faulty cores may be disabled during production, + * producing a non-contiguous mask. 
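+ *
+ * As an illustrative sketch only (editorial addition, not part of the
+ * original header), the per-group masks could be derived roughly as below,
+ * shown with ctz/popcount builtins; 'raw' points at the raw register copy,
+ * and cores_for_l2() / record_group() are hypothetical stand-in helpers:
+ *
+ *   __u64 l2 = raw->l2_present;
+ *   while (l2) {
+ *           unsigned int slice = __builtin_ctzll(l2);
+ *           __u64 group_mask = raw->shader_present & cores_for_l2(slice);
+ *           unsigned int num_cores = __builtin_popcountll(group_mask);
+ *           record_group(group_mask, num_cores);
+ *           l2 &= l2 - 1;    // clear the slice that was just handled
+ *   }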
+ * + * The memory requirements for this algorithm can be determined either by a __u64 + * population count on the L2_PRESENT mask (a LUT helper already is + * required for the above), or simple assumption that there can be no more than + * 16 coherent groups, since core groups are typically 4 cores. + */ + +/* + * More information is possible - but associativity and bus width are not + * required by upper-level apis. + */ +struct mali_base_gpu_l2_cache_props { + __u8 log2_line_size; + __u8 log2_cache_size; + __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + __u8 padding[5]; +}; + +struct mali_base_gpu_tiler_props { + __u32 bin_size_bytes; /* Max is 4*2^15 */ + __u32 max_active_levels; /* Max is 2^15 */ +}; + +/** + * struct mali_base_gpu_thread_props - GPU threading system details. + * @max_threads: Max. number of threads per core + * @max_workgroup_size: Max. number of threads per workgroup + * @max_barrier_size: Max. number of threads that can synchronize on a + * simple barrier + * @max_registers: Total size [1..65535] of the register file available + * per core. + * @max_task_queue: Max. tasks [1..255] which may be sent to a core + * before it becomes blocked. + * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split + * field. + * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, + * 3 = SW Model/Emulation + * @padding: padding to align to 8-byte + * @tls_alloc: Number of threads per core that TLS must be + * allocated for + */ +struct mali_base_gpu_thread_props { + __u32 max_threads; + __u32 max_workgroup_size; + __u32 max_barrier_size; + __u16 max_registers; + __u8 max_task_queue; + __u8 max_thread_group_split; + __u8 impl_tech; + __u8 padding[3]; + __u32 tls_alloc; +}; + +/** + * struct mali_base_gpu_coherent_group - descriptor for a coherent group + * @core_mask: Core restriction mask required for the group + * @num_cores: Number of cores in the group + * @padding: padding to align to 8-byte + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_base_gpu_coherent_group { + __u64 core_mask; + __u16 num_cores; + __u16 padding[3]; +}; + +/** + * struct mali_base_gpu_coherent_group_info - Coherency group information + * @num_groups: Number of coherent groups in the GPU. + * @num_core_groups: Number of core groups (coherent or not) in the GPU. + * Equivalent to the number of L2 Caches. + * The GPU Counter dumping writes 2048 bytes per core group, + * regardless of whether the core groups are coherent or not. + * Hence this member is needed to calculate how much memory + * is required for dumping. + * @note Do not use it to work out how many valid elements + * are in the group[] member. Use num_groups instead. + * @coherency: Coherency features of the memory, accessed by gpu_mem_features + * methods + * @padding: padding to align to 8-byte + * @group: Descriptors of coherent groups + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. 
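+ *
+ * Illustrative use (editorial sketch, not part of the original header),
+ * where process_group() stands in for the caller's per-group handling:
+ *
+ *   size_t dump_bytes = info->num_core_groups * 2048;
+ *   for (__u32 i = 0; i < info->num_groups; i++)
+ *           process_group(info->group[i].core_mask,
+ *                         info->group[i].num_cores);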
+ */ +struct mali_base_gpu_coherent_group_info { + __u32 num_groups; + __u32 num_core_groups; + __u32 coherency; + __u32 padding; + struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; +}; + +#if MALI_USE_CSF +#include "csf/mali_base_csf_kernel.h" +#else +#include "jm/mali_base_jm_kernel.h" +#endif + +/** + * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware + * Configuration Discovery registers. + * @shader_present: Shader core present bitmap + * @tiler_present: Tiler core present bitmap + * @l2_present: Level 2 cache present bitmap + * @stack_present: Core stack present bitmap + * @l2_features: L2 features + * @core_features: Core features + * @mem_features: Mem features + * @mmu_features: Mmu features + * @as_present: Bitmap of address spaces present + * @js_present: Job slots present + * @js_features: Array of job slot features. + * @tiler_features: Tiler features + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU + * @gpu_id: GPU and revision identifier + * @thread_max_threads: Maximum number of threads per core + * @thread_max_workgroup_size: Maximum number of threads per workgroup + * @thread_max_barrier_size: Maximum number of threads per barrier + * @thread_features: Thread features + * @coherency_mode: Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register + * @thread_tls_alloc: Number of threads per core that TLS must be allocated for + * @gpu_features: GPU features + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct gpu_raw_gpu_props { + __u64 shader_present; + __u64 tiler_present; + __u64 l2_present; + __u64 stack_present; + __u32 l2_features; + __u32 core_features; + __u32 mem_features; + __u32 mmu_features; + + __u32 as_present; + + __u32 js_present; + __u32 js_features[GPU_MAX_JOB_SLOTS]; + __u32 tiler_features; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + __u32 gpu_id; + + __u32 thread_max_threads; + __u32 thread_max_workgroup_size; + __u32 thread_max_barrier_size; + __u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + __u32 coherency_mode; + + __u32 thread_tls_alloc; + __u64 gpu_features; +}; + +/** + * struct base_gpu_props - Return structure for base_get_gpu_props(). + * @core_props: Core props. + * @l2_props: L2 props. + * @unused_1: Keep for backwards compatibility. + * @tiler_props: Tiler props. + * @thread_props: Thread props. + * @raw_props: This member is large, likely to be 128 bytes. + * @coherency_info: This must be last member of the structure. + * + * NOTE: the raw_props member in this data structure contains the register + * values from which the value of the other members are derived. The derived + * members exist to allow for efficient access and/or shielding the details + * of the layout of the registers. 
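+ *
+ * For example (editorial note, not part of the original header), a client
+ * would normally read the unpacked props->l2_props.log2_line_size rather
+ * than re-extracting that field from the packed register copy in
+ * props->raw_props.l2_features.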
+ */ +struct base_gpu_props { + struct mali_base_gpu_core_props core_props; + struct mali_base_gpu_l2_cache_props l2_props; + __u64 unused_1; + struct mali_base_gpu_tiler_props tiler_props; + struct mali_base_gpu_thread_props thread_props; + struct gpu_raw_gpu_props raw_props; + struct mali_base_gpu_coherent_group_info coherency_info; +}; + +#define BASE_MEM_GROUP_ID_GET(flags) \ + ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) + +#define BASE_MEM_GROUP_ID_SET(id) \ + (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? \ + BASE_MEM_GROUP_DEFAULT : \ + id) \ + << BASEP_MEM_GROUP_ID_SHIFT) & \ + BASE_MEM_GROUP_ID_MASK) + +#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ + (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ + ((base_context_create_flags)(group_id) \ + << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) + +#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ + ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ + BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* + * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These + * flags are also used, where applicable, for specifying which fields + * are valid following the request operation. + */ + +/* For monotonic (counter) timefield */ +#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) +/* For system wide timestamp */ +#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) +/* For GPU cycle counter */ +#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) +/* Specify kernel GPU register timestamp */ +#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) +/* Specify userspace cntvct_el0 timestamp source */ +#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) + +#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ + BASE_TIMEINFO_MONOTONIC_FLAG | \ + BASE_TIMEINFO_TIMESTAMP_FLAG | \ + BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ + BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ + BASE_TIMEINFO_USER_SOURCE_FLAG) + +/* Maximum number of source allocations allowed to create an alias allocation. + * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array + * layers, since each cube map in the array will have 6 faces. + */ +#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) + +#endif /* _UAPI_BASE_KERNEL_H_ */ diff --git a/src/panfrost/base/include/mali_kbase_gpuprops.h b/src/panfrost/base/include/mali_kbase_gpuprops.h new file mode 100644 index 00000000000..b250feca022 --- /dev/null +++ b/src/panfrost/base/include/mali_kbase_gpuprops.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + */ + +#ifndef _UAPI_KBASE_GPUPROP_H_ +#define _UAPI_KBASE_GPUPROP_H_ + +/********************************** + * Definitions for GPU properties * + **********************************/ +#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) +#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) +#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) +#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) + +#define KBASE_GPUPROP_PRODUCT_ID 1 +#define KBASE_GPUPROP_VERSION_STATUS 2 +#define KBASE_GPUPROP_MINOR_REVISION 3 +#define KBASE_GPUPROP_MAJOR_REVISION 4 +/* 5 previously used for GPU speed */ +#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 +/* 7 previously used for minimum GPU speed */ +#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 +#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 +#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 +#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 +#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 + +#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 +#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 +#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 + +#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 +#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 + +#define KBASE_GPUPROP_MAX_THREADS 18 +#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 +#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 +#define KBASE_GPUPROP_MAX_REGISTERS 21 +#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 +#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 +#define KBASE_GPUPROP_IMPL_TECH 24 + +#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 +#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 +#define KBASE_GPUPROP_RAW_L2_PRESENT 27 +#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 +#define KBASE_GPUPROP_RAW_L2_FEATURES 29 +#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 +#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 +#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 +#define KBASE_GPUPROP_RAW_AS_PRESENT 33 +#define KBASE_GPUPROP_RAW_JS_PRESENT 34 +#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 +#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 +#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 +#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 +#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 +#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 +#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 +#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 +#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 +#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 +#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 +#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 +#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 +#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 +#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 +#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 +#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 +#define KBASE_GPUPROP_RAW_GPU_ID 55 +#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 +#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 +#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 +#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 +#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 + +#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 +#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 +#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 +#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 +#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 +#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 +#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 +#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 +#define KBASE_GPUPROP_COHERENCY_GROUP_5 69 +#define KBASE_GPUPROP_COHERENCY_GROUP_6 70 +#define KBASE_GPUPROP_COHERENCY_GROUP_7 71 +#define 
KBASE_GPUPROP_COHERENCY_GROUP_8 72 +#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 +#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 +#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 +#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 +#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 +#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 +#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 + +#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 + +#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 + +#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 +#define KBASE_GPUPROP_TLS_ALLOC 84 +#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 + +#endif diff --git a/src/panfrost/base/include/mali_kbase_ioctl.h b/src/panfrost/base/include/mali_kbase_ioctl.h new file mode 100644 index 00000000000..96f606af5f8 --- /dev/null +++ b/src/panfrost/base/include/mali_kbase_ioctl.h @@ -0,0 +1,759 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_IOCTL_H_ +#define _UAPI_KBASE_IOCTL_H_ + +#ifdef __cpluscplus +extern "C" { +#endif + +#include +#include + +#if MALI_USE_CSF +#include "csf/mali_kbase_csf_ioctl.h" +#else +#include "jm/mali_kbase_jm_ioctl.h" +#endif /* MALI_USE_CSF */ + +#define KBASE_IOCTL_TYPE 0x80 + +/** + * struct kbase_ioctl_set_flags - Set kernel context creation flags + * + * @create_flags: Flags - see base_context_create_flags + */ +struct kbase_ioctl_set_flags { + __u32 create_flags; +}; + +#define KBASE_IOCTL_SET_FLAGS \ + _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) + +/** + * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel + * + * @buffer: Pointer to the buffer to store properties into + * @size: Size of the buffer + * @flags: Flags - must be zero for now + * + * The ioctl will return the number of bytes stored into @buffer or an error + * on failure (e.g. @size is too small). If @size is specified as 0 then no + * data will be written but the return value will be the number of bytes needed + * for all the properties. + * + * @flags may be used in the future to request a different format for the + * buffer. With @flags == 0 the following format is used. + * + * The buffer will be filled with pairs of values, a __u32 key identifying the + * property followed by the value. The size of the value is identified using + * the bottom bits of the key. The value then immediately followed the key and + * is tightly packed (there is no padding). All keys and values are + * little-endian. 
+ * + * 00 = __u8 + * 01 = __u16 + * 10 = __u32 + * 11 = __u64 + */ +struct kbase_ioctl_get_gpuprops { + __u64 buffer; + __u32 size; + __u32 flags; +}; + +#define KBASE_IOCTL_GET_GPUPROPS \ + _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) + +/** + * union kbase_ioctl_mem_alloc - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @out: Output parameters + * @out.flags: Flags + * @out.gpu_va: The GPU virtual address which is allocated + */ +union kbase_ioctl_mem_alloc { + struct { + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u64 flags; + } in; + struct { + __u64 flags; + __u64 gpu_va; + } out; +}; + +#define KBASE_IOCTL_MEM_ALLOC \ + _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) + +/** + * struct kbase_ioctl_mem_query - Query properties of a GPU memory region + * @in: Input parameters + * @in.gpu_addr: A GPU address contained within the region + * @in.query: The type of query + * @out: Output parameters + * @out.value: The result of the query + * + * Use a %KBASE_MEM_QUERY_xxx flag as input for @query. + */ +union kbase_ioctl_mem_query { + struct { + __u64 gpu_addr; + __u64 query; + } in; + struct { + __u64 value; + } out; +}; + +#define KBASE_IOCTL_MEM_QUERY \ + _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) + +#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) +#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) +#define KBASE_MEM_QUERY_FLAGS ((__u64)3) + +/** + * struct kbase_ioctl_mem_free - Free a memory region + * @gpu_addr: Handle to the region to free + */ +struct kbase_ioctl_mem_free { + __u64 gpu_addr; +}; + +#define KBASE_IOCTL_MEM_FREE \ + _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @buffer_count: requested number of dumping buffers + * @fe_bm: counters selection bitmask (Front end) + * @shader_bm: counters selection bitmask (Shader) + * @tiler_bm: counters selection bitmask (Tiler) + * @mmu_l2_bm: counters selection bitmask (MMU_L2) + * + * A fd is returned from the ioctl if successful, or a negative value on error + */ +struct kbase_ioctl_hwcnt_reader_setup { + __u32 buffer_count; + __u32 fe_bm; + __u32 shader_bm; + __u32 tiler_bm; + __u32 mmu_l2_bm; +}; + +#define KBASE_IOCTL_HWCNT_READER_SETUP \ + _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) + +/** + * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. + * @data: Counter samples for the dummy model. + * @size: Size of the counter sample data. + * @padding: Padding. 
+ */ +struct kbase_ioctl_hwcnt_values { + __u64 data; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_HWCNT_SET \ + _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) + +/** + * struct kbase_ioctl_disjoint_query - Query the disjoint counter + * @counter: A counter of disjoint events in the kernel + */ +struct kbase_ioctl_disjoint_query { + __u32 counter; +}; + +#define KBASE_IOCTL_DISJOINT_QUERY \ + _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) + +/** + * struct kbase_ioctl_get_ddk_version - Query the kernel version + * @version_buffer: Buffer to receive the kernel version string + * @size: Size of the buffer + * @padding: Padding + * + * The ioctl will return the number of bytes written into version_buffer + * (which includes a NULL byte) or a negative error code + * + * The ioctl request code has to be _IOW because the data in ioctl struct is + * being copied to the kernel, even though the kernel then writes out the + * version info to the buffer specified in the ioctl. + */ +struct kbase_ioctl_get_ddk_version { + __u64 version_buffer; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_GET_DDK_VERSION \ + _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) + +/** + * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 10.2--11.4) + * @va_pages: Number of VA pages to reserve for JIT + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_10_2 { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) + +/** + * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 11.5--11.19) + * @va_pages: Number of VA pages to reserve for JIT + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_11_5 { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) + +/** + * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory + * allocator + * @va_pages: Number of GPU virtual address pages to reserve for just-in-time + * memory allocations + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * @phys_pages: Maximum number of physical pages to allocate just-in-time + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. 
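+ *
+ * Illustrative call (editorial sketch, not part of the original header);
+ * 'fd' is assumed to be an already-open kbase device fd and the numeric
+ * values are arbitrary examples:
+ *
+ *   struct kbase_ioctl_mem_jit_init jit = {
+ *           .va_pages = 65536,
+ *           .max_allocations = 255,
+ *           .trim_level = 0,
+ *           .group_id = 0,
+ *           .phys_pages = 65536,
+ *   };
+ *   if (ioctl(fd, KBASE_IOCTL_MEM_JIT_INIT, &jit) < 0)
+ *           perror("KBASE_IOCTL_MEM_JIT_INIT");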
+ */ +struct kbase_ioctl_mem_jit_init { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; + __u64 phys_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) + +/** + * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory + * + * @handle: GPU memory handle (GPU VA) + * @user_addr: The address where it is mapped in user space + * @size: The number of bytes to synchronise + * @type: The direction to synchronise: 0 is sync to memory (clean), + * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_mem_sync { + __u64 handle; + __u64 user_addr; + __u64 size; + __u8 type; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_MEM_SYNC \ + _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) + +/** + * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer + * + * @in: Input parameters + * @in.gpu_addr: The GPU address of the memory region + * @in.cpu_addr: The CPU address to locate + * @in.size: A size in bytes to validate is contained within the region + * @out: Output parameters + * @out.offset: The offset from the start of the memory region to @cpu_addr + */ +union kbase_ioctl_mem_find_cpu_offset { + struct { + __u64 gpu_addr; + __u64 cpu_addr; + __u64 size; + } in; + struct { + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) + +/** + * struct kbase_ioctl_get_context_id - Get the kernel context ID + * + * @id: The kernel context ID + */ +struct kbase_ioctl_get_context_id { + __u32 id; +}; + +#define KBASE_IOCTL_GET_CONTEXT_ID \ + _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) + +/** + * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd + * + * @flags: Flags + * + * The ioctl returns a file descriptor when successful + */ +struct kbase_ioctl_tlstream_acquire { + __u32 flags; +}; + +#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ + _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) + +#define KBASE_IOCTL_TLSTREAM_FLUSH \ + _IO(KBASE_IOCTL_TYPE, 19) + +/** + * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region + * + * @gpu_addr: The memory region to modify + * @pages: The number of physical pages that should be present + * + * The ioctl may return on the following error codes or 0 for success: + * -ENOMEM: Out of memory + * -EINVAL: Invalid arguments + */ +struct kbase_ioctl_mem_commit { + __u64 gpu_addr; + __u64 pages; +}; + +#define KBASE_IOCTL_MEM_COMMIT \ + _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) + +/** + * union kbase_ioctl_mem_alias - Create an alias of memory regions + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.stride: Bytes between start of each memory region + * @in.nents: The number of regions to pack together into the alias + * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_alias { + struct { + __u64 flags; + __u64 stride; + __u64 nents; + __u64 aliasing_info; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_ALIAS \ + _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) + +/** + * union 
kbase_ioctl_mem_import - Import memory for use by the GPU + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.phandle: Handle to the external memory + * @in.type: Type of external memory, see base_mem_import_type + * @in.padding: Amount of extra VA pages to append to the imported buffer + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_import { + struct { + __u64 flags; + __u64 phandle; + __u32 type; + __u32 padding; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_IMPORT \ + _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) + +/** + * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region + * @gpu_va: The GPU region to modify + * @flags: The new flags to set + * @mask: Mask of the flags to modify + */ +struct kbase_ioctl_mem_flags_change { + __u64 gpu_va; + __u64 flags; + __u64 mask; +}; + +#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ + _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) + +/** + * struct kbase_ioctl_stream_create - Create a synchronisation stream + * @name: A name to identify this stream. Must be NULL-terminated. + * + * Note that this is also called a "timeline", but is named stream to avoid + * confusion with other uses of the word. + * + * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. + * + * The ioctl returns a file descriptor. + */ +struct kbase_ioctl_stream_create { + char name[32]; +}; + +#define KBASE_IOCTL_STREAM_CREATE \ + _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) + +/** + * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence + * @fd: The file descriptor to validate + */ +struct kbase_ioctl_fence_validate { + int fd; +}; + +#define KBASE_IOCTL_FENCE_VALIDATE \ + _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) + +/** + * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel + * @buffer: Pointer to the information + * @len: Length + * @padding: Padding + * + * The data provided is accessible through a debugfs file + */ +struct kbase_ioctl_mem_profile_add { + __u64 buffer; + __u32 len; + __u32 padding; +}; + +#define KBASE_IOCTL_MEM_PROFILE_ADD \ + _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) + +/** + * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to map + */ +struct kbase_ioctl_sticky_resource_map { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ + _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) + +/** + * struct kbase_ioctl_sticky_resource_unmap - Unmap a resource mapped which was + * previously permanently mapped + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to unmap + */ +struct kbase_ioctl_sticky_resource_unmap { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ + _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) + +/** + * union kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of + * the GPU memory region for + * the given gpu address and + * the offset of that address + * into the region + * @in: Input parameters + * @in.gpu_addr: GPU virtual address + * @in.size: Size in bytes within the 
region + * @out: Output parameters + * @out.start: Address of the beginning of the memory region enclosing @gpu_addr + * for the length of @offset bytes + * @out.offset: The offset from the start of the memory region to @gpu_addr + */ +union kbase_ioctl_mem_find_gpu_start_and_offset { + struct { + __u64 gpu_addr; + __u64 size; + } in; + struct { + __u64 start; + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) + +#define KBASE_IOCTL_CINSTR_GWT_START \ + _IO(KBASE_IOCTL_TYPE, 33) + +#define KBASE_IOCTL_CINSTR_GWT_STOP \ + _IO(KBASE_IOCTL_TYPE, 34) + +/** + * union kbase_ioctl_cinstr_gwt_dump - Used to collect all GPU write fault + * addresses. + * @in: Input parameters + * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. + * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) + * @in.len: Number of addresses the buffers can hold. + * @in.padding: padding + * @out: Output parameters + * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. + * @out.more_data_available: Status indicating if more addresses are available. + * @out.padding: padding + * + * This structure is used when performing a call to dump GPU write fault + * addresses. + */ +union kbase_ioctl_cinstr_gwt_dump { + struct { + __u64 addr_buffer; + __u64 size_buffer; + __u32 len; + __u32 padding; + + } in; + struct { + __u32 no_of_addr_collected; + __u8 more_data_available; + __u8 padding[27]; + } out; +}; + +#define KBASE_IOCTL_CINSTR_GWT_DUMP \ + _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) + +/** + * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone + * + * @va_pages: Number of VA pages to reserve for EXEC_VA + */ +struct kbase_ioctl_mem_exec_init { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_EXEC_INIT \ + _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) + +/** + * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of + * cpu/gpu time (counter values) + * @in: Input parameters + * @in.request_flags: Bit-flags indicating the requested types. + * @in.paddings: Unused, size alignment matching the out. + * @out: Output parameters + * @out.sec: Integer field of the monotonic time, unit in seconds. + * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. + * @out.padding: Unused, for __u64 alignment + * @out.timestamp: System wide timestamp (counter) value. + * @out.cycle_counter: GPU cycle counter value. + */ +union kbase_ioctl_get_cpu_gpu_timeinfo { + struct { + __u32 request_flags; + __u32 paddings[7]; + } in; + struct { + __u64 sec; + __u32 nsec; + __u32 padding; + __u64 timestamp; + __u64 cycle_counter; + } out; +}; + +#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ + _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) + +/** + * struct kbase_ioctl_context_priority_check - Check the max possible priority + * @priority: Input priority & output priority + */ + +struct kbase_ioctl_context_priority_check { + __u8 priority; +}; + +#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) + +/** + * struct kbase_ioctl_set_limited_core_count - Set the limited core count. 
+ * + * @max_core_count: Maximum core count + */ +struct kbase_ioctl_set_limited_core_count { + __u8 max_core_count; +}; + +#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ + _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) + +/** + * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter + * information + * @info_item_size: Performance counter item size in bytes. + * @info_item_count: Performance counter item count in the info_list_ptr. + * @info_list_ptr: Performance counter item list pointer which points to a + * list with info_item_count of items. + * + * On success: returns info_item_size and info_item_count if info_list_ptr is + * NULL, returns performance counter information if info_list_ptr is not NULL. + * On error: returns a negative error code. + */ +struct kbase_ioctl_kinstr_prfcnt_enum_info { + __u32 info_item_size; + __u32 info_item_count; + __u64 info_list_ptr; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ + _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) + +/** + * struct kbase_ioctl_kinstr_prfcnt_setup - Setup HWC dumper/reader + * @in: input parameters. + * @in.request_item_count: Number of requests in the requests array. + * @in.request_item_size: Size in bytes of each request in the requests array. + * @in.requests_ptr: Pointer to the requests array. + * @out: output parameters. + * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for + * each sample. + * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap + * for reading performance counter samples. + * + * A fd is returned from the ioctl if successful, or a negative value on error. + */ +union kbase_ioctl_kinstr_prfcnt_setup { + struct { + __u32 request_item_count; + __u32 request_item_size; + __u64 requests_ptr; + } in; + struct { + __u32 prfcnt_metadata_item_size; + __u32 prfcnt_mmap_size_bytes; + } out; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ + _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) + + +/** + * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes + * @bytes_collected: number of bytes read by user + * @bytes_generated: number of bytes generated by tracepoints + */ +struct kbase_ioctl_tlstream_stats { + __u32 bytes_collected; + __u32 bytes_generated; +}; + +#define KBASE_IOCTL_TLSTREAM_STATS \ + _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) + +#endif /* MALI_UNIT_TEST */ + +/* Customer extension range */ +#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) + +/* If the integration needs extra ioctl add them there + * like this: + * + * struct my_ioctl_args { + * .... 
+ * } + * + * #define KBASE_IOCTL_MY_IOCTL \ + * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) + */ + +#ifdef __cpluscplus +} +#endif + +#endif /* _UAPI_KBASE_IOCTL_H_ */ diff --git a/src/panfrost/base/include/old/mali-ioctl-midgard.h b/src/panfrost/base/include/old/mali-ioctl-midgard.h new file mode 100644 index 00000000000..5f33f5c4c4b --- /dev/null +++ b/src/panfrost/base/include/old/mali-ioctl-midgard.h @@ -0,0 +1,80 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef __KBASE_IOCTL_MIDGARD_H__ +#define __KBASE_IOCTL_MIDGARD_H__ + +#define KBASE_IOCTL_TYPE_BASE 0x80 +#define KBASE_IOCTL_TYPE_MAX 0x82 + +union kbase_ioctl_mem_alloc { + struct { + union kbase_ioctl_header header; + u64 va_pages; + u64 commit_pages; + u64 extension; + u64 flags; + } in; + struct { + union kbase_ioctl_header header; + u64 pad[3]; + u64 flags; + mali_ptr gpu_va; + u16 va_alignment; + } out; + u64 pad[7]; +} __attribute__((packed)); + +#define KBASE_IOCTL_TYPE_COUNT (KBASE_IOCTL_TYPE_MAX - KBASE_IOCTL_TYPE_BASE + 1) + +#define KBASE_IOCTL_GET_VERSION (_IOWR(0x80, 0, struct kbase_ioctl_get_version)) +#define KBASE_IOCTL_MEM_ALLOC (_IOWR(0x82, 0, union kbase_ioctl_mem_alloc)) +#define KBASE_IOCTL_MEM_IMPORT (_IOWR(0x82, 1, union kbase_ioctl_mem_import)) +#define KBASE_IOCTL_MEM_COMMIT (_IOWR(0x82, 2, struct kbase_ioctl_mem_commit)) +#define KBASE_IOCTL_MEM_QUERY (_IOWR(0x82, 3, struct kbase_ioctl_mem_query)) +#define KBASE_IOCTL_MEM_FREE (_IOWR(0x82, 4, struct kbase_ioctl_mem_free)) +#define KBASE_IOCTL_MEM_FLAGS_CHANGE (_IOWR(0x82, 5, struct kbase_ioctl_mem_flags_change)) +#define KBASE_IOCTL_MEM_ALIAS (_IOWR(0x82, 6, struct kbase_ioctl_mem_alias)) +#define KBASE_IOCTL_MEM_SYNC (_IOWR(0x82, 8, struct kbase_ioctl_mem_sync)) +#define KBASE_IOCTL_POST_TERM (_IOWR(0x82, 9, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_SETUP (_IOWR(0x82, 10, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_DUMP (_IOWR(0x82, 11, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_CLEAR (_IOWR(0x82, 12, __ioctl_placeholder)) +#define KBASE_IOCTL_GPU_PROPS_REG_DUMP (_IOWR(0x82, 14, struct kbase_ioctl_gpu_props_reg_dump)) +#define KBASE_IOCTL_FIND_CPU_OFFSET (_IOWR(0x82, 15, __ioctl_placeholder)) +#define KBASE_IOCTL_GET_VERSION_NEW (_IOWR(0x82, 16, struct kbase_ioctl_get_version)) +#define KBASE_IOCTL_SET_FLAGS (_IOWR(0x82, 18, struct kbase_ioctl_set_flags)) +#define KBASE_IOCTL_SET_TEST_DATA (_IOWR(0x82, 19, __ioctl_placeholder)) +#define KBASE_IOCTL_INJECT_ERROR (_IOWR(0x82, 20, __ioctl_placeholder)) +#define KBASE_IOCTL_MODEL_CONTROL (_IOWR(0x82, 21, __ioctl_placeholder)) +#define KBASE_IOCTL_KEEP_GPU_POWERED (_IOWR(0x82, 22, __ioctl_placeholder)) +#define KBASE_IOCTL_FENCE_VALIDATE (_IOWR(0x82, 23, __ioctl_placeholder)) +#define KBASE_IOCTL_STREAM_CREATE (_IOWR(0x82, 24, struct kbase_ioctl_stream_create)) +#define KBASE_IOCTL_GET_PROFILING_CONTROLS (_IOWR(0x82, 25, __ioctl_placeholder)) +#define KBASE_IOCTL_SET_PROFILING_CONTROLS (_IOWR(0x82, 26, __ioctl_placeholder)) +#define KBASE_IOCTL_DEBUGFS_MEM_PROFILE_ADD (_IOWR(0x82, 27, __ioctl_placeholder)) 
+#define KBASE_IOCTL_JOB_SUBMIT (_IOWR(0x82, 28, struct kbase_ioctl_job_submit)) +#define KBASE_IOCTL_DISJOINT_QUERY (_IOWR(0x82, 29, __ioctl_placeholder)) +#define KBASE_IOCTL_GET_CONTEXT_ID (_IOWR(0x82, 31, struct kbase_ioctl_get_context_id)) +#define KBASE_IOCTL_TLSTREAM_ACQUIRE_V10_4 (_IOWR(0x82, 32, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_TEST (_IOWR(0x82, 33, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_STATS (_IOWR(0x82, 34, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_FLUSH (_IOWR(0x82, 35, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_READER_SETUP (_IOWR(0x82, 36, __ioctl_placeholder)) +#define KBASE_IOCTL_SET_PRFCNT_VALUES (_IOWR(0x82, 37, __ioctl_placeholder)) +#define KBASE_IOCTL_SOFT_EVENT_UPDATE (_IOWR(0x82, 38, __ioctl_placeholder)) +#define KBASE_IOCTL_MEM_JIT_INIT (_IOWR(0x82, 39, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_ACQUIRE (_IOWR(0x82, 40, __ioctl_placeholder)) + +#endif /* __KBASE_IOCTL_MIDGARD_H__ */ diff --git a/src/panfrost/base/include/old/mali-ioctl.h b/src/panfrost/base/include/old/mali-ioctl.h new file mode 100644 index 00000000000..5c76f2dc8e5 --- /dev/null +++ b/src/panfrost/base/include/old/mali-ioctl.h @@ -0,0 +1,743 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +/** + * Definitions for all of the ioctls for the original open source bifrost GPU + * kernel driver, written by ARM. + */ + +#ifndef __KBASE_IOCTL_H__ +#define __KBASE_IOCTL_H__ + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int32_t s32; +typedef int64_t s64; + + +typedef u8 mali_atom_id; + +/** + * Since these structs are passed to and from the kernel we need to make sure + * that we get the size of each struct to match exactly what the kernel is + * expecting. So, when editing this file make sure to add static asserts that + * check each struct's size against the arg length you see in strace. + */ + +enum kbase_ioctl_mem_flags { + /* IN */ + BASE_MEM_PROT_CPU_RD = (1U << 0), /**< Read access CPU side */ + BASE_MEM_PROT_CPU_WR = (1U << 1), /**< Write access CPU side */ + BASE_MEM_PROT_GPU_RD = (1U << 2), /**< Read access GPU side */ + BASE_MEM_PROT_GPU_WR = (1U << 3), /**< Write access GPU side */ + BASE_MEM_PROT_GPU_EX = (1U << 4), /**< Execute allowed on the GPU + side */ + + BASE_MEM_GROW_ON_GPF = (1U << 9), /**< Grow backing store on GPU + Page Fault */ + + BASE_MEM_COHERENT_SYSTEM = (1U << 10), /**< Page coherence Outer + shareable, if available */ + BASE_MEM_COHERENT_LOCAL = (1U << 11), /**< Page coherence Inner + shareable */ + BASE_MEM_CACHED_CPU = (1U << 12), /**< Should be cached on the + CPU */ + + /* IN/OUT */ + BASE_MEM_SAME_VA = (1U << 13), /**< Must have same VA on both the GPU + and the CPU */ + /* OUT */ + BASE_MEM_NEED_MMAP = (1U << 14), /**< Must call mmap to acquire a GPU + address for the alloc */ + /* IN */ + BASE_MEM_COHERENT_SYSTEM_REQUIRED = (1U << 15), /**< Page coherence + Outer shareable, required. 
*/ + BASE_MEM_SECURE = (1U << 16), /**< Secure memory */ + BASE_MEM_DONT_NEED = (1U << 17), /**< Not needed physical + memory */ + BASE_MEM_IMPORT_SHARED = (1U << 18), /**< Must use shared CPU/GPU zone + (SAME_VA zone) but doesn't + require the addresses to + be the same */ +}; + +#define KBASE_IOCTL_MEM_FLAGS_IN_MASK \ + (BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | \ + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_GPU_EX | \ + BASE_MEM_GROW_ON_GPF | \ + BASE_MEM_COHERENT_SYSTEM | BASE_MEM_COHERENT_LOCAL | \ + BASE_MEM_CACHED_CPU | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_SECURE | \ + BASE_MEM_DONT_NEED | BASE_MEM_IMPORT_SHARED) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ull << 12) + +enum kbase_ioctl_coherency_mode { + COHERENCY_ACE_LITE = 0, + COHERENCY_ACE = 1, + COHERENCY_NONE = 31 +}; + +/* + * Mali Atom priority + * + * Only certain priority levels are actually implemented, as specified by the + * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority + * level that is not one of those defined below. + * + * Priority levels only affect scheduling between atoms of the same type within + * a mali context, and only after the atoms have had dependencies resolved. + * Fragment atoms does not affect non-frament atoms with lower priorities, and + * the other way around. For example, a low priority atom that has had its + * dependencies resolved might run before a higher priority atom that has not + * had its dependencies resolved. + * + * The scheduling between mali contexts/processes and between atoms from + * different mali contexts/processes is unaffected by atom priority. + * + * The atoms are scheduled as follows with respect to their priorities: + * - Let atoms 'X' and 'Y' be for the same job slot who have dependencies + * resolved, and atom 'X' has a higher priority than atom 'Y' + * - If atom 'Y' is currently running on the HW, then it is interrupted to + * allow atom 'X' to run soon after + * - If instead neither atom 'Y' nor atom 'X' are running, then when choosing + * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' + * - Any two atoms that have the same priority could run in any order with + * respect to each other. That is, there is no ordering constraint between + * atoms of the same priority. + */ +typedef u8 mali_jd_prio; +#define BASE_JD_PRIO_MEDIUM ((mali_jd_prio)0) +#define BASE_JD_PRIO_HIGH ((mali_jd_prio)1) +#define BASE_JD_PRIO_LOW ((mali_jd_prio)2) + +/** + * @brief Job dependency type. + * + * A flags field will be inserted into the atom structure to specify whether a + * dependency is a data or ordering dependency (by putting it before/after + * 'core_req' in the structure it should be possible to add without changing + * the structure size). When the flag is set for a particular dependency to + * signal that it is an ordering only dependency then errors will not be + * propagated. + */ +typedef u8 mali_jd_dep_type; +#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ +#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ +#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ + +/** + * @brief Job chain hardware requirements. + * + * A job chain must specify what GPU features it needs to allow the + * driver to schedule the job correctly. By not specifying the + * correct settings can/will cause an early job termination. Multiple + * values can be ORed together to specify multiple requirements. 
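+ * For instance (editorial example, not in the original header), a typical
+ * vertex/tiler job chain would pass BASE_JD_REQ_CS | BASE_JD_REQ_T.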
+ * Special case is ::BASE_JD_REQ_DEP, which is used to express complex + * dependencies, and that doesn't execute anything on the hardware. + */ +typedef u32 mali_jd_core_req; + +/* Requirements that come from the HW */ + +/** + * No requirement, dependency only + */ +#define BASE_JD_REQ_DEP ((mali_jd_core_req)0) + +/** + * Requires fragment shaders + */ +#define BASE_JD_REQ_FS ((mali_jd_core_req)1 << 0) + +/** + * Requires compute shaders + * This covers any of the following Midgard Job types: + * - Vertex Shader Job + * - Geometry Shader Job + * - An actual Compute Shader Job + * + * Compare this with @ref BASE_JD_REQ_ONLY_COMPUTE, which specifies that the + * job is specifically just the "Compute Shader" job type, and not the "Vertex + * Shader" nor the "Geometry Shader" job type. + */ +#define BASE_JD_REQ_CS ((mali_jd_core_req)1 << 1) +#define BASE_JD_REQ_T ((mali_jd_core_req)1 << 2) /**< Requires tiling */ +#define BASE_JD_REQ_CF ((mali_jd_core_req)1 << 3) /**< Requires cache flushes */ +#define BASE_JD_REQ_V ((mali_jd_core_req)1 << 4) /**< Requires value writeback */ + +/* SW-only requirements - the HW does not expose these as part of the job slot + * capabilities */ + +/* Requires fragment job with AFBC encoding */ +#define BASE_JD_REQ_FS_AFBC ((mali_jd_core_req)1 << 13) + +/** + * SW-only requirement: coalesce completion events. + * If this bit is set then completion of this atom will not cause an event to + * be sent to userspace, whether successful or not; completion events will be + * deferred until an atom completes which does not have this bit set. + * + * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. + */ +#define BASE_JD_REQ_EVENT_COALESCE ((mali_jd_core_req)1 << 5) + +/** + * SW Only requirement: the job chain requires a coherent core group. We don't + * mind which coherent core group is used. + */ +#define BASE_JD_REQ_COHERENT_GROUP ((mali_jd_core_req)1 << 6) + +/** + * SW Only requirement: The performance counters should be enabled only when + * they are needed, to reduce power consumption. + */ + +#define BASE_JD_REQ_PERMON ((mali_jd_core_req)1 << 7) + +/** + * SW Only requirement: External resources are referenced by this atom. When + * external resources are referenced no syncsets can be bundled with the atom + * but should instead be part of a NULL jobs inserted into the dependency + * tree. The first pre_dep object must be configured for the external + * resouces to use, the second pre_dep object can be used to create other + * dependencies. + * + * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE. + */ +#define BASE_JD_REQ_EXTERNAL_RESOURCES ((mali_jd_core_req)1 << 8) + +/** + * SW Only requirement: Software defined job. Jobs with this bit set will not + * be submitted to the hardware but will cause some action to happen within + * the driver + */ +#define BASE_JD_REQ_SOFT_JOB ((mali_jd_core_req)1 << 9) + +#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) +#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) +#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) + +/** + * SW Only requirement : Replay job. + * + * If the preceding job fails, the replay job will cause the jobs specified in + * the list of mali_jd_replay_payload pointed to by the jc pointer to be + * replayed. + * + * A replay job will only cause jobs to be replayed up to MALIP_JD_REPLAY_LIMIT + * times. 
If a job fails more than MALIP_JD_REPLAY_LIMIT times then the replay + * job is failed, as well as any following dependencies. + * + * The replayed jobs will require a number of atom IDs. If there are not enough + * free atom IDs then the replay job will fail. + * + * If the preceding job does not fail, then the replay job is returned as + * completed. + * + * The replayed jobs will never be returned to userspace. The preceding failed + * job will be returned to userspace as failed; the status of this job should + * be ignored. Completion should be determined by the status of the replay soft + * job. + * + * In order for the jobs to be replayed, the job headers will have to be + * modified. The Status field will be reset to NOT_STARTED. If the Job Type + * field indicates a Vertex Shader Job then it will be changed to Null Job. + * + * The replayed jobs have the following assumptions : + * + * - No external resources. Any required external resources will be held by the + * replay atom. + * - Pre-dependencies are created based on job order. + * - Atom numbers are automatically assigned. + * - device_nr is set to 0. This is not relevant as + * BASE_JD_REQ_SPECIFIC_COHERENT_GROUP should not be set. + * - Priority is inherited from the replay job. + */ +#define BASE_JD_REQ_SOFT_REPLAY (BASE_JD_REQ_SOFT_JOB | 0x4) +/** + * SW only requirement: event wait/trigger job. + * + * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. + * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the + * other waiting jobs. It completes immediately. + * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it + * possible for other jobs to wait upon. It completes immediately. + */ +#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) +#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) +#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) + +#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) + +/** + * SW only requirement: Just In Time allocation + * + * This job requests a JIT allocation based on the request in the + * @base_jit_alloc_info structure which is passed via the jc element of + * the atom. + * + * It should be noted that the id entry in @base_jit_alloc_info must not + * be reused until it has been released via @BASE_JD_REQ_SOFT_JIT_FREE. + * + * Should this soft job fail it is expected that a @BASE_JD_REQ_SOFT_JIT_FREE + * soft job to free the JIT allocation is still made. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) +/** + * SW only requirement: Just In Time free + * + * This job requests a JIT allocation created by @BASE_JD_REQ_SOFT_JIT_ALLOC + * to be freed. The ID of the JIT allocation is passed via the jc element of + * the atom. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) + +/** + * SW only requirement: Map external resource + * + * This job requests external resource(s) are mapped once the dependencies + * of the job have been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * @base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) +/** + * SW only requirement: Unmap external resource + * + * This job requests external resource(s) are unmapped once the dependencies + * of the job has been satisfied. 
The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * @base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) + +/** + * HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) + * + * This indicates that the Job Chain contains Midgard Jobs of the 'Compute + * Shaders' type. + * + * In contrast to @ref BASE_JD_REQ_CS, this does \b not indicate that the Job + * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. + */ +#define BASE_JD_REQ_ONLY_COMPUTE ((mali_jd_core_req)1 << 10) + +/** + * HW Requirement: Use the mali_jd_atom::device_nr field to specify a + * particular core group + * + * If both @ref BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag + * takes priority + * + * This is only guaranteed to work for @ref BASE_JD_REQ_ONLY_COMPUTE atoms. + * + * If the core availability policy is keeping the required core group turned + * off, then the job will fail with a @ref BASE_JD_EVENT_PM_EVENT error code. + */ +#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((mali_jd_core_req)1 << 11) + +/** + * SW Flag: If this bit is set then the successful completion of this atom + * will not cause an event to be sent to userspace + */ +#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((mali_jd_core_req)1 << 12) + +/** + * SW Flag: If this bit is set then completion of this atom will not cause an + * event to be sent to userspace, whether successful or not. + */ +#define BASE_JD_REQ_EVENT_NEVER ((mali_jd_core_req)1 << 14) + +/** + * SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job starts which does not have this bit set or a job completes + * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use if + * the CPU may have written to memory addressed by the job since the last job + * without this bit set was submitted. + */ +#define BASE_JD_REQ_SKIP_CACHE_START ((mali_jd_core_req)1 << 15) + +/** + * SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job completes which does not have this bit set or a job starts + * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_START bti set. Do not + * use if the CPU may read from or partially overwrite memory addressed by the + * job before the next job without this bit set completes. + */ +#define BASE_JD_REQ_SKIP_CACHE_END ((mali_jd_core_req)1 << 16) + +/** + * These requirement bits are currently unused in mali_jd_core_req + */ +#define MALIP_JD_REQ_RESERVED \ + (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ + BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | MALIP_JD_REQ_EVENT_NEVER | \ + BASE_JD_REQ_EVENT_COALESCE | \ + BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ + BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ + BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END)) + +/** + * Mask of all bits in mali_jd_core_req that control the type of the atom. + * + * This allows dependency only atoms to have flags set + */ +#define BASE_JD_REQ_ATOM_TYPE \ + (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ + BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) + +/** + * Mask of all bits in mali_jd_core_req that control the type of a soft job. 
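+ *
+ * As an illustration only (not part of the original header), the soft-job
+ * type of an atom would be recovered from its core_req along these lines:
+ *
+ *   if (core_req & BASE_JD_REQ_SOFT_JOB) {
+ *       mali_jd_core_req soft = core_req & BASE_JD_REQ_SOFT_JOB_TYPE;
+ *       if (soft == BASE_JD_REQ_SOFT_FENCE_WAIT)
+ *           wait_on_fence(atom);   // hypothetical helper, not in this file
+ *   }
+ *
+ * where core_req is the base_jd_atom_v2::core_req value of the atom.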
+ */ +#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) + +/* + * Returns non-zero value if core requirements passed define a soft job or + * a dependency only job. + */ +#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ + ((core_req & BASE_JD_REQ_SOFT_JOB) || \ + (core_req & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) + +/** + * @brief The payload for a replay job. This must be in GPU memory. + */ +struct mali_jd_replay_payload { + /** + * Pointer to the first entry in the mali_jd_replay_jc list. These + * will be replayed in @b reverse order (so that extra ones can be added + * to the head in future soft jobs without affecting this soft job) + */ + u64 tiler_jc_list; + + /** + * Pointer to the fragment job chain. + */ + u64 fragment_jc; + + /** + * Pointer to the tiler heap free FBD field to be modified. + */ + u64 tiler_heap_free; + + /** + * Hierarchy mask for the replayed fragment jobs. May be zero. + */ + u16 fragment_hierarchy_mask; + + /** + * Hierarchy mask for the replayed tiler jobs. May be zero. + */ + u16 tiler_hierarchy_mask; + + /** + * Default weight to be used for hierarchy levels not in the original + * mask. + */ + u32 hierarchy_default_weight; + + /** + * Core requirements for the tiler job chain + */ + mali_jd_core_req tiler_core_req; + + /** + * Core requirements for the fragment job chain + */ + mali_jd_core_req fragment_core_req; +}; + +/** + * @brief An entry in the linked list of job chains to be replayed. This must + * be in GPU memory. + */ +struct mali_jd_replay_jc { + /** + * Pointer to next entry in the list. A setting of NULL indicates the + * end of the list. + */ + u64 next; + + /** + * Pointer to the job chain. + */ + u64 jc; +}; + +typedef u64 mali_ptr; + +#define MALI_PTR_FMT "0x%" PRIx64 +#define MALI_SHORT_PTR_FMT "0x%" PRIxPTR + +#ifdef __LP64__ +#define PAD_CPU_PTR(p) p +#else +#define PAD_CPU_PTR(p) p; u32 :32; +#endif + +/* FIXME: Again, they don't specify any of these as packed structs. However, + * looking at these structs I'm worried that there is already spots where the + * compiler is potentially sticking in padding... + * Going to try something a little crazy, and just hope that our compiler + * happens to add the same kind of offsets since we can't really compare sizes + */ + +/* + * Blob provided by the driver to store callback driver, not actually modified + * by the driver itself + */ +struct mali_jd_udata { + u64 blob[2]; +}; + +struct mali_jd_dependency { + mali_atom_id atom_id; /**< An atom number */ + mali_jd_dep_type dependency_type; /**< Dependency type */ +}; + +#define MALI_EXT_RES_MAX 10 + +/* The original header never explicitly defines any values for these. 
In C, + * this -should- expand to SHARED == 0 and EXCLUSIVE == 1, so the only flag we + * actually need to decode here is EXCLUSIVE + */ +enum mali_external_resource_access { + MALI_EXT_RES_ACCESS_SHARED, + MALI_EXT_RES_ACCESS_EXCLUSIVE, +}; + +/* An aligned address to the resource | mali_external_resource_access */ +typedef u64 mali_external_resource; + +struct base_jd_atom_v2 { + mali_ptr jc; /**< job-chain GPU address */ + struct mali_jd_udata udata; /**< user data */ + u64 extres_list; /**< list of external resources */ + u16 nr_extres; /**< nr of external resources */ + u16 compat_core_req; /**< core requirements which + correspond to the legacy support + for UK 10.2 */ + struct mali_jd_dependency pre_dep[2]; /**< pre-dependencies, one need to + use SETTER function to assign + this field, this is done in + order to reduce possibility of + improper assigment of a + dependency field */ + mali_atom_id atom_number; /**< unique number to identify the + atom */ + mali_jd_prio prio; /**< Atom priority. Refer to @ref + mali_jd_prio for more details */ + u8 device_nr; /**< coregroup when + BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + specified */ + u8 :8; + mali_jd_core_req core_req; /**< core requirements */ +} __attribute__((packed)); + +/** + * enum mali_error - Mali error codes shared with userspace + * + * This is subset of those common Mali errors that can be returned to userspace. + * Values of matching user and kernel space enumerators MUST be the same. + * MALI_ERROR_NONE is guaranteed to be 0. + * + * @MALI_ERROR_NONE: Success + * @MALI_ERROR_OUT_OF_GPU_MEMORY: Not used in the kernel driver + * @MALI_ERROR_OUT_OF_MEMORY: Memory allocation failure + * @MALI_ERROR_FUNCTION_FAILED: Generic error code + */ +enum mali_error { + MALI_ERROR_NONE = 0, + MALI_ERROR_OUT_OF_GPU_MEMORY, + MALI_ERROR_OUT_OF_MEMORY, + MALI_ERROR_FUNCTION_FAILED, +}; + +/** + * Header used by all ioctls + */ +union kbase_ioctl_header { +#ifdef dvalin + u32 pad[0]; +#else + /* [in] The ID of the UK function being called */ + u32 id :32; + /* [out] The return value of the UK function that was called */ + enum mali_error rc :32; + + u64 :64; +#endif +} __attribute__((packed)); + +struct kbase_ioctl_get_version { + union kbase_ioctl_header header; + u16 major; /* [out] */ + u16 minor; /* [out] */ + u32 :32; +} __attribute__((packed)); + +struct mali_mem_import_user_buffer { + u64 ptr; + u64 length; +}; + +union kbase_ioctl_mem_import { + struct { + union kbase_ioctl_header header; + u64 phandle; + enum { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + BASE_MEM_IMPORT_TYPE_UMP = 1, + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3, + } type :32; + u32 :32; + u64 flags; + } in; + struct { + union kbase_ioctl_header header; + u64 pad[2]; + u64 flags; + u64 gpu_va; + u64 va_pages; + } out; +} __attribute__((packed)); + +struct kbase_ioctl_mem_commit { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_addr; + u64 pages; + /* [out] */ + u32 result_subcode; + u32 :32; +} __attribute__((packed)); + +enum kbase_ioctl_mem_query_type { + BASE_MEM_QUERY_COMMIT_SIZE = 1, + BASE_MEM_QUERY_VA_SIZE = 2, + BASE_MEM_QUERY_FLAGS = 3 +}; + +struct kbase_ioctl_mem_query { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_addr; + enum kbase_ioctl_mem_query_type query : 32; + u32 :32; + /* [out] */ + u64 value; +} __attribute__((packed)); + +struct kbase_ioctl_mem_free { + union kbase_ioctl_header header; + mali_ptr gpu_addr; /* [in] */ +} __attribute__((packed)); +/* FIXME: Size unconfirmed (haven't seen in a 
trace yet) */ + +struct kbase_ioctl_mem_flags_change { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_va; + u64 flags; + u64 mask; +} __attribute__((packed)); +/* FIXME: Size unconfirmed (haven't seen in a trace yet) */ + +struct kbase_ioctl_mem_alias { + union kbase_ioctl_header header; + /* [in/out] */ + u64 flags; + /* [in] */ + u64 stride; + u64 nents; + u64 ai; + /* [out] */ + mali_ptr gpu_va; + u64 va_pages; +} __attribute__((packed)); + +struct kbase_ioctl_mem_sync { + union kbase_ioctl_header header; + mali_ptr handle; + u64 user_addr; + u64 size; + enum { + MALI_SYNC_TO_DEVICE = 1, + MALI_SYNC_TO_CPU = 2, + } type :8; + u64 :56; +} __attribute__((packed)); + +struct kbase_ioctl_set_flags { + union kbase_ioctl_header header; + u32 create_flags; /* [in] */ + u32 :32; +} __attribute__((packed)); + +struct kbase_ioctl_stream_create { + union kbase_ioctl_header header; + /* [in] */ + char name[32]; + /* [out] */ + s32 fd; + u32 :32; +} __attribute__((packed)); + +struct kbase_ioctl_job_submit { + union kbase_ioctl_header header; + /* [in] */ + u64 addr; + u32 nr_atoms; + u32 stride; +} __attribute__((packed)); + +struct kbase_ioctl_get_context_id { + union kbase_ioctl_header header; + /* [out] */ + s64 id; +} __attribute__((packed)); + +#undef PAD_CPU_PTR + +enum base_jd_event_code { + BASE_JD_EVENT_DONE = 1, +}; + +struct base_jd_event_v2 { + enum base_jd_event_code event_code; + mali_atom_id atom_number; + struct mali_jd_udata udata; +}; + +/* Defined in mali-props.h */ +struct kbase_ioctl_gpu_props_reg_dump; + +/* For ioctl's we haven't written decoding stuff for yet */ +typedef struct { + union kbase_ioctl_header header; +} __ioctl_placeholder; + +#endif /* __KBASE_IOCTL_H__ */ diff --git a/src/panfrost/base/include/old/mali-props.h b/src/panfrost/base/include/old/mali-props.h new file mode 100644 index 00000000000..5b9d8723600 --- /dev/null +++ b/src/panfrost/base/include/old/mali-props.h @@ -0,0 +1,262 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef __MALI_PROPS_H__ +#define __MALI_PROPS_H__ + +#include "mali-ioctl.h" + +#define MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS 3 +#define MALI_GPU_MAX_JOB_SLOTS 16 +#define MALI_MAX_COHERENT_GROUPS 16 + +/* Capabilities of a job slot as reported by JS_FEATURES registers */ + +#define JS_FEATURE_NULL_JOB (1u << 1) +#define JS_FEATURE_SET_VALUE_JOB (1u << 2) +#define JS_FEATURE_CACHE_FLUSH_JOB (1u << 3) +#define JS_FEATURE_COMPUTE_JOB (1u << 4) +#define JS_FEATURE_VERTEX_JOB (1u << 5) +#define JS_FEATURE_GEOMETRY_JOB (1u << 6) +#define JS_FEATURE_TILER_JOB (1u << 7) +#define JS_FEATURE_FUSED_JOB (1u << 8) +#define JS_FEATURE_FRAGMENT_JOB (1u << 9) + +struct mali_gpu_core_props { + /** + * Product specific value. + */ + u32 product_id; + + /** + * Status of the GPU release. + * No defined values, but starts at 0 and increases by one for each + * release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + */ + u16 version_status; + + /** + * Minor release number of the GPU. "P" part of an "RnPn" release + * number. + * 8 bit values (0-255). 
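+ *
+ * For illustration (not in the original header): together with
+ * major_revision below, this is conventionally formatted as "r%up%u"
+ * (e.g. r0p0) to print the usual "RnPn" name.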
+ */ + u16 minor_revision; + + /** + * Major release number of the GPU. "R" part of an "RnPn" release + * number. + * 4 bit values (0-15). + */ + u16 major_revision; + + u16 :16; + + /** + * @usecase GPU clock speed is not specified in the Midgard + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() + * function. + */ + u32 gpu_speed_mhz; + + /** + * @usecase GPU clock max/min speed is required for computing + * best/worst case in tasks as job scheduling ant irq_throttling. (It + * is not specified in the Midgard Architecture). + */ + u32 gpu_freq_khz_max; + u32 gpu_freq_khz_min; + + /** + * Size of the shader program counter, in bits. + */ + u32 log2_program_counter_size; + + /** + * TEXTURE_FEATURES_x registers, as exposed by the GPU. This is a + * bitpattern where a set bit indicates that the format is supported. + * + * Before using a texture format, it is recommended that the + * corresponding bit be checked. + */ + u32 texture_features[MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + /** + * Theoretical maximum memory available to the GPU. It is unlikely + * that a client will be able to allocate all of this memory for their + * own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + */ + u64 gpu_available_memory_size; +}; + +struct mali_gpu_l2_cache_props { + u8 log2_line_size; + u8 log2_cache_size; + u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + u64 :40; +}; + +struct mali_gpu_tiler_props { + u32 bin_size_bytes; /* Max is 4*2^15 */ + u32 max_active_levels; /* Max is 2^15 */ +}; + +struct mali_gpu_thread_props { + u32 max_threads; /* Max. number of threads per core */ + u32 max_workgroup_size; /* Max. number of threads per workgroup */ + u32 max_barrier_size; /* Max. number of threads that can + synchronize on a simple barrier */ + u16 max_registers; /* Total size [1..65535] of the register + file available per core. */ + u8 max_task_queue; /* Max. tasks [1..255] which may be sent + to a core before it becomes blocked. */ + u8 max_thread_group_split; /* Max. allowed value [1..15] of the + Thread Group Split field. */ + enum { + MALI_GPU_IMPLEMENTATION_UNKNOWN = 0, + MALI_GPU_IMPLEMENTATION_SILICON = 1, + MALI_GPU_IMPLEMENTATION_FPGA = 2, + MALI_GPU_IMPLEMENTATION_SW = 3, + } impl_tech :8; + u64 :56; +}; + +/** + * @brief descriptor for a coherent group + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_ioctl_gpu_coherent_group { + u64 core_mask; /**< Core restriction mask required for the + group */ + u16 num_cores; /**< Number of cores in the group */ + u64 :48; +}; + +/** + * @brief Coherency group information + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. 
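+ *
+ * For illustration only (not part of the original header), the valid
+ * descriptors would typically be walked as:
+ *
+ *   for (u32 i = 0; i < info->num_groups; ++i)
+ *       total_cores += info->group[i].num_cores;
+ *
+ * where "info" points at this structure and "total_cores" is a
+ * caller-provided counter; note that num_groups, not num_core_groups,
+ * bounds the group[] array (see the member documentation below).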
+ */ +struct mali_gpu_coherent_group_info { + u32 num_groups; + + /** + * Number of core groups (coherent or not) in the GPU. Equivalent to + * the number of L2 Caches. + * + * The GPU Counter dumping writes 2048 bytes per core group, + * regardless of whether the core groups are coherent or not. Hence + * this member is needed to calculate how much memory is required for + * dumping. + * + * @note Do not use it to work out how many valid elements are in the + * group[] member. Use num_groups instead. + */ + u32 num_core_groups; + + /** + * Coherency features of the memory, accessed by @ref gpu_mem_features + * methods + */ + u32 coherency; + + u32 :32; + + /** + * Descriptors of coherent groups + */ + struct mali_ioctl_gpu_coherent_group group[MALI_MAX_COHERENT_GROUPS]; +}; + +/** + * A complete description of the GPU's Hardware Configuration Discovery + * registers. + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * @usecase The raw properties in @ref gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct mali_gpu_raw_props { + u64 shader_present; + u64 tiler_present; + u64 l2_present; + u64 stack_present; + + u32 l2_features; + u32 suspend_size; /* API 8.2+ */ + u32 mem_features; + u32 mmu_features; + + u32 as_present; + + u32 js_present; + u32 js_features[MALI_GPU_MAX_JOB_SLOTS]; + u32 tiler_features; + u32 texture_features[3]; + + u32 gpu_id; + + u32 thread_max_threads; + u32 thread_max_workgroup_size; + u32 thread_max_barrier_size; + u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + u32 coherency_mode; +}; + +struct kbase_ioctl_gpu_props_reg_dump { + union kbase_ioctl_header header; + struct mali_gpu_core_props core; + struct mali_gpu_l2_cache_props l2; + u64 :64; + struct mali_gpu_tiler_props tiler; + struct mali_gpu_thread_props thread; + + struct mali_gpu_raw_props raw; + + /** This must be last member of the structure */ + struct mali_gpu_coherent_group_info coherency_info; +} __attribute__((packed)); + +#endif diff --git a/src/panfrost/base/meson.build b/src/panfrost/base/meson.build new file mode 100644 index 00000000000..5d7b9f1dff9 --- /dev/null +++ b/src/panfrost/base/meson.build @@ -0,0 +1,55 @@ +# Copyright © 2018 Rob Clark +# Copyright © 2019 Collabora +# Copyright © 2022 Icecream95 + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libpanfrost_base_versions = ['0', '1', '2', '258'] +libpanfrost_base_per_arch = [] + +foreach ver : libpanfrost_base_versions + libpanfrost_base_per_arch += static_library( + 'pan-base-v' + ver, + 'pan_vX_base.c', + include_directories : [ + inc_src, inc_include, inc_gallium, inc_mesa, inc_gallium_aux, + include_directories('include'), + ], + c_args : ['-DPAN_BASE_VER=' + ver], + gnu_symbol_visibility : 'hidden', + dependencies: [dep_valgrind], +) +endforeach + +libpanfrost_base = static_library( + 'panfrost_base', + 'pan_base.c', + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw, + include_directories('include'), + ], + gnu_symbol_visibility : 'hidden', + build_by_default : false, + link_with: [libpanfrost_base_per_arch], +) + +libpanfrost_base_dep = declare_dependency( + link_with: [libpanfrost_base_per_arch, libpanfrost_base], + include_directories: [include_directories('.')], +) diff --git a/src/panfrost/base/pan_base.c b/src/panfrost/base/pan_base.c new file mode 100644 index 00000000000..22dc09cfb52 --- /dev/null +++ b/src/panfrost/base/pan_base.c @@ -0,0 +1,301 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/macros.h" +#include "pan_base.h" + +#include "mali_kbase_ioctl.h" + +bool +kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose) +{ + *k = (struct kbase_) {0}; + k->fd = fd; + k->cs_queue_count = cs_queue_count; + k->page_size = sysconf(_SC_PAGE_SIZE); + k->verbose = verbose; + + if (k->fd == -1) + return kbase_open_csf_noop(k); + + struct kbase_ioctl_version_check ver = { 0 }; + + if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK_RESERVED, &ver) == 0) { + return kbase_open_csf(k); + } else if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK, &ver) == 0) { + if (ver.major == 3) + return kbase_open_old(k); + else + return kbase_open_new(k); + } + + return false; +} + +/* If fd != -1, ownership is passed in */ +int +kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd) +{ + kbase_handle h = { + .va = va, + .fd = fd + }; + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + kbase_handle *handles = util_dynarray_begin(&k->gem_handles); + + for (unsigned i = 0; i < size; ++i) { + if (handles[i].fd == -2) { + handles[i] = h; + return i; + } + } + + util_dynarray_append(&k->gem_handles, kbase_handle, h); + + return size; +} + +int +kbase_alloc_gem_handle(kbase k, base_va va, int fd) +{ + pthread_mutex_lock(&k->handle_lock); + + int ret = kbase_alloc_gem_handle_locked(k, va, fd); + + pthread_mutex_unlock(&k->handle_lock); + + return ret; +} + +void +kbase_free_gem_handle(kbase k, int handle) +{ + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + int fd; + + if (handle >= size) { + pthread_mutex_unlock(&k->handle_lock); + return; + } + + if (handle + 1 < size) { + kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); + fd = ptr->fd; + ptr->fd = -2; + } else { + fd = (util_dynarray_pop(&k->gem_handles, kbase_handle)).fd; + } + + if (fd != -1) + close(fd); + + pthread_mutex_unlock(&k->handle_lock); +} + +kbase_handle +kbase_gem_handle_get(kbase k, int handle) +{ + kbase_handle h = { .fd = -1 }; + + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + if (handle < size) + h = *util_dynarray_element(&k->gem_handles, kbase_handle, handle); + + pthread_mutex_unlock(&k->handle_lock); + + return h; +} + +int +kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers) +{ + struct kbase_wait_ctx wait = kbase_wait_init(k, timeout_ns); + + while (kbase_wait_for_event(&wait)) { + pthread_mutex_lock(&k->handle_lock); + if (handle >= util_dynarray_num_elements(&k->gem_handles, kbase_handle)) { + pthread_mutex_unlock(&k->handle_lock); + kbase_wait_fini(wait); + errno = EINVAL; + return -1; + } + kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); + if (!ptr->use_count) { + pthread_mutex_unlock(&k->handle_lock); + kbase_wait_fini(wait); + return 0; + } + pthread_mutex_unlock(&k->handle_lock); + } + + kbase_wait_fini(wait); + errno = ETIMEDOUT; + return -1; +} + +static void +adjust_time(struct timespec *tp, int64_t ns) +{ + ns += tp->tv_nsec; + tp->tv_nsec = ns % 1000000000; + tp->tv_sec += ns / 1000000000; +} + +static int64_t +ns_until(struct timespec tp) +{ + struct timespec now; + clock_gettime(CLOCK_MONOTONIC, &now); + + int64_t sec = (tp.tv_sec - now.tv_sec) * 1000000000; + int64_t ns = tp.tv_nsec - 
now.tv_nsec; + + /* Clamp the value to zero to avoid errors from ppoll */ + return MAX2(sec + ns, 0); +} + +static void +kbase_wait_signal(kbase k) +{ + /* We must acquire the event condition lock, otherwise another + * thread could be between the trylock and the cond_wait, and + * not notice the broadcast. */ + pthread_mutex_lock(&k->event_cnd_lock); + pthread_cond_broadcast(&k->event_cnd); + pthread_mutex_unlock(&k->event_cnd_lock); +} + +struct kbase_wait_ctx +kbase_wait_init(kbase k, int64_t timeout_ns) +{ + struct timespec tp; + clock_gettime(CLOCK_MONOTONIC, &tp); + + adjust_time(&tp, timeout_ns); + + return (struct kbase_wait_ctx) { + .k = k, + .until = tp, + }; +} + +bool +kbase_wait_for_event(struct kbase_wait_ctx *ctx) +{ + kbase k = ctx->k; + + /* Return instantly the first time so that a check outside the + * wait_for_Event loop is not required */ + if (!ctx->has_cnd_lock) { + pthread_mutex_lock(&k->event_cnd_lock); + ctx->has_cnd_lock = true; + return true; + } + + if (!ctx->has_lock) { + if (pthread_mutex_trylock(&k->event_read_lock) == 0) { + ctx->has_lock = true; + pthread_mutex_unlock(&k->event_cnd_lock); + } else { + int ret = pthread_cond_timedwait(&k->event_cnd, + &k->event_cnd_lock, &ctx->until); + return ret != ETIMEDOUT; + } + } + + bool event = k->poll_event(k, ns_until(ctx->until)); + k->handle_events(k); + kbase_wait_signal(k); + return event; +} + +void +kbase_wait_fini(struct kbase_wait_ctx ctx) +{ + kbase k = ctx.k; + + if (ctx.has_lock) { + pthread_mutex_unlock(&k->event_read_lock); + kbase_wait_signal(k); + } else if (ctx.has_cnd_lock) { + pthread_mutex_unlock(&k->event_cnd_lock); + } +} + +void +kbase_ensure_handle_events(kbase k) +{ + /* If we don't manage to take the lock, then events have recently/will + * soon be handled, there is no need to do anything. */ + if (pthread_mutex_trylock(&k->event_read_lock) == 0) { + k->handle_events(k); + pthread_mutex_unlock(&k->event_read_lock); + kbase_wait_signal(k); + } +} + +bool +kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp) +{ + struct pollfd pfd = { + .fd = fd, + .events = wait_shared ? POLLOUT : POLLIN, + }; + + uint64_t timeout = ns_until(tp); + + struct timespec t = { + .tv_sec = timeout / 1000000000, + .tv_nsec = timeout % 1000000000, + }; + + int ret = ppoll(&pfd, 1, &t, NULL); + + if (ret == -1 && errno != EINTR) + perror("kbase_poll_fd_until"); + + return ret != 0; +} diff --git a/src/panfrost/base/pan_base.h b/src/panfrost/base/pan_base.h new file mode 100644 index 00000000000..878f7468433 --- /dev/null +++ b/src/panfrost/base/pan_base.h @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Library for interfacing with kbase */ +#ifndef PAN_BASE_H +#define PAN_BASE_H + +#include "util/u_dynarray.h" +#include "util/list.h" + +#define PAN_EVENT_SIZE 16 + +typedef uint64_t base_va; +struct base_ptr { + void *cpu; + base_va gpu; +}; + +struct kbase_syncobj; + +/* The job is done when the queue seqnum > seqnum */ +struct kbase_sync_link { + struct kbase_sync_link *next; /* must be first */ + uint64_t seqnum; + void (*callback)(void *); + void *data; +}; + +struct kbase_event_slot { + struct kbase_sync_link *syncobjs; + struct kbase_sync_link **back; + uint64_t last_submit; + uint64_t last; +}; + +struct kbase_context { + uint8_t csg_handle; + uint8_t kcpu_queue; + bool kcpu_init; // TODO: Always create a queue? + uint32_t csg_uid; + unsigned num_csi; + + unsigned tiler_heap_chunk_size; + base_va tiler_heap_va; + base_va tiler_heap_header; +}; + +struct kbase_cs { + struct kbase_context *ctx; + void *user_io; + base_va va; + unsigned size; + unsigned event_mem_offset; + unsigned csi; + + uint64_t last_insert; + + // TODO: This is only here because it's convenient for emit_csf_queue + uint32_t *latest_flush; +}; + +#define KBASE_SLOT_COUNT 2 + +typedef struct { + base_va va; + int fd; + uint8_t use_count; + /* For emulating implicit sync. TODO make this work on v10 */ + uint8_t last_access[KBASE_SLOT_COUNT]; +} kbase_handle; + +struct kbase_; +typedef struct kbase_ *kbase; + +struct kbase_ { + unsigned setup_state; + bool verbose; + + int fd; + unsigned api; + unsigned page_size; + // TODO: Actually we may want to try to pack multiple contexts / queue + // "sets" into a single group... + unsigned cs_queue_count; + + /* Must not hold handle_lock while acquiring event_read_lock */ + pthread_mutex_t handle_lock; + pthread_mutex_t event_read_lock; + pthread_mutex_t event_cnd_lock; + pthread_cond_t event_cnd; + /* TODO: Per-context/queue locks? */ + pthread_mutex_t queue_lock; + + struct list_head syncobjs; + + unsigned gpuprops_size; + void *gpuprops; + + void *tracking_region; + void *csf_user_reg; + struct base_ptr event_mem; + struct base_ptr kcpu_event_mem; + // TODO: dynamically size + struct kbase_event_slot event_slots[256]; + // TODO: USe a bitset? 
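+ /* The event_slots array above is consumed by the syncobj code: each
+  * kbase_syncobj fence records a (slot, value) pair and is treated as
+  * signalled once event_slots[slot].last moves past that value (see
+  * kbase_syncobj_update in pan_vX_base.c). */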
+ unsigned event_slot_usage; + + uint8_t atom_number; + + struct util_dynarray gem_handles; + struct util_dynarray atom_bos[256]; + uint64_t job_seq; + + void (*close)(kbase k); + + bool (*get_pan_gpuprop)(kbase k, unsigned name, uint64_t *value); + bool (*get_mali_gpuprop)(kbase k, unsigned name, uint64_t *value); + + struct base_ptr (*alloc)(kbase k, size_t size, + unsigned pan_flags, + unsigned mali_flags); + void (*free)(kbase k, base_va va); + + int (*import_dmabuf)(kbase k, int fd); + void *(*mmap_import)(kbase k, base_va va, size_t size); + + void (*cache_clean)(void *ptr, size_t size); + void (*cache_invalidate)(void *ptr, size_t size); + + /* Returns false on timeout */ + bool (*poll_event)(kbase k, int64_t timeout_ns); + bool (*handle_events)(kbase k); + + /* <= v9 GPUs */ + int (*submit)(kbase k, uint64_t va, unsigned req, + struct kbase_syncobj *o, + int32_t *handles, unsigned num_handles); + + /* >= v10 GPUs */ + struct kbase_context *(*context_create)(kbase k); + void (*context_destroy)(kbase k, struct kbase_context *ctx); + bool (*context_recreate)(kbase k, struct kbase_context *ctx); + + // TODO: Pass in a priority? + struct kbase_cs (*cs_bind)(kbase k, struct kbase_context *ctx, + base_va va, unsigned size); + void (*cs_term)(kbase k, struct kbase_cs *cs); + void (*cs_rebind)(kbase k, struct kbase_cs *cs); + + bool (*cs_submit)(kbase k, struct kbase_cs *cs, uint64_t insert_offset, + struct kbase_syncobj *o, uint64_t seqnum); + bool (*cs_wait)(kbase k, struct kbase_cs *cs, uint64_t extract_offset, + struct kbase_syncobj *o); + + int (*kcpu_fence_export)(kbase k, struct kbase_context *ctx); + bool (*kcpu_fence_import)(kbase k, struct kbase_context *ctx, int fd); + + bool (*kcpu_cqs_set)(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value); + bool (*kcpu_cqs_wait)(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value); + + /* syncobj functions */ + struct kbase_syncobj *(*syncobj_create)(kbase k); + void (*syncobj_destroy)(kbase k, struct kbase_syncobj *o); + struct kbase_syncobj *(*syncobj_dup)(kbase k, struct kbase_syncobj *o); + /* TODO: timeout? 
(and for cs_wait) */ + bool (*syncobj_wait)(kbase k, struct kbase_syncobj *o); + + /* Returns false if there are no active queues */ + bool (*callback_all_queues)(kbase k, int32_t *count, + void (*callback)(void *), void *data); + + void (*mem_sync)(kbase k, base_va gpu, void *cpu, size_t size, + bool invalidate); +}; + +bool kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose); + +/* Called from kbase_open */ +bool kbase_open_old(kbase k); +bool kbase_open_new(kbase k); +bool kbase_open_csf(kbase k); +bool kbase_open_csf_noop(kbase k); + +/* BO management */ +int kbase_alloc_gem_handle(kbase k, base_va va, int fd); +int kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd); +void kbase_free_gem_handle(kbase k, int handle); +kbase_handle kbase_gem_handle_get(kbase k, int handle); +int kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers); + +/* Event waiting */ +struct kbase_wait_ctx { + kbase k; + struct timespec until; + bool has_lock; + bool has_cnd_lock; +}; + +struct kbase_wait_ctx kbase_wait_init(kbase k, int64_t timeout_ns); +/* Returns false on timeout, kbase_wait_fini must still be called */ +bool kbase_wait_for_event(struct kbase_wait_ctx *ctx); +void kbase_wait_fini(struct kbase_wait_ctx ctx); + +void kbase_ensure_handle_events(kbase k); + +bool kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp); + +/* Must not conflict with PANFROST_BO_* flags */ +#define MALI_BO_CACHED_CPU (1 << 16) +#define MALI_BO_UNCACHED_GPU (1 << 17) + +#endif diff --git a/src/panfrost/base/pan_base_noop.h b/src/panfrost/base/pan_base_noop.h new file mode 100644 index 00000000000..750a445a995 --- /dev/null +++ b/src/panfrost/base/pan_base_noop.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef PAN_BASE_NOOP_H +#define PAN_BASE_NOOP_H + +/* For Mali-G610 as used in RK3588 */ +#define PROP(name, value) ((name << 2) | 2), value +static const uint32_t gpu_props[] = { + PROP(KBASE_GPUPROP_RAW_GPU_ID, 0xa8670000), + PROP(KBASE_GPUPROP_PRODUCT_ID, 0xa867), + PROP(KBASE_GPUPROP_RAW_SHADER_PRESENT, 0x50005), + PROP(KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, 0xc1ffff9e), + PROP(KBASE_GPUPROP_TLS_ALLOC, 0x800), + PROP(KBASE_GPUPROP_RAW_TILER_FEATURES, 0x809), +}; +#undef PROP + +#define NOOP_COOKIE_ALLOC 0x41000 +#define NOOP_COOKIE_USER_IO 0x42000 +#define NOOP_COOKIE_MEM_ALLOC 0x43000 + +static int +kbase_ioctl(int fd, unsigned long request, ...) +{ + int ret = 0; + + va_list args; + + va_start(args, request); + void *ptr = va_arg(args, void *); + va_end(args); + + switch (request) { + case KBASE_IOCTL_GET_GPUPROPS: { + struct kbase_ioctl_get_gpuprops *props = ptr; + + if (props->size) + memcpy((void *)(uintptr_t) props->buffer, + gpu_props, MIN2(props->size, sizeof(gpu_props))); + + ret = sizeof(gpu_props); + break; + } + + case KBASE_IOCTL_MEM_ALLOC: { + union kbase_ioctl_mem_alloc *alloc = ptr; + + alloc->out.gpu_va = NOOP_COOKIE_ALLOC; + alloc->out.flags = BASE_MEM_SAME_VA; + break; + } + + case KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6: { + union kbase_ioctl_cs_queue_group_create_1_6 *create = ptr; + + // TODO: Don't return duplicates? + create->out.group_handle = 0; + create->out.group_uid = 1; + break; + } + + case KBASE_IOCTL_CS_TILER_HEAP_INIT: { + union kbase_ioctl_cs_tiler_heap_init *init = ptr; + + /* The values don't really matter, the CPU has no business in accessing + * these. */ + init->out.gpu_heap_va = 0x60000; + init->out.first_chunk_va = 0x61000; + break; + } + + case KBASE_IOCTL_CS_QUEUE_BIND: { + union kbase_ioctl_cs_queue_bind *bind = ptr; + bind->out.mmap_handle = NOOP_COOKIE_USER_IO; + break; + } + + case KBASE_IOCTL_MEM_IMPORT: { + union kbase_ioctl_mem_import *import = ptr; + + if (import->in.type != BASE_MEM_IMPORT_TYPE_UMM) { + ret = -1; + errno = EINVAL; + break; + } + + int *fd = (int *)(uintptr_t) import->in.phandle; + + off_t size = lseek(*fd, 0, SEEK_END); + + import->out.flags = BASE_MEM_NEED_MMAP; + import->out.gpu_va = NOOP_COOKIE_MEM_ALLOC; + import->out.va_pages = DIV_ROUND_UP(size, 4096); + } + + case KBASE_IOCTL_SET_FLAGS: + case KBASE_IOCTL_MEM_EXEC_INIT: + case KBASE_IOCTL_MEM_JIT_INIT: + case KBASE_IOCTL_CS_QUEUE_REGISTER: + case KBASE_IOCTL_CS_QUEUE_KICK: + case KBASE_IOCTL_CS_TILER_HEAP_TERM: + case KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE: + case KBASE_IOCTL_MEM_SYNC: + break; + + default: + ret = -1; + errno = ENOSYS; + } + + return ret; +} + +static void * +kbase_mmap(void *addr, size_t length, int prot, int flags, + int fd, off_t offset) +{ + switch (offset) { + case BASE_MEM_MAP_TRACKING_HANDLE: + case BASEP_MEM_CSF_USER_REG_PAGE_HANDLE: + case NOOP_COOKIE_ALLOC: + case NOOP_COOKIE_USER_IO: + case NOOP_COOKIE_MEM_ALLOC: + return mmap(NULL, length, prot, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + default: + errno = ENOSYS; + return MAP_FAILED; + } +} +#endif diff --git a/src/panfrost/base/pan_cache.h b/src/panfrost/base/pan_cache.h new file mode 100644 index 00000000000..ad5af0c7098 --- /dev/null +++ b/src/panfrost/base/pan_cache.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to 
use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef PAN_CACHE_H +#define PAN_CACHE_H + +#ifdef __aarch64__ + +static void +cache_clean(volatile void *addr) +{ + __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); +} + +static void +cache_invalidate(volatile void *addr) +{ + __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); +} + +typedef void (*cacheline_op)(volatile void *addr); + +#define CACHELINE_SIZE 64 + +static void +cacheline_op_range(volatile void *start, size_t length, cacheline_op op) +{ + volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); + volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); + for (; ptr < end; ptr += CACHELINE_SIZE) + op(ptr); +} + +static void +cache_clean_range(volatile void *start, size_t length) +{ + /* TODO: Do an invalidate at the start of the range? */ + cacheline_op_range(start, length, cache_clean); +} + +static void +cache_invalidate_range(volatile void *start, size_t length) +{ + cacheline_op_range(start, length, cache_invalidate); +} + +#endif /* __aarch64__ */ + +/* The #ifdef covers both 32-bit and 64-bit ARM */ +#ifdef __ARM_ARCH +static void +cache_barrier(void) +{ + __asm__ volatile ("dsb sy" ::: "memory"); +} + +static void +memory_barrier(void) +{ + __asm__ volatile ("dmb sy" ::: "memory"); +} +#else + +/* TODO: How to do cache barriers when emulated? */ +static void +cache_barrier(void) +{ +} + +static void +memory_barrier(void) +{ +} +#endif +#endif diff --git a/src/panfrost/base/pan_vX_base.c b/src/panfrost/base/pan_vX_base.c new file mode 100644 index 00000000000..99bd356c536 --- /dev/null +++ b/src/panfrost/base/pan_vX_base.c @@ -0,0 +1,1825 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_VALGRIND +#include +#else +#define RUNNING_ON_VALGRIND 0 +#endif + +#include "util/macros.h" +#include "util/list.h" +#include "util/u_atomic.h" +#include "util/os_file.h" + +#include "pan_base.h" +#include "pan_cache.h" + +#include "drm-uapi/panfrost_drm.h" + +#define PAN_BASE_API (PAN_BASE_VER & 0xff) +#if (PAN_BASE_VER & 0x100) == 0x100 +#define PAN_BASE_NOOP +#endif + +#if PAN_BASE_API >= 2 +#include "csf/mali_gpu_csf_registers.h" + +#define MALI_USE_CSF 1 +#endif + +#include "mali_kbase_gpuprops.h" + +#ifndef PAN_BASE_NOOP +#define kbase_mmap mmap +#endif + +#if PAN_BASE_API >= 1 +#include "mali_base_kernel.h" +#include "mali_kbase_ioctl.h" + +#ifdef PAN_BASE_NOOP +#include "pan_base_noop.h" +#else +#define kbase_ioctl ioctl +#endif +#else + +#include "old/mali-ioctl.h" +#include "old/mali-ioctl-midgard.h" +#include "old/mali-props.h" +#endif + +#define LOG(fmt, ...) do { \ + if (k->verbose) { \ + struct timespec tp; \ + clock_gettime(CLOCK_MONOTONIC_RAW, &tp); \ + printf("%"PRIu64".%09li\t" fmt, (uint64_t) tp.tv_sec, tp.tv_nsec __VA_OPT__(,) __VA_ARGS__); \ + } \ + } while (0) + +#if PAN_BASE_API == 0 +static int +kbase_ioctl(int fd, unsigned long request, ...) +{ + int ioc_size = _IOC_SIZE(request); + + assert(ioc_size); + + va_list args; + + va_start(args, request); + int *ptr = va_arg(args, void *); + va_end(args); + + *ptr = (_IOC_TYPE(request) - 0x80) * 256 + _IOC_NR(request); + + int ret = ioctl(fd, request, ptr); + if (ret) + return ret; + + int r = *ptr; + switch (r) { + case MALI_ERROR_OUT_OF_GPU_MEMORY: + errno = ENOSPC; + return -1; + case MALI_ERROR_OUT_OF_MEMORY: + errno = ENOMEM; + return -1; + case MALI_ERROR_FUNCTION_FAILED: + errno = EINVAL; + return -1; + default: + return 0; + } +} +#endif + +#if PAN_BASE_API >= 1 +static bool +kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + int i = 0; + uint64_t x = 0; + while (i < k->gpuprops_size) { + x = 0; + memcpy(&x, k->gpuprops + i, 4); + i += 4; + + int size = 1 << (x & 3); + int this_name = x >> 2; + + x = 0; + memcpy(&x, k->gpuprops + i, size); + i += size; + + if (this_name == name) { + *value = x; + return true; + } + } + + return false; +} +#else +static bool +kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + struct kbase_ioctl_gpu_props_reg_dump *props = k->gpuprops; + + switch (name) { + case KBASE_GPUPROP_PRODUCT_ID: + *value = props->core.product_id; + return true; + case KBASE_GPUPROP_RAW_SHADER_PRESENT: + *value = props->raw.shader_present; + return true; + case KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0: + *value = props->raw.texture_features[0]; + return true; + case KBASE_GPUPROP_RAW_TILER_FEATURES: + *value = props->raw.tiler_features; + return true; + case KBASE_GPUPROP_RAW_GPU_ID: + *value = props->raw.gpu_id; + return true; + default: + return false; + } +} +#endif + +static bool +alloc_handles(kbase k) +{ + util_dynarray_init(&k->gem_handles, NULL); + return true; +} + +static bool +free_handles(kbase k) +{ + util_dynarray_fini(&k->gem_handles); + return true; +} + +static bool +set_flags(kbase k) +{ + struct kbase_ioctl_set_flags 
flags = { + .create_flags = 0 + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_SET_FLAGS, &flags); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); + return false; + } + return true; +} + +static bool +mmap_tracking(kbase k) +{ + k->tracking_region = kbase_mmap(NULL, k->page_size, PROT_NONE, + MAP_SHARED, k->fd, + BASE_MEM_MAP_TRACKING_HANDLE); + + if (k->tracking_region == MAP_FAILED) { + perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); + k->tracking_region = NULL; + return false; + } + return true; +} + +static bool +munmap_tracking(kbase k) +{ + if (k->tracking_region) + return munmap(k->tracking_region, k->page_size) == 0; + return true; +} + +#if PAN_BASE_API >= 1 +static bool +get_gpuprops(kbase k) +{ + struct kbase_ioctl_get_gpuprops props = { 0 }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); + return false; + } else if (!ret) { + fprintf(stderr, "GET_GPUPROPS returned zero size\n"); + return false; + } + + k->gpuprops_size = ret; + k->gpuprops = calloc(k->gpuprops_size, 1); + + props.size = k->gpuprops_size; + props.buffer = (uint64_t)(uintptr_t) k->gpuprops; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); + return false; + } + + return true; +} +#else +static bool +get_gpuprops(kbase k) +{ + k->gpuprops = calloc(1, sizeof(struct kbase_ioctl_gpu_props_reg_dump)); + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GPU_PROPS_REG_DUMP, k->gpuprops); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GPU_PROPS_REG_DUMP)"); + return false; + } + + return true; +} +#endif + +static bool +free_gpuprops(kbase k) +{ + free(k->gpuprops); + return true; +} + +#if PAN_BASE_API >= 2 +static bool +mmap_user_reg(kbase k) +{ + k->csf_user_reg = kbase_mmap(NULL, k->page_size, PROT_READ, + MAP_SHARED, k->fd, + BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); + + if (k->csf_user_reg == MAP_FAILED) { + perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); + k->csf_user_reg = NULL; + return false; + } + return true; +} + +static bool +munmap_user_reg(kbase k) +{ + if (k->csf_user_reg) + return munmap(k->csf_user_reg, k->page_size) == 0; + return true; +} +#endif + +#if PAN_BASE_API >= 1 +static bool +init_mem_exec(kbase k) +{ + struct kbase_ioctl_mem_exec_init init = { + .va_pages = 0x100000, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); + return false; + } + return true; +} + +static bool +init_mem_jit(kbase k) +{ + struct kbase_ioctl_mem_jit_init init = { + .va_pages = 1 << 25, + .max_allocations = 255, + .phys_pages = 1 << 25, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_JIT_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); + return false; + } + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static struct base_ptr +kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags); + +static bool +alloc_event_mem(kbase k) +{ + k->event_mem = kbase_alloc(k, k->page_size * 2, + PANFROST_BO_NOEXEC, + BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | + BASE_MEM_SAME_VA | BASE_MEM_CSF_EVENT); + k->kcpu_event_mem = (struct base_ptr) { + .cpu = k->event_mem.cpu + k->page_size, + .gpu = k->event_mem.gpu + k->page_size, + }; + return k->event_mem.cpu; +} + +static bool +free_event_mem(kbase k) +{ + if (k->event_mem.cpu) + return munmap(k->event_mem.cpu, 
k->page_size * 2) == 0; + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static bool +cs_group_create(kbase k, struct kbase_context *c) +{ + /* TODO: What about compute-only contexts? */ + union kbase_ioctl_cs_queue_group_create_1_6 create = { + .in = { + /* Mali *still* only supports a single tiler unit */ + .tiler_mask = 1, + .fragment_mask = ~0ULL, + .compute_mask = ~0ULL, + + .cs_min = k->cs_queue_count, + + .priority = 1, + .tiler_max = 1, + .fragment_max = 64, + .compute_max = 64, + } + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); + return false; + } + + c->csg_handle = create.out.group_handle; + c->csg_uid = create.out.group_uid; + + /* Should be at least 1 */ + assert(c->csg_uid); + + return true; +} + +static bool +cs_group_term(kbase k, struct kbase_context *c) +{ + if (!c->csg_uid) + return true; + + struct kbase_ioctl_cs_queue_group_term term = { + .group_handle = c->csg_handle + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); + return false; + } + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static bool +tiler_heap_create(kbase k, struct kbase_context *c) +{ + c->tiler_heap_chunk_size = 1 << 21; /* 2 MB */ + + union kbase_ioctl_cs_tiler_heap_init init = { + .in = { + .chunk_size = c->tiler_heap_chunk_size, + .initial_chunks = 5, + .max_chunks = 200, + .target_in_flight = 65535, + } + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); + return false; + } + + c->tiler_heap_va = init.out.gpu_heap_va; + c->tiler_heap_header = init.out.first_chunk_va; + + return true; +} + +static bool +tiler_heap_term(kbase k, struct kbase_context *c) +{ + if (!c->tiler_heap_va) + return true; + + struct kbase_ioctl_cs_tiler_heap_term term = { + .gpu_heap_va = c->tiler_heap_va + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); + return false; + } + return true; +} +#endif + +typedef bool (* kbase_func)(kbase k); + +struct kbase_op { + kbase_func part; + kbase_func cleanup; + const char *label; +}; + +static struct kbase_op kbase_main[] = { + { alloc_handles, free_handles, "Allocate handle array" }, +#if PAN_BASE_API >= 1 + { set_flags, NULL, "Set flags" }, +#endif + { mmap_tracking, munmap_tracking, "Map tracking handle" }, +#if PAN_BASE_API == 0 + { set_flags, NULL, "Set flags" }, +#endif + { get_gpuprops, free_gpuprops, "Get GPU properties" }, +#if PAN_BASE_API >= 2 + { mmap_user_reg, munmap_user_reg, "Map user register page" }, +#endif +#if PAN_BASE_API >= 1 + { init_mem_exec, NULL, "Initialise EXEC_VA zone" }, + { init_mem_jit, NULL, "Initialise JIT allocator" }, +#endif +#if PAN_BASE_API >= 2 + { alloc_event_mem, free_event_mem, "Allocate event memory" }, +#endif +}; + +static void +kbase_close(kbase k) +{ + while (k->setup_state) { + unsigned i = k->setup_state - 1; + if (kbase_main[i].cleanup) + kbase_main[i].cleanup(k); + --k->setup_state; + } + + pthread_mutex_destroy(&k->handle_lock); + pthread_mutex_destroy(&k->event_read_lock); + pthread_mutex_destroy(&k->event_cnd_lock); + pthread_mutex_destroy(&k->queue_lock); + pthread_cond_destroy(&k->event_cnd); + + close(k->fd); +} + +static bool +kbase_get_pan_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + 
unsigned conv[] = { + [DRM_PANFROST_PARAM_GPU_PROD_ID] = KBASE_GPUPROP_PRODUCT_ID, + [DRM_PANFROST_PARAM_SHADER_PRESENT] = KBASE_GPUPROP_RAW_SHADER_PRESENT, + [DRM_PANFROST_PARAM_TEXTURE_FEATURES0] = KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, + [DRM_PANFROST_PARAM_THREAD_TLS_ALLOC] = KBASE_GPUPROP_TLS_ALLOC, + [DRM_PANFROST_PARAM_TILER_FEATURES] = KBASE_GPUPROP_RAW_TILER_FEATURES, + }; + + if (name < ARRAY_SIZE(conv) && conv[name]) + return kbase_get_mali_gpuprop(k, conv[name], value); + + switch (name) { + case DRM_PANFROST_PARAM_AFBC_FEATURES: + *value = 0; + return true; + case DRM_PANFROST_PARAM_GPU_REVISION: { + if (!kbase_get_mali_gpuprop(k, KBASE_GPUPROP_RAW_GPU_ID, value)) + return false; + *value &= 0xffff; + return true; + } + default: + return false; + } +} + +static void +kbase_free(kbase k, base_va va) +{ + struct kbase_ioctl_mem_free f = { + .gpu_addr = va + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_FREE, &f); + + if (ret == -1) + perror("ioctl(KBASE_IOCTL_MEM_FREE)"); +} + +static struct base_ptr +kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags) +{ + struct base_ptr r = {0}; + + unsigned pages = DIV_ROUND_UP(size, k->page_size); + + union kbase_ioctl_mem_alloc a = { + .in = { + .va_pages = pages, + .commit_pages = pages, + } + }; + + size_t alloc_size = size; + unsigned flags = mali_flags; + bool exec_align = false; + + if (!flags) { + flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | + BASE_MEM_SAME_VA; + + /* Add COHERENT_LOCAL to keep GPU cores coherent with each + * other. */ + if (PAN_BASE_API >= 1) + flags |= BASE_MEM_COHERENT_LOCAL; + } + + if (pan_flags & PANFROST_BO_HEAP) { + size_t align_size = 2 * 1024 * 1024 / k->page_size; /* 2 MB */ + + a.in.va_pages = ALIGN_POT(a.in.va_pages, align_size); + a.in.commit_pages = 0; + a.in.extension = align_size; + flags |= BASE_MEM_GROW_ON_GPF; + } + +#if PAN_BASE_API >= 1 + if (pan_flags & MALI_BO_CACHED_CPU) + flags |= BASE_MEM_CACHED_CPU; +#endif + +#if PAN_BASE_API >= 2 + if (pan_flags & MALI_BO_UNCACHED_GPU) + flags |= BASE_MEM_UNCACHED_GPU; +#endif + + if (!(pan_flags & PANFROST_BO_NOEXEC)) { + /* Using SAME_VA for executable BOs would make it too likely + * for a blend shader to end up on the wrong side of a 4 GB + * boundary. */ + flags |= BASE_MEM_PROT_GPU_EX; + flags &= ~(BASE_MEM_PROT_GPU_WR | BASE_MEM_SAME_VA); + + if (PAN_BASE_API == 0) { + /* Assume 4K pages */ + a.in.va_pages = 0x1000; /* Align shader BOs to 16 MB */ + size = 1 << 26; /* Four times the alignment */ + exec_align = true; + } + } + + a.in.flags = flags; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_ALLOC, &a); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); + return r; + } + + // TODO: Is this always true, even in the face of multithreading? + if (PAN_BASE_API == 0) + a.out.gpu_va = 0x41000; + + if ((flags & BASE_MEM_SAME_VA) && + !((a.out.flags & BASE_MEM_SAME_VA) && + a.out.gpu_va < 0x80000)) { + + fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", + (uint64_t) a.out.flags, (uint64_t) a.out.gpu_va); + errno = EINVAL; + return r; + } + + void *ptr = kbase_mmap(NULL, size, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, a.out.gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU BO)"); + kbase_free(k, a.out.gpu_va); + return r; + } + + uint64_t gpu_va = (a.out.flags & BASE_MEM_SAME_VA) ? 
+ (uintptr_t) ptr : a.out.gpu_va; + + if (exec_align) { + gpu_va = ALIGN_POT(gpu_va, 1 << 24); + + ptr = kbase_mmap(NULL, alloc_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU EXEC BO)"); + kbase_free(k, gpu_va); + return r; + } + } + + r.cpu = ptr; + r.gpu = gpu_va; + + return r; +} + +static int +kbase_import_dmabuf(kbase k, int fd) +{ + int ret; + + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + kbase_handle *handles = util_dynarray_begin(&k->gem_handles); + + for (unsigned i = 0; i < size; ++i) { + kbase_handle h = handles[i]; + + if (h.fd < 0) + continue; + + ret = os_same_file_description(h.fd, fd); + + if (ret == 0) { + pthread_mutex_unlock(&k->handle_lock); + return i; + } else if (ret < 0) { + printf("error in os_same_file_description(%i, %i)\n", h.fd, fd); + } + } + + int dup = os_dupfd_cloexec(fd); + + union kbase_ioctl_mem_import import = { + .in = { + .phandle = (uintptr_t) &dup, + .type = BASE_MEM_IMPORT_TYPE_UMM, + /* Usage flags: CPU/GPU reads/writes */ + .flags = 0xf, + } + }; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_IMPORT, &import); + + int handle; + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_IMPORT)"); + handle = -1; + } else if (import.out.flags & BASE_MEM_NEED_MMAP) { + uint64_t va = (uintptr_t) kbase_mmap(NULL, import.out.va_pages * k->page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, k->fd, import.out.gpu_va); + + if (va == (uintptr_t) MAP_FAILED) { + perror("mmap(IMPORTED BO)"); + handle = -1; + } else { + handle = kbase_alloc_gem_handle_locked(k, va, dup); + } + } else { + handle = kbase_alloc_gem_handle_locked(k, import.out.gpu_va, dup); + } + + pthread_mutex_unlock(&k->handle_lock); + + return handle; +} + +static void * +kbase_mmap_import(kbase k, base_va va, size_t size) +{ + return kbase_mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, k->fd, va); +} + +struct kbase_fence { + struct list_head link; + + unsigned slot; + uint64_t value; +}; + +struct kbase_syncobj { + struct list_head link; + + struct list_head fences; +}; + +static struct kbase_syncobj * +kbase_syncobj_create(kbase k) +{ + struct kbase_syncobj *o = calloc(1, sizeof(*o)); + list_inithead(&o->fences); + pthread_mutex_lock(&k->queue_lock); + list_add(&o->link, &k->syncobjs); + pthread_mutex_unlock(&k->queue_lock); + return o; +} + +static void +kbase_syncobj_destroy(kbase k, struct kbase_syncobj *o) +{ + pthread_mutex_lock(&k->queue_lock); + list_del(&o->link); + pthread_mutex_unlock(&k->queue_lock); + + list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { + list_del(&fence->link); + free(fence); + } + + free(o); +} + +static void +kbase_syncobj_add_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) +{ + struct kbase_fence *fence = calloc(1, sizeof(*fence)); + + fence->slot = slot; + fence->value = value; + + list_add(&fence->link, &o->fences); +} + +static void +kbase_syncobj_update_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) +{ + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { + if (fence->slot == slot) { + if (value > fence->value) + fence->value = value; + + return; + } + } + + kbase_syncobj_add_fence(o, slot, value); +} + +static struct kbase_syncobj * +kbase_syncobj_dup(kbase k, struct kbase_syncobj *o) +{ + struct kbase_syncobj *dup = kbase_syncobj_create(k); + + pthread_mutex_lock(&k->queue_lock); + + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) + 
kbase_syncobj_add_fence(dup, fence->slot, fence->value); + + pthread_mutex_unlock(&k->queue_lock); + + return dup; +} + +static void +kbase_syncobj_update(kbase k, struct kbase_syncobj *o) +{ + list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { + uint64_t value = k->event_slots[fence->slot].last; + + if (value > fence->value) { + LOG("syncobj %p slot %u value %"PRIu64" vs %"PRIu64"\n", + o, fence->slot, fence->value, value); + + list_del(&fence->link); + free(fence); + } + } +} + +static bool +kbase_syncobj_wait(kbase k, struct kbase_syncobj *o) +{ + if (list_is_empty(&o->fences)) { + LOG("syncobj has no fences\n"); + return true; + } + + struct kbase_wait_ctx wait = kbase_wait_init(k, 1 * 1000000000LL); + + while (kbase_wait_for_event(&wait)) { + kbase_syncobj_update(k, o); + + if (list_is_empty(&o->fences)) { + kbase_wait_fini(wait); + return true; + } + } + + kbase_wait_fini(wait); + + fprintf(stderr, "syncobj %p wait timeout\n", o); + return false; +} + +static bool +kbase_poll_event(kbase k, int64_t timeout_ns) +{ + struct pollfd pfd = { + .fd = k->fd, + .events = POLLIN, + }; + + struct timespec t = { + .tv_sec = timeout_ns / 1000000000, + .tv_nsec = timeout_ns % 1000000000, + }; + + int ret = ppoll(&pfd, 1, &t, NULL); + + if (ret == -1 && errno != EINTR) + perror("poll(mali fd)"); + + LOG("poll returned %i\n", pfd.revents); + + return ret != 0; +} + +#if PAN_BASE_API < 2 +static bool +kbase_handle_events(kbase k) +{ + struct base_jd_event_v2 event; + bool ret = true; + + for (;;) { + int ret = read(k->fd, &event, sizeof(event)); + + if (ret == -1) { + if (errno == EAGAIN) { + return true; + } else { + perror("read(mali fd)"); + return false; + } + } + + if (event.event_code != BASE_JD_EVENT_DONE) { + fprintf(stderr, "Atom %i reported event 0x%x!\n", + event.atom_number, event.event_code); + ret = false; + } + + pthread_mutex_lock(&k->handle_lock); + + k->event_slots[event.atom_number].last = event.udata.blob[0]; + + unsigned size = util_dynarray_num_elements(&k->gem_handles, + kbase_handle); + kbase_handle *handle_data = util_dynarray_begin(&k->gem_handles); + + struct util_dynarray *handles = k->atom_bos + event.atom_number; + + util_dynarray_foreach(handles, int32_t, h) { + if (*h >= size) + continue; + assert(handle_data[*h].use_count); + --handle_data[*h].use_count; + } + util_dynarray_fini(handles); + + pthread_mutex_unlock(&k->handle_lock); + } + + return ret; +} + +#else + +static bool +kbase_read_event(kbase k) +{ + struct base_csf_notification event; + int ret = read(k->fd, &event, sizeof(event)); + + if (ret == -1) { + if (errno == EAGAIN) { + return true; + } else { + perror("read(mali_fd)"); + return false; + } + } + + if (ret != sizeof(event)) { + fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", + ret, (int) sizeof(event)); + return false; + } + + switch (event.type) { + case BASE_CSF_NOTIFICATION_EVENT: + LOG("Notification event!\n"); + return true; + + case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: + break; + + case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: + fprintf(stderr, "No event from mali_fd!\n"); + return true; + + default: + fprintf(stderr, "Unknown event type!\n"); + return true; + } + + struct base_gpu_queue_group_error e = event.payload.csg_error.error; + + switch (e.error_type) { + case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue group error: status 0x%x " + "sideband 0x%"PRIx64"\n", + e.payload.fatal_group.status, + (uint64_t) 
e.payload.fatal_group.sideband); + break; + } + case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { + unsigned queue = e.payload.fatal_queue.csi_index; + + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue %i error: status 0x%x " + "sideband 0x%"PRIx64"\n", + queue, e.payload.fatal_queue.status, + (uint64_t) e.payload.fatal_queue.sideband); + + /* TODO: Decode the instruct that it got stuck at */ + + break; + } + + case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: + fprintf(stderr, "Command stream timeout!\n"); + break; + case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: + fprintf(stderr, "Command stream OOM!\n"); + break; + default: + fprintf(stderr, "Unknown error type!\n"); + } + + return false; +} + +static void +kbase_update_queue_callbacks(kbase k, + struct kbase_event_slot *slot, + uint64_t seqnum) +{ + struct kbase_sync_link **list = &slot->syncobjs; + struct kbase_sync_link **back = slot->back; + + while (*list) { + struct kbase_sync_link *link = *list; + + LOG("seq %"PRIu64" %"PRIu64"\n", seqnum, link->seqnum); + + /* Items in the list should be in order, there is no need to + * check any more if we can't process this link yet. */ + if (seqnum <= link->seqnum) + break; + + LOG("done, calling %p(%p)\n", link->callback, link->data); + link->callback(link->data); + *list = link->next; + if (&link->next == back) + slot->back = list; + free(link); + } +} + +static bool +kbase_handle_events(kbase k) +{ +#ifdef PAN_BASE_NOOP + return true; +#endif + + /* This will clear the event count, so there's no need to do it in a + * loop. */ + bool ret = kbase_read_event(k); + + uint64_t *event_mem = k->event_mem.cpu; + + pthread_mutex_lock(&k->queue_lock); + + for (unsigned i = 0; i < k->event_slot_usage; ++i) { + uint64_t seqnum = event_mem[i * 2]; + uint64_t cmp = k->event_slots[i].last; + + LOG("MAIN SEQ %"PRIu64" > %"PRIu64"?\n", seqnum, cmp); + + if (seqnum < cmp) { + if (false) + fprintf(stderr, "seqnum at offset %i went backward " + "from %"PRIu64" to %"PRIu64"!\n", + i, cmp, seqnum); + } else /*if (seqnum > cmp)*/ { + kbase_update_queue_callbacks(k, &k->event_slots[i], + seqnum); + } + + /* TODO: Atomic operations? */ + k->event_slots[i].last = seqnum; + } + + pthread_mutex_unlock(&k->queue_lock); + + return ret; +} + +#endif + +#if PAN_BASE_API < 2 +static uint8_t +kbase_latest_slot(uint8_t a, uint8_t b, uint8_t newest) +{ + /* If a == 4 and newest == 5, a will become 255 */ + a -= newest; + b -= newest; + a = MAX2(a, b); + a += newest; + return a; +} + +static int +kbase_submit(kbase k, uint64_t va, unsigned req, + struct kbase_syncobj *o, + int32_t *handles, unsigned num_handles) +{ + struct util_dynarray buf; + util_dynarray_init(&buf, NULL); + + memcpy(util_dynarray_resize(&buf, int32_t, num_handles), + handles, num_handles * sizeof(int32_t)); + + pthread_mutex_lock(&k->handle_lock); + + unsigned slot = (req & PANFROST_JD_REQ_FS) ? 0 : 1; + unsigned dep_slots[KBASE_SLOT_COUNT]; + + uint8_t nr = k->atom_number++; + + struct base_jd_atom_v2 atom = { + .jc = va, + .atom_number = nr, + .udata.blob[0] = k->job_seq++, + }; + + for (unsigned i = 0; i < KBASE_SLOT_COUNT; ++i) + dep_slots[i] = nr; + + /* Make sure that we haven't taken an atom that's already in use. 
*/ + assert(!k->atom_bos[nr].data); + k->atom_bos[atom.atom_number] = buf; + + unsigned handle_buf_size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + kbase_handle *handle_buf = util_dynarray_begin(&k->gem_handles); + + struct util_dynarray extres; + util_dynarray_init(&extres, NULL); + + /* Mark the BOs as in use */ + for (unsigned i = 0; i < num_handles; ++i) { + int32_t h = handles[i]; + assert(h < handle_buf_size); + assert(handle_buf[h].use_count < 255); + + /* Implicit sync */ + if (handle_buf[h].use_count) + for (unsigned s = 0; s < KBASE_SLOT_COUNT; ++s) + dep_slots[s] = + kbase_latest_slot(dep_slots[s], + handle_buf[h].last_access[s], + nr); + + handle_buf[h].last_access[slot] = nr; + ++handle_buf[h].use_count; + + if (handle_buf[h].fd != -1) + util_dynarray_append(&extres, base_va, handle_buf[h].va); + } + + pthread_mutex_unlock(&k->handle_lock); + + /* TODO: Better work out the difference between handle_lock and + * queue_lock. */ + if (o) { + pthread_mutex_lock(&k->queue_lock); + kbase_syncobj_update_fence(o, nr, atom.udata.blob[0]); + pthread_mutex_unlock(&k->queue_lock); + } + + assert(KBASE_SLOT_COUNT == 2); + if (dep_slots[0] != nr) { + atom.pre_dep[0].atom_id = dep_slots[0]; + /* TODO: Use data dependencies? */ + atom.pre_dep[0].dependency_type = BASE_JD_DEP_TYPE_ORDER; + } + if (dep_slots[1] != nr) { + atom.pre_dep[1].atom_id = dep_slots[1]; + atom.pre_dep[1].dependency_type = BASE_JD_DEP_TYPE_ORDER; + } + + if (extres.size) { + atom.core_req |= BASE_JD_REQ_EXTERNAL_RESOURCES; + atom.nr_extres = util_dynarray_num_elements(&extres, base_va); + atom.extres_list = (uintptr_t) util_dynarray_begin(&extres); + } + + if (req & PANFROST_JD_REQ_FS) + atom.core_req |= BASE_JD_REQ_FS; + else + atom.core_req |= BASE_JD_REQ_CS | BASE_JD_REQ_T; + + struct kbase_ioctl_job_submit submit = { + .nr_atoms = 1, + .stride = sizeof(atom), + .addr = (uintptr_t) &atom, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_JOB_SUBMIT, &submit); + + util_dynarray_fini(&extres); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_JOB_SUBMIT)"); + return -1; + } + + return atom.atom_number; +} + +#else +static struct kbase_context * +kbase_context_create(kbase k) +{ + struct kbase_context *c = calloc(1, sizeof(*c)); + + if (!cs_group_create(k, c)) { + free(c); + return NULL; + } + + if (!tiler_heap_create(k, c)) { + cs_group_term(k, c); + free(c); + return NULL; + } + + return c; +} + +static void +kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx); + +static void +kbase_context_destroy(kbase k, struct kbase_context *ctx) +{ + kbase_kcpu_queue_destroy(k, ctx); + tiler_heap_term(k, ctx); + cs_group_term(k, ctx); + free(ctx); +} + +static bool +kbase_context_recreate(kbase k, struct kbase_context *ctx) +{ + kbase_kcpu_queue_destroy(k, ctx); + tiler_heap_term(k, ctx); + cs_group_term(k, ctx); + + if (!cs_group_create(k, ctx)) { + free(ctx); + return false; + } + + if (!tiler_heap_create(k, ctx)) { + free(ctx); + return false; + } + + return true; +} + +static struct kbase_cs +kbase_cs_bind_noevent(kbase k, struct kbase_context *ctx, + base_va va, unsigned size, unsigned csi) +{ + struct kbase_cs cs = { + .ctx = ctx, + .va = va, + .size = size, + .csi = csi, + .latest_flush = (uint32_t *)k->csf_user_reg, + }; + + struct kbase_ioctl_cs_queue_register reg = { + .buffer_gpu_addr = va, + .buffer_size = size, + .priority = 1, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_REGISTER, ®); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)"); + return cs; + } + + 
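+        /* Registering the queue only tells the kernel about the ring
+         * buffer; the queue also has to be bound to a queue group, and the
+         * mmap handle returned by the bind ioctl is what gets mapped below
+         * to reach the user I/O pages (doorbell and insert/extract
+         * pointers).
+         */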
union kbase_ioctl_cs_queue_bind bind = { + .in = { + .buffer_gpu_addr = va, + .group_handle = ctx->csg_handle, + .csi_index = csi, + } + }; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)"); + // hack + cs.user_io = (void *)1; + return cs; + } + + cs.user_io = + kbase_mmap(NULL, + k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, bind.out.mmap_handle); + + if (cs.user_io == MAP_FAILED) { + perror("mmap(CS USER IO)"); + cs.user_io = NULL; + } + + return cs; +} + +static struct kbase_cs +kbase_cs_bind(kbase k, struct kbase_context *ctx, + base_va va, unsigned size) +{ + struct kbase_cs cs = kbase_cs_bind_noevent(k, ctx, va, size, ctx->num_csi++); + + // TODO: Fix this problem properly + if (k->event_slot_usage >= 256) { + fprintf(stderr, "error: Too many contexts created!\n"); + + /* *very* dangerous, but might just work */ + --k->event_slot_usage; + } + + // TODO: This is a misnomer... it isn't a byte offset + cs.event_mem_offset = k->event_slot_usage++; + k->event_slots[cs.event_mem_offset].back = + &k->event_slots[cs.event_mem_offset].syncobjs; + + uint64_t *event_data = k->event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; + + /* We use the "Higher" wait condition, so initialise to 1 to allow + * waiting before writing... */ + event_data[0] = 1; + /* And reset the error field to 0, to avoid INHERITing faults */ + event_data[1] = 0; + + /* Just a zero-init is fine... reads and writes are always paired */ + uint64_t *kcpu_data = k->kcpu_event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; + kcpu_data[0] = 0; + kcpu_data[1] = 0; + + /* To match the event data */ + k->event_slots[cs.event_mem_offset].last = 1; + k->event_slots[cs.event_mem_offset].last_submit = 1; + + return cs; +} + +static void +kbase_cs_term(kbase k, struct kbase_cs *cs) +{ + if (cs->user_io) { + LOG("unmapping %p user_io %p\n", cs, cs->user_io); + munmap(cs->user_io, + k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES); + } + + struct kbase_ioctl_cs_queue_terminate term = { + .buffer_gpu_addr = cs->va, + }; + + kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_TERMINATE, &term); + + pthread_mutex_lock(&k->queue_lock); + kbase_update_queue_callbacks(k, &k->event_slots[cs->event_mem_offset], + ~0ULL); + + k->event_slots[cs->event_mem_offset].last = ~0ULL; + + /* Make sure that no syncobjs are referencing this CS */ + list_for_each_entry(struct kbase_syncobj, o, &k->syncobjs, link) + kbase_syncobj_update(k, o); + + + k->event_slots[cs->event_mem_offset].last = 0; + pthread_mutex_unlock(&k->queue_lock); +} + +static void +kbase_cs_rebind(kbase k, struct kbase_cs *cs) +{ + struct kbase_cs new; + new = kbase_cs_bind_noevent(k, cs->ctx, cs->va, cs->size, cs->csi); + + cs->user_io = new.user_io; + LOG("remapping %p user_io %p\n", cs, cs->user_io); + + fprintf(stderr, "bound csi %i again\n", cs->csi); +} + +static bool +kbase_cs_kick(kbase k, struct kbase_cs *cs) +{ + struct kbase_ioctl_cs_queue_kick kick = { + .buffer_gpu_addr = cs->va, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); + return false; + } + + return true; +} + +#define CS_RING_DOORBELL(cs) \ + *((uint32_t *)(cs->user_io)) = 1 + +#define CS_READ_REGISTER(cs, r) \ + *((uint64_t *)(cs->user_io + 4096 * 2 + r)) + +#define CS_WRITE_REGISTER(cs, r, v) \ + *((uint64_t *)(cs->user_io + 4096 + r)) = v + +static bool +kbase_cs_submit(kbase k, struct kbase_cs *cs, uint64_t 
insert_offset, + struct kbase_syncobj *o, uint64_t seqnum) +{ + LOG("submit %p, seq %"PRIu64", insert %"PRIu64" -> %"PRIu64"\n", + cs, seqnum, cs->last_insert, insert_offset); + + if (!cs->user_io) + return false; + + if (insert_offset == cs->last_insert) + return true; + +#ifndef PAN_BASE_NOOP + struct kbase_event_slot *slot = + &k->event_slots[cs->event_mem_offset]; + + pthread_mutex_lock(&k->queue_lock); + slot->last_submit = seqnum + 1; + + if (o) + kbase_syncobj_update_fence(o, cs->event_mem_offset, seqnum); + pthread_mutex_unlock(&k->queue_lock); +#endif + + memory_barrier(); + + bool active = CS_READ_REGISTER(cs, CS_ACTIVE); + LOG("active is %i\n", active); + + CS_WRITE_REGISTER(cs, CS_INSERT, insert_offset); + cs->last_insert = insert_offset; + + if (false /*active*/) { + memory_barrier(); + CS_RING_DOORBELL(cs); + memory_barrier(); + + active = CS_READ_REGISTER(cs, CS_ACTIVE); + LOG("active is now %i\n", active); + } else { + kbase_cs_kick(k, cs); + } + + return true; +} + +static bool +kbase_cs_wait(kbase k, struct kbase_cs *cs, uint64_t extract_offset, + struct kbase_syncobj *o) +{ + if (!cs->user_io) + return false; + + if (kbase_syncobj_wait(k, o)) + return true; + + uint64_t e = CS_READ_REGISTER(cs, CS_EXTRACT); + unsigned a = CS_READ_REGISTER(cs, CS_ACTIVE); + + fprintf(stderr, "CSI %i CS_EXTRACT (%"PRIu64") != %"PRIu64", " + "CS_ACTIVE (%i)\n", + cs->csi, e, extract_offset, a); + + fprintf(stderr, "fences:\n"); + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { + fprintf(stderr, " slot %i: seqnum %"PRIu64"\n", + fence->slot, fence->value); + } + + return false; +} + +static bool +kbase_kcpu_queue_create(kbase k, struct kbase_context *ctx) +{ +#ifdef PAN_BASE_NOOP + return false; +#endif + + if (ctx->kcpu_init) + return true; + + struct kbase_ioctl_kcpu_queue_new create = {0}; + + int ret; + ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_CREATE, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_CREATE)"); + return false; + } + + ctx->kcpu_queue = create.id; + ctx->kcpu_init = true; + return true; +} + +static void +kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx) +{ + if (!ctx->kcpu_init) + return; + + struct kbase_ioctl_kcpu_queue_delete destroy = { + .id = ctx->kcpu_queue, + }; + + int ret; + ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_DELETE, &destroy); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_DELETE)"); + } + + ctx->kcpu_init = false; +} + +static bool +kbase_kcpu_command(kbase k, struct kbase_context *ctx, struct base_kcpu_command *cmd) +{ + int err; + bool ret = true; + + if (!kbase_kcpu_queue_create(k, ctx)) + return false; + + struct kbase_ioctl_kcpu_queue_enqueue enqueue = { + .addr = (uintptr_t) cmd, + .nr_commands = 1, + .id = ctx->kcpu_queue, + }; + + err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); + if (err != -1) + return ret; + + /* If the enqueue failed, probably we hit the limit of enqueued + * commands (256), wait a bit and try again. 
+ */ + + struct kbase_wait_ctx wait = kbase_wait_init(k, 1000000000); + while (kbase_wait_for_event(&wait)) { + err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); + if (err != -1) + break; + + if (errno != EBUSY) { + ret = false; + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_ENQUEUE"); + break; + } + } + kbase_wait_fini(wait); + + return ret; +} + +static int +kbase_kcpu_fence_export(kbase k, struct kbase_context *ctx) +{ + struct base_fence fence = { + .basep.fd = -1, + }; + + struct base_kcpu_command fence_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + .info.fence.fence = (uintptr_t) &fence, + }; + + return kbase_kcpu_command(k, ctx, &fence_cmd) ? fence.basep.fd : -1; +} + +static bool +kbase_kcpu_fence_import(kbase k, struct kbase_context *ctx, int fd) +{ + struct base_kcpu_command fence_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + .info.fence.fence = (uintptr_t) &(struct base_fence) { + .basep.fd = fd, + }, + }; + + return kbase_kcpu_command(k, ctx, &fence_cmd); +} + +static bool +kbase_kcpu_cqs_set(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value) +{ + struct base_kcpu_command set_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + .info.cqs_set_operation = { + .objs = (uintptr_t) &(struct base_cqs_set_operation_info) { + .addr = addr, + .val = value, + .operation = BASEP_CQS_SET_OPERATION_SET, + .data_type = BASEP_CQS_DATA_TYPE_U64, + }, + .nr_objs = 1, + }, + }; + + return kbase_kcpu_command(k, ctx, &set_cmd); +} + +static bool +kbase_kcpu_cqs_wait(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value) +{ + struct base_kcpu_command wait_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + .info.cqs_wait_operation = { + .objs = (uintptr_t) &(struct base_cqs_wait_operation_info) { + .addr = addr, + .val = value, + .operation = BASEP_CQS_WAIT_OPERATION_GT, + .data_type = BASEP_CQS_DATA_TYPE_U64, + }, + .nr_objs = 1, + .inherit_err_flags = 0, + }, + }; + + return kbase_kcpu_command(k, ctx, &wait_cmd); +} +#endif + +// TODO: Only define for CSF kbases? +static bool +kbase_callback_all_queues(kbase k, int32_t *count, + void (*callback)(void *), void *data) +{ + pthread_mutex_lock(&k->queue_lock); + + int32_t queue_count = 0; + + for (unsigned i = 0; i < k->event_slot_usage; ++i) { + struct kbase_event_slot *slot = &k->event_slots[i]; + + /* There is no need to do anything for idle slots */ + if (slot->last == slot->last_submit) + continue; + + struct kbase_sync_link *link = malloc(sizeof(*link)); + *link = (struct kbase_sync_link) { + .next = NULL, + .seqnum = slot->last_submit, + .callback = callback, + .data = data, + }; + + // TODO: Put insertion code into its own function + struct kbase_sync_link **list = slot->back; + slot->back = &link->next; + assert(!*list); + *list = link; + + ++queue_count; + } + + p_atomic_add(count, queue_count); + + pthread_mutex_unlock(&k->queue_lock); + + return queue_count != 0; +} + +static void +kbase_mem_sync(kbase k, base_va gpu, void *cpu, size_t size, + bool invalidate) +{ +#ifdef __aarch64__ + /* Valgrind replaces the operations with DC CVAU, which is not enough + * for CPU<->GPU coherency. The ioctl can be used instead. */ + if (!RUNNING_ON_VALGRIND) { + /* I don't that memory barriers are needed here... having the + * DMB SY before submit should be enough. TODO what about + * dma-bufs? 
*/ + if (invalidate) + cache_invalidate_range(cpu, size); + else + cache_clean_range(cpu, size); + return; + } +#endif + + struct kbase_ioctl_mem_sync sync = { + .handle = gpu, + .user_addr = (uintptr_t) cpu, + .size = size, + .type = invalidate + (PAN_BASE_API == 0 ? 0 : 1), + }; + + int ret; + ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_SYNC, &sync); + if (ret == -1) + perror("ioctl(KBASE_IOCTL_MEM_SYNC)"); +} + +bool +#if defined(PAN_BASE_NOOP) +kbase_open_csf_noop +#elif PAN_BASE_API == 0 +kbase_open_old +#elif PAN_BASE_API == 1 +kbase_open_new +#elif PAN_BASE_API == 2 +kbase_open_csf +#endif +(kbase k) +{ + k->api = PAN_BASE_API; + + pthread_mutex_init(&k->handle_lock, NULL); + pthread_mutex_init(&k->event_read_lock, NULL); + pthread_mutex_init(&k->event_cnd_lock, NULL); + pthread_mutex_init(&k->queue_lock, NULL); + + pthread_condattr_t attr; + pthread_condattr_init(&attr); + pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + pthread_cond_init(&k->event_cnd, &attr); + pthread_condattr_destroy(&attr); + + list_inithead(&k->syncobjs); + + /* For later APIs, we've already checked the version in pan_base.c */ +#if PAN_BASE_API == 0 + struct kbase_ioctl_get_version ver = { 0 }; + kbase_ioctl(k->fd, KBASE_IOCTL_GET_VERSION, &ver); +#endif + + k->close = kbase_close; + + k->get_pan_gpuprop = kbase_get_pan_gpuprop; + k->get_mali_gpuprop = kbase_get_mali_gpuprop; + + k->alloc = kbase_alloc; + k->free = kbase_free; + k->import_dmabuf = kbase_import_dmabuf; + k->mmap_import = kbase_mmap_import; + + k->poll_event = kbase_poll_event; + k->handle_events = kbase_handle_events; + +#if PAN_BASE_API < 2 + k->submit = kbase_submit; +#else + k->context_create = kbase_context_create; + k->context_destroy = kbase_context_destroy; + k->context_recreate = kbase_context_recreate; + + k->cs_bind = kbase_cs_bind; + k->cs_term = kbase_cs_term; + k->cs_rebind = kbase_cs_rebind; + k->cs_submit = kbase_cs_submit; + k->cs_wait = kbase_cs_wait; + + k->kcpu_fence_export = kbase_kcpu_fence_export; + k->kcpu_fence_import = kbase_kcpu_fence_import; + k->kcpu_cqs_set = kbase_kcpu_cqs_set; + k->kcpu_cqs_wait = kbase_kcpu_cqs_wait; +#endif + + k->syncobj_create = kbase_syncobj_create; + k->syncobj_destroy = kbase_syncobj_destroy; + k->syncobj_dup = kbase_syncobj_dup; + k->syncobj_wait = kbase_syncobj_wait; + + k->callback_all_queues = kbase_callback_all_queues; + + k->mem_sync = kbase_mem_sync; + + for (unsigned i = 0; i < ARRAY_SIZE(kbase_main); ++i) { + ++k->setup_state; + if (!kbase_main[i].part(k)) { + k->close(k); + return false; + } + } + return true; +} diff --git a/src/panfrost/csf_test/interpret.py b/src/panfrost/csf_test/interpret.py new file mode 100644 index 00000000000..081d32d94c9 --- /dev/null +++ b/src/panfrost/csf_test/interpret.py @@ -0,0 +1,1820 @@ +#!/usr/bin/env python3 + +import os +import re +import struct +import subprocess +import sys + +try: + py_path = os.path.dirname(os.path.realpath(__file__)) + "/../bifrost/valhall" +except: + py_path = "../bifrost/valhall" + +if py_path not in sys.path: + sys.path.insert(0, py_path) + +import asm +import struct + +def ff(val): + return struct.unpack("=f", struct.pack("=I", val))[0] + +def ii(val): + return struct.unpack("=I", struct.pack("=f", val))[0] + +shaders = { + "atomic": """ +IADD_IMM.i32.reconverge r0, 0x0, #0x0 +NOP.wait0 +ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 +BRANCHZ.eq.reconverge ^r1.h0, offset:1 +BRANCHZ.eq 0x0, offset:3 +ATOM1_RETURN.i32.slot0.ainc @r1, u0, offset:0x0 +IADD_IMM.i32 r0, ^r0, #0x1 +BRANCHZ.eq.reconverge 0x0, offset:-7 +NOP.end +""", + 
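+    # Roughly: "atomic" above loops u2 times, atomically incrementing the
+    # 32-bit word at the address held in u0 (ATOM1_RETURN.ainc), while
+    # "rmw" below performs the same update as a plain load/add/store, so
+    # the two can be compared for lost updates.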
"rmw": """ +IADD_IMM.i32.reconverge r0, 0x0, #0x0 +ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 +BRANCHZ.eq.reconverge r1.h0, offset:1 +BRANCHZ.eq 0x0, offset:6 +NOP.wait1 +LOAD.i32.unsigned.slot0.wait0 @r1, u0, offset:0 +IADD_IMM.i32 r1, ^r1, #0x1 +STORE.i32.slot1 @r1, u0, offset:0 +IADD_IMM.i32 r0, ^r0, #0x1 +BRANCHZ.eq.reconverge 0x0, offset:-9 +NOP.end +""", + "global_invocation": """ +IADD_IMM.i32 r0, ^r60, #0x1 +STORE.i32.slot0.end @r0, u0, offset:0 +""", + "invoc_offset": """ +LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 +IADD.s32 r0, u0, ^r0 +ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 +IADD.s32 r1, ^r1, u1 +MOV.i32 r2, u2 +STORE.i32.slot0.end @r2, ^r0, offset:0 +""", + "invoc_rmw": """ +LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 +IADD.s32 r0, u0, ^r0 +ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 +IADD.s32 r1, ^r1, u1 +LOAD.i32.unsigned.slot0.wait0 @r2, r0, offset:0 +IADD.s32 r2, ^r2, u2 +STORE.i32.slot1.end @r2, ^r0, offset:0 +""", + + "preframe": """ +U16_TO_U32.discard r0, r59.h00 +U16_TO_U32 r1, ^r59.h10 +IADD_IMM.i32 r2, 0x0, #0x1 +IADD_IMM.i32 r3, 0x0, #0x0 +TEX_FETCH.slot0.skip.f.32.2d.wait @r4:r5:r6:r7, @r0:r1, ^r2 +FADD.f32 r4, ^r4, 0x40490FDB +FADD.f32 r5, ^r5, 0x40490FDB +BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 +""", + + + "position": """ +LEA_BUF_IMM.slot0.wait0 @r4:r5, r59, table:0xD, index:0x0 +#BRANCHZI.absolute 0x1000000, ^r4 +# position of 16384 +IADD_IMM.i32 r2, 0x0, #0x0e +# position of 16 +IADD_IMM.i32 r2, 0x0, #0x04 +LSHIFT_OR.i32 r0, 0x03020100.b1, r2, 0x0 +LSHIFT_AND.i32 r0, r60, r2, ^r0 +IADD_IMM.i32 r1, 0x0, #0x01 +RSHIFT_AND.i32 r1, r60, 0x03020100.b11, ^r1 +LSHIFT_OR.i32 r1, ^r1, ^r2, 0x0 +S32_TO_F32 r0, ^r0 +S32_TO_F32 r1, ^r1 + +RSHIFT_OR.i32 r2, ^r60, 0x03020100.b22, 0x0 +S32_TO_F32 r2, ^r2 +FADD.f32 r0, ^r0, r2.neg +#FADD.f32 r1, ^r1, ^r2 +S32_TO_F32 r2, ^r60 +#MOV.i32 r1, 0x0 + +FADD.f32 r0, ^r0, 0x40490FDB +FADD.f32 r1, ^r1, 0x40490FDB +#FMA.f32 r2, ^r2, 0x3DCCCCCD, 0x0 +MOV.i32 r2, 0x3DCCCCCD +MOV.i32 r3, 0x0 + +#STORE.i128.slot0 @r0:r1:r2:r3, thread_local_pointer, offset:0 + +IADD_IMM.i32 r8, 0x0, #0x00004000 +STORE.i16.istream.slot0 @r8, r4, offset:64 + +STORE.i128.istream.slot0 @r0:r1:r2:r3, r4, offset:0 +STORE.i128.slot0.end @r0:r1:r2:r3, ^r4, offset:0x7000 +""", + + "fragment": """ +ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, u0, offset:0 +IADD_IMM.i32 r1, 0x0, #0x1ff +LSHIFT_AND.i32 r0, ^r0, 0x0, ^r1 +SHADDX.u64 r2, u2, ^r0.w0, shift:0x2 +STORE.i32.slot0.wait0 @r59, ^r2, offset:0 + +IADD_IMM.i32 r4, 0x0, #0x3f100000 +IADD_IMM.i32 r5, 0x0, #0x3f400000 +IADD_IMM.i32 r6, 0x0, #0x3f300000 +IADD_IMM.i32 r7, 0x0, #0x32cccccd +BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 +""", + +} + +flg = 0xf +#flg = 0x20000f # Uncached! 
+ +HEAP_SIZE = 1024 * 1024 + +memory = { + "ev": (8192, 0x8200f), + "x": 1024 * 1024, + "y": 4096, + "ls_alloc": 4096, + "occlusion": 4096, + + "ssbo": 4096, + "tls": 4096, + + #"plane_0": (256 * 256 * 32, 0x380f), # 2 MB + "plane_0": (256 * 256 * 32, 0x280f), # 2 MB + + "idk": HEAP_SIZE, + "heap": HEAP_SIZE, +} + +w = 0xffffffff + +# Words are 32-bit, apart from address references +descriptors = { + "shader": [0x118, 1 << 12, "invoc_rmw"], + "ls": [3, 31, "ls_alloc"], + "fau": [("ssbo", 0), ("ssbo", 16)], + "fau2": [("ev", 8 + (0 << 34)), 7, 0], + + "tiler_heap": [ + 0x029, 1 << 21, #HEAP_SIZE, + 0x1000, 0x60, 0x1040, 0x60, 0x1000 + (1 << 21), 0x60 + #"heap", ("heap", 64), ("heap", HEAP_SIZE), + ], + +} | { + x: [ + 0, 0, + # Hierarchy mask, + # Single-sampled + # Last provoking vertex + 0x6 | (0 << 18), + 0x00ff00ff, + # Layer + 0, 0, + "tiler_heap", + ("idk", 0x10), + #("tiler_heap", -0xfff0), + # "Weights" + ] + ([0] * (32 - 10)) + [ + # "State" + 0, + 31, + 0, + 0x10000000, + ] for x in ("tiler_ctx", "tiler_ctx2", "tiler_ctx3") +} | { + + "thread_storage": [ + 1, 31, + "tls", + 0, 0, + ], + + # Preload r59/r60 + "preframe_shader": [0x128, 3 << 11, "preframe"], + "position_shader": [0x138, 3 << 11, "position"], + "fragment_shader": [0x128, 3 << 11, "fragment"], + + "idvs_zs": [ + 0x70077, # Depth/stencil type, Always for stencil tests + 0, 0, # Stencil state + 0, # unk + # Depth source minimum, write disabled + # [0, 1] Depth clamp + # Depth function: Always + (1 << 23) | (7 << 29), + 0, # Depth units + 0, # Depth factor + 0, # Depth bias clamp + ], + + "preframe_zs": [ + 0x70077, # Depth/stencil type, Always for stencil tests + 0, 0, # Stencil state + 0, # unk + # Depth source minimum, write disabled + # [0, 1] Depth clamp + # Depth function: Always + (1 << 23) | (7 << 29), + 0, # Depth units + 0, # Depth factor + 0, # Depth bias clamp + ], + + "idvs_blend": [ + # Load dest, enable + 1 | (1 << 9), + # RGB/Alpha: Src + Zero * Src + # All channels + ((2 | (2 << 4) | (1 << 8)) * 0x1001) | (0xf << 28), + # Fixed function blending, four components + 2 | (3 << 3), + # RGBA8 TB pixel format / F32 register format + 0 | (237 << 12) | (0 << 22) | (1 << 24), + ], + + "preframe_blend": [ + # Load dest, enable + 1 | (1 << 9), + # RGB/Alpha: Src + Zero * Src + # All channels + ((2 | (2 << 4) | (1 << 8)) * 0x1001) | (0xf << 28), + # Fixed function blending, four components + 2 | (3 << 3), + # RGBA8 TB pixel format / F32 register format + 0 | (237 << 12) | (0 << 22) | (1 << 24), + ], + + "preframe_surface": [ + # Plane descriptor, generic, tiled, RAW32 clump format + 10 | (1 << 4) | (1 << 8) | (2 << 24), + 256 * 256 * 4, + "plane_0", + 0, + 0, 0, + 0, # was 15, + ], + + "preframe_table": [ + # Texture descriptor, 2D, format + 2 | (2 << 4) | (187 << (10 + 12)), + # Width, height + 255 | (255 << 16), + # Swizzle, interleave + 1672 | (1 << 12), + 0, + "preframe_surface", + 0, 0, + + # Sampler descriptor, clamp to edge + 1 | (9 << 8) | (9 << 12) | (9 << 16), + 0, 0, 0, 0, 0, 0, 0, + ], + + "preframe_resources": [ + ("preframe_table", (1 << (32 + 24))), 0x40, 0, + ], + + "dcds": [ + # Clean fragment write, primitive barrier + (1 << 9) | (1 << 10), + # Sample mask of 0xffff, RT mask of 1 + 0x1ffff, + 0, 0, # vertex array + 0, 0, # unk + 0, 0x3f800000, # min/max depth + 0, 0, # unk + "preframe_zs", # depth/stencil + ("preframe_blend", 1), # blend (count == 1) + 0, 0, # occlusion + + # Shader environment: + 0, # Attribute offset + 2, # FAU count + 0, 0, 0, 0, 0, 0, # unk + ("preframe_resources", 1), # 
Resources + "preframe_shader", # Shader + 0, 0, # Thread storage + "fau", # FAU + ], + + "framebuffer": [ + 1, 0, # Pre/post, downscale, layer index + 0x10000, 0, # Argument + "ls_alloc", # Sample locations + "dcds", # DCDs + 0x00ff00ff, # width / height + 0, 0x00ff00ff, # bound min/max + # 32x32 tile size + # 4096 byte buffer allocation (maybe?) + (10 << 9) | (4 << 24), + 0, # Disable S, ZS/CRC, Empty Tile, CRC + 0, # Z Clear + "tiler_ctx", # Tiler + + # Framebuffer padding + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + # Render target + # R8G8B8A8 internal format + (1 << 26), + # Write Enable + # R8G8B8A8 colour format + # Linear block format + # 0123 swizzle + # Clean pixel write enable + 1 | (19 << 3) | (1 << 8) | (0o3210 << 16) | (1 << 31), + + # AFBC overlay + # No YTR, no split, no wide, no reverse, no front, no alpha + # RGBA8 compression mode + 0 | (10 << 10), + 0, 0, 0, 0, 0, + + # RT Buffer + "plane_0", + 256 * 4 * 16, # Row stride (for tiling) + 0x400, # Surface stride / Body offset + + # RT Clear + 0x2e234589, 0, 0, 0, + ], + + "index_buffer": [ + 0, 1, 2, + 0, 2, 1, + 1, 0, 2, + 1, 2, 0, + 2, 0, 1, + 2, 1, 0, + + #63, 64, 65, + 1, 2, 3, + 4, 5, 6, + 12, 13, 14, + 0, 1, 2, + 4, 5, 6, + 8, 9, 10, + 3, 4, 5, + ], + + "point_index": [x * 4 for x in range(32)] + [ + 0, 64, 440, 0, + ], + + "position_data": [ + ii(10.0), ii(10.0), ii(1.0), ii(1.0), + ], +} + +# TODO: Use mako? Or just change the syntax for "LDM/STM" +# and use f-strings again? + +cmds = """ +!cs 0 +resources fragment + +@ Bound min +mov w2a, i16:0,0 +@ Bound max +mov w2b, i16:255,255 +mov x28, $framebuffer+1 + +slot 2 + +fragment + +mov w4a, #0x0 +UNK 02 24, #0x4a0000ff0211 +wait 1 + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!raw sleep 20 +!memset plane_0 0 0 262144 +!raw sleep 200 +!dump plane_0 0 12 +!heatmap plane_0 0 262144 gran 4096 len 32768 stride 32768 +""" + +altcmds = """ +!cs 0 + +@ Some time is required for the change to become active +@ Just submitting a second job appears to be enough +resources compute fragment tiler idvs +mov x48, #0x6000000000 +heapctx x48 + +!cs 0 + +slot 3 +wait 3 +heapinc vt_start + +@ Base vertex count +mov w24, 0 +@ Instance count +mov w22, 1 + +@ Vertex attribute stride +mov x30, 0 + +@ Primitive +mov w38, 0x430000 +@@ Draw +@ Pixel kill etc. +@ Enable occlusion query +@mov w39, 0xc000 +mov w39, 0 +@ Unk... +mov w26, 0x1000 +@ Sample mask / render target mask +mov w3a, 0x1ffff +@ Min/max Z +mov w2c, float:0 +mov w2d, float:1.0 +@ Depth/stencil +mov x34, $idvs_zs +@ Blend +mov x32, $idvs_blend+1 +@ Occlusion +mov x2e, $occlusion + +@ Primitive size +mov x3c, float:3.75 +@ Fragment shader environment +mov x14, $fragment_shader +@ FAU count == 2 +movp x0c, $fau+0x0200000000000000 + +@ Position shader environment +mov x10, $position_shader + +mov x18, $thread_storage + +@ is this right?! "Vertex attribute stride" apparently? 
+@ that was for pure tiler jobs, for idvs it messes up points/lines +@ for some reason +@mov x30, $position_data + +@ Tiler +mov x28, $tiler_ctx + +@ Scissor min +mov w2a, i16:0,0 +@ Scissor max +mov w2b, i16:255,255 + +mov w21, 18 +mov w27, 4096 +mov x36, $index_buffer + +idvs 0x4002, mode triangles, index uint32 + +mov w21, 1 @36 +mov w27, 4096 +mov x36, $point_index + +@idvs 0x4a42, mode points, index uint32 + +mov w21, 400000 +mov w21, 18 +@idvs 0x4a42, mode triangles, index none + +@idvs 0x4a42, mode points, index none +@idvs 0x4a42, mode line-loop, index none + +flush_tiler +wait 3 +heapinc vt_end + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +UNK 00 24, #0x5f0000000233 +wait all + +!dump64 tiler_heap 0 4096 +@!dump idk 0 1048576 +@!dump position_data 0 4096 + +!cs 0 + +UNK 00 24, #0x5f0000000233 +wait all + +slot 4 +wait 4 +heapinc vt_start + +mov x28, $tiler_ctx2 +idvs 0x4002, mode triangles, index none +flush_tiler +wait 4 +heapinc vt_end + +UNK 00 24, #0x5f0000000233 +wait all + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!dump64 tiler_heap 0 4096 + +!cs 0 + +mov x50, $ev + +@ Bound min +mov w2a, i16:0,0 +@ Bound max +mov w2b, i16:255,255 +mov x28, $framebuffer+1 +@ Tile enable map +mov x2c, $x +mov x2e, 64 + +mov w40, 1 +str w40, [x2c] +@str w40, [x2c, 128] + +@ Use tile enable map +@fragment tem 1 + +fragment + +@ Does this actually do anytihng? +mov x48, $tiler_ctx +ldr x4a, [x48, 40] +ldr x4c, [x48, 48] +wait 0,4 +UNK 02 0b, 0x4a4c00100001 + +mov x48, $tiler_ctx2 +ldr x4a, [x48, 40] +ldr x4c, [x48, 48] +wait 0,4 +UNK 02 0b, 0x4a4c00100001 + +UNK 02 24, #0x5f0000f80211 +@UNK 00 24, #0x5f0000000233 +wait 1 + +mov x54, $plane_0 +ldr x56, [x54] +wait 0 + +mov x52, $y +str x56, [x52] + +evstr w5f, [x50], unk 0xfd, irq + +!raw td +!fdump heap 0 1048576 +!tiler heap 0 1048576 + + +@!dump rt_buffer 0 4096 +!dump y 0 4096 +@!dump plane_0 0 524288 +@!heatmap plane_0 0 524288 gran 0x80 len 0x200 stride 0x4000 +!heatmap plane_0 0 8192 gran 0x04 len 0x20 stride 0x400 +!dump occlusion 0 4096 +@!dump ssbo 0 4096 + +!dump64 tiler_heap 0 4096 +!dump tiler_ctx 0 4096 +!dump tiler_ctx2 0 4096 + +@!fdump heap 0 1048576 + +!cs 0 + +slot 3 +wait 3 +heapinc vt_start + +mov x28, $tiler_ctx3 +mov w2c, float:0 +mov w2d, float:1.0 +mov x2e, $occlusion + +idvs 0x4002, mode triangles, index none +flush_tiler +wait 3 +heapinc vt_end + +UNK 00 24, #0x5f0000000233 +wait all + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!dump64 tiler_heap 0 4096 +!dump tiler_ctx 0 4096 +!raw td + +""" + +docopy = """ +ldr {w00-w0f}, [x52] +ldr {w10-w1f}, [x52, 64] +ldr {w20-w2f}, [x52, 128] +ldr {w30-w3f}, [x52, 192] +add x52, x52, 256 + +loop: +wait 0 + +str {w00-w0f}, [x54] +ldr {w00-w0f}, [x52] +str {w10-w1f}, [x54, 64] +ldr {w10-w1f}, [x52, 64] +str {w20-w2f}, [x54, 128] +ldr {w20-w2f}, [x52, 128] +str {w30-w3f}, [x54, 192] +ldr {w30-w3f}, [x52, 192] + +add x54, x54, 256 +add x52, x52, 256 +add x50, x50, -256 + +b.ne w50, loop +b.ne w51, loop +""" + +oldcmds = f""" +!cs 0 + +mov x50, 0x8000000 + +mov x52, $from +mov x54, $to +mov x56, $x +mov x58, $ev +mov x5a, $y + +str cycles, [x56] +{docopy} +str cycles, [x56, 8] + +UNK 00 24, #0x5f0000000233 +evstr w5f, [x58], unk 0xfd, irq + +!cs 1 + +mov x50, 0x8000000 + +mov x52, $from +mov x54, $to +mov x56, $x +mov x58, $ev +mov x5a, $y + +add x52, x52, 0x8000000 +add x54, x54, 0x8000000 +add x56, x56, 32 + +nop +nop + +str cycles, [x56] +{docopy} +str cycles, [x56, 8] + +UNK 00 24, #0x5f0000000233 +evstr w5f, [x58], unk 0xfd, irq + +!delta x 0 4096 +""" + 
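+# Every command-stream instruction in the listings above is packed by the
+# interpreter further down into a single 64-bit word as
+# (cmd << 56) | (addr << 48) | value.  A minimal sketch of that encoding
+# for a few instructions, mirroring the corresponding branches of
+# Context.interpret() below (illustrative, not an exhaustive encoder):
+
+def encode_cs_word(cmd, addr, value):
+    assert value < (1 << 48)
+    return (cmd << 56) | (addr << 48) | value
+
+# "slot 2"     -> command 23, the payload is just the scoreboard slot index
+assert encode_cs_word(23, 0, 2) == 0x1700000000000002
+# "wait 3"     -> command 3, slot mask shifted into bits 16..23
+assert encode_cs_word(3, 0, (1 << 3) << 16) == 0x0300000000080000
+# "mov w5e, 1" -> command 2 (32-bit move immediate), destination register
+#                 number in the addr byte
+assert encode_cs_word(2, 0x5e, 1) == 0x025e000000000001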
+oldcmds = """ +!cs 0 +endpt compute +!cs 0 + +@ Workgroup size 1x1x1, merging allowed +mov w21, 0x80000000 + +@ Workgroup count 1x1x1 +mov w25, 1 +mov w26, 1 +mov w27, 1 + +@ Offset 0,0,0 +mov w22, 0 +mov w23, 0 +mov w24, 0 + +@ TODO: offset x/y/z + +@ Resources +mov x06, 0 + +@ Shader +mov x16, $shader + +@ Local storage +mov x1e, $ls + +@ FAU +movp x0e, $fau+0x0200000000000000 + +slot 2 +wait 2 + +UNK 0400000000008200 + +mov x58, $fau +ldr x56, [x58] +wait 0 + +@mov w4a, 0 + +@slot 6 +@mov x54, $x +@UNK 02 24, #0x4a0000f80211 +@ldr x52, [x56] +@wait 0,1 +@str x52, [x54] + +mov w40, 60 +1: add w40, w40, -1 + +@mov w4a, #0x0 +@UNK 02 24, #0x4a0000f80211 +@wait 1 + +@mov w54, #0 +@UNK 00 24, #0x540000000233 +@wait all + +slot 2 +wait 2 + +add w22, w22, 1 +@UNK 0400ff0000008200 + +@b.ne w40, 1b + +!dump x 0 4096 +!dump y 0 4096 +!dump ev 0 4096 +""" + +oldcmds = """ +!cs 0 + +mov x48, $x + +mov w21, 0x80000000 +mov w25, 1 +mov w26, 1 +mov w27, 1 + +movp x0e, $fau+0x0200000000000000 + +@ Write FAUs +@add x0e, x48, 64 +@mov x50, $ev +@str x50, [x0e] +@mov x30, 10 +@str x30, [x0e, 8] +@add w0f, w0f, 0x02000000 + +@ Write shader descriptor +@add x16, x48, 128 +@mov x30, 0x118 +@str x30, [x16] +@mov x30, $compute +@str x30, [x16, 8] + +wait 0 + +add x1e, x48, 192 + +mov x30, $y +@regdump x30 +@mov x30, 0 + +resources compute +slot 2 +mov w54, #0xffffe0 +UNK 00 24, #0x540000000233 + +wait all + +mov x54, 0 +mov w56, 0 +mov w5d, 1 + +slot 2 +wait 2 +wait 2 +regdump x30 +UNK 0400ff0000008200 +add x30, x30, 0x200 +regdump x30 +slot 2 +wait 2 + +mov w40, 1000 +1: add w40, w40, -1 +str cycles, [x50, 32] +b.ne w40, 1b + +wait 0 +wait all + +@ 6 / 10 / 14 +mov w40, 1 +1: add w40, w40, -1 +UNK 0400ff0000000200 +b.ne w40, 1b + +mov w40, 1000 +1: add w40, w40, -1 +str cycles, [x50, 32] +b.ne w40, 1b + +mov w42, 200 +mov w40, 100 +1: add w40, w40, -1 +@wait all +@UNK 0400ff0000008001 @ compute + +@UNK 0400ff0000000001 +@UNK 2501504200000004 @ evadd +@UNK 3 24, #0x4a0000000211 + +@wait all +b.ne w40, 1b + +@UNK 2601504200000004 + +str cycles, [x50, 40] +str cycles, [x50, 48] +UNK 02 24, #0x4a0000000211 +wait 0 + +add x5c, x50, 64 +evadd w5e, [x5c], unk 0xfd +evadd w5e, [x5c], unk 0xfd, irq, unk0 + +!dump x 0 4096 +!dump y 0 4096 +!delta ev 0 4096 +""" + +altcmds = """ +!cs 0 +!alloc x 4096 +!alloc ev 4096 0x8200f +!alloc ev2 4096 0x8200f + +mov x10, $x +UNK 00 30, #0x100000000000 +add x12, x10, 256 +str cycles, [x12] +mov x5a, $ev2 +mov x48, 0 +mov w4a, 0 +slot 3 +wait 3 +UNK 00 31, 0 +mov x48, $ev +mov w4a, 0x4321 +add x46, x48, 64 +mov w42, 0 + +str cycles, [x12, 8] +UNK 01 26, 0x484a00000005 +str cycles, [x12, 16] +UNK 01 26, 0x484a00000005 +str cycles, [x12, 24] + +nop + +mov w10, 10000 +1: +UNK 01 26, 0x484a00000005 +add w10, w10, -1 +b.ne w10, 1b +str cycles, [x12, 32] + +mov w10, 10000 +1: +UNK 01 26, 0x484a00000005 +@UNK 02 24, #0x420000000211 +add w10, w10, -1 +b.ne w10, 1b +str cycles, [x12, 40] + +ldr x16, [x48, 0] +wait 0 +str x16, [x48, 16] + +UNK 00 31, 0x100000000 + +mov w4a, #0x0 +UNK 02 24, #0x4a0000000211 + +mov w5e, 1 +add x5c, x5a, 0x100 +UNK 01 25, 0x5c5e00f80001 + +!delta x 0 4096 +!dump ev 0 4096 +!dump ev2 0 4096 +""" + +altcmds = """ +!cs 0 +!alloc x 4096 +!alloc ev 4096 0x8200f + +iter vertex +slot 2 + +mov x40, $x +mov w10, 1 +mov x48, 0 +mov w4a, 0 +call w4a, x48 + nop + nop + nop + mov x20, $. 
+@ movp x22, 0x0126000011223344 + movp x22, 0x1600000060000001 + str x22, [x20, 56] + 1: nop + b 1b + nop + add x40, x40, #256 + regdump x40 + +mov x5a, #0x5ff7fd6000 +mov x48, $ev +mov x40, #0x5ff7fd6000 +mov w54, #0x1 +UNK 00 24, #0x540000000233 +wait 0 +slot 6 +@UNK 00 31, #0x0 +UNK 00 09, #0x0 +wait 6 +@UNK 00 31, #0x100000000 +mov x4a, x40 +UNK 01 26, 0x484a00040001 + +!dump x 0 4096 +@!dump ev 0 4096 +@!delta x 0 4096 +""" + +cycletest = """ +mov w10, 10 +1: +str cycles, [x5c] +add x5c, x5c, 8 +add w10, w10, -1 +mov w11, 100000 + +inner: +add w11, w11, -1 +b.ne w11, inner + +b.ne w10, 1b +""" + +def get_cmds(cmd): + return cmds.replace("{cmd}", str(cmd)) + +def assemble_shader(text): + lines = text.strip().split("\n") + lines = [l for l in lines if len(l) > 0 and l[0] not in "#@"] + return [asm.parse_asm(ln) for ln in lines] + +class Buffer: + id = 0 + + def __init__(self): + self.id = Buffer.id + Buffer.id += 1 + +def resolve_rel(to, branch): + return (to - branch) // 8 - 1 + +def to_int16(value): + assert(value < 36768) + assert(value >= -32768) + return value & 0xffff + +class Level(Buffer): + def __init__(self, indent): + super().__init__() + + self.indent = indent + self.buffer = [] + self.call_addr_offset = None + self.call_len_offset = None + + self.labels = {} + self.label_refs = [] + # Numeric labels can be reused, so have to be handled specially. + self.num_labels = {} + self.num_refs = {} + + def offset(self): + return len(self.buffer) * 8 + + def __repr__(self): + buf = " ".join(hex(x) for x in self.buffer) + return f"buffer {self.id} {self.offset()} 0x200f {buf}" + + def buffer_add_value(self, offset, value): + self.buffer[offset // 8] += value + + def process_relocs(self, refs, to=None): + for ref, offset, type_ in refs: + assert(type_ == "rel") + + if to is None: + goto = self.labels[ref] + else: + goto = to + + value = to_int16(resolve_rel(goto, offset)) + self.buffer_add_value(offset, value) + + def finish(self): + self.process_relocs(self.label_refs) + +class Alloc(Buffer): + def __init__(self, size, flags=0x280f): + super().__init__() + + self.size = size + self.flags = flags + self.buffer = [] + + def __repr__(self): + buf = " ".join(hex(x) for x in self.buffer) + return f"buffer {self.id} {self.size} {hex(self.flags)} {buf}" + +def fmt_reloc(r, name="reloc"): + dst, offset, src, src_offset = r + return f"{name} {dst}+{offset} {src}+{src_offset}" + +def fmt_exe(e): + return " ".join(str(x) for x in e) + +class Context: + def __init__(self): + self.levels = [] + self.l = None + + self.allocs = {} + self.completed = [] + self.reloc = [] + self.reloc_split = [] + + self.exe = [] + self.last_exe = None + + self.is_call = False + + def set_l(self): + if len(self.levels): + self.l = self.levels[-1] + + def pop_until(self, indent): + while self.l.indent != indent: + l = self.levels.pop() + self.completed.append(l) + + self.set_l() + if not len(self.levels): + return + + buf_len = l.offset() + + r = self.l + self.reloc.append((r.id, r.call_addr_offset * 8, l.id, 0)) + r.buffer[r.call_len_offset] = ( + (r.buffer[r.call_len_offset] & (0xffff << 48)) + + buf_len) + r.buffer[r.call_addr_offset] &= (0xffff << 48) + + r.call_addr_offset = None + r.call_len_offset = None + + def flush_exe(self): + ind = self.levels[0].indent + + self.pop_until(ind) + if len(self.levels[0].buffer): + l = self.levels.pop() + l.finish() + self.completed.append(l) + + self.levels.append(Level(ind)) + self.set_l() + + if not len(self.exe): + return + + if self.last_exe is None: + print("# Trying to 
add multiple CSs to an exe line, becoming confused") + return + + if len(self.completed): + p = self.completed[-1] + assert(p.indent == ind) + + self.exe[self.last_exe] += [p.id, p.offset()] + + self.last_exe = None + + def add_shaders(self, shaders): + for sh in shaders: + qwords = assemble_shader(shaders[sh]) + sh = sh.lower() + + a = Alloc(len(qwords) * 8, flags=0x2017) + a.buffer = qwords + self.allocs[sh] = a + + def add_memory(self, memory): + for m in memory: + f = memory[m] + if isinstance(f, int): + size, flags = f, 0x280f + else: + size, flags = f + self.allocs[m] = Alloc(size, flags) + + def add_descriptors(self, descriptors): + for d in descriptors: + words = descriptors[d] + a = Alloc(0) + + buf = [] + for w in words: + if isinstance(w, int): + buf.append(w) + else: + if isinstance(w, str): + alloc, offset = w, 0 + else: + alloc, offset = w + ref = self.allocs[alloc] + self.reloc.append((a.id, len(buf) * 4, + ref.id, offset)) + buf.append(0) + buf.append(0) + + it = iter(buf) + a.buffer = [x | (y << 32) for x, y in zip(it, it)] + a.size = len(a.buffer) * 8 + self.allocs[d] = a + + def interpret(self, text): + text = text.split("\n") + + old_indent = None + + for orig_line in text: + #print(orig_line, file=sys.stderr) + + line = orig_line.split("@")[0].expandtabs().rstrip().lower() + if not line: + continue + + indent = len(line) - len(line.lstrip()) + line = line.lstrip() + + if old_indent is None: + self.levels.append(Level(indent)) + elif indent != old_indent: + if indent > old_indent: + assert(self.is_call) + + self.levels.append(Level(indent)) + else: + self.pop_until(indent) + + self.set_l() + + old_indent = indent + self.is_call = False + + given_code = None + + # TODO: Check against this to test the disassembler? + if re.match(r"[0-9a-f]{16} ", line): + given_code = int(line[:16], 16) + line = line[16:].lstrip() + + s = [x.strip(",") for x in line.split()] + + if s[0].endswith(":") or (len(s) == 1 and is_num(s[0])): + label = s[0] + if s[0].endswith(":"): + label = label[:-1] + + if is_num(label): + label = int(label) + if label in self.l.num_refs: + self.l.process_relocs(self.l.num_refs[label], self.l.offset()) + del self.l.num_refs[label] + self.l.num_labels[label] = self.l.offset() + else: + if label in self.l.labels: + print("Label reuse is not supported for non-numeric labels") + self.l.labels[label] = self.l.offset() + + s = s[1:] + if not len(s): + continue + + for i in range(len(s)): + if s[i].startswith("$"): + name, *offset = s[i][1:].split("+") + if name == ".": + buf = self.l + else: + buf = self.allocs[name] + if len(offset): + assert(len(offset) == 1) + offset = int(offset[0], 0) + else: + offset = 0 + + if s[0] == "movp": + rels = self.reloc_split + else: + rels = self.reloc + + rels.append((self.l.id, self.l.offset(), + buf.id, offset)) + s[i] = "#0x0" + + def is_num(str): + return re.fullmatch(r"[0-9]+", str) + + def hx(word): + return int(word, 16) + + def reg(word): + return hx(word[1:]) + + def val(word): + if word.startswith("float:"): + return ii(float(word.split(":")[1])) + elif word.startswith("i16:"): + lo, hi = word.split(":")[1].split(",") + lo, hi = val(lo), val(hi) + assert(lo < (1 << 16)) + assert(hi < (1 << 16)) + return (lo & 0xffff) | (hi << 16) + + value = int(word.strip("#"), 0) + assert(value < (1 << 48)) + return value + + sk = True + + if s[0] == "!cs": + assert(len(s) == 2) + self.flush_exe() + self.last_exe = len(self.exe) + self.exe.append(["exe", int(s[1])]) + continue + elif s[0] == "!parallel": + assert(len(s) == 2) + 
self.flush_exe() + self.last_exe = len(self.exe) - 1 + self.exe[-1] += [int(s[1])] + continue + elif s[0] == "!alloc": + assert(len(s) == 3 or len(s) == 4) + alloc_id = s[1] + size = int(s[2]) + flags = val(s[3]) if len(s) == 4 else 0x280f + self.allocs[alloc_id] = Alloc(size, flags) + continue + elif s[0] in ("!dump", "!dump64", "!fdump", "!delta", "!tiler"): + assert(len(s) == 4) + alloc_id = s[1] + offset = val(s[2]) + size = val(s[3]) + mode = { + "!dump": "hex", + "!dump64": "hex64", + "!fdump": "filehex", + "!delta": "delta", + "!tiler": "tiler", + }[s[0]] + self.exe.append(("dump", self.allocs[alloc_id].id, + offset, size, mode)) + continue + elif s[0] == "!heatmap": + assert(len(s) == 10) + assert(s[4] == "gran") + assert(s[6] == "len") + assert(s[8] == "stride") + alloc_id = s[1] + offset = val(s[2]) + size = val(s[3]) + granularity = val(s[5]) + length = val(s[7]) + stride = val(s[9]) + mode = "heatmap" + self.exe.append(("heatmap", self.allocs[alloc_id].id, + offset, size, granularity, length, stride)) + continue + elif s[0] == "!memset": + assert(len(s) == 5) + alloc_id = s[1] + offset = val(s[2]) + value = val(s[3]) + size = val(s[4]) + self.exe.append(("memset", self.allocs[alloc_id].id, + offset, value, size)) + continue + elif s[0] == "!raw": + self.exe.append(s[1:]) + continue + elif s[0] == "movp": + assert(len(s) == 3) + assert(s[1][0] == "x") + addr = reg(s[1]) + # Can't use val() as that has a max of 48 bits + value = int(s[2].strip("#"), 0) + + self.l.buffer.append((2 << 56) | (addr << 48) | (value & 0xffffffff)) + self.l.buffer.append((2 << 56) | ((addr + 1) << 48) + | ((value >> 32) & 0xffffffff)) + continue + elif s[0] == "regdump": + assert(len(s) == 2) + assert(s[1][0] == "x") + dest = reg(s[1]) + + # Number of registers to write per instruction + regs = 16 + + cmd = 21 + value = (dest << 40) | (((1 << regs) - 1) << 16) + + for i in range(0, 0x60, regs): + code = (cmd << 56) | (i << 48) | value | (i << 2) + self.l.buffer.append(code) + + del cmd, value + continue + + elif s[0] == "unk": + if len(s) == 2: + h = hx(s[1]) + cmd = h >> 56 + addr = (h >> 48) & 0xff + value = h & 0xffffffffffff + else: + assert(len(s) == 4) + cmd = hx(s[2]) + addr = hx(s[1]) + value = val(s[3]) + elif s[0] == "nop": + if len(s) == 1: + addr = 0 + value = 0 + cmd = 0 + else: + assert(len(s) == 3) + addr = hx(s[1]) + value = val(s[2]) + cmd = 0 + elif s[0] == "mov" and s[2][0] in "xw": + # This is actually an addition command + assert(len(s) == 3) + assert(s[1][0] == s[2][0]) + cmd = { "x": 17, "w": 16 }[s[1][0]] + addr = reg(s[1]) + value = reg(s[2]) << 40 + elif s[0] == "mov": + assert(len(s) == 3) + cmd = { "x": 1, "w": 2 }[s[1][0]] + addr = reg(s[1]) + value = val(s[2]) + elif s[0] == "add": + assert(len(s) == 4) + assert(s[1][0] == s[2][0]) + assert(s[1][0] in "wx") + cmd = 16 if s[1][0] == "w" else 17 + addr = reg(s[1]) + value = (reg(s[2]) << 40) | (val(s[3]) & 0xffffffff) + elif s[0] == "resources": + assert(len(s) >= 2) + types = ["compute", "fragment", "tiler", "idvs"] + cmd = 34 + addr = 0 + value = 0 + for t in s[1:]: + if t in types: + value |= 1 << types.index(t) + else: + value |= int(t, 0) + elif s[0] == "fragment": + cmd = 7 + addr = 0 + value = 0 + if len(s) != 1: + arg_map = { + "tem": {"0": 0, "1": 1}, + "render": { + "z_order": 0, + "horizontal": 0x10, + "vertical": 0x20, + "reverse_horizontal": 0x50, + "reverse_vertical": 0x60, + }, + "unk": {"0": 0, "1": 1 << 32}, + } + for arg, val in zip(s[1::2], s[2::2]): + value |= arg_map[arg][val] + elif s[0] == "wait": + 
assert(len(s) == 2) + cmd = 3 + addr = 0 + if s[1] == "all": + value = 255 + else: + value = sum(1 << int(x) for x in s[1].split(",")) + value <<= 16 + elif s[0] == "slot": + assert(len(s) == 2) + cmd = 23 + addr = 0 + value = int(s[1], 0) + elif s[0] == "add": + # TODO: unk variant + assert(len(s) == 4) + assert(s[1][0] == "x") + assert(s[2][0] == "x") + cmd = 17 + addr = reg(s[1]) + v = val(s[3]) + assert(v < (1 << 32)) + assert(v >= (-1 << 31)) + value = (reg(s[2]) << 40) | (v & 0xffffffff) + elif s[0] == "idvs": + assert(len(s) == 6) + unk = val(s[1]) + assert(s[2] == "mode") + modes = { + "none": 0, + "points": 1, + "lines": 2, + "line-strip": 4, + "line-loop": 6, + "triangles": 8, + "triangle-strip": 10, + "triangle-fan": 12, + "polygon": 13, + "quads": 14, + } + if s[3] in modes: + mode = modes[s[3]] + else: + mode = int(s[3]) + assert(s[4] == "index") + itypes = { + "none": 0, + "uint8": 1, + "uint16": 2, + "uint32": 3, + } + if s[5] in itypes: + index = itypes[s[5]] + else: + index = int(s[5]) + + cmd = 6 + addr = 0 + value = (unk << 32) | (index << 8) | mode + elif s[0] == "flush_tiler": + assert(len(s) == 1) + cmd = 9 + addr = 0 + value = 0 + elif s[0] == "str" and s[1] in ("cycles", "timestamp"): + assert(len(s) == 3 or len(s) == 4) + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]") for x in s] + assert(s[2][0] == "x") + + type_ = 1 if s[1] == "cycles" else 0 + dest = reg(s[2]) + if len(s) == 4: + offset = val(s[3]) + else: + offset = 0 + + cmd = 40 + addr = 0 + value = (dest << 40) | (type_ << 32) | to_int16(offset) + elif s[0] in ("ldr", "str"): + reglist = s[1] + if reglist[0] == "{": + end = [x[-1] for x in s].index("}") + reglist = s[1:end + 1] + s = s[:1] + s[end:] + + assert(len(s) == 3 or len(s) == 4) + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]") for x in s] + assert(s[2][0] == "x") + + if isinstance(reglist, str): + assert(reglist[0] in "xw") + src = reg(reglist) + mask = 3 if reglist[0] == "x" else 1 + else: + src = None + mask = 0 + + for r in ",".join(reglist).strip("{}").split(","): + r = r.split("-") + assert(len(r) in (1, 2)) + regno = [reg(x) for x in r] + + if src is None: + src = regno[0] + + if len(r) == 1: + assert(r[0][0] in "xw") + new = 3 if r[0][0] == "x" else 1 + new = (new << regno[0]) >> src + else: + assert(regno[1] > regno[0]) + new = ((2 << regno[1]) - (1 << regno[0])) >> src + + assert(new < (1 << 16)) + assert(mask & new == 0) + mask |= new + + # Name is correct for str, but inverted for ldr + # (The same holds for src above) + dest = reg(s[2]) + if len(s) == 4: + offset = val(s[3]) + else: + offset = 0 + + cmd = 20 if s[0] == "ldr" else 21 + addr = src + value = (dest << 40) | (mask << 16) | to_int16(offset) + elif s[0] == "b" or s[0].startswith("b."): + # For unconditional jumps, use w00 as a source register if it + # is not specified + if s[0] == "b" and (len(s) == 2 or + (len(s) == 3 and + s[1] in ("back", "skip"))): + s = [s[0], "w00", *s[1:]] + + assert(len(s) == 3 or (len(s) == 4 and s[2] in ("back", "skip"))) + assert(s[1][0] == "w") + + ops = { + "b.le": 0, "b.gt": 1, + "b.eq": 2, "b.ne": 3, + "b.lt": 4, "b.ge": 5, + "b": 6, "b.al": 6, + } + + src = reg(s[1]) + if len(s) == 4: + offset = val(s[3]) + if s[2] == "back": + offset = -1 - offset + else: + label = s[2] + if re.fullmatch(r"[0-9]+b", label): + label = int(label[:-1]) + assert(label in self.l.num_labels) + offset = resolve_rel(self.l.num_labels[label], + self.l.offset()) + elif re.fullmatch(r"[0-9]+f", label): + label = 
int(label[:-1]) + if label not in self.l.num_refs: + self.l.num_refs[label] = [] + self.l.num_refs[label].append((label, self.l.offset(), "rel")) + offset = 0 + else: + assert(not re.fullmatch(r"[0-9]+", label)) + self.l.label_refs.append((label, self.l.offset(), "rel")) + offset = 0 + + cmd = 22 + addr = 0 + value = (src << 40) | (ops[s[0]] << 28) | to_int16(offset) + + elif s[0] in ("evadd", "evstr"): + assert(len(s) in range(5, 8)) + assert(s[1][0] in "wx") + assert(s[2].startswith("[x")) + assert(s[2][-1] == "]") + assert(s[3] == "unk") + s = [x.strip("[]()") for x in s] + + val = reg(s[1]) + dst = reg(s[2]) + mask = hx(s[4]) + irq = "irq" not in s + unk0 = "unk0" in s + + if s[1][0] == "w": + cmd = 37 if s[0] == "evadd" else 38 + else: + cmd = 51 if s[0] == "evadd" else 52 + addr = 1 + value = ((dst << 40) | (val << 32) | (mask << 16) | + (irq << 2) | unk0) + elif s[0].split(".")[0] == "evwait": + for mod in s[0].split(".")[1:]: + assert(mod in {"lo", "hi", "inherit", "no_error"}) + assert(len(s) == 3) + assert(s[1][0] in "wx") + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]()") for x in s] + src = reg(s[2]) + val = reg(s[1]) + cond = 1 if ".hi" in s[0] else 0 + error = 1 if ".no_error" in s[0] else 0 + + cmd = 53 if s[1][0] == "x" else 39 + addr = 0 + value = (src << 40) | (val << 32) | (cond << 28) | error + elif s[0] in ("call", "tailcall"): + ss = [x for x in s if x.find('(') == -1 and x.find(')') == -1] + assert(len(ss) == 3) + assert(ss[1][0] == "w") + assert(ss[2][0] == "x") + cmd = { "call": 32, "tailcall": 33 }[s[0]] + addr = 0 + num = reg(ss[1]) + target = reg(ss[2]) + value = (num << 32) | (target << 40) + + l = self.l + + cur = len(l.buffer) + for ofs in range(cur - 2, cur): + if l.buffer[ofs] >> 48 == 0x100 + target: + l.call_addr_offset = ofs + if l.buffer[ofs] >> 48 == 0x200 + num: + l.call_len_offset = ofs + assert(l.call_addr_offset is not None) + assert(l.call_len_offset is not None) + + self.is_call = True + elif s[0] == "heapctx": + assert(len(s) == 2) + assert(s[1][0] == "x") + cmd = 48 + addr = 0 + value = reg(s[1]) << 40 + elif s[0] == "heapinc": + assert(len(s) == 2) + modes = { + "vt_start": 0, + "vt_end": 1, + "frag_end": 3, + } + if s[1] in modes: + mode = modes[s[1]] + else: + mode = int(s[1]) + cmd = 49 + addr = 0 + value = mode << 32 + else: + print("Unknown command:", orig_line, file=sys.stderr) + # TODO remove + cmd = 0 + addr = 0 + value = 0 + sk = False + pass + + code = (cmd << 56) | (addr << 48) | value + + if given_code and code != given_code: + print(f"Mismatch! 
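For reference, every mnemonic handled above is assembled into a single 64-bit command word: the opcode lands in bits [63:56], the destination register byte in bits [55:48], and the low 48 bits carry the immediate/operand. Below is a minimal C sketch of the same packing; the helper names are ours, and the opcode numbers are the ones used by interpret.py.

#include <stdint.h>

/* Pack one CSF command word the same way interpret.py does:
 * (cmd << 56) | (addr << 48) | value, with the operand limited to 48 bits. */
static inline uint64_t csf_instr(uint8_t opcode, uint8_t reg, uint64_t operand)
{
        return ((uint64_t)opcode << 56) | ((uint64_t)reg << 48) |
               (operand & 0xffffffffffffULL);
}

/* Equivalent of the "movp" handler above: a full 64-bit pointer is loaded by
 * emitting two 32-bit moves (opcode 2) into consecutive register slots. */
static void csf_emit_movp(uint64_t *buf, unsigned *count, uint8_t reg, uint64_t value)
{
        buf[(*count)++] = csf_instr(2, reg, value & 0xffffffff);
        buf[(*count)++] = csf_instr(2, reg + 1, value >> 32);
}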
{hex(code)} != {hex(given_code)}, {orig_line}") + + self.l.buffer.append(code) + + del cmd, addr, value + + if False and not sk: + print(orig_line, file=sys.stderr) + print(indent, s, hex(code) if sk else "", file=sys.stderr) + + self.pop_until(self.levels[0].indent) + self.flush_exe() + + def __repr__(self): + r = [] + r += [str(self.allocs[x]) for x in self.allocs] + r += [str(x) for x in self.completed] + r += [fmt_reloc(x) for x in self.reloc] + r += [fmt_reloc(x, name="relsplit") for x in self.reloc_split] + r += [fmt_exe(x) for x in self.exe] + return "\n".join(r) + +def interpret(text): + c = Context() + c.add_shaders(shaders) + c.add_memory(memory) + c.add_descriptors(descriptors) + c.interpret(text) + #print(str(c)) + return str(c) + +def run(text, capture=False): + if capture: + cap = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT} + else: + cap = {} + + i = interpret(text) + "\n" + + with open("/tmp/csf.cmds", "w") as f: + f.write(i) + + # TODO: Keep seperate or merge stdout/stderr? + ret = subprocess.run(["csf_test", "/dev/stdin"], + input=i, text=True, **cap) + if ret.stderr is None: + ret.stderr = "" + if ret.stdout is None: + ret.stdout = "" + return ret.stderr + ret.stdout + +def rebuild(): + try: + p = subprocess.run(["rebuild-mesa"]) + if p.returncode != 0: + return False + except FileNotFoundError: + pass + return True + +def go(text): + #print(interpret(text)) + #return + + if not rebuild(): + return + + print(run(text)) + #subprocess.run("ls /tmp/fdump.????? | tail -n2 | xargs diff -U3 -s", + # shell=True) + +os.environ["CSF_QUIET"] = "1" + +go(get_cmds("")) + +#for c in range(1, 64): +# val = c +# ret = run(get_cmds(ii(val))) +# print(str(val) + '\t' + [x for x in ret.split("\n") if x.startswith("0FFF10")][0]) + +#rebuild() +#for c in range(256): +# print(c, end=":") +# sys.stdout.flush() +# cmd = f"UNK 00 {hex(c)[2:]} 0x00000000" +# run(get_cmds(cmd)) + +#interpret(cmds) +#go(cmds) diff --git a/src/panfrost/csf_test/mali_base_csf_kernel.h b/src/panfrost/csf_test/mali_base_csf_kernel.h new file mode 100644 index 00000000000..f5f859eb9ad --- /dev/null +++ b/src/panfrost/csf_test/mali_base_csf_kernel.h @@ -0,0 +1,721 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_CSF_KERNEL_H_ +#define _UAPI_BASE_CSF_KERNEL_H_ + +#include + +/* Memory allocation, access/hint flags. + * + * See base_mem_alloc_flags. 
+ */ + +/* IN */ +/* Read access CPU side + */ +#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) + +/* Write access CPU side + */ +#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) + +/* Read access GPU side + */ +#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) + +/* Write access GPU side + */ +#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) + +/* Execute allowed on the GPU side + */ +#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) + +/* Will be permanently mapped in kernel space. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) + +/* The allocation will completely reside within the same 4GB chunk in the GPU + * virtual space. + * Since this flag is primarily required only for the TLS memory which will + * not be used to contain executable code and also not used for Tiler heap, + * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. + */ +#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) + +/* Userspace is not allowed to free this memory. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) + +#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) + +/* Grow backing store on GPU Page Fault + */ +#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) + +/* Page coherence Outer shareable, if available + */ +#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) + +/* Page coherence Inner shareable + */ +#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) + +/* IN/OUT */ +/* Should be cached on the CPU, returned if actually cached + */ +#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) + +/* IN/OUT */ +/* Must have same VA on both the GPU and the CPU + */ +#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) + +/* OUT */ +/* Must call mmap to acquire a GPU address for the alloc + */ +#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) + +/* IN */ +/* Page coherence Outer shareable, required. + */ +#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) + +/* Protected memory + */ +#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) + +/* Not needed physical memory + */ +#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) + +/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the + * addresses to be the same + */ +#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) + +/* CSF event memory + * + * If Outer shareable coherence is not specified or not available, then on + * allocation kbase will automatically use the uncached GPU mapping. + * There is no need for the client to specify BASE_MEM_UNCACHED_GPU + * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. + * + * This memory requires a permanent mapping + * + * See also kbase_reg_needs_kernel_mapping() + */ +#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) + +#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) + +/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu + * mode. Some components within the GPU might only be able to access memory + * that is GPU cacheable. Refer to the specific GPU implementation for more + * details. The 3 shareability flags will be ignored for GPU uncached memory. 
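As a concrete cross-reference, the default flag value 0x280f that the !alloc directive in interpret.py passes when no flags are given decodes to the combination below. This is a sketch for illustration, not part of the header.

/* 0x280f == CPU read/write + GPU read/write + inner shareable + same VA */
#define EXAMPLE_DEFAULT_ALLOC_FLAGS                          \
        (BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR |       \
         BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |       \
         BASE_MEM_COHERENT_LOCAL | BASE_MEM_SAME_VA)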
+ * If used while importing USER_BUFFER type memory, then the import will fail + * if the memory is not aligned to GPU and CPU cache line width. + */ +#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) + +/* + * Bits [22:25] for group_id (0~15). + * + * base_mem_group_id_set() should be used to pack a memory group ID into a + * base_mem_alloc_flags value instead of accessing the bits directly. + * base_mem_group_id_get() should be used to extract the memory group ID from + * a base_mem_alloc_flags value. + */ +#define BASEP_MEM_GROUP_ID_SHIFT 22 +#define BASE_MEM_GROUP_ID_MASK \ + ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) + +/* Must do CPU cache maintenance when imported memory is mapped/unmapped + * on GPU. Currently applicable to dma-buf type only. + */ +#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) + +/* OUT */ +/* Kernel side cache sync ops required */ +#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) + +/* Number of bits used as flags for base memory management + * + * Must be kept in sync with the base_mem_alloc_flags flags + */ +#define BASE_MEM_FLAGS_NR_BITS 29 + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) + +/* A mask for all output bits, excluding IN/OUT bits. + */ +#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP + +/* A mask for all input bits, including IN/OUT bits. + */ +#define BASE_MEM_FLAGS_INPUT_MASK \ + (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED \ + BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_20 + +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) +/* reserved handles ..-47< for future special handles */ +#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_FIRST_FREE_ADDRESS \ + ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) + +#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ + ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ + LOCAL_PAGE_SHIFT) + +/** + * Valid set of just-in-time memory allocation flags + */ +#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) + +/* Flags to pass to ::base_context_init. + * Flags can be ORed together to enable multiple things. + * + * These share the same space as BASEP_CONTEXT_FLAG_*, and so must + * not collide with them. + */ +typedef __u32 base_context_create_flags; + +/* No flags set */ +#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) + +/* Base context is embedded in a cctx object (flag used for CINSTR + * software counter macros) + */ +#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) + +/* Base context is a 'System Monitor' context for Hardware counters. + * + * One important side effect of this is that job submission is disabled. + */ +#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED \ + ((base_context_create_flags)1 << 1) + +/* Base context creates a CSF event notification thread. 
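The group-ID field and validity masks above are used roughly as follows. base_mem_group_id_set()/get() live inside kbase itself, so the helpers here are hypothetical equivalents, and the base_mem_alloc_flags typedef is the one from mali_base_kernel.h later in this patch.

static inline base_mem_alloc_flags example_group_id_set(unsigned int id)
{
        return ((base_mem_alloc_flags)id << BASEP_MEM_GROUP_ID_SHIFT) &
               BASE_MEM_GROUP_ID_MASK;
}

static inline unsigned int example_group_id_get(base_mem_alloc_flags flags)
{
        return (unsigned int)((flags & BASE_MEM_GROUP_ID_MASK) >>
                              BASEP_MEM_GROUP_ID_SHIFT);
}

/* Reject flags userspace may not request: kernel-only bits, reserved bits,
 * and anything outside the defined input range. */
static inline int example_alloc_flags_valid(base_mem_alloc_flags flags)
{
        return !(flags & (BASEP_MEM_FLAGS_KERNEL_ONLY | BASE_MEM_FLAGS_RESERVED)) &&
               !(flags & ~BASE_MEM_FLAGS_INPUT_MASK);
}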
+ * + * The creation of a CSF event notification thread is conditional but + * mandatory for the handling of CSF events. + */ +#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) + +/* Bit-shift used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) + +/* Bitmask used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ + ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* Bitpattern describing the base_context_create_flags that can be + * passed to the kernel + */ +#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ + (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | \ + BASEP_CONTEXT_MMU_GROUP_ID_MASK) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | \ + BASE_CONTEXT_CSF_EVENT_THREAD | \ + BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, + * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) + */ +#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) + +/* Indicate that job dumping is enabled. This could affect certain timers + * to account for the performance impact. + */ +#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) + +/* Enable KBase tracepoints for CSF builds */ +#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) + +/* Enable additional CSF Firmware side tracepoints */ +#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) + +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ + BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ + BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) + +/* Number of pages mapped into the process address space for a bound GPU + * command queue. A pair of input/output pages and a Hw doorbell page + * are mapped to enable direct submission of commands to Hw. + */ +#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) + +#define BASE_QUEUE_MAX_PRIORITY (15U) + +/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ +#define BASEP_EVENT_VAL_INDEX (0U) +#define BASEP_EVENT_ERR_INDEX (1U) + +/* The upper limit for number of objects that could be waited/set per command. + * This limit is now enforced as internally the error inherit inputs are + * converted to 32-bit flags in a __u32 variable occupying a previously padding + * field. + */ +#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) + +/** + * enum base_kcpu_command_type - Kernel CPU queue command type. 
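The CQS definitions above describe each synchronisation object as two consecutive 32-bit words in GPU-visible memory: the value at index 0 and the error state at index 1. A small sketch of how that layout is typically viewed from the CPU side; the struct and helper names are ours.

#include <stdint.h>

struct example_cqs_object {
        uint32_t event_mem[2]; /* [BASEP_EVENT_VAL_INDEX] = value,
                                * [BASEP_EVENT_ERR_INDEX] = error state */
};

static inline int example_cqs_errored(const volatile struct example_cqs_object *c)
{
        return c->event_mem[BASEP_EVENT_ERR_INDEX] != 0;
}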
+ * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, + * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, + * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, + * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, + * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, + * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, + * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, + */ +enum base_kcpu_command_type { + BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_SET, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, + BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, + BASE_KCPU_COMMAND_TYPE_JIT_FREE, + BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER +}; + +/** + * enum base_queue_group_priority - Priority of a GPU Command Queue Group. + * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group + * priority levels. + * + * Currently this is in order of highest to lowest, but if new levels are added + * then those new levels may be out of order to preserve the ABI compatibility + * with previous releases. At that point, ensure assignment to + * the 'priority' member in &kbase_queue_group is updated to ensure it remains + * a linear ordering. + * + * There should be no gaps in the enum, otherwise use of + * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. 
+ */ +enum base_queue_group_priority { + BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, + BASE_QUEUE_GROUP_PRIORITY_MEDIUM, + BASE_QUEUE_GROUP_PRIORITY_LOW, + BASE_QUEUE_GROUP_PRIORITY_REALTIME, + BASE_QUEUE_GROUP_PRIORITY_COUNT +}; + +struct base_kcpu_command_fence_info { + __u64 fence; +}; + +struct base_cqs_wait_info { + __u64 addr; + __u32 val; + __u32 padding; +}; + +struct base_kcpu_command_cqs_wait_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +struct base_cqs_set { + __u64 addr; +}; + +struct base_kcpu_command_cqs_set_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * typedef basep_cqs_data_type - Enumeration of CQS Data Types + * + * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value + * is an unsigned 32-bit integer + * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value + * is an unsigned 64-bit integer + */ +typedef enum PACKED { + BASEP_CQS_DATA_TYPE_U32 = 0, + BASEP_CQS_DATA_TYPE_U64 = 1, +} basep_cqs_data_type; + +/** + * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait + * Operation conditions + * + * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Less than or Equal to + * the Wait Operation value + * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Greater than the Wait Operation value + */ +typedef enum { + BASEP_CQS_WAIT_OPERATION_LE = 0, + BASEP_CQS_WAIT_OPERATION_GT = 1, +} basep_cqs_wait_operation_op; + +struct base_cqs_wait_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information + * about the Timeline CQS wait objects + * + * @objs: An array of Timeline CQS waits. + * @nr_objs: Number of Timeline CQS waits in the array. + * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field + * to be served as the source for importing into the + * queue's error-state. + */ +struct base_kcpu_command_cqs_wait_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +/** + * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations + * + * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value + * to a synchronization object + * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value + * of a synchronization object + */ +typedef enum { + BASEP_CQS_SET_OPERATION_ADD = 0, + BASEP_CQS_SET_OPERATION_SET = 1, +} basep_cqs_set_operation_op; + +struct base_cqs_set_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_set_operation_info - structure which contains information + * about the Timeline CQS set objects + * + * @objs: An array of Timeline CQS sets. + * @nr_objs: Number of Timeline CQS sets in the array. + * @padding: Structure padding, unused bytes. + */ +struct base_kcpu_command_cqs_set_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * struct base_kcpu_command_import_info - structure which contains information + * about the imported buffer. + * + * @handle: Address of imported user buffer. + */ +struct base_kcpu_command_import_info { + __u64 handle; +}; + +/** + * struct base_kcpu_command_jit_alloc_info - structure which contains + * information about jit memory allocation. 
+ * + * @info: An array of elements of the + * struct base_jit_alloc_info type. + * @count: The number of elements in the info array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_alloc_info { + __u64 info; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_jit_free_info - structure which contains + * information about jit memory which is to be freed. + * + * @ids: An array containing the JIT IDs to free. + * @count: The number of elements in the ids array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_free_info { + __u64 ids; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_group_suspend_info - structure which contains + * suspend buffer data captured for a suspended queue group. + * + * @buffer: Pointer to an array of elements of the type char. + * @size: Number of elements in the @buffer array. + * @group_handle: Handle to the mapping of CSG. + * @padding: padding to a multiple of 64 bits. + */ +struct base_kcpu_command_group_suspend_info { + __u64 buffer; + __u32 size; + __u8 group_handle; + __u8 padding[3]; +}; + + +/** + * struct base_kcpu_command - kcpu command. + * @type: type of the kcpu command, one enum base_kcpu_command_type + * @padding: padding to a multiple of 64 bits + * @info: structure which contains information about the kcpu command; + * actual type is determined by @p type + * @info.fence: Fence + * @info.cqs_wait: CQS wait + * @info.cqs_set: CQS set + * @info.import: import + * @info.jit_alloc: jit allocation + * @info.jit_free: jit deallocation + * @info.suspend_buf_copy: suspend buffer copy + * @info.sample_time: sample time + * @info.padding: padding + */ +struct base_kcpu_command { + __u8 type; + __u8 padding[sizeof(__u64) - sizeof(__u8)]; + union { + struct base_kcpu_command_fence_info fence; + struct base_kcpu_command_cqs_wait_info cqs_wait; + struct base_kcpu_command_cqs_set_info cqs_set; + struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; + struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; + struct base_kcpu_command_import_info import; + struct base_kcpu_command_jit_alloc_info jit_alloc; + struct base_kcpu_command_jit_free_info jit_free; + struct base_kcpu_command_group_suspend_info suspend_buf_copy; + __u64 padding[2]; /* No sub-struct should be larger */ + } info; +}; + +/** + * struct basep_cs_stream_control - CSI capabilities. + * + * @features: Features of this stream + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_stream_control { + __u32 features; + __u32 padding; +}; + +/** + * struct basep_cs_group_control - CSG interface capabilities. + * + * @features: Features of this group + * @stream_num: Number of streams in this group + * @suspend_size: Size in bytes of the suspend buffer for this group + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_group_control { + __u32 features; + __u32 stream_num; + __u32 suspend_size; + __u32 padding; +}; + +/** + * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault + * error information associated with GPU command queue group. + * + * @sideband: Additional information of the unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). 
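Putting the pieces above together, a KCPU command is a tagged union: set @type, fill the matching @info member, and leave the rest zero. A hedged sketch that prepares a single CQS_SET command follows; the function name is ours, and the actual enqueue ioctl lives in mali_kbase_csf_ioctl.h rather than this header.

#include <stdint.h>
#include <string.h>

static void example_build_cqs_set(struct base_kcpu_command *cmd,
                                  struct base_cqs_set *objs,
                                  uint64_t cqs_gpu_va)
{
        objs[0].addr = cqs_gpu_va;      /* GPU VA of the CQS value/error pair */

        memset(cmd, 0, sizeof(*cmd));
        cmd->type = BASE_KCPU_COMMAND_TYPE_CQS_SET;
        cmd->info.cqs_set.objs = (uint64_t)(uintptr_t)objs; /* user pointer as __u64 */
        cmd->info.cqs_set.nr_objs = 1;
}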
+ * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_group_error_fatal_payload { + __u64 sideband; + __u32 status; + __u32 padding; +}; + +/** + * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault + * error information related to GPU command queue. + * + * @sideband: Additional information about this unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). + * @csi_index: Index of the CSF interface the queue is bound to. + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_error_fatal_payload { + __u64 sideband; + __u32 status; + __u8 csi_index; + __u8 padding[3]; +}; + +/** + * enum base_gpu_queue_group_error_type - GPU Fatal error type. + * + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU + * command queue group. + * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU + * command queue. + * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with + * progress timeout. + * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out + * of tiler heap memory. + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types + * + * This type is used for &struct_base_gpu_queue_group_error.error_type. + */ +enum base_gpu_queue_group_error_type { + BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, + BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, + BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT +}; + +/** + * struct base_gpu_queue_group_error - Unrecoverable fault information + * @error_type: Error type of @base_gpu_queue_group_error_type + * indicating which field in union payload is filled + * @padding: Unused bytes for 64bit boundary + * @payload: Input Payload + * @payload.fatal_group: Unrecoverable fault error associated with + * GPU command queue group + * @payload.fatal_queue: Unrecoverable fault error associated with command queue + */ +struct base_gpu_queue_group_error { + __u8 error_type; + __u8 padding[7]; + union { + struct base_gpu_queue_group_error_fatal_payload fatal_group; + struct base_gpu_queue_error_fatal_payload fatal_queue; + } payload; +}; + +/** + * enum base_csf_notification_type - Notification type + * + * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event + * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal + * error + * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu + * queue + * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type + * + * This type is used for &struct_base_csf_notification.type. 
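A short sketch of how the fatal payloads above are usually decoded: the exception type sits in the least significant byte of @status and the remaining bytes carry exception data. The helper name is ours, and it assumes the caller already knows the error is one of the two fatal variants.

#include <stdint.h>

static inline uint8_t
example_fatal_exception_type(const struct base_gpu_queue_group_error *err)
{
        uint32_t status;

        if (err->error_type == BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL)
                status = err->payload.fatal_queue.status;
        else
                status = err->payload.fatal_group.status;

        return status & 0xff;   /* e.g. 0x49 == CS_INVALID_INSTRUCTION */
}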
+ */ +enum base_csf_notification_type { + BASE_CSF_NOTIFICATION_EVENT = 0, + BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, + BASE_CSF_NOTIFICATION_COUNT +}; + +/** + * struct base_csf_notification - Event or error notification + * + * @type: Notification type of @base_csf_notification_type + * @padding: Padding for 64bit boundary + * @payload: Input Payload + * @payload.align: To fit the struct into a 64-byte cache line + * @payload.csg_error: CSG error + * @payload.csg_error.handle: Handle of GPU command queue group associated with + * fatal error + * @payload.csg_error.padding: Padding + * @payload.csg_error.error: Unrecoverable fault error + * + */ +struct base_csf_notification { + __u8 type; + __u8 padding[7]; + union { + struct { + __u8 handle; + __u8 padding[7]; + struct base_gpu_queue_group_error error; + } csg_error; + + __u8 align[56]; + } payload; +}; + +#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ diff --git a/src/panfrost/csf_test/mali_base_kernel.h b/src/panfrost/csf_test/mali_base_kernel.h new file mode 100644 index 00000000000..305956f341a --- /dev/null +++ b/src/panfrost/csf_test/mali_base_kernel.h @@ -0,0 +1,746 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * Base structures shared with the kernel. + */ + +#ifndef _UAPI_BASE_KERNEL_H_ +#define _UAPI_BASE_KERNEL_H_ + +#include + +struct base_mem_handle { + struct { + __u64 handle; + } basep; +}; + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +#define BASE_MAX_COHERENT_GROUPS 16 + +#if defined(PAGE_MASK) && defined(PAGE_SHIFT) +#define LOCAL_PAGE_SHIFT PAGE_SHIFT +#define LOCAL_PAGE_LSB ~PAGE_MASK +#else +#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 +#endif + +#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) +#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) +#else +#error Failed to find page size +#endif +#endif + +/* Physical memory group ID for normal usage. + */ +#define BASE_MEM_GROUP_DEFAULT (0) + +/* Number of physical memory groups. + */ +#define BASE_MEM_GROUP_COUNT (16) + +/** + * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. + * + * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator + * in order to determine the best cache policy. Some combinations are + * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), + * which defines a write-only region on the CPU side, which is + * heavily read by the CPU... + * Other flags are only meaningful to a particular allocator. + * More flags can be added to this list, as long as they don't clash + * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). 
+ */ +typedef __u32 base_mem_alloc_flags; + +/* A mask for all the flags which are modifiable via the base_mem_set_flags + * interface. + */ +#define BASE_MEM_FLAGS_MODIFIABLE \ + (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ + BASE_MEM_COHERENT_LOCAL) + +/* A mask of all the flags that can be returned via the base_mem_get_flags() + * interface. + */ +#define BASE_MEM_FLAGS_QUERYABLE \ + (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ + BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ + BASEP_MEM_FLAGS_KERNEL_ONLY)) + +/** + * enum base_mem_import_type - Memory types supported by @a base_mem_import + * + * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type + * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) + * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a + * base_mem_import_user_buffer + * + * Each type defines what the supported handle type is. + * + * If any new type is added here ARM must be contacted + * to allocate a numeric value for it. + * Do not just add a new type without synchronizing with ARM + * as future releases from ARM might include other new types + * which could clash with your custom types. + */ +enum base_mem_import_type { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + /* + * Import type with value 1 is deprecated. + */ + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 +}; + +/** + * struct base_mem_import_user_buffer - Handle of an imported user buffer + * + * @ptr: address of imported user buffer + * @length: length of imported user buffer in bytes + * + * This structure is used to represent a handle of an imported user buffer. + */ + +struct base_mem_import_user_buffer { + __u64 ptr; + __u64 length; +}; + +/* Mask to detect 4GB boundary alignment */ +#define BASE_MEM_MASK_4GB 0xfffff000UL +/* Mask to detect 4GB boundary (in page units) alignment */ +#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) + +/* Limit on the 'extension' parameter for an allocation with the + * BASE_MEM_TILER_ALIGN_TOP flag set + * + * This is the same as the maximum limit for a Buffer Descriptor's chunk size + */ +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ + (21u - (LOCAL_PAGE_SHIFT)) +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ + (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) + +/* Bit mask of cookies used for for memory allocation setup */ +#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ + +/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ +#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ + +/* + * struct base_fence - Cross-device synchronisation fence. + * + * A fence is used to signal when the GPU has finished accessing a resource that + * may be shared with other devices, and also to delay work done asynchronously + * by the GPU until other devices have finished accessing a shared resource. + */ +struct base_fence { + struct { + int fd; + int stream_fd; + } basep; +}; + +/** + * struct base_mem_aliasing_info - Memory aliasing info + * + * Describes a memory handle to be aliased. + * A subset of the handle can be chosen for aliasing, given an offset and a + * length. + * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a + * region where a special page is mapped with a write-alloc cache setup, + * typically used when the write result of the GPU isn't needed, but the GPU + * must write anyway. 
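For BASE_MEM_IMPORT_TYPE_USER_BUFFER, the handle passed to the import ioctl is the base_mem_import_user_buffer structure above. A minimal sketch of filling it; the helper is hypothetical, and the import ioctl itself is defined in mali_kbase_ioctl.h rather than here.

#include <stdint.h>

static void example_fill_user_buffer(struct base_mem_import_user_buffer *handle,
                                     void *cpu_ptr, uint64_t length_bytes)
{
        handle->ptr = (uint64_t)(uintptr_t)cpu_ptr; /* CPU VA of the buffer */
        handle->length = length_bytes;              /* length in bytes */
}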
+ * + * Offset and length are specified in pages. + * Offset must be within the size of the handle. + * Offset+length must not overrun the size of the handle. + * + * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * @offset: Offset within the handle to start aliasing from, in pages. + * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. + * @length: Length to alias, in pages. For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * specifies the number of times the special page is needed. + */ +struct base_mem_aliasing_info { + struct base_mem_handle handle; + __u64 offset; + __u64 length; +}; + +/* Maximum percentage of just-in-time memory allocation trimming to perform + * on free. + */ +#define BASE_JIT_MAX_TRIM_LEVEL (100) + +/* Maximum number of concurrent just-in-time memory allocations. + */ +#define BASE_JIT_ALLOC_COUNT (255) + +/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 + * + * jit_version is 1 + * + * Due to the lack of padding specified, user clients between 32 and 64-bit + * may have assumed a different size of the struct + * + * An array of structures was not supported + */ +struct base_jit_alloc_info_10_2 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; +}; + +/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up + * to 11.19 + * + * This structure had a number of modifications during and after kernel driver + * version 11.5, but remains size-compatible throughout its version history, and + * with earlier variants compatible with future variants by requiring + * zero-initialization to the unused space in the structure. + * + * jit_version is 2 + * + * Kernel driver version history: + * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes + * must be zero. Kbase minor version was not incremented, so some + * versions of 11.5 do not have this change. + * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase + * minor version not incremented) + * 11.6: Added 'flags', replacing 1 padding byte + * 11.10: Arrays of this structure are supported + */ +struct base_jit_alloc_info_11_5 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; +}; + +/** + * struct base_jit_alloc_info - Structure which describes a JIT allocation + * request. + * @gpu_alloc_addr: The GPU virtual address to write the JIT + * allocated GPU virtual address to. + * @va_pages: The minimum number of virtual pages required. + * @commit_pages: The minimum number of physical pages which + * should back the allocation. + * @extension: Granularity of physical pages to grow the + * allocation by during a fault. + * @id: Unique ID provided by the caller, this is used + * to pair allocation and free requests. + * Zero is not a valid value. + * @bin_id: The JIT allocation bin, used in conjunction with + * @max_allocations to limit the number of each + * type of JIT allocation. + * @max_allocations: The maximum number of allocations allowed within + * the bin specified by @bin_id. Should be the same + * for all allocations within the same bin. + * @flags: flags specifying the special requirements for + * the JIT allocation, see + * %BASE_JIT_ALLOC_VALID_FLAGS + * @padding: Expansion space - should be initialised to zero + * @usage_id: A hint about which allocation should be reused. 
+ * The kernel should attempt to use a previous + * allocation with the same usage_id + * @heap_info_gpu_addr: Pointer to an object in GPU memory describing + * the actual usage of the region. + * + * jit_version is 3. + * + * When modifications are made to this structure, it is still compatible with + * jit_version 3 when: a) the size is unchanged, and b) new members only + * replace the padding bytes. + * + * Previous jit_version history: + * jit_version == 1, refer to &base_jit_alloc_info_10_2 + * jit_version == 2, refer to &base_jit_alloc_info_11_5 + * + * Kbase version history: + * 11.20: added @heap_info_gpu_addr + */ +struct base_jit_alloc_info { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; + __u64 heap_info_gpu_addr; +}; + +enum base_external_resource_access { + BASE_EXT_RES_ACCESS_SHARED, + BASE_EXT_RES_ACCESS_EXCLUSIVE +}; + +struct base_external_resource { + __u64 ext_resource; +}; + + +/** + * The maximum number of external resources which can be mapped/unmapped + * in a single request. + */ +#define BASE_EXT_RES_COUNT_MAX 10 + +/** + * struct base_external_resource_list - Structure which describes a list of + * external resources. + * @count: The number of resources. + * @ext_res: Array of external resources which is + * sized at allocation time. + */ +struct base_external_resource_list { + __u64 count; + struct base_external_resource ext_res[1]; +}; + +struct base_jd_debug_copy_buffer { + __u64 address; + __u64 size; + struct base_external_resource extres; +}; + +#define GPU_MAX_JOB_SLOTS 16 + +/** + * User-side Base GPU Property Queries + * + * The User-side Base GPU Property Query interface encapsulates two + * sub-modules: + * + * - "Dynamic GPU Properties" + * - "Base Platform Config GPU Properties" + * + * Base only deals with properties that vary between different GPU + * implementations - the Dynamic GPU properties and the Platform Config + * properties. + * + * For properties that are constant for the GPU Architecture, refer to the + * GPU module. However, we will discuss their relevance here just to + * provide background information. + * + * About the GPU Properties in Base and GPU modules + * + * The compile-time properties (Platform Config, GPU Compile-time + * properties) are exposed as pre-processor macros. + * + * Complementing the compile-time properties are the Dynamic GPU + * Properties, which act as a conduit for the GPU Configuration + * Discovery. + * + * In general, the dynamic properties are present to verify that the platform + * has been configured correctly with the right set of Platform Config + * Compile-time Properties. + * + * As a consistent guide across the entire DDK, the choice for dynamic or + * compile-time should consider the following, in order: + * 1. Can the code be written so that it doesn't need to know the + * implementation limits at all? + * 2. If you need the limits, get the information from the Dynamic Property + * lookup. This should be done once as you fetch the context, and then cached + * as part of the context data structure, so it's cheap to access. + * 3. If there's a clear and arguable inefficiency in using Dynamic Properties, + * then use a Compile-Time Property (Platform Config, or GPU Compile-time + * property). 
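A quick sketch of a jit_version 3 request using the structure above: everything unused must be zero-initialised so the padding stays compatible, and @id must be non-zero so a later JIT_FREE can refer back to the allocation. The values below are purely illustrative.

#include <stdint.h>
#include <string.h>

static void example_jit_request(struct base_jit_alloc_info *info,
                                uint64_t result_gpu_va)
{
        memset(info, 0, sizeof(*info));         /* zero padding and unused fields */
        info->gpu_alloc_addr = result_gpu_va;   /* where the allocated GPU VA is written */
        info->va_pages = 16;                    /* reserve 16 virtual pages */
        info->commit_pages = 1;                 /* back one page up front */
        info->extension = 1;                    /* grow granularity on fault, in pages */
        info->id = 1;                           /* zero is not a valid ID */
}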
Examples of where this might be sensible follow: + * - Part of a critical inner-loop + * - Frequent re-use throughout the driver, causing significant extra load + * instructions or control flow that would be worthwhile optimizing out. + * + * We cannot provide an exhaustive set of examples, neither can we provide a + * rule for every possible situation. Use common sense, and think about: what + * the rest of the driver will be doing; how the compiler might represent the + * value if it is a compile-time constant; whether an OEM shipping multiple + * devices would benefit much more from a single DDK binary, instead of + * insignificant micro-optimizations. + * + * Dynamic GPU Properties + * + * Dynamic GPU properties are presented in two sets: + * 1. the commonly used properties in @ref base_gpu_props, which have been + * unpacked from GPU register bitfields. + * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props + * (also a member of base_gpu_props). All of these are presented in + * the packed form, as presented by the GPU registers themselves. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + * The properties returned extend the GPU Configuration Discovery + * registers. For example, GPU clock speed is not specified in the GPU + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. + * + * The GPU properties are obtained by a call to + * base_get_gpu_props(). This simply returns a pointer to a const + * base_gpu_props structure. It is constant for the life of a base + * context. Multiple calls to base_get_gpu_props() to a base context + * return the same pointer to a constant structure. This avoids cache pollution + * of the common data. + * + * This pointer must not be freed, because it does not point to the start of a + * region allocated by the memory allocator; instead, just close the @ref + * base_context. + * + * + * Kernel Operation + * + * During Base Context Create time, user-side makes a single kernel call: + * - A call to fill user memory with GPU information structures + * + * The kernel-side will fill the provided the entire processed base_gpu_props + * structure, because this information is required in both + * user and kernel side; it does not make sense to decode it twice. + * + * Coherency groups must be derived from the bitmasks, but this can be done + * kernel side, and just once at kernel startup: Coherency groups must already + * be known kernel-side, to support chains that specify a 'Only Coherent Group' + * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. + * + * Coherency Group calculation + * + * Creation of the coherent group data is done at device-driver startup, and so + * is one-time. This will most likely involve a loop with CLZ, shifting, and + * bit clearing on the L2_PRESENT mask, depending on whether the + * system is L2 Coherent. The number of shader cores is done by a + * population count, since faulty cores may be disabled during production, + * producing a non-contiguous mask. 
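The population count mentioned above has to tolerate holes in the mask, since fused-off cores leave gaps. A small sketch:

#include <stdint.h>

/* Count shader cores from a possibly non-contiguous present mask. */
static unsigned example_count_cores(uint64_t shader_present)
{
        unsigned n = 0;

        while (shader_present) {
                shader_present &= shader_present - 1;   /* clear lowest set bit */
                n++;
        }
        return n;
}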
+ * + * The memory requirements for this algorithm can be determined either by a __u64 + * population count on the L2_PRESENT mask (a LUT helper already is + * required for the above), or simple assumption that there can be no more than + * 16 coherent groups, since core groups are typically 4 cores. + */ + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +#define BASE_MAX_COHERENT_GROUPS 16 +/** + * struct mali_base_gpu_core_props - GPU core props info + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to allign to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + * @num_exec_engines: The number of execution engines. + */ +struct mali_base_gpu_core_props { + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; + __u8 num_exec_engines; +}; + +/* + * More information is possible - but associativity and bus width are not + * required by upper-level apis. + */ +struct mali_base_gpu_l2_cache_props { + __u8 log2_line_size; + __u8 log2_cache_size; + __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + __u8 padding[5]; +}; + +struct mali_base_gpu_tiler_props { + __u32 bin_size_bytes; /* Max is 4*2^15 */ + __u32 max_active_levels; /* Max is 2^15 */ +}; + +/** + * struct mali_base_gpu_thread_props - GPU threading system details. + * @max_threads: Max. number of threads per core + * @max_workgroup_size: Max. number of threads per workgroup + * @max_barrier_size: Max. number of threads that can synchronize on a + * simple barrier + * @max_registers: Total size [1..65535] of the register file available + * per core. + * @max_task_queue: Max. tasks [1..255] which may be sent to a core + * before it becomes blocked. + * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split + * field. 
+ * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, + * 3 = SW Model/Emulation + * @padding: padding to allign to 8-byte + * @tls_alloc: Number of threads per core that TLS must be + * allocated for + */ +struct mali_base_gpu_thread_props { + __u32 max_threads; + __u32 max_workgroup_size; + __u32 max_barrier_size; + __u16 max_registers; + __u8 max_task_queue; + __u8 max_thread_group_split; + __u8 impl_tech; + __u8 padding[3]; + __u32 tls_alloc; +}; + +/** + * struct mali_base_gpu_coherent_group - descriptor for a coherent group + * @core_mask: Core restriction mask required for the group + * @num_cores: Number of cores in the group + * @padding: padding to allign to 8-byte + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_base_gpu_coherent_group { + __u64 core_mask; + __u16 num_cores; + __u16 padding[3]; +}; + +/** + * struct mali_base_gpu_coherent_group_info - Coherency group information + * @num_groups: Number of coherent groups in the GPU. + * @num_core_groups: Number of core groups (coherent or not) in the GPU. + * Equivalent to the number of L2 Caches. + * The GPU Counter dumping writes 2048 bytes per core group, regardless + * of whether the core groups are coherent or not. Hence this member is + * needed to calculate how much memory is required for dumping. + * @note Do not use it to work out how many valid elements are in the + * group[] member. Use num_groups instead. + * @coherency: Coherency features of the memory, accessed by gpu_mem_features + * methods + * @padding: padding to allign to 8-byte + * @group: Descriptors of coherent groups + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. + */ +struct mali_base_gpu_coherent_group_info { + __u32 num_groups; + __u32 num_core_groups; + __u32 coherency; + __u32 padding; + struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; +}; + +/** + * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware + * Configuration Discovery registers. + * @shader_present: Shader core present bitmap + * @tiler_present: Tiler core present bitmap + * @l2_present: Level 2 cache present bitmap + * @stack_present: Core stack present bitmap + * @l2_features: L2 features + * @core_features: Core features + * @mem_features: Mem features + * @mmu_features: Mmu features + * @as_present: Bitmap of address spaces present + * @js_present: Job slots present + * @js_features: Array of job slot features. 
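As the comments above stress, only the first num_groups entries of group[] are meaningful. A sketch of walking them; the helper name is ours.

static unsigned
example_total_cores(const struct mali_base_gpu_coherent_group_info *info)
{
        unsigned i, total = 0;

        for (i = 0; i < info->num_groups && i < BASE_MAX_COHERENT_GROUPS; i++)
                total += info->group[i].num_cores;
        return total;
}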
+ * @tiler_features: Tiler features + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU + * @gpu_id: GPU and revision identifier + * @thread_max_threads: Maximum number of threads per core + * @thread_max_workgroup_size: Maximum number of threads per workgroup + * @thread_max_barrier_size: Maximum number of threads per barrier + * @thread_features: Thread features + * @coherency_mode: Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register + * @thread_tls_alloc: Number of threads per core that TLS must be allocated for + * @gpu_features: GPU features + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct gpu_raw_gpu_props { + __u64 shader_present; + __u64 tiler_present; + __u64 l2_present; + __u64 stack_present; + __u32 l2_features; + __u32 core_features; + __u32 mem_features; + __u32 mmu_features; + + __u32 as_present; + + __u32 js_present; + __u32 js_features[GPU_MAX_JOB_SLOTS]; + __u32 tiler_features; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + __u32 gpu_id; + + __u32 thread_max_threads; + __u32 thread_max_workgroup_size; + __u32 thread_max_barrier_size; + __u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + __u32 coherency_mode; + + __u32 thread_tls_alloc; + __u64 gpu_features; +}; + +/** + * struct base_gpu_props - Return structure for base_get_gpu_props(). + * @core_props: Core props. + * @l2_props: L2 props. + * @unused_1: Keep for backwards compatibility. + * @tiler_props: Tiler props. + * @thread_props: Thread props. + * @raw_props: This member is large, likely to be 128 bytes. + * @coherency_info: This must be last member of the structure. + * + * NOTE: the raw_props member in this data structure contains the register + * values from which the value of the other members are derived. The derived + * members exist to allow for efficient access and/or shielding the details + * of the layout of the registers. + */ +struct base_gpu_props { + struct mali_base_gpu_core_props core_props; + struct mali_base_gpu_l2_cache_props l2_props; + __u64 unused_1; + struct mali_base_gpu_tiler_props tiler_props; + struct mali_base_gpu_thread_props thread_props; + struct gpu_raw_gpu_props raw_props; + struct mali_base_gpu_coherent_group_info coherency_info; +}; + +#define BASE_MEM_GROUP_ID_GET(flags) \ + ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) + +#define BASE_MEM_GROUP_ID_SET(id) \ + (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? 
\ + BASE_MEM_GROUP_DEFAULT : \ + id) \ + << BASEP_MEM_GROUP_ID_SHIFT) & \ + BASE_MEM_GROUP_ID_MASK) + +#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ + (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ + ((base_context_create_flags)(group_id) \ + << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) + +#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ + ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ + BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* + * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These + * flags are also used, where applicable, for specifying which fields + * are valid following the request operation. + */ + +/* For monotonic (counter) timefield */ +#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) +/* For system wide timestamp */ +#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) +/* For GPU cycle counter */ +#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) +/* Specify kernel GPU register timestamp */ +#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) +/* Specify userspace cntvct_el0 timestamp source */ +#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) + +#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ + BASE_TIMEINFO_MONOTONIC_FLAG | \ + BASE_TIMEINFO_TIMESTAMP_FLAG | \ + BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ + BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ + BASE_TIMEINFO_USER_SOURCE_FLAG) + +/* Maximum number of source allocations allowed to create an alias allocation. + * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array + * layers, since each cube map in the array will have 6 faces. + */ +#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) + +#endif /* _UAPI_BASE_KERNEL_H_ */ diff --git a/src/panfrost/csf_test/mali_gpu_csf_registers.h b/src/panfrost/csf_test/mali_gpu_csf_registers.h new file mode 100644 index 00000000000..17e338cb238 --- /dev/null +++ b/src/panfrost/csf_test/mali_gpu_csf_registers.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * This header was originally autogenerated, but it is now ok (and + * expected) to have to add to it. + */ + +#ifndef _UAPI_GPU_CSF_REGISTERS_H_ +#define _UAPI_GPU_CSF_REGISTERS_H_ + +/* Only user block defines are included. 
HI words have been removed */ + +/* CS_USER_INPUT_BLOCK register offsets */ +#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ +#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ + +/* CS_USER_OUTPUT_BLOCK register offsets */ +#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ +#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ + +/* USER register offsets */ +#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ + +#endif diff --git a/src/panfrost/csf_test/mali_kbase_csf_ioctl.h b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h new file mode 100644 index 00000000000..3df8a01699f --- /dev/null +++ b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h @@ -0,0 +1,483 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_CSF_IOCTL_H_ +#define _UAPI_KBASE_CSF_IOCTL_H_ + +#include +#include + +/* + * 1.0: + * - CSF IOCTL header separated from JM + * 1.1: + * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 1.2: + * - Add new CSF GPU_FEATURES register into the property structure + * returned by KBASE_IOCTL_GET_GPUPROPS + * 1.3: + * - Add __u32 group_uid member to + * &struct_kbase_ioctl_cs_queue_group_create.out + * 1.4: + * - Replace padding in kbase_ioctl_cs_get_glb_iface with + * instr_features member of same size + * 1.5: + * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new + * queue registration call with extended format for supporting CS + * trace configurations with CSF trace_command. + * 1.6: + * - Added new HW performance counters interface to all GPUs. + * 1.7: + * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use + * 1.8: + * - Removed Kernel legacy HWC interface + */ + +#define BASE_UK_VERSION_MAJOR 1 +#define BASE_UK_VERSION_MINOR 8 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + + +/** + * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the + * base back-end + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * + * @Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. 
+ * Any change of this struct should also be mirrored to the latter. + */ +struct kbase_ioctl_cs_queue_register { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER \ + _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) + +/** + * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler + * to notify that a queue has been updated + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_kick { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_KICK \ + _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) + +/** + * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group + * + * @in: Input parameters + * @in.buffer_gpu_addr: GPU address of the buffer backing the queue + * @in.group_handle: Handle of the group to which the queue should be bound + * @in.csi_index: Index of the CSF interface the queue should be bound to + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.mmap_handle: Handle to be used for creating the mapping of CS + * input/output pages + */ +union kbase_ioctl_cs_queue_bind { + struct { + __u64 buffer_gpu_addr; + __u8 group_handle; + __u8 csi_index; + __u8 padding[6]; + } in; + struct { + __u64 mmap_handle; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_BIND \ + _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) + +/** + * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the + * base back-end in extended format, + * involving trace buffer configuration + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * @ex_offset_var_addr: GPU address of the trace buffer write offset variable + * @ex_buffer_base: Trace buffer GPU base address for the queue + * @ex_buffer_size: Size of the trace buffer in bytes + * @ex_event_size: Trace event write size, in log2 designation + * @ex_event_state: Trace event states configuration + * @ex_padding: Currently unused, must be zero + * + * @Note: There is an identical sub-section at the start of this struct to that + * of @ref kbase_ioctl_cs_queue_register. Any change of this sub-section + * must also be mirrored to the latter. Following the said sub-section, + * the remaining fields forms the extension, marked with ex_*. + */ +struct kbase_ioctl_cs_queue_register_ex { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; + __u64 ex_offset_var_addr; + __u64 ex_buffer_base; + __u32 ex_buffer_size; + __u8 ex_event_size; + __u8 ex_event_state; + __u8 ex_padding[2]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ + _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) + +/** + * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_terminate { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) + +/** + * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue + * group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. 
+ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. + */ +union kbase_ioctl_cs_queue_group_create_1_6 { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ + _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) + +/** + * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. 
+ */ +union kbase_ioctl_cs_queue_group_create { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + __u64 reserved; + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ + _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) + +/** + * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group + * + * @group_handle: Handle of the queue group to be terminated + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_cs_queue_group_term { + __u8 group_handle; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) + +#define KBASE_IOCTL_CS_EVENT_SIGNAL \ + _IO(KBASE_IOCTL_TYPE, 44) + +typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ + +/** + * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue + * + * @id: ID of the new command queue returned by the kernel + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_new { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ + _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) + +/** + * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue + * + * @id: ID of the command queue to be destroyed + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_delete { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ + _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) + +/** + * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue + * + * @addr: Memory address of an array of struct base_kcpu_queue_command + * @nr_commands: Number of commands in the array + * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_enqueue { + __u64 addr; + __u32 nr_commands; + base_kcpu_queue_id id; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ + _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) + +/** + * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap + * @in: Input parameters + * @in.chunk_size: Size of each chunk. + * @in.initial_chunks: Initial number of chunks that heap will be created with. + * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. + * @in.target_in_flight: Number of render-passes that the driver should attempt to + * keep in flight for which allocation of new chunks is + * allowed. + * @in.group_id: Group ID to be used for physical allocations. + * @in.padding: Padding + * @out: Output parameters + * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up + * for the heap. + * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, + * actually points to the header of heap chunk and not to + * the low address of free memory in the chunk. 
+ */ +union kbase_ioctl_cs_tiler_heap_init { + struct { + __u32 chunk_size; + __u32 initial_chunks; + __u32 max_chunks; + __u16 target_in_flight; + __u8 group_id; + __u8 padding; + } in; + struct { + __u64 gpu_heap_va; + __u64 first_chunk_va; + } out; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ + _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) + +/** + * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap + * instance + * + * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. + */ +struct kbase_ioctl_cs_tiler_heap_term { + __u64 gpu_heap_va; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ + _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) + +/** + * union kbase_ioctl_cs_get_glb_iface - Request the global control block + * of CSF interface capabilities + * + * @in: Input parameters + * @in.max_group_num: The maximum number of groups to be read. Can be 0, in + * which case groups_ptr is unused. + * @in.max_total_stream _num: The maximum number of CSs to be read. Can be 0, in + * which case streams_ptr is unused. + * @in.groups_ptr: Pointer where to store all the group data (sequentially). + * @in.streams_ptr: Pointer where to store all the CS data (sequentially). + * @out: Output parameters + * @out.glb_version: Global interface version. + * @out.features: Bit mask of features (e.g. whether certain types of job + * can be suspended). + * @out.group_num: Number of CSGs supported. + * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 + * hold the size of firmware performance counter data + * and 15:0 hold the size of hardware performance counter + * data. + * @out.total_stream_num: Total number of CSs, summed across all groups. + * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum + * size of events. Bits 3:0 hold the offset update rate. 
+ * (csf >= 1.1.0) + * + */ +union kbase_ioctl_cs_get_glb_iface { + struct { + __u32 max_group_num; + __u32 max_total_stream_num; + __u64 groups_ptr; + __u64 streams_ptr; + } in; + struct { + __u32 glb_version; + __u32 features; + __u32 group_num; + __u32 prfcnt_size; + __u32 total_stream_num; + __u32 instr_features; + } out; +}; + +#define KBASE_IOCTL_CS_GET_GLB_IFACE \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) + +struct kbase_ioctl_cs_cpu_queue_info { + __u64 buffer; + __u64 size; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ + _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +/** + * struct kbase_ioctl_cs_event_memory_write - Write an event memory address + * @cpu_addr: Memory address to write + * @value: Value to write + * @padding: Currently unused, must be zero + */ +struct kbase_ioctl_cs_event_memory_write { + __u64 cpu_addr; + __u8 value; + __u8 padding[7]; +}; + +/** + * union kbase_ioctl_cs_event_memory_read - Read an event memory address + * @in: Input parameters + * @in.cpu_addr: Memory address to read + * @out: Output parameters + * @out.value: Value read + * @out.padding: Currently unused, must be zero + */ +union kbase_ioctl_cs_event_memory_read { + struct { + __u64 cpu_addr; + } in; + struct { + __u8 value; + __u8 padding[7]; + } out; +}; + +#endif /* MALI_UNIT_TEST */ + +#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ diff --git a/src/panfrost/csf_test/mali_kbase_ioctl.h b/src/panfrost/csf_test/mali_kbase_ioctl.h new file mode 100644 index 00000000000..fc81b71b46a --- /dev/null +++ b/src/panfrost/csf_test/mali_kbase_ioctl.h @@ -0,0 +1,854 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_IOCTL_H_ +#define _UAPI_KBASE_IOCTL_H_ + +#ifdef __cpluscplus +extern "C" { +#endif + +#include +#include + +#define KBASE_IOCTL_TYPE 0x80 + +/** + * struct kbase_ioctl_set_flags - Set kernel context creation flags + * + * @create_flags: Flags - see base_context_create_flags + */ +struct kbase_ioctl_set_flags { + __u32 create_flags; +}; + +#define KBASE_IOCTL_SET_FLAGS \ + _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) + +/** + * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel + * + * @buffer: Pointer to the buffer to store properties into + * @size: Size of the buffer + * @flags: Flags - must be zero for now + * + * The ioctl will return the number of bytes stored into @buffer or an error + * on failure (e.g. 
@size is too small). If @size is specified as 0 then no + * data will be written but the return value will be the number of bytes needed + * for all the properties. + * + * @flags may be used in the future to request a different format for the + * buffer. With @flags == 0 the following format is used. + * + * The buffer will be filled with pairs of values, a __u32 key identifying the + * property followed by the value. The size of the value is identified using + * the bottom bits of the key. The value then immediately followed the key and + * is tightly packed (there is no padding). All keys and values are + * little-endian. + * + * 00 = __u8 + * 01 = __u16 + * 10 = __u32 + * 11 = __u64 + */ +struct kbase_ioctl_get_gpuprops { + __u64 buffer; + __u32 size; + __u32 flags; +}; + +#define KBASE_IOCTL_GET_GPUPROPS \ + _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) + +/** + * union kbase_ioctl_mem_alloc - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @out: Output parameters + * @out.flags: Flags + * @out.gpu_va: The GPU virtual address which is allocated + */ +union kbase_ioctl_mem_alloc { + struct { + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u64 flags; + } in; + struct { + __u64 flags; + __u64 gpu_va; + } out; +}; + +#define KBASE_IOCTL_MEM_ALLOC \ + _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) + +/** + * struct kbase_ioctl_mem_query - Query properties of a GPU memory region + * @in: Input parameters + * @in.gpu_addr: A GPU address contained within the region + * @in.query: The type of query + * @out: Output parameters + * @out.value: The result of the query + * + * Use a %KBASE_MEM_QUERY_xxx flag as input for @query. + */ +union kbase_ioctl_mem_query { + struct { + __u64 gpu_addr; + __u64 query; + } in; + struct { + __u64 value; + } out; +}; + +#define KBASE_IOCTL_MEM_QUERY \ + _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) + +#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) +#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) +#define KBASE_MEM_QUERY_FLAGS ((__u64)3) + +/** + * struct kbase_ioctl_mem_free - Free a memory region + * @gpu_addr: Handle to the region to free + */ +struct kbase_ioctl_mem_free { + __u64 gpu_addr; +}; + +#define KBASE_IOCTL_MEM_FREE \ + _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @buffer_count: requested number of dumping buffers + * @fe_bm: counters selection bitmask (Front end) + * @shader_bm: counters selection bitmask (Shader) + * @tiler_bm: counters selection bitmask (Tiler) + * @mmu_l2_bm: counters selection bitmask (MMU_L2) + * + * A fd is returned from the ioctl if successful, or a negative value on error + */ +struct kbase_ioctl_hwcnt_reader_setup { + __u32 buffer_count; + __u32 fe_bm; + __u32 shader_bm; + __u32 tiler_bm; + __u32 mmu_l2_bm; +}; + +#define KBASE_IOCTL_HWCNT_READER_SETUP \ + _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) + +/** + * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. + * @data: Counter samples for the dummy model. + * @size: Size of the counter sample data. + * @padding: Padding. 
+ */ +struct kbase_ioctl_hwcnt_values { + __u64 data; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_HWCNT_SET \ + _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) + +/** + * struct kbase_ioctl_disjoint_query - Query the disjoint counter + * @counter: A counter of disjoint events in the kernel + */ +struct kbase_ioctl_disjoint_query { + __u32 counter; +}; + +#define KBASE_IOCTL_DISJOINT_QUERY \ + _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) + +/** + * struct kbase_ioctl_get_ddk_version - Query the kernel version + * @version_buffer: Buffer to receive the kernel version string + * @size: Size of the buffer + * @padding: Padding + * + * The ioctl will return the number of bytes written into version_buffer + * (which includes a NULL byte) or a negative error code + * + * The ioctl request code has to be _IOW because the data in ioctl struct is + * being copied to the kernel, even though the kernel then writes out the + * version info to the buffer specified in the ioctl. + */ +struct kbase_ioctl_get_ddk_version { + __u64 version_buffer; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_GET_DDK_VERSION \ + _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) + +/** + * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 10.2--11.4) + * @va_pages: Number of VA pages to reserve for JIT + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_10_2 { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) + +/** + * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 11.5--11.19) + * @va_pages: Number of VA pages to reserve for JIT + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_11_5 { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) + +/** + * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory + * allocator + * @va_pages: Number of GPU virtual address pages to reserve for just-in-time + * memory allocations + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * @phys_pages: Maximum number of physical pages to allocate just-in-time + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. 
+ */ +struct kbase_ioctl_mem_jit_init { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; + __u64 phys_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) + +/** + * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory + * + * @handle: GPU memory handle (GPU VA) + * @user_addr: The address where it is mapped in user space + * @size: The number of bytes to synchronise + * @type: The direction to synchronise: 0 is sync to memory (clean), + * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_mem_sync { + __u64 handle; + __u64 user_addr; + __u64 size; + __u8 type; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_MEM_SYNC \ + _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) + +/** + * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer + * + * @in: Input parameters + * @in.gpu_addr: The GPU address of the memory region + * @in.cpu_addr: The CPU address to locate + * @in.size: A size in bytes to validate is contained within the region + * @out: Output parameters + * @out.offset: The offset from the start of the memory region to @cpu_addr + */ +union kbase_ioctl_mem_find_cpu_offset { + struct { + __u64 gpu_addr; + __u64 cpu_addr; + __u64 size; + } in; + struct { + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) + +/** + * struct kbase_ioctl_get_context_id - Get the kernel context ID + * + * @id: The kernel context ID + */ +struct kbase_ioctl_get_context_id { + __u32 id; +}; + +#define KBASE_IOCTL_GET_CONTEXT_ID \ + _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) + +/** + * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd + * + * @flags: Flags + * + * The ioctl returns a file descriptor when successful + */ +struct kbase_ioctl_tlstream_acquire { + __u32 flags; +}; + +#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ + _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) + +#define KBASE_IOCTL_TLSTREAM_FLUSH \ + _IO(KBASE_IOCTL_TYPE, 19) + +/** + * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region + * + * @gpu_addr: The memory region to modify + * @pages: The number of physical pages that should be present + * + * The ioctl may return on the following error codes or 0 for success: + * -ENOMEM: Out of memory + * -EINVAL: Invalid arguments + */ +struct kbase_ioctl_mem_commit { + __u64 gpu_addr; + __u64 pages; +}; + +#define KBASE_IOCTL_MEM_COMMIT \ + _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) + +/** + * union kbase_ioctl_mem_alias - Create an alias of memory regions + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.stride: Bytes between start of each memory region + * @in.nents: The number of regions to pack together into the alias + * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_alias { + struct { + __u64 flags; + __u64 stride; + __u64 nents; + __u64 aliasing_info; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_ALIAS \ + _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) + +/** + * union 
kbase_ioctl_mem_import - Import memory for use by the GPU + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.phandle: Handle to the external memory + * @in.type: Type of external memory, see base_mem_import_type + * @in.padding: Amount of extra VA pages to append to the imported buffer + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_import { + struct { + __u64 flags; + __u64 phandle; + __u32 type; + __u32 padding; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_IMPORT \ + _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) + +/** + * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region + * @gpu_va: The GPU region to modify + * @flags: The new flags to set + * @mask: Mask of the flags to modify + */ +struct kbase_ioctl_mem_flags_change { + __u64 gpu_va; + __u64 flags; + __u64 mask; +}; + +#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ + _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) + +/** + * struct kbase_ioctl_stream_create - Create a synchronisation stream + * @name: A name to identify this stream. Must be NULL-terminated. + * + * Note that this is also called a "timeline", but is named stream to avoid + * confusion with other uses of the word. + * + * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. + * + * The ioctl returns a file descriptor. + */ +struct kbase_ioctl_stream_create { + char name[32]; +}; + +#define KBASE_IOCTL_STREAM_CREATE \ + _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) + +/** + * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence + * @fd: The file descriptor to validate + */ +struct kbase_ioctl_fence_validate { + int fd; +}; + +#define KBASE_IOCTL_FENCE_VALIDATE \ + _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) + +/** + * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel + * @buffer: Pointer to the information + * @len: Length + * @padding: Padding + * + * The data provided is accessible through a debugfs file + */ +struct kbase_ioctl_mem_profile_add { + __u64 buffer; + __u32 len; + __u32 padding; +}; + +#define KBASE_IOCTL_MEM_PROFILE_ADD \ + _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) + +/** + * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to map + */ +struct kbase_ioctl_sticky_resource_map { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ + _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) + +/** + * struct kbase_ioctl_sticky_resource_map - Unmap a resource mapped which was + * previously permanently mapped + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to unmap + */ +struct kbase_ioctl_sticky_resource_unmap { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ + _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) + +/** + * union kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of + * the GPU memory region for + * the given gpu address and + * the offset of that address + * into the region + * @in: Input parameters + * @in.gpu_addr: GPU virtual address + * @in.size: Size in bytes within the region 
+ * @out: Output parameters + * @out.start: Address of the beginning of the memory region enclosing @gpu_addr + * for the length of @offset bytes + * @out.offset: The offset from the start of the memory region to @gpu_addr + */ +union kbase_ioctl_mem_find_gpu_start_and_offset { + struct { + __u64 gpu_addr; + __u64 size; + } in; + struct { + __u64 start; + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) + +#define KBASE_IOCTL_CINSTR_GWT_START \ + _IO(KBASE_IOCTL_TYPE, 33) + +#define KBASE_IOCTL_CINSTR_GWT_STOP \ + _IO(KBASE_IOCTL_TYPE, 34) + +/** + * union kbase_ioctl_gwt_dump - Used to collect all GPU write fault addresses. + * @in: Input parameters + * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. + * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) + * @in.len: Number of addresses the buffers can hold. + * @in.padding: padding + * @out: Output parameters + * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. + * @out.more_data_available: Status indicating if more addresses are available. + * @out.padding: padding + * + * This structure is used when performing a call to dump GPU write fault + * addresses. + */ +union kbase_ioctl_cinstr_gwt_dump { + struct { + __u64 addr_buffer; + __u64 size_buffer; + __u32 len; + __u32 padding; + + } in; + struct { + __u32 no_of_addr_collected; + __u8 more_data_available; + __u8 padding[27]; + } out; +}; + +#define KBASE_IOCTL_CINSTR_GWT_DUMP \ + _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) + +/** + * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone + * + * @va_pages: Number of VA pages to reserve for EXEC_VA + */ +struct kbase_ioctl_mem_exec_init { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_EXEC_INIT \ + _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) + +/** + * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of + * cpu/gpu time (counter values) + * @in: Input parameters + * @in.request_flags: Bit-flags indicating the requested types. + * @in.paddings: Unused, size alignment matching the out. + * @out: Output parameters + * @out.sec: Integer field of the monotonic time, unit in seconds. + * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. + * @out.padding: Unused, for __u64 alignment + * @out.timestamp: System wide timestamp (counter) value. + * @out.cycle_counter: GPU cycle counter value. + */ +union kbase_ioctl_get_cpu_gpu_timeinfo { + struct { + __u32 request_flags; + __u32 paddings[7]; + } in; + struct { + __u64 sec; + __u32 nsec; + __u32 padding; + __u64 timestamp; + __u64 cycle_counter; + } out; +}; + +#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ + _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) + +/** + * struct kbase_ioctl_context_priority_check - Check the max possible priority + * @priority: Input priority & output priority + */ + +struct kbase_ioctl_context_priority_check { + __u8 priority; +}; + +#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) + +/** + * struct kbase_ioctl_set_limited_core_count - Set the limited core count. 
+ * + * @max_core_count: Maximum core count + */ +struct kbase_ioctl_set_limited_core_count { + __u8 max_core_count; +}; + +#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ + _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) + +/** + * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter + * information + * @info_item_size: Performance counter item size in bytes. + * @info_item_count: Performance counter item count in the info_list_ptr. + * @info_list_ptr: Performance counter item list pointer which points to a + * list with info_item_count of items. + * + * On success: returns info_item_size and info_item_count if info_list_ptr is + * NULL, returns performance counter information if info_list_ptr is not NULL. + * On error: returns a negative error code. + */ +struct kbase_ioctl_kinstr_prfcnt_enum_info { + __u32 info_item_size; + __u32 info_item_count; + __u64 info_list_ptr; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ + _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @in: input parameters. + * @in.request_item_count: Number of requests in the requests array. + * @in.request_item_size: Size in bytes of each request in the requests array. + * @in.requests_ptr: Pointer to the requests array. + * @out: output parameters. + * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for + * each sample. + * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap + * for reading performance counter samples. + * + * A fd is returned from the ioctl if successful, or a negative value on error. + */ +union kbase_ioctl_kinstr_prfcnt_setup { + struct { + __u32 request_item_count; + __u32 request_item_size; + __u64 requests_ptr; + } in; + struct { + __u32 prfcnt_metadata_item_size; + __u32 prfcnt_mmap_size_bytes; + } out; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ + _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) + + +/** + * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes + * @bytes_collected: number of bytes read by user + * @bytes_generated: number of bytes generated by tracepoints + */ +struct kbase_ioctl_tlstream_stats { + __u32 bytes_collected; + __u32 bytes_generated; +}; + +#define KBASE_IOCTL_TLSTREAM_STATS \ + _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) + +#endif /* MALI_UNIT_TEST */ + +/* Customer extension range */ +#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) + +/* If the integration needs extra ioctl add them there + * like this: + * + * struct my_ioctl_args { + * .... 
+ * } + * + * #define KBASE_IOCTL_MY_IOCTL \ + * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) + */ + + +/********************************** + * Definitions for GPU properties * + **********************************/ +#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) +#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) +#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) +#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) + +#define KBASE_GPUPROP_PRODUCT_ID 1 +#define KBASE_GPUPROP_VERSION_STATUS 2 +#define KBASE_GPUPROP_MINOR_REVISION 3 +#define KBASE_GPUPROP_MAJOR_REVISION 4 +/* 5 previously used for GPU speed */ +#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 +/* 7 previously used for minimum GPU speed */ +#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 +#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 +#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 +#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 +#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 + +#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 +#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 +#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 + +#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 +#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 + +#define KBASE_GPUPROP_MAX_THREADS 18 +#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 +#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 +#define KBASE_GPUPROP_MAX_REGISTERS 21 +#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 +#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 +#define KBASE_GPUPROP_IMPL_TECH 24 + +#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 +#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 +#define KBASE_GPUPROP_RAW_L2_PRESENT 27 +#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 +#define KBASE_GPUPROP_RAW_L2_FEATURES 29 +#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 +#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 +#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 +#define KBASE_GPUPROP_RAW_AS_PRESENT 33 +#define KBASE_GPUPROP_RAW_JS_PRESENT 34 +#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 +#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 +#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 +#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 +#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 +#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 +#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 +#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 +#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 +#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 +#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 +#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 +#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 +#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 +#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 +#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 +#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 +#define KBASE_GPUPROP_RAW_GPU_ID 55 +#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 +#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 +#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 +#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 +#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 + +#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 +#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 +#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 +#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 +#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 +#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 +#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 +#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 +#define KBASE_GPUPROP_COHERENCY_GROUP_5 69 +#define KBASE_GPUPROP_COHERENCY_GROUP_6 70 +#define 
KBASE_GPUPROP_COHERENCY_GROUP_7 71 +#define KBASE_GPUPROP_COHERENCY_GROUP_8 72 +#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 +#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 +#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 +#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 +#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 +#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 +#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 + +#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 + +#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 + +#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 +#define KBASE_GPUPROP_TLS_ALLOC 84 +#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 +#ifdef __cpluscplus +} +#endif + +#endif /* _UAPI_KBASE_IOCTL_H_ */ diff --git a/src/panfrost/csf_test/test.c b/src/panfrost/csf_test/test.c new file mode 100644 index 00000000000..cb9ff398314 --- /dev/null +++ b/src/panfrost/csf_test/test.c @@ -0,0 +1,1903 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/macros.h" + +#include "mali_kbase_csf_ioctl.h" +#include "mali_kbase_ioctl.h" +#include "mali_base_kernel.h" +#include "mali_base_csf_kernel.h" +#include "mali_gpu_csf_registers.h" + +#define PAN_ARCH 10 +#include "genxml/gen_macros.h" + +#include "wrap.h" +#include "decode.h" + +#include "pan_shader.h" +#include "compiler/nir/nir_builder.h" +#include "bifrost/valhall/disassemble.h" + +#define CS_EVENT_REGISTER 0x5A + +static bool pr = true; +static bool colour_term = true; + +static void +dump_start(FILE *f) +{ + if (colour_term) + fprintf(f, "\x1b[90m"); +} + +static void +dump_end(FILE *f) +{ + if (colour_term) + fprintf(f, "\x1b[39m"); +} + +/* TODO: Use KBASE_IOCTL_MEM_SYNC for 32-bit systems */ +static void +cache_clean(volatile void *addr) +{ +#ifdef __aarch64__ + __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); +#endif +} + +static void +cache_invalidate(volatile void *addr) +{ +#ifdef __aarch64__ + __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); +#endif +} + +static void +cache_barrier(void) +{ +#ifdef __ARM_ARCH + __asm__ volatile ("dsb sy" ::: "memory"); +#endif +} + +static void +memory_barrier(void) +{ +#ifdef __ARM_ARCH + __asm__ volatile ("dmb sy" ::: "memory"); +#endif +} + +typedef void (*cacheline_op)(volatile void *addr); + +#define CACHELINE_SIZE 64 + +static void +cacheline_op_range(volatile void *start, unsigned length, cacheline_op op) +{ + volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); + volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); + for (; ptr < end; ptr += CACHELINE_SIZE) + op(ptr); +} + +static void +cache_clean_range(volatile void *start, unsigned length) +{ + cacheline_op_range(start, length, cache_clean); +} + +static void +cache_invalidate_range(volatile void *start, unsigned length) +{ + cacheline_op_range(start, length, cache_invalidate); +} + +struct state; +struct test; + +typedef bool (* section)(struct state *s, struct test *t); + +#define CS_QUEUE_COUNT 4 /* compute / vertex / fragment / other */ +#define CS_QUEUE_SIZE 65536 + +struct state { + int page_size; + int argc; + char **argv; + + int mali_fd; + int tl_fd; + void *tracking_region; + void *csf_user_reg; + + uint8_t *gpuprops; + unsigned gpuprops_size; + uint32_t gpu_id; + + struct { + struct panfrost_ptr normal, exec, coherent, cached, event, ev2; + } allocations; + + uint64_t tiler_heap_va; + uint64_t tiler_heap_header; + + uint8_t csg_handle; + uint32_t csg_uid; + + struct panfrost_ptr cs_mem[CS_QUEUE_COUNT]; + void *cs_user_io[CS_QUEUE_COUNT]; + unsigned cs_last_submit[CS_QUEUE_COUNT]; + struct pan_command_stream cs[CS_QUEUE_COUNT]; + + unsigned shader_alloc_offset; + mali_ptr compute_shader; +}; + +struct test { + section part; + section cleanup; + const char *label; + + struct test *subtests; + unsigned sub_length; + + /* for allocation tests */ + unsigned offset; + unsigned flags; + + bool add; + bool invalid; + bool blit; + bool vertex; +}; + +/* See STATE and ALLOC macros below */ +#define DEREF_STATE(s, offset) ((void*) s + offset) + +static uint64_t +pan_get_gpuprop(struct state *s, int name) +{ + int i = 0; + uint64_t x = 0; + while (i < s->gpuprops_size) { + x = 0; + memcpy(&x, s->gpuprops + i, 4); + i += 4; + + int size = 1 << (x & 3); + int this_name = x >> 2; + + x = 0; + memcpy(&x, s->gpuprops + i, size); + i += size; + + if 
(this_name == name) + return x; + } + + fprintf(stderr, "Unknown prop %i\n", name); + return 0; +} + +static bool +open_kbase(struct state *s, struct test *t) +{ + s->mali_fd = open("/dev/mali0", O_RDWR); + if (s->mali_fd != -1) + return true; + + perror("open(\"/dev/mali0\")"); + return false; +} + +static bool +close_kbase(struct state *s, struct test *t) +{ + if (getenv("TEST_CHECK_LEAKS")) { + int pid = getpid(); + char cmd_buffer[64] = {0}; + sprintf(cmd_buffer, "grep /dev/mali /proc/%i/maps", pid); + system(cmd_buffer); + sprintf(cmd_buffer, "ls -l /proc/%i/fd", pid); + system(cmd_buffer); + } + + if (s->mali_fd > 0) + return close(s->mali_fd) == 0; + return true; +} + +static bool +get_version(struct state *s, struct test *t) +{ + struct kbase_ioctl_version_check ver = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_VERSION_CHECK, &ver); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_VERSION_CHECK)"); + return false; + } + + if (pr) + printf("Major %i Minor %i: ", ver.major, ver.minor); + return true; +} + +static bool +set_flags(struct state *s, struct test *t) +{ + struct kbase_ioctl_set_flags flags = { + .create_flags = 0 + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_SET_FLAGS, &flags); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); + return false; + } + return true; +} + +static bool +mmap_tracking(struct state *s, struct test *t) +{ + s->tracking_region = mmap(NULL, s->page_size, PROT_NONE, + MAP_SHARED, s->mali_fd, + BASE_MEM_MAP_TRACKING_HANDLE); + + if (s->tracking_region == MAP_FAILED) { + perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); + s->tracking_region = NULL; + return false; + } + return true; +} + +static bool +munmap_tracking(struct state *s, struct test *t) +{ + if (s->tracking_region) + return munmap(s->tracking_region, s->page_size) == 0; + return true; +} + +static bool +get_gpuprops(struct state *s, struct test *t) +{ + struct kbase_ioctl_get_gpuprops props = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); + return false; + } else if (!ret) { + fprintf(stderr, "GET_GPUPROPS returned zero size\n"); + return false; + } + + s->gpuprops_size = ret; + s->gpuprops = calloc(s->gpuprops_size, 1); + + props.size = s->gpuprops_size; + props.buffer = (uint64_t)(uintptr_t) s->gpuprops; + + ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); + return false; + } + + return true; +} + +static bool +free_gpuprops(struct state *s, struct test *t) +{ + free(s->gpuprops); + return true; +} + +static bool +get_gpu_id(struct state *s, struct test *t) +{ + uint64_t gpu_id = pan_get_gpuprop(s, KBASE_GPUPROP_PRODUCT_ID); + if (!gpu_id) + return false; + s->gpu_id = gpu_id; + + uint16_t maj = gpu_id >> 12; + uint16_t min = (gpu_id >> 8) & 0xf; + uint16_t rev = (gpu_id >> 4) & 0xf; + + uint16_t product = gpu_id & 0xf; + uint16_t prod = product | ((maj & 1) << 4); + + const char *names[] = { + [1] = "TDUX", + [2] = "G710", + [3] = "G510", + [4] = "G310", + [7] = "G610", + [16 + 2] = "G715", /* TODO: Immortalis instead of Mali? */ + [16 + 3] = "G615", + }; + const char *name = (prod < ARRAY_SIZE(names)) ? 
names[prod] : NULL; + if (!name) + name = "unknown"; + + if (pr) + printf("v%i.%i.%i Mali-%s (%i): ", maj, min, rev, name, product); + + if (maj < 10) { + printf("not v10 or later: "); + return false; + } + + return true; +} + +static bool +get_coherency_mode(struct state *s, struct test *t) +{ + uint64_t mode = pan_get_gpuprop(s, KBASE_GPUPROP_RAW_COHERENCY_MODE); + + const char *modes[] = { + [0] = "ACE-Lite", + [1] = "ACE", + [31] = "None", + }; + const char *name = (mode < ARRAY_SIZE(modes)) ? modes[mode] : NULL; + if (!name) + name = "Unknown"; + + if (pr) + printf("0x%"PRIx64" (%s): ", mode, name); + return true; +} + +static bool +get_csf_caps(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_get_glb_iface iface = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(0))"); + return false; + } + + int ver_maj = iface.out.glb_version >> 24; + int ver_min = (iface.out.glb_version >> 16) & 0xff; + int ver_rev = iface.out.glb_version & 0xffff; + + if (pr) + printf("v%i.%i.%i: feature mask 0x%x, %i groups, %i total: ", + ver_maj, ver_min, ver_rev, iface.out.features, + iface.out.group_num, iface.out.total_stream_num); + + unsigned group_num = iface.out.group_num; + unsigned stream_num = iface.out.total_stream_num; + + struct basep_cs_group_control *group_data = + calloc(group_num, sizeof(*group_data)); + + struct basep_cs_stream_control *stream_data = + calloc(stream_num, sizeof(*stream_data)); + + iface = (union kbase_ioctl_cs_get_glb_iface) { + .in = { + .max_group_num = group_num, + .max_total_stream_num = stream_num, + .groups_ptr = (uintptr_t) group_data, + .streams_ptr = (uintptr_t) stream_data, + } + }; + + ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(size))"); + + free(group_data); + free(stream_data); + + return false; + } + + unsigned print_groups = pr ? group_num : 0; + unsigned print_streams = pr ? 
stream_num : 0; + + for (unsigned i = 0; i < print_groups; ++i) { + if (i && !memcmp(group_data + i, group_data + i - 1, sizeof(*group_data))) + continue; + + fprintf(stderr, "Group %i-: feature mask 0x%x, %i streams\n", + i, group_data[i].features, group_data[i].stream_num); + } + + for (unsigned i = 0; i < print_streams; ++i) { + if (i && !memcmp(stream_data + i, stream_data + i - 1, sizeof(*stream_data))) + continue; + + unsigned reg = stream_data[i].features & 0xff; + unsigned score = (stream_data[i].features >> 8) & 0xff; + unsigned feat = stream_data[i].features >> 16; + + fprintf(stderr, "Stream %i-: 0x%x work registers, %i scoreboards, iterator mask: 0x%x\n", + i, reg, score, feat); + } + + free(group_data); + free(stream_data); + + return true; +} + +static bool +mmap_user_reg(struct state *s, struct test *t) +{ + s->csf_user_reg = mmap(NULL, s->page_size, PROT_READ, + MAP_SHARED, s->mali_fd, + BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); + + if (s->csf_user_reg == MAP_FAILED) { + perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); + s->csf_user_reg = NULL; + return false; + } + return true; +} + +static bool +munmap_user_reg(struct state *s, struct test *t) +{ + if (s->csf_user_reg) + return munmap(s->csf_user_reg, s->page_size) == 0; + return true; +} + +static bool +init_mem_exec(struct state *s, struct test *t) +{ + struct kbase_ioctl_mem_exec_init init = { + .va_pages = 0x100000, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); + return false; + } + return true; +} + +static bool +init_mem_jit(struct state *s, struct test *t) +{ + struct kbase_ioctl_mem_jit_init init = { + .va_pages = 1 << 25, + .max_allocations = 255, + .phys_pages = 1 << 25, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_JIT_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); + return false; + } + return true; +} + +static bool +stream_create(struct state *s, struct test *t) +{ + struct kbase_ioctl_stream_create stream = { + .name = "stream" + }; + + s->tl_fd = ioctl(s->mali_fd, KBASE_IOCTL_STREAM_CREATE, &stream); + + if (s->tl_fd == -1) { + perror("ioctl(KBASE_IOCTL_STREAM_CREATE)"); + return false; + } + return true; + +} + +static bool +stream_destroy(struct state *s, struct test *t) +{ + if (s->tl_fd > 0) + return close(s->tl_fd) == 0; + return true; +} + +static bool +tiler_heap_create(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_tiler_heap_init init = { + .in = { + .chunk_size = 1 << 21, + .initial_chunks = 5, + .max_chunks = 200, + .target_in_flight = 65535, + } + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); + return false; + } + + s->tiler_heap_va = init.out.gpu_heap_va; + s->tiler_heap_header = init.out.first_chunk_va; + printf("heap va: %"PRIx64", heap header: %"PRIx64"\n", + s->tiler_heap_va, s->tiler_heap_header); + + return true; +} + +static bool +tiler_heap_term(struct state *s, struct test *t) +{ + if (!s->tiler_heap_va) + return true; + + struct kbase_ioctl_cs_tiler_heap_term term = { + .gpu_heap_va = s->tiler_heap_va + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); + return false; + } + return true; +} + +static bool +cs_group_create(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_queue_group_create_1_6 create = { + .in = { + /* Mali *still* only 
supports a single tiler unit */ + .tiler_mask = 1, + .fragment_mask = ~0ULL, + .compute_mask = ~0ULL, + + .cs_min = CS_QUEUE_COUNT, + + .priority = 1, + .tiler_max = 1, + .fragment_max = 64, + .compute_max = 64, + } + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); + return false; + } + + s->csg_handle = create.out.group_handle; + s->csg_uid = create.out.group_uid; + + if (pr) + printf("CSG handle: %i UID: %i: ", s->csg_handle, s->csg_uid); + + /* Should be at least 1 */ + if (!s->csg_uid) + abort(); + + return true; +} + +static bool +cs_group_term(struct state *s, struct test *t) +{ + if (!s->csg_uid) + return true; + + struct kbase_ioctl_cs_queue_group_term term = { + .group_handle = s->csg_handle + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); + return false; + } + return true; +} + +static bool +decode_init(struct state *s, struct test *t) +{ + pandecode_initialize(true); + return true; +} + +static bool +decode_close(struct state *s, struct test *t) +{ + pandecode_close(); + return true; +} + +static struct panfrost_ptr +alloc_ioctl(struct state *s, union kbase_ioctl_mem_alloc *a) +{ + struct panfrost_ptr p = {0}; + + uint64_t va_pages = a->in.va_pages; + uint64_t flags = a->in.flags; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_ALLOC, a); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); + return p; + } + + if ((flags & BASE_MEM_SAME_VA) && + (!(a->out.flags & BASE_MEM_SAME_VA) || + a->out.gpu_va != 0x41000)) { + + fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", + (uint64_t) a->out.flags, (uint64_t) a->out.gpu_va); + return p; + } + + void *ptr = mmap(NULL, s->page_size * va_pages, + PROT_READ | PROT_WRITE, MAP_SHARED, + s->mali_fd, a->out.gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU BO)"); + return p; + } + + uint64_t gpu_va = (a->out.flags & BASE_MEM_SAME_VA) ? 
+ (uintptr_t) ptr : a->out.gpu_va;
+
+ pandecode_inject_mmap(gpu_va, ptr, s->page_size * va_pages, NULL);
+
+ p.cpu = ptr;
+ p.gpu = gpu_va;
+
+ memset(p.cpu, 0, s->page_size * va_pages);
+
+ return p;
+}
+
+static struct panfrost_ptr
+alloc_mem(struct state *s, uint64_t size, uint64_t flags)
+{
+ unsigned pages = size / s->page_size;
+
+ union kbase_ioctl_mem_alloc a = {
+ .in = {
+ .va_pages = pages,
+ .commit_pages = pages,
+ .extension = 0,
+ .flags = flags,
+ }
+ };
+
+ return alloc_ioctl(s, &a);
+}
+
+static void
+alloc_redzone(struct state *s, struct panfrost_ptr p, uint64_t alloc_size)
+{
+ mmap(p.cpu - s->page_size, 1,
+ PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
+ -1, 0);
+
+ mmap(p.cpu + alloc_size, 1,
+ PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
+ -1, 0);
+}
+
+static bool
+alloc(struct state *s, struct test *t)
+{
+ struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset);
+
+ *ptr = alloc_mem(s, s->page_size, t->flags);
+
+ volatile int *p = (volatile int *) ptr->cpu;
+ *p = 0x12345;
+ if (*p != 0x12345) {
+ printf("Error reading from allocated memory at %p\n", p);
+ return false;
+ }
+ *p = 0;
+ cache_clean(p);
+
+ return true;
+}
+
+static bool
+dealloc(struct state *s, struct test *t)
+{
+ struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset);
+
+ if (ptr->cpu)
+ return munmap(ptr->cpu, s->page_size) == 0;
+ return true;
+}
+
+static bool
+cs_queue_create(struct state *s, struct test *t)
+{
+ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) {
+
+ /* Read/write from CPU/GPU, nothing special
+ * like coherency */
+ s->cs_mem[i] = alloc_mem(s, CS_QUEUE_SIZE, 0x200f);
+ s->cs[i].ptr = s->cs_mem[i].cpu;
+
+ if (!s->cs_mem[i].cpu)
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+cs_queue_free(struct state *s, struct test *t)
+{
+ bool pass = true;
+ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) {
+ if (s->cs_mem[i].cpu && munmap(s->cs_mem[i].cpu, CS_QUEUE_SIZE))
+ pass = false;
+ }
+ return pass;
+}
+
+static bool
+cs_queue_register(struct state *s, struct test *t)
+{
+ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) {
+ struct kbase_ioctl_cs_queue_register reg = {
+ .buffer_gpu_addr = s->cs_mem[i].gpu,
+ .buffer_size = CS_QUEUE_SIZE,
+ .priority = 1,
+ };
+
+ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_REGISTER, &reg);
+
+ if (ret == -1) {
+ perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)");
+ return false;
+ }
+
+ union kbase_ioctl_cs_queue_bind bind = {
+ .in = {
+ .buffer_gpu_addr = s->cs_mem[i].gpu,
+ .group_handle = s->csg_handle,
+ .csi_index = i,
+ }
+ };
+
+ ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind);
+
+ if (ret == -1) {
+ perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)");
+ }
+
+ s->cs_user_io[i] =
+ mmap(NULL,
+ s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ s->mali_fd, bind.out.mmap_handle);
+
+ if (s->cs_user_io[i] == MAP_FAILED) {
+ perror("mmap(CS USER IO)");
+ s->cs_user_io[i] = NULL;
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool
+cs_queue_term(struct state *s, struct test *t)
+{
+ bool pass = true;
+
+ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) {
+ if (s->cs_user_io[i] &&
+ munmap(s->cs_user_io[i],
+ s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES))
+ pass = false;
+
+ struct kbase_ioctl_cs_queue_terminate term = {
+ .buffer_gpu_addr = s->cs_mem[i].gpu,
+ };
+
+ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_TERMINATE,
+ &term);
+
+ if (ret == -1)
+ pass = false;
+ }
+ return pass;
+}
+
+#define CS_RING_DOORBELL(s, i) \
+ *((uint32_t *)(s->cs_user_io[i])) = 1
+
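+/* Note: the offsets used by these macros follow the kbase CSF user I/O
+ * mapping of BASEP_QUEUE_NR_MMAP_USER_PAGES pages per queue, as inferred
+ * from the code in this file: page 0 holds the doorbell word, page 1 is
+ * the input page written by the host (e.g. CS_INSERT), and page 2 is the
+ * output page updated by the GPU (e.g. CS_EXTRACT, CS_ACTIVE).
+ */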
+#define CS_READ_REGISTER(s, i, r) \ + *((uint64_t *)(s->cs_user_io[i] + s->page_size * 2 + r)) + +#define CS_WRITE_REGISTER(s, i, r, v) \ + *((uint64_t *)(s->cs_user_io[i] + s->page_size + r)) = v + +static void +submit_cs(struct state *s, unsigned i) +{ + uintptr_t p = (uintptr_t) s->cs[i].ptr; + unsigned pad = (-p) & 63; + memset(s->cs[i].ptr, 0, pad); + + unsigned last_offset = s->cs_last_submit[i]; + + unsigned insert_offset = p + pad - (uintptr_t) s->cs_mem[i].cpu; + insert_offset %= CS_QUEUE_SIZE; + + for (unsigned o = last_offset; o != insert_offset; + o = (o + 64) % CS_QUEUE_SIZE) + cache_clean(s->cs_mem[i].cpu + o); + + // TODO: Handle wraparound + // TODO: Provide a persistent buffer for pandecode to use? + if (pr) { + dump_start(stderr); + pandecode_cs(s->cs_mem[i].gpu + last_offset, + insert_offset - last_offset, s->gpu_id); + dump_end(stderr); + } + + cache_barrier(); + + CS_WRITE_REGISTER(s, i, CS_INSERT, insert_offset); + s->cs[i].ptr = s->cs_mem[i].cpu + insert_offset; + + memory_barrier(); + CS_RING_DOORBELL(s, i); + memory_barrier(); + + s->cs_last_submit[i] = insert_offset; +} + +/* Returns true if there was a timeout */ +static bool +wait_event(struct state *s, unsigned timeout_ms) +{ + struct pollfd fd = { + .fd = s->mali_fd, + .events = POLLIN, + }; + + int ret = poll(&fd, 1, timeout_ms); + + if (ret == -1) { + perror("poll(mali_fd)"); + return true; + } + + /* Timeout */ + if (ret == 0) + return true; + + struct base_csf_notification event; + ret = read(s->mali_fd, &event, sizeof(event)); + + if (ret == -1) { + perror("read(mali_fd)"); + return true; + } + + if (ret != sizeof(event)) { + fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", + ret, (int) sizeof(event)); + return false; + } + + switch (event.type) { + case BASE_CSF_NOTIFICATION_EVENT: + fprintf(stderr, "Notification event!\n"); + return false; + + case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: + break; + + case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: + fprintf(stderr, "No event from mali_fd!\n"); + return false; + + default: + fprintf(stderr, "Unknown event type!\n"); + return false; + } + + struct base_gpu_queue_group_error e = event.payload.csg_error.error; + + switch (e.error_type) { + case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue group error: status 0x%x " + "sideband 0x%"PRIx64"\n", + e.payload.fatal_group.status, + (uint64_t) e.payload.fatal_group.sideband); + break; + } + case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { + unsigned queue = e.payload.fatal_queue.csi_index; + + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue %i error: status 0x%x " + "sideband 0x%"PRIx64":", + queue, e.payload.fatal_queue.status, + (uint64_t) e.payload.fatal_queue.sideband); + + unsigned e = CS_READ_REGISTER(s, queue, CS_EXTRACT); + pandecode_cs(s->cs_mem[queue].gpu + e, 8, s->gpu_id); + + break; + } + + case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: + fprintf(stderr, "Command stream timeout!\n"); + break; + case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: + fprintf(stderr, "Command stream OOM!\n"); + break; + default: + fprintf(stderr, "Unknown error type!\n"); + } + + return false; +} + +static bool +kick_queue(struct state *s, unsigned i) +{ + struct kbase_ioctl_cs_queue_kick kick = { + .buffer_gpu_addr = s->cs_mem[i].gpu + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); + return false; + } + + return true; +} + 
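+/* Completion is detected by polling the CS_EXTRACT output register until
+ * it catches up with the CS_INSERT offset written by submit_cs(). While
+ * waiting, wait_event() reports any kbase notification (fatal group or
+ * queue errors, timeouts, tiler heap OOM); if the extract pointer stalls,
+ * the queue is kicked one more time in case the doorbell was missed.
+ */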
+static bool +wait_cs(struct state *s, unsigned i) +{ + unsigned extract_offset = (void *) s->cs[i].ptr - s->cs_mem[i].cpu; + + unsigned timeout_ms = 500; + + bool done_kick = false; + + while (CS_READ_REGISTER(s, i, CS_EXTRACT) != extract_offset) { + if (wait_event(s, timeout_ms)) { + if (pr) + fprintf(stderr, "Event wait timeout!\n"); + + unsigned e = CS_READ_REGISTER(s, i, CS_EXTRACT); + unsigned a = CS_READ_REGISTER(s, i, CS_ACTIVE); + + if (e != extract_offset) { + fprintf(stderr, "CS_EXTRACT (%i) != %i, " + "CS_ACTIVE (%i) on queue %i:", + e, extract_offset, a, i); + /* Decode two instructions instead? */ + pandecode_cs(s->cs_mem[i].gpu + e, 8, 1); + + if (done_kick) { + cache_barrier(); + return false; + } else { + fprintf(stderr, "Kicking queue\n"); + kick_queue(s, i); + done_kick = true; + } + } + } + } + + cache_barrier(); + + return true; +} + +static bool +cs_init(struct state *s, struct test *t) +{ + uint64_t event_init[] = { 1, 1, 1 }; + memcpy(s->allocations.event.cpu, event_init, sizeof(event_init)); + + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + CS_WRITE_REGISTER(s, i, CS_INSERT, 0); + pan_pack_ins(s->cs + i, CS_RESOURCES, cfg) { + switch (i) { + case 0: cfg.compute = true; break; + case 1: cfg.compute = true; cfg.fragment = true; break; + case 2: cfg.compute = true; cfg.tiler = true; cfg.idvs = true; break; + case 3: cfg.fragment = true; break; + } + } + pan_pack_ins(s->cs + i, CS_SLOT, cfg) { + cfg.index = 2; + } + pan_emit_cs_48(s->cs + i, CS_EVENT_REGISTER, + s->allocations.event.gpu); + submit_cs(s, i); + + if (!kick_queue(s, i)) + return false; + } + + return true; +} + +static struct panfrost_ptr * +buffers_elem(struct util_dynarray *buffers, unsigned index) +{ + unsigned size = util_dynarray_num_elements(buffers, + struct panfrost_ptr); + + if (index >= size) { + unsigned grow = index + 1 - size; + + memset(util_dynarray_grow(buffers, struct panfrost_ptr, grow), + 0, grow * sizeof(struct panfrost_ptr)); + } + + return util_dynarray_element(buffers, struct panfrost_ptr, index); +} + +static void +dump_hex64(FILE *fp, uint64_t *values, unsigned size) +{ + bool zero = false; + for (unsigned i = 0; i < size / 8; i += 2) { + uint64_t a = values[i]; + uint64_t b = values[i + 1]; + + if (!a && !b) { + if (!zero) + fprintf(fp, "%06X *\n", i * 8); + zero = true; + continue; + } + + zero = false; + + fprintf(fp, "%06X %16"PRIx64" %16"PRIx64"\n", + i * 8, a, b); + } + + fprintf(fp, "\n"); +} + +static void +dump_delta(FILE *fp, uint64_t *values, unsigned size) +{ + uint64_t old = 0; + bool zero = false; + bool el = false; + for (unsigned i = 0; i < size / 8; ++i) { + uint64_t val = values[i]; + int64_t delta = val - old; + + if (!zero || delta) { + fprintf(fp, "%"PRIi64"\n", delta); + el = false; + } else if (!el) { + fprintf(fp, "...\n"); + el = true; + } + + old = val; + zero = (delta == 0); + } +} + +static void +dump_tiler(FILE *fp, uint8_t *values, unsigned size) +{ + fflush(stdout); + FILE *stream = popen("tiler-hex-read", "w"); + // TODO! + fprintf(stream, "width %i\nheight %i\nmask %i\nvaheap %p\nsize %i\n", + 256, 256, 6, values, size); + pan_hexdump(stream, values, size, false); + pclose(stream); +} + +/* TODO: Pass in a filename? 
*/ +static void +dump_filehex(uint8_t *values, unsigned size) +{ + char buf[1024] = {0}; + + for (unsigned i = 0; i < 10000; ++i) { + snprintf(buf, 1024, "/tmp/fdump.%05i", i); + + int fd = open(buf, O_WRONLY | O_CREAT | O_EXCL, 0666); + if (fd == -1) + continue; + + FILE *fp = fdopen(fd, "w"); + + fprintf(fp, "%p, %u:\n", values, size); + pan_hexdump(fp, values, size, false); + + fclose(fp); /* will close fd */ + break; + } +} + +static void +dump_heatmap(FILE *fp, uint8_t *values, unsigned size, + unsigned gran, unsigned length, unsigned stride) +{ + unsigned sum = 0; + unsigned gr = 0; + unsigned st = 0; + unsigned ll = 0; + + while (size && !values[size - 1]) + --size; + + for (unsigned i = 0; i < size; ++i) { + sum += values[i]; + + if (++gr == gran) { + fprintf(fp, " %02x", sum & 0xff); + gr = 0; + sum = 0; + } + + if (++ll == length) { + i += stride - length; + fprintf(fp, "\n"); + st = 0; + ll = 0; + } else if (++st == stride) { + fprintf(fp, "\n"); + st = 0; + } + } + fprintf(fp, " %02x\n", sum & 0xff); +} + +static bool +cs_test(struct state *s, struct test *t) +{ + if (s->argc < 2) + return true; + + FILE *f = fopen(s->argv[1], "r"); + + struct util_dynarray buffers; + util_dynarray_init(&buffers, NULL); + + for (;;) { + char *line = NULL; + size_t sz = 0; + if (getline(&line, &sz, f) == -1) + break; + + unsigned long src, dst, offset, src_offset, size, iter, flags; + unsigned long gran, stride, length; + int read; + char *mode; + + if (sscanf(line, "rel%ms %lu+%lu %lu+%lu", + &mode, &dst, &offset, &src, &src_offset) == 5) { + + if (strcmp(mode, "oc") && strcmp(mode, "split")) { + fprintf(stderr, "Unknown relocation mode 'rel%s'\n", mode); + } + bool split = (mode[0] == 's'); + free(mode); + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + struct panfrost_ptr *d = buffers_elem(&buffers, dst); + + if (!s->gpu || !d->gpu) { + fprintf(stderr, "relocating to buffer that doesn't exist!\n"); + } + + uint64_t *dest = d->cpu + offset; + uint64_t value = s->gpu + src_offset; + if (split) { + dest[0] |= (uint32_t) value; + dest[1] |= (uint32_t) (value >> 32); + } else { + *dest |= value; + } + + } else if (sscanf(line, "buffer %lu %lu %lx %n", + &dst, &size, &flags, &read) == 3) { + line += read; + + struct panfrost_ptr buffer = + alloc_mem(s, ALIGN_POT(size, s->page_size), + flags); + + alloc_redzone(s, buffer, ALIGN_POT(size, s->page_size)); + + *buffers_elem(&buffers, dst) = buffer; + + //printf("buffer %lu == 0x%lx\n", dst, buffer.gpu); + + uint64_t *fill = buffer.cpu; + + for (unsigned i = 0; i < size / 8; ++i) { + read = 0; + unsigned long long val = 0; + if (sscanf(line, "%Lx %n", &val, &read) != 1) + break; + line += read; + fill[i] = val; + } + + cache_clean_range(buffer.cpu, size); + + } else if (sscanf(line, "exe %n %lu %lu %lu", + &read, &iter, &dst, &size) == 3) { + line += read; + + unsigned iter_mask = 0; + + for (;;) { + read = 0; + if (sscanf(line, "%lu %lu %lu %n", + &iter, &dst, &size, &read) != 3) + break; + line += read; + + struct panfrost_ptr *d = + buffers_elem(&buffers, dst); + + /* TODO: Check 'size' against buffer size */ + + pandecode_cs(d->gpu, size, s->gpu_id); + + if (iter > 3) { + fprintf(stderr, + "execute on out-of-bounds " + "iterator\n"); + continue; + } + + memcpy(s->cs[iter].ptr, d->cpu, size); + s->cs[iter].ptr += size / 8; + + iter_mask |= (1 << iter); + } + + u_foreach_bit(i, iter_mask) + submit_cs(s, i); + + u_foreach_bit(i, iter_mask) + kick_queue(s, i); + + u_foreach_bit(i, iter_mask) + wait_cs(s, i); + + } else if (sscanf(line, "dump 
%lu %lu %lu %ms", + &src, &offset, &size, &mode) == 4) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "dumping buffer that doesn't exist!\n"); + + cache_invalidate_range(s->cpu + offset, size); + + if (!strcmp(mode, "hex")) + pan_hexdump(stdout, s->cpu + offset, size, true); + else if (!strcmp(mode, "hex64")) + dump_hex64(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "delta")) + dump_delta(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "tiler")) + dump_tiler(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "filehex")) + dump_filehex(s->cpu + offset, size); + + free(mode); + + } else if (sscanf(line, "heatmap %lu %lu %lu %lu %lu %lu", + &src, &offset, &size, + &gran, &length, &stride) == 6) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "dumping buffer that doesn't exist!\n"); + + cache_invalidate_range(s->cpu + offset, size); + + dump_heatmap(stdout, s->cpu + offset, size, + gran, length, stride); + + } else if (sscanf(line, "memset %lu %lu %lu %lu", + &src, &offset, &gran, &size) == 4) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "memset on buffer that doesn't exist!\n"); + + memset(s->cpu + offset, gran, size); + cache_clean_range(s->cpu + offset, size); + + } else if (sscanf(line, "sleep %lu", &size) == 1) { + + usleep(size * 1000); + + } else if (strcmp(line, "td\n") == 0 || strcmp(line, "td") == 0) { + + void *ptr; + + ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, + s->tiler_heap_header); + pan_hexdump(stdout, ptr, 4096, false); + pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); + munmap(ptr, 1 << 21); + + ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, + s->tiler_heap_header + (1 << 21)); + pan_hexdump(stdout, ptr, 4096, false); + pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); + munmap(ptr, 1 << 21); + + } else { + fprintf(stderr, "unknown command '%s'\n", line); + } + } + + /* Skip following tests */ + return false; +} + +static void +pan_cs_evadd(pan_command_stream *c, unsigned offset, unsigned value) +{ + pan_emit_cs_32(c, 0x5e, value); + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = offset; + cfg.src = 0x5a; + cfg.dest = 0x5c; + } + pan_pack_ins(c, CS_EVADD, cfg) { + cfg.value = 0x5e; + cfg.addr = 0x5c; + } +} + +static bool +cs_simple(struct state *s, struct test *t) +{ + unsigned queue = t->vertex ? 2 : 0; + + pan_command_stream *c = s->cs + queue; + + unsigned dest = t->invalid ? 
0x65 : 0x48; + + pan_emit_cs_32(c, dest, 0x1234); + pan_cs_evadd(c, 0, 1); + + submit_cs(s, queue); + return wait_cs(s, queue); +} + +static bool +cs_store(struct state *s, struct test *t) +{ + pan_command_stream *c = s->cs; + + uint32_t *dest = s->allocations.ev2.cpu + 240; + mali_ptr dest_va = s->allocations.ev2.gpu + 240; + uint32_t value = 1234; + uint32_t add = 4320000; + + *dest = 0; + cache_clean(dest); + + unsigned addr_reg = 0x48; + unsigned value_reg = 0x4a; + + if (t->invalid) + dest_va = 0xfdcba9876543; + + pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = (1 << 1); } + pan_emit_cs_48(c, addr_reg, dest_va); + pan_emit_cs_32(c, value_reg, value); + + if (t->add) { + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = add; + cfg.src = value_reg; + cfg.dest = value_reg; + } + value += add; + } + + pan_pack_ins(c, CS_STR, cfg) { + cfg.addr = addr_reg; + cfg.register_base = value_reg; + cfg.register_mask = 1; + } + pan_cs_evadd(c, 0, 1); + + submit_cs(s, 0); + wait_cs(s, 0); + + cache_invalidate(dest); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = *dest; + + if (t->invalid && result == value) { + printf("Got %i, did not expect %i: ", result, value); + return false; + } else if (result != value) { + printf("Got %i, expected %i: ", result, value); + return false; + } + + return true; +} + +static void +emit_cs_call(pan_command_stream *c, mali_ptr va, void *start, void *end) +{ + cache_clean_range(start, end - start); + + pan_emit_cs_48(c, 0x48, va); + pan_emit_cs_32(c, 0x4a, end - start); + pan_pack_ins(c, CS_CALL, cfg) { + cfg.address = 0x48; + cfg.length = 0x4a; + } +} + +static bool +cs_sub(struct state *s, struct test *t) +{ + pan_command_stream *c = s->cs; + pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; + mali_ptr cs_va = s->allocations.cached.gpu; + + uint32_t *dest = s->allocations.normal.cpu; + mali_ptr dest_va = s->allocations.normal.gpu; + uint32_t value = 4321; + + *dest = 0; + cache_clean(dest); + + unsigned addr_reg = 0x48; + unsigned value_reg = 0x4a; + + void *start = i->ptr; + + pan_emit_cs_ins(c, 0x30, 0x5a0000000000); + + pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } + pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = (1 << 3); } + //pan_emit_cs_ins(i, 0x31, 0); + + pan_emit_cs_48(i, addr_reg, dest_va); + pan_emit_cs_32(i, value_reg, value); + //pan_emit_cs_ins(i, 0x25, 0x01484a00000005ULL); + pan_pack_ins(i, CS_STR, cfg) { + cfg.addr = addr_reg; + cfg.register_base = value_reg; + cfg.register_mask = 1; + } + //pan_emit_cs_ins(i, 0x09, 0); + //pan_emit_cs_ins(i, 0x31, 0x100000000); + + //pan_emit_cs_ins(i, 0x24, 0x024a0000f80211ULL); + + /* + pan_pack_ins(i, CS_STR_32, cfg) { + cfg.unk_1 = 1; + cfg.unk_2 = 4; + cfg.unk_3 = 1; + cfg.addr = addr_reg; + cfg.value = value_reg; + }*/ + + emit_cs_call(c, cs_va, start, i->ptr); + pan_cs_evadd(c, 0, 1); + + submit_cs(s, 0); + wait_cs(s, 0); + + cache_invalidate(dest); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = *dest; + + if (result != value) { + printf("Got %i, expected %i: ", result, value); + return false; + } + + return true; +} + +static mali_ptr +upload_shader(struct state *s, struct util_dynarray binary) +{ + assert(s->shader_alloc_offset + binary.size < s->page_size); + + mali_ptr va = s->allocations.exec.gpu + s->shader_alloc_offset; + + memcpy(s->allocations.exec.cpu, binary.data, binary.size); + + /* Shouldn't be needed, but just in case... 
*/ + cache_clean_range(s->allocations.exec.cpu, binary.size); + + s->shader_alloc_offset += binary.size; + + return va; +} + +static bool +compute_compile(struct state *s, struct test *t) +{ + nir_builder _b = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, + GENX(pan_shader_get_compiler_options)(), + "mem_store"), *b = &_b; + + nir_ssa_def *ptr = + nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0)); + + nir_ssa_def *value = nir_imm_int(b, 123); + + nir_store_global(b, ptr, 8, value, 1); + + struct panfrost_compile_inputs inputs = { + .gpu_id = s->gpu_id, + .no_ubo_to_push = true, + }; + + struct util_dynarray binary = {0}; + struct pan_shader_info shader_info = {0}; + + GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info); + + dump_start(stderr); + disassemble_valhall(stderr, binary.data, binary.size, true); + dump_end(stderr); + + s->compute_shader = upload_shader(s, binary); + + util_dynarray_fini(&binary); + ralloc_free(b->shader); + + return true; +} + +static struct panfrost_ptr +mem_offset(struct panfrost_ptr ptr, unsigned offset) +{ + ptr.cpu += offset; + ptr.gpu += offset; + return ptr; +} + +static bool +compute_execute(struct state *s, struct test *t) +{ + unsigned queue = t->blit ? 1 : 0; + + pan_command_stream *c = s->cs + queue; + pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; + mali_ptr cs_va = s->allocations.cached.gpu; + + struct panfrost_ptr dest = s->allocations.normal; + uint32_t value = 123; + + *(uint32_t *) dest.cpu = 0; + cache_clean(dest.cpu); + + struct panfrost_ptr fau = mem_offset(dest, 128); + *(uint64_t *) fau.cpu = dest.gpu; + cache_clean(fau.cpu); + + struct panfrost_ptr local_storage = mem_offset(dest, 192); + pan_pack(local_storage.cpu, LOCAL_STORAGE, _); + cache_clean(local_storage.cpu); + + struct panfrost_ptr shader_program = mem_offset(dest, 256); + pan_pack(shader_program.cpu, SHADER_PROGRAM, cfg) { + cfg.stage = MALI_SHADER_STAGE_COMPUTE; + cfg.primary_shader = true; + cfg.register_allocation = + MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; + cfg.binary = s->compute_shader; + } + cache_clean(shader_program.cpu); + + void *start = i->ptr; + + pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } + //pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = 1 << 3; } + + pan_pack_cs(i, COMPUTE_PAYLOAD, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; + + cfg.workgroup_count_x = 1; + cfg.workgroup_count_y = 1; + cfg.workgroup_count_z = 1; + + cfg.compute.shader = shader_program.gpu; + cfg.compute.thread_storage = local_storage.gpu; + + cfg.compute.fau = fau.gpu; + cfg.compute.fau_count = 1; + } + + pan_pack_ins(i, COMPUTE_LAUNCH, _); + + //pan_emit_cs_32(c, 0x54, 1); + //pan_emit_cs_ins(c, 0x24, 0x540000000233); + emit_cs_call(c, cs_va, start, i->ptr); + + pan_emit_cs_32(c, 0x4a, 0); + pan_emit_cs_ins(c, 0x24, 0x024a0000000211ULL); + + pan_emit_cs_48(c, 0x48, dest.gpu); + pan_pack_ins(c, CS_LDR, cfg) { + cfg.offset = 0; + cfg.register_mask = 1; + cfg.addr = 0x48; + cfg.register_base = 0x20; + } + pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1; } + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = 1; + cfg.src = 0x20; + cfg.dest = 0x20; + } + pan_pack_ins(c, CS_STR, cfg) { + cfg.offset = 64; + cfg.register_mask = 1; + cfg.addr = 0x48; + cfg.register_base = 0x20; + } + + pan_cs_evadd(c, 0, 1); + + submit_cs(s, queue); + wait_cs(s, queue); + + cache_invalidate(dest.cpu); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = ((uint32_t *)dest.cpu)[0]; + uint32_t result2 = 
((uint32_t *)dest.cpu)[16];
+
+ if (result != value) {
+ printf("Got %i, %i, expected %i: ", result, result2, value);
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+mmu_dump(struct state *s, struct test *t)
+{
+ unsigned size = 1024 * 1024;
+
+ void *mem = mmap(NULL, size, PROT_READ, MAP_SHARED,
+ s->mali_fd, BASE_MEM_MMU_DUMP_HANDLE);
+ if (mem == MAP_FAILED) {
+ perror("mmap(BASE_MEM_MMU_DUMP_HANDLE)");
+ return false;
+ }
+
+ pan_hexdump(stdout, mem, size, true);
+
+ return true;
+}
+
+#define SUBTEST(s) { .label = #s, .subtests = s, .sub_length = ARRAY_SIZE(s) }
+
+#define STATE(item) .offset = offsetof(struct state, item)
+
+#define ALLOC(item) .offset = offsetof(struct state, allocations.item)
+#define ALLOC_TEST(label, item, f) { alloc, dealloc, label, ALLOC(item), .flags = f }
+
+struct test kbase_main[] = {
+ { open_kbase, close_kbase, "Open kbase device" },
+ { get_version, NULL, "Check version" },
+ { set_flags, NULL, "Set flags" },
+ { mmap_tracking, munmap_tracking, "Map tracking handle" },
+ { get_gpuprops, free_gpuprops, "Get GPU properties" },
+ { get_gpu_id, NULL, "GPU ID" },
+ { get_coherency_mode, NULL, "Coherency mode" },
+ { get_csf_caps, NULL, "CSF caps" },
+ { mmap_user_reg, munmap_user_reg, "Map user register page" },
+ { init_mem_exec, NULL, "Initialise EXEC_VA zone" },
+ { init_mem_jit, NULL, "Initialise JIT allocator" },
+ { stream_create, stream_destroy, "Create synchronisation stream" },
+ { tiler_heap_create, tiler_heap_term, "Create chunked tiler heap" },
+ { cs_group_create, cs_group_term, "Create command stream group" },
+ { decode_init, decode_close, "Initialise pandecode" },
+
+ /* Flags are named in mali_base_csf_kernel.h, omitted for brevity */
+ ALLOC_TEST("Allocate normal memory", normal, 0x200f),
+ ALLOC_TEST("Allocate executable memory", exec, 0x2017),
+ ALLOC_TEST("Allocate coherent memory", coherent, 0x280f),
+ ALLOC_TEST("Allocate cached memory", cached, 0x380f),
+ ALLOC_TEST("Allocate CSF event memory", event, 0x8200f),
+ ALLOC_TEST("Allocate CSF event memory 2", ev2, 0x8200f),
+
+ /* These three tests are run for every queue, but later ones are not */
+ { cs_queue_create, cs_queue_free, "Create command stream queues" },
+ { cs_queue_register, cs_queue_term, "Register command stream queues" },
+
+ { cs_test, NULL, "Test command stream" },
+
+ { cs_init, NULL, "Initialise and start command stream queues" },
+ { cs_simple, NULL, "Execute MOV command" },
+ { cs_simple, NULL, "Execute MOV command (again)" },
+ { cs_simple, NULL, "Execute MOV command (vertex)", .vertex = true },
+ //{ cs_simple, NULL, "Execute MOV command (vertex, invalid)", .invalid = true, .vertex = true },
+ { cs_simple, NULL, "Execute MOV command (vertex, again)", .vertex = true },
+ { cs_store, NULL, "Execute STR command" },
+ //{ cs_store, NULL, "Execute STR command to invalid address", .invalid = true },
+ { cs_store, NULL, "Execute ADD command", .add = true },
+ { cs_sub, NULL, "Execute STR on iterator" },
+
+ { compute_compile, NULL, "Compile a compute shader" },
+ { compute_execute, NULL, "Execute a compute shader" },
+ { compute_execute, NULL, "Execute compute on blit queue", .blit = true },
+
+ //{ mmu_dump, NULL, "Dump MMU pagetables" },
+};
+
+static void
+do_test_list(struct state *s, struct test *tests, unsigned length);
+
+static void
+cleanup_test_list(struct state *s, struct test *tests, unsigned length)
+{
+ for (unsigned i = length; i > 0; --i) {
+ unsigned n = i - 1;
+
+ struct test *t = &tests[n];
+ if (!t->cleanup)
+ continue;
+
+ if (pr)
+
printf("[CLEANUP %i] %s: ", n, t->label); + if (t->cleanup(s, t)) { + if (pr) + printf("PASS\n"); + } else { + if (pr) + printf("FAIL\n"); + } + } +} + +static unsigned +interpret_test_list(struct state *s, struct test *tests, unsigned length) +{ + for (unsigned i = 0; i < length; ++i) { + struct test *t = &tests[i]; + + if (pr) + printf("[TEST %i] %s: ", i, t->label); + if (t->part) { + if (t->part(s, t)) { + if (pr) + printf("PASS\n"); + } else { + if (pr) + printf("FAIL\n"); + if (!getenv("TEST_KEEP_GOING")) + return i + 1; + } + } + if (t->subtests) + do_test_list(s, t->subtests, t->sub_length); + } + + return length; +} + +static void +do_test_list(struct state *s, struct test *tests, unsigned length) +{ + unsigned ran = interpret_test_list(s, tests, length); + cleanup_test_list(s, tests, ran); +} + +int +main(int argc, char *argv[]) +{ + struct state s = { + .page_size = sysconf(_SC_PAGE_SIZE), + .argc = argc, + .argv = argv, + }; + + if (getenv("CSF_QUIET")) + pr = false; + + if (!strcmp(getenv("TERM"), "dumb")) + colour_term = false; + + if (pr) + printf("Running Valhall CSF tests\n"); + + do_test_list(&s, kbase_main, ARRAY_SIZE(kbase_main)); +} diff --git a/src/panfrost/lib/wrap.h b/src/panfrost/lib/wrap.h index c1e61332203..d708d628d36 100644 --- a/src/panfrost/lib/wrap.h +++ b/src/panfrost/lib/wrap.h @@ -1,4 +1,3 @@ - /* * Copyright (C) 2017-2019 Lyude Paul * Copyright (C) 2017-2019 Alyssa Rosenzweig @@ -50,6 +49,8 @@ struct pandecode_context *pandecode_create_context(bool to_stderr); void pandecode_next_frame(struct pandecode_context *ctx); +void pandecode_dump_file_close(void); + void pandecode_destroy_context(struct pandecode_context *ctx); void pandecode_inject_mmap(struct pandecode_context *ctx, uint64_t gpu_va, @@ -64,6 +65,10 @@ void pandecode_jc(struct pandecode_context *ctx, uint64_t jc_gpu_va, void pandecode_cs(struct pandecode_context *ctx, mali_ptr queue_gpu_va, uint32_t size, unsigned gpu_id, uint32_t *regs); +void pandecode_cs(uint64_t cs_gpu_va, unsigned cs_size, unsigned gpu_id); + +void pandecode_dump_mappings(void); + void pandecode_abort_on_fault(struct pandecode_context *ctx, uint64_t jc_gpu_va, unsigned gpu_id); diff --git a/src/panfrost/meson.build b/src/panfrost/meson.build index aa393d44fe5..43860d4ee2a 100644 --- a/src/panfrost/meson.build +++ b/src/panfrost/meson.build @@ -20,7 +20,7 @@ # SOFTWARE. 
 inc_panfrost_hw = include_directories([
- 'include'
+ 'include', 'base'
 ])
 inc_panfrost = include_directories([
@@ -35,6 +35,7 @@ subdir('shared')
 subdir('util')
 subdir('midgard')
 subdir('compiler')
+subdir('base')
 if with_gallium_panfrost or with_panfrost_vk
 subdir('lib')
@@ -70,6 +71,46 @@ bifrost_compiler = executable(
 build_by_default : with_tools.contains('panfrost')
 )
+csf_test = executable(
+ 'csf_test',
+ ['csf_test/test.c'],
+ include_directories : [
+ inc_mapi,
+ inc_mesa,
+ inc_gallium,
+ inc_gallium_aux,
+ inc_include,
+ inc_src,
+ inc_panfrost,
+ inc_panfrost_hw,
+ ],
+ dependencies : [
+ idep_nir,
+ idep_mesautil,
+ idep_bi_opcodes_h,
+ dep_libdrm,
+ libpanfrost_dep,
+ ],
+ build_by_default : true
+)
+
+custom_target(
+ 'panfrost_panloader',
+ output: ['panfrost_panloader.txt'],
+ depends : [
+ libpanfrost_lib,
+ libpanfrost_util,
+ _libmesa_util,
+ libpanfrost_decode,
+ libpanfrost_decode_per_arch,
+ libpanfrost_midgard_disasm,
+ libpanfrost_bifrost_disasm,
+ libpanfrost_valhall_disasm,
+ ],
+ command: ['touch', '@OUTPUT@'],
+ build_by_default : false,
+)
+
 if with_panfrost_vk
 subdir('vulkan')
 endif
diff --git a/src/panfrost/midgard/disassemble.c b/src/panfrost/midgard/disassemble.c
index 4a2cab60d92..d4d2b59c2bb 100644
--- a/src/panfrost/midgard/disassemble.c
+++ b/src/panfrost/midgard/disassemble.c
@@ -1254,8 +1254,9 @@ print_alu_word(disassemble_context *ctx, FILE *fp, uint32_t *words,
 UNUSED static void
 print_varying_parameters(FILE *fp, midgard_load_store_word *word)
 {
- midgard_varying_params p = midgard_unpack_varying_params(*word);
-
+ unsigned params = word->signed_offset & 0x1FF;
+ midgard_varying_params p;
+ memcpy(&p, &params, sizeof(p));
 /* If a varying, there are qualifiers */
 if (p.flat_shading)
 fprintf(fp, ".flat");