From 8bf5c3113bbc15ac5790f551791e04dd0c9e1f52 Mon Sep 17 00:00:00 2001 From: SolDev69 Date: Thu, 21 Dec 2023 18:15:08 -0500 Subject: [PATCH] more panfrost patches --- .../base/include/csf/mali_base_csf_kernel.h | 596 ++++++ .../base/include/csf/mali_gpu_csf_registers.h | 43 + .../base/include/csf/mali_kbase_csf_ioctl.h | 530 +++++ .../base/include/jm/mali_base_jm_kernel.h | 1051 +++++++++ .../base/include/jm/mali_kbase_jm_ioctl.h | 231 ++ .../base/include/mali_base_common_kernel.h | 231 ++ src/panfrost/base/include/mali_base_kernel.h | 700 ++++++ .../base/include/mali_kbase_gpuprops.h | 127 ++ src/panfrost/base/include/mali_kbase_ioctl.h | 759 +++++++ .../base/include/old/mali-ioctl-midgard.h | 80 + src/panfrost/base/include/old/mali-ioctl.h | 743 +++++++ src/panfrost/base/include/old/mali-props.h | 262 +++ src/panfrost/base/meson.build | 55 + src/panfrost/base/pan_base.c | 301 +++ src/panfrost/base/pan_base.h | 234 ++ src/panfrost/base/pan_base_noop.h | 152 ++ src/panfrost/base/pan_cache.h | 95 + src/panfrost/base/pan_vX_base.c | 1825 ++++++++++++++++ src/panfrost/csf_test/interpret.py | 1820 ++++++++++++++++ src/panfrost/csf_test/mali_base_csf_kernel.h | 721 +++++++ src/panfrost/csf_test/mali_base_kernel.h | 746 +++++++ .../csf_test/mali_gpu_csf_registers.h | 43 + src/panfrost/csf_test/mali_kbase_csf_ioctl.h | 483 +++++ src/panfrost/csf_test/mali_kbase_ioctl.h | 854 ++++++++ src/panfrost/csf_test/test.c | 1903 +++++++++++++++++ src/panfrost/lib/wrap.h | 7 +- src/panfrost/meson.build | 43 +- src/panfrost/midgard/disassemble.c | 5 +- 28 files changed, 14636 insertions(+), 4 deletions(-) create mode 100644 src/panfrost/base/include/csf/mali_base_csf_kernel.h create mode 100644 src/panfrost/base/include/csf/mali_gpu_csf_registers.h create mode 100644 src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h create mode 100644 src/panfrost/base/include/jm/mali_base_jm_kernel.h create mode 100644 src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h create mode 100644 src/panfrost/base/include/mali_base_common_kernel.h create mode 100644 src/panfrost/base/include/mali_base_kernel.h create mode 100644 src/panfrost/base/include/mali_kbase_gpuprops.h create mode 100644 src/panfrost/base/include/mali_kbase_ioctl.h create mode 100644 src/panfrost/base/include/old/mali-ioctl-midgard.h create mode 100644 src/panfrost/base/include/old/mali-ioctl.h create mode 100644 src/panfrost/base/include/old/mali-props.h create mode 100644 src/panfrost/base/meson.build create mode 100644 src/panfrost/base/pan_base.c create mode 100644 src/panfrost/base/pan_base.h create mode 100644 src/panfrost/base/pan_base_noop.h create mode 100644 src/panfrost/base/pan_cache.h create mode 100644 src/panfrost/base/pan_vX_base.c create mode 100644 src/panfrost/csf_test/interpret.py create mode 100644 src/panfrost/csf_test/mali_base_csf_kernel.h create mode 100644 src/panfrost/csf_test/mali_base_kernel.h create mode 100644 src/panfrost/csf_test/mali_gpu_csf_registers.h create mode 100644 src/panfrost/csf_test/mali_kbase_csf_ioctl.h create mode 100644 src/panfrost/csf_test/mali_kbase_ioctl.h create mode 100644 src/panfrost/csf_test/test.c diff --git a/src/panfrost/base/include/csf/mali_base_csf_kernel.h b/src/panfrost/base/include/csf/mali_base_csf_kernel.h new file mode 100644 index 00000000000..3b02350c08b --- /dev/null +++ b/src/panfrost/base/include/csf/mali_base_csf_kernel.h @@ -0,0 +1,596 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. 
+ * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_CSF_KERNEL_H_ +#define _UAPI_BASE_CSF_KERNEL_H_ + +#include +#include "../mali_base_common_kernel.h" + +/* Memory allocation, access/hint flags & mask specific to CSF GPU. + * + * See base_mem_alloc_flags. + */ + +/* Must be FIXED memory. */ +#define BASE_MEM_FIXED ((base_mem_alloc_flags)1 << 8) + +/* CSF event memory + * + * If Outer shareable coherence is not specified or not available, then on + * allocation kbase will automatically use the uncached GPU mapping. + * There is no need for the client to specify BASE_MEM_UNCACHED_GPU + * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. + * + * This memory requires a permanent mapping + * + * See also kbase_reg_needs_kernel_mapping() + */ +#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) + +#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) + + +/* Must be FIXABLE memory: its GPU VA will be determined at a later point, + * at which time it will be at a fixed GPU VA. + */ +#define BASE_MEM_FIXABLE ((base_mem_alloc_flags)1 << 29) + +/* Note that the number of bits used for base_mem_alloc_flags + * must be less than BASE_MEM_FLAGS_NR_BITS !!! + */ + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED BASE_MEM_RESERVED_BIT_20 + +/* Special base mem handles specific to CSF. + */ +#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) + +#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ + ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ + LOCAL_PAGE_SHIFT) + +/* Valid set of just-in-time memory allocation flags */ +#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) + +/* flags for base context specific to CSF */ + +/* Base context creates a CSF event notification thread. + * + * The creation of a CSF event notification thread is conditional but + * mandatory for the handling of CSF events. 
+ */ +#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | \ + BASE_CONTEXT_CSF_EVENT_THREAD | \ + BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* Flags for base tracepoint specific to CSF */ + +/* Enable KBase tracepoints for CSF builds */ +#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) + +/* Enable additional CSF Firmware side tracepoints */ +#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) + +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ + BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ + BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) + +/* Number of pages mapped into the process address space for a bound GPU + * command queue. A pair of input/output pages and a Hw doorbell page + * are mapped to enable direct submission of commands to Hw. + */ +#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) + +#define BASE_QUEUE_MAX_PRIORITY (15U) + +/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ +#define BASEP_EVENT_VAL_INDEX (0U) +#define BASEP_EVENT_ERR_INDEX (1U) + +/* The upper limit for number of objects that could be waited/set per command. + * This limit is now enforced as internally the error inherit inputs are + * converted to 32-bit flags in a __u32 variable occupying a previously padding + * field. + */ +#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) + +/* CSF CSI EXCEPTION_HANDLER_FLAGS */ +#define BASE_CSF_TILER_OOM_EXCEPTION_FLAG (1u << 0) +#define BASE_CSF_EXCEPTION_HANDLER_FLAGS_MASK (BASE_CSF_TILER_OOM_EXCEPTION_FLAG) + +/** + * enum base_kcpu_command_type - Kernel CPU queue command type. + * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, + * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, + * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, + * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, + * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, + * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, + * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, + */ +enum base_kcpu_command_type { + BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_SET, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, + BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, + BASE_KCPU_COMMAND_TYPE_JIT_FREE, + BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER +}; + +/** + * enum base_queue_group_priority - Priority of a GPU Command Queue Group. + * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time + * priority. 
+ * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group + * priority levels. + * + * Currently this is in order of highest to lowest, but if new levels are added + * then those new levels may be out of order to preserve the ABI compatibility + * with previous releases. At that point, ensure assignment to + * the 'priority' member in &kbase_queue_group is updated to ensure it remains + * a linear ordering. + * + * There should be no gaps in the enum, otherwise use of + * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. + */ +enum base_queue_group_priority { + BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, + BASE_QUEUE_GROUP_PRIORITY_MEDIUM, + BASE_QUEUE_GROUP_PRIORITY_LOW, + BASE_QUEUE_GROUP_PRIORITY_REALTIME, + BASE_QUEUE_GROUP_PRIORITY_COUNT +}; + +struct base_kcpu_command_fence_info { + __u64 fence; +}; + +struct base_cqs_wait_info { + __u64 addr; + __u32 val; + __u32 padding; +}; + +struct base_kcpu_command_cqs_wait_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +struct base_cqs_set { + __u64 addr; +}; + +struct base_kcpu_command_cqs_set_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * typedef basep_cqs_data_type - Enumeration of CQS Data Types + * + * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value + * is an unsigned 32-bit integer + * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value + * is an unsigned 64-bit integer + */ +typedef enum PACKED { + BASEP_CQS_DATA_TYPE_U32 = 0, + BASEP_CQS_DATA_TYPE_U64 = 1, +} basep_cqs_data_type; + +/** + * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait + * Operation conditions + * + * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Less than or Equal to + * the Wait Operation value + * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Greater than the Wait Operation value + */ +typedef enum { + BASEP_CQS_WAIT_OPERATION_LE = 0, + BASEP_CQS_WAIT_OPERATION_GT = 1, +} basep_cqs_wait_operation_op; + +struct base_cqs_wait_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information + * about the Timeline CQS wait objects + * + * @objs: An array of Timeline CQS waits. + * @nr_objs: Number of Timeline CQS waits in the array. + * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field + * to be served as the source for importing into the + * queue's error-state. + */ +struct base_kcpu_command_cqs_wait_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +/** + * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations + * + * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value + * to a synchronization object + * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value + * of a synchronization object + */ +typedef enum { + BASEP_CQS_SET_OPERATION_ADD = 0, + BASEP_CQS_SET_OPERATION_SET = 1, +} basep_cqs_set_operation_op; + +struct base_cqs_set_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_set_operation_info - structure which contains information + * about the Timeline CQS set objects + * + * @objs: An array of Timeline CQS sets. 
+ * @nr_objs: Number of Timeline CQS sets in the array. + * @padding: Structure padding, unused bytes. + */ +struct base_kcpu_command_cqs_set_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * struct base_kcpu_command_import_info - structure which contains information + * about the imported buffer. + * + * @handle: Address of imported user buffer. + */ +struct base_kcpu_command_import_info { + __u64 handle; +}; + +/** + * struct base_kcpu_command_jit_alloc_info - structure which contains + * information about jit memory allocation. + * + * @info: An array of elements of the + * struct base_jit_alloc_info type. + * @count: The number of elements in the info array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_alloc_info { + __u64 info; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_jit_free_info - structure which contains + * information about jit memory which is to be freed. + * + * @ids: An array containing the JIT IDs to free. + * @count: The number of elements in the ids array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_free_info { + __u64 ids; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_group_suspend_info - structure which contains + * suspend buffer data captured for a suspended queue group. + * + * @buffer: Pointer to an array of elements of the type char. + * @size: Number of elements in the @buffer array. + * @group_handle: Handle to the mapping of CSG. + * @padding: padding to a multiple of 64 bits. + */ +struct base_kcpu_command_group_suspend_info { + __u64 buffer; + __u32 size; + __u8 group_handle; + __u8 padding[3]; +}; + + +/** + * struct base_kcpu_command - kcpu command. + * @type: type of the kcpu command, one enum base_kcpu_command_type + * @padding: padding to a multiple of 64 bits + * @info: structure which contains information about the kcpu command; + * actual type is determined by @p type + * @info.fence: Fence + * @info.cqs_wait: CQS wait + * @info.cqs_set: CQS set + * @info.cqs_wait_operation: CQS wait operation + * @info.cqs_set_operation: CQS set operation + * @info.import: import + * @info.jit_alloc: JIT allocation + * @info.jit_free: JIT deallocation + * @info.suspend_buf_copy: suspend buffer copy + * @info.sample_time: sample time + * @info.padding: padding + */ +struct base_kcpu_command { + __u8 type; + __u8 padding[sizeof(__u64) - sizeof(__u8)]; + union { + struct base_kcpu_command_fence_info fence; + struct base_kcpu_command_cqs_wait_info cqs_wait; + struct base_kcpu_command_cqs_set_info cqs_set; + struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; + struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; + struct base_kcpu_command_import_info import; + struct base_kcpu_command_jit_alloc_info jit_alloc; + struct base_kcpu_command_jit_free_info jit_free; + struct base_kcpu_command_group_suspend_info suspend_buf_copy; + __u64 padding[2]; /* No sub-struct should be larger */ + } info; +}; + +/** + * struct basep_cs_stream_control - CSI capabilities. + * + * @features: Features of this stream + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_stream_control { + __u32 features; + __u32 padding; +}; + +/** + * struct basep_cs_group_control - CSG interface capabilities. 
+ * + * @features: Features of this group + * @stream_num: Number of streams in this group + * @suspend_size: Size in bytes of the suspend buffer for this group + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_group_control { + __u32 features; + __u32 stream_num; + __u32 suspend_size; + __u32 padding; +}; + +/** + * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault + * error information associated with GPU command queue group. + * + * @sideband: Additional information of the unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_group_error_fatal_payload { + __u64 sideband; + __u32 status; + __u32 padding; +}; + +/** + * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault + * error information related to GPU command queue. + * + * @sideband: Additional information about this unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). + * @csi_index: Index of the CSF interface the queue is bound to. + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_error_fatal_payload { + __u64 sideband; + __u32 status; + __u8 csi_index; + __u8 padding[3]; +}; + +/** + * enum base_gpu_queue_group_error_type - GPU Fatal error type. + * + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU + * command queue group. + * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU + * command queue. + * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with + * progress timeout. + * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out + * of tiler heap memory. + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types + * + * This type is used for &struct_base_gpu_queue_group_error.error_type. + */ +enum base_gpu_queue_group_error_type { + BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, + BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, + BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT +}; + +/** + * struct base_gpu_queue_group_error - Unrecoverable fault information + * @error_type: Error type of @base_gpu_queue_group_error_type + * indicating which field in union payload is filled + * @padding: Unused bytes for 64bit boundary + * @payload: Input Payload + * @payload.fatal_group: Unrecoverable fault error associated with + * GPU command queue group + * @payload.fatal_queue: Unrecoverable fault error associated with command queue + */ +struct base_gpu_queue_group_error { + __u8 error_type; + __u8 padding[7]; + union { + struct base_gpu_queue_group_error_fatal_payload fatal_group; + struct base_gpu_queue_error_fatal_payload fatal_queue; + } payload; +}; + +/** + * enum base_csf_notification_type - Notification type + * + * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event + * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal + * error + * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu + * queue + * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type + * + * This type is used for &struct_base_csf_notification.type. 
+ */ +enum base_csf_notification_type { + BASE_CSF_NOTIFICATION_EVENT = 0, + BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, + BASE_CSF_NOTIFICATION_COUNT +}; + +/** + * struct base_csf_notification - Event or error notification + * + * @type: Notification type of @base_csf_notification_type + * @padding: Padding for 64bit boundary + * @payload: Input Payload + * @payload.align: To fit the struct into a 64-byte cache line + * @payload.csg_error: CSG error + * @payload.csg_error.handle: Handle of GPU command queue group associated with + * fatal error + * @payload.csg_error.padding: Padding + * @payload.csg_error.error: Unrecoverable fault error + * + */ +struct base_csf_notification { + __u8 type; + __u8 padding[7]; + union { + struct { + __u8 handle; + __u8 padding[7]; + struct base_gpu_queue_group_error error; + } csg_error; + + __u8 align[56]; + } payload; +}; + +/** + * struct mali_base_gpu_core_props - GPU core props info + * + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to align to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + */ +struct mali_base_gpu_core_props { + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; +}; + +#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ diff --git a/src/panfrost/base/include/csf/mali_gpu_csf_registers.h b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h new file mode 100644 index 00000000000..17e338cb238 --- /dev/null +++ b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * This header was originally autogenerated, but it is now ok (and + * expected) to have to add to it. + */ + +#ifndef _UAPI_GPU_CSF_REGISTERS_H_ +#define _UAPI_GPU_CSF_REGISTERS_H_ + +/* Only user block defines are included. HI words have been removed */ + +/* CS_USER_INPUT_BLOCK register offsets */ +#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ +#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ + +/* CS_USER_OUTPUT_BLOCK register offsets */ +#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ +#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ + +/* USER register offsets */ +#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ + +#endif diff --git a/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h new file mode 100644 index 00000000000..db7252605f0 --- /dev/null +++ b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h @@ -0,0 +1,530 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_CSF_IOCTL_H_ +#define _UAPI_KBASE_CSF_IOCTL_H_ + +#include +#include + +/* + * 1.0: + * - CSF IOCTL header separated from JM + * 1.1: + * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 1.2: + * - Add new CSF GPU_FEATURES register into the property structure + * returned by KBASE_IOCTL_GET_GPUPROPS + * 1.3: + * - Add __u32 group_uid member to + * &struct_kbase_ioctl_cs_queue_group_create.out + * 1.4: + * - Replace padding in kbase_ioctl_cs_get_glb_iface with + * instr_features member of same size + * 1.5: + * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new + * queue registration call with extended format for supporting CS + * trace configurations with CSF trace_command. + * 1.6: + * - Added new HW performance counters interface to all GPUs. + * 1.7: + * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use + * 1.8: + * - Removed Kernel legacy HWC interface + * 1.9: + * - Reorganization of GPU-VA memory zones, including addition of + * FIXED_VA zone and auto-initialization of EXEC_VA zone. 
+ * - Added new Base memory allocation interface + * 1.10: + * - First release of new HW performance counters interface. + * 1.11: + * - Dummy model (no mali) backend will now clear HWC values after each sample + * 1.12: + * - Added support for incremental rendering flag in CSG create call + */ + +#define BASE_UK_VERSION_MAJOR 1 +#define BASE_UK_VERSION_MINOR 12 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + +/** + * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the + * base back-end + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * + * Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. + * Any change of this struct should also be mirrored to the latter. + */ +struct kbase_ioctl_cs_queue_register { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER \ + _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) + +/** + * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler + * to notify that a queue has been updated + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_kick { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_KICK \ + _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) + +/** + * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group + * + * @in: Input parameters + * @in.buffer_gpu_addr: GPU address of the buffer backing the queue + * @in.group_handle: Handle of the group to which the queue should be bound + * @in.csi_index: Index of the CSF interface the queue should be bound to + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.mmap_handle: Handle to be used for creating the mapping of CS + * input/output pages + */ +union kbase_ioctl_cs_queue_bind { + struct { + __u64 buffer_gpu_addr; + __u8 group_handle; + __u8 csi_index; + __u8 padding[6]; + } in; + struct { + __u64 mmap_handle; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_BIND \ + _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) + +/** + * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the + * base back-end in extended format, + * involving trace buffer configuration + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * @ex_offset_var_addr: GPU address of the trace buffer write offset variable + * @ex_buffer_base: Trace buffer GPU base address for the queue + * @ex_buffer_size: Size of the trace buffer in bytes + * @ex_event_size: Trace event write size, in log2 designation + * @ex_event_state: Trace event states configuration + * @ex_padding: Currently unused, must be zero + * + * Note: There is an identical sub-section at the start of this struct to that + * of @ref kbase_ioctl_cs_queue_register. 
Any change of this sub-section + * must also be mirrored to the latter. Following the said sub-section, + * the remaining fields forms the extension, marked with ex_*. + */ +struct kbase_ioctl_cs_queue_register_ex { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; + __u64 ex_offset_var_addr; + __u64 ex_buffer_base; + __u32 ex_buffer_size; + __u8 ex_event_size; + __u8 ex_event_state; + __u8 ex_padding[2]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ + _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) + +/** + * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_terminate { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) + +/** + * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue + * group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. + */ +union kbase_ioctl_cs_queue_group_create_1_6 { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ + _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) + +/** + * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.csi_handlers: Flags to signal that the application intends to use CSI + * exception handlers in some linear buffers to deal with + * the given exception types. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. 
+ */ +union kbase_ioctl_cs_queue_group_create { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 csi_handlers; + __u8 padding[2]; + /** + * @in.reserved: Reserved + */ + __u64 reserved; + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ + _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) + +/** + * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group + * + * @group_handle: Handle of the queue group to be terminated + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_cs_queue_group_term { + __u8 group_handle; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) + +#define KBASE_IOCTL_CS_EVENT_SIGNAL \ + _IO(KBASE_IOCTL_TYPE, 44) + +typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ + +/** + * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue + * + * @id: ID of the new command queue returned by the kernel + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_new { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ + _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) + +/** + * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue + * + * @id: ID of the command queue to be destroyed + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_delete { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ + _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) + +/** + * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue + * + * @addr: Memory address of an array of struct base_kcpu_queue_command + * @nr_commands: Number of commands in the array + * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_enqueue { + __u64 addr; + __u32 nr_commands; + base_kcpu_queue_id id; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ + _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) + +/** + * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap + * @in: Input parameters + * @in.chunk_size: Size of each chunk. + * @in.initial_chunks: Initial number of chunks that heap will be created with. + * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. + * @in.target_in_flight: Number of render-passes that the driver should attempt to + * keep in flight for which allocation of new chunks is + * allowed. + * @in.group_id: Group ID to be used for physical allocations. + * @in.padding: Padding + * @out: Output parameters + * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up + * for the heap. + * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, + * actually points to the header of heap chunk and not to + * the low address of free memory in the chunk. 
+ */ +union kbase_ioctl_cs_tiler_heap_init { + struct { + __u32 chunk_size; + __u32 initial_chunks; + __u32 max_chunks; + __u16 target_in_flight; + __u8 group_id; + __u8 padding; + } in; + struct { + __u64 gpu_heap_va; + __u64 first_chunk_va; + } out; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ + _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) + +/** + * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap + * instance + * + * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. + */ +struct kbase_ioctl_cs_tiler_heap_term { + __u64 gpu_heap_va; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ + _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) + +/** + * union kbase_ioctl_cs_get_glb_iface - Request the global control block + * of CSF interface capabilities + * + * @in: Input parameters + * @in.max_group_num: The maximum number of groups to be read. Can be 0, in + * which case groups_ptr is unused. + * @in.max_total_stream_num: The maximum number of CSs to be read. Can be 0, in + * which case streams_ptr is unused. + * @in.groups_ptr: Pointer where to store all the group data (sequentially). + * @in.streams_ptr: Pointer where to store all the CS data (sequentially). + * @out: Output parameters + * @out.glb_version: Global interface version. + * @out.features: Bit mask of features (e.g. whether certain types of job + * can be suspended). + * @out.group_num: Number of CSGs supported. + * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 + * hold the size of firmware performance counter data + * and 15:0 hold the size of hardware performance counter + * data. + * @out.total_stream_num: Total number of CSs, summed across all groups. + * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum + * size of events. Bits 3:0 hold the offset update rate. + * (csf >= 1.1.0) + * + */ +union kbase_ioctl_cs_get_glb_iface { + struct { + __u32 max_group_num; + __u32 max_total_stream_num; + __u64 groups_ptr; + __u64 streams_ptr; + } in; + struct { + __u32 glb_version; + __u32 features; + __u32 group_num; + __u32 prfcnt_size; + __u32 total_stream_num; + __u32 instr_features; + } out; +}; + +#define KBASE_IOCTL_CS_GET_GLB_IFACE \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) + +struct kbase_ioctl_cs_cpu_queue_info { + __u64 buffer; + __u64 size; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ + _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) + +/** + * union kbase_ioctl_mem_alloc_ex - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @in.fixed_address: The GPU virtual address requested for the allocation, + * if the allocation is using the BASE_MEM_FIXED flag. + * @in.extra: Space for extra parameters that may be added in the future. 
+ * @out: Output parameters + * @out.flags: Flags + * @out.gpu_va: The GPU virtual address which is allocated + */ +union kbase_ioctl_mem_alloc_ex { + struct { + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u64 flags; + __u64 fixed_address; + __u64 extra[3]; + } in; + struct { + __u64 flags; + __u64 gpu_va; + } out; +}; + +#define KBASE_IOCTL_MEM_ALLOC_EX _IOWR(KBASE_IOCTL_TYPE, 59, union kbase_ioctl_mem_alloc_ex) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +/** + * struct kbase_ioctl_cs_event_memory_write - Write an event memory address + * @cpu_addr: Memory address to write + * @value: Value to write + * @padding: Currently unused, must be zero + */ +struct kbase_ioctl_cs_event_memory_write { + __u64 cpu_addr; + __u8 value; + __u8 padding[7]; +}; + +/** + * union kbase_ioctl_cs_event_memory_read - Read an event memory address + * @in: Input parameters + * @in.cpu_addr: Memory address to read + * @out: Output parameters + * @out.value: Value read + * @out.padding: Currently unused, must be zero + */ +union kbase_ioctl_cs_event_memory_read { + struct { + __u64 cpu_addr; + } in; + struct { + __u8 value; + __u8 padding[7]; + } out; +}; + +#endif /* MALI_UNIT_TEST */ + +#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ diff --git a/src/panfrost/base/include/jm/mali_base_jm_kernel.h b/src/panfrost/base/include/jm/mali_base_jm_kernel.h new file mode 100644 index 00000000000..ae43908b936 --- /dev/null +++ b/src/panfrost/base/include/jm/mali_base_jm_kernel.h @@ -0,0 +1,1051 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_JM_KERNEL_H_ +#define _UAPI_BASE_JM_KERNEL_H_ + +#include +#include "../mali_base_common_kernel.h" + +/* Memory allocation, access/hint flags & mask specific to JM GPU. + * + * See base_mem_alloc_flags. + */ + +/* Used as BASE_MEM_FIXED in other backends */ +#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) + +/** + * BASE_MEM_RESERVED_BIT_19 - Bit 19 is reserved. 
+ * + * Do not remove, use the next unreserved bit for new flags + */ +#define BASE_MEM_RESERVED_BIT_19 ((base_mem_alloc_flags)1 << 19) + +/** + * BASE_MEM_TILER_ALIGN_TOP - Memory starting from the end of the initial commit is aligned + * to 'extension' pages, where 'extension' must be a power of 2 and no more than + * BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES + */ +#define BASE_MEM_TILER_ALIGN_TOP ((base_mem_alloc_flags)1 << 20) + +/* Use the GPU VA chosen by the kernel client */ +#define BASE_MEM_FLAG_MAP_FIXED ((base_mem_alloc_flags)1 << 27) + +/* Force trimming of JIT allocations when creating a new allocation */ +#define BASEP_MEM_PERFORM_JIT_TRIM ((base_mem_alloc_flags)1 << 29) + +/* Note that the number of bits used for base_mem_alloc_flags + * must be less than BASE_MEM_FLAGS_NR_BITS !!! + */ + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE | \ + BASE_MEM_FLAG_MAP_FIXED | BASEP_MEM_PERFORM_JIT_TRIM) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED \ + (BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_19) + + +/* Similar to BASE_MEM_TILER_ALIGN_TOP, memory starting from the end of the + * initial commit is aligned to 'extension' pages, where 'extension' must be a power + * of 2 and no more than BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES + */ +#define BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP (1 << 0) + +/** + * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE - If set, the heap info address points + * to a __u32 holding the used size in bytes; + * otherwise it points to a __u64 holding the lowest address of unused memory. + */ +#define BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE (1 << 1) + +/** + * BASE_JIT_ALLOC_VALID_FLAGS - Valid set of just-in-time memory allocation flags + * + * Note: BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE cannot be set if heap_info_gpu_addr + * in %base_jit_alloc_info is 0 (atom with BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE set + * and heap_info_gpu_addr being 0 will be rejected). + */ +#define BASE_JIT_ALLOC_VALID_FLAGS \ + (BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP | BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* + * Private flags used on the base context + * + * These start at bit 31, and run down to zero. + * + * They share the same space as base_context_create_flags, and so must + * not collide with them. + */ + +/* Private flag tracking whether job descriptor dumping is disabled */ +#define BASEP_CONTEXT_FLAG_JOB_DUMP_DISABLED \ + ((base_context_create_flags)(1 << 31)) + +/* Flags for base tracepoint specific to JM */ +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED) +/* + * Dependency stuff, keep it private for now. May want to expose it if + * we decide to make the number of semaphores a configurable + * option. + */ +#define BASE_JD_ATOM_COUNT 256 + +/* Maximum number of concurrent render passes. 
+ */ +#define BASE_JD_RP_COUNT (256) + +/* Set/reset values for a software event */ +#define BASE_JD_SOFT_EVENT_SET ((unsigned char)1) +#define BASE_JD_SOFT_EVENT_RESET ((unsigned char)0) + +/** + * struct base_jd_udata - Per-job data + * + * @blob: per-job data array + * + * This structure is used to store per-job data, and is completely unused + * by the Base driver. It can be used to store things such as callback + * function pointer, data to handle job completion. It is guaranteed to be + * untouched by the Base driver. + */ +struct base_jd_udata { + __u64 blob[2]; +}; + +/** + * typedef base_jd_dep_type - Job dependency type. + * + * A flags field will be inserted into the atom structure to specify whether a + * dependency is a data or ordering dependency (by putting it before/after + * 'core_req' in the structure it should be possible to add without changing + * the structure size). + * When the flag is set for a particular dependency to signal that it is an + * ordering only dependency then errors will not be propagated. + */ +typedef __u8 base_jd_dep_type; + +#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ +#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ +#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ + +/** + * typedef base_jd_core_req - Job chain hardware requirements. + * + * A job chain must specify what GPU features it needs to allow the + * driver to schedule the job correctly. By not specifying the + * correct settings can/will cause an early job termination. Multiple + * values can be ORed together to specify multiple requirements. + * Special case is ::BASE_JD_REQ_DEP, which is used to express complex + * dependencies, and that doesn't execute anything on the hardware. + */ +typedef __u32 base_jd_core_req; + +/* Requirements that come from the HW */ + +/* No requirement, dependency only + */ +#define BASE_JD_REQ_DEP ((base_jd_core_req)0) + +/* Requires fragment shaders + */ +#define BASE_JD_REQ_FS ((base_jd_core_req)1 << 0) + +/* Requires compute shaders + * + * This covers any of the following GPU job types: + * - Vertex Shader Job + * - Geometry Shader Job + * - An actual Compute Shader Job + * + * Compare this with BASE_JD_REQ_ONLY_COMPUTE, which specifies that the + * job is specifically just the "Compute Shader" job type, and not the "Vertex + * Shader" nor the "Geometry Shader" job type. + */ +#define BASE_JD_REQ_CS ((base_jd_core_req)1 << 1) + +/* Requires tiling */ +#define BASE_JD_REQ_T ((base_jd_core_req)1 << 2) + +/* Requires cache flushes */ +#define BASE_JD_REQ_CF ((base_jd_core_req)1 << 3) + +/* Requires value writeback */ +#define BASE_JD_REQ_V ((base_jd_core_req)1 << 4) + +/* SW-only requirements - the HW does not expose these as part of the job slot + * capabilities + */ + +/* Requires fragment job with AFBC encoding */ +#define BASE_JD_REQ_FS_AFBC ((base_jd_core_req)1 << 13) + +/* SW-only requirement: coalesce completion events. + * If this bit is set then completion of this atom will not cause an event to + * be sent to userspace, whether successful or not; completion events will be + * deferred until an atom completes which does not have this bit set. + * + * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. + */ +#define BASE_JD_REQ_EVENT_COALESCE ((base_jd_core_req)1 << 5) + +/* SW Only requirement: the job chain requires a coherent core group. We don't + * mind which coherent core group is used. 
+ */ +#define BASE_JD_REQ_COHERENT_GROUP ((base_jd_core_req)1 << 6) + +/* SW Only requirement: The performance counters should be enabled only when + * they are needed, to reduce power consumption. + */ +#define BASE_JD_REQ_PERMON ((base_jd_core_req)1 << 7) + +/* SW Only requirement: External resources are referenced by this atom. + * + * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE and + * BASE_JD_REQ_SOFT_EVENT_WAIT. + */ +#define BASE_JD_REQ_EXTERNAL_RESOURCES ((base_jd_core_req)1 << 8) + +/* SW Only requirement: Software defined job. Jobs with this bit set will not be + * submitted to the hardware but will cause some action to happen within the + * driver + */ +#define BASE_JD_REQ_SOFT_JOB ((base_jd_core_req)1 << 9) + +#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) +#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) +#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) + +/* 0x4 RESERVED for now */ + +/* SW only requirement: event wait/trigger job. + * + * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. + * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the + * other waiting jobs. It completes immediately. + * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it + * possible for other jobs to wait upon. It completes immediately. + */ +#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) +#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) +#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) + +#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) + +/* SW only requirement: Just In Time allocation + * + * This job requests a single or multiple just-in-time allocations through a + * list of base_jit_alloc_info structure which is passed via the jc element of + * the atom. The number of base_jit_alloc_info structures present in the + * list is passed via the nr_extres element of the atom + * + * It should be noted that the id entry in base_jit_alloc_info must not + * be reused until it has been released via BASE_JD_REQ_SOFT_JIT_FREE. + * + * Should this soft job fail it is expected that a BASE_JD_REQ_SOFT_JIT_FREE + * soft job to free the JIT allocation is still made. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) + +/* SW only requirement: Just In Time free + * + * This job requests a single or multiple just-in-time allocations created by + * BASE_JD_REQ_SOFT_JIT_ALLOC to be freed. The ID list of the just-in-time + * allocations is passed via the jc element of the atom. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) + +/* SW only requirement: Map external resource + * + * This job requests external resource(s) are mapped once the dependencies + * of the job have been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) + +/* SW only requirement: Unmap external resource + * + * This job requests external resource(s) are unmapped once the dependencies + * of the job has been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * base_external_resource_list. 
+ */ +#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) + +/* HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) + * + * This indicates that the Job Chain contains GPU jobs of the 'Compute + * Shaders' type. + * + * In contrast to BASE_JD_REQ_CS, this does not indicate that the Job + * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. + */ +#define BASE_JD_REQ_ONLY_COMPUTE ((base_jd_core_req)1 << 10) + +/* HW Requirement: Use the base_jd_atom::device_nr field to specify a + * particular core group + * + * If both BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag + * takes priority + * + * This is only guaranteed to work for BASE_JD_REQ_ONLY_COMPUTE atoms. + */ +#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((base_jd_core_req)1 << 11) + +/* SW Flag: If this bit is set then the successful completion of this atom + * will not cause an event to be sent to userspace + */ +#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((base_jd_core_req)1 << 12) + +/* SW Flag: If this bit is set then completion of this atom will not cause an + * event to be sent to userspace, whether successful or not. + */ +#define BASEP_JD_REQ_EVENT_NEVER ((base_jd_core_req)1 << 14) + +/* SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job starts which does not have this bit set or a job completes + * which does not have the BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use + * if the CPU may have written to memory addressed by the job since the last job + * without this bit set was submitted. + */ +#define BASE_JD_REQ_SKIP_CACHE_START ((base_jd_core_req)1 << 15) + +/* SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job completes which does not have this bit set or a job starts + * which does not have the BASE_JD_REQ_SKIP_CACHE_START bit set. Do not use + * if the CPU may read from or partially overwrite memory addressed by the job + * before the next job without this bit set completes. + */ +#define BASE_JD_REQ_SKIP_CACHE_END ((base_jd_core_req)1 << 16) + +/* Request the atom be executed on a specific job slot. + * + * When this flag is specified, it takes precedence over any existing job slot + * selection logic. + */ +#define BASE_JD_REQ_JOB_SLOT ((base_jd_core_req)1 << 17) + +/* SW-only requirement: The atom is the start of a renderpass. + * + * If this bit is set then the job chain will be soft-stopped if it causes the + * GPU to write beyond the end of the physical pages backing the tiler heap, and + * committing more memory to the heap would exceed an internal threshold. It may + * be resumed after running one of the job chains attached to an atom with + * BASE_JD_REQ_END_RENDERPASS set and the same renderpass ID. It may be + * resumed multiple times until it completes without memory usage exceeding the + * threshold. + * + * Usually used with BASE_JD_REQ_T. + */ +#define BASE_JD_REQ_START_RENDERPASS ((base_jd_core_req)1 << 18) + +/* SW-only requirement: The atom is the end of a renderpass. + * + * If this bit is set then the atom incorporates the CPU address of a + * base_jd_fragment object instead of the GPU address of a job chain. 
+ * + * Which job chain is run depends upon whether the atom with the same renderpass + * ID and the BASE_JD_REQ_START_RENDERPASS bit set completed normally or + * was soft-stopped when it exceeded an upper threshold for tiler heap memory + * usage. + * + * It also depends upon whether one of the job chains attached to the atom has + * already been run as part of the same renderpass (in which case it would have + * written unresolved multisampled and otherwise-discarded output to temporary + * buffers that need to be read back). The job chain for doing a forced read and + * forced write (from/to temporary buffers) is run as many times as necessary. + * + * Usually used with BASE_JD_REQ_FS. + */ +#define BASE_JD_REQ_END_RENDERPASS ((base_jd_core_req)1 << 19) + +/* SW-only requirement: The atom needs to run on a limited core mask affinity. + * + * If this bit is set then the kbase_context.limited_core_mask will be applied + * to the affinity. + */ +#define BASE_JD_REQ_LIMITED_CORE_MASK ((base_jd_core_req)1 << 20) + +/* These requirement bits are currently unused in base_jd_core_req + */ +#define BASEP_JD_REQ_RESERVED \ + (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ + BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | BASEP_JD_REQ_EVENT_NEVER | \ + BASE_JD_REQ_EVENT_COALESCE | \ + BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ + BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ + BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END | \ + BASE_JD_REQ_JOB_SLOT | BASE_JD_REQ_START_RENDERPASS | \ + BASE_JD_REQ_END_RENDERPASS | BASE_JD_REQ_LIMITED_CORE_MASK)) + +/* Mask of all bits in base_jd_core_req that control the type of the atom. + * + * This allows dependency only atoms to have flags set + */ +#define BASE_JD_REQ_ATOM_TYPE \ + (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ + BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) + +/** + * BASE_JD_REQ_SOFT_JOB_TYPE - Mask of all bits in base_jd_core_req that + * controls the type of a soft job. + */ +#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) + +/* Returns non-zero value if core requirements passed define a soft job or + * a dependency only job. + */ +#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ + (((core_req) & BASE_JD_REQ_SOFT_JOB) || \ + ((core_req) & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) + +/** + * enum kbase_jd_atom_state - Atom states + * + * @KBASE_JD_ATOM_STATE_UNUSED: Atom is not used. + * @KBASE_JD_ATOM_STATE_QUEUED: Atom is queued in JD. + * @KBASE_JD_ATOM_STATE_IN_JS: Atom has been given to JS (is runnable/running). + * @KBASE_JD_ATOM_STATE_HW_COMPLETED: Atom has been completed, but not yet + * handed back to job dispatcher for + * dependency resolution. + * @KBASE_JD_ATOM_STATE_COMPLETED: Atom has been completed, but not yet handed + * back to userspace. + */ +enum kbase_jd_atom_state { + KBASE_JD_ATOM_STATE_UNUSED, + KBASE_JD_ATOM_STATE_QUEUED, + KBASE_JD_ATOM_STATE_IN_JS, + KBASE_JD_ATOM_STATE_HW_COMPLETED, + KBASE_JD_ATOM_STATE_COMPLETED +}; + +/** + * typedef base_atom_id - Type big enough to store an atom number in. + */ +typedef __u8 base_atom_id; + +/** + * struct base_dependency - base dependency + * + * @atom_id: An atom number + * @dependency_type: Dependency type + */ +struct base_dependency { + base_atom_id atom_id; + base_jd_dep_type dependency_type; +}; + +/** + * struct base_jd_fragment - Set of GPU fragment job chains used for rendering. + * + * @norm_read_norm_write: Job chain for full rendering. 
+ * GPU address of a fragment job chain to render in the + * circumstance where the tiler job chain did not exceed + * its memory usage threshold and no fragment job chain + * was previously run for the same renderpass. + * It is used no more than once per renderpass. + * @norm_read_forced_write: Job chain for starting incremental + * rendering. + * GPU address of a fragment job chain to render in + * the circumstance where the tiler job chain exceeded + * its memory usage threshold for the first time and + * no fragment job chain was previously run for the + * same renderpass. + * Writes unresolved multisampled and normally- + * discarded output to temporary buffers that must be + * read back by a subsequent forced_read job chain + * before the renderpass is complete. + * It is used no more than once per renderpass. + * @forced_read_forced_write: Job chain for continuing incremental + * rendering. + * GPU address of a fragment job chain to render in + * the circumstance where the tiler job chain + * exceeded its memory usage threshold again + * and a fragment job chain was previously run for + * the same renderpass. + * Reads unresolved multisampled and + * normally-discarded output from temporary buffers + * written by a previous forced_write job chain and + * writes the same to temporary buffers again. + * It is used as many times as required until + * rendering completes. + * @forced_read_norm_write: Job chain for ending incremental rendering. + * GPU address of a fragment job chain to render in the + * circumstance where the tiler job chain did not + * exceed its memory usage threshold this time and a + * fragment job chain was previously run for the same + * renderpass. + * Reads unresolved multisampled and normally-discarded + * output from temporary buffers written by a previous + * forced_write job chain in order to complete a + * renderpass. + * It is used no more than once per renderpass. + * + * This structure is referenced by the main atom structure if + * BASE_JD_REQ_END_RENDERPASS is set in the base_jd_core_req. + */ +struct base_jd_fragment { + __u64 norm_read_norm_write; + __u64 norm_read_forced_write; + __u64 forced_read_forced_write; + __u64 forced_read_norm_write; +}; + +/** + * typedef base_jd_prio - Base Atom priority. + * + * Only certain priority levels are actually implemented, as specified by the + * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority + * level that is not one of those defined below. + * + * Priority levels only affect scheduling after the atoms have had dependencies + * resolved. For example, a low priority atom that has had its dependencies + * resolved might run before a higher priority atom that has not had its + * dependencies resolved. + * + * In general, fragment atoms do not affect non-fragment atoms with + * lower priorities, and vice versa. One exception is that there is only one + * priority value for each context. So a high-priority (e.g.) fragment atom + * could increase its context priority, causing its non-fragment atoms to also + * be scheduled sooner. 
+ * + * The atoms are scheduled as follows with respect to their priorities: + * * Let atoms 'X' and 'Y' be for the same job slot who have dependencies + * resolved, and atom 'X' has a higher priority than atom 'Y' + * * If atom 'Y' is currently running on the HW, then it is interrupted to + * allow atom 'X' to run soon after + * * If instead neither atom 'Y' nor atom 'X' are running, then when choosing + * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' + * * Any two atoms that have the same priority could run in any order with + * respect to each other. That is, there is no ordering constraint between + * atoms of the same priority. + * + * The sysfs file 'js_ctx_scheduling_mode' is used to control how atoms are + * scheduled between contexts. The default value, 0, will cause higher-priority + * atoms to be scheduled first, regardless of their context. The value 1 will + * use a round-robin algorithm when deciding which context's atoms to schedule + * next, so higher-priority atoms can only preempt lower priority atoms within + * the same context. See KBASE_JS_SYSTEM_PRIORITY_MODE and + * KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE for more details. + */ +typedef __u8 base_jd_prio; + +/* Medium atom priority. This is a priority higher than BASE_JD_PRIO_LOW */ +#define BASE_JD_PRIO_MEDIUM ((base_jd_prio)0) +/* High atom priority. This is a priority higher than BASE_JD_PRIO_MEDIUM and + * BASE_JD_PRIO_LOW + */ +#define BASE_JD_PRIO_HIGH ((base_jd_prio)1) +/* Low atom priority. */ +#define BASE_JD_PRIO_LOW ((base_jd_prio)2) +/* Real-Time atom priority. This is a priority higher than BASE_JD_PRIO_HIGH, + * BASE_JD_PRIO_MEDIUM, and BASE_JD_PRIO_LOW + */ +#define BASE_JD_PRIO_REALTIME ((base_jd_prio)3) + +/* Invalid atom priority (max uint8_t value) */ +#define BASE_JD_PRIO_INVALID ((base_jd_prio)255) + +/* Count of the number of priority levels. This itself is not a valid + * base_jd_prio setting + */ +#define BASE_JD_NR_PRIO_LEVELS 4 + +/** + * struct base_jd_atom_v2 - Node of a dependency graph used to submit a + * GPU job chain or soft-job to the kernel driver. + * + * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS + * is set in the base_jd_core_req) the CPU address of a + * base_jd_fragment object. + * @udata: User data. + * @extres_list: List of external resources. + * @nr_extres: Number of external resources or JIT allocations. + * @jit_id: Zero-terminated array of IDs of just-in-time memory + * allocations written to by the atom. When the atom + * completes, the value stored at the + * &struct_base_jit_alloc_info.heap_info_gpu_addr of + * each allocation is read in order to enforce an + * overall physical memory usage limit. + * @pre_dep: Pre-dependencies. One need to use SETTER function to assign + * this field; this is done in order to reduce possibility of + * improper assignment of a dependency field. + * @atom_number: Unique number to identify the atom. + * @prio: Atom priority. Refer to base_jd_prio for more details. + * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + * specified. + * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. + * @core_req: Core requirements. + * @renderpass_id: Renderpass identifier used to associate an atom that has + * BASE_JD_REQ_START_RENDERPASS set in its core requirements + * with an atom that has BASE_JD_REQ_END_RENDERPASS set. + * @padding: Unused. Must be zero. + * + * This structure has changed since UK 10.2 for which base_jd_core_req was a + * __u16 value. 
+ * + * In UK 10.3 a core_req field of a __u32 type was added to the end of the + * structure, and the place in the structure previously occupied by __u16 + * core_req was kept but renamed to compat_core_req. + * + * From UK 11.20 - compat_core_req is now occupied by __u8 jit_id[2]. + * Compatibility with UK 10.x from UK 11.y is not handled because + * the major version increase prevents this. + * + * For UK 11.20 jit_id[2] must be initialized to zero. + */ +struct base_jd_atom_v2 { + __u64 jc; + struct base_jd_udata udata; + __u64 extres_list; + __u16 nr_extres; + __u8 jit_id[2]; + struct base_dependency pre_dep[2]; + base_atom_id atom_number; + base_jd_prio prio; + __u8 device_nr; + __u8 jobslot; + base_jd_core_req core_req; + __u8 renderpass_id; + __u8 padding[7]; +}; + +/** + * struct base_jd_atom - Same as base_jd_atom_v2, but has an extra seq_nr + * at the beginning. + * + * @seq_nr: Sequence number of logical grouping of atoms. + * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS + * is set in the base_jd_core_req) the CPU address of a + * base_jd_fragment object. + * @udata: User data. + * @extres_list: List of external resources. + * @nr_extres: Number of external resources or JIT allocations. + * @jit_id: Zero-terminated array of IDs of just-in-time memory + * allocations written to by the atom. When the atom + * completes, the value stored at the + * &struct_base_jit_alloc_info.heap_info_gpu_addr of + * each allocation is read in order to enforce an + * overall physical memory usage limit. + * @pre_dep: Pre-dependencies. One need to use SETTER function to assign + * this field; this is done in order to reduce possibility of + * improper assignment of a dependency field. + * @atom_number: Unique number to identify the atom. + * @prio: Atom priority. Refer to base_jd_prio for more details. + * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + * specified. + * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. + * @core_req: Core requirements. + * @renderpass_id: Renderpass identifier used to associate an atom that has + * BASE_JD_REQ_START_RENDERPASS set in its core requirements + * with an atom that has BASE_JD_REQ_END_RENDERPASS set. + * @padding: Unused. Must be zero. + */ +typedef struct base_jd_atom { + __u64 seq_nr; + __u64 jc; + struct base_jd_udata udata; + __u64 extres_list; + __u16 nr_extres; + __u8 jit_id[2]; + struct base_dependency pre_dep[2]; + base_atom_id atom_number; + base_jd_prio prio; + __u8 device_nr; + __u8 jobslot; + base_jd_core_req core_req; + __u8 renderpass_id; + __u8 padding[7]; +} base_jd_atom; + +/* Job chain event code bits + * Defines the bits used to create ::base_jd_event_code + */ +enum { + BASE_JD_SW_EVENT_KERNEL = (1u << 15), /* Kernel side event */ + BASE_JD_SW_EVENT = (1u << 14), /* SW defined event */ + /* Event indicates success (SW events only) */ + BASE_JD_SW_EVENT_SUCCESS = (1u << 13), + BASE_JD_SW_EVENT_JOB = (0u << 11), /* Job related event */ + BASE_JD_SW_EVENT_BAG = (1u << 11), /* Bag related event */ + BASE_JD_SW_EVENT_INFO = (2u << 11), /* Misc/info event */ + BASE_JD_SW_EVENT_RESERVED = (3u << 11), /* Reserved event type */ + /* Mask to extract the type from an event code */ + BASE_JD_SW_EVENT_TYPE_MASK = (3u << 11) +}; + +/** + * enum base_jd_event_code - Job chain event codes + * + * @BASE_JD_EVENT_RANGE_HW_NONFAULT_START: Start of hardware non-fault status + * codes. + * Obscurely, BASE_JD_EVENT_TERMINATED + * indicates a real fault, because the + * job was hard-stopped. 
+ * @BASE_JD_EVENT_NOT_STARTED: Can't be seen by userspace, treated as + * 'previous job done'. + * @BASE_JD_EVENT_STOPPED: Can't be seen by userspace, becomes + * TERMINATED, DONE or JOB_CANCELLED. + * @BASE_JD_EVENT_TERMINATED: This is actually a fault status code - the job + * was hard stopped. + * @BASE_JD_EVENT_ACTIVE: Can't be seen by userspace, jobs only returned on + * complete/fail/cancel. + * @BASE_JD_EVENT_RANGE_HW_NONFAULT_END: End of hardware non-fault status codes. + * Obscurely, BASE_JD_EVENT_TERMINATED + * indicates a real fault, + * because the job was hard-stopped. + * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START: Start of hardware fault and + * software error status codes. + * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END: End of hardware fault and + * software error status codes. + * @BASE_JD_EVENT_RANGE_SW_SUCCESS_START: Start of software success status + * codes. + * @BASE_JD_EVENT_RANGE_SW_SUCCESS_END: End of software success status codes. + * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_START: Start of kernel-only status codes. + * Such codes are never returned to + * user-space. + * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_END: End of kernel-only status codes. + * @BASE_JD_EVENT_DONE: atom has completed successfully + * @BASE_JD_EVENT_JOB_CONFIG_FAULT: Atom dependencies configuration error which + * shall result in a failed atom + * @BASE_JD_EVENT_JOB_POWER_FAULT: The job could not be executed because the + * part of the memory system required to access + * job descriptors was not powered on + * @BASE_JD_EVENT_JOB_READ_FAULT: Reading a job descriptor into the Job + * manager failed + * @BASE_JD_EVENT_JOB_WRITE_FAULT: Writing a job descriptor from the Job + * manager failed + * @BASE_JD_EVENT_JOB_AFFINITY_FAULT: The job could not be executed because the + * specified affinity mask does not intersect + * any available cores + * @BASE_JD_EVENT_JOB_BUS_FAULT: A bus access failed while executing a job + * @BASE_JD_EVENT_INSTR_INVALID_PC: A shader instruction with an illegal program + * counter was executed. + * @BASE_JD_EVENT_INSTR_INVALID_ENC: A shader instruction with an illegal + * encoding was executed. + * @BASE_JD_EVENT_INSTR_TYPE_MISMATCH: A shader instruction was executed where + * the instruction encoding did not match the + * instruction type encoded in the program + * counter. + * @BASE_JD_EVENT_INSTR_OPERAND_FAULT: A shader instruction was executed that + * contained invalid combinations of operands. + * @BASE_JD_EVENT_INSTR_TLS_FAULT: A shader instruction was executed that tried + * to access the thread local storage section + * of another thread. + * @BASE_JD_EVENT_INSTR_ALIGN_FAULT: A shader instruction was executed that + * tried to do an unsupported unaligned memory + * access. + * @BASE_JD_EVENT_INSTR_BARRIER_FAULT: A shader instruction was executed that + * failed to complete an instruction barrier. + * @BASE_JD_EVENT_DATA_INVALID_FAULT: Any data structure read as part of the job + * contains invalid combinations of data. + * @BASE_JD_EVENT_TILE_RANGE_FAULT: Tile or fragment shading was asked to + * process a tile that is entirely outside the + * bounding box of the frame. + * @BASE_JD_EVENT_STATE_FAULT: Matches ADDR_RANGE_FAULT. A virtual address + * has been found that exceeds the virtual + * address range. + * @BASE_JD_EVENT_OUT_OF_MEMORY: The tiler ran out of memory when executing a job. + * @BASE_JD_EVENT_UNKNOWN: If multiple jobs in a job chain fail, only + * the first one that reports an error will set + * and return full error information.
+ * Subsequent failing jobs will not update the + * error status registers, and may write an + * error status of UNKNOWN. + * @BASE_JD_EVENT_DELAYED_BUS_FAULT: The GPU received a bus fault for access to + * physical memory where the original virtual + * address is no longer available. + * @BASE_JD_EVENT_SHAREABILITY_FAULT: Matches GPU_SHAREABILITY_FAULT. A cache + * has detected that the same line has been + * accessed as both shareable and non-shareable + * memory from inside the GPU. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1: A memory access hit an invalid table + * entry at level 1 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2: A memory access hit an invalid table + * entry at level 2 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3: A memory access hit an invalid table + * entry at level 3 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4: A memory access hit an invalid table + * entry at level 4 of the translation table. + * @BASE_JD_EVENT_PERMISSION_FAULT: A memory access could not be allowed due to + * the permission flags set in translation + * table + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1: A bus fault occurred while reading + * level 0 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2: A bus fault occurred while reading + * level 1 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3: A bus fault occurred while reading + * level 2 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4: A bus fault occurred while reading + * level 3 of the translation tables. + * @BASE_JD_EVENT_ACCESS_FLAG: Matches ACCESS_FLAG_0. A memory access hit a + * translation table entry with the ACCESS_FLAG + * bit set to zero in level 0 of the + * page table, and the DISABLE_AF_FAULT flag + * was not set. + * @BASE_JD_EVENT_MEM_GROWTH_FAILED: raised for JIT_ALLOC atoms that failed to + * grow memory on demand + * @BASE_JD_EVENT_JOB_CANCELLED: raised when this atom was hard-stopped or its + * dependencies failed + * @BASE_JD_EVENT_JOB_INVALID: raised for many reasons, including invalid data + * in the atom which overlaps with + * BASE_JD_EVENT_JOB_CONFIG_FAULT, or if the + * platform doesn't support the feature specified in + * the atom. + * @BASE_JD_EVENT_DRV_TERMINATED: this is a special event generated to indicate + * to userspace that the KBase context has been + * destroyed and Base should stop listening for + * further events + * @BASE_JD_EVENT_REMOVED_FROM_NEXT: raised when an atom that was configured in + * the GPU has to be retried (but it has not + * started) due to e.g., GPU reset + * @BASE_JD_EVENT_END_RP_DONE: this is used for incremental rendering to signal + * the completion of a renderpass. This value + * shouldn't be returned to userspace but I haven't + * seen where it is reset back to JD_EVENT_DONE. + * + * HW and low-level SW events are represented by event codes. + * The status of jobs which succeeded are also represented by + * an event code (see @BASE_JD_EVENT_DONE). + * Events are usually reported as part of a &struct base_jd_event. 
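+ *
+ * For instance, user space reading back a &struct base_jd_event_v2 (named
+ * ev below purely for illustration) might classify the code roughly as:
+ *
+ *   if (ev.event_code == BASE_JD_EVENT_DONE) {
+ *           // success
+ *   } else if (ev.event_code & BASE_JD_SW_EVENT) {
+ *           __u32 type = ev.event_code & BASE_JD_SW_EVENT_TYPE_MASK;
+ *           // BASE_JD_SW_EVENT_JOB, _BAG, _INFO or _RESERVED
+ *   } else {
+ *           // HW fault or error code, e.g. BASE_JD_EVENT_JOB_BUS_FAULT
+ *   }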
+ * + * The event codes are encoded in the following way: + * * 10:0 - subtype + * * 12:11 - type + * * 13 - SW success (only valid if the SW bit is set) + * * 14 - SW event (HW event if not set) + * * 15 - Kernel event (should never be seen in userspace) + * + * Events are split up into ranges as follows: + * * BASE_JD_EVENT_RANGE_<description>_START + * * BASE_JD_EVENT_RANGE_<description>_END + * + * code is in <description>'s range when: + * BASE_JD_EVENT_RANGE_<description>_START <= code < + * BASE_JD_EVENT_RANGE_<description>_END + * + * Ranges can be asserted for adjacency by testing that the END of the previous + * is equal to the START of the next. This is useful for optimizing some tests + * for range. + * + * A limitation is that the last member of this enum must explicitly be handled + * (with an assert-unreachable statement) in switch statements that use + * variables of this type. Otherwise, the compiler warns that we have not + * handled that enum value. + */ +enum base_jd_event_code { + /* HW defined exceptions */ + BASE_JD_EVENT_RANGE_HW_NONFAULT_START = 0, + + /* non-fatal exceptions */ + BASE_JD_EVENT_NOT_STARTED = 0x00, + BASE_JD_EVENT_DONE = 0x01, + BASE_JD_EVENT_STOPPED = 0x03, + BASE_JD_EVENT_TERMINATED = 0x04, + BASE_JD_EVENT_ACTIVE = 0x08, + + BASE_JD_EVENT_RANGE_HW_NONFAULT_END = 0x40, + BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START = 0x40, + + /* job exceptions */ + BASE_JD_EVENT_JOB_CONFIG_FAULT = 0x40, + BASE_JD_EVENT_JOB_POWER_FAULT = 0x41, + BASE_JD_EVENT_JOB_READ_FAULT = 0x42, + BASE_JD_EVENT_JOB_WRITE_FAULT = 0x43, + BASE_JD_EVENT_JOB_AFFINITY_FAULT = 0x44, + BASE_JD_EVENT_JOB_BUS_FAULT = 0x48, + BASE_JD_EVENT_INSTR_INVALID_PC = 0x50, + BASE_JD_EVENT_INSTR_INVALID_ENC = 0x51, + BASE_JD_EVENT_INSTR_TYPE_MISMATCH = 0x52, + BASE_JD_EVENT_INSTR_OPERAND_FAULT = 0x53, + BASE_JD_EVENT_INSTR_TLS_FAULT = 0x54, + BASE_JD_EVENT_INSTR_BARRIER_FAULT = 0x55, + BASE_JD_EVENT_INSTR_ALIGN_FAULT = 0x56, + BASE_JD_EVENT_DATA_INVALID_FAULT = 0x58, + BASE_JD_EVENT_TILE_RANGE_FAULT = 0x59, + BASE_JD_EVENT_STATE_FAULT = 0x5A, + BASE_JD_EVENT_OUT_OF_MEMORY = 0x60, + BASE_JD_EVENT_UNKNOWN = 0x7F, + + /* GPU exceptions */ + BASE_JD_EVENT_DELAYED_BUS_FAULT = 0x80, + BASE_JD_EVENT_SHAREABILITY_FAULT = 0x88, + + /* MMU exceptions */ + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1 = 0xC1, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2 = 0xC2, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3 = 0xC3, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4 = 0xC4, + BASE_JD_EVENT_PERMISSION_FAULT = 0xC8, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1 = 0xD1, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2 = 0xD2, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3 = 0xD3, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4 = 0xD4, + BASE_JD_EVENT_ACCESS_FLAG = 0xD8, + + /* SW defined exceptions */ + BASE_JD_EVENT_MEM_GROWTH_FAILED = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x000, + BASE_JD_EVENT_JOB_CANCELLED = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x002, + BASE_JD_EVENT_JOB_INVALID = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x003, + + BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_RESERVED | 0x3FF, + + BASE_JD_EVENT_RANGE_SW_SUCCESS_START = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | 0x000, + + BASE_JD_EVENT_DRV_TERMINATED = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_INFO | 0x000, + + BASE_JD_EVENT_RANGE_SW_SUCCESS_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_RESERVED | 0x3FF, + + BASE_JD_EVENT_RANGE_KERNEL_ONLY_START = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | 0x000, + BASE_JD_EVENT_REMOVED_FROM_NEXT =
BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x000, + BASE_JD_EVENT_END_RP_DONE = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x001, + + BASE_JD_EVENT_RANGE_KERNEL_ONLY_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_RESERVED | 0x3FF +}; + +/** + * struct base_jd_event_v2 - Event reporting structure + * + * @event_code: event code of type @ref base_jd_event_code. + * @atom_number: the atom number that has completed. + * @padding: padding. + * @udata: user data. + * + * This structure is used by the kernel driver to report information + * about GPU events. They can either be HW-specific events or low-level + * SW events, such as job-chain completion. + * + * The event code contains an event type field which can be extracted + * by ANDing with BASE_JD_SW_EVENT_TYPE_MASK. + */ +struct base_jd_event_v2 { + __u32 event_code; + base_atom_id atom_number; + __u8 padding[3]; + struct base_jd_udata udata; +}; + +/** + * struct base_dump_cpu_gpu_counters - Structure for + * BASE_JD_REQ_SOFT_DUMP_CPU_GPU_COUNTERS + * jobs. + * @system_time: gpu timestamp + * @cycle_counter: gpu cycle count + * @sec: cpu time(sec) + * @usec: cpu time(usec) + * @padding: padding + * + * This structure is stored into the memory pointed to by the @jc field + * of &struct base_jd_atom. + * + * It must not occupy the same CPU cache line(s) as any neighboring data. + * This is to avoid cases where access to pages containing the structure + * is shared between cached and un-cached memory regions, which would + * cause memory corruption. + */ + +struct base_dump_cpu_gpu_counters { + __u64 system_time; + __u64 cycle_counter; + __u64 sec; + __u32 usec; + __u8 padding[36]; +}; + +/** + * struct mali_base_gpu_core_props - GPU core props info + * + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to align to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + * @num_exec_engines: The number of execution engines. Only valid for tGOX + * (Bifrost) GPUs, where GPU_HAS_REG_CORE_FEATURES is defined. Otherwise, + * this is always 0. 
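+ *
+ * For example, a client that wants to know whether bit b of
+ * TEXTURE_FEATURES_0 is set (props and b being placeholder names) might
+ * simply test:
+ *
+ *   bool supported = props->texture_features[0] & (1u << b);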
+ */ +struct mali_base_gpu_core_props { + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; + __u8 num_exec_engines; +}; + +#endif /* _UAPI_BASE_JM_KERNEL_H_ */ diff --git a/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h new file mode 100644 index 00000000000..20d931adc9b --- /dev/null +++ b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_JM_IOCTL_H_ +#define _UAPI_KBASE_JM_IOCTL_H_ + +#include +#include + +/* + * 11.1: + * - Add BASE_MEM_TILER_ALIGN_TOP under base_mem_alloc_flags + * 11.2: + * - KBASE_MEM_QUERY_FLAGS can return KBASE_REG_PF_GROW and KBASE_REG_PROTECTED, + * which some user-side clients prior to 11.2 might fault if they received + * them + * 11.3: + * - New ioctls KBASE_IOCTL_STICKY_RESOURCE_MAP and + * KBASE_IOCTL_STICKY_RESOURCE_UNMAP + * 11.4: + * - New ioctl KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET + * 11.5: + * - New ioctl: KBASE_IOCTL_MEM_JIT_INIT (old ioctl renamed to _OLD) + * 11.6: + * - Added flags field to base_jit_alloc_info structure, which can be used to + * specify pseudo chunked tiler alignment for JIT allocations. + * 11.7: + * - Removed UMP support + * 11.8: + * - Added BASE_MEM_UNCACHED_GPU under base_mem_alloc_flags + * 11.9: + * - Added BASE_MEM_PERMANENT_KERNEL_MAPPING and BASE_MEM_FLAGS_KERNEL_ONLY + * under base_mem_alloc_flags + * 11.10: + * - Enabled the use of nr_extres field of base_jd_atom_v2 structure for + * JIT_ALLOC and JIT_FREE type softjobs to enable multiple JIT allocations + * with one softjob. + * 11.11: + * - Added BASE_MEM_GPU_VA_SAME_4GB_PAGE under base_mem_alloc_flags + * 11.12: + * - Removed ioctl: KBASE_IOCTL_GET_PROFILING_CONTROLS + * 11.13: + * - New ioctl: KBASE_IOCTL_MEM_EXEC_INIT + * 11.14: + * - Add BASE_MEM_GROUP_ID_MASK, base_mem_group_id_get, base_mem_group_id_set + * under base_mem_alloc_flags + * 11.15: + * - Added BASEP_CONTEXT_MMU_GROUP_ID_MASK under base_context_create_flags. + * - Require KBASE_IOCTL_SET_FLAGS before BASE_MEM_MAP_TRACKING_HANDLE can be + * passed to mmap(). + * 11.16: + * - Extended ioctl KBASE_IOCTL_MEM_SYNC to accept imported dma-buf. + * - Modified (backwards compatible) ioctl KBASE_IOCTL_MEM_IMPORT behavior for + * dma-buf. Now, buffers are mapped on GPU when first imported, no longer + * requiring external resource or sticky resource tracking. UNLESS, + * CONFIG_MALI_DMA_BUF_MAP_ON_DEMAND is enabled. + * 11.17: + * - Added BASE_JD_REQ_JOB_SLOT. 
+ * - Reused padding field in base_jd_atom_v2 to pass job slot number. + * - New ioctl: KBASE_IOCTL_GET_CPU_GPU_TIMEINFO + * 11.18: + * - Added BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP under base_mem_alloc_flags + * 11.19: + * - Extended base_jd_atom_v2 to allow a renderpass ID to be specified. + * 11.20: + * - Added new phys_pages member to kbase_ioctl_mem_jit_init for + * KBASE_IOCTL_MEM_JIT_INIT, previous variants of this renamed to use _10_2 + * (replacing '_OLD') and _11_5 suffixes + * - Replaced compat_core_req (deprecated in 10.3) with jit_id[2] in + * base_jd_atom_v2. It must currently be initialized to zero. + * - Added heap_info_gpu_addr to base_jit_alloc_info, and + * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE allowable in base_jit_alloc_info's + * flags member. Previous variants of this structure are kept and given _10_2 + * and _11_5 suffixes. + * - The above changes are checked for safe values in usual builds + * 11.21: + * - v2.0 of mali_trace debugfs file, which now versions the file separately + * 11.22: + * - Added base_jd_atom (v3), which is seq_nr + base_jd_atom_v2. + * KBASE_IOCTL_JOB_SUBMIT supports both in parallel. + * 11.23: + * - Modified KBASE_IOCTL_MEM_COMMIT behavior to reject requests to modify + * the physical memory backing of JIT allocations. This was not supposed + * to be a valid use case, but it was allowed by the previous implementation. + * 11.24: + * - Added a sysfs file 'serialize_jobs' inside a new sub-directory + * 'scheduling'. + * 11.25: + * - Enabled JIT pressure limit in base/kbase by default + * 11.26 + * - Added kinstr_jm API + * 11.27 + * - Backwards compatible extension to HWC ioctl. + * 11.28: + * - Added kernel side cache ops needed hint + * 11.29: + * - Reserve ioctl 52 + * 11.30: + * - Add a new priority level BASE_JD_PRIO_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 11.31: + * - Added BASE_JD_REQ_LIMITED_CORE_MASK. + * - Added ioctl 55: set_limited_core_count. + * 11.32: + * - Added new HW performance counters interface to all GPUs. + * 11.33: + * - Removed Kernel legacy HWC interface + * 11.34: + * - First release of new HW performance counters interface. 
+ * 11.35: + * - Dummy model (no mali) backend will now clear HWC values after each sample + */ +#define BASE_UK_VERSION_MAJOR 11 +#define BASE_UK_VERSION_MINOR 35 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + + +/** + * struct kbase_ioctl_job_submit - Submit jobs/atoms to the kernel + * + * @addr: Memory address of an array of struct base_jd_atom_v2 or v3 + * @nr_atoms: Number of entries in the array + * @stride: sizeof(struct base_jd_atom_v2) or sizeof(struct base_jd_atom) + */ +struct kbase_ioctl_job_submit { + __u64 addr; + __u32 nr_atoms; + __u32 stride; +}; + +#define KBASE_IOCTL_JOB_SUBMIT \ + _IOW(KBASE_IOCTL_TYPE, 2, struct kbase_ioctl_job_submit) + +#define KBASE_IOCTL_POST_TERM \ + _IO(KBASE_IOCTL_TYPE, 4) + +/** + * struct kbase_ioctl_soft_event_update - Update the status of a soft-event + * @event: GPU address of the event which has been updated + * @new_status: The new status to set + * @flags: Flags for future expansion + */ +struct kbase_ioctl_soft_event_update { + __u64 event; + __u32 new_status; + __u32 flags; +}; + +#define KBASE_IOCTL_SOFT_EVENT_UPDATE \ + _IOW(KBASE_IOCTL_TYPE, 28, struct kbase_ioctl_soft_event_update) + +/** + * struct kbase_kinstr_jm_fd_out - Explains the compatibility information for + * the `struct kbase_kinstr_jm_atom_state_change` structure returned from the + * kernel + * + * @size: The size of the `struct kbase_kinstr_jm_atom_state_change` + * @version: Represents a breaking change in the + * `struct kbase_kinstr_jm_atom_state_change` + * @padding: Explicit padding to get the structure up to 64bits. See + * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst + * + * The `struct kbase_kinstr_jm_atom_state_change` may have extra members at the + * end of the structure that older user space might not understand. If the + * `version` is the same, the structure is still compatible with newer kernels. + * The `size` can be used to cast the opaque memory returned from the kernel. + */ +struct kbase_kinstr_jm_fd_out { + __u16 size; + __u8 version; + __u8 padding[5]; +}; + +/** + * struct kbase_kinstr_jm_fd_in - Options when creating the file descriptor + * + * @count: Number of atom states that can be stored in the kernel circular + * buffer. Must be a power of two + * @padding: Explicit padding to get the structure up to 64bits. See + * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst + */ +struct kbase_kinstr_jm_fd_in { + __u16 count; + __u8 padding[6]; +}; + +union kbase_kinstr_jm_fd { + struct kbase_kinstr_jm_fd_in in; + struct kbase_kinstr_jm_fd_out out; +}; + +#define KBASE_IOCTL_KINSTR_JM_FD \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_kinstr_jm_fd) + + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#endif /* _UAPI_KBASE_JM_IOCTL_H_ */ diff --git a/src/panfrost/base/include/mali_base_common_kernel.h b/src/panfrost/base/include/mali_base_common_kernel.h new file mode 100644 index 00000000000..f8378146ace --- /dev/null +++ b/src/panfrost/base/include/mali_base_common_kernel.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. 
+ * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_COMMON_KERNEL_H_ +#define _UAPI_BASE_COMMON_KERNEL_H_ + +#include + +struct base_mem_handle { + struct { + __u64 handle; + } basep; +}; + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +/* Memory allocation, access/hint flags & mask. + * + * See base_mem_alloc_flags. + */ + +/* IN */ +/* Read access CPU side + */ +#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) + +/* Write access CPU side + */ +#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) + +/* Read access GPU side + */ +#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) + +/* Write access GPU side + */ +#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) + +/* Execute allowed on the GPU side + */ +#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) + +/* Will be permanently mapped in kernel space. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) + +/* The allocation will completely reside within the same 4GB chunk in the GPU + * virtual space. + * Since this flag is primarily required only for the TLS memory which will + * not be used to contain executable code and also not used for Tiler heap, + * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. + */ +#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) + +/* Userspace is not allowed to free this memory. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) + +/* Grow backing store on GPU Page Fault + */ +#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) + +/* Page coherence Outer shareable, if available + */ +#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) + +/* Page coherence Inner shareable + */ +#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) + +/* IN/OUT */ +/* Should be cached on the CPU, returned if actually cached + */ +#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) + +/* IN/OUT */ +/* Must have same VA on both the GPU and the CPU + */ +#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) + +/* OUT */ +/* Must call mmap to acquire a GPU address for the allocation + */ +#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) + +/* IN */ +/* Page coherence Outer shareable, required. 
+ */ +#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) + +/* Protected memory + */ +#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) + +/* Not needed physical memory + */ +#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) + +/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the + * addresses to be the same + */ +#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) + +/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu + * mode. Some components within the GPU might only be able to access memory + * that is GPU cacheable. Refer to the specific GPU implementation for more + * details. The 3 shareability flags will be ignored for GPU uncached memory. + * If used while importing USER_BUFFER type memory, then the import will fail + * if the memory is not aligned to GPU and CPU cache line width. + */ +#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) + +/* + * Bits [22:25] for group_id (0~15). + * + * base_mem_group_id_set() should be used to pack a memory group ID into a + * base_mem_alloc_flags value instead of accessing the bits directly. + * base_mem_group_id_get() should be used to extract the memory group ID from + * a base_mem_alloc_flags value. + */ +#define BASEP_MEM_GROUP_ID_SHIFT 22 +#define BASE_MEM_GROUP_ID_MASK ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) + +/* Must do CPU cache maintenance when imported memory is mapped/unmapped + * on GPU. Currently applicable to dma-buf type only. + */ +#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) + +/* OUT */ +/* Kernel side cache sync ops required */ +#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) + +/* Number of bits used as flags for base memory management + * + * Must be kept in sync with the base_mem_alloc_flags flags + */ +#define BASE_MEM_FLAGS_NR_BITS 30 + +/* A mask for all output bits, excluding IN/OUT bits. + */ +#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP + +/* A mask for all input bits, including IN/OUT bits. + */ +#define BASE_MEM_FLAGS_INPUT_MASK \ + (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) + +/* Special base mem handles. + */ +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) +/* reserved handles ..-47< for future special handles */ +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_FIRST_FREE_ADDRESS ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) + +/* Flags to pass to ::base_context_init. + * Flags can be ORed together to enable multiple things. + * + * These share the same space as BASEP_CONTEXT_FLAG_*, and so must + * not collide with them. + */ +typedef __u32 base_context_create_flags; + +/* Flags for base context */ + +/* No flags set */ +#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) + +/* Base context is embedded in a cctx object (flag used for CINSTR + * software counter macros) + */ +#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) + +/* Base context is a 'System Monitor' context for Hardware counters. + * + * One important side effect of this is that job submission is disabled. 
+ */ +#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED ((base_context_create_flags)1 << 1) + +/* Bit-shift used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) + +/* Bitmask used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ + ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* Bitpattern describing the base_context_create_flags that can be + * passed to the kernel + */ +#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ + (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | BASEP_CONTEXT_MMU_GROUP_ID_MASK) + +/* Flags for base tracepoint + */ + +/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, + * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) + */ +#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) + +/* Indicate that job dumping is enabled. This could affect certain timers + * to account for the performance impact. + */ +#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) + +#endif /* _UAPI_BASE_COMMON_KERNEL_H_ */ diff --git a/src/panfrost/base/include/mali_base_kernel.h b/src/panfrost/base/include/mali_base_kernel.h new file mode 100644 index 00000000000..3d826c720b2 --- /dev/null +++ b/src/panfrost/base/include/mali_base_kernel.h @@ -0,0 +1,700 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * Base structures shared with the kernel. + */ + +#ifndef _UAPI_BASE_KERNEL_H_ +#define _UAPI_BASE_KERNEL_H_ + +#include +#include "mali_base_common_kernel.h" + +#define BASE_MAX_COHERENT_GROUPS 16 + +#if defined(PAGE_MASK) && defined(PAGE_SHIFT) +#define LOCAL_PAGE_SHIFT PAGE_SHIFT +#define LOCAL_PAGE_LSB ~PAGE_MASK +#else +#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 +#endif + +#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) +#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) +#else +#error Failed to find page size +#endif +#endif + +/* Physical memory group ID for normal usage. + */ +#define BASE_MEM_GROUP_DEFAULT (0) + +/* Number of physical memory groups. + */ +#define BASE_MEM_GROUP_COUNT (16) + +/** + * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. + * + * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator + * in order to determine the best cache policy. Some combinations are + * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), + * which defines a write-only region on the CPU side, which is + * heavily read by the CPU... + * Other flags are only meaningful to a particular allocator. 
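+ *
+ * As a loose example, a buffer that is read and written by both CPU and
+ * GPU, with a shared virtual address, might be requested with:
+ *
+ *   base_mem_alloc_flags flags =
+ *           BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR |
+ *           BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |
+ *           BASE_MEM_SAME_VA;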
+ * More flags can be added to this list, as long as they don't clash + * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). + */ +typedef __u32 base_mem_alloc_flags; + +/* A mask for all the flags which are modifiable via the base_mem_set_flags + * interface. + */ +#define BASE_MEM_FLAGS_MODIFIABLE \ + (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ + BASE_MEM_COHERENT_LOCAL) + +/* A mask of all the flags that can be returned via the base_mem_get_flags() + * interface. + */ +#define BASE_MEM_FLAGS_QUERYABLE \ + (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ + BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ + BASEP_MEM_FLAGS_KERNEL_ONLY)) + +/** + * enum base_mem_import_type - Memory types supported by @a base_mem_import + * + * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type + * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) + * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a + * base_mem_import_user_buffer + * + * Each type defines what the supported handle type is. + * + * If any new type is added here ARM must be contacted + * to allocate a numeric value for it. + * Do not just add a new type without synchronizing with ARM + * as future releases from ARM might include other new types + * which could clash with your custom types. + */ +enum base_mem_import_type { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + /* + * Import type with value 1 is deprecated. + */ + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 +}; + +/** + * struct base_mem_import_user_buffer - Handle of an imported user buffer + * + * @ptr: address of imported user buffer + * @length: length of imported user buffer in bytes + * + * This structure is used to represent a handle of an imported user buffer. + */ + +struct base_mem_import_user_buffer { + __u64 ptr; + __u64 length; +}; + +/* Mask to detect 4GB boundary alignment */ +#define BASE_MEM_MASK_4GB 0xfffff000UL +/* Mask to detect 4GB boundary (in page units) alignment */ +#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) + +/* Limit on the 'extension' parameter for an allocation with the + * BASE_MEM_TILER_ALIGN_TOP flag set + * + * This is the same as the maximum limit for a Buffer Descriptor's chunk size + */ +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ + (21u - (LOCAL_PAGE_SHIFT)) +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ + (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) + +/* Bit mask of cookies used for memory allocation setup */ +#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ + +/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ +#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ + +/* + * struct base_fence - Cross-device synchronisation fence. + * + * A fence is used to signal when the GPU has finished accessing a resource that + * may be shared with other devices, and also to delay work done asynchronously + * by the GPU until other devices have finished accessing a shared resource. + */ +struct base_fence { + struct { + int fd; + int stream_fd; + } basep; +}; + +/** + * struct base_mem_aliasing_info - Memory aliasing info + * + * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * @offset: Offset within the handle to start aliasing from, in pages. + * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. + * @length: Length to alias, in pages. 
For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * specifies the number of times the special page is needed. + * + * Describes a memory handle to be aliased. + * A subset of the handle can be chosen for aliasing, given an offset and a + * length. + * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a + * region where a special page is mapped with a write-alloc cache setup, + * typically used when the write result of the GPU isn't needed, but the GPU + * must write anyway. + * + * Offset and length are specified in pages. + * Offset must be within the size of the handle. + * Offset+length must not overrun the size of the handle. + */ +struct base_mem_aliasing_info { + struct base_mem_handle handle; + __u64 offset; + __u64 length; +}; + +/* Maximum percentage of just-in-time memory allocation trimming to perform + * on free. + */ +#define BASE_JIT_MAX_TRIM_LEVEL (100) + +/* Maximum number of concurrent just-in-time memory allocations. + */ +#define BASE_JIT_ALLOC_COUNT (255) + +/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 + * + * jit_version is 1 + * + * Due to the lack of padding specified, user clients between 32 and 64-bit + * may have assumed a different size of the struct + * + * An array of structures was not supported + */ +struct base_jit_alloc_info_10_2 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; +}; + +/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up + * to 11.19 + * + * This structure had a number of modifications during and after kernel driver + * version 11.5, but remains size-compatible throughout its version history, and + * with earlier variants compatible with future variants by requiring + * zero-initialization to the unused space in the structure. + * + * jit_version is 2 + * + * Kernel driver version history: + * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes + * must be zero. Kbase minor version was not incremented, so some + * versions of 11.5 do not have this change. + * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase + * minor version not incremented) + * 11.6: Added 'flags', replacing 1 padding byte + * 11.10: Arrays of this structure are supported + */ +struct base_jit_alloc_info_11_5 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; +}; + +/** + * struct base_jit_alloc_info - Structure which describes a JIT allocation + * request. + * @gpu_alloc_addr: The GPU virtual address to write the JIT + * allocated GPU virtual address to. + * @va_pages: The minimum number of virtual pages required. + * @commit_pages: The minimum number of physical pages which + * should back the allocation. + * @extension: Granularity of physical pages to grow the + * allocation by during a fault. + * @id: Unique ID provided by the caller, this is used + * to pair allocation and free requests. + * Zero is not a valid value. + * @bin_id: The JIT allocation bin, used in conjunction with + * @max_allocations to limit the number of each + * type of JIT allocation. + * @max_allocations: The maximum number of allocations allowed within + * the bin specified by @bin_id. Should be the same + * for all allocations within the same bin. 
+ * @flags: flags specifying the special requirements for + * the JIT allocation, see + * %BASE_JIT_ALLOC_VALID_FLAGS + * @padding: Expansion space - should be initialised to zero + * @usage_id: A hint about which allocation should be reused. + * The kernel should attempt to use a previous + * allocation with the same usage_id + * @heap_info_gpu_addr: Pointer to an object in GPU memory describing + * the actual usage of the region. + * + * jit_version is 3. + * + * When modifications are made to this structure, it is still compatible with + * jit_version 3 when: a) the size is unchanged, and b) new members only + * replace the padding bytes. + * + * Previous jit_version history: + * jit_version == 1, refer to &base_jit_alloc_info_10_2 + * jit_version == 2, refer to &base_jit_alloc_info_11_5 + * + * Kbase version history: + * 11.20: added @heap_info_gpu_addr + */ +struct base_jit_alloc_info { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; + __u64 heap_info_gpu_addr; +}; + +enum base_external_resource_access { + BASE_EXT_RES_ACCESS_SHARED, + BASE_EXT_RES_ACCESS_EXCLUSIVE +}; + +struct base_external_resource { + __u64 ext_resource; +}; + +/** + * BASE_EXT_RES_COUNT_MAX - The maximum number of external resources + * which can be mapped/unmapped in a single request. + */ +#define BASE_EXT_RES_COUNT_MAX 10 + +/** + * struct base_external_resource_list - Structure which describes a list of + * external resources. + * @count: The number of resources. + * @ext_res: Array of external resources which is + * sized at allocation time. + */ +struct base_external_resource_list { + __u64 count; + struct base_external_resource ext_res[1]; +}; + +struct base_jd_debug_copy_buffer { + __u64 address; + __u64 size; + struct base_external_resource extres; +}; + +#define GPU_MAX_JOB_SLOTS 16 + +/** + * DOC: User-side Base GPU Property Queries + * + * The User-side Base GPU Property Query interface encapsulates two + * sub-modules: + * + * - "Dynamic GPU Properties" + * - "Base Platform Config GPU Properties" + * + * Base only deals with properties that vary between different GPU + * implementations - the Dynamic GPU properties and the Platform Config + * properties. + * + * For properties that are constant for the GPU Architecture, refer to the + * GPU module. However, we will discuss their relevance here just to + * provide background information. + * + * About the GPU Properties in Base and GPU modules + * + * The compile-time properties (Platform Config, GPU Compile-time + * properties) are exposed as pre-processor macros. + * + * Complementing the compile-time properties are the Dynamic GPU + * Properties, which act as a conduit for the GPU Configuration + * Discovery. + * + * In general, the dynamic properties are present to verify that the platform + * has been configured correctly with the right set of Platform Config + * Compile-time Properties. + * + * As a consistent guide across the entire DDK, the choice for dynamic or + * compile-time should consider the following, in order: + * 1. Can the code be written so that it doesn't need to know the + * implementation limits at all? + * 2. If you need the limits, get the information from the Dynamic Property + * lookup. This should be done once as you fetch the context, and then cached + * as part of the context data structure, so it's cheap to access. + * 3. 
If there's a clear and arguable inefficiency in using Dynamic Properties, + * then use a Compile-Time Property (Platform Config, or GPU Compile-time + * property). Examples of where this might be sensible follow: + * - Part of a critical inner-loop + * - Frequent re-use throughout the driver, causing significant extra load + * instructions or control flow that would be worthwhile optimizing out. + * + * We cannot provide an exhaustive set of examples, neither can we provide a + * rule for every possible situation. Use common sense, and think about: what + * the rest of the driver will be doing; how the compiler might represent the + * value if it is a compile-time constant; whether an OEM shipping multiple + * devices would benefit much more from a single DDK binary, instead of + * insignificant micro-optimizations. + * + * Dynamic GPU Properties + * + * Dynamic GPU properties are presented in two sets: + * 1. the commonly used properties in @ref base_gpu_props, which have been + * unpacked from GPU register bitfields. + * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props + * (also a member of base_gpu_props). All of these are presented in + * the packed form, as presented by the GPU registers themselves. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + * The properties returned extend the GPU Configuration Discovery + * registers. For example, GPU clock speed is not specified in the GPU + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. + * + * The GPU properties are obtained by a call to + * base_get_gpu_props(). This simply returns a pointer to a const + * base_gpu_props structure. It is constant for the life of a base + * context. Multiple calls to base_get_gpu_props() to a base context + * return the same pointer to a constant structure. This avoids cache pollution + * of the common data. + * + * This pointer must not be freed, because it does not point to the start of a + * region allocated by the memory allocator; instead, just close the @ref + * base_context. + * + * + * Kernel Operation + * + * During Base Context Create time, user-side makes a single kernel call: + * - A call to fill user memory with GPU information structures + * + * The kernel-side will fill the provided the entire processed base_gpu_props + * structure, because this information is required in both + * user and kernel side; it does not make sense to decode it twice. + * + * Coherency groups must be derived from the bitmasks, but this can be done + * kernel side, and just once at kernel startup: Coherency groups must already + * be known kernel-side, to support chains that specify a 'Only Coherent Group' + * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. + * + * Coherency Group calculation + * + * Creation of the coherent group data is done at device-driver startup, and so + * is one-time. This will most likely involve a loop with CLZ, shifting, and + * bit clearing on the L2_PRESENT mask, depending on whether the + * system is L2 Coherent. The number of shader cores is done by a + * population count, since faulty cores may be disabled during production, + * producing a non-contiguous mask. 
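+ *
+ * As an illustrative sketch only (editorial addition, not part of the
+ * original header), the per-group masks could be derived roughly as below,
+ * shown with ctz/popcount builtins; 'raw' points at the raw register copy,
+ * and cores_for_l2() / record_group() are hypothetical stand-in helpers:
+ *
+ *   __u64 l2 = raw->l2_present;
+ *   while (l2) {
+ *           unsigned int slice = __builtin_ctzll(l2);
+ *           __u64 group_mask = raw->shader_present & cores_for_l2(slice);
+ *           unsigned int num_cores = __builtin_popcountll(group_mask);
+ *           record_group(group_mask, num_cores);
+ *           l2 &= l2 - 1;    // clear the slice that was just handled
+ *   }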
+ * + * The memory requirements for this algorithm can be determined either by a __u64 + * population count on the L2_PRESENT mask (a LUT helper already is + * required for the above), or simple assumption that there can be no more than + * 16 coherent groups, since core groups are typically 4 cores. + */ + +/* + * More information is possible - but associativity and bus width are not + * required by upper-level apis. + */ +struct mali_base_gpu_l2_cache_props { + __u8 log2_line_size; + __u8 log2_cache_size; + __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + __u8 padding[5]; +}; + +struct mali_base_gpu_tiler_props { + __u32 bin_size_bytes; /* Max is 4*2^15 */ + __u32 max_active_levels; /* Max is 2^15 */ +}; + +/** + * struct mali_base_gpu_thread_props - GPU threading system details. + * @max_threads: Max. number of threads per core + * @max_workgroup_size: Max. number of threads per workgroup + * @max_barrier_size: Max. number of threads that can synchronize on a + * simple barrier + * @max_registers: Total size [1..65535] of the register file available + * per core. + * @max_task_queue: Max. tasks [1..255] which may be sent to a core + * before it becomes blocked. + * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split + * field. + * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, + * 3 = SW Model/Emulation + * @padding: padding to align to 8-byte + * @tls_alloc: Number of threads per core that TLS must be + * allocated for + */ +struct mali_base_gpu_thread_props { + __u32 max_threads; + __u32 max_workgroup_size; + __u32 max_barrier_size; + __u16 max_registers; + __u8 max_task_queue; + __u8 max_thread_group_split; + __u8 impl_tech; + __u8 padding[3]; + __u32 tls_alloc; +}; + +/** + * struct mali_base_gpu_coherent_group - descriptor for a coherent group + * @core_mask: Core restriction mask required for the group + * @num_cores: Number of cores in the group + * @padding: padding to align to 8-byte + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_base_gpu_coherent_group { + __u64 core_mask; + __u16 num_cores; + __u16 padding[3]; +}; + +/** + * struct mali_base_gpu_coherent_group_info - Coherency group information + * @num_groups: Number of coherent groups in the GPU. + * @num_core_groups: Number of core groups (coherent or not) in the GPU. + * Equivalent to the number of L2 Caches. + * The GPU Counter dumping writes 2048 bytes per core group, + * regardless of whether the core groups are coherent or not. + * Hence this member is needed to calculate how much memory + * is required for dumping. + * @note Do not use it to work out how many valid elements + * are in the group[] member. Use num_groups instead. + * @coherency: Coherency features of the memory, accessed by gpu_mem_features + * methods + * @padding: padding to align to 8-byte + * @group: Descriptors of coherent groups + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. 
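+ *
+ * Illustrative use (editorial sketch, not part of the original header),
+ * where process_group() stands in for the caller's per-group handling:
+ *
+ *   size_t dump_bytes = info->num_core_groups * 2048;
+ *   for (__u32 i = 0; i < info->num_groups; i++)
+ *           process_group(info->group[i].core_mask,
+ *                         info->group[i].num_cores);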
+ */ +struct mali_base_gpu_coherent_group_info { + __u32 num_groups; + __u32 num_core_groups; + __u32 coherency; + __u32 padding; + struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; +}; + +#if MALI_USE_CSF +#include "csf/mali_base_csf_kernel.h" +#else +#include "jm/mali_base_jm_kernel.h" +#endif + +/** + * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware + * Configuration Discovery registers. + * @shader_present: Shader core present bitmap + * @tiler_present: Tiler core present bitmap + * @l2_present: Level 2 cache present bitmap + * @stack_present: Core stack present bitmap + * @l2_features: L2 features + * @core_features: Core features + * @mem_features: Mem features + * @mmu_features: Mmu features + * @as_present: Bitmap of address spaces present + * @js_present: Job slots present + * @js_features: Array of job slot features. + * @tiler_features: Tiler features + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU + * @gpu_id: GPU and revision identifier + * @thread_max_threads: Maximum number of threads per core + * @thread_max_workgroup_size: Maximum number of threads per workgroup + * @thread_max_barrier_size: Maximum number of threads per barrier + * @thread_features: Thread features + * @coherency_mode: Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register + * @thread_tls_alloc: Number of threads per core that TLS must be allocated for + * @gpu_features: GPU features + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct gpu_raw_gpu_props { + __u64 shader_present; + __u64 tiler_present; + __u64 l2_present; + __u64 stack_present; + __u32 l2_features; + __u32 core_features; + __u32 mem_features; + __u32 mmu_features; + + __u32 as_present; + + __u32 js_present; + __u32 js_features[GPU_MAX_JOB_SLOTS]; + __u32 tiler_features; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + __u32 gpu_id; + + __u32 thread_max_threads; + __u32 thread_max_workgroup_size; + __u32 thread_max_barrier_size; + __u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + __u32 coherency_mode; + + __u32 thread_tls_alloc; + __u64 gpu_features; +}; + +/** + * struct base_gpu_props - Return structure for base_get_gpu_props(). + * @core_props: Core props. + * @l2_props: L2 props. + * @unused_1: Keep for backwards compatibility. + * @tiler_props: Tiler props. + * @thread_props: Thread props. + * @raw_props: This member is large, likely to be 128 bytes. + * @coherency_info: This must be last member of the structure. + * + * NOTE: the raw_props member in this data structure contains the register + * values from which the value of the other members are derived. The derived + * members exist to allow for efficient access and/or shielding the details + * of the layout of the registers. 
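+ *
+ * For example (editorial note, not part of the original header), a client
+ * would normally read the unpacked props->l2_props.log2_line_size rather
+ * than re-extracting that field from the packed register copy in
+ * props->raw_props.l2_features.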
+ */ +struct base_gpu_props { + struct mali_base_gpu_core_props core_props; + struct mali_base_gpu_l2_cache_props l2_props; + __u64 unused_1; + struct mali_base_gpu_tiler_props tiler_props; + struct mali_base_gpu_thread_props thread_props; + struct gpu_raw_gpu_props raw_props; + struct mali_base_gpu_coherent_group_info coherency_info; +}; + +#define BASE_MEM_GROUP_ID_GET(flags) \ + ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) + +#define BASE_MEM_GROUP_ID_SET(id) \ + (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? \ + BASE_MEM_GROUP_DEFAULT : \ + id) \ + << BASEP_MEM_GROUP_ID_SHIFT) & \ + BASE_MEM_GROUP_ID_MASK) + +#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ + (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ + ((base_context_create_flags)(group_id) \ + << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) + +#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ + ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ + BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* + * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These + * flags are also used, where applicable, for specifying which fields + * are valid following the request operation. + */ + +/* For monotonic (counter) timefield */ +#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) +/* For system wide timestamp */ +#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) +/* For GPU cycle counter */ +#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) +/* Specify kernel GPU register timestamp */ +#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) +/* Specify userspace cntvct_el0 timestamp source */ +#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) + +#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ + BASE_TIMEINFO_MONOTONIC_FLAG | \ + BASE_TIMEINFO_TIMESTAMP_FLAG | \ + BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ + BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ + BASE_TIMEINFO_USER_SOURCE_FLAG) + +/* Maximum number of source allocations allowed to create an alias allocation. + * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array + * layers, since each cube map in the array will have 6 faces. + */ +#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) + +#endif /* _UAPI_BASE_KERNEL_H_ */ diff --git a/src/panfrost/base/include/mali_kbase_gpuprops.h b/src/panfrost/base/include/mali_kbase_gpuprops.h new file mode 100644 index 00000000000..b250feca022 --- /dev/null +++ b/src/panfrost/base/include/mali_kbase_gpuprops.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + */ + +#ifndef _UAPI_KBASE_GPUPROP_H_ +#define _UAPI_KBASE_GPUPROP_H_ + +/********************************** + * Definitions for GPU properties * + **********************************/ +#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) +#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) +#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) +#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) + +#define KBASE_GPUPROP_PRODUCT_ID 1 +#define KBASE_GPUPROP_VERSION_STATUS 2 +#define KBASE_GPUPROP_MINOR_REVISION 3 +#define KBASE_GPUPROP_MAJOR_REVISION 4 +/* 5 previously used for GPU speed */ +#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 +/* 7 previously used for minimum GPU speed */ +#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 +#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 +#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 +#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 +#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 + +#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 +#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 +#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 + +#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 +#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 + +#define KBASE_GPUPROP_MAX_THREADS 18 +#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 +#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 +#define KBASE_GPUPROP_MAX_REGISTERS 21 +#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 +#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 +#define KBASE_GPUPROP_IMPL_TECH 24 + +#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 +#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 +#define KBASE_GPUPROP_RAW_L2_PRESENT 27 +#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 +#define KBASE_GPUPROP_RAW_L2_FEATURES 29 +#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 +#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 +#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 +#define KBASE_GPUPROP_RAW_AS_PRESENT 33 +#define KBASE_GPUPROP_RAW_JS_PRESENT 34 +#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 +#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 +#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 +#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 +#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 +#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 +#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 +#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 +#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 +#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 +#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 +#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 +#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 +#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 +#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 +#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 +#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 +#define KBASE_GPUPROP_RAW_GPU_ID 55 +#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 +#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 +#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 +#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 +#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 + +#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 +#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 +#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 +#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 +#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 +#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 +#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 +#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 +#define KBASE_GPUPROP_COHERENCY_GROUP_5 69 +#define KBASE_GPUPROP_COHERENCY_GROUP_6 70 +#define KBASE_GPUPROP_COHERENCY_GROUP_7 71 +#define 
KBASE_GPUPROP_COHERENCY_GROUP_8 72 +#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 +#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 +#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 +#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 +#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 +#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 +#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 + +#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 + +#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 + +#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 +#define KBASE_GPUPROP_TLS_ALLOC 84 +#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 + +#endif diff --git a/src/panfrost/base/include/mali_kbase_ioctl.h b/src/panfrost/base/include/mali_kbase_ioctl.h new file mode 100644 index 00000000000..96f606af5f8 --- /dev/null +++ b/src/panfrost/base/include/mali_kbase_ioctl.h @@ -0,0 +1,759 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_IOCTL_H_ +#define _UAPI_KBASE_IOCTL_H_ + +#ifdef __cpluscplus +extern "C" { +#endif + +#include +#include + +#if MALI_USE_CSF +#include "csf/mali_kbase_csf_ioctl.h" +#else +#include "jm/mali_kbase_jm_ioctl.h" +#endif /* MALI_USE_CSF */ + +#define KBASE_IOCTL_TYPE 0x80 + +/** + * struct kbase_ioctl_set_flags - Set kernel context creation flags + * + * @create_flags: Flags - see base_context_create_flags + */ +struct kbase_ioctl_set_flags { + __u32 create_flags; +}; + +#define KBASE_IOCTL_SET_FLAGS \ + _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) + +/** + * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel + * + * @buffer: Pointer to the buffer to store properties into + * @size: Size of the buffer + * @flags: Flags - must be zero for now + * + * The ioctl will return the number of bytes stored into @buffer or an error + * on failure (e.g. @size is too small). If @size is specified as 0 then no + * data will be written but the return value will be the number of bytes needed + * for all the properties. + * + * @flags may be used in the future to request a different format for the + * buffer. With @flags == 0 the following format is used. + * + * The buffer will be filled with pairs of values, a __u32 key identifying the + * property followed by the value. The size of the value is identified using + * the bottom bits of the key. The value then immediately followed the key and + * is tightly packed (there is no padding). All keys and values are + * little-endian. 
+ * + * 00 = __u8 + * 01 = __u16 + * 10 = __u32 + * 11 = __u64 + */ +struct kbase_ioctl_get_gpuprops { + __u64 buffer; + __u32 size; + __u32 flags; +}; + +#define KBASE_IOCTL_GET_GPUPROPS \ + _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) + +/** + * union kbase_ioctl_mem_alloc - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @out: Output parameters + * @out.flags: Flags + * @out.gpu_va: The GPU virtual address which is allocated + */ +union kbase_ioctl_mem_alloc { + struct { + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u64 flags; + } in; + struct { + __u64 flags; + __u64 gpu_va; + } out; +}; + +#define KBASE_IOCTL_MEM_ALLOC \ + _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) + +/** + * struct kbase_ioctl_mem_query - Query properties of a GPU memory region + * @in: Input parameters + * @in.gpu_addr: A GPU address contained within the region + * @in.query: The type of query + * @out: Output parameters + * @out.value: The result of the query + * + * Use a %KBASE_MEM_QUERY_xxx flag as input for @query. + */ +union kbase_ioctl_mem_query { + struct { + __u64 gpu_addr; + __u64 query; + } in; + struct { + __u64 value; + } out; +}; + +#define KBASE_IOCTL_MEM_QUERY \ + _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) + +#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) +#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) +#define KBASE_MEM_QUERY_FLAGS ((__u64)3) + +/** + * struct kbase_ioctl_mem_free - Free a memory region + * @gpu_addr: Handle to the region to free + */ +struct kbase_ioctl_mem_free { + __u64 gpu_addr; +}; + +#define KBASE_IOCTL_MEM_FREE \ + _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @buffer_count: requested number of dumping buffers + * @fe_bm: counters selection bitmask (Front end) + * @shader_bm: counters selection bitmask (Shader) + * @tiler_bm: counters selection bitmask (Tiler) + * @mmu_l2_bm: counters selection bitmask (MMU_L2) + * + * A fd is returned from the ioctl if successful, or a negative value on error + */ +struct kbase_ioctl_hwcnt_reader_setup { + __u32 buffer_count; + __u32 fe_bm; + __u32 shader_bm; + __u32 tiler_bm; + __u32 mmu_l2_bm; +}; + +#define KBASE_IOCTL_HWCNT_READER_SETUP \ + _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) + +/** + * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. + * @data: Counter samples for the dummy model. + * @size: Size of the counter sample data. + * @padding: Padding. 
+ */ +struct kbase_ioctl_hwcnt_values { + __u64 data; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_HWCNT_SET \ + _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) + +/** + * struct kbase_ioctl_disjoint_query - Query the disjoint counter + * @counter: A counter of disjoint events in the kernel + */ +struct kbase_ioctl_disjoint_query { + __u32 counter; +}; + +#define KBASE_IOCTL_DISJOINT_QUERY \ + _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) + +/** + * struct kbase_ioctl_get_ddk_version - Query the kernel version + * @version_buffer: Buffer to receive the kernel version string + * @size: Size of the buffer + * @padding: Padding + * + * The ioctl will return the number of bytes written into version_buffer + * (which includes a NULL byte) or a negative error code + * + * The ioctl request code has to be _IOW because the data in ioctl struct is + * being copied to the kernel, even though the kernel then writes out the + * version info to the buffer specified in the ioctl. + */ +struct kbase_ioctl_get_ddk_version { + __u64 version_buffer; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_GET_DDK_VERSION \ + _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) + +/** + * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 10.2--11.4) + * @va_pages: Number of VA pages to reserve for JIT + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_10_2 { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) + +/** + * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 11.5--11.19) + * @va_pages: Number of VA pages to reserve for JIT + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_11_5 { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) + +/** + * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory + * allocator + * @va_pages: Number of GPU virtual address pages to reserve for just-in-time + * memory allocations + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * @phys_pages: Maximum number of physical pages to allocate just-in-time + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. 
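+ *
+ * Illustrative call (editorial sketch, not part of the original header);
+ * 'fd' is assumed to be an already-open kbase device fd and the numeric
+ * values are arbitrary examples:
+ *
+ *   struct kbase_ioctl_mem_jit_init jit = {
+ *           .va_pages = 65536,
+ *           .max_allocations = 255,
+ *           .trim_level = 0,
+ *           .group_id = 0,
+ *           .phys_pages = 65536,
+ *   };
+ *   if (ioctl(fd, KBASE_IOCTL_MEM_JIT_INIT, &jit) < 0)
+ *           perror("KBASE_IOCTL_MEM_JIT_INIT");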
+ */ +struct kbase_ioctl_mem_jit_init { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; + __u64 phys_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) + +/** + * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory + * + * @handle: GPU memory handle (GPU VA) + * @user_addr: The address where it is mapped in user space + * @size: The number of bytes to synchronise + * @type: The direction to synchronise: 0 is sync to memory (clean), + * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_mem_sync { + __u64 handle; + __u64 user_addr; + __u64 size; + __u8 type; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_MEM_SYNC \ + _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) + +/** + * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer + * + * @in: Input parameters + * @in.gpu_addr: The GPU address of the memory region + * @in.cpu_addr: The CPU address to locate + * @in.size: A size in bytes to validate is contained within the region + * @out: Output parameters + * @out.offset: The offset from the start of the memory region to @cpu_addr + */ +union kbase_ioctl_mem_find_cpu_offset { + struct { + __u64 gpu_addr; + __u64 cpu_addr; + __u64 size; + } in; + struct { + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) + +/** + * struct kbase_ioctl_get_context_id - Get the kernel context ID + * + * @id: The kernel context ID + */ +struct kbase_ioctl_get_context_id { + __u32 id; +}; + +#define KBASE_IOCTL_GET_CONTEXT_ID \ + _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) + +/** + * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd + * + * @flags: Flags + * + * The ioctl returns a file descriptor when successful + */ +struct kbase_ioctl_tlstream_acquire { + __u32 flags; +}; + +#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ + _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) + +#define KBASE_IOCTL_TLSTREAM_FLUSH \ + _IO(KBASE_IOCTL_TYPE, 19) + +/** + * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region + * + * @gpu_addr: The memory region to modify + * @pages: The number of physical pages that should be present + * + * The ioctl may return on the following error codes or 0 for success: + * -ENOMEM: Out of memory + * -EINVAL: Invalid arguments + */ +struct kbase_ioctl_mem_commit { + __u64 gpu_addr; + __u64 pages; +}; + +#define KBASE_IOCTL_MEM_COMMIT \ + _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) + +/** + * union kbase_ioctl_mem_alias - Create an alias of memory regions + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.stride: Bytes between start of each memory region + * @in.nents: The number of regions to pack together into the alias + * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_alias { + struct { + __u64 flags; + __u64 stride; + __u64 nents; + __u64 aliasing_info; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_ALIAS \ + _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) + +/** + * union 
kbase_ioctl_mem_import - Import memory for use by the GPU + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.phandle: Handle to the external memory + * @in.type: Type of external memory, see base_mem_import_type + * @in.padding: Amount of extra VA pages to append to the imported buffer + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_import { + struct { + __u64 flags; + __u64 phandle; + __u32 type; + __u32 padding; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_IMPORT \ + _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) + +/** + * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region + * @gpu_va: The GPU region to modify + * @flags: The new flags to set + * @mask: Mask of the flags to modify + */ +struct kbase_ioctl_mem_flags_change { + __u64 gpu_va; + __u64 flags; + __u64 mask; +}; + +#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ + _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) + +/** + * struct kbase_ioctl_stream_create - Create a synchronisation stream + * @name: A name to identify this stream. Must be NULL-terminated. + * + * Note that this is also called a "timeline", but is named stream to avoid + * confusion with other uses of the word. + * + * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. + * + * The ioctl returns a file descriptor. + */ +struct kbase_ioctl_stream_create { + char name[32]; +}; + +#define KBASE_IOCTL_STREAM_CREATE \ + _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) + +/** + * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence + * @fd: The file descriptor to validate + */ +struct kbase_ioctl_fence_validate { + int fd; +}; + +#define KBASE_IOCTL_FENCE_VALIDATE \ + _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) + +/** + * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel + * @buffer: Pointer to the information + * @len: Length + * @padding: Padding + * + * The data provided is accessible through a debugfs file + */ +struct kbase_ioctl_mem_profile_add { + __u64 buffer; + __u32 len; + __u32 padding; +}; + +#define KBASE_IOCTL_MEM_PROFILE_ADD \ + _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) + +/** + * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to map + */ +struct kbase_ioctl_sticky_resource_map { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ + _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) + +/** + * struct kbase_ioctl_sticky_resource_unmap - Unmap a resource mapped which was + * previously permanently mapped + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to unmap + */ +struct kbase_ioctl_sticky_resource_unmap { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ + _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) + +/** + * union kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of + * the GPU memory region for + * the given gpu address and + * the offset of that address + * into the region + * @in: Input parameters + * @in.gpu_addr: GPU virtual address + * @in.size: Size in bytes within the 
region + * @out: Output parameters + * @out.start: Address of the beginning of the memory region enclosing @gpu_addr + * for the length of @offset bytes + * @out.offset: The offset from the start of the memory region to @gpu_addr + */ +union kbase_ioctl_mem_find_gpu_start_and_offset { + struct { + __u64 gpu_addr; + __u64 size; + } in; + struct { + __u64 start; + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) + +#define KBASE_IOCTL_CINSTR_GWT_START \ + _IO(KBASE_IOCTL_TYPE, 33) + +#define KBASE_IOCTL_CINSTR_GWT_STOP \ + _IO(KBASE_IOCTL_TYPE, 34) + +/** + * union kbase_ioctl_cinstr_gwt_dump - Used to collect all GPU write fault + * addresses. + * @in: Input parameters + * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. + * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) + * @in.len: Number of addresses the buffers can hold. + * @in.padding: padding + * @out: Output parameters + * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. + * @out.more_data_available: Status indicating if more addresses are available. + * @out.padding: padding + * + * This structure is used when performing a call to dump GPU write fault + * addresses. + */ +union kbase_ioctl_cinstr_gwt_dump { + struct { + __u64 addr_buffer; + __u64 size_buffer; + __u32 len; + __u32 padding; + + } in; + struct { + __u32 no_of_addr_collected; + __u8 more_data_available; + __u8 padding[27]; + } out; +}; + +#define KBASE_IOCTL_CINSTR_GWT_DUMP \ + _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) + +/** + * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone + * + * @va_pages: Number of VA pages to reserve for EXEC_VA + */ +struct kbase_ioctl_mem_exec_init { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_EXEC_INIT \ + _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) + +/** + * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of + * cpu/gpu time (counter values) + * @in: Input parameters + * @in.request_flags: Bit-flags indicating the requested types. + * @in.paddings: Unused, size alignment matching the out. + * @out: Output parameters + * @out.sec: Integer field of the monotonic time, unit in seconds. + * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. + * @out.padding: Unused, for __u64 alignment + * @out.timestamp: System wide timestamp (counter) value. + * @out.cycle_counter: GPU cycle counter value. + */ +union kbase_ioctl_get_cpu_gpu_timeinfo { + struct { + __u32 request_flags; + __u32 paddings[7]; + } in; + struct { + __u64 sec; + __u32 nsec; + __u32 padding; + __u64 timestamp; + __u64 cycle_counter; + } out; +}; + +#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ + _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) + +/** + * struct kbase_ioctl_context_priority_check - Check the max possible priority + * @priority: Input priority & output priority + */ + +struct kbase_ioctl_context_priority_check { + __u8 priority; +}; + +#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) + +/** + * struct kbase_ioctl_set_limited_core_count - Set the limited core count. 
+ * + * @max_core_count: Maximum core count + */ +struct kbase_ioctl_set_limited_core_count { + __u8 max_core_count; +}; + +#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ + _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) + +/** + * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter + * information + * @info_item_size: Performance counter item size in bytes. + * @info_item_count: Performance counter item count in the info_list_ptr. + * @info_list_ptr: Performance counter item list pointer which points to a + * list with info_item_count of items. + * + * On success: returns info_item_size and info_item_count if info_list_ptr is + * NULL, returns performance counter information if info_list_ptr is not NULL. + * On error: returns a negative error code. + */ +struct kbase_ioctl_kinstr_prfcnt_enum_info { + __u32 info_item_size; + __u32 info_item_count; + __u64 info_list_ptr; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ + _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) + +/** + * struct kbase_ioctl_kinstr_prfcnt_setup - Setup HWC dumper/reader + * @in: input parameters. + * @in.request_item_count: Number of requests in the requests array. + * @in.request_item_size: Size in bytes of each request in the requests array. + * @in.requests_ptr: Pointer to the requests array. + * @out: output parameters. + * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for + * each sample. + * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap + * for reading performance counter samples. + * + * A fd is returned from the ioctl if successful, or a negative value on error. + */ +union kbase_ioctl_kinstr_prfcnt_setup { + struct { + __u32 request_item_count; + __u32 request_item_size; + __u64 requests_ptr; + } in; + struct { + __u32 prfcnt_metadata_item_size; + __u32 prfcnt_mmap_size_bytes; + } out; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ + _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) + + +/** + * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes + * @bytes_collected: number of bytes read by user + * @bytes_generated: number of bytes generated by tracepoints + */ +struct kbase_ioctl_tlstream_stats { + __u32 bytes_collected; + __u32 bytes_generated; +}; + +#define KBASE_IOCTL_TLSTREAM_STATS \ + _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) + +#endif /* MALI_UNIT_TEST */ + +/* Customer extension range */ +#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) + +/* If the integration needs extra ioctl add them there + * like this: + * + * struct my_ioctl_args { + * .... 
+ * } + * + * #define KBASE_IOCTL_MY_IOCTL \ + * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) + */ + +#ifdef __cpluscplus +} +#endif + +#endif /* _UAPI_KBASE_IOCTL_H_ */ diff --git a/src/panfrost/base/include/old/mali-ioctl-midgard.h b/src/panfrost/base/include/old/mali-ioctl-midgard.h new file mode 100644 index 00000000000..5f33f5c4c4b --- /dev/null +++ b/src/panfrost/base/include/old/mali-ioctl-midgard.h @@ -0,0 +1,80 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef __KBASE_IOCTL_MIDGARD_H__ +#define __KBASE_IOCTL_MIDGARD_H__ + +#define KBASE_IOCTL_TYPE_BASE 0x80 +#define KBASE_IOCTL_TYPE_MAX 0x82 + +union kbase_ioctl_mem_alloc { + struct { + union kbase_ioctl_header header; + u64 va_pages; + u64 commit_pages; + u64 extension; + u64 flags; + } in; + struct { + union kbase_ioctl_header header; + u64 pad[3]; + u64 flags; + mali_ptr gpu_va; + u16 va_alignment; + } out; + u64 pad[7]; +} __attribute__((packed)); + +#define KBASE_IOCTL_TYPE_COUNT (KBASE_IOCTL_TYPE_MAX - KBASE_IOCTL_TYPE_BASE + 1) + +#define KBASE_IOCTL_GET_VERSION (_IOWR(0x80, 0, struct kbase_ioctl_get_version)) +#define KBASE_IOCTL_MEM_ALLOC (_IOWR(0x82, 0, union kbase_ioctl_mem_alloc)) +#define KBASE_IOCTL_MEM_IMPORT (_IOWR(0x82, 1, union kbase_ioctl_mem_import)) +#define KBASE_IOCTL_MEM_COMMIT (_IOWR(0x82, 2, struct kbase_ioctl_mem_commit)) +#define KBASE_IOCTL_MEM_QUERY (_IOWR(0x82, 3, struct kbase_ioctl_mem_query)) +#define KBASE_IOCTL_MEM_FREE (_IOWR(0x82, 4, struct kbase_ioctl_mem_free)) +#define KBASE_IOCTL_MEM_FLAGS_CHANGE (_IOWR(0x82, 5, struct kbase_ioctl_mem_flags_change)) +#define KBASE_IOCTL_MEM_ALIAS (_IOWR(0x82, 6, struct kbase_ioctl_mem_alias)) +#define KBASE_IOCTL_MEM_SYNC (_IOWR(0x82, 8, struct kbase_ioctl_mem_sync)) +#define KBASE_IOCTL_POST_TERM (_IOWR(0x82, 9, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_SETUP (_IOWR(0x82, 10, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_DUMP (_IOWR(0x82, 11, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_CLEAR (_IOWR(0x82, 12, __ioctl_placeholder)) +#define KBASE_IOCTL_GPU_PROPS_REG_DUMP (_IOWR(0x82, 14, struct kbase_ioctl_gpu_props_reg_dump)) +#define KBASE_IOCTL_FIND_CPU_OFFSET (_IOWR(0x82, 15, __ioctl_placeholder)) +#define KBASE_IOCTL_GET_VERSION_NEW (_IOWR(0x82, 16, struct kbase_ioctl_get_version)) +#define KBASE_IOCTL_SET_FLAGS (_IOWR(0x82, 18, struct kbase_ioctl_set_flags)) +#define KBASE_IOCTL_SET_TEST_DATA (_IOWR(0x82, 19, __ioctl_placeholder)) +#define KBASE_IOCTL_INJECT_ERROR (_IOWR(0x82, 20, __ioctl_placeholder)) +#define KBASE_IOCTL_MODEL_CONTROL (_IOWR(0x82, 21, __ioctl_placeholder)) +#define KBASE_IOCTL_KEEP_GPU_POWERED (_IOWR(0x82, 22, __ioctl_placeholder)) +#define KBASE_IOCTL_FENCE_VALIDATE (_IOWR(0x82, 23, __ioctl_placeholder)) +#define KBASE_IOCTL_STREAM_CREATE (_IOWR(0x82, 24, struct kbase_ioctl_stream_create)) +#define KBASE_IOCTL_GET_PROFILING_CONTROLS (_IOWR(0x82, 25, __ioctl_placeholder)) +#define KBASE_IOCTL_SET_PROFILING_CONTROLS (_IOWR(0x82, 26, __ioctl_placeholder)) +#define KBASE_IOCTL_DEBUGFS_MEM_PROFILE_ADD (_IOWR(0x82, 27, __ioctl_placeholder)) 
+#define KBASE_IOCTL_JOB_SUBMIT (_IOWR(0x82, 28, struct kbase_ioctl_job_submit)) +#define KBASE_IOCTL_DISJOINT_QUERY (_IOWR(0x82, 29, __ioctl_placeholder)) +#define KBASE_IOCTL_GET_CONTEXT_ID (_IOWR(0x82, 31, struct kbase_ioctl_get_context_id)) +#define KBASE_IOCTL_TLSTREAM_ACQUIRE_V10_4 (_IOWR(0x82, 32, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_TEST (_IOWR(0x82, 33, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_STATS (_IOWR(0x82, 34, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_FLUSH (_IOWR(0x82, 35, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_READER_SETUP (_IOWR(0x82, 36, __ioctl_placeholder)) +#define KBASE_IOCTL_SET_PRFCNT_VALUES (_IOWR(0x82, 37, __ioctl_placeholder)) +#define KBASE_IOCTL_SOFT_EVENT_UPDATE (_IOWR(0x82, 38, __ioctl_placeholder)) +#define KBASE_IOCTL_MEM_JIT_INIT (_IOWR(0x82, 39, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_ACQUIRE (_IOWR(0x82, 40, __ioctl_placeholder)) + +#endif /* __KBASE_IOCTL_MIDGARD_H__ */ diff --git a/src/panfrost/base/include/old/mali-ioctl.h b/src/panfrost/base/include/old/mali-ioctl.h new file mode 100644 index 00000000000..5c76f2dc8e5 --- /dev/null +++ b/src/panfrost/base/include/old/mali-ioctl.h @@ -0,0 +1,743 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +/** + * Definitions for all of the ioctls for the original open source bifrost GPU + * kernel driver, written by ARM. + */ + +#ifndef __KBASE_IOCTL_H__ +#define __KBASE_IOCTL_H__ + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int32_t s32; +typedef int64_t s64; + + +typedef u8 mali_atom_id; + +/** + * Since these structs are passed to and from the kernel we need to make sure + * that we get the size of each struct to match exactly what the kernel is + * expecting. So, when editing this file make sure to add static asserts that + * check each struct's size against the arg length you see in strace. + */ + +enum kbase_ioctl_mem_flags { + /* IN */ + BASE_MEM_PROT_CPU_RD = (1U << 0), /**< Read access CPU side */ + BASE_MEM_PROT_CPU_WR = (1U << 1), /**< Write access CPU side */ + BASE_MEM_PROT_GPU_RD = (1U << 2), /**< Read access GPU side */ + BASE_MEM_PROT_GPU_WR = (1U << 3), /**< Write access GPU side */ + BASE_MEM_PROT_GPU_EX = (1U << 4), /**< Execute allowed on the GPU + side */ + + BASE_MEM_GROW_ON_GPF = (1U << 9), /**< Grow backing store on GPU + Page Fault */ + + BASE_MEM_COHERENT_SYSTEM = (1U << 10), /**< Page coherence Outer + shareable, if available */ + BASE_MEM_COHERENT_LOCAL = (1U << 11), /**< Page coherence Inner + shareable */ + BASE_MEM_CACHED_CPU = (1U << 12), /**< Should be cached on the + CPU */ + + /* IN/OUT */ + BASE_MEM_SAME_VA = (1U << 13), /**< Must have same VA on both the GPU + and the CPU */ + /* OUT */ + BASE_MEM_NEED_MMAP = (1U << 14), /**< Must call mmap to acquire a GPU + address for the alloc */ + /* IN */ + BASE_MEM_COHERENT_SYSTEM_REQUIRED = (1U << 15), /**< Page coherence + Outer shareable, required. 
*/ + BASE_MEM_SECURE = (1U << 16), /**< Secure memory */ + BASE_MEM_DONT_NEED = (1U << 17), /**< Not needed physical + memory */ + BASE_MEM_IMPORT_SHARED = (1U << 18), /**< Must use shared CPU/GPU zone + (SAME_VA zone) but doesn't + require the addresses to + be the same */ +}; + +#define KBASE_IOCTL_MEM_FLAGS_IN_MASK \ + (BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | \ + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_GPU_EX | \ + BASE_MEM_GROW_ON_GPF | \ + BASE_MEM_COHERENT_SYSTEM | BASE_MEM_COHERENT_LOCAL | \ + BASE_MEM_CACHED_CPU | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_SECURE | \ + BASE_MEM_DONT_NEED | BASE_MEM_IMPORT_SHARED) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ull << 12) + +enum kbase_ioctl_coherency_mode { + COHERENCY_ACE_LITE = 0, + COHERENCY_ACE = 1, + COHERENCY_NONE = 31 +}; + +/* + * Mali Atom priority + * + * Only certain priority levels are actually implemented, as specified by the + * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority + * level that is not one of those defined below. + * + * Priority levels only affect scheduling between atoms of the same type within + * a mali context, and only after the atoms have had dependencies resolved. + * Fragment atoms does not affect non-frament atoms with lower priorities, and + * the other way around. For example, a low priority atom that has had its + * dependencies resolved might run before a higher priority atom that has not + * had its dependencies resolved. + * + * The scheduling between mali contexts/processes and between atoms from + * different mali contexts/processes is unaffected by atom priority. + * + * The atoms are scheduled as follows with respect to their priorities: + * - Let atoms 'X' and 'Y' be for the same job slot who have dependencies + * resolved, and atom 'X' has a higher priority than atom 'Y' + * - If atom 'Y' is currently running on the HW, then it is interrupted to + * allow atom 'X' to run soon after + * - If instead neither atom 'Y' nor atom 'X' are running, then when choosing + * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' + * - Any two atoms that have the same priority could run in any order with + * respect to each other. That is, there is no ordering constraint between + * atoms of the same priority. + */ +typedef u8 mali_jd_prio; +#define BASE_JD_PRIO_MEDIUM ((mali_jd_prio)0) +#define BASE_JD_PRIO_HIGH ((mali_jd_prio)1) +#define BASE_JD_PRIO_LOW ((mali_jd_prio)2) + +/** + * @brief Job dependency type. + * + * A flags field will be inserted into the atom structure to specify whether a + * dependency is a data or ordering dependency (by putting it before/after + * 'core_req' in the structure it should be possible to add without changing + * the structure size). When the flag is set for a particular dependency to + * signal that it is an ordering only dependency then errors will not be + * propagated. + */ +typedef u8 mali_jd_dep_type; +#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ +#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ +#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ + +/** + * @brief Job chain hardware requirements. + * + * A job chain must specify what GPU features it needs to allow the + * driver to schedule the job correctly. By not specifying the + * correct settings can/will cause an early job termination. Multiple + * values can be ORed together to specify multiple requirements. 
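+ * For instance (editorial example, not in the original header), a typical
+ * vertex/tiler job chain would pass BASE_JD_REQ_CS | BASE_JD_REQ_T.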
+ * Special case is ::BASE_JD_REQ_DEP, which is used to express complex + * dependencies, and that doesn't execute anything on the hardware. + */ +typedef u32 mali_jd_core_req; + +/* Requirements that come from the HW */ + +/** + * No requirement, dependency only + */ +#define BASE_JD_REQ_DEP ((mali_jd_core_req)0) + +/** + * Requires fragment shaders + */ +#define BASE_JD_REQ_FS ((mali_jd_core_req)1 << 0) + +/** + * Requires compute shaders + * This covers any of the following Midgard Job types: + * - Vertex Shader Job + * - Geometry Shader Job + * - An actual Compute Shader Job + * + * Compare this with @ref BASE_JD_REQ_ONLY_COMPUTE, which specifies that the + * job is specifically just the "Compute Shader" job type, and not the "Vertex + * Shader" nor the "Geometry Shader" job type. + */ +#define BASE_JD_REQ_CS ((mali_jd_core_req)1 << 1) +#define BASE_JD_REQ_T ((mali_jd_core_req)1 << 2) /**< Requires tiling */ +#define BASE_JD_REQ_CF ((mali_jd_core_req)1 << 3) /**< Requires cache flushes */ +#define BASE_JD_REQ_V ((mali_jd_core_req)1 << 4) /**< Requires value writeback */ + +/* SW-only requirements - the HW does not expose these as part of the job slot + * capabilities */ + +/* Requires fragment job with AFBC encoding */ +#define BASE_JD_REQ_FS_AFBC ((mali_jd_core_req)1 << 13) + +/** + * SW-only requirement: coalesce completion events. + * If this bit is set then completion of this atom will not cause an event to + * be sent to userspace, whether successful or not; completion events will be + * deferred until an atom completes which does not have this bit set. + * + * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. + */ +#define BASE_JD_REQ_EVENT_COALESCE ((mali_jd_core_req)1 << 5) + +/** + * SW Only requirement: the job chain requires a coherent core group. We don't + * mind which coherent core group is used. + */ +#define BASE_JD_REQ_COHERENT_GROUP ((mali_jd_core_req)1 << 6) + +/** + * SW Only requirement: The performance counters should be enabled only when + * they are needed, to reduce power consumption. + */ + +#define BASE_JD_REQ_PERMON ((mali_jd_core_req)1 << 7) + +/** + * SW Only requirement: External resources are referenced by this atom. When + * external resources are referenced no syncsets can be bundled with the atom + * but should instead be part of a NULL jobs inserted into the dependency + * tree. The first pre_dep object must be configured for the external + * resouces to use, the second pre_dep object can be used to create other + * dependencies. + * + * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE. + */ +#define BASE_JD_REQ_EXTERNAL_RESOURCES ((mali_jd_core_req)1 << 8) + +/** + * SW Only requirement: Software defined job. Jobs with this bit set will not + * be submitted to the hardware but will cause some action to happen within + * the driver + */ +#define BASE_JD_REQ_SOFT_JOB ((mali_jd_core_req)1 << 9) + +#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) +#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) +#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) + +/** + * SW Only requirement : Replay job. + * + * If the preceding job fails, the replay job will cause the jobs specified in + * the list of mali_jd_replay_payload pointed to by the jc pointer to be + * replayed. + * + * A replay job will only cause jobs to be replayed up to MALIP_JD_REPLAY_LIMIT + * times. 
If a job fails more than MALIP_JD_REPLAY_LIMIT times then the replay + * job is failed, as well as any following dependencies. + * + * The replayed jobs will require a number of atom IDs. If there are not enough + * free atom IDs then the replay job will fail. + * + * If the preceding job does not fail, then the replay job is returned as + * completed. + * + * The replayed jobs will never be returned to userspace. The preceding failed + * job will be returned to userspace as failed; the status of this job should + * be ignored. Completion should be determined by the status of the replay soft + * job. + * + * In order for the jobs to be replayed, the job headers will have to be + * modified. The Status field will be reset to NOT_STARTED. If the Job Type + * field indicates a Vertex Shader Job then it will be changed to Null Job. + * + * The replayed jobs have the following assumptions : + * + * - No external resources. Any required external resources will be held by the + * replay atom. + * - Pre-dependencies are created based on job order. + * - Atom numbers are automatically assigned. + * - device_nr is set to 0. This is not relevant as + * BASE_JD_REQ_SPECIFIC_COHERENT_GROUP should not be set. + * - Priority is inherited from the replay job. + */ +#define BASE_JD_REQ_SOFT_REPLAY (BASE_JD_REQ_SOFT_JOB | 0x4) +/** + * SW only requirement: event wait/trigger job. + * + * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. + * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the + * other waiting jobs. It completes immediately. + * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it + * possible for other jobs to wait upon. It completes immediately. + */ +#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) +#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) +#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) + +#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) + +/** + * SW only requirement: Just In Time allocation + * + * This job requests a JIT allocation based on the request in the + * @base_jit_alloc_info structure which is passed via the jc element of + * the atom. + * + * It should be noted that the id entry in @base_jit_alloc_info must not + * be reused until it has been released via @BASE_JD_REQ_SOFT_JIT_FREE. + * + * Should this soft job fail it is expected that a @BASE_JD_REQ_SOFT_JIT_FREE + * soft job to free the JIT allocation is still made. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) +/** + * SW only requirement: Just In Time free + * + * This job requests a JIT allocation created by @BASE_JD_REQ_SOFT_JIT_ALLOC + * to be freed. The ID of the JIT allocation is passed via the jc element of + * the atom. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) + +/** + * SW only requirement: Map external resource + * + * This job requests external resource(s) are mapped once the dependencies + * of the job have been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * @base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) +/** + * SW only requirement: Unmap external resource + * + * This job requests external resource(s) are unmapped once the dependencies + * of the job has been satisfied. 
The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * @base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) + +/** + * HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) + * + * This indicates that the Job Chain contains Midgard Jobs of the 'Compute + * Shaders' type. + * + * In contrast to @ref BASE_JD_REQ_CS, this does \b not indicate that the Job + * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. + */ +#define BASE_JD_REQ_ONLY_COMPUTE ((mali_jd_core_req)1 << 10) + +/** + * HW Requirement: Use the mali_jd_atom::device_nr field to specify a + * particular core group + * + * If both @ref BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag + * takes priority + * + * This is only guaranteed to work for @ref BASE_JD_REQ_ONLY_COMPUTE atoms. + * + * If the core availability policy is keeping the required core group turned + * off, then the job will fail with a @ref BASE_JD_EVENT_PM_EVENT error code. + */ +#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((mali_jd_core_req)1 << 11) + +/** + * SW Flag: If this bit is set then the successful completion of this atom + * will not cause an event to be sent to userspace + */ +#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((mali_jd_core_req)1 << 12) + +/** + * SW Flag: If this bit is set then completion of this atom will not cause an + * event to be sent to userspace, whether successful or not. + */ +#define BASE_JD_REQ_EVENT_NEVER ((mali_jd_core_req)1 << 14) + +/** + * SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job starts which does not have this bit set or a job completes + * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use if + * the CPU may have written to memory addressed by the job since the last job + * without this bit set was submitted. + */ +#define BASE_JD_REQ_SKIP_CACHE_START ((mali_jd_core_req)1 << 15) + +/** + * SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job completes which does not have this bit set or a job starts + * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_START bti set. Do not + * use if the CPU may read from or partially overwrite memory addressed by the + * job before the next job without this bit set completes. + */ +#define BASE_JD_REQ_SKIP_CACHE_END ((mali_jd_core_req)1 << 16) + +/** + * These requirement bits are currently unused in mali_jd_core_req + */ +#define MALIP_JD_REQ_RESERVED \ + (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ + BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | MALIP_JD_REQ_EVENT_NEVER | \ + BASE_JD_REQ_EVENT_COALESCE | \ + BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ + BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ + BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END)) + +/** + * Mask of all bits in mali_jd_core_req that control the type of the atom. + * + * This allows dependency only atoms to have flags set + */ +#define BASE_JD_REQ_ATOM_TYPE \ + (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ + BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) + +/** + * Mask of all bits in mali_jd_core_req that control the type of a soft job. 
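+ *
+ * As an illustration only (not part of the original header), the soft-job
+ * type of an atom would be recovered from its core_req along these lines:
+ *
+ *   if (core_req & BASE_JD_REQ_SOFT_JOB) {
+ *       mali_jd_core_req soft = core_req & BASE_JD_REQ_SOFT_JOB_TYPE;
+ *       if (soft == BASE_JD_REQ_SOFT_FENCE_WAIT)
+ *           wait_on_fence(atom);   // hypothetical helper, not in this file
+ *   }
+ *
+ * where core_req is the base_jd_atom_v2::core_req value of the atom.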
+ */ +#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) + +/* + * Returns non-zero value if core requirements passed define a soft job or + * a dependency only job. + */ +#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ + ((core_req & BASE_JD_REQ_SOFT_JOB) || \ + (core_req & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) + +/** + * @brief The payload for a replay job. This must be in GPU memory. + */ +struct mali_jd_replay_payload { + /** + * Pointer to the first entry in the mali_jd_replay_jc list. These + * will be replayed in @b reverse order (so that extra ones can be added + * to the head in future soft jobs without affecting this soft job) + */ + u64 tiler_jc_list; + + /** + * Pointer to the fragment job chain. + */ + u64 fragment_jc; + + /** + * Pointer to the tiler heap free FBD field to be modified. + */ + u64 tiler_heap_free; + + /** + * Hierarchy mask for the replayed fragment jobs. May be zero. + */ + u16 fragment_hierarchy_mask; + + /** + * Hierarchy mask for the replayed tiler jobs. May be zero. + */ + u16 tiler_hierarchy_mask; + + /** + * Default weight to be used for hierarchy levels not in the original + * mask. + */ + u32 hierarchy_default_weight; + + /** + * Core requirements for the tiler job chain + */ + mali_jd_core_req tiler_core_req; + + /** + * Core requirements for the fragment job chain + */ + mali_jd_core_req fragment_core_req; +}; + +/** + * @brief An entry in the linked list of job chains to be replayed. This must + * be in GPU memory. + */ +struct mali_jd_replay_jc { + /** + * Pointer to next entry in the list. A setting of NULL indicates the + * end of the list. + */ + u64 next; + + /** + * Pointer to the job chain. + */ + u64 jc; +}; + +typedef u64 mali_ptr; + +#define MALI_PTR_FMT "0x%" PRIx64 +#define MALI_SHORT_PTR_FMT "0x%" PRIxPTR + +#ifdef __LP64__ +#define PAD_CPU_PTR(p) p +#else +#define PAD_CPU_PTR(p) p; u32 :32; +#endif + +/* FIXME: Again, they don't specify any of these as packed structs. However, + * looking at these structs I'm worried that there is already spots where the + * compiler is potentially sticking in padding... + * Going to try something a little crazy, and just hope that our compiler + * happens to add the same kind of offsets since we can't really compare sizes + */ + +/* + * Blob provided by the driver to store callback driver, not actually modified + * by the driver itself + */ +struct mali_jd_udata { + u64 blob[2]; +}; + +struct mali_jd_dependency { + mali_atom_id atom_id; /**< An atom number */ + mali_jd_dep_type dependency_type; /**< Dependency type */ +}; + +#define MALI_EXT_RES_MAX 10 + +/* The original header never explicitly defines any values for these. 
In C, + * this -should- expand to SHARED == 0 and EXCLUSIVE == 1, so the only flag we + * actually need to decode here is EXCLUSIVE + */ +enum mali_external_resource_access { + MALI_EXT_RES_ACCESS_SHARED, + MALI_EXT_RES_ACCESS_EXCLUSIVE, +}; + +/* An aligned address to the resource | mali_external_resource_access */ +typedef u64 mali_external_resource; + +struct base_jd_atom_v2 { + mali_ptr jc; /**< job-chain GPU address */ + struct mali_jd_udata udata; /**< user data */ + u64 extres_list; /**< list of external resources */ + u16 nr_extres; /**< nr of external resources */ + u16 compat_core_req; /**< core requirements which + correspond to the legacy support + for UK 10.2 */ + struct mali_jd_dependency pre_dep[2]; /**< pre-dependencies, one need to + use SETTER function to assign + this field, this is done in + order to reduce possibility of + improper assigment of a + dependency field */ + mali_atom_id atom_number; /**< unique number to identify the + atom */ + mali_jd_prio prio; /**< Atom priority. Refer to @ref + mali_jd_prio for more details */ + u8 device_nr; /**< coregroup when + BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + specified */ + u8 :8; + mali_jd_core_req core_req; /**< core requirements */ +} __attribute__((packed)); + +/** + * enum mali_error - Mali error codes shared with userspace + * + * This is subset of those common Mali errors that can be returned to userspace. + * Values of matching user and kernel space enumerators MUST be the same. + * MALI_ERROR_NONE is guaranteed to be 0. + * + * @MALI_ERROR_NONE: Success + * @MALI_ERROR_OUT_OF_GPU_MEMORY: Not used in the kernel driver + * @MALI_ERROR_OUT_OF_MEMORY: Memory allocation failure + * @MALI_ERROR_FUNCTION_FAILED: Generic error code + */ +enum mali_error { + MALI_ERROR_NONE = 0, + MALI_ERROR_OUT_OF_GPU_MEMORY, + MALI_ERROR_OUT_OF_MEMORY, + MALI_ERROR_FUNCTION_FAILED, +}; + +/** + * Header used by all ioctls + */ +union kbase_ioctl_header { +#ifdef dvalin + u32 pad[0]; +#else + /* [in] The ID of the UK function being called */ + u32 id :32; + /* [out] The return value of the UK function that was called */ + enum mali_error rc :32; + + u64 :64; +#endif +} __attribute__((packed)); + +struct kbase_ioctl_get_version { + union kbase_ioctl_header header; + u16 major; /* [out] */ + u16 minor; /* [out] */ + u32 :32; +} __attribute__((packed)); + +struct mali_mem_import_user_buffer { + u64 ptr; + u64 length; +}; + +union kbase_ioctl_mem_import { + struct { + union kbase_ioctl_header header; + u64 phandle; + enum { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + BASE_MEM_IMPORT_TYPE_UMP = 1, + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3, + } type :32; + u32 :32; + u64 flags; + } in; + struct { + union kbase_ioctl_header header; + u64 pad[2]; + u64 flags; + u64 gpu_va; + u64 va_pages; + } out; +} __attribute__((packed)); + +struct kbase_ioctl_mem_commit { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_addr; + u64 pages; + /* [out] */ + u32 result_subcode; + u32 :32; +} __attribute__((packed)); + +enum kbase_ioctl_mem_query_type { + BASE_MEM_QUERY_COMMIT_SIZE = 1, + BASE_MEM_QUERY_VA_SIZE = 2, + BASE_MEM_QUERY_FLAGS = 3 +}; + +struct kbase_ioctl_mem_query { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_addr; + enum kbase_ioctl_mem_query_type query : 32; + u32 :32; + /* [out] */ + u64 value; +} __attribute__((packed)); + +struct kbase_ioctl_mem_free { + union kbase_ioctl_header header; + mali_ptr gpu_addr; /* [in] */ +} __attribute__((packed)); +/* FIXME: Size unconfirmed (haven't seen in a 
trace yet) */ + +struct kbase_ioctl_mem_flags_change { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_va; + u64 flags; + u64 mask; +} __attribute__((packed)); +/* FIXME: Size unconfirmed (haven't seen in a trace yet) */ + +struct kbase_ioctl_mem_alias { + union kbase_ioctl_header header; + /* [in/out] */ + u64 flags; + /* [in] */ + u64 stride; + u64 nents; + u64 ai; + /* [out] */ + mali_ptr gpu_va; + u64 va_pages; +} __attribute__((packed)); + +struct kbase_ioctl_mem_sync { + union kbase_ioctl_header header; + mali_ptr handle; + u64 user_addr; + u64 size; + enum { + MALI_SYNC_TO_DEVICE = 1, + MALI_SYNC_TO_CPU = 2, + } type :8; + u64 :56; +} __attribute__((packed)); + +struct kbase_ioctl_set_flags { + union kbase_ioctl_header header; + u32 create_flags; /* [in] */ + u32 :32; +} __attribute__((packed)); + +struct kbase_ioctl_stream_create { + union kbase_ioctl_header header; + /* [in] */ + char name[32]; + /* [out] */ + s32 fd; + u32 :32; +} __attribute__((packed)); + +struct kbase_ioctl_job_submit { + union kbase_ioctl_header header; + /* [in] */ + u64 addr; + u32 nr_atoms; + u32 stride; +} __attribute__((packed)); + +struct kbase_ioctl_get_context_id { + union kbase_ioctl_header header; + /* [out] */ + s64 id; +} __attribute__((packed)); + +#undef PAD_CPU_PTR + +enum base_jd_event_code { + BASE_JD_EVENT_DONE = 1, +}; + +struct base_jd_event_v2 { + enum base_jd_event_code event_code; + mali_atom_id atom_number; + struct mali_jd_udata udata; +}; + +/* Defined in mali-props.h */ +struct kbase_ioctl_gpu_props_reg_dump; + +/* For ioctl's we haven't written decoding stuff for yet */ +typedef struct { + union kbase_ioctl_header header; +} __ioctl_placeholder; + +#endif /* __KBASE_IOCTL_H__ */ diff --git a/src/panfrost/base/include/old/mali-props.h b/src/panfrost/base/include/old/mali-props.h new file mode 100644 index 00000000000..5b9d8723600 --- /dev/null +++ b/src/panfrost/base/include/old/mali-props.h @@ -0,0 +1,262 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef __MALI_PROPS_H__ +#define __MALI_PROPS_H__ + +#include "mali-ioctl.h" + +#define MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS 3 +#define MALI_GPU_MAX_JOB_SLOTS 16 +#define MALI_MAX_COHERENT_GROUPS 16 + +/* Capabilities of a job slot as reported by JS_FEATURES registers */ + +#define JS_FEATURE_NULL_JOB (1u << 1) +#define JS_FEATURE_SET_VALUE_JOB (1u << 2) +#define JS_FEATURE_CACHE_FLUSH_JOB (1u << 3) +#define JS_FEATURE_COMPUTE_JOB (1u << 4) +#define JS_FEATURE_VERTEX_JOB (1u << 5) +#define JS_FEATURE_GEOMETRY_JOB (1u << 6) +#define JS_FEATURE_TILER_JOB (1u << 7) +#define JS_FEATURE_FUSED_JOB (1u << 8) +#define JS_FEATURE_FRAGMENT_JOB (1u << 9) + +struct mali_gpu_core_props { + /** + * Product specific value. + */ + u32 product_id; + + /** + * Status of the GPU release. + * No defined values, but starts at 0 and increases by one for each + * release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + */ + u16 version_status; + + /** + * Minor release number of the GPU. "P" part of an "RnPn" release + * number. + * 8 bit values (0-255). 
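+ *
+ * For illustration (not in the original header): together with
+ * major_revision below, this is conventionally formatted as "r%up%u"
+ * (e.g. r0p0) to print the usual "RnPn" name.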
+ */ + u16 minor_revision; + + /** + * Major release number of the GPU. "R" part of an "RnPn" release + * number. + * 4 bit values (0-15). + */ + u16 major_revision; + + u16 :16; + + /** + * @usecase GPU clock speed is not specified in the Midgard + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() + * function. + */ + u32 gpu_speed_mhz; + + /** + * @usecase GPU clock max/min speed is required for computing + * best/worst case in tasks as job scheduling ant irq_throttling. (It + * is not specified in the Midgard Architecture). + */ + u32 gpu_freq_khz_max; + u32 gpu_freq_khz_min; + + /** + * Size of the shader program counter, in bits. + */ + u32 log2_program_counter_size; + + /** + * TEXTURE_FEATURES_x registers, as exposed by the GPU. This is a + * bitpattern where a set bit indicates that the format is supported. + * + * Before using a texture format, it is recommended that the + * corresponding bit be checked. + */ + u32 texture_features[MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + /** + * Theoretical maximum memory available to the GPU. It is unlikely + * that a client will be able to allocate all of this memory for their + * own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + */ + u64 gpu_available_memory_size; +}; + +struct mali_gpu_l2_cache_props { + u8 log2_line_size; + u8 log2_cache_size; + u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + u64 :40; +}; + +struct mali_gpu_tiler_props { + u32 bin_size_bytes; /* Max is 4*2^15 */ + u32 max_active_levels; /* Max is 2^15 */ +}; + +struct mali_gpu_thread_props { + u32 max_threads; /* Max. number of threads per core */ + u32 max_workgroup_size; /* Max. number of threads per workgroup */ + u32 max_barrier_size; /* Max. number of threads that can + synchronize on a simple barrier */ + u16 max_registers; /* Total size [1..65535] of the register + file available per core. */ + u8 max_task_queue; /* Max. tasks [1..255] which may be sent + to a core before it becomes blocked. */ + u8 max_thread_group_split; /* Max. allowed value [1..15] of the + Thread Group Split field. */ + enum { + MALI_GPU_IMPLEMENTATION_UNKNOWN = 0, + MALI_GPU_IMPLEMENTATION_SILICON = 1, + MALI_GPU_IMPLEMENTATION_FPGA = 2, + MALI_GPU_IMPLEMENTATION_SW = 3, + } impl_tech :8; + u64 :56; +}; + +/** + * @brief descriptor for a coherent group + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_ioctl_gpu_coherent_group { + u64 core_mask; /**< Core restriction mask required for the + group */ + u16 num_cores; /**< Number of cores in the group */ + u64 :48; +}; + +/** + * @brief Coherency group information + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. 
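+ *
+ * For illustration only (not part of the original header), the valid
+ * descriptors would typically be walked as:
+ *
+ *   for (u32 i = 0; i < info->num_groups; ++i)
+ *       total_cores += info->group[i].num_cores;
+ *
+ * where "info" points at this structure and "total_cores" is a
+ * caller-provided counter; note that num_groups, not num_core_groups,
+ * bounds the group[] array (see the member documentation below).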
+ */ +struct mali_gpu_coherent_group_info { + u32 num_groups; + + /** + * Number of core groups (coherent or not) in the GPU. Equivalent to + * the number of L2 Caches. + * + * The GPU Counter dumping writes 2048 bytes per core group, + * regardless of whether the core groups are coherent or not. Hence + * this member is needed to calculate how much memory is required for + * dumping. + * + * @note Do not use it to work out how many valid elements are in the + * group[] member. Use num_groups instead. + */ + u32 num_core_groups; + + /** + * Coherency features of the memory, accessed by @ref gpu_mem_features + * methods + */ + u32 coherency; + + u32 :32; + + /** + * Descriptors of coherent groups + */ + struct mali_ioctl_gpu_coherent_group group[MALI_MAX_COHERENT_GROUPS]; +}; + +/** + * A complete description of the GPU's Hardware Configuration Discovery + * registers. + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * @usecase The raw properties in @ref gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct mali_gpu_raw_props { + u64 shader_present; + u64 tiler_present; + u64 l2_present; + u64 stack_present; + + u32 l2_features; + u32 suspend_size; /* API 8.2+ */ + u32 mem_features; + u32 mmu_features; + + u32 as_present; + + u32 js_present; + u32 js_features[MALI_GPU_MAX_JOB_SLOTS]; + u32 tiler_features; + u32 texture_features[3]; + + u32 gpu_id; + + u32 thread_max_threads; + u32 thread_max_workgroup_size; + u32 thread_max_barrier_size; + u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + u32 coherency_mode; +}; + +struct kbase_ioctl_gpu_props_reg_dump { + union kbase_ioctl_header header; + struct mali_gpu_core_props core; + struct mali_gpu_l2_cache_props l2; + u64 :64; + struct mali_gpu_tiler_props tiler; + struct mali_gpu_thread_props thread; + + struct mali_gpu_raw_props raw; + + /** This must be last member of the structure */ + struct mali_gpu_coherent_group_info coherency_info; +} __attribute__((packed)); + +#endif diff --git a/src/panfrost/base/meson.build b/src/panfrost/base/meson.build new file mode 100644 index 00000000000..5d7b9f1dff9 --- /dev/null +++ b/src/panfrost/base/meson.build @@ -0,0 +1,55 @@ +# Copyright © 2018 Rob Clark +# Copyright © 2019 Collabora +# Copyright © 2022 Icecream95 + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libpanfrost_base_versions = ['0', '1', '2', '258'] +libpanfrost_base_per_arch = [] + +foreach ver : libpanfrost_base_versions + libpanfrost_base_per_arch += static_library( + 'pan-base-v' + ver, + 'pan_vX_base.c', + include_directories : [ + inc_src, inc_include, inc_gallium, inc_mesa, inc_gallium_aux, + include_directories('include'), + ], + c_args : ['-DPAN_BASE_VER=' + ver], + gnu_symbol_visibility : 'hidden', + dependencies: [dep_valgrind], +) +endforeach + +libpanfrost_base = static_library( + 'panfrost_base', + 'pan_base.c', + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw, + include_directories('include'), + ], + gnu_symbol_visibility : 'hidden', + build_by_default : false, + link_with: [libpanfrost_base_per_arch], +) + +libpanfrost_base_dep = declare_dependency( + link_with: [libpanfrost_base_per_arch, libpanfrost_base], + include_directories: [include_directories('.')], +) diff --git a/src/panfrost/base/pan_base.c b/src/panfrost/base/pan_base.c new file mode 100644 index 00000000000..22dc09cfb52 --- /dev/null +++ b/src/panfrost/base/pan_base.c @@ -0,0 +1,301 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/macros.h" +#include "pan_base.h" + +#include "mali_kbase_ioctl.h" + +bool +kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose) +{ + *k = (struct kbase_) {0}; + k->fd = fd; + k->cs_queue_count = cs_queue_count; + k->page_size = sysconf(_SC_PAGE_SIZE); + k->verbose = verbose; + + if (k->fd == -1) + return kbase_open_csf_noop(k); + + struct kbase_ioctl_version_check ver = { 0 }; + + if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK_RESERVED, &ver) == 0) { + return kbase_open_csf(k); + } else if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK, &ver) == 0) { + if (ver.major == 3) + return kbase_open_old(k); + else + return kbase_open_new(k); + } + + return false; +} + +/* If fd != -1, ownership is passed in */ +int +kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd) +{ + kbase_handle h = { + .va = va, + .fd = fd + }; + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + kbase_handle *handles = util_dynarray_begin(&k->gem_handles); + + for (unsigned i = 0; i < size; ++i) { + if (handles[i].fd == -2) { + handles[i] = h; + return i; + } + } + + util_dynarray_append(&k->gem_handles, kbase_handle, h); + + return size; +} + +int +kbase_alloc_gem_handle(kbase k, base_va va, int fd) +{ + pthread_mutex_lock(&k->handle_lock); + + int ret = kbase_alloc_gem_handle_locked(k, va, fd); + + pthread_mutex_unlock(&k->handle_lock); + + return ret; +} + +void +kbase_free_gem_handle(kbase k, int handle) +{ + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + int fd; + + if (handle >= size) { + pthread_mutex_unlock(&k->handle_lock); + return; + } + + if (handle + 1 < size) { + kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); + fd = ptr->fd; + ptr->fd = -2; + } else { + fd = (util_dynarray_pop(&k->gem_handles, kbase_handle)).fd; + } + + if (fd != -1) + close(fd); + + pthread_mutex_unlock(&k->handle_lock); +} + +kbase_handle +kbase_gem_handle_get(kbase k, int handle) +{ + kbase_handle h = { .fd = -1 }; + + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + if (handle < size) + h = *util_dynarray_element(&k->gem_handles, kbase_handle, handle); + + pthread_mutex_unlock(&k->handle_lock); + + return h; +} + +int +kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers) +{ + struct kbase_wait_ctx wait = kbase_wait_init(k, timeout_ns); + + while (kbase_wait_for_event(&wait)) { + pthread_mutex_lock(&k->handle_lock); + if (handle >= util_dynarray_num_elements(&k->gem_handles, kbase_handle)) { + pthread_mutex_unlock(&k->handle_lock); + kbase_wait_fini(wait); + errno = EINVAL; + return -1; + } + kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); + if (!ptr->use_count) { + pthread_mutex_unlock(&k->handle_lock); + kbase_wait_fini(wait); + return 0; + } + pthread_mutex_unlock(&k->handle_lock); + } + + kbase_wait_fini(wait); + errno = ETIMEDOUT; + return -1; +} + +static void +adjust_time(struct timespec *tp, int64_t ns) +{ + ns += tp->tv_nsec; + tp->tv_nsec = ns % 1000000000; + tp->tv_sec += ns / 1000000000; +} + +static int64_t +ns_until(struct timespec tp) +{ + struct timespec now; + clock_gettime(CLOCK_MONOTONIC, &now); + + int64_t sec = (tp.tv_sec - now.tv_sec) * 1000000000; + int64_t ns = tp.tv_nsec - 
now.tv_nsec; + + /* Clamp the value to zero to avoid errors from ppoll */ + return MAX2(sec + ns, 0); +} + +static void +kbase_wait_signal(kbase k) +{ + /* We must acquire the event condition lock, otherwise another + * thread could be between the trylock and the cond_wait, and + * not notice the broadcast. */ + pthread_mutex_lock(&k->event_cnd_lock); + pthread_cond_broadcast(&k->event_cnd); + pthread_mutex_unlock(&k->event_cnd_lock); +} + +struct kbase_wait_ctx +kbase_wait_init(kbase k, int64_t timeout_ns) +{ + struct timespec tp; + clock_gettime(CLOCK_MONOTONIC, &tp); + + adjust_time(&tp, timeout_ns); + + return (struct kbase_wait_ctx) { + .k = k, + .until = tp, + }; +} + +bool +kbase_wait_for_event(struct kbase_wait_ctx *ctx) +{ + kbase k = ctx->k; + + /* Return instantly the first time so that a check outside the + * wait_for_Event loop is not required */ + if (!ctx->has_cnd_lock) { + pthread_mutex_lock(&k->event_cnd_lock); + ctx->has_cnd_lock = true; + return true; + } + + if (!ctx->has_lock) { + if (pthread_mutex_trylock(&k->event_read_lock) == 0) { + ctx->has_lock = true; + pthread_mutex_unlock(&k->event_cnd_lock); + } else { + int ret = pthread_cond_timedwait(&k->event_cnd, + &k->event_cnd_lock, &ctx->until); + return ret != ETIMEDOUT; + } + } + + bool event = k->poll_event(k, ns_until(ctx->until)); + k->handle_events(k); + kbase_wait_signal(k); + return event; +} + +void +kbase_wait_fini(struct kbase_wait_ctx ctx) +{ + kbase k = ctx.k; + + if (ctx.has_lock) { + pthread_mutex_unlock(&k->event_read_lock); + kbase_wait_signal(k); + } else if (ctx.has_cnd_lock) { + pthread_mutex_unlock(&k->event_cnd_lock); + } +} + +void +kbase_ensure_handle_events(kbase k) +{ + /* If we don't manage to take the lock, then events have recently/will + * soon be handled, there is no need to do anything. */ + if (pthread_mutex_trylock(&k->event_read_lock) == 0) { + k->handle_events(k); + pthread_mutex_unlock(&k->event_read_lock); + kbase_wait_signal(k); + } +} + +bool +kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp) +{ + struct pollfd pfd = { + .fd = fd, + .events = wait_shared ? POLLOUT : POLLIN, + }; + + uint64_t timeout = ns_until(tp); + + struct timespec t = { + .tv_sec = timeout / 1000000000, + .tv_nsec = timeout % 1000000000, + }; + + int ret = ppoll(&pfd, 1, &t, NULL); + + if (ret == -1 && errno != EINTR) + perror("kbase_poll_fd_until"); + + return ret != 0; +} diff --git a/src/panfrost/base/pan_base.h b/src/panfrost/base/pan_base.h new file mode 100644 index 00000000000..878f7468433 --- /dev/null +++ b/src/panfrost/base/pan_base.h @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Library for interfacing with kbase */ +#ifndef PAN_BASE_H +#define PAN_BASE_H + +#include "util/u_dynarray.h" +#include "util/list.h" + +#define PAN_EVENT_SIZE 16 + +typedef uint64_t base_va; +struct base_ptr { + void *cpu; + base_va gpu; +}; + +struct kbase_syncobj; + +/* The job is done when the queue seqnum > seqnum */ +struct kbase_sync_link { + struct kbase_sync_link *next; /* must be first */ + uint64_t seqnum; + void (*callback)(void *); + void *data; +}; + +struct kbase_event_slot { + struct kbase_sync_link *syncobjs; + struct kbase_sync_link **back; + uint64_t last_submit; + uint64_t last; +}; + +struct kbase_context { + uint8_t csg_handle; + uint8_t kcpu_queue; + bool kcpu_init; // TODO: Always create a queue? + uint32_t csg_uid; + unsigned num_csi; + + unsigned tiler_heap_chunk_size; + base_va tiler_heap_va; + base_va tiler_heap_header; +}; + +struct kbase_cs { + struct kbase_context *ctx; + void *user_io; + base_va va; + unsigned size; + unsigned event_mem_offset; + unsigned csi; + + uint64_t last_insert; + + // TODO: This is only here because it's convenient for emit_csf_queue + uint32_t *latest_flush; +}; + +#define KBASE_SLOT_COUNT 2 + +typedef struct { + base_va va; + int fd; + uint8_t use_count; + /* For emulating implicit sync. TODO make this work on v10 */ + uint8_t last_access[KBASE_SLOT_COUNT]; +} kbase_handle; + +struct kbase_; +typedef struct kbase_ *kbase; + +struct kbase_ { + unsigned setup_state; + bool verbose; + + int fd; + unsigned api; + unsigned page_size; + // TODO: Actually we may want to try to pack multiple contexts / queue + // "sets" into a single group... + unsigned cs_queue_count; + + /* Must not hold handle_lock while acquiring event_read_lock */ + pthread_mutex_t handle_lock; + pthread_mutex_t event_read_lock; + pthread_mutex_t event_cnd_lock; + pthread_cond_t event_cnd; + /* TODO: Per-context/queue locks? */ + pthread_mutex_t queue_lock; + + struct list_head syncobjs; + + unsigned gpuprops_size; + void *gpuprops; + + void *tracking_region; + void *csf_user_reg; + struct base_ptr event_mem; + struct base_ptr kcpu_event_mem; + // TODO: dynamically size + struct kbase_event_slot event_slots[256]; + // TODO: USe a bitset? 
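+ /* The event_slots array above is consumed by the syncobj code: each
+  * kbase_syncobj fence records a (slot, value) pair and is treated as
+  * signalled once event_slots[slot].last moves past that value (see
+  * kbase_syncobj_update in pan_vX_base.c). */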
+ unsigned event_slot_usage; + + uint8_t atom_number; + + struct util_dynarray gem_handles; + struct util_dynarray atom_bos[256]; + uint64_t job_seq; + + void (*close)(kbase k); + + bool (*get_pan_gpuprop)(kbase k, unsigned name, uint64_t *value); + bool (*get_mali_gpuprop)(kbase k, unsigned name, uint64_t *value); + + struct base_ptr (*alloc)(kbase k, size_t size, + unsigned pan_flags, + unsigned mali_flags); + void (*free)(kbase k, base_va va); + + int (*import_dmabuf)(kbase k, int fd); + void *(*mmap_import)(kbase k, base_va va, size_t size); + + void (*cache_clean)(void *ptr, size_t size); + void (*cache_invalidate)(void *ptr, size_t size); + + /* Returns false on timeout */ + bool (*poll_event)(kbase k, int64_t timeout_ns); + bool (*handle_events)(kbase k); + + /* <= v9 GPUs */ + int (*submit)(kbase k, uint64_t va, unsigned req, + struct kbase_syncobj *o, + int32_t *handles, unsigned num_handles); + + /* >= v10 GPUs */ + struct kbase_context *(*context_create)(kbase k); + void (*context_destroy)(kbase k, struct kbase_context *ctx); + bool (*context_recreate)(kbase k, struct kbase_context *ctx); + + // TODO: Pass in a priority? + struct kbase_cs (*cs_bind)(kbase k, struct kbase_context *ctx, + base_va va, unsigned size); + void (*cs_term)(kbase k, struct kbase_cs *cs); + void (*cs_rebind)(kbase k, struct kbase_cs *cs); + + bool (*cs_submit)(kbase k, struct kbase_cs *cs, uint64_t insert_offset, + struct kbase_syncobj *o, uint64_t seqnum); + bool (*cs_wait)(kbase k, struct kbase_cs *cs, uint64_t extract_offset, + struct kbase_syncobj *o); + + int (*kcpu_fence_export)(kbase k, struct kbase_context *ctx); + bool (*kcpu_fence_import)(kbase k, struct kbase_context *ctx, int fd); + + bool (*kcpu_cqs_set)(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value); + bool (*kcpu_cqs_wait)(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value); + + /* syncobj functions */ + struct kbase_syncobj *(*syncobj_create)(kbase k); + void (*syncobj_destroy)(kbase k, struct kbase_syncobj *o); + struct kbase_syncobj *(*syncobj_dup)(kbase k, struct kbase_syncobj *o); + /* TODO: timeout? 
(and for cs_wait) */ + bool (*syncobj_wait)(kbase k, struct kbase_syncobj *o); + + /* Returns false if there are no active queues */ + bool (*callback_all_queues)(kbase k, int32_t *count, + void (*callback)(void *), void *data); + + void (*mem_sync)(kbase k, base_va gpu, void *cpu, size_t size, + bool invalidate); +}; + +bool kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose); + +/* Called from kbase_open */ +bool kbase_open_old(kbase k); +bool kbase_open_new(kbase k); +bool kbase_open_csf(kbase k); +bool kbase_open_csf_noop(kbase k); + +/* BO management */ +int kbase_alloc_gem_handle(kbase k, base_va va, int fd); +int kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd); +void kbase_free_gem_handle(kbase k, int handle); +kbase_handle kbase_gem_handle_get(kbase k, int handle); +int kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers); + +/* Event waiting */ +struct kbase_wait_ctx { + kbase k; + struct timespec until; + bool has_lock; + bool has_cnd_lock; +}; + +struct kbase_wait_ctx kbase_wait_init(kbase k, int64_t timeout_ns); +/* Returns false on timeout, kbase_wait_fini must still be called */ +bool kbase_wait_for_event(struct kbase_wait_ctx *ctx); +void kbase_wait_fini(struct kbase_wait_ctx ctx); + +void kbase_ensure_handle_events(kbase k); + +bool kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp); + +/* Must not conflict with PANFROST_BO_* flags */ +#define MALI_BO_CACHED_CPU (1 << 16) +#define MALI_BO_UNCACHED_GPU (1 << 17) + +#endif diff --git a/src/panfrost/base/pan_base_noop.h b/src/panfrost/base/pan_base_noop.h new file mode 100644 index 00000000000..750a445a995 --- /dev/null +++ b/src/panfrost/base/pan_base_noop.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef PAN_BASE_NOOP_H +#define PAN_BASE_NOOP_H + +/* For Mali-G610 as used in RK3588 */ +#define PROP(name, value) ((name << 2) | 2), value +static const uint32_t gpu_props[] = { + PROP(KBASE_GPUPROP_RAW_GPU_ID, 0xa8670000), + PROP(KBASE_GPUPROP_PRODUCT_ID, 0xa867), + PROP(KBASE_GPUPROP_RAW_SHADER_PRESENT, 0x50005), + PROP(KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, 0xc1ffff9e), + PROP(KBASE_GPUPROP_TLS_ALLOC, 0x800), + PROP(KBASE_GPUPROP_RAW_TILER_FEATURES, 0x809), +}; +#undef PROP + +#define NOOP_COOKIE_ALLOC 0x41000 +#define NOOP_COOKIE_USER_IO 0x42000 +#define NOOP_COOKIE_MEM_ALLOC 0x43000 + +static int +kbase_ioctl(int fd, unsigned long request, ...) +{ + int ret = 0; + + va_list args; + + va_start(args, request); + void *ptr = va_arg(args, void *); + va_end(args); + + switch (request) { + case KBASE_IOCTL_GET_GPUPROPS: { + struct kbase_ioctl_get_gpuprops *props = ptr; + + if (props->size) + memcpy((void *)(uintptr_t) props->buffer, + gpu_props, MIN2(props->size, sizeof(gpu_props))); + + ret = sizeof(gpu_props); + break; + } + + case KBASE_IOCTL_MEM_ALLOC: { + union kbase_ioctl_mem_alloc *alloc = ptr; + + alloc->out.gpu_va = NOOP_COOKIE_ALLOC; + alloc->out.flags = BASE_MEM_SAME_VA; + break; + } + + case KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6: { + union kbase_ioctl_cs_queue_group_create_1_6 *create = ptr; + + // TODO: Don't return duplicates? + create->out.group_handle = 0; + create->out.group_uid = 1; + break; + } + + case KBASE_IOCTL_CS_TILER_HEAP_INIT: { + union kbase_ioctl_cs_tiler_heap_init *init = ptr; + + /* The values don't really matter, the CPU has no business in accessing + * these. */ + init->out.gpu_heap_va = 0x60000; + init->out.first_chunk_va = 0x61000; + break; + } + + case KBASE_IOCTL_CS_QUEUE_BIND: { + union kbase_ioctl_cs_queue_bind *bind = ptr; + bind->out.mmap_handle = NOOP_COOKIE_USER_IO; + break; + } + + case KBASE_IOCTL_MEM_IMPORT: { + union kbase_ioctl_mem_import *import = ptr; + + if (import->in.type != BASE_MEM_IMPORT_TYPE_UMM) { + ret = -1; + errno = EINVAL; + break; + } + + int *fd = (int *)(uintptr_t) import->in.phandle; + + off_t size = lseek(*fd, 0, SEEK_END); + + import->out.flags = BASE_MEM_NEED_MMAP; + import->out.gpu_va = NOOP_COOKIE_MEM_ALLOC; + import->out.va_pages = DIV_ROUND_UP(size, 4096); + } + + case KBASE_IOCTL_SET_FLAGS: + case KBASE_IOCTL_MEM_EXEC_INIT: + case KBASE_IOCTL_MEM_JIT_INIT: + case KBASE_IOCTL_CS_QUEUE_REGISTER: + case KBASE_IOCTL_CS_QUEUE_KICK: + case KBASE_IOCTL_CS_TILER_HEAP_TERM: + case KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE: + case KBASE_IOCTL_MEM_SYNC: + break; + + default: + ret = -1; + errno = ENOSYS; + } + + return ret; +} + +static void * +kbase_mmap(void *addr, size_t length, int prot, int flags, + int fd, off_t offset) +{ + switch (offset) { + case BASE_MEM_MAP_TRACKING_HANDLE: + case BASEP_MEM_CSF_USER_REG_PAGE_HANDLE: + case NOOP_COOKIE_ALLOC: + case NOOP_COOKIE_USER_IO: + case NOOP_COOKIE_MEM_ALLOC: + return mmap(NULL, length, prot, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + default: + errno = ENOSYS; + return MAP_FAILED; + } +} +#endif diff --git a/src/panfrost/base/pan_cache.h b/src/panfrost/base/pan_cache.h new file mode 100644 index 00000000000..ad5af0c7098 --- /dev/null +++ b/src/panfrost/base/pan_cache.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to 
use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef PAN_CACHE_H +#define PAN_CACHE_H + +#ifdef __aarch64__ + +static void +cache_clean(volatile void *addr) +{ + __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); +} + +static void +cache_invalidate(volatile void *addr) +{ + __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); +} + +typedef void (*cacheline_op)(volatile void *addr); + +#define CACHELINE_SIZE 64 + +static void +cacheline_op_range(volatile void *start, size_t length, cacheline_op op) +{ + volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); + volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); + for (; ptr < end; ptr += CACHELINE_SIZE) + op(ptr); +} + +static void +cache_clean_range(volatile void *start, size_t length) +{ + /* TODO: Do an invalidate at the start of the range? */ + cacheline_op_range(start, length, cache_clean); +} + +static void +cache_invalidate_range(volatile void *start, size_t length) +{ + cacheline_op_range(start, length, cache_invalidate); +} + +#endif /* __aarch64__ */ + +/* The #ifdef covers both 32-bit and 64-bit ARM */ +#ifdef __ARM_ARCH +static void +cache_barrier(void) +{ + __asm__ volatile ("dsb sy" ::: "memory"); +} + +static void +memory_barrier(void) +{ + __asm__ volatile ("dmb sy" ::: "memory"); +} +#else + +/* TODO: How to do cache barriers when emulated? */ +static void +cache_barrier(void) +{ +} + +static void +memory_barrier(void) +{ +} +#endif +#endif diff --git a/src/panfrost/base/pan_vX_base.c b/src/panfrost/base/pan_vX_base.c new file mode 100644 index 00000000000..99bd356c536 --- /dev/null +++ b/src/panfrost/base/pan_vX_base.c @@ -0,0 +1,1825 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_VALGRIND +#include +#else +#define RUNNING_ON_VALGRIND 0 +#endif + +#include "util/macros.h" +#include "util/list.h" +#include "util/u_atomic.h" +#include "util/os_file.h" + +#include "pan_base.h" +#include "pan_cache.h" + +#include "drm-uapi/panfrost_drm.h" + +#define PAN_BASE_API (PAN_BASE_VER & 0xff) +#if (PAN_BASE_VER & 0x100) == 0x100 +#define PAN_BASE_NOOP +#endif + +#if PAN_BASE_API >= 2 +#include "csf/mali_gpu_csf_registers.h" + +#define MALI_USE_CSF 1 +#endif + +#include "mali_kbase_gpuprops.h" + +#ifndef PAN_BASE_NOOP +#define kbase_mmap mmap +#endif + +#if PAN_BASE_API >= 1 +#include "mali_base_kernel.h" +#include "mali_kbase_ioctl.h" + +#ifdef PAN_BASE_NOOP +#include "pan_base_noop.h" +#else +#define kbase_ioctl ioctl +#endif +#else + +#include "old/mali-ioctl.h" +#include "old/mali-ioctl-midgard.h" +#include "old/mali-props.h" +#endif + +#define LOG(fmt, ...) do { \ + if (k->verbose) { \ + struct timespec tp; \ + clock_gettime(CLOCK_MONOTONIC_RAW, &tp); \ + printf("%"PRIu64".%09li\t" fmt, (uint64_t) tp.tv_sec, tp.tv_nsec __VA_OPT__(,) __VA_ARGS__); \ + } \ + } while (0) + +#if PAN_BASE_API == 0 +static int +kbase_ioctl(int fd, unsigned long request, ...) +{ + int ioc_size = _IOC_SIZE(request); + + assert(ioc_size); + + va_list args; + + va_start(args, request); + int *ptr = va_arg(args, void *); + va_end(args); + + *ptr = (_IOC_TYPE(request) - 0x80) * 256 + _IOC_NR(request); + + int ret = ioctl(fd, request, ptr); + if (ret) + return ret; + + int r = *ptr; + switch (r) { + case MALI_ERROR_OUT_OF_GPU_MEMORY: + errno = ENOSPC; + return -1; + case MALI_ERROR_OUT_OF_MEMORY: + errno = ENOMEM; + return -1; + case MALI_ERROR_FUNCTION_FAILED: + errno = EINVAL; + return -1; + default: + return 0; + } +} +#endif + +#if PAN_BASE_API >= 1 +static bool +kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + int i = 0; + uint64_t x = 0; + while (i < k->gpuprops_size) { + x = 0; + memcpy(&x, k->gpuprops + i, 4); + i += 4; + + int size = 1 << (x & 3); + int this_name = x >> 2; + + x = 0; + memcpy(&x, k->gpuprops + i, size); + i += size; + + if (this_name == name) { + *value = x; + return true; + } + } + + return false; +} +#else +static bool +kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + struct kbase_ioctl_gpu_props_reg_dump *props = k->gpuprops; + + switch (name) { + case KBASE_GPUPROP_PRODUCT_ID: + *value = props->core.product_id; + return true; + case KBASE_GPUPROP_RAW_SHADER_PRESENT: + *value = props->raw.shader_present; + return true; + case KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0: + *value = props->raw.texture_features[0]; + return true; + case KBASE_GPUPROP_RAW_TILER_FEATURES: + *value = props->raw.tiler_features; + return true; + case KBASE_GPUPROP_RAW_GPU_ID: + *value = props->raw.gpu_id; + return true; + default: + return false; + } +} +#endif + +static bool +alloc_handles(kbase k) +{ + util_dynarray_init(&k->gem_handles, NULL); + return true; +} + +static bool +free_handles(kbase k) +{ + util_dynarray_fini(&k->gem_handles); + return true; +} + +static bool +set_flags(kbase k) +{ + struct kbase_ioctl_set_flags 
flags = { + .create_flags = 0 + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_SET_FLAGS, &flags); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); + return false; + } + return true; +} + +static bool +mmap_tracking(kbase k) +{ + k->tracking_region = kbase_mmap(NULL, k->page_size, PROT_NONE, + MAP_SHARED, k->fd, + BASE_MEM_MAP_TRACKING_HANDLE); + + if (k->tracking_region == MAP_FAILED) { + perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); + k->tracking_region = NULL; + return false; + } + return true; +} + +static bool +munmap_tracking(kbase k) +{ + if (k->tracking_region) + return munmap(k->tracking_region, k->page_size) == 0; + return true; +} + +#if PAN_BASE_API >= 1 +static bool +get_gpuprops(kbase k) +{ + struct kbase_ioctl_get_gpuprops props = { 0 }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); + return false; + } else if (!ret) { + fprintf(stderr, "GET_GPUPROPS returned zero size\n"); + return false; + } + + k->gpuprops_size = ret; + k->gpuprops = calloc(k->gpuprops_size, 1); + + props.size = k->gpuprops_size; + props.buffer = (uint64_t)(uintptr_t) k->gpuprops; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); + return false; + } + + return true; +} +#else +static bool +get_gpuprops(kbase k) +{ + k->gpuprops = calloc(1, sizeof(struct kbase_ioctl_gpu_props_reg_dump)); + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GPU_PROPS_REG_DUMP, k->gpuprops); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GPU_PROPS_REG_DUMP)"); + return false; + } + + return true; +} +#endif + +static bool +free_gpuprops(kbase k) +{ + free(k->gpuprops); + return true; +} + +#if PAN_BASE_API >= 2 +static bool +mmap_user_reg(kbase k) +{ + k->csf_user_reg = kbase_mmap(NULL, k->page_size, PROT_READ, + MAP_SHARED, k->fd, + BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); + + if (k->csf_user_reg == MAP_FAILED) { + perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); + k->csf_user_reg = NULL; + return false; + } + return true; +} + +static bool +munmap_user_reg(kbase k) +{ + if (k->csf_user_reg) + return munmap(k->csf_user_reg, k->page_size) == 0; + return true; +} +#endif + +#if PAN_BASE_API >= 1 +static bool +init_mem_exec(kbase k) +{ + struct kbase_ioctl_mem_exec_init init = { + .va_pages = 0x100000, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); + return false; + } + return true; +} + +static bool +init_mem_jit(kbase k) +{ + struct kbase_ioctl_mem_jit_init init = { + .va_pages = 1 << 25, + .max_allocations = 255, + .phys_pages = 1 << 25, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_JIT_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); + return false; + } + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static struct base_ptr +kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags); + +static bool +alloc_event_mem(kbase k) +{ + k->event_mem = kbase_alloc(k, k->page_size * 2, + PANFROST_BO_NOEXEC, + BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | + BASE_MEM_SAME_VA | BASE_MEM_CSF_EVENT); + k->kcpu_event_mem = (struct base_ptr) { + .cpu = k->event_mem.cpu + k->page_size, + .gpu = k->event_mem.gpu + k->page_size, + }; + return k->event_mem.cpu; +} + +static bool +free_event_mem(kbase k) +{ + if (k->event_mem.cpu) + return munmap(k->event_mem.cpu, 
k->page_size * 2) == 0; + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static bool +cs_group_create(kbase k, struct kbase_context *c) +{ + /* TODO: What about compute-only contexts? */ + union kbase_ioctl_cs_queue_group_create_1_6 create = { + .in = { + /* Mali *still* only supports a single tiler unit */ + .tiler_mask = 1, + .fragment_mask = ~0ULL, + .compute_mask = ~0ULL, + + .cs_min = k->cs_queue_count, + + .priority = 1, + .tiler_max = 1, + .fragment_max = 64, + .compute_max = 64, + } + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); + return false; + } + + c->csg_handle = create.out.group_handle; + c->csg_uid = create.out.group_uid; + + /* Should be at least 1 */ + assert(c->csg_uid); + + return true; +} + +static bool +cs_group_term(kbase k, struct kbase_context *c) +{ + if (!c->csg_uid) + return true; + + struct kbase_ioctl_cs_queue_group_term term = { + .group_handle = c->csg_handle + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); + return false; + } + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static bool +tiler_heap_create(kbase k, struct kbase_context *c) +{ + c->tiler_heap_chunk_size = 1 << 21; /* 2 MB */ + + union kbase_ioctl_cs_tiler_heap_init init = { + .in = { + .chunk_size = c->tiler_heap_chunk_size, + .initial_chunks = 5, + .max_chunks = 200, + .target_in_flight = 65535, + } + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); + return false; + } + + c->tiler_heap_va = init.out.gpu_heap_va; + c->tiler_heap_header = init.out.first_chunk_va; + + return true; +} + +static bool +tiler_heap_term(kbase k, struct kbase_context *c) +{ + if (!c->tiler_heap_va) + return true; + + struct kbase_ioctl_cs_tiler_heap_term term = { + .gpu_heap_va = c->tiler_heap_va + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); + return false; + } + return true; +} +#endif + +typedef bool (* kbase_func)(kbase k); + +struct kbase_op { + kbase_func part; + kbase_func cleanup; + const char *label; +}; + +static struct kbase_op kbase_main[] = { + { alloc_handles, free_handles, "Allocate handle array" }, +#if PAN_BASE_API >= 1 + { set_flags, NULL, "Set flags" }, +#endif + { mmap_tracking, munmap_tracking, "Map tracking handle" }, +#if PAN_BASE_API == 0 + { set_flags, NULL, "Set flags" }, +#endif + { get_gpuprops, free_gpuprops, "Get GPU properties" }, +#if PAN_BASE_API >= 2 + { mmap_user_reg, munmap_user_reg, "Map user register page" }, +#endif +#if PAN_BASE_API >= 1 + { init_mem_exec, NULL, "Initialise EXEC_VA zone" }, + { init_mem_jit, NULL, "Initialise JIT allocator" }, +#endif +#if PAN_BASE_API >= 2 + { alloc_event_mem, free_event_mem, "Allocate event memory" }, +#endif +}; + +static void +kbase_close(kbase k) +{ + while (k->setup_state) { + unsigned i = k->setup_state - 1; + if (kbase_main[i].cleanup) + kbase_main[i].cleanup(k); + --k->setup_state; + } + + pthread_mutex_destroy(&k->handle_lock); + pthread_mutex_destroy(&k->event_read_lock); + pthread_mutex_destroy(&k->event_cnd_lock); + pthread_mutex_destroy(&k->queue_lock); + pthread_cond_destroy(&k->event_cnd); + + close(k->fd); +} + +static bool +kbase_get_pan_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + 
unsigned conv[] = { + [DRM_PANFROST_PARAM_GPU_PROD_ID] = KBASE_GPUPROP_PRODUCT_ID, + [DRM_PANFROST_PARAM_SHADER_PRESENT] = KBASE_GPUPROP_RAW_SHADER_PRESENT, + [DRM_PANFROST_PARAM_TEXTURE_FEATURES0] = KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, + [DRM_PANFROST_PARAM_THREAD_TLS_ALLOC] = KBASE_GPUPROP_TLS_ALLOC, + [DRM_PANFROST_PARAM_TILER_FEATURES] = KBASE_GPUPROP_RAW_TILER_FEATURES, + }; + + if (name < ARRAY_SIZE(conv) && conv[name]) + return kbase_get_mali_gpuprop(k, conv[name], value); + + switch (name) { + case DRM_PANFROST_PARAM_AFBC_FEATURES: + *value = 0; + return true; + case DRM_PANFROST_PARAM_GPU_REVISION: { + if (!kbase_get_mali_gpuprop(k, KBASE_GPUPROP_RAW_GPU_ID, value)) + return false; + *value &= 0xffff; + return true; + } + default: + return false; + } +} + +static void +kbase_free(kbase k, base_va va) +{ + struct kbase_ioctl_mem_free f = { + .gpu_addr = va + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_FREE, &f); + + if (ret == -1) + perror("ioctl(KBASE_IOCTL_MEM_FREE)"); +} + +static struct base_ptr +kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags) +{ + struct base_ptr r = {0}; + + unsigned pages = DIV_ROUND_UP(size, k->page_size); + + union kbase_ioctl_mem_alloc a = { + .in = { + .va_pages = pages, + .commit_pages = pages, + } + }; + + size_t alloc_size = size; + unsigned flags = mali_flags; + bool exec_align = false; + + if (!flags) { + flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | + BASE_MEM_SAME_VA; + + /* Add COHERENT_LOCAL to keep GPU cores coherent with each + * other. */ + if (PAN_BASE_API >= 1) + flags |= BASE_MEM_COHERENT_LOCAL; + } + + if (pan_flags & PANFROST_BO_HEAP) { + size_t align_size = 2 * 1024 * 1024 / k->page_size; /* 2 MB */ + + a.in.va_pages = ALIGN_POT(a.in.va_pages, align_size); + a.in.commit_pages = 0; + a.in.extension = align_size; + flags |= BASE_MEM_GROW_ON_GPF; + } + +#if PAN_BASE_API >= 1 + if (pan_flags & MALI_BO_CACHED_CPU) + flags |= BASE_MEM_CACHED_CPU; +#endif + +#if PAN_BASE_API >= 2 + if (pan_flags & MALI_BO_UNCACHED_GPU) + flags |= BASE_MEM_UNCACHED_GPU; +#endif + + if (!(pan_flags & PANFROST_BO_NOEXEC)) { + /* Using SAME_VA for executable BOs would make it too likely + * for a blend shader to end up on the wrong side of a 4 GB + * boundary. */ + flags |= BASE_MEM_PROT_GPU_EX; + flags &= ~(BASE_MEM_PROT_GPU_WR | BASE_MEM_SAME_VA); + + if (PAN_BASE_API == 0) { + /* Assume 4K pages */ + a.in.va_pages = 0x1000; /* Align shader BOs to 16 MB */ + size = 1 << 26; /* Four times the alignment */ + exec_align = true; + } + } + + a.in.flags = flags; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_ALLOC, &a); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); + return r; + } + + // TODO: Is this always true, even in the face of multithreading? + if (PAN_BASE_API == 0) + a.out.gpu_va = 0x41000; + + if ((flags & BASE_MEM_SAME_VA) && + !((a.out.flags & BASE_MEM_SAME_VA) && + a.out.gpu_va < 0x80000)) { + + fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", + (uint64_t) a.out.flags, (uint64_t) a.out.gpu_va); + errno = EINVAL; + return r; + } + + void *ptr = kbase_mmap(NULL, size, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, a.out.gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU BO)"); + kbase_free(k, a.out.gpu_va); + return r; + } + + uint64_t gpu_va = (a.out.flags & BASE_MEM_SAME_VA) ? 
+ (uintptr_t) ptr : a.out.gpu_va; + + if (exec_align) { + gpu_va = ALIGN_POT(gpu_va, 1 << 24); + + ptr = kbase_mmap(NULL, alloc_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU EXEC BO)"); + kbase_free(k, gpu_va); + return r; + } + } + + r.cpu = ptr; + r.gpu = gpu_va; + + return r; +} + +static int +kbase_import_dmabuf(kbase k, int fd) +{ + int ret; + + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + kbase_handle *handles = util_dynarray_begin(&k->gem_handles); + + for (unsigned i = 0; i < size; ++i) { + kbase_handle h = handles[i]; + + if (h.fd < 0) + continue; + + ret = os_same_file_description(h.fd, fd); + + if (ret == 0) { + pthread_mutex_unlock(&k->handle_lock); + return i; + } else if (ret < 0) { + printf("error in os_same_file_description(%i, %i)\n", h.fd, fd); + } + } + + int dup = os_dupfd_cloexec(fd); + + union kbase_ioctl_mem_import import = { + .in = { + .phandle = (uintptr_t) &dup, + .type = BASE_MEM_IMPORT_TYPE_UMM, + /* Usage flags: CPU/GPU reads/writes */ + .flags = 0xf, + } + }; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_IMPORT, &import); + + int handle; + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_IMPORT)"); + handle = -1; + } else if (import.out.flags & BASE_MEM_NEED_MMAP) { + uint64_t va = (uintptr_t) kbase_mmap(NULL, import.out.va_pages * k->page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, k->fd, import.out.gpu_va); + + if (va == (uintptr_t) MAP_FAILED) { + perror("mmap(IMPORTED BO)"); + handle = -1; + } else { + handle = kbase_alloc_gem_handle_locked(k, va, dup); + } + } else { + handle = kbase_alloc_gem_handle_locked(k, import.out.gpu_va, dup); + } + + pthread_mutex_unlock(&k->handle_lock); + + return handle; +} + +static void * +kbase_mmap_import(kbase k, base_va va, size_t size) +{ + return kbase_mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, k->fd, va); +} + +struct kbase_fence { + struct list_head link; + + unsigned slot; + uint64_t value; +}; + +struct kbase_syncobj { + struct list_head link; + + struct list_head fences; +}; + +static struct kbase_syncobj * +kbase_syncobj_create(kbase k) +{ + struct kbase_syncobj *o = calloc(1, sizeof(*o)); + list_inithead(&o->fences); + pthread_mutex_lock(&k->queue_lock); + list_add(&o->link, &k->syncobjs); + pthread_mutex_unlock(&k->queue_lock); + return o; +} + +static void +kbase_syncobj_destroy(kbase k, struct kbase_syncobj *o) +{ + pthread_mutex_lock(&k->queue_lock); + list_del(&o->link); + pthread_mutex_unlock(&k->queue_lock); + + list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { + list_del(&fence->link); + free(fence); + } + + free(o); +} + +static void +kbase_syncobj_add_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) +{ + struct kbase_fence *fence = calloc(1, sizeof(*fence)); + + fence->slot = slot; + fence->value = value; + + list_add(&fence->link, &o->fences); +} + +static void +kbase_syncobj_update_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) +{ + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { + if (fence->slot == slot) { + if (value > fence->value) + fence->value = value; + + return; + } + } + + kbase_syncobj_add_fence(o, slot, value); +} + +static struct kbase_syncobj * +kbase_syncobj_dup(kbase k, struct kbase_syncobj *o) +{ + struct kbase_syncobj *dup = kbase_syncobj_create(k); + + pthread_mutex_lock(&k->queue_lock); + + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) + 
kbase_syncobj_add_fence(dup, fence->slot, fence->value); + + pthread_mutex_unlock(&k->queue_lock); + + return dup; +} + +static void +kbase_syncobj_update(kbase k, struct kbase_syncobj *o) +{ + list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { + uint64_t value = k->event_slots[fence->slot].last; + + if (value > fence->value) { + LOG("syncobj %p slot %u value %"PRIu64" vs %"PRIu64"\n", + o, fence->slot, fence->value, value); + + list_del(&fence->link); + free(fence); + } + } +} + +static bool +kbase_syncobj_wait(kbase k, struct kbase_syncobj *o) +{ + if (list_is_empty(&o->fences)) { + LOG("syncobj has no fences\n"); + return true; + } + + struct kbase_wait_ctx wait = kbase_wait_init(k, 1 * 1000000000LL); + + while (kbase_wait_for_event(&wait)) { + kbase_syncobj_update(k, o); + + if (list_is_empty(&o->fences)) { + kbase_wait_fini(wait); + return true; + } + } + + kbase_wait_fini(wait); + + fprintf(stderr, "syncobj %p wait timeout\n", o); + return false; +} + +static bool +kbase_poll_event(kbase k, int64_t timeout_ns) +{ + struct pollfd pfd = { + .fd = k->fd, + .events = POLLIN, + }; + + struct timespec t = { + .tv_sec = timeout_ns / 1000000000, + .tv_nsec = timeout_ns % 1000000000, + }; + + int ret = ppoll(&pfd, 1, &t, NULL); + + if (ret == -1 && errno != EINTR) + perror("poll(mali fd)"); + + LOG("poll returned %i\n", pfd.revents); + + return ret != 0; +} + +#if PAN_BASE_API < 2 +static bool +kbase_handle_events(kbase k) +{ + struct base_jd_event_v2 event; + bool ret = true; + + for (;;) { + int ret = read(k->fd, &event, sizeof(event)); + + if (ret == -1) { + if (errno == EAGAIN) { + return true; + } else { + perror("read(mali fd)"); + return false; + } + } + + if (event.event_code != BASE_JD_EVENT_DONE) { + fprintf(stderr, "Atom %i reported event 0x%x!\n", + event.atom_number, event.event_code); + ret = false; + } + + pthread_mutex_lock(&k->handle_lock); + + k->event_slots[event.atom_number].last = event.udata.blob[0]; + + unsigned size = util_dynarray_num_elements(&k->gem_handles, + kbase_handle); + kbase_handle *handle_data = util_dynarray_begin(&k->gem_handles); + + struct util_dynarray *handles = k->atom_bos + event.atom_number; + + util_dynarray_foreach(handles, int32_t, h) { + if (*h >= size) + continue; + assert(handle_data[*h].use_count); + --handle_data[*h].use_count; + } + util_dynarray_fini(handles); + + pthread_mutex_unlock(&k->handle_lock); + } + + return ret; +} + +#else + +static bool +kbase_read_event(kbase k) +{ + struct base_csf_notification event; + int ret = read(k->fd, &event, sizeof(event)); + + if (ret == -1) { + if (errno == EAGAIN) { + return true; + } else { + perror("read(mali_fd)"); + return false; + } + } + + if (ret != sizeof(event)) { + fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", + ret, (int) sizeof(event)); + return false; + } + + switch (event.type) { + case BASE_CSF_NOTIFICATION_EVENT: + LOG("Notification event!\n"); + return true; + + case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: + break; + + case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: + fprintf(stderr, "No event from mali_fd!\n"); + return true; + + default: + fprintf(stderr, "Unknown event type!\n"); + return true; + } + + struct base_gpu_queue_group_error e = event.payload.csg_error.error; + + switch (e.error_type) { + case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue group error: status 0x%x " + "sideband 0x%"PRIx64"\n", + e.payload.fatal_group.status, + (uint64_t) 
e.payload.fatal_group.sideband); + break; + } + case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { + unsigned queue = e.payload.fatal_queue.csi_index; + + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue %i error: status 0x%x " + "sideband 0x%"PRIx64"\n", + queue, e.payload.fatal_queue.status, + (uint64_t) e.payload.fatal_queue.sideband); + + /* TODO: Decode the instruct that it got stuck at */ + + break; + } + + case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: + fprintf(stderr, "Command stream timeout!\n"); + break; + case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: + fprintf(stderr, "Command stream OOM!\n"); + break; + default: + fprintf(stderr, "Unknown error type!\n"); + } + + return false; +} + +static void +kbase_update_queue_callbacks(kbase k, + struct kbase_event_slot *slot, + uint64_t seqnum) +{ + struct kbase_sync_link **list = &slot->syncobjs; + struct kbase_sync_link **back = slot->back; + + while (*list) { + struct kbase_sync_link *link = *list; + + LOG("seq %"PRIu64" %"PRIu64"\n", seqnum, link->seqnum); + + /* Items in the list should be in order, there is no need to + * check any more if we can't process this link yet. */ + if (seqnum <= link->seqnum) + break; + + LOG("done, calling %p(%p)\n", link->callback, link->data); + link->callback(link->data); + *list = link->next; + if (&link->next == back) + slot->back = list; + free(link); + } +} + +static bool +kbase_handle_events(kbase k) +{ +#ifdef PAN_BASE_NOOP + return true; +#endif + + /* This will clear the event count, so there's no need to do it in a + * loop. */ + bool ret = kbase_read_event(k); + + uint64_t *event_mem = k->event_mem.cpu; + + pthread_mutex_lock(&k->queue_lock); + + for (unsigned i = 0; i < k->event_slot_usage; ++i) { + uint64_t seqnum = event_mem[i * 2]; + uint64_t cmp = k->event_slots[i].last; + + LOG("MAIN SEQ %"PRIu64" > %"PRIu64"?\n", seqnum, cmp); + + if (seqnum < cmp) { + if (false) + fprintf(stderr, "seqnum at offset %i went backward " + "from %"PRIu64" to %"PRIu64"!\n", + i, cmp, seqnum); + } else /*if (seqnum > cmp)*/ { + kbase_update_queue_callbacks(k, &k->event_slots[i], + seqnum); + } + + /* TODO: Atomic operations? */ + k->event_slots[i].last = seqnum; + } + + pthread_mutex_unlock(&k->queue_lock); + + return ret; +} + +#endif + +#if PAN_BASE_API < 2 +static uint8_t +kbase_latest_slot(uint8_t a, uint8_t b, uint8_t newest) +{ + /* If a == 4 and newest == 5, a will become 255 */ + a -= newest; + b -= newest; + a = MAX2(a, b); + a += newest; + return a; +} + +static int +kbase_submit(kbase k, uint64_t va, unsigned req, + struct kbase_syncobj *o, + int32_t *handles, unsigned num_handles) +{ + struct util_dynarray buf; + util_dynarray_init(&buf, NULL); + + memcpy(util_dynarray_resize(&buf, int32_t, num_handles), + handles, num_handles * sizeof(int32_t)); + + pthread_mutex_lock(&k->handle_lock); + + unsigned slot = (req & PANFROST_JD_REQ_FS) ? 0 : 1; + unsigned dep_slots[KBASE_SLOT_COUNT]; + + uint8_t nr = k->atom_number++; + + struct base_jd_atom_v2 atom = { + .jc = va, + .atom_number = nr, + .udata.blob[0] = k->job_seq++, + }; + + for (unsigned i = 0; i < KBASE_SLOT_COUNT; ++i) + dep_slots[i] = nr; + + /* Make sure that we haven't taken an atom that's already in use. 
*/ + assert(!k->atom_bos[nr].data); + k->atom_bos[atom.atom_number] = buf; + + unsigned handle_buf_size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + kbase_handle *handle_buf = util_dynarray_begin(&k->gem_handles); + + struct util_dynarray extres; + util_dynarray_init(&extres, NULL); + + /* Mark the BOs as in use */ + for (unsigned i = 0; i < num_handles; ++i) { + int32_t h = handles[i]; + assert(h < handle_buf_size); + assert(handle_buf[h].use_count < 255); + + /* Implicit sync */ + if (handle_buf[h].use_count) + for (unsigned s = 0; s < KBASE_SLOT_COUNT; ++s) + dep_slots[s] = + kbase_latest_slot(dep_slots[s], + handle_buf[h].last_access[s], + nr); + + handle_buf[h].last_access[slot] = nr; + ++handle_buf[h].use_count; + + if (handle_buf[h].fd != -1) + util_dynarray_append(&extres, base_va, handle_buf[h].va); + } + + pthread_mutex_unlock(&k->handle_lock); + + /* TODO: Better work out the difference between handle_lock and + * queue_lock. */ + if (o) { + pthread_mutex_lock(&k->queue_lock); + kbase_syncobj_update_fence(o, nr, atom.udata.blob[0]); + pthread_mutex_unlock(&k->queue_lock); + } + + assert(KBASE_SLOT_COUNT == 2); + if (dep_slots[0] != nr) { + atom.pre_dep[0].atom_id = dep_slots[0]; + /* TODO: Use data dependencies? */ + atom.pre_dep[0].dependency_type = BASE_JD_DEP_TYPE_ORDER; + } + if (dep_slots[1] != nr) { + atom.pre_dep[1].atom_id = dep_slots[1]; + atom.pre_dep[1].dependency_type = BASE_JD_DEP_TYPE_ORDER; + } + + if (extres.size) { + atom.core_req |= BASE_JD_REQ_EXTERNAL_RESOURCES; + atom.nr_extres = util_dynarray_num_elements(&extres, base_va); + atom.extres_list = (uintptr_t) util_dynarray_begin(&extres); + } + + if (req & PANFROST_JD_REQ_FS) + atom.core_req |= BASE_JD_REQ_FS; + else + atom.core_req |= BASE_JD_REQ_CS | BASE_JD_REQ_T; + + struct kbase_ioctl_job_submit submit = { + .nr_atoms = 1, + .stride = sizeof(atom), + .addr = (uintptr_t) &atom, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_JOB_SUBMIT, &submit); + + util_dynarray_fini(&extres); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_JOB_SUBMIT)"); + return -1; + } + + return atom.atom_number; +} + +#else +static struct kbase_context * +kbase_context_create(kbase k) +{ + struct kbase_context *c = calloc(1, sizeof(*c)); + + if (!cs_group_create(k, c)) { + free(c); + return NULL; + } + + if (!tiler_heap_create(k, c)) { + cs_group_term(k, c); + free(c); + return NULL; + } + + return c; +} + +static void +kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx); + +static void +kbase_context_destroy(kbase k, struct kbase_context *ctx) +{ + kbase_kcpu_queue_destroy(k, ctx); + tiler_heap_term(k, ctx); + cs_group_term(k, ctx); + free(ctx); +} + +static bool +kbase_context_recreate(kbase k, struct kbase_context *ctx) +{ + kbase_kcpu_queue_destroy(k, ctx); + tiler_heap_term(k, ctx); + cs_group_term(k, ctx); + + if (!cs_group_create(k, ctx)) { + free(ctx); + return false; + } + + if (!tiler_heap_create(k, ctx)) { + free(ctx); + return false; + } + + return true; +} + +static struct kbase_cs +kbase_cs_bind_noevent(kbase k, struct kbase_context *ctx, + base_va va, unsigned size, unsigned csi) +{ + struct kbase_cs cs = { + .ctx = ctx, + .va = va, + .size = size, + .csi = csi, + .latest_flush = (uint32_t *)k->csf_user_reg, + }; + + struct kbase_ioctl_cs_queue_register reg = { + .buffer_gpu_addr = va, + .buffer_size = size, + .priority = 1, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_REGISTER, ®); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)"); + return cs; + } + + 
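+        /* Registering the queue only tells the kernel about the ring
+         * buffer; the queue also has to be bound to a queue group, and the
+         * mmap handle returned by the bind ioctl is what gets mapped below
+         * to reach the user I/O pages (doorbell and insert/extract
+         * pointers).
+         */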
union kbase_ioctl_cs_queue_bind bind = { + .in = { + .buffer_gpu_addr = va, + .group_handle = ctx->csg_handle, + .csi_index = csi, + } + }; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)"); + // hack + cs.user_io = (void *)1; + return cs; + } + + cs.user_io = + kbase_mmap(NULL, + k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, bind.out.mmap_handle); + + if (cs.user_io == MAP_FAILED) { + perror("mmap(CS USER IO)"); + cs.user_io = NULL; + } + + return cs; +} + +static struct kbase_cs +kbase_cs_bind(kbase k, struct kbase_context *ctx, + base_va va, unsigned size) +{ + struct kbase_cs cs = kbase_cs_bind_noevent(k, ctx, va, size, ctx->num_csi++); + + // TODO: Fix this problem properly + if (k->event_slot_usage >= 256) { + fprintf(stderr, "error: Too many contexts created!\n"); + + /* *very* dangerous, but might just work */ + --k->event_slot_usage; + } + + // TODO: This is a misnomer... it isn't a byte offset + cs.event_mem_offset = k->event_slot_usage++; + k->event_slots[cs.event_mem_offset].back = + &k->event_slots[cs.event_mem_offset].syncobjs; + + uint64_t *event_data = k->event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; + + /* We use the "Higher" wait condition, so initialise to 1 to allow + * waiting before writing... */ + event_data[0] = 1; + /* And reset the error field to 0, to avoid INHERITing faults */ + event_data[1] = 0; + + /* Just a zero-init is fine... reads and writes are always paired */ + uint64_t *kcpu_data = k->kcpu_event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; + kcpu_data[0] = 0; + kcpu_data[1] = 0; + + /* To match the event data */ + k->event_slots[cs.event_mem_offset].last = 1; + k->event_slots[cs.event_mem_offset].last_submit = 1; + + return cs; +} + +static void +kbase_cs_term(kbase k, struct kbase_cs *cs) +{ + if (cs->user_io) { + LOG("unmapping %p user_io %p\n", cs, cs->user_io); + munmap(cs->user_io, + k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES); + } + + struct kbase_ioctl_cs_queue_terminate term = { + .buffer_gpu_addr = cs->va, + }; + + kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_TERMINATE, &term); + + pthread_mutex_lock(&k->queue_lock); + kbase_update_queue_callbacks(k, &k->event_slots[cs->event_mem_offset], + ~0ULL); + + k->event_slots[cs->event_mem_offset].last = ~0ULL; + + /* Make sure that no syncobjs are referencing this CS */ + list_for_each_entry(struct kbase_syncobj, o, &k->syncobjs, link) + kbase_syncobj_update(k, o); + + + k->event_slots[cs->event_mem_offset].last = 0; + pthread_mutex_unlock(&k->queue_lock); +} + +static void +kbase_cs_rebind(kbase k, struct kbase_cs *cs) +{ + struct kbase_cs new; + new = kbase_cs_bind_noevent(k, cs->ctx, cs->va, cs->size, cs->csi); + + cs->user_io = new.user_io; + LOG("remapping %p user_io %p\n", cs, cs->user_io); + + fprintf(stderr, "bound csi %i again\n", cs->csi); +} + +static bool +kbase_cs_kick(kbase k, struct kbase_cs *cs) +{ + struct kbase_ioctl_cs_queue_kick kick = { + .buffer_gpu_addr = cs->va, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); + return false; + } + + return true; +} + +#define CS_RING_DOORBELL(cs) \ + *((uint32_t *)(cs->user_io)) = 1 + +#define CS_READ_REGISTER(cs, r) \ + *((uint64_t *)(cs->user_io + 4096 * 2 + r)) + +#define CS_WRITE_REGISTER(cs, r, v) \ + *((uint64_t *)(cs->user_io + 4096 + r)) = v + +static bool +kbase_cs_submit(kbase k, struct kbase_cs *cs, uint64_t 
insert_offset, + struct kbase_syncobj *o, uint64_t seqnum) +{ + LOG("submit %p, seq %"PRIu64", insert %"PRIu64" -> %"PRIu64"\n", + cs, seqnum, cs->last_insert, insert_offset); + + if (!cs->user_io) + return false; + + if (insert_offset == cs->last_insert) + return true; + +#ifndef PAN_BASE_NOOP + struct kbase_event_slot *slot = + &k->event_slots[cs->event_mem_offset]; + + pthread_mutex_lock(&k->queue_lock); + slot->last_submit = seqnum + 1; + + if (o) + kbase_syncobj_update_fence(o, cs->event_mem_offset, seqnum); + pthread_mutex_unlock(&k->queue_lock); +#endif + + memory_barrier(); + + bool active = CS_READ_REGISTER(cs, CS_ACTIVE); + LOG("active is %i\n", active); + + CS_WRITE_REGISTER(cs, CS_INSERT, insert_offset); + cs->last_insert = insert_offset; + + if (false /*active*/) { + memory_barrier(); + CS_RING_DOORBELL(cs); + memory_barrier(); + + active = CS_READ_REGISTER(cs, CS_ACTIVE); + LOG("active is now %i\n", active); + } else { + kbase_cs_kick(k, cs); + } + + return true; +} + +static bool +kbase_cs_wait(kbase k, struct kbase_cs *cs, uint64_t extract_offset, + struct kbase_syncobj *o) +{ + if (!cs->user_io) + return false; + + if (kbase_syncobj_wait(k, o)) + return true; + + uint64_t e = CS_READ_REGISTER(cs, CS_EXTRACT); + unsigned a = CS_READ_REGISTER(cs, CS_ACTIVE); + + fprintf(stderr, "CSI %i CS_EXTRACT (%"PRIu64") != %"PRIu64", " + "CS_ACTIVE (%i)\n", + cs->csi, e, extract_offset, a); + + fprintf(stderr, "fences:\n"); + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { + fprintf(stderr, " slot %i: seqnum %"PRIu64"\n", + fence->slot, fence->value); + } + + return false; +} + +static bool +kbase_kcpu_queue_create(kbase k, struct kbase_context *ctx) +{ +#ifdef PAN_BASE_NOOP + return false; +#endif + + if (ctx->kcpu_init) + return true; + + struct kbase_ioctl_kcpu_queue_new create = {0}; + + int ret; + ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_CREATE, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_CREATE)"); + return false; + } + + ctx->kcpu_queue = create.id; + ctx->kcpu_init = true; + return true; +} + +static void +kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx) +{ + if (!ctx->kcpu_init) + return; + + struct kbase_ioctl_kcpu_queue_delete destroy = { + .id = ctx->kcpu_queue, + }; + + int ret; + ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_DELETE, &destroy); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_DELETE)"); + } + + ctx->kcpu_init = false; +} + +static bool +kbase_kcpu_command(kbase k, struct kbase_context *ctx, struct base_kcpu_command *cmd) +{ + int err; + bool ret = true; + + if (!kbase_kcpu_queue_create(k, ctx)) + return false; + + struct kbase_ioctl_kcpu_queue_enqueue enqueue = { + .addr = (uintptr_t) cmd, + .nr_commands = 1, + .id = ctx->kcpu_queue, + }; + + err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); + if (err != -1) + return ret; + + /* If the enqueue failed, probably we hit the limit of enqueued + * commands (256), wait a bit and try again. 
+ */ + + struct kbase_wait_ctx wait = kbase_wait_init(k, 1000000000); + while (kbase_wait_for_event(&wait)) { + err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); + if (err != -1) + break; + + if (errno != EBUSY) { + ret = false; + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_ENQUEUE"); + break; + } + } + kbase_wait_fini(wait); + + return ret; +} + +static int +kbase_kcpu_fence_export(kbase k, struct kbase_context *ctx) +{ + struct base_fence fence = { + .basep.fd = -1, + }; + + struct base_kcpu_command fence_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + .info.fence.fence = (uintptr_t) &fence, + }; + + return kbase_kcpu_command(k, ctx, &fence_cmd) ? fence.basep.fd : -1; +} + +static bool +kbase_kcpu_fence_import(kbase k, struct kbase_context *ctx, int fd) +{ + struct base_kcpu_command fence_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + .info.fence.fence = (uintptr_t) &(struct base_fence) { + .basep.fd = fd, + }, + }; + + return kbase_kcpu_command(k, ctx, &fence_cmd); +} + +static bool +kbase_kcpu_cqs_set(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value) +{ + struct base_kcpu_command set_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + .info.cqs_set_operation = { + .objs = (uintptr_t) &(struct base_cqs_set_operation_info) { + .addr = addr, + .val = value, + .operation = BASEP_CQS_SET_OPERATION_SET, + .data_type = BASEP_CQS_DATA_TYPE_U64, + }, + .nr_objs = 1, + }, + }; + + return kbase_kcpu_command(k, ctx, &set_cmd); +} + +static bool +kbase_kcpu_cqs_wait(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value) +{ + struct base_kcpu_command wait_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + .info.cqs_wait_operation = { + .objs = (uintptr_t) &(struct base_cqs_wait_operation_info) { + .addr = addr, + .val = value, + .operation = BASEP_CQS_WAIT_OPERATION_GT, + .data_type = BASEP_CQS_DATA_TYPE_U64, + }, + .nr_objs = 1, + .inherit_err_flags = 0, + }, + }; + + return kbase_kcpu_command(k, ctx, &wait_cmd); +} +#endif + +// TODO: Only define for CSF kbases? +static bool +kbase_callback_all_queues(kbase k, int32_t *count, + void (*callback)(void *), void *data) +{ + pthread_mutex_lock(&k->queue_lock); + + int32_t queue_count = 0; + + for (unsigned i = 0; i < k->event_slot_usage; ++i) { + struct kbase_event_slot *slot = &k->event_slots[i]; + + /* There is no need to do anything for idle slots */ + if (slot->last == slot->last_submit) + continue; + + struct kbase_sync_link *link = malloc(sizeof(*link)); + *link = (struct kbase_sync_link) { + .next = NULL, + .seqnum = slot->last_submit, + .callback = callback, + .data = data, + }; + + // TODO: Put insertion code into its own function + struct kbase_sync_link **list = slot->back; + slot->back = &link->next; + assert(!*list); + *list = link; + + ++queue_count; + } + + p_atomic_add(count, queue_count); + + pthread_mutex_unlock(&k->queue_lock); + + return queue_count != 0; +} + +static void +kbase_mem_sync(kbase k, base_va gpu, void *cpu, size_t size, + bool invalidate) +{ +#ifdef __aarch64__ + /* Valgrind replaces the operations with DC CVAU, which is not enough + * for CPU<->GPU coherency. The ioctl can be used instead. */ + if (!RUNNING_ON_VALGRIND) { + /* I don't that memory barriers are needed here... having the + * DMB SY before submit should be enough. TODO what about + * dma-bufs? 
*/ + if (invalidate) + cache_invalidate_range(cpu, size); + else + cache_clean_range(cpu, size); + return; + } +#endif + + struct kbase_ioctl_mem_sync sync = { + .handle = gpu, + .user_addr = (uintptr_t) cpu, + .size = size, + .type = invalidate + (PAN_BASE_API == 0 ? 0 : 1), + }; + + int ret; + ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_SYNC, &sync); + if (ret == -1) + perror("ioctl(KBASE_IOCTL_MEM_SYNC)"); +} + +bool +#if defined(PAN_BASE_NOOP) +kbase_open_csf_noop +#elif PAN_BASE_API == 0 +kbase_open_old +#elif PAN_BASE_API == 1 +kbase_open_new +#elif PAN_BASE_API == 2 +kbase_open_csf +#endif +(kbase k) +{ + k->api = PAN_BASE_API; + + pthread_mutex_init(&k->handle_lock, NULL); + pthread_mutex_init(&k->event_read_lock, NULL); + pthread_mutex_init(&k->event_cnd_lock, NULL); + pthread_mutex_init(&k->queue_lock, NULL); + + pthread_condattr_t attr; + pthread_condattr_init(&attr); + pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + pthread_cond_init(&k->event_cnd, &attr); + pthread_condattr_destroy(&attr); + + list_inithead(&k->syncobjs); + + /* For later APIs, we've already checked the version in pan_base.c */ +#if PAN_BASE_API == 0 + struct kbase_ioctl_get_version ver = { 0 }; + kbase_ioctl(k->fd, KBASE_IOCTL_GET_VERSION, &ver); +#endif + + k->close = kbase_close; + + k->get_pan_gpuprop = kbase_get_pan_gpuprop; + k->get_mali_gpuprop = kbase_get_mali_gpuprop; + + k->alloc = kbase_alloc; + k->free = kbase_free; + k->import_dmabuf = kbase_import_dmabuf; + k->mmap_import = kbase_mmap_import; + + k->poll_event = kbase_poll_event; + k->handle_events = kbase_handle_events; + +#if PAN_BASE_API < 2 + k->submit = kbase_submit; +#else + k->context_create = kbase_context_create; + k->context_destroy = kbase_context_destroy; + k->context_recreate = kbase_context_recreate; + + k->cs_bind = kbase_cs_bind; + k->cs_term = kbase_cs_term; + k->cs_rebind = kbase_cs_rebind; + k->cs_submit = kbase_cs_submit; + k->cs_wait = kbase_cs_wait; + + k->kcpu_fence_export = kbase_kcpu_fence_export; + k->kcpu_fence_import = kbase_kcpu_fence_import; + k->kcpu_cqs_set = kbase_kcpu_cqs_set; + k->kcpu_cqs_wait = kbase_kcpu_cqs_wait; +#endif + + k->syncobj_create = kbase_syncobj_create; + k->syncobj_destroy = kbase_syncobj_destroy; + k->syncobj_dup = kbase_syncobj_dup; + k->syncobj_wait = kbase_syncobj_wait; + + k->callback_all_queues = kbase_callback_all_queues; + + k->mem_sync = kbase_mem_sync; + + for (unsigned i = 0; i < ARRAY_SIZE(kbase_main); ++i) { + ++k->setup_state; + if (!kbase_main[i].part(k)) { + k->close(k); + return false; + } + } + return true; +} diff --git a/src/panfrost/csf_test/interpret.py b/src/panfrost/csf_test/interpret.py new file mode 100644 index 00000000000..081d32d94c9 --- /dev/null +++ b/src/panfrost/csf_test/interpret.py @@ -0,0 +1,1820 @@ +#!/usr/bin/env python3 + +import os +import re +import struct +import subprocess +import sys + +try: + py_path = os.path.dirname(os.path.realpath(__file__)) + "/../bifrost/valhall" +except: + py_path = "../bifrost/valhall" + +if py_path not in sys.path: + sys.path.insert(0, py_path) + +import asm +import struct + +def ff(val): + return struct.unpack("=f", struct.pack("=I", val))[0] + +def ii(val): + return struct.unpack("=I", struct.pack("=f", val))[0] + +shaders = { + "atomic": """ +IADD_IMM.i32.reconverge r0, 0x0, #0x0 +NOP.wait0 +ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 +BRANCHZ.eq.reconverge ^r1.h0, offset:1 +BRANCHZ.eq 0x0, offset:3 +ATOM1_RETURN.i32.slot0.ainc @r1, u0, offset:0x0 +IADD_IMM.i32 r0, ^r0, #0x1 +BRANCHZ.eq.reconverge 0x0, offset:-7 +NOP.end +""", + 
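+    # Roughly: "atomic" above loops u2 times, atomically incrementing the
+    # 32-bit word at the address held in u0 (ATOM1_RETURN.ainc), while
+    # "rmw" below performs the same update as a plain load/add/store, so
+    # the two can be compared for lost updates.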
"rmw": """ +IADD_IMM.i32.reconverge r0, 0x0, #0x0 +ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 +BRANCHZ.eq.reconverge r1.h0, offset:1 +BRANCHZ.eq 0x0, offset:6 +NOP.wait1 +LOAD.i32.unsigned.slot0.wait0 @r1, u0, offset:0 +IADD_IMM.i32 r1, ^r1, #0x1 +STORE.i32.slot1 @r1, u0, offset:0 +IADD_IMM.i32 r0, ^r0, #0x1 +BRANCHZ.eq.reconverge 0x0, offset:-9 +NOP.end +""", + "global_invocation": """ +IADD_IMM.i32 r0, ^r60, #0x1 +STORE.i32.slot0.end @r0, u0, offset:0 +""", + "invoc_offset": """ +LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 +IADD.s32 r0, u0, ^r0 +ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 +IADD.s32 r1, ^r1, u1 +MOV.i32 r2, u2 +STORE.i32.slot0.end @r2, ^r0, offset:0 +""", + "invoc_rmw": """ +LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 +IADD.s32 r0, u0, ^r0 +ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 +IADD.s32 r1, ^r1, u1 +LOAD.i32.unsigned.slot0.wait0 @r2, r0, offset:0 +IADD.s32 r2, ^r2, u2 +STORE.i32.slot1.end @r2, ^r0, offset:0 +""", + + "preframe": """ +U16_TO_U32.discard r0, r59.h00 +U16_TO_U32 r1, ^r59.h10 +IADD_IMM.i32 r2, 0x0, #0x1 +IADD_IMM.i32 r3, 0x0, #0x0 +TEX_FETCH.slot0.skip.f.32.2d.wait @r4:r5:r6:r7, @r0:r1, ^r2 +FADD.f32 r4, ^r4, 0x40490FDB +FADD.f32 r5, ^r5, 0x40490FDB +BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 +""", + + + "position": """ +LEA_BUF_IMM.slot0.wait0 @r4:r5, r59, table:0xD, index:0x0 +#BRANCHZI.absolute 0x1000000, ^r4 +# position of 16384 +IADD_IMM.i32 r2, 0x0, #0x0e +# position of 16 +IADD_IMM.i32 r2, 0x0, #0x04 +LSHIFT_OR.i32 r0, 0x03020100.b1, r2, 0x0 +LSHIFT_AND.i32 r0, r60, r2, ^r0 +IADD_IMM.i32 r1, 0x0, #0x01 +RSHIFT_AND.i32 r1, r60, 0x03020100.b11, ^r1 +LSHIFT_OR.i32 r1, ^r1, ^r2, 0x0 +S32_TO_F32 r0, ^r0 +S32_TO_F32 r1, ^r1 + +RSHIFT_OR.i32 r2, ^r60, 0x03020100.b22, 0x0 +S32_TO_F32 r2, ^r2 +FADD.f32 r0, ^r0, r2.neg +#FADD.f32 r1, ^r1, ^r2 +S32_TO_F32 r2, ^r60 +#MOV.i32 r1, 0x0 + +FADD.f32 r0, ^r0, 0x40490FDB +FADD.f32 r1, ^r1, 0x40490FDB +#FMA.f32 r2, ^r2, 0x3DCCCCCD, 0x0 +MOV.i32 r2, 0x3DCCCCCD +MOV.i32 r3, 0x0 + +#STORE.i128.slot0 @r0:r1:r2:r3, thread_local_pointer, offset:0 + +IADD_IMM.i32 r8, 0x0, #0x00004000 +STORE.i16.istream.slot0 @r8, r4, offset:64 + +STORE.i128.istream.slot0 @r0:r1:r2:r3, r4, offset:0 +STORE.i128.slot0.end @r0:r1:r2:r3, ^r4, offset:0x7000 +""", + + "fragment": """ +ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, u0, offset:0 +IADD_IMM.i32 r1, 0x0, #0x1ff +LSHIFT_AND.i32 r0, ^r0, 0x0, ^r1 +SHADDX.u64 r2, u2, ^r0.w0, shift:0x2 +STORE.i32.slot0.wait0 @r59, ^r2, offset:0 + +IADD_IMM.i32 r4, 0x0, #0x3f100000 +IADD_IMM.i32 r5, 0x0, #0x3f400000 +IADD_IMM.i32 r6, 0x0, #0x3f300000 +IADD_IMM.i32 r7, 0x0, #0x32cccccd +BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 +""", + +} + +flg = 0xf +#flg = 0x20000f # Uncached! 
+ +HEAP_SIZE = 1024 * 1024 + +memory = { + "ev": (8192, 0x8200f), + "x": 1024 * 1024, + "y": 4096, + "ls_alloc": 4096, + "occlusion": 4096, + + "ssbo": 4096, + "tls": 4096, + + #"plane_0": (256 * 256 * 32, 0x380f), # 2 MB + "plane_0": (256 * 256 * 32, 0x280f), # 2 MB + + "idk": HEAP_SIZE, + "heap": HEAP_SIZE, +} + +w = 0xffffffff + +# Words are 32-bit, apart from address references +descriptors = { + "shader": [0x118, 1 << 12, "invoc_rmw"], + "ls": [3, 31, "ls_alloc"], + "fau": [("ssbo", 0), ("ssbo", 16)], + "fau2": [("ev", 8 + (0 << 34)), 7, 0], + + "tiler_heap": [ + 0x029, 1 << 21, #HEAP_SIZE, + 0x1000, 0x60, 0x1040, 0x60, 0x1000 + (1 << 21), 0x60 + #"heap", ("heap", 64), ("heap", HEAP_SIZE), + ], + +} | { + x: [ + 0, 0, + # Hierarchy mask, + # Single-sampled + # Last provoking vertex + 0x6 | (0 << 18), + 0x00ff00ff, + # Layer + 0, 0, + "tiler_heap", + ("idk", 0x10), + #("tiler_heap", -0xfff0), + # "Weights" + ] + ([0] * (32 - 10)) + [ + # "State" + 0, + 31, + 0, + 0x10000000, + ] for x in ("tiler_ctx", "tiler_ctx2", "tiler_ctx3") +} | { + + "thread_storage": [ + 1, 31, + "tls", + 0, 0, + ], + + # Preload r59/r60 + "preframe_shader": [0x128, 3 << 11, "preframe"], + "position_shader": [0x138, 3 << 11, "position"], + "fragment_shader": [0x128, 3 << 11, "fragment"], + + "idvs_zs": [ + 0x70077, # Depth/stencil type, Always for stencil tests + 0, 0, # Stencil state + 0, # unk + # Depth source minimum, write disabled + # [0, 1] Depth clamp + # Depth function: Always + (1 << 23) | (7 << 29), + 0, # Depth units + 0, # Depth factor + 0, # Depth bias clamp + ], + + "preframe_zs": [ + 0x70077, # Depth/stencil type, Always for stencil tests + 0, 0, # Stencil state + 0, # unk + # Depth source minimum, write disabled + # [0, 1] Depth clamp + # Depth function: Always + (1 << 23) | (7 << 29), + 0, # Depth units + 0, # Depth factor + 0, # Depth bias clamp + ], + + "idvs_blend": [ + # Load dest, enable + 1 | (1 << 9), + # RGB/Alpha: Src + Zero * Src + # All channels + ((2 | (2 << 4) | (1 << 8)) * 0x1001) | (0xf << 28), + # Fixed function blending, four components + 2 | (3 << 3), + # RGBA8 TB pixel format / F32 register format + 0 | (237 << 12) | (0 << 22) | (1 << 24), + ], + + "preframe_blend": [ + # Load dest, enable + 1 | (1 << 9), + # RGB/Alpha: Src + Zero * Src + # All channels + ((2 | (2 << 4) | (1 << 8)) * 0x1001) | (0xf << 28), + # Fixed function blending, four components + 2 | (3 << 3), + # RGBA8 TB pixel format / F32 register format + 0 | (237 << 12) | (0 << 22) | (1 << 24), + ], + + "preframe_surface": [ + # Plane descriptor, generic, tiled, RAW32 clump format + 10 | (1 << 4) | (1 << 8) | (2 << 24), + 256 * 256 * 4, + "plane_0", + 0, + 0, 0, + 0, # was 15, + ], + + "preframe_table": [ + # Texture descriptor, 2D, format + 2 | (2 << 4) | (187 << (10 + 12)), + # Width, height + 255 | (255 << 16), + # Swizzle, interleave + 1672 | (1 << 12), + 0, + "preframe_surface", + 0, 0, + + # Sampler descriptor, clamp to edge + 1 | (9 << 8) | (9 << 12) | (9 << 16), + 0, 0, 0, 0, 0, 0, 0, + ], + + "preframe_resources": [ + ("preframe_table", (1 << (32 + 24))), 0x40, 0, + ], + + "dcds": [ + # Clean fragment write, primitive barrier + (1 << 9) | (1 << 10), + # Sample mask of 0xffff, RT mask of 1 + 0x1ffff, + 0, 0, # vertex array + 0, 0, # unk + 0, 0x3f800000, # min/max depth + 0, 0, # unk + "preframe_zs", # depth/stencil + ("preframe_blend", 1), # blend (count == 1) + 0, 0, # occlusion + + # Shader environment: + 0, # Attribute offset + 2, # FAU count + 0, 0, 0, 0, 0, 0, # unk + ("preframe_resources", 1), # 
Resources + "preframe_shader", # Shader + 0, 0, # Thread storage + "fau", # FAU + ], + + "framebuffer": [ + 1, 0, # Pre/post, downscale, layer index + 0x10000, 0, # Argument + "ls_alloc", # Sample locations + "dcds", # DCDs + 0x00ff00ff, # width / height + 0, 0x00ff00ff, # bound min/max + # 32x32 tile size + # 4096 byte buffer allocation (maybe?) + (10 << 9) | (4 << 24), + 0, # Disable S, ZS/CRC, Empty Tile, CRC + 0, # Z Clear + "tiler_ctx", # Tiler + + # Framebuffer padding + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + # Render target + # R8G8B8A8 internal format + (1 << 26), + # Write Enable + # R8G8B8A8 colour format + # Linear block format + # 0123 swizzle + # Clean pixel write enable + 1 | (19 << 3) | (1 << 8) | (0o3210 << 16) | (1 << 31), + + # AFBC overlay + # No YTR, no split, no wide, no reverse, no front, no alpha + # RGBA8 compression mode + 0 | (10 << 10), + 0, 0, 0, 0, 0, + + # RT Buffer + "plane_0", + 256 * 4 * 16, # Row stride (for tiling) + 0x400, # Surface stride / Body offset + + # RT Clear + 0x2e234589, 0, 0, 0, + ], + + "index_buffer": [ + 0, 1, 2, + 0, 2, 1, + 1, 0, 2, + 1, 2, 0, + 2, 0, 1, + 2, 1, 0, + + #63, 64, 65, + 1, 2, 3, + 4, 5, 6, + 12, 13, 14, + 0, 1, 2, + 4, 5, 6, + 8, 9, 10, + 3, 4, 5, + ], + + "point_index": [x * 4 for x in range(32)] + [ + 0, 64, 440, 0, + ], + + "position_data": [ + ii(10.0), ii(10.0), ii(1.0), ii(1.0), + ], +} + +# TODO: Use mako? Or just change the syntax for "LDM/STM" +# and use f-strings again? + +cmds = """ +!cs 0 +resources fragment + +@ Bound min +mov w2a, i16:0,0 +@ Bound max +mov w2b, i16:255,255 +mov x28, $framebuffer+1 + +slot 2 + +fragment + +mov w4a, #0x0 +UNK 02 24, #0x4a0000ff0211 +wait 1 + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!raw sleep 20 +!memset plane_0 0 0 262144 +!raw sleep 200 +!dump plane_0 0 12 +!heatmap plane_0 0 262144 gran 4096 len 32768 stride 32768 +""" + +altcmds = """ +!cs 0 + +@ Some time is required for the change to become active +@ Just submitting a second job appears to be enough +resources compute fragment tiler idvs +mov x48, #0x6000000000 +heapctx x48 + +!cs 0 + +slot 3 +wait 3 +heapinc vt_start + +@ Base vertex count +mov w24, 0 +@ Instance count +mov w22, 1 + +@ Vertex attribute stride +mov x30, 0 + +@ Primitive +mov w38, 0x430000 +@@ Draw +@ Pixel kill etc. +@ Enable occlusion query +@mov w39, 0xc000 +mov w39, 0 +@ Unk... +mov w26, 0x1000 +@ Sample mask / render target mask +mov w3a, 0x1ffff +@ Min/max Z +mov w2c, float:0 +mov w2d, float:1.0 +@ Depth/stencil +mov x34, $idvs_zs +@ Blend +mov x32, $idvs_blend+1 +@ Occlusion +mov x2e, $occlusion + +@ Primitive size +mov x3c, float:3.75 +@ Fragment shader environment +mov x14, $fragment_shader +@ FAU count == 2 +movp x0c, $fau+0x0200000000000000 + +@ Position shader environment +mov x10, $position_shader + +mov x18, $thread_storage + +@ is this right?! "Vertex attribute stride" apparently? 
+@ that was for pure tiler jobs, for idvs it messes up points/lines +@ for some reason +@mov x30, $position_data + +@ Tiler +mov x28, $tiler_ctx + +@ Scissor min +mov w2a, i16:0,0 +@ Scissor max +mov w2b, i16:255,255 + +mov w21, 18 +mov w27, 4096 +mov x36, $index_buffer + +idvs 0x4002, mode triangles, index uint32 + +mov w21, 1 @36 +mov w27, 4096 +mov x36, $point_index + +@idvs 0x4a42, mode points, index uint32 + +mov w21, 400000 +mov w21, 18 +@idvs 0x4a42, mode triangles, index none + +@idvs 0x4a42, mode points, index none +@idvs 0x4a42, mode line-loop, index none + +flush_tiler +wait 3 +heapinc vt_end + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +UNK 00 24, #0x5f0000000233 +wait all + +!dump64 tiler_heap 0 4096 +@!dump idk 0 1048576 +@!dump position_data 0 4096 + +!cs 0 + +UNK 00 24, #0x5f0000000233 +wait all + +slot 4 +wait 4 +heapinc vt_start + +mov x28, $tiler_ctx2 +idvs 0x4002, mode triangles, index none +flush_tiler +wait 4 +heapinc vt_end + +UNK 00 24, #0x5f0000000233 +wait all + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!dump64 tiler_heap 0 4096 + +!cs 0 + +mov x50, $ev + +@ Bound min +mov w2a, i16:0,0 +@ Bound max +mov w2b, i16:255,255 +mov x28, $framebuffer+1 +@ Tile enable map +mov x2c, $x +mov x2e, 64 + +mov w40, 1 +str w40, [x2c] +@str w40, [x2c, 128] + +@ Use tile enable map +@fragment tem 1 + +fragment + +@ Does this actually do anytihng? +mov x48, $tiler_ctx +ldr x4a, [x48, 40] +ldr x4c, [x48, 48] +wait 0,4 +UNK 02 0b, 0x4a4c00100001 + +mov x48, $tiler_ctx2 +ldr x4a, [x48, 40] +ldr x4c, [x48, 48] +wait 0,4 +UNK 02 0b, 0x4a4c00100001 + +UNK 02 24, #0x5f0000f80211 +@UNK 00 24, #0x5f0000000233 +wait 1 + +mov x54, $plane_0 +ldr x56, [x54] +wait 0 + +mov x52, $y +str x56, [x52] + +evstr w5f, [x50], unk 0xfd, irq + +!raw td +!fdump heap 0 1048576 +!tiler heap 0 1048576 + + +@!dump rt_buffer 0 4096 +!dump y 0 4096 +@!dump plane_0 0 524288 +@!heatmap plane_0 0 524288 gran 0x80 len 0x200 stride 0x4000 +!heatmap plane_0 0 8192 gran 0x04 len 0x20 stride 0x400 +!dump occlusion 0 4096 +@!dump ssbo 0 4096 + +!dump64 tiler_heap 0 4096 +!dump tiler_ctx 0 4096 +!dump tiler_ctx2 0 4096 + +@!fdump heap 0 1048576 + +!cs 0 + +slot 3 +wait 3 +heapinc vt_start + +mov x28, $tiler_ctx3 +mov w2c, float:0 +mov w2d, float:1.0 +mov x2e, $occlusion + +idvs 0x4002, mode triangles, index none +flush_tiler +wait 3 +heapinc vt_end + +UNK 00 24, #0x5f0000000233 +wait all + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!dump64 tiler_heap 0 4096 +!dump tiler_ctx 0 4096 +!raw td + +""" + +docopy = """ +ldr {w00-w0f}, [x52] +ldr {w10-w1f}, [x52, 64] +ldr {w20-w2f}, [x52, 128] +ldr {w30-w3f}, [x52, 192] +add x52, x52, 256 + +loop: +wait 0 + +str {w00-w0f}, [x54] +ldr {w00-w0f}, [x52] +str {w10-w1f}, [x54, 64] +ldr {w10-w1f}, [x52, 64] +str {w20-w2f}, [x54, 128] +ldr {w20-w2f}, [x52, 128] +str {w30-w3f}, [x54, 192] +ldr {w30-w3f}, [x52, 192] + +add x54, x54, 256 +add x52, x52, 256 +add x50, x50, -256 + +b.ne w50, loop +b.ne w51, loop +""" + +oldcmds = f""" +!cs 0 + +mov x50, 0x8000000 + +mov x52, $from +mov x54, $to +mov x56, $x +mov x58, $ev +mov x5a, $y + +str cycles, [x56] +{docopy} +str cycles, [x56, 8] + +UNK 00 24, #0x5f0000000233 +evstr w5f, [x58], unk 0xfd, irq + +!cs 1 + +mov x50, 0x8000000 + +mov x52, $from +mov x54, $to +mov x56, $x +mov x58, $ev +mov x5a, $y + +add x52, x52, 0x8000000 +add x54, x54, 0x8000000 +add x56, x56, 32 + +nop +nop + +str cycles, [x56] +{docopy} +str cycles, [x56, 8] + +UNK 00 24, #0x5f0000000233 +evstr w5f, [x58], unk 0xfd, irq + +!delta x 0 4096 +""" + 
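+# Every command-stream instruction in the listings above is packed by the
+# interpreter further down into a single 64-bit word as
+# (cmd << 56) | (addr << 48) | value.  A minimal sketch of that encoding
+# for a few instructions, mirroring the corresponding branches of
+# Context.interpret() below (illustrative, not an exhaustive encoder):
+
+def encode_cs_word(cmd, addr, value):
+    assert value < (1 << 48)
+    return (cmd << 56) | (addr << 48) | value
+
+# "slot 2"     -> command 23, the payload is just the scoreboard slot index
+assert encode_cs_word(23, 0, 2) == 0x1700000000000002
+# "wait 3"     -> command 3, slot mask shifted into bits 16..23
+assert encode_cs_word(3, 0, (1 << 3) << 16) == 0x0300000000080000
+# "mov w5e, 1" -> command 2 (32-bit move immediate), destination register
+#                 number in the addr byte
+assert encode_cs_word(2, 0x5e, 1) == 0x025e000000000001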
+oldcmds = """ +!cs 0 +endpt compute +!cs 0 + +@ Workgroup size 1x1x1, merging allowed +mov w21, 0x80000000 + +@ Workgroup count 1x1x1 +mov w25, 1 +mov w26, 1 +mov w27, 1 + +@ Offset 0,0,0 +mov w22, 0 +mov w23, 0 +mov w24, 0 + +@ TODO: offset x/y/z + +@ Resources +mov x06, 0 + +@ Shader +mov x16, $shader + +@ Local storage +mov x1e, $ls + +@ FAU +movp x0e, $fau+0x0200000000000000 + +slot 2 +wait 2 + +UNK 0400000000008200 + +mov x58, $fau +ldr x56, [x58] +wait 0 + +@mov w4a, 0 + +@slot 6 +@mov x54, $x +@UNK 02 24, #0x4a0000f80211 +@ldr x52, [x56] +@wait 0,1 +@str x52, [x54] + +mov w40, 60 +1: add w40, w40, -1 + +@mov w4a, #0x0 +@UNK 02 24, #0x4a0000f80211 +@wait 1 + +@mov w54, #0 +@UNK 00 24, #0x540000000233 +@wait all + +slot 2 +wait 2 + +add w22, w22, 1 +@UNK 0400ff0000008200 + +@b.ne w40, 1b + +!dump x 0 4096 +!dump y 0 4096 +!dump ev 0 4096 +""" + +oldcmds = """ +!cs 0 + +mov x48, $x + +mov w21, 0x80000000 +mov w25, 1 +mov w26, 1 +mov w27, 1 + +movp x0e, $fau+0x0200000000000000 + +@ Write FAUs +@add x0e, x48, 64 +@mov x50, $ev +@str x50, [x0e] +@mov x30, 10 +@str x30, [x0e, 8] +@add w0f, w0f, 0x02000000 + +@ Write shader descriptor +@add x16, x48, 128 +@mov x30, 0x118 +@str x30, [x16] +@mov x30, $compute +@str x30, [x16, 8] + +wait 0 + +add x1e, x48, 192 + +mov x30, $y +@regdump x30 +@mov x30, 0 + +resources compute +slot 2 +mov w54, #0xffffe0 +UNK 00 24, #0x540000000233 + +wait all + +mov x54, 0 +mov w56, 0 +mov w5d, 1 + +slot 2 +wait 2 +wait 2 +regdump x30 +UNK 0400ff0000008200 +add x30, x30, 0x200 +regdump x30 +slot 2 +wait 2 + +mov w40, 1000 +1: add w40, w40, -1 +str cycles, [x50, 32] +b.ne w40, 1b + +wait 0 +wait all + +@ 6 / 10 / 14 +mov w40, 1 +1: add w40, w40, -1 +UNK 0400ff0000000200 +b.ne w40, 1b + +mov w40, 1000 +1: add w40, w40, -1 +str cycles, [x50, 32] +b.ne w40, 1b + +mov w42, 200 +mov w40, 100 +1: add w40, w40, -1 +@wait all +@UNK 0400ff0000008001 @ compute + +@UNK 0400ff0000000001 +@UNK 2501504200000004 @ evadd +@UNK 3 24, #0x4a0000000211 + +@wait all +b.ne w40, 1b + +@UNK 2601504200000004 + +str cycles, [x50, 40] +str cycles, [x50, 48] +UNK 02 24, #0x4a0000000211 +wait 0 + +add x5c, x50, 64 +evadd w5e, [x5c], unk 0xfd +evadd w5e, [x5c], unk 0xfd, irq, unk0 + +!dump x 0 4096 +!dump y 0 4096 +!delta ev 0 4096 +""" + +altcmds = """ +!cs 0 +!alloc x 4096 +!alloc ev 4096 0x8200f +!alloc ev2 4096 0x8200f + +mov x10, $x +UNK 00 30, #0x100000000000 +add x12, x10, 256 +str cycles, [x12] +mov x5a, $ev2 +mov x48, 0 +mov w4a, 0 +slot 3 +wait 3 +UNK 00 31, 0 +mov x48, $ev +mov w4a, 0x4321 +add x46, x48, 64 +mov w42, 0 + +str cycles, [x12, 8] +UNK 01 26, 0x484a00000005 +str cycles, [x12, 16] +UNK 01 26, 0x484a00000005 +str cycles, [x12, 24] + +nop + +mov w10, 10000 +1: +UNK 01 26, 0x484a00000005 +add w10, w10, -1 +b.ne w10, 1b +str cycles, [x12, 32] + +mov w10, 10000 +1: +UNK 01 26, 0x484a00000005 +@UNK 02 24, #0x420000000211 +add w10, w10, -1 +b.ne w10, 1b +str cycles, [x12, 40] + +ldr x16, [x48, 0] +wait 0 +str x16, [x48, 16] + +UNK 00 31, 0x100000000 + +mov w4a, #0x0 +UNK 02 24, #0x4a0000000211 + +mov w5e, 1 +add x5c, x5a, 0x100 +UNK 01 25, 0x5c5e00f80001 + +!delta x 0 4096 +!dump ev 0 4096 +!dump ev2 0 4096 +""" + +altcmds = """ +!cs 0 +!alloc x 4096 +!alloc ev 4096 0x8200f + +iter vertex +slot 2 + +mov x40, $x +mov w10, 1 +mov x48, 0 +mov w4a, 0 +call w4a, x48 + nop + nop + nop + mov x20, $. 
+@ movp x22, 0x0126000011223344 + movp x22, 0x1600000060000001 + str x22, [x20, 56] + 1: nop + b 1b + nop + add x40, x40, #256 + regdump x40 + +mov x5a, #0x5ff7fd6000 +mov x48, $ev +mov x40, #0x5ff7fd6000 +mov w54, #0x1 +UNK 00 24, #0x540000000233 +wait 0 +slot 6 +@UNK 00 31, #0x0 +UNK 00 09, #0x0 +wait 6 +@UNK 00 31, #0x100000000 +mov x4a, x40 +UNK 01 26, 0x484a00040001 + +!dump x 0 4096 +@!dump ev 0 4096 +@!delta x 0 4096 +""" + +cycletest = """ +mov w10, 10 +1: +str cycles, [x5c] +add x5c, x5c, 8 +add w10, w10, -1 +mov w11, 100000 + +inner: +add w11, w11, -1 +b.ne w11, inner + +b.ne w10, 1b +""" + +def get_cmds(cmd): + return cmds.replace("{cmd}", str(cmd)) + +def assemble_shader(text): + lines = text.strip().split("\n") + lines = [l for l in lines if len(l) > 0 and l[0] not in "#@"] + return [asm.parse_asm(ln) for ln in lines] + +class Buffer: + id = 0 + + def __init__(self): + self.id = Buffer.id + Buffer.id += 1 + +def resolve_rel(to, branch): + return (to - branch) // 8 - 1 + +def to_int16(value): + assert(value < 36768) + assert(value >= -32768) + return value & 0xffff + +class Level(Buffer): + def __init__(self, indent): + super().__init__() + + self.indent = indent + self.buffer = [] + self.call_addr_offset = None + self.call_len_offset = None + + self.labels = {} + self.label_refs = [] + # Numeric labels can be reused, so have to be handled specially. + self.num_labels = {} + self.num_refs = {} + + def offset(self): + return len(self.buffer) * 8 + + def __repr__(self): + buf = " ".join(hex(x) for x in self.buffer) + return f"buffer {self.id} {self.offset()} 0x200f {buf}" + + def buffer_add_value(self, offset, value): + self.buffer[offset // 8] += value + + def process_relocs(self, refs, to=None): + for ref, offset, type_ in refs: + assert(type_ == "rel") + + if to is None: + goto = self.labels[ref] + else: + goto = to + + value = to_int16(resolve_rel(goto, offset)) + self.buffer_add_value(offset, value) + + def finish(self): + self.process_relocs(self.label_refs) + +class Alloc(Buffer): + def __init__(self, size, flags=0x280f): + super().__init__() + + self.size = size + self.flags = flags + self.buffer = [] + + def __repr__(self): + buf = " ".join(hex(x) for x in self.buffer) + return f"buffer {self.id} {self.size} {hex(self.flags)} {buf}" + +def fmt_reloc(r, name="reloc"): + dst, offset, src, src_offset = r + return f"{name} {dst}+{offset} {src}+{src_offset}" + +def fmt_exe(e): + return " ".join(str(x) for x in e) + +class Context: + def __init__(self): + self.levels = [] + self.l = None + + self.allocs = {} + self.completed = [] + self.reloc = [] + self.reloc_split = [] + + self.exe = [] + self.last_exe = None + + self.is_call = False + + def set_l(self): + if len(self.levels): + self.l = self.levels[-1] + + def pop_until(self, indent): + while self.l.indent != indent: + l = self.levels.pop() + self.completed.append(l) + + self.set_l() + if not len(self.levels): + return + + buf_len = l.offset() + + r = self.l + self.reloc.append((r.id, r.call_addr_offset * 8, l.id, 0)) + r.buffer[r.call_len_offset] = ( + (r.buffer[r.call_len_offset] & (0xffff << 48)) + + buf_len) + r.buffer[r.call_addr_offset] &= (0xffff << 48) + + r.call_addr_offset = None + r.call_len_offset = None + + def flush_exe(self): + ind = self.levels[0].indent + + self.pop_until(ind) + if len(self.levels[0].buffer): + l = self.levels.pop() + l.finish() + self.completed.append(l) + + self.levels.append(Level(ind)) + self.set_l() + + if not len(self.exe): + return + + if self.last_exe is None: + print("# Trying to 
add multiple CSs to an exe line, becoming confused") + return + + if len(self.completed): + p = self.completed[-1] + assert(p.indent == ind) + + self.exe[self.last_exe] += [p.id, p.offset()] + + self.last_exe = None + + def add_shaders(self, shaders): + for sh in shaders: + qwords = assemble_shader(shaders[sh]) + sh = sh.lower() + + a = Alloc(len(qwords) * 8, flags=0x2017) + a.buffer = qwords + self.allocs[sh] = a + + def add_memory(self, memory): + for m in memory: + f = memory[m] + if isinstance(f, int): + size, flags = f, 0x280f + else: + size, flags = f + self.allocs[m] = Alloc(size, flags) + + def add_descriptors(self, descriptors): + for d in descriptors: + words = descriptors[d] + a = Alloc(0) + + buf = [] + for w in words: + if isinstance(w, int): + buf.append(w) + else: + if isinstance(w, str): + alloc, offset = w, 0 + else: + alloc, offset = w + ref = self.allocs[alloc] + self.reloc.append((a.id, len(buf) * 4, + ref.id, offset)) + buf.append(0) + buf.append(0) + + it = iter(buf) + a.buffer = [x | (y << 32) for x, y in zip(it, it)] + a.size = len(a.buffer) * 8 + self.allocs[d] = a + + def interpret(self, text): + text = text.split("\n") + + old_indent = None + + for orig_line in text: + #print(orig_line, file=sys.stderr) + + line = orig_line.split("@")[0].expandtabs().rstrip().lower() + if not line: + continue + + indent = len(line) - len(line.lstrip()) + line = line.lstrip() + + if old_indent is None: + self.levels.append(Level(indent)) + elif indent != old_indent: + if indent > old_indent: + assert(self.is_call) + + self.levels.append(Level(indent)) + else: + self.pop_until(indent) + + self.set_l() + + old_indent = indent + self.is_call = False + + given_code = None + + # TODO: Check against this to test the disassembler? + if re.match(r"[0-9a-f]{16} ", line): + given_code = int(line[:16], 16) + line = line[16:].lstrip() + + s = [x.strip(",") for x in line.split()] + + if s[0].endswith(":") or (len(s) == 1 and is_num(s[0])): + label = s[0] + if s[0].endswith(":"): + label = label[:-1] + + if is_num(label): + label = int(label) + if label in self.l.num_refs: + self.l.process_relocs(self.l.num_refs[label], self.l.offset()) + del self.l.num_refs[label] + self.l.num_labels[label] = self.l.offset() + else: + if label in self.l.labels: + print("Label reuse is not supported for non-numeric labels") + self.l.labels[label] = self.l.offset() + + s = s[1:] + if not len(s): + continue + + for i in range(len(s)): + if s[i].startswith("$"): + name, *offset = s[i][1:].split("+") + if name == ".": + buf = self.l + else: + buf = self.allocs[name] + if len(offset): + assert(len(offset) == 1) + offset = int(offset[0], 0) + else: + offset = 0 + + if s[0] == "movp": + rels = self.reloc_split + else: + rels = self.reloc + + rels.append((self.l.id, self.l.offset(), + buf.id, offset)) + s[i] = "#0x0" + + def is_num(str): + return re.fullmatch(r"[0-9]+", str) + + def hx(word): + return int(word, 16) + + def reg(word): + return hx(word[1:]) + + def val(word): + if word.startswith("float:"): + return ii(float(word.split(":")[1])) + elif word.startswith("i16:"): + lo, hi = word.split(":")[1].split(",") + lo, hi = val(lo), val(hi) + assert(lo < (1 << 16)) + assert(hi < (1 << 16)) + return (lo & 0xffff) | (hi << 16) + + value = int(word.strip("#"), 0) + assert(value < (1 << 48)) + return value + + sk = True + + if s[0] == "!cs": + assert(len(s) == 2) + self.flush_exe() + self.last_exe = len(self.exe) + self.exe.append(["exe", int(s[1])]) + continue + elif s[0] == "!parallel": + assert(len(s) == 2) + 
self.flush_exe() + self.last_exe = len(self.exe) - 1 + self.exe[-1] += [int(s[1])] + continue + elif s[0] == "!alloc": + assert(len(s) == 3 or len(s) == 4) + alloc_id = s[1] + size = int(s[2]) + flags = val(s[3]) if len(s) == 4 else 0x280f + self.allocs[alloc_id] = Alloc(size, flags) + continue + elif s[0] in ("!dump", "!dump64", "!fdump", "!delta", "!tiler"): + assert(len(s) == 4) + alloc_id = s[1] + offset = val(s[2]) + size = val(s[3]) + mode = { + "!dump": "hex", + "!dump64": "hex64", + "!fdump": "filehex", + "!delta": "delta", + "!tiler": "tiler", + }[s[0]] + self.exe.append(("dump", self.allocs[alloc_id].id, + offset, size, mode)) + continue + elif s[0] == "!heatmap": + assert(len(s) == 10) + assert(s[4] == "gran") + assert(s[6] == "len") + assert(s[8] == "stride") + alloc_id = s[1] + offset = val(s[2]) + size = val(s[3]) + granularity = val(s[5]) + length = val(s[7]) + stride = val(s[9]) + mode = "heatmap" + self.exe.append(("heatmap", self.allocs[alloc_id].id, + offset, size, granularity, length, stride)) + continue + elif s[0] == "!memset": + assert(len(s) == 5) + alloc_id = s[1] + offset = val(s[2]) + value = val(s[3]) + size = val(s[4]) + self.exe.append(("memset", self.allocs[alloc_id].id, + offset, value, size)) + continue + elif s[0] == "!raw": + self.exe.append(s[1:]) + continue + elif s[0] == "movp": + assert(len(s) == 3) + assert(s[1][0] == "x") + addr = reg(s[1]) + # Can't use val() as that has a max of 48 bits + value = int(s[2].strip("#"), 0) + + self.l.buffer.append((2 << 56) | (addr << 48) | (value & 0xffffffff)) + self.l.buffer.append((2 << 56) | ((addr + 1) << 48) + | ((value >> 32) & 0xffffffff)) + continue + elif s[0] == "regdump": + assert(len(s) == 2) + assert(s[1][0] == "x") + dest = reg(s[1]) + + # Number of registers to write per instruction + regs = 16 + + cmd = 21 + value = (dest << 40) | (((1 << regs) - 1) << 16) + + for i in range(0, 0x60, regs): + code = (cmd << 56) | (i << 48) | value | (i << 2) + self.l.buffer.append(code) + + del cmd, value + continue + + elif s[0] == "unk": + if len(s) == 2: + h = hx(s[1]) + cmd = h >> 56 + addr = (h >> 48) & 0xff + value = h & 0xffffffffffff + else: + assert(len(s) == 4) + cmd = hx(s[2]) + addr = hx(s[1]) + value = val(s[3]) + elif s[0] == "nop": + if len(s) == 1: + addr = 0 + value = 0 + cmd = 0 + else: + assert(len(s) == 3) + addr = hx(s[1]) + value = val(s[2]) + cmd = 0 + elif s[0] == "mov" and s[2][0] in "xw": + # This is actually an addition command + assert(len(s) == 3) + assert(s[1][0] == s[2][0]) + cmd = { "x": 17, "w": 16 }[s[1][0]] + addr = reg(s[1]) + value = reg(s[2]) << 40 + elif s[0] == "mov": + assert(len(s) == 3) + cmd = { "x": 1, "w": 2 }[s[1][0]] + addr = reg(s[1]) + value = val(s[2]) + elif s[0] == "add": + assert(len(s) == 4) + assert(s[1][0] == s[2][0]) + assert(s[1][0] in "wx") + cmd = 16 if s[1][0] == "w" else 17 + addr = reg(s[1]) + value = (reg(s[2]) << 40) | (val(s[3]) & 0xffffffff) + elif s[0] == "resources": + assert(len(s) >= 2) + types = ["compute", "fragment", "tiler", "idvs"] + cmd = 34 + addr = 0 + value = 0 + for t in s[1:]: + if t in types: + value |= 1 << types.index(t) + else: + value |= int(t, 0) + elif s[0] == "fragment": + cmd = 7 + addr = 0 + value = 0 + if len(s) != 1: + arg_map = { + "tem": {"0": 0, "1": 1}, + "render": { + "z_order": 0, + "horizontal": 0x10, + "vertical": 0x20, + "reverse_horizontal": 0x50, + "reverse_vertical": 0x60, + }, + "unk": {"0": 0, "1": 1 << 32}, + } + for arg, val in zip(s[1::2], s[2::2]): + value |= arg_map[arg][val] + elif s[0] == "wait": + 
assert(len(s) == 2) + cmd = 3 + addr = 0 + if s[1] == "all": + value = 255 + else: + value = sum(1 << int(x) for x in s[1].split(",")) + value <<= 16 + elif s[0] == "slot": + assert(len(s) == 2) + cmd = 23 + addr = 0 + value = int(s[1], 0) + elif s[0] == "add": + # TODO: unk variant + assert(len(s) == 4) + assert(s[1][0] == "x") + assert(s[2][0] == "x") + cmd = 17 + addr = reg(s[1]) + v = val(s[3]) + assert(v < (1 << 32)) + assert(v >= (-1 << 31)) + value = (reg(s[2]) << 40) | (v & 0xffffffff) + elif s[0] == "idvs": + assert(len(s) == 6) + unk = val(s[1]) + assert(s[2] == "mode") + modes = { + "none": 0, + "points": 1, + "lines": 2, + "line-strip": 4, + "line-loop": 6, + "triangles": 8, + "triangle-strip": 10, + "triangle-fan": 12, + "polygon": 13, + "quads": 14, + } + if s[3] in modes: + mode = modes[s[3]] + else: + mode = int(s[3]) + assert(s[4] == "index") + itypes = { + "none": 0, + "uint8": 1, + "uint16": 2, + "uint32": 3, + } + if s[5] in itypes: + index = itypes[s[5]] + else: + index = int(s[5]) + + cmd = 6 + addr = 0 + value = (unk << 32) | (index << 8) | mode + elif s[0] == "flush_tiler": + assert(len(s) == 1) + cmd = 9 + addr = 0 + value = 0 + elif s[0] == "str" and s[1] in ("cycles", "timestamp"): + assert(len(s) == 3 or len(s) == 4) + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]") for x in s] + assert(s[2][0] == "x") + + type_ = 1 if s[1] == "cycles" else 0 + dest = reg(s[2]) + if len(s) == 4: + offset = val(s[3]) + else: + offset = 0 + + cmd = 40 + addr = 0 + value = (dest << 40) | (type_ << 32) | to_int16(offset) + elif s[0] in ("ldr", "str"): + reglist = s[1] + if reglist[0] == "{": + end = [x[-1] for x in s].index("}") + reglist = s[1:end + 1] + s = s[:1] + s[end:] + + assert(len(s) == 3 or len(s) == 4) + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]") for x in s] + assert(s[2][0] == "x") + + if isinstance(reglist, str): + assert(reglist[0] in "xw") + src = reg(reglist) + mask = 3 if reglist[0] == "x" else 1 + else: + src = None + mask = 0 + + for r in ",".join(reglist).strip("{}").split(","): + r = r.split("-") + assert(len(r) in (1, 2)) + regno = [reg(x) for x in r] + + if src is None: + src = regno[0] + + if len(r) == 1: + assert(r[0][0] in "xw") + new = 3 if r[0][0] == "x" else 1 + new = (new << regno[0]) >> src + else: + assert(regno[1] > regno[0]) + new = ((2 << regno[1]) - (1 << regno[0])) >> src + + assert(new < (1 << 16)) + assert(mask & new == 0) + mask |= new + + # Name is correct for str, but inverted for ldr + # (The same holds for src above) + dest = reg(s[2]) + if len(s) == 4: + offset = val(s[3]) + else: + offset = 0 + + cmd = 20 if s[0] == "ldr" else 21 + addr = src + value = (dest << 40) | (mask << 16) | to_int16(offset) + elif s[0] == "b" or s[0].startswith("b."): + # For unconditional jumps, use w00 as a source register if it + # is not specified + if s[0] == "b" and (len(s) == 2 or + (len(s) == 3 and + s[1] in ("back", "skip"))): + s = [s[0], "w00", *s[1:]] + + assert(len(s) == 3 or (len(s) == 4 and s[2] in ("back", "skip"))) + assert(s[1][0] == "w") + + ops = { + "b.le": 0, "b.gt": 1, + "b.eq": 2, "b.ne": 3, + "b.lt": 4, "b.ge": 5, + "b": 6, "b.al": 6, + } + + src = reg(s[1]) + if len(s) == 4: + offset = val(s[3]) + if s[2] == "back": + offset = -1 - offset + else: + label = s[2] + if re.fullmatch(r"[0-9]+b", label): + label = int(label[:-1]) + assert(label in self.l.num_labels) + offset = resolve_rel(self.l.num_labels[label], + self.l.offset()) + elif re.fullmatch(r"[0-9]+f", label): + label = 
int(label[:-1]) + if label not in self.l.num_refs: + self.l.num_refs[label] = [] + self.l.num_refs[label].append((label, self.l.offset(), "rel")) + offset = 0 + else: + assert(not re.fullmatch(r"[0-9]+", label)) + self.l.label_refs.append((label, self.l.offset(), "rel")) + offset = 0 + + cmd = 22 + addr = 0 + value = (src << 40) | (ops[s[0]] << 28) | to_int16(offset) + + elif s[0] in ("evadd", "evstr"): + assert(len(s) in range(5, 8)) + assert(s[1][0] in "wx") + assert(s[2].startswith("[x")) + assert(s[2][-1] == "]") + assert(s[3] == "unk") + s = [x.strip("[]()") for x in s] + + val = reg(s[1]) + dst = reg(s[2]) + mask = hx(s[4]) + irq = "irq" not in s + unk0 = "unk0" in s + + if s[1][0] == "w": + cmd = 37 if s[0] == "evadd" else 38 + else: + cmd = 51 if s[0] == "evadd" else 52 + addr = 1 + value = ((dst << 40) | (val << 32) | (mask << 16) | + (irq << 2) | unk0) + elif s[0].split(".")[0] == "evwait": + for mod in s[0].split(".")[1:]: + assert(mod in {"lo", "hi", "inherit", "no_error"}) + assert(len(s) == 3) + assert(s[1][0] in "wx") + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]()") for x in s] + src = reg(s[2]) + val = reg(s[1]) + cond = 1 if ".hi" in s[0] else 0 + error = 1 if ".no_error" in s[0] else 0 + + cmd = 53 if s[1][0] == "x" else 39 + addr = 0 + value = (src << 40) | (val << 32) | (cond << 28) | error + elif s[0] in ("call", "tailcall"): + ss = [x for x in s if x.find('(') == -1 and x.find(')') == -1] + assert(len(ss) == 3) + assert(ss[1][0] == "w") + assert(ss[2][0] == "x") + cmd = { "call": 32, "tailcall": 33 }[s[0]] + addr = 0 + num = reg(ss[1]) + target = reg(ss[2]) + value = (num << 32) | (target << 40) + + l = self.l + + cur = len(l.buffer) + for ofs in range(cur - 2, cur): + if l.buffer[ofs] >> 48 == 0x100 + target: + l.call_addr_offset = ofs + if l.buffer[ofs] >> 48 == 0x200 + num: + l.call_len_offset = ofs + assert(l.call_addr_offset is not None) + assert(l.call_len_offset is not None) + + self.is_call = True + elif s[0] == "heapctx": + assert(len(s) == 2) + assert(s[1][0] == "x") + cmd = 48 + addr = 0 + value = reg(s[1]) << 40 + elif s[0] == "heapinc": + assert(len(s) == 2) + modes = { + "vt_start": 0, + "vt_end": 1, + "frag_end": 3, + } + if s[1] in modes: + mode = modes[s[1]] + else: + mode = int(s[1]) + cmd = 49 + addr = 0 + value = mode << 32 + else: + print("Unknown command:", orig_line, file=sys.stderr) + # TODO remove + cmd = 0 + addr = 0 + value = 0 + sk = False + pass + + code = (cmd << 56) | (addr << 48) | value + + if given_code and code != given_code: + print(f"Mismatch! 
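For reference, every mnemonic handled above is assembled into a single 64-bit command word: the opcode lands in bits [63:56], the destination register byte in bits [55:48], and the low 48 bits carry the immediate/operand. Below is a minimal C sketch of the same packing; the helper names are ours, and the opcode numbers are the ones used by interpret.py.

#include <stdint.h>

/* Pack one CSF command word the same way interpret.py does:
 * (cmd << 56) | (addr << 48) | value, with the operand limited to 48 bits. */
static inline uint64_t csf_instr(uint8_t opcode, uint8_t reg, uint64_t operand)
{
        return ((uint64_t)opcode << 56) | ((uint64_t)reg << 48) |
               (operand & 0xffffffffffffULL);
}

/* Equivalent of the "movp" handler above: a full 64-bit pointer is loaded by
 * emitting two 32-bit moves (opcode 2) into consecutive register slots. */
static void csf_emit_movp(uint64_t *buf, unsigned *count, uint8_t reg, uint64_t value)
{
        buf[(*count)++] = csf_instr(2, reg, value & 0xffffffff);
        buf[(*count)++] = csf_instr(2, reg + 1, value >> 32);
}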
{hex(code)} != {hex(given_code)}, {orig_line}") + + self.l.buffer.append(code) + + del cmd, addr, value + + if False and not sk: + print(orig_line, file=sys.stderr) + print(indent, s, hex(code) if sk else "", file=sys.stderr) + + self.pop_until(self.levels[0].indent) + self.flush_exe() + + def __repr__(self): + r = [] + r += [str(self.allocs[x]) for x in self.allocs] + r += [str(x) for x in self.completed] + r += [fmt_reloc(x) for x in self.reloc] + r += [fmt_reloc(x, name="relsplit") for x in self.reloc_split] + r += [fmt_exe(x) for x in self.exe] + return "\n".join(r) + +def interpret(text): + c = Context() + c.add_shaders(shaders) + c.add_memory(memory) + c.add_descriptors(descriptors) + c.interpret(text) + #print(str(c)) + return str(c) + +def run(text, capture=False): + if capture: + cap = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT} + else: + cap = {} + + i = interpret(text) + "\n" + + with open("/tmp/csf.cmds", "w") as f: + f.write(i) + + # TODO: Keep seperate or merge stdout/stderr? + ret = subprocess.run(["csf_test", "/dev/stdin"], + input=i, text=True, **cap) + if ret.stderr is None: + ret.stderr = "" + if ret.stdout is None: + ret.stdout = "" + return ret.stderr + ret.stdout + +def rebuild(): + try: + p = subprocess.run(["rebuild-mesa"]) + if p.returncode != 0: + return False + except FileNotFoundError: + pass + return True + +def go(text): + #print(interpret(text)) + #return + + if not rebuild(): + return + + print(run(text)) + #subprocess.run("ls /tmp/fdump.????? | tail -n2 | xargs diff -U3 -s", + # shell=True) + +os.environ["CSF_QUIET"] = "1" + +go(get_cmds("")) + +#for c in range(1, 64): +# val = c +# ret = run(get_cmds(ii(val))) +# print(str(val) + '\t' + [x for x in ret.split("\n") if x.startswith("0FFF10")][0]) + +#rebuild() +#for c in range(256): +# print(c, end=":") +# sys.stdout.flush() +# cmd = f"UNK 00 {hex(c)[2:]} 0x00000000" +# run(get_cmds(cmd)) + +#interpret(cmds) +#go(cmds) diff --git a/src/panfrost/csf_test/mali_base_csf_kernel.h b/src/panfrost/csf_test/mali_base_csf_kernel.h new file mode 100644 index 00000000000..f5f859eb9ad --- /dev/null +++ b/src/panfrost/csf_test/mali_base_csf_kernel.h @@ -0,0 +1,721 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_CSF_KERNEL_H_ +#define _UAPI_BASE_CSF_KERNEL_H_ + +#include + +/* Memory allocation, access/hint flags. + * + * See base_mem_alloc_flags. 
+ */ + +/* IN */ +/* Read access CPU side + */ +#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) + +/* Write access CPU side + */ +#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) + +/* Read access GPU side + */ +#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) + +/* Write access GPU side + */ +#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) + +/* Execute allowed on the GPU side + */ +#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) + +/* Will be permanently mapped in kernel space. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) + +/* The allocation will completely reside within the same 4GB chunk in the GPU + * virtual space. + * Since this flag is primarily required only for the TLS memory which will + * not be used to contain executable code and also not used for Tiler heap, + * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. + */ +#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) + +/* Userspace is not allowed to free this memory. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) + +#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) + +/* Grow backing store on GPU Page Fault + */ +#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) + +/* Page coherence Outer shareable, if available + */ +#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) + +/* Page coherence Inner shareable + */ +#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) + +/* IN/OUT */ +/* Should be cached on the CPU, returned if actually cached + */ +#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) + +/* IN/OUT */ +/* Must have same VA on both the GPU and the CPU + */ +#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) + +/* OUT */ +/* Must call mmap to acquire a GPU address for the alloc + */ +#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) + +/* IN */ +/* Page coherence Outer shareable, required. + */ +#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) + +/* Protected memory + */ +#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) + +/* Not needed physical memory + */ +#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) + +/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the + * addresses to be the same + */ +#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) + +/* CSF event memory + * + * If Outer shareable coherence is not specified or not available, then on + * allocation kbase will automatically use the uncached GPU mapping. + * There is no need for the client to specify BASE_MEM_UNCACHED_GPU + * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. + * + * This memory requires a permanent mapping + * + * See also kbase_reg_needs_kernel_mapping() + */ +#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) + +#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) + +/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu + * mode. Some components within the GPU might only be able to access memory + * that is GPU cacheable. Refer to the specific GPU implementation for more + * details. The 3 shareability flags will be ignored for GPU uncached memory. 
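As a concrete cross-reference, the default flag value 0x280f that the !alloc directive in interpret.py passes when no flags are given decodes to the combination below. This is a sketch for illustration, not part of the header.

/* 0x280f == CPU read/write + GPU read/write + inner shareable + same VA */
#define EXAMPLE_DEFAULT_ALLOC_FLAGS                          \
        (BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR |       \
         BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |       \
         BASE_MEM_COHERENT_LOCAL | BASE_MEM_SAME_VA)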
+ * If used while importing USER_BUFFER type memory, then the import will fail + * if the memory is not aligned to GPU and CPU cache line width. + */ +#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) + +/* + * Bits [22:25] for group_id (0~15). + * + * base_mem_group_id_set() should be used to pack a memory group ID into a + * base_mem_alloc_flags value instead of accessing the bits directly. + * base_mem_group_id_get() should be used to extract the memory group ID from + * a base_mem_alloc_flags value. + */ +#define BASEP_MEM_GROUP_ID_SHIFT 22 +#define BASE_MEM_GROUP_ID_MASK \ + ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) + +/* Must do CPU cache maintenance when imported memory is mapped/unmapped + * on GPU. Currently applicable to dma-buf type only. + */ +#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) + +/* OUT */ +/* Kernel side cache sync ops required */ +#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) + +/* Number of bits used as flags for base memory management + * + * Must be kept in sync with the base_mem_alloc_flags flags + */ +#define BASE_MEM_FLAGS_NR_BITS 29 + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) + +/* A mask for all output bits, excluding IN/OUT bits. + */ +#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP + +/* A mask for all input bits, including IN/OUT bits. + */ +#define BASE_MEM_FLAGS_INPUT_MASK \ + (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED \ + BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_20 + +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) +/* reserved handles ..-47< for future special handles */ +#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_FIRST_FREE_ADDRESS \ + ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) + +#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ + ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ + LOCAL_PAGE_SHIFT) + +/** + * Valid set of just-in-time memory allocation flags + */ +#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) + +/* Flags to pass to ::base_context_init. + * Flags can be ORed together to enable multiple things. + * + * These share the same space as BASEP_CONTEXT_FLAG_*, and so must + * not collide with them. + */ +typedef __u32 base_context_create_flags; + +/* No flags set */ +#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) + +/* Base context is embedded in a cctx object (flag used for CINSTR + * software counter macros) + */ +#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) + +/* Base context is a 'System Monitor' context for Hardware counters. + * + * One important side effect of this is that job submission is disabled. + */ +#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED \ + ((base_context_create_flags)1 << 1) + +/* Base context creates a CSF event notification thread. 
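The group-ID field and validity masks above are used roughly as follows. base_mem_group_id_set()/get() live inside kbase itself, so the helpers here are hypothetical equivalents, and the base_mem_alloc_flags typedef is the one from mali_base_kernel.h later in this patch.

static inline base_mem_alloc_flags example_group_id_set(unsigned int id)
{
        return ((base_mem_alloc_flags)id << BASEP_MEM_GROUP_ID_SHIFT) &
               BASE_MEM_GROUP_ID_MASK;
}

static inline unsigned int example_group_id_get(base_mem_alloc_flags flags)
{
        return (unsigned int)((flags & BASE_MEM_GROUP_ID_MASK) >>
                              BASEP_MEM_GROUP_ID_SHIFT);
}

/* Reject flags userspace may not request: kernel-only bits, reserved bits,
 * and anything outside the defined input range. */
static inline int example_alloc_flags_valid(base_mem_alloc_flags flags)
{
        return !(flags & (BASEP_MEM_FLAGS_KERNEL_ONLY | BASE_MEM_FLAGS_RESERVED)) &&
               !(flags & ~BASE_MEM_FLAGS_INPUT_MASK);
}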
+ * + * The creation of a CSF event notification thread is conditional but + * mandatory for the handling of CSF events. + */ +#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) + +/* Bit-shift used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) + +/* Bitmask used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ + ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* Bitpattern describing the base_context_create_flags that can be + * passed to the kernel + */ +#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ + (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | \ + BASEP_CONTEXT_MMU_GROUP_ID_MASK) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | \ + BASE_CONTEXT_CSF_EVENT_THREAD | \ + BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, + * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) + */ +#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) + +/* Indicate that job dumping is enabled. This could affect certain timers + * to account for the performance impact. + */ +#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) + +/* Enable KBase tracepoints for CSF builds */ +#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) + +/* Enable additional CSF Firmware side tracepoints */ +#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) + +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ + BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ + BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) + +/* Number of pages mapped into the process address space for a bound GPU + * command queue. A pair of input/output pages and a Hw doorbell page + * are mapped to enable direct submission of commands to Hw. + */ +#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) + +#define BASE_QUEUE_MAX_PRIORITY (15U) + +/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ +#define BASEP_EVENT_VAL_INDEX (0U) +#define BASEP_EVENT_ERR_INDEX (1U) + +/* The upper limit for number of objects that could be waited/set per command. + * This limit is now enforced as internally the error inherit inputs are + * converted to 32-bit flags in a __u32 variable occupying a previously padding + * field. + */ +#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) + +/** + * enum base_kcpu_command_type - Kernel CPU queue command type. 
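The CQS definitions above describe each synchronisation object as two consecutive 32-bit words in GPU-visible memory: the value at index 0 and the error state at index 1. A small sketch of how that layout is typically viewed from the CPU side; the struct and helper names are ours.

#include <stdint.h>

struct example_cqs_object {
        uint32_t event_mem[2]; /* [BASEP_EVENT_VAL_INDEX] = value,
                                * [BASEP_EVENT_ERR_INDEX] = error state */
};

static inline int example_cqs_errored(const volatile struct example_cqs_object *c)
{
        return c->event_mem[BASEP_EVENT_ERR_INDEX] != 0;
}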
+ * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, + * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, + * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, + * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, + * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, + * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, + * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, + */ +enum base_kcpu_command_type { + BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_SET, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, + BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, + BASE_KCPU_COMMAND_TYPE_JIT_FREE, + BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER +}; + +/** + * enum base_queue_group_priority - Priority of a GPU Command Queue Group. + * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group + * priority levels. + * + * Currently this is in order of highest to lowest, but if new levels are added + * then those new levels may be out of order to preserve the ABI compatibility + * with previous releases. At that point, ensure assignment to + * the 'priority' member in &kbase_queue_group is updated to ensure it remains + * a linear ordering. + * + * There should be no gaps in the enum, otherwise use of + * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. 
+ */ +enum base_queue_group_priority { + BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, + BASE_QUEUE_GROUP_PRIORITY_MEDIUM, + BASE_QUEUE_GROUP_PRIORITY_LOW, + BASE_QUEUE_GROUP_PRIORITY_REALTIME, + BASE_QUEUE_GROUP_PRIORITY_COUNT +}; + +struct base_kcpu_command_fence_info { + __u64 fence; +}; + +struct base_cqs_wait_info { + __u64 addr; + __u32 val; + __u32 padding; +}; + +struct base_kcpu_command_cqs_wait_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +struct base_cqs_set { + __u64 addr; +}; + +struct base_kcpu_command_cqs_set_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * typedef basep_cqs_data_type - Enumeration of CQS Data Types + * + * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value + * is an unsigned 32-bit integer + * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value + * is an unsigned 64-bit integer + */ +typedef enum PACKED { + BASEP_CQS_DATA_TYPE_U32 = 0, + BASEP_CQS_DATA_TYPE_U64 = 1, +} basep_cqs_data_type; + +/** + * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait + * Operation conditions + * + * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Less than or Equal to + * the Wait Operation value + * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Greater than the Wait Operation value + */ +typedef enum { + BASEP_CQS_WAIT_OPERATION_LE = 0, + BASEP_CQS_WAIT_OPERATION_GT = 1, +} basep_cqs_wait_operation_op; + +struct base_cqs_wait_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information + * about the Timeline CQS wait objects + * + * @objs: An array of Timeline CQS waits. + * @nr_objs: Number of Timeline CQS waits in the array. + * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field + * to be served as the source for importing into the + * queue's error-state. + */ +struct base_kcpu_command_cqs_wait_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +/** + * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations + * + * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value + * to a synchronization object + * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value + * of a synchronization object + */ +typedef enum { + BASEP_CQS_SET_OPERATION_ADD = 0, + BASEP_CQS_SET_OPERATION_SET = 1, +} basep_cqs_set_operation_op; + +struct base_cqs_set_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_set_operation_info - structure which contains information + * about the Timeline CQS set objects + * + * @objs: An array of Timeline CQS sets. + * @nr_objs: Number of Timeline CQS sets in the array. + * @padding: Structure padding, unused bytes. + */ +struct base_kcpu_command_cqs_set_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * struct base_kcpu_command_import_info - structure which contains information + * about the imported buffer. + * + * @handle: Address of imported user buffer. + */ +struct base_kcpu_command_import_info { + __u64 handle; +}; + +/** + * struct base_kcpu_command_jit_alloc_info - structure which contains + * information about jit memory allocation. 
+ * + * @info: An array of elements of the + * struct base_jit_alloc_info type. + * @count: The number of elements in the info array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_alloc_info { + __u64 info; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_jit_free_info - structure which contains + * information about jit memory which is to be freed. + * + * @ids: An array containing the JIT IDs to free. + * @count: The number of elements in the ids array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_free_info { + __u64 ids; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_group_suspend_info - structure which contains + * suspend buffer data captured for a suspended queue group. + * + * @buffer: Pointer to an array of elements of the type char. + * @size: Number of elements in the @buffer array. + * @group_handle: Handle to the mapping of CSG. + * @padding: padding to a multiple of 64 bits. + */ +struct base_kcpu_command_group_suspend_info { + __u64 buffer; + __u32 size; + __u8 group_handle; + __u8 padding[3]; +}; + + +/** + * struct base_kcpu_command - kcpu command. + * @type: type of the kcpu command, one enum base_kcpu_command_type + * @padding: padding to a multiple of 64 bits + * @info: structure which contains information about the kcpu command; + * actual type is determined by @p type + * @info.fence: Fence + * @info.cqs_wait: CQS wait + * @info.cqs_set: CQS set + * @info.import: import + * @info.jit_alloc: jit allocation + * @info.jit_free: jit deallocation + * @info.suspend_buf_copy: suspend buffer copy + * @info.sample_time: sample time + * @info.padding: padding + */ +struct base_kcpu_command { + __u8 type; + __u8 padding[sizeof(__u64) - sizeof(__u8)]; + union { + struct base_kcpu_command_fence_info fence; + struct base_kcpu_command_cqs_wait_info cqs_wait; + struct base_kcpu_command_cqs_set_info cqs_set; + struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; + struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; + struct base_kcpu_command_import_info import; + struct base_kcpu_command_jit_alloc_info jit_alloc; + struct base_kcpu_command_jit_free_info jit_free; + struct base_kcpu_command_group_suspend_info suspend_buf_copy; + __u64 padding[2]; /* No sub-struct should be larger */ + } info; +}; + +/** + * struct basep_cs_stream_control - CSI capabilities. + * + * @features: Features of this stream + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_stream_control { + __u32 features; + __u32 padding; +}; + +/** + * struct basep_cs_group_control - CSG interface capabilities. + * + * @features: Features of this group + * @stream_num: Number of streams in this group + * @suspend_size: Size in bytes of the suspend buffer for this group + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_group_control { + __u32 features; + __u32 stream_num; + __u32 suspend_size; + __u32 padding; +}; + +/** + * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault + * error information associated with GPU command queue group. + * + * @sideband: Additional information of the unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). 
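Putting the pieces above together, a KCPU command is a tagged union: set @type, fill the matching @info member, and leave the rest zero. A hedged sketch that prepares a single CQS_SET command follows; the function name is ours, and the actual enqueue ioctl lives in mali_kbase_csf_ioctl.h rather than this header.

#include <stdint.h>
#include <string.h>

static void example_build_cqs_set(struct base_kcpu_command *cmd,
                                  struct base_cqs_set *objs,
                                  uint64_t cqs_gpu_va)
{
        objs[0].addr = cqs_gpu_va;      /* GPU VA of the CQS value/error pair */

        memset(cmd, 0, sizeof(*cmd));
        cmd->type = BASE_KCPU_COMMAND_TYPE_CQS_SET;
        cmd->info.cqs_set.objs = (uint64_t)(uintptr_t)objs; /* user pointer as __u64 */
        cmd->info.cqs_set.nr_objs = 1;
}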
+ * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_group_error_fatal_payload { + __u64 sideband; + __u32 status; + __u32 padding; +}; + +/** + * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault + * error information related to GPU command queue. + * + * @sideband: Additional information about this unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). + * @csi_index: Index of the CSF interface the queue is bound to. + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_error_fatal_payload { + __u64 sideband; + __u32 status; + __u8 csi_index; + __u8 padding[3]; +}; + +/** + * enum base_gpu_queue_group_error_type - GPU Fatal error type. + * + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU + * command queue group. + * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU + * command queue. + * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with + * progress timeout. + * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out + * of tiler heap memory. + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types + * + * This type is used for &struct_base_gpu_queue_group_error.error_type. + */ +enum base_gpu_queue_group_error_type { + BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, + BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, + BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT +}; + +/** + * struct base_gpu_queue_group_error - Unrecoverable fault information + * @error_type: Error type of @base_gpu_queue_group_error_type + * indicating which field in union payload is filled + * @padding: Unused bytes for 64bit boundary + * @payload: Input Payload + * @payload.fatal_group: Unrecoverable fault error associated with + * GPU command queue group + * @payload.fatal_queue: Unrecoverable fault error associated with command queue + */ +struct base_gpu_queue_group_error { + __u8 error_type; + __u8 padding[7]; + union { + struct base_gpu_queue_group_error_fatal_payload fatal_group; + struct base_gpu_queue_error_fatal_payload fatal_queue; + } payload; +}; + +/** + * enum base_csf_notification_type - Notification type + * + * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event + * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal + * error + * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu + * queue + * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type + * + * This type is used for &struct_base_csf_notification.type. 
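A short sketch of how the fatal payloads above are usually decoded: the exception type sits in the least significant byte of @status and the remaining bytes carry exception data. The helper name is ours, and it assumes the caller already knows the error is one of the two fatal variants.

#include <stdint.h>

static inline uint8_t
example_fatal_exception_type(const struct base_gpu_queue_group_error *err)
{
        uint32_t status;

        if (err->error_type == BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL)
                status = err->payload.fatal_queue.status;
        else
                status = err->payload.fatal_group.status;

        return status & 0xff;   /* e.g. 0x49 == CS_INVALID_INSTRUCTION */
}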
+ */ +enum base_csf_notification_type { + BASE_CSF_NOTIFICATION_EVENT = 0, + BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, + BASE_CSF_NOTIFICATION_COUNT +}; + +/** + * struct base_csf_notification - Event or error notification + * + * @type: Notification type of @base_csf_notification_type + * @padding: Padding for 64bit boundary + * @payload: Input Payload + * @payload.align: To fit the struct into a 64-byte cache line + * @payload.csg_error: CSG error + * @payload.csg_error.handle: Handle of GPU command queue group associated with + * fatal error + * @payload.csg_error.padding: Padding + * @payload.csg_error.error: Unrecoverable fault error + * + */ +struct base_csf_notification { + __u8 type; + __u8 padding[7]; + union { + struct { + __u8 handle; + __u8 padding[7]; + struct base_gpu_queue_group_error error; + } csg_error; + + __u8 align[56]; + } payload; +}; + +#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ diff --git a/src/panfrost/csf_test/mali_base_kernel.h b/src/panfrost/csf_test/mali_base_kernel.h new file mode 100644 index 00000000000..305956f341a --- /dev/null +++ b/src/panfrost/csf_test/mali_base_kernel.h @@ -0,0 +1,746 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * Base structures shared with the kernel. + */ + +#ifndef _UAPI_BASE_KERNEL_H_ +#define _UAPI_BASE_KERNEL_H_ + +#include + +struct base_mem_handle { + struct { + __u64 handle; + } basep; +}; + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +#define BASE_MAX_COHERENT_GROUPS 16 + +#if defined(PAGE_MASK) && defined(PAGE_SHIFT) +#define LOCAL_PAGE_SHIFT PAGE_SHIFT +#define LOCAL_PAGE_LSB ~PAGE_MASK +#else +#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 +#endif + +#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) +#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) +#else +#error Failed to find page size +#endif +#endif + +/* Physical memory group ID for normal usage. + */ +#define BASE_MEM_GROUP_DEFAULT (0) + +/* Number of physical memory groups. + */ +#define BASE_MEM_GROUP_COUNT (16) + +/** + * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. + * + * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator + * in order to determine the best cache policy. Some combinations are + * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), + * which defines a write-only region on the CPU side, which is + * heavily read by the CPU... + * Other flags are only meaningful to a particular allocator. + * More flags can be added to this list, as long as they don't clash + * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). 
+ */ +typedef __u32 base_mem_alloc_flags; + +/* A mask for all the flags which are modifiable via the base_mem_set_flags + * interface. + */ +#define BASE_MEM_FLAGS_MODIFIABLE \ + (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ + BASE_MEM_COHERENT_LOCAL) + +/* A mask of all the flags that can be returned via the base_mem_get_flags() + * interface. + */ +#define BASE_MEM_FLAGS_QUERYABLE \ + (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ + BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ + BASEP_MEM_FLAGS_KERNEL_ONLY)) + +/** + * enum base_mem_import_type - Memory types supported by @a base_mem_import + * + * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type + * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) + * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a + * base_mem_import_user_buffer + * + * Each type defines what the supported handle type is. + * + * If any new type is added here ARM must be contacted + * to allocate a numeric value for it. + * Do not just add a new type without synchronizing with ARM + * as future releases from ARM might include other new types + * which could clash with your custom types. + */ +enum base_mem_import_type { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + /* + * Import type with value 1 is deprecated. + */ + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 +}; + +/** + * struct base_mem_import_user_buffer - Handle of an imported user buffer + * + * @ptr: address of imported user buffer + * @length: length of imported user buffer in bytes + * + * This structure is used to represent a handle of an imported user buffer. + */ + +struct base_mem_import_user_buffer { + __u64 ptr; + __u64 length; +}; + +/* Mask to detect 4GB boundary alignment */ +#define BASE_MEM_MASK_4GB 0xfffff000UL +/* Mask to detect 4GB boundary (in page units) alignment */ +#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) + +/* Limit on the 'extension' parameter for an allocation with the + * BASE_MEM_TILER_ALIGN_TOP flag set + * + * This is the same as the maximum limit for a Buffer Descriptor's chunk size + */ +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ + (21u - (LOCAL_PAGE_SHIFT)) +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ + (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) + +/* Bit mask of cookies used for for memory allocation setup */ +#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ + +/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ +#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ + +/* + * struct base_fence - Cross-device synchronisation fence. + * + * A fence is used to signal when the GPU has finished accessing a resource that + * may be shared with other devices, and also to delay work done asynchronously + * by the GPU until other devices have finished accessing a shared resource. + */ +struct base_fence { + struct { + int fd; + int stream_fd; + } basep; +}; + +/** + * struct base_mem_aliasing_info - Memory aliasing info + * + * Describes a memory handle to be aliased. + * A subset of the handle can be chosen for aliasing, given an offset and a + * length. + * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a + * region where a special page is mapped with a write-alloc cache setup, + * typically used when the write result of the GPU isn't needed, but the GPU + * must write anyway. 
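For BASE_MEM_IMPORT_TYPE_USER_BUFFER, the handle passed to the import ioctl is the base_mem_import_user_buffer structure above. A minimal sketch of filling it; the helper is hypothetical, and the import ioctl itself is defined in mali_kbase_ioctl.h rather than here.

#include <stdint.h>

static void example_fill_user_buffer(struct base_mem_import_user_buffer *handle,
                                     void *cpu_ptr, uint64_t length_bytes)
{
        handle->ptr = (uint64_t)(uintptr_t)cpu_ptr; /* CPU VA of the buffer */
        handle->length = length_bytes;              /* length in bytes */
}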
+ * + * Offset and length are specified in pages. + * Offset must be within the size of the handle. + * Offset+length must not overrun the size of the handle. + * + * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * @offset: Offset within the handle to start aliasing from, in pages. + * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. + * @length: Length to alias, in pages. For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * specifies the number of times the special page is needed. + */ +struct base_mem_aliasing_info { + struct base_mem_handle handle; + __u64 offset; + __u64 length; +}; + +/* Maximum percentage of just-in-time memory allocation trimming to perform + * on free. + */ +#define BASE_JIT_MAX_TRIM_LEVEL (100) + +/* Maximum number of concurrent just-in-time memory allocations. + */ +#define BASE_JIT_ALLOC_COUNT (255) + +/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 + * + * jit_version is 1 + * + * Due to the lack of padding specified, user clients between 32 and 64-bit + * may have assumed a different size of the struct + * + * An array of structures was not supported + */ +struct base_jit_alloc_info_10_2 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; +}; + +/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up + * to 11.19 + * + * This structure had a number of modifications during and after kernel driver + * version 11.5, but remains size-compatible throughout its version history, and + * with earlier variants compatible with future variants by requiring + * zero-initialization to the unused space in the structure. + * + * jit_version is 2 + * + * Kernel driver version history: + * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes + * must be zero. Kbase minor version was not incremented, so some + * versions of 11.5 do not have this change. + * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase + * minor version not incremented) + * 11.6: Added 'flags', replacing 1 padding byte + * 11.10: Arrays of this structure are supported + */ +struct base_jit_alloc_info_11_5 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; +}; + +/** + * struct base_jit_alloc_info - Structure which describes a JIT allocation + * request. + * @gpu_alloc_addr: The GPU virtual address to write the JIT + * allocated GPU virtual address to. + * @va_pages: The minimum number of virtual pages required. + * @commit_pages: The minimum number of physical pages which + * should back the allocation. + * @extension: Granularity of physical pages to grow the + * allocation by during a fault. + * @id: Unique ID provided by the caller, this is used + * to pair allocation and free requests. + * Zero is not a valid value. + * @bin_id: The JIT allocation bin, used in conjunction with + * @max_allocations to limit the number of each + * type of JIT allocation. + * @max_allocations: The maximum number of allocations allowed within + * the bin specified by @bin_id. Should be the same + * for all allocations within the same bin. + * @flags: flags specifying the special requirements for + * the JIT allocation, see + * %BASE_JIT_ALLOC_VALID_FLAGS + * @padding: Expansion space - should be initialised to zero + * @usage_id: A hint about which allocation should be reused. 
+ * The kernel should attempt to use a previous + * allocation with the same usage_id + * @heap_info_gpu_addr: Pointer to an object in GPU memory describing + * the actual usage of the region. + * + * jit_version is 3. + * + * When modifications are made to this structure, it is still compatible with + * jit_version 3 when: a) the size is unchanged, and b) new members only + * replace the padding bytes. + * + * Previous jit_version history: + * jit_version == 1, refer to &base_jit_alloc_info_10_2 + * jit_version == 2, refer to &base_jit_alloc_info_11_5 + * + * Kbase version history: + * 11.20: added @heap_info_gpu_addr + */ +struct base_jit_alloc_info { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; + __u64 heap_info_gpu_addr; +}; + +enum base_external_resource_access { + BASE_EXT_RES_ACCESS_SHARED, + BASE_EXT_RES_ACCESS_EXCLUSIVE +}; + +struct base_external_resource { + __u64 ext_resource; +}; + + +/** + * The maximum number of external resources which can be mapped/unmapped + * in a single request. + */ +#define BASE_EXT_RES_COUNT_MAX 10 + +/** + * struct base_external_resource_list - Structure which describes a list of + * external resources. + * @count: The number of resources. + * @ext_res: Array of external resources which is + * sized at allocation time. + */ +struct base_external_resource_list { + __u64 count; + struct base_external_resource ext_res[1]; +}; + +struct base_jd_debug_copy_buffer { + __u64 address; + __u64 size; + struct base_external_resource extres; +}; + +#define GPU_MAX_JOB_SLOTS 16 + +/** + * User-side Base GPU Property Queries + * + * The User-side Base GPU Property Query interface encapsulates two + * sub-modules: + * + * - "Dynamic GPU Properties" + * - "Base Platform Config GPU Properties" + * + * Base only deals with properties that vary between different GPU + * implementations - the Dynamic GPU properties and the Platform Config + * properties. + * + * For properties that are constant for the GPU Architecture, refer to the + * GPU module. However, we will discuss their relevance here just to + * provide background information. + * + * About the GPU Properties in Base and GPU modules + * + * The compile-time properties (Platform Config, GPU Compile-time + * properties) are exposed as pre-processor macros. + * + * Complementing the compile-time properties are the Dynamic GPU + * Properties, which act as a conduit for the GPU Configuration + * Discovery. + * + * In general, the dynamic properties are present to verify that the platform + * has been configured correctly with the right set of Platform Config + * Compile-time Properties. + * + * As a consistent guide across the entire DDK, the choice for dynamic or + * compile-time should consider the following, in order: + * 1. Can the code be written so that it doesn't need to know the + * implementation limits at all? + * 2. If you need the limits, get the information from the Dynamic Property + * lookup. This should be done once as you fetch the context, and then cached + * as part of the context data structure, so it's cheap to access. + * 3. If there's a clear and arguable inefficiency in using Dynamic Properties, + * then use a Compile-Time Property (Platform Config, or GPU Compile-time + * property). 
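A quick sketch of a jit_version 3 request using the structure above: everything unused must be zero-initialised so the padding stays compatible, and @id must be non-zero so a later JIT_FREE can refer back to the allocation. The values below are purely illustrative.

#include <stdint.h>
#include <string.h>

static void example_jit_request(struct base_jit_alloc_info *info,
                                uint64_t result_gpu_va)
{
        memset(info, 0, sizeof(*info));         /* zero padding and unused fields */
        info->gpu_alloc_addr = result_gpu_va;   /* where the allocated GPU VA is written */
        info->va_pages = 16;                    /* reserve 16 virtual pages */
        info->commit_pages = 1;                 /* back one page up front */
        info->extension = 1;                    /* grow granularity on fault, in pages */
        info->id = 1;                           /* zero is not a valid ID */
}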
Examples of where this might be sensible follow: + * - Part of a critical inner-loop + * - Frequent re-use throughout the driver, causing significant extra load + * instructions or control flow that would be worthwhile optimizing out. + * + * We cannot provide an exhaustive set of examples, neither can we provide a + * rule for every possible situation. Use common sense, and think about: what + * the rest of the driver will be doing; how the compiler might represent the + * value if it is a compile-time constant; whether an OEM shipping multiple + * devices would benefit much more from a single DDK binary, instead of + * insignificant micro-optimizations. + * + * Dynamic GPU Properties + * + * Dynamic GPU properties are presented in two sets: + * 1. the commonly used properties in @ref base_gpu_props, which have been + * unpacked from GPU register bitfields. + * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props + * (also a member of base_gpu_props). All of these are presented in + * the packed form, as presented by the GPU registers themselves. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + * The properties returned extend the GPU Configuration Discovery + * registers. For example, GPU clock speed is not specified in the GPU + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. + * + * The GPU properties are obtained by a call to + * base_get_gpu_props(). This simply returns a pointer to a const + * base_gpu_props structure. It is constant for the life of a base + * context. Multiple calls to base_get_gpu_props() to a base context + * return the same pointer to a constant structure. This avoids cache pollution + * of the common data. + * + * This pointer must not be freed, because it does not point to the start of a + * region allocated by the memory allocator; instead, just close the @ref + * base_context. + * + * + * Kernel Operation + * + * During Base Context Create time, user-side makes a single kernel call: + * - A call to fill user memory with GPU information structures + * + * The kernel-side will fill the provided the entire processed base_gpu_props + * structure, because this information is required in both + * user and kernel side; it does not make sense to decode it twice. + * + * Coherency groups must be derived from the bitmasks, but this can be done + * kernel side, and just once at kernel startup: Coherency groups must already + * be known kernel-side, to support chains that specify a 'Only Coherent Group' + * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. + * + * Coherency Group calculation + * + * Creation of the coherent group data is done at device-driver startup, and so + * is one-time. This will most likely involve a loop with CLZ, shifting, and + * bit clearing on the L2_PRESENT mask, depending on whether the + * system is L2 Coherent. The number of shader cores is done by a + * population count, since faulty cores may be disabled during production, + * producing a non-contiguous mask. 
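The population count mentioned above has to tolerate holes in the mask, since fused-off cores leave gaps. A small sketch:

#include <stdint.h>

/* Count shader cores from a possibly non-contiguous present mask. */
static unsigned example_count_cores(uint64_t shader_present)
{
        unsigned n = 0;

        while (shader_present) {
                shader_present &= shader_present - 1;   /* clear lowest set bit */
                n++;
        }
        return n;
}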
+ * + * The memory requirements for this algorithm can be determined either by a __u64 + * population count on the L2_PRESENT mask (a LUT helper already is + * required for the above), or simple assumption that there can be no more than + * 16 coherent groups, since core groups are typically 4 cores. + */ + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +#define BASE_MAX_COHERENT_GROUPS 16 +/** + * struct mali_base_gpu_core_props - GPU core props info + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to allign to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + * @num_exec_engines: The number of execution engines. + */ +struct mali_base_gpu_core_props { + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; + __u8 num_exec_engines; +}; + +/* + * More information is possible - but associativity and bus width are not + * required by upper-level apis. + */ +struct mali_base_gpu_l2_cache_props { + __u8 log2_line_size; + __u8 log2_cache_size; + __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + __u8 padding[5]; +}; + +struct mali_base_gpu_tiler_props { + __u32 bin_size_bytes; /* Max is 4*2^15 */ + __u32 max_active_levels; /* Max is 2^15 */ +}; + +/** + * struct mali_base_gpu_thread_props - GPU threading system details. + * @max_threads: Max. number of threads per core + * @max_workgroup_size: Max. number of threads per workgroup + * @max_barrier_size: Max. number of threads that can synchronize on a + * simple barrier + * @max_registers: Total size [1..65535] of the register file available + * per core. + * @max_task_queue: Max. tasks [1..255] which may be sent to a core + * before it becomes blocked. + * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split + * field. 
+ * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, + * 3 = SW Model/Emulation + * @padding: padding to allign to 8-byte + * @tls_alloc: Number of threads per core that TLS must be + * allocated for + */ +struct mali_base_gpu_thread_props { + __u32 max_threads; + __u32 max_workgroup_size; + __u32 max_barrier_size; + __u16 max_registers; + __u8 max_task_queue; + __u8 max_thread_group_split; + __u8 impl_tech; + __u8 padding[3]; + __u32 tls_alloc; +}; + +/** + * struct mali_base_gpu_coherent_group - descriptor for a coherent group + * @core_mask: Core restriction mask required for the group + * @num_cores: Number of cores in the group + * @padding: padding to allign to 8-byte + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_base_gpu_coherent_group { + __u64 core_mask; + __u16 num_cores; + __u16 padding[3]; +}; + +/** + * struct mali_base_gpu_coherent_group_info - Coherency group information + * @num_groups: Number of coherent groups in the GPU. + * @num_core_groups: Number of core groups (coherent or not) in the GPU. + * Equivalent to the number of L2 Caches. + * The GPU Counter dumping writes 2048 bytes per core group, regardless + * of whether the core groups are coherent or not. Hence this member is + * needed to calculate how much memory is required for dumping. + * @note Do not use it to work out how many valid elements are in the + * group[] member. Use num_groups instead. + * @coherency: Coherency features of the memory, accessed by gpu_mem_features + * methods + * @padding: padding to allign to 8-byte + * @group: Descriptors of coherent groups + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. + */ +struct mali_base_gpu_coherent_group_info { + __u32 num_groups; + __u32 num_core_groups; + __u32 coherency; + __u32 padding; + struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; +}; + +/** + * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware + * Configuration Discovery registers. + * @shader_present: Shader core present bitmap + * @tiler_present: Tiler core present bitmap + * @l2_present: Level 2 cache present bitmap + * @stack_present: Core stack present bitmap + * @l2_features: L2 features + * @core_features: Core features + * @mem_features: Mem features + * @mmu_features: Mmu features + * @as_present: Bitmap of address spaces present + * @js_present: Job slots present + * @js_features: Array of job slot features. 
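As the comments above stress, only the first num_groups entries of group[] are meaningful. A sketch of walking them; the helper name is ours.

static unsigned
example_total_cores(const struct mali_base_gpu_coherent_group_info *info)
{
        unsigned i, total = 0;

        for (i = 0; i < info->num_groups && i < BASE_MAX_COHERENT_GROUPS; i++)
                total += info->group[i].num_cores;
        return total;
}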
+ * @tiler_features: Tiler features + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU + * @gpu_id: GPU and revision identifier + * @thread_max_threads: Maximum number of threads per core + * @thread_max_workgroup_size: Maximum number of threads per workgroup + * @thread_max_barrier_size: Maximum number of threads per barrier + * @thread_features: Thread features + * @coherency_mode: Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register + * @thread_tls_alloc: Number of threads per core that TLS must be allocated for + * @gpu_features: GPU features + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct gpu_raw_gpu_props { + __u64 shader_present; + __u64 tiler_present; + __u64 l2_present; + __u64 stack_present; + __u32 l2_features; + __u32 core_features; + __u32 mem_features; + __u32 mmu_features; + + __u32 as_present; + + __u32 js_present; + __u32 js_features[GPU_MAX_JOB_SLOTS]; + __u32 tiler_features; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + __u32 gpu_id; + + __u32 thread_max_threads; + __u32 thread_max_workgroup_size; + __u32 thread_max_barrier_size; + __u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + __u32 coherency_mode; + + __u32 thread_tls_alloc; + __u64 gpu_features; +}; + +/** + * struct base_gpu_props - Return structure for base_get_gpu_props(). + * @core_props: Core props. + * @l2_props: L2 props. + * @unused_1: Keep for backwards compatibility. + * @tiler_props: Tiler props. + * @thread_props: Thread props. + * @raw_props: This member is large, likely to be 128 bytes. + * @coherency_info: This must be last member of the structure. + * + * NOTE: the raw_props member in this data structure contains the register + * values from which the value of the other members are derived. The derived + * members exist to allow for efficient access and/or shielding the details + * of the layout of the registers. + */ +struct base_gpu_props { + struct mali_base_gpu_core_props core_props; + struct mali_base_gpu_l2_cache_props l2_props; + __u64 unused_1; + struct mali_base_gpu_tiler_props tiler_props; + struct mali_base_gpu_thread_props thread_props; + struct gpu_raw_gpu_props raw_props; + struct mali_base_gpu_coherent_group_info coherency_info; +}; + +#define BASE_MEM_GROUP_ID_GET(flags) \ + ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) + +#define BASE_MEM_GROUP_ID_SET(id) \ + (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? 
\ + BASE_MEM_GROUP_DEFAULT : \ + id) \ + << BASEP_MEM_GROUP_ID_SHIFT) & \ + BASE_MEM_GROUP_ID_MASK) + +#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ + (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ + ((base_context_create_flags)(group_id) \ + << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) + +#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ + ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ + BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* + * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These + * flags are also used, where applicable, for specifying which fields + * are valid following the request operation. + */ + +/* For monotonic (counter) timefield */ +#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) +/* For system wide timestamp */ +#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) +/* For GPU cycle counter */ +#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) +/* Specify kernel GPU register timestamp */ +#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) +/* Specify userspace cntvct_el0 timestamp source */ +#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) + +#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ + BASE_TIMEINFO_MONOTONIC_FLAG | \ + BASE_TIMEINFO_TIMESTAMP_FLAG | \ + BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ + BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ + BASE_TIMEINFO_USER_SOURCE_FLAG) + +/* Maximum number of source allocations allowed to create an alias allocation. + * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array + * layers, since each cube map in the array will have 6 faces. + */ +#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) + +#endif /* _UAPI_BASE_KERNEL_H_ */ diff --git a/src/panfrost/csf_test/mali_gpu_csf_registers.h b/src/panfrost/csf_test/mali_gpu_csf_registers.h new file mode 100644 index 00000000000..17e338cb238 --- /dev/null +++ b/src/panfrost/csf_test/mali_gpu_csf_registers.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * This header was originally autogenerated, but it is now ok (and + * expected) to have to add to it. + */ + +#ifndef _UAPI_GPU_CSF_REGISTERS_H_ +#define _UAPI_GPU_CSF_REGISTERS_H_ + +/* Only user block defines are included. 
HI words have been removed */ + +/* CS_USER_INPUT_BLOCK register offsets */ +#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ +#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ + +/* CS_USER_OUTPUT_BLOCK register offsets */ +#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ +#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ + +/* USER register offsets */ +#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ + +#endif diff --git a/src/panfrost/csf_test/mali_kbase_csf_ioctl.h b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h new file mode 100644 index 00000000000..3df8a01699f --- /dev/null +++ b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h @@ -0,0 +1,483 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_CSF_IOCTL_H_ +#define _UAPI_KBASE_CSF_IOCTL_H_ + +#include +#include + +/* + * 1.0: + * - CSF IOCTL header separated from JM + * 1.1: + * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 1.2: + * - Add new CSF GPU_FEATURES register into the property structure + * returned by KBASE_IOCTL_GET_GPUPROPS + * 1.3: + * - Add __u32 group_uid member to + * &struct_kbase_ioctl_cs_queue_group_create.out + * 1.4: + * - Replace padding in kbase_ioctl_cs_get_glb_iface with + * instr_features member of same size + * 1.5: + * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new + * queue registration call with extended format for supporting CS + * trace configurations with CSF trace_command. + * 1.6: + * - Added new HW performance counters interface to all GPUs. + * 1.7: + * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use + * 1.8: + * - Removed Kernel legacy HWC interface + */ + +#define BASE_UK_VERSION_MAJOR 1 +#define BASE_UK_VERSION_MINOR 8 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + + +/** + * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the + * base back-end + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * + * @Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. 
+ * Any change of this struct should also be mirrored to the latter. + */ +struct kbase_ioctl_cs_queue_register { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER \ + _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) + +/** + * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler + * to notify that a queue has been updated + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_kick { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_KICK \ + _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) + +/** + * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group + * + * @in: Input parameters + * @in.buffer_gpu_addr: GPU address of the buffer backing the queue + * @in.group_handle: Handle of the group to which the queue should be bound + * @in.csi_index: Index of the CSF interface the queue should be bound to + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.mmap_handle: Handle to be used for creating the mapping of CS + * input/output pages + */ +union kbase_ioctl_cs_queue_bind { + struct { + __u64 buffer_gpu_addr; + __u8 group_handle; + __u8 csi_index; + __u8 padding[6]; + } in; + struct { + __u64 mmap_handle; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_BIND \ + _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) + +/** + * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the + * base back-end in extended format, + * involving trace buffer configuration + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * @ex_offset_var_addr: GPU address of the trace buffer write offset variable + * @ex_buffer_base: Trace buffer GPU base address for the queue + * @ex_buffer_size: Size of the trace buffer in bytes + * @ex_event_size: Trace event write size, in log2 designation + * @ex_event_state: Trace event states configuration + * @ex_padding: Currently unused, must be zero + * + * @Note: There is an identical sub-section at the start of this struct to that + * of @ref kbase_ioctl_cs_queue_register. Any change of this sub-section + * must also be mirrored to the latter. Following the said sub-section, + * the remaining fields forms the extension, marked with ex_*. + */ +struct kbase_ioctl_cs_queue_register_ex { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; + __u64 ex_offset_var_addr; + __u64 ex_buffer_base; + __u32 ex_buffer_size; + __u8 ex_event_size; + __u8 ex_event_state; + __u8 ex_padding[2]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ + _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) + +/** + * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_terminate { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) + +/** + * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue + * group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. 
+ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. + */ +union kbase_ioctl_cs_queue_group_create_1_6 { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ + _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) + +/** + * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. 
+ */ +union kbase_ioctl_cs_queue_group_create { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + __u64 reserved; + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ + _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) + +/** + * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group + * + * @group_handle: Handle of the queue group to be terminated + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_cs_queue_group_term { + __u8 group_handle; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) + +#define KBASE_IOCTL_CS_EVENT_SIGNAL \ + _IO(KBASE_IOCTL_TYPE, 44) + +typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ + +/** + * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue + * + * @id: ID of the new command queue returned by the kernel + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_new { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ + _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) + +/** + * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue + * + * @id: ID of the command queue to be destroyed + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_delete { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ + _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) + +/** + * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue + * + * @addr: Memory address of an array of struct base_kcpu_queue_command + * @nr_commands: Number of commands in the array + * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_enqueue { + __u64 addr; + __u32 nr_commands; + base_kcpu_queue_id id; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ + _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) + +/** + * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap + * @in: Input parameters + * @in.chunk_size: Size of each chunk. + * @in.initial_chunks: Initial number of chunks that heap will be created with. + * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. + * @in.target_in_flight: Number of render-passes that the driver should attempt to + * keep in flight for which allocation of new chunks is + * allowed. + * @in.group_id: Group ID to be used for physical allocations. + * @in.padding: Padding + * @out: Output parameters + * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up + * for the heap. + * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, + * actually points to the header of heap chunk and not to + * the low address of free memory in the chunk. 
+ */ +union kbase_ioctl_cs_tiler_heap_init { + struct { + __u32 chunk_size; + __u32 initial_chunks; + __u32 max_chunks; + __u16 target_in_flight; + __u8 group_id; + __u8 padding; + } in; + struct { + __u64 gpu_heap_va; + __u64 first_chunk_va; + } out; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ + _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) + +/** + * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap + * instance + * + * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. + */ +struct kbase_ioctl_cs_tiler_heap_term { + __u64 gpu_heap_va; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ + _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) + +/** + * union kbase_ioctl_cs_get_glb_iface - Request the global control block + * of CSF interface capabilities + * + * @in: Input parameters + * @in.max_group_num: The maximum number of groups to be read. Can be 0, in + * which case groups_ptr is unused. + * @in.max_total_stream _num: The maximum number of CSs to be read. Can be 0, in + * which case streams_ptr is unused. + * @in.groups_ptr: Pointer where to store all the group data (sequentially). + * @in.streams_ptr: Pointer where to store all the CS data (sequentially). + * @out: Output parameters + * @out.glb_version: Global interface version. + * @out.features: Bit mask of features (e.g. whether certain types of job + * can be suspended). + * @out.group_num: Number of CSGs supported. + * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 + * hold the size of firmware performance counter data + * and 15:0 hold the size of hardware performance counter + * data. + * @out.total_stream_num: Total number of CSs, summed across all groups. + * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum + * size of events. Bits 3:0 hold the offset update rate. 
+ * (csf >= 1.1.0) + * + */ +union kbase_ioctl_cs_get_glb_iface { + struct { + __u32 max_group_num; + __u32 max_total_stream_num; + __u64 groups_ptr; + __u64 streams_ptr; + } in; + struct { + __u32 glb_version; + __u32 features; + __u32 group_num; + __u32 prfcnt_size; + __u32 total_stream_num; + __u32 instr_features; + } out; +}; + +#define KBASE_IOCTL_CS_GET_GLB_IFACE \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) + +struct kbase_ioctl_cs_cpu_queue_info { + __u64 buffer; + __u64 size; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ + _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +/** + * struct kbase_ioctl_cs_event_memory_write - Write an event memory address + * @cpu_addr: Memory address to write + * @value: Value to write + * @padding: Currently unused, must be zero + */ +struct kbase_ioctl_cs_event_memory_write { + __u64 cpu_addr; + __u8 value; + __u8 padding[7]; +}; + +/** + * union kbase_ioctl_cs_event_memory_read - Read an event memory address + * @in: Input parameters + * @in.cpu_addr: Memory address to read + * @out: Output parameters + * @out.value: Value read + * @out.padding: Currently unused, must be zero + */ +union kbase_ioctl_cs_event_memory_read { + struct { + __u64 cpu_addr; + } in; + struct { + __u8 value; + __u8 padding[7]; + } out; +}; + +#endif /* MALI_UNIT_TEST */ + +#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ diff --git a/src/panfrost/csf_test/mali_kbase_ioctl.h b/src/panfrost/csf_test/mali_kbase_ioctl.h new file mode 100644 index 00000000000..fc81b71b46a --- /dev/null +++ b/src/panfrost/csf_test/mali_kbase_ioctl.h @@ -0,0 +1,854 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_IOCTL_H_ +#define _UAPI_KBASE_IOCTL_H_ + +#ifdef __cpluscplus +extern "C" { +#endif + +#include +#include + +#define KBASE_IOCTL_TYPE 0x80 + +/** + * struct kbase_ioctl_set_flags - Set kernel context creation flags + * + * @create_flags: Flags - see base_context_create_flags + */ +struct kbase_ioctl_set_flags { + __u32 create_flags; +}; + +#define KBASE_IOCTL_SET_FLAGS \ + _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) + +/** + * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel + * + * @buffer: Pointer to the buffer to store properties into + * @size: Size of the buffer + * @flags: Flags - must be zero for now + * + * The ioctl will return the number of bytes stored into @buffer or an error + * on failure (e.g. 
@size is too small). If @size is specified as 0 then no + * data will be written but the return value will be the number of bytes needed + * for all the properties. + * + * @flags may be used in the future to request a different format for the + * buffer. With @flags == 0 the following format is used. + * + * The buffer will be filled with pairs of values, a __u32 key identifying the + * property followed by the value. The size of the value is identified using + * the bottom bits of the key. The value then immediately followed the key and + * is tightly packed (there is no padding). All keys and values are + * little-endian. + * + * 00 = __u8 + * 01 = __u16 + * 10 = __u32 + * 11 = __u64 + */ +struct kbase_ioctl_get_gpuprops { + __u64 buffer; + __u32 size; + __u32 flags; +}; + +#define KBASE_IOCTL_GET_GPUPROPS \ + _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) + +/** + * union kbase_ioctl_mem_alloc - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @out: Output parameters + * @out.flags: Flags + * @out.gpu_va: The GPU virtual address which is allocated + */ +union kbase_ioctl_mem_alloc { + struct { + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u64 flags; + } in; + struct { + __u64 flags; + __u64 gpu_va; + } out; +}; + +#define KBASE_IOCTL_MEM_ALLOC \ + _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) + +/** + * struct kbase_ioctl_mem_query - Query properties of a GPU memory region + * @in: Input parameters + * @in.gpu_addr: A GPU address contained within the region + * @in.query: The type of query + * @out: Output parameters + * @out.value: The result of the query + * + * Use a %KBASE_MEM_QUERY_xxx flag as input for @query. + */ +union kbase_ioctl_mem_query { + struct { + __u64 gpu_addr; + __u64 query; + } in; + struct { + __u64 value; + } out; +}; + +#define KBASE_IOCTL_MEM_QUERY \ + _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) + +#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) +#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) +#define KBASE_MEM_QUERY_FLAGS ((__u64)3) + +/** + * struct kbase_ioctl_mem_free - Free a memory region + * @gpu_addr: Handle to the region to free + */ +struct kbase_ioctl_mem_free { + __u64 gpu_addr; +}; + +#define KBASE_IOCTL_MEM_FREE \ + _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @buffer_count: requested number of dumping buffers + * @fe_bm: counters selection bitmask (Front end) + * @shader_bm: counters selection bitmask (Shader) + * @tiler_bm: counters selection bitmask (Tiler) + * @mmu_l2_bm: counters selection bitmask (MMU_L2) + * + * A fd is returned from the ioctl if successful, or a negative value on error + */ +struct kbase_ioctl_hwcnt_reader_setup { + __u32 buffer_count; + __u32 fe_bm; + __u32 shader_bm; + __u32 tiler_bm; + __u32 mmu_l2_bm; +}; + +#define KBASE_IOCTL_HWCNT_READER_SETUP \ + _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) + +/** + * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. + * @data: Counter samples for the dummy model. + * @size: Size of the counter sample data. + * @padding: Padding. 
+ */ +struct kbase_ioctl_hwcnt_values { + __u64 data; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_HWCNT_SET \ + _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) + +/** + * struct kbase_ioctl_disjoint_query - Query the disjoint counter + * @counter: A counter of disjoint events in the kernel + */ +struct kbase_ioctl_disjoint_query { + __u32 counter; +}; + +#define KBASE_IOCTL_DISJOINT_QUERY \ + _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) + +/** + * struct kbase_ioctl_get_ddk_version - Query the kernel version + * @version_buffer: Buffer to receive the kernel version string + * @size: Size of the buffer + * @padding: Padding + * + * The ioctl will return the number of bytes written into version_buffer + * (which includes a NULL byte) or a negative error code + * + * The ioctl request code has to be _IOW because the data in ioctl struct is + * being copied to the kernel, even though the kernel then writes out the + * version info to the buffer specified in the ioctl. + */ +struct kbase_ioctl_get_ddk_version { + __u64 version_buffer; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_GET_DDK_VERSION \ + _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) + +/** + * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 10.2--11.4) + * @va_pages: Number of VA pages to reserve for JIT + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_10_2 { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) + +/** + * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 11.5--11.19) + * @va_pages: Number of VA pages to reserve for JIT + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_11_5 { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) + +/** + * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory + * allocator + * @va_pages: Number of GPU virtual address pages to reserve for just-in-time + * memory allocations + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * @phys_pages: Maximum number of physical pages to allocate just-in-time + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. 
+ */ +struct kbase_ioctl_mem_jit_init { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; + __u64 phys_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) + +/** + * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory + * + * @handle: GPU memory handle (GPU VA) + * @user_addr: The address where it is mapped in user space + * @size: The number of bytes to synchronise + * @type: The direction to synchronise: 0 is sync to memory (clean), + * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_mem_sync { + __u64 handle; + __u64 user_addr; + __u64 size; + __u8 type; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_MEM_SYNC \ + _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) + +/** + * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer + * + * @in: Input parameters + * @in.gpu_addr: The GPU address of the memory region + * @in.cpu_addr: The CPU address to locate + * @in.size: A size in bytes to validate is contained within the region + * @out: Output parameters + * @out.offset: The offset from the start of the memory region to @cpu_addr + */ +union kbase_ioctl_mem_find_cpu_offset { + struct { + __u64 gpu_addr; + __u64 cpu_addr; + __u64 size; + } in; + struct { + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) + +/** + * struct kbase_ioctl_get_context_id - Get the kernel context ID + * + * @id: The kernel context ID + */ +struct kbase_ioctl_get_context_id { + __u32 id; +}; + +#define KBASE_IOCTL_GET_CONTEXT_ID \ + _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) + +/** + * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd + * + * @flags: Flags + * + * The ioctl returns a file descriptor when successful + */ +struct kbase_ioctl_tlstream_acquire { + __u32 flags; +}; + +#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ + _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) + +#define KBASE_IOCTL_TLSTREAM_FLUSH \ + _IO(KBASE_IOCTL_TYPE, 19) + +/** + * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region + * + * @gpu_addr: The memory region to modify + * @pages: The number of physical pages that should be present + * + * The ioctl may return on the following error codes or 0 for success: + * -ENOMEM: Out of memory + * -EINVAL: Invalid arguments + */ +struct kbase_ioctl_mem_commit { + __u64 gpu_addr; + __u64 pages; +}; + +#define KBASE_IOCTL_MEM_COMMIT \ + _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) + +/** + * union kbase_ioctl_mem_alias - Create an alias of memory regions + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.stride: Bytes between start of each memory region + * @in.nents: The number of regions to pack together into the alias + * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_alias { + struct { + __u64 flags; + __u64 stride; + __u64 nents; + __u64 aliasing_info; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_ALIAS \ + _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) + +/** + * union 
kbase_ioctl_mem_import - Import memory for use by the GPU + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.phandle: Handle to the external memory + * @in.type: Type of external memory, see base_mem_import_type + * @in.padding: Amount of extra VA pages to append to the imported buffer + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_import { + struct { + __u64 flags; + __u64 phandle; + __u32 type; + __u32 padding; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_IMPORT \ + _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) + +/** + * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region + * @gpu_va: The GPU region to modify + * @flags: The new flags to set + * @mask: Mask of the flags to modify + */ +struct kbase_ioctl_mem_flags_change { + __u64 gpu_va; + __u64 flags; + __u64 mask; +}; + +#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ + _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) + +/** + * struct kbase_ioctl_stream_create - Create a synchronisation stream + * @name: A name to identify this stream. Must be NULL-terminated. + * + * Note that this is also called a "timeline", but is named stream to avoid + * confusion with other uses of the word. + * + * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. + * + * The ioctl returns a file descriptor. + */ +struct kbase_ioctl_stream_create { + char name[32]; +}; + +#define KBASE_IOCTL_STREAM_CREATE \ + _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) + +/** + * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence + * @fd: The file descriptor to validate + */ +struct kbase_ioctl_fence_validate { + int fd; +}; + +#define KBASE_IOCTL_FENCE_VALIDATE \ + _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) + +/** + * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel + * @buffer: Pointer to the information + * @len: Length + * @padding: Padding + * + * The data provided is accessible through a debugfs file + */ +struct kbase_ioctl_mem_profile_add { + __u64 buffer; + __u32 len; + __u32 padding; +}; + +#define KBASE_IOCTL_MEM_PROFILE_ADD \ + _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) + +/** + * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to map + */ +struct kbase_ioctl_sticky_resource_map { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ + _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) + +/** + * struct kbase_ioctl_sticky_resource_map - Unmap a resource mapped which was + * previously permanently mapped + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to unmap + */ +struct kbase_ioctl_sticky_resource_unmap { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ + _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) + +/** + * union kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of + * the GPU memory region for + * the given gpu address and + * the offset of that address + * into the region + * @in: Input parameters + * @in.gpu_addr: GPU virtual address + * @in.size: Size in bytes within the region 
+ * @out: Output parameters + * @out.start: Address of the beginning of the memory region enclosing @gpu_addr + * for the length of @offset bytes + * @out.offset: The offset from the start of the memory region to @gpu_addr + */ +union kbase_ioctl_mem_find_gpu_start_and_offset { + struct { + __u64 gpu_addr; + __u64 size; + } in; + struct { + __u64 start; + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) + +#define KBASE_IOCTL_CINSTR_GWT_START \ + _IO(KBASE_IOCTL_TYPE, 33) + +#define KBASE_IOCTL_CINSTR_GWT_STOP \ + _IO(KBASE_IOCTL_TYPE, 34) + +/** + * union kbase_ioctl_gwt_dump - Used to collect all GPU write fault addresses. + * @in: Input parameters + * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. + * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) + * @in.len: Number of addresses the buffers can hold. + * @in.padding: padding + * @out: Output parameters + * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. + * @out.more_data_available: Status indicating if more addresses are available. + * @out.padding: padding + * + * This structure is used when performing a call to dump GPU write fault + * addresses. + */ +union kbase_ioctl_cinstr_gwt_dump { + struct { + __u64 addr_buffer; + __u64 size_buffer; + __u32 len; + __u32 padding; + + } in; + struct { + __u32 no_of_addr_collected; + __u8 more_data_available; + __u8 padding[27]; + } out; +}; + +#define KBASE_IOCTL_CINSTR_GWT_DUMP \ + _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) + +/** + * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone + * + * @va_pages: Number of VA pages to reserve for EXEC_VA + */ +struct kbase_ioctl_mem_exec_init { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_EXEC_INIT \ + _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) + +/** + * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of + * cpu/gpu time (counter values) + * @in: Input parameters + * @in.request_flags: Bit-flags indicating the requested types. + * @in.paddings: Unused, size alignment matching the out. + * @out: Output parameters + * @out.sec: Integer field of the monotonic time, unit in seconds. + * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. + * @out.padding: Unused, for __u64 alignment + * @out.timestamp: System wide timestamp (counter) value. + * @out.cycle_counter: GPU cycle counter value. + */ +union kbase_ioctl_get_cpu_gpu_timeinfo { + struct { + __u32 request_flags; + __u32 paddings[7]; + } in; + struct { + __u64 sec; + __u32 nsec; + __u32 padding; + __u64 timestamp; + __u64 cycle_counter; + } out; +}; + +#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ + _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) + +/** + * struct kbase_ioctl_context_priority_check - Check the max possible priority + * @priority: Input priority & output priority + */ + +struct kbase_ioctl_context_priority_check { + __u8 priority; +}; + +#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) + +/** + * struct kbase_ioctl_set_limited_core_count - Set the limited core count. 
+ * + * @max_core_count: Maximum core count + */ +struct kbase_ioctl_set_limited_core_count { + __u8 max_core_count; +}; + +#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ + _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) + +/** + * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter + * information + * @info_item_size: Performance counter item size in bytes. + * @info_item_count: Performance counter item count in the info_list_ptr. + * @info_list_ptr: Performance counter item list pointer which points to a + * list with info_item_count of items. + * + * On success: returns info_item_size and info_item_count if info_list_ptr is + * NULL, returns performance counter information if info_list_ptr is not NULL. + * On error: returns a negative error code. + */ +struct kbase_ioctl_kinstr_prfcnt_enum_info { + __u32 info_item_size; + __u32 info_item_count; + __u64 info_list_ptr; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ + _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @in: input parameters. + * @in.request_item_count: Number of requests in the requests array. + * @in.request_item_size: Size in bytes of each request in the requests array. + * @in.requests_ptr: Pointer to the requests array. + * @out: output parameters. + * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for + * each sample. + * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap + * for reading performance counter samples. + * + * A fd is returned from the ioctl if successful, or a negative value on error. + */ +union kbase_ioctl_kinstr_prfcnt_setup { + struct { + __u32 request_item_count; + __u32 request_item_size; + __u64 requests_ptr; + } in; + struct { + __u32 prfcnt_metadata_item_size; + __u32 prfcnt_mmap_size_bytes; + } out; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ + _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) + + +/** + * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes + * @bytes_collected: number of bytes read by user + * @bytes_generated: number of bytes generated by tracepoints + */ +struct kbase_ioctl_tlstream_stats { + __u32 bytes_collected; + __u32 bytes_generated; +}; + +#define KBASE_IOCTL_TLSTREAM_STATS \ + _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) + +#endif /* MALI_UNIT_TEST */ + +/* Customer extension range */ +#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) + +/* If the integration needs extra ioctl add them there + * like this: + * + * struct my_ioctl_args { + * .... 
+ * } + * + * #define KBASE_IOCTL_MY_IOCTL \ + * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) + */ + + +/********************************** + * Definitions for GPU properties * + **********************************/ +#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) +#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) +#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) +#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) + +#define KBASE_GPUPROP_PRODUCT_ID 1 +#define KBASE_GPUPROP_VERSION_STATUS 2 +#define KBASE_GPUPROP_MINOR_REVISION 3 +#define KBASE_GPUPROP_MAJOR_REVISION 4 +/* 5 previously used for GPU speed */ +#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 +/* 7 previously used for minimum GPU speed */ +#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 +#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 +#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 +#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 +#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 + +#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 +#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 +#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 + +#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 +#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 + +#define KBASE_GPUPROP_MAX_THREADS 18 +#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 +#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 +#define KBASE_GPUPROP_MAX_REGISTERS 21 +#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 +#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 +#define KBASE_GPUPROP_IMPL_TECH 24 + +#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 +#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 +#define KBASE_GPUPROP_RAW_L2_PRESENT 27 +#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 +#define KBASE_GPUPROP_RAW_L2_FEATURES 29 +#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 +#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 +#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 +#define KBASE_GPUPROP_RAW_AS_PRESENT 33 +#define KBASE_GPUPROP_RAW_JS_PRESENT 34 +#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 +#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 +#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 +#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 +#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 +#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 +#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 +#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 +#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 +#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 +#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 +#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 +#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 +#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 +#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 +#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 +#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 +#define KBASE_GPUPROP_RAW_GPU_ID 55 +#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 +#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 +#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 +#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 +#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 + +#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 +#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 +#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 +#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 +#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 +#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 +#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 +#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 +#define KBASE_GPUPROP_COHERENCY_GROUP_5 69 +#define KBASE_GPUPROP_COHERENCY_GROUP_6 70 +#define 
KBASE_GPUPROP_COHERENCY_GROUP_7 71 +#define KBASE_GPUPROP_COHERENCY_GROUP_8 72 +#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 +#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 +#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 +#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 +#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 +#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 +#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 + +#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 + +#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 + +#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 +#define KBASE_GPUPROP_TLS_ALLOC 84 +#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 +#ifdef __cpluscplus +} +#endif + +#endif /* _UAPI_KBASE_IOCTL_H_ */ diff --git a/src/panfrost/csf_test/test.c b/src/panfrost/csf_test/test.c new file mode 100644 index 00000000000..cb9ff398314 --- /dev/null +++ b/src/panfrost/csf_test/test.c @@ -0,0 +1,1903 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/macros.h" + +#include "mali_kbase_csf_ioctl.h" +#include "mali_kbase_ioctl.h" +#include "mali_base_kernel.h" +#include "mali_base_csf_kernel.h" +#include "mali_gpu_csf_registers.h" + +#define PAN_ARCH 10 +#include "genxml/gen_macros.h" + +#include "wrap.h" +#include "decode.h" + +#include "pan_shader.h" +#include "compiler/nir/nir_builder.h" +#include "bifrost/valhall/disassemble.h" + +#define CS_EVENT_REGISTER 0x5A + +static bool pr = true; +static bool colour_term = true; + +static void +dump_start(FILE *f) +{ + if (colour_term) + fprintf(f, "\x1b[90m"); +} + +static void +dump_end(FILE *f) +{ + if (colour_term) + fprintf(f, "\x1b[39m"); +} + +/* TODO: Use KBASE_IOCTL_MEM_SYNC for 32-bit systems */ +static void +cache_clean(volatile void *addr) +{ +#ifdef __aarch64__ + __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); +#endif +} + +static void +cache_invalidate(volatile void *addr) +{ +#ifdef __aarch64__ + __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); +#endif +} + +static void +cache_barrier(void) +{ +#ifdef __ARM_ARCH + __asm__ volatile ("dsb sy" ::: "memory"); +#endif +} + +static void +memory_barrier(void) +{ +#ifdef __ARM_ARCH + __asm__ volatile ("dmb sy" ::: "memory"); +#endif +} + +typedef void (*cacheline_op)(volatile void *addr); + +#define CACHELINE_SIZE 64 + +static void +cacheline_op_range(volatile void *start, unsigned length, cacheline_op op) +{ + volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); + volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); + for (; ptr < end; ptr += CACHELINE_SIZE) + op(ptr); +} + +static void +cache_clean_range(volatile void *start, unsigned length) +{ + cacheline_op_range(start, length, cache_clean); +} + +static void +cache_invalidate_range(volatile void *start, unsigned length) +{ + cacheline_op_range(start, length, cache_invalidate); +} + +struct state; +struct test; + +typedef bool (* section)(struct state *s, struct test *t); + +#define CS_QUEUE_COUNT 4 /* compute / vertex / fragment / other */ +#define CS_QUEUE_SIZE 65536 + +struct state { + int page_size; + int argc; + char **argv; + + int mali_fd; + int tl_fd; + void *tracking_region; + void *csf_user_reg; + + uint8_t *gpuprops; + unsigned gpuprops_size; + uint32_t gpu_id; + + struct { + struct panfrost_ptr normal, exec, coherent, cached, event, ev2; + } allocations; + + uint64_t tiler_heap_va; + uint64_t tiler_heap_header; + + uint8_t csg_handle; + uint32_t csg_uid; + + struct panfrost_ptr cs_mem[CS_QUEUE_COUNT]; + void *cs_user_io[CS_QUEUE_COUNT]; + unsigned cs_last_submit[CS_QUEUE_COUNT]; + struct pan_command_stream cs[CS_QUEUE_COUNT]; + + unsigned shader_alloc_offset; + mali_ptr compute_shader; +}; + +struct test { + section part; + section cleanup; + const char *label; + + struct test *subtests; + unsigned sub_length; + + /* for allocation tests */ + unsigned offset; + unsigned flags; + + bool add; + bool invalid; + bool blit; + bool vertex; +}; + +/* See STATE and ALLOC macros below */ +#define DEREF_STATE(s, offset) ((void*) s + offset) + +static uint64_t +pan_get_gpuprop(struct state *s, int name) +{ + int i = 0; + uint64_t x = 0; + while (i < s->gpuprops_size) { + x = 0; + memcpy(&x, s->gpuprops + i, 4); + i += 4; + + int size = 1 << (x & 3); + int this_name = x >> 2; + + x = 0; + memcpy(&x, s->gpuprops + i, size); + i += size; + + if 
(this_name == name) + return x; + } + + fprintf(stderr, "Unknown prop %i\n", name); + return 0; +} + +static bool +open_kbase(struct state *s, struct test *t) +{ + s->mali_fd = open("/dev/mali0", O_RDWR); + if (s->mali_fd != -1) + return true; + + perror("open(\"/dev/mali0\")"); + return false; +} + +static bool +close_kbase(struct state *s, struct test *t) +{ + if (getenv("TEST_CHECK_LEAKS")) { + int pid = getpid(); + char cmd_buffer[64] = {0}; + sprintf(cmd_buffer, "grep /dev/mali /proc/%i/maps", pid); + system(cmd_buffer); + sprintf(cmd_buffer, "ls -l /proc/%i/fd", pid); + system(cmd_buffer); + } + + if (s->mali_fd > 0) + return close(s->mali_fd) == 0; + return true; +} + +static bool +get_version(struct state *s, struct test *t) +{ + struct kbase_ioctl_version_check ver = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_VERSION_CHECK, &ver); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_VERSION_CHECK)"); + return false; + } + + if (pr) + printf("Major %i Minor %i: ", ver.major, ver.minor); + return true; +} + +static bool +set_flags(struct state *s, struct test *t) +{ + struct kbase_ioctl_set_flags flags = { + .create_flags = 0 + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_SET_FLAGS, &flags); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); + return false; + } + return true; +} + +static bool +mmap_tracking(struct state *s, struct test *t) +{ + s->tracking_region = mmap(NULL, s->page_size, PROT_NONE, + MAP_SHARED, s->mali_fd, + BASE_MEM_MAP_TRACKING_HANDLE); + + if (s->tracking_region == MAP_FAILED) { + perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); + s->tracking_region = NULL; + return false; + } + return true; +} + +static bool +munmap_tracking(struct state *s, struct test *t) +{ + if (s->tracking_region) + return munmap(s->tracking_region, s->page_size) == 0; + return true; +} + +static bool +get_gpuprops(struct state *s, struct test *t) +{ + struct kbase_ioctl_get_gpuprops props = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); + return false; + } else if (!ret) { + fprintf(stderr, "GET_GPUPROPS returned zero size\n"); + return false; + } + + s->gpuprops_size = ret; + s->gpuprops = calloc(s->gpuprops_size, 1); + + props.size = s->gpuprops_size; + props.buffer = (uint64_t)(uintptr_t) s->gpuprops; + + ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); + return false; + } + + return true; +} + +static bool +free_gpuprops(struct state *s, struct test *t) +{ + free(s->gpuprops); + return true; +} + +static bool +get_gpu_id(struct state *s, struct test *t) +{ + uint64_t gpu_id = pan_get_gpuprop(s, KBASE_GPUPROP_PRODUCT_ID); + if (!gpu_id) + return false; + s->gpu_id = gpu_id; + + uint16_t maj = gpu_id >> 12; + uint16_t min = (gpu_id >> 8) & 0xf; + uint16_t rev = (gpu_id >> 4) & 0xf; + + uint16_t product = gpu_id & 0xf; + uint16_t prod = product | ((maj & 1) << 4); + + const char *names[] = { + [1] = "TDUX", + [2] = "G710", + [3] = "G510", + [4] = "G310", + [7] = "G610", + [16 + 2] = "G715", /* TODO: Immortalis instead of Mali? */ + [16 + 3] = "G615", + }; + const char *name = (prod < ARRAY_SIZE(names)) ? 
names[prod] : NULL; + if (!name) + name = "unknown"; + + if (pr) + printf("v%i.%i.%i Mali-%s (%i): ", maj, min, rev, name, product); + + if (maj < 10) { + printf("not v10 or later: "); + return false; + } + + return true; +} + +static bool +get_coherency_mode(struct state *s, struct test *t) +{ + uint64_t mode = pan_get_gpuprop(s, KBASE_GPUPROP_RAW_COHERENCY_MODE); + + const char *modes[] = { + [0] = "ACE-Lite", + [1] = "ACE", + [31] = "None", + }; + const char *name = (mode < ARRAY_SIZE(modes)) ? modes[mode] : NULL; + if (!name) + name = "Unknown"; + + if (pr) + printf("0x%"PRIx64" (%s): ", mode, name); + return true; +} + +static bool +get_csf_caps(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_get_glb_iface iface = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(0))"); + return false; + } + + int ver_maj = iface.out.glb_version >> 24; + int ver_min = (iface.out.glb_version >> 16) & 0xff; + int ver_rev = iface.out.glb_version & 0xffff; + + if (pr) + printf("v%i.%i.%i: feature mask 0x%x, %i groups, %i total: ", + ver_maj, ver_min, ver_rev, iface.out.features, + iface.out.group_num, iface.out.total_stream_num); + + unsigned group_num = iface.out.group_num; + unsigned stream_num = iface.out.total_stream_num; + + struct basep_cs_group_control *group_data = + calloc(group_num, sizeof(*group_data)); + + struct basep_cs_stream_control *stream_data = + calloc(stream_num, sizeof(*stream_data)); + + iface = (union kbase_ioctl_cs_get_glb_iface) { + .in = { + .max_group_num = group_num, + .max_total_stream_num = stream_num, + .groups_ptr = (uintptr_t) group_data, + .streams_ptr = (uintptr_t) stream_data, + } + }; + + ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(size))"); + + free(group_data); + free(stream_data); + + return false; + } + + unsigned print_groups = pr ? group_num : 0; + unsigned print_streams = pr ? 
stream_num : 0; + + for (unsigned i = 0; i < print_groups; ++i) { + if (i && !memcmp(group_data + i, group_data + i - 1, sizeof(*group_data))) + continue; + + fprintf(stderr, "Group %i-: feature mask 0x%x, %i streams\n", + i, group_data[i].features, group_data[i].stream_num); + } + + for (unsigned i = 0; i < print_streams; ++i) { + if (i && !memcmp(stream_data + i, stream_data + i - 1, sizeof(*stream_data))) + continue; + + unsigned reg = stream_data[i].features & 0xff; + unsigned score = (stream_data[i].features >> 8) & 0xff; + unsigned feat = stream_data[i].features >> 16; + + fprintf(stderr, "Stream %i-: 0x%x work registers, %i scoreboards, iterator mask: 0x%x\n", + i, reg, score, feat); + } + + free(group_data); + free(stream_data); + + return true; +} + +static bool +mmap_user_reg(struct state *s, struct test *t) +{ + s->csf_user_reg = mmap(NULL, s->page_size, PROT_READ, + MAP_SHARED, s->mali_fd, + BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); + + if (s->csf_user_reg == MAP_FAILED) { + perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); + s->csf_user_reg = NULL; + return false; + } + return true; +} + +static bool +munmap_user_reg(struct state *s, struct test *t) +{ + if (s->csf_user_reg) + return munmap(s->csf_user_reg, s->page_size) == 0; + return true; +} + +static bool +init_mem_exec(struct state *s, struct test *t) +{ + struct kbase_ioctl_mem_exec_init init = { + .va_pages = 0x100000, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); + return false; + } + return true; +} + +static bool +init_mem_jit(struct state *s, struct test *t) +{ + struct kbase_ioctl_mem_jit_init init = { + .va_pages = 1 << 25, + .max_allocations = 255, + .phys_pages = 1 << 25, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_JIT_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); + return false; + } + return true; +} + +static bool +stream_create(struct state *s, struct test *t) +{ + struct kbase_ioctl_stream_create stream = { + .name = "stream" + }; + + s->tl_fd = ioctl(s->mali_fd, KBASE_IOCTL_STREAM_CREATE, &stream); + + if (s->tl_fd == -1) { + perror("ioctl(KBASE_IOCTL_STREAM_CREATE)"); + return false; + } + return true; + +} + +static bool +stream_destroy(struct state *s, struct test *t) +{ + if (s->tl_fd > 0) + return close(s->tl_fd) == 0; + return true; +} + +static bool +tiler_heap_create(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_tiler_heap_init init = { + .in = { + .chunk_size = 1 << 21, + .initial_chunks = 5, + .max_chunks = 200, + .target_in_flight = 65535, + } + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); + return false; + } + + s->tiler_heap_va = init.out.gpu_heap_va; + s->tiler_heap_header = init.out.first_chunk_va; + printf("heap va: %"PRIx64", heap header: %"PRIx64"\n", + s->tiler_heap_va, s->tiler_heap_header); + + return true; +} + +static bool +tiler_heap_term(struct state *s, struct test *t) +{ + if (!s->tiler_heap_va) + return true; + + struct kbase_ioctl_cs_tiler_heap_term term = { + .gpu_heap_va = s->tiler_heap_va + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); + return false; + } + return true; +} + +static bool +cs_group_create(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_queue_group_create_1_6 create = { + .in = { + /* Mali *still* only 
supports a single tiler unit */ + .tiler_mask = 1, + .fragment_mask = ~0ULL, + .compute_mask = ~0ULL, + + .cs_min = CS_QUEUE_COUNT, + + .priority = 1, + .tiler_max = 1, + .fragment_max = 64, + .compute_max = 64, + } + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); + return false; + } + + s->csg_handle = create.out.group_handle; + s->csg_uid = create.out.group_uid; + + if (pr) + printf("CSG handle: %i UID: %i: ", s->csg_handle, s->csg_uid); + + /* Should be at least 1 */ + if (!s->csg_uid) + abort(); + + return true; +} + +static bool +cs_group_term(struct state *s, struct test *t) +{ + if (!s->csg_uid) + return true; + + struct kbase_ioctl_cs_queue_group_term term = { + .group_handle = s->csg_handle + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); + return false; + } + return true; +} + +static bool +decode_init(struct state *s, struct test *t) +{ + pandecode_initialize(true); + return true; +} + +static bool +decode_close(struct state *s, struct test *t) +{ + pandecode_close(); + return true; +} + +static struct panfrost_ptr +alloc_ioctl(struct state *s, union kbase_ioctl_mem_alloc *a) +{ + struct panfrost_ptr p = {0}; + + uint64_t va_pages = a->in.va_pages; + uint64_t flags = a->in.flags; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_ALLOC, a); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); + return p; + } + + if ((flags & BASE_MEM_SAME_VA) && + (!(a->out.flags & BASE_MEM_SAME_VA) || + a->out.gpu_va != 0x41000)) { + + fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", + (uint64_t) a->out.flags, (uint64_t) a->out.gpu_va); + return p; + } + + void *ptr = mmap(NULL, s->page_size * va_pages, + PROT_READ | PROT_WRITE, MAP_SHARED, + s->mali_fd, a->out.gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU BO)"); + return p; + } + + uint64_t gpu_va = (a->out.flags & BASE_MEM_SAME_VA) ? 
+ (uintptr_t) ptr : a->out.gpu_va;
+
+ pandecode_inject_mmap(gpu_va, ptr, s->page_size * va_pages, NULL);
+
+ p.cpu = ptr;
+ p.gpu = gpu_va;
+
+ memset(p.cpu, 0, s->page_size * va_pages);
+
+ return p;
+}
+
+static struct panfrost_ptr
+alloc_mem(struct state *s, uint64_t size, uint64_t flags)
+{
+ unsigned pages = size / s->page_size;
+
+ union kbase_ioctl_mem_alloc a = {
+ .in = {
+ .va_pages = pages,
+ .commit_pages = pages,
+ .extension = 0,
+ .flags = flags,
+ }
+ };
+
+ return alloc_ioctl(s, &a);
+}
+
+static void
+alloc_redzone(struct state *s, struct panfrost_ptr p, uint64_t alloc_size)
+{
+ mmap(p.cpu - s->page_size, 1,
+ PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
+ -1, 0);
+
+ mmap(p.cpu + alloc_size, 1,
+ PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
+ -1, 0);
+}
+
+static bool
+alloc(struct state *s, struct test *t)
+{
+ struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset);
+
+ *ptr = alloc_mem(s, s->page_size, t->flags);
+
+ volatile int *p = (volatile int *) ptr->cpu;
+ *p = 0x12345;
+ if (*p != 0x12345) {
+ printf("Error reading from allocated memory at %p\n", p);
+ return false;
+ }
+ *p = 0;
+ cache_clean(p);
+
+ return true;
+}
+
+static bool
+dealloc(struct state *s, struct test *t)
+{
+ struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset);
+
+ if (ptr->cpu)
+ return munmap(ptr->cpu, s->page_size) == 0;
+ return true;
+}
+
+static bool
+cs_queue_create(struct state *s, struct test *t)
+{
+ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) {
+
+ /* Read/write from CPU/GPU, nothing special
+ * like coherency */
+ s->cs_mem[i] = alloc_mem(s, CS_QUEUE_SIZE, 0x200f);
+ s->cs[i].ptr = s->cs_mem[i].cpu;
+
+ if (!s->cs_mem[i].cpu)
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+cs_queue_free(struct state *s, struct test *t)
+{
+ bool pass = true;
+ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) {
+ if (s->cs_mem[i].cpu && munmap(s->cs_mem[i].cpu, CS_QUEUE_SIZE))
+ pass = false;
+ }
+ return pass;
+}
+
+static bool
+cs_queue_register(struct state *s, struct test *t)
+{
+ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) {
+ struct kbase_ioctl_cs_queue_register reg = {
+ .buffer_gpu_addr = s->cs_mem[i].gpu,
+ .buffer_size = CS_QUEUE_SIZE,
+ .priority = 1,
+ };
+
+ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_REGISTER, &reg);
+
+ if (ret == -1) {
+ perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)");
+ return false;
+ }
+
+ union kbase_ioctl_cs_queue_bind bind = {
+ .in = {
+ .buffer_gpu_addr = s->cs_mem[i].gpu,
+ .group_handle = s->csg_handle,
+ .csi_index = i,
+ }
+ };
+
+ ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind);
+
+ if (ret == -1) {
+ perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)");
+ }
+
+ s->cs_user_io[i] =
+ mmap(NULL,
+ s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ s->mali_fd, bind.out.mmap_handle);
+
+ if (s->cs_user_io[i] == MAP_FAILED) {
+ perror("mmap(CS USER IO)");
+ s->cs_user_io[i] = NULL;
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool
+cs_queue_term(struct state *s, struct test *t)
+{
+ bool pass = true;
+
+ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) {
+ if (s->cs_user_io[i] &&
+ munmap(s->cs_user_io[i],
+ s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES))
+ pass = false;
+
+ struct kbase_ioctl_cs_queue_terminate term = {
+ .buffer_gpu_addr = s->cs_mem[i].gpu,
+ };
+
+ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_TERMINATE,
+ &term);
+
+ if (ret == -1)
+ pass = false;
+ }
+ return pass;
+}
+
+#define CS_RING_DOORBELL(s, i) \
+ *((uint32_t *)(s->cs_user_io[i])) = 1
+
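+/* Note: the offsets used by these macros follow the kbase CSF user I/O
+ * mapping of BASEP_QUEUE_NR_MMAP_USER_PAGES pages per queue, as inferred
+ * from the code in this file: page 0 holds the doorbell word, page 1 is
+ * the input page written by the host (e.g. CS_INSERT), and page 2 is the
+ * output page updated by the GPU (e.g. CS_EXTRACT, CS_ACTIVE).
+ */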
+#define CS_READ_REGISTER(s, i, r) \ + *((uint64_t *)(s->cs_user_io[i] + s->page_size * 2 + r)) + +#define CS_WRITE_REGISTER(s, i, r, v) \ + *((uint64_t *)(s->cs_user_io[i] + s->page_size + r)) = v + +static void +submit_cs(struct state *s, unsigned i) +{ + uintptr_t p = (uintptr_t) s->cs[i].ptr; + unsigned pad = (-p) & 63; + memset(s->cs[i].ptr, 0, pad); + + unsigned last_offset = s->cs_last_submit[i]; + + unsigned insert_offset = p + pad - (uintptr_t) s->cs_mem[i].cpu; + insert_offset %= CS_QUEUE_SIZE; + + for (unsigned o = last_offset; o != insert_offset; + o = (o + 64) % CS_QUEUE_SIZE) + cache_clean(s->cs_mem[i].cpu + o); + + // TODO: Handle wraparound + // TODO: Provide a persistent buffer for pandecode to use? + if (pr) { + dump_start(stderr); + pandecode_cs(s->cs_mem[i].gpu + last_offset, + insert_offset - last_offset, s->gpu_id); + dump_end(stderr); + } + + cache_barrier(); + + CS_WRITE_REGISTER(s, i, CS_INSERT, insert_offset); + s->cs[i].ptr = s->cs_mem[i].cpu + insert_offset; + + memory_barrier(); + CS_RING_DOORBELL(s, i); + memory_barrier(); + + s->cs_last_submit[i] = insert_offset; +} + +/* Returns true if there was a timeout */ +static bool +wait_event(struct state *s, unsigned timeout_ms) +{ + struct pollfd fd = { + .fd = s->mali_fd, + .events = POLLIN, + }; + + int ret = poll(&fd, 1, timeout_ms); + + if (ret == -1) { + perror("poll(mali_fd)"); + return true; + } + + /* Timeout */ + if (ret == 0) + return true; + + struct base_csf_notification event; + ret = read(s->mali_fd, &event, sizeof(event)); + + if (ret == -1) { + perror("read(mali_fd)"); + return true; + } + + if (ret != sizeof(event)) { + fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", + ret, (int) sizeof(event)); + return false; + } + + switch (event.type) { + case BASE_CSF_NOTIFICATION_EVENT: + fprintf(stderr, "Notification event!\n"); + return false; + + case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: + break; + + case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: + fprintf(stderr, "No event from mali_fd!\n"); + return false; + + default: + fprintf(stderr, "Unknown event type!\n"); + return false; + } + + struct base_gpu_queue_group_error e = event.payload.csg_error.error; + + switch (e.error_type) { + case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue group error: status 0x%x " + "sideband 0x%"PRIx64"\n", + e.payload.fatal_group.status, + (uint64_t) e.payload.fatal_group.sideband); + break; + } + case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { + unsigned queue = e.payload.fatal_queue.csi_index; + + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue %i error: status 0x%x " + "sideband 0x%"PRIx64":", + queue, e.payload.fatal_queue.status, + (uint64_t) e.payload.fatal_queue.sideband); + + unsigned e = CS_READ_REGISTER(s, queue, CS_EXTRACT); + pandecode_cs(s->cs_mem[queue].gpu + e, 8, s->gpu_id); + + break; + } + + case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: + fprintf(stderr, "Command stream timeout!\n"); + break; + case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: + fprintf(stderr, "Command stream OOM!\n"); + break; + default: + fprintf(stderr, "Unknown error type!\n"); + } + + return false; +} + +static bool +kick_queue(struct state *s, unsigned i) +{ + struct kbase_ioctl_cs_queue_kick kick = { + .buffer_gpu_addr = s->cs_mem[i].gpu + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); + return false; + } + + return true; +} + 
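+/* Completion is detected by polling the CS_EXTRACT output register until
+ * it catches up with the CS_INSERT offset written by submit_cs(). While
+ * waiting, wait_event() reports any kbase notification (fatal group or
+ * queue errors, timeouts, tiler heap OOM); if the extract pointer stalls,
+ * the queue is kicked one more time in case the doorbell was missed.
+ */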
+static bool +wait_cs(struct state *s, unsigned i) +{ + unsigned extract_offset = (void *) s->cs[i].ptr - s->cs_mem[i].cpu; + + unsigned timeout_ms = 500; + + bool done_kick = false; + + while (CS_READ_REGISTER(s, i, CS_EXTRACT) != extract_offset) { + if (wait_event(s, timeout_ms)) { + if (pr) + fprintf(stderr, "Event wait timeout!\n"); + + unsigned e = CS_READ_REGISTER(s, i, CS_EXTRACT); + unsigned a = CS_READ_REGISTER(s, i, CS_ACTIVE); + + if (e != extract_offset) { + fprintf(stderr, "CS_EXTRACT (%i) != %i, " + "CS_ACTIVE (%i) on queue %i:", + e, extract_offset, a, i); + /* Decode two instructions instead? */ + pandecode_cs(s->cs_mem[i].gpu + e, 8, 1); + + if (done_kick) { + cache_barrier(); + return false; + } else { + fprintf(stderr, "Kicking queue\n"); + kick_queue(s, i); + done_kick = true; + } + } + } + } + + cache_barrier(); + + return true; +} + +static bool +cs_init(struct state *s, struct test *t) +{ + uint64_t event_init[] = { 1, 1, 1 }; + memcpy(s->allocations.event.cpu, event_init, sizeof(event_init)); + + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + CS_WRITE_REGISTER(s, i, CS_INSERT, 0); + pan_pack_ins(s->cs + i, CS_RESOURCES, cfg) { + switch (i) { + case 0: cfg.compute = true; break; + case 1: cfg.compute = true; cfg.fragment = true; break; + case 2: cfg.compute = true; cfg.tiler = true; cfg.idvs = true; break; + case 3: cfg.fragment = true; break; + } + } + pan_pack_ins(s->cs + i, CS_SLOT, cfg) { + cfg.index = 2; + } + pan_emit_cs_48(s->cs + i, CS_EVENT_REGISTER, + s->allocations.event.gpu); + submit_cs(s, i); + + if (!kick_queue(s, i)) + return false; + } + + return true; +} + +static struct panfrost_ptr * +buffers_elem(struct util_dynarray *buffers, unsigned index) +{ + unsigned size = util_dynarray_num_elements(buffers, + struct panfrost_ptr); + + if (index >= size) { + unsigned grow = index + 1 - size; + + memset(util_dynarray_grow(buffers, struct panfrost_ptr, grow), + 0, grow * sizeof(struct panfrost_ptr)); + } + + return util_dynarray_element(buffers, struct panfrost_ptr, index); +} + +static void +dump_hex64(FILE *fp, uint64_t *values, unsigned size) +{ + bool zero = false; + for (unsigned i = 0; i < size / 8; i += 2) { + uint64_t a = values[i]; + uint64_t b = values[i + 1]; + + if (!a && !b) { + if (!zero) + fprintf(fp, "%06X *\n", i * 8); + zero = true; + continue; + } + + zero = false; + + fprintf(fp, "%06X %16"PRIx64" %16"PRIx64"\n", + i * 8, a, b); + } + + fprintf(fp, "\n"); +} + +static void +dump_delta(FILE *fp, uint64_t *values, unsigned size) +{ + uint64_t old = 0; + bool zero = false; + bool el = false; + for (unsigned i = 0; i < size / 8; ++i) { + uint64_t val = values[i]; + int64_t delta = val - old; + + if (!zero || delta) { + fprintf(fp, "%"PRIi64"\n", delta); + el = false; + } else if (!el) { + fprintf(fp, "...\n"); + el = true; + } + + old = val; + zero = (delta == 0); + } +} + +static void +dump_tiler(FILE *fp, uint8_t *values, unsigned size) +{ + fflush(stdout); + FILE *stream = popen("tiler-hex-read", "w"); + // TODO! + fprintf(stream, "width %i\nheight %i\nmask %i\nvaheap %p\nsize %i\n", + 256, 256, 6, values, size); + pan_hexdump(stream, values, size, false); + pclose(stream); +} + +/* TODO: Pass in a filename? 
*/ +static void +dump_filehex(uint8_t *values, unsigned size) +{ + char buf[1024] = {0}; + + for (unsigned i = 0; i < 10000; ++i) { + snprintf(buf, 1024, "/tmp/fdump.%05i", i); + + int fd = open(buf, O_WRONLY | O_CREAT | O_EXCL, 0666); + if (fd == -1) + continue; + + FILE *fp = fdopen(fd, "w"); + + fprintf(fp, "%p, %u:\n", values, size); + pan_hexdump(fp, values, size, false); + + fclose(fp); /* will close fd */ + break; + } +} + +static void +dump_heatmap(FILE *fp, uint8_t *values, unsigned size, + unsigned gran, unsigned length, unsigned stride) +{ + unsigned sum = 0; + unsigned gr = 0; + unsigned st = 0; + unsigned ll = 0; + + while (size && !values[size - 1]) + --size; + + for (unsigned i = 0; i < size; ++i) { + sum += values[i]; + + if (++gr == gran) { + fprintf(fp, " %02x", sum & 0xff); + gr = 0; + sum = 0; + } + + if (++ll == length) { + i += stride - length; + fprintf(fp, "\n"); + st = 0; + ll = 0; + } else if (++st == stride) { + fprintf(fp, "\n"); + st = 0; + } + } + fprintf(fp, " %02x\n", sum & 0xff); +} + +static bool +cs_test(struct state *s, struct test *t) +{ + if (s->argc < 2) + return true; + + FILE *f = fopen(s->argv[1], "r"); + + struct util_dynarray buffers; + util_dynarray_init(&buffers, NULL); + + for (;;) { + char *line = NULL; + size_t sz = 0; + if (getline(&line, &sz, f) == -1) + break; + + unsigned long src, dst, offset, src_offset, size, iter, flags; + unsigned long gran, stride, length; + int read; + char *mode; + + if (sscanf(line, "rel%ms %lu+%lu %lu+%lu", + &mode, &dst, &offset, &src, &src_offset) == 5) { + + if (strcmp(mode, "oc") && strcmp(mode, "split")) { + fprintf(stderr, "Unknown relocation mode 'rel%s'\n", mode); + } + bool split = (mode[0] == 's'); + free(mode); + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + struct panfrost_ptr *d = buffers_elem(&buffers, dst); + + if (!s->gpu || !d->gpu) { + fprintf(stderr, "relocating to buffer that doesn't exist!\n"); + } + + uint64_t *dest = d->cpu + offset; + uint64_t value = s->gpu + src_offset; + if (split) { + dest[0] |= (uint32_t) value; + dest[1] |= (uint32_t) (value >> 32); + } else { + *dest |= value; + } + + } else if (sscanf(line, "buffer %lu %lu %lx %n", + &dst, &size, &flags, &read) == 3) { + line += read; + + struct panfrost_ptr buffer = + alloc_mem(s, ALIGN_POT(size, s->page_size), + flags); + + alloc_redzone(s, buffer, ALIGN_POT(size, s->page_size)); + + *buffers_elem(&buffers, dst) = buffer; + + //printf("buffer %lu == 0x%lx\n", dst, buffer.gpu); + + uint64_t *fill = buffer.cpu; + + for (unsigned i = 0; i < size / 8; ++i) { + read = 0; + unsigned long long val = 0; + if (sscanf(line, "%Lx %n", &val, &read) != 1) + break; + line += read; + fill[i] = val; + } + + cache_clean_range(buffer.cpu, size); + + } else if (sscanf(line, "exe %n %lu %lu %lu", + &read, &iter, &dst, &size) == 3) { + line += read; + + unsigned iter_mask = 0; + + for (;;) { + read = 0; + if (sscanf(line, "%lu %lu %lu %n", + &iter, &dst, &size, &read) != 3) + break; + line += read; + + struct panfrost_ptr *d = + buffers_elem(&buffers, dst); + + /* TODO: Check 'size' against buffer size */ + + pandecode_cs(d->gpu, size, s->gpu_id); + + if (iter > 3) { + fprintf(stderr, + "execute on out-of-bounds " + "iterator\n"); + continue; + } + + memcpy(s->cs[iter].ptr, d->cpu, size); + s->cs[iter].ptr += size / 8; + + iter_mask |= (1 << iter); + } + + u_foreach_bit(i, iter_mask) + submit_cs(s, i); + + u_foreach_bit(i, iter_mask) + kick_queue(s, i); + + u_foreach_bit(i, iter_mask) + wait_cs(s, i); + + } else if (sscanf(line, "dump 
%lu %lu %lu %ms", + &src, &offset, &size, &mode) == 4) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "dumping buffer that doesn't exist!\n"); + + cache_invalidate_range(s->cpu + offset, size); + + if (!strcmp(mode, "hex")) + pan_hexdump(stdout, s->cpu + offset, size, true); + else if (!strcmp(mode, "hex64")) + dump_hex64(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "delta")) + dump_delta(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "tiler")) + dump_tiler(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "filehex")) + dump_filehex(s->cpu + offset, size); + + free(mode); + + } else if (sscanf(line, "heatmap %lu %lu %lu %lu %lu %lu", + &src, &offset, &size, + &gran, &length, &stride) == 6) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "dumping buffer that doesn't exist!\n"); + + cache_invalidate_range(s->cpu + offset, size); + + dump_heatmap(stdout, s->cpu + offset, size, + gran, length, stride); + + } else if (sscanf(line, "memset %lu %lu %lu %lu", + &src, &offset, &gran, &size) == 4) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "memset on buffer that doesn't exist!\n"); + + memset(s->cpu + offset, gran, size); + cache_clean_range(s->cpu + offset, size); + + } else if (sscanf(line, "sleep %lu", &size) == 1) { + + usleep(size * 1000); + + } else if (strcmp(line, "td\n") == 0 || strcmp(line, "td") == 0) { + + void *ptr; + + ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, + s->tiler_heap_header); + pan_hexdump(stdout, ptr, 4096, false); + pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); + munmap(ptr, 1 << 21); + + ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, + s->tiler_heap_header + (1 << 21)); + pan_hexdump(stdout, ptr, 4096, false); + pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); + munmap(ptr, 1 << 21); + + } else { + fprintf(stderr, "unknown command '%s'\n", line); + } + } + + /* Skip following tests */ + return false; +} + +static void +pan_cs_evadd(pan_command_stream *c, unsigned offset, unsigned value) +{ + pan_emit_cs_32(c, 0x5e, value); + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = offset; + cfg.src = 0x5a; + cfg.dest = 0x5c; + } + pan_pack_ins(c, CS_EVADD, cfg) { + cfg.value = 0x5e; + cfg.addr = 0x5c; + } +} + +static bool +cs_simple(struct state *s, struct test *t) +{ + unsigned queue = t->vertex ? 2 : 0; + + pan_command_stream *c = s->cs + queue; + + unsigned dest = t->invalid ? 
0x65 : 0x48; + + pan_emit_cs_32(c, dest, 0x1234); + pan_cs_evadd(c, 0, 1); + + submit_cs(s, queue); + return wait_cs(s, queue); +} + +static bool +cs_store(struct state *s, struct test *t) +{ + pan_command_stream *c = s->cs; + + uint32_t *dest = s->allocations.ev2.cpu + 240; + mali_ptr dest_va = s->allocations.ev2.gpu + 240; + uint32_t value = 1234; + uint32_t add = 4320000; + + *dest = 0; + cache_clean(dest); + + unsigned addr_reg = 0x48; + unsigned value_reg = 0x4a; + + if (t->invalid) + dest_va = 0xfdcba9876543; + + pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = (1 << 1); } + pan_emit_cs_48(c, addr_reg, dest_va); + pan_emit_cs_32(c, value_reg, value); + + if (t->add) { + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = add; + cfg.src = value_reg; + cfg.dest = value_reg; + } + value += add; + } + + pan_pack_ins(c, CS_STR, cfg) { + cfg.addr = addr_reg; + cfg.register_base = value_reg; + cfg.register_mask = 1; + } + pan_cs_evadd(c, 0, 1); + + submit_cs(s, 0); + wait_cs(s, 0); + + cache_invalidate(dest); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = *dest; + + if (t->invalid && result == value) { + printf("Got %i, did not expect %i: ", result, value); + return false; + } else if (result != value) { + printf("Got %i, expected %i: ", result, value); + return false; + } + + return true; +} + +static void +emit_cs_call(pan_command_stream *c, mali_ptr va, void *start, void *end) +{ + cache_clean_range(start, end - start); + + pan_emit_cs_48(c, 0x48, va); + pan_emit_cs_32(c, 0x4a, end - start); + pan_pack_ins(c, CS_CALL, cfg) { + cfg.address = 0x48; + cfg.length = 0x4a; + } +} + +static bool +cs_sub(struct state *s, struct test *t) +{ + pan_command_stream *c = s->cs; + pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; + mali_ptr cs_va = s->allocations.cached.gpu; + + uint32_t *dest = s->allocations.normal.cpu; + mali_ptr dest_va = s->allocations.normal.gpu; + uint32_t value = 4321; + + *dest = 0; + cache_clean(dest); + + unsigned addr_reg = 0x48; + unsigned value_reg = 0x4a; + + void *start = i->ptr; + + pan_emit_cs_ins(c, 0x30, 0x5a0000000000); + + pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } + pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = (1 << 3); } + //pan_emit_cs_ins(i, 0x31, 0); + + pan_emit_cs_48(i, addr_reg, dest_va); + pan_emit_cs_32(i, value_reg, value); + //pan_emit_cs_ins(i, 0x25, 0x01484a00000005ULL); + pan_pack_ins(i, CS_STR, cfg) { + cfg.addr = addr_reg; + cfg.register_base = value_reg; + cfg.register_mask = 1; + } + //pan_emit_cs_ins(i, 0x09, 0); + //pan_emit_cs_ins(i, 0x31, 0x100000000); + + //pan_emit_cs_ins(i, 0x24, 0x024a0000f80211ULL); + + /* + pan_pack_ins(i, CS_STR_32, cfg) { + cfg.unk_1 = 1; + cfg.unk_2 = 4; + cfg.unk_3 = 1; + cfg.addr = addr_reg; + cfg.value = value_reg; + }*/ + + emit_cs_call(c, cs_va, start, i->ptr); + pan_cs_evadd(c, 0, 1); + + submit_cs(s, 0); + wait_cs(s, 0); + + cache_invalidate(dest); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = *dest; + + if (result != value) { + printf("Got %i, expected %i: ", result, value); + return false; + } + + return true; +} + +static mali_ptr +upload_shader(struct state *s, struct util_dynarray binary) +{ + assert(s->shader_alloc_offset + binary.size < s->page_size); + + mali_ptr va = s->allocations.exec.gpu + s->shader_alloc_offset; + + memcpy(s->allocations.exec.cpu, binary.data, binary.size); + + /* Shouldn't be needed, but just in case... 
*/ + cache_clean_range(s->allocations.exec.cpu, binary.size); + + s->shader_alloc_offset += binary.size; + + return va; +} + +static bool +compute_compile(struct state *s, struct test *t) +{ + nir_builder _b = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, + GENX(pan_shader_get_compiler_options)(), + "mem_store"), *b = &_b; + + nir_ssa_def *ptr = + nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0)); + + nir_ssa_def *value = nir_imm_int(b, 123); + + nir_store_global(b, ptr, 8, value, 1); + + struct panfrost_compile_inputs inputs = { + .gpu_id = s->gpu_id, + .no_ubo_to_push = true, + }; + + struct util_dynarray binary = {0}; + struct pan_shader_info shader_info = {0}; + + GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info); + + dump_start(stderr); + disassemble_valhall(stderr, binary.data, binary.size, true); + dump_end(stderr); + + s->compute_shader = upload_shader(s, binary); + + util_dynarray_fini(&binary); + ralloc_free(b->shader); + + return true; +} + +static struct panfrost_ptr +mem_offset(struct panfrost_ptr ptr, unsigned offset) +{ + ptr.cpu += offset; + ptr.gpu += offset; + return ptr; +} + +static bool +compute_execute(struct state *s, struct test *t) +{ + unsigned queue = t->blit ? 1 : 0; + + pan_command_stream *c = s->cs + queue; + pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; + mali_ptr cs_va = s->allocations.cached.gpu; + + struct panfrost_ptr dest = s->allocations.normal; + uint32_t value = 123; + + *(uint32_t *) dest.cpu = 0; + cache_clean(dest.cpu); + + struct panfrost_ptr fau = mem_offset(dest, 128); + *(uint64_t *) fau.cpu = dest.gpu; + cache_clean(fau.cpu); + + struct panfrost_ptr local_storage = mem_offset(dest, 192); + pan_pack(local_storage.cpu, LOCAL_STORAGE, _); + cache_clean(local_storage.cpu); + + struct panfrost_ptr shader_program = mem_offset(dest, 256); + pan_pack(shader_program.cpu, SHADER_PROGRAM, cfg) { + cfg.stage = MALI_SHADER_STAGE_COMPUTE; + cfg.primary_shader = true; + cfg.register_allocation = + MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; + cfg.binary = s->compute_shader; + } + cache_clean(shader_program.cpu); + + void *start = i->ptr; + + pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } + //pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = 1 << 3; } + + pan_pack_cs(i, COMPUTE_PAYLOAD, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; + + cfg.workgroup_count_x = 1; + cfg.workgroup_count_y = 1; + cfg.workgroup_count_z = 1; + + cfg.compute.shader = shader_program.gpu; + cfg.compute.thread_storage = local_storage.gpu; + + cfg.compute.fau = fau.gpu; + cfg.compute.fau_count = 1; + } + + pan_pack_ins(i, COMPUTE_LAUNCH, _); + + //pan_emit_cs_32(c, 0x54, 1); + //pan_emit_cs_ins(c, 0x24, 0x540000000233); + emit_cs_call(c, cs_va, start, i->ptr); + + pan_emit_cs_32(c, 0x4a, 0); + pan_emit_cs_ins(c, 0x24, 0x024a0000000211ULL); + + pan_emit_cs_48(c, 0x48, dest.gpu); + pan_pack_ins(c, CS_LDR, cfg) { + cfg.offset = 0; + cfg.register_mask = 1; + cfg.addr = 0x48; + cfg.register_base = 0x20; + } + pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1; } + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = 1; + cfg.src = 0x20; + cfg.dest = 0x20; + } + pan_pack_ins(c, CS_STR, cfg) { + cfg.offset = 64; + cfg.register_mask = 1; + cfg.addr = 0x48; + cfg.register_base = 0x20; + } + + pan_cs_evadd(c, 0, 1); + + submit_cs(s, queue); + wait_cs(s, queue); + + cache_invalidate(dest.cpu); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = ((uint32_t *)dest.cpu)[0]; + uint32_t result2 = 
((uint32_t *)dest.cpu)[16];
+
+ if (result != value) {
+ printf("Got %i, %i, expected %i: ", result, result2, value);
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+mmu_dump(struct state *s, struct test *t)
+{
+ unsigned size = 1024 * 1024;
+
+ void *mem = mmap(NULL, size, PROT_READ, MAP_SHARED,
+ s->mali_fd, BASE_MEM_MMU_DUMP_HANDLE);
+ if (mem == MAP_FAILED) {
+ perror("mmap(BASE_MEM_MMU_DUMP_HANDLE)");
+ return false;
+ }
+
+ pan_hexdump(stdout, mem, size, true);
+
+ return true;
+}
+
+#define SUBTEST(s) { .label = #s, .subtests = s, .sub_length = ARRAY_SIZE(s) }
+
+#define STATE(item) .offset = offsetof(struct state, item)
+
+#define ALLOC(item) .offset = offsetof(struct state, allocations.item)
+#define ALLOC_TEST(label, item, f) { alloc, dealloc, label, ALLOC(item), .flags = f }
+
+struct test kbase_main[] = {
+ { open_kbase, close_kbase, "Open kbase device" },
+ { get_version, NULL, "Check version" },
+ { set_flags, NULL, "Set flags" },
+ { mmap_tracking, munmap_tracking, "Map tracking handle" },
+ { get_gpuprops, free_gpuprops, "Get GPU properties" },
+ { get_gpu_id, NULL, "GPU ID" },
+ { get_coherency_mode, NULL, "Coherency mode" },
+ { get_csf_caps, NULL, "CSF caps" },
+ { mmap_user_reg, munmap_user_reg, "Map user register page" },
+ { init_mem_exec, NULL, "Initialise EXEC_VA zone" },
+ { init_mem_jit, NULL, "Initialise JIT allocator" },
+ { stream_create, stream_destroy, "Create synchronisation stream" },
+ { tiler_heap_create, tiler_heap_term, "Create chunked tiler heap" },
+ { cs_group_create, cs_group_term, "Create command stream group" },
+ { decode_init, decode_close, "Initialise pandecode" },
+
+ /* Flags are named in mali_base_csf_kernel.h, omitted for brevity */
+ ALLOC_TEST("Allocate normal memory", normal, 0x200f),
+ ALLOC_TEST("Allocate executable memory", exec, 0x2017),
+ ALLOC_TEST("Allocate coherent memory", coherent, 0x280f),
+ ALLOC_TEST("Allocate cached memory", cached, 0x380f),
+ ALLOC_TEST("Allocate CSF event memory", event, 0x8200f),
+ ALLOC_TEST("Allocate CSF event memory 2", ev2, 0x8200f),
+
+ /* These three tests are run for every queue, but later ones are not */
+ { cs_queue_create, cs_queue_free, "Create command stream queues" },
+ { cs_queue_register, cs_queue_term, "Register command stream queues" },
+
+ { cs_test, NULL, "Test command stream" },
+
+ { cs_init, NULL, "Initialise and start command stream queues" },
+ { cs_simple, NULL, "Execute MOV command" },
+ { cs_simple, NULL, "Execute MOV command (again)" },
+ { cs_simple, NULL, "Execute MOV command (vertex)", .vertex = true },
+ //{ cs_simple, NULL, "Execute MOV command (vertex, invalid)", .invalid = true, .vertex = true },
+ { cs_simple, NULL, "Execute MOV command (vertex, again)", .vertex = true },
+ { cs_store, NULL, "Execute STR command" },
+ //{ cs_store, NULL, "Execute STR command to invalid address", .invalid = true },
+ { cs_store, NULL, "Execute ADD command", .add = true },
+ { cs_sub, NULL, "Execute STR on iterator" },
+
+ { compute_compile, NULL, "Compile a compute shader" },
+ { compute_execute, NULL, "Execute a compute shader" },
+ { compute_execute, NULL, "Execute compute on blit queue", .blit = true },
+
+ //{ mmu_dump, NULL, "Dump MMU pagetables" },
+};
+
+static void
+do_test_list(struct state *s, struct test *tests, unsigned length);
+
+static void
+cleanup_test_list(struct state *s, struct test *tests, unsigned length)
+{
+ for (unsigned i = length; i > 0; --i) {
+ unsigned n = i - 1;
+
+ struct test *t = &tests[n];
+ if (!t->cleanup)
+ continue;
+
+ if (pr)
+
printf("[CLEANUP %i] %s: ", n, t->label); + if (t->cleanup(s, t)) { + if (pr) + printf("PASS\n"); + } else { + if (pr) + printf("FAIL\n"); + } + } +} + +static unsigned +interpret_test_list(struct state *s, struct test *tests, unsigned length) +{ + for (unsigned i = 0; i < length; ++i) { + struct test *t = &tests[i]; + + if (pr) + printf("[TEST %i] %s: ", i, t->label); + if (t->part) { + if (t->part(s, t)) { + if (pr) + printf("PASS\n"); + } else { + if (pr) + printf("FAIL\n"); + if (!getenv("TEST_KEEP_GOING")) + return i + 1; + } + } + if (t->subtests) + do_test_list(s, t->subtests, t->sub_length); + } + + return length; +} + +static void +do_test_list(struct state *s, struct test *tests, unsigned length) +{ + unsigned ran = interpret_test_list(s, tests, length); + cleanup_test_list(s, tests, ran); +} + +int +main(int argc, char *argv[]) +{ + struct state s = { + .page_size = sysconf(_SC_PAGE_SIZE), + .argc = argc, + .argv = argv, + }; + + if (getenv("CSF_QUIET")) + pr = false; + + if (!strcmp(getenv("TERM"), "dumb")) + colour_term = false; + + if (pr) + printf("Running Valhall CSF tests\n"); + + do_test_list(&s, kbase_main, ARRAY_SIZE(kbase_main)); +} diff --git a/src/panfrost/lib/wrap.h b/src/panfrost/lib/wrap.h index c1e61332203..d708d628d36 100644 --- a/src/panfrost/lib/wrap.h +++ b/src/panfrost/lib/wrap.h @@ -1,4 +1,3 @@ - /* * Copyright (C) 2017-2019 Lyude Paul * Copyright (C) 2017-2019 Alyssa Rosenzweig @@ -50,6 +49,8 @@ struct pandecode_context *pandecode_create_context(bool to_stderr); void pandecode_next_frame(struct pandecode_context *ctx); +void pandecode_dump_file_close(void); + void pandecode_destroy_context(struct pandecode_context *ctx); void pandecode_inject_mmap(struct pandecode_context *ctx, uint64_t gpu_va, @@ -64,6 +65,10 @@ void pandecode_jc(struct pandecode_context *ctx, uint64_t jc_gpu_va, void pandecode_cs(struct pandecode_context *ctx, mali_ptr queue_gpu_va, uint32_t size, unsigned gpu_id, uint32_t *regs); +void pandecode_cs(uint64_t cs_gpu_va, unsigned cs_size, unsigned gpu_id); + +void pandecode_dump_mappings(void); + void pandecode_abort_on_fault(struct pandecode_context *ctx, uint64_t jc_gpu_va, unsigned gpu_id); diff --git a/src/panfrost/meson.build b/src/panfrost/meson.build index aa393d44fe5..43860d4ee2a 100644 --- a/src/panfrost/meson.build +++ b/src/panfrost/meson.build @@ -20,7 +20,7 @@ # SOFTWARE. 
 inc_panfrost_hw = include_directories([
- 'include'
+ 'include', 'base'
 ])
 inc_panfrost = include_directories([
@@ -35,6 +35,7 @@ subdir('shared')
 subdir('util')
 subdir('midgard')
 subdir('compiler')
+subdir('base')
 if with_gallium_panfrost or with_panfrost_vk
 subdir('lib')
@@ -70,6 +71,46 @@ bifrost_compiler = executable(
 build_by_default : with_tools.contains('panfrost')
 )
+csf_test = executable(
+ 'csf_test',
+ ['csf_test/test.c'],
+ include_directories : [
+ inc_mapi,
+ inc_mesa,
+ inc_gallium,
+ inc_gallium_aux,
+ inc_include,
+ inc_src,
+ inc_panfrost,
+ inc_panfrost_hw,
+ ],
+ dependencies : [
+ idep_nir,
+ idep_mesautil,
+ idep_bi_opcodes_h,
+ dep_libdrm,
+ libpanfrost_dep,
+ ],
+ build_by_default : true
+)
+
+custom_target(
+ 'panfrost_panloader',
+ output: ['panfrost_panloader.txt'],
+ depends : [
+ libpanfrost_lib,
+ libpanfrost_util,
+ _libmesa_util,
+ libpanfrost_decode,
+ libpanfrost_decode_per_arch,
+ libpanfrost_midgard_disasm,
+ libpanfrost_bifrost_disasm,
+ libpanfrost_valhall_disasm,
+ ],
+ command: ['touch', '@OUTPUT@'],
+ build_by_default : false,
+)
+
 if with_panfrost_vk
 subdir('vulkan')
 endif
diff --git a/src/panfrost/midgard/disassemble.c b/src/panfrost/midgard/disassemble.c
index 4a2cab60d92..d4d2b59c2bb 100644
--- a/src/panfrost/midgard/disassemble.c
+++ b/src/panfrost/midgard/disassemble.c
@@ -1254,8 +1254,9 @@ print_alu_word(disassemble_context *ctx, FILE *fp, uint32_t *words,
 UNUSED static void
 print_varying_parameters(FILE *fp, midgard_load_store_word *word)
 {
- midgard_varying_params p = midgard_unpack_varying_params(*word);
-
+ unsigned params = word->signed_offset & 0x1FF;
+ midgard_varying_params p;
+ memcpy(&p, &params, sizeof(p));
 /* If a varying, there are qualifiers */
 if (p.flat_shading)
 fprintf(fp, ".flat");