From 8584ac79ff4701df509e54245cd05f275cd71d93 Mon Sep 17 00:00:00 2001 From: SolDev69 Date: Thu, 21 Dec 2023 20:13:50 -0500 Subject: [PATCH] apply pancsf patches --- .ci-farms-disabled/lima | 0 README-CSF.rst | 88 + README.rst | 145 +- bin/ci/custom_logger.py | 334 + bin/ci/test/requirements.txt | 5 + bin/ci/test/test_custom_logger.py | 669 + docs/features.txt.rej | 10 + include/dma-uapi/dma-buf.h | 182 + include/drm-uapi/drm_fourcc.h | 7 + meson.build.rej | 18 + patch.diff | 25515 ++++++++++++++++ src/amd/vulkan/radv_buffer_view.c | 149 + src/amd/vulkan/radv_image_view.c | 945 + src/amd/vulkan/radv_sdma.h | 93 + src/android_stub/meson.build.rej | 10 + src/compiler/glsl/glsl_to_nir.cpp.rej | 39 + src/compiler/glsl/standalone_scaffolding.cpp | 3 + src/drm-shim/device.c | 4 + src/egl/drivers/dri2/egl_dri2.c | 2 + src/egl/drivers/dri2/egl_dri2.c.rej | 60 + src/egl/drivers/dri2/egl_dri2.h | 4 + src/egl/drivers/dri2/platform_wayland.c | 92 +- src/egl/drivers/dri2/platform_wayland.c.rej | 89 + src/egl/meson.build.rej | 19 + .../mali-buffer-sharing/mali-buffer-sharing.c | 170 + .../mali-buffer-sharing/mali-buffer-sharing.h | 12 + .../mali-buffer-sharing.xml | 50 + .../wayland/mali-buffer-sharing/meson.build | 51 + src/egl/wayland/wayland-drm/wayland-drm.c.rej | 10 + src/gallium/auxiliary/cso_cache/cso_context.c | 5 + src/gallium/auxiliary/cso_cache/cso_context.h | 3 + .../auxiliary/gallivm/lp_bld_nir_soa.c.rej | 19 + .../pipe-loader/pipe_loader_drm.c.rej | 10 + .../target-helpers/inline_sw_helper.h.rej | 43 + src/gallium/drivers/panfrost/meson.build | 1 + src/gallium/drivers/panfrost/meson.build.rej | 10 + .../drivers/panfrost/pan_cmdstream.c.rej | 1186 + .../drivers/panfrost/pan_context.c.rej | 178 + .../drivers/panfrost/pan_context.h.rej | 42 + src/gallium/drivers/panfrost/pan_disk_cache.c | 2 + .../drivers/panfrost/pan_disk_cache.c.rej | 23 + src/gallium/drivers/panfrost/pan_fence.c.rej | 66 + src/gallium/drivers/panfrost/pan_fence.h.rej | 9 + src/gallium/drivers/panfrost/pan_job.c.rej | 596 + src/gallium/drivers/panfrost/pan_job.h.rej | 42 + .../drivers/panfrost/pan_resource.c.rej | 426 + src/gallium/drivers/panfrost/pan_screen.c.rej | 87 + src/gallium/drivers/panfrost/pan_screen.h | 1 + src/gallium/drivers/panfrost/pan_screen.h.rej | 28 + src/gallium/frontends/nine/nine_ff.c | 2 +- src/gallium/frontends/nine/nine_shader.c | 4 +- src/gallium/frontends/nine/nine_shader.c.rej | 10 + src/gallium/frontends/nine/nine_state.c | 4 +- src/gallium/frontends/nine/nine_state.c.rej | 13 + .../targets/d3dadapter9/meson.build.rej | 11 + src/gallium/targets/osmesa/meson.build.rej | 14 + src/gallium/targets/rusticl/meson.build.rej | 9 + .../winsys/kmsro/drm/kmsro_drm_winsys.c.rej | 19 + src/mesa/main/shaderapi.c | 7 - src/mesa/main/shaderapi.c.rej | 9 + src/meson.build | 1 + .../base/include/csf/mali_base_csf_kernel.h | 596 + .../base/include/csf/mali_gpu_csf_registers.h | 43 + .../base/include/csf/mali_kbase_csf_ioctl.h | 530 + .../base/include/jm/mali_base_jm_kernel.h | 1051 + .../base/include/jm/mali_kbase_jm_ioctl.h | 231 + .../base/include/mali_base_common_kernel.h | 231 + src/panfrost/base/include/mali_base_kernel.h | 700 + .../base/include/mali_kbase_gpuprops.h | 127 + src/panfrost/base/include/mali_kbase_ioctl.h | 759 + .../base/include/old/mali-ioctl-midgard.h | 80 + src/panfrost/base/include/old/mali-ioctl.h | 743 + src/panfrost/base/include/old/mali-props.h | 262 + src/panfrost/base/meson.build | 55 + src/panfrost/base/pan_base.c | 301 + src/panfrost/base/pan_base.h | 234 + 
src/panfrost/base/pan_base_noop.h | 152 + src/panfrost/base/pan_cache.h | 95 + src/panfrost/base/pan_vX_base.c | 1825 ++ src/panfrost/ci/deqp-panfrost-g610.toml | 11 + src/panfrost/csf_test/interpret.py | 1820 ++ src/panfrost/csf_test/mali_base_csf_kernel.h | 721 + src/panfrost/csf_test/mali_base_kernel.h | 746 + .../csf_test/mali_gpu_csf_registers.h | 43 + src/panfrost/csf_test/mali_kbase_csf_ioctl.h | 483 + src/panfrost/csf_test/mali_kbase_ioctl.h | 854 + src/panfrost/csf_test/test.c | 1903 ++ src/panfrost/lib/genxml/common.xml | 2 +- src/panfrost/lib/genxml/decode.c.rej | 940 + src/panfrost/lib/genxml/decode.h.rej | 28 + src/panfrost/lib/genxml/decode_common.c.rej | 52 + src/panfrost/lib/genxml/gen_macros.h.rej | 11 + src/panfrost/lib/genxml/gen_pack.py | 317 +- src/panfrost/lib/genxml/meson.build.rej | 19 + src/panfrost/lib/genxml/v4.xml | 2 +- src/panfrost/lib/genxml/v5.xml | 2 +- src/panfrost/lib/genxml/v6.xml | 8 +- src/panfrost/lib/genxml/v7.xml | 12 +- src/panfrost/lib/genxml/v9.xml | 75 +- src/panfrost/lib/genxml/v9.xml.rej | 28 + src/panfrost/lib/meson.build | 2 +- src/panfrost/lib/meson.build.rej | 10 + src/panfrost/lib/pan_afbc.c.rej | 25 + src/panfrost/lib/pan_blend.c.rej | 10 + src/panfrost/lib/pan_blitter.c.rej | 28 + src/panfrost/lib/pan_bo.c | 1 + src/panfrost/lib/pan_bo.c.rej | 584 + src/panfrost/lib/pan_bo.h.rej | 84 + src/panfrost/lib/pan_device.h.rej | 88 + src/panfrost/lib/pan_layout.c.rej | 66 + src/panfrost/lib/pan_pool.h.rej | 19 + src/panfrost/lib/pan_props.c.rej | 365 + src/panfrost/lib/pan_texture.h.rej | 55 + src/panfrost/lib/pan_util.h.rej | 19 + src/panfrost/lib/wrap.h.rej | 21 + src/panfrost/meson.build | 42 +- src/panfrost/meson.build.rej | 10 + src/panfrost/midgard/disassemble.c.rej | 12 + src/panfrost/tiler/tiler-hex-read | 400 + src/util/os_misc.c.rej | 103 + src/util/perf/cpu_trace.h.rej | 21 + src/util/stable_array.h | 132 + src/util/u_debug_stack_android.cpp.rej | 83 + 123 files changed, 49782 insertions(+), 145 deletions(-) create mode 100644 .ci-farms-disabled/lima create mode 100644 README-CSF.rst create mode 100644 bin/ci/custom_logger.py create mode 100644 bin/ci/test/requirements.txt create mode 100644 bin/ci/test/test_custom_logger.py create mode 100644 docs/features.txt.rej create mode 100644 include/dma-uapi/dma-buf.h create mode 100644 meson.build.rej create mode 100644 patch.diff create mode 100644 src/amd/vulkan/radv_buffer_view.c create mode 100644 src/amd/vulkan/radv_image_view.c create mode 100644 src/amd/vulkan/radv_sdma.h create mode 100644 src/android_stub/meson.build.rej create mode 100644 src/compiler/glsl/glsl_to_nir.cpp.rej create mode 100644 src/egl/drivers/dri2/egl_dri2.c.rej create mode 100644 src/egl/drivers/dri2/platform_wayland.c.rej create mode 100644 src/egl/meson.build.rej create mode 100644 src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c create mode 100644 src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h create mode 100644 src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml create mode 100644 src/egl/wayland/mali-buffer-sharing/meson.build create mode 100644 src/egl/wayland/wayland-drm/wayland-drm.c.rej create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c.rej create mode 100644 src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c.rej create mode 100644 src/gallium/auxiliary/target-helpers/inline_sw_helper.h.rej create mode 100644 src/gallium/drivers/panfrost/meson.build.rej create mode 100644 src/gallium/drivers/panfrost/pan_cmdstream.c.rej create mode 100644 
src/gallium/drivers/panfrost/pan_context.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_context.h.rej create mode 100644 src/gallium/drivers/panfrost/pan_disk_cache.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_fence.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_fence.h.rej create mode 100644 src/gallium/drivers/panfrost/pan_job.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_job.h.rej create mode 100644 src/gallium/drivers/panfrost/pan_resource.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_screen.c.rej create mode 100644 src/gallium/drivers/panfrost/pan_screen.h.rej create mode 100644 src/gallium/frontends/nine/nine_shader.c.rej create mode 100644 src/gallium/frontends/nine/nine_state.c.rej create mode 100644 src/gallium/targets/d3dadapter9/meson.build.rej create mode 100644 src/gallium/targets/osmesa/meson.build.rej create mode 100644 src/gallium/targets/rusticl/meson.build.rej create mode 100644 src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c.rej create mode 100644 src/mesa/main/shaderapi.c.rej create mode 100644 src/panfrost/base/include/csf/mali_base_csf_kernel.h create mode 100644 src/panfrost/base/include/csf/mali_gpu_csf_registers.h create mode 100644 src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h create mode 100644 src/panfrost/base/include/jm/mali_base_jm_kernel.h create mode 100644 src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h create mode 100644 src/panfrost/base/include/mali_base_common_kernel.h create mode 100644 src/panfrost/base/include/mali_base_kernel.h create mode 100644 src/panfrost/base/include/mali_kbase_gpuprops.h create mode 100644 src/panfrost/base/include/mali_kbase_ioctl.h create mode 100644 src/panfrost/base/include/old/mali-ioctl-midgard.h create mode 100644 src/panfrost/base/include/old/mali-ioctl.h create mode 100644 src/panfrost/base/include/old/mali-props.h create mode 100644 src/panfrost/base/meson.build create mode 100644 src/panfrost/base/pan_base.c create mode 100644 src/panfrost/base/pan_base.h create mode 100644 src/panfrost/base/pan_base_noop.h create mode 100644 src/panfrost/base/pan_cache.h create mode 100644 src/panfrost/base/pan_vX_base.c create mode 100644 src/panfrost/ci/deqp-panfrost-g610.toml create mode 100755 src/panfrost/csf_test/interpret.py create mode 100644 src/panfrost/csf_test/mali_base_csf_kernel.h create mode 100644 src/panfrost/csf_test/mali_base_kernel.h create mode 100644 src/panfrost/csf_test/mali_gpu_csf_registers.h create mode 100644 src/panfrost/csf_test/mali_kbase_csf_ioctl.h create mode 100644 src/panfrost/csf_test/mali_kbase_ioctl.h create mode 100644 src/panfrost/csf_test/test.c create mode 100644 src/panfrost/lib/genxml/decode.c.rej create mode 100644 src/panfrost/lib/genxml/decode.h.rej create mode 100644 src/panfrost/lib/genxml/decode_common.c.rej create mode 100644 src/panfrost/lib/genxml/gen_macros.h.rej create mode 100644 src/panfrost/lib/genxml/meson.build.rej create mode 100644 src/panfrost/lib/genxml/v9.xml.rej create mode 100644 src/panfrost/lib/meson.build.rej create mode 100644 src/panfrost/lib/pan_afbc.c.rej create mode 100644 src/panfrost/lib/pan_blend.c.rej create mode 100644 src/panfrost/lib/pan_blitter.c.rej create mode 100644 src/panfrost/lib/pan_bo.c.rej create mode 100644 src/panfrost/lib/pan_bo.h.rej create mode 100644 src/panfrost/lib/pan_device.h.rej create mode 100644 src/panfrost/lib/pan_layout.c.rej create mode 100644 src/panfrost/lib/pan_pool.h.rej create mode 100644 src/panfrost/lib/pan_props.c.rej create mode 100644 
src/panfrost/lib/pan_texture.h.rej create mode 100644 src/panfrost/lib/pan_util.h.rej create mode 100644 src/panfrost/lib/wrap.h.rej create mode 100644 src/panfrost/meson.build.rej create mode 100644 src/panfrost/midgard/disassemble.c.rej create mode 100755 src/panfrost/tiler/tiler-hex-read create mode 100644 src/util/os_misc.c.rej create mode 100644 src/util/perf/cpu_trace.h.rej create mode 100644 src/util/stable_array.h create mode 100644 src/util/u_debug_stack_android.cpp.rej diff --git a/.ci-farms-disabled/lima b/.ci-farms-disabled/lima new file mode 100644 index 00000000000..e69de29bb2d diff --git a/README-CSF.rst b/README-CSF.rst new file mode 100644 index 00000000000..9bd161005f9 --- /dev/null +++ b/README-CSF.rst @@ -0,0 +1,88 @@ +Valhall CSF Tests +================= + +The ``csf`` branch contains a test program for v10 Valhall GPUs (G710 +etc.) which uses the Arm ``kbase`` kernel driver, which is generally +present on vendor kernels but is not in the upstream Linux kernel. + +However, the kernel driver source can also be downloaded `from Arm +`_, +of which the newer releases should work well enough with a mainline +kernel (though some work may be needed to integrate the vendor +platform). + +Making sure that the ``libmali`` blob drivers work before trying this +program is recommended, otherwise you will be trying to debug +userspace and kernel bugs at the same time. + +Note that firmware is required for these GPUs, for RK3588 try +downloading the file from the Rockchip `libmali +`_ +repo, and placing it in ``/lib/firmware/``. + +Compiling +--------- + +.. code-block:: sh + + $ mkdir build + $ cd build + $ meson --buildtype=debug -Dgallium-drivers=panfrost -Dvulkan-drivers= + $ ninja src/panfrost/csf_test + +Running +------- + +.. code-block:: sh + + $ src/panfrost/csf_test + +will run the tests. + +Normally it will start running cleanup steps as soon as one test +fails, though setting the environment variable ``TEST_KEEP_GOING=1`` +will change this behaviour. + +Test failures +------------- + +Gitlab issues can be created against `my repo +`_, though +some problems should be easy to fix (wrong permissions on +``/dev/mali0``?). + +Include all output from running the test program. Including logs from +``strace`` might also help. + +Command stream test script +-------------------------- + +``src/panfrost/csf_test/interpret.py`` is a test script for assembling +and executing command streams. + +To use it, symlink the ``csf_test`` binary into ``$PATH`` and optionally +also write a ``rebuild-mesa`` script which recompiles ``csf_test``. + +Then running ``interpret.py`` will execute the ``cmds`` variable, +which is defined inside the script file. + +Example: + +.. code-block:: txt + + @ comments are started with '@' + + @ run on command stream 2 + !cs 2 + @ allocate some memory + !alloc x 4096 + @ allocate event memory, for evstr instructions + !alloc ev 4096 0x8200f + + mov x50, $x + + @ dump all registers to the memory starting at x50 + regdump x50 + + @ dump the memory region named 'x' + !dump x 0 4096 diff --git a/README.rst b/README.rst index b35246e034c..fd140a96013 100644 --- a/README.rst +++ b/README.rst @@ -1,59 +1,136 @@ `Mesa `_ - The 3D Graphics Library ====================================================== +Valhall v10 "CSF" support branch—for Mali G710/G610. + +Note that firmware is required for these GPUs, for RK3588 try +downloading the file from the Rockchip `libmali +`_ +repo, and placing it in ``/lib/firmware/``. 
+
+Windowing system support
+------------------------
+
+Panfrost Wayland compositor (wlroots):
+
+#. Panfrost Wayland clients
+#. Panfrost X11 clients via Xwayland [1]_
+#. Blob X11 clients via Xwayland + dri2to3 [2]_
+
+Panfrost Wayland compositor (non-wlroots):
+
+#. Panfrost Wayland clients
+#. Panfrost X11 clients via Xwayland
+#. Blob Wayland clients
+#. Blob X11 clients via Xwayland + dri2to3 [2]_
+
+Blob Wayland compositor:
+
+#. Panfrost Wayland clients
+#. Blob Wayland clients
+
+Panfrost Xorg server: [3]_
+
+#. Panfrost X11 clients
+#. Blob X11 clients
+
+Blob Xorg server:
+
+#. Panfrost X11 clients
+#. Blob X11 clients
+
+Applications using KMS/DRM will also work.
+
+.. [1] Requires ``CONFIG_DRM_IGNORE_IOTCL_PERMIT`` to be disabled in
+   the kernel configuration. The option is broken and should never
+   be enabled anyway.
+
+.. [2] See https://gitlab.com/panfork/dri2to3
+
+.. [3] For Radxa Debian/Ubuntu, the ``xserver-xorg-core`` version
+   installed by default is not compatible with Panfrost. But note
+   that upstream Xorg does not work with the blob, so Mesa must be
+   installed so that it is used by default (see the "Usage"
+   section below). To switch between the upstream and Rockchip
+   versions, run:
+
+.. code-block:: sh
+
+  $ sudo apt install xserver-xorg-core="$(apt-cache show xserver-xorg-core | grep Version | grep -v "$(dpkg -s xserver-xorg-core | grep Version)" | cut -d" " -f2)"
+
+Broken combinations:
+
+#. Panfrost wlroots + Blob Wayland does not work because wlroots does
+   not expose the ``mali_buffer_sharing`` protocol. This might be
+   fixable.
+#. Blob Wayland compositor + Panfrost X11 does not work because the
+   blob does not expose the required protocols for Xwayland
+   acceleration to work.
+
 Source
 ------
 
-This repository lives at https://gitlab.freedesktop.org/mesa/mesa.
-Other repositories are likely forks, and code found there is not supported.
+This repository lives at https://gitlab.com/panfork/mesa, and is a
+fork, so not supported by upstream.
+Upstream source is at https://gitlab.freedesktop.org/mesa/mesa.
 
-Build & install
----------------
+Dependencies
+-------------
 
-You can find more information in our documentation (`docs/install.rst
-`_), but the recommended way is to use
-Meson (`docs/meson.rst `_):
+For Debian-based distributions:
 
 .. code-block:: sh
 
-  $ mkdir build
-  $ cd build
-  $ meson ..
+  $ sudo apt install build-essential meson git python3-mako libexpat1-dev bison flex libwayland-egl-backend-dev libxext-dev libxfixes-dev libxcb-glx0-dev libxcb-shm0-dev libxcb-dri2-0-dev libxcb-dri3-dev libxcb-present-dev libxshmfence-dev libxxf86vm-dev libxrandr-dev
+
+Also needed is ``libdrm`` and ``wayland-protocols``, but those
+packages are too old in Debian Bullseye, and must be compiled from
+source:
+
+.. code-block:: sh
+
+  $ git clone https://gitlab.freedesktop.org/mesa/drm
+  $ mkdir drm/build
+  $ cd drm/build
+  $ meson
+  $ sudo ninja install
+
+.. code-block:: sh
+
+  $ git clone https://gitlab.freedesktop.org/wayland/wayland-protocols
+  $ mkdir wayland-protocols/build
+  $ cd wayland-protocols/build
+  $ git checkout 1.24
+  $ meson
   $ sudo ninja install
 
+Build & install
+---------------
 
-Support
--------
+To install to ``/opt/panfrost``:
 
-Many Mesa devs hang on IRC; if you're not sure which channel is
-appropriate, you should ask your question on `OFTC's #dri-devel
-`_, someone will redirect you if
-necessary.
-Remember that not everyone is in the same timezone as you, so it might
-take a while before someone qualified sees your question.
-To figure out who you're talking to, or which nick to ping for your -question, check out `Who's Who on IRC -`_. +.. code-block:: sh -The next best option is to ask your question in an email to the -mailing lists: `mesa-dev\@lists.freedesktop.org -`_ + $ mkdir build + $ cd build + $ meson -Dgallium-drivers=panfrost -Dvulkan-drivers= -Dllvm=disabled --prefix=/opt/panfrost + $ sudo ninja install +Usage +----- -Bug reports ------------ +To run an application with Panfrost (note the windowing system support +section above): -If you think something isn't working properly, please file a bug report -(`docs/bugs.rst `_). +.. code-block:: sh + $ LD_LIBRARY_PATH=/opt/panfrost/lib/aarch64-linux-gnu glmark2-es2-wayland -Contributing ------------- +To use Panfrost by default, add the directory where you installed it +to the library search path: -Contributions are welcome, and step-by-step instructions can be found in our -documentation (`docs/submittingpatches.rst -`_). +.. code-block:: sh -Note that Mesa uses gitlab for patches submission, review and discussions. + $ echo /opt/panfrost/lib/aarch64-linux-gnu | sudo tee /etc/ld.so.conf.d/0-panfrost.conf + $ sudo ldconfig diff --git a/bin/ci/custom_logger.py b/bin/ci/custom_logger.py new file mode 100644 index 00000000000..7721be2f66e --- /dev/null +++ b/bin/ci/custom_logger.py @@ -0,0 +1,334 @@ +import argparse +import logging +from datetime import datetime +from pathlib import Path + +from structured_logger import StructuredLogger + + +class CustomLogger: + def __init__(self, log_file): + self.log_file = log_file + self.logger = StructuredLogger(file_name=self.log_file) + + def get_last_dut_job(self): + """ + Gets the details of the most recent DUT job. + + Returns: + dict: Details of the most recent DUT job. + + Raises: + ValueError: If no DUT jobs are found in the logger's data. + """ + try: + job = self.logger.data["dut_jobs"][-1] + except KeyError: + raise ValueError( + "No DUT jobs found. Please create a job via create_dut_job call." + ) + + return job + + def update(self, **kwargs): + """ + Updates the log file with provided key-value pairs. + + Args: + **kwargs: Key-value pairs to be updated. + + """ + with self.logger.edit_context(): + for key, value in kwargs.items(): + self.logger.data[key] = value + + def create_dut_job(self, **kwargs): + """ + Creates a new DUT job with provided key-value pairs. + + Args: + **kwargs: Key-value pairs for the new DUT job. + + """ + with self.logger.edit_context(): + if "dut_jobs" not in self.logger.data: + self.logger.data["dut_jobs"] = [] + new_job = { + "status": "", + "submitter_start_time": datetime.now().isoformat(), + "dut_submit_time": "", + "dut_start_time": "", + "dut_end_time": "", + "dut_name": "", + "dut_state": "pending", + "dut_job_phases": [], + **kwargs, + } + self.logger.data["dut_jobs"].append(new_job) + + def update_dut_job(self, key, value): + """ + Updates the last DUT job with a key-value pair. + + Args: + key : The key to be updated. + value: The value to be assigned. + + """ + with self.logger.edit_context(): + job = self.get_last_dut_job() + job[key] = value + + def update_status_fail(self, reason=""): + """ + Sets the status of the last DUT job to 'fail' and logs the failure reason. + + Args: + reason (str, optional): The reason for the failure. Defaults to "". 
+ + """ + with self.logger.edit_context(): + job = self.get_last_dut_job() + job["status"] = "fail" + job["dut_job_fail_reason"] = reason + + def create_job_phase(self, phase_name): + """ + Creates a new job phase for the last DUT job. + + Args: + phase_name : The name of the new job phase. + + """ + with self.logger.edit_context(): + job = self.get_last_dut_job() + if job["dut_job_phases"] and job["dut_job_phases"][-1]["end_time"] == "": + # If the last phase exists and its end time is empty, set the end time + job["dut_job_phases"][-1]["end_time"] = datetime.now().isoformat() + + # Create a new phase + phase_data = { + "name": phase_name, + "start_time": datetime.now().isoformat(), + "end_time": "", + } + job["dut_job_phases"].append(phase_data) + + def check_dut_timings(self, job): + """ + Check the timing sequence of a job to ensure logical consistency. + + The function verifies that the job's submission time is not earlier than its start time and that + the job's end time is not earlier than its start time. If either of these conditions is found to be true, + an error is logged for each instance of inconsistency. + + Args: + job (dict): A dictionary containing timing information of a job. Expected keys are 'dut_start_time', + 'dut_submit_time', and 'dut_end_time'. + + Returns: + None: This function does not return a value; it logs errors if timing inconsistencies are detected. + + The function checks the following: + - If 'dut_start_time' and 'dut_submit_time' are both present and correctly sequenced. + - If 'dut_start_time' and 'dut_end_time' are both present and correctly sequenced. + """ + + # Check if the start time and submit time exist + if job.get("dut_start_time") and job.get("dut_submit_time"): + # If they exist, check if the submission time is before the start time + if job["dut_start_time"] < job["dut_submit_time"]: + logging.error("Job submission is happening before job start.") + + # Check if the start time and end time exist + if job.get("dut_start_time") and job.get("dut_end_time"): + # If they exist, check if the end time is after the start time + if job["dut_end_time"] < job["dut_start_time"]: + logging.error("Job ended before it started.") + + # Method to update DUT start, submit and end time + def update_dut_time(self, value, custom_time): + """ + Updates DUT start, submit, and end times. + + Args: + value : Specifies which DUT time to update. Options: 'start', 'submit', 'end'. + custom_time : Custom time to set. If None, use current time. + + Raises: + ValueError: If an invalid argument is provided for value. + + """ + with self.logger.edit_context(): + job = self.get_last_dut_job() + timestamp = custom_time if custom_time else datetime.now().isoformat() + if value == "start": + job["dut_start_time"] = timestamp + job["dut_state"] = "running" + elif value == "submit": + job["dut_submit_time"] = timestamp + job["dut_state"] = "submitted" + elif value == "end": + job["dut_end_time"] = timestamp + job["dut_state"] = "finished" + else: + raise ValueError( + "Error: Invalid argument provided for --update-dut-time. Use 'start', 'submit', 'end'." + ) + # check the sanity of the partial structured log + self.check_dut_timings(job) + + def close_dut_job(self): + """ + Closes the most recent DUT (Device Under Test) job in the logger's data. + + The method performs the following operations: + 1. Validates if there are any DUT jobs in the logger's data. + 2. If the last phase of the most recent DUT job has an empty end time, it sets the end time to the current time. 
+ + Raises: + ValueError: If no DUT jobs are found in the logger's data. + """ + with self.logger.edit_context(): + job = self.get_last_dut_job() + # Check if the last phase exists and its end time is empty, then set the end time + if job["dut_job_phases"] and job["dut_job_phases"][-1]["end_time"] == "": + job["dut_job_phases"][-1]["end_time"] = datetime.now().isoformat() + + def close(self): + """ + Closes the most recent DUT (Device Under Test) job in the logger's data. + + The method performs the following operations: + 1. Determines the combined status of all DUT jobs. + 2. Sets the submitter's end time to the current time. + 3. Updates the DUT attempt counter to reflect the total number of DUT jobs. + + """ + with self.logger.edit_context(): + job_status = [] + for job in self.logger.data["dut_jobs"]: + if "status" in job: + job_status.append(job["status"]) + + if not job_status: + job_combined_status = "null" + else: + # Get job_combined_status + if "pass" in job_status: + job_combined_status = "pass" + else: + job_combined_status = "fail" + + self.logger.data["job_combined_status"] = job_combined_status + self.logger.data["dut_attempt_counter"] = len(self.logger.data["dut_jobs"]) + job["submitter_end_time"] = datetime.now().isoformat() + + +def process_args(args): + # Function to process key-value pairs and call corresponding logger methods + def process_key_value_pairs(args_list, action_func): + if not args_list: + raise ValueError( + f"No key-value pairs provided for {action_func.__name__.replace('_', '-')}" + ) + if len(args_list) % 2 != 0: + raise ValueError( + f"Incomplete key-value pairs for {action_func.__name__.replace('_', '-')}" + ) + kwargs = dict(zip(args_list[::2], args_list[1::2])) + action_func(**kwargs) + + # Create a CustomLogger object with the specified log file path + custom_logger = CustomLogger(Path(args.log_file)) + + if args.update: + process_key_value_pairs(args.update, custom_logger.update) + + if args.create_dut_job: + process_key_value_pairs(args.create_dut_job, custom_logger.create_dut_job) + + if args.update_dut_job: + key, value = args.update_dut_job + custom_logger.update_dut_job(key, value) + + if args.create_job_phase: + custom_logger.create_job_phase(args.create_job_phase) + + if args.update_status_fail: + custom_logger.update_status_fail(args.update_status_fail) + + if args.update_dut_time: + if len(args.update_dut_time) == 2: + action, custom_time = args.update_dut_time + elif len(args.update_dut_time) == 1: + action, custom_time = args.update_dut_time[0], None + else: + raise ValueError("Invalid number of values for --update-dut-time") + + if action in ["start", "end", "submit"]: + custom_logger.update_dut_time(action, custom_time) + else: + raise ValueError( + "Error: Invalid argument provided for --update-dut-time. Use 'start', 'submit', 'end'." 
+ ) + + if args.close_dut_job: + custom_logger.close_dut_job() + + if args.close: + custom_logger.close() + + +def main(): + parser = argparse.ArgumentParser(description="Custom Logger Command Line Tool") + parser.add_argument("log_file", help="Path to the log file") + parser.add_argument( + "--update", + nargs=argparse.ZERO_OR_MORE, + metavar=("key", "value"), + help="Update a key-value pair e.g., --update key1 value1 key2 value2)", + ) + parser.add_argument( + "--create-dut-job", + nargs=argparse.ZERO_OR_MORE, + metavar=("key", "value"), + help="Create a new DUT job with key-value pairs (e.g., --create-dut-job key1 value1 key2 value2)", + ) + parser.add_argument( + "--update-dut-job", + nargs=argparse.ZERO_OR_MORE, + metavar=("key", "value"), + help="Update a key-value pair in DUT job", + ) + parser.add_argument( + "--create-job-phase", + help="Create a new job phase (e.g., --create-job-phase name)", + ) + parser.add_argument( + "--update-status-fail", + help="Update fail as the status and log the failure reason (e.g., --update-status-fail reason)", + ) + parser.add_argument( + "--update-dut-time", + nargs=argparse.ZERO_OR_MORE, + metavar=("action", "custom_time"), + help="Update DUT start and end time. Provide action ('start', 'submit', 'end') and custom_time (e.g., '2023-01-01T12:00:00')", + ) + parser.add_argument( + "--close-dut-job", + action="store_true", + help="Close the dut job by updating end time of last dut job)", + ) + parser.add_argument( + "--close", + action="store_true", + help="Updates combined status, submitter's end time and DUT attempt counter", + ) + args = parser.parse_args() + + process_args(args) + + +if __name__ == "__main__": + main() diff --git a/bin/ci/test/requirements.txt b/bin/ci/test/requirements.txt new file mode 100644 index 00000000000..f80621af285 --- /dev/null +++ b/bin/ci/test/requirements.txt @@ -0,0 +1,5 @@ +filelock==3.12.4 +fire==0.5.0 +mock==5.1.0 +polars==0.19.3 +pytest==7.4.2 diff --git a/bin/ci/test/test_custom_logger.py b/bin/ci/test/test_custom_logger.py new file mode 100644 index 00000000000..98ad9c00494 --- /dev/null +++ b/bin/ci/test/test_custom_logger.py @@ -0,0 +1,669 @@ +import logging +import subprocess +from datetime import datetime + +import pytest +from custom_logger import CustomLogger + + +@pytest.fixture +def tmp_log_file(tmp_path): + return tmp_path / "test_log.json" + + +@pytest.fixture +def custom_logger(tmp_log_file): + return CustomLogger(tmp_log_file) + + +def run_script_with_args(args): + import custom_logger + + script_path = custom_logger.__file__ + return subprocess.run( + ["python3", str(script_path), *args], capture_output=True, text=True + ) + + +# Test case for missing log file +@pytest.mark.parametrize( + "key, value", [("dut_attempt_counter", "1"), ("job_combined_status", "pass")] +) +def test_missing_log_file_argument(key, value): + result = run_script_with_args(["--update", "key", "value"]) + assert result.returncode != 0 + + +# Parametrize test case for valid update arguments +@pytest.mark.parametrize( + "key, value", [("dut_attempt_counter", "1"), ("job_combined_status", "pass")] +) +def test_update_argument_valid(custom_logger, tmp_log_file, key, value): + result = run_script_with_args([str(tmp_log_file), "--update", key, value]) + assert result.returncode == 0 + + +# Test case for passing only the key without a value +def test_update_argument_key_only(custom_logger, tmp_log_file): + key = "dut_attempt_counter" + result = run_script_with_args([str(tmp_log_file), "--update", key]) + assert result.returncode 
!= 0 + + +# Test case for not passing any key-value pair +def test_update_argument_no_values(custom_logger, tmp_log_file): + result = run_script_with_args([str(tmp_log_file), "--update"]) + assert result.returncode == 0 + + +# Parametrize test case for valid arguments +@pytest.mark.parametrize( + "key, value", [("dut_attempt_counter", "1"), ("job_combined_status", "pass")] +) +def test_create_argument_valid(custom_logger, tmp_log_file, key, value): + result = run_script_with_args([str(tmp_log_file), "--create-dut-job", key, value]) + assert result.returncode == 0 + + +# Test case for passing only the key without a value +def test_create_argument_key_only(custom_logger, tmp_log_file): + key = "dut_attempt_counter" + result = run_script_with_args([str(tmp_log_file), "--create-dut-job", key]) + assert result.returncode != 0 + + +# Test case for not passing any key-value pair +def test_create_argument_no_values(custom_logger, tmp_log_file): + result = run_script_with_args([str(tmp_log_file), "--create-dut-job"]) + assert result.returncode == 0 + + +# Test case for updating a DUT job +@pytest.mark.parametrize( + "key, value", [("status", "hung"), ("dut_state", "Canceling"), ("dut_name", "asus")] +) +def test_update_dut_job(custom_logger, tmp_log_file, key, value): + result = run_script_with_args([str(tmp_log_file), "--update-dut-job", key, value]) + assert result.returncode != 0 + + result = run_script_with_args([str(tmp_log_file), "--create-dut-job", key, value]) + assert result.returncode == 0 + + result = run_script_with_args([str(tmp_log_file), "--update-dut-job", key, value]) + assert result.returncode == 0 + + +# Test case for updating last DUT job +def test_update_dut_multiple_job(custom_logger, tmp_log_file): + # Create the first DUT job with the first key + result = run_script_with_args( + [str(tmp_log_file), "--create-dut-job", "status", "hung"] + ) + assert result.returncode == 0 + + # Create the second DUT job with the second key + result = run_script_with_args( + [str(tmp_log_file), "--create-dut-job", "dut_state", "Canceling"] + ) + assert result.returncode == 0 + + result = run_script_with_args( + [str(tmp_log_file), "--update-dut-job", "dut_name", "asus"] + ) + assert result.returncode == 0 + + +# Parametrize test case for valid phase arguments +@pytest.mark.parametrize( + "phase_name", + [("Phase1"), ("Phase2"), ("Phase3")], +) +def test_create_job_phase_valid(custom_logger, tmp_log_file, phase_name): + custom_logger.create_dut_job(status="pass") + + result = run_script_with_args([str(tmp_log_file), "--create-job-phase", phase_name]) + assert result.returncode == 0 + + +# Test case for not passing any arguments for create-job-phase +def test_create_job_phase_no_arguments(custom_logger, tmp_log_file): + custom_logger.create_dut_job(status="pass") + + result = run_script_with_args([str(tmp_log_file), "--create-job-phase"]) + assert result.returncode != 0 + + +# Test case for trying to create a phase job without an existing DUT job +def test_create_job_phase_no_dut_job(custom_logger, tmp_log_file): + phase_name = "Phase1" + + result = run_script_with_args([str(tmp_log_file), "--create-job-phase", phase_name]) + assert result.returncode != 0 + + +# Combined test cases for valid scenarios +def test_valid_scenarios(custom_logger, tmp_log_file): + valid_update_args = [("dut_attempt_counter", "1"), ("job_combined_status", "pass")] + for key, value in valid_update_args: + result = run_script_with_args([str(tmp_log_file), "--update", key, value]) + assert result.returncode == 0 + + 
valid_create_args = [ + ("status", "hung"), + ("dut_state", "Canceling"), + ("dut_name", "asus"), + ("phase_name", "Bootloader"), + ] + for key, value in valid_create_args: + result = run_script_with_args( + [str(tmp_log_file), "--create-dut-job", key, value] + ) + assert result.returncode == 0 + + result = run_script_with_args( + [str(tmp_log_file), "--create-dut-job", "status", "hung"] + ) + assert result.returncode == 0 + + result = run_script_with_args( + [str(tmp_log_file), "--update-dut-job", "dut_name", "asus"] + ) + assert result.returncode == 0 + + result = run_script_with_args( + [ + str(tmp_log_file), + "--create-job-phase", + "phase_name", + ] + ) + assert result.returncode == 0 + + +# Parametrize test case for valid update arguments +@pytest.mark.parametrize( + "key, value", [("dut_attempt_counter", "1"), ("job_combined_status", "pass")] +) +def test_update(custom_logger, key, value): + custom_logger.update(**{key: value}) + logger_data = custom_logger.logger.data + + assert key in logger_data + assert logger_data[key] == value + + +# Test case for updating with a key that already exists +def test_update_existing_key(custom_logger): + key = "status" + value = "new_value" + custom_logger.logger.data[key] = "old_value" + custom_logger.update(**{key: value}) + logger_data = custom_logger.logger.data + + assert key in logger_data + assert logger_data[key] == value + + +# Test case for updating "dut_jobs" +def test_update_dut_jobs(custom_logger): + key1 = "status" + value1 = "fail" + key2 = "state" + value2 = "hung" + + custom_logger.create_dut_job(**{key1: value1}) + logger_data = custom_logger.logger.data + + job1 = logger_data["dut_jobs"][0] + assert key1 in job1 + assert job1[key1] == value1 + + custom_logger.update_dut_job(key2, value2) + logger_data = custom_logger.logger.data + + job2 = logger_data["dut_jobs"][0] + assert key2 in job2 + assert job2[key2] == value2 + + +# Test case for creating and updating DUT job +def test_create_dut_job(custom_logger): + key = "status" + value1 = "pass" + value2 = "fail" + value3 = "hung" + + reason = "job_combined_status" + result = "Finished" + + custom_logger.update(**{reason: result}) + logger_data = custom_logger.logger.data + + assert reason in logger_data + assert logger_data[reason] == result + + # Create the first DUT job + custom_logger.create_dut_job(**{key: value1}) + logger_data = custom_logger.logger.data + + assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 1 + assert isinstance(logger_data["dut_jobs"][0], dict) + + # Check the values of the keys in the created first DUT job + job1 = logger_data["dut_jobs"][0] + assert key in job1 + assert job1[key] == value1 + + # Create the second DUT job + custom_logger.create_dut_job(**{key: value2}) + logger_data = custom_logger.logger.data + + assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 2 + assert isinstance(logger_data["dut_jobs"][1], dict) + + # Check the values of the keys in the created second DUT job + job2 = logger_data["dut_jobs"][1] + assert key in job2 + assert job2[key] == value2 + + # Update the second DUT job with value3 + custom_logger.update_dut_job(key, value3) + logger_data = custom_logger.logger.data + + # Check the updated value in the second DUT job + job2 = logger_data["dut_jobs"][1] + assert key in job2 + assert job2[key] == value3 + + # Find the index of the last DUT job + last_job_index = 
len(logger_data["dut_jobs"]) - 1 + + # Update the last DUT job + custom_logger.update_dut_job("dut_name", "asus") + logger_data = custom_logger.logger.data + + # Check the updated value in the last DUT job + job2 = logger_data["dut_jobs"][last_job_index] + assert "dut_name" in job2 + assert job2["dut_name"] == "asus" + + # Check that "dut_name" is not present in other DUT jobs + for idx, job in enumerate(logger_data["dut_jobs"]): + if idx != last_job_index: + assert job.get("dut_name") == "" + + +# Test case for updating with missing "dut_jobs" key +def test_update_dut_job_missing_dut_jobs(custom_logger): + key = "status" + value = "fail" + + # Attempt to update a DUT job when "dut_jobs" is missing + with pytest.raises(ValueError, match="No DUT jobs found."): + custom_logger.update_dut_job(key, value) + + +# Test case for creating a job phase +def test_create_job_phase(custom_logger): + custom_logger.create_dut_job(status="pass") + phase_name = "Phase1" + + custom_logger.create_job_phase(phase_name) + logger_data = custom_logger.logger.data + + assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 1 + + job = logger_data["dut_jobs"][0] + assert "dut_job_phases" in job + assert isinstance(job["dut_job_phases"], list) + assert len(job["dut_job_phases"]) == 1 + + phase = job["dut_job_phases"][0] + assert phase["name"] == phase_name + try: + datetime.fromisoformat(phase["start_time"]) + assert True + except ValueError: + assert False + assert phase["end_time"] == "" + + +# Test case for creating multiple phase jobs +def test_create_multiple_phase_jobs(custom_logger): + custom_logger.create_dut_job(status="pass") + + phase_data = [ + { + "phase_name": "Phase1", + }, + { + "phase_name": "Phase2", + }, + { + "phase_name": "Phase3", + }, + ] + + for data in phase_data: + phase_name = data["phase_name"] + + custom_logger.create_job_phase(phase_name) + + logger_data = custom_logger.logger.data + + assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 1 + + job = logger_data["dut_jobs"][0] + assert "dut_job_phases" in job + assert isinstance(job["dut_job_phases"], list) + assert len(job["dut_job_phases"]) == len(phase_data) + + for data in phase_data: + phase_name = data["phase_name"] + + phase = job["dut_job_phases"][phase_data.index(data)] + + assert phase["name"] == phase_name + try: + datetime.fromisoformat(phase["start_time"]) + assert True + except ValueError: + assert False + + if phase_data.index(data) != len(phase_data) - 1: + try: + datetime.fromisoformat(phase["end_time"]) + assert True + except ValueError: + assert False + + # Check if the end_time of the last phase is an empty string + last_phase = job["dut_job_phases"][-1] + assert last_phase["end_time"] == "" + + +# Test case for creating multiple dut jobs and updating phase job for last dut job +def test_create_two_dut_jobs_and_add_phase(custom_logger): + # Create the first DUT job + custom_logger.create_dut_job(status="pass") + + # Create the second DUT job + custom_logger.create_dut_job(status="fail") + + logger_data = custom_logger.logger.data + + assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 2 + + first_dut_job = logger_data["dut_jobs"][0] + second_dut_job = logger_data["dut_jobs"][1] + + # Add a phase to the second DUT job + custom_logger.create_job_phase("Phase1") + + logger_data = custom_logger.logger.data + + 
assert "dut_jobs" in logger_data + assert isinstance(logger_data["dut_jobs"], list) + assert len(logger_data["dut_jobs"]) == 2 + + first_dut_job = logger_data["dut_jobs"][0] + second_dut_job = logger_data["dut_jobs"][1] + + # Check first DUT job does not have a phase + assert not first_dut_job.get("dut_job_phases") + + # Check second DUT job has a phase + assert second_dut_job.get("dut_job_phases") + assert isinstance(second_dut_job["dut_job_phases"], list) + assert len(second_dut_job["dut_job_phases"]) == 1 + + +# Test case for updating DUT start time +def test_update_dut_start_time(custom_logger): + custom_logger.create_dut_job(status="pass") + custom_logger.update_dut_time("start", None) + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "dut_start_time" in dut_job + assert dut_job["dut_start_time"] != "" + + try: + datetime.fromisoformat(dut_job["dut_start_time"]) + assert True + except ValueError: + assert False + + +# Test case for updating DUT submit time +def test_update_dut_submit_time(custom_logger): + custom_time = "2023-11-09T02:37:06Z" + custom_logger.create_dut_job(status="pass") + custom_logger.update_dut_time("submit", custom_time) + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "dut_submit_time" in dut_job + + try: + datetime.fromisoformat(dut_job["dut_submit_time"]) + assert True + except ValueError: + assert False + + +# Test case for updating DUT end time +def test_update_dut_end_time(custom_logger): + custom_logger.create_dut_job(status="pass") + custom_logger.update_dut_time("end", None) + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "dut_end_time" in dut_job + + try: + datetime.fromisoformat(dut_job["dut_end_time"]) + assert True + except ValueError: + assert False + + +# Test case for updating DUT time with invalid value +def test_update_dut_time_invalid_value(custom_logger): + custom_logger.create_dut_job(status="pass") + with pytest.raises( + ValueError, + match="Error: Invalid argument provided for --update-dut-time. 
Use 'start', 'submit', 'end'.", + ): + custom_logger.update_dut_time("invalid_value", None) + + +# Test case for close_dut_job +def test_close_dut_job(custom_logger): + custom_logger.create_dut_job(status="pass") + + custom_logger.create_job_phase("Phase1") + custom_logger.create_job_phase("Phase2") + + custom_logger.close_dut_job() + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "dut_job_phases" in dut_job + dut_job_phases = dut_job["dut_job_phases"] + + phase1 = dut_job_phases[0] + assert phase1["name"] == "Phase1" + + try: + datetime.fromisoformat(phase1["start_time"]) + assert True + except ValueError: + assert False + + try: + datetime.fromisoformat(phase1["end_time"]) + assert True + except ValueError: + assert False + + phase2 = dut_job_phases[1] + assert phase2["name"] == "Phase2" + + try: + datetime.fromisoformat(phase2["start_time"]) + assert True + except ValueError: + assert False + + try: + datetime.fromisoformat(phase2["end_time"]) + assert True + except ValueError: + assert False + + +# Test case for close +def test_close(custom_logger): + custom_logger.create_dut_job(status="pass") + + custom_logger.close() + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + assert "dut_attempt_counter" in logger_data + assert logger_data["dut_attempt_counter"] == len(logger_data["dut_jobs"]) + assert "job_combined_status" in logger_data + assert logger_data["job_combined_status"] != "" + + dut_job = logger_data["dut_jobs"][0] + assert "submitter_end_time" in dut_job + try: + datetime.fromisoformat(dut_job["submitter_end_time"]) + assert True + except ValueError: + assert False + + +# Test case for updating status to fail with a reason +def test_update_status_fail_with_reason(custom_logger): + custom_logger.create_dut_job() + + reason = "kernel panic" + custom_logger.update_status_fail(reason) + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "status" in dut_job + assert dut_job["status"] == "fail" + assert "dut_job_fail_reason" in dut_job + assert dut_job["dut_job_fail_reason"] == reason + + +# Test case for updating status to fail without providing a reason +def test_update_status_fail_without_reason(custom_logger): + custom_logger.create_dut_job() + + custom_logger.update_status_fail() + + # Check if the status is updated and fail reason is empty + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + dut_job = logger_data["dut_jobs"][0] + assert "status" in dut_job + assert dut_job["status"] == "fail" + assert "dut_job_fail_reason" in dut_job + assert dut_job["dut_job_fail_reason"] == "" + + +# Test case for check_dut_timings with submission time earlier than start time +def test_check_dut_timings_submission_earlier_than_start(custom_logger, caplog): + custom_logger.create_dut_job() + + # Set submission time to be earlier than start time + custom_logger.update_dut_time("start", "2023-01-01T11:00:00") + custom_logger.update_dut_time("submit", "2023-01-01T12:00:00") + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + job = logger_data["dut_jobs"][0] + + # Call check_dut_timings + custom_logger.check_dut_timings(job) + + # Check if an 
error message is logged + assert "Job submission is happening before job start." in caplog.text + + +# Test case for check_dut_timings with end time earlier than start time +def test_check_dut_timings_end_earlier_than_start(custom_logger, caplog): + custom_logger.create_dut_job() + + # Set end time to be earlier than start time + custom_logger.update_dut_time("end", "2023-01-01T11:00:00") + custom_logger.update_dut_time("start", "2023-01-01T12:00:00") + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + job = logger_data["dut_jobs"][0] + + # Call check_dut_timings + custom_logger.check_dut_timings(job) + + # Check if an error message is logged + assert "Job ended before it started." in caplog.text + + +# Test case for check_dut_timings with valid timing sequence +def test_check_dut_timings_valid_timing_sequence(custom_logger, caplog): + custom_logger.create_dut_job() + + # Set valid timing sequence + custom_logger.update_dut_time("submit", "2023-01-01T12:00:00") + custom_logger.update_dut_time("start", "2023-01-01T12:30:00") + custom_logger.update_dut_time("end", "2023-01-01T13:00:00") + + logger_data = custom_logger.logger.data + assert "dut_jobs" in logger_data + assert len(logger_data["dut_jobs"]) == 1 + + job = logger_data["dut_jobs"][0] + + # Call check_dut_timings + custom_logger.check_dut_timings(job) + + # Check that no error messages are logged + assert "Job submission is happening before job start." not in caplog.text + assert "Job ended before it started." not in caplog.text diff --git a/docs/features.txt.rej b/docs/features.txt.rej new file mode 100644 index 00000000000..cb296b346d3 --- /dev/null +++ b/docs/features.txt.rej @@ -0,0 +1,10 @@ +diff a/docs/features.txt b/docs/features.txt (rejected hunks) +@@ -213,7 +213,7 @@ GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, i965/gen8+, nvc0, r600, radeonsi, + GL 4.5, GLSL 4.50 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, zink + + GL_ARB_ES3_1_compatibility DONE (freedreno/a6xx, i965/hsw+, softpipe, virgl) +- GL_ARB_clip_control DONE (freedreno, i965, nv50, softpipe, virgl, lima, d3d12) ++ GL_ARB_clip_control DONE (freedreno, i965, nv50, softpipe, virgl, lima, d3d12, panfrost) + GL_ARB_conditional_render_inverted DONE (freedreno, i965, nv50, softpipe, virgl, panfrost, d3d12) + GL_ARB_cull_distance DONE (freedreno/a6xx, i965, nv50, softpipe, virgl) + GL_ARB_derivative_control DONE (freedreno/a3xx+, i965, nv50, softpipe, virgl) diff --git a/include/dma-uapi/dma-buf.h b/include/dma-uapi/dma-buf.h new file mode 100644 index 00000000000..5a6fda66d9a --- /dev/null +++ b/include/dma-uapi/dma-buf.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Framework for buffer objects that can be shared across devices/subsystems. + * + * Copyright(C) 2015 Intel Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef _DMA_BUF_UAPI_H_ +#define _DMA_BUF_UAPI_H_ + +#include + +/** + * struct dma_buf_sync - Synchronize with CPU access. + * + * When a DMA buffer is accessed from the CPU via mmap, it is not always + * possible to guarantee coherency between the CPU-visible map and underlying + * memory. To manage coherency, DMA_BUF_IOCTL_SYNC must be used to bracket + * any CPU access to give the kernel the chance to shuffle memory around if + * needed. + * + * Prior to accessing the map, the client must call DMA_BUF_IOCTL_SYNC + * with DMA_BUF_SYNC_START and the appropriate read/write flags. Once the + * access is complete, the client should call DMA_BUF_IOCTL_SYNC with + * DMA_BUF_SYNC_END and the same read/write flags. + * + * The synchronization provided via DMA_BUF_IOCTL_SYNC only provides cache + * coherency. It does not prevent other processes or devices from + * accessing the memory at the same time. If synchronization with a GPU or + * other device driver is required, it is the client's responsibility to + * wait for buffer to be ready for reading or writing before calling this + * ioctl with DMA_BUF_SYNC_START. Likewise, the client must ensure that + * follow-up work is not submitted to GPU or other device driver until + * after this ioctl has been called with DMA_BUF_SYNC_END? + * + * If the driver or API with which the client is interacting uses implicit + * synchronization, waiting for prior work to complete can be done via + * poll() on the DMA buffer file descriptor. If the driver or API requires + * explicit synchronization, the client may have to wait on a sync_file or + * other synchronization primitive outside the scope of the DMA buffer API. + */ +struct dma_buf_sync { + /** + * @flags: Set of access flags + * + * DMA_BUF_SYNC_START: + * Indicates the start of a map access session. + * + * DMA_BUF_SYNC_END: + * Indicates the end of a map access session. + * + * DMA_BUF_SYNC_READ: + * Indicates that the mapped DMA buffer will be read by the + * client via the CPU map. + * + * DMA_BUF_SYNC_WRITE: + * Indicates that the mapped DMA buffer will be written by the + * client via the CPU map. + * + * DMA_BUF_SYNC_RW: + * An alias for DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE. + */ + __u64 flags; +}; + +#define DMA_BUF_SYNC_READ (1 << 0) +#define DMA_BUF_SYNC_WRITE (2 << 0) +#define DMA_BUF_SYNC_RW (DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE) +#define DMA_BUF_SYNC_START (0 << 2) +#define DMA_BUF_SYNC_END (1 << 2) +#define DMA_BUF_SYNC_VALID_FLAGS_MASK \ + (DMA_BUF_SYNC_RW | DMA_BUF_SYNC_END) + +#define DMA_BUF_NAME_LEN 32 + +/** + * struct dma_buf_export_sync_file - Get a sync_file from a dma-buf + * + * Userspace can perform a DMA_BUF_IOCTL_EXPORT_SYNC_FILE to retrieve the + * current set of fences on a dma-buf file descriptor as a sync_file. CPU + * waits via poll() or other driver-specific mechanisms typically wait on + * whatever fences are on the dma-buf at the time the wait begins. This + * is similar except that it takes a snapshot of the current fences on the + * dma-buf for waiting later instead of waiting immediately. This is + * useful for modern graphics APIs such as Vulkan which assume an explicit + * synchronization model but still need to inter-operate with dma-buf. + * + * The intended usage pattern is the following: + * + * 1. Export a sync_file with flags corresponding to the expected GPU usage + * via DMA_BUF_IOCTL_EXPORT_SYNC_FILE. + * + * 2. Submit rendering work which uses the dma-buf. 
The work should wait on + * the exported sync file before rendering and produce another sync_file + * when complete. + * + * 3. Import the rendering-complete sync_file into the dma-buf with flags + * corresponding to the GPU usage via DMA_BUF_IOCTL_IMPORT_SYNC_FILE. + * + * Unlike doing implicit synchronization via a GPU kernel driver's exec ioctl, + * the above is not a single atomic operation. If userspace wants to ensure + * ordering via these fences, it is the respnosibility of userspace to use + * locks or other mechanisms to ensure that no other context adds fences or + * submits work between steps 1 and 3 above. + */ +struct dma_buf_export_sync_file { + /** + * @flags: Read/write flags + * + * Must be DMA_BUF_SYNC_READ, DMA_BUF_SYNC_WRITE, or both. + * + * If DMA_BUF_SYNC_READ is set and DMA_BUF_SYNC_WRITE is not set, + * the returned sync file waits on any writers of the dma-buf to + * complete. Waiting on the returned sync file is equivalent to + * poll() with POLLIN. + * + * If DMA_BUF_SYNC_WRITE is set, the returned sync file waits on + * any users of the dma-buf (read or write) to complete. Waiting + * on the returned sync file is equivalent to poll() with POLLOUT. + * If both DMA_BUF_SYNC_WRITE and DMA_BUF_SYNC_READ are set, this + * is equivalent to just DMA_BUF_SYNC_WRITE. + */ + __u32 flags; + /** @fd: Returned sync file descriptor */ + __s32 fd; +}; + +/** + * struct dma_buf_import_sync_file - Insert a sync_file into a dma-buf + * + * Userspace can perform a DMA_BUF_IOCTL_IMPORT_SYNC_FILE to insert a + * sync_file into a dma-buf for the purposes of implicit synchronization + * with other dma-buf consumers. This allows clients using explicitly + * synchronized APIs such as Vulkan to inter-op with dma-buf consumers + * which expect implicit synchronization such as OpenGL or most media + * drivers/video. + */ +struct dma_buf_import_sync_file { + /** + * @flags: Read/write flags + * + * Must be DMA_BUF_SYNC_READ, DMA_BUF_SYNC_WRITE, or both. + * + * If DMA_BUF_SYNC_READ is set and DMA_BUF_SYNC_WRITE is not set, + * this inserts the sync_file as a read-only fence. Any subsequent + * implicitly synchronized writes to this dma-buf will wait on this + * fence but reads will not. + * + * If DMA_BUF_SYNC_WRITE is set, this inserts the sync_file as a + * write fence. All subsequent implicitly synchronized access to + * this dma-buf will wait on this fence. + */ + __u32 flags; + /** @fd: Sync file descriptor */ + __s32 fd; +}; + +#define DMA_BUF_BASE 'b' +#define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync) + +/* 32/64bitness of this uapi was botched in android, there's no difference + * between them in actual uapi, they're just different numbers. + */ +#define DMA_BUF_SET_NAME _IOW(DMA_BUF_BASE, 1, const char *) +#define DMA_BUF_SET_NAME_A _IOW(DMA_BUF_BASE, 1, __u32) +#define DMA_BUF_SET_NAME_B _IOW(DMA_BUF_BASE, 1, __u64) +#define DMA_BUF_IOCTL_EXPORT_SYNC_FILE _IOWR(DMA_BUF_BASE, 2, struct dma_buf_export_sync_file) +#define DMA_BUF_IOCTL_IMPORT_SYNC_FILE _IOW(DMA_BUF_BASE, 3, struct dma_buf_import_sync_file) + +#endif diff --git a/include/drm-uapi/drm_fourcc.h b/include/drm-uapi/drm_fourcc.h index 6b6235f7a7c..30343c7c9c3 100644 --- a/include/drm-uapi/drm_fourcc.h +++ b/include/drm-uapi/drm_fourcc.h @@ -1219,6 +1219,13 @@ drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) */ #define AFBC_FORMAT_MOD_USM (1ULL << 12) +/* AFBC native swizzle + * + * Indicates that the buffer is using RGBA component order regardless of the + * actual format. 
+ */ +#define AFBC_FORMAT_MOD_NATIVE_SWIZZLE (1ULL << 32) + /* * Arm Fixed-Rate Compression (AFRC) modifiers * diff --git a/meson.build.rej b/meson.build.rej new file mode 100644 index 00000000000..322ff95a368 --- /dev/null +++ b/meson.build.rej @@ -0,0 +1,18 @@ +diff a/meson.build b/meson.build (rejected hunks) +@@ -865,14 +865,13 @@ endif + + with_gallium_st_nine = get_option('gallium-nine') + if with_gallium_st_nine +- if not with_gallium_softpipe +- error('The nine state tracker requires gallium softpipe/llvmpipe.') +- elif not [ ++ if not [ + with_gallium_crocus, + with_gallium_freedreno, + with_gallium_i915, + with_gallium_iris, + with_gallium_nouveau, ++ with_gallium_panfrost, + with_gallium_r300, + with_gallium_r600, + with_gallium_radeonsi, diff --git a/patch.diff b/patch.diff new file mode 100644 index 00000000000..ad36794a25b --- /dev/null +++ b/patch.diff @@ -0,0 +1,25515 @@ +diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml +new file mode 100644 +index 00000000000..37fcc4b92b1 +--- /dev/null ++++ b/.github/workflows/android.yml +@@ -0,0 +1,67 @@ ++name: Build Android ++ ++on: ++ [push, pull_request] ++ ++# A workflow run is made up of one or more jobs that can run sequentially or in parallel ++jobs: ++ build: ++ strategy: ++ matrix: ++ arch: [ "arm32", "aarch64" ] ++ fail-fast: false ++ ++ name: "Build for ${{matrix.arch}}" ++ ++ runs-on: ubuntu-latest ++ ++ steps: ++ ++ - uses: actions/checkout@v2 ++ ++ - name: Build ++ run: | ++ sudo apt update ++ sudo apt install -y meson libxrandr-dev libxxf86vm-dev libxcb-*-dev libx11-xcb-dev libxfixes-dev libdrm-dev libx11-dev ++ pip3 install mako ++ export ANDROID_NDK_HOME="$ANDROID_SDK_ROOT/ndk-bundle" ++ envsubst build-crossfile-drm ++ git clone --depth 1 https://gitlab.freedesktop.org/mesa/drm.git ++ cd drm ++ meson setup "build-android" \ ++ --prefix=/tmp/drm-static \ ++ --cross-file "../build-crossfile-drm" \ ++ -Ddefault_library=static \ ++ -Dintel=disabled \ ++ -Dradeon=disabled \ ++ -Damdgpu=disabled \ ++ -Dnouveau=disabled \ ++ -Dvmwgfx=disabled \ ++ -Dfreedreno=disabled \ ++ -Dvc4=disabled \ ++ -Detnaviv=disabled ++ ninja -C "build-android" install ++ cd .. ++ envsubst build-crossfile ++ meson setup "build-android" \ ++ --prefix=/tmp/pan \ ++ --cross-file "build-crossfile" \ ++ -Dplatforms=android \ ++ -Dplatform-sdk-version=26 \ ++ -Dandroid-stub=true \ ++ -Dllvm=disabled \ ++ -Dxlib-lease=disabled \ ++ -Degl=disabled \ ++ -Dgbm=disabled \ ++ -Dglx=disabled \ ++ -Dopengl=true \ ++ -Dosmesa=true \ ++ -Dvulkan-drivers= \ ++ -Dgallium-drivers=swrast,panfrost \ ++ -Dshared-glapi=false ++ ninja -C "build-android" install ++ - name: Upload libraries ++ uses: actions/upload-artifact@v2 ++ with: ++ name: panfrost_${{matrix.arch}} ++ path: /tmp/pan +diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml +deleted file mode 100644 +index d1b66ef4cad..00000000000 +--- a/.github/workflows/macos.yml ++++ /dev/null +@@ -1,60 +0,0 @@ +-name: macOS-CI +-on: push +- +-permissions: +- contents: read +- +-jobs: +- macOS-CI: +- strategy: +- matrix: +- glx_option: ['dri', 'xlib'] +- runs-on: macos-11 +- env: +- GALLIUM_DUMP_CPU: true +- MESON_EXEC: /Users/runner/Library/Python/3.11/bin/meson +- steps: +- - name: Checkout +- uses: actions/checkout@v3 +- - name: Install Dependencies +- run: | +- cat > Brewfile < native_config <`_, ++of which the newer releases should work well enough with a mainline ++kernel (though some work may be needed to integrate the vendor ++platform). 
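++
++A quick sanity check that kbase is usable is to open the device node
++directly, with the same open flags that this branch's kbase code uses.
++The program below is only an illustrative sketch (the node path is the
++default ``/dev/mali0``):
++
++.. code-block:: c
++
++   /* mali0-check.c: confirm that the kbase device node can be opened. */
++   #include <errno.h>
++   #include <fcntl.h>
++   #include <stdio.h>
++   #include <string.h>
++   #include <unistd.h>
++
++   int main(void)
++   {
++      /* Same flags used when the driver opens the kbase device. */
++      int fd = open("/dev/mali0", O_RDWR | O_CLOEXEC | O_NONBLOCK);
++
++      if (fd == -1) {
++         fprintf(stderr, "open /dev/mali0: %s\n", strerror(errno));
++         return 1;
++      }
++
++      printf("kbase device node is accessible\n");
++      close(fd);
++      return 0;
++   }
++
++If the open fails with a permission error, fix the permissions on
++``/dev/mali0`` before filing an issue.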
++ ++Making sure that the ``libmali`` blob drivers work before trying this ++program is recommended, otherwise you will be trying to debug ++userspace and kernel bugs at the same time. ++ ++Note that firmware is required for these GPUs, for RK3588 try ++downloading the file from the Rockchip `libmali ++`_ ++repo, and placing it in ``/lib/firmware/``. ++ ++Compiling ++--------- ++ ++.. code-block:: sh ++ ++ $ mkdir build ++ $ cd build ++ $ meson --buildtype=debug -Dgallium-drivers=panfrost -Dvulkan-drivers= ++ $ ninja src/panfrost/csf_test ++ ++Running ++------- ++ ++.. code-block:: sh ++ ++ $ src/panfrost/csf_test ++ ++will run the tests. ++ ++Normally it will start running cleanup steps as soon as one test ++fails, though setting the environment variable ``TEST_KEEP_GOING=1`` ++will change this behaviour. ++ ++Test failures ++------------- ++ ++Gitlab issues can be created against `my repo ++`_, though ++some problems should be easy to fix (wrong permissions on ++``/dev/mali0``?). ++ ++Include all output from running the test program. Including logs from ++``strace`` might also help. ++ ++Command stream test script ++-------------------------- ++ ++``src/panfrost/csf_test/interpret.py`` is a test script for assembling ++and executing command streams. ++ ++To use it, symlink the ``csf_test`` binary into ``$PATH`` and optionally ++also write a ``rebuild-mesa`` script which recompiles ``csf_test``. ++ ++Then running ``interpret.py`` will execute the ``cmds`` variable, ++which is defined inside the script file. ++ ++Example: ++ ++.. code-block:: txt ++ ++ @ comments are started with '@' ++ ++ @ run on command stream 2 ++ !cs 2 ++ @ allocate some memory ++ !alloc x 4096 ++ @ allocate event memory, for evstr instructions ++ !alloc ev 4096 0x8200f ++ ++ mov x50, $x ++ ++ @ dump all registers to the memory starting at x50 ++ regdump x50 ++ ++ @ dump the memory region named 'x' ++ !dump x 0 4096 +diff --git a/README.rst b/README.rst +index b35246e034c..fd140a96013 100644 +--- a/README.rst ++++ b/README.rst +@@ -1,59 +1,136 @@ + `Mesa `_ - The 3D Graphics Library + ====================================================== + ++Valhall v10 "CSF" support branch—for Mali G710/G610. ++ ++Note that firmware is required for these GPUs, for RK3588 try ++downloading the file from the Rockchip `libmali ++`_ ++repo, and placing it in ``/lib/firmware/``. ++ ++Windowing system support ++------------------------ ++ ++Panfrost Wayland compositor (wlroots): ++ ++#. Panfrost Wayland clients ++#. Panfrost X11 clients via Xwayland [1]_ ++#. Blob X11 clients via Xwayland + dri2to3 [2]_ ++ ++Panfrost Wayland compositor (non-wlroots): ++ ++#. Panfrost Wayland clients ++#. Panfrost X11 clients via Xwayland ++#. Blob Wayland clients ++#. Blob X11 clients via Xwayland + dri2to3 [2]_ ++ ++Blob Wayland compositor: ++ ++#. Panfrost Wayland clients ++#. Blob Wayland clients ++ ++Panfrost Xorg server: [3]_ ++ ++#. Panfrost X11 clients ++#. Blob X11 clients ++ ++Blob Xorg server: ++ ++#. Panfrost X11 clients ++#. Blob X11 clients ++ ++Applications using KMS/DRM will also work. ++ ++.. [1] Requires ``CONFIG_DRM_IGNORE_IOTCL_PERMIT`` to be disabled in ++ the kernel configuration. The option is broken and should never ++ be enabled anyway. ++ ++.. [2] See https://gitlab.com/panfork/dri2to3 ++ ++.. [3] For Radxa Debian/Ubuntu, the ``xserver-xorg-core`` version ++ installed by default is not compatible with Panfrost. But note ++ that upstream Xorg does not work will the blob, so Mesa must be ++ installed so that it is used by default. 
(see the "Usage" ++ section below). To switch between the upstream and Rockchip ++ versions, run: ++ ++.. code-block:: sh ++ ++ $ sudo apt install xserver-xorg-core="$(apt-cache show xserver-xorg-core | grep Version | grep -v "$(dpkg -s xserver-xorg-core | grep Version)" | cut -d" " -f2)" ++ ++Broken combinations: ++ ++#. Panfrost wlroots + Blob Wayland does not work because wlroots does ++ not expose the ``mali_buffer_sharing`` protocol. This might be ++ fixable. ++#. Blob Wayland compositor + Panfrost X11 does not work because the ++ blob does not expose the required protocols for Xwayland ++ acceleration to work + + Source + ------ + +-This repository lives at https://gitlab.freedesktop.org/mesa/mesa. +-Other repositories are likely forks, and code found there is not supported. ++This repository lives at https://gitlab.com/panfork/mesa, and is a ++fork, so not supported by upstream. + ++Upstream source is at https://gitlab.freedesktop.org/mesa/mesa. + +-Build & install +---------------- ++Depdendencies ++------------- + +-You can find more information in our documentation (`docs/install.rst +-`_), but the recommended way is to use +-Meson (`docs/meson.rst `_): ++For Debian-based distributions: + + .. code-block:: sh + +- $ mkdir build +- $ cd build +- $ meson .. ++ $ sudo apt install build-essential meson git python3-mako libexpat1-dev bison flex libwayland-egl-backend-dev libxext-dev libxfixes-dev libxcb-glx0-dev libxcb-shm0-dev libxcb-dri2-0-dev libxcb-dri3-dev libxcb-present-dev libxshmfence-dev libxxf86vm-dev libxrandr-dev ++ ++Also needed is ``libdrm`` and ``wayland-protocols``, but those ++packages are too old in Debian Bullseye, and must be compiled from ++source: ++ ++.. code-block:: sh ++ ++ $ git clone https://gitlab.freedesktop.org/mesa/drm ++ $ mkdir drm/build ++ $ cd drm/build ++ $ meson ++ $ sudo ninja install ++ ++.. code-block:: sh ++ ++ $ git clone https://gitlab.freedesktop.org/wayland/wayland-protocols ++ $ mkdir wayland-protocols/build ++ $ cd wayland-protocols/build ++ $ git checkout 1.24 ++ $ meson + $ sudo ninja install + ++Build & install ++--------------- + +-Support +-------- ++To install to ``/opt/panfrost``: + +-Many Mesa devs hang on IRC; if you're not sure which channel is +-appropriate, you should ask your question on `OFTC's #dri-devel +-`_, someone will redirect you if +-necessary. +-Remember that not everyone is in the same timezone as you, so it might +-take a while before someone qualified sees your question. +-To figure out who you're talking to, or which nick to ping for your +-question, check out `Who's Who on IRC +-`_. ++.. code-block:: sh + +-The next best option is to ask your question in an email to the +-mailing lists: `mesa-dev\@lists.freedesktop.org +-`_ ++ $ mkdir build ++ $ cd build ++ $ meson -Dgallium-drivers=panfrost -Dvulkan-drivers= -Dllvm=disabled --prefix=/opt/panfrost ++ $ sudo ninja install + ++Usage ++----- + +-Bug reports +------------ ++To run an application with Panfrost (note the windowing system support ++section above): + +-If you think something isn't working properly, please file a bug report +-(`docs/bugs.rst `_). ++.. code-block:: sh + ++ $ LD_LIBRARY_PATH=/opt/panfrost/lib/aarch64-linux-gnu glmark2-es2-wayland + +-Contributing +------------- ++To use Panfrost by default, add the directory where you installed it ++to the library search path: + +-Contributions are welcome, and step-by-step instructions can be found in our +-documentation (`docs/submittingpatches.rst +-`_). ++.. 
code-block:: sh + +-Note that Mesa uses gitlab for patches submission, review and discussions. ++ $ echo /opt/panfrost/lib/aarch64-linux-gnu | sudo tee /etc/ld.so.conf.d/0-panfrost.conf ++ $ sudo ldconfig +diff --git a/android-aarch64 b/android-aarch64 +new file mode 100644 +index 00000000000..2737a2d01bd +--- /dev/null ++++ b/android-aarch64 +@@ -0,0 +1,26 @@ ++[binaries] ++ar = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-ar' ++c = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android26-clang', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC'] ++cpp = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android26-clang++', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC', '-fno-exceptions', '-fno-unwind-tables', '-fno-asynchronous-unwind-tables', '-static-libstdc++'] ++c_ld = 'lld' ++cpp_ld = 'lld' ++strip = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-strip' ++# Android doesn't come with a pkg-config, but we need one for meson to be happy not ++# finding all the optional deps it looks for. Use system pkg-config pointing at a ++# directory we get to populate with any .pc files we want to add for Android ++ ++# Also, include the plain DRM lib we found earlier. Panfrost relies on it rather heavily, especially when ++# interacting with the panfrost DRM module and not kbase ++ ++pkgconfig = ['env', 'PKG_CONFIG_LIBDIR=.:/tmp/drm-static/lib/pkgconfig', '/usr/bin/pkg-config'] ++ ++[host_machine] ++system = 'linux' ++# cpu_family = 'x86_64' ++# cpu = 'amd64' ++ ++# ik this is wrong but workaround sanity check ++cpu_family = 'arm' ++cpu = 'armv8' ++ ++endian = 'little' +diff --git a/android-arm32 b/android-arm32 +new file mode 100644 +index 00000000000..6bd6af4e902 +--- /dev/null ++++ b/android-arm32 +@@ -0,0 +1,26 @@ ++[binaries] ++ar = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-ar' ++c = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/armv7a-linux-androideabi26-clang', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC'] ++cpp = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/armv7a-linux-androideabi26-clang++', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC', '-fno-exceptions', '-fno-unwind-tables', '-fno-asynchronous-unwind-tables', '-static-libstdc++'] ++c_ld = 'lld' ++cpp_ld = 'lld' ++strip = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-strip' ++# Android doesn't come with a pkg-config, but we need one for meson to be happy not ++# finding all the optional deps it looks for. Use system pkg-config pointing at a ++# directory we get to populate with any .pc files we want to add for Android ++ ++# Also, include the plain DRM lib we found earlier. 
Panfrost relies on it rather heavily, especially when ++# interacting with the panfrost DRM module and not kbase ++ ++pkgconfig = ['env', 'PKG_CONFIG_LIBDIR=.:/tmp/drm-static/lib/pkgconfig', '/usr/bin/pkg-config'] ++ ++[host_machine] ++system = 'linux' ++# cpu_family = 'x86_64' ++# cpu = 'amd64' ++ ++# ik this is wrong but workaround sanity check ++cpu_family = 'arm' ++cpu = 'armv7' ++ ++endian = 'little' +diff --git a/android-drm-aarch64 b/android-drm-aarch64 +new file mode 100644 +index 00000000000..eb91f638435 +--- /dev/null ++++ b/android-drm-aarch64 +@@ -0,0 +1,22 @@ ++[binaries] ++ar = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-ar' ++c = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android24-clang', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC'] ++cpp = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android24-clang++', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC', '-fno-exceptions', '-fno-unwind-tables', '-fno-asynchronous-unwind-tables', '-static-libstdc++'] ++c_ld = 'lld' ++cpp_ld = 'lld' ++strip = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-strip' ++# Android doesn't come with a pkg-config, but we need one for meson to be happy not ++# finding all the optional deps it looks for. Use system pkg-config pointing at a ++# directory we get to populate with any .pc files we want to add for Android ++pkgconfig = ['env', 'PKG_CONFIG_LIBDIR=.', '/usr/bin/pkg-config'] ++ ++[host_machine] ++system = 'linux' ++# cpu_family = 'x86_64' ++# cpu = 'amd64' ++ ++# ik this is wrong but workaround sanity check ++cpu_family = 'arm' ++cpu = 'armv8' ++ ++endian = 'little' +diff --git a/android-drm-arm32 b/android-drm-arm32 +new file mode 100644 +index 00000000000..5fae96b7d1e +--- /dev/null ++++ b/android-drm-arm32 +@@ -0,0 +1,22 @@ ++[binaries] ++ar = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-ar' ++c = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/armv7a-linux-androideabi24-clang', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC'] ++cpp = ['ccache', '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/armv7a-linux-androideabi24-clang++', '-O3', '-DVK_USE_PLATFORM_ANDROID_KHR', '-fPIC', '-fno-exceptions', '-fno-unwind-tables', '-fno-asynchronous-unwind-tables', '-static-libstdc++'] ++c_ld = 'lld' ++cpp_ld = 'lld' ++strip = '$ANDROID_NDK_ROOT/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-strip' ++# Android doesn't come with a pkg-config, but we need one for meson to be happy not ++# finding all the optional deps it looks for. 
Use system pkg-config pointing at a ++# directory we get to populate with any .pc files we want to add for Android ++pkgconfig = ['env', 'PKG_CONFIG_LIBDIR=.', '/usr/bin/pkg-config'] ++ ++[host_machine] ++system = 'linux' ++# cpu_family = 'x86_64' ++# cpu = 'amd64' ++ ++# ik this is wrong but workaround sanity check ++cpu_family = 'arm' ++cpu = 'armv7' ++ ++endian = 'little' +diff --git a/docs/features.txt b/docs/features.txt +index 40ebfd68028..d5233eb5010 100644 +--- a/docs/features.txt ++++ b/docs/features.txt +@@ -213,7 +213,7 @@ GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, i965/gen8+, nvc0, r600, radeonsi, + GL 4.5, GLSL 4.50 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, zink + + GL_ARB_ES3_1_compatibility DONE (freedreno/a6xx, i965/hsw+, softpipe, virgl) +- GL_ARB_clip_control DONE (freedreno, i965, nv50, softpipe, virgl, lima, d3d12) ++ GL_ARB_clip_control DONE (freedreno, i965, nv50, softpipe, virgl, lima, d3d12, panfrost) + GL_ARB_conditional_render_inverted DONE (freedreno, i965, nv50, softpipe, virgl, panfrost, d3d12) + GL_ARB_cull_distance DONE (freedreno/a6xx, i965, nv50, softpipe, virgl) + GL_ARB_derivative_control DONE (freedreno/a3xx+, i965, nv50, softpipe, virgl) +diff --git a/include/dma-uapi/dma-buf.h b/include/dma-uapi/dma-buf.h +new file mode 100644 +index 00000000000..5a6fda66d9a +--- /dev/null ++++ b/include/dma-uapi/dma-buf.h +@@ -0,0 +1,182 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * Framework for buffer objects that can be shared across devices/subsystems. ++ * ++ * Copyright(C) 2015 Intel Ltd ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published by ++ * the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program. If not, see . ++ */ ++ ++#ifndef _DMA_BUF_UAPI_H_ ++#define _DMA_BUF_UAPI_H_ ++ ++#include ++ ++/** ++ * struct dma_buf_sync - Synchronize with CPU access. ++ * ++ * When a DMA buffer is accessed from the CPU via mmap, it is not always ++ * possible to guarantee coherency between the CPU-visible map and underlying ++ * memory. To manage coherency, DMA_BUF_IOCTL_SYNC must be used to bracket ++ * any CPU access to give the kernel the chance to shuffle memory around if ++ * needed. ++ * ++ * Prior to accessing the map, the client must call DMA_BUF_IOCTL_SYNC ++ * with DMA_BUF_SYNC_START and the appropriate read/write flags. Once the ++ * access is complete, the client should call DMA_BUF_IOCTL_SYNC with ++ * DMA_BUF_SYNC_END and the same read/write flags. ++ * ++ * The synchronization provided via DMA_BUF_IOCTL_SYNC only provides cache ++ * coherency. It does not prevent other processes or devices from ++ * accessing the memory at the same time. If synchronization with a GPU or ++ * other device driver is required, it is the client's responsibility to ++ * wait for buffer to be ready for reading or writing before calling this ++ * ioctl with DMA_BUF_SYNC_START. Likewise, the client must ensure that ++ * follow-up work is not submitted to GPU or other device driver until ++ * after this ioctl has been called with DMA_BUF_SYNC_END? 
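++ *
++ * For example, a CPU write to the mapping can be bracketed as follows
++ * (illustrative only: dmabuf_fd, map, data and size stand in for the
++ * dma-buf file descriptor, its mmap()ed pointer and the data copied):
++ *
++ *     struct dma_buf_sync sync = { .flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE };
++ *
++ *     ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);
++ *     memcpy(map, data, size);    // CPU access through the mapping
++ *     sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
++ *     ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);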
++ * ++ * If the driver or API with which the client is interacting uses implicit ++ * synchronization, waiting for prior work to complete can be done via ++ * poll() on the DMA buffer file descriptor. If the driver or API requires ++ * explicit synchronization, the client may have to wait on a sync_file or ++ * other synchronization primitive outside the scope of the DMA buffer API. ++ */ ++struct dma_buf_sync { ++ /** ++ * @flags: Set of access flags ++ * ++ * DMA_BUF_SYNC_START: ++ * Indicates the start of a map access session. ++ * ++ * DMA_BUF_SYNC_END: ++ * Indicates the end of a map access session. ++ * ++ * DMA_BUF_SYNC_READ: ++ * Indicates that the mapped DMA buffer will be read by the ++ * client via the CPU map. ++ * ++ * DMA_BUF_SYNC_WRITE: ++ * Indicates that the mapped DMA buffer will be written by the ++ * client via the CPU map. ++ * ++ * DMA_BUF_SYNC_RW: ++ * An alias for DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE. ++ */ ++ __u64 flags; ++}; ++ ++#define DMA_BUF_SYNC_READ (1 << 0) ++#define DMA_BUF_SYNC_WRITE (2 << 0) ++#define DMA_BUF_SYNC_RW (DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE) ++#define DMA_BUF_SYNC_START (0 << 2) ++#define DMA_BUF_SYNC_END (1 << 2) ++#define DMA_BUF_SYNC_VALID_FLAGS_MASK \ ++ (DMA_BUF_SYNC_RW | DMA_BUF_SYNC_END) ++ ++#define DMA_BUF_NAME_LEN 32 ++ ++/** ++ * struct dma_buf_export_sync_file - Get a sync_file from a dma-buf ++ * ++ * Userspace can perform a DMA_BUF_IOCTL_EXPORT_SYNC_FILE to retrieve the ++ * current set of fences on a dma-buf file descriptor as a sync_file. CPU ++ * waits via poll() or other driver-specific mechanisms typically wait on ++ * whatever fences are on the dma-buf at the time the wait begins. This ++ * is similar except that it takes a snapshot of the current fences on the ++ * dma-buf for waiting later instead of waiting immediately. This is ++ * useful for modern graphics APIs such as Vulkan which assume an explicit ++ * synchronization model but still need to inter-operate with dma-buf. ++ * ++ * The intended usage pattern is the following: ++ * ++ * 1. Export a sync_file with flags corresponding to the expected GPU usage ++ * via DMA_BUF_IOCTL_EXPORT_SYNC_FILE. ++ * ++ * 2. Submit rendering work which uses the dma-buf. The work should wait on ++ * the exported sync file before rendering and produce another sync_file ++ * when complete. ++ * ++ * 3. Import the rendering-complete sync_file into the dma-buf with flags ++ * corresponding to the GPU usage via DMA_BUF_IOCTL_IMPORT_SYNC_FILE. ++ * ++ * Unlike doing implicit synchronization via a GPU kernel driver's exec ioctl, ++ * the above is not a single atomic operation. If userspace wants to ensure ++ * ordering via these fences, it is the respnosibility of userspace to use ++ * locks or other mechanisms to ensure that no other context adds fences or ++ * submits work between steps 1 and 3 above. ++ */ ++struct dma_buf_export_sync_file { ++ /** ++ * @flags: Read/write flags ++ * ++ * Must be DMA_BUF_SYNC_READ, DMA_BUF_SYNC_WRITE, or both. ++ * ++ * If DMA_BUF_SYNC_READ is set and DMA_BUF_SYNC_WRITE is not set, ++ * the returned sync file waits on any writers of the dma-buf to ++ * complete. Waiting on the returned sync file is equivalent to ++ * poll() with POLLIN. ++ * ++ * If DMA_BUF_SYNC_WRITE is set, the returned sync file waits on ++ * any users of the dma-buf (read or write) to complete. Waiting ++ * on the returned sync file is equivalent to poll() with POLLOUT. 
++ * If both DMA_BUF_SYNC_WRITE and DMA_BUF_SYNC_READ are set, this ++ * is equivalent to just DMA_BUF_SYNC_WRITE. ++ */ ++ __u32 flags; ++ /** @fd: Returned sync file descriptor */ ++ __s32 fd; ++}; ++ ++/** ++ * struct dma_buf_import_sync_file - Insert a sync_file into a dma-buf ++ * ++ * Userspace can perform a DMA_BUF_IOCTL_IMPORT_SYNC_FILE to insert a ++ * sync_file into a dma-buf for the purposes of implicit synchronization ++ * with other dma-buf consumers. This allows clients using explicitly ++ * synchronized APIs such as Vulkan to inter-op with dma-buf consumers ++ * which expect implicit synchronization such as OpenGL or most media ++ * drivers/video. ++ */ ++struct dma_buf_import_sync_file { ++ /** ++ * @flags: Read/write flags ++ * ++ * Must be DMA_BUF_SYNC_READ, DMA_BUF_SYNC_WRITE, or both. ++ * ++ * If DMA_BUF_SYNC_READ is set and DMA_BUF_SYNC_WRITE is not set, ++ * this inserts the sync_file as a read-only fence. Any subsequent ++ * implicitly synchronized writes to this dma-buf will wait on this ++ * fence but reads will not. ++ * ++ * If DMA_BUF_SYNC_WRITE is set, this inserts the sync_file as a ++ * write fence. All subsequent implicitly synchronized access to ++ * this dma-buf will wait on this fence. ++ */ ++ __u32 flags; ++ /** @fd: Sync file descriptor */ ++ __s32 fd; ++}; ++ ++#define DMA_BUF_BASE 'b' ++#define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync) ++ ++/* 32/64bitness of this uapi was botched in android, there's no difference ++ * between them in actual uapi, they're just different numbers. ++ */ ++#define DMA_BUF_SET_NAME _IOW(DMA_BUF_BASE, 1, const char *) ++#define DMA_BUF_SET_NAME_A _IOW(DMA_BUF_BASE, 1, __u32) ++#define DMA_BUF_SET_NAME_B _IOW(DMA_BUF_BASE, 1, __u64) ++#define DMA_BUF_IOCTL_EXPORT_SYNC_FILE _IOWR(DMA_BUF_BASE, 2, struct dma_buf_export_sync_file) ++#define DMA_BUF_IOCTL_IMPORT_SYNC_FILE _IOW(DMA_BUF_BASE, 3, struct dma_buf_import_sync_file) ++ ++#endif +diff --git a/include/drm-uapi/drm_fourcc.h b/include/drm-uapi/drm_fourcc.h +index 0e70e36cd9d..37711252619 100644 +--- a/include/drm-uapi/drm_fourcc.h ++++ b/include/drm-uapi/drm_fourcc.h +@@ -1164,6 +1164,13 @@ drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) + */ + #define AFBC_FORMAT_MOD_USM (1ULL << 12) + ++/* AFBC native swizzle ++ * ++ * Indicates that the buffer is using RGBA component order regardless of the ++ * actual format. 
++ */ ++#define AFBC_FORMAT_MOD_NATIVE_SWIZZLE (1ULL << 32) ++ + /* + * Arm Fixed-Rate Compression (AFRC) modifiers + * +diff --git a/meson.build b/meson.build +index 1e6ccd8cbb9..2a305cdc742 100644 +--- a/meson.build ++++ b/meson.build +@@ -865,14 +865,13 @@ endif + + with_gallium_st_nine = get_option('gallium-nine') + if with_gallium_st_nine +- if not with_gallium_softpipe +- error('The nine state tracker requires gallium softpipe/llvmpipe.') +- elif not [ ++ if not [ + with_gallium_crocus, + with_gallium_freedreno, + with_gallium_i915, + with_gallium_iris, + with_gallium_nouveau, ++ with_gallium_panfrost, + with_gallium_r300, + with_gallium_r600, + with_gallium_radeonsi, +diff --git a/src/android_stub/meson.build b/src/android_stub/meson.build +index 86f88caea34..a43a9ddfd6b 100644 +--- a/src/android_stub/meson.build ++++ b/src/android_stub/meson.build +@@ -1,7 +1,7 @@ + if with_android_stub + stub_libs = [] + +- foreach lib : ['backtrace', 'cutils', 'hardware', 'log', 'nativewindow', 'sync'] ++ foreach lib : ['hardware', 'log', 'nativewindow'] + stub_libs += shared_library( + lib, + files(lib + '_stub.cpp'), +diff --git a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp +index fc498fc8a24..6073c912c19 100644 +--- a/src/compiler/glsl/glsl_to_nir.cpp ++++ b/src/compiler/glsl/glsl_to_nir.cpp +@@ -81,9 +81,6 @@ class nir_visitor : public ir_visitor + + void create_function(ir_function_signature *ir); + +- /* True if we have any output rvalues */ +- bool has_output_rvalue; +- + private: + void add_instr(nir_instr *instr, unsigned num_components, unsigned bit_size); + nir_ssa_def *evaluate_rvalue(ir_rvalue *ir); +@@ -274,9 +271,6 @@ glsl_to_nir(const struct gl_constants *consts, + if (var->data.mode == nir_var_shader_in && var->data.sample) + shader->info.fs.uses_sample_shading = true; + } +- +- if (v1.has_output_rvalue) +- shader->info.fs.uses_sample_shading = true; + } + + return shader; +@@ -287,7 +281,6 @@ nir_visitor::nir_visitor(const struct gl_constants *consts, nir_shader *shader) + this->supports_std430 = consts->UseSTD430AsDefaultPacking; + this->shader = shader; + this->is_global = true; +- this->has_output_rvalue = false; + this->var_table = _mesa_pointer_hash_table_create(NULL); + this->overload_table = _mesa_pointer_hash_table_create(NULL); + this->sparse_variable_set = _mesa_pointer_set_create(NULL); +@@ -1826,9 +1819,6 @@ nir_visitor::evaluate_rvalue(ir_rvalue* ir) + + enum gl_access_qualifier access = deref_get_qualifier(this->deref); + this->result = nir_load_deref_with_access(&b, this->deref, access); +- +- if (nir_deref_mode_is(this->deref, nir_var_shader_out)) +- this->has_output_rvalue = true; + } + + return this->result; +diff --git a/src/compiler/glsl/standalone_scaffolding.cpp b/src/compiler/glsl/standalone_scaffolding.cpp +index f875e2e08bf..7de2edf586e 100644 +--- a/src/compiler/glsl/standalone_scaffolding.cpp ++++ b/src/compiler/glsl/standalone_scaffolding.cpp +@@ -262,6 +262,9 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api) + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxUniformComponents = 1024; + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxInputComponents = 0; /* not used */ + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxOutputComponents = 0; /* not used */ ++ ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicBuffers = 16; ++ ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicCounters = 16; ++ ctx->Const.Program[MESA_SHADER_COMPUTE].MaxShaderStorageBlocks = 16; + + /* Set up default shader compiler options. 
*/ + struct gl_shader_compiler_options options; +diff --git a/src/drm-shim/device.c b/src/drm-shim/device.c +index 6c9c994643b..11825d717c4 100644 +--- a/src/drm-shim/device.c ++++ b/src/drm-shim/device.c +@@ -292,6 +292,10 @@ drm_shim_ioctl(int fd, unsigned long request, void *arg) + ASSERTED int type = _IOC_TYPE(request); + int nr = _IOC_NR(request); + ++ /* Used by kbase; do not claim to be a kbase FD */ ++ if (type == 0x80) ++ return -EINVAL; ++ + assert(type == DRM_IOCTL_BASE); + + if (nr >= DRM_COMMAND_BASE && nr < DRM_COMMAND_END) { +diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c +index de29d03940f..2fb7d55b508 100644 +--- a/src/egl/drivers/dri2/egl_dri2.c ++++ b/src/egl/drivers/dri2/egl_dri2.c +@@ -52,6 +52,8 @@ + #include + #include "wayland-drm.h" + #include "wayland-drm-client-protocol.h" ++#include "mali-buffer-sharing.h" ++#include "mali-buffer-sharing-client-protocol.h" + #include "linux-dmabuf-unstable-v1-client-protocol.h" + #endif + +@@ -2259,6 +2261,9 @@ dri2_create_image_wayland_wl_buffer(_EGLDisplay *disp, _EGLContext *ctx, + + buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_drm, + (struct wl_resource *) _buffer); ++ if (!buffer) ++ buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_mali, ++ (struct wl_resource *) _buffer); + if (!buffer) + return NULL; + +@@ -3256,11 +3261,27 @@ dri2_bind_wayland_display_wl(_EGLDisplay *disp, struct wl_display *wl_dpy) + wayland_drm_init(wl_dpy, device_name, + &wl_drm_callbacks, disp, flags); + ++ drmSetVersion sv = { ++ .drm_di_major = 1, ++ .drm_di_minor = 4, ++ .drm_dd_major = -1, ++ .drm_dd_minor = -1, ++ }; ++ drmSetInterfaceVersion(dri2_dpy->fd, &sv); ++ ++ char *busid = drmGetBusid(dri2_dpy->fd); ++ dri2_dpy->wl_server_mali = ++ mali_buffer_sharing_init(wl_dpy, busid, ++ &wl_drm_callbacks, ++ disp); ++ drmFreeBusid(busid); ++ + free(device_name); + + if (!dri2_dpy->wl_server_drm) + goto fail; + ++ // TODO: Do this for mali_buffer_sharing + #ifdef HAVE_DRM_PLATFORM + /* We have to share the wl_drm instance with gbm, so gbm can convert + * wl_buffers to gbm bos. 
*/ +@@ -3281,6 +3302,11 @@ dri2_unbind_wayland_display_wl(_EGLDisplay *disp, struct wl_display *wl_dpy) + { + struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); + ++ if (dri2_dpy->wl_server_mali) { ++ wayland_drm_uninit(dri2_dpy->wl_server_mali); ++ dri2_dpy->wl_server_mali = NULL; ++ } ++ + if (!dri2_dpy->wl_server_drm) + return EGL_FALSE; + +@@ -3299,6 +3325,8 @@ dri2_query_wayland_buffer_wl(_EGLDisplay *disp, struct wl_resource *buffer_resou + const struct wl_drm_components_descriptor *format; + + buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_drm, buffer_resource); ++ if (!buffer) ++ buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_mali, buffer_resource); + if (!buffer) + return EGL_FALSE; + +diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h +index e4c15875091..f5143147014 100644 +--- a/src/egl/drivers/dri2/egl_dri2.h ++++ b/src/egl/drivers/dri2/egl_dri2.h +@@ -284,8 +284,11 @@ struct dri2_egl_display + struct wl_display *wl_dpy_wrapper; + struct wl_registry *wl_registry; + struct wl_drm *wl_server_drm; ++ struct wl_drm *wl_server_mali; + struct wl_drm *wl_drm; ++ struct wl_drm *wl_mali; + uint32_t wl_drm_version, wl_drm_name; ++ uint32_t wl_mali_version, wl_mali_name; + struct wl_shm *wl_shm; + struct wl_event_queue *wl_queue; + struct zwp_linux_dmabuf_v1 *wl_dmabuf; +@@ -337,6 +340,7 @@ struct dri2_egl_surface + struct wl_surface *wl_surface_wrapper; + struct wl_display *wl_dpy_wrapper; + struct wl_drm *wl_drm_wrapper; ++ struct wl_drm *wl_mali_wrapper; + struct wl_callback *throttle_callback; + struct zwp_linux_dmabuf_feedback_v1 *wl_dmabuf_feedback; + struct dmabuf_feedback dmabuf_feedback, pending_dmabuf_feedback; +diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c +index 260eb9c82af..70c07ccf127 100644 +--- a/src/egl/drivers/dri2/platform_wayland.c ++++ b/src/egl/drivers/dri2/platform_wayland.c +@@ -51,6 +51,7 @@ + #include + #include + #include "wayland-drm-client-protocol.h" ++#include "mali-buffer-sharing-client-protocol.h" + #include "linux-dmabuf-unstable-v1-client-protocol.h" + + /* +@@ -668,7 +669,7 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, + dri2_surf->base.PresentOpaque); + assert(visual_idx != -1); + +- if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm) { ++ if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm || dri2_dpy->wl_mali) { + dri2_surf->format = dri2_wl_visuals[visual_idx].wl_drm_format; + } else { + assert(dri2_dpy->wl_shm); +@@ -691,6 +692,16 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, + dri2_surf->wl_queue); + } + ++ if (dri2_dpy->wl_mali) { ++ dri2_surf->wl_mali_wrapper = wl_proxy_create_wrapper(dri2_dpy->wl_mali); ++ if (!dri2_surf->wl_mali_wrapper) { ++ _eglError(EGL_BAD_ALLOC, "dri2_create_surface"); ++ goto cleanup_queue; ++ } ++ wl_proxy_set_queue((struct wl_proxy *)dri2_surf->wl_mali_wrapper, ++ dri2_surf->wl_queue); ++ } ++ + dri2_surf->wl_dpy_wrapper = wl_proxy_create_wrapper(dri2_dpy->wl_dpy); + if (!dri2_surf->wl_dpy_wrapper) { + _eglError(EGL_BAD_ALLOC, "dri2_create_surface"); +@@ -765,6 +776,8 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, + cleanup_drm: + if (dri2_surf->wl_drm_wrapper) + wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper); ++ if (dri2_surf->wl_mali_wrapper) ++ wl_proxy_wrapper_destroy(dri2_surf->wl_mali_wrapper); + cleanup_queue: + wl_event_queue_destroy(dri2_surf->wl_queue); + cleanup_surf: +@@ -827,6 +840,8 @@ dri2_wl_destroy_surface(_EGLDisplay *disp, _EGLSurface *surf) + 
wl_proxy_wrapper_destroy(dri2_surf->wl_dpy_wrapper); + if (dri2_surf->wl_drm_wrapper) + wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper); ++ if (dri2_surf->wl_mali_wrapper) ++ wl_proxy_wrapper_destroy(dri2_surf->wl_mali_wrapper); + if (dri2_surf->wl_dmabuf_feedback) { + zwp_linux_dmabuf_feedback_v1_destroy(dri2_surf->wl_dmabuf_feedback); + dmabuf_feedback_fini(&dri2_surf->dmabuf_feedback); +@@ -1460,6 +1475,26 @@ create_wl_buffer(struct dri2_egl_display *dri2_dpy, + ret = zwp_linux_buffer_params_v1_create_immed(params, width, height, + fourcc, 0); + zwp_linux_buffer_params_v1_destroy(params); ++ } else if (dri2_surf->wl_mali_wrapper || dri2_dpy->wl_mali) { ++ struct wl_drm *wl_mali = ++ dri2_surf ? dri2_surf->wl_mali_wrapper : dri2_dpy->wl_mali; ++ int fd = -1, stride; ++ ++ if (num_planes > 1) ++ return NULL; ++ ++ query = dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FD, &fd); ++ query &= dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, &stride); ++ if (!query) { ++ if (fd >= 0) ++ close(fd); ++ return NULL; ++ } ++ ++ ret = mali_buffer_sharing_create_buffer((void *)wl_mali, ++ width, height, stride, ++ fourcc, 0, 0, fd); ++ close(fd); + } else { + struct wl_drm *wl_drm = + dri2_surf ? dri2_surf->wl_drm_wrapper : dri2_dpy->wl_drm; +@@ -1733,6 +1768,62 @@ drm_handle_device(void *data, struct wl_drm *drm, const char *device) + } + } + ++static void ++mali_handle_device(void *data, struct mali_buffer_sharing *drm, const char *device) ++{ ++ struct dri2_egl_display *dri2_dpy = data; ++ drm_magic_t magic; ++ ++ // hack ++ //printf("device '%s'\n", device); ++ dri2_dpy->device_name = strdup("/dev/dri/card0"); ++ ++ dri2_dpy->fd = loader_open_device(dri2_dpy->device_name); ++ if (dri2_dpy->fd == -1) { ++ _eglLog(_EGL_WARNING, "wayland-egl: could not open %s (%s)", ++ dri2_dpy->device_name, strerror(errno)); ++ free(dri2_dpy->device_name); ++ dri2_dpy->device_name = NULL; ++ return; ++ } ++ ++ if (drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER) { ++ dri2_dpy->authenticated = true; ++ } else { ++ roundtrip(dri2_dpy); ++ if (drmGetMagic(dri2_dpy->fd, &magic)) { ++ close(dri2_dpy->fd); ++ dri2_dpy->fd = -1; ++ free(dri2_dpy->device_name); ++ dri2_dpy->device_name = NULL; ++ _eglLog(_EGL_WARNING, "wayland-egl: drmGetMagic failed"); ++ return; ++ } ++ ++ mali_buffer_sharing_auth((void *)dri2_dpy->wl_mali, magic); ++ roundtrip(dri2_dpy); ++ // We don't get a callback ++ dri2_dpy->authenticated = true; ++ } ++ ++ int supported_fourcc[] = { ++ WL_DRM_FORMAT_ABGR16F, ++ WL_DRM_FORMAT_ABGR2101010, ++ WL_DRM_FORMAT_XRGB8888, ++ WL_DRM_FORMAT_ARGB8888, ++ WL_DRM_FORMAT_ABGR8888, ++ WL_DRM_FORMAT_XBGR8888, ++ WL_DRM_FORMAT_RGB565, ++ }; ++ ++ for (unsigned i = 0; i < ARRAY_SIZE(supported_fourcc); ++i) { ++ int visual_idx = dri2_wl_visual_idx_from_fourcc(supported_fourcc[i]); ++ assert(visual_idx != -1); ++ ++ BITSET_SET(dri2_dpy->formats.formats_bitmap, visual_idx); ++ } ++} ++ + static void + drm_handle_format(void *data, struct wl_drm *drm, uint32_t format) + { +@@ -1768,6 +1859,10 @@ static const struct wl_drm_listener drm_listener = { + .capabilities = drm_handle_capabilities + }; + ++static const struct mali_buffer_sharing_listener mali_listener = { ++ .alloc_device = mali_handle_device, ++}; ++ + static void + dmabuf_ignore_format(void *data, struct zwp_linux_dmabuf_v1 *dmabuf, + uint32_t format) +@@ -1813,6 +1908,14 @@ wl_drm_bind(struct dri2_egl_display *dri2_dpy) + wl_drm_add_listener(dri2_dpy->wl_drm, &drm_listener, dri2_dpy); + } + ++static void ++wl_mali_bind(struct 
dri2_egl_display *dri2_dpy) ++{ ++ dri2_dpy->wl_mali = wl_registry_bind(dri2_dpy->wl_registry, dri2_dpy->wl_mali_name, ++ &mali_buffer_sharing_interface, dri2_dpy->wl_mali_version); ++ mali_buffer_sharing_add_listener((void *)dri2_dpy->wl_mali, &mali_listener, dri2_dpy); ++} ++ + static void + default_dmabuf_feedback_format_table(void *data, + struct zwp_linux_dmabuf_feedback_v1 *zwp_linux_dmabuf_feedback_v1, +@@ -1943,6 +2046,9 @@ registry_handle_global_drm(void *data, struct wl_registry *registry, + if (strcmp(interface, wl_drm_interface.name) == 0) { + dri2_dpy->wl_drm_version = MIN2(version, 2); + dri2_dpy->wl_drm_name = name; ++ } else if (strcmp(interface, mali_buffer_sharing_interface.name) == 0) { ++ dri2_dpy->wl_mali_version = MIN2(version, 5); ++ dri2_dpy->wl_mali_name = name; + } else if (strcmp(interface, zwp_linux_dmabuf_v1_interface.name) == 0 && version >= 3) { + dri2_dpy->wl_dmabuf = + wl_registry_bind(registry, name, &zwp_linux_dmabuf_v1_interface, +@@ -2145,10 +2251,7 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) + + /* We couldn't retrieve a render node from the dma-buf feedback (or the + * feedback was not advertised at all), so we must fallback to wl_drm. */ +- if (dri2_dpy->fd == -1) { +- /* wl_drm not advertised by compositor, so can't continue */ +- if (dri2_dpy->wl_drm_name == 0) +- goto cleanup; ++ if (dri2_dpy->fd == -1 && dri2_dpy->wl_drm_name) { + wl_drm_bind(dri2_dpy); + + if (dri2_dpy->wl_drm == NULL) +@@ -2161,6 +2264,22 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) + goto cleanup; + } + ++ if (dri2_dpy->fd == -1 && dri2_dpy->wl_mali_name) { ++ wl_mali_bind(dri2_dpy); ++ ++ if (dri2_dpy->wl_mali == NULL) ++ goto cleanup; ++ if (roundtrip(dri2_dpy) < 0 || dri2_dpy->fd == -1) ++ goto cleanup; ++ ++ if (!dri2_dpy->authenticated && ++ (roundtrip(dri2_dpy) < 0 || !dri2_dpy->authenticated)) ++ goto cleanup; ++ } ++ ++ if (dri2_dpy->fd == -1) ++ goto cleanup; ++ + dri2_dpy->fd = loader_get_user_preferred_fd(dri2_dpy->fd, + &dri2_dpy->is_different_gpu); + dev = _eglAddDevice(dri2_dpy->fd, false); +@@ -2786,6 +2905,8 @@ dri2_teardown_wayland(struct dri2_egl_display *dri2_dpy) + dri2_wl_formats_fini(&dri2_dpy->formats); + if (dri2_dpy->wl_drm) + wl_drm_destroy(dri2_dpy->wl_drm); ++ if (dri2_dpy->wl_mali) ++ wl_drm_destroy(dri2_dpy->wl_mali); + if (dri2_dpy->wl_dmabuf) + zwp_linux_dmabuf_v1_destroy(dri2_dpy->wl_dmabuf); + if (dri2_dpy->wl_shm) +diff --git a/src/egl/meson.build b/src/egl/meson.build +index 5b4644940a5..80dc6c94f33 100644 +--- a/src/egl/meson.build ++++ b/src/egl/meson.build +@@ -122,14 +122,16 @@ if with_dri2 + endif + if with_platform_wayland + deps_for_egl += [dep_wayland_client, dep_wayland_server, dep_wayland_egl_headers] +- link_for_egl += libwayland_drm ++ link_for_egl += [libwayland_drm, libmali_buffer_sharing] + files_egl += files('drivers/dri2/platform_wayland.c') + files_egl += [ + linux_dmabuf_unstable_v1_protocol_c, + linux_dmabuf_unstable_v1_client_protocol_h, + wayland_drm_client_protocol_h, ++ mali_buffer_sharing_client_protocol_h, + ] + incs_for_egl += include_directories('wayland/wayland-drm') ++ incs_for_egl += include_directories('wayland/mali-buffer-sharing') + endif + if with_platform_android + deps_for_egl += dep_android +diff --git a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c +new file mode 100644 +index 00000000000..d3c9a6f0dd2 +--- /dev/null ++++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c +@@ -0,0 +1,170 @@ ++/* ++ * Copyright © 
2022 Icecream95 ++ * Copyright © 2011 Kristian Høgsberg ++ * Copyright © 2011 Benjamin Franzke ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT ++ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, ++ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ * ++ * Authors: ++ * Kristian Høgsberg ++ * Benjamin Franzke ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "mali-buffer-sharing.h" ++#include "mali-buffer-sharing-server-protocol.h" ++#include "wayland-drm-client-protocol.h" ++ ++#define MIN(x,y) (((x)<(y))?(x):(y)) ++ ++static void ++destroy_buffer(struct wl_resource *resource) ++{ ++ struct wl_drm_buffer *buffer = wl_resource_get_user_data(resource); ++ struct wl_drm *drm = buffer->drm; ++ ++ drm->callbacks.release_buffer(drm->user_data, buffer); ++ free(buffer); ++} ++ ++static void ++buffer_destroy(struct wl_client *client, struct wl_resource *resource) ++{ ++ wl_resource_destroy(resource); ++} ++ ++static void ++create_buffer(struct wl_client *client, struct wl_resource *resource, ++ uint32_t id, uint32_t name, int fd, ++ int32_t width, int32_t height, ++ uint32_t format, ++ int32_t offset, int32_t stride) ++{ ++ struct wl_drm *drm = wl_resource_get_user_data(resource); ++ struct wl_drm_buffer *buffer; ++ ++ buffer = calloc(1, sizeof *buffer); ++ if (buffer == NULL) { ++ wl_resource_post_no_memory(resource); ++ return; ++ } ++ ++ buffer->drm = drm; ++ buffer->width = width; ++ buffer->height = height; ++ buffer->format = format; ++ buffer->offset[0] = offset; ++ buffer->stride[0] = stride; ++ ++ drm->callbacks.reference_buffer(drm->user_data, name, fd, buffer); ++ if (buffer->driver_buffer == NULL) { ++ // TODO: We should return an error ++ return; ++ } ++ ++ buffer->resource = ++ wl_resource_create(client, &wl_buffer_interface, 1, id); ++ if (!buffer->resource) { ++ wl_resource_post_no_memory(resource); ++ free(buffer); ++ return; ++ } ++ ++ wl_resource_set_implementation(buffer->resource, ++ (void (**)(void)) &drm->buffer_interface, ++ buffer, destroy_buffer); ++} ++ ++static void ++mali_create_buffer(struct wl_client *client, ++ struct wl_resource *resource, ++ uint32_t id, ++ int32_t width, int32_t height, uint32_t stride, ++ enum wl_drm_format format, uint32_t unk1, uint32_t unk2, ++ int fd) ++{ ++ create_buffer(client, resource, id, 0, fd, width, height, format, ++ 0, stride); ++ close(fd); ++} ++ ++static void ++mali_auth(struct wl_client *client, ++ struct wl_resource *resource, uint32_t id) ++{ ++ struct wl_drm *drm = wl_resource_get_user_data(resource); ++ 
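++   /* 'id' is the DRM magic sent by the client via mali_buffer_sharing.auth;
++    * validate it through the same authenticate callback that wl_drm uses. */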
++ drm->callbacks.authenticate(drm->user_data, id); ++} ++ ++static const struct mali_buffer_sharing_interface mali_interface = { ++ mali_create_buffer, ++ mali_auth, ++}; ++ ++static void ++bind_mali(struct wl_client *client, void *data, uint32_t version, uint32_t id) ++{ ++ struct wl_drm *drm = data; ++ struct wl_resource *resource; ++ ++ resource = wl_resource_create(client, &mali_buffer_sharing_interface, ++ MIN(version, 4), id); ++ if (!resource) { ++ wl_client_post_no_memory(client); ++ return; ++ } ++ ++ wl_resource_set_implementation(resource, &mali_interface, data, NULL); ++ ++ mali_buffer_sharing_send_alloc_device(resource, drm->device_name); ++} ++ ++struct wl_drm * ++mali_buffer_sharing_init(struct wl_display *display, char *device_name, ++ const struct wayland_drm_callbacks *callbacks, void *user_data) ++{ ++ struct wl_drm *drm; ++ ++ drm = malloc(sizeof *drm); ++ if (!drm) ++ return NULL; ++ ++ drm->display = display; ++ drm->device_name = strdup(device_name ?: ""); ++ drm->callbacks = *callbacks; ++ drm->user_data = user_data; ++ drm->flags = 1; ++ ++ drm->buffer_interface.destroy = buffer_destroy; ++ ++ drm->wl_drm_global = ++ wl_global_create(display, &mali_buffer_sharing_interface, 5, ++ drm, bind_mali); ++ ++ return drm; ++} +diff --git a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h +new file mode 100644 +index 00000000000..f7f2c4811df +--- /dev/null ++++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h +@@ -0,0 +1,12 @@ ++#ifndef MALI_BUFFER_H ++#define MALI_BUFFER_H ++ ++#include ++ ++#include "wayland-drm.h" ++ ++struct wl_drm * ++mali_buffer_sharing_init(struct wl_display *display, char *device_name, ++ const struct wayland_drm_callbacks *callbacks, void *user_data); ++ ++#endif +diff --git a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml +new file mode 100644 +index 00000000000..0ad02488118 +--- /dev/null ++++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml +@@ -0,0 +1,50 @@ ++ ++ ++ ++ ++ Copyright © 2022 Icecream95 ++ ++ Permission to use, copy, modify, distribute, and sell this ++ software and its documentation for any purpose is hereby granted ++ without fee, provided that\n the above copyright notice appear in ++ all copies and that both that copyright notice and this permission ++ notice appear in supporting documentation, and that the name of ++ the copyright holders not be used in advertising or publicity ++ pertaining to distribution of the software without specific, ++ written prior permission. The copyright holders make no ++ representations about the suitability of this software for any ++ purpose. It is provided "as is" without express or implied ++ warranty. ++ ++ THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS ++ SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND ++ FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ++ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ++ AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ++ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ++ THIS SOFTWARE. 
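++
++  <!-- The interface mirrors wl_drm: the server advertises a DRM device
++       node via the alloc_device event, and clients respond with the auth
++       request (carrying a DRM magic) and the create_buffer request
++       (carrying a dma-buf fd plus width, height, stride and format). -->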
++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/src/egl/wayland/mali-buffer-sharing/meson.build b/src/egl/wayland/mali-buffer-sharing/meson.build +new file mode 100644 +index 00000000000..0693bf8668c +--- /dev/null ++++ b/src/egl/wayland/mali-buffer-sharing/meson.build +@@ -0,0 +1,51 @@ ++# Copyright © 2017 Intel Corporation ++ ++# Permission is hereby granted, free of charge, to any person obtaining a copy ++# of this software and associated documentation files (the "Software"), to deal ++# in the Software without restriction, including without limitation the rights ++# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++# copies of the Software, and to permit persons to whom the Software is ++# furnished to do so, subject to the following conditions: ++ ++# The above copyright notice and this permission notice shall be included in ++# all copies or substantial portions of the Software. ++ ++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++# SOFTWARE. ++ ++inc_mali_buffer_sharing = include_directories('.') ++ ++mali_buffer_sharing_protocol_c = custom_target( ++ 'mali-buffer-sharing-protocol.c', ++ input : 'mali-buffer-sharing.xml', ++ output : 'mali-buffer-sharing-protocol.c', ++ command : [prog_wl_scanner, wl_scanner_arg, '@INPUT@', '@OUTPUT@'], ++) ++ ++mali_buffer_sharing_client_protocol_h = custom_target( ++ 'mali-buffer-sharing-client-protocol.h', ++ input : 'mali-buffer-sharing.xml', ++ output : 'mali-buffer-sharing-client-protocol.h', ++ command : [prog_wl_scanner, 'client-header', '@INPUT@', '@OUTPUT@'], ++) ++ ++mali_buffer_sharing_server_protocol_h = custom_target( ++ 'mali-buffer-sharing-server-protocol.h', ++ input : 'mali-buffer-sharing.xml', ++ output : 'mali-buffer-sharing-server-protocol.h', ++ command : [prog_wl_scanner, 'server-header', '@INPUT@', '@OUTPUT@'], ++) ++ ++libmali_buffer_sharing = static_library( ++ 'mali-buffer-sharing', ++ ['mali-buffer-sharing.c', mali_buffer_sharing_protocol_c, mali_buffer_sharing_server_protocol_h, wayland_drm_client_protocol_h], ++ include_directories : inc_wayland_drm, ++ gnu_symbol_visibility : 'hidden', ++ dependencies : [dep_wayland_server], ++ build_by_default : false, ++) +diff --git a/src/egl/wayland/wayland-drm/wayland-drm.c b/src/egl/wayland/wayland-drm/wayland-drm.c +index 29558ea910e..ad9e64b72ee 100644 +--- a/src/egl/wayland/wayland-drm/wayland-drm.c ++++ b/src/egl/wayland/wayland-drm/wayland-drm.c +@@ -212,7 +212,7 @@ bind_drm(struct wl_client *client, void *data, uint32_t version, uint32_t id) + + wl_resource_set_implementation(resource, &drm_interface, data, NULL); + +- wl_resource_post_event(resource, WL_DRM_DEVICE, drm->device_name); ++ wl_resource_post_event(resource, WL_DRM_DEVICE, "/dev/dri/card0"); + + if (drm->callbacks.is_format_supported(drm->user_data, + WL_DRM_FORMAT_ARGB2101010)) { +diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c +index efce6f6737e..6c0242770c6 100644 +--- a/src/gallium/auxiliary/cso_cache/cso_context.c ++++ b/src/gallium/auxiliary/cso_cache/cso_context.c 
+@@ -1368,6 +1368,11 @@ cso_single_sampler(struct cso_context *ctx, enum pipe_shader_type shader_stage, + } + } + ++void ++cso_set_max_sampler(struct cso_context *ctx, int max_sampler_seen) ++{ ++ ctx->max_sampler_seen = max_sampler_seen; ++} + + /** + * Send staged sampler state to the driver. +diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h +index 4b9ec2098bf..24535f62b35 100644 +--- a/src/gallium/auxiliary/cso_cache/cso_context.h ++++ b/src/gallium/auxiliary/cso_cache/cso_context.h +@@ -83,6 +83,9 @@ void + cso_single_sampler(struct cso_context *cso, enum pipe_shader_type shader_stage, + unsigned idx, const struct pipe_sampler_state *states); + ++void ++cso_set_max_sampler(struct cso_context *ctx, int max_sampler_seen); ++ + void + cso_single_sampler_done(struct cso_context *cso, + enum pipe_shader_type shader_stage); +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c +index 57c953a8d3b..ed76910c66d 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c +@@ -1027,7 +1027,7 @@ static void emit_atomic_global(struct lp_build_nir_context *bld_base, + case nir_intrinsic_global_atomic_fadd: + op = LLVMAtomicRMWBinOpFAdd; + break; +-#if LLVM_VERSION_MAJOR >= 15 ++#if LLVM_VERSION_MAJOR >= 16 + case nir_intrinsic_global_atomic_fmin: + op = LLVMAtomicRMWBinOpFMin; + break; +@@ -1542,7 +1542,7 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base, + case nir_intrinsic_ssbo_atomic_fadd: + op = LLVMAtomicRMWBinOpFAdd; + break; +-#if LLVM_VERSION_MAJOR >= 15 ++#if LLVM_VERSION_MAJOR >= 16 + case nir_intrinsic_shared_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmin: + op = LLVMAtomicRMWBinOpFMin; +diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c +index b27858ab467..ba7c1b8d586 100644 +--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c ++++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c +@@ -189,7 +189,7 @@ pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd) + int new_fd; + + if (fd < 0 || (new_fd = os_dupfd_cloexec(fd)) < 0) +- return false; ++ return false; + + ret = pipe_loader_drm_probe_fd_nodup(dev, new_fd); + if (!ret) +diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h +index d821008b534..dfef7a24c8c 100644 +--- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h ++++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h +@@ -8,6 +8,10 @@ + #include "frontend/sw_winsys.h" + #include "target-helpers/inline_debug_helper.h" + ++#include ++#include ++#include ++ + /* Helper function to choose and instantiate one of the software rasterizers: + * llvmpipe, softpipe. 
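++ * With this patch, a kbase-backed panfrost screen can also be created
++ * here when the "panfrost" driver is requested.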
+ */ +@@ -33,6 +37,10 @@ + #include "asahi/agx_public.h" + #endif + ++#if defined(GALLIUM_PANFROST) ++#include "panfrost/pan_public.h" ++#endif ++ + static inline struct pipe_screen * + sw_screen_create_named(struct sw_winsys *winsys, const char *driver) + { +@@ -71,6 +79,19 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver) + screen = agx_screen_create(0, NULL, winsys); + #endif + ++#if defined(GALLIUM_PANFROST) ++ if(screen == NULL && strcmp(driver, "panfrost") == 0) { ++ int kbase_device_fd = open("/dev/mali0", O_RDWR | O_CLOEXEC | O_NONBLOCK); ++ if(kbase_device_fd == -1) { ++ printf("PAN_OSMESA: Failed to open kbase device: %s", strerror(errno)); ++ }else { ++ screen = panfrost_create_screen(kbase_device_fd, NULL); ++ } ++ } ++#else ++#error You forgot to include Panfrost ++#endif ++ + return screen ? debug_screen_wrap(screen) : NULL; + } + +diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build +index 8d6317292e9..58634b46c40 100644 +--- a/src/gallium/drivers/panfrost/meson.build ++++ b/src/gallium/drivers/panfrost/meson.build +@@ -44,6 +44,7 @@ panfrost_includes = [ + inc_include, + inc_src, + inc_panfrost, ++ inc_panfrost_hw, + ] + + compile_args_panfrost = [ +@@ -51,7 +52,7 @@ compile_args_panfrost = [ + '-Wno-pointer-arith' + ] + +-panfrost_versions = ['4', '5', '6', '7', '9'] ++panfrost_versions = ['4', '5', '6', '7', '9', '10'] + libpanfrost_versions = [] + + foreach ver : panfrost_versions +diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c +index eda56974409..227b6550d19 100644 +--- a/src/gallium/drivers/panfrost/pan_cmdstream.c ++++ b/src/gallium/drivers/panfrost/pan_cmdstream.c +@@ -23,12 +23,15 @@ + * SOFTWARE. + */ + ++#include "dma-uapi/dma-buf.h" ++ + #include "util/macros.h" + #include "util/u_prim.h" + #include "util/u_vbuf.h" + #include "util/u_helpers.h" + #include "util/u_draw.h" + #include "util/u_memory.h" ++#include "util/u_viewport.h" + #include "pipe/p_defines.h" + #include "pipe/p_state.h" + #include "gallium/auxiliary/util/u_blend.h" +@@ -749,8 +752,8 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]); + float vp_miny = vp->translate[1] - fabsf(vp->scale[1]); + float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]); +- float minz = (vp->translate[2] - fabsf(vp->scale[2])); +- float maxz = (vp->translate[2] + fabsf(vp->scale[2])); ++ float minz, maxz; ++ util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz); + + /* Scissor to the intersection of viewport and to the scissor, clamped + * to the framebuffer */ +@@ -778,10 +781,16 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + maxx--; + maxy--; + +- batch->minimum_z = rast->depth_clip_near ? minz : -INFINITY; +- batch->maximum_z = rast->depth_clip_far ? maxz : +INFINITY; +- + #if PAN_ARCH <= 7 ++ /* Proper depth clamp support was only introduced in v9, before then ++ * all that can be done is disabling clipping by adjusting the ++ * viewport. This means that the result will be wrong for float depth ++ * buffers or non-[0, 1] depth range. 
*/ ++ if (!rast->depth_clip_near) ++ minz = -INFINITY; ++ if (!rast->depth_clip_far) ++ maxz = +INFINITY; ++ + struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT); + + pan_pack(T.cpu, VIEWPORT, cfg) { +@@ -790,19 +799,22 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + +- cfg.minimum_z = batch->minimum_z; +- cfg.maximum_z = batch->maximum_z; ++ cfg.minimum_z = minz; ++ cfg.maximum_z = maxz; + } + + return T.gpu; + #else +- pan_pack(&batch->scissor, SCISSOR, cfg) { ++ pan_pack_cs_v10(&batch->scissor, &batch->cs_vertex, SCISSOR, cfg) { + cfg.scissor_minimum_x = minx; + cfg.scissor_minimum_y = miny; + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + } + ++ batch->minimum_z = minz; ++ batch->maximum_z = maxz; ++ + return 0; + #endif + } +@@ -838,6 +850,14 @@ panfrost_emit_depth_stencil(struct panfrost_batch *batch) + cfg.depth_units = rast->base.offset_units * 2.0f; + cfg.depth_factor = rast->base.offset_scale; + cfg.depth_bias_clamp = rast->base.offset_clamp; ++ ++ if (rast->base.depth_clip_near && rast->base.depth_clip_far) { ++ cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_0_1; ++ cfg.depth_cull_enable = true; ++ } else { ++ cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_BOUNDS; ++ cfg.depth_cull_enable = false; ++ } + } + + pan_merge(dynamic, zsa->desc, DEPTH_STENCIL); +@@ -1482,9 +1502,17 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count; + struct panfrost_ptr transfer = + pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16); ++ void *sys_cpu = malloc(sys_size); ++ ++ /* Write to a shadow buffer to make pushing cheaper */ ++ struct panfrost_ptr sys_shadow = { ++ .cpu = sys_cpu, ++ .gpu = transfer.gpu, ++ }; + + /* Upload sysvals requested by the shader */ +- panfrost_upload_sysvals(batch, &transfer, ss, stage); ++ panfrost_upload_sysvals(batch, &sys_shadow, ss, stage); ++ memcpy(transfer.cpu, sys_cpu, sys_size); + + /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */ + struct panfrost_compiled_shader *shader = ctx->prog[stage]; +@@ -1527,8 +1555,10 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + if (pushed_words) + *pushed_words = ss->info.push.count; + +- if (ss->info.push.count == 0) ++ if (ss->info.push.count == 0) { ++ free(sys_cpu); + return ubos.gpu; ++ } + + /* Copy push constants required by the shader */ + struct panfrost_ptr push_transfer = +@@ -1580,13 +1610,15 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + * off to upload sysvals to a staging buffer on the CPU on the + * assumption sysvals will get pushed (TODO) */ + +- const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu : ++ const void *mapped_ubo = (src.ubo == sysval_ubo) ? 
sys_cpu : + panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); + + /* TODO: Is there any benefit to combining ranges */ + memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4); + } + ++ free(sys_cpu); ++ + return ubos.gpu; + } + +@@ -2777,6 +2809,385 @@ emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb) + return transfer.gpu; + } + ++#if PAN_ARCH >= 10 ++ ++static int ++panfrost_export_dmabuf_fence(int dmabuf) ++{ ++ struct dma_buf_export_sync_file export = { ++ .flags = DMA_BUF_SYNC_RW, ++ }; ++ ++ int err = drmIoctl(dmabuf, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &export); ++ if (err < 0) { ++ fprintf(stderr, "failed to export fence: %s\n", ++ strerror(errno)); ++ return -1; ++ } ++ ++ return export.fd; ++} ++ ++static bool ++panfrost_import_dmabuf_fence(int dmabuf, int fence) ++{ ++ struct dma_buf_import_sync_file import = { ++ .flags = DMA_BUF_SYNC_RW, ++ .fd = fence, ++ }; ++ ++ int err = drmIoctl(dmabuf, DMA_BUF_IOCTL_IMPORT_SYNC_FILE, &import); ++ if (err < 0) { ++ fprintf(stderr, "failed to import fence: %s\n", ++ strerror(errno)); ++ return false; ++ } ++ ++ return true; ++} ++ ++static uint64_t * ++panfrost_cs_ring_allocate_instrs(struct panfrost_cs *cs, unsigned count) ++{ ++ pan_command_stream c = cs->cs; ++ ++ if (c.ptr + count > c.end) { ++ assert(c.ptr <= c.end); ++ assert(c.begin + count <= c.ptr); ++ ++ /* Instructions are in a ring buffer, simply NOP out the end ++ * and start back from the start. Possibly, doing a TAILCALL ++ * straight to the start could also work. */ ++ memset(c.ptr, 0, (c.end - c.ptr) * 8); ++ c.ptr = c.begin; ++ ++ cs->offset += cs->base.size; ++ cs->cs = c; ++ } ++ ++ /* TODO: Check against the extract offset */ ++ return c.ptr + count; ++} ++ ++// TODO: Rewrite this! ++static void ++emit_csf_queue(struct panfrost_batch *batch, struct panfrost_cs *cs, ++ pan_command_stream s, struct util_dynarray *deps, ++ bool first, bool last) ++{ ++ struct panfrost_device *dev = pan_device(batch->ctx->base.screen); ++ ++ assert(s.ptr <= s.end); ++ ++ bool fragment = (cs->hw_resources & 2); ++ bool vertex = (cs->hw_resources & 12); /* TILER | IDVS */ ++ ++ uint64_t *limit = panfrost_cs_ring_allocate_instrs(cs, ++ 128 + util_dynarray_num_elements(deps, struct panfrost_usage) * 4); ++ ++ pan_command_stream *c = &cs->cs; ++ ++ /* First, do some waiting at the start of the job */ ++ ++ pan_emit_cs_32(c, 0x54, *cs->base.latest_flush); ++ // TODO genxmlify ++ pan_emit_cs_ins(c, 0x24, 0x540000000233ULL); ++ // TODO: What does this need to be? ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 0xff; } ++ ++ /* For the first job in the batch, wait on dependencies */ ++ // TODO: Usually the vertex job shouldn't have to wait for dmabufs! ++ if (first) { ++ mali_ptr seqnum_ptr_base = dev->mali.event_mem.gpu; ++ ++ util_dynarray_foreach(deps, struct panfrost_usage, u) { ++ /* Note the multiplication in the call to ++ * cs_ring_allocate_instrs. pan_emit_cs_64 might be ++ * split, so the total is four instructions. */ ++ pan_emit_cs_48(c, 0x42, seqnum_ptr_base + ++ u->queue * PAN_EVENT_SIZE); ++ pan_emit_cs_64(c, 0x40, u->seqnum); ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.no_error = true; ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ } ++ ++ uint64_t kcpu_seqnum = ++cs->kcpu_seqnum; ++ ++ util_dynarray_foreach(&batch->dmabufs, int, fd) { ++ int fence = panfrost_export_dmabuf_fence(*fd); ++ ++ /* TODO: poll on the dma-buf? 
*/ ++ if (fence == -1) ++ continue; ++ ++ // TODO: What if we reach the limit for number of KCPU ++ // commands in a queue? It's pretty low (256) ++ dev->mali.kcpu_fence_import(&dev->mali, cs->base.ctx, ++ fence); ++ ++ close(fence); ++ } ++ ++ bool ret = dev->mali.kcpu_cqs_set(&dev->mali, cs->base.ctx, ++ cs->kcpu_event_ptr, kcpu_seqnum + 1); ++ ++ if (ret) { ++ /* If we don't set no_error, kbase might decide to ++ * pass on errors from waiting for fences. */ ++ pan_emit_cs_48(c, 0x42, cs->kcpu_event_ptr); ++ pan_emit_cs_64(c, 0x40, kcpu_seqnum); ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.no_error = true; ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ } ++ } ++ ++ /* Fragment jobs need to wait for the vertex job */ ++ if (fragment && !first) { ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x4e; ++ cfg.addr = 0x4c; ++ } ++ } ++ ++ if (vertex) { ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 3; } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 3; } ++ pan_pack_ins(c, CS_HEAPINC, cfg) { ++ cfg.type = MALI_HEAP_STATISTIC_V_T_START; ++ } ++ } else if (fragment) { ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 4; } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 4; } ++ } ++ ++ // copying to the main buffer can make debugging easier. ++ // TODO: This needs to be more reliable. ++#if 0 ++ unsigned length = (s.ptr - s.begin) * 8; ++ unsigned clamped = MIN2(length, cs->bo->ptr.cpu + cs->bo->size - (void *)c->ptr); ++ memcpy(c->ptr, s->begin, clamped); ++ c->ptr += clamped / 8; ++ ++ if (clamped != length) { ++ unsigned rest = length - clamped; ++ c->ptr = cs->bo->ptr.cpu; ++ memcpy(c->ptr, s->begin, rest); ++ c->ptr += rest / 8; ++ ++ cs->offset += cs->bo->size; ++ } ++#else ++ ++ pan_emit_cs_48(c, 0x48, s.gpu); ++ pan_emit_cs_32(c, 0x4a, (s.ptr - s.begin) * 8); ++ pan_pack_ins(c, CS_CALL, cfg) { cfg.address = 0x48; cfg.length = 0x4a; } ++#endif ++ ++ if (vertex) { ++ pan_pack_ins(c, CS_FLUSH_TILER, _) { } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 3; } ++ pan_pack_ins(c, CS_HEAPINC, cfg) { ++ cfg.type = MALI_HEAP_STATISTIC_V_T_END; ++ } ++ } ++ ++ if (fragment) { ++ /* Skip the next operation if the batch doesn't use a tiler ++ * heap (i.e. 
it's just a blit) */ ++ pan_emit_cs_ins(c, 22, 0x560030000001); /* b.ne w56, skip 1 */ ++ pan_emit_cs_ins(c, 22, 0x570020000007); /* b.eq w57, skip 7 */ ++ ++ pan_pack_ins(c, CS_LDR, cfg) { ++ cfg.offset = 4 * 10; /* Heap Start */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ pan_pack_ins(c, CS_LDR, cfg) { ++ cfg.offset = 4 * 12; /* Heap End */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4c; ++ } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = (1 << 0) | (1 << 3); } ++ ++ pan_pack_ins(c, CS_HEAPCLEAR, cfg) { ++ cfg.start = 0x4a; ++ cfg.end = 0x4c; ++ cfg.slots = 1 << 3; ++ } ++ ++ /* Reset the fields so that the clear operation isn't done again */ ++ pan_emit_cs_48(c, 0x4a, 0); ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.offset = 4 * 10; /* Heap Start */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.offset = 4 * 12; /* Heap End */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ ++ /* Branch target for above branch */ ++ ++ // This seems to be done by the HEAPCLEAR ++ //pan_pack_ins(c, CS_HEAPINC, cfg) { ++ // cfg.type = MALI_HEAP_STATISTIC_FRAGMENT_END; ++ //} ++ } ++ ++ if (fragment) { ++ pan_emit_cs_32(c, 0x54, 0); ++ pan_emit_cs_ins(c, 0x24, 0x2540000f80211); ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 1; } ++ } ++ ++ { ++ // This could I think be optimised to 0xf80211 rather than 0x233 ++ // TODO: Does this need to run for vertex jobs? ++ // What about when doing transform feedback? ++ // I think we at least need it for compute? ++ ++ //pan_emit_cs_32(c, 0x54, 0); ++ //pan_emit_cs_ins(c, 0x24, 0x540000000233ULL); ++ } ++ ++ if (last) { ++ uint64_t kcpu_seqnum = ++cs->kcpu_seqnum; ++ ++ pan_emit_cs_64(c, 0x40, kcpu_seqnum + 1); ++ pan_emit_cs_48(c, 0x42, cs->kcpu_event_ptr); ++ pan_pack_ins(c, CS_EVSTR_64, cfg) { ++ /* This is the scoreboard mask, right?.. */ ++ cfg.unk_2 = (3 << 3); ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ ++ dev->mali.kcpu_cqs_wait(&dev->mali, cs->base.ctx, ++ cs->kcpu_event_ptr, kcpu_seqnum); ++ ++ int fence = dev->mali.kcpu_fence_export(&dev->mali, cs->base.ctx); ++ ++ if (fence != -1) { ++ util_dynarray_foreach(&batch->dmabufs, int, fd) { ++ panfrost_import_dmabuf_fence(*fd, fence); ++ } ++ } ++ ++ close(fence); ++ } ++ ++ pan_emit_cs_48(c, 0x48, cs->event_ptr); ++ pan_emit_cs_64(c, 0x4a, cs->seqnum + 1); ++ pan_pack_ins(c, CS_EVSTR_64, cfg) { ++ /* This is the scoreboard mask, right?.. */ ++ cfg.unk_2 = (3 << 3); ++ cfg.value = 0x4a; ++ cfg.addr = 0x48; ++ } ++ ++ // TODO: is this just a weird ddk thing, or is it required? ++ // Probably it just lessens the WC impact ++ while ((uintptr_t)c->ptr & 63) ++ pan_emit_cs_ins(c, 0, 0); ++ ++ assert(c->ptr <= limit); ++} ++ ++static void ++emit_csf_toplevel(struct panfrost_batch *batch) ++{ ++ pan_command_stream *cv = &batch->ctx->kbase_cs_vertex.cs; ++ pan_command_stream *cf = &batch->ctx->kbase_cs_fragment.cs; ++ ++ pan_command_stream v = batch->cs_vertex; ++ pan_command_stream f = batch->cs_fragment; ++ ++ if (batch->cs_vertex_last_size) { ++ assert(v.ptr <= v.end); ++ *batch->cs_vertex_last_size = (v.ptr - v.begin) * 8; ++ v = batch->cs_vertex_first; ++ } ++ ++ bool vert = (v.ptr != v.begin); ++ bool frag = (f.ptr != f.begin); ++ ++ // TODO: Clean up control-flow? 
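The ring handling in panfrost_cs_ring_allocate_instrs above comes down to: when a reservation would run past the end of the buffer, fill the tail with NOP (zero) instructions, wrap the write pointer back to the start, and bump a running offset so positions derived from it keep increasing; panfrost_batch_submit_csf later computes the submission offset as cs->offset plus the write pointer's distance from the start of the BO. Below is a minimal stand-alone model of that scheme, assuming 8-byte instructions in a CPU-visible buffer; the struct and function names are illustrative, not the driver's.

#include <assert.h>
#include <stdint.h>
#include <string.h>

struct cs_ring {
   uint64_t *begin, *end, *ptr;  /* instructions live in [begin, end) */
   uint64_t offset;              /* grows by the ring size at every wrap */
};

/* Reserve `count` instructions, wrapping to the start of the ring if the
 * request would not fit before `end`. Returns the first reserved slot. */
static uint64_t *
cs_ring_reserve(struct cs_ring *r, unsigned count)
{
   if (r->ptr + count > r->end) {
      /* Must not catch up with instructions the GPU has not executed yet;
       * the real code still has a TODO to check the extract offset. */
      assert(r->begin + count <= r->ptr);

      /* Zero (NOP) the tail and start again from the beginning. */
      memset(r->ptr, 0, (r->end - r->ptr) * sizeof(*r->ptr));
      r->offset += (r->end - r->begin) * sizeof(*r->ptr);
      r->ptr = r->begin;
   }

   uint64_t *out = r->ptr;
   r->ptr += count;
   return out;
}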
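panfrost_export_dmabuf_fence and panfrost_import_dmabuf_fence above bridge implicit dma-buf fencing and the CSF queues through the sync_file export/import ioctls (the patch includes its own dma-uapi/dma-buf.h at the top of pan_cmdstream.c, presumably so the build does not depend on recent kernel headers). For the "TODO: poll on the dma-buf?" note, one option is to export a sync_file and poll() it, since a sync_file fd reports POLLIN once every fence it contains has signalled. A hedged sketch, assuming <linux/dma-buf.h> defines DMA_BUF_IOCTL_EXPORT_SYNC_FILE; dmabuf_wait_idle is a hypothetical helper, not part of the patch:

#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dma-buf.h>

/* Wait (with a timeout) for all pending GPU access to a dma-buf by
 * exporting its implicit fences as a sync_file and polling that fd.
 * Returns 0 when idle, -1 on error or timeout. Illustrative only. */
static int
dmabuf_wait_idle(int dmabuf_fd, int timeout_ms)
{
   struct dma_buf_export_sync_file exp = { .flags = DMA_BUF_SYNC_RW };
   int ret;

   do {
      ret = ioctl(dmabuf_fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &exp);
   } while (ret == -1 && (errno == EINTR || errno == EAGAIN));

   if (ret == -1) {
      fprintf(stderr, "EXPORT_SYNC_FILE: %s\n", strerror(errno));
      return -1;
   }

   struct pollfd p = { .fd = exp.fd, .events = POLLIN };
   do {
      ret = poll(&p, 1, timeout_ms);
   } while (ret == -1 && errno == EINTR);

   close(exp.fd);
   return ret > 0 ? 0 : -1;
}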
++ ++ if (vert) { ++ pan_emit_cs_48(cv, 0x48, batch->ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(cv, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ ++ emit_csf_queue(batch, &batch->ctx->kbase_cs_vertex, v, ++ &batch->vert_deps, true, !frag); ++ } ++ ++ if (!frag) ++ return; ++ ++ pan_emit_cs_48(cf, 0x48, batch->ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(cf, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ ++ uint64_t vertex_seqnum = batch->ctx->kbase_cs_vertex.seqnum; ++ // TODO: this assumes SAME_VA ++ mali_ptr seqnum_ptr = (uintptr_t) batch->ctx->kbase_cs_vertex.event_ptr; ++ ++ pan_emit_cs_48(cf, 0x4c, seqnum_ptr); ++ pan_emit_cs_64(cf, 0x4e, vertex_seqnum); ++ ++ // What does this instruction do? ++ //pan_emit_cs_32(cf, 0x54, 0); ++ //pan_emit_cs_ins(cf, 0x24, 0x540000000200); ++ ++ assert(vert || batch->tiler_ctx.bifrost == 0); ++ pan_emit_cs_48(cf, 0x56, batch->tiler_ctx.bifrost); ++ ++ emit_csf_queue(batch, &batch->ctx->kbase_cs_fragment, f, ++ &batch->frag_deps, !vert, true); ++} ++ ++static void ++init_cs(struct panfrost_context *ctx, struct panfrost_cs *cs) ++{ ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ pan_command_stream *c = &cs->cs; ++ ++ cs->seqnum = 0; ++ ++ cs->offset = 0; ++ c->ptr = cs->bo->ptr.cpu; ++ c->begin = cs->bo->ptr.cpu; ++ c->end = cs->bo->ptr.cpu + cs->base.size; ++ c->gpu = cs->bo->ptr.gpu; ++ ++ // eight instructions == 64 bytes ++ pan_pack_ins(c, CS_RESOURCES, cfg) { cfg.mask = cs->hw_resources; } ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 2; } ++ pan_emit_cs_48(c, 0x48, ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(c, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ for (unsigned i = 0; i < 4; ++i) ++ pan_pack_ins(c, CS_NOP, _); ++ ++ dev->mali.cs_submit(&dev->mali, &cs->base, 64, NULL, 0); ++ //dev->mali.cs_wait(&dev->mali, &cs->base, 64); ++} ++ ++#endif ++ + #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c; + + static uint8_t +@@ -2904,14 +3315,14 @@ panfrost_draw_emit_vertex(struct panfrost_batch *batch, + #endif + + static void +-panfrost_emit_primitive_size(struct panfrost_context *ctx, ++panfrost_emit_primitive_size(struct panfrost_batch *batch, + bool points, mali_ptr size_array, + void *prim_size) + { +- struct panfrost_rasterizer *rast = ctx->rasterizer; ++ struct panfrost_rasterizer *rast = batch->ctx->rasterizer; + +- pan_pack(prim_size, PRIMITIVE_SIZE, cfg) { +- if (panfrost_writes_point_size(ctx)) { ++ pan_pack_cs_v10(prim_size, &batch->cs_vertex, PRIMITIVE_SIZE, cfg) { ++ if (panfrost_writes_point_size(batch->ctx)) { + cfg.size_array = size_array; + } else { + cfg.constant = points ? 
+@@ -3037,6 +3448,43 @@ panfrost_update_state_3d(struct panfrost_batch *batch) + } + + #if PAN_ARCH >= 6 ++ ++#if PAN_ARCH >= 10 ++static mali_ptr ++panfrost_get_tiler_heap_desc(struct panfrost_batch *batch) ++{ ++ struct panfrost_context *ctx = batch->ctx; ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ ++ if (ctx->tiler_heap_desc) ++ return ctx->tiler_heap_desc->ptr.gpu; ++ ++ ctx->tiler_heap_desc = panfrost_bo_create(dev, 4096, 0, "Tiler heap descriptor"); ++ ++ pan_pack(ctx->tiler_heap_desc->ptr.cpu, TILER_HEAP, heap) { ++ heap.size = ctx->kbase_ctx->tiler_heap_chunk_size; ++ heap.base = ctx->kbase_ctx->tiler_heap_header; ++ heap.bottom = heap.base + 64; ++ heap.top = heap.base + heap.size; ++ } ++ ++ return ctx->tiler_heap_desc->ptr.gpu; ++} ++#else ++static mali_ptr ++panfrost_get_tiler_heap_desc(struct panfrost_batch *batch) ++{ ++ struct panfrost_device *dev = pan_device(batch->ctx->base.screen); ++ ++ struct panfrost_ptr t = ++ pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); ++ ++ GENX(pan_emit_tiler_heap)(dev, t.cpu); ++ ++ return t.gpu; ++} ++#endif ++ + static mali_ptr + panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count) + { +@@ -3048,18 +3496,32 @@ panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_c + if (batch->tiler_ctx.bifrost) + return batch->tiler_ctx.bifrost; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); ++ mali_ptr heap = panfrost_get_tiler_heap_desc(batch); + +- GENX(pan_emit_tiler_heap)(dev, t.cpu); ++ mali_ptr scratch = 0; ++ ++#if PAN_ARCH >= 10 ++ // TODO: Dynamically size? ++ unsigned scratch_bits = 16; ++ ++ /* Allocate scratch space for vertex positions / point sizes */ ++ // TODO: Should this be shared? ++ struct panfrost_ptr sc = ++ pan_pool_alloc_aligned(&batch->pool.base, 1 << scratch_bits, 4096); ++ ++ /* I think the scratch size is passed in the low bits of the ++ * pointer... but trying to go above 16 gives a CS_INHERIT_FAULT. 
++ */ ++ scratch = sc.gpu + scratch_bits; ++#endif + +- mali_ptr heap = t.gpu; ++ struct panfrost_ptr t = ++ pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); + +- t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); + GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height, + util_framebuffer_get_num_samples(&batch->key), + pan_tristate_get(batch->first_provoking_vertex), +- heap, t.cpu); ++ heap, scratch, t.cpu); + + batch->tiler_ctx.bifrost = t.gpu; + return batch->tiler_ctx.bifrost; +@@ -3070,18 +3532,19 @@ panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_c + * jobs and Valhall IDVS jobs + */ + static void +-panfrost_emit_primitive(struct panfrost_context *ctx, ++panfrost_emit_primitive(struct panfrost_batch *batch, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw, + mali_ptr indices, bool secondary_shader, void *out) + { +- UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; ++ struct panfrost_context *ctx = batch->ctx; ++ struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + + bool lines = (info->mode == PIPE_PRIM_LINES || + info->mode == PIPE_PRIM_LINE_LOOP || + info->mode == PIPE_PRIM_LINE_STRIP); + +- pan_pack(out, PRIMITIVE, cfg) { ++ pan_pack_cs_v10(out, &batch->cs_vertex, PRIMITIVE, cfg) { + cfg.draw_mode = pan_draw_mode(info->mode); + if (panfrost_writes_point_size(ctx)) + cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16; +@@ -3113,12 +3576,20 @@ panfrost_emit_primitive(struct panfrost_context *ctx, + + /* Non-fixed restart indices should have been lowered */ + assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info)); ++ ++ /* TODO: This is in a hot function, optimise? */ ++ if (ctx->pipe_viewport.scale[2] > 0) { ++ cfg.low_depth_cull = rast->depth_clip_near; ++ cfg.high_depth_cull = rast->depth_clip_far; ++ } else { ++ cfg.low_depth_cull = rast->depth_clip_far; ++ cfg.high_depth_cull = rast->depth_clip_near; ++ } + #endif + + cfg.index_count = ctx->indirect_draw ? 1 : draw->count; + cfg.index_type = panfrost_translate_index_size(info->index_size); + +- + if (PAN_ARCH >= 9) { + /* Base vertex offset on Valhall is used for both + * indexed and non-indexed draws, in a simple way for +@@ -3240,7 +3711,7 @@ panfrost_emit_draw(void *out, + struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + bool polygon = (prim == PIPE_PRIM_TRIANGLES); + +- pan_pack(out, DRAW, cfg) { ++ pan_pack_cs_v10(out, &batch->cs_vertex, DRAW, cfg) { + /* + * From the Gallium documentation, + * pipe_rasterizer_state::cull_face "indicates which faces of +@@ -3270,6 +3741,7 @@ panfrost_emit_draw(void *out, + ctx->prog[PIPE_SHADER_FRAGMENT]; + + cfg.multisample_enable = rast->multisample; ++ + cfg.sample_mask = rast->multisample ? 
ctx->sample_mask : 0xFFFF; + + /* Use per-sample shading if required by API Also use it when a +@@ -3283,7 +3755,10 @@ panfrost_emit_draw(void *out, + + cfg.single_sampled_lines = !rast->multisample; + ++ /* This is filled in by hardware on v10 */ ++#if PAN_ARCH < 10 + cfg.vertex_array.packet = true; ++#endif + + cfg.minimum_z = batch->minimum_z; + cfg.maximum_z = batch->maximum_z; +@@ -3411,14 +3886,18 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + */ + secondary_shader &= fs_required; + +- panfrost_emit_primitive(ctx, info, draw, 0, secondary_shader, ++#if PAN_ARCH < 10 ++ panfrost_emit_primitive(batch, info, draw, 0, secondary_shader, + pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE)); ++#else ++ panfrost_emit_primitive(batch, info, draw, 0, secondary_shader, job); ++#endif + +- pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { + cfg.count = info->instance_count; + } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { + if (secondary_shader) { + unsigned v = vs->info.varyings.output_count; + unsigned f = fs->info.varyings.input_count; +@@ -3427,34 +3906,45 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + unsigned size = slots * 16; + + /* Assumes 16 byte slots. We could do better. */ ++#if PAN_ARCH < 10 + cfg.vertex_packet_stride = size + 16; ++#endif + cfg.vertex_attribute_stride = size; + } else { + /* Hardware requirement for "no varyings" */ ++#if PAN_ARCH < 10 + cfg.vertex_packet_stride = 16; ++#endif + cfg.vertex_attribute_stride = 0; + } + } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, TILER, cfg) { + cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); + } + ++ /* For v10, the scissor is emitted directly by ++ * panfrost_emit_viewport */ ++#if PAN_ARCH < 10 + STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR)); + memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), + &batch->scissor, pan_size(SCISSOR)); ++#endif + +- panfrost_emit_primitive_size(ctx, info->mode == PIPE_PRIM_POINTS, 0, ++ panfrost_emit_primitive_size(batch, info->mode == PIPE_PRIM_POINTS, 0, + pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE)); + +- pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, INDICES, cfg) { + cfg.address = indices; ++#if PAN_ARCH >= 10 ++ cfg.size = draw->count * info->index_size; ++#endif + } + + panfrost_emit_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), + batch, fs_required, u_reduced_prim(info->mode), 0, 0, 0); + +- pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, POSITION, cfg) { + /* IDVS/points vertex shader */ + mali_ptr vs_ptr = batch->rsd[PIPE_SHADER_VERTEX]; + +@@ -3464,20 +3954,21 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + + panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, vs_ptr, + batch->tls.gpu); +- } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) { +- /* If a varying shader is used, we configure it with the same +- * state as the position shader for backwards compatible +- * behaviour with Bifrost. This could be optimized. 
+- */ +- if (!secondary_shader) continue; ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, VARYING, vary) { ++ /* If a varying shader is used, we configure it with the same ++ * state as the position shader for backwards compatible ++ * behaviour with Bifrost. This could be optimized. ++ */ ++ if (!secondary_shader) continue; + +- mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] + ++ mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] + + (2 * pan_size(SHADER_PROGRAM)); + +- panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, +- ptr, batch->tls.gpu); ++ vary.shader = ptr; ++ ++ // TODO: Fix this function for v9! ++ } + } + } + #endif +@@ -3492,12 +3983,10 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch, + mali_ptr pos, mali_ptr psiz, bool secondary_shader, + void *job) + { +- struct panfrost_context *ctx = batch->ctx; +- + void *section = pan_section_ptr(job, TILER_JOB, INVOCATION); + memcpy(section, invocation_template, pan_size(INVOCATION)); + +- panfrost_emit_primitive(ctx, info, draw, indices, secondary_shader, ++ panfrost_emit_primitive(batch, info, draw, indices, secondary_shader, + pan_section_ptr(job, TILER_JOB, PRIMITIVE)); + + void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE); +@@ -3514,7 +4003,7 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch, + panfrost_emit_draw(pan_section_ptr(job, TILER_JOB, DRAW), + batch, true, prim, pos, fs_vary, varyings); + +- panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size); ++ panfrost_emit_primitive_size(batch, prim == PIPE_PRIM_POINTS, psiz, prim_size); + } + #endif + +@@ -3526,8 +4015,8 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + { + struct panfrost_context *ctx = batch->ctx; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); ++ UNUSED struct panfrost_ptr t = ++ pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); + + /* Nothing to do */ + if (batch->ctx->streamout.num_targets == 0) +@@ -3556,7 +4045,7 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->rsd[PIPE_SHADER_VERTEX] = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX); + + #if PAN_ARCH >= 9 +- pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { ++ pan_section_pack_cs_v10(t.cpu, &batch->cs_vertex, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; +@@ -3569,15 +4058,20 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->rsd[PIPE_SHADER_VERTEX], + batch->tls.gpu); + ++#if PAN_ARCH < 10 + /* TODO: Indexing. Also, this is a legacy feature... */ + cfg.compute.attribute_offset = batch->ctx->offset_start; ++#endif + + /* Transform feedback shaders do not use barriers or shared + * memory, so we may merge workgroups. + */ + cfg.allow_merging_workgroups = true; ++ ++#if PAN_ARCH < 10 + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; ++#endif + } + #else + struct mali_invocation_packed invocation; +@@ -3593,12 +4087,20 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0, + attribs, attrib_bufs, t.cpu); + #endif ++#if PAN_ARCH >= 10 ++ // TODO: Use a seperate compute queue? 
++ pan_pack_ins(&batch->cs_vertex, COMPUTE_LAUNCH, cfg) { ++ // TODO v10: Set parameters ++ } ++ batch->scoreboard.first_job = 1; ++#else + enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE; + #if PAN_ARCH <= 5 + job_type = MALI_JOB_TYPE_VERTEX; + #endif + panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type, + true, false, 0, 0, &t, false); ++#endif + + ctx->uncompiled[PIPE_SHADER_VERTEX] = vs_uncompiled; + ctx->prog[PIPE_SHADER_VERTEX] = vs; +@@ -3607,6 +4109,54 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push; + } + ++#if PAN_ARCH >= 10 ++static pan_command_stream ++panfrost_batch_create_cs(struct panfrost_batch *batch, unsigned count) ++{ ++ struct panfrost_ptr cs = pan_pool_alloc_aligned(&batch->pool.base, count * 8, 64); ++ ++ return (pan_command_stream) { ++ .ptr = cs.cpu, ++ .begin = cs.cpu, ++ .end = cs.cpu + count, ++ .gpu = cs.gpu, ++ }; ++} ++ ++static uint64_t * ++panfrost_cs_vertex_allocate_instrs(struct panfrost_batch *batch, unsigned count) ++{ ++ /* Doing a tail call to another buffer takes three instructions */ ++ count += 3; ++ ++ pan_command_stream v = batch->cs_vertex; ++ ++ if (v.ptr + count > v.end) { ++ batch->cs_vertex = panfrost_batch_create_cs(batch, MAX2(count, 1 << 13)); ++ ++ /* The size will be filled in later. */ ++ uint32_t *last_size = (uint32_t *)v.ptr; ++ pan_emit_cs_32(&v, 0x5e, 0); ++ ++ pan_emit_cs_48(&v, 0x5c, batch->cs_vertex.gpu); ++ pan_pack_ins(&v, CS_TAILCALL, cfg) { cfg.address = 0x5c; cfg.length = 0x5e; } ++ ++ assert(v.ptr <= v.end); ++ ++ /* This is not strictly required, but makes disassembly look ++ * nicer */ ++ if (batch->cs_vertex_last_size) ++ *batch->cs_vertex_last_size = (v.ptr - v.begin) * 8; ++ ++ batch->cs_vertex_last_size = last_size; ++ if (!batch->cs_vertex_first.gpu) ++ batch->cs_vertex_first = v; ++ } ++ ++ return batch->cs_vertex.ptr + count; ++} ++#endif ++ + static void + panfrost_direct_draw(struct panfrost_batch *batch, + const struct pipe_draw_info *info, +@@ -3618,6 +4168,11 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + struct panfrost_context *ctx = batch->ctx; + ++#if PAN_ARCH >= 10 ++ /* TODO: We don't need quite so much space */ ++ uint64_t *limit = panfrost_cs_vertex_allocate_instrs(batch, 64); ++#endif ++ + /* If we change whether we're drawing points, or whether point sprites + * are enabled (specified in the rasterizer), we may need to rebind + * shaders accordingly. 
This implicitly covers the case of rebinding +@@ -3647,18 +4202,19 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + UNUSED struct panfrost_ptr tiler, vertex; + +- if (idvs) { + #if PAN_ARCH >= 9 +- tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB); +-#elif PAN_ARCH >= 6 ++ tiler = pan_pool_alloc_desc_cs_v10(&batch->pool.base, MALLOC_VERTEX_JOB); ++#else /* PAN_ARCH < 9 */ ++ if (idvs) { ++#if PAN_ARCH >= 6 + tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB); +-#else +- unreachable("IDVS is unsupported on Midgard"); + #endif ++ unreachable("IDVS is unsupported on Midgard"); + } else { +- vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); +- tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB); ++ vertex = pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); ++ tiler = pan_pool_alloc_desc_cs_v10(&batch->pool.base, TILER_JOB); + } ++#endif /* PAN_ARCH */ + + unsigned vertex_count = ctx->vertex_count; + +@@ -3726,7 +4282,7 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + mali_ptr attribs, attrib_bufs; + attribs = panfrost_emit_vertex_data(batch, &attrib_bufs); +-#endif ++#endif /* PAN_ARCH <= 7 */ + + panfrost_update_state_3d(batch); + panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX); +@@ -3752,13 +4308,25 @@ panfrost_direct_draw(struct panfrost_batch *batch, + #if PAN_ARCH >= 9 + assert(idvs && "Memory allocated IDVS required on Valhall"); + +- panfrost_emit_malloc_vertex(batch, info, draw, indices, +- secondary_shader, tiler.cpu); ++ panfrost_emit_malloc_vertex(batch, info, draw, indices, secondary_shader, tiler.cpu); + ++#if PAN_ARCH >= 10 ++ pan_pack_ins(&batch->cs_vertex, IDVS_LAUNCH, _); ++ /* TODO: Find a better way to specify that there were jobs */ ++ batch->scoreboard.first_job = 1; ++ batch->scoreboard.first_tiler = NULL + 1; ++ ++ /* Make sure we didn't use more CS instructions than we allocated ++ * space for */ ++ assert(batch->cs_vertex.ptr <= limit); ++ ++#else /* PAN_ARCH < 10 */ + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0, + 0, &tiler, false); +-#else ++#endif ++#else /* PAN_ARCH < 9 */ ++ + /* Fire off the draw itself */ + panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices, + fs_vary, varyings, pos, psiz, secondary_shader, +@@ -3773,7 +4341,7 @@ panfrost_direct_draw(struct panfrost_batch *batch, + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_INDEXED_VERTEX, false, false, + 0, 0, &tiler, false); +-#endif ++#endif /* PAN_ARCH < 6 */ + } else { + panfrost_draw_emit_vertex(batch, info, &invocation, + vs_vary, varyings, attribs, attrib_bufs, vertex.cpu); +@@ -4102,8 +4670,8 @@ panfrost_launch_grid(struct pipe_context *pipe, + + ctx->compute_grid = info; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); ++ UNUSED struct panfrost_ptr t = ++ pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); + + /* Invoke according to the grid info */ + +@@ -4143,7 +4711,7 @@ panfrost_launch_grid(struct pipe_context *pipe, + #else + struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE]; + +- pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { ++ pan_section_pack_cs_v10(t.cpu, &batch->cs_vertex, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = info->block[0]; + cfg.workgroup_size_y = info->block[1]; + cfg.workgroup_size_z = info->block[2]; +@@ -4166,12 +4734,14 @@ panfrost_launch_grid(struct pipe_context *pipe, + cs->info.cs.allow_merging_workgroups && + 
(info->variable_shared_mem == 0); + ++#if PAN_ARCH < 10 + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; ++#endif + } + #endif + +- unsigned indirect_dep = 0; ++ UNUSED unsigned indirect_dep = 0; // TODO v10 (unused) + #if PAN_GPU_INDIRECTS + if (info->indirect) { + struct pan_indirect_dispatch_info indirect = { +@@ -4191,9 +4761,17 @@ panfrost_launch_grid(struct pipe_context *pipe, + } + #endif + ++#if PAN_ARCH >= 10 ++ pan_pack_ins(&batch->cs_vertex, COMPUTE_LAUNCH, cfg) { ++ /* TODO: Change this as needed */ ++ cfg.unk_1 = 512; ++ } ++ batch->scoreboard.first_job = 1; ++#else + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_COMPUTE, true, false, + indirect_dep, 0, &t, false); ++#endif + panfrost_flush_all_batches(ctx, "Launch grid post-barrier"); + } + +@@ -4453,6 +5031,30 @@ panfrost_create_sampler_view( + return (struct pipe_sampler_view *) so; + } + ++static void ++panfrost_init_logicop_blend_state(struct panfrost_blend_state *so) ++{ ++ for (unsigned c = 0; c < so->pan.rt_count; ++c) { ++ unsigned g = so->base.independent_blend_enable ? c : 0; ++ const struct pipe_rt_blend_state pipe = so->base.rt[g]; ++ ++ struct pan_blend_equation equation = {0}; ++ ++ equation.color_mask = pipe.colormask; ++ equation.blend_enable = false; ++ ++ so->info[c] = (struct pan_blend_info) { ++ .enabled = (pipe.colormask != 0), ++ .load_dest = true, ++ .fixed_function = false, ++ }; ++ ++ so->pan.rts[c].equation = equation; ++ ++ so->load_dest_mask |= BITFIELD_BIT(c); ++ } ++} ++ + /* A given Gallium blend state can be encoded to the hardware in numerous, + * dramatically divergent ways due to the interactions of blending with + * framebuffer formats. Conceptually, there are two modes: +@@ -4492,6 +5094,11 @@ panfrost_create_blend_state(struct pipe_context *pipe, + so->pan.logicop_func = blend->logicop_func; + so->pan.rt_count = blend->max_rt + 1; + ++ if (blend->logicop_enable) { ++ panfrost_init_logicop_blend_state(so); ++ return so; ++ } ++ + for (unsigned c = 0; c < so->pan.rt_count; ++c) { + unsigned g = blend->independent_blend_enable ? c : 0; + const struct pipe_rt_blend_state pipe = blend->rt[g]; +@@ -4521,12 +5128,10 @@ panfrost_create_blend_state(struct pipe_context *pipe, + .opaque = pan_blend_is_opaque(equation), + .constant_mask = constant_mask, + +- /* TODO: check the dest for the logicop */ +- .load_dest = blend->logicop_enable || +- pan_blend_reads_dest(equation), ++ .load_dest = pan_blend_reads_dest(equation), + + /* Could this possibly be fixed-function? */ +- .fixed_function = !blend->logicop_enable && ++ .fixed_function = + pan_blend_can_fixed_function(equation, + supports_2src) && + (!constant_mask || +@@ -4612,10 +5217,12 @@ prepare_shader(struct panfrost_compiled_shader *state, + + state->state = panfrost_pool_take_ref(pool, ptr.gpu); + ++ // TODO: Why set primary_shader to false again? 
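The logic-op path added above (panfrost_init_logicop_blend_state plus the early return in panfrost_create_blend_state) replaces the old "load_dest = blend->logicop_enable || ..." special-casing: with a logic op bound, blending is never fixed-function and the destination tile is always read back, with only the colour mask deciding whether the render target is written at all. A condensed sketch of that per-render-target decision, with simplified types and illustrative names:

#include <stdbool.h>
#include <stdint.h>

struct rt_blend_info {
   bool enabled;        /* is anything written to this render target? */
   bool load_dest;      /* does the shader have to read back the tile? */
   bool fixed_function; /* can the blend unit handle it on its own? */
};

/* Logic ops combine the fragment and the destination bitwise, so the
 * destination is always loaded and the blend equation stays disabled. */
static struct rt_blend_info
logicop_rt_info(uint8_t colormask)
{
   return (struct rt_blend_info) {
      .enabled = colormask != 0,
      .load_dest = true,
      .fixed_function = false,
   };
}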
++ + /* Generic, or IDVS/points */ + pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) { + cfg.stage = pan_shader_stage(&state->info); +- cfg.primary_shader = true; ++ cfg.primary_shader = false; + cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); + cfg.binary = state->bin.gpu; + cfg.preload.r48_r63 = (state->info.preload >> 48); +@@ -4631,7 +5238,7 @@ prepare_shader(struct panfrost_compiled_shader *state, + /* IDVS/triangles */ + pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) { + cfg.stage = pan_shader_stage(&state->info); +- cfg.primary_shader = true; ++ cfg.primary_shader = false; + cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); + cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset; + cfg.preload.r48_r63 = (state->info.preload >> 48); +@@ -4707,6 +5314,11 @@ init_batch(struct panfrost_batch *batch) + /* On Midgard, the TLS is embedded in the FB descriptor */ + batch->tls = batch->framebuffer; + #endif ++ ++#if PAN_ARCH >= 10 ++ batch->cs_vertex = panfrost_batch_create_cs(batch, 1 << 13); ++ batch->cs_fragment = panfrost_batch_create_cs(batch, 1 << 9); ++#endif + } + + static void +@@ -4821,6 +5433,10 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen) + screen->vtbl.init_polygon_list = init_polygon_list; + screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options); + screen->vtbl.compile_shader = GENX(pan_shader_compile); ++#if PAN_ARCH >= 10 ++ screen->vtbl.emit_csf_toplevel = emit_csf_toplevel; ++ screen->vtbl.init_cs = init_cs; ++#endif + + GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base, + &screen->blitter.desc_pool.base); +diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c +index 80a39a3a220..7b0f021bf47 100644 +--- a/src/gallium/drivers/panfrost/pan_context.c ++++ b/src/gallium/drivers/panfrost/pan_context.c +@@ -34,7 +34,6 @@ + + #include "util/macros.h" + #include "util/format/u_format.h" +-#include "util/libsync.h" + #include "util/u_inlines.h" + #include "util/u_upload_mgr.h" + #include "util/u_memory.h" +@@ -571,6 +570,19 @@ panfrost_destroy(struct pipe_context *pipe) + struct panfrost_context *panfrost = pan_context(pipe); + struct panfrost_device *dev = pan_device(pipe->screen); + ++ if (dev->kbase && dev->mali.context_create) { ++ dev->mali.cs_term(&dev->mali, &panfrost->kbase_cs_vertex.base); ++ dev->mali.cs_term(&dev->mali, &panfrost->kbase_cs_fragment.base); ++ ++ dev->mali.context_destroy(&dev->mali, panfrost->kbase_ctx); ++ ++ panfrost_bo_unreference(panfrost->kbase_cs_vertex.bo); ++ panfrost_bo_unreference(panfrost->kbase_cs_fragment.bo); ++ } ++ ++ if (panfrost->tiler_heap_desc) ++ panfrost_bo_unreference(panfrost->tiler_heap_desc); ++ + _mesa_hash_table_destroy(panfrost->writers, NULL); + + if (panfrost->blitter) +@@ -582,11 +594,15 @@ panfrost_destroy(struct pipe_context *pipe) + panfrost_pool_cleanup(&panfrost->descs); + panfrost_pool_cleanup(&panfrost->shaders); + +- drmSyncobjDestroy(dev->fd, panfrost->in_sync_obj); +- if (panfrost->in_sync_fd != -1) +- close(panfrost->in_sync_fd); ++ if (dev->kbase) { ++ dev->mali.syncobj_destroy(&dev->mali, panfrost->syncobj_kbase); ++ } else { ++ drmSyncobjDestroy(dev->fd, panfrost->in_sync_obj); ++ if (panfrost->in_sync_fd != -1) ++ close(panfrost->in_sync_fd); + +- drmSyncobjDestroy(dev->fd, panfrost->syncobj); ++ drmSyncobjDestroy(dev->fd, panfrost->syncobj); ++ } + ralloc_free(pipe); + } + +@@ -873,6 +889,58 @@ panfrost_create_fence_fd(struct 
pipe_context *pctx, + *pfence = panfrost_fence_from_fd(pan_context(pctx), fd, type); + } + ++struct sync_merge_data { ++ char name[32]; ++ int32_t fd2; ++ int32_t fence; ++ uint32_t flags; ++ uint32_t pad; ++}; ++ ++#define SYNC_IOC_MAGIC '>' ++#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data) ++ ++static inline int sync_merge(const char *name, int fd1, int fd2) ++{ ++ struct sync_merge_data data = {{0}}; ++ int ret; ++ ++ data.fd2 = fd2; ++ strncpy(data.name, name, sizeof(data.name)); ++ ++ do { ++ ret = ioctl(fd1, SYNC_IOC_MERGE, &data); ++ } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); ++ ++ if (ret < 0) ++ return ret; ++ ++ return data.fence; ++} ++ ++static inline int sync_accumulate(const char *name, int *fd1, int fd2) ++{ ++ int ret; ++ ++ assert(fd2 >= 0); ++ ++ if (*fd1 < 0) { ++ *fd1 = dup(fd2); ++ return 0; ++ } ++ ++ ret = sync_merge(name, *fd1, fd2); ++ if (ret < 0) { ++ /* leave *fd1 as it is */ ++ return ret; ++ } ++ ++ close(*fd1); ++ *fd1 = ret; ++ ++ return 0; ++} ++ + static void + panfrost_fence_server_sync(struct pipe_context *pctx, + struct pipe_fence_handle *f) +@@ -888,6 +956,28 @@ panfrost_fence_server_sync(struct pipe_context *pctx, + close(fd); + } + ++static struct panfrost_cs ++panfrost_cs_create(struct panfrost_context *ctx, unsigned size, unsigned mask) ++{ ++ struct panfrost_screen *screen = pan_screen(ctx->base.screen); ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ struct kbase_context *kctx = ctx->kbase_ctx; ++ ++ struct panfrost_cs c = {0}; ++ ++ c.bo = panfrost_bo_create(dev, size, 0, "Command stream"); ++ ++ c.base = dev->mali.cs_bind(&dev->mali, kctx, c.bo->ptr.gpu, size); ++ ++ c.event_ptr = dev->mali.event_mem.gpu + c.base.event_mem_offset * PAN_EVENT_SIZE; ++ c.kcpu_event_ptr = dev->mali.kcpu_event_mem.gpu + c.base.event_mem_offset * PAN_EVENT_SIZE; ++ ++ c.hw_resources = mask; ++ screen->vtbl.init_cs(ctx, &c); ++ ++ return c; ++} ++ + struct pipe_context * + panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + { +@@ -981,6 +1071,14 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + + assert(ctx->blitter); + ++ if (dev->kbase && dev->mali.context_create) ++ ctx->kbase_ctx = dev->mali.context_create(&dev->mali); ++ ++ if (dev->arch >= 10) { ++ ctx->kbase_cs_vertex = panfrost_cs_create(ctx, 65536, 13); ++ ctx->kbase_cs_fragment = panfrost_cs_create(ctx, 65536, 2); ++ } ++ + /* Prepare for render! */ + + /* By default mask everything on */ +@@ -992,13 +1090,18 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + /* Create a syncobj in a signaled state. Will be updated to point to the + * last queued job out_sync every time we submit a new job. + */ +- ret = drmSyncobjCreate(dev->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &ctx->syncobj); +- assert(!ret && ctx->syncobj); +- +- /* Sync object/FD used for NATIVE_FENCE_FD. */ +- ctx->in_sync_fd = -1; +- ret = drmSyncobjCreate(dev->fd, 0, &ctx->in_sync_obj); +- assert(!ret); ++ if (dev->kbase) { ++ ctx->syncobj_kbase = dev->mali.syncobj_create(&dev->mali); ++ ctx->in_sync_fd = -1; ++ } else { ++ ret = drmSyncobjCreate(dev->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &ctx->syncobj); ++ assert(!ret && ctx->syncobj); ++ ++ /* Sync object/FD used for NATIVE_FENCE_FD. 
*/ ++ ctx->in_sync_fd = -1; ++ ret = drmSyncobjCreate(dev->fd, 0, &ctx->in_sync_obj); ++ assert(!ret); ++ } + + return gallium; + } +diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h +index 37c0f6fc099..197f5641362 100644 +--- a/src/gallium/drivers/panfrost/pan_context.h ++++ b/src/gallium/drivers/panfrost/pan_context.h +@@ -117,6 +117,19 @@ struct panfrost_streamout { + unsigned num_targets; + }; + ++// TODO: This struct is a mess ++struct panfrost_cs { ++ struct kbase_cs base; ++ struct panfrost_bo *bo; ++ pan_command_stream cs; ++ mali_ptr event_ptr; ++ uint64_t seqnum; ++ mali_ptr kcpu_event_ptr; ++ uint64_t kcpu_seqnum; ++ uint64_t offset; ++ unsigned hw_resources; ++}; ++ + struct panfrost_context { + /* Gallium context */ + struct pipe_context base; +@@ -132,6 +145,7 @@ struct panfrost_context { + + /* Sync obj used to keep track of in-flight jobs. */ + uint32_t syncobj; ++ struct kbase_syncobj *syncobj_kbase; + + /* Set of 32 batches. When the set is full, the LRU entry (the batch + * with the smallest seqnum) is flushed to free a slot. +@@ -229,6 +243,12 @@ struct panfrost_context { + + int in_sync_fd; + uint32_t in_sync_obj; ++ ++ struct kbase_context *kbase_ctx; ++ struct panfrost_bo *event_bo; ++ struct panfrost_cs kbase_cs_vertex; ++ struct panfrost_cs kbase_cs_fragment; ++ struct panfrost_bo *tiler_heap_desc; + }; + + /* Corresponds to the CSO */ +diff --git a/src/gallium/drivers/panfrost/pan_disk_cache.c b/src/gallium/drivers/panfrost/pan_disk_cache.c +index e00053aad44..e1ad57ce3e8 100644 +--- a/src/gallium/drivers/panfrost/pan_disk_cache.c ++++ b/src/gallium/drivers/panfrost/pan_disk_cache.c +@@ -34,7 +34,9 @@ + + #include "pan_context.h" + ++#ifdef ENABLE_SHADER_CACHE + static bool debug = false; ++#endif + + extern int midgard_debug; + extern int bifrost_debug; +@@ -141,6 +143,8 @@ panfrost_disk_cache_retrieve(struct disk_cache *cache, + blob_copy_bytes(&blob, ptr, binary_size); + blob_copy_bytes(&blob, &binary->info, sizeof(binary->info)); + ++ free(buffer); ++ + return true; + #else + return false; +@@ -156,11 +160,7 @@ panfrost_disk_cache_init(struct panfrost_screen *screen) + #ifdef ENABLE_SHADER_CACHE + const char *renderer = screen->base.get_name(&screen->base); + +- const struct build_id_note *note = +- build_id_find_nhdr_for_addr(panfrost_disk_cache_init); +- assert(note && build_id_length(note) == 20); /* sha1 */ +- +- const uint8_t *id_sha1 = build_id_data(note); ++ const uint8_t *id_sha1 = "1"; + assert(id_sha1); + + char timestamp[41]; +diff --git a/src/gallium/drivers/panfrost/pan_fence.c b/src/gallium/drivers/panfrost/pan_fence.c +index 655644ec495..f989269978c 100644 +--- a/src/gallium/drivers/panfrost/pan_fence.c ++++ b/src/gallium/drivers/panfrost/pan_fence.c +@@ -42,7 +42,10 @@ panfrost_fence_reference(struct pipe_screen *pscreen, + struct pipe_fence_handle *old = *ptr; + + if (pipe_reference(&old->reference, &fence->reference)) { +- drmSyncobjDestroy(dev->fd, old->syncobj); ++ if (dev->kbase) ++ dev->mali.syncobj_destroy(&dev->mali, old->kbase); ++ else ++ drmSyncobjDestroy(dev->fd, old->syncobj); + free(old); + } + +@@ -65,6 +68,13 @@ panfrost_fence_finish(struct pipe_screen *pscreen, + if (abs_timeout == OS_TIMEOUT_INFINITE) + abs_timeout = INT64_MAX; + ++ if (dev->kbase) { ++ /* TODO: Use the timeout */ ++ bool ret = dev->mali.syncobj_wait(&dev->mali, fence->kbase); ++ fence->signaled = ret; ++ return ret; ++ } ++ + ret = drmSyncobjWait(dev->fd, &fence->syncobj, + 1, + abs_timeout, 
DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, +@@ -81,6 +91,10 @@ panfrost_fence_get_fd(struct pipe_screen *screen, + struct panfrost_device *dev = pan_device(screen); + int fd = -1; + ++ /* TODO: Export a sync file using KCPU */ ++ if (dev->kbase) ++ return fd; ++ + drmSyncobjExportSyncFile(dev->fd, f->syncobj, &fd); + return fd; + } +@@ -92,6 +106,10 @@ panfrost_fence_from_fd(struct panfrost_context *ctx, int fd, + struct panfrost_device *dev = pan_device(ctx->base.screen); + int ret; + ++ /* TODO: Implement this for kbase */ ++ if (dev->kbase) ++ return NULL; ++ + struct pipe_fence_handle *f = calloc(1, sizeof(*f)); + if (!f) + return NULL; +@@ -134,6 +152,16 @@ panfrost_fence_create(struct panfrost_context *ctx) + struct panfrost_device *dev = pan_device(ctx->base.screen); + int fd = -1, ret; + ++ if (dev->kbase) { ++ struct pipe_fence_handle *f = calloc(1, sizeof(*f)); ++ if (!f) ++ return NULL; ++ ++ f->kbase = dev->mali.syncobj_dup(&dev->mali, ctx->syncobj_kbase); ++ pipe_reference_init(&f->reference, 1); ++ return f; ++ } ++ + /* Snapshot the last rendering out fence. We'd rather have another + * syncobj instead of a sync file, but this is all we get. + * (HandleToFD/FDToHandle just gives you another syncobj ID for the +diff --git a/src/gallium/drivers/panfrost/pan_fence.h b/src/gallium/drivers/panfrost/pan_fence.h +index 350f3682343..a52c5c72c92 100644 +--- a/src/gallium/drivers/panfrost/pan_fence.h ++++ b/src/gallium/drivers/panfrost/pan_fence.h +@@ -32,6 +32,7 @@ struct panfrost_context; + struct pipe_fence_handle { + struct pipe_reference reference; + uint32_t syncobj; ++ struct kbase_syncobj *kbase; + bool signaled; + }; + +diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c +index 75408594735..4eb1a941f1e 100644 +--- a/src/gallium/drivers/panfrost/pan_job.c ++++ b/src/gallium/drivers/panfrost/pan_job.c +@@ -25,6 +25,7 @@ + */ + + #include ++#include + + #include "drm-uapi/panfrost_drm.h" + +@@ -81,6 +82,14 @@ panfrost_batch_init(struct panfrost_context *ctx, + batch->resources =_mesa_set_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) ++ util_dynarray_init(&batch->resource_bos[i], NULL); ++ ++ util_dynarray_init(&batch->vert_deps, NULL); ++ util_dynarray_init(&batch->frag_deps, NULL); ++ ++ util_dynarray_init(&batch->dmabufs, NULL); ++ + /* Preallocate the main pool, since every batch has at least one job + * structure so it will be used */ + panfrost_pool_init(&batch->pool, NULL, dev, 0, 65536, "Batch pool", true, true); +@@ -96,6 +105,9 @@ panfrost_batch_init(struct panfrost_context *ctx, + + panfrost_batch_add_surface(batch, batch->key.zsbuf); + ++ if ((dev->debug & PAN_DBG_SYNC) || !(dev->debug & PAN_DBG_GOFASTER)) ++ batch->needs_sync = true; ++ + screen->vtbl.init_batch(batch); + } + +@@ -115,15 +127,30 @@ static void + panfrost_batch_add_resource(struct panfrost_batch *batch, + struct panfrost_resource *rsrc) + { ++ struct panfrost_context *ctx = batch->ctx; ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ + bool found = false; + _mesa_set_search_or_add(batch->resources, rsrc, &found); + +- if (!found) { +- /* Cache number of batches accessing a resource */ +- rsrc->track.nr_users++; ++ /* Nothing to do if we already have the resource */ ++ if (found) ++ return; ++ ++ /* Cache number of batches accessing a resource */ ++ rsrc->track.nr_users++; ++ ++ /* Reference the resource on the batch */ ++ pipe_reference(NULL, &rsrc->base.reference); + +- /* Reference the 
resource on the batch */ +- pipe_reference(NULL, &rsrc->base.reference); ++ if (rsrc->scanout) { ++ if (dev->has_dmabuf_fence) { ++ int fd = rsrc->image.data.bo->dmabuf_fd; ++ util_dynarray_append(&batch->dmabufs, int, fd); ++ } else { ++ perf_debug_ctx(ctx, "Forcing sync on batch"); ++ batch->needs_sync = true; ++ } + } + } + +@@ -172,6 +199,10 @@ panfrost_batch_cleanup(struct panfrost_context *ctx, struct panfrost_batch *batc + { + struct panfrost_device *dev = pan_device(ctx->base.screen); + ++ /* Make sure we keep handling events, to free old BOs */ ++ if (dev->kbase) ++ kbase_ensure_handle_events(&dev->mali); ++ + assert(batch->seqnum); + + if (ctx->batch == batch) +@@ -186,10 +217,18 @@ panfrost_batch_cleanup(struct panfrost_context *ctx, struct panfrost_batch *batc + if (!flags[i]) + continue; + +- struct panfrost_bo *bo = pan_lookup_bo(dev, i); ++ struct panfrost_bo *bo = pan_lookup_bo_existing(dev, i); + panfrost_bo_unreference(bo); + } + ++ util_dynarray_fini(&batch->dmabufs); ++ ++ util_dynarray_fini(&batch->vert_deps); ++ util_dynarray_fini(&batch->frag_deps); ++ ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) ++ util_dynarray_fini(&batch->resource_bos[i]); ++ + panfrost_batch_destroy_resources(ctx, batch); + panfrost_pool_cleanup(&batch->pool); + panfrost_pool_cleanup(&batch->invisible_pool); +@@ -313,7 +352,7 @@ panfrost_batch_update_access(struct panfrost_batch *batch, + } + } + +- if (writes) { ++ if (writes && (writer != batch)) { + _mesa_hash_table_insert(ctx->writers, rsrc, batch); + rsrc->track.nr_writers++; + } +@@ -380,6 +419,12 @@ panfrost_batch_read_rsrc(struct panfrost_batch *batch, + uint32_t access = PAN_BO_ACCESS_READ | + panfrost_access_for_stage(stage); + ++ enum panfrost_usage_type type = (stage == MESA_SHADER_FRAGMENT) ? ++ PAN_USAGE_READ_FRAGMENT : PAN_USAGE_READ_VERTEX; ++ ++ util_dynarray_append(&batch->resource_bos[type], struct panfrost_bo *, ++ rsrc->image.data.bo); ++ + panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); + + if (rsrc->separate_stencil) +@@ -396,6 +441,12 @@ panfrost_batch_write_rsrc(struct panfrost_batch *batch, + uint32_t access = PAN_BO_ACCESS_WRITE | + panfrost_access_for_stage(stage); + ++ enum panfrost_usage_type type = (stage == MESA_SHADER_FRAGMENT) ? 
++ PAN_USAGE_WRITE_FRAGMENT : PAN_USAGE_WRITE_VERTEX; ++ ++ util_dynarray_append(&batch->resource_bos[type], struct panfrost_bo *, ++ rsrc->image.data.bo); ++ + panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); + + if (rsrc->separate_stencil) +@@ -489,7 +540,7 @@ panfrost_batch_get_shared_memory(struct panfrost_batch *batch, + } + + static void +-panfrost_batch_to_fb_info(const struct panfrost_batch *batch, ++panfrost_batch_to_fb_info(struct panfrost_batch *batch, + struct pan_fb_info *fb, + struct pan_image_view *rts, + struct pan_image_view *zs, +@@ -511,6 +562,7 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + fb->rt_count = batch->key.nr_cbufs; + fb->sprite_coord_origin = pan_tristate_get(batch->sprite_coord_origin); + fb->first_provoking_vertex = pan_tristate_get(batch->first_provoking_vertex); ++ fb->cs_fragment = &batch->cs_fragment; + + static const unsigned char id_swz[] = { + PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W, +@@ -604,22 +656,22 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + fb->zs.discard.z = !reserve && !(batch->resolve & PIPE_CLEAR_DEPTH); + fb->zs.discard.s = !reserve && !(batch->resolve & PIPE_CLEAR_STENCIL); + +- if (!fb->zs.clear.z && ++ if (!fb->zs.clear.z && z_rsrc && + ((batch->read & PIPE_CLEAR_DEPTH) || + ((batch->draws & PIPE_CLEAR_DEPTH) && +- z_rsrc && BITSET_TEST(z_rsrc->valid.data, z_view->first_level)))) ++ BITSET_TEST(z_rsrc->valid.data, z_view->first_level)))) + fb->zs.preload.z = true; + +- if (!fb->zs.clear.s && ++ if (!fb->zs.clear.s && s_rsrc && + ((batch->read & PIPE_CLEAR_STENCIL) || + ((batch->draws & PIPE_CLEAR_STENCIL) && +- s_rsrc && BITSET_TEST(s_rsrc->valid.data, s_view->first_level)))) ++ BITSET_TEST(s_rsrc->valid.data, s_view->first_level)))) + fb->zs.preload.s = true; + + /* Preserve both component if we have a combined ZS view and + * one component needs to be preserved. + */ +- if (s_view == z_view && fb->zs.discard.z != fb->zs.discard.s) { ++ if (z_view && s_view == z_view && fb->zs.discard.z != fb->zs.discard.s) { + bool valid = BITSET_TEST(z_rsrc->valid.data, z_view->first_level); + + fb->zs.discard.z = false; +@@ -629,6 +681,28 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + } + } + ++static int ++panfrost_batch_submit_kbase(struct panfrost_device *dev, ++ struct drm_panfrost_submit *submit, ++ struct kbase_syncobj *syncobj) ++{ ++ dev->mali.handle_events(&dev->mali); ++ ++ int atom = dev->mali.submit(&dev->mali, ++ submit->jc, ++ submit->requirements, ++ syncobj, ++ (int32_t *)(uintptr_t) submit->bo_handles, ++ submit->bo_handle_count); ++ ++ if (atom == -1) { ++ errno = EINVAL; ++ return -1; ++ } ++ ++ return 0; ++} ++ + static int + panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + mali_ptr first_job_desc, +@@ -695,7 +769,7 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + * We also preserve existing flags as this batch might not + * be the first one to access the BO. 
+ */ +- struct panfrost_bo *bo = pan_lookup_bo(dev, i); ++ struct panfrost_bo *bo = pan_lookup_bo_existing(dev, i); + + bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW); + } +@@ -718,6 +792,8 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + submit.bo_handles = (u64) (uintptr_t) bo_handles; + if (ctx->is_noop) + ret = 0; ++ else if (dev->kbase) ++ ret = panfrost_batch_submit_kbase(dev, &submit, ctx->syncobj_kbase); + else + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit); + free(bo_handles); +@@ -728,8 +804,11 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + /* Trace the job if we're doing that */ + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + /* Wait so we can get errors reported back */ +- drmSyncobjWait(dev->fd, &out_sync, 1, +- INT64_MAX, 0, NULL); ++ if (dev->kbase) ++ dev->mali.syncobj_wait(&dev->mali, ctx->syncobj_kbase); ++ else ++ drmSyncobjWait(dev->fd, &out_sync, 1, ++ INT64_MAX, 0, NULL); + + if (dev->debug & PAN_DBG_TRACE) + pandecode_jc(submit.jc, dev->gpu_id); +@@ -799,6 +878,323 @@ panfrost_batch_submit_jobs(struct panfrost_batch *batch, + return ret; + } + ++#define BASE_MEM_MMU_DUMP_HANDLE (1 << 12) ++ ++static void ++mmu_dump(struct panfrost_device *dev) ++{ ++ unsigned size = 16 * 1024 * 1024; ++ ++ fprintf(stderr, "dumping MMU tables\n"); ++ sleep(3); ++ ++ void *mem = mmap(NULL, size, PROT_READ, MAP_SHARED, ++ dev->mali.fd, BASE_MEM_MMU_DUMP_HANDLE); ++ if (mem == MAP_FAILED) { ++ perror("mmap(BASE_MEM_MMU_DUMP_HANDLE)"); ++ return;; ++ } ++ ++ fprintf(stderr, "writing to file\n"); ++ sleep(1); ++ ++ char template[] = {"/tmp/mmu-dump.XXXXXX"}; ++ int fd = mkstemp(template); ++ if (fd == -1) { ++ perror("mkstemp(/tmp/mmu-dump.XXXXXX)"); ++ goto unmap; ++ } ++ ++ write(fd, mem, size); ++ close(fd); ++ ++unmap: ++ munmap(mem, size); ++} ++ ++static void ++reset_context(struct panfrost_context *ctx) ++{ ++ struct pipe_screen *pscreen = ctx->base.screen; ++ struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); ++ ++ /* Don't recover from the fault if PAN_MESA_DEBUG=sync is specified, ++ * to somewhat mimic behaviour with JM GPUs. TODO: Just abort? 
*/ ++ bool recover = !(dev->debug & PAN_DBG_SYNC); ++ ++ mesa_loge("Context reset"); ++ ++ dev->mali.cs_term(&dev->mali, &ctx->kbase_cs_vertex.base); ++ dev->mali.cs_term(&dev->mali, &ctx->kbase_cs_fragment.base); ++ ++ dev->mali.context_recreate(&dev->mali, ctx->kbase_ctx); ++ ++ //mmu_dump(dev); ++ ++ if (recover) { ++ dev->mali.cs_rebind(&dev->mali, &ctx->kbase_cs_vertex.base); ++ dev->mali.cs_rebind(&dev->mali, &ctx->kbase_cs_fragment.base); ++ } else { ++ ctx->kbase_cs_vertex.base.user_io = NULL; ++ ctx->kbase_cs_fragment.base.user_io = NULL; ++ } ++ ++ ctx->kbase_cs_vertex.base.last_insert = 0; ++ ctx->kbase_cs_fragment.base.last_insert = 0; ++ ++ screen->vtbl.init_cs(ctx, &ctx->kbase_cs_vertex); ++ screen->vtbl.init_cs(ctx, &ctx->kbase_cs_fragment); ++ ++ /* TODO: this leaks memory */ ++ ctx->tiler_heap_desc = 0; ++} ++ ++static void ++pandecode_cs_ring(struct panfrost_device *dev, struct panfrost_cs *cs, ++ uint64_t insert) ++{ ++ insert %= cs->base.size; ++ uint64_t start = cs->base.last_insert % cs->base.size; ++ ++ if (insert < start) { ++ pandecode_cs(cs->base.va + start, cs->base.size - start, dev->gpu_id); ++ start = 0; ++ } ++ ++ pandecode_cs(cs->base.va + start, insert - start, dev->gpu_id); ++} ++ ++static unsigned ++panfrost_add_dep_after(struct util_dynarray *deps, ++ struct panfrost_usage u, ++ unsigned index) ++{ ++ unsigned size = util_dynarray_num_elements(deps, struct panfrost_usage); ++ ++ for (unsigned i = index; i < size; ++i) { ++ struct panfrost_usage *d = ++ util_dynarray_element(deps, struct panfrost_usage, i); ++ ++ /* TODO: Remove d if it is an invalid entry? */ ++ ++ if ((d->queue == u.queue) && (d->write == u.write)) { ++ d->seqnum = MAX2(d->seqnum, u.seqnum); ++ return i; ++ ++ } else if (d->queue > u.queue) { ++ void *p = util_dynarray_grow(deps, struct panfrost_usage, 1); ++ assert(p); ++ memmove(util_dynarray_element(deps, struct panfrost_usage, i + 1), ++ util_dynarray_element(deps, struct panfrost_usage, i), ++ (size - i) * sizeof(struct panfrost_usage)); ++ ++ *util_dynarray_element(deps, struct panfrost_usage, i) = u; ++ return i; ++ } ++ } ++ ++ util_dynarray_append(deps, struct panfrost_usage, u); ++ return size; ++} ++ ++static void ++panfrost_update_deps(struct util_dynarray *deps, struct panfrost_bo *bo, bool write) ++{ ++ /* Both lists should be sorted, so each dependency is at a higher ++ * index than the last */ ++ unsigned index = 0; ++ util_dynarray_foreach(&bo->usage, struct panfrost_usage, u) { ++ /* read->read access does not require a dependency */ ++ if (!write && !u->write) ++ continue; ++ ++ index = panfrost_add_dep_after(deps, *u, index); ++ } ++} ++ ++static inline bool ++panfrost_usage_writes(enum panfrost_usage_type usage) ++{ ++ return (usage == PAN_USAGE_WRITE_VERTEX) || (usage == PAN_USAGE_WRITE_FRAGMENT); ++} ++ ++static inline bool ++panfrost_usage_fragment(enum panfrost_usage_type usage) ++{ ++ return (usage == PAN_USAGE_READ_FRAGMENT) || (usage == PAN_USAGE_WRITE_FRAGMENT); ++} ++ ++/* Removes invalid dependencies from deps */ ++static void ++panfrost_clean_deps(struct panfrost_device *dev, struct util_dynarray *deps) ++{ ++ kbase k = &dev->mali; ++ ++ struct panfrost_usage *rebuild = util_dynarray_begin(deps); ++ unsigned index = 0; ++ ++ util_dynarray_foreach(deps, struct panfrost_usage, u) { ++ /* Usages are ordered, so we can break here */ ++ if (u->queue >= k->event_slot_usage) ++ break; ++ ++ struct kbase_event_slot *slot = &k->event_slots[u->queue]; ++ uint64_t seqnum = u->seqnum; ++ ++ /* There is a race 
condition, where we can depend on an ++ * unsubmitted batch. In that case, decrease the seqnum. ++ * Otherwise, skip invalid dependencies. */ ++ if (slot->last_submit == seqnum) ++ --seqnum; ++ else if (slot->last_submit < seqnum) ++ continue; ++ ++ /* This usage is valid, add it to the returned list */ ++ rebuild[index++] = (struct panfrost_usage) { ++ .queue = u->queue, ++ .write = u->write, ++ .seqnum = seqnum, ++ }; ++ } ++ ++ /* No need to check the return value, it can only shrink */ ++ (void)! util_dynarray_resize(deps, struct panfrost_usage, index); ++} ++ ++static int ++panfrost_batch_submit_csf(struct panfrost_batch *batch, ++ const struct pan_fb_info *fb) ++{ ++ struct panfrost_context *ctx = batch->ctx; ++ struct pipe_screen *pscreen = ctx->base.screen; ++ struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); ++ ++ ++ctx->kbase_cs_vertex.seqnum; ++ ++ if (panfrost_has_fragment_job(batch)) { ++ screen->vtbl.emit_fragment_job(batch, fb); ++ ++ctx->kbase_cs_fragment.seqnum; ++ } ++ ++ pthread_mutex_lock(&dev->bo_usage_lock); ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) { ++ ++ bool write = panfrost_usage_writes(i); ++ pan_bo_access access = write ? PAN_BO_ACCESS_RW : PAN_BO_ACCESS_READ; ++ struct util_dynarray *deps; ++ unsigned queue; ++ uint64_t seqnum; ++ ++ if (panfrost_usage_fragment(i)) { ++ deps = &batch->frag_deps; ++ queue = ctx->kbase_cs_fragment.base.event_mem_offset; ++ seqnum = ctx->kbase_cs_fragment.seqnum; ++ } else { ++ deps = &batch->vert_deps; ++ queue = ctx->kbase_cs_vertex.base.event_mem_offset; ++ seqnum = ctx->kbase_cs_vertex.seqnum; ++ } ++ ++ util_dynarray_foreach(&batch->resource_bos[i], struct panfrost_bo *, bo) { ++ panfrost_update_deps(deps, *bo, write); ++ struct panfrost_usage u = { ++ .queue = queue, ++ .write = write, ++ .seqnum = seqnum, ++ }; ++ ++ panfrost_add_dep_after(&(*bo)->usage, u, 0); ++ (*bo)->gpu_access |= access; ++ } ++ } ++ pthread_mutex_unlock(&dev->bo_usage_lock); ++ ++ /* For now, only a single batch can use each tiler heap at once */ ++ if (ctx->tiler_heap_desc) { ++ panfrost_update_deps(&batch->vert_deps, ctx->tiler_heap_desc, true); ++ ++ struct panfrost_usage u = { ++ .queue = ctx->kbase_cs_fragment.base.event_mem_offset, ++ .write = true, ++ .seqnum = ctx->kbase_cs_fragment.seqnum, ++ }; ++ panfrost_add_dep_after(&ctx->tiler_heap_desc->usage, u, 0); ++ } ++ ++ /* TODO: Use atomics in kbase code to avoid lock? */ ++ pthread_mutex_lock(&dev->mali.queue_lock); ++ ++ panfrost_clean_deps(dev, &batch->vert_deps); ++ panfrost_clean_deps(dev, &batch->frag_deps); ++ ++ pthread_mutex_unlock(&dev->mali.queue_lock); ++ ++ screen->vtbl.emit_csf_toplevel(batch); ++ ++ uint64_t vs_offset = ctx->kbase_cs_vertex.offset + ++ (void *)ctx->kbase_cs_vertex.cs.ptr - ctx->kbase_cs_vertex.bo->ptr.cpu; ++ uint64_t fs_offset = ctx->kbase_cs_fragment.offset + ++ (void *)ctx->kbase_cs_fragment.cs.ptr - ctx->kbase_cs_fragment.bo->ptr.cpu; ++ ++ if (dev->debug & PAN_DBG_TRACE) { ++ pandecode_cs_ring(dev, &ctx->kbase_cs_vertex, vs_offset); ++ pandecode_cs_ring(dev, &ctx->kbase_cs_fragment, fs_offset); ++ } ++ ++ bool log = (dev->debug & PAN_DBG_LOG); ++ ++ // TODO: We need better synchronisation than a single fake syncobj!
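/* Editor's note (not part of the patch): the dependency bookkeeping above
 * keeps each BO's usage list sorted by queue id and merges duplicate
 * (queue, write) entries by keeping only the highest seqnum (see
 * panfrost_add_dep_after). The standalone sketch below illustrates just that
 * merge rule with a hypothetical fixed-size array in place of util_dynarray;
 * the names usage_list and usage_list_add are invented for illustration. */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct usage { uint32_t queue; bool write; uint64_t seqnum; };
struct usage_list { struct usage e[64]; unsigned n; };

static void
usage_list_add(struct usage_list *l, struct usage u)
{
   if (l->n >= 64)
      return; /* sketch only: a real implementation would grow the array */

   for (unsigned i = 0; i < l->n; ++i) {
      if (l->e[i].queue == u.queue && l->e[i].write == u.write) {
         /* Same queue and access type: keep only the newest seqnum */
         if (u.seqnum > l->e[i].seqnum)
            l->e[i].seqnum = u.seqnum;
         return;
      }
      if (l->e[i].queue > u.queue) {
         /* Keep the list sorted: insert before the first larger queue id */
         memmove(&l->e[i + 1], &l->e[i], (l->n - i) * sizeof(u));
         l->e[i] = u;
         l->n++;
         return;
      }
   }

   /* Largest queue id seen so far: append at the end */
   l->e[l->n++] = u;
}
/* Example: usage_list_add(&bo_usage, (struct usage){ .queue = 3, .write = true, .seqnum = 42 }); */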
++ ++ if (log) ++ printf("About to submit\n"); ++ ++ dev->mali.cs_submit(&dev->mali, &ctx->kbase_cs_vertex.base, vs_offset, ++ ctx->syncobj_kbase, ctx->kbase_cs_vertex.seqnum); ++ ++ dev->mali.cs_submit(&dev->mali, &ctx->kbase_cs_fragment.base, fs_offset, ++ ctx->syncobj_kbase, ctx->kbase_cs_fragment.seqnum); ++ ++ bool reset = false; ++ ++ // TODO: How will we know to reset a CS when waiting is not done? ++ if (batch->needs_sync) { ++ if (!dev->mali.cs_wait(&dev->mali, &ctx->kbase_cs_vertex.base, vs_offset, ctx->syncobj_kbase)) ++ reset = true; ++ ++ if (!dev->mali.cs_wait(&dev->mali, &ctx->kbase_cs_fragment.base, fs_offset, ctx->syncobj_kbase)) ++ reset = true; ++ } ++ ++ if (dev->debug & PAN_DBG_TILER) { ++ fflush(stdout); ++ FILE *stream = popen("tiler-hex-read", "w"); ++ ++ /* TODO: Dump more than just the first chunk */ ++ unsigned size = batch->ctx->kbase_ctx->tiler_heap_chunk_size; ++ uint64_t va = batch->ctx->kbase_ctx->tiler_heap_header; ++ ++ fprintf(stream, "width %i\n" "height %i\n" "mask %i\n" ++ "vaheap 0x%"PRIx64"\n" "size %i\n", ++ batch->key.width, batch->key.height, 0xfe, va, size); ++ ++ void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, ++ MAP_SHARED, dev->mali.fd, va); ++ ++ pan_hexdump(stream, ptr, size, false); ++ //memset(ptr, 0, size); ++ munmap(ptr, size); ++ ++ pclose(stream); ++ } ++ ++ if (reset) ++ reset_context(ctx); ++ ++ return 0; ++} ++ + static void + panfrost_emit_tile_map(struct panfrost_batch *batch, struct pan_fb_info *fb) + { +@@ -824,6 +1220,7 @@ panfrost_batch_submit(struct panfrost_context *ctx, + { + struct pipe_screen *pscreen = ctx->base.screen; + struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); + int ret; + + /* Nothing to do! */ +@@ -867,7 +1264,11 @@ panfrost_batch_submit(struct panfrost_context *ctx, + if (batch->scoreboard.first_tiler || batch->clear) + screen->vtbl.emit_fbd(batch, &fb); + +- ret = panfrost_batch_submit_jobs(batch, &fb, 0, ctx->syncobj); ++ /* TODO: Don't hardcode the arch number */ ++ if (dev->arch < 10) ++ ret = panfrost_batch_submit_jobs(batch, &fb, 0, ctx->syncobj); ++ else ++ ret = panfrost_batch_submit_csf(batch, &fb); + + if (ret) + fprintf(stderr, "panfrost_batch_submit failed: %d\n", ret); +@@ -969,6 +1370,8 @@ panfrost_batch_clear(struct panfrost_batch *batch, + for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) { + if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) + continue; ++ if (!ctx->pipe_framebuffer.cbufs[i]) ++ continue; + + enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format; + pan_pack_color(batch->clear_color[i], color, format, false); +diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h +index 23263c54e07..6867476a3dc 100644 +--- a/src/gallium/drivers/panfrost/pan_job.h ++++ b/src/gallium/drivers/panfrost/pan_job.h +@@ -79,6 +79,14 @@ pan_tristate_get(struct pan_tristate state) + return (state.v == PAN_TRISTATE_TRUE); + } + ++enum panfrost_usage_type { ++ PAN_USAGE_READ_VERTEX, ++ PAN_USAGE_WRITE_VERTEX, ++ PAN_USAGE_READ_FRAGMENT, ++ PAN_USAGE_WRITE_FRAGMENT, ++ PAN_USAGE_COUNT, ++}; ++ + /* A panfrost_batch corresponds to a bound FBO we're rendering to, + * collecting over multiple draws. */ + +@@ -194,6 +202,25 @@ struct panfrost_batch { + + /* Referenced resources, holds a pipe_reference. 
*/ + struct set *resources; ++ ++ struct util_dynarray resource_bos[PAN_USAGE_COUNT]; ++ ++ /* struct panfrost_usage */ ++ struct util_dynarray vert_deps; ++ struct util_dynarray frag_deps; ++ ++ /* Referenced dma-bufs FDs, for emitting synchronisation commands. */ ++ struct util_dynarray dmabufs; ++ ++ /* Command stream pointers for CSF Valhall. Vertex CS tracking is more ++ * complicated as there may be multiple buffers. */ ++ pan_command_stream cs_vertex; ++ uint32_t *cs_vertex_last_size; ++ pan_command_stream cs_vertex_first; ++ ++ pan_command_stream cs_fragment; ++ ++ bool needs_sync; + }; + + /* Functions for managing the above */ +diff --git a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c +index 9e95b793391..c8127987ad2 100644 +--- a/src/gallium/drivers/panfrost/pan_resource.c ++++ b/src/gallium/drivers/panfrost/pan_resource.c +@@ -33,6 +33,7 @@ + #include + #include + #include "drm-uapi/drm_fourcc.h" ++#include "drm-uapi/drm.h" + + #include "frontend/winsys_handle.h" + #include "util/format/u_format.h" +@@ -51,6 +52,46 @@ + #include "pan_tiling.h" + #include "decode.h" + ++/* The kbase kernel driver always maps imported BOs with caching. When we ++ * don't want that, instead do mmap from the display driver side to get a ++ * write-combine mapping. ++ */ ++static void ++panfrost_bo_mmap_scanout(struct panfrost_bo *bo, ++ struct renderonly *ro, ++ struct renderonly_scanout *scanout) ++{ ++ struct panfrost_device *dev = bo->dev; ++ ++ /* If we are fine with a cached mapping, just return */ ++ if (!(dev->debug & PAN_DBG_UNCACHED_CPU)) ++ return; ++ ++ struct drm_mode_map_dumb map_dumb = { ++ .handle = scanout->handle, ++ }; ++ ++ int err = drmIoctl(ro->kms_fd, DRM_IOCTL_MODE_MAP_DUMB, &map_dumb); ++ if (err < 0) { ++ fprintf(stderr, "DRM_IOCTL_MODE_MAP_DUMB failed: %s\n", ++ strerror(errno)); ++ return; ++ } ++ ++ void *addr = mmap(NULL, bo->size, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ ro->kms_fd, map_dumb.offset); ++ if (addr == MAP_FAILED) { ++ fprintf(stderr, "kms_fd mmap failed: %s\n", ++ strerror(errno)); ++ return; ++ } ++ ++ bo->munmap_ptr = bo->ptr.cpu; ++ bo->ptr.cpu = addr; ++ bo->cached = false; ++} ++ + static struct pipe_resource * + panfrost_resource_from_handle(struct pipe_screen *pscreen, + const struct pipe_resource *templat, +@@ -102,15 +143,17 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, + return NULL; + } + +- rsc->image.data.bo = panfrost_bo_import(dev, whandle->handle); ++ struct panfrost_bo *bo = panfrost_bo_import(dev, whandle->handle); + /* Sometimes an import can fail e.g. on an invalid buffer fd, out of + * memory space to mmap it etc. + */ +- if (!rsc->image.data.bo) { ++ if (!bo) { + FREE(rsc); + return NULL; + } + ++ rsc->image.data.bo = bo; ++ + rsc->modifier_constant = true; + + BITSET_SET(rsc->valid.data, 0); +@@ -122,6 +165,9 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, + /* failure is expected in some cases.. 
*/ + } + ++ if (rsc->scanout) ++ panfrost_bo_mmap_scanout(bo, dev->ro, rsc->scanout); ++ + return prsc; + } + +@@ -473,7 +519,9 @@ panfrost_resource_setup(struct panfrost_device *dev, + static void + panfrost_resource_init_afbc_headers(struct panfrost_resource *pres) + { +- panfrost_bo_mmap(pres->image.data.bo); ++ struct panfrost_bo *bo = pres->image.data.bo; ++ ++ panfrost_bo_mmap(bo); + + unsigned nr_samples = MAX2(pres->base.nr_samples, 1); + +@@ -482,16 +530,16 @@ panfrost_resource_init_afbc_headers(struct panfrost_resource *pres) + struct pan_image_slice_layout *slice = &pres->image.layout.slices[l]; + + for (unsigned s = 0; s < nr_samples; ++s) { +- void *ptr = pres->image.data.bo->ptr.cpu + +- (i * pres->image.layout.array_stride) + +- slice->offset + +- (s * slice->afbc.surface_stride); ++ size_t offset = (i * pres->image.layout.array_stride) + ++ slice->offset + ++ (s * slice->afbc.surface_stride); + + /* Zero-ed AFBC headers seem to encode a plain + * black. Let's use this pattern to keep the + * initialization simple. + */ +- memset(ptr, 0, slice->afbc.header_size); ++ memset(bo->ptr.cpu + offset, 0, slice->afbc.header_size); ++ panfrost_bo_mem_clean(bo, offset, slice->afbc.header_size); + } + } + } +@@ -643,7 +691,9 @@ panfrost_resource_create_with_modifier(struct pipe_screen *screen, + (bind & PIPE_BIND_SHADER_IMAGE) ? "Shader image" : + "Other resource"; + +- if (dev->ro && (template->bind & PIPE_BIND_SCANOUT)) { ++ /* Revert to doing a kmsro allocation for any shared BO, because kbase ++ * cannot do export */ ++ if (dev->ro && (template->bind & PAN_BIND_SHARED_MASK)) { + struct winsys_handle handle; + struct pan_block_size blocksize = panfrost_block_size(modifier, template->format); + +@@ -702,12 +752,21 @@ panfrost_resource_create_with_modifier(struct pipe_screen *screen, + free(so); + return NULL; + } ++ ++ panfrost_bo_mmap_scanout(so->image.data.bo, dev->ro, so->scanout); + } else { + /* We create a BO immediately but don't bother mapping, since we don't + * care to map e.g. FBOs which the CPU probably won't touch */ + ++ /* For now, don't cache buffers as syncing can be slow when ++ * too much memory is mapped. TODO: dynamically switch, or use ++ * the STREAM_READ etc. hints? */ ++ bool buffer = (template->target == PIPE_BUFFER); ++ unsigned cache_flag = buffer ? 0 : PAN_BO_CACHEABLE; ++ + so->image.data.bo = +- panfrost_bo_create(dev, so->image.layout.data_size, PAN_BO_DELAY_MMAP, label); ++ panfrost_bo_create(dev, so->image.layout.data_size, ++ PAN_BO_DELAY_MMAP | cache_flag, label); + + so->constant_stencil = true; + } +@@ -741,10 +800,22 @@ panfrost_resource_create_with_modifiers(struct pipe_screen *screen, + const struct pipe_resource *template, + const uint64_t *modifiers, int count) + { ++ struct panfrost_device *dev = pan_device(screen); ++ + for (unsigned i = 0; i < PAN_MODIFIER_COUNT; ++i) { +- if (drm_find_modifier(pan_best_modifiers[i], modifiers, count)) { +- return panfrost_resource_create_with_modifier(screen, template, +- pan_best_modifiers[i]); ++ uint64_t mod = pan_best_modifiers[i]; ++ ++ if (drm_is_afbc(mod) && !dev->has_afbc) ++ continue; ++ ++ if (mod != DRM_FORMAT_MOD_LINEAR && (dev->debug & PAN_DBG_LINEAR)) ++ continue; ++ ++ /* TODO: What if mod is an unsupported AFBC variant for this ++ * format? 
*/ ++ ++ if (drm_find_modifier(mod, modifiers, count)) { ++ return panfrost_resource_create_with_modifier(screen, template, mod); + } + } + +@@ -773,6 +844,71 @@ panfrost_resource_destroy(struct pipe_screen *screen, + free(rsrc); + } + ++static void ++panfrost_clear_render_target(struct pipe_context *pipe, ++ struct pipe_surface *dst, ++ const union pipe_color_union *color, ++ unsigned dstx, unsigned dsty, ++ unsigned width, unsigned height, ++ bool render_condition_enabled) ++{ ++ struct panfrost_context *ctx = pan_context(pipe); ++ ++ /* TODO: dstx, etc. */ ++ ++ struct pipe_framebuffer_state tmp = {0}; ++ util_copy_framebuffer_state(&tmp, &ctx->pipe_framebuffer); ++ ++ struct pipe_framebuffer_state fb = { ++ .width = dst->width, ++ .height = dst->height, ++ .layers = 1, ++ .samples = 1, ++ .nr_cbufs = 1, ++ .cbufs[0] = dst, ++ }; ++ pipe->set_framebuffer_state(pipe, &fb); ++ ++ struct panfrost_batch *batch = panfrost_get_fresh_batch_for_fbo(ctx, "Clear render target"); ++ panfrost_batch_clear(batch, PIPE_CLEAR_COLOR0, color, 0, 0); ++ ++ pipe->set_framebuffer_state(pipe, &tmp); ++ util_unreference_framebuffer_state(&tmp); ++} ++ ++static void ++panfrost_clear_depth_stencil(struct pipe_context *pipe, ++ struct pipe_surface *dst, ++ unsigned clear_flags, ++ double depth, unsigned stencil, ++ unsigned dstx, unsigned dsty, ++ unsigned width, unsigned height, ++ bool render_condition_enabled) ++{ ++ struct panfrost_context *ctx = pan_context(pipe); ++ ++ /* TODO: dstx, etc. */ ++ ++ struct pipe_framebuffer_state tmp = {0}; ++ util_copy_framebuffer_state(&tmp, &ctx->pipe_framebuffer); ++ ++ struct pipe_framebuffer_state fb = { ++ .width = dst->width, ++ .height = dst->height, ++ .layers = 1, ++ .samples = 1, ++ .nr_cbufs = 0, ++ .zsbuf = dst, ++ }; ++ pipe->set_framebuffer_state(pipe, &fb); ++ ++ struct panfrost_batch *batch = panfrost_get_fresh_batch_for_fbo(ctx, "Clear depth/stencil"); ++ panfrost_batch_clear(batch, clear_flags, NULL, depth, stencil); ++ ++ pipe->set_framebuffer_state(pipe, &tmp); ++ util_unreference_framebuffer_state(&tmp); ++} ++ + /* Most of the time we can do CPU-side transfers, but sometimes we need to use + * the 3D pipe for this. Let's wrap u_blitter to blit to/from staging textures. + * Code adapted from freedreno */ +@@ -968,6 +1104,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + struct panfrost_resource *staging = pan_alloc_staging(ctx, rsrc, level, box); + assert(staging); + ++ panfrost_bo_mmap(staging->image.data.bo); ++ + /* Staging resources have one LOD: level 0. Query the strides + * on this LOD. 
+ */ +@@ -990,9 +1128,11 @@ panfrost_ptr_map(struct pipe_context *pctx, + pan_blit_to_staging(pctx, transfer); + panfrost_flush_writer(ctx, staging, "AFBC read staging blit"); + panfrost_bo_wait(staging->image.data.bo, INT64_MAX, false); ++ ++ panfrost_bo_mem_invalidate(staging->image.data.bo, 0, ++ staging->image.data.bo->size); + } + +- panfrost_bo_mmap(staging->image.data.bo); + return staging->image.data.bo->ptr.cpu; + } + +@@ -1029,7 +1169,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + !(usage & PIPE_MAP_UNSYNCHRONIZED) && + !(resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && + (usage & PIPE_MAP_WRITE) && +- rsrc->track.nr_users > 0) { ++ rsrc->track.nr_users > 0 && ++ bo->size < 16 * 1024 * 1024) { + + /* When a resource to be modified is already being used by a + * pending batch, it is often faster to copy the whole BO than +@@ -1051,6 +1192,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + copy_resource = false; + } + ++ bool cache_inval = true; ++ + if (create_new_bo) { + /* Make sure we re-emit any descriptors using this resource */ + panfrost_dirty_state_all(ctx); +@@ -1075,12 +1218,14 @@ panfrost_ptr_map(struct pipe_context *pctx, + flags, bo->label); + + if (newbo) { +- if (copy_resource) +- memcpy(newbo->ptr.cpu, rsrc->image.data.bo->ptr.cpu, bo->size); ++ if (copy_resource) { ++ panfrost_bo_mem_invalidate(bo, 0, bo->size); ++ memcpy(newbo->ptr.cpu, bo->ptr.cpu, bo->size); ++ } + + panfrost_resource_swap_bo(ctx, rsrc, newbo); + +- if (!copy_resource && ++ if (!copy_resource && + drm_is_afbc(rsrc->image.layout.modifier)) + panfrost_resource_init_afbc_headers(rsrc); + +@@ -1102,6 +1247,22 @@ panfrost_ptr_map(struct pipe_context *pctx, + panfrost_flush_writer(ctx, rsrc, "Synchronized read"); + panfrost_bo_wait(bo, INT64_MAX, false); + } ++ } else { ++ /* No flush for writes to uninitialized */ ++ cache_inval = false; ++ } ++ ++ /* TODO: Only the accessed region for textures */ ++ if (cache_inval) { ++ size_t offset = 0; ++ size_t size = bo->size; ++ ++ if (resource->target == PIPE_BUFFER) { ++ offset = box->x * (size_t) bytes_per_block; ++ size = box->width * (size_t) bytes_per_block; ++ } ++ ++ panfrost_bo_mem_invalidate(bo, offset, size); + } + + /* For access to compressed textures, we want the (x, y, w, h) +@@ -1128,6 +1289,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + * caching... 
I don't know if this is actually possible but we + * should still get it right */ + ++ // TODO: Fix this for cached BOs ++ + unsigned dpw = PIPE_MAP_DIRECTLY | PIPE_MAP_WRITE | PIPE_MAP_PERSISTENT; + + if ((usage & dpw) == dpw && rsrc->index_cache) +@@ -1281,8 +1444,15 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + * reloads that can cascade into DATA_INVALID_FAULTs due to reading + * malformed AFBC data if uninitialized */ + +- if (trans->staging.rsrc) { ++ bool afbc = trans->staging.rsrc; ++ ++ if (afbc) { + if (transfer->usage & PIPE_MAP_WRITE) { ++ struct panfrost_resource *trans_rsrc = pan_resource(trans->staging.rsrc); ++ struct panfrost_bo *trans_bo = trans_rsrc->image.data.bo; ++ ++ panfrost_bo_mem_clean(trans_bo, 0, trans_bo->size); ++ + if (panfrost_should_linear_convert(dev, prsrc, transfer)) { + + panfrost_bo_unreference(prsrc->image.data.bo); +@@ -1290,7 +1460,7 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, + prsrc->image.layout.format); + +- prsrc->image.data.bo = pan_resource(trans->staging.rsrc)->image.data.bo; ++ prsrc->image.data.bo = trans_bo; + panfrost_bo_reference(prsrc->image.data.bo); + } else { + pan_blit_from_staging(pctx, trans); +@@ -1315,10 +1485,13 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, + prsrc->image.layout.format); + if (prsrc->image.layout.data_size > bo->size) { ++ /* We want the BO to be MMAPed. */ ++ uint32_t flags = bo->flags & ~PAN_BO_DELAY_MMAP; + const char *label = bo->label; ++ + panfrost_bo_unreference(bo); + bo = prsrc->image.data.bo = +- panfrost_bo_create(dev, prsrc->image.layout.data_size, 0, label); ++ panfrost_bo_create(dev, prsrc->image.layout.data_size, flags, label); + assert(bo); + } + +@@ -1339,6 +1512,25 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + } + } + ++ /* TODO: Only the accessed region */ ++ /* It is important to not do this for AFBC resources, or else the ++ * clean might overwrite the result of the blit. */ ++ if (!afbc && (transfer->usage & PIPE_MAP_WRITE)) { ++ size_t offset = 0; ++ size_t size = prsrc->image.data.bo->size; ++ ++ /* TODO: Don't recalculate */ ++ if (prsrc->base.target == PIPE_BUFFER) { ++ enum pipe_format format = prsrc->image.layout.format; ++ int bytes_per_block = util_format_get_blocksize(format); ++ ++ offset = transfer->box.x * (size_t) bytes_per_block; ++ size = transfer->box.width * (size_t) bytes_per_block; ++ } ++ ++ panfrost_bo_mem_clean(prsrc->image.data.bo, ++ offset, size); ++ } + + util_range_add(&prsrc->base, &prsrc->valid_buffer_range, + transfer->box.x, +@@ -1353,6 +1545,7 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + ralloc_free(transfer); + } + ++// TODO: does this need to be changed for cached resources? 
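/* Editor's note (not part of the patch): the map/unmap paths above pair
 * cached kbase BOs with explicit CPU cache maintenance: invalidate before the
 * CPU reads data the GPU may have written, and clean (write back) after the
 * CPU writes data the GPU will read. A minimal sketch of that pattern,
 * assuming the driver's struct panfrost_bo and the
 * panfrost_bo_mem_invalidate()/panfrost_bo_mem_clean() helpers introduced
 * elsewhere in this patch, with the (bo, offset, size) signature used in the
 * hunks above; the function name here is invented for illustration. */
static void
example_cpu_access(struct panfrost_bo *bo, size_t offset, size_t size,
                   bool will_read, bool will_write)
{
   if (will_read) {
      /* Drop stale CPU cache lines so earlier GPU writes become visible */
      panfrost_bo_mem_invalidate(bo, offset, size);
   }

   /* ... CPU accesses bo->ptr.cpu + offset here ... */

   if (will_write) {
      /* Write dirty lines back so the GPU sees the CPU's writes */
      panfrost_bo_mem_clean(bo, offset, size);
   }
}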
+ static void + panfrost_ptr_flush_region(struct pipe_context *pctx, + struct pipe_transfer *transfer, +@@ -1486,6 +1679,8 @@ panfrost_resource_context_init(struct pipe_context *pctx) + pctx->texture_unmap = u_transfer_helper_transfer_unmap; + pctx->create_surface = panfrost_create_surface; + pctx->surface_destroy = panfrost_surface_destroy; ++ pctx->clear_render_target = panfrost_clear_render_target; ++ pctx->clear_depth_stencil = panfrost_clear_depth_stencil; + pctx->resource_copy_region = util_resource_copy_region; + pctx->blit = panfrost_blit; + pctx->generate_mipmap = panfrost_generate_mipmap; +diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c +index ee6dbb7b57f..ea315f8be64 100644 +--- a/src/gallium/drivers/panfrost/pan_screen.c ++++ b/src/gallium/drivers/panfrost/pan_screen.c +@@ -56,7 +56,7 @@ + + static const struct debug_named_value panfrost_debug_options[] = { + {"perf", PAN_DBG_PERF, "Enable performance warnings"}, +- {"trace", PAN_DBG_TRACE, "Trace the command stream"}, ++ {"trace", PAN_DBG_TRACE | PAN_DBG_BO_CLEAR, "Trace the command stream"}, + {"deqp", PAN_DBG_DEQP, "Hacks for dEQP"}, + {"dirty", PAN_DBG_DIRTY, "Always re-emit all state"}, + {"sync", PAN_DBG_SYNC, "Wait for each job's completion and abort on GPU faults"}, +@@ -72,6 +72,13 @@ static const struct debug_named_value panfrost_debug_options[] = { + #ifdef PAN_DBG_OVERFLOW + {"overflow", PAN_DBG_OVERFLOW, "Check for buffer overflows in pool uploads"}, + #endif ++ {"tiler", PAN_DBG_TILER, "Decode the tiler heap"}, ++ {"bolog", PAN_DBG_BO_LOG, "Log BO allocations/deallocations"}, ++ {"boclear", PAN_DBG_BO_CLEAR, "Clear BOs on allocation"}, ++ {"nogpuc", PAN_DBG_UNCACHED_GPU, "Use uncached GPU memory for textures"}, ++ {"nocpuc", PAN_DBG_UNCACHED_CPU, "Use uncached CPU mappings for textures"}, ++ {"log", PAN_DBG_LOG, "Log job submission etc."}, ++ {"gofaster", PAN_DBG_GOFASTER, "Experimental performance improvements"}, + DEBUG_NAMED_VALUE_END + }; + +@@ -122,6 +129,7 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + case PIPE_CAP_SHADER_PACK_HALF_FLOAT: ++ case PIPE_CAP_CLIP_HALFZ: + return 1; + + case PIPE_CAP_MAX_RENDER_TARGETS: +@@ -300,7 +308,7 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) + * still supported as it is core GLES3.0 functionality + */ + case PIPE_CAP_PRIMITIVE_RESTART: +- return dev->arch <= 7; ++ return is_gl3 || dev->arch <= 7; + + case PIPE_CAP_FLATSHADE: + case PIPE_CAP_TWO_SIDED_COLOR: +@@ -606,6 +614,7 @@ panfrost_walk_dmabuf_modifiers(struct pipe_screen *screen, + bool afbc = dev->has_afbc && panfrost_format_supports_afbc(dev, format); + bool ytr = panfrost_afbc_can_ytr(format); + bool tiled_afbc = panfrost_afbc_can_tile(dev); ++ bool native = panfrost_afbc_only_native(dev->arch, format); + + unsigned count = 0; + +@@ -619,6 +628,9 @@ panfrost_walk_dmabuf_modifiers(struct pipe_screen *screen, + if ((pan_best_modifiers[i] & AFBC_FORMAT_MOD_TILED) && !tiled_afbc) + continue; + ++ if (drm_is_afbc(pan_best_modifiers[i]) && !(pan_best_modifiers[i] & AFBC_FORMAT_MOD_NATIVE_SWIZZLE) && native) ++ continue; ++ + if (test_modifier != DRM_FORMAT_MOD_INVALID && + test_modifier != pan_best_modifiers[i]) + continue; +@@ -822,13 +834,17 @@ panfrost_create_screen(int fd, struct renderonly *ro) + + /* Bail early on unsupported hardware */ + if (dev->model == NULL) { +- debug_printf("panfrost: Unsupported model 
%X", dev->gpu_id); ++ debug_printf("panfrost: Unsupported model %X\n", dev->gpu_id); + panfrost_destroy_screen(&(screen->base)); + return NULL; + } + + dev->ro = ro; + ++ /* The functionality is only useful with kbase */ ++ if (dev->kbase) ++ dev->has_dmabuf_fence = panfrost_check_dmabuf_fence(dev); ++ + screen->base.destroy = panfrost_destroy_screen; + + screen->base.get_name = panfrost_get_name; +@@ -874,6 +890,8 @@ panfrost_create_screen(int fd, struct renderonly *ro) + panfrost_cmdstream_screen_init_v7(screen); + else if (dev->arch == 9) + panfrost_cmdstream_screen_init_v9(screen); ++ else if (dev->arch == 10) ++ panfrost_cmdstream_screen_init_v10(screen); + else + unreachable("Unhandled architecture major"); + +diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h +index 656a4948a42..94cfcf472a5 100644 +--- a/src/gallium/drivers/panfrost/pan_screen.h ++++ b/src/gallium/drivers/panfrost/pan_screen.h +@@ -50,6 +50,7 @@ static const struct pipe_driver_query_info panfrost_driver_query_list[] = { + + struct panfrost_batch; + struct panfrost_context; ++struct panfrost_cs; + struct panfrost_resource; + struct panfrost_compiled_shader; + struct pan_fb_info; +@@ -57,6 +58,7 @@ struct pan_blend_state; + + /* Virtual table of per-generation (GenXML) functions */ + ++ + struct panfrost_vtable { + /* Prepares the renderer state descriptor or shader program descriptor + * for a given compiled shader, and if desired uploads it as well */ +@@ -100,6 +102,10 @@ struct panfrost_vtable { + struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); ++ ++ void (*emit_csf_toplevel)(struct panfrost_batch *); ++ ++ void (*init_cs)(struct panfrost_context *ctx, struct panfrost_cs *cs); + }; + + struct panfrost_screen { +@@ -138,6 +144,7 @@ void panfrost_cmdstream_screen_init_v5(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v6(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v7(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen); ++void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen); + + #define perf_debug(dev, ...) 
\ + do { \ +diff --git a/src/gallium/frontends/nine/nine_ff.c b/src/gallium/frontends/nine/nine_ff.c +index 6705fc2208c..6eb94ef8ccd 100644 +--- a/src/gallium/frontends/nine/nine_ff.c ++++ b/src/gallium/frontends/nine/nine_ff.c +@@ -1413,7 +1413,7 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key) + struct ureg_src texture_coord = ps.vT[s]; + struct ureg_dst delta; + switch (key->ts[s].textarget) { +- case 0: target = TGSI_TEXTURE_1D; break; ++ case 0: target = TGSI_TEXTURE_2D; break; + case 1: target = TGSI_TEXTURE_2D; break; + case 2: target = TGSI_TEXTURE_3D; break; + case 3: target = TGSI_TEXTURE_CUBE; break; +diff --git a/src/gallium/frontends/nine/nine_shader.c b/src/gallium/frontends/nine/nine_shader.c +index d1742a59c0e..e78288c5010 100644 +--- a/src/gallium/frontends/nine/nine_shader.c ++++ b/src/gallium/frontends/nine/nine_shader.c +@@ -2159,7 +2159,7 @@ static inline unsigned + d3dstt_to_tgsi_tex(BYTE sampler_type) + { + switch (sampler_type) { +- case NINED3DSTT_1D: return TGSI_TEXTURE_1D; ++ case NINED3DSTT_1D: return TGSI_TEXTURE_2D; + case NINED3DSTT_2D: return TGSI_TEXTURE_2D; + case NINED3DSTT_VOLUME: return TGSI_TEXTURE_3D; + case NINED3DSTT_CUBE: return TGSI_TEXTURE_CUBE; +@@ -2172,7 +2172,7 @@ static inline unsigned + d3dstt_to_tgsi_tex_shadow(BYTE sampler_type) + { + switch (sampler_type) { +- case NINED3DSTT_1D: return TGSI_TEXTURE_SHADOW1D; ++ case NINED3DSTT_1D: return TGSI_TEXTURE_SHADOW2D; + case NINED3DSTT_2D: return TGSI_TEXTURE_SHADOW2D; + case NINED3DSTT_VOLUME: + case NINED3DSTT_CUBE: +@@ -2186,7 +2186,7 @@ ps1x_sampler_type(const struct nine_shader_info *info, unsigned stage) + { + boolean shadow = !!(info->sampler_mask_shadow & (1 << stage)); + switch ((info->sampler_ps1xtypes >> (stage * 2)) & 0x3) { +- case 1: return shadow ? TGSI_TEXTURE_SHADOW1D : TGSI_TEXTURE_1D; ++ case 1: return shadow ? TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D; + case 0: return shadow ? TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D; + case 3: return TGSI_TEXTURE_3D; + default: +diff --git a/src/gallium/frontends/nine/nine_state.c b/src/gallium/frontends/nine/nine_state.c +index cd627c83d1e..b07e361ff41 100644 +--- a/src/gallium/frontends/nine/nine_state.c ++++ b/src/gallium/frontends/nine/nine_state.c +@@ -1039,8 +1039,10 @@ update_textures_and_samplers(struct NineDevice9 *device) + false, view); + context->enabled_sampler_count_ps = num_textures; + +- if (commit_samplers) ++ if (commit_samplers) { ++ cso_set_max_sampler(context->cso, num_textures - 1); + cso_single_sampler_done(context->cso, PIPE_SHADER_FRAGMENT); ++ } + + commit_samplers = FALSE; + sampler_mask = context->programmable_vs ? 
context->vs->sampler_mask : 0; +@@ -1084,8 +1086,10 @@ update_textures_and_samplers(struct NineDevice9 *device) + false, view); + context->enabled_sampler_count_vs = num_textures; + +- if (commit_samplers) ++ if (commit_samplers) { ++ cso_set_max_sampler(context->cso, num_textures - 1); + cso_single_sampler_done(context->cso, PIPE_SHADER_VERTEX); ++ } + } + + /* State commit only */ +diff --git a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build +index 73f948c5460..e9f942901b8 100644 +--- a/src/gallium/targets/d3dadapter9/meson.build ++++ b/src/gallium/targets/d3dadapter9/meson.build +@@ -64,7 +64,8 @@ libgallium_nine = shared_library( + dep_selinux, dep_libdrm, dep_llvm, dep_thread, + idep_xmlconfig, idep_mesautil, idep_nir, + driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau, +- driver_i915, driver_svga, driver_iris, driver_crocus, driver_zink, driver_freedreno ++ driver_i915, driver_svga, driver_iris, driver_crocus, driver_zink, driver_freedreno, ++ driver_panfrost, driver_kmsro, + ], + name_prefix : '', + version : '.'.join(nine_version), +diff --git a/src/gallium/targets/osmesa/meson.build b/src/gallium/targets/osmesa/meson.build +index 024bac32b58..23938ec73a1 100644 +--- a/src/gallium/targets/osmesa/meson.build ++++ b/src/gallium/targets/osmesa/meson.build +@@ -55,10 +55,10 @@ libosmesa = shared_library( + libmesa, libgallium, libws_null, osmesa_link_with, + ], + dependencies : [ +- dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind, driver_swrast ++ dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind, driver_swrast, driver_panfrost, dep_libdrm + ], + name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libosmesa.dll +- soversion : host_machine.system() == 'windows' ? 
'' : '8', ++ soversion : '', + version : '8.0.0', + darwin_versions : '9.0.0', + install : true, +diff --git a/src/gallium/targets/rusticl/meson.build b/src/gallium/targets/rusticl/meson.build +index 71c5da2129e..a4b4c7639f0 100644 +--- a/src/gallium/targets/rusticl/meson.build ++++ b/src/gallium/targets/rusticl/meson.build +@@ -43,6 +43,7 @@ librusticl = shared_library( + ], + dependencies : [ + driver_iris, ++ driver_kmsro, + driver_nouveau, + driver_panfrost, + driver_swrast, +diff --git a/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c b/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c +index 3c8a3c4519f..4011f45f853 100644 +--- a/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c ++++ b/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c +@@ -101,9 +101,15 @@ struct pipe_screen *kmsro_drm_screen_create(int fd, + #endif + + #if defined(GALLIUM_PANFROST) +- ro->gpu_fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); ++ bool noop = getenv("KBASE_NOOP"); + +- if (ro->gpu_fd >= 0) { ++ if (!noop) { ++ ro->gpu_fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); ++ if (ro->gpu_fd < 0) ++ ro->gpu_fd = open("/dev/mali0", O_RDWR | O_CLOEXEC | O_NONBLOCK); ++ } ++ ++ if ((ro->gpu_fd >= 0) || noop) { + ro->create_for_resource = renderonly_create_kms_dumb_buffer_for_resource; + screen = panfrost_drm_screen_create_renderonly(ro); + if (!screen) +diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c +index 048106dccd5..71992ca72c5 100644 +--- a/src/mesa/main/shaderapi.c ++++ b/src/mesa/main/shaderapi.c +@@ -70,7 +70,6 @@ + #include "state_tracker/st_context.h" + #include "state_tracker/st_program.h" + +-#ifdef ENABLE_SHADER_CACHE + #if CUSTOM_SHADER_REPLACEMENT + #include "shader_replacement.h" + /* shader_replacement.h must declare a variable like this: +@@ -116,7 +115,6 @@ static char* load_shader_replacement(struct _shader_replacement *repl) + return NULL; + } + #endif +-#endif + + /** + * Return mask of GLSL_x flags by examining the MESA_GLSL env var. +@@ -1929,8 +1927,6 @@ _mesa_LinkProgram(GLuint programObj) + link_program_error(ctx, shProg); + } + +-#ifdef ENABLE_SHADER_CACHE +- + /** + * Construct a full path for shader replacement functionality using + * following format: +@@ -2063,8 +2059,6 @@ _mesa_read_shader_source(const gl_shader_stage stage, const char *source, + return buffer; + } + +-#endif /* ENABLE_SHADER_CACHE */ +- + /** + * Called via glShaderSource() and glShaderSourceARB() API functions. 
+ * Basically, concatenate the source code strings into one long string +@@ -2146,7 +2140,6 @@ shader_source(struct gl_context *ctx, GLuint shaderObj, GLsizei count, + uint8_t original_sha1[SHA1_DIGEST_LENGTH]; + _mesa_sha1_compute(source, strlen(source), original_sha1); + +-#ifdef ENABLE_SHADER_CACHE + GLcharARB *replacement; + + /* Dump original shader source to MESA_SHADER_DUMP_PATH and replace +@@ -2159,7 +2152,6 @@ shader_source(struct gl_context *ctx, GLuint shaderObj, GLsizei count, + free(source); + source = replacement; + } +-#endif /* ENABLE_SHADER_CACHE */ + + set_shader_source(sh, source, original_sha1); + +diff --git a/src/meson.build b/src/meson.build +index e5510452775..1890db00c0d 100644 +--- a/src/meson.build ++++ b/src/meson.build +@@ -77,6 +77,7 @@ if with_imgui + endif + if with_platform_wayland + subdir('egl/wayland/wayland-drm') ++ subdir('egl/wayland/mali-buffer-sharing') + endif + if with_any_vk or with_gallium_zink + subdir('vulkan') +diff --git a/src/panfrost/base/include/csf/mali_base_csf_kernel.h b/src/panfrost/base/include/csf/mali_base_csf_kernel.h +new file mode 100644 +index 00000000000..3b02350c08b +--- /dev/null ++++ b/src/panfrost/base/include/csf/mali_base_csf_kernel.h +@@ -0,0 +1,596 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_BASE_CSF_KERNEL_H_ ++#define _UAPI_BASE_CSF_KERNEL_H_ ++ ++#include ++#include "../mali_base_common_kernel.h" ++ ++/* Memory allocation, access/hint flags & mask specific to CSF GPU. ++ * ++ * See base_mem_alloc_flags. ++ */ ++ ++/* Must be FIXED memory. */ ++#define BASE_MEM_FIXED ((base_mem_alloc_flags)1 << 8) ++ ++/* CSF event memory ++ * ++ * If Outer shareable coherence is not specified or not available, then on ++ * allocation kbase will automatically use the uncached GPU mapping. ++ * There is no need for the client to specify BASE_MEM_UNCACHED_GPU ++ * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. ++ * ++ * This memory requires a permanent mapping ++ * ++ * See also kbase_reg_needs_kernel_mapping() ++ */ ++#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) ++ ++#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) ++ ++ ++/* Must be FIXABLE memory: its GPU VA will be determined at a later point, ++ * at which time it will be at a fixed GPU VA. ++ */ ++#define BASE_MEM_FIXABLE ((base_mem_alloc_flags)1 << 29) ++ ++/* Note that the number of bits used for base_mem_alloc_flags ++ * must be less than BASE_MEM_FLAGS_NR_BITS !!! ++ */ ++ ++/* A mask of all the flags which are only valid for allocations within kbase, ++ * and may not be passed from user space. 
++ */ ++#define BASEP_MEM_FLAGS_KERNEL_ONLY \ ++ (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) ++ ++/* A mask of all currently reserved flags ++ */ ++#define BASE_MEM_FLAGS_RESERVED BASE_MEM_RESERVED_BIT_20 ++ ++/* Special base mem handles specific to CSF. ++ */ ++#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) ++#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) ++ ++#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ ++ ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ ++ LOCAL_PAGE_SHIFT) ++ ++/* Valid set of just-in-time memory allocation flags */ ++#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) ++ ++/* flags for base context specific to CSF */ ++ ++/* Base context creates a CSF event notification thread. ++ * ++ * The creation of a CSF event notification thread is conditional but ++ * mandatory for the handling of CSF events. ++ */ ++#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) ++ ++/* Bitpattern describing the ::base_context_create_flags that can be ++ * passed to base_context_init() ++ */ ++#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ ++ (BASE_CONTEXT_CCTX_EMBEDDED | \ ++ BASE_CONTEXT_CSF_EVENT_THREAD | \ ++ BASEP_CONTEXT_CREATE_KERNEL_FLAGS) ++ ++/* Flags for base tracepoint specific to CSF */ ++ ++/* Enable KBase tracepoints for CSF builds */ ++#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) ++ ++/* Enable additional CSF Firmware side tracepoints */ ++#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) ++ ++#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ ++ BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ ++ BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ ++ BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) ++ ++/* Number of pages mapped into the process address space for a bound GPU ++ * command queue. A pair of input/output pages and a Hw doorbell page ++ * are mapped to enable direct submission of commands to Hw. ++ */ ++#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) ++ ++#define BASE_QUEUE_MAX_PRIORITY (15U) ++ ++/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ ++#define BASEP_EVENT_VAL_INDEX (0U) ++#define BASEP_EVENT_ERR_INDEX (1U) ++ ++/* The upper limit for number of objects that could be waited/set per command. ++ * This limit is now enforced as internally the error inherit inputs are ++ * converted to 32-bit flags in a __u32 variable occupying a previously padding ++ * field. ++ */ ++#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) ++ ++/* CSF CSI EXCEPTION_HANDLER_FLAGS */ ++#define BASE_CSF_TILER_OOM_EXCEPTION_FLAG (1u << 0) ++#define BASE_CSF_EXCEPTION_HANDLER_FLAGS_MASK (BASE_CSF_TILER_OOM_EXCEPTION_FLAG) ++ ++/** ++ * enum base_kcpu_command_type - Kernel CPU queue command type. 
++ * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, ++ * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, ++ * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, ++ * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, ++ * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, ++ * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, ++ * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, ++ * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, ++ * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, ++ */ ++enum base_kcpu_command_type { ++ BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, ++ BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, ++ BASE_KCPU_COMMAND_TYPE_CQS_WAIT, ++ BASE_KCPU_COMMAND_TYPE_CQS_SET, ++ BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, ++ BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, ++ BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, ++ BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, ++ BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, ++ BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, ++ BASE_KCPU_COMMAND_TYPE_JIT_FREE, ++ BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, ++ BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER ++}; ++ ++/** ++ * enum base_queue_group_priority - Priority of a GPU Command Queue Group. ++ * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group ++ * priority levels. ++ * ++ * Currently this is in order of highest to lowest, but if new levels are added ++ * then those new levels may be out of order to preserve the ABI compatibility ++ * with previous releases. At that point, ensure assignment to ++ * the 'priority' member in &kbase_queue_group is updated to ensure it remains ++ * a linear ordering. ++ * ++ * There should be no gaps in the enum, otherwise use of ++ * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. 
++ */ ++enum base_queue_group_priority { ++ BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, ++ BASE_QUEUE_GROUP_PRIORITY_MEDIUM, ++ BASE_QUEUE_GROUP_PRIORITY_LOW, ++ BASE_QUEUE_GROUP_PRIORITY_REALTIME, ++ BASE_QUEUE_GROUP_PRIORITY_COUNT ++}; ++ ++struct base_kcpu_command_fence_info { ++ __u64 fence; ++}; ++ ++struct base_cqs_wait_info { ++ __u64 addr; ++ __u32 val; ++ __u32 padding; ++}; ++ ++struct base_kcpu_command_cqs_wait_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 inherit_err_flags; ++}; ++ ++struct base_cqs_set { ++ __u64 addr; ++}; ++ ++struct base_kcpu_command_cqs_set_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 padding; ++}; ++ ++/** ++ * typedef basep_cqs_data_type - Enumeration of CQS Data Types ++ * ++ * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value ++ * is an unsigned 32-bit integer ++ * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value ++ * is an unsigned 64-bit integer ++ */ ++typedef enum PACKED { ++ BASEP_CQS_DATA_TYPE_U32 = 0, ++ BASEP_CQS_DATA_TYPE_U64 = 1, ++} basep_cqs_data_type; ++ ++/** ++ * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait ++ * Operation conditions ++ * ++ * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a ++ * wait will be satisfied when a CQS Object's ++ * value is Less than or Equal to ++ * the Wait Operation value ++ * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a ++ * wait will be satisfied when a CQS Object's ++ * value is Greater than the Wait Operation value ++ */ ++typedef enum { ++ BASEP_CQS_WAIT_OPERATION_LE = 0, ++ BASEP_CQS_WAIT_OPERATION_GT = 1, ++} basep_cqs_wait_operation_op; ++ ++struct base_cqs_wait_operation_info { ++ __u64 addr; ++ __u64 val; ++ __u8 operation; ++ __u8 data_type; ++ __u8 padding[6]; ++}; ++ ++/** ++ * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information ++ * about the Timeline CQS wait objects ++ * ++ * @objs: An array of Timeline CQS waits. ++ * @nr_objs: Number of Timeline CQS waits in the array. ++ * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field ++ * to be served as the source for importing into the ++ * queue's error-state. ++ */ ++struct base_kcpu_command_cqs_wait_operation_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 inherit_err_flags; ++}; ++ ++/** ++ * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations ++ * ++ * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value ++ * to a synchronization object ++ * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value ++ * of a synchronization object ++ */ ++typedef enum { ++ BASEP_CQS_SET_OPERATION_ADD = 0, ++ BASEP_CQS_SET_OPERATION_SET = 1, ++} basep_cqs_set_operation_op; ++ ++struct base_cqs_set_operation_info { ++ __u64 addr; ++ __u64 val; ++ __u8 operation; ++ __u8 data_type; ++ __u8 padding[6]; ++}; ++ ++/** ++ * struct base_kcpu_command_cqs_set_operation_info - structure which contains information ++ * about the Timeline CQS set objects ++ * ++ * @objs: An array of Timeline CQS sets. ++ * @nr_objs: Number of Timeline CQS sets in the array. ++ * @padding: Structure padding, unused bytes. ++ */ ++struct base_kcpu_command_cqs_set_operation_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_kcpu_command_import_info - structure which contains information ++ * about the imported buffer. ++ * ++ * @handle: Address of imported user buffer. 
++ */ ++struct base_kcpu_command_import_info { ++ __u64 handle; ++}; ++ ++/** ++ * struct base_kcpu_command_jit_alloc_info - structure which contains ++ * information about jit memory allocation. ++ * ++ * @info: An array of elements of the ++ * struct base_jit_alloc_info type. ++ * @count: The number of elements in the info array. ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_jit_alloc_info { ++ __u64 info; ++ __u8 count; ++ __u8 padding[7]; ++}; ++ ++/** ++ * struct base_kcpu_command_jit_free_info - structure which contains ++ * information about jit memory which is to be freed. ++ * ++ * @ids: An array containing the JIT IDs to free. ++ * @count: The number of elements in the ids array. ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_jit_free_info { ++ __u64 ids; ++ __u8 count; ++ __u8 padding[7]; ++}; ++ ++/** ++ * struct base_kcpu_command_group_suspend_info - structure which contains ++ * suspend buffer data captured for a suspended queue group. ++ * ++ * @buffer: Pointer to an array of elements of the type char. ++ * @size: Number of elements in the @buffer array. ++ * @group_handle: Handle to the mapping of CSG. ++ * @padding: padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_group_suspend_info { ++ __u64 buffer; ++ __u32 size; ++ __u8 group_handle; ++ __u8 padding[3]; ++}; ++ ++ ++/** ++ * struct base_kcpu_command - kcpu command. ++ * @type: type of the kcpu command, one enum base_kcpu_command_type ++ * @padding: padding to a multiple of 64 bits ++ * @info: structure which contains information about the kcpu command; ++ * actual type is determined by @p type ++ * @info.fence: Fence ++ * @info.cqs_wait: CQS wait ++ * @info.cqs_set: CQS set ++ * @info.cqs_wait_operation: CQS wait operation ++ * @info.cqs_set_operation: CQS set operation ++ * @info.import: import ++ * @info.jit_alloc: JIT allocation ++ * @info.jit_free: JIT deallocation ++ * @info.suspend_buf_copy: suspend buffer copy ++ * @info.sample_time: sample time ++ * @info.padding: padding ++ */ ++struct base_kcpu_command { ++ __u8 type; ++ __u8 padding[sizeof(__u64) - sizeof(__u8)]; ++ union { ++ struct base_kcpu_command_fence_info fence; ++ struct base_kcpu_command_cqs_wait_info cqs_wait; ++ struct base_kcpu_command_cqs_set_info cqs_set; ++ struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; ++ struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; ++ struct base_kcpu_command_import_info import; ++ struct base_kcpu_command_jit_alloc_info jit_alloc; ++ struct base_kcpu_command_jit_free_info jit_free; ++ struct base_kcpu_command_group_suspend_info suspend_buf_copy; ++ __u64 padding[2]; /* No sub-struct should be larger */ ++ } info; ++}; ++ ++/** ++ * struct basep_cs_stream_control - CSI capabilities. ++ * ++ * @features: Features of this stream ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct basep_cs_stream_control { ++ __u32 features; ++ __u32 padding; ++}; ++ ++/** ++ * struct basep_cs_group_control - CSG interface capabilities. ++ * ++ * @features: Features of this group ++ * @stream_num: Number of streams in this group ++ * @suspend_size: Size in bytes of the suspend buffer for this group ++ * @padding: Padding to a multiple of 64 bits. 
++ */ ++struct basep_cs_group_control { ++ __u32 features; ++ __u32 stream_num; ++ __u32 suspend_size; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault ++ * error information associated with GPU command queue group. ++ * ++ * @sideband: Additional information of the unrecoverable fault. ++ * @status: Unrecoverable fault information. ++ * This consists of exception type (least significant byte) and ++ * data (remaining bytes). One example of exception type is ++ * CS_INVALID_INSTRUCTION (0x49). ++ * @padding: Padding to make multiple of 64bits ++ */ ++struct base_gpu_queue_group_error_fatal_payload { ++ __u64 sideband; ++ __u32 status; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault ++ * error information related to GPU command queue. ++ * ++ * @sideband: Additional information about this unrecoverable fault. ++ * @status: Unrecoverable fault information. ++ * This consists of exception type (least significant byte) and ++ * data (remaining bytes). One example of exception type is ++ * CS_INVALID_INSTRUCTION (0x49). ++ * @csi_index: Index of the CSF interface the queue is bound to. ++ * @padding: Padding to make multiple of 64bits ++ */ ++struct base_gpu_queue_error_fatal_payload { ++ __u64 sideband; ++ __u32 status; ++ __u8 csi_index; ++ __u8 padding[3]; ++}; ++ ++/** ++ * enum base_gpu_queue_group_error_type - GPU Fatal error type. ++ * ++ * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU ++ * command queue group. ++ * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU ++ * command queue. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with ++ * progress timeout. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out ++ * of tiler heap memory. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types ++ * ++ * This type is used for &struct_base_gpu_queue_group_error.error_type. ++ */ ++enum base_gpu_queue_group_error_type { ++ BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, ++ BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, ++ BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, ++ BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, ++ BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT ++}; ++ ++/** ++ * struct base_gpu_queue_group_error - Unrecoverable fault information ++ * @error_type: Error type of @base_gpu_queue_group_error_type ++ * indicating which field in union payload is filled ++ * @padding: Unused bytes for 64bit boundary ++ * @payload: Input Payload ++ * @payload.fatal_group: Unrecoverable fault error associated with ++ * GPU command queue group ++ * @payload.fatal_queue: Unrecoverable fault error associated with command queue ++ */ ++struct base_gpu_queue_group_error { ++ __u8 error_type; ++ __u8 padding[7]; ++ union { ++ struct base_gpu_queue_group_error_fatal_payload fatal_group; ++ struct base_gpu_queue_error_fatal_payload fatal_queue; ++ } payload; ++}; ++ ++/** ++ * enum base_csf_notification_type - Notification type ++ * ++ * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event ++ * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal ++ * error ++ * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu ++ * queue ++ * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type ++ * ++ * This type is used for &struct_base_csf_notification.type. 
++ */ ++enum base_csf_notification_type { ++ BASE_CSF_NOTIFICATION_EVENT = 0, ++ BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, ++ BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, ++ BASE_CSF_NOTIFICATION_COUNT ++}; ++ ++/** ++ * struct base_csf_notification - Event or error notification ++ * ++ * @type: Notification type of @base_csf_notification_type ++ * @padding: Padding for 64bit boundary ++ * @payload: Input Payload ++ * @payload.align: To fit the struct into a 64-byte cache line ++ * @payload.csg_error: CSG error ++ * @payload.csg_error.handle: Handle of GPU command queue group associated with ++ * fatal error ++ * @payload.csg_error.padding: Padding ++ * @payload.csg_error.error: Unrecoverable fault error ++ * ++ */ ++struct base_csf_notification { ++ __u8 type; ++ __u8 padding[7]; ++ union { ++ struct { ++ __u8 handle; ++ __u8 padding[7]; ++ struct base_gpu_queue_group_error error; ++ } csg_error; ++ ++ __u8 align[56]; ++ } payload; ++}; ++ ++/** ++ * struct mali_base_gpu_core_props - GPU core props info ++ * ++ * @product_id: Pro specific value. ++ * @version_status: Status of the GPU release. No defined values, but starts at ++ * 0 and increases by one for each release status (alpha, beta, EAC, etc.). ++ * 4 bit values (0-15). ++ * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" ++ * release number. ++ * 8 bit values (0-255). ++ * @major_revision: Major release number of the GPU. "R" part of an "RnPn" ++ * release number. ++ * 4 bit values (0-15). ++ * @padding: padding to align to 8-byte ++ * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by ++ * clGetDeviceInfo() ++ * @log2_program_counter_size: Size of the shader program counter, in bits. ++ * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This ++ * is a bitpattern where a set bit indicates that the format is supported. ++ * Before using a texture format, it is recommended that the corresponding ++ * bit be checked. ++ * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. ++ * It is unlikely that a client will be able to allocate all of this memory ++ * for their own purposes, but this at least provides an upper bound on the ++ * memory available to the GPU. ++ * This is required for OpenCL's clGetDeviceInfo() call when ++ * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The ++ * client will not be expecting to allocate anywhere near this value. ++ */ ++struct mali_base_gpu_core_props { ++ __u32 product_id; ++ __u16 version_status; ++ __u16 minor_revision; ++ __u16 major_revision; ++ __u16 padding; ++ __u32 gpu_freq_khz_max; ++ __u32 log2_program_counter_size; ++ __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ __u64 gpu_available_memory_size; ++}; ++ ++#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ +diff --git a/src/panfrost/base/include/csf/mali_gpu_csf_registers.h b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h +new file mode 100644 +index 00000000000..17e338cb238 +--- /dev/null ++++ b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++/* ++ * This header was originally autogenerated, but it is now ok (and ++ * expected) to have to add to it. ++ */ ++ ++#ifndef _UAPI_GPU_CSF_REGISTERS_H_ ++#define _UAPI_GPU_CSF_REGISTERS_H_ ++ ++/* Only user block defines are included. HI words have been removed */ ++ ++/* CS_USER_INPUT_BLOCK register offsets */ ++#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ ++#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ ++ ++/* CS_USER_OUTPUT_BLOCK register offsets */ ++#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ ++#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ ++ ++/* USER register offsets */ ++#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ ++ ++#endif +diff --git a/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h +new file mode 100644 +index 00000000000..db7252605f0 +--- /dev/null ++++ b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h +@@ -0,0 +1,530 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_KBASE_CSF_IOCTL_H_ ++#define _UAPI_KBASE_CSF_IOCTL_H_ ++ ++#include ++#include ++ ++/* ++ * 1.0: ++ * - CSF IOCTL header separated from JM ++ * 1.1: ++ * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME ++ * - Add ioctl 54: This controls the priority setting. ++ * 1.2: ++ * - Add new CSF GPU_FEATURES register into the property structure ++ * returned by KBASE_IOCTL_GET_GPUPROPS ++ * 1.3: ++ * - Add __u32 group_uid member to ++ * &struct_kbase_ioctl_cs_queue_group_create.out ++ * 1.4: ++ * - Replace padding in kbase_ioctl_cs_get_glb_iface with ++ * instr_features member of same size ++ * 1.5: ++ * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new ++ * queue registration call with extended format for supporting CS ++ * trace configurations with CSF trace_command. ++ * 1.6: ++ * - Added new HW performance counters interface to all GPUs. 
++ * 1.7: ++ * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use ++ * 1.8: ++ * - Removed Kernel legacy HWC interface ++ * 1.9: ++ * - Reorganization of GPU-VA memory zones, including addition of ++ * FIXED_VA zone and auto-initialization of EXEC_VA zone. ++ * - Added new Base memory allocation interface ++ * 1.10: ++ * - First release of new HW performance counters interface. ++ * 1.11: ++ * - Dummy model (no mali) backend will now clear HWC values after each sample ++ * 1.12: ++ * - Added support for incremental rendering flag in CSG create call ++ */ ++ ++#define BASE_UK_VERSION_MAJOR 1 ++#define BASE_UK_VERSION_MINOR 12 ++ ++/** ++ * struct kbase_ioctl_version_check - Check version compatibility between ++ * kernel and userspace ++ * ++ * @major: Major version number ++ * @minor: Minor version number ++ */ ++struct kbase_ioctl_version_check { ++ __u16 major; ++ __u16 minor; ++}; ++ ++#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ ++ _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) ++ ++/** ++ * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the ++ * base back-end ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @buffer_size: Size of the buffer in bytes ++ * @priority: Priority of the queue within a group when run within a process ++ * @padding: Currently unused, must be zero ++ * ++ * Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. ++ * Any change of this struct should also be mirrored to the latter. ++ */ ++struct kbase_ioctl_cs_queue_register { ++ __u64 buffer_gpu_addr; ++ __u32 buffer_size; ++ __u8 priority; ++ __u8 padding[3]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_REGISTER \ ++ _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) ++ ++/** ++ * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler ++ * to notify that a queue has been updated ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ */ ++struct kbase_ioctl_cs_queue_kick { ++ __u64 buffer_gpu_addr; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_KICK \ ++ _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) ++ ++/** ++ * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group ++ * ++ * @in: Input parameters ++ * @in.buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @in.group_handle: Handle of the group to which the queue should be bound ++ * @in.csi_index: Index of the CSF interface the queue should be bound to ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.mmap_handle: Handle to be used for creating the mapping of CS ++ * input/output pages ++ */ ++union kbase_ioctl_cs_queue_bind { ++ struct { ++ __u64 buffer_gpu_addr; ++ __u8 group_handle; ++ __u8 csi_index; ++ __u8 padding[6]; ++ } in; ++ struct { ++ __u64 mmap_handle; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_BIND \ ++ _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) ++ ++/** ++ * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the ++ * base back-end in extended format, ++ * involving trace buffer configuration ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @buffer_size: Size of the buffer in bytes ++ * @priority: Priority of the queue within a group when run within a process ++ * @padding: Currently unused, must be zero ++ * @ex_offset_var_addr: GPU address of the trace buffer write offset variable ++ * @ex_buffer_base: Trace buffer GPU base address for the 
queue ++ * @ex_buffer_size: Size of the trace buffer in bytes ++ * @ex_event_size: Trace event write size, in log2 designation ++ * @ex_event_state: Trace event states configuration ++ * @ex_padding: Currently unused, must be zero ++ * ++ * Note: There is an identical sub-section at the start of this struct to that ++ * of @ref kbase_ioctl_cs_queue_register. Any change of this sub-section ++ * must also be mirrored to the latter. Following the said sub-section, ++ * the remaining fields forms the extension, marked with ex_*. ++ */ ++struct kbase_ioctl_cs_queue_register_ex { ++ __u64 buffer_gpu_addr; ++ __u32 buffer_size; ++ __u8 priority; ++ __u8 padding[3]; ++ __u64 ex_offset_var_addr; ++ __u64 ex_buffer_base; ++ __u32 ex_buffer_size; ++ __u8 ex_event_size; ++ __u8 ex_event_state; ++ __u8 ex_padding[2]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ ++ _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) ++ ++/** ++ * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ */ ++struct kbase_ioctl_cs_queue_terminate { ++ __u64 buffer_gpu_addr; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ ++ _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) ++ ++/** ++ * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue ++ * group ++ * @in: Input parameters ++ * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. ++ * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. ++ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. ++ * @in.cs_min: Minimum number of CSs required. ++ * @in.priority: Queue group's priority within a process. ++ * @in.tiler_max: Maximum number of tiler endpoints the group is allowed ++ * to use. ++ * @in.fragment_max: Maximum number of fragment endpoints the group is ++ * allowed to use. ++ * @in.compute_max: Maximum number of compute endpoints the group is allowed ++ * to use. ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.group_handle: Handle of a newly created queue group. ++ * @out.padding: Currently unused, must be zero ++ * @out.group_uid: UID of the queue group available to base. ++ */ ++union kbase_ioctl_cs_queue_group_create_1_6 { ++ struct { ++ __u64 tiler_mask; ++ __u64 fragment_mask; ++ __u64 compute_mask; ++ __u8 cs_min; ++ __u8 priority; ++ __u8 tiler_max; ++ __u8 fragment_max; ++ __u8 compute_max; ++ __u8 padding[3]; ++ ++ } in; ++ struct { ++ __u8 group_handle; ++ __u8 padding[3]; ++ __u32 group_uid; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ ++ _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) ++ ++/** ++ * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group ++ * @in: Input parameters ++ * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. ++ * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. ++ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. ++ * @in.cs_min: Minimum number of CSs required. ++ * @in.priority: Queue group's priority within a process. ++ * @in.tiler_max: Maximum number of tiler endpoints the group is allowed ++ * to use. ++ * @in.fragment_max: Maximum number of fragment endpoints the group is ++ * allowed to use. ++ * @in.compute_max: Maximum number of compute endpoints the group is allowed ++ * to use. 
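/*
 * Illustrative sketch only: the queue setup sequence suggested by the
 * ioctls above - register the ring buffer, bind it to an existing queue
 * group, then kick it. The include arrangement, the already-open kbase
 * fd and the pre-created group handle are assumptions; error handling
 * and the mmap() of the returned mmap_handle are omitted.
 */
#include <sys/ioctl.h>
#include <linux/types.h>
#include "mali_kbase_ioctl.h"   /* assumed to pull in the CSF ioctl header */

static int example_setup_cs_queue(int kbase_fd, __u64 ringbuf_gpu_va,
                                  __u32 ringbuf_size, __u8 group_handle,
                                  __u8 csi_index, __u64 *mmap_handle)
{
    struct kbase_ioctl_cs_queue_register reg = {
        .buffer_gpu_addr = ringbuf_gpu_va,
        .buffer_size = ringbuf_size,
        /* priority and padding stay zero */
    };
    union kbase_ioctl_cs_queue_bind bind = { .in = {
        .buffer_gpu_addr = ringbuf_gpu_va,
        .group_handle = group_handle,
        .csi_index = csi_index,
    } };
    struct kbase_ioctl_cs_queue_kick kick = {
        .buffer_gpu_addr = ringbuf_gpu_va,
    };

    if (ioctl(kbase_fd, KBASE_IOCTL_CS_QUEUE_REGISTER, &reg) < 0)
        return -1;
    if (ioctl(kbase_fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind) < 0)
        return -1;
    *mmap_handle = bind.out.mmap_handle;

    return ioctl(kbase_fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick);
}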
++ * @in.csi_handlers: Flags to signal that the application intends to use CSI ++ * exception handlers in some linear buffers to deal with ++ * the given exception types. ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.group_handle: Handle of a newly created queue group. ++ * @out.padding: Currently unused, must be zero ++ * @out.group_uid: UID of the queue group available to base. ++ */ ++union kbase_ioctl_cs_queue_group_create { ++ struct { ++ __u64 tiler_mask; ++ __u64 fragment_mask; ++ __u64 compute_mask; ++ __u8 cs_min; ++ __u8 priority; ++ __u8 tiler_max; ++ __u8 fragment_max; ++ __u8 compute_max; ++ __u8 csi_handlers; ++ __u8 padding[2]; ++ /** ++ * @in.reserved: Reserved ++ */ ++ __u64 reserved; ++ } in; ++ struct { ++ __u8 group_handle; ++ __u8 padding[3]; ++ __u32 group_uid; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ ++ _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) ++ ++/** ++ * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group ++ * ++ * @group_handle: Handle of the queue group to be terminated ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_cs_queue_group_term { ++ __u8 group_handle; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ ++ _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) ++ ++#define KBASE_IOCTL_CS_EVENT_SIGNAL \ ++ _IO(KBASE_IOCTL_TYPE, 44) ++ ++typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue ++ * ++ * @id: ID of the new command queue returned by the kernel ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_new { ++ base_kcpu_queue_id id; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ ++ _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue ++ * ++ * @id: ID of the command queue to be destroyed ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_delete { ++ base_kcpu_queue_id id; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ ++ _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue ++ * ++ * @addr: Memory address of an array of struct base_kcpu_queue_command ++ * @nr_commands: Number of commands in the array ++ * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_enqueue { ++ __u64 addr; ++ __u32 nr_commands; ++ base_kcpu_queue_id id; ++ __u8 padding[3]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ ++ _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) ++ ++/** ++ * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap ++ * @in: Input parameters ++ * @in.chunk_size: Size of each chunk. ++ * @in.initial_chunks: Initial number of chunks that heap will be created with. ++ * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. ++ * @in.target_in_flight: Number of render-passes that the driver should attempt to ++ * keep in flight for which allocation of new chunks is ++ * allowed. 
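/*
 * Illustrative sketch only: creating and later terminating a queue group
 * with the structures above. The endpoint masks and maximum counts are
 * placeholders; real values depend on the GPU topology reported by the
 * kernel. Fields left out of the initializer (csi_handlers, padding,
 * reserved) stay zero, as the comments above require.
 */
#include <sys/ioctl.h>
#include <linux/types.h>
#include "mali_kbase_ioctl.h"   /* assumed to pull in the CSF ioctl header */

static int example_create_queue_group(int kbase_fd, __u8 *group_handle)
{
    union kbase_ioctl_cs_queue_group_create create = { .in = {
        .tiler_mask = ~0ULL,     /* placeholder: allow every tiler endpoint */
        .fragment_mask = ~0ULL,
        .compute_mask = ~0ULL,
        .cs_min = 1,
        .priority = 0,
        .tiler_max = 64,         /* placeholder maximums */
        .fragment_max = 64,
        .compute_max = 64,
    } };

    if (ioctl(kbase_fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE, &create) < 0)
        return -1;

    *group_handle = create.out.group_handle;
    return 0;
}

static int example_destroy_queue_group(int kbase_fd, __u8 group_handle)
{
    struct kbase_ioctl_cs_queue_group_term term = {
        .group_handle = group_handle,
    };

    return ioctl(kbase_fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term);
}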
++ * @in.group_id: Group ID to be used for physical allocations. ++ * @in.padding: Padding ++ * @out: Output parameters ++ * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up ++ * for the heap. ++ * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, ++ * actually points to the header of heap chunk and not to ++ * the low address of free memory in the chunk. ++ */ ++union kbase_ioctl_cs_tiler_heap_init { ++ struct { ++ __u32 chunk_size; ++ __u32 initial_chunks; ++ __u32 max_chunks; ++ __u16 target_in_flight; ++ __u8 group_id; ++ __u8 padding; ++ } in; ++ struct { ++ __u64 gpu_heap_va; ++ __u64 first_chunk_va; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ ++ _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) ++ ++/** ++ * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap ++ * instance ++ * ++ * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. ++ */ ++struct kbase_ioctl_cs_tiler_heap_term { ++ __u64 gpu_heap_va; ++}; ++ ++#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ ++ _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) ++ ++/** ++ * union kbase_ioctl_cs_get_glb_iface - Request the global control block ++ * of CSF interface capabilities ++ * ++ * @in: Input parameters ++ * @in.max_group_num: The maximum number of groups to be read. Can be 0, in ++ * which case groups_ptr is unused. ++ * @in.max_total_stream_num: The maximum number of CSs to be read. Can be 0, in ++ * which case streams_ptr is unused. ++ * @in.groups_ptr: Pointer where to store all the group data (sequentially). ++ * @in.streams_ptr: Pointer where to store all the CS data (sequentially). ++ * @out: Output parameters ++ * @out.glb_version: Global interface version. ++ * @out.features: Bit mask of features (e.g. whether certain types of job ++ * can be suspended). ++ * @out.group_num: Number of CSGs supported. ++ * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 ++ * hold the size of firmware performance counter data ++ * and 15:0 hold the size of hardware performance counter ++ * data. ++ * @out.total_stream_num: Total number of CSs, summed across all groups. ++ * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum ++ * size of events. Bits 3:0 hold the offset update rate. 
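/*
 * Illustrative sketch only: setting up and tearing down a chunked tiler
 * heap with the union above. The chunk geometry shown is an arbitrary
 * example, not a recommended configuration; the include arrangement and
 * kbase fd are assumptions.
 */
#include <sys/ioctl.h>
#include <linux/types.h>
#include "mali_kbase_ioctl.h"   /* assumed to pull in the CSF ioctl header */

static int example_tiler_heap_init(int kbase_fd, __u64 *heap_ctx_va,
                                   __u64 *first_chunk_va)
{
    union kbase_ioctl_cs_tiler_heap_init heap = { .in = {
        .chunk_size = 1u << 21,   /* example: 2 MiB chunks */
        .initial_chunks = 5,
        .max_chunks = 200,
        .target_in_flight = 2,
        /* group_id and padding stay zero */
    } };

    if (ioctl(kbase_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &heap) < 0)
        return -1;

    *heap_ctx_va = heap.out.gpu_heap_va;
    *first_chunk_va = heap.out.first_chunk_va;
    return 0;
}

static int example_tiler_heap_term(int kbase_fd, __u64 heap_ctx_va)
{
    struct kbase_ioctl_cs_tiler_heap_term term = {
        .gpu_heap_va = heap_ctx_va,
    };

    return ioctl(kbase_fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term);
}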
++ * (csf >= 1.1.0) ++ * ++ */ ++union kbase_ioctl_cs_get_glb_iface { ++ struct { ++ __u32 max_group_num; ++ __u32 max_total_stream_num; ++ __u64 groups_ptr; ++ __u64 streams_ptr; ++ } in; ++ struct { ++ __u32 glb_version; ++ __u32 features; ++ __u32 group_num; ++ __u32 prfcnt_size; ++ __u32 total_stream_num; ++ __u32 instr_features; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_GET_GLB_IFACE \ ++ _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) ++ ++struct kbase_ioctl_cs_cpu_queue_info { ++ __u64 buffer; ++ __u64 size; ++}; ++ ++#define KBASE_IOCTL_VERSION_CHECK \ ++ _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) ++ ++#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ ++ _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) ++ ++/** ++ * union kbase_ioctl_mem_alloc_ex - Allocate memory on the GPU ++ * @in: Input parameters ++ * @in.va_pages: The number of pages of virtual address space to reserve ++ * @in.commit_pages: The number of physical pages to allocate ++ * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region ++ * @in.flags: Flags ++ * @in.fixed_address: The GPU virtual address requested for the allocation, ++ * if the allocation is using the BASE_MEM_FIXED flag. ++ * @in.extra: Space for extra parameters that may be added in the future. ++ * @out: Output parameters ++ * @out.flags: Flags ++ * @out.gpu_va: The GPU virtual address which is allocated ++ */ ++union kbase_ioctl_mem_alloc_ex { ++ struct { ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u64 flags; ++ __u64 fixed_address; ++ __u64 extra[3]; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_ALLOC_EX _IOWR(KBASE_IOCTL_TYPE, 59, union kbase_ioctl_mem_alloc_ex) ++ ++/*************** ++ * test ioctls * ++ ***************/ ++#if MALI_UNIT_TEST ++/* These ioctls are purely for test purposes and are not used in the production ++ * driver, they therefore may change without notice ++ */ ++ ++/** ++ * struct kbase_ioctl_cs_event_memory_write - Write an event memory address ++ * @cpu_addr: Memory address to write ++ * @value: Value to write ++ * @padding: Currently unused, must be zero ++ */ ++struct kbase_ioctl_cs_event_memory_write { ++ __u64 cpu_addr; ++ __u8 value; ++ __u8 padding[7]; ++}; ++ ++/** ++ * union kbase_ioctl_cs_event_memory_read - Read an event memory address ++ * @in: Input parameters ++ * @in.cpu_addr: Memory address to read ++ * @out: Output parameters ++ * @out.value: Value read ++ * @out.padding: Currently unused, must be zero ++ */ ++union kbase_ioctl_cs_event_memory_read { ++ struct { ++ __u64 cpu_addr; ++ } in; ++ struct { ++ __u8 value; ++ __u8 padding[7]; ++ } out; ++}; ++ ++#endif /* MALI_UNIT_TEST */ ++ ++#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ +diff --git a/src/panfrost/base/include/jm/mali_base_jm_kernel.h b/src/panfrost/base/include/jm/mali_base_jm_kernel.h +new file mode 100644 +index 00000000000..ae43908b936 +--- /dev/null ++++ b/src/panfrost/base/include/jm/mali_base_jm_kernel.h +@@ -0,0 +1,1051 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_BASE_JM_KERNEL_H_ ++#define _UAPI_BASE_JM_KERNEL_H_ ++ ++#include ++#include "../mali_base_common_kernel.h" ++ ++/* Memory allocation, access/hint flags & mask specific to JM GPU. ++ * ++ * See base_mem_alloc_flags. ++ */ ++ ++/* Used as BASE_MEM_FIXED in other backends */ ++#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) ++ ++/** ++ * BASE_MEM_RESERVED_BIT_19 - Bit 19 is reserved. ++ * ++ * Do not remove, use the next unreserved bit for new flags ++ */ ++#define BASE_MEM_RESERVED_BIT_19 ((base_mem_alloc_flags)1 << 19) ++ ++/** ++ * BASE_MEM_TILER_ALIGN_TOP - Memory starting from the end of the initial commit is aligned ++ * to 'extension' pages, where 'extension' must be a power of 2 and no more than ++ * BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES ++ */ ++#define BASE_MEM_TILER_ALIGN_TOP ((base_mem_alloc_flags)1 << 20) ++ ++/* Use the GPU VA chosen by the kernel client */ ++#define BASE_MEM_FLAG_MAP_FIXED ((base_mem_alloc_flags)1 << 27) ++ ++/* Force trimming of JIT allocations when creating a new allocation */ ++#define BASEP_MEM_PERFORM_JIT_TRIM ((base_mem_alloc_flags)1 << 29) ++ ++/* Note that the number of bits used for base_mem_alloc_flags ++ * must be less than BASE_MEM_FLAGS_NR_BITS !!! ++ */ ++ ++/* A mask of all the flags which are only valid for allocations within kbase, ++ * and may not be passed from user space. ++ */ ++#define BASEP_MEM_FLAGS_KERNEL_ONLY \ ++ (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE | \ ++ BASE_MEM_FLAG_MAP_FIXED | BASEP_MEM_PERFORM_JIT_TRIM) ++ ++/* A mask of all currently reserved flags ++ */ ++#define BASE_MEM_FLAGS_RESERVED \ ++ (BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_19) ++ ++ ++/* Similar to BASE_MEM_TILER_ALIGN_TOP, memory starting from the end of the ++ * initial commit is aligned to 'extension' pages, where 'extension' must be a power ++ * of 2 and no more than BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES ++ */ ++#define BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP (1 << 0) ++ ++/** ++ * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE - If set, the heap info address points ++ * to a __u32 holding the used size in bytes; ++ * otherwise it points to a __u64 holding the lowest address of unused memory. ++ */ ++#define BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE (1 << 1) ++ ++/** ++ * BASE_JIT_ALLOC_VALID_FLAGS - Valid set of just-in-time memory allocation flags ++ * ++ * Note: BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE cannot be set if heap_info_gpu_addr ++ * in %base_jit_alloc_info is 0 (atom with BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE set ++ * and heap_info_gpu_addr being 0 will be rejected). ++ */ ++#define BASE_JIT_ALLOC_VALID_FLAGS \ ++ (BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP | BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE) ++ ++/* Bitpattern describing the ::base_context_create_flags that can be ++ * passed to base_context_init() ++ */ ++#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ ++ (BASE_CONTEXT_CCTX_EMBEDDED | BASEP_CONTEXT_CREATE_KERNEL_FLAGS) ++ ++/* ++ * Private flags used on the base context ++ * ++ * These start at bit 31, and run down to zero. 
++ * ++ * They share the same space as base_context_create_flags, and so must ++ * not collide with them. ++ */ ++ ++/* Private flag tracking whether job descriptor dumping is disabled */ ++#define BASEP_CONTEXT_FLAG_JOB_DUMP_DISABLED \ ++ ((base_context_create_flags)(1 << 31)) ++ ++/* Flags for base tracepoint specific to JM */ ++#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ ++ BASE_TLSTREAM_JOB_DUMPING_ENABLED) ++/* ++ * Dependency stuff, keep it private for now. May want to expose it if ++ * we decide to make the number of semaphores a configurable ++ * option. ++ */ ++#define BASE_JD_ATOM_COUNT 256 ++ ++/* Maximum number of concurrent render passes. ++ */ ++#define BASE_JD_RP_COUNT (256) ++ ++/* Set/reset values for a software event */ ++#define BASE_JD_SOFT_EVENT_SET ((unsigned char)1) ++#define BASE_JD_SOFT_EVENT_RESET ((unsigned char)0) ++ ++/** ++ * struct base_jd_udata - Per-job data ++ * ++ * @blob: per-job data array ++ * ++ * This structure is used to store per-job data, and is completely unused ++ * by the Base driver. It can be used to store things such as callback ++ * function pointer, data to handle job completion. It is guaranteed to be ++ * untouched by the Base driver. ++ */ ++struct base_jd_udata { ++ __u64 blob[2]; ++}; ++ ++/** ++ * typedef base_jd_dep_type - Job dependency type. ++ * ++ * A flags field will be inserted into the atom structure to specify whether a ++ * dependency is a data or ordering dependency (by putting it before/after ++ * 'core_req' in the structure it should be possible to add without changing ++ * the structure size). ++ * When the flag is set for a particular dependency to signal that it is an ++ * ordering only dependency then errors will not be propagated. ++ */ ++typedef __u8 base_jd_dep_type; ++ ++#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ ++#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ ++#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ ++ ++/** ++ * typedef base_jd_core_req - Job chain hardware requirements. ++ * ++ * A job chain must specify what GPU features it needs to allow the ++ * driver to schedule the job correctly. By not specifying the ++ * correct settings can/will cause an early job termination. Multiple ++ * values can be ORed together to specify multiple requirements. ++ * Special case is ::BASE_JD_REQ_DEP, which is used to express complex ++ * dependencies, and that doesn't execute anything on the hardware. ++ */ ++typedef __u32 base_jd_core_req; ++ ++/* Requirements that come from the HW */ ++ ++/* No requirement, dependency only ++ */ ++#define BASE_JD_REQ_DEP ((base_jd_core_req)0) ++ ++/* Requires fragment shaders ++ */ ++#define BASE_JD_REQ_FS ((base_jd_core_req)1 << 0) ++ ++/* Requires compute shaders ++ * ++ * This covers any of the following GPU job types: ++ * - Vertex Shader Job ++ * - Geometry Shader Job ++ * - An actual Compute Shader Job ++ * ++ * Compare this with BASE_JD_REQ_ONLY_COMPUTE, which specifies that the ++ * job is specifically just the "Compute Shader" job type, and not the "Vertex ++ * Shader" nor the "Geometry Shader" job type. 
++ */ ++#define BASE_JD_REQ_CS ((base_jd_core_req)1 << 1) ++ ++/* Requires tiling */ ++#define BASE_JD_REQ_T ((base_jd_core_req)1 << 2) ++ ++/* Requires cache flushes */ ++#define BASE_JD_REQ_CF ((base_jd_core_req)1 << 3) ++ ++/* Requires value writeback */ ++#define BASE_JD_REQ_V ((base_jd_core_req)1 << 4) ++ ++/* SW-only requirements - the HW does not expose these as part of the job slot ++ * capabilities ++ */ ++ ++/* Requires fragment job with AFBC encoding */ ++#define BASE_JD_REQ_FS_AFBC ((base_jd_core_req)1 << 13) ++ ++/* SW-only requirement: coalesce completion events. ++ * If this bit is set then completion of this atom will not cause an event to ++ * be sent to userspace, whether successful or not; completion events will be ++ * deferred until an atom completes which does not have this bit set. ++ * ++ * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. ++ */ ++#define BASE_JD_REQ_EVENT_COALESCE ((base_jd_core_req)1 << 5) ++ ++/* SW Only requirement: the job chain requires a coherent core group. We don't ++ * mind which coherent core group is used. ++ */ ++#define BASE_JD_REQ_COHERENT_GROUP ((base_jd_core_req)1 << 6) ++ ++/* SW Only requirement: The performance counters should be enabled only when ++ * they are needed, to reduce power consumption. ++ */ ++#define BASE_JD_REQ_PERMON ((base_jd_core_req)1 << 7) ++ ++/* SW Only requirement: External resources are referenced by this atom. ++ * ++ * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE and ++ * BASE_JD_REQ_SOFT_EVENT_WAIT. ++ */ ++#define BASE_JD_REQ_EXTERNAL_RESOURCES ((base_jd_core_req)1 << 8) ++ ++/* SW Only requirement: Software defined job. Jobs with this bit set will not be ++ * submitted to the hardware but will cause some action to happen within the ++ * driver ++ */ ++#define BASE_JD_REQ_SOFT_JOB ((base_jd_core_req)1 << 9) ++ ++#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) ++#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) ++#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) ++ ++/* 0x4 RESERVED for now */ ++ ++/* SW only requirement: event wait/trigger job. ++ * ++ * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. ++ * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the ++ * other waiting jobs. It completes immediately. ++ * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it ++ * possible for other jobs to wait upon. It completes immediately. ++ */ ++#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) ++#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) ++#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) ++ ++#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) ++ ++/* SW only requirement: Just In Time allocation ++ * ++ * This job requests a single or multiple just-in-time allocations through a ++ * list of base_jit_alloc_info structure which is passed via the jc element of ++ * the atom. The number of base_jit_alloc_info structures present in the ++ * list is passed via the nr_extres element of the atom ++ * ++ * It should be noted that the id entry in base_jit_alloc_info must not ++ * be reused until it has been released via BASE_JD_REQ_SOFT_JIT_FREE. ++ * ++ * Should this soft job fail it is expected that a BASE_JD_REQ_SOFT_JIT_FREE ++ * soft job to free the JIT allocation is still made. ++ * ++ * The job will complete immediately. 
++ */ ++#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) ++ ++/* SW only requirement: Just In Time free ++ * ++ * This job requests a single or multiple just-in-time allocations created by ++ * BASE_JD_REQ_SOFT_JIT_ALLOC to be freed. The ID list of the just-in-time ++ * allocations is passed via the jc element of the atom. ++ * ++ * The job will complete immediately. ++ */ ++#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) ++ ++/* SW only requirement: Map external resource ++ * ++ * This job requests external resource(s) are mapped once the dependencies ++ * of the job have been satisfied. The list of external resources are ++ * passed via the jc element of the atom which is a pointer to a ++ * base_external_resource_list. ++ */ ++#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) ++ ++/* SW only requirement: Unmap external resource ++ * ++ * This job requests external resource(s) are unmapped once the dependencies ++ * of the job has been satisfied. The list of external resources are ++ * passed via the jc element of the atom which is a pointer to a ++ * base_external_resource_list. ++ */ ++#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) ++ ++/* HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) ++ * ++ * This indicates that the Job Chain contains GPU jobs of the 'Compute ++ * Shaders' type. ++ * ++ * In contrast to BASE_JD_REQ_CS, this does not indicate that the Job ++ * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. ++ */ ++#define BASE_JD_REQ_ONLY_COMPUTE ((base_jd_core_req)1 << 10) ++ ++/* HW Requirement: Use the base_jd_atom::device_nr field to specify a ++ * particular core group ++ * ++ * If both BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag ++ * takes priority ++ * ++ * This is only guaranteed to work for BASE_JD_REQ_ONLY_COMPUTE atoms. ++ */ ++#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((base_jd_core_req)1 << 11) ++ ++/* SW Flag: If this bit is set then the successful completion of this atom ++ * will not cause an event to be sent to userspace ++ */ ++#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((base_jd_core_req)1 << 12) ++ ++/* SW Flag: If this bit is set then completion of this atom will not cause an ++ * event to be sent to userspace, whether successful or not. ++ */ ++#define BASEP_JD_REQ_EVENT_NEVER ((base_jd_core_req)1 << 14) ++ ++/* SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. ++ * ++ * If this bit is set then the GPU's cache will not be cleaned and invalidated ++ * until a GPU job starts which does not have this bit set or a job completes ++ * which does not have the BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use ++ * if the CPU may have written to memory addressed by the job since the last job ++ * without this bit set was submitted. ++ */ ++#define BASE_JD_REQ_SKIP_CACHE_START ((base_jd_core_req)1 << 15) ++ ++/* SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. ++ * ++ * If this bit is set then the GPU's cache will not be cleaned and invalidated ++ * until a GPU job completes which does not have this bit set or a job starts ++ * which does not have the BASE_JD_REQ_SKIP_CACHE_START bit set. Do not use ++ * if the CPU may read from or partially overwrite memory addressed by the job ++ * before the next job without this bit set completes. ++ */ ++#define BASE_JD_REQ_SKIP_CACHE_END ((base_jd_core_req)1 << 16) ++ ++/* Request the atom be executed on a specific job slot. 
++ * ++ * When this flag is specified, it takes precedence over any existing job slot ++ * selection logic. ++ */ ++#define BASE_JD_REQ_JOB_SLOT ((base_jd_core_req)1 << 17) ++ ++/* SW-only requirement: The atom is the start of a renderpass. ++ * ++ * If this bit is set then the job chain will be soft-stopped if it causes the ++ * GPU to write beyond the end of the physical pages backing the tiler heap, and ++ * committing more memory to the heap would exceed an internal threshold. It may ++ * be resumed after running one of the job chains attached to an atom with ++ * BASE_JD_REQ_END_RENDERPASS set and the same renderpass ID. It may be ++ * resumed multiple times until it completes without memory usage exceeding the ++ * threshold. ++ * ++ * Usually used with BASE_JD_REQ_T. ++ */ ++#define BASE_JD_REQ_START_RENDERPASS ((base_jd_core_req)1 << 18) ++ ++/* SW-only requirement: The atom is the end of a renderpass. ++ * ++ * If this bit is set then the atom incorporates the CPU address of a ++ * base_jd_fragment object instead of the GPU address of a job chain. ++ * ++ * Which job chain is run depends upon whether the atom with the same renderpass ++ * ID and the BASE_JD_REQ_START_RENDERPASS bit set completed normally or ++ * was soft-stopped when it exceeded an upper threshold for tiler heap memory ++ * usage. ++ * ++ * It also depends upon whether one of the job chains attached to the atom has ++ * already been run as part of the same renderpass (in which case it would have ++ * written unresolved multisampled and otherwise-discarded output to temporary ++ * buffers that need to be read back). The job chain for doing a forced read and ++ * forced write (from/to temporary buffers) is run as many times as necessary. ++ * ++ * Usually used with BASE_JD_REQ_FS. ++ */ ++#define BASE_JD_REQ_END_RENDERPASS ((base_jd_core_req)1 << 19) ++ ++/* SW-only requirement: The atom needs to run on a limited core mask affinity. ++ * ++ * If this bit is set then the kbase_context.limited_core_mask will be applied ++ * to the affinity. ++ */ ++#define BASE_JD_REQ_LIMITED_CORE_MASK ((base_jd_core_req)1 << 20) ++ ++/* These requirement bits are currently unused in base_jd_core_req ++ */ ++#define BASEP_JD_REQ_RESERVED \ ++ (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ ++ BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | BASEP_JD_REQ_EVENT_NEVER | \ ++ BASE_JD_REQ_EVENT_COALESCE | \ ++ BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ ++ BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ ++ BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END | \ ++ BASE_JD_REQ_JOB_SLOT | BASE_JD_REQ_START_RENDERPASS | \ ++ BASE_JD_REQ_END_RENDERPASS | BASE_JD_REQ_LIMITED_CORE_MASK)) ++ ++/* Mask of all bits in base_jd_core_req that control the type of the atom. ++ * ++ * This allows dependency only atoms to have flags set ++ */ ++#define BASE_JD_REQ_ATOM_TYPE \ ++ (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ ++ BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) ++ ++/** ++ * BASE_JD_REQ_SOFT_JOB_TYPE - Mask of all bits in base_jd_core_req that ++ * controls the type of a soft job. ++ */ ++#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) ++ ++/* Returns non-zero value if core requirements passed define a soft job or ++ * a dependency only job. 
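/*
 * Small illustrative sketch (not from the kbase headers, and assuming the
 * definitions above are in scope) of how the requirement bits compose,
 * following the "usually used with" notes: a tiler atom that starts a
 * renderpass and a fragment atom that ends it. A real submission would
 * also pair the two through the same renderpass_id in the atom structure.
 */
static const base_jd_core_req example_tiler_start_rp_req =
    BASE_JD_REQ_T | BASE_JD_REQ_START_RENDERPASS;

static const base_jd_core_req example_fragment_end_rp_req =
    BASE_JD_REQ_FS | BASE_JD_REQ_END_RENDERPASS;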
++ */ ++#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ ++ (((core_req) & BASE_JD_REQ_SOFT_JOB) || \ ++ ((core_req) & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) ++ ++/** ++ * enum kbase_jd_atom_state - Atom states ++ * ++ * @KBASE_JD_ATOM_STATE_UNUSED: Atom is not used. ++ * @KBASE_JD_ATOM_STATE_QUEUED: Atom is queued in JD. ++ * @KBASE_JD_ATOM_STATE_IN_JS: Atom has been given to JS (is runnable/running). ++ * @KBASE_JD_ATOM_STATE_HW_COMPLETED: Atom has been completed, but not yet ++ * handed back to job dispatcher for ++ * dependency resolution. ++ * @KBASE_JD_ATOM_STATE_COMPLETED: Atom has been completed, but not yet handed ++ * back to userspace. ++ */ ++enum kbase_jd_atom_state { ++ KBASE_JD_ATOM_STATE_UNUSED, ++ KBASE_JD_ATOM_STATE_QUEUED, ++ KBASE_JD_ATOM_STATE_IN_JS, ++ KBASE_JD_ATOM_STATE_HW_COMPLETED, ++ KBASE_JD_ATOM_STATE_COMPLETED ++}; ++ ++/** ++ * typedef base_atom_id - Type big enough to store an atom number in. ++ */ ++typedef __u8 base_atom_id; ++ ++/** ++ * struct base_dependency - base dependency ++ * ++ * @atom_id: An atom number ++ * @dependency_type: Dependency type ++ */ ++struct base_dependency { ++ base_atom_id atom_id; ++ base_jd_dep_type dependency_type; ++}; ++ ++/** ++ * struct base_jd_fragment - Set of GPU fragment job chains used for rendering. ++ * ++ * @norm_read_norm_write: Job chain for full rendering. ++ * GPU address of a fragment job chain to render in the ++ * circumstance where the tiler job chain did not exceed ++ * its memory usage threshold and no fragment job chain ++ * was previously run for the same renderpass. ++ * It is used no more than once per renderpass. ++ * @norm_read_forced_write: Job chain for starting incremental ++ * rendering. ++ * GPU address of a fragment job chain to render in ++ * the circumstance where the tiler job chain exceeded ++ * its memory usage threshold for the first time and ++ * no fragment job chain was previously run for the ++ * same renderpass. ++ * Writes unresolved multisampled and normally- ++ * discarded output to temporary buffers that must be ++ * read back by a subsequent forced_read job chain ++ * before the renderpass is complete. ++ * It is used no more than once per renderpass. ++ * @forced_read_forced_write: Job chain for continuing incremental ++ * rendering. ++ * GPU address of a fragment job chain to render in ++ * the circumstance where the tiler job chain ++ * exceeded its memory usage threshold again ++ * and a fragment job chain was previously run for ++ * the same renderpass. ++ * Reads unresolved multisampled and ++ * normally-discarded output from temporary buffers ++ * written by a previous forced_write job chain and ++ * writes the same to temporary buffers again. ++ * It is used as many times as required until ++ * rendering completes. ++ * @forced_read_norm_write: Job chain for ending incremental rendering. ++ * GPU address of a fragment job chain to render in the ++ * circumstance where the tiler job chain did not ++ * exceed its memory usage threshold this time and a ++ * fragment job chain was previously run for the same ++ * renderpass. ++ * Reads unresolved multisampled and normally-discarded ++ * output from temporary buffers written by a previous ++ * forced_write job chain in order to complete a ++ * renderpass. ++ * It is used no more than once per renderpass. ++ * ++ * This structure is referenced by the main atom structure if ++ * BASE_JD_REQ_END_RENDERPASS is set in the base_jd_core_req. 
++ */ ++struct base_jd_fragment { ++ __u64 norm_read_norm_write; ++ __u64 norm_read_forced_write; ++ __u64 forced_read_forced_write; ++ __u64 forced_read_norm_write; ++}; ++ ++/** ++ * typedef base_jd_prio - Base Atom priority. ++ * ++ * Only certain priority levels are actually implemented, as specified by the ++ * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority ++ * level that is not one of those defined below. ++ * ++ * Priority levels only affect scheduling after the atoms have had dependencies ++ * resolved. For example, a low priority atom that has had its dependencies ++ * resolved might run before a higher priority atom that has not had its ++ * dependencies resolved. ++ * ++ * In general, fragment atoms do not affect non-fragment atoms with ++ * lower priorities, and vice versa. One exception is that there is only one ++ * priority value for each context. So a high-priority (e.g.) fragment atom ++ * could increase its context priority, causing its non-fragment atoms to also ++ * be scheduled sooner. ++ * ++ * The atoms are scheduled as follows with respect to their priorities: ++ * * Let atoms 'X' and 'Y' be for the same job slot who have dependencies ++ * resolved, and atom 'X' has a higher priority than atom 'Y' ++ * * If atom 'Y' is currently running on the HW, then it is interrupted to ++ * allow atom 'X' to run soon after ++ * * If instead neither atom 'Y' nor atom 'X' are running, then when choosing ++ * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' ++ * * Any two atoms that have the same priority could run in any order with ++ * respect to each other. That is, there is no ordering constraint between ++ * atoms of the same priority. ++ * ++ * The sysfs file 'js_ctx_scheduling_mode' is used to control how atoms are ++ * scheduled between contexts. The default value, 0, will cause higher-priority ++ * atoms to be scheduled first, regardless of their context. The value 1 will ++ * use a round-robin algorithm when deciding which context's atoms to schedule ++ * next, so higher-priority atoms can only preempt lower priority atoms within ++ * the same context. See KBASE_JS_SYSTEM_PRIORITY_MODE and ++ * KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE for more details. ++ */ ++typedef __u8 base_jd_prio; ++ ++/* Medium atom priority. This is a priority higher than BASE_JD_PRIO_LOW */ ++#define BASE_JD_PRIO_MEDIUM ((base_jd_prio)0) ++/* High atom priority. This is a priority higher than BASE_JD_PRIO_MEDIUM and ++ * BASE_JD_PRIO_LOW ++ */ ++#define BASE_JD_PRIO_HIGH ((base_jd_prio)1) ++/* Low atom priority. */ ++#define BASE_JD_PRIO_LOW ((base_jd_prio)2) ++/* Real-Time atom priority. This is a priority higher than BASE_JD_PRIO_HIGH, ++ * BASE_JD_PRIO_MEDIUM, and BASE_JD_PRIO_LOW ++ */ ++#define BASE_JD_PRIO_REALTIME ((base_jd_prio)3) ++ ++/* Invalid atom priority (max uint8_t value) */ ++#define BASE_JD_PRIO_INVALID ((base_jd_prio)255) ++ ++/* Count of the number of priority levels. This itself is not a valid ++ * base_jd_prio setting ++ */ ++#define BASE_JD_NR_PRIO_LEVELS 4 ++ ++/** ++ * struct base_jd_atom_v2 - Node of a dependency graph used to submit a ++ * GPU job chain or soft-job to the kernel driver. ++ * ++ * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS ++ * is set in the base_jd_core_req) the CPU address of a ++ * base_jd_fragment object. ++ * @udata: User data. ++ * @extres_list: List of external resources. ++ * @nr_extres: Number of external resources or JIT allocations. 
++ * @jit_id: Zero-terminated array of IDs of just-in-time memory ++ * allocations written to by the atom. When the atom ++ * completes, the value stored at the ++ * &struct_base_jit_alloc_info.heap_info_gpu_addr of ++ * each allocation is read in order to enforce an ++ * overall physical memory usage limit. ++ * @pre_dep: Pre-dependencies. One need to use SETTER function to assign ++ * this field; this is done in order to reduce possibility of ++ * improper assignment of a dependency field. ++ * @atom_number: Unique number to identify the atom. ++ * @prio: Atom priority. Refer to base_jd_prio for more details. ++ * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ++ * specified. ++ * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. ++ * @core_req: Core requirements. ++ * @renderpass_id: Renderpass identifier used to associate an atom that has ++ * BASE_JD_REQ_START_RENDERPASS set in its core requirements ++ * with an atom that has BASE_JD_REQ_END_RENDERPASS set. ++ * @padding: Unused. Must be zero. ++ * ++ * This structure has changed since UK 10.2 for which base_jd_core_req was a ++ * __u16 value. ++ * ++ * In UK 10.3 a core_req field of a __u32 type was added to the end of the ++ * structure, and the place in the structure previously occupied by __u16 ++ * core_req was kept but renamed to compat_core_req. ++ * ++ * From UK 11.20 - compat_core_req is now occupied by __u8 jit_id[2]. ++ * Compatibility with UK 10.x from UK 11.y is not handled because ++ * the major version increase prevents this. ++ * ++ * For UK 11.20 jit_id[2] must be initialized to zero. ++ */ ++struct base_jd_atom_v2 { ++ __u64 jc; ++ struct base_jd_udata udata; ++ __u64 extres_list; ++ __u16 nr_extres; ++ __u8 jit_id[2]; ++ struct base_dependency pre_dep[2]; ++ base_atom_id atom_number; ++ base_jd_prio prio; ++ __u8 device_nr; ++ __u8 jobslot; ++ base_jd_core_req core_req; ++ __u8 renderpass_id; ++ __u8 padding[7]; ++}; ++ ++/** ++ * struct base_jd_atom - Same as base_jd_atom_v2, but has an extra seq_nr ++ * at the beginning. ++ * ++ * @seq_nr: Sequence number of logical grouping of atoms. ++ * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS ++ * is set in the base_jd_core_req) the CPU address of a ++ * base_jd_fragment object. ++ * @udata: User data. ++ * @extres_list: List of external resources. ++ * @nr_extres: Number of external resources or JIT allocations. ++ * @jit_id: Zero-terminated array of IDs of just-in-time memory ++ * allocations written to by the atom. When the atom ++ * completes, the value stored at the ++ * &struct_base_jit_alloc_info.heap_info_gpu_addr of ++ * each allocation is read in order to enforce an ++ * overall physical memory usage limit. ++ * @pre_dep: Pre-dependencies. One need to use SETTER function to assign ++ * this field; this is done in order to reduce possibility of ++ * improper assignment of a dependency field. ++ * @atom_number: Unique number to identify the atom. ++ * @prio: Atom priority. Refer to base_jd_prio for more details. ++ * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ++ * specified. ++ * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. ++ * @core_req: Core requirements. ++ * @renderpass_id: Renderpass identifier used to associate an atom that has ++ * BASE_JD_REQ_START_RENDERPASS set in its core requirements ++ * with an atom that has BASE_JD_REQ_END_RENDERPASS set. ++ * @padding: Unused. Must be zero. 
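/*
 * Illustrative sketch only, assuming the definitions above are in scope:
 * filling in a base_jd_atom_v2 for a plain hardware job chain. The GPU
 * address and atom number are placeholders. Zero-initialising the whole
 * structure first covers jit_id[] and padding, which the comments above
 * require to be zero.
 */
#include <string.h>

static void example_fill_atom(struct base_jd_atom_v2 *atom,
                              __u64 jc_gpu_va, base_atom_id number)
{
    memset(atom, 0, sizeof(*atom));

    atom->jc = jc_gpu_va;            /* GPU address of the job chain */
    atom->atom_number = number;      /* unique among in-flight atoms */
    atom->core_req = BASE_JD_REQ_FS; /* e.g. a fragment job chain */
    atom->prio = BASE_JD_PRIO_MEDIUM;
}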
++ */ ++typedef struct base_jd_atom { ++ __u64 seq_nr; ++ __u64 jc; ++ struct base_jd_udata udata; ++ __u64 extres_list; ++ __u16 nr_extres; ++ __u8 jit_id[2]; ++ struct base_dependency pre_dep[2]; ++ base_atom_id atom_number; ++ base_jd_prio prio; ++ __u8 device_nr; ++ __u8 jobslot; ++ base_jd_core_req core_req; ++ __u8 renderpass_id; ++ __u8 padding[7]; ++} base_jd_atom; ++ ++/* Job chain event code bits ++ * Defines the bits used to create ::base_jd_event_code ++ */ ++enum { ++ BASE_JD_SW_EVENT_KERNEL = (1u << 15), /* Kernel side event */ ++ BASE_JD_SW_EVENT = (1u << 14), /* SW defined event */ ++ /* Event indicates success (SW events only) */ ++ BASE_JD_SW_EVENT_SUCCESS = (1u << 13), ++ BASE_JD_SW_EVENT_JOB = (0u << 11), /* Job related event */ ++ BASE_JD_SW_EVENT_BAG = (1u << 11), /* Bag related event */ ++ BASE_JD_SW_EVENT_INFO = (2u << 11), /* Misc/info event */ ++ BASE_JD_SW_EVENT_RESERVED = (3u << 11), /* Reserved event type */ ++ /* Mask to extract the type from an event code */ ++ BASE_JD_SW_EVENT_TYPE_MASK = (3u << 11) ++}; ++ ++/** ++ * enum base_jd_event_code - Job chain event codes ++ * ++ * @BASE_JD_EVENT_RANGE_HW_NONFAULT_START: Start of hardware non-fault status ++ * codes. ++ * Obscurely, BASE_JD_EVENT_TERMINATED ++ * indicates a real fault, because the ++ * job was hard-stopped. ++ * @BASE_JD_EVENT_NOT_STARTED: Can't be seen by userspace, treated as ++ * 'previous job done'. ++ * @BASE_JD_EVENT_STOPPED: Can't be seen by userspace, becomes ++ * TERMINATED, DONE or JOB_CANCELLED. ++ * @BASE_JD_EVENT_TERMINATED: This is actually a fault status code - the job ++ * was hard stopped. ++ * @BASE_JD_EVENT_ACTIVE: Can't be seen by userspace, jobs only returned on ++ * complete/fail/cancel. ++ * @BASE_JD_EVENT_RANGE_HW_NONFAULT_END: End of hardware non-fault status codes. ++ * Obscurely, BASE_JD_EVENT_TERMINATED ++ * indicates a real fault, ++ * because the job was hard-stopped. ++ * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START: Start of hardware fault and ++ * software error status codes. ++ * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END: End of hardware fault and ++ * software error status codes. ++ * @BASE_JD_EVENT_RANGE_SW_SUCCESS_START: Start of software success status ++ * codes. ++ * @BASE_JD_EVENT_RANGE_SW_SUCCESS_END: End of software success status codes. ++ * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_START: Start of kernel-only status codes. ++ * Such codes are never returned to ++ * user-space. ++ * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_END: End of kernel-only status codes. ++ * @BASE_JD_EVENT_DONE: atom has completed successfull ++ * @BASE_JD_EVENT_JOB_CONFIG_FAULT: Atom dependencies configuration error which ++ * shall result in a failed atom ++ * @BASE_JD_EVENT_JOB_POWER_FAULT: The job could not be executed because the ++ * part of the memory system required to access ++ * job descriptors was not powered on ++ * @BASE_JD_EVENT_JOB_READ_FAULT: Reading a job descriptor into the Job ++ * manager failed ++ * @BASE_JD_EVENT_JOB_WRITE_FAULT: Writing a job descriptor from the Job ++ * manager failed ++ * @BASE_JD_EVENT_JOB_AFFINITY_FAULT: The job could not be executed because the ++ * specified affinity mask does not intersect ++ * any available cores ++ * @BASE_JD_EVENT_JOB_BUS_FAULT: A bus access failed while executing a job ++ * @BASE_JD_EVENT_INSTR_INVALID_PC: A shader instruction with an illegal program ++ * counter was executed. ++ * @BASE_JD_EVENT_INSTR_INVALID_ENC: A shader instruction with an illegal ++ * encoding was executed. 
++ * @BASE_JD_EVENT_INSTR_TYPE_MISMATCH: A shader instruction was executed where ++ * the instruction encoding did not match the ++ * instruction type encoded in the program ++ * counter. ++ * @BASE_JD_EVENT_INSTR_OPERAND_FAULT: A shader instruction was executed that ++ * contained invalid combinations of operands. ++ * @BASE_JD_EVENT_INSTR_TLS_FAULT: A shader instruction was executed that tried ++ * to access the thread local storage section ++ * of another thread. ++ * @BASE_JD_EVENT_INSTR_ALIGN_FAULT: A shader instruction was executed that ++ * tried to do an unsupported unaligned memory ++ * access. ++ * @BASE_JD_EVENT_INSTR_BARRIER_FAULT: A shader instruction was executed that ++ * failed to complete an instruction barrier. ++ * @BASE_JD_EVENT_DATA_INVALID_FAULT: Any data structure read as part of the job ++ * contains invalid combinations of data. ++ * @BASE_JD_EVENT_TILE_RANGE_FAULT: Tile or fragment shading was asked to ++ * process a tile that is entirely outside the ++ * bounding box of the frame. ++ * @BASE_JD_EVENT_STATE_FAULT: Matches ADDR_RANGE_FAULT. A virtual address ++ * has been found that exceeds the virtual ++ * address range. ++ * @BASE_JD_EVENT_OUT_OF_MEMORY: The tiler ran out of memory when executing a job. ++ * @BASE_JD_EVENT_UNKNOWN: If multiple jobs in a job chain fail, only ++ * the first one the reports an error will set ++ * and return full error information. ++ * Subsequent failing jobs will not update the ++ * error status registers, and may write an ++ * error status of UNKNOWN. ++ * @BASE_JD_EVENT_DELAYED_BUS_FAULT: The GPU received a bus fault for access to ++ * physical memory where the original virtual ++ * address is no longer available. ++ * @BASE_JD_EVENT_SHAREABILITY_FAULT: Matches GPU_SHAREABILITY_FAULT. A cache ++ * has detected that the same line has been ++ * accessed as both shareable and non-shareable ++ * memory from inside the GPU. ++ * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1: A memory access hit an invalid table ++ * entry at level 1 of the translation table. ++ * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2: A memory access hit an invalid table ++ * entry at level 2 of the translation table. ++ * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3: A memory access hit an invalid table ++ * entry at level 3 of the translation table. ++ * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4: A memory access hit an invalid table ++ * entry at level 4 of the translation table. ++ * @BASE_JD_EVENT_PERMISSION_FAULT: A memory access could not be allowed due to ++ * the permission flags set in translation ++ * table ++ * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1: A bus fault occurred while reading ++ * level 0 of the translation tables. ++ * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2: A bus fault occurred while reading ++ * level 1 of the translation tables. ++ * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3: A bus fault occurred while reading ++ * level 2 of the translation tables. ++ * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4: A bus fault occurred while reading ++ * level 3 of the translation tables. ++ * @BASE_JD_EVENT_ACCESS_FLAG: Matches ACCESS_FLAG_0. A memory access hit a ++ * translation table entry with the ACCESS_FLAG ++ * bit set to zero in level 0 of the ++ * page table, and the DISABLE_AF_FAULT flag ++ * was not set. 
++ * @BASE_JD_EVENT_MEM_GROWTH_FAILED: raised for JIT_ALLOC atoms that failed to ++ * grow memory on demand ++ * @BASE_JD_EVENT_JOB_CANCELLED: raised when this atom was hard-stopped or its ++ * dependencies failed ++ * @BASE_JD_EVENT_JOB_INVALID: raised for many reasons, including invalid data ++ * in the atom which overlaps with ++ * BASE_JD_EVENT_JOB_CONFIG_FAULT, or if the ++ * platform doesn't support the feature specified in ++ * the atom. ++ * @BASE_JD_EVENT_DRV_TERMINATED: this is a special event generated to indicate ++ * to userspace that the KBase context has been ++ * destroyed and Base should stop listening for ++ * further events ++ * @BASE_JD_EVENT_REMOVED_FROM_NEXT: raised when an atom that was configured in ++ * the GPU has to be retried (but it has not ++ * started) due to e.g., GPU reset ++ * @BASE_JD_EVENT_END_RP_DONE: this is used for incremental rendering to signal ++ * the completion of a renderpass. This value ++ * shouldn't be returned to userspace but I haven't ++ * seen where it is reset back to JD_EVENT_DONE. ++ * ++ * HW and low-level SW events are represented by event codes. ++ * The status of jobs which succeeded are also represented by ++ * an event code (see @BASE_JD_EVENT_DONE). ++ * Events are usually reported as part of a &struct base_jd_event. ++ * ++ * The event codes are encoded in the following way: ++ * * 10:0 - subtype ++ * * 12:11 - type ++ * * 13 - SW success (only valid if the SW bit is set) ++ * * 14 - SW event (HW event if not set) ++ * * 15 - Kernel event (should never be seen in userspace) ++ * ++ * Events are split up into ranges as follows: ++ * * BASE_JD_EVENT_RANGE__START ++ * * BASE_JD_EVENT_RANGE__END ++ * ++ * code is in 's range when: ++ * BASE_JD_EVENT_RANGE__START <= code < ++ * BASE_JD_EVENT_RANGE__END ++ * ++ * Ranges can be asserted for adjacency by testing that the END of the previous ++ * is equal to the START of the next. This is useful for optimizing some tests ++ * for range. ++ * ++ * A limitation is that the last member of this enum must explicitly be handled ++ * (with an assert-unreachable statement) in switch statements that use ++ * variables of this type. Otherwise, the compiler warns that we have not ++ * handled that enum value. 
++ */ ++enum base_jd_event_code { ++ /* HW defined exceptions */ ++ BASE_JD_EVENT_RANGE_HW_NONFAULT_START = 0, ++ ++ /* non-fatal exceptions */ ++ BASE_JD_EVENT_NOT_STARTED = 0x00, ++ BASE_JD_EVENT_DONE = 0x01, ++ BASE_JD_EVENT_STOPPED = 0x03, ++ BASE_JD_EVENT_TERMINATED = 0x04, ++ BASE_JD_EVENT_ACTIVE = 0x08, ++ ++ BASE_JD_EVENT_RANGE_HW_NONFAULT_END = 0x40, ++ BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START = 0x40, ++ ++ /* job exceptions */ ++ BASE_JD_EVENT_JOB_CONFIG_FAULT = 0x40, ++ BASE_JD_EVENT_JOB_POWER_FAULT = 0x41, ++ BASE_JD_EVENT_JOB_READ_FAULT = 0x42, ++ BASE_JD_EVENT_JOB_WRITE_FAULT = 0x43, ++ BASE_JD_EVENT_JOB_AFFINITY_FAULT = 0x44, ++ BASE_JD_EVENT_JOB_BUS_FAULT = 0x48, ++ BASE_JD_EVENT_INSTR_INVALID_PC = 0x50, ++ BASE_JD_EVENT_INSTR_INVALID_ENC = 0x51, ++ BASE_JD_EVENT_INSTR_TYPE_MISMATCH = 0x52, ++ BASE_JD_EVENT_INSTR_OPERAND_FAULT = 0x53, ++ BASE_JD_EVENT_INSTR_TLS_FAULT = 0x54, ++ BASE_JD_EVENT_INSTR_BARRIER_FAULT = 0x55, ++ BASE_JD_EVENT_INSTR_ALIGN_FAULT = 0x56, ++ BASE_JD_EVENT_DATA_INVALID_FAULT = 0x58, ++ BASE_JD_EVENT_TILE_RANGE_FAULT = 0x59, ++ BASE_JD_EVENT_STATE_FAULT = 0x5A, ++ BASE_JD_EVENT_OUT_OF_MEMORY = 0x60, ++ BASE_JD_EVENT_UNKNOWN = 0x7F, ++ ++ /* GPU exceptions */ ++ BASE_JD_EVENT_DELAYED_BUS_FAULT = 0x80, ++ BASE_JD_EVENT_SHAREABILITY_FAULT = 0x88, ++ ++ /* MMU exceptions */ ++ BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1 = 0xC1, ++ BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2 = 0xC2, ++ BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3 = 0xC3, ++ BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4 = 0xC4, ++ BASE_JD_EVENT_PERMISSION_FAULT = 0xC8, ++ BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1 = 0xD1, ++ BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2 = 0xD2, ++ BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3 = 0xD3, ++ BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4 = 0xD4, ++ BASE_JD_EVENT_ACCESS_FLAG = 0xD8, ++ ++ /* SW defined exceptions */ ++ BASE_JD_EVENT_MEM_GROWTH_FAILED = ++ BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x000, ++ BASE_JD_EVENT_JOB_CANCELLED = ++ BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x002, ++ BASE_JD_EVENT_JOB_INVALID = ++ BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x003, ++ ++ BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_RESERVED | 0x3FF, ++ ++ BASE_JD_EVENT_RANGE_SW_SUCCESS_START = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_SUCCESS | 0x000, ++ ++ BASE_JD_EVENT_DRV_TERMINATED = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_INFO | 0x000, ++ ++ BASE_JD_EVENT_RANGE_SW_SUCCESS_END = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_RESERVED | 0x3FF, ++ ++ BASE_JD_EVENT_RANGE_KERNEL_ONLY_START = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_KERNEL | 0x000, ++ BASE_JD_EVENT_REMOVED_FROM_NEXT = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x000, ++ BASE_JD_EVENT_END_RP_DONE = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x001, ++ ++ BASE_JD_EVENT_RANGE_KERNEL_ONLY_END = BASE_JD_SW_EVENT | ++ BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_RESERVED | 0x3FF ++}; ++ ++/** ++ * struct base_jd_event_v2 - Event reporting structure ++ * ++ * @event_code: event code of type @ref base_jd_event_code. ++ * @atom_number: the atom number that has completed. ++ * @padding: padding. ++ * @udata: user data. ++ * ++ * This structure is used by the kernel driver to report information ++ * about GPU events. They can either be HW-specific events or low-level ++ * SW events, such as job-chain completion. 
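/*
 * Illustrative sketch only, assuming the definitions above are in scope:
 * decoding a base_jd_event_code along the bit layout described above
 * (bit 14 marks SW events, bit 13 marks SW success, and
 * BASE_JD_SW_EVENT_TYPE_MASK extracts the event type).
 */
static inline int example_event_is_sw(__u32 event_code)
{
    return (event_code & BASE_JD_SW_EVENT) != 0;
}

static inline int example_event_succeeded(__u32 event_code)
{
    /* BASE_JD_EVENT_DONE for HW events, the SW success bit otherwise. */
    if (example_event_is_sw(event_code))
        return (event_code & BASE_JD_SW_EVENT_SUCCESS) != 0;

    return event_code == BASE_JD_EVENT_DONE;
}

static inline __u32 example_event_type(__u32 event_code)
{
    return event_code & BASE_JD_SW_EVENT_TYPE_MASK;
}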
++ * ++ * The event code contains an event type field which can be extracted ++ * by ANDing with BASE_JD_SW_EVENT_TYPE_MASK. ++ */ ++struct base_jd_event_v2 { ++ __u32 event_code; ++ base_atom_id atom_number; ++ __u8 padding[3]; ++ struct base_jd_udata udata; ++}; ++ ++/** ++ * struct base_dump_cpu_gpu_counters - Structure for ++ * BASE_JD_REQ_SOFT_DUMP_CPU_GPU_COUNTERS ++ * jobs. ++ * @system_time: gpu timestamp ++ * @cycle_counter: gpu cycle count ++ * @sec: cpu time(sec) ++ * @usec: cpu time(usec) ++ * @padding: padding ++ * ++ * This structure is stored into the memory pointed to by the @jc field ++ * of &struct base_jd_atom. ++ * ++ * It must not occupy the same CPU cache line(s) as any neighboring data. ++ * This is to avoid cases where access to pages containing the structure ++ * is shared between cached and un-cached memory regions, which would ++ * cause memory corruption. ++ */ ++ ++struct base_dump_cpu_gpu_counters { ++ __u64 system_time; ++ __u64 cycle_counter; ++ __u64 sec; ++ __u32 usec; ++ __u8 padding[36]; ++}; ++ ++/** ++ * struct mali_base_gpu_core_props - GPU core props info ++ * ++ * @product_id: Pro specific value. ++ * @version_status: Status of the GPU release. No defined values, but starts at ++ * 0 and increases by one for each release status (alpha, beta, EAC, etc.). ++ * 4 bit values (0-15). ++ * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" ++ * release number. ++ * 8 bit values (0-255). ++ * @major_revision: Major release number of the GPU. "R" part of an "RnPn" ++ * release number. ++ * 4 bit values (0-15). ++ * @padding: padding to align to 8-byte ++ * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by ++ * clGetDeviceInfo() ++ * @log2_program_counter_size: Size of the shader program counter, in bits. ++ * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This ++ * is a bitpattern where a set bit indicates that the format is supported. ++ * Before using a texture format, it is recommended that the corresponding ++ * bit be checked. ++ * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. ++ * It is unlikely that a client will be able to allocate all of this memory ++ * for their own purposes, but this at least provides an upper bound on the ++ * memory available to the GPU. ++ * This is required for OpenCL's clGetDeviceInfo() call when ++ * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The ++ * client will not be expecting to allocate anywhere near this value. ++ * @num_exec_engines: The number of execution engines. Only valid for tGOX ++ * (Bifrost) GPUs, where GPU_HAS_REG_CORE_FEATURES is defined. Otherwise, ++ * this is always 0. ++ */ ++struct mali_base_gpu_core_props { ++ __u32 product_id; ++ __u16 version_status; ++ __u16 minor_revision; ++ __u16 major_revision; ++ __u16 padding; ++ __u32 gpu_freq_khz_max; ++ __u32 log2_program_counter_size; ++ __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ __u64 gpu_available_memory_size; ++ __u8 num_exec_engines; ++}; ++ ++#endif /* _UAPI_BASE_JM_KERNEL_H_ */ +diff --git a/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h +new file mode 100644 +index 00000000000..20d931adc9b +--- /dev/null ++++ b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h +@@ -0,0 +1,231 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. 
++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_KBASE_JM_IOCTL_H_ ++#define _UAPI_KBASE_JM_IOCTL_H_ ++ ++#include ++#include ++ ++/* ++ * 11.1: ++ * - Add BASE_MEM_TILER_ALIGN_TOP under base_mem_alloc_flags ++ * 11.2: ++ * - KBASE_MEM_QUERY_FLAGS can return KBASE_REG_PF_GROW and KBASE_REG_PROTECTED, ++ * which some user-side clients prior to 11.2 might fault if they received ++ * them ++ * 11.3: ++ * - New ioctls KBASE_IOCTL_STICKY_RESOURCE_MAP and ++ * KBASE_IOCTL_STICKY_RESOURCE_UNMAP ++ * 11.4: ++ * - New ioctl KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET ++ * 11.5: ++ * - New ioctl: KBASE_IOCTL_MEM_JIT_INIT (old ioctl renamed to _OLD) ++ * 11.6: ++ * - Added flags field to base_jit_alloc_info structure, which can be used to ++ * specify pseudo chunked tiler alignment for JIT allocations. ++ * 11.7: ++ * - Removed UMP support ++ * 11.8: ++ * - Added BASE_MEM_UNCACHED_GPU under base_mem_alloc_flags ++ * 11.9: ++ * - Added BASE_MEM_PERMANENT_KERNEL_MAPPING and BASE_MEM_FLAGS_KERNEL_ONLY ++ * under base_mem_alloc_flags ++ * 11.10: ++ * - Enabled the use of nr_extres field of base_jd_atom_v2 structure for ++ * JIT_ALLOC and JIT_FREE type softjobs to enable multiple JIT allocations ++ * with one softjob. ++ * 11.11: ++ * - Added BASE_MEM_GPU_VA_SAME_4GB_PAGE under base_mem_alloc_flags ++ * 11.12: ++ * - Removed ioctl: KBASE_IOCTL_GET_PROFILING_CONTROLS ++ * 11.13: ++ * - New ioctl: KBASE_IOCTL_MEM_EXEC_INIT ++ * 11.14: ++ * - Add BASE_MEM_GROUP_ID_MASK, base_mem_group_id_get, base_mem_group_id_set ++ * under base_mem_alloc_flags ++ * 11.15: ++ * - Added BASEP_CONTEXT_MMU_GROUP_ID_MASK under base_context_create_flags. ++ * - Require KBASE_IOCTL_SET_FLAGS before BASE_MEM_MAP_TRACKING_HANDLE can be ++ * passed to mmap(). ++ * 11.16: ++ * - Extended ioctl KBASE_IOCTL_MEM_SYNC to accept imported dma-buf. ++ * - Modified (backwards compatible) ioctl KBASE_IOCTL_MEM_IMPORT behavior for ++ * dma-buf. Now, buffers are mapped on GPU when first imported, no longer ++ * requiring external resource or sticky resource tracking. UNLESS, ++ * CONFIG_MALI_DMA_BUF_MAP_ON_DEMAND is enabled. ++ * 11.17: ++ * - Added BASE_JD_REQ_JOB_SLOT. ++ * - Reused padding field in base_jd_atom_v2 to pass job slot number. ++ * - New ioctl: KBASE_IOCTL_GET_CPU_GPU_TIMEINFO ++ * 11.18: ++ * - Added BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP under base_mem_alloc_flags ++ * 11.19: ++ * - Extended base_jd_atom_v2 to allow a renderpass ID to be specified. ++ * 11.20: ++ * - Added new phys_pages member to kbase_ioctl_mem_jit_init for ++ * KBASE_IOCTL_MEM_JIT_INIT, previous variants of this renamed to use _10_2 ++ * (replacing '_OLD') and _11_5 suffixes ++ * - Replaced compat_core_req (deprecated in 10.3) with jit_id[2] in ++ * base_jd_atom_v2. It must currently be initialized to zero. 
++ * - Added heap_info_gpu_addr to base_jit_alloc_info, and ++ * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE allowable in base_jit_alloc_info's ++ * flags member. Previous variants of this structure are kept and given _10_2 ++ * and _11_5 suffixes. ++ * - The above changes are checked for safe values in usual builds ++ * 11.21: ++ * - v2.0 of mali_trace debugfs file, which now versions the file separately ++ * 11.22: ++ * - Added base_jd_atom (v3), which is seq_nr + base_jd_atom_v2. ++ * KBASE_IOCTL_JOB_SUBMIT supports both in parallel. ++ * 11.23: ++ * - Modified KBASE_IOCTL_MEM_COMMIT behavior to reject requests to modify ++ * the physical memory backing of JIT allocations. This was not supposed ++ * to be a valid use case, but it was allowed by the previous implementation. ++ * 11.24: ++ * - Added a sysfs file 'serialize_jobs' inside a new sub-directory ++ * 'scheduling'. ++ * 11.25: ++ * - Enabled JIT pressure limit in base/kbase by default ++ * 11.26 ++ * - Added kinstr_jm API ++ * 11.27 ++ * - Backwards compatible extension to HWC ioctl. ++ * 11.28: ++ * - Added kernel side cache ops needed hint ++ * 11.29: ++ * - Reserve ioctl 52 ++ * 11.30: ++ * - Add a new priority level BASE_JD_PRIO_REALTIME ++ * - Add ioctl 54: This controls the priority setting. ++ * 11.31: ++ * - Added BASE_JD_REQ_LIMITED_CORE_MASK. ++ * - Added ioctl 55: set_limited_core_count. ++ * 11.32: ++ * - Added new HW performance counters interface to all GPUs. ++ * 11.33: ++ * - Removed Kernel legacy HWC interface ++ * 11.34: ++ * - First release of new HW performance counters interface. ++ * 11.35: ++ * - Dummy model (no mali) backend will now clear HWC values after each sample ++ */ ++#define BASE_UK_VERSION_MAJOR 11 ++#define BASE_UK_VERSION_MINOR 35 ++ ++/** ++ * struct kbase_ioctl_version_check - Check version compatibility between ++ * kernel and userspace ++ * ++ * @major: Major version number ++ * @minor: Minor version number ++ */ ++struct kbase_ioctl_version_check { ++ __u16 major; ++ __u16 minor; ++}; ++ ++#define KBASE_IOCTL_VERSION_CHECK \ ++ _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) ++ ++ ++/** ++ * struct kbase_ioctl_job_submit - Submit jobs/atoms to the kernel ++ * ++ * @addr: Memory address of an array of struct base_jd_atom_v2 or v3 ++ * @nr_atoms: Number of entries in the array ++ * @stride: sizeof(struct base_jd_atom_v2) or sizeof(struct base_jd_atom) ++ */ ++struct kbase_ioctl_job_submit { ++ __u64 addr; ++ __u32 nr_atoms; ++ __u32 stride; ++}; ++ ++#define KBASE_IOCTL_JOB_SUBMIT \ ++ _IOW(KBASE_IOCTL_TYPE, 2, struct kbase_ioctl_job_submit) ++ ++#define KBASE_IOCTL_POST_TERM \ ++ _IO(KBASE_IOCTL_TYPE, 4) ++ ++/** ++ * struct kbase_ioctl_soft_event_update - Update the status of a soft-event ++ * @event: GPU address of the event which has been updated ++ * @new_status: The new status to set ++ * @flags: Flags for future expansion ++ */ ++struct kbase_ioctl_soft_event_update { ++ __u64 event; ++ __u32 new_status; ++ __u32 flags; ++}; ++ ++#define KBASE_IOCTL_SOFT_EVENT_UPDATE \ ++ _IOW(KBASE_IOCTL_TYPE, 28, struct kbase_ioctl_soft_event_update) ++ ++/** ++ * struct kbase_kinstr_jm_fd_out - Explains the compatibility information for ++ * the `struct kbase_kinstr_jm_atom_state_change` structure returned from the ++ * kernel ++ * ++ * @size: The size of the `struct kbase_kinstr_jm_atom_state_change` ++ * @version: Represents a breaking change in the ++ * `struct kbase_kinstr_jm_atom_state_change` ++ * @padding: Explicit padding to get the structure up to 64bits. 
See ++ * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst ++ * ++ * The `struct kbase_kinstr_jm_atom_state_change` may have extra members at the ++ * end of the structure that older user space might not understand. If the ++ * `version` is the same, the structure is still compatible with newer kernels. ++ * The `size` can be used to cast the opaque memory returned from the kernel. ++ */ ++struct kbase_kinstr_jm_fd_out { ++ __u16 size; ++ __u8 version; ++ __u8 padding[5]; ++}; ++ ++/** ++ * struct kbase_kinstr_jm_fd_in - Options when creating the file descriptor ++ * ++ * @count: Number of atom states that can be stored in the kernel circular ++ * buffer. Must be a power of two ++ * @padding: Explicit padding to get the structure up to 64bits. See ++ * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst ++ */ ++struct kbase_kinstr_jm_fd_in { ++ __u16 count; ++ __u8 padding[6]; ++}; ++ ++union kbase_kinstr_jm_fd { ++ struct kbase_kinstr_jm_fd_in in; ++ struct kbase_kinstr_jm_fd_out out; ++}; ++ ++#define KBASE_IOCTL_KINSTR_JM_FD \ ++ _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_kinstr_jm_fd) ++ ++ ++#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ ++ _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) ++ ++#endif /* _UAPI_KBASE_JM_IOCTL_H_ */ +diff --git a/src/panfrost/base/include/mali_base_common_kernel.h b/src/panfrost/base/include/mali_base_common_kernel.h +new file mode 100644 +index 00000000000..f8378146ace +--- /dev/null ++++ b/src/panfrost/base/include/mali_base_common_kernel.h +@@ -0,0 +1,231 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_BASE_COMMON_KERNEL_H_ ++#define _UAPI_BASE_COMMON_KERNEL_H_ ++ ++#include ++ ++struct base_mem_handle { ++ struct { ++ __u64 handle; ++ } basep; ++}; ++ ++#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 ++ ++/* Memory allocation, access/hint flags & mask. ++ * ++ * See base_mem_alloc_flags. ++ */ ++ ++/* IN */ ++/* Read access CPU side ++ */ ++#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) ++ ++/* Write access CPU side ++ */ ++#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) ++ ++/* Read access GPU side ++ */ ++#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) ++ ++/* Write access GPU side ++ */ ++#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) ++ ++/* Execute allowed on the GPU side ++ */ ++#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) ++ ++/* Will be permanently mapped in kernel space. ++ * Flag is only allowed on allocations originating from kbase. ++ */ ++#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) ++ ++/* The allocation will completely reside within the same 4GB chunk in the GPU ++ * virtual space. 
++ * Since this flag is primarily required only for the TLS memory which will ++ * not be used to contain executable code and also not used for Tiler heap, ++ * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. ++ */ ++#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) ++ ++/* Userspace is not allowed to free this memory. ++ * Flag is only allowed on allocations originating from kbase. ++ */ ++#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) ++ ++/* Grow backing store on GPU Page Fault ++ */ ++#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) ++ ++/* Page coherence Outer shareable, if available ++ */ ++#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) ++ ++/* Page coherence Inner shareable ++ */ ++#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) ++ ++/* IN/OUT */ ++/* Should be cached on the CPU, returned if actually cached ++ */ ++#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) ++ ++/* IN/OUT */ ++/* Must have same VA on both the GPU and the CPU ++ */ ++#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) ++ ++/* OUT */ ++/* Must call mmap to acquire a GPU address for the allocation ++ */ ++#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) ++ ++/* IN */ ++/* Page coherence Outer shareable, required. ++ */ ++#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) ++ ++/* Protected memory ++ */ ++#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) ++ ++/* Not needed physical memory ++ */ ++#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) ++ ++/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the ++ * addresses to be the same ++ */ ++#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) ++ ++/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu ++ * mode. Some components within the GPU might only be able to access memory ++ * that is GPU cacheable. Refer to the specific GPU implementation for more ++ * details. The 3 shareability flags will be ignored for GPU uncached memory. ++ * If used while importing USER_BUFFER type memory, then the import will fail ++ * if the memory is not aligned to GPU and CPU cache line width. ++ */ ++#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) ++ ++/* ++ * Bits [22:25] for group_id (0~15). ++ * ++ * base_mem_group_id_set() should be used to pack a memory group ID into a ++ * base_mem_alloc_flags value instead of accessing the bits directly. ++ * base_mem_group_id_get() should be used to extract the memory group ID from ++ * a base_mem_alloc_flags value. ++ */ ++#define BASEP_MEM_GROUP_ID_SHIFT 22 ++#define BASE_MEM_GROUP_ID_MASK ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) ++ ++/* Must do CPU cache maintenance when imported memory is mapped/unmapped ++ * on GPU. Currently applicable to dma-buf type only. ++ */ ++#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) ++ ++/* OUT */ ++/* Kernel side cache sync ops required */ ++#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) ++ ++/* Number of bits used as flags for base memory management ++ * ++ * Must be kept in sync with the base_mem_alloc_flags flags ++ */ ++#define BASE_MEM_FLAGS_NR_BITS 30 ++ ++/* A mask for all output bits, excluding IN/OUT bits. ++ */ ++#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP ++ ++/* A mask for all input bits, including IN/OUT bits. 
++ */ ++#define BASE_MEM_FLAGS_INPUT_MASK \ ++ (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) ++ ++/* Special base mem handles. ++ */ ++#define BASEP_MEM_INVALID_HANDLE (0ul) ++#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) ++#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) ++/* reserved handles ..-47< for future special handles */ ++#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_FIRST_FREE_ADDRESS ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) ++ ++/* Flags to pass to ::base_context_init. ++ * Flags can be ORed together to enable multiple things. ++ * ++ * These share the same space as BASEP_CONTEXT_FLAG_*, and so must ++ * not collide with them. ++ */ ++typedef __u32 base_context_create_flags; ++ ++/* Flags for base context */ ++ ++/* No flags set */ ++#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) ++ ++/* Base context is embedded in a cctx object (flag used for CINSTR ++ * software counter macros) ++ */ ++#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) ++ ++/* Base context is a 'System Monitor' context for Hardware counters. ++ * ++ * One important side effect of this is that job submission is disabled. ++ */ ++#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED ((base_context_create_flags)1 << 1) ++ ++/* Bit-shift used to encode a memory group ID in base_context_create_flags ++ */ ++#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) ++ ++/* Bitmask used to encode a memory group ID in base_context_create_flags ++ */ ++#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ ++ ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) ++ ++/* Bitpattern describing the base_context_create_flags that can be ++ * passed to the kernel ++ */ ++#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ ++ (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | BASEP_CONTEXT_MMU_GROUP_ID_MASK) ++ ++/* Flags for base tracepoint ++ */ ++ ++/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, ++ * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) ++ */ ++#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) ++ ++/* Indicate that job dumping is enabled. This could affect certain timers ++ * to account for the performance impact. ++ */ ++#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) ++ ++#endif /* _UAPI_BASE_COMMON_KERNEL_H_ */ +diff --git a/src/panfrost/base/include/mali_base_kernel.h b/src/panfrost/base/include/mali_base_kernel.h +new file mode 100644 +index 00000000000..3d826c720b2 +--- /dev/null ++++ b/src/panfrost/base/include/mali_base_kernel.h +@@ -0,0 +1,700 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++/* ++ * Base structures shared with the kernel. ++ */ ++ ++#ifndef _UAPI_BASE_KERNEL_H_ ++#define _UAPI_BASE_KERNEL_H_ ++ ++#include ++#include "mali_base_common_kernel.h" ++ ++#define BASE_MAX_COHERENT_GROUPS 16 ++ ++#if defined(PAGE_MASK) && defined(PAGE_SHIFT) ++#define LOCAL_PAGE_SHIFT PAGE_SHIFT ++#define LOCAL_PAGE_LSB ~PAGE_MASK ++#else ++#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 ++#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 ++#endif ++ ++#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) ++#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 ++#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) ++#else ++#error Failed to find page size ++#endif ++#endif ++ ++/* Physical memory group ID for normal usage. ++ */ ++#define BASE_MEM_GROUP_DEFAULT (0) ++ ++/* Number of physical memory groups. ++ */ ++#define BASE_MEM_GROUP_COUNT (16) ++ ++/** ++ * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. ++ * ++ * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator ++ * in order to determine the best cache policy. Some combinations are ++ * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), ++ * which defines a write-only region on the CPU side, which is ++ * heavily read by the CPU... ++ * Other flags are only meaningful to a particular allocator. ++ * More flags can be added to this list, as long as they don't clash ++ * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). ++ */ ++typedef __u32 base_mem_alloc_flags; ++ ++/* A mask for all the flags which are modifiable via the base_mem_set_flags ++ * interface. ++ */ ++#define BASE_MEM_FLAGS_MODIFIABLE \ ++ (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ ++ BASE_MEM_COHERENT_LOCAL) ++ ++/* A mask of all the flags that can be returned via the base_mem_get_flags() ++ * interface. ++ */ ++#define BASE_MEM_FLAGS_QUERYABLE \ ++ (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ ++ BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ ++ BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ ++ BASEP_MEM_FLAGS_KERNEL_ONLY)) ++ ++/** ++ * enum base_mem_import_type - Memory types supported by @a base_mem_import ++ * ++ * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type ++ * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) ++ * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a ++ * base_mem_import_user_buffer ++ * ++ * Each type defines what the supported handle type is. ++ * ++ * If any new type is added here ARM must be contacted ++ * to allocate a numeric value for it. ++ * Do not just add a new type without synchronizing with ARM ++ * as future releases from ARM might include other new types ++ * which could clash with your custom types. ++ */ ++enum base_mem_import_type { ++ BASE_MEM_IMPORT_TYPE_INVALID = 0, ++ /* ++ * Import type with value 1 is deprecated. ++ */ ++ BASE_MEM_IMPORT_TYPE_UMM = 2, ++ BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 ++}; ++ ++/** ++ * struct base_mem_import_user_buffer - Handle of an imported user buffer ++ * ++ * @ptr: address of imported user buffer ++ * @length: length of imported user buffer in bytes ++ * ++ * This structure is used to represent a handle of an imported user buffer. 
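As a minimal sketch of how the base_mem_alloc_flags bits defined in mali_base_common_kernel.h combine in practice, the hypothetical helper below builds flags for an ordinary allocation that both the CPU and the GPU read and write at the same VA. The particular combination and the group ID handling are examples only (as noted above, not every combination is valid), and the BASE_MEM_GROUP_ID_SET() macro defined further down in this header performs the same packing with range clamping:

static inline base_mem_alloc_flags
example_alloc_flags(unsigned int group_id)
{
	base_mem_alloc_flags flags =
		BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR |
		BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |
		BASE_MEM_SAME_VA | BASE_MEM_CACHED_CPU;

	/* Pack the physical memory group ID into bits [25:22]. */
	flags |= ((base_mem_alloc_flags)group_id << BASEP_MEM_GROUP_ID_SHIFT) &
		 BASE_MEM_GROUP_ID_MASK;

	return flags;
}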
++ */ ++ ++struct base_mem_import_user_buffer { ++ __u64 ptr; ++ __u64 length; ++}; ++ ++/* Mask to detect 4GB boundary alignment */ ++#define BASE_MEM_MASK_4GB 0xfffff000UL ++/* Mask to detect 4GB boundary (in page units) alignment */ ++#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) ++ ++/* Limit on the 'extension' parameter for an allocation with the ++ * BASE_MEM_TILER_ALIGN_TOP flag set ++ * ++ * This is the same as the maximum limit for a Buffer Descriptor's chunk size ++ */ ++#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ ++ (21u - (LOCAL_PAGE_SHIFT)) ++#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ ++ (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) ++ ++/* Bit mask of cookies used for memory allocation setup */ ++#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ ++ ++/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ ++#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ ++ ++/* ++ * struct base_fence - Cross-device synchronisation fence. ++ * ++ * A fence is used to signal when the GPU has finished accessing a resource that ++ * may be shared with other devices, and also to delay work done asynchronously ++ * by the GPU until other devices have finished accessing a shared resource. ++ */ ++struct base_fence { ++ struct { ++ int fd; ++ int stream_fd; ++ } basep; ++}; ++ ++/** ++ * struct base_mem_aliasing_info - Memory aliasing info ++ * ++ * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE ++ * @offset: Offset within the handle to start aliasing from, in pages. ++ * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. ++ * @length: Length to alias, in pages. For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE ++ * specifies the number of times the special page is needed. ++ * ++ * Describes a memory handle to be aliased. ++ * A subset of the handle can be chosen for aliasing, given an offset and a ++ * length. ++ * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a ++ * region where a special page is mapped with a write-alloc cache setup, ++ * typically used when the write result of the GPU isn't needed, but the GPU ++ * must write anyway. ++ * ++ * Offset and length are specified in pages. ++ * Offset must be within the size of the handle. ++ * Offset+length must not overrun the size of the handle. ++ */ ++struct base_mem_aliasing_info { ++ struct base_mem_handle handle; ++ __u64 offset; ++ __u64 length; ++}; ++ ++/* Maximum percentage of just-in-time memory allocation trimming to perform ++ * on free. ++ */ ++#define BASE_JIT_MAX_TRIM_LEVEL (100) ++ ++/* Maximum number of concurrent just-in-time memory allocations. 
++ */ ++#define BASE_JIT_ALLOC_COUNT (255) ++ ++/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 ++ * ++ * jit_version is 1 ++ * ++ * Due to the lack of padding specified, user clients between 32 and 64-bit ++ * may have assumed a different size of the struct ++ * ++ * An array of structures was not supported ++ */ ++struct base_jit_alloc_info_10_2 { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++}; ++ ++/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up ++ * to 11.19 ++ * ++ * This structure had a number of modifications during and after kernel driver ++ * version 11.5, but remains size-compatible throughout its version history, and ++ * with earlier variants compatible with future variants by requiring ++ * zero-initialization to the unused space in the structure. ++ * ++ * jit_version is 2 ++ * ++ * Kernel driver version history: ++ * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes ++ * must be zero. Kbase minor version was not incremented, so some ++ * versions of 11.5 do not have this change. ++ * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase ++ * minor version not incremented) ++ * 11.6: Added 'flags', replacing 1 padding byte ++ * 11.10: Arrays of this structure are supported ++ */ ++struct base_jit_alloc_info_11_5 { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++ __u8 bin_id; ++ __u8 max_allocations; ++ __u8 flags; ++ __u8 padding[2]; ++ __u16 usage_id; ++}; ++ ++/** ++ * struct base_jit_alloc_info - Structure which describes a JIT allocation ++ * request. ++ * @gpu_alloc_addr: The GPU virtual address to write the JIT ++ * allocated GPU virtual address to. ++ * @va_pages: The minimum number of virtual pages required. ++ * @commit_pages: The minimum number of physical pages which ++ * should back the allocation. ++ * @extension: Granularity of physical pages to grow the ++ * allocation by during a fault. ++ * @id: Unique ID provided by the caller, this is used ++ * to pair allocation and free requests. ++ * Zero is not a valid value. ++ * @bin_id: The JIT allocation bin, used in conjunction with ++ * @max_allocations to limit the number of each ++ * type of JIT allocation. ++ * @max_allocations: The maximum number of allocations allowed within ++ * the bin specified by @bin_id. Should be the same ++ * for all allocations within the same bin. ++ * @flags: flags specifying the special requirements for ++ * the JIT allocation, see ++ * %BASE_JIT_ALLOC_VALID_FLAGS ++ * @padding: Expansion space - should be initialised to zero ++ * @usage_id: A hint about which allocation should be reused. ++ * The kernel should attempt to use a previous ++ * allocation with the same usage_id ++ * @heap_info_gpu_addr: Pointer to an object in GPU memory describing ++ * the actual usage of the region. ++ * ++ * jit_version is 3. ++ * ++ * When modifications are made to this structure, it is still compatible with ++ * jit_version 3 when: a) the size is unchanged, and b) new members only ++ * replace the padding bytes. 
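To make the zero-initialization requirement concrete, here is a hypothetical sketch of filling the 11.5 variant from user space; the page counts and the ID are illustrative values only:

#include <string.h>

static void example_fill_jit_request(struct base_jit_alloc_info_11_5 *info,
				     __u64 result_gpu_va)
{
	/* Clear the whole structure first: unused and padding bytes must be
	 * zero so the layout stays forward compatible with newer kernels. */
	memset(info, 0, sizeof(*info));

	info->gpu_alloc_addr = result_gpu_va; /* kernel writes the JIT GPU VA here */
	info->va_pages = 64;                  /* minimum GPU VA pages to reserve */
	info->commit_pages = 16;              /* minimum physically backed pages */
	info->extension = 16;                 /* grow granularity on GPU fault */
	info->id = 1;                         /* pairs alloc/free requests; zero is not valid */
}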
++ * ++ * Previous jit_version history: ++ * jit_version == 1, refer to &base_jit_alloc_info_10_2 ++ * jit_version == 2, refer to &base_jit_alloc_info_11_5 ++ * ++ * Kbase version history: ++ * 11.20: added @heap_info_gpu_addr ++ */ ++struct base_jit_alloc_info { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++ __u8 bin_id; ++ __u8 max_allocations; ++ __u8 flags; ++ __u8 padding[2]; ++ __u16 usage_id; ++ __u64 heap_info_gpu_addr; ++}; ++ ++enum base_external_resource_access { ++ BASE_EXT_RES_ACCESS_SHARED, ++ BASE_EXT_RES_ACCESS_EXCLUSIVE ++}; ++ ++struct base_external_resource { ++ __u64 ext_resource; ++}; ++ ++/** ++ * BASE_EXT_RES_COUNT_MAX - The maximum number of external resources ++ * which can be mapped/unmapped in a single request. ++ */ ++#define BASE_EXT_RES_COUNT_MAX 10 ++ ++/** ++ * struct base_external_resource_list - Structure which describes a list of ++ * external resources. ++ * @count: The number of resources. ++ * @ext_res: Array of external resources which is ++ * sized at allocation time. ++ */ ++struct base_external_resource_list { ++ __u64 count; ++ struct base_external_resource ext_res[1]; ++}; ++ ++struct base_jd_debug_copy_buffer { ++ __u64 address; ++ __u64 size; ++ struct base_external_resource extres; ++}; ++ ++#define GPU_MAX_JOB_SLOTS 16 ++ ++/** ++ * DOC: User-side Base GPU Property Queries ++ * ++ * The User-side Base GPU Property Query interface encapsulates two ++ * sub-modules: ++ * ++ * - "Dynamic GPU Properties" ++ * - "Base Platform Config GPU Properties" ++ * ++ * Base only deals with properties that vary between different GPU ++ * implementations - the Dynamic GPU properties and the Platform Config ++ * properties. ++ * ++ * For properties that are constant for the GPU Architecture, refer to the ++ * GPU module. However, we will discuss their relevance here just to ++ * provide background information. ++ * ++ * About the GPU Properties in Base and GPU modules ++ * ++ * The compile-time properties (Platform Config, GPU Compile-time ++ * properties) are exposed as pre-processor macros. ++ * ++ * Complementing the compile-time properties are the Dynamic GPU ++ * Properties, which act as a conduit for the GPU Configuration ++ * Discovery. ++ * ++ * In general, the dynamic properties are present to verify that the platform ++ * has been configured correctly with the right set of Platform Config ++ * Compile-time Properties. ++ * ++ * As a consistent guide across the entire DDK, the choice for dynamic or ++ * compile-time should consider the following, in order: ++ * 1. Can the code be written so that it doesn't need to know the ++ * implementation limits at all? ++ * 2. If you need the limits, get the information from the Dynamic Property ++ * lookup. This should be done once as you fetch the context, and then cached ++ * as part of the context data structure, so it's cheap to access. ++ * 3. If there's a clear and arguable inefficiency in using Dynamic Properties, ++ * then use a Compile-Time Property (Platform Config, or GPU Compile-time ++ * property). Examples of where this might be sensible follow: ++ * - Part of a critical inner-loop ++ * - Frequent re-use throughout the driver, causing significant extra load ++ * instructions or control flow that would be worthwhile optimizing out. ++ * ++ * We cannot provide an exhaustive set of examples, neither can we provide a ++ * rule for every possible situation. 
Use common sense, and think about: what ++ * the rest of the driver will be doing; how the compiler might represent the ++ * value if it is a compile-time constant; whether an OEM shipping multiple ++ * devices would benefit much more from a single DDK binary, instead of ++ * insignificant micro-optimizations. ++ * ++ * Dynamic GPU Properties ++ * ++ * Dynamic GPU properties are presented in two sets: ++ * 1. the commonly used properties in @ref base_gpu_props, which have been ++ * unpacked from GPU register bitfields. ++ * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props ++ * (also a member of base_gpu_props). All of these are presented in ++ * the packed form, as presented by the GPU registers themselves. ++ * ++ * The raw properties in gpu_raw_gpu_props are necessary to ++ * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device ++ * behaving differently?". In this case, all information about the ++ * configuration is potentially useful, but it does not need to be processed ++ * by the driver. Instead, the raw registers can be processed by the Mali ++ * Tools software on the host PC. ++ * ++ * The properties returned extend the GPU Configuration Discovery ++ * registers. For example, GPU clock speed is not specified in the GPU ++ * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. ++ * ++ * The GPU properties are obtained by a call to ++ * base_get_gpu_props(). This simply returns a pointer to a const ++ * base_gpu_props structure. It is constant for the life of a base ++ * context. Multiple calls to base_get_gpu_props() to a base context ++ * return the same pointer to a constant structure. This avoids cache pollution ++ * of the common data. ++ * ++ * This pointer must not be freed, because it does not point to the start of a ++ * region allocated by the memory allocator; instead, just close the @ref ++ * base_context. ++ * ++ * ++ * Kernel Operation ++ * ++ * During Base Context Create time, user-side makes a single kernel call: ++ * - A call to fill user memory with GPU information structures ++ * ++ * The kernel-side will fill the provided the entire processed base_gpu_props ++ * structure, because this information is required in both ++ * user and kernel side; it does not make sense to decode it twice. ++ * ++ * Coherency groups must be derived from the bitmasks, but this can be done ++ * kernel side, and just once at kernel startup: Coherency groups must already ++ * be known kernel-side, to support chains that specify a 'Only Coherent Group' ++ * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. ++ * ++ * Coherency Group calculation ++ * ++ * Creation of the coherent group data is done at device-driver startup, and so ++ * is one-time. This will most likely involve a loop with CLZ, shifting, and ++ * bit clearing on the L2_PRESENT mask, depending on whether the ++ * system is L2 Coherent. The number of shader cores is done by a ++ * population count, since faulty cores may be disabled during production, ++ * producing a non-contiguous mask. ++ * ++ * The memory requirements for this algorithm can be determined either by a __u64 ++ * population count on the L2_PRESENT mask (a LUT helper already is ++ * required for the above), or simple assumption that there can be no more than ++ * 16 coherent groups, since core groups are typically 4 cores. ++ */ ++ ++/* ++ * More information is possible - but associativity and bus width are not ++ * required by upper-level apis. 
++ */ ++struct mali_base_gpu_l2_cache_props { ++ __u8 log2_line_size; ++ __u8 log2_cache_size; ++ __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ ++ __u8 padding[5]; ++}; ++ ++struct mali_base_gpu_tiler_props { ++ __u32 bin_size_bytes; /* Max is 4*2^15 */ ++ __u32 max_active_levels; /* Max is 2^15 */ ++}; ++ ++/** ++ * struct mali_base_gpu_thread_props - GPU threading system details. ++ * @max_threads: Max. number of threads per core ++ * @max_workgroup_size: Max. number of threads per workgroup ++ * @max_barrier_size: Max. number of threads that can synchronize on a ++ * simple barrier ++ * @max_registers: Total size [1..65535] of the register file available ++ * per core. ++ * @max_task_queue: Max. tasks [1..255] which may be sent to a core ++ * before it becomes blocked. ++ * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split ++ * field. ++ * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, ++ * 3 = SW Model/Emulation ++ * @padding: padding to align to 8-byte ++ * @tls_alloc: Number of threads per core that TLS must be ++ * allocated for ++ */ ++struct mali_base_gpu_thread_props { ++ __u32 max_threads; ++ __u32 max_workgroup_size; ++ __u32 max_barrier_size; ++ __u16 max_registers; ++ __u8 max_task_queue; ++ __u8 max_thread_group_split; ++ __u8 impl_tech; ++ __u8 padding[3]; ++ __u32 tls_alloc; ++}; ++ ++/** ++ * struct mali_base_gpu_coherent_group - descriptor for a coherent group ++ * @core_mask: Core restriction mask required for the group ++ * @num_cores: Number of cores in the group ++ * @padding: padding to align to 8-byte ++ * ++ * \c core_mask exposes all cores in that coherent group, and \c num_cores ++ * provides a cached population-count for that mask. ++ * ++ * @note Whilst all cores are exposed in the mask, not all may be available to ++ * the application, depending on the Kernel Power policy. ++ * ++ * @note if u64s must be 8-byte aligned, then this structure has 32-bits of ++ * wastage. ++ */ ++struct mali_base_gpu_coherent_group { ++ __u64 core_mask; ++ __u16 num_cores; ++ __u16 padding[3]; ++}; ++ ++/** ++ * struct mali_base_gpu_coherent_group_info - Coherency group information ++ * @num_groups: Number of coherent groups in the GPU. ++ * @num_core_groups: Number of core groups (coherent or not) in the GPU. ++ * Equivalent to the number of L2 Caches. ++ * The GPU Counter dumping writes 2048 bytes per core group, ++ * regardless of whether the core groups are coherent or not. ++ * Hence this member is needed to calculate how much memory ++ * is required for dumping. ++ * @note Do not use it to work out how many valid elements ++ * are in the group[] member. Use num_groups instead. ++ * @coherency: Coherency features of the memory, accessed by gpu_mem_features ++ * methods ++ * @padding: padding to align to 8-byte ++ * @group: Descriptors of coherent groups ++ * ++ * Note that the sizes of the members could be reduced. However, the \c group ++ * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte ++ * aligned, thus leading to wastage if the other members sizes were reduced. ++ * ++ * The groups are sorted by core mask. The core masks are non-repeating and do ++ * not intersect. 
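Since @num_cores is documented as a cached population count of @core_mask, the relationship can be written down directly. The helper below is a hypothetical sketch using the GCC/Clang popcount builtin:

static inline __u16
coherent_group_core_count(const struct mali_base_gpu_coherent_group *group)
{
	/* core_mask may be non-contiguous (faulty cores can be disabled in
	 * production), so a population count rather than highest-bit math
	 * is required. */
	return (__u16)__builtin_popcountll(group->core_mask);
}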
++ */ ++struct mali_base_gpu_coherent_group_info { ++ __u32 num_groups; ++ __u32 num_core_groups; ++ __u32 coherency; ++ __u32 padding; ++ struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; ++}; ++ ++#if MALI_USE_CSF ++#include "csf/mali_base_csf_kernel.h" ++#else ++#include "jm/mali_base_jm_kernel.h" ++#endif ++ ++/** ++ * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware ++ * Configuration Discovery registers. ++ * @shader_present: Shader core present bitmap ++ * @tiler_present: Tiler core present bitmap ++ * @l2_present: Level 2 cache present bitmap ++ * @stack_present: Core stack present bitmap ++ * @l2_features: L2 features ++ * @core_features: Core features ++ * @mem_features: Mem features ++ * @mmu_features: Mmu features ++ * @as_present: Bitmap of address spaces present ++ * @js_present: Job slots present ++ * @js_features: Array of job slot features. ++ * @tiler_features: Tiler features ++ * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU ++ * @gpu_id: GPU and revision identifier ++ * @thread_max_threads: Maximum number of threads per core ++ * @thread_max_workgroup_size: Maximum number of threads per workgroup ++ * @thread_max_barrier_size: Maximum number of threads per barrier ++ * @thread_features: Thread features ++ * @coherency_mode: Note: This is the _selected_ coherency mode rather than the ++ * available modes as exposed in the coherency_features register ++ * @thread_tls_alloc: Number of threads per core that TLS must be allocated for ++ * @gpu_features: GPU features ++ * ++ * The information is presented inefficiently for access. For frequent access, ++ * the values should be better expressed in an unpacked form in the ++ * base_gpu_props structure. ++ * ++ * The raw properties in gpu_raw_gpu_props are necessary to ++ * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device ++ * behaving differently?". In this case, all information about the ++ * configuration is potentially useful, but it does not need to be processed ++ * by the driver. Instead, the raw registers can be processed by the Mali ++ * Tools software on the host PC. ++ * ++ */ ++struct gpu_raw_gpu_props { ++ __u64 shader_present; ++ __u64 tiler_present; ++ __u64 l2_present; ++ __u64 stack_present; ++ __u32 l2_features; ++ __u32 core_features; ++ __u32 mem_features; ++ __u32 mmu_features; ++ ++ __u32 as_present; ++ ++ __u32 js_present; ++ __u32 js_features[GPU_MAX_JOB_SLOTS]; ++ __u32 tiler_features; ++ __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ ++ __u32 gpu_id; ++ ++ __u32 thread_max_threads; ++ __u32 thread_max_workgroup_size; ++ __u32 thread_max_barrier_size; ++ __u32 thread_features; ++ ++ /* ++ * Note: This is the _selected_ coherency mode rather than the ++ * available modes as exposed in the coherency_features register. ++ */ ++ __u32 coherency_mode; ++ ++ __u32 thread_tls_alloc; ++ __u64 gpu_features; ++}; ++ ++/** ++ * struct base_gpu_props - Return structure for base_get_gpu_props(). ++ * @core_props: Core props. ++ * @l2_props: L2 props. ++ * @unused_1: Keep for backwards compatibility. ++ * @tiler_props: Tiler props. ++ * @thread_props: Thread props. ++ * @raw_props: This member is large, likely to be 128 bytes. ++ * @coherency_info: This must be last member of the structure. ++ * ++ * NOTE: the raw_props member in this data structure contains the register ++ * values from which the value of the other members are derived. 
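The texture_features words are described earlier as bit patterns in which a set bit marks a supported format, so a support check reduces to picking a register and a bit. In the hypothetical helper below, the split at 32 bits per register is an assumption based on the __u32 register width:

#include <stdbool.h>

static inline bool
raw_props_texture_format_supported(const struct gpu_raw_gpu_props *raw,
				   unsigned int format_bit)
{
	unsigned int reg = format_bit / 32;   /* which TEXTURE_FEATURES_x register */
	unsigned int bit = format_bit % 32;   /* bit within that register */

	if (reg >= BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS)
		return false;

	return (raw->texture_features[reg] >> bit) & 1u;
}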
The derived ++ * members exist to allow for efficient access and/or shielding the details ++ * of the layout of the registers. ++ */ ++struct base_gpu_props { ++ struct mali_base_gpu_core_props core_props; ++ struct mali_base_gpu_l2_cache_props l2_props; ++ __u64 unused_1; ++ struct mali_base_gpu_tiler_props tiler_props; ++ struct mali_base_gpu_thread_props thread_props; ++ struct gpu_raw_gpu_props raw_props; ++ struct mali_base_gpu_coherent_group_info coherency_info; ++}; ++ ++#define BASE_MEM_GROUP_ID_GET(flags) \ ++ ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) ++ ++#define BASE_MEM_GROUP_ID_SET(id) \ ++ (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? \ ++ BASE_MEM_GROUP_DEFAULT : \ ++ id) \ ++ << BASEP_MEM_GROUP_ID_SHIFT) & \ ++ BASE_MEM_GROUP_ID_MASK) ++ ++#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ ++ (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ ++ ((base_context_create_flags)(group_id) \ ++ << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) ++ ++#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ ++ ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ ++ BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) ++ ++/* ++ * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These ++ * flags are also used, where applicable, for specifying which fields ++ * are valid following the request operation. ++ */ ++ ++/* For monotonic (counter) timefield */ ++#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) ++/* For system wide timestamp */ ++#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) ++/* For GPU cycle counter */ ++#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) ++/* Specify kernel GPU register timestamp */ ++#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) ++/* Specify userspace cntvct_el0 timestamp source */ ++#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) ++ ++#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ ++ BASE_TIMEINFO_MONOTONIC_FLAG | \ ++ BASE_TIMEINFO_TIMESTAMP_FLAG | \ ++ BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ ++ BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ ++ BASE_TIMEINFO_USER_SOURCE_FLAG) ++ ++/* Maximum number of source allocations allowed to create an alias allocation. ++ * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array ++ * layers, since each cube map in the array will have 6 faces. ++ */ ++#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) ++ ++#endif /* _UAPI_BASE_KERNEL_H_ */ +diff --git a/src/panfrost/base/include/mali_kbase_gpuprops.h b/src/panfrost/base/include/mali_kbase_gpuprops.h +new file mode 100644 +index 00000000000..b250feca022 +--- /dev/null ++++ b/src/panfrost/base/include/mali_kbase_gpuprops.h +@@ -0,0 +1,127 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. 
++ * ++ */ ++ ++#ifndef _UAPI_KBASE_GPUPROP_H_ ++#define _UAPI_KBASE_GPUPROP_H_ ++ ++/********************************** ++ * Definitions for GPU properties * ++ **********************************/ ++#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) ++#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) ++#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) ++#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) ++ ++#define KBASE_GPUPROP_PRODUCT_ID 1 ++#define KBASE_GPUPROP_VERSION_STATUS 2 ++#define KBASE_GPUPROP_MINOR_REVISION 3 ++#define KBASE_GPUPROP_MAJOR_REVISION 4 ++/* 5 previously used for GPU speed */ ++#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 ++/* 7 previously used for minimum GPU speed */ ++#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 ++#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 ++ ++#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 ++#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 ++#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 ++ ++#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 ++#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 ++ ++#define KBASE_GPUPROP_MAX_THREADS 18 ++#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 ++#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 ++#define KBASE_GPUPROP_MAX_REGISTERS 21 ++#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 ++#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 ++#define KBASE_GPUPROP_IMPL_TECH 24 ++ ++#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 ++#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 ++#define KBASE_GPUPROP_RAW_L2_PRESENT 27 ++#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 ++#define KBASE_GPUPROP_RAW_L2_FEATURES 29 ++#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 ++#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 ++#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 ++#define KBASE_GPUPROP_RAW_AS_PRESENT 33 ++#define KBASE_GPUPROP_RAW_JS_PRESENT 34 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 ++#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 ++#define KBASE_GPUPROP_RAW_GPU_ID 55 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 ++#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 ++#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 ++ ++#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 ++#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 ++#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 ++#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 ++#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 ++#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 ++#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 ++#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 ++#define KBASE_GPUPROP_COHERENCY_GROUP_5 69 ++#define 
KBASE_GPUPROP_COHERENCY_GROUP_6 70 ++#define KBASE_GPUPROP_COHERENCY_GROUP_7 71 ++#define KBASE_GPUPROP_COHERENCY_GROUP_8 72 ++#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 ++#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 ++#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 ++#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 ++#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 ++#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 ++#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 ++ ++#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 ++ ++#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 ++ ++#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 ++#define KBASE_GPUPROP_TLS_ALLOC 84 ++#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 ++ ++#endif +diff --git a/src/panfrost/base/include/mali_kbase_ioctl.h b/src/panfrost/base/include/mali_kbase_ioctl.h +new file mode 100644 +index 00000000000..96f606af5f8 +--- /dev/null ++++ b/src/panfrost/base/include/mali_kbase_ioctl.h +@@ -0,0 +1,759 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_KBASE_IOCTL_H_ ++#define _UAPI_KBASE_IOCTL_H_ ++ ++#ifdef __cpluscplus ++extern "C" { ++#endif ++ ++#include ++#include ++ ++#if MALI_USE_CSF ++#include "csf/mali_kbase_csf_ioctl.h" ++#else ++#include "jm/mali_kbase_jm_ioctl.h" ++#endif /* MALI_USE_CSF */ ++ ++#define KBASE_IOCTL_TYPE 0x80 ++ ++/** ++ * struct kbase_ioctl_set_flags - Set kernel context creation flags ++ * ++ * @create_flags: Flags - see base_context_create_flags ++ */ ++struct kbase_ioctl_set_flags { ++ __u32 create_flags; ++}; ++ ++#define KBASE_IOCTL_SET_FLAGS \ ++ _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) ++ ++/** ++ * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel ++ * ++ * @buffer: Pointer to the buffer to store properties into ++ * @size: Size of the buffer ++ * @flags: Flags - must be zero for now ++ * ++ * The ioctl will return the number of bytes stored into @buffer or an error ++ * on failure (e.g. @size is too small). If @size is specified as 0 then no ++ * data will be written but the return value will be the number of bytes needed ++ * for all the properties. ++ * ++ * @flags may be used in the future to request a different format for the ++ * buffer. With @flags == 0 the following format is used. ++ * ++ * The buffer will be filled with pairs of values, a __u32 key identifying the ++ * property followed by the value. The size of the value is identified using ++ * the bottom bits of the key. The value then immediately followed the key and ++ * is tightly packed (there is no padding). All keys and values are ++ * little-endian. 
++ * ++ * 00 = __u8 ++ * 01 = __u16 ++ * 10 = __u32 ++ * 11 = __u64 ++ */ ++struct kbase_ioctl_get_gpuprops { ++ __u64 buffer; ++ __u32 size; ++ __u32 flags; ++}; ++ ++#define KBASE_IOCTL_GET_GPUPROPS \ ++ _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) ++ ++/** ++ * union kbase_ioctl_mem_alloc - Allocate memory on the GPU ++ * @in: Input parameters ++ * @in.va_pages: The number of pages of virtual address space to reserve ++ * @in.commit_pages: The number of physical pages to allocate ++ * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region ++ * @in.flags: Flags ++ * @out: Output parameters ++ * @out.flags: Flags ++ * @out.gpu_va: The GPU virtual address which is allocated ++ */ ++union kbase_ioctl_mem_alloc { ++ struct { ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u64 flags; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_ALLOC \ ++ _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) ++ ++/** ++ * struct kbase_ioctl_mem_query - Query properties of a GPU memory region ++ * @in: Input parameters ++ * @in.gpu_addr: A GPU address contained within the region ++ * @in.query: The type of query ++ * @out: Output parameters ++ * @out.value: The result of the query ++ * ++ * Use a %KBASE_MEM_QUERY_xxx flag as input for @query. ++ */ ++union kbase_ioctl_mem_query { ++ struct { ++ __u64 gpu_addr; ++ __u64 query; ++ } in; ++ struct { ++ __u64 value; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_QUERY \ ++ _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) ++ ++#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) ++#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) ++#define KBASE_MEM_QUERY_FLAGS ((__u64)3) ++ ++/** ++ * struct kbase_ioctl_mem_free - Free a memory region ++ * @gpu_addr: Handle to the region to free ++ */ ++struct kbase_ioctl_mem_free { ++ __u64 gpu_addr; ++}; ++ ++#define KBASE_IOCTL_MEM_FREE \ ++ _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) ++ ++/** ++ * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader ++ * @buffer_count: requested number of dumping buffers ++ * @fe_bm: counters selection bitmask (Front end) ++ * @shader_bm: counters selection bitmask (Shader) ++ * @tiler_bm: counters selection bitmask (Tiler) ++ * @mmu_l2_bm: counters selection bitmask (MMU_L2) ++ * ++ * A fd is returned from the ioctl if successful, or a negative value on error ++ */ ++struct kbase_ioctl_hwcnt_reader_setup { ++ __u32 buffer_count; ++ __u32 fe_bm; ++ __u32 shader_bm; ++ __u32 tiler_bm; ++ __u32 mmu_l2_bm; ++}; ++ ++#define KBASE_IOCTL_HWCNT_READER_SETUP \ ++ _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) ++ ++/** ++ * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. ++ * @data: Counter samples for the dummy model. ++ * @size: Size of the counter sample data. ++ * @padding: Padding. 
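Putting the KBASE_IOCTL_GET_GPUPROPS encoding described above into code, a reader of the returned blob only needs the two size bits of each key. This is a hypothetical userspace sketch; it assumes a little-endian host and that the property number occupies the key bits above the size code:

#include <stdio.h>
#include <string.h>
#include <linux/types.h>

static void dump_gpuprops(const __u8 *buf, size_t len)
{
	size_t pos = 0;

	while (pos + sizeof(__u32) <= len) {
		__u32 key;
		__u64 value = 0;
		size_t value_size;

		memcpy(&key, buf + pos, sizeof(key));
		pos += sizeof(key);

		/* Bottom two bits are KBASE_GPUPROP_VALUE_SIZE_U8..U64. */
		value_size = (size_t)1 << (key & 0x3);
		if (pos + value_size > len)
			break;

		memcpy(&value, buf + pos, value_size); /* little-endian host assumed */
		pos += value_size;

		/* key >> 2 is assumed to be the KBASE_GPUPROP_* number. */
		printf("prop %u = %llu\n", key >> 2, (unsigned long long)value);
	}
}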
++ */ ++struct kbase_ioctl_hwcnt_values { ++ __u64 data; ++ __u32 size; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_HWCNT_SET \ ++ _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) ++ ++/** ++ * struct kbase_ioctl_disjoint_query - Query the disjoint counter ++ * @counter: A counter of disjoint events in the kernel ++ */ ++struct kbase_ioctl_disjoint_query { ++ __u32 counter; ++}; ++ ++#define KBASE_IOCTL_DISJOINT_QUERY \ ++ _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) ++ ++/** ++ * struct kbase_ioctl_get_ddk_version - Query the kernel version ++ * @version_buffer: Buffer to receive the kernel version string ++ * @size: Size of the buffer ++ * @padding: Padding ++ * ++ * The ioctl will return the number of bytes written into version_buffer ++ * (which includes a NULL byte) or a negative error code ++ * ++ * The ioctl request code has to be _IOW because the data in ioctl struct is ++ * being copied to the kernel, even though the kernel then writes out the ++ * version info to the buffer specified in the ioctl. ++ */ ++struct kbase_ioctl_get_ddk_version { ++ __u64 version_buffer; ++ __u32 size; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_GET_DDK_VERSION \ ++ _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory ++ * allocator (between kernel driver ++ * version 10.2--11.4) ++ * @va_pages: Number of VA pages to reserve for JIT ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. ++ * ++ * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for ++ * backwards compatibility. ++ */ ++struct kbase_ioctl_mem_jit_init_10_2 { ++ __u64 va_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory ++ * allocator (between kernel driver ++ * version 11.5--11.19) ++ * @va_pages: Number of VA pages to reserve for JIT ++ * @max_allocations: Maximum number of concurrent allocations ++ * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) ++ * @group_id: Group ID to be used for physical allocations ++ * @padding: Currently unused, must be zero ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. ++ * ++ * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for ++ * backwards compatibility. ++ */ ++struct kbase_ioctl_mem_jit_init_11_5 { ++ __u64 va_pages; ++ __u8 max_allocations; ++ __u8 trim_level; ++ __u8 group_id; ++ __u8 padding[5]; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory ++ * allocator ++ * @va_pages: Number of GPU virtual address pages to reserve for just-in-time ++ * memory allocations ++ * @max_allocations: Maximum number of concurrent allocations ++ * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) ++ * @group_id: Group ID to be used for physical allocations ++ * @padding: Currently unused, must be zero ++ * @phys_pages: Maximum number of physical pages to allocate just-in-time ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. 
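As a usage sketch for KBASE_IOCTL_GET_DDK_VERSION defined just above: the caller passes a user buffer and its size, and the ioctl reports how many bytes (including the terminating NUL) it wrote. The function name and the kbase device node are assumptions for illustration:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

/* kbase_fd: an open kbase device file descriptor (typically /dev/mali0). */
static int example_query_ddk_version(int kbase_fd, char *out, __u32 out_size)
{
	struct kbase_ioctl_get_ddk_version req;

	memset(&req, 0, sizeof(req));
	req.version_buffer = (__u64)(uintptr_t)out;
	req.size = out_size;

	/* Returns the number of bytes written into the buffer, or a
	 * negative value on failure. */
	return ioctl(kbase_fd, KBASE_IOCTL_GET_DDK_VERSION, &req);
}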
++ */ ++struct kbase_ioctl_mem_jit_init { ++ __u64 va_pages; ++ __u8 max_allocations; ++ __u8 trim_level; ++ __u8 group_id; ++ __u8 padding[5]; ++ __u64 phys_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) ++ ++/** ++ * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory ++ * ++ * @handle: GPU memory handle (GPU VA) ++ * @user_addr: The address where it is mapped in user space ++ * @size: The number of bytes to synchronise ++ * @type: The direction to synchronise: 0 is sync to memory (clean), ++ * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_mem_sync { ++ __u64 handle; ++ __u64 user_addr; ++ __u64 size; ++ __u8 type; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_MEM_SYNC \ ++ _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) ++ ++/** ++ * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer ++ * ++ * @in: Input parameters ++ * @in.gpu_addr: The GPU address of the memory region ++ * @in.cpu_addr: The CPU address to locate ++ * @in.size: A size in bytes to validate is contained within the region ++ * @out: Output parameters ++ * @out.offset: The offset from the start of the memory region to @cpu_addr ++ */ ++union kbase_ioctl_mem_find_cpu_offset { ++ struct { ++ __u64 gpu_addr; ++ __u64 cpu_addr; ++ __u64 size; ++ } in; ++ struct { ++ __u64 offset; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ ++ _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) ++ ++/** ++ * struct kbase_ioctl_get_context_id - Get the kernel context ID ++ * ++ * @id: The kernel context ID ++ */ ++struct kbase_ioctl_get_context_id { ++ __u32 id; ++}; ++ ++#define KBASE_IOCTL_GET_CONTEXT_ID \ ++ _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) ++ ++/** ++ * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd ++ * ++ * @flags: Flags ++ * ++ * The ioctl returns a file descriptor when successful ++ */ ++struct kbase_ioctl_tlstream_acquire { ++ __u32 flags; ++}; ++ ++#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ ++ _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) ++ ++#define KBASE_IOCTL_TLSTREAM_FLUSH \ ++ _IO(KBASE_IOCTL_TYPE, 19) ++ ++/** ++ * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region ++ * ++ * @gpu_addr: The memory region to modify ++ * @pages: The number of physical pages that should be present ++ * ++ * The ioctl may return on the following error codes or 0 for success: ++ * -ENOMEM: Out of memory ++ * -EINVAL: Invalid arguments ++ */ ++struct kbase_ioctl_mem_commit { ++ __u64 gpu_addr; ++ __u64 pages; ++}; ++ ++#define KBASE_IOCTL_MEM_COMMIT \ ++ _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) ++ ++/** ++ * union kbase_ioctl_mem_alias - Create an alias of memory regions ++ * @in: Input parameters ++ * @in.flags: Flags, see BASE_MEM_xxx ++ * @in.stride: Bytes between start of each memory region ++ * @in.nents: The number of regions to pack together into the alias ++ * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info ++ * @out: Output parameters ++ * @out.flags: Flags, see BASE_MEM_xxx ++ * @out.gpu_va: Address of the new alias ++ * @out.va_pages: Size of the new alias ++ */ ++union kbase_ioctl_mem_alias { ++ struct { ++ __u64 flags; ++ __u64 stride; ++ __u64 nents; ++ __u64 aliasing_info; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ __u64 
va_pages; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_ALIAS \ ++ _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) ++ ++/** ++ * union kbase_ioctl_mem_import - Import memory for use by the GPU ++ * @in: Input parameters ++ * @in.flags: Flags, see BASE_MEM_xxx ++ * @in.phandle: Handle to the external memory ++ * @in.type: Type of external memory, see base_mem_import_type ++ * @in.padding: Amount of extra VA pages to append to the imported buffer ++ * @out: Output parameters ++ * @out.flags: Flags, see BASE_MEM_xxx ++ * @out.gpu_va: Address of the new alias ++ * @out.va_pages: Size of the new alias ++ */ ++union kbase_ioctl_mem_import { ++ struct { ++ __u64 flags; ++ __u64 phandle; ++ __u32 type; ++ __u32 padding; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ __u64 va_pages; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_IMPORT \ ++ _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) ++ ++/** ++ * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region ++ * @gpu_va: The GPU region to modify ++ * @flags: The new flags to set ++ * @mask: Mask of the flags to modify ++ */ ++struct kbase_ioctl_mem_flags_change { ++ __u64 gpu_va; ++ __u64 flags; ++ __u64 mask; ++}; ++ ++#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ ++ _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) ++ ++/** ++ * struct kbase_ioctl_stream_create - Create a synchronisation stream ++ * @name: A name to identify this stream. Must be NULL-terminated. ++ * ++ * Note that this is also called a "timeline", but is named stream to avoid ++ * confusion with other uses of the word. ++ * ++ * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. ++ * ++ * The ioctl returns a file descriptor. ++ */ ++struct kbase_ioctl_stream_create { ++ char name[32]; ++}; ++ ++#define KBASE_IOCTL_STREAM_CREATE \ ++ _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) ++ ++/** ++ * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence ++ * @fd: The file descriptor to validate ++ */ ++struct kbase_ioctl_fence_validate { ++ int fd; ++}; ++ ++#define KBASE_IOCTL_FENCE_VALIDATE \ ++ _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) ++ ++/** ++ * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel ++ * @buffer: Pointer to the information ++ * @len: Length ++ * @padding: Padding ++ * ++ * The data provided is accessible through a debugfs file ++ */ ++struct kbase_ioctl_mem_profile_add { ++ __u64 buffer; ++ __u32 len; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_MEM_PROFILE_ADD \ ++ _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) ++ ++/** ++ * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource ++ * @count: Number of resources ++ * @address: Array of __u64 GPU addresses of the external resources to map ++ */ ++struct kbase_ioctl_sticky_resource_map { ++ __u64 count; ++ __u64 address; ++}; ++ ++#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ ++ _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) ++ ++/** ++ * struct kbase_ioctl_sticky_resource_unmap - Unmap a resource mapped which was ++ * previously permanently mapped ++ * @count: Number of resources ++ * @address: Array of __u64 GPU addresses of the external resources to unmap ++ */ ++struct kbase_ioctl_sticky_resource_unmap { ++ __u64 count; ++ __u64 address; ++}; ++ ++#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ ++ _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) ++ ++/** ++ * union 
kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of ++ * the GPU memory region for ++ * the given gpu address and ++ * the offset of that address ++ * into the region ++ * @in: Input parameters ++ * @in.gpu_addr: GPU virtual address ++ * @in.size: Size in bytes within the region ++ * @out: Output parameters ++ * @out.start: Address of the beginning of the memory region enclosing @gpu_addr ++ * for the length of @offset bytes ++ * @out.offset: The offset from the start of the memory region to @gpu_addr ++ */ ++union kbase_ioctl_mem_find_gpu_start_and_offset { ++ struct { ++ __u64 gpu_addr; ++ __u64 size; ++ } in; ++ struct { ++ __u64 start; ++ __u64 offset; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ ++ _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) ++ ++#define KBASE_IOCTL_CINSTR_GWT_START \ ++ _IO(KBASE_IOCTL_TYPE, 33) ++ ++#define KBASE_IOCTL_CINSTR_GWT_STOP \ ++ _IO(KBASE_IOCTL_TYPE, 34) ++ ++/** ++ * union kbase_ioctl_cinstr_gwt_dump - Used to collect all GPU write fault ++ * addresses. ++ * @in: Input parameters ++ * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. ++ * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) ++ * @in.len: Number of addresses the buffers can hold. ++ * @in.padding: padding ++ * @out: Output parameters ++ * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. ++ * @out.more_data_available: Status indicating if more addresses are available. ++ * @out.padding: padding ++ * ++ * This structure is used when performing a call to dump GPU write fault ++ * addresses. ++ */ ++union kbase_ioctl_cinstr_gwt_dump { ++ struct { ++ __u64 addr_buffer; ++ __u64 size_buffer; ++ __u32 len; ++ __u32 padding; ++ ++ } in; ++ struct { ++ __u32 no_of_addr_collected; ++ __u8 more_data_available; ++ __u8 padding[27]; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CINSTR_GWT_DUMP \ ++ _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) ++ ++/** ++ * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone ++ * ++ * @va_pages: Number of VA pages to reserve for EXEC_VA ++ */ ++struct kbase_ioctl_mem_exec_init { ++ __u64 va_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_EXEC_INIT \ ++ _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) ++ ++/** ++ * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of ++ * cpu/gpu time (counter values) ++ * @in: Input parameters ++ * @in.request_flags: Bit-flags indicating the requested types. ++ * @in.paddings: Unused, size alignment matching the out. ++ * @out: Output parameters ++ * @out.sec: Integer field of the monotonic time, unit in seconds. ++ * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. ++ * @out.padding: Unused, for __u64 alignment ++ * @out.timestamp: System wide timestamp (counter) value. ++ * @out.cycle_counter: GPU cycle counter value. 
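The GWT ioctls above work as a start/drain/stop sequence: KBASE_IOCTL_CINSTR_GWT_START enables write tracking, repeated KBASE_IOCTL_CINSTR_GWT_DUMP calls drain the modified-area list until more_data_available clears, and KBASE_IOCTL_CINSTR_GWT_STOP disables it. A sketch of the drain loop; the batch size is arbitrary and the fd is assumed to be an initialised kbase device:

    /* Sketch of draining GPU write-fault addresses with the GWT ioctls above. */
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    #include "mali_kbase_ioctl.h"

    #define GWT_BATCH 256

    static int dump_gpu_writes(int fd)
    {
       uint64_t addrs[GWT_BATCH], sizes[GWT_BATCH];

       if (ioctl(fd, KBASE_IOCTL_CINSTR_GWT_START) < 0)
          return -1;

       union kbase_ioctl_cinstr_gwt_dump dump;
       do {
          dump = (union kbase_ioctl_cinstr_gwt_dump) {
             .in = {
                .addr_buffer = (uintptr_t)addrs,
                .size_buffer = (uintptr_t)sizes,
                .len = GWT_BATCH,
             },
          };
          if (ioctl(fd, KBASE_IOCTL_CINSTR_GWT_DUMP, &dump) < 0)
             break;

          for (uint32_t i = 0; i < dump.out.no_of_addr_collected; i++)
             printf("modified: 0x%llx (%llu pages)\n",
                    (unsigned long long)addrs[i], (unsigned long long)sizes[i]);
       } while (dump.out.more_data_available);

       return ioctl(fd, KBASE_IOCTL_CINSTR_GWT_STOP);
    }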
++ */ ++union kbase_ioctl_get_cpu_gpu_timeinfo { ++ struct { ++ __u32 request_flags; ++ __u32 paddings[7]; ++ } in; ++ struct { ++ __u64 sec; ++ __u32 nsec; ++ __u32 padding; ++ __u64 timestamp; ++ __u64 cycle_counter; ++ } out; ++}; ++ ++#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ ++ _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) ++ ++/** ++ * struct kbase_ioctl_context_priority_check - Check the max possible priority ++ * @priority: Input priority & output priority ++ */ ++ ++struct kbase_ioctl_context_priority_check { ++ __u8 priority; ++}; ++ ++#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ ++ _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) ++ ++/** ++ * struct kbase_ioctl_set_limited_core_count - Set the limited core count. ++ * ++ * @max_core_count: Maximum core count ++ */ ++struct kbase_ioctl_set_limited_core_count { ++ __u8 max_core_count; ++}; ++ ++#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ ++ _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) ++ ++/** ++ * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter ++ * information ++ * @info_item_size: Performance counter item size in bytes. ++ * @info_item_count: Performance counter item count in the info_list_ptr. ++ * @info_list_ptr: Performance counter item list pointer which points to a ++ * list with info_item_count of items. ++ * ++ * On success: returns info_item_size and info_item_count if info_list_ptr is ++ * NULL, returns performance counter information if info_list_ptr is not NULL. ++ * On error: returns a negative error code. ++ */ ++struct kbase_ioctl_kinstr_prfcnt_enum_info { ++ __u32 info_item_size; ++ __u32 info_item_count; ++ __u64 info_list_ptr; ++}; ++ ++#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ ++ _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) ++ ++/** ++ * struct kbase_ioctl_kinstr_prfcnt_setup - Setup HWC dumper/reader ++ * @in: input parameters. ++ * @in.request_item_count: Number of requests in the requests array. ++ * @in.request_item_size: Size in bytes of each request in the requests array. ++ * @in.requests_ptr: Pointer to the requests array. ++ * @out: output parameters. ++ * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for ++ * each sample. ++ * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap ++ * for reading performance counter samples. ++ * ++ * A fd is returned from the ioctl if successful, or a negative value on error. 
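KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO above is a two-phase protocol: call it once with info_list_ptr set to NULL to learn info_item_size and info_item_count, then again with a buffer big enough for that many items. The sketch below shows only the sizing protocol and treats the item layout as opaque, since it is defined elsewhere:

    /* Two-phase enumeration of performance-counter info, as described above. */
    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    #include "mali_kbase_ioctl.h"

    static void *enum_prfcnt_info(int fd, uint32_t *count, uint32_t *item_size)
    {
       struct kbase_ioctl_kinstr_prfcnt_enum_info info = {
          .info_list_ptr = 0,   /* NULL: first call only asks for the sizes */
       };
       if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO, &info) < 0)
          return NULL;

       void *items = calloc(info.info_item_count, info.info_item_size);
       if (!items)
          return NULL;

       /* Second call: same sizes, now with a destination buffer. */
       info.info_list_ptr = (uintptr_t)items;
       if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO, &info) < 0) {
          free(items);
          return NULL;
       }

       *count = info.info_item_count;
       *item_size = info.info_item_size;
       return items;
    }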
++ */ ++union kbase_ioctl_kinstr_prfcnt_setup { ++ struct { ++ __u32 request_item_count; ++ __u32 request_item_size; ++ __u64 requests_ptr; ++ } in; ++ struct { ++ __u32 prfcnt_metadata_item_size; ++ __u32 prfcnt_mmap_size_bytes; ++ } out; ++}; ++ ++#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ ++ _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) ++ ++/*************** ++ * test ioctls * ++ ***************/ ++#if MALI_UNIT_TEST ++/* These ioctls are purely for test purposes and are not used in the production ++ * driver, they therefore may change without notice ++ */ ++ ++#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) ++ ++ ++/** ++ * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes ++ * @bytes_collected: number of bytes read by user ++ * @bytes_generated: number of bytes generated by tracepoints ++ */ ++struct kbase_ioctl_tlstream_stats { ++ __u32 bytes_collected; ++ __u32 bytes_generated; ++}; ++ ++#define KBASE_IOCTL_TLSTREAM_STATS \ ++ _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) ++ ++#endif /* MALI_UNIT_TEST */ ++ ++/* Customer extension range */ ++#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) ++ ++/* If the integration needs extra ioctl add them there ++ * like this: ++ * ++ * struct my_ioctl_args { ++ * .... ++ * } ++ * ++ * #define KBASE_IOCTL_MY_IOCTL \ ++ * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) ++ */ ++ ++#ifdef __cpluscplus ++} ++#endif ++ ++#endif /* _UAPI_KBASE_IOCTL_H_ */ +diff --git a/src/panfrost/base/include/old/mali-ioctl-midgard.h b/src/panfrost/base/include/old/mali-ioctl-midgard.h +new file mode 100644 +index 00000000000..5f33f5c4c4b +--- /dev/null ++++ b/src/panfrost/base/include/old/mali-ioctl-midgard.h +@@ -0,0 +1,80 @@ ++/* ++ * © Copyright 2017-2018 The Panfrost Community ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * A copy of the licence is included with the program, and can also be obtained ++ * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ++ * Boston, MA 02110-1301, USA. 
++ * ++ */ ++ ++#ifndef __KBASE_IOCTL_MIDGARD_H__ ++#define __KBASE_IOCTL_MIDGARD_H__ ++ ++#define KBASE_IOCTL_TYPE_BASE 0x80 ++#define KBASE_IOCTL_TYPE_MAX 0x82 ++ ++union kbase_ioctl_mem_alloc { ++ struct { ++ union kbase_ioctl_header header; ++ u64 va_pages; ++ u64 commit_pages; ++ u64 extension; ++ u64 flags; ++ } in; ++ struct { ++ union kbase_ioctl_header header; ++ u64 pad[3]; ++ u64 flags; ++ mali_ptr gpu_va; ++ u16 va_alignment; ++ } out; ++ u64 pad[7]; ++} __attribute__((packed)); ++ ++#define KBASE_IOCTL_TYPE_COUNT (KBASE_IOCTL_TYPE_MAX - KBASE_IOCTL_TYPE_BASE + 1) ++ ++#define KBASE_IOCTL_GET_VERSION (_IOWR(0x80, 0, struct kbase_ioctl_get_version)) ++#define KBASE_IOCTL_MEM_ALLOC (_IOWR(0x82, 0, union kbase_ioctl_mem_alloc)) ++#define KBASE_IOCTL_MEM_IMPORT (_IOWR(0x82, 1, union kbase_ioctl_mem_import)) ++#define KBASE_IOCTL_MEM_COMMIT (_IOWR(0x82, 2, struct kbase_ioctl_mem_commit)) ++#define KBASE_IOCTL_MEM_QUERY (_IOWR(0x82, 3, struct kbase_ioctl_mem_query)) ++#define KBASE_IOCTL_MEM_FREE (_IOWR(0x82, 4, struct kbase_ioctl_mem_free)) ++#define KBASE_IOCTL_MEM_FLAGS_CHANGE (_IOWR(0x82, 5, struct kbase_ioctl_mem_flags_change)) ++#define KBASE_IOCTL_MEM_ALIAS (_IOWR(0x82, 6, struct kbase_ioctl_mem_alias)) ++#define KBASE_IOCTL_MEM_SYNC (_IOWR(0x82, 8, struct kbase_ioctl_mem_sync)) ++#define KBASE_IOCTL_POST_TERM (_IOWR(0x82, 9, __ioctl_placeholder)) ++#define KBASE_IOCTL_HWCNT_SETUP (_IOWR(0x82, 10, __ioctl_placeholder)) ++#define KBASE_IOCTL_HWCNT_DUMP (_IOWR(0x82, 11, __ioctl_placeholder)) ++#define KBASE_IOCTL_HWCNT_CLEAR (_IOWR(0x82, 12, __ioctl_placeholder)) ++#define KBASE_IOCTL_GPU_PROPS_REG_DUMP (_IOWR(0x82, 14, struct kbase_ioctl_gpu_props_reg_dump)) ++#define KBASE_IOCTL_FIND_CPU_OFFSET (_IOWR(0x82, 15, __ioctl_placeholder)) ++#define KBASE_IOCTL_GET_VERSION_NEW (_IOWR(0x82, 16, struct kbase_ioctl_get_version)) ++#define KBASE_IOCTL_SET_FLAGS (_IOWR(0x82, 18, struct kbase_ioctl_set_flags)) ++#define KBASE_IOCTL_SET_TEST_DATA (_IOWR(0x82, 19, __ioctl_placeholder)) ++#define KBASE_IOCTL_INJECT_ERROR (_IOWR(0x82, 20, __ioctl_placeholder)) ++#define KBASE_IOCTL_MODEL_CONTROL (_IOWR(0x82, 21, __ioctl_placeholder)) ++#define KBASE_IOCTL_KEEP_GPU_POWERED (_IOWR(0x82, 22, __ioctl_placeholder)) ++#define KBASE_IOCTL_FENCE_VALIDATE (_IOWR(0x82, 23, __ioctl_placeholder)) ++#define KBASE_IOCTL_STREAM_CREATE (_IOWR(0x82, 24, struct kbase_ioctl_stream_create)) ++#define KBASE_IOCTL_GET_PROFILING_CONTROLS (_IOWR(0x82, 25, __ioctl_placeholder)) ++#define KBASE_IOCTL_SET_PROFILING_CONTROLS (_IOWR(0x82, 26, __ioctl_placeholder)) ++#define KBASE_IOCTL_DEBUGFS_MEM_PROFILE_ADD (_IOWR(0x82, 27, __ioctl_placeholder)) ++#define KBASE_IOCTL_JOB_SUBMIT (_IOWR(0x82, 28, struct kbase_ioctl_job_submit)) ++#define KBASE_IOCTL_DISJOINT_QUERY (_IOWR(0x82, 29, __ioctl_placeholder)) ++#define KBASE_IOCTL_GET_CONTEXT_ID (_IOWR(0x82, 31, struct kbase_ioctl_get_context_id)) ++#define KBASE_IOCTL_TLSTREAM_ACQUIRE_V10_4 (_IOWR(0x82, 32, __ioctl_placeholder)) ++#define KBASE_IOCTL_TLSTREAM_TEST (_IOWR(0x82, 33, __ioctl_placeholder)) ++#define KBASE_IOCTL_TLSTREAM_STATS (_IOWR(0x82, 34, __ioctl_placeholder)) ++#define KBASE_IOCTL_TLSTREAM_FLUSH (_IOWR(0x82, 35, __ioctl_placeholder)) ++#define KBASE_IOCTL_HWCNT_READER_SETUP (_IOWR(0x82, 36, __ioctl_placeholder)) ++#define KBASE_IOCTL_SET_PRFCNT_VALUES (_IOWR(0x82, 37, __ioctl_placeholder)) ++#define KBASE_IOCTL_SOFT_EVENT_UPDATE (_IOWR(0x82, 38, __ioctl_placeholder)) ++#define KBASE_IOCTL_MEM_JIT_INIT (_IOWR(0x82, 39, __ioctl_placeholder)) ++#define 
KBASE_IOCTL_TLSTREAM_ACQUIRE (_IOWR(0x82, 40, __ioctl_placeholder)) ++ ++#endif /* __KBASE_IOCTL_MIDGARD_H__ */ +diff --git a/src/panfrost/base/include/old/mali-ioctl.h b/src/panfrost/base/include/old/mali-ioctl.h +new file mode 100644 +index 00000000000..5c76f2dc8e5 +--- /dev/null ++++ b/src/panfrost/base/include/old/mali-ioctl.h +@@ -0,0 +1,743 @@ ++/* ++ * © Copyright 2017-2018 The Panfrost Community ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * A copy of the licence is included with the program, and can also be obtained ++ * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ++ * Boston, MA 02110-1301, USA. ++ * ++ */ ++ ++/** ++ * Definitions for all of the ioctls for the original open source bifrost GPU ++ * kernel driver, written by ARM. ++ */ ++ ++#ifndef __KBASE_IOCTL_H__ ++#define __KBASE_IOCTL_H__ ++ ++typedef uint8_t u8; ++typedef uint16_t u16; ++typedef uint32_t u32; ++typedef uint64_t u64; ++ ++typedef int32_t s32; ++typedef int64_t s64; ++ ++ ++typedef u8 mali_atom_id; ++ ++/** ++ * Since these structs are passed to and from the kernel we need to make sure ++ * that we get the size of each struct to match exactly what the kernel is ++ * expecting. So, when editing this file make sure to add static asserts that ++ * check each struct's size against the arg length you see in strace. ++ */ ++ ++enum kbase_ioctl_mem_flags { ++ /* IN */ ++ BASE_MEM_PROT_CPU_RD = (1U << 0), /**< Read access CPU side */ ++ BASE_MEM_PROT_CPU_WR = (1U << 1), /**< Write access CPU side */ ++ BASE_MEM_PROT_GPU_RD = (1U << 2), /**< Read access GPU side */ ++ BASE_MEM_PROT_GPU_WR = (1U << 3), /**< Write access GPU side */ ++ BASE_MEM_PROT_GPU_EX = (1U << 4), /**< Execute allowed on the GPU ++ side */ ++ ++ BASE_MEM_GROW_ON_GPF = (1U << 9), /**< Grow backing store on GPU ++ Page Fault */ ++ ++ BASE_MEM_COHERENT_SYSTEM = (1U << 10), /**< Page coherence Outer ++ shareable, if available */ ++ BASE_MEM_COHERENT_LOCAL = (1U << 11), /**< Page coherence Inner ++ shareable */ ++ BASE_MEM_CACHED_CPU = (1U << 12), /**< Should be cached on the ++ CPU */ ++ ++ /* IN/OUT */ ++ BASE_MEM_SAME_VA = (1U << 13), /**< Must have same VA on both the GPU ++ and the CPU */ ++ /* OUT */ ++ BASE_MEM_NEED_MMAP = (1U << 14), /**< Must call mmap to acquire a GPU ++ address for the alloc */ ++ /* IN */ ++ BASE_MEM_COHERENT_SYSTEM_REQUIRED = (1U << 15), /**< Page coherence ++ Outer shareable, required. 
*/ ++ BASE_MEM_SECURE = (1U << 16), /**< Secure memory */ ++ BASE_MEM_DONT_NEED = (1U << 17), /**< Not needed physical ++ memory */ ++ BASE_MEM_IMPORT_SHARED = (1U << 18), /**< Must use shared CPU/GPU zone ++ (SAME_VA zone) but doesn't ++ require the addresses to ++ be the same */ ++}; ++ ++#define KBASE_IOCTL_MEM_FLAGS_IN_MASK \ ++ (BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | \ ++ BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_GPU_EX | \ ++ BASE_MEM_GROW_ON_GPF | \ ++ BASE_MEM_COHERENT_SYSTEM | BASE_MEM_COHERENT_LOCAL | \ ++ BASE_MEM_CACHED_CPU | \ ++ BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_SECURE | \ ++ BASE_MEM_DONT_NEED | BASE_MEM_IMPORT_SHARED) ++#define BASE_MEM_MAP_TRACKING_HANDLE (3ull << 12) ++ ++enum kbase_ioctl_coherency_mode { ++ COHERENCY_ACE_LITE = 0, ++ COHERENCY_ACE = 1, ++ COHERENCY_NONE = 31 ++}; ++ ++/* ++ * Mali Atom priority ++ * ++ * Only certain priority levels are actually implemented, as specified by the ++ * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority ++ * level that is not one of those defined below. ++ * ++ * Priority levels only affect scheduling between atoms of the same type within ++ * a mali context, and only after the atoms have had dependencies resolved. ++ * Fragment atoms does not affect non-frament atoms with lower priorities, and ++ * the other way around. For example, a low priority atom that has had its ++ * dependencies resolved might run before a higher priority atom that has not ++ * had its dependencies resolved. ++ * ++ * The scheduling between mali contexts/processes and between atoms from ++ * different mali contexts/processes is unaffected by atom priority. ++ * ++ * The atoms are scheduled as follows with respect to their priorities: ++ * - Let atoms 'X' and 'Y' be for the same job slot who have dependencies ++ * resolved, and atom 'X' has a higher priority than atom 'Y' ++ * - If atom 'Y' is currently running on the HW, then it is interrupted to ++ * allow atom 'X' to run soon after ++ * - If instead neither atom 'Y' nor atom 'X' are running, then when choosing ++ * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' ++ * - Any two atoms that have the same priority could run in any order with ++ * respect to each other. That is, there is no ordering constraint between ++ * atoms of the same priority. ++ */ ++typedef u8 mali_jd_prio; ++#define BASE_JD_PRIO_MEDIUM ((mali_jd_prio)0) ++#define BASE_JD_PRIO_HIGH ((mali_jd_prio)1) ++#define BASE_JD_PRIO_LOW ((mali_jd_prio)2) ++ ++/** ++ * @brief Job dependency type. ++ * ++ * A flags field will be inserted into the atom structure to specify whether a ++ * dependency is a data or ordering dependency (by putting it before/after ++ * 'core_req' in the structure it should be possible to add without changing ++ * the structure size). When the flag is set for a particular dependency to ++ * signal that it is an ordering only dependency then errors will not be ++ * propagated. ++ */ ++typedef u8 mali_jd_dep_type; ++#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ ++#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ ++#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ ++ ++/** ++ * @brief Job chain hardware requirements. ++ * ++ * A job chain must specify what GPU features it needs to allow the ++ * driver to schedule the job correctly. By not specifying the ++ * correct settings can/will cause an early job termination. 
Multiple ++ * values can be ORed together to specify multiple requirements. ++ * Special case is ::BASE_JD_REQ_DEP, which is used to express complex ++ * dependencies, and that doesn't execute anything on the hardware. ++ */ ++typedef u32 mali_jd_core_req; ++ ++/* Requirements that come from the HW */ ++ ++/** ++ * No requirement, dependency only ++ */ ++#define BASE_JD_REQ_DEP ((mali_jd_core_req)0) ++ ++/** ++ * Requires fragment shaders ++ */ ++#define BASE_JD_REQ_FS ((mali_jd_core_req)1 << 0) ++ ++/** ++ * Requires compute shaders ++ * This covers any of the following Midgard Job types: ++ * - Vertex Shader Job ++ * - Geometry Shader Job ++ * - An actual Compute Shader Job ++ * ++ * Compare this with @ref BASE_JD_REQ_ONLY_COMPUTE, which specifies that the ++ * job is specifically just the "Compute Shader" job type, and not the "Vertex ++ * Shader" nor the "Geometry Shader" job type. ++ */ ++#define BASE_JD_REQ_CS ((mali_jd_core_req)1 << 1) ++#define BASE_JD_REQ_T ((mali_jd_core_req)1 << 2) /**< Requires tiling */ ++#define BASE_JD_REQ_CF ((mali_jd_core_req)1 << 3) /**< Requires cache flushes */ ++#define BASE_JD_REQ_V ((mali_jd_core_req)1 << 4) /**< Requires value writeback */ ++ ++/* SW-only requirements - the HW does not expose these as part of the job slot ++ * capabilities */ ++ ++/* Requires fragment job with AFBC encoding */ ++#define BASE_JD_REQ_FS_AFBC ((mali_jd_core_req)1 << 13) ++ ++/** ++ * SW-only requirement: coalesce completion events. ++ * If this bit is set then completion of this atom will not cause an event to ++ * be sent to userspace, whether successful or not; completion events will be ++ * deferred until an atom completes which does not have this bit set. ++ * ++ * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. ++ */ ++#define BASE_JD_REQ_EVENT_COALESCE ((mali_jd_core_req)1 << 5) ++ ++/** ++ * SW Only requirement: the job chain requires a coherent core group. We don't ++ * mind which coherent core group is used. ++ */ ++#define BASE_JD_REQ_COHERENT_GROUP ((mali_jd_core_req)1 << 6) ++ ++/** ++ * SW Only requirement: The performance counters should be enabled only when ++ * they are needed, to reduce power consumption. ++ */ ++ ++#define BASE_JD_REQ_PERMON ((mali_jd_core_req)1 << 7) ++ ++/** ++ * SW Only requirement: External resources are referenced by this atom. When ++ * external resources are referenced no syncsets can be bundled with the atom ++ * but should instead be part of a NULL jobs inserted into the dependency ++ * tree. The first pre_dep object must be configured for the external ++ * resouces to use, the second pre_dep object can be used to create other ++ * dependencies. ++ * ++ * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE. ++ */ ++#define BASE_JD_REQ_EXTERNAL_RESOURCES ((mali_jd_core_req)1 << 8) ++ ++/** ++ * SW Only requirement: Software defined job. Jobs with this bit set will not ++ * be submitted to the hardware but will cause some action to happen within ++ * the driver ++ */ ++#define BASE_JD_REQ_SOFT_JOB ((mali_jd_core_req)1 << 9) ++ ++#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) ++#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) ++#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) ++ ++/** ++ * SW Only requirement : Replay job. ++ * ++ * If the preceding job fails, the replay job will cause the jobs specified in ++ * the list of mali_jd_replay_payload pointed to by the jc pointer to be ++ * replayed. 
++ * ++ * A replay job will only cause jobs to be replayed up to MALIP_JD_REPLAY_LIMIT ++ * times. If a job fails more than MALIP_JD_REPLAY_LIMIT times then the replay ++ * job is failed, as well as any following dependencies. ++ * ++ * The replayed jobs will require a number of atom IDs. If there are not enough ++ * free atom IDs then the replay job will fail. ++ * ++ * If the preceding job does not fail, then the replay job is returned as ++ * completed. ++ * ++ * The replayed jobs will never be returned to userspace. The preceding failed ++ * job will be returned to userspace as failed; the status of this job should ++ * be ignored. Completion should be determined by the status of the replay soft ++ * job. ++ * ++ * In order for the jobs to be replayed, the job headers will have to be ++ * modified. The Status field will be reset to NOT_STARTED. If the Job Type ++ * field indicates a Vertex Shader Job then it will be changed to Null Job. ++ * ++ * The replayed jobs have the following assumptions : ++ * ++ * - No external resources. Any required external resources will be held by the ++ * replay atom. ++ * - Pre-dependencies are created based on job order. ++ * - Atom numbers are automatically assigned. ++ * - device_nr is set to 0. This is not relevant as ++ * BASE_JD_REQ_SPECIFIC_COHERENT_GROUP should not be set. ++ * - Priority is inherited from the replay job. ++ */ ++#define BASE_JD_REQ_SOFT_REPLAY (BASE_JD_REQ_SOFT_JOB | 0x4) ++/** ++ * SW only requirement: event wait/trigger job. ++ * ++ * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. ++ * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the ++ * other waiting jobs. It completes immediately. ++ * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it ++ * possible for other jobs to wait upon. It completes immediately. ++ */ ++#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) ++#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) ++#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) ++ ++#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) ++ ++/** ++ * SW only requirement: Just In Time allocation ++ * ++ * This job requests a JIT allocation based on the request in the ++ * @base_jit_alloc_info structure which is passed via the jc element of ++ * the atom. ++ * ++ * It should be noted that the id entry in @base_jit_alloc_info must not ++ * be reused until it has been released via @BASE_JD_REQ_SOFT_JIT_FREE. ++ * ++ * Should this soft job fail it is expected that a @BASE_JD_REQ_SOFT_JIT_FREE ++ * soft job to free the JIT allocation is still made. ++ * ++ * The job will complete immediately. ++ */ ++#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) ++/** ++ * SW only requirement: Just In Time free ++ * ++ * This job requests a JIT allocation created by @BASE_JD_REQ_SOFT_JIT_ALLOC ++ * to be freed. The ID of the JIT allocation is passed via the jc element of ++ * the atom. ++ * ++ * The job will complete immediately. ++ */ ++#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) ++ ++/** ++ * SW only requirement: Map external resource ++ * ++ * This job requests external resource(s) are mapped once the dependencies ++ * of the job have been satisfied. The list of external resources are ++ * passed via the jc element of the atom which is a pointer to a ++ * @base_external_resource_list. 
++ */ ++#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) ++/** ++ * SW only requirement: Unmap external resource ++ * ++ * This job requests external resource(s) are unmapped once the dependencies ++ * of the job has been satisfied. The list of external resources are ++ * passed via the jc element of the atom which is a pointer to a ++ * @base_external_resource_list. ++ */ ++#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) ++ ++/** ++ * HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) ++ * ++ * This indicates that the Job Chain contains Midgard Jobs of the 'Compute ++ * Shaders' type. ++ * ++ * In contrast to @ref BASE_JD_REQ_CS, this does \b not indicate that the Job ++ * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. ++ */ ++#define BASE_JD_REQ_ONLY_COMPUTE ((mali_jd_core_req)1 << 10) ++ ++/** ++ * HW Requirement: Use the mali_jd_atom::device_nr field to specify a ++ * particular core group ++ * ++ * If both @ref BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag ++ * takes priority ++ * ++ * This is only guaranteed to work for @ref BASE_JD_REQ_ONLY_COMPUTE atoms. ++ * ++ * If the core availability policy is keeping the required core group turned ++ * off, then the job will fail with a @ref BASE_JD_EVENT_PM_EVENT error code. ++ */ ++#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((mali_jd_core_req)1 << 11) ++ ++/** ++ * SW Flag: If this bit is set then the successful completion of this atom ++ * will not cause an event to be sent to userspace ++ */ ++#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((mali_jd_core_req)1 << 12) ++ ++/** ++ * SW Flag: If this bit is set then completion of this atom will not cause an ++ * event to be sent to userspace, whether successful or not. ++ */ ++#define BASE_JD_REQ_EVENT_NEVER ((mali_jd_core_req)1 << 14) ++ ++/** ++ * SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. ++ * ++ * If this bit is set then the GPU's cache will not be cleaned and invalidated ++ * until a GPU job starts which does not have this bit set or a job completes ++ * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use if ++ * the CPU may have written to memory addressed by the job since the last job ++ * without this bit set was submitted. ++ */ ++#define BASE_JD_REQ_SKIP_CACHE_START ((mali_jd_core_req)1 << 15) ++ ++/** ++ * SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. ++ * ++ * If this bit is set then the GPU's cache will not be cleaned and invalidated ++ * until a GPU job completes which does not have this bit set or a job starts ++ * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_START bti set. Do not ++ * use if the CPU may read from or partially overwrite memory addressed by the ++ * job before the next job without this bit set completes. ++ */ ++#define BASE_JD_REQ_SKIP_CACHE_END ((mali_jd_core_req)1 << 16) ++ ++/** ++ * These requirement bits are currently unused in mali_jd_core_req ++ */ ++#define MALIP_JD_REQ_RESERVED \ ++ (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ ++ BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | MALIP_JD_REQ_EVENT_NEVER | \ ++ BASE_JD_REQ_EVENT_COALESCE | \ ++ BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ ++ BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ ++ BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END)) ++ ++/** ++ * Mask of all bits in mali_jd_core_req that control the type of the atom. 
++ * ++ * This allows dependency only atoms to have flags set ++ */ ++#define BASE_JD_REQ_ATOM_TYPE \ ++ (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ ++ BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) ++ ++/** ++ * Mask of all bits in mali_jd_core_req that control the type of a soft job. ++ */ ++#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) ++ ++/* ++ * Returns non-zero value if core requirements passed define a soft job or ++ * a dependency only job. ++ */ ++#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ ++ ((core_req & BASE_JD_REQ_SOFT_JOB) || \ ++ (core_req & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) ++ ++/** ++ * @brief The payload for a replay job. This must be in GPU memory. ++ */ ++struct mali_jd_replay_payload { ++ /** ++ * Pointer to the first entry in the mali_jd_replay_jc list. These ++ * will be replayed in @b reverse order (so that extra ones can be added ++ * to the head in future soft jobs without affecting this soft job) ++ */ ++ u64 tiler_jc_list; ++ ++ /** ++ * Pointer to the fragment job chain. ++ */ ++ u64 fragment_jc; ++ ++ /** ++ * Pointer to the tiler heap free FBD field to be modified. ++ */ ++ u64 tiler_heap_free; ++ ++ /** ++ * Hierarchy mask for the replayed fragment jobs. May be zero. ++ */ ++ u16 fragment_hierarchy_mask; ++ ++ /** ++ * Hierarchy mask for the replayed tiler jobs. May be zero. ++ */ ++ u16 tiler_hierarchy_mask; ++ ++ /** ++ * Default weight to be used for hierarchy levels not in the original ++ * mask. ++ */ ++ u32 hierarchy_default_weight; ++ ++ /** ++ * Core requirements for the tiler job chain ++ */ ++ mali_jd_core_req tiler_core_req; ++ ++ /** ++ * Core requirements for the fragment job chain ++ */ ++ mali_jd_core_req fragment_core_req; ++}; ++ ++/** ++ * @brief An entry in the linked list of job chains to be replayed. This must ++ * be in GPU memory. ++ */ ++struct mali_jd_replay_jc { ++ /** ++ * Pointer to next entry in the list. A setting of NULL indicates the ++ * end of the list. ++ */ ++ u64 next; ++ ++ /** ++ * Pointer to the job chain. ++ */ ++ u64 jc; ++}; ++ ++typedef u64 mali_ptr; ++ ++#define MALI_PTR_FMT "0x%" PRIx64 ++#define MALI_SHORT_PTR_FMT "0x%" PRIxPTR ++ ++#ifdef __LP64__ ++#define PAD_CPU_PTR(p) p ++#else ++#define PAD_CPU_PTR(p) p; u32 :32; ++#endif ++ ++/* FIXME: Again, they don't specify any of these as packed structs. However, ++ * looking at these structs I'm worried that there is already spots where the ++ * compiler is potentially sticking in padding... ++ * Going to try something a little crazy, and just hope that our compiler ++ * happens to add the same kind of offsets since we can't really compare sizes ++ */ ++ ++/* ++ * Blob provided by the driver to store callback driver, not actually modified ++ * by the driver itself ++ */ ++struct mali_jd_udata { ++ u64 blob[2]; ++}; ++ ++struct mali_jd_dependency { ++ mali_atom_id atom_id; /**< An atom number */ ++ mali_jd_dep_type dependency_type; /**< Dependency type */ ++}; ++ ++#define MALI_EXT_RES_MAX 10 ++ ++/* The original header never explicitly defines any values for these. 
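BASE_JD_REQ_ATOM_TYPE and BASE_JD_REQ_SOFT_JOB_TYPE above let a submitter classify a core_req word, and BASE_JD_REQ_SOFT_JOB_OR_DEP folds the two "no hardware needed" cases into one test. A small illustrative helper (the category names are ours, not the driver's):

    /* Classify a legacy core_req word using the masks defined above. */
    #include <stdint.h>

    #include "mali-ioctl.h"

    enum atom_kind { ATOM_DEP_ONLY, ATOM_SOFT, ATOM_HW };

    static enum atom_kind classify_core_req(mali_jd_core_req req)
    {
       if ((req & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP)
          return ATOM_DEP_ONLY;         /* nothing runs on the GPU */
       if (req & BASE_JD_REQ_SOFT_JOB)
          return ATOM_SOFT;             /* handled inside the kernel driver */
       return ATOM_HW;                  /* FS/CS/tiler/... hardware work */
    }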
In C, ++ * this -should- expand to SHARED == 0 and EXCLUSIVE == 1, so the only flag we ++ * actually need to decode here is EXCLUSIVE ++ */ ++enum mali_external_resource_access { ++ MALI_EXT_RES_ACCESS_SHARED, ++ MALI_EXT_RES_ACCESS_EXCLUSIVE, ++}; ++ ++/* An aligned address to the resource | mali_external_resource_access */ ++typedef u64 mali_external_resource; ++ ++struct base_jd_atom_v2 { ++ mali_ptr jc; /**< job-chain GPU address */ ++ struct mali_jd_udata udata; /**< user data */ ++ u64 extres_list; /**< list of external resources */ ++ u16 nr_extres; /**< nr of external resources */ ++ u16 compat_core_req; /**< core requirements which ++ correspond to the legacy support ++ for UK 10.2 */ ++ struct mali_jd_dependency pre_dep[2]; /**< pre-dependencies, one need to ++ use SETTER function to assign ++ this field, this is done in ++ order to reduce possibility of ++ improper assigment of a ++ dependency field */ ++ mali_atom_id atom_number; /**< unique number to identify the ++ atom */ ++ mali_jd_prio prio; /**< Atom priority. Refer to @ref ++ mali_jd_prio for more details */ ++ u8 device_nr; /**< coregroup when ++ BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ++ specified */ ++ u8 :8; ++ mali_jd_core_req core_req; /**< core requirements */ ++} __attribute__((packed)); ++ ++/** ++ * enum mali_error - Mali error codes shared with userspace ++ * ++ * This is subset of those common Mali errors that can be returned to userspace. ++ * Values of matching user and kernel space enumerators MUST be the same. ++ * MALI_ERROR_NONE is guaranteed to be 0. ++ * ++ * @MALI_ERROR_NONE: Success ++ * @MALI_ERROR_OUT_OF_GPU_MEMORY: Not used in the kernel driver ++ * @MALI_ERROR_OUT_OF_MEMORY: Memory allocation failure ++ * @MALI_ERROR_FUNCTION_FAILED: Generic error code ++ */ ++enum mali_error { ++ MALI_ERROR_NONE = 0, ++ MALI_ERROR_OUT_OF_GPU_MEMORY, ++ MALI_ERROR_OUT_OF_MEMORY, ++ MALI_ERROR_FUNCTION_FAILED, ++}; ++ ++/** ++ * Header used by all ioctls ++ */ ++union kbase_ioctl_header { ++#ifdef dvalin ++ u32 pad[0]; ++#else ++ /* [in] The ID of the UK function being called */ ++ u32 id :32; ++ /* [out] The return value of the UK function that was called */ ++ enum mali_error rc :32; ++ ++ u64 :64; ++#endif ++} __attribute__((packed)); ++ ++struct kbase_ioctl_get_version { ++ union kbase_ioctl_header header; ++ u16 major; /* [out] */ ++ u16 minor; /* [out] */ ++ u32 :32; ++} __attribute__((packed)); ++ ++struct mali_mem_import_user_buffer { ++ u64 ptr; ++ u64 length; ++}; ++ ++union kbase_ioctl_mem_import { ++ struct { ++ union kbase_ioctl_header header; ++ u64 phandle; ++ enum { ++ BASE_MEM_IMPORT_TYPE_INVALID = 0, ++ BASE_MEM_IMPORT_TYPE_UMP = 1, ++ BASE_MEM_IMPORT_TYPE_UMM = 2, ++ BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3, ++ } type :32; ++ u32 :32; ++ u64 flags; ++ } in; ++ struct { ++ union kbase_ioctl_header header; ++ u64 pad[2]; ++ u64 flags; ++ u64 gpu_va; ++ u64 va_pages; ++ } out; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_mem_commit { ++ union kbase_ioctl_header header; ++ /* [in] */ ++ mali_ptr gpu_addr; ++ u64 pages; ++ /* [out] */ ++ u32 result_subcode; ++ u32 :32; ++} __attribute__((packed)); ++ ++enum kbase_ioctl_mem_query_type { ++ BASE_MEM_QUERY_COMMIT_SIZE = 1, ++ BASE_MEM_QUERY_VA_SIZE = 2, ++ BASE_MEM_QUERY_FLAGS = 3 ++}; ++ ++struct kbase_ioctl_mem_query { ++ union kbase_ioctl_header header; ++ /* [in] */ ++ mali_ptr gpu_addr; ++ enum kbase_ioctl_mem_query_type query : 32; ++ u32 :32; ++ /* [out] */ ++ u64 value; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_mem_free { ++ union 
kbase_ioctl_header header; ++ mali_ptr gpu_addr; /* [in] */ ++} __attribute__((packed)); ++/* FIXME: Size unconfirmed (haven't seen in a trace yet) */ ++ ++struct kbase_ioctl_mem_flags_change { ++ union kbase_ioctl_header header; ++ /* [in] */ ++ mali_ptr gpu_va; ++ u64 flags; ++ u64 mask; ++} __attribute__((packed)); ++/* FIXME: Size unconfirmed (haven't seen in a trace yet) */ ++ ++struct kbase_ioctl_mem_alias { ++ union kbase_ioctl_header header; ++ /* [in/out] */ ++ u64 flags; ++ /* [in] */ ++ u64 stride; ++ u64 nents; ++ u64 ai; ++ /* [out] */ ++ mali_ptr gpu_va; ++ u64 va_pages; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_mem_sync { ++ union kbase_ioctl_header header; ++ mali_ptr handle; ++ u64 user_addr; ++ u64 size; ++ enum { ++ MALI_SYNC_TO_DEVICE = 1, ++ MALI_SYNC_TO_CPU = 2, ++ } type :8; ++ u64 :56; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_set_flags { ++ union kbase_ioctl_header header; ++ u32 create_flags; /* [in] */ ++ u32 :32; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_stream_create { ++ union kbase_ioctl_header header; ++ /* [in] */ ++ char name[32]; ++ /* [out] */ ++ s32 fd; ++ u32 :32; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_job_submit { ++ union kbase_ioctl_header header; ++ /* [in] */ ++ u64 addr; ++ u32 nr_atoms; ++ u32 stride; ++} __attribute__((packed)); ++ ++struct kbase_ioctl_get_context_id { ++ union kbase_ioctl_header header; ++ /* [out] */ ++ s64 id; ++} __attribute__((packed)); ++ ++#undef PAD_CPU_PTR ++ ++enum base_jd_event_code { ++ BASE_JD_EVENT_DONE = 1, ++}; ++ ++struct base_jd_event_v2 { ++ enum base_jd_event_code event_code; ++ mali_atom_id atom_number; ++ struct mali_jd_udata udata; ++}; ++ ++/* Defined in mali-props.h */ ++struct kbase_ioctl_gpu_props_reg_dump; ++ ++/* For ioctl's we haven't written decoding stuff for yet */ ++typedef struct { ++ union kbase_ioctl_header header; ++} __ioctl_placeholder; ++ ++#endif /* __KBASE_IOCTL_H__ */ +diff --git a/src/panfrost/base/include/old/mali-props.h b/src/panfrost/base/include/old/mali-props.h +new file mode 100644 +index 00000000000..5b9d8723600 +--- /dev/null ++++ b/src/panfrost/base/include/old/mali-props.h +@@ -0,0 +1,262 @@ ++/* ++ * © Copyright 2017-2018 The Panfrost Community ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * A copy of the licence is included with the program, and can also be obtained ++ * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ++ * Boston, MA 02110-1301, USA. ++ * ++ */ ++ ++#ifndef __MALI_PROPS_H__ ++#define __MALI_PROPS_H__ ++ ++#include "mali-ioctl.h" ++ ++#define MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS 3 ++#define MALI_GPU_MAX_JOB_SLOTS 16 ++#define MALI_MAX_COHERENT_GROUPS 16 ++ ++/* Capabilities of a job slot as reported by JS_FEATURES registers */ ++ ++#define JS_FEATURE_NULL_JOB (1u << 1) ++#define JS_FEATURE_SET_VALUE_JOB (1u << 2) ++#define JS_FEATURE_CACHE_FLUSH_JOB (1u << 3) ++#define JS_FEATURE_COMPUTE_JOB (1u << 4) ++#define JS_FEATURE_VERTEX_JOB (1u << 5) ++#define JS_FEATURE_GEOMETRY_JOB (1u << 6) ++#define JS_FEATURE_TILER_JOB (1u << 7) ++#define JS_FEATURE_FUSED_JOB (1u << 8) ++#define JS_FEATURE_FRAGMENT_JOB (1u << 9) ++ ++struct mali_gpu_core_props { ++ /** ++ * Product specific value. ++ */ ++ u32 product_id; ++ ++ /** ++ * Status of the GPU release. 
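With the legacy interface, atoms are packed into a user-space array and handed to the kernel through KBASE_IOCTL_JOB_SUBMIT together with the element stride. The sketch below submits a single dependency-only atom; it is illustrative only and assumes the context set-up ioctls (version check, set-flags) have already run on fd:

    /* Sketch of submitting one dependency-only atom over the legacy
     * job-submit ioctl declared in mali-ioctl-midgard.h. */
    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>

    #include "mali-ioctl.h"
    #include "mali-ioctl-midgard.h"

    static int submit_dep_only_atom(int fd, mali_atom_id id)
    {
       struct base_jd_atom_v2 atom;
       memset(&atom, 0, sizeof(atom));

       atom.atom_number = id;
       atom.prio = BASE_JD_PRIO_MEDIUM;
       atom.core_req = BASE_JD_REQ_DEP;   /* no hardware work, dependencies only */

       struct kbase_ioctl_job_submit submit = {
          .addr = (uintptr_t)&atom,
          .nr_atoms = 1,
          .stride = sizeof(atom),
       };
       return ioctl(fd, KBASE_IOCTL_JOB_SUBMIT, &submit);
    }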
++ * No defined values, but starts at 0 and increases by one for each ++ * release status (alpha, beta, EAC, etc.). ++ * 4 bit values (0-15). ++ */ ++ u16 version_status; ++ ++ /** ++ * Minor release number of the GPU. "P" part of an "RnPn" release ++ * number. ++ * 8 bit values (0-255). ++ */ ++ u16 minor_revision; ++ ++ /** ++ * Major release number of the GPU. "R" part of an "RnPn" release ++ * number. ++ * 4 bit values (0-15). ++ */ ++ u16 major_revision; ++ ++ u16 :16; ++ ++ /** ++ * @usecase GPU clock speed is not specified in the Midgard ++ * Architecture, but is necessary for OpenCL's clGetDeviceInfo() ++ * function. ++ */ ++ u32 gpu_speed_mhz; ++ ++ /** ++ * @usecase GPU clock max/min speed is required for computing ++ * best/worst case in tasks as job scheduling ant irq_throttling. (It ++ * is not specified in the Midgard Architecture). ++ */ ++ u32 gpu_freq_khz_max; ++ u32 gpu_freq_khz_min; ++ ++ /** ++ * Size of the shader program counter, in bits. ++ */ ++ u32 log2_program_counter_size; ++ ++ /** ++ * TEXTURE_FEATURES_x registers, as exposed by the GPU. This is a ++ * bitpattern where a set bit indicates that the format is supported. ++ * ++ * Before using a texture format, it is recommended that the ++ * corresponding bit be checked. ++ */ ++ u32 texture_features[MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ ++ /** ++ * Theoretical maximum memory available to the GPU. It is unlikely ++ * that a client will be able to allocate all of this memory for their ++ * own purposes, but this at least provides an upper bound on the ++ * memory available to the GPU. ++ * ++ * This is required for OpenCL's clGetDeviceInfo() call when ++ * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The ++ * client will not be expecting to allocate anywhere near this value. ++ */ ++ u64 gpu_available_memory_size; ++}; ++ ++struct mali_gpu_l2_cache_props { ++ u8 log2_line_size; ++ u8 log2_cache_size; ++ u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ ++ u64 :40; ++}; ++ ++struct mali_gpu_tiler_props { ++ u32 bin_size_bytes; /* Max is 4*2^15 */ ++ u32 max_active_levels; /* Max is 2^15 */ ++}; ++ ++struct mali_gpu_thread_props { ++ u32 max_threads; /* Max. number of threads per core */ ++ u32 max_workgroup_size; /* Max. number of threads per workgroup */ ++ u32 max_barrier_size; /* Max. number of threads that can ++ synchronize on a simple barrier */ ++ u16 max_registers; /* Total size [1..65535] of the register ++ file available per core. */ ++ u8 max_task_queue; /* Max. tasks [1..255] which may be sent ++ to a core before it becomes blocked. */ ++ u8 max_thread_group_split; /* Max. allowed value [1..15] of the ++ Thread Group Split field. */ ++ enum { ++ MALI_GPU_IMPLEMENTATION_UNKNOWN = 0, ++ MALI_GPU_IMPLEMENTATION_SILICON = 1, ++ MALI_GPU_IMPLEMENTATION_FPGA = 2, ++ MALI_GPU_IMPLEMENTATION_SW = 3, ++ } impl_tech :8; ++ u64 :56; ++}; ++ ++/** ++ * @brief descriptor for a coherent group ++ * ++ * \c core_mask exposes all cores in that coherent group, and \c num_cores ++ * provides a cached population-count for that mask. ++ * ++ * @note Whilst all cores are exposed in the mask, not all may be available to ++ * the application, depending on the Kernel Power policy. ++ * ++ * @note if u64s must be 8-byte aligned, then this structure has 32-bits of ++ * wastage. 
++ */ ++struct mali_ioctl_gpu_coherent_group { ++ u64 core_mask; /**< Core restriction mask required for the ++ group */ ++ u16 num_cores; /**< Number of cores in the group */ ++ u64 :48; ++}; ++ ++/** ++ * @brief Coherency group information ++ * ++ * Note that the sizes of the members could be reduced. However, the \c group ++ * member might be 8-byte aligned to ensure the u64 core_mask is 8-byte ++ * aligned, thus leading to wastage if the other members sizes were reduced. ++ * ++ * The groups are sorted by core mask. The core masks are non-repeating and do ++ * not intersect. ++ */ ++struct mali_gpu_coherent_group_info { ++ u32 num_groups; ++ ++ /** ++ * Number of core groups (coherent or not) in the GPU. Equivalent to ++ * the number of L2 Caches. ++ * ++ * The GPU Counter dumping writes 2048 bytes per core group, ++ * regardless of whether the core groups are coherent or not. Hence ++ * this member is needed to calculate how much memory is required for ++ * dumping. ++ * ++ * @note Do not use it to work out how many valid elements are in the ++ * group[] member. Use num_groups instead. ++ */ ++ u32 num_core_groups; ++ ++ /** ++ * Coherency features of the memory, accessed by @ref gpu_mem_features ++ * methods ++ */ ++ u32 coherency; ++ ++ u32 :32; ++ ++ /** ++ * Descriptors of coherent groups ++ */ ++ struct mali_ioctl_gpu_coherent_group group[MALI_MAX_COHERENT_GROUPS]; ++}; ++ ++/** ++ * A complete description of the GPU's Hardware Configuration Discovery ++ * registers. ++ * ++ * The information is presented inefficiently for access. For frequent access, ++ * the values should be better expressed in an unpacked form in the ++ * base_gpu_props structure. ++ * ++ * @usecase The raw properties in @ref gpu_raw_gpu_props are necessary to ++ * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device ++ * behaving differently?". In this case, all information about the ++ * configuration is potentially useful, but it does not need to be processed ++ * by the driver. Instead, the raw registers can be processed by the Mali ++ * Tools software on the host PC. ++ * ++ */ ++struct mali_gpu_raw_props { ++ u64 shader_present; ++ u64 tiler_present; ++ u64 l2_present; ++ u64 stack_present; ++ ++ u32 l2_features; ++ u32 suspend_size; /* API 8.2+ */ ++ u32 mem_features; ++ u32 mmu_features; ++ ++ u32 as_present; ++ ++ u32 js_present; ++ u32 js_features[MALI_GPU_MAX_JOB_SLOTS]; ++ u32 tiler_features; ++ u32 texture_features[3]; ++ ++ u32 gpu_id; ++ ++ u32 thread_max_threads; ++ u32 thread_max_workgroup_size; ++ u32 thread_max_barrier_size; ++ u32 thread_features; ++ ++ /* ++ * Note: This is the _selected_ coherency mode rather than the ++ * available modes as exposed in the coherency_features register. 
++ */ ++ u32 coherency_mode; ++}; ++ ++struct kbase_ioctl_gpu_props_reg_dump { ++ union kbase_ioctl_header header; ++ struct mali_gpu_core_props core; ++ struct mali_gpu_l2_cache_props l2; ++ u64 :64; ++ struct mali_gpu_tiler_props tiler; ++ struct mali_gpu_thread_props thread; ++ ++ struct mali_gpu_raw_props raw; ++ ++ /** This must be last member of the structure */ ++ struct mali_gpu_coherent_group_info coherency_info; ++} __attribute__((packed)); ++ ++#endif +diff --git a/src/panfrost/base/meson.build b/src/panfrost/base/meson.build +new file mode 100644 +index 00000000000..5d7b9f1dff9 +--- /dev/null ++++ b/src/panfrost/base/meson.build +@@ -0,0 +1,55 @@ ++# Copyright © 2018 Rob Clark ++# Copyright © 2019 Collabora ++# Copyright © 2022 Icecream95 ++ ++# Permission is hereby granted, free of charge, to any person obtaining a copy ++# of this software and associated documentation files (the "Software"), to deal ++# in the Software without restriction, including without limitation the rights ++# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++# copies of the Software, and to permit persons to whom the Software is ++# furnished to do so, subject to the following conditions: ++ ++# The above copyright notice and this permission notice shall be included in ++# all copies or substantial portions of the Software. ++ ++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++# SOFTWARE. 
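The legacy property query above returns everything in one packed blob: zero the struct, issue KBASE_IOCTL_GPU_PROPS_REG_DUMP, and read the sub-structures directly. A hedged sketch that prints a few fields, assuming an already-initialised legacy kbase fd:

    /* Illustrative dump of a few legacy GPU properties. */
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    #include "mali-ioctl.h"
    #include "mali-ioctl-midgard.h"
    #include "mali-props.h"

    static void print_gpu_props(int fd)
    {
       struct kbase_ioctl_gpu_props_reg_dump dump = { 0 };

       if (ioctl(fd, KBASE_IOCTL_GPU_PROPS_REG_DUMP, &dump) < 0)
          return;

       printf("gpu_id: 0x%x, r%up%u\n", dump.raw.gpu_id,
              dump.core.major_revision, dump.core.minor_revision);
       printf("L2 slices: %u, line size: %u bytes\n",
              dump.l2.num_l2_slices, 1u << dump.l2.log2_line_size);
       printf("coherent groups: %u\n", dump.coherency_info.num_groups);
    }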
++ ++libpanfrost_base_versions = ['0', '1', '2', '258'] ++libpanfrost_base_per_arch = [] ++ ++foreach ver : libpanfrost_base_versions ++ libpanfrost_base_per_arch += static_library( ++ 'pan-base-v' + ver, ++ 'pan_vX_base.c', ++ include_directories : [ ++ inc_src, inc_include, inc_gallium, inc_mesa, inc_gallium_aux, ++ include_directories('include'), ++ ], ++ c_args : ['-DPAN_BASE_VER=' + ver], ++ gnu_symbol_visibility : 'hidden', ++ dependencies: [dep_valgrind], ++) ++endforeach ++ ++libpanfrost_base = static_library( ++ 'panfrost_base', ++ 'pan_base.c', ++ include_directories : [ ++ inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw, ++ include_directories('include'), ++ ], ++ gnu_symbol_visibility : 'hidden', ++ build_by_default : false, ++ link_with: [libpanfrost_base_per_arch], ++) ++ ++libpanfrost_base_dep = declare_dependency( ++ link_with: [libpanfrost_base_per_arch, libpanfrost_base], ++ include_directories: [include_directories('.')], ++) +diff --git a/src/panfrost/base/pan_base.c b/src/panfrost/base/pan_base.c +new file mode 100644 +index 00000000000..22dc09cfb52 +--- /dev/null ++++ b/src/panfrost/base/pan_base.c +@@ -0,0 +1,301 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "util/macros.h" ++#include "pan_base.h" ++ ++#include "mali_kbase_ioctl.h" ++ ++bool ++kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose) ++{ ++ *k = (struct kbase_) {0}; ++ k->fd = fd; ++ k->cs_queue_count = cs_queue_count; ++ k->page_size = sysconf(_SC_PAGE_SIZE); ++ k->verbose = verbose; ++ ++ if (k->fd == -1) ++ return kbase_open_csf_noop(k); ++ ++ struct kbase_ioctl_version_check ver = { 0 }; ++ ++ if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK_RESERVED, &ver) == 0) { ++ return kbase_open_csf(k); ++ } else if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK, &ver) == 0) { ++ if (ver.major == 3) ++ return kbase_open_old(k); ++ else ++ return kbase_open_new(k); ++ } ++ ++ return false; ++} ++ ++/* If fd != -1, ownership is passed in */ ++int ++kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd) ++{ ++ kbase_handle h = { ++ .va = va, ++ .fd = fd ++ }; ++ ++ unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); ++ ++ kbase_handle *handles = util_dynarray_begin(&k->gem_handles); ++ ++ for (unsigned i = 0; i < size; ++i) { ++ if (handles[i].fd == -2) { ++ handles[i] = h; ++ return i; ++ } ++ } ++ ++ util_dynarray_append(&k->gem_handles, kbase_handle, h); ++ ++ return size; ++} ++ ++int ++kbase_alloc_gem_handle(kbase k, base_va va, int fd) ++{ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ int ret = kbase_alloc_gem_handle_locked(k, va, fd); ++ ++ pthread_mutex_unlock(&k->handle_lock); ++ ++ return ret; ++} ++ ++void ++kbase_free_gem_handle(kbase k, int handle) ++{ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); ++ ++ int fd; ++ ++ if (handle >= size) { ++ pthread_mutex_unlock(&k->handle_lock); ++ return; ++ } ++ ++ if (handle + 1 < size) { ++ kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); ++ fd = ptr->fd; ++ ptr->fd = -2; ++ } else { ++ fd = (util_dynarray_pop(&k->gem_handles, kbase_handle)).fd; ++ } ++ ++ if (fd != -1) ++ close(fd); ++ ++ pthread_mutex_unlock(&k->handle_lock); ++} ++ ++kbase_handle ++kbase_gem_handle_get(kbase k, int handle) ++{ ++ kbase_handle h = { .fd = -1 }; ++ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); ++ ++ if (handle < size) ++ h = *util_dynarray_element(&k->gem_handles, kbase_handle, handle); ++ ++ pthread_mutex_unlock(&k->handle_lock); ++ ++ return h; ++} ++ ++int ++kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers) ++{ ++ struct kbase_wait_ctx wait = kbase_wait_init(k, timeout_ns); ++ ++ while (kbase_wait_for_event(&wait)) { ++ pthread_mutex_lock(&k->handle_lock); ++ if (handle >= util_dynarray_num_elements(&k->gem_handles, kbase_handle)) { ++ pthread_mutex_unlock(&k->handle_lock); ++ kbase_wait_fini(wait); ++ errno = EINVAL; ++ return -1; ++ } ++ kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); ++ if (!ptr->use_count) { ++ pthread_mutex_unlock(&k->handle_lock); ++ kbase_wait_fini(wait); ++ return 0; ++ } ++ pthread_mutex_unlock(&k->handle_lock); ++ } ++ ++ kbase_wait_fini(wait); ++ errno = ETIMEDOUT; ++ return -1; ++} ++ ++static void ++adjust_time(struct timespec *tp, int64_t ns) ++{ ++ ns += tp->tv_nsec; ++ tp->tv_nsec = ns % 1000000000; ++ tp->tv_sec += ns / 1000000000; ++} ++ ++static int64_t ++ns_until(struct timespec 
tp) ++{ ++ struct timespec now; ++ clock_gettime(CLOCK_MONOTONIC, &now); ++ ++ int64_t sec = (tp.tv_sec - now.tv_sec) * 1000000000; ++ int64_t ns = tp.tv_nsec - now.tv_nsec; ++ ++ /* Clamp the value to zero to avoid errors from ppoll */ ++ return MAX2(sec + ns, 0); ++} ++ ++static void ++kbase_wait_signal(kbase k) ++{ ++ /* We must acquire the event condition lock, otherwise another ++ * thread could be between the trylock and the cond_wait, and ++ * not notice the broadcast. */ ++ pthread_mutex_lock(&k->event_cnd_lock); ++ pthread_cond_broadcast(&k->event_cnd); ++ pthread_mutex_unlock(&k->event_cnd_lock); ++} ++ ++struct kbase_wait_ctx ++kbase_wait_init(kbase k, int64_t timeout_ns) ++{ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC, &tp); ++ ++ adjust_time(&tp, timeout_ns); ++ ++ return (struct kbase_wait_ctx) { ++ .k = k, ++ .until = tp, ++ }; ++} ++ ++bool ++kbase_wait_for_event(struct kbase_wait_ctx *ctx) ++{ ++ kbase k = ctx->k; ++ ++ /* Return instantly the first time so that a check outside the ++ * wait_for_Event loop is not required */ ++ if (!ctx->has_cnd_lock) { ++ pthread_mutex_lock(&k->event_cnd_lock); ++ ctx->has_cnd_lock = true; ++ return true; ++ } ++ ++ if (!ctx->has_lock) { ++ if (pthread_mutex_trylock(&k->event_read_lock) == 0) { ++ ctx->has_lock = true; ++ pthread_mutex_unlock(&k->event_cnd_lock); ++ } else { ++ int ret = pthread_cond_timedwait(&k->event_cnd, ++ &k->event_cnd_lock, &ctx->until); ++ return ret != ETIMEDOUT; ++ } ++ } ++ ++ bool event = k->poll_event(k, ns_until(ctx->until)); ++ k->handle_events(k); ++ kbase_wait_signal(k); ++ return event; ++} ++ ++void ++kbase_wait_fini(struct kbase_wait_ctx ctx) ++{ ++ kbase k = ctx.k; ++ ++ if (ctx.has_lock) { ++ pthread_mutex_unlock(&k->event_read_lock); ++ kbase_wait_signal(k); ++ } else if (ctx.has_cnd_lock) { ++ pthread_mutex_unlock(&k->event_cnd_lock); ++ } ++} ++ ++void ++kbase_ensure_handle_events(kbase k) ++{ ++ /* If we don't manage to take the lock, then events have recently/will ++ * soon be handled, there is no need to do anything. */ ++ if (pthread_mutex_trylock(&k->event_read_lock) == 0) { ++ k->handle_events(k); ++ pthread_mutex_unlock(&k->event_read_lock); ++ kbase_wait_signal(k); ++ } ++} ++ ++bool ++kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp) ++{ ++ struct pollfd pfd = { ++ .fd = fd, ++ .events = wait_shared ? 
POLLOUT : POLLIN, ++ }; ++ ++ uint64_t timeout = ns_until(tp); ++ ++ struct timespec t = { ++ .tv_sec = timeout / 1000000000, ++ .tv_nsec = timeout % 1000000000, ++ }; ++ ++ int ret = ppoll(&pfd, 1, &t, NULL); ++ ++ if (ret == -1 && errno != EINTR) ++ perror("kbase_poll_fd_until"); ++ ++ return ret != 0; ++} +diff --git a/src/panfrost/base/pan_base.h b/src/panfrost/base/pan_base.h +new file mode 100644 +index 00000000000..878f7468433 +--- /dev/null ++++ b/src/panfrost/base/pan_base.h +@@ -0,0 +1,234 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++/* Library for interfacing with kbase */ ++#ifndef PAN_BASE_H ++#define PAN_BASE_H ++ ++#include "util/u_dynarray.h" ++#include "util/list.h" ++ ++#define PAN_EVENT_SIZE 16 ++ ++typedef uint64_t base_va; ++struct base_ptr { ++ void *cpu; ++ base_va gpu; ++}; ++ ++struct kbase_syncobj; ++ ++/* The job is done when the queue seqnum > seqnum */ ++struct kbase_sync_link { ++ struct kbase_sync_link *next; /* must be first */ ++ uint64_t seqnum; ++ void (*callback)(void *); ++ void *data; ++}; ++ ++struct kbase_event_slot { ++ struct kbase_sync_link *syncobjs; ++ struct kbase_sync_link **back; ++ uint64_t last_submit; ++ uint64_t last; ++}; ++ ++struct kbase_context { ++ uint8_t csg_handle; ++ uint8_t kcpu_queue; ++ bool kcpu_init; // TODO: Always create a queue? ++ uint32_t csg_uid; ++ unsigned num_csi; ++ ++ unsigned tiler_heap_chunk_size; ++ base_va tiler_heap_va; ++ base_va tiler_heap_header; ++}; ++ ++struct kbase_cs { ++ struct kbase_context *ctx; ++ void *user_io; ++ base_va va; ++ unsigned size; ++ unsigned event_mem_offset; ++ unsigned csi; ++ ++ uint64_t last_insert; ++ ++ // TODO: This is only here because it's convenient for emit_csf_queue ++ uint32_t *latest_flush; ++}; ++ ++#define KBASE_SLOT_COUNT 2 ++ ++typedef struct { ++ base_va va; ++ int fd; ++ uint8_t use_count; ++ /* For emulating implicit sync. TODO make this work on v10 */ ++ uint8_t last_access[KBASE_SLOT_COUNT]; ++} kbase_handle; ++ ++struct kbase_; ++typedef struct kbase_ *kbase; ++ ++struct kbase_ { ++ unsigned setup_state; ++ bool verbose; ++ ++ int fd; ++ unsigned api; ++ unsigned page_size; ++ // TODO: Actually we may want to try to pack multiple contexts / queue ++ // "sets" into a single group... 
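++   /* Number of command-stream queues requested per CSF queue group:
++    * cs_group_create() forwards this value as cs_min, so each context's
++    * queue group is created with this many command streams available for
++    * kbase_cs_bind() to bind queues to. */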
++ unsigned cs_queue_count; ++ ++ /* Must not hold handle_lock while acquiring event_read_lock */ ++ pthread_mutex_t handle_lock; ++ pthread_mutex_t event_read_lock; ++ pthread_mutex_t event_cnd_lock; ++ pthread_cond_t event_cnd; ++ /* TODO: Per-context/queue locks? */ ++ pthread_mutex_t queue_lock; ++ ++ struct list_head syncobjs; ++ ++ unsigned gpuprops_size; ++ void *gpuprops; ++ ++ void *tracking_region; ++ void *csf_user_reg; ++ struct base_ptr event_mem; ++ struct base_ptr kcpu_event_mem; ++ // TODO: dynamically size ++ struct kbase_event_slot event_slots[256]; ++ // TODO: USe a bitset? ++ unsigned event_slot_usage; ++ ++ uint8_t atom_number; ++ ++ struct util_dynarray gem_handles; ++ struct util_dynarray atom_bos[256]; ++ uint64_t job_seq; ++ ++ void (*close)(kbase k); ++ ++ bool (*get_pan_gpuprop)(kbase k, unsigned name, uint64_t *value); ++ bool (*get_mali_gpuprop)(kbase k, unsigned name, uint64_t *value); ++ ++ struct base_ptr (*alloc)(kbase k, size_t size, ++ unsigned pan_flags, ++ unsigned mali_flags); ++ void (*free)(kbase k, base_va va); ++ ++ int (*import_dmabuf)(kbase k, int fd); ++ void *(*mmap_import)(kbase k, base_va va, size_t size); ++ ++ void (*cache_clean)(void *ptr, size_t size); ++ void (*cache_invalidate)(void *ptr, size_t size); ++ ++ /* Returns false on timeout */ ++ bool (*poll_event)(kbase k, int64_t timeout_ns); ++ bool (*handle_events)(kbase k); ++ ++ /* <= v9 GPUs */ ++ int (*submit)(kbase k, uint64_t va, unsigned req, ++ struct kbase_syncobj *o, ++ int32_t *handles, unsigned num_handles); ++ ++ /* >= v10 GPUs */ ++ struct kbase_context *(*context_create)(kbase k); ++ void (*context_destroy)(kbase k, struct kbase_context *ctx); ++ bool (*context_recreate)(kbase k, struct kbase_context *ctx); ++ ++ // TODO: Pass in a priority? ++ struct kbase_cs (*cs_bind)(kbase k, struct kbase_context *ctx, ++ base_va va, unsigned size); ++ void (*cs_term)(kbase k, struct kbase_cs *cs); ++ void (*cs_rebind)(kbase k, struct kbase_cs *cs); ++ ++ bool (*cs_submit)(kbase k, struct kbase_cs *cs, uint64_t insert_offset, ++ struct kbase_syncobj *o, uint64_t seqnum); ++ bool (*cs_wait)(kbase k, struct kbase_cs *cs, uint64_t extract_offset, ++ struct kbase_syncobj *o); ++ ++ int (*kcpu_fence_export)(kbase k, struct kbase_context *ctx); ++ bool (*kcpu_fence_import)(kbase k, struct kbase_context *ctx, int fd); ++ ++ bool (*kcpu_cqs_set)(kbase k, struct kbase_context *ctx, ++ base_va addr, uint64_t value); ++ bool (*kcpu_cqs_wait)(kbase k, struct kbase_context *ctx, ++ base_va addr, uint64_t value); ++ ++ /* syncobj functions */ ++ struct kbase_syncobj *(*syncobj_create)(kbase k); ++ void (*syncobj_destroy)(kbase k, struct kbase_syncobj *o); ++ struct kbase_syncobj *(*syncobj_dup)(kbase k, struct kbase_syncobj *o); ++ /* TODO: timeout? 
(and for cs_wait) */ ++ bool (*syncobj_wait)(kbase k, struct kbase_syncobj *o); ++ ++ /* Returns false if there are no active queues */ ++ bool (*callback_all_queues)(kbase k, int32_t *count, ++ void (*callback)(void *), void *data); ++ ++ void (*mem_sync)(kbase k, base_va gpu, void *cpu, size_t size, ++ bool invalidate); ++}; ++ ++bool kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose); ++ ++/* Called from kbase_open */ ++bool kbase_open_old(kbase k); ++bool kbase_open_new(kbase k); ++bool kbase_open_csf(kbase k); ++bool kbase_open_csf_noop(kbase k); ++ ++/* BO management */ ++int kbase_alloc_gem_handle(kbase k, base_va va, int fd); ++int kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd); ++void kbase_free_gem_handle(kbase k, int handle); ++kbase_handle kbase_gem_handle_get(kbase k, int handle); ++int kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers); ++ ++/* Event waiting */ ++struct kbase_wait_ctx { ++ kbase k; ++ struct timespec until; ++ bool has_lock; ++ bool has_cnd_lock; ++}; ++ ++struct kbase_wait_ctx kbase_wait_init(kbase k, int64_t timeout_ns); ++/* Returns false on timeout, kbase_wait_fini must still be called */ ++bool kbase_wait_for_event(struct kbase_wait_ctx *ctx); ++void kbase_wait_fini(struct kbase_wait_ctx ctx); ++ ++void kbase_ensure_handle_events(kbase k); ++ ++bool kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp); ++ ++/* Must not conflict with PANFROST_BO_* flags */ ++#define MALI_BO_CACHED_CPU (1 << 16) ++#define MALI_BO_UNCACHED_GPU (1 << 17) ++ ++#endif +diff --git a/src/panfrost/base/pan_base_noop.h b/src/panfrost/base/pan_base_noop.h +new file mode 100644 +index 00000000000..750a445a995 +--- /dev/null ++++ b/src/panfrost/base/pan_base_noop.h +@@ -0,0 +1,152 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#ifndef PAN_BASE_NOOP_H ++#define PAN_BASE_NOOP_H ++ ++/* For Mali-G610 as used in RK3588 */ ++#define PROP(name, value) ((name << 2) | 2), value ++static const uint32_t gpu_props[] = { ++ PROP(KBASE_GPUPROP_RAW_GPU_ID, 0xa8670000), ++ PROP(KBASE_GPUPROP_PRODUCT_ID, 0xa867), ++ PROP(KBASE_GPUPROP_RAW_SHADER_PRESENT, 0x50005), ++ PROP(KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, 0xc1ffff9e), ++ PROP(KBASE_GPUPROP_TLS_ALLOC, 0x800), ++ PROP(KBASE_GPUPROP_RAW_TILER_FEATURES, 0x809), ++}; ++#undef PROP ++ ++#define NOOP_COOKIE_ALLOC 0x41000 ++#define NOOP_COOKIE_USER_IO 0x42000 ++#define NOOP_COOKIE_MEM_ALLOC 0x43000 ++ ++static int ++kbase_ioctl(int fd, unsigned long request, ...) ++{ ++ int ret = 0; ++ ++ va_list args; ++ ++ va_start(args, request); ++ void *ptr = va_arg(args, void *); ++ va_end(args); ++ ++ switch (request) { ++ case KBASE_IOCTL_GET_GPUPROPS: { ++ struct kbase_ioctl_get_gpuprops *props = ptr; ++ ++ if (props->size) ++ memcpy((void *)(uintptr_t) props->buffer, ++ gpu_props, MIN2(props->size, sizeof(gpu_props))); ++ ++ ret = sizeof(gpu_props); ++ break; ++ } ++ ++ case KBASE_IOCTL_MEM_ALLOC: { ++ union kbase_ioctl_mem_alloc *alloc = ptr; ++ ++ alloc->out.gpu_va = NOOP_COOKIE_ALLOC; ++ alloc->out.flags = BASE_MEM_SAME_VA; ++ break; ++ } ++ ++ case KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6: { ++ union kbase_ioctl_cs_queue_group_create_1_6 *create = ptr; ++ ++ // TODO: Don't return duplicates? ++ create->out.group_handle = 0; ++ create->out.group_uid = 1; ++ break; ++ } ++ ++ case KBASE_IOCTL_CS_TILER_HEAP_INIT: { ++ union kbase_ioctl_cs_tiler_heap_init *init = ptr; ++ ++ /* The values don't really matter, the CPU has no business in accessing ++ * these. */ ++ init->out.gpu_heap_va = 0x60000; ++ init->out.first_chunk_va = 0x61000; ++ break; ++ } ++ ++ case KBASE_IOCTL_CS_QUEUE_BIND: { ++ union kbase_ioctl_cs_queue_bind *bind = ptr; ++ bind->out.mmap_handle = NOOP_COOKIE_USER_IO; ++ break; ++ } ++ ++ case KBASE_IOCTL_MEM_IMPORT: { ++ union kbase_ioctl_mem_import *import = ptr; ++ ++ if (import->in.type != BASE_MEM_IMPORT_TYPE_UMM) { ++ ret = -1; ++ errno = EINVAL; ++ break; ++ } ++ ++ int *fd = (int *)(uintptr_t) import->in.phandle; ++ ++ off_t size = lseek(*fd, 0, SEEK_END); ++ ++ import->out.flags = BASE_MEM_NEED_MMAP; ++ import->out.gpu_va = NOOP_COOKIE_MEM_ALLOC; ++ import->out.va_pages = DIV_ROUND_UP(size, 4096); ++ } ++ ++ case KBASE_IOCTL_SET_FLAGS: ++ case KBASE_IOCTL_MEM_EXEC_INIT: ++ case KBASE_IOCTL_MEM_JIT_INIT: ++ case KBASE_IOCTL_CS_QUEUE_REGISTER: ++ case KBASE_IOCTL_CS_QUEUE_KICK: ++ case KBASE_IOCTL_CS_TILER_HEAP_TERM: ++ case KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE: ++ case KBASE_IOCTL_MEM_SYNC: ++ break; ++ ++ default: ++ ret = -1; ++ errno = ENOSYS; ++ } ++ ++ return ret; ++} ++ ++static void * ++kbase_mmap(void *addr, size_t length, int prot, int flags, ++ int fd, off_t offset) ++{ ++ switch (offset) { ++ case BASE_MEM_MAP_TRACKING_HANDLE: ++ case BASEP_MEM_CSF_USER_REG_PAGE_HANDLE: ++ case NOOP_COOKIE_ALLOC: ++ case NOOP_COOKIE_USER_IO: ++ case NOOP_COOKIE_MEM_ALLOC: ++ return mmap(NULL, length, prot, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ ++ default: ++ errno = ENOSYS; ++ return MAP_FAILED; ++ } ++} ++#endif +diff --git a/src/panfrost/base/pan_cache.h b/src/panfrost/base/pan_cache.h +new file mode 100644 +index 00000000000..ad5af0c7098 +--- /dev/null ++++ b/src/panfrost/base/pan_cache.h +@@ -0,0 +1,95 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and 
associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef PAN_CACHE_H ++#define PAN_CACHE_H ++ ++#ifdef __aarch64__ ++ ++static void ++cache_clean(volatile void *addr) ++{ ++ __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); ++} ++ ++static void ++cache_invalidate(volatile void *addr) ++{ ++ __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); ++} ++ ++typedef void (*cacheline_op)(volatile void *addr); ++ ++#define CACHELINE_SIZE 64 ++ ++static void ++cacheline_op_range(volatile void *start, size_t length, cacheline_op op) ++{ ++ volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); ++ volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); ++ for (; ptr < end; ptr += CACHELINE_SIZE) ++ op(ptr); ++} ++ ++static void ++cache_clean_range(volatile void *start, size_t length) ++{ ++ /* TODO: Do an invalidate at the start of the range? */ ++ cacheline_op_range(start, length, cache_clean); ++} ++ ++static void ++cache_invalidate_range(volatile void *start, size_t length) ++{ ++ cacheline_op_range(start, length, cache_invalidate); ++} ++ ++#endif /* __aarch64__ */ ++ ++/* The #ifdef covers both 32-bit and 64-bit ARM */ ++#ifdef __ARM_ARCH ++static void ++cache_barrier(void) ++{ ++ __asm__ volatile ("dsb sy" ::: "memory"); ++} ++ ++static void ++memory_barrier(void) ++{ ++ __asm__ volatile ("dmb sy" ::: "memory"); ++} ++#else ++ ++/* TODO: How to do cache barriers when emulated? */ ++static void ++cache_barrier(void) ++{ ++} ++ ++static void ++memory_barrier(void) ++{ ++} ++#endif ++#endif +diff --git a/src/panfrost/base/pan_vX_base.c b/src/panfrost/base/pan_vX_base.c +new file mode 100644 +index 00000000000..99bd356c536 +--- /dev/null ++++ b/src/panfrost/base/pan_vX_base.c +@@ -0,0 +1,1825 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef HAVE_VALGRIND ++#include ++#else ++#define RUNNING_ON_VALGRIND 0 ++#endif ++ ++#include "util/macros.h" ++#include "util/list.h" ++#include "util/u_atomic.h" ++#include "util/os_file.h" ++ ++#include "pan_base.h" ++#include "pan_cache.h" ++ ++#include "drm-uapi/panfrost_drm.h" ++ ++#define PAN_BASE_API (PAN_BASE_VER & 0xff) ++#if (PAN_BASE_VER & 0x100) == 0x100 ++#define PAN_BASE_NOOP ++#endif ++ ++#if PAN_BASE_API >= 2 ++#include "csf/mali_gpu_csf_registers.h" ++ ++#define MALI_USE_CSF 1 ++#endif ++ ++#include "mali_kbase_gpuprops.h" ++ ++#ifndef PAN_BASE_NOOP ++#define kbase_mmap mmap ++#endif ++ ++#if PAN_BASE_API >= 1 ++#include "mali_base_kernel.h" ++#include "mali_kbase_ioctl.h" ++ ++#ifdef PAN_BASE_NOOP ++#include "pan_base_noop.h" ++#else ++#define kbase_ioctl ioctl ++#endif ++#else ++ ++#include "old/mali-ioctl.h" ++#include "old/mali-ioctl-midgard.h" ++#include "old/mali-props.h" ++#endif ++ ++#define LOG(fmt, ...) do { \ ++ if (k->verbose) { \ ++ struct timespec tp; \ ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); \ ++ printf("%"PRIu64".%09li\t" fmt, (uint64_t) tp.tv_sec, tp.tv_nsec __VA_OPT__(,) __VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#if PAN_BASE_API == 0 ++static int ++kbase_ioctl(int fd, unsigned long request, ...) 
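++/* Old-style kbase ioctls (PAN_BASE_API == 0) report their status in the
++ * first word of the argument struct rather than through the ioctl return
++ * value: this wrapper fills that word with an id derived from the request
++ * number and maps the returned MALI_ERROR_* code onto errno. */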
++{ ++ int ioc_size = _IOC_SIZE(request); ++ ++ assert(ioc_size); ++ ++ va_list args; ++ ++ va_start(args, request); ++ int *ptr = va_arg(args, void *); ++ va_end(args); ++ ++ *ptr = (_IOC_TYPE(request) - 0x80) * 256 + _IOC_NR(request); ++ ++ int ret = ioctl(fd, request, ptr); ++ if (ret) ++ return ret; ++ ++ int r = *ptr; ++ switch (r) { ++ case MALI_ERROR_OUT_OF_GPU_MEMORY: ++ errno = ENOSPC; ++ return -1; ++ case MALI_ERROR_OUT_OF_MEMORY: ++ errno = ENOMEM; ++ return -1; ++ case MALI_ERROR_FUNCTION_FAILED: ++ errno = EINVAL; ++ return -1; ++ default: ++ return 0; ++ } ++} ++#endif ++ ++#if PAN_BASE_API >= 1 ++static bool ++kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) ++{ ++ int i = 0; ++ uint64_t x = 0; ++ while (i < k->gpuprops_size) { ++ x = 0; ++ memcpy(&x, k->gpuprops + i, 4); ++ i += 4; ++ ++ int size = 1 << (x & 3); ++ int this_name = x >> 2; ++ ++ x = 0; ++ memcpy(&x, k->gpuprops + i, size); ++ i += size; ++ ++ if (this_name == name) { ++ *value = x; ++ return true; ++ } ++ } ++ ++ return false; ++} ++#else ++static bool ++kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) ++{ ++ struct kbase_ioctl_gpu_props_reg_dump *props = k->gpuprops; ++ ++ switch (name) { ++ case KBASE_GPUPROP_PRODUCT_ID: ++ *value = props->core.product_id; ++ return true; ++ case KBASE_GPUPROP_RAW_SHADER_PRESENT: ++ *value = props->raw.shader_present; ++ return true; ++ case KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0: ++ *value = props->raw.texture_features[0]; ++ return true; ++ case KBASE_GPUPROP_RAW_TILER_FEATURES: ++ *value = props->raw.tiler_features; ++ return true; ++ case KBASE_GPUPROP_RAW_GPU_ID: ++ *value = props->raw.gpu_id; ++ return true; ++ default: ++ return false; ++ } ++} ++#endif ++ ++static bool ++alloc_handles(kbase k) ++{ ++ util_dynarray_init(&k->gem_handles, NULL); ++ return true; ++} ++ ++static bool ++free_handles(kbase k) ++{ ++ util_dynarray_fini(&k->gem_handles); ++ return true; ++} ++ ++static bool ++set_flags(kbase k) ++{ ++ struct kbase_ioctl_set_flags flags = { ++ .create_flags = 0 ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_SET_FLAGS, &flags); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++mmap_tracking(kbase k) ++{ ++ k->tracking_region = kbase_mmap(NULL, k->page_size, PROT_NONE, ++ MAP_SHARED, k->fd, ++ BASE_MEM_MAP_TRACKING_HANDLE); ++ ++ if (k->tracking_region == MAP_FAILED) { ++ perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); ++ k->tracking_region = NULL; ++ return false; ++ } ++ return true; ++} ++ ++static bool ++munmap_tracking(kbase k) ++{ ++ if (k->tracking_region) ++ return munmap(k->tracking_region, k->page_size) == 0; ++ return true; ++} ++ ++#if PAN_BASE_API >= 1 ++static bool ++get_gpuprops(kbase k) ++{ ++ struct kbase_ioctl_get_gpuprops props = { 0 }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); ++ return false; ++ } else if (!ret) { ++ fprintf(stderr, "GET_GPUPROPS returned zero size\n"); ++ return false; ++ } ++ ++ k->gpuprops_size = ret; ++ k->gpuprops = calloc(k->gpuprops_size, 1); ++ ++ props.size = k->gpuprops_size; ++ props.buffer = (uint64_t)(uintptr_t) k->gpuprops; ++ ++ ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); ++ return false; ++ } ++ ++ return true; ++} ++#else ++static bool ++get_gpuprops(kbase k) ++{ ++ k->gpuprops = calloc(1, sizeof(struct 
kbase_ioctl_gpu_props_reg_dump)); ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GPU_PROPS_REG_DUMP, k->gpuprops); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_GPU_PROPS_REG_DUMP)"); ++ return false; ++ } ++ ++ return true; ++} ++#endif ++ ++static bool ++free_gpuprops(kbase k) ++{ ++ free(k->gpuprops); ++ return true; ++} ++ ++#if PAN_BASE_API >= 2 ++static bool ++mmap_user_reg(kbase k) ++{ ++ k->csf_user_reg = kbase_mmap(NULL, k->page_size, PROT_READ, ++ MAP_SHARED, k->fd, ++ BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); ++ ++ if (k->csf_user_reg == MAP_FAILED) { ++ perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); ++ k->csf_user_reg = NULL; ++ return false; ++ } ++ return true; ++} ++ ++static bool ++munmap_user_reg(kbase k) ++{ ++ if (k->csf_user_reg) ++ return munmap(k->csf_user_reg, k->page_size) == 0; ++ return true; ++} ++#endif ++ ++#if PAN_BASE_API >= 1 ++static bool ++init_mem_exec(kbase k) ++{ ++ struct kbase_ioctl_mem_exec_init init = { ++ .va_pages = 0x100000, ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++init_mem_jit(kbase k) ++{ ++ struct kbase_ioctl_mem_jit_init init = { ++ .va_pages = 1 << 25, ++ .max_allocations = 255, ++ .phys_pages = 1 << 25, ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_JIT_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); ++ return false; ++ } ++ return true; ++} ++#endif ++ ++#if PAN_BASE_API >= 2 ++static struct base_ptr ++kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags); ++ ++static bool ++alloc_event_mem(kbase k) ++{ ++ k->event_mem = kbase_alloc(k, k->page_size * 2, ++ PANFROST_BO_NOEXEC, ++ BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | ++ BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | ++ BASE_MEM_SAME_VA | BASE_MEM_CSF_EVENT); ++ k->kcpu_event_mem = (struct base_ptr) { ++ .cpu = k->event_mem.cpu + k->page_size, ++ .gpu = k->event_mem.gpu + k->page_size, ++ }; ++ return k->event_mem.cpu; ++} ++ ++static bool ++free_event_mem(kbase k) ++{ ++ if (k->event_mem.cpu) ++ return munmap(k->event_mem.cpu, k->page_size * 2) == 0; ++ return true; ++} ++#endif ++ ++#if PAN_BASE_API >= 2 ++static bool ++cs_group_create(kbase k, struct kbase_context *c) ++{ ++ /* TODO: What about compute-only contexts? 
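++    *
++    * One queue group is created per context: all shader cores are enabled
++    * in the fragment and compute masks, a single tiler is requested, and
++    * cs_min asks for cs_queue_count command streams. The group_handle
++    * returned here is what kbase_cs_bind_noevent() and cs_group_term()
++    * later pass back to the kernel.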
*/ ++ union kbase_ioctl_cs_queue_group_create_1_6 create = { ++ .in = { ++ /* Mali *still* only supports a single tiler unit */ ++ .tiler_mask = 1, ++ .fragment_mask = ~0ULL, ++ .compute_mask = ~0ULL, ++ ++ .cs_min = k->cs_queue_count, ++ ++ .priority = 1, ++ .tiler_max = 1, ++ .fragment_max = 64, ++ .compute_max = 64, ++ } ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); ++ return false; ++ } ++ ++ c->csg_handle = create.out.group_handle; ++ c->csg_uid = create.out.group_uid; ++ ++ /* Should be at least 1 */ ++ assert(c->csg_uid); ++ ++ return true; ++} ++ ++static bool ++cs_group_term(kbase k, struct kbase_context *c) ++{ ++ if (!c->csg_uid) ++ return true; ++ ++ struct kbase_ioctl_cs_queue_group_term term = { ++ .group_handle = c->csg_handle ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); ++ return false; ++ } ++ return true; ++} ++#endif ++ ++#if PAN_BASE_API >= 2 ++static bool ++tiler_heap_create(kbase k, struct kbase_context *c) ++{ ++ c->tiler_heap_chunk_size = 1 << 21; /* 2 MB */ ++ ++ union kbase_ioctl_cs_tiler_heap_init init = { ++ .in = { ++ .chunk_size = c->tiler_heap_chunk_size, ++ .initial_chunks = 5, ++ .max_chunks = 200, ++ .target_in_flight = 65535, ++ } ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); ++ return false; ++ } ++ ++ c->tiler_heap_va = init.out.gpu_heap_va; ++ c->tiler_heap_header = init.out.first_chunk_va; ++ ++ return true; ++} ++ ++static bool ++tiler_heap_term(kbase k, struct kbase_context *c) ++{ ++ if (!c->tiler_heap_va) ++ return true; ++ ++ struct kbase_ioctl_cs_tiler_heap_term term = { ++ .gpu_heap_va = c->tiler_heap_va ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); ++ return false; ++ } ++ return true; ++} ++#endif ++ ++typedef bool (* kbase_func)(kbase k); ++ ++struct kbase_op { ++ kbase_func part; ++ kbase_func cleanup; ++ const char *label; ++}; ++ ++static struct kbase_op kbase_main[] = { ++ { alloc_handles, free_handles, "Allocate handle array" }, ++#if PAN_BASE_API >= 1 ++ { set_flags, NULL, "Set flags" }, ++#endif ++ { mmap_tracking, munmap_tracking, "Map tracking handle" }, ++#if PAN_BASE_API == 0 ++ { set_flags, NULL, "Set flags" }, ++#endif ++ { get_gpuprops, free_gpuprops, "Get GPU properties" }, ++#if PAN_BASE_API >= 2 ++ { mmap_user_reg, munmap_user_reg, "Map user register page" }, ++#endif ++#if PAN_BASE_API >= 1 ++ { init_mem_exec, NULL, "Initialise EXEC_VA zone" }, ++ { init_mem_jit, NULL, "Initialise JIT allocator" }, ++#endif ++#if PAN_BASE_API >= 2 ++ { alloc_event_mem, free_event_mem, "Allocate event memory" }, ++#endif ++}; ++ ++static void ++kbase_close(kbase k) ++{ ++ while (k->setup_state) { ++ unsigned i = k->setup_state - 1; ++ if (kbase_main[i].cleanup) ++ kbase_main[i].cleanup(k); ++ --k->setup_state; ++ } ++ ++ pthread_mutex_destroy(&k->handle_lock); ++ pthread_mutex_destroy(&k->event_read_lock); ++ pthread_mutex_destroy(&k->event_cnd_lock); ++ pthread_mutex_destroy(&k->queue_lock); ++ pthread_cond_destroy(&k->event_cnd); ++ ++ close(k->fd); ++} ++ ++static bool ++kbase_get_pan_gpuprop(kbase k, unsigned name, uint64_t *value) ++{ ++ unsigned conv[] = { ++ 
[DRM_PANFROST_PARAM_GPU_PROD_ID] = KBASE_GPUPROP_PRODUCT_ID, ++ [DRM_PANFROST_PARAM_SHADER_PRESENT] = KBASE_GPUPROP_RAW_SHADER_PRESENT, ++ [DRM_PANFROST_PARAM_TEXTURE_FEATURES0] = KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, ++ [DRM_PANFROST_PARAM_THREAD_TLS_ALLOC] = KBASE_GPUPROP_TLS_ALLOC, ++ [DRM_PANFROST_PARAM_TILER_FEATURES] = KBASE_GPUPROP_RAW_TILER_FEATURES, ++ }; ++ ++ if (name < ARRAY_SIZE(conv) && conv[name]) ++ return kbase_get_mali_gpuprop(k, conv[name], value); ++ ++ switch (name) { ++ case DRM_PANFROST_PARAM_AFBC_FEATURES: ++ *value = 0; ++ return true; ++ case DRM_PANFROST_PARAM_GPU_REVISION: { ++ if (!kbase_get_mali_gpuprop(k, KBASE_GPUPROP_RAW_GPU_ID, value)) ++ return false; ++ *value &= 0xffff; ++ return true; ++ } ++ default: ++ return false; ++ } ++} ++ ++static void ++kbase_free(kbase k, base_va va) ++{ ++ struct kbase_ioctl_mem_free f = { ++ .gpu_addr = va ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_FREE, &f); ++ ++ if (ret == -1) ++ perror("ioctl(KBASE_IOCTL_MEM_FREE)"); ++} ++ ++static struct base_ptr ++kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags) ++{ ++ struct base_ptr r = {0}; ++ ++ unsigned pages = DIV_ROUND_UP(size, k->page_size); ++ ++ union kbase_ioctl_mem_alloc a = { ++ .in = { ++ .va_pages = pages, ++ .commit_pages = pages, ++ } ++ }; ++ ++ size_t alloc_size = size; ++ unsigned flags = mali_flags; ++ bool exec_align = false; ++ ++ if (!flags) { ++ flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | ++ BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | ++ BASE_MEM_SAME_VA; ++ ++ /* Add COHERENT_LOCAL to keep GPU cores coherent with each ++ * other. */ ++ if (PAN_BASE_API >= 1) ++ flags |= BASE_MEM_COHERENT_LOCAL; ++ } ++ ++ if (pan_flags & PANFROST_BO_HEAP) { ++ size_t align_size = 2 * 1024 * 1024 / k->page_size; /* 2 MB */ ++ ++ a.in.va_pages = ALIGN_POT(a.in.va_pages, align_size); ++ a.in.commit_pages = 0; ++ a.in.extension = align_size; ++ flags |= BASE_MEM_GROW_ON_GPF; ++ } ++ ++#if PAN_BASE_API >= 1 ++ if (pan_flags & MALI_BO_CACHED_CPU) ++ flags |= BASE_MEM_CACHED_CPU; ++#endif ++ ++#if PAN_BASE_API >= 2 ++ if (pan_flags & MALI_BO_UNCACHED_GPU) ++ flags |= BASE_MEM_UNCACHED_GPU; ++#endif ++ ++ if (!(pan_flags & PANFROST_BO_NOEXEC)) { ++ /* Using SAME_VA for executable BOs would make it too likely ++ * for a blend shader to end up on the wrong side of a 4 GB ++ * boundary. */ ++ flags |= BASE_MEM_PROT_GPU_EX; ++ flags &= ~(BASE_MEM_PROT_GPU_WR | BASE_MEM_SAME_VA); ++ ++ if (PAN_BASE_API == 0) { ++ /* Assume 4K pages */ ++ a.in.va_pages = 0x1000; /* Align shader BOs to 16 MB */ ++ size = 1 << 26; /* Four times the alignment */ ++ exec_align = true; ++ } ++ } ++ ++ a.in.flags = flags; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_ALLOC, &a); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); ++ return r; ++ } ++ ++ // TODO: Is this always true, even in the face of multithreading? ++ if (PAN_BASE_API == 0) ++ a.out.gpu_va = 0x41000; ++ ++ if ((flags & BASE_MEM_SAME_VA) && ++ !((a.out.flags & BASE_MEM_SAME_VA) && ++ a.out.gpu_va < 0x80000)) { ++ ++ fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", ++ (uint64_t) a.out.flags, (uint64_t) a.out.gpu_va); ++ errno = EINVAL; ++ return r; ++ } ++ ++ void *ptr = kbase_mmap(NULL, size, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ k->fd, a.out.gpu_va); ++ ++ if (ptr == MAP_FAILED) { ++ perror("mmap(GPU BO)"); ++ kbase_free(k, a.out.gpu_va); ++ return r; ++ } ++ ++ uint64_t gpu_va = (a.out.flags & BASE_MEM_SAME_VA) ? 
++ (uintptr_t) ptr : a.out.gpu_va; ++ ++ if (exec_align) { ++ gpu_va = ALIGN_POT(gpu_va, 1 << 24); ++ ++ ptr = kbase_mmap(NULL, alloc_size, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ k->fd, gpu_va); ++ ++ if (ptr == MAP_FAILED) { ++ perror("mmap(GPU EXEC BO)"); ++ kbase_free(k, gpu_va); ++ return r; ++ } ++ } ++ ++ r.cpu = ptr; ++ r.gpu = gpu_va; ++ ++ return r; ++} ++ ++static int ++kbase_import_dmabuf(kbase k, int fd) ++{ ++ int ret; ++ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); ++ ++ kbase_handle *handles = util_dynarray_begin(&k->gem_handles); ++ ++ for (unsigned i = 0; i < size; ++i) { ++ kbase_handle h = handles[i]; ++ ++ if (h.fd < 0) ++ continue; ++ ++ ret = os_same_file_description(h.fd, fd); ++ ++ if (ret == 0) { ++ pthread_mutex_unlock(&k->handle_lock); ++ return i; ++ } else if (ret < 0) { ++ printf("error in os_same_file_description(%i, %i)\n", h.fd, fd); ++ } ++ } ++ ++ int dup = os_dupfd_cloexec(fd); ++ ++ union kbase_ioctl_mem_import import = { ++ .in = { ++ .phandle = (uintptr_t) &dup, ++ .type = BASE_MEM_IMPORT_TYPE_UMM, ++ /* Usage flags: CPU/GPU reads/writes */ ++ .flags = 0xf, ++ } ++ }; ++ ++ ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_IMPORT, &import); ++ ++ int handle; ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_IMPORT)"); ++ handle = -1; ++ } else if (import.out.flags & BASE_MEM_NEED_MMAP) { ++ uint64_t va = (uintptr_t) kbase_mmap(NULL, import.out.va_pages * k->page_size, ++ PROT_READ | PROT_WRITE, ++ MAP_SHARED, k->fd, import.out.gpu_va); ++ ++ if (va == (uintptr_t) MAP_FAILED) { ++ perror("mmap(IMPORTED BO)"); ++ handle = -1; ++ } else { ++ handle = kbase_alloc_gem_handle_locked(k, va, dup); ++ } ++ } else { ++ handle = kbase_alloc_gem_handle_locked(k, import.out.gpu_va, dup); ++ } ++ ++ pthread_mutex_unlock(&k->handle_lock); ++ ++ return handle; ++} ++ ++static void * ++kbase_mmap_import(kbase k, base_va va, size_t size) ++{ ++ return kbase_mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, k->fd, va); ++} ++ ++struct kbase_fence { ++ struct list_head link; ++ ++ unsigned slot; ++ uint64_t value; ++}; ++ ++struct kbase_syncobj { ++ struct list_head link; ++ ++ struct list_head fences; ++}; ++ ++static struct kbase_syncobj * ++kbase_syncobj_create(kbase k) ++{ ++ struct kbase_syncobj *o = calloc(1, sizeof(*o)); ++ list_inithead(&o->fences); ++ pthread_mutex_lock(&k->queue_lock); ++ list_add(&o->link, &k->syncobjs); ++ pthread_mutex_unlock(&k->queue_lock); ++ return o; ++} ++ ++static void ++kbase_syncobj_destroy(kbase k, struct kbase_syncobj *o) ++{ ++ pthread_mutex_lock(&k->queue_lock); ++ list_del(&o->link); ++ pthread_mutex_unlock(&k->queue_lock); ++ ++ list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { ++ list_del(&fence->link); ++ free(fence); ++ } ++ ++ free(o); ++} ++ ++static void ++kbase_syncobj_add_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) ++{ ++ struct kbase_fence *fence = calloc(1, sizeof(*fence)); ++ ++ fence->slot = slot; ++ fence->value = value; ++ ++ list_add(&fence->link, &o->fences); ++} ++ ++static void ++kbase_syncobj_update_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) ++{ ++ list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { ++ if (fence->slot == slot) { ++ if (value > fence->value) ++ fence->value = value; ++ ++ return; ++ } ++ } ++ ++ kbase_syncobj_add_fence(o, slot, value); ++} ++ ++static struct kbase_syncobj * ++kbase_syncobj_dup(kbase k, struct kbase_syncobj *o) ++{ ++ struct 
kbase_syncobj *dup = kbase_syncobj_create(k); ++ ++ pthread_mutex_lock(&k->queue_lock); ++ ++ list_for_each_entry(struct kbase_fence, fence, &o->fences, link) ++ kbase_syncobj_add_fence(dup, fence->slot, fence->value); ++ ++ pthread_mutex_unlock(&k->queue_lock); ++ ++ return dup; ++} ++ ++static void ++kbase_syncobj_update(kbase k, struct kbase_syncobj *o) ++{ ++ list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { ++ uint64_t value = k->event_slots[fence->slot].last; ++ ++ if (value > fence->value) { ++ LOG("syncobj %p slot %u value %"PRIu64" vs %"PRIu64"\n", ++ o, fence->slot, fence->value, value); ++ ++ list_del(&fence->link); ++ free(fence); ++ } ++ } ++} ++ ++static bool ++kbase_syncobj_wait(kbase k, struct kbase_syncobj *o) ++{ ++ if (list_is_empty(&o->fences)) { ++ LOG("syncobj has no fences\n"); ++ return true; ++ } ++ ++ struct kbase_wait_ctx wait = kbase_wait_init(k, 1 * 1000000000LL); ++ ++ while (kbase_wait_for_event(&wait)) { ++ kbase_syncobj_update(k, o); ++ ++ if (list_is_empty(&o->fences)) { ++ kbase_wait_fini(wait); ++ return true; ++ } ++ } ++ ++ kbase_wait_fini(wait); ++ ++ fprintf(stderr, "syncobj %p wait timeout\n", o); ++ return false; ++} ++ ++static bool ++kbase_poll_event(kbase k, int64_t timeout_ns) ++{ ++ struct pollfd pfd = { ++ .fd = k->fd, ++ .events = POLLIN, ++ }; ++ ++ struct timespec t = { ++ .tv_sec = timeout_ns / 1000000000, ++ .tv_nsec = timeout_ns % 1000000000, ++ }; ++ ++ int ret = ppoll(&pfd, 1, &t, NULL); ++ ++ if (ret == -1 && errno != EINTR) ++ perror("poll(mali fd)"); ++ ++ LOG("poll returned %i\n", pfd.revents); ++ ++ return ret != 0; ++} ++ ++#if PAN_BASE_API < 2 ++static bool ++kbase_handle_events(kbase k) ++{ ++ struct base_jd_event_v2 event; ++ bool ret = true; ++ ++ for (;;) { ++ int ret = read(k->fd, &event, sizeof(event)); ++ ++ if (ret == -1) { ++ if (errno == EAGAIN) { ++ return true; ++ } else { ++ perror("read(mali fd)"); ++ return false; ++ } ++ } ++ ++ if (event.event_code != BASE_JD_EVENT_DONE) { ++ fprintf(stderr, "Atom %i reported event 0x%x!\n", ++ event.atom_number, event.event_code); ++ ret = false; ++ } ++ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ k->event_slots[event.atom_number].last = event.udata.blob[0]; ++ ++ unsigned size = util_dynarray_num_elements(&k->gem_handles, ++ kbase_handle); ++ kbase_handle *handle_data = util_dynarray_begin(&k->gem_handles); ++ ++ struct util_dynarray *handles = k->atom_bos + event.atom_number; ++ ++ util_dynarray_foreach(handles, int32_t, h) { ++ if (*h >= size) ++ continue; ++ assert(handle_data[*h].use_count); ++ --handle_data[*h].use_count; ++ } ++ util_dynarray_fini(handles); ++ ++ pthread_mutex_unlock(&k->handle_lock); ++ } ++ ++ return ret; ++} ++ ++#else ++ ++static bool ++kbase_read_event(kbase k) ++{ ++ struct base_csf_notification event; ++ int ret = read(k->fd, &event, sizeof(event)); ++ ++ if (ret == -1) { ++ if (errno == EAGAIN) { ++ return true; ++ } else { ++ perror("read(mali_fd)"); ++ return false; ++ } ++ } ++ ++ if (ret != sizeof(event)) { ++ fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", ++ ret, (int) sizeof(event)); ++ return false; ++ } ++ ++ switch (event.type) { ++ case BASE_CSF_NOTIFICATION_EVENT: ++ LOG("Notification event!\n"); ++ return true; ++ ++ case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: ++ break; ++ ++ case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: ++ fprintf(stderr, "No event from mali_fd!\n"); ++ return true; ++ ++ default: ++ fprintf(stderr, "Unknown event type!\n"); ++ return true; ++ } ++ ++ struct 
base_gpu_queue_group_error e = event.payload.csg_error.error; ++ ++ switch (e.error_type) { ++ case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { ++ // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h ++ fprintf(stderr, "Queue group error: status 0x%x " ++ "sideband 0x%"PRIx64"\n", ++ e.payload.fatal_group.status, ++ (uint64_t) e.payload.fatal_group.sideband); ++ break; ++ } ++ case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { ++ unsigned queue = e.payload.fatal_queue.csi_index; ++ ++ // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h ++ fprintf(stderr, "Queue %i error: status 0x%x " ++ "sideband 0x%"PRIx64"\n", ++ queue, e.payload.fatal_queue.status, ++ (uint64_t) e.payload.fatal_queue.sideband); ++ ++ /* TODO: Decode the instruct that it got stuck at */ ++ ++ break; ++ } ++ ++ case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: ++ fprintf(stderr, "Command stream timeout!\n"); ++ break; ++ case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: ++ fprintf(stderr, "Command stream OOM!\n"); ++ break; ++ default: ++ fprintf(stderr, "Unknown error type!\n"); ++ } ++ ++ return false; ++} ++ ++static void ++kbase_update_queue_callbacks(kbase k, ++ struct kbase_event_slot *slot, ++ uint64_t seqnum) ++{ ++ struct kbase_sync_link **list = &slot->syncobjs; ++ struct kbase_sync_link **back = slot->back; ++ ++ while (*list) { ++ struct kbase_sync_link *link = *list; ++ ++ LOG("seq %"PRIu64" %"PRIu64"\n", seqnum, link->seqnum); ++ ++ /* Items in the list should be in order, there is no need to ++ * check any more if we can't process this link yet. */ ++ if (seqnum <= link->seqnum) ++ break; ++ ++ LOG("done, calling %p(%p)\n", link->callback, link->data); ++ link->callback(link->data); ++ *list = link->next; ++ if (&link->next == back) ++ slot->back = list; ++ free(link); ++ } ++} ++ ++static bool ++kbase_handle_events(kbase k) ++{ ++#ifdef PAN_BASE_NOOP ++ return true; ++#endif ++ ++ /* This will clear the event count, so there's no need to do it in a ++ * loop. */ ++ bool ret = kbase_read_event(k); ++ ++ uint64_t *event_mem = k->event_mem.cpu; ++ ++ pthread_mutex_lock(&k->queue_lock); ++ ++ for (unsigned i = 0; i < k->event_slot_usage; ++i) { ++ uint64_t seqnum = event_mem[i * 2]; ++ uint64_t cmp = k->event_slots[i].last; ++ ++ LOG("MAIN SEQ %"PRIu64" > %"PRIu64"?\n", seqnum, cmp); ++ ++ if (seqnum < cmp) { ++ if (false) ++ fprintf(stderr, "seqnum at offset %i went backward " ++ "from %"PRIu64" to %"PRIu64"!\n", ++ i, cmp, seqnum); ++ } else /*if (seqnum > cmp)*/ { ++ kbase_update_queue_callbacks(k, &k->event_slots[i], ++ seqnum); ++ } ++ ++ /* TODO: Atomic operations? */ ++ k->event_slots[i].last = seqnum; ++ } ++ ++ pthread_mutex_unlock(&k->queue_lock); ++ ++ return ret; ++} ++ ++#endif ++ ++#if PAN_BASE_API < 2 ++static uint8_t ++kbase_latest_slot(uint8_t a, uint8_t b, uint8_t newest) ++{ ++ /* If a == 4 and newest == 5, a will become 255 */ ++ a -= newest; ++ b -= newest; ++ a = MAX2(a, b); ++ a += newest; ++ return a; ++} ++ ++static int ++kbase_submit(kbase k, uint64_t va, unsigned req, ++ struct kbase_syncobj *o, ++ int32_t *handles, unsigned num_handles) ++{ ++ struct util_dynarray buf; ++ util_dynarray_init(&buf, NULL); ++ ++ memcpy(util_dynarray_resize(&buf, int32_t, num_handles), ++ handles, num_handles * sizeof(int32_t)); ++ ++ pthread_mutex_lock(&k->handle_lock); ++ ++ unsigned slot = (req & PANFROST_JD_REQ_FS) ? 
0 : 1; ++ unsigned dep_slots[KBASE_SLOT_COUNT]; ++ ++ uint8_t nr = k->atom_number++; ++ ++ struct base_jd_atom_v2 atom = { ++ .jc = va, ++ .atom_number = nr, ++ .udata.blob[0] = k->job_seq++, ++ }; ++ ++ for (unsigned i = 0; i < KBASE_SLOT_COUNT; ++i) ++ dep_slots[i] = nr; ++ ++ /* Make sure that we haven't taken an atom that's already in use. */ ++ assert(!k->atom_bos[nr].data); ++ k->atom_bos[atom.atom_number] = buf; ++ ++ unsigned handle_buf_size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); ++ kbase_handle *handle_buf = util_dynarray_begin(&k->gem_handles); ++ ++ struct util_dynarray extres; ++ util_dynarray_init(&extres, NULL); ++ ++ /* Mark the BOs as in use */ ++ for (unsigned i = 0; i < num_handles; ++i) { ++ int32_t h = handles[i]; ++ assert(h < handle_buf_size); ++ assert(handle_buf[h].use_count < 255); ++ ++ /* Implicit sync */ ++ if (handle_buf[h].use_count) ++ for (unsigned s = 0; s < KBASE_SLOT_COUNT; ++s) ++ dep_slots[s] = ++ kbase_latest_slot(dep_slots[s], ++ handle_buf[h].last_access[s], ++ nr); ++ ++ handle_buf[h].last_access[slot] = nr; ++ ++handle_buf[h].use_count; ++ ++ if (handle_buf[h].fd != -1) ++ util_dynarray_append(&extres, base_va, handle_buf[h].va); ++ } ++ ++ pthread_mutex_unlock(&k->handle_lock); ++ ++ /* TODO: Better work out the difference between handle_lock and ++ * queue_lock. */ ++ if (o) { ++ pthread_mutex_lock(&k->queue_lock); ++ kbase_syncobj_update_fence(o, nr, atom.udata.blob[0]); ++ pthread_mutex_unlock(&k->queue_lock); ++ } ++ ++ assert(KBASE_SLOT_COUNT == 2); ++ if (dep_slots[0] != nr) { ++ atom.pre_dep[0].atom_id = dep_slots[0]; ++ /* TODO: Use data dependencies? */ ++ atom.pre_dep[0].dependency_type = BASE_JD_DEP_TYPE_ORDER; ++ } ++ if (dep_slots[1] != nr) { ++ atom.pre_dep[1].atom_id = dep_slots[1]; ++ atom.pre_dep[1].dependency_type = BASE_JD_DEP_TYPE_ORDER; ++ } ++ ++ if (extres.size) { ++ atom.core_req |= BASE_JD_REQ_EXTERNAL_RESOURCES; ++ atom.nr_extres = util_dynarray_num_elements(&extres, base_va); ++ atom.extres_list = (uintptr_t) util_dynarray_begin(&extres); ++ } ++ ++ if (req & PANFROST_JD_REQ_FS) ++ atom.core_req |= BASE_JD_REQ_FS; ++ else ++ atom.core_req |= BASE_JD_REQ_CS | BASE_JD_REQ_T; ++ ++ struct kbase_ioctl_job_submit submit = { ++ .nr_atoms = 1, ++ .stride = sizeof(atom), ++ .addr = (uintptr_t) &atom, ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_JOB_SUBMIT, &submit); ++ ++ util_dynarray_fini(&extres); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_JOB_SUBMIT)"); ++ return -1; ++ } ++ ++ return atom.atom_number; ++} ++ ++#else ++static struct kbase_context * ++kbase_context_create(kbase k) ++{ ++ struct kbase_context *c = calloc(1, sizeof(*c)); ++ ++ if (!cs_group_create(k, c)) { ++ free(c); ++ return NULL; ++ } ++ ++ if (!tiler_heap_create(k, c)) { ++ cs_group_term(k, c); ++ free(c); ++ return NULL; ++ } ++ ++ return c; ++} ++ ++static void ++kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx); ++ ++static void ++kbase_context_destroy(kbase k, struct kbase_context *ctx) ++{ ++ kbase_kcpu_queue_destroy(k, ctx); ++ tiler_heap_term(k, ctx); ++ cs_group_term(k, ctx); ++ free(ctx); ++} ++ ++static bool ++kbase_context_recreate(kbase k, struct kbase_context *ctx) ++{ ++ kbase_kcpu_queue_destroy(k, ctx); ++ tiler_heap_term(k, ctx); ++ cs_group_term(k, ctx); ++ ++ if (!cs_group_create(k, ctx)) { ++ free(ctx); ++ return false; ++ } ++ ++ if (!tiler_heap_create(k, ctx)) { ++ free(ctx); ++ return false; ++ } ++ ++ return true; ++} ++ ++static struct kbase_cs ++kbase_cs_bind_noevent(kbase k, struct 
kbase_context *ctx, ++ base_va va, unsigned size, unsigned csi) ++{ ++ struct kbase_cs cs = { ++ .ctx = ctx, ++ .va = va, ++ .size = size, ++ .csi = csi, ++ .latest_flush = (uint32_t *)k->csf_user_reg, ++ }; ++ ++ struct kbase_ioctl_cs_queue_register reg = { ++ .buffer_gpu_addr = va, ++ .buffer_size = size, ++ .priority = 1, ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_REGISTER, ®); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)"); ++ return cs; ++ } ++ ++ union kbase_ioctl_cs_queue_bind bind = { ++ .in = { ++ .buffer_gpu_addr = va, ++ .group_handle = ctx->csg_handle, ++ .csi_index = csi, ++ } ++ }; ++ ++ ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)"); ++ // hack ++ cs.user_io = (void *)1; ++ return cs; ++ } ++ ++ cs.user_io = ++ kbase_mmap(NULL, ++ k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ k->fd, bind.out.mmap_handle); ++ ++ if (cs.user_io == MAP_FAILED) { ++ perror("mmap(CS USER IO)"); ++ cs.user_io = NULL; ++ } ++ ++ return cs; ++} ++ ++static struct kbase_cs ++kbase_cs_bind(kbase k, struct kbase_context *ctx, ++ base_va va, unsigned size) ++{ ++ struct kbase_cs cs = kbase_cs_bind_noevent(k, ctx, va, size, ctx->num_csi++); ++ ++ // TODO: Fix this problem properly ++ if (k->event_slot_usage >= 256) { ++ fprintf(stderr, "error: Too many contexts created!\n"); ++ ++ /* *very* dangerous, but might just work */ ++ --k->event_slot_usage; ++ } ++ ++ // TODO: This is a misnomer... it isn't a byte offset ++ cs.event_mem_offset = k->event_slot_usage++; ++ k->event_slots[cs.event_mem_offset].back = ++ &k->event_slots[cs.event_mem_offset].syncobjs; ++ ++ uint64_t *event_data = k->event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; ++ ++ /* We use the "Higher" wait condition, so initialise to 1 to allow ++ * waiting before writing... */ ++ event_data[0] = 1; ++ /* And reset the error field to 0, to avoid INHERITing faults */ ++ event_data[1] = 0; ++ ++ /* Just a zero-init is fine... 
reads and writes are always paired */ ++ uint64_t *kcpu_data = k->kcpu_event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; ++ kcpu_data[0] = 0; ++ kcpu_data[1] = 0; ++ ++ /* To match the event data */ ++ k->event_slots[cs.event_mem_offset].last = 1; ++ k->event_slots[cs.event_mem_offset].last_submit = 1; ++ ++ return cs; ++} ++ ++static void ++kbase_cs_term(kbase k, struct kbase_cs *cs) ++{ ++ if (cs->user_io) { ++ LOG("unmapping %p user_io %p\n", cs, cs->user_io); ++ munmap(cs->user_io, ++ k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES); ++ } ++ ++ struct kbase_ioctl_cs_queue_terminate term = { ++ .buffer_gpu_addr = cs->va, ++ }; ++ ++ kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_TERMINATE, &term); ++ ++ pthread_mutex_lock(&k->queue_lock); ++ kbase_update_queue_callbacks(k, &k->event_slots[cs->event_mem_offset], ++ ~0ULL); ++ ++ k->event_slots[cs->event_mem_offset].last = ~0ULL; ++ ++ /* Make sure that no syncobjs are referencing this CS */ ++ list_for_each_entry(struct kbase_syncobj, o, &k->syncobjs, link) ++ kbase_syncobj_update(k, o); ++ ++ ++ k->event_slots[cs->event_mem_offset].last = 0; ++ pthread_mutex_unlock(&k->queue_lock); ++} ++ ++static void ++kbase_cs_rebind(kbase k, struct kbase_cs *cs) ++{ ++ struct kbase_cs new; ++ new = kbase_cs_bind_noevent(k, cs->ctx, cs->va, cs->size, cs->csi); ++ ++ cs->user_io = new.user_io; ++ LOG("remapping %p user_io %p\n", cs, cs->user_io); ++ ++ fprintf(stderr, "bound csi %i again\n", cs->csi); ++} ++ ++static bool ++kbase_cs_kick(kbase k, struct kbase_cs *cs) ++{ ++ struct kbase_ioctl_cs_queue_kick kick = { ++ .buffer_gpu_addr = cs->va, ++ }; ++ ++ int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); ++ return false; ++ } ++ ++ return true; ++} ++ ++#define CS_RING_DOORBELL(cs) \ ++ *((uint32_t *)(cs->user_io)) = 1 ++ ++#define CS_READ_REGISTER(cs, r) \ ++ *((uint64_t *)(cs->user_io + 4096 * 2 + r)) ++ ++#define CS_WRITE_REGISTER(cs, r, v) \ ++ *((uint64_t *)(cs->user_io + 4096 + r)) = v ++ ++static bool ++kbase_cs_submit(kbase k, struct kbase_cs *cs, uint64_t insert_offset, ++ struct kbase_syncobj *o, uint64_t seqnum) ++{ ++ LOG("submit %p, seq %"PRIu64", insert %"PRIu64" -> %"PRIu64"\n", ++ cs, seqnum, cs->last_insert, insert_offset); ++ ++ if (!cs->user_io) ++ return false; ++ ++ if (insert_offset == cs->last_insert) ++ return true; ++ ++#ifndef PAN_BASE_NOOP ++ struct kbase_event_slot *slot = ++ &k->event_slots[cs->event_mem_offset]; ++ ++ pthread_mutex_lock(&k->queue_lock); ++ slot->last_submit = seqnum + 1; ++ ++ if (o) ++ kbase_syncobj_update_fence(o, cs->event_mem_offset, seqnum); ++ pthread_mutex_unlock(&k->queue_lock); ++#endif ++ ++ memory_barrier(); ++ ++ bool active = CS_READ_REGISTER(cs, CS_ACTIVE); ++ LOG("active is %i\n", active); ++ ++ CS_WRITE_REGISTER(cs, CS_INSERT, insert_offset); ++ cs->last_insert = insert_offset; ++ ++ if (false /*active*/) { ++ memory_barrier(); ++ CS_RING_DOORBELL(cs); ++ memory_barrier(); ++ ++ active = CS_READ_REGISTER(cs, CS_ACTIVE); ++ LOG("active is now %i\n", active); ++ } else { ++ kbase_cs_kick(k, cs); ++ } ++ ++ return true; ++} ++ ++static bool ++kbase_cs_wait(kbase k, struct kbase_cs *cs, uint64_t extract_offset, ++ struct kbase_syncobj *o) ++{ ++ if (!cs->user_io) ++ return false; ++ ++ if (kbase_syncobj_wait(k, o)) ++ return true; ++ ++ uint64_t e = CS_READ_REGISTER(cs, CS_EXTRACT); ++ unsigned a = CS_READ_REGISTER(cs, CS_ACTIVE); ++ ++ fprintf(stderr, "CSI %i CS_EXTRACT (%"PRIu64") != %"PRIu64", " ++ "CS_ACTIVE 
(%i)\n", ++ cs->csi, e, extract_offset, a); ++ ++ fprintf(stderr, "fences:\n"); ++ list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { ++ fprintf(stderr, " slot %i: seqnum %"PRIu64"\n", ++ fence->slot, fence->value); ++ } ++ ++ return false; ++} ++ ++static bool ++kbase_kcpu_queue_create(kbase k, struct kbase_context *ctx) ++{ ++#ifdef PAN_BASE_NOOP ++ return false; ++#endif ++ ++ if (ctx->kcpu_init) ++ return true; ++ ++ struct kbase_ioctl_kcpu_queue_new create = {0}; ++ ++ int ret; ++ ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_CREATE, &create); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_CREATE)"); ++ return false; ++ } ++ ++ ctx->kcpu_queue = create.id; ++ ctx->kcpu_init = true; ++ return true; ++} ++ ++static void ++kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx) ++{ ++ if (!ctx->kcpu_init) ++ return; ++ ++ struct kbase_ioctl_kcpu_queue_delete destroy = { ++ .id = ctx->kcpu_queue, ++ }; ++ ++ int ret; ++ ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_DELETE, &destroy); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_DELETE)"); ++ } ++ ++ ctx->kcpu_init = false; ++} ++ ++static bool ++kbase_kcpu_command(kbase k, struct kbase_context *ctx, struct base_kcpu_command *cmd) ++{ ++ int err; ++ bool ret = true; ++ ++ if (!kbase_kcpu_queue_create(k, ctx)) ++ return false; ++ ++ struct kbase_ioctl_kcpu_queue_enqueue enqueue = { ++ .addr = (uintptr_t) cmd, ++ .nr_commands = 1, ++ .id = ctx->kcpu_queue, ++ }; ++ ++ err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); ++ if (err != -1) ++ return ret; ++ ++ /* If the enqueue failed, probably we hit the limit of enqueued ++ * commands (256), wait a bit and try again. ++ */ ++ ++ struct kbase_wait_ctx wait = kbase_wait_init(k, 1000000000); ++ while (kbase_wait_for_event(&wait)) { ++ err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); ++ if (err != -1) ++ break; ++ ++ if (errno != EBUSY) { ++ ret = false; ++ perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_ENQUEUE"); ++ break; ++ } ++ } ++ kbase_wait_fini(wait); ++ ++ return ret; ++} ++ ++static int ++kbase_kcpu_fence_export(kbase k, struct kbase_context *ctx) ++{ ++ struct base_fence fence = { ++ .basep.fd = -1, ++ }; ++ ++ struct base_kcpu_command fence_cmd = { ++ .type = BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, ++ .info.fence.fence = (uintptr_t) &fence, ++ }; ++ ++ return kbase_kcpu_command(k, ctx, &fence_cmd) ? 
fence.basep.fd : -1; ++} ++ ++static bool ++kbase_kcpu_fence_import(kbase k, struct kbase_context *ctx, int fd) ++{ ++ struct base_kcpu_command fence_cmd = { ++ .type = BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, ++ .info.fence.fence = (uintptr_t) &(struct base_fence) { ++ .basep.fd = fd, ++ }, ++ }; ++ ++ return kbase_kcpu_command(k, ctx, &fence_cmd); ++} ++ ++static bool ++kbase_kcpu_cqs_set(kbase k, struct kbase_context *ctx, ++ base_va addr, uint64_t value) ++{ ++ struct base_kcpu_command set_cmd = { ++ .type = BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, ++ .info.cqs_set_operation = { ++ .objs = (uintptr_t) &(struct base_cqs_set_operation_info) { ++ .addr = addr, ++ .val = value, ++ .operation = BASEP_CQS_SET_OPERATION_SET, ++ .data_type = BASEP_CQS_DATA_TYPE_U64, ++ }, ++ .nr_objs = 1, ++ }, ++ }; ++ ++ return kbase_kcpu_command(k, ctx, &set_cmd); ++} ++ ++static bool ++kbase_kcpu_cqs_wait(kbase k, struct kbase_context *ctx, ++ base_va addr, uint64_t value) ++{ ++ struct base_kcpu_command wait_cmd = { ++ .type = BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, ++ .info.cqs_wait_operation = { ++ .objs = (uintptr_t) &(struct base_cqs_wait_operation_info) { ++ .addr = addr, ++ .val = value, ++ .operation = BASEP_CQS_WAIT_OPERATION_GT, ++ .data_type = BASEP_CQS_DATA_TYPE_U64, ++ }, ++ .nr_objs = 1, ++ .inherit_err_flags = 0, ++ }, ++ }; ++ ++ return kbase_kcpu_command(k, ctx, &wait_cmd); ++} ++#endif ++ ++// TODO: Only define for CSF kbases? ++static bool ++kbase_callback_all_queues(kbase k, int32_t *count, ++ void (*callback)(void *), void *data) ++{ ++ pthread_mutex_lock(&k->queue_lock); ++ ++ int32_t queue_count = 0; ++ ++ for (unsigned i = 0; i < k->event_slot_usage; ++i) { ++ struct kbase_event_slot *slot = &k->event_slots[i]; ++ ++ /* There is no need to do anything for idle slots */ ++ if (slot->last == slot->last_submit) ++ continue; ++ ++ struct kbase_sync_link *link = malloc(sizeof(*link)); ++ *link = (struct kbase_sync_link) { ++ .next = NULL, ++ .seqnum = slot->last_submit, ++ .callback = callback, ++ .data = data, ++ }; ++ ++ // TODO: Put insertion code into its own function ++ struct kbase_sync_link **list = slot->back; ++ slot->back = &link->next; ++ assert(!*list); ++ *list = link; ++ ++ ++queue_count; ++ } ++ ++ p_atomic_add(count, queue_count); ++ ++ pthread_mutex_unlock(&k->queue_lock); ++ ++ return queue_count != 0; ++} ++ ++static void ++kbase_mem_sync(kbase k, base_va gpu, void *cpu, size_t size, ++ bool invalidate) ++{ ++#ifdef __aarch64__ ++ /* Valgrind replaces the operations with DC CVAU, which is not enough ++ * for CPU<->GPU coherency. The ioctl can be used instead. */ ++ if (!RUNNING_ON_VALGRIND) { ++ /* I don't that memory barriers are needed here... having the ++ * DMB SY before submit should be enough. TODO what about ++ * dma-bufs? */ ++ if (invalidate) ++ cache_invalidate_range(cpu, size); ++ else ++ cache_clean_range(cpu, size); ++ return; ++ } ++#endif ++ ++ struct kbase_ioctl_mem_sync sync = { ++ .handle = gpu, ++ .user_addr = (uintptr_t) cpu, ++ .size = size, ++ .type = invalidate + (PAN_BASE_API == 0 ? 
0 : 1), ++ }; ++ ++ int ret; ++ ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_SYNC, &sync); ++ if (ret == -1) ++ perror("ioctl(KBASE_IOCTL_MEM_SYNC)"); ++} ++ ++bool ++#if defined(PAN_BASE_NOOP) ++kbase_open_csf_noop ++#elif PAN_BASE_API == 0 ++kbase_open_old ++#elif PAN_BASE_API == 1 ++kbase_open_new ++#elif PAN_BASE_API == 2 ++kbase_open_csf ++#endif ++(kbase k) ++{ ++ k->api = PAN_BASE_API; ++ ++ pthread_mutex_init(&k->handle_lock, NULL); ++ pthread_mutex_init(&k->event_read_lock, NULL); ++ pthread_mutex_init(&k->event_cnd_lock, NULL); ++ pthread_mutex_init(&k->queue_lock, NULL); ++ ++ pthread_condattr_t attr; ++ pthread_condattr_init(&attr); ++ pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); ++ pthread_cond_init(&k->event_cnd, &attr); ++ pthread_condattr_destroy(&attr); ++ ++ list_inithead(&k->syncobjs); ++ ++ /* For later APIs, we've already checked the version in pan_base.c */ ++#if PAN_BASE_API == 0 ++ struct kbase_ioctl_get_version ver = { 0 }; ++ kbase_ioctl(k->fd, KBASE_IOCTL_GET_VERSION, &ver); ++#endif ++ ++ k->close = kbase_close; ++ ++ k->get_pan_gpuprop = kbase_get_pan_gpuprop; ++ k->get_mali_gpuprop = kbase_get_mali_gpuprop; ++ ++ k->alloc = kbase_alloc; ++ k->free = kbase_free; ++ k->import_dmabuf = kbase_import_dmabuf; ++ k->mmap_import = kbase_mmap_import; ++ ++ k->poll_event = kbase_poll_event; ++ k->handle_events = kbase_handle_events; ++ ++#if PAN_BASE_API < 2 ++ k->submit = kbase_submit; ++#else ++ k->context_create = kbase_context_create; ++ k->context_destroy = kbase_context_destroy; ++ k->context_recreate = kbase_context_recreate; ++ ++ k->cs_bind = kbase_cs_bind; ++ k->cs_term = kbase_cs_term; ++ k->cs_rebind = kbase_cs_rebind; ++ k->cs_submit = kbase_cs_submit; ++ k->cs_wait = kbase_cs_wait; ++ ++ k->kcpu_fence_export = kbase_kcpu_fence_export; ++ k->kcpu_fence_import = kbase_kcpu_fence_import; ++ k->kcpu_cqs_set = kbase_kcpu_cqs_set; ++ k->kcpu_cqs_wait = kbase_kcpu_cqs_wait; ++#endif ++ ++ k->syncobj_create = kbase_syncobj_create; ++ k->syncobj_destroy = kbase_syncobj_destroy; ++ k->syncobj_dup = kbase_syncobj_dup; ++ k->syncobj_wait = kbase_syncobj_wait; ++ ++ k->callback_all_queues = kbase_callback_all_queues; ++ ++ k->mem_sync = kbase_mem_sync; ++ ++ for (unsigned i = 0; i < ARRAY_SIZE(kbase_main); ++i) { ++ ++k->setup_state; ++ if (!kbase_main[i].part(k)) { ++ k->close(k); ++ return false; ++ } ++ } ++ return true; ++} +diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c +index e8b6cf73e82..ec6ddb1837d 100644 +--- a/src/panfrost/bifrost/bifrost_compile.c ++++ b/src/panfrost/bifrost/bifrost_compile.c +@@ -404,6 +404,24 @@ bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp) + srcs, channels, nr, nir_dest_bit_size(instr->dest)); + } + ++static bi_index ++bi_load_sysval(bi_builder *b, int sysval, ++ unsigned nr_components, unsigned offset); ++ ++static bi_index ++bi_vertex_id_offset(bi_builder *b, bool offset) ++{ ++ bi_index vtx = bi_vertex_id(b); ++ ++ if (!offset) ++ return vtx; ++ ++ bi_index first = ++ bi_load_sysval(b, PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS, 1, 0); ++ ++ return bi_iadd_u32(b, vtx, first, false); ++} ++ + static void + bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) + { +@@ -419,8 +437,15 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) + bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); + bi_instr *I; + ++ /* The attribute offset field was removed from the compute job payload ++ * in v10. 
*/ ++ bool needs_offset = b->shader->arch >= 10 && ++ b->shader->nir->info.has_transform_feedback_varyings; ++ ++ bi_index vertex_id = bi_vertex_id_offset(b, needs_offset); ++ + if (immediate) { +- I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), ++ I = bi_ld_attr_imm_to(b, dest, vertex_id, + bi_instance_id(b), regfmt, vecsize, + imm_index); + } else { +@@ -431,7 +456,7 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) + else if (base != 0) + idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); + +- I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), ++ I = bi_ld_attr_to(b, dest, vertex_id, bi_instance_id(b), + idx, regfmt, vecsize); + } + +@@ -1878,16 +1903,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) + * and lower here if needed. + */ + case nir_intrinsic_load_vertex_id: +- if (b->shader->malloc_idvs) { +- bi_mov_i32_to(b, dst, bi_vertex_id(b)); +- } else { +- bi_index first = bi_load_sysval(b, +- PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS, +- 1, 0); +- +- bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false); +- } +- ++ bi_mov_i32_to(b, dst, bi_vertex_id_offset(b, !b->shader->malloc_idvs)); + break; + + /* We only use in our transform feedback lowering */ +@@ -2884,7 +2900,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) + break; + + case nir_op_i2i16: +- assert(src_sz == 8 || src_sz == 32); ++ assert(src_sz == 32 || src_sz == 16 || src_sz == 8); + + if (src_sz == 8) + bi_v2s8_to_v2s16_to(b, dst, s0); +@@ -2893,7 +2909,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) + break; + + case nir_op_u2u16: +- assert(src_sz == 8 || src_sz == 32); ++ assert(src_sz == 32 || src_sz == 16 || src_sz == 8); + + if (src_sz == 8) + bi_v2u8_to_v2u16_to(b, dst, s0); +@@ -4976,6 +4992,8 @@ bi_compile_variant_nir(nir_shader *nir, + + bi_validate(ctx, "NIR -> BIR"); + ++ _mesa_hash_table_u64_destroy(ctx->allocated_vec); ++ + /* If the shader doesn't write any colour or depth outputs, it may + * still need an ATEST at the very end! 
*/ + bool need_dummy_atest = +diff --git a/src/panfrost/bifrost/cmdline.c b/src/panfrost/bifrost/cmdline.c +index 2a11486cbed..9a39159e7d4 100644 +--- a/src/panfrost/bifrost/cmdline.c ++++ b/src/panfrost/bifrost/cmdline.c +@@ -130,7 +130,7 @@ compile_shader(int stages, char **files) + shader_types[i] = filename_to_stage(files[i]); + + struct standalone_options options = { +- .glsl_version = 300, /* ES - needed for precision */ ++ .glsl_version = 460, + .do_link = true, + .lower_precision = true + }; +diff --git a/src/panfrost/ci/deqp-panfrost-g610.toml b/src/panfrost/ci/deqp-panfrost-g610.toml +new file mode 100644 +index 00000000000..6bad2fb44de +--- /dev/null ++++ b/src/panfrost/ci/deqp-panfrost-g610.toml +@@ -0,0 +1,11 @@ ++# Basic test set ++[[deqp]] ++deqp = "/deqp/modules/gles2/deqp-gles2" ++caselists = ["/deqp/mustpass/gles2-master.txt"] ++deqp_args = [ ++ "--deqp-surface-width=256", "--deqp-surface-height=256", ++ "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", ++ "--deqp-gl-config-name=rgba8888d24s8ms0", ++] ++version_check = "GL ES 3.1.*git" ++renderer_check = "Mali-G610" +diff --git a/src/panfrost/csf_test/interpret.py b/src/panfrost/csf_test/interpret.py +new file mode 100755 +index 00000000000..081d32d94c9 +--- /dev/null ++++ b/src/panfrost/csf_test/interpret.py +@@ -0,0 +1,1820 @@ ++#!/usr/bin/env python3 ++ ++import os ++import re ++import struct ++import subprocess ++import sys ++ ++try: ++ py_path = os.path.dirname(os.path.realpath(__file__)) + "/../bifrost/valhall" ++except: ++ py_path = "../bifrost/valhall" ++ ++if py_path not in sys.path: ++ sys.path.insert(0, py_path) ++ ++import asm ++import struct ++ ++def ff(val): ++ return struct.unpack("=f", struct.pack("=I", val))[0] ++ ++def ii(val): ++ return struct.unpack("=I", struct.pack("=f", val))[0] ++ ++shaders = { ++ "atomic": """ ++IADD_IMM.i32.reconverge r0, 0x0, #0x0 ++NOP.wait0 ++ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 ++BRANCHZ.eq.reconverge ^r1.h0, offset:1 ++BRANCHZ.eq 0x0, offset:3 ++ATOM1_RETURN.i32.slot0.ainc @r1, u0, offset:0x0 ++IADD_IMM.i32 r0, ^r0, #0x1 ++BRANCHZ.eq.reconverge 0x0, offset:-7 ++NOP.end ++""", ++ "rmw": """ ++IADD_IMM.i32.reconverge r0, 0x0, #0x0 ++ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 ++BRANCHZ.eq.reconverge r1.h0, offset:1 ++BRANCHZ.eq 0x0, offset:6 ++NOP.wait1 ++LOAD.i32.unsigned.slot0.wait0 @r1, u0, offset:0 ++IADD_IMM.i32 r1, ^r1, #0x1 ++STORE.i32.slot1 @r1, u0, offset:0 ++IADD_IMM.i32 r0, ^r0, #0x1 ++BRANCHZ.eq.reconverge 0x0, offset:-9 ++NOP.end ++""", ++ "global_invocation": """ ++IADD_IMM.i32 r0, ^r60, #0x1 ++STORE.i32.slot0.end @r0, u0, offset:0 ++""", ++ "invoc_offset": """ ++LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 ++IADD.s32 r0, u0, ^r0 ++ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 ++IADD.s32 r1, ^r1, u1 ++MOV.i32 r2, u2 ++STORE.i32.slot0.end @r2, ^r0, offset:0 ++""", ++ "invoc_rmw": """ ++LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 ++IADD.s32 r0, u0, ^r0 ++ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 ++IADD.s32 r1, ^r1, u1 ++LOAD.i32.unsigned.slot0.wait0 @r2, r0, offset:0 ++IADD.s32 r2, ^r2, u2 ++STORE.i32.slot1.end @r2, ^r0, offset:0 ++""", ++ ++ "preframe": """ ++U16_TO_U32.discard r0, r59.h00 ++U16_TO_U32 r1, ^r59.h10 ++IADD_IMM.i32 r2, 0x0, #0x1 ++IADD_IMM.i32 r3, 0x0, #0x0 ++TEX_FETCH.slot0.skip.f.32.2d.wait @r4:r5:r6:r7, @r0:r1, ^r2 ++FADD.f32 r4, ^r4, 0x40490FDB ++FADD.f32 r5, ^r5, 0x40490FDB ++BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 ++""", ++ ++ ++ "position": """ ++LEA_BUF_IMM.slot0.wait0 @r4:r5, r59, table:0xD, index:0x0 ++#BRANCHZI.absolute 
0x1000000, ^r4 ++# position of 16384 ++IADD_IMM.i32 r2, 0x0, #0x0e ++# position of 16 ++IADD_IMM.i32 r2, 0x0, #0x04 ++LSHIFT_OR.i32 r0, 0x03020100.b1, r2, 0x0 ++LSHIFT_AND.i32 r0, r60, r2, ^r0 ++IADD_IMM.i32 r1, 0x0, #0x01 ++RSHIFT_AND.i32 r1, r60, 0x03020100.b11, ^r1 ++LSHIFT_OR.i32 r1, ^r1, ^r2, 0x0 ++S32_TO_F32 r0, ^r0 ++S32_TO_F32 r1, ^r1 ++ ++RSHIFT_OR.i32 r2, ^r60, 0x03020100.b22, 0x0 ++S32_TO_F32 r2, ^r2 ++FADD.f32 r0, ^r0, r2.neg ++#FADD.f32 r1, ^r1, ^r2 ++S32_TO_F32 r2, ^r60 ++#MOV.i32 r1, 0x0 ++ ++FADD.f32 r0, ^r0, 0x40490FDB ++FADD.f32 r1, ^r1, 0x40490FDB ++#FMA.f32 r2, ^r2, 0x3DCCCCCD, 0x0 ++MOV.i32 r2, 0x3DCCCCCD ++MOV.i32 r3, 0x0 ++ ++#STORE.i128.slot0 @r0:r1:r2:r3, thread_local_pointer, offset:0 ++ ++IADD_IMM.i32 r8, 0x0, #0x00004000 ++STORE.i16.istream.slot0 @r8, r4, offset:64 ++ ++STORE.i128.istream.slot0 @r0:r1:r2:r3, r4, offset:0 ++STORE.i128.slot0.end @r0:r1:r2:r3, ^r4, offset:0x7000 ++""", ++ ++ "fragment": """ ++ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, u0, offset:0 ++IADD_IMM.i32 r1, 0x0, #0x1ff ++LSHIFT_AND.i32 r0, ^r0, 0x0, ^r1 ++SHADDX.u64 r2, u2, ^r0.w0, shift:0x2 ++STORE.i32.slot0.wait0 @r59, ^r2, offset:0 ++ ++IADD_IMM.i32 r4, 0x0, #0x3f100000 ++IADD_IMM.i32 r5, 0x0, #0x3f400000 ++IADD_IMM.i32 r6, 0x0, #0x3f300000 ++IADD_IMM.i32 r7, 0x0, #0x32cccccd ++BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 ++""", ++ ++} ++ ++flg = 0xf ++#flg = 0x20000f # Uncached! ++ ++HEAP_SIZE = 1024 * 1024 ++ ++memory = { ++ "ev": (8192, 0x8200f), ++ "x": 1024 * 1024, ++ "y": 4096, ++ "ls_alloc": 4096, ++ "occlusion": 4096, ++ ++ "ssbo": 4096, ++ "tls": 4096, ++ ++ #"plane_0": (256 * 256 * 32, 0x380f), # 2 MB ++ "plane_0": (256 * 256 * 32, 0x280f), # 2 MB ++ ++ "idk": HEAP_SIZE, ++ "heap": HEAP_SIZE, ++} ++ ++w = 0xffffffff ++ ++# Words are 32-bit, apart from address references ++descriptors = { ++ "shader": [0x118, 1 << 12, "invoc_rmw"], ++ "ls": [3, 31, "ls_alloc"], ++ "fau": [("ssbo", 0), ("ssbo", 16)], ++ "fau2": [("ev", 8 + (0 << 34)), 7, 0], ++ ++ "tiler_heap": [ ++ 0x029, 1 << 21, #HEAP_SIZE, ++ 0x1000, 0x60, 0x1040, 0x60, 0x1000 + (1 << 21), 0x60 ++ #"heap", ("heap", 64), ("heap", HEAP_SIZE), ++ ], ++ ++} | { ++ x: [ ++ 0, 0, ++ # Hierarchy mask, ++ # Single-sampled ++ # Last provoking vertex ++ 0x6 | (0 << 18), ++ 0x00ff00ff, ++ # Layer ++ 0, 0, ++ "tiler_heap", ++ ("idk", 0x10), ++ #("tiler_heap", -0xfff0), ++ # "Weights" ++ ] + ([0] * (32 - 10)) + [ ++ # "State" ++ 0, ++ 31, ++ 0, ++ 0x10000000, ++ ] for x in ("tiler_ctx", "tiler_ctx2", "tiler_ctx3") ++} | { ++ ++ "thread_storage": [ ++ 1, 31, ++ "tls", ++ 0, 0, ++ ], ++ ++ # Preload r59/r60 ++ "preframe_shader": [0x128, 3 << 11, "preframe"], ++ "position_shader": [0x138, 3 << 11, "position"], ++ "fragment_shader": [0x128, 3 << 11, "fragment"], ++ ++ "idvs_zs": [ ++ 0x70077, # Depth/stencil type, Always for stencil tests ++ 0, 0, # Stencil state ++ 0, # unk ++ # Depth source minimum, write disabled ++ # [0, 1] Depth clamp ++ # Depth function: Always ++ (1 << 23) | (7 << 29), ++ 0, # Depth units ++ 0, # Depth factor ++ 0, # Depth bias clamp ++ ], ++ ++ "preframe_zs": [ ++ 0x70077, # Depth/stencil type, Always for stencil tests ++ 0, 0, # Stencil state ++ 0, # unk ++ # Depth source minimum, write disabled ++ # [0, 1] Depth clamp ++ # Depth function: Always ++ (1 << 23) | (7 << 29), ++ 0, # Depth units ++ 0, # Depth factor ++ 0, # Depth bias clamp ++ ], ++ ++ "idvs_blend": [ ++ # Load dest, enable ++ 1 | (1 << 9), ++ # RGB/Alpha: Src + Zero * Src ++ # All channels ++ ((2 | (2 << 4) | (1 << 8)) * 
0x1001) | (0xf << 28), ++ # Fixed function blending, four components ++ 2 | (3 << 3), ++ # RGBA8 TB pixel format / F32 register format ++ 0 | (237 << 12) | (0 << 22) | (1 << 24), ++ ], ++ ++ "preframe_blend": [ ++ # Load dest, enable ++ 1 | (1 << 9), ++ # RGB/Alpha: Src + Zero * Src ++ # All channels ++ ((2 | (2 << 4) | (1 << 8)) * 0x1001) | (0xf << 28), ++ # Fixed function blending, four components ++ 2 | (3 << 3), ++ # RGBA8 TB pixel format / F32 register format ++ 0 | (237 << 12) | (0 << 22) | (1 << 24), ++ ], ++ ++ "preframe_surface": [ ++ # Plane descriptor, generic, tiled, RAW32 clump format ++ 10 | (1 << 4) | (1 << 8) | (2 << 24), ++ 256 * 256 * 4, ++ "plane_0", ++ 0, ++ 0, 0, ++ 0, # was 15, ++ ], ++ ++ "preframe_table": [ ++ # Texture descriptor, 2D, format ++ 2 | (2 << 4) | (187 << (10 + 12)), ++ # Width, height ++ 255 | (255 << 16), ++ # Swizzle, interleave ++ 1672 | (1 << 12), ++ 0, ++ "preframe_surface", ++ 0, 0, ++ ++ # Sampler descriptor, clamp to edge ++ 1 | (9 << 8) | (9 << 12) | (9 << 16), ++ 0, 0, 0, 0, 0, 0, 0, ++ ], ++ ++ "preframe_resources": [ ++ ("preframe_table", (1 << (32 + 24))), 0x40, 0, ++ ], ++ ++ "dcds": [ ++ # Clean fragment write, primitive barrier ++ (1 << 9) | (1 << 10), ++ # Sample mask of 0xffff, RT mask of 1 ++ 0x1ffff, ++ 0, 0, # vertex array ++ 0, 0, # unk ++ 0, 0x3f800000, # min/max depth ++ 0, 0, # unk ++ "preframe_zs", # depth/stencil ++ ("preframe_blend", 1), # blend (count == 1) ++ 0, 0, # occlusion ++ ++ # Shader environment: ++ 0, # Attribute offset ++ 2, # FAU count ++ 0, 0, 0, 0, 0, 0, # unk ++ ("preframe_resources", 1), # Resources ++ "preframe_shader", # Shader ++ 0, 0, # Thread storage ++ "fau", # FAU ++ ], ++ ++ "framebuffer": [ ++ 1, 0, # Pre/post, downscale, layer index ++ 0x10000, 0, # Argument ++ "ls_alloc", # Sample locations ++ "dcds", # DCDs ++ 0x00ff00ff, # width / height ++ 0, 0x00ff00ff, # bound min/max ++ # 32x32 tile size ++ # 4096 byte buffer allocation (maybe?) ++ (10 << 9) | (4 << 24), ++ 0, # Disable S, ZS/CRC, Empty Tile, CRC ++ 0, # Z Clear ++ "tiler_ctx", # Tiler ++ ++ # Framebuffer padding ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ ++ # Render target ++ # R8G8B8A8 internal format ++ (1 << 26), ++ # Write Enable ++ # R8G8B8A8 colour format ++ # Linear block format ++ # 0123 swizzle ++ # Clean pixel write enable ++ 1 | (19 << 3) | (1 << 8) | (0o3210 << 16) | (1 << 31), ++ ++ # AFBC overlay ++ # No YTR, no split, no wide, no reverse, no front, no alpha ++ # RGBA8 compression mode ++ 0 | (10 << 10), ++ 0, 0, 0, 0, 0, ++ ++ # RT Buffer ++ "plane_0", ++ 256 * 4 * 16, # Row stride (for tiling) ++ 0x400, # Surface stride / Body offset ++ ++ # RT Clear ++ 0x2e234589, 0, 0, 0, ++ ], ++ ++ "index_buffer": [ ++ 0, 1, 2, ++ 0, 2, 1, ++ 1, 0, 2, ++ 1, 2, 0, ++ 2, 0, 1, ++ 2, 1, 0, ++ ++ #63, 64, 65, ++ 1, 2, 3, ++ 4, 5, 6, ++ 12, 13, 14, ++ 0, 1, 2, ++ 4, 5, 6, ++ 8, 9, 10, ++ 3, 4, 5, ++ ], ++ ++ "point_index": [x * 4 for x in range(32)] + [ ++ 0, 64, 440, 0, ++ ], ++ ++ "position_data": [ ++ ii(10.0), ii(10.0), ii(1.0), ii(1.0), ++ ], ++} ++ ++# TODO: Use mako? Or just change the syntax for "LDM/STM" ++# and use f-strings again? 
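++
++# Editor's note: illustrative sketch, not part of the original patch. The
++# descriptor tables above are plain lists of 32-bit words; string entries
++# (or (name, offset) tuples) turn into relocations that get patched in
++# later by Context.add_descriptors(), which pairs consecutive words into
++# 64-bit buffer entries roughly as the hypothetical helper below does
++# (it is never called anywhere).
++def _pack_words_example(words):
++    """Pair 32-bit words into little-endian 64-bit buffer entries."""
++    assert len(words) % 2 == 0
++    it = iter(words)
++    return [lo | (hi << 32) for lo, hi in zip(it, it)]
++
++# For example, the first two words of the "shader" descriptor pack as:
++#   _pack_words_example([0x118, 1 << 12]) == [0x100000000118]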
++ ++cmds = """ ++!cs 0 ++resources fragment ++ ++@ Bound min ++mov w2a, i16:0,0 ++@ Bound max ++mov w2b, i16:255,255 ++mov x28, $framebuffer+1 ++ ++slot 2 ++ ++fragment ++ ++mov w4a, #0x0 ++UNK 02 24, #0x4a0000ff0211 ++wait 1 ++ ++mov x50, $ev ++evstr w5f, [x50], unk 0xfd, irq ++ ++!raw sleep 20 ++!memset plane_0 0 0 262144 ++!raw sleep 200 ++!dump plane_0 0 12 ++!heatmap plane_0 0 262144 gran 4096 len 32768 stride 32768 ++""" ++ ++altcmds = """ ++!cs 0 ++ ++@ Some time is required for the change to become active ++@ Just submitting a second job appears to be enough ++resources compute fragment tiler idvs ++mov x48, #0x6000000000 ++heapctx x48 ++ ++!cs 0 ++ ++slot 3 ++wait 3 ++heapinc vt_start ++ ++@ Base vertex count ++mov w24, 0 ++@ Instance count ++mov w22, 1 ++ ++@ Vertex attribute stride ++mov x30, 0 ++ ++@ Primitive ++mov w38, 0x430000 ++@@ Draw ++@ Pixel kill etc. ++@ Enable occlusion query ++@mov w39, 0xc000 ++mov w39, 0 ++@ Unk... ++mov w26, 0x1000 ++@ Sample mask / render target mask ++mov w3a, 0x1ffff ++@ Min/max Z ++mov w2c, float:0 ++mov w2d, float:1.0 ++@ Depth/stencil ++mov x34, $idvs_zs ++@ Blend ++mov x32, $idvs_blend+1 ++@ Occlusion ++mov x2e, $occlusion ++ ++@ Primitive size ++mov x3c, float:3.75 ++@ Fragment shader environment ++mov x14, $fragment_shader ++@ FAU count == 2 ++movp x0c, $fau+0x0200000000000000 ++ ++@ Position shader environment ++mov x10, $position_shader ++ ++mov x18, $thread_storage ++ ++@ is this right?! "Vertex attribute stride" apparently? ++@ that was for pure tiler jobs, for idvs it messes up points/lines ++@ for some reason ++@mov x30, $position_data ++ ++@ Tiler ++mov x28, $tiler_ctx ++ ++@ Scissor min ++mov w2a, i16:0,0 ++@ Scissor max ++mov w2b, i16:255,255 ++ ++mov w21, 18 ++mov w27, 4096 ++mov x36, $index_buffer ++ ++idvs 0x4002, mode triangles, index uint32 ++ ++mov w21, 1 @36 ++mov w27, 4096 ++mov x36, $point_index ++ ++@idvs 0x4a42, mode points, index uint32 ++ ++mov w21, 400000 ++mov w21, 18 ++@idvs 0x4a42, mode triangles, index none ++ ++@idvs 0x4a42, mode points, index none ++@idvs 0x4a42, mode line-loop, index none ++ ++flush_tiler ++wait 3 ++heapinc vt_end ++ ++mov x50, $ev ++evstr w5f, [x50], unk 0xfd, irq ++ ++UNK 00 24, #0x5f0000000233 ++wait all ++ ++!dump64 tiler_heap 0 4096 ++@!dump idk 0 1048576 ++@!dump position_data 0 4096 ++ ++!cs 0 ++ ++UNK 00 24, #0x5f0000000233 ++wait all ++ ++slot 4 ++wait 4 ++heapinc vt_start ++ ++mov x28, $tiler_ctx2 ++idvs 0x4002, mode triangles, index none ++flush_tiler ++wait 4 ++heapinc vt_end ++ ++UNK 00 24, #0x5f0000000233 ++wait all ++ ++mov x50, $ev ++evstr w5f, [x50], unk 0xfd, irq ++ ++!dump64 tiler_heap 0 4096 ++ ++!cs 0 ++ ++mov x50, $ev ++ ++@ Bound min ++mov w2a, i16:0,0 ++@ Bound max ++mov w2b, i16:255,255 ++mov x28, $framebuffer+1 ++@ Tile enable map ++mov x2c, $x ++mov x2e, 64 ++ ++mov w40, 1 ++str w40, [x2c] ++@str w40, [x2c, 128] ++ ++@ Use tile enable map ++@fragment tem 1 ++ ++fragment ++ ++@ Does this actually do anytihng? 
++mov x48, $tiler_ctx ++ldr x4a, [x48, 40] ++ldr x4c, [x48, 48] ++wait 0,4 ++UNK 02 0b, 0x4a4c00100001 ++ ++mov x48, $tiler_ctx2 ++ldr x4a, [x48, 40] ++ldr x4c, [x48, 48] ++wait 0,4 ++UNK 02 0b, 0x4a4c00100001 ++ ++UNK 02 24, #0x5f0000f80211 ++@UNK 00 24, #0x5f0000000233 ++wait 1 ++ ++mov x54, $plane_0 ++ldr x56, [x54] ++wait 0 ++ ++mov x52, $y ++str x56, [x52] ++ ++evstr w5f, [x50], unk 0xfd, irq ++ ++!raw td ++!fdump heap 0 1048576 ++!tiler heap 0 1048576 ++ ++ ++@!dump rt_buffer 0 4096 ++!dump y 0 4096 ++@!dump plane_0 0 524288 ++@!heatmap plane_0 0 524288 gran 0x80 len 0x200 stride 0x4000 ++!heatmap plane_0 0 8192 gran 0x04 len 0x20 stride 0x400 ++!dump occlusion 0 4096 ++@!dump ssbo 0 4096 ++ ++!dump64 tiler_heap 0 4096 ++!dump tiler_ctx 0 4096 ++!dump tiler_ctx2 0 4096 ++ ++@!fdump heap 0 1048576 ++ ++!cs 0 ++ ++slot 3 ++wait 3 ++heapinc vt_start ++ ++mov x28, $tiler_ctx3 ++mov w2c, float:0 ++mov w2d, float:1.0 ++mov x2e, $occlusion ++ ++idvs 0x4002, mode triangles, index none ++flush_tiler ++wait 3 ++heapinc vt_end ++ ++UNK 00 24, #0x5f0000000233 ++wait all ++ ++mov x50, $ev ++evstr w5f, [x50], unk 0xfd, irq ++ ++!dump64 tiler_heap 0 4096 ++!dump tiler_ctx 0 4096 ++!raw td ++ ++""" ++ ++docopy = """ ++ldr {w00-w0f}, [x52] ++ldr {w10-w1f}, [x52, 64] ++ldr {w20-w2f}, [x52, 128] ++ldr {w30-w3f}, [x52, 192] ++add x52, x52, 256 ++ ++loop: ++wait 0 ++ ++str {w00-w0f}, [x54] ++ldr {w00-w0f}, [x52] ++str {w10-w1f}, [x54, 64] ++ldr {w10-w1f}, [x52, 64] ++str {w20-w2f}, [x54, 128] ++ldr {w20-w2f}, [x52, 128] ++str {w30-w3f}, [x54, 192] ++ldr {w30-w3f}, [x52, 192] ++ ++add x54, x54, 256 ++add x52, x52, 256 ++add x50, x50, -256 ++ ++b.ne w50, loop ++b.ne w51, loop ++""" ++ ++oldcmds = f""" ++!cs 0 ++ ++mov x50, 0x8000000 ++ ++mov x52, $from ++mov x54, $to ++mov x56, $x ++mov x58, $ev ++mov x5a, $y ++ ++str cycles, [x56] ++{docopy} ++str cycles, [x56, 8] ++ ++UNK 00 24, #0x5f0000000233 ++evstr w5f, [x58], unk 0xfd, irq ++ ++!cs 1 ++ ++mov x50, 0x8000000 ++ ++mov x52, $from ++mov x54, $to ++mov x56, $x ++mov x58, $ev ++mov x5a, $y ++ ++add x52, x52, 0x8000000 ++add x54, x54, 0x8000000 ++add x56, x56, 32 ++ ++nop ++nop ++ ++str cycles, [x56] ++{docopy} ++str cycles, [x56, 8] ++ ++UNK 00 24, #0x5f0000000233 ++evstr w5f, [x58], unk 0xfd, irq ++ ++!delta x 0 4096 ++""" ++ ++oldcmds = """ ++!cs 0 ++endpt compute ++!cs 0 ++ ++@ Workgroup size 1x1x1, merging allowed ++mov w21, 0x80000000 ++ ++@ Workgroup count 1x1x1 ++mov w25, 1 ++mov w26, 1 ++mov w27, 1 ++ ++@ Offset 0,0,0 ++mov w22, 0 ++mov w23, 0 ++mov w24, 0 ++ ++@ TODO: offset x/y/z ++ ++@ Resources ++mov x06, 0 ++ ++@ Shader ++mov x16, $shader ++ ++@ Local storage ++mov x1e, $ls ++ ++@ FAU ++movp x0e, $fau+0x0200000000000000 ++ ++slot 2 ++wait 2 ++ ++UNK 0400000000008200 ++ ++mov x58, $fau ++ldr x56, [x58] ++wait 0 ++ ++@mov w4a, 0 ++ ++@slot 6 ++@mov x54, $x ++@UNK 02 24, #0x4a0000f80211 ++@ldr x52, [x56] ++@wait 0,1 ++@str x52, [x54] ++ ++mov w40, 60 ++1: add w40, w40, -1 ++ ++@mov w4a, #0x0 ++@UNK 02 24, #0x4a0000f80211 ++@wait 1 ++ ++@mov w54, #0 ++@UNK 00 24, #0x540000000233 ++@wait all ++ ++slot 2 ++wait 2 ++ ++add w22, w22, 1 ++@UNK 0400ff0000008200 ++ ++@b.ne w40, 1b ++ ++!dump x 0 4096 ++!dump y 0 4096 ++!dump ev 0 4096 ++""" ++ ++oldcmds = """ ++!cs 0 ++ ++mov x48, $x ++ ++mov w21, 0x80000000 ++mov w25, 1 ++mov w26, 1 ++mov w27, 1 ++ ++movp x0e, $fau+0x0200000000000000 ++ ++@ Write FAUs ++@add x0e, x48, 64 ++@mov x50, $ev ++@str x50, [x0e] ++@mov x30, 10 ++@str x30, [x0e, 8] ++@add w0f, w0f, 0x02000000 ++ ++@ Write shader descriptor 
++@add x16, x48, 128 ++@mov x30, 0x118 ++@str x30, [x16] ++@mov x30, $compute ++@str x30, [x16, 8] ++ ++wait 0 ++ ++add x1e, x48, 192 ++ ++mov x30, $y ++@regdump x30 ++@mov x30, 0 ++ ++resources compute ++slot 2 ++mov w54, #0xffffe0 ++UNK 00 24, #0x540000000233 ++ ++wait all ++ ++mov x54, 0 ++mov w56, 0 ++mov w5d, 1 ++ ++slot 2 ++wait 2 ++wait 2 ++regdump x30 ++UNK 0400ff0000008200 ++add x30, x30, 0x200 ++regdump x30 ++slot 2 ++wait 2 ++ ++mov w40, 1000 ++1: add w40, w40, -1 ++str cycles, [x50, 32] ++b.ne w40, 1b ++ ++wait 0 ++wait all ++ ++@ 6 / 10 / 14 ++mov w40, 1 ++1: add w40, w40, -1 ++UNK 0400ff0000000200 ++b.ne w40, 1b ++ ++mov w40, 1000 ++1: add w40, w40, -1 ++str cycles, [x50, 32] ++b.ne w40, 1b ++ ++mov w42, 200 ++mov w40, 100 ++1: add w40, w40, -1 ++@wait all ++@UNK 0400ff0000008001 @ compute ++ ++@UNK 0400ff0000000001 ++@UNK 2501504200000004 @ evadd ++@UNK 3 24, #0x4a0000000211 ++ ++@wait all ++b.ne w40, 1b ++ ++@UNK 2601504200000004 ++ ++str cycles, [x50, 40] ++str cycles, [x50, 48] ++UNK 02 24, #0x4a0000000211 ++wait 0 ++ ++add x5c, x50, 64 ++evadd w5e, [x5c], unk 0xfd ++evadd w5e, [x5c], unk 0xfd, irq, unk0 ++ ++!dump x 0 4096 ++!dump y 0 4096 ++!delta ev 0 4096 ++""" ++ ++altcmds = """ ++!cs 0 ++!alloc x 4096 ++!alloc ev 4096 0x8200f ++!alloc ev2 4096 0x8200f ++ ++mov x10, $x ++UNK 00 30, #0x100000000000 ++add x12, x10, 256 ++str cycles, [x12] ++mov x5a, $ev2 ++mov x48, 0 ++mov w4a, 0 ++slot 3 ++wait 3 ++UNK 00 31, 0 ++mov x48, $ev ++mov w4a, 0x4321 ++add x46, x48, 64 ++mov w42, 0 ++ ++str cycles, [x12, 8] ++UNK 01 26, 0x484a00000005 ++str cycles, [x12, 16] ++UNK 01 26, 0x484a00000005 ++str cycles, [x12, 24] ++ ++nop ++ ++mov w10, 10000 ++1: ++UNK 01 26, 0x484a00000005 ++add w10, w10, -1 ++b.ne w10, 1b ++str cycles, [x12, 32] ++ ++mov w10, 10000 ++1: ++UNK 01 26, 0x484a00000005 ++@UNK 02 24, #0x420000000211 ++add w10, w10, -1 ++b.ne w10, 1b ++str cycles, [x12, 40] ++ ++ldr x16, [x48, 0] ++wait 0 ++str x16, [x48, 16] ++ ++UNK 00 31, 0x100000000 ++ ++mov w4a, #0x0 ++UNK 02 24, #0x4a0000000211 ++ ++mov w5e, 1 ++add x5c, x5a, 0x100 ++UNK 01 25, 0x5c5e00f80001 ++ ++!delta x 0 4096 ++!dump ev 0 4096 ++!dump ev2 0 4096 ++""" ++ ++altcmds = """ ++!cs 0 ++!alloc x 4096 ++!alloc ev 4096 0x8200f ++ ++iter vertex ++slot 2 ++ ++mov x40, $x ++mov w10, 1 ++mov x48, 0 ++mov w4a, 0 ++call w4a, x48 ++ nop ++ nop ++ nop ++ mov x20, $. 
++@ movp x22, 0x0126000011223344 ++ movp x22, 0x1600000060000001 ++ str x22, [x20, 56] ++ 1: nop ++ b 1b ++ nop ++ add x40, x40, #256 ++ regdump x40 ++ ++mov x5a, #0x5ff7fd6000 ++mov x48, $ev ++mov x40, #0x5ff7fd6000 ++mov w54, #0x1 ++UNK 00 24, #0x540000000233 ++wait 0 ++slot 6 ++@UNK 00 31, #0x0 ++UNK 00 09, #0x0 ++wait 6 ++@UNK 00 31, #0x100000000 ++mov x4a, x40 ++UNK 01 26, 0x484a00040001 ++ ++!dump x 0 4096 ++@!dump ev 0 4096 ++@!delta x 0 4096 ++""" ++ ++cycletest = """ ++mov w10, 10 ++1: ++str cycles, [x5c] ++add x5c, x5c, 8 ++add w10, w10, -1 ++mov w11, 100000 ++ ++inner: ++add w11, w11, -1 ++b.ne w11, inner ++ ++b.ne w10, 1b ++""" ++ ++def get_cmds(cmd): ++ return cmds.replace("{cmd}", str(cmd)) ++ ++def assemble_shader(text): ++ lines = text.strip().split("\n") ++ lines = [l for l in lines if len(l) > 0 and l[0] not in "#@"] ++ return [asm.parse_asm(ln) for ln in lines] ++ ++class Buffer: ++ id = 0 ++ ++ def __init__(self): ++ self.id = Buffer.id ++ Buffer.id += 1 ++ ++def resolve_rel(to, branch): ++ return (to - branch) // 8 - 1 ++ ++def to_int16(value): ++ assert(value < 36768) ++ assert(value >= -32768) ++ return value & 0xffff ++ ++class Level(Buffer): ++ def __init__(self, indent): ++ super().__init__() ++ ++ self.indent = indent ++ self.buffer = [] ++ self.call_addr_offset = None ++ self.call_len_offset = None ++ ++ self.labels = {} ++ self.label_refs = [] ++ # Numeric labels can be reused, so have to be handled specially. ++ self.num_labels = {} ++ self.num_refs = {} ++ ++ def offset(self): ++ return len(self.buffer) * 8 ++ ++ def __repr__(self): ++ buf = " ".join(hex(x) for x in self.buffer) ++ return f"buffer {self.id} {self.offset()} 0x200f {buf}" ++ ++ def buffer_add_value(self, offset, value): ++ self.buffer[offset // 8] += value ++ ++ def process_relocs(self, refs, to=None): ++ for ref, offset, type_ in refs: ++ assert(type_ == "rel") ++ ++ if to is None: ++ goto = self.labels[ref] ++ else: ++ goto = to ++ ++ value = to_int16(resolve_rel(goto, offset)) ++ self.buffer_add_value(offset, value) ++ ++ def finish(self): ++ self.process_relocs(self.label_refs) ++ ++class Alloc(Buffer): ++ def __init__(self, size, flags=0x280f): ++ super().__init__() ++ ++ self.size = size ++ self.flags = flags ++ self.buffer = [] ++ ++ def __repr__(self): ++ buf = " ".join(hex(x) for x in self.buffer) ++ return f"buffer {self.id} {self.size} {hex(self.flags)} {buf}" ++ ++def fmt_reloc(r, name="reloc"): ++ dst, offset, src, src_offset = r ++ return f"{name} {dst}+{offset} {src}+{src_offset}" ++ ++def fmt_exe(e): ++ return " ".join(str(x) for x in e) ++ ++class Context: ++ def __init__(self): ++ self.levels = [] ++ self.l = None ++ ++ self.allocs = {} ++ self.completed = [] ++ self.reloc = [] ++ self.reloc_split = [] ++ ++ self.exe = [] ++ self.last_exe = None ++ ++ self.is_call = False ++ ++ def set_l(self): ++ if len(self.levels): ++ self.l = self.levels[-1] ++ ++ def pop_until(self, indent): ++ while self.l.indent != indent: ++ l = self.levels.pop() ++ self.completed.append(l) ++ ++ self.set_l() ++ if not len(self.levels): ++ return ++ ++ buf_len = l.offset() ++ ++ r = self.l ++ self.reloc.append((r.id, r.call_addr_offset * 8, l.id, 0)) ++ r.buffer[r.call_len_offset] = ( ++ (r.buffer[r.call_len_offset] & (0xffff << 48)) + ++ buf_len) ++ r.buffer[r.call_addr_offset] &= (0xffff << 48) ++ ++ r.call_addr_offset = None ++ r.call_len_offset = None ++ ++ def flush_exe(self): ++ ind = self.levels[0].indent ++ ++ self.pop_until(ind) ++ if len(self.levels[0].buffer): ++ l = self.levels.pop() ++ 
l.finish() ++ self.completed.append(l) ++ ++ self.levels.append(Level(ind)) ++ self.set_l() ++ ++ if not len(self.exe): ++ return ++ ++ if self.last_exe is None: ++ print("# Trying to add multiple CSs to an exe line, becoming confused") ++ return ++ ++ if len(self.completed): ++ p = self.completed[-1] ++ assert(p.indent == ind) ++ ++ self.exe[self.last_exe] += [p.id, p.offset()] ++ ++ self.last_exe = None ++ ++ def add_shaders(self, shaders): ++ for sh in shaders: ++ qwords = assemble_shader(shaders[sh]) ++ sh = sh.lower() ++ ++ a = Alloc(len(qwords) * 8, flags=0x2017) ++ a.buffer = qwords ++ self.allocs[sh] = a ++ ++ def add_memory(self, memory): ++ for m in memory: ++ f = memory[m] ++ if isinstance(f, int): ++ size, flags = f, 0x280f ++ else: ++ size, flags = f ++ self.allocs[m] = Alloc(size, flags) ++ ++ def add_descriptors(self, descriptors): ++ for d in descriptors: ++ words = descriptors[d] ++ a = Alloc(0) ++ ++ buf = [] ++ for w in words: ++ if isinstance(w, int): ++ buf.append(w) ++ else: ++ if isinstance(w, str): ++ alloc, offset = w, 0 ++ else: ++ alloc, offset = w ++ ref = self.allocs[alloc] ++ self.reloc.append((a.id, len(buf) * 4, ++ ref.id, offset)) ++ buf.append(0) ++ buf.append(0) ++ ++ it = iter(buf) ++ a.buffer = [x | (y << 32) for x, y in zip(it, it)] ++ a.size = len(a.buffer) * 8 ++ self.allocs[d] = a ++ ++ def interpret(self, text): ++ text = text.split("\n") ++ ++ old_indent = None ++ ++ for orig_line in text: ++ #print(orig_line, file=sys.stderr) ++ ++ line = orig_line.split("@")[0].expandtabs().rstrip().lower() ++ if not line: ++ continue ++ ++ indent = len(line) - len(line.lstrip()) ++ line = line.lstrip() ++ ++ if old_indent is None: ++ self.levels.append(Level(indent)) ++ elif indent != old_indent: ++ if indent > old_indent: ++ assert(self.is_call) ++ ++ self.levels.append(Level(indent)) ++ else: ++ self.pop_until(indent) ++ ++ self.set_l() ++ ++ old_indent = indent ++ self.is_call = False ++ ++ given_code = None ++ ++ # TODO: Check against this to test the disassembler? 
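++            # Editor's note (illustrative, not from the original patch):
++            # every assembled CS instruction is one 64-bit word, built
++            # below as code = (cmd << 56) | (addr << 48) | value, i.e.
++            # opcode in bits 63:56, a register/selector byte in bits
++            # 55:48 and a 48-bit immediate. A line beginning with 16 hex
++            # digits supplies the expected encoding, which is checked
++            # against the assembled word (the "Mismatch!" print below).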
++ if re.match(r"[0-9a-f]{16} ", line): ++ given_code = int(line[:16], 16) ++ line = line[16:].lstrip() ++ ++ s = [x.strip(",") for x in line.split()] ++ ++ if s[0].endswith(":") or (len(s) == 1 and is_num(s[0])): ++ label = s[0] ++ if s[0].endswith(":"): ++ label = label[:-1] ++ ++ if is_num(label): ++ label = int(label) ++ if label in self.l.num_refs: ++ self.l.process_relocs(self.l.num_refs[label], self.l.offset()) ++ del self.l.num_refs[label] ++ self.l.num_labels[label] = self.l.offset() ++ else: ++ if label in self.l.labels: ++ print("Label reuse is not supported for non-numeric labels") ++ self.l.labels[label] = self.l.offset() ++ ++ s = s[1:] ++ if not len(s): ++ continue ++ ++ for i in range(len(s)): ++ if s[i].startswith("$"): ++ name, *offset = s[i][1:].split("+") ++ if name == ".": ++ buf = self.l ++ else: ++ buf = self.allocs[name] ++ if len(offset): ++ assert(len(offset) == 1) ++ offset = int(offset[0], 0) ++ else: ++ offset = 0 ++ ++ if s[0] == "movp": ++ rels = self.reloc_split ++ else: ++ rels = self.reloc ++ ++ rels.append((self.l.id, self.l.offset(), ++ buf.id, offset)) ++ s[i] = "#0x0" ++ ++ def is_num(str): ++ return re.fullmatch(r"[0-9]+", str) ++ ++ def hx(word): ++ return int(word, 16) ++ ++ def reg(word): ++ return hx(word[1:]) ++ ++ def val(word): ++ if word.startswith("float:"): ++ return ii(float(word.split(":")[1])) ++ elif word.startswith("i16:"): ++ lo, hi = word.split(":")[1].split(",") ++ lo, hi = val(lo), val(hi) ++ assert(lo < (1 << 16)) ++ assert(hi < (1 << 16)) ++ return (lo & 0xffff) | (hi << 16) ++ ++ value = int(word.strip("#"), 0) ++ assert(value < (1 << 48)) ++ return value ++ ++ sk = True ++ ++ if s[0] == "!cs": ++ assert(len(s) == 2) ++ self.flush_exe() ++ self.last_exe = len(self.exe) ++ self.exe.append(["exe", int(s[1])]) ++ continue ++ elif s[0] == "!parallel": ++ assert(len(s) == 2) ++ self.flush_exe() ++ self.last_exe = len(self.exe) - 1 ++ self.exe[-1] += [int(s[1])] ++ continue ++ elif s[0] == "!alloc": ++ assert(len(s) == 3 or len(s) == 4) ++ alloc_id = s[1] ++ size = int(s[2]) ++ flags = val(s[3]) if len(s) == 4 else 0x280f ++ self.allocs[alloc_id] = Alloc(size, flags) ++ continue ++ elif s[0] in ("!dump", "!dump64", "!fdump", "!delta", "!tiler"): ++ assert(len(s) == 4) ++ alloc_id = s[1] ++ offset = val(s[2]) ++ size = val(s[3]) ++ mode = { ++ "!dump": "hex", ++ "!dump64": "hex64", ++ "!fdump": "filehex", ++ "!delta": "delta", ++ "!tiler": "tiler", ++ }[s[0]] ++ self.exe.append(("dump", self.allocs[alloc_id].id, ++ offset, size, mode)) ++ continue ++ elif s[0] == "!heatmap": ++ assert(len(s) == 10) ++ assert(s[4] == "gran") ++ assert(s[6] == "len") ++ assert(s[8] == "stride") ++ alloc_id = s[1] ++ offset = val(s[2]) ++ size = val(s[3]) ++ granularity = val(s[5]) ++ length = val(s[7]) ++ stride = val(s[9]) ++ mode = "heatmap" ++ self.exe.append(("heatmap", self.allocs[alloc_id].id, ++ offset, size, granularity, length, stride)) ++ continue ++ elif s[0] == "!memset": ++ assert(len(s) == 5) ++ alloc_id = s[1] ++ offset = val(s[2]) ++ value = val(s[3]) ++ size = val(s[4]) ++ self.exe.append(("memset", self.allocs[alloc_id].id, ++ offset, value, size)) ++ continue ++ elif s[0] == "!raw": ++ self.exe.append(s[1:]) ++ continue ++ elif s[0] == "movp": ++ assert(len(s) == 3) ++ assert(s[1][0] == "x") ++ addr = reg(s[1]) ++ # Can't use val() as that has a max of 48 bits ++ value = int(s[2].strip("#"), 0) ++ ++ self.l.buffer.append((2 << 56) | (addr << 48) | (value & 0xffffffff)) ++ self.l.buffer.append((2 << 56) | ((addr + 1) << 48) ++ | ((value 
>> 32) & 0xffffffff)) ++ continue ++ elif s[0] == "regdump": ++ assert(len(s) == 2) ++ assert(s[1][0] == "x") ++ dest = reg(s[1]) ++ ++ # Number of registers to write per instruction ++ regs = 16 ++ ++ cmd = 21 ++ value = (dest << 40) | (((1 << regs) - 1) << 16) ++ ++ for i in range(0, 0x60, regs): ++ code = (cmd << 56) | (i << 48) | value | (i << 2) ++ self.l.buffer.append(code) ++ ++ del cmd, value ++ continue ++ ++ elif s[0] == "unk": ++ if len(s) == 2: ++ h = hx(s[1]) ++ cmd = h >> 56 ++ addr = (h >> 48) & 0xff ++ value = h & 0xffffffffffff ++ else: ++ assert(len(s) == 4) ++ cmd = hx(s[2]) ++ addr = hx(s[1]) ++ value = val(s[3]) ++ elif s[0] == "nop": ++ if len(s) == 1: ++ addr = 0 ++ value = 0 ++ cmd = 0 ++ else: ++ assert(len(s) == 3) ++ addr = hx(s[1]) ++ value = val(s[2]) ++ cmd = 0 ++ elif s[0] == "mov" and s[2][0] in "xw": ++ # This is actually an addition command ++ assert(len(s) == 3) ++ assert(s[1][0] == s[2][0]) ++ cmd = { "x": 17, "w": 16 }[s[1][0]] ++ addr = reg(s[1]) ++ value = reg(s[2]) << 40 ++ elif s[0] == "mov": ++ assert(len(s) == 3) ++ cmd = { "x": 1, "w": 2 }[s[1][0]] ++ addr = reg(s[1]) ++ value = val(s[2]) ++ elif s[0] == "add": ++ assert(len(s) == 4) ++ assert(s[1][0] == s[2][0]) ++ assert(s[1][0] in "wx") ++ cmd = 16 if s[1][0] == "w" else 17 ++ addr = reg(s[1]) ++ value = (reg(s[2]) << 40) | (val(s[3]) & 0xffffffff) ++ elif s[0] == "resources": ++ assert(len(s) >= 2) ++ types = ["compute", "fragment", "tiler", "idvs"] ++ cmd = 34 ++ addr = 0 ++ value = 0 ++ for t in s[1:]: ++ if t in types: ++ value |= 1 << types.index(t) ++ else: ++ value |= int(t, 0) ++ elif s[0] == "fragment": ++ cmd = 7 ++ addr = 0 ++ value = 0 ++ if len(s) != 1: ++ arg_map = { ++ "tem": {"0": 0, "1": 1}, ++ "render": { ++ "z_order": 0, ++ "horizontal": 0x10, ++ "vertical": 0x20, ++ "reverse_horizontal": 0x50, ++ "reverse_vertical": 0x60, ++ }, ++ "unk": {"0": 0, "1": 1 << 32}, ++ } ++ for arg, val in zip(s[1::2], s[2::2]): ++ value |= arg_map[arg][val] ++ elif s[0] == "wait": ++ assert(len(s) == 2) ++ cmd = 3 ++ addr = 0 ++ if s[1] == "all": ++ value = 255 ++ else: ++ value = sum(1 << int(x) for x in s[1].split(",")) ++ value <<= 16 ++ elif s[0] == "slot": ++ assert(len(s) == 2) ++ cmd = 23 ++ addr = 0 ++ value = int(s[1], 0) ++ elif s[0] == "add": ++ # TODO: unk variant ++ assert(len(s) == 4) ++ assert(s[1][0] == "x") ++ assert(s[2][0] == "x") ++ cmd = 17 ++ addr = reg(s[1]) ++ v = val(s[3]) ++ assert(v < (1 << 32)) ++ assert(v >= (-1 << 31)) ++ value = (reg(s[2]) << 40) | (v & 0xffffffff) ++ elif s[0] == "idvs": ++ assert(len(s) == 6) ++ unk = val(s[1]) ++ assert(s[2] == "mode") ++ modes = { ++ "none": 0, ++ "points": 1, ++ "lines": 2, ++ "line-strip": 4, ++ "line-loop": 6, ++ "triangles": 8, ++ "triangle-strip": 10, ++ "triangle-fan": 12, ++ "polygon": 13, ++ "quads": 14, ++ } ++ if s[3] in modes: ++ mode = modes[s[3]] ++ else: ++ mode = int(s[3]) ++ assert(s[4] == "index") ++ itypes = { ++ "none": 0, ++ "uint8": 1, ++ "uint16": 2, ++ "uint32": 3, ++ } ++ if s[5] in itypes: ++ index = itypes[s[5]] ++ else: ++ index = int(s[5]) ++ ++ cmd = 6 ++ addr = 0 ++ value = (unk << 32) | (index << 8) | mode ++ elif s[0] == "flush_tiler": ++ assert(len(s) == 1) ++ cmd = 9 ++ addr = 0 ++ value = 0 ++ elif s[0] == "str" and s[1] in ("cycles", "timestamp"): ++ assert(len(s) == 3 or len(s) == 4) ++ assert(s[2][0] == "[") ++ assert(s[-1][-1] == "]") ++ s = [x.strip("[]") for x in s] ++ assert(s[2][0] == "x") ++ ++ type_ = 1 if s[1] == "cycles" else 0 ++ dest = reg(s[2]) ++ if len(s) == 4: ++ offset = 
val(s[3]) ++ else: ++ offset = 0 ++ ++ cmd = 40 ++ addr = 0 ++ value = (dest << 40) | (type_ << 32) | to_int16(offset) ++ elif s[0] in ("ldr", "str"): ++ reglist = s[1] ++ if reglist[0] == "{": ++ end = [x[-1] for x in s].index("}") ++ reglist = s[1:end + 1] ++ s = s[:1] + s[end:] ++ ++ assert(len(s) == 3 or len(s) == 4) ++ assert(s[2][0] == "[") ++ assert(s[-1][-1] == "]") ++ s = [x.strip("[]") for x in s] ++ assert(s[2][0] == "x") ++ ++ if isinstance(reglist, str): ++ assert(reglist[0] in "xw") ++ src = reg(reglist) ++ mask = 3 if reglist[0] == "x" else 1 ++ else: ++ src = None ++ mask = 0 ++ ++ for r in ",".join(reglist).strip("{}").split(","): ++ r = r.split("-") ++ assert(len(r) in (1, 2)) ++ regno = [reg(x) for x in r] ++ ++ if src is None: ++ src = regno[0] ++ ++ if len(r) == 1: ++ assert(r[0][0] in "xw") ++ new = 3 if r[0][0] == "x" else 1 ++ new = (new << regno[0]) >> src ++ else: ++ assert(regno[1] > regno[0]) ++ new = ((2 << regno[1]) - (1 << regno[0])) >> src ++ ++ assert(new < (1 << 16)) ++ assert(mask & new == 0) ++ mask |= new ++ ++ # Name is correct for str, but inverted for ldr ++ # (The same holds for src above) ++ dest = reg(s[2]) ++ if len(s) == 4: ++ offset = val(s[3]) ++ else: ++ offset = 0 ++ ++ cmd = 20 if s[0] == "ldr" else 21 ++ addr = src ++ value = (dest << 40) | (mask << 16) | to_int16(offset) ++ elif s[0] == "b" or s[0].startswith("b."): ++ # For unconditional jumps, use w00 as a source register if it ++ # is not specified ++ if s[0] == "b" and (len(s) == 2 or ++ (len(s) == 3 and ++ s[1] in ("back", "skip"))): ++ s = [s[0], "w00", *s[1:]] ++ ++ assert(len(s) == 3 or (len(s) == 4 and s[2] in ("back", "skip"))) ++ assert(s[1][0] == "w") ++ ++ ops = { ++ "b.le": 0, "b.gt": 1, ++ "b.eq": 2, "b.ne": 3, ++ "b.lt": 4, "b.ge": 5, ++ "b": 6, "b.al": 6, ++ } ++ ++ src = reg(s[1]) ++ if len(s) == 4: ++ offset = val(s[3]) ++ if s[2] == "back": ++ offset = -1 - offset ++ else: ++ label = s[2] ++ if re.fullmatch(r"[0-9]+b", label): ++ label = int(label[:-1]) ++ assert(label in self.l.num_labels) ++ offset = resolve_rel(self.l.num_labels[label], ++ self.l.offset()) ++ elif re.fullmatch(r"[0-9]+f", label): ++ label = int(label[:-1]) ++ if label not in self.l.num_refs: ++ self.l.num_refs[label] = [] ++ self.l.num_refs[label].append((label, self.l.offset(), "rel")) ++ offset = 0 ++ else: ++ assert(not re.fullmatch(r"[0-9]+", label)) ++ self.l.label_refs.append((label, self.l.offset(), "rel")) ++ offset = 0 ++ ++ cmd = 22 ++ addr = 0 ++ value = (src << 40) | (ops[s[0]] << 28) | to_int16(offset) ++ ++ elif s[0] in ("evadd", "evstr"): ++ assert(len(s) in range(5, 8)) ++ assert(s[1][0] in "wx") ++ assert(s[2].startswith("[x")) ++ assert(s[2][-1] == "]") ++ assert(s[3] == "unk") ++ s = [x.strip("[]()") for x in s] ++ ++ val = reg(s[1]) ++ dst = reg(s[2]) ++ mask = hx(s[4]) ++ irq = "irq" not in s ++ unk0 = "unk0" in s ++ ++ if s[1][0] == "w": ++ cmd = 37 if s[0] == "evadd" else 38 ++ else: ++ cmd = 51 if s[0] == "evadd" else 52 ++ addr = 1 ++ value = ((dst << 40) | (val << 32) | (mask << 16) | ++ (irq << 2) | unk0) ++ elif s[0].split(".")[0] == "evwait": ++ for mod in s[0].split(".")[1:]: ++ assert(mod in {"lo", "hi", "inherit", "no_error"}) ++ assert(len(s) == 3) ++ assert(s[1][0] in "wx") ++ assert(s[2][0] == "[") ++ assert(s[-1][-1] == "]") ++ s = [x.strip("[]()") for x in s] ++ src = reg(s[2]) ++ val = reg(s[1]) ++ cond = 1 if ".hi" in s[0] else 0 ++ error = 1 if ".no_error" in s[0] else 0 ++ ++ cmd = 53 if s[1][0] == "x" else 39 ++ addr = 0 ++ value = (src << 40) | (val << 32) 
| (cond << 28) | error ++ elif s[0] in ("call", "tailcall"): ++ ss = [x for x in s if x.find('(') == -1 and x.find(')') == -1] ++ assert(len(ss) == 3) ++ assert(ss[1][0] == "w") ++ assert(ss[2][0] == "x") ++ cmd = { "call": 32, "tailcall": 33 }[s[0]] ++ addr = 0 ++ num = reg(ss[1]) ++ target = reg(ss[2]) ++ value = (num << 32) | (target << 40) ++ ++ l = self.l ++ ++ cur = len(l.buffer) ++ for ofs in range(cur - 2, cur): ++ if l.buffer[ofs] >> 48 == 0x100 + target: ++ l.call_addr_offset = ofs ++ if l.buffer[ofs] >> 48 == 0x200 + num: ++ l.call_len_offset = ofs ++ assert(l.call_addr_offset is not None) ++ assert(l.call_len_offset is not None) ++ ++ self.is_call = True ++ elif s[0] == "heapctx": ++ assert(len(s) == 2) ++ assert(s[1][0] == "x") ++ cmd = 48 ++ addr = 0 ++ value = reg(s[1]) << 40 ++ elif s[0] == "heapinc": ++ assert(len(s) == 2) ++ modes = { ++ "vt_start": 0, ++ "vt_end": 1, ++ "frag_end": 3, ++ } ++ if s[1] in modes: ++ mode = modes[s[1]] ++ else: ++ mode = int(s[1]) ++ cmd = 49 ++ addr = 0 ++ value = mode << 32 ++ else: ++ print("Unknown command:", orig_line, file=sys.stderr) ++ # TODO remove ++ cmd = 0 ++ addr = 0 ++ value = 0 ++ sk = False ++ pass ++ ++ code = (cmd << 56) | (addr << 48) | value ++ ++ if given_code and code != given_code: ++ print(f"Mismatch! {hex(code)} != {hex(given_code)}, {orig_line}") ++ ++ self.l.buffer.append(code) ++ ++ del cmd, addr, value ++ ++ if False and not sk: ++ print(orig_line, file=sys.stderr) ++ print(indent, s, hex(code) if sk else "", file=sys.stderr) ++ ++ self.pop_until(self.levels[0].indent) ++ self.flush_exe() ++ ++ def __repr__(self): ++ r = [] ++ r += [str(self.allocs[x]) for x in self.allocs] ++ r += [str(x) for x in self.completed] ++ r += [fmt_reloc(x) for x in self.reloc] ++ r += [fmt_reloc(x, name="relsplit") for x in self.reloc_split] ++ r += [fmt_exe(x) for x in self.exe] ++ return "\n".join(r) ++ ++def interpret(text): ++ c = Context() ++ c.add_shaders(shaders) ++ c.add_memory(memory) ++ c.add_descriptors(descriptors) ++ c.interpret(text) ++ #print(str(c)) ++ return str(c) ++ ++def run(text, capture=False): ++ if capture: ++ cap = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT} ++ else: ++ cap = {} ++ ++ i = interpret(text) + "\n" ++ ++ with open("/tmp/csf.cmds", "w") as f: ++ f.write(i) ++ ++ # TODO: Keep seperate or merge stdout/stderr? ++ ret = subprocess.run(["csf_test", "/dev/stdin"], ++ input=i, text=True, **cap) ++ if ret.stderr is None: ++ ret.stderr = "" ++ if ret.stdout is None: ++ ret.stdout = "" ++ return ret.stderr + ret.stdout ++ ++def rebuild(): ++ try: ++ p = subprocess.run(["rebuild-mesa"]) ++ if p.returncode != 0: ++ return False ++ except FileNotFoundError: ++ pass ++ return True ++ ++def go(text): ++ #print(interpret(text)) ++ #return ++ ++ if not rebuild(): ++ return ++ ++ print(run(text)) ++ #subprocess.run("ls /tmp/fdump.????? 
| tail -n2 | xargs diff -U3 -s", ++ # shell=True) ++ ++os.environ["CSF_QUIET"] = "1" ++ ++go(get_cmds("")) ++ ++#for c in range(1, 64): ++# val = c ++# ret = run(get_cmds(ii(val))) ++# print(str(val) + '\t' + [x for x in ret.split("\n") if x.startswith("0FFF10")][0]) ++ ++#rebuild() ++#for c in range(256): ++# print(c, end=":") ++# sys.stdout.flush() ++# cmd = f"UNK 00 {hex(c)[2:]} 0x00000000" ++# run(get_cmds(cmd)) ++ ++#interpret(cmds) ++#go(cmds) +diff --git a/src/panfrost/csf_test/mali_base_csf_kernel.h b/src/panfrost/csf_test/mali_base_csf_kernel.h +new file mode 100644 +index 00000000000..f5f859eb9ad +--- /dev/null ++++ b/src/panfrost/csf_test/mali_base_csf_kernel.h +@@ -0,0 +1,721 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_BASE_CSF_KERNEL_H_ ++#define _UAPI_BASE_CSF_KERNEL_H_ ++ ++#include ++ ++/* Memory allocation, access/hint flags. ++ * ++ * See base_mem_alloc_flags. ++ */ ++ ++/* IN */ ++/* Read access CPU side ++ */ ++#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) ++ ++/* Write access CPU side ++ */ ++#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) ++ ++/* Read access GPU side ++ */ ++#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) ++ ++/* Write access GPU side ++ */ ++#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) ++ ++/* Execute allowed on the GPU side ++ */ ++#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) ++ ++/* Will be permanently mapped in kernel space. ++ * Flag is only allowed on allocations originating from kbase. ++ */ ++#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) ++ ++/* The allocation will completely reside within the same 4GB chunk in the GPU ++ * virtual space. ++ * Since this flag is primarily required only for the TLS memory which will ++ * not be used to contain executable code and also not used for Tiler heap, ++ * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. ++ */ ++#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) ++ ++/* Userspace is not allowed to free this memory. ++ * Flag is only allowed on allocations originating from kbase. 
++ */ ++#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) ++ ++#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) ++ ++/* Grow backing store on GPU Page Fault ++ */ ++#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) ++ ++/* Page coherence Outer shareable, if available ++ */ ++#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) ++ ++/* Page coherence Inner shareable ++ */ ++#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) ++ ++/* IN/OUT */ ++/* Should be cached on the CPU, returned if actually cached ++ */ ++#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) ++ ++/* IN/OUT */ ++/* Must have same VA on both the GPU and the CPU ++ */ ++#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) ++ ++/* OUT */ ++/* Must call mmap to acquire a GPU address for the alloc ++ */ ++#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) ++ ++/* IN */ ++/* Page coherence Outer shareable, required. ++ */ ++#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) ++ ++/* Protected memory ++ */ ++#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) ++ ++/* Not needed physical memory ++ */ ++#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) ++ ++/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the ++ * addresses to be the same ++ */ ++#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) ++ ++/* CSF event memory ++ * ++ * If Outer shareable coherence is not specified or not available, then on ++ * allocation kbase will automatically use the uncached GPU mapping. ++ * There is no need for the client to specify BASE_MEM_UNCACHED_GPU ++ * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. ++ * ++ * This memory requires a permanent mapping ++ * ++ * See also kbase_reg_needs_kernel_mapping() ++ */ ++#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) ++ ++#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) ++ ++/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu ++ * mode. Some components within the GPU might only be able to access memory ++ * that is GPU cacheable. Refer to the specific GPU implementation for more ++ * details. The 3 shareability flags will be ignored for GPU uncached memory. ++ * If used while importing USER_BUFFER type memory, then the import will fail ++ * if the memory is not aligned to GPU and CPU cache line width. ++ */ ++#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) ++ ++/* ++ * Bits [22:25] for group_id (0~15). ++ * ++ * base_mem_group_id_set() should be used to pack a memory group ID into a ++ * base_mem_alloc_flags value instead of accessing the bits directly. ++ * base_mem_group_id_get() should be used to extract the memory group ID from ++ * a base_mem_alloc_flags value. ++ */ ++#define BASEP_MEM_GROUP_ID_SHIFT 22 ++#define BASE_MEM_GROUP_ID_MASK \ ++ ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) ++ ++/* Must do CPU cache maintenance when imported memory is mapped/unmapped ++ * on GPU. Currently applicable to dma-buf type only. 
++ */ ++#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) ++ ++/* OUT */ ++/* Kernel side cache sync ops required */ ++#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) ++ ++/* Number of bits used as flags for base memory management ++ * ++ * Must be kept in sync with the base_mem_alloc_flags flags ++ */ ++#define BASE_MEM_FLAGS_NR_BITS 29 ++ ++/* A mask of all the flags which are only valid for allocations within kbase, ++ * and may not be passed from user space. ++ */ ++#define BASEP_MEM_FLAGS_KERNEL_ONLY \ ++ (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) ++ ++/* A mask for all output bits, excluding IN/OUT bits. ++ */ ++#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP ++ ++/* A mask for all input bits, including IN/OUT bits. ++ */ ++#define BASE_MEM_FLAGS_INPUT_MASK \ ++ (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) ++ ++/* A mask of all currently reserved flags ++ */ ++#define BASE_MEM_FLAGS_RESERVED \ ++ BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_20 ++ ++#define BASEP_MEM_INVALID_HANDLE (0ul) ++#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) ++#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) ++/* reserved handles ..-47< for future special handles */ ++#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) ++#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) ++#define BASE_MEM_FIRST_FREE_ADDRESS \ ++ ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) ++ ++#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ ++ ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ ++ LOCAL_PAGE_SHIFT) ++ ++/** ++ * Valid set of just-in-time memory allocation flags ++ */ ++#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) ++ ++/* Flags to pass to ::base_context_init. ++ * Flags can be ORed together to enable multiple things. ++ * ++ * These share the same space as BASEP_CONTEXT_FLAG_*, and so must ++ * not collide with them. ++ */ ++typedef __u32 base_context_create_flags; ++ ++/* No flags set */ ++#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) ++ ++/* Base context is embedded in a cctx object (flag used for CINSTR ++ * software counter macros) ++ */ ++#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) ++ ++/* Base context is a 'System Monitor' context for Hardware counters. ++ * ++ * One important side effect of this is that job submission is disabled. ++ */ ++#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED \ ++ ((base_context_create_flags)1 << 1) ++ ++/* Base context creates a CSF event notification thread. ++ * ++ * The creation of a CSF event notification thread is conditional but ++ * mandatory for the handling of CSF events. 
++ */ ++#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) ++ ++/* Bit-shift used to encode a memory group ID in base_context_create_flags ++ */ ++#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) ++ ++/* Bitmask used to encode a memory group ID in base_context_create_flags ++ */ ++#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ ++ ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) ++ ++/* Bitpattern describing the base_context_create_flags that can be ++ * passed to the kernel ++ */ ++#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ ++ (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | \ ++ BASEP_CONTEXT_MMU_GROUP_ID_MASK) ++ ++/* Bitpattern describing the ::base_context_create_flags that can be ++ * passed to base_context_init() ++ */ ++#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ ++ (BASE_CONTEXT_CCTX_EMBEDDED | \ ++ BASE_CONTEXT_CSF_EVENT_THREAD | \ ++ BASEP_CONTEXT_CREATE_KERNEL_FLAGS) ++ ++/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, ++ * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) ++ */ ++#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) ++ ++/* Indicate that job dumping is enabled. This could affect certain timers ++ * to account for the performance impact. ++ */ ++#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) ++ ++/* Enable KBase tracepoints for CSF builds */ ++#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) ++ ++/* Enable additional CSF Firmware side tracepoints */ ++#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) ++ ++#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ ++ BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ ++ BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ ++ BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) ++ ++/* Number of pages mapped into the process address space for a bound GPU ++ * command queue. A pair of input/output pages and a Hw doorbell page ++ * are mapped to enable direct submission of commands to Hw. ++ */ ++#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) ++ ++#define BASE_QUEUE_MAX_PRIORITY (15U) ++ ++/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ ++#define BASEP_EVENT_VAL_INDEX (0U) ++#define BASEP_EVENT_ERR_INDEX (1U) ++ ++/* The upper limit for number of objects that could be waited/set per command. ++ * This limit is now enforced as internally the error inherit inputs are ++ * converted to 32-bit flags in a __u32 variable occupying a previously padding ++ * field. ++ */ ++#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) ++ ++/** ++ * enum base_kcpu_command_type - Kernel CPU queue command type. 
++ * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, ++ * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, ++ * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, ++ * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, ++ * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, ++ * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, ++ * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, ++ * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, ++ * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, ++ * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, ++ */ ++enum base_kcpu_command_type { ++ BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, ++ BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, ++ BASE_KCPU_COMMAND_TYPE_CQS_WAIT, ++ BASE_KCPU_COMMAND_TYPE_CQS_SET, ++ BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, ++ BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, ++ BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, ++ BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, ++ BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, ++ BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, ++ BASE_KCPU_COMMAND_TYPE_JIT_FREE, ++ BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, ++ BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER ++}; ++ ++/** ++ * enum base_queue_group_priority - Priority of a GPU Command Queue Group. ++ * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time ++ * priority. ++ * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group ++ * priority levels. ++ * ++ * Currently this is in order of highest to lowest, but if new levels are added ++ * then those new levels may be out of order to preserve the ABI compatibility ++ * with previous releases. At that point, ensure assignment to ++ * the 'priority' member in &kbase_queue_group is updated to ensure it remains ++ * a linear ordering. ++ * ++ * There should be no gaps in the enum, otherwise use of ++ * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. 
++ */ ++enum base_queue_group_priority { ++ BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, ++ BASE_QUEUE_GROUP_PRIORITY_MEDIUM, ++ BASE_QUEUE_GROUP_PRIORITY_LOW, ++ BASE_QUEUE_GROUP_PRIORITY_REALTIME, ++ BASE_QUEUE_GROUP_PRIORITY_COUNT ++}; ++ ++struct base_kcpu_command_fence_info { ++ __u64 fence; ++}; ++ ++struct base_cqs_wait_info { ++ __u64 addr; ++ __u32 val; ++ __u32 padding; ++}; ++ ++struct base_kcpu_command_cqs_wait_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 inherit_err_flags; ++}; ++ ++struct base_cqs_set { ++ __u64 addr; ++}; ++ ++struct base_kcpu_command_cqs_set_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 padding; ++}; ++ ++/** ++ * typedef basep_cqs_data_type - Enumeration of CQS Data Types ++ * ++ * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value ++ * is an unsigned 32-bit integer ++ * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value ++ * is an unsigned 64-bit integer ++ */ ++typedef enum PACKED { ++ BASEP_CQS_DATA_TYPE_U32 = 0, ++ BASEP_CQS_DATA_TYPE_U64 = 1, ++} basep_cqs_data_type; ++ ++/** ++ * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait ++ * Operation conditions ++ * ++ * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a ++ * wait will be satisfied when a CQS Object's ++ * value is Less than or Equal to ++ * the Wait Operation value ++ * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a ++ * wait will be satisfied when a CQS Object's ++ * value is Greater than the Wait Operation value ++ */ ++typedef enum { ++ BASEP_CQS_WAIT_OPERATION_LE = 0, ++ BASEP_CQS_WAIT_OPERATION_GT = 1, ++} basep_cqs_wait_operation_op; ++ ++struct base_cqs_wait_operation_info { ++ __u64 addr; ++ __u64 val; ++ __u8 operation; ++ __u8 data_type; ++ __u8 padding[6]; ++}; ++ ++/** ++ * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information ++ * about the Timeline CQS wait objects ++ * ++ * @objs: An array of Timeline CQS waits. ++ * @nr_objs: Number of Timeline CQS waits in the array. ++ * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field ++ * to be served as the source for importing into the ++ * queue's error-state. ++ */ ++struct base_kcpu_command_cqs_wait_operation_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 inherit_err_flags; ++}; ++ ++/** ++ * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations ++ * ++ * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value ++ * to a synchronization object ++ * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value ++ * of a synchronization object ++ */ ++typedef enum { ++ BASEP_CQS_SET_OPERATION_ADD = 0, ++ BASEP_CQS_SET_OPERATION_SET = 1, ++} basep_cqs_set_operation_op; ++ ++struct base_cqs_set_operation_info { ++ __u64 addr; ++ __u64 val; ++ __u8 operation; ++ __u8 data_type; ++ __u8 padding[6]; ++}; ++ ++/** ++ * struct base_kcpu_command_cqs_set_operation_info - structure which contains information ++ * about the Timeline CQS set objects ++ * ++ * @objs: An array of Timeline CQS sets. ++ * @nr_objs: Number of Timeline CQS sets in the array. ++ * @padding: Structure padding, unused bytes. ++ */ ++struct base_kcpu_command_cqs_set_operation_info { ++ __u64 objs; ++ __u32 nr_objs; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_kcpu_command_import_info - structure which contains information ++ * about the imported buffer. ++ * ++ * @handle: Address of imported user buffer. 
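For the wait-operation variant defined above, a rough sketch of filling one entry (the helper name and the 64-bit threshold are made up for the example; the header name matches the file this hunk adds):

#include <string.h>
#include "mali_base_csf_kernel.h"

/* Describe "wait until the u64 value at addr is greater than threshold". */
static void example_cqs_wait_gt_u64(struct base_cqs_wait_operation_info *obj,
                                    __u64 addr, __u64 threshold)
{
        memset(obj, 0, sizeof(*obj));   /* clear the padding bytes */
        obj->addr = addr;
        obj->val = threshold;
        obj->operation = BASEP_CQS_WAIT_OPERATION_GT;
        obj->data_type = BASEP_CQS_DATA_TYPE_U64;
}
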
++ */ ++struct base_kcpu_command_import_info { ++ __u64 handle; ++}; ++ ++/** ++ * struct base_kcpu_command_jit_alloc_info - structure which contains ++ * information about jit memory allocation. ++ * ++ * @info: An array of elements of the ++ * struct base_jit_alloc_info type. ++ * @count: The number of elements in the info array. ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_jit_alloc_info { ++ __u64 info; ++ __u8 count; ++ __u8 padding[7]; ++}; ++ ++/** ++ * struct base_kcpu_command_jit_free_info - structure which contains ++ * information about jit memory which is to be freed. ++ * ++ * @ids: An array containing the JIT IDs to free. ++ * @count: The number of elements in the ids array. ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_jit_free_info { ++ __u64 ids; ++ __u8 count; ++ __u8 padding[7]; ++}; ++ ++/** ++ * struct base_kcpu_command_group_suspend_info - structure which contains ++ * suspend buffer data captured for a suspended queue group. ++ * ++ * @buffer: Pointer to an array of elements of the type char. ++ * @size: Number of elements in the @buffer array. ++ * @group_handle: Handle to the mapping of CSG. ++ * @padding: padding to a multiple of 64 bits. ++ */ ++struct base_kcpu_command_group_suspend_info { ++ __u64 buffer; ++ __u32 size; ++ __u8 group_handle; ++ __u8 padding[3]; ++}; ++ ++ ++/** ++ * struct base_kcpu_command - kcpu command. ++ * @type: type of the kcpu command, one enum base_kcpu_command_type ++ * @padding: padding to a multiple of 64 bits ++ * @info: structure which contains information about the kcpu command; ++ * actual type is determined by @p type ++ * @info.fence: Fence ++ * @info.cqs_wait: CQS wait ++ * @info.cqs_set: CQS set ++ * @info.import: import ++ * @info.jit_alloc: jit allocation ++ * @info.jit_free: jit deallocation ++ * @info.suspend_buf_copy: suspend buffer copy ++ * @info.sample_time: sample time ++ * @info.padding: padding ++ */ ++struct base_kcpu_command { ++ __u8 type; ++ __u8 padding[sizeof(__u64) - sizeof(__u8)]; ++ union { ++ struct base_kcpu_command_fence_info fence; ++ struct base_kcpu_command_cqs_wait_info cqs_wait; ++ struct base_kcpu_command_cqs_set_info cqs_set; ++ struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; ++ struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; ++ struct base_kcpu_command_import_info import; ++ struct base_kcpu_command_jit_alloc_info jit_alloc; ++ struct base_kcpu_command_jit_free_info jit_free; ++ struct base_kcpu_command_group_suspend_info suspend_buf_copy; ++ __u64 padding[2]; /* No sub-struct should be larger */ ++ } info; ++}; ++ ++/** ++ * struct basep_cs_stream_control - CSI capabilities. ++ * ++ * @features: Features of this stream ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct basep_cs_stream_control { ++ __u32 features; ++ __u32 padding; ++}; ++ ++/** ++ * struct basep_cs_group_control - CSG interface capabilities. ++ * ++ * @features: Features of this group ++ * @stream_num: Number of streams in this group ++ * @suspend_size: Size in bytes of the suspend buffer for this group ++ * @padding: Padding to a multiple of 64 bits. ++ */ ++struct basep_cs_group_control { ++ __u32 features; ++ __u32 stream_num; ++ __u32 suspend_size; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault ++ * error information associated with GPU command queue group. ++ * ++ * @sideband: Additional information of the unrecoverable fault. 
++ * @status: Unrecoverable fault information. ++ * This consists of exception type (least significant byte) and ++ * data (remaining bytes). One example of exception type is ++ * CS_INVALID_INSTRUCTION (0x49). ++ * @padding: Padding to make multiple of 64bits ++ */ ++struct base_gpu_queue_group_error_fatal_payload { ++ __u64 sideband; ++ __u32 status; ++ __u32 padding; ++}; ++ ++/** ++ * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault ++ * error information related to GPU command queue. ++ * ++ * @sideband: Additional information about this unrecoverable fault. ++ * @status: Unrecoverable fault information. ++ * This consists of exception type (least significant byte) and ++ * data (remaining bytes). One example of exception type is ++ * CS_INVALID_INSTRUCTION (0x49). ++ * @csi_index: Index of the CSF interface the queue is bound to. ++ * @padding: Padding to make multiple of 64bits ++ */ ++struct base_gpu_queue_error_fatal_payload { ++ __u64 sideband; ++ __u32 status; ++ __u8 csi_index; ++ __u8 padding[3]; ++}; ++ ++/** ++ * enum base_gpu_queue_group_error_type - GPU Fatal error type. ++ * ++ * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU ++ * command queue group. ++ * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU ++ * command queue. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with ++ * progress timeout. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out ++ * of tiler heap memory. ++ * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types ++ * ++ * This type is used for &struct_base_gpu_queue_group_error.error_type. ++ */ ++enum base_gpu_queue_group_error_type { ++ BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, ++ BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, ++ BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, ++ BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, ++ BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT ++}; ++ ++/** ++ * struct base_gpu_queue_group_error - Unrecoverable fault information ++ * @error_type: Error type of @base_gpu_queue_group_error_type ++ * indicating which field in union payload is filled ++ * @padding: Unused bytes for 64bit boundary ++ * @payload: Input Payload ++ * @payload.fatal_group: Unrecoverable fault error associated with ++ * GPU command queue group ++ * @payload.fatal_queue: Unrecoverable fault error associated with command queue ++ */ ++struct base_gpu_queue_group_error { ++ __u8 error_type; ++ __u8 padding[7]; ++ union { ++ struct base_gpu_queue_group_error_fatal_payload fatal_group; ++ struct base_gpu_queue_error_fatal_payload fatal_queue; ++ } payload; ++}; ++ ++/** ++ * enum base_csf_notification_type - Notification type ++ * ++ * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event ++ * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal ++ * error ++ * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu ++ * queue ++ * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type ++ * ++ * This type is used for &struct_base_csf_notification.type. 
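Since the notification payload below wraps the error record above, here is a sketch of how user space might pick one apart (the function name and printf strings are illustrative only):

#include <stdio.h>
#include "mali_base_csf_kernel.h"

static void example_report_group_error(const struct base_gpu_queue_group_error *e)
{
        switch (e->error_type) {
        case BASE_GPU_QUEUE_GROUP_ERROR_FATAL:
                /* Exception type lives in the least significant byte of status. */
                printf("group fatal: exception 0x%02x, sideband 0x%llx\n",
                       e->payload.fatal_group.status & 0xff,
                       (unsigned long long)e->payload.fatal_group.sideband);
                break;
        case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL:
                printf("queue fatal on CSI %u: exception 0x%02x\n",
                       e->payload.fatal_queue.csi_index,
                       e->payload.fatal_queue.status & 0xff);
                break;
        case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT:
                printf("progress timeout\n");
                break;
        case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM:
                printf("tiler heap out of memory\n");
                break;
        }
}
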
++ */ ++enum base_csf_notification_type { ++ BASE_CSF_NOTIFICATION_EVENT = 0, ++ BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, ++ BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, ++ BASE_CSF_NOTIFICATION_COUNT ++}; ++ ++/** ++ * struct base_csf_notification - Event or error notification ++ * ++ * @type: Notification type of @base_csf_notification_type ++ * @padding: Padding for 64bit boundary ++ * @payload: Input Payload ++ * @payload.align: To fit the struct into a 64-byte cache line ++ * @payload.csg_error: CSG error ++ * @payload.csg_error.handle: Handle of GPU command queue group associated with ++ * fatal error ++ * @payload.csg_error.padding: Padding ++ * @payload.csg_error.error: Unrecoverable fault error ++ * ++ */ ++struct base_csf_notification { ++ __u8 type; ++ __u8 padding[7]; ++ union { ++ struct { ++ __u8 handle; ++ __u8 padding[7]; ++ struct base_gpu_queue_group_error error; ++ } csg_error; ++ ++ __u8 align[56]; ++ } payload; ++}; ++ ++#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ +diff --git a/src/panfrost/csf_test/mali_base_kernel.h b/src/panfrost/csf_test/mali_base_kernel.h +new file mode 100644 +index 00000000000..305956f341a +--- /dev/null ++++ b/src/panfrost/csf_test/mali_base_kernel.h +@@ -0,0 +1,746 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++/* ++ * Base structures shared with the kernel. ++ */ ++ ++#ifndef _UAPI_BASE_KERNEL_H_ ++#define _UAPI_BASE_KERNEL_H_ ++ ++#include ++ ++struct base_mem_handle { ++ struct { ++ __u64 handle; ++ } basep; ++}; ++ ++#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 ++ ++#define BASE_MAX_COHERENT_GROUPS 16 ++ ++#if defined(PAGE_MASK) && defined(PAGE_SHIFT) ++#define LOCAL_PAGE_SHIFT PAGE_SHIFT ++#define LOCAL_PAGE_LSB ~PAGE_MASK ++#else ++#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 ++#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 ++#endif ++ ++#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) ++#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 ++#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) ++#else ++#error Failed to find page size ++#endif ++#endif ++ ++/* Physical memory group ID for normal usage. ++ */ ++#define BASE_MEM_GROUP_DEFAULT (0) ++ ++/* Number of physical memory groups. ++ */ ++#define BASE_MEM_GROUP_COUNT (16) ++ ++/** ++ * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. ++ * ++ * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator ++ * in order to determine the best cache policy. Some combinations are ++ * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), ++ * which defines a write-only region on the CPU side, which is ++ * heavily read by the CPU... ++ * Other flags are only meaningful to a particular allocator. 
++ * More flags can be added to this list, as long as they don't clash ++ * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). ++ */ ++typedef __u32 base_mem_alloc_flags; ++ ++/* A mask for all the flags which are modifiable via the base_mem_set_flags ++ * interface. ++ */ ++#define BASE_MEM_FLAGS_MODIFIABLE \ ++ (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ ++ BASE_MEM_COHERENT_LOCAL) ++ ++/* A mask of all the flags that can be returned via the base_mem_get_flags() ++ * interface. ++ */ ++#define BASE_MEM_FLAGS_QUERYABLE \ ++ (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ ++ BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ ++ BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ ++ BASEP_MEM_FLAGS_KERNEL_ONLY)) ++ ++/** ++ * enum base_mem_import_type - Memory types supported by @a base_mem_import ++ * ++ * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type ++ * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) ++ * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a ++ * base_mem_import_user_buffer ++ * ++ * Each type defines what the supported handle type is. ++ * ++ * If any new type is added here ARM must be contacted ++ * to allocate a numeric value for it. ++ * Do not just add a new type without synchronizing with ARM ++ * as future releases from ARM might include other new types ++ * which could clash with your custom types. ++ */ ++enum base_mem_import_type { ++ BASE_MEM_IMPORT_TYPE_INVALID = 0, ++ /* ++ * Import type with value 1 is deprecated. ++ */ ++ BASE_MEM_IMPORT_TYPE_UMM = 2, ++ BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 ++}; ++ ++/** ++ * struct base_mem_import_user_buffer - Handle of an imported user buffer ++ * ++ * @ptr: address of imported user buffer ++ * @length: length of imported user buffer in bytes ++ * ++ * This structure is used to represent a handle of an imported user buffer. ++ */ ++ ++struct base_mem_import_user_buffer { ++ __u64 ptr; ++ __u64 length; ++}; ++ ++/* Mask to detect 4GB boundary alignment */ ++#define BASE_MEM_MASK_4GB 0xfffff000UL ++/* Mask to detect 4GB boundary (in page units) alignment */ ++#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) ++ ++/* Limit on the 'extension' parameter for an allocation with the ++ * BASE_MEM_TILER_ALIGN_TOP flag set ++ * ++ * This is the same as the maximum limit for a Buffer Descriptor's chunk size ++ */ ++#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ ++ (21u - (LOCAL_PAGE_SHIFT)) ++#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ ++ (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) ++ ++/* Bit mask of cookies used for for memory allocation setup */ ++#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ ++ ++/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ ++#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ ++ ++/* ++ * struct base_fence - Cross-device synchronisation fence. ++ * ++ * A fence is used to signal when the GPU has finished accessing a resource that ++ * may be shared with other devices, and also to delay work done asynchronously ++ * by the GPU until other devices have finished accessing a shared resource. ++ */ ++struct base_fence { ++ struct { ++ int fd; ++ int stream_fd; ++ } basep; ++}; ++ ++/** ++ * struct base_mem_aliasing_info - Memory aliasing info ++ * ++ * Describes a memory handle to be aliased. ++ * A subset of the handle can be chosen for aliasing, given an offset and a ++ * length. 
++ * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a ++ * region where a special page is mapped with a write-alloc cache setup, ++ * typically used when the write result of the GPU isn't needed, but the GPU ++ * must write anyway. ++ * ++ * Offset and length are specified in pages. ++ * Offset must be within the size of the handle. ++ * Offset+length must not overrun the size of the handle. ++ * ++ * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE ++ * @offset: Offset within the handle to start aliasing from, in pages. ++ * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. ++ * @length: Length to alias, in pages. For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE ++ * specifies the number of times the special page is needed. ++ */ ++struct base_mem_aliasing_info { ++ struct base_mem_handle handle; ++ __u64 offset; ++ __u64 length; ++}; ++ ++/* Maximum percentage of just-in-time memory allocation trimming to perform ++ * on free. ++ */ ++#define BASE_JIT_MAX_TRIM_LEVEL (100) ++ ++/* Maximum number of concurrent just-in-time memory allocations. ++ */ ++#define BASE_JIT_ALLOC_COUNT (255) ++ ++/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 ++ * ++ * jit_version is 1 ++ * ++ * Due to the lack of padding specified, user clients between 32 and 64-bit ++ * may have assumed a different size of the struct ++ * ++ * An array of structures was not supported ++ */ ++struct base_jit_alloc_info_10_2 { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++}; ++ ++/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up ++ * to 11.19 ++ * ++ * This structure had a number of modifications during and after kernel driver ++ * version 11.5, but remains size-compatible throughout its version history, and ++ * with earlier variants compatible with future variants by requiring ++ * zero-initialization to the unused space in the structure. ++ * ++ * jit_version is 2 ++ * ++ * Kernel driver version history: ++ * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes ++ * must be zero. Kbase minor version was not incremented, so some ++ * versions of 11.5 do not have this change. ++ * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase ++ * minor version not incremented) ++ * 11.6: Added 'flags', replacing 1 padding byte ++ * 11.10: Arrays of this structure are supported ++ */ ++struct base_jit_alloc_info_11_5 { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++ __u8 bin_id; ++ __u8 max_allocations; ++ __u8 flags; ++ __u8 padding[2]; ++ __u16 usage_id; ++}; ++ ++/** ++ * struct base_jit_alloc_info - Structure which describes a JIT allocation ++ * request. ++ * @gpu_alloc_addr: The GPU virtual address to write the JIT ++ * allocated GPU virtual address to. ++ * @va_pages: The minimum number of virtual pages required. ++ * @commit_pages: The minimum number of physical pages which ++ * should back the allocation. ++ * @extension: Granularity of physical pages to grow the ++ * allocation by during a fault. ++ * @id: Unique ID provided by the caller, this is used ++ * to pair allocation and free requests. ++ * Zero is not a valid value. ++ * @bin_id: The JIT allocation bin, used in conjunction with ++ * @max_allocations to limit the number of each ++ * type of JIT allocation. ++ * @max_allocations: The maximum number of allocations allowed within ++ * the bin specified by @bin_id. 
Should be the same ++ * for all allocations within the same bin. ++ * @flags: flags specifying the special requirements for ++ * the JIT allocation, see ++ * %BASE_JIT_ALLOC_VALID_FLAGS ++ * @padding: Expansion space - should be initialised to zero ++ * @usage_id: A hint about which allocation should be reused. ++ * The kernel should attempt to use a previous ++ * allocation with the same usage_id ++ * @heap_info_gpu_addr: Pointer to an object in GPU memory describing ++ * the actual usage of the region. ++ * ++ * jit_version is 3. ++ * ++ * When modifications are made to this structure, it is still compatible with ++ * jit_version 3 when: a) the size is unchanged, and b) new members only ++ * replace the padding bytes. ++ * ++ * Previous jit_version history: ++ * jit_version == 1, refer to &base_jit_alloc_info_10_2 ++ * jit_version == 2, refer to &base_jit_alloc_info_11_5 ++ * ++ * Kbase version history: ++ * 11.20: added @heap_info_gpu_addr ++ */ ++struct base_jit_alloc_info { ++ __u64 gpu_alloc_addr; ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u8 id; ++ __u8 bin_id; ++ __u8 max_allocations; ++ __u8 flags; ++ __u8 padding[2]; ++ __u16 usage_id; ++ __u64 heap_info_gpu_addr; ++}; ++ ++enum base_external_resource_access { ++ BASE_EXT_RES_ACCESS_SHARED, ++ BASE_EXT_RES_ACCESS_EXCLUSIVE ++}; ++ ++struct base_external_resource { ++ __u64 ext_resource; ++}; ++ ++ ++/** ++ * The maximum number of external resources which can be mapped/unmapped ++ * in a single request. ++ */ ++#define BASE_EXT_RES_COUNT_MAX 10 ++ ++/** ++ * struct base_external_resource_list - Structure which describes a list of ++ * external resources. ++ * @count: The number of resources. ++ * @ext_res: Array of external resources which is ++ * sized at allocation time. ++ */ ++struct base_external_resource_list { ++ __u64 count; ++ struct base_external_resource ext_res[1]; ++}; ++ ++struct base_jd_debug_copy_buffer { ++ __u64 address; ++ __u64 size; ++ struct base_external_resource extres; ++}; ++ ++#define GPU_MAX_JOB_SLOTS 16 ++ ++/** ++ * User-side Base GPU Property Queries ++ * ++ * The User-side Base GPU Property Query interface encapsulates two ++ * sub-modules: ++ * ++ * - "Dynamic GPU Properties" ++ * - "Base Platform Config GPU Properties" ++ * ++ * Base only deals with properties that vary between different GPU ++ * implementations - the Dynamic GPU properties and the Platform Config ++ * properties. ++ * ++ * For properties that are constant for the GPU Architecture, refer to the ++ * GPU module. However, we will discuss their relevance here just to ++ * provide background information. ++ * ++ * About the GPU Properties in Base and GPU modules ++ * ++ * The compile-time properties (Platform Config, GPU Compile-time ++ * properties) are exposed as pre-processor macros. ++ * ++ * Complementing the compile-time properties are the Dynamic GPU ++ * Properties, which act as a conduit for the GPU Configuration ++ * Discovery. ++ * ++ * In general, the dynamic properties are present to verify that the platform ++ * has been configured correctly with the right set of Platform Config ++ * Compile-time Properties. ++ * ++ * As a consistent guide across the entire DDK, the choice for dynamic or ++ * compile-time should consider the following, in order: ++ * 1. Can the code be written so that it doesn't need to know the ++ * implementation limits at all? ++ * 2. If you need the limits, get the information from the Dynamic Property ++ * lookup. 
This should be done once as you fetch the context, and then cached ++ * as part of the context data structure, so it's cheap to access. ++ * 3. If there's a clear and arguable inefficiency in using Dynamic Properties, ++ * then use a Compile-Time Property (Platform Config, or GPU Compile-time ++ * property). Examples of where this might be sensible follow: ++ * - Part of a critical inner-loop ++ * - Frequent re-use throughout the driver, causing significant extra load ++ * instructions or control flow that would be worthwhile optimizing out. ++ * ++ * We cannot provide an exhaustive set of examples, neither can we provide a ++ * rule for every possible situation. Use common sense, and think about: what ++ * the rest of the driver will be doing; how the compiler might represent the ++ * value if it is a compile-time constant; whether an OEM shipping multiple ++ * devices would benefit much more from a single DDK binary, instead of ++ * insignificant micro-optimizations. ++ * ++ * Dynamic GPU Properties ++ * ++ * Dynamic GPU properties are presented in two sets: ++ * 1. the commonly used properties in @ref base_gpu_props, which have been ++ * unpacked from GPU register bitfields. ++ * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props ++ * (also a member of base_gpu_props). All of these are presented in ++ * the packed form, as presented by the GPU registers themselves. ++ * ++ * The raw properties in gpu_raw_gpu_props are necessary to ++ * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device ++ * behaving differently?". In this case, all information about the ++ * configuration is potentially useful, but it does not need to be processed ++ * by the driver. Instead, the raw registers can be processed by the Mali ++ * Tools software on the host PC. ++ * ++ * The properties returned extend the GPU Configuration Discovery ++ * registers. For example, GPU clock speed is not specified in the GPU ++ * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. ++ * ++ * The GPU properties are obtained by a call to ++ * base_get_gpu_props(). This simply returns a pointer to a const ++ * base_gpu_props structure. It is constant for the life of a base ++ * context. Multiple calls to base_get_gpu_props() to a base context ++ * return the same pointer to a constant structure. This avoids cache pollution ++ * of the common data. ++ * ++ * This pointer must not be freed, because it does not point to the start of a ++ * region allocated by the memory allocator; instead, just close the @ref ++ * base_context. ++ * ++ * ++ * Kernel Operation ++ * ++ * During Base Context Create time, user-side makes a single kernel call: ++ * - A call to fill user memory with GPU information structures ++ * ++ * The kernel-side will fill the provided the entire processed base_gpu_props ++ * structure, because this information is required in both ++ * user and kernel side; it does not make sense to decode it twice. ++ * ++ * Coherency groups must be derived from the bitmasks, but this can be done ++ * kernel side, and just once at kernel startup: Coherency groups must already ++ * be known kernel-side, to support chains that specify a 'Only Coherent Group' ++ * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. ++ * ++ * Coherency Group calculation ++ * ++ * Creation of the coherent group data is done at device-driver startup, and so ++ * is one-time. 
This will most likely involve a loop with CLZ, shifting, and ++ * bit clearing on the L2_PRESENT mask, depending on whether the ++ * system is L2 Coherent. The number of shader cores is done by a ++ * population count, since faulty cores may be disabled during production, ++ * producing a non-contiguous mask. ++ * ++ * The memory requirements for this algorithm can be determined either by a __u64 ++ * population count on the L2_PRESENT mask (a LUT helper already is ++ * required for the above), or simple assumption that there can be no more than ++ * 16 coherent groups, since core groups are typically 4 cores. ++ */ ++ ++#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 ++ ++#define BASE_MAX_COHERENT_GROUPS 16 ++/** ++ * struct mali_base_gpu_core_props - GPU core props info ++ * @product_id: Pro specific value. ++ * @version_status: Status of the GPU release. No defined values, but starts at ++ * 0 and increases by one for each release status (alpha, beta, EAC, etc.). ++ * 4 bit values (0-15). ++ * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" ++ * release number. ++ * 8 bit values (0-255). ++ * @major_revision: Major release number of the GPU. "R" part of an "RnPn" ++ * release number. ++ * 4 bit values (0-15). ++ * @padding: padding to allign to 8-byte ++ * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by ++ * clGetDeviceInfo() ++ * @log2_program_counter_size: Size of the shader program counter, in bits. ++ * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This ++ * is a bitpattern where a set bit indicates that the format is supported. ++ * Before using a texture format, it is recommended that the corresponding ++ * bit be checked. ++ * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. ++ * It is unlikely that a client will be able to allocate all of this memory ++ * for their own purposes, but this at least provides an upper bound on the ++ * memory available to the GPU. ++ * This is required for OpenCL's clGetDeviceInfo() call when ++ * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The ++ * client will not be expecting to allocate anywhere near this value. ++ * @num_exec_engines: The number of execution engines. ++ */ ++struct mali_base_gpu_core_props { ++ __u32 product_id; ++ __u16 version_status; ++ __u16 minor_revision; ++ __u16 major_revision; ++ __u16 padding; ++ __u32 gpu_freq_khz_max; ++ __u32 log2_program_counter_size; ++ __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ __u64 gpu_available_memory_size; ++ __u8 num_exec_engines; ++}; ++ ++/* ++ * More information is possible - but associativity and bus width are not ++ * required by upper-level apis. ++ */ ++struct mali_base_gpu_l2_cache_props { ++ __u8 log2_line_size; ++ __u8 log2_cache_size; ++ __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ ++ __u8 padding[5]; ++}; ++ ++struct mali_base_gpu_tiler_props { ++ __u32 bin_size_bytes; /* Max is 4*2^15 */ ++ __u32 max_active_levels; /* Max is 2^15 */ ++}; ++ ++/** ++ * struct mali_base_gpu_thread_props - GPU threading system details. ++ * @max_threads: Max. number of threads per core ++ * @max_workgroup_size: Max. number of threads per workgroup ++ * @max_barrier_size: Max. number of threads that can synchronize on a ++ * simple barrier ++ * @max_registers: Total size [1..65535] of the register file available ++ * per core. ++ * @max_task_queue: Max. tasks [1..255] which may be sent to a core ++ * before it becomes blocked. 
++ * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split ++ * field. ++ * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, ++ * 3 = SW Model/Emulation ++ * @padding: padding to allign to 8-byte ++ * @tls_alloc: Number of threads per core that TLS must be ++ * allocated for ++ */ ++struct mali_base_gpu_thread_props { ++ __u32 max_threads; ++ __u32 max_workgroup_size; ++ __u32 max_barrier_size; ++ __u16 max_registers; ++ __u8 max_task_queue; ++ __u8 max_thread_group_split; ++ __u8 impl_tech; ++ __u8 padding[3]; ++ __u32 tls_alloc; ++}; ++ ++/** ++ * struct mali_base_gpu_coherent_group - descriptor for a coherent group ++ * @core_mask: Core restriction mask required for the group ++ * @num_cores: Number of cores in the group ++ * @padding: padding to allign to 8-byte ++ * ++ * \c core_mask exposes all cores in that coherent group, and \c num_cores ++ * provides a cached population-count for that mask. ++ * ++ * @note Whilst all cores are exposed in the mask, not all may be available to ++ * the application, depending on the Kernel Power policy. ++ * ++ * @note if u64s must be 8-byte aligned, then this structure has 32-bits of ++ * wastage. ++ */ ++struct mali_base_gpu_coherent_group { ++ __u64 core_mask; ++ __u16 num_cores; ++ __u16 padding[3]; ++}; ++ ++/** ++ * struct mali_base_gpu_coherent_group_info - Coherency group information ++ * @num_groups: Number of coherent groups in the GPU. ++ * @num_core_groups: Number of core groups (coherent or not) in the GPU. ++ * Equivalent to the number of L2 Caches. ++ * The GPU Counter dumping writes 2048 bytes per core group, regardless ++ * of whether the core groups are coherent or not. Hence this member is ++ * needed to calculate how much memory is required for dumping. ++ * @note Do not use it to work out how many valid elements are in the ++ * group[] member. Use num_groups instead. ++ * @coherency: Coherency features of the memory, accessed by gpu_mem_features ++ * methods ++ * @padding: padding to allign to 8-byte ++ * @group: Descriptors of coherent groups ++ * ++ * Note that the sizes of the members could be reduced. However, the \c group ++ * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte ++ * aligned, thus leading to wastage if the other members sizes were reduced. ++ * ++ * The groups are sorted by core mask. The core masks are non-repeating and do ++ * not intersect. ++ */ ++struct mali_base_gpu_coherent_group_info { ++ __u32 num_groups; ++ __u32 num_core_groups; ++ __u32 coherency; ++ __u32 padding; ++ struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; ++}; ++ ++/** ++ * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware ++ * Configuration Discovery registers. ++ * @shader_present: Shader core present bitmap ++ * @tiler_present: Tiler core present bitmap ++ * @l2_present: Level 2 cache present bitmap ++ * @stack_present: Core stack present bitmap ++ * @l2_features: L2 features ++ * @core_features: Core features ++ * @mem_features: Mem features ++ * @mmu_features: Mmu features ++ * @as_present: Bitmap of address spaces present ++ * @js_present: Job slots present ++ * @js_features: Array of job slot features. 
++ * @tiler_features: Tiler features ++ * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU ++ * @gpu_id: GPU and revision identifier ++ * @thread_max_threads: Maximum number of threads per core ++ * @thread_max_workgroup_size: Maximum number of threads per workgroup ++ * @thread_max_barrier_size: Maximum number of threads per barrier ++ * @thread_features: Thread features ++ * @coherency_mode: Note: This is the _selected_ coherency mode rather than the ++ * available modes as exposed in the coherency_features register ++ * @thread_tls_alloc: Number of threads per core that TLS must be allocated for ++ * @gpu_features: GPU features ++ * ++ * The information is presented inefficiently for access. For frequent access, ++ * the values should be better expressed in an unpacked form in the ++ * base_gpu_props structure. ++ * ++ * The raw properties in gpu_raw_gpu_props are necessary to ++ * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device ++ * behaving differently?". In this case, all information about the ++ * configuration is potentially useful, but it does not need to be processed ++ * by the driver. Instead, the raw registers can be processed by the Mali ++ * Tools software on the host PC. ++ * ++ */ ++struct gpu_raw_gpu_props { ++ __u64 shader_present; ++ __u64 tiler_present; ++ __u64 l2_present; ++ __u64 stack_present; ++ __u32 l2_features; ++ __u32 core_features; ++ __u32 mem_features; ++ __u32 mmu_features; ++ ++ __u32 as_present; ++ ++ __u32 js_present; ++ __u32 js_features[GPU_MAX_JOB_SLOTS]; ++ __u32 tiler_features; ++ __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; ++ ++ __u32 gpu_id; ++ ++ __u32 thread_max_threads; ++ __u32 thread_max_workgroup_size; ++ __u32 thread_max_barrier_size; ++ __u32 thread_features; ++ ++ /* ++ * Note: This is the _selected_ coherency mode rather than the ++ * available modes as exposed in the coherency_features register. ++ */ ++ __u32 coherency_mode; ++ ++ __u32 thread_tls_alloc; ++ __u64 gpu_features; ++}; ++ ++/** ++ * struct base_gpu_props - Return structure for base_get_gpu_props(). ++ * @core_props: Core props. ++ * @l2_props: L2 props. ++ * @unused_1: Keep for backwards compatibility. ++ * @tiler_props: Tiler props. ++ * @thread_props: Thread props. ++ * @raw_props: This member is large, likely to be 128 bytes. ++ * @coherency_info: This must be last member of the structure. ++ * ++ * NOTE: the raw_props member in this data structure contains the register ++ * values from which the value of the other members are derived. The derived ++ * members exist to allow for efficient access and/or shielding the details ++ * of the layout of the registers. ++ */ ++struct base_gpu_props { ++ struct mali_base_gpu_core_props core_props; ++ struct mali_base_gpu_l2_cache_props l2_props; ++ __u64 unused_1; ++ struct mali_base_gpu_tiler_props tiler_props; ++ struct mali_base_gpu_thread_props thread_props; ++ struct gpu_raw_gpu_props raw_props; ++ struct mali_base_gpu_coherent_group_info coherency_info; ++}; ++ ++#define BASE_MEM_GROUP_ID_GET(flags) \ ++ ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) ++ ++#define BASE_MEM_GROUP_ID_SET(id) \ ++ (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? 
\ ++ BASE_MEM_GROUP_DEFAULT : \ ++ id) \ ++ << BASEP_MEM_GROUP_ID_SHIFT) & \ ++ BASE_MEM_GROUP_ID_MASK) ++ ++#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ ++ (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ ++ ((base_context_create_flags)(group_id) \ ++ << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) ++ ++#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ ++ ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ ++ BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) ++ ++/* ++ * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These ++ * flags are also used, where applicable, for specifying which fields ++ * are valid following the request operation. ++ */ ++ ++/* For monotonic (counter) timefield */ ++#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) ++/* For system wide timestamp */ ++#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) ++/* For GPU cycle counter */ ++#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) ++/* Specify kernel GPU register timestamp */ ++#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) ++/* Specify userspace cntvct_el0 timestamp source */ ++#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) ++ ++#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ ++ BASE_TIMEINFO_MONOTONIC_FLAG | \ ++ BASE_TIMEINFO_TIMESTAMP_FLAG | \ ++ BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ ++ BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ ++ BASE_TIMEINFO_USER_SOURCE_FLAG) ++ ++/* Maximum number of source allocations allowed to create an alias allocation. ++ * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array ++ * layers, since each cube map in the array will have 6 faces. ++ */ ++#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) ++ ++#endif /* _UAPI_BASE_KERNEL_H_ */ +diff --git a/src/panfrost/csf_test/mali_gpu_csf_registers.h b/src/panfrost/csf_test/mali_gpu_csf_registers.h +new file mode 100644 +index 00000000000..17e338cb238 +--- /dev/null ++++ b/src/panfrost/csf_test/mali_gpu_csf_registers.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++/* ++ * This header was originally autogenerated, but it is now ok (and ++ * expected) to have to add to it. ++ */ ++ ++#ifndef _UAPI_GPU_CSF_REGISTERS_H_ ++#define _UAPI_GPU_CSF_REGISTERS_H_ ++ ++/* Only user block defines are included. 
HI words have been removed */ ++ ++/* CS_USER_INPUT_BLOCK register offsets */ ++#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ ++#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ ++ ++/* CS_USER_OUTPUT_BLOCK register offsets */ ++#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ ++#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ ++ ++/* USER register offsets */ ++#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ ++ ++#endif +diff --git a/src/panfrost/csf_test/mali_kbase_csf_ioctl.h b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h +new file mode 100644 +index 00000000000..3df8a01699f +--- /dev/null ++++ b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h +@@ -0,0 +1,483 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. ++ * ++ */ ++ ++#ifndef _UAPI_KBASE_CSF_IOCTL_H_ ++#define _UAPI_KBASE_CSF_IOCTL_H_ ++ ++#include ++#include ++ ++/* ++ * 1.0: ++ * - CSF IOCTL header separated from JM ++ * 1.1: ++ * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME ++ * - Add ioctl 54: This controls the priority setting. ++ * 1.2: ++ * - Add new CSF GPU_FEATURES register into the property structure ++ * returned by KBASE_IOCTL_GET_GPUPROPS ++ * 1.3: ++ * - Add __u32 group_uid member to ++ * &struct_kbase_ioctl_cs_queue_group_create.out ++ * 1.4: ++ * - Replace padding in kbase_ioctl_cs_get_glb_iface with ++ * instr_features member of same size ++ * 1.5: ++ * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new ++ * queue registration call with extended format for supporting CS ++ * trace configurations with CSF trace_command. ++ * 1.6: ++ * - Added new HW performance counters interface to all GPUs. 
++ * 1.7: ++ * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use ++ * 1.8: ++ * - Removed Kernel legacy HWC interface ++ */ ++ ++#define BASE_UK_VERSION_MAJOR 1 ++#define BASE_UK_VERSION_MINOR 8 ++ ++/** ++ * struct kbase_ioctl_version_check - Check version compatibility between ++ * kernel and userspace ++ * ++ * @major: Major version number ++ * @minor: Minor version number ++ */ ++struct kbase_ioctl_version_check { ++ __u16 major; ++ __u16 minor; ++}; ++ ++#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ ++ _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) ++ ++ ++/** ++ * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the ++ * base back-end ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @buffer_size: Size of the buffer in bytes ++ * @priority: Priority of the queue within a group when run within a process ++ * @padding: Currently unused, must be zero ++ * ++ * @Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. ++ * Any change of this struct should also be mirrored to the latter. ++ */ ++struct kbase_ioctl_cs_queue_register { ++ __u64 buffer_gpu_addr; ++ __u32 buffer_size; ++ __u8 priority; ++ __u8 padding[3]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_REGISTER \ ++ _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) ++ ++/** ++ * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler ++ * to notify that a queue has been updated ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ */ ++struct kbase_ioctl_cs_queue_kick { ++ __u64 buffer_gpu_addr; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_KICK \ ++ _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) ++ ++/** ++ * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group ++ * ++ * @in: Input parameters ++ * @in.buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @in.group_handle: Handle of the group to which the queue should be bound ++ * @in.csi_index: Index of the CSF interface the queue should be bound to ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.mmap_handle: Handle to be used for creating the mapping of CS ++ * input/output pages ++ */ ++union kbase_ioctl_cs_queue_bind { ++ struct { ++ __u64 buffer_gpu_addr; ++ __u8 group_handle; ++ __u8 csi_index; ++ __u8 padding[6]; ++ } in; ++ struct { ++ __u64 mmap_handle; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_BIND \ ++ _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) ++ ++/** ++ * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the ++ * base back-end in extended format, ++ * involving trace buffer configuration ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ * @buffer_size: Size of the buffer in bytes ++ * @priority: Priority of the queue within a group when run within a process ++ * @padding: Currently unused, must be zero ++ * @ex_offset_var_addr: GPU address of the trace buffer write offset variable ++ * @ex_buffer_base: Trace buffer GPU base address for the queue ++ * @ex_buffer_size: Size of the trace buffer in bytes ++ * @ex_event_size: Trace event write size, in log2 designation ++ * @ex_event_state: Trace event states configuration ++ * @ex_padding: Currently unused, must be zero ++ * ++ * @Note: There is an identical sub-section at the start of this struct to that ++ * of @ref kbase_ioctl_cs_queue_register. Any change of this sub-section ++ * must also be mirrored to the latter. 
Following the said sub-section, ++ * the remaining fields forms the extension, marked with ex_*. ++ */ ++struct kbase_ioctl_cs_queue_register_ex { ++ __u64 buffer_gpu_addr; ++ __u32 buffer_size; ++ __u8 priority; ++ __u8 padding[3]; ++ __u64 ex_offset_var_addr; ++ __u64 ex_buffer_base; ++ __u32 ex_buffer_size; ++ __u8 ex_event_size; ++ __u8 ex_event_state; ++ __u8 ex_padding[2]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ ++ _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) ++ ++/** ++ * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue ++ * ++ * @buffer_gpu_addr: GPU address of the buffer backing the queue ++ */ ++struct kbase_ioctl_cs_queue_terminate { ++ __u64 buffer_gpu_addr; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ ++ _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) ++ ++/** ++ * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue ++ * group ++ * @in: Input parameters ++ * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. ++ * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. ++ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. ++ * @in.cs_min: Minimum number of CSs required. ++ * @in.priority: Queue group's priority within a process. ++ * @in.tiler_max: Maximum number of tiler endpoints the group is allowed ++ * to use. ++ * @in.fragment_max: Maximum number of fragment endpoints the group is ++ * allowed to use. ++ * @in.compute_max: Maximum number of compute endpoints the group is allowed ++ * to use. ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.group_handle: Handle of a newly created queue group. ++ * @out.padding: Currently unused, must be zero ++ * @out.group_uid: UID of the queue group available to base. ++ */ ++union kbase_ioctl_cs_queue_group_create_1_6 { ++ struct { ++ __u64 tiler_mask; ++ __u64 fragment_mask; ++ __u64 compute_mask; ++ __u8 cs_min; ++ __u8 priority; ++ __u8 tiler_max; ++ __u8 fragment_max; ++ __u8 compute_max; ++ __u8 padding[3]; ++ ++ } in; ++ struct { ++ __u8 group_handle; ++ __u8 padding[3]; ++ __u32 group_uid; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ ++ _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) ++ ++/** ++ * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group ++ * @in: Input parameters ++ * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. ++ * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. ++ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. ++ * @in.cs_min: Minimum number of CSs required. ++ * @in.priority: Queue group's priority within a process. ++ * @in.tiler_max: Maximum number of tiler endpoints the group is allowed ++ * to use. ++ * @in.fragment_max: Maximum number of fragment endpoints the group is ++ * allowed to use. ++ * @in.compute_max: Maximum number of compute endpoints the group is allowed ++ * to use. ++ * @in.padding: Currently unused, must be zero ++ * @out: Output parameters ++ * @out.group_handle: Handle of a newly created queue group. ++ * @out.padding: Currently unused, must be zero ++ * @out.group_uid: UID of the queue group available to base. 
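Pulling the queue ioctls together, a rough sketch of the expected user-space sequence: create a group (the union body follows just below), then register and bind one queue to it. The device fd, ring-buffer address, endpoint masks and priority are placeholder assumptions, not values mandated by the interface:

#include <sys/ioctl.h>
#include "mali_kbase_ioctl.h"
#include "mali_kbase_csf_ioctl.h"
#include "mali_base_csf_kernel.h"

static int example_setup_queue(int fd, __u64 ring_va, __u32 ring_size)
{
        union kbase_ioctl_cs_queue_group_create create = { 0 };
        struct kbase_ioctl_cs_queue_register reg = { 0 };
        union kbase_ioctl_cs_queue_bind bind = { 0 };

        create.in.tiler_mask = create.in.fragment_mask = create.in.compute_mask = ~0ULL;
        create.in.cs_min = 1;
        create.in.priority = BASE_QUEUE_GROUP_PRIORITY_MEDIUM;
        create.in.tiler_max = create.in.fragment_max = create.in.compute_max = 64;
        if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE, &create))
                return -1;

        reg.buffer_gpu_addr = ring_va;
        reg.buffer_size = ring_size;
        if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_REGISTER, &reg))
                return -1;

        bind.in.buffer_gpu_addr = ring_va;
        bind.in.group_handle = create.out.group_handle;
        bind.in.csi_index = 0;
        if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind))
                return -1;

        /* bind.out.mmap_handle is then used with mmap() to map the
         * BASEP_QUEUE_NR_MMAP_USER_PAGES input/output and doorbell pages. */
        return 0;
}
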
++ */ ++union kbase_ioctl_cs_queue_group_create { ++ struct { ++ __u64 tiler_mask; ++ __u64 fragment_mask; ++ __u64 compute_mask; ++ __u8 cs_min; ++ __u8 priority; ++ __u8 tiler_max; ++ __u8 fragment_max; ++ __u8 compute_max; ++ __u8 padding[3]; ++ __u64 reserved; ++ } in; ++ struct { ++ __u8 group_handle; ++ __u8 padding[3]; ++ __u32 group_uid; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ ++ _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) ++ ++/** ++ * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group ++ * ++ * @group_handle: Handle of the queue group to be terminated ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_cs_queue_group_term { ++ __u8 group_handle; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ ++ _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) ++ ++#define KBASE_IOCTL_CS_EVENT_SIGNAL \ ++ _IO(KBASE_IOCTL_TYPE, 44) ++ ++typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue ++ * ++ * @id: ID of the new command queue returned by the kernel ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_new { ++ base_kcpu_queue_id id; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ ++ _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue ++ * ++ * @id: ID of the command queue to be destroyed ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_delete { ++ base_kcpu_queue_id id; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ ++ _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) ++ ++/** ++ * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue ++ * ++ * @addr: Memory address of an array of struct base_kcpu_queue_command ++ * @nr_commands: Number of commands in the array ++ * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl ++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_kcpu_queue_enqueue { ++ __u64 addr; ++ __u32 nr_commands; ++ base_kcpu_queue_id id; ++ __u8 padding[3]; ++}; ++ ++#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ ++ _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) ++ ++/** ++ * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap ++ * @in: Input parameters ++ * @in.chunk_size: Size of each chunk. ++ * @in.initial_chunks: Initial number of chunks that heap will be created with. ++ * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. ++ * @in.target_in_flight: Number of render-passes that the driver should attempt to ++ * keep in flight for which allocation of new chunks is ++ * allowed. ++ * @in.group_id: Group ID to be used for physical allocations. ++ * @in.padding: Padding ++ * @out: Output parameters ++ * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up ++ * for the heap. ++ * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, ++ * actually points to the header of heap chunk and not to ++ * the low address of free memory in the chunk. 
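Before the tiler-heap interface below, a companion sketch for the KCPU queue ioctls above: create a software queue, then enqueue a single CQS_SET command built from the structures in mali_base_csf_kernel.h. The GPU address is a placeholder and error handling is minimal:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include "mali_kbase_ioctl.h"
#include "mali_kbase_csf_ioctl.h"
#include "mali_base_csf_kernel.h"

static int example_kcpu_cqs_set(int fd, __u64 cqs_gpu_va)
{
        struct kbase_ioctl_kcpu_queue_new queue = { 0 };
        struct base_cqs_set obj = { .addr = cqs_gpu_va };
        struct base_kcpu_command cmd;
        struct kbase_ioctl_kcpu_queue_enqueue enq = { 0 };

        if (ioctl(fd, KBASE_IOCTL_KCPU_QUEUE_CREATE, &queue))
                return -1;

        memset(&cmd, 0, sizeof(cmd));   /* padding must be zero */
        cmd.type = BASE_KCPU_COMMAND_TYPE_CQS_SET;
        cmd.info.cqs_set.objs = (__u64)(uintptr_t)&obj;  /* user pointer as u64 */
        cmd.info.cqs_set.nr_objs = 1;

        enq.addr = (__u64)(uintptr_t)&cmd;
        enq.nr_commands = 1;
        enq.id = queue.id;
        return ioctl(fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enq);
}
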
++ */ ++union kbase_ioctl_cs_tiler_heap_init { ++ struct { ++ __u32 chunk_size; ++ __u32 initial_chunks; ++ __u32 max_chunks; ++ __u16 target_in_flight; ++ __u8 group_id; ++ __u8 padding; ++ } in; ++ struct { ++ __u64 gpu_heap_va; ++ __u64 first_chunk_va; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ ++ _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) ++ ++/** ++ * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap ++ * instance ++ * ++ * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. ++ */ ++struct kbase_ioctl_cs_tiler_heap_term { ++ __u64 gpu_heap_va; ++}; ++ ++#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ ++ _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) ++ ++/** ++ * union kbase_ioctl_cs_get_glb_iface - Request the global control block ++ * of CSF interface capabilities ++ * ++ * @in: Input parameters ++ * @in.max_group_num: The maximum number of groups to be read. Can be 0, in ++ * which case groups_ptr is unused. ++ * @in.max_total_stream _num: The maximum number of CSs to be read. Can be 0, in ++ * which case streams_ptr is unused. ++ * @in.groups_ptr: Pointer where to store all the group data (sequentially). ++ * @in.streams_ptr: Pointer where to store all the CS data (sequentially). ++ * @out: Output parameters ++ * @out.glb_version: Global interface version. ++ * @out.features: Bit mask of features (e.g. whether certain types of job ++ * can be suspended). ++ * @out.group_num: Number of CSGs supported. ++ * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 ++ * hold the size of firmware performance counter data ++ * and 15:0 hold the size of hardware performance counter ++ * data. ++ * @out.total_stream_num: Total number of CSs, summed across all groups. ++ * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum ++ * size of events. Bits 3:0 hold the offset update rate. 
++ * (csf >= 1.1.0) ++ * ++ */ ++union kbase_ioctl_cs_get_glb_iface { ++ struct { ++ __u32 max_group_num; ++ __u32 max_total_stream_num; ++ __u64 groups_ptr; ++ __u64 streams_ptr; ++ } in; ++ struct { ++ __u32 glb_version; ++ __u32 features; ++ __u32 group_num; ++ __u32 prfcnt_size; ++ __u32 total_stream_num; ++ __u32 instr_features; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CS_GET_GLB_IFACE \ ++ _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) ++ ++struct kbase_ioctl_cs_cpu_queue_info { ++ __u64 buffer; ++ __u64 size; ++}; ++ ++#define KBASE_IOCTL_VERSION_CHECK \ ++ _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) ++ ++#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ ++ _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) ++ ++/*************** ++ * test ioctls * ++ ***************/ ++#if MALI_UNIT_TEST ++/* These ioctls are purely for test purposes and are not used in the production ++ * driver, they therefore may change without notice ++ */ ++ ++/** ++ * struct kbase_ioctl_cs_event_memory_write - Write an event memory address ++ * @cpu_addr: Memory address to write ++ * @value: Value to write ++ * @padding: Currently unused, must be zero ++ */ ++struct kbase_ioctl_cs_event_memory_write { ++ __u64 cpu_addr; ++ __u8 value; ++ __u8 padding[7]; ++}; ++ ++/** ++ * union kbase_ioctl_cs_event_memory_read - Read an event memory address ++ * @in: Input parameters ++ * @in.cpu_addr: Memory address to read ++ * @out: Output parameters ++ * @out.value: Value read ++ * @out.padding: Currently unused, must be zero ++ */ ++union kbase_ioctl_cs_event_memory_read { ++ struct { ++ __u64 cpu_addr; ++ } in; ++ struct { ++ __u8 value; ++ __u8 padding[7]; ++ } out; ++}; ++ ++#endif /* MALI_UNIT_TEST */ ++ ++#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ +diff --git a/src/panfrost/csf_test/mali_kbase_ioctl.h b/src/panfrost/csf_test/mali_kbase_ioctl.h +new file mode 100644 +index 00000000000..fc81b71b46a +--- /dev/null ++++ b/src/panfrost/csf_test/mali_kbase_ioctl.h +@@ -0,0 +1,854 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * ++ * (C) COPYRIGHT 2017-2021 ARM Limited. All rights reserved. ++ * ++ * This program is free software and is provided to you under the terms of the ++ * GNU General Public License version 2 as published by the Free Software ++ * Foundation, and any use by you of this program is subject to the terms ++ * of such GNU license. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, you can access it online at ++ * http://www.gnu.org/licenses/gpl-2.0.html. 
++ * ++ */ ++ ++#ifndef _UAPI_KBASE_IOCTL_H_ ++#define _UAPI_KBASE_IOCTL_H_ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#include <asm/ioctl.h> ++#include <linux/types.h> ++ ++#define KBASE_IOCTL_TYPE 0x80 ++ ++/** ++ * struct kbase_ioctl_set_flags - Set kernel context creation flags ++ * ++ * @create_flags: Flags - see base_context_create_flags ++ */ ++struct kbase_ioctl_set_flags { ++ __u32 create_flags; ++}; ++ ++#define KBASE_IOCTL_SET_FLAGS \ ++ _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) ++ ++/** ++ * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel ++ * ++ * @buffer: Pointer to the buffer to store properties into ++ * @size: Size of the buffer ++ * @flags: Flags - must be zero for now ++ * ++ * The ioctl will return the number of bytes stored into @buffer or an error ++ * on failure (e.g. @size is too small). If @size is specified as 0 then no ++ * data will be written but the return value will be the number of bytes needed ++ * for all the properties. ++ * ++ * @flags may be used in the future to request a different format for the ++ * buffer. With @flags == 0 the following format is used. ++ * ++ * The buffer will be filled with pairs of values, a __u32 key identifying the ++ * property followed by the value. The size of the value is identified using ++ * the bottom bits of the key. The value then immediately follows the key and ++ * is tightly packed (there is no padding). All keys and values are ++ * little-endian. ++ * ++ * 00 = __u8 ++ * 01 = __u16 ++ * 10 = __u32 ++ * 11 = __u64 ++ */ ++struct kbase_ioctl_get_gpuprops { ++ __u64 buffer; ++ __u32 size; ++ __u32 flags; ++}; ++ ++#define KBASE_IOCTL_GET_GPUPROPS \ ++ _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) ++ ++/** ++ * union kbase_ioctl_mem_alloc - Allocate memory on the GPU ++ * @in: Input parameters ++ * @in.va_pages: The number of pages of virtual address space to reserve ++ * @in.commit_pages: The number of physical pages to allocate ++ * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region ++ * @in.flags: Flags ++ * @out: Output parameters ++ * @out.flags: Flags ++ * @out.gpu_va: The GPU virtual address which is allocated ++ */ ++union kbase_ioctl_mem_alloc { ++ struct { ++ __u64 va_pages; ++ __u64 commit_pages; ++ __u64 extension; ++ __u64 flags; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_ALLOC \ ++ _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) ++ ++/** ++ * union kbase_ioctl_mem_query - Query properties of a GPU memory region ++ * @in: Input parameters ++ * @in.gpu_addr: A GPU address contained within the region ++ * @in.query: The type of query ++ * @out: Output parameters ++ * @out.value: The result of the query ++ * ++ * Use a %KBASE_MEM_QUERY_xxx flag as input for @query.
++ */ ++union kbase_ioctl_mem_query { ++ struct { ++ __u64 gpu_addr; ++ __u64 query; ++ } in; ++ struct { ++ __u64 value; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_QUERY \ ++ _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) ++ ++#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) ++#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) ++#define KBASE_MEM_QUERY_FLAGS ((__u64)3) ++ ++/** ++ * struct kbase_ioctl_mem_free - Free a memory region ++ * @gpu_addr: Handle to the region to free ++ */ ++struct kbase_ioctl_mem_free { ++ __u64 gpu_addr; ++}; ++ ++#define KBASE_IOCTL_MEM_FREE \ ++ _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) ++ ++/** ++ * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader ++ * @buffer_count: requested number of dumping buffers ++ * @fe_bm: counters selection bitmask (Front end) ++ * @shader_bm: counters selection bitmask (Shader) ++ * @tiler_bm: counters selection bitmask (Tiler) ++ * @mmu_l2_bm: counters selection bitmask (MMU_L2) ++ * ++ * A fd is returned from the ioctl if successful, or a negative value on error ++ */ ++struct kbase_ioctl_hwcnt_reader_setup { ++ __u32 buffer_count; ++ __u32 fe_bm; ++ __u32 shader_bm; ++ __u32 tiler_bm; ++ __u32 mmu_l2_bm; ++}; ++ ++#define KBASE_IOCTL_HWCNT_READER_SETUP \ ++ _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) ++ ++/** ++ * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. ++ * @data: Counter samples for the dummy model. ++ * @size: Size of the counter sample data. ++ * @padding: Padding. ++ */ ++struct kbase_ioctl_hwcnt_values { ++ __u64 data; ++ __u32 size; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_HWCNT_SET \ ++ _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) ++ ++/** ++ * struct kbase_ioctl_disjoint_query - Query the disjoint counter ++ * @counter: A counter of disjoint events in the kernel ++ */ ++struct kbase_ioctl_disjoint_query { ++ __u32 counter; ++}; ++ ++#define KBASE_IOCTL_DISJOINT_QUERY \ ++ _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) ++ ++/** ++ * struct kbase_ioctl_get_ddk_version - Query the kernel version ++ * @version_buffer: Buffer to receive the kernel version string ++ * @size: Size of the buffer ++ * @padding: Padding ++ * ++ * The ioctl will return the number of bytes written into version_buffer ++ * (which includes a NULL byte) or a negative error code ++ * ++ * The ioctl request code has to be _IOW because the data in ioctl struct is ++ * being copied to the kernel, even though the kernel then writes out the ++ * version info to the buffer specified in the ioctl. ++ */ ++struct kbase_ioctl_get_ddk_version { ++ __u64 version_buffer; ++ __u32 size; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_GET_DDK_VERSION \ ++ _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory ++ * allocator (between kernel driver ++ * version 10.2--11.4) ++ * @va_pages: Number of VA pages to reserve for JIT ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. ++ * ++ * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for ++ * backwards compatibility. 
++ */ ++struct kbase_ioctl_mem_jit_init_10_2 { ++ __u64 va_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory ++ * allocator (between kernel driver ++ * version 11.5--11.19) ++ * @va_pages: Number of VA pages to reserve for JIT ++ * @max_allocations: Maximum number of concurrent allocations ++ * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) ++ * @group_id: Group ID to be used for physical allocations ++ * @padding: Currently unused, must be zero ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. ++ * ++ * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for ++ * backwards compatibility. ++ */ ++struct kbase_ioctl_mem_jit_init_11_5 { ++ __u64 va_pages; ++ __u8 max_allocations; ++ __u8 trim_level; ++ __u8 group_id; ++ __u8 padding[5]; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) ++ ++/** ++ * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory ++ * allocator ++ * @va_pages: Number of GPU virtual address pages to reserve for just-in-time ++ * memory allocations ++ * @max_allocations: Maximum number of concurrent allocations ++ * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) ++ * @group_id: Group ID to be used for physical allocations ++ * @padding: Currently unused, must be zero ++ * @phys_pages: Maximum number of physical pages to allocate just-in-time ++ * ++ * Note that depending on the VA size of the application and GPU, the value ++ * specified in @va_pages may be ignored. ++ */ ++struct kbase_ioctl_mem_jit_init { ++ __u64 va_pages; ++ __u8 max_allocations; ++ __u8 trim_level; ++ __u8 group_id; ++ __u8 padding[5]; ++ __u64 phys_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_JIT_INIT \ ++ _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) ++ ++/** ++ * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory ++ * ++ * @handle: GPU memory handle (GPU VA) ++ * @user_addr: The address where it is mapped in user space ++ * @size: The number of bytes to synchronise ++ * @type: The direction to synchronise: 0 is sync to memory (clean), ++ * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. 
++ * @padding: Padding to round up to a multiple of 8 bytes, must be zero ++ */ ++struct kbase_ioctl_mem_sync { ++ __u64 handle; ++ __u64 user_addr; ++ __u64 size; ++ __u8 type; ++ __u8 padding[7]; ++}; ++ ++#define KBASE_IOCTL_MEM_SYNC \ ++ _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) ++ ++/** ++ * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer ++ * ++ * @in: Input parameters ++ * @in.gpu_addr: The GPU address of the memory region ++ * @in.cpu_addr: The CPU address to locate ++ * @in.size: A size in bytes to validate is contained within the region ++ * @out: Output parameters ++ * @out.offset: The offset from the start of the memory region to @cpu_addr ++ */ ++union kbase_ioctl_mem_find_cpu_offset { ++ struct { ++ __u64 gpu_addr; ++ __u64 cpu_addr; ++ __u64 size; ++ } in; ++ struct { ++ __u64 offset; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ ++ _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) ++ ++/** ++ * struct kbase_ioctl_get_context_id - Get the kernel context ID ++ * ++ * @id: The kernel context ID ++ */ ++struct kbase_ioctl_get_context_id { ++ __u32 id; ++}; ++ ++#define KBASE_IOCTL_GET_CONTEXT_ID \ ++ _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) ++ ++/** ++ * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd ++ * ++ * @flags: Flags ++ * ++ * The ioctl returns a file descriptor when successful ++ */ ++struct kbase_ioctl_tlstream_acquire { ++ __u32 flags; ++}; ++ ++#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ ++ _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) ++ ++#define KBASE_IOCTL_TLSTREAM_FLUSH \ ++ _IO(KBASE_IOCTL_TYPE, 19) ++ ++/** ++ * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region ++ * ++ * @gpu_addr: The memory region to modify ++ * @pages: The number of physical pages that should be present ++ * ++ * The ioctl may return on the following error codes or 0 for success: ++ * -ENOMEM: Out of memory ++ * -EINVAL: Invalid arguments ++ */ ++struct kbase_ioctl_mem_commit { ++ __u64 gpu_addr; ++ __u64 pages; ++}; ++ ++#define KBASE_IOCTL_MEM_COMMIT \ ++ _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) ++ ++/** ++ * union kbase_ioctl_mem_alias - Create an alias of memory regions ++ * @in: Input parameters ++ * @in.flags: Flags, see BASE_MEM_xxx ++ * @in.stride: Bytes between start of each memory region ++ * @in.nents: The number of regions to pack together into the alias ++ * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info ++ * @out: Output parameters ++ * @out.flags: Flags, see BASE_MEM_xxx ++ * @out.gpu_va: Address of the new alias ++ * @out.va_pages: Size of the new alias ++ */ ++union kbase_ioctl_mem_alias { ++ struct { ++ __u64 flags; ++ __u64 stride; ++ __u64 nents; ++ __u64 aliasing_info; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ __u64 va_pages; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_ALIAS \ ++ _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) ++ ++/** ++ * union kbase_ioctl_mem_import - Import memory for use by the GPU ++ * @in: Input parameters ++ * @in.flags: Flags, see BASE_MEM_xxx ++ * @in.phandle: Handle to the external memory ++ * @in.type: Type of external memory, see base_mem_import_type ++ * @in.padding: Amount of extra VA pages to append to the imported buffer ++ * @out: Output parameters ++ * @out.flags: Flags, see BASE_MEM_xxx ++ * @out.gpu_va: Address of the new alias ++ * @out.va_pages: Size of the new alias ++ */ ++union kbase_ioctl_mem_import { ++ struct { ++ 
__u64 flags; ++ __u64 phandle; ++ __u32 type; ++ __u32 padding; ++ } in; ++ struct { ++ __u64 flags; ++ __u64 gpu_va; ++ __u64 va_pages; ++ } out; ++}; ++ ++#define KBASE_IOCTL_MEM_IMPORT \ ++ _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) ++ ++/** ++ * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region ++ * @gpu_va: The GPU region to modify ++ * @flags: The new flags to set ++ * @mask: Mask of the flags to modify ++ */ ++struct kbase_ioctl_mem_flags_change { ++ __u64 gpu_va; ++ __u64 flags; ++ __u64 mask; ++}; ++ ++#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ ++ _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) ++ ++/** ++ * struct kbase_ioctl_stream_create - Create a synchronisation stream ++ * @name: A name to identify this stream. Must be NULL-terminated. ++ * ++ * Note that this is also called a "timeline", but is named stream to avoid ++ * confusion with other uses of the word. ++ * ++ * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. ++ * ++ * The ioctl returns a file descriptor. ++ */ ++struct kbase_ioctl_stream_create { ++ char name[32]; ++}; ++ ++#define KBASE_IOCTL_STREAM_CREATE \ ++ _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) ++ ++/** ++ * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence ++ * @fd: The file descriptor to validate ++ */ ++struct kbase_ioctl_fence_validate { ++ int fd; ++}; ++ ++#define KBASE_IOCTL_FENCE_VALIDATE \ ++ _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) ++ ++/** ++ * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel ++ * @buffer: Pointer to the information ++ * @len: Length ++ * @padding: Padding ++ * ++ * The data provided is accessible through a debugfs file ++ */ ++struct kbase_ioctl_mem_profile_add { ++ __u64 buffer; ++ __u32 len; ++ __u32 padding; ++}; ++ ++#define KBASE_IOCTL_MEM_PROFILE_ADD \ ++ _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) ++ ++/** ++ * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource ++ * @count: Number of resources ++ * @address: Array of __u64 GPU addresses of the external resources to map ++ */ ++struct kbase_ioctl_sticky_resource_map { ++ __u64 count; ++ __u64 address; ++}; ++ ++#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ ++ _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) ++ ++/** ++ * struct kbase_ioctl_sticky_resource_map - Unmap a resource mapped which was ++ * previously permanently mapped ++ * @count: Number of resources ++ * @address: Array of __u64 GPU addresses of the external resources to unmap ++ */ ++struct kbase_ioctl_sticky_resource_unmap { ++ __u64 count; ++ __u64 address; ++}; ++ ++#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ ++ _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) ++ ++/** ++ * union kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of ++ * the GPU memory region for ++ * the given gpu address and ++ * the offset of that address ++ * into the region ++ * @in: Input parameters ++ * @in.gpu_addr: GPU virtual address ++ * @in.size: Size in bytes within the region ++ * @out: Output parameters ++ * @out.start: Address of the beginning of the memory region enclosing @gpu_addr ++ * for the length of @offset bytes ++ * @out.offset: The offset from the start of the memory region to @gpu_addr ++ */ ++union kbase_ioctl_mem_find_gpu_start_and_offset { ++ struct { ++ __u64 gpu_addr; ++ __u64 size; ++ } in; ++ struct { ++ __u64 start; ++ __u64 offset; ++ } out; ++}; 
++ ++#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ ++ _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) ++ ++#define KBASE_IOCTL_CINSTR_GWT_START \ ++ _IO(KBASE_IOCTL_TYPE, 33) ++ ++#define KBASE_IOCTL_CINSTR_GWT_STOP \ ++ _IO(KBASE_IOCTL_TYPE, 34) ++ ++/** ++ * union kbase_ioctl_gwt_dump - Used to collect all GPU write fault addresses. ++ * @in: Input parameters ++ * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. ++ * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) ++ * @in.len: Number of addresses the buffers can hold. ++ * @in.padding: padding ++ * @out: Output parameters ++ * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. ++ * @out.more_data_available: Status indicating if more addresses are available. ++ * @out.padding: padding ++ * ++ * This structure is used when performing a call to dump GPU write fault ++ * addresses. ++ */ ++union kbase_ioctl_cinstr_gwt_dump { ++ struct { ++ __u64 addr_buffer; ++ __u64 size_buffer; ++ __u32 len; ++ __u32 padding; ++ ++ } in; ++ struct { ++ __u32 no_of_addr_collected; ++ __u8 more_data_available; ++ __u8 padding[27]; ++ } out; ++}; ++ ++#define KBASE_IOCTL_CINSTR_GWT_DUMP \ ++ _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) ++ ++/** ++ * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone ++ * ++ * @va_pages: Number of VA pages to reserve for EXEC_VA ++ */ ++struct kbase_ioctl_mem_exec_init { ++ __u64 va_pages; ++}; ++ ++#define KBASE_IOCTL_MEM_EXEC_INIT \ ++ _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) ++ ++/** ++ * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of ++ * cpu/gpu time (counter values) ++ * @in: Input parameters ++ * @in.request_flags: Bit-flags indicating the requested types. ++ * @in.paddings: Unused, size alignment matching the out. ++ * @out: Output parameters ++ * @out.sec: Integer field of the monotonic time, unit in seconds. ++ * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. ++ * @out.padding: Unused, for __u64 alignment ++ * @out.timestamp: System wide timestamp (counter) value. ++ * @out.cycle_counter: GPU cycle counter value. ++ */ ++union kbase_ioctl_get_cpu_gpu_timeinfo { ++ struct { ++ __u32 request_flags; ++ __u32 paddings[7]; ++ } in; ++ struct { ++ __u64 sec; ++ __u32 nsec; ++ __u32 padding; ++ __u64 timestamp; ++ __u64 cycle_counter; ++ } out; ++}; ++ ++#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ ++ _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) ++ ++/** ++ * struct kbase_ioctl_context_priority_check - Check the max possible priority ++ * @priority: Input priority & output priority ++ */ ++ ++struct kbase_ioctl_context_priority_check { ++ __u8 priority; ++}; ++ ++#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ ++ _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) ++ ++/** ++ * struct kbase_ioctl_set_limited_core_count - Set the limited core count. ++ * ++ * @max_core_count: Maximum core count ++ */ ++struct kbase_ioctl_set_limited_core_count { ++ __u8 max_core_count; ++}; ++ ++#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ ++ _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) ++ ++/** ++ * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter ++ * information ++ * @info_item_size: Performance counter item size in bytes. ++ * @info_item_count: Performance counter item count in the info_list_ptr. 
++ * @info_list_ptr: Performance counter item list pointer which points to a ++ * list with info_item_count of items. ++ * ++ * On success: returns info_item_size and info_item_count if info_list_ptr is ++ * NULL, returns performance counter information if info_list_ptr is not NULL. ++ * On error: returns a negative error code. ++ */ ++struct kbase_ioctl_kinstr_prfcnt_enum_info { ++ __u32 info_item_size; ++ __u32 info_item_count; ++ __u64 info_list_ptr; ++}; ++ ++#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ ++ _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) ++ ++/** ++ * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader ++ * @in: input parameters. ++ * @in.request_item_count: Number of requests in the requests array. ++ * @in.request_item_size: Size in bytes of each request in the requests array. ++ * @in.requests_ptr: Pointer to the requests array. ++ * @out: output parameters. ++ * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for ++ * each sample. ++ * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap ++ * for reading performance counter samples. ++ * ++ * A fd is returned from the ioctl if successful, or a negative value on error. ++ */ ++union kbase_ioctl_kinstr_prfcnt_setup { ++ struct { ++ __u32 request_item_count; ++ __u32 request_item_size; ++ __u64 requests_ptr; ++ } in; ++ struct { ++ __u32 prfcnt_metadata_item_size; ++ __u32 prfcnt_mmap_size_bytes; ++ } out; ++}; ++ ++#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ ++ _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) ++ ++/*************** ++ * test ioctls * ++ ***************/ ++#if MALI_UNIT_TEST ++/* These ioctls are purely for test purposes and are not used in the production ++ * driver, they therefore may change without notice ++ */ ++ ++#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) ++ ++ ++/** ++ * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes ++ * @bytes_collected: number of bytes read by user ++ * @bytes_generated: number of bytes generated by tracepoints ++ */ ++struct kbase_ioctl_tlstream_stats { ++ __u32 bytes_collected; ++ __u32 bytes_generated; ++}; ++ ++#define KBASE_IOCTL_TLSTREAM_STATS \ ++ _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) ++ ++#endif /* MALI_UNIT_TEST */ ++ ++/* Customer extension range */ ++#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) ++ ++/* If the integration needs extra ioctl add them there ++ * like this: ++ * ++ * struct my_ioctl_args { ++ * .... 
++ * } ++ * ++ * #define KBASE_IOCTL_MY_IOCTL \ ++ * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) ++ */ ++ ++ ++/********************************** ++ * Definitions for GPU properties * ++ **********************************/ ++#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) ++#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) ++#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) ++#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) ++ ++#define KBASE_GPUPROP_PRODUCT_ID 1 ++#define KBASE_GPUPROP_VERSION_STATUS 2 ++#define KBASE_GPUPROP_MINOR_REVISION 3 ++#define KBASE_GPUPROP_MAJOR_REVISION 4 ++/* 5 previously used for GPU speed */ ++#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 ++/* 7 previously used for minimum GPU speed */ ++#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 ++#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 ++#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 ++ ++#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 ++#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 ++#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 ++ ++#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 ++#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 ++ ++#define KBASE_GPUPROP_MAX_THREADS 18 ++#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 ++#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 ++#define KBASE_GPUPROP_MAX_REGISTERS 21 ++#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 ++#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 ++#define KBASE_GPUPROP_IMPL_TECH 24 ++ ++#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 ++#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 ++#define KBASE_GPUPROP_RAW_L2_PRESENT 27 ++#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 ++#define KBASE_GPUPROP_RAW_L2_FEATURES 29 ++#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 ++#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 ++#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 ++#define KBASE_GPUPROP_RAW_AS_PRESENT 33 ++#define KBASE_GPUPROP_RAW_JS_PRESENT 34 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 ++#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 ++#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 ++#define KBASE_GPUPROP_RAW_GPU_ID 55 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 ++#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 ++#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 ++#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 ++ ++#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 ++#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 ++#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 ++#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 ++#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 ++#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 ++#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 ++#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 ++#define 
KBASE_GPUPROP_COHERENCY_GROUP_5 69 ++#define KBASE_GPUPROP_COHERENCY_GROUP_6 70 ++#define KBASE_GPUPROP_COHERENCY_GROUP_7 71 ++#define KBASE_GPUPROP_COHERENCY_GROUP_8 72 ++#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 ++#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 ++#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 ++#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 ++#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 ++#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 ++#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 ++ ++#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 ++#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 ++ ++#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 ++ ++#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 ++#define KBASE_GPUPROP_TLS_ALLOC 84 ++#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _UAPI_KBASE_IOCTL_H_ */ +diff --git a/src/panfrost/csf_test/test.c b/src/panfrost/csf_test/test.c +new file mode 100644 +index 00000000000..cb9ff398314 +--- /dev/null ++++ b/src/panfrost/csf_test/test.c +@@ -0,0 +1,1903 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE.
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "util/macros.h" ++ ++#include "mali_kbase_csf_ioctl.h" ++#include "mali_kbase_ioctl.h" ++#include "mali_base_kernel.h" ++#include "mali_base_csf_kernel.h" ++#include "mali_gpu_csf_registers.h" ++ ++#define PAN_ARCH 10 ++#include "genxml/gen_macros.h" ++ ++#include "wrap.h" ++#include "decode.h" ++ ++#include "pan_shader.h" ++#include "compiler/nir/nir_builder.h" ++#include "bifrost/valhall/disassemble.h" ++ ++#define CS_EVENT_REGISTER 0x5A ++ ++static bool pr = true; ++static bool colour_term = true; ++ ++static void ++dump_start(FILE *f) ++{ ++ if (colour_term) ++ fprintf(f, "\x1b[90m"); ++} ++ ++static void ++dump_end(FILE *f) ++{ ++ if (colour_term) ++ fprintf(f, "\x1b[39m"); ++} ++ ++/* TODO: Use KBASE_IOCTL_MEM_SYNC for 32-bit systems */ ++static void ++cache_clean(volatile void *addr) ++{ ++#ifdef __aarch64__ ++ __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); ++#endif ++} ++ ++static void ++cache_invalidate(volatile void *addr) ++{ ++#ifdef __aarch64__ ++ __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); ++#endif ++} ++ ++static void ++cache_barrier(void) ++{ ++#ifdef __ARM_ARCH ++ __asm__ volatile ("dsb sy" ::: "memory"); ++#endif ++} ++ ++static void ++memory_barrier(void) ++{ ++#ifdef __ARM_ARCH ++ __asm__ volatile ("dmb sy" ::: "memory"); ++#endif ++} ++ ++typedef void (*cacheline_op)(volatile void *addr); ++ ++#define CACHELINE_SIZE 64 ++ ++static void ++cacheline_op_range(volatile void *start, unsigned length, cacheline_op op) ++{ ++ volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); ++ volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); ++ for (; ptr < end; ptr += CACHELINE_SIZE) ++ op(ptr); ++} ++ ++static void ++cache_clean_range(volatile void *start, unsigned length) ++{ ++ cacheline_op_range(start, length, cache_clean); ++} ++ ++static void ++cache_invalidate_range(volatile void *start, unsigned length) ++{ ++ cacheline_op_range(start, length, cache_invalidate); ++} ++ ++struct state; ++struct test; ++ ++typedef bool (* section)(struct state *s, struct test *t); ++ ++#define CS_QUEUE_COUNT 4 /* compute / vertex / fragment / other */ ++#define CS_QUEUE_SIZE 65536 ++ ++struct state { ++ int page_size; ++ int argc; ++ char **argv; ++ ++ int mali_fd; ++ int tl_fd; ++ void *tracking_region; ++ void *csf_user_reg; ++ ++ uint8_t *gpuprops; ++ unsigned gpuprops_size; ++ uint32_t gpu_id; ++ ++ struct { ++ struct panfrost_ptr normal, exec, coherent, cached, event, ev2; ++ } allocations; ++ ++ uint64_t tiler_heap_va; ++ uint64_t tiler_heap_header; ++ ++ uint8_t csg_handle; ++ uint32_t csg_uid; ++ ++ struct panfrost_ptr cs_mem[CS_QUEUE_COUNT]; ++ void *cs_user_io[CS_QUEUE_COUNT]; ++ unsigned cs_last_submit[CS_QUEUE_COUNT]; ++ struct pan_command_stream cs[CS_QUEUE_COUNT]; ++ ++ unsigned shader_alloc_offset; ++ mali_ptr compute_shader; ++}; ++ ++struct test { ++ section part; ++ section cleanup; ++ const char *label; ++ ++ struct test *subtests; ++ unsigned sub_length; ++ ++ /* for allocation tests */ ++ unsigned offset; ++ unsigned flags; ++ ++ bool add; ++ bool invalid; ++ bool blit; ++ bool vertex; ++}; ++ ++/* See STATE and ALLOC macros below */ ++#define DEREF_STATE(s, offset) ((void*) s + offset) ++ ++static uint64_t ++pan_get_gpuprop(struct state *s, int name) ++{ ++ int i = 0; ++ uint64_t x = 0; ++ while (i < s->gpuprops_size) { ++ 
x = 0; ++ memcpy(&x, s->gpuprops + i, 4); ++ i += 4; ++ ++ int size = 1 << (x & 3); ++ int this_name = x >> 2; ++ ++ x = 0; ++ memcpy(&x, s->gpuprops + i, size); ++ i += size; ++ ++ if (this_name == name) ++ return x; ++ } ++ ++ fprintf(stderr, "Unknown prop %i\n", name); ++ return 0; ++} ++ ++static bool ++open_kbase(struct state *s, struct test *t) ++{ ++ s->mali_fd = open("/dev/mali0", O_RDWR); ++ if (s->mali_fd != -1) ++ return true; ++ ++ perror("open(\"/dev/mali0\")"); ++ return false; ++} ++ ++static bool ++close_kbase(struct state *s, struct test *t) ++{ ++ if (getenv("TEST_CHECK_LEAKS")) { ++ int pid = getpid(); ++ char cmd_buffer[64] = {0}; ++ sprintf(cmd_buffer, "grep /dev/mali /proc/%i/maps", pid); ++ system(cmd_buffer); ++ sprintf(cmd_buffer, "ls -l /proc/%i/fd", pid); ++ system(cmd_buffer); ++ } ++ ++ if (s->mali_fd > 0) ++ return close(s->mali_fd) == 0; ++ return true; ++} ++ ++static bool ++get_version(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_version_check ver = { 0 }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_VERSION_CHECK, &ver); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_VERSION_CHECK)"); ++ return false; ++ } ++ ++ if (pr) ++ printf("Major %i Minor %i: ", ver.major, ver.minor); ++ return true; ++} ++ ++static bool ++set_flags(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_set_flags flags = { ++ .create_flags = 0 ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_SET_FLAGS, &flags); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++mmap_tracking(struct state *s, struct test *t) ++{ ++ s->tracking_region = mmap(NULL, s->page_size, PROT_NONE, ++ MAP_SHARED, s->mali_fd, ++ BASE_MEM_MAP_TRACKING_HANDLE); ++ ++ if (s->tracking_region == MAP_FAILED) { ++ perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); ++ s->tracking_region = NULL; ++ return false; ++ } ++ return true; ++} ++ ++static bool ++munmap_tracking(struct state *s, struct test *t) ++{ ++ if (s->tracking_region) ++ return munmap(s->tracking_region, s->page_size) == 0; ++ return true; ++} ++ ++static bool ++get_gpuprops(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_get_gpuprops props = { 0 }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); ++ return false; ++ } else if (!ret) { ++ fprintf(stderr, "GET_GPUPROPS returned zero size\n"); ++ return false; ++ } ++ ++ s->gpuprops_size = ret; ++ s->gpuprops = calloc(s->gpuprops_size, 1); ++ ++ props.size = s->gpuprops_size; ++ props.buffer = (uint64_t)(uintptr_t) s->gpuprops; ++ ++ ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++free_gpuprops(struct state *s, struct test *t) ++{ ++ free(s->gpuprops); ++ return true; ++} ++ ++static bool ++get_gpu_id(struct state *s, struct test *t) ++{ ++ uint64_t gpu_id = pan_get_gpuprop(s, KBASE_GPUPROP_PRODUCT_ID); ++ if (!gpu_id) ++ return false; ++ s->gpu_id = gpu_id; ++ ++ uint16_t maj = gpu_id >> 12; ++ uint16_t min = (gpu_id >> 8) & 0xf; ++ uint16_t rev = (gpu_id >> 4) & 0xf; ++ ++ uint16_t product = gpu_id & 0xf; ++ uint16_t prod = product | ((maj & 1) << 4); ++ ++ const char *names[] = { ++ [1] = "TDUX", ++ [2] = "G710", ++ [3] = "G510", ++ [4] = "G310", ++ [7] = "G610", ++ [16 + 2] = "G715", /* TODO: Immortalis instead of Mali? 
*/ ++ [16 + 3] = "G615", ++ }; ++ const char *name = (prod < ARRAY_SIZE(names)) ? names[prod] : NULL; ++ if (!name) ++ name = "unknown"; ++ ++ if (pr) ++ printf("v%i.%i.%i Mali-%s (%i): ", maj, min, rev, name, product); ++ ++ if (maj < 10) { ++ printf("not v10 or later: "); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++get_coherency_mode(struct state *s, struct test *t) ++{ ++ uint64_t mode = pan_get_gpuprop(s, KBASE_GPUPROP_RAW_COHERENCY_MODE); ++ ++ const char *modes[] = { ++ [0] = "ACE-Lite", ++ [1] = "ACE", ++ [31] = "None", ++ }; ++ const char *name = (mode < ARRAY_SIZE(modes)) ? modes[mode] : NULL; ++ if (!name) ++ name = "Unknown"; ++ ++ if (pr) ++ printf("0x%"PRIx64" (%s): ", mode, name); ++ return true; ++} ++ ++static bool ++get_csf_caps(struct state *s, struct test *t) ++{ ++ union kbase_ioctl_cs_get_glb_iface iface = { 0 }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(0))"); ++ return false; ++ } ++ ++ int ver_maj = iface.out.glb_version >> 24; ++ int ver_min = (iface.out.glb_version >> 16) & 0xff; ++ int ver_rev = iface.out.glb_version & 0xffff; ++ ++ if (pr) ++ printf("v%i.%i.%i: feature mask 0x%x, %i groups, %i total: ", ++ ver_maj, ver_min, ver_rev, iface.out.features, ++ iface.out.group_num, iface.out.total_stream_num); ++ ++ unsigned group_num = iface.out.group_num; ++ unsigned stream_num = iface.out.total_stream_num; ++ ++ struct basep_cs_group_control *group_data = ++ calloc(group_num, sizeof(*group_data)); ++ ++ struct basep_cs_stream_control *stream_data = ++ calloc(stream_num, sizeof(*stream_data)); ++ ++ iface = (union kbase_ioctl_cs_get_glb_iface) { ++ .in = { ++ .max_group_num = group_num, ++ .max_total_stream_num = stream_num, ++ .groups_ptr = (uintptr_t) group_data, ++ .streams_ptr = (uintptr_t) stream_data, ++ } ++ }; ++ ++ ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(size))"); ++ ++ free(group_data); ++ free(stream_data); ++ ++ return false; ++ } ++ ++ unsigned print_groups = pr ? group_num : 0; ++ unsigned print_streams = pr ? 
stream_num : 0; ++ ++ for (unsigned i = 0; i < print_groups; ++i) { ++ if (i && !memcmp(group_data + i, group_data + i - 1, sizeof(*group_data))) ++ continue; ++ ++ fprintf(stderr, "Group %i-: feature mask 0x%x, %i streams\n", ++ i, group_data[i].features, group_data[i].stream_num); ++ } ++ ++ for (unsigned i = 0; i < print_streams; ++i) { ++ if (i && !memcmp(stream_data + i, stream_data + i - 1, sizeof(*stream_data))) ++ continue; ++ ++ unsigned reg = stream_data[i].features & 0xff; ++ unsigned score = (stream_data[i].features >> 8) & 0xff; ++ unsigned feat = stream_data[i].features >> 16; ++ ++ fprintf(stderr, "Stream %i-: 0x%x work registers, %i scoreboards, iterator mask: 0x%x\n", ++ i, reg, score, feat); ++ } ++ ++ free(group_data); ++ free(stream_data); ++ ++ return true; ++} ++ ++static bool ++mmap_user_reg(struct state *s, struct test *t) ++{ ++ s->csf_user_reg = mmap(NULL, s->page_size, PROT_READ, ++ MAP_SHARED, s->mali_fd, ++ BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); ++ ++ if (s->csf_user_reg == MAP_FAILED) { ++ perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); ++ s->csf_user_reg = NULL; ++ return false; ++ } ++ return true; ++} ++ ++static bool ++munmap_user_reg(struct state *s, struct test *t) ++{ ++ if (s->csf_user_reg) ++ return munmap(s->csf_user_reg, s->page_size) == 0; ++ return true; ++} ++ ++static bool ++init_mem_exec(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_mem_exec_init init = { ++ .va_pages = 0x100000, ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++init_mem_jit(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_mem_jit_init init = { ++ .va_pages = 1 << 25, ++ .max_allocations = 255, ++ .phys_pages = 1 << 25, ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_JIT_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++stream_create(struct state *s, struct test *t) ++{ ++ struct kbase_ioctl_stream_create stream = { ++ .name = "stream" ++ }; ++ ++ s->tl_fd = ioctl(s->mali_fd, KBASE_IOCTL_STREAM_CREATE, &stream); ++ ++ if (s->tl_fd == -1) { ++ perror("ioctl(KBASE_IOCTL_STREAM_CREATE)"); ++ return false; ++ } ++ return true; ++ ++} ++ ++static bool ++stream_destroy(struct state *s, struct test *t) ++{ ++ if (s->tl_fd > 0) ++ return close(s->tl_fd) == 0; ++ return true; ++} ++ ++static bool ++tiler_heap_create(struct state *s, struct test *t) ++{ ++ union kbase_ioctl_cs_tiler_heap_init init = { ++ .in = { ++ .chunk_size = 1 << 21, ++ .initial_chunks = 5, ++ .max_chunks = 200, ++ .target_in_flight = 65535, ++ } ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); ++ return false; ++ } ++ ++ s->tiler_heap_va = init.out.gpu_heap_va; ++ s->tiler_heap_header = init.out.first_chunk_va; ++ printf("heap va: %"PRIx64", heap header: %"PRIx64"\n", ++ s->tiler_heap_va, s->tiler_heap_header); ++ ++ return true; ++} ++ ++static bool ++tiler_heap_term(struct state *s, struct test *t) ++{ ++ if (!s->tiler_heap_va) ++ return true; ++ ++ struct kbase_ioctl_cs_tiler_heap_term term = { ++ .gpu_heap_va = s->tiler_heap_va ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); ++ return false; ++ } ++ return true; ++} ++ 
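
The helpers above walk through kbase CSF bring-up in the order a userspace client needs it: open /dev/mali0, handshake with KBASE_IOCTL_VERSION_CHECK, set the context creation flags, map the tracking page, fetch and decode the GPU property blob, map the CSF user-register page, initialise the EXEC_VA and JIT zones, create a timeline stream, and finally create a chunked tiler heap. A minimal bring-up sketch chaining a representative subset of those helpers, in the order they are defined above; the csf_bringup() wrapper itself is hypothetical, and the unused struct test argument is simply passed as NULL:

/* Hypothetical wrapper: chain a subset of the bring-up helpers above.
 * Each of these helpers ignores its struct test argument, so NULL is
 * passed; a real caller would also run the matching munmap_ and _term
 * helpers on teardown. */
static bool
csf_bringup(struct state *s)
{
        static const section steps[] = {
                open_kbase, get_version, set_flags, mmap_tracking,
                get_gpuprops, get_gpu_id, mmap_user_reg,
                init_mem_exec, init_mem_jit, stream_create,
                tiler_heap_create,
        };

        for (unsigned i = 0; i < ARRAY_SIZE(steps); ++i) {
                if (!steps[i](s, NULL))
                        return false;
        }
        return true;
}
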
++static bool ++cs_group_create(struct state *s, struct test *t) ++{ ++ union kbase_ioctl_cs_queue_group_create_1_6 create = { ++ .in = { ++ /* Mali *still* only supports a single tiler unit */ ++ .tiler_mask = 1, ++ .fragment_mask = ~0ULL, ++ .compute_mask = ~0ULL, ++ ++ .cs_min = CS_QUEUE_COUNT, ++ ++ .priority = 1, ++ .tiler_max = 1, ++ .fragment_max = 64, ++ .compute_max = 64, ++ } ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); ++ return false; ++ } ++ ++ s->csg_handle = create.out.group_handle; ++ s->csg_uid = create.out.group_uid; ++ ++ if (pr) ++ printf("CSG handle: %i UID: %i: ", s->csg_handle, s->csg_uid); ++ ++ /* Should be at least 1 */ ++ if (!s->csg_uid) ++ abort(); ++ ++ return true; ++} ++ ++static bool ++cs_group_term(struct state *s, struct test *t) ++{ ++ if (!s->csg_uid) ++ return true; ++ ++ struct kbase_ioctl_cs_queue_group_term term = { ++ .group_handle = s->csg_handle ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); ++ return false; ++ } ++ return true; ++} ++ ++static bool ++decode_init(struct state *s, struct test *t) ++{ ++ pandecode_initialize(true); ++ return true; ++} ++ ++static bool ++decode_close(struct state *s, struct test *t) ++{ ++ pandecode_close(); ++ return true; ++} ++ ++static struct panfrost_ptr ++alloc_ioctl(struct state *s, union kbase_ioctl_mem_alloc *a) ++{ ++ struct panfrost_ptr p = {0}; ++ ++ uint64_t va_pages = a->in.va_pages; ++ uint64_t flags = a->in.flags; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_ALLOC, a); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); ++ return p; ++ } ++ ++ if ((flags & BASE_MEM_SAME_VA) && ++ (!(a->out.flags & BASE_MEM_SAME_VA) || ++ a->out.gpu_va != 0x41000)) { ++ ++ fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", ++ (uint64_t) a->out.flags, (uint64_t) a->out.gpu_va); ++ return p; ++ } ++ ++ void *ptr = mmap(NULL, s->page_size * va_pages, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ s->mali_fd, a->out.gpu_va); ++ ++ if (ptr == MAP_FAILED) { ++ perror("mmap(GPU BO)"); ++ return p; ++ } ++ ++ uint64_t gpu_va = (a->out.flags & BASE_MEM_SAME_VA) ? 
++ (uintptr_t) ptr : a->out.gpu_va; ++ ++ pandecode_inject_mmap(gpu_va, ptr, s->page_size * va_pages, NULL); ++ ++ p.cpu = ptr; ++ p.gpu = gpu_va; ++ ++ memset(p.cpu, 0, s->page_size * va_pages); ++ ++ return p; ++} ++ ++static struct panfrost_ptr ++alloc_mem(struct state *s, uint64_t size, uint64_t flags) ++{ ++ unsigned pages = size / s->page_size; ++ ++ union kbase_ioctl_mem_alloc a = { ++ .in = { ++ .va_pages = pages, ++ .commit_pages = pages, ++ .extension = 0, ++ .flags = flags, ++ } ++ }; ++ ++ return alloc_ioctl(s, &a); ++} ++ ++static void ++alloc_redzone(struct state *s, struct panfrost_ptr p, uint64_t alloc_size) ++{ ++ mmap(p.cpu - s->page_size, 1, ++ PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, ++ -1, 0); ++ ++ mmap(p.cpu + alloc_size, 1, ++ PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, ++ -1, 0); ++} ++ ++static bool ++alloc(struct state *s, struct test *t) ++{ ++ struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset); ++ ++ *ptr = alloc_mem(s, s->page_size, t->flags); ++ ++ volatile int *p = (volatile int *) ptr->cpu; ++ *p = 0x12345; ++ if (*p != 0x12345) { ++ printf("Error reading from allocated memory at %p\n", p); ++ return false; ++ } ++ *p = 0; ++ cache_clean(p); ++ ++ return true; ++} ++ ++static bool ++dealloc(struct state *s, struct test *t) ++{ ++ struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset); ++ ++ if (ptr->cpu) ++ return munmap(ptr->cpu, s->page_size) == 0; ++ return true; ++} ++ ++static bool ++cs_queue_create(struct state *s, struct test *t) ++{ ++ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { ++ ++ /* Read/write from CPU/GPU, nothing special ++ * like coherency */ ++ s->cs_mem[i] = alloc_mem(s, CS_QUEUE_SIZE, 0x200f); ++ s->cs[i].ptr = s->cs_mem[i].cpu; ++ ++ if (!s->cs_mem[i].cpu) ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++cs_queue_free(struct state *s, struct test *t) ++{ ++ bool pass = true; ++ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { ++ if (s->cs_mem[i].cpu && munmap(s->cs_mem[i].cpu, CS_QUEUE_SIZE)) ++ pass = false; ++ } ++ return pass; ++} ++ ++static bool ++cs_queue_register(struct state *s, struct test *t) ++{ ++ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { ++ struct kbase_ioctl_cs_queue_register reg = { ++ .buffer_gpu_addr = s->cs_mem[i].gpu, ++ .buffer_size = CS_QUEUE_SIZE, ++ .priority = 1, ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_REGISTER, ®); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)"); ++ return false; ++ } ++ ++ union kbase_ioctl_cs_queue_bind bind = { ++ .in = { ++ .buffer_gpu_addr = s->cs_mem[i].gpu, ++ .group_handle = s->csg_handle, ++ .csi_index = i, ++ } ++ }; ++ ++ ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)"); ++ } ++ ++ s->cs_user_io[i] = ++ mmap(NULL, ++ s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ s->mali_fd, bind.out.mmap_handle); ++ ++ if (s->cs_user_io[i] == MAP_FAILED) { ++ perror("mmap(CS USER IO)"); ++ s->cs_user_io[i] = NULL; ++ return false; ++ } ++ } ++ return true; ++} ++ ++static bool ++cs_queue_term(struct state *s, struct test *t) ++{ ++ bool pass = true; ++ ++ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { ++ if (s->cs_user_io[i] && ++ munmap(s->cs_user_io[i], ++ s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES)) ++ pass = false; ++ ++ struct kbase_ioctl_cs_queue_terminate term = { ++ .buffer_gpu_addr = s->cs_mem[i].gpu, ++ }; ++ ++ int ret = ioctl(s->mali_fd, 
KBASE_IOCTL_CS_QUEUE_TERMINATE, ++ &term); ++ ++ if (ret == -1) ++ pass = false; ++ } ++ return pass; ++} ++ ++#define CS_RING_DOORBELL(s, i) \ ++ *((uint32_t *)(s->cs_user_io[i])) = 1 ++ ++#define CS_READ_REGISTER(s, i, r) \ ++ *((uint64_t *)(s->cs_user_io[i] + s->page_size * 2 + r)) ++ ++#define CS_WRITE_REGISTER(s, i, r, v) \ ++ *((uint64_t *)(s->cs_user_io[i] + s->page_size + r)) = v ++ ++static void ++submit_cs(struct state *s, unsigned i) ++{ ++ uintptr_t p = (uintptr_t) s->cs[i].ptr; ++ unsigned pad = (-p) & 63; ++ memset(s->cs[i].ptr, 0, pad); ++ ++ unsigned last_offset = s->cs_last_submit[i]; ++ ++ unsigned insert_offset = p + pad - (uintptr_t) s->cs_mem[i].cpu; ++ insert_offset %= CS_QUEUE_SIZE; ++ ++ for (unsigned o = last_offset; o != insert_offset; ++ o = (o + 64) % CS_QUEUE_SIZE) ++ cache_clean(s->cs_mem[i].cpu + o); ++ ++ // TODO: Handle wraparound ++ // TODO: Provide a persistent buffer for pandecode to use? ++ if (pr) { ++ dump_start(stderr); ++ pandecode_cs(s->cs_mem[i].gpu + last_offset, ++ insert_offset - last_offset, s->gpu_id); ++ dump_end(stderr); ++ } ++ ++ cache_barrier(); ++ ++ CS_WRITE_REGISTER(s, i, CS_INSERT, insert_offset); ++ s->cs[i].ptr = s->cs_mem[i].cpu + insert_offset; ++ ++ memory_barrier(); ++ CS_RING_DOORBELL(s, i); ++ memory_barrier(); ++ ++ s->cs_last_submit[i] = insert_offset; ++} ++ ++/* Returns true if there was a timeout */ ++static bool ++wait_event(struct state *s, unsigned timeout_ms) ++{ ++ struct pollfd fd = { ++ .fd = s->mali_fd, ++ .events = POLLIN, ++ }; ++ ++ int ret = poll(&fd, 1, timeout_ms); ++ ++ if (ret == -1) { ++ perror("poll(mali_fd)"); ++ return true; ++ } ++ ++ /* Timeout */ ++ if (ret == 0) ++ return true; ++ ++ struct base_csf_notification event; ++ ret = read(s->mali_fd, &event, sizeof(event)); ++ ++ if (ret == -1) { ++ perror("read(mali_fd)"); ++ return true; ++ } ++ ++ if (ret != sizeof(event)) { ++ fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", ++ ret, (int) sizeof(event)); ++ return false; ++ } ++ ++ switch (event.type) { ++ case BASE_CSF_NOTIFICATION_EVENT: ++ fprintf(stderr, "Notification event!\n"); ++ return false; ++ ++ case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: ++ break; ++ ++ case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: ++ fprintf(stderr, "No event from mali_fd!\n"); ++ return false; ++ ++ default: ++ fprintf(stderr, "Unknown event type!\n"); ++ return false; ++ } ++ ++ struct base_gpu_queue_group_error e = event.payload.csg_error.error; ++ ++ switch (e.error_type) { ++ case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { ++ // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h ++ fprintf(stderr, "Queue group error: status 0x%x " ++ "sideband 0x%"PRIx64"\n", ++ e.payload.fatal_group.status, ++ (uint64_t) e.payload.fatal_group.sideband); ++ break; ++ } ++ case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { ++ unsigned queue = e.payload.fatal_queue.csi_index; ++ ++ // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h ++ fprintf(stderr, "Queue %i error: status 0x%x " ++ "sideband 0x%"PRIx64":", ++ queue, e.payload.fatal_queue.status, ++ (uint64_t) e.payload.fatal_queue.sideband); ++ ++ unsigned e = CS_READ_REGISTER(s, queue, CS_EXTRACT); ++ pandecode_cs(s->cs_mem[queue].gpu + e, 8, s->gpu_id); ++ ++ break; ++ } ++ ++ case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: ++ fprintf(stderr, "Command stream timeout!\n"); ++ break; ++ case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: ++ fprintf(stderr, "Command stream OOM!\n"); ++ break; ++ default: ++ fprintf(stderr, "Unknown error type!\n"); ++ } ++ ++ return false; ++} ++ 
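
The CS_* macros above spell out the layout of the per-queue user I/O mapping obtained from KBASE_IOCTL_CS_QUEUE_BIND: the first mapped page is the doorbell page, the second holds the CPU-written input registers (CS_INSERT), and the third holds the GPU-updated output registers (CS_EXTRACT, CS_ACTIVE). submit_cs() pads the write pointer to a 64-byte boundary, cleans the dirty cache lines, publishes the new CS_INSERT value and rings the doorbell; wait_event() then blocks on the kbase fd for a base_csf_notification. As an illustrative sketch of how a single register write goes through queue 0: the run_register_write() wrapper is hypothetical, but everything it calls is defined in this file (some of it just below).

/* Hypothetical wrapper: write one value into CS register 0x48 on queue 0
 * (the same operation cs_simple() performs below), then submit, kick and
 * wait for CS_EXTRACT to catch up with CS_INSERT. */
static bool
run_register_write(struct state *s)
{
        pan_command_stream *c = s->cs;     /* queue 0 */

        pan_emit_cs_32(c, 0x48, 0x1234);   /* load 0x1234 into CS register 0x48 */
        pan_cs_evadd(c, 0, 1);             /* EVADD on the event page set up by cs_init() */

        submit_cs(s, 0);                   /* clean cache, bump CS_INSERT, ring doorbell */
        if (!kick_queue(s, 0))             /* KBASE_IOCTL_CS_QUEUE_KICK */
                return false;

        return wait_cs(s, 0);              /* poll until CS_EXTRACT reaches CS_INSERT */
}
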
++static bool ++kick_queue(struct state *s, unsigned i) ++{ ++ struct kbase_ioctl_cs_queue_kick kick = { ++ .buffer_gpu_addr = s->cs_mem[i].gpu ++ }; ++ ++ int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); ++ ++ if (ret == -1) { ++ perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++wait_cs(struct state *s, unsigned i) ++{ ++ unsigned extract_offset = (void *) s->cs[i].ptr - s->cs_mem[i].cpu; ++ ++ unsigned timeout_ms = 500; ++ ++ bool done_kick = false; ++ ++ while (CS_READ_REGISTER(s, i, CS_EXTRACT) != extract_offset) { ++ if (wait_event(s, timeout_ms)) { ++ if (pr) ++ fprintf(stderr, "Event wait timeout!\n"); ++ ++ unsigned e = CS_READ_REGISTER(s, i, CS_EXTRACT); ++ unsigned a = CS_READ_REGISTER(s, i, CS_ACTIVE); ++ ++ if (e != extract_offset) { ++ fprintf(stderr, "CS_EXTRACT (%i) != %i, " ++ "CS_ACTIVE (%i) on queue %i:", ++ e, extract_offset, a, i); ++ /* Decode two instructions instead? */ ++ pandecode_cs(s->cs_mem[i].gpu + e, 8, 1); ++ ++ if (done_kick) { ++ cache_barrier(); ++ return false; ++ } else { ++ fprintf(stderr, "Kicking queue\n"); ++ kick_queue(s, i); ++ done_kick = true; ++ } ++ } ++ } ++ } ++ ++ cache_barrier(); ++ ++ return true; ++} ++ ++static bool ++cs_init(struct state *s, struct test *t) ++{ ++ uint64_t event_init[] = { 1, 1, 1 }; ++ memcpy(s->allocations.event.cpu, event_init, sizeof(event_init)); ++ ++ for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { ++ CS_WRITE_REGISTER(s, i, CS_INSERT, 0); ++ pan_pack_ins(s->cs + i, CS_RESOURCES, cfg) { ++ switch (i) { ++ case 0: cfg.compute = true; break; ++ case 1: cfg.compute = true; cfg.fragment = true; break; ++ case 2: cfg.compute = true; cfg.tiler = true; cfg.idvs = true; break; ++ case 3: cfg.fragment = true; break; ++ } ++ } ++ pan_pack_ins(s->cs + i, CS_SLOT, cfg) { ++ cfg.index = 2; ++ } ++ pan_emit_cs_48(s->cs + i, CS_EVENT_REGISTER, ++ s->allocations.event.gpu); ++ submit_cs(s, i); ++ ++ if (!kick_queue(s, i)) ++ return false; ++ } ++ ++ return true; ++} ++ ++static struct panfrost_ptr * ++buffers_elem(struct util_dynarray *buffers, unsigned index) ++{ ++ unsigned size = util_dynarray_num_elements(buffers, ++ struct panfrost_ptr); ++ ++ if (index >= size) { ++ unsigned grow = index + 1 - size; ++ ++ memset(util_dynarray_grow(buffers, struct panfrost_ptr, grow), ++ 0, grow * sizeof(struct panfrost_ptr)); ++ } ++ ++ return util_dynarray_element(buffers, struct panfrost_ptr, index); ++} ++ ++static void ++dump_hex64(FILE *fp, uint64_t *values, unsigned size) ++{ ++ bool zero = false; ++ for (unsigned i = 0; i < size / 8; i += 2) { ++ uint64_t a = values[i]; ++ uint64_t b = values[i + 1]; ++ ++ if (!a && !b) { ++ if (!zero) ++ fprintf(fp, "%06X *\n", i * 8); ++ zero = true; ++ continue; ++ } ++ ++ zero = false; ++ ++ fprintf(fp, "%06X %16"PRIx64" %16"PRIx64"\n", ++ i * 8, a, b); ++ } ++ ++ fprintf(fp, "\n"); ++} ++ ++static void ++dump_delta(FILE *fp, uint64_t *values, unsigned size) ++{ ++ uint64_t old = 0; ++ bool zero = false; ++ bool el = false; ++ for (unsigned i = 0; i < size / 8; ++i) { ++ uint64_t val = values[i]; ++ int64_t delta = val - old; ++ ++ if (!zero || delta) { ++ fprintf(fp, "%"PRIi64"\n", delta); ++ el = false; ++ } else if (!el) { ++ fprintf(fp, "...\n"); ++ el = true; ++ } ++ ++ old = val; ++ zero = (delta == 0); ++ } ++} ++ ++static void ++dump_tiler(FILE *fp, uint8_t *values, unsigned size) ++{ ++ fflush(stdout); ++ FILE *stream = popen("tiler-hex-read", "w"); ++ // TODO! 
++ fprintf(stream, "width %i\nheight %i\nmask %i\nvaheap %p\nsize %i\n", ++ 256, 256, 6, values, size); ++ pan_hexdump(stream, values, size, false); ++ pclose(stream); ++} ++ ++/* TODO: Pass in a filename? */ ++static void ++dump_filehex(uint8_t *values, unsigned size) ++{ ++ char buf[1024] = {0}; ++ ++ for (unsigned i = 0; i < 10000; ++i) { ++ snprintf(buf, 1024, "/tmp/fdump.%05i", i); ++ ++ int fd = open(buf, O_WRONLY | O_CREAT | O_EXCL, 0666); ++ if (fd == -1) ++ continue; ++ ++ FILE *fp = fdopen(fd, "w"); ++ ++ fprintf(fp, "%p, %u:\n", values, size); ++ pan_hexdump(fp, values, size, false); ++ ++ fclose(fp); /* will close fd */ ++ break; ++ } ++} ++ ++static void ++dump_heatmap(FILE *fp, uint8_t *values, unsigned size, ++ unsigned gran, unsigned length, unsigned stride) ++{ ++ unsigned sum = 0; ++ unsigned gr = 0; ++ unsigned st = 0; ++ unsigned ll = 0; ++ ++ while (size && !values[size - 1]) ++ --size; ++ ++ for (unsigned i = 0; i < size; ++i) { ++ sum += values[i]; ++ ++ if (++gr == gran) { ++ fprintf(fp, " %02x", sum & 0xff); ++ gr = 0; ++ sum = 0; ++ } ++ ++ if (++ll == length) { ++ i += stride - length; ++ fprintf(fp, "\n"); ++ st = 0; ++ ll = 0; ++ } else if (++st == stride) { ++ fprintf(fp, "\n"); ++ st = 0; ++ } ++ } ++ fprintf(fp, " %02x\n", sum & 0xff); ++} ++ ++static bool ++cs_test(struct state *s, struct test *t) ++{ ++ if (s->argc < 2) ++ return true; ++ ++ FILE *f = fopen(s->argv[1], "r"); ++ ++ struct util_dynarray buffers; ++ util_dynarray_init(&buffers, NULL); ++ ++ for (;;) { ++ char *line = NULL; ++ size_t sz = 0; ++ if (getline(&line, &sz, f) == -1) ++ break; ++ ++ unsigned long src, dst, offset, src_offset, size, iter, flags; ++ unsigned long gran, stride, length; ++ int read; ++ char *mode; ++ ++ if (sscanf(line, "rel%ms %lu+%lu %lu+%lu", ++ &mode, &dst, &offset, &src, &src_offset) == 5) { ++ ++ if (strcmp(mode, "oc") && strcmp(mode, "split")) { ++ fprintf(stderr, "Unknown relocation mode 'rel%s'\n", mode); ++ } ++ bool split = (mode[0] == 's'); ++ free(mode); ++ ++ struct panfrost_ptr *s = buffers_elem(&buffers, src); ++ struct panfrost_ptr *d = buffers_elem(&buffers, dst); ++ ++ if (!s->gpu || !d->gpu) { ++ fprintf(stderr, "relocating to buffer that doesn't exist!\n"); ++ } ++ ++ uint64_t *dest = d->cpu + offset; ++ uint64_t value = s->gpu + src_offset; ++ if (split) { ++ dest[0] |= (uint32_t) value; ++ dest[1] |= (uint32_t) (value >> 32); ++ } else { ++ *dest |= value; ++ } ++ ++ } else if (sscanf(line, "buffer %lu %lu %lx %n", ++ &dst, &size, &flags, &read) == 3) { ++ line += read; ++ ++ struct panfrost_ptr buffer = ++ alloc_mem(s, ALIGN_POT(size, s->page_size), ++ flags); ++ ++ alloc_redzone(s, buffer, ALIGN_POT(size, s->page_size)); ++ ++ *buffers_elem(&buffers, dst) = buffer; ++ ++ //printf("buffer %lu == 0x%lx\n", dst, buffer.gpu); ++ ++ uint64_t *fill = buffer.cpu; ++ ++ for (unsigned i = 0; i < size / 8; ++i) { ++ read = 0; ++ unsigned long long val = 0; ++ if (sscanf(line, "%Lx %n", &val, &read) != 1) ++ break; ++ line += read; ++ fill[i] = val; ++ } ++ ++ cache_clean_range(buffer.cpu, size); ++ ++ } else if (sscanf(line, "exe %n %lu %lu %lu", ++ &read, &iter, &dst, &size) == 3) { ++ line += read; ++ ++ unsigned iter_mask = 0; ++ ++ for (;;) { ++ read = 0; ++ if (sscanf(line, "%lu %lu %lu %n", ++ &iter, &dst, &size, &read) != 3) ++ break; ++ line += read; ++ ++ struct panfrost_ptr *d = ++ buffers_elem(&buffers, dst); ++ ++ /* TODO: Check 'size' against buffer size */ ++ ++ pandecode_cs(d->gpu, size, s->gpu_id); ++ ++ if (iter > 3) { ++ fprintf(stderr, 
++ "execute on out-of-bounds " ++ "iterator\n"); ++ continue; ++ } ++ ++ memcpy(s->cs[iter].ptr, d->cpu, size); ++ s->cs[iter].ptr += size / 8; ++ ++ iter_mask |= (1 << iter); ++ } ++ ++ u_foreach_bit(i, iter_mask) ++ submit_cs(s, i); ++ ++ u_foreach_bit(i, iter_mask) ++ kick_queue(s, i); ++ ++ u_foreach_bit(i, iter_mask) ++ wait_cs(s, i); ++ ++ } else if (sscanf(line, "dump %lu %lu %lu %ms", ++ &src, &offset, &size, &mode) == 4) { ++ ++ struct panfrost_ptr *s = buffers_elem(&buffers, src); ++ ++ if (!s->gpu) ++ fprintf(stderr, "dumping buffer that doesn't exist!\n"); ++ ++ cache_invalidate_range(s->cpu + offset, size); ++ ++ if (!strcmp(mode, "hex")) ++ pan_hexdump(stdout, s->cpu + offset, size, true); ++ else if (!strcmp(mode, "hex64")) ++ dump_hex64(stdout, s->cpu + offset, size); ++ else if (!strcmp(mode, "delta")) ++ dump_delta(stdout, s->cpu + offset, size); ++ else if (!strcmp(mode, "tiler")) ++ dump_tiler(stdout, s->cpu + offset, size); ++ else if (!strcmp(mode, "filehex")) ++ dump_filehex(s->cpu + offset, size); ++ ++ free(mode); ++ ++ } else if (sscanf(line, "heatmap %lu %lu %lu %lu %lu %lu", ++ &src, &offset, &size, ++ &gran, &length, &stride) == 6) { ++ ++ struct panfrost_ptr *s = buffers_elem(&buffers, src); ++ ++ if (!s->gpu) ++ fprintf(stderr, "dumping buffer that doesn't exist!\n"); ++ ++ cache_invalidate_range(s->cpu + offset, size); ++ ++ dump_heatmap(stdout, s->cpu + offset, size, ++ gran, length, stride); ++ ++ } else if (sscanf(line, "memset %lu %lu %lu %lu", ++ &src, &offset, &gran, &size) == 4) { ++ ++ struct panfrost_ptr *s = buffers_elem(&buffers, src); ++ ++ if (!s->gpu) ++ fprintf(stderr, "memset on buffer that doesn't exist!\n"); ++ ++ memset(s->cpu + offset, gran, size); ++ cache_clean_range(s->cpu + offset, size); ++ ++ } else if (sscanf(line, "sleep %lu", &size) == 1) { ++ ++ usleep(size * 1000); ++ ++ } else if (strcmp(line, "td\n") == 0 || strcmp(line, "td") == 0) { ++ ++ void *ptr; ++ ++ ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, ++ s->tiler_heap_header); ++ pan_hexdump(stdout, ptr, 4096, false); ++ pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); ++ munmap(ptr, 1 << 21); ++ ++ ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, ++ s->tiler_heap_header + (1 << 21)); ++ pan_hexdump(stdout, ptr, 4096, false); ++ pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); ++ munmap(ptr, 1 << 21); ++ ++ } else { ++ fprintf(stderr, "unknown command '%s'\n", line); ++ } ++ } ++ ++ /* Skip following tests */ ++ return false; ++} ++ ++static void ++pan_cs_evadd(pan_command_stream *c, unsigned offset, unsigned value) ++{ ++ pan_emit_cs_32(c, 0x5e, value); ++ pan_pack_ins(c, CS_ADD_IMM, cfg) { ++ cfg.value = offset; ++ cfg.src = 0x5a; ++ cfg.dest = 0x5c; ++ } ++ pan_pack_ins(c, CS_EVADD, cfg) { ++ cfg.value = 0x5e; ++ cfg.addr = 0x5c; ++ } ++} ++ ++static bool ++cs_simple(struct state *s, struct test *t) ++{ ++ unsigned queue = t->vertex ? 2 : 0; ++ ++ pan_command_stream *c = s->cs + queue; ++ ++ unsigned dest = t->invalid ? 
0x65 : 0x48; ++ ++ pan_emit_cs_32(c, dest, 0x1234); ++ pan_cs_evadd(c, 0, 1); ++ ++ submit_cs(s, queue); ++ return wait_cs(s, queue); ++} ++ ++static bool ++cs_store(struct state *s, struct test *t) ++{ ++ pan_command_stream *c = s->cs; ++ ++ uint32_t *dest = s->allocations.ev2.cpu + 240; ++ mali_ptr dest_va = s->allocations.ev2.gpu + 240; ++ uint32_t value = 1234; ++ uint32_t add = 4320000; ++ ++ *dest = 0; ++ cache_clean(dest); ++ ++ unsigned addr_reg = 0x48; ++ unsigned value_reg = 0x4a; ++ ++ if (t->invalid) ++ dest_va = 0xfdcba9876543; ++ ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = (1 << 1); } ++ pan_emit_cs_48(c, addr_reg, dest_va); ++ pan_emit_cs_32(c, value_reg, value); ++ ++ if (t->add) { ++ pan_pack_ins(c, CS_ADD_IMM, cfg) { ++ cfg.value = add; ++ cfg.src = value_reg; ++ cfg.dest = value_reg; ++ } ++ value += add; ++ } ++ ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.addr = addr_reg; ++ cfg.register_base = value_reg; ++ cfg.register_mask = 1; ++ } ++ pan_cs_evadd(c, 0, 1); ++ ++ submit_cs(s, 0); ++ wait_cs(s, 0); ++ ++ cache_invalidate(dest); ++ cache_barrier(); /* Just in case it's needed */ ++ uint32_t result = *dest; ++ ++ if (t->invalid && result == value) { ++ printf("Got %i, did not expect %i: ", result, value); ++ return false; ++ } else if (result != value) { ++ printf("Got %i, expected %i: ", result, value); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void ++emit_cs_call(pan_command_stream *c, mali_ptr va, void *start, void *end) ++{ ++ cache_clean_range(start, end - start); ++ ++ pan_emit_cs_48(c, 0x48, va); ++ pan_emit_cs_32(c, 0x4a, end - start); ++ pan_pack_ins(c, CS_CALL, cfg) { ++ cfg.address = 0x48; ++ cfg.length = 0x4a; ++ } ++} ++ ++static bool ++cs_sub(struct state *s, struct test *t) ++{ ++ pan_command_stream *c = s->cs; ++ pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; ++ mali_ptr cs_va = s->allocations.cached.gpu; ++ ++ uint32_t *dest = s->allocations.normal.cpu; ++ mali_ptr dest_va = s->allocations.normal.gpu; ++ uint32_t value = 4321; ++ ++ *dest = 0; ++ cache_clean(dest); ++ ++ unsigned addr_reg = 0x48; ++ unsigned value_reg = 0x4a; ++ ++ void *start = i->ptr; ++ ++ pan_emit_cs_ins(c, 0x30, 0x5a0000000000); ++ ++ pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } ++ pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = (1 << 3); } ++ //pan_emit_cs_ins(i, 0x31, 0); ++ ++ pan_emit_cs_48(i, addr_reg, dest_va); ++ pan_emit_cs_32(i, value_reg, value); ++ //pan_emit_cs_ins(i, 0x25, 0x01484a00000005ULL); ++ pan_pack_ins(i, CS_STR, cfg) { ++ cfg.addr = addr_reg; ++ cfg.register_base = value_reg; ++ cfg.register_mask = 1; ++ } ++ //pan_emit_cs_ins(i, 0x09, 0); ++ //pan_emit_cs_ins(i, 0x31, 0x100000000); ++ ++ //pan_emit_cs_ins(i, 0x24, 0x024a0000f80211ULL); ++ ++ /* ++ pan_pack_ins(i, CS_STR_32, cfg) { ++ cfg.unk_1 = 1; ++ cfg.unk_2 = 4; ++ cfg.unk_3 = 1; ++ cfg.addr = addr_reg; ++ cfg.value = value_reg; ++ }*/ ++ ++ emit_cs_call(c, cs_va, start, i->ptr); ++ pan_cs_evadd(c, 0, 1); ++ ++ submit_cs(s, 0); ++ wait_cs(s, 0); ++ ++ cache_invalidate(dest); ++ cache_barrier(); /* Just in case it's needed */ ++ uint32_t result = *dest; ++ ++ if (result != value) { ++ printf("Got %i, expected %i: ", result, value); ++ return false; ++ } ++ ++ return true; ++} ++ ++static mali_ptr ++upload_shader(struct state *s, struct util_dynarray binary) ++{ ++ assert(s->shader_alloc_offset + binary.size < s->page_size); ++ ++ mali_ptr va = s->allocations.exec.gpu + s->shader_alloc_offset; ++ ++ memcpy(s->allocations.exec.cpu, binary.data, binary.size); ++ ++ /* 
Shouldn't be needed, but just in case... */ ++ cache_clean_range(s->allocations.exec.cpu, binary.size); ++ ++ s->shader_alloc_offset += binary.size; ++ ++ return va; ++} ++ ++static bool ++compute_compile(struct state *s, struct test *t) ++{ ++ nir_builder _b = ++ nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, ++ GENX(pan_shader_get_compiler_options)(), ++ "mem_store"), *b = &_b; ++ ++ nir_ssa_def *ptr = ++ nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0)); ++ ++ nir_ssa_def *value = nir_imm_int(b, 123); ++ ++ nir_store_global(b, ptr, 8, value, 1); ++ ++ struct panfrost_compile_inputs inputs = { ++ .gpu_id = s->gpu_id, ++ .no_ubo_to_push = true, ++ }; ++ ++ struct util_dynarray binary = {0}; ++ struct pan_shader_info shader_info = {0}; ++ ++ GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info); ++ ++ dump_start(stderr); ++ disassemble_valhall(stderr, binary.data, binary.size, true); ++ dump_end(stderr); ++ ++ s->compute_shader = upload_shader(s, binary); ++ ++ util_dynarray_fini(&binary); ++ ralloc_free(b->shader); ++ ++ return true; ++} ++ ++static struct panfrost_ptr ++mem_offset(struct panfrost_ptr ptr, unsigned offset) ++{ ++ ptr.cpu += offset; ++ ptr.gpu += offset; ++ return ptr; ++} ++ ++static bool ++compute_execute(struct state *s, struct test *t) ++{ ++ unsigned queue = t->blit ? 1 : 0; ++ ++ pan_command_stream *c = s->cs + queue; ++ pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; ++ mali_ptr cs_va = s->allocations.cached.gpu; ++ ++ struct panfrost_ptr dest = s->allocations.normal; ++ uint32_t value = 123; ++ ++ *(uint32_t *) dest.cpu = 0; ++ cache_clean(dest.cpu); ++ ++ struct panfrost_ptr fau = mem_offset(dest, 128); ++ *(uint64_t *) fau.cpu = dest.gpu; ++ cache_clean(fau.cpu); ++ ++ struct panfrost_ptr local_storage = mem_offset(dest, 192); ++ pan_pack(local_storage.cpu, LOCAL_STORAGE, _); ++ cache_clean(local_storage.cpu); ++ ++ struct panfrost_ptr shader_program = mem_offset(dest, 256); ++ pan_pack(shader_program.cpu, SHADER_PROGRAM, cfg) { ++ cfg.stage = MALI_SHADER_STAGE_COMPUTE; ++ cfg.primary_shader = true; ++ cfg.register_allocation = ++ MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; ++ cfg.binary = s->compute_shader; ++ } ++ cache_clean(shader_program.cpu); ++ ++ void *start = i->ptr; ++ ++ pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } ++ //pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = 1 << 3; } ++ ++ pan_pack_cs(i, COMPUTE_PAYLOAD, cfg) { ++ cfg.workgroup_size_x = 1; ++ cfg.workgroup_size_y = 1; ++ cfg.workgroup_size_z = 1; ++ ++ cfg.workgroup_count_x = 1; ++ cfg.workgroup_count_y = 1; ++ cfg.workgroup_count_z = 1; ++ ++ cfg.compute.shader = shader_program.gpu; ++ cfg.compute.thread_storage = local_storage.gpu; ++ ++ cfg.compute.fau = fau.gpu; ++ cfg.compute.fau_count = 1; ++ } ++ ++ pan_pack_ins(i, COMPUTE_LAUNCH, _); ++ ++ //pan_emit_cs_32(c, 0x54, 1); ++ //pan_emit_cs_ins(c, 0x24, 0x540000000233); ++ emit_cs_call(c, cs_va, start, i->ptr); ++ ++ pan_emit_cs_32(c, 0x4a, 0); ++ pan_emit_cs_ins(c, 0x24, 0x024a0000000211ULL); ++ ++ pan_emit_cs_48(c, 0x48, dest.gpu); ++ pan_pack_ins(c, CS_LDR, cfg) { ++ cfg.offset = 0; ++ cfg.register_mask = 1; ++ cfg.addr = 0x48; ++ cfg.register_base = 0x20; ++ } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1; } ++ pan_pack_ins(c, CS_ADD_IMM, cfg) { ++ cfg.value = 1; ++ cfg.src = 0x20; ++ cfg.dest = 0x20; ++ } ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.offset = 64; ++ cfg.register_mask = 1; ++ cfg.addr = 0x48; ++ cfg.register_base = 0x20; ++ } ++ ++ pan_cs_evadd(c, 0, 1); ++ ++ submit_cs(s, 
queue); ++ wait_cs(s, queue); ++ ++ cache_invalidate(dest.cpu); ++ cache_barrier(); /* Just in case it's needed */ ++ uint32_t result = ((uint32_t *)dest.cpu)[0]; ++ uint32_t result2 = ((uint32_t *)dest.cpu)[16]; ++ ++ if (result != value) { ++ printf("Got %i, %i, expected %i: ", result, result2, value); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++mmu_dump(struct state *s, struct test *t) ++{ ++ unsigned size = 1024 * 1024; ++ ++ void *mem = mmap(NULL, size, PROT_READ, MAP_SHARED, ++ s->mali_fd, BASE_MEM_MMU_DUMP_HANDLE); ++ if (mem == MAP_FAILED) { ++ perror("mmap(BASE_MEM_MMU_DUMP_HANDLE)"); ++ return false; ++ } ++ ++ pan_hexdump(stdout, mem, size, true); ++ ++ return true; ++} ++ ++#define SUBTEST(s) { .label = #s, .subtests = s, .sub_length = ARRAY_SIZE(s) } ++ ++#define STATE(item) .offset = offsetof(struct state, item) ++ ++#define ALLOC(item) .offset = offsetof(struct state, allocations.item) ++#define ALLOC_TEST(label, item, f) { alloc, dealloc, label, ALLOC(item), .flags = f } ++ ++struct test kbase_main[] = { ++ { open_kbase, close_kbase, "Open kbase device" }, ++ { get_version, NULL, "Check version" }, ++ { set_flags, NULL, "Set flags" }, ++ { mmap_tracking, munmap_tracking, "Map tracking handle" }, ++ { get_gpuprops, free_gpuprops, "Get GPU properties" }, ++ { get_gpu_id, NULL, "GPU ID" }, ++ { get_coherency_mode, NULL, "Coherency mode" }, ++ { get_csf_caps, NULL, "CSF caps" }, ++ { mmap_user_reg, munmap_user_reg, "Map user register page" }, ++ { init_mem_exec, NULL, "Initialise EXEC_VA zone" }, ++ { init_mem_jit, NULL, "Initialise JIT allocator" }, ++ { stream_create, stream_destroy, "Create synchronisation stream" }, ++ { tiler_heap_create, tiler_heap_term, "Create chunked tiler heap" }, ++ { cs_group_create, cs_group_term, "Create command stream group" }, ++ { decode_init, decode_close, "Initialize pandecode" }, ++ ++ /* Flags are named in mali_base_csf_kernel.h, omitted for brevity */ ++ ALLOC_TEST("Allocate normal memory", normal, 0x200f), ++ ALLOC_TEST("Allocate exectuable memory", exec, 0x2017), ++ ALLOC_TEST("Allocate coherent memory", coherent, 0x280f), ++ ALLOC_TEST("Allocate cached memory", cached, 0x380f), ++ ALLOC_TEST("Allocate CSF event memory", event, 0x8200f), ++ ALLOC_TEST("Allocate CSF event memory 2", ev2, 0x8200f), ++ ++ /* These three tests are run for every queue, but later ones are not */ ++ { cs_queue_create, cs_queue_free, "Create command stream queues" }, ++ { cs_queue_register, cs_queue_term, "Register command stream queues" }, ++ ++ { cs_test, NULL, "Test command stream" }, ++ ++ { cs_init, NULL, "Initialise and start command stream queues" }, ++ { cs_simple, NULL, "Execute MOV command" }, ++ { cs_simple, NULL, "Execute MOV command (again)" }, ++ { cs_simple, NULL, "Execute MOV command (vertex)", .vertex = true }, ++ //{ cs_simple, NULL, "Execute MOV command (vertex, invalid)", .invalid = true, .vertex = true }, ++ { cs_simple, NULL, "Execute MOV command (vertex, again)", .vertex = true }, ++ { cs_store, NULL, "Execute STR command" }, ++ //{ cs_store, NULL, "Execute STR command to invalid address", .invalid = true }, ++ { cs_store, NULL, "Execute ADD command", .add = true }, ++ { cs_sub, NULL, "Execute STR on iterator" }, ++ ++ { compute_compile, NULL, "Compile a compute shader" }, ++ { compute_execute, NULL, "Execute a compute shader" }, ++ { compute_execute, NULL, "Execute compute on blit queue", .blit = true }, ++ ++ //{ mmu_dump, NULL, "Dump MMU pagetables" }, ++}; ++ ++static void ++do_test_list(struct state *s, struct test 
*tests, unsigned length); ++ ++static void ++cleanup_test_list(struct state *s, struct test *tests, unsigned length) ++{ ++ for (unsigned i = length; i > 0; --i) { ++ unsigned n = i - 1; ++ ++ struct test *t = &tests[n]; ++ if (!t->cleanup) ++ continue; ++ ++ if (pr) ++ printf("[CLEANUP %i] %s: ", n, t->label); ++ if (t->cleanup(s, t)) { ++ if (pr) ++ printf("PASS\n"); ++ } else { ++ if (pr) ++ printf("FAIL\n"); ++ } ++ } ++} ++ ++static unsigned ++interpret_test_list(struct state *s, struct test *tests, unsigned length) ++{ ++ for (unsigned i = 0; i < length; ++i) { ++ struct test *t = &tests[i]; ++ ++ if (pr) ++ printf("[TEST %i] %s: ", i, t->label); ++ if (t->part) { ++ if (t->part(s, t)) { ++ if (pr) ++ printf("PASS\n"); ++ } else { ++ if (pr) ++ printf("FAIL\n"); ++ if (!getenv("TEST_KEEP_GOING")) ++ return i + 1; ++ } ++ } ++ if (t->subtests) ++ do_test_list(s, t->subtests, t->sub_length); ++ } ++ ++ return length; ++} ++ ++static void ++do_test_list(struct state *s, struct test *tests, unsigned length) ++{ ++ unsigned ran = interpret_test_list(s, tests, length); ++ cleanup_test_list(s, tests, ran); ++} ++ ++int ++main(int argc, char *argv[]) ++{ ++ struct state s = { ++ .page_size = sysconf(_SC_PAGE_SIZE), ++ .argc = argc, ++ .argv = argv, ++ }; ++ ++ if (getenv("CSF_QUIET")) ++ pr = false; ++ ++ if (!strcmp(getenv("TERM"), "dumb")) ++ colour_term = false; ++ ++ if (pr) ++ printf("Running Valhall CSF tests\n"); ++ ++ do_test_list(&s, kbase_main, ARRAY_SIZE(kbase_main)); ++} +diff --git a/src/panfrost/lib/genxml/common.xml b/src/panfrost/lib/genxml/common.xml +index d4b5240fb01..d75baaba208 100644 +--- a/src/panfrost/lib/genxml/common.xml ++++ b/src/panfrost/lib/genxml/common.xml +@@ -46,7 +46,7 @@ + + + +- ++ + + + +diff --git a/src/panfrost/lib/genxml/decode.c b/src/panfrost/lib/genxml/decode.c +index ae214e8d7ec..86298fa5d42 100644 +--- a/src/panfrost/lib/genxml/decode.c ++++ b/src/panfrost/lib/genxml/decode.c +@@ -54,6 +54,12 @@ + pan_section_print(pandecode_dump_stream, A, S, temp, (pandecode_indent + 1) * 2); \ + } + ++#define DUMP_SECTION_CS_V10(A, S, cl, buf, buf_unk, ...) 
{ \ ++ pan_section_unpack_cs_v10(cl, buf, buf_unk, A, S, temp); \ ++ pandecode_log(__VA_ARGS__); \ ++ pan_section_print(pandecode_dump_stream, A, S, temp, (pandecode_indent + 1) * 2); \ ++} ++ + #define MAP_ADDR(T, addr, cl) \ + const uint8_t *cl = pandecode_fetch_gpu_mem(addr, pan_size(T)); + +@@ -158,7 +164,7 @@ pandecode_midgard_tiler_descriptor( + if (nonzero_weights) + DUMP_UNPACKED(TILER_WEIGHTS, w, "Tiler Weights:\n"); + } +-#endif ++#endif /* PAN_ARCH <= 5 */ + + #if PAN_ARCH >= 5 + static void +@@ -184,7 +190,7 @@ pandecode_render_target(uint64_t gpu_va, unsigned gpu_id, + pandecode_indent--; + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH >= 5 */ + + #if PAN_ARCH >= 6 + static void +@@ -201,7 +207,7 @@ pandecode_sample_locations(const void *fb) + samples[2 * i + 1] - 128); + } + } +-#endif ++#endif /* PAN_ARCH >= 6 */ + + static void + pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, +@@ -228,29 +234,29 @@ pandecode_fbd(uint64_t gpu_va, bool is_fragment, unsigned gpu_id) + #if PAN_ARCH >= 6 + pandecode_sample_locations(fb); + +- unsigned dcd_size = pan_size(DRAW); ++ unsigned dcd_size = pan_size(DRAW_NO_CS); + + if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (0 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Pre frame 0:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } + + if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (1 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Pre frame 1:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } + + if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (2 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Post frame:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } +-#else ++#else /* PAN_ARCH < 6 */ + DUMP_SECTION(FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n"); + + const void *t = pan_section_ptr(fb, FRAMEBUFFER, TILER); +@@ -284,7 +290,7 @@ pandecode_fbd(uint64_t gpu_va, bool is_fragment, unsigned gpu_id) + .rt_count = params.render_target_count, + .has_extra = params.has_zs_crc_extension + }; +-#else ++#else /* PAN_ARCH < 5 */ + /* Dummy unpack of the padding section to make sure all words are 0. + * No need to call print here since the section is supposed to be empty. + */ +@@ -341,7 +347,7 @@ pandecode_attributes(mali_ptr addr, int count, + } + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH >= 5 + static mali_ptr +@@ -358,7 +364,7 @@ pandecode_blend(void *descs, int rt_no, mali_ptr frag_shader) + return b.blend_shader ? 
(b.shader_pc & ~0xf) : 0; + #endif + } +-#endif ++#endif /* PAN_ARCH >= 6 || PAN_ARCH == 5 */ + + #if PAN_ARCH <= 7 + static unsigned +@@ -412,8 +418,9 @@ pandecode_invocation(const void *i) + + DUMP_UNPACKED(INVOCATION, invocation, "Invocation:\n") + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + ++#if PAN_ARCH < 10 + static void + pandecode_primitive(const void *p) + { +@@ -439,7 +446,7 @@ pandecode_primitive(const void *p) + pandecode_validate_buffer(primitive.indices, primitive.index_count * size); + } else if (primitive.index_type) + pandecode_log("// XXX: unexpected index size\n"); +-#endif ++#endif /* PAN_ARCH <= 7 */ + } + + static void +@@ -451,6 +458,7 @@ pandecode_primitive_size(const void *s, bool constant) + + DUMP_UNPACKED(PRIMITIVE_SIZE, ps, "Primitive Size:\n") + } ++#endif /* PAN_ARCH < 10 */ + + #if PAN_ARCH <= 7 + static void +@@ -482,7 +490,7 @@ pandecode_uniforms(mali_ptr uniforms, unsigned uniform_count) + free(ptr); + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + static void + pandecode_shader_disassemble(mali_ptr shader_ptr, int type, unsigned gpu_id) +@@ -566,7 +574,7 @@ pandecode_texture_payload(mali_ptr payload, + pandecode_indent--; + pandecode_log("},\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH <= 5 + static void +@@ -585,7 +593,7 @@ pandecode_texture(mali_ptr u, unsigned tex) + temp.levels, nr_samples, temp.array_size); + pandecode_indent--; + } +-#else ++#else /* PAN_ARCH > 5 */ + static void + pandecode_texture(const void *cl, unsigned tex) + { +@@ -603,7 +611,7 @@ pandecode_texture(const void *cl, unsigned tex) + + for (unsigned i = 0; i < plane_count; ++i) + DUMP_ADDR(PLANE, temp.surfaces + i * pan_size(PLANE), "Plane %u:\n", i); +-#else ++#else /* PAN_ARCH < 9 */ + unsigned nr_samples = temp.dimension == MALI_TEXTURE_DIMENSION_3D ? 
+ 1 : temp.sample_count; + +@@ -630,7 +638,7 @@ pandecode_textures(mali_ptr textures, unsigned texture_count) + + for (unsigned tex = 0; tex < texture_count; ++tex) + pandecode_texture(cl + pan_size(TEXTURE) * tex, tex); +-#else ++#else /* PAN_ARCH < 6 */ + mali_ptr *PANDECODE_PTR_VAR(u, textures); + + for (int tex = 0; tex < texture_count; ++tex) { +@@ -741,7 +749,7 @@ pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + gpu_id); + } + } +-#endif ++#endif /* PAN_ARCH >= 5 */ + } else + pandecode_log("// XXX: missing shader descriptor\n"); + +@@ -807,7 +815,7 @@ pandecode_vertex_compute_geometry_job(const struct MALI_JOB_HEADER *h, + pandecode_indent--; + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH >= 6 + static void +@@ -823,6 +831,10 @@ pandecode_tiler(mali_ptr gpu_va) + DUMP_UNPACKED(TILER_CONTEXT, t, "Tiler:\n"); + } + ++#endif /* PAN_ARCH >= 6 */ ++ ++#if PAN_ARCH < 10 ++#if PAN_ARCH >= 6 + #if PAN_ARCH <= 7 + static void + pandecode_indexed_vertex_job(const struct MALI_JOB_HEADER *h, +@@ -854,8 +866,8 @@ pandecode_indexed_vertex_job(const struct MALI_JOB_HEADER *h, + + pan_section_unpack(p, INDEXED_VERTEX_JOB, PADDING, padding); + } +-#endif +-#endif ++#endif /* PAN_ARCH <= 7 */ ++#endif /* PAN_ARCH >= 6 */ + + static void + pandecode_tiler_job(const struct MALI_JOB_HEADER *h, +@@ -890,7 +902,7 @@ pandecode_tiler_job(const struct MALI_JOB_HEADER *h, + pan_section_unpack(p, TILER_JOB, PADDING, padding); + #endif + +-#else ++#else /* PAN_ARCH < 6 */ + pan_section_unpack(p, TILER_JOB, PRIMITIVE, primitive); + pandecode_primitive_size(pan_section_ptr(p, TILER_JOB, PRIMITIVE_SIZE), + primitive.point_size_array_format == MALI_POINT_SIZE_ARRAY_FORMAT_NONE); +@@ -898,12 +910,17 @@ pandecode_tiler_job(const struct MALI_JOB_HEADER *h, + pandecode_indent--; + pandecode_log("\n"); + } ++#endif /* PAN_ARCH < 10 */ + + static void +-pandecode_fragment_job(mali_ptr job, unsigned gpu_id) ++pandecode_fragment_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_fragment_job_packed *PANDECODE_PTR_VAR(p, job); +- pan_section_unpack(p, FRAGMENT_JOB, PAYLOAD, s); ++#endif ++ ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, FRAGMENT_JOB, PAYLOAD, s); + + UNUSED struct pandecode_fbd info = pandecode_fbd(s.framebuffer, true, gpu_id); + +@@ -920,7 +937,7 @@ pandecode_fragment_job(mali_ptr job, unsigned gpu_id) + expected_tag |= MALI_FBD_TAG_HAS_ZS_RT; + + expected_tag |= MALI_FBD_TAG_IS_MFBD | (MALI_POSITIVE(info.rt_count) << 2); +-#endif ++#endif /* PAN_ARCH >= 5 */ + + DUMP_UNPACKED(FRAGMENT_JOB_PAYLOAD, s, "Fragment Job Payload:\n"); + +@@ -936,6 +953,8 @@ pandecode_fragment_job(mali_ptr job, unsigned gpu_id) + pandecode_log("\n"); + } + ++#if PAN_ARCH < 10 ++// TODO: Use the same model as for malloc_vertex jobs? 
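For reference, a sketch (based on the pan_section_unpack_cs_v10 definitions added to gen_pack.py later in this patch) of how the dual-mode unpack used by pandecode_fragment_job() above resolves on each architecture:

/*
 *   pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, FRAGMENT_JOB, PAYLOAD, s);
 *
 * PAN_ARCH < 10 expands to the existing memory-based path:
 *     pan_section_unpack(p, FRAGMENT_JOB, PAYLOAD, s);
 *
 * PAN_ARCH == 10 instead unpacks from the 256-entry CS register snapshot:
 *     MALI_FRAGMENT_JOB_SECTION_PAYLOAD_TYPE s;
 *     MALI_FRAGMENT_JOB_SECTION_PAYLOAD_unpack(cs_buf, cs_buf_unk, &s);
 * where the generated _unpack() also calls __gen_clear_value() on cs_buf_unk,
 * so pandecode_cs_dump_state() can later report only the undecoded registers.
 */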
+ static void + pandecode_write_value_job(mali_ptr job) + { +@@ -953,6 +972,7 @@ pandecode_cache_flush_job(mali_ptr job) + DUMP_SECTION(CACHE_FLUSH_JOB, PAYLOAD, p, "Cache Flush Payload:\n"); + pandecode_log("\n"); + } ++#endif /* PAN_ARCH < 10 */ + + #if PAN_ARCH >= 9 + static void +@@ -1034,6 +1054,9 @@ pandecode_resource_tables(mali_ptr addr, const char *label) + static void + pandecode_depth_stencil(mali_ptr addr) + { ++ if (!addr) ++ return; ++ + MAP_ADDR(DEPTH_STENCIL, addr, cl); + pan_unpack(cl, DEPTH_STENCIL, desc); + DUMP_UNPACKED(DEPTH_STENCIL, desc, "Depth/stencil"); +@@ -1060,14 +1083,15 @@ static void + pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + unsigned gpu_id) + { +- mali_ptr frag_shader = 0; +- + pandecode_depth_stencil(p->depth_stencil); + + for (unsigned i = 0; i < p->blend_count; ++i) { ++ MAP_ADDR(SHADER_PROGRAM, p->shader.shader, cl); ++ pan_unpack(cl, SHADER_PROGRAM, desc); ++ + struct mali_blend_packed *PANDECODE_PTR_VAR(blend_descs, p->blend); + +- mali_ptr blend_shader = pandecode_blend(blend_descs, i, frag_shader); ++ mali_ptr blend_shader = pandecode_blend(blend_descs, i, desc.binary); + if (blend_shader) { + fprintf(pandecode_dump_stream, "Blend shader %u", i); + pandecode_shader_disassemble(blend_shader, 0, gpu_id); +@@ -1079,21 +1103,26 @@ pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + } + + static void +-pandecode_malloc_vertex_job(mali_ptr job, unsigned gpu_id) ++pandecode_malloc_vertex_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_malloc_vertex_job_packed *PANDECODE_PTR_VAR(p, job); ++#endif + +- DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE, p, "Primitive:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, INSTANCE_COUNT, p, "Instance count:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, PRIMITIVE, p, cs_buf, cs_buf_unk, "Primitive:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, INSTANCE_COUNT, p, cs_buf, cs_buf_unk, "Instance count:\n"); ++#if PAN_ARCH < 10 + DUMP_SECTION(MALLOC_VERTEX_JOB, ALLOCATION, p, "Allocation:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, TILER, p, "Tiler:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, SCISSOR, p, "Scissor:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE_SIZE, p, "Primitive Size:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, INDICES, p, "Indices:\n"); ++#endif ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, TILER, p, cs_buf, cs_buf_unk, "Tiler:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, SCISSOR, p, cs_buf, cs_buf_unk, "Scissor:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, PRIMITIVE_SIZE, p, cs_buf, cs_buf_unk, "Primitive Size:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, INDICES, p, cs_buf, cs_buf_unk, "Indices:\n"); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, DRAW, dcd); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, DRAW, dcd); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, TILER, tiler_ptr); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, TILER, tiler_ptr); + pandecode_log("Tiler Job Payload:\n"); + pandecode_indent++; + if (tiler_ptr.address) +@@ -1104,17 +1133,20 @@ pandecode_malloc_vertex_job(mali_ptr job, unsigned gpu_id) + + pandecode_dcd(&dcd, 0, gpu_id); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, POSITION, position); +- pan_section_unpack(p, MALLOC_VERTEX_JOB, VARYING, varying); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, POSITION, position); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, VARYING, varying); + 
pandecode_shader_environment(&position, gpu_id); + pandecode_shader_environment(&varying, gpu_id); + } + + static void +-pandecode_compute_job(mali_ptr job, unsigned gpu_id) ++pandecode_compute_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_compute_job_packed *PANDECODE_PTR_VAR(p, job); +- pan_section_unpack(p, COMPUTE_JOB, PAYLOAD, payload); ++#endif ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, COMPUTE_JOB, PAYLOAD, payload); + + pandecode_shader(payload.compute.shader, "Shader", gpu_id); + if (payload.compute.thread_storage) +@@ -1126,8 +1158,9 @@ pandecode_compute_job(mali_ptr job, unsigned gpu_id) + + DUMP_UNPACKED(COMPUTE_PAYLOAD, payload, "Compute:\n"); + } +-#endif ++#endif /* PAN_ARCH >= 9 */ + ++#if PAN_ARCH < 10 + /* Entrypoint to start tracing. jc_gpu_va is the GPU address for the first job + * in the chain; later jobs are found by walking the chain. GPU ID is the + * more finegrained ID because some details are model-specific even within a +@@ -1183,18 +1216,18 @@ GENX(pandecode_jc)(mali_ptr jc_gpu_va, unsigned gpu_id) + pandecode_indexed_vertex_job(&h, jc_gpu_va, gpu_id); + break; + #endif +-#else ++#else /* PAN_ARCH > 7 */ + case MALI_JOB_TYPE_COMPUTE: +- pandecode_compute_job(jc_gpu_va, gpu_id); ++ pandecode_compute_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + + case MALI_JOB_TYPE_MALLOC_VERTEX: +- pandecode_malloc_vertex_job(jc_gpu_va, gpu_id); ++ pandecode_malloc_vertex_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + #endif + + case MALI_JOB_TYPE_FRAGMENT: +- pandecode_fragment_job(jc_gpu_va, gpu_id); ++ pandecode_fragment_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + + default: +@@ -1232,3 +1265,544 @@ GENX(pandecode_abort_on_fault)(mali_ptr jc_gpu_va) + + pandecode_map_read_write(); + } ++#endif ++ ++#if PAN_ARCH >= 10 ++static void ++pandecode_cs_dump_state(uint32_t *state) ++{ ++ uint64_t *st_64 = (uint64_t *)state; ++ /* Only registers below 0x40 seem to be actually be used by jobs */ ++ for (unsigned i = 0; i < 0x40 / 4; ++i) { ++ uint64_t v1 = st_64[i * 2]; ++ uint64_t v2 = st_64[i * 2 + 1]; ++ ++ if (!v1 && !v2) ++ continue; ++ ++ pandecode_log("0x%2x: 0x%16"PRIx64" 0x%16"PRIx64"\n", ++ i * 4, v1, v2); ++ } ++} ++ ++/* Assumes eight scoreboards */ ++static void ++pandecode_scoreboard_mask(unsigned mask) ++{ ++ if (mask == 0xff) { ++ pandecode_log_cont("all"); ++ return; ++ } else if (!mask) { ++ pandecode_log_cont("none"); ++ return; ++ } ++ ++ const char *comma = ""; ++ for (unsigned i = 0; i < 8; ++i) { ++ if (mask & (1 << i)) { ++ pandecode_log_cont("%s%i", comma, i); ++ comma = ","; ++ } ++ } ++} ++ ++static void ++pandecode_regmask(unsigned base, unsigned mask) ++{ ++ switch (mask) { ++ case 0: ++ pandecode_log_cont("(invalid: %02x mask 0)", base); ++ return; ++ case 1: ++ pandecode_log_cont("w%02x", base); ++ return; ++ case 3: ++ pandecode_log_cont("x%02x", base); ++ return; ++ default: ++ break; ++ } ++ ++ unsigned first = ffs(mask) - 1; ++ if (first) ++ pandecode_log_cont("{(+%i) ", first); ++ else ++ pandecode_log_cont("{"); ++ ++ unsigned edges = mask ^ (mask << 1); ++ ++ const char *comma = ""; ++ ++ bool outside = true; ++ unsigned start; ++ u_foreach_bit(i, edges) { ++ if (outside) ++ start = i; ++ else if (i == start + 1) ++ pandecode_log_cont("%sw%02x", comma, ++ base + start); ++ else if (i == start + 2) ++ pandecode_log_cont("%sx%02x", comma, ++ base + start); ++ else ++ pandecode_log_cont("%sw%02x-w%02x", comma, ++ base + start, ++ base + i - 1); ++ outside = !outside; 
++ ++ if (outside) ++ comma = ", "; ++ } ++ ++ pandecode_log_cont("}"); ++} ++ ++static void ++pandecode_cs_buffer(uint64_t *commands, unsigned size, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id, mali_ptr va); ++ ++// Hack hack hackity hack: gpu_id == 1 means "don't decode" (only disassemble) ++static void ++pandecode_cs_command(uint64_t command, mali_ptr va, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id) ++{ ++ uint8_t op = command >> 56; ++ uint8_t addr = (command >> 48) & 0xff; ++ uint64_t value = command & 0xffffffffffffULL; ++ ++ uint32_t h = value >> 32; ++ uint32_t l = value; ++ ++ uint8_t arg1 = h & 0xff; ++ uint8_t arg2 = h >> 8; ++ ++ if (command) ++ pandecode_log("%"PRIx64" %016"PRIx64" ", va, command); ++ ++ switch (op) { ++ case 0: ++ if (addr || value) ++ pandecode_log("nop %02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 1: ++ buffer_unk[addr] = buffer[addr] = l; ++ buffer_unk[addr + 1] = buffer[addr + 1] = h; ++ pandecode_log("mov x%02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 2: ++ buffer_unk[addr] = buffer[addr] = l; ++ pandecode_log("mov w%02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 3: ++ if (l & 0xff00ffff || h || addr) { ++ pandecode_log("wait (unk %02x), (unk %04x), " ++ "%i, (unk %04x)\n", addr, h, l >> 16, l); ++ } else { ++ pandecode_log("wait "); ++ pandecode_scoreboard_mask(l >> 16); ++ pandecode_log_cont("\n"); ++ } ++ break; ++ case 4: { ++ uint32_t masked = l & 0xffff0000; ++ unsigned task_increment = l & 0x3fff; ++ unsigned task_axis = (l >> 14) & 3; ++ if (h != 0xff00 || addr || masked) ++ pandecode_log("compute (unk %02x), (unk %04x), " ++ "(unk %x), inc %i, axis %i\n\n", addr, h, masked, task_increment, task_axis); ++ else ++ pandecode_log("compute inc %i, axis %i\n\n", task_increment, task_axis); ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_compute_job(0, buffer, buffer_unk, gpu_id); ++ ++ /* The gallium driver emits this even for compute jobs, clear ++ * it from unknown state */ ++ pan_unpack_cs(buffer, buffer_unk, SCISSOR, unused_scissor); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ case 6: { ++ /* The meaning of the first argument (in h) is unknown, but it ++ * appears that the second bit must be set. 
*/ ++ uint32_t masked = l & 0xfffff8f0; ++ uint8_t mode = l & 0xf; ++ uint8_t index = (l >> 8) & 7; ++ if (addr || masked) ++ pandecode_log("idvs (unk %02x), 0x%04x, (unk %x), " ++ "mode %i index %i\n\n", ++ addr, h, masked, mode, index); ++ else ++ pandecode_log("idvs 0x%04x, mode %i index %i\n\n", ++ h, mode, index); ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_malloc_vertex_job(0, buffer, buffer_unk, gpu_id); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ case 7: { ++ uint64_t masked = value & ~0x000100000071; ++ bool tem = value & 1; ++ bool unk = (value >> 32) & 1; ++ ++ const char *order = (const char *[]){ ++ "z_order", ++ "horizontal", ++ "vertical", ++ "invalid_3", ++ "invalid_4", ++ "reverse_horizontal", ++ "reverse_vertical", ++ "invalid_7", ++ }[(value >> 4) & 7]; ++ ++ if (addr || masked) { ++ pandecode_log("fragment (unk %02x), (unk %"PRIx64")\n\n", ++ addr, value); ++ } else if (value) { ++ pandecode_log("fragment tem %i, render %s, unk %i\n\n", ++ tem, order, unk); ++ } else { ++ pandecode_log("fragment\n\n"); ++ } ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_fragment_job(0, buffer, buffer_unk, gpu_id); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ ++ case 9: { ++ if (addr || l || h > 1) ++ pandecode_log("flush_tiler (unk %02x), (unk %"PRIx64")\n", ++ addr, value); ++ else if (h) ++ pandecode_log("flush_tiler unk\n"); ++ else ++ pandecode_log("flush_tiler\n"); ++ break; ++ } ++ ++ case 16: case 17: { ++ char wid = (op == 16) ? 'w' : 'x'; ++ ++ if (op == 16) { ++ buffer_unk[addr] = buffer[addr] = buffer[arg2] + l; ++ } else { ++ uint64_t r = buffer[arg2] + ((uint64_t)buffer[arg2 + 1] << 32) + l; ++ buffer_unk[addr] = buffer[addr] = r; ++ buffer_unk[addr + 1] = buffer[addr + 1] = r >> 32; ++ } ++ ++ if (arg1) ++ pandecode_log("add %c%02x, (unk %x), %c%02x, #0x%x\n", ++ wid, addr, arg1, wid, arg2, l); ++ else if ((int32_t) l < 0) ++ pandecode_log("add %c%02x, %c%02x, %i\n", ++ wid, addr, wid, arg2, (int32_t) l); ++ else if (l) ++ pandecode_log("add %c%02x, %c%02x, #0x%x\n", ++ wid, addr, wid, arg2, l); ++ else ++ pandecode_log("mov %c%02x, %c%02x\n", ++ wid, addr, wid, arg2); ++ ++ break; ++ } ++ ++ case 20: case 21: { ++ const char *name = (op == 20) ? "ldr" : "str"; ++ ++ /* The immediate offset must be 4-aligned (though if the ++ * address itself is unaligned, the bits will silently be ++ * masked off). ++ * ++ * Up to 16 32-bit registers can be read or written in a ++ * single instruction, behaviour is similar to LDM or STM ++ * except that a base register is specified. ++ * ++ * These instructions are high latency. Use WAIT 0 to wait for ++ * the result of an LDR, or for a STR to finish. ++ * ++ * For LDR, it is an error for the address register to be ++ * included in the destination register set. ++ */ ++ ++ if (arg1) { ++ pandecode_log("%s (unk %02x), x%02x, (mask %x), [x%02x, %i]\n", ++ name, arg1, addr, l >> 16, arg2, (int16_t) l); ++ } else { ++ pandecode_log("%s ", name); ++ pandecode_regmask(addr, l >> 16); ++ pandecode_log_cont(", [x%02x, %i]\n", arg2, (int16_t) l); ++ } ++ break; ++ } ++ ++ case 22: { ++ /* The signed 32-bit source register is compared against zero ++ * for these comparisons. For example, .GT means that the ++ * branch is taken if the signed register value is greater ++ * than zero. 
*/ ++ const char *comparisons[] = { ++ ".le", ".gt", ++ ".eq", ".ne", ++ ".lt", ".ge", ++ "" /* always */, ".(invalid: never)", ++ }; ++ ++ const char *m = comparisons[(l >> 28) & 7]; ++ ++ int16_t offset = l; ++ ++ bool forward = (offset >= 0); ++ if (!forward) ++ offset = -1 - offset; ++ ++ if (addr || arg1 || l & 0x8fff0000) { ++ pandecode_log("b%s (unk %02x), w%02x, (unk %02x), " ++ "(unk 0x%x), %s %i\n", ++ m, addr, arg2, arg1, l & 0x8fff0000, ++ forward ? "skip" : "back", ++ offset); ++ } else { ++ pandecode_log("b%s w%02x, %s %i\n", ++ m, arg2, ++ forward ? "skip" : "back", ++ offset); ++ } ++ ++ break; ++ } ++ ++ case 23: { ++ if (value >> 3 || addr) ++ pandecode_log("slot (unk %02x), (unk %"PRIx64"), " ++ "%i\n", addr, value >> 3, l & 7); ++ else ++ pandecode_log("slot %i\n", l); ++ break; ++ } ++ ++ case 32: case 33: { ++ /* A tail call is similar to a normal call, but reuses the ++ * current stack entry so that execution returns directly to ++ * the parent, rather than pushing a new entry and returning ++ * to the instruction after the call. Using tail calls avoids ++ * the possibility of stack overflow. ++ */ ++ const char *name = (op == 32) ? "call" : "tailcall"; ++ ++ unsigned length = buffer[arg1]; ++ uint64_t target = (((uint64_t)buffer[arg2 + 1]) << 32) | buffer[arg2]; ++ ++ assert(!(length & 7)); ++ unsigned instrs = length / 8; ++ ++ if (addr || l) ++ pandecode_log("%s (unk %02x), w%02x (%i instructions), x%02x (0x%"PRIx64"), (unk %x)\n", ++ name, addr, arg1, instrs, arg2, target, l); ++ else ++ pandecode_log("%s w%02x (%i instructions), x%02x (0x%"PRIx64")\n", ++ name, arg1, instrs, arg2, target); ++ ++ if (!target || !length) ++ break; ++ ++ uint64_t *t = pandecode_fetch_gpu_mem(target, length); ++ pandecode_indent++; ++ pandecode_cs_buffer(t, length, buffer, buffer_unk, gpu_id, ++ target); ++ pandecode_indent--; ++ break; ++ } ++ ++ case 34: { ++ /* idvs implies tiler */ ++ if (l & ~0xf) ++ pandecode_log("resources 0x%x\n", l); ++ else ++ pandecode_log("resources%s%s%s%s\n", ++ (l & 1) ? " compute" : "", ++ (l & 2) ? " fragment" : "", ++ (l & 4) ? " tiler" : "", ++ (l & 8) ? " idvs" : ""); ++ break; ++ } ++ ++ case 37: case 38: case 51: case 52: { ++ /* ++ * 0b 00100101 / 00100110 -- opcode ++ * ????0??? -- unk. usually 1, faults if "0" bit set ++ * aaaaaaaa -- address register ++ * vvvvvvvv -- 32-bit value register ++ * 00000000 -- seems to act as NOP if nonzero ++ * mmmmmmmm -- some sort of mask, unknown purpose ++ * ???????? -- seems to have no effect ++ * ?????s0u -- 's' disables signal to CPU, ++ * 'u' has unknown purpose (disable GPU signal?) ++ * ++ * The difference between the two opcodes is unknown. ++ * ++ * That the 'mmmmmmmm' byte is somehow a scoreboard mask is ++ * a possibility. ++ */ ++ ++ const char *name = (op & 1) ? "evadd" : "evstr"; ++ const char *type = (op > 50) ? "x" : "w"; ++ ++ if (addr != 1 || l & 0xff00fffa) { ++ pandecode_log("%s (unk %02x), %s%02x, [x%02x], " ++ "unk 0x%x, flags 0x%x\n", ++ name, addr, type, arg1, arg2, ++ l >> 16, (uint16_t) l); ++ } else { ++ pandecode_log("%s %s%02x, [x%02x], unk 0x%x%s%s\n", ++ name, type, arg1, arg2, l >> 16, ++ l & 0x4 ? "" : ", irq", ++ l & 0x1 ? ", unk0" : ""); ++ } ++ ++ break; ++ } ++ ++ case 39: case 53: { ++ const char *m = (const char *[]){ ++ ".ls", ++ ".hi", ++ }[(l >> 28) & 1]; ++ const char *e = (const char *[]){ ++ ".inherit", ++ ".no_error", ++ }[l & 1]; ++ const char *type = (op > 50) ? 
"x" : "w"; ++ ++ /* Wait until the value in the destination register is changed ++ * to pass the comparison. For example, with .LS the value ++ * in memory must be less than or same as the reference to ++ * continue execution. */ ++ if (addr || l & ~((1 << 28) | (1 << 0))) ++ pandecode_log("evwait%s%s (unk %02x), %s%02x, " ++ "[x%02x, unk %x]\n", ++ m, e, addr, type, arg1, arg2, l); ++ else ++ pandecode_log("evwait%s%s %s%02x, [x%02x]\n", ++ m, e, type, arg1, arg2); ++ break; ++ } ++ ++ case 40: { ++ if (addr || l >> 16 || arg1 > 1) { ++ pandecode_log("str type %02x, (unk %02x), " ++ "(unk %x), [x%02x, %i]\n", ++ addr, arg1, ++ l >> 16, arg2, (int16_t) l); ++ } else { ++ const char *type = (const char *[]) { ++ "timestamp", ++ "cycles", ++ }[arg1]; ++ ++ pandecode_log("str %s, [x%02x, %i]\n", ++ type, arg2, (int16_t) l); ++ } ++ break; ++ } ++ ++ case 48: { ++ if (addr || arg1 || l) ++ pandecode_log("heapctx (unk %02x), " ++ "x%02x, (unk %02x), (unk %x)\n", ++ addr, arg2, arg1, l); ++ else ++ pandecode_log("heapctx x%02x\n", arg2); ++ break; ++ } ++ ++ case 49: { ++ const char *m = (const char *[]){ ++ "vt_start", ++ "vt_end", ++ "unk", ++ "frag_end", ++ }[arg1 & 3]; ++ ++ if (addr || arg2 || arg1 > 3 || l) ++ pandecode_log("heapinc (unk %02x), " ++ "(unk %02x), %02x, (unk %x)\n", ++ addr, arg2, arg1, l); ++ else ++ pandecode_log("heapinc %s\n", m); ++ break; ++ } ++ ++ default: ++ /* ++ * UNK 00 30, #0x480000000000 -- takes an eight-byte aligned ++ * memory address. ++ */ ++ ++ pandecode_log("UNK %02x %02x, #0x%"PRIx64"\n", addr, op, value); ++ break; ++ } ++} ++ ++// TODO: reorder args ++static void ++pandecode_cs_buffer(uint64_t *commands, unsigned size, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id, mali_ptr va) ++{ ++ uint64_t *end = (uint64_t *)((uint8_t *) commands + size); ++ ++ for (uint64_t c = *commands; commands < end; c = *(++commands)) { ++ pandecode_cs_command(c, va, buffer, buffer_unk, gpu_id); ++ va += 8; ++ } ++} ++ ++// TODO: Does it make sense to pass in the length? 
++void ++GENX(pandecode_cs)(mali_ptr cs_gpu_va, unsigned size, unsigned gpu_id) ++{ ++ pandecode_dump_file_open(); ++ ++ // TODO: Pass down the buffer during recursion ++ uint32_t buffer[256] = {0}; ++ uint32_t buffer_unk[256] = {0}; ++ ++ uint64_t *commands = pandecode_fetch_gpu_mem(cs_gpu_va, 1); ++ ++ pandecode_log("\n"); ++ ++ pandecode_cs_buffer(commands, size, buffer, buffer_unk, gpu_id, ++ cs_gpu_va); ++ ++ fflush(pandecode_dump_stream); ++ pandecode_map_read_write(); ++} ++#endif +diff --git a/src/panfrost/lib/genxml/decode.h b/src/panfrost/lib/genxml/decode.h +index 6fa6014eb0e..4f175adfb2e 100644 +--- a/src/panfrost/lib/genxml/decode.h ++++ b/src/panfrost/lib/genxml/decode.h +@@ -50,8 +50,6 @@ struct pandecode_mapped_memory *pandecode_find_mapped_gpu_mem_containing(uint64_ + + void pandecode_map_read_write(void); + +-void pandecode_dump_mappings(void); +- + static inline void * + __pandecode_fetch_gpu_mem(uint64_t gpu_va, size_t size, + int line, const char *filename) +@@ -98,6 +96,8 @@ void pandecode_abort_on_fault_v6(mali_ptr jc_gpu_va); + void pandecode_abort_on_fault_v7(mali_ptr jc_gpu_va); + void pandecode_abort_on_fault_v9(mali_ptr jc_gpu_va); + ++void pandecode_cs_v10(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ + static inline void + pan_hexdump(FILE *fp, const uint8_t *hex, size_t cnt, bool with_strings) + { +@@ -130,7 +130,7 @@ pan_hexdump(FILE *fp, const uint8_t *hex, size_t cnt, bool with_strings) + fprintf(fp, " | "); + for (unsigned j = i & ~0xF; j <= i; ++j) { + uint8_t c = hex[j]; +- fputc((c < 32 || c > 128) ? '.' : c, fp); ++ fputc((c < 32 || c > 126) ? '.' : c, fp); + } + } + +diff --git a/src/panfrost/lib/genxml/decode_common.c b/src/panfrost/lib/genxml/decode_common.c +index ecc02387175..41c63b290c7 100644 +--- a/src/panfrost/lib/genxml/decode_common.c ++++ b/src/panfrost/lib/genxml/decode_common.c +@@ -202,7 +202,7 @@ pointer_as_memory_reference(uint64_t ptr) + + static int pandecode_dump_frame_count = 0; + +-static bool force_stderr = false; ++bool force_stderr = false; + + void + pandecode_dump_file_open(void) +@@ -230,7 +230,7 @@ pandecode_dump_file_open(void) + } + } + +-static void ++void + pandecode_dump_file_close(void) + { + simple_mtx_assert_locked(&pandecode_lock); +@@ -289,8 +289,9 @@ pandecode_dump_mappings(void) + if (!it->addr || !it->length) + continue; + +- fprintf(pandecode_dump_stream, "Buffer: %s gpu %" PRIx64 "\n\n", +- it->name, it->gpu_va); ++ fprintf(pandecode_dump_stream, "Buffer: %s gpu %" PRIx64 ++ " length %zu\n\n", ++ it->name, it->gpu_va, it->length); + + pan_hexdump(pandecode_dump_stream, it->addr, it->length, false); + fprintf(pandecode_dump_stream, "\n"); +@@ -333,3 +334,20 @@ pandecode_jc(mali_ptr jc_gpu_va, unsigned gpu_id) + + simple_mtx_unlock(&pandecode_lock); + } ++ ++void pandecode_cs_v10(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ ++void ++pandecode_cs(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id) ++{ ++ simple_mtx_lock(&pandecode_lock); ++ ++ switch (pan_arch(gpu_id)) { ++ // Hack hack hackity hack: gpu_id == 1 means "don't decode" (only ++ // disassemble) ++ case 0: case 10: pandecode_cs_v10(cs_gpu_va, cs_size, gpu_id); break; ++ default: unreachable("Unsupported architecture"); ++ } ++ ++ simple_mtx_unlock(&pandecode_lock); ++} +diff --git a/src/panfrost/lib/genxml/gen_macros.h b/src/panfrost/lib/genxml/gen_macros.h +index 1ef4b53a508..24072634fdc 100644 +--- a/src/panfrost/lib/genxml/gen_macros.h ++++ b/src/panfrost/lib/genxml/gen_macros.h +@@ -93,6 +93,9 @@ pan_arch(unsigned 
gpu_id) + #elif (PAN_ARCH == 9) + # define GENX(X) X##_v9 + # include "genxml/v9_pack.h" ++#elif (PAN_ARCH == 10) ++# define GENX(X) X##_v10 ++# include "genxml/v10_pack.h" + #else + # error "Need to add suffixing macro for this architecture" + #endif +diff --git a/src/panfrost/lib/genxml/gen_pack.py b/src/panfrost/lib/genxml/gen_pack.py +index 434a228c514..bd6343f5908 100644 +--- a/src/panfrost/lib/genxml/gen_pack.py ++++ b/src/panfrost/lib/genxml/gen_pack.py +@@ -46,6 +46,18 @@ + + #include "util/bitpack_helpers.h" + ++/* Most functions assume the caller has done bounds checking */ ++typedef struct pan_command_stream { ++ uint64_t *ptr; ++ uint64_t *begin; ++ uint64_t *end; ++ uint64_t gpu; ++} pan_command_stream; ++ ++struct pan_command_stream_decoded { ++ uint32_t values[256]; ++}; ++ + #define __gen_unpack_float(x, y, z) uif(__gen_unpack_uint(x, y, z)) + + static inline uint32_t +@@ -98,6 +110,20 @@ + return (2*odd + 1) << shift; + } + ++static inline void ++__gen_clear_value(uint8_t *restrict cl, uint32_t start, uint32_t end) ++{ ++ for (uint32_t byte = start / 8; byte <= end / 8; byte++) { ++ uint8_t m = 0; ++ if (byte == start / 8) ++ m |= 0xff >> (8 - start % 8); ++ if (byte == end / 8) ++ m |= 0xff << (1 + end % 8); ++ ++ cl[byte] &= m; ++ } ++} ++ + #define PREFIX1(A) MALI_ ## A + #define PREFIX2(A, B) MALI_ ## A ## _ ## B + #define PREFIX4(A, B, C, D) MALI_ ## A ## _ ## B ## _ ## C ## _ ## D +@@ -183,6 +209,96 @@ + + """ + ++no_cs = "".join([f""" ++#define MALI_{y} MALI_{x} ++#define MALI_{y}_header MALI_{x}_header ++#define MALI_{y}_pack MALI_{x}_pack ++#define MALI_{y}_LENGTH MALI_{x}_LENGTH ++#define MALI_{y}_ALIGN MALI_{x}_ALIGN ++#define mali_{y.lower()}_packed mali_{x.lower()}_packed ++#define MALI_{y}_unpack MALI_{x}_unpack ++#define MALI_{y}_print MALI_{x}_print ++""" for x, y in (("DRAW", "DRAW_NO_CS"), )]) + """ ++ ++#define pan_pack_cs_v10(dst, _, T, name) pan_pack(dst, T, name) ++ ++#define pan_section_pack_cs_v10(dst, _, A, S, name) pan_section_pack(dst, A, S, name) ++ ++#define pan_unpack_cs_v10(dst, _, __, T, name) pan_unpack(dst, T, name) ++ ++#define pan_section_unpack_cs_v10(src, _, __, A, S, name) pan_section_unpack(src, A, S, name) ++""" ++ ++with_cs = """ ++#define pan_pack_cs(dst, T, name) \\ ++ for (struct PREFIX1(T) name = { PREFIX2(T, header) }, \\ ++ *_loop_terminate = (void *) (dst); \\ ++ __builtin_expect(_loop_terminate != NULL, 1); \\ ++ ({ PREFIX2(T, pack_cs)(dst, &name); \\ ++ _loop_terminate = NULL; })) ++ ++#define pan_section_pack_cs(dst, A, S, name) \\ ++ for (PREFIX4(A, SECTION, S, TYPE) name = { PREFIX4(A, SECTION, S, header) }, \\ ++ *_loop_terminate = (void *) (dst); \\ ++ __builtin_expect(_loop_terminate != NULL, 1); \\ ++ ({ PREFIX4(A, SECTION, S, pack_cs) (dst, &name); \\ ++ _loop_terminate = NULL; })) ++ ++#define pan_section_pack_cs_v10(_, dst, A, S, name) pan_section_pack_cs(dst, A, S, name) ++ ++// TODO: assert that the first argument is NULL ++#define pan_pack_cs_v10(_, dst, T, name) pan_pack_cs(dst, T, name) ++ ++#define pan_pack_ins(dst, T, name) \\ ++ for (struct PREFIX1(T) name = { PREFIX2(T, header) }, \\ ++ *_loop_terminate = (void *) (dst); \\ ++ __builtin_expect(_loop_terminate != NULL, 1); \\ ++ ({ PREFIX2(T, pack_ins)(dst, &name); \\ ++ _loop_terminate = NULL; })) ++ ++#define pan_unpack_cs(buf, buf_unk, T, name) \\ ++ struct PREFIX1(T) name; \\ ++ PREFIX2(T, unpack)(buf, buf_unk, &name) ++ ++#define pan_unpack_cs_v10(_, buf, buf_unk, T, name) pan_unpack_cs(buf, buf_unk, T, name) ++ ++#define pan_section_unpack_cs_v10(_, 
buf, buf_unk, A, S, name) \\ ++ PREFIX4(A, SECTION, S, TYPE) name; \\ ++ PREFIX4(A, SECTION, S, unpack)(buf, buf_unk, &name) ++ ++static inline void ++pan_emit_cs_ins(pan_command_stream *s, uint8_t op, uint64_t instr) ++{ ++ assert(instr < (1ULL << 56)); ++ instr |= ((uint64_t)op << 56); ++ *((s->ptr)++) = instr; ++} ++ ++static inline void ++pan_emit_cs_32(pan_command_stream *s, uint8_t reg, uint32_t value) ++{ ++ pan_emit_cs_ins(s, 2, ((uint64_t) reg << 48) | value); ++} ++ ++static inline void ++pan_emit_cs_48(pan_command_stream *s, uint8_t reg, uint64_t value) ++{ ++ assert(value < (1ULL << 48)); ++ pan_emit_cs_ins(s, 1, ((uint64_t) reg << 48) | value); ++} ++ ++static inline void ++pan_emit_cs_64(pan_command_stream *s, uint8_t reg, uint64_t value) ++{ ++ if (value < (1ULL << 48)) { ++ pan_emit_cs_48(s, reg, value); ++ } else { ++ pan_emit_cs_32(s, reg, value); ++ pan_emit_cs_32(s, reg + 1, value >> 32); ++ } ++} ++""" ++ + def to_alphanum(name): + substitutions = { + ' ': '_', +@@ -297,7 +413,7 @@ def __init__(self, parser, attrs): + + if ":" in str(attrs["start"]): + (word, bit) = attrs["start"].split(":") +- self.start = (int(word) * 32) + int(bit) ++ self.start = (int(word, 0) * 32) + int(bit) + else: + self.start = int(attrs["start"]) + +@@ -331,7 +447,8 @@ def emit_template_struct(self, dim): + type = 'uint64_t' + elif self.type == 'int': + type = 'int32_t' +- elif self.type in ['uint', 'hex', 'uint/float', 'padded', 'Pixel Format']: ++ # TODO: Convert to tuple ++ elif self.type in ['uint', 'hex', 'register', 'uint/float', 'padded', 'Pixel Format']: + type = 'uint32_t' + elif self.type in self.parser.structs: + type = 'struct ' + self.parser.gen_prefix(safe_name(self.type.upper())) +@@ -385,8 +502,8 @@ def emit_template_struct(self, dim): + field.emit_template_struct(dim) + + class Word: +- def __init__(self): +- self.size = 32 ++ def __init__(self, size=32): ++ self.size = size + self.contributors = [] + + class FieldRef: +@@ -410,7 +527,7 @@ def collect_fields(self, fields, offset, path, all_fields): + end = offset + field.end + all_fields.append(self.FieldRef(field, field_path, start, end)) + +- def collect_words(self, fields, offset, path, words): ++ def collect_words(self, fields, offset, path, words, ins=False): + for field in fields: + field_path = '{}{}'.format(path, field.name) + start = offset + field.start +@@ -424,16 +541,27 @@ def collect_words(self, fields, offset, path, words): + contributor = self.FieldRef(field, field_path, start, end) + first_word = contributor.start // 32 + last_word = contributor.end // 32 ++ if ins: ++ assert(last_word < 2) ++ first_word = last_word = 0 ++ + for b in range(first_word, last_word + 1): + if not b in words: +- words[b] = self.Word() ++ words[b] = self.Word(size=64 if ins else 32) ++ + words[b].contributors.append(contributor) + +- def emit_pack_function(self): +- self.get_length() ++ return ++ ++ def emit_pack_function(self, csf=False, ins=False): ++ if csf: ++ self.length = 256 * 4 ++ else: ++ self.get_length() ++ assert(not ins) + + words = {} +- self.collect_words(self.fields, 0, '', words) ++ self.collect_words(self.fields, 0, '', words, ins=ins) + + # Validate the modifier is lossless + for field in self.fields: +@@ -449,25 +577,52 @@ def emit_pack_function(self): + elif field.modifier[0] == "log2": + print(" assert(util_is_power_of_two_nonzero(values->{}));".format(field.name)) + +- for index in range(self.length // 4): ++ if ins: ++ index_list = (0, ) ++ elif csf: ++ index_list = sorted(words) ++ else: ++ index_list = 
range(self.length // 4) ++ ++ for index in index_list: + # Handle MBZ words + if not index in words: +- print(" cl[%2d] = 0;" % index) ++ if ins: ++ print(" pan_emit_cs_ins(s, 0x%02x, 0);" % self.op) ++ elif not csf: ++ print(" cl[%2d] = 0;" % index) + continue + + word = words[index] + + word_start = index * 32 + ++ size = 32 ++ # Can we move all fields from the next index here? ++ if csf and index % 2 == 0 and index + 1 in words: ++ word_next = words[index + 1] ++ end = max(c.end for c in word_next.contributors) ++ if end - word_start < 48: ++ size = 48 ++ word.contributors += [x for x in word_next.contributors if not x in word.contributors] ++ del words[index + 1] ++ + v = None +- prefix = " cl[%2d] =" % index ++ if ins: ++ prefix = " pan_emit_cs_ins(s, 0x%02x," % self.op ++ elif size == 48: ++ prefix = " pan_emit_cs_48(s, 0x%02x," % index ++ elif csf: ++ prefix = " pan_emit_cs_32(s, 0x%02x," % index ++ else: ++ prefix = " cl[%2d] = (" % index + + for contributor in word.contributors: + field = contributor.field + name = field.name + start = contributor.start + end = contributor.end +- contrib_word_start = (start // 32) * 32 ++ contrib_word_start = (start // word.size) * word.size + start -= contrib_word_start + end -= contrib_word_start + +@@ -482,7 +637,7 @@ def emit_pack_function(self): + elif field.modifier[0] == "log2": + value = "util_logbase2({})".format(value) + +- if field.type in ["uint", "hex", "uint/float", "address", "Pixel Format"]: ++ if field.type in ["uint", "hex", "uint/float", "address", "register", "Pixel Format"]: + s = "util_bitpack_uint(%s, %d, %d)" % \ + (value, start, end) + elif field.type == "padded": +@@ -505,11 +660,13 @@ def emit_pack_function(self): + + if not s == None: + shift = word_start - contrib_word_start +- if shift: ++ if shift > 0: + s = "%s >> %d" % (s, shift) ++ elif shift < 0: ++ s = "%s << %d" % (s, -shift) + + if contributor == word.contributors[-1]: +- print("%s %s;" % (prefix, s)) ++ print("%s %s);" % (prefix, s)) + else: + print("%s %s |" % (prefix, s)) + prefix = " " +@@ -528,22 +685,23 @@ def mask_for_word(self, index, start, end): + count = (end - start + 1) + return (((1 << count) - 1) << start) + +- def emit_unpack_function(self): ++ def emit_unpack_function(self, csf=False): + # First, verify there is no garbage in unused bits + words = {} + self.collect_words(self.fields, 0, '', words) + +- for index in range(self.length // 4): +- base = index * 32 +- word = words.get(index, self.Word()) +- masks = [self.mask_for_word(index, c.start, c.end) for c in word.contributors] +- mask = reduce(lambda x,y: x | y, masks, 0) ++ if not csf: ++ for index in range(self.length // 4): ++ base = index * 32 ++ word = words.get(index, self.Word()) ++ masks = [self.mask_for_word(index, c.start, c.end) for c in word.contributors] ++ mask = reduce(lambda x,y: x | y, masks, 0) + +- ALL_ONES = 0xffffffff ++ ALL_ONES = 0xffffffff + +- if mask != ALL_ONES: +- TMPL = ' if (((const uint32_t *) cl)[{}] & {}) fprintf(stderr, "XXX: Invalid field of {} unpacked at word {}\\n");' +- print(TMPL.format(index, hex(mask ^ ALL_ONES), self.label, index)) ++ if mask != ALL_ONES: ++ TMPL = ' if (((const uint32_t *) cl)[{}] & {}) fprintf(stderr, "XXX: Invalid field of {} unpacked at word {}\\n");' ++ print(TMPL.format(index, hex(mask ^ ALL_ONES), self.label, index)) + + fieldrefs = [] + self.collect_fields(self.fields, 0, '', fieldrefs) +@@ -556,7 +714,7 @@ def emit_unpack_function(self): + args.append(str(fieldref.start)) + args.append(str(fieldref.end)) + +- if field.type 
in set(["uint", "hex", "uint/float", "address", "Pixel Format"]): ++ if field.type in set(["uint", "hex", "uint/float", "address", "register", "Pixel Format"]): + convert = "__gen_unpack_uint" + elif field.type in self.parser.enums: + convert = "(enum %s)__gen_unpack_uint" % enum_name(field.type) +@@ -588,6 +746,9 @@ def emit_unpack_function(self): + mask = hex(field.modifier[1] - 1) + print(' assert(!(values->{} & {}));'.format(fieldref.path, mask)) + ++ if csf: ++ print(' __gen_clear_value({});'.format(', '.join(['cl_unk'] + args[1:]))) ++ + def emit_print_function(self): + for field in self.fields: + convert = None +@@ -610,7 +771,7 @@ def emit_print_function(self): + print(' fprintf(fp, "%*s{}: %f\\n", indent, "", {});'.format(name, val)) + elif field.type in ["uint", "hex"] and (field.end - field.start) >= 32: + print(' fprintf(fp, "%*s{}: 0x%" PRIx64 "\\n", indent, "", {});'.format(name, val)) +- elif field.type == "hex": ++ elif field.type in ("hex", "register"): + print(' fprintf(fp, "%*s{}: 0x%x\\n", indent, "", {});'.format(name, val)) + elif field.type == "uint/float": + print(' fprintf(fp, "%*s{}: 0x%X (%f)\\n", indent, "", {}, uif({}));'.format(name, val, val)) +@@ -649,9 +810,13 @@ def start_element(self, name, attrs): + print(v6_format_printer) + else: + print(v7_format_printer) ++ if arch < 10: ++ print(no_cs) ++ else: ++ print(with_cs) + elif name == "struct": + name = attrs["name"] +- self.no_direct_packing = attrs.get("no-direct-packing", False) ++ self.layout = attrs.get("layout", "struct") + object_name = self.gen_prefix(safe_name(name.upper())) + self.struct = object_name + +@@ -659,10 +824,16 @@ def start_element(self, name, attrs): + if "size" in attrs: + self.group.length = int(attrs["size"]) * 4 + self.group.align = int(attrs["align"]) if "align" in attrs else None ++ self.group.op = int(attrs["op"]) if "op" in attrs else None + self.structs[attrs["name"]] = self.group ++ self.unpacked_alias = self.gen_prefix(safe_name(attrs["unpacked"].upper())) if "unpacked" in attrs else None + elif name == "field": +- self.group.fields.append(Field(self, attrs)) + self.values = [] ++ self.skip_field = self.layout == "cs" and not attrs["start"].startswith("0x") ++ if self.skip_field: ++ #print(f"#warning Skipping non-CS field {attrs['name']}") ++ return ++ self.group.fields.append(Field(self, attrs)) + elif name == "enum": + self.values = [] + self.enum = safe_name(attrs["name"]) +@@ -675,6 +846,8 @@ def start_element(self, name, attrs): + self.values.append(Value(attrs)) + elif name == "aggregate": + aggregate_name = self.gen_prefix(safe_name(attrs["name"].upper())) ++ # TODO: Make .layout less "global"? 
++ self.layout = attrs.get("layout", "struct") + self.aggregate = Aggregate(self, aggregate_name, attrs) + self.aggregates[attrs['name']] = self.aggregate + elif name == "section": +@@ -687,7 +860,8 @@ def end_element(self, name): + self.struct = None + self.group = None + elif name == "field": +- self.group.fields[-1].values = self.values ++ if not self.skip_field: ++ self.group.fields[-1].values = self.values + elif name == "enum": + self.emit_enum() + self.enum = None +@@ -717,22 +891,33 @@ def emit_header(self, name): + print('') + + def emit_template_struct(self, name, group): +- print("struct %s {" % name) +- group.emit_template_struct("") +- print("};\n") ++ if self.unpacked_alias: ++ # TODO: Check the fields match ++ print("#define %s %s" % (name, self.unpacked_alias)) ++ else: ++ print("struct %s {" % name) ++ group.emit_template_struct("") ++ print("};\n") + + def emit_aggregate(self): + aggregate = self.aggregate +- print("struct %s_packed {" % aggregate.name.lower()) +- print(" uint32_t opaque[{}];".format(aggregate.get_size() // 4)) +- print("};\n") +- print('#define {}_LENGTH {}'.format(aggregate.name.upper(), aggregate.size)) ++ ++ if self.layout == "struct": ++ print("struct %s_packed {" % aggregate.name.lower()) ++ print(" uint32_t opaque[{}];".format(aggregate.get_size() // 4)) ++ print("};\n") ++ print('#define {}_LENGTH {}'.format(aggregate.name.upper(), aggregate.size)) ++ else: ++ assert(self.layout == "cs") ++ + if aggregate.align != None: + print('#define {}_ALIGN {}'.format(aggregate.name.upper(), aggregate.align)) + for section in aggregate.sections: + print('#define {}_SECTION_{}_TYPE struct {}'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + print('#define {}_SECTION_{}_header {}_header'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + print('#define {}_SECTION_{}_pack {}_pack'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) ++ # TODO: Only when req'd ++ print('#define {}_SECTION_{}_pack_cs {}_pack_cs'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + print('#define {}_SECTION_{}_unpack {}_unpack'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + print('#define {}_SECTION_{}_print {}_print'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + print('#define {}_SECTION_{}_OFFSET {}'.format(aggregate.name.upper(), section.name.upper(), section.offset)) +@@ -747,12 +932,32 @@ def emit_pack_function(self, name, group): + print("}\n\n") + + # Should be a whole number of words +- assert((self.group.length % 4) == 0) ++ assert((group.length % 4) == 0) ++ ++ print('#define {} {}'.format (name + "_LENGTH", group.length)) ++ if group.align != None: ++ print('#define {} {}'.format (name + "_ALIGN", group.align)) ++ print('struct {}_packed {{ uint32_t opaque[{}]; }};'.format(name.lower(), group.length // 4)) ++ ++ def emit_cs_pack_function(self, name, group): ++ print("static inline void\n%s_pack_cs(pan_command_stream * restrict s,\n%sconst struct %s * restrict values)\n{\n" % ++ (name, ' ' * (len(name) + 6), name)) ++ ++ group.emit_pack_function(csf=True) + +- print('#define {} {}'.format (name + "_LENGTH", self.group.length)) +- if self.group.align != None: +- print('#define {} {}'.format (name + "_ALIGN", self.group.align)) +- print('struct {}_packed {{ uint32_t opaque[{}]; }};'.format(name.lower(), self.group.length // 4)) ++ print("}\n\n") ++ ++ assert(group.length == 256 * 4) ++ ++ def emit_ins_pack_function(self, name, 
group): ++ print("static inline void\n%s_pack_ins(pan_command_stream * restrict s,\n%sconst struct %s * restrict values)\n{" % ++ (name, ' ' * (len(name) + 6), name)) ++ ++ group.emit_pack_function(csf=True, ins=True) ++ ++ print("}\n\n") ++ ++ assert(group.length == 256 * 4) + + def emit_unpack_function(self, name, group): + print("static inline void") +@@ -763,6 +968,18 @@ def emit_unpack_function(self, name, group): + + print("}\n") + ++ def emit_cs_unpack_function(self, name, group): ++ print("static inline void") ++ print("%s_unpack(const uint32_t * restrict buffer, uint32_t * restrict buffer_unk,\n" ++ "%sstruct %s * restrict values)\n{" ++ " const uint8_t *cl = (uint8_t *)buffer;\n" ++ " uint8_t *cl_unk = (uint8_t *)buffer_unk;\n" % ++ (name.upper(), ' ' * (len(name) + 8), name)) ++ ++ group.emit_unpack_function(csf=True) ++ ++ print("}\n") ++ + def emit_print_function(self, name, group): + print("static inline void") + print("{}_print(FILE *fp, const struct {} * values, unsigned indent)\n{{".format(name.upper(), name)) +@@ -776,14 +993,20 @@ def emit_struct(self): + + self.emit_template_struct(self.struct, self.group) + self.emit_header(name) +- if self.no_direct_packing == False: ++ if self.layout == "struct": + self.emit_pack_function(self.struct, self.group) + self.emit_unpack_function(self.struct, self.group) ++ elif self.layout == "cs": ++ self.emit_cs_pack_function(self.struct, self.group) ++ self.emit_cs_unpack_function(self.struct, self.group) ++ elif self.layout == "ins": ++ # TODO: I don't think that the current unpack emit functions would ++ # work ++ self.emit_ins_pack_function(self.struct, self.group) ++ else: ++ assert(self.layout == "none") + self.emit_print_function(self.struct, self.group) + +- def enum_prefix(self, name): +- return +- + def emit_enum(self): + e_name = enum_name(self.enum) + prefix = e_name if self.enum != 'Format' else global_prefix +diff --git a/src/panfrost/lib/genxml/meson.build b/src/panfrost/lib/genxml/meson.build +index 61041168ab0..191a970ff63 100644 +--- a/src/panfrost/lib/genxml/meson.build ++++ b/src/panfrost/lib/genxml/meson.build +@@ -20,7 +20,7 @@ + # SOFTWARE. 
+ + pan_packers = [] +-foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9'] ++foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10'] + pan_packers += custom_target( + packer + '_pack.h', + input : ['gen_pack.py', packer + '.xml'], +@@ -37,7 +37,7 @@ idep_pan_packers = declare_dependency( + + libpanfrost_decode_per_arch = [] + +-foreach ver : ['4', '5', '6', '7', '9'] ++foreach ver : ['4', '5', '6', '7', '9', '10'] + libpanfrost_decode_per_arch += static_library( + 'pandecode-arch-v' + ver, + ['decode.c', pan_packers], +diff --git a/src/panfrost/lib/genxml/v10.xml b/src/panfrost/lib/genxml/v10.xml +new file mode 100644 +index 00000000000..d1f104f4e62 +--- /dev/null ++++ b/src/panfrost/lib/genxml/v10.xml +@@ -0,0 +1,1668 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ 
++ [v10.xml: ~1,668 lines of v10 (CSF) descriptor definitions; the XML element content is not legible here]
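For a descriptor declared with layout="cs" in v10.xml, the gen_pack.py changes above emit a _pack_cs() helper (and, for instruction-style descriptors, _pack_ins()) that writes fields into a pan_command_stream instead of byte-packing a fixed-size opaque buffer; unlike the "struct" layout, no ..._packed struct or ..._LENGTH define is printed for this path. A minimal sketch of the generated shape, using a hypothetical MALI_EXAMPLE descriptor name (illustrative only; the real names come from v10.xml):

static inline void
MALI_EXAMPLE_pack_cs(pan_command_stream * restrict s,
                     const struct MALI_EXAMPLE * restrict values)
{
   /* Sketch only: the real body is emitted by group.emit_pack_function(csf=True)
    * and performs one command-stream write per field rather than packing the
    * fields into an opaque uint32_t array. */
}

Callers pair these generated helpers with the pan_pack_ins()/pan_section_pack_cs_v10() wrappers used later in this patch (see the FRAGMENT_JOB payload and FRAGMENT_LAUNCH emission in pan_cs.c).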
++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/src/panfrost/lib/genxml/v4.xml b/src/panfrost/lib/genxml/v4.xml +index b72fc3e28ef..a4ee54c2bac 100644 +--- a/src/panfrost/lib/genxml/v4.xml ++++ b/src/panfrost/lib/genxml/v4.xml +@@ -446,7 +446,7 @@ + + + +- ++ + + + +diff --git a/src/panfrost/lib/genxml/v5.xml b/src/panfrost/lib/genxml/v5.xml +index f9fc44e89f3..2feb8909609 100644 +--- a/src/panfrost/lib/genxml/v5.xml ++++ b/src/panfrost/lib/genxml/v5.xml +@@ -467,7 +467,7 @@ + + + +- ++ + + + +diff --git a/src/panfrost/lib/genxml/v6.xml b/src/panfrost/lib/genxml/v6.xml +index 042f1e694d4..321ab524eaf 100644 +--- a/src/panfrost/lib/genxml/v6.xml ++++ b/src/panfrost/lib/genxml/v6.xml +@@ -467,7 +467,7 @@ + + + +- ++ + + + +@@ -689,7 +689,7 @@ + + + +- ++ + + + +@@ -708,7 +708,7 @@ + + + +- ++ + + + +@@ -717,7 +717,7 @@ + + + +- ++ + + + +diff --git a/src/panfrost/lib/genxml/v7.xml b/src/panfrost/lib/genxml/v7.xml +index 3440ee70613..b084ef6b3bf 100644 +--- a/src/panfrost/lib/genxml/v7.xml ++++ b/src/panfrost/lib/genxml/v7.xml +@@ -512,7 +512,7 @@ + + + +- ++ + + + +@@ -754,7 +754,7 @@ + + + +- ++ + + + +@@ -773,7 +773,7 @@ + + + +- ++ + + + +@@ -782,7 +782,7 @@ + + + +- ++ + + + +@@ -846,13 +846,13 @@ + + + +- ++ + + + + + +- ++ + + + +diff --git a/src/panfrost/lib/genxml/v9.xml b/src/panfrost/lib/genxml/v9.xml +index 43d461077d6..b650bb2002a 100644 +--- a/src/panfrost/lib/genxml/v9.xml ++++ b/src/panfrost/lib/genxml/v9.xml +@@ -526,7 +526,7 @@ + + + +- ++ + + + +@@ -599,12 +599,6 @@ + + + +- +- +- +- +- +- + + + +@@ -612,10 +606,10 @@ + + + +- +- ++ ++ + +- ++ + + + +@@ -1309,28 +1303,28 @@ + + + +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + + + +- +- +- +- +- +- +- +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1340,9 +1334,9 @@ + + + +- +- +- ++ ++ ++ + + + +@@ -1363,8 +1357,8 @@ + + + +- +- ++ ++ + + + +@@ -1374,6 +1368,7 @@ + + + ++ + + + +@@ -1391,7 +1386,7 @@ + + + +- ++ + + + +@@ -1407,24 +1402,24 @@ + + + +- +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ ++ + +- ++ + + + +- ++ + + + +- +- ++ ++ + + + +diff --git a/src/panfrost/lib/meson.build b/src/panfrost/lib/meson.build +index f8c34cb5a82..31dab70e304 100644 +--- a/src/panfrost/lib/meson.build ++++ b/src/panfrost/lib/meson.build +@@ -39,7 +39,7 @@ endforeach + + libpanfrost_per_arch = [] + +-foreach ver : ['4', '5', '6', '7', '9'] ++foreach ver : ['4', '5', '6', '7', '9', '10'] + libpanfrost_per_arch += static_library( + 'pan-arch-v' + ver, + [ +@@ -93,7 +93,7 @@ libpanfrost_lib = static_library( + include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw], + c_args : [no_override_init_args], + gnu_symbol_visibility : 'hidden', +- dependencies: [dep_libdrm, idep_nir], ++ dependencies: [dep_libdrm, idep_nir, libpanfrost_base_dep], + build_by_default : false, + link_with: [libpanfrost_pixel_format, libpanfrost_per_arch], + ) +diff --git a/src/panfrost/lib/pan_afbc.c b/src/panfrost/lib/pan_afbc.c +index 7a524e53f66..31d9612b9e7 100644 +--- a/src/panfrost/lib/pan_afbc.c ++++ b/src/panfrost/lib/pan_afbc.c +@@ -125,10 +125,6 @@ panfrost_afbc_format(unsigned arch, enum pipe_format format) + */ + format = 
util_format_linear(format); + +- /* Don't allow swizzled formats on v7+ */ +- if (arch >= 7 && format != unswizzled_format(format)) +- return PIPE_FORMAT_NONE; +- + /* Otherwise swizzling doesn't affect AFBC */ + format = unswizzled_format(format); + +@@ -189,3 +185,12 @@ panfrost_afbc_can_tile(const struct panfrost_device *dev) + { + return (dev->arch >= 7); + } ++ ++/* ++ * Can this format only be used with AFBC_FORMAT_MOD_NATIVE_SWIZZLE? ++ */ ++bool ++panfrost_afbc_only_native(unsigned arch, enum pipe_format format) ++{ ++ return (arch >= 7 && format != unswizzled_format(format)); ++} +diff --git a/src/panfrost/lib/pan_blend.c b/src/panfrost/lib/pan_blend.c +index f6e6bf671b0..b8b84ca7f8d 100644 +--- a/src/panfrost/lib/pan_blend.c ++++ b/src/panfrost/lib/pan_blend.c +@@ -800,7 +800,7 @@ GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev, + }; + + /* Blend shaders should only be used for blending on Bifrost onwards */ +- assert(dev->arch <= 5 || !pan_blend_is_opaque(state->rts[rt].equation)); ++ assert(dev->arch <= 5 || state->logicop_enable || !pan_blend_is_opaque(state->rts[rt].equation)); + assert(state->rts[rt].equation.color_mask != 0); + + struct hash_entry *he = _mesa_hash_table_search(dev->blend_shaders.shaders, &key); +diff --git a/src/panfrost/lib/pan_blitter.c b/src/panfrost/lib/pan_blitter.c +index e2e2342b5e4..e6b0e2ce333 100644 +--- a/src/panfrost/lib/pan_blitter.c ++++ b/src/panfrost/lib/pan_blitter.c +@@ -1150,7 +1150,7 @@ pan_preload_emit_dcd(struct pan_pool *pool, + blend.cpu); + } + +- pan_pack(out, DRAW, cfg) { ++ pan_pack(out, DRAW_NO_CS, cfg) { + if (zs) { + /* ZS_EMIT requires late update/kill */ + cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; +@@ -1225,7 +1225,7 @@ pan_preload_fb_alloc_pre_post_dcds(struct pan_pool *desc_pool, + return; + + fb->bifrost.pre_post.dcds = +- pan_pool_alloc_desc_array(desc_pool, 3, DRAW); ++ pan_pool_alloc_desc_array(desc_pool, 3, DRAW_NO_CS); + } + + static void +@@ -1237,7 +1237,7 @@ pan_preload_emit_pre_frame_dcd(struct pan_pool *desc_pool, + pan_preload_fb_alloc_pre_post_dcds(desc_pool, fb); + assert(fb->bifrost.pre_post.dcds.cpu); + void *dcd = fb->bifrost.pre_post.dcds.cpu + +- (dcd_idx * pan_size(DRAW)); ++ (dcd_idx * pan_size(DRAW_NO_CS)); + + /* We only use crc_rt to determine whether to force writes for updating + * the CRCs, so use a conservative tile size (16x16). +diff --git a/src/panfrost/lib/pan_bo.c b/src/panfrost/lib/pan_bo.c +index b606d1b0359..708fd38354a 100644 +--- a/src/panfrost/lib/pan_bo.c ++++ b/src/panfrost/lib/pan_bo.c +@@ -39,6 +39,7 @@ + + #include "util/u_inlines.h" + #include "util/u_math.h" ++#include "util/os_file.h" + + /* This file implements a userspace BO cache. Allocating and freeing + * GPU-visible buffers is very expensive, and even the extra kernel roundtrips +@@ -71,7 +72,38 @@ panfrost_bo_alloc(struct panfrost_device *dev, size_t size, + create_bo.flags |= PANFROST_BO_NOEXEC; + } + +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); ++ void *cpu = NULL; ++ ++ bool cached = false; ++ ++ if (dev->kbase) { ++ if (flags & PAN_BO_CACHEABLE) { ++ if (!(dev->debug & PAN_DBG_UNCACHED_CPU)) { ++ create_bo.flags |= MALI_BO_CACHED_CPU; ++ /* TODO: What if kbase decides not to cache it? */ ++ cached = true; ++ } ++ if (dev->debug & PAN_DBG_UNCACHED_GPU) ++ create_bo.flags |= MALI_BO_UNCACHED_GPU; ++ } ++ ++ unsigned mali_flags = (flags & PAN_BO_EVENT) ? 
0x8200f : 0; ++ ++ struct base_ptr p = dev->mali.alloc(&dev->mali, size, create_bo.flags, mali_flags); ++ ++ if (p.gpu) { ++ cpu = p.cpu; ++ create_bo.offset = p.gpu; ++ create_bo.handle = kbase_alloc_gem_handle(&dev->mali, p.gpu, -1); ++ if (!cpu) ++ abort(); ++ ret = 0; ++ } else { ++ ret = -1; ++ } ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); ++ } + if (ret) { + fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n"); + return NULL; +@@ -82,29 +114,99 @@ panfrost_bo_alloc(struct panfrost_device *dev, size_t size, + + bo->size = create_bo.size; + bo->ptr.gpu = create_bo.offset; ++ bo->ptr.cpu = cpu; ++ if ((uintptr_t) bo->ptr.cpu != bo->ptr.gpu) ++ bo->free_ioctl = true; + bo->gem_handle = create_bo.handle; + bo->flags = flags; + bo->dev = dev; + bo->label = label; ++ bo->cached = cached; ++ bo->dmabuf_fd = -1; + return bo; + } + + static void + panfrost_bo_free(struct panfrost_bo *bo) + { ++ struct panfrost_device *dev = bo->dev; + struct drm_gem_close gem_close = { .handle = bo->gem_handle }; + int ret; + +- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li memfree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ if (dev->kbase) { ++ os_munmap(bo->ptr.cpu, bo->size); ++ if (bo->munmap_ptr) ++ os_munmap(bo->munmap_ptr, bo->size); ++ if (bo->free_ioctl) ++ dev->mali.free(&dev->mali, bo->ptr.gpu); ++ kbase_free_gem_handle(&dev->mali, bo->gem_handle); ++ ret = 0; ++ } else { ++ ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); ++ } + if (ret) { + fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n"); + assert(0); + } + +- /* BO will be freed with the sparse array, but zero to indicate free */ ++ /* BO will be freed with the stable_array, but zero to indicate free */ + memset(bo, 0, sizeof(*bo)); + } + ++static bool ++panfrost_bo_usage_finished(struct panfrost_bo *bo, bool readers) ++{ ++ struct panfrost_device *dev = bo->dev; ++ kbase k = &dev->mali; ++ ++ bool ret = true; ++ ++ pthread_mutex_lock(&dev->bo_usage_lock); ++ pthread_mutex_lock(&dev->mali.queue_lock); ++ ++ util_dynarray_foreach(&bo->usage, struct panfrost_usage, u) { ++ /* Skip if we are only waiting for writers */ ++ if (!u->write && !readers) ++ continue; ++ ++ /* Usages are ordered, so everything else is also invalid */ ++ if (u->queue >= k->event_slot_usage) ++ break; ++ ++ struct kbase_event_slot *slot = &k->event_slots[u->queue]; ++ uint64_t seqnum = u->seqnum; ++ ++ /* There is a race condition, where we can depend on an ++ * unsubmitted batch. In that cade, decrease the seqnum. ++ * Otherwise, skip invalid dependencies. TODO: do GC? */ ++ if (slot->last_submit == seqnum) ++ --seqnum; ++ else if (slot->last_submit < seqnum) ++ continue; ++ ++ if (slot->last <= seqnum) { ++ ret = false; ++ break; ++ } ++ } ++ ++ pthread_mutex_unlock(&dev->mali.queue_lock); ++ pthread_mutex_unlock(&dev->bo_usage_lock); ++ ++ return ret; ++} ++ + /* Returns true if the BO is ready, false otherwise. + * access_type is encoding the type of access one wants to ensure is done. 
+ * Waiting is always done for writers, but if wait_readers is set then readers +@@ -113,12 +215,15 @@ panfrost_bo_free(struct panfrost_bo *bo) + bool + panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + { ++ struct panfrost_device *dev = bo->dev; + struct drm_panfrost_wait_bo req = { + .handle = bo->gem_handle, + .timeout_ns = timeout_ns, + }; + int ret; + ++ /* TODO: With driver-handled sync, is gpu_access even worth it? */ ++ + /* If the BO has been exported or imported we can't rely on the cached + * state, we need to call the WAIT_BO ioctl. + */ +@@ -134,10 +239,31 @@ panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + return true; + } + ++ if (dev->kbase && (dev->arch >= 10)) { ++ struct kbase_wait_ctx wait = kbase_wait_init(&dev->mali, timeout_ns); ++ while (kbase_wait_for_event(&wait)) { ++ if (panfrost_bo_usage_finished(bo, wait_readers)) ++ break; ++ } ++ kbase_wait_fini(wait); ++ ++ bool ret = panfrost_bo_usage_finished(bo, wait_readers); ++ if (bo->flags & PAN_BO_SHARED) ++ ret &= kbase_poll_fd_until(bo->dmabuf_fd, wait_readers, wait.until); ++ ++ if (ret) ++ bo->gpu_access &= (wait_readers ? 0 : PAN_BO_ACCESS_READ); ++ return ret; ++ } ++ + /* The ioctl returns >= 0 value when the BO we are waiting for is ready + * -1 otherwise. + */ +- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); ++ if (dev->kbase) ++ ret = kbase_wait_bo(&dev->mali, bo->gem_handle, timeout_ns, ++ wait_readers); ++ else ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); + if (ret != -1) { + /* Set gpu_access to 0 so that the next call to bo_wait() + * doesn't have to call the WAIT_BO ioctl. +@@ -153,6 +279,32 @@ panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + return false; + } + ++static void ++panfrost_bo_mem_op(struct panfrost_bo *bo, size_t offset, size_t length, bool invalidate) ++{ ++ struct panfrost_device *dev = bo->dev; ++ ++ assert(offset + length <= bo->size); ++ ++ if (!bo->cached) ++ return; ++ ++ dev->mali.mem_sync(&dev->mali, bo->ptr.gpu, bo->ptr.cpu + offset, length, ++ invalidate); ++} ++ ++void ++panfrost_bo_mem_invalidate(struct panfrost_bo *bo, size_t offset, size_t length) ++{ ++ panfrost_bo_mem_op(bo, offset, length, true); ++} ++ ++void ++panfrost_bo_mem_clean(struct panfrost_bo *bo, size_t offset, size_t length) ++{ ++ panfrost_bo_mem_op(bo, offset, length, false); ++} ++ + /* Helper to calculate the bucket index of a BO */ + + static unsigned +@@ -200,21 +352,31 @@ panfrost_bo_cache_fetch(struct panfrost_device *dev, + + /* If the oldest BO in the cache is busy, likely so is + * everything newer, so bail. */ +- if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, +- PAN_BO_ACCESS_RW)) +- break; ++ ++ /* For kbase, BOs are not added to the cache until the GPU is ++ * done with them, so there is no need to wait. */ ++ if (!dev->kbase) { ++ if (!panfrost_bo_wait(entry, dontwait ? 
0 : INT64_MAX, ++ PAN_BO_ACCESS_RW)) ++ break; ++ } + + struct drm_panfrost_madvise madv = { + .handle = entry->gem_handle, + .madv = PANFROST_MADV_WILLNEED, + }; +- int ret; ++ int ret = 0; + + /* This one works, splice it out of the cache */ + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ if (dev->kbase) { ++ /* With kbase, BOs are never freed from the cache */ ++ madv.retained = true; ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ } + if (!ret && !madv.retained) { + panfrost_bo_free(entry); + continue; +@@ -276,7 +438,10 @@ panfrost_bo_cache_put(struct panfrost_bo *bo) + madv.madv = PANFROST_MADV_DONTNEED; + madv.retained = 0; + +- drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ // TODO: Allow freeing madvise'd BOs with kbase... not that it really ++ // matters for boards with 16 GB RAM ++ if (!dev->kbase) ++ drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); + + /* Add us to the bucket */ + list_addtail(&bo->bucket_link, bucket); +@@ -286,6 +451,10 @@ panfrost_bo_cache_put(struct panfrost_bo *bo) + clock_gettime(CLOCK_MONOTONIC, &time); + bo->last_used = time.tv_sec; + ++ /* For kbase, the GPU can't be accessing this BO any more */ ++ if (dev->kbase) ++ bo->gpu_access = 0; ++ + /* Let's do some cleanup in the BO cache while we hold the + * lock. + */ +@@ -352,10 +521,15 @@ panfrost_bo_mmap(struct panfrost_bo *bo) + static void + panfrost_bo_munmap(struct panfrost_bo *bo) + { ++ /* We can't munmap BOs when using kbase, as that frees the storage and ++ * the GPU might still be using the BO. */ ++ if (bo->dev->kbase) ++ return; ++ + if (!bo->ptr.cpu) + return; + +- if (os_munmap((void *) (uintptr_t)bo->ptr.cpu, bo->size)) { ++ if (os_munmap(bo->ptr.cpu, bo->size)) { + perror("munmap"); + abort(); + } +@@ -390,8 +564,15 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + if (!bo) + bo = panfrost_bo_cache_fetch(dev, size, flags, label, false); + if (!bo) { +- panfrost_bo_cache_evict_all(dev); +- bo = panfrost_bo_alloc(dev, size, flags, label); ++ for (unsigned i = 0; i < 5; ++i) { ++ usleep(20 * 1000 * i * i); ++ if (dev->kbase) ++ kbase_ensure_handle_events(&dev->mali); ++ panfrost_bo_cache_evict_all(dev); ++ bo = panfrost_bo_alloc(dev, size, flags, label); ++ if (bo) ++ break; ++ } + } + + if (!bo) { +@@ -406,8 +587,15 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP))) + panfrost_bo_mmap(bo); + ++ if ((dev->debug & PAN_DBG_BO_CLEAR) && !(flags & PAN_BO_INVISIBLE)) { ++ memset(bo->ptr.cpu, 0, bo->size); ++ panfrost_bo_mem_clean(bo, 0, bo->size); ++ } ++ + p_atomic_set(&bo->refcnt, 1); + ++ util_dynarray_init(&bo->usage, NULL); ++ + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + if (flags & PAN_BO_INVISIBLE) + pandecode_inject_mmap(bo->ptr.gpu, NULL, bo->size, NULL); +@@ -415,6 +603,14 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + pandecode_inject_mmap(bo->ptr.gpu, bo->ptr.cpu, bo->size, NULL); + } + ++ if (dev->bo_log) { ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li alloc %"PRIx64" to %"PRIx64" size %zu label %s\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label); ++ fflush(NULL); ++ } ++ + return bo; + } + +@@ -427,6 +623,60 @@ panfrost_bo_reference(struct panfrost_bo *bo) + } + } + ++static void ++panfrost_bo_fini(struct panfrost_bo *bo) ++{ ++ 
struct panfrost_device *dev = bo->dev; ++ ++ /* When the reference count goes to zero, we need to cleanup */ ++ panfrost_bo_munmap(bo); ++ ++ if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) ++ pandecode_inject_free(bo->ptr.gpu, bo->size); ++ ++ /* Rather than freeing the BO now, we'll cache the BO for later ++ * allocations if we're allowed to. ++ */ ++ if (!panfrost_bo_cache_put(bo)) ++ panfrost_bo_free(bo); ++} ++ ++static void ++panfrost_bo_free_gpu(void *data) ++{ ++ struct panfrost_bo *bo = data; ++ struct panfrost_device *dev = bo->dev; ++ ++ /* Don't free if there are still references */ ++ if (p_atomic_dec_return(&bo->gpu_refcnt)) ++ return; ++ ++ pthread_mutex_lock(&dev->bo_map_lock); ++ ++ /* Someone might have imported this BO while we were waiting for the ++ * lock, let's make sure it's still not referenced before freeing it. ++ */ ++ if (p_atomic_read(&bo->refcnt) != 0) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } ++ ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li gpufree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ panfrost_bo_fini(bo); ++ ++ pthread_mutex_unlock(&dev->bo_map_lock); ++} ++ + void + panfrost_bo_unreference(struct panfrost_bo *bo) + { +@@ -439,25 +689,57 @@ panfrost_bo_unreference(struct panfrost_bo *bo) + + struct panfrost_device *dev = bo->dev; + ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li free %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ + pthread_mutex_lock(&dev->bo_map_lock); + + /* Someone might have imported this BO while we were waiting for the + * lock, let's make sure it's still not referenced before freeing it. + */ +- if (p_atomic_read(&bo->refcnt) == 0) { +- /* When the reference count goes to zero, we need to cleanup */ +- panfrost_bo_munmap(bo); ++ if (p_atomic_read(&bo->refcnt) != 0) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } + +- if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) +- pandecode_inject_free(bo->ptr.gpu, bo->size); ++ util_dynarray_fini(&bo->usage); + +- /* Rather than freeing the BO now, we'll cache the BO for later +- * allocations if we're allowed to. ++ if (dev->kbase) { ++ /* Assume that all queues are using this BO, and so free the ++ * BO only after all currently-submitted jobs have finished. ++ * This could eventually be optimised to only wait on a subset ++ * of queues. 
+ */ +- if (!panfrost_bo_cache_put(bo)) +- panfrost_bo_free(bo); ++ bool added = dev->mali.callback_all_queues(&dev->mali, ++ &bo->gpu_refcnt, panfrost_bo_free_gpu, bo); + ++ if (added) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } + } ++ ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li immfree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ panfrost_bo_fini(bo); ++ + pthread_mutex_unlock(&dev->bo_map_lock); + } + +@@ -467,22 +749,42 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + struct panfrost_bo *bo; + struct drm_panfrost_get_bo_offset get_bo_offset = {0,}; + ASSERTED int ret; ++ kbase_handle handle = { .fd = -1 }; + unsigned gem_handle; + +- ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); +- assert(!ret); ++ if (dev->kbase) { ++ gem_handle = dev->mali.import_dmabuf(&dev->mali, fd); ++ if (gem_handle == -1) ++ return NULL; ++ } else { ++ ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); ++ assert(!ret); ++ } + + pthread_mutex_lock(&dev->bo_map_lock); + bo = pan_lookup_bo(dev, gem_handle); + ++ bool found = false; ++ + if (!bo->dev) { + get_bo_offset.handle = gem_handle; +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); +- assert(!ret); ++ if (dev->kbase) { ++ handle = kbase_gem_handle_get(&dev->mali, gem_handle); ++ get_bo_offset.offset = handle.va; ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); ++ assert(!ret); ++ } + + bo->dev = dev; +- bo->ptr.gpu = (mali_ptr) get_bo_offset.offset; + bo->size = lseek(fd, 0, SEEK_END); ++ bo->ptr.gpu = (mali_ptr) get_bo_offset.offset; ++ if (dev->kbase && (sizeof(void *) > 4 || get_bo_offset.offset < (1LL << 32))) { ++ bo->ptr.cpu = (void *)(uintptr_t) get_bo_offset.offset; ++ } else if (dev->kbase) { ++ bo->ptr.cpu = dev->mali.mmap_import(&dev->mali, bo->ptr.gpu, bo->size); ++ bo->free_ioctl = true; ++ } + /* Sometimes this can fail and return -1. size of -1 is not + * a nice thing for mmap to try mmap. Be more robust also + * for zero sized maps and fail nicely too +@@ -493,8 +795,21 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + } + bo->flags = PAN_BO_SHARED; + bo->gem_handle = gem_handle; ++ util_dynarray_init(&bo->usage, NULL); ++ if (dev->kbase) { ++ /* kbase always maps dma-bufs with caching */ ++ bo->cached = true; ++ ++ /* Importing duplicates the FD, so we cache the FD ++ * from the handle */ ++ bo->dmabuf_fd = handle.fd; ++ } else { ++ bo->dmabuf_fd = -1; ++ } + p_atomic_set(&bo->refcnt, 1); + } else { ++ found = true; ++ + /* bo->refcnt == 0 can happen if the BO + * was being released but panfrost_bo_import() acquired the + * lock before panfrost_bo_unreference(). 
In that case, refcnt +@@ -512,12 +827,34 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + } + pthread_mutex_unlock(&dev->bo_map_lock); + ++ if (dev->bo_log) { ++ int new_fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li import %"PRIx64" to %"PRIx64" size %zu fd %i new %i handle %i found %i\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, ++ fd, new_fd, gem_handle, found); ++ fflush(NULL); ++ } ++ + return bo; + } + + int + panfrost_bo_export(struct panfrost_bo *bo) + { ++ struct panfrost_device *dev = bo->dev; ++ ++ if (bo->dmabuf_fd != -1) { ++ assert(bo->flags & PAN_BO_SHARED); ++ ++ return os_dupfd_cloexec(bo->dmabuf_fd); ++ } ++ ++ if (dev->kbase) ++ return -1; ++ + struct drm_prime_handle args = { + .handle = bo->gem_handle, + .flags = DRM_CLOEXEC, +diff --git a/src/panfrost/lib/pan_bo.h b/src/panfrost/lib/pan_bo.h +index 7d19fba9dfc..1b817191734 100644 +--- a/src/panfrost/lib/pan_bo.h ++++ b/src/panfrost/lib/pan_bo.h +@@ -27,6 +27,7 @@ + #define __PAN_BO_H__ + + #include "util/list.h" ++#include "util/u_dynarray.h" + #include "panfrost-job.h" + #include + +@@ -50,6 +51,12 @@ + * cached locally */ + #define PAN_BO_SHARED (1 << 4) + ++/* Use event memory, required for CSF events to be signaled to the kernel */ ++#define PAN_BO_EVENT (1 << 5) ++ ++/* Use the caching policy for resource BOs */ ++#define PAN_BO_CACHEABLE (1 << 6) ++ + /* GPU access flags */ + + /* BO is either shared (can be accessed by more than one GPU batch) or private +@@ -80,6 +87,12 @@ struct panfrost_ptr { + mali_ptr gpu; + }; + ++struct panfrost_usage { ++ uint32_t queue; ++ bool write; ++ uint64_t seqnum; ++}; ++ + struct panfrost_bo { + /* Must be first for casting */ + struct list_head bucket_link; +@@ -95,11 +108,16 @@ struct panfrost_bo { + /* Atomic reference count */ + int32_t refcnt; + ++ /* Reference count for GPU jobs */ ++ int32_t gpu_refcnt; ++ + struct panfrost_device *dev; + + /* Mapping for the entire object (all levels) */ + struct panfrost_ptr ptr; + ++ struct util_dynarray usage; ++ + /* Size of all entire trees */ + size_t size; + +@@ -115,11 +133,31 @@ struct panfrost_bo { + + /* Human readable description of the BO for debugging. */ + const char *label; ++ ++ /* Sometimes we don't access the BO through kbase's mapping of the ++ * memory, in that case we need to save the pointer to pass to ++ * munmap to avoid leaking memory. */ ++ void *munmap_ptr; ++ ++ /* For 32-bit applications we may not even be able to that, because ++ * the VA may be too high for kbase to map to an equivalent CPU ++ * address, in which case we must use the memory free icotl. */ ++ bool free_ioctl; ++ ++ /* Is the BO cached CPU-side? 
*/ ++ bool cached; ++ ++ /* File descriptor for the dma-buf */ ++ int dmabuf_fd; + }; + + bool + panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers); + void ++panfrost_bo_mem_invalidate(struct panfrost_bo *bo, size_t offset, size_t length); ++void ++panfrost_bo_mem_clean(struct panfrost_bo *bo, size_t offset, size_t length); ++void + panfrost_bo_reference(struct panfrost_bo *bo); + void + panfrost_bo_unreference(struct panfrost_bo *bo); +diff --git a/src/panfrost/lib/pan_cs.c b/src/panfrost/lib/pan_cs.c +index 986eb1e158d..7b24ec6586a 100644 +--- a/src/panfrost/lib/pan_cs.c ++++ b/src/panfrost/lib/pan_cs.c +@@ -282,9 +282,15 @@ pan_prepare_crc(const struct pan_fb_info *fb, int rt_crc, + ext->crc_render_target = rt_crc; + + if (fb->rts[rt_crc].clear) { ++#if PAN_ARCH < 10 ++ // todo v10 + uint32_t clear_val = fb->rts[rt_crc].clear_value[0]; + ext->crc_clear_color = clear_val | 0xc000000000000000 | + (((uint64_t)clear_val & 0xffff) << 32); ++#else ++ // TODO: Is this correct? ++ ext->crc_unk = 0x1f; ++#endif + } + #endif + } +@@ -420,7 +426,8 @@ pan_rt_init_format(const struct pan_image_view *rt, + cfg->swizzle = panfrost_translate_swizzle_4(swizzle); + } + +-#if PAN_ARCH >= 9 ++/* Don't define for later gens as this is not a GENX function */ ++#if PAN_ARCH == 9 + enum mali_afbc_compression_mode + pan_afbc_compression_mode(enum pipe_format format) + { +@@ -438,14 +445,21 @@ pan_afbc_compression_mode(enum pipe_format format) + case PIPE_FORMAT_R8_UNORM: return MALI_AFBC_COMPRESSION_MODE_R8; + case PIPE_FORMAT_R8G8_UNORM: return MALI_AFBC_COMPRESSION_MODE_R8G8; + case PIPE_FORMAT_R5G6B5_UNORM: return MALI_AFBC_COMPRESSION_MODE_R5G6B5; ++ case PIPE_FORMAT_R5G5B5A1_UNORM: return MALI_AFBC_COMPRESSION_MODE_R5G5B5A1; + case PIPE_FORMAT_R4G4B4A4_UNORM: return MALI_AFBC_COMPRESSION_MODE_R4G4B4A4; + case PIPE_FORMAT_R8G8B8_UNORM: return MALI_AFBC_COMPRESSION_MODE_R8G8B8; + case PIPE_FORMAT_R8G8B8A8_UNORM: return MALI_AFBC_COMPRESSION_MODE_R8G8B8A8; + case PIPE_FORMAT_R10G10B10A2_UNORM: return MALI_AFBC_COMPRESSION_MODE_R10G10B10A2; + case PIPE_FORMAT_R11G11B10_FLOAT: return MALI_AFBC_COMPRESSION_MODE_R11G11B10; + case PIPE_FORMAT_S8_UINT: return MALI_AFBC_COMPRESSION_MODE_S8; +- case PIPE_FORMAT_NONE: unreachable("invalid format for AFBC"); +- default: unreachable("unknown canonical AFBC format"); ++ case PIPE_FORMAT_NONE: ++ fprintf(stderr, "invalid format for AFBC: %s\n", util_format_name(format)); ++ fflush(NULL); ++ abort(); ++ default: ++ fprintf(stderr, "unknown canonical AFBC format: %s\n", util_format_name(format)); ++ fflush(NULL); ++ abort(); + } + } + #endif +@@ -558,6 +572,7 @@ GENX(pan_emit_tls)(const struct pan_tls_info *info, + */ + cfg.tls_address_mode = MALI_ADDRESS_MODE_PACKED; + ++ /* The shift is only used for packed mode */ + assert((info->tls.ptr & 4095) == 0); + cfg.tls_base_pointer = info->tls.ptr >> 8; + #else +@@ -731,6 +746,9 @@ GENX(pan_emit_fbd)(const struct panfrost_device *dev, + #if PAN_ARCH >= 6 + bool force_clean_write = pan_force_clean_write(fb, tile_size); + ++#if PAN_ARCH >= 9 ++ cfg.frame_argument = 0x10000; ++#endif + cfg.sample_locations = + panfrost_sample_positions(dev, pan_sample_pattern(fb->nr_samples)); + cfg.pre_frame_0 = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[0], force_clean_write); +@@ -940,7 +958,7 @@ GENX(pan_emit_tiler_heap)(const struct panfrost_device *dev, + pan_pack(out, TILER_HEAP, heap) { + heap.size = dev->tiler_heap->size; + heap.base = dev->tiler_heap->ptr.gpu; +- heap.bottom = 
dev->tiler_heap->ptr.gpu; ++ heap.bottom = dev->tiler_heap->ptr.gpu + 64; + heap.top = dev->tiler_heap->ptr.gpu + dev->tiler_heap->size; + } + } +@@ -951,30 +969,39 @@ GENX(pan_emit_tiler_ctx)(const struct panfrost_device *dev, + unsigned nr_samples, + bool first_provoking_vertex, + mali_ptr heap, ++ mali_ptr scratch, + void *out) + { + unsigned max_levels = dev->tiler_features.max_levels; + assert(max_levels >= 2); + + pan_pack(out, TILER_CONTEXT, tiler) { +- /* TODO: Select hierarchy mask more effectively */ +- tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28; +- +- /* For large framebuffers, disable the smallest bin size to +- * avoid pathological tiler memory usage. Required to avoid OOM +- * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on +- * Mali-G57. ++ /* TODO: Select hierarchy mask more effectively. */ ++ ++ /* Disable the smallest hierarchy level. This is required to ++ * use 32x32 tiles on v10, and helps reduce tiler heap memory ++ * usage for other GPUs. The rasteriser can efficiently skip ++ * primitives not entering the current quadrant of a tile, so ++ * this should not hurt performance much. ++ * Even for GPUs earlier than v10, cores get fed tiles in ++ * 32x32 pixel blocks, so making all of the tiles use the same ++ * set of primitive lists could help with performance. ++ * Maybe then v10 should disable two levels? + */ +- if (MAX2(fb_width, fb_height) >= 4096) +- tiler.hierarchy_mask &= ~1; ++ tiler.hierarchy_mask = (max_levels >= 8) ? 0xFE : 0x28; + + tiler.fb_width = fb_width; + tiler.fb_height = fb_height; + tiler.heap = heap; ++#if PAN_ARCH >= 10 ++ tiler.scratch = scratch; ++#endif + tiler.sample_pattern = pan_sample_pattern(nr_samples); + #if PAN_ARCH >= 9 + tiler.first_provoking_vertex = first_provoking_vertex; + #endif ++ tiler.state.word1 = 31; ++ tiler.state.word3 = 0x10000000; + } + } + #endif +@@ -984,24 +1011,43 @@ GENX(pan_emit_fragment_job)(const struct pan_fb_info *fb, + mali_ptr fbd, + void *out) + { ++#if PAN_ARCH < 10 + pan_section_pack(out, FRAGMENT_JOB, HEADER, header) { + header.type = MALI_JOB_TYPE_FRAGMENT; + header.index = 1; + } ++#endif + +- pan_section_pack(out, FRAGMENT_JOB, PAYLOAD, payload) { +- payload.bound_min_x = fb->extent.minx >> MALI_TILE_SHIFT; +- payload.bound_min_y = fb->extent.miny >> MALI_TILE_SHIFT; +- payload.bound_max_x = fb->extent.maxx >> MALI_TILE_SHIFT; +- payload.bound_max_y = fb->extent.maxy >> MALI_TILE_SHIFT; ++#if PAN_ARCH < 10 ++#define BOUND_SHIFT MALI_TILE_SHIFT ++#else ++#define BOUND_SHIFT 0 ++#endif ++ ++ pan_section_pack_cs_v10(out, fb->cs_fragment, FRAGMENT_JOB, PAYLOAD, payload) { ++ payload.bound_min_x = fb->extent.minx >> BOUND_SHIFT; ++ payload.bound_min_y = fb->extent.miny >> BOUND_SHIFT; ++ payload.bound_max_x = fb->extent.maxx >> BOUND_SHIFT; ++ payload.bound_max_y = fb->extent.maxy >> BOUND_SHIFT; + payload.framebuffer = fbd; + + #if PAN_ARCH >= 5 + if (fb->tile_map.base) { ++#if PAN_ARCH < 0 + payload.has_tile_enable_map = true; ++#endif + payload.tile_enable_map = fb->tile_map.base; + payload.tile_enable_map_row_stride = fb->tile_map.stride; + } ++#else ++ assert(!fb->tile_map.base); + #endif + } ++ ++#if PAN_ARCH >= 10 ++ /* TODO: Do this here? 
*/ ++ pan_pack_ins(fb->cs_fragment, FRAGMENT_LAUNCH, launch) { ++ launch.has_tile_enable_map = !!fb->tile_map.base; ++ } ++#endif + } +diff --git a/src/panfrost/lib/pan_cs.h b/src/panfrost/lib/pan_cs.h +index 8186102e5c0..5c5e29cb6d6 100644 +--- a/src/panfrost/lib/pan_cs.h ++++ b/src/panfrost/lib/pan_cs.h +@@ -121,6 +121,8 @@ struct pan_fb_info { + /* Only used on Valhall */ + bool sprite_coord_origin; + bool first_provoking_vertex; ++ ++ pan_command_stream *cs_fragment; + }; + + static inline unsigned +@@ -171,7 +173,7 @@ void + GENX(pan_emit_tiler_ctx)(const struct panfrost_device *dev, + unsigned fb_width, unsigned fb_height, + unsigned nr_samples, bool first_provoking_vertex, +- mali_ptr heap, ++ mali_ptr heap, mali_ptr scratch, + void *out); + #endif + +diff --git a/src/panfrost/lib/pan_device.h b/src/panfrost/lib/pan_device.h +index ad18d154a2c..acb46620968 100644 +--- a/src/panfrost/lib/pan_device.h ++++ b/src/panfrost/lib/pan_device.h +@@ -35,11 +35,12 @@ + #include "util/u_dynarray.h" + #include "util/bitset.h" + #include "util/list.h" +-#include "util/sparse_array.h" ++#include "util/stable_array.h" + + #include "panfrost/util/pan_ir.h" + #include "pan_pool.h" + #include "pan_util.h" ++#include "pan_base.h" + + #include + +@@ -182,6 +183,7 @@ struct panfrost_device { + void *memctx; + + int fd; ++ bool kbase; + + /* Properties of the GPU in use */ + unsigned arch; +@@ -204,6 +206,9 @@ struct panfrost_device { + const struct panfrost_model *model; + bool has_afbc; + ++ /* Does the kernel support dma-buf fence import/export? */ ++ bool has_dmabuf_fence; ++ + /* Table of formats, indexed by a PIPE format */ + const struct panfrost_format *formats; + +@@ -217,8 +222,11 @@ struct panfrost_device { + + struct renderonly *ro; + ++ /* Hold this while updating usage field of BOs */ ++ pthread_mutex_t bo_usage_lock; ++ + pthread_mutex_t bo_map_lock; +- struct util_sparse_array bo_map; ++ struct stable_array bo_map; + + struct { + pthread_mutex_t lock; +@@ -263,6 +271,10 @@ struct panfrost_device { + * unconditionally on Bifrost, and useful for sharing with Midgard */ + + struct panfrost_bo *sample_positions; ++ ++ struct kbase_ mali; ++ ++ FILE *bo_log; + }; + + void +@@ -271,6 +283,9 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev); + void + panfrost_close_device(struct panfrost_device *dev); + ++bool ++panfrost_check_dmabuf_fence(struct panfrost_device *dev); ++ + bool + panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt); + +@@ -287,12 +302,18 @@ panfrost_query_sample_position( + float *out); + + unsigned +-panfrost_query_l2_slices(const struct panfrost_device *dev); ++panfrost_query_l2_slices(struct panfrost_device *dev); + + static inline struct panfrost_bo * + pan_lookup_bo(struct panfrost_device *dev, uint32_t gem_handle) + { +- return (struct panfrost_bo *)util_sparse_array_get(&dev->bo_map, gem_handle); ++ return stable_array_get(&dev->bo_map, struct panfrost_bo, gem_handle); ++} ++ ++static inline struct panfrost_bo * ++pan_lookup_bo_existing(struct panfrost_device *dev, uint32_t gem_handle) ++{ ++ return stable_array_get_existing(&dev->bo_map, struct panfrost_bo, gem_handle); + } + + static inline bool +diff --git a/src/panfrost/lib/pan_layout.c b/src/panfrost/lib/pan_layout.c +index b64a2d7a6e5..96940438f54 100644 +--- a/src/panfrost/lib/pan_layout.c ++++ b/src/panfrost/lib/pan_layout.c +@@ -32,6 +32,14 @@ + * enabling the YUV-like transform is typically a win where possible. 
*/ + + uint64_t pan_best_modifiers[PAN_MODIFIER_COUNT] = { ++ DRM_FORMAT_MOD_ARM_AFBC( ++ AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | ++ AFBC_FORMAT_MOD_TILED | ++ AFBC_FORMAT_MOD_SC | ++ AFBC_FORMAT_MOD_SPARSE | ++ AFBC_FORMAT_MOD_YTR | ++ AFBC_FORMAT_MOD_NATIVE_SWIZZLE), ++ + DRM_FORMAT_MOD_ARM_AFBC( + AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | + AFBC_FORMAT_MOD_TILED | +@@ -201,18 +209,17 @@ pan_afbc_body_align(uint64_t modifier) + #define CHECKSUM_TILE_HEIGHT 16 + #define CHECKSUM_BYTES_PER_TILE 8 + +-unsigned +-panfrost_compute_checksum_size( +- struct pan_image_slice_layout *slice, +- unsigned width, +- unsigned height) ++struct pan_image_slice_crc ++panfrost_compute_checksum_size(unsigned width, unsigned height) + { + unsigned tile_count_x = DIV_ROUND_UP(width, CHECKSUM_TILE_WIDTH); + unsigned tile_count_y = DIV_ROUND_UP(height, CHECKSUM_TILE_HEIGHT); + +- slice->crc.stride = tile_count_x * CHECKSUM_BYTES_PER_TILE; +- +- return slice->crc.stride * tile_count_y; ++ struct pan_image_slice_crc ret = { ++ .stride = tile_count_x * CHECKSUM_BYTES_PER_TILE, ++ .size = ret.stride * tile_count_y, ++ }; ++ return ret; + } + + unsigned +@@ -236,8 +243,11 @@ panfrost_get_legacy_stride(const struct pan_image_layout *layout, + panfrost_block_size(layout->modifier, layout->format); + + if (drm_is_afbc(layout->modifier)) { ++ unsigned align_w = block_size.width * ++ pan_afbc_tile_size(layout->modifier); ++ + unsigned width = u_minify(layout->width, level); +- width = ALIGN_POT(width, block_size.width); ++ width = ALIGN_POT(width, align_w); + + return width * util_format_get_blocksize(layout->format); + } else { +@@ -392,9 +402,7 @@ pan_image_layout_init(struct pan_image_layout *layout, + + /* Add a checksum region if necessary */ + if (layout->crc) { +- slice->crc.size = +- panfrost_compute_checksum_size(slice, width, height); +- ++ slice->crc = panfrost_compute_checksum_size(width, height); + slice->crc.offset = offset; + offset += slice->crc.size; + slice->size += slice->crc.size; +diff --git a/src/panfrost/lib/pan_pool.h b/src/panfrost/lib/pan_pool.h +index 01c8348c41d..e332adff362 100644 +--- a/src/panfrost/lib/pan_pool.h ++++ b/src/panfrost/lib/pan_pool.h +@@ -130,4 +130,17 @@ pan_pool_alloc_descs(struct pan_pool *pool, + #define pan_pool_alloc_desc_aggregate(pool, ...) 
\ + pan_pool_alloc_descs(pool, PAN_DESC_AGGREGATE(__VA_ARGS__)) + ++#ifdef PAN_ARCH ++#if PAN_ARCH < 10 ++ ++#define pan_pool_alloc_desc_cs_v10(pool, name) \ ++ pan_pool_alloc_desc(pool, name) ++ ++#else /* PAN_ARCH >= 10 */ ++ ++#define pan_pool_alloc_desc_cs_v10(pool, name) ((struct panfrost_ptr) {0}) ++ ++#endif ++#endif /* PAN_ARCH */ ++ + #endif +diff --git a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c +index 048954b4c4d..57188c24f52 100644 +--- a/src/panfrost/lib/pan_props.c ++++ b/src/panfrost/lib/pan_props.c +@@ -24,6 +24,7 @@ + * Alyssa Rosenzweig + */ + ++#include + #include + + #include "util/u_math.h" +@@ -31,12 +32,14 @@ + #include "util/hash_table.h" + #include "util/u_thread.h" + #include "drm-uapi/panfrost_drm.h" ++#include "dma-uapi/dma-buf.h" + #include "pan_encoder.h" + #include "pan_device.h" + #include "pan_bo.h" + #include "pan_texture.h" + #include "wrap.h" + #include "pan_util.h" ++#include "pan_base.h" + + /* Fixed "minimum revisions" */ + #define NO_ANISO (~0) +@@ -70,6 +73,18 @@ const struct panfrost_model panfrost_model_list[] = { + MODEL(0x7212, "G52", "TGOx", HAS_ANISO, 16384, {}), + MODEL(0x7402, "G52 r1", "TGOx", HAS_ANISO, 16384, {}), + MODEL(0x9093, "G57", "TNAx", HAS_ANISO, 16384, {}), ++ MODEL(0xa867, "G610", "LODx", HAS_ANISO, 65536, {}), ++ /* Matching the kbase dummy model, probably not real GPUs */ ++ MODEL(0xa802, "G710", "TODx", HAS_ANISO, 65536, {}), ++}; ++ ++const struct panfrost_model panfrost_unknown_model = { ++ .gpu_id = 0, ++ .name = "Unknowm Mali device (Panfrost)", ++ .performance_counters = "AAAA", ++ .min_rev_anisotropic = NO_ANISO, ++ .tilebuffer_size = 8192, ++ .quirks = {}, + }; + + #undef NO_ANISO +@@ -83,12 +98,13 @@ const struct panfrost_model panfrost_model_list[] = { + const struct panfrost_model * + panfrost_get_model(uint32_t gpu_id) + { ++ + for (unsigned i = 0; i < ARRAY_SIZE(panfrost_model_list); ++i) { + if (panfrost_model_list[i].gpu_id == gpu_id) + return &panfrost_model_list[i]; + } + +- return NULL; ++ return &panfrost_unknown_model; + } + + /* Abstraction over the raw drm_panfrost_get_param ioctl for fetching +@@ -96,16 +112,27 @@ panfrost_get_model(uint32_t gpu_id) + + static __u64 + panfrost_query_raw( +- int fd, ++ struct panfrost_device *dev, + enum drm_panfrost_param param, + bool required, + unsigned default_value) + { ++ if (dev->kbase) { ++ uint64_t value; ++ bool ret = dev->mali.get_pan_gpuprop(&dev->mali, param, &value); ++ if (ret) { ++ return value; ++ } else { ++ assert(!required); ++ return default_value; ++ } ++ } ++ + struct drm_panfrost_get_param get_param = {0,}; + ASSERTED int ret; + + get_param.param = param; +- ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); + + if (ret) { + assert(!required); +@@ -116,23 +143,23 @@ panfrost_query_raw( + } + + static unsigned +-panfrost_query_gpu_version(int fd) ++panfrost_query_gpu_version(struct panfrost_device *dev) + { +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); + } + + static unsigned +-panfrost_query_gpu_revision(int fd) ++panfrost_query_gpu_revision(struct panfrost_device *dev) + { +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_REVISION, true, 0); ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_GPU_REVISION, true, 0); + } + + unsigned +-panfrost_query_l2_slices(const struct panfrost_device *dev) ++panfrost_query_l2_slices(struct 
panfrost_device *dev) + { + /* Query MEM_FEATURES register */ + uint32_t mem_features = +- panfrost_query_raw(dev->fd, DRM_PANFROST_PARAM_MEM_FEATURES, ++ panfrost_query_raw(dev, DRM_PANFROST_PARAM_MEM_FEATURES, + true, 0); + + /* L2_SLICES is MEM_FEATURES[11:8] minus(1) */ +@@ -140,10 +167,10 @@ panfrost_query_l2_slices(const struct panfrost_device *dev) + } + + static struct panfrost_tiler_features +-panfrost_query_tiler_features(int fd) ++panfrost_query_tiler_features(struct panfrost_device *dev) + { + /* Default value (2^9 bytes and 8 levels) to match old behaviour */ +- uint32_t raw = panfrost_query_raw(fd, DRM_PANFROST_PARAM_TILER_FEATURES, ++ uint32_t raw = panfrost_query_raw(dev, DRM_PANFROST_PARAM_TILER_FEATURES, + false, 0x809); + + /* Bin size is log2 in the first byte, max levels in the second byte */ +@@ -154,11 +181,11 @@ panfrost_query_tiler_features(int fd) + } + + static unsigned +-panfrost_query_core_count(int fd, unsigned *core_id_range) ++panfrost_query_core_count(struct panfrost_device *dev, unsigned *core_id_range) + { + /* On older kernels, worst-case to 16 cores */ + +- unsigned mask = panfrost_query_raw(fd, ++ unsigned mask = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff); + + /* Some cores might be absent. In some cases, we care +@@ -199,16 +226,16 @@ panfrost_max_thread_count(unsigned arch) + } + + static unsigned +-panfrost_query_thread_tls_alloc(int fd, unsigned major) ++panfrost_query_thread_tls_alloc(struct panfrost_device *dev, unsigned major) + { +- unsigned tls = panfrost_query_raw(fd, ++ unsigned tls = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 0); + + return (tls > 0) ? tls : panfrost_max_thread_count(major); + } + + static uint32_t +-panfrost_query_compressed_formats(int fd) ++panfrost_query_compressed_formats(struct panfrost_device *dev) + { + /* If unspecified, assume ASTC/ETC only. Factory default for Juno, and + * should exist on any Mali configuration. All hardware should report +@@ -227,7 +254,7 @@ panfrost_query_compressed_formats(int fd) + (1 << MALI_ASTC_2D_LDR) | + (1 << MALI_ASTC_2D_HDR); + +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, + false, default_set); + } + +@@ -250,9 +277,9 @@ panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt) + * may omit it, signaled as a nonzero value in the AFBC_FEATURES property. 
*/ + + static bool +-panfrost_query_afbc(int fd, unsigned arch) ++panfrost_query_afbc(struct panfrost_device *dev, unsigned arch) + { +- unsigned reg = panfrost_query_raw(fd, ++ unsigned reg = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_AFBC_FEATURES, + false, 0); + +@@ -281,24 +308,40 @@ panfrost_query_optimal_tib_size(const struct panfrost_device *dev) + void + panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + { ++ if (kbase_open(&dev->mali, fd, 4, (dev->debug & PAN_DBG_LOG))) { ++ dev->kbase = true; ++ fd = -1; ++ } ++ + dev->fd = fd; + dev->memctx = memctx; +- dev->gpu_id = panfrost_query_gpu_version(fd); ++ dev->gpu_id = panfrost_query_gpu_version(dev); + dev->arch = pan_arch(dev->gpu_id); +- dev->kernel_version = drmGetVersion(fd); +- dev->revision = panfrost_query_gpu_revision(fd); ++ if (dev->kbase) { ++ dev->kernel_version = calloc(1, sizeof(drmVersion)); ++ *dev->kernel_version = (drmVersion) { ++ .version_major = 1, ++ .version_minor = 999, ++ }; ++ } else { ++ dev->kernel_version = drmGetVersion(fd); ++ } ++ dev->revision = panfrost_query_gpu_revision(dev); + dev->model = panfrost_get_model(dev->gpu_id); + + /* If we don't recognize the model, bail early */ + if (!dev->model) + return; + +- dev->core_count = panfrost_query_core_count(fd, &dev->core_id_range); +- dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd, dev->arch); ++ if (dev->debug & PAN_DBG_BO_LOG) ++ dev->bo_log = fopen("/tmp/bo_log", "w"); ++ ++ dev->core_count = panfrost_query_core_count(dev, &dev->core_id_range); ++ dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(dev, dev->arch); + dev->optimal_tib_size = panfrost_query_optimal_tib_size(dev); +- dev->compressed_formats = panfrost_query_compressed_formats(fd); +- dev->tiler_features = panfrost_query_tiler_features(fd); +- dev->has_afbc = panfrost_query_afbc(fd, dev->arch); ++ dev->compressed_formats = panfrost_query_compressed_formats(dev); ++ dev->tiler_features = panfrost_query_tiler_features(dev); ++ dev->has_afbc = panfrost_query_afbc(dev, dev->arch); + + if (dev->arch <= 6) + dev->formats = panfrost_pipe_format_v6; +@@ -307,8 +350,10 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + else + dev->formats = panfrost_pipe_format_v9; + +- util_sparse_array_init(&dev->bo_map, sizeof(struct panfrost_bo), 512); ++ stable_array_init(&dev->bo_map, struct panfrost_bo); + ++ pthread_mutex_init(&dev->bo_usage_lock, NULL); ++ pthread_mutex_init(&dev->bo_map_lock, NULL); + pthread_mutex_init(&dev->bo_cache.lock, NULL); + list_inithead(&dev->bo_cache.lru); + +@@ -323,8 +368,9 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + * active for a single job chain at once, so a single heap can be + * shared across batches/contextes */ + +- dev->tiler_heap = panfrost_bo_create(dev, 128 * 1024 * 1024, +- PAN_BO_INVISIBLE | PAN_BO_GROWABLE, "Tiler heap"); ++ if (dev->arch < 10) ++ dev->tiler_heap = panfrost_bo_create(dev, 128 * 1024 * 1024, ++ PAN_BO_INVISIBLE | PAN_BO_GROWABLE, "Tiler heap"); + + pthread_mutex_init(&dev->submit_lock, NULL); + +@@ -341,11 +387,102 @@ panfrost_close_device(struct panfrost_device *dev) + if (dev->model) { + pthread_mutex_destroy(&dev->submit_lock); + panfrost_bo_unreference(dev->tiler_heap); ++ panfrost_bo_unreference(dev->sample_positions); + panfrost_bo_cache_evict_all(dev); + pthread_mutex_destroy(&dev->bo_cache.lock); +- util_sparse_array_finish(&dev->bo_map); ++ pthread_mutex_destroy(&dev->bo_map_lock); ++ pthread_mutex_destroy(&dev->bo_usage_lock); ++ 
stable_array_fini(&dev->bo_map); ++ } ++ ++ if (dev->kbase) ++ free(dev->kernel_version); ++ else ++ drmFreeVersion(dev->kernel_version); ++ if (dev->kbase) ++ dev->mali.close(&dev->mali); ++ else ++ close(dev->fd); ++} ++ ++bool ++panfrost_check_dmabuf_fence(struct panfrost_device *dev) ++{ ++ bool ret = false; ++ int err; ++ ++ /* This function is only useful for kbase, where we can't create ++ * dma-bufs from the kbase FD. */ ++ if (!dev->ro) ++ goto out; ++ ++ struct drm_mode_create_dumb create_dumb = { ++ .width = 16, ++ .height = 16, ++ .bpp = 32, ++ }; ++ ++ err = drmIoctl(dev->ro->kms_fd, DRM_IOCTL_MODE_CREATE_DUMB, &create_dumb); ++ if (err < 0) { ++ fprintf(stderr, "DRM_IOCTL_MODE_CREATE_DUMB failed " ++ "for fence check: %s\n", ++ strerror(errno)); ++ goto out; ++ } ++ ++ int fd; ++ err = drmPrimeHandleToFD(dev->ro->kms_fd, create_dumb.handle, O_CLOEXEC, ++ &fd); ++ if (err < 0) { ++ fprintf(stderr, "failed to export buffer for fence check: %s\n", ++ strerror(errno)); ++ goto free_dumb; + } + +- drmFreeVersion(dev->kernel_version); +- close(dev->fd); ++ struct dma_buf_export_sync_file export = { ++ .flags = DMA_BUF_SYNC_RW, ++ }; ++ ++ /* ENOTTY is returned if the ioctl is unsupported */ ++ ++ err = drmIoctl(fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &export); ++ if (err < 0) { ++ if (errno != ENOTTY) ++ fprintf(stderr, "failed to export fence: %s\n", ++ strerror(errno)); ++ goto free_fd; ++ } ++ ++ struct dma_buf_import_sync_file import = { ++ .flags = DMA_BUF_SYNC_RW, ++ .fd = export.fd, ++ }; ++ ++ err = drmIoctl(fd, DMA_BUF_IOCTL_IMPORT_SYNC_FILE, &import); ++ if (err < 0) { ++ if (errno != ENOTTY) ++ fprintf(stderr, "failed to import fence: %s\n", ++ strerror(errno)); ++ goto free_sync; ++ } ++ ++ /* We made it this far, the kernel must support the ioctls */ ++ ret = true; ++ ++free_sync: ++ close(export.fd); ++ ++free_fd: ++ close(fd); ++ ++ /* Some compilers don't like goto to a declaration */ ++ struct drm_mode_destroy_dumb destroy_dumb; ++free_dumb: ++ destroy_dumb = (struct drm_mode_destroy_dumb) { ++ .handle = create_dumb.handle, ++ }; ++ drmIoctl(dev->ro->kms_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy_dumb); ++ ++out: ++ return ret; + } +diff --git a/src/panfrost/lib/pan_scoreboard.h b/src/panfrost/lib/pan_scoreboard.h +index f6476c66651..11820ca5432 100644 +--- a/src/panfrost/lib/pan_scoreboard.h ++++ b/src/panfrost/lib/pan_scoreboard.h +@@ -55,6 +55,7 @@ struct pan_scoreboard { + }; + + #ifdef PAN_ARCH ++#if PAN_ARCH < 10 + /* + * There are various types of Mali jobs: + * +@@ -266,6 +267,7 @@ panfrost_scoreboard_initialize_tiler(struct pan_pool *pool, + scoreboard->first_job = transfer.gpu; + return transfer; + } ++#endif /* PAN_ARCH < 10 */ + #endif /* PAN_ARCH */ + + #endif +diff --git a/src/panfrost/lib/pan_texture.h b/src/panfrost/lib/pan_texture.h +index 58dcef725b6..1780ad28ec2 100644 +--- a/src/panfrost/lib/pan_texture.h ++++ b/src/panfrost/lib/pan_texture.h +@@ -44,9 +44,15 @@ + extern "C" { + #endif + +-#define PAN_MODIFIER_COUNT 6 ++#define PAN_MODIFIER_COUNT 7 + extern uint64_t pan_best_modifiers[PAN_MODIFIER_COUNT]; + ++struct pan_image_slice_crc { ++ unsigned offset; ++ unsigned stride; ++ unsigned size; ++}; ++ + struct pan_image_slice_layout { + unsigned offset; + +@@ -80,11 +86,7 @@ struct pan_image_slice_layout { + + /* If checksumming is enabled following the slice, what + * is its offset/stride? 
*/ +- struct { +- unsigned offset; +- unsigned stride; +- unsigned size; +- } crc; ++ struct pan_image_slice_crc crc; + + unsigned size; + }; +@@ -141,11 +143,8 @@ struct pan_image_view { + } buf; + }; + +-unsigned +-panfrost_compute_checksum_size( +- struct pan_image_slice_layout *slice, +- unsigned width, +- unsigned height); ++struct pan_image_slice_crc ++panfrost_compute_checksum_size(unsigned width, unsigned height); + + /* AFBC */ + +@@ -164,6 +163,9 @@ panfrost_afbc_can_ytr(enum pipe_format format); + bool + panfrost_afbc_can_tile(const struct panfrost_device *dev); + ++bool ++panfrost_afbc_only_native(unsigned arch, enum pipe_format format); ++ + /* + * Represents the block size of a single plane. For AFBC, this represents the + * superblock size. For u-interleaving, this represents the tile size. +diff --git a/src/panfrost/lib/pan_util.h b/src/panfrost/lib/pan_util.h +index c2f883737c3..eb6b34e1566 100644 +--- a/src/panfrost/lib/pan_util.h ++++ b/src/panfrost/lib/pan_util.h +@@ -47,10 +47,16 @@ + #define PAN_DBG_LINEAR 0x1000 + #define PAN_DBG_NO_CACHE 0x2000 + #define PAN_DBG_DUMP 0x4000 +- + #ifndef NDEBUG + #define PAN_DBG_OVERFLOW 0x8000 + #endif ++#define PAN_DBG_TILER 0x010000 ++#define PAN_DBG_BO_LOG 0x020000 ++#define PAN_DBG_BO_CLEAR 0x040000 ++#define PAN_DBG_UNCACHED_GPU 0x100000 ++#define PAN_DBG_UNCACHED_CPU 0x200000 ++#define PAN_DBG_LOG 0x400000 ++#define PAN_DBG_GOFASTER 0x800000 + + struct panfrost_device; + +diff --git a/src/panfrost/lib/wrap.h b/src/panfrost/lib/wrap.h +index 56bb0f48aed..c706cc65308 100644 +--- a/src/panfrost/lib/wrap.h ++++ b/src/panfrost/lib/wrap.h +@@ -46,6 +46,8 @@ void pandecode_initialize(bool to_stderr); + + void pandecode_next_frame(void); + ++void pandecode_dump_file_close(void); ++ + void pandecode_close(void); + + void +@@ -55,6 +57,10 @@ void pandecode_inject_free(uint64_t gpu_va, unsigned sz); + + void pandecode_jc(uint64_t jc_gpu_va, unsigned gpu_id); + ++void pandecode_cs(uint64_t cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ ++void pandecode_dump_mappings(void); ++ + void + pandecode_abort_on_fault(uint64_t jc_gpu_va, unsigned gpu_id); + +diff --git a/src/panfrost/meson.build b/src/panfrost/meson.build +index 66847f64569..7cbd81927b4 100644 +--- a/src/panfrost/meson.build ++++ b/src/panfrost/meson.build +@@ -20,7 +20,7 @@ + # SOFTWARE. 
+ + inc_panfrost_hw = include_directories([ +- 'include' ++ 'include', 'base' + ]) + + inc_panfrost = include_directories([ +@@ -36,6 +36,8 @@ subdir('util') + subdir('midgard') + subdir('bifrost') + ++subdir('base') ++ + if with_gallium_panfrost or with_panfrost_vk + subdir('lib') + subdir('perf') +@@ -71,6 +73,46 @@ bifrost_compiler = executable( + build_by_default : with_tools.contains('panfrost') + ) + ++csf_test = executable( ++ 'csf_test', ++ ['csf_test/test.c'], ++ include_directories : [ ++ inc_mapi, ++ inc_mesa, ++ inc_gallium, ++ inc_gallium_aux, ++ inc_include, ++ inc_src, ++ inc_panfrost, ++ inc_panfrost_hw, ++ ], ++ dependencies : [ ++ idep_nir, ++ idep_mesautil, ++ idep_bi_opcodes_h, ++ dep_libdrm, ++ libpanfrost_dep, ++ ], ++ build_by_default : true ++) ++ ++custom_target( ++ 'panfrost_panloader', ++ output: ['panfrost_panloader.txt'], ++ depends : [ ++ libpanfrost_lib, ++ libpanfrost_util, ++ _libmesa_util, ++ libpanfrost_decode, ++ libpanfrost_decode_per_arch, ++ libpanfrost_midgard_disasm, ++ libpanfrost_bifrost_disasm, ++ libpanfrost_valhall_disasm, ++ ], ++ command: ['touch', '@OUTPUT@'], ++ build_by_default : false, ++) ++ + if with_panfrost_vk + subdir('vulkan') + endif +diff --git a/src/panfrost/midgard/disassemble.c b/src/panfrost/midgard/disassemble.c +index b47902a9ce3..3643e5a6029 100644 +--- a/src/panfrost/midgard/disassemble.c ++++ b/src/panfrost/midgard/disassemble.c +@@ -1242,7 +1242,9 @@ print_alu_word(disassemble_context *ctx, FILE *fp, uint32_t *words, + UNUSED static void + print_varying_parameters(FILE *fp, midgard_load_store_word *word) + { +- midgard_varying_params p = midgard_unpack_varying_params(*word); ++ unsigned params = word->signed_offset & 0x1FF; ++ midgard_varying_params p; ++ memcpy(&p, ¶ms, sizeof(p)); + + /* If a varying, there are qualifiers */ + if (p.flat_shading) +diff --git a/src/panfrost/tiler/tiler-hex-read b/src/panfrost/tiler/tiler-hex-read +new file mode 100755 +index 00000000000..1c188e38ec1 +--- /dev/null ++++ b/src/panfrost/tiler/tiler-hex-read +@@ -0,0 +1,400 @@ ++#!/usr/bin/env python3 ++ ++import sys ++import struct ++ ++FLIP_Y = False ++ ++data = b'' ++ ++fb_width = 160 ++fb_height = 160 ++hierarchy_mask = 0xffff ++ ++HEAP_OFS = 0x8000 ++ ++base_ptr = 0 ++heap_ptr = 0 ++midgard = False ++bifrost = True ++valhall = False ++size = None ++ ++bak_data = b'' ++ ++cur_data = b'' ++ ++# TODO: More robust looping.. 
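++# Example input, derived from the keyword checks in the loop below; the
++# addresses and sizes here are made up:
++#   width 160
++#   height 160
++#   mask 0xffff
++#   addr 12345000         (Midgard base address; use "vaheap <addr>" on Valhall)
++#   heap 12346000         (optional: hex lines after this key form the heap dump)
++#   size 0x10000
++#   8000 00 01 02 03 ...  (hex offset, then byte values; "*" tokens are skipped)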
++for line in sys.stdin.read().split("\n"): ++ print(line) ++ split = line.split(" ") ++ if not len(split) or split[0] == "": ++ continue ++ if split[0] == "width": ++ fb_width = int(split[1]) ++ continue ++ if split[0] == "height": ++ fb_height = int(split[1]) ++ continue ++ if split[0] == "mask": ++ hierarchy_mask = int(split[1], 0) ++ continue ++ if split[0] == "vaheap": ++ base_ptr = int(split[1], 16) ++ bifrost = False ++ valhall = True ++ continue ++ if split[0] == "addr": ++ base_ptr = int(split[1], 16) ++ bifrost = False ++ midgard = True ++ HEAP_OFS = 0x40 ++ continue ++ if split[0] == "heap": ++ heap_ptr = int(split[1], 16) ++ data += cur_data ++ cur_data = b'' ++ bak_data = data ++ data = b'' ++ continue ++ if split[0] == "size": ++ size = int(split[1], 0) ++ continue ++ offset = int(split[0], 16) ++ if offset > len(data): ++ data += cur_data ++ cur_data = b'' ++ data += b'\0' * (offset - len(data)) ++ for d in split[1:]: ++ if d == "" or d == "*": ++ continue ++ cur_data += bytes([int(d, 16)]) ++ ++data += cur_data ++ ++if heap_ptr: ++ data, heap_data = bak_data, data ++ ++if size == None: ++ size = len(data) ++ ++def int7(val, signed=True): ++ val = val & 0x7f ++ if signed and val >= 0x40: ++ return val - 0x80 ++ else: ++ return val ++ ++def int8(val, signed=True): ++ val = val & 0xff ++ if signed and val >= 0x80: ++ return val - 0x100 ++ else: ++ return val ++ ++def fetch(ptr, size): ++ if midgard: ++ if ptr >= base_ptr and ptr < base_ptr + len(data): ++ base = ptr - base_ptr ++ return data[base:base+size] ++ elif ptr >= heap_ptr and ptr < heap_ptr + len(heap_data): ++ base = ptr - heap_ptr ++ return heap_data[base:base+size] ++ else: ++ if valhall: ++ ptr -= base_ptr ++ if ptr < 0: ++ return b"" ++ return data[ptr:ptr+size] ++ ++def print_draw(ptr): ++ draw = fetch(ptr, 128) ++ if len(draw) < 128: ++ print(" couldn't fetch draw struct") ++ return ++ decoded = struct.unpack("=16Q", draw) ++ coverage = [0 for x in decoded] ++ ++ fields = ( ++ ("Allow forward pixel to kill", 1, "0:0", "bool"), ++ ("Allow forward pixel to be killed", 1, "0:1", "bool"), ++ ("Pixel kill operation", 2, "0:2", "Pixel Kill"), ++ ("ZS update operation", 2, "0:4", "Pixel Kill"), ++ ("Allow primitive reorder", 1, "0:6", "bool"), ++ ("Overdraw alpha0", 1, "0:7", "bool"), ++ ("Overdraw alpha1", 1, "0:8", "bool"), ++ ("Clean Fragment Write", 1, "0:9", "bool"), ++ ("Primitive Barrier", 1, "0:10", "bool"), ++ ("Evaluate per-sample", 1, "0:11", "bool"), ++ ("Single-sampled lines", 1, "0:13", "bool"), ++ ("Occlusion query", 2, "0:14", "Occlusion Mode"), ++ ("Front face CCW", 1, "0:16", "bool"), ++ ("Cull front face", 1, "0:17", "bool"), ++ ("Cull back face", 1, "0:18", "bool"), ++ ("Multisample enable", 1, "0:19", "bool"), ++ ("Shader modifies coverage", 1, "0:20", "bool"), ++ ("Alpha-to-coverage Invert", 1, "0:21", "bool"), ++ ("Alpha-to-coverage", 1, "0:22", "bool"), ++ ("Scissor to bounding box", 1, "0:23", "bool"), ++ ("Sample mask", 16, "1:0", "uint"), ++ ("Render target mask", 8, "1:16", "hex"), ++ ++ ("Packet", 1, "2:0", "bool"), ++ # TODO: shr modifier ++ ("Vertex array", 64, "2:0", "address"), ++ ("Vertex packet stride", 16, "4:0", "uint"), ++ ("Vertex attribute stride", 16, "4:16", "uint"), ++ ("Unk", 16, "5:0", "uint"), ++ ++ ("Minimum Z", 32, "6:0", "float"), ++ ("Maximum Z", 32, "7:0", "float"), ++ ("Depth/stencil", 64, "10:0", "address"), ++ ("Blend count", 4, "12:0", "uint"), ++ ("Blend", 60, "12:4", "address"), ++ ("Occlusion", 64, "14:0", "address"), ++ ++ ("Attribute offset", 32, "16:0", 
"uint"), ++ ("FAU count", 8, "17:0", "uint"), ++ ("Resources", 48, "24:0", "address"), ++ ("Shader", 48, "26:0", "address"), ++ ("Thread storage", 48, "28:0", "address"), ++ ("FAU", 64, "30:0", "address"), ++ ) ++ ++ for f in fields: ++ name, size, start, type = f ++ word, bit = [int(x) for x in start.split(":")] ++ if word & 1: ++ bit += 32 ++ word >>= 1 ++ ++ mask = (1 << size) - 1 ++ data = (decoded[word] >> bit) & mask ++ coverage[word] |= mask << bit ++ if type == "float": ++ data = struct.unpack("=f", struct.pack("=I", data))[0] ++ else: ++ data = hex(data) ++ print(f" {name}: {data}") ++ ++ for i, (d, c) in enumerate(zip(decoded, coverage)): ++ ci = c ^ ((1 << 64) - 1) ++ if d & ci: ++ print(f" unk at 64-bit word {i}: {hex(d)} (known mask {hex(c)})") ++ ++def print_vertex(ptr, positions): ++ for p in positions: ++ addr = ptr + p * 16 ++ data = fetch(addr, 16) ++ if len(data) < 16: ++ print(f" ") ++ continue ++ x, y, z, w = struct.unpack("=4f", data) ++ print(f" <{x} {y} {z} {w}>") ++ ++DRAW_TYPES = [ ++ "unk", ++ "points", ++ "lines", ++ "tris", ++] ++ ++def heap_interpret(start, end): ++ print(f"interpreting from {hex(start)} to {hex(end)}") ++ ++ struct_count = 0 ++ ++ signed = True ++ ++ base = 0 ++ a = 0 ++ b = 0 ++ c = 0 ++ ++ num_vert = 3 ++ ++ draw_ptr = 0 ++ pos_ptr = 0 ++ ++ while start != end: ++ if midgard and start & 0x1ff == 0x1f8: ++ jump = struct.unpack("=Q", fetch(start, 8))[0] ++ print(f"jump mdg: {hex(jump)}") ++ start = jump ++ continue ++ ++ dat = fetch(start, 4) ++ if dat[3] & 0xe0 == 0x80: ++ struct_count += 1 ++ ++ print(f"{struct_count}:", " ".join([f"{hex(x)[2:].upper():>02}" for x in dat]), end=" ") ++ ++ masked_op = dat[3] & ~3 ++ ++ up = struct.unpack("=I", dat)[0] ++ ++ if valhall: ++ tri0 = tri0_7 = int7(up >> 15, signed) ++ tri1 = int7(up >> 8, signed) ++ tri2 = int7(up >> 1, signed) ++ else: ++ tri0 = int8(up >> 14, signed) ++ tri0_7 = int7(up >> 14, signed) ++ tri1 = int7(up >> 7, signed) ++ tri2 = int7(up, signed) ++ ++ signed = True ++ ++ if dat[3] & 0xe0 == 0x80: ++ res = "" ++ if valhall: ++ address = (up & 0x7ffffff) * 32 ++ num_vert = (dat[3] >> 3) & 0x3 ++ else: ++ address = (up & 0xffffff) * 64 ++ num_vert = (dat[3] >> 2) & 0x3 ++ if dat[3] & 0x10: ++ a = 0 ++ res = " reset" ++ draw_ptr = address ++ if valhall: ++ pos_ptr = address + 128 ++ print(f"draw {DRAW_TYPES[num_vert]}{res}: {hex(address)}") ++ elif valhall and dat[3] >> 4 == 12: ++ unk1 = up & 0x3f ++ address = (up >> 6) & 0xffff ++ unk2 = up >> 22 ++ draw_ptr += address << 32 ++ pos_ptr += address << 32 ++ print(f"draw offset: {hex(address)}, unk {hex(unk1)}, {hex(unk2)}") ++ ++ print_draw(draw_ptr) ++ elif dat[3] >> 6 == 1: ++ # TODO: handle two of these in a row ++ res = "" ++ if valhall: ++ # TOOD: Is the mask correct? 
++ pf = (up >> 22) & 0x7f ++ shift = 7 ++ if dat[3] & 0x20: ++ a = 0 ++ res = " reset" ++ else: ++ pf = (up >> 21) & 0x7f ++ shift = 8 ++ ++ a += tri0_7 << shift ++ b += tri1 << 7 ++ c += tri2 << 7 ++ print(f"primitive offset{res}: {hex(pf << 4)} | +{tri0_7 << shift} {tri1 << 7} {tri2 << 7}") ++ signed = False ++ # TODO: Jumps are located based on position, not opcode ++ elif dat[3] == 0xff: ++ up64 = struct.unpack("=Q", fetch(start, 8))[0] ++ assert((up64 & 3) == 3) ++ print(f"jump (from {hex(start+8)}-8): {hex(up64 - 3)}") ++ start = up64 - 7 ++ elif dat[3] == 0x00: ++ assert((up & 3) == 3) ++ print(f"jump (from {hex(start+4)}-4): {hex(up - 3)}, {hex(HEAP_OFS + up - 3)}") ++ start = HEAP_OFS + up - 7 ++ elif (masked_op & 0xc0) == 0: ++ mode = hex(dat[3] >> 2) ++ ++ pre_offset = (up >> 22) & 0xf ++ ++ unk = "" ++ if valhall and up & 1: ++ unk = ", unk 1" ++ ++ a += base + tri0 ++ b += a + tri1 ++ c += a + tri2 ++ base = a ++ ++ print(f"{mode} draw: {hex(pre_offset)} | +{tri0} {tri1} {tri2}{unk}") ++ ++ print_vertex(pos_ptr, [a, b, c][:num_vert]) ++ ++ a = b = c = 0 ++ ++ else: ++ print(f"Unknown opcode {hex(dat[3])}") ++ ++ start += 4 ++ ++def level_list(): ++ levels = [] ++ size = 16 ++ anylevel = False ++ ++ # TODO: Does this miss the largest level? ++ while anylevel == False or size // 2 < min(fb_width, fb_height): ++ if (hierarchy_mask << 4) & size != 0: ++ anylevel = True ++ levels.append(size) ++ ++ size *= 2 ++ ++ return levels ++ ++def div_round_up(x, y): ++ return (x + y - 1) // y ++ ++def align(x, y): ++ return div_round_up(x, y) * y ++ ++def tile_count(alignment=4): ++ return sum(align(div_round_up(fb_width, size) * div_round_up(fb_height, size), 4) ++ for size in level_list()) ++ ++if midgard: ++ unpacked_header = list(struct.unpack("=16i", data[0:64])) ++ # Is this really big endian? ++ unpacked_header[5:7] = struct.unpack(">2i", data[20:28]) ++ print(f"header: {' '.join([str(x) for x in unpacked_header])}") ++ ++ # Extra is because of HEAP_OFS ++ header_size = align(tile_count() + 8, 64) ++elif valhall: ++ # TODO: Does this figure need alignment? 
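++    # Assumed layout, inferred from the footer fetch further down: the dump
++    # ends with two per-tile tables of 8-byte entries (headers, then footers
++    # one HEAP_STRIDE later), hence the offset of size - 2 * HEAP_STRIDE.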
++ HEAP_STRIDE = tile_count() * 8 ++ HEAP_OFS = size - HEAP_STRIDE * 2 ++ ++pos = base_ptr + HEAP_OFS ++ ++for size in level_list(): ++ for y in range((fb_height + size - 1) // size): ++ for x in range((fb_width + size - 1) // size): ++ header = fetch(pos, 8) ++ if len(header) == 0: ++ break ++ ++ if midgard: ++ end = struct.unpack("=Q", header)[0] ++ use = bool(end) ++ end += 4 ++ start = base_ptr + header_size * 8 + (pos - base_ptr - HEAP_OFS) * 64 ++ elif bifrost: ++ end, start = struct.unpack("=II", header) ++ use = bool(end) ++ start += HEAP_OFS ++ end += HEAP_OFS + 4 ++ end &= ~3 ++ else: ++ footer = fetch(pos + HEAP_STRIDE, 8) ++ if len(footer) == 0: ++ break ++ start, end = struct.unpack("=QQ", header + footer) ++ use = bool(end) ++ # The upper bits are used for jump metadata ++ end &= (1 << 48) - 1 ++ end += 4 ++ if use: ++ if FLIP_Y: ++ print([x * size, fb_height - (y + 1) * size], ((x + 1) * size, fb_height - y * size)) ++ else: ++ print([x * size, y * size], ((x + 1) * size, (y + 1) * size)) ++ heap_interpret(start, end) ++ ++ pos += 8 +diff --git a/src/util/os_misc.c b/src/util/os_misc.c +index 13963afdffe..e5ade02e70b 100644 +--- a/src/util/os_misc.c ++++ b/src/util/os_misc.c +@@ -53,7 +53,6 @@ + # define LOG_TAG "MESA" + # include + # include +-# include + #elif DETECT_OS_LINUX || DETECT_OS_CYGWIN || DETECT_OS_SOLARIS || DETECT_OS_HURD + # include + #elif DETECT_OS_OPENBSD || DETECT_OS_FREEBSD +@@ -123,93 +122,10 @@ os_log_message(const char *message) + #endif + } + +-#if DETECT_OS_ANDROID +-# include +-# include "hash_table.h" +-# include "ralloc.h" +-# include "simple_mtx.h" +- +-static struct hash_table *options_tbl; +- +-static void +-options_tbl_fini(void) +-{ +- _mesa_hash_table_destroy(options_tbl, NULL); +-} +- +-/** +- * Get an option value from android's property system, as a fallback to +- * getenv() (which is generally less useful on android due to processes +- * typically being forked from the zygote. +- * +- * The option name used for getenv is translated into a property name +- * by: +- * +- * 1) convert to lowercase +- * 2) replace '_' with '.' +- * 3) if necessary, prepend "mesa." +- * +- * For example: +- * - MESA_EXTENSION_OVERRIDE -> mesa.extension.override +- * - GALLIUM_HUD -> mesa.gallium.hud +- * +- * Note that we use a hashtable for two purposes: +- * 1) Avoid re-translating the option name on subsequent lookups +- * 2) Avoid leaking memory. Because property_get() returns the +- * property value into a user allocated buffer, we cannot return +- * that directly to the caller, so we need to strdup(). With the +- * hashtable, subsquent lookups can return the existing string. +- */ +-static const char * +-os_get_android_option(const char *name) +-{ +- if (!options_tbl) { +- options_tbl = _mesa_hash_table_create(NULL, _mesa_hash_string, +- _mesa_key_string_equal); +- atexit(options_tbl_fini); +- } +- +- struct hash_entry *entry = _mesa_hash_table_search(options_tbl, name); +- if (entry) { +- return entry->data; +- } +- +- char value[PROPERTY_VALUE_MAX]; +- char key[PROPERTY_KEY_MAX]; +- char *p = key, *end = key + PROPERTY_KEY_MAX; +- /* add "mesa." 
prefix if necessary: */ +- if (strstr(name, "MESA_") != name) +- p += strlcpy(p, "mesa.", end - p); +- p += strlcpy(p, name, end - p); +- for (int i = 0; key[i]; i++) { +- if (key[i] == '_') { +- key[i] = '.'; +- } else { +- key[i] = tolower(key[i]); +- } +- } +- +- const char *opt = NULL; +- int len = property_get(key, value, NULL); +- if (len > 1) { +- opt = ralloc_strdup(options_tbl, value); +- } +- +- _mesa_hash_table_insert(options_tbl, name, (void *)opt); +- +- return opt; +-} +-#endif +- + const char * + os_get_option(const char *name) + { + const char *opt = getenv(name); +-#if DETECT_OS_ANDROID +- if (!opt) { +- opt = os_get_android_option(name); +- } +-#endif + return opt; + } + +diff --git a/src/util/perf/cpu_trace.h b/src/util/perf/cpu_trace.h +index c13a3821158..e8423d40407 100644 +--- a/src/util/perf/cpu_trace.h ++++ b/src/util/perf/cpu_trace.h +@@ -27,19 +27,6 @@ + util_perfetto_trace_end(category); \ + } while (0) + +-/* NOTE: for now disable atrace for C++ to workaround a ndk bug with ordering +- * between stdatomic.h and atomic.h. See: +- * +- * https://github.com/android/ndk/issues/1178 +- */ +-#elif defined(ANDROID) && !defined(__cplusplus) +- +-#include +- +-#define _MESA_TRACE_BEGIN(category, name) \ +- atrace_begin(ATRACE_TAG_GRAPHICS, name) +-#define _MESA_TRACE_END(category) atrace_end(ATRACE_TAG_GRAPHICS) +- + #else + + #define _MESA_TRACE_BEGIN(category, name) +diff --git a/src/util/stable_array.h b/src/util/stable_array.h +new file mode 100644 +index 00000000000..a590aa48a50 +--- /dev/null ++++ b/src/util/stable_array.h +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (C) 2022 Icecream95 ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef STABLE_ARRAY_H ++#define STABLE_ARRAY_H ++ ++#include "util/simple_mtx.h" ++#include "util/u_math.h" ++ ++/* A thread-safe automatically growing array where elements have stable locations ++ * ++ * This data structure has these properties: ++ * ++ * 1. Accessing an element is constant time (if allocation is not required). ++ * ++ * 2. Elements are not moved in memory, so it is safe to store a pointer to ++ * something in a stable_array. ++ * ++ * 3. The data structure is thread-safe. To improve performance, there is ++ * also a fast path that does not require atomics. ++ * ++ * 4. 
Although the data structure is not lock-free, there is a limit on the ++ * number of times that a lock is ever acquired--a maximum of 32 times the ++ * number of accessing threads. In practice, contention will never be an ++ * issue for long-lived stable_arrays. ++ * ++ * 5. Memory usage is similar to util_dynarray, with each allocation being ++ * twice as large as the last. Freeing buckets is currently never done. ++ * ++ * The data structure is faster than util_sparse_array, but is not sparse. ++ */ ++ ++struct stable_array ++{ ++ uint8_t *buckets[32]; ++ simple_mtx_t lock; ++ size_t eltsize; ++}; ++ ++static inline void ++stable_array_init_bytes(struct stable_array *buf, size_t eltsize) ++{ ++ memset(buf, 0, sizeof(*buf)); ++ buf->eltsize = eltsize; ++ simple_mtx_init(&buf->lock, mtx_plain); ++} ++ ++static inline void ++stable_array_fini(struct stable_array *buf) ++{ ++ simple_mtx_destroy(&buf->lock); ++ for (unsigned i = 0; i < ARRAY_SIZE(buf->buckets); ++i) { ++ if (buf->buckets[i]) ++ free(buf->buckets[i]); ++ } ++} ++ ++struct stable_array_index ++{ ++ unsigned bucket; ++ unsigned idx; ++}; ++ ++static inline struct stable_array_index ++stable_array_get_index(unsigned idx) ++{ ++ struct stable_array_index i = {0}; ++ i.bucket = util_logbase2(idx); ++ i.idx = i.bucket ? (idx -= (1 << i.bucket)) : idx; ++ return i; ++} ++ ++static inline void * ++stable_array_get_bytes(struct stable_array *buf, unsigned idx, size_t eltsize) ++{ ++ assert(eltsize == buf->eltsize); ++ ++ struct stable_array_index i = stable_array_get_index(idx); ++ ++ uint8_t *bucket = p_atomic_read(&buf->buckets[i.bucket]); ++ ++ if (!bucket) { ++ simple_mtx_lock(&buf->lock); ++ bucket = buf->buckets[i.bucket]; ++ ++ if (!bucket) { ++ /* The first two buckets both have two elements */ ++ bucket = (uint8_t *)calloc(1U << MAX2(i.bucket, 1), eltsize); ++ ++ p_atomic_set(&buf->buckets[i.bucket], bucket); ++ } ++ simple_mtx_unlock(&buf->lock); ++ } ++ ++ return bucket + eltsize * i.idx; ++} ++ ++static inline void * ++stable_array_get_existing_bytes(struct stable_array *buf, unsigned idx, size_t eltsize) ++{ ++ assert(eltsize == buf->eltsize); ++ ++ struct stable_array_index i = stable_array_get_index(idx); ++ ++ return buf->buckets[i.bucket] + eltsize * i.idx; ++} ++ ++#define stable_array_init(buf, type) stable_array_init_bytes((buf), sizeof(type)) ++#define stable_array_get(buf, type, idx) ((type*)stable_array_get_bytes((buf), (idx), sizeof(type))) ++#define stable_array_get_existing(buf, type, idx) ((type*)stable_array_get_existing_bytes((buf), (idx), sizeof(type))) ++ ++#endif +diff --git a/src/util/u_debug_stack_android.cpp b/src/util/u_debug_stack_android.cpp +index 2c7b2d53676..f31389752bd 100644 +--- a/src/util/u_debug_stack_android.cpp ++++ b/src/util/u_debug_stack_android.cpp +@@ -21,7 +21,6 @@ + * IN THE SOFTWARE. + */ + +-#include + + #include "util/simple_mtx.h" + #include "util/u_debug.h" +@@ -52,56 +51,14 @@ debug_backtrace_capture(debug_stack_frame *backtrace, + unsigned start_frame, + unsigned nr_frames) + { +- Backtrace *bt; + +- if (!nr_frames) +- return; +- +- bt = Backtrace::Create(BACKTRACE_CURRENT_PROCESS, +- BACKTRACE_CURRENT_THREAD); +- if (bt == NULL) { +- for (unsigned i = 0; i < nr_frames; i++) +- backtrace[i].procname = NULL; +- return; +- } +- +- /* Add one to exclude this call. Unwind already ignores itself. 
*/ +- bt->Unwind(start_frame + 1); +- +- simple_mtx_lock(&table_mutex); +- +- for (unsigned i = 0; i < nr_frames; i++) { +- const backtrace_frame_data_t* frame = bt->GetFrame(i); +- if (frame) { +- backtrace[i].procname = intern_symbol(frame->func_name.c_str()); +- backtrace[i].start_ip = frame->pc; +- backtrace[i].off = frame->func_offset; +- backtrace[i].map = intern_symbol(frame->map.Name().c_str()); +- backtrace[i].map_off = frame->rel_pc; +- } else { +- backtrace[i].procname = NULL; +- } +- } +- +- simple_mtx_unlock(&table_mutex); +- +- delete bt; + } + + void + debug_backtrace_dump(const debug_stack_frame *backtrace, + unsigned nr_frames) + { +- for (unsigned i = 0; i < nr_frames; i++) { +- if (backtrace[i].procname) +- debug_printf( +- "%s(+0x%x)\t%012" PRIx64 ": %s+0x%x\n", +- backtrace[i].map, +- backtrace[i].map_off, +- backtrace[i].start_ip, +- backtrace[i].procname, +- backtrace[i].off); +- } ++ + } + + void +@@ -109,14 +66,5 @@ debug_backtrace_print(FILE *f, + const debug_stack_frame *backtrace, + unsigned nr_frames) + { +- for (unsigned i = 0; i < nr_frames; i++) { +- if (backtrace[i].procname) +- fprintf(f, +- "%s(+0x%x)\t%012" PRIx64 ": %s+0x%x\n", +- backtrace[i].map, +- backtrace[i].map_off, +- backtrace[i].start_ip, +- backtrace[i].procname, +- backtrace[i].off); +- } ++ + } diff --git a/src/amd/vulkan/radv_buffer_view.c b/src/amd/vulkan/radv_buffer_view.c new file mode 100644 index 00000000000..f1e09d49dfe --- /dev/null +++ b/src/amd/vulkan/radv_buffer_view.c @@ -0,0 +1,149 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "gfx10_format_table.h" + +#include "radv_private.h" + +void +radv_make_texel_buffer_descriptor(struct radv_device *device, uint64_t va, VkFormat vk_format, unsigned offset, + unsigned range, uint32_t *state) +{ + const struct util_format_description *desc; + unsigned stride; + unsigned num_format, data_format; + int first_non_void; + enum pipe_swizzle swizzle[4]; + unsigned rsrc_word3; + + desc = vk_format_description(vk_format); + first_non_void = vk_format_get_first_non_void_channel(vk_format); + stride = desc->block.bits / 8; + + radv_compose_swizzle(desc, NULL, swizzle); + + va += offset; + + if (device->physical_device->rad_info.gfx_level != GFX8 && stride) { + range /= stride; + } + + rsrc_word3 = S_008F0C_DST_SEL_X(radv_map_swizzle(swizzle[0])) | S_008F0C_DST_SEL_Y(radv_map_swizzle(swizzle[1])) | + S_008F0C_DST_SEL_Z(radv_map_swizzle(swizzle[2])) | S_008F0C_DST_SEL_W(radv_map_swizzle(swizzle[3])); + + if (device->physical_device->rad_info.gfx_level >= GFX10) { + const struct gfx10_format *fmt = + &ac_get_gfx10_format_table(&device->physical_device->rad_info)[vk_format_to_pipe_format(vk_format)]; + + /* OOB_SELECT chooses the out-of-bounds check. + * + * GFX10: + * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE) + * - 1: index >= NUM_RECORDS + * - 2: NUM_RECORDS == 0 + * - 3: if SWIZZLE_ENABLE: + * swizzle_address >= NUM_RECORDS + * else: + * offset >= NUM_RECORDS + * + * GFX11: + * - 0: (index >= NUM_RECORDS) || (offset+payload > STRIDE) + * - 1: index >= NUM_RECORDS + * - 2: NUM_RECORDS == 0 + * - 3: if SWIZZLE_ENABLE && STRIDE: + * (index >= NUM_RECORDS) || ( offset+payload > STRIDE) + * else: + * offset+payload > NUM_RECORDS + */ + rsrc_word3 |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | + S_008F0C_RESOURCE_LEVEL(device->physical_device->rad_info.gfx_level < GFX11); + } else { + num_format = radv_translate_buffer_numformat(desc, first_non_void); + data_format = radv_translate_buffer_dataformat(desc, first_non_void); + + assert(data_format != V_008F0C_BUF_DATA_FORMAT_INVALID); + assert(num_format != ~0); + + rsrc_word3 |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format); + } + + state[0] = va; + state[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); + state[2] = range; + state[3] = rsrc_word3; +} + +void +radv_buffer_view_init(struct radv_buffer_view *view, struct radv_device *device, + const VkBufferViewCreateInfo *pCreateInfo) +{ + RADV_FROM_HANDLE(radv_buffer, buffer, pCreateInfo->buffer); + uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset; + + vk_buffer_view_init(&device->vk, &view->vk, pCreateInfo); + + view->bo = buffer->bo; + + radv_make_texel_buffer_descriptor(device, va, view->vk.format, view->vk.offset, view->vk.range, view->state); +} + +void +radv_buffer_view_finish(struct radv_buffer_view *view) +{ + vk_buffer_view_finish(&view->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +radv_CreateBufferView(VkDevice _device, const VkBufferViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkBufferView *pView) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_buffer_view *view; + + view = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*view), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!view) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + radv_buffer_view_init(view, device, pCreateInfo); + + *pView = radv_buffer_view_to_handle(view); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL 
+radv_DestroyBufferView(VkDevice _device, VkBufferView bufferView, const VkAllocationCallbacks *pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_buffer_view, view, bufferView); + + if (!view) + return; + + radv_buffer_view_finish(view); + vk_free2(&device->vk.alloc, pAllocator, view); +} diff --git a/src/amd/vulkan/radv_image_view.c b/src/amd/vulkan/radv_image_view.c new file mode 100644 index 00000000000..463ababade4 --- /dev/null +++ b/src/amd/vulkan/radv_image_view.c @@ -0,0 +1,945 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "radv_private.h" + +#include "gfx10_format_table.h" + +static unsigned +gfx9_border_color_swizzle(const struct util_format_description *desc) +{ + unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; + + if (desc->format == PIPE_FORMAT_S8_UINT) { + /* Swizzle of 8-bit stencil format is defined as _x__ but the hw expects XYZW. */ + assert(desc->swizzle[1] == PIPE_SWIZZLE_X); + return bc_swizzle; + } + + if (desc->swizzle[3] == PIPE_SWIZZLE_X) { + /* For the pre-defined border color values (white, opaque + * black, transparent black), the only thing that matters is + * that the alpha channel winds up in the correct place + * (because the RGB channels are all the same) so either of + * these enumerations will work. + */ + if (desc->swizzle[2] == PIPE_SWIZZLE_Y) + bc_swizzle = V_008F20_BC_SWIZZLE_WZYX; + else + bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ; + } else if (desc->swizzle[0] == PIPE_SWIZZLE_X) { + if (desc->swizzle[1] == PIPE_SWIZZLE_Y) + bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; + else + bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ; + } else if (desc->swizzle[1] == PIPE_SWIZZLE_X) { + bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ; + } else if (desc->swizzle[2] == PIPE_SWIZZLE_X) { + bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW; + } + + return bc_swizzle; +} + +static unsigned +radv_tex_dim(VkImageType image_type, VkImageViewType view_type, unsigned nr_layers, unsigned nr_samples, + bool is_storage_image, bool gfx9) +{ + if (view_type == VK_IMAGE_VIEW_TYPE_CUBE || view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) + return is_storage_image ? V_008F1C_SQ_RSRC_IMG_2D_ARRAY : V_008F1C_SQ_RSRC_IMG_CUBE; + + /* GFX9 allocates 1D textures as 2D. 
*/ + if (gfx9 && image_type == VK_IMAGE_TYPE_1D) + image_type = VK_IMAGE_TYPE_2D; + switch (image_type) { + case VK_IMAGE_TYPE_1D: + return nr_layers > 1 ? V_008F1C_SQ_RSRC_IMG_1D_ARRAY : V_008F1C_SQ_RSRC_IMG_1D; + case VK_IMAGE_TYPE_2D: + if (nr_samples > 1) + return nr_layers > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : V_008F1C_SQ_RSRC_IMG_2D_MSAA; + else + return nr_layers > 1 ? V_008F1C_SQ_RSRC_IMG_2D_ARRAY : V_008F1C_SQ_RSRC_IMG_2D; + case VK_IMAGE_TYPE_3D: + if (view_type == VK_IMAGE_VIEW_TYPE_3D) + return V_008F1C_SQ_RSRC_IMG_3D; + else + return V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + default: + unreachable("illegal image type"); + } +} + +static inline unsigned +si_tile_mode_index(const struct radv_image_plane *plane, unsigned level, bool stencil) +{ + if (stencil) + return plane->surface.u.legacy.zs.stencil_tiling_index[level]; + else + return plane->surface.u.legacy.tiling_index[level]; +} + +void +si_set_mutable_tex_desc_fields(struct radv_device *device, struct radv_image *image, + const struct legacy_surf_level *base_level_info, unsigned plane_id, unsigned base_level, + unsigned first_level, unsigned block_width, bool is_stencil, bool is_storage_image, + bool disable_compression, bool enable_write_compression, uint32_t *state, + const struct ac_surf_nbc_view *nbc_view) +{ + struct radv_image_plane *plane = &image->planes[plane_id]; + struct radv_image_binding *binding = image->disjoint ? &image->bindings[plane_id] : &image->bindings[0]; + uint64_t gpu_address = binding->bo ? radv_buffer_get_va(binding->bo) + binding->offset : 0; + uint64_t va = gpu_address; + uint8_t swizzle = plane->surface.tile_swizzle; + enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level; + uint64_t meta_va = 0; + if (gfx_level >= GFX9) { + if (is_stencil) + va += plane->surface.u.gfx9.zs.stencil_offset; + else + va += plane->surface.u.gfx9.surf_offset; + if (nbc_view && nbc_view->valid) { + va += nbc_view->base_address_offset; + swizzle = nbc_view->tile_swizzle; + } + } else + va += (uint64_t)base_level_info->offset_256B * 256; + + state[0] = va >> 8; + if (gfx_level >= GFX9 || base_level_info->mode == RADEON_SURF_MODE_2D) + state[0] |= swizzle; + state[1] &= C_008F14_BASE_ADDRESS_HI; + state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); + + if (gfx_level >= GFX8) { + state[6] &= C_008F28_COMPRESSION_EN; + state[7] = 0; + if (!disable_compression && radv_dcc_enabled(image, first_level)) { + meta_va = gpu_address + plane->surface.meta_offset; + if (gfx_level <= GFX8) + meta_va += plane->surface.u.legacy.color.dcc_level[base_level].dcc_offset; + + unsigned dcc_tile_swizzle = swizzle << 8; + dcc_tile_swizzle &= (1 << plane->surface.meta_alignment_log2) - 1; + meta_va |= dcc_tile_swizzle; + } else if (!disable_compression && radv_image_is_tc_compat_htile(image)) { + meta_va = gpu_address + plane->surface.meta_offset; + } + + if (meta_va) { + state[6] |= S_008F28_COMPRESSION_EN(1); + if (gfx_level <= GFX9) + state[7] = meta_va >> 8; + } + } + + /* GFX10.3+ can set a custom pitch for 1D and 2D non-array, but it must be a multiple + * of 256B. + * + * If an imported image is used with VK_IMAGE_VIEW_TYPE_2D_ARRAY, it may hang due to VM faults + * because DEPTH means pitch with 2D, but it means depth with 2D array. 
+ */ + if (device->physical_device->rad_info.gfx_level >= GFX10_3 && plane->surface.u.gfx9.uses_custom_pitch) { + assert((plane->surface.u.gfx9.surf_pitch * plane->surface.bpe) % 256 == 0); + assert(image->vk.image_type == VK_IMAGE_TYPE_2D); + assert(plane->surface.is_linear); + assert(G_00A00C_TYPE(state[3]) == V_008F1C_SQ_RSRC_IMG_2D); + unsigned pitch = plane->surface.u.gfx9.surf_pitch; + + /* Subsampled images have the pitch in the units of blocks. */ + if (plane->surface.blk_w == 2) + pitch *= 2; + + state[4] &= C_00A010_DEPTH & C_00A010_PITCH_MSB; + state[4] |= S_00A010_DEPTH(pitch - 1) | /* DEPTH contains low bits of PITCH. */ + S_00A010_PITCH_MSB((pitch - 1) >> 13); + } + + if (gfx_level >= GFX10) { + state[3] &= C_00A00C_SW_MODE; + + if (is_stencil) { + state[3] |= S_00A00C_SW_MODE(plane->surface.u.gfx9.zs.stencil_swizzle_mode); + } else { + state[3] |= S_00A00C_SW_MODE(plane->surface.u.gfx9.swizzle_mode); + } + + state[6] &= C_00A018_META_DATA_ADDRESS_LO & C_00A018_META_PIPE_ALIGNED; + + if (meta_va) { + struct gfx9_surf_meta_flags meta = { + .rb_aligned = 1, + .pipe_aligned = 1, + }; + + if (!(plane->surface.flags & RADEON_SURF_Z_OR_SBUFFER)) + meta = plane->surface.u.gfx9.color.dcc; + + if (radv_dcc_enabled(image, first_level) && is_storage_image && enable_write_compression) + state[6] |= S_00A018_WRITE_COMPRESS_ENABLE(1); + + state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8); + } + + state[7] = meta_va >> 16; + } else if (gfx_level == GFX9) { + state[3] &= C_008F1C_SW_MODE; + state[4] &= C_008F20_PITCH; + + if (is_stencil) { + state[3] |= S_008F1C_SW_MODE(plane->surface.u.gfx9.zs.stencil_swizzle_mode); + state[4] |= S_008F20_PITCH(plane->surface.u.gfx9.zs.stencil_epitch); + } else { + state[3] |= S_008F1C_SW_MODE(plane->surface.u.gfx9.swizzle_mode); + state[4] |= S_008F20_PITCH(plane->surface.u.gfx9.epitch); + } + + state[5] &= C_008F24_META_DATA_ADDRESS & C_008F24_META_PIPE_ALIGNED & C_008F24_META_RB_ALIGNED; + if (meta_va) { + struct gfx9_surf_meta_flags meta = { + .rb_aligned = 1, + .pipe_aligned = 1, + }; + + if (!(plane->surface.flags & RADEON_SURF_Z_OR_SBUFFER)) + meta = plane->surface.u.gfx9.color.dcc; + + state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) | S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) | + S_008F24_META_RB_ALIGNED(meta.rb_aligned); + } + } else { + /* GFX6-GFX8 */ + unsigned pitch = base_level_info->nblk_x * block_width; + unsigned index = si_tile_mode_index(plane, base_level, is_stencil); + + state[3] &= C_008F1C_TILING_INDEX; + state[3] |= S_008F1C_TILING_INDEX(index); + state[4] &= C_008F20_PITCH; + state[4] |= S_008F20_PITCH(pitch - 1); + } +} + +/** + * Build the sampler view descriptor for a texture (GFX10). 
+ */ +static void +gfx10_make_texture_descriptor(struct radv_device *device, struct radv_image *image, bool is_storage_image, + VkImageViewType view_type, VkFormat vk_format, const VkComponentMapping *mapping, + unsigned first_level, unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, float min_lod, uint32_t *state, + uint32_t *fmask_state, VkImageCreateFlags img_create_flags, + const struct ac_surf_nbc_view *nbc_view, const VkImageViewSlicedCreateInfoEXT *sliced_3d) +{ + const struct util_format_description *desc; + enum pipe_swizzle swizzle[4]; + unsigned img_format; + unsigned type; + + desc = vk_format_description(vk_format); + + /* For emulated ETC2 without alpha we need to override the format to a 3-componenent format, so + * that border colors work correctly (alpha forced to 1). Since Vulkan has no such format, + * this uses the Gallium formats to set the description. */ + if (image->vk.format == VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK && vk_format == VK_FORMAT_R8G8B8A8_UNORM) { + desc = util_format_description(PIPE_FORMAT_R8G8B8X8_UNORM); + } else if (image->vk.format == VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK && vk_format == VK_FORMAT_R8G8B8A8_SRGB) { + desc = util_format_description(PIPE_FORMAT_R8G8B8X8_SRGB); + } + + img_format = + ac_get_gfx10_format_table(&device->physical_device->rad_info)[vk_format_to_pipe_format(vk_format)].img_format; + + radv_compose_swizzle(desc, mapping, swizzle); + + if (img_create_flags & VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT) { + assert(image->vk.image_type == VK_IMAGE_TYPE_3D); + type = V_008F1C_SQ_RSRC_IMG_3D; + } else { + type = radv_tex_dim(image->vk.image_type, view_type, image->vk.array_layers, image->vk.samples, is_storage_image, + device->physical_device->rad_info.gfx_level == GFX9); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = image->vk.array_layers; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (view_type != VK_IMAGE_VIEW_TYPE_3D) + depth = image->vk.array_layers; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = image->vk.array_layers / 6; + + state[0] = 0; + state[1] = S_00A004_FORMAT(img_format) | S_00A004_WIDTH_LO(width - 1); + state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) | + S_00A008_RESOURCE_LEVEL(device->physical_device->rad_info.gfx_level < GFX11); + state[3] = S_00A00C_DST_SEL_X(radv_map_swizzle(swizzle[0])) | S_00A00C_DST_SEL_Y(radv_map_swizzle(swizzle[1])) | + S_00A00C_DST_SEL_Z(radv_map_swizzle(swizzle[2])) | S_00A00C_DST_SEL_W(radv_map_swizzle(swizzle[3])) | + S_00A00C_BASE_LEVEL(image->vk.samples > 1 ? 0 : first_level) | + S_00A00C_LAST_LEVEL(image->vk.samples > 1 ? util_logbase2(image->vk.samples) : last_level) | + S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc)) | S_00A00C_TYPE(type); + /* Depth is the the last accessible layer on gfx9+. The hw doesn't need + * to know the total number of layers. + */ + state[4] = + S_00A010_DEPTH(type == V_008F1C_SQ_RSRC_IMG_3D ? depth - 1 : last_layer) | S_00A010_BASE_ARRAY(first_layer); + state[5] = S_00A014_ARRAY_PITCH(0) | S_00A014_PERF_MOD(4); + state[6] = 0; + state[7] = 0; + + if (img_create_flags & VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT) { + assert(type == V_008F1C_SQ_RSRC_IMG_3D); + + /* ARRAY_PITCH is only meaningful for 3D images, 0 means SRV, 1 means UAV. + * In SRV mode, BASE_ARRAY is ignored and DEPTH is the last slice of mipmap level 0. 
+ * In UAV mode, BASE_ARRAY is the first slice and DEPTH is the last slice of the bound level. + */ + state[4] &= C_00A010_DEPTH; + state[4] |= S_00A010_DEPTH(!is_storage_image ? depth - 1 : u_minify(depth, first_level) - 1); + state[5] |= S_00A014_ARRAY_PITCH(is_storage_image); + } else if (sliced_3d) { + unsigned total = u_minify(depth, first_level); + + assert(type == V_008F1C_SQ_RSRC_IMG_3D && is_storage_image); + + unsigned first_slice = sliced_3d->sliceOffset; + unsigned slice_count = sliced_3d->sliceCount == VK_REMAINING_3D_SLICES_EXT + ? MAX2(1, total - sliced_3d->sliceOffset) + : sliced_3d->sliceCount; + unsigned last_slice = first_slice + slice_count - 1; + + state[4] = 0; + state[4] |= S_00A010_DEPTH(last_slice) | S_00A010_BASE_ARRAY(first_slice); + state[5] |= S_00A014_ARRAY_PITCH(1); + } + + unsigned max_mip = image->vk.samples > 1 ? util_logbase2(image->vk.samples) : image->vk.mip_levels - 1; + if (nbc_view && nbc_view->valid) + max_mip = nbc_view->num_levels - 1; + + unsigned min_lod_clamped = radv_float_to_ufixed(CLAMP(min_lod, 0, 15), 8); + if (device->physical_device->rad_info.gfx_level >= GFX11) { + state[1] |= S_00A004_MAX_MIP(max_mip); + state[5] |= S_00A014_MIN_LOD_LO(min_lod_clamped); + state[6] |= S_00A018_MIN_LOD_HI(min_lod_clamped >> 5); + } else { + state[1] |= S_00A004_MIN_LOD(min_lod_clamped); + state[5] |= S_00A014_MAX_MIP(max_mip); + } + + if (radv_dcc_enabled(image, first_level)) { + state[6] |= + S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_00A018_MAX_COMPRESSED_BLOCK_SIZE(image->planes[0].surface.u.gfx9.color.dcc.max_compressed_block_size) | + S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(device, vk_format)); + } + + if (radv_image_get_iterate256(device, image)) { + state[6] |= S_00A018_ITERATE_256(1); + } + + /* Initialize the sampler view for FMASK. 
*/ + if (fmask_state) { + if (radv_image_has_fmask(image)) { + uint64_t gpu_address = radv_buffer_get_va(image->bindings[0].bo); + uint32_t format; + uint64_t va; + + assert(image->plane_count == 1); + + va = gpu_address + image->bindings[0].offset + image->planes[0].surface.fmask_offset; + + switch (image->vk.samples) { + case 2: + format = V_008F0C_GFX10_FORMAT_FMASK8_S2_F2; + break; + case 4: + format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F4; + break; + case 8: + format = V_008F0C_GFX10_FORMAT_FMASK32_S8_F8; + break; + default: + unreachable("invalid nr_samples"); + } + + fmask_state[0] = (va >> 8) | image->planes[0].surface.fmask_tile_swizzle; + fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | S_00A004_FORMAT(format) | S_00A004_WIDTH_LO(width - 1); + fmask_state[2] = + S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) | S_00A008_RESOURCE_LEVEL(1); + fmask_state[3] = + S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_00A00C_SW_MODE(image->planes[0].surface.u.gfx9.color.fmask_swizzle_mode) | + S_00A00C_TYPE(radv_tex_dim(image->vk.image_type, view_type, image->vk.array_layers, 0, false, false)); + fmask_state[4] = S_00A010_DEPTH(last_layer) | S_00A010_BASE_ARRAY(first_layer); + fmask_state[5] = 0; + fmask_state[6] = S_00A018_META_PIPE_ALIGNED(1); + fmask_state[7] = 0; + + if (radv_image_is_tc_compat_cmask(image)) { + va = gpu_address + image->bindings[0].offset + image->planes[0].surface.cmask_offset; + + fmask_state[6] |= S_00A018_COMPRESSION_EN(1); + fmask_state[6] |= S_00A018_META_DATA_ADDRESS_LO(va >> 8); + fmask_state[7] |= va >> 16; + } + } else + memset(fmask_state, 0, 8 * 4); + } +} + +/** + * Build the sampler view descriptor for a texture (SI-GFX9) + */ +static void +si_make_texture_descriptor(struct radv_device *device, struct radv_image *image, bool is_storage_image, + VkImageViewType view_type, VkFormat vk_format, const VkComponentMapping *mapping, + unsigned first_level, unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, float min_lod, uint32_t *state, + uint32_t *fmask_state, VkImageCreateFlags img_create_flags) +{ + const struct util_format_description *desc; + enum pipe_swizzle swizzle[4]; + int first_non_void; + unsigned num_format, data_format, type; + + desc = vk_format_description(vk_format); + + /* For emulated ETC2 without alpha we need to override the format to a 3-componenent format, so + * that border colors work correctly (alpha forced to 1). Since Vulkan has no such format, + * this uses the Gallium formats to set the description. */ + if (image->vk.format == VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK && vk_format == VK_FORMAT_R8G8B8A8_UNORM) { + desc = util_format_description(PIPE_FORMAT_R8G8B8X8_UNORM); + } else if (image->vk.format == VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK && vk_format == VK_FORMAT_R8G8B8A8_SRGB) { + desc = util_format_description(PIPE_FORMAT_R8G8B8X8_SRGB); + } + + radv_compose_swizzle(desc, mapping, swizzle); + + first_non_void = vk_format_get_first_non_void_channel(vk_format); + + num_format = radv_translate_tex_numformat(vk_format, desc, first_non_void); + if (num_format == ~0) { + num_format = 0; + } + + data_format = radv_translate_tex_dataformat(vk_format, desc, first_non_void); + if (data_format == ~0) { + data_format = 0; + } + + /* S8 with either Z16 or Z32 HTILE need a special format. 
*/ + if (device->physical_device->rad_info.gfx_level == GFX9 && vk_format == VK_FORMAT_S8_UINT && + radv_image_is_tc_compat_htile(image)) { + if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) + data_format = V_008F14_IMG_DATA_FORMAT_S8_32; + else if (image->vk.format == VK_FORMAT_D16_UNORM_S8_UINT) + data_format = V_008F14_IMG_DATA_FORMAT_S8_16; + } + + if (device->physical_device->rad_info.gfx_level == GFX9 && + img_create_flags & VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT) { + assert(image->vk.image_type == VK_IMAGE_TYPE_3D); + type = V_008F1C_SQ_RSRC_IMG_3D; + } else { + type = radv_tex_dim(image->vk.image_type, view_type, image->vk.array_layers, image->vk.samples, is_storage_image, + device->physical_device->rad_info.gfx_level == GFX9); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = image->vk.array_layers; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (view_type != VK_IMAGE_VIEW_TYPE_3D) + depth = image->vk.array_layers; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = image->vk.array_layers / 6; + + state[0] = 0; + state[1] = (S_008F14_MIN_LOD(radv_float_to_ufixed(CLAMP(min_lod, 0, 15), 8)) | S_008F14_DATA_FORMAT(data_format) | + S_008F14_NUM_FORMAT(num_format)); + state[2] = (S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1) | S_008F18_PERF_MOD(4)); + state[3] = (S_008F1C_DST_SEL_X(radv_map_swizzle(swizzle[0])) | S_008F1C_DST_SEL_Y(radv_map_swizzle(swizzle[1])) | + S_008F1C_DST_SEL_Z(radv_map_swizzle(swizzle[2])) | S_008F1C_DST_SEL_W(radv_map_swizzle(swizzle[3])) | + S_008F1C_BASE_LEVEL(image->vk.samples > 1 ? 0 : first_level) | + S_008F1C_LAST_LEVEL(image->vk.samples > 1 ? util_logbase2(image->vk.samples) : last_level) | + S_008F1C_TYPE(type)); + state[4] = 0; + state[5] = S_008F24_BASE_ARRAY(first_layer); + state[6] = 0; + state[7] = 0; + + if (device->physical_device->rad_info.gfx_level == GFX9) { + unsigned bc_swizzle = gfx9_border_color_swizzle(desc); + + /* Depth is the last accessible layer on Gfx9. + * The hw doesn't need to know the total number of layers. + */ + if (type == V_008F1C_SQ_RSRC_IMG_3D) + state[4] |= S_008F20_DEPTH(depth - 1); + else + state[4] |= S_008F20_DEPTH(last_layer); + + state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle); + state[5] |= S_008F24_MAX_MIP(image->vk.samples > 1 ? util_logbase2(image->vk.samples) : image->vk.mip_levels - 1); + } else { + state[3] |= S_008F1C_POW2_PAD(image->vk.mip_levels > 1); + state[4] |= S_008F20_DEPTH(depth - 1); + state[5] |= S_008F24_LAST_ARRAY(last_layer); + } + if (!(image->planes[0].surface.flags & RADEON_SURF_Z_OR_SBUFFER) && image->planes[0].surface.meta_offset) { + state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(device, vk_format)); + } else { + if (device->instance->disable_aniso_single_level) { + /* The last dword is unused by hw. The shader uses it to clear + * bits in the first dword of sampler state. + */ + if (device->physical_device->rad_info.gfx_level <= GFX7 && image->vk.samples <= 1) { + if (first_level == last_level) + state[7] = C_008F30_MAX_ANISO_RATIO; + else + state[7] = 0xffffffff; + } + } + } + + /* Initialize the sampler view for FMASK. 
*/ + if (fmask_state) { + if (radv_image_has_fmask(image)) { + uint32_t fmask_format; + uint64_t gpu_address = radv_buffer_get_va(image->bindings[0].bo); + uint64_t va; + + assert(image->plane_count == 1); + + va = gpu_address + image->bindings[0].offset + image->planes[0].surface.fmask_offset; + + if (device->physical_device->rad_info.gfx_level == GFX9) { + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK; + switch (image->vk.samples) { + case 2: + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_2_2; + break; + case 4: + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_4; + break; + case 8: + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_8_8; + break; + default: + unreachable("invalid nr_samples"); + } + } else { + switch (image->vk.samples) { + case 2: + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; + break; + case 4: + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4; + break; + case 8: + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8; + break; + default: + assert(0); + fmask_format = V_008F14_IMG_DATA_FORMAT_INVALID; + } + num_format = V_008F14_IMG_NUM_FORMAT_UINT; + } + + fmask_state[0] = va >> 8; + fmask_state[0] |= image->planes[0].surface.fmask_tile_swizzle; + fmask_state[1] = + S_008F14_BASE_ADDRESS_HI(va >> 40) | S_008F14_DATA_FORMAT(fmask_format) | S_008F14_NUM_FORMAT(num_format); + fmask_state[2] = S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1); + fmask_state[3] = + S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_008F1C_TYPE(radv_tex_dim(image->vk.image_type, view_type, image->vk.array_layers, 0, false, false)); + fmask_state[4] = 0; + fmask_state[5] = S_008F24_BASE_ARRAY(first_layer); + fmask_state[6] = 0; + fmask_state[7] = 0; + + if (device->physical_device->rad_info.gfx_level == GFX9) { + fmask_state[3] |= S_008F1C_SW_MODE(image->planes[0].surface.u.gfx9.color.fmask_swizzle_mode); + fmask_state[4] |= + S_008F20_DEPTH(last_layer) | S_008F20_PITCH(image->planes[0].surface.u.gfx9.color.fmask_epitch); + fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(1) | S_008F24_META_RB_ALIGNED(1); + + if (radv_image_is_tc_compat_cmask(image)) { + va = gpu_address + image->bindings[0].offset + image->planes[0].surface.cmask_offset; + + fmask_state[5] |= S_008F24_META_DATA_ADDRESS(va >> 40); + fmask_state[6] |= S_008F28_COMPRESSION_EN(1); + fmask_state[7] |= va >> 8; + } + } else { + fmask_state[3] |= S_008F1C_TILING_INDEX(image->planes[0].surface.u.legacy.color.fmask.tiling_index); + fmask_state[4] |= S_008F20_DEPTH(depth - 1) | + S_008F20_PITCH(image->planes[0].surface.u.legacy.color.fmask.pitch_in_pixels - 1); + fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer); + + if (radv_image_is_tc_compat_cmask(image)) { + va = gpu_address + image->bindings[0].offset + image->planes[0].surface.cmask_offset; + + fmask_state[6] |= S_008F28_COMPRESSION_EN(1); + fmask_state[7] |= va >> 8; + } + } + } else + memset(fmask_state, 0, 8 * 4); + } +} + +void +radv_make_texture_descriptor(struct radv_device *device, struct radv_image *image, bool is_storage_image, + VkImageViewType view_type, VkFormat vk_format, const VkComponentMapping *mapping, + unsigned first_level, unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, float min_lod, uint32_t *state, + uint32_t *fmask_state, VkImageCreateFlags img_create_flags, + const struct ac_surf_nbc_view *nbc_view, const VkImageViewSlicedCreateInfoEXT *sliced_3d) +{ + if 
(device->physical_device->rad_info.gfx_level >= GFX10) { + gfx10_make_texture_descriptor(device, image, is_storage_image, view_type, vk_format, mapping, first_level, + last_level, first_layer, last_layer, width, height, depth, min_lod, state, + fmask_state, img_create_flags, nbc_view, sliced_3d); + } else { + si_make_texture_descriptor(device, image, is_storage_image, view_type, vk_format, mapping, first_level, + last_level, first_layer, last_layer, width, height, depth, min_lod, state, fmask_state, + img_create_flags); + } +} + +static inline void +compute_non_block_compressed_view(struct radv_device *device, const struct radv_image_view *iview, + struct ac_surf_nbc_view *nbc_view) +{ + const struct radv_image *image = iview->image; + const struct radeon_surf *surf = &image->planes[0].surface; + struct ac_addrlib *addrlib = device->ws->get_addrlib(device->ws); + struct ac_surf_info surf_info = radv_get_ac_surf_info(device, image); + + ac_surface_compute_nbc_view(addrlib, &device->physical_device->rad_info, surf, &surf_info, iview->vk.base_mip_level, + iview->vk.base_array_layer, nbc_view); +} + +static void +radv_image_view_make_descriptor(struct radv_image_view *iview, struct radv_device *device, VkFormat vk_format, + const VkComponentMapping *components, float min_lod, bool is_storage_image, + bool disable_compression, bool enable_compression, unsigned plane_id, + unsigned descriptor_plane_id, VkImageCreateFlags img_create_flags, + const struct ac_surf_nbc_view *nbc_view, + const VkImageViewSlicedCreateInfoEXT *sliced_3d) +{ + struct radv_image *image = iview->image; + struct radv_image_plane *plane = &image->planes[plane_id]; + bool is_stencil = iview->vk.aspects == VK_IMAGE_ASPECT_STENCIL_BIT; + unsigned first_layer = iview->vk.base_array_layer; + uint32_t blk_w; + union radv_descriptor *descriptor; + uint32_t hw_level = 0; + + if (is_storage_image) { + descriptor = &iview->storage_descriptor; + } else { + descriptor = &iview->descriptor; + } + + assert(vk_format_get_plane_count(vk_format) == 1); + assert(plane->surface.blk_w % vk_format_get_blockwidth(plane->format) == 0); + blk_w = plane->surface.blk_w / vk_format_get_blockwidth(plane->format) * vk_format_get_blockwidth(vk_format); + + if (device->physical_device->rad_info.gfx_level >= GFX9) { + hw_level = iview->vk.base_mip_level; + if (nbc_view->valid) { + hw_level = nbc_view->level; + iview->extent.width = nbc_view->width; + iview->extent.height = nbc_view->height; + + /* Clear the base array layer because addrlib adds it as part of the base addr offset. */ + first_layer = 0; + } + } + + radv_make_texture_descriptor(device, image, is_storage_image, iview->vk.view_type, vk_format, components, hw_level, + hw_level + iview->vk.level_count - 1, first_layer, + iview->vk.base_array_layer + iview->vk.layer_count - 1, + vk_format_get_plane_width(image->vk.format, plane_id, iview->extent.width), + vk_format_get_plane_height(image->vk.format, plane_id, iview->extent.height), + iview->extent.depth, min_lod, descriptor->plane_descriptors[descriptor_plane_id], + descriptor_plane_id || is_storage_image ? 
NULL : descriptor->fmask_descriptor, + img_create_flags, nbc_view, sliced_3d); + + const struct legacy_surf_level *base_level_info = NULL; + if (device->physical_device->rad_info.gfx_level <= GFX8) { + if (is_stencil) + base_level_info = &plane->surface.u.legacy.zs.stencil_level[iview->vk.base_mip_level]; + else + base_level_info = &plane->surface.u.legacy.level[iview->vk.base_mip_level]; + } + + bool enable_write_compression = radv_image_use_dcc_image_stores(device, image); + if (is_storage_image && !(enable_write_compression || enable_compression)) + disable_compression = true; + si_set_mutable_tex_desc_fields(device, image, base_level_info, plane_id, iview->vk.base_mip_level, + iview->vk.base_mip_level, blk_w, is_stencil, is_storage_image, disable_compression, + enable_write_compression, descriptor->plane_descriptors[descriptor_plane_id], + nbc_view); +} + +/** + * Determine if the given image view can be fast cleared. + */ +static bool +radv_image_view_can_fast_clear(const struct radv_device *device, const struct radv_image_view *iview) +{ + struct radv_image *image; + + if (!iview) + return false; + image = iview->image; + + /* Only fast clear if the image itself can be fast cleared. */ + if (!radv_image_can_fast_clear(device, image)) + return false; + + /* Only fast clear if all layers are bound. */ + if (iview->vk.base_array_layer > 0 || iview->vk.layer_count != image->vk.array_layers) + return false; + + /* Only fast clear if the view covers the whole image. */ + if (!radv_image_extent_compare(image, &iview->extent)) + return false; + + return true; +} + +void +radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + const VkImageViewCreateInfo *pCreateInfo, VkImageCreateFlags img_create_flags, + const struct radv_image_view_extra_create_info *extra_create_info) +{ + RADV_FROM_HANDLE(radv_image, image, pCreateInfo->image); + const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; + uint32_t plane_count = 1; + float min_lod = 0.0f; + + const struct VkImageViewMinLodCreateInfoEXT *min_lod_info = + vk_find_struct_const(pCreateInfo->pNext, IMAGE_VIEW_MIN_LOD_CREATE_INFO_EXT); + + if (min_lod_info) + min_lod = min_lod_info->minLod; + + const struct VkImageViewSlicedCreateInfoEXT *sliced_3d = + vk_find_struct_const(pCreateInfo->pNext, IMAGE_VIEW_SLICED_CREATE_INFO_EXT); + + bool from_client = extra_create_info && extra_create_info->from_client; + vk_image_view_init(&device->vk, &iview->vk, !from_client, pCreateInfo); + + switch (image->vk.image_type) { + case VK_IMAGE_TYPE_1D: + case VK_IMAGE_TYPE_2D: + assert(range->baseArrayLayer + vk_image_subresource_layer_count(&image->vk, range) - 1 <= image->vk.array_layers); + break; + case VK_IMAGE_TYPE_3D: + assert(range->baseArrayLayer + vk_image_subresource_layer_count(&image->vk, range) - 1 <= + radv_minify(image->vk.extent.depth, range->baseMipLevel)); + break; + default: + unreachable("bad VkImageType"); + } + iview->image = image; + iview->plane_id = radv_plane_from_aspect(pCreateInfo->subresourceRange.aspectMask); + iview->nbc_view.valid = false; + + /* If the image has an Android external format, pCreateInfo->format will be + * VK_FORMAT_UNDEFINED. */ + if (iview->vk.format == VK_FORMAT_UNDEFINED) { + iview->vk.format = image->vk.format; + iview->vk.view_format = image->vk.format; + } + + /* Split out the right aspect. Note that for internal meta code we sometimes + * use an equivalent color format for the aspect so we first have to check + * if we actually got depth/stencil formats. 
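+ * As a worked example (assuming the stock vk_format helpers): a D24S8 image
+ * viewed through VK_IMAGE_ASPECT_STENCIL_BIT ends up with VK_FORMAT_S8_UINT
+ * as its view format, the depth aspect maps it to
+ * VK_FORMAT_X8_D24_UNORM_PACK32, and a meta view that already carries an
+ * equivalent color format is left untouched by the checks below.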
*/ + if (iview->vk.aspects == VK_IMAGE_ASPECT_STENCIL_BIT) { + if (vk_format_has_stencil(iview->vk.view_format)) + iview->vk.view_format = vk_format_stencil_only(iview->vk.view_format); + } else if (iview->vk.aspects == VK_IMAGE_ASPECT_DEPTH_BIT) { + if (vk_format_has_depth(iview->vk.view_format)) + iview->vk.view_format = vk_format_depth_only(iview->vk.view_format); + } + + if (vk_format_get_plane_count(image->vk.format) > 1 && + pCreateInfo->subresourceRange.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT) { + plane_count = vk_format_get_plane_count(iview->vk.format); + } + + /* when the view format is emulated, redirect the view to the hidden plane 1 */ + if (radv_is_format_emulated(device->physical_device, iview->vk.format)) { + assert(radv_is_format_emulated(device->physical_device, image->vk.format)); + iview->plane_id = 1; + iview->vk.view_format = image->planes[iview->plane_id].format; + iview->vk.format = image->planes[iview->plane_id].format; + plane_count = 1; + } + + if (device->physical_device->rad_info.gfx_level >= GFX9) { + iview->extent = (VkExtent3D){ + .width = image->vk.extent.width, + .height = image->vk.extent.height, + .depth = image->vk.extent.depth, + }; + } else { + iview->extent = iview->vk.extent; + } + + if (iview->vk.format != image->planes[iview->plane_id].format) { + const struct radv_image_plane *plane = &image->planes[iview->plane_id]; + unsigned view_bw = vk_format_get_blockwidth(iview->vk.format); + unsigned view_bh = vk_format_get_blockheight(iview->vk.format); + unsigned plane_bw = vk_format_get_blockwidth(plane->format); + unsigned plane_bh = vk_format_get_blockheight(plane->format); + + iview->extent.width = DIV_ROUND_UP(iview->extent.width * view_bw, plane_bw); + iview->extent.height = DIV_ROUND_UP(iview->extent.height * view_bh, plane_bh); + + /* Comment ported from amdvlk - + * If we have the following image: + * Uncompressed pixels Compressed block sizes (4x4) + * mip0: 22 x 22 6 x 6 + * mip1: 11 x 11 3 x 3 + * mip2: 5 x 5 2 x 2 + * mip3: 2 x 2 1 x 1 + * mip4: 1 x 1 1 x 1 + * + * On GFX9 the descriptor is always programmed with the WIDTH and HEIGHT of the base level and + * the HW is calculating the degradation of the block sizes down the mip-chain as follows + * (straight-up divide-by-two integer math): mip0: 6x6 mip1: 3x3 mip2: 1x1 mip3: 1x1 + * + * This means that mip2 will be missing texels. + * + * Fix this by calculating the base mip's width and height, then convert + * that, and round it back up to get the level 0 size. Clamp the + * converted size between the original values, and the physical extent + * of the base mipmap. + * + * On GFX10 we have to take care to not go over the physical extent + * of the base mipmap as otherwise the GPU computes a different layout. + * Note that the GPU does use the same base-mip dimensions for both a + * block compatible format and the compressed format, so even if we take + * the plain converted dimensions the physical layout is correct. + */ + if (device->physical_device->rad_info.gfx_level >= GFX9 && vk_format_is_block_compressed(plane->format) && + !vk_format_is_block_compressed(iview->vk.format)) { + /* If we have multiple levels in the view we should ideally take the last level, + * but the mip calculation has a max(..., 1) so walking back to the base mip in an + * useful way is hard. 
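+ * Worked example for the 22x22 case above, viewing only mip2 through an
+ * uncompressed single-texel-block format (view_bw = 1, plane_bw = 4):
+ *   converted base width  = DIV_ROUND_UP(22 * 1, 4)          = 6
+ *   mip2 width in texels  = radv_minify(22, 2)               = 5
+ *   mip2 width converted  = DIV_ROUND_UP(5 * 1, 4)           = 2
+ *   new base width        = CLAMP(2 << 2, 6, base_mip_width) = 8
+ * so the hardware now derives mip2 as 8 >> 2 = 2 blocks, matching the real
+ * 2x2 (assuming the padded base_mip_width is at least 8; if it is not, the
+ * GFX10 addrlib fallback below takes over).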
*/ + if (iview->vk.level_count > 1) { + iview->extent.width = plane->surface.u.gfx9.base_mip_width; + iview->extent.height = plane->surface.u.gfx9.base_mip_height; + } else { + unsigned lvl_width = radv_minify(image->vk.extent.width, range->baseMipLevel); + unsigned lvl_height = radv_minify(image->vk.extent.height, range->baseMipLevel); + + lvl_width = DIV_ROUND_UP(lvl_width * view_bw, plane_bw); + lvl_height = DIV_ROUND_UP(lvl_height * view_bh, plane_bh); + + iview->extent.width = + CLAMP(lvl_width << range->baseMipLevel, iview->extent.width, plane->surface.u.gfx9.base_mip_width); + iview->extent.height = + CLAMP(lvl_height << range->baseMipLevel, iview->extent.height, plane->surface.u.gfx9.base_mip_height); + + /* If the hardware-computed extent is still be too small, on GFX10 + * we can attempt another workaround provided by addrlib that + * changes the descriptor's base level, and adjusts the address and + * extents accordingly. + */ + if (device->physical_device->rad_info.gfx_level >= GFX10 && + (radv_minify(iview->extent.width, range->baseMipLevel) < lvl_width || + radv_minify(iview->extent.height, range->baseMipLevel) < lvl_height) && + iview->vk.layer_count == 1) { + compute_non_block_compressed_view(device, iview, &iview->nbc_view); + } + } + } + } + + iview->support_fast_clear = radv_image_view_can_fast_clear(device, iview); + iview->disable_dcc_mrt = extra_create_info ? extra_create_info->disable_dcc_mrt : false; + + bool disable_compression = extra_create_info ? extra_create_info->disable_compression : false; + bool enable_compression = extra_create_info ? extra_create_info->enable_compression : false; + for (unsigned i = 0; i < plane_count; ++i) { + VkFormat format = vk_format_get_plane_format(iview->vk.view_format, i); + radv_image_view_make_descriptor(iview, device, format, &pCreateInfo->components, min_lod, false, + disable_compression, enable_compression, iview->plane_id + i, i, img_create_flags, + &iview->nbc_view, NULL); + radv_image_view_make_descriptor(iview, device, format, &pCreateInfo->components, min_lod, true, + disable_compression, enable_compression, iview->plane_id + i, i, img_create_flags, + &iview->nbc_view, sliced_3d); + } +} + +void +radv_image_view_finish(struct radv_image_view *iview) +{ + vk_image_view_finish(&iview->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +radv_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkImageView *pView) +{ + RADV_FROM_HANDLE(radv_image, image, pCreateInfo->image); + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_image_view *view; + + view = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*view), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (view == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + radv_image_view_init(view, device, pCreateInfo, image->vk.create_flags, + &(struct radv_image_view_extra_create_info){.from_client = true}); + + *pView = radv_image_view_to_handle(view); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +radv_DestroyImageView(VkDevice _device, VkImageView _iview, const VkAllocationCallbacks *pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_image_view, iview, _iview); + + if (!iview) + return; + + radv_image_view_finish(iview); + vk_free2(&device->vk.alloc, pAllocator, iview); +} diff --git a/src/amd/vulkan/radv_sdma.h b/src/amd/vulkan/radv_sdma.h new file mode 100644 index 00000000000..5f5a701e6f1 --- /dev/null +++ b/src/amd/vulkan/radv_sdma.h @@ -0,0 
+1,93 @@ +/* + * Copyright © 2023 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef RADV_SDMA_H +#define RADV_SDMA_H + +#include "radv_private.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct radv_sdma_surf { + VkExtent3D extent; /* Image extent. */ + VkOffset3D offset; /* Image offset. */ + uint64_t va; /* Virtual address of image data. */ + unsigned bpp; /* Bytes per pixel. */ + unsigned blk_w; /* Image format block width in pixels. */ + unsigned blk_h; /* Image format block height in pixels. */ + bool is_linear; /* Whether the image is linear. */ + + union { + /* linear images only */ + struct { + unsigned pitch; /* Row pitch in bytes. */ + unsigned slice_pitch; /* Slice pitch in bytes. */ + }; + /* tiled images only */ + struct { + uint64_t meta_va; /* Virtual address of metadata. */ + uint32_t meta_config; /* Metadata configuration DWORD. */ + uint32_t header_dword; /* Extra bits for the copy packet header. */ + uint32_t info_dword; /* Image information DWORD. 
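+                                  (As with the linear fields, these are only
+                                  meaningful for the union half selected by
+                                  is_linear above.)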
*/ + }; + }; +}; + +ALWAYS_INLINE static VkExtent3D +radv_sdma_get_copy_extent(const struct radv_image *const image, const VkImageSubresourceLayers subresource, + VkExtent3D extent) +{ + if (image->vk.image_type != VK_IMAGE_TYPE_3D) + extent.depth = vk_image_subresource_layer_count(&image->vk, &subresource); + + return extent; +} + +struct radv_sdma_surf radv_sdma_get_buf_surf(const struct radv_buffer *const buffer, + const struct radv_image *const image, + const VkBufferImageCopy2 *const region, + const VkImageAspectFlags aspect_mask); +struct radv_sdma_surf radv_sdma_get_surf(const struct radv_device *const device, const struct radv_image *const image, + const VkImageSubresourceLayers subresource, const VkOffset3D offset, + const VkImageAspectFlags aspect_mask); +void radv_sdma_copy_buffer_image(const struct radv_device *device, struct radeon_cmdbuf *cs, + const struct radv_sdma_surf *buf, const struct radv_sdma_surf *img, + const VkExtent3D extent, bool to_image); +bool radv_sdma_use_unaligned_buffer_image_copy(const struct radv_device *device, const struct radv_sdma_surf *buf, + const struct radv_sdma_surf *img, const VkExtent3D ext); +void radv_sdma_copy_buffer_image_unaligned(const struct radv_device *device, struct radeon_cmdbuf *cs, + const struct radv_sdma_surf *buf, const struct radv_sdma_surf *img_in, + const VkExtent3D copy_extent, struct radeon_winsys_bo *temp_bo, + bool to_image); +void radv_sdma_copy_buffer(const struct radv_device *device, struct radeon_cmdbuf *cs, uint64_t src_va, uint64_t dst_va, + uint64_t size); +void radv_sdma_fill_buffer(const struct radv_device *device, struct radeon_cmdbuf *cs, const uint64_t va, + const uint64_t size, const uint32_t value); + +#ifdef __cplusplus +} +#endif + +#endif /* RADV_SDMA_H */ diff --git a/src/android_stub/meson.build.rej b/src/android_stub/meson.build.rej new file mode 100644 index 00000000000..b9a9ddad68a --- /dev/null +++ b/src/android_stub/meson.build.rej @@ -0,0 +1,10 @@ +diff a/src/android_stub/meson.build b/src/android_stub/meson.build (rejected hunks) +@@ -1,7 +1,7 @@ + if with_android_stub + stub_libs = [] + +- foreach lib : ['backtrace', 'cutils', 'hardware', 'log', 'nativewindow', 'sync'] ++ foreach lib : ['hardware', 'log', 'nativewindow'] + stub_libs += shared_library( + lib, + files(lib + '_stub.cpp'), diff --git a/src/compiler/glsl/glsl_to_nir.cpp.rej b/src/compiler/glsl/glsl_to_nir.cpp.rej new file mode 100644 index 00000000000..5d3805f6c89 --- /dev/null +++ b/src/compiler/glsl/glsl_to_nir.cpp.rej @@ -0,0 +1,39 @@ +diff a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp (rejected hunks) +@@ -81,9 +81,6 @@ class nir_visitor : public ir_visitor + + void create_function(ir_function_signature *ir); + +- /* True if we have any output rvalues */ +- bool has_output_rvalue; +- + private: + void add_instr(nir_instr *instr, unsigned num_components, unsigned bit_size); + nir_ssa_def *evaluate_rvalue(ir_rvalue *ir); +@@ -274,9 +271,6 @@ glsl_to_nir(const struct gl_constants *consts, + if (var->data.mode == nir_var_shader_in && var->data.sample) + shader->info.fs.uses_sample_shading = true; + } +- +- if (v1.has_output_rvalue) +- shader->info.fs.uses_sample_shading = true; + } + + return shader; +@@ -287,7 +281,6 @@ nir_visitor::nir_visitor(const struct gl_constants *consts, nir_shader *shader) + this->supports_std430 = consts->UseSTD430AsDefaultPacking; + this->shader = shader; + this->is_global = true; +- this->has_output_rvalue = false; + this->var_table = _mesa_pointer_hash_table_create(NULL); + 
this->overload_table = _mesa_pointer_hash_table_create(NULL); + this->sparse_variable_set = _mesa_pointer_set_create(NULL); +@@ -1826,9 +1819,6 @@ nir_visitor::evaluate_rvalue(ir_rvalue* ir) + + enum gl_access_qualifier access = deref_get_qualifier(this->deref); + this->result = nir_load_deref_with_access(&b, this->deref, access); +- +- if (nir_deref_mode_is(this->deref, nir_var_shader_out)) +- this->has_output_rvalue = true; + } + + return this->result; diff --git a/src/compiler/glsl/standalone_scaffolding.cpp b/src/compiler/glsl/standalone_scaffolding.cpp index d9ec60ad21f..c48fdd5c13e 100644 --- a/src/compiler/glsl/standalone_scaffolding.cpp +++ b/src/compiler/glsl/standalone_scaffolding.cpp @@ -279,6 +279,9 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api) ctx->Const.Program[MESA_SHADER_COMPUTE].MaxUniformComponents = 1024; ctx->Const.Program[MESA_SHADER_COMPUTE].MaxInputComponents = 0; /* not used */ ctx->Const.Program[MESA_SHADER_COMPUTE].MaxOutputComponents = 0; /* not used */ + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicBuffers = 16; + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicCounters = 16; + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxShaderStorageBlocks = 16; /* Set up default shader compiler options. */ struct gl_shader_compiler_options options; diff --git a/src/drm-shim/device.c b/src/drm-shim/device.c index 345d72aa653..6a3321e5cd2 100644 --- a/src/drm-shim/device.c +++ b/src/drm-shim/device.c @@ -296,6 +296,10 @@ drm_shim_ioctl(int fd, unsigned long request, void *arg) ASSERTED int type = _IOC_TYPE(request); int nr = _IOC_NR(request); + /* Used by kbase; do not claim to be a kbase FD */ + if (type == 0x80) + return -EINVAL; + assert(type == DRM_IOCTL_BASE); if (nr >= DRM_COMMAND_BASE && nr < DRM_COMMAND_END) { diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c index bb75273ead9..6d3fb97f9c7 100644 --- a/src/egl/drivers/dri2/egl_dri2.c +++ b/src/egl/drivers/dri2/egl_dri2.c @@ -3322,6 +3322,8 @@ dri2_query_wayland_buffer_wl(_EGLDisplay *disp, const struct wl_drm_components_descriptor *format; buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_drm, buffer_resource); + if (!buffer) + buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_mali, buffer_resource); if (!buffer) return EGL_FALSE; diff --git a/src/egl/drivers/dri2/egl_dri2.c.rej b/src/egl/drivers/dri2/egl_dri2.c.rej new file mode 100644 index 00000000000..6b841557268 --- /dev/null +++ b/src/egl/drivers/dri2/egl_dri2.c.rej @@ -0,0 +1,60 @@ +diff a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c (rejected hunks) +@@ -52,6 +52,8 @@ + #include + #include "wayland-drm.h" + #include "wayland-drm-client-protocol.h" ++#include "mali-buffer-sharing.h" ++#include "mali-buffer-sharing-client-protocol.h" + #include "linux-dmabuf-unstable-v1-client-protocol.h" + #endif + +@@ -2259,6 +2261,9 @@ dri2_create_image_wayland_wl_buffer(_EGLDisplay *disp, _EGLContext *ctx, + + buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_drm, + (struct wl_resource *) _buffer); ++ if (!buffer) ++ buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_mali, ++ (struct wl_resource *) _buffer); + if (!buffer) + return NULL; + +@@ -3256,11 +3261,27 @@ dri2_bind_wayland_display_wl(_EGLDisplay *disp, struct wl_display *wl_dpy) + wayland_drm_init(wl_dpy, device_name, + &wl_drm_callbacks, disp, flags); + ++ drmSetVersion sv = { ++ .drm_di_major = 1, ++ .drm_di_minor = 4, ++ .drm_dd_major = -1, ++ .drm_dd_minor = -1, ++ }; ++ drmSetInterfaceVersion(dri2_dpy->fd, &sv); ++ ++ char 
*busid = drmGetBusid(dri2_dpy->fd); ++ dri2_dpy->wl_server_mali = ++ mali_buffer_sharing_init(wl_dpy, busid, ++ &wl_drm_callbacks, ++ disp); ++ drmFreeBusid(busid); ++ + free(device_name); + + if (!dri2_dpy->wl_server_drm) + goto fail; + ++ // TODO: Do this for mali_buffer_sharing + #ifdef HAVE_DRM_PLATFORM + /* We have to share the wl_drm instance with gbm, so gbm can convert + * wl_buffers to gbm bos. */ +@@ -3281,6 +3302,11 @@ dri2_unbind_wayland_display_wl(_EGLDisplay *disp, struct wl_display *wl_dpy) + { + struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); + ++ if (dri2_dpy->wl_server_mali) { ++ wayland_drm_uninit(dri2_dpy->wl_server_mali); ++ dri2_dpy->wl_server_mali = NULL; ++ } ++ + if (!dri2_dpy->wl_server_drm) + return EGL_FALSE; + diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h index 5edd810f476..37ac1684f26 100644 --- a/src/egl/drivers/dri2/egl_dri2.h +++ b/src/egl/drivers/dri2/egl_dri2.h @@ -296,8 +296,11 @@ struct dri2_egl_display { struct wl_display *wl_dpy_wrapper; struct wl_registry *wl_registry; struct wl_drm *wl_server_drm; + struct wl_drm *wl_server_mali; struct wl_drm *wl_drm; + struct wl_drm *wl_mali; uint32_t wl_drm_version, wl_drm_name; + uint32_t wl_mali_version, wl_mali_name; struct wl_shm *wl_shm; struct wl_event_queue *wl_queue; struct zwp_linux_dmabuf_v1 *wl_dmabuf; @@ -345,6 +348,7 @@ struct dri2_egl_surface { struct wl_surface *wl_surface_wrapper; struct wl_display *wl_dpy_wrapper; struct wl_drm *wl_drm_wrapper; + struct wl_drm *wl_mali_wrapper; struct wl_callback *throttle_callback; struct zwp_linux_dmabuf_feedback_v1 *wl_dmabuf_feedback; struct dmabuf_feedback dmabuf_feedback, pending_dmabuf_feedback; diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c index 7e85dd682a4..78e3009c421 100644 --- a/src/egl/drivers/dri2/platform_wayland.c +++ b/src/egl/drivers/dri2/platform_wayland.c @@ -748,7 +748,7 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, dri2_surf->base.PresentOpaque); assert(visual_idx != -1); - if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm) { + if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm || dri2_dpy->wl_mali) { dri2_surf->format = dri2_wl_visuals[visual_idx].wl_drm_format; } else { assert(dri2_dpy->wl_shm); @@ -771,6 +771,16 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, dri2_surf->wl_queue); } + if (dri2_dpy->wl_mali) { + dri2_surf->wl_mali_wrapper = wl_proxy_create_wrapper(dri2_dpy->wl_mali); + if (!dri2_surf->wl_mali_wrapper) { + _eglError(EGL_BAD_ALLOC, "dri2_create_surface"); + goto cleanup_queue; + } + wl_proxy_set_queue((struct wl_proxy *)dri2_surf->wl_mali_wrapper, + dri2_surf->wl_queue); + } + dri2_surf->wl_dpy_wrapper = wl_proxy_create_wrapper(dri2_dpy->wl_dpy); if (!dri2_surf->wl_dpy_wrapper) { _eglError(EGL_BAD_ALLOC, "dri2_create_surface"); @@ -907,6 +917,8 @@ dri2_wl_destroy_surface(_EGLDisplay *disp, _EGLSurface *surf) wl_proxy_wrapper_destroy(dri2_surf->wl_dpy_wrapper); if (dri2_surf->wl_drm_wrapper) wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper); + if (dri2_surf->wl_mali_wrapper) + wl_proxy_wrapper_destroy(dri2_surf->wl_mali_wrapper); if (dri2_surf->wl_dmabuf_feedback) { zwp_linux_dmabuf_feedback_v1_destroy(dri2_surf->wl_dmabuf_feedback); dmabuf_feedback_fini(&dri2_surf->dmabuf_feedback); @@ -1503,6 +1515,26 @@ create_wl_buffer(struct dri2_egl_display *dri2_dpy, ret = zwp_linux_buffer_params_v1_create_immed(params, width, height, fourcc, 0); zwp_linux_buffer_params_v1_destroy(params); + } else if 
(dri2_surf->wl_mali_wrapper || dri2_dpy->wl_mali) { + struct wl_drm *wl_mali = + dri2_surf ? dri2_surf->wl_mali_wrapper : dri2_dpy->wl_mali; + int fd = -1, stride; + + if (num_planes > 1) + return NULL; + + query = dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FD, &fd); + query &= dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, &stride); + if (!query) { + if (fd >= 0) + close(fd); + return NULL; + } + + ret = mali_buffer_sharing_create_buffer((void *)wl_mali, + width, height, stride, + fourcc, 0, 0, fd); + close(fd); } else { struct wl_drm *wl_drm = dri2_surf ? dri2_surf->wl_drm_wrapper : dri2_dpy->wl_drm; @@ -1776,6 +1808,62 @@ drm_handle_device(void *data, struct wl_drm *drm, const char *device) } } +static void +mali_handle_device(void *data, struct mali_buffer_sharing *drm, const char *device) +{ + struct dri2_egl_display *dri2_dpy = data; + drm_magic_t magic; + + // hack + //printf("device '%s'\n", device); + dri2_dpy->device_name = strdup("/dev/dri/card0"); + + dri2_dpy->fd = loader_open_device(dri2_dpy->device_name); + if (dri2_dpy->fd == -1) { + _eglLog(_EGL_WARNING, "wayland-egl: could not open %s (%s)", + dri2_dpy->device_name, strerror(errno)); + free(dri2_dpy->device_name); + dri2_dpy->device_name = NULL; + return; + } + + if (drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER) { + dri2_dpy->authenticated = true; + } else { + roundtrip(dri2_dpy); + if (drmGetMagic(dri2_dpy->fd, &magic)) { + close(dri2_dpy->fd); + dri2_dpy->fd = -1; + free(dri2_dpy->device_name); + dri2_dpy->device_name = NULL; + _eglLog(_EGL_WARNING, "wayland-egl: drmGetMagic failed"); + return; + } + + mali_buffer_sharing_auth((void *)dri2_dpy->wl_mali, magic); + roundtrip(dri2_dpy); + // We don't get a callback + dri2_dpy->authenticated = true; + } + + int supported_fourcc[] = { + WL_DRM_FORMAT_ABGR16F, + WL_DRM_FORMAT_ABGR2101010, + WL_DRM_FORMAT_XRGB8888, + WL_DRM_FORMAT_ARGB8888, + WL_DRM_FORMAT_ABGR8888, + WL_DRM_FORMAT_XBGR8888, + WL_DRM_FORMAT_RGB565, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(supported_fourcc); ++i) { + int visual_idx = dri2_wl_visual_idx_from_fourcc(supported_fourcc[i]); + assert(visual_idx != -1); + + BITSET_SET(dri2_dpy->formats.formats_bitmap, visual_idx); + } +} + static void drm_handle_format(void *data, struct wl_drm *drm, uint32_t format) { @@ -2836,6 +2924,8 @@ dri2_teardown_wayland(struct dri2_egl_display *dri2_dpy) dri2_wl_formats_fini(&dri2_dpy->formats); if (dri2_dpy->wl_drm) wl_drm_destroy(dri2_dpy->wl_drm); + if (dri2_dpy->wl_mali) + wl_drm_destroy(dri2_dpy->wl_mali); if (dri2_dpy->wl_dmabuf) zwp_linux_dmabuf_v1_destroy(dri2_dpy->wl_dmabuf); if (dri2_dpy->wl_shm) diff --git a/src/egl/drivers/dri2/platform_wayland.c.rej b/src/egl/drivers/dri2/platform_wayland.c.rej new file mode 100644 index 00000000000..b20afec38c3 --- /dev/null +++ b/src/egl/drivers/dri2/platform_wayland.c.rej @@ -0,0 +1,89 @@ +diff a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c (rejected hunks) +@@ -51,6 +51,7 @@ + #include + #include + #include "wayland-drm-client-protocol.h" ++#include "mali-buffer-sharing-client-protocol.h" + #include "linux-dmabuf-unstable-v1-client-protocol.h" + + /* +@@ -765,6 +776,8 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, + cleanup_drm: + if (dri2_surf->wl_drm_wrapper) + wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper); ++ if (dri2_surf->wl_mali_wrapper) ++ wl_proxy_wrapper_destroy(dri2_surf->wl_mali_wrapper); + cleanup_queue: + wl_event_queue_destroy(dri2_surf->wl_queue); + 
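
(Illustrative sketch, not part of the rejected hunk above: the authentication
performed by mali_handle_device() earlier in this file reduces to the usual
wl_drm handshake, except that this protocol sends no "authenticated" event, so
the client assumes success after a roundtrip.  fd, wl_mali and authenticated
stand in for the dri2_dpy fields used there.)

   drm_magic_t magic;

   if (drmGetNodeTypeFromFd(fd) == DRM_NODE_RENDER) {
      authenticated = true;                 /* render nodes need no auth */
   } else if (drmGetMagic(fd, &magic) == 0) {
      mali_buffer_sharing_auth((void *)wl_mali, magic);
      roundtrip(dri2_dpy);                  /* no ack event; assume success */
      authenticated = true;
   }
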
cleanup_surf: +@@ -1768,6 +1859,10 @@ static const struct wl_drm_listener drm_listener = { + .capabilities = drm_handle_capabilities + }; + ++static const struct mali_buffer_sharing_listener mali_listener = { ++ .alloc_device = mali_handle_device, ++}; ++ + static void + dmabuf_ignore_format(void *data, struct zwp_linux_dmabuf_v1 *dmabuf, + uint32_t format) +@@ -1813,6 +1908,14 @@ wl_drm_bind(struct dri2_egl_display *dri2_dpy) + wl_drm_add_listener(dri2_dpy->wl_drm, &drm_listener, dri2_dpy); + } + ++static void ++wl_mali_bind(struct dri2_egl_display *dri2_dpy) ++{ ++ dri2_dpy->wl_mali = wl_registry_bind(dri2_dpy->wl_registry, dri2_dpy->wl_mali_name, ++ &mali_buffer_sharing_interface, dri2_dpy->wl_mali_version); ++ mali_buffer_sharing_add_listener((void *)dri2_dpy->wl_mali, &mali_listener, dri2_dpy); ++} ++ + static void + default_dmabuf_feedback_format_table(void *data, + struct zwp_linux_dmabuf_feedback_v1 *zwp_linux_dmabuf_feedback_v1, +@@ -1943,6 +2046,9 @@ registry_handle_global_drm(void *data, struct wl_registry *registry, + if (strcmp(interface, wl_drm_interface.name) == 0) { + dri2_dpy->wl_drm_version = MIN2(version, 2); + dri2_dpy->wl_drm_name = name; ++ } else if (strcmp(interface, mali_buffer_sharing_interface.name) == 0) { ++ dri2_dpy->wl_mali_version = MIN2(version, 5); ++ dri2_dpy->wl_mali_name = name; + } else if (strcmp(interface, zwp_linux_dmabuf_v1_interface.name) == 0 && version >= 3) { + dri2_dpy->wl_dmabuf = + wl_registry_bind(registry, name, &zwp_linux_dmabuf_v1_interface, +@@ -2145,10 +2251,7 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) + + /* We couldn't retrieve a render node from the dma-buf feedback (or the + * feedback was not advertised at all), so we must fallback to wl_drm. */ +- if (dri2_dpy->fd == -1) { +- /* wl_drm not advertised by compositor, so can't continue */ +- if (dri2_dpy->wl_drm_name == 0) +- goto cleanup; ++ if (dri2_dpy->fd == -1 && dri2_dpy->wl_drm_name) { + wl_drm_bind(dri2_dpy); + + if (dri2_dpy->wl_drm == NULL) +@@ -2161,6 +2264,22 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) + goto cleanup; + } + ++ if (dri2_dpy->fd == -1 && dri2_dpy->wl_mali_name) { ++ wl_mali_bind(dri2_dpy); ++ ++ if (dri2_dpy->wl_mali == NULL) ++ goto cleanup; ++ if (roundtrip(dri2_dpy) < 0 || dri2_dpy->fd == -1) ++ goto cleanup; ++ ++ if (!dri2_dpy->authenticated && ++ (roundtrip(dri2_dpy) < 0 || !dri2_dpy->authenticated)) ++ goto cleanup; ++ } ++ ++ if (dri2_dpy->fd == -1) ++ goto cleanup; ++ + dri2_dpy->fd = loader_get_user_preferred_fd(dri2_dpy->fd, + &dri2_dpy->is_different_gpu); + dev = _eglAddDevice(dri2_dpy->fd, false); diff --git a/src/egl/meson.build.rej b/src/egl/meson.build.rej new file mode 100644 index 00000000000..1056b3fe25d --- /dev/null +++ b/src/egl/meson.build.rej @@ -0,0 +1,19 @@ +diff a/src/egl/meson.build b/src/egl/meson.build (rejected hunks) +@@ -122,14 +122,16 @@ if with_dri2 + endif + if with_platform_wayland + deps_for_egl += [dep_wayland_client, dep_wayland_server, dep_wayland_egl_headers] +- link_for_egl += libwayland_drm ++ link_for_egl += [libwayland_drm, libmali_buffer_sharing] + files_egl += files('drivers/dri2/platform_wayland.c') + files_egl += [ + linux_dmabuf_unstable_v1_protocol_c, + linux_dmabuf_unstable_v1_client_protocol_h, + wayland_drm_client_protocol_h, ++ mali_buffer_sharing_client_protocol_h, + ] + incs_for_egl += include_directories('wayland/wayland-drm') ++ incs_for_egl += include_directories('wayland/mali-buffer-sharing') + endif + if with_platform_android + deps_for_egl += dep_android diff --git 
a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c new file mode 100644 index 00000000000..d3c9a6f0dd2 --- /dev/null +++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.c @@ -0,0 +1,170 @@ +/* + * Copyright © 2022 Icecream95 + * Copyright © 2011 Kristian Høgsberg + * Copyright © 2011 Benjamin Franzke + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Kristian Høgsberg + * Benjamin Franzke + */ + +#include +#include +#include +#include +#include + +#include +#include "mali-buffer-sharing.h" +#include "mali-buffer-sharing-server-protocol.h" +#include "wayland-drm-client-protocol.h" + +#define MIN(x,y) (((x)<(y))?(x):(y)) + +static void +destroy_buffer(struct wl_resource *resource) +{ + struct wl_drm_buffer *buffer = wl_resource_get_user_data(resource); + struct wl_drm *drm = buffer->drm; + + drm->callbacks.release_buffer(drm->user_data, buffer); + free(buffer); +} + +static void +buffer_destroy(struct wl_client *client, struct wl_resource *resource) +{ + wl_resource_destroy(resource); +} + +static void +create_buffer(struct wl_client *client, struct wl_resource *resource, + uint32_t id, uint32_t name, int fd, + int32_t width, int32_t height, + uint32_t format, + int32_t offset, int32_t stride) +{ + struct wl_drm *drm = wl_resource_get_user_data(resource); + struct wl_drm_buffer *buffer; + + buffer = calloc(1, sizeof *buffer); + if (buffer == NULL) { + wl_resource_post_no_memory(resource); + return; + } + + buffer->drm = drm; + buffer->width = width; + buffer->height = height; + buffer->format = format; + buffer->offset[0] = offset; + buffer->stride[0] = stride; + + drm->callbacks.reference_buffer(drm->user_data, name, fd, buffer); + if (buffer->driver_buffer == NULL) { + // TODO: We should return an error + return; + } + + buffer->resource = + wl_resource_create(client, &wl_buffer_interface, 1, id); + if (!buffer->resource) { + wl_resource_post_no_memory(resource); + free(buffer); + return; + } + + wl_resource_set_implementation(buffer->resource, + (void (**)(void)) &drm->buffer_interface, + buffer, destroy_buffer); +} + +static void +mali_create_buffer(struct wl_client *client, + struct wl_resource *resource, + uint32_t id, + int32_t width, int32_t height, uint32_t stride, + enum wl_drm_format format, uint32_t unk1, uint32_t unk2, + int fd) +{ + create_buffer(client, resource, id, 0, fd, width, height, format, + 0, stride); + 
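+   /* create_buffer() has already passed the fd to the reference_buffer()
+    * callback, which imports the dma-buf into a driver image without taking
+    * ownership, so the server-side copy is closed below; this mirrors
+    * create_prime_buffer() in wayland-drm.c. */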
close(fd); +} + +static void +mali_auth(struct wl_client *client, + struct wl_resource *resource, uint32_t id) +{ + struct wl_drm *drm = wl_resource_get_user_data(resource); + + drm->callbacks.authenticate(drm->user_data, id); +} + +static const struct mali_buffer_sharing_interface mali_interface = { + mali_create_buffer, + mali_auth, +}; + +static void +bind_mali(struct wl_client *client, void *data, uint32_t version, uint32_t id) +{ + struct wl_drm *drm = data; + struct wl_resource *resource; + + resource = wl_resource_create(client, &mali_buffer_sharing_interface, + MIN(version, 4), id); + if (!resource) { + wl_client_post_no_memory(client); + return; + } + + wl_resource_set_implementation(resource, &mali_interface, data, NULL); + + mali_buffer_sharing_send_alloc_device(resource, drm->device_name); +} + +struct wl_drm * +mali_buffer_sharing_init(struct wl_display *display, char *device_name, + const struct wayland_drm_callbacks *callbacks, void *user_data) +{ + struct wl_drm *drm; + + drm = malloc(sizeof *drm); + if (!drm) + return NULL; + + drm->display = display; + drm->device_name = strdup(device_name ?: ""); + drm->callbacks = *callbacks; + drm->user_data = user_data; + drm->flags = 1; + + drm->buffer_interface.destroy = buffer_destroy; + + drm->wl_drm_global = + wl_global_create(display, &mali_buffer_sharing_interface, 5, + drm, bind_mali); + + return drm; +} diff --git a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h new file mode 100644 index 00000000000..f7f2c4811df --- /dev/null +++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.h @@ -0,0 +1,12 @@ +#ifndef MALI_BUFFER_H +#define MALI_BUFFER_H + +#include + +#include "wayland-drm.h" + +struct wl_drm * +mali_buffer_sharing_init(struct wl_display *display, char *device_name, + const struct wayland_drm_callbacks *callbacks, void *user_data); + +#endif diff --git a/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml new file mode 100644 index 00000000000..0ad02488118 --- /dev/null +++ b/src/egl/wayland/mali-buffer-sharing/mali-buffer-sharing.xml @@ -0,0 +1,50 @@ + + + + + Copyright © 2022 Icecream95 + + Permission to use, copy, modify, distribute, and sell this + software and its documentation for any purpose is hereby granted + without fee, provided that\n the above copyright notice appear in + all copies and that both that copyright notice and this permission + notice appear in supporting documentation, and that the name of + the copyright holders not be used in advertising or publicity + pertaining to distribution of the software without specific, + written prior permission. The copyright holders make no + representations about the suitability of this software for any + purpose. It is provided "as is" without express or implied + warranty. + + THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, + ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF + THIS SOFTWARE. 
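
(For reference, the interface body of this protocol file, as exercised by the
C code elsewhere in this patch, declares a "mali_buffer_sharing" global
advertised at version 5 by the server, with: a create_buffer request taking a
new wl_buffer id, width, height, stride, a wl_drm format, two unknown words
and a dma-buf fd; an auth request taking a DRM magic; and an alloc_device
event carrying the device name.  The exact XML argument names and types are
assumptions.)
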
+ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/egl/wayland/mali-buffer-sharing/meson.build b/src/egl/wayland/mali-buffer-sharing/meson.build new file mode 100644 index 00000000000..0693bf8668c --- /dev/null +++ b/src/egl/wayland/mali-buffer-sharing/meson.build @@ -0,0 +1,51 @@ +# Copyright © 2017 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +inc_mali_buffer_sharing = include_directories('.') + +mali_buffer_sharing_protocol_c = custom_target( + 'mali-buffer-sharing-protocol.c', + input : 'mali-buffer-sharing.xml', + output : 'mali-buffer-sharing-protocol.c', + command : [prog_wl_scanner, wl_scanner_arg, '@INPUT@', '@OUTPUT@'], +) + +mali_buffer_sharing_client_protocol_h = custom_target( + 'mali-buffer-sharing-client-protocol.h', + input : 'mali-buffer-sharing.xml', + output : 'mali-buffer-sharing-client-protocol.h', + command : [prog_wl_scanner, 'client-header', '@INPUT@', '@OUTPUT@'], +) + +mali_buffer_sharing_server_protocol_h = custom_target( + 'mali-buffer-sharing-server-protocol.h', + input : 'mali-buffer-sharing.xml', + output : 'mali-buffer-sharing-server-protocol.h', + command : [prog_wl_scanner, 'server-header', '@INPUT@', '@OUTPUT@'], +) + +libmali_buffer_sharing = static_library( + 'mali-buffer-sharing', + ['mali-buffer-sharing.c', mali_buffer_sharing_protocol_c, mali_buffer_sharing_server_protocol_h, wayland_drm_client_protocol_h], + include_directories : inc_wayland_drm, + gnu_symbol_visibility : 'hidden', + dependencies : [dep_wayland_server], + build_by_default : false, +) diff --git a/src/egl/wayland/wayland-drm/wayland-drm.c.rej b/src/egl/wayland/wayland-drm/wayland-drm.c.rej new file mode 100644 index 00000000000..9016c1f2638 --- /dev/null +++ b/src/egl/wayland/wayland-drm/wayland-drm.c.rej @@ -0,0 +1,10 @@ +diff a/src/egl/wayland/wayland-drm/wayland-drm.c b/src/egl/wayland/wayland-drm/wayland-drm.c (rejected hunks) +@@ -212,7 +212,7 @@ bind_drm(struct wl_client *client, void *data, uint32_t version, uint32_t id) + + wl_resource_set_implementation(resource, &drm_interface, data, NULL); + +- wl_resource_post_event(resource, WL_DRM_DEVICE, drm->device_name); ++ wl_resource_post_event(resource, WL_DRM_DEVICE, "/dev/dri/card0"); + + if (drm->callbacks.is_format_supported(drm->user_data, + WL_DRM_FORMAT_ARGB2101010)) { diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c index fd24be07d2d..6713a04557d 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.c +++ 
b/src/gallium/auxiliary/cso_cache/cso_context.c @@ -1446,6 +1446,11 @@ cso_single_sampler(struct cso_context *cso, enum pipe_shader_type shader_stage, } } +void +cso_set_max_sampler(struct cso_context *ctx, int max_sampler_seen) +{ + ctx->max_sampler_seen = max_sampler_seen; +} /** * Send staged sampler state to the driver. diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h index f796310d39b..87a27597097 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.h +++ b/src/gallium/auxiliary/cso_cache/cso_context.h @@ -84,6 +84,9 @@ void cso_single_sampler(struct cso_context *cso, enum pipe_shader_type shader_stage, unsigned idx, const struct pipe_sampler_state *states); +void +cso_set_max_sampler(struct cso_context *ctx, int max_sampler_seen); + void cso_single_sampler_done(struct cso_context *cso, enum pipe_shader_type shader_stage); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c.rej b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c.rej new file mode 100644 index 00000000000..5c8f600a0b7 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c.rej @@ -0,0 +1,19 @@ +diff a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c (rejected hunks) +@@ -1027,7 +1027,7 @@ static void emit_atomic_global(struct lp_build_nir_context *bld_base, + case nir_intrinsic_global_atomic_fadd: + op = LLVMAtomicRMWBinOpFAdd; + break; +-#if LLVM_VERSION_MAJOR >= 15 ++#if LLVM_VERSION_MAJOR >= 16 + case nir_intrinsic_global_atomic_fmin: + op = LLVMAtomicRMWBinOpFMin; + break; +@@ -1542,7 +1542,7 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base, + case nir_intrinsic_ssbo_atomic_fadd: + op = LLVMAtomicRMWBinOpFAdd; + break; +-#if LLVM_VERSION_MAJOR >= 15 ++#if LLVM_VERSION_MAJOR >= 16 + case nir_intrinsic_shared_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmin: + op = LLVMAtomicRMWBinOpFMin; diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c.rej b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c.rej new file mode 100644 index 00000000000..81d28aa4e6b --- /dev/null +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c.rej @@ -0,0 +1,10 @@ +diff a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c (rejected hunks) +@@ -189,7 +189,7 @@ pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd) + int new_fd; + + if (fd < 0 || (new_fd = os_dupfd_cloexec(fd)) < 0) +- return false; ++ return false; + + ret = pipe_loader_drm_probe_fd_nodup(dev, new_fd); + if (!ret) diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h.rej b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h.rej new file mode 100644 index 00000000000..fd803af37c0 --- /dev/null +++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h.rej @@ -0,0 +1,43 @@ +diff a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h (rejected hunks) +@@ -8,6 +8,10 @@ + #include "frontend/sw_winsys.h" + #include "target-helpers/inline_debug_helper.h" + ++#include ++#include ++#include ++ + /* Helper function to choose and instantiate one of the software rasterizers: + * llvmpipe, softpipe. 
+ */ +@@ -33,6 +37,10 @@ + #include "asahi/agx_public.h" + #endif + ++#if defined(GALLIUM_PANFROST) ++#include "panfrost/pan_public.h" ++#endif ++ + static inline struct pipe_screen * + sw_screen_create_named(struct sw_winsys *winsys, const char *driver) + { +@@ -71,6 +79,19 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver) + screen = agx_screen_create(0, NULL, winsys); + #endif + ++#if defined(GALLIUM_PANFROST) ++ if(screen == NULL && strcmp(driver, "panfrost") == 0) { ++ int kbase_device_fd = open("/dev/mali0", O_RDWR | O_CLOEXEC | O_NONBLOCK); ++ if(kbase_device_fd == -1) { ++ printf("PAN_OSMESA: Failed to open kbase device: %s", strerror(errno)); ++ }else { ++ screen = panfrost_create_screen(kbase_device_fd, NULL); ++ } ++ } ++#else ++#error You forgot to include Panfrost ++#endif ++ + return screen ? debug_screen_wrap(screen) : NULL; + } + diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index 987475c76df..aa0db97ea65 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -47,6 +47,7 @@ panfrost_includes = [ inc_include, inc_src, inc_panfrost, + inc_panfrost_hw, ] compile_args_panfrost = [ diff --git a/src/gallium/drivers/panfrost/meson.build.rej b/src/gallium/drivers/panfrost/meson.build.rej new file mode 100644 index 00000000000..508ebed2cd9 --- /dev/null +++ b/src/gallium/drivers/panfrost/meson.build.rej @@ -0,0 +1,10 @@ +diff a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build (rejected hunks) +@@ -51,7 +52,7 @@ compile_args_panfrost = [ + '-Wno-pointer-arith' + ] + +-panfrost_versions = ['4', '5', '6', '7', '9'] ++panfrost_versions = ['4', '5', '6', '7', '9', '10'] + libpanfrost_versions = [] + + foreach ver : panfrost_versions diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c.rej b/src/gallium/drivers/panfrost/pan_cmdstream.c.rej new file mode 100644 index 00000000000..fd0f475f81b --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c.rej @@ -0,0 +1,1186 @@ +diff a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c (rejected hunks) +@@ -23,12 +23,15 @@ + * SOFTWARE. + */ + ++#include "dma-uapi/dma-buf.h" ++ + #include "util/macros.h" + #include "util/u_prim.h" + #include "util/u_vbuf.h" + #include "util/u_helpers.h" + #include "util/u_draw.h" + #include "util/u_memory.h" ++#include "util/u_viewport.h" + #include "pipe/p_defines.h" + #include "pipe/p_state.h" + #include "gallium/auxiliary/util/u_blend.h" +@@ -749,8 +752,8 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]); + float vp_miny = vp->translate[1] - fabsf(vp->scale[1]); + float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]); +- float minz = (vp->translate[2] - fabsf(vp->scale[2])); +- float maxz = (vp->translate[2] + fabsf(vp->scale[2])); ++ float minz, maxz; ++ util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz); + + /* Scissor to the intersection of viewport and to the scissor, clamped + * to the framebuffer */ +@@ -778,10 +781,16 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + maxx--; + maxy--; + +- batch->minimum_z = rast->depth_clip_near ? minz : -INFINITY; +- batch->maximum_z = rast->depth_clip_far ? maxz : +INFINITY; +- + #if PAN_ARCH <= 7 ++ /* Proper depth clamp support was only introduced in v9, before then ++ * all that can be done is disabling clipping by adjusting the ++ * viewport. 
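++ * (Concretely, the code below pushes minz/maxz to -/+INFINITY when
++ * depth_clip_near/far are disabled, so fragments are kept but their depth
++ * is never clamped.)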
This means that the result will be wrong for float depth ++ * buffers or non-[0, 1] depth range. */ ++ if (!rast->depth_clip_near) ++ minz = -INFINITY; ++ if (!rast->depth_clip_far) ++ maxz = +INFINITY; ++ + struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT); + + pan_pack(T.cpu, VIEWPORT, cfg) { +@@ -790,19 +799,22 @@ panfrost_emit_viewport(struct panfrost_batch *batch) + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + +- cfg.minimum_z = batch->minimum_z; +- cfg.maximum_z = batch->maximum_z; ++ cfg.minimum_z = minz; ++ cfg.maximum_z = maxz; + } + + return T.gpu; + #else +- pan_pack(&batch->scissor, SCISSOR, cfg) { ++ pan_pack_cs_v10(&batch->scissor, &batch->cs_vertex, SCISSOR, cfg) { + cfg.scissor_minimum_x = minx; + cfg.scissor_minimum_y = miny; + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + } + ++ batch->minimum_z = minz; ++ batch->maximum_z = maxz; ++ + return 0; + #endif + } +@@ -838,6 +850,14 @@ panfrost_emit_depth_stencil(struct panfrost_batch *batch) + cfg.depth_units = rast->base.offset_units * 2.0f; + cfg.depth_factor = rast->base.offset_scale; + cfg.depth_bias_clamp = rast->base.offset_clamp; ++ ++ if (rast->base.depth_clip_near && rast->base.depth_clip_far) { ++ cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_0_1; ++ cfg.depth_cull_enable = true; ++ } else { ++ cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_BOUNDS; ++ cfg.depth_cull_enable = false; ++ } + } + + pan_merge(dynamic, zsa->desc, DEPTH_STENCIL); +@@ -1482,9 +1502,17 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count; + struct panfrost_ptr transfer = + pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16); ++ void *sys_cpu = malloc(sys_size); ++ ++ /* Write to a shadow buffer to make pushing cheaper */ ++ struct panfrost_ptr sys_shadow = { ++ .cpu = sys_cpu, ++ .gpu = transfer.gpu, ++ }; + + /* Upload sysvals requested by the shader */ +- panfrost_upload_sysvals(batch, &transfer, ss, stage); ++ panfrost_upload_sysvals(batch, &sys_shadow, ss, stage); ++ memcpy(transfer.cpu, sys_cpu, sys_size); + + /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */ + struct panfrost_compiled_shader *shader = ctx->prog[stage]; +@@ -1527,8 +1555,10 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + if (pushed_words) + *pushed_words = ss->info.push.count; + +- if (ss->info.push.count == 0) ++ if (ss->info.push.count == 0) { ++ free(sys_cpu); + return ubos.gpu; ++ } + + /* Copy push constants required by the shader */ + struct panfrost_ptr push_transfer = +@@ -1580,13 +1610,15 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, + * off to upload sysvals to a staging buffer on the CPU on the + * assumption sysvals will get pushed (TODO) */ + +- const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu : ++ const void *mapped_ubo = (src.ubo == sysval_ubo) ? 
sys_cpu : + panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); + + /* TODO: Is there any benefit to combining ranges */ + memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4); + } + ++ free(sys_cpu); ++ + return ubos.gpu; + } + +@@ -2777,6 +2809,385 @@ emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb) + return transfer.gpu; + } + ++#if PAN_ARCH >= 10 ++ ++static int ++panfrost_export_dmabuf_fence(int dmabuf) ++{ ++ struct dma_buf_export_sync_file export = { ++ .flags = DMA_BUF_SYNC_RW, ++ }; ++ ++ int err = drmIoctl(dmabuf, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &export); ++ if (err < 0) { ++ fprintf(stderr, "failed to export fence: %s\n", ++ strerror(errno)); ++ return -1; ++ } ++ ++ return export.fd; ++} ++ ++static bool ++panfrost_import_dmabuf_fence(int dmabuf, int fence) ++{ ++ struct dma_buf_import_sync_file import = { ++ .flags = DMA_BUF_SYNC_RW, ++ .fd = fence, ++ }; ++ ++ int err = drmIoctl(dmabuf, DMA_BUF_IOCTL_IMPORT_SYNC_FILE, &import); ++ if (err < 0) { ++ fprintf(stderr, "failed to import fence: %s\n", ++ strerror(errno)); ++ return false; ++ } ++ ++ return true; ++} ++ ++static uint64_t * ++panfrost_cs_ring_allocate_instrs(struct panfrost_cs *cs, unsigned count) ++{ ++ pan_command_stream c = cs->cs; ++ ++ if (c.ptr + count > c.end) { ++ assert(c.ptr <= c.end); ++ assert(c.begin + count <= c.ptr); ++ ++ /* Instructions are in a ring buffer, simply NOP out the end ++ * and start back from the start. Possibly, doing a TAILCALL ++ * straight to the start could also work. */ ++ memset(c.ptr, 0, (c.end - c.ptr) * 8); ++ c.ptr = c.begin; ++ ++ cs->offset += cs->base.size; ++ cs->cs = c; ++ } ++ ++ /* TODO: Check against the extract offset */ ++ return c.ptr + count; ++} ++ ++// TODO: Rewrite this! ++static void ++emit_csf_queue(struct panfrost_batch *batch, struct panfrost_cs *cs, ++ pan_command_stream s, struct util_dynarray *deps, ++ bool first, bool last) ++{ ++ struct panfrost_device *dev = pan_device(batch->ctx->base.screen); ++ ++ assert(s.ptr <= s.end); ++ ++ bool fragment = (cs->hw_resources & 2); ++ bool vertex = (cs->hw_resources & 12); /* TILER | IDVS */ ++ ++ uint64_t *limit = panfrost_cs_ring_allocate_instrs(cs, ++ 128 + util_dynarray_num_elements(deps, struct panfrost_usage) * 4); ++ ++ pan_command_stream *c = &cs->cs; ++ ++ /* First, do some waiting at the start of the job */ ++ ++ pan_emit_cs_32(c, 0x54, *cs->base.latest_flush); ++ // TODO genxmlify ++ pan_emit_cs_ins(c, 0x24, 0x540000000233ULL); ++ // TODO: What does this need to be? ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 0xff; } ++ ++ /* For the first job in the batch, wait on dependencies */ ++ // TODO: Usually the vertex job shouldn't have to wait for dmabufs! ++ if (first) { ++ mali_ptr seqnum_ptr_base = dev->mali.event_mem.gpu; ++ ++ util_dynarray_foreach(deps, struct panfrost_usage, u) { ++ /* Note the multiplication in the call to ++ * cs_ring_allocate_instrs. pan_emit_cs_64 might be ++ * split, so the total is four instructions. */ ++ pan_emit_cs_48(c, 0x42, seqnum_ptr_base + ++ u->queue * PAN_EVENT_SIZE); ++ pan_emit_cs_64(c, 0x40, u->seqnum); ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.no_error = true; ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ } ++ ++ uint64_t kcpu_seqnum = ++cs->kcpu_seqnum; ++ ++ util_dynarray_foreach(&batch->dmabufs, int, fd) { ++ int fence = panfrost_export_dmabuf_fence(*fd); ++ ++ /* TODO: poll on the dma-buf? 
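++ * (Polling the dma-buf fd with poll(2) would block on the same implicit
++ * fences from the CPU side, which could serve as a fallback when the
++ * sync-file export above fails.)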
*/ ++ if (fence == -1) ++ continue; ++ ++ // TODO: What if we reach the limit for number of KCPU ++ // commands in a queue? It's pretty low (256) ++ dev->mali.kcpu_fence_import(&dev->mali, cs->base.ctx, ++ fence); ++ ++ close(fence); ++ } ++ ++ bool ret = dev->mali.kcpu_cqs_set(&dev->mali, cs->base.ctx, ++ cs->kcpu_event_ptr, kcpu_seqnum + 1); ++ ++ if (ret) { ++ /* If we don't set no_error, kbase might decide to ++ * pass on errors from waiting for fences. */ ++ pan_emit_cs_48(c, 0x42, cs->kcpu_event_ptr); ++ pan_emit_cs_64(c, 0x40, kcpu_seqnum); ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.no_error = true; ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ } ++ } ++ ++ /* Fragment jobs need to wait for the vertex job */ ++ if (fragment && !first) { ++ pan_pack_ins(c, CS_EVWAIT_64, cfg) { ++ cfg.condition = MALI_WAIT_CONDITION_HIGHER; ++ cfg.value = 0x4e; ++ cfg.addr = 0x4c; ++ } ++ } ++ ++ if (vertex) { ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 3; } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 3; } ++ pan_pack_ins(c, CS_HEAPINC, cfg) { ++ cfg.type = MALI_HEAP_STATISTIC_V_T_START; ++ } ++ } else if (fragment) { ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 4; } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 4; } ++ } ++ ++ // copying to the main buffer can make debugging easier. ++ // TODO: This needs to be more reliable. ++#if 0 ++ unsigned length = (s.ptr - s.begin) * 8; ++ unsigned clamped = MIN2(length, cs->bo->ptr.cpu + cs->bo->size - (void *)c->ptr); ++ memcpy(c->ptr, s->begin, clamped); ++ c->ptr += clamped / 8; ++ ++ if (clamped != length) { ++ unsigned rest = length - clamped; ++ c->ptr = cs->bo->ptr.cpu; ++ memcpy(c->ptr, s->begin, rest); ++ c->ptr += rest / 8; ++ ++ cs->offset += cs->bo->size; ++ } ++#else ++ ++ pan_emit_cs_48(c, 0x48, s.gpu); ++ pan_emit_cs_32(c, 0x4a, (s.ptr - s.begin) * 8); ++ pan_pack_ins(c, CS_CALL, cfg) { cfg.address = 0x48; cfg.length = 0x4a; } ++#endif ++ ++ if (vertex) { ++ pan_pack_ins(c, CS_FLUSH_TILER, _) { } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 3; } ++ pan_pack_ins(c, CS_HEAPINC, cfg) { ++ cfg.type = MALI_HEAP_STATISTIC_V_T_END; ++ } ++ } ++ ++ if (fragment) { ++ /* Skip the next operation if the batch doesn't use a tiler ++ * heap (i.e. 
it's just a blit) */ ++ pan_emit_cs_ins(c, 22, 0x560030000001); /* b.ne w56, skip 1 */ ++ pan_emit_cs_ins(c, 22, 0x570020000007); /* b.eq w57, skip 7 */ ++ ++ pan_pack_ins(c, CS_LDR, cfg) { ++ cfg.offset = 4 * 10; /* Heap Start */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ pan_pack_ins(c, CS_LDR, cfg) { ++ cfg.offset = 4 * 12; /* Heap End */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4c; ++ } ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = (1 << 0) | (1 << 3); } ++ ++ pan_pack_ins(c, CS_HEAPCLEAR, cfg) { ++ cfg.start = 0x4a; ++ cfg.end = 0x4c; ++ cfg.slots = 1 << 3; ++ } ++ ++ /* Reset the fields so that the clear operation isn't done again */ ++ pan_emit_cs_48(c, 0x4a, 0); ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.offset = 4 * 10; /* Heap Start */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ pan_pack_ins(c, CS_STR, cfg) { ++ cfg.offset = 4 * 12; /* Heap End */ ++ cfg.register_mask = 0x3; ++ cfg.addr = 0x56; ++ cfg.register_base = 0x4a; ++ } ++ ++ /* Branch target for above branch */ ++ ++ // This seems to be done by the HEAPCLEAR ++ //pan_pack_ins(c, CS_HEAPINC, cfg) { ++ // cfg.type = MALI_HEAP_STATISTIC_FRAGMENT_END; ++ //} ++ } ++ ++ if (fragment) { ++ pan_emit_cs_32(c, 0x54, 0); ++ pan_emit_cs_ins(c, 0x24, 0x2540000f80211); ++ pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1 << 1; } ++ } ++ ++ { ++ // This could I think be optimised to 0xf80211 rather than 0x233 ++ // TODO: Does this need to run for vertex jobs? ++ // What about when doing transform feedback? ++ // I think we at least need it for compute? ++ ++ //pan_emit_cs_32(c, 0x54, 0); ++ //pan_emit_cs_ins(c, 0x24, 0x540000000233ULL); ++ } ++ ++ if (last) { ++ uint64_t kcpu_seqnum = ++cs->kcpu_seqnum; ++ ++ pan_emit_cs_64(c, 0x40, kcpu_seqnum + 1); ++ pan_emit_cs_48(c, 0x42, cs->kcpu_event_ptr); ++ pan_pack_ins(c, CS_EVSTR_64, cfg) { ++ /* This is the scoreboard mask, right?.. */ ++ cfg.unk_2 = (3 << 3); ++ cfg.value = 0x40; ++ cfg.addr = 0x42; ++ } ++ ++ dev->mali.kcpu_cqs_wait(&dev->mali, cs->base.ctx, ++ cs->kcpu_event_ptr, kcpu_seqnum); ++ ++ int fence = dev->mali.kcpu_fence_export(&dev->mali, cs->base.ctx); ++ ++ if (fence != -1) { ++ util_dynarray_foreach(&batch->dmabufs, int, fd) { ++ panfrost_import_dmabuf_fence(*fd, fence); ++ } ++ } ++ ++ close(fence); ++ } ++ ++ pan_emit_cs_48(c, 0x48, cs->event_ptr); ++ pan_emit_cs_64(c, 0x4a, cs->seqnum + 1); ++ pan_pack_ins(c, CS_EVSTR_64, cfg) { ++ /* This is the scoreboard mask, right?.. */ ++ cfg.unk_2 = (3 << 3); ++ cfg.value = 0x4a; ++ cfg.addr = 0x48; ++ } ++ ++ // TODO: is this just a weird ddk thing, or is it required? ++ // Probably it just lessens the WC impact ++ while ((uintptr_t)c->ptr & 63) ++ pan_emit_cs_ins(c, 0, 0); ++ ++ assert(c->ptr <= limit); ++} ++ ++static void ++emit_csf_toplevel(struct panfrost_batch *batch) ++{ ++ pan_command_stream *cv = &batch->ctx->kbase_cs_vertex.cs; ++ pan_command_stream *cf = &batch->ctx->kbase_cs_fragment.cs; ++ ++ pan_command_stream v = batch->cs_vertex; ++ pan_command_stream f = batch->cs_fragment; ++ ++ if (batch->cs_vertex_last_size) { ++ assert(v.ptr <= v.end); ++ *batch->cs_vertex_last_size = (v.ptr - v.begin) * 8; ++ v = batch->cs_vertex_first; ++ } ++ ++ bool vert = (v.ptr != v.begin); ++ bool frag = (f.ptr != f.begin); ++ ++ // TODO: Clean up control-flow? 
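++ /* Flow of the two queues below, as read from the code around this point:
++  * the vertex/tiler stream is emitted first and stores its bumped event
++  * seqnum when it completes; the fragment stream is handed that seqnum and
++  * its address in registers 0x4e/0x4c, which the CS_EVWAIT_64 in
++  * emit_csf_queue() uses to wait for the vertex work before any fragment
++  * work starts.  A vertex-only batch skips the fragment queue entirely, and
++  * a fragment-only batch (e.g. a blit) skips the wait. */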
++ ++ if (vert) { ++ pan_emit_cs_48(cv, 0x48, batch->ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(cv, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ ++ emit_csf_queue(batch, &batch->ctx->kbase_cs_vertex, v, ++ &batch->vert_deps, true, !frag); ++ } ++ ++ if (!frag) ++ return; ++ ++ pan_emit_cs_48(cf, 0x48, batch->ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(cf, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ ++ uint64_t vertex_seqnum = batch->ctx->kbase_cs_vertex.seqnum; ++ // TODO: this assumes SAME_VA ++ mali_ptr seqnum_ptr = (uintptr_t) batch->ctx->kbase_cs_vertex.event_ptr; ++ ++ pan_emit_cs_48(cf, 0x4c, seqnum_ptr); ++ pan_emit_cs_64(cf, 0x4e, vertex_seqnum); ++ ++ // What does this instruction do? ++ //pan_emit_cs_32(cf, 0x54, 0); ++ //pan_emit_cs_ins(cf, 0x24, 0x540000000200); ++ ++ assert(vert || batch->tiler_ctx.bifrost == 0); ++ pan_emit_cs_48(cf, 0x56, batch->tiler_ctx.bifrost); ++ ++ emit_csf_queue(batch, &batch->ctx->kbase_cs_fragment, f, ++ &batch->frag_deps, !vert, true); ++} ++ ++static void ++init_cs(struct panfrost_context *ctx, struct panfrost_cs *cs) ++{ ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ pan_command_stream *c = &cs->cs; ++ ++ cs->seqnum = 0; ++ ++ cs->offset = 0; ++ c->ptr = cs->bo->ptr.cpu; ++ c->begin = cs->bo->ptr.cpu; ++ c->end = cs->bo->ptr.cpu + cs->base.size; ++ c->gpu = cs->bo->ptr.gpu; ++ ++ // eight instructions == 64 bytes ++ pan_pack_ins(c, CS_RESOURCES, cfg) { cfg.mask = cs->hw_resources; } ++ pan_pack_ins(c, CS_SLOT, cfg) { cfg.index = 2; } ++ pan_emit_cs_48(c, 0x48, ctx->kbase_ctx->tiler_heap_va); ++ pan_pack_ins(c, CS_HEAPCTX, cfg) { cfg.address = 0x48; } ++ for (unsigned i = 0; i < 4; ++i) ++ pan_pack_ins(c, CS_NOP, _); ++ ++ dev->mali.cs_submit(&dev->mali, &cs->base, 64, NULL, 0); ++ //dev->mali.cs_wait(&dev->mali, &cs->base, 64); ++} ++ ++#endif ++ + #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c; + + static uint8_t +@@ -2904,14 +3315,14 @@ panfrost_draw_emit_vertex(struct panfrost_batch *batch, + #endif + + static void +-panfrost_emit_primitive_size(struct panfrost_context *ctx, ++panfrost_emit_primitive_size(struct panfrost_batch *batch, + bool points, mali_ptr size_array, + void *prim_size) + { +- struct panfrost_rasterizer *rast = ctx->rasterizer; ++ struct panfrost_rasterizer *rast = batch->ctx->rasterizer; + +- pan_pack(prim_size, PRIMITIVE_SIZE, cfg) { +- if (panfrost_writes_point_size(ctx)) { ++ pan_pack_cs_v10(prim_size, &batch->cs_vertex, PRIMITIVE_SIZE, cfg) { ++ if (panfrost_writes_point_size(batch->ctx)) { + cfg.size_array = size_array; + } else { + cfg.constant = points ? 
+@@ -3037,6 +3448,43 @@ panfrost_update_state_3d(struct panfrost_batch *batch) + } + + #if PAN_ARCH >= 6 ++ ++#if PAN_ARCH >= 10 ++static mali_ptr ++panfrost_get_tiler_heap_desc(struct panfrost_batch *batch) ++{ ++ struct panfrost_context *ctx = batch->ctx; ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ ++ if (ctx->tiler_heap_desc) ++ return ctx->tiler_heap_desc->ptr.gpu; ++ ++ ctx->tiler_heap_desc = panfrost_bo_create(dev, 4096, 0, "Tiler heap descriptor"); ++ ++ pan_pack(ctx->tiler_heap_desc->ptr.cpu, TILER_HEAP, heap) { ++ heap.size = ctx->kbase_ctx->tiler_heap_chunk_size; ++ heap.base = ctx->kbase_ctx->tiler_heap_header; ++ heap.bottom = heap.base + 64; ++ heap.top = heap.base + heap.size; ++ } ++ ++ return ctx->tiler_heap_desc->ptr.gpu; ++} ++#else ++static mali_ptr ++panfrost_get_tiler_heap_desc(struct panfrost_batch *batch) ++{ ++ struct panfrost_device *dev = pan_device(batch->ctx->base.screen); ++ ++ struct panfrost_ptr t = ++ pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); ++ ++ GENX(pan_emit_tiler_heap)(dev, t.cpu); ++ ++ return t.gpu; ++} ++#endif ++ + static mali_ptr + panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count) + { +@@ -3048,18 +3496,32 @@ panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_c + if (batch->tiler_ctx.bifrost) + return batch->tiler_ctx.bifrost; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); ++ mali_ptr heap = panfrost_get_tiler_heap_desc(batch); + +- GENX(pan_emit_tiler_heap)(dev, t.cpu); ++ mali_ptr scratch = 0; ++ ++#if PAN_ARCH >= 10 ++ // TODO: Dynamically size? ++ unsigned scratch_bits = 16; ++ ++ /* Allocate scratch space for vertex positions / point sizes */ ++ // TODO: Should this be shared? ++ struct panfrost_ptr sc = ++ pan_pool_alloc_aligned(&batch->pool.base, 1 << scratch_bits, 4096); ++ ++ /* I think the scratch size is passed in the low bits of the ++ * pointer... but trying to go above 16 gives a CS_INHERIT_FAULT. 
++ */ ++ scratch = sc.gpu + scratch_bits; ++#endif + +- mali_ptr heap = t.gpu; ++ struct panfrost_ptr t = ++ pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); + +- t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); + GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height, + util_framebuffer_get_num_samples(&batch->key), + pan_tristate_get(batch->first_provoking_vertex), +- heap, t.cpu); ++ heap, scratch, t.cpu); + + batch->tiler_ctx.bifrost = t.gpu; + return batch->tiler_ctx.bifrost; +@@ -3070,18 +3532,19 @@ panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_c + * jobs and Valhall IDVS jobs + */ + static void +-panfrost_emit_primitive(struct panfrost_context *ctx, ++panfrost_emit_primitive(struct panfrost_batch *batch, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw, + mali_ptr indices, bool secondary_shader, void *out) + { +- UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; ++ struct panfrost_context *ctx = batch->ctx; ++ struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + + bool lines = (info->mode == PIPE_PRIM_LINES || + info->mode == PIPE_PRIM_LINE_LOOP || + info->mode == PIPE_PRIM_LINE_STRIP); + +- pan_pack(out, PRIMITIVE, cfg) { ++ pan_pack_cs_v10(out, &batch->cs_vertex, PRIMITIVE, cfg) { + cfg.draw_mode = pan_draw_mode(info->mode); + if (panfrost_writes_point_size(ctx)) + cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16; +@@ -3113,12 +3576,20 @@ panfrost_emit_primitive(struct panfrost_context *ctx, + + /* Non-fixed restart indices should have been lowered */ + assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info)); ++ ++ /* TODO: This is in a hot function, optimise? */ ++ if (ctx->pipe_viewport.scale[2] > 0) { ++ cfg.low_depth_cull = rast->depth_clip_near; ++ cfg.high_depth_cull = rast->depth_clip_far; ++ } else { ++ cfg.low_depth_cull = rast->depth_clip_far; ++ cfg.high_depth_cull = rast->depth_clip_near; ++ } + #endif + + cfg.index_count = ctx->indirect_draw ? 1 : draw->count; + cfg.index_type = panfrost_translate_index_size(info->index_size); + +- + if (PAN_ARCH >= 9) { + /* Base vertex offset on Valhall is used for both + * indexed and non-indexed draws, in a simple way for +@@ -3240,7 +3711,7 @@ panfrost_emit_draw(void *out, + struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + bool polygon = (prim == PIPE_PRIM_TRIANGLES); + +- pan_pack(out, DRAW, cfg) { ++ pan_pack_cs_v10(out, &batch->cs_vertex, DRAW, cfg) { + /* + * From the Gallium documentation, + * pipe_rasterizer_state::cull_face "indicates which faces of +@@ -3270,6 +3741,7 @@ panfrost_emit_draw(void *out, + ctx->prog[PIPE_SHADER_FRAGMENT]; + + cfg.multisample_enable = rast->multisample; ++ + cfg.sample_mask = rast->multisample ? 
ctx->sample_mask : 0xFFFF; + + /* Use per-sample shading if required by API Also use it when a +@@ -3283,7 +3755,10 @@ panfrost_emit_draw(void *out, + + cfg.single_sampled_lines = !rast->multisample; + ++ /* This is filled in by hardware on v10 */ ++#if PAN_ARCH < 10 + cfg.vertex_array.packet = true; ++#endif + + cfg.minimum_z = batch->minimum_z; + cfg.maximum_z = batch->maximum_z; +@@ -3411,14 +3886,18 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + */ + secondary_shader &= fs_required; + +- panfrost_emit_primitive(ctx, info, draw, 0, secondary_shader, ++#if PAN_ARCH < 10 ++ panfrost_emit_primitive(batch, info, draw, 0, secondary_shader, + pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE)); ++#else ++ panfrost_emit_primitive(batch, info, draw, 0, secondary_shader, job); ++#endif + +- pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { + cfg.count = info->instance_count; + } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { + if (secondary_shader) { + unsigned v = vs->info.varyings.output_count; + unsigned f = fs->info.varyings.input_count; +@@ -3427,34 +3906,45 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + unsigned size = slots * 16; + + /* Assumes 16 byte slots. We could do better. */ ++#if PAN_ARCH < 10 + cfg.vertex_packet_stride = size + 16; ++#endif + cfg.vertex_attribute_stride = size; + } else { + /* Hardware requirement for "no varyings" */ ++#if PAN_ARCH < 10 + cfg.vertex_packet_stride = 16; ++#endif + cfg.vertex_attribute_stride = 0; + } + } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, TILER, cfg) { + cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); + } + ++ /* For v10, the scissor is emitted directly by ++ * panfrost_emit_viewport */ ++#if PAN_ARCH < 10 + STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR)); + memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), + &batch->scissor, pan_size(SCISSOR)); ++#endif + +- panfrost_emit_primitive_size(ctx, info->mode == PIPE_PRIM_POINTS, 0, ++ panfrost_emit_primitive_size(batch, info->mode == PIPE_PRIM_POINTS, 0, + pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE)); + +- pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, INDICES, cfg) { + cfg.address = indices; ++#if PAN_ARCH >= 10 ++ cfg.size = draw->count * info->index_size; ++#endif + } + + panfrost_emit_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), + batch, fs_required, u_reduced_prim(info->mode), 0, 0, 0); + +- pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) { ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, POSITION, cfg) { + /* IDVS/points vertex shader */ + mali_ptr vs_ptr = batch->rsd[PIPE_SHADER_VERTEX]; + +@@ -3464,20 +3954,21 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch, + + panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, vs_ptr, + batch->tls.gpu); +- } + +- pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) { +- /* If a varying shader is used, we configure it with the same +- * state as the position shader for backwards compatible +- * behaviour with Bifrost. This could be optimized. 
+- */ +- if (!secondary_shader) continue; ++ pan_section_pack_cs_v10(job, &batch->cs_vertex, MALLOC_VERTEX_JOB, VARYING, vary) { ++ /* If a varying shader is used, we configure it with the same ++ * state as the position shader for backwards compatible ++ * behaviour with Bifrost. This could be optimized. ++ */ ++ if (!secondary_shader) continue; + +- mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] + ++ mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] + + (2 * pan_size(SHADER_PROGRAM)); + +- panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, +- ptr, batch->tls.gpu); ++ vary.shader = ptr; ++ ++ // TODO: Fix this function for v9! ++ } + } + } + #endif +@@ -3492,12 +3983,10 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch, + mali_ptr pos, mali_ptr psiz, bool secondary_shader, + void *job) + { +- struct panfrost_context *ctx = batch->ctx; +- + void *section = pan_section_ptr(job, TILER_JOB, INVOCATION); + memcpy(section, invocation_template, pan_size(INVOCATION)); + +- panfrost_emit_primitive(ctx, info, draw, indices, secondary_shader, ++ panfrost_emit_primitive(batch, info, draw, indices, secondary_shader, + pan_section_ptr(job, TILER_JOB, PRIMITIVE)); + + void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE); +@@ -3514,7 +4003,7 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch, + panfrost_emit_draw(pan_section_ptr(job, TILER_JOB, DRAW), + batch, true, prim, pos, fs_vary, varyings); + +- panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size); ++ panfrost_emit_primitive_size(batch, prim == PIPE_PRIM_POINTS, psiz, prim_size); + } + #endif + +@@ -3526,8 +4015,8 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + { + struct panfrost_context *ctx = batch->ctx; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); ++ UNUSED struct panfrost_ptr t = ++ pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); + + /* Nothing to do */ + if (batch->ctx->streamout.num_targets == 0) +@@ -3556,7 +4045,7 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->rsd[PIPE_SHADER_VERTEX] = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX); + + #if PAN_ARCH >= 9 +- pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { ++ pan_section_pack_cs_v10(t.cpu, &batch->cs_vertex, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; +@@ -3569,15 +4058,20 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->rsd[PIPE_SHADER_VERTEX], + batch->tls.gpu); + ++#if PAN_ARCH < 10 + /* TODO: Indexing. Also, this is a legacy feature... */ + cfg.compute.attribute_offset = batch->ctx->offset_start; ++#endif + + /* Transform feedback shaders do not use barriers or shared + * memory, so we may merge workgroups. + */ + cfg.allow_merging_workgroups = true; ++ ++#if PAN_ARCH < 10 + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; ++#endif + } + #else + struct mali_invocation_packed invocation; +@@ -3593,12 +4087,20 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0, + attribs, attrib_bufs, t.cpu); + #endif ++#if PAN_ARCH >= 10 ++ // TODO: Use a seperate compute queue? 
++ pan_pack_ins(&batch->cs_vertex, COMPUTE_LAUNCH, cfg) { ++ // TODO v10: Set parameters ++ } ++ batch->scoreboard.first_job = 1; ++#else + enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE; + #if PAN_ARCH <= 5 + job_type = MALI_JOB_TYPE_VERTEX; + #endif + panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type, + true, false, 0, 0, &t, false); ++#endif + + ctx->uncompiled[PIPE_SHADER_VERTEX] = vs_uncompiled; + ctx->prog[PIPE_SHADER_VERTEX] = vs; +@@ -3607,6 +4109,54 @@ panfrost_launch_xfb(struct panfrost_batch *batch, + batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push; + } + ++#if PAN_ARCH >= 10 ++static pan_command_stream ++panfrost_batch_create_cs(struct panfrost_batch *batch, unsigned count) ++{ ++ struct panfrost_ptr cs = pan_pool_alloc_aligned(&batch->pool.base, count * 8, 64); ++ ++ return (pan_command_stream) { ++ .ptr = cs.cpu, ++ .begin = cs.cpu, ++ .end = cs.cpu + count, ++ .gpu = cs.gpu, ++ }; ++} ++ ++static uint64_t * ++panfrost_cs_vertex_allocate_instrs(struct panfrost_batch *batch, unsigned count) ++{ ++ /* Doing a tail call to another buffer takes three instructions */ ++ count += 3; ++ ++ pan_command_stream v = batch->cs_vertex; ++ ++ if (v.ptr + count > v.end) { ++ batch->cs_vertex = panfrost_batch_create_cs(batch, MAX2(count, 1 << 13)); ++ ++ /* The size will be filled in later. */ ++ uint32_t *last_size = (uint32_t *)v.ptr; ++ pan_emit_cs_32(&v, 0x5e, 0); ++ ++ pan_emit_cs_48(&v, 0x5c, batch->cs_vertex.gpu); ++ pan_pack_ins(&v, CS_TAILCALL, cfg) { cfg.address = 0x5c; cfg.length = 0x5e; } ++ ++ assert(v.ptr <= v.end); ++ ++ /* This is not strictly required, but makes disassembly look ++ * nicer */ ++ if (batch->cs_vertex_last_size) ++ *batch->cs_vertex_last_size = (v.ptr - v.begin) * 8; ++ ++ batch->cs_vertex_last_size = last_size; ++ if (!batch->cs_vertex_first.gpu) ++ batch->cs_vertex_first = v; ++ } ++ ++ return batch->cs_vertex.ptr + count; ++} ++#endif ++ + static void + panfrost_direct_draw(struct panfrost_batch *batch, + const struct pipe_draw_info *info, +@@ -3618,6 +4168,11 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + struct panfrost_context *ctx = batch->ctx; + ++#if PAN_ARCH >= 10 ++ /* TODO: We don't need quite so much space */ ++ uint64_t *limit = panfrost_cs_vertex_allocate_instrs(batch, 64); ++#endif ++ + /* If we change whether we're drawing points, or whether point sprites + * are enabled (specified in the rasterizer), we may need to rebind + * shaders accordingly. 
This implicitly covers the case of rebinding +@@ -3647,18 +4202,19 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + UNUSED struct panfrost_ptr tiler, vertex; + +- if (idvs) { + #if PAN_ARCH >= 9 +- tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB); +-#elif PAN_ARCH >= 6 ++ tiler = pan_pool_alloc_desc_cs_v10(&batch->pool.base, MALLOC_VERTEX_JOB); ++#else /* PAN_ARCH < 9 */ ++ if (idvs) { ++#if PAN_ARCH >= 6 + tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB); +-#else +- unreachable("IDVS is unsupported on Midgard"); + #endif ++ unreachable("IDVS is unsupported on Midgard"); + } else { +- vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); +- tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB); ++ vertex = pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); ++ tiler = pan_pool_alloc_desc_cs_v10(&batch->pool.base, TILER_JOB); + } ++#endif /* PAN_ARCH */ + + unsigned vertex_count = ctx->vertex_count; + +@@ -3726,7 +4282,7 @@ panfrost_direct_draw(struct panfrost_batch *batch, + + mali_ptr attribs, attrib_bufs; + attribs = panfrost_emit_vertex_data(batch, &attrib_bufs); +-#endif ++#endif /* PAN_ARCH <= 7 */ + + panfrost_update_state_3d(batch); + panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX); +@@ -3752,13 +4308,25 @@ panfrost_direct_draw(struct panfrost_batch *batch, + #if PAN_ARCH >= 9 + assert(idvs && "Memory allocated IDVS required on Valhall"); + +- panfrost_emit_malloc_vertex(batch, info, draw, indices, +- secondary_shader, tiler.cpu); ++ panfrost_emit_malloc_vertex(batch, info, draw, indices, secondary_shader, tiler.cpu); + ++#if PAN_ARCH >= 10 ++ pan_pack_ins(&batch->cs_vertex, IDVS_LAUNCH, _); ++ /* TODO: Find a better way to specify that there were jobs */ ++ batch->scoreboard.first_job = 1; ++ batch->scoreboard.first_tiler = NULL + 1; ++ ++ /* Make sure we didn't use more CS instructions than we allocated ++ * space for */ ++ assert(batch->cs_vertex.ptr <= limit); ++ ++#else /* PAN_ARCH < 10 */ + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0, + 0, &tiler, false); +-#else ++#endif ++#else /* PAN_ARCH < 9 */ ++ + /* Fire off the draw itself */ + panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices, + fs_vary, varyings, pos, psiz, secondary_shader, +@@ -3773,7 +4341,7 @@ panfrost_direct_draw(struct panfrost_batch *batch, + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_INDEXED_VERTEX, false, false, + 0, 0, &tiler, false); +-#endif ++#endif /* PAN_ARCH < 6 */ + } else { + panfrost_draw_emit_vertex(batch, info, &invocation, + vs_vary, varyings, attribs, attrib_bufs, vertex.cpu); +@@ -4102,8 +4670,8 @@ panfrost_launch_grid(struct pipe_context *pipe, + + ctx->compute_grid = info; + +- struct panfrost_ptr t = +- pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); ++ UNUSED struct panfrost_ptr t = ++ pan_pool_alloc_desc_cs_v10(&batch->pool.base, COMPUTE_JOB); + + /* Invoke according to the grid info */ + +@@ -4143,7 +4711,7 @@ panfrost_launch_grid(struct pipe_context *pipe, + #else + struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE]; + +- pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { ++ pan_section_pack_cs_v10(t.cpu, &batch->cs_vertex, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = info->block[0]; + cfg.workgroup_size_y = info->block[1]; + cfg.workgroup_size_z = info->block[2]; +@@ -4166,12 +4734,14 @@ panfrost_launch_grid(struct pipe_context *pipe, + cs->info.cs.allow_merging_workgroups && + 
(info->variable_shared_mem == 0); + ++#if PAN_ARCH < 10 + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; ++#endif + } + #endif + +- unsigned indirect_dep = 0; ++ UNUSED unsigned indirect_dep = 0; // TODO v10 (unused) + #if PAN_GPU_INDIRECTS + if (info->indirect) { + struct pan_indirect_dispatch_info indirect = { +@@ -4191,9 +4761,17 @@ panfrost_launch_grid(struct pipe_context *pipe, + } + #endif + ++#if PAN_ARCH >= 10 ++ pan_pack_ins(&batch->cs_vertex, COMPUTE_LAUNCH, cfg) { ++ /* TODO: Change this as needed */ ++ cfg.unk_1 = 512; ++ } ++ batch->scoreboard.first_job = 1; ++#else + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_COMPUTE, true, false, + indirect_dep, 0, &t, false); ++#endif + panfrost_flush_all_batches(ctx, "Launch grid post-barrier"); + } + +@@ -4453,6 +5031,30 @@ panfrost_create_sampler_view( + return (struct pipe_sampler_view *) so; + } + ++static void ++panfrost_init_logicop_blend_state(struct panfrost_blend_state *so) ++{ ++ for (unsigned c = 0; c < so->pan.rt_count; ++c) { ++ unsigned g = so->base.independent_blend_enable ? c : 0; ++ const struct pipe_rt_blend_state pipe = so->base.rt[g]; ++ ++ struct pan_blend_equation equation = {0}; ++ ++ equation.color_mask = pipe.colormask; ++ equation.blend_enable = false; ++ ++ so->info[c] = (struct pan_blend_info) { ++ .enabled = (pipe.colormask != 0), ++ .load_dest = true, ++ .fixed_function = false, ++ }; ++ ++ so->pan.rts[c].equation = equation; ++ ++ so->load_dest_mask |= BITFIELD_BIT(c); ++ } ++} ++ + /* A given Gallium blend state can be encoded to the hardware in numerous, + * dramatically divergent ways due to the interactions of blending with + * framebuffer formats. Conceptually, there are two modes: +@@ -4492,6 +5094,11 @@ panfrost_create_blend_state(struct pipe_context *pipe, + so->pan.logicop_func = blend->logicop_func; + so->pan.rt_count = blend->max_rt + 1; + ++ if (blend->logicop_enable) { ++ panfrost_init_logicop_blend_state(so); ++ return so; ++ } ++ + for (unsigned c = 0; c < so->pan.rt_count; ++c) { + unsigned g = blend->independent_blend_enable ? c : 0; + const struct pipe_rt_blend_state pipe = blend->rt[g]; +@@ -4521,12 +5128,10 @@ panfrost_create_blend_state(struct pipe_context *pipe, + .opaque = pan_blend_is_opaque(equation), + .constant_mask = constant_mask, + +- /* TODO: check the dest for the logicop */ +- .load_dest = blend->logicop_enable || +- pan_blend_reads_dest(equation), ++ .load_dest = pan_blend_reads_dest(equation), + + /* Could this possibly be fixed-function? */ +- .fixed_function = !blend->logicop_enable && ++ .fixed_function = + pan_blend_can_fixed_function(equation, + supports_2src) && + (!constant_mask || +@@ -4612,10 +5217,12 @@ prepare_shader(struct panfrost_compiled_shader *state, + + state->state = panfrost_pool_take_ref(pool, ptr.gpu); + ++ // TODO: Why set primary_shader to false again? 
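/* Side note (illustrative only): the preload.r48_r63 field packed below
 * appears to be just the top 16 bits of the 64-bit register-preload mask,
 * one bit per register r48..r63. Standalone sketch, not project code;
 * requires <stdint.h>. */
static inline uint16_t
preload_r48_r63(uint64_t preload_mask)
{
   return (uint16_t)(preload_mask >> 48); /* bits 48..63 -> r48..r63 */
}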
++ + /* Generic, or IDVS/points */ + pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) { + cfg.stage = pan_shader_stage(&state->info); +- cfg.primary_shader = true; ++ cfg.primary_shader = false; + cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); + cfg.binary = state->bin.gpu; + cfg.preload.r48_r63 = (state->info.preload >> 48); +@@ -4631,7 +5238,7 @@ prepare_shader(struct panfrost_compiled_shader *state, + /* IDVS/triangles */ + pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) { + cfg.stage = pan_shader_stage(&state->info); +- cfg.primary_shader = true; ++ cfg.primary_shader = false; + cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); + cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset; + cfg.preload.r48_r63 = (state->info.preload >> 48); +@@ -4707,6 +5314,11 @@ init_batch(struct panfrost_batch *batch) + /* On Midgard, the TLS is embedded in the FB descriptor */ + batch->tls = batch->framebuffer; + #endif ++ ++#if PAN_ARCH >= 10 ++ batch->cs_vertex = panfrost_batch_create_cs(batch, 1 << 13); ++ batch->cs_fragment = panfrost_batch_create_cs(batch, 1 << 9); ++#endif + } + + static void +@@ -4821,6 +5433,10 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen) + screen->vtbl.init_polygon_list = init_polygon_list; + screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options); + screen->vtbl.compile_shader = GENX(pan_shader_compile); ++#if PAN_ARCH >= 10 ++ screen->vtbl.emit_csf_toplevel = emit_csf_toplevel; ++ screen->vtbl.init_cs = init_cs; ++#endif + + GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base, + &screen->blitter.desc_pool.base); diff --git a/src/gallium/drivers/panfrost/pan_context.c.rej b/src/gallium/drivers/panfrost/pan_context.c.rej new file mode 100644 index 00000000000..8005be06871 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_context.c.rej @@ -0,0 +1,178 @@ +diff a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c (rejected hunks) +@@ -34,7 +34,6 @@ + + #include "util/macros.h" + #include "util/format/u_format.h" +-#include "util/libsync.h" + #include "util/u_inlines.h" + #include "util/u_upload_mgr.h" + #include "util/u_memory.h" +@@ -571,6 +570,19 @@ panfrost_destroy(struct pipe_context *pipe) + struct panfrost_context *panfrost = pan_context(pipe); + struct panfrost_device *dev = pan_device(pipe->screen); + ++ if (dev->kbase && dev->mali.context_create) { ++ dev->mali.cs_term(&dev->mali, &panfrost->kbase_cs_vertex.base); ++ dev->mali.cs_term(&dev->mali, &panfrost->kbase_cs_fragment.base); ++ ++ dev->mali.context_destroy(&dev->mali, panfrost->kbase_ctx); ++ ++ panfrost_bo_unreference(panfrost->kbase_cs_vertex.bo); ++ panfrost_bo_unreference(panfrost->kbase_cs_fragment.bo); ++ } ++ ++ if (panfrost->tiler_heap_desc) ++ panfrost_bo_unreference(panfrost->tiler_heap_desc); ++ + _mesa_hash_table_destroy(panfrost->writers, NULL); + + if (panfrost->blitter) +@@ -582,11 +594,15 @@ panfrost_destroy(struct pipe_context *pipe) + panfrost_pool_cleanup(&panfrost->descs); + panfrost_pool_cleanup(&panfrost->shaders); + +- drmSyncobjDestroy(dev->fd, panfrost->in_sync_obj); +- if (panfrost->in_sync_fd != -1) +- close(panfrost->in_sync_fd); ++ if (dev->kbase) { ++ dev->mali.syncobj_destroy(&dev->mali, panfrost->syncobj_kbase); ++ } else { ++ drmSyncobjDestroy(dev->fd, panfrost->in_sync_obj); ++ if (panfrost->in_sync_fd != -1) ++ close(panfrost->in_sync_fd); + +- drmSyncobjDestroy(dev->fd, panfrost->syncobj); ++ 
drmSyncobjDestroy(dev->fd, panfrost->syncobj); ++ } + ralloc_free(pipe); + } + +@@ -873,6 +889,58 @@ panfrost_create_fence_fd(struct pipe_context *pctx, + *pfence = panfrost_fence_from_fd(pan_context(pctx), fd, type); + } + ++struct sync_merge_data { ++ char name[32]; ++ int32_t fd2; ++ int32_t fence; ++ uint32_t flags; ++ uint32_t pad; ++}; ++ ++#define SYNC_IOC_MAGIC '>' ++#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data) ++ ++static inline int sync_merge(const char *name, int fd1, int fd2) ++{ ++ struct sync_merge_data data = {{0}}; ++ int ret; ++ ++ data.fd2 = fd2; ++ strncpy(data.name, name, sizeof(data.name)); ++ ++ do { ++ ret = ioctl(fd1, SYNC_IOC_MERGE, &data); ++ } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); ++ ++ if (ret < 0) ++ return ret; ++ ++ return data.fence; ++} ++ ++static inline int sync_accumulate(const char *name, int *fd1, int fd2) ++{ ++ int ret; ++ ++ assert(fd2 >= 0); ++ ++ if (*fd1 < 0) { ++ *fd1 = dup(fd2); ++ return 0; ++ } ++ ++ ret = sync_merge(name, *fd1, fd2); ++ if (ret < 0) { ++ /* leave *fd1 as it is */ ++ return ret; ++ } ++ ++ close(*fd1); ++ *fd1 = ret; ++ ++ return 0; ++} ++ + static void + panfrost_fence_server_sync(struct pipe_context *pctx, + struct pipe_fence_handle *f) +@@ -888,6 +956,28 @@ panfrost_fence_server_sync(struct pipe_context *pctx, + close(fd); + } + ++static struct panfrost_cs ++panfrost_cs_create(struct panfrost_context *ctx, unsigned size, unsigned mask) ++{ ++ struct panfrost_screen *screen = pan_screen(ctx->base.screen); ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ struct kbase_context *kctx = ctx->kbase_ctx; ++ ++ struct panfrost_cs c = {0}; ++ ++ c.bo = panfrost_bo_create(dev, size, 0, "Command stream"); ++ ++ c.base = dev->mali.cs_bind(&dev->mali, kctx, c.bo->ptr.gpu, size); ++ ++ c.event_ptr = dev->mali.event_mem.gpu + c.base.event_mem_offset * PAN_EVENT_SIZE; ++ c.kcpu_event_ptr = dev->mali.kcpu_event_mem.gpu + c.base.event_mem_offset * PAN_EVENT_SIZE; ++ ++ c.hw_resources = mask; ++ screen->vtbl.init_cs(ctx, &c); ++ ++ return c; ++} ++ + struct pipe_context * + panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + { +@@ -981,6 +1071,14 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + + assert(ctx->blitter); + ++ if (dev->kbase && dev->mali.context_create) ++ ctx->kbase_ctx = dev->mali.context_create(&dev->mali); ++ ++ if (dev->arch >= 10) { ++ ctx->kbase_cs_vertex = panfrost_cs_create(ctx, 65536, 13); ++ ctx->kbase_cs_fragment = panfrost_cs_create(ctx, 65536, 2); ++ } ++ + /* Prepare for render! */ + + /* By default mask everything on */ +@@ -992,13 +1090,18 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) + /* Create a syncobj in a signaled state. Will be updated to point to the + * last queued job out_sync every time we submit a new job. + */ +- ret = drmSyncobjCreate(dev->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &ctx->syncobj); +- assert(!ret && ctx->syncobj); +- +- /* Sync object/FD used for NATIVE_FENCE_FD. */ +- ctx->in_sync_fd = -1; +- ret = drmSyncobjCreate(dev->fd, 0, &ctx->in_sync_obj); +- assert(!ret); ++ if (dev->kbase) { ++ ctx->syncobj_kbase = dev->mali.syncobj_create(&dev->mali); ++ ctx->in_sync_fd = -1; ++ } else { ++ ret = drmSyncobjCreate(dev->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &ctx->syncobj); ++ assert(!ret && ctx->syncobj); ++ ++ /* Sync object/FD used for NATIVE_FENCE_FD. 
*/ ++ ctx->in_sync_fd = -1; ++ ret = drmSyncobjCreate(dev->fd, 0, &ctx->in_sync_obj); ++ assert(!ret); ++ } + + return gallium; + } diff --git a/src/gallium/drivers/panfrost/pan_context.h.rej b/src/gallium/drivers/panfrost/pan_context.h.rej new file mode 100644 index 00000000000..1f8d89a2948 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_context.h.rej @@ -0,0 +1,42 @@ +diff a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h (rejected hunks) +@@ -117,6 +117,19 @@ struct panfrost_streamout { + unsigned num_targets; + }; + ++// TODO: This struct is a mess ++struct panfrost_cs { ++ struct kbase_cs base; ++ struct panfrost_bo *bo; ++ pan_command_stream cs; ++ mali_ptr event_ptr; ++ uint64_t seqnum; ++ mali_ptr kcpu_event_ptr; ++ uint64_t kcpu_seqnum; ++ uint64_t offset; ++ unsigned hw_resources; ++}; ++ + struct panfrost_context { + /* Gallium context */ + struct pipe_context base; +@@ -132,6 +145,7 @@ struct panfrost_context { + + /* Sync obj used to keep track of in-flight jobs. */ + uint32_t syncobj; ++ struct kbase_syncobj *syncobj_kbase; + + /* Set of 32 batches. When the set is full, the LRU entry (the batch + * with the smallest seqnum) is flushed to free a slot. +@@ -229,6 +243,12 @@ struct panfrost_context { + + int in_sync_fd; + uint32_t in_sync_obj; ++ ++ struct kbase_context *kbase_ctx; ++ struct panfrost_bo *event_bo; ++ struct panfrost_cs kbase_cs_vertex; ++ struct panfrost_cs kbase_cs_fragment; ++ struct panfrost_bo *tiler_heap_desc; + }; + + /* Corresponds to the CSO */ diff --git a/src/gallium/drivers/panfrost/pan_disk_cache.c b/src/gallium/drivers/panfrost/pan_disk_cache.c index 5d8e4bc499d..c25269303cf 100644 --- a/src/gallium/drivers/panfrost/pan_disk_cache.c +++ b/src/gallium/drivers/panfrost/pan_disk_cache.c @@ -34,7 +34,9 @@ #include "pan_context.h" +#ifdef ENABLE_SHADER_CACHE static bool debug = false; +#endif extern int midgard_debug; extern int bifrost_debug; diff --git a/src/gallium/drivers/panfrost/pan_disk_cache.c.rej b/src/gallium/drivers/panfrost/pan_disk_cache.c.rej new file mode 100644 index 00000000000..cd344c1d62d --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_disk_cache.c.rej @@ -0,0 +1,23 @@ +diff a/src/gallium/drivers/panfrost/pan_disk_cache.c b/src/gallium/drivers/panfrost/pan_disk_cache.c (rejected hunks) +@@ -141,6 +143,8 @@ panfrost_disk_cache_retrieve(struct disk_cache *cache, + blob_copy_bytes(&blob, ptr, binary_size); + blob_copy_bytes(&blob, &binary->info, sizeof(binary->info)); + ++ free(buffer); ++ + return true; + #else + return false; +@@ -156,11 +160,7 @@ panfrost_disk_cache_init(struct panfrost_screen *screen) + #ifdef ENABLE_SHADER_CACHE + const char *renderer = screen->base.get_name(&screen->base); + +- const struct build_id_note *note = +- build_id_find_nhdr_for_addr(panfrost_disk_cache_init); +- assert(note && build_id_length(note) == 20); /* sha1 */ +- +- const uint8_t *id_sha1 = build_id_data(note); ++ const uint8_t *id_sha1 = "1"; + assert(id_sha1); + + char timestamp[41]; diff --git a/src/gallium/drivers/panfrost/pan_fence.c.rej b/src/gallium/drivers/panfrost/pan_fence.c.rej new file mode 100644 index 00000000000..280c30c29a3 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_fence.c.rej @@ -0,0 +1,66 @@ +diff a/src/gallium/drivers/panfrost/pan_fence.c b/src/gallium/drivers/panfrost/pan_fence.c (rejected hunks) +@@ -42,7 +42,10 @@ panfrost_fence_reference(struct pipe_screen *pscreen, + struct pipe_fence_handle *old = *ptr; + + if (pipe_reference(&old->reference, &fence->reference)) { 
+- drmSyncobjDestroy(dev->fd, old->syncobj); ++ if (dev->kbase) ++ dev->mali.syncobj_destroy(&dev->mali, old->kbase); ++ else ++ drmSyncobjDestroy(dev->fd, old->syncobj); + free(old); + } + +@@ -65,6 +68,13 @@ panfrost_fence_finish(struct pipe_screen *pscreen, + if (abs_timeout == OS_TIMEOUT_INFINITE) + abs_timeout = INT64_MAX; + ++ if (dev->kbase) { ++ /* TODO: Use the timeout */ ++ bool ret = dev->mali.syncobj_wait(&dev->mali, fence->kbase); ++ fence->signaled = ret; ++ return ret; ++ } ++ + ret = drmSyncobjWait(dev->fd, &fence->syncobj, + 1, + abs_timeout, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, +@@ -81,6 +91,10 @@ panfrost_fence_get_fd(struct pipe_screen *screen, + struct panfrost_device *dev = pan_device(screen); + int fd = -1; + ++ /* TODO: Export a sync file using KCPU */ ++ if (dev->kbase) ++ return fd; ++ + drmSyncobjExportSyncFile(dev->fd, f->syncobj, &fd); + return fd; + } +@@ -92,6 +106,10 @@ panfrost_fence_from_fd(struct panfrost_context *ctx, int fd, + struct panfrost_device *dev = pan_device(ctx->base.screen); + int ret; + ++ /* TODO: Implement this for kbase */ ++ if (dev->kbase) ++ return NULL; ++ + struct pipe_fence_handle *f = calloc(1, sizeof(*f)); + if (!f) + return NULL; +@@ -134,6 +152,16 @@ panfrost_fence_create(struct panfrost_context *ctx) + struct panfrost_device *dev = pan_device(ctx->base.screen); + int fd = -1, ret; + ++ if (dev->kbase) { ++ struct pipe_fence_handle *f = calloc(1, sizeof(*f)); ++ if (!f) ++ return NULL; ++ ++ f->kbase = dev->mali.syncobj_dup(&dev->mali, ctx->syncobj_kbase); ++ pipe_reference_init(&f->reference, 1); ++ return f; ++ } ++ + /* Snapshot the last rendering out fence. We'd rather have another + * syncobj instead of a sync file, but this is all we get. + * (HandleToFD/FDToHandle just gives you another syncobj ID for the diff --git a/src/gallium/drivers/panfrost/pan_fence.h.rej b/src/gallium/drivers/panfrost/pan_fence.h.rej new file mode 100644 index 00000000000..49caf91fe93 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_fence.h.rej @@ -0,0 +1,9 @@ +diff a/src/gallium/drivers/panfrost/pan_fence.h b/src/gallium/drivers/panfrost/pan_fence.h (rejected hunks) +@@ -32,6 +32,7 @@ struct panfrost_context; + struct pipe_fence_handle { + struct pipe_reference reference; + uint32_t syncobj; ++ struct kbase_syncobj *kbase; + bool signaled; + }; + diff --git a/src/gallium/drivers/panfrost/pan_job.c.rej b/src/gallium/drivers/panfrost/pan_job.c.rej new file mode 100644 index 00000000000..a9a26176e91 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_job.c.rej @@ -0,0 +1,596 @@ +diff a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c (rejected hunks) +@@ -25,6 +25,7 @@ + */ + + #include ++#include + + #include "drm-uapi/panfrost_drm.h" + +@@ -81,6 +82,14 @@ panfrost_batch_init(struct panfrost_context *ctx, + batch->resources =_mesa_set_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) ++ util_dynarray_init(&batch->resource_bos[i], NULL); ++ ++ util_dynarray_init(&batch->vert_deps, NULL); ++ util_dynarray_init(&batch->frag_deps, NULL); ++ ++ util_dynarray_init(&batch->dmabufs, NULL); ++ + /* Preallocate the main pool, since every batch has at least one job + * structure so it will be used */ + panfrost_pool_init(&batch->pool, NULL, dev, 0, 65536, "Batch pool", true, true); +@@ -96,6 +105,9 @@ panfrost_batch_init(struct panfrost_context *ctx, + + panfrost_batch_add_surface(batch, batch->key.zsbuf); + ++ if ((dev->debug & PAN_DBG_SYNC) || !(dev->debug & 
PAN_DBG_GOFASTER)) ++ batch->needs_sync = true; ++ + screen->vtbl.init_batch(batch); + } + +@@ -115,15 +127,30 @@ static void + panfrost_batch_add_resource(struct panfrost_batch *batch, + struct panfrost_resource *rsrc) + { ++ struct panfrost_context *ctx = batch->ctx; ++ struct panfrost_device *dev = pan_device(ctx->base.screen); ++ + bool found = false; + _mesa_set_search_or_add(batch->resources, rsrc, &found); + +- if (!found) { +- /* Cache number of batches accessing a resource */ +- rsrc->track.nr_users++; ++ /* Nothing to do if we already have the resource */ ++ if (found) ++ return; ++ ++ /* Cache number of batches accessing a resource */ ++ rsrc->track.nr_users++; ++ ++ /* Reference the resource on the batch */ ++ pipe_reference(NULL, &rsrc->base.reference); + +- /* Reference the resource on the batch */ +- pipe_reference(NULL, &rsrc->base.reference); ++ if (rsrc->scanout) { ++ if (dev->has_dmabuf_fence) { ++ int fd = rsrc->image.data.bo->dmabuf_fd; ++ util_dynarray_append(&batch->dmabufs, int, fd); ++ } else { ++ perf_debug_ctx(ctx, "Forcing sync on batch"); ++ batch->needs_sync = true; ++ } + } + } + +@@ -172,6 +199,10 @@ panfrost_batch_cleanup(struct panfrost_context *ctx, struct panfrost_batch *batc + { + struct panfrost_device *dev = pan_device(ctx->base.screen); + ++ /* Make sure we keep handling events, to free old BOs */ ++ if (dev->kbase) ++ kbase_ensure_handle_events(&dev->mali); ++ + assert(batch->seqnum); + + if (ctx->batch == batch) +@@ -186,10 +217,18 @@ panfrost_batch_cleanup(struct panfrost_context *ctx, struct panfrost_batch *batc + if (!flags[i]) + continue; + +- struct panfrost_bo *bo = pan_lookup_bo(dev, i); ++ struct panfrost_bo *bo = pan_lookup_bo_existing(dev, i); + panfrost_bo_unreference(bo); + } + ++ util_dynarray_fini(&batch->dmabufs); ++ ++ util_dynarray_fini(&batch->vert_deps); ++ util_dynarray_fini(&batch->frag_deps); ++ ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) ++ util_dynarray_fini(&batch->resource_bos[i]); ++ + panfrost_batch_destroy_resources(ctx, batch); + panfrost_pool_cleanup(&batch->pool); + panfrost_pool_cleanup(&batch->invisible_pool); +@@ -313,7 +352,7 @@ panfrost_batch_update_access(struct panfrost_batch *batch, + } + } + +- if (writes) { ++ if (writes && (writer != batch)) { + _mesa_hash_table_insert(ctx->writers, rsrc, batch); + rsrc->track.nr_writers++; + } +@@ -380,6 +419,12 @@ panfrost_batch_read_rsrc(struct panfrost_batch *batch, + uint32_t access = PAN_BO_ACCESS_READ | + panfrost_access_for_stage(stage); + ++ enum panfrost_usage_type type = (stage == MESA_SHADER_FRAGMENT) ? ++ PAN_USAGE_READ_FRAGMENT : PAN_USAGE_READ_VERTEX; ++ ++ util_dynarray_append(&batch->resource_bos[type], struct panfrost_bo *, ++ rsrc->image.data.bo); ++ + panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); + + if (rsrc->separate_stencil) +@@ -396,6 +441,12 @@ panfrost_batch_write_rsrc(struct panfrost_batch *batch, + uint32_t access = PAN_BO_ACCESS_WRITE | + panfrost_access_for_stage(stage); + ++ enum panfrost_usage_type type = (stage == MESA_SHADER_FRAGMENT) ? 
++ PAN_USAGE_WRITE_FRAGMENT : PAN_USAGE_WRITE_VERTEX; ++ ++ util_dynarray_append(&batch->resource_bos[type], struct panfrost_bo *, ++ rsrc->image.data.bo); ++ + panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); + + if (rsrc->separate_stencil) +@@ -489,7 +540,7 @@ panfrost_batch_get_shared_memory(struct panfrost_batch *batch, + } + + static void +-panfrost_batch_to_fb_info(const struct panfrost_batch *batch, ++panfrost_batch_to_fb_info(struct panfrost_batch *batch, + struct pan_fb_info *fb, + struct pan_image_view *rts, + struct pan_image_view *zs, +@@ -511,6 +562,7 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + fb->rt_count = batch->key.nr_cbufs; + fb->sprite_coord_origin = pan_tristate_get(batch->sprite_coord_origin); + fb->first_provoking_vertex = pan_tristate_get(batch->first_provoking_vertex); ++ fb->cs_fragment = &batch->cs_fragment; + + static const unsigned char id_swz[] = { + PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W, +@@ -604,22 +656,22 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + fb->zs.discard.z = !reserve && !(batch->resolve & PIPE_CLEAR_DEPTH); + fb->zs.discard.s = !reserve && !(batch->resolve & PIPE_CLEAR_STENCIL); + +- if (!fb->zs.clear.z && ++ if (!fb->zs.clear.z && z_rsrc && + ((batch->read & PIPE_CLEAR_DEPTH) || + ((batch->draws & PIPE_CLEAR_DEPTH) && +- z_rsrc && BITSET_TEST(z_rsrc->valid.data, z_view->first_level)))) ++ BITSET_TEST(z_rsrc->valid.data, z_view->first_level)))) + fb->zs.preload.z = true; + +- if (!fb->zs.clear.s && ++ if (!fb->zs.clear.s && s_rsrc && + ((batch->read & PIPE_CLEAR_STENCIL) || + ((batch->draws & PIPE_CLEAR_STENCIL) && +- s_rsrc && BITSET_TEST(s_rsrc->valid.data, s_view->first_level)))) ++ BITSET_TEST(s_rsrc->valid.data, s_view->first_level)))) + fb->zs.preload.s = true; + + /* Preserve both component if we have a combined ZS view and + * one component needs to be preserved. + */ +- if (s_view == z_view && fb->zs.discard.z != fb->zs.discard.s) { ++ if (z_view && s_view == z_view && fb->zs.discard.z != fb->zs.discard.s) { + bool valid = BITSET_TEST(z_rsrc->valid.data, z_view->first_level); + + fb->zs.discard.z = false; +@@ -629,6 +681,28 @@ panfrost_batch_to_fb_info(const struct panfrost_batch *batch, + } + } + ++static int ++panfrost_batch_submit_kbase(struct panfrost_device *dev, ++ struct drm_panfrost_submit *submit, ++ struct kbase_syncobj *syncobj) ++{ ++ dev->mali.handle_events(&dev->mali); ++ ++ int atom = dev->mali.submit(&dev->mali, ++ submit->jc, ++ submit->requirements, ++ syncobj, ++ (int32_t *)(uintptr_t) submit->bo_handles, ++ submit->bo_handle_count); ++ ++ if (atom == -1) { ++ errno = EINVAL; ++ return -1; ++ } ++ ++ return 0; ++} ++ + static int + panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + mali_ptr first_job_desc, +@@ -695,7 +769,7 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + * We also preserve existing flags as this batch might not + * be the first one to access the BO. 
+ */ +- struct panfrost_bo *bo = pan_lookup_bo(dev, i); ++ struct panfrost_bo *bo = pan_lookup_bo_existing(dev, i); + + bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW); + } +@@ -718,6 +792,8 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + submit.bo_handles = (u64) (uintptr_t) bo_handles; + if (ctx->is_noop) + ret = 0; ++ else if (dev->kbase) ++ ret = panfrost_batch_submit_kbase(dev, &submit, ctx->syncobj_kbase); + else + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit); + free(bo_handles); +@@ -728,8 +804,11 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + /* Trace the job if we're doing that */ + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + /* Wait so we can get errors reported back */ +- drmSyncobjWait(dev->fd, &out_sync, 1, +- INT64_MAX, 0, NULL); ++ if (dev->kbase) ++ dev->mali.syncobj_wait(&dev->mali, ctx->syncobj_kbase); ++ else ++ drmSyncobjWait(dev->fd, &out_sync, 1, ++ INT64_MAX, 0, NULL); + + if (dev->debug & PAN_DBG_TRACE) + pandecode_jc(submit.jc, dev->gpu_id); +@@ -799,6 +878,323 @@ panfrost_batch_submit_jobs(struct panfrost_batch *batch, + return ret; + } + ++#define BASE_MEM_MMU_DUMP_HANDLE (1 << 12) ++ ++static void ++mmu_dump(struct panfrost_device *dev) ++{ ++ unsigned size = 16 * 1024 * 1024; ++ ++ fprintf(stderr, "dumping MMU tables\n"); ++ sleep(3); ++ ++ void *mem = mmap(NULL, size, PROT_READ, MAP_SHARED, ++ dev->mali.fd, BASE_MEM_MMU_DUMP_HANDLE); ++ if (mem == MAP_FAILED) { ++ perror("mmap(BASE_MEM_MMU_DUMP_HANDLE)"); ++ return;; ++ } ++ ++ fprintf(stderr, "writing to file\n"); ++ sleep(1); ++ ++ char template[] = {"/tmp/mmu-dump.XXXXXX"}; ++ int fd = mkstemp(template); ++ if (fd == -1) { ++ perror("mkstemp(/tmp/mmu-dump.XXXXXX)"); ++ goto unmap; ++ } ++ ++ write(fd, mem, size); ++ close(fd); ++ ++unmap: ++ munmap(mem, size); ++} ++ ++static void ++reset_context(struct panfrost_context *ctx) ++{ ++ struct pipe_screen *pscreen = ctx->base.screen; ++ struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); ++ ++ /* Don't recover from the fault if PAN_MESA_DEBUG=sync is specified, ++ * to somewhat mimic behaviour with JM GPUs. TODO: Just abort? 
*/ ++ bool recover = !(dev->debug & PAN_DBG_SYNC); ++ ++ mesa_loge("Context reset"); ++ ++ dev->mali.cs_term(&dev->mali, &ctx->kbase_cs_vertex.base); ++ dev->mali.cs_term(&dev->mali, &ctx->kbase_cs_fragment.base); ++ ++ dev->mali.context_recreate(&dev->mali, ctx->kbase_ctx); ++ ++ //mmu_dump(dev); ++ ++ if (recover) { ++ dev->mali.cs_rebind(&dev->mali, &ctx->kbase_cs_vertex.base); ++ dev->mali.cs_rebind(&dev->mali, &ctx->kbase_cs_fragment.base); ++ } else { ++ ctx->kbase_cs_vertex.base.user_io = NULL; ++ ctx->kbase_cs_fragment.base.user_io = NULL; ++ } ++ ++ ctx->kbase_cs_vertex.base.last_insert = 0; ++ ctx->kbase_cs_fragment.base.last_insert = 0; ++ ++ screen->vtbl.init_cs(ctx, &ctx->kbase_cs_vertex); ++ screen->vtbl.init_cs(ctx, &ctx->kbase_cs_fragment); ++ ++ /* TODO: this leaks memory */ ++ ctx->tiler_heap_desc = 0; ++} ++ ++static void ++pandecode_cs_ring(struct panfrost_device *dev, struct panfrost_cs *cs, ++ uint64_t insert) ++{ ++ insert %= cs->base.size; ++ uint64_t start = cs->base.last_insert % cs->base.size; ++ ++ if (insert < start) { ++ pandecode_cs(cs->base.va + start, cs->base.size - start, dev->gpu_id); ++ start = 0; ++ } ++ ++ pandecode_cs(cs->base.va + start, insert - start, dev->gpu_id); ++} ++ ++static unsigned ++panfrost_add_dep_after(struct util_dynarray *deps, ++ struct panfrost_usage u, ++ unsigned index) ++{ ++ unsigned size = util_dynarray_num_elements(deps, struct panfrost_usage); ++ ++ for (unsigned i = index; i < size; ++i) { ++ struct panfrost_usage *d = ++ util_dynarray_element(deps, struct panfrost_usage, i); ++ ++ /* TODO: Remove d if it is an invalid entry? */ ++ ++ if ((d->queue == u.queue) && (d->write == u.write)) { ++ d->seqnum = MAX2(d->seqnum, u.seqnum); ++ return i; ++ ++ } else if (d->queue > u.queue) { ++ void *p = util_dynarray_grow(deps, struct panfrost_usage, 1); ++ assert(p); ++ memmove(util_dynarray_element(deps, struct panfrost_usage, i + 1), ++ util_dynarray_element(deps, struct panfrost_usage, i), ++ (size - i) * sizeof(struct panfrost_usage)); ++ ++ *util_dynarray_element(deps, struct panfrost_usage, i) = u; ++ return i; ++ } ++ } ++ ++ util_dynarray_append(deps, struct panfrost_usage, u); ++ return size; ++} ++ ++static void ++panfrost_update_deps(struct util_dynarray *deps, struct panfrost_bo *bo, bool write) ++{ ++ /* Both lists should be sorted, so each dependency is at a higher ++ * index than the last */ ++ unsigned index = 0; ++ util_dynarray_foreach(&bo->usage, struct panfrost_usage, u) { ++ /* read->read access does not require a dependency */ ++ if (!write && !u->write) ++ continue; ++ ++ index = panfrost_add_dep_after(deps, *u, index); ++ } ++} ++ ++static inline bool ++panfrost_usage_writes(enum panfrost_usage_type usage) ++{ ++ return (usage == PAN_USAGE_WRITE_VERTEX) || (usage == PAN_USAGE_WRITE_FRAGMENT); ++} ++ ++static inline bool ++panfrost_usage_fragment(enum panfrost_usage_type usage) ++{ ++ return (usage == PAN_USAGE_READ_FRAGMENT) || (usage == PAN_USAGE_WRITE_FRAGMENT); ++} ++ ++/* Removes invalid dependencies from deps */ ++static void ++panfrost_clean_deps(struct panfrost_device *dev, struct util_dynarray *deps) ++{ ++ kbase k = &dev->mali; ++ ++ struct panfrost_usage *rebuild = util_dynarray_begin(deps); ++ unsigned index = 0; ++ ++ util_dynarray_foreach(deps, struct panfrost_usage, u) { ++ /* Usages are ordered, so we can break here */ ++ if (u->queue >= k->event_slot_usage) ++ break; ++ ++ struct kbase_event_slot *slot = &k->event_slots[u->queue]; ++ uint64_t seqnum = u->seqnum; ++ ++ /* There is a race 
condition, where we can depend on an ++ * unsubmitted batch. In that cade, decrease the seqnum. ++ * Otherwise, skip invalid dependencies. */ ++ if (slot->last_submit == seqnum) ++ --seqnum; ++ else if (slot->last_submit < seqnum) ++ continue; ++ ++ /* This usage is valid, add it to the returned list */ ++ rebuild[index++] = (struct panfrost_usage) { ++ .queue = u->queue, ++ .write = u->write, ++ .seqnum = seqnum, ++ }; ++ } ++ ++ /* No need to check the return value, it can only shrink */ ++ (void)! util_dynarray_resize(deps, struct panfrost_usage, index); ++} ++ ++static int ++panfrost_batch_submit_csf(struct panfrost_batch *batch, ++ const struct pan_fb_info *fb) ++{ ++ struct panfrost_context *ctx = batch->ctx; ++ struct pipe_screen *pscreen = ctx->base.screen; ++ struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); ++ ++ ++ctx->kbase_cs_vertex.seqnum; ++ ++ if (panfrost_has_fragment_job(batch)) { ++ screen->vtbl.emit_fragment_job(batch, fb); ++ ++ctx->kbase_cs_fragment.seqnum; ++ } ++ ++ pthread_mutex_lock(&dev->bo_usage_lock); ++ for (unsigned i = 0; i < PAN_USAGE_COUNT; ++i) { ++ ++ bool write = panfrost_usage_writes(i); ++ pan_bo_access access = write ? PAN_BO_ACCESS_RW : PAN_BO_ACCESS_READ; ++ struct util_dynarray *deps; ++ unsigned queue; ++ uint64_t seqnum; ++ ++ if (panfrost_usage_fragment(i)) { ++ deps = &batch->frag_deps; ++ queue = ctx->kbase_cs_fragment.base.event_mem_offset; ++ seqnum = ctx->kbase_cs_fragment.seqnum; ++ } else { ++ deps = &batch->vert_deps; ++ queue = ctx->kbase_cs_vertex.base.event_mem_offset; ++ seqnum = ctx->kbase_cs_vertex.seqnum; ++ } ++ ++ util_dynarray_foreach(&batch->resource_bos[i], struct panfrost_bo *, bo) { ++ panfrost_update_deps(deps, *bo, write); ++ struct panfrost_usage u = { ++ .queue = queue, ++ .write = write, ++ .seqnum = seqnum, ++ }; ++ ++ panfrost_add_dep_after(&(*bo)->usage, u, 0); ++ (*bo)->gpu_access |= access; ++ } ++ } ++ pthread_mutex_unlock(&dev->bo_usage_lock); ++ ++ /* For now, only a single batch can use each tiler heap at once */ ++ if (ctx->tiler_heap_desc) { ++ panfrost_update_deps(&batch->vert_deps, ctx->tiler_heap_desc, true); ++ ++ struct panfrost_usage u = { ++ .queue = ctx->kbase_cs_fragment.base.event_mem_offset, ++ .write = true, ++ .seqnum = ctx->kbase_cs_fragment.seqnum, ++ }; ++ panfrost_add_dep_after(&ctx->tiler_heap_desc->usage, u, 0); ++ } ++ ++ /* TODO: Use atomics in kbase code to avoid lock? */ ++ pthread_mutex_lock(&dev->mali.queue_lock); ++ ++ panfrost_clean_deps(dev, &batch->vert_deps); ++ panfrost_clean_deps(dev, &batch->frag_deps); ++ ++ pthread_mutex_unlock(&dev->mali.queue_lock); ++ ++ screen->vtbl.emit_csf_toplevel(batch); ++ ++ uint64_t vs_offset = ctx->kbase_cs_vertex.offset + ++ (void *)ctx->kbase_cs_vertex.cs.ptr - ctx->kbase_cs_vertex.bo->ptr.cpu; ++ uint64_t fs_offset = ctx->kbase_cs_fragment.offset + ++ (void *)ctx->kbase_cs_fragment.cs.ptr - ctx->kbase_cs_fragment.bo->ptr.cpu; ++ ++ if (dev->debug & PAN_DBG_TRACE) { ++ pandecode_cs_ring(dev, &ctx->kbase_cs_vertex, vs_offset); ++ pandecode_cs_ring(dev, &ctx->kbase_cs_fragment, fs_offset); ++ } ++ ++ bool log = (dev->debug & PAN_DBG_LOG); ++ ++ // TODO: We need better synchronisation than a single fake syncobj! 
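/* A reduced, standalone illustration of the dependency bookkeeping done
 * earlier in this function: each BO carries (queue, write, seqnum) usage
 * records, and the batch folds them into at most one entry per
 * queue/direction, keeping only the highest seqnum it must wait for. This
 * sketch drops the sorted-insert optimisation of panfrost_add_dep_after()
 * and assumes the caller sized 'deps' large enough; names are placeholders,
 * requires <stdint.h> and <stdbool.h>. */
struct example_dep { unsigned queue; bool write; uint64_t seqnum; };

static void
example_merge_dep(struct example_dep *deps, unsigned *count,
                  struct example_dep u)
{
   for (unsigned i = 0; i < *count; ++i) {
      if (deps[i].queue == u.queue && deps[i].write == u.write) {
         if (u.seqnum > deps[i].seqnum)
            deps[i].seqnum = u.seqnum; /* only the latest use matters */
         return;
      }
   }

   deps[(*count)++] = u; /* first use seen for this queue/direction */
}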
++ ++ if (log) ++ printf("About to submit\n"); ++ ++ dev->mali.cs_submit(&dev->mali, &ctx->kbase_cs_vertex.base, vs_offset, ++ ctx->syncobj_kbase, ctx->kbase_cs_vertex.seqnum); ++ ++ dev->mali.cs_submit(&dev->mali, &ctx->kbase_cs_fragment.base, fs_offset, ++ ctx->syncobj_kbase, ctx->kbase_cs_fragment.seqnum); ++ ++ bool reset = false; ++ ++ // TODO: How will we know to reset a CS when waiting is not done? ++ if (batch->needs_sync) { ++ if (!dev->mali.cs_wait(&dev->mali, &ctx->kbase_cs_vertex.base, vs_offset, ctx->syncobj_kbase)) ++ reset = true; ++ ++ if (!dev->mali.cs_wait(&dev->mali, &ctx->kbase_cs_fragment.base, fs_offset, ctx->syncobj_kbase)) ++ reset = true; ++ } ++ ++ if (dev->debug & PAN_DBG_TILER) { ++ fflush(stdout); ++ FILE *stream = popen("tiler-hex-read", "w"); ++ ++ /* TODO: Dump more than just the first chunk */ ++ unsigned size = batch->ctx->kbase_ctx->tiler_heap_chunk_size; ++ uint64_t va = batch->ctx->kbase_ctx->tiler_heap_header; ++ ++ fprintf(stream, "width %i\n" "height %i\n" "mask %i\n" ++ "vaheap 0x%"PRIx64"\n" "size %i\n", ++ batch->key.width, batch->key.height, 0xfe, va, size); ++ ++ void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, ++ MAP_SHARED, dev->mali.fd, va); ++ ++ pan_hexdump(stream, ptr, size, false); ++ //memset(ptr, 0, size); ++ munmap(ptr, size); ++ ++ pclose(stream); ++ } ++ ++ if (reset) ++ reset_context(ctx); ++ ++ return 0; ++} ++ + static void + panfrost_emit_tile_map(struct panfrost_batch *batch, struct pan_fb_info *fb) + { +@@ -824,6 +1220,7 @@ panfrost_batch_submit(struct panfrost_context *ctx, + { + struct pipe_screen *pscreen = ctx->base.screen; + struct panfrost_screen *screen = pan_screen(pscreen); ++ struct panfrost_device *dev = pan_device(pscreen); + int ret; + + /* Nothing to do! */ +@@ -867,7 +1264,11 @@ panfrost_batch_submit(struct panfrost_context *ctx, + if (batch->scoreboard.first_tiler || batch->clear) + screen->vtbl.emit_fbd(batch, &fb); + +- ret = panfrost_batch_submit_jobs(batch, &fb, 0, ctx->syncobj); ++ /* TODO: Don't hardcode the arch number */ ++ if (dev->arch < 10) ++ ret = panfrost_batch_submit_jobs(batch, &fb, 0, ctx->syncobj); ++ else ++ ret = panfrost_batch_submit_csf(batch, &fb); + + if (ret) + fprintf(stderr, "panfrost_batch_submit failed: %d\n", ret); +@@ -969,6 +1370,8 @@ panfrost_batch_clear(struct panfrost_batch *batch, + for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) { + if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) + continue; ++ if (!ctx->pipe_framebuffer.cbufs[i]) ++ continue; + + enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format; + pan_pack_color(batch->clear_color[i], color, format, false); diff --git a/src/gallium/drivers/panfrost/pan_job.h.rej b/src/gallium/drivers/panfrost/pan_job.h.rej new file mode 100644 index 00000000000..69ea4d72b11 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_job.h.rej @@ -0,0 +1,42 @@ +diff a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h (rejected hunks) +@@ -79,6 +79,14 @@ pan_tristate_get(struct pan_tristate state) + return (state.v == PAN_TRISTATE_TRUE); + } + ++enum panfrost_usage_type { ++ PAN_USAGE_READ_VERTEX, ++ PAN_USAGE_WRITE_VERTEX, ++ PAN_USAGE_READ_FRAGMENT, ++ PAN_USAGE_WRITE_FRAGMENT, ++ PAN_USAGE_COUNT, ++}; ++ + /* A panfrost_batch corresponds to a bound FBO we're rendering to, + * collecting over multiple draws. */ + +@@ -194,6 +202,25 @@ struct panfrost_batch { + + /* Referenced resources, holds a pipe_reference. 
*/ + struct set *resources; ++ ++ struct util_dynarray resource_bos[PAN_USAGE_COUNT]; ++ ++ /* struct panfrost_usage */ ++ struct util_dynarray vert_deps; ++ struct util_dynarray frag_deps; ++ ++ /* Referenced dma-bufs FDs, for emitting synchronisation commands. */ ++ struct util_dynarray dmabufs; ++ ++ /* Command stream pointers for CSF Valhall. Vertex CS tracking is more ++ * complicated as there may be multiple buffers. */ ++ pan_command_stream cs_vertex; ++ uint32_t *cs_vertex_last_size; ++ pan_command_stream cs_vertex_first; ++ ++ pan_command_stream cs_fragment; ++ ++ bool needs_sync; + }; + + /* Functions for managing the above */ diff --git a/src/gallium/drivers/panfrost/pan_resource.c.rej b/src/gallium/drivers/panfrost/pan_resource.c.rej new file mode 100644 index 00000000000..e989ad133b2 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_resource.c.rej @@ -0,0 +1,426 @@ +diff a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c (rejected hunks) +@@ -33,6 +33,7 @@ + #include + #include + #include "drm-uapi/drm_fourcc.h" ++#include "drm-uapi/drm.h" + + #include "frontend/winsys_handle.h" + #include "util/format/u_format.h" +@@ -51,6 +52,46 @@ + #include "pan_tiling.h" + #include "decode.h" + ++/* The kbase kernel driver always maps imported BOs with caching. When we ++ * don't want that, instead do mmap from the display driver side to get a ++ * write-combine mapping. ++ */ ++static void ++panfrost_bo_mmap_scanout(struct panfrost_bo *bo, ++ struct renderonly *ro, ++ struct renderonly_scanout *scanout) ++{ ++ struct panfrost_device *dev = bo->dev; ++ ++ /* If we are fine with a cached mapping, just return */ ++ if (!(dev->debug & PAN_DBG_UNCACHED_CPU)) ++ return; ++ ++ struct drm_mode_map_dumb map_dumb = { ++ .handle = scanout->handle, ++ }; ++ ++ int err = drmIoctl(ro->kms_fd, DRM_IOCTL_MODE_MAP_DUMB, &map_dumb); ++ if (err < 0) { ++ fprintf(stderr, "DRM_IOCTL_MODE_MAP_DUMB failed: %s\n", ++ strerror(errno)); ++ return; ++ } ++ ++ void *addr = mmap(NULL, bo->size, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ ro->kms_fd, map_dumb.offset); ++ if (addr == MAP_FAILED) { ++ fprintf(stderr, "kms_fd mmap failed: %s\n", ++ strerror(errno)); ++ return; ++ } ++ ++ bo->munmap_ptr = bo->ptr.cpu; ++ bo->ptr.cpu = addr; ++ bo->cached = false; ++} ++ + static struct pipe_resource * + panfrost_resource_from_handle(struct pipe_screen *pscreen, + const struct pipe_resource *templat, +@@ -102,15 +143,17 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, + return NULL; + } + +- rsc->image.data.bo = panfrost_bo_import(dev, whandle->handle); ++ struct panfrost_bo *bo = panfrost_bo_import(dev, whandle->handle); + /* Sometimes an import can fail e.g. on an invalid buffer fd, out of + * memory space to mmap it etc. + */ +- if (!rsc->image.data.bo) { ++ if (!bo) { + FREE(rsc); + return NULL; + } + ++ rsc->image.data.bo = bo; ++ + rsc->modifier_constant = true; + + BITSET_SET(rsc->valid.data, 0); +@@ -122,6 +165,9 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, + /* failure is expected in some cases.. 
*/ + } + ++ if (rsc->scanout) ++ panfrost_bo_mmap_scanout(bo, dev->ro, rsc->scanout); ++ + return prsc; + } + +@@ -473,7 +519,9 @@ panfrost_resource_setup(struct panfrost_device *dev, + static void + panfrost_resource_init_afbc_headers(struct panfrost_resource *pres) + { +- panfrost_bo_mmap(pres->image.data.bo); ++ struct panfrost_bo *bo = pres->image.data.bo; ++ ++ panfrost_bo_mmap(bo); + + unsigned nr_samples = MAX2(pres->base.nr_samples, 1); + +@@ -482,16 +530,16 @@ panfrost_resource_init_afbc_headers(struct panfrost_resource *pres) + struct pan_image_slice_layout *slice = &pres->image.layout.slices[l]; + + for (unsigned s = 0; s < nr_samples; ++s) { +- void *ptr = pres->image.data.bo->ptr.cpu + +- (i * pres->image.layout.array_stride) + +- slice->offset + +- (s * slice->afbc.surface_stride); ++ size_t offset = (i * pres->image.layout.array_stride) + ++ slice->offset + ++ (s * slice->afbc.surface_stride); + + /* Zero-ed AFBC headers seem to encode a plain + * black. Let's use this pattern to keep the + * initialization simple. + */ +- memset(ptr, 0, slice->afbc.header_size); ++ memset(bo->ptr.cpu + offset, 0, slice->afbc.header_size); ++ panfrost_bo_mem_clean(bo, offset, slice->afbc.header_size); + } + } + } +@@ -643,7 +691,9 @@ panfrost_resource_create_with_modifier(struct pipe_screen *screen, + (bind & PIPE_BIND_SHADER_IMAGE) ? "Shader image" : + "Other resource"; + +- if (dev->ro && (template->bind & PIPE_BIND_SCANOUT)) { ++ /* Revert to doing a kmsro allocation for any shared BO, because kbase ++ * cannot do export */ ++ if (dev->ro && (template->bind & PAN_BIND_SHARED_MASK)) { + struct winsys_handle handle; + struct pan_block_size blocksize = panfrost_block_size(modifier, template->format); + +@@ -702,12 +752,21 @@ panfrost_resource_create_with_modifier(struct pipe_screen *screen, + free(so); + return NULL; + } ++ ++ panfrost_bo_mmap_scanout(so->image.data.bo, dev->ro, so->scanout); + } else { + /* We create a BO immediately but don't bother mapping, since we don't + * care to map e.g. FBOs which the CPU probably won't touch */ + ++ /* For now, don't cache buffers as syncing can be slow when ++ * too much memory is mapped. TODO: dynamically switch, or use ++ * the STREAM_READ etc. hints? */ ++ bool buffer = (template->target == PIPE_BUFFER); ++ unsigned cache_flag = buffer ? 0 : PAN_BO_CACHEABLE; ++ + so->image.data.bo = +- panfrost_bo_create(dev, so->image.layout.data_size, PAN_BO_DELAY_MMAP, label); ++ panfrost_bo_create(dev, so->image.layout.data_size, ++ PAN_BO_DELAY_MMAP | cache_flag, label); + + so->constant_stencil = true; + } +@@ -741,10 +800,22 @@ panfrost_resource_create_with_modifiers(struct pipe_screen *screen, + const struct pipe_resource *template, + const uint64_t *modifiers, int count) + { ++ struct panfrost_device *dev = pan_device(screen); ++ + for (unsigned i = 0; i < PAN_MODIFIER_COUNT; ++i) { +- if (drm_find_modifier(pan_best_modifiers[i], modifiers, count)) { +- return panfrost_resource_create_with_modifier(screen, template, +- pan_best_modifiers[i]); ++ uint64_t mod = pan_best_modifiers[i]; ++ ++ if (drm_is_afbc(mod) && !dev->has_afbc) ++ continue; ++ ++ if (mod != DRM_FORMAT_MOD_LINEAR && (dev->debug & PAN_DBG_LINEAR)) ++ continue; ++ ++ /* TODO: What if mod is an unsupported AFBC variant for this ++ * format? 
*/ ++ ++ if (drm_find_modifier(mod, modifiers, count)) { ++ return panfrost_resource_create_with_modifier(screen, template, mod); + } + } + +@@ -773,6 +844,71 @@ panfrost_resource_destroy(struct pipe_screen *screen, + free(rsrc); + } + ++static void ++panfrost_clear_render_target(struct pipe_context *pipe, ++ struct pipe_surface *dst, ++ const union pipe_color_union *color, ++ unsigned dstx, unsigned dsty, ++ unsigned width, unsigned height, ++ bool render_condition_enabled) ++{ ++ struct panfrost_context *ctx = pan_context(pipe); ++ ++ /* TODO: dstx, etc. */ ++ ++ struct pipe_framebuffer_state tmp = {0}; ++ util_copy_framebuffer_state(&tmp, &ctx->pipe_framebuffer); ++ ++ struct pipe_framebuffer_state fb = { ++ .width = dst->width, ++ .height = dst->height, ++ .layers = 1, ++ .samples = 1, ++ .nr_cbufs = 1, ++ .cbufs[0] = dst, ++ }; ++ pipe->set_framebuffer_state(pipe, &fb); ++ ++ struct panfrost_batch *batch = panfrost_get_fresh_batch_for_fbo(ctx, "Clear render target"); ++ panfrost_batch_clear(batch, PIPE_CLEAR_COLOR0, color, 0, 0); ++ ++ pipe->set_framebuffer_state(pipe, &tmp); ++ util_unreference_framebuffer_state(&tmp); ++} ++ ++static void ++panfrost_clear_depth_stencil(struct pipe_context *pipe, ++ struct pipe_surface *dst, ++ unsigned clear_flags, ++ double depth, unsigned stencil, ++ unsigned dstx, unsigned dsty, ++ unsigned width, unsigned height, ++ bool render_condition_enabled) ++{ ++ struct panfrost_context *ctx = pan_context(pipe); ++ ++ /* TODO: dstx, etc. */ ++ ++ struct pipe_framebuffer_state tmp = {0}; ++ util_copy_framebuffer_state(&tmp, &ctx->pipe_framebuffer); ++ ++ struct pipe_framebuffer_state fb = { ++ .width = dst->width, ++ .height = dst->height, ++ .layers = 1, ++ .samples = 1, ++ .nr_cbufs = 0, ++ .zsbuf = dst, ++ }; ++ pipe->set_framebuffer_state(pipe, &fb); ++ ++ struct panfrost_batch *batch = panfrost_get_fresh_batch_for_fbo(ctx, "Clear depth/stencil"); ++ panfrost_batch_clear(batch, clear_flags, NULL, depth, stencil); ++ ++ pipe->set_framebuffer_state(pipe, &tmp); ++ util_unreference_framebuffer_state(&tmp); ++} ++ + /* Most of the time we can do CPU-side transfers, but sometimes we need to use + * the 3D pipe for this. Let's wrap u_blitter to blit to/from staging textures. + * Code adapted from freedreno */ +@@ -968,6 +1104,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + struct panfrost_resource *staging = pan_alloc_staging(ctx, rsrc, level, box); + assert(staging); + ++ panfrost_bo_mmap(staging->image.data.bo); ++ + /* Staging resources have one LOD: level 0. Query the strides + * on this LOD. 
+ */ +@@ -990,9 +1128,11 @@ panfrost_ptr_map(struct pipe_context *pctx, + pan_blit_to_staging(pctx, transfer); + panfrost_flush_writer(ctx, staging, "AFBC read staging blit"); + panfrost_bo_wait(staging->image.data.bo, INT64_MAX, false); ++ ++ panfrost_bo_mem_invalidate(staging->image.data.bo, 0, ++ staging->image.data.bo->size); + } + +- panfrost_bo_mmap(staging->image.data.bo); + return staging->image.data.bo->ptr.cpu; + } + +@@ -1029,7 +1169,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + !(usage & PIPE_MAP_UNSYNCHRONIZED) && + !(resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && + (usage & PIPE_MAP_WRITE) && +- rsrc->track.nr_users > 0) { ++ rsrc->track.nr_users > 0 && ++ bo->size < 16 * 1024 * 1024) { + + /* When a resource to be modified is already being used by a + * pending batch, it is often faster to copy the whole BO than +@@ -1051,6 +1192,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + copy_resource = false; + } + ++ bool cache_inval = true; ++ + if (create_new_bo) { + /* Make sure we re-emit any descriptors using this resource */ + panfrost_dirty_state_all(ctx); +@@ -1075,12 +1218,14 @@ panfrost_ptr_map(struct pipe_context *pctx, + flags, bo->label); + + if (newbo) { +- if (copy_resource) +- memcpy(newbo->ptr.cpu, rsrc->image.data.bo->ptr.cpu, bo->size); ++ if (copy_resource) { ++ panfrost_bo_mem_invalidate(bo, 0, bo->size); ++ memcpy(newbo->ptr.cpu, bo->ptr.cpu, bo->size); ++ } + + panfrost_resource_swap_bo(ctx, rsrc, newbo); + +- if (!copy_resource && ++ if (!copy_resource && + drm_is_afbc(rsrc->image.layout.modifier)) + panfrost_resource_init_afbc_headers(rsrc); + +@@ -1102,6 +1247,22 @@ panfrost_ptr_map(struct pipe_context *pctx, + panfrost_flush_writer(ctx, rsrc, "Synchronized read"); + panfrost_bo_wait(bo, INT64_MAX, false); + } ++ } else { ++ /* No flush for writes to uninitialized */ ++ cache_inval = false; ++ } ++ ++ /* TODO: Only the accessed region for textures */ ++ if (cache_inval) { ++ size_t offset = 0; ++ size_t size = bo->size; ++ ++ if (resource->target == PIPE_BUFFER) { ++ offset = box->x * (size_t) bytes_per_block; ++ size = box->width * (size_t) bytes_per_block; ++ } ++ ++ panfrost_bo_mem_invalidate(bo, offset, size); + } + + /* For access to compressed textures, we want the (x, y, w, h) +@@ -1128,6 +1289,8 @@ panfrost_ptr_map(struct pipe_context *pctx, + * caching... 
I don't know if this is actually possible but we + * should still get it right */ + ++ // TODO: Fix this for cached BOs ++ + unsigned dpw = PIPE_MAP_DIRECTLY | PIPE_MAP_WRITE | PIPE_MAP_PERSISTENT; + + if ((usage & dpw) == dpw && rsrc->index_cache) +@@ -1281,8 +1444,15 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + * reloads that can cascade into DATA_INVALID_FAULTs due to reading + * malformed AFBC data if uninitialized */ + +- if (trans->staging.rsrc) { ++ bool afbc = trans->staging.rsrc; ++ ++ if (afbc) { + if (transfer->usage & PIPE_MAP_WRITE) { ++ struct panfrost_resource *trans_rsrc = pan_resource(trans->staging.rsrc); ++ struct panfrost_bo *trans_bo = trans_rsrc->image.data.bo; ++ ++ panfrost_bo_mem_clean(trans_bo, 0, trans_bo->size); ++ + if (panfrost_should_linear_convert(dev, prsrc, transfer)) { + + panfrost_bo_unreference(prsrc->image.data.bo); +@@ -1290,7 +1460,7 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, + prsrc->image.layout.format); + +- prsrc->image.data.bo = pan_resource(trans->staging.rsrc)->image.data.bo; ++ prsrc->image.data.bo = trans_bo; + panfrost_bo_reference(prsrc->image.data.bo); + } else { + pan_blit_from_staging(pctx, trans); +@@ -1315,10 +1485,13 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, + prsrc->image.layout.format); + if (prsrc->image.layout.data_size > bo->size) { ++ /* We want the BO to be MMAPed. */ ++ uint32_t flags = bo->flags & ~PAN_BO_DELAY_MMAP; + const char *label = bo->label; ++ + panfrost_bo_unreference(bo); + bo = prsrc->image.data.bo = +- panfrost_bo_create(dev, prsrc->image.layout.data_size, 0, label); ++ panfrost_bo_create(dev, prsrc->image.layout.data_size, flags, label); + assert(bo); + } + +@@ -1339,6 +1512,25 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + } + } + ++ /* TODO: Only the accessed region */ ++ /* It is important to not do this for AFBC resources, or else the ++ * clean might overwrite the result of the blit. */ ++ if (!afbc && (transfer->usage & PIPE_MAP_WRITE)) { ++ size_t offset = 0; ++ size_t size = prsrc->image.data.bo->size; ++ ++ /* TODO: Don't recalculate */ ++ if (prsrc->base.target == PIPE_BUFFER) { ++ enum pipe_format format = prsrc->image.layout.format; ++ int bytes_per_block = util_format_get_blocksize(format); ++ ++ offset = transfer->box.x * (size_t) bytes_per_block; ++ size = transfer->box.width * (size_t) bytes_per_block; ++ } ++ ++ panfrost_bo_mem_clean(prsrc->image.data.bo, ++ offset, size); ++ } + + util_range_add(&prsrc->base, &prsrc->valid_buffer_range, + transfer->box.x, +@@ -1353,6 +1545,7 @@ panfrost_ptr_unmap(struct pipe_context *pctx, + ralloc_free(transfer); + } + ++// TODO: does this need to be changed for cached resources? 
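For reference, the cache-maintenance calls threaded through the transfer paths above follow one rule: panfrost_bo_mem_invalidate() before the CPU reads data the GPU may have written, panfrost_bo_mem_clean() after the CPU writes data the GPU will read. A condensed sketch of that pairing, using the (bo, offset, size) signatures seen in the hunks; the wrapper itself is illustrative only:

/* Illustrative wrapper for CPU access to a cacheable BO region. */
static void
cpu_access_region(struct panfrost_bo *bo, size_t offset, size_t size,
                  bool gpu_wrote, bool cpu_writes)
{
   if (gpu_wrote)
      panfrost_bo_mem_invalidate(bo, offset, size); /* pull GPU writes into CPU view */

   /* ... CPU reads and/or writes bo->ptr.cpu + offset ... */

   if (cpu_writes)
      panfrost_bo_mem_clean(bo, offset, size);      /* push CPU writes out for the GPU */
}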
+ static void + panfrost_ptr_flush_region(struct pipe_context *pctx, + struct pipe_transfer *transfer, +@@ -1486,6 +1679,8 @@ panfrost_resource_context_init(struct pipe_context *pctx) + pctx->texture_unmap = u_transfer_helper_transfer_unmap; + pctx->create_surface = panfrost_create_surface; + pctx->surface_destroy = panfrost_surface_destroy; ++ pctx->clear_render_target = panfrost_clear_render_target; ++ pctx->clear_depth_stencil = panfrost_clear_depth_stencil; + pctx->resource_copy_region = util_resource_copy_region; + pctx->blit = panfrost_blit; + pctx->generate_mipmap = panfrost_generate_mipmap; diff --git a/src/gallium/drivers/panfrost/pan_screen.c.rej b/src/gallium/drivers/panfrost/pan_screen.c.rej new file mode 100644 index 00000000000..6d6ff33b6bd --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_screen.c.rej @@ -0,0 +1,87 @@ +diff a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c (rejected hunks) +@@ -56,7 +56,7 @@ + + static const struct debug_named_value panfrost_debug_options[] = { + {"perf", PAN_DBG_PERF, "Enable performance warnings"}, +- {"trace", PAN_DBG_TRACE, "Trace the command stream"}, ++ {"trace", PAN_DBG_TRACE | PAN_DBG_BO_CLEAR, "Trace the command stream"}, + {"deqp", PAN_DBG_DEQP, "Hacks for dEQP"}, + {"dirty", PAN_DBG_DIRTY, "Always re-emit all state"}, + {"sync", PAN_DBG_SYNC, "Wait for each job's completion and abort on GPU faults"}, +@@ -72,6 +72,13 @@ static const struct debug_named_value panfrost_debug_options[] = { + #ifdef PAN_DBG_OVERFLOW + {"overflow", PAN_DBG_OVERFLOW, "Check for buffer overflows in pool uploads"}, + #endif ++ {"tiler", PAN_DBG_TILER, "Decode the tiler heap"}, ++ {"bolog", PAN_DBG_BO_LOG, "Log BO allocations/deallocations"}, ++ {"boclear", PAN_DBG_BO_CLEAR, "Clear BOs on allocation"}, ++ {"nogpuc", PAN_DBG_UNCACHED_GPU, "Use uncached GPU memory for textures"}, ++ {"nocpuc", PAN_DBG_UNCACHED_CPU, "Use uncached CPU mappings for textures"}, ++ {"log", PAN_DBG_LOG, "Log job submission etc."}, ++ {"gofaster", PAN_DBG_GOFASTER, "Experimental performance improvements"}, + DEBUG_NAMED_VALUE_END + }; + +@@ -122,6 +129,7 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + case PIPE_CAP_SHADER_PACK_HALF_FLOAT: ++ case PIPE_CAP_CLIP_HALFZ: + return 1; + + case PIPE_CAP_MAX_RENDER_TARGETS: +@@ -300,7 +308,7 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) + * still supported as it is core GLES3.0 functionality + */ + case PIPE_CAP_PRIMITIVE_RESTART: +- return dev->arch <= 7; ++ return is_gl3 || dev->arch <= 7; + + case PIPE_CAP_FLATSHADE: + case PIPE_CAP_TWO_SIDED_COLOR: +@@ -606,6 +614,7 @@ panfrost_walk_dmabuf_modifiers(struct pipe_screen *screen, + bool afbc = dev->has_afbc && panfrost_format_supports_afbc(dev, format); + bool ytr = panfrost_afbc_can_ytr(format); + bool tiled_afbc = panfrost_afbc_can_tile(dev); ++ bool native = panfrost_afbc_only_native(dev->arch, format); + + unsigned count = 0; + +@@ -619,6 +628,9 @@ panfrost_walk_dmabuf_modifiers(struct pipe_screen *screen, + if ((pan_best_modifiers[i] & AFBC_FORMAT_MOD_TILED) && !tiled_afbc) + continue; + ++ if (drm_is_afbc(pan_best_modifiers[i]) && !(pan_best_modifiers[i] & AFBC_FORMAT_MOD_NATIVE_SWIZZLE) && native) ++ continue; ++ + if (test_modifier != DRM_FORMAT_MOD_INVALID && + test_modifier != pan_best_modifiers[i]) + continue; +@@ -822,13 +834,17 @@ panfrost_create_screen(int fd, struct renderonly *ro) + + 
/* Bail early on unsupported hardware */ + if (dev->model == NULL) { +- debug_printf("panfrost: Unsupported model %X", dev->gpu_id); ++ debug_printf("panfrost: Unsupported model %X\n", dev->gpu_id); + panfrost_destroy_screen(&(screen->base)); + return NULL; + } + + dev->ro = ro; + ++ /* The functionality is only useful with kbase */ ++ if (dev->kbase) ++ dev->has_dmabuf_fence = panfrost_check_dmabuf_fence(dev); ++ + screen->base.destroy = panfrost_destroy_screen; + + screen->base.get_name = panfrost_get_name; +@@ -874,6 +890,8 @@ panfrost_create_screen(int fd, struct renderonly *ro) + panfrost_cmdstream_screen_init_v7(screen); + else if (dev->arch == 9) + panfrost_cmdstream_screen_init_v9(screen); ++ else if (dev->arch == 10) ++ panfrost_cmdstream_screen_init_v10(screen); + else + unreachable("Unhandled architecture major"); + diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h index 3400c0a6cbf..3ef30cf17cf 100644 --- a/src/gallium/drivers/panfrost/pan_screen.h +++ b/src/gallium/drivers/panfrost/pan_screen.h @@ -51,6 +51,7 @@ static const struct pipe_driver_query_info panfrost_driver_query_list[] = { struct panfrost_batch; struct panfrost_context; +struct panfrost_cs; struct panfrost_resource; struct panfrost_compiled_shader; struct pan_fb_info; diff --git a/src/gallium/drivers/panfrost/pan_screen.h.rej b/src/gallium/drivers/panfrost/pan_screen.h.rej new file mode 100644 index 00000000000..0d7d3ea9803 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_screen.h.rej @@ -0,0 +1,28 @@ +diff a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h (rejected hunks) +@@ -57,6 +58,7 @@ struct pan_blend_state; + + /* Virtual table of per-generation (GenXML) functions */ + ++ + struct panfrost_vtable { + /* Prepares the renderer state descriptor or shader program descriptor + * for a given compiled shader, and if desired uploads it as well */ +@@ -100,6 +102,10 @@ struct panfrost_vtable { + struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); ++ ++ void (*emit_csf_toplevel)(struct panfrost_batch *); ++ ++ void (*init_cs)(struct panfrost_context *ctx, struct panfrost_cs *cs); + }; + + struct panfrost_screen { +@@ -138,6 +144,7 @@ void panfrost_cmdstream_screen_init_v5(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v6(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v7(struct panfrost_screen *screen); + void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen); ++void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen); + + #define perf_debug(dev, ...) 
\ + do { \ diff --git a/src/gallium/frontends/nine/nine_ff.c b/src/gallium/frontends/nine/nine_ff.c index a5182fbd0a8..bae01856b57 100644 --- a/src/gallium/frontends/nine/nine_ff.c +++ b/src/gallium/frontends/nine/nine_ff.c @@ -1442,7 +1442,7 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key) struct ureg_src texture_coord = ps.vT[s]; struct ureg_dst delta; switch (key->ts[s].textarget) { - case 0: target = TGSI_TEXTURE_1D; break; + case 0: target = TGSI_TEXTURE_2D; break; case 1: target = TGSI_TEXTURE_2D; break; case 2: target = TGSI_TEXTURE_3D; break; case 3: target = TGSI_TEXTURE_CUBE; break; diff --git a/src/gallium/frontends/nine/nine_shader.c b/src/gallium/frontends/nine/nine_shader.c index eff7a0f5de8..432d201786f 100644 --- a/src/gallium/frontends/nine/nine_shader.c +++ b/src/gallium/frontends/nine/nine_shader.c @@ -2198,7 +2198,7 @@ static inline unsigned d3dstt_to_tgsi_tex(BYTE sampler_type) { switch (sampler_type) { - case NINED3DSTT_1D: return TGSI_TEXTURE_1D; + case NINED3DSTT_1D: return TGSI_TEXTURE_2D; case NINED3DSTT_2D: return TGSI_TEXTURE_2D; case NINED3DSTT_VOLUME: return TGSI_TEXTURE_3D; case NINED3DSTT_CUBE: return TGSI_TEXTURE_CUBE; @@ -2211,7 +2211,7 @@ static inline unsigned d3dstt_to_tgsi_tex_shadow(BYTE sampler_type) { switch (sampler_type) { - case NINED3DSTT_1D: return TGSI_TEXTURE_SHADOW1D; + case NINED3DSTT_1D: return TGSI_TEXTURE_SHADOW2D; case NINED3DSTT_2D: return TGSI_TEXTURE_SHADOW2D; case NINED3DSTT_VOLUME: case NINED3DSTT_CUBE: diff --git a/src/gallium/frontends/nine/nine_shader.c.rej b/src/gallium/frontends/nine/nine_shader.c.rej new file mode 100644 index 00000000000..b6441552e6a --- /dev/null +++ b/src/gallium/frontends/nine/nine_shader.c.rej @@ -0,0 +1,10 @@ +diff a/src/gallium/frontends/nine/nine_shader.c b/src/gallium/frontends/nine/nine_shader.c (rejected hunks) +@@ -2186,7 +2186,7 @@ ps1x_sampler_type(const struct nine_shader_info *info, unsigned stage) + { + boolean shadow = !!(info->sampler_mask_shadow & (1 << stage)); + switch ((info->sampler_ps1xtypes >> (stage * 2)) & 0x3) { +- case 1: return shadow ? TGSI_TEXTURE_SHADOW1D : TGSI_TEXTURE_1D; ++ case 1: return shadow ? TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D; + case 0: return shadow ? 
TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D; + case 3: return TGSI_TEXTURE_3D; + default: diff --git a/src/gallium/frontends/nine/nine_state.c b/src/gallium/frontends/nine/nine_state.c index fb7caba7900..3e4de025e41 100644 --- a/src/gallium/frontends/nine/nine_state.c +++ b/src/gallium/frontends/nine/nine_state.c @@ -1121,8 +1121,10 @@ update_textures_and_samplers(struct NineDevice9 *device) false, view); context->enabled_sampler_count_vs = num_textures; - if (commit_samplers) + if (commit_samplers) { + cso_set_max_sampler(context->cso, num_textures - 1); cso_single_sampler_done(context->cso, PIPE_SHADER_VERTEX); + } } /* State commit only */ diff --git a/src/gallium/frontends/nine/nine_state.c.rej b/src/gallium/frontends/nine/nine_state.c.rej new file mode 100644 index 00000000000..cae533928d5 --- /dev/null +++ b/src/gallium/frontends/nine/nine_state.c.rej @@ -0,0 +1,13 @@ +diff a/src/gallium/frontends/nine/nine_state.c b/src/gallium/frontends/nine/nine_state.c (rejected hunks) +@@ -1039,8 +1039,10 @@ update_textures_and_samplers(struct NineDevice9 *device) + false, view); + context->enabled_sampler_count_ps = num_textures; + +- if (commit_samplers) ++ if (commit_samplers) { ++ cso_set_max_sampler(context->cso, num_textures - 1); + cso_single_sampler_done(context->cso, PIPE_SHADER_FRAGMENT); ++ } + + commit_samplers = FALSE; + sampler_mask = context->programmable_vs ? context->vs->sampler_mask : 0; diff --git a/src/gallium/targets/d3dadapter9/meson.build.rej b/src/gallium/targets/d3dadapter9/meson.build.rej new file mode 100644 index 00000000000..89bfd12debe --- /dev/null +++ b/src/gallium/targets/d3dadapter9/meson.build.rej @@ -0,0 +1,11 @@ +diff a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build (rejected hunks) +@@ -64,7 +64,8 @@ libgallium_nine = shared_library( + dep_selinux, dep_libdrm, dep_llvm, dep_thread, + idep_xmlconfig, idep_mesautil, idep_nir, + driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau, +- driver_i915, driver_svga, driver_iris, driver_crocus, driver_zink, driver_freedreno ++ driver_i915, driver_svga, driver_iris, driver_crocus, driver_zink, driver_freedreno, ++ driver_panfrost, driver_kmsro, + ], + name_prefix : '', + version : '.'.join(nine_version), diff --git a/src/gallium/targets/osmesa/meson.build.rej b/src/gallium/targets/osmesa/meson.build.rej new file mode 100644 index 00000000000..05104104856 --- /dev/null +++ b/src/gallium/targets/osmesa/meson.build.rej @@ -0,0 +1,14 @@ +diff a/src/gallium/targets/osmesa/meson.build b/src/gallium/targets/osmesa/meson.build (rejected hunks) +@@ -55,10 +55,10 @@ libosmesa = shared_library( + libmesa, libgallium, libws_null, osmesa_link_with, + ], + dependencies : [ +- dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind, driver_swrast ++ dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind, driver_swrast, driver_panfrost, dep_libdrm + ], + name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libosmesa.dll +- soversion : host_machine.system() == 'windows' ? 
'' : '8', ++ soversion : '', + version : '8.0.0', + darwin_versions : '9.0.0', + install : true, diff --git a/src/gallium/targets/rusticl/meson.build.rej b/src/gallium/targets/rusticl/meson.build.rej new file mode 100644 index 00000000000..32064a34bd4 --- /dev/null +++ b/src/gallium/targets/rusticl/meson.build.rej @@ -0,0 +1,9 @@ +diff a/src/gallium/targets/rusticl/meson.build b/src/gallium/targets/rusticl/meson.build (rejected hunks) +@@ -43,6 +43,7 @@ librusticl = shared_library( + ], + dependencies : [ + driver_iris, ++ driver_kmsro, + driver_nouveau, + driver_panfrost, + driver_swrast, diff --git a/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c.rej b/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c.rej new file mode 100644 index 00000000000..5a81dda1e0d --- /dev/null +++ b/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c.rej @@ -0,0 +1,19 @@ +diff a/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c b/src/gallium/winsys/kmsro/drm/kmsro_drm_winsys.c (rejected hunks) +@@ -101,9 +101,15 @@ struct pipe_screen *kmsro_drm_screen_create(int fd, + #endif + + #if defined(GALLIUM_PANFROST) +- ro->gpu_fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); ++ bool noop = getenv("KBASE_NOOP"); + +- if (ro->gpu_fd >= 0) { ++ if (!noop) { ++ ro->gpu_fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); ++ if (ro->gpu_fd < 0) ++ ro->gpu_fd = open("/dev/mali0", O_RDWR | O_CLOEXEC | O_NONBLOCK); ++ } ++ ++ if ((ro->gpu_fd >= 0) || noop) { + ro->create_for_resource = renderonly_create_kms_dumb_buffer_for_resource; + screen = panfrost_drm_screen_create_renderonly(ro); + if (!screen) diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index 1b188a4c800..4585cbb6ef4 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -118,7 +118,6 @@ static char* load_shader_replacement(struct _shader_replacement *repl) return NULL; } #endif -#endif /** * Return mask of GLSL_x flags by examining the MESA_GLSL env var. @@ -1933,8 +1932,6 @@ _mesa_LinkProgram(GLuint programObj) link_program_error(ctx, shProg); } -#ifdef ENABLE_SHADER_CACHE - /** * Construct a full path for shader replacement functionality using * following format: @@ -2067,8 +2064,6 @@ _mesa_read_shader_source(const gl_shader_stage stage, const char *source, return buffer; } -#endif /* ENABLE_SHADER_CACHE */ - /** * Called via glShaderSource() and glShaderSourceARB() API functions. 
* Basically, concatenate the source code strings into one long string @@ -2150,7 +2145,6 @@ shader_source(struct gl_context *ctx, GLuint shaderObj, GLsizei count, uint8_t original_sha1[SHA1_DIGEST_LENGTH]; _mesa_sha1_compute(source, strlen(source), original_sha1); -#ifdef ENABLE_SHADER_CACHE GLcharARB *replacement; /* Dump original shader source to MESA_SHADER_DUMP_PATH and replace @@ -2163,7 +2157,6 @@ shader_source(struct gl_context *ctx, GLuint shaderObj, GLsizei count, free(source); source = replacement; } -#endif /* ENABLE_SHADER_CACHE */ set_shader_source(sh, source, original_sha1); diff --git a/src/mesa/main/shaderapi.c.rej b/src/mesa/main/shaderapi.c.rej new file mode 100644 index 00000000000..52e1b756b51 --- /dev/null +++ b/src/mesa/main/shaderapi.c.rej @@ -0,0 +1,9 @@ +diff a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c (rejected hunks) +@@ -70,7 +70,6 @@ + #include "state_tracker/st_context.h" + #include "state_tracker/st_program.h" + +-#ifdef ENABLE_SHADER_CACHE + #if CUSTOM_SHADER_REPLACEMENT + #include "shader_replacement.h" + /* shader_replacement.h must declare a variable like this: diff --git a/src/meson.build b/src/meson.build index 1293538b8f6..1f04b7860cc 100644 --- a/src/meson.build +++ b/src/meson.build @@ -75,6 +75,7 @@ if with_imgui endif if with_platform_wayland subdir('egl/wayland/wayland-drm') + subdir('egl/wayland/mali-buffer-sharing') endif if with_any_vk or with_gallium_zink subdir('vulkan') diff --git a/src/panfrost/base/include/csf/mali_base_csf_kernel.h b/src/panfrost/base/include/csf/mali_base_csf_kernel.h new file mode 100644 index 00000000000..3b02350c08b --- /dev/null +++ b/src/panfrost/base/include/csf/mali_base_csf_kernel.h @@ -0,0 +1,596 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_CSF_KERNEL_H_ +#define _UAPI_BASE_CSF_KERNEL_H_ + +#include +#include "../mali_base_common_kernel.h" + +/* Memory allocation, access/hint flags & mask specific to CSF GPU. + * + * See base_mem_alloc_flags. + */ + +/* Must be FIXED memory. */ +#define BASE_MEM_FIXED ((base_mem_alloc_flags)1 << 8) + +/* CSF event memory + * + * If Outer shareable coherence is not specified or not available, then on + * allocation kbase will automatically use the uncached GPU mapping. + * There is no need for the client to specify BASE_MEM_UNCACHED_GPU + * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. + * + * This memory requires a permanent mapping + * + * See also kbase_reg_needs_kernel_mapping() + */ +#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) + +#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) + + +/* Must be FIXABLE memory: its GPU VA will be determined at a later point, + * at which time it will be at a fixed GPU VA. 
+ */ +#define BASE_MEM_FIXABLE ((base_mem_alloc_flags)1 << 29) + +/* Note that the number of bits used for base_mem_alloc_flags + * must be less than BASE_MEM_FLAGS_NR_BITS !!! + */ + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED BASE_MEM_RESERVED_BIT_20 + +/* Special base mem handles specific to CSF. + */ +#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) + +#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ + ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ + LOCAL_PAGE_SHIFT) + +/* Valid set of just-in-time memory allocation flags */ +#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) + +/* flags for base context specific to CSF */ + +/* Base context creates a CSF event notification thread. + * + * The creation of a CSF event notification thread is conditional but + * mandatory for the handling of CSF events. + */ +#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | \ + BASE_CONTEXT_CSF_EVENT_THREAD | \ + BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* Flags for base tracepoint specific to CSF */ + +/* Enable KBase tracepoints for CSF builds */ +#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) + +/* Enable additional CSF Firmware side tracepoints */ +#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) + +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ + BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ + BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) + +/* Number of pages mapped into the process address space for a bound GPU + * command queue. A pair of input/output pages and a Hw doorbell page + * are mapped to enable direct submission of commands to Hw. + */ +#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) + +#define BASE_QUEUE_MAX_PRIORITY (15U) + +/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ +#define BASEP_EVENT_VAL_INDEX (0U) +#define BASEP_EVENT_ERR_INDEX (1U) + +/* The upper limit for number of objects that could be waited/set per command. + * This limit is now enforced as internally the error inherit inputs are + * converted to 32-bit flags in a __u32 variable occupying a previously padding + * field. + */ +#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) + +/* CSF CSI EXCEPTION_HANDLER_FLAGS */ +#define BASE_CSF_TILER_OOM_EXCEPTION_FLAG (1u << 0) +#define BASE_CSF_EXCEPTION_HANDLER_FLAGS_MASK (BASE_CSF_TILER_OOM_EXCEPTION_FLAG) + +/** + * enum base_kcpu_command_type - Kernel CPU queue command type. 
+ * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, + * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, + * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, + * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, + * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, + * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, + * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, + */ +enum base_kcpu_command_type { + BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_SET, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, + BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, + BASE_KCPU_COMMAND_TYPE_JIT_FREE, + BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER +}; + +/** + * enum base_queue_group_priority - Priority of a GPU Command Queue Group. + * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group + * priority levels. + * + * Currently this is in order of highest to lowest, but if new levels are added + * then those new levels may be out of order to preserve the ABI compatibility + * with previous releases. At that point, ensure assignment to + * the 'priority' member in &kbase_queue_group is updated to ensure it remains + * a linear ordering. + * + * There should be no gaps in the enum, otherwise use of + * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. 
+ */ +enum base_queue_group_priority { + BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, + BASE_QUEUE_GROUP_PRIORITY_MEDIUM, + BASE_QUEUE_GROUP_PRIORITY_LOW, + BASE_QUEUE_GROUP_PRIORITY_REALTIME, + BASE_QUEUE_GROUP_PRIORITY_COUNT +}; + +struct base_kcpu_command_fence_info { + __u64 fence; +}; + +struct base_cqs_wait_info { + __u64 addr; + __u32 val; + __u32 padding; +}; + +struct base_kcpu_command_cqs_wait_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +struct base_cqs_set { + __u64 addr; +}; + +struct base_kcpu_command_cqs_set_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * typedef basep_cqs_data_type - Enumeration of CQS Data Types + * + * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value + * is an unsigned 32-bit integer + * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value + * is an unsigned 64-bit integer + */ +typedef enum PACKED { + BASEP_CQS_DATA_TYPE_U32 = 0, + BASEP_CQS_DATA_TYPE_U64 = 1, +} basep_cqs_data_type; + +/** + * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait + * Operation conditions + * + * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Less than or Equal to + * the Wait Operation value + * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Greater than the Wait Operation value + */ +typedef enum { + BASEP_CQS_WAIT_OPERATION_LE = 0, + BASEP_CQS_WAIT_OPERATION_GT = 1, +} basep_cqs_wait_operation_op; + +struct base_cqs_wait_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information + * about the Timeline CQS wait objects + * + * @objs: An array of Timeline CQS waits. + * @nr_objs: Number of Timeline CQS waits in the array. + * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field + * to be served as the source for importing into the + * queue's error-state. + */ +struct base_kcpu_command_cqs_wait_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +/** + * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations + * + * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value + * to a synchronization object + * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value + * of a synchronization object + */ +typedef enum { + BASEP_CQS_SET_OPERATION_ADD = 0, + BASEP_CQS_SET_OPERATION_SET = 1, +} basep_cqs_set_operation_op; + +struct base_cqs_set_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_set_operation_info - structure which contains information + * about the Timeline CQS set objects + * + * @objs: An array of Timeline CQS sets. + * @nr_objs: Number of Timeline CQS sets in the array. + * @padding: Structure padding, unused bytes. + */ +struct base_kcpu_command_cqs_set_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * struct base_kcpu_command_import_info - structure which contains information + * about the imported buffer. + * + * @handle: Address of imported user buffer. + */ +struct base_kcpu_command_import_info { + __u64 handle; +}; + +/** + * struct base_kcpu_command_jit_alloc_info - structure which contains + * information about jit memory allocation. 
+ * + * @info: An array of elements of the + * struct base_jit_alloc_info type. + * @count: The number of elements in the info array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_alloc_info { + __u64 info; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_jit_free_info - structure which contains + * information about jit memory which is to be freed. + * + * @ids: An array containing the JIT IDs to free. + * @count: The number of elements in the ids array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_free_info { + __u64 ids; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_group_suspend_info - structure which contains + * suspend buffer data captured for a suspended queue group. + * + * @buffer: Pointer to an array of elements of the type char. + * @size: Number of elements in the @buffer array. + * @group_handle: Handle to the mapping of CSG. + * @padding: padding to a multiple of 64 bits. + */ +struct base_kcpu_command_group_suspend_info { + __u64 buffer; + __u32 size; + __u8 group_handle; + __u8 padding[3]; +}; + + +/** + * struct base_kcpu_command - kcpu command. + * @type: type of the kcpu command, one enum base_kcpu_command_type + * @padding: padding to a multiple of 64 bits + * @info: structure which contains information about the kcpu command; + * actual type is determined by @p type + * @info.fence: Fence + * @info.cqs_wait: CQS wait + * @info.cqs_set: CQS set + * @info.cqs_wait_operation: CQS wait operation + * @info.cqs_set_operation: CQS set operation + * @info.import: import + * @info.jit_alloc: JIT allocation + * @info.jit_free: JIT deallocation + * @info.suspend_buf_copy: suspend buffer copy + * @info.sample_time: sample time + * @info.padding: padding + */ +struct base_kcpu_command { + __u8 type; + __u8 padding[sizeof(__u64) - sizeof(__u8)]; + union { + struct base_kcpu_command_fence_info fence; + struct base_kcpu_command_cqs_wait_info cqs_wait; + struct base_kcpu_command_cqs_set_info cqs_set; + struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; + struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; + struct base_kcpu_command_import_info import; + struct base_kcpu_command_jit_alloc_info jit_alloc; + struct base_kcpu_command_jit_free_info jit_free; + struct base_kcpu_command_group_suspend_info suspend_buf_copy; + __u64 padding[2]; /* No sub-struct should be larger */ + } info; +}; + +/** + * struct basep_cs_stream_control - CSI capabilities. + * + * @features: Features of this stream + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_stream_control { + __u32 features; + __u32 padding; +}; + +/** + * struct basep_cs_group_control - CSG interface capabilities. + * + * @features: Features of this group + * @stream_num: Number of streams in this group + * @suspend_size: Size in bytes of the suspend buffer for this group + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_group_control { + __u32 features; + __u32 stream_num; + __u32 suspend_size; + __u32 padding; +}; + +/** + * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault + * error information associated with GPU command queue group. + * + * @sideband: Additional information of the unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). 
+ * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_group_error_fatal_payload { + __u64 sideband; + __u32 status; + __u32 padding; +}; + +/** + * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault + * error information related to GPU command queue. + * + * @sideband: Additional information about this unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). + * @csi_index: Index of the CSF interface the queue is bound to. + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_error_fatal_payload { + __u64 sideband; + __u32 status; + __u8 csi_index; + __u8 padding[3]; +}; + +/** + * enum base_gpu_queue_group_error_type - GPU Fatal error type. + * + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU + * command queue group. + * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU + * command queue. + * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with + * progress timeout. + * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out + * of tiler heap memory. + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types + * + * This type is used for &struct_base_gpu_queue_group_error.error_type. + */ +enum base_gpu_queue_group_error_type { + BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, + BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, + BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT +}; + +/** + * struct base_gpu_queue_group_error - Unrecoverable fault information + * @error_type: Error type of @base_gpu_queue_group_error_type + * indicating which field in union payload is filled + * @padding: Unused bytes for 64bit boundary + * @payload: Input Payload + * @payload.fatal_group: Unrecoverable fault error associated with + * GPU command queue group + * @payload.fatal_queue: Unrecoverable fault error associated with command queue + */ +struct base_gpu_queue_group_error { + __u8 error_type; + __u8 padding[7]; + union { + struct base_gpu_queue_group_error_fatal_payload fatal_group; + struct base_gpu_queue_error_fatal_payload fatal_queue; + } payload; +}; + +/** + * enum base_csf_notification_type - Notification type + * + * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event + * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal + * error + * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu + * queue + * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type + * + * This type is used for &struct_base_csf_notification.type. 
+ */ +enum base_csf_notification_type { + BASE_CSF_NOTIFICATION_EVENT = 0, + BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, + BASE_CSF_NOTIFICATION_COUNT +}; + +/** + * struct base_csf_notification - Event or error notification + * + * @type: Notification type of @base_csf_notification_type + * @padding: Padding for 64bit boundary + * @payload: Input Payload + * @payload.align: To fit the struct into a 64-byte cache line + * @payload.csg_error: CSG error + * @payload.csg_error.handle: Handle of GPU command queue group associated with + * fatal error + * @payload.csg_error.padding: Padding + * @payload.csg_error.error: Unrecoverable fault error + * + */ +struct base_csf_notification { + __u8 type; + __u8 padding[7]; + union { + struct { + __u8 handle; + __u8 padding[7]; + struct base_gpu_queue_group_error error; + } csg_error; + + __u8 align[56]; + } payload; +}; + +/** + * struct mali_base_gpu_core_props - GPU core props info + * + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to align to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + */ +struct mali_base_gpu_core_props { + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; +}; + +#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ diff --git a/src/panfrost/base/include/csf/mali_gpu_csf_registers.h b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h new file mode 100644 index 00000000000..17e338cb238 --- /dev/null +++ b/src/panfrost/base/include/csf/mali_gpu_csf_registers.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * This header was originally autogenerated, but it is now ok (and + * expected) to have to add to it. + */ + +#ifndef _UAPI_GPU_CSF_REGISTERS_H_ +#define _UAPI_GPU_CSF_REGISTERS_H_ + +/* Only user block defines are included. HI words have been removed */ + +/* CS_USER_INPUT_BLOCK register offsets */ +#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ +#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ + +/* CS_USER_OUTPUT_BLOCK register offsets */ +#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ +#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ + +/* USER register offsets */ +#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ + +#endif diff --git a/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h new file mode 100644 index 00000000000..db7252605f0 --- /dev/null +++ b/src/panfrost/base/include/csf/mali_kbase_csf_ioctl.h @@ -0,0 +1,530 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_CSF_IOCTL_H_ +#define _UAPI_KBASE_CSF_IOCTL_H_ + +#include +#include + +/* + * 1.0: + * - CSF IOCTL header separated from JM + * 1.1: + * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 1.2: + * - Add new CSF GPU_FEATURES register into the property structure + * returned by KBASE_IOCTL_GET_GPUPROPS + * 1.3: + * - Add __u32 group_uid member to + * &struct_kbase_ioctl_cs_queue_group_create.out + * 1.4: + * - Replace padding in kbase_ioctl_cs_get_glb_iface with + * instr_features member of same size + * 1.5: + * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new + * queue registration call with extended format for supporting CS + * trace configurations with CSF trace_command. + * 1.6: + * - Added new HW performance counters interface to all GPUs. + * 1.7: + * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use + * 1.8: + * - Removed Kernel legacy HWC interface + * 1.9: + * - Reorganization of GPU-VA memory zones, including addition of + * FIXED_VA zone and auto-initialization of EXEC_VA zone. 
+ * - Added new Base memory allocation interface + * 1.10: + * - First release of new HW performance counters interface. + * 1.11: + * - Dummy model (no mali) backend will now clear HWC values after each sample + * 1.12: + * - Added support for incremental rendering flag in CSG create call + */ + +#define BASE_UK_VERSION_MAJOR 1 +#define BASE_UK_VERSION_MINOR 12 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + +/** + * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the + * base back-end + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * + * Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. + * Any change of this struct should also be mirrored to the latter. + */ +struct kbase_ioctl_cs_queue_register { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER \ + _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) + +/** + * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler + * to notify that a queue has been updated + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_kick { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_KICK \ + _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) + +/** + * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group + * + * @in: Input parameters + * @in.buffer_gpu_addr: GPU address of the buffer backing the queue + * @in.group_handle: Handle of the group to which the queue should be bound + * @in.csi_index: Index of the CSF interface the queue should be bound to + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.mmap_handle: Handle to be used for creating the mapping of CS + * input/output pages + */ +union kbase_ioctl_cs_queue_bind { + struct { + __u64 buffer_gpu_addr; + __u8 group_handle; + __u8 csi_index; + __u8 padding[6]; + } in; + struct { + __u64 mmap_handle; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_BIND \ + _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) + +/** + * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the + * base back-end in extended format, + * involving trace buffer configuration + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * @ex_offset_var_addr: GPU address of the trace buffer write offset variable + * @ex_buffer_base: Trace buffer GPU base address for the queue + * @ex_buffer_size: Size of the trace buffer in bytes + * @ex_event_size: Trace event write size, in log2 designation + * @ex_event_state: Trace event states configuration + * @ex_padding: Currently unused, must be zero + * + * Note: There is an identical sub-section at the start of this struct to that + * of @ref kbase_ioctl_cs_queue_register. 
Any change of this sub-section + * must also be mirrored to the latter. Following the said sub-section, + * the remaining fields forms the extension, marked with ex_*. + */ +struct kbase_ioctl_cs_queue_register_ex { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; + __u64 ex_offset_var_addr; + __u64 ex_buffer_base; + __u32 ex_buffer_size; + __u8 ex_event_size; + __u8 ex_event_state; + __u8 ex_padding[2]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ + _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) + +/** + * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_terminate { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) + +/** + * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue + * group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. + */ +union kbase_ioctl_cs_queue_group_create_1_6 { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ + _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) + +/** + * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.csi_handlers: Flags to signal that the application intends to use CSI + * exception handlers in some linear buffers to deal with + * the given exception types. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. 
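Taken together, the queue ioctls above give the userspace lifecycle: register a ring buffer, bind it to a queue group, map the returned handle to reach the CS user input/output pages, then advance CS_INSERT and kick. A hedged sketch under these assumptions: plain ioctl() on an open kbase fd, KBASE_IOCTL_TYPE coming from mali_kbase_ioctl.h, the input block being the first mapped page, and ring-buffer allocation handled elsewhere; setup_queue() itself is hypothetical and error handling is omitted:

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

/* Hypothetical bring-up of one GPU command queue. */
static int
setup_queue(int fd, uint64_t ring_va, uint32_t ring_size, uint8_t group_handle)
{
   struct kbase_ioctl_cs_queue_register reg = {
      .buffer_gpu_addr = ring_va,
      .buffer_size = ring_size,
      .priority = 1,
   };
   if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_REGISTER, &reg))
      return -1;

   union kbase_ioctl_cs_queue_bind bind = { 0 };
   bind.in.buffer_gpu_addr = ring_va;
   bind.in.group_handle = group_handle;
   bind.in.csi_index = 0;
   if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind))
      return -1;

   /* Map the input/output/doorbell pages through the returned handle.
    * Page ordering (input block first) is assumed here. */
   size_t page = (size_t)sysconf(_SC_PAGESIZE);
   uint8_t *user_io = mmap(NULL, BASEP_QUEUE_NR_MMAP_USER_PAGES * page,
                           PROT_READ | PROT_WRITE, MAP_SHARED,
                           fd, bind.out.mmap_handle);
   if (user_io == MAP_FAILED)
      return -1;

   /* After writing commands into the ring: publish the new insert offset
    * (LO word at CS_INSERT; the HI word is assumed to follow), then kick. */
   *(volatile uint64_t *)(user_io + CS_INSERT) = 0 /* new insert offset */;

   struct kbase_ioctl_cs_queue_kick kick = { .buffer_gpu_addr = ring_va };
   return ioctl(fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick);
}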
+ */ +union kbase_ioctl_cs_queue_group_create { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 csi_handlers; + __u8 padding[2]; + /** + * @in.reserved: Reserved + */ + __u64 reserved; + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ + _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) + +/** + * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group + * + * @group_handle: Handle of the queue group to be terminated + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_cs_queue_group_term { + __u8 group_handle; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) + +#define KBASE_IOCTL_CS_EVENT_SIGNAL \ + _IO(KBASE_IOCTL_TYPE, 44) + +typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ + +/** + * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue + * + * @id: ID of the new command queue returned by the kernel + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_new { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ + _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) + +/** + * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue + * + * @id: ID of the command queue to be destroyed + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_delete { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ + _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) + +/** + * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue + * + * @addr: Memory address of an array of struct base_kcpu_queue_command + * @nr_commands: Number of commands in the array + * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_enqueue { + __u64 addr; + __u32 nr_commands; + base_kcpu_queue_id id; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ + _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) + +/** + * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap + * @in: Input parameters + * @in.chunk_size: Size of each chunk. + * @in.initial_chunks: Initial number of chunks that heap will be created with. + * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. + * @in.target_in_flight: Number of render-passes that the driver should attempt to + * keep in flight for which allocation of new chunks is + * allowed. + * @in.group_id: Group ID to be used for physical allocations. + * @in.padding: Padding + * @out: Output parameters + * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up + * for the heap. + * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, + * actually points to the header of heap chunk and not to + * the low address of free memory in the chunk. 
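+ *
+ * An illustrative, non-normative usage sketch, assuming an initialised
+ * kbase file descriptor mali_fd; the chunk size and counts are example
+ * values only:
+ *
+ *   union kbase_ioctl_cs_tiler_heap_init heap = { 0 };
+ *   heap.in.chunk_size       = 2 * 1024 * 1024;
+ *   heap.in.initial_chunks   = 5;
+ *   heap.in.max_chunks       = 200;
+ *   heap.in.target_in_flight = 65535;
+ *   if (ioctl(mali_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &heap) == 0)
+ *           heap_ctx_va = heap.out.gpu_heap_va;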
+ */ +union kbase_ioctl_cs_tiler_heap_init { + struct { + __u32 chunk_size; + __u32 initial_chunks; + __u32 max_chunks; + __u16 target_in_flight; + __u8 group_id; + __u8 padding; + } in; + struct { + __u64 gpu_heap_va; + __u64 first_chunk_va; + } out; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ + _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) + +/** + * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap + * instance + * + * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. + */ +struct kbase_ioctl_cs_tiler_heap_term { + __u64 gpu_heap_va; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ + _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) + +/** + * union kbase_ioctl_cs_get_glb_iface - Request the global control block + * of CSF interface capabilities + * + * @in: Input parameters + * @in.max_group_num: The maximum number of groups to be read. Can be 0, in + * which case groups_ptr is unused. + * @in.max_total_stream_num: The maximum number of CSs to be read. Can be 0, in + * which case streams_ptr is unused. + * @in.groups_ptr: Pointer where to store all the group data (sequentially). + * @in.streams_ptr: Pointer where to store all the CS data (sequentially). + * @out: Output parameters + * @out.glb_version: Global interface version. + * @out.features: Bit mask of features (e.g. whether certain types of job + * can be suspended). + * @out.group_num: Number of CSGs supported. + * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 + * hold the size of firmware performance counter data + * and 15:0 hold the size of hardware performance counter + * data. + * @out.total_stream_num: Total number of CSs, summed across all groups. + * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum + * size of events. Bits 3:0 hold the offset update rate. + * (csf >= 1.1.0) + * + */ +union kbase_ioctl_cs_get_glb_iface { + struct { + __u32 max_group_num; + __u32 max_total_stream_num; + __u64 groups_ptr; + __u64 streams_ptr; + } in; + struct { + __u32 glb_version; + __u32 features; + __u32 group_num; + __u32 prfcnt_size; + __u32 total_stream_num; + __u32 instr_features; + } out; +}; + +#define KBASE_IOCTL_CS_GET_GLB_IFACE \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) + +struct kbase_ioctl_cs_cpu_queue_info { + __u64 buffer; + __u64 size; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ + _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) + +/** + * union kbase_ioctl_mem_alloc_ex - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @in.fixed_address: The GPU virtual address requested for the allocation, + * if the allocation is using the BASE_MEM_FIXED flag. + * @in.extra: Space for extra parameters that may be added in the future. 
+ * @out: Output parameters
+ * @out.flags: Flags
+ * @out.gpu_va: The GPU virtual address which is allocated
+ */
+union kbase_ioctl_mem_alloc_ex {
+        struct {
+                __u64 va_pages;
+                __u64 commit_pages;
+                __u64 extension;
+                __u64 flags;
+                __u64 fixed_address;
+                __u64 extra[3];
+        } in;
+        struct {
+                __u64 flags;
+                __u64 gpu_va;
+        } out;
+};
+
+#define KBASE_IOCTL_MEM_ALLOC_EX _IOWR(KBASE_IOCTL_TYPE, 59, union kbase_ioctl_mem_alloc_ex)
+
+/***************
+ * test ioctls *
+ ***************/
+#if MALI_UNIT_TEST
+/* These ioctls are purely for test purposes and are not used in the production
+ * driver, they therefore may change without notice
+ */
+
+/**
+ * struct kbase_ioctl_cs_event_memory_write - Write an event memory address
+ * @cpu_addr: Memory address to write
+ * @value: Value to write
+ * @padding: Currently unused, must be zero
+ */
+struct kbase_ioctl_cs_event_memory_write {
+        __u64 cpu_addr;
+        __u8 value;
+        __u8 padding[7];
+};
+
+/**
+ * union kbase_ioctl_cs_event_memory_read - Read an event memory address
+ * @in: Input parameters
+ * @in.cpu_addr: Memory address to read
+ * @out: Output parameters
+ * @out.value: Value read
+ * @out.padding: Currently unused, must be zero
+ */
+union kbase_ioctl_cs_event_memory_read {
+        struct {
+                __u64 cpu_addr;
+        } in;
+        struct {
+                __u8 value;
+                __u8 padding[7];
+        } out;
+};
+
+#endif /* MALI_UNIT_TEST */
+
+#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */
diff --git a/src/panfrost/base/include/jm/mali_base_jm_kernel.h b/src/panfrost/base/include/jm/mali_base_jm_kernel.h
new file mode 100644
index 00000000000..ae43908b936
--- /dev/null
+++ b/src/panfrost/base/include/jm/mali_base_jm_kernel.h
@@ -0,0 +1,1051 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#ifndef _UAPI_BASE_JM_KERNEL_H_
+#define _UAPI_BASE_JM_KERNEL_H_
+
+#include <linux/types.h>
+#include "../mali_base_common_kernel.h"
+
+/* Memory allocation, access/hint flags & mask specific to JM GPU.
+ *
+ * See base_mem_alloc_flags.
+ */
+
+/* Used as BASE_MEM_FIXED in other backends */
+#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8)
+
+/**
+ * BASE_MEM_RESERVED_BIT_19 - Bit 19 is reserved.
+ * + * Do not remove, use the next unreserved bit for new flags + */ +#define BASE_MEM_RESERVED_BIT_19 ((base_mem_alloc_flags)1 << 19) + +/** + * BASE_MEM_TILER_ALIGN_TOP - Memory starting from the end of the initial commit is aligned + * to 'extension' pages, where 'extension' must be a power of 2 and no more than + * BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES + */ +#define BASE_MEM_TILER_ALIGN_TOP ((base_mem_alloc_flags)1 << 20) + +/* Use the GPU VA chosen by the kernel client */ +#define BASE_MEM_FLAG_MAP_FIXED ((base_mem_alloc_flags)1 << 27) + +/* Force trimming of JIT allocations when creating a new allocation */ +#define BASEP_MEM_PERFORM_JIT_TRIM ((base_mem_alloc_flags)1 << 29) + +/* Note that the number of bits used for base_mem_alloc_flags + * must be less than BASE_MEM_FLAGS_NR_BITS !!! + */ + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE | \ + BASE_MEM_FLAG_MAP_FIXED | BASEP_MEM_PERFORM_JIT_TRIM) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED \ + (BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_19) + + +/* Similar to BASE_MEM_TILER_ALIGN_TOP, memory starting from the end of the + * initial commit is aligned to 'extension' pages, where 'extension' must be a power + * of 2 and no more than BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES + */ +#define BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP (1 << 0) + +/** + * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE - If set, the heap info address points + * to a __u32 holding the used size in bytes; + * otherwise it points to a __u64 holding the lowest address of unused memory. + */ +#define BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE (1 << 1) + +/** + * BASE_JIT_ALLOC_VALID_FLAGS - Valid set of just-in-time memory allocation flags + * + * Note: BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE cannot be set if heap_info_gpu_addr + * in %base_jit_alloc_info is 0 (atom with BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE set + * and heap_info_gpu_addr being 0 will be rejected). + */ +#define BASE_JIT_ALLOC_VALID_FLAGS \ + (BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP | BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* + * Private flags used on the base context + * + * These start at bit 31, and run down to zero. + * + * They share the same space as base_context_create_flags, and so must + * not collide with them. + */ + +/* Private flag tracking whether job descriptor dumping is disabled */ +#define BASEP_CONTEXT_FLAG_JOB_DUMP_DISABLED \ + ((base_context_create_flags)(1 << 31)) + +/* Flags for base tracepoint specific to JM */ +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED) +/* + * Dependency stuff, keep it private for now. May want to expose it if + * we decide to make the number of semaphores a configurable + * option. + */ +#define BASE_JD_ATOM_COUNT 256 + +/* Maximum number of concurrent render passes. 
+ */ +#define BASE_JD_RP_COUNT (256) + +/* Set/reset values for a software event */ +#define BASE_JD_SOFT_EVENT_SET ((unsigned char)1) +#define BASE_JD_SOFT_EVENT_RESET ((unsigned char)0) + +/** + * struct base_jd_udata - Per-job data + * + * @blob: per-job data array + * + * This structure is used to store per-job data, and is completely unused + * by the Base driver. It can be used to store things such as callback + * function pointer, data to handle job completion. It is guaranteed to be + * untouched by the Base driver. + */ +struct base_jd_udata { + __u64 blob[2]; +}; + +/** + * typedef base_jd_dep_type - Job dependency type. + * + * A flags field will be inserted into the atom structure to specify whether a + * dependency is a data or ordering dependency (by putting it before/after + * 'core_req' in the structure it should be possible to add without changing + * the structure size). + * When the flag is set for a particular dependency to signal that it is an + * ordering only dependency then errors will not be propagated. + */ +typedef __u8 base_jd_dep_type; + +#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ +#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ +#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ + +/** + * typedef base_jd_core_req - Job chain hardware requirements. + * + * A job chain must specify what GPU features it needs to allow the + * driver to schedule the job correctly. By not specifying the + * correct settings can/will cause an early job termination. Multiple + * values can be ORed together to specify multiple requirements. + * Special case is ::BASE_JD_REQ_DEP, which is used to express complex + * dependencies, and that doesn't execute anything on the hardware. + */ +typedef __u32 base_jd_core_req; + +/* Requirements that come from the HW */ + +/* No requirement, dependency only + */ +#define BASE_JD_REQ_DEP ((base_jd_core_req)0) + +/* Requires fragment shaders + */ +#define BASE_JD_REQ_FS ((base_jd_core_req)1 << 0) + +/* Requires compute shaders + * + * This covers any of the following GPU job types: + * - Vertex Shader Job + * - Geometry Shader Job + * - An actual Compute Shader Job + * + * Compare this with BASE_JD_REQ_ONLY_COMPUTE, which specifies that the + * job is specifically just the "Compute Shader" job type, and not the "Vertex + * Shader" nor the "Geometry Shader" job type. + */ +#define BASE_JD_REQ_CS ((base_jd_core_req)1 << 1) + +/* Requires tiling */ +#define BASE_JD_REQ_T ((base_jd_core_req)1 << 2) + +/* Requires cache flushes */ +#define BASE_JD_REQ_CF ((base_jd_core_req)1 << 3) + +/* Requires value writeback */ +#define BASE_JD_REQ_V ((base_jd_core_req)1 << 4) + +/* SW-only requirements - the HW does not expose these as part of the job slot + * capabilities + */ + +/* Requires fragment job with AFBC encoding */ +#define BASE_JD_REQ_FS_AFBC ((base_jd_core_req)1 << 13) + +/* SW-only requirement: coalesce completion events. + * If this bit is set then completion of this atom will not cause an event to + * be sent to userspace, whether successful or not; completion events will be + * deferred until an atom completes which does not have this bit set. + * + * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. + */ +#define BASE_JD_REQ_EVENT_COALESCE ((base_jd_core_req)1 << 5) + +/* SW Only requirement: the job chain requires a coherent core group. We don't + * mind which coherent core group is used. 
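+ *
+ * As with the other BASE_JD_REQ_* bits, this flag is ORed into an atom's
+ * base_jd_core_req; for example (illustrative only), a vertex/tiler job
+ * chain that wants a coherent core group could request:
+ *
+ *   base_jd_core_req req = BASE_JD_REQ_CS | BASE_JD_REQ_T |
+ *                          BASE_JD_REQ_COHERENT_GROUP;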
+ */ +#define BASE_JD_REQ_COHERENT_GROUP ((base_jd_core_req)1 << 6) + +/* SW Only requirement: The performance counters should be enabled only when + * they are needed, to reduce power consumption. + */ +#define BASE_JD_REQ_PERMON ((base_jd_core_req)1 << 7) + +/* SW Only requirement: External resources are referenced by this atom. + * + * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE and + * BASE_JD_REQ_SOFT_EVENT_WAIT. + */ +#define BASE_JD_REQ_EXTERNAL_RESOURCES ((base_jd_core_req)1 << 8) + +/* SW Only requirement: Software defined job. Jobs with this bit set will not be + * submitted to the hardware but will cause some action to happen within the + * driver + */ +#define BASE_JD_REQ_SOFT_JOB ((base_jd_core_req)1 << 9) + +#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) +#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) +#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) + +/* 0x4 RESERVED for now */ + +/* SW only requirement: event wait/trigger job. + * + * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. + * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the + * other waiting jobs. It completes immediately. + * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it + * possible for other jobs to wait upon. It completes immediately. + */ +#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) +#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) +#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) + +#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) + +/* SW only requirement: Just In Time allocation + * + * This job requests a single or multiple just-in-time allocations through a + * list of base_jit_alloc_info structure which is passed via the jc element of + * the atom. The number of base_jit_alloc_info structures present in the + * list is passed via the nr_extres element of the atom + * + * It should be noted that the id entry in base_jit_alloc_info must not + * be reused until it has been released via BASE_JD_REQ_SOFT_JIT_FREE. + * + * Should this soft job fail it is expected that a BASE_JD_REQ_SOFT_JIT_FREE + * soft job to free the JIT allocation is still made. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) + +/* SW only requirement: Just In Time free + * + * This job requests a single or multiple just-in-time allocations created by + * BASE_JD_REQ_SOFT_JIT_ALLOC to be freed. The ID list of the just-in-time + * allocations is passed via the jc element of the atom. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) + +/* SW only requirement: Map external resource + * + * This job requests external resource(s) are mapped once the dependencies + * of the job have been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) + +/* SW only requirement: Unmap external resource + * + * This job requests external resource(s) are unmapped once the dependencies + * of the job has been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * base_external_resource_list. 
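+ *
+ * Illustrative, non-normative use: the soft job is submitted like any
+ * other atom, with jc pointing at a populated base_external_resource_list
+ * (declared elsewhere), e.g.:
+ *
+ *   atom.core_req = BASE_JD_REQ_SOFT_EXT_RES_UNMAP;
+ *   atom.jc       = (__u64)(uintptr_t)&res_list;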
+ */ +#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) + +/* HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) + * + * This indicates that the Job Chain contains GPU jobs of the 'Compute + * Shaders' type. + * + * In contrast to BASE_JD_REQ_CS, this does not indicate that the Job + * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. + */ +#define BASE_JD_REQ_ONLY_COMPUTE ((base_jd_core_req)1 << 10) + +/* HW Requirement: Use the base_jd_atom::device_nr field to specify a + * particular core group + * + * If both BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag + * takes priority + * + * This is only guaranteed to work for BASE_JD_REQ_ONLY_COMPUTE atoms. + */ +#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((base_jd_core_req)1 << 11) + +/* SW Flag: If this bit is set then the successful completion of this atom + * will not cause an event to be sent to userspace + */ +#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((base_jd_core_req)1 << 12) + +/* SW Flag: If this bit is set then completion of this atom will not cause an + * event to be sent to userspace, whether successful or not. + */ +#define BASEP_JD_REQ_EVENT_NEVER ((base_jd_core_req)1 << 14) + +/* SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job starts which does not have this bit set or a job completes + * which does not have the BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use + * if the CPU may have written to memory addressed by the job since the last job + * without this bit set was submitted. + */ +#define BASE_JD_REQ_SKIP_CACHE_START ((base_jd_core_req)1 << 15) + +/* SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job completes which does not have this bit set or a job starts + * which does not have the BASE_JD_REQ_SKIP_CACHE_START bit set. Do not use + * if the CPU may read from or partially overwrite memory addressed by the job + * before the next job without this bit set completes. + */ +#define BASE_JD_REQ_SKIP_CACHE_END ((base_jd_core_req)1 << 16) + +/* Request the atom be executed on a specific job slot. + * + * When this flag is specified, it takes precedence over any existing job slot + * selection logic. + */ +#define BASE_JD_REQ_JOB_SLOT ((base_jd_core_req)1 << 17) + +/* SW-only requirement: The atom is the start of a renderpass. + * + * If this bit is set then the job chain will be soft-stopped if it causes the + * GPU to write beyond the end of the physical pages backing the tiler heap, and + * committing more memory to the heap would exceed an internal threshold. It may + * be resumed after running one of the job chains attached to an atom with + * BASE_JD_REQ_END_RENDERPASS set and the same renderpass ID. It may be + * resumed multiple times until it completes without memory usage exceeding the + * threshold. + * + * Usually used with BASE_JD_REQ_T. + */ +#define BASE_JD_REQ_START_RENDERPASS ((base_jd_core_req)1 << 18) + +/* SW-only requirement: The atom is the end of a renderpass. + * + * If this bit is set then the atom incorporates the CPU address of a + * base_jd_fragment object instead of the GPU address of a job chain. 
+ * + * Which job chain is run depends upon whether the atom with the same renderpass + * ID and the BASE_JD_REQ_START_RENDERPASS bit set completed normally or + * was soft-stopped when it exceeded an upper threshold for tiler heap memory + * usage. + * + * It also depends upon whether one of the job chains attached to the atom has + * already been run as part of the same renderpass (in which case it would have + * written unresolved multisampled and otherwise-discarded output to temporary + * buffers that need to be read back). The job chain for doing a forced read and + * forced write (from/to temporary buffers) is run as many times as necessary. + * + * Usually used with BASE_JD_REQ_FS. + */ +#define BASE_JD_REQ_END_RENDERPASS ((base_jd_core_req)1 << 19) + +/* SW-only requirement: The atom needs to run on a limited core mask affinity. + * + * If this bit is set then the kbase_context.limited_core_mask will be applied + * to the affinity. + */ +#define BASE_JD_REQ_LIMITED_CORE_MASK ((base_jd_core_req)1 << 20) + +/* These requirement bits are currently unused in base_jd_core_req + */ +#define BASEP_JD_REQ_RESERVED \ + (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ + BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | BASEP_JD_REQ_EVENT_NEVER | \ + BASE_JD_REQ_EVENT_COALESCE | \ + BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ + BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ + BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END | \ + BASE_JD_REQ_JOB_SLOT | BASE_JD_REQ_START_RENDERPASS | \ + BASE_JD_REQ_END_RENDERPASS | BASE_JD_REQ_LIMITED_CORE_MASK)) + +/* Mask of all bits in base_jd_core_req that control the type of the atom. + * + * This allows dependency only atoms to have flags set + */ +#define BASE_JD_REQ_ATOM_TYPE \ + (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ + BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) + +/** + * BASE_JD_REQ_SOFT_JOB_TYPE - Mask of all bits in base_jd_core_req that + * controls the type of a soft job. + */ +#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) + +/* Returns non-zero value if core requirements passed define a soft job or + * a dependency only job. + */ +#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ + (((core_req) & BASE_JD_REQ_SOFT_JOB) || \ + ((core_req) & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) + +/** + * enum kbase_jd_atom_state - Atom states + * + * @KBASE_JD_ATOM_STATE_UNUSED: Atom is not used. + * @KBASE_JD_ATOM_STATE_QUEUED: Atom is queued in JD. + * @KBASE_JD_ATOM_STATE_IN_JS: Atom has been given to JS (is runnable/running). + * @KBASE_JD_ATOM_STATE_HW_COMPLETED: Atom has been completed, but not yet + * handed back to job dispatcher for + * dependency resolution. + * @KBASE_JD_ATOM_STATE_COMPLETED: Atom has been completed, but not yet handed + * back to userspace. + */ +enum kbase_jd_atom_state { + KBASE_JD_ATOM_STATE_UNUSED, + KBASE_JD_ATOM_STATE_QUEUED, + KBASE_JD_ATOM_STATE_IN_JS, + KBASE_JD_ATOM_STATE_HW_COMPLETED, + KBASE_JD_ATOM_STATE_COMPLETED +}; + +/** + * typedef base_atom_id - Type big enough to store an atom number in. + */ +typedef __u8 base_atom_id; + +/** + * struct base_dependency - base dependency + * + * @atom_id: An atom number + * @dependency_type: Dependency type + */ +struct base_dependency { + base_atom_id atom_id; + base_jd_dep_type dependency_type; +}; + +/** + * struct base_jd_fragment - Set of GPU fragment job chains used for rendering. + * + * @norm_read_norm_write: Job chain for full rendering. 
+ * GPU address of a fragment job chain to render in the + * circumstance where the tiler job chain did not exceed + * its memory usage threshold and no fragment job chain + * was previously run for the same renderpass. + * It is used no more than once per renderpass. + * @norm_read_forced_write: Job chain for starting incremental + * rendering. + * GPU address of a fragment job chain to render in + * the circumstance where the tiler job chain exceeded + * its memory usage threshold for the first time and + * no fragment job chain was previously run for the + * same renderpass. + * Writes unresolved multisampled and normally- + * discarded output to temporary buffers that must be + * read back by a subsequent forced_read job chain + * before the renderpass is complete. + * It is used no more than once per renderpass. + * @forced_read_forced_write: Job chain for continuing incremental + * rendering. + * GPU address of a fragment job chain to render in + * the circumstance where the tiler job chain + * exceeded its memory usage threshold again + * and a fragment job chain was previously run for + * the same renderpass. + * Reads unresolved multisampled and + * normally-discarded output from temporary buffers + * written by a previous forced_write job chain and + * writes the same to temporary buffers again. + * It is used as many times as required until + * rendering completes. + * @forced_read_norm_write: Job chain for ending incremental rendering. + * GPU address of a fragment job chain to render in the + * circumstance where the tiler job chain did not + * exceed its memory usage threshold this time and a + * fragment job chain was previously run for the same + * renderpass. + * Reads unresolved multisampled and normally-discarded + * output from temporary buffers written by a previous + * forced_write job chain in order to complete a + * renderpass. + * It is used no more than once per renderpass. + * + * This structure is referenced by the main atom structure if + * BASE_JD_REQ_END_RENDERPASS is set in the base_jd_core_req. + */ +struct base_jd_fragment { + __u64 norm_read_norm_write; + __u64 norm_read_forced_write; + __u64 forced_read_forced_write; + __u64 forced_read_norm_write; +}; + +/** + * typedef base_jd_prio - Base Atom priority. + * + * Only certain priority levels are actually implemented, as specified by the + * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority + * level that is not one of those defined below. + * + * Priority levels only affect scheduling after the atoms have had dependencies + * resolved. For example, a low priority atom that has had its dependencies + * resolved might run before a higher priority atom that has not had its + * dependencies resolved. + * + * In general, fragment atoms do not affect non-fragment atoms with + * lower priorities, and vice versa. One exception is that there is only one + * priority value for each context. So a high-priority (e.g.) fragment atom + * could increase its context priority, causing its non-fragment atoms to also + * be scheduled sooner. 
+ * + * The atoms are scheduled as follows with respect to their priorities: + * * Let atoms 'X' and 'Y' be for the same job slot who have dependencies + * resolved, and atom 'X' has a higher priority than atom 'Y' + * * If atom 'Y' is currently running on the HW, then it is interrupted to + * allow atom 'X' to run soon after + * * If instead neither atom 'Y' nor atom 'X' are running, then when choosing + * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' + * * Any two atoms that have the same priority could run in any order with + * respect to each other. That is, there is no ordering constraint between + * atoms of the same priority. + * + * The sysfs file 'js_ctx_scheduling_mode' is used to control how atoms are + * scheduled between contexts. The default value, 0, will cause higher-priority + * atoms to be scheduled first, regardless of their context. The value 1 will + * use a round-robin algorithm when deciding which context's atoms to schedule + * next, so higher-priority atoms can only preempt lower priority atoms within + * the same context. See KBASE_JS_SYSTEM_PRIORITY_MODE and + * KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE for more details. + */ +typedef __u8 base_jd_prio; + +/* Medium atom priority. This is a priority higher than BASE_JD_PRIO_LOW */ +#define BASE_JD_PRIO_MEDIUM ((base_jd_prio)0) +/* High atom priority. This is a priority higher than BASE_JD_PRIO_MEDIUM and + * BASE_JD_PRIO_LOW + */ +#define BASE_JD_PRIO_HIGH ((base_jd_prio)1) +/* Low atom priority. */ +#define BASE_JD_PRIO_LOW ((base_jd_prio)2) +/* Real-Time atom priority. This is a priority higher than BASE_JD_PRIO_HIGH, + * BASE_JD_PRIO_MEDIUM, and BASE_JD_PRIO_LOW + */ +#define BASE_JD_PRIO_REALTIME ((base_jd_prio)3) + +/* Invalid atom priority (max uint8_t value) */ +#define BASE_JD_PRIO_INVALID ((base_jd_prio)255) + +/* Count of the number of priority levels. This itself is not a valid + * base_jd_prio setting + */ +#define BASE_JD_NR_PRIO_LEVELS 4 + +/** + * struct base_jd_atom_v2 - Node of a dependency graph used to submit a + * GPU job chain or soft-job to the kernel driver. + * + * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS + * is set in the base_jd_core_req) the CPU address of a + * base_jd_fragment object. + * @udata: User data. + * @extres_list: List of external resources. + * @nr_extres: Number of external resources or JIT allocations. + * @jit_id: Zero-terminated array of IDs of just-in-time memory + * allocations written to by the atom. When the atom + * completes, the value stored at the + * &struct_base_jit_alloc_info.heap_info_gpu_addr of + * each allocation is read in order to enforce an + * overall physical memory usage limit. + * @pre_dep: Pre-dependencies. One need to use SETTER function to assign + * this field; this is done in order to reduce possibility of + * improper assignment of a dependency field. + * @atom_number: Unique number to identify the atom. + * @prio: Atom priority. Refer to base_jd_prio for more details. + * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + * specified. + * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. + * @core_req: Core requirements. + * @renderpass_id: Renderpass identifier used to associate an atom that has + * BASE_JD_REQ_START_RENDERPASS set in its core requirements + * with an atom that has BASE_JD_REQ_END_RENDERPASS set. + * @padding: Unused. Must be zero. + * + * This structure has changed since UK 10.2 for which base_jd_core_req was a + * __u16 value. 
+ * + * In UK 10.3 a core_req field of a __u32 type was added to the end of the + * structure, and the place in the structure previously occupied by __u16 + * core_req was kept but renamed to compat_core_req. + * + * From UK 11.20 - compat_core_req is now occupied by __u8 jit_id[2]. + * Compatibility with UK 10.x from UK 11.y is not handled because + * the major version increase prevents this. + * + * For UK 11.20 jit_id[2] must be initialized to zero. + */ +struct base_jd_atom_v2 { + __u64 jc; + struct base_jd_udata udata; + __u64 extres_list; + __u16 nr_extres; + __u8 jit_id[2]; + struct base_dependency pre_dep[2]; + base_atom_id atom_number; + base_jd_prio prio; + __u8 device_nr; + __u8 jobslot; + base_jd_core_req core_req; + __u8 renderpass_id; + __u8 padding[7]; +}; + +/** + * struct base_jd_atom - Same as base_jd_atom_v2, but has an extra seq_nr + * at the beginning. + * + * @seq_nr: Sequence number of logical grouping of atoms. + * @jc: GPU address of a job chain or (if BASE_JD_REQ_END_RENDERPASS + * is set in the base_jd_core_req) the CPU address of a + * base_jd_fragment object. + * @udata: User data. + * @extres_list: List of external resources. + * @nr_extres: Number of external resources or JIT allocations. + * @jit_id: Zero-terminated array of IDs of just-in-time memory + * allocations written to by the atom. When the atom + * completes, the value stored at the + * &struct_base_jit_alloc_info.heap_info_gpu_addr of + * each allocation is read in order to enforce an + * overall physical memory usage limit. + * @pre_dep: Pre-dependencies. One need to use SETTER function to assign + * this field; this is done in order to reduce possibility of + * improper assignment of a dependency field. + * @atom_number: Unique number to identify the atom. + * @prio: Atom priority. Refer to base_jd_prio for more details. + * @device_nr: Core group when BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + * specified. + * @jobslot: Job slot to use when BASE_JD_REQ_JOB_SLOT is specified. + * @core_req: Core requirements. + * @renderpass_id: Renderpass identifier used to associate an atom that has + * BASE_JD_REQ_START_RENDERPASS set in its core requirements + * with an atom that has BASE_JD_REQ_END_RENDERPASS set. + * @padding: Unused. Must be zero. + */ +typedef struct base_jd_atom { + __u64 seq_nr; + __u64 jc; + struct base_jd_udata udata; + __u64 extres_list; + __u16 nr_extres; + __u8 jit_id[2]; + struct base_dependency pre_dep[2]; + base_atom_id atom_number; + base_jd_prio prio; + __u8 device_nr; + __u8 jobslot; + base_jd_core_req core_req; + __u8 renderpass_id; + __u8 padding[7]; +} base_jd_atom; + +/* Job chain event code bits + * Defines the bits used to create ::base_jd_event_code + */ +enum { + BASE_JD_SW_EVENT_KERNEL = (1u << 15), /* Kernel side event */ + BASE_JD_SW_EVENT = (1u << 14), /* SW defined event */ + /* Event indicates success (SW events only) */ + BASE_JD_SW_EVENT_SUCCESS = (1u << 13), + BASE_JD_SW_EVENT_JOB = (0u << 11), /* Job related event */ + BASE_JD_SW_EVENT_BAG = (1u << 11), /* Bag related event */ + BASE_JD_SW_EVENT_INFO = (2u << 11), /* Misc/info event */ + BASE_JD_SW_EVENT_RESERVED = (3u << 11), /* Reserved event type */ + /* Mask to extract the type from an event code */ + BASE_JD_SW_EVENT_TYPE_MASK = (3u << 11) +}; + +/** + * enum base_jd_event_code - Job chain event codes + * + * @BASE_JD_EVENT_RANGE_HW_NONFAULT_START: Start of hardware non-fault status + * codes. + * Obscurely, BASE_JD_EVENT_TERMINATED + * indicates a real fault, because the + * job was hard-stopped. 
+ * @BASE_JD_EVENT_NOT_STARTED: Can't be seen by userspace, treated as + * 'previous job done'. + * @BASE_JD_EVENT_STOPPED: Can't be seen by userspace, becomes + * TERMINATED, DONE or JOB_CANCELLED. + * @BASE_JD_EVENT_TERMINATED: This is actually a fault status code - the job + * was hard stopped. + * @BASE_JD_EVENT_ACTIVE: Can't be seen by userspace, jobs only returned on + * complete/fail/cancel. + * @BASE_JD_EVENT_RANGE_HW_NONFAULT_END: End of hardware non-fault status codes. + * Obscurely, BASE_JD_EVENT_TERMINATED + * indicates a real fault, + * because the job was hard-stopped. + * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START: Start of hardware fault and + * software error status codes. + * @BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END: End of hardware fault and + * software error status codes. + * @BASE_JD_EVENT_RANGE_SW_SUCCESS_START: Start of software success status + * codes. + * @BASE_JD_EVENT_RANGE_SW_SUCCESS_END: End of software success status codes. + * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_START: Start of kernel-only status codes. + * Such codes are never returned to + * user-space. + * @BASE_JD_EVENT_RANGE_KERNEL_ONLY_END: End of kernel-only status codes. + * @BASE_JD_EVENT_DONE: atom has completed successfull + * @BASE_JD_EVENT_JOB_CONFIG_FAULT: Atom dependencies configuration error which + * shall result in a failed atom + * @BASE_JD_EVENT_JOB_POWER_FAULT: The job could not be executed because the + * part of the memory system required to access + * job descriptors was not powered on + * @BASE_JD_EVENT_JOB_READ_FAULT: Reading a job descriptor into the Job + * manager failed + * @BASE_JD_EVENT_JOB_WRITE_FAULT: Writing a job descriptor from the Job + * manager failed + * @BASE_JD_EVENT_JOB_AFFINITY_FAULT: The job could not be executed because the + * specified affinity mask does not intersect + * any available cores + * @BASE_JD_EVENT_JOB_BUS_FAULT: A bus access failed while executing a job + * @BASE_JD_EVENT_INSTR_INVALID_PC: A shader instruction with an illegal program + * counter was executed. + * @BASE_JD_EVENT_INSTR_INVALID_ENC: A shader instruction with an illegal + * encoding was executed. + * @BASE_JD_EVENT_INSTR_TYPE_MISMATCH: A shader instruction was executed where + * the instruction encoding did not match the + * instruction type encoded in the program + * counter. + * @BASE_JD_EVENT_INSTR_OPERAND_FAULT: A shader instruction was executed that + * contained invalid combinations of operands. + * @BASE_JD_EVENT_INSTR_TLS_FAULT: A shader instruction was executed that tried + * to access the thread local storage section + * of another thread. + * @BASE_JD_EVENT_INSTR_ALIGN_FAULT: A shader instruction was executed that + * tried to do an unsupported unaligned memory + * access. + * @BASE_JD_EVENT_INSTR_BARRIER_FAULT: A shader instruction was executed that + * failed to complete an instruction barrier. + * @BASE_JD_EVENT_DATA_INVALID_FAULT: Any data structure read as part of the job + * contains invalid combinations of data. + * @BASE_JD_EVENT_TILE_RANGE_FAULT: Tile or fragment shading was asked to + * process a tile that is entirely outside the + * bounding box of the frame. + * @BASE_JD_EVENT_STATE_FAULT: Matches ADDR_RANGE_FAULT. A virtual address + * has been found that exceeds the virtual + * address range. + * @BASE_JD_EVENT_OUT_OF_MEMORY: The tiler ran out of memory when executing a job. + * @BASE_JD_EVENT_UNKNOWN: If multiple jobs in a job chain fail, only + * the first one the reports an error will set + * and return full error information. 
+ * Subsequent failing jobs will not update the + * error status registers, and may write an + * error status of UNKNOWN. + * @BASE_JD_EVENT_DELAYED_BUS_FAULT: The GPU received a bus fault for access to + * physical memory where the original virtual + * address is no longer available. + * @BASE_JD_EVENT_SHAREABILITY_FAULT: Matches GPU_SHAREABILITY_FAULT. A cache + * has detected that the same line has been + * accessed as both shareable and non-shareable + * memory from inside the GPU. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1: A memory access hit an invalid table + * entry at level 1 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2: A memory access hit an invalid table + * entry at level 2 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3: A memory access hit an invalid table + * entry at level 3 of the translation table. + * @BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4: A memory access hit an invalid table + * entry at level 4 of the translation table. + * @BASE_JD_EVENT_PERMISSION_FAULT: A memory access could not be allowed due to + * the permission flags set in translation + * table + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1: A bus fault occurred while reading + * level 0 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2: A bus fault occurred while reading + * level 1 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3: A bus fault occurred while reading + * level 2 of the translation tables. + * @BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4: A bus fault occurred while reading + * level 3 of the translation tables. + * @BASE_JD_EVENT_ACCESS_FLAG: Matches ACCESS_FLAG_0. A memory access hit a + * translation table entry with the ACCESS_FLAG + * bit set to zero in level 0 of the + * page table, and the DISABLE_AF_FAULT flag + * was not set. + * @BASE_JD_EVENT_MEM_GROWTH_FAILED: raised for JIT_ALLOC atoms that failed to + * grow memory on demand + * @BASE_JD_EVENT_JOB_CANCELLED: raised when this atom was hard-stopped or its + * dependencies failed + * @BASE_JD_EVENT_JOB_INVALID: raised for many reasons, including invalid data + * in the atom which overlaps with + * BASE_JD_EVENT_JOB_CONFIG_FAULT, or if the + * platform doesn't support the feature specified in + * the atom. + * @BASE_JD_EVENT_DRV_TERMINATED: this is a special event generated to indicate + * to userspace that the KBase context has been + * destroyed and Base should stop listening for + * further events + * @BASE_JD_EVENT_REMOVED_FROM_NEXT: raised when an atom that was configured in + * the GPU has to be retried (but it has not + * started) due to e.g., GPU reset + * @BASE_JD_EVENT_END_RP_DONE: this is used for incremental rendering to signal + * the completion of a renderpass. This value + * shouldn't be returned to userspace but I haven't + * seen where it is reset back to JD_EVENT_DONE. + * + * HW and low-level SW events are represented by event codes. + * The status of jobs which succeeded are also represented by + * an event code (see @BASE_JD_EVENT_DONE). + * Events are usually reported as part of a &struct base_jd_event. 
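+ *
+ * For example (illustrative only), given the bit layout described below,
+ * a reported code can be classified with:
+ *
+ *   bool is_sw_event = (code & BASE_JD_SW_EVENT) != 0;
+ *   bool is_success  = is_sw_event && (code & BASE_JD_SW_EVENT_SUCCESS);
+ *   __u32 sw_type    = code & BASE_JD_SW_EVENT_TYPE_MASK;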
+ * + * The event codes are encoded in the following way: + * * 10:0 - subtype + * * 12:11 - type + * * 13 - SW success (only valid if the SW bit is set) + * * 14 - SW event (HW event if not set) + * * 15 - Kernel event (should never be seen in userspace) + * + * Events are split up into ranges as follows: + * * BASE_JD_EVENT_RANGE__START + * * BASE_JD_EVENT_RANGE__END + * + * code is in 's range when: + * BASE_JD_EVENT_RANGE__START <= code < + * BASE_JD_EVENT_RANGE__END + * + * Ranges can be asserted for adjacency by testing that the END of the previous + * is equal to the START of the next. This is useful for optimizing some tests + * for range. + * + * A limitation is that the last member of this enum must explicitly be handled + * (with an assert-unreachable statement) in switch statements that use + * variables of this type. Otherwise, the compiler warns that we have not + * handled that enum value. + */ +enum base_jd_event_code { + /* HW defined exceptions */ + BASE_JD_EVENT_RANGE_HW_NONFAULT_START = 0, + + /* non-fatal exceptions */ + BASE_JD_EVENT_NOT_STARTED = 0x00, + BASE_JD_EVENT_DONE = 0x01, + BASE_JD_EVENT_STOPPED = 0x03, + BASE_JD_EVENT_TERMINATED = 0x04, + BASE_JD_EVENT_ACTIVE = 0x08, + + BASE_JD_EVENT_RANGE_HW_NONFAULT_END = 0x40, + BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_START = 0x40, + + /* job exceptions */ + BASE_JD_EVENT_JOB_CONFIG_FAULT = 0x40, + BASE_JD_EVENT_JOB_POWER_FAULT = 0x41, + BASE_JD_EVENT_JOB_READ_FAULT = 0x42, + BASE_JD_EVENT_JOB_WRITE_FAULT = 0x43, + BASE_JD_EVENT_JOB_AFFINITY_FAULT = 0x44, + BASE_JD_EVENT_JOB_BUS_FAULT = 0x48, + BASE_JD_EVENT_INSTR_INVALID_PC = 0x50, + BASE_JD_EVENT_INSTR_INVALID_ENC = 0x51, + BASE_JD_EVENT_INSTR_TYPE_MISMATCH = 0x52, + BASE_JD_EVENT_INSTR_OPERAND_FAULT = 0x53, + BASE_JD_EVENT_INSTR_TLS_FAULT = 0x54, + BASE_JD_EVENT_INSTR_BARRIER_FAULT = 0x55, + BASE_JD_EVENT_INSTR_ALIGN_FAULT = 0x56, + BASE_JD_EVENT_DATA_INVALID_FAULT = 0x58, + BASE_JD_EVENT_TILE_RANGE_FAULT = 0x59, + BASE_JD_EVENT_STATE_FAULT = 0x5A, + BASE_JD_EVENT_OUT_OF_MEMORY = 0x60, + BASE_JD_EVENT_UNKNOWN = 0x7F, + + /* GPU exceptions */ + BASE_JD_EVENT_DELAYED_BUS_FAULT = 0x80, + BASE_JD_EVENT_SHAREABILITY_FAULT = 0x88, + + /* MMU exceptions */ + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL1 = 0xC1, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL2 = 0xC2, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL3 = 0xC3, + BASE_JD_EVENT_TRANSLATION_FAULT_LEVEL4 = 0xC4, + BASE_JD_EVENT_PERMISSION_FAULT = 0xC8, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL1 = 0xD1, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL2 = 0xD2, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL3 = 0xD3, + BASE_JD_EVENT_TRANSTAB_BUS_FAULT_LEVEL4 = 0xD4, + BASE_JD_EVENT_ACCESS_FLAG = 0xD8, + + /* SW defined exceptions */ + BASE_JD_EVENT_MEM_GROWTH_FAILED = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x000, + BASE_JD_EVENT_JOB_CANCELLED = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x002, + BASE_JD_EVENT_JOB_INVALID = + BASE_JD_SW_EVENT | BASE_JD_SW_EVENT_JOB | 0x003, + + BASE_JD_EVENT_RANGE_HW_FAULT_OR_SW_ERROR_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_RESERVED | 0x3FF, + + BASE_JD_EVENT_RANGE_SW_SUCCESS_START = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | 0x000, + + BASE_JD_EVENT_DRV_TERMINATED = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_INFO | 0x000, + + BASE_JD_EVENT_RANGE_SW_SUCCESS_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_SUCCESS | BASE_JD_SW_EVENT_RESERVED | 0x3FF, + + BASE_JD_EVENT_RANGE_KERNEL_ONLY_START = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | 0x000, + BASE_JD_EVENT_REMOVED_FROM_NEXT = 
BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x000, + BASE_JD_EVENT_END_RP_DONE = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_JOB | 0x001, + + BASE_JD_EVENT_RANGE_KERNEL_ONLY_END = BASE_JD_SW_EVENT | + BASE_JD_SW_EVENT_KERNEL | BASE_JD_SW_EVENT_RESERVED | 0x3FF +}; + +/** + * struct base_jd_event_v2 - Event reporting structure + * + * @event_code: event code of type @ref base_jd_event_code. + * @atom_number: the atom number that has completed. + * @padding: padding. + * @udata: user data. + * + * This structure is used by the kernel driver to report information + * about GPU events. They can either be HW-specific events or low-level + * SW events, such as job-chain completion. + * + * The event code contains an event type field which can be extracted + * by ANDing with BASE_JD_SW_EVENT_TYPE_MASK. + */ +struct base_jd_event_v2 { + __u32 event_code; + base_atom_id atom_number; + __u8 padding[3]; + struct base_jd_udata udata; +}; + +/** + * struct base_dump_cpu_gpu_counters - Structure for + * BASE_JD_REQ_SOFT_DUMP_CPU_GPU_COUNTERS + * jobs. + * @system_time: gpu timestamp + * @cycle_counter: gpu cycle count + * @sec: cpu time(sec) + * @usec: cpu time(usec) + * @padding: padding + * + * This structure is stored into the memory pointed to by the @jc field + * of &struct base_jd_atom. + * + * It must not occupy the same CPU cache line(s) as any neighboring data. + * This is to avoid cases where access to pages containing the structure + * is shared between cached and un-cached memory regions, which would + * cause memory corruption. + */ + +struct base_dump_cpu_gpu_counters { + __u64 system_time; + __u64 cycle_counter; + __u64 sec; + __u32 usec; + __u8 padding[36]; +}; + +/** + * struct mali_base_gpu_core_props - GPU core props info + * + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to align to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + * @num_exec_engines: The number of execution engines. Only valid for tGOX + * (Bifrost) GPUs, where GPU_HAS_REG_CORE_FEATURES is defined. Otherwise, + * this is always 0. 
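+ *
+ * For instance (illustrative only), given a struct mali_base_gpu_core_props
+ * props returned by the kernel, support for the texture format whose
+ * TEXTURE_FEATURES bit index is fmt_bit can be tested as:
+ *
+ *   __u32 word = props.texture_features[fmt_bit / 32];
+ *   bool supported = (word >> (fmt_bit % 32)) & 1;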
+ */
+struct mali_base_gpu_core_props {
+        __u32 product_id;
+        __u16 version_status;
+        __u16 minor_revision;
+        __u16 major_revision;
+        __u16 padding;
+        __u32 gpu_freq_khz_max;
+        __u32 log2_program_counter_size;
+        __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS];
+        __u64 gpu_available_memory_size;
+        __u8 num_exec_engines;
+};
+
+#endif /* _UAPI_BASE_JM_KERNEL_H_ */
diff --git a/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h
new file mode 100644
index 00000000000..20d931adc9b
--- /dev/null
+++ b/src/panfrost/base/include/jm/mali_kbase_jm_ioctl.h
@@ -0,0 +1,231 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#ifndef _UAPI_KBASE_JM_IOCTL_H_
+#define _UAPI_KBASE_JM_IOCTL_H_
+
+#include <asm-generic/ioctl.h>
+#include <linux/types.h>
+
+/*
+ * 11.1:
+ * - Add BASE_MEM_TILER_ALIGN_TOP under base_mem_alloc_flags
+ * 11.2:
+ * - KBASE_MEM_QUERY_FLAGS can return KBASE_REG_PF_GROW and KBASE_REG_PROTECTED,
+ *   which some user-side clients prior to 11.2 might fault if they received
+ *   them
+ * 11.3:
+ * - New ioctls KBASE_IOCTL_STICKY_RESOURCE_MAP and
+ *   KBASE_IOCTL_STICKY_RESOURCE_UNMAP
+ * 11.4:
+ * - New ioctl KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET
+ * 11.5:
+ * - New ioctl: KBASE_IOCTL_MEM_JIT_INIT (old ioctl renamed to _OLD)
+ * 11.6:
+ * - Added flags field to base_jit_alloc_info structure, which can be used to
+ *   specify pseudo chunked tiler alignment for JIT allocations.
+ * 11.7:
+ * - Removed UMP support
+ * 11.8:
+ * - Added BASE_MEM_UNCACHED_GPU under base_mem_alloc_flags
+ * 11.9:
+ * - Added BASE_MEM_PERMANENT_KERNEL_MAPPING and BASE_MEM_FLAGS_KERNEL_ONLY
+ *   under base_mem_alloc_flags
+ * 11.10:
+ * - Enabled the use of nr_extres field of base_jd_atom_v2 structure for
+ *   JIT_ALLOC and JIT_FREE type softjobs to enable multiple JIT allocations
+ *   with one softjob.
+ * 11.11:
+ * - Added BASE_MEM_GPU_VA_SAME_4GB_PAGE under base_mem_alloc_flags
+ * 11.12:
+ * - Removed ioctl: KBASE_IOCTL_GET_PROFILING_CONTROLS
+ * 11.13:
+ * - New ioctl: KBASE_IOCTL_MEM_EXEC_INIT
+ * 11.14:
+ * - Add BASE_MEM_GROUP_ID_MASK, base_mem_group_id_get, base_mem_group_id_set
+ *   under base_mem_alloc_flags
+ * 11.15:
+ * - Added BASEP_CONTEXT_MMU_GROUP_ID_MASK under base_context_create_flags.
+ * - Require KBASE_IOCTL_SET_FLAGS before BASE_MEM_MAP_TRACKING_HANDLE can be
+ *   passed to mmap().
+ * 11.16:
+ * - Extended ioctl KBASE_IOCTL_MEM_SYNC to accept imported dma-buf.
+ * - Modified (backwards compatible) ioctl KBASE_IOCTL_MEM_IMPORT behavior for
+ *   dma-buf. Now, buffers are mapped on GPU when first imported, no longer
+ *   requiring external resource or sticky resource tracking. UNLESS,
+ *   CONFIG_MALI_DMA_BUF_MAP_ON_DEMAND is enabled.
+ * 11.17:
+ * - Added BASE_JD_REQ_JOB_SLOT.
+ * - Reused padding field in base_jd_atom_v2 to pass job slot number. + * - New ioctl: KBASE_IOCTL_GET_CPU_GPU_TIMEINFO + * 11.18: + * - Added BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP under base_mem_alloc_flags + * 11.19: + * - Extended base_jd_atom_v2 to allow a renderpass ID to be specified. + * 11.20: + * - Added new phys_pages member to kbase_ioctl_mem_jit_init for + * KBASE_IOCTL_MEM_JIT_INIT, previous variants of this renamed to use _10_2 + * (replacing '_OLD') and _11_5 suffixes + * - Replaced compat_core_req (deprecated in 10.3) with jit_id[2] in + * base_jd_atom_v2. It must currently be initialized to zero. + * - Added heap_info_gpu_addr to base_jit_alloc_info, and + * BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE allowable in base_jit_alloc_info's + * flags member. Previous variants of this structure are kept and given _10_2 + * and _11_5 suffixes. + * - The above changes are checked for safe values in usual builds + * 11.21: + * - v2.0 of mali_trace debugfs file, which now versions the file separately + * 11.22: + * - Added base_jd_atom (v3), which is seq_nr + base_jd_atom_v2. + * KBASE_IOCTL_JOB_SUBMIT supports both in parallel. + * 11.23: + * - Modified KBASE_IOCTL_MEM_COMMIT behavior to reject requests to modify + * the physical memory backing of JIT allocations. This was not supposed + * to be a valid use case, but it was allowed by the previous implementation. + * 11.24: + * - Added a sysfs file 'serialize_jobs' inside a new sub-directory + * 'scheduling'. + * 11.25: + * - Enabled JIT pressure limit in base/kbase by default + * 11.26 + * - Added kinstr_jm API + * 11.27 + * - Backwards compatible extension to HWC ioctl. + * 11.28: + * - Added kernel side cache ops needed hint + * 11.29: + * - Reserve ioctl 52 + * 11.30: + * - Add a new priority level BASE_JD_PRIO_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 11.31: + * - Added BASE_JD_REQ_LIMITED_CORE_MASK. + * - Added ioctl 55: set_limited_core_count. + * 11.32: + * - Added new HW performance counters interface to all GPUs. + * 11.33: + * - Removed Kernel legacy HWC interface + * 11.34: + * - First release of new HW performance counters interface. 
+ * 11.35: + * - Dummy model (no mali) backend will now clear HWC values after each sample + */ +#define BASE_UK_VERSION_MAJOR 11 +#define BASE_UK_VERSION_MINOR 35 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + + +/** + * struct kbase_ioctl_job_submit - Submit jobs/atoms to the kernel + * + * @addr: Memory address of an array of struct base_jd_atom_v2 or v3 + * @nr_atoms: Number of entries in the array + * @stride: sizeof(struct base_jd_atom_v2) or sizeof(struct base_jd_atom) + */ +struct kbase_ioctl_job_submit { + __u64 addr; + __u32 nr_atoms; + __u32 stride; +}; + +#define KBASE_IOCTL_JOB_SUBMIT \ + _IOW(KBASE_IOCTL_TYPE, 2, struct kbase_ioctl_job_submit) + +#define KBASE_IOCTL_POST_TERM \ + _IO(KBASE_IOCTL_TYPE, 4) + +/** + * struct kbase_ioctl_soft_event_update - Update the status of a soft-event + * @event: GPU address of the event which has been updated + * @new_status: The new status to set + * @flags: Flags for future expansion + */ +struct kbase_ioctl_soft_event_update { + __u64 event; + __u32 new_status; + __u32 flags; +}; + +#define KBASE_IOCTL_SOFT_EVENT_UPDATE \ + _IOW(KBASE_IOCTL_TYPE, 28, struct kbase_ioctl_soft_event_update) + +/** + * struct kbase_kinstr_jm_fd_out - Explains the compatibility information for + * the `struct kbase_kinstr_jm_atom_state_change` structure returned from the + * kernel + * + * @size: The size of the `struct kbase_kinstr_jm_atom_state_change` + * @version: Represents a breaking change in the + * `struct kbase_kinstr_jm_atom_state_change` + * @padding: Explicit padding to get the structure up to 64bits. See + * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst + * + * The `struct kbase_kinstr_jm_atom_state_change` may have extra members at the + * end of the structure that older user space might not understand. If the + * `version` is the same, the structure is still compatible with newer kernels. + * The `size` can be used to cast the opaque memory returned from the kernel. + */ +struct kbase_kinstr_jm_fd_out { + __u16 size; + __u8 version; + __u8 padding[5]; +}; + +/** + * struct kbase_kinstr_jm_fd_in - Options when creating the file descriptor + * + * @count: Number of atom states that can be stored in the kernel circular + * buffer. Must be a power of two + * @padding: Explicit padding to get the structure up to 64bits. See + * https://www.kernel.org/doc/Documentation/ioctl/botching-up-ioctls.rst + */ +struct kbase_kinstr_jm_fd_in { + __u16 count; + __u8 padding[6]; +}; + +union kbase_kinstr_jm_fd { + struct kbase_kinstr_jm_fd_in in; + struct kbase_kinstr_jm_fd_out out; +}; + +#define KBASE_IOCTL_KINSTR_JM_FD \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_kinstr_jm_fd) + + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#endif /* _UAPI_KBASE_JM_IOCTL_H_ */ diff --git a/src/panfrost/base/include/mali_base_common_kernel.h b/src/panfrost/base/include/mali_base_common_kernel.h new file mode 100644 index 00000000000..f8378146ace --- /dev/null +++ b/src/panfrost/base/include/mali_base_common_kernel.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. 
+ * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_COMMON_KERNEL_H_ +#define _UAPI_BASE_COMMON_KERNEL_H_ + +#include + +struct base_mem_handle { + struct { + __u64 handle; + } basep; +}; + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +/* Memory allocation, access/hint flags & mask. + * + * See base_mem_alloc_flags. + */ + +/* IN */ +/* Read access CPU side + */ +#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) + +/* Write access CPU side + */ +#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) + +/* Read access GPU side + */ +#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) + +/* Write access GPU side + */ +#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) + +/* Execute allowed on the GPU side + */ +#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) + +/* Will be permanently mapped in kernel space. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) + +/* The allocation will completely reside within the same 4GB chunk in the GPU + * virtual space. + * Since this flag is primarily required only for the TLS memory which will + * not be used to contain executable code and also not used for Tiler heap, + * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. + */ +#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) + +/* Userspace is not allowed to free this memory. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) + +/* Grow backing store on GPU Page Fault + */ +#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) + +/* Page coherence Outer shareable, if available + */ +#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) + +/* Page coherence Inner shareable + */ +#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) + +/* IN/OUT */ +/* Should be cached on the CPU, returned if actually cached + */ +#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) + +/* IN/OUT */ +/* Must have same VA on both the GPU and the CPU + */ +#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) + +/* OUT */ +/* Must call mmap to acquire a GPU address for the allocation + */ +#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) + +/* IN */ +/* Page coherence Outer shareable, required. 
+ */ +#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) + +/* Protected memory + */ +#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) + +/* Not needed physical memory + */ +#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) + +/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the + * addresses to be the same + */ +#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) + +/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu + * mode. Some components within the GPU might only be able to access memory + * that is GPU cacheable. Refer to the specific GPU implementation for more + * details. The 3 shareability flags will be ignored for GPU uncached memory. + * If used while importing USER_BUFFER type memory, then the import will fail + * if the memory is not aligned to GPU and CPU cache line width. + */ +#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) + +/* + * Bits [22:25] for group_id (0~15). + * + * base_mem_group_id_set() should be used to pack a memory group ID into a + * base_mem_alloc_flags value instead of accessing the bits directly. + * base_mem_group_id_get() should be used to extract the memory group ID from + * a base_mem_alloc_flags value. + */ +#define BASEP_MEM_GROUP_ID_SHIFT 22 +#define BASE_MEM_GROUP_ID_MASK ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) + +/* Must do CPU cache maintenance when imported memory is mapped/unmapped + * on GPU. Currently applicable to dma-buf type only. + */ +#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) + +/* OUT */ +/* Kernel side cache sync ops required */ +#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) + +/* Number of bits used as flags for base memory management + * + * Must be kept in sync with the base_mem_alloc_flags flags + */ +#define BASE_MEM_FLAGS_NR_BITS 30 + +/* A mask for all output bits, excluding IN/OUT bits. + */ +#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP + +/* A mask for all input bits, including IN/OUT bits. + */ +#define BASE_MEM_FLAGS_INPUT_MASK \ + (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) + +/* Special base mem handles. + */ +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) +/* reserved handles ..-47< for future special handles */ +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_FIRST_FREE_ADDRESS ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) + +/* Flags to pass to ::base_context_init. + * Flags can be ORed together to enable multiple things. + * + * These share the same space as BASEP_CONTEXT_FLAG_*, and so must + * not collide with them. + */ +typedef __u32 base_context_create_flags; + +/* Flags for base context */ + +/* No flags set */ +#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) + +/* Base context is embedded in a cctx object (flag used for CINSTR + * software counter macros) + */ +#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) + +/* Base context is a 'System Monitor' context for Hardware counters. + * + * One important side effect of this is that job submission is disabled. 
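To make the bit layout above concrete, here is a small sketch (not part of the patch; the helper names are made up) that packs a group ID the way base_mem_group_id_set() is described as doing and combines it with the access flags. The base_mem_alloc_flags typedef itself is introduced a little further down, in mali_base_kernel.h.

/* Hypothetical helper mirroring the documented packing rule: shift the group
 * ID into bits 22:25 and mask it. */
static inline base_mem_alloc_flags
example_mem_group_id_set(base_mem_alloc_flags flags, unsigned int group_id)
{
        return flags |
               (((base_mem_alloc_flags)group_id << BASEP_MEM_GROUP_ID_SHIFT) &
                BASE_MEM_GROUP_ID_MASK);
}

static base_mem_alloc_flags example_shared_rw_flags(void)
{
        /* CPU- and GPU-readable/writable, CPU-cached, shared-VA allocation
         * placed in memory group 1 (the group number is arbitrary). */
        return example_mem_group_id_set(BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR |
                                        BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |
                                        BASE_MEM_CACHED_CPU | BASE_MEM_SAME_VA, 1);
}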
+ */ +#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED ((base_context_create_flags)1 << 1) + +/* Bit-shift used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) + +/* Bitmask used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ + ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* Bitpattern describing the base_context_create_flags that can be + * passed to the kernel + */ +#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ + (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | BASEP_CONTEXT_MMU_GROUP_ID_MASK) + +/* Flags for base tracepoint + */ + +/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, + * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) + */ +#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) + +/* Indicate that job dumping is enabled. This could affect certain timers + * to account for the performance impact. + */ +#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) + +#endif /* _UAPI_BASE_COMMON_KERNEL_H_ */ diff --git a/src/panfrost/base/include/mali_base_kernel.h b/src/panfrost/base/include/mali_base_kernel.h new file mode 100644 index 00000000000..3d826c720b2 --- /dev/null +++ b/src/panfrost/base/include/mali_base_kernel.h @@ -0,0 +1,700 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * Base structures shared with the kernel. + */ + +#ifndef _UAPI_BASE_KERNEL_H_ +#define _UAPI_BASE_KERNEL_H_ + +#include +#include "mali_base_common_kernel.h" + +#define BASE_MAX_COHERENT_GROUPS 16 + +#if defined(PAGE_MASK) && defined(PAGE_SHIFT) +#define LOCAL_PAGE_SHIFT PAGE_SHIFT +#define LOCAL_PAGE_LSB ~PAGE_MASK +#else +#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 +#endif + +#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) +#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) +#else +#error Failed to find page size +#endif +#endif + +/* Physical memory group ID for normal usage. + */ +#define BASE_MEM_GROUP_DEFAULT (0) + +/* Number of physical memory groups. + */ +#define BASE_MEM_GROUP_COUNT (16) + +/** + * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. + * + * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator + * in order to determine the best cache policy. Some combinations are + * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), + * which defines a write-only region on the CPU side, which is + * heavily read by the CPU... + * Other flags are only meaningful to a particular allocator. 
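Most kbase ioctls declared later in this series take sizes as page counts (va_pages, commit_pages and so on), so byte sizes get rounded up with the page definitions just above. A minimal sketch with a made-up helper name, not part of the patch; in both branches LOCAL_PAGE_LSB works out to page_size - 1, so the rounding is the same either way.

/* Round a byte count up to whole pages using the LOCAL_PAGE_* definitions. */
static inline __u64 bytes_to_pages(__u64 bytes)
{
        return (bytes + LOCAL_PAGE_LSB) >> LOCAL_PAGE_SHIFT;
}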
+ * More flags can be added to this list, as long as they don't clash + * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). + */ +typedef __u32 base_mem_alloc_flags; + +/* A mask for all the flags which are modifiable via the base_mem_set_flags + * interface. + */ +#define BASE_MEM_FLAGS_MODIFIABLE \ + (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ + BASE_MEM_COHERENT_LOCAL) + +/* A mask of all the flags that can be returned via the base_mem_get_flags() + * interface. + */ +#define BASE_MEM_FLAGS_QUERYABLE \ + (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ + BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ + BASEP_MEM_FLAGS_KERNEL_ONLY)) + +/** + * enum base_mem_import_type - Memory types supported by @a base_mem_import + * + * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type + * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) + * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a + * base_mem_import_user_buffer + * + * Each type defines what the supported handle type is. + * + * If any new type is added here ARM must be contacted + * to allocate a numeric value for it. + * Do not just add a new type without synchronizing with ARM + * as future releases from ARM might include other new types + * which could clash with your custom types. + */ +enum base_mem_import_type { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + /* + * Import type with value 1 is deprecated. + */ + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 +}; + +/** + * struct base_mem_import_user_buffer - Handle of an imported user buffer + * + * @ptr: address of imported user buffer + * @length: length of imported user buffer in bytes + * + * This structure is used to represent a handle of an imported user buffer. + */ + +struct base_mem_import_user_buffer { + __u64 ptr; + __u64 length; +}; + +/* Mask to detect 4GB boundary alignment */ +#define BASE_MEM_MASK_4GB 0xfffff000UL +/* Mask to detect 4GB boundary (in page units) alignment */ +#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) + +/* Limit on the 'extension' parameter for an allocation with the + * BASE_MEM_TILER_ALIGN_TOP flag set + * + * This is the same as the maximum limit for a Buffer Descriptor's chunk size + */ +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ + (21u - (LOCAL_PAGE_SHIFT)) +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ + (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) + +/* Bit mask of cookies used for memory allocation setup */ +#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ + +/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ +#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ + +/* + * struct base_fence - Cross-device synchronisation fence. + * + * A fence is used to signal when the GPU has finished accessing a resource that + * may be shared with other devices, and also to delay work done asynchronously + * by the GPU until other devices have finished accessing a shared resource. + */ +struct base_fence { + struct { + int fd; + int stream_fd; + } basep; +}; + +/** + * struct base_mem_aliasing_info - Memory aliasing info + * + * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * @offset: Offset within the handle to start aliasing from, in pages. + * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. + * @length: Length to alias, in pages. 
For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * specifies the number of times the special page is needed. + * + * Describes a memory handle to be aliased. + * A subset of the handle can be chosen for aliasing, given an offset and a + * length. + * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a + * region where a special page is mapped with a write-alloc cache setup, + * typically used when the write result of the GPU isn't needed, but the GPU + * must write anyway. + * + * Offset and length are specified in pages. + * Offset must be within the size of the handle. + * Offset+length must not overrun the size of the handle. + */ +struct base_mem_aliasing_info { + struct base_mem_handle handle; + __u64 offset; + __u64 length; +}; + +/* Maximum percentage of just-in-time memory allocation trimming to perform + * on free. + */ +#define BASE_JIT_MAX_TRIM_LEVEL (100) + +/* Maximum number of concurrent just-in-time memory allocations. + */ +#define BASE_JIT_ALLOC_COUNT (255) + +/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 + * + * jit_version is 1 + * + * Due to the lack of padding specified, user clients between 32 and 64-bit + * may have assumed a different size of the struct + * + * An array of structures was not supported + */ +struct base_jit_alloc_info_10_2 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; +}; + +/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up + * to 11.19 + * + * This structure had a number of modifications during and after kernel driver + * version 11.5, but remains size-compatible throughout its version history, and + * with earlier variants compatible with future variants by requiring + * zero-initialization to the unused space in the structure. + * + * jit_version is 2 + * + * Kernel driver version history: + * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes + * must be zero. Kbase minor version was not incremented, so some + * versions of 11.5 do not have this change. + * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase + * minor version not incremented) + * 11.6: Added 'flags', replacing 1 padding byte + * 11.10: Arrays of this structure are supported + */ +struct base_jit_alloc_info_11_5 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; +}; + +/** + * struct base_jit_alloc_info - Structure which describes a JIT allocation + * request. + * @gpu_alloc_addr: The GPU virtual address to write the JIT + * allocated GPU virtual address to. + * @va_pages: The minimum number of virtual pages required. + * @commit_pages: The minimum number of physical pages which + * should back the allocation. + * @extension: Granularity of physical pages to grow the + * allocation by during a fault. + * @id: Unique ID provided by the caller, this is used + * to pair allocation and free requests. + * Zero is not a valid value. + * @bin_id: The JIT allocation bin, used in conjunction with + * @max_allocations to limit the number of each + * type of JIT allocation. + * @max_allocations: The maximum number of allocations allowed within + * the bin specified by @bin_id. Should be the same + * for all allocations within the same bin. 
+ * @flags: flags specifying the special requirements for + * the JIT allocation, see + * %BASE_JIT_ALLOC_VALID_FLAGS + * @padding: Expansion space - should be initialised to zero + * @usage_id: A hint about which allocation should be reused. + * The kernel should attempt to use a previous + * allocation with the same usage_id + * @heap_info_gpu_addr: Pointer to an object in GPU memory describing + * the actual usage of the region. + * + * jit_version is 3. + * + * When modifications are made to this structure, it is still compatible with + * jit_version 3 when: a) the size is unchanged, and b) new members only + * replace the padding bytes. + * + * Previous jit_version history: + * jit_version == 1, refer to &base_jit_alloc_info_10_2 + * jit_version == 2, refer to &base_jit_alloc_info_11_5 + * + * Kbase version history: + * 11.20: added @heap_info_gpu_addr + */ +struct base_jit_alloc_info { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; + __u64 heap_info_gpu_addr; +}; + +enum base_external_resource_access { + BASE_EXT_RES_ACCESS_SHARED, + BASE_EXT_RES_ACCESS_EXCLUSIVE +}; + +struct base_external_resource { + __u64 ext_resource; +}; + +/** + * BASE_EXT_RES_COUNT_MAX - The maximum number of external resources + * which can be mapped/unmapped in a single request. + */ +#define BASE_EXT_RES_COUNT_MAX 10 + +/** + * struct base_external_resource_list - Structure which describes a list of + * external resources. + * @count: The number of resources. + * @ext_res: Array of external resources which is + * sized at allocation time. + */ +struct base_external_resource_list { + __u64 count; + struct base_external_resource ext_res[1]; +}; + +struct base_jd_debug_copy_buffer { + __u64 address; + __u64 size; + struct base_external_resource extres; +}; + +#define GPU_MAX_JOB_SLOTS 16 + +/** + * DOC: User-side Base GPU Property Queries + * + * The User-side Base GPU Property Query interface encapsulates two + * sub-modules: + * + * - "Dynamic GPU Properties" + * - "Base Platform Config GPU Properties" + * + * Base only deals with properties that vary between different GPU + * implementations - the Dynamic GPU properties and the Platform Config + * properties. + * + * For properties that are constant for the GPU Architecture, refer to the + * GPU module. However, we will discuss their relevance here just to + * provide background information. + * + * About the GPU Properties in Base and GPU modules + * + * The compile-time properties (Platform Config, GPU Compile-time + * properties) are exposed as pre-processor macros. + * + * Complementing the compile-time properties are the Dynamic GPU + * Properties, which act as a conduit for the GPU Configuration + * Discovery. + * + * In general, the dynamic properties are present to verify that the platform + * has been configured correctly with the right set of Platform Config + * Compile-time Properties. + * + * As a consistent guide across the entire DDK, the choice for dynamic or + * compile-time should consider the following, in order: + * 1. Can the code be written so that it doesn't need to know the + * implementation limits at all? + * 2. If you need the limits, get the information from the Dynamic Property + * lookup. This should be done once as you fetch the context, and then cached + * as part of the context data structure, so it's cheap to access. + * 3. 
If there's a clear and arguable inefficiency in using Dynamic Properties, + * then use a Compile-Time Property (Platform Config, or GPU Compile-time + * property). Examples of where this might be sensible follow: + * - Part of a critical inner-loop + * - Frequent re-use throughout the driver, causing significant extra load + * instructions or control flow that would be worthwhile optimizing out. + * + * We cannot provide an exhaustive set of examples, neither can we provide a + * rule for every possible situation. Use common sense, and think about: what + * the rest of the driver will be doing; how the compiler might represent the + * value if it is a compile-time constant; whether an OEM shipping multiple + * devices would benefit much more from a single DDK binary, instead of + * insignificant micro-optimizations. + * + * Dynamic GPU Properties + * + * Dynamic GPU properties are presented in two sets: + * 1. the commonly used properties in @ref base_gpu_props, which have been + * unpacked from GPU register bitfields. + * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props + * (also a member of base_gpu_props). All of these are presented in + * the packed form, as presented by the GPU registers themselves. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + * The properties returned extend the GPU Configuration Discovery + * registers. For example, GPU clock speed is not specified in the GPU + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. + * + * The GPU properties are obtained by a call to + * base_get_gpu_props(). This simply returns a pointer to a const + * base_gpu_props structure. It is constant for the life of a base + * context. Multiple calls to base_get_gpu_props() to a base context + * return the same pointer to a constant structure. This avoids cache pollution + * of the common data. + * + * This pointer must not be freed, because it does not point to the start of a + * region allocated by the memory allocator; instead, just close the @ref + * base_context. + * + * + * Kernel Operation + * + * During Base Context Create time, user-side makes a single kernel call: + * - A call to fill user memory with GPU information structures + * + * The kernel-side will fill the provided the entire processed base_gpu_props + * structure, because this information is required in both + * user and kernel side; it does not make sense to decode it twice. + * + * Coherency groups must be derived from the bitmasks, but this can be done + * kernel side, and just once at kernel startup: Coherency groups must already + * be known kernel-side, to support chains that specify a 'Only Coherent Group' + * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. + * + * Coherency Group calculation + * + * Creation of the coherent group data is done at device-driver startup, and so + * is one-time. This will most likely involve a loop with CLZ, shifting, and + * bit clearing on the L2_PRESENT mask, depending on whether the + * system is L2 Coherent. The number of shader cores is done by a + * population count, since faulty cores may be disabled during production, + * producing a non-contiguous mask. 
+ * + * The memory requirements for this algorithm can be determined either by a __u64 + * population count on the L2_PRESENT mask (a LUT helper already is + * required for the above), or simple assumption that there can be no more than + * 16 coherent groups, since core groups are typically 4 cores. + */ + +/* + * More information is possible - but associativity and bus width are not + * required by upper-level apis. + */ +struct mali_base_gpu_l2_cache_props { + __u8 log2_line_size; + __u8 log2_cache_size; + __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + __u8 padding[5]; +}; + +struct mali_base_gpu_tiler_props { + __u32 bin_size_bytes; /* Max is 4*2^15 */ + __u32 max_active_levels; /* Max is 2^15 */ +}; + +/** + * struct mali_base_gpu_thread_props - GPU threading system details. + * @max_threads: Max. number of threads per core + * @max_workgroup_size: Max. number of threads per workgroup + * @max_barrier_size: Max. number of threads that can synchronize on a + * simple barrier + * @max_registers: Total size [1..65535] of the register file available + * per core. + * @max_task_queue: Max. tasks [1..255] which may be sent to a core + * before it becomes blocked. + * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split + * field. + * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, + * 3 = SW Model/Emulation + * @padding: padding to align to 8-byte + * @tls_alloc: Number of threads per core that TLS must be + * allocated for + */ +struct mali_base_gpu_thread_props { + __u32 max_threads; + __u32 max_workgroup_size; + __u32 max_barrier_size; + __u16 max_registers; + __u8 max_task_queue; + __u8 max_thread_group_split; + __u8 impl_tech; + __u8 padding[3]; + __u32 tls_alloc; +}; + +/** + * struct mali_base_gpu_coherent_group - descriptor for a coherent group + * @core_mask: Core restriction mask required for the group + * @num_cores: Number of cores in the group + * @padding: padding to align to 8-byte + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_base_gpu_coherent_group { + __u64 core_mask; + __u16 num_cores; + __u16 padding[3]; +}; + +/** + * struct mali_base_gpu_coherent_group_info - Coherency group information + * @num_groups: Number of coherent groups in the GPU. + * @num_core_groups: Number of core groups (coherent or not) in the GPU. + * Equivalent to the number of L2 Caches. + * The GPU Counter dumping writes 2048 bytes per core group, + * regardless of whether the core groups are coherent or not. + * Hence this member is needed to calculate how much memory + * is required for dumping. + * @note Do not use it to work out how many valid elements + * are in the group[] member. Use num_groups instead. + * @coherency: Coherency features of the memory, accessed by gpu_mem_features + * methods + * @padding: padding to align to 8-byte + * @group: Descriptors of coherent groups + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. 
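As a concrete illustration of the population-count remark above (not part of the patch; __builtin_popcountll is a GCC/Clang builtin and the helper name is invented), user space can cross-check a coherent group descriptor against its core mask:

/* num_cores is documented as a cached popcount of core_mask; the mask can be
 * non-contiguous because faulty cores may be fused off in production. */
static inline int
coherent_group_is_consistent(const struct mali_base_gpu_coherent_group *g)
{
        return __builtin_popcountll(g->core_mask) == g->num_cores;
}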
+ */ +struct mali_base_gpu_coherent_group_info { + __u32 num_groups; + __u32 num_core_groups; + __u32 coherency; + __u32 padding; + struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; +}; + +#if MALI_USE_CSF +#include "csf/mali_base_csf_kernel.h" +#else +#include "jm/mali_base_jm_kernel.h" +#endif + +/** + * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware + * Configuration Discovery registers. + * @shader_present: Shader core present bitmap + * @tiler_present: Tiler core present bitmap + * @l2_present: Level 2 cache present bitmap + * @stack_present: Core stack present bitmap + * @l2_features: L2 features + * @core_features: Core features + * @mem_features: Mem features + * @mmu_features: Mmu features + * @as_present: Bitmap of address spaces present + * @js_present: Job slots present + * @js_features: Array of job slot features. + * @tiler_features: Tiler features + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU + * @gpu_id: GPU and revision identifier + * @thread_max_threads: Maximum number of threads per core + * @thread_max_workgroup_size: Maximum number of threads per workgroup + * @thread_max_barrier_size: Maximum number of threads per barrier + * @thread_features: Thread features + * @coherency_mode: Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register + * @thread_tls_alloc: Number of threads per core that TLS must be allocated for + * @gpu_features: GPU features + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct gpu_raw_gpu_props { + __u64 shader_present; + __u64 tiler_present; + __u64 l2_present; + __u64 stack_present; + __u32 l2_features; + __u32 core_features; + __u32 mem_features; + __u32 mmu_features; + + __u32 as_present; + + __u32 js_present; + __u32 js_features[GPU_MAX_JOB_SLOTS]; + __u32 tiler_features; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + __u32 gpu_id; + + __u32 thread_max_threads; + __u32 thread_max_workgroup_size; + __u32 thread_max_barrier_size; + __u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + __u32 coherency_mode; + + __u32 thread_tls_alloc; + __u64 gpu_features; +}; + +/** + * struct base_gpu_props - Return structure for base_get_gpu_props(). + * @core_props: Core props. + * @l2_props: L2 props. + * @unused_1: Keep for backwards compatibility. + * @tiler_props: Tiler props. + * @thread_props: Thread props. + * @raw_props: This member is large, likely to be 128 bytes. + * @coherency_info: This must be last member of the structure. + * + * NOTE: the raw_props member in this data structure contains the register + * values from which the value of the other members are derived. The derived + * members exist to allow for efficient access and/or shielding the details + * of the layout of the registers. 
+ */ +struct base_gpu_props { + struct mali_base_gpu_core_props core_props; + struct mali_base_gpu_l2_cache_props l2_props; + __u64 unused_1; + struct mali_base_gpu_tiler_props tiler_props; + struct mali_base_gpu_thread_props thread_props; + struct gpu_raw_gpu_props raw_props; + struct mali_base_gpu_coherent_group_info coherency_info; +}; + +#define BASE_MEM_GROUP_ID_GET(flags) \ + ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) + +#define BASE_MEM_GROUP_ID_SET(id) \ + (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? \ + BASE_MEM_GROUP_DEFAULT : \ + id) \ + << BASEP_MEM_GROUP_ID_SHIFT) & \ + BASE_MEM_GROUP_ID_MASK) + +#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ + (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ + ((base_context_create_flags)(group_id) \ + << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) + +#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ + ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ + BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* + * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These + * flags are also used, where applicable, for specifying which fields + * are valid following the request operation. + */ + +/* For monotonic (counter) timefield */ +#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) +/* For system wide timestamp */ +#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) +/* For GPU cycle counter */ +#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) +/* Specify kernel GPU register timestamp */ +#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) +/* Specify userspace cntvct_el0 timestamp source */ +#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) + +#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ + BASE_TIMEINFO_MONOTONIC_FLAG | \ + BASE_TIMEINFO_TIMESTAMP_FLAG | \ + BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ + BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ + BASE_TIMEINFO_USER_SOURCE_FLAG) + +/* Maximum number of source allocations allowed to create an alias allocation. + * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array + * layers, since each cube map in the array will have 6 faces. + */ +#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) + +#endif /* _UAPI_BASE_KERNEL_H_ */ diff --git a/src/panfrost/base/include/mali_kbase_gpuprops.h b/src/panfrost/base/include/mali_kbase_gpuprops.h new file mode 100644 index 00000000000..b250feca022 --- /dev/null +++ b/src/panfrost/base/include/mali_kbase_gpuprops.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
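The context-creation flag macros above pair with KBASE_IOCTL_SET_FLAGS, which is declared further down in mali_kbase_ioctl.h; a hedged sketch of how they combine, not part of the patch. fd is assumed to be an already-open kbase device and the group ID is an arbitrary example.

#include <sys/ioctl.h>

/* Build context-creation flags with an MMU memory group packed in, reject
 * anything outside the documented kernel-passable set, then apply them. */
static int kbase_set_context_flags(int fd, unsigned int mmu_group_id)
{
        base_context_create_flags cflags =
                BASE_CONTEXT_CREATE_FLAG_NONE |
                BASE_CONTEXT_MMU_GROUP_ID_SET(mmu_group_id);

        if (cflags & ~BASEP_CONTEXT_CREATE_KERNEL_FLAGS)
                return -1;

        struct kbase_ioctl_set_flags args = { .create_flags = cflags };
        return ioctl(fd, KBASE_IOCTL_SET_FLAGS, &args);
}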
+ * + */ + +#ifndef _UAPI_KBASE_GPUPROP_H_ +#define _UAPI_KBASE_GPUPROP_H_ + +/********************************** + * Definitions for GPU properties * + **********************************/ +#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) +#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) +#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) +#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) + +#define KBASE_GPUPROP_PRODUCT_ID 1 +#define KBASE_GPUPROP_VERSION_STATUS 2 +#define KBASE_GPUPROP_MINOR_REVISION 3 +#define KBASE_GPUPROP_MAJOR_REVISION 4 +/* 5 previously used for GPU speed */ +#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 +/* 7 previously used for minimum GPU speed */ +#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 +#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 +#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 +#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 +#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 + +#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 +#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 +#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 + +#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 +#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 + +#define KBASE_GPUPROP_MAX_THREADS 18 +#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 +#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 +#define KBASE_GPUPROP_MAX_REGISTERS 21 +#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 +#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 +#define KBASE_GPUPROP_IMPL_TECH 24 + +#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 +#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 +#define KBASE_GPUPROP_RAW_L2_PRESENT 27 +#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 +#define KBASE_GPUPROP_RAW_L2_FEATURES 29 +#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 +#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 +#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 +#define KBASE_GPUPROP_RAW_AS_PRESENT 33 +#define KBASE_GPUPROP_RAW_JS_PRESENT 34 +#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 +#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 +#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 +#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 +#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 +#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 +#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 +#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 +#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 +#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 +#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 +#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 +#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 +#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 +#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 +#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 +#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 +#define KBASE_GPUPROP_RAW_GPU_ID 55 +#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 +#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 +#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 +#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 +#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 + +#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 +#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 +#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 +#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 +#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 +#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 +#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 +#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 +#define KBASE_GPUPROP_COHERENCY_GROUP_5 69 +#define KBASE_GPUPROP_COHERENCY_GROUP_6 70 +#define KBASE_GPUPROP_COHERENCY_GROUP_7 71 +#define 
KBASE_GPUPROP_COHERENCY_GROUP_8 72 +#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 +#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 +#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 +#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 +#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 +#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 +#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 + +#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 + +#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 + +#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 +#define KBASE_GPUPROP_TLS_ALLOC 84 +#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 + +#endif diff --git a/src/panfrost/base/include/mali_kbase_ioctl.h b/src/panfrost/base/include/mali_kbase_ioctl.h new file mode 100644 index 00000000000..96f606af5f8 --- /dev/null +++ b/src/panfrost/base/include/mali_kbase_ioctl.h @@ -0,0 +1,759 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_IOCTL_H_ +#define _UAPI_KBASE_IOCTL_H_ + +#ifdef __cpluscplus +extern "C" { +#endif + +#include +#include + +#if MALI_USE_CSF +#include "csf/mali_kbase_csf_ioctl.h" +#else +#include "jm/mali_kbase_jm_ioctl.h" +#endif /* MALI_USE_CSF */ + +#define KBASE_IOCTL_TYPE 0x80 + +/** + * struct kbase_ioctl_set_flags - Set kernel context creation flags + * + * @create_flags: Flags - see base_context_create_flags + */ +struct kbase_ioctl_set_flags { + __u32 create_flags; +}; + +#define KBASE_IOCTL_SET_FLAGS \ + _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) + +/** + * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel + * + * @buffer: Pointer to the buffer to store properties into + * @size: Size of the buffer + * @flags: Flags - must be zero for now + * + * The ioctl will return the number of bytes stored into @buffer or an error + * on failure (e.g. @size is too small). If @size is specified as 0 then no + * data will be written but the return value will be the number of bytes needed + * for all the properties. + * + * @flags may be used in the future to request a different format for the + * buffer. With @flags == 0 the following format is used. + * + * The buffer will be filled with pairs of values, a __u32 key identifying the + * property followed by the value. The size of the value is identified using + * the bottom bits of the key. The value then immediately followed the key and + * is tightly packed (there is no padding). All keys and values are + * little-endian. 
+ * + * 00 = __u8 + * 01 = __u16 + * 10 = __u32 + * 11 = __u64 + */ +struct kbase_ioctl_get_gpuprops { + __u64 buffer; + __u32 size; + __u32 flags; +}; + +#define KBASE_IOCTL_GET_GPUPROPS \ + _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) + +/** + * union kbase_ioctl_mem_alloc - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @out: Output parameters + * @out.flags: Flags + * @out.gpu_va: The GPU virtual address which is allocated + */ +union kbase_ioctl_mem_alloc { + struct { + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u64 flags; + } in; + struct { + __u64 flags; + __u64 gpu_va; + } out; +}; + +#define KBASE_IOCTL_MEM_ALLOC \ + _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) + +/** + * struct kbase_ioctl_mem_query - Query properties of a GPU memory region + * @in: Input parameters + * @in.gpu_addr: A GPU address contained within the region + * @in.query: The type of query + * @out: Output parameters + * @out.value: The result of the query + * + * Use a %KBASE_MEM_QUERY_xxx flag as input for @query. + */ +union kbase_ioctl_mem_query { + struct { + __u64 gpu_addr; + __u64 query; + } in; + struct { + __u64 value; + } out; +}; + +#define KBASE_IOCTL_MEM_QUERY \ + _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) + +#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) +#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) +#define KBASE_MEM_QUERY_FLAGS ((__u64)3) + +/** + * struct kbase_ioctl_mem_free - Free a memory region + * @gpu_addr: Handle to the region to free + */ +struct kbase_ioctl_mem_free { + __u64 gpu_addr; +}; + +#define KBASE_IOCTL_MEM_FREE \ + _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @buffer_count: requested number of dumping buffers + * @fe_bm: counters selection bitmask (Front end) + * @shader_bm: counters selection bitmask (Shader) + * @tiler_bm: counters selection bitmask (Tiler) + * @mmu_l2_bm: counters selection bitmask (MMU_L2) + * + * A fd is returned from the ioctl if successful, or a negative value on error + */ +struct kbase_ioctl_hwcnt_reader_setup { + __u32 buffer_count; + __u32 fe_bm; + __u32 shader_bm; + __u32 tiler_bm; + __u32 mmu_l2_bm; +}; + +#define KBASE_IOCTL_HWCNT_READER_SETUP \ + _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) + +/** + * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. + * @data: Counter samples for the dummy model. + * @size: Size of the counter sample data. + * @padding: Padding. 
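The buffer layout documented above lends itself to a small decoder: ask for the required size with @size == 0, fetch the buffer, then walk the key/value pairs, using the two low bits of each key as the value width. A hedged sketch, not part of the patch; it assumes a little-endian host, that the property ID occupies the key bits above the two size bits, and that fd is an already-open kbase device. The callback is hypothetical.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

/* Query the buffer size, fetch the properties, then walk the packed pairs. */
static int kbase_dump_gpuprops(int fd, void (*cb)(__u32 prop, __u64 value))
{
        struct kbase_ioctl_get_gpuprops req = { .buffer = 0, .size = 0, .flags = 0 };
        int total = ioctl(fd, KBASE_IOCTL_GET_GPUPROPS, &req);
        if (total <= 0)
                return -1;

        __u8 *buf = malloc(total);
        if (!buf)
                return -1;

        req.buffer = (__u64)(uintptr_t)buf;
        req.size = total;
        if (ioctl(fd, KBASE_IOCTL_GET_GPUPROPS, &req) < 0) {
                free(buf);
                return -1;
        }

        for (int pos = 0; pos + (int)sizeof(__u32) <= total;) {
                __u32 key;
                __u64 value = 0;

                memcpy(&key, buf + pos, sizeof(key));
                pos += sizeof(key);

                int width = 1 << (key & 3);        /* 1, 2, 4 or 8 bytes */
                memcpy(&value, buf + pos, width);  /* little-endian host assumed */
                pos += width;

                cb(key >> 2, value);               /* assumed property-ID position */
        }

        free(buf);
        return 0;
}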
+ */ +struct kbase_ioctl_hwcnt_values { + __u64 data; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_HWCNT_SET \ + _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) + +/** + * struct kbase_ioctl_disjoint_query - Query the disjoint counter + * @counter: A counter of disjoint events in the kernel + */ +struct kbase_ioctl_disjoint_query { + __u32 counter; +}; + +#define KBASE_IOCTL_DISJOINT_QUERY \ + _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) + +/** + * struct kbase_ioctl_get_ddk_version - Query the kernel version + * @version_buffer: Buffer to receive the kernel version string + * @size: Size of the buffer + * @padding: Padding + * + * The ioctl will return the number of bytes written into version_buffer + * (which includes a NULL byte) or a negative error code + * + * The ioctl request code has to be _IOW because the data in ioctl struct is + * being copied to the kernel, even though the kernel then writes out the + * version info to the buffer specified in the ioctl. + */ +struct kbase_ioctl_get_ddk_version { + __u64 version_buffer; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_GET_DDK_VERSION \ + _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) + +/** + * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 10.2--11.4) + * @va_pages: Number of VA pages to reserve for JIT + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_10_2 { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) + +/** + * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 11.5--11.19) + * @va_pages: Number of VA pages to reserve for JIT + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_11_5 { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) + +/** + * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory + * allocator + * @va_pages: Number of GPU virtual address pages to reserve for just-in-time + * memory allocations + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * @phys_pages: Maximum number of physical pages to allocate just-in-time + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. 
+ */ +struct kbase_ioctl_mem_jit_init { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; + __u64 phys_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) + +/** + * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory + * + * @handle: GPU memory handle (GPU VA) + * @user_addr: The address where it is mapped in user space + * @size: The number of bytes to synchronise + * @type: The direction to synchronise: 0 is sync to memory (clean), + * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_mem_sync { + __u64 handle; + __u64 user_addr; + __u64 size; + __u8 type; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_MEM_SYNC \ + _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) + +/** + * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer + * + * @in: Input parameters + * @in.gpu_addr: The GPU address of the memory region + * @in.cpu_addr: The CPU address to locate + * @in.size: A size in bytes to validate is contained within the region + * @out: Output parameters + * @out.offset: The offset from the start of the memory region to @cpu_addr + */ +union kbase_ioctl_mem_find_cpu_offset { + struct { + __u64 gpu_addr; + __u64 cpu_addr; + __u64 size; + } in; + struct { + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) + +/** + * struct kbase_ioctl_get_context_id - Get the kernel context ID + * + * @id: The kernel context ID + */ +struct kbase_ioctl_get_context_id { + __u32 id; +}; + +#define KBASE_IOCTL_GET_CONTEXT_ID \ + _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) + +/** + * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd + * + * @flags: Flags + * + * The ioctl returns a file descriptor when successful + */ +struct kbase_ioctl_tlstream_acquire { + __u32 flags; +}; + +#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ + _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) + +#define KBASE_IOCTL_TLSTREAM_FLUSH \ + _IO(KBASE_IOCTL_TYPE, 19) + +/** + * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region + * + * @gpu_addr: The memory region to modify + * @pages: The number of physical pages that should be present + * + * The ioctl may return on the following error codes or 0 for success: + * -ENOMEM: Out of memory + * -EINVAL: Invalid arguments + */ +struct kbase_ioctl_mem_commit { + __u64 gpu_addr; + __u64 pages; +}; + +#define KBASE_IOCTL_MEM_COMMIT \ + _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) + +/** + * union kbase_ioctl_mem_alias - Create an alias of memory regions + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.stride: Bytes between start of each memory region + * @in.nents: The number of regions to pack together into the alias + * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_alias { + struct { + __u64 flags; + __u64 stride; + __u64 nents; + __u64 aliasing_info; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_ALIAS \ + _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) + +/** + * union 
kbase_ioctl_mem_import - Import memory for use by the GPU + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.phandle: Handle to the external memory + * @in.type: Type of external memory, see base_mem_import_type + * @in.padding: Amount of extra VA pages to append to the imported buffer + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_import { + struct { + __u64 flags; + __u64 phandle; + __u32 type; + __u32 padding; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_IMPORT \ + _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) + +/** + * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region + * @gpu_va: The GPU region to modify + * @flags: The new flags to set + * @mask: Mask of the flags to modify + */ +struct kbase_ioctl_mem_flags_change { + __u64 gpu_va; + __u64 flags; + __u64 mask; +}; + +#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ + _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) + +/** + * struct kbase_ioctl_stream_create - Create a synchronisation stream + * @name: A name to identify this stream. Must be NULL-terminated. + * + * Note that this is also called a "timeline", but is named stream to avoid + * confusion with other uses of the word. + * + * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. + * + * The ioctl returns a file descriptor. + */ +struct kbase_ioctl_stream_create { + char name[32]; +}; + +#define KBASE_IOCTL_STREAM_CREATE \ + _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) + +/** + * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence + * @fd: The file descriptor to validate + */ +struct kbase_ioctl_fence_validate { + int fd; +}; + +#define KBASE_IOCTL_FENCE_VALIDATE \ + _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) + +/** + * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel + * @buffer: Pointer to the information + * @len: Length + * @padding: Padding + * + * The data provided is accessible through a debugfs file + */ +struct kbase_ioctl_mem_profile_add { + __u64 buffer; + __u32 len; + __u32 padding; +}; + +#define KBASE_IOCTL_MEM_PROFILE_ADD \ + _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) + +/** + * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to map + */ +struct kbase_ioctl_sticky_resource_map { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ + _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) + +/** + * struct kbase_ioctl_sticky_resource_unmap - Unmap a resource mapped which was + * previously permanently mapped + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to unmap + */ +struct kbase_ioctl_sticky_resource_unmap { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ + _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) + +/** + * union kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of + * the GPU memory region for + * the given gpu address and + * the offset of that address + * into the region + * @in: Input parameters + * @in.gpu_addr: GPU virtual address + * @in.size: Size in bytes within the 
region + * @out: Output parameters + * @out.start: Address of the beginning of the memory region enclosing @gpu_addr + * for the length of @offset bytes + * @out.offset: The offset from the start of the memory region to @gpu_addr + */ +union kbase_ioctl_mem_find_gpu_start_and_offset { + struct { + __u64 gpu_addr; + __u64 size; + } in; + struct { + __u64 start; + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) + +#define KBASE_IOCTL_CINSTR_GWT_START \ + _IO(KBASE_IOCTL_TYPE, 33) + +#define KBASE_IOCTL_CINSTR_GWT_STOP \ + _IO(KBASE_IOCTL_TYPE, 34) + +/** + * union kbase_ioctl_cinstr_gwt_dump - Used to collect all GPU write fault + * addresses. + * @in: Input parameters + * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. + * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) + * @in.len: Number of addresses the buffers can hold. + * @in.padding: padding + * @out: Output parameters + * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. + * @out.more_data_available: Status indicating if more addresses are available. + * @out.padding: padding + * + * This structure is used when performing a call to dump GPU write fault + * addresses. + */ +union kbase_ioctl_cinstr_gwt_dump { + struct { + __u64 addr_buffer; + __u64 size_buffer; + __u32 len; + __u32 padding; + + } in; + struct { + __u32 no_of_addr_collected; + __u8 more_data_available; + __u8 padding[27]; + } out; +}; + +#define KBASE_IOCTL_CINSTR_GWT_DUMP \ + _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) + +/** + * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone + * + * @va_pages: Number of VA pages to reserve for EXEC_VA + */ +struct kbase_ioctl_mem_exec_init { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_EXEC_INIT \ + _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) + +/** + * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of + * cpu/gpu time (counter values) + * @in: Input parameters + * @in.request_flags: Bit-flags indicating the requested types. + * @in.paddings: Unused, size alignment matching the out. + * @out: Output parameters + * @out.sec: Integer field of the monotonic time, unit in seconds. + * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. + * @out.padding: Unused, for __u64 alignment + * @out.timestamp: System wide timestamp (counter) value. + * @out.cycle_counter: GPU cycle counter value. + */ +union kbase_ioctl_get_cpu_gpu_timeinfo { + struct { + __u32 request_flags; + __u32 paddings[7]; + } in; + struct { + __u64 sec; + __u32 nsec; + __u32 padding; + __u64 timestamp; + __u64 cycle_counter; + } out; +}; + +#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ + _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) + +/** + * struct kbase_ioctl_context_priority_check - Check the max possible priority + * @priority: Input priority & output priority + */ + +struct kbase_ioctl_context_priority_check { + __u8 priority; +}; + +#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) + +/** + * struct kbase_ioctl_set_limited_core_count - Set the limited core count. 
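Looping back to the time-info request above: a hedged usage sketch, not part of the patch. fd is an already-open kbase device, error handling is trimmed, and the BASE_TIMEINFO_* request flags come from mali_base_kernel.h earlier in this patch; as documented, only the output fields matching the requested flags are meaningful.

#include <sys/ioctl.h>

/* Request the monotonic time, system timestamp and GPU cycle counter at once. */
static int kbase_query_time(int fd, __u64 *timestamp, __u64 *cycles)
{
        union kbase_ioctl_get_cpu_gpu_timeinfo info = {
                .in.request_flags = BASE_TIMEINFO_MONOTONIC_FLAG |
                                    BASE_TIMEINFO_TIMESTAMP_FLAG |
                                    BASE_TIMEINFO_CYCLE_COUNTER_FLAG,
        };

        if (ioctl(fd, KBASE_IOCTL_GET_CPU_GPU_TIMEINFO, &info) < 0)
                return -1;

        *timestamp = info.out.timestamp;
        *cycles    = info.out.cycle_counter;
        return 0;
}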
+ * + * @max_core_count: Maximum core count + */ +struct kbase_ioctl_set_limited_core_count { + __u8 max_core_count; +}; + +#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ + _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) + +/** + * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter + * information + * @info_item_size: Performance counter item size in bytes. + * @info_item_count: Performance counter item count in the info_list_ptr. + * @info_list_ptr: Performance counter item list pointer which points to a + * list with info_item_count of items. + * + * On success: returns info_item_size and info_item_count if info_list_ptr is + * NULL, returns performance counter information if info_list_ptr is not NULL. + * On error: returns a negative error code. + */ +struct kbase_ioctl_kinstr_prfcnt_enum_info { + __u32 info_item_size; + __u32 info_item_count; + __u64 info_list_ptr; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ + _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) + +/** + * struct kbase_ioctl_kinstr_prfcnt_setup - Setup HWC dumper/reader + * @in: input parameters. + * @in.request_item_count: Number of requests in the requests array. + * @in.request_item_size: Size in bytes of each request in the requests array. + * @in.requests_ptr: Pointer to the requests array. + * @out: output parameters. + * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for + * each sample. + * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap + * for reading performance counter samples. + * + * A fd is returned from the ioctl if successful, or a negative value on error. + */ +union kbase_ioctl_kinstr_prfcnt_setup { + struct { + __u32 request_item_count; + __u32 request_item_size; + __u64 requests_ptr; + } in; + struct { + __u32 prfcnt_metadata_item_size; + __u32 prfcnt_mmap_size_bytes; + } out; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ + _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) + + +/** + * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes + * @bytes_collected: number of bytes read by user + * @bytes_generated: number of bytes generated by tracepoints + */ +struct kbase_ioctl_tlstream_stats { + __u32 bytes_collected; + __u32 bytes_generated; +}; + +#define KBASE_IOCTL_TLSTREAM_STATS \ + _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) + +#endif /* MALI_UNIT_TEST */ + +/* Customer extension range */ +#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) + +/* If the integration needs extra ioctl add them there + * like this: + * + * struct my_ioctl_args { + * .... 
+ * } + * + * #define KBASE_IOCTL_MY_IOCTL \ + * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) + */ + +#ifdef __cpluscplus +} +#endif + +#endif /* _UAPI_KBASE_IOCTL_H_ */ diff --git a/src/panfrost/base/include/old/mali-ioctl-midgard.h b/src/panfrost/base/include/old/mali-ioctl-midgard.h new file mode 100644 index 00000000000..5f33f5c4c4b --- /dev/null +++ b/src/panfrost/base/include/old/mali-ioctl-midgard.h @@ -0,0 +1,80 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef __KBASE_IOCTL_MIDGARD_H__ +#define __KBASE_IOCTL_MIDGARD_H__ + +#define KBASE_IOCTL_TYPE_BASE 0x80 +#define KBASE_IOCTL_TYPE_MAX 0x82 + +union kbase_ioctl_mem_alloc { + struct { + union kbase_ioctl_header header; + u64 va_pages; + u64 commit_pages; + u64 extension; + u64 flags; + } in; + struct { + union kbase_ioctl_header header; + u64 pad[3]; + u64 flags; + mali_ptr gpu_va; + u16 va_alignment; + } out; + u64 pad[7]; +} __attribute__((packed)); + +#define KBASE_IOCTL_TYPE_COUNT (KBASE_IOCTL_TYPE_MAX - KBASE_IOCTL_TYPE_BASE + 1) + +#define KBASE_IOCTL_GET_VERSION (_IOWR(0x80, 0, struct kbase_ioctl_get_version)) +#define KBASE_IOCTL_MEM_ALLOC (_IOWR(0x82, 0, union kbase_ioctl_mem_alloc)) +#define KBASE_IOCTL_MEM_IMPORT (_IOWR(0x82, 1, union kbase_ioctl_mem_import)) +#define KBASE_IOCTL_MEM_COMMIT (_IOWR(0x82, 2, struct kbase_ioctl_mem_commit)) +#define KBASE_IOCTL_MEM_QUERY (_IOWR(0x82, 3, struct kbase_ioctl_mem_query)) +#define KBASE_IOCTL_MEM_FREE (_IOWR(0x82, 4, struct kbase_ioctl_mem_free)) +#define KBASE_IOCTL_MEM_FLAGS_CHANGE (_IOWR(0x82, 5, struct kbase_ioctl_mem_flags_change)) +#define KBASE_IOCTL_MEM_ALIAS (_IOWR(0x82, 6, struct kbase_ioctl_mem_alias)) +#define KBASE_IOCTL_MEM_SYNC (_IOWR(0x82, 8, struct kbase_ioctl_mem_sync)) +#define KBASE_IOCTL_POST_TERM (_IOWR(0x82, 9, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_SETUP (_IOWR(0x82, 10, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_DUMP (_IOWR(0x82, 11, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_CLEAR (_IOWR(0x82, 12, __ioctl_placeholder)) +#define KBASE_IOCTL_GPU_PROPS_REG_DUMP (_IOWR(0x82, 14, struct kbase_ioctl_gpu_props_reg_dump)) +#define KBASE_IOCTL_FIND_CPU_OFFSET (_IOWR(0x82, 15, __ioctl_placeholder)) +#define KBASE_IOCTL_GET_VERSION_NEW (_IOWR(0x82, 16, struct kbase_ioctl_get_version)) +#define KBASE_IOCTL_SET_FLAGS (_IOWR(0x82, 18, struct kbase_ioctl_set_flags)) +#define KBASE_IOCTL_SET_TEST_DATA (_IOWR(0x82, 19, __ioctl_placeholder)) +#define KBASE_IOCTL_INJECT_ERROR (_IOWR(0x82, 20, __ioctl_placeholder)) +#define KBASE_IOCTL_MODEL_CONTROL (_IOWR(0x82, 21, __ioctl_placeholder)) +#define KBASE_IOCTL_KEEP_GPU_POWERED (_IOWR(0x82, 22, __ioctl_placeholder)) +#define KBASE_IOCTL_FENCE_VALIDATE (_IOWR(0x82, 23, __ioctl_placeholder)) +#define KBASE_IOCTL_STREAM_CREATE (_IOWR(0x82, 24, struct kbase_ioctl_stream_create)) +#define KBASE_IOCTL_GET_PROFILING_CONTROLS (_IOWR(0x82, 25, __ioctl_placeholder)) +#define KBASE_IOCTL_SET_PROFILING_CONTROLS (_IOWR(0x82, 26, __ioctl_placeholder)) +#define KBASE_IOCTL_DEBUGFS_MEM_PROFILE_ADD (_IOWR(0x82, 27, __ioctl_placeholder)) 
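For reference, a hedged sketch of driving one of the legacy Midgard ioctls listed here from userspace. Unlike the modern interface, every legacy call carries a union kbase_ioctl_header and reports its result in-band through header.rc; depending on the kernel version the caller may also have to seed header.id, which this sketch does not attempt. The file descriptor is assumed to be an already-open Mali device node.

#include <string.h>
#include <sys/ioctl.h>
#include "mali-ioctl.h"
#include "mali-ioctl-midgard.h"

static int
legacy_get_version(int fd, u16 *major, u16 *minor)
{
        struct kbase_ioctl_get_version ver;
        memset(&ver, 0, sizeof(ver));

        if (ioctl(fd, KBASE_IOCTL_GET_VERSION, &ver) < 0)
                return -1;

        /* The legacy UK interface reports failure through header.rc. */
        if (ver.header.rc != MALI_ERROR_NONE)
                return -1;

        *major = ver.major;
        *minor = ver.minor;
        return 0;
}

Note that mali-ioctl-midgard.h only defines the ioctl numbers; the structures and the header/type definitions come from mali-ioctl.h, which therefore has to be included first.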
+#define KBASE_IOCTL_JOB_SUBMIT (_IOWR(0x82, 28, struct kbase_ioctl_job_submit)) +#define KBASE_IOCTL_DISJOINT_QUERY (_IOWR(0x82, 29, __ioctl_placeholder)) +#define KBASE_IOCTL_GET_CONTEXT_ID (_IOWR(0x82, 31, struct kbase_ioctl_get_context_id)) +#define KBASE_IOCTL_TLSTREAM_ACQUIRE_V10_4 (_IOWR(0x82, 32, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_TEST (_IOWR(0x82, 33, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_STATS (_IOWR(0x82, 34, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_FLUSH (_IOWR(0x82, 35, __ioctl_placeholder)) +#define KBASE_IOCTL_HWCNT_READER_SETUP (_IOWR(0x82, 36, __ioctl_placeholder)) +#define KBASE_IOCTL_SET_PRFCNT_VALUES (_IOWR(0x82, 37, __ioctl_placeholder)) +#define KBASE_IOCTL_SOFT_EVENT_UPDATE (_IOWR(0x82, 38, __ioctl_placeholder)) +#define KBASE_IOCTL_MEM_JIT_INIT (_IOWR(0x82, 39, __ioctl_placeholder)) +#define KBASE_IOCTL_TLSTREAM_ACQUIRE (_IOWR(0x82, 40, __ioctl_placeholder)) + +#endif /* __KBASE_IOCTL_MIDGARD_H__ */ diff --git a/src/panfrost/base/include/old/mali-ioctl.h b/src/panfrost/base/include/old/mali-ioctl.h new file mode 100644 index 00000000000..5c76f2dc8e5 --- /dev/null +++ b/src/panfrost/base/include/old/mali-ioctl.h @@ -0,0 +1,743 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +/** + * Definitions for all of the ioctls for the original open source bifrost GPU + * kernel driver, written by ARM. + */ + +#ifndef __KBASE_IOCTL_H__ +#define __KBASE_IOCTL_H__ + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int32_t s32; +typedef int64_t s64; + + +typedef u8 mali_atom_id; + +/** + * Since these structs are passed to and from the kernel we need to make sure + * that we get the size of each struct to match exactly what the kernel is + * expecting. So, when editing this file make sure to add static asserts that + * check each struct's size against the arg length you see in strace. + */ + +enum kbase_ioctl_mem_flags { + /* IN */ + BASE_MEM_PROT_CPU_RD = (1U << 0), /**< Read access CPU side */ + BASE_MEM_PROT_CPU_WR = (1U << 1), /**< Write access CPU side */ + BASE_MEM_PROT_GPU_RD = (1U << 2), /**< Read access GPU side */ + BASE_MEM_PROT_GPU_WR = (1U << 3), /**< Write access GPU side */ + BASE_MEM_PROT_GPU_EX = (1U << 4), /**< Execute allowed on the GPU + side */ + + BASE_MEM_GROW_ON_GPF = (1U << 9), /**< Grow backing store on GPU + Page Fault */ + + BASE_MEM_COHERENT_SYSTEM = (1U << 10), /**< Page coherence Outer + shareable, if available */ + BASE_MEM_COHERENT_LOCAL = (1U << 11), /**< Page coherence Inner + shareable */ + BASE_MEM_CACHED_CPU = (1U << 12), /**< Should be cached on the + CPU */ + + /* IN/OUT */ + BASE_MEM_SAME_VA = (1U << 13), /**< Must have same VA on both the GPU + and the CPU */ + /* OUT */ + BASE_MEM_NEED_MMAP = (1U << 14), /**< Must call mmap to acquire a GPU + address for the alloc */ + /* IN */ + BASE_MEM_COHERENT_SYSTEM_REQUIRED = (1U << 15), /**< Page coherence + Outer shareable, required. 
*/ + BASE_MEM_SECURE = (1U << 16), /**< Secure memory */ + BASE_MEM_DONT_NEED = (1U << 17), /**< Not needed physical + memory */ + BASE_MEM_IMPORT_SHARED = (1U << 18), /**< Must use shared CPU/GPU zone + (SAME_VA zone) but doesn't + require the addresses to + be the same */ +}; + +#define KBASE_IOCTL_MEM_FLAGS_IN_MASK \ + (BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | \ + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_GPU_EX | \ + BASE_MEM_GROW_ON_GPF | \ + BASE_MEM_COHERENT_SYSTEM | BASE_MEM_COHERENT_LOCAL | \ + BASE_MEM_CACHED_CPU | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_SECURE | \ + BASE_MEM_DONT_NEED | BASE_MEM_IMPORT_SHARED) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ull << 12) + +enum kbase_ioctl_coherency_mode { + COHERENCY_ACE_LITE = 0, + COHERENCY_ACE = 1, + COHERENCY_NONE = 31 +}; + +/* + * Mali Atom priority + * + * Only certain priority levels are actually implemented, as specified by the + * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority + * level that is not one of those defined below. + * + * Priority levels only affect scheduling between atoms of the same type within + * a mali context, and only after the atoms have had dependencies resolved. + * Fragment atoms does not affect non-frament atoms with lower priorities, and + * the other way around. For example, a low priority atom that has had its + * dependencies resolved might run before a higher priority atom that has not + * had its dependencies resolved. + * + * The scheduling between mali contexts/processes and between atoms from + * different mali contexts/processes is unaffected by atom priority. + * + * The atoms are scheduled as follows with respect to their priorities: + * - Let atoms 'X' and 'Y' be for the same job slot who have dependencies + * resolved, and atom 'X' has a higher priority than atom 'Y' + * - If atom 'Y' is currently running on the HW, then it is interrupted to + * allow atom 'X' to run soon after + * - If instead neither atom 'Y' nor atom 'X' are running, then when choosing + * the next atom to run, atom 'X' will always be chosen instead of atom 'Y' + * - Any two atoms that have the same priority could run in any order with + * respect to each other. That is, there is no ordering constraint between + * atoms of the same priority. + */ +typedef u8 mali_jd_prio; +#define BASE_JD_PRIO_MEDIUM ((mali_jd_prio)0) +#define BASE_JD_PRIO_HIGH ((mali_jd_prio)1) +#define BASE_JD_PRIO_LOW ((mali_jd_prio)2) + +/** + * @brief Job dependency type. + * + * A flags field will be inserted into the atom structure to specify whether a + * dependency is a data or ordering dependency (by putting it before/after + * 'core_req' in the structure it should be possible to add without changing + * the structure size). When the flag is set for a particular dependency to + * signal that it is an ordering only dependency then errors will not be + * propagated. + */ +typedef u8 mali_jd_dep_type; +#define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ +#define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ +#define BASE_JD_DEP_TYPE_ORDER (1U << 1) /**< Order dependency */ + +/** + * @brief Job chain hardware requirements. + * + * A job chain must specify what GPU features it needs to allow the + * driver to schedule the job correctly. By not specifying the + * correct settings can/will cause an early job termination. Multiple + * values can be ORed together to specify multiple requirements. 
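As a concrete illustration of the memory flags defined above, here is a hedged sketch that composes a typical CPU-visible, GPU-read/write request; the particular combination is arbitrary rather than what any specific driver asks for.

#include <assert.h>
#include "mali-ioctl.h"

static u64
example_alloc_flags(void)
{
        u64 flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR |
                    BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |
                    BASE_MEM_CACHED_CPU | BASE_MEM_SAME_VA;

        /* BASE_MEM_SAME_VA is marked IN/OUT above, so it is the one bit in
         * this set that is not covered by KBASE_IOCTL_MEM_FLAGS_IN_MASK. */
        assert((flags & ~(u64)(KBASE_IOCTL_MEM_FLAGS_IN_MASK | BASE_MEM_SAME_VA)) == 0);

        return flags;
}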
+ * Special case is ::BASE_JD_REQ_DEP, which is used to express complex + * dependencies, and that doesn't execute anything on the hardware. + */ +typedef u32 mali_jd_core_req; + +/* Requirements that come from the HW */ + +/** + * No requirement, dependency only + */ +#define BASE_JD_REQ_DEP ((mali_jd_core_req)0) + +/** + * Requires fragment shaders + */ +#define BASE_JD_REQ_FS ((mali_jd_core_req)1 << 0) + +/** + * Requires compute shaders + * This covers any of the following Midgard Job types: + * - Vertex Shader Job + * - Geometry Shader Job + * - An actual Compute Shader Job + * + * Compare this with @ref BASE_JD_REQ_ONLY_COMPUTE, which specifies that the + * job is specifically just the "Compute Shader" job type, and not the "Vertex + * Shader" nor the "Geometry Shader" job type. + */ +#define BASE_JD_REQ_CS ((mali_jd_core_req)1 << 1) +#define BASE_JD_REQ_T ((mali_jd_core_req)1 << 2) /**< Requires tiling */ +#define BASE_JD_REQ_CF ((mali_jd_core_req)1 << 3) /**< Requires cache flushes */ +#define BASE_JD_REQ_V ((mali_jd_core_req)1 << 4) /**< Requires value writeback */ + +/* SW-only requirements - the HW does not expose these as part of the job slot + * capabilities */ + +/* Requires fragment job with AFBC encoding */ +#define BASE_JD_REQ_FS_AFBC ((mali_jd_core_req)1 << 13) + +/** + * SW-only requirement: coalesce completion events. + * If this bit is set then completion of this atom will not cause an event to + * be sent to userspace, whether successful or not; completion events will be + * deferred until an atom completes which does not have this bit set. + * + * This bit may not be used in combination with BASE_JD_REQ_EXTERNAL_RESOURCES. + */ +#define BASE_JD_REQ_EVENT_COALESCE ((mali_jd_core_req)1 << 5) + +/** + * SW Only requirement: the job chain requires a coherent core group. We don't + * mind which coherent core group is used. + */ +#define BASE_JD_REQ_COHERENT_GROUP ((mali_jd_core_req)1 << 6) + +/** + * SW Only requirement: The performance counters should be enabled only when + * they are needed, to reduce power consumption. + */ + +#define BASE_JD_REQ_PERMON ((mali_jd_core_req)1 << 7) + +/** + * SW Only requirement: External resources are referenced by this atom. When + * external resources are referenced no syncsets can be bundled with the atom + * but should instead be part of a NULL jobs inserted into the dependency + * tree. The first pre_dep object must be configured for the external + * resouces to use, the second pre_dep object can be used to create other + * dependencies. + * + * This bit may not be used in combination with BASE_JD_REQ_EVENT_COALESCE. + */ +#define BASE_JD_REQ_EXTERNAL_RESOURCES ((mali_jd_core_req)1 << 8) + +/** + * SW Only requirement: Software defined job. Jobs with this bit set will not + * be submitted to the hardware but will cause some action to happen within + * the driver + */ +#define BASE_JD_REQ_SOFT_JOB ((mali_jd_core_req)1 << 9) + +#define BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME (BASE_JD_REQ_SOFT_JOB | 0x1) +#define BASE_JD_REQ_SOFT_FENCE_TRIGGER (BASE_JD_REQ_SOFT_JOB | 0x2) +#define BASE_JD_REQ_SOFT_FENCE_WAIT (BASE_JD_REQ_SOFT_JOB | 0x3) + +/** + * SW Only requirement : Replay job. + * + * If the preceding job fails, the replay job will cause the jobs specified in + * the list of mali_jd_replay_payload pointed to by the jc pointer to be + * replayed. + * + * A replay job will only cause jobs to be replayed up to MALIP_JD_REPLAY_LIMIT + * times. 
If a job fails more than MALIP_JD_REPLAY_LIMIT times then the replay + * job is failed, as well as any following dependencies. + * + * The replayed jobs will require a number of atom IDs. If there are not enough + * free atom IDs then the replay job will fail. + * + * If the preceding job does not fail, then the replay job is returned as + * completed. + * + * The replayed jobs will never be returned to userspace. The preceding failed + * job will be returned to userspace as failed; the status of this job should + * be ignored. Completion should be determined by the status of the replay soft + * job. + * + * In order for the jobs to be replayed, the job headers will have to be + * modified. The Status field will be reset to NOT_STARTED. If the Job Type + * field indicates a Vertex Shader Job then it will be changed to Null Job. + * + * The replayed jobs have the following assumptions : + * + * - No external resources. Any required external resources will be held by the + * replay atom. + * - Pre-dependencies are created based on job order. + * - Atom numbers are automatically assigned. + * - device_nr is set to 0. This is not relevant as + * BASE_JD_REQ_SPECIFIC_COHERENT_GROUP should not be set. + * - Priority is inherited from the replay job. + */ +#define BASE_JD_REQ_SOFT_REPLAY (BASE_JD_REQ_SOFT_JOB | 0x4) +/** + * SW only requirement: event wait/trigger job. + * + * - BASE_JD_REQ_SOFT_EVENT_WAIT: this job will block until the event is set. + * - BASE_JD_REQ_SOFT_EVENT_SET: this job sets the event, thus unblocks the + * other waiting jobs. It completes immediately. + * - BASE_JD_REQ_SOFT_EVENT_RESET: this job resets the event, making it + * possible for other jobs to wait upon. It completes immediately. + */ +#define BASE_JD_REQ_SOFT_EVENT_WAIT (BASE_JD_REQ_SOFT_JOB | 0x5) +#define BASE_JD_REQ_SOFT_EVENT_SET (BASE_JD_REQ_SOFT_JOB | 0x6) +#define BASE_JD_REQ_SOFT_EVENT_RESET (BASE_JD_REQ_SOFT_JOB | 0x7) + +#define BASE_JD_REQ_SOFT_DEBUG_COPY (BASE_JD_REQ_SOFT_JOB | 0x8) + +/** + * SW only requirement: Just In Time allocation + * + * This job requests a JIT allocation based on the request in the + * @base_jit_alloc_info structure which is passed via the jc element of + * the atom. + * + * It should be noted that the id entry in @base_jit_alloc_info must not + * be reused until it has been released via @BASE_JD_REQ_SOFT_JIT_FREE. + * + * Should this soft job fail it is expected that a @BASE_JD_REQ_SOFT_JIT_FREE + * soft job to free the JIT allocation is still made. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_ALLOC (BASE_JD_REQ_SOFT_JOB | 0x9) +/** + * SW only requirement: Just In Time free + * + * This job requests a JIT allocation created by @BASE_JD_REQ_SOFT_JIT_ALLOC + * to be freed. The ID of the JIT allocation is passed via the jc element of + * the atom. + * + * The job will complete immediately. + */ +#define BASE_JD_REQ_SOFT_JIT_FREE (BASE_JD_REQ_SOFT_JOB | 0xa) + +/** + * SW only requirement: Map external resource + * + * This job requests external resource(s) are mapped once the dependencies + * of the job have been satisfied. The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * @base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_MAP (BASE_JD_REQ_SOFT_JOB | 0xb) +/** + * SW only requirement: Unmap external resource + * + * This job requests external resource(s) are unmapped once the dependencies + * of the job has been satisfied. 
The list of external resources are + * passed via the jc element of the atom which is a pointer to a + * @base_external_resource_list. + */ +#define BASE_JD_REQ_SOFT_EXT_RES_UNMAP (BASE_JD_REQ_SOFT_JOB | 0xc) + +/** + * HW Requirement: Requires Compute shaders (but not Vertex or Geometry Shaders) + * + * This indicates that the Job Chain contains Midgard Jobs of the 'Compute + * Shaders' type. + * + * In contrast to @ref BASE_JD_REQ_CS, this does \b not indicate that the Job + * Chain contains 'Geometry Shader' or 'Vertex Shader' jobs. + */ +#define BASE_JD_REQ_ONLY_COMPUTE ((mali_jd_core_req)1 << 10) + +/** + * HW Requirement: Use the mali_jd_atom::device_nr field to specify a + * particular core group + * + * If both @ref BASE_JD_REQ_COHERENT_GROUP and this flag are set, this flag + * takes priority + * + * This is only guaranteed to work for @ref BASE_JD_REQ_ONLY_COMPUTE atoms. + * + * If the core availability policy is keeping the required core group turned + * off, then the job will fail with a @ref BASE_JD_EVENT_PM_EVENT error code. + */ +#define BASE_JD_REQ_SPECIFIC_COHERENT_GROUP ((mali_jd_core_req)1 << 11) + +/** + * SW Flag: If this bit is set then the successful completion of this atom + * will not cause an event to be sent to userspace + */ +#define BASE_JD_REQ_EVENT_ONLY_ON_FAILURE ((mali_jd_core_req)1 << 12) + +/** + * SW Flag: If this bit is set then completion of this atom will not cause an + * event to be sent to userspace, whether successful or not. + */ +#define BASE_JD_REQ_EVENT_NEVER ((mali_jd_core_req)1 << 14) + +/** + * SW Flag: Skip GPU cache clean and invalidation before starting a GPU job. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job starts which does not have this bit set or a job completes + * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_END bit set. Do not use if + * the CPU may have written to memory addressed by the job since the last job + * without this bit set was submitted. + */ +#define BASE_JD_REQ_SKIP_CACHE_START ((mali_jd_core_req)1 << 15) + +/** + * SW Flag: Skip GPU cache clean and invalidation after a GPU job completes. + * + * If this bit is set then the GPU's cache will not be cleaned and invalidated + * until a GPU job completes which does not have this bit set or a job starts + * which does not have the @ref BASE_JD_REQ_SKIP_CACHE_START bti set. Do not + * use if the CPU may read from or partially overwrite memory addressed by the + * job before the next job without this bit set completes. + */ +#define BASE_JD_REQ_SKIP_CACHE_END ((mali_jd_core_req)1 << 16) + +/** + * These requirement bits are currently unused in mali_jd_core_req + */ +#define MALIP_JD_REQ_RESERVED \ + (~(BASE_JD_REQ_ATOM_TYPE | BASE_JD_REQ_EXTERNAL_RESOURCES | \ + BASE_JD_REQ_EVENT_ONLY_ON_FAILURE | MALIP_JD_REQ_EVENT_NEVER | \ + BASE_JD_REQ_EVENT_COALESCE | \ + BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP | \ + BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ + BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END)) + +/** + * Mask of all bits in mali_jd_core_req that control the type of the atom. + * + * This allows dependency only atoms to have flags set + */ +#define BASE_JD_REQ_ATOM_TYPE \ + (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | \ + BASE_JD_REQ_V | BASE_JD_REQ_SOFT_JOB | BASE_JD_REQ_ONLY_COMPUTE) + +/** + * Mask of all bits in mali_jd_core_req that control the type of a soft job. 
+ */ +#define BASE_JD_REQ_SOFT_JOB_TYPE (BASE_JD_REQ_SOFT_JOB | 0x1f) + +/* + * Returns non-zero value if core requirements passed define a soft job or + * a dependency only job. + */ +#define BASE_JD_REQ_SOFT_JOB_OR_DEP(core_req) \ + ((core_req & BASE_JD_REQ_SOFT_JOB) || \ + (core_req & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) + +/** + * @brief The payload for a replay job. This must be in GPU memory. + */ +struct mali_jd_replay_payload { + /** + * Pointer to the first entry in the mali_jd_replay_jc list. These + * will be replayed in @b reverse order (so that extra ones can be added + * to the head in future soft jobs without affecting this soft job) + */ + u64 tiler_jc_list; + + /** + * Pointer to the fragment job chain. + */ + u64 fragment_jc; + + /** + * Pointer to the tiler heap free FBD field to be modified. + */ + u64 tiler_heap_free; + + /** + * Hierarchy mask for the replayed fragment jobs. May be zero. + */ + u16 fragment_hierarchy_mask; + + /** + * Hierarchy mask for the replayed tiler jobs. May be zero. + */ + u16 tiler_hierarchy_mask; + + /** + * Default weight to be used for hierarchy levels not in the original + * mask. + */ + u32 hierarchy_default_weight; + + /** + * Core requirements for the tiler job chain + */ + mali_jd_core_req tiler_core_req; + + /** + * Core requirements for the fragment job chain + */ + mali_jd_core_req fragment_core_req; +}; + +/** + * @brief An entry in the linked list of job chains to be replayed. This must + * be in GPU memory. + */ +struct mali_jd_replay_jc { + /** + * Pointer to next entry in the list. A setting of NULL indicates the + * end of the list. + */ + u64 next; + + /** + * Pointer to the job chain. + */ + u64 jc; +}; + +typedef u64 mali_ptr; + +#define MALI_PTR_FMT "0x%" PRIx64 +#define MALI_SHORT_PTR_FMT "0x%" PRIxPTR + +#ifdef __LP64__ +#define PAD_CPU_PTR(p) p +#else +#define PAD_CPU_PTR(p) p; u32 :32; +#endif + +/* FIXME: Again, they don't specify any of these as packed structs. However, + * looking at these structs I'm worried that there is already spots where the + * compiler is potentially sticking in padding... + * Going to try something a little crazy, and just hope that our compiler + * happens to add the same kind of offsets since we can't really compare sizes + */ + +/* + * Blob provided by the driver to store callback driver, not actually modified + * by the driver itself + */ +struct mali_jd_udata { + u64 blob[2]; +}; + +struct mali_jd_dependency { + mali_atom_id atom_id; /**< An atom number */ + mali_jd_dep_type dependency_type; /**< Dependency type */ +}; + +#define MALI_EXT_RES_MAX 10 + +/* The original header never explicitly defines any values for these. 
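The two helpers just defined, BASE_JD_REQ_SOFT_JOB_TYPE and BASE_JD_REQ_SOFT_JOB_OR_DEP(), are easiest to see with concrete values. A small sketch with arbitrary example requirements:

#include <assert.h>
#include "mali-ioctl.h"

static void
classify_core_req_examples(void)
{
        mali_jd_core_req fragment = BASE_JD_REQ_FS | BASE_JD_REQ_SKIP_CACHE_START;
        mali_jd_core_req fence_wait = BASE_JD_REQ_SOFT_FENCE_WAIT;
        mali_jd_core_req dep_only = BASE_JD_REQ_DEP;

        /* Hardware atoms are neither soft jobs nor dependency-only atoms. */
        assert(!BASE_JD_REQ_SOFT_JOB_OR_DEP(fragment));
        assert(BASE_JD_REQ_SOFT_JOB_OR_DEP(fence_wait));
        assert(BASE_JD_REQ_SOFT_JOB_OR_DEP(dep_only));

        /* The soft-job subtype is recovered by masking with
         * BASE_JD_REQ_SOFT_JOB_TYPE. */
        assert((fence_wait & BASE_JD_REQ_SOFT_JOB_TYPE) == BASE_JD_REQ_SOFT_FENCE_WAIT);
}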
In C, + * this -should- expand to SHARED == 0 and EXCLUSIVE == 1, so the only flag we + * actually need to decode here is EXCLUSIVE + */ +enum mali_external_resource_access { + MALI_EXT_RES_ACCESS_SHARED, + MALI_EXT_RES_ACCESS_EXCLUSIVE, +}; + +/* An aligned address to the resource | mali_external_resource_access */ +typedef u64 mali_external_resource; + +struct base_jd_atom_v2 { + mali_ptr jc; /**< job-chain GPU address */ + struct mali_jd_udata udata; /**< user data */ + u64 extres_list; /**< list of external resources */ + u16 nr_extres; /**< nr of external resources */ + u16 compat_core_req; /**< core requirements which + correspond to the legacy support + for UK 10.2 */ + struct mali_jd_dependency pre_dep[2]; /**< pre-dependencies, one need to + use SETTER function to assign + this field, this is done in + order to reduce possibility of + improper assigment of a + dependency field */ + mali_atom_id atom_number; /**< unique number to identify the + atom */ + mali_jd_prio prio; /**< Atom priority. Refer to @ref + mali_jd_prio for more details */ + u8 device_nr; /**< coregroup when + BASE_JD_REQ_SPECIFIC_COHERENT_GROUP + specified */ + u8 :8; + mali_jd_core_req core_req; /**< core requirements */ +} __attribute__((packed)); + +/** + * enum mali_error - Mali error codes shared with userspace + * + * This is subset of those common Mali errors that can be returned to userspace. + * Values of matching user and kernel space enumerators MUST be the same. + * MALI_ERROR_NONE is guaranteed to be 0. + * + * @MALI_ERROR_NONE: Success + * @MALI_ERROR_OUT_OF_GPU_MEMORY: Not used in the kernel driver + * @MALI_ERROR_OUT_OF_MEMORY: Memory allocation failure + * @MALI_ERROR_FUNCTION_FAILED: Generic error code + */ +enum mali_error { + MALI_ERROR_NONE = 0, + MALI_ERROR_OUT_OF_GPU_MEMORY, + MALI_ERROR_OUT_OF_MEMORY, + MALI_ERROR_FUNCTION_FAILED, +}; + +/** + * Header used by all ioctls + */ +union kbase_ioctl_header { +#ifdef dvalin + u32 pad[0]; +#else + /* [in] The ID of the UK function being called */ + u32 id :32; + /* [out] The return value of the UK function that was called */ + enum mali_error rc :32; + + u64 :64; +#endif +} __attribute__((packed)); + +struct kbase_ioctl_get_version { + union kbase_ioctl_header header; + u16 major; /* [out] */ + u16 minor; /* [out] */ + u32 :32; +} __attribute__((packed)); + +struct mali_mem_import_user_buffer { + u64 ptr; + u64 length; +}; + +union kbase_ioctl_mem_import { + struct { + union kbase_ioctl_header header; + u64 phandle; + enum { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + BASE_MEM_IMPORT_TYPE_UMP = 1, + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3, + } type :32; + u32 :32; + u64 flags; + } in; + struct { + union kbase_ioctl_header header; + u64 pad[2]; + u64 flags; + u64 gpu_va; + u64 va_pages; + } out; +} __attribute__((packed)); + +struct kbase_ioctl_mem_commit { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_addr; + u64 pages; + /* [out] */ + u32 result_subcode; + u32 :32; +} __attribute__((packed)); + +enum kbase_ioctl_mem_query_type { + BASE_MEM_QUERY_COMMIT_SIZE = 1, + BASE_MEM_QUERY_VA_SIZE = 2, + BASE_MEM_QUERY_FLAGS = 3 +}; + +struct kbase_ioctl_mem_query { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_addr; + enum kbase_ioctl_mem_query_type query : 32; + u32 :32; + /* [out] */ + u64 value; +} __attribute__((packed)); + +struct kbase_ioctl_mem_free { + union kbase_ioctl_header header; + mali_ptr gpu_addr; /* [in] */ +} __attribute__((packed)); +/* FIXME: Size unconfirmed (haven't seen in a 
trace yet) */ + +struct kbase_ioctl_mem_flags_change { + union kbase_ioctl_header header; + /* [in] */ + mali_ptr gpu_va; + u64 flags; + u64 mask; +} __attribute__((packed)); +/* FIXME: Size unconfirmed (haven't seen in a trace yet) */ + +struct kbase_ioctl_mem_alias { + union kbase_ioctl_header header; + /* [in/out] */ + u64 flags; + /* [in] */ + u64 stride; + u64 nents; + u64 ai; + /* [out] */ + mali_ptr gpu_va; + u64 va_pages; +} __attribute__((packed)); + +struct kbase_ioctl_mem_sync { + union kbase_ioctl_header header; + mali_ptr handle; + u64 user_addr; + u64 size; + enum { + MALI_SYNC_TO_DEVICE = 1, + MALI_SYNC_TO_CPU = 2, + } type :8; + u64 :56; +} __attribute__((packed)); + +struct kbase_ioctl_set_flags { + union kbase_ioctl_header header; + u32 create_flags; /* [in] */ + u32 :32; +} __attribute__((packed)); + +struct kbase_ioctl_stream_create { + union kbase_ioctl_header header; + /* [in] */ + char name[32]; + /* [out] */ + s32 fd; + u32 :32; +} __attribute__((packed)); + +struct kbase_ioctl_job_submit { + union kbase_ioctl_header header; + /* [in] */ + u64 addr; + u32 nr_atoms; + u32 stride; +} __attribute__((packed)); + +struct kbase_ioctl_get_context_id { + union kbase_ioctl_header header; + /* [out] */ + s64 id; +} __attribute__((packed)); + +#undef PAD_CPU_PTR + +enum base_jd_event_code { + BASE_JD_EVENT_DONE = 1, +}; + +struct base_jd_event_v2 { + enum base_jd_event_code event_code; + mali_atom_id atom_number; + struct mali_jd_udata udata; +}; + +/* Defined in mali-props.h */ +struct kbase_ioctl_gpu_props_reg_dump; + +/* For ioctl's we haven't written decoding stuff for yet */ +typedef struct { + union kbase_ioctl_header header; +} __ioctl_placeholder; + +#endif /* __KBASE_IOCTL_H__ */ diff --git a/src/panfrost/base/include/old/mali-props.h b/src/panfrost/base/include/old/mali-props.h new file mode 100644 index 00000000000..5b9d8723600 --- /dev/null +++ b/src/panfrost/base/include/old/mali-props.h @@ -0,0 +1,262 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * A copy of the licence is included with the program, and can also be obtained + * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef __MALI_PROPS_H__ +#define __MALI_PROPS_H__ + +#include "mali-ioctl.h" + +#define MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS 3 +#define MALI_GPU_MAX_JOB_SLOTS 16 +#define MALI_MAX_COHERENT_GROUPS 16 + +/* Capabilities of a job slot as reported by JS_FEATURES registers */ + +#define JS_FEATURE_NULL_JOB (1u << 1) +#define JS_FEATURE_SET_VALUE_JOB (1u << 2) +#define JS_FEATURE_CACHE_FLUSH_JOB (1u << 3) +#define JS_FEATURE_COMPUTE_JOB (1u << 4) +#define JS_FEATURE_VERTEX_JOB (1u << 5) +#define JS_FEATURE_GEOMETRY_JOB (1u << 6) +#define JS_FEATURE_TILER_JOB (1u << 7) +#define JS_FEATURE_FUSED_JOB (1u << 8) +#define JS_FEATURE_FRAGMENT_JOB (1u << 9) + +struct mali_gpu_core_props { + /** + * Product specific value. + */ + u32 product_id; + + /** + * Status of the GPU release. + * No defined values, but starts at 0 and increases by one for each + * release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + */ + u16 version_status; + + /** + * Minor release number of the GPU. "P" part of an "RnPn" release + * number. + * 8 bit values (0-255). 
+ */ + u16 minor_revision; + + /** + * Major release number of the GPU. "R" part of an "RnPn" release + * number. + * 4 bit values (0-15). + */ + u16 major_revision; + + u16 :16; + + /** + * @usecase GPU clock speed is not specified in the Midgard + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() + * function. + */ + u32 gpu_speed_mhz; + + /** + * @usecase GPU clock max/min speed is required for computing + * best/worst case in tasks as job scheduling ant irq_throttling. (It + * is not specified in the Midgard Architecture). + */ + u32 gpu_freq_khz_max; + u32 gpu_freq_khz_min; + + /** + * Size of the shader program counter, in bits. + */ + u32 log2_program_counter_size; + + /** + * TEXTURE_FEATURES_x registers, as exposed by the GPU. This is a + * bitpattern where a set bit indicates that the format is supported. + * + * Before using a texture format, it is recommended that the + * corresponding bit be checked. + */ + u32 texture_features[MALI_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + /** + * Theoretical maximum memory available to the GPU. It is unlikely + * that a client will be able to allocate all of this memory for their + * own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + */ + u64 gpu_available_memory_size; +}; + +struct mali_gpu_l2_cache_props { + u8 log2_line_size; + u8 log2_cache_size; + u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + u64 :40; +}; + +struct mali_gpu_tiler_props { + u32 bin_size_bytes; /* Max is 4*2^15 */ + u32 max_active_levels; /* Max is 2^15 */ +}; + +struct mali_gpu_thread_props { + u32 max_threads; /* Max. number of threads per core */ + u32 max_workgroup_size; /* Max. number of threads per workgroup */ + u32 max_barrier_size; /* Max. number of threads that can + synchronize on a simple barrier */ + u16 max_registers; /* Total size [1..65535] of the register + file available per core. */ + u8 max_task_queue; /* Max. tasks [1..255] which may be sent + to a core before it becomes blocked. */ + u8 max_thread_group_split; /* Max. allowed value [1..15] of the + Thread Group Split field. */ + enum { + MALI_GPU_IMPLEMENTATION_UNKNOWN = 0, + MALI_GPU_IMPLEMENTATION_SILICON = 1, + MALI_GPU_IMPLEMENTATION_FPGA = 2, + MALI_GPU_IMPLEMENTATION_SW = 3, + } impl_tech :8; + u64 :56; +}; + +/** + * @brief descriptor for a coherent group + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_ioctl_gpu_coherent_group { + u64 core_mask; /**< Core restriction mask required for the + group */ + u16 num_cores; /**< Number of cores in the group */ + u64 :48; +}; + +/** + * @brief Coherency group information + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. 
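A hedged sketch of walking the coherent-group descriptors defined just above; the groups pointer and count are assumed to come from the coherency information block that a KBASE_IOCTL_GPU_PROPS_REG_DUMP call fills in.

#include <stdio.h>
#include "mali-props.h"

static void
print_coherent_groups(const struct mali_ioctl_gpu_coherent_group *groups,
                      u32 num_groups)
{
        for (u32 i = 0; i < num_groups && i < MALI_MAX_COHERENT_GROUPS; i++)
                printf("group %u: core mask 0x%016llx, %u cores\n",
                       i, (unsigned long long)groups[i].core_mask,
                       (unsigned)groups[i].num_cores);
}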
+ */ +struct mali_gpu_coherent_group_info { + u32 num_groups; + + /** + * Number of core groups (coherent or not) in the GPU. Equivalent to + * the number of L2 Caches. + * + * The GPU Counter dumping writes 2048 bytes per core group, + * regardless of whether the core groups are coherent or not. Hence + * this member is needed to calculate how much memory is required for + * dumping. + * + * @note Do not use it to work out how many valid elements are in the + * group[] member. Use num_groups instead. + */ + u32 num_core_groups; + + /** + * Coherency features of the memory, accessed by @ref gpu_mem_features + * methods + */ + u32 coherency; + + u32 :32; + + /** + * Descriptors of coherent groups + */ + struct mali_ioctl_gpu_coherent_group group[MALI_MAX_COHERENT_GROUPS]; +}; + +/** + * A complete description of the GPU's Hardware Configuration Discovery + * registers. + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * @usecase The raw properties in @ref gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct mali_gpu_raw_props { + u64 shader_present; + u64 tiler_present; + u64 l2_present; + u64 stack_present; + + u32 l2_features; + u32 suspend_size; /* API 8.2+ */ + u32 mem_features; + u32 mmu_features; + + u32 as_present; + + u32 js_present; + u32 js_features[MALI_GPU_MAX_JOB_SLOTS]; + u32 tiler_features; + u32 texture_features[3]; + + u32 gpu_id; + + u32 thread_max_threads; + u32 thread_max_workgroup_size; + u32 thread_max_barrier_size; + u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + u32 coherency_mode; +}; + +struct kbase_ioctl_gpu_props_reg_dump { + union kbase_ioctl_header header; + struct mali_gpu_core_props core; + struct mali_gpu_l2_cache_props l2; + u64 :64; + struct mali_gpu_tiler_props tiler; + struct mali_gpu_thread_props thread; + + struct mali_gpu_raw_props raw; + + /** This must be last member of the structure */ + struct mali_gpu_coherent_group_info coherency_info; +} __attribute__((packed)); + +#endif diff --git a/src/panfrost/base/meson.build b/src/panfrost/base/meson.build new file mode 100644 index 00000000000..5d7b9f1dff9 --- /dev/null +++ b/src/panfrost/base/meson.build @@ -0,0 +1,55 @@ +# Copyright © 2018 Rob Clark +# Copyright © 2019 Collabora +# Copyright © 2022 Icecream95 + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libpanfrost_base_versions = ['0', '1', '2', '258'] +libpanfrost_base_per_arch = [] + +foreach ver : libpanfrost_base_versions + libpanfrost_base_per_arch += static_library( + 'pan-base-v' + ver, + 'pan_vX_base.c', + include_directories : [ + inc_src, inc_include, inc_gallium, inc_mesa, inc_gallium_aux, + include_directories('include'), + ], + c_args : ['-DPAN_BASE_VER=' + ver], + gnu_symbol_visibility : 'hidden', + dependencies: [dep_valgrind], +) +endforeach + +libpanfrost_base = static_library( + 'panfrost_base', + 'pan_base.c', + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw, + include_directories('include'), + ], + gnu_symbol_visibility : 'hidden', + build_by_default : false, + link_with: [libpanfrost_base_per_arch], +) + +libpanfrost_base_dep = declare_dependency( + link_with: [libpanfrost_base_per_arch, libpanfrost_base], + include_directories: [include_directories('.')], +) diff --git a/src/panfrost/base/pan_base.c b/src/panfrost/base/pan_base.c new file mode 100644 index 00000000000..22dc09cfb52 --- /dev/null +++ b/src/panfrost/base/pan_base.c @@ -0,0 +1,301 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/macros.h" +#include "pan_base.h" + +#include "mali_kbase_ioctl.h" + +bool +kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose) +{ + *k = (struct kbase_) {0}; + k->fd = fd; + k->cs_queue_count = cs_queue_count; + k->page_size = sysconf(_SC_PAGE_SIZE); + k->verbose = verbose; + + if (k->fd == -1) + return kbase_open_csf_noop(k); + + struct kbase_ioctl_version_check ver = { 0 }; + + if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK_RESERVED, &ver) == 0) { + return kbase_open_csf(k); + } else if (ioctl(k->fd, KBASE_IOCTL_VERSION_CHECK, &ver) == 0) { + if (ver.major == 3) + return kbase_open_old(k); + else + return kbase_open_new(k); + } + + return false; +} + +/* If fd != -1, ownership is passed in */ +int +kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd) +{ + kbase_handle h = { + .va = va, + .fd = fd + }; + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + kbase_handle *handles = util_dynarray_begin(&k->gem_handles); + + for (unsigned i = 0; i < size; ++i) { + if (handles[i].fd == -2) { + handles[i] = h; + return i; + } + } + + util_dynarray_append(&k->gem_handles, kbase_handle, h); + + return size; +} + +int +kbase_alloc_gem_handle(kbase k, base_va va, int fd) +{ + pthread_mutex_lock(&k->handle_lock); + + int ret = kbase_alloc_gem_handle_locked(k, va, fd); + + pthread_mutex_unlock(&k->handle_lock); + + return ret; +} + +void +kbase_free_gem_handle(kbase k, int handle) +{ + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + int fd; + + if (handle >= size) { + pthread_mutex_unlock(&k->handle_lock); + return; + } + + if (handle + 1 < size) { + kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); + fd = ptr->fd; + ptr->fd = -2; + } else { + fd = (util_dynarray_pop(&k->gem_handles, kbase_handle)).fd; + } + + if (fd != -1) + close(fd); + + pthread_mutex_unlock(&k->handle_lock); +} + +kbase_handle +kbase_gem_handle_get(kbase k, int handle) +{ + kbase_handle h = { .fd = -1 }; + + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + if (handle < size) + h = *util_dynarray_element(&k->gem_handles, kbase_handle, handle); + + pthread_mutex_unlock(&k->handle_lock); + + return h; +} + +int +kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers) +{ + struct kbase_wait_ctx wait = kbase_wait_init(k, timeout_ns); + + while (kbase_wait_for_event(&wait)) { + pthread_mutex_lock(&k->handle_lock); + if (handle >= util_dynarray_num_elements(&k->gem_handles, kbase_handle)) { + pthread_mutex_unlock(&k->handle_lock); + kbase_wait_fini(wait); + errno = EINVAL; + return -1; + } + kbase_handle *ptr = util_dynarray_element(&k->gem_handles, kbase_handle, handle); + if (!ptr->use_count) { + pthread_mutex_unlock(&k->handle_lock); + kbase_wait_fini(wait); + return 0; + } + pthread_mutex_unlock(&k->handle_lock); + } + + kbase_wait_fini(wait); + errno = ETIMEDOUT; + return -1; +} + +static void +adjust_time(struct timespec *tp, int64_t ns) +{ + ns += tp->tv_nsec; + tp->tv_nsec = ns % 1000000000; + tp->tv_sec += ns / 1000000000; +} + +static int64_t +ns_until(struct timespec tp) +{ + struct timespec now; + clock_gettime(CLOCK_MONOTONIC, &now); + + int64_t sec = (tp.tv_sec - now.tv_sec) * 1000000000; + int64_t ns = tp.tv_nsec - 
now.tv_nsec; + + /* Clamp the value to zero to avoid errors from ppoll */ + return MAX2(sec + ns, 0); +} + +static void +kbase_wait_signal(kbase k) +{ + /* We must acquire the event condition lock, otherwise another + * thread could be between the trylock and the cond_wait, and + * not notice the broadcast. */ + pthread_mutex_lock(&k->event_cnd_lock); + pthread_cond_broadcast(&k->event_cnd); + pthread_mutex_unlock(&k->event_cnd_lock); +} + +struct kbase_wait_ctx +kbase_wait_init(kbase k, int64_t timeout_ns) +{ + struct timespec tp; + clock_gettime(CLOCK_MONOTONIC, &tp); + + adjust_time(&tp, timeout_ns); + + return (struct kbase_wait_ctx) { + .k = k, + .until = tp, + }; +} + +bool +kbase_wait_for_event(struct kbase_wait_ctx *ctx) +{ + kbase k = ctx->k; + + /* Return instantly the first time so that a check outside the + * wait_for_Event loop is not required */ + if (!ctx->has_cnd_lock) { + pthread_mutex_lock(&k->event_cnd_lock); + ctx->has_cnd_lock = true; + return true; + } + + if (!ctx->has_lock) { + if (pthread_mutex_trylock(&k->event_read_lock) == 0) { + ctx->has_lock = true; + pthread_mutex_unlock(&k->event_cnd_lock); + } else { + int ret = pthread_cond_timedwait(&k->event_cnd, + &k->event_cnd_lock, &ctx->until); + return ret != ETIMEDOUT; + } + } + + bool event = k->poll_event(k, ns_until(ctx->until)); + k->handle_events(k); + kbase_wait_signal(k); + return event; +} + +void +kbase_wait_fini(struct kbase_wait_ctx ctx) +{ + kbase k = ctx.k; + + if (ctx.has_lock) { + pthread_mutex_unlock(&k->event_read_lock); + kbase_wait_signal(k); + } else if (ctx.has_cnd_lock) { + pthread_mutex_unlock(&k->event_cnd_lock); + } +} + +void +kbase_ensure_handle_events(kbase k) +{ + /* If we don't manage to take the lock, then events have recently/will + * soon be handled, there is no need to do anything. */ + if (pthread_mutex_trylock(&k->event_read_lock) == 0) { + k->handle_events(k); + pthread_mutex_unlock(&k->event_read_lock); + kbase_wait_signal(k); + } +} + +bool +kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp) +{ + struct pollfd pfd = { + .fd = fd, + .events = wait_shared ? POLLOUT : POLLIN, + }; + + uint64_t timeout = ns_until(tp); + + struct timespec t = { + .tv_sec = timeout / 1000000000, + .tv_nsec = timeout % 1000000000, + }; + + int ret = ppoll(&pfd, 1, &t, NULL); + + if (ret == -1 && errno != EINTR) + perror("kbase_poll_fd_until"); + + return ret != 0; +} diff --git a/src/panfrost/base/pan_base.h b/src/panfrost/base/pan_base.h new file mode 100644 index 00000000000..878f7468433 --- /dev/null +++ b/src/panfrost/base/pan_base.h @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Library for interfacing with kbase */ +#ifndef PAN_BASE_H +#define PAN_BASE_H + +#include "util/u_dynarray.h" +#include "util/list.h" + +#define PAN_EVENT_SIZE 16 + +typedef uint64_t base_va; +struct base_ptr { + void *cpu; + base_va gpu; +}; + +struct kbase_syncobj; + +/* The job is done when the queue seqnum > seqnum */ +struct kbase_sync_link { + struct kbase_sync_link *next; /* must be first */ + uint64_t seqnum; + void (*callback)(void *); + void *data; +}; + +struct kbase_event_slot { + struct kbase_sync_link *syncobjs; + struct kbase_sync_link **back; + uint64_t last_submit; + uint64_t last; +}; + +struct kbase_context { + uint8_t csg_handle; + uint8_t kcpu_queue; + bool kcpu_init; // TODO: Always create a queue? + uint32_t csg_uid; + unsigned num_csi; + + unsigned tiler_heap_chunk_size; + base_va tiler_heap_va; + base_va tiler_heap_header; +}; + +struct kbase_cs { + struct kbase_context *ctx; + void *user_io; + base_va va; + unsigned size; + unsigned event_mem_offset; + unsigned csi; + + uint64_t last_insert; + + // TODO: This is only here because it's convenient for emit_csf_queue + uint32_t *latest_flush; +}; + +#define KBASE_SLOT_COUNT 2 + +typedef struct { + base_va va; + int fd; + uint8_t use_count; + /* For emulating implicit sync. TODO make this work on v10 */ + uint8_t last_access[KBASE_SLOT_COUNT]; +} kbase_handle; + +struct kbase_; +typedef struct kbase_ *kbase; + +struct kbase_ { + unsigned setup_state; + bool verbose; + + int fd; + unsigned api; + unsigned page_size; + // TODO: Actually we may want to try to pack multiple contexts / queue + // "sets" into a single group... + unsigned cs_queue_count; + + /* Must not hold handle_lock while acquiring event_read_lock */ + pthread_mutex_t handle_lock; + pthread_mutex_t event_read_lock; + pthread_mutex_t event_cnd_lock; + pthread_cond_t event_cnd; + /* TODO: Per-context/queue locks? */ + pthread_mutex_t queue_lock; + + struct list_head syncobjs; + + unsigned gpuprops_size; + void *gpuprops; + + void *tracking_region; + void *csf_user_reg; + struct base_ptr event_mem; + struct base_ptr kcpu_event_mem; + // TODO: dynamically size + struct kbase_event_slot event_slots[256]; + // TODO: USe a bitset? 
+ unsigned event_slot_usage; + + uint8_t atom_number; + + struct util_dynarray gem_handles; + struct util_dynarray atom_bos[256]; + uint64_t job_seq; + + void (*close)(kbase k); + + bool (*get_pan_gpuprop)(kbase k, unsigned name, uint64_t *value); + bool (*get_mali_gpuprop)(kbase k, unsigned name, uint64_t *value); + + struct base_ptr (*alloc)(kbase k, size_t size, + unsigned pan_flags, + unsigned mali_flags); + void (*free)(kbase k, base_va va); + + int (*import_dmabuf)(kbase k, int fd); + void *(*mmap_import)(kbase k, base_va va, size_t size); + + void (*cache_clean)(void *ptr, size_t size); + void (*cache_invalidate)(void *ptr, size_t size); + + /* Returns false on timeout */ + bool (*poll_event)(kbase k, int64_t timeout_ns); + bool (*handle_events)(kbase k); + + /* <= v9 GPUs */ + int (*submit)(kbase k, uint64_t va, unsigned req, + struct kbase_syncobj *o, + int32_t *handles, unsigned num_handles); + + /* >= v10 GPUs */ + struct kbase_context *(*context_create)(kbase k); + void (*context_destroy)(kbase k, struct kbase_context *ctx); + bool (*context_recreate)(kbase k, struct kbase_context *ctx); + + // TODO: Pass in a priority? + struct kbase_cs (*cs_bind)(kbase k, struct kbase_context *ctx, + base_va va, unsigned size); + void (*cs_term)(kbase k, struct kbase_cs *cs); + void (*cs_rebind)(kbase k, struct kbase_cs *cs); + + bool (*cs_submit)(kbase k, struct kbase_cs *cs, uint64_t insert_offset, + struct kbase_syncobj *o, uint64_t seqnum); + bool (*cs_wait)(kbase k, struct kbase_cs *cs, uint64_t extract_offset, + struct kbase_syncobj *o); + + int (*kcpu_fence_export)(kbase k, struct kbase_context *ctx); + bool (*kcpu_fence_import)(kbase k, struct kbase_context *ctx, int fd); + + bool (*kcpu_cqs_set)(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value); + bool (*kcpu_cqs_wait)(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value); + + /* syncobj functions */ + struct kbase_syncobj *(*syncobj_create)(kbase k); + void (*syncobj_destroy)(kbase k, struct kbase_syncobj *o); + struct kbase_syncobj *(*syncobj_dup)(kbase k, struct kbase_syncobj *o); + /* TODO: timeout? 
(and for cs_wait) */ + bool (*syncobj_wait)(kbase k, struct kbase_syncobj *o); + + /* Returns false if there are no active queues */ + bool (*callback_all_queues)(kbase k, int32_t *count, + void (*callback)(void *), void *data); + + void (*mem_sync)(kbase k, base_va gpu, void *cpu, size_t size, + bool invalidate); +}; + +bool kbase_open(kbase k, int fd, unsigned cs_queue_count, bool verbose); + +/* Called from kbase_open */ +bool kbase_open_old(kbase k); +bool kbase_open_new(kbase k); +bool kbase_open_csf(kbase k); +bool kbase_open_csf_noop(kbase k); + +/* BO management */ +int kbase_alloc_gem_handle(kbase k, base_va va, int fd); +int kbase_alloc_gem_handle_locked(kbase k, base_va va, int fd); +void kbase_free_gem_handle(kbase k, int handle); +kbase_handle kbase_gem_handle_get(kbase k, int handle); +int kbase_wait_bo(kbase k, int handle, int64_t timeout_ns, bool wait_readers); + +/* Event waiting */ +struct kbase_wait_ctx { + kbase k; + struct timespec until; + bool has_lock; + bool has_cnd_lock; +}; + +struct kbase_wait_ctx kbase_wait_init(kbase k, int64_t timeout_ns); +/* Returns false on timeout, kbase_wait_fini must still be called */ +bool kbase_wait_for_event(struct kbase_wait_ctx *ctx); +void kbase_wait_fini(struct kbase_wait_ctx ctx); + +void kbase_ensure_handle_events(kbase k); + +bool kbase_poll_fd_until(int fd, bool wait_shared, struct timespec tp); + +/* Must not conflict with PANFROST_BO_* flags */ +#define MALI_BO_CACHED_CPU (1 << 16) +#define MALI_BO_UNCACHED_GPU (1 << 17) + +#endif diff --git a/src/panfrost/base/pan_base_noop.h b/src/panfrost/base/pan_base_noop.h new file mode 100644 index 00000000000..750a445a995 --- /dev/null +++ b/src/panfrost/base/pan_base_noop.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
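The event-wait helpers declared above follow a specific calling pattern: kbase_wait_for_event() returns true immediately on its first call so the caller's condition is checked at least once, it returns false on timeout, and kbase_wait_fini() must run on every exit path. A hedged sketch of a caller, with check_done() standing in for whatever condition is being polled:

#include <stdbool.h>
#include <stdint.h>
#include "pan_base.h"

static bool
wait_until_done(kbase k, bool (*check_done)(void *), void *data,
                int64_t timeout_ns)
{
        struct kbase_wait_ctx wait = kbase_wait_init(k, timeout_ns);
        bool done = false;

        /* kbase_wait_for_event() returns false once timeout_ns has elapsed. */
        while (!done && kbase_wait_for_event(&wait))
                done = check_done(data);

        kbase_wait_fini(wait);
        return done;
}

kbase_wait_bo() earlier in pan_base.c is the in-tree user of this pattern; the sketch merely generalises the condition being polled.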
+ */ + +#ifndef PAN_BASE_NOOP_H +#define PAN_BASE_NOOP_H + +/* For Mali-G610 as used in RK3588 */ +#define PROP(name, value) ((name << 2) | 2), value +static const uint32_t gpu_props[] = { + PROP(KBASE_GPUPROP_RAW_GPU_ID, 0xa8670000), + PROP(KBASE_GPUPROP_PRODUCT_ID, 0xa867), + PROP(KBASE_GPUPROP_RAW_SHADER_PRESENT, 0x50005), + PROP(KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, 0xc1ffff9e), + PROP(KBASE_GPUPROP_TLS_ALLOC, 0x800), + PROP(KBASE_GPUPROP_RAW_TILER_FEATURES, 0x809), +}; +#undef PROP + +#define NOOP_COOKIE_ALLOC 0x41000 +#define NOOP_COOKIE_USER_IO 0x42000 +#define NOOP_COOKIE_MEM_ALLOC 0x43000 + +static int +kbase_ioctl(int fd, unsigned long request, ...) +{ + int ret = 0; + + va_list args; + + va_start(args, request); + void *ptr = va_arg(args, void *); + va_end(args); + + switch (request) { + case KBASE_IOCTL_GET_GPUPROPS: { + struct kbase_ioctl_get_gpuprops *props = ptr; + + if (props->size) + memcpy((void *)(uintptr_t) props->buffer, + gpu_props, MIN2(props->size, sizeof(gpu_props))); + + ret = sizeof(gpu_props); + break; + } + + case KBASE_IOCTL_MEM_ALLOC: { + union kbase_ioctl_mem_alloc *alloc = ptr; + + alloc->out.gpu_va = NOOP_COOKIE_ALLOC; + alloc->out.flags = BASE_MEM_SAME_VA; + break; + } + + case KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6: { + union kbase_ioctl_cs_queue_group_create_1_6 *create = ptr; + + // TODO: Don't return duplicates? + create->out.group_handle = 0; + create->out.group_uid = 1; + break; + } + + case KBASE_IOCTL_CS_TILER_HEAP_INIT: { + union kbase_ioctl_cs_tiler_heap_init *init = ptr; + + /* The values don't really matter, the CPU has no business in accessing + * these. */ + init->out.gpu_heap_va = 0x60000; + init->out.first_chunk_va = 0x61000; + break; + } + + case KBASE_IOCTL_CS_QUEUE_BIND: { + union kbase_ioctl_cs_queue_bind *bind = ptr; + bind->out.mmap_handle = NOOP_COOKIE_USER_IO; + break; + } + + case KBASE_IOCTL_MEM_IMPORT: { + union kbase_ioctl_mem_import *import = ptr; + + if (import->in.type != BASE_MEM_IMPORT_TYPE_UMM) { + ret = -1; + errno = EINVAL; + break; + } + + int *fd = (int *)(uintptr_t) import->in.phandle; + + off_t size = lseek(*fd, 0, SEEK_END); + + import->out.flags = BASE_MEM_NEED_MMAP; + import->out.gpu_va = NOOP_COOKIE_MEM_ALLOC; + import->out.va_pages = DIV_ROUND_UP(size, 4096); + } + + case KBASE_IOCTL_SET_FLAGS: + case KBASE_IOCTL_MEM_EXEC_INIT: + case KBASE_IOCTL_MEM_JIT_INIT: + case KBASE_IOCTL_CS_QUEUE_REGISTER: + case KBASE_IOCTL_CS_QUEUE_KICK: + case KBASE_IOCTL_CS_TILER_HEAP_TERM: + case KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE: + case KBASE_IOCTL_MEM_SYNC: + break; + + default: + ret = -1; + errno = ENOSYS; + } + + return ret; +} + +static void * +kbase_mmap(void *addr, size_t length, int prot, int flags, + int fd, off_t offset) +{ + switch (offset) { + case BASE_MEM_MAP_TRACKING_HANDLE: + case BASEP_MEM_CSF_USER_REG_PAGE_HANDLE: + case NOOP_COOKIE_ALLOC: + case NOOP_COOKIE_USER_IO: + case NOOP_COOKIE_MEM_ALLOC: + return mmap(NULL, length, prot, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + default: + errno = ENOSYS; + return MAP_FAILED; + } +} +#endif diff --git a/src/panfrost/base/pan_cache.h b/src/panfrost/base/pan_cache.h new file mode 100644 index 00000000000..ad5af0c7098 --- /dev/null +++ b/src/panfrost/base/pan_cache.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to 
use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef PAN_CACHE_H +#define PAN_CACHE_H + +#ifdef __aarch64__ + +static void +cache_clean(volatile void *addr) +{ + __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); +} + +static void +cache_invalidate(volatile void *addr) +{ + __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); +} + +typedef void (*cacheline_op)(volatile void *addr); + +#define CACHELINE_SIZE 64 + +static void +cacheline_op_range(volatile void *start, size_t length, cacheline_op op) +{ + volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); + volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); + for (; ptr < end; ptr += CACHELINE_SIZE) + op(ptr); +} + +static void +cache_clean_range(volatile void *start, size_t length) +{ + /* TODO: Do an invalidate at the start of the range? */ + cacheline_op_range(start, length, cache_clean); +} + +static void +cache_invalidate_range(volatile void *start, size_t length) +{ + cacheline_op_range(start, length, cache_invalidate); +} + +#endif /* __aarch64__ */ + +/* The #ifdef covers both 32-bit and 64-bit ARM */ +#ifdef __ARM_ARCH +static void +cache_barrier(void) +{ + __asm__ volatile ("dsb sy" ::: "memory"); +} + +static void +memory_barrier(void) +{ + __asm__ volatile ("dmb sy" ::: "memory"); +} +#else + +/* TODO: How to do cache barriers when emulated? */ +static void +cache_barrier(void) +{ +} + +static void +memory_barrier(void) +{ +} +#endif +#endif diff --git a/src/panfrost/base/pan_vX_base.c b/src/panfrost/base/pan_vX_base.c new file mode 100644 index 00000000000..99bd356c536 --- /dev/null +++ b/src/panfrost/base/pan_vX_base.c @@ -0,0 +1,1825 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_VALGRIND +#include +#else +#define RUNNING_ON_VALGRIND 0 +#endif + +#include "util/macros.h" +#include "util/list.h" +#include "util/u_atomic.h" +#include "util/os_file.h" + +#include "pan_base.h" +#include "pan_cache.h" + +#include "drm-uapi/panfrost_drm.h" + +#define PAN_BASE_API (PAN_BASE_VER & 0xff) +#if (PAN_BASE_VER & 0x100) == 0x100 +#define PAN_BASE_NOOP +#endif + +#if PAN_BASE_API >= 2 +#include "csf/mali_gpu_csf_registers.h" + +#define MALI_USE_CSF 1 +#endif + +#include "mali_kbase_gpuprops.h" + +#ifndef PAN_BASE_NOOP +#define kbase_mmap mmap +#endif + +#if PAN_BASE_API >= 1 +#include "mali_base_kernel.h" +#include "mali_kbase_ioctl.h" + +#ifdef PAN_BASE_NOOP +#include "pan_base_noop.h" +#else +#define kbase_ioctl ioctl +#endif +#else + +#include "old/mali-ioctl.h" +#include "old/mali-ioctl-midgard.h" +#include "old/mali-props.h" +#endif + +#define LOG(fmt, ...) do { \ + if (k->verbose) { \ + struct timespec tp; \ + clock_gettime(CLOCK_MONOTONIC_RAW, &tp); \ + printf("%"PRIu64".%09li\t" fmt, (uint64_t) tp.tv_sec, tp.tv_nsec __VA_OPT__(,) __VA_ARGS__); \ + } \ + } while (0) + +#if PAN_BASE_API == 0 +static int +kbase_ioctl(int fd, unsigned long request, ...) +{ + int ioc_size = _IOC_SIZE(request); + + assert(ioc_size); + + va_list args; + + va_start(args, request); + int *ptr = va_arg(args, void *); + va_end(args); + + *ptr = (_IOC_TYPE(request) - 0x80) * 256 + _IOC_NR(request); + + int ret = ioctl(fd, request, ptr); + if (ret) + return ret; + + int r = *ptr; + switch (r) { + case MALI_ERROR_OUT_OF_GPU_MEMORY: + errno = ENOSPC; + return -1; + case MALI_ERROR_OUT_OF_MEMORY: + errno = ENOMEM; + return -1; + case MALI_ERROR_FUNCTION_FAILED: + errno = EINVAL; + return -1; + default: + return 0; + } +} +#endif + +#if PAN_BASE_API >= 1 +static bool +kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + int i = 0; + uint64_t x = 0; + while (i < k->gpuprops_size) { + x = 0; + memcpy(&x, k->gpuprops + i, 4); + i += 4; + + int size = 1 << (x & 3); + int this_name = x >> 2; + + x = 0; + memcpy(&x, k->gpuprops + i, size); + i += size; + + if (this_name == name) { + *value = x; + return true; + } + } + + return false; +} +#else +static bool +kbase_get_mali_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + struct kbase_ioctl_gpu_props_reg_dump *props = k->gpuprops; + + switch (name) { + case KBASE_GPUPROP_PRODUCT_ID: + *value = props->core.product_id; + return true; + case KBASE_GPUPROP_RAW_SHADER_PRESENT: + *value = props->raw.shader_present; + return true; + case KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0: + *value = props->raw.texture_features[0]; + return true; + case KBASE_GPUPROP_RAW_TILER_FEATURES: + *value = props->raw.tiler_features; + return true; + case KBASE_GPUPROP_RAW_GPU_ID: + *value = props->raw.gpu_id; + return true; + default: + return false; + } +} +#endif + +static bool +alloc_handles(kbase k) +{ + util_dynarray_init(&k->gem_handles, NULL); + return true; +} + +static bool +free_handles(kbase k) +{ + util_dynarray_fini(&k->gem_handles); + return true; +} + +static bool +set_flags(kbase k) +{ + struct kbase_ioctl_set_flags 
flags = { + .create_flags = 0 + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_SET_FLAGS, &flags); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); + return false; + } + return true; +} + +static bool +mmap_tracking(kbase k) +{ + k->tracking_region = kbase_mmap(NULL, k->page_size, PROT_NONE, + MAP_SHARED, k->fd, + BASE_MEM_MAP_TRACKING_HANDLE); + + if (k->tracking_region == MAP_FAILED) { + perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); + k->tracking_region = NULL; + return false; + } + return true; +} + +static bool +munmap_tracking(kbase k) +{ + if (k->tracking_region) + return munmap(k->tracking_region, k->page_size) == 0; + return true; +} + +#if PAN_BASE_API >= 1 +static bool +get_gpuprops(kbase k) +{ + struct kbase_ioctl_get_gpuprops props = { 0 }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); + return false; + } else if (!ret) { + fprintf(stderr, "GET_GPUPROPS returned zero size\n"); + return false; + } + + k->gpuprops_size = ret; + k->gpuprops = calloc(k->gpuprops_size, 1); + + props.size = k->gpuprops_size; + props.buffer = (uint64_t)(uintptr_t) k->gpuprops; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); + return false; + } + + return true; +} +#else +static bool +get_gpuprops(kbase k) +{ + k->gpuprops = calloc(1, sizeof(struct kbase_ioctl_gpu_props_reg_dump)); + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_GPU_PROPS_REG_DUMP, k->gpuprops); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GPU_PROPS_REG_DUMP)"); + return false; + } + + return true; +} +#endif + +static bool +free_gpuprops(kbase k) +{ + free(k->gpuprops); + return true; +} + +#if PAN_BASE_API >= 2 +static bool +mmap_user_reg(kbase k) +{ + k->csf_user_reg = kbase_mmap(NULL, k->page_size, PROT_READ, + MAP_SHARED, k->fd, + BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); + + if (k->csf_user_reg == MAP_FAILED) { + perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); + k->csf_user_reg = NULL; + return false; + } + return true; +} + +static bool +munmap_user_reg(kbase k) +{ + if (k->csf_user_reg) + return munmap(k->csf_user_reg, k->page_size) == 0; + return true; +} +#endif + +#if PAN_BASE_API >= 1 +static bool +init_mem_exec(kbase k) +{ + struct kbase_ioctl_mem_exec_init init = { + .va_pages = 0x100000, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); + return false; + } + return true; +} + +static bool +init_mem_jit(kbase k) +{ + struct kbase_ioctl_mem_jit_init init = { + .va_pages = 1 << 25, + .max_allocations = 255, + .phys_pages = 1 << 25, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_JIT_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); + return false; + } + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static struct base_ptr +kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags); + +static bool +alloc_event_mem(kbase k) +{ + k->event_mem = kbase_alloc(k, k->page_size * 2, + PANFROST_BO_NOEXEC, + BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | + BASE_MEM_SAME_VA | BASE_MEM_CSF_EVENT); + k->kcpu_event_mem = (struct base_ptr) { + .cpu = k->event_mem.cpu + k->page_size, + .gpu = k->event_mem.gpu + k->page_size, + }; + return k->event_mem.cpu; +} + +static bool +free_event_mem(kbase k) +{ + if (k->event_mem.cpu) + return munmap(k->event_mem.cpu, 
k->page_size * 2) == 0; + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static bool +cs_group_create(kbase k, struct kbase_context *c) +{ + /* TODO: What about compute-only contexts? */ + union kbase_ioctl_cs_queue_group_create_1_6 create = { + .in = { + /* Mali *still* only supports a single tiler unit */ + .tiler_mask = 1, + .fragment_mask = ~0ULL, + .compute_mask = ~0ULL, + + .cs_min = k->cs_queue_count, + + .priority = 1, + .tiler_max = 1, + .fragment_max = 64, + .compute_max = 64, + } + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); + return false; + } + + c->csg_handle = create.out.group_handle; + c->csg_uid = create.out.group_uid; + + /* Should be at least 1 */ + assert(c->csg_uid); + + return true; +} + +static bool +cs_group_term(kbase k, struct kbase_context *c) +{ + if (!c->csg_uid) + return true; + + struct kbase_ioctl_cs_queue_group_term term = { + .group_handle = c->csg_handle + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); + return false; + } + return true; +} +#endif + +#if PAN_BASE_API >= 2 +static bool +tiler_heap_create(kbase k, struct kbase_context *c) +{ + c->tiler_heap_chunk_size = 1 << 21; /* 2 MB */ + + union kbase_ioctl_cs_tiler_heap_init init = { + .in = { + .chunk_size = c->tiler_heap_chunk_size, + .initial_chunks = 5, + .max_chunks = 200, + .target_in_flight = 65535, + } + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); + return false; + } + + c->tiler_heap_va = init.out.gpu_heap_va; + c->tiler_heap_header = init.out.first_chunk_va; + + return true; +} + +static bool +tiler_heap_term(kbase k, struct kbase_context *c) +{ + if (!c->tiler_heap_va) + return true; + + struct kbase_ioctl_cs_tiler_heap_term term = { + .gpu_heap_va = c->tiler_heap_va + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); + return false; + } + return true; +} +#endif + +typedef bool (* kbase_func)(kbase k); + +struct kbase_op { + kbase_func part; + kbase_func cleanup; + const char *label; +}; + +static struct kbase_op kbase_main[] = { + { alloc_handles, free_handles, "Allocate handle array" }, +#if PAN_BASE_API >= 1 + { set_flags, NULL, "Set flags" }, +#endif + { mmap_tracking, munmap_tracking, "Map tracking handle" }, +#if PAN_BASE_API == 0 + { set_flags, NULL, "Set flags" }, +#endif + { get_gpuprops, free_gpuprops, "Get GPU properties" }, +#if PAN_BASE_API >= 2 + { mmap_user_reg, munmap_user_reg, "Map user register page" }, +#endif +#if PAN_BASE_API >= 1 + { init_mem_exec, NULL, "Initialise EXEC_VA zone" }, + { init_mem_jit, NULL, "Initialise JIT allocator" }, +#endif +#if PAN_BASE_API >= 2 + { alloc_event_mem, free_event_mem, "Allocate event memory" }, +#endif +}; + +static void +kbase_close(kbase k) +{ + while (k->setup_state) { + unsigned i = k->setup_state - 1; + if (kbase_main[i].cleanup) + kbase_main[i].cleanup(k); + --k->setup_state; + } + + pthread_mutex_destroy(&k->handle_lock); + pthread_mutex_destroy(&k->event_read_lock); + pthread_mutex_destroy(&k->event_cnd_lock); + pthread_mutex_destroy(&k->queue_lock); + pthread_cond_destroy(&k->event_cnd); + + close(k->fd); +} + +static bool +kbase_get_pan_gpuprop(kbase k, unsigned name, uint64_t *value) +{ + 
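+   /* Map the DRM_PANFROST_PARAM_* queries onto KBASE_GPUPROP_* names via a
+    * small lookup table; parameters without a table entry (AFBC features,
+    * GPU revision) are handled as special cases in the switch below. */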
unsigned conv[] = { + [DRM_PANFROST_PARAM_GPU_PROD_ID] = KBASE_GPUPROP_PRODUCT_ID, + [DRM_PANFROST_PARAM_SHADER_PRESENT] = KBASE_GPUPROP_RAW_SHADER_PRESENT, + [DRM_PANFROST_PARAM_TEXTURE_FEATURES0] = KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0, + [DRM_PANFROST_PARAM_THREAD_TLS_ALLOC] = KBASE_GPUPROP_TLS_ALLOC, + [DRM_PANFROST_PARAM_TILER_FEATURES] = KBASE_GPUPROP_RAW_TILER_FEATURES, + }; + + if (name < ARRAY_SIZE(conv) && conv[name]) + return kbase_get_mali_gpuprop(k, conv[name], value); + + switch (name) { + case DRM_PANFROST_PARAM_AFBC_FEATURES: + *value = 0; + return true; + case DRM_PANFROST_PARAM_GPU_REVISION: { + if (!kbase_get_mali_gpuprop(k, KBASE_GPUPROP_RAW_GPU_ID, value)) + return false; + *value &= 0xffff; + return true; + } + default: + return false; + } +} + +static void +kbase_free(kbase k, base_va va) +{ + struct kbase_ioctl_mem_free f = { + .gpu_addr = va + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_FREE, &f); + + if (ret == -1) + perror("ioctl(KBASE_IOCTL_MEM_FREE)"); +} + +static struct base_ptr +kbase_alloc(kbase k, size_t size, unsigned pan_flags, unsigned mali_flags) +{ + struct base_ptr r = {0}; + + unsigned pages = DIV_ROUND_UP(size, k->page_size); + + union kbase_ioctl_mem_alloc a = { + .in = { + .va_pages = pages, + .commit_pages = pages, + } + }; + + size_t alloc_size = size; + unsigned flags = mali_flags; + bool exec_align = false; + + if (!flags) { + flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | + BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | + BASE_MEM_SAME_VA; + + /* Add COHERENT_LOCAL to keep GPU cores coherent with each + * other. */ + if (PAN_BASE_API >= 1) + flags |= BASE_MEM_COHERENT_LOCAL; + } + + if (pan_flags & PANFROST_BO_HEAP) { + size_t align_size = 2 * 1024 * 1024 / k->page_size; /* 2 MB */ + + a.in.va_pages = ALIGN_POT(a.in.va_pages, align_size); + a.in.commit_pages = 0; + a.in.extension = align_size; + flags |= BASE_MEM_GROW_ON_GPF; + } + +#if PAN_BASE_API >= 1 + if (pan_flags & MALI_BO_CACHED_CPU) + flags |= BASE_MEM_CACHED_CPU; +#endif + +#if PAN_BASE_API >= 2 + if (pan_flags & MALI_BO_UNCACHED_GPU) + flags |= BASE_MEM_UNCACHED_GPU; +#endif + + if (!(pan_flags & PANFROST_BO_NOEXEC)) { + /* Using SAME_VA for executable BOs would make it too likely + * for a blend shader to end up on the wrong side of a 4 GB + * boundary. */ + flags |= BASE_MEM_PROT_GPU_EX; + flags &= ~(BASE_MEM_PROT_GPU_WR | BASE_MEM_SAME_VA); + + if (PAN_BASE_API == 0) { + /* Assume 4K pages */ + a.in.va_pages = 0x1000; /* Align shader BOs to 16 MB */ + size = 1 << 26; /* Four times the alignment */ + exec_align = true; + } + } + + a.in.flags = flags; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_ALLOC, &a); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); + return r; + } + + // TODO: Is this always true, even in the face of multithreading? + if (PAN_BASE_API == 0) + a.out.gpu_va = 0x41000; + + if ((flags & BASE_MEM_SAME_VA) && + !((a.out.flags & BASE_MEM_SAME_VA) && + a.out.gpu_va < 0x80000)) { + + fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", + (uint64_t) a.out.flags, (uint64_t) a.out.gpu_va); + errno = EINVAL; + return r; + } + + void *ptr = kbase_mmap(NULL, size, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, a.out.gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU BO)"); + kbase_free(k, a.out.gpu_va); + return r; + } + + uint64_t gpu_va = (a.out.flags & BASE_MEM_SAME_VA) ? 
+ (uintptr_t) ptr : a.out.gpu_va; + + if (exec_align) { + gpu_va = ALIGN_POT(gpu_va, 1 << 24); + + ptr = kbase_mmap(NULL, alloc_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU EXEC BO)"); + kbase_free(k, gpu_va); + return r; + } + } + + r.cpu = ptr; + r.gpu = gpu_va; + + return r; +} + +static int +kbase_import_dmabuf(kbase k, int fd) +{ + int ret; + + pthread_mutex_lock(&k->handle_lock); + + unsigned size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + + kbase_handle *handles = util_dynarray_begin(&k->gem_handles); + + for (unsigned i = 0; i < size; ++i) { + kbase_handle h = handles[i]; + + if (h.fd < 0) + continue; + + ret = os_same_file_description(h.fd, fd); + + if (ret == 0) { + pthread_mutex_unlock(&k->handle_lock); + return i; + } else if (ret < 0) { + printf("error in os_same_file_description(%i, %i)\n", h.fd, fd); + } + } + + int dup = os_dupfd_cloexec(fd); + + union kbase_ioctl_mem_import import = { + .in = { + .phandle = (uintptr_t) &dup, + .type = BASE_MEM_IMPORT_TYPE_UMM, + /* Usage flags: CPU/GPU reads/writes */ + .flags = 0xf, + } + }; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_IMPORT, &import); + + int handle; + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_IMPORT)"); + handle = -1; + } else if (import.out.flags & BASE_MEM_NEED_MMAP) { + uint64_t va = (uintptr_t) kbase_mmap(NULL, import.out.va_pages * k->page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, k->fd, import.out.gpu_va); + + if (va == (uintptr_t) MAP_FAILED) { + perror("mmap(IMPORTED BO)"); + handle = -1; + } else { + handle = kbase_alloc_gem_handle_locked(k, va, dup); + } + } else { + handle = kbase_alloc_gem_handle_locked(k, import.out.gpu_va, dup); + } + + pthread_mutex_unlock(&k->handle_lock); + + return handle; +} + +static void * +kbase_mmap_import(kbase k, base_va va, size_t size) +{ + return kbase_mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, k->fd, va); +} + +struct kbase_fence { + struct list_head link; + + unsigned slot; + uint64_t value; +}; + +struct kbase_syncobj { + struct list_head link; + + struct list_head fences; +}; + +static struct kbase_syncobj * +kbase_syncobj_create(kbase k) +{ + struct kbase_syncobj *o = calloc(1, sizeof(*o)); + list_inithead(&o->fences); + pthread_mutex_lock(&k->queue_lock); + list_add(&o->link, &k->syncobjs); + pthread_mutex_unlock(&k->queue_lock); + return o; +} + +static void +kbase_syncobj_destroy(kbase k, struct kbase_syncobj *o) +{ + pthread_mutex_lock(&k->queue_lock); + list_del(&o->link); + pthread_mutex_unlock(&k->queue_lock); + + list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { + list_del(&fence->link); + free(fence); + } + + free(o); +} + +static void +kbase_syncobj_add_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) +{ + struct kbase_fence *fence = calloc(1, sizeof(*fence)); + + fence->slot = slot; + fence->value = value; + + list_add(&fence->link, &o->fences); +} + +static void +kbase_syncobj_update_fence(struct kbase_syncobj *o, unsigned slot, uint64_t value) +{ + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { + if (fence->slot == slot) { + if (value > fence->value) + fence->value = value; + + return; + } + } + + kbase_syncobj_add_fence(o, slot, value); +} + +static struct kbase_syncobj * +kbase_syncobj_dup(kbase k, struct kbase_syncobj *o) +{ + struct kbase_syncobj *dup = kbase_syncobj_create(k); + + pthread_mutex_lock(&k->queue_lock); + + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) + 
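+      /* Copy every (slot, seqnum) fence from the source syncobj into the
+       * duplicate while queue_lock is held. */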
kbase_syncobj_add_fence(dup, fence->slot, fence->value); + + pthread_mutex_unlock(&k->queue_lock); + + return dup; +} + +static void +kbase_syncobj_update(kbase k, struct kbase_syncobj *o) +{ + list_for_each_entry_safe(struct kbase_fence, fence, &o->fences, link) { + uint64_t value = k->event_slots[fence->slot].last; + + if (value > fence->value) { + LOG("syncobj %p slot %u value %"PRIu64" vs %"PRIu64"\n", + o, fence->slot, fence->value, value); + + list_del(&fence->link); + free(fence); + } + } +} + +static bool +kbase_syncobj_wait(kbase k, struct kbase_syncobj *o) +{ + if (list_is_empty(&o->fences)) { + LOG("syncobj has no fences\n"); + return true; + } + + struct kbase_wait_ctx wait = kbase_wait_init(k, 1 * 1000000000LL); + + while (kbase_wait_for_event(&wait)) { + kbase_syncobj_update(k, o); + + if (list_is_empty(&o->fences)) { + kbase_wait_fini(wait); + return true; + } + } + + kbase_wait_fini(wait); + + fprintf(stderr, "syncobj %p wait timeout\n", o); + return false; +} + +static bool +kbase_poll_event(kbase k, int64_t timeout_ns) +{ + struct pollfd pfd = { + .fd = k->fd, + .events = POLLIN, + }; + + struct timespec t = { + .tv_sec = timeout_ns / 1000000000, + .tv_nsec = timeout_ns % 1000000000, + }; + + int ret = ppoll(&pfd, 1, &t, NULL); + + if (ret == -1 && errno != EINTR) + perror("poll(mali fd)"); + + LOG("poll returned %i\n", pfd.revents); + + return ret != 0; +} + +#if PAN_BASE_API < 2 +static bool +kbase_handle_events(kbase k) +{ + struct base_jd_event_v2 event; + bool ret = true; + + for (;;) { + int ret = read(k->fd, &event, sizeof(event)); + + if (ret == -1) { + if (errno == EAGAIN) { + return true; + } else { + perror("read(mali fd)"); + return false; + } + } + + if (event.event_code != BASE_JD_EVENT_DONE) { + fprintf(stderr, "Atom %i reported event 0x%x!\n", + event.atom_number, event.event_code); + ret = false; + } + + pthread_mutex_lock(&k->handle_lock); + + k->event_slots[event.atom_number].last = event.udata.blob[0]; + + unsigned size = util_dynarray_num_elements(&k->gem_handles, + kbase_handle); + kbase_handle *handle_data = util_dynarray_begin(&k->gem_handles); + + struct util_dynarray *handles = k->atom_bos + event.atom_number; + + util_dynarray_foreach(handles, int32_t, h) { + if (*h >= size) + continue; + assert(handle_data[*h].use_count); + --handle_data[*h].use_count; + } + util_dynarray_fini(handles); + + pthread_mutex_unlock(&k->handle_lock); + } + + return ret; +} + +#else + +static bool +kbase_read_event(kbase k) +{ + struct base_csf_notification event; + int ret = read(k->fd, &event, sizeof(event)); + + if (ret == -1) { + if (errno == EAGAIN) { + return true; + } else { + perror("read(mali_fd)"); + return false; + } + } + + if (ret != sizeof(event)) { + fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", + ret, (int) sizeof(event)); + return false; + } + + switch (event.type) { + case BASE_CSF_NOTIFICATION_EVENT: + LOG("Notification event!\n"); + return true; + + case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: + break; + + case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: + fprintf(stderr, "No event from mali_fd!\n"); + return true; + + default: + fprintf(stderr, "Unknown event type!\n"); + return true; + } + + struct base_gpu_queue_group_error e = event.payload.csg_error.error; + + switch (e.error_type) { + case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue group error: status 0x%x " + "sideband 0x%"PRIx64"\n", + e.payload.fatal_group.status, + (uint64_t) 
e.payload.fatal_group.sideband); + break; + } + case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { + unsigned queue = e.payload.fatal_queue.csi_index; + + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue %i error: status 0x%x " + "sideband 0x%"PRIx64"\n", + queue, e.payload.fatal_queue.status, + (uint64_t) e.payload.fatal_queue.sideband); + + /* TODO: Decode the instruct that it got stuck at */ + + break; + } + + case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: + fprintf(stderr, "Command stream timeout!\n"); + break; + case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: + fprintf(stderr, "Command stream OOM!\n"); + break; + default: + fprintf(stderr, "Unknown error type!\n"); + } + + return false; +} + +static void +kbase_update_queue_callbacks(kbase k, + struct kbase_event_slot *slot, + uint64_t seqnum) +{ + struct kbase_sync_link **list = &slot->syncobjs; + struct kbase_sync_link **back = slot->back; + + while (*list) { + struct kbase_sync_link *link = *list; + + LOG("seq %"PRIu64" %"PRIu64"\n", seqnum, link->seqnum); + + /* Items in the list should be in order, there is no need to + * check any more if we can't process this link yet. */ + if (seqnum <= link->seqnum) + break; + + LOG("done, calling %p(%p)\n", link->callback, link->data); + link->callback(link->data); + *list = link->next; + if (&link->next == back) + slot->back = list; + free(link); + } +} + +static bool +kbase_handle_events(kbase k) +{ +#ifdef PAN_BASE_NOOP + return true; +#endif + + /* This will clear the event count, so there's no need to do it in a + * loop. */ + bool ret = kbase_read_event(k); + + uint64_t *event_mem = k->event_mem.cpu; + + pthread_mutex_lock(&k->queue_lock); + + for (unsigned i = 0; i < k->event_slot_usage; ++i) { + uint64_t seqnum = event_mem[i * 2]; + uint64_t cmp = k->event_slots[i].last; + + LOG("MAIN SEQ %"PRIu64" > %"PRIu64"?\n", seqnum, cmp); + + if (seqnum < cmp) { + if (false) + fprintf(stderr, "seqnum at offset %i went backward " + "from %"PRIu64" to %"PRIu64"!\n", + i, cmp, seqnum); + } else /*if (seqnum > cmp)*/ { + kbase_update_queue_callbacks(k, &k->event_slots[i], + seqnum); + } + + /* TODO: Atomic operations? */ + k->event_slots[i].last = seqnum; + } + + pthread_mutex_unlock(&k->queue_lock); + + return ret; +} + +#endif + +#if PAN_BASE_API < 2 +static uint8_t +kbase_latest_slot(uint8_t a, uint8_t b, uint8_t newest) +{ + /* If a == 4 and newest == 5, a will become 255 */ + a -= newest; + b -= newest; + a = MAX2(a, b); + a += newest; + return a; +} + +static int +kbase_submit(kbase k, uint64_t va, unsigned req, + struct kbase_syncobj *o, + int32_t *handles, unsigned num_handles) +{ + struct util_dynarray buf; + util_dynarray_init(&buf, NULL); + + memcpy(util_dynarray_resize(&buf, int32_t, num_handles), + handles, num_handles * sizeof(int32_t)); + + pthread_mutex_lock(&k->handle_lock); + + unsigned slot = (req & PANFROST_JD_REQ_FS) ? 0 : 1; + unsigned dep_slots[KBASE_SLOT_COUNT]; + + uint8_t nr = k->atom_number++; + + struct base_jd_atom_v2 atom = { + .jc = va, + .atom_number = nr, + .udata.blob[0] = k->job_seq++, + }; + + for (unsigned i = 0; i < KBASE_SLOT_COUNT; ++i) + dep_slots[i] = nr; + + /* Make sure that we haven't taken an atom that's already in use. 
*/ + assert(!k->atom_bos[nr].data); + k->atom_bos[atom.atom_number] = buf; + + unsigned handle_buf_size = util_dynarray_num_elements(&k->gem_handles, kbase_handle); + kbase_handle *handle_buf = util_dynarray_begin(&k->gem_handles); + + struct util_dynarray extres; + util_dynarray_init(&extres, NULL); + + /* Mark the BOs as in use */ + for (unsigned i = 0; i < num_handles; ++i) { + int32_t h = handles[i]; + assert(h < handle_buf_size); + assert(handle_buf[h].use_count < 255); + + /* Implicit sync */ + if (handle_buf[h].use_count) + for (unsigned s = 0; s < KBASE_SLOT_COUNT; ++s) + dep_slots[s] = + kbase_latest_slot(dep_slots[s], + handle_buf[h].last_access[s], + nr); + + handle_buf[h].last_access[slot] = nr; + ++handle_buf[h].use_count; + + if (handle_buf[h].fd != -1) + util_dynarray_append(&extres, base_va, handle_buf[h].va); + } + + pthread_mutex_unlock(&k->handle_lock); + + /* TODO: Better work out the difference between handle_lock and + * queue_lock. */ + if (o) { + pthread_mutex_lock(&k->queue_lock); + kbase_syncobj_update_fence(o, nr, atom.udata.blob[0]); + pthread_mutex_unlock(&k->queue_lock); + } + + assert(KBASE_SLOT_COUNT == 2); + if (dep_slots[0] != nr) { + atom.pre_dep[0].atom_id = dep_slots[0]; + /* TODO: Use data dependencies? */ + atom.pre_dep[0].dependency_type = BASE_JD_DEP_TYPE_ORDER; + } + if (dep_slots[1] != nr) { + atom.pre_dep[1].atom_id = dep_slots[1]; + atom.pre_dep[1].dependency_type = BASE_JD_DEP_TYPE_ORDER; + } + + if (extres.size) { + atom.core_req |= BASE_JD_REQ_EXTERNAL_RESOURCES; + atom.nr_extres = util_dynarray_num_elements(&extres, base_va); + atom.extres_list = (uintptr_t) util_dynarray_begin(&extres); + } + + if (req & PANFROST_JD_REQ_FS) + atom.core_req |= BASE_JD_REQ_FS; + else + atom.core_req |= BASE_JD_REQ_CS | BASE_JD_REQ_T; + + struct kbase_ioctl_job_submit submit = { + .nr_atoms = 1, + .stride = sizeof(atom), + .addr = (uintptr_t) &atom, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_JOB_SUBMIT, &submit); + + util_dynarray_fini(&extres); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_JOB_SUBMIT)"); + return -1; + } + + return atom.atom_number; +} + +#else +static struct kbase_context * +kbase_context_create(kbase k) +{ + struct kbase_context *c = calloc(1, sizeof(*c)); + + if (!cs_group_create(k, c)) { + free(c); + return NULL; + } + + if (!tiler_heap_create(k, c)) { + cs_group_term(k, c); + free(c); + return NULL; + } + + return c; +} + +static void +kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx); + +static void +kbase_context_destroy(kbase k, struct kbase_context *ctx) +{ + kbase_kcpu_queue_destroy(k, ctx); + tiler_heap_term(k, ctx); + cs_group_term(k, ctx); + free(ctx); +} + +static bool +kbase_context_recreate(kbase k, struct kbase_context *ctx) +{ + kbase_kcpu_queue_destroy(k, ctx); + tiler_heap_term(k, ctx); + cs_group_term(k, ctx); + + if (!cs_group_create(k, ctx)) { + free(ctx); + return false; + } + + if (!tiler_heap_create(k, ctx)) { + free(ctx); + return false; + } + + return true; +} + +static struct kbase_cs +kbase_cs_bind_noevent(kbase k, struct kbase_context *ctx, + base_va va, unsigned size, unsigned csi) +{ + struct kbase_cs cs = { + .ctx = ctx, + .va = va, + .size = size, + .csi = csi, + .latest_flush = (uint32_t *)k->csf_user_reg, + }; + + struct kbase_ioctl_cs_queue_register reg = { + .buffer_gpu_addr = va, + .buffer_size = size, + .priority = 1, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_REGISTER, ®); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)"); + return cs; + } + + 
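+   /* The ring buffer is registered; now bind it to the queue group. The
+    * returned mmap_handle is used below as the mmap offset for the user
+    * I/O pages (CS_INSERT/CS_EXTRACT/CS_ACTIVE). */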
union kbase_ioctl_cs_queue_bind bind = { + .in = { + .buffer_gpu_addr = va, + .group_handle = ctx->csg_handle, + .csi_index = csi, + } + }; + + ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)"); + // hack + cs.user_io = (void *)1; + return cs; + } + + cs.user_io = + kbase_mmap(NULL, + k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES, + PROT_READ | PROT_WRITE, MAP_SHARED, + k->fd, bind.out.mmap_handle); + + if (cs.user_io == MAP_FAILED) { + perror("mmap(CS USER IO)"); + cs.user_io = NULL; + } + + return cs; +} + +static struct kbase_cs +kbase_cs_bind(kbase k, struct kbase_context *ctx, + base_va va, unsigned size) +{ + struct kbase_cs cs = kbase_cs_bind_noevent(k, ctx, va, size, ctx->num_csi++); + + // TODO: Fix this problem properly + if (k->event_slot_usage >= 256) { + fprintf(stderr, "error: Too many contexts created!\n"); + + /* *very* dangerous, but might just work */ + --k->event_slot_usage; + } + + // TODO: This is a misnomer... it isn't a byte offset + cs.event_mem_offset = k->event_slot_usage++; + k->event_slots[cs.event_mem_offset].back = + &k->event_slots[cs.event_mem_offset].syncobjs; + + uint64_t *event_data = k->event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; + + /* We use the "Higher" wait condition, so initialise to 1 to allow + * waiting before writing... */ + event_data[0] = 1; + /* And reset the error field to 0, to avoid INHERITing faults */ + event_data[1] = 0; + + /* Just a zero-init is fine... reads and writes are always paired */ + uint64_t *kcpu_data = k->kcpu_event_mem.cpu + cs.event_mem_offset * PAN_EVENT_SIZE; + kcpu_data[0] = 0; + kcpu_data[1] = 0; + + /* To match the event data */ + k->event_slots[cs.event_mem_offset].last = 1; + k->event_slots[cs.event_mem_offset].last_submit = 1; + + return cs; +} + +static void +kbase_cs_term(kbase k, struct kbase_cs *cs) +{ + if (cs->user_io) { + LOG("unmapping %p user_io %p\n", cs, cs->user_io); + munmap(cs->user_io, + k->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES); + } + + struct kbase_ioctl_cs_queue_terminate term = { + .buffer_gpu_addr = cs->va, + }; + + kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_TERMINATE, &term); + + pthread_mutex_lock(&k->queue_lock); + kbase_update_queue_callbacks(k, &k->event_slots[cs->event_mem_offset], + ~0ULL); + + k->event_slots[cs->event_mem_offset].last = ~0ULL; + + /* Make sure that no syncobjs are referencing this CS */ + list_for_each_entry(struct kbase_syncobj, o, &k->syncobjs, link) + kbase_syncobj_update(k, o); + + + k->event_slots[cs->event_mem_offset].last = 0; + pthread_mutex_unlock(&k->queue_lock); +} + +static void +kbase_cs_rebind(kbase k, struct kbase_cs *cs) +{ + struct kbase_cs new; + new = kbase_cs_bind_noevent(k, cs->ctx, cs->va, cs->size, cs->csi); + + cs->user_io = new.user_io; + LOG("remapping %p user_io %p\n", cs, cs->user_io); + + fprintf(stderr, "bound csi %i again\n", cs->csi); +} + +static bool +kbase_cs_kick(kbase k, struct kbase_cs *cs) +{ + struct kbase_ioctl_cs_queue_kick kick = { + .buffer_gpu_addr = cs->va, + }; + + int ret = kbase_ioctl(k->fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); + return false; + } + + return true; +} + +#define CS_RING_DOORBELL(cs) \ + *((uint32_t *)(cs->user_io)) = 1 + +#define CS_READ_REGISTER(cs, r) \ + *((uint64_t *)(cs->user_io + 4096 * 2 + r)) + +#define CS_WRITE_REGISTER(cs, r, v) \ + *((uint64_t *)(cs->user_io + 4096 + r)) = v + +static bool +kbase_cs_submit(kbase k, struct kbase_cs *cs, uint64_t 
insert_offset, + struct kbase_syncobj *o, uint64_t seqnum) +{ + LOG("submit %p, seq %"PRIu64", insert %"PRIu64" -> %"PRIu64"\n", + cs, seqnum, cs->last_insert, insert_offset); + + if (!cs->user_io) + return false; + + if (insert_offset == cs->last_insert) + return true; + +#ifndef PAN_BASE_NOOP + struct kbase_event_slot *slot = + &k->event_slots[cs->event_mem_offset]; + + pthread_mutex_lock(&k->queue_lock); + slot->last_submit = seqnum + 1; + + if (o) + kbase_syncobj_update_fence(o, cs->event_mem_offset, seqnum); + pthread_mutex_unlock(&k->queue_lock); +#endif + + memory_barrier(); + + bool active = CS_READ_REGISTER(cs, CS_ACTIVE); + LOG("active is %i\n", active); + + CS_WRITE_REGISTER(cs, CS_INSERT, insert_offset); + cs->last_insert = insert_offset; + + if (false /*active*/) { + memory_barrier(); + CS_RING_DOORBELL(cs); + memory_barrier(); + + active = CS_READ_REGISTER(cs, CS_ACTIVE); + LOG("active is now %i\n", active); + } else { + kbase_cs_kick(k, cs); + } + + return true; +} + +static bool +kbase_cs_wait(kbase k, struct kbase_cs *cs, uint64_t extract_offset, + struct kbase_syncobj *o) +{ + if (!cs->user_io) + return false; + + if (kbase_syncobj_wait(k, o)) + return true; + + uint64_t e = CS_READ_REGISTER(cs, CS_EXTRACT); + unsigned a = CS_READ_REGISTER(cs, CS_ACTIVE); + + fprintf(stderr, "CSI %i CS_EXTRACT (%"PRIu64") != %"PRIu64", " + "CS_ACTIVE (%i)\n", + cs->csi, e, extract_offset, a); + + fprintf(stderr, "fences:\n"); + list_for_each_entry(struct kbase_fence, fence, &o->fences, link) { + fprintf(stderr, " slot %i: seqnum %"PRIu64"\n", + fence->slot, fence->value); + } + + return false; +} + +static bool +kbase_kcpu_queue_create(kbase k, struct kbase_context *ctx) +{ +#ifdef PAN_BASE_NOOP + return false; +#endif + + if (ctx->kcpu_init) + return true; + + struct kbase_ioctl_kcpu_queue_new create = {0}; + + int ret; + ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_CREATE, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_CREATE)"); + return false; + } + + ctx->kcpu_queue = create.id; + ctx->kcpu_init = true; + return true; +} + +static void +kbase_kcpu_queue_destroy(kbase k, struct kbase_context *ctx) +{ + if (!ctx->kcpu_init) + return; + + struct kbase_ioctl_kcpu_queue_delete destroy = { + .id = ctx->kcpu_queue, + }; + + int ret; + ret = ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_DELETE, &destroy); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_DELETE)"); + } + + ctx->kcpu_init = false; +} + +static bool +kbase_kcpu_command(kbase k, struct kbase_context *ctx, struct base_kcpu_command *cmd) +{ + int err; + bool ret = true; + + if (!kbase_kcpu_queue_create(k, ctx)) + return false; + + struct kbase_ioctl_kcpu_queue_enqueue enqueue = { + .addr = (uintptr_t) cmd, + .nr_commands = 1, + .id = ctx->kcpu_queue, + }; + + err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); + if (err != -1) + return ret; + + /* If the enqueue failed, probably we hit the limit of enqueued + * commands (256), wait a bit and try again. 
+ */ + + struct kbase_wait_ctx wait = kbase_wait_init(k, 1000000000); + while (kbase_wait_for_event(&wait)) { + err = kbase_ioctl(k->fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue); + if (err != -1) + break; + + if (errno != EBUSY) { + ret = false; + perror("ioctl(KBASE_IOCTL_KCPU_QUEUE_ENQUEUE"); + break; + } + } + kbase_wait_fini(wait); + + return ret; +} + +static int +kbase_kcpu_fence_export(kbase k, struct kbase_context *ctx) +{ + struct base_fence fence = { + .basep.fd = -1, + }; + + struct base_kcpu_command fence_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + .info.fence.fence = (uintptr_t) &fence, + }; + + return kbase_kcpu_command(k, ctx, &fence_cmd) ? fence.basep.fd : -1; +} + +static bool +kbase_kcpu_fence_import(kbase k, struct kbase_context *ctx, int fd) +{ + struct base_kcpu_command fence_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + .info.fence.fence = (uintptr_t) &(struct base_fence) { + .basep.fd = fd, + }, + }; + + return kbase_kcpu_command(k, ctx, &fence_cmd); +} + +static bool +kbase_kcpu_cqs_set(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value) +{ + struct base_kcpu_command set_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + .info.cqs_set_operation = { + .objs = (uintptr_t) &(struct base_cqs_set_operation_info) { + .addr = addr, + .val = value, + .operation = BASEP_CQS_SET_OPERATION_SET, + .data_type = BASEP_CQS_DATA_TYPE_U64, + }, + .nr_objs = 1, + }, + }; + + return kbase_kcpu_command(k, ctx, &set_cmd); +} + +static bool +kbase_kcpu_cqs_wait(kbase k, struct kbase_context *ctx, + base_va addr, uint64_t value) +{ + struct base_kcpu_command wait_cmd = { + .type = BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + .info.cqs_wait_operation = { + .objs = (uintptr_t) &(struct base_cqs_wait_operation_info) { + .addr = addr, + .val = value, + .operation = BASEP_CQS_WAIT_OPERATION_GT, + .data_type = BASEP_CQS_DATA_TYPE_U64, + }, + .nr_objs = 1, + .inherit_err_flags = 0, + }, + }; + + return kbase_kcpu_command(k, ctx, &wait_cmd); +} +#endif + +// TODO: Only define for CSF kbases? +static bool +kbase_callback_all_queues(kbase k, int32_t *count, + void (*callback)(void *), void *data) +{ + pthread_mutex_lock(&k->queue_lock); + + int32_t queue_count = 0; + + for (unsigned i = 0; i < k->event_slot_usage; ++i) { + struct kbase_event_slot *slot = &k->event_slots[i]; + + /* There is no need to do anything for idle slots */ + if (slot->last == slot->last_submit) + continue; + + struct kbase_sync_link *link = malloc(sizeof(*link)); + *link = (struct kbase_sync_link) { + .next = NULL, + .seqnum = slot->last_submit, + .callback = callback, + .data = data, + }; + + // TODO: Put insertion code into its own function + struct kbase_sync_link **list = slot->back; + slot->back = &link->next; + assert(!*list); + *list = link; + + ++queue_count; + } + + p_atomic_add(count, queue_count); + + pthread_mutex_unlock(&k->queue_lock); + + return queue_count != 0; +} + +static void +kbase_mem_sync(kbase k, base_va gpu, void *cpu, size_t size, + bool invalidate) +{ +#ifdef __aarch64__ + /* Valgrind replaces the operations with DC CVAU, which is not enough + * for CPU<->GPU coherency. The ioctl can be used instead. */ + if (!RUNNING_ON_VALGRIND) { + /* I don't that memory barriers are needed here... having the + * DMB SY before submit should be enough. TODO what about + * dma-bufs? 
*/ + if (invalidate) + cache_invalidate_range(cpu, size); + else + cache_clean_range(cpu, size); + return; + } +#endif + + struct kbase_ioctl_mem_sync sync = { + .handle = gpu, + .user_addr = (uintptr_t) cpu, + .size = size, + .type = invalidate + (PAN_BASE_API == 0 ? 0 : 1), + }; + + int ret; + ret = kbase_ioctl(k->fd, KBASE_IOCTL_MEM_SYNC, &sync); + if (ret == -1) + perror("ioctl(KBASE_IOCTL_MEM_SYNC)"); +} + +bool +#if defined(PAN_BASE_NOOP) +kbase_open_csf_noop +#elif PAN_BASE_API == 0 +kbase_open_old +#elif PAN_BASE_API == 1 +kbase_open_new +#elif PAN_BASE_API == 2 +kbase_open_csf +#endif +(kbase k) +{ + k->api = PAN_BASE_API; + + pthread_mutex_init(&k->handle_lock, NULL); + pthread_mutex_init(&k->event_read_lock, NULL); + pthread_mutex_init(&k->event_cnd_lock, NULL); + pthread_mutex_init(&k->queue_lock, NULL); + + pthread_condattr_t attr; + pthread_condattr_init(&attr); + pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + pthread_cond_init(&k->event_cnd, &attr); + pthread_condattr_destroy(&attr); + + list_inithead(&k->syncobjs); + + /* For later APIs, we've already checked the version in pan_base.c */ +#if PAN_BASE_API == 0 + struct kbase_ioctl_get_version ver = { 0 }; + kbase_ioctl(k->fd, KBASE_IOCTL_GET_VERSION, &ver); +#endif + + k->close = kbase_close; + + k->get_pan_gpuprop = kbase_get_pan_gpuprop; + k->get_mali_gpuprop = kbase_get_mali_gpuprop; + + k->alloc = kbase_alloc; + k->free = kbase_free; + k->import_dmabuf = kbase_import_dmabuf; + k->mmap_import = kbase_mmap_import; + + k->poll_event = kbase_poll_event; + k->handle_events = kbase_handle_events; + +#if PAN_BASE_API < 2 + k->submit = kbase_submit; +#else + k->context_create = kbase_context_create; + k->context_destroy = kbase_context_destroy; + k->context_recreate = kbase_context_recreate; + + k->cs_bind = kbase_cs_bind; + k->cs_term = kbase_cs_term; + k->cs_rebind = kbase_cs_rebind; + k->cs_submit = kbase_cs_submit; + k->cs_wait = kbase_cs_wait; + + k->kcpu_fence_export = kbase_kcpu_fence_export; + k->kcpu_fence_import = kbase_kcpu_fence_import; + k->kcpu_cqs_set = kbase_kcpu_cqs_set; + k->kcpu_cqs_wait = kbase_kcpu_cqs_wait; +#endif + + k->syncobj_create = kbase_syncobj_create; + k->syncobj_destroy = kbase_syncobj_destroy; + k->syncobj_dup = kbase_syncobj_dup; + k->syncobj_wait = kbase_syncobj_wait; + + k->callback_all_queues = kbase_callback_all_queues; + + k->mem_sync = kbase_mem_sync; + + for (unsigned i = 0; i < ARRAY_SIZE(kbase_main); ++i) { + ++k->setup_state; + if (!kbase_main[i].part(k)) { + k->close(k); + return false; + } + } + return true; +} diff --git a/src/panfrost/ci/deqp-panfrost-g610.toml b/src/panfrost/ci/deqp-panfrost-g610.toml new file mode 100644 index 00000000000..6bad2fb44de --- /dev/null +++ b/src/panfrost/ci/deqp-panfrost-g610.toml @@ -0,0 +1,11 @@ +# Basic test set +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-master.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +version_check = "GL ES 3.1.*git" +renderer_check = "Mali-G610" diff --git a/src/panfrost/csf_test/interpret.py b/src/panfrost/csf_test/interpret.py new file mode 100755 index 00000000000..081d32d94c9 --- /dev/null +++ b/src/panfrost/csf_test/interpret.py @@ -0,0 +1,1820 @@ +#!/usr/bin/env python3 + +import os +import re +import struct +import subprocess +import sys + +try: + py_path = os.path.dirname(os.path.realpath(__file__)) + "/../bifrost/valhall" 
+except: + py_path = "../bifrost/valhall" + +if py_path not in sys.path: + sys.path.insert(0, py_path) + +import asm +import struct + +def ff(val): + return struct.unpack("=f", struct.pack("=I", val))[0] + +def ii(val): + return struct.unpack("=I", struct.pack("=f", val))[0] + +shaders = { + "atomic": """ +IADD_IMM.i32.reconverge r0, 0x0, #0x0 +NOP.wait0 +ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 +BRANCHZ.eq.reconverge ^r1.h0, offset:1 +BRANCHZ.eq 0x0, offset:3 +ATOM1_RETURN.i32.slot0.ainc @r1, u0, offset:0x0 +IADD_IMM.i32 r0, ^r0, #0x1 +BRANCHZ.eq.reconverge 0x0, offset:-7 +NOP.end +""", + "rmw": """ +IADD_IMM.i32.reconverge r0, 0x0, #0x0 +ICMP_OR.u32.ge.m1 r1, r0, u2, 0x0 +BRANCHZ.eq.reconverge r1.h0, offset:1 +BRANCHZ.eq 0x0, offset:6 +NOP.wait1 +LOAD.i32.unsigned.slot0.wait0 @r1, u0, offset:0 +IADD_IMM.i32 r1, ^r1, #0x1 +STORE.i32.slot1 @r1, u0, offset:0 +IADD_IMM.i32 r0, ^r0, #0x1 +BRANCHZ.eq.reconverge 0x0, offset:-9 +NOP.end +""", + "global_invocation": """ +IADD_IMM.i32 r0, ^r60, #0x1 +STORE.i32.slot0.end @r0, u0, offset:0 +""", + "invoc_offset": """ +LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 +IADD.s32 r0, u0, ^r0 +ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 +IADD.s32 r1, ^r1, u1 +MOV.i32 r2, u2 +STORE.i32.slot0.end @r2, ^r0, offset:0 +""", + "invoc_rmw": """ +LSHIFT_OR.i32 r0, ^r60, 0x3020100.b22, 0x0 +IADD.s32 r0, u0, ^r0 +ICMP_OR.u32.lt.i1 r1, r0, u0, 0x0 +IADD.s32 r1, ^r1, u1 +LOAD.i32.unsigned.slot0.wait0 @r2, r0, offset:0 +IADD.s32 r2, ^r2, u2 +STORE.i32.slot1.end @r2, ^r0, offset:0 +""", + + "preframe": """ +U16_TO_U32.discard r0, r59.h00 +U16_TO_U32 r1, ^r59.h10 +IADD_IMM.i32 r2, 0x0, #0x1 +IADD_IMM.i32 r3, 0x0, #0x0 +TEX_FETCH.slot0.skip.f.32.2d.wait @r4:r5:r6:r7, @r0:r1, ^r2 +FADD.f32 r4, ^r4, 0x40490FDB +FADD.f32 r5, ^r5, 0x40490FDB +BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 +""", + + + "position": """ +LEA_BUF_IMM.slot0.wait0 @r4:r5, r59, table:0xD, index:0x0 +#BRANCHZI.absolute 0x1000000, ^r4 +# position of 16384 +IADD_IMM.i32 r2, 0x0, #0x0e +# position of 16 +IADD_IMM.i32 r2, 0x0, #0x04 +LSHIFT_OR.i32 r0, 0x03020100.b1, r2, 0x0 +LSHIFT_AND.i32 r0, r60, r2, ^r0 +IADD_IMM.i32 r1, 0x0, #0x01 +RSHIFT_AND.i32 r1, r60, 0x03020100.b11, ^r1 +LSHIFT_OR.i32 r1, ^r1, ^r2, 0x0 +S32_TO_F32 r0, ^r0 +S32_TO_F32 r1, ^r1 + +RSHIFT_OR.i32 r2, ^r60, 0x03020100.b22, 0x0 +S32_TO_F32 r2, ^r2 +FADD.f32 r0, ^r0, r2.neg +#FADD.f32 r1, ^r1, ^r2 +S32_TO_F32 r2, ^r60 +#MOV.i32 r1, 0x0 + +FADD.f32 r0, ^r0, 0x40490FDB +FADD.f32 r1, ^r1, 0x40490FDB +#FMA.f32 r2, ^r2, 0x3DCCCCCD, 0x0 +MOV.i32 r2, 0x3DCCCCCD +MOV.i32 r3, 0x0 + +#STORE.i128.slot0 @r0:r1:r2:r3, thread_local_pointer, offset:0 + +IADD_IMM.i32 r8, 0x0, #0x00004000 +STORE.i16.istream.slot0 @r8, r4, offset:64 + +STORE.i128.istream.slot0 @r0:r1:r2:r3, r4, offset:0 +STORE.i128.slot0.end @r0:r1:r2:r3, ^r4, offset:0x7000 +""", + + "fragment": """ +ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, u0, offset:0 +IADD_IMM.i32 r1, 0x0, #0x1ff +LSHIFT_AND.i32 r0, ^r0, 0x0, ^r1 +SHADDX.u64 r2, u2, ^r0.w0, shift:0x2 +STORE.i32.slot0.wait0 @r59, ^r2, offset:0 + +IADD_IMM.i32 r4, 0x0, #0x3f100000 +IADD_IMM.i32 r5, 0x0, #0x3f400000 +IADD_IMM.i32 r6, 0x0, #0x3f300000 +IADD_IMM.i32 r7, 0x0, #0x32cccccd +BLEND.slot0.v4.f32.end @r4:r5:r6:r7, blend_descriptor_0.w0, r60, target:0x0 +""", + +} + +flg = 0xf +#flg = 0x20000f # Uncached! 
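+# Buffers allocated for the test: each entry in `memory` below is either a
+# plain size in bytes or a (size, flags) tuple for that allocation.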
+ +HEAP_SIZE = 1024 * 1024 + +memory = { + "ev": (8192, 0x8200f), + "x": 1024 * 1024, + "y": 4096, + "ls_alloc": 4096, + "occlusion": 4096, + + "ssbo": 4096, + "tls": 4096, + + #"plane_0": (256 * 256 * 32, 0x380f), # 2 MB + "plane_0": (256 * 256 * 32, 0x280f), # 2 MB + + "idk": HEAP_SIZE, + "heap": HEAP_SIZE, +} + +w = 0xffffffff + +# Words are 32-bit, apart from address references +descriptors = { + "shader": [0x118, 1 << 12, "invoc_rmw"], + "ls": [3, 31, "ls_alloc"], + "fau": [("ssbo", 0), ("ssbo", 16)], + "fau2": [("ev", 8 + (0 << 34)), 7, 0], + + "tiler_heap": [ + 0x029, 1 << 21, #HEAP_SIZE, + 0x1000, 0x60, 0x1040, 0x60, 0x1000 + (1 << 21), 0x60 + #"heap", ("heap", 64), ("heap", HEAP_SIZE), + ], + +} | { + x: [ + 0, 0, + # Hierarchy mask, + # Single-sampled + # Last provoking vertex + 0x6 | (0 << 18), + 0x00ff00ff, + # Layer + 0, 0, + "tiler_heap", + ("idk", 0x10), + #("tiler_heap", -0xfff0), + # "Weights" + ] + ([0] * (32 - 10)) + [ + # "State" + 0, + 31, + 0, + 0x10000000, + ] for x in ("tiler_ctx", "tiler_ctx2", "tiler_ctx3") +} | { + + "thread_storage": [ + 1, 31, + "tls", + 0, 0, + ], + + # Preload r59/r60 + "preframe_shader": [0x128, 3 << 11, "preframe"], + "position_shader": [0x138, 3 << 11, "position"], + "fragment_shader": [0x128, 3 << 11, "fragment"], + + "idvs_zs": [ + 0x70077, # Depth/stencil type, Always for stencil tests + 0, 0, # Stencil state + 0, # unk + # Depth source minimum, write disabled + # [0, 1] Depth clamp + # Depth function: Always + (1 << 23) | (7 << 29), + 0, # Depth units + 0, # Depth factor + 0, # Depth bias clamp + ], + + "preframe_zs": [ + 0x70077, # Depth/stencil type, Always for stencil tests + 0, 0, # Stencil state + 0, # unk + # Depth source minimum, write disabled + # [0, 1] Depth clamp + # Depth function: Always + (1 << 23) | (7 << 29), + 0, # Depth units + 0, # Depth factor + 0, # Depth bias clamp + ], + + "idvs_blend": [ + # Load dest, enable + 1 | (1 << 9), + # RGB/Alpha: Src + Zero * Src + # All channels + ((2 | (2 << 4) | (1 << 8)) * 0x1001) | (0xf << 28), + # Fixed function blending, four components + 2 | (3 << 3), + # RGBA8 TB pixel format / F32 register format + 0 | (237 << 12) | (0 << 22) | (1 << 24), + ], + + "preframe_blend": [ + # Load dest, enable + 1 | (1 << 9), + # RGB/Alpha: Src + Zero * Src + # All channels + ((2 | (2 << 4) | (1 << 8)) * 0x1001) | (0xf << 28), + # Fixed function blending, four components + 2 | (3 << 3), + # RGBA8 TB pixel format / F32 register format + 0 | (237 << 12) | (0 << 22) | (1 << 24), + ], + + "preframe_surface": [ + # Plane descriptor, generic, tiled, RAW32 clump format + 10 | (1 << 4) | (1 << 8) | (2 << 24), + 256 * 256 * 4, + "plane_0", + 0, + 0, 0, + 0, # was 15, + ], + + "preframe_table": [ + # Texture descriptor, 2D, format + 2 | (2 << 4) | (187 << (10 + 12)), + # Width, height + 255 | (255 << 16), + # Swizzle, interleave + 1672 | (1 << 12), + 0, + "preframe_surface", + 0, 0, + + # Sampler descriptor, clamp to edge + 1 | (9 << 8) | (9 << 12) | (9 << 16), + 0, 0, 0, 0, 0, 0, 0, + ], + + "preframe_resources": [ + ("preframe_table", (1 << (32 + 24))), 0x40, 0, + ], + + "dcds": [ + # Clean fragment write, primitive barrier + (1 << 9) | (1 << 10), + # Sample mask of 0xffff, RT mask of 1 + 0x1ffff, + 0, 0, # vertex array + 0, 0, # unk + 0, 0x3f800000, # min/max depth + 0, 0, # unk + "preframe_zs", # depth/stencil + ("preframe_blend", 1), # blend (count == 1) + 0, 0, # occlusion + + # Shader environment: + 0, # Attribute offset + 2, # FAU count + 0, 0, 0, 0, 0, 0, # unk + ("preframe_resources", 1), # 
Resources + "preframe_shader", # Shader + 0, 0, # Thread storage + "fau", # FAU + ], + + "framebuffer": [ + 1, 0, # Pre/post, downscale, layer index + 0x10000, 0, # Argument + "ls_alloc", # Sample locations + "dcds", # DCDs + 0x00ff00ff, # width / height + 0, 0x00ff00ff, # bound min/max + # 32x32 tile size + # 4096 byte buffer allocation (maybe?) + (10 << 9) | (4 << 24), + 0, # Disable S, ZS/CRC, Empty Tile, CRC + 0, # Z Clear + "tiler_ctx", # Tiler + + # Framebuffer padding + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + # Render target + # R8G8B8A8 internal format + (1 << 26), + # Write Enable + # R8G8B8A8 colour format + # Linear block format + # 0123 swizzle + # Clean pixel write enable + 1 | (19 << 3) | (1 << 8) | (0o3210 << 16) | (1 << 31), + + # AFBC overlay + # No YTR, no split, no wide, no reverse, no front, no alpha + # RGBA8 compression mode + 0 | (10 << 10), + 0, 0, 0, 0, 0, + + # RT Buffer + "plane_0", + 256 * 4 * 16, # Row stride (for tiling) + 0x400, # Surface stride / Body offset + + # RT Clear + 0x2e234589, 0, 0, 0, + ], + + "index_buffer": [ + 0, 1, 2, + 0, 2, 1, + 1, 0, 2, + 1, 2, 0, + 2, 0, 1, + 2, 1, 0, + + #63, 64, 65, + 1, 2, 3, + 4, 5, 6, + 12, 13, 14, + 0, 1, 2, + 4, 5, 6, + 8, 9, 10, + 3, 4, 5, + ], + + "point_index": [x * 4 for x in range(32)] + [ + 0, 64, 440, 0, + ], + + "position_data": [ + ii(10.0), ii(10.0), ii(1.0), ii(1.0), + ], +} + +# TODO: Use mako? Or just change the syntax for "LDM/STM" +# and use f-strings again? + +cmds = """ +!cs 0 +resources fragment + +@ Bound min +mov w2a, i16:0,0 +@ Bound max +mov w2b, i16:255,255 +mov x28, $framebuffer+1 + +slot 2 + +fragment + +mov w4a, #0x0 +UNK 02 24, #0x4a0000ff0211 +wait 1 + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!raw sleep 20 +!memset plane_0 0 0 262144 +!raw sleep 200 +!dump plane_0 0 12 +!heatmap plane_0 0 262144 gran 4096 len 32768 stride 32768 +""" + +altcmds = """ +!cs 0 + +@ Some time is required for the change to become active +@ Just submitting a second job appears to be enough +resources compute fragment tiler idvs +mov x48, #0x6000000000 +heapctx x48 + +!cs 0 + +slot 3 +wait 3 +heapinc vt_start + +@ Base vertex count +mov w24, 0 +@ Instance count +mov w22, 1 + +@ Vertex attribute stride +mov x30, 0 + +@ Primitive +mov w38, 0x430000 +@@ Draw +@ Pixel kill etc. +@ Enable occlusion query +@mov w39, 0xc000 +mov w39, 0 +@ Unk... +mov w26, 0x1000 +@ Sample mask / render target mask +mov w3a, 0x1ffff +@ Min/max Z +mov w2c, float:0 +mov w2d, float:1.0 +@ Depth/stencil +mov x34, $idvs_zs +@ Blend +mov x32, $idvs_blend+1 +@ Occlusion +mov x2e, $occlusion + +@ Primitive size +mov x3c, float:3.75 +@ Fragment shader environment +mov x14, $fragment_shader +@ FAU count == 2 +movp x0c, $fau+0x0200000000000000 + +@ Position shader environment +mov x10, $position_shader + +mov x18, $thread_storage + +@ is this right?! "Vertex attribute stride" apparently? 
+@ that was for pure tiler jobs, for idvs it messes up points/lines +@ for some reason +@mov x30, $position_data + +@ Tiler +mov x28, $tiler_ctx + +@ Scissor min +mov w2a, i16:0,0 +@ Scissor max +mov w2b, i16:255,255 + +mov w21, 18 +mov w27, 4096 +mov x36, $index_buffer + +idvs 0x4002, mode triangles, index uint32 + +mov w21, 1 @36 +mov w27, 4096 +mov x36, $point_index + +@idvs 0x4a42, mode points, index uint32 + +mov w21, 400000 +mov w21, 18 +@idvs 0x4a42, mode triangles, index none + +@idvs 0x4a42, mode points, index none +@idvs 0x4a42, mode line-loop, index none + +flush_tiler +wait 3 +heapinc vt_end + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +UNK 00 24, #0x5f0000000233 +wait all + +!dump64 tiler_heap 0 4096 +@!dump idk 0 1048576 +@!dump position_data 0 4096 + +!cs 0 + +UNK 00 24, #0x5f0000000233 +wait all + +slot 4 +wait 4 +heapinc vt_start + +mov x28, $tiler_ctx2 +idvs 0x4002, mode triangles, index none +flush_tiler +wait 4 +heapinc vt_end + +UNK 00 24, #0x5f0000000233 +wait all + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!dump64 tiler_heap 0 4096 + +!cs 0 + +mov x50, $ev + +@ Bound min +mov w2a, i16:0,0 +@ Bound max +mov w2b, i16:255,255 +mov x28, $framebuffer+1 +@ Tile enable map +mov x2c, $x +mov x2e, 64 + +mov w40, 1 +str w40, [x2c] +@str w40, [x2c, 128] + +@ Use tile enable map +@fragment tem 1 + +fragment + +@ Does this actually do anytihng? +mov x48, $tiler_ctx +ldr x4a, [x48, 40] +ldr x4c, [x48, 48] +wait 0,4 +UNK 02 0b, 0x4a4c00100001 + +mov x48, $tiler_ctx2 +ldr x4a, [x48, 40] +ldr x4c, [x48, 48] +wait 0,4 +UNK 02 0b, 0x4a4c00100001 + +UNK 02 24, #0x5f0000f80211 +@UNK 00 24, #0x5f0000000233 +wait 1 + +mov x54, $plane_0 +ldr x56, [x54] +wait 0 + +mov x52, $y +str x56, [x52] + +evstr w5f, [x50], unk 0xfd, irq + +!raw td +!fdump heap 0 1048576 +!tiler heap 0 1048576 + + +@!dump rt_buffer 0 4096 +!dump y 0 4096 +@!dump plane_0 0 524288 +@!heatmap plane_0 0 524288 gran 0x80 len 0x200 stride 0x4000 +!heatmap plane_0 0 8192 gran 0x04 len 0x20 stride 0x400 +!dump occlusion 0 4096 +@!dump ssbo 0 4096 + +!dump64 tiler_heap 0 4096 +!dump tiler_ctx 0 4096 +!dump tiler_ctx2 0 4096 + +@!fdump heap 0 1048576 + +!cs 0 + +slot 3 +wait 3 +heapinc vt_start + +mov x28, $tiler_ctx3 +mov w2c, float:0 +mov w2d, float:1.0 +mov x2e, $occlusion + +idvs 0x4002, mode triangles, index none +flush_tiler +wait 3 +heapinc vt_end + +UNK 00 24, #0x5f0000000233 +wait all + +mov x50, $ev +evstr w5f, [x50], unk 0xfd, irq + +!dump64 tiler_heap 0 4096 +!dump tiler_ctx 0 4096 +!raw td + +""" + +docopy = """ +ldr {w00-w0f}, [x52] +ldr {w10-w1f}, [x52, 64] +ldr {w20-w2f}, [x52, 128] +ldr {w30-w3f}, [x52, 192] +add x52, x52, 256 + +loop: +wait 0 + +str {w00-w0f}, [x54] +ldr {w00-w0f}, [x52] +str {w10-w1f}, [x54, 64] +ldr {w10-w1f}, [x52, 64] +str {w20-w2f}, [x54, 128] +ldr {w20-w2f}, [x52, 128] +str {w30-w3f}, [x54, 192] +ldr {w30-w3f}, [x52, 192] + +add x54, x54, 256 +add x52, x52, 256 +add x50, x50, -256 + +b.ne w50, loop +b.ne w51, loop +""" + +oldcmds = f""" +!cs 0 + +mov x50, 0x8000000 + +mov x52, $from +mov x54, $to +mov x56, $x +mov x58, $ev +mov x5a, $y + +str cycles, [x56] +{docopy} +str cycles, [x56, 8] + +UNK 00 24, #0x5f0000000233 +evstr w5f, [x58], unk 0xfd, irq + +!cs 1 + +mov x50, 0x8000000 + +mov x52, $from +mov x54, $to +mov x56, $x +mov x58, $ev +mov x5a, $y + +add x52, x52, 0x8000000 +add x54, x54, 0x8000000 +add x56, x56, 32 + +nop +nop + +str cycles, [x56] +{docopy} +str cycles, [x56, 8] + +UNK 00 24, #0x5f0000000233 +evstr w5f, [x58], unk 0xfd, irq + +!delta x 0 4096 +""" + 
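+# Illustrative sketch (not from the original patch): how the "float:" and
+# "i16:lo,hi" immediates used throughout the command strings above are
+# expected to pack into 32-bit register values. The authoritative parsing is
+# the val() helper inside Context.interpret() further down; ii() is assumed
+# here to reinterpret a float's IEEE-754 bits as an integer, matching its use
+# for position_data.
+import struct
+
+def _example_ii(x):
+    # Assumed equivalent of ii(): bit-cast a Python float to its 32-bit
+    # IEEE-754 representation.
+    return struct.unpack("<I", struct.pack("<f", x))[0]
+
+def _example_imm(word):
+    # Mirrors the immediate forms accepted by val(), for illustration only.
+    if word.startswith("float:"):
+        return _example_ii(float(word.split(":", 1)[1]))
+    if word.startswith("i16:"):
+        lo, hi = (int(v, 0) for v in word.split(":", 1)[1].split(","))
+        return (lo & 0xffff) | ((hi & 0xffff) << 16)
+    return int(word.strip("#"), 0)
+
+assert _example_imm("i16:255,255") == 0x00ff00ff  # scissor/bound max above
+assert _example_imm("float:1.0") == 0x3f800000    # max depth in the DCD
+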
+oldcmds = """ +!cs 0 +endpt compute +!cs 0 + +@ Workgroup size 1x1x1, merging allowed +mov w21, 0x80000000 + +@ Workgroup count 1x1x1 +mov w25, 1 +mov w26, 1 +mov w27, 1 + +@ Offset 0,0,0 +mov w22, 0 +mov w23, 0 +mov w24, 0 + +@ TODO: offset x/y/z + +@ Resources +mov x06, 0 + +@ Shader +mov x16, $shader + +@ Local storage +mov x1e, $ls + +@ FAU +movp x0e, $fau+0x0200000000000000 + +slot 2 +wait 2 + +UNK 0400000000008200 + +mov x58, $fau +ldr x56, [x58] +wait 0 + +@mov w4a, 0 + +@slot 6 +@mov x54, $x +@UNK 02 24, #0x4a0000f80211 +@ldr x52, [x56] +@wait 0,1 +@str x52, [x54] + +mov w40, 60 +1: add w40, w40, -1 + +@mov w4a, #0x0 +@UNK 02 24, #0x4a0000f80211 +@wait 1 + +@mov w54, #0 +@UNK 00 24, #0x540000000233 +@wait all + +slot 2 +wait 2 + +add w22, w22, 1 +@UNK 0400ff0000008200 + +@b.ne w40, 1b + +!dump x 0 4096 +!dump y 0 4096 +!dump ev 0 4096 +""" + +oldcmds = """ +!cs 0 + +mov x48, $x + +mov w21, 0x80000000 +mov w25, 1 +mov w26, 1 +mov w27, 1 + +movp x0e, $fau+0x0200000000000000 + +@ Write FAUs +@add x0e, x48, 64 +@mov x50, $ev +@str x50, [x0e] +@mov x30, 10 +@str x30, [x0e, 8] +@add w0f, w0f, 0x02000000 + +@ Write shader descriptor +@add x16, x48, 128 +@mov x30, 0x118 +@str x30, [x16] +@mov x30, $compute +@str x30, [x16, 8] + +wait 0 + +add x1e, x48, 192 + +mov x30, $y +@regdump x30 +@mov x30, 0 + +resources compute +slot 2 +mov w54, #0xffffe0 +UNK 00 24, #0x540000000233 + +wait all + +mov x54, 0 +mov w56, 0 +mov w5d, 1 + +slot 2 +wait 2 +wait 2 +regdump x30 +UNK 0400ff0000008200 +add x30, x30, 0x200 +regdump x30 +slot 2 +wait 2 + +mov w40, 1000 +1: add w40, w40, -1 +str cycles, [x50, 32] +b.ne w40, 1b + +wait 0 +wait all + +@ 6 / 10 / 14 +mov w40, 1 +1: add w40, w40, -1 +UNK 0400ff0000000200 +b.ne w40, 1b + +mov w40, 1000 +1: add w40, w40, -1 +str cycles, [x50, 32] +b.ne w40, 1b + +mov w42, 200 +mov w40, 100 +1: add w40, w40, -1 +@wait all +@UNK 0400ff0000008001 @ compute + +@UNK 0400ff0000000001 +@UNK 2501504200000004 @ evadd +@UNK 3 24, #0x4a0000000211 + +@wait all +b.ne w40, 1b + +@UNK 2601504200000004 + +str cycles, [x50, 40] +str cycles, [x50, 48] +UNK 02 24, #0x4a0000000211 +wait 0 + +add x5c, x50, 64 +evadd w5e, [x5c], unk 0xfd +evadd w5e, [x5c], unk 0xfd, irq, unk0 + +!dump x 0 4096 +!dump y 0 4096 +!delta ev 0 4096 +""" + +altcmds = """ +!cs 0 +!alloc x 4096 +!alloc ev 4096 0x8200f +!alloc ev2 4096 0x8200f + +mov x10, $x +UNK 00 30, #0x100000000000 +add x12, x10, 256 +str cycles, [x12] +mov x5a, $ev2 +mov x48, 0 +mov w4a, 0 +slot 3 +wait 3 +UNK 00 31, 0 +mov x48, $ev +mov w4a, 0x4321 +add x46, x48, 64 +mov w42, 0 + +str cycles, [x12, 8] +UNK 01 26, 0x484a00000005 +str cycles, [x12, 16] +UNK 01 26, 0x484a00000005 +str cycles, [x12, 24] + +nop + +mov w10, 10000 +1: +UNK 01 26, 0x484a00000005 +add w10, w10, -1 +b.ne w10, 1b +str cycles, [x12, 32] + +mov w10, 10000 +1: +UNK 01 26, 0x484a00000005 +@UNK 02 24, #0x420000000211 +add w10, w10, -1 +b.ne w10, 1b +str cycles, [x12, 40] + +ldr x16, [x48, 0] +wait 0 +str x16, [x48, 16] + +UNK 00 31, 0x100000000 + +mov w4a, #0x0 +UNK 02 24, #0x4a0000000211 + +mov w5e, 1 +add x5c, x5a, 0x100 +UNK 01 25, 0x5c5e00f80001 + +!delta x 0 4096 +!dump ev 0 4096 +!dump ev2 0 4096 +""" + +altcmds = """ +!cs 0 +!alloc x 4096 +!alloc ev 4096 0x8200f + +iter vertex +slot 2 + +mov x40, $x +mov w10, 1 +mov x48, 0 +mov w4a, 0 +call w4a, x48 + nop + nop + nop + mov x20, $. 
+@ movp x22, 0x0126000011223344 + movp x22, 0x1600000060000001 + str x22, [x20, 56] + 1: nop + b 1b + nop + add x40, x40, #256 + regdump x40 + +mov x5a, #0x5ff7fd6000 +mov x48, $ev +mov x40, #0x5ff7fd6000 +mov w54, #0x1 +UNK 00 24, #0x540000000233 +wait 0 +slot 6 +@UNK 00 31, #0x0 +UNK 00 09, #0x0 +wait 6 +@UNK 00 31, #0x100000000 +mov x4a, x40 +UNK 01 26, 0x484a00040001 + +!dump x 0 4096 +@!dump ev 0 4096 +@!delta x 0 4096 +""" + +cycletest = """ +mov w10, 10 +1: +str cycles, [x5c] +add x5c, x5c, 8 +add w10, w10, -1 +mov w11, 100000 + +inner: +add w11, w11, -1 +b.ne w11, inner + +b.ne w10, 1b +""" + +def get_cmds(cmd): + return cmds.replace("{cmd}", str(cmd)) + +def assemble_shader(text): + lines = text.strip().split("\n") + lines = [l for l in lines if len(l) > 0 and l[0] not in "#@"] + return [asm.parse_asm(ln) for ln in lines] + +class Buffer: + id = 0 + + def __init__(self): + self.id = Buffer.id + Buffer.id += 1 + +def resolve_rel(to, branch): + return (to - branch) // 8 - 1 + +def to_int16(value): + assert(value < 36768) + assert(value >= -32768) + return value & 0xffff + +class Level(Buffer): + def __init__(self, indent): + super().__init__() + + self.indent = indent + self.buffer = [] + self.call_addr_offset = None + self.call_len_offset = None + + self.labels = {} + self.label_refs = [] + # Numeric labels can be reused, so have to be handled specially. + self.num_labels = {} + self.num_refs = {} + + def offset(self): + return len(self.buffer) * 8 + + def __repr__(self): + buf = " ".join(hex(x) for x in self.buffer) + return f"buffer {self.id} {self.offset()} 0x200f {buf}" + + def buffer_add_value(self, offset, value): + self.buffer[offset // 8] += value + + def process_relocs(self, refs, to=None): + for ref, offset, type_ in refs: + assert(type_ == "rel") + + if to is None: + goto = self.labels[ref] + else: + goto = to + + value = to_int16(resolve_rel(goto, offset)) + self.buffer_add_value(offset, value) + + def finish(self): + self.process_relocs(self.label_refs) + +class Alloc(Buffer): + def __init__(self, size, flags=0x280f): + super().__init__() + + self.size = size + self.flags = flags + self.buffer = [] + + def __repr__(self): + buf = " ".join(hex(x) for x in self.buffer) + return f"buffer {self.id} {self.size} {hex(self.flags)} {buf}" + +def fmt_reloc(r, name="reloc"): + dst, offset, src, src_offset = r + return f"{name} {dst}+{offset} {src}+{src_offset}" + +def fmt_exe(e): + return " ".join(str(x) for x in e) + +class Context: + def __init__(self): + self.levels = [] + self.l = None + + self.allocs = {} + self.completed = [] + self.reloc = [] + self.reloc_split = [] + + self.exe = [] + self.last_exe = None + + self.is_call = False + + def set_l(self): + if len(self.levels): + self.l = self.levels[-1] + + def pop_until(self, indent): + while self.l.indent != indent: + l = self.levels.pop() + self.completed.append(l) + + self.set_l() + if not len(self.levels): + return + + buf_len = l.offset() + + r = self.l + self.reloc.append((r.id, r.call_addr_offset * 8, l.id, 0)) + r.buffer[r.call_len_offset] = ( + (r.buffer[r.call_len_offset] & (0xffff << 48)) + + buf_len) + r.buffer[r.call_addr_offset] &= (0xffff << 48) + + r.call_addr_offset = None + r.call_len_offset = None + + def flush_exe(self): + ind = self.levels[0].indent + + self.pop_until(ind) + if len(self.levels[0].buffer): + l = self.levels.pop() + l.finish() + self.completed.append(l) + + self.levels.append(Level(ind)) + self.set_l() + + if not len(self.exe): + return + + if self.last_exe is None: + print("# Trying to 
add multiple CSs to an exe line, becoming confused") + return + + if len(self.completed): + p = self.completed[-1] + assert(p.indent == ind) + + self.exe[self.last_exe] += [p.id, p.offset()] + + self.last_exe = None + + def add_shaders(self, shaders): + for sh in shaders: + qwords = assemble_shader(shaders[sh]) + sh = sh.lower() + + a = Alloc(len(qwords) * 8, flags=0x2017) + a.buffer = qwords + self.allocs[sh] = a + + def add_memory(self, memory): + for m in memory: + f = memory[m] + if isinstance(f, int): + size, flags = f, 0x280f + else: + size, flags = f + self.allocs[m] = Alloc(size, flags) + + def add_descriptors(self, descriptors): + for d in descriptors: + words = descriptors[d] + a = Alloc(0) + + buf = [] + for w in words: + if isinstance(w, int): + buf.append(w) + else: + if isinstance(w, str): + alloc, offset = w, 0 + else: + alloc, offset = w + ref = self.allocs[alloc] + self.reloc.append((a.id, len(buf) * 4, + ref.id, offset)) + buf.append(0) + buf.append(0) + + it = iter(buf) + a.buffer = [x | (y << 32) for x, y in zip(it, it)] + a.size = len(a.buffer) * 8 + self.allocs[d] = a + + def interpret(self, text): + text = text.split("\n") + + old_indent = None + + for orig_line in text: + #print(orig_line, file=sys.stderr) + + line = orig_line.split("@")[0].expandtabs().rstrip().lower() + if not line: + continue + + indent = len(line) - len(line.lstrip()) + line = line.lstrip() + + if old_indent is None: + self.levels.append(Level(indent)) + elif indent != old_indent: + if indent > old_indent: + assert(self.is_call) + + self.levels.append(Level(indent)) + else: + self.pop_until(indent) + + self.set_l() + + old_indent = indent + self.is_call = False + + given_code = None + + # TODO: Check against this to test the disassembler? + if re.match(r"[0-9a-f]{16} ", line): + given_code = int(line[:16], 16) + line = line[16:].lstrip() + + s = [x.strip(",") for x in line.split()] + + if s[0].endswith(":") or (len(s) == 1 and is_num(s[0])): + label = s[0] + if s[0].endswith(":"): + label = label[:-1] + + if is_num(label): + label = int(label) + if label in self.l.num_refs: + self.l.process_relocs(self.l.num_refs[label], self.l.offset()) + del self.l.num_refs[label] + self.l.num_labels[label] = self.l.offset() + else: + if label in self.l.labels: + print("Label reuse is not supported for non-numeric labels") + self.l.labels[label] = self.l.offset() + + s = s[1:] + if not len(s): + continue + + for i in range(len(s)): + if s[i].startswith("$"): + name, *offset = s[i][1:].split("+") + if name == ".": + buf = self.l + else: + buf = self.allocs[name] + if len(offset): + assert(len(offset) == 1) + offset = int(offset[0], 0) + else: + offset = 0 + + if s[0] == "movp": + rels = self.reloc_split + else: + rels = self.reloc + + rels.append((self.l.id, self.l.offset(), + buf.id, offset)) + s[i] = "#0x0" + + def is_num(str): + return re.fullmatch(r"[0-9]+", str) + + def hx(word): + return int(word, 16) + + def reg(word): + return hx(word[1:]) + + def val(word): + if word.startswith("float:"): + return ii(float(word.split(":")[1])) + elif word.startswith("i16:"): + lo, hi = word.split(":")[1].split(",") + lo, hi = val(lo), val(hi) + assert(lo < (1 << 16)) + assert(hi < (1 << 16)) + return (lo & 0xffff) | (hi << 16) + + value = int(word.strip("#"), 0) + assert(value < (1 << 48)) + return value + + sk = True + + if s[0] == "!cs": + assert(len(s) == 2) + self.flush_exe() + self.last_exe = len(self.exe) + self.exe.append(["exe", int(s[1])]) + continue + elif s[0] == "!parallel": + assert(len(s) == 2) + 
self.flush_exe() + self.last_exe = len(self.exe) - 1 + self.exe[-1] += [int(s[1])] + continue + elif s[0] == "!alloc": + assert(len(s) == 3 or len(s) == 4) + alloc_id = s[1] + size = int(s[2]) + flags = val(s[3]) if len(s) == 4 else 0x280f + self.allocs[alloc_id] = Alloc(size, flags) + continue + elif s[0] in ("!dump", "!dump64", "!fdump", "!delta", "!tiler"): + assert(len(s) == 4) + alloc_id = s[1] + offset = val(s[2]) + size = val(s[3]) + mode = { + "!dump": "hex", + "!dump64": "hex64", + "!fdump": "filehex", + "!delta": "delta", + "!tiler": "tiler", + }[s[0]] + self.exe.append(("dump", self.allocs[alloc_id].id, + offset, size, mode)) + continue + elif s[0] == "!heatmap": + assert(len(s) == 10) + assert(s[4] == "gran") + assert(s[6] == "len") + assert(s[8] == "stride") + alloc_id = s[1] + offset = val(s[2]) + size = val(s[3]) + granularity = val(s[5]) + length = val(s[7]) + stride = val(s[9]) + mode = "heatmap" + self.exe.append(("heatmap", self.allocs[alloc_id].id, + offset, size, granularity, length, stride)) + continue + elif s[0] == "!memset": + assert(len(s) == 5) + alloc_id = s[1] + offset = val(s[2]) + value = val(s[3]) + size = val(s[4]) + self.exe.append(("memset", self.allocs[alloc_id].id, + offset, value, size)) + continue + elif s[0] == "!raw": + self.exe.append(s[1:]) + continue + elif s[0] == "movp": + assert(len(s) == 3) + assert(s[1][0] == "x") + addr = reg(s[1]) + # Can't use val() as that has a max of 48 bits + value = int(s[2].strip("#"), 0) + + self.l.buffer.append((2 << 56) | (addr << 48) | (value & 0xffffffff)) + self.l.buffer.append((2 << 56) | ((addr + 1) << 48) + | ((value >> 32) & 0xffffffff)) + continue + elif s[0] == "regdump": + assert(len(s) == 2) + assert(s[1][0] == "x") + dest = reg(s[1]) + + # Number of registers to write per instruction + regs = 16 + + cmd = 21 + value = (dest << 40) | (((1 << regs) - 1) << 16) + + for i in range(0, 0x60, regs): + code = (cmd << 56) | (i << 48) | value | (i << 2) + self.l.buffer.append(code) + + del cmd, value + continue + + elif s[0] == "unk": + if len(s) == 2: + h = hx(s[1]) + cmd = h >> 56 + addr = (h >> 48) & 0xff + value = h & 0xffffffffffff + else: + assert(len(s) == 4) + cmd = hx(s[2]) + addr = hx(s[1]) + value = val(s[3]) + elif s[0] == "nop": + if len(s) == 1: + addr = 0 + value = 0 + cmd = 0 + else: + assert(len(s) == 3) + addr = hx(s[1]) + value = val(s[2]) + cmd = 0 + elif s[0] == "mov" and s[2][0] in "xw": + # This is actually an addition command + assert(len(s) == 3) + assert(s[1][0] == s[2][0]) + cmd = { "x": 17, "w": 16 }[s[1][0]] + addr = reg(s[1]) + value = reg(s[2]) << 40 + elif s[0] == "mov": + assert(len(s) == 3) + cmd = { "x": 1, "w": 2 }[s[1][0]] + addr = reg(s[1]) + value = val(s[2]) + elif s[0] == "add": + assert(len(s) == 4) + assert(s[1][0] == s[2][0]) + assert(s[1][0] in "wx") + cmd = 16 if s[1][0] == "w" else 17 + addr = reg(s[1]) + value = (reg(s[2]) << 40) | (val(s[3]) & 0xffffffff) + elif s[0] == "resources": + assert(len(s) >= 2) + types = ["compute", "fragment", "tiler", "idvs"] + cmd = 34 + addr = 0 + value = 0 + for t in s[1:]: + if t in types: + value |= 1 << types.index(t) + else: + value |= int(t, 0) + elif s[0] == "fragment": + cmd = 7 + addr = 0 + value = 0 + if len(s) != 1: + arg_map = { + "tem": {"0": 0, "1": 1}, + "render": { + "z_order": 0, + "horizontal": 0x10, + "vertical": 0x20, + "reverse_horizontal": 0x50, + "reverse_vertical": 0x60, + }, + "unk": {"0": 0, "1": 1 << 32}, + } + for arg, val in zip(s[1::2], s[2::2]): + value |= arg_map[arg][val] + elif s[0] == "wait": + 
assert(len(s) == 2) + cmd = 3 + addr = 0 + if s[1] == "all": + value = 255 + else: + value = sum(1 << int(x) for x in s[1].split(",")) + value <<= 16 + elif s[0] == "slot": + assert(len(s) == 2) + cmd = 23 + addr = 0 + value = int(s[1], 0) + elif s[0] == "add": + # TODO: unk variant + assert(len(s) == 4) + assert(s[1][0] == "x") + assert(s[2][0] == "x") + cmd = 17 + addr = reg(s[1]) + v = val(s[3]) + assert(v < (1 << 32)) + assert(v >= (-1 << 31)) + value = (reg(s[2]) << 40) | (v & 0xffffffff) + elif s[0] == "idvs": + assert(len(s) == 6) + unk = val(s[1]) + assert(s[2] == "mode") + modes = { + "none": 0, + "points": 1, + "lines": 2, + "line-strip": 4, + "line-loop": 6, + "triangles": 8, + "triangle-strip": 10, + "triangle-fan": 12, + "polygon": 13, + "quads": 14, + } + if s[3] in modes: + mode = modes[s[3]] + else: + mode = int(s[3]) + assert(s[4] == "index") + itypes = { + "none": 0, + "uint8": 1, + "uint16": 2, + "uint32": 3, + } + if s[5] in itypes: + index = itypes[s[5]] + else: + index = int(s[5]) + + cmd = 6 + addr = 0 + value = (unk << 32) | (index << 8) | mode + elif s[0] == "flush_tiler": + assert(len(s) == 1) + cmd = 9 + addr = 0 + value = 0 + elif s[0] == "str" and s[1] in ("cycles", "timestamp"): + assert(len(s) == 3 or len(s) == 4) + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]") for x in s] + assert(s[2][0] == "x") + + type_ = 1 if s[1] == "cycles" else 0 + dest = reg(s[2]) + if len(s) == 4: + offset = val(s[3]) + else: + offset = 0 + + cmd = 40 + addr = 0 + value = (dest << 40) | (type_ << 32) | to_int16(offset) + elif s[0] in ("ldr", "str"): + reglist = s[1] + if reglist[0] == "{": + end = [x[-1] for x in s].index("}") + reglist = s[1:end + 1] + s = s[:1] + s[end:] + + assert(len(s) == 3 or len(s) == 4) + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]") for x in s] + assert(s[2][0] == "x") + + if isinstance(reglist, str): + assert(reglist[0] in "xw") + src = reg(reglist) + mask = 3 if reglist[0] == "x" else 1 + else: + src = None + mask = 0 + + for r in ",".join(reglist).strip("{}").split(","): + r = r.split("-") + assert(len(r) in (1, 2)) + regno = [reg(x) for x in r] + + if src is None: + src = regno[0] + + if len(r) == 1: + assert(r[0][0] in "xw") + new = 3 if r[0][0] == "x" else 1 + new = (new << regno[0]) >> src + else: + assert(regno[1] > regno[0]) + new = ((2 << regno[1]) - (1 << regno[0])) >> src + + assert(new < (1 << 16)) + assert(mask & new == 0) + mask |= new + + # Name is correct for str, but inverted for ldr + # (The same holds for src above) + dest = reg(s[2]) + if len(s) == 4: + offset = val(s[3]) + else: + offset = 0 + + cmd = 20 if s[0] == "ldr" else 21 + addr = src + value = (dest << 40) | (mask << 16) | to_int16(offset) + elif s[0] == "b" or s[0].startswith("b."): + # For unconditional jumps, use w00 as a source register if it + # is not specified + if s[0] == "b" and (len(s) == 2 or + (len(s) == 3 and + s[1] in ("back", "skip"))): + s = [s[0], "w00", *s[1:]] + + assert(len(s) == 3 or (len(s) == 4 and s[2] in ("back", "skip"))) + assert(s[1][0] == "w") + + ops = { + "b.le": 0, "b.gt": 1, + "b.eq": 2, "b.ne": 3, + "b.lt": 4, "b.ge": 5, + "b": 6, "b.al": 6, + } + + src = reg(s[1]) + if len(s) == 4: + offset = val(s[3]) + if s[2] == "back": + offset = -1 - offset + else: + label = s[2] + if re.fullmatch(r"[0-9]+b", label): + label = int(label[:-1]) + assert(label in self.l.num_labels) + offset = resolve_rel(self.l.num_labels[label], + self.l.offset()) + elif re.fullmatch(r"[0-9]+f", label): + label = 
int(label[:-1]) + if label not in self.l.num_refs: + self.l.num_refs[label] = [] + self.l.num_refs[label].append((label, self.l.offset(), "rel")) + offset = 0 + else: + assert(not re.fullmatch(r"[0-9]+", label)) + self.l.label_refs.append((label, self.l.offset(), "rel")) + offset = 0 + + cmd = 22 + addr = 0 + value = (src << 40) | (ops[s[0]] << 28) | to_int16(offset) + + elif s[0] in ("evadd", "evstr"): + assert(len(s) in range(5, 8)) + assert(s[1][0] in "wx") + assert(s[2].startswith("[x")) + assert(s[2][-1] == "]") + assert(s[3] == "unk") + s = [x.strip("[]()") for x in s] + + val = reg(s[1]) + dst = reg(s[2]) + mask = hx(s[4]) + irq = "irq" not in s + unk0 = "unk0" in s + + if s[1][0] == "w": + cmd = 37 if s[0] == "evadd" else 38 + else: + cmd = 51 if s[0] == "evadd" else 52 + addr = 1 + value = ((dst << 40) | (val << 32) | (mask << 16) | + (irq << 2) | unk0) + elif s[0].split(".")[0] == "evwait": + for mod in s[0].split(".")[1:]: + assert(mod in {"lo", "hi", "inherit", "no_error"}) + assert(len(s) == 3) + assert(s[1][0] in "wx") + assert(s[2][0] == "[") + assert(s[-1][-1] == "]") + s = [x.strip("[]()") for x in s] + src = reg(s[2]) + val = reg(s[1]) + cond = 1 if ".hi" in s[0] else 0 + error = 1 if ".no_error" in s[0] else 0 + + cmd = 53 if s[1][0] == "x" else 39 + addr = 0 + value = (src << 40) | (val << 32) | (cond << 28) | error + elif s[0] in ("call", "tailcall"): + ss = [x for x in s if x.find('(') == -1 and x.find(')') == -1] + assert(len(ss) == 3) + assert(ss[1][0] == "w") + assert(ss[2][0] == "x") + cmd = { "call": 32, "tailcall": 33 }[s[0]] + addr = 0 + num = reg(ss[1]) + target = reg(ss[2]) + value = (num << 32) | (target << 40) + + l = self.l + + cur = len(l.buffer) + for ofs in range(cur - 2, cur): + if l.buffer[ofs] >> 48 == 0x100 + target: + l.call_addr_offset = ofs + if l.buffer[ofs] >> 48 == 0x200 + num: + l.call_len_offset = ofs + assert(l.call_addr_offset is not None) + assert(l.call_len_offset is not None) + + self.is_call = True + elif s[0] == "heapctx": + assert(len(s) == 2) + assert(s[1][0] == "x") + cmd = 48 + addr = 0 + value = reg(s[1]) << 40 + elif s[0] == "heapinc": + assert(len(s) == 2) + modes = { + "vt_start": 0, + "vt_end": 1, + "frag_end": 3, + } + if s[1] in modes: + mode = modes[s[1]] + else: + mode = int(s[1]) + cmd = 49 + addr = 0 + value = mode << 32 + else: + print("Unknown command:", orig_line, file=sys.stderr) + # TODO remove + cmd = 0 + addr = 0 + value = 0 + sk = False + pass + + code = (cmd << 56) | (addr << 48) | value + + if given_code and code != given_code: + print(f"Mismatch! 
{hex(code)} != {hex(given_code)}, {orig_line}") + + self.l.buffer.append(code) + + del cmd, addr, value + + if False and not sk: + print(orig_line, file=sys.stderr) + print(indent, s, hex(code) if sk else "", file=sys.stderr) + + self.pop_until(self.levels[0].indent) + self.flush_exe() + + def __repr__(self): + r = [] + r += [str(self.allocs[x]) for x in self.allocs] + r += [str(x) for x in self.completed] + r += [fmt_reloc(x) for x in self.reloc] + r += [fmt_reloc(x, name="relsplit") for x in self.reloc_split] + r += [fmt_exe(x) for x in self.exe] + return "\n".join(r) + +def interpret(text): + c = Context() + c.add_shaders(shaders) + c.add_memory(memory) + c.add_descriptors(descriptors) + c.interpret(text) + #print(str(c)) + return str(c) + +def run(text, capture=False): + if capture: + cap = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT} + else: + cap = {} + + i = interpret(text) + "\n" + + with open("/tmp/csf.cmds", "w") as f: + f.write(i) + + # TODO: Keep seperate or merge stdout/stderr? + ret = subprocess.run(["csf_test", "/dev/stdin"], + input=i, text=True, **cap) + if ret.stderr is None: + ret.stderr = "" + if ret.stdout is None: + ret.stdout = "" + return ret.stderr + ret.stdout + +def rebuild(): + try: + p = subprocess.run(["rebuild-mesa"]) + if p.returncode != 0: + return False + except FileNotFoundError: + pass + return True + +def go(text): + #print(interpret(text)) + #return + + if not rebuild(): + return + + print(run(text)) + #subprocess.run("ls /tmp/fdump.????? | tail -n2 | xargs diff -U3 -s", + # shell=True) + +os.environ["CSF_QUIET"] = "1" + +go(get_cmds("")) + +#for c in range(1, 64): +# val = c +# ret = run(get_cmds(ii(val))) +# print(str(val) + '\t' + [x for x in ret.split("\n") if x.startswith("0FFF10")][0]) + +#rebuild() +#for c in range(256): +# print(c, end=":") +# sys.stdout.flush() +# cmd = f"UNK 00 {hex(c)[2:]} 0x00000000" +# run(get_cmds(cmd)) + +#interpret(cmds) +#go(cmds) diff --git a/src/panfrost/csf_test/mali_base_csf_kernel.h b/src/panfrost/csf_test/mali_base_csf_kernel.h new file mode 100644 index 00000000000..f5f859eb9ad --- /dev/null +++ b/src/panfrost/csf_test/mali_base_csf_kernel.h @@ -0,0 +1,721 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_BASE_CSF_KERNEL_H_ +#define _UAPI_BASE_CSF_KERNEL_H_ + +#include + +/* Memory allocation, access/hint flags. + * + * See base_mem_alloc_flags. 
+ */ + +/* IN */ +/* Read access CPU side + */ +#define BASE_MEM_PROT_CPU_RD ((base_mem_alloc_flags)1 << 0) + +/* Write access CPU side + */ +#define BASE_MEM_PROT_CPU_WR ((base_mem_alloc_flags)1 << 1) + +/* Read access GPU side + */ +#define BASE_MEM_PROT_GPU_RD ((base_mem_alloc_flags)1 << 2) + +/* Write access GPU side + */ +#define BASE_MEM_PROT_GPU_WR ((base_mem_alloc_flags)1 << 3) + +/* Execute allowed on the GPU side + */ +#define BASE_MEM_PROT_GPU_EX ((base_mem_alloc_flags)1 << 4) + +/* Will be permanently mapped in kernel space. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_PERMANENT_KERNEL_MAPPING ((base_mem_alloc_flags)1 << 5) + +/* The allocation will completely reside within the same 4GB chunk in the GPU + * virtual space. + * Since this flag is primarily required only for the TLS memory which will + * not be used to contain executable code and also not used for Tiler heap, + * it can't be used along with BASE_MEM_PROT_GPU_EX and TILER_ALIGN_TOP flags. + */ +#define BASE_MEM_GPU_VA_SAME_4GB_PAGE ((base_mem_alloc_flags)1 << 6) + +/* Userspace is not allowed to free this memory. + * Flag is only allowed on allocations originating from kbase. + */ +#define BASEP_MEM_NO_USER_FREE ((base_mem_alloc_flags)1 << 7) + +#define BASE_MEM_RESERVED_BIT_8 ((base_mem_alloc_flags)1 << 8) + +/* Grow backing store on GPU Page Fault + */ +#define BASE_MEM_GROW_ON_GPF ((base_mem_alloc_flags)1 << 9) + +/* Page coherence Outer shareable, if available + */ +#define BASE_MEM_COHERENT_SYSTEM ((base_mem_alloc_flags)1 << 10) + +/* Page coherence Inner shareable + */ +#define BASE_MEM_COHERENT_LOCAL ((base_mem_alloc_flags)1 << 11) + +/* IN/OUT */ +/* Should be cached on the CPU, returned if actually cached + */ +#define BASE_MEM_CACHED_CPU ((base_mem_alloc_flags)1 << 12) + +/* IN/OUT */ +/* Must have same VA on both the GPU and the CPU + */ +#define BASE_MEM_SAME_VA ((base_mem_alloc_flags)1 << 13) + +/* OUT */ +/* Must call mmap to acquire a GPU address for the alloc + */ +#define BASE_MEM_NEED_MMAP ((base_mem_alloc_flags)1 << 14) + +/* IN */ +/* Page coherence Outer shareable, required. + */ +#define BASE_MEM_COHERENT_SYSTEM_REQUIRED ((base_mem_alloc_flags)1 << 15) + +/* Protected memory + */ +#define BASE_MEM_PROTECTED ((base_mem_alloc_flags)1 << 16) + +/* Not needed physical memory + */ +#define BASE_MEM_DONT_NEED ((base_mem_alloc_flags)1 << 17) + +/* Must use shared CPU/GPU zone (SAME_VA zone) but doesn't require the + * addresses to be the same + */ +#define BASE_MEM_IMPORT_SHARED ((base_mem_alloc_flags)1 << 18) + +/* CSF event memory + * + * If Outer shareable coherence is not specified or not available, then on + * allocation kbase will automatically use the uncached GPU mapping. + * There is no need for the client to specify BASE_MEM_UNCACHED_GPU + * themselves when allocating memory with the BASE_MEM_CSF_EVENT flag. + * + * This memory requires a permanent mapping + * + * See also kbase_reg_needs_kernel_mapping() + */ +#define BASE_MEM_CSF_EVENT ((base_mem_alloc_flags)1 << 19) + +#define BASE_MEM_RESERVED_BIT_20 ((base_mem_alloc_flags)1 << 20) + +/* Should be uncached on the GPU, will work only for GPUs using AARCH64 mmu + * mode. Some components within the GPU might only be able to access memory + * that is GPU cacheable. Refer to the specific GPU implementation for more + * details. The 3 shareability flags will be ignored for GPU uncached memory. 
+ * If used while importing USER_BUFFER type memory, then the import will fail + * if the memory is not aligned to GPU and CPU cache line width. + */ +#define BASE_MEM_UNCACHED_GPU ((base_mem_alloc_flags)1 << 21) + +/* + * Bits [22:25] for group_id (0~15). + * + * base_mem_group_id_set() should be used to pack a memory group ID into a + * base_mem_alloc_flags value instead of accessing the bits directly. + * base_mem_group_id_get() should be used to extract the memory group ID from + * a base_mem_alloc_flags value. + */ +#define BASEP_MEM_GROUP_ID_SHIFT 22 +#define BASE_MEM_GROUP_ID_MASK \ + ((base_mem_alloc_flags)0xF << BASEP_MEM_GROUP_ID_SHIFT) + +/* Must do CPU cache maintenance when imported memory is mapped/unmapped + * on GPU. Currently applicable to dma-buf type only. + */ +#define BASE_MEM_IMPORT_SYNC_ON_MAP_UNMAP ((base_mem_alloc_flags)1 << 26) + +/* OUT */ +/* Kernel side cache sync ops required */ +#define BASE_MEM_KERNEL_SYNC ((base_mem_alloc_flags)1 << 28) + +/* Number of bits used as flags for base memory management + * + * Must be kept in sync with the base_mem_alloc_flags flags + */ +#define BASE_MEM_FLAGS_NR_BITS 29 + +/* A mask of all the flags which are only valid for allocations within kbase, + * and may not be passed from user space. + */ +#define BASEP_MEM_FLAGS_KERNEL_ONLY \ + (BASEP_MEM_PERMANENT_KERNEL_MAPPING | BASEP_MEM_NO_USER_FREE) + +/* A mask for all output bits, excluding IN/OUT bits. + */ +#define BASE_MEM_FLAGS_OUTPUT_MASK BASE_MEM_NEED_MMAP + +/* A mask for all input bits, including IN/OUT bits. + */ +#define BASE_MEM_FLAGS_INPUT_MASK \ + (((1 << BASE_MEM_FLAGS_NR_BITS) - 1) & ~BASE_MEM_FLAGS_OUTPUT_MASK) + +/* A mask of all currently reserved flags + */ +#define BASE_MEM_FLAGS_RESERVED \ + BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_20 + +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) +/* reserved handles ..-47< for future special handles */ +#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_FIRST_FREE_ADDRESS \ + ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) + +#define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ + ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ + LOCAL_PAGE_SHIFT) + +/** + * Valid set of just-in-time memory allocation flags + */ +#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) + +/* Flags to pass to ::base_context_init. + * Flags can be ORed together to enable multiple things. + * + * These share the same space as BASEP_CONTEXT_FLAG_*, and so must + * not collide with them. + */ +typedef __u32 base_context_create_flags; + +/* No flags set */ +#define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) + +/* Base context is embedded in a cctx object (flag used for CINSTR + * software counter macros) + */ +#define BASE_CONTEXT_CCTX_EMBEDDED ((base_context_create_flags)1 << 0) + +/* Base context is a 'System Monitor' context for Hardware counters. + * + * One important side effect of this is that job submission is disabled. + */ +#define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED \ + ((base_context_create_flags)1 << 1) + +/* Base context creates a CSF event notification thread. 
+ * + * The creation of a CSF event notification thread is conditional but + * mandatory for the handling of CSF events. + */ +#define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) + +/* Bit-shift used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_SHIFT (3) + +/* Bitmask used to encode a memory group ID in base_context_create_flags + */ +#define BASEP_CONTEXT_MMU_GROUP_ID_MASK \ + ((base_context_create_flags)0xF << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* Bitpattern describing the base_context_create_flags that can be + * passed to the kernel + */ +#define BASEP_CONTEXT_CREATE_KERNEL_FLAGS \ + (BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED | \ + BASEP_CONTEXT_MMU_GROUP_ID_MASK) + +/* Bitpattern describing the ::base_context_create_flags that can be + * passed to base_context_init() + */ +#define BASEP_CONTEXT_CREATE_ALLOWED_FLAGS \ + (BASE_CONTEXT_CCTX_EMBEDDED | \ + BASE_CONTEXT_CSF_EVENT_THREAD | \ + BASEP_CONTEXT_CREATE_KERNEL_FLAGS) + +/* Enable additional tracepoints for latency measurements (TL_ATOM_READY, + * TL_ATOM_DONE, TL_ATOM_PRIO_CHANGE, TL_ATOM_EVENT_POST) + */ +#define BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS (1 << 0) + +/* Indicate that job dumping is enabled. This could affect certain timers + * to account for the performance impact. + */ +#define BASE_TLSTREAM_JOB_DUMPING_ENABLED (1 << 1) + +/* Enable KBase tracepoints for CSF builds */ +#define BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS (1 << 2) + +/* Enable additional CSF Firmware side tracepoints */ +#define BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS (1 << 3) + +#define BASE_TLSTREAM_FLAGS_MASK (BASE_TLSTREAM_ENABLE_LATENCY_TRACEPOINTS | \ + BASE_TLSTREAM_JOB_DUMPING_ENABLED | \ + BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS | \ + BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) + +/* Number of pages mapped into the process address space for a bound GPU + * command queue. A pair of input/output pages and a Hw doorbell page + * are mapped to enable direct submission of commands to Hw. + */ +#define BASEP_QUEUE_NR_MMAP_USER_PAGES ((size_t)3) + +#define BASE_QUEUE_MAX_PRIORITY (15U) + +/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ +#define BASEP_EVENT_VAL_INDEX (0U) +#define BASEP_EVENT_ERR_INDEX (1U) + +/* The upper limit for number of objects that could be waited/set per command. + * This limit is now enforced as internally the error inherit inputs are + * converted to 32-bit flags in a __u32 variable occupying a previously padding + * field. + */ +#define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) + +/** + * enum base_kcpu_command_type - Kernel CPU queue command type. 
+ * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, + * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, + * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, + * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, + * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, + * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, + * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, + */ +enum base_kcpu_command_type { + BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_SET, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, + BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, + BASE_KCPU_COMMAND_TYPE_JIT_FREE, + BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER +}; + +/** + * enum base_queue_group_priority - Priority of a GPU Command Queue Group. + * @BASE_QUEUE_GROUP_PRIORITY_HIGH: GPU Command Queue Group is of high + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_MEDIUM: GPU Command Queue Group is of medium + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_LOW: GPU Command Queue Group is of low + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_REALTIME: GPU Command Queue Group is of real-time + * priority. + * @BASE_QUEUE_GROUP_PRIORITY_COUNT: Number of GPU Command Queue Group + * priority levels. + * + * Currently this is in order of highest to lowest, but if new levels are added + * then those new levels may be out of order to preserve the ABI compatibility + * with previous releases. At that point, ensure assignment to + * the 'priority' member in &kbase_queue_group is updated to ensure it remains + * a linear ordering. + * + * There should be no gaps in the enum, otherwise use of + * BASE_QUEUE_GROUP_PRIORITY_COUNT in kbase must be updated. 
+ */ +enum base_queue_group_priority { + BASE_QUEUE_GROUP_PRIORITY_HIGH = 0, + BASE_QUEUE_GROUP_PRIORITY_MEDIUM, + BASE_QUEUE_GROUP_PRIORITY_LOW, + BASE_QUEUE_GROUP_PRIORITY_REALTIME, + BASE_QUEUE_GROUP_PRIORITY_COUNT +}; + +struct base_kcpu_command_fence_info { + __u64 fence; +}; + +struct base_cqs_wait_info { + __u64 addr; + __u32 val; + __u32 padding; +}; + +struct base_kcpu_command_cqs_wait_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +struct base_cqs_set { + __u64 addr; +}; + +struct base_kcpu_command_cqs_set_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * typedef basep_cqs_data_type - Enumeration of CQS Data Types + * + * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value + * is an unsigned 32-bit integer + * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value + * is an unsigned 64-bit integer + */ +typedef enum PACKED { + BASEP_CQS_DATA_TYPE_U32 = 0, + BASEP_CQS_DATA_TYPE_U64 = 1, +} basep_cqs_data_type; + +/** + * typedef basep_cqs_wait_operation_op - Enumeration of CQS Object Wait + * Operation conditions + * + * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Less than or Equal to + * the Wait Operation value + * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Greater than the Wait Operation value + */ +typedef enum { + BASEP_CQS_WAIT_OPERATION_LE = 0, + BASEP_CQS_WAIT_OPERATION_GT = 1, +} basep_cqs_wait_operation_op; + +struct base_cqs_wait_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information + * about the Timeline CQS wait objects + * + * @objs: An array of Timeline CQS waits. + * @nr_objs: Number of Timeline CQS waits in the array. + * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field + * to be served as the source for importing into the + * queue's error-state. + */ +struct base_kcpu_command_cqs_wait_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +/** + * typedef basep_cqs_set_operation_op - Enumeration of CQS Set Operations + * + * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value + * to a synchronization object + * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value + * of a synchronization object + */ +typedef enum { + BASEP_CQS_SET_OPERATION_ADD = 0, + BASEP_CQS_SET_OPERATION_SET = 1, +} basep_cqs_set_operation_op; + +struct base_cqs_set_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_set_operation_info - structure which contains information + * about the Timeline CQS set objects + * + * @objs: An array of Timeline CQS sets. + * @nr_objs: Number of Timeline CQS sets in the array. + * @padding: Structure padding, unused bytes. + */ +struct base_kcpu_command_cqs_set_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * struct base_kcpu_command_import_info - structure which contains information + * about the imported buffer. + * + * @handle: Address of imported user buffer. + */ +struct base_kcpu_command_import_info { + __u64 handle; +}; + +/** + * struct base_kcpu_command_jit_alloc_info - structure which contains + * information about jit memory allocation. 
+ * + * @info: An array of elements of the + * struct base_jit_alloc_info type. + * @count: The number of elements in the info array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_alloc_info { + __u64 info; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_jit_free_info - structure which contains + * information about jit memory which is to be freed. + * + * @ids: An array containing the JIT IDs to free. + * @count: The number of elements in the ids array. + * @padding: Padding to a multiple of 64 bits. + */ +struct base_kcpu_command_jit_free_info { + __u64 ids; + __u8 count; + __u8 padding[7]; +}; + +/** + * struct base_kcpu_command_group_suspend_info - structure which contains + * suspend buffer data captured for a suspended queue group. + * + * @buffer: Pointer to an array of elements of the type char. + * @size: Number of elements in the @buffer array. + * @group_handle: Handle to the mapping of CSG. + * @padding: padding to a multiple of 64 bits. + */ +struct base_kcpu_command_group_suspend_info { + __u64 buffer; + __u32 size; + __u8 group_handle; + __u8 padding[3]; +}; + + +/** + * struct base_kcpu_command - kcpu command. + * @type: type of the kcpu command, one enum base_kcpu_command_type + * @padding: padding to a multiple of 64 bits + * @info: structure which contains information about the kcpu command; + * actual type is determined by @p type + * @info.fence: Fence + * @info.cqs_wait: CQS wait + * @info.cqs_set: CQS set + * @info.import: import + * @info.jit_alloc: jit allocation + * @info.jit_free: jit deallocation + * @info.suspend_buf_copy: suspend buffer copy + * @info.sample_time: sample time + * @info.padding: padding + */ +struct base_kcpu_command { + __u8 type; + __u8 padding[sizeof(__u64) - sizeof(__u8)]; + union { + struct base_kcpu_command_fence_info fence; + struct base_kcpu_command_cqs_wait_info cqs_wait; + struct base_kcpu_command_cqs_set_info cqs_set; + struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; + struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; + struct base_kcpu_command_import_info import; + struct base_kcpu_command_jit_alloc_info jit_alloc; + struct base_kcpu_command_jit_free_info jit_free; + struct base_kcpu_command_group_suspend_info suspend_buf_copy; + __u64 padding[2]; /* No sub-struct should be larger */ + } info; +}; + +/** + * struct basep_cs_stream_control - CSI capabilities. + * + * @features: Features of this stream + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_stream_control { + __u32 features; + __u32 padding; +}; + +/** + * struct basep_cs_group_control - CSG interface capabilities. + * + * @features: Features of this group + * @stream_num: Number of streams in this group + * @suspend_size: Size in bytes of the suspend buffer for this group + * @padding: Padding to a multiple of 64 bits. + */ +struct basep_cs_group_control { + __u32 features; + __u32 stream_num; + __u32 suspend_size; + __u32 padding; +}; + +/** + * struct base_gpu_queue_group_error_fatal_payload - Unrecoverable fault + * error information associated with GPU command queue group. + * + * @sideband: Additional information of the unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). 
+ * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_group_error_fatal_payload { + __u64 sideband; + __u32 status; + __u32 padding; +}; + +/** + * struct base_gpu_queue_error_fatal_payload - Unrecoverable fault + * error information related to GPU command queue. + * + * @sideband: Additional information about this unrecoverable fault. + * @status: Unrecoverable fault information. + * This consists of exception type (least significant byte) and + * data (remaining bytes). One example of exception type is + * CS_INVALID_INSTRUCTION (0x49). + * @csi_index: Index of the CSF interface the queue is bound to. + * @padding: Padding to make multiple of 64bits + */ +struct base_gpu_queue_error_fatal_payload { + __u64 sideband; + __u32 status; + __u8 csi_index; + __u8 padding[3]; +}; + +/** + * enum base_gpu_queue_group_error_type - GPU Fatal error type. + * + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL: Fatal error associated with GPU + * command queue group. + * @BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: Fatal error associated with GPU + * command queue. + * @BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: Fatal error associated with + * progress timeout. + * @BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: Fatal error due to running out + * of tiler heap memory. + * @BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT: The number of fatal error types + * + * This type is used for &struct_base_gpu_queue_group_error.error_type. + */ +enum base_gpu_queue_group_error_type { + BASE_GPU_QUEUE_GROUP_ERROR_FATAL = 0, + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT, + BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM, + BASE_GPU_QUEUE_GROUP_ERROR_FATAL_COUNT +}; + +/** + * struct base_gpu_queue_group_error - Unrecoverable fault information + * @error_type: Error type of @base_gpu_queue_group_error_type + * indicating which field in union payload is filled + * @padding: Unused bytes for 64bit boundary + * @payload: Input Payload + * @payload.fatal_group: Unrecoverable fault error associated with + * GPU command queue group + * @payload.fatal_queue: Unrecoverable fault error associated with command queue + */ +struct base_gpu_queue_group_error { + __u8 error_type; + __u8 padding[7]; + union { + struct base_gpu_queue_group_error_fatal_payload fatal_group; + struct base_gpu_queue_error_fatal_payload fatal_queue; + } payload; +}; + +/** + * enum base_csf_notification_type - Notification type + * + * @BASE_CSF_NOTIFICATION_EVENT: Notification with kernel event + * @BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: Notification with GPU fatal + * error + * @BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: Notification with dumping cpu + * queue + * @BASE_CSF_NOTIFICATION_COUNT: The number of notification type + * + * This type is used for &struct_base_csf_notification.type. 
+ */ +enum base_csf_notification_type { + BASE_CSF_NOTIFICATION_EVENT = 0, + BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP, + BASE_CSF_NOTIFICATION_COUNT +}; + +/** + * struct base_csf_notification - Event or error notification + * + * @type: Notification type of @base_csf_notification_type + * @padding: Padding for 64bit boundary + * @payload: Input Payload + * @payload.align: To fit the struct into a 64-byte cache line + * @payload.csg_error: CSG error + * @payload.csg_error.handle: Handle of GPU command queue group associated with + * fatal error + * @payload.csg_error.padding: Padding + * @payload.csg_error.error: Unrecoverable fault error + * + */ +struct base_csf_notification { + __u8 type; + __u8 padding[7]; + union { + struct { + __u8 handle; + __u8 padding[7]; + struct base_gpu_queue_group_error error; + } csg_error; + + __u8 align[56]; + } payload; +}; + +#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ diff --git a/src/panfrost/csf_test/mali_base_kernel.h b/src/panfrost/csf_test/mali_base_kernel.h new file mode 100644 index 00000000000..305956f341a --- /dev/null +++ b/src/panfrost/csf_test/mali_base_kernel.h @@ -0,0 +1,746 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * Base structures shared with the kernel. + */ + +#ifndef _UAPI_BASE_KERNEL_H_ +#define _UAPI_BASE_KERNEL_H_ + +#include + +struct base_mem_handle { + struct { + __u64 handle; + } basep; +}; + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +#define BASE_MAX_COHERENT_GROUPS 16 + +#if defined(PAGE_MASK) && defined(PAGE_SHIFT) +#define LOCAL_PAGE_SHIFT PAGE_SHIFT +#define LOCAL_PAGE_LSB ~PAGE_MASK +#else +#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 +#endif + +#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) +#define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) +#else +#error Failed to find page size +#endif +#endif + +/* Physical memory group ID for normal usage. + */ +#define BASE_MEM_GROUP_DEFAULT (0) + +/* Number of physical memory groups. + */ +#define BASE_MEM_GROUP_COUNT (16) + +/** + * typedef base_mem_alloc_flags - Memory allocation, access/hint flags. + * + * A combination of MEM_PROT/MEM_HINT flags must be passed to each allocator + * in order to determine the best cache policy. Some combinations are + * of course invalid (e.g. MEM_PROT_CPU_WR | MEM_HINT_CPU_RD), + * which defines a write-only region on the CPU side, which is + * heavily read by the CPU... + * Other flags are only meaningful to a particular allocator. + * More flags can be added to this list, as long as they don't clash + * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). 
+ */ +typedef __u32 base_mem_alloc_flags; + +/* A mask for all the flags which are modifiable via the base_mem_set_flags + * interface. + */ +#define BASE_MEM_FLAGS_MODIFIABLE \ + (BASE_MEM_DONT_NEED | BASE_MEM_COHERENT_SYSTEM | \ + BASE_MEM_COHERENT_LOCAL) + +/* A mask of all the flags that can be returned via the base_mem_get_flags() + * interface. + */ +#define BASE_MEM_FLAGS_QUERYABLE \ + (BASE_MEM_FLAGS_INPUT_MASK & ~(BASE_MEM_SAME_VA | \ + BASE_MEM_COHERENT_SYSTEM_REQUIRED | BASE_MEM_DONT_NEED | \ + BASE_MEM_IMPORT_SHARED | BASE_MEM_FLAGS_RESERVED | \ + BASEP_MEM_FLAGS_KERNEL_ONLY)) + +/** + * enum base_mem_import_type - Memory types supported by @a base_mem_import + * + * @BASE_MEM_IMPORT_TYPE_INVALID: Invalid type + * @BASE_MEM_IMPORT_TYPE_UMM: UMM import. Handle type is a file descriptor (int) + * @BASE_MEM_IMPORT_TYPE_USER_BUFFER: User buffer import. Handle is a + * base_mem_import_user_buffer + * + * Each type defines what the supported handle type is. + * + * If any new type is added here ARM must be contacted + * to allocate a numeric value for it. + * Do not just add a new type without synchronizing with ARM + * as future releases from ARM might include other new types + * which could clash with your custom types. + */ +enum base_mem_import_type { + BASE_MEM_IMPORT_TYPE_INVALID = 0, + /* + * Import type with value 1 is deprecated. + */ + BASE_MEM_IMPORT_TYPE_UMM = 2, + BASE_MEM_IMPORT_TYPE_USER_BUFFER = 3 +}; + +/** + * struct base_mem_import_user_buffer - Handle of an imported user buffer + * + * @ptr: address of imported user buffer + * @length: length of imported user buffer in bytes + * + * This structure is used to represent a handle of an imported user buffer. + */ + +struct base_mem_import_user_buffer { + __u64 ptr; + __u64 length; +}; + +/* Mask to detect 4GB boundary alignment */ +#define BASE_MEM_MASK_4GB 0xfffff000UL +/* Mask to detect 4GB boundary (in page units) alignment */ +#define BASE_MEM_PFN_MASK_4GB (BASE_MEM_MASK_4GB >> LOCAL_PAGE_SHIFT) + +/* Limit on the 'extension' parameter for an allocation with the + * BASE_MEM_TILER_ALIGN_TOP flag set + * + * This is the same as the maximum limit for a Buffer Descriptor's chunk size + */ +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2 \ + (21u - (LOCAL_PAGE_SHIFT)) +#define BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES \ + (1ull << (BASE_MEM_TILER_ALIGN_TOP_EXTENSION_MAX_PAGES_LOG2)) + +/* Bit mask of cookies used for for memory allocation setup */ +#define KBASE_COOKIE_MASK ~1UL /* bit 0 is reserved */ + +/* Maximum size allowed in a single KBASE_IOCTL_MEM_ALLOC call */ +#define KBASE_MEM_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ + +/* + * struct base_fence - Cross-device synchronisation fence. + * + * A fence is used to signal when the GPU has finished accessing a resource that + * may be shared with other devices, and also to delay work done asynchronously + * by the GPU until other devices have finished accessing a shared resource. + */ +struct base_fence { + struct { + int fd; + int stream_fd; + } basep; +}; + +/** + * struct base_mem_aliasing_info - Memory aliasing info + * + * Describes a memory handle to be aliased. + * A subset of the handle can be chosen for aliasing, given an offset and a + * length. + * A special handle BASE_MEM_WRITE_ALLOC_PAGES_HANDLE is used to represent a + * region where a special page is mapped with a write-alloc cache setup, + * typically used when the write result of the GPU isn't needed, but the GPU + * must write anyway. 
+ * + * Offset and length are specified in pages. + * Offset must be within the size of the handle. + * Offset+length must not overrun the size of the handle. + * + * @handle: Handle to alias, can be BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * @offset: Offset within the handle to start aliasing from, in pages. + * Not used with BASE_MEM_WRITE_ALLOC_PAGES_HANDLE. + * @length: Length to alias, in pages. For BASE_MEM_WRITE_ALLOC_PAGES_HANDLE + * specifies the number of times the special page is needed. + */ +struct base_mem_aliasing_info { + struct base_mem_handle handle; + __u64 offset; + __u64 length; +}; + +/* Maximum percentage of just-in-time memory allocation trimming to perform + * on free. + */ +#define BASE_JIT_MAX_TRIM_LEVEL (100) + +/* Maximum number of concurrent just-in-time memory allocations. + */ +#define BASE_JIT_ALLOC_COUNT (255) + +/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5 + * + * jit_version is 1 + * + * Due to the lack of padding specified, user clients between 32 and 64-bit + * may have assumed a different size of the struct + * + * An array of structures was not supported + */ +struct base_jit_alloc_info_10_2 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; +}; + +/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up + * to 11.19 + * + * This structure had a number of modifications during and after kernel driver + * version 11.5, but remains size-compatible throughout its version history, and + * with earlier variants compatible with future variants by requiring + * zero-initialization to the unused space in the structure. + * + * jit_version is 2 + * + * Kernel driver version history: + * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes + * must be zero. Kbase minor version was not incremented, so some + * versions of 11.5 do not have this change. + * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase + * minor version not incremented) + * 11.6: Added 'flags', replacing 1 padding byte + * 11.10: Arrays of this structure are supported + */ +struct base_jit_alloc_info_11_5 { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; +}; + +/** + * struct base_jit_alloc_info - Structure which describes a JIT allocation + * request. + * @gpu_alloc_addr: The GPU virtual address to write the JIT + * allocated GPU virtual address to. + * @va_pages: The minimum number of virtual pages required. + * @commit_pages: The minimum number of physical pages which + * should back the allocation. + * @extension: Granularity of physical pages to grow the + * allocation by during a fault. + * @id: Unique ID provided by the caller, this is used + * to pair allocation and free requests. + * Zero is not a valid value. + * @bin_id: The JIT allocation bin, used in conjunction with + * @max_allocations to limit the number of each + * type of JIT allocation. + * @max_allocations: The maximum number of allocations allowed within + * the bin specified by @bin_id. Should be the same + * for all allocations within the same bin. + * @flags: flags specifying the special requirements for + * the JIT allocation, see + * %BASE_JIT_ALLOC_VALID_FLAGS + * @padding: Expansion space - should be initialised to zero + * @usage_id: A hint about which allocation should be reused. 
+ * The kernel should attempt to use a previous + * allocation with the same usage_id + * @heap_info_gpu_addr: Pointer to an object in GPU memory describing + * the actual usage of the region. + * + * jit_version is 3. + * + * When modifications are made to this structure, it is still compatible with + * jit_version 3 when: a) the size is unchanged, and b) new members only + * replace the padding bytes. + * + * Previous jit_version history: + * jit_version == 1, refer to &base_jit_alloc_info_10_2 + * jit_version == 2, refer to &base_jit_alloc_info_11_5 + * + * Kbase version history: + * 11.20: added @heap_info_gpu_addr + */ +struct base_jit_alloc_info { + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; + __u64 heap_info_gpu_addr; +}; + +enum base_external_resource_access { + BASE_EXT_RES_ACCESS_SHARED, + BASE_EXT_RES_ACCESS_EXCLUSIVE +}; + +struct base_external_resource { + __u64 ext_resource; +}; + + +/** + * The maximum number of external resources which can be mapped/unmapped + * in a single request. + */ +#define BASE_EXT_RES_COUNT_MAX 10 + +/** + * struct base_external_resource_list - Structure which describes a list of + * external resources. + * @count: The number of resources. + * @ext_res: Array of external resources which is + * sized at allocation time. + */ +struct base_external_resource_list { + __u64 count; + struct base_external_resource ext_res[1]; +}; + +struct base_jd_debug_copy_buffer { + __u64 address; + __u64 size; + struct base_external_resource extres; +}; + +#define GPU_MAX_JOB_SLOTS 16 + +/** + * User-side Base GPU Property Queries + * + * The User-side Base GPU Property Query interface encapsulates two + * sub-modules: + * + * - "Dynamic GPU Properties" + * - "Base Platform Config GPU Properties" + * + * Base only deals with properties that vary between different GPU + * implementations - the Dynamic GPU properties and the Platform Config + * properties. + * + * For properties that are constant for the GPU Architecture, refer to the + * GPU module. However, we will discuss their relevance here just to + * provide background information. + * + * About the GPU Properties in Base and GPU modules + * + * The compile-time properties (Platform Config, GPU Compile-time + * properties) are exposed as pre-processor macros. + * + * Complementing the compile-time properties are the Dynamic GPU + * Properties, which act as a conduit for the GPU Configuration + * Discovery. + * + * In general, the dynamic properties are present to verify that the platform + * has been configured correctly with the right set of Platform Config + * Compile-time Properties. + * + * As a consistent guide across the entire DDK, the choice for dynamic or + * compile-time should consider the following, in order: + * 1. Can the code be written so that it doesn't need to know the + * implementation limits at all? + * 2. If you need the limits, get the information from the Dynamic Property + * lookup. This should be done once as you fetch the context, and then cached + * as part of the context data structure, so it's cheap to access. + * 3. If there's a clear and arguable inefficiency in using Dynamic Properties, + * then use a Compile-Time Property (Platform Config, or GPU Compile-time + * property). 
Examples of where this might be sensible follow: + * - Part of a critical inner-loop + * - Frequent re-use throughout the driver, causing significant extra load + * instructions or control flow that would be worthwhile optimizing out. + * + * We cannot provide an exhaustive set of examples, neither can we provide a + * rule for every possible situation. Use common sense, and think about: what + * the rest of the driver will be doing; how the compiler might represent the + * value if it is a compile-time constant; whether an OEM shipping multiple + * devices would benefit much more from a single DDK binary, instead of + * insignificant micro-optimizations. + * + * Dynamic GPU Properties + * + * Dynamic GPU properties are presented in two sets: + * 1. the commonly used properties in @ref base_gpu_props, which have been + * unpacked from GPU register bitfields. + * 2. The full set of raw, unprocessed properties in gpu_raw_gpu_props + * (also a member of base_gpu_props). All of these are presented in + * the packed form, as presented by the GPU registers themselves. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + * The properties returned extend the GPU Configuration Discovery + * registers. For example, GPU clock speed is not specified in the GPU + * Architecture, but is necessary for OpenCL's clGetDeviceInfo() function. + * + * The GPU properties are obtained by a call to + * base_get_gpu_props(). This simply returns a pointer to a const + * base_gpu_props structure. It is constant for the life of a base + * context. Multiple calls to base_get_gpu_props() to a base context + * return the same pointer to a constant structure. This avoids cache pollution + * of the common data. + * + * This pointer must not be freed, because it does not point to the start of a + * region allocated by the memory allocator; instead, just close the @ref + * base_context. + * + * + * Kernel Operation + * + * During Base Context Create time, user-side makes a single kernel call: + * - A call to fill user memory with GPU information structures + * + * The kernel-side will fill the provided the entire processed base_gpu_props + * structure, because this information is required in both + * user and kernel side; it does not make sense to decode it twice. + * + * Coherency groups must be derived from the bitmasks, but this can be done + * kernel side, and just once at kernel startup: Coherency groups must already + * be known kernel-side, to support chains that specify a 'Only Coherent Group' + * SW requirement, or 'Only Coherent Group with Tiler' SW requirement. + * + * Coherency Group calculation + * + * Creation of the coherent group data is done at device-driver startup, and so + * is one-time. This will most likely involve a loop with CLZ, shifting, and + * bit clearing on the L2_PRESENT mask, depending on whether the + * system is L2 Coherent. The number of shader cores is done by a + * population count, since faulty cores may be disabled during production, + * producing a non-contiguous mask. 
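The population-count step described above can be sketched as follows; this is not code from the patch, only an illustration of why counting bits, rather than taking the highest set bit, is needed once faulty cores have been fused off.

#include <stdint.h>

/* Count shader cores from a possibly non-contiguous present mask. */
static unsigned int shader_core_count(uint64_t shader_present)
{
        return (unsigned int)__builtin_popcountll(shader_present);
}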
+ * + * The memory requirements for this algorithm can be determined either by a __u64 + * population count on the L2_PRESENT mask (a LUT helper already is + * required for the above), or simple assumption that there can be no more than + * 16 coherent groups, since core groups are typically 4 cores. + */ + +#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 + +#define BASE_MAX_COHERENT_GROUPS 16 +/** + * struct mali_base_gpu_core_props - GPU core props info + * @product_id: Pro specific value. + * @version_status: Status of the GPU release. No defined values, but starts at + * 0 and increases by one for each release status (alpha, beta, EAC, etc.). + * 4 bit values (0-15). + * @minor_revision: Minor release number of the GPU. "P" part of an "RnPn" + * release number. + * 8 bit values (0-255). + * @major_revision: Major release number of the GPU. "R" part of an "RnPn" + * release number. + * 4 bit values (0-15). + * @padding: padding to allign to 8-byte + * @gpu_freq_khz_max: The maximum GPU frequency. Reported to applications by + * clGetDeviceInfo() + * @log2_program_counter_size: Size of the shader program counter, in bits. + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU. This + * is a bitpattern where a set bit indicates that the format is supported. + * Before using a texture format, it is recommended that the corresponding + * bit be checked. + * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. + * It is unlikely that a client will be able to allocate all of this memory + * for their own purposes, but this at least provides an upper bound on the + * memory available to the GPU. + * This is required for OpenCL's clGetDeviceInfo() call when + * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The + * client will not be expecting to allocate anywhere near this value. + * @num_exec_engines: The number of execution engines. + */ +struct mali_base_gpu_core_props { + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; + __u8 num_exec_engines; +}; + +/* + * More information is possible - but associativity and bus width are not + * required by upper-level apis. + */ +struct mali_base_gpu_l2_cache_props { + __u8 log2_line_size; + __u8 log2_cache_size; + __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + __u8 padding[5]; +}; + +struct mali_base_gpu_tiler_props { + __u32 bin_size_bytes; /* Max is 4*2^15 */ + __u32 max_active_levels; /* Max is 2^15 */ +}; + +/** + * struct mali_base_gpu_thread_props - GPU threading system details. + * @max_threads: Max. number of threads per core + * @max_workgroup_size: Max. number of threads per workgroup + * @max_barrier_size: Max. number of threads that can synchronize on a + * simple barrier + * @max_registers: Total size [1..65535] of the register file available + * per core. + * @max_task_queue: Max. tasks [1..255] which may be sent to a core + * before it becomes blocked. + * @max_thread_group_split: Max. allowed value [1..15] of the Thread Group Split + * field. 
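Since the @texture_features comment above recommends checking the relevant bit before using a format, here is a small sketch of such a check. Treating the four TEXTURE_FEATURES registers as one contiguous bit field, and the format index 'fmt' itself, are assumptions of this sketch.

#include <stdbool.h>

static bool format_supported(const struct mali_base_gpu_core_props *props,
                             unsigned int fmt)
{
        /* 32 format bits per TEXTURE_FEATURES register */
        return props->texture_features[fmt / 32] & (1u << (fmt % 32));
}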
+ * @impl_tech: 0 = Not specified, 1 = Silicon, 2 = FPGA, + * 3 = SW Model/Emulation + * @padding: padding to allign to 8-byte + * @tls_alloc: Number of threads per core that TLS must be + * allocated for + */ +struct mali_base_gpu_thread_props { + __u32 max_threads; + __u32 max_workgroup_size; + __u32 max_barrier_size; + __u16 max_registers; + __u8 max_task_queue; + __u8 max_thread_group_split; + __u8 impl_tech; + __u8 padding[3]; + __u32 tls_alloc; +}; + +/** + * struct mali_base_gpu_coherent_group - descriptor for a coherent group + * @core_mask: Core restriction mask required for the group + * @num_cores: Number of cores in the group + * @padding: padding to allign to 8-byte + * + * \c core_mask exposes all cores in that coherent group, and \c num_cores + * provides a cached population-count for that mask. + * + * @note Whilst all cores are exposed in the mask, not all may be available to + * the application, depending on the Kernel Power policy. + * + * @note if u64s must be 8-byte aligned, then this structure has 32-bits of + * wastage. + */ +struct mali_base_gpu_coherent_group { + __u64 core_mask; + __u16 num_cores; + __u16 padding[3]; +}; + +/** + * struct mali_base_gpu_coherent_group_info - Coherency group information + * @num_groups: Number of coherent groups in the GPU. + * @num_core_groups: Number of core groups (coherent or not) in the GPU. + * Equivalent to the number of L2 Caches. + * The GPU Counter dumping writes 2048 bytes per core group, regardless + * of whether the core groups are coherent or not. Hence this member is + * needed to calculate how much memory is required for dumping. + * @note Do not use it to work out how many valid elements are in the + * group[] member. Use num_groups instead. + * @coherency: Coherency features of the memory, accessed by gpu_mem_features + * methods + * @padding: padding to allign to 8-byte + * @group: Descriptors of coherent groups + * + * Note that the sizes of the members could be reduced. However, the \c group + * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte + * aligned, thus leading to wastage if the other members sizes were reduced. + * + * The groups are sorted by core mask. The core masks are non-repeating and do + * not intersect. + */ +struct mali_base_gpu_coherent_group_info { + __u32 num_groups; + __u32 num_core_groups; + __u32 coherency; + __u32 padding; + struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; +}; + +/** + * struct gpu_raw_gpu_props - A complete description of the GPU's Hardware + * Configuration Discovery registers. + * @shader_present: Shader core present bitmap + * @tiler_present: Tiler core present bitmap + * @l2_present: Level 2 cache present bitmap + * @stack_present: Core stack present bitmap + * @l2_features: L2 features + * @core_features: Core features + * @mem_features: Mem features + * @mmu_features: Mmu features + * @as_present: Bitmap of address spaces present + * @js_present: Job slots present + * @js_features: Array of job slot features. 
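A short sketch of consuming the coherency information defined above: per the note on @num_core_groups, only num_groups entries of group[] are valid, and num_cores is already a cached population count of each core_mask.

/* Sum the cached per-group core counts (illustrative only). */
static unsigned int coherent_core_total(
        const struct mali_base_gpu_coherent_group_info *info)
{
        unsigned int i, total = 0;

        for (i = 0; i < info->num_groups; i++)
                total += info->group[i].num_cores;

        return total;
}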
+ * @tiler_features: Tiler features + * @texture_features: TEXTURE_FEATURES_x registers, as exposed by the GPU + * @gpu_id: GPU and revision identifier + * @thread_max_threads: Maximum number of threads per core + * @thread_max_workgroup_size: Maximum number of threads per workgroup + * @thread_max_barrier_size: Maximum number of threads per barrier + * @thread_features: Thread features + * @coherency_mode: Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register + * @thread_tls_alloc: Number of threads per core that TLS must be allocated for + * @gpu_features: GPU features + * + * The information is presented inefficiently for access. For frequent access, + * the values should be better expressed in an unpacked form in the + * base_gpu_props structure. + * + * The raw properties in gpu_raw_gpu_props are necessary to + * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device + * behaving differently?". In this case, all information about the + * configuration is potentially useful, but it does not need to be processed + * by the driver. Instead, the raw registers can be processed by the Mali + * Tools software on the host PC. + * + */ +struct gpu_raw_gpu_props { + __u64 shader_present; + __u64 tiler_present; + __u64 l2_present; + __u64 stack_present; + __u32 l2_features; + __u32 core_features; + __u32 mem_features; + __u32 mmu_features; + + __u32 as_present; + + __u32 js_present; + __u32 js_features[GPU_MAX_JOB_SLOTS]; + __u32 tiler_features; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + + __u32 gpu_id; + + __u32 thread_max_threads; + __u32 thread_max_workgroup_size; + __u32 thread_max_barrier_size; + __u32 thread_features; + + /* + * Note: This is the _selected_ coherency mode rather than the + * available modes as exposed in the coherency_features register. + */ + __u32 coherency_mode; + + __u32 thread_tls_alloc; + __u64 gpu_features; +}; + +/** + * struct base_gpu_props - Return structure for base_get_gpu_props(). + * @core_props: Core props. + * @l2_props: L2 props. + * @unused_1: Keep for backwards compatibility. + * @tiler_props: Tiler props. + * @thread_props: Thread props. + * @raw_props: This member is large, likely to be 128 bytes. + * @coherency_info: This must be last member of the structure. + * + * NOTE: the raw_props member in this data structure contains the register + * values from which the value of the other members are derived. The derived + * members exist to allow for efficient access and/or shielding the details + * of the layout of the registers. + */ +struct base_gpu_props { + struct mali_base_gpu_core_props core_props; + struct mali_base_gpu_l2_cache_props l2_props; + __u64 unused_1; + struct mali_base_gpu_tiler_props tiler_props; + struct mali_base_gpu_thread_props thread_props; + struct gpu_raw_gpu_props raw_props; + struct mali_base_gpu_coherent_group_info coherency_info; +}; + +#define BASE_MEM_GROUP_ID_GET(flags) \ + ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) + +#define BASE_MEM_GROUP_ID_SET(id) \ + (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? 
\ + BASE_MEM_GROUP_DEFAULT : \ + id) \ + << BASEP_MEM_GROUP_ID_SHIFT) & \ + BASE_MEM_GROUP_ID_MASK) + +#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ + (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ + ((base_context_create_flags)(group_id) \ + << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) + +#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ + ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ + BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) + +/* + * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These + * flags are also used, where applicable, for specifying which fields + * are valid following the request operation. + */ + +/* For monotonic (counter) timefield */ +#define BASE_TIMEINFO_MONOTONIC_FLAG (1UL << 0) +/* For system wide timestamp */ +#define BASE_TIMEINFO_TIMESTAMP_FLAG (1UL << 1) +/* For GPU cycle counter */ +#define BASE_TIMEINFO_CYCLE_COUNTER_FLAG (1UL << 2) +/* Specify kernel GPU register timestamp */ +#define BASE_TIMEINFO_KERNEL_SOURCE_FLAG (1UL << 30) +/* Specify userspace cntvct_el0 timestamp source */ +#define BASE_TIMEINFO_USER_SOURCE_FLAG (1UL << 31) + +#define BASE_TIMEREQUEST_ALLOWED_FLAGS (\ + BASE_TIMEINFO_MONOTONIC_FLAG | \ + BASE_TIMEINFO_TIMESTAMP_FLAG | \ + BASE_TIMEINFO_CYCLE_COUNTER_FLAG | \ + BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ + BASE_TIMEINFO_USER_SOURCE_FLAG) + +/* Maximum number of source allocations allowed to create an alias allocation. + * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array + * layers, since each cube map in the array will have 6 faces. + */ +#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) + +#endif /* _UAPI_BASE_KERNEL_H_ */ diff --git a/src/panfrost/csf_test/mali_gpu_csf_registers.h b/src/panfrost/csf_test/mali_gpu_csf_registers.h new file mode 100644 index 00000000000..17e338cb238 --- /dev/null +++ b/src/panfrost/csf_test/mali_gpu_csf_registers.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * This header was originally autogenerated, but it is now ok (and + * expected) to have to add to it. + */ + +#ifndef _UAPI_GPU_CSF_REGISTERS_H_ +#define _UAPI_GPU_CSF_REGISTERS_H_ + +/* Only user block defines are included. 
HI words have been removed */ + +/* CS_USER_INPUT_BLOCK register offsets */ +#define CS_INSERT 0x0000 /* () Current insert offset for ring buffer, low word */ +#define CS_EXTRACT_INIT 0x0008 /* () Initial extract offset for ring buffer, low word */ + +/* CS_USER_OUTPUT_BLOCK register offsets */ +#define CS_EXTRACT 0x0000 /* () Current extract offset for ring buffer, low word */ +#define CS_ACTIVE 0x0008 /* () Initial extract offset when the CS is started */ + +/* USER register offsets */ +#define LATEST_FLUSH 0x0000 /* () Flush ID of latest clean-and-invalidate operation */ + +#endif diff --git a/src/panfrost/csf_test/mali_kbase_csf_ioctl.h b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h new file mode 100644 index 00000000000..3df8a01699f --- /dev/null +++ b/src/panfrost/csf_test/mali_kbase_csf_ioctl.h @@ -0,0 +1,483 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_CSF_IOCTL_H_ +#define _UAPI_KBASE_CSF_IOCTL_H_ + +#include +#include + +/* + * 1.0: + * - CSF IOCTL header separated from JM + * 1.1: + * - Add a new priority level BASE_QUEUE_GROUP_PRIORITY_REALTIME + * - Add ioctl 54: This controls the priority setting. + * 1.2: + * - Add new CSF GPU_FEATURES register into the property structure + * returned by KBASE_IOCTL_GET_GPUPROPS + * 1.3: + * - Add __u32 group_uid member to + * &struct_kbase_ioctl_cs_queue_group_create.out + * 1.4: + * - Replace padding in kbase_ioctl_cs_get_glb_iface with + * instr_features member of same size + * 1.5: + * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new + * queue registration call with extended format for supporting CS + * trace configurations with CSF trace_command. + * 1.6: + * - Added new HW performance counters interface to all GPUs. + * 1.7: + * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use + * 1.8: + * - Removed Kernel legacy HWC interface + */ + +#define BASE_UK_VERSION_MAJOR 1 +#define BASE_UK_VERSION_MINOR 8 + +/** + * struct kbase_ioctl_version_check - Check version compatibility between + * kernel and userspace + * + * @major: Major version number + * @minor: Minor version number + */ +struct kbase_ioctl_version_check { + __u16 major; + __u16 minor; +}; + +#define KBASE_IOCTL_VERSION_CHECK_RESERVED \ + _IOWR(KBASE_IOCTL_TYPE, 0, struct kbase_ioctl_version_check) + + +/** + * struct kbase_ioctl_cs_queue_register - Register a GPU command queue with the + * base back-end + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * + * @Note: There is an identical sub-section in kbase_ioctl_cs_queue_register_ex. 
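Referring back to the CS_INSERT / CS_EXTRACT user-block offsets defined just above: once a queue has been bound (see the bind ioctl below) and its input/output pages mapped, userspace typically publishes new work by advancing the insert offset and polls the extract offset for progress. The page pointers and the 64-bit accesses here are assumptions of this sketch, not something this header specifies.

#include <stdint.h>

static void cs_ring_update(volatile uint8_t *input_page,
                           const volatile uint8_t *output_page,
                           uint64_t insert_offset, uint64_t *extract_offset)
{
        /* Tell the CS how far commands have been written... */
        *(volatile uint64_t *)(input_page + CS_INSERT) = insert_offset;
        /* ...and read back how far it has consumed them. */
        *extract_offset = *(const volatile uint64_t *)(output_page + CS_EXTRACT);
}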
+ * Any change of this struct should also be mirrored to the latter. + */ +struct kbase_ioctl_cs_queue_register { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER \ + _IOW(KBASE_IOCTL_TYPE, 36, struct kbase_ioctl_cs_queue_register) + +/** + * struct kbase_ioctl_cs_queue_kick - Kick the GPU command queue group scheduler + * to notify that a queue has been updated + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_kick { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_KICK \ + _IOW(KBASE_IOCTL_TYPE, 37, struct kbase_ioctl_cs_queue_kick) + +/** + * union kbase_ioctl_cs_queue_bind - Bind a GPU command queue to a group + * + * @in: Input parameters + * @in.buffer_gpu_addr: GPU address of the buffer backing the queue + * @in.group_handle: Handle of the group to which the queue should be bound + * @in.csi_index: Index of the CSF interface the queue should be bound to + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.mmap_handle: Handle to be used for creating the mapping of CS + * input/output pages + */ +union kbase_ioctl_cs_queue_bind { + struct { + __u64 buffer_gpu_addr; + __u8 group_handle; + __u8 csi_index; + __u8 padding[6]; + } in; + struct { + __u64 mmap_handle; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_BIND \ + _IOWR(KBASE_IOCTL_TYPE, 39, union kbase_ioctl_cs_queue_bind) + +/** + * struct kbase_ioctl_cs_queue_register_ex - Register a GPU command queue with the + * base back-end in extended format, + * involving trace buffer configuration + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + * @buffer_size: Size of the buffer in bytes + * @priority: Priority of the queue within a group when run within a process + * @padding: Currently unused, must be zero + * @ex_offset_var_addr: GPU address of the trace buffer write offset variable + * @ex_buffer_base: Trace buffer GPU base address for the queue + * @ex_buffer_size: Size of the trace buffer in bytes + * @ex_event_size: Trace event write size, in log2 designation + * @ex_event_state: Trace event states configuration + * @ex_padding: Currently unused, must be zero + * + * @Note: There is an identical sub-section at the start of this struct to that + * of @ref kbase_ioctl_cs_queue_register. Any change of this sub-section + * must also be mirrored to the latter. Following the said sub-section, + * the remaining fields forms the extension, marked with ex_*. + */ +struct kbase_ioctl_cs_queue_register_ex { + __u64 buffer_gpu_addr; + __u32 buffer_size; + __u8 priority; + __u8 padding[3]; + __u64 ex_offset_var_addr; + __u64 ex_buffer_base; + __u32 ex_buffer_size; + __u8 ex_event_size; + __u8 ex_event_state; + __u8 ex_padding[2]; +}; + +#define KBASE_IOCTL_CS_QUEUE_REGISTER_EX \ + _IOW(KBASE_IOCTL_TYPE, 40, struct kbase_ioctl_cs_queue_register_ex) + +/** + * struct kbase_ioctl_cs_queue_terminate - Terminate a GPU command queue + * + * @buffer_gpu_addr: GPU address of the buffer backing the queue + */ +struct kbase_ioctl_cs_queue_terminate { + __u64 buffer_gpu_addr; +}; + +#define KBASE_IOCTL_CS_QUEUE_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate) + +/** + * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue + * group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. 
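To show how the three queue ioctls above fit together, a hedged sketch of the register → bind → kick sequence. Error handling is minimal, the mmap of the returned handle is elided, and 'dev_fd' is simply the opened kbase device node.

#include <stdint.h>
#include <sys/ioctl.h>

static int queue_register_bind_kick(int dev_fd, uint64_t ringbuf_gpu_va,
                                    uint32_t ringbuf_size,
                                    uint8_t group_handle, uint8_t csi_index)
{
        struct kbase_ioctl_cs_queue_register reg = {
                .buffer_gpu_addr = ringbuf_gpu_va,
                .buffer_size = ringbuf_size,
        };
        union kbase_ioctl_cs_queue_bind bind = { .in = {
                .buffer_gpu_addr = ringbuf_gpu_va,
                .group_handle = group_handle,
                .csi_index = csi_index,
        } };
        struct kbase_ioctl_cs_queue_kick kick = {
                .buffer_gpu_addr = ringbuf_gpu_va,
        };

        if (ioctl(dev_fd, KBASE_IOCTL_CS_QUEUE_REGISTER, &reg) ||
            ioctl(dev_fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind))
                return -1;
        /* ... mmap bind.out.mmap_handle, write commands, bump CS_INSERT ... */
        return ioctl(dev_fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick);
}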
+ * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. + */ +union kbase_ioctl_cs_queue_group_create_1_6 { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \ + _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6) + +/** + * union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group + * @in: Input parameters + * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use. + * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use. + * @in.compute_mask: Mask of compute endpoints the group is allowed to use. + * @in.cs_min: Minimum number of CSs required. + * @in.priority: Queue group's priority within a process. + * @in.tiler_max: Maximum number of tiler endpoints the group is allowed + * to use. + * @in.fragment_max: Maximum number of fragment endpoints the group is + * allowed to use. + * @in.compute_max: Maximum number of compute endpoints the group is allowed + * to use. + * @in.padding: Currently unused, must be zero + * @out: Output parameters + * @out.group_handle: Handle of a newly created queue group. + * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. 
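A sketch of creating a queue group with the 1.6 layout just defined. Every value is illustrative, and whether this variant or the later one (with the extra reserved field, below) applies depends on the interface version negotiated with the kernel.

#include <stdint.h>
#include <sys/ioctl.h>

static int group_create_example(int dev_fd, uint8_t *handle_out)
{
        union kbase_ioctl_cs_queue_group_create_1_6 create = { .in = {
                .tiler_mask = ~0ULL,     /* allow every tiler endpoint */
                .fragment_mask = ~0ULL,
                .compute_mask = ~0ULL,
                .cs_min = 8,             /* illustrative */
                .priority = 1,           /* illustrative */
                .tiler_max = 64,
                .fragment_max = 64,
                .compute_max = 64,
        } };

        if (ioctl(dev_fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create))
                return -1;
        *handle_out = create.out.group_handle;
        return 0;
}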
+ */ +union kbase_ioctl_cs_queue_group_create { + struct { + __u64 tiler_mask; + __u64 fragment_mask; + __u64 compute_mask; + __u8 cs_min; + __u8 priority; + __u8 tiler_max; + __u8 fragment_max; + __u8 compute_max; + __u8 padding[3]; + __u64 reserved; + } in; + struct { + __u8 group_handle; + __u8 padding[3]; + __u32 group_uid; + } out; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \ + _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create) + +/** + * struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group + * + * @group_handle: Handle of the queue group to be terminated + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_cs_queue_group_term { + __u8 group_handle; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE \ + _IOW(KBASE_IOCTL_TYPE, 43, struct kbase_ioctl_cs_queue_group_term) + +#define KBASE_IOCTL_CS_EVENT_SIGNAL \ + _IO(KBASE_IOCTL_TYPE, 44) + +typedef __u8 base_kcpu_queue_id; /* We support up to 256 active KCPU queues */ + +/** + * struct kbase_ioctl_kcpu_queue_new - Create a KCPU command queue + * + * @id: ID of the new command queue returned by the kernel + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_new { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_CREATE \ + _IOR(KBASE_IOCTL_TYPE, 45, struct kbase_ioctl_kcpu_queue_new) + +/** + * struct kbase_ioctl_kcpu_queue_delete - Destroy a KCPU command queue + * + * @id: ID of the command queue to be destroyed + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_delete { + base_kcpu_queue_id id; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_DELETE \ + _IOW(KBASE_IOCTL_TYPE, 46, struct kbase_ioctl_kcpu_queue_delete) + +/** + * struct kbase_ioctl_kcpu_queue_enqueue - Enqueue commands into the KCPU queue + * + * @addr: Memory address of an array of struct base_kcpu_queue_command + * @nr_commands: Number of commands in the array + * @id: kcpu queue identifier, returned by KBASE_IOCTL_KCPU_QUEUE_CREATE ioctl + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_kcpu_queue_enqueue { + __u64 addr; + __u32 nr_commands; + base_kcpu_queue_id id; + __u8 padding[3]; +}; + +#define KBASE_IOCTL_KCPU_QUEUE_ENQUEUE \ + _IOW(KBASE_IOCTL_TYPE, 47, struct kbase_ioctl_kcpu_queue_enqueue) + +/** + * union kbase_ioctl_cs_tiler_heap_init - Initialize chunked tiler memory heap + * @in: Input parameters + * @in.chunk_size: Size of each chunk. + * @in.initial_chunks: Initial number of chunks that heap will be created with. + * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. + * @in.target_in_flight: Number of render-passes that the driver should attempt to + * keep in flight for which allocation of new chunks is + * allowed. + * @in.group_id: Group ID to be used for physical allocations. + * @in.padding: Padding + * @out: Output parameters + * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up + * for the heap. + * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, + * actually points to the header of heap chunk and not to + * the low address of free memory in the chunk. 
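A sketch of the KCPU queue lifecycle implied by the ioctls above: create a queue, then enqueue an array of commands against the returned ID. The command array address and count are placeholders, and the layout of struct base_kcpu_queue_command lives elsewhere in the kbase headers rather than in this hunk.

#include <stdint.h>
#include <sys/ioctl.h>

static int kcpu_submit(int dev_fd, uint64_t commands_addr, uint32_t n_commands)
{
        struct kbase_ioctl_kcpu_queue_new queue = {0};

        if (ioctl(dev_fd, KBASE_IOCTL_KCPU_QUEUE_CREATE, &queue))
                return -1;

        struct kbase_ioctl_kcpu_queue_enqueue enqueue = {
                .addr = commands_addr,   /* array of base_kcpu_queue_command */
                .nr_commands = n_commands,
                .id = queue.id,
        };
        return ioctl(dev_fd, KBASE_IOCTL_KCPU_QUEUE_ENQUEUE, &enqueue);
}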
+ */ +union kbase_ioctl_cs_tiler_heap_init { + struct { + __u32 chunk_size; + __u32 initial_chunks; + __u32 max_chunks; + __u16 target_in_flight; + __u8 group_id; + __u8 padding; + } in; + struct { + __u64 gpu_heap_va; + __u64 first_chunk_va; + } out; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_INIT \ + _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) + +/** + * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap + * instance + * + * @gpu_heap_va: GPU VA of Heap context that was set up for the heap. + */ +struct kbase_ioctl_cs_tiler_heap_term { + __u64 gpu_heap_va; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_TERM \ + _IOW(KBASE_IOCTL_TYPE, 49, struct kbase_ioctl_cs_tiler_heap_term) + +/** + * union kbase_ioctl_cs_get_glb_iface - Request the global control block + * of CSF interface capabilities + * + * @in: Input parameters + * @in.max_group_num: The maximum number of groups to be read. Can be 0, in + * which case groups_ptr is unused. + * @in.max_total_stream _num: The maximum number of CSs to be read. Can be 0, in + * which case streams_ptr is unused. + * @in.groups_ptr: Pointer where to store all the group data (sequentially). + * @in.streams_ptr: Pointer where to store all the CS data (sequentially). + * @out: Output parameters + * @out.glb_version: Global interface version. + * @out.features: Bit mask of features (e.g. whether certain types of job + * can be suspended). + * @out.group_num: Number of CSGs supported. + * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 + * hold the size of firmware performance counter data + * and 15:0 hold the size of hardware performance counter + * data. + * @out.total_stream_num: Total number of CSs, summed across all groups. + * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum + * size of events. Bits 3:0 hold the offset update rate. 
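A sketch of initialising a chunked tiler heap through the union defined above; the sizes are illustrative, and the GPU VAs of the heap context and first chunk come back in the out part.

#include <stdint.h>
#include <sys/ioctl.h>

static int tiler_heap_example(int dev_fd, uint64_t *heap_ctx_va)
{
        union kbase_ioctl_cs_tiler_heap_init heap = { .in = {
                .chunk_size = 1u << 21,   /* 2 MiB chunks, illustrative */
                .initial_chunks = 5,
                .max_chunks = 200,
                .target_in_flight = 2,
        } };

        if (ioctl(dev_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &heap))
                return -1;
        *heap_ctx_va = heap.out.gpu_heap_va;
        return 0;
}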
+ * (csf >= 1.1.0) + * + */ +union kbase_ioctl_cs_get_glb_iface { + struct { + __u32 max_group_num; + __u32 max_total_stream_num; + __u64 groups_ptr; + __u64 streams_ptr; + } in; + struct { + __u32 glb_version; + __u32 features; + __u32 group_num; + __u32 prfcnt_size; + __u32 total_stream_num; + __u32 instr_features; + } out; +}; + +#define KBASE_IOCTL_CS_GET_GLB_IFACE \ + _IOWR(KBASE_IOCTL_TYPE, 51, union kbase_ioctl_cs_get_glb_iface) + +struct kbase_ioctl_cs_cpu_queue_info { + __u64 buffer; + __u64 size; +}; + +#define KBASE_IOCTL_VERSION_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) + +#define KBASE_IOCTL_CS_CPU_QUEUE_DUMP \ + _IOW(KBASE_IOCTL_TYPE, 53, struct kbase_ioctl_cs_cpu_queue_info) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +/** + * struct kbase_ioctl_cs_event_memory_write - Write an event memory address + * @cpu_addr: Memory address to write + * @value: Value to write + * @padding: Currently unused, must be zero + */ +struct kbase_ioctl_cs_event_memory_write { + __u64 cpu_addr; + __u8 value; + __u8 padding[7]; +}; + +/** + * union kbase_ioctl_cs_event_memory_read - Read an event memory address + * @in: Input parameters + * @in.cpu_addr: Memory address to read + * @out: Output parameters + * @out.value: Value read + * @out.padding: Currently unused, must be zero + */ +union kbase_ioctl_cs_event_memory_read { + struct { + __u64 cpu_addr; + } in; + struct { + __u8 value; + __u8 padding[7]; + } out; +}; + +#endif /* MALI_UNIT_TEST */ + +#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ diff --git a/src/panfrost/csf_test/mali_kbase_ioctl.h b/src/panfrost/csf_test/mali_kbase_ioctl.h new file mode 100644 index 00000000000..fc81b71b46a --- /dev/null +++ b/src/panfrost/csf_test/mali_kbase_ioctl.h @@ -0,0 +1,854 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2017-2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_IOCTL_H_ +#define _UAPI_KBASE_IOCTL_H_ + +#ifdef __cpluscplus +extern "C" { +#endif + +#include +#include + +#define KBASE_IOCTL_TYPE 0x80 + +/** + * struct kbase_ioctl_set_flags - Set kernel context creation flags + * + * @create_flags: Flags - see base_context_create_flags + */ +struct kbase_ioctl_set_flags { + __u32 create_flags; +}; + +#define KBASE_IOCTL_SET_FLAGS \ + _IOW(KBASE_IOCTL_TYPE, 1, struct kbase_ioctl_set_flags) + +/** + * struct kbase_ioctl_get_gpuprops - Read GPU properties from the kernel + * + * @buffer: Pointer to the buffer to store properties into + * @size: Size of the buffer + * @flags: Flags - must be zero for now + * + * The ioctl will return the number of bytes stored into @buffer or an error + * on failure (e.g. 
@size is too small). If @size is specified as 0 then no + * data will be written but the return value will be the number of bytes needed + * for all the properties. + * + * @flags may be used in the future to request a different format for the + * buffer. With @flags == 0 the following format is used. + * + * The buffer will be filled with pairs of values, a __u32 key identifying the + * property followed by the value. The size of the value is identified using + * the bottom bits of the key. The value then immediately followed the key and + * is tightly packed (there is no padding). All keys and values are + * little-endian. + * + * 00 = __u8 + * 01 = __u16 + * 10 = __u32 + * 11 = __u64 + */ +struct kbase_ioctl_get_gpuprops { + __u64 buffer; + __u32 size; + __u32 flags; +}; + +#define KBASE_IOCTL_GET_GPUPROPS \ + _IOW(KBASE_IOCTL_TYPE, 3, struct kbase_ioctl_get_gpuprops) + +/** + * union kbase_ioctl_mem_alloc - Allocate memory on the GPU + * @in: Input parameters + * @in.va_pages: The number of pages of virtual address space to reserve + * @in.commit_pages: The number of physical pages to allocate + * @in.extension: The number of extra pages to allocate on each GPU fault which grows the region + * @in.flags: Flags + * @out: Output parameters + * @out.flags: Flags + * @out.gpu_va: The GPU virtual address which is allocated + */ +union kbase_ioctl_mem_alloc { + struct { + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u64 flags; + } in; + struct { + __u64 flags; + __u64 gpu_va; + } out; +}; + +#define KBASE_IOCTL_MEM_ALLOC \ + _IOWR(KBASE_IOCTL_TYPE, 5, union kbase_ioctl_mem_alloc) + +/** + * struct kbase_ioctl_mem_query - Query properties of a GPU memory region + * @in: Input parameters + * @in.gpu_addr: A GPU address contained within the region + * @in.query: The type of query + * @out: Output parameters + * @out.value: The result of the query + * + * Use a %KBASE_MEM_QUERY_xxx flag as input for @query. + */ +union kbase_ioctl_mem_query { + struct { + __u64 gpu_addr; + __u64 query; + } in; + struct { + __u64 value; + } out; +}; + +#define KBASE_IOCTL_MEM_QUERY \ + _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) + +#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) +#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) +#define KBASE_MEM_QUERY_FLAGS ((__u64)3) + +/** + * struct kbase_ioctl_mem_free - Free a memory region + * @gpu_addr: Handle to the region to free + */ +struct kbase_ioctl_mem_free { + __u64 gpu_addr; +}; + +#define KBASE_IOCTL_MEM_FREE \ + _IOW(KBASE_IOCTL_TYPE, 7, struct kbase_ioctl_mem_free) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @buffer_count: requested number of dumping buffers + * @fe_bm: counters selection bitmask (Front end) + * @shader_bm: counters selection bitmask (Shader) + * @tiler_bm: counters selection bitmask (Tiler) + * @mmu_l2_bm: counters selection bitmask (MMU_L2) + * + * A fd is returned from the ioctl if successful, or a negative value on error + */ +struct kbase_ioctl_hwcnt_reader_setup { + __u32 buffer_count; + __u32 fe_bm; + __u32 shader_bm; + __u32 tiler_bm; + __u32 mmu_l2_bm; +}; + +#define KBASE_IOCTL_HWCNT_READER_SETUP \ + _IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup) + +/** + * struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to. + * @data: Counter samples for the dummy model. + * @size: Size of the counter sample data. + * @padding: Padding. 
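Returning to the KBASE_IOCTL_GET_GPUPROPS encoding described a little earlier (a __u32 key whose bottom two bits give the value size, immediately followed by the packed little-endian value), here is a small decoder sketch. Treating the remaining key bits as the property ID matches how the rest of this patch consumes the buffer, but is stated here as an assumption.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Walk the packed key/value pairs returned by KBASE_IOCTL_GET_GPUPROPS.
 * Bounds checking is minimal; this is a sketch, not production code. */
static void gpuprops_dump(const uint8_t *buf, size_t len)
{
        size_t pos = 0;

        while (pos + 4 <= len) {
                uint32_t key;
                memcpy(&key, buf + pos, sizeof(key));
                pos += sizeof(key);

                size_t value_size = (size_t)1 << (key & 3); /* 1, 2, 4 or 8 bytes */
                uint64_t value = 0;
                if (pos + value_size > len)
                        break;
                memcpy(&value, buf + pos, value_size);      /* little-endian host assumed */
                pos += value_size;

                printf("property %u = %llu\n", key >> 2,
                       (unsigned long long)value);
        }
}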
+ */ +struct kbase_ioctl_hwcnt_values { + __u64 data; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_HWCNT_SET \ + _IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values) + +/** + * struct kbase_ioctl_disjoint_query - Query the disjoint counter + * @counter: A counter of disjoint events in the kernel + */ +struct kbase_ioctl_disjoint_query { + __u32 counter; +}; + +#define KBASE_IOCTL_DISJOINT_QUERY \ + _IOR(KBASE_IOCTL_TYPE, 12, struct kbase_ioctl_disjoint_query) + +/** + * struct kbase_ioctl_get_ddk_version - Query the kernel version + * @version_buffer: Buffer to receive the kernel version string + * @size: Size of the buffer + * @padding: Padding + * + * The ioctl will return the number of bytes written into version_buffer + * (which includes a NULL byte) or a negative error code + * + * The ioctl request code has to be _IOW because the data in ioctl struct is + * being copied to the kernel, even though the kernel then writes out the + * version info to the buffer specified in the ioctl. + */ +struct kbase_ioctl_get_ddk_version { + __u64 version_buffer; + __u32 size; + __u32 padding; +}; + +#define KBASE_IOCTL_GET_DDK_VERSION \ + _IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version) + +/** + * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 10.2--11.4) + * @va_pages: Number of VA pages to reserve for JIT + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_10_2 { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2) + +/** + * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory + * allocator (between kernel driver + * version 11.5--11.19) + * @va_pages: Number of VA pages to reserve for JIT + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. + * + * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for + * backwards compatibility. + */ +struct kbase_ioctl_mem_jit_init_11_5 { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5) + +/** + * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory + * allocator + * @va_pages: Number of GPU virtual address pages to reserve for just-in-time + * memory allocations + * @max_allocations: Maximum number of concurrent allocations + * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%) + * @group_id: Group ID to be used for physical allocations + * @padding: Currently unused, must be zero + * @phys_pages: Maximum number of physical pages to allocate just-in-time + * + * Note that depending on the VA size of the application and GPU, the value + * specified in @va_pages may be ignored. 
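A sketch of the version query documented a little above; the buffer size is arbitrary, and the return-value convention (number of bytes written, including the NUL byte) is taken from that comment.

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>

static int print_ddk_version(int dev_fd)
{
        char buf[64];
        struct kbase_ioctl_get_ddk_version ver = {
                .version_buffer = (__u64)(uintptr_t)buf,
                .size = sizeof(buf),
        };
        int written = ioctl(dev_fd, KBASE_IOCTL_GET_DDK_VERSION, &ver);

        return written < 0 ? written : puts(buf);
}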
+ */ +struct kbase_ioctl_mem_jit_init { + __u64 va_pages; + __u8 max_allocations; + __u8 trim_level; + __u8 group_id; + __u8 padding[5]; + __u64 phys_pages; +}; + +#define KBASE_IOCTL_MEM_JIT_INIT \ + _IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init) + +/** + * struct kbase_ioctl_mem_sync - Perform cache maintenance on memory + * + * @handle: GPU memory handle (GPU VA) + * @user_addr: The address where it is mapped in user space + * @size: The number of bytes to synchronise + * @type: The direction to synchronise: 0 is sync to memory (clean), + * 1 is sync from memory (invalidate). Use the BASE_SYNCSET_OP_xxx constants. + * @padding: Padding to round up to a multiple of 8 bytes, must be zero + */ +struct kbase_ioctl_mem_sync { + __u64 handle; + __u64 user_addr; + __u64 size; + __u8 type; + __u8 padding[7]; +}; + +#define KBASE_IOCTL_MEM_SYNC \ + _IOW(KBASE_IOCTL_TYPE, 15, struct kbase_ioctl_mem_sync) + +/** + * union kbase_ioctl_mem_find_cpu_offset - Find the offset of a CPU pointer + * + * @in: Input parameters + * @in.gpu_addr: The GPU address of the memory region + * @in.cpu_addr: The CPU address to locate + * @in.size: A size in bytes to validate is contained within the region + * @out: Output parameters + * @out.offset: The offset from the start of the memory region to @cpu_addr + */ +union kbase_ioctl_mem_find_cpu_offset { + struct { + __u64 gpu_addr; + __u64 cpu_addr; + __u64 size; + } in; + struct { + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_CPU_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 16, union kbase_ioctl_mem_find_cpu_offset) + +/** + * struct kbase_ioctl_get_context_id - Get the kernel context ID + * + * @id: The kernel context ID + */ +struct kbase_ioctl_get_context_id { + __u32 id; +}; + +#define KBASE_IOCTL_GET_CONTEXT_ID \ + _IOR(KBASE_IOCTL_TYPE, 17, struct kbase_ioctl_get_context_id) + +/** + * struct kbase_ioctl_tlstream_acquire - Acquire a tlstream fd + * + * @flags: Flags + * + * The ioctl returns a file descriptor when successful + */ +struct kbase_ioctl_tlstream_acquire { + __u32 flags; +}; + +#define KBASE_IOCTL_TLSTREAM_ACQUIRE \ + _IOW(KBASE_IOCTL_TYPE, 18, struct kbase_ioctl_tlstream_acquire) + +#define KBASE_IOCTL_TLSTREAM_FLUSH \ + _IO(KBASE_IOCTL_TYPE, 19) + +/** + * struct kbase_ioctl_mem_commit - Change the amount of memory backing a region + * + * @gpu_addr: The memory region to modify + * @pages: The number of physical pages that should be present + * + * The ioctl may return on the following error codes or 0 for success: + * -ENOMEM: Out of memory + * -EINVAL: Invalid arguments + */ +struct kbase_ioctl_mem_commit { + __u64 gpu_addr; + __u64 pages; +}; + +#define KBASE_IOCTL_MEM_COMMIT \ + _IOW(KBASE_IOCTL_TYPE, 20, struct kbase_ioctl_mem_commit) + +/** + * union kbase_ioctl_mem_alias - Create an alias of memory regions + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.stride: Bytes between start of each memory region + * @in.nents: The number of regions to pack together into the alias + * @in.aliasing_info: Pointer to an array of struct base_mem_aliasing_info + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_alias { + struct { + __u64 flags; + __u64 stride; + __u64 nents; + __u64 aliasing_info; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_ALIAS \ + _IOWR(KBASE_IOCTL_TYPE, 21, union kbase_ioctl_mem_alias) + +/** + * union 
kbase_ioctl_mem_import - Import memory for use by the GPU + * @in: Input parameters + * @in.flags: Flags, see BASE_MEM_xxx + * @in.phandle: Handle to the external memory + * @in.type: Type of external memory, see base_mem_import_type + * @in.padding: Amount of extra VA pages to append to the imported buffer + * @out: Output parameters + * @out.flags: Flags, see BASE_MEM_xxx + * @out.gpu_va: Address of the new alias + * @out.va_pages: Size of the new alias + */ +union kbase_ioctl_mem_import { + struct { + __u64 flags; + __u64 phandle; + __u32 type; + __u32 padding; + } in; + struct { + __u64 flags; + __u64 gpu_va; + __u64 va_pages; + } out; +}; + +#define KBASE_IOCTL_MEM_IMPORT \ + _IOWR(KBASE_IOCTL_TYPE, 22, union kbase_ioctl_mem_import) + +/** + * struct kbase_ioctl_mem_flags_change - Change the flags for a memory region + * @gpu_va: The GPU region to modify + * @flags: The new flags to set + * @mask: Mask of the flags to modify + */ +struct kbase_ioctl_mem_flags_change { + __u64 gpu_va; + __u64 flags; + __u64 mask; +}; + +#define KBASE_IOCTL_MEM_FLAGS_CHANGE \ + _IOW(KBASE_IOCTL_TYPE, 23, struct kbase_ioctl_mem_flags_change) + +/** + * struct kbase_ioctl_stream_create - Create a synchronisation stream + * @name: A name to identify this stream. Must be NULL-terminated. + * + * Note that this is also called a "timeline", but is named stream to avoid + * confusion with other uses of the word. + * + * Unused bytes in @name (after the first NULL byte) must be also be NULL bytes. + * + * The ioctl returns a file descriptor. + */ +struct kbase_ioctl_stream_create { + char name[32]; +}; + +#define KBASE_IOCTL_STREAM_CREATE \ + _IOW(KBASE_IOCTL_TYPE, 24, struct kbase_ioctl_stream_create) + +/** + * struct kbase_ioctl_fence_validate - Validate a fd refers to a fence + * @fd: The file descriptor to validate + */ +struct kbase_ioctl_fence_validate { + int fd; +}; + +#define KBASE_IOCTL_FENCE_VALIDATE \ + _IOW(KBASE_IOCTL_TYPE, 25, struct kbase_ioctl_fence_validate) + +/** + * struct kbase_ioctl_mem_profile_add - Provide profiling information to kernel + * @buffer: Pointer to the information + * @len: Length + * @padding: Padding + * + * The data provided is accessible through a debugfs file + */ +struct kbase_ioctl_mem_profile_add { + __u64 buffer; + __u32 len; + __u32 padding; +}; + +#define KBASE_IOCTL_MEM_PROFILE_ADD \ + _IOW(KBASE_IOCTL_TYPE, 27, struct kbase_ioctl_mem_profile_add) + +/** + * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to map + */ +struct kbase_ioctl_sticky_resource_map { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_MAP \ + _IOW(KBASE_IOCTL_TYPE, 29, struct kbase_ioctl_sticky_resource_map) + +/** + * struct kbase_ioctl_sticky_resource_map - Unmap a resource mapped which was + * previously permanently mapped + * @count: Number of resources + * @address: Array of __u64 GPU addresses of the external resources to unmap + */ +struct kbase_ioctl_sticky_resource_unmap { + __u64 count; + __u64 address; +}; + +#define KBASE_IOCTL_STICKY_RESOURCE_UNMAP \ + _IOW(KBASE_IOCTL_TYPE, 30, struct kbase_ioctl_sticky_resource_unmap) + +/** + * union kbase_ioctl_mem_find_gpu_start_and_offset - Find the start address of + * the GPU memory region for + * the given gpu address and + * the offset of that address + * into the region + * @in: Input parameters + * @in.gpu_addr: GPU virtual address + * @in.size: Size in bytes within the region 
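A sketch of a dma-buf import through the union above. On the kernels this patch targets, a UMM import passes the handle by pointer to an int file descriptor; that convention is not spelled out in this header, so treat it as an assumption of the sketch, and the BASE_MEM_* flags are left to the caller.

#include <stdint.h>
#include <sys/ioctl.h>

static int import_dmabuf(int dev_fd, int dmabuf_fd, uint64_t flags,
                         uint64_t *gpu_va_out)
{
        union kbase_ioctl_mem_import import = { .in = {
                .flags = flags,                    /* BASE_MEM_* flags */
                .phandle = (uintptr_t)&dmabuf_fd,  /* pointer to the fd (assumed) */
                .type = BASE_MEM_IMPORT_TYPE_UMM,
        } };

        if (ioctl(dev_fd, KBASE_IOCTL_MEM_IMPORT, &import))
                return -1;
        *gpu_va_out = import.out.gpu_va;
        return 0;
}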
+ * @out: Output parameters + * @out.start: Address of the beginning of the memory region enclosing @gpu_addr + * for the length of @offset bytes + * @out.offset: The offset from the start of the memory region to @gpu_addr + */ +union kbase_ioctl_mem_find_gpu_start_and_offset { + struct { + __u64 gpu_addr; + __u64 size; + } in; + struct { + __u64 start; + __u64 offset; + } out; +}; + +#define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ + _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) + +#define KBASE_IOCTL_CINSTR_GWT_START \ + _IO(KBASE_IOCTL_TYPE, 33) + +#define KBASE_IOCTL_CINSTR_GWT_STOP \ + _IO(KBASE_IOCTL_TYPE, 34) + +/** + * union kbase_ioctl_gwt_dump - Used to collect all GPU write fault addresses. + * @in: Input parameters + * @in.addr_buffer: Address of buffer to hold addresses of gpu modified areas. + * @in.size_buffer: Address of buffer to hold size of modified areas (in pages) + * @in.len: Number of addresses the buffers can hold. + * @in.padding: padding + * @out: Output parameters + * @out.no_of_addr_collected: Number of addresses collected into addr_buffer. + * @out.more_data_available: Status indicating if more addresses are available. + * @out.padding: padding + * + * This structure is used when performing a call to dump GPU write fault + * addresses. + */ +union kbase_ioctl_cinstr_gwt_dump { + struct { + __u64 addr_buffer; + __u64 size_buffer; + __u32 len; + __u32 padding; + + } in; + struct { + __u32 no_of_addr_collected; + __u8 more_data_available; + __u8 padding[27]; + } out; +}; + +#define KBASE_IOCTL_CINSTR_GWT_DUMP \ + _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) + +/** + * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone + * + * @va_pages: Number of VA pages to reserve for EXEC_VA + */ +struct kbase_ioctl_mem_exec_init { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_EXEC_INIT \ + _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) + +/** + * union kbase_ioctl_get_cpu_gpu_timeinfo - Request zero or more types of + * cpu/gpu time (counter values) + * @in: Input parameters + * @in.request_flags: Bit-flags indicating the requested types. + * @in.paddings: Unused, size alignment matching the out. + * @out: Output parameters + * @out.sec: Integer field of the monotonic time, unit in seconds. + * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. + * @out.padding: Unused, for __u64 alignment + * @out.timestamp: System wide timestamp (counter) value. + * @out.cycle_counter: GPU cycle counter value. + */ +union kbase_ioctl_get_cpu_gpu_timeinfo { + struct { + __u32 request_flags; + __u32 paddings[7]; + } in; + struct { + __u64 sec; + __u32 nsec; + __u32 padding; + __u64 timestamp; + __u64 cycle_counter; + } out; +}; + +#define KBASE_IOCTL_GET_CPU_GPU_TIMEINFO \ + _IOWR(KBASE_IOCTL_TYPE, 50, union kbase_ioctl_get_cpu_gpu_timeinfo) + +/** + * struct kbase_ioctl_context_priority_check - Check the max possible priority + * @priority: Input priority & output priority + */ + +struct kbase_ioctl_context_priority_check { + __u8 priority; +}; + +#define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ + _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) + +/** + * struct kbase_ioctl_set_limited_core_count - Set the limited core count. 
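A sketch tying the BASE_TIMEINFO_* request flags from earlier in the patch to the time-info union above; which out fields are valid mirrors the flags that were requested.

#include <stdint.h>
#include <sys/ioctl.h>

static int sample_gpu_time(int dev_fd, uint64_t *cycles_out)
{
        union kbase_ioctl_get_cpu_gpu_timeinfo info = { .in = {
                .request_flags = BASE_TIMEINFO_MONOTONIC_FLAG |
                                 BASE_TIMEINFO_CYCLE_COUNTER_FLAG,
        } };

        if (ioctl(dev_fd, KBASE_IOCTL_GET_CPU_GPU_TIMEINFO, &info))
                return -1;
        /* info.out.sec / info.out.nsec hold the monotonic time */
        *cycles_out = info.out.cycle_counter;
        return 0;
}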
+ * + * @max_core_count: Maximum core count + */ +struct kbase_ioctl_set_limited_core_count { + __u8 max_core_count; +}; + +#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ + _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) + +/** + * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter + * information + * @info_item_size: Performance counter item size in bytes. + * @info_item_count: Performance counter item count in the info_list_ptr. + * @info_list_ptr: Performance counter item list pointer which points to a + * list with info_item_count of items. + * + * On success: returns info_item_size and info_item_count if info_list_ptr is + * NULL, returns performance counter information if info_list_ptr is not NULL. + * On error: returns a negative error code. + */ +struct kbase_ioctl_kinstr_prfcnt_enum_info { + __u32 info_item_size; + __u32 info_item_count; + __u64 info_list_ptr; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ + _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @in: input parameters. + * @in.request_item_count: Number of requests in the requests array. + * @in.request_item_size: Size in bytes of each request in the requests array. + * @in.requests_ptr: Pointer to the requests array. + * @out: output parameters. + * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for + * each sample. + * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap + * for reading performance counter samples. + * + * A fd is returned from the ioctl if successful, or a negative value on error. + */ +union kbase_ioctl_kinstr_prfcnt_setup { + struct { + __u32 request_item_count; + __u32 request_item_size; + __u64 requests_ptr; + } in; + struct { + __u32 prfcnt_metadata_item_size; + __u32 prfcnt_mmap_size_bytes; + } out; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ + _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) + +/*************** + * test ioctls * + ***************/ +#if MALI_UNIT_TEST +/* These ioctls are purely for test purposes and are not used in the production + * driver, they therefore may change without notice + */ + +#define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) + + +/** + * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes + * @bytes_collected: number of bytes read by user + * @bytes_generated: number of bytes generated by tracepoints + */ +struct kbase_ioctl_tlstream_stats { + __u32 bytes_collected; + __u32 bytes_generated; +}; + +#define KBASE_IOCTL_TLSTREAM_STATS \ + _IOR(KBASE_IOCTL_TEST_TYPE, 2, struct kbase_ioctl_tlstream_stats) + +#endif /* MALI_UNIT_TEST */ + +/* Customer extension range */ +#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) + +/* If the integration needs extra ioctl add them there + * like this: + * + * struct my_ioctl_args { + * .... 
+ * } + * + * #define KBASE_IOCTL_MY_IOCTL \ + * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) + */ + + +/********************************** + * Definitions for GPU properties * + **********************************/ +#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0) +#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1) +#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2) +#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3) + +#define KBASE_GPUPROP_PRODUCT_ID 1 +#define KBASE_GPUPROP_VERSION_STATUS 2 +#define KBASE_GPUPROP_MINOR_REVISION 3 +#define KBASE_GPUPROP_MAJOR_REVISION 4 +/* 5 previously used for GPU speed */ +#define KBASE_GPUPROP_GPU_FREQ_KHZ_MAX 6 +/* 7 previously used for minimum GPU speed */ +#define KBASE_GPUPROP_LOG2_PROGRAM_COUNTER_SIZE 8 +#define KBASE_GPUPROP_TEXTURE_FEATURES_0 9 +#define KBASE_GPUPROP_TEXTURE_FEATURES_1 10 +#define KBASE_GPUPROP_TEXTURE_FEATURES_2 11 +#define KBASE_GPUPROP_GPU_AVAILABLE_MEMORY_SIZE 12 + +#define KBASE_GPUPROP_L2_LOG2_LINE_SIZE 13 +#define KBASE_GPUPROP_L2_LOG2_CACHE_SIZE 14 +#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15 + +#define KBASE_GPUPROP_TILER_BIN_SIZE_BYTES 16 +#define KBASE_GPUPROP_TILER_MAX_ACTIVE_LEVELS 17 + +#define KBASE_GPUPROP_MAX_THREADS 18 +#define KBASE_GPUPROP_MAX_WORKGROUP_SIZE 19 +#define KBASE_GPUPROP_MAX_BARRIER_SIZE 20 +#define KBASE_GPUPROP_MAX_REGISTERS 21 +#define KBASE_GPUPROP_MAX_TASK_QUEUE 22 +#define KBASE_GPUPROP_MAX_THREAD_GROUP_SPLIT 23 +#define KBASE_GPUPROP_IMPL_TECH 24 + +#define KBASE_GPUPROP_RAW_SHADER_PRESENT 25 +#define KBASE_GPUPROP_RAW_TILER_PRESENT 26 +#define KBASE_GPUPROP_RAW_L2_PRESENT 27 +#define KBASE_GPUPROP_RAW_STACK_PRESENT 28 +#define KBASE_GPUPROP_RAW_L2_FEATURES 29 +#define KBASE_GPUPROP_RAW_CORE_FEATURES 30 +#define KBASE_GPUPROP_RAW_MEM_FEATURES 31 +#define KBASE_GPUPROP_RAW_MMU_FEATURES 32 +#define KBASE_GPUPROP_RAW_AS_PRESENT 33 +#define KBASE_GPUPROP_RAW_JS_PRESENT 34 +#define KBASE_GPUPROP_RAW_JS_FEATURES_0 35 +#define KBASE_GPUPROP_RAW_JS_FEATURES_1 36 +#define KBASE_GPUPROP_RAW_JS_FEATURES_2 37 +#define KBASE_GPUPROP_RAW_JS_FEATURES_3 38 +#define KBASE_GPUPROP_RAW_JS_FEATURES_4 39 +#define KBASE_GPUPROP_RAW_JS_FEATURES_5 40 +#define KBASE_GPUPROP_RAW_JS_FEATURES_6 41 +#define KBASE_GPUPROP_RAW_JS_FEATURES_7 42 +#define KBASE_GPUPROP_RAW_JS_FEATURES_8 43 +#define KBASE_GPUPROP_RAW_JS_FEATURES_9 44 +#define KBASE_GPUPROP_RAW_JS_FEATURES_10 45 +#define KBASE_GPUPROP_RAW_JS_FEATURES_11 46 +#define KBASE_GPUPROP_RAW_JS_FEATURES_12 47 +#define KBASE_GPUPROP_RAW_JS_FEATURES_13 48 +#define KBASE_GPUPROP_RAW_JS_FEATURES_14 49 +#define KBASE_GPUPROP_RAW_JS_FEATURES_15 50 +#define KBASE_GPUPROP_RAW_TILER_FEATURES 51 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_0 52 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_1 53 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_2 54 +#define KBASE_GPUPROP_RAW_GPU_ID 55 +#define KBASE_GPUPROP_RAW_THREAD_MAX_THREADS 56 +#define KBASE_GPUPROP_RAW_THREAD_MAX_WORKGROUP_SIZE 57 +#define KBASE_GPUPROP_RAW_THREAD_MAX_BARRIER_SIZE 58 +#define KBASE_GPUPROP_RAW_THREAD_FEATURES 59 +#define KBASE_GPUPROP_RAW_COHERENCY_MODE 60 + +#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61 +#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62 +#define KBASE_GPUPROP_COHERENCY_COHERENCY 63 +#define KBASE_GPUPROP_COHERENCY_GROUP_0 64 +#define KBASE_GPUPROP_COHERENCY_GROUP_1 65 +#define KBASE_GPUPROP_COHERENCY_GROUP_2 66 +#define KBASE_GPUPROP_COHERENCY_GROUP_3 67 +#define KBASE_GPUPROP_COHERENCY_GROUP_4 68 +#define KBASE_GPUPROP_COHERENCY_GROUP_5 69 +#define KBASE_GPUPROP_COHERENCY_GROUP_6 70 +#define 
KBASE_GPUPROP_COHERENCY_GROUP_7 71 +#define KBASE_GPUPROP_COHERENCY_GROUP_8 72 +#define KBASE_GPUPROP_COHERENCY_GROUP_9 73 +#define KBASE_GPUPROP_COHERENCY_GROUP_10 74 +#define KBASE_GPUPROP_COHERENCY_GROUP_11 75 +#define KBASE_GPUPROP_COHERENCY_GROUP_12 76 +#define KBASE_GPUPROP_COHERENCY_GROUP_13 77 +#define KBASE_GPUPROP_COHERENCY_GROUP_14 78 +#define KBASE_GPUPROP_COHERENCY_GROUP_15 79 + +#define KBASE_GPUPROP_TEXTURE_FEATURES_3 80 +#define KBASE_GPUPROP_RAW_TEXTURE_FEATURES_3 81 + +#define KBASE_GPUPROP_NUM_EXEC_ENGINES 82 + +#define KBASE_GPUPROP_RAW_THREAD_TLS_ALLOC 83 +#define KBASE_GPUPROP_TLS_ALLOC 84 +#define KBASE_GPUPROP_RAW_GPU_FEATURES 85 +#ifdef __cpluscplus +} +#endif + +#endif /* _UAPI_KBASE_IOCTL_H_ */ diff --git a/src/panfrost/csf_test/test.c b/src/panfrost/csf_test/test.c new file mode 100644 index 00000000000..cb9ff398314 --- /dev/null +++ b/src/panfrost/csf_test/test.c @@ -0,0 +1,1903 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/macros.h" + +#include "mali_kbase_csf_ioctl.h" +#include "mali_kbase_ioctl.h" +#include "mali_base_kernel.h" +#include "mali_base_csf_kernel.h" +#include "mali_gpu_csf_registers.h" + +#define PAN_ARCH 10 +#include "genxml/gen_macros.h" + +#include "wrap.h" +#include "decode.h" + +#include "pan_shader.h" +#include "compiler/nir/nir_builder.h" +#include "bifrost/valhall/disassemble.h" + +#define CS_EVENT_REGISTER 0x5A + +static bool pr = true; +static bool colour_term = true; + +static void +dump_start(FILE *f) +{ + if (colour_term) + fprintf(f, "\x1b[90m"); +} + +static void +dump_end(FILE *f) +{ + if (colour_term) + fprintf(f, "\x1b[39m"); +} + +/* TODO: Use KBASE_IOCTL_MEM_SYNC for 32-bit systems */ +static void +cache_clean(volatile void *addr) +{ +#ifdef __aarch64__ + __asm__ volatile ("dc cvac, %0" :: "r" (addr) : "memory"); +#endif +} + +static void +cache_invalidate(volatile void *addr) +{ +#ifdef __aarch64__ + __asm__ volatile ("dc civac, %0" :: "r" (addr) : "memory"); +#endif +} + +static void +cache_barrier(void) +{ +#ifdef __ARM_ARCH + __asm__ volatile ("dsb sy" ::: "memory"); +#endif +} + +static void +memory_barrier(void) +{ +#ifdef __ARM_ARCH + __asm__ volatile ("dmb sy" ::: "memory"); +#endif +} + +typedef void (*cacheline_op)(volatile void *addr); + +#define CACHELINE_SIZE 64 + +static void +cacheline_op_range(volatile void *start, unsigned length, cacheline_op op) +{ + volatile void *ptr = (volatile void *)((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1)); + volatile void *end = (volatile void *) ALIGN_POT((uintptr_t) start + length, CACHELINE_SIZE); + for (; ptr < end; ptr += CACHELINE_SIZE) + op(ptr); +} + +static void +cache_clean_range(volatile void *start, unsigned length) +{ + cacheline_op_range(start, length, cache_clean); +} + +static void +cache_invalidate_range(volatile void *start, unsigned length) +{ + cacheline_op_range(start, length, cache_invalidate); +} + +struct state; +struct test; + +typedef bool (* section)(struct state *s, struct test *t); + +#define CS_QUEUE_COUNT 4 /* compute / vertex / fragment / other */ +#define CS_QUEUE_SIZE 65536 + +struct state { + int page_size; + int argc; + char **argv; + + int mali_fd; + int tl_fd; + void *tracking_region; + void *csf_user_reg; + + uint8_t *gpuprops; + unsigned gpuprops_size; + uint32_t gpu_id; + + struct { + struct panfrost_ptr normal, exec, coherent, cached, event, ev2; + } allocations; + + uint64_t tiler_heap_va; + uint64_t tiler_heap_header; + + uint8_t csg_handle; + uint32_t csg_uid; + + struct panfrost_ptr cs_mem[CS_QUEUE_COUNT]; + void *cs_user_io[CS_QUEUE_COUNT]; + unsigned cs_last_submit[CS_QUEUE_COUNT]; + struct pan_command_stream cs[CS_QUEUE_COUNT]; + + unsigned shader_alloc_offset; + mali_ptr compute_shader; +}; + +struct test { + section part; + section cleanup; + const char *label; + + struct test *subtests; + unsigned sub_length; + + /* for allocation tests */ + unsigned offset; + unsigned flags; + + bool add; + bool invalid; + bool blit; + bool vertex; +}; + +/* See STATE and ALLOC macros below */ +#define DEREF_STATE(s, offset) ((void*) s + offset) + +static uint64_t +pan_get_gpuprop(struct state *s, int name) +{ + int i = 0; + uint64_t x = 0; + while (i < s->gpuprops_size) { + x = 0; + memcpy(&x, s->gpuprops + i, 4); + i += 4; + + int size = 1 << (x & 3); + int this_name = x >> 2; + + x = 0; + memcpy(&x, s->gpuprops + i, size); + i += size; + + if 
(this_name == name) + return x; + } + + fprintf(stderr, "Unknown prop %i\n", name); + return 0; +} + +static bool +open_kbase(struct state *s, struct test *t) +{ + s->mali_fd = open("/dev/mali0", O_RDWR); + if (s->mali_fd != -1) + return true; + + perror("open(\"/dev/mali0\")"); + return false; +} + +static bool +close_kbase(struct state *s, struct test *t) +{ + if (getenv("TEST_CHECK_LEAKS")) { + int pid = getpid(); + char cmd_buffer[64] = {0}; + sprintf(cmd_buffer, "grep /dev/mali /proc/%i/maps", pid); + system(cmd_buffer); + sprintf(cmd_buffer, "ls -l /proc/%i/fd", pid); + system(cmd_buffer); + } + + if (s->mali_fd > 0) + return close(s->mali_fd) == 0; + return true; +} + +static bool +get_version(struct state *s, struct test *t) +{ + struct kbase_ioctl_version_check ver = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_VERSION_CHECK, &ver); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_VERSION_CHECK)"); + return false; + } + + if (pr) + printf("Major %i Minor %i: ", ver.major, ver.minor); + return true; +} + +static bool +set_flags(struct state *s, struct test *t) +{ + struct kbase_ioctl_set_flags flags = { + .create_flags = 0 + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_SET_FLAGS, &flags); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_SET_FLAGS)"); + return false; + } + return true; +} + +static bool +mmap_tracking(struct state *s, struct test *t) +{ + s->tracking_region = mmap(NULL, s->page_size, PROT_NONE, + MAP_SHARED, s->mali_fd, + BASE_MEM_MAP_TRACKING_HANDLE); + + if (s->tracking_region == MAP_FAILED) { + perror("mmap(BASE_MEM_MAP_TRACKING_HANDLE)"); + s->tracking_region = NULL; + return false; + } + return true; +} + +static bool +munmap_tracking(struct state *s, struct test *t) +{ + if (s->tracking_region) + return munmap(s->tracking_region, s->page_size) == 0; + return true; +} + +static bool +get_gpuprops(struct state *s, struct test *t) +{ + struct kbase_ioctl_get_gpuprops props = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(0))"); + return false; + } else if (!ret) { + fprintf(stderr, "GET_GPUPROPS returned zero size\n"); + return false; + } + + s->gpuprops_size = ret; + s->gpuprops = calloc(s->gpuprops_size, 1); + + props.size = s->gpuprops_size; + props.buffer = (uint64_t)(uintptr_t) s->gpuprops; + + ret = ioctl(s->mali_fd, KBASE_IOCTL_GET_GPUPROPS, &props); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_GET_GPUPROPS(size))"); + return false; + } + + return true; +} + +static bool +free_gpuprops(struct state *s, struct test *t) +{ + free(s->gpuprops); + return true; +} + +static bool +get_gpu_id(struct state *s, struct test *t) +{ + uint64_t gpu_id = pan_get_gpuprop(s, KBASE_GPUPROP_PRODUCT_ID); + if (!gpu_id) + return false; + s->gpu_id = gpu_id; + + uint16_t maj = gpu_id >> 12; + uint16_t min = (gpu_id >> 8) & 0xf; + uint16_t rev = (gpu_id >> 4) & 0xf; + + uint16_t product = gpu_id & 0xf; + uint16_t prod = product | ((maj & 1) << 4); + + const char *names[] = { + [1] = "TDUX", + [2] = "G710", + [3] = "G510", + [4] = "G310", + [7] = "G610", + [16 + 2] = "G715", /* TODO: Immortalis instead of Mali? */ + [16 + 3] = "G615", + }; + const char *name = (prod < ARRAY_SIZE(names)) ? 
names[prod] : NULL; + if (!name) + name = "unknown"; + + if (pr) + printf("v%i.%i.%i Mali-%s (%i): ", maj, min, rev, name, product); + + if (maj < 10) { + printf("not v10 or later: "); + return false; + } + + return true; +} + +static bool +get_coherency_mode(struct state *s, struct test *t) +{ + uint64_t mode = pan_get_gpuprop(s, KBASE_GPUPROP_RAW_COHERENCY_MODE); + + const char *modes[] = { + [0] = "ACE-Lite", + [1] = "ACE", + [31] = "None", + }; + const char *name = (mode < ARRAY_SIZE(modes)) ? modes[mode] : NULL; + if (!name) + name = "Unknown"; + + if (pr) + printf("0x%"PRIx64" (%s): ", mode, name); + return true; +} + +static bool +get_csf_caps(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_get_glb_iface iface = { 0 }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(0))"); + return false; + } + + int ver_maj = iface.out.glb_version >> 24; + int ver_min = (iface.out.glb_version >> 16) & 0xff; + int ver_rev = iface.out.glb_version & 0xffff; + + if (pr) + printf("v%i.%i.%i: feature mask 0x%x, %i groups, %i total: ", + ver_maj, ver_min, ver_rev, iface.out.features, + iface.out.group_num, iface.out.total_stream_num); + + unsigned group_num = iface.out.group_num; + unsigned stream_num = iface.out.total_stream_num; + + struct basep_cs_group_control *group_data = + calloc(group_num, sizeof(*group_data)); + + struct basep_cs_stream_control *stream_data = + calloc(stream_num, sizeof(*stream_data)); + + iface = (union kbase_ioctl_cs_get_glb_iface) { + .in = { + .max_group_num = group_num, + .max_total_stream_num = stream_num, + .groups_ptr = (uintptr_t) group_data, + .streams_ptr = (uintptr_t) stream_data, + } + }; + + ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_GET_GLB_IFACE, &iface); + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_GET_GLB_IFACE(size))"); + + free(group_data); + free(stream_data); + + return false; + } + + unsigned print_groups = pr ? group_num : 0; + unsigned print_streams = pr ? 
stream_num : 0; + + for (unsigned i = 0; i < print_groups; ++i) { + if (i && !memcmp(group_data + i, group_data + i - 1, sizeof(*group_data))) + continue; + + fprintf(stderr, "Group %i-: feature mask 0x%x, %i streams\n", + i, group_data[i].features, group_data[i].stream_num); + } + + for (unsigned i = 0; i < print_streams; ++i) { + if (i && !memcmp(stream_data + i, stream_data + i - 1, sizeof(*stream_data))) + continue; + + unsigned reg = stream_data[i].features & 0xff; + unsigned score = (stream_data[i].features >> 8) & 0xff; + unsigned feat = stream_data[i].features >> 16; + + fprintf(stderr, "Stream %i-: 0x%x work registers, %i scoreboards, iterator mask: 0x%x\n", + i, reg, score, feat); + } + + free(group_data); + free(stream_data); + + return true; +} + +static bool +mmap_user_reg(struct state *s, struct test *t) +{ + s->csf_user_reg = mmap(NULL, s->page_size, PROT_READ, + MAP_SHARED, s->mali_fd, + BASEP_MEM_CSF_USER_REG_PAGE_HANDLE); + + if (s->csf_user_reg == MAP_FAILED) { + perror("mmap(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE)"); + s->csf_user_reg = NULL; + return false; + } + return true; +} + +static bool +munmap_user_reg(struct state *s, struct test *t) +{ + if (s->csf_user_reg) + return munmap(s->csf_user_reg, s->page_size) == 0; + return true; +} + +static bool +init_mem_exec(struct state *s, struct test *t) +{ + struct kbase_ioctl_mem_exec_init init = { + .va_pages = 0x100000, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_EXEC_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_EXEC_INIT)"); + return false; + } + return true; +} + +static bool +init_mem_jit(struct state *s, struct test *t) +{ + struct kbase_ioctl_mem_jit_init init = { + .va_pages = 1 << 25, + .max_allocations = 255, + .phys_pages = 1 << 25, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_JIT_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_JIT_INIT)"); + return false; + } + return true; +} + +static bool +stream_create(struct state *s, struct test *t) +{ + struct kbase_ioctl_stream_create stream = { + .name = "stream" + }; + + s->tl_fd = ioctl(s->mali_fd, KBASE_IOCTL_STREAM_CREATE, &stream); + + if (s->tl_fd == -1) { + perror("ioctl(KBASE_IOCTL_STREAM_CREATE)"); + return false; + } + return true; + +} + +static bool +stream_destroy(struct state *s, struct test *t) +{ + if (s->tl_fd > 0) + return close(s->tl_fd) == 0; + return true; +} + +static bool +tiler_heap_create(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_tiler_heap_init init = { + .in = { + .chunk_size = 1 << 21, + .initial_chunks = 5, + .max_chunks = 200, + .target_in_flight = 65535, + } + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_INIT, &init); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_INIT)"); + return false; + } + + s->tiler_heap_va = init.out.gpu_heap_va; + s->tiler_heap_header = init.out.first_chunk_va; + printf("heap va: %"PRIx64", heap header: %"PRIx64"\n", + s->tiler_heap_va, s->tiler_heap_header); + + return true; +} + +static bool +tiler_heap_term(struct state *s, struct test *t) +{ + if (!s->tiler_heap_va) + return true; + + struct kbase_ioctl_cs_tiler_heap_term term = { + .gpu_heap_va = s->tiler_heap_va + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_TILER_HEAP_TERM, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_TILER_HEAP_TERM)"); + return false; + } + return true; +} + +static bool +cs_group_create(struct state *s, struct test *t) +{ + union kbase_ioctl_cs_queue_group_create_1_6 create = { + .in = { + /* Mali *still* only 
supports a single tiler unit */ + .tiler_mask = 1, + .fragment_mask = ~0ULL, + .compute_mask = ~0ULL, + + .cs_min = CS_QUEUE_COUNT, + + .priority = 1, + .tiler_max = 1, + .fragment_max = 64, + .compute_max = 64, + } + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6, &create); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6)"); + return false; + } + + s->csg_handle = create.out.group_handle; + s->csg_uid = create.out.group_uid; + + if (pr) + printf("CSG handle: %i UID: %i: ", s->csg_handle, s->csg_uid); + + /* Should be at least 1 */ + if (!s->csg_uid) + abort(); + + return true; +} + +static bool +cs_group_term(struct state *s, struct test *t) +{ + if (!s->csg_uid) + return true; + + struct kbase_ioctl_cs_queue_group_term term = { + .group_handle = s->csg_handle + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE, &term); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_GROUP_TERMINATE)"); + return false; + } + return true; +} + +static bool +decode_init(struct state *s, struct test *t) +{ + pandecode_initialize(true); + return true; +} + +static bool +decode_close(struct state *s, struct test *t) +{ + pandecode_close(); + return true; +} + +static struct panfrost_ptr +alloc_ioctl(struct state *s, union kbase_ioctl_mem_alloc *a) +{ + struct panfrost_ptr p = {0}; + + uint64_t va_pages = a->in.va_pages; + uint64_t flags = a->in.flags; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_MEM_ALLOC, a); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_MEM_ALLOC)"); + return p; + } + + if ((flags & BASE_MEM_SAME_VA) && + (!(a->out.flags & BASE_MEM_SAME_VA) || + a->out.gpu_va != 0x41000)) { + + fprintf(stderr, "Flags: 0x%"PRIx64", VA: 0x%"PRIx64"\n", + (uint64_t) a->out.flags, (uint64_t) a->out.gpu_va); + return p; + } + + void *ptr = mmap(NULL, s->page_size * va_pages, + PROT_READ | PROT_WRITE, MAP_SHARED, + s->mali_fd, a->out.gpu_va); + + if (ptr == MAP_FAILED) { + perror("mmap(GPU BO)"); + return p; + } + + uint64_t gpu_va = (a->out.flags & BASE_MEM_SAME_VA) ? 
+ (uintptr_t) ptr : a->out.gpu_va; + + pandecode_inject_mmap(gpu_va, ptr, s->page_size * va_pages, NULL); + + p.cpu = ptr; + p.gpu = gpu_va; + + memset(p.cpu, 0, s->page_size * va_pages); + + return p; +} + +static struct panfrost_ptr +alloc_mem(struct state *s, uint64_t size, uint64_t flags) +{ + unsigned pages = size / s->page_size; + + union kbase_ioctl_mem_alloc a = { + .in = { + .va_pages = pages, + .commit_pages = pages, + .extension = 0, + .flags = flags, + } + }; + + return alloc_ioctl(s, &a); +} + +static void +alloc_redzone(struct state *s, struct panfrost_ptr p, uint64_t alloc_size) +{ + mmap(p.cpu - s->page_size, 1, + PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, + -1, 0); + + mmap(p.cpu + alloc_size, 1, + PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, + -1, 0); +} + +static bool +alloc(struct state *s, struct test *t) +{ + struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset); + + *ptr = alloc_mem(s, s->page_size, t->flags); + + volatile int *p = (volatile int *) ptr->cpu; + *p = 0x12345; + if (*p != 0x12345) { + printf("Error reading from allocated memory at %p\n", p); + return false; + } + *p = 0; + cache_clean(p); + + return true; +} + +static bool +dealloc(struct state *s, struct test *t) +{ + struct panfrost_ptr *ptr = DEREF_STATE(s, t->offset); + + if (ptr->cpu) + return munmap(ptr->cpu, s->page_size) == 0; + return true; +} + +static bool +cs_queue_create(struct state *s, struct test *t) +{ + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + + /* Read/write from CPU/GPU, nothing special + * like coherency */ + s->cs_mem[i] = alloc_mem(s, CS_QUEUE_SIZE, 0x200f); + s->cs[i].ptr = s->cs_mem[i].cpu; + + if (!s->cs_mem[i].cpu) + return false; + } + + return true; +} + +static bool +cs_queue_free(struct state *s, struct test *t) +{ + bool pass = true; + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + if (s->cs_mem[i].cpu && munmap(s->cs_mem[i].cpu, CS_QUEUE_SIZE)) + pass = false; + } + return pass; +} + +static bool +cs_queue_register(struct state *s, struct test *t) +{ + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + struct kbase_ioctl_cs_queue_register reg = { + .buffer_gpu_addr = s->cs_mem[i].gpu, + .buffer_size = CS_QUEUE_SIZE, + .priority = 1, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_REGISTER, ®); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_REGISTER)"); + return false; + } + + union kbase_ioctl_cs_queue_bind bind = { + .in = { + .buffer_gpu_addr = s->cs_mem[i].gpu, + .group_handle = s->csg_handle, + .csi_index = i, + } + }; + + ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_BIND, &bind); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_BIND)"); + } + + s->cs_user_io[i] = + mmap(NULL, + s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES, + PROT_READ | PROT_WRITE, MAP_SHARED, + s->mali_fd, bind.out.mmap_handle); + + if (s->cs_user_io[i] == MAP_FAILED) { + perror("mmap(CS USER IO)"); + s->cs_user_io[i] = NULL; + return false; + } + } + return true; +} + +static bool +cs_queue_term(struct state *s, struct test *t) +{ + bool pass = true; + + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + if (s->cs_user_io[i] && + munmap(s->cs_user_io[i], + s->page_size * BASEP_QUEUE_NR_MMAP_USER_PAGES)) + pass = false; + + struct kbase_ioctl_cs_queue_terminate term = { + .buffer_gpu_addr = s->cs_mem[i].gpu, + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_TERMINATE, + &term); + + if (ret == -1) + pass = false; + } + return pass; +} + +#define CS_RING_DOORBELL(s, i) \ + *((uint32_t *)(s->cs_user_io[i])) = 1 + 
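For orientation, the doorbell macro above and the two register macros that follow it hard-code the layout of the per-queue user I/O mapping obtained from KBASE_IOCTL_CS_QUEUE_BIND in cs_queue_register(). A minimal sketch of that assumed layout, restating the offsets the macros use (the page roles are an interpretation of this test code, not confirmed elsewhere in the patch):

/* Sketch only: the three pages mapped per queue, matching
 * CS_RING_DOORBELL (page 0), CS_WRITE_REGISTER (page 1) and
 * CS_READ_REGISTER (page 2). BASEP_QUEUE_NR_MMAP_USER_PAGES is assumed
 * to be 3, as in cs_queue_register()/cs_queue_term(). */
enum cs_user_io_page {
   CS_USER_IO_DOORBELL = 0, /* write 1 here to ring the queue doorbell  */
   CS_USER_IO_INPUT    = 1, /* user-written registers such as CS_INSERT */
   CS_USER_IO_OUTPUT   = 2, /* firmware-updated registers such as
                             * CS_EXTRACT and CS_ACTIVE                 */
};

/* Equivalent of CS_WRITE_REGISTER(s, i, r, v) under that layout: */
static inline void
cs_user_write(struct state *s, unsigned i, unsigned r, uint64_t v)
{
   *(volatile uint64_t *)((uint8_t *)s->cs_user_io[i] +
                          s->page_size * CS_USER_IO_INPUT + r) = v;
}
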
+#define CS_READ_REGISTER(s, i, r) \ + *((uint64_t *)(s->cs_user_io[i] + s->page_size * 2 + r)) + +#define CS_WRITE_REGISTER(s, i, r, v) \ + *((uint64_t *)(s->cs_user_io[i] + s->page_size + r)) = v + +static void +submit_cs(struct state *s, unsigned i) +{ + uintptr_t p = (uintptr_t) s->cs[i].ptr; + unsigned pad = (-p) & 63; + memset(s->cs[i].ptr, 0, pad); + + unsigned last_offset = s->cs_last_submit[i]; + + unsigned insert_offset = p + pad - (uintptr_t) s->cs_mem[i].cpu; + insert_offset %= CS_QUEUE_SIZE; + + for (unsigned o = last_offset; o != insert_offset; + o = (o + 64) % CS_QUEUE_SIZE) + cache_clean(s->cs_mem[i].cpu + o); + + // TODO: Handle wraparound + // TODO: Provide a persistent buffer for pandecode to use? + if (pr) { + dump_start(stderr); + pandecode_cs(s->cs_mem[i].gpu + last_offset, + insert_offset - last_offset, s->gpu_id); + dump_end(stderr); + } + + cache_barrier(); + + CS_WRITE_REGISTER(s, i, CS_INSERT, insert_offset); + s->cs[i].ptr = s->cs_mem[i].cpu + insert_offset; + + memory_barrier(); + CS_RING_DOORBELL(s, i); + memory_barrier(); + + s->cs_last_submit[i] = insert_offset; +} + +/* Returns true if there was a timeout */ +static bool +wait_event(struct state *s, unsigned timeout_ms) +{ + struct pollfd fd = { + .fd = s->mali_fd, + .events = POLLIN, + }; + + int ret = poll(&fd, 1, timeout_ms); + + if (ret == -1) { + perror("poll(mali_fd)"); + return true; + } + + /* Timeout */ + if (ret == 0) + return true; + + struct base_csf_notification event; + ret = read(s->mali_fd, &event, sizeof(event)); + + if (ret == -1) { + perror("read(mali_fd)"); + return true; + } + + if (ret != sizeof(event)) { + fprintf(stderr, "read(mali_fd) returned %i, expected %i!\n", + ret, (int) sizeof(event)); + return false; + } + + switch (event.type) { + case BASE_CSF_NOTIFICATION_EVENT: + fprintf(stderr, "Notification event!\n"); + return false; + + case BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR: + break; + + case BASE_CSF_NOTIFICATION_CPU_QUEUE_DUMP: + fprintf(stderr, "No event from mali_fd!\n"); + return false; + + default: + fprintf(stderr, "Unknown event type!\n"); + return false; + } + + struct base_gpu_queue_group_error e = event.payload.csg_error.error; + + switch (e.error_type) { + case BASE_GPU_QUEUE_GROUP_ERROR_FATAL: { + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue group error: status 0x%x " + "sideband 0x%"PRIx64"\n", + e.payload.fatal_group.status, + (uint64_t) e.payload.fatal_group.sideband); + break; + } + case BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL: { + unsigned queue = e.payload.fatal_queue.csi_index; + + // See CS_FATAL_EXCEPTION_* in mali_gpu_csf_registers.h + fprintf(stderr, "Queue %i error: status 0x%x " + "sideband 0x%"PRIx64":", + queue, e.payload.fatal_queue.status, + (uint64_t) e.payload.fatal_queue.sideband); + + unsigned e = CS_READ_REGISTER(s, queue, CS_EXTRACT); + pandecode_cs(s->cs_mem[queue].gpu + e, 8, s->gpu_id); + + break; + } + + case BASE_GPU_QUEUE_GROUP_ERROR_TIMEOUT: + fprintf(stderr, "Command stream timeout!\n"); + break; + case BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM: + fprintf(stderr, "Command stream OOM!\n"); + break; + default: + fprintf(stderr, "Unknown error type!\n"); + } + + return false; +} + +static bool +kick_queue(struct state *s, unsigned i) +{ + struct kbase_ioctl_cs_queue_kick kick = { + .buffer_gpu_addr = s->cs_mem[i].gpu + }; + + int ret = ioctl(s->mali_fd, KBASE_IOCTL_CS_QUEUE_KICK, &kick); + + if (ret == -1) { + perror("ioctl(KBASE_IOCTL_CS_QUEUE_KICK)"); + return false; + } + + return true; +} + 
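Taken together, submit_cs(), kick_queue() and wait_cs() (defined just below) form the whole submission path the tests rely on. A hypothetical minimal caller, assuming the queues have already been created, registered and started as the tests further down do; the function name and the choice of scratch register 0x48 are illustrative only:

static bool wait_cs(struct state *s, unsigned i); /* defined below */

/* Sketch only: emit one MOV into queue 0, publish it, and wait for the
 * firmware to consume it. */
static bool
example_run_mov(struct state *s)
{
   /* Move an immediate into scratch register 0x48, as cs_simple() does */
   pan_emit_cs_32(s->cs, 0x48, 0x1234);

   /* Pad to a cache line, clean the ring buffer, advance CS_INSERT and
    * ring the doorbell */
   submit_cs(s, 0);

   /* Tell kbase that the queue has new work */
   if (!kick_queue(s, 0))
      return false;

   /* Poll CS_EXTRACT until it reaches CS_INSERT, handling queue events */
   return wait_cs(s, 0);
}
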
+static bool +wait_cs(struct state *s, unsigned i) +{ + unsigned extract_offset = (void *) s->cs[i].ptr - s->cs_mem[i].cpu; + + unsigned timeout_ms = 500; + + bool done_kick = false; + + while (CS_READ_REGISTER(s, i, CS_EXTRACT) != extract_offset) { + if (wait_event(s, timeout_ms)) { + if (pr) + fprintf(stderr, "Event wait timeout!\n"); + + unsigned e = CS_READ_REGISTER(s, i, CS_EXTRACT); + unsigned a = CS_READ_REGISTER(s, i, CS_ACTIVE); + + if (e != extract_offset) { + fprintf(stderr, "CS_EXTRACT (%i) != %i, " + "CS_ACTIVE (%i) on queue %i:", + e, extract_offset, a, i); + /* Decode two instructions instead? */ + pandecode_cs(s->cs_mem[i].gpu + e, 8, 1); + + if (done_kick) { + cache_barrier(); + return false; + } else { + fprintf(stderr, "Kicking queue\n"); + kick_queue(s, i); + done_kick = true; + } + } + } + } + + cache_barrier(); + + return true; +} + +static bool +cs_init(struct state *s, struct test *t) +{ + uint64_t event_init[] = { 1, 1, 1 }; + memcpy(s->allocations.event.cpu, event_init, sizeof(event_init)); + + for (unsigned i = 0; i < CS_QUEUE_COUNT; ++i) { + CS_WRITE_REGISTER(s, i, CS_INSERT, 0); + pan_pack_ins(s->cs + i, CS_RESOURCES, cfg) { + switch (i) { + case 0: cfg.compute = true; break; + case 1: cfg.compute = true; cfg.fragment = true; break; + case 2: cfg.compute = true; cfg.tiler = true; cfg.idvs = true; break; + case 3: cfg.fragment = true; break; + } + } + pan_pack_ins(s->cs + i, CS_SLOT, cfg) { + cfg.index = 2; + } + pan_emit_cs_48(s->cs + i, CS_EVENT_REGISTER, + s->allocations.event.gpu); + submit_cs(s, i); + + if (!kick_queue(s, i)) + return false; + } + + return true; +} + +static struct panfrost_ptr * +buffers_elem(struct util_dynarray *buffers, unsigned index) +{ + unsigned size = util_dynarray_num_elements(buffers, + struct panfrost_ptr); + + if (index >= size) { + unsigned grow = index + 1 - size; + + memset(util_dynarray_grow(buffers, struct panfrost_ptr, grow), + 0, grow * sizeof(struct panfrost_ptr)); + } + + return util_dynarray_element(buffers, struct panfrost_ptr, index); +} + +static void +dump_hex64(FILE *fp, uint64_t *values, unsigned size) +{ + bool zero = false; + for (unsigned i = 0; i < size / 8; i += 2) { + uint64_t a = values[i]; + uint64_t b = values[i + 1]; + + if (!a && !b) { + if (!zero) + fprintf(fp, "%06X *\n", i * 8); + zero = true; + continue; + } + + zero = false; + + fprintf(fp, "%06X %16"PRIx64" %16"PRIx64"\n", + i * 8, a, b); + } + + fprintf(fp, "\n"); +} + +static void +dump_delta(FILE *fp, uint64_t *values, unsigned size) +{ + uint64_t old = 0; + bool zero = false; + bool el = false; + for (unsigned i = 0; i < size / 8; ++i) { + uint64_t val = values[i]; + int64_t delta = val - old; + + if (!zero || delta) { + fprintf(fp, "%"PRIi64"\n", delta); + el = false; + } else if (!el) { + fprintf(fp, "...\n"); + el = true; + } + + old = val; + zero = (delta == 0); + } +} + +static void +dump_tiler(FILE *fp, uint8_t *values, unsigned size) +{ + fflush(stdout); + FILE *stream = popen("tiler-hex-read", "w"); + // TODO! + fprintf(stream, "width %i\nheight %i\nmask %i\nvaheap %p\nsize %i\n", + 256, 256, 6, values, size); + pan_hexdump(stream, values, size, false); + pclose(stream); +} + +/* TODO: Pass in a filename? 
*/ +static void +dump_filehex(uint8_t *values, unsigned size) +{ + char buf[1024] = {0}; + + for (unsigned i = 0; i < 10000; ++i) { + snprintf(buf, 1024, "/tmp/fdump.%05i", i); + + int fd = open(buf, O_WRONLY | O_CREAT | O_EXCL, 0666); + if (fd == -1) + continue; + + FILE *fp = fdopen(fd, "w"); + + fprintf(fp, "%p, %u:\n", values, size); + pan_hexdump(fp, values, size, false); + + fclose(fp); /* will close fd */ + break; + } +} + +static void +dump_heatmap(FILE *fp, uint8_t *values, unsigned size, + unsigned gran, unsigned length, unsigned stride) +{ + unsigned sum = 0; + unsigned gr = 0; + unsigned st = 0; + unsigned ll = 0; + + while (size && !values[size - 1]) + --size; + + for (unsigned i = 0; i < size; ++i) { + sum += values[i]; + + if (++gr == gran) { + fprintf(fp, " %02x", sum & 0xff); + gr = 0; + sum = 0; + } + + if (++ll == length) { + i += stride - length; + fprintf(fp, "\n"); + st = 0; + ll = 0; + } else if (++st == stride) { + fprintf(fp, "\n"); + st = 0; + } + } + fprintf(fp, " %02x\n", sum & 0xff); +} + +static bool +cs_test(struct state *s, struct test *t) +{ + if (s->argc < 2) + return true; + + FILE *f = fopen(s->argv[1], "r"); + + struct util_dynarray buffers; + util_dynarray_init(&buffers, NULL); + + for (;;) { + char *line = NULL; + size_t sz = 0; + if (getline(&line, &sz, f) == -1) + break; + + unsigned long src, dst, offset, src_offset, size, iter, flags; + unsigned long gran, stride, length; + int read; + char *mode; + + if (sscanf(line, "rel%ms %lu+%lu %lu+%lu", + &mode, &dst, &offset, &src, &src_offset) == 5) { + + if (strcmp(mode, "oc") && strcmp(mode, "split")) { + fprintf(stderr, "Unknown relocation mode 'rel%s'\n", mode); + } + bool split = (mode[0] == 's'); + free(mode); + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + struct panfrost_ptr *d = buffers_elem(&buffers, dst); + + if (!s->gpu || !d->gpu) { + fprintf(stderr, "relocating to buffer that doesn't exist!\n"); + } + + uint64_t *dest = d->cpu + offset; + uint64_t value = s->gpu + src_offset; + if (split) { + dest[0] |= (uint32_t) value; + dest[1] |= (uint32_t) (value >> 32); + } else { + *dest |= value; + } + + } else if (sscanf(line, "buffer %lu %lu %lx %n", + &dst, &size, &flags, &read) == 3) { + line += read; + + struct panfrost_ptr buffer = + alloc_mem(s, ALIGN_POT(size, s->page_size), + flags); + + alloc_redzone(s, buffer, ALIGN_POT(size, s->page_size)); + + *buffers_elem(&buffers, dst) = buffer; + + //printf("buffer %lu == 0x%lx\n", dst, buffer.gpu); + + uint64_t *fill = buffer.cpu; + + for (unsigned i = 0; i < size / 8; ++i) { + read = 0; + unsigned long long val = 0; + if (sscanf(line, "%Lx %n", &val, &read) != 1) + break; + line += read; + fill[i] = val; + } + + cache_clean_range(buffer.cpu, size); + + } else if (sscanf(line, "exe %n %lu %lu %lu", + &read, &iter, &dst, &size) == 3) { + line += read; + + unsigned iter_mask = 0; + + for (;;) { + read = 0; + if (sscanf(line, "%lu %lu %lu %n", + &iter, &dst, &size, &read) != 3) + break; + line += read; + + struct panfrost_ptr *d = + buffers_elem(&buffers, dst); + + /* TODO: Check 'size' against buffer size */ + + pandecode_cs(d->gpu, size, s->gpu_id); + + if (iter > 3) { + fprintf(stderr, + "execute on out-of-bounds " + "iterator\n"); + continue; + } + + memcpy(s->cs[iter].ptr, d->cpu, size); + s->cs[iter].ptr += size / 8; + + iter_mask |= (1 << iter); + } + + u_foreach_bit(i, iter_mask) + submit_cs(s, i); + + u_foreach_bit(i, iter_mask) + kick_queue(s, i); + + u_foreach_bit(i, iter_mask) + wait_cs(s, i); + + } else if (sscanf(line, "dump 
%lu %lu %lu %ms", + &src, &offset, &size, &mode) == 4) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "dumping buffer that doesn't exist!\n"); + + cache_invalidate_range(s->cpu + offset, size); + + if (!strcmp(mode, "hex")) + pan_hexdump(stdout, s->cpu + offset, size, true); + else if (!strcmp(mode, "hex64")) + dump_hex64(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "delta")) + dump_delta(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "tiler")) + dump_tiler(stdout, s->cpu + offset, size); + else if (!strcmp(mode, "filehex")) + dump_filehex(s->cpu + offset, size); + + free(mode); + + } else if (sscanf(line, "heatmap %lu %lu %lu %lu %lu %lu", + &src, &offset, &size, + &gran, &length, &stride) == 6) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "dumping buffer that doesn't exist!\n"); + + cache_invalidate_range(s->cpu + offset, size); + + dump_heatmap(stdout, s->cpu + offset, size, + gran, length, stride); + + } else if (sscanf(line, "memset %lu %lu %lu %lu", + &src, &offset, &gran, &size) == 4) { + + struct panfrost_ptr *s = buffers_elem(&buffers, src); + + if (!s->gpu) + fprintf(stderr, "memset on buffer that doesn't exist!\n"); + + memset(s->cpu + offset, gran, size); + cache_clean_range(s->cpu + offset, size); + + } else if (sscanf(line, "sleep %lu", &size) == 1) { + + usleep(size * 1000); + + } else if (strcmp(line, "td\n") == 0 || strcmp(line, "td") == 0) { + + void *ptr; + + ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, + s->tiler_heap_header); + pan_hexdump(stdout, ptr, 4096, false); + pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); + munmap(ptr, 1 << 21); + + ptr = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE, MAP_SHARED, s->mali_fd, + s->tiler_heap_header + (1 << 21)); + pan_hexdump(stdout, ptr, 4096, false); + pan_hexdump(stdout, ptr + (1 << 21) - 4096, 4096, false); + munmap(ptr, 1 << 21); + + } else { + fprintf(stderr, "unknown command '%s'\n", line); + } + } + + /* Skip following tests */ + return false; +} + +static void +pan_cs_evadd(pan_command_stream *c, unsigned offset, unsigned value) +{ + pan_emit_cs_32(c, 0x5e, value); + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = offset; + cfg.src = 0x5a; + cfg.dest = 0x5c; + } + pan_pack_ins(c, CS_EVADD, cfg) { + cfg.value = 0x5e; + cfg.addr = 0x5c; + } +} + +static bool +cs_simple(struct state *s, struct test *t) +{ + unsigned queue = t->vertex ? 2 : 0; + + pan_command_stream *c = s->cs + queue; + + unsigned dest = t->invalid ? 
0x65 : 0x48; + + pan_emit_cs_32(c, dest, 0x1234); + pan_cs_evadd(c, 0, 1); + + submit_cs(s, queue); + return wait_cs(s, queue); +} + +static bool +cs_store(struct state *s, struct test *t) +{ + pan_command_stream *c = s->cs; + + uint32_t *dest = s->allocations.ev2.cpu + 240; + mali_ptr dest_va = s->allocations.ev2.gpu + 240; + uint32_t value = 1234; + uint32_t add = 4320000; + + *dest = 0; + cache_clean(dest); + + unsigned addr_reg = 0x48; + unsigned value_reg = 0x4a; + + if (t->invalid) + dest_va = 0xfdcba9876543; + + pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = (1 << 1); } + pan_emit_cs_48(c, addr_reg, dest_va); + pan_emit_cs_32(c, value_reg, value); + + if (t->add) { + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = add; + cfg.src = value_reg; + cfg.dest = value_reg; + } + value += add; + } + + pan_pack_ins(c, CS_STR, cfg) { + cfg.addr = addr_reg; + cfg.register_base = value_reg; + cfg.register_mask = 1; + } + pan_cs_evadd(c, 0, 1); + + submit_cs(s, 0); + wait_cs(s, 0); + + cache_invalidate(dest); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = *dest; + + if (t->invalid && result == value) { + printf("Got %i, did not expect %i: ", result, value); + return false; + } else if (result != value) { + printf("Got %i, expected %i: ", result, value); + return false; + } + + return true; +} + +static void +emit_cs_call(pan_command_stream *c, mali_ptr va, void *start, void *end) +{ + cache_clean_range(start, end - start); + + pan_emit_cs_48(c, 0x48, va); + pan_emit_cs_32(c, 0x4a, end - start); + pan_pack_ins(c, CS_CALL, cfg) { + cfg.address = 0x48; + cfg.length = 0x4a; + } +} + +static bool +cs_sub(struct state *s, struct test *t) +{ + pan_command_stream *c = s->cs; + pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; + mali_ptr cs_va = s->allocations.cached.gpu; + + uint32_t *dest = s->allocations.normal.cpu; + mali_ptr dest_va = s->allocations.normal.gpu; + uint32_t value = 4321; + + *dest = 0; + cache_clean(dest); + + unsigned addr_reg = 0x48; + unsigned value_reg = 0x4a; + + void *start = i->ptr; + + pan_emit_cs_ins(c, 0x30, 0x5a0000000000); + + pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } + pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = (1 << 3); } + //pan_emit_cs_ins(i, 0x31, 0); + + pan_emit_cs_48(i, addr_reg, dest_va); + pan_emit_cs_32(i, value_reg, value); + //pan_emit_cs_ins(i, 0x25, 0x01484a00000005ULL); + pan_pack_ins(i, CS_STR, cfg) { + cfg.addr = addr_reg; + cfg.register_base = value_reg; + cfg.register_mask = 1; + } + //pan_emit_cs_ins(i, 0x09, 0); + //pan_emit_cs_ins(i, 0x31, 0x100000000); + + //pan_emit_cs_ins(i, 0x24, 0x024a0000f80211ULL); + + /* + pan_pack_ins(i, CS_STR_32, cfg) { + cfg.unk_1 = 1; + cfg.unk_2 = 4; + cfg.unk_3 = 1; + cfg.addr = addr_reg; + cfg.value = value_reg; + }*/ + + emit_cs_call(c, cs_va, start, i->ptr); + pan_cs_evadd(c, 0, 1); + + submit_cs(s, 0); + wait_cs(s, 0); + + cache_invalidate(dest); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = *dest; + + if (result != value) { + printf("Got %i, expected %i: ", result, value); + return false; + } + + return true; +} + +static mali_ptr +upload_shader(struct state *s, struct util_dynarray binary) +{ + assert(s->shader_alloc_offset + binary.size < s->page_size); + + mali_ptr va = s->allocations.exec.gpu + s->shader_alloc_offset; + + memcpy(s->allocations.exec.cpu, binary.data, binary.size); + + /* Shouldn't be needed, but just in case... 
*/ + cache_clean_range(s->allocations.exec.cpu, binary.size); + + s->shader_alloc_offset += binary.size; + + return va; +} + +static bool +compute_compile(struct state *s, struct test *t) +{ + nir_builder _b = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, + GENX(pan_shader_get_compiler_options)(), + "mem_store"), *b = &_b; + + nir_ssa_def *ptr = + nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0)); + + nir_ssa_def *value = nir_imm_int(b, 123); + + nir_store_global(b, ptr, 8, value, 1); + + struct panfrost_compile_inputs inputs = { + .gpu_id = s->gpu_id, + .no_ubo_to_push = true, + }; + + struct util_dynarray binary = {0}; + struct pan_shader_info shader_info = {0}; + + GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info); + + dump_start(stderr); + disassemble_valhall(stderr, binary.data, binary.size, true); + dump_end(stderr); + + s->compute_shader = upload_shader(s, binary); + + util_dynarray_fini(&binary); + ralloc_free(b->shader); + + return true; +} + +static struct panfrost_ptr +mem_offset(struct panfrost_ptr ptr, unsigned offset) +{ + ptr.cpu += offset; + ptr.gpu += offset; + return ptr; +} + +static bool +compute_execute(struct state *s, struct test *t) +{ + unsigned queue = t->blit ? 1 : 0; + + pan_command_stream *c = s->cs + queue; + pan_command_stream _i = { .ptr = s->allocations.cached.cpu }, *i = &_i; + mali_ptr cs_va = s->allocations.cached.gpu; + + struct panfrost_ptr dest = s->allocations.normal; + uint32_t value = 123; + + *(uint32_t *) dest.cpu = 0; + cache_clean(dest.cpu); + + struct panfrost_ptr fau = mem_offset(dest, 128); + *(uint64_t *) fau.cpu = dest.gpu; + cache_clean(fau.cpu); + + struct panfrost_ptr local_storage = mem_offset(dest, 192); + pan_pack(local_storage.cpu, LOCAL_STORAGE, _); + cache_clean(local_storage.cpu); + + struct panfrost_ptr shader_program = mem_offset(dest, 256); + pan_pack(shader_program.cpu, SHADER_PROGRAM, cfg) { + cfg.stage = MALI_SHADER_STAGE_COMPUTE; + cfg.primary_shader = true; + cfg.register_allocation = + MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; + cfg.binary = s->compute_shader; + } + cache_clean(shader_program.cpu); + + void *start = i->ptr; + + pan_pack_ins(i, CS_SLOT, cfg) { cfg.index = 3; } + //pan_pack_ins(i, CS_WAIT, cfg) { cfg.slots = 1 << 3; } + + pan_pack_cs(i, COMPUTE_PAYLOAD, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; + + cfg.workgroup_count_x = 1; + cfg.workgroup_count_y = 1; + cfg.workgroup_count_z = 1; + + cfg.compute.shader = shader_program.gpu; + cfg.compute.thread_storage = local_storage.gpu; + + cfg.compute.fau = fau.gpu; + cfg.compute.fau_count = 1; + } + + pan_pack_ins(i, COMPUTE_LAUNCH, _); + + //pan_emit_cs_32(c, 0x54, 1); + //pan_emit_cs_ins(c, 0x24, 0x540000000233); + emit_cs_call(c, cs_va, start, i->ptr); + + pan_emit_cs_32(c, 0x4a, 0); + pan_emit_cs_ins(c, 0x24, 0x024a0000000211ULL); + + pan_emit_cs_48(c, 0x48, dest.gpu); + pan_pack_ins(c, CS_LDR, cfg) { + cfg.offset = 0; + cfg.register_mask = 1; + cfg.addr = 0x48; + cfg.register_base = 0x20; + } + pan_pack_ins(c, CS_WAIT, cfg) { cfg.slots = 1; } + pan_pack_ins(c, CS_ADD_IMM, cfg) { + cfg.value = 1; + cfg.src = 0x20; + cfg.dest = 0x20; + } + pan_pack_ins(c, CS_STR, cfg) { + cfg.offset = 64; + cfg.register_mask = 1; + cfg.addr = 0x48; + cfg.register_base = 0x20; + } + + pan_cs_evadd(c, 0, 1); + + submit_cs(s, queue); + wait_cs(s, queue); + + cache_invalidate(dest.cpu); + cache_barrier(); /* Just in case it's needed */ + uint32_t result = ((uint32_t *)dest.cpu)[0]; + uint32_t result2 = 
((uint32_t *)dest.cpu)[16]; + + if (result != value) { + printf("Got %i, %i, expected %i: ", result, result2, value); + return false; + } + + return true; +} + +static bool +mmu_dump(struct state *s, struct test *t) +{ + unsigned size = 1024 * 1024; + + void *mem = mmap(NULL, size, PROT_READ, MAP_SHARED, + s->mali_fd, BASE_MEM_MMU_DUMP_HANDLE); + if (mem == MAP_FAILED) { + perror("mmap(BASE_MEM_MMU_DUMP_HANDLE)"); + return false; + } + + pan_hexdump(stdout, mem, size, true); + + return true; +} + +#define SUBTEST(s) { .label = #s, .subtests = s, .sub_length = ARRAY_SIZE(s) } + +#define STATE(item) .offset = offsetof(struct state, item) + +#define ALLOC(item) .offset = offsetof(struct state, allocations.item) +#define ALLOC_TEST(label, item, f) { alloc, dealloc, label, ALLOC(item), .flags = f } + +struct test kbase_main[] = { + { open_kbase, close_kbase, "Open kbase device" }, + { get_version, NULL, "Check version" }, + { set_flags, NULL, "Set flags" }, + { mmap_tracking, munmap_tracking, "Map tracking handle" }, + { get_gpuprops, free_gpuprops, "Get GPU properties" }, + { get_gpu_id, NULL, "GPU ID" }, + { get_coherency_mode, NULL, "Coherency mode" }, + { get_csf_caps, NULL, "CSF caps" }, + { mmap_user_reg, munmap_user_reg, "Map user register page" }, + { init_mem_exec, NULL, "Initialise EXEC_VA zone" }, + { init_mem_jit, NULL, "Initialise JIT allocator" }, + { stream_create, stream_destroy, "Create synchronisation stream" }, + { tiler_heap_create, tiler_heap_term, "Create chunked tiler heap" }, + { cs_group_create, cs_group_term, "Create command stream group" }, + { decode_init, decode_close, "Initialize pandecode" }, + + /* Flags are named in mali_base_csf_kernel.h, omitted for brevity */ + ALLOC_TEST("Allocate normal memory", normal, 0x200f), + ALLOC_TEST("Allocate exectuable memory", exec, 0x2017), + ALLOC_TEST("Allocate coherent memory", coherent, 0x280f), + ALLOC_TEST("Allocate cached memory", cached, 0x380f), + ALLOC_TEST("Allocate CSF event memory", event, 0x8200f), + ALLOC_TEST("Allocate CSF event memory 2", ev2, 0x8200f), + + /* These three tests are run for every queue, but later ones are not */ + { cs_queue_create, cs_queue_free, "Create command stream queues" }, + { cs_queue_register, cs_queue_term, "Register command stream queues" }, + + { cs_test, NULL, "Test command stream" }, + + { cs_init, NULL, "Initialise and start command stream queues" }, + { cs_simple, NULL, "Execute MOV command" }, + { cs_simple, NULL, "Execute MOV command (again)" }, + { cs_simple, NULL, "Execute MOV command (vertex)", .vertex = true }, + //{ cs_simple, NULL, "Execute MOV command (vertex, invalid)", .invalid = true, .vertex = true }, + { cs_simple, NULL, "Execute MOV command (vertex, again)", .vertex = true }, + { cs_store, NULL, "Execute STR command" }, + //{ cs_store, NULL, "Execute STR command to invalid address", .invalid = true }, + { cs_store, NULL, "Execute ADD command", .add = true }, + { cs_sub, NULL, "Execute STR on iterator" }, + + { compute_compile, NULL, "Compile a compute shader" }, + { compute_execute, NULL, "Execute a compute shader" }, + { compute_execute, NULL, "Execute compute on blit queue", .blit = true }, + + //{ mmu_dump, NULL, "Dump MMU pagetables" }, +}; + +static void +do_test_list(struct state *s, struct test *tests, unsigned length); + +static void +cleanup_test_list(struct state *s, struct test *tests, unsigned length) +{ + for (unsigned i = length; i > 0; --i) { + unsigned n = i - 1; + + struct test *t = &tests[n]; + if (!t->cleanup) + continue; + + if (pr) + 
printf("[CLEANUP %i] %s: ", n, t->label); + if (t->cleanup(s, t)) { + if (pr) + printf("PASS\n"); + } else { + if (pr) + printf("FAIL\n"); + } + } +} + +static unsigned +interpret_test_list(struct state *s, struct test *tests, unsigned length) +{ + for (unsigned i = 0; i < length; ++i) { + struct test *t = &tests[i]; + + if (pr) + printf("[TEST %i] %s: ", i, t->label); + if (t->part) { + if (t->part(s, t)) { + if (pr) + printf("PASS\n"); + } else { + if (pr) + printf("FAIL\n"); + if (!getenv("TEST_KEEP_GOING")) + return i + 1; + } + } + if (t->subtests) + do_test_list(s, t->subtests, t->sub_length); + } + + return length; +} + +static void +do_test_list(struct state *s, struct test *tests, unsigned length) +{ + unsigned ran = interpret_test_list(s, tests, length); + cleanup_test_list(s, tests, ran); +} + +int +main(int argc, char *argv[]) +{ + struct state s = { + .page_size = sysconf(_SC_PAGE_SIZE), + .argc = argc, + .argv = argv, + }; + + if (getenv("CSF_QUIET")) + pr = false; + + if (!strcmp(getenv("TERM"), "dumb")) + colour_term = false; + + if (pr) + printf("Running Valhall CSF tests\n"); + + do_test_list(&s, kbase_main, ARRAY_SIZE(kbase_main)); +} diff --git a/src/panfrost/lib/genxml/common.xml b/src/panfrost/lib/genxml/common.xml index d4b5240fb01..d75baaba208 100644 --- a/src/panfrost/lib/genxml/common.xml +++ b/src/panfrost/lib/genxml/common.xml @@ -46,7 +46,7 @@ - + diff --git a/src/panfrost/lib/genxml/decode.c.rej b/src/panfrost/lib/genxml/decode.c.rej new file mode 100644 index 00000000000..946a9fb8bfb --- /dev/null +++ b/src/panfrost/lib/genxml/decode.c.rej @@ -0,0 +1,940 @@ +diff a/src/panfrost/lib/genxml/decode.c b/src/panfrost/lib/genxml/decode.c (rejected hunks) +@@ -54,6 +54,12 @@ + pan_section_print(pandecode_dump_stream, A, S, temp, (pandecode_indent + 1) * 2); \ + } + ++#define DUMP_SECTION_CS_V10(A, S, cl, buf, buf_unk, ...) 
{ \ ++ pan_section_unpack_cs_v10(cl, buf, buf_unk, A, S, temp); \ ++ pandecode_log(__VA_ARGS__); \ ++ pan_section_print(pandecode_dump_stream, A, S, temp, (pandecode_indent + 1) * 2); \ ++} ++ + #define MAP_ADDR(T, addr, cl) \ + const uint8_t *cl = pandecode_fetch_gpu_mem(addr, pan_size(T)); + +@@ -158,7 +164,7 @@ pandecode_midgard_tiler_descriptor( + if (nonzero_weights) + DUMP_UNPACKED(TILER_WEIGHTS, w, "Tiler Weights:\n"); + } +-#endif ++#endif /* PAN_ARCH <= 5 */ + + #if PAN_ARCH >= 5 + static void +@@ -184,7 +190,7 @@ pandecode_render_target(uint64_t gpu_va, unsigned gpu_id, + pandecode_indent--; + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH >= 5 */ + + #if PAN_ARCH >= 6 + static void +@@ -201,7 +207,7 @@ pandecode_sample_locations(const void *fb) + samples[2 * i + 1] - 128); + } + } +-#endif ++#endif /* PAN_ARCH >= 6 */ + + static void + pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, +@@ -228,29 +234,29 @@ pandecode_fbd(uint64_t gpu_va, bool is_fragment, unsigned gpu_id) + #if PAN_ARCH >= 6 + pandecode_sample_locations(fb); + +- unsigned dcd_size = pan_size(DRAW); ++ unsigned dcd_size = pan_size(DRAW_NO_CS); + + if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (0 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Pre frame 0:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } + + if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (1 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Pre frame 1:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } + + if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (2 * dcd_size)); +- pan_unpack(dcd, DRAW, draw); ++ pan_unpack(dcd, DRAW_NO_CS, draw); + pandecode_log("Post frame:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } +-#else ++#else /* PAN_ARCH < 6 */ + DUMP_SECTION(FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n"); + + const void *t = pan_section_ptr(fb, FRAMEBUFFER, TILER); +@@ -284,7 +290,7 @@ pandecode_fbd(uint64_t gpu_va, bool is_fragment, unsigned gpu_id) + .rt_count = params.render_target_count, + .has_extra = params.has_zs_crc_extension + }; +-#else ++#else /* PAN_ARCH < 5 */ + /* Dummy unpack of the padding section to make sure all words are 0. + * No need to call print here since the section is supposed to be empty. + */ +@@ -341,7 +347,7 @@ pandecode_attributes(mali_ptr addr, int count, + } + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH >= 5 + static mali_ptr +@@ -358,7 +364,7 @@ pandecode_blend(void *descs, int rt_no, mali_ptr frag_shader) + return b.blend_shader ? 
(b.shader_pc & ~0xf) : 0; + #endif + } +-#endif ++#endif /* PAN_ARCH >= 6 || PAN_ARCH == 5 */ + + #if PAN_ARCH <= 7 + static unsigned +@@ -412,8 +418,9 @@ pandecode_invocation(const void *i) + + DUMP_UNPACKED(INVOCATION, invocation, "Invocation:\n") + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + ++#if PAN_ARCH < 10 + static void + pandecode_primitive(const void *p) + { +@@ -439,7 +446,7 @@ pandecode_primitive(const void *p) + pandecode_validate_buffer(primitive.indices, primitive.index_count * size); + } else if (primitive.index_type) + pandecode_log("// XXX: unexpected index size\n"); +-#endif ++#endif /* PAN_ARCH <= 7 */ + } + + static void +@@ -451,6 +458,7 @@ pandecode_primitive_size(const void *s, bool constant) + + DUMP_UNPACKED(PRIMITIVE_SIZE, ps, "Primitive Size:\n") + } ++#endif /* PAN_ARCH < 10 */ + + #if PAN_ARCH <= 7 + static void +@@ -482,7 +490,7 @@ pandecode_uniforms(mali_ptr uniforms, unsigned uniform_count) + free(ptr); + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + static void + pandecode_shader_disassemble(mali_ptr shader_ptr, int type, unsigned gpu_id) +@@ -566,7 +574,7 @@ pandecode_texture_payload(mali_ptr payload, + pandecode_indent--; + pandecode_log("},\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH <= 5 + static void +@@ -585,7 +593,7 @@ pandecode_texture(mali_ptr u, unsigned tex) + temp.levels, nr_samples, temp.array_size); + pandecode_indent--; + } +-#else ++#else /* PAN_ARCH > 5 */ + static void + pandecode_texture(const void *cl, unsigned tex) + { +@@ -603,7 +611,7 @@ pandecode_texture(const void *cl, unsigned tex) + + for (unsigned i = 0; i < plane_count; ++i) + DUMP_ADDR(PLANE, temp.surfaces + i * pan_size(PLANE), "Plane %u:\n", i); +-#else ++#else /* PAN_ARCH < 9 */ + unsigned nr_samples = temp.dimension == MALI_TEXTURE_DIMENSION_3D ? 
+ 1 : temp.sample_count; + +@@ -630,7 +638,7 @@ pandecode_textures(mali_ptr textures, unsigned texture_count) + + for (unsigned tex = 0; tex < texture_count; ++tex) + pandecode_texture(cl + pan_size(TEXTURE) * tex, tex); +-#else ++#else /* PAN_ARCH < 6 */ + mali_ptr *PANDECODE_PTR_VAR(u, textures); + + for (int tex = 0; tex < texture_count; ++tex) { +@@ -741,7 +749,7 @@ pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + gpu_id); + } + } +-#endif ++#endif /* PAN_ARCH >= 5 */ + } else + pandecode_log("// XXX: missing shader descriptor\n"); + +@@ -807,7 +815,7 @@ pandecode_vertex_compute_geometry_job(const struct MALI_JOB_HEADER *h, + pandecode_indent--; + pandecode_log("\n"); + } +-#endif ++#endif /* PAN_ARCH <= 7 */ + + #if PAN_ARCH >= 6 + static void +@@ -823,6 +831,10 @@ pandecode_tiler(mali_ptr gpu_va) + DUMP_UNPACKED(TILER_CONTEXT, t, "Tiler:\n"); + } + ++#endif /* PAN_ARCH >= 6 */ ++ ++#if PAN_ARCH < 10 ++#if PAN_ARCH >= 6 + #if PAN_ARCH <= 7 + static void + pandecode_indexed_vertex_job(const struct MALI_JOB_HEADER *h, +@@ -854,8 +866,8 @@ pandecode_indexed_vertex_job(const struct MALI_JOB_HEADER *h, + + pan_section_unpack(p, INDEXED_VERTEX_JOB, PADDING, padding); + } +-#endif +-#endif ++#endif /* PAN_ARCH <= 7 */ ++#endif /* PAN_ARCH >= 6 */ + + static void + pandecode_tiler_job(const struct MALI_JOB_HEADER *h, +@@ -890,7 +902,7 @@ pandecode_tiler_job(const struct MALI_JOB_HEADER *h, + pan_section_unpack(p, TILER_JOB, PADDING, padding); + #endif + +-#else ++#else /* PAN_ARCH < 6 */ + pan_section_unpack(p, TILER_JOB, PRIMITIVE, primitive); + pandecode_primitive_size(pan_section_ptr(p, TILER_JOB, PRIMITIVE_SIZE), + primitive.point_size_array_format == MALI_POINT_SIZE_ARRAY_FORMAT_NONE); +@@ -898,12 +910,17 @@ pandecode_tiler_job(const struct MALI_JOB_HEADER *h, + pandecode_indent--; + pandecode_log("\n"); + } ++#endif /* PAN_ARCH < 10 */ + + static void +-pandecode_fragment_job(mali_ptr job, unsigned gpu_id) ++pandecode_fragment_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_fragment_job_packed *PANDECODE_PTR_VAR(p, job); +- pan_section_unpack(p, FRAGMENT_JOB, PAYLOAD, s); ++#endif ++ ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, FRAGMENT_JOB, PAYLOAD, s); + + UNUSED struct pandecode_fbd info = pandecode_fbd(s.framebuffer, true, gpu_id); + +@@ -920,7 +937,7 @@ pandecode_fragment_job(mali_ptr job, unsigned gpu_id) + expected_tag |= MALI_FBD_TAG_HAS_ZS_RT; + + expected_tag |= MALI_FBD_TAG_IS_MFBD | (MALI_POSITIVE(info.rt_count) << 2); +-#endif ++#endif /* PAN_ARCH >= 5 */ + + DUMP_UNPACKED(FRAGMENT_JOB_PAYLOAD, s, "Fragment Job Payload:\n"); + +@@ -936,6 +953,8 @@ pandecode_fragment_job(mali_ptr job, unsigned gpu_id) + pandecode_log("\n"); + } + ++#if PAN_ARCH < 10 ++// TODO: Use the same model as for malloc_vertex jobs? 
+ static void + pandecode_write_value_job(mali_ptr job) + { +@@ -953,6 +972,7 @@ pandecode_cache_flush_job(mali_ptr job) + DUMP_SECTION(CACHE_FLUSH_JOB, PAYLOAD, p, "Cache Flush Payload:\n"); + pandecode_log("\n"); + } ++#endif /* PAN_ARCH < 10 */ + + #if PAN_ARCH >= 9 + static void +@@ -1034,6 +1054,9 @@ pandecode_resource_tables(mali_ptr addr, const char *label) + static void + pandecode_depth_stencil(mali_ptr addr) + { ++ if (!addr) ++ return; ++ + MAP_ADDR(DEPTH_STENCIL, addr, cl); + pan_unpack(cl, DEPTH_STENCIL, desc); + DUMP_UNPACKED(DEPTH_STENCIL, desc, "Depth/stencil"); +@@ -1060,14 +1083,15 @@ static void + pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + unsigned gpu_id) + { +- mali_ptr frag_shader = 0; +- + pandecode_depth_stencil(p->depth_stencil); + + for (unsigned i = 0; i < p->blend_count; ++i) { ++ MAP_ADDR(SHADER_PROGRAM, p->shader.shader, cl); ++ pan_unpack(cl, SHADER_PROGRAM, desc); ++ + struct mali_blend_packed *PANDECODE_PTR_VAR(blend_descs, p->blend); + +- mali_ptr blend_shader = pandecode_blend(blend_descs, i, frag_shader); ++ mali_ptr blend_shader = pandecode_blend(blend_descs, i, desc.binary); + if (blend_shader) { + fprintf(pandecode_dump_stream, "Blend shader %u", i); + pandecode_shader_disassemble(blend_shader, 0, gpu_id); +@@ -1079,21 +1103,26 @@ pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, + } + + static void +-pandecode_malloc_vertex_job(mali_ptr job, unsigned gpu_id) ++pandecode_malloc_vertex_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_malloc_vertex_job_packed *PANDECODE_PTR_VAR(p, job); ++#endif + +- DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE, p, "Primitive:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, INSTANCE_COUNT, p, "Instance count:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, PRIMITIVE, p, cs_buf, cs_buf_unk, "Primitive:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, INSTANCE_COUNT, p, cs_buf, cs_buf_unk, "Instance count:\n"); ++#if PAN_ARCH < 10 + DUMP_SECTION(MALLOC_VERTEX_JOB, ALLOCATION, p, "Allocation:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, TILER, p, "Tiler:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, SCISSOR, p, "Scissor:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE_SIZE, p, "Primitive Size:\n"); +- DUMP_SECTION(MALLOC_VERTEX_JOB, INDICES, p, "Indices:\n"); ++#endif ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, TILER, p, cs_buf, cs_buf_unk, "Tiler:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, SCISSOR, p, cs_buf, cs_buf_unk, "Scissor:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, PRIMITIVE_SIZE, p, cs_buf, cs_buf_unk, "Primitive Size:\n"); ++ DUMP_SECTION_CS_V10(MALLOC_VERTEX_JOB, INDICES, p, cs_buf, cs_buf_unk, "Indices:\n"); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, DRAW, dcd); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, DRAW, dcd); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, TILER, tiler_ptr); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, TILER, tiler_ptr); + pandecode_log("Tiler Job Payload:\n"); + pandecode_indent++; + if (tiler_ptr.address) +@@ -1104,17 +1133,20 @@ pandecode_malloc_vertex_job(mali_ptr job, unsigned gpu_id) + + pandecode_dcd(&dcd, 0, gpu_id); + +- pan_section_unpack(p, MALLOC_VERTEX_JOB, POSITION, position); +- pan_section_unpack(p, MALLOC_VERTEX_JOB, VARYING, varying); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, POSITION, position); ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, MALLOC_VERTEX_JOB, VARYING, varying); + 
pandecode_shader_environment(&position, gpu_id); + pandecode_shader_environment(&varying, gpu_id); + } + + static void +-pandecode_compute_job(mali_ptr job, unsigned gpu_id) ++pandecode_compute_job(mali_ptr job, uint32_t *cs_buf, uint32_t *cs_buf_unk, ++ unsigned gpu_id) + { ++#if PAN_ARCH < 10 + struct mali_compute_job_packed *PANDECODE_PTR_VAR(p, job); +- pan_section_unpack(p, COMPUTE_JOB, PAYLOAD, payload); ++#endif ++ pan_section_unpack_cs_v10(p, cs_buf, cs_buf_unk, COMPUTE_JOB, PAYLOAD, payload); + + pandecode_shader(payload.compute.shader, "Shader", gpu_id); + if (payload.compute.thread_storage) +@@ -1126,8 +1158,9 @@ pandecode_compute_job(mali_ptr job, unsigned gpu_id) + + DUMP_UNPACKED(COMPUTE_PAYLOAD, payload, "Compute:\n"); + } +-#endif ++#endif /* PAN_ARCH >= 9 */ + ++#if PAN_ARCH < 10 + /* Entrypoint to start tracing. jc_gpu_va is the GPU address for the first job + * in the chain; later jobs are found by walking the chain. GPU ID is the + * more finegrained ID because some details are model-specific even within a +@@ -1183,18 +1216,18 @@ GENX(pandecode_jc)(mali_ptr jc_gpu_va, unsigned gpu_id) + pandecode_indexed_vertex_job(&h, jc_gpu_va, gpu_id); + break; + #endif +-#else ++#else /* PAN_ARCH > 7 */ + case MALI_JOB_TYPE_COMPUTE: +- pandecode_compute_job(jc_gpu_va, gpu_id); ++ pandecode_compute_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + + case MALI_JOB_TYPE_MALLOC_VERTEX: +- pandecode_malloc_vertex_job(jc_gpu_va, gpu_id); ++ pandecode_malloc_vertex_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + #endif + + case MALI_JOB_TYPE_FRAGMENT: +- pandecode_fragment_job(jc_gpu_va, gpu_id); ++ pandecode_fragment_job(jc_gpu_va, NULL, NULL, gpu_id); + break; + + default: +@@ -1232,3 +1265,544 @@ GENX(pandecode_abort_on_fault)(mali_ptr jc_gpu_va) + + pandecode_map_read_write(); + } ++#endif ++ ++#if PAN_ARCH >= 10 ++static void ++pandecode_cs_dump_state(uint32_t *state) ++{ ++ uint64_t *st_64 = (uint64_t *)state; ++ /* Only registers below 0x40 seem to be actually be used by jobs */ ++ for (unsigned i = 0; i < 0x40 / 4; ++i) { ++ uint64_t v1 = st_64[i * 2]; ++ uint64_t v2 = st_64[i * 2 + 1]; ++ ++ if (!v1 && !v2) ++ continue; ++ ++ pandecode_log("0x%2x: 0x%16"PRIx64" 0x%16"PRIx64"\n", ++ i * 4, v1, v2); ++ } ++} ++ ++/* Assumes eight scoreboards */ ++static void ++pandecode_scoreboard_mask(unsigned mask) ++{ ++ if (mask == 0xff) { ++ pandecode_log_cont("all"); ++ return; ++ } else if (!mask) { ++ pandecode_log_cont("none"); ++ return; ++ } ++ ++ const char *comma = ""; ++ for (unsigned i = 0; i < 8; ++i) { ++ if (mask & (1 << i)) { ++ pandecode_log_cont("%s%i", comma, i); ++ comma = ","; ++ } ++ } ++} ++ ++static void ++pandecode_regmask(unsigned base, unsigned mask) ++{ ++ switch (mask) { ++ case 0: ++ pandecode_log_cont("(invalid: %02x mask 0)", base); ++ return; ++ case 1: ++ pandecode_log_cont("w%02x", base); ++ return; ++ case 3: ++ pandecode_log_cont("x%02x", base); ++ return; ++ default: ++ break; ++ } ++ ++ unsigned first = ffs(mask) - 1; ++ if (first) ++ pandecode_log_cont("{(+%i) ", first); ++ else ++ pandecode_log_cont("{"); ++ ++ unsigned edges = mask ^ (mask << 1); ++ ++ const char *comma = ""; ++ ++ bool outside = true; ++ unsigned start; ++ u_foreach_bit(i, edges) { ++ if (outside) ++ start = i; ++ else if (i == start + 1) ++ pandecode_log_cont("%sw%02x", comma, ++ base + start); ++ else if (i == start + 2) ++ pandecode_log_cont("%sx%02x", comma, ++ base + start); ++ else ++ pandecode_log_cont("%sw%02x-w%02x", comma, ++ base + start, ++ base + i - 1); ++ outside = !outside; 
++ ++ if (outside) ++ comma = ", "; ++ } ++ ++ pandecode_log_cont("}"); ++} ++ ++static void ++pandecode_cs_buffer(uint64_t *commands, unsigned size, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id, mali_ptr va); ++ ++// Hack hack hackity hack: gpu_id == 1 means "don't decode" (only disassemble) ++static void ++pandecode_cs_command(uint64_t command, mali_ptr va, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id) ++{ ++ uint8_t op = command >> 56; ++ uint8_t addr = (command >> 48) & 0xff; ++ uint64_t value = command & 0xffffffffffffULL; ++ ++ uint32_t h = value >> 32; ++ uint32_t l = value; ++ ++ uint8_t arg1 = h & 0xff; ++ uint8_t arg2 = h >> 8; ++ ++ if (command) ++ pandecode_log("%"PRIx64" %016"PRIx64" ", va, command); ++ ++ switch (op) { ++ case 0: ++ if (addr || value) ++ pandecode_log("nop %02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 1: ++ buffer_unk[addr] = buffer[addr] = l; ++ buffer_unk[addr + 1] = buffer[addr + 1] = h; ++ pandecode_log("mov x%02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 2: ++ buffer_unk[addr] = buffer[addr] = l; ++ pandecode_log("mov w%02x, #0x%"PRIx64"\n", addr, value); ++ break; ++ case 3: ++ if (l & 0xff00ffff || h || addr) { ++ pandecode_log("wait (unk %02x), (unk %04x), " ++ "%i, (unk %04x)\n", addr, h, l >> 16, l); ++ } else { ++ pandecode_log("wait "); ++ pandecode_scoreboard_mask(l >> 16); ++ pandecode_log_cont("\n"); ++ } ++ break; ++ case 4: { ++ uint32_t masked = l & 0xffff0000; ++ unsigned task_increment = l & 0x3fff; ++ unsigned task_axis = (l >> 14) & 3; ++ if (h != 0xff00 || addr || masked) ++ pandecode_log("compute (unk %02x), (unk %04x), " ++ "(unk %x), inc %i, axis %i\n\n", addr, h, masked, task_increment, task_axis); ++ else ++ pandecode_log("compute inc %i, axis %i\n\n", task_increment, task_axis); ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_compute_job(0, buffer, buffer_unk, gpu_id); ++ ++ /* The gallium driver emits this even for compute jobs, clear ++ * it from unknown state */ ++ pan_unpack_cs(buffer, buffer_unk, SCISSOR, unused_scissor); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ case 6: { ++ /* The meaning of the first argument (in h) is unknown, but it ++ * appears that the second bit must be set. 
*/ ++ uint32_t masked = l & 0xfffff8f0; ++ uint8_t mode = l & 0xf; ++ uint8_t index = (l >> 8) & 7; ++ if (addr || masked) ++ pandecode_log("idvs (unk %02x), 0x%04x, (unk %x), " ++ "mode %i index %i\n\n", ++ addr, h, masked, mode, index); ++ else ++ pandecode_log("idvs 0x%04x, mode %i index %i\n\n", ++ h, mode, index); ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_malloc_vertex_job(0, buffer, buffer_unk, gpu_id); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ case 7: { ++ uint64_t masked = value & ~0x000100000071; ++ bool tem = value & 1; ++ bool unk = (value >> 32) & 1; ++ ++ const char *order = (const char *[]){ ++ "z_order", ++ "horizontal", ++ "vertical", ++ "invalid_3", ++ "invalid_4", ++ "reverse_horizontal", ++ "reverse_vertical", ++ "invalid_7", ++ }[(value >> 4) & 7]; ++ ++ if (addr || masked) { ++ pandecode_log("fragment (unk %02x), (unk %"PRIx64")\n\n", ++ addr, value); ++ } else if (value) { ++ pandecode_log("fragment tem %i, render %s, unk %i\n\n", ++ tem, order, unk); ++ } else { ++ pandecode_log("fragment\n\n"); ++ } ++ ++ if (gpu_id != 1) { ++ pandecode_indent++; ++ ++ pandecode_fragment_job(0, buffer, buffer_unk, gpu_id); ++ pandecode_cs_dump_state(buffer_unk); ++ ++ pandecode_log("\n"); ++ pandecode_indent--; ++ } ++ ++ break; ++ } ++ ++ case 9: { ++ if (addr || l || h > 1) ++ pandecode_log("flush_tiler (unk %02x), (unk %"PRIx64")\n", ++ addr, value); ++ else if (h) ++ pandecode_log("flush_tiler unk\n"); ++ else ++ pandecode_log("flush_tiler\n"); ++ break; ++ } ++ ++ case 16: case 17: { ++ char wid = (op == 16) ? 'w' : 'x'; ++ ++ if (op == 16) { ++ buffer_unk[addr] = buffer[addr] = buffer[arg2] + l; ++ } else { ++ uint64_t r = buffer[arg2] + ((uint64_t)buffer[arg2 + 1] << 32) + l; ++ buffer_unk[addr] = buffer[addr] = r; ++ buffer_unk[addr + 1] = buffer[addr + 1] = r >> 32; ++ } ++ ++ if (arg1) ++ pandecode_log("add %c%02x, (unk %x), %c%02x, #0x%x\n", ++ wid, addr, arg1, wid, arg2, l); ++ else if ((int32_t) l < 0) ++ pandecode_log("add %c%02x, %c%02x, %i\n", ++ wid, addr, wid, arg2, (int32_t) l); ++ else if (l) ++ pandecode_log("add %c%02x, %c%02x, #0x%x\n", ++ wid, addr, wid, arg2, l); ++ else ++ pandecode_log("mov %c%02x, %c%02x\n", ++ wid, addr, wid, arg2); ++ ++ break; ++ } ++ ++ case 20: case 21: { ++ const char *name = (op == 20) ? "ldr" : "str"; ++ ++ /* The immediate offset must be 4-aligned (though if the ++ * address itself is unaligned, the bits will silently be ++ * masked off). ++ * ++ * Up to 16 32-bit registers can be read or written in a ++ * single instruction, behaviour is similar to LDM or STM ++ * except that a base register is specified. ++ * ++ * These instructions are high latency. Use WAIT 0 to wait for ++ * the result of an LDR, or for a STR to finish. ++ * ++ * For LDR, it is an error for the address register to be ++ * included in the destination register set. ++ */ ++ ++ if (arg1) { ++ pandecode_log("%s (unk %02x), x%02x, (mask %x), [x%02x, %i]\n", ++ name, arg1, addr, l >> 16, arg2, (int16_t) l); ++ } else { ++ pandecode_log("%s ", name); ++ pandecode_regmask(addr, l >> 16); ++ pandecode_log_cont(", [x%02x, %i]\n", arg2, (int16_t) l); ++ } ++ break; ++ } ++ ++ case 22: { ++ /* The signed 32-bit source register is compared against zero ++ * for these comparisons. For example, .GT means that the ++ * branch is taken if the signed register value is greater ++ * than zero. 
*/ ++ const char *comparisons[] = { ++ ".le", ".gt", ++ ".eq", ".ne", ++ ".lt", ".ge", ++ "" /* always */, ".(invalid: never)", ++ }; ++ ++ const char *m = comparisons[(l >> 28) & 7]; ++ ++ int16_t offset = l; ++ ++ bool forward = (offset >= 0); ++ if (!forward) ++ offset = -1 - offset; ++ ++ if (addr || arg1 || l & 0x8fff0000) { ++ pandecode_log("b%s (unk %02x), w%02x, (unk %02x), " ++ "(unk 0x%x), %s %i\n", ++ m, addr, arg2, arg1, l & 0x8fff0000, ++ forward ? "skip" : "back", ++ offset); ++ } else { ++ pandecode_log("b%s w%02x, %s %i\n", ++ m, arg2, ++ forward ? "skip" : "back", ++ offset); ++ } ++ ++ break; ++ } ++ ++ case 23: { ++ if (value >> 3 || addr) ++ pandecode_log("slot (unk %02x), (unk %"PRIx64"), " ++ "%i\n", addr, value >> 3, l & 7); ++ else ++ pandecode_log("slot %i\n", l); ++ break; ++ } ++ ++ case 32: case 33: { ++ /* A tail call is similar to a normal call, but reuses the ++ * current stack entry so that execution returns directly to ++ * the parent, rather than pushing a new entry and returning ++ * to the instruction after the call. Using tail calls avoids ++ * the possibility of stack overflow. ++ */ ++ const char *name = (op == 32) ? "call" : "tailcall"; ++ ++ unsigned length = buffer[arg1]; ++ uint64_t target = (((uint64_t)buffer[arg2 + 1]) << 32) | buffer[arg2]; ++ ++ assert(!(length & 7)); ++ unsigned instrs = length / 8; ++ ++ if (addr || l) ++ pandecode_log("%s (unk %02x), w%02x (%i instructions), x%02x (0x%"PRIx64"), (unk %x)\n", ++ name, addr, arg1, instrs, arg2, target, l); ++ else ++ pandecode_log("%s w%02x (%i instructions), x%02x (0x%"PRIx64")\n", ++ name, arg1, instrs, arg2, target); ++ ++ if (!target || !length) ++ break; ++ ++ uint64_t *t = pandecode_fetch_gpu_mem(target, length); ++ pandecode_indent++; ++ pandecode_cs_buffer(t, length, buffer, buffer_unk, gpu_id, ++ target); ++ pandecode_indent--; ++ break; ++ } ++ ++ case 34: { ++ /* idvs implies tiler */ ++ if (l & ~0xf) ++ pandecode_log("resources 0x%x\n", l); ++ else ++ pandecode_log("resources%s%s%s%s\n", ++ (l & 1) ? " compute" : "", ++ (l & 2) ? " fragment" : "", ++ (l & 4) ? " tiler" : "", ++ (l & 8) ? " idvs" : ""); ++ break; ++ } ++ ++ case 37: case 38: case 51: case 52: { ++ /* ++ * 0b 00100101 / 00100110 -- opcode ++ * ????0??? -- unk. usually 1, faults if "0" bit set ++ * aaaaaaaa -- address register ++ * vvvvvvvv -- 32-bit value register ++ * 00000000 -- seems to act as NOP if nonzero ++ * mmmmmmmm -- some sort of mask, unknown purpose ++ * ???????? -- seems to have no effect ++ * ?????s0u -- 's' disables signal to CPU, ++ * 'u' has unknown purpose (disable GPU signal?) ++ * ++ * The difference between the two opcodes is unknown. ++ * ++ * That the 'mmmmmmmm' byte is somehow a scoreboard mask is ++ * a possibility. ++ */ ++ ++ const char *name = (op & 1) ? "evadd" : "evstr"; ++ const char *type = (op > 50) ? "x" : "w"; ++ ++ if (addr != 1 || l & 0xff00fffa) { ++ pandecode_log("%s (unk %02x), %s%02x, [x%02x], " ++ "unk 0x%x, flags 0x%x\n", ++ name, addr, type, arg1, arg2, ++ l >> 16, (uint16_t) l); ++ } else { ++ pandecode_log("%s %s%02x, [x%02x], unk 0x%x%s%s\n", ++ name, type, arg1, arg2, l >> 16, ++ l & 0x4 ? "" : ", irq", ++ l & 0x1 ? ", unk0" : ""); ++ } ++ ++ break; ++ } ++ ++ case 39: case 53: { ++ const char *m = (const char *[]){ ++ ".ls", ++ ".hi", ++ }[(l >> 28) & 1]; ++ const char *e = (const char *[]){ ++ ".inherit", ++ ".no_error", ++ }[l & 1]; ++ const char *type = (op > 50) ? 
"x" : "w"; ++ ++ /* Wait until the value in the destination register is changed ++ * to pass the comparison. For example, with .LS the value ++ * in memory must be less than or same as the reference to ++ * continue execution. */ ++ if (addr || l & ~((1 << 28) | (1 << 0))) ++ pandecode_log("evwait%s%s (unk %02x), %s%02x, " ++ "[x%02x, unk %x]\n", ++ m, e, addr, type, arg1, arg2, l); ++ else ++ pandecode_log("evwait%s%s %s%02x, [x%02x]\n", ++ m, e, type, arg1, arg2); ++ break; ++ } ++ ++ case 40: { ++ if (addr || l >> 16 || arg1 > 1) { ++ pandecode_log("str type %02x, (unk %02x), " ++ "(unk %x), [x%02x, %i]\n", ++ addr, arg1, ++ l >> 16, arg2, (int16_t) l); ++ } else { ++ const char *type = (const char *[]) { ++ "timestamp", ++ "cycles", ++ }[arg1]; ++ ++ pandecode_log("str %s, [x%02x, %i]\n", ++ type, arg2, (int16_t) l); ++ } ++ break; ++ } ++ ++ case 48: { ++ if (addr || arg1 || l) ++ pandecode_log("heapctx (unk %02x), " ++ "x%02x, (unk %02x), (unk %x)\n", ++ addr, arg2, arg1, l); ++ else ++ pandecode_log("heapctx x%02x\n", arg2); ++ break; ++ } ++ ++ case 49: { ++ const char *m = (const char *[]){ ++ "vt_start", ++ "vt_end", ++ "unk", ++ "frag_end", ++ }[arg1 & 3]; ++ ++ if (addr || arg2 || arg1 > 3 || l) ++ pandecode_log("heapinc (unk %02x), " ++ "(unk %02x), %02x, (unk %x)\n", ++ addr, arg2, arg1, l); ++ else ++ pandecode_log("heapinc %s\n", m); ++ break; ++ } ++ ++ default: ++ /* ++ * UNK 00 30, #0x480000000000 -- takes an eight-byte aligned ++ * memory address. ++ */ ++ ++ pandecode_log("UNK %02x %02x, #0x%"PRIx64"\n", addr, op, value); ++ break; ++ } ++} ++ ++// TODO: reorder args ++static void ++pandecode_cs_buffer(uint64_t *commands, unsigned size, ++ uint32_t *buffer, uint32_t *buffer_unk, ++ unsigned gpu_id, mali_ptr va) ++{ ++ uint64_t *end = (uint64_t *)((uint8_t *) commands + size); ++ ++ for (uint64_t c = *commands; commands < end; c = *(++commands)) { ++ pandecode_cs_command(c, va, buffer, buffer_unk, gpu_id); ++ va += 8; ++ } ++} ++ ++// TODO: Does it make sense to pass in the length? 
++void ++GENX(pandecode_cs)(mali_ptr cs_gpu_va, unsigned size, unsigned gpu_id) ++{ ++ pandecode_dump_file_open(); ++ ++ // TODO: Pass down the buffer during recursion ++ uint32_t buffer[256] = {0}; ++ uint32_t buffer_unk[256] = {0}; ++ ++ uint64_t *commands = pandecode_fetch_gpu_mem(cs_gpu_va, 1); ++ ++ pandecode_log("\n"); ++ ++ pandecode_cs_buffer(commands, size, buffer, buffer_unk, gpu_id, ++ cs_gpu_va); ++ ++ fflush(pandecode_dump_stream); ++ pandecode_map_read_write(); ++} ++#endif diff --git a/src/panfrost/lib/genxml/decode.h.rej b/src/panfrost/lib/genxml/decode.h.rej new file mode 100644 index 00000000000..d3673d771d1 --- /dev/null +++ b/src/panfrost/lib/genxml/decode.h.rej @@ -0,0 +1,28 @@ +diff a/src/panfrost/lib/genxml/decode.h b/src/panfrost/lib/genxml/decode.h (rejected hunks) +@@ -50,8 +50,6 @@ struct pandecode_mapped_memory *pandecode_find_mapped_gpu_mem_containing(uint64_ + + void pandecode_map_read_write(void); + +-void pandecode_dump_mappings(void); +- + static inline void * + __pandecode_fetch_gpu_mem(uint64_t gpu_va, size_t size, + int line, const char *filename) +@@ -98,6 +96,8 @@ void pandecode_abort_on_fault_v6(mali_ptr jc_gpu_va); + void pandecode_abort_on_fault_v7(mali_ptr jc_gpu_va); + void pandecode_abort_on_fault_v9(mali_ptr jc_gpu_va); + ++void pandecode_cs_v10(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ + static inline void + pan_hexdump(FILE *fp, const uint8_t *hex, size_t cnt, bool with_strings) + { +@@ -130,7 +130,7 @@ pan_hexdump(FILE *fp, const uint8_t *hex, size_t cnt, bool with_strings) + fprintf(fp, " | "); + for (unsigned j = i & ~0xF; j <= i; ++j) { + uint8_t c = hex[j]; +- fputc((c < 32 || c > 128) ? '.' : c, fp); ++ fputc((c < 32 || c > 126) ? '.' : c, fp); + } + } + diff --git a/src/panfrost/lib/genxml/decode_common.c.rej b/src/panfrost/lib/genxml/decode_common.c.rej new file mode 100644 index 00000000000..127dcd7a4f7 --- /dev/null +++ b/src/panfrost/lib/genxml/decode_common.c.rej @@ -0,0 +1,52 @@ +diff a/src/panfrost/lib/genxml/decode_common.c b/src/panfrost/lib/genxml/decode_common.c (rejected hunks) +@@ -202,7 +202,7 @@ pointer_as_memory_reference(uint64_t ptr) + + static int pandecode_dump_frame_count = 0; + +-static bool force_stderr = false; ++bool force_stderr = false; + + void + pandecode_dump_file_open(void) +@@ -230,7 +230,7 @@ pandecode_dump_file_open(void) + } + } + +-static void ++void + pandecode_dump_file_close(void) + { + simple_mtx_assert_locked(&pandecode_lock); +@@ -289,8 +289,9 @@ pandecode_dump_mappings(void) + if (!it->addr || !it->length) + continue; + +- fprintf(pandecode_dump_stream, "Buffer: %s gpu %" PRIx64 "\n\n", +- it->name, it->gpu_va); ++ fprintf(pandecode_dump_stream, "Buffer: %s gpu %" PRIx64 ++ " length %zu\n\n", ++ it->name, it->gpu_va, it->length); + + pan_hexdump(pandecode_dump_stream, it->addr, it->length, false); + fprintf(pandecode_dump_stream, "\n"); +@@ -333,3 +334,20 @@ pandecode_jc(mali_ptr jc_gpu_va, unsigned gpu_id) + + simple_mtx_unlock(&pandecode_lock); + } ++ ++void pandecode_cs_v10(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ ++void ++pandecode_cs(mali_ptr cs_gpu_va, unsigned cs_size, unsigned gpu_id) ++{ ++ simple_mtx_lock(&pandecode_lock); ++ ++ switch (pan_arch(gpu_id)) { ++ // Hack hack hackity hack: gpu_id == 1 means "don't decode" (only ++ // disassemble) ++ case 0: case 10: pandecode_cs_v10(cs_gpu_va, cs_size, gpu_id); break; ++ default: unreachable("Unsupported architecture"); ++ } ++ ++ simple_mtx_unlock(&pandecode_lock); ++} diff --git 
a/src/panfrost/lib/genxml/gen_macros.h.rej b/src/panfrost/lib/genxml/gen_macros.h.rej new file mode 100644 index 00000000000..0b1a6a9070a --- /dev/null +++ b/src/panfrost/lib/genxml/gen_macros.h.rej @@ -0,0 +1,11 @@ +diff a/src/panfrost/lib/genxml/gen_macros.h b/src/panfrost/lib/genxml/gen_macros.h (rejected hunks) +@@ -93,6 +93,9 @@ pan_arch(unsigned gpu_id) + #elif (PAN_ARCH == 9) + # define GENX(X) X##_v9 + # include "genxml/v9_pack.h" ++#elif (PAN_ARCH == 10) ++# define GENX(X) X##_v10 ++# include "genxml/v10_pack.h" + #else + # error "Need to add suffixing macro for this architecture" + #endif diff --git a/src/panfrost/lib/genxml/gen_pack.py b/src/panfrost/lib/genxml/gen_pack.py index 8d1df522ca0..cbcde745cf6 100644 --- a/src/panfrost/lib/genxml/gen_pack.py +++ b/src/panfrost/lib/genxml/gen_pack.py @@ -46,6 +46,18 @@ #include "util/bitpack_helpers.h" +/* Most functions assume the caller has done bounds checking */ +typedef struct pan_command_stream { + uint64_t *ptr; + uint64_t *begin; + uint64_t *end; + uint64_t gpu; +} pan_command_stream; + +struct pan_command_stream_decoded { + uint32_t values[256]; +}; + #define __gen_unpack_float(x, y, z) uif(__gen_unpack_uint(x, y, z)) static inline uint32_t @@ -114,6 +126,20 @@ return (2*odd + 1) << shift; } +static inline void +__gen_clear_value(uint8_t *restrict cl, uint32_t start, uint32_t end) +{ + for (uint32_t byte = start / 8; byte <= end / 8; byte++) { + uint8_t m = 0; + if (byte == start / 8) + m |= 0xff >> (8 - start % 8); + if (byte == end / 8) + m |= 0xff << (1 + end % 8); + + cl[byte] &= m; + } +} + #define PREFIX1(A) MALI_ ## A #define PREFIX2(A, B) MALI_ ## A ## _ ## B #define PREFIX4(A, B, C, D) MALI_ ## A ## _ ## B ## _ ## C ## _ ## D @@ -199,6 +225,96 @@ """ +no_cs = "".join([f""" +#define MALI_{y} MALI_{x} +#define MALI_{y}_header MALI_{x}_header +#define MALI_{y}_pack MALI_{x}_pack +#define MALI_{y}_LENGTH MALI_{x}_LENGTH +#define MALI_{y}_ALIGN MALI_{x}_ALIGN +#define mali_{y.lower()}_packed mali_{x.lower()}_packed +#define MALI_{y}_unpack MALI_{x}_unpack +#define MALI_{y}_print MALI_{x}_print +""" for x, y in (("DRAW", "DRAW_NO_CS"), )]) + """ + +#define pan_pack_cs_v10(dst, _, T, name) pan_pack(dst, T, name) + +#define pan_section_pack_cs_v10(dst, _, A, S, name) pan_section_pack(dst, A, S, name) + +#define pan_unpack_cs_v10(dst, _, __, T, name) pan_unpack(dst, T, name) + +#define pan_section_unpack_cs_v10(src, _, __, A, S, name) pan_section_unpack(src, A, S, name) +""" + +with_cs = """ +#define pan_pack_cs(dst, T, name) \\ + for (struct PREFIX1(T) name = { PREFIX2(T, header) }, \\ + *_loop_terminate = (void *) (dst); \\ + __builtin_expect(_loop_terminate != NULL, 1); \\ + ({ PREFIX2(T, pack_cs)(dst, &name); \\ + _loop_terminate = NULL; })) + +#define pan_section_pack_cs(dst, A, S, name) \\ + for (PREFIX4(A, SECTION, S, TYPE) name = { PREFIX4(A, SECTION, S, header) }, \\ + *_loop_terminate = (void *) (dst); \\ + __builtin_expect(_loop_terminate != NULL, 1); \\ + ({ PREFIX4(A, SECTION, S, pack_cs) (dst, &name); \\ + _loop_terminate = NULL; })) + +#define pan_section_pack_cs_v10(_, dst, A, S, name) pan_section_pack_cs(dst, A, S, name) + +// TODO: assert that the first argument is NULL +#define pan_pack_cs_v10(_, dst, T, name) pan_pack_cs(dst, T, name) + +#define pan_pack_ins(dst, T, name) \\ + for (struct PREFIX1(T) name = { PREFIX2(T, header) }, \\ + *_loop_terminate = (void *) (dst); \\ + __builtin_expect(_loop_terminate != NULL, 1); \\ + ({ PREFIX2(T, pack_ins)(dst, &name); \\ + _loop_terminate = NULL; })) + +#define 
pan_unpack_cs(buf, buf_unk, T, name) \\ + struct PREFIX1(T) name; \\ + PREFIX2(T, unpack)(buf, buf_unk, &name) + +#define pan_unpack_cs_v10(_, buf, buf_unk, T, name) pan_unpack_cs(buf, buf_unk, T, name) + +#define pan_section_unpack_cs_v10(_, buf, buf_unk, A, S, name) \\ + PREFIX4(A, SECTION, S, TYPE) name; \\ + PREFIX4(A, SECTION, S, unpack)(buf, buf_unk, &name) + +static inline void +pan_emit_cs_ins(pan_command_stream *s, uint8_t op, uint64_t instr) +{ + assert(instr < (1ULL << 56)); + instr |= ((uint64_t)op << 56); + *((s->ptr)++) = instr; +} + +static inline void +pan_emit_cs_32(pan_command_stream *s, uint8_t reg, uint32_t value) +{ + pan_emit_cs_ins(s, 2, ((uint64_t) reg << 48) | value); +} + +static inline void +pan_emit_cs_48(pan_command_stream *s, uint8_t reg, uint64_t value) +{ + assert(value < (1ULL << 48)); + pan_emit_cs_ins(s, 1, ((uint64_t) reg << 48) | value); +} + +static inline void +pan_emit_cs_64(pan_command_stream *s, uint8_t reg, uint64_t value) +{ + if (value < (1ULL << 48)) { + pan_emit_cs_48(s, reg, value); + } else { + pan_emit_cs_32(s, reg, value); + pan_emit_cs_32(s, reg + 1, value >> 32); + } +} +""" + def to_alphanum(name): substitutions = { ' ': '_', @@ -313,7 +429,7 @@ def __init__(self, parser, attrs): if ":" in str(attrs["start"]): (word, bit) = attrs["start"].split(":") - self.start = (int(word) * 32) + int(bit) + self.start = (int(word, 0) * 32) + int(bit) else: self.start = int(attrs["start"]) @@ -347,7 +463,8 @@ def emit_template_struct(self, dim): type = 'uint64_t' elif self.type == 'int': type = 'int32_t' - elif self.type in ['uint', 'hex', 'uint/float', 'padded', 'Pixel Format']: + # TODO: Convert to tuple + elif self.type in ['uint', 'hex', 'register', 'uint/float', 'padded', 'Pixel Format']: type = 'uint32_t' elif self.type in self.parser.structs: type = 'struct ' + self.parser.gen_prefix(safe_name(self.type.upper())) @@ -401,8 +518,8 @@ def emit_template_struct(self, dim): field.emit_template_struct(dim) class Word: - def __init__(self): - self.size = 32 + def __init__(self, size=32): + self.size = size self.contributors = [] class FieldRef: @@ -426,7 +543,7 @@ def collect_fields(self, fields, offset, path, all_fields): end = offset + field.end all_fields.append(self.FieldRef(field, field_path, start, end)) - def collect_words(self, fields, offset, path, words): + def collect_words(self, fields, offset, path, words, ins=False): for field in fields: field_path = '{}{}'.format(path, field.name) start = offset + field.start @@ -440,16 +557,27 @@ def collect_words(self, fields, offset, path, words): contributor = self.FieldRef(field, field_path, start, end) first_word = contributor.start // 32 last_word = contributor.end // 32 + if ins: + assert(last_word < 2) + first_word = last_word = 0 + for b in range(first_word, last_word + 1): if not b in words: - words[b] = self.Word() + words[b] = self.Word(size=64 if ins else 32) + words[b].contributors.append(contributor) - def emit_pack_function(self): - self.get_length() + return + + def emit_pack_function(self, csf=False, ins=False): + if csf: + self.length = 256 * 4 + else: + self.get_length() + assert(not ins) words = {} - self.collect_words(self.fields, 0, '', words) + self.collect_words(self.fields, 0, '', words, ins=ins) # Validate the modifier is lossless for field in self.fields: @@ -465,25 +593,52 @@ def emit_pack_function(self): elif field.modifier[0] == "log2": print(" assert(util_is_power_of_two_nonzero(values->{}));".format(field.name)) - for index in range(self.length // 4): + if ins: + 
index_list = (0, ) + elif csf: + index_list = sorted(words) + else: + index_list = range(self.length // 4) + + for index in index_list: # Handle MBZ words if not index in words: - print(" cl[%2d] = 0;" % index) + if ins: + print(" pan_emit_cs_ins(s, 0x%02x, 0);" % self.op) + elif not csf: + print(" cl[%2d] = 0;" % index) continue word = words[index] word_start = index * 32 + size = 32 + # Can we move all fields from the next index here? + if csf and index % 2 == 0 and index + 1 in words: + word_next = words[index + 1] + end = max(c.end for c in word_next.contributors) + if end - word_start < 48: + size = 48 + word.contributors += [x for x in word_next.contributors if not x in word.contributors] + del words[index + 1] + v = None - prefix = " cl[%2d] =" % index + if ins: + prefix = " pan_emit_cs_ins(s, 0x%02x," % self.op + elif size == 48: + prefix = " pan_emit_cs_48(s, 0x%02x," % index + elif csf: + prefix = " pan_emit_cs_32(s, 0x%02x," % index + else: + prefix = " cl[%2d] = (" % index for contributor in word.contributors: field = contributor.field name = field.name start = contributor.start end = contributor.end - contrib_word_start = (start // 32) * 32 + contrib_word_start = (start // word.size) * word.size start -= contrib_word_start end -= contrib_word_start @@ -498,7 +653,7 @@ def emit_pack_function(self): elif field.modifier[0] == "log2": value = "util_logbase2({})".format(value) - if field.type in ["uint", "hex", "uint/float", "address", "Pixel Format"]: + if field.type in ["uint", "hex", "uint/float", "address", "register", "Pixel Format"]: s = "util_bitpack_uint(%s, %d, %d)" % \ (value, start, end) elif field.type == "padded": @@ -529,11 +684,13 @@ def emit_pack_function(self): if not s == None: shift = word_start - contrib_word_start - if shift: + if shift > 0: s = "%s >> %d" % (s, shift) + elif shift < 0: + s = "%s << %d" % (s, -shift) if contributor == word.contributors[-1]: - print("%s %s;" % (prefix, s)) + print("%s %s);" % (prefix, s)) else: print("%s %s |" % (prefix, s)) prefix = " " @@ -552,22 +709,23 @@ def mask_for_word(self, index, start, end): count = (end - start + 1) return (((1 << count) - 1) << start) - def emit_unpack_function(self): + def emit_unpack_function(self, csf=False): # First, verify there is no garbage in unused bits words = {} self.collect_words(self.fields, 0, '', words) - for index in range(self.length // 4): - base = index * 32 - word = words.get(index, self.Word()) - masks = [self.mask_for_word(index, c.start, c.end) for c in word.contributors] - mask = reduce(lambda x,y: x | y, masks, 0) + if not csf: + for index in range(self.length // 4): + base = index * 32 + word = words.get(index, self.Word()) + masks = [self.mask_for_word(index, c.start, c.end) for c in word.contributors] + mask = reduce(lambda x,y: x | y, masks, 0) - ALL_ONES = 0xffffffff + ALL_ONES = 0xffffffff - if mask != ALL_ONES: - TMPL = ' if (((const uint32_t *) cl)[{}] & {}) fprintf(stderr, "XXX: Invalid field of {} unpacked at word {}\\n");' - print(TMPL.format(index, hex(mask ^ ALL_ONES), self.label, index)) + if mask != ALL_ONES: + TMPL = ' if (((const uint32_t *) cl)[{}] & {}) fprintf(stderr, "XXX: Invalid field of {} unpacked at word {}\\n");' + print(TMPL.format(index, hex(mask ^ ALL_ONES), self.label, index)) fieldrefs = [] self.collect_fields(self.fields, 0, '', fieldrefs) @@ -580,7 +738,7 @@ def emit_unpack_function(self): args.append(str(fieldref.start)) args.append(str(fieldref.end)) - if field.type in set(["uint", "hex", "uint/float", "address", "Pixel Format"]): + if 
field.type in set(["uint", "hex", "uint/float", "address", "register", "Pixel Format"]): convert = "__gen_unpack_uint" elif field.type in self.parser.enums: convert = "(enum %s)__gen_unpack_uint" % enum_name(field.type) @@ -616,6 +774,9 @@ def emit_unpack_function(self): mask = hex(field.modifier[1] - 1) print(' assert(!(values->{} & {}));'.format(fieldref.path, mask)) + if csf: + print(' __gen_clear_value({});'.format(', '.join(['cl_unk'] + args[1:]))) + def emit_print_function(self): for field in self.fields: convert = None @@ -638,7 +799,7 @@ def emit_print_function(self): print(' fprintf(fp, "%*s{}: %f\\n", indent, "", {});'.format(name, val)) elif field.type in ["uint", "hex"] and (field.end - field.start) >= 32: print(' fprintf(fp, "%*s{}: 0x%" PRIx64 "\\n", indent, "", {});'.format(name, val)) - elif field.type == "hex": + elif field.type in ("hex", "register"): print(' fprintf(fp, "%*s{}: 0x%x\\n", indent, "", {});'.format(name, val)) elif field.type == "uint/float": print(' fprintf(fp, "%*s{}: 0x%X (%f)\\n", indent, "", {}, uif({}));'.format(name, val, val)) @@ -677,9 +838,13 @@ def start_element(self, name, attrs): print(v6_format_printer) else: print(v7_format_printer) + if arch < 10: + print(no_cs) + else: + print(with_cs) elif name == "struct": name = attrs["name"] - self.no_direct_packing = attrs.get("no-direct-packing", False) + self.layout = attrs.get("layout", "struct") object_name = self.gen_prefix(safe_name(name.upper())) self.struct = object_name @@ -687,10 +852,16 @@ def start_element(self, name, attrs): if "size" in attrs: self.group.length = int(attrs["size"]) * 4 self.group.align = int(attrs["align"]) if "align" in attrs else None + self.group.op = int(attrs["op"]) if "op" in attrs else None self.structs[attrs["name"]] = self.group + self.unpacked_alias = self.gen_prefix(safe_name(attrs["unpacked"].upper())) if "unpacked" in attrs else None elif name == "field": - self.group.fields.append(Field(self, attrs)) self.values = [] + self.skip_field = self.layout == "cs" and not attrs["start"].startswith("0x") + if self.skip_field: + #print(f"#warning Skipping non-CS field {attrs['name']}") + return + self.group.fields.append(Field(self, attrs)) elif name == "enum": self.values = [] self.enum = safe_name(attrs["name"]) @@ -703,6 +874,8 @@ def start_element(self, name, attrs): self.values.append(Value(attrs)) elif name == "aggregate": aggregate_name = self.gen_prefix(safe_name(attrs["name"].upper())) + # TODO: Make .layout less "global"? 
+ self.layout = attrs.get("layout", "struct") self.aggregate = Aggregate(self, aggregate_name, attrs) self.aggregates[attrs['name']] = self.aggregate elif name == "section": @@ -715,7 +888,8 @@ def end_element(self, name): self.struct = None self.group = None elif name == "field": - self.group.fields[-1].values = self.values + if not self.skip_field: + self.group.fields[-1].values = self.values elif name == "enum": self.emit_enum() self.enum = None @@ -745,22 +919,33 @@ def emit_header(self, name): print('') def emit_template_struct(self, name, group): - print("struct %s {" % name) - group.emit_template_struct("") - print("};\n") + if self.unpacked_alias: + # TODO: Check the fields match + print("#define %s %s" % (name, self.unpacked_alias)) + else: + print("struct %s {" % name) + group.emit_template_struct("") + print("};\n") def emit_aggregate(self): aggregate = self.aggregate - print("struct %s_packed {" % aggregate.name.lower()) - print(" uint32_t opaque[{}];".format(aggregate.get_size() // 4)) - print("};\n") - print('#define {}_LENGTH {}'.format(aggregate.name.upper(), aggregate.size)) + + if self.layout == "struct": + print("struct %s_packed {" % aggregate.name.lower()) + print(" uint32_t opaque[{}];".format(aggregate.get_size() // 4)) + print("};\n") + print('#define {}_LENGTH {}'.format(aggregate.name.upper(), aggregate.size)) + else: + assert(self.layout == "cs") + if aggregate.align != None: print('#define {}_ALIGN {}'.format(aggregate.name.upper(), aggregate.align)) for section in aggregate.sections: print('#define {}_SECTION_{}_TYPE struct {}'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) print('#define {}_SECTION_{}_header {}_header'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) print('#define {}_SECTION_{}_pack {}_pack'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) + # TODO: Only when req'd + print('#define {}_SECTION_{}_pack_cs {}_pack_cs'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) print('#define {}_SECTION_{}_unpack {}_unpack'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) print('#define {}_SECTION_{}_print {}_print'.format(aggregate.name.upper(), section.name.upper(), section.type_name)) print('#define {}_SECTION_{}_OFFSET {}'.format(aggregate.name.upper(), section.name.upper(), section.offset)) @@ -775,12 +960,32 @@ def emit_pack_function(self, name, group): print("}\n\n") # Should be a whole number of words - assert((self.group.length % 4) == 0) + assert((group.length % 4) == 0) + + print('#define {} {}'.format (name + "_LENGTH", group.length)) + if group.align != None: + print('#define {} {}'.format (name + "_ALIGN", group.align)) + print('struct {}_packed {{ uint32_t opaque[{}]; }};'.format(name.lower(), group.length // 4)) + + def emit_cs_pack_function(self, name, group): + print("static inline void\n%s_pack_cs(pan_command_stream * restrict s,\n%sconst struct %s * restrict values)\n{\n" % + (name, ' ' * (len(name) + 6), name)) + + group.emit_pack_function(csf=True) - print('#define {} {}'.format (name + "_LENGTH", self.group.length)) - if self.group.align != None: - print('#define {} {}'.format (name + "_ALIGN", self.group.align)) - print('struct {}_packed {{ uint32_t opaque[{}]; }};'.format(name.lower(), self.group.length // 4)) + print("}\n\n") + + assert(group.length == 256 * 4) + + def emit_ins_pack_function(self, name, group): + print("static inline void\n%s_pack_ins(pan_command_stream * restrict s,\n%sconst struct %s * 
restrict values)\n{" % + (name, ' ' * (len(name) + 6), name)) + + group.emit_pack_function(csf=True, ins=True) + + print("}\n\n") + + assert(group.length == 256 * 4) def emit_unpack_function(self, name, group): print("static inline void") @@ -791,6 +996,18 @@ def emit_unpack_function(self, name, group): print("}\n") + def emit_cs_unpack_function(self, name, group): + print("static inline void") + print("%s_unpack(const uint32_t * restrict buffer, uint32_t * restrict buffer_unk,\n" + "%sstruct %s * restrict values)\n{" + " const uint8_t *cl = (uint8_t *)buffer;\n" + " uint8_t *cl_unk = (uint8_t *)buffer_unk;\n" % + (name.upper(), ' ' * (len(name) + 8), name)) + + group.emit_unpack_function(csf=True) + + print("}\n") + def emit_print_function(self, name, group): print("static inline void") print("{}_print(FILE *fp, const struct {} * values, unsigned indent)\n{{".format(name.upper(), name)) @@ -804,14 +1021,20 @@ def emit_struct(self): self.emit_template_struct(self.struct, self.group) self.emit_header(name) - if self.no_direct_packing == False: + if self.layout == "struct": self.emit_pack_function(self.struct, self.group) self.emit_unpack_function(self.struct, self.group) + elif self.layout == "cs": + self.emit_cs_pack_function(self.struct, self.group) + self.emit_cs_unpack_function(self.struct, self.group) + elif self.layout == "ins": + # TODO: I don't think that the current unpack emit functions would + # work + self.emit_ins_pack_function(self.struct, self.group) + else: + assert(self.layout == "none") self.emit_print_function(self.struct, self.group) - def enum_prefix(self, name): - return - def emit_enum(self): e_name = enum_name(self.enum) prefix = e_name if self.enum != 'Format' else global_prefix diff --git a/src/panfrost/lib/genxml/meson.build.rej b/src/panfrost/lib/genxml/meson.build.rej new file mode 100644 index 00000000000..75405947ded --- /dev/null +++ b/src/panfrost/lib/genxml/meson.build.rej @@ -0,0 +1,19 @@ +diff a/src/panfrost/lib/genxml/meson.build b/src/panfrost/lib/genxml/meson.build (rejected hunks) +@@ -20,7 +20,7 @@ + # SOFTWARE. 
+ + pan_packers = [] +-foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9'] ++foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10'] + pan_packers += custom_target( + packer + '_pack.h', + input : ['gen_pack.py', packer + '.xml'], +@@ -37,7 +37,7 @@ idep_pan_packers = declare_dependency( + + libpanfrost_decode_per_arch = [] + +-foreach ver : ['4', '5', '6', '7', '9'] ++foreach ver : ['4', '5', '6', '7', '9', '10'] + libpanfrost_decode_per_arch += static_library( + 'pandecode-arch-v' + ver, + ['decode.c', pan_packers], diff --git a/src/panfrost/lib/genxml/v4.xml b/src/panfrost/lib/genxml/v4.xml index 63b7f7f57ac..4f8dd3f2e13 100644 --- a/src/panfrost/lib/genxml/v4.xml +++ b/src/panfrost/lib/genxml/v4.xml @@ -446,7 +446,7 @@ - + diff --git a/src/panfrost/lib/genxml/v5.xml b/src/panfrost/lib/genxml/v5.xml index 6c53dac00e6..eacd75501f5 100644 --- a/src/panfrost/lib/genxml/v5.xml +++ b/src/panfrost/lib/genxml/v5.xml @@ -467,7 +467,7 @@ - + diff --git a/src/panfrost/lib/genxml/v6.xml b/src/panfrost/lib/genxml/v6.xml index 9d042c4db93..701c204d04b 100644 --- a/src/panfrost/lib/genxml/v6.xml +++ b/src/panfrost/lib/genxml/v6.xml @@ -467,7 +467,7 @@ - + @@ -689,7 +689,7 @@ - + @@ -708,7 +708,7 @@ - + @@ -717,7 +717,7 @@ - + diff --git a/src/panfrost/lib/genxml/v7.xml b/src/panfrost/lib/genxml/v7.xml index 7e0b794ec85..ec0bad1f0c3 100644 --- a/src/panfrost/lib/genxml/v7.xml +++ b/src/panfrost/lib/genxml/v7.xml @@ -512,7 +512,7 @@ - + @@ -762,7 +762,7 @@ - + @@ -781,7 +781,7 @@ - + @@ -790,7 +790,7 @@ - + @@ -854,13 +854,13 @@ - + - + diff --git a/src/panfrost/lib/genxml/v9.xml b/src/panfrost/lib/genxml/v9.xml index c08d49e2025..0818e5128a6 100644 --- a/src/panfrost/lib/genxml/v9.xml +++ b/src/panfrost/lib/genxml/v9.xml @@ -526,7 +526,7 @@ - + @@ -1322,28 +1322,28 @@ - - - - - - + + + + + + - - - - - - - - - - - - - + + + + + + + + + + + + + @@ -1353,9 +1353,9 @@ - - - + + + @@ -1376,8 +1376,8 @@ - - + + @@ -1387,6 +1387,7 @@ + @@ -1404,7 +1405,7 @@ - + @@ -1420,24 +1421,24 @@ - - - - - - - + + + + + + + - + - + - - + + diff --git a/src/panfrost/lib/genxml/v9.xml.rej b/src/panfrost/lib/genxml/v9.xml.rej new file mode 100644 index 00000000000..2594d849f1d --- /dev/null +++ b/src/panfrost/lib/genxml/v9.xml.rej @@ -0,0 +1,28 @@ +diff a/src/panfrost/lib/genxml/v9.xml b/src/panfrost/lib/genxml/v9.xml (rejected hunks) +@@ -599,12 +599,6 @@ + + + +- +- +- +- +- +- + + + +@@ -612,10 +606,10 @@ + + + +- +- ++ ++ + +- ++ + + + diff --git a/src/panfrost/lib/meson.build b/src/panfrost/lib/meson.build index 12b927a2973..344d5299deb 100644 --- a/src/panfrost/lib/meson.build +++ b/src/panfrost/lib/meson.build @@ -40,7 +40,7 @@ endforeach libpanfrost_per_arch = [] -foreach ver : ['4', '5', '6', '7', '9'] +foreach ver : ['4', '5', '6', '7', '9', '10'] libpanfrost_per_arch += static_library( 'pan-arch-v' + ver, [ diff --git a/src/panfrost/lib/meson.build.rej b/src/panfrost/lib/meson.build.rej new file mode 100644 index 00000000000..775ed402e1d --- /dev/null +++ b/src/panfrost/lib/meson.build.rej @@ -0,0 +1,10 @@ +diff a/src/panfrost/lib/meson.build b/src/panfrost/lib/meson.build (rejected hunks) +@@ -93,7 +93,7 @@ libpanfrost_lib = static_library( + include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw], + c_args : [no_override_init_args], + gnu_symbol_visibility : 'hidden', +- dependencies: [dep_libdrm, idep_nir], ++ dependencies: [dep_libdrm, idep_nir, libpanfrost_base_dep], + build_by_default : false, + link_with: [libpanfrost_pixel_format, 
libpanfrost_per_arch], + ) diff --git a/src/panfrost/lib/pan_afbc.c.rej b/src/panfrost/lib/pan_afbc.c.rej new file mode 100644 index 00000000000..92be882a371 --- /dev/null +++ b/src/panfrost/lib/pan_afbc.c.rej @@ -0,0 +1,25 @@ +diff a/src/panfrost/lib/pan_afbc.c b/src/panfrost/lib/pan_afbc.c (rejected hunks) +@@ -125,10 +125,6 @@ panfrost_afbc_format(unsigned arch, enum pipe_format format) + */ + format = util_format_linear(format); + +- /* Don't allow swizzled formats on v7+ */ +- if (arch >= 7 && format != unswizzled_format(format)) +- return PIPE_FORMAT_NONE; +- + /* Otherwise swizzling doesn't affect AFBC */ + format = unswizzled_format(format); + +@@ -189,3 +185,12 @@ panfrost_afbc_can_tile(const struct panfrost_device *dev) + { + return (dev->arch >= 7); + } ++ ++/* ++ * Can this format only be used with AFBC_FORMAT_MOD_NATIVE_SWIZZLE? ++ */ ++bool ++panfrost_afbc_only_native(unsigned arch, enum pipe_format format) ++{ ++ return (arch >= 7 && format != unswizzled_format(format)); ++} diff --git a/src/panfrost/lib/pan_blend.c.rej b/src/panfrost/lib/pan_blend.c.rej new file mode 100644 index 00000000000..4c4b12a9cc8 --- /dev/null +++ b/src/panfrost/lib/pan_blend.c.rej @@ -0,0 +1,10 @@ +diff a/src/panfrost/lib/pan_blend.c b/src/panfrost/lib/pan_blend.c (rejected hunks) +@@ -800,7 +800,7 @@ GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev, + }; + + /* Blend shaders should only be used for blending on Bifrost onwards */ +- assert(dev->arch <= 5 || !pan_blend_is_opaque(state->rts[rt].equation)); ++ assert(dev->arch <= 5 || state->logicop_enable || !pan_blend_is_opaque(state->rts[rt].equation)); + assert(state->rts[rt].equation.color_mask != 0); + + struct hash_entry *he = _mesa_hash_table_search(dev->blend_shaders.shaders, &key); diff --git a/src/panfrost/lib/pan_blitter.c.rej b/src/panfrost/lib/pan_blitter.c.rej new file mode 100644 index 00000000000..ea98fc10c8a --- /dev/null +++ b/src/panfrost/lib/pan_blitter.c.rej @@ -0,0 +1,28 @@ +diff a/src/panfrost/lib/pan_blitter.c b/src/panfrost/lib/pan_blitter.c (rejected hunks) +@@ -1150,7 +1150,7 @@ pan_preload_emit_dcd(struct pan_pool *pool, + blend.cpu); + } + +- pan_pack(out, DRAW, cfg) { ++ pan_pack(out, DRAW_NO_CS, cfg) { + if (zs) { + /* ZS_EMIT requires late update/kill */ + cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; +@@ -1225,7 +1225,7 @@ pan_preload_fb_alloc_pre_post_dcds(struct pan_pool *desc_pool, + return; + + fb->bifrost.pre_post.dcds = +- pan_pool_alloc_desc_array(desc_pool, 3, DRAW); ++ pan_pool_alloc_desc_array(desc_pool, 3, DRAW_NO_CS); + } + + static void +@@ -1237,7 +1237,7 @@ pan_preload_emit_pre_frame_dcd(struct pan_pool *desc_pool, + pan_preload_fb_alloc_pre_post_dcds(desc_pool, fb); + assert(fb->bifrost.pre_post.dcds.cpu); + void *dcd = fb->bifrost.pre_post.dcds.cpu + +- (dcd_idx * pan_size(DRAW)); ++ (dcd_idx * pan_size(DRAW_NO_CS)); + + /* We only use crc_rt to determine whether to force writes for updating + * the CRCs, so use a conservative tile size (16x16). diff --git a/src/panfrost/lib/pan_bo.c b/src/panfrost/lib/pan_bo.c index 77d59a2719a..eebc2266b52 100644 --- a/src/panfrost/lib/pan_bo.c +++ b/src/panfrost/lib/pan_bo.c @@ -38,6 +38,7 @@ #include "util/u_inlines.h" #include "util/u_math.h" +#include "util/os_file.h" /* This file implements a userspace BO cache. 
Allocating and freeing * GPU-visible buffers is very expensive, and even the extra kernel roundtrips diff --git a/src/panfrost/lib/pan_bo.c.rej b/src/panfrost/lib/pan_bo.c.rej new file mode 100644 index 00000000000..ca2610eef42 --- /dev/null +++ b/src/panfrost/lib/pan_bo.c.rej @@ -0,0 +1,584 @@ +diff a/src/panfrost/lib/pan_bo.c b/src/panfrost/lib/pan_bo.c (rejected hunks) +@@ -71,7 +72,38 @@ panfrost_bo_alloc(struct panfrost_device *dev, size_t size, + create_bo.flags |= PANFROST_BO_NOEXEC; + } + +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); ++ void *cpu = NULL; ++ ++ bool cached = false; ++ ++ if (dev->kbase) { ++ if (flags & PAN_BO_CACHEABLE) { ++ if (!(dev->debug & PAN_DBG_UNCACHED_CPU)) { ++ create_bo.flags |= MALI_BO_CACHED_CPU; ++ /* TODO: What if kbase decides not to cache it? */ ++ cached = true; ++ } ++ if (dev->debug & PAN_DBG_UNCACHED_GPU) ++ create_bo.flags |= MALI_BO_UNCACHED_GPU; ++ } ++ ++ unsigned mali_flags = (flags & PAN_BO_EVENT) ? 0x8200f : 0; ++ ++ struct base_ptr p = dev->mali.alloc(&dev->mali, size, create_bo.flags, mali_flags); ++ ++ if (p.gpu) { ++ cpu = p.cpu; ++ create_bo.offset = p.gpu; ++ create_bo.handle = kbase_alloc_gem_handle(&dev->mali, p.gpu, -1); ++ if (!cpu) ++ abort(); ++ ret = 0; ++ } else { ++ ret = -1; ++ } ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); ++ } + if (ret) { + fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n"); + return NULL; +@@ -82,29 +114,99 @@ panfrost_bo_alloc(struct panfrost_device *dev, size_t size, + + bo->size = create_bo.size; + bo->ptr.gpu = create_bo.offset; ++ bo->ptr.cpu = cpu; ++ if ((uintptr_t) bo->ptr.cpu != bo->ptr.gpu) ++ bo->free_ioctl = true; + bo->gem_handle = create_bo.handle; + bo->flags = flags; + bo->dev = dev; + bo->label = label; ++ bo->cached = cached; ++ bo->dmabuf_fd = -1; + return bo; + } + + static void + panfrost_bo_free(struct panfrost_bo *bo) + { ++ struct panfrost_device *dev = bo->dev; + struct drm_gem_close gem_close = { .handle = bo->gem_handle }; + int ret; + +- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li memfree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ if (dev->kbase) { ++ os_munmap(bo->ptr.cpu, bo->size); ++ if (bo->munmap_ptr) ++ os_munmap(bo->munmap_ptr, bo->size); ++ if (bo->free_ioctl) ++ dev->mali.free(&dev->mali, bo->ptr.gpu); ++ kbase_free_gem_handle(&dev->mali, bo->gem_handle); ++ ret = 0; ++ } else { ++ ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); ++ } + if (ret) { + fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n"); + assert(0); + } + +- /* BO will be freed with the sparse array, but zero to indicate free */ ++ /* BO will be freed with the stable_array, but zero to indicate free */ + memset(bo, 0, sizeof(*bo)); + } + ++static bool ++panfrost_bo_usage_finished(struct panfrost_bo *bo, bool readers) ++{ ++ struct panfrost_device *dev = bo->dev; ++ kbase k = &dev->mali; ++ ++ bool ret = true; ++ ++ pthread_mutex_lock(&dev->bo_usage_lock); ++ pthread_mutex_lock(&dev->mali.queue_lock); ++ ++ util_dynarray_foreach(&bo->usage, struct panfrost_usage, u) { ++ /* Skip if we are only waiting for writers */ ++ if (!u->write && 
!readers) ++ continue; ++ ++ /* Usages are ordered, so everything else is also invalid */ ++ if (u->queue >= k->event_slot_usage) ++ break; ++ ++ struct kbase_event_slot *slot = &k->event_slots[u->queue]; ++ uint64_t seqnum = u->seqnum; ++ ++ /* There is a race condition, where we can depend on an ++ * unsubmitted batch. In that cade, decrease the seqnum. ++ * Otherwise, skip invalid dependencies. TODO: do GC? */ ++ if (slot->last_submit == seqnum) ++ --seqnum; ++ else if (slot->last_submit < seqnum) ++ continue; ++ ++ if (slot->last <= seqnum) { ++ ret = false; ++ break; ++ } ++ } ++ ++ pthread_mutex_unlock(&dev->mali.queue_lock); ++ pthread_mutex_unlock(&dev->bo_usage_lock); ++ ++ return ret; ++} ++ + /* Returns true if the BO is ready, false otherwise. + * access_type is encoding the type of access one wants to ensure is done. + * Waiting is always done for writers, but if wait_readers is set then readers +@@ -113,12 +215,15 @@ panfrost_bo_free(struct panfrost_bo *bo) + bool + panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + { ++ struct panfrost_device *dev = bo->dev; + struct drm_panfrost_wait_bo req = { + .handle = bo->gem_handle, + .timeout_ns = timeout_ns, + }; + int ret; + ++ /* TODO: With driver-handled sync, is gpu_access even worth it? */ ++ + /* If the BO has been exported or imported we can't rely on the cached + * state, we need to call the WAIT_BO ioctl. + */ +@@ -134,10 +239,31 @@ panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + return true; + } + ++ if (dev->kbase && (dev->arch >= 10)) { ++ struct kbase_wait_ctx wait = kbase_wait_init(&dev->mali, timeout_ns); ++ while (kbase_wait_for_event(&wait)) { ++ if (panfrost_bo_usage_finished(bo, wait_readers)) ++ break; ++ } ++ kbase_wait_fini(wait); ++ ++ bool ret = panfrost_bo_usage_finished(bo, wait_readers); ++ if (bo->flags & PAN_BO_SHARED) ++ ret &= kbase_poll_fd_until(bo->dmabuf_fd, wait_readers, wait.until); ++ ++ if (ret) ++ bo->gpu_access &= (wait_readers ? 0 : PAN_BO_ACCESS_READ); ++ return ret; ++ } ++ + /* The ioctl returns >= 0 value when the BO we are waiting for is ready + * -1 otherwise. + */ +- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); ++ if (dev->kbase) ++ ret = kbase_wait_bo(&dev->mali, bo->gem_handle, timeout_ns, ++ wait_readers); ++ else ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); + if (ret != -1) { + /* Set gpu_access to 0 so that the next call to bo_wait() + * doesn't have to call the WAIT_BO ioctl. +@@ -153,6 +279,32 @@ panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) + return false; + } + ++static void ++panfrost_bo_mem_op(struct panfrost_bo *bo, size_t offset, size_t length, bool invalidate) ++{ ++ struct panfrost_device *dev = bo->dev; ++ ++ assert(offset + length <= bo->size); ++ ++ if (!bo->cached) ++ return; ++ ++ dev->mali.mem_sync(&dev->mali, bo->ptr.gpu, bo->ptr.cpu + offset, length, ++ invalidate); ++} ++ ++void ++panfrost_bo_mem_invalidate(struct panfrost_bo *bo, size_t offset, size_t length) ++{ ++ panfrost_bo_mem_op(bo, offset, length, true); ++} ++ ++void ++panfrost_bo_mem_clean(struct panfrost_bo *bo, size_t offset, size_t length) ++{ ++ panfrost_bo_mem_op(bo, offset, length, false); ++} ++ + /* Helper to calculate the bucket index of a BO */ + + static unsigned +@@ -200,21 +352,31 @@ panfrost_bo_cache_fetch(struct panfrost_device *dev, + + /* If the oldest BO in the cache is busy, likely so is + * everything newer, so bail. 
*/ +- if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, +- PAN_BO_ACCESS_RW)) +- break; ++ ++ /* For kbase, BOs are not added to the cache until the GPU is ++ * done with them, so there is no need to wait. */ ++ if (!dev->kbase) { ++ if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, ++ PAN_BO_ACCESS_RW)) ++ break; ++ } + + struct drm_panfrost_madvise madv = { + .handle = entry->gem_handle, + .madv = PANFROST_MADV_WILLNEED, + }; +- int ret; ++ int ret = 0; + + /* This one works, splice it out of the cache */ + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ if (dev->kbase) { ++ /* With kbase, BOs are never freed from the cache */ ++ madv.retained = true; ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ } + if (!ret && !madv.retained) { + panfrost_bo_free(entry); + continue; +@@ -276,7 +438,10 @@ panfrost_bo_cache_put(struct panfrost_bo *bo) + madv.madv = PANFROST_MADV_DONTNEED; + madv.retained = 0; + +- drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); ++ // TODO: Allow freeing madvise'd BOs with kbase... not that it really ++ // matters for boards with 16 GB RAM ++ if (!dev->kbase) ++ drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); + + /* Add us to the bucket */ + list_addtail(&bo->bucket_link, bucket); +@@ -286,6 +451,10 @@ panfrost_bo_cache_put(struct panfrost_bo *bo) + clock_gettime(CLOCK_MONOTONIC, &time); + bo->last_used = time.tv_sec; + ++ /* For kbase, the GPU can't be accessing this BO any more */ ++ if (dev->kbase) ++ bo->gpu_access = 0; ++ + /* Let's do some cleanup in the BO cache while we hold the + * lock. + */ +@@ -352,10 +521,15 @@ panfrost_bo_mmap(struct panfrost_bo *bo) + static void + panfrost_bo_munmap(struct panfrost_bo *bo) + { ++ /* We can't munmap BOs when using kbase, as that frees the storage and ++ * the GPU might still be using the BO. 
*/ ++ if (bo->dev->kbase) ++ return; ++ + if (!bo->ptr.cpu) + return; + +- if (os_munmap((void *) (uintptr_t)bo->ptr.cpu, bo->size)) { ++ if (os_munmap(bo->ptr.cpu, bo->size)) { + perror("munmap"); + abort(); + } +@@ -390,8 +564,15 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + if (!bo) + bo = panfrost_bo_cache_fetch(dev, size, flags, label, false); + if (!bo) { +- panfrost_bo_cache_evict_all(dev); +- bo = panfrost_bo_alloc(dev, size, flags, label); ++ for (unsigned i = 0; i < 5; ++i) { ++ usleep(20 * 1000 * i * i); ++ if (dev->kbase) ++ kbase_ensure_handle_events(&dev->mali); ++ panfrost_bo_cache_evict_all(dev); ++ bo = panfrost_bo_alloc(dev, size, flags, label); ++ if (bo) ++ break; ++ } + } + + if (!bo) { +@@ -406,8 +587,15 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP))) + panfrost_bo_mmap(bo); + ++ if ((dev->debug & PAN_DBG_BO_CLEAR) && !(flags & PAN_BO_INVISIBLE)) { ++ memset(bo->ptr.cpu, 0, bo->size); ++ panfrost_bo_mem_clean(bo, 0, bo->size); ++ } ++ + p_atomic_set(&bo->refcnt, 1); + ++ util_dynarray_init(&bo->usage, NULL); ++ + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + if (flags & PAN_BO_INVISIBLE) + pandecode_inject_mmap(bo->ptr.gpu, NULL, bo->size, NULL); +@@ -415,6 +603,14 @@ panfrost_bo_create(struct panfrost_device *dev, size_t size, + pandecode_inject_mmap(bo->ptr.gpu, bo->ptr.cpu, bo->size, NULL); + } + ++ if (dev->bo_log) { ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li alloc %"PRIx64" to %"PRIx64" size %zu label %s\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label); ++ fflush(NULL); ++ } ++ + return bo; + } + +@@ -427,6 +623,60 @@ panfrost_bo_reference(struct panfrost_bo *bo) + } + } + ++static void ++panfrost_bo_fini(struct panfrost_bo *bo) ++{ ++ struct panfrost_device *dev = bo->dev; ++ ++ /* When the reference count goes to zero, we need to cleanup */ ++ panfrost_bo_munmap(bo); ++ ++ if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) ++ pandecode_inject_free(bo->ptr.gpu, bo->size); ++ ++ /* Rather than freeing the BO now, we'll cache the BO for later ++ * allocations if we're allowed to. ++ */ ++ if (!panfrost_bo_cache_put(bo)) ++ panfrost_bo_free(bo); ++} ++ ++static void ++panfrost_bo_free_gpu(void *data) ++{ ++ struct panfrost_bo *bo = data; ++ struct panfrost_device *dev = bo->dev; ++ ++ /* Don't free if there are still references */ ++ if (p_atomic_dec_return(&bo->gpu_refcnt)) ++ return; ++ ++ pthread_mutex_lock(&dev->bo_map_lock); ++ ++ /* Someone might have imported this BO while we were waiting for the ++ * lock, let's make sure it's still not referenced before freeing it. 
++ */ ++ if (p_atomic_read(&bo->refcnt) != 0) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } ++ ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li gpufree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ panfrost_bo_fini(bo); ++ ++ pthread_mutex_unlock(&dev->bo_map_lock); ++} ++ + void + panfrost_bo_unreference(struct panfrost_bo *bo) + { +@@ -439,25 +689,57 @@ panfrost_bo_unreference(struct panfrost_bo *bo) + + struct panfrost_device *dev = bo->dev; + ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li free %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ + pthread_mutex_lock(&dev->bo_map_lock); + + /* Someone might have imported this BO while we were waiting for the + * lock, let's make sure it's still not referenced before freeing it. + */ +- if (p_atomic_read(&bo->refcnt) == 0) { +- /* When the reference count goes to zero, we need to cleanup */ +- panfrost_bo_munmap(bo); ++ if (p_atomic_read(&bo->refcnt) != 0) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } + +- if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) +- pandecode_inject_free(bo->ptr.gpu, bo->size); ++ util_dynarray_fini(&bo->usage); + +- /* Rather than freeing the BO now, we'll cache the BO for later +- * allocations if we're allowed to. ++ if (dev->kbase) { ++ /* Assume that all queues are using this BO, and so free the ++ * BO only after all currently-submitted jobs have finished. ++ * This could eventually be optimised to only wait on a subset ++ * of queues. 
+ */ +- if (!panfrost_bo_cache_put(bo)) +- panfrost_bo_free(bo); ++ bool added = dev->mali.callback_all_queues(&dev->mali, ++ &bo->gpu_refcnt, panfrost_bo_free_gpu, bo); + ++ if (added) { ++ pthread_mutex_unlock(&dev->bo_map_lock); ++ return; ++ } + } ++ ++ if (dev->bo_log) { ++ int fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li immfree %"PRIx64" to %"PRIx64" size %zu label %s obj (%p,%i,%i)\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, bo->label, ++ bo, bo->gem_handle, fd); ++ fflush(NULL); ++ } ++ ++ panfrost_bo_fini(bo); ++ + pthread_mutex_unlock(&dev->bo_map_lock); + } + +@@ -467,22 +749,42 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + struct panfrost_bo *bo; + struct drm_panfrost_get_bo_offset get_bo_offset = {0,}; + ASSERTED int ret; ++ kbase_handle handle = { .fd = -1 }; + unsigned gem_handle; + +- ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); +- assert(!ret); ++ if (dev->kbase) { ++ gem_handle = dev->mali.import_dmabuf(&dev->mali, fd); ++ if (gem_handle == -1) ++ return NULL; ++ } else { ++ ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); ++ assert(!ret); ++ } + + pthread_mutex_lock(&dev->bo_map_lock); + bo = pan_lookup_bo(dev, gem_handle); + ++ bool found = false; ++ + if (!bo->dev) { + get_bo_offset.handle = gem_handle; +- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); +- assert(!ret); ++ if (dev->kbase) { ++ handle = kbase_gem_handle_get(&dev->mali, gem_handle); ++ get_bo_offset.offset = handle.va; ++ } else { ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); ++ assert(!ret); ++ } + + bo->dev = dev; +- bo->ptr.gpu = (mali_ptr) get_bo_offset.offset; + bo->size = lseek(fd, 0, SEEK_END); ++ bo->ptr.gpu = (mali_ptr) get_bo_offset.offset; ++ if (dev->kbase && (sizeof(void *) > 4 || get_bo_offset.offset < (1LL << 32))) { ++ bo->ptr.cpu = (void *)(uintptr_t) get_bo_offset.offset; ++ } else if (dev->kbase) { ++ bo->ptr.cpu = dev->mali.mmap_import(&dev->mali, bo->ptr.gpu, bo->size); ++ bo->free_ioctl = true; ++ } + /* Sometimes this can fail and return -1. size of -1 is not + * a nice thing for mmap to try mmap. Be more robust also + * for zero sized maps and fail nicely too +@@ -493,8 +795,21 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + } + bo->flags = PAN_BO_SHARED; + bo->gem_handle = gem_handle; ++ util_dynarray_init(&bo->usage, NULL); ++ if (dev->kbase) { ++ /* kbase always maps dma-bufs with caching */ ++ bo->cached = true; ++ ++ /* Importing duplicates the FD, so we cache the FD ++ * from the handle */ ++ bo->dmabuf_fd = handle.fd; ++ } else { ++ bo->dmabuf_fd = -1; ++ } + p_atomic_set(&bo->refcnt, 1); + } else { ++ found = true; ++ + /* bo->refcnt == 0 can happen if the BO + * was being released but panfrost_bo_import() acquired the + * lock before panfrost_bo_unreference(). 
In that case, refcnt +@@ -512,12 +827,34 @@ panfrost_bo_import(struct panfrost_device *dev, int fd) + } + pthread_mutex_unlock(&dev->bo_map_lock); + ++ if (dev->bo_log) { ++ int new_fd = kbase_gem_handle_get(&dev->mali, bo->gem_handle).fd; ++ ++ struct timespec tp; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &tp); ++ fprintf(dev->bo_log, "%"PRIu64".%09li import %"PRIx64" to %"PRIx64" size %zu fd %i new %i handle %i found %i\n", ++ (uint64_t) tp.tv_sec, tp.tv_nsec, bo->ptr.gpu, bo->ptr.gpu + bo->size, bo->size, ++ fd, new_fd, gem_handle, found); ++ fflush(NULL); ++ } ++ + return bo; + } + + int + panfrost_bo_export(struct panfrost_bo *bo) + { ++ struct panfrost_device *dev = bo->dev; ++ ++ if (bo->dmabuf_fd != -1) { ++ assert(bo->flags & PAN_BO_SHARED); ++ ++ return os_dupfd_cloexec(bo->dmabuf_fd); ++ } ++ ++ if (dev->kbase) ++ return -1; ++ + struct drm_prime_handle args = { + .handle = bo->gem_handle, + .flags = DRM_CLOEXEC, diff --git a/src/panfrost/lib/pan_bo.h.rej b/src/panfrost/lib/pan_bo.h.rej new file mode 100644 index 00000000000..b7833465c45 --- /dev/null +++ b/src/panfrost/lib/pan_bo.h.rej @@ -0,0 +1,84 @@ +diff a/src/panfrost/lib/pan_bo.h b/src/panfrost/lib/pan_bo.h (rejected hunks) +@@ -27,6 +27,7 @@ + #define __PAN_BO_H__ + + #include "util/list.h" ++#include "util/u_dynarray.h" + #include "panfrost-job.h" + #include + +@@ -50,6 +51,12 @@ + * cached locally */ + #define PAN_BO_SHARED (1 << 4) + ++/* Use event memory, required for CSF events to be signaled to the kernel */ ++#define PAN_BO_EVENT (1 << 5) ++ ++/* Use the caching policy for resource BOs */ ++#define PAN_BO_CACHEABLE (1 << 6) ++ + /* GPU access flags */ + + /* BO is either shared (can be accessed by more than one GPU batch) or private +@@ -80,6 +87,12 @@ struct panfrost_ptr { + mali_ptr gpu; + }; + ++struct panfrost_usage { ++ uint32_t queue; ++ bool write; ++ uint64_t seqnum; ++}; ++ + struct panfrost_bo { + /* Must be first for casting */ + struct list_head bucket_link; +@@ -95,11 +108,16 @@ struct panfrost_bo { + /* Atomic reference count */ + int32_t refcnt; + ++ /* Reference count for GPU jobs */ ++ int32_t gpu_refcnt; ++ + struct panfrost_device *dev; + + /* Mapping for the entire object (all levels) */ + struct panfrost_ptr ptr; + ++ struct util_dynarray usage; ++ + /* Size of all entire trees */ + size_t size; + +@@ -115,11 +133,31 @@ struct panfrost_bo { + + /* Human readable description of the BO for debugging. */ + const char *label; ++ ++ /* Sometimes we don't access the BO through kbase's mapping of the ++ * memory, in that case we need to save the pointer to pass to ++ * munmap to avoid leaking memory. */ ++ void *munmap_ptr; ++ ++ /* For 32-bit applications we may not even be able to that, because ++ * the VA may be too high for kbase to map to an equivalent CPU ++ * address, in which case we must use the memory free icotl. */ ++ bool free_ioctl; ++ ++ /* Is the BO cached CPU-side? 
*/ ++ bool cached; ++ ++ /* File descriptor for the dma-buf */ ++ int dmabuf_fd; + }; + + bool + panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers); + void ++panfrost_bo_mem_invalidate(struct panfrost_bo *bo, size_t offset, size_t length); ++void ++panfrost_bo_mem_clean(struct panfrost_bo *bo, size_t offset, size_t length); ++void + panfrost_bo_reference(struct panfrost_bo *bo); + void + panfrost_bo_unreference(struct panfrost_bo *bo); diff --git a/src/panfrost/lib/pan_device.h.rej b/src/panfrost/lib/pan_device.h.rej new file mode 100644 index 00000000000..5ff078535fe --- /dev/null +++ b/src/panfrost/lib/pan_device.h.rej @@ -0,0 +1,88 @@ +diff a/src/panfrost/lib/pan_device.h b/src/panfrost/lib/pan_device.h (rejected hunks) +@@ -35,11 +35,12 @@ + #include "util/u_dynarray.h" + #include "util/bitset.h" + #include "util/list.h" +-#include "util/sparse_array.h" ++#include "util/stable_array.h" + + #include "panfrost/util/pan_ir.h" + #include "pan_pool.h" + #include "pan_util.h" ++#include "pan_base.h" + + #include + +@@ -182,6 +183,7 @@ struct panfrost_device { + void *memctx; + + int fd; ++ bool kbase; + + /* Properties of the GPU in use */ + unsigned arch; +@@ -204,6 +206,9 @@ struct panfrost_device { + const struct panfrost_model *model; + bool has_afbc; + ++ /* Does the kernel support dma-buf fence import/export? */ ++ bool has_dmabuf_fence; ++ + /* Table of formats, indexed by a PIPE format */ + const struct panfrost_format *formats; + +@@ -217,8 +222,11 @@ struct panfrost_device { + + struct renderonly *ro; + ++ /* Hold this while updating usage field of BOs */ ++ pthread_mutex_t bo_usage_lock; ++ + pthread_mutex_t bo_map_lock; +- struct util_sparse_array bo_map; ++ struct stable_array bo_map; + + struct { + pthread_mutex_t lock; +@@ -263,6 +271,10 @@ struct panfrost_device { + * unconditionally on Bifrost, and useful for sharing with Midgard */ + + struct panfrost_bo *sample_positions; ++ ++ struct kbase_ mali; ++ ++ FILE *bo_log; + }; + + void +@@ -271,6 +283,9 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev); + void + panfrost_close_device(struct panfrost_device *dev); + ++bool ++panfrost_check_dmabuf_fence(struct panfrost_device *dev); ++ + bool + panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt); + +@@ -287,12 +302,18 @@ panfrost_query_sample_position( + float *out); + + unsigned +-panfrost_query_l2_slices(const struct panfrost_device *dev); ++panfrost_query_l2_slices(struct panfrost_device *dev); + + static inline struct panfrost_bo * + pan_lookup_bo(struct panfrost_device *dev, uint32_t gem_handle) + { +- return (struct panfrost_bo *)util_sparse_array_get(&dev->bo_map, gem_handle); ++ return stable_array_get(&dev->bo_map, struct panfrost_bo, gem_handle); ++} ++ ++static inline struct panfrost_bo * ++pan_lookup_bo_existing(struct panfrost_device *dev, uint32_t gem_handle) ++{ ++ return stable_array_get_existing(&dev->bo_map, struct panfrost_bo, gem_handle); + } + + static inline bool diff --git a/src/panfrost/lib/pan_layout.c.rej b/src/panfrost/lib/pan_layout.c.rej new file mode 100644 index 00000000000..d37ee10f41d --- /dev/null +++ b/src/panfrost/lib/pan_layout.c.rej @@ -0,0 +1,66 @@ +diff a/src/panfrost/lib/pan_layout.c b/src/panfrost/lib/pan_layout.c (rejected hunks) +@@ -32,6 +32,14 @@ + * enabling the YUV-like transform is typically a win where possible. 
*/ + + uint64_t pan_best_modifiers[PAN_MODIFIER_COUNT] = { ++ DRM_FORMAT_MOD_ARM_AFBC( ++ AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | ++ AFBC_FORMAT_MOD_TILED | ++ AFBC_FORMAT_MOD_SC | ++ AFBC_FORMAT_MOD_SPARSE | ++ AFBC_FORMAT_MOD_YTR | ++ AFBC_FORMAT_MOD_NATIVE_SWIZZLE), ++ + DRM_FORMAT_MOD_ARM_AFBC( + AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | + AFBC_FORMAT_MOD_TILED | +@@ -201,18 +209,17 @@ pan_afbc_body_align(uint64_t modifier) + #define CHECKSUM_TILE_HEIGHT 16 + #define CHECKSUM_BYTES_PER_TILE 8 + +-unsigned +-panfrost_compute_checksum_size( +- struct pan_image_slice_layout *slice, +- unsigned width, +- unsigned height) ++struct pan_image_slice_crc ++panfrost_compute_checksum_size(unsigned width, unsigned height) + { + unsigned tile_count_x = DIV_ROUND_UP(width, CHECKSUM_TILE_WIDTH); + unsigned tile_count_y = DIV_ROUND_UP(height, CHECKSUM_TILE_HEIGHT); + +- slice->crc.stride = tile_count_x * CHECKSUM_BYTES_PER_TILE; +- +- return slice->crc.stride * tile_count_y; ++ struct pan_image_slice_crc ret = { ++ .stride = tile_count_x * CHECKSUM_BYTES_PER_TILE, ++ .size = ret.stride * tile_count_y, ++ }; ++ return ret; + } + + unsigned +@@ -236,8 +243,11 @@ panfrost_get_legacy_stride(const struct pan_image_layout *layout, + panfrost_block_size(layout->modifier, layout->format); + + if (drm_is_afbc(layout->modifier)) { ++ unsigned align_w = block_size.width * ++ pan_afbc_tile_size(layout->modifier); ++ + unsigned width = u_minify(layout->width, level); +- width = ALIGN_POT(width, block_size.width); ++ width = ALIGN_POT(width, align_w); + + return width * util_format_get_blocksize(layout->format); + } else { +@@ -392,9 +402,7 @@ pan_image_layout_init(struct pan_image_layout *layout, + + /* Add a checksum region if necessary */ + if (layout->crc) { +- slice->crc.size = +- panfrost_compute_checksum_size(slice, width, height); +- ++ slice->crc = panfrost_compute_checksum_size(width, height); + slice->crc.offset = offset; + offset += slice->crc.size; + slice->size += slice->crc.size; diff --git a/src/panfrost/lib/pan_pool.h.rej b/src/panfrost/lib/pan_pool.h.rej new file mode 100644 index 00000000000..c7ee5984d5a --- /dev/null +++ b/src/panfrost/lib/pan_pool.h.rej @@ -0,0 +1,19 @@ +diff a/src/panfrost/lib/pan_pool.h b/src/panfrost/lib/pan_pool.h (rejected hunks) +@@ -130,4 +130,17 @@ pan_pool_alloc_descs(struct pan_pool *pool, + #define pan_pool_alloc_desc_aggregate(pool, ...) 
\ + pan_pool_alloc_descs(pool, PAN_DESC_AGGREGATE(__VA_ARGS__)) + ++#ifdef PAN_ARCH ++#if PAN_ARCH < 10 ++ ++#define pan_pool_alloc_desc_cs_v10(pool, name) \ ++ pan_pool_alloc_desc(pool, name) ++ ++#else /* PAN_ARCH >= 10 */ ++ ++#define pan_pool_alloc_desc_cs_v10(pool, name) ((struct panfrost_ptr) {0}) ++ ++#endif ++#endif /* PAN_ARCH */ ++ + #endif diff --git a/src/panfrost/lib/pan_props.c.rej b/src/panfrost/lib/pan_props.c.rej new file mode 100644 index 00000000000..af28edb15b2 --- /dev/null +++ b/src/panfrost/lib/pan_props.c.rej @@ -0,0 +1,365 @@ +diff a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c (rejected hunks) +@@ -24,6 +24,7 @@ + * Alyssa Rosenzweig + */ + ++#include + #include + + #include "util/u_math.h" +@@ -31,12 +32,14 @@ + #include "util/hash_table.h" + #include "util/u_thread.h" + #include "drm-uapi/panfrost_drm.h" ++#include "dma-uapi/dma-buf.h" + #include "pan_encoder.h" + #include "pan_device.h" + #include "pan_bo.h" + #include "pan_texture.h" + #include "wrap.h" + #include "pan_util.h" ++#include "pan_base.h" + + /* Fixed "minimum revisions" */ + #define NO_ANISO (~0) +@@ -70,6 +73,18 @@ const struct panfrost_model panfrost_model_list[] = { + MODEL(0x7212, "G52", "TGOx", HAS_ANISO, 16384, {}), + MODEL(0x7402, "G52 r1", "TGOx", HAS_ANISO, 16384, {}), + MODEL(0x9093, "G57", "TNAx", HAS_ANISO, 16384, {}), ++ MODEL(0xa867, "G610", "LODx", HAS_ANISO, 65536, {}), ++ /* Matching the kbase dummy model, probably not real GPUs */ ++ MODEL(0xa802, "G710", "TODx", HAS_ANISO, 65536, {}), ++}; ++ ++const struct panfrost_model panfrost_unknown_model = { ++ .gpu_id = 0, ++ .name = "Unknowm Mali device (Panfrost)", ++ .performance_counters = "AAAA", ++ .min_rev_anisotropic = NO_ANISO, ++ .tilebuffer_size = 8192, ++ .quirks = {}, + }; + + #undef NO_ANISO +@@ -83,12 +98,13 @@ const struct panfrost_model panfrost_model_list[] = { + const struct panfrost_model * + panfrost_get_model(uint32_t gpu_id) + { ++ + for (unsigned i = 0; i < ARRAY_SIZE(panfrost_model_list); ++i) { + if (panfrost_model_list[i].gpu_id == gpu_id) + return &panfrost_model_list[i]; + } + +- return NULL; ++ return &panfrost_unknown_model; + } + + /* Abstraction over the raw drm_panfrost_get_param ioctl for fetching +@@ -96,16 +112,27 @@ panfrost_get_model(uint32_t gpu_id) + + static __u64 + panfrost_query_raw( +- int fd, ++ struct panfrost_device *dev, + enum drm_panfrost_param param, + bool required, + unsigned default_value) + { ++ if (dev->kbase) { ++ uint64_t value; ++ bool ret = dev->mali.get_pan_gpuprop(&dev->mali, param, &value); ++ if (ret) { ++ return value; ++ } else { ++ assert(!required); ++ return default_value; ++ } ++ } ++ + struct drm_panfrost_get_param get_param = {0,}; + ASSERTED int ret; + + get_param.param = param; +- ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); ++ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); + + if (ret) { + assert(!required); +@@ -116,23 +143,23 @@ panfrost_query_raw( + } + + static unsigned +-panfrost_query_gpu_version(int fd) ++panfrost_query_gpu_version(struct panfrost_device *dev) + { +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); + } + + static unsigned +-panfrost_query_gpu_revision(int fd) ++panfrost_query_gpu_revision(struct panfrost_device *dev) + { +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_REVISION, true, 0); ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_GPU_REVISION, true, 0); + } + + unsigned 
+-panfrost_query_l2_slices(const struct panfrost_device *dev) ++panfrost_query_l2_slices(struct panfrost_device *dev) + { + /* Query MEM_FEATURES register */ + uint32_t mem_features = +- panfrost_query_raw(dev->fd, DRM_PANFROST_PARAM_MEM_FEATURES, ++ panfrost_query_raw(dev, DRM_PANFROST_PARAM_MEM_FEATURES, + true, 0); + + /* L2_SLICES is MEM_FEATURES[11:8] minus(1) */ +@@ -140,10 +167,10 @@ panfrost_query_l2_slices(const struct panfrost_device *dev) + } + + static struct panfrost_tiler_features +-panfrost_query_tiler_features(int fd) ++panfrost_query_tiler_features(struct panfrost_device *dev) + { + /* Default value (2^9 bytes and 8 levels) to match old behaviour */ +- uint32_t raw = panfrost_query_raw(fd, DRM_PANFROST_PARAM_TILER_FEATURES, ++ uint32_t raw = panfrost_query_raw(dev, DRM_PANFROST_PARAM_TILER_FEATURES, + false, 0x809); + + /* Bin size is log2 in the first byte, max levels in the second byte */ +@@ -154,11 +181,11 @@ panfrost_query_tiler_features(int fd) + } + + static unsigned +-panfrost_query_core_count(int fd, unsigned *core_id_range) ++panfrost_query_core_count(struct panfrost_device *dev, unsigned *core_id_range) + { + /* On older kernels, worst-case to 16 cores */ + +- unsigned mask = panfrost_query_raw(fd, ++ unsigned mask = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff); + + /* Some cores might be absent. In some cases, we care +@@ -199,16 +226,16 @@ panfrost_max_thread_count(unsigned arch) + } + + static unsigned +-panfrost_query_thread_tls_alloc(int fd, unsigned major) ++panfrost_query_thread_tls_alloc(struct panfrost_device *dev, unsigned major) + { +- unsigned tls = panfrost_query_raw(fd, ++ unsigned tls = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 0); + + return (tls > 0) ? tls : panfrost_max_thread_count(major); + } + + static uint32_t +-panfrost_query_compressed_formats(int fd) ++panfrost_query_compressed_formats(struct panfrost_device *dev) + { + /* If unspecified, assume ASTC/ETC only. Factory default for Juno, and + * should exist on any Mali configuration. All hardware should report +@@ -227,7 +254,7 @@ panfrost_query_compressed_formats(int fd) + (1 << MALI_ASTC_2D_LDR) | + (1 << MALI_ASTC_2D_HDR); + +- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, ++ return panfrost_query_raw(dev, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, + false, default_set); + } + +@@ -250,9 +277,9 @@ panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt) + * may omit it, signaled as a nonzero value in the AFBC_FEATURES property. 
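panfrost_query_tiler_features() above falls back to a raw value of 0x809 when the query is unavailable and then unpacks it; per the comment in the hunk, the encoding is log2(bin size) in bits 7:0 and the maximum hierarchy level count in bits 15:8. A small sketch of that decode (struct and function names are illustrative, not the driver's):

#include <stdint.h>

struct tiler_features_example {
   unsigned bin_size;   /* bytes per tiler bin */
   unsigned max_levels; /* hierarchy levels supported by the tiler */
};

static struct tiler_features_example
decode_tiler_features(uint32_t raw)
{
   struct tiler_features_example f = {
      .bin_size   = 1u << (raw & 0xff),   /* stored as log2 */
      .max_levels = (raw >> 8) & 0xff,
   };
   return f;
}

With the 0x809 default this yields a 512-byte bin size and 8 levels, matching the "2^9 bytes and 8 levels" comment.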
*/ + + static bool +-panfrost_query_afbc(int fd, unsigned arch) ++panfrost_query_afbc(struct panfrost_device *dev, unsigned arch) + { +- unsigned reg = panfrost_query_raw(fd, ++ unsigned reg = panfrost_query_raw(dev, + DRM_PANFROST_PARAM_AFBC_FEATURES, + false, 0); + +@@ -281,24 +308,40 @@ panfrost_query_optimal_tib_size(const struct panfrost_device *dev) + void + panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + { ++ if (kbase_open(&dev->mali, fd, 4, (dev->debug & PAN_DBG_LOG))) { ++ dev->kbase = true; ++ fd = -1; ++ } ++ + dev->fd = fd; + dev->memctx = memctx; +- dev->gpu_id = panfrost_query_gpu_version(fd); ++ dev->gpu_id = panfrost_query_gpu_version(dev); + dev->arch = pan_arch(dev->gpu_id); +- dev->kernel_version = drmGetVersion(fd); +- dev->revision = panfrost_query_gpu_revision(fd); ++ if (dev->kbase) { ++ dev->kernel_version = calloc(1, sizeof(drmVersion)); ++ *dev->kernel_version = (drmVersion) { ++ .version_major = 1, ++ .version_minor = 999, ++ }; ++ } else { ++ dev->kernel_version = drmGetVersion(fd); ++ } ++ dev->revision = panfrost_query_gpu_revision(dev); + dev->model = panfrost_get_model(dev->gpu_id); + + /* If we don't recognize the model, bail early */ + if (!dev->model) + return; + +- dev->core_count = panfrost_query_core_count(fd, &dev->core_id_range); +- dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd, dev->arch); ++ if (dev->debug & PAN_DBG_BO_LOG) ++ dev->bo_log = fopen("/tmp/bo_log", "w"); ++ ++ dev->core_count = panfrost_query_core_count(dev, &dev->core_id_range); ++ dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(dev, dev->arch); + dev->optimal_tib_size = panfrost_query_optimal_tib_size(dev); +- dev->compressed_formats = panfrost_query_compressed_formats(fd); +- dev->tiler_features = panfrost_query_tiler_features(fd); +- dev->has_afbc = panfrost_query_afbc(fd, dev->arch); ++ dev->compressed_formats = panfrost_query_compressed_formats(dev); ++ dev->tiler_features = panfrost_query_tiler_features(dev); ++ dev->has_afbc = panfrost_query_afbc(dev, dev->arch); + + if (dev->arch <= 6) + dev->formats = panfrost_pipe_format_v6; +@@ -307,8 +350,10 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + else + dev->formats = panfrost_pipe_format_v9; + +- util_sparse_array_init(&dev->bo_map, sizeof(struct panfrost_bo), 512); ++ stable_array_init(&dev->bo_map, struct panfrost_bo); + ++ pthread_mutex_init(&dev->bo_usage_lock, NULL); ++ pthread_mutex_init(&dev->bo_map_lock, NULL); + pthread_mutex_init(&dev->bo_cache.lock, NULL); + list_inithead(&dev->bo_cache.lru); + +@@ -323,8 +368,9 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) + * active for a single job chain at once, so a single heap can be + * shared across batches/contextes */ + +- dev->tiler_heap = panfrost_bo_create(dev, 128 * 1024 * 1024, +- PAN_BO_INVISIBLE | PAN_BO_GROWABLE, "Tiler heap"); ++ if (dev->arch < 10) ++ dev->tiler_heap = panfrost_bo_create(dev, 128 * 1024 * 1024, ++ PAN_BO_INVISIBLE | PAN_BO_GROWABLE, "Tiler heap"); + + pthread_mutex_init(&dev->submit_lock, NULL); + +@@ -341,11 +387,102 @@ panfrost_close_device(struct panfrost_device *dev) + if (dev->model) { + pthread_mutex_destroy(&dev->submit_lock); + panfrost_bo_unreference(dev->tiler_heap); ++ panfrost_bo_unreference(dev->sample_positions); + panfrost_bo_cache_evict_all(dev); + pthread_mutex_destroy(&dev->bo_cache.lock); +- util_sparse_array_finish(&dev->bo_map); ++ pthread_mutex_destroy(&dev->bo_map_lock); ++ pthread_mutex_destroy(&dev->bo_usage_lock); ++ 
stable_array_fini(&dev->bo_map); ++ } ++ ++ if (dev->kbase) ++ free(dev->kernel_version); ++ else ++ drmFreeVersion(dev->kernel_version); ++ if (dev->kbase) ++ dev->mali.close(&dev->mali); ++ else ++ close(dev->fd); ++} ++ ++bool ++panfrost_check_dmabuf_fence(struct panfrost_device *dev) ++{ ++ bool ret = false; ++ int err; ++ ++ /* This function is only useful for kbase, where we can't create ++ * dma-bufs from the kbase FD. */ ++ if (!dev->ro) ++ goto out; ++ ++ struct drm_mode_create_dumb create_dumb = { ++ .width = 16, ++ .height = 16, ++ .bpp = 32, ++ }; ++ ++ err = drmIoctl(dev->ro->kms_fd, DRM_IOCTL_MODE_CREATE_DUMB, &create_dumb); ++ if (err < 0) { ++ fprintf(stderr, "DRM_IOCTL_MODE_CREATE_DUMB failed " ++ "for fence check: %s\n", ++ strerror(errno)); ++ goto out; ++ } ++ ++ int fd; ++ err = drmPrimeHandleToFD(dev->ro->kms_fd, create_dumb.handle, O_CLOEXEC, ++ &fd); ++ if (err < 0) { ++ fprintf(stderr, "failed to export buffer for fence check: %s\n", ++ strerror(errno)); ++ goto free_dumb; + } + +- drmFreeVersion(dev->kernel_version); +- close(dev->fd); ++ struct dma_buf_export_sync_file export = { ++ .flags = DMA_BUF_SYNC_RW, ++ }; ++ ++ /* ENOTTY is returned if the ioctl is unsupported */ ++ ++ err = drmIoctl(fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &export); ++ if (err < 0) { ++ if (errno != ENOTTY) ++ fprintf(stderr, "failed to export fence: %s\n", ++ strerror(errno)); ++ goto free_fd; ++ } ++ ++ struct dma_buf_import_sync_file import = { ++ .flags = DMA_BUF_SYNC_RW, ++ .fd = export.fd, ++ }; ++ ++ err = drmIoctl(fd, DMA_BUF_IOCTL_IMPORT_SYNC_FILE, &import); ++ if (err < 0) { ++ if (errno != ENOTTY) ++ fprintf(stderr, "failed to import fence: %s\n", ++ strerror(errno)); ++ goto free_sync; ++ } ++ ++ /* We made it this far, the kernel must support the ioctls */ ++ ret = true; ++ ++free_sync: ++ close(export.fd); ++ ++free_fd: ++ close(fd); ++ ++ /* Some compilers don't like goto to a declaration */ ++ struct drm_mode_destroy_dumb destroy_dumb; ++free_dumb: ++ destroy_dumb = (struct drm_mode_destroy_dumb) { ++ .handle = create_dumb.handle, ++ }; ++ drmIoctl(dev->ro->kms_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy_dumb); ++ ++out: ++ return ret; + } diff --git a/src/panfrost/lib/pan_texture.h.rej b/src/panfrost/lib/pan_texture.h.rej new file mode 100644 index 00000000000..7a7f33572de --- /dev/null +++ b/src/panfrost/lib/pan_texture.h.rej @@ -0,0 +1,55 @@ +diff a/src/panfrost/lib/pan_texture.h b/src/panfrost/lib/pan_texture.h (rejected hunks) +@@ -44,9 +44,15 @@ + extern "C" { + #endif + +-#define PAN_MODIFIER_COUNT 6 ++#define PAN_MODIFIER_COUNT 7 + extern uint64_t pan_best_modifiers[PAN_MODIFIER_COUNT]; + ++struct pan_image_slice_crc { ++ unsigned offset; ++ unsigned stride; ++ unsigned size; ++}; ++ + struct pan_image_slice_layout { + unsigned offset; + +@@ -80,11 +86,7 @@ struct pan_image_slice_layout { + + /* If checksumming is enabled following the slice, what + * is its offset/stride? 
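panfrost_check_dmabuf_fence() above probes for kernel support by creating a dumb KMS buffer, exporting it as a dma-buf and then attempting the sync-file export/import ioctls, treating ENOTTY as "not implemented". A trimmed-down sketch of that probe for an already-open dma-buf fd, assuming kernel headers that provide DMA_BUF_IOCTL_EXPORT_SYNC_FILE (the patch ships its own copy of the UAPI under include/dma-uapi/dma-buf.h):

#include <errno.h>
#include <stdbool.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dma-buf.h>

/* Returns true if the kernel implements the dma-buf sync-file export ioctl.
 * ENOTTY is the only errno that means "ioctl unknown"; any other outcome
 * still proves the ioctl exists. */
static bool
dmabuf_has_sync_file_ioctls(int dmabuf_fd)
{
   struct dma_buf_export_sync_file args = { .flags = DMA_BUF_SYNC_RW };

   if (ioctl(dmabuf_fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &args) == 0) {
      close(args.fd);   /* only needed to confirm the ioctl works */
      return true;
   }
   return errno != ENOTTY;
}

The real check goes one step further and imports the exported sync file back with DMA_BUF_IOCTL_IMPORT_SYNC_FILE before declaring support.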
*/ +- struct { +- unsigned offset; +- unsigned stride; +- unsigned size; +- } crc; ++ struct pan_image_slice_crc crc; + + unsigned size; + }; +@@ -141,11 +143,8 @@ struct pan_image_view { + } buf; + }; + +-unsigned +-panfrost_compute_checksum_size( +- struct pan_image_slice_layout *slice, +- unsigned width, +- unsigned height); ++struct pan_image_slice_crc ++panfrost_compute_checksum_size(unsigned width, unsigned height); + + /* AFBC */ + +@@ -164,6 +163,9 @@ panfrost_afbc_can_ytr(enum pipe_format format); + bool + panfrost_afbc_can_tile(const struct panfrost_device *dev); + ++bool ++panfrost_afbc_only_native(unsigned arch, enum pipe_format format); ++ + /* + * Represents the block size of a single plane. For AFBC, this represents the + * superblock size. For u-interleaving, this represents the tile size. diff --git a/src/panfrost/lib/pan_util.h.rej b/src/panfrost/lib/pan_util.h.rej new file mode 100644 index 00000000000..eb65d19d46e --- /dev/null +++ b/src/panfrost/lib/pan_util.h.rej @@ -0,0 +1,19 @@ +diff a/src/panfrost/lib/pan_util.h b/src/panfrost/lib/pan_util.h (rejected hunks) +@@ -47,10 +47,16 @@ + #define PAN_DBG_LINEAR 0x1000 + #define PAN_DBG_NO_CACHE 0x2000 + #define PAN_DBG_DUMP 0x4000 +- + #ifndef NDEBUG + #define PAN_DBG_OVERFLOW 0x8000 + #endif ++#define PAN_DBG_TILER 0x010000 ++#define PAN_DBG_BO_LOG 0x020000 ++#define PAN_DBG_BO_CLEAR 0x040000 ++#define PAN_DBG_UNCACHED_GPU 0x100000 ++#define PAN_DBG_UNCACHED_CPU 0x200000 ++#define PAN_DBG_LOG 0x400000 ++#define PAN_DBG_GOFASTER 0x800000 + + struct panfrost_device; + diff --git a/src/panfrost/lib/wrap.h.rej b/src/panfrost/lib/wrap.h.rej new file mode 100644 index 00000000000..f645b59013b --- /dev/null +++ b/src/panfrost/lib/wrap.h.rej @@ -0,0 +1,21 @@ +diff a/src/panfrost/lib/wrap.h b/src/panfrost/lib/wrap.h (rejected hunks) +@@ -46,6 +46,8 @@ void pandecode_initialize(bool to_stderr); + + void pandecode_next_frame(void); + ++void pandecode_dump_file_close(void); ++ + void pandecode_close(void); + + void +@@ -55,6 +57,10 @@ void pandecode_inject_free(uint64_t gpu_va, unsigned sz); + + void pandecode_jc(uint64_t jc_gpu_va, unsigned gpu_id); + ++void pandecode_cs(uint64_t cs_gpu_va, unsigned cs_size, unsigned gpu_id); ++ ++void pandecode_dump_mappings(void); ++ + void + pandecode_abort_on_fault(uint64_t jc_gpu_va, unsigned gpu_id); + diff --git a/src/panfrost/meson.build b/src/panfrost/meson.build index aa393d44fe5..97773fe970f 100644 --- a/src/panfrost/meson.build +++ b/src/panfrost/meson.build @@ -20,7 +20,7 @@ # SOFTWARE. 
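The pan_layout.c and pan_texture.h hunks above turn the CRC sidecar into a small value struct computed purely from the slice dimensions: one 8-byte checksum per 16x16 pixel tile, with the stride covering a full row of tiles. A self-contained sketch of that arithmetic (names are illustrative):

#define CHECKSUM_TILE_WIDTH     16
#define CHECKSUM_TILE_HEIGHT    16
#define CHECKSUM_BYTES_PER_TILE 8

struct slice_crc_example {
   unsigned stride; /* bytes of CRC data per row of tiles */
   unsigned size;   /* total bytes of CRC data for the slice */
};

static unsigned
div_round_up(unsigned x, unsigned y)
{
   return (x + y - 1) / y;
}

/* Partial tiles at the right and bottom edges still get a full checksum. */
static struct slice_crc_example
slice_crc_layout(unsigned width, unsigned height)
{
   unsigned tiles_x = div_round_up(width, CHECKSUM_TILE_WIDTH);
   unsigned tiles_y = div_round_up(height, CHECKSUM_TILE_HEIGHT);
   struct slice_crc_example crc;

   crc.stride = tiles_x * CHECKSUM_BYTES_PER_TILE;
   crc.size   = crc.stride * tiles_y;
   return crc;
}

For a 160x160 slice that is 10 tiles per row, an 80-byte stride and 800 bytes in total. Computing the fields in separate statements also sidesteps the self-referencing initializer used in the rejected hunk (.size = ret.stride * tile_count_y), whose evaluation order the C standard leaves indeterminately sequenced.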
inc_panfrost_hw = include_directories([ - 'include' + 'include', 'base' ]) inc_panfrost = include_directories([ @@ -70,6 +70,46 @@ bifrost_compiler = executable( build_by_default : with_tools.contains('panfrost') ) +csf_test = executable( + 'csf_test', + ['csf_test/test.c'], + include_directories : [ + inc_mapi, + inc_mesa, + inc_gallium, + inc_gallium_aux, + inc_include, + inc_src, + inc_panfrost, + inc_panfrost_hw, + ], + dependencies : [ + idep_nir, + idep_mesautil, + idep_bi_opcodes_h, + dep_libdrm, + libpanfrost_dep, + ], + build_by_default : true +) + +custom_target( + 'panfrost_panloader', + output: ['panfrost_panloader.txt'], + depends : [ + libpanfrost_lib, + libpanfrost_util, + _libmesa_util, + libpanfrost_decode, + libpanfrost_decode_per_arch, + libpanfrost_midgard_disasm, + libpanfrost_bifrost_disasm, + libpanfrost_valhall_disasm, + ], + command: ['touch', '@OUTPUT@'], + build_by_default : false, +) + if with_panfrost_vk subdir('vulkan') endif diff --git a/src/panfrost/meson.build.rej b/src/panfrost/meson.build.rej new file mode 100644 index 00000000000..7e2b30f869f --- /dev/null +++ b/src/panfrost/meson.build.rej @@ -0,0 +1,10 @@ +diff a/src/panfrost/meson.build b/src/panfrost/meson.build (rejected hunks) +@@ -36,6 +36,8 @@ subdir('util') + subdir('midgard') + subdir('bifrost') + ++subdir('base') ++ + if with_gallium_panfrost or with_panfrost_vk + subdir('lib') + subdir('perf') diff --git a/src/panfrost/midgard/disassemble.c.rej b/src/panfrost/midgard/disassemble.c.rej new file mode 100644 index 00000000000..84b6a93ef56 --- /dev/null +++ b/src/panfrost/midgard/disassemble.c.rej @@ -0,0 +1,12 @@ +diff a/src/panfrost/midgard/disassemble.c b/src/panfrost/midgard/disassemble.c (rejected hunks) +@@ -1242,7 +1242,9 @@ print_alu_word(disassemble_context *ctx, FILE *fp, uint32_t *words, + UNUSED static void + print_varying_parameters(FILE *fp, midgard_load_store_word *word) + { +- midgard_varying_params p = midgard_unpack_varying_params(*word); ++ unsigned params = word->signed_offset & 0x1FF; ++ midgard_varying_params p; ++ memcpy(&p, ¶ms, sizeof(p)); + + /* If a varying, there are qualifiers */ + if (p.flat_shading) diff --git a/src/panfrost/tiler/tiler-hex-read b/src/panfrost/tiler/tiler-hex-read new file mode 100755 index 00000000000..1c188e38ec1 --- /dev/null +++ b/src/panfrost/tiler/tiler-hex-read @@ -0,0 +1,400 @@ +#!/usr/bin/env python3 + +import sys +import struct + +FLIP_Y = False + +data = b'' + +fb_width = 160 +fb_height = 160 +hierarchy_mask = 0xffff + +HEAP_OFS = 0x8000 + +base_ptr = 0 +heap_ptr = 0 +midgard = False +bifrost = True +valhall = False +size = None + +bak_data = b'' + +cur_data = b'' + +# TODO: More robust looping.. 
+for line in sys.stdin.read().split("\n"): + print(line) + split = line.split(" ") + if not len(split) or split[0] == "": + continue + if split[0] == "width": + fb_width = int(split[1]) + continue + if split[0] == "height": + fb_height = int(split[1]) + continue + if split[0] == "mask": + hierarchy_mask = int(split[1], 0) + continue + if split[0] == "vaheap": + base_ptr = int(split[1], 16) + bifrost = False + valhall = True + continue + if split[0] == "addr": + base_ptr = int(split[1], 16) + bifrost = False + midgard = True + HEAP_OFS = 0x40 + continue + if split[0] == "heap": + heap_ptr = int(split[1], 16) + data += cur_data + cur_data = b'' + bak_data = data + data = b'' + continue + if split[0] == "size": + size = int(split[1], 0) + continue + offset = int(split[0], 16) + if offset > len(data): + data += cur_data + cur_data = b'' + data += b'\0' * (offset - len(data)) + for d in split[1:]: + if d == "" or d == "*": + continue + cur_data += bytes([int(d, 16)]) + +data += cur_data + +if heap_ptr: + data, heap_data = bak_data, data + +if size == None: + size = len(data) + +def int7(val, signed=True): + val = val & 0x7f + if signed and val >= 0x40: + return val - 0x80 + else: + return val + +def int8(val, signed=True): + val = val & 0xff + if signed and val >= 0x80: + return val - 0x100 + else: + return val + +def fetch(ptr, size): + if midgard: + if ptr >= base_ptr and ptr < base_ptr + len(data): + base = ptr - base_ptr + return data[base:base+size] + elif ptr >= heap_ptr and ptr < heap_ptr + len(heap_data): + base = ptr - heap_ptr + return heap_data[base:base+size] + else: + if valhall: + ptr -= base_ptr + if ptr < 0: + return b"" + return data[ptr:ptr+size] + +def print_draw(ptr): + draw = fetch(ptr, 128) + if len(draw) < 128: + print(" couldn't fetch draw struct") + return + decoded = struct.unpack("=16Q", draw) + coverage = [0 for x in decoded] + + fields = ( + ("Allow forward pixel to kill", 1, "0:0", "bool"), + ("Allow forward pixel to be killed", 1, "0:1", "bool"), + ("Pixel kill operation", 2, "0:2", "Pixel Kill"), + ("ZS update operation", 2, "0:4", "Pixel Kill"), + ("Allow primitive reorder", 1, "0:6", "bool"), + ("Overdraw alpha0", 1, "0:7", "bool"), + ("Overdraw alpha1", 1, "0:8", "bool"), + ("Clean Fragment Write", 1, "0:9", "bool"), + ("Primitive Barrier", 1, "0:10", "bool"), + ("Evaluate per-sample", 1, "0:11", "bool"), + ("Single-sampled lines", 1, "0:13", "bool"), + ("Occlusion query", 2, "0:14", "Occlusion Mode"), + ("Front face CCW", 1, "0:16", "bool"), + ("Cull front face", 1, "0:17", "bool"), + ("Cull back face", 1, "0:18", "bool"), + ("Multisample enable", 1, "0:19", "bool"), + ("Shader modifies coverage", 1, "0:20", "bool"), + ("Alpha-to-coverage Invert", 1, "0:21", "bool"), + ("Alpha-to-coverage", 1, "0:22", "bool"), + ("Scissor to bounding box", 1, "0:23", "bool"), + ("Sample mask", 16, "1:0", "uint"), + ("Render target mask", 8, "1:16", "hex"), + + ("Packet", 1, "2:0", "bool"), + # TODO: shr modifier + ("Vertex array", 64, "2:0", "address"), + ("Vertex packet stride", 16, "4:0", "uint"), + ("Vertex attribute stride", 16, "4:16", "uint"), + ("Unk", 16, "5:0", "uint"), + + ("Minimum Z", 32, "6:0", "float"), + ("Maximum Z", 32, "7:0", "float"), + ("Depth/stencil", 64, "10:0", "address"), + ("Blend count", 4, "12:0", "uint"), + ("Blend", 60, "12:4", "address"), + ("Occlusion", 64, "14:0", "address"), + + ("Attribute offset", 32, "16:0", "uint"), + ("FAU count", 8, "17:0", "uint"), + ("Resources", 48, "24:0", "address"), + ("Shader", 48, "26:0", "address"), + ("Thread 
storage", 48, "28:0", "address"), + ("FAU", 64, "30:0", "address"), + ) + + for f in fields: + name, size, start, type = f + word, bit = [int(x) for x in start.split(":")] + if word & 1: + bit += 32 + word >>= 1 + + mask = (1 << size) - 1 + data = (decoded[word] >> bit) & mask + coverage[word] |= mask << bit + if type == "float": + data = struct.unpack("=f", struct.pack("=I", data))[0] + else: + data = hex(data) + print(f" {name}: {data}") + + for i, (d, c) in enumerate(zip(decoded, coverage)): + ci = c ^ ((1 << 64) - 1) + if d & ci: + print(f" unk at 64-bit word {i}: {hex(d)} (known mask {hex(c)})") + +def print_vertex(ptr, positions): + for p in positions: + addr = ptr + p * 16 + data = fetch(addr, 16) + if len(data) < 16: + print(f" ") + continue + x, y, z, w = struct.unpack("=4f", data) + print(f" <{x} {y} {z} {w}>") + +DRAW_TYPES = [ + "unk", + "points", + "lines", + "tris", +] + +def heap_interpret(start, end): + print(f"interpreting from {hex(start)} to {hex(end)}") + + struct_count = 0 + + signed = True + + base = 0 + a = 0 + b = 0 + c = 0 + + num_vert = 3 + + draw_ptr = 0 + pos_ptr = 0 + + while start != end: + if midgard and start & 0x1ff == 0x1f8: + jump = struct.unpack("=Q", fetch(start, 8))[0] + print(f"jump mdg: {hex(jump)}") + start = jump + continue + + dat = fetch(start, 4) + if dat[3] & 0xe0 == 0x80: + struct_count += 1 + + print(f"{struct_count}:", " ".join([f"{hex(x)[2:].upper():>02}" for x in dat]), end=" ") + + masked_op = dat[3] & ~3 + + up = struct.unpack("=I", dat)[0] + + if valhall: + tri0 = tri0_7 = int7(up >> 15, signed) + tri1 = int7(up >> 8, signed) + tri2 = int7(up >> 1, signed) + else: + tri0 = int8(up >> 14, signed) + tri0_7 = int7(up >> 14, signed) + tri1 = int7(up >> 7, signed) + tri2 = int7(up, signed) + + signed = True + + if dat[3] & 0xe0 == 0x80: + res = "" + if valhall: + address = (up & 0x7ffffff) * 32 + num_vert = (dat[3] >> 3) & 0x3 + else: + address = (up & 0xffffff) * 64 + num_vert = (dat[3] >> 2) & 0x3 + if dat[3] & 0x10: + a = 0 + res = " reset" + draw_ptr = address + if valhall: + pos_ptr = address + 128 + print(f"draw {DRAW_TYPES[num_vert]}{res}: {hex(address)}") + elif valhall and dat[3] >> 4 == 12: + unk1 = up & 0x3f + address = (up >> 6) & 0xffff + unk2 = up >> 22 + draw_ptr += address << 32 + pos_ptr += address << 32 + print(f"draw offset: {hex(address)}, unk {hex(unk1)}, {hex(unk2)}") + + print_draw(draw_ptr) + elif dat[3] >> 6 == 1: + # TODO: handle two of these in a row + res = "" + if valhall: + # TOOD: Is the mask correct? 
+ pf = (up >> 22) & 0x7f + shift = 7 + if dat[3] & 0x20: + a = 0 + res = " reset" + else: + pf = (up >> 21) & 0x7f + shift = 8 + + a += tri0_7 << shift + b += tri1 << 7 + c += tri2 << 7 + print(f"primitive offset{res}: {hex(pf << 4)} | +{tri0_7 << shift} {tri1 << 7} {tri2 << 7}") + signed = False + # TODO: Jumps are located based on position, not opcode + elif dat[3] == 0xff: + up64 = struct.unpack("=Q", fetch(start, 8))[0] + assert((up64 & 3) == 3) + print(f"jump (from {hex(start+8)}-8): {hex(up64 - 3)}") + start = up64 - 7 + elif dat[3] == 0x00: + assert((up & 3) == 3) + print(f"jump (from {hex(start+4)}-4): {hex(up - 3)}, {hex(HEAP_OFS + up - 3)}") + start = HEAP_OFS + up - 7 + elif (masked_op & 0xc0) == 0: + mode = hex(dat[3] >> 2) + + pre_offset = (up >> 22) & 0xf + + unk = "" + if valhall and up & 1: + unk = ", unk 1" + + a += base + tri0 + b += a + tri1 + c += a + tri2 + base = a + + print(f"{mode} draw: {hex(pre_offset)} | +{tri0} {tri1} {tri2}{unk}") + + print_vertex(pos_ptr, [a, b, c][:num_vert]) + + a = b = c = 0 + + else: + print(f"Unknown opcode {hex(dat[3])}") + + start += 4 + +def level_list(): + levels = [] + size = 16 + anylevel = False + + # TODO: Does this miss the largest level? + while anylevel == False or size // 2 < min(fb_width, fb_height): + if (hierarchy_mask << 4) & size != 0: + anylevel = True + levels.append(size) + + size *= 2 + + return levels + +def div_round_up(x, y): + return (x + y - 1) // y + +def align(x, y): + return div_round_up(x, y) * y + +def tile_count(alignment=4): + return sum(align(div_round_up(fb_width, size) * div_round_up(fb_height, size), 4) + for size in level_list()) + +if midgard: + unpacked_header = list(struct.unpack("=16i", data[0:64])) + # Is this really big endian? + unpacked_header[5:7] = struct.unpack(">2i", data[20:28]) + print(f"header: {' '.join([str(x) for x in unpacked_header])}") + + # Extra is because of HEAP_OFS + header_size = align(tile_count() + 8, 64) +elif valhall: + # TODO: Does this figure need alignment? 
+ HEAP_STRIDE = tile_count() * 8 + HEAP_OFS = size - HEAP_STRIDE * 2 + +pos = base_ptr + HEAP_OFS + +for size in level_list(): + for y in range((fb_height + size - 1) // size): + for x in range((fb_width + size - 1) // size): + header = fetch(pos, 8) + if len(header) == 0: + break + + if midgard: + end = struct.unpack("=Q", header)[0] + use = bool(end) + end += 4 + start = base_ptr + header_size * 8 + (pos - base_ptr - HEAP_OFS) * 64 + elif bifrost: + end, start = struct.unpack("=II", header) + use = bool(end) + start += HEAP_OFS + end += HEAP_OFS + 4 + end &= ~3 + else: + footer = fetch(pos + HEAP_STRIDE, 8) + if len(footer) == 0: + break + start, end = struct.unpack("=QQ", header + footer) + use = bool(end) + # The upper bits are used for jump metadata + end &= (1 << 48) - 1 + end += 4 + if use: + if FLIP_Y: + print([x * size, fb_height - (y + 1) * size], ((x + 1) * size, fb_height - y * size)) + else: + print([x * size, y * size], ((x + 1) * size, (y + 1) * size)) + heap_interpret(start, end) + + pos += 8 diff --git a/src/util/os_misc.c.rej b/src/util/os_misc.c.rej new file mode 100644 index 00000000000..261ce7607cd --- /dev/null +++ b/src/util/os_misc.c.rej @@ -0,0 +1,103 @@ +diff a/src/util/os_misc.c b/src/util/os_misc.c (rejected hunks) +@@ -53,7 +53,6 @@ + # define LOG_TAG "MESA" + # include + # include +-# include + #elif DETECT_OS_LINUX || DETECT_OS_CYGWIN || DETECT_OS_SOLARIS || DETECT_OS_HURD + # include + #elif DETECT_OS_OPENBSD || DETECT_OS_FREEBSD +@@ -123,93 +122,10 @@ os_log_message(const char *message) + #endif + } + +-#if DETECT_OS_ANDROID +-# include +-# include "hash_table.h" +-# include "ralloc.h" +-# include "simple_mtx.h" +- +-static struct hash_table *options_tbl; +- +-static void +-options_tbl_fini(void) +-{ +- _mesa_hash_table_destroy(options_tbl, NULL); +-} +- +-/** +- * Get an option value from android's property system, as a fallback to +- * getenv() (which is generally less useful on android due to processes +- * typically being forked from the zygote. +- * +- * The option name used for getenv is translated into a property name +- * by: +- * +- * 1) convert to lowercase +- * 2) replace '_' with '.' +- * 3) if necessary, prepend "mesa." +- * +- * For example: +- * - MESA_EXTENSION_OVERRIDE -> mesa.extension.override +- * - GALLIUM_HUD -> mesa.gallium.hud +- * +- * Note that we use a hashtable for two purposes: +- * 1) Avoid re-translating the option name on subsequent lookups +- * 2) Avoid leaking memory. Because property_get() returns the +- * property value into a user allocated buffer, we cannot return +- * that directly to the caller, so we need to strdup(). With the +- * hashtable, subsquent lookups can return the existing string. +- */ +-static const char * +-os_get_android_option(const char *name) +-{ +- if (!options_tbl) { +- options_tbl = _mesa_hash_table_create(NULL, _mesa_hash_string, +- _mesa_key_string_equal); +- atexit(options_tbl_fini); +- } +- +- struct hash_entry *entry = _mesa_hash_table_search(options_tbl, name); +- if (entry) { +- return entry->data; +- } +- +- char value[PROPERTY_VALUE_MAX]; +- char key[PROPERTY_KEY_MAX]; +- char *p = key, *end = key + PROPERTY_KEY_MAX; +- /* add "mesa." 
prefix if necessary: */ +- if (strstr(name, "MESA_") != name) +- p += strlcpy(p, "mesa.", end - p); +- p += strlcpy(p, name, end - p); +- for (int i = 0; key[i]; i++) { +- if (key[i] == '_') { +- key[i] = '.'; +- } else { +- key[i] = tolower(key[i]); +- } +- } +- +- const char *opt = NULL; +- int len = property_get(key, value, NULL); +- if (len > 1) { +- opt = ralloc_strdup(options_tbl, value); +- } +- +- _mesa_hash_table_insert(options_tbl, name, (void *)opt); +- +- return opt; +-} +-#endif +- + const char * + os_get_option(const char *name) + { + const char *opt = getenv(name); +-#if DETECT_OS_ANDROID +- if (!opt) { +- opt = os_get_android_option(name); +- } +-#endif + return opt; + } + diff --git a/src/util/perf/cpu_trace.h.rej b/src/util/perf/cpu_trace.h.rej new file mode 100644 index 00000000000..f1e688f3f4d --- /dev/null +++ b/src/util/perf/cpu_trace.h.rej @@ -0,0 +1,21 @@ +diff a/src/util/perf/cpu_trace.h b/src/util/perf/cpu_trace.h (rejected hunks) +@@ -27,19 +27,6 @@ + util_perfetto_trace_end(category); \ + } while (0) + +-/* NOTE: for now disable atrace for C++ to workaround a ndk bug with ordering +- * between stdatomic.h and atomic.h. See: +- * +- * https://github.com/android/ndk/issues/1178 +- */ +-#elif defined(ANDROID) && !defined(__cplusplus) +- +-#include +- +-#define _MESA_TRACE_BEGIN(category, name) \ +- atrace_begin(ATRACE_TAG_GRAPHICS, name) +-#define _MESA_TRACE_END(category) atrace_end(ATRACE_TAG_GRAPHICS) +- + #else + + #define _MESA_TRACE_BEGIN(category, name) diff --git a/src/util/stable_array.h b/src/util/stable_array.h new file mode 100644 index 00000000000..a590aa48a50 --- /dev/null +++ b/src/util/stable_array.h @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2022 Icecream95 + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef STABLE_ARRAY_H +#define STABLE_ARRAY_H + +#include "util/simple_mtx.h" +#include "util/u_math.h" + +/* A thread-safe automatically growing array where elements have stable locations + * + * This data structure has these properties: + * + * 1. Accessing an element is constant time (if allocation is not required). + * + * 2. Elements are not moved in memory, so it is safe to store a pointer to + * something in a stable_array. + * + * 3. The data structure is thread-safe. To improve performance, there is + * also a fast path that does not require atomics. + * + * 4. 
Although the data structure is not lock-free, there is a limit on the + * number of times that a lock is ever acquired--a maximum of 32 times the + * number of accessing threads. In practice, contention will never be an + * issue for long-lived stable_arrays. + * + * 5. Memory usage is similar to util_dynarray, with each allocation being + * twice as large as the last. Freeing buckets is currently never done. + * + * The data structure is faster than util_sparse_array, but is not sparse. + */ + +struct stable_array +{ + uint8_t *buckets[32]; + simple_mtx_t lock; + size_t eltsize; +}; + +static inline void +stable_array_init_bytes(struct stable_array *buf, size_t eltsize) +{ + memset(buf, 0, sizeof(*buf)); + buf->eltsize = eltsize; + simple_mtx_init(&buf->lock, mtx_plain); +} + +static inline void +stable_array_fini(struct stable_array *buf) +{ + simple_mtx_destroy(&buf->lock); + for (unsigned i = 0; i < ARRAY_SIZE(buf->buckets); ++i) { + if (buf->buckets[i]) + free(buf->buckets[i]); + } +} + +struct stable_array_index +{ + unsigned bucket; + unsigned idx; +}; + +static inline struct stable_array_index +stable_array_get_index(unsigned idx) +{ + struct stable_array_index i = {0}; + i.bucket = util_logbase2(idx); + i.idx = i.bucket ? (idx -= (1 << i.bucket)) : idx; + return i; +} + +static inline void * +stable_array_get_bytes(struct stable_array *buf, unsigned idx, size_t eltsize) +{ + assert(eltsize == buf->eltsize); + + struct stable_array_index i = stable_array_get_index(idx); + + uint8_t *bucket = p_atomic_read(&buf->buckets[i.bucket]); + + if (!bucket) { + simple_mtx_lock(&buf->lock); + bucket = buf->buckets[i.bucket]; + + if (!bucket) { + /* The first two buckets both have two elements */ + bucket = (uint8_t *)calloc(1U << MAX2(i.bucket, 1), eltsize); + + p_atomic_set(&buf->buckets[i.bucket], bucket); + } + simple_mtx_unlock(&buf->lock); + } + + return bucket + eltsize * i.idx; +} + +static inline void * +stable_array_get_existing_bytes(struct stable_array *buf, unsigned idx, size_t eltsize) +{ + assert(eltsize == buf->eltsize); + + struct stable_array_index i = stable_array_get_index(idx); + + return buf->buckets[i.bucket] + eltsize * i.idx; +} + +#define stable_array_init(buf, type) stable_array_init_bytes((buf), sizeof(type)) +#define stable_array_get(buf, type, idx) ((type*)stable_array_get_bytes((buf), (idx), sizeof(type))) +#define stable_array_get_existing(buf, type, idx) ((type*)stable_array_get_existing_bytes((buf), (idx), sizeof(type))) + +#endif diff --git a/src/util/u_debug_stack_android.cpp.rej b/src/util/u_debug_stack_android.cpp.rej new file mode 100644 index 00000000000..ce8ce1ef853 --- /dev/null +++ b/src/util/u_debug_stack_android.cpp.rej @@ -0,0 +1,83 @@ +diff a/src/util/u_debug_stack_android.cpp b/src/util/u_debug_stack_android.cpp (rejected hunks) +@@ -21,7 +21,6 @@ + * IN THE SOFTWARE. + */ + +-#include + + #include "util/simple_mtx.h" + #include "util/u_debug.h" +@@ -52,56 +51,14 @@ debug_backtrace_capture(debug_stack_frame *backtrace, + unsigned start_frame, + unsigned nr_frames) + { +- Backtrace *bt; + +- if (!nr_frames) +- return; +- +- bt = Backtrace::Create(BACKTRACE_CURRENT_PROCESS, +- BACKTRACE_CURRENT_THREAD); +- if (bt == NULL) { +- for (unsigned i = 0; i < nr_frames; i++) +- backtrace[i].procname = NULL; +- return; +- } +- +- /* Add one to exclude this call. Unwind already ignores itself. 
*/ +- bt->Unwind(start_frame + 1); +- +- simple_mtx_lock(&table_mutex); +- +- for (unsigned i = 0; i < nr_frames; i++) { +- const backtrace_frame_data_t* frame = bt->GetFrame(i); +- if (frame) { +- backtrace[i].procname = intern_symbol(frame->func_name.c_str()); +- backtrace[i].start_ip = frame->pc; +- backtrace[i].off = frame->func_offset; +- backtrace[i].map = intern_symbol(frame->map.Name().c_str()); +- backtrace[i].map_off = frame->rel_pc; +- } else { +- backtrace[i].procname = NULL; +- } +- } +- +- simple_mtx_unlock(&table_mutex); +- +- delete bt; + } + + void + debug_backtrace_dump(const debug_stack_frame *backtrace, + unsigned nr_frames) + { +- for (unsigned i = 0; i < nr_frames; i++) { +- if (backtrace[i].procname) +- debug_printf( +- "%s(+0x%x)\t%012" PRIx64 ": %s+0x%x\n", +- backtrace[i].map, +- backtrace[i].map_off, +- backtrace[i].start_ip, +- backtrace[i].procname, +- backtrace[i].off); +- } ++ + } + + void +@@ -109,14 +66,5 @@ debug_backtrace_print(FILE *f, + const debug_stack_frame *backtrace, + unsigned nr_frames) + { +- for (unsigned i = 0; i < nr_frames; i++) { +- if (backtrace[i].procname) +- fprintf(f, +- "%s(+0x%x)\t%012" PRIx64 ": %s+0x%x\n", +- backtrace[i].map, +- backtrace[i].map_off, +- backtrace[i].start_ip, +- backtrace[i].procname, +- backtrace[i].off); +- } ++ + }
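Since several hunks above (pan_device.h, pan_props.c, pan_bo.c) switch the BO table from util_sparse_array to the new stable_array, here is a minimal usage sketch of the util/stable_array.h API added by this patch; it assumes the Mesa build environment that provides the simple_mtx and p_atomic helpers the header depends on:

#include "util/stable_array.h"

struct bo_record {
   int handle;
   const char *label;
};

static void
stable_array_example(void)
{
   struct stable_array table;
   stable_array_init(&table, struct bo_record);

   /* Elements are allocated zero-initialized on demand and never move,
    * so this pointer stays valid for the lifetime of the array. */
   struct bo_record *rec = stable_array_get(&table, struct bo_record, 42);
   rec->handle = 42;
   rec->label = "scratch";

   /* Fast path that skips the lock once the element is known to exist. */
   struct bo_record *again = stable_array_get_existing(&table, struct bo_record, 42);
   (void)again;

   stable_array_fini(&table);
}

The lock is only taken when a bucket has to be allocated, which is what keeps pan_lookup_bo() cheap on the common path while still giving BOs stable addresses.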