Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update BitBlt support (primarily for 64-bit ARM) #565

Open
wants to merge 17 commits into
base: Cog
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.linux32ARMv6/squeak.cog.spur/build.assert/mvm
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set -e
# assert VM with VM profiler and threaded heartbeat
INSTALLDIR=assert/sqcogspurlinuxhtRPi
OPT="-m32 -g3 -O1 -DDEBUGVM=0"
OPT="-g3 -O1 -DDEBUGVM=0"

if [ $# -ge 1 ]; then
INSTALLDIR="$1"; shift
Expand Down
2 changes: 1 addition & 1 deletion build.linux32ARMv6/squeak.cog.spur/build.debug/mvm
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set -e
# debug Spur VM with VM profiler and threaded heartbeat
INSTALLDIR=debug/sqcogspurlinuxhtRPi
OPT="-m32 -g3 -O0 -DDEBUGVM=1"
OPT="-g3 -O0 -DDEBUGVM=1"

if [ $# -ge 1 ]; then
INSTALLDIR="$1"; shift
Expand Down
2 changes: 1 addition & 1 deletion build.linux32ARMv6/squeak.cog.spur/build/mvm
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set -e
# Spur VM with VM profiler and threaded heartbeat
INSTALLDIR=sqcogspurlinuxhtRPi
OPT="-m32 -g -O2 -DNDEBUG -DDEBUGVM=0"
OPT="-g -O2 -DNDEBUG -DDEBUGVM=0"

if [ $# -ge 1 ]; then
INSTALLDIR="$1"; shift
Expand Down
1 change: 1 addition & 0 deletions build.linux64ARMv8/squeak.cog.spur/build.assert/mvm
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ test -f plugins.ext || (test -f ../plugins.ext && cp -p ../plugins.ext . || cp -
test -f config.h || ../../../platforms/unix/config/configure \
--with-vmversion=5.0 --with-src=spur64src \
--without-vm-display-fbdev --without-npsqueak \
--enable-fast-bitblt \
CFLAGS="$MACHINE $OPT -DCOGMTVM=0 -DDUAL_MAPPED_CODE_ZONE=1" \
LIBS="-lrt"

Expand Down
1 change: 1 addition & 0 deletions build.linux64ARMv8/squeak.cog.spur/build.debug/mvm
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ test -f plugins.ext || (test -f ../plugins.ext && cp -p ../plugins.ext . || cp -
test -f config.h || ../../../platforms/unix/config/configure \
--with-vmversion=5.0 --with-src=spur64src \
--without-vm-display-fbdev --without-npsqueak \
--enable-fast-bitblt \
CFLAGS="$MACHINE $OPT -DCOGMTVM=0 -DDUAL_MAPPED_CODE_ZONE=1" \
LIBS="-lrt"

Expand Down
1 change: 1 addition & 0 deletions build.linux64ARMv8/squeak.cog.spur/build/mvm
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ test -f plugins.ext || (test -f ../plugins.ext && cp -p ../plugins.ext . || cp -
test -f config.h || ../../../platforms/unix/config/configure \
--with-vmversion=5.0 --with-src=spur64src \
--without-npsqueak \
--enable-fast-bitblt \
CFLAGS="$MACHINE $OPT -DCOGMTVM=0 -DDUAL_MAPPED_CODE_ZONE=1" \
LIBS="-lrt"
## --without-vm-display-fbdev --without-npsqueak \
Expand Down
2,482 changes: 2,482 additions & 0 deletions platforms/Cross/plugins/BitBltPlugin/BitBltArm64.c

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions platforms/Cross/plugins/BitBltPlugin/BitBltArm64.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
* Copyright © 2021 RISC OS Open Ltd
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of the copyright holders not be used in
* advertising or publicity pertaining to distribution of the software without
* specific, written prior permission. The copyright holders make no
* representations about the suitability of this software for any purpose. It
* is provided "as is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*
*/

#ifndef BITBLTARM64_H_
#define BITBLTARM64_H_

/* Entry point exported for AArch64 builds of the BitBlt plugin.
 * Takes no arguments and returns nothing.
 * NOTE(review): judging by the name, this registers the 64-bit ARM fast
 * paths with the dispatcher - confirm against BitBltArm64.c. */
void addArm64FastPaths(void);

#endif /* BITBLTARM64_H_ */
2 changes: 1 addition & 1 deletion platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
Original file line number Diff line number Diff line change
Expand Up @@ -1542,7 +1542,7 @@ subblock SETA 0
WHILE subblock < pix_per_block*dst_w_bpp/128
[ dst_w_bpp > 4 * src_bpp :LAND: src_bpp > 0
ASSERT flags & FLAG_MAX_256BIT_MACRO = 0
AddL scratch, x, (prefetch_distance+2)*pix_per_block + subblock*128/dst_w_bpp
AddL scratch, x, (prefetch_distance+2)*pix_per_block - subblock*128/dst_w_bpp
TST scratch, #32/src_bpp - 128/dst_w_bpp
BNE %FT53
Read1Word src, 0, carry, &$fixed_skew, skew, scratch
Expand Down
17 changes: 11 additions & 6 deletions platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ counter SETA counter + 4

SourceWord GenerateFunctions 1, 4,, \
FLAG_COLOUR_MAP :OR: FLAG_DST_WRITEONLY :OR: FLAG_SPILL_LINE_VARS :OR: FLAG_NO_EXPAND_SKEW, 2, \
"stride_s,map,bitptrs,orig_w,scratch", \
"stride_s,map,bitptrs,scratch,orig_w", \
"x,stride_s,bitptrs", orig_w,, init ; leading_pixels_reg=wk3

; ********************************************************************
Expand Down Expand Up @@ -951,16 +951,16 @@ counter SETA counter + 4

MACRO
SourceWord1_2_128bits_head $src, $fixed_skew, $intra_preloads
Read2Words src, 3, carry, $fixed_skew, skew, $wk0
Read2Words src, 2, carry, $fixed_skew, skew, $wk0
MEND

MACRO
SourceWord1_2_128bits_tail $src
LCLA counter
counter SETA 0
WHILE counter < 16
MSR CPSR_f, $wk3
MOV $wk3, $wk3, LSL #4
MSR CPSR_f, $wk2
MOV $wk2, $wk2, LSL #4
ORRPL $wk0, ht, $wk0, LSL #2
ORRMI $wk0, ht_info, $wk0, LSL #2
ORRNE $wk0, ht, $wk0, LSL #2
Expand All @@ -973,8 +973,8 @@ counter SETA counter + 4
WEND
counter SETA 0
WHILE counter < 16
MSR CPSR_f, $wk3
MOV $wk3, $wk3, LSL #4
MSR CPSR_f, $wk2
MOV $wk2, $wk2, LSL #4
ORRPL $wk1, ht, $wk1, LSL #2
ORRMI $wk1, ht_info, $wk1, LSL #2
ORRNE $wk1, ht, $wk1, LSL #2
Expand All @@ -987,8 +987,13 @@ counter SETA counter + 4
WEND
counter SETA 0
WHILE counter < 16
[ counter = 0
MSR CPSR_f, $wk3
MOV $wk4, $wk3, LSL #4
|
MSR CPSR_f, $wk4
MOV $wk4, $wk4, LSL #4
]
ORRPL $wk2, ht, $wk2, LSL #2
ORRMI $wk2, ht_info, $wk2, LSL #2
ORRNE $wk2, ht, $wk2, LSL #2
Expand Down
126 changes: 97 additions & 29 deletions platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

#include "BitBltDispatch.h"
#include "BitBltArm.h"
#include "BitBltArm64.h"
#include "BitBltGeneric.h"
#include "BitBltInternal.h"

Expand Down Expand Up @@ -202,7 +203,9 @@ void initialiseCopyBits(void)
#ifdef __arm__
addArmFastPaths();
#endif

#ifdef __aarch64__
addArm64FastPaths();
#endif
}

void addFastPaths(fast_path_t *paths, size_t n)
Expand Down Expand Up @@ -310,41 +313,106 @@ void copyBitsDispatch(operation_t *op)

if (op->cmFlags & ColorMapIndexedPart) {
if (op->cmFlags & ColorMapFixedPart) {
if (op->src.depth == 32) {
if (op->cmMask == 0x7FFF && memcmp(op->cmMaskTable, maskTable85, sizeof maskTable85) == 0 && memcmp(op->cmShiftTable, shiftTable85, sizeof shiftTable85) == 0)
flags |= FAST_PATH_15BIT_COLOR_MAP;
else if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable84, sizeof maskTable84) == 0 && memcmp(op->cmShiftTable, shiftTable84, sizeof shiftTable84) == 0)
flags |= FAST_PATH_12BIT_COLOR_MAP;
else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable83, sizeof maskTable83) == 0 && memcmp(op->cmShiftTable, shiftTable83, sizeof shiftTable83) == 0)
flags |= FAST_PATH_9BIT_COLOR_MAP;
else {
/* Unsupported case 1 */
copyBitsFallback(op, 0);
/* First check whether all except the first colour map entry match.
* This indicates that the operation depends only upon whether
* each pixel has value 0 or not (note that non-zero pixels that
* reduce to 0 when bit-packed use lookup index 1)
*/
#ifdef __aarch64__
/* AArch64 path: only tables with a power-of-two number of entries (at
 * least 16) are examined; any other table size leaves the 1-bit flag
 * clear on this path. */
if (op->cmMask >= 15 && (op->cmMask & (op->cmMask + 1)) == 0) {
    uint32_t value = (*op->cmLookupTable)[1];
    uint32_t *ptr = (*op->cmLookupTable) + 2;
    uint32_t *end = (*op->cmLookupTable) + 16;
    /* Scalar check of entries 2..15 against entry 1. This also brings
     * ptr up to entry 16, so the vector loop below always starts on a
     * 16-entry (64-byte) boundary relative to the table base. */
    for (; ptr != end; ++ptr)
        if (*ptr != value)
            break;
    if (ptr == end) {
        /* Move end to one past the last table entry. */
        end += op->cmMask + 1 - 16;
        if (ptr == end) {
            /* 16-entry table: every entry has already been checked.
             * The vector loop must not be entered here, because it
             * loads before it tests and exits only on ptr == end; it
             * would read beyond the table and never terminate. */
            flags |= FAST_PATH_1BIT_COLOR_MAP;
        } else {
            uint64_t result0, result1;
            /* Compare the remaining entries 16 at a time: XOR each
             * entry with the reference value and OR the differences
             * together. The remaining count (cmMask + 1 - 16) is a
             * multiple of 16 for every table size admitted above, so
             * ptr lands exactly on end. */
            __asm__ volatile (
                "dup v16.4s, %w[value] \n\t"
                "movi v4.16b, #0 \n\t"
                "movi v5.16b, #0 \n\t"
                "movi v6.16b, #0 \n\t"
                "movi v7.16b, #0 \n\t"
                "1: \n\t"
                "ld1 {v0.16b-v3.16b}, [%[ptr]], #64 \n\t"
                "eor v0.16b, v0.16b, v16.16b \n\t"
                "eor v1.16b, v1.16b, v16.16b \n\t"
                "eor v2.16b, v2.16b, v16.16b \n\t"
                "eor v3.16b, v3.16b, v16.16b \n\t"
                "orr v4.16b, v4.16b, v0.16b \n\t"
                "orr v5.16b, v5.16b, v1.16b \n\t"
                "orr v6.16b, v6.16b, v2.16b \n\t"
                "orr v7.16b, v7.16b, v3.16b \n\t"
                "cmp %[ptr], %[end] \n\t"
                "b.ne 1b \n\t"
                "orr v4.16b, v4.16b, v5.16b \n\t"
                "orr v6.16b, v6.16b, v7.16b \n\t"
                "orr v4.16b, v4.16b, v6.16b \n\t"
                "mov %[result0], v4.d[0] \n\t"
                "mov %[result1], v4.d[1] \n\t"
            : /* Outputs */
                [ptr]"+r"(ptr),
                [result0]"=r"(result0),
                [result1]"=r"(result1)
            : /* Inputs */
                [value]"r"(value),
                [end]"r"(end)
            : /* Clobbers: every SIMD register written above must be
               * listed, and "memory" is needed because the table is
               * read through ptr without a memory operand. */
                "cc", "memory",
                "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
            );
            /* Both halves zero means no remaining entry differed. */
            if (result0 == 0 && result1 == 0)
                flags |= FAST_PATH_1BIT_COLOR_MAP;
        }
    }
}
#else
/* Portable path: assume the 1-bit case, then clear the flag as soon as
 * any entry from index 2 upwards differs from entry 1. */
usqInt i;
flags |= FAST_PATH_1BIT_COLOR_MAP;
for (i = op->cmMask; i >= 2; --i) {
    if ((*op->cmLookupTable)[i] != (*op->cmLookupTable)[1]) {
        flags &= ~FAST_PATH_1BIT_COLOR_MAP;
        break;
    }
}
#endif
if ((flags & FAST_PATH_1BIT_COLOR_MAP) == 0) {
if (op->src.depth == 32) {
if (op->cmMask == 0x7FFF && memcmp(op->cmMaskTable, maskTable85, sizeof maskTable85) == 0 && memcmp(op->cmShiftTable, shiftTable85, sizeof shiftTable85) == 0)
flags |= FAST_PATH_15BIT_COLOR_MAP;
else if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable84, sizeof maskTable84) == 0 && memcmp(op->cmShiftTable, shiftTable84, sizeof shiftTable84) == 0)
flags |= FAST_PATH_12BIT_COLOR_MAP;
else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable83, sizeof maskTable83) == 0 && memcmp(op->cmShiftTable, shiftTable83, sizeof shiftTable83) == 0)
flags |= FAST_PATH_9BIT_COLOR_MAP;
else {
/* Unsupported case 1 */
copyBitsFallback(op, 0);
#ifdef PROFILING
profile_unrecorded_cases[1]++;
profile_unrecorded_cases[1]++;
#endif
return;
}
} else if (op->src.depth == 16) {
if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable54, sizeof maskTable54) == 0 && memcmp(op->cmShiftTable, shiftTable54, sizeof shiftTable54) == 0)
flags |= FAST_PATH_12BIT_COLOR_MAP;
else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable53, sizeof maskTable53) == 0 && memcmp(op->cmShiftTable, shiftTable53, sizeof shiftTable53) == 0)
flags |= FAST_PATH_9BIT_COLOR_MAP;
else {
/* Unsupported case 2 */
return;
}
} else if (op->src.depth == 16) {
if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable54, sizeof maskTable54) == 0 && memcmp(op->cmShiftTable, shiftTable54, sizeof shiftTable54) == 0)
flags |= FAST_PATH_12BIT_COLOR_MAP;
else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable53, sizeof maskTable53) == 0 && memcmp(op->cmShiftTable, shiftTable53, sizeof shiftTable53) == 0)
flags |= FAST_PATH_9BIT_COLOR_MAP;
else {
/* Unsupported case 2 */
copyBitsFallback(op, 0);
#ifdef PROFILING
profile_unrecorded_cases[2]++;
#endif
return;
}
} else {
/* Unsupported case 3 */
copyBitsFallback(op, 0);
#ifdef PROFILING
profile_unrecorded_cases[2]++;
profile_unrecorded_cases[3]++;
#endif
return;
}
} else {
/* Unsupported case 3 */
copyBitsFallback(op, 0);
#ifdef PROFILING
profile_unrecorded_cases[3]++;
#endif
return;
}
} else {
if ((op->src.depth < 16 && op->cmMask == (1u << op->src.depth) - 1) ||
Expand Down
2 changes: 1 addition & 1 deletion platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ typedef struct {
unsigned int (*cmLookupTable)[];
bool noHalftone;
usqInt halftoneHeight;
sqInt (*halftoneBase)[];
unsigned int (*halftoneBase)[];
union {
sqInt sourceAlpha;
struct {
Expand Down
Loading