diff --git a/Makefile b/Makefile index 39b5fbaf8..5d8850665 100644 --- a/Makefile +++ b/Makefile @@ -288,7 +288,7 @@ OBJS += $(LCHDR)/src/libchdr_cdrom.o OBJS += $(LCHDR)/src/libchdr_chd.o OBJS += $(LCHDR)/src/libchdr_flac.o OBJS += $(LCHDR)/src/libchdr_huffman.o -$(LCHDR)/src/%.o: CFLAGS += -Wno-unused -Wno-maybe-uninitialized -std=gnu11 +$(LCHDR)/src/%.o: CFLAGS += -Wno-unused -Wno-maybe-uninitialized -Wno-format -std=gnu11 OBJS += $(LCHDR_LZMA)/src/Alloc.o OBJS += $(LCHDR_LZMA)/src/CpuArch.o OBJS += $(LCHDR_LZMA)/src/Delta.o diff --git a/Makefile.libretro b/Makefile.libretro index 6bacb3f47..21cbd282a 100644 --- a/Makefile.libretro +++ b/Makefile.libretro @@ -66,6 +66,10 @@ else ifeq ($(platform), vita) LD = arm-vita-eabi-ld$(EXE_EXT) OBJCOPY = arm-vita-eabi-objcopy$(EXE_EXT) else ifeq ($(platform), ctr) + ifneq ($(strip $(DEVKITPRO)),) + DEVKITARM ?= $(DEVKITPRO)/devkitARM + CTRULIB ?= $(DEVKITPRO)/libctru + endif ifeq ($(strip $(DEVKITARM)),) $(error "DEVKITARM env var is not set") endif diff --git a/frontend/libretro-rthreads.c b/frontend/libretro-rthreads.c index 82af3ef23..90067b137 100644 --- a/frontend/libretro-rthreads.c +++ b/frontend/libretro-rthreads.c @@ -24,6 +24,8 @@ void pcsxr_sthread_init(void) SysPrintf("%d cpu core(s) detected\n", cpu_features_get_core_amount()); #ifdef _3DS int64_t version = 0; + int fpscr = -1; + APT_CheckNew3DS(&is_new_3ds); svcGetSystemInfo(&version, 0x10000, 0); @@ -31,9 +33,10 @@ void pcsxr_sthread_init(void) u32 percent = -1; APT_GetAppCpuTimeLimit(&percent); - SysPrintf("%s3ds detected, v%d.%d, AppCpuTimeLimit=%ld\n", + __asm__ volatile("fmrx %0, fpscr" : "=r"(fpscr)); + SysPrintf("%s3ds detected, v%d.%d, AppCpuTimeLimit=%ld fpscr=%08x\n", is_new_3ds ? 
"new" : "old", (int)GET_VERSION_MAJOR(version), - (int)GET_VERSION_MINOR(version), percent); + (int)GET_VERSION_MINOR(version), percent, fpscr); #endif } diff --git a/frontend/libretro.c b/frontend/libretro.c index 68ec65ff8..aa69ce65e 100644 --- a/frontend/libretro.c +++ b/frontend/libretro.c @@ -2297,7 +2297,10 @@ static void update_variables(bool in_flight) if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value) { int psxclock = atoi(var.value); - Config.cycle_multiplier = 10000 / psxclock; + if (strcmp(var.value, "auto") == 0 || psxclock == 0) + Config.cycle_multiplier = CYCLE_MULT_DEFAULT; + else + Config.cycle_multiplier = 10000 / psxclock; } #if !defined(DRC_DISABLE) && !defined(LIGHTREC) @@ -3612,13 +3615,6 @@ void retro_init(void) if (environ_cb(RETRO_ENVIRONMENT_GET_RUMBLE_INTERFACE, &rumble)) rumble_cb = rumble.set_rumble_state; - /* Set how much slower PSX CPU runs * 100 (so that 200 is 2 times) - * we have to do this because cache misses and some IO penalties - * are not emulated. Warning: changing this may break compatibility. 
*/ - Config.cycle_multiplier = CYCLE_MULT_DEFAULT; -#if defined(HAVE_PRE_ARMV7) && !defined(_3DS) - Config.cycle_multiplier = 200; -#endif pl_rearmed_cbs.gpu_peops.iUseDither = 1; pl_rearmed_cbs.gpu_peops.dwActFixes = GPU_PEOPS_OLD_FRAME_SKIP; diff --git a/frontend/libretro_core_options.h b/frontend/libretro_core_options.h index ed2f41c31..220df0f2e 100644 --- a/frontend/libretro_core_options.h +++ b/frontend/libretro_core_options.h @@ -239,6 +239,7 @@ struct retro_core_option_v2_definition option_defs_us[] = { NULL, "system", { + { "auto", "Auto" }, { "30", NULL }, { "31", NULL }, { "32", NULL }, @@ -312,11 +313,7 @@ struct retro_core_option_v2_definition option_defs_us[] = { { "100", NULL }, { NULL, NULL }, }, -#if defined(HAVE_PRE_ARMV7) && !defined(_3DS) - "50", -#else - "57", -#endif + "auto", }, { "pcsx_rearmed_dithering", diff --git a/libpcsxcore/cdriso.c b/libpcsxcore/cdriso.c index 9b98fbe5d..9282430f9 100644 --- a/libpcsxcore/cdriso.c +++ b/libpcsxcore/cdriso.c @@ -1647,7 +1647,7 @@ int ISOreadTrack(const unsigned char *time, void *buf) ret = cdimg_read_func(cdHandle, 0, buf, sector); if (ret < 12*2 + 2048) { - if (multifile && sector >= msf2sec(ti[1].length)) { + if (buf && multifile && sector >= msf2sec(ti[1].length)) { // assume a gap not backed by a file memset(buf, 0, CD_FRAMESIZE_RAW); return 0; diff --git a/libpcsxcore/cdrom.c b/libpcsxcore/cdrom.c index 5fcc54cdc..a8eb6d9b8 100644 --- a/libpcsxcore/cdrom.c +++ b/libpcsxcore/cdrom.c @@ -476,7 +476,7 @@ static int ReadTrack(const u8 *time) return 1; ret = cdra_readTrack(time); - if (ret != 0) + if (ret == 0) memcpy(cdr.Prev, time, 3); return ret == 0; } @@ -1793,8 +1793,9 @@ int cdrFreeze(void *f, int Mode) { tmpp[1] = btoi(tmpp[1]); tmpp[2] = btoi(tmpp[2]); } - cdr.Prev[0]++; - ReadTrack(tmpp); + cdr.Prev[0] = 0xff; + if (tmpp[0] != 0xff) + ReadTrack(tmpp); if (cdr.Play) { if (cdr.freeze_ver < 0x63647202) diff --git a/libpcsxcore/database.c b/libpcsxcore/database.c index 054e2a66a..76951a682 100644 
--- a/libpcsxcore/database.c +++ b/libpcsxcore/database.c @@ -138,6 +138,8 @@ cycle_multiplier_overrides[] = { 200, { "SCES02873" } }, /* Zero Divide - sometimes too fast */ { 200, { "SLUS00183", "SLES00159", "SLPS00083", "SLPM80008" } }, + /* Eagle One: Harrier Attack - hangs (but not in standalone build?) */ + { 153, { "SLUS00943" } }, }; static const struct diff --git a/libpcsxcore/misc.c b/libpcsxcore/misc.c index 6ba8d72ce..5b4d81b98 100644 --- a/libpcsxcore/misc.c +++ b/libpcsxcore/misc.c @@ -795,8 +795,6 @@ int LoadState(const char *file) { CdromFrontendId = misc->CdromFrontendId; } - psxCpu->Notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL); - if (Config.HLE) psxBiosFreeze(0); @@ -828,6 +826,8 @@ int LoadState(const char *file) { if (Config.HLE) psxBiosCheckExe(biosBranchCheckOld, 0x60, 1); + psxCpu->Notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL); + result = 0; cleanup: memset(misc, 0, sizeof(*misc)); diff --git a/libpcsxcore/new_dynarec/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c index 62e984b6f..0b8d9e2bc 100644 --- a/libpcsxcore/new_dynarec/emu_if.c +++ b/libpcsxcore/new_dynarec/emu_if.c @@ -484,6 +484,10 @@ static void ari64_thread_init(void) else { u32 cpu_count = cpu_features_get_core_amount(); enable = cpu_count > 1; +#ifdef _3DS + // bad for old3ds, reportedly no improvement for new3ds + enable = 0; +#endif } if (!ndrc_g.thread.handle == !enable) diff --git a/libpcsxcore/psxcommon.h b/libpcsxcore/psxcommon.h index 68c32a91a..8a0ac703b 100644 --- a/libpcsxcore/psxcommon.h +++ b/libpcsxcore/psxcommon.h @@ -112,6 +112,8 @@ extern int Log; void __Log(char *fmt, ...); +// lots of timing depends on this and makes or breaks compatibility, +// don't change unless you're going to retest hundreds of games #define CYCLE_MULT_DEFAULT 175 typedef struct { diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c index f398695d2..e78feaf24 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c +++ 
b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c @@ -39,7 +39,7 @@ const u8 command_lengths[256] = }; #endif -void update_texture_ptr(psx_gpu_struct *psx_gpu) +static void update_texture_ptr(psx_gpu_struct *psx_gpu) { u8 *texture_base; u8 *texture_ptr; @@ -91,7 +91,7 @@ void update_texture_ptr(psx_gpu_struct *psx_gpu) psx_gpu->texture_page_ptr = texture_ptr; } -void set_texture(psx_gpu_struct *psx_gpu, u32 texture_settings) +static void set_texture(psx_gpu_struct *psx_gpu, u32 texture_settings) { texture_settings &= 0x1FF; if(psx_gpu->texture_settings != texture_settings) @@ -135,17 +135,18 @@ void set_texture(psx_gpu_struct *psx_gpu, u32 texture_settings) } } -void set_clut(psx_gpu_struct *psx_gpu, u32 clut_settings) +static void set_clut(psx_gpu_struct *psx_gpu, u32 clut_settings) { - if(psx_gpu->clut_settings != clut_settings) + clut_settings &= 0x7FFF; + if (psx_gpu->clut_settings != clut_settings) { flush_render_block_buffer(psx_gpu); psx_gpu->clut_settings = clut_settings; - psx_gpu->clut_ptr = psx_gpu->vram_ptr + ((clut_settings & 0x7FFF) * 16); + psx_gpu->clut_ptr = psx_gpu->vram_ptr + clut_settings * 16; } } -void set_triangle_color(psx_gpu_struct *psx_gpu, u32 triangle_color) +static void set_triangle_color(psx_gpu_struct *psx_gpu, u32 triangle_color) { if(psx_gpu->triangle_color != triangle_color) { diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S index ec87f211a..93269932d 100644 --- a/plugins/gpu_unai/gpu_arm.S +++ b/plugins/gpu_unai/gpu_arm.S @@ -1,5 +1,5 @@ /* - * (C) Gražvydas "notaz" Ignotas, 2011 + * (C) Gražvydas "notaz" Ignotas, 2011,2024 * * This work is licensed under the terms of GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. 
@@ -10,9 +10,15 @@ .text .align 2 +.macro pld_ reg offs=#0 +#ifdef HAVE_ARMV6 + pld [\reg, \offs] +#endif +.endm + @ in: r0=dst, r2=pal, r12=0x1e @ trashes r6-r8,lr,flags -.macro do_4_pixels rs ibase obase +.macro do_4x_4bpp rs ibase obase .if \ibase - 1 < 0 and r6, r12, \rs, lsl #1 .else @@ -35,22 +41,170 @@ strneh lr, [r0, #\obase+6] .endm -.global draw_spr16_full @ (u16 *d, void *s, u16 *pal, int lines) -draw_spr16_full: +@ in: r0=dst, r2=pal, r12=0x1fe +@ loads/stores \rs,r6-r8 +.macro do_4x_8bpp rs + and r6, r12, \rs, lsl #1 + and r7, r12, \rs, lsr #7 + and r8, r12, \rs, lsr #15 + and \rs,r12, \rs, lsr #23 + ldrh r6, [r2, r6] + ldrh r7, [r2, r7] + ldrh r8, [r2, r8] + ldrh \rs,[r2, \rs] + tst r6, r6 + strneh r6, [r0, #0] + tst r7, r7 + strneh r7, [r0, #2] + tst r8, r8 + strneh r8, [r0, #4] + tst \rs,\rs + strneh \rs,[r0, #6] +.endm + +.global sprite_4bpp_x16_asm @ (u16 *d, void *s, u16 *pal, int lines) +sprite_4bpp_x16_asm_: + ldr r2, [r3] @ pal + ldr r3, [r3, #0x1c] @ lines +sprite_4bpp_x16_asm: + .cfi_startproc stmfd sp!, {r4-r8,lr} + .cfi_def_cfa_offset 4*6 + .cfi_rel_offset lr, 4*5 mov r12, #0x1e @ empty pixel 0: ldmia r1, {r4,r5} - do_4_pixels r4, 0, 0 - do_4_pixels r4, 16, 8 - do_4_pixels r5, 0, 16 - do_4_pixels r5, 16, 24 + pld_ r1, #2048 + do_4x_4bpp r4, 0, 0 + do_4x_4bpp r4, 16, 8 + do_4x_4bpp r5, 0, 16 + do_4x_4bpp r5, 16, 24 subs r3, r3, #1 add r0, r0, #2048 add r1, r1, #2048 bgt 0b ldmfd sp!, {r4-r8,pc} + .cfi_endproc + + +@ +.macro sprite_driver_part1 is8bpp + stmfd sp!, {r4-r11,lr} + .cfi_def_cfa_offset 4*9 + .cfi_rel_offset lr, 4*8 + mov r12, #0x01e +.if \is8bpp + orr r12, r12, #0x1f0 @ mask=0x01fe +.endif + ldr r4, [r3, #4] @ u0 + ldr r5, [r3, #0x1c] @ h + and r4, r4, #((8 >> \is8bpp) - 1) + sub r5, r5, #1 + orr r5, r4, r5, lsl #8 @ ((h-1) << 8) | u0_fraction + mov r9, r2 @ saved_w + mov r10, r0 @ saved_dst + mov r11, r1 @ saved_src + ldr r2, [r3] @ pal +11: @ line_loop: + pld_ r11, #2048 + mov r0, r10 + mov r1, r11 + mov r3, r9 + ands r6, r5, 
#(7 >> \is8bpp) + bne 15f @ fractional_u +12: + subs r3, r3, #(8 >> \is8bpp) @ w + bmi 14f @ fractional_w +.endm +.macro sprite_driver_part2 is8bpp + cmn r3, #(8 >> \is8bpp) + bne 14f @ fractional_w +13: @ eol: + add r10, r10, #2048 + add r11, r11, #2048 + subs r5, r5, #0x100 + bpl 11b @ line_loop + ldmfd sp!, {r4-r11,pc} +14: @ fractional_w: + ldr r4, [r1], #4 + add r8, r3, #(8 >> \is8bpp) + mov r3, #0 + mov r4, r4, lsl #1 + b 16f @ fractional_loop +15: @ fractional_u: + bic r1, r1, #3 + rsb r8, r6, #(8 >> \is8bpp) + ldr r4, [r1], #4 + cmp r8, r3 + movgt r8, r3 + mov r7, r6, lsl #(2 + \is8bpp) + sub r3, r3, r8 + sub r7, r7, #1 + mov r4, r4, lsr r7 +16: @ fractional_loop: +.endm +.macro sprite_driver_part3 + tst r3, r3 + beq 13b @ sprd4_eol + b 12b @ return from fractional_u +.endm + +.global sprite_driver_4bpp_asm @ (u16 *d, const void *s, int width, spriteDriverArg) +sprite_driver_4bpp_asm: + .cfi_startproc + ldr r12, [r3, #4] @ u0 + mov r12, r12, lsl #29 + orr r12, r12, r2 @ w + cmp r12, #16 + beq sprite_4bpp_x16_asm_ @ use specialized aligned x16 version + sprite_driver_part1 0 +0: + ldr r4, [r1], #4 + pld_ r1, #28 + do_4x_4bpp r4, 0, 0 + do_4x_4bpp r4, 16, 8 + add r0, r0, #16 + subs r3, r3, #8 + bpl 0b + sprite_driver_part2 0 +0: + and r7, r12, r4 + mov r4, r4, lsr #4 + ldrh r7, [r2, r7] + add r0, r0, #2 + tst r7, r7 + strneh r7, [r0, #-2] + subs r8, r8, #1 + bgt 0b + sprite_driver_part3 + .cfi_endproc + + +.global sprite_driver_8bpp_asm @ (u16 *d, const void *s, int width, spriteDriverArg) +sprite_driver_8bpp_asm: + .cfi_startproc + sprite_driver_part1 1 +0: + ldr r4, [r1], #4 + pld_ r1, #28 + do_4x_8bpp r4 + add r0, r0, #8 + subs r3, r3, #4 + bpl 0b + sprite_driver_part2 1 +0: + and r7, r12, r4 + mov r4, r4, lsr #8 + ldrh r7, [r2, r7] + add r0, r0, #2 + tst r7, r7 + strneh r7, [r0, #-2] + subs r8, r8, #1 + bgt 0b + sprite_driver_part3 + .cfi_endproc + @ vim:filetype=armasm diff --git a/plugins/gpu_unai/gpu_arm.h b/plugins/gpu_unai/gpu_arm.h index 
0f8ed6b5f..2329c46c3 100644 --- a/plugins/gpu_unai/gpu_arm.h +++ b/plugins/gpu_unai/gpu_arm.h @@ -5,7 +5,13 @@ extern "C" { #endif -void draw_spr16_full(void *d, void *s, void *pal, int lines); +struct spriteDriverArg; + +void sprite_driver_4bpp_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct spriteDriverArg *arg); +void sprite_driver_8bpp_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct spriteDriverArg *arg); +void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines); #ifdef __cplusplus } diff --git a/plugins/gpu_unai/gpu_fixedpoint.h b/plugins/gpu_unai/gpu_fixedpoint.h index f809905ee..364321b7e 100644 --- a/plugins/gpu_unai/gpu_fixedpoint.h +++ b/plugins/gpu_unai/gpu_fixedpoint.h @@ -75,7 +75,7 @@ INLINE float FloatInv(const float x) /////////////////////////////////////////////////////////////////////////// // --- BEGIN INVERSE APPROXIMATION SECTION --- /////////////////////////////////////////////////////////////////////////// -#if defined(GPU_UNAI_USE_INT_DIV_MULTINV) || !defined(GPU_UNAI_NO_OLD) +#if defined(GPU_UNAI_USE_INT_DIV_MULTINV) || (!defined(GPU_UNAI_NO_OLD) && !defined(GPU_UNAI_USE_FLOATMATH)) // big precision inverse table. 
#define TABLE_BITS 16 diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h index 1a93a3920..a80c3a3a4 100644 --- a/plugins/gpu_unai/gpu_inner.h +++ b/plugins/gpu_unai/gpu_inner.h @@ -360,8 +360,19 @@ const PT gpuTileSpanDrivers[32] = { /////////////////////////////////////////////////////////////////////////////// // GPU Sprites innerloops generator +// warning: gpu_arm.S asm uses this, update it if you change this +typedef struct spriteDriverArg { + const le16_t *CBA; // 00 + u32 u0, v0, u0_mask, v0_mask; // 04 08 0c 10 + s32 y0, y1, lines, li; // 14 +} spriteDriverArg; + +typedef void (*PS)(le16_t *pPixel, u32 count, const u8 *pTxt, + const spriteDriverArg *arg); + template -static void gpuSpriteSpanFn(le16_t *pDst, u32 count, u8* pTxt, u32 u0) +static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base, + const spriteDriverArg *arg) { // Blend func can save an operation if it knows uSrc MSB is unset. // Untextured prims can always skip (source color always comes with MSB=0). 
@@ -370,7 +381,7 @@ static void gpuSpriteSpanFn(le16_t *pDst, u32 count, u8* pTxt, u32 u0) uint_fast16_t uSrc, uDst, srcMSB; bool should_blend; - u32 u0_mask = gpu_unai.TextureWindow[2]; + u32 u0_mask = arg->u0_mask; u8 r5, g5, b5; if (CF_LIGHT) { @@ -384,10 +395,20 @@ static void gpuSpriteSpanFn(le16_t *pDst, u32 count, u8* pTxt, u32 u0) u0_mask <<= 1; } - const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA; + const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = arg->CBA; + const u32 v0_mask = arg->v0_mask; + s32 y0 = arg->y0, y1 = arg->y1, li = arg->li; + u32 u0_ = arg->u0, v0 = arg->v0; - do + for (; y0 < y1; ++y0, pPixel += FRAME_WIDTH, ++v0) { + if (y0 & li) continue; + const u8 *pTxt = pTxt_base + ((v0 & v0_mask) * 2048); + le16_t *pDst = pPixel; + u32 u0 = u0_; + u32 count1 = count; + do + { if (CF_MASKCHECK || CF_BLEND) { uDst = le16_to_u16(*pDst); } if (CF_MASKCHECK) if (uDst&0x8000) { goto endsprite; } @@ -423,11 +444,47 @@ static void gpuSpriteSpanFn(le16_t *pDst, u32 count, u8* pTxt, u32 u0) endsprite: u0 += (CF_TEXTMODE==3) ? 
2 : 1; pDst++; + } + while (--count1); + } +} + +#ifdef __arm__ +#include "gpu_arm.h" + +static void Sprite4bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base, + const spriteDriverArg *arg) +{ +#if 1 + s32 lines = arg->lines; + u32 u1m = arg->u0 + count - 1, v1m = arg->v0 + lines - 1; + if (u1m == (u1m & arg->u0_mask) && v1m == (v1m & arg->v0_mask)) { + pTxt_base += arg->u0 / 2 + arg->v0 * 2048; + sprite_driver_4bpp_asm(pPixel, pTxt_base, count, arg); } - while (--count); + else +#endif + gpuSpriteDriverFn<0x20>(pPixel, count, pTxt_base, arg); +} + +static void Sprite8bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base, + const spriteDriverArg *arg) +{ +#if 1 + s32 lines = arg->lines; + u32 u1m = arg->u0 + count - 1, v1m = arg->v0 + lines - 1; + if (u1m == (u1m & arg->u0_mask) && v1m == (v1m & arg->v0_mask)) { + pTxt_base += arg->u0 + arg->v0 * 2048; + sprite_driver_8bpp_asm(pPixel, pTxt_base, count, arg); + } + else +#endif + gpuSpriteDriverFn<0x40>(pPixel, count, pTxt_base, arg); } +#endif // __arm__ -static void SpriteNULL(le16_t *pDst, u32 count, u8* pTxt, u32 u0) +static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base, + const spriteDriverArg *arg) { #ifdef ENABLE_GPU_LOG_SUPPORT fprintf(stdout,"SpriteNULL()\n"); @@ -438,30 +495,36 @@ static void SpriteNULL(le16_t *pDst, u32 count, u8* pTxt, u32 u0) /////////////////////////////////////////////////////////////////////////////// // Sprite innerloops driver -typedef void (*PS)(le16_t *pDst, u32 count, u8* pTxt, u32 u0); // Template instantiation helper macros -#define TI(cf) gpuSpriteSpanFn<(cf)> +#define TI(cf) gpuSpriteDriverFn<(cf)> #define TN SpriteNULL +#ifdef __arm__ +#define TA4(cf) Sprite4bppMaybeAsm +#define TA8(cf) Sprite8bppMaybeAsm +#else +#define TA4(cf) TI(cf) +#define TA8(cf) TI(cf) +#endif #define TIBLOCK(ub) \ - TN, TN, TN, TN, TN, TN, TN, TN, \ - TN, TN, TN, TN, TN, TN, TN, TN, \ - TN, TN, TN, TN, TN, TN, TN, TN, \ - TN, TN, TN, TN, TN, TN, TN, TN, \ - 
TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ - TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \ - TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \ - TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \ - TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \ - TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \ - TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \ - TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \ - TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \ - TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \ - TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \ - TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f) - -const PS gpuSpriteSpanDrivers[256] = { + TN, TN, TN, TN, TN, TN, TN, TN, \ + TN, TN, TN, TN, TN, TN, TN, TN, \ + TN, TN, TN, TN, TN, TN, TN, TN, \ + TN, TN, TN, TN, TN, TN, TN, TN, \ + TA4((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ + TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \ + TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \ + TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \ + TA8((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \ + TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \ + TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \ + TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), 
TI((ub)|0x5f), \ + TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \ + TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \ + TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \ + TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f) + +const PS gpuSpriteDrivers[256] = { TIBLOCK(0<<8), TIBLOCK(1<<8) }; diff --git a/plugins/gpu_unai/gpu_raster_polygon.h b/plugins/gpu_unai/gpu_raster_polygon.h index ff6dc00d7..1b9e08dca 100644 --- a/plugins/gpu_unai/gpu_raster_polygon.h +++ b/plugins/gpu_unai/gpu_raster_polygon.h @@ -257,7 +257,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad x3 = x4 = i2x(x0); if (dx < 0) { #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0; dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0; #else @@ -275,7 +275,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad #endif } else { #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0; dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0; #else @@ -303,7 +303,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad x3 = i2x(x0) + (dx3 * (y1 - y0)); x4 = i2x(x1); #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; #else dx4 = ((y2 - y1) != 0) ? 
(fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; @@ -319,7 +319,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad x3 = i2x(x1); x4 = i2x(x0) + (dx4 * (y1 - y0)); #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; #else dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; @@ -581,7 +581,7 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua v3 += (dv3 * (y1 - y0)); } #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; #else dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; @@ -920,7 +920,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad } #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; #else dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; @@ -1305,7 +1305,7 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua } #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; #else dx4 = ((y2 - y1) != 0) ? 
(fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h index 6909f4f8a..2564e7f03 100644 --- a/plugins/gpu_unai/gpu_raster_sprite.h +++ b/plugins/gpu_unai/gpu_raster_sprite.h @@ -24,7 +24,7 @@ /////////////////////////////////////////////////////////////////////////////// // GPU internal sprite drawing functions -void gpuDrawS(PtrUnion packet, const PS gpuSpriteSpanDriver, s32 *w_out, s32 *h_out) +void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out) { s32 x0, x1, y0, y1; u32 u0, v0; @@ -67,70 +67,26 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteSpanDriver, s32 *w_out, s32 *h_ le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)]; const int li=gpu_unai.ilace_mask; - const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); - const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + //const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); + //const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); unsigned int tmode = gpu_unai.TEXT_MODE >> 5; - const u32 v0_mask = gpu_unai.TextureWindow[3]; u8* pTxt_base = (u8*)gpu_unai.TBA; // Texture is accessed byte-wise, so adjust idx if 16bpp if (tmode == 3) u0 <<= 1; - for (; y0 xmax - 16 || x0 < xmin || - ((u0 | v0) & 15) || !(gpu_unai.TextureWindow[2] & gpu_unai.TextureWindow[3] & 8)) { - // send corner cases to general handler - packet.U4[3] = u32_to_le32(0x00100010); - gpuDrawS(packet, gpuSpriteSpanFn<0x20>, w_out, h_out); - return; - } - - if (y0 >= ymax || y0 <= ymin - 16) - return; - if (y0 < ymin) { - h -= ymin - y0; - v0 += ymin - y0; - y0 = ymin; - } - else if (ymax - y0 < 16) - h = ymax - y0; - *w_out = 16; - *h_out = h; - - draw_spr16_full(&gpu_unai.vram[FRAME_OFFSET(x0, y0)], &gpu_unai.TBA[FRAME_OFFSET(u0/4, v0)], gpu_unai.CBA, h); + spriteDriverArg arg; + arg.CBA = 
gpu_unai.CBA; + arg.u0 = u0; + arg.v0 = v0; + arg.u0_mask = gpu_unai.TextureWindow[2]; + arg.v0_mask = gpu_unai.TextureWindow[3]; + arg.y0 = y0; + arg.y1 = y1; + arg.lines = y1 - y0; + arg.li = li; + gpuSpriteDriver(Pixel, x1, pTxt_base, &arg); } -#endif // __arm__ void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver, s32 *w_out, s32 *h_out) { diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp index be6b6c9e7..eb47c2a6d 100644 --- a/plugins/gpu_unai/gpulib_if.cpp +++ b/plugins/gpu_unai/gpulib_if.cpp @@ -254,7 +254,7 @@ int renderer_init(void) //gpu_unai.config.enableAbbeyHack = gpu_unai_config_ext.abe_hack; gpu_unai.ilace_mask = gpu_unai.config.ilace_force; -#if defined(GPU_UNAI_USE_INT_DIV_MULTINV) || !defined(GPU_UNAI_NO_OLD) +#if defined(GPU_UNAI_USE_INT_DIV_MULTINV) || (!defined(GPU_UNAI_NO_OLD) && !defined(GPU_UNAI_USE_FLOATMATH)) // s_invTable for(int i=1;i<=(1<> 16); - gpuDrawS16(packet, &w, &h); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); - break; - } - // fallthrough -#endif case 0x7E: case 0x7F: { // Textured rectangle (16x16) gpu_unai.PacketBuffer.U4[3] = u32_to_le32(0x00100010); @@ -788,7 +777,7 @@ int do_cmd_list(u32 *list_, int list_len, // Strip lower 3 bits of each color and determine if lighting should be used: if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080)) driver_idx |= Lighting; - PS driver = gpuSpriteSpanDrivers[driver_idx]; + PS driver = gpuSpriteDrivers[driver_idx]; gpuDrawS(packet, driver, &w, &h); gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); } break; diff --git a/plugins/gpu_unai/old/gpu_arm.h b/plugins/gpu_unai/old/gpu_arm.h deleted file mode 100644 index a0b22487b..000000000 --- a/plugins/gpu_unai/old/gpu_arm.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifdef __cplusplus -extern "C" { -#endif - -void draw_spr16_full(u16 *d, void *s, u16 *pal, int lines); - -#ifdef __cplusplus -} -#endif diff --git a/plugins/gpu_unai/old/gpu_arm.s 
b/plugins/gpu_unai/old/gpu_arm.s deleted file mode 100644 index 8fa44a7ac..000000000 --- a/plugins/gpu_unai/old/gpu_arm.s +++ /dev/null @@ -1,55 +0,0 @@ -/* - * (C) Gražvydas "notaz" Ignotas, 2011 - * - * This work is licensed under the terms of GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - */ - - -.text -.align 2 - -@ in: r0=dst, r2=pal, r12=0x1e -@ trashes r6-r8,lr,flags -.macro do_4_pixels rs ibase obase -.if \ibase - 1 < 0 - and r6, r12, \rs, lsl #1 -.else - and r6, r12, \rs, lsr #\ibase-1 -.endif - and r7, r12, \rs, lsr #\ibase+3 - and r8, r12, \rs, lsr #\ibase+7 - and lr, r12, \rs, lsr #\ibase+11 - ldrh r6, [r2, r6] - ldrh r7, [r2, r7] - ldrh r8, [r2, r8] - ldrh lr, [r2, lr] - tst r6, r6 - strneh r6, [r0, #\obase+0] - tst r7, r7 - strneh r7, [r0, #\obase+2] - tst r8, r8 - strneh r8, [r0, #\obase+4] - tst lr, lr - strneh lr, [r0, #\obase+6] -.endm - -.global draw_spr16_full @ (u16 *d, void *s, u16 *pal, int lines) -draw_spr16_full: - stmfd sp!, {r4-r8,lr} - mov r12, #0x1e @ empty pixel - -0: - ldmia r1, {r4,r5} - do_4_pixels r4, 0, 0 - do_4_pixels r4, 16, 8 - do_4_pixels r5, 0, 16 - do_4_pixels r5, 16, 24 - subs r3, r3, #1 - add r0, r0, #2048 - add r1, r1, #2048 - bgt 0b - - ldmfd sp!, {r4-r8,pc} - -@ vim:filetype=armasm diff --git a/plugins/gpu_unai/old/gpu_fixedpoint.h b/plugins/gpu_unai/old/gpu_fixedpoint.h index 5dae806d0..4ae3ed000 100644 --- a/plugins/gpu_unai/old/gpu_fixedpoint.h +++ b/plugins/gpu_unai/old/gpu_fixedpoint.h @@ -38,9 +38,6 @@ typedef s32 fixed; #define fixed_TWO ((fixed)2<>1)) -// big precision inverse table. 
-extern s32 s_invTable[(1<>FIXED_BITS); } @@ -57,12 +54,39 @@ INLINE u32 Log2(u32 _a) } */ +#ifdef GPU_UNAI_USE_FLOATMATH + +#define inv_type float + +INLINE void xInv (const fixed _b, float & factor_, float & shift_) +{ + factor_ = 1.0f / _b; + shift_ = 0.0f; // not used +} + +INLINE fixed xInvMulx (const fixed _a, const float fact, const float shift) +{ + return (fixed)((_a << FIXED_BITS) * fact); +} + +INLINE fixed xLoDivx (const fixed _a, const fixed _b) +{ + return (fixed)((_a << FIXED_BITS) / (float)_b); +} + +#else + +#define inv_type s32 + #ifdef HAVE_ARMV5 INLINE u32 Log2(u32 x) { u32 res; asm("clz %0,%1" : "=r" (res) : "r" (x)); return 32-res; } #else INLINE u32 Log2(u32 x) { u32 i = 0; for ( ; x > 0; ++i, x >>= 1); return i - 1; } #endif +// big precision inverse table. +extern s32 s_invTable[(1< INLINE T Min2 (const T _a, const T _b) { return (_a<_b)?_a:_b; } diff --git a/plugins/gpu_unai/old/gpu_raster_polygon.h b/plugins/gpu_unai/old/gpu_raster_polygon.h index c4b035094..fcd1f6cea 100644 --- a/plugins/gpu_unai/old/gpu_raster_polygon.h +++ b/plugins/gpu_unai/old/gpu_raster_polygon.h @@ -245,7 +245,7 @@ void gpuDrawFT3(const PP gpuPolySpanDriver) du4 = (u2 - u1) * ya - (u2 - u0) * yb; dv4 = (v2 - v1) * ya - (v2 - v0) * yb; - s32 iF,iS; + inv_type iF,iS; xInv( dx, iF, iS); du4 = xInvMulx( du4, iF, iS); dv4 = xInvMulx( dv4, iF, iS); @@ -425,7 +425,7 @@ void gpuDrawG3(const PP gpuPolySpanDriver) dg4 = (g2 - g1) * ya - (g2 - g0) * yb; db4 = (b2 - b1) * ya - (b2 - b0) * yb; - s32 iF,iS; + inv_type iF,iS; xInv( dx, iF, iS); dr4 = xInvMulx( dr4, iF, iS); dg4 = xInvMulx( dg4, iF, iS); @@ -619,7 +619,7 @@ void gpuDrawGT3(const PP gpuPolySpanDriver) dg4 = (g2 - g1) * ya - (g2 - g0) * yb; db4 = (b2 - b1) * ya - (b2 - b0) * yb; - s32 iF,iS; + inv_type iF,iS; xInv( dx, iF, iS); du4 = xInvMulx( du4, iF, iS); diff --git a/plugins/gpu_unai/old/gpu_raster_sprite.h b/plugins/gpu_unai/old/gpu_raster_sprite.h index a700db32f..4e19428e6 100644 --- 
a/plugins/gpu_unai/old/gpu_raster_sprite.h +++ b/plugins/gpu_unai/old/gpu_raster_sprite.h @@ -85,7 +85,7 @@ void gpuDrawS(const PS gpuSpriteSpanDriver) } #ifdef __arm__ -#include "gpu_arm.h" +#include "../gpu_arm.h" void gpuDrawS16(void) { @@ -121,7 +121,7 @@ void gpuDrawS16(void) else if (ymax - y0 < 16) h = ymax - y0; - draw_spr16_full(&GPU_FrameBuffer[FRAME_OFFSET(x0, y0)], &TBA[FRAME_OFFSET(u0/4, v0)], CBA, h); + sprite_4bpp_x16_asm(&GPU_FrameBuffer[FRAME_OFFSET(x0, y0)], &TBA[FRAME_OFFSET(u0/4, v0)], CBA, h); } #endif // __arm__ diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c index 03be13d00..c4702d2fc 100644 --- a/plugins/gpulib/gpu.c +++ b/plugins/gpulib/gpu.c @@ -240,7 +240,7 @@ static noinline void get_gpu_info(uint32_t data) // fills. (Will change this value if it ever gets large page support) #define VRAM_ALIGN 8192 #else - #define VRAM_ALIGN 16 + #define VRAM_ALIGN 64 #endif // double, for overdraw guard + at least 1 page before @@ -250,7 +250,7 @@ static noinline void get_gpu_info(uint32_t data) static uint16_t *vram_ptr_orig = NULL; #ifndef GPULIB_USE_MMAP -# ifdef __linux__ +# if defined(__linux__) || defined(_3DS) || defined(HAVE_LIBNX) || defined(VITA) # define GPULIB_USE_MMAP 1 # else # define GPULIB_USE_MMAP 0