diff --git a/Makefile b/Makefile index 39b5fbaf..52275723 100644 --- a/Makefile +++ b/Makefile @@ -229,7 +229,7 @@ plugins/dfsound/out.o: CFLAGS += -DHAVE_LIBRETRO endif # builtin gpu -OBJS += plugins/gpulib/gpu.o plugins/gpulib/vout_pl.o +OBJS += plugins/gpulib/gpu.o plugins/gpulib/vout_pl.o plugins/gpulib/prim.o ifeq "$(BUILTIN_GPU)" "neon" CFLAGS += -DGPU_NEON OBJS += plugins/gpu_neon/psx_gpu_if.o @@ -272,9 +272,12 @@ OBJS += plugins/gpu_unai/old/if.o else CFLAGS += -DGPU_UNAI_NO_OLD endif +plugins/gpu_unai/gpulib_if.o: plugins/gpu_unai/*.h plugins/gpu_unai/gpulib_if.o: CFLAGS += -DREARMED -DUSE_GPULIB=1 +ifneq ($(DEBUG), 1) plugins/gpu_unai/gpulib_if.o \ plugins/gpu_unai/old/if.o: CFLAGS += -O3 +endif CC_LINK = $(CXX) endif diff --git a/Makefile.libretro b/Makefile.libretro index 06eab7d6..7ea7addb 100644 --- a/Makefile.libretro +++ b/Makefile.libretro @@ -356,7 +356,6 @@ else ifeq ($(platform), ctr) TARGET := $(TARGET_NAME)_libretro_ctr.a CFLAGS += -DARM11 -D_3DS -D__3DS__ CFLAGS += -DGPU_UNAI_USE_FLOATMATH -DGPU_UNAI_USE_FLOAT_DIV_MULTINV - CFLAGS += -DGPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE # needed on some compilers? 
CFLAGS += -march=armv6k -mtune=mpcore -mfloat-abi=hard -marm -mfpu=vfp -mtp=soft CFLAGS += -mword-relocations CFLAGS += -fomit-frame-pointer diff --git a/frontend/cspace_arm.S b/frontend/cspace_arm.S index 3ef5083b..41b1e691 100644 --- a/frontend/cspace_arm.S +++ b/frontend/cspace_arm.S @@ -20,6 +20,14 @@ orr \rn, r12, lsl #6 .endm +.macro bgr555_to_rgb565_one_i rn1 rn2 + and r12, lr, \rn1, lsr #5 + and \rn1,lr, \rn1, lsr #10 + orr r12, r11, lsl #5 + and r11, lr, \rn2 + orr \rn1,r12, lsl #6 +.endm + .macro pld_ reg offs=#0 #ifdef HAVE_ARMV6 pld [\reg, \offs] @@ -27,7 +35,6 @@ .endm FUNCTION(bgr555_to_rgb565): @ void *dst, const void *src, int bytes - pld_ r1 push {r4-r11,lr} mov lr, #0x001f subs r2, #4*8 @@ -43,16 +50,17 @@ FUNCTION(bgr555_to_rgb565): @ void *dst, const void *src, int bytes 0: ldmia r1!, {r3-r10} subs r2, #4*8 - bgr555_to_rgb565_one r3 - - pld_ r1, #32*2 - bgr555_to_rgb565_one r4 - bgr555_to_rgb565_one r5 - bgr555_to_rgb565_one r6 - bgr555_to_rgb565_one r7 - bgr555_to_rgb565_one r8 - bgr555_to_rgb565_one r9 - bgr555_to_rgb565_one r10 + bic r12, r1, #0x1f + pld_ r12, #32*1 + and r11, lr, r3 + bgr555_to_rgb565_one_i r3 r4 + bgr555_to_rgb565_one_i r4 r5 + bgr555_to_rgb565_one_i r5 r6 + bgr555_to_rgb565_one_i r6 r7 + bgr555_to_rgb565_one_i r7 r8 + bgr555_to_rgb565_one_i r8 r9 + bgr555_to_rgb565_one_i r9 r10 + bgr555_to_rgb565_one_i r10 r10 stmia r0!, {r3-r10} bge 0b diff --git a/include/compiler_features.h b/include/compiler_features.h index 21549ddf..77114efb 100644 --- a/include/compiler_features.h +++ b/include/compiler_features.h @@ -2,6 +2,7 @@ #ifdef __GNUC__ # define likely(x) __builtin_expect((x),1) # define unlikely(x) __builtin_expect((x),0) +# define preload __builtin_prefetch # ifdef __clang__ # define noinline __attribute__((noinline)) # else @@ -11,6 +12,7 @@ #else # define likely(x) (x) # define unlikely(x) (x) +# define preload(...) ((void)0) /* prefetch is only a hint: no-op without the GCC builtin */ # define noinline # define attr_unused #endif diff --git a/libpcsxcore/database.c b/libpcsxcore/database.c 
index 76951a68..a393ee92 100644 --- a/libpcsxcore/database.c +++ b/libpcsxcore/database.c @@ -82,6 +82,18 @@ static const char * const fractional_Framerate_hack_db[] = "SCUS94425", "SCES02104", }; +static const char * const f1_hack_db[] = +{ + /* Formula One Arcade */ + "SCES03886", + /* Formula One '99 */ + "SLUS00870", "SCPS10101", "SCES01979", "SLES01979", + /* Formula One 2000 */ + "SLUS01134", "SCES02777", "SCES02778", "SCES02779", + /* Formula One 2001 */ + "SCES03404", "SCES03423", "SCES03424", "SCES03524", +}; + #define HACK_ENTRY(var, list) \ { #var, &Config.hacks.var, list, ARRAY_SIZE(list) } @@ -100,6 +112,7 @@ hack_db[] = HACK_ENTRY(gpu_timing1024, dualshock_timing1024_hack_db), HACK_ENTRY(dualshock_init_analog, dualshock_init_analog_hack_db), HACK_ENTRY(fractional_Framerate, fractional_Framerate_hack_db), + HACK_ENTRY(f1, f1_hack_db), }; static const struct @@ -142,35 +155,6 @@ cycle_multiplier_overrides[] = { 153, { "SLUS00943" } }, }; -static const struct -{ - const char * const id; - u32 hacks; -} -lightrec_hacks_db[] = -{ - /* Formula One Arcade */ - { "SCES03886", LIGHTREC_HACK_INV_DMA_ONLY }, - - /* Formula One '99 */ - { "SLUS00870", LIGHTREC_HACK_INV_DMA_ONLY }, - { "SCPS10101", LIGHTREC_HACK_INV_DMA_ONLY }, - { "SCES01979", LIGHTREC_HACK_INV_DMA_ONLY }, - { "SLES01979", LIGHTREC_HACK_INV_DMA_ONLY }, - - /* Formula One 2000 */ - { "SLUS01134", LIGHTREC_HACK_INV_DMA_ONLY }, - { "SCES02777", LIGHTREC_HACK_INV_DMA_ONLY }, - { "SCES02778", LIGHTREC_HACK_INV_DMA_ONLY }, - { "SCES02779", LIGHTREC_HACK_INV_DMA_ONLY }, - - /* Formula One 2001 */ - { "SCES03404", LIGHTREC_HACK_INV_DMA_ONLY }, - { "SCES03423", LIGHTREC_HACK_INV_DMA_ONLY }, - { "SCES03424", LIGHTREC_HACK_INV_DMA_ONLY }, - { "SCES03524", LIGHTREC_HACK_INV_DMA_ONLY }, -}; - /* Function for automatic patching according to GameID. 
*/ void Apply_Hacks_Cdrom(void) { @@ -211,6 +195,8 @@ void Apply_Hacks_Cdrom(void) /* Dynarec game-specific hacks */ ndrc_g.hacks_pergame = 0; + if (Config.hacks.f1) + ndrc_g.hacks_pergame |= NDHACK_THREAD_FORCE; // force without *_ON -> off Config.cycle_multiplier_override = 0; for (i = 0; i < ARRAY_SIZE(cycle_multiplier_overrides); i++) @@ -229,15 +215,12 @@ void Apply_Hacks_Cdrom(void) } } - lightrec_hacks = 0; - - for (i = 0; drc_is_lightrec() && i < ARRAY_SIZE(lightrec_hacks_db); i++) { - if (strcmp(CdromId, lightrec_hacks_db[i].id) == 0) - { - lightrec_hacks = lightrec_hacks_db[i].hacks; + if (drc_is_lightrec()) { + lightrec_hacks = 0; + if (Config.hacks.f1) + lightrec_hacks |= LIGHTREC_HACK_INV_DMA_ONLY; + if (lightrec_hacks) SysPrintf("using lightrec_hacks: 0x%x\n", lightrec_hacks); - break; - } } } diff --git a/libpcsxcore/new_dynarec/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c index 0dcec554..a19bd2dd 100644 --- a/libpcsxcore/new_dynarec/emu_if.c +++ b/libpcsxcore/new_dynarec/emu_if.c @@ -303,10 +303,10 @@ static void ari64_apply_config() else ndrc_g.hacks &= ~NDHACK_NO_STALLS; - thread_changed = (ndrc_g.hacks ^ ndrc_g.hacks_old) + thread_changed = ((ndrc_g.hacks | ndrc_g.hacks_pergame) ^ ndrc_g.hacks_old) & (NDHACK_THREAD_FORCE | NDHACK_THREAD_FORCE_ON); if (Config.cycle_multiplier != ndrc_g.cycle_multiplier_old - || ndrc_g.hacks != ndrc_g.hacks_old) + || (ndrc_g.hacks | ndrc_g.hacks_pergame) != ndrc_g.hacks_old) { new_dynarec_clear_full(); } @@ -485,7 +485,9 @@ static void ari64_thread_init(void) { int enable; - if (ndrc_g.hacks & NDHACK_THREAD_FORCE) + if (ndrc_g.hacks_pergame & NDHACK_THREAD_FORCE) + enable = 0; + else if (ndrc_g.hacks & NDHACK_THREAD_FORCE) enable = ndrc_g.hacks & NDHACK_THREAD_FORCE_ON; else { u32 cpu_count = cpu_features_get_core_amount(); diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index e19e4361..e247faf2 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ 
b/libpcsxcore/new_dynarec/new_dynarec.c @@ -6292,13 +6292,13 @@ void new_dynarec_clear_full(void) stat_clear(stat_links); if (ndrc_g.cycle_multiplier_old != Config.cycle_multiplier - || ndrc_g.hacks_old != ndrc_g.hacks) + || ndrc_g.hacks_old != (ndrc_g.hacks | ndrc_g.hacks_pergame)) { SysPrintf("ndrc config: mul=%d, ha=%x, pex=%d\n", get_cycle_multiplier(), ndrc_g.hacks, Config.PreciseExceptions); } ndrc_g.cycle_multiplier_old = Config.cycle_multiplier; - ndrc_g.hacks_old = ndrc_g.hacks; + ndrc_g.hacks_old = ndrc_g.hacks | ndrc_g.hacks_pergame; } static int pgsize(void) diff --git a/libpcsxcore/psxcommon.h b/libpcsxcore/psxcommon.h index 8a0ac703..0a1ef707 100644 --- a/libpcsxcore/psxcommon.h +++ b/libpcsxcore/psxcommon.h @@ -156,6 +156,7 @@ typedef struct { boolean dualshock_init_analog; boolean gpu_timing1024; boolean fractional_Framerate; + boolean f1; } hacks; } PcsxConfig; diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c index e78feaf2..1fa06a15 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c @@ -16,6 +16,7 @@ #include "common.h" #include "../../gpulib/gpu_timing.h" +#include "../../gpulib/gpu.h" #ifndef command_lengths const u8 command_lengths[256] = @@ -245,12 +246,27 @@ static void do_fill(psx_gpu_struct *psx_gpu, u32 x, u32 y, #define SET_Ex(r, v) #endif +static void textured_sprite(psx_gpu_struct *psx_gpu, const u32 *list, + s32 width, s32 height, u32 *cpu_cycles_sum, u32 *cpu_cycles) +{ + s32 x = sign_extend_11bit(list[1] + psx_gpu->offset_x); + s32 y = sign_extend_11bit((list[1] >> 16) + psx_gpu->offset_y); + u8 v = (list[2] >> 8) & 0xff; + u8 u = list[2] & 0xff; + + set_clut(psx_gpu, list[2] >> 16); + + render_sprite(psx_gpu, x, y, u, v, &width, &height, list[0] >> 24, list[0]); + gput_sum(*cpu_cycles_sum, *cpu_cycles, gput_sprite(width, height)); +} + u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, s32 *cpu_cycles_sum_out, s32 
*cpu_cycles_last, u32 *last_command) { vertex_struct vertexes[4] __attribute__((aligned(16))) = {}; u32 current_command = 0, command_length; u32 cpu_cycles_sum = 0, cpu_cycles = *cpu_cycles_last; + u32 siplified_prim[4*4]; u32 *list_start = list; u32 *list_end = list + (size / 4); @@ -328,8 +344,19 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, case 0x2C ... 0x2F: { - set_clut(psx_gpu, list_s16[5]); - set_texture(psx_gpu, list_s16[9]); + u32 i, simplified_count; + set_texture(psx_gpu, list[4] >> 16); + if ((simplified_count = prim_try_simplify_quad_t(siplified_prim, list))) + { + for (i = 0; i < simplified_count; i++) { + const u32 *list_ = &siplified_prim[i * 4]; + textured_sprite(psx_gpu, list_, list_[3] & 0x3FF, + (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles); + } + break; + } + + set_clut(psx_gpu, list[2] >> 16); set_triangle_color(psx_gpu, list[0] & 0xFFFFFF); get_vertex_data_xy_uv(0, 2); @@ -383,8 +410,19 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, case 0x3C ... 0x3F: { - set_clut(psx_gpu, list_s16[5]); - set_texture(psx_gpu, list_s16[11]); + u32 i, simplified_count; + set_texture(psx_gpu, list[5] >> 16); + if ((simplified_count = prim_try_simplify_quad_gt(siplified_prim, list))) + { + for (i = 0; i < simplified_count; i++) { + const u32 *list_ = &siplified_prim[i * 4]; + textured_sprite(psx_gpu, list_, list_[3] & 0x3FF, + (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles); + } + break; + } + + set_clut(psx_gpu, list[2] >> 16); get_vertex_data_xy_uv_rgb(0, 0); get_vertex_data_xy_uv_rgb(1, 6); @@ -525,23 +563,12 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); break; } - - case 0x64 ... 
0x67: - { - u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u32 uv = list_s16[4]; - s32 width = list_s16[6] & 0x3FF; - s32 height = list_s16[7] & 0x1FF; - - set_clut(psx_gpu, list_s16[5]); - render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); + case 0x64 ... 0x67: + textured_sprite(psx_gpu, list, list[3] & 0x3FF, (list[3] >> 16) & 0x1FF, + &cpu_cycles_sum, &cpu_cycles); break; - } - + case 0x68 ... 0x6B: { s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); @@ -565,22 +592,11 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); break; } - - case 0x74 ... 0x77: - { - s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u32 uv = list_s16[4]; - s32 width = 8, height = 8; - set_clut(psx_gpu, list_s16[5]); - - render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); + case 0x74 ... 0x77: + textured_sprite(psx_gpu, list, 8, 8, &cpu_cycles_sum, &cpu_cycles); break; - } - + case 0x78 ... 0x7B: { s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); @@ -594,19 +610,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, } case 0x7C ... 
0x7F: - { - s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u32 uv = list_s16[4]; - s32 width = 16, height = 16; - - set_clut(psx_gpu, list_s16[5]); - - render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); + textured_sprite(psx_gpu, list, 16, 16, &cpu_cycles_sum, &cpu_cycles); break; - } #ifdef PCSX case 0x1F: // irq? @@ -1155,12 +1160,31 @@ static void do_sprite_enhanced(psx_gpu_struct *psx_gpu, int x, int y, } #endif +static void textured_sprite_enh(psx_gpu_struct *psx_gpu, const u32 *list, + s32 width, s32 height, u32 *cpu_cycles_sum, u32 *cpu_cycles) +{ + s32 x = sign_extend_11bit(list[1] + psx_gpu->offset_x); + s32 y = sign_extend_11bit((list[1] >> 16) + psx_gpu->offset_y); + s32 width_b = width, height_b = height; + u8 v = (list[2] >> 8) & 0xff; + u8 u = list[2] & 0xff; + + set_clut(psx_gpu, list[2] >> 16); + + render_sprite(psx_gpu, x, y, u, v, &width, &height, list[0] >> 24, list[0]); + gput_sum(*cpu_cycles_sum, *cpu_cycles, gput_sprite(width, height)); + + if (check_enhanced_range(psx_gpu, x, x + width)) + do_sprite_enhanced(psx_gpu, x, y, u, v, width_b, height_b, list[0]); +} + u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command) { vertex_struct vertexes[4] __attribute__((aligned(16))) = {}; u32 current_command = 0, command_length; u32 cpu_cycles_sum = 0, cpu_cycles = *cpu_cycles_last; + u32 siplified_prim[4*4]; u32 *list_start = list; u32 *list_end = list + (size / 4); @@ -1265,8 +1289,19 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, case 0x2C ... 
0x2F: { - set_clut(psx_gpu, list_s16[5]); - set_texture(psx_gpu, list_s16[9]); + u32 i, simplified_count; + set_texture(psx_gpu, list[4] >> 16); + if ((simplified_count = prim_try_simplify_quad_t(siplified_prim, list))) + { + for (i = 0; i < simplified_count; i++) { + const u32 *list_ = &siplified_prim[i * 4]; + textured_sprite_enh(psx_gpu, list_, list_[3] & 0x3FF, + (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles); + } + break; + } + + set_clut(psx_gpu, list[2] >> 16); set_triangle_color(psx_gpu, list[0] & 0xFFFFFF); get_vertex_data_xy_uv(0, 2); @@ -1318,8 +1353,19 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, case 0x3C ... 0x3F: { - set_clut(psx_gpu, list_s16[5]); - set_texture(psx_gpu, list_s16[11]); + u32 i, simplified_count; + set_texture(psx_gpu, list[5] >> 16); + if ((simplified_count = prim_try_simplify_quad_gt(siplified_prim, list))) + { + for (i = 0; i < simplified_count; i++) { + const u32 *list_ = &siplified_prim[i * 4]; + textured_sprite_enh(psx_gpu, list_, list_[3] & 0x3FF, + (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles); + } + break; + } + + set_clut(psx_gpu, list[2] >> 16); get_vertex_data_xy_uv_rgb(0, 0); get_vertex_data_xy_uv_rgb(1, 6); @@ -1475,30 +1521,12 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, } break; } - - case 0x64 ... 0x67: - { - u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u8 u = list_s16[4]; - u8 v = list_s16[4] >> 8; - s32 width = list_s16[6] & 0x3FF; - s32 height = list_s16[7] & 0x1FF; - - set_clut(psx_gpu, list_s16[5]); - render_sprite(psx_gpu, x, y, u, v, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); - - if (check_enhanced_range(psx_gpu, x, x + width)) { - width = list_s16[6] & 0x3FF; - height = list_s16[7] & 0x1FF; - do_sprite_enhanced(psx_gpu, x, y, u, v, width, height, list[0]); - } + case 0x64 ... 
0x67: + textured_sprite_enh(psx_gpu, list, list[3] & 0x3FF, (list[3] >> 16) & 0x1FF, + &cpu_cycles_sum, &cpu_cycles); break; - } - + case 0x68 ... 0x6B: { s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); @@ -1528,26 +1556,11 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, do_sprite_enhanced(psx_gpu, x, y, 0, 0, 8, 8, list[0]); break; } - - case 0x74 ... 0x77: - { - s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u8 u = list_s16[4]; - u8 v = list_s16[4] >> 8; - s32 width = 8, height = 8; - set_clut(psx_gpu, list_s16[5]); - - render_sprite(psx_gpu, x, y, u, v, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); - - if (check_enhanced_range(psx_gpu, x, x + 8)) - do_sprite_enhanced(psx_gpu, x, y, u, v, 8, 8, list[0]); + case 0x74 ... 0x77: + textured_sprite_enh(psx_gpu, list, 8, 8, &cpu_cycles_sum, &cpu_cycles); break; - } - + case 0x78 ... 0x7B: { s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); @@ -1562,25 +1575,10 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, do_sprite_enhanced(psx_gpu, x, y, 0, 0, 16, 16, list[0]); break; } - - case 0x7C ... 0x7F: - { - s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x); - s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y); - u8 u = list_s16[4]; - u8 v = list_s16[4] >> 8; - s32 width = 16, height = 16; - set_clut(psx_gpu, list_s16[5]); - - render_sprite(psx_gpu, x, y, u, v, - &width, &height, current_command, list[0]); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height)); - - if (check_enhanced_range(psx_gpu, x, x + 16)) - do_sprite_enhanced(psx_gpu, x, y, u, v, 16, 16, list[0]); + case 0x7C ... 0x7F: + textured_sprite_enh(psx_gpu, list, 16, 16, &cpu_cycles_sum, &cpu_cycles); break; - } case 0x80 ... 0x9F: // vid -> vid case 0xA0 ... 
0xBF: // sys -> vid diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S index 93269932..a516f08f 100644 --- a/plugins/gpu_unai/gpu_arm.S +++ b/plugins/gpu_unai/gpu_arm.S @@ -7,6 +7,7 @@ #include "arm_features.h" +.syntax unified .text .align 2 @@ -16,6 +17,89 @@ #endif .endm +#ifdef HAVE_ARMV6 + +.macro modulate rp mbr mg t0 t1 t2 + and \t0, \rp, #0x001f + and \t1, \rp, #0x03e0 + and \t2, \rp, #0x7c00 + smulbb \t0, \t0, \mbr @ -> 0000 0000 0000 orrr rrxx xxxx xxxx xxxx + smulbt \t1, \t1, \mg @ -> 0000 000o gggg gxxx xxxx xxxx xxx0 0000 + smulbt \t2, \t2, \mbr @ -> 00ob bbbb xxxx xxxx xxxx xx00 0000 0000 + and \rp, \rp, #0x8000 @ retain msb + usat \t0, #5, \t0, asr #14 + usat \t1, #5, \t1, asr #19 + usat \t2, #5, \t2, asr #24 + orr \rp, \rp, \t0 + orr \rp, \rp, \t1, lsl #5 + orr \rp, \rp, \t2, lsl #10 +.endm + +@ http://www.slack.net/~ant/info/rgb_mixing.html +@ p0 = (p0 + p1) / 2; p1 |= 0x8000 +@ msb of input p0 is assumed to be set +.macro semitrans0 p0 p1 t + eor \t, \p0, \p1 + and \t, \t, #0x0420 + sub \p0, \p0, \t + orr \p1, \p1, #0x8000 + uhadd16 \p0, \p0, \p1 +.endm + +.macro semitrans0p p0 p1 m421 t + eor \t, \p0, \p1 + and \t, \t, \m421 + add \p0, \p0, \p1 + uhsub16 \p0, \p0, \t @ sub because of borrow into hi16 +.endm + +@ p0 - {p1|r,g,b} // p1* - premasked rgb +.macro semitrans2p p0 p1r p1g p1b m1f t0 t1 + and \t0, \p0, \m1f + and \t1, \p0, \m1f, lsl #5 + and \p0, \p0, \m1f, lsl #10 + uqsub16 \t0, \t0, \p1r + uqsub16 \t1, \t1, \p1g + uqsub16 \p0, \p0, \p1b + orr \t0, \t0, \t1 + orr \p0, \p0, \t0 +.endm + +#else + +@ msb of input p0 is assumed to be set +.macro semitrans0 p0 p1 t + eor \t, \p0, \p1 + and \t, \t, #0x0420 + orr \p1, \p1, #0x8000 + sub \p0, \p0, \t + add \p0, \p0, \p1 + orr \p0, \p0, #0x10000 + mov \p0, \p0, lsr #1 +.endm + +.macro semitrans0p p0 p1 m421 t + eor \t, \p0, \p1 + and \t, \t, \m421 + add \p0, \p0, \p1 + sub \p0, \p0, \t + mov \p0, \p0, lsr #1 +.endm + +#endif // HAVE_ARMV6 + +.macro semitrans13p p0 p1 m421 t0 + 
add \t0, \p0, \p1 + eor \p0, \p0, \p1 + and \p0, \p0, \m421 @ low_bits + sub \p0, \t0, \p0 + and \p0, \p0, \m421, lsl #5 @ carries + sub \t0, \t0, \p0 @ modulo + sub \p0, \p0, \p0, lsr #5 @ clamp + orr \p0, \t0, \p0 +.endm + + @ in: r0=dst, r2=pal, r12=0x1e @ trashes r6-r8,lr,flags .macro do_4x_4bpp rs ibase obase @@ -32,13 +116,13 @@ ldrh r8, [r2, r8] ldrh lr, [r2, lr] tst r6, r6 - strneh r6, [r0, #\obase+0] + strhne r6, [r0, #\obase+0] tst r7, r7 - strneh r7, [r0, #\obase+2] + strhne r7, [r0, #\obase+2] tst r8, r8 - strneh r8, [r0, #\obase+4] + strhne r8, [r0, #\obase+4] tst lr, lr - strneh lr, [r0, #\obase+6] + strhne lr, [r0, #\obase+6] .endm @ in: r0=dst, r2=pal, r12=0x1fe @@ -53,25 +137,112 @@ ldrh r8, [r2, r8] ldrh \rs,[r2, \rs] tst r6, r6 - strneh r6, [r0, #0] + strhne r6, [r0, #0] tst r7, r7 - strneh r7, [r0, #2] + strhne r7, [r0, #2] tst r8, r8 - strneh r8, [r0, #4] + strhne r8, [r0, #4] tst \rs,\rs - strneh \rs,[r0, #6] + strhne \rs,[r0, #6] +.endm + + +@ (void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn) +@ see also poly_untex_st_m +.macro tile_driver_st_m name semit +FUNCTION(\name): + .cfi_startproc + stmfd sp!, {r4-r9,lr} + .cfi_def_cfa_offset 4*7 + .cfi_rel_offset lr, 4*6 + ldr r7, [r3, #0x18] @ y0 + ldr r8, [r3, #0x1c] @ y1 +.if \semit != 2 + mov r4, #0x8000 + orr r4, r4, r4, lsl #16 @ mask 8000 + mov r6, #0x420 + orr r6, r6, #1 + orr r6, r6, r6, lsl #16 @ mask 0421 +.endif +.if \semit == 2 + and r4, r1, #0x03e0 + and r5, r1, #0x7c00 + and r1, r1, #0x001f + orr r4, r4, r4, lsl #16 @ premasked g + orr r5, r5, r5, lsl #16 @ premasked b + mov r6, #0x00001f + orr r6, #0x1f0000 @ mask +.elseif \semit == 3 + mov r1, r1, lsr #2 + bic r1, r1, #(0x0c60>>2) +.endif + orr r1, r1, r1, lsl #16 + sub r3, r8, r7 @ h + mov r7, r2 @ save w +0: + ldrh r8, [r0] + pld_ r0, #2048 + tst r0, #2 + beq 1f + sub r2, #1 +.if \semit == 0 + bic r8, r8, r4 + semitrans0p r8, r1, r6, lr +.elseif \semit == 1 || \semit == 3 + bic r8, r8, r4 + semitrans13p r8, r1, r6, lr 
+.elseif \semit == 2 + semitrans2p r8, r1, r4, r5, r6, r9, lr +.endif + strh r8, [r0], #2 +1: + ldr r8, [r0] + pld_ r0, #32 + subs r2, r2, #2 +.if \semit == 0 + bic r8, r8, r4 + semitrans0p r8, r1, r6, lr +.elseif \semit == 1 || \semit == 3 + bic r8, r8, r4 + semitrans13p r8, r1, r6, lr +.elseif \semit == 2 + semitrans2p r8, r1, r4, r5, r6, r9, lr +.endif + strpl r8, [r0], #4 + bpl 1b +2: + tst r2, #1 + strhne r8, [r0], #2 + mov r2, r7 @ w + add r0, r0, #2048 + sub r0, r0, r7, lsl #1 + subs r3, r3, #1 + bgt 0b + + ldmfd sp!, {r4-r9,pc} + .cfi_endproc .endm -.global sprite_4bpp_x16_asm @ (u16 *d, void *s, u16 *pal, int lines) + +tile_driver_st_m tile_driver_st0_asm, 0 +tile_driver_st_m tile_driver_st1_asm, 1 +tile_driver_st_m tile_driver_st3_asm, 3 +#ifdef HAVE_ARMV6 +tile_driver_st_m tile_driver_st2_asm, 2 +#endif + +@ (u16 *d, void *s, u16 *pal, int lines) sprite_4bpp_x16_asm_: - ldr r2, [r3] @ pal - ldr r3, [r3, #0x1c] @ lines -sprite_4bpp_x16_asm: + ldr r12,[r3, #0x18] @ y0 + ldr r2, [r3, #0x04] @ pal + ldr r3, [r3, #0x1c] @ y1 + sub r3, r3, r12 +FUNCTION(sprite_4bpp_x16_asm): .cfi_startproc stmfd sp!, {r4-r8,lr} .cfi_def_cfa_offset 4*6 .cfi_rel_offset lr, 4*5 - mov r12, #0x1e @ empty pixel + mov r12, #0x1e 0: ldmia r1, {r4,r5} @@ -98,15 +269,17 @@ sprite_4bpp_x16_asm: .if \is8bpp orr r12, r12, #0x1f0 @ mask=0x01fe .endif - ldr r4, [r3, #4] @ u0 - ldr r5, [r3, #0x1c] @ h + ldr r4, [r3, #0x08] @ u + ldr r5, [r3, #0x1c] @ v1 + ldr r6, [r3, #0x18] @ v0 and r4, r4, #((8 >> \is8bpp) - 1) + sub r5, r5, r6 sub r5, r5, #1 orr r5, r4, r5, lsl #8 @ ((h-1) << 8) | u0_fraction mov r9, r2 @ saved_w mov r10, r0 @ saved_dst mov r11, r1 @ saved_src - ldr r2, [r3] @ pal + ldr r2, [r3, #0x04] @ pal 11: @ line_loop: pld_ r11, #2048 mov r0, r10 @@ -151,10 +324,10 @@ sprite_4bpp_x16_asm: b 12b @ return from fractional_u .endm -.global sprite_driver_4bpp_asm @ (u16 *d, const void *s, int width, spriteDriverArg) -sprite_driver_4bpp_asm: +@ (u16 *d, const void *s, int width, const 
gpu_unai_inner_t *) +FUNCTION(sprite_driver_4bpp_asm): .cfi_startproc - ldr r12, [r3, #4] @ u0 + ldr r12, [r3, #8] @ u mov r12, r12, lsl #29 orr r12, r12, r2 @ w cmp r12, #16 @@ -175,15 +348,15 @@ sprite_driver_4bpp_asm: ldrh r7, [r2, r7] add r0, r0, #2 tst r7, r7 - strneh r7, [r0, #-2] + strhne r7, [r0, #-2] subs r8, r8, #1 bgt 0b sprite_driver_part3 .cfi_endproc -.global sprite_driver_8bpp_asm @ (u16 *d, const void *s, int width, spriteDriverArg) -sprite_driver_8bpp_asm: +@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *) +FUNCTION(sprite_driver_8bpp_asm): .cfi_startproc sprite_driver_part1 1 0: @@ -200,11 +373,425 @@ sprite_driver_8bpp_asm: ldrh r7, [r2, r7] add r0, r0, #2 tst r7, r7 - strneh r7, [r0, #-2] + strhne r7, [r0, #-2] subs r8, r8, #1 bgt 0b sprite_driver_part3 .cfi_endproc +@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *) +.macro sprite_driver_l_st name bpp light semit +FUNCTION(\name): + .cfi_startproc + stmfd sp!, {r4-r11,lr} + .cfi_def_cfa_offset 4*9 @ 9 words pushed above (r4-r11,lr) + .cfi_rel_offset lr, 4*8 @ lr is the last (highest) pushed word + ldr r5, [r3, #0x18] @ y0 + ldr r7, [r3, #0x1c] @ y1 + ldr r8, [r3, #0x20] @ rbg5 + mov r6, r2 @ saved_w + ldr r2, [r3, #0x04] @ pal + ldr r10,[r3, #0x08] @ u + ldr r11,[r3, #0x10] @ u_msk + sub r5, r7, r5 @ h + mov r7, r8, lsl #(8+2) @ 0bbb bb00 0ggg gg00 0rrr rr00 0000 0000 + mov r8, r8, lsl #(16+2)@ 0ggg gg00 ... 
+ mov r3, r11,lsr #10 + orr r6, r3, r6, lsl #16 @ (w << 16) | u_mask + mov r3, r6 + and r10,r10,r6 + +3: @ line_loop: +.if \bpp == 4 + add r9, r1, r10, lsr #1 +.elseif \bpp == 8 + add r9, r1, r10 + pld_ r9, #2048 +.endif +0: +.if \bpp == 4 + ldrb r4, [r1, r10, lsr #1] +.elseif \bpp == 8 + ldrb r4, [r1, r10] +.endif + subs r3, r3, #1<<16 + bmi 1f +.if \bpp == 4 + tst r10, #1 + movne r4, r4, lsr #3 + addeq r4, r4, r4 + and r4, r4, #0x1e +.elseif \bpp == 8 + add r4, r4, r4 @ <<= 1 +.endif + ldrsh r12,[r2, r4] + add r10,r10,#1 + and r10,r10,r6 + add r0, r0, #2 + tst r12,r12 + beq 0b +.if \light && \semit != 1 + modulate r12, r7, r8, r4, r9, lr +.endif +.if \semit == 0 + ldrhmi lr, [r0, #-2] + strhpl r12,[r0, #-2] + bpl 0b + semitrans0 r12, lr, r9 +.elseif \light && \semit == 1 + and r4, r12, #0x001f + and r9, r12, #0x03e0 + and r12, r12, #0x7c00 + ldrhmi r11, [r0, #-2] + smulbb r4, r4, r7 @ -> 0000 0000 0000 orrr rrxx xxxx xxxx xxxx + smulbt r9, r9, r8 @ -> 0000 000o gggg gxxx xxxx xxxx xxx0 0000 + smulbt r12, r12, r7 @ -> 00ob bbbb xxxx xxxx xxxx xx00 0000 0000 + and r8, r11, #0x001f + and lr, r11, #0x03e0 + and r11, r11, #0x7c00 + addmi r4, r4, r8, lsl #14 + addmi r9, r9, lr, lsl #14 + addmi r12, r12, r11, lsl #14 + usat r4, #5, r4, asr #14 + usat r9, #5, r9, asr #19 + usat r12, #5, r12, asr #24 + orrmi r4, r4, #0x8000 + orr r4, r4, r9, lsl #5 + orr r12, r4, r12, lsl #10 + mov r8, r7, lsl #8 @ restore r8 +.endif + strh r12,[r0, #-2] + b 0b +1: + add r0, r0, #2048 + add r1, r1, #2048 + sub r0, r0, r6, lsr #15 @ dst + sub r10,r10,r6, lsr #16 @ u + mov r3, r6 @ (w << 16) | u_mask + and r10,r6, r10 + subs r5, r5, #1 + and r10,r10,#0xff + bgt 3b @ line_loop + + ldmfd sp!, {r4-r11,pc} + .cfi_endproc +.endm + +sprite_driver_l_st sprite_driver_4bpp_l0_std_asm, 4, 0, -1 +sprite_driver_l_st sprite_driver_4bpp_l0_st0_asm, 4, 0, 0 +sprite_driver_l_st sprite_driver_8bpp_l0_std_asm, 8, 0, -1 +sprite_driver_l_st sprite_driver_8bpp_l0_st0_asm, 8, 0, 0 + +#ifdef HAVE_ARMV6 + 
+sprite_driver_l_st sprite_driver_4bpp_l1_std_asm, 4, 1, -1 +sprite_driver_l_st sprite_driver_4bpp_l1_st0_asm, 4, 1, 0 +sprite_driver_l_st sprite_driver_4bpp_l1_st1_asm, 4, 1, 1 +sprite_driver_l_st sprite_driver_8bpp_l1_std_asm, 8, 1, -1 +sprite_driver_l_st sprite_driver_8bpp_l1_st0_asm, 8, 1, 0 +sprite_driver_l_st sprite_driver_8bpp_l1_st1_asm, 8, 1, 1 + +#endif // HAVE_ARMV6 + + +@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *) +FUNCTION(sprite_driver_16bpp_asm): + .cfi_startproc + stmfd sp!, {r4-r6,lr} + .cfi_def_cfa_offset 4*4 + .cfi_rel_offset lr, 4*3 + ldr r4, [r3, #0x1c] @ v1 + ldr r5, [r3, #0x18] @ v0 + mov r12, #0x00ff + orr r12, r12, #0xff00 @ mask + mov r6, r2 @ saved_w + sub r5, r4, r5 + sub r5, r5, #1 @ h-1 +3: @ line_loop: + pld_ r1, #2048 + mov r2, r6 @ w + tst r1, #2 + beq 0f +2: @ 1pix: + ldrh lr, [r1], #2 + add r0, r0, #2 + sub r2, r2, #1 + tst lr, lr + strhne lr, [r0, #-2] +0: + subs r2, r2, #4 + bmi 1f +0: + ldmia r1!, {r3,r4} + add r0, r0, #2*4 + pld_ r1, #24 + tst r3, r12 + strhne r3, [r0, #-8] + movs lr, r3, lsr #16 + strhne lr, [r0, #-6] + tst r4, r12 + strhne r4, [r0, #-4] + movs lr, r4, lsr #16 + strhne lr, [r0, #-2] + subs r2, r2, #4 + bpl 0b +1: + adds r2, r2, #4 + bne 2b @ 1pix + add r0, r0, #2048 + add r1, r1, #2048 + sub r0, r0, r6, lsl #1 @ dst + sub r1, r1, r6, lsl #1 + subs r5, r5, #1 + bpl 3b @ line_loop + + ldmfd sp!, {r4-r6,pc} + .cfi_endproc + + +@ (void *d, const gpu_unai_inner_t *inn, int count) +@ see also tile_driver_st_m +.macro poly_untex_st_m name semit +FUNCTION(\name): + .cfi_startproc + ldrh r1, [r1, #0x38] @ rgb + stmfd sp!, {r4-r7,lr} + .cfi_def_cfa_offset 4*5 + .cfi_rel_offset lr, 4*4 +.if \semit != 2 + mov r4, #0x8000 + orr r4, r4, r4, lsl #16 @ mask 8000 + mov r6, #0x420 + orr r6, r6, #1 + orr r6, r6, r6, lsl #16 @ mask 0421 +.endif +.if \semit == 2 + and r4, r1, #0x03e0 + and r5, r1, #0x7c00 + and r1, r1, #0x001f + orr r4, r4, r4, lsl #16 @ premasked g + orr r5, r5, r5, lsl #16 @ premasked b + mov 
r6, #0x00001f + orr r6, #0x1f0000 @ mask +.elseif \semit == 3 + mov r1, r1, lsr #2 + bic r1, r1, #(0x0c60>>2) +.endif + orr r1, r1, r1, lsl #16 +0: + ldrh r3, [r0] + pld_ r0, #2048 + tst r0, #2 + beq 1f + sub r2, #1 +.if \semit == 0 + bic r3, r3, r4 + semitrans0p r3, r1, r6, lr +.elseif \semit == 1 || \semit == 3 + bic r3, r3, r4 + semitrans13p r3, r1, r6, lr +.elseif \semit == 2 + semitrans2p r3, r1, r4, r5, r6, r7, lr +.endif + strh r3, [r0], #2 +1: + ldr r3, [r0] + pld_ r0, #32 + subs r2, r2, #2 +.if \semit == 0 + bic r3, r3, r4 + semitrans0p r3, r1, r6, lr +.elseif \semit == 1 || \semit == 3 + bic r3, r3, r4 + semitrans13p r3, r1, r6, lr +.elseif \semit == 2 + semitrans2p r3, r1, r4, r5, r6, r7, lr +.endif + strpl r3, [r0], #4 + bpl 1b +2: + tst r2, #1 + strhne r3, [r0], #2 + + ldmfd sp!, {r4-r7,pc} + .cfi_endproc +.endm + +poly_untex_st_m poly_untex_st0_asm, 0 +poly_untex_st_m poly_untex_st1_asm, 1 +poly_untex_st_m poly_untex_st3_asm, 3 +#ifdef HAVE_ARMV6 +poly_untex_st_m poly_untex_st2_asm, 2 +#endif + + +.macro poly_4_8bpp_asm_m name bpp light semit +FUNCTION(\name): @ (void *d, const gpu_unai_inner_t *inn, int count) + .cfi_startproc + stmfd sp!, {r4-r11,lr} + .cfi_def_cfa_offset 4*9 + .cfi_rel_offset lr, 4*8 + add r12, r1, #4 + ldmia r12, {r3, r4, r7, r12, lr} @ clut, u, v, u_msk, v_msk + ldr r5, [r1, #0x18] @ u_inc +.if \light + ldr r10,[r1, #0x24] @ rbg +.endif + mov r6, r12 @ u_msk + ldr r12,[r1, #0x1c] @ v_inc +.if \light + mov r10,r10,lsl #7 @ 0bbb bbbb 0ggg gggg 0rrr rrrr r000 0000 + bic r10,r10,#1<<23 + bic r10,r10,#1<<15 + mov r11,r10,lsl #8 @ 0ggg gggg ... 
+.endif + and r4, r4, r6 + and lr, lr, r7 @ v_msk & v + and lr, lr, #0xff<<10 + tst r12,r12 + bne v_\name + ldr r1, [r1] @ src + mov r7, r4, lsr #(13 - (\bpp / 8 * 3)) + add r1, r1, lr, lsl #1 +#ifdef HAVE_ARMV6 + add r12,r1, r7, lsl #(2 - (\bpp / 8 * 2)) + pld_ r12,#2048 @ next line +#endif +0: +.if \light || \semit >= 0 + mov r7, r4, lsr #(13 - (\bpp / 8 * 3)) + subs r2, r2, #1 + bmi 1f +.endif +.if \bpp == 4 + ldr lr, [r1, r7, lsl #2] + lsr r12,r4, #8 + and r12,r12,#0x1c + sub r12,r12,#1 + mov r12,lr, ror r12 + add r4, r4, r5 + and r12,r12,#0x1e +.else + ldrb r12,[r1, r7] + add r4, r4, r5 + add r12,r12,r12 +.endif + and r4, r4, r6 + ldrsh r12,[r3, r12] + add r0, r0, #2 +.if !\light && \semit < 0 + mov r7, r4, lsr #(13 - (\bpp / 8 * 3)) + tst r12,r12 + strhne r12,[r0, #-2] + subs r2, r2, #1 + bgt 0b + @ end +.else + tst r12,r12 + beq 0b +.if \light && \semit != 1 + modulate r12, r10, r11, r7, r8, lr +.endif +.if \semit == 0 + ldrhmi r7, [r0, #-2] + strhpl r12,[r0, #-2] + bpl 0b + semitrans0 r12, r7, lr +.endif + strh r12,[r0, #-2] + b 0b +.endif @ \light || \semit >= 0 +1: + ldmfd sp!, {r4-r11,pc} + +v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked +.if \light || \semit >= 0 + sub sp, sp, #4*2 + stmia sp, {r5,r6} + .cfi_def_cfa_offset 4*(9+2) + .cfi_rel_offset lr, 4*(8+2) +.endif + ldr r9, [r1, #0x14] @ v_msk + ldr r1, [r1] @ src + mov r8, r12 @ v_inc + and r9, r9, #0xff<<10 @ v_msk_final +.if !\light && \semit < 0 + and lr, r7, r9 + mov r12,r4, lsr #(13 - (\bpp / 8 * 3)) + add lr, r1, lr, lsl #1 +.endif +0: +.if \light || \semit >= 0 + and lr, r7, r9 + mov r12,r4, lsr #(13 - (\bpp / 8 * 3)) + add lr, r1, lr, lsl #1 + subs r2, r2, #1 + bmi 1f +.endif +.if \bpp == 4 + ldr lr, [lr, r12, lsl #2] + lsr r12,r4, #8 + and r12,r12,#0x1c + sub r12,r12,#1 + mov r12,lr, ror r12 + add r4, r4, r5 + and r12,r12,#0x1e +.else + ldrb r12,[lr, r12] + add r4, r4, r5 + add r12,r12,r12 +.endif + and r4, r4, r6 + ldrsh r12,[r3, r12] + add r0, r0, #2 + add r7, r7, r8 
+.if !\light && \semit < 0 + and lr, r7, r9 + tst r12,r12 + add lr, r1, lr, lsl #1 + strhne r12,[r0, #-2] + mov r12,r4, lsr #(13 - (\bpp / 8 * 3)) + subs r2, r2, #1 + bgt 0b + @ end +.else + tst r12,r12 + beq 0b +.if \light && \semit != 1 + modulate r12, r10, r11, r5, r6, lr +.endif +.if \semit == 0 + ldrhmi r6, [r0, #-2] + strhpl r12,[r0, #-2] + ldmiapl sp, {r5,r6} + bpl 0b + semitrans0 r12, r6, lr +.endif + strh r12,[r0, #-2] + ldmia sp, {r5,r6} + b 0b +.endif @ \light || \semit >= 0 +1: +.if \light || \semit >= 0 + add sp, sp, #4*2 +.endif + ldmfd sp!, {r4-r11,pc} + .cfi_endproc +.endm + +poly_4_8bpp_asm_m poly_4bpp_asm, 4, 0, -1 +poly_4_8bpp_asm_m poly_4bpp_l0_st0_asm, 4, 0, 0 +poly_4_8bpp_asm_m poly_8bpp_asm, 8, 0, -1 +poly_4_8bpp_asm_m poly_8bpp_l0_st0_asm, 8, 0, 0 + +#ifdef HAVE_ARMV6 + +poly_4_8bpp_asm_m poly_4bpp_l1_std_asm, 4, 1, -1 +poly_4_8bpp_asm_m poly_4bpp_l1_st0_asm, 4, 1, 0 +poly_4_8bpp_asm_m poly_8bpp_l1_std_asm, 8, 1, -1 +poly_4_8bpp_asm_m poly_8bpp_l1_st0_asm, 8, 1, 0 + +#endif // HAVE_ARMV6 + @ vim:filetype=armasm diff --git a/plugins/gpu_unai/gpu_arm.h b/plugins/gpu_unai/gpu_arm.h index 2329c46c..d69490ff 100644 --- a/plugins/gpu_unai/gpu_arm.h +++ b/plugins/gpu_unai/gpu_arm.h @@ -5,14 +5,62 @@ extern "C" { #endif -struct spriteDriverArg; +struct gpu_unai_inner_t; + +void tile_driver_st0_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn); +void tile_driver_st1_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn); +void tile_driver_st3_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn); void sprite_driver_4bpp_asm(void *pPixel, const u8 *pTxt_base, - u32 count, const struct spriteDriverArg *arg); + u32 count, const struct gpu_unai_inner_t *inn); void sprite_driver_8bpp_asm(void *pPixel, const u8 *pTxt_base, - u32 count, const struct spriteDriverArg *arg); + u32 count, const struct gpu_unai_inner_t *inn); +void sprite_driver_16bpp_asm(void *pPixel, const void *pTxt_base, + u32 count, const struct 
gpu_unai_inner_t *inn); void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines); +void sprite_driver_4bpp_l0_std_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct gpu_unai_inner_t *inn); +void sprite_driver_4bpp_l0_st0_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct gpu_unai_inner_t *inn); +void sprite_driver_8bpp_l0_std_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct gpu_unai_inner_t *inn); +void sprite_driver_8bpp_l0_st0_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct gpu_unai_inner_t *inn); + +void poly_untex_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_untex_st1_asm(void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_untex_st3_asm(void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_4bpp_asm (void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_4bpp_l0_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_8bpp_asm (void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_8bpp_l0_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count); + +#ifdef HAVE_ARMV6 + +void tile_driver_st2_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn); + +void sprite_driver_4bpp_l1_std_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct gpu_unai_inner_t *inn); +void sprite_driver_4bpp_l1_st0_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct gpu_unai_inner_t *inn); +void sprite_driver_4bpp_l1_st1_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct gpu_unai_inner_t *inn); +void sprite_driver_8bpp_l1_std_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct gpu_unai_inner_t *inn); +void sprite_driver_8bpp_l1_st0_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct gpu_unai_inner_t *inn); +void sprite_driver_8bpp_l1_st1_asm(void *pPixel, const u8 *pTxt_base, + u32 count, const struct gpu_unai_inner_t *inn); + +void 
poly_untex_st2_asm(void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_4bpp_l1_std_asm(void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_4bpp_l1_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_8bpp_l1_std_asm(void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_8bpp_l1_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count); + +#endif // HAVE_ARMV6 + #ifdef __cplusplus } #endif diff --git a/plugins/gpu_unai/gpu_command.h b/plugins/gpu_unai/gpu_command.h index cf6b62b4..adede2b5 100644 --- a/plugins/gpu_unai/gpu_command.h +++ b/plugins/gpu_unai/gpu_command.h @@ -45,13 +45,13 @@ void gpuSetTexture(u16 tpage) gpu_unai.BLEND_MODE = ((tpage>>5) & 3) << 3; gpu_unai.TEXT_MODE = (tmode + 1) << 5; // gpu_unai.TEXT_MODE should be values 1..3, so add one - gpu_unai.TBA = &gpu_unai.vram[FRAME_OFFSET(tx, ty)]; + gpu_unai.inn.TBA = &gpu_unai.vram[FRAME_OFFSET(tx, ty)]; } /////////////////////////////////////////////////////////////////////////////// INLINE void gpuSetCLUT(u16 clut) { - gpu_unai.CBA = &gpu_unai.vram[(clut & 0x7FFF) << 4]; + gpu_unai.inn.CBA = &gpu_unai.vram[(clut & 0x7FFF) << 4]; } #ifdef ENABLE_GPU_NULL_SUPPORT diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h index a80c3a3a..3281d0fa 100644 --- a/plugins/gpu_unai/gpu_inner.h +++ b/plugins/gpu_unai/gpu_inner.h @@ -55,7 +55,10 @@ #include "gpu_inner_quantization.h" #include "gpu_inner_light.h" +#include "arm_features.h" +#include "compiler_features.h" #ifdef __arm__ +#include "gpu_arm.h" #include "gpu_inner_blend_arm.h" #include "gpu_inner_light_arm.h" #define gpuBlending gpuBlendingARM @@ -276,7 +279,7 @@ const PSD gpuPixelSpanDrivers[64] = // GPU Tiles innerloops generator template -static void gpuTileSpanFn(le16_t *pDst, u32 count, u16 data) +static inline void gpuTileSpanFn(le16_t *pDst, u16 data, u32 count) { le16_t ldata; @@ -328,7 +331,42 @@ static void gpuTileSpanFn(le16_t *pDst, u32 count, u16 
data) } } -static void TileNULL(le16_t *pDst, u32 count, u16 data) +template +static noinline void gpuTileDriverFn(le16_t *pDst, u16 data, u32 count, + const gpu_unai_inner_t &inn) +{ + const int li=gpu_unai.inn.ilace_mask; + const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0); + const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1); + const int y1 = inn.y1; + int y0 = inn.y0; + + for (; y0 < y1; ++y0) { + if (!(y0&li) && (y0&pi) != pif) + gpuTileSpanFn(pDst, data, count); + pDst += FRAME_WIDTH; + } +} + +#ifdef __arm__ + +template +static void TileAsm(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn) +{ + switch (CF) { + case 0x02: tile_driver_st0_asm(pDst, data, count, &inn); return; + case 0x0a: tile_driver_st1_asm(pDst, data, count, &inn); return; + case 0x1a: tile_driver_st3_asm(pDst, data, count, &inn); return; +#ifdef HAVE_ARMV6 + case 0x12: tile_driver_st2_asm(pDst, data, count, &inn); return; +#endif + } + gpuTileDriverFn(pDst, data, count, inn); +} + +#endif + +static void TileNULL(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn) { #ifdef ENABLE_GPU_LOG_SUPPORT fprintf(stdout,"TileNULL()\n"); @@ -337,42 +375,47 @@ static void TileNULL(le16_t *pDst, u32 count, u16 data) /////////////////////////////////////////////////////////////////////////////// // Tiles innerloops driver -typedef void (*PT)(le16_t *pDst, u32 count, u16 data); +typedef void (*PT)(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn); // Template instantiation helper macros -#define TI(cf) gpuTileSpanFn<(cf)> +#define TI(cf) gpuTileDriverFn<(cf)> #define TN TileNULL +#ifdef __arm__ +#define TA(cf) TileAsm<(cf)> +#else +#define TA(cf) TI(cf) +#endif +#ifdef HAVE_ARMV6 +#define TA6(cf) TileAsm<(cf)> +#else +#define TA6(cf) TI(cf) +#endif #define TIBLOCK(ub) \ - TI((ub)|0x00), TI((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \ - TN, TI((ub)|0x0a), TN, TI((ub)|0x0e), \ - TN, 
TI((ub)|0x12), TN, TI((ub)|0x16), \ - TN, TI((ub)|0x1a), TN, TI((ub)|0x1e) + TI((ub)|0x00), TA6((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \ + TN, TA ((ub)|0x0a), TN, TI((ub)|0x0e), \ + TN, TA6((ub)|0x12), TN, TI((ub)|0x16), \ + TN, TA ((ub)|0x1a), TN, TI((ub)|0x1e) -const PT gpuTileSpanDrivers[32] = { +const PT gpuTileDrivers[32] = { TIBLOCK(0<<8), TIBLOCK(1<<8) }; #undef TI #undef TN +#undef TA +#undef TA6 #undef TIBLOCK /////////////////////////////////////////////////////////////////////////////// // GPU Sprites innerloops generator -// warning: gpu_arm.S asm uses this, update it if you change this -typedef struct spriteDriverArg { - const le16_t *CBA; // 00 - u32 u0, v0, u0_mask, v0_mask; // 04 08 0c 10 - s32 y0, y1, lines, li; // 14 -} spriteDriverArg; - typedef void (*PS)(le16_t *pPixel, u32 count, const u8 *pTxt, - const spriteDriverArg *arg); + const gpu_unai_inner_t &inn); template -static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base, - const spriteDriverArg *arg) +static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base, + const gpu_unai_inner_t &inn) { // Blend func can save an operation if it knows uSrc MSB is unset. // Untextured prims can always skip (source color always comes with MSB=0). 
@@ -381,25 +424,26 @@ static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base, uint_fast16_t uSrc, uDst, srcMSB; bool should_blend; - u32 u0_mask = arg->u0_mask; + u32 u0_mask = inn.u_msk >> 10; u8 r5, g5, b5; if (CF_LIGHT) { - r5 = gpu_unai.r5; - g5 = gpu_unai.g5; - b5 = gpu_unai.b5; + r5 = inn.r5; + g5 = inn.g5; + b5 = inn.b5; } + const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = inn.CBA; + const u32 v0_mask = inn.v_msk >> 10; + s32 y0 = inn.y0, y1 = inn.y1, li = inn.ilace_mask; + u32 u0_ = inn.u, v0 = inn.v; + if (CF_TEXTMODE==3) { - // Texture is accessed byte-wise, so adjust mask if 16bpp + // Texture is accessed byte-wise, so adjust to 16bpp + u0_ <<= 1; u0_mask <<= 1; } - const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = arg->CBA; - const u32 v0_mask = arg->v0_mask; - s32 y0 = arg->y0, y1 = arg->y1, li = arg->li; - u32 u0_ = arg->u0, v0 = arg->v0; - for (; y0 < y1; ++y0, pPixel += FRAME_WIDTH, ++v0) { if (y0 & li) continue; @@ -450,41 +494,46 @@ static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base, } #ifdef __arm__ -#include "gpu_arm.h" -static void Sprite4bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base, - const spriteDriverArg *arg) +template +static void SpriteMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base, + const gpu_unai_inner_t &inn) { #if 1 - s32 lines = arg->lines; - u32 u1m = arg->u0 + count - 1, v1m = arg->v0 + lines - 1; - if (u1m == (u1m & arg->u0_mask) && v1m == (v1m & arg->v0_mask)) { - pTxt_base += arg->u0 / 2 + arg->v0 * 2048; - sprite_driver_4bpp_asm(pPixel, pTxt_base, count, arg); - } - else + s32 lines = inn.y1 - inn.y0; + u32 u1m = inn.u + count - 1, v1m = inn.v + lines - 1; + if (u1m == (u1m & (inn.u_msk >> 10)) && v1m == (v1m & (inn.v_msk >> 10))) { + const u8 *pTxt = pTxt_base + inn.v * 2048; + switch (CF) { + case 0x20: sprite_driver_4bpp_asm (pPixel, pTxt + inn.u / 2, count, &inn); return; + case 0x40: sprite_driver_8bpp_asm (pPixel, pTxt + inn.u, count, &inn); return; + case 
0x60: sprite_driver_16bpp_asm(pPixel, pTxt + inn.u * 2, count, &inn); return; + } + } + if (v1m == (v1m & (inn.v_msk >> 10))) { + const u8 *pTxt = pTxt_base + inn.v * 2048; + switch (CF) { + case 0x20: sprite_driver_4bpp_l0_std_asm(pPixel, pTxt, count, &inn); return; + case 0x22: sprite_driver_4bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return; + case 0x40: sprite_driver_8bpp_l0_std_asm(pPixel, pTxt, count, &inn); return; + case 0x42: sprite_driver_8bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return; +#ifdef HAVE_ARMV6 + case 0x21: sprite_driver_4bpp_l1_std_asm(pPixel, pTxt, count, &inn); return; + case 0x23: sprite_driver_4bpp_l1_st0_asm(pPixel, pTxt, count, &inn); return; + case 0x2b: sprite_driver_4bpp_l1_st1_asm(pPixel, pTxt, count, &inn); return; + case 0x41: sprite_driver_8bpp_l1_std_asm(pPixel, pTxt, count, &inn); return; + case 0x43: sprite_driver_8bpp_l1_st0_asm(pPixel, pTxt, count, &inn); return; + case 0x4b: sprite_driver_8bpp_l1_st1_asm(pPixel, pTxt, count, &inn); return; #endif - gpuSpriteDriverFn<0x20>(pPixel, count, pTxt_base, arg); -} - -static void Sprite8bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base, - const spriteDriverArg *arg) -{ -#if 1 - s32 lines = arg->lines; - u32 u1m = arg->u0 + count - 1, v1m = arg->v0 + lines - 1; - if (u1m == (u1m & arg->u0_mask) && v1m == (v1m & arg->v0_mask)) { - pTxt_base += arg->u0 + arg->v0 * 2048; - sprite_driver_8bpp_asm(pPixel, pTxt_base, count, arg); - } - else + } + } #endif - gpuSpriteDriverFn<0x40>(pPixel, count, pTxt_base, arg); + gpuSpriteDriverFn(pPixel, count, pTxt_base, inn); } #endif // __arm__ static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base, - const spriteDriverArg *arg) + const gpu_unai_inner_t &inn) { #ifdef ENABLE_GPU_LOG_SUPPORT fprintf(stdout,"SpriteNULL()\n"); @@ -500,29 +549,32 @@ static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base, #define TI(cf) gpuSpriteDriverFn<(cf)> #define TN SpriteNULL #ifdef __arm__ -#define TA4(cf) Sprite4bppMaybeAsm 
-#define TA8(cf) Sprite8bppMaybeAsm +#define TA(cf) SpriteMaybeAsm<(cf)> +#else +#define TA(cf) TI(cf) +#endif +#ifdef HAVE_ARMV6 +#define TA6(cf) SpriteMaybeAsm<(cf)> #else -#define TA4(cf) TI(cf) -#define TA8(cf) TI(cf) +#define TA6(cf) TI(cf) #endif #define TIBLOCK(ub) \ - TN, TN, TN, TN, TN, TN, TN, TN, \ - TN, TN, TN, TN, TN, TN, TN, TN, \ - TN, TN, TN, TN, TN, TN, TN, TN, \ - TN, TN, TN, TN, TN, TN, TN, TN, \ - TA4((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ - TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \ - TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \ - TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \ - TA8((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \ - TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \ - TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \ - TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \ - TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \ - TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \ - TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \ - TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f) + TN, TN, TN, TN, TN, TN, TN, TN, \ + TN, TN, TN, TN, TN, TN, TN, TN, \ + TN, TN, TN, TN, TN, TN, TN, TN, \ + TN, TN, TN, TN, TN, TN, TN, TN, \ + TA((ub)|0x20), TA6((ub)|0x21),TA6((ub)|0x22),TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ + TN, TN, TI((ub)|0x2a), TA6((ub)|0x2b),TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \ + TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \ + TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, 
TI((ub)|0x3e), TI((ub)|0x3f), \ + TA((ub)|0x40), TA6((ub)|0x41),TA6((ub)|0x42),TA6((ub)|0x43),TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \ + TN, TN, TI((ub)|0x4a), TA6((ub)|0x4b),TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \ + TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \ + TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \ + TA((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \ + TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \ + TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \ + TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f) const PS gpuSpriteDrivers[256] = { TIBLOCK(0<<8), TIBLOCK(1<<8) @@ -531,6 +583,8 @@ const PS gpuSpriteDrivers[256] = { #undef TI #undef TN #undef TIBLOCK +#undef TA +#undef TA6 /////////////////////////////////////////////////////////////////////////////// // GPU Polygon innerloops generator @@ -554,7 +608,7 @@ const PS gpuSpriteDrivers[256] = { // relevant blend/light headers. // (see README_senquack.txt) template -static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) +static noinline void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) { // Blend func can save an operation if it knows uSrc MSB is unset. // Untextured prims can always skip this (src color MSB is always 0). @@ -562,14 +616,14 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) const bool skip_uSrc_mask = MSB_PRESERVED ? 
(!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT; bool should_blend; - u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.blit_mask; + u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.inn.blit_mask; if (!CF_TEXTMODE) { if (!CF_GOURAUD) { // UNTEXTURED, NO GOURAUD - const u16 pix15 = gpu_unai.PixelData; + const u16 pix15 = gpu_unai.inn.PixelData; do { uint_fast16_t uSrc, uDst; @@ -596,8 +650,8 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) else { // UNTEXTURED, GOURAUD - gcol_t l_gCol = gpu_unai.gCol; - gcol_t l_gInc = gpu_unai.gInc; + gcol_t l_gCol = gpu_unai.inn.gCol; + gcol_t l_gInc = gpu_unai.inn.gInc; do { uint_fast16_t uDst, uSrc; @@ -643,12 +697,15 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) //senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into // one 32-bit unsigned int, but this proved to lose too much accuracy // (pixel drouputs noticeable in NFS3 sky), so now are separate vars. - u32 l_u_msk = gpu_unai.u_msk; u32 l_v_msk = gpu_unai.v_msk; - u32 l_u = gpu_unai.u & l_u_msk; u32 l_v = gpu_unai.v & l_v_msk; - s32 l_u_inc = gpu_unai.u_inc; s32 l_v_inc = gpu_unai.v_inc; + u32 l_u_msk = gpu_unai.inn.u_msk; u32 l_v_msk = gpu_unai.inn.v_msk; + u32 l_u = gpu_unai.inn.u & l_u_msk; u32 l_v = gpu_unai.inn.v & l_v_msk; + s32 l_u_inc = gpu_unai.inn.u_inc; s32 l_v_inc = gpu_unai.inn.v_inc; + l_v <<= 1; + l_v_inc <<= 1; + l_v_msk = (l_v_msk & (0xff<<10)) << 1; - const le16_t* TBA_ = gpu_unai.TBA; - const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA; + const le16_t* TBA_ = gpu_unai.inn.TBA; + const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.inn.CBA; u8 r5, g5, b5; u8 r8, g8, b8; @@ -657,17 +714,17 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) if (CF_LIGHT) { if (CF_GOURAUD) { - l_gInc = gpu_unai.gInc; - l_gCol = gpu_unai.gCol; + l_gInc = gpu_unai.inn.gInc; + l_gCol = gpu_unai.inn.gCol; } else { if (CF_DITHER) { - r8 = gpu_unai.r8; - g8 = gpu_unai.g8; - 
b8 = gpu_unai.b8; + r8 = gpu_unai.inn.r8; + g8 = gpu_unai.inn.g8; + b8 = gpu_unai.inn.b8; } else { - r5 = gpu_unai.r5; - g5 = gpu_unai.g5; - b5 = gpu_unai.b5; + r5 = gpu_unai.inn.r5; + g5 = gpu_unai.inn.g5; + b5 = gpu_unai.inn.b5; } } } @@ -682,17 +739,19 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) // (UNAI originally used 16.16) if (CF_TEXTMODE==1) { // 4bpp (CLUT) u32 tu=(l_u>>10); - u32 tv=(l_v<<1)&(0xff<<11); + u32 tv=l_v&l_v_msk; u8 rgb=((u8*)TBA_)[tv+(tu>>1)]; uSrc=le16_to_u16(CBA_[(rgb>>((tu&1)<<2))&0xf]); if (!uSrc) goto endpolytext; } if (CF_TEXTMODE==2) { // 8bpp (CLUT) - uSrc = le16_to_u16(CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])]); + u32 tv=l_v&l_v_msk; + uSrc = le16_to_u16(CBA_[((u8*)TBA_)[tv+(l_u>>10)]]); if (!uSrc) goto endpolytext; } if (CF_TEXTMODE==3) { // 16bpp - uSrc = le16_to_u16(TBA_[(l_u>>10)+((l_v)&(0xff<<10))]); + u32 tv=(l_v&l_v_msk)>>1; + uSrc = le16_to_u16(TBA_[tv+(l_u>>10)]); if (!uSrc) goto endpolytext; } @@ -736,7 +795,7 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) endpolytext: pDst++; l_u = (l_u + l_u_inc) & l_u_msk; - l_v = (l_v + l_v_inc) & l_v_msk; + l_v += l_v_inc; if (CF_LIGHT && CF_GOURAUD) l_gCol.raw += l_gInc.raw; } @@ -744,6 +803,30 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) } } +#ifdef __arm__ +template +static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) +{ + switch (CF) { + case 0x02: poly_untex_st0_asm (pDst, &gpu_unai.inn, count); break; + case 0x0a: poly_untex_st1_asm (pDst, &gpu_unai.inn, count); break; + case 0x1a: poly_untex_st3_asm (pDst, &gpu_unai.inn, count); break; + case 0x20: poly_4bpp_asm (pDst, &gpu_unai.inn, count); break; + case 0x22: poly_4bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break; + case 0x40: poly_8bpp_asm (pDst, &gpu_unai.inn, count); break; + case 0x42: poly_8bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break; +#ifdef HAVE_ARMV6 + case 0x12: 
poly_untex_st2_asm (pDst, &gpu_unai.inn, count); break; + case 0x21: poly_4bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break; + case 0x23: poly_4bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break; + case 0x41: poly_8bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break; + case 0x43: poly_8bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break; +#endif + default: gpuPolySpanFn(gpu_unai, pDst, count); + } +} +#endif + static void PolyNULL(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) { #ifdef ENABLE_GPU_LOG_SUPPORT @@ -758,16 +841,26 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count); // Template instantiation helper macros #define TI(cf) gpuPolySpanFn<(cf)> #define TN PolyNULL +#ifdef __arm__ +#define TA(cf) PolySpanMaybeAsm<(cf)> +#else +#define TA(cf) TI(cf) +#endif +#ifdef HAVE_ARMV6 +#define TA6(cf) PolySpanMaybeAsm<(cf)> +#else +#define TA6(cf) TI(cf) +#endif #define TIBLOCK(ub) \ - TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \ - TN, TN, TI((ub)|0x0a), TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \ - TN, TN, TI((ub)|0x12), TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \ - TN, TN, TI((ub)|0x1a), TI((ub)|0x1b), TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \ - TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ + TI((ub)|0x00), TI((ub)|0x01), TA6((ub)|0x02),TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \ + TN, TN, TA((ub)|0x0a), TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \ + TN, TN, TA6((ub)|0x12),TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \ + TN, TN, TA((ub)|0x1a), TI((ub)|0x1b), TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \ + TA((ub)|0x20), TA6((ub)|0x21),TA6((ub)|0x22),TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \ TN, TN, TI((ub)|0x32), 
TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \ TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \ - TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \ + TA((ub)|0x40), TA6((ub)|0x41),TA6((ub)|0x42),TA6((ub)|0x43),TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \ TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \ TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \ TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \ @@ -800,5 +893,7 @@ const PP gpuPolySpanDrivers[2048] = { #undef TI #undef TN #undef TIBLOCK +#undef TA +#undef TA6 #endif /* __GPU_UNAI_GPU_INNER_H__ */ diff --git a/plugins/gpu_unai/gpu_inner_blend_arm.h b/plugins/gpu_unai/gpu_inner_blend_arm.h index 6413527c..f887374c 100644 --- a/plugins/gpu_unai/gpu_inner_blend_arm.h +++ b/plugins/gpu_unai/gpu_inner_blend_arm.h @@ -41,10 +41,14 @@ GPU_INLINE uint_fast16_t gpuBlendingARM(uint_fast16_t uSrc, uint_fast16_t uDst) asm ("eor %[mix], %[uSrc], %[uDst]\n\t" // uSrc ^ uDst "and %[mix], %[mix], %[mask]\n\t" // ... & 0x0421 "sub %[mix], %[uDst], %[mix]\n\t" // uDst - ... + #ifdef HAVE_ARMV6 + "uhadd16 %[mix], %[uSrc], %[mix]\n\t" + #else "add %[mix], %[uSrc], %[mix]\n\t" // uSrc + ... "mov %[mix], %[mix], lsr #0x1\n\t" // ... 
>> 1 + #endif : [mix] "=&r" (mix) - : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0421)); + : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0420)); // 421 } if (BLENDMODE == 1 || BLENDMODE == 3) { diff --git a/plugins/gpu_unai/gpu_inner_light_arm.h b/plugins/gpu_unai/gpu_inner_light_arm.h index 7bd58908..7edb8fb0 100644 --- a/plugins/gpu_unai/gpu_inner_light_arm.h +++ b/plugins/gpu_unai/gpu_inner_light_arm.h @@ -1,6 +1,8 @@ #ifndef _OP_LIGHT_ARM_H_ #define _OP_LIGHT_ARM_H_ +#include "arm_features.h" + //////////////////////////////////////////////////////////////////////////////// // Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet // @@ -40,6 +42,27 @@ GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol) // u16 output: mbbbbbgggggrrrrr // Where 'X' are fixed-pt bits. //////////////////////////////////////////////////////////////////////////////// +#ifdef HAVE_ARMV6 +// clang uses smulbb but not gcc, so we need this +GPU_INLINE int_fast16_t smulbb(int_fast16_t a, int_fast16_t b) +{ + int_fast16_t r; + asm("smulbb %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); + return r; +} + +GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5) +{ + // on v6 we have single-cycle mul and sat which is better than the lut + int_fast16_t r = smulbb(uSrc & 0x001f, r5); + int_fast16_t g = smulbb(uSrc & 0x03e0, g5); + int_fast16_t b = smulbb(uSrc & 0x7c00, b5); + asm volatile("usat %0, #5, %0, asr #4" : "=r"(r) : "0"(r)); + asm volatile("usat %0, #5, %0, asr #9" : "=r"(g) : "0"(g)); + asm volatile("usat %0, #5, %0, asr #14" : "=r"(b) : "0"(b)); + return (uSrc & 0x8000) | (b << 10) | (g << 5) | r; +} +#else GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5) { uint_fast16_t out = 0x03E0; @@ -65,6 +88,7 @@ GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 : "cc"); return out; } +#endif //////////////////////////////////////////////////////////////////////////////// // 
Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color: diff --git a/plugins/gpu_unai/gpu_raster_polygon.h b/plugins/gpu_unai/gpu_raster_polygon.h index 1b9e08dc..6aaf9adc 100644 --- a/plugins/gpu_unai/gpu_raster_polygon.h +++ b/plugins/gpu_unai/gpu_raster_polygon.h @@ -223,13 +223,14 @@ static bool polyUseTriangle(const PolyVertex *vbuf, int tri_num, const PolyVerte /*---------------------------------------------------------------------- gpuDrawPolyF - Flat-shaded, untextured poly ----------------------------------------------------------------------*/ -void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad) +void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad, + PolyType ptype = POLYTYPE_F) { // Set up bgr555 color to be used across calls in inner driver - gpu_unai.PixelData = GPU_RGB16(le32_to_u32(packet.U4[0])); + gpu_unai.inn.PixelData = GPU_RGB16(le32_to_u32(packet.U4[0])); PolyVertex vbuf[4]; - polyInitVertexBuffer(vbuf, packet, POLYTYPE_F, is_quad); + polyInitVertexBuffer(vbuf, packet, ptype, is_quad); int total_passes = is_quad ? 2 : 1; int cur_pass = 0; @@ -257,7 +258,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad x3 = x4 = i2x(x0); if (dx < 0) { #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0; dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0; #else @@ -275,7 +276,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad #endif } else { #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0; dx4 = ((y2 - y0) != 0) ? 
(fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0; #else @@ -303,7 +304,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad x3 = i2x(x0) + (dx3 * (y1 - y0)); x4 = i2x(x1); #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; #else dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; @@ -319,7 +320,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad x3 = i2x(x1); x4 = i2x(x0) + (dx4 * (y1 - y0)); #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; #else dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; @@ -351,9 +352,9 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad continue; le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)]; - int li=gpu_unai.ilace_mask; - int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); - int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + int li=gpu_unai.inn.ilace_mask; + int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0); + int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1); for (; loop1; --loop1, ya++, PixelBase += FRAME_WIDTH, x3 += dx3, x4 += dx4 ) @@ -374,19 +375,20 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad /*---------------------------------------------------------------------- gpuDrawPolyFT - Flat-shaded, textured poly ----------------------------------------------------------------------*/ -void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad) +void gpuDrawPolyFT(const 
PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad, + PolyType ptype = POLYTYPE_FT) { // r8/g8/b8 used if texture-blending & dithering is applied (24-bit light) - gpu_unai.r8 = packet.U1[0]; - gpu_unai.g8 = packet.U1[1]; - gpu_unai.b8 = packet.U1[2]; + gpu_unai.inn.r8 = packet.U1[0]; + gpu_unai.inn.g8 = packet.U1[1]; + gpu_unai.inn.b8 = packet.U1[2]; // r5/g5/b5 used if just texture-blending is applied (15-bit light) - gpu_unai.r5 = packet.U1[0] >> 3; - gpu_unai.g5 = packet.U1[1] >> 3; - gpu_unai.b5 = packet.U1[2] >> 3; + gpu_unai.inn.r5 = packet.U1[0] >> 3; + gpu_unai.inn.g5 = packet.U1[1] >> 3; + gpu_unai.inn.b5 = packet.U1[2] >> 3; PolyVertex vbuf[4]; - polyInitVertexBuffer(vbuf, packet, POLYTYPE_FT, is_quad); + polyInitVertexBuffer(vbuf, packet, ptype, is_quad); int total_passes = is_quad ? 2 : 1; int cur_pass = 0; @@ -460,8 +462,8 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua #endif #endif // Set u,v increments for inner driver - gpu_unai.u_inc = du4; - gpu_unai.v_inc = dv4; + gpu_unai.inn.u_inc = du4; + gpu_unai.inn.v_inc = dv4; //senquack - TODO: why is it always going through 2 iterations when sometimes one would suffice here? // (SAME ISSUE ELSEWHERE) @@ -581,7 +583,7 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua v3 += (dv3 * (y1 - y0)); } #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; #else dx4 = ((y2 - y1) != 0) ? 
(fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; @@ -661,9 +663,9 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua continue; le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)]; - int li=gpu_unai.ilace_mask; - int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); - int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + int li=gpu_unai.inn.ilace_mask; + int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0); + int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1); for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH, x3 += dx3, x4 += dx4, @@ -693,8 +695,8 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua } // Set u,v coords for inner driver - gpu_unai.u = u4; - gpu_unai.v = v4; + gpu_unai.inn.u = u4; + gpu_unai.inn.v = v4; if (xb > xmax) xb = xmax; if ((xb - xa) > 0) @@ -790,7 +792,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad #endif #endif // Setup packed Gouraud increment for inner driver - gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4); + gpu_unai.inn.gInc = gpuPackGouraudColInc(dr4, dg4, db4); for (s32 loop0 = 2; loop0; loop0--) { if (loop0 == 2) { @@ -920,7 +922,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad } #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; #else dx4 = ((y2 - y1) != 0) ? 
(fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; @@ -1006,9 +1008,9 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad continue; le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)]; - int li=gpu_unai.ilace_mask; - int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); - int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + int li=gpu_unai.inn.ilace_mask; + int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0); + int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1); for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH, x3 += dx3, x4 += dx4, @@ -1042,7 +1044,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad } // Setup packed Gouraud color for inner driver - gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4); + gpu_unai.inn.gCol = gpuPackGouraudCol(r4, g4, b4); if (xb > xmax) xb = xmax; if ((xb - xa) > 0) @@ -1156,9 +1158,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua #endif #endif // Set u,v increments and packed Gouraud increment for inner driver - gpu_unai.u_inc = du4; - gpu_unai.v_inc = dv4; - gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4); + gpu_unai.inn.u_inc = du4; + gpu_unai.inn.v_inc = dv4; + gpu_unai.inn.gInc = gpuPackGouraudColInc(dr4, dg4, db4); for (s32 loop0 = 2; loop0; loop0--) { if (loop0 == 2) { @@ -1305,7 +1307,7 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua } #ifdef GPU_UNAI_USE_FLOATMATH -#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; #else dx4 = ((y2 - y1) != 0) ? 
(fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; @@ -1401,9 +1403,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua continue; le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)]; - int li=gpu_unai.ilace_mask; - int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); - int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + int li=gpu_unai.inn.ilace_mask; + int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0); + int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1); for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH, x3 += dx3, x4 += dx4, @@ -1446,9 +1448,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua } // Set packed Gouraud color and u,v coords for inner driver - gpu_unai.u = u4; - gpu_unai.v = v4; - gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4); + gpu_unai.inn.u = u4; + gpu_unai.inn.v = v4; + gpu_unai.inn.gCol = gpuPackGouraudCol(r4, g4, b4); if (xb > xmax) xb = xmax; if ((xb - xa) > 0) diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h index 2564e7f0..5c7b67ce 100644 --- a/plugins/gpu_unai/gpu_raster_sprite.h +++ b/plugins/gpu_unai/gpu_raster_sprite.h @@ -61,34 +61,19 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out) *w_out = x1; *h_out = y1 - y0; - gpu_unai.r5 = packet.U1[0] >> 3; - gpu_unai.g5 = packet.U1[1] >> 3; - gpu_unai.b5 = packet.U1[2] >> 3; - le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)]; - const int li=gpu_unai.ilace_mask; - //const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); - //const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); - unsigned int tmode = gpu_unai.TEXT_MODE >> 5; - u8* pTxt_base = (u8*)gpu_unai.TBA; - - // Texture is accessed byte-wise, so adjust idx if 16bpp - if (tmode == 3) u0 <<= 1; - 
- spriteDriverArg arg; - arg.CBA = gpu_unai.CBA; - arg.u0 = u0; - arg.v0 = v0; - arg.u0_mask = gpu_unai.TextureWindow[2]; - arg.v0_mask = gpu_unai.TextureWindow[3]; - arg.y0 = y0; - arg.y1 = y1; - arg.lines = y1 - y0; - arg.li = li; - gpuSpriteDriver(Pixel, x1, pTxt_base, &arg); + + gpu_unai.inn.r5 = packet.U1[0] >> 3; + gpu_unai.inn.g5 = packet.U1[1] >> 3; + gpu_unai.inn.b5 = packet.U1[2] >> 3; + gpu_unai.inn.u = u0; + gpu_unai.inn.v = v0; + gpu_unai.inn.y0 = y0; + gpu_unai.inn.y1 = y1; + gpuSpriteDriver(Pixel, x1, (u8 *)gpu_unai.inn.TBA, gpu_unai.inn); } -void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver, s32 *w_out, s32 *h_out) +void gpuDrawT(PtrUnion packet, const PT gpuTileDriver, s32 *w_out, s32 *h_out) { s32 x0, x1, y0, y1; @@ -118,15 +103,10 @@ void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver, s32 *w_out, s32 *h_ou const u16 Data = GPU_RGB16(le32_to_u32(packet.U4[0])); le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)]; - const int li=gpu_unai.ilace_mask; - const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); - const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); - - for (; y0> 24; + gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); + u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); + s32 w = 0, h = 0; + + //senquack - Only color 808080h-878787h allows skipping lighting calculation: + // This fixes Silent Hill running animation on loading screens: + // (On PSX, color values 0x00-0x7F darken the source texture's color, + // 0x81-FF lighten textures (ultimately clamped to 0x1F), + // 0x80 leaves source texture color unchanged, HOWEVER, + // gpu_unai uses a simple lighting LUT whereby only the upper + // 5 bits of an 8-bit color are used, so 0x80-0x87 all behave as + // 0x80. 
+ // + // NOTE: I've changed all textured sprite draw commands here and + // elsewhere to use proper behavior, but left poly commands + // alone, I don't want to slow rendering down too much. (TODO) + if (need_lighting(le32_raw(gpu_unai.PacketBuffer.U4[0]))) + driver_idx |= Lighting; + PS driver = gpuSpriteDrivers[driver_idx]; + PtrUnion packet = { .ptr = (void*)&gpu_unai.PacketBuffer }; + gpuDrawS(packet, driver, &w, &h); + gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); +} + extern const unsigned char cmd_lengths[256]; int do_cmd_list(u32 *list_, int list_len, @@ -468,8 +502,20 @@ int do_cmd_list(u32 *list_, int list_len, case 0x2D: case 0x2E: case 0x2F: { // Textured 4-pt poly - gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); + u32 simplified_count; gpuSetTexture(le32_to_u32(gpu_unai.PacketBuffer.U4[4]) >> 16); + if ((simplified_count = prim_try_simplify_quad_t(gpu_unai.PacketBuffer.U4, + gpu_unai.PacketBuffer.U4))) + { + for (i = 0;; ) { + textured_sprite(cpu_cycles_sum, cpu_cycles); + if (++i >= simplified_count) + break; + memcpy(&gpu_unai.PacketBuffer.U4[0], &gpu_unai.PacketBuffer.U4[i * 4], 16); + } + break; + } + gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); u32 driver_idx = //(gpu_unai.blit_mask?1024:0) | @@ -497,13 +543,22 @@ int do_cmd_list(u32 *list_, int list_len, // this is an untextured poly, so CF_LIGHT (texture blend) // shouldn't apply. Until the original array of template // instantiation ptrs is fixed, we're stuck with this. 
(TODO) + u8 gouraud = 129; + u32 xor_ = 0, rgb0 = le32_raw(gpu_unai.PacketBuffer.U4[0]); + for (i = 1; i < 3; i++) + xor_ |= rgb0 ^ le32_raw(gpu_unai.PacketBuffer.U4[i * 2]); + if ((xor_ & HTOLE32(0xf8f8f8)) == 0) + gouraud = 0; PP driver = gpuPolySpanDrivers[ //(gpu_unai.blit_mask?1024:0) | Dithering | Blending_Mode | - gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB + gpu_unai.Masking | Blending | gouraud | gpu_unai.PixelMSB ]; - gpuDrawPolyG(packet, driver, false); + if (gouraud) + gpuDrawPolyG(packet, driver, false); + else + gpuDrawPolyF(packet, driver, false, POLYTYPE_G); gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g()); } break; @@ -513,13 +568,28 @@ int do_cmd_list(u32 *list_, int list_len, case 0x37: { // Gouraud-shaded, textured 3-pt poly gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); gpuSetTexture (le32_to_u32(gpu_unai.PacketBuffer.U4[5]) >> 16); + u8 lighting = Lighting; + u8 gouraud = lighting ? (1<<7) : 0; + if (lighting) { + u32 xor_ = 0, rgb0 = le32_raw(gpu_unai.PacketBuffer.U4[0]); + for (i = 1; i < 3; i++) + xor_ |= rgb0 ^ le32_raw(gpu_unai.PacketBuffer.U4[i * 3]); + if ((xor_ & HTOLE32(0xf8f8f8)) == 0) { + gouraud = 0; + if (!need_lighting(rgb0)) + lighting = 0; + } + } PP driver = gpuPolySpanDrivers[ //(gpu_unai.blit_mask?1024:0) | Dithering | Blending_Mode | gpu_unai.TEXT_MODE | - gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB + gpu_unai.Masking | Blending | gouraud | lighting | gpu_unai.PixelMSB ]; - gpuDrawPolyGT(packet, driver, false); + if (gouraud) + gpuDrawPolyGT(packet, driver, false); // is_quad = false + else + gpuDrawPolyFT(packet, driver, false, POLYTYPE_GT); gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt()); } break; @@ -528,13 +598,22 @@ int do_cmd_list(u32 *list_, int list_len, case 0x3A: case 0x3B: { // Gouraud-shaded 4-pt poly // See notes regarding '129' for 0x30..0x33 further above -senquack + u8 gouraud = 129; + u32 xor_ = 0, rgb0 =
le32_raw(gpu_unai.PacketBuffer.U4[0]); + for (i = 1; i < 4; i++) + xor_ |= rgb0 ^ le32_raw(gpu_unai.PacketBuffer.U4[i * 2]); + if ((xor_ & HTOLE32(0xf8f8f8)) == 0) + gouraud = 0; PP driver = gpuPolySpanDrivers[ //(gpu_unai.blit_mask?1024:0) | Dithering | Blending_Mode | - gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB + gpu_unai.Masking | Blending | gouraud | gpu_unai.PixelMSB ]; - gpuDrawPolyG(packet, driver, true); // is_quad = true + if (gouraud) + gpuDrawPolyG(packet, driver, true); // is_quad = true + else + gpuDrawPolyF(packet, driver, true, POLYTYPE_G); gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g()); } break; @@ -542,15 +621,42 @@ int do_cmd_list(u32 *list_, int list_len, case 0x3D: case 0x3E: case 0x3F: { // Gouraud-shaded, textured 4-pt poly - gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); - gpuSetTexture (le32_to_u32(gpu_unai.PacketBuffer.U4[5]) >> 16); + u32 simplified_count; + gpuSetTexture(le32_to_u32(gpu_unai.PacketBuffer.U4[5]) >> 16); + if ((simplified_count = prim_try_simplify_quad_gt(gpu_unai.PacketBuffer.U4, + gpu_unai.PacketBuffer.U4))) + { + for (i = 0;; ) { + textured_sprite(cpu_cycles_sum, cpu_cycles); + if (++i >= simplified_count) + break; + memcpy(&gpu_unai.PacketBuffer.U4[0], &gpu_unai.PacketBuffer.U4[i * 4], 16); + } + break; + } + gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); + u8 lighting = Lighting; + u8 gouraud = lighting ? 
(1<<7) : 0; + if (lighting) { + u32 xor_ = 0, rgb0 = le32_raw(gpu_unai.PacketBuffer.U4[0]); + for (i = 1; i < 4; i++) + xor_ |= rgb0 ^ le32_raw(gpu_unai.PacketBuffer.U4[i * 3]); + if ((xor_ & HTOLE32(0xf8f8f8)) == 0) { + gouraud = 0; + if (!need_lighting(rgb0)) + lighting = 0; + } + } PP driver = gpuPolySpanDrivers[ //(gpu_unai.blit_mask?1024:0) | Dithering | Blending_Mode | gpu_unai.TEXT_MODE | - gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB + gpu_unai.Masking | Blending | gouraud | lighting | gpu_unai.PixelMSB ]; - gpuDrawPolyGT(packet, driver, true); // is_quad = true + if (gouraud) + gpuDrawPolyGT(packet, driver, true); // is_quad = true + else + gpuDrawPolyFT(packet, driver, true, POLYTYPE_GT); gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt()); } break; @@ -642,7 +748,7 @@ int do_cmd_list(u32 *list_, int list_len, case 0x61: case 0x62: case 0x63: { // Monochrome rectangle (variable size) - PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; s32 w = 0, h = 0; gpuDrawT(packet, driver, &w, &h); gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); @@ -651,38 +757,16 @@ int do_cmd_list(u32 *list_, int list_len, case 0x64: case 0x65: case 0x66: - case 0x67: { // Textured rectangle (variable size) - gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); - u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); - s32 w = 0, h = 0; - - //senquack - Only color 808080h-878787h allows skipping lighting calculation: - // This fixes Silent Hill running animation on loading screens: - // (On PSX, color values 0x00-0x7F darken the source texture's color, - // 0x81-FF lighten textures (ultimately clamped to 0x1F), - // 0x80 leaves source texture color unchanged, HOWEVER, - // gpu_unai uses a simple lighting LUT whereby only 
the upper - // 5 bits of an 8-bit color are used, so 0x80-0x87 all behave as - // 0x80. - // - // NOTE: I've changed all textured sprite draw commands here and - // elsewhere to use proper behavior, but left poly commands - // alone, I don't want to slow rendering down too much. (TODO) - //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) - // Strip lower 3 bits of each color and determine if lighting should be used: - if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080)) - driver_idx |= Lighting; - PS driver = gpuSpriteDrivers[driver_idx]; - gpuDrawS(packet, driver, &w, &h); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); - } break; + case 0x67: // Textured rectangle (variable size) + textured_sprite(cpu_cycles_sum, cpu_cycles); + break; case 0x68: case 0x69: case 0x6A: case 0x6B: { // Monochrome rectangle (1x1 dot) gpu_unai.PacketBuffer.U4[2] = u32_to_le32(0x00010001); - PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; s32 w = 0, h = 0; gpuDrawT(packet, driver, &w, &h); gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1)); @@ -693,7 +777,7 @@ int do_cmd_list(u32 *list_, int list_len, case 0x72: case 0x73: { // Monochrome rectangle (8x8) gpu_unai.PacketBuffer.U4[2] = u32_to_le32(0x00080008); - PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; s32 w = 0, h = 0; gpuDrawT(packet, driver, &w, &h); gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); @@ -704,18 +788,7 @@ int do_cmd_list(u32 *list_, int list_len, case 0x76: case 0x77: { // Textured rectangle (8x8) gpu_unai.PacketBuffer.U4[3] = u32_to_le32(0x00080008); - 
gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); - u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); - s32 w = 0, h = 0; - - //senquack - Only color 808080h-878787h allows skipping lighting calculation: - //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) - // Strip lower 3 bits of each color and determine if lighting should be used: - if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080)) - driver_idx |= Lighting; - PS driver = gpuSpriteDrivers[driver_idx]; - gpuDrawS(packet, driver, &w, &h); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); + textured_sprite(cpu_cycles_sum, cpu_cycles); } break; case 0x78: @@ -723,7 +796,7 @@ int do_cmd_list(u32 *list_, int list_len, case 0x7A: case 0x7B: { // Monochrome rectangle (16x16) gpu_unai.PacketBuffer.U4[2] = u32_to_le32(0x00100010); - PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; s32 w = 0, h = 0; gpuDrawT(packet, driver, &w, &h); gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); @@ -734,17 +807,7 @@ int do_cmd_list(u32 *list_, int list_len, case 0x7E: case 0x7F: { // Textured rectangle (16x16) gpu_unai.PacketBuffer.U4[3] = u32_to_le32(0x00100010); - gpuSetCLUT (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); - u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); - s32 w = 0, h = 0; - //senquack - Only color 808080h-878787h allows skipping lighting calculation: - //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) - // Strip lower 3 bits of each color and determine if lighting should be used: - if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & 
HTOLE32(0xF8F8F8)) != HTOLE32(0x808080)) - driver_idx |= Lighting; - PS driver = gpuSpriteDrivers[driver_idx]; - gpuDrawS(packet, driver, &w, &h); - gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h)); + textured_sprite(cpu_cycles_sum, cpu_cycles); } break; #ifdef TEST diff --git a/plugins/gpulib/Makefile b/plugins/gpulib/Makefile index cff61410..53aaa886 100644 --- a/plugins/gpulib/Makefile +++ b/plugins/gpulib/Makefile @@ -5,7 +5,7 @@ endif include ../../config.mak -OBJS += gpu.o +OBJS += gpu.o prim.o ifeq "$(ARCH)" "arm" OBJS += vout_pl.o diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c index df1c1c6c..88aa6704 100644 --- a/plugins/gpulib/gpu.c +++ b/plugins/gpulib/gpu.c @@ -17,23 +17,11 @@ #include "gpu_timing.h" #include "../../libpcsxcore/gpu.h" // meh #include "../../frontend/plugin_lib.h" +#include "../../include/compiler_features.h" #ifndef ARRAY_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif -#ifdef __GNUC__ -# define unlikely(x) __builtin_expect((x), 0) -# define preload __builtin_prefetch -# ifndef __clang__ -# define noinline __attribute__((noinline,noclone)) -# else -# define noinline __attribute__((noinline)) -# endif -#else -# define unlikely(x) -# define preload(...) -# define noinline -#endif //#define log_io gpu_log #define log_io(...) 
diff --git a/plugins/gpulib/gpu.h b/plugins/gpulib/gpu.h
index e654500d..570d8421 100644
--- a/plugins/gpulib/gpu.h
+++ b/plugins/gpulib/gpu.h
@@ -147,6 +147,9 @@ void vout_update(void);
 void vout_blank(void);
 void vout_set_config(const struct rearmed_cbs *config);
 
+int prim_try_simplify_quad_t (void *simplified, const void *prim);
+int prim_try_simplify_quad_gt(void *simplified, const void *prim);
+
 /* listing these here for correct linkage if rasterizer uses c++ */
 struct GPUFreeze;
 
diff --git a/plugins/gpulib/prim.c b/plugins/gpulib/prim.c
new file mode 100644
index 00000000..d6294641
--- /dev/null
+++ b/plugins/gpulib/prim.c
@@ -0,0 +1,282 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../../include/compiler_features.h"
+#include "gpu.h"
+
+// retain neon's ability to sample textures pixel-perfectly
+#ifdef GPU_NEON
+#define STRICT
+#endif
+
+// vertex of a textured primitive; its 16-bit fields are read and written
+// through LE16TOH/HTOLE16 below, u/v are single bytes
+struct vert_t
+{
+  union {
+    struct {
+      int16_t x, y;
+    };
+    uint32_t xy;
+  };
+  union {
+    struct {
+      uint8_t u, v;
+      int16_t clut;
+    };
+    uint32_t uvclut;
+  };
+};
+
+// gt ~ gouraud textured
+struct vert_gt
+{
+  uint32_t rgb;
+  struct vert_t t;
+};
+
+// textured quad with a single color/command word (2c-2f)
+struct quad_t
+{
+  uint32_t rgb_c;
+  struct vert_t v[4];
+};
+
+// gouraud textured quad, one color word per vertex (3c-3f)
+struct quad_gt
+{
+  struct vert_gt v[4];
+};
+
+// textured rectangle command as written by simplify_quad_t()
+struct sprite
+{
+  uint32_t rgb_c;
+  union {
+    struct {
+      int16_t x, y;
+    };
+    uint32_t xy;
+  };
+  union {
+    struct {
+      uint8_t u, v;
+      int16_t clut;
+    };
+    uint32_t uvclut;
+  };
+  int16_t w, h;
+};
+
+// debug
+#if 0
+static void log_quad_t(const struct quad_t *q, int ret)
+{
+#if 1
+  printf("quad_t %08x", q->rgb_c);
+  int i;
+  for (i = 0; i < 4; i++)
+    printf(" | %3d,%3d %3d,%3d",
+      q->v[i].x, q->v[i].y, q->v[i].u, q->v[i].v);
+  printf(" -> %d\n", ret);
+#endif
+}
+
+static void log_quad_gt(const struct vert_gt *v, int ret)
+{
+#if 1
+  printf("quad_gt %02x", v[0].rgb >> 24);
+  int i;
+  for (i = 0; i < 4; i++)
+    printf(" | %3d,%3d %3d,%3d %06x",
+      v[i].t.x, v[i].t.y, v[i].t.u, v[i].t.v, v[i].rgb & 0xffffff);
+  printf(" -> %d\n", ret);
+#endif
+}
+
+int prim_try_simplify_quad_t_(void *simplified, const void *prim_);
+int prim_try_simplify_quad_t(void *simplified, const void *prim_)
+{
+  struct quad_t prim = *(struct quad_t *)prim_;
+  int ret = prim_try_simplify_quad_t_(simplified, prim_);
+  #define prim_try_simplify_quad_t prim_try_simplify_quad_t_
+  ///if (!ret)
+  log_quad_t(&prim, ret);
+  return ret;
+}
+
+int prim_try_simplify_quad_gt_(void *simplified, const void *prim_);
+int prim_try_simplify_quad_gt(void *simplified, const void *prim_)
+{
+  struct quad_gt prim = *(struct quad_gt *)prim_;
+  int ret = prim_try_simplify_quad_gt_(simplified, prim_);
+  #define prim_try_simplify_quad_gt prim_try_simplify_quad_gt_
+  ///if (!ret)
+  log_quad_gt(prim.v, ret);
+  return ret;
+}
+#endif // debug
+
+// Write the sprite(s) that replace an axis-aligned textured quad.
+// v is the vertex used as the sprite origin, xd/yd are the screen extents
+// and ud/vd the texture extents (only the magnitude of xd and ud is used).
+// Unless STRICT is defined, an axis whose screen and texture extents differ
+// is split into two halves and the second half's texture coordinate is
+// shifted by the difference.
+// Returns the number of sprites written to 'simplified': 1, 2 or 4.
+static noinline int simplify_quad_t(void *simplified, const struct vert_t *v,
+  int xd, int ud, int yd, int vd, uint32_t rgb_c, uint16_t clut)
+{
+  struct sprite *s = simplified;
+  int ret = 1;
+  // command byte becomes 0x64 | (original command & 3)
+  rgb_c &= HTOLE32(0x03ffffff);
+  rgb_c |= HTOLE32(0x64000000);
+  xd = abs(xd);
+  ud = abs(ud);
+  s[0].rgb_c = rgb_c;
+  s[0].xy = v->xy;
+  s[0].u = v->u;
+  s[0].v = v->v;
+  s[0].clut = clut;
+  s[0].w = HTOLE16(xd);
+  s[0].h = HTOLE16(yd);
+#ifndef STRICT
+  if (xd != ud) {
+    int mid = xd / 2;
+    s[0].w = HTOLE16(mid);
+    s[1].rgb_c = rgb_c;
+    s[1].x = HTOLE16(LE16TOH(s[0].x) + mid);
+    s[1].y = s[0].y;
+    s[1].u = s[0].u + mid + ud - xd;
+    s[1].v = s[0].v;
+    s[1].clut = clut;
+    s[1].w = HTOLE16(xd - mid);
+    s[1].h = s[0].h;
+    ret = 2;
+  }
+  if (yd != vd) {
+    int i, mid = yd / 2, y = LE16TOH(s[0].y);
+    memcpy(s + ret, s, sizeof(s[0]) * ret);
+    for (i = 0; i < ret; i++) {
+      s[i].h = HTOLE16(mid);
+      s[ret+i].y = HTOLE16(y + mid);
+      s[ret+i].h = HTOLE16(yd - mid);
+      s[ret+i].v = s[0].v + mid + vd - yd;
+    }
+    ret *= 2;
+  }
+#endif
+  return ret;
+}
+
+// Second stage of prim_try_simplify_quad_t(): derives the screen and texture
+// extents from v[0], v[1] and v[2] and gives up (returns 0) if v[2].y is less
+// than v[0].y or if the extents differ (by more than 1 unless STRICT is
+// defined, at all with STRICT). Otherwise returns the sprite count from
+// simplify_quad_t().
+// this is split to reduce gcc spilling
+static noinline int prim_try_simplify_quad_t2(void *simplified,
+  const struct vert_t *v, uint32_t rgb_c)
+{
+  do {
+    int yd = LE16TOH(v[2].y) - LE16TOH(v[0].y);
+    int xd, ud, vd;
+    if (yd < 0)
+      break;
+    xd = LE16TOH(v[1].x) - LE16TOH(v[0].x);
+    // u/v are single bytes: the 16-bit LE16TOH must not be applied to them
+    ud = v[1].u - v[0].u;
+    vd = v[2].v - v[0].v;
+#ifdef STRICT
+    if (xd != ud || yd != vd)
+#else
+    if (abs(xd - ud) > 1 || abs(yd - vd) > 1)
+#endif
+      break;
+    return simplify_quad_t(simplified, xd < 0 ? &v[1] : &v[0],
+      xd, ud, yd, vd, rgb_c, v[0].clut);
+  }
+  while (0);
+  return 0;
+}
+
+// Gouraud textured variant of prim_try_simplify_quad_t2(). In addition, when
+// bit 24 of the first color word is clear (modulation is on), the quad is
+// rejected unless all 4 vertex colors agree in the upper 5 bits of every
+// channel, since the resulting sprite carries a single color.
+static noinline int prim_try_simplify_quad_gt2(void *simplified,
+  const struct vert_gt *v)
+{
+  do {
+    int yd = LE16TOH(v[2].t.y) - LE16TOH(v[0].t.y);
+    int xd, ud, vd;
+    if (yd < 0)
+      break;
+    xd = LE16TOH(v[1].t.x) - LE16TOH(v[0].t.x);
+    // u/v are single bytes: the 16-bit LE16TOH must not be applied to them
+    ud = v[1].t.u - v[0].t.u;
+    vd = v[2].t.v - v[0].t.v;
+#ifdef STRICT
+    if (xd != ud || yd != vd)
+#else
+    if (abs(xd - ud) > 1 || abs(yd - vd) > 1)
+#endif
+      break;
+    if (!(v[0].rgb & HTOLE32(1 << 24))) { // modulation/"lighting"
+      uint32_t i, xor = 0, rgb0 = v[0].rgb;
+      for (i = 1; i < 4; i++)
+        xor |= rgb0 ^ v[i].rgb;
+      if (xor & HTOLE32(0xf8f8f8))
+        break;
+    }
+    return simplify_quad_t(simplified, xd < 0 ? &v[1].t : &v[0].t,
+      xd, ud, yd, vd, v[0].rgb, v[0].t.clut);
+  }
+  while (0);
+  return 0;
+}
+
+// 2c-2f
+// Try to replace a textured quad command with textured rectangle commands.
+// prim is the quad command; on success up to 4 sprite commands are written
+// to 'simplified' (the gpu_unai caller passes the same buffer for both).
+// Returns the number of sprites written, or 0 when the vertices do not form
+// an axis-aligned rectangle (v0/v1 and v2/v3 sharing y and v, v0/v2 and
+// v1/v3 sharing x and u) or the extents don't allow it.
+int prim_try_simplify_quad_t(void *simplified, const void *prim_)
+{
+  const struct quad_t *prim = prim_;
+  const struct vert_t *v = prim->v;
+  int ret = 0;
+  do {
+    if (v[0].y != v[1].y || v[0].x != v[2].x || v[2].y != v[3].y || v[1].x != v[3].x)
+      break;
+    if (v[0].v != v[1].v || v[0].u != v[2].u || v[2].v != v[3].v || v[1].u != v[3].u)
+      break;
+    ret = prim_try_simplify_quad_t2(simplified, v, prim->rgb_c);
+  }
+  while (0);
+  return ret;
+}
+
+// 3c-3f
+// Same as prim_try_simplify_quad_t() but for gouraud textured quads, which
+// are also rejected when modulation is on and the vertex colors differ
+// (see prim_try_simplify_quad_gt2()).
+int prim_try_simplify_quad_gt(void *simplified, const void *prim)
+{
+  const struct vert_gt *v = prim;
+  int ret = 0;
+  do {
+    if (v[0].t.y != v[1].t.y || v[0].t.x != v[2].t.x || v[2].t.y != v[3].t.y || v[1].t.x != v[3].t.x)
+      break;
+    if (v[0].t.v != v[1].t.v || v[0].t.u != v[2].t.u || v[2].t.v != v[3].t.v || v[1].t.u != v[3].t.u)
+      break;
+    ret = prim_try_simplify_quad_gt2(simplified, v);
+  }
+  while (0);
+  return ret;
+}
+
+// vim:shiftwidth=2:expandtab