diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.c b/plugins/gpu_neon/psx_gpu/psx_gpu.c index a58b5b6d..1dec0259 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu.c @@ -528,6 +528,11 @@ void flush_render_block_buffer(psx_gpu_struct *psx_gpu) render_block_handler_struct *render_block_handler = psx_gpu->render_block_handler; +#if defined(__arm__) && defined(NEON_BUILD) && !defined(SIMD_BUILD) + // the asm doesn't bother to save callee-save vector regs, so do it here + __asm__ __volatile__("":::"q4","q5","q6","q7"); +#endif + render_block_handler->texture_blocks(psx_gpu); render_block_handler->shade_blocks(psx_gpu); render_block_handler->blend_blocks(psx_gpu); @@ -538,6 +543,9 @@ void flush_render_block_buffer(psx_gpu_struct *psx_gpu) #endif psx_gpu->num_blocks = 0; +#if defined(__arm__) && defined(NEON_BUILD) && !defined(SIMD_BUILD) + __asm__ __volatile__("":::"q4","q5","q6","q7"); +#endif } } @@ -3037,6 +3045,11 @@ static void render_triangle_p(psx_gpu_struct *psx_gpu, triangle_set_direction(y_direction_b, y_delta_b); triangle_set_direction(y_direction_c, y_delta_c); +#if defined(__arm__) && defined(NEON_BUILD) && !defined(SIMD_BUILD) + // the asm doesn't bother to save callee-save vector regs, so do it here + __asm__ __volatile__("vstmia %0, {q4-q7}" :: "r"(psx_gpu->saved_q4_q7) : "memory"); +#endif + compute_all_gradients(psx_gpu, a, b, c); switch(y_direction_a | (y_direction_b << 2) | (y_direction_c << 4) | @@ -3163,6 +3176,10 @@ static void render_triangle_p(psx_gpu_struct *psx_gpu, &(render_triangle_block_handlers[render_state]); ((setup_blocks_function_type *)psx_gpu->render_block_handler->setup_blocks) (psx_gpu); + +#if defined(__arm__) && defined(NEON_BUILD) && !defined(SIMD_BUILD) + __asm__ __volatile__("vldmia %0, {q4-q7}" :: "r"(psx_gpu->saved_q4_q7)); +#endif } void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes, diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.h b/plugins/gpu_neon/psx_gpu/psx_gpu.h index edea0a9e..e585611e 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu.h +++ b/plugins/gpu_neon/psx_gpu/psx_gpu.h @@ -218,7 +218,11 @@ typedef struct // Align up to 64 byte boundary to keep the upcoming buffers cache line // aligned, also make reachable with single immediate addition - u8 reserved_a[180 + 9*4 - 9*sizeof(void *)]; + u8 reserved_a[68 + 9*4 - 9*sizeof(void *)]; + + // space for saving regs on c call to flush_render_block_buffer() and asm + u32 saved_tmp[48 / sizeof(u32)]; + u32 saved_q4_q7[64 / sizeof(u32)]; // 8KB block_struct blocks[MAX_BLOCKS_PER_ROW]; diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S index 82738855..d187fce9 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S @@ -1381,20 +1381,20 @@ function(setup_spans_up_down) #define setup_blocks_uv_adj_hack_textured(hacks_active) \ tst hacks_active, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V); \ beq 91f; \ - /* see flush_render_block_buffer below for a reg saving note */ \ - vpush { texture_mask }; \ - vpush { uvrg_dx4 }; \ \ - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ + /* pushing odd num of regs here realigns our unaligned stack */ \ + vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ + push { r0 - r4, EXTRA_UNSAVED_REGS r12, r14 }; \ mov r12, span_uvrg_offset; \ sub r1, block_ptr_a, #64; \ mov r2, span_edge_data; \ mov r3, r12; \ bl setup_blocks_uv_adj_hack; /* psx_gpu=r0 */ \ - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ + pop { r0 - r4, EXTRA_UNSAVED_REGS r12, r14 }; \ + vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ \ - vpop { uvrg_dx4 }; \ - vpop { texture_mask }; \ vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \ 91: \ @@ -1650,17 +1650,14 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ - /* this callee-save reg saving may look unnecessary but it actually is */ \ - /* because the callee violates the ABI */ \ - vpush { texture_mask }; \ - vpush { uvrg_dx4 }; \ - \ - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */ \ + vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ + /* pushing odd num of regs here realigns our unaligned stack */ \ + push { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */ \ bl flush_render_block_buffer; \ - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ - \ - vpop { uvrg_dx4 }; \ - vpop { texture_mask }; \ + pop { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ + vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ \ vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \ vmov.u8 fb_mask_ptrs, #0; \ @@ -1853,15 +1850,13 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \ ldmia sp!, { r4 - r11, pc }; \ \ 2: \ - vpush { texture_mask }; \ - vpush { uvrg_dx4 }; \ - \ - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */ \ + vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ + push { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */ \ bl flush_render_block_buffer; \ - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ - \ - vpop { uvrg_dx4 }; \ - vpop { texture_mask }; \ + pop { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ + vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset]; \ + vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8]; \ \ vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \ vmov.u8 fb_mask_ptrs, #0; \ @@ -1972,13 +1967,13 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect) ldmia sp!, { r4 - r11, pc } 2: - vpush { colors } - - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } + vstr d4, [r0, #psx_gpu_saved_tmp_offset] /* colors */ + vstr d5, [r0, #psx_gpu_saved_tmp_offset + 8] + push { r0 - r3, EXTRA_UNSAVED_REGS r12 } bl flush_render_block_buffer - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } - - vpop { colors } + pop { r0 - r3, EXTRA_UNSAVED_REGS r12 } + vldr d4, [r0, #psx_gpu_saved_tmp_offset] + vldr d5, [r0, #psx_gpu_saved_tmp_offset + 8] vld1.u32 { test_mask }, [psx_gpu, :128] veor.u32 draw_mask, draw_mask, draw_mask @@ -2385,16 +2380,14 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \ bne 0b; \ \ restore_abi_regs(); \ - ldmia sp!, { r4 - r11, pc }; \ + pop { r4 - r11, pc }; \ \ 2: \ - vpush { rg_dx4 }; \ - \ - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ + vstr rg_dx4, [r0, #psx_gpu_saved_tmp_offset]; \ + push { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ bl flush_render_block_buffer; \ - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \ - \ - vpop { rg_dx4 }; \ + pop { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \ + vldr rg_dx4, [r0, #psx_gpu_saved_tmp_offset]; \ \ vmov.u8 d64_1, #1; \ vmov.u8 d128_4, #4; \ @@ -2804,7 +2797,7 @@ function(texture_blocks_4bpp) .align 3 function(texture_blocks_8bpp) - stmdb sp!, { r3 - r11, r14 } + push { r4 - r11, lr } add block_ptr, psx_gpu, #psx_gpu_blocks_offset ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset] @@ -2882,15 +2875,14 @@ function(texture_blocks_8bpp) add block_ptr, block_ptr, #64 bne 0b - ldmia sp!, { r3 - r11, pc } + pop { r4 - r11, pc } 1: - stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 } - - bl update_texture_8bpp_cache - - ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 } - bal 0b + /* pushing odd num of regs here realigns our unaligned stack */ + push { r1 - r2, EXTRA_UNSAVED_REGS r12 } + bl update_texture_8bpp_cache + pop { r1 - r2, EXTRA_UNSAVED_REGS r12 } + bal 0b #undef uv_0 @@ -4534,28 +4526,27 @@ function(warmup) .align 3 setup_sprite_flush_blocks: - vpush { q1 - q5 } - - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } - bl flush_render_block_buffer - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } - - vpop { q1 - q5 } + push { r0 - r3, EXTRA_UNSAVED_REGS r12, lr } + add block, r0, #psx_gpu_saved_tmp_offset /* r5 */ + vstmia block, { q1 - q3 } + bl flush_render_block_buffer + vldmia block, { q1 - q3 } + pop { r0 - r3, EXTRA_UNSAVED_REGS r12, lr } - add block, psx_gpu, #psx_gpu_blocks_offset - bx lr + add block, psx_gpu, #psx_gpu_blocks_offset + bx lr setup_sprite_update_texture_4bpp_cache: - stmdb sp!, { r0 - r3, r14 } + push { r0 - r4, lr } bl update_texture_4bpp_cache - ldmia sp!, { r0 - r3, pc } + pop { r0 - r4, pc } setup_sprite_update_texture_8bpp_cache: - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 } + push { r0 - r4, EXTRA_UNSAVED_REGS lr } bl update_texture_8bpp_cache - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc } + pop { r0 - r4, EXTRA_UNSAVED_REGS pc } #define setup_sprite_tiled_initialize_4bpp() \ @@ -4842,8 +4833,8 @@ setup_sprite_update_texture_8bpp_cache: setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \ \ setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \ - restore_abi_regs(); \ - ldmia sp!, { r4 - r11, pc } \ + vpop { q4 - q7 }; \ + pop { r3 - r11, pc } \ #define setup_sprite_tiled_advance_column() \ add texture_offset_base, texture_offset_base, #0x100; \ @@ -4879,8 +4870,8 @@ setup_sprite_update_texture_8bpp_cache: \ setup_sprite_tiled_advance_column(); \ setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\ - restore_abi_regs(); \ - ldmia sp!, { r4 - r11, pc } \ + vpop { q4 - q7 }; \ + pop { r3 - r11, pc } \ #define setup_sprite_offset_u_adjust() \ @@ -5226,19 +5217,19 @@ setup_sprite_tile_column_width_multi(texture_mode, single, half, half, \ .align 4; \ \ function(setup_sprite_##texture_mode##x4mode) \ - stmdb sp!, { r4 - r11, r14 }; \ + push { r3 - r11, lr }; \ setup_sprite_tiled_initialize_##texture_mode##x4mode(); \ \ - ldr v, [sp, #36]; \ + ldr v, [sp, #4*(10+0)]; \ and offset_u, u, #0xF; \ \ - ldr width, [sp, #40]; \ + ldr width, [sp, #4*(10+1)]; \ ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \ \ - ldr height, [sp, #44]; \ + ldr height, [sp, #4*(10+2)]; \ add fb_ptr, fb_ptr, y, lsl #11; \ \ - save_abi_regs(); \ + vpush { q4 - q7 }; \ \ add fb_ptr, fb_ptr, x, lsl #1; \ and offset_v, v, #0xF; \ @@ -5362,7 +5353,7 @@ setup_sprite_tiled_builder(8bpp, _4x); #define texels_67 r9 function(texture_sprite_blocks_8bpp) - stmdb sp!, { r4 - r11, r14 } + push { r4 - r11, r14 } movw texel_shift_mask, #(0xFF << 1) ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] @@ -5415,8 +5406,9 @@ function(texture_sprite_blocks_8bpp) add block_ptr, block_ptr, #64 bne 0b + nop - ldmia sp!, { r4 - r11, pc } + pop { r4 - r11, pc } #undef width_rounded @@ -5481,30 +5473,30 @@ function(texture_sprite_blocks_8bpp) setup_sprites_16bpp_flush: - vpush { d0 - d3 } - - stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } - bl flush_render_block_buffer - ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } - - vpop { d0 - d3 } + push { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 } + add r1, r0, #psx_gpu_saved_tmp_offset + vstmia r1, { d0 - d3 } + bl flush_render_block_buffer + pop { r0 - r3, EXTRA_UNSAVED_REGS r12 } + add lr, r0, #psx_gpu_saved_tmp_offset + vldmia lr, { d0 - d3 } add block, psx_gpu, #psx_gpu_blocks_offset mov num_blocks, block_width - bx lr + pop { pc } function(setup_sprite_16bpp) - stmdb sp!, { r4 - r11, r14 } + push { r3 - r11, lr } ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset] - ldr v, [sp, #36] + ldr v, [sp, #4*(10+0)] add fb_ptr, fb_ptr, y, lsl #11 - ldr width, [sp, #40] + ldr width, [sp, #4*(10+1)] add fb_ptr, fb_ptr, x, lsl #1 - ldr height, [sp, #44] + ldr height, [sp, #4*(10+2)] and left_offset, u, #0x7 add texture_offset_base, u, u @@ -5574,7 +5566,7 @@ function(setup_sprite_16bpp) strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] bne 1b - ldmia sp!, { r4 - r11, pc } + pop { r3 - r11, pc } 0: add num_blocks, num_blocks, block_width @@ -5648,8 +5640,9 @@ function(setup_sprite_16bpp) strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] bne 0b + nop - ldmia sp!, { r4 - r11, pc } + pop { r3 - r11, pc } // 4x version @@ -5657,16 +5650,16 @@ function(setup_sprite_16bpp) #undef draw_mask_fb_ptr function(setup_sprite_16bpp_4x) - stmdb sp!, { r4 - r11, r14 } + push { r3 - r11, lr } ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset] - ldr v, [sp, #36] + ldr v, [sp, #4*(10+0)] add fb_ptr, fb_ptr, y, lsl #11 - ldr width, [sp, #40] + ldr width, [sp, #4*(10+1)] add fb_ptr, fb_ptr, x, lsl #1 - ldr height, [sp, #44] + ldr height, [sp, #4*(10+2)] and left_offset, u, #0x7 add texture_offset_base, u, u @@ -5735,7 +5728,7 @@ function(setup_sprite_16bpp_4x) strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] bne 1b - ldmia sp!, { r4 - r11, pc } + pop { r3 - r11, pc } 0: add num_blocks, num_blocks, block_width @@ -5793,8 +5786,9 @@ function(setup_sprite_16bpp_4x) strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] bne 0b + nop - ldmia sp!, { r4 - r11, pc } + pop { r3 - r11, pc } #undef width @@ -5839,18 +5833,18 @@ function(setup_sprite_16bpp_4x) .align 3 function(setup_sprite_untextured_512) - stmdb sp!, { r4 - r11, r14 } + push { r4 - r11, r14 } - ldr width, [sp, #40] + ldr width, [sp, #4*(9+1)] ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset] - ldr height, [sp, #44] + ldr height, [sp, #4*(9+2)] add fb_ptr, fb_ptr, y, lsl #11 add fb_ptr, fb_ptr, x, lsl #1 sub right_width, width, #1 - ldr color, [sp, #48] + ldr color, [sp, #4*(9+3)] and right_width, #7 add block_width, width, #7 @@ -5927,7 +5921,7 @@ setup_sprite_untextured_height_loop: strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] bgt setup_sprite_untextured_height_loop - ldmia sp!, { r4 - r11, pc } + pop { r4 - r11, pc } @@ -5960,7 +5954,7 @@ setup_sprite_untextured_height_loop: #define texel_block_expanded_cd q3 function(update_texture_4bpp_cache) - stmdb sp!, { r4 - r11, r14 } + push { r3 - r11, r14 } vpush { q0 - q3 } ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset] @@ -6034,7 +6028,7 @@ function(update_texture_4bpp_cache) bne 0b vpop { q0 - q3 } - ldmia sp!, { r4 - r11, pc } + pop { r3 - r11, pc } #undef current_texture_page @@ -6064,7 +6058,6 @@ function(update_texture_4bpp_cache) function(update_texture_8bpp_cache_slice) stmdb sp!, { r4 - r11, r14 } - vpush { q0 - q3 } ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset] ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset] @@ -6125,7 +6118,6 @@ function(update_texture_8bpp_cache_slice) bne 0b - vpop { q0 - q3 } ldmia sp!, { r4 - r11, pc } diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h index 7c21d31c..0243026f 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h @@ -37,6 +37,7 @@ #define psx_gpu_texture_mask_height_offset 0xfb #define psx_gpu_reciprocal_table_ptr_offset 0x108 #define psx_gpu_hacks_active_offset 0x114 +#define psx_gpu_saved_tmp_offset 0x190 #define psx_gpu_blocks_offset 0x200 #define psx_gpu_span_uvrg_offset_offset 0x2200 #define psx_gpu_span_edge_data_offset 0x4200 diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c index 740df981..1a452e6d 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c @@ -77,6 +77,8 @@ int main() //WRITE_OFFSET(f, texture_settings); WRITE_OFFSET(f, reciprocal_table_ptr); WRITE_OFFSET(f, hacks_active); + WRITE_OFFSET(f, saved_tmp); + //WRITE_OFFSET(f, saved_q4_q7); WRITE_OFFSET(f, blocks); WRITE_OFFSET(f, span_uvrg_offset); WRITE_OFFSET(f, span_edge_data); diff --git a/plugins/gpu_neon/psx_gpu_if.c b/plugins/gpu_neon/psx_gpu_if.c index 5b6a3351..f85155e3 100644 --- a/plugins/gpu_neon/psx_gpu_if.c +++ b/plugins/gpu_neon/psx_gpu_if.c @@ -43,11 +43,6 @@ int do_cmd_list(uint32_t *list, int count, { int ret; -#if defined(__arm__) && defined(NEON_BUILD) && !defined(SIMD_BUILD) - // the asm doesn't bother to save callee-save vector regs, so do it here - __asm__ __volatile__("":::"q4","q5","q6","q7"); -#endif - if (gpu.state.enhancement_active) ret = gpu_parse_enhanced(&egpu, list, count * 4, cycles_sum, cycles_last, (u32 *)last_cmd); @@ -55,10 +50,6 @@ int do_cmd_list(uint32_t *list, int count, ret = gpu_parse(&egpu, list, count * 4, cycles_sum, cycles_last, (u32 *)last_cmd); -#if defined(__arm__) && defined(NEON_BUILD) && !defined(SIMD_BUILD) - __asm__ __volatile__("":::"q4","q5","q6","q7"); -#endif - ex_regs[1] &= ~0x1ff; ex_regs[1] |= egpu.texture_settings & 0x1ff; return ret;