diff --git a/Makefile b/Makefile
index 39b5fbaf..52275723 100644
--- a/Makefile
+++ b/Makefile
@@ -229,7 +229,7 @@ plugins/dfsound/out.o: CFLAGS += -DHAVE_LIBRETRO
 endif
 
 # builtin gpu
-OBJS += plugins/gpulib/gpu.o plugins/gpulib/vout_pl.o
+OBJS += plugins/gpulib/gpu.o plugins/gpulib/vout_pl.o plugins/gpulib/prim.o
 ifeq "$(BUILTIN_GPU)" "neon"
 CFLAGS += -DGPU_NEON
 OBJS += plugins/gpu_neon/psx_gpu_if.o
@@ -272,9 +272,12 @@ OBJS += plugins/gpu_unai/old/if.o
 else
 CFLAGS += -DGPU_UNAI_NO_OLD
 endif
+plugins/gpu_unai/gpulib_if.o: plugins/gpu_unai/*.h
 plugins/gpu_unai/gpulib_if.o: CFLAGS += -DREARMED -DUSE_GPULIB=1
+ifneq ($(DEBUG), 1)
 plugins/gpu_unai/gpulib_if.o \
 plugins/gpu_unai/old/if.o: CFLAGS += -O3
+endif
 CC_LINK = $(CXX)
 endif
 
diff --git a/Makefile.libretro b/Makefile.libretro
index 06eab7d6..7ea7addb 100644
--- a/Makefile.libretro
+++ b/Makefile.libretro
@@ -356,7 +356,6 @@ else ifeq ($(platform), ctr)
 	TARGET := $(TARGET_NAME)_libretro_ctr.a
 	CFLAGS += -DARM11 -D_3DS -D__3DS__
 	CFLAGS += -DGPU_UNAI_USE_FLOATMATH -DGPU_UNAI_USE_FLOAT_DIV_MULTINV
-	CFLAGS += -DGPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE # needed on some compilers?
 	CFLAGS += -march=armv6k -mtune=mpcore -mfloat-abi=hard -marm -mfpu=vfp -mtp=soft
 	CFLAGS += -mword-relocations
 	CFLAGS += -fomit-frame-pointer
diff --git a/frontend/cspace_arm.S b/frontend/cspace_arm.S
index 3ef5083b..41b1e691 100644
--- a/frontend/cspace_arm.S
+++ b/frontend/cspace_arm.S
@@ -20,6 +20,14 @@
     orr      \rn, r12, lsl #6
 .endm
 
+.macro bgr555_to_rgb565_one_i rn1 rn2
+    and      r12, lr, \rn1, lsr #5      @ r12 = lr & (rn1 >> 5)
+    and      \rn1,lr, \rn1, lsr #10     @ rn1 = lr & (rn1 >> 10)
+    orr      r12, r11, lsl #5           @ r12 |= r11 << 5; r11 must already hold lr & rn1 (set by the caller or the previous invocation)
+    and      r11, lr, \rn2              @ r11 = lr & rn2, precomputed for the next invocation
+    orr      \rn1,r12, lsl #6           @ rn1 |= r12 << 6 -> converted result left in rn1
+.endm
+
 .macro pld_ reg offs=#0
 #ifdef HAVE_ARMV6
     pld      [\reg, \offs]
@@ -27,7 +35,6 @@
 .endm
 
 FUNCTION(bgr555_to_rgb565): @ void *dst, const void *src, int bytes
-    pld_     r1
     push     {r4-r11,lr}
     mov      lr, #0x001f
     subs     r2, #4*8
@@ -43,16 +50,17 @@ FUNCTION(bgr555_to_rgb565): @ void *dst, const void *src, int bytes
 0:
     ldmia    r1!, {r3-r10}
     subs     r2, #4*8
-    bgr555_to_rgb565_one r3
-
-    pld_     r1, #32*2
-    bgr555_to_rgb565_one r4
-    bgr555_to_rgb565_one r5
-    bgr555_to_rgb565_one r6
-    bgr555_to_rgb565_one r7
-    bgr555_to_rgb565_one r8
-    bgr555_to_rgb565_one r9
-    bgr555_to_rgb565_one r10
+    bic      r12, r1, #0x1f
+    pld_     r12, #32*1
+    and      r11, lr, r3
+    bgr555_to_rgb565_one_i r3 r4
+    bgr555_to_rgb565_one_i r4 r5
+    bgr555_to_rgb565_one_i r5 r6
+    bgr555_to_rgb565_one_i r6 r7
+    bgr555_to_rgb565_one_i r7 r8
+    bgr555_to_rgb565_one_i r8 r9
+    bgr555_to_rgb565_one_i r9 r10
+    bgr555_to_rgb565_one_i r10 r10
     stmia    r0!, {r3-r10}
     bge      0b
 
diff --git a/include/compiler_features.h b/include/compiler_features.h
index 21549ddf..77114efb 100644
--- a/include/compiler_features.h
+++ b/include/compiler_features.h
@@ -2,6 +2,7 @@
 #ifdef __GNUC__
 # define likely(x)       __builtin_expect((x),1)
 # define unlikely(x)     __builtin_expect((x),0)
+# define preload         __builtin_prefetch
 # ifdef __clang__
 #  define noinline       __attribute__((noinline))
 # else
@@ -11,6 +12,7 @@
 #else
 # define likely(x)       (x)
 # define unlikely(x)     (x)
+# define preload(...)    ((void)0) /* no prefetch builtin here: calls become a no-op */
 # define noinline
 # define attr_unused
 #endif
diff --git a/libpcsxcore/database.c b/libpcsxcore/database.c
index 76951a68..a393ee92 100644
--- a/libpcsxcore/database.c
+++ b/libpcsxcore/database.c
@@ -82,6 +82,18 @@ static const char * const fractional_Framerate_hack_db[] =
 	"SCUS94425", "SCES02104",
 };
 
+static const char * const f1_hack_db[] = /* disc IDs registered in hack_db through HACK_ENTRY(f1, f1_hack_db) */
+{ /* Apply_Hacks_Cdrom() reads Config.hacks.f1 to select NDHACK_THREAD_FORCE and LIGHTREC_HACK_INV_DMA_ONLY */
+	/* Formula One Arcade */
+	"SCES03886",
+	/* Formula One '99 */
+	"SLUS00870", "SCPS10101", "SCES01979", "SLES01979",
+	/* Formula One 2000 */
+	"SLUS01134", "SCES02777", "SCES02778", "SCES02779",
+	/* Formula One 2001 */
+	"SCES03404", "SCES03423", "SCES03424", "SCES03524",
+};
+
 #define HACK_ENTRY(var, list) \
 	{ #var, &Config.hacks.var, list, ARRAY_SIZE(list) }
 
@@ -100,6 +112,7 @@ hack_db[] =
 	HACK_ENTRY(gpu_timing1024, dualshock_timing1024_hack_db),
 	HACK_ENTRY(dualshock_init_analog, dualshock_init_analog_hack_db),
 	HACK_ENTRY(fractional_Framerate, fractional_Framerate_hack_db),
+	HACK_ENTRY(f1, f1_hack_db),
 };
 
 static const struct
@@ -142,35 +155,6 @@ cycle_multiplier_overrides[] =
 	{ 153, { "SLUS00943" } },
 };
 
-static const struct
-{
-	const char * const id;
-	u32 hacks;
-}
-lightrec_hacks_db[] =
-{
-	/* Formula One Arcade */
-	{ "SCES03886", LIGHTREC_HACK_INV_DMA_ONLY },
-
-	/* Formula One '99 */
-	{ "SLUS00870", LIGHTREC_HACK_INV_DMA_ONLY },
-	{ "SCPS10101", LIGHTREC_HACK_INV_DMA_ONLY },
-	{ "SCES01979", LIGHTREC_HACK_INV_DMA_ONLY },
-	{ "SLES01979", LIGHTREC_HACK_INV_DMA_ONLY },
-
-	/* Formula One 2000 */
-	{ "SLUS01134", LIGHTREC_HACK_INV_DMA_ONLY },
-	{ "SCES02777", LIGHTREC_HACK_INV_DMA_ONLY },
-	{ "SCES02778", LIGHTREC_HACK_INV_DMA_ONLY },
-	{ "SCES02779", LIGHTREC_HACK_INV_DMA_ONLY },
-
-	/* Formula One 2001 */
-	{ "SCES03404", LIGHTREC_HACK_INV_DMA_ONLY },
-	{ "SCES03423", LIGHTREC_HACK_INV_DMA_ONLY },
-	{ "SCES03424", LIGHTREC_HACK_INV_DMA_ONLY },
-	{ "SCES03524", LIGHTREC_HACK_INV_DMA_ONLY },
-};
-
 /* Function for automatic patching according to GameID. */
 void Apply_Hacks_Cdrom(void)
 {
@@ -211,6 +195,8 @@ void Apply_Hacks_Cdrom(void)
 
 	/* Dynarec game-specific hacks */
 	ndrc_g.hacks_pergame = 0;
+	if (Config.hacks.f1)
+		ndrc_g.hacks_pergame |= NDHACK_THREAD_FORCE; // force without *_ON -> off
 	Config.cycle_multiplier_override = 0;
 
 	for (i = 0; i < ARRAY_SIZE(cycle_multiplier_overrides); i++)
@@ -229,15 +215,12 @@ void Apply_Hacks_Cdrom(void)
 		}
 	}
 
-	lightrec_hacks = 0;
-
-	for (i = 0; drc_is_lightrec() && i < ARRAY_SIZE(lightrec_hacks_db); i++) {
-		if (strcmp(CdromId, lightrec_hacks_db[i].id) == 0)
-		{
-			lightrec_hacks = lightrec_hacks_db[i].hacks;
+	if (drc_is_lightrec()) {
+		lightrec_hacks = 0;
+		if (Config.hacks.f1)
+			lightrec_hacks |= LIGHTREC_HACK_INV_DMA_ONLY;
+		if (lightrec_hacks)
 			SysPrintf("using lightrec_hacks: 0x%x\n", lightrec_hacks);
-			break;
-		}
 	}
 }
 
diff --git a/libpcsxcore/new_dynarec/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c
index 0dcec554..a19bd2dd 100644
--- a/libpcsxcore/new_dynarec/emu_if.c
+++ b/libpcsxcore/new_dynarec/emu_if.c
@@ -303,10 +303,10 @@ static void ari64_apply_config()
 	else
 		ndrc_g.hacks &= ~NDHACK_NO_STALLS;
 
-	thread_changed = (ndrc_g.hacks ^ ndrc_g.hacks_old)
+	thread_changed = ((ndrc_g.hacks | ndrc_g.hacks_pergame) ^ ndrc_g.hacks_old)
 		& (NDHACK_THREAD_FORCE | NDHACK_THREAD_FORCE_ON);
 	if (Config.cycle_multiplier != ndrc_g.cycle_multiplier_old
-	    || ndrc_g.hacks != ndrc_g.hacks_old)
+	    || (ndrc_g.hacks | ndrc_g.hacks_pergame) != ndrc_g.hacks_old)
 	{
 		new_dynarec_clear_full();
 	}
@@ -485,7 +485,9 @@ static void ari64_thread_init(void)
 {
 	int enable;
 
-	if (ndrc_g.hacks & NDHACK_THREAD_FORCE)
+	if (ndrc_g.hacks_pergame & NDHACK_THREAD_FORCE)
+		enable = 0;
+	else if (ndrc_g.hacks & NDHACK_THREAD_FORCE)
 		enable = ndrc_g.hacks & NDHACK_THREAD_FORCE_ON;
 	else {
 		u32 cpu_count = cpu_features_get_core_amount();
diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c
index e19e4361..e247faf2 100644
--- a/libpcsxcore/new_dynarec/new_dynarec.c
+++ b/libpcsxcore/new_dynarec/new_dynarec.c
@@ -6292,13 +6292,13 @@ void new_dynarec_clear_full(void)
   stat_clear(stat_links);
 
   if (ndrc_g.cycle_multiplier_old != Config.cycle_multiplier
-      || ndrc_g.hacks_old != ndrc_g.hacks)
+      || ndrc_g.hacks_old != (ndrc_g.hacks | ndrc_g.hacks_pergame))
   {
     SysPrintf("ndrc config: mul=%d, ha=%x, pex=%d\n",
       get_cycle_multiplier(), ndrc_g.hacks, Config.PreciseExceptions);
   }
   ndrc_g.cycle_multiplier_old = Config.cycle_multiplier;
-  ndrc_g.hacks_old = ndrc_g.hacks;
+  ndrc_g.hacks_old = ndrc_g.hacks | ndrc_g.hacks_pergame;
 }
 
 static int pgsize(void)
diff --git a/libpcsxcore/psxcommon.h b/libpcsxcore/psxcommon.h
index 8a0ac703..0a1ef707 100644
--- a/libpcsxcore/psxcommon.h
+++ b/libpcsxcore/psxcommon.h
@@ -156,6 +156,7 @@ typedef struct {
 		boolean dualshock_init_analog;
 		boolean gpu_timing1024;
 		boolean fractional_Framerate;
+		boolean f1;
 	} hacks;
 } PcsxConfig;
 
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
index e78feaf2..1fa06a15 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
@@ -16,6 +16,7 @@
 
 #include "common.h"
 #include "../../gpulib/gpu_timing.h"
+#include "../../gpulib/gpu.h"
 
 #ifndef command_lengths
 const u8 command_lengths[256] =
@@ -245,12 +246,27 @@ static void do_fill(psx_gpu_struct *psx_gpu, u32 x, u32 y,
 #define SET_Ex(r, v)
 #endif
 
+static void textured_sprite(psx_gpu_struct *psx_gpu, const u32 *list, // shared body for textured sprite commands (0x64.., 0x74.., 0x7C..) and simplified quads
+  s32 width, s32 height, u32 *cpu_cycles_sum, u32 *cpu_cycles) // list[0]=command/color, list[1]=xy, list[2]=uv+clut; cycle counters are updated in place
+{
+  s32 x = sign_extend_11bit(list[1] + psx_gpu->offset_x); // x: list[1] plus the drawing offset, then sign_extend_11bit
+  s32 y = sign_extend_11bit((list[1] >> 16) + psx_gpu->offset_y); // y: upper half of list[1] plus the drawing offset
+  u8 v = (list[2] >> 8) & 0xff; // texture v = bits 8-15 of list[2]
+  u8 u = list[2] & 0xff; // texture u = bits 0-7 of list[2]
+
+  set_clut(psx_gpu, list[2] >> 16); // clut selector = upper half of list[2]
+
+  render_sprite(psx_gpu, x, y, u, v, &width, &height, list[0] >> 24, list[0]); // top byte of list[0] is passed where callers used to pass current_command
+  gput_sum(*cpu_cycles_sum, *cpu_cycles, gput_sprite(width, height)); // NOTE(review): width/height went in by pointer, presumably adjusted by render_sprite - confirm
+}
+
 u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
  s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command)
 {
   vertex_struct vertexes[4] __attribute__((aligned(16))) = {};
   u32 current_command = 0, command_length;
   u32 cpu_cycles_sum = 0, cpu_cycles = *cpu_cycles_last;
+  u32 siplified_prim[4*4];
 
   u32 *list_start = list;
   u32 *list_end = list + (size / 4);
@@ -328,8 +344,19 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   
       case 0x2C ... 0x2F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[9]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[4] >> 16);
+        if ((simplified_count = prim_try_simplify_quad_t(siplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &siplified_prim[i * 4];
+            textured_sprite(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
         set_triangle_color(psx_gpu, list[0] & 0xFFFFFF);
   
         get_vertex_data_xy_uv(0, 2);   
@@ -383,8 +410,19 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   
       case 0x3C ... 0x3F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[11]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[5] >> 16);
+        if ((simplified_count = prim_try_simplify_quad_gt(siplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &siplified_prim[i * 4];
+            textured_sprite(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
   
         get_vertex_data_xy_uv_rgb(0, 0);
         get_vertex_data_xy_uv_rgb(1, 6);
@@ -525,23 +563,12 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
         break;
       }
-  
-      case 0x64 ... 0x67:
-      {        
-        u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
-        s32 width = list_s16[6] & 0x3FF;
-        s32 height = list_s16[7] & 0x1FF;
-
-        set_clut(psx_gpu, list_s16[5]);
 
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
+      case 0x64 ... 0x67:
+        textured_sprite(psx_gpu, list, list[3] & 0x3FF, (list[3] >> 16) & 0x1FF,
+          &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x68 ... 0x6B:
       {
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -565,22 +592,11 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
         break;
       }
-  
-      case 0x74 ... 0x77:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
-        s32 width = 8, height = 8;
 
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
+      case 0x74 ... 0x77:
+        textured_sprite(psx_gpu, list, 8, 8, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x78 ... 0x7B:
       {        
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -594,19 +610,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
       }
   
       case 0x7C ... 0x7F:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
-        s32 width = 16, height = 16;
-
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
+        textured_sprite(psx_gpu, list, 16, 16, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
   
 #ifdef PCSX
       case 0x1F:                   //  irq?
@@ -1155,12 +1160,31 @@ static void do_sprite_enhanced(psx_gpu_struct *psx_gpu, int x, int y,
 }
 #endif
 
+static void textured_sprite_enh(psx_gpu_struct *psx_gpu, const u32 *list, // textured_sprite variant for gpu_parse_enhanced: also draws through do_sprite_enhanced
+  s32 width, s32 height, u32 *cpu_cycles_sum, u32 *cpu_cycles) // list[0]=command/color, list[1]=xy, list[2]=uv+clut; cycle counters are updated in place
+{
+  s32 x = sign_extend_11bit(list[1] + psx_gpu->offset_x); // x: list[1] plus the drawing offset, then sign_extend_11bit
+  s32 y = sign_extend_11bit((list[1] >> 16) + psx_gpu->offset_y); // y: upper half of list[1] plus the drawing offset
+  s32 width_b = width, height_b = height; // keep the requested size; width/height are handed to render_sprite by pointer
+  u8 v = (list[2] >> 8) & 0xff; // texture v = bits 8-15 of list[2]
+  u8 u = list[2] & 0xff; // texture u = bits 0-7 of list[2]
+
+  set_clut(psx_gpu, list[2] >> 16); // clut selector = upper half of list[2]
+
+  render_sprite(psx_gpu, x, y, u, v, &width, &height, list[0] >> 24, list[0]); // top byte of list[0] is passed where callers used to pass current_command
+  gput_sum(*cpu_cycles_sum, *cpu_cycles, gput_sprite(width, height)); // timing uses width/height as left by render_sprite
+
+  if (check_enhanced_range(psx_gpu, x, x + width)) // range test uses the post-render width, not width_b
+    do_sprite_enhanced(psx_gpu, x, y, u, v, width_b, height_b, list[0]); // enhanced draw gets the original requested size
+}
+
 u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
  s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command)
 {
   vertex_struct vertexes[4] __attribute__((aligned(16))) = {};
   u32 current_command = 0, command_length;
   u32 cpu_cycles_sum = 0, cpu_cycles = *cpu_cycles_last;
+  u32 siplified_prim[4*4];
 
   u32 *list_start = list;
   u32 *list_end = list + (size / 4);
@@ -1265,8 +1289,19 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   
       case 0x2C ... 0x2F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[9]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[4] >> 16);
+        if ((simplified_count = prim_try_simplify_quad_t(siplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &siplified_prim[i * 4];
+            textured_sprite_enh(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
         set_triangle_color(psx_gpu, list[0] & 0xFFFFFF);
   
         get_vertex_data_xy_uv(0, 2);   
@@ -1318,8 +1353,19 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   
       case 0x3C ... 0x3F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[11]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[5] >> 16);
+        if ((simplified_count = prim_try_simplify_quad_gt(siplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &siplified_prim[i * 4];
+            textured_sprite_enh(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
   
         get_vertex_data_xy_uv_rgb(0, 0);
         get_vertex_data_xy_uv_rgb(1, 6);
@@ -1475,30 +1521,12 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         }
         break;
       }
-  
-      case 0x64 ... 0x67:
-      {        
-        u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u8 u = list_s16[4];
-        u8 v = list_s16[4] >> 8;
-        s32 width = list_s16[6] & 0x3FF;
-        s32 height = list_s16[7] & 0x1FF;
-
-        set_clut(psx_gpu, list_s16[5]);
 
-        render_sprite(psx_gpu, x, y, u, v,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
-
-        if (check_enhanced_range(psx_gpu, x, x + width)) {
-          width = list_s16[6] & 0x3FF;
-          height = list_s16[7] & 0x1FF;
-          do_sprite_enhanced(psx_gpu, x, y, u, v, width, height, list[0]);
-        }
+      case 0x64 ... 0x67:
+        textured_sprite_enh(psx_gpu, list, list[3] & 0x3FF, (list[3] >> 16) & 0x1FF,
+          &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x68 ... 0x6B:
       {
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -1528,26 +1556,11 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
           do_sprite_enhanced(psx_gpu, x, y, 0, 0, 8, 8, list[0]);
         break;
       }
-  
-      case 0x74 ... 0x77:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u8 u = list_s16[4];
-        u8 v = list_s16[4] >> 8;
-        s32 width = 8, height = 8;
 
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, u, v,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
-
-        if (check_enhanced_range(psx_gpu, x, x + 8))
-          do_sprite_enhanced(psx_gpu, x, y, u, v, 8, 8, list[0]);
+      case 0x74 ... 0x77:
+        textured_sprite_enh(psx_gpu, list, 8, 8, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x78 ... 0x7B:
       {        
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -1562,25 +1575,10 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
           do_sprite_enhanced(psx_gpu, x, y, 0, 0, 16, 16, list[0]);
         break;
       }
-  
-      case 0x7C ... 0x7F:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u8 u = list_s16[4];
-        u8 v = list_s16[4] >> 8;
-        s32 width = 16, height = 16;
 
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, u, v,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
-
-        if (check_enhanced_range(psx_gpu, x, x + 16))
-          do_sprite_enhanced(psx_gpu, x, y, u, v, 16, 16, list[0]);
+      case 0x7C ... 0x7F:
+        textured_sprite_enh(psx_gpu, list, 16, 16, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
 
       case 0x80 ... 0x9F:          //  vid -> vid
       case 0xA0 ... 0xBF:          //  sys -> vid
diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S
index 93269932..a516f08f 100644
--- a/plugins/gpu_unai/gpu_arm.S
+++ b/plugins/gpu_unai/gpu_arm.S
@@ -7,6 +7,7 @@
 
 #include "arm_features.h"
 
+.syntax unified
 .text
 .align 2
 
@@ -16,6 +17,89 @@
 #endif
 .endm
 
+#ifdef HAVE_ARMV6
+
+.macro modulate rp mbr mg t0 t1 t2
+    and     \t0, \rp, #0x001f    @ split pixel rp into its three 5-bit fields: bits 0-4
+    and     \t1, \rp, #0x03e0    @ bits 5-9
+    and     \t2, \rp, #0x7c00    @ bits 10-14
+    smulbb  \t0, \t0, \mbr       @ -> 0000 0000 0000 orrr  rrxx xxxx xxxx xxxx
+    smulbt  \t1, \t1, \mg        @ -> 0000 000o gggg gxxx  xxxx xxxx xxx0 0000
+    smulbt  \t2, \t2, \mbr       @ -> 00ob bbbb xxxx xxxx  xxxx xx00 0000 0000
+    and     \rp, \rp, #0x8000    @ retain msb
+    usat    \t0, #5, \t0, asr #14 @ shift the product down and saturate to 5 bits
+    usat    \t1, #5, \t1, asr #19 @ same for the second field
+    usat    \t2, #5, \t2, asr #24 @ same for the third field
+    orr     \rp, \rp, \t0        @ reassemble: msb | field0
+    orr     \rp, \rp, \t1, lsl #5 @ | field1 << 5
+    orr     \rp, \rp, \t2, lsl #10 @ | field2 << 10
+.endm
+
+@ http://www.slack.net/~ant/info/rgb_mixing.html
+@ p0 = (p0 + p1) / 2; p1 |= 0x8000
+@ msb of input p0 is assumed to be set
+.macro semitrans0 p0 p1 t
+    eor     \t,  \p0, \p1        @ bits in which p0 and p1 differ
+    and     \t,  \t,  #0x0420    @ keep only bit 5 and bit 10 (lowest bit of the two upper fields)
+    sub     \p0, \p0, \t         @ remove them so halving cannot leak into the field below
+    orr     \p1, \p1, #0x8000    @ p1 is modified: msb set
+    uhadd16 \p0, \p0, \p1        @ halving add on each 16-bit lane -> (p0 + p1) / 2
+.endm
+
+.macro semitrans0p p0 p1 m421 t
+    eor     \t,  \p0, \p1          @ bits in which p0 and p1 differ
+    and     \t,  \t,  \m421        @ keep only the bits selected by m421 (callers pass the 0421 mask)
+    add     \p0, \p0, \p1          @ plain 32-bit sum of both packed halfwords
+    uhsub16 \p0, \p0, \t           @ sub because of borrow into hi16
+.endm
+
+@ p0 - {p1|r,g,b}   // p1* - premasked rgb
+.macro semitrans2p p0 p1r p1g p1b m1f t0 t1
+    and     \t0, \p0, \m1f           @ field at bits 0-4 of p0
+    and     \t1, \p0, \m1f, lsl #5   @ field at bits 5-9
+    and     \p0, \p0, \m1f, lsl #10  @ field at bits 10-14 (anything above is dropped)
+    uqsub16 \t0, \t0, \p1r           @ saturating subtract per 16-bit lane, clamps at 0
+    uqsub16 \t1, \t1, \p1g           @ same for the second field
+    uqsub16 \p0, \p0, \p1b           @ same for the third field
+    orr     \t0, \t0, \t1            @ merge the three fields back together
+    orr     \p0, \p0, \t0            @ result in p0
+.endm
+
+#else
+
+@ msb of input p0 is assumed to be set
+.macro semitrans0 p0 p1 t
+    eor     \t,  \p0, \p1        @ bits in which p0 and p1 differ
+    and     \t,  \t,  #0x0420    @ keep only bit 5 and bit 10
+    orr     \p1, \p1, #0x8000    @ p1 is modified: msb set
+    sub     \p0, \p0, \t         @ remove the differing low bits before summing
+    add     \p0, \p0, \p1        @ 32-bit sum (no halving-add instruction on this path)
+    orr     \p0, \p0, #0x10000   @ force bit 16 so that bit 15 is set after the shift
+    mov     \p0, \p0, lsr #1     @ halve
+.endm
+
+.macro semitrans0p p0 p1 m421 t
+    eor     \t,  \p0, \p1        @ bits in which p0 and p1 differ
+    and     \t,  \t,  \m421      @ keep only the bits selected by m421
+    add     \p0, \p0, \p1        @ 32-bit sum
+    sub     \p0, \p0, \t         @ drop the differing low bits
+    mov     \p0, \p0, lsr #1     @ halve the whole word
+.endm
+
+#endif // HAVE_ARMV6
+
+.macro semitrans13p p0 p1 m421 t0
+    add     \t0, \p0, \p1            @ raw sum; a field that overflows carries into the next field
+    eor     \p0, \p0, \p1            @ carry-less sum
+    and     \p0, \p0, \m421          @ low_bits
+    sub     \p0, \t0, \p0            @ leaves the carry-in at each masked bit position
+    and     \p0, \p0, \m421, lsl #5  @ carries
+    sub     \t0, \t0, \p0            @ modulo
+    sub     \p0, \p0, \p0, lsr #5    @ clamp
+    orr     \p0, \t0, \p0            @ overflowed fields become all ones
+.endm
+
+
 @ in: r0=dst, r2=pal, r12=0x1e
 @ trashes r6-r8,lr,flags
 .macro do_4x_4bpp rs ibase obase
@@ -32,13 +116,13 @@
     ldrh    r8, [r2, r8]
     ldrh    lr, [r2, lr]
     tst     r6, r6
-    strneh  r6, [r0, #\obase+0]
+    strhne  r6, [r0, #\obase+0]
     tst     r7, r7
-    strneh  r7, [r0, #\obase+2]
+    strhne  r7, [r0, #\obase+2]
     tst     r8, r8
-    strneh  r8, [r0, #\obase+4]
+    strhne  r8, [r0, #\obase+4]
     tst     lr, lr
-    strneh  lr, [r0, #\obase+6]
+    strhne  lr, [r0, #\obase+6]
 .endm
 
 @ in: r0=dst, r2=pal, r12=0x1fe
@@ -53,25 +137,112 @@
     ldrh     r8, [r2, r8]
     ldrh     \rs,[r2, \rs]
     tst      r6, r6
-    strneh   r6, [r0, #0]
+    strhne   r6, [r0, #0]
     tst      r7, r7
-    strneh   r7, [r0, #2]
+    strhne   r7, [r0, #2]
     tst      r8, r8
-    strneh   r8, [r0, #4]
+    strhne   r8, [r0, #4]
     tst      \rs,\rs
-    strneh   \rs,[r0, #6]
+    strhne   \rs,[r0, #6]
+.endm
+
+
+@ (void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn)
+@ see also poly_untex_st_m
+.macro tile_driver_st_m name semit
+FUNCTION(\name):
+    .cfi_startproc
+    stmfd   sp!, {r4-r9,lr}        @ 7 words pushed, matching the CFI offsets below
+    .cfi_def_cfa_offset 4*7
+    .cfi_rel_offset lr, 4*6
+    ldr     r7, [r3, #0x18]        @ y0
+    ldr     r8, [r3, #0x1c]        @ y1
+.if \semit != 2
+    mov     r4, #0x8000
+    orr     r4, r4, r4, lsl #16    @ mask 8000
+    mov     r6, #0x420
+    orr     r6, r6, #1
+    orr     r6, r6, r6, lsl #16    @ mask 0421
+.endif
+.if \semit == 2
+    and     r4, r1, #0x03e0        @ split color c (r1) into its three fields
+    and     r5, r1, #0x7c00
+    and     r1, r1, #0x001f        @ r1 keeps only bits 0-4
+    orr     r4, r4, r4, lsl #16    @ premasked g
+    orr     r5, r5, r5, lsl #16    @ premasked b
+    mov     r6, #0x00001f
+    orr     r6, #0x1f0000          @ mask
+.elseif \semit == 3
+    mov     r1, r1, lsr #2         @ c >> 2
+    bic     r1, r1, #(0x0c60>>2)   @ clear bits 3-4 and 8-9, which were shifted in from the field above
+.endif
+    orr     r1, r1, r1, lsl #16    @ replicate r1 into both halfwords
+    sub     r3, r8, r7             @ h
+    mov     r7, r2                 @ save w
+0:
+    ldrh    r8, [r0]               @ line start: load one dst pixel
+    pld_    r0, #2048
+    tst     r0, #2                 @ is dst off a 4-byte boundary?
+    beq     1f                     @ aligned: go straight to the pair loop
+    sub     r2, #1                 @ one pixel is consumed by the leading halfword
+.if \semit == 0
+    bic     r8, r8, r4
+    semitrans0p  r8, r1, r6, lr
+.elseif \semit == 1 || \semit == 3
+    bic     r8, r8, r4
+    semitrans13p r8, r1, r6, lr
+.elseif \semit == 2
+    semitrans2p  r8, r1, r4, r5, r6, r9, lr
+.endif
+    strh    r8, [r0], #2           @ store the leading pixel, dst now 4-byte aligned
+1:
+    ldr     r8, [r0]               @ load two dst pixels at once
+    pld_    r0, #32
+    subs    r2, r2, #2             @ flags from here decide the store and loop below
+.if \semit == 0
+    bic     r8, r8, r4
+    semitrans0p  r8, r1, r6, lr
+.elseif \semit == 1 || \semit == 3
+    bic     r8, r8, r4
+    semitrans13p r8, r1, r6, lr
+.elseif \semit == 2
+    semitrans2p  r8, r1, r4, r5, r6, r9, lr
+.endif
+    strpl   r8, [r0], #4           @ store the pair only if at least two pixels were left
+    bpl     1b
+2:
+    tst     r2, #1                 @ r2 is -1 here when exactly one pixel remains
+    strhne  r8, [r0], #2           @ store the low halfword of the already blended word
+    mov     r2, r7                 @ w
+    add     r0, r0, #2048          @ advance dst by 2048 bytes
+    sub     r0, r0, r7, lsl #1     @ and rewind by the w*2 bytes written on this line
+    subs    r3, r3, #1             @ one line done
+    bgt     0b
+
+    ldmfd   sp!, {r4-r9,pc}
+    .cfi_endproc
+.endm
 
-.global sprite_4bpp_x16_asm @ (u16 *d, void *s, u16 *pal, int lines)
+
+tile_driver_st_m tile_driver_st0_asm, 0
+tile_driver_st_m tile_driver_st1_asm, 1
+tile_driver_st_m tile_driver_st3_asm, 3
+#ifdef HAVE_ARMV6
+tile_driver_st_m tile_driver_st2_asm, 2
+#endif
+
+@ (u16 *d, void *s, u16 *pal, int lines)
 sprite_4bpp_x16_asm_:
-    ldr     r2, [r3]               @ pal
-    ldr     r3, [r3, #0x1c]        @ lines
-sprite_4bpp_x16_asm:
+    ldr     r12,[r3, #0x18]        @ y0
+    ldr     r2, [r3, #0x04]        @ pal
+    ldr     r3, [r3, #0x1c]        @ y1
+    sub     r3, r3, r12
+FUNCTION(sprite_4bpp_x16_asm):
     .cfi_startproc
     stmfd   sp!, {r4-r8,lr}
     .cfi_def_cfa_offset 4*6
     .cfi_rel_offset lr, 4*5
-    mov     r12, #0x1e             @ empty pixel
+    mov     r12, #0x1e
 
 0:
     ldmia   r1, {r4,r5}
@@ -98,15 +269,17 @@ sprite_4bpp_x16_asm:
 .if \is8bpp
     orr     r12, r12, #0x1f0   @ mask=0x01fe
 .endif
-    ldr     r4, [r3, #4]       @ u0
-    ldr     r5, [r3, #0x1c]    @ h
+    ldr     r4, [r3, #0x08]    @ u
+    ldr     r5, [r3, #0x1c]    @ v1
+    ldr     r6, [r3, #0x18]    @ v0
     and     r4, r4, #((8 >> \is8bpp) - 1)
+    sub     r5, r5, r6
     sub     r5, r5, #1
     orr     r5, r4, r5, lsl #8 @ ((h-1) << 8) | u0_fraction
     mov     r9, r2             @ saved_w
     mov     r10, r0            @ saved_dst
     mov     r11, r1            @ saved_src
-    ldr     r2, [r3]           @ pal
+    ldr     r2, [r3, #0x04]    @ pal
 11: @ line_loop:
     pld_    r11, #2048
     mov     r0, r10
@@ -151,10 +324,10 @@ sprite_4bpp_x16_asm:
     b       12b @ return from fractional_u
 .endm
 
-.global sprite_driver_4bpp_asm @ (u16 *d, const void *s, int width, spriteDriverArg)
-sprite_driver_4bpp_asm:
+@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *)
+FUNCTION(sprite_driver_4bpp_asm):
     .cfi_startproc
-    ldr     r12, [r3, #4]      @ u0
+    ldr     r12, [r3, #8]      @ u
     mov     r12, r12, lsl #29
     orr     r12, r12, r2       @ w
     cmp     r12, #16
@@ -175,15 +348,15 @@ sprite_driver_4bpp_asm:
     ldrh    r7, [r2, r7]
     add     r0, r0, #2
     tst     r7, r7
-    strneh  r7, [r0, #-2]
+    strhne  r7, [r0, #-2]
     subs    r8, r8, #1
     bgt     0b
     sprite_driver_part3
     .cfi_endproc
 
 
-.global sprite_driver_8bpp_asm @ (u16 *d, const void *s, int width, spriteDriverArg)
-sprite_driver_8bpp_asm:
+@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *)
+FUNCTION(sprite_driver_8bpp_asm):
     .cfi_startproc
     sprite_driver_part1 1
 0:
@@ -200,11 +373,425 @@ sprite_driver_8bpp_asm:
     ldrh    r7, [r2, r7]
     add     r0, r0, #2
     tst     r7, r7
-    strneh  r7, [r0, #-2]
+    strhne  r7, [r0, #-2]
     subs    r8, r8, #1
     bgt     0b
     sprite_driver_part3
     .cfi_endproc
 
 
+@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *); semit: -1 = plain store, 0 = semitrans0, 1 = add + saturate
+.macro sprite_driver_l_st name bpp light semit
+FUNCTION(\name):
+    .cfi_startproc
+    stmfd   sp!, {r4-r11,lr}   @ 9 words pushed, lr is the topmost one
+    .cfi_def_cfa_offset 4*9
+    .cfi_rel_offset lr, 4*8
+    ldr     r5, [r3, #0x18]    @ y0
+    ldr     r7, [r3, #0x1c]    @ y1
+    ldr     r8, [r3, #0x20]    @ rbg5
+    mov     r6, r2             @ saved_w
+    ldr     r2, [r3, #0x04]    @ pal
+    ldr     r10,[r3, #0x08]    @ u
+    ldr     r11,[r3, #0x10]    @ u_msk
+    sub     r5, r7, r5         @ h
+    mov     r7, r8, lsl #(8+2) @ 0bbb bb00 0ggg gg00 0rrr rr00 0000 0000
+    mov     r8, r8, lsl #(16+2)@ 0ggg gg00 ...
+    mov     r3, r11,lsr #10
+    orr     r6, r3, r6, lsl #16 @ (w << 16) | u_mask
+    mov     r3, r6
+    and     r10,r10,r6
+
+3: @ line_loop:
+.if \bpp == 4
+    add     r9, r1, r10, lsr #1
+.elseif \bpp == 8
+    add     r9, r1, r10
+    pld_    r9, #2048
+.endif
+0: @ pixel_loop:
+.if \bpp == 4
+    ldrb    r4, [r1, r10, lsr #1]
+.elseif \bpp == 8
+    ldrb    r4, [r1, r10]
+.endif
+    subs    r3, r3, #1<<16     @ pixel counter lives in the upper half of r3
+    bmi     1f                 @ line finished
+.if \bpp == 4
+    tst     r10, #1            @ odd u: texel is the high nibble
+    movne   r4, r4, lsr #3
+    addeq   r4, r4, r4
+    and     r4, r4, #0x1e      @ nibble * 2 = byte offset into pal
+.elseif \bpp == 8
+    add     r4, r4, r4         @ <<= 1
+.endif
+    ldrsh   r12,[r2, r4]       @ pal lookup, sign extended so bit15 drives the mi/pl tests
+    add     r10,r10,#1
+    and     r10,r10,r6         @ wrap u
+    add     r0, r0, #2
+    tst     r12,r12
+    beq     0b                 @ zero texel: dst is left untouched
+.if \light && \semit != 1
+    modulate r12, r7, r8, r4, r9, lr
+.endif
+.if \semit == 0
+    ldrhmi  lr, [r0, #-2]      @ bit15 set: dst is needed for blending
+    strhpl  r12,[r0, #-2]      @ bit15 clear: plain store
+    bpl     0b
+    semitrans0 r12, lr, r9
+.elseif \light && \semit == 1
+    and     r4,  r12, #0x001f
+    and     r9,  r12, #0x03e0
+    and     r12, r12, #0x7c00
+    ldrhmi  r11, [r0, #-2]     @ dst, loaded only when bit15 of the texel is set
+    smulbb  r4,  r4,  r7       @ -> 0000 0000 0000 orrr  rrxx xxxx xxxx xxxx
+    smulbt  r9,  r9,  r8       @ -> 0000 000o gggg gxxx  xxxx xxxx xxx0 0000
+    smulbt  r12, r12, r7       @ -> 00ob bbbb xxxx xxxx  xxxx xx00 0000 0000
+    and     r8,  r11, #0x001f
+    and     lr,  r11, #0x03e0
+    and     r11, r11, #0x7c00
+    addmi   r4,  r4,  r8,  lsl #14 @ add dst channels only when blending
+    addmi   r9,  r9,  lr,  lsl #14
+    addmi   r12, r12, r11, lsl #14
+    usat    r4,  #5,  r4,  asr #14 @ saturate each channel to 5 bits
+    usat    r9,  #5,  r9,  asr #19
+    usat    r12, #5,  r12, asr #24
+    orrmi   r4,  r4,  #0x8000
+    orr     r4,  r4,  r9,  lsl #5
+    orr     r12, r4,  r12, lsl #10
+    mov     r8,  r7,  lsl #8       @ restore r8
+.endif
+    strh    r12,[r0, #-2]
+    b       0b
+1:
+    add     r0, r0, #2048          @ advance dst and src by 2048 bytes (one row)
+    add     r1, r1, #2048
+    sub     r0, r0, r6, lsr #15    @ dst
+    sub     r10,r10,r6, lsr #16    @ u
+    mov     r3, r6                 @ (w << 16) | u_mask
+    and     r10,r6, r10
+    subs    r5, r5, #1
+    and     r10,r10,#0xff
+    bgt     3b @ line_loop
+
+    ldmfd   sp!, {r4-r11,pc}
+    .cfi_endproc
+.endm
+
+sprite_driver_l_st sprite_driver_4bpp_l0_std_asm, 4, 0, -1
+sprite_driver_l_st sprite_driver_4bpp_l0_st0_asm, 4, 0,  0
+sprite_driver_l_st sprite_driver_8bpp_l0_std_asm, 8, 0, -1
+sprite_driver_l_st sprite_driver_8bpp_l0_st0_asm, 8, 0,  0
+
+#ifdef HAVE_ARMV6
+
+sprite_driver_l_st sprite_driver_4bpp_l1_std_asm, 4, 1, -1
+sprite_driver_l_st sprite_driver_4bpp_l1_st0_asm, 4, 1,  0
+sprite_driver_l_st sprite_driver_4bpp_l1_st1_asm, 4, 1,  1
+sprite_driver_l_st sprite_driver_8bpp_l1_std_asm, 8, 1, -1
+sprite_driver_l_st sprite_driver_8bpp_l1_st0_asm, 8, 1,  0
+sprite_driver_l_st sprite_driver_8bpp_l1_st1_asm, 8, 1,  1
+
+#endif // HAVE_ARMV6
+
+
+@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *)
+FUNCTION(sprite_driver_16bpp_asm):
+    .cfi_startproc
+    stmfd   sp!, {r4-r6,lr}
+    .cfi_def_cfa_offset 4*4
+    .cfi_rel_offset lr, 4*3
+    ldr     r4, [r3, #0x1c]    @ y1
+    ldr     r5, [r3, #0x18]    @ y0
+    mov     r12,      #0x00ff
+    orr     r12, r12, #0xff00  @ mask
+    mov     r6, r2             @ saved_w
+    sub     r5, r4, r5         @ h
+    sub     r5, r5, #1         @ h-1
+3: @ line_loop:
+    pld_    r1, #2048
+    mov     r2, r6             @ w
+    tst     r1, #2             @ src 32bit aligned?
+    beq     0f
+2: @ 1pix:
+    ldrh    lr, [r1], #2
+    add     r0, r0, #2
+    sub     r2, r2, #1
+    tst     lr, lr
+    strhne  lr, [r0, #-2]      @ zero texels are not written
+0:
+    subs    r2, r2, #4         @ at least 4 pixels left?
+    bmi     1f
+0:
+    ldmia   r1!, {r3,r4}       @ 4 texels
+    add     r0, r0, #2*4
+    pld_    r1, #24
+    tst     r3, r12            @ low halfword of r3
+    strhne  r3, [r0, #-8]
+    movs    lr, r3, lsr #16    @ high halfword of r3
+    strhne  lr, [r0, #-6]
+    tst     r4, r12
+    strhne  r4, [r0, #-4]
+    movs    lr, r4, lsr #16
+    strhne  lr, [r0, #-2]
+    subs    r2, r2, #4
+    bpl     0b
+1:
+    adds    r2, r2, #4         @ pixels still left on this line
+    bne     2b @ 1pix
+    add     r0, r0, #2048      @ advance dst and src by 2048 bytes (one row)
+    add     r1, r1, #2048
+    sub     r0, r0, r6, lsl #1 @ dst
+    sub     r1, r1, r6, lsl #1
+    subs    r5, r5, #1
+    bpl     3b @ line_loop
+
+    ldmfd   sp!, {r4-r6,pc}
+    .cfi_endproc
+
+
+@ (void *d, const gpu_unai_inner_t *inn, int count)
+@ see also tile_driver_st_m
+.macro poly_untex_st_m name semit
+FUNCTION(\name):
+    .cfi_startproc
+    ldrh    r1, [r1, #0x38]        @ rgb
+    stmfd   sp!, {r4-r7,lr}
+    .cfi_def_cfa_offset 4*5
+    .cfi_rel_offset lr, 4*4
+.if \semit != 2
+    mov     r4, #0x8000
+    orr     r4, r4, r4, lsl #16    @ mask 8000
+    mov     r6, #0x420
+    orr     r6, r6, #1
+    orr     r6, r6, r6, lsl #16    @ mask 0421
+.endif
+.if \semit == 2
+    and     r4, r1, #0x03e0
+    and     r5, r1, #0x7c00
+    and     r1, r1, #0x001f
+    orr     r4, r4, r4, lsl #16    @ premasked g
+    orr     r5, r5, r5, lsl #16    @ premasked b
+    mov     r6, #0x00001f
+    orr     r6, #0x1f0000          @ mask
+.elseif \semit == 3
+    mov     r1, r1, lsr #2
+    bic     r1, r1, #(0x0c60>>2)
+.endif
+    orr     r1, r1, r1, lsl #16    @ color replicated into both halfwords
+0:
+    ldrh    r3, [r0]
+    pld_    r0, #2048
+    tst     r0, #2                 @ dst 32bit aligned?
+    beq     1f
+    sub     r2, #1                 @ leading unaligned pixel is done alone
+.if \semit == 0
+    bic     r3, r3, r4
+    semitrans0p  r3, r1, r6, lr
+.elseif \semit == 1 || \semit == 3
+    bic     r3, r3, r4
+    semitrans13p r3, r1, r6, lr
+.elseif \semit == 2
+    semitrans2p  r3, r1, r4, r5, r6, r7, lr
+.endif
+    strh    r3, [r0], #2
+1:
+    ldr     r3, [r0]               @ two dst pixels at once
+    pld_    r0, #32
+    subs    r2, r2, #2             @ flags are consumed by strpl/bpl below
+.if \semit == 0
+    bic     r3, r3, r4
+    semitrans0p  r3, r1, r6, lr
+.elseif \semit == 1 || \semit == 3
+    bic     r3, r3, r4
+    semitrans13p r3, r1, r6, lr
+.elseif \semit == 2
+    semitrans2p  r3, r1, r4, r5, r6, r7, lr
+.endif
+    strpl   r3, [r0], #4           @ NOTE(review): relies on the semitrans macros keeping the flags - confirm
+    bpl     1b
+2:
+    tst     r2, #1                 @ r2 is -1 here when one pixel is left
+    strhne  r3, [r0], #2           @ store the low half of the last blended pair
+
+    ldmfd   sp!, {r4-r7,pc}
+    .cfi_endproc
+.endm
+
+poly_untex_st_m poly_untex_st0_asm, 0
+poly_untex_st_m poly_untex_st1_asm, 1
+poly_untex_st_m poly_untex_st3_asm, 3
+#ifdef HAVE_ARMV6
+poly_untex_st_m poly_untex_st2_asm, 2
+#endif
+
+
+.macro poly_4_8bpp_asm_m name bpp light semit
+FUNCTION(\name): @ (void *d, const gpu_unai_inner_t *inn, int count)
+    .cfi_startproc
+    stmfd   sp!, {r4-r11,lr}
+    .cfi_def_cfa_offset 4*9
+    .cfi_rel_offset lr, 4*8
+    add     r12, r1, #4
+    ldmia   r12, {r3, r4, r7, r12, lr} @ clut, u, v, u_msk, v_msk
+    ldr     r5, [r1, #0x18]    @ u_inc
+.if \light
+    ldr     r10,[r1, #0x24]    @ rbg
+.endif
+    mov     r6, r12            @ u_msk
+    ldr     r12,[r1, #0x1c]    @ v_inc
+.if \light
+    mov     r10,r10,lsl #7     @ 0bbb bbbb 0ggg gggg 0rrr rrrr r000 0000
+    bic     r10,r10,#1<<23
+    bic     r10,r10,#1<<15
+    mov     r11,r10,lsl #8     @ 0ggg gggg ...
+.endif
+    and     r4, r4, r6
+    and     lr, lr, r7         @ v_msk & v
+    and     lr, lr, #0xff<<10
+    tst     r12,r12            @ v_inc != 0: take the path that steps v per pixel
+    bne     v_\name
+    ldr     r1, [r1]           @ src
+    mov     r7, r4, lsr #(13 - (\bpp / 8 * 3))
+    add     r1, r1, lr, lsl #1 @ src row: masked v << 1 is the byte offset
+#ifdef HAVE_ARMV6
+    add     r12,r1, r7, lsl #(2 - (\bpp / 8 * 2))
+    pld_    r12,#2048          @ next line
+#endif
+0:
+.if \light || \semit >= 0
+    mov     r7, r4, lsr #(13 - (\bpp / 8 * 3))
+    subs    r2, r2, #1
+    bmi     1f
+.endif
+.if \bpp == 4
+    ldr     lr, [r1, r7, lsl #2]   @ word holding 8 texels
+    lsr     r12,r4, #8
+    and     r12,r12,#0x1c      @ (texel index within the word) * 4
+    sub     r12,r12,#1
+    mov     r12,lr, ror r12    @ wanted nibble ends up in bits 1-4
+    add     r4, r4, r5
+    and     r12,r12,#0x1e      @ nibble * 2 = byte offset into clut
+.else
+    ldrb    r12,[r1, r7]
+    add     r4, r4, r5
+    add     r12,r12,r12
+.endif
+    and     r4, r4, r6
+    ldrsh   r12,[r3, r12]      @ clut lookup, sign extended so bit15 drives the mi/pl tests
+    add     r0, r0, #2
+.if !\light && \semit < 0
+    mov     r7, r4, lsr #(13 - (\bpp / 8 * 3))
+    tst     r12,r12
+    strhne  r12,[r0, #-2]      @ zero texels are not written
+    subs    r2, r2, #1
+    bgt     0b
+    @ end
+.else
+    tst     r12,r12
+    beq     0b                 @ zero texel: dst is left untouched
+.if \light && \semit != 1
+    modulate r12, r10, r11, r7, r8, lr
+.endif
+.if \semit == 0
+    ldrhmi  r7, [r0, #-2]      @ bit15 set: dst is needed for blending
+    strhpl  r12,[r0, #-2]
+    bpl     0b
+    semitrans0 r12, r7, lr
+.endif
+    strh    r12,[r0, #-2]
+    b       0b
+.endif                         @ \light || \semit >= 0
+1:
+    ldmfd   sp!, {r4-r11,pc}
+
+v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+.if \light || \semit >= 0
+    sub     sp, sp, #4*2
+    stmia   sp, {r5,r6}        @ kept on stack: r5/r6 double as temporaries below
+    .cfi_def_cfa_offset 4*(9+2)
+    .cfi_rel_offset lr, 4*(8+2)
+.endif
+    ldr     r9, [r1, #0x14]    @ v_msk
+    ldr     r1, [r1]           @ src
+    mov     r8, r12            @ v_inc
+    and     r9, r9, #0xff<<10  @ v_msk_final
+.if !\light && \semit < 0
+    and     lr, r7, r9
+    mov     r12,r4, lsr #(13 - (\bpp / 8 * 3))
+    add     lr, r1, lr, lsl #1
+.endif
+0:
+.if \light || \semit >= 0
+    and     lr, r7, r9
+    mov     r12,r4, lsr #(13 - (\bpp / 8 * 3))
+    add     lr, r1, lr, lsl #1
+    subs    r2, r2, #1
+    bmi     1f
+.endif
+.if \bpp == 4
+    ldr     lr, [lr, r12, lsl #2]
+    lsr     r12,r4, #8
+    and     r12,r12,#0x1c
+    sub     r12,r12,#1
+    mov     r12,lr, ror r12
+    add     r4, r4, r5
+    and     r12,r12,#0x1e
+.else
+    ldrb    r12,[lr, r12]
+    add     r4, r4, r5
+    add     r12,r12,r12
+.endif
+    and     r4, r4, r6
+    ldrsh   r12,[r3, r12]
+    add     r0, r0, #2
+    add     r7, r7, r8         @ v += v_inc
+.if !\light && \semit < 0
+    and     lr, r7, r9
+    tst     r12,r12
+    add     lr, r1, lr, lsl #1
+    strhne  r12,[r0, #-2]
+    mov     r12,r4, lsr #(13 - (\bpp / 8 * 3))
+    subs    r2, r2, #1
+    bgt     0b
+    @ end
+.else
+    tst     r12,r12
+    beq     0b
+.if \light && \semit != 1
+    modulate r12, r10, r11, r5, r6, lr
+.endif
+.if \semit == 0
+    ldrhmi  r6, [r0, #-2]
+    strhpl  r12,[r0, #-2]
+    ldmiapl sp, {r5,r6}        @ restore u_inc, u_msk before looping
+    bpl     0b
+    semitrans0 r12, r6, lr
+.endif
+    strh    r12,[r0, #-2]
+    ldmia   sp, {r5,r6}        @ restore u_inc, u_msk
+    b       0b
+.endif                         @ \light || \semit >= 0
+1:
+.if \light || \semit >= 0
+    add     sp, sp, #4*2
+.endif
+    ldmfd   sp!, {r4-r11,pc}
+    .cfi_endproc
+.endm
+
+poly_4_8bpp_asm_m poly_4bpp_asm,        4, 0, -1
+poly_4_8bpp_asm_m poly_4bpp_l0_st0_asm, 4, 0,  0
+poly_4_8bpp_asm_m poly_8bpp_asm,        8, 0, -1
+poly_4_8bpp_asm_m poly_8bpp_l0_st0_asm, 8, 0,  0
+
+#ifdef HAVE_ARMV6
+
+poly_4_8bpp_asm_m poly_4bpp_l1_std_asm, 4, 1, -1
+poly_4_8bpp_asm_m poly_4bpp_l1_st0_asm, 4, 1,  0
+poly_4_8bpp_asm_m poly_8bpp_l1_std_asm, 8, 1, -1
+poly_4_8bpp_asm_m poly_8bpp_l1_st0_asm, 8, 1,  0
+
+#endif // HAVE_ARMV6
+
 @ vim:filetype=armasm
diff --git a/plugins/gpu_unai/gpu_arm.h b/plugins/gpu_unai/gpu_arm.h
index 2329c46c..d69490ff 100644
--- a/plugins/gpu_unai/gpu_arm.h
+++ b/plugins/gpu_unai/gpu_arm.h
@@ -5,14 +5,62 @@
 extern "C" {
 #endif
 
-struct spriteDriverArg;
+struct gpu_unai_inner_t;
+
+void tile_driver_st0_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn);
+void tile_driver_st1_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn);
+void tile_driver_st3_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn);
 
 void sprite_driver_4bpp_asm(void *pPixel, const u8 *pTxt_base,
-	u32 count, const struct spriteDriverArg *arg);
+	u32 count, const struct gpu_unai_inner_t *inn);
 void sprite_driver_8bpp_asm(void *pPixel, const u8 *pTxt_base,
-	u32 count, const struct spriteDriverArg *arg);
+	u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_16bpp_asm(void *pPixel, const void *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
 void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
 
+void sprite_driver_4bpp_l0_std_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_4bpp_l0_st0_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l0_std_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l0_st0_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
+
+void poly_untex_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_untex_st1_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_untex_st3_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_asm       (void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_l0_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_8bpp_asm       (void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_8bpp_l0_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+
+#ifdef HAVE_ARMV6
+
+void tile_driver_st2_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn);
+
+void sprite_driver_4bpp_l1_std_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_4bpp_l1_st0_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_4bpp_l1_st1_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l1_std_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l1_st0_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l1_st1_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct gpu_unai_inner_t *inn);
+
+void poly_untex_st2_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_l1_std_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_l1_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_8bpp_l1_std_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_8bpp_l1_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+
+#endif // HAVE_ARMV6
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/plugins/gpu_unai/gpu_command.h b/plugins/gpu_unai/gpu_command.h
index cf6b62b4..adede2b5 100644
--- a/plugins/gpu_unai/gpu_command.h
+++ b/plugins/gpu_unai/gpu_command.h
@@ -45,13 +45,13 @@ void gpuSetTexture(u16 tpage)
 	
 	gpu_unai.BLEND_MODE  = ((tpage>>5) & 3) << 3;
 	gpu_unai.TEXT_MODE   = (tmode + 1) << 5; // gpu_unai.TEXT_MODE should be values 1..3, so add one
-	gpu_unai.TBA = &gpu_unai.vram[FRAME_OFFSET(tx, ty)];
+	gpu_unai.inn.TBA = &gpu_unai.vram[FRAME_OFFSET(tx, ty)];
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 INLINE void gpuSetCLUT(u16 clut)
 {
-	gpu_unai.CBA = &gpu_unai.vram[(clut & 0x7FFF) << 4];
+	gpu_unai.inn.CBA = &gpu_unai.vram[(clut & 0x7FFF) << 4];
 }
 
 #ifdef  ENABLE_GPU_NULL_SUPPORT
diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h
index a80c3a3a..3281d0fa 100644
--- a/plugins/gpu_unai/gpu_inner.h
+++ b/plugins/gpu_unai/gpu_inner.h
@@ -55,7 +55,10 @@
 #include "gpu_inner_quantization.h"
 #include "gpu_inner_light.h"
 
+#include "arm_features.h"
+#include "compiler_features.h"
 #ifdef __arm__
+#include "gpu_arm.h"
 #include "gpu_inner_blend_arm.h"
 #include "gpu_inner_light_arm.h"
 #define gpuBlending gpuBlendingARM
@@ -276,7 +279,7 @@ const PSD gpuPixelSpanDrivers[64] =
 //  GPU Tiles innerloops generator
 
 template<int CF>
-static void gpuTileSpanFn(le16_t *pDst, u32 count, u16 data)
+static inline void gpuTileSpanFn(le16_t *pDst, u16 data, u32 count)
 {
 	le16_t ldata;
 
@@ -328,7 +331,42 @@ static void gpuTileSpanFn(le16_t *pDst, u32 count, u16 data)
 	}
 }
 
-static void TileNULL(le16_t *pDst, u32 count, u16 data)
+template<int CF> // CF selects which gpuTileSpanFn instantiation draws each line
+static noinline void gpuTileDriverFn(le16_t *pDst, u16 data, u32 count,
+	const gpu_unai_inner_t &inn)
+{
+	const int li=gpu_unai.inn.ilace_mask; // NOTE(review): reads the global gpu_unai.inn, not the inn parameter - confirm they are the same object
+	const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+	const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1); // with progressive interlace off: pi=0, pif=1, so the second test below always passes
+	const int y1 = inn.y1; // exclusive end line
+	int y0 = inn.y0;
+
+	for (; y0 < y1; ++y0) {
+		if (!(y0&li) && (y0&pi) != pif) // draw only lines not masked out by the interlace settings
+			gpuTileSpanFn<CF>(pDst, data, count);
+		pDst += FRAME_WIDTH; // advance by FRAME_WIDTH pixels even when the line is skipped
+	}
+}
+
+#ifdef __arm__
+
+template<int CF> // dispatches a tile fill to an asm driver when one exists for CF
+static void TileAsm(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn)
+{
+	switch (CF) { // CF is a template constant
+	case 0x02: tile_driver_st0_asm(pDst, data, count, &inn); return;
+	case 0x0a: tile_driver_st1_asm(pDst, data, count, &inn); return;
+	case 0x1a: tile_driver_st3_asm(pDst, data, count, &inn); return;
+#ifdef HAVE_ARMV6
+	case 0x12: tile_driver_st2_asm(pDst, data, count, &inn); return;
+#endif
+	}
+	gpuTileDriverFn<CF>(pDst, data, count, inn); // any other CF: generic C++ driver
+}
+
+#endif
+
+static void TileNULL(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn)
 {
 	#ifdef ENABLE_GPU_LOG_SUPPORT
 		fprintf(stdout,"TileNULL()\n");
@@ -337,42 +375,47 @@ static void TileNULL(le16_t *pDst, u32 count, u16 data)
 
 ///////////////////////////////////////////////////////////////////////////////
 //  Tiles innerloops driver
-typedef void (*PT)(le16_t *pDst, u32 count, u16 data);
+typedef void (*PT)(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn);
 
 // Template instantiation helper macros
-#define TI(cf) gpuTileSpanFn<(cf)>
+#define TI(cf) gpuTileDriverFn<(cf)>
 #define TN     TileNULL
+#ifdef __arm__
+#define TA(cf) TileAsm<(cf)>
+#else
+#define TA(cf) TI(cf)
+#endif
+#ifdef HAVE_ARMV6
+#define TA6(cf) TileAsm<(cf)>
+#else
+#define TA6(cf) TI(cf)
+#endif
 #define TIBLOCK(ub) \
-	TI((ub)|0x00), TI((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \
-	TN,            TI((ub)|0x0a), TN,            TI((ub)|0x0e), \
-	TN,            TI((ub)|0x12), TN,            TI((ub)|0x16), \
-	TN,            TI((ub)|0x1a), TN,            TI((ub)|0x1e)
+	TI((ub)|0x00), TA6((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \
+	TN,            TA ((ub)|0x0a), TN,            TI((ub)|0x0e), \
+	TN,            TA6((ub)|0x12), TN,            TI((ub)|0x16), \
+	TN,            TA ((ub)|0x1a), TN,            TI((ub)|0x1e)
 
-const PT gpuTileSpanDrivers[32] = {
+const PT gpuTileDrivers[32] = {
 	TIBLOCK(0<<8), TIBLOCK(1<<8)
 };
 
 #undef TI
 #undef TN
+#undef TA
+#undef TA6
 #undef TIBLOCK
 
 
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Sprites innerloops generator
 
-// warning: gpu_arm.S asm uses this, update it if you change this
-typedef struct spriteDriverArg {
-	const le16_t *CBA;             // 00
-	u32 u0, v0, u0_mask, v0_mask;  // 04 08 0c 10
-	s32 y0, y1, lines, li;         // 14
-} spriteDriverArg;
-
 typedef void (*PS)(le16_t *pPixel, u32 count, const u8 *pTxt,
-	const spriteDriverArg *arg);
+	const gpu_unai_inner_t &inn);
 
 template<int CF>
-static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
-	const spriteDriverArg *arg)
+static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+	const gpu_unai_inner_t &inn)
 {
 	// Blend func can save an operation if it knows uSrc MSB is unset.
 	//  Untextured prims can always skip (source color always comes with MSB=0).
@@ -381,25 +424,26 @@ static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
 
 	uint_fast16_t uSrc, uDst, srcMSB;
 	bool should_blend;
-	u32 u0_mask = arg->u0_mask;
+	u32 u0_mask = inn.u_msk >> 10;
 
 	u8 r5, g5, b5;
 	if (CF_LIGHT) {
-		r5 = gpu_unai.r5;
-		g5 = gpu_unai.g5;
-		b5 = gpu_unai.b5;
+		r5 = inn.r5;
+		g5 = inn.g5;
+		b5 = inn.b5;
 	}
 
+	const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = inn.CBA;
+	const u32 v0_mask = inn.v_msk >> 10;
+	s32 y0 = inn.y0, y1 = inn.y1, li = inn.ilace_mask;
+	u32 u0_ = inn.u, v0 = inn.v;
+
 	if (CF_TEXTMODE==3) {
-		// Texture is accessed byte-wise, so adjust mask if 16bpp
+		// Texture is accessed byte-wise, so adjust to 16bpp
+		u0_ <<= 1;
 		u0_mask <<= 1;
 	}
 
-	const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = arg->CBA;
-	const u32 v0_mask = arg->v0_mask;
-	s32 y0 = arg->y0, y1 = arg->y1, li = arg->li;
-	u32 u0_ = arg->u0, v0 = arg->v0;
-
 	for (; y0 < y1; ++y0, pPixel += FRAME_WIDTH, ++v0)
 	{
 	  if (y0 & li) continue;
@@ -450,41 +494,46 @@ static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
 }
 
 #ifdef __arm__
-#include "gpu_arm.h"
 
-static void Sprite4bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
-        const spriteDriverArg *arg)
+template<int CF>
+static void SpriteMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+        const gpu_unai_inner_t &inn)
 {
 #if 1
-	s32 lines = arg->lines;
-	u32 u1m = arg->u0 + count - 1, v1m = arg->v0 + lines - 1;
-	if (u1m == (u1m & arg->u0_mask) && v1m == (v1m & arg->v0_mask)) {
-		pTxt_base += arg->u0 / 2 + arg->v0 * 2048;
-		sprite_driver_4bpp_asm(pPixel, pTxt_base, count, arg);
-	}
-	else
+  s32 lines = inn.y1 - inn.y0;
+  u32 u1m = inn.u + count - 1, v1m = inn.v + lines - 1;
+  if (u1m == (u1m & (inn.u_msk >> 10)) && v1m == (v1m & (inn.v_msk >> 10))) {
+    const u8 *pTxt = pTxt_base + inn.v * 2048;
+    switch (CF) {
+    case 0x20: sprite_driver_4bpp_asm (pPixel, pTxt + inn.u / 2, count, &inn); return;
+    case 0x40: sprite_driver_8bpp_asm (pPixel, pTxt + inn.u,     count, &inn); return;
+    case 0x60: sprite_driver_16bpp_asm(pPixel, pTxt + inn.u * 2, count, &inn); return;
+    }
+  }
+  if (v1m == (v1m & (inn.v_msk >> 10))) {
+    const u8 *pTxt = pTxt_base + inn.v * 2048;
+    switch (CF) {
+    case 0x20: sprite_driver_4bpp_l0_std_asm(pPixel, pTxt, count, &inn); return;
+    case 0x22: sprite_driver_4bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return;
+    case 0x40: sprite_driver_8bpp_l0_std_asm(pPixel, pTxt, count, &inn); return;
+    case 0x42: sprite_driver_8bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return;
+#ifdef HAVE_ARMV6
+    case 0x21: sprite_driver_4bpp_l1_std_asm(pPixel, pTxt, count, &inn); return;
+    case 0x23: sprite_driver_4bpp_l1_st0_asm(pPixel, pTxt, count, &inn); return;
+    case 0x2b: sprite_driver_4bpp_l1_st1_asm(pPixel, pTxt, count, &inn); return;
+    case 0x41: sprite_driver_8bpp_l1_std_asm(pPixel, pTxt, count, &inn); return;
+    case 0x43: sprite_driver_8bpp_l1_st0_asm(pPixel, pTxt, count, &inn); return;
+    case 0x4b: sprite_driver_8bpp_l1_st1_asm(pPixel, pTxt, count, &inn); return;
 #endif
-		gpuSpriteDriverFn<0x20>(pPixel, count, pTxt_base, arg);
-}
-
-static void Sprite8bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
-        const spriteDriverArg *arg)
-{
-#if 1
-	s32 lines = arg->lines;
-	u32 u1m = arg->u0 + count - 1, v1m = arg->v0 + lines - 1;
-	if (u1m == (u1m & arg->u0_mask) && v1m == (v1m & arg->v0_mask)) {
-		pTxt_base += arg->u0 + arg->v0 * 2048;
-		sprite_driver_8bpp_asm(pPixel, pTxt_base, count, arg);
-	}
-	else
+    }
+  }
 #endif
-		gpuSpriteDriverFn<0x40>(pPixel, count, pTxt_base, arg);
+  gpuSpriteDriverFn<CF>(pPixel, count, pTxt_base, inn);
 }
 #endif // __arm__
 
 static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base,
-	const spriteDriverArg *arg)
+	const gpu_unai_inner_t &inn)
 {
 	#ifdef ENABLE_GPU_LOG_SUPPORT
 		fprintf(stdout,"SpriteNULL()\n");
@@ -500,29 +549,32 @@ static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base,
 #define TI(cf) gpuSpriteDriverFn<(cf)>
 #define TN     SpriteNULL
 #ifdef __arm__
-#define TA4(cf) Sprite4bppMaybeAsm
-#define TA8(cf) Sprite8bppMaybeAsm
+#define TA(cf) SpriteMaybeAsm<(cf)>
+#else
+#define TA(cf) TI(cf)
+#endif
+#ifdef HAVE_ARMV6
+#define TA6(cf) SpriteMaybeAsm<(cf)>
 #else
-#define TA4(cf) TI(cf)
-#define TA8(cf) TI(cf)
+#define TA6(cf) TI(cf)
 #endif
 #define TIBLOCK(ub) \
-	TN,             TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
-	TN,             TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
-	TN,             TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
-	TN,             TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
-	TA4((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
-	TN,             TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
-	TN,             TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
-	TN,             TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
-	TA8((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
-	TN,             TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
-	TN,             TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
-	TN,             TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
-	TI((ub)|0x60),  TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
-	TN,             TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
-	TN,             TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
-	TN,             TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f)
+	TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+	TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+	TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+	TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+	TA((ub)|0x20), TA6((ub)|0x21),TA6((ub)|0x22),TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+	TN,            TN,            TI((ub)|0x2a), TA6((ub)|0x2b),TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
+	TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
+	TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
+	TA((ub)|0x40), TA6((ub)|0x41),TA6((ub)|0x42),TA6((ub)|0x43),TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+	TN,            TN,            TI((ub)|0x4a), TA6((ub)|0x4b),TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
+	TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
+	TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
+	TA((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+	TN,            TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
+	TN,            TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
+	TN,            TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f)
 
 const PS gpuSpriteDrivers[256] = {
 	TIBLOCK(0<<8), TIBLOCK(1<<8)
@@ -531,6 +583,8 @@ const PS gpuSpriteDrivers[256] = {
 #undef TI
 #undef TN
 #undef TIBLOCK
+#undef TA
+#undef TA6
 
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Polygon innerloops generator
@@ -554,7 +608,7 @@ const PS gpuSpriteDrivers[256] = {
 //             relevant blend/light headers.
 // (see README_senquack.txt)
 template<int CF>
-static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+static noinline void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 {
 	// Blend func can save an operation if it knows uSrc MSB is unset.
 	//  Untextured prims can always skip this (src color MSB is always 0).
@@ -562,14 +616,14 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 	const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
 	bool should_blend;
 
-	u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.blit_mask;
+	u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.inn.blit_mask;
 
 	if (!CF_TEXTMODE)
 	{
 		if (!CF_GOURAUD)
 		{
 			// UNTEXTURED, NO GOURAUD
-			const u16 pix15 = gpu_unai.PixelData;
+			const u16 pix15 = gpu_unai.inn.PixelData;
 			do {
 				uint_fast16_t uSrc, uDst;
 
@@ -596,8 +650,8 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 		else
 		{
 			// UNTEXTURED, GOURAUD
-			gcol_t l_gCol = gpu_unai.gCol;
-			gcol_t l_gInc = gpu_unai.gInc;
+			gcol_t l_gCol = gpu_unai.inn.gCol;
+			gcol_t l_gInc = gpu_unai.inn.gInc;
 
 			do {
 				uint_fast16_t uDst, uSrc;
@@ -643,12 +697,15 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 		//senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into
 		// one 32-bit unsigned int, but this proved to lose too much accuracy
 		// (pixel drouputs noticeable in NFS3 sky), so now are separate vars.
-		u32 l_u_msk = gpu_unai.u_msk;     u32 l_v_msk = gpu_unai.v_msk;
-		u32 l_u = gpu_unai.u & l_u_msk;   u32 l_v = gpu_unai.v & l_v_msk;
-		s32 l_u_inc = gpu_unai.u_inc;     s32 l_v_inc = gpu_unai.v_inc;
+		u32 l_u_msk = gpu_unai.inn.u_msk;     u32 l_v_msk = gpu_unai.inn.v_msk;
+		u32 l_u = gpu_unai.inn.u & l_u_msk;   u32 l_v = gpu_unai.inn.v & l_v_msk;
+		s32 l_u_inc = gpu_unai.inn.u_inc;     s32 l_v_inc = gpu_unai.inn.v_inc;
+		l_v <<= 1;
+		l_v_inc <<= 1;
+		l_v_msk = (l_v_msk & (0xff<<10)) << 1;
 
-		const le16_t* TBA_ = gpu_unai.TBA;
-		const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA;
+		const le16_t* TBA_ = gpu_unai.inn.TBA;
+		const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.inn.CBA;
 
 		u8 r5, g5, b5;
 		u8 r8, g8, b8;
@@ -657,17 +714,17 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 
 		if (CF_LIGHT) {
 			if (CF_GOURAUD) {
-				l_gInc = gpu_unai.gInc;
-				l_gCol = gpu_unai.gCol;
+				l_gInc = gpu_unai.inn.gInc;
+				l_gCol = gpu_unai.inn.gCol;
 			} else {
 				if (CF_DITHER) {
-					r8 = gpu_unai.r8;
-					g8 = gpu_unai.g8;
-					b8 = gpu_unai.b8;
+					r8 = gpu_unai.inn.r8;
+					g8 = gpu_unai.inn.g8;
+					b8 = gpu_unai.inn.b8;
 				} else {
-					r5 = gpu_unai.r5;
-					g5 = gpu_unai.g5;
-					b5 = gpu_unai.b5;
+					r5 = gpu_unai.inn.r5;
+					g5 = gpu_unai.inn.g5;
+					b5 = gpu_unai.inn.b5;
 				}
 			}
 		}
@@ -682,17 +739,19 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 			//           (UNAI originally used 16.16)
 			if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
 				u32 tu=(l_u>>10);
-				u32 tv=(l_v<<1)&(0xff<<11);
+				u32 tv=l_v&l_v_msk;
 				u8 rgb=((u8*)TBA_)[tv+(tu>>1)];
 				uSrc=le16_to_u16(CBA_[(rgb>>((tu&1)<<2))&0xf]);
 				if (!uSrc) goto endpolytext;
 			}
 			if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
-				uSrc = le16_to_u16(CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])]);
+				u32 tv=l_v&l_v_msk;
+				uSrc = le16_to_u16(CBA_[((u8*)TBA_)[tv+(l_u>>10)]]);
 				if (!uSrc) goto endpolytext;
 			}
 			if (CF_TEXTMODE==3) {  // 16bpp
-				uSrc = le16_to_u16(TBA_[(l_u>>10)+((l_v)&(0xff<<10))]);
+				u32 tv=(l_v&l_v_msk)>>1;
+				uSrc = le16_to_u16(TBA_[tv+(l_u>>10)]);
 				if (!uSrc) goto endpolytext;
 			}
 
@@ -736,7 +795,7 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 endpolytext:
 			pDst++;
 			l_u = (l_u + l_u_inc) & l_u_msk;
-			l_v = (l_v + l_v_inc) & l_v_msk;
+			l_v += l_v_inc;
 			if (CF_LIGHT && CF_GOURAUD)
 				l_gCol.raw += l_gInc.raw;
 		}
@@ -744,6 +803,30 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 	}
 }
 
+#ifdef __arm__ // dispatcher to the *_asm span routines; only compiled on ARM builds
+template<int CF> // CF: compile-time span-driver index, same value the driver table passes to TI()/TA()
+static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+{
+	switch (CF) { // CF is a template constant, so the selector is known at compile time
+	case 0x02: poly_untex_st0_asm  (pDst, &gpu_unai.inn, count); break;
+	case 0x0a: poly_untex_st1_asm  (pDst, &gpu_unai.inn, count); break;
+	case 0x1a: poly_untex_st3_asm  (pDst, &gpu_unai.inn, count); break;
+	case 0x20: poly_4bpp_asm       (pDst, &gpu_unai.inn, count); break;
+	case 0x22: poly_4bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break;
+	case 0x40: poly_8bpp_asm       (pDst, &gpu_unai.inn, count); break;
+	case 0x42: poly_8bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break;
+#ifdef HAVE_ARMV6 // the cases below exist only when HAVE_ARMV6 is defined
+	case 0x12: poly_untex_st2_asm  (pDst, &gpu_unai.inn, count); break;
+	case 0x21: poly_4bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break;
+	case 0x23: poly_4bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break;
+	case 0x41: poly_8bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break;
+	case 0x43: poly_8bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break;
+#endif
+	default:   gpuPolySpanFn<CF>(gpu_unai, pDst, count); // no asm case for this CF: use the generic C++ span template
+	}
+}
+#endif
+
 static void PolyNULL(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 {
 	#ifdef ENABLE_GPU_LOG_SUPPORT
@@ -758,16 +841,26 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
 // Template instantiation helper macros
 #define TI(cf) gpuPolySpanFn<(cf)>
 #define TN     PolyNULL
+#ifdef __arm__ // TA: table slot goes through PolySpanMaybeAsm on any ARM build
+#define TA(cf) PolySpanMaybeAsm<(cf)>
+#else // non-ARM: TA is just the plain template instantiation
+#define TA(cf) TI(cf)
+#endif
+#ifdef HAVE_ARMV6 // TA6: like TA, but routed through PolySpanMaybeAsm only when HAVE_ARMV6 is defined
+#define TA6(cf) PolySpanMaybeAsm<(cf)>
+#else // NOTE(review): slots 0x02/0x22/0x42 use TA6, so their asm cases are skipped here even on pre-v6 ARM - confirm intended
+#define TA6(cf) TI(cf)
+#endif
 #define TIBLOCK(ub) \
-	TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
-	TN,            TN,            TI((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
-	TN,            TN,            TI((ub)|0x12), TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
-	TN,            TN,            TI((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
-	TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+	TI((ub)|0x00), TI((ub)|0x01), TA6((ub)|0x02),TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
+	TN,            TN,            TA((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
+	TN,            TN,            TA6((ub)|0x12),TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
+	TN,            TN,            TA((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
+	TA((ub)|0x20), TA6((ub)|0x21),TA6((ub)|0x22),TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
 	TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
 	TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
 	TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
-	TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+	TA((ub)|0x40), TA6((ub)|0x41),TA6((ub)|0x42),TA6((ub)|0x43),TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
 	TN,            TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
 	TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
 	TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
@@ -800,5 +893,7 @@ const PP gpuPolySpanDrivers[2048] = {
 #undef TI
 #undef TN
 #undef TIBLOCK
+#undef TA
+#undef TA6
 
 #endif /* __GPU_UNAI_GPU_INNER_H__ */
diff --git a/plugins/gpu_unai/gpu_inner_blend_arm.h b/plugins/gpu_unai/gpu_inner_blend_arm.h
index 6413527c..f887374c 100644
--- a/plugins/gpu_unai/gpu_inner_blend_arm.h
+++ b/plugins/gpu_unai/gpu_inner_blend_arm.h
@@ -41,10 +41,14 @@ GPU_INLINE uint_fast16_t gpuBlendingARM(uint_fast16_t uSrc, uint_fast16_t uDst)
 		asm ("eor %[mix], %[uSrc], %[uDst]\n\t" // uSrc ^ uDst
 		     "and %[mix], %[mix], %[mask]\n\t"  // ... & 0x0421
 		     "sub %[mix], %[uDst], %[mix]\n\t"  // uDst - ...
+		#ifdef HAVE_ARMV6
+		     "uhadd16 %[mix], %[uSrc], %[mix]\n\t"
+		#else
 		     "add %[mix], %[uSrc], %[mix]\n\t"  // uSrc + ...
 		     "mov %[mix], %[mix], lsr #0x1\n\t" // ... >> 1
+		#endif
 		     : [mix] "=&r" (mix)
-		     : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0421));
+		     : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0420)); // 421
 	}
 
 	if (BLENDMODE == 1 || BLENDMODE == 3) {
diff --git a/plugins/gpu_unai/gpu_inner_light_arm.h b/plugins/gpu_unai/gpu_inner_light_arm.h
index 7bd58908..7edb8fb0 100644
--- a/plugins/gpu_unai/gpu_inner_light_arm.h
+++ b/plugins/gpu_unai/gpu_inner_light_arm.h
@@ -1,6 +1,8 @@
 #ifndef _OP_LIGHT_ARM_H_
 #define _OP_LIGHT_ARM_H_
 
+#include "arm_features.h"
+
 ////////////////////////////////////////////////////////////////////////////////
 // Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet
 //
@@ -40,6 +42,27 @@ GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol)
 //	    u16 output:	 mbbbbbgggggrrrrr
 // Where 'X' are fixed-pt bits.
 ////////////////////////////////////////////////////////////////////////////////
+#ifdef HAVE_ARMV6
+// Signed multiply of the low halfwords of a and b (ARM SMULBB); clang uses smulbb but not gcc, so we need this
+GPU_INLINE int_fast16_t smulbb(int_fast16_t a, int_fast16_t b)
+{
+	int_fast16_t r;
+	asm("smulbb %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); // r = a[15:0] * b[15:0], both halfwords treated as signed
+	return r;
+}
+
+GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5) // ARMv6 variant: multiply + saturate per channel
+{
+	// on v6 we have single-cycle mul and sat which is better than the lut
+	int_fast16_t r = smulbb(uSrc & 0x001f, r5); // red field (bits 0-4) times its light value
+	int_fast16_t g = smulbb(uSrc & 0x03e0, g5); // green field left in place at bits 5-9, so the product is scaled by 2^5
+	int_fast16_t b = smulbb(uSrc & 0x7c00, b5); // blue field left in place at bits 10-14, so the product is scaled by 2^10
+	asm volatile("usat %0, #5, %0, asr #4"  : "=r"(r) : "0"(r)); // divide by 16, then saturate to 0..31
+	asm volatile("usat %0, #5, %0, asr #9"  : "=r"(g) : "0"(g)); // 9 = 5 (field position) + 4 (divide by 16)
+	asm volatile("usat %0, #5, %0, asr #14" : "=r"(b) : "0"(b)); // 14 = 10 (field position) + 4 (divide by 16)
+	return (uSrc & 0x8000) | (b << 10) | (g << 5) | r; // repack the three channels, keeping bit 15 of the source
+}
+#else
 GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
 {
 	uint_fast16_t out = 0x03E0;
@@ -65,6 +88,7 @@ GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8
 	     : "cc");
 	return out;
 }
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
diff --git a/plugins/gpu_unai/gpu_raster_polygon.h b/plugins/gpu_unai/gpu_raster_polygon.h
index 1b9e08dc..6aaf9adc 100644
--- a/plugins/gpu_unai/gpu_raster_polygon.h
+++ b/plugins/gpu_unai/gpu_raster_polygon.h
@@ -223,13 +223,14 @@ static bool polyUseTriangle(const PolyVertex *vbuf, int tri_num, const PolyVerte
 /*----------------------------------------------------------------------
 gpuDrawPolyF - Flat-shaded, untextured poly
 ----------------------------------------------------------------------*/
-void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad,
+	PolyType ptype = POLYTYPE_F)
 {
 	// Set up bgr555 color to be used across calls in inner driver
-	gpu_unai.PixelData = GPU_RGB16(le32_to_u32(packet.U4[0]));
+	gpu_unai.inn.PixelData = GPU_RGB16(le32_to_u32(packet.U4[0]));
 
 	PolyVertex vbuf[4];
-	polyInitVertexBuffer(vbuf, packet, POLYTYPE_F, is_quad);
+	polyInitVertexBuffer(vbuf, packet, ptype, is_quad);
 
 	int total_passes = is_quad ? 2 : 1;
 	int cur_pass = 0;
@@ -257,7 +258,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 				x3 = x4 = i2x(x0);
 				if (dx < 0) {
 #ifdef GPU_UNAI_USE_FLOATMATH
-#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
 					dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
 					dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
 #else
@@ -275,7 +276,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 #endif
 				} else {
 #ifdef GPU_UNAI_USE_FLOATMATH
-#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
 					dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
 					dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
 #else
@@ -303,7 +304,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 					x3 = i2x(x0) + (dx3 * (y1 - y0));
 					x4 = i2x(x1);
 #ifdef GPU_UNAI_USE_FLOATMATH
-#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
 					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
 #else
 					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
@@ -319,7 +320,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 					x3 = i2x(x1);
 					x4 = i2x(x0) + (dx4 * (y1 - y0));
 #ifdef GPU_UNAI_USE_FLOATMATH
-#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
 					dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
 #else
 					dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
@@ -351,9 +352,9 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 				continue;
 
 			le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)];
-			int li=gpu_unai.ilace_mask;
-			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+			int li=gpu_unai.inn.ilace_mask;
+			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
 
 			for (; loop1; --loop1, ya++, PixelBase += FRAME_WIDTH,
 					x3 += dx3, x4 += dx4 )
@@ -374,19 +375,20 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 /*----------------------------------------------------------------------
 gpuDrawPolyFT - Flat-shaded, textured poly
 ----------------------------------------------------------------------*/
-void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad,
+	PolyType ptype = POLYTYPE_FT)
 {
 	// r8/g8/b8 used if texture-blending & dithering is applied (24-bit light)
-	gpu_unai.r8 = packet.U1[0];
-	gpu_unai.g8 = packet.U1[1];
-	gpu_unai.b8 = packet.U1[2];
+	gpu_unai.inn.r8 = packet.U1[0];
+	gpu_unai.inn.g8 = packet.U1[1];
+	gpu_unai.inn.b8 = packet.U1[2];
 	// r5/g5/b5 used if just texture-blending is applied (15-bit light)
-	gpu_unai.r5 = packet.U1[0] >> 3;
-	gpu_unai.g5 = packet.U1[1] >> 3;
-	gpu_unai.b5 = packet.U1[2] >> 3;
+	gpu_unai.inn.r5 = packet.U1[0] >> 3;
+	gpu_unai.inn.g5 = packet.U1[1] >> 3;
+	gpu_unai.inn.b5 = packet.U1[2] >> 3;
 
 	PolyVertex vbuf[4];
-	polyInitVertexBuffer(vbuf, packet, POLYTYPE_FT, is_quad);
+	polyInitVertexBuffer(vbuf, packet, ptype, is_quad);
 
 	int total_passes = is_quad ? 2 : 1;
 	int cur_pass = 0;
@@ -460,8 +462,8 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 #endif
 #endif
 		// Set u,v increments for inner driver
-		gpu_unai.u_inc = du4;
-		gpu_unai.v_inc = dv4;
+		gpu_unai.inn.u_inc = du4;
+		gpu_unai.inn.v_inc = dv4;
 
 		//senquack - TODO: why is it always going through 2 iterations when sometimes one would suffice here?
 		//			 (SAME ISSUE ELSEWHERE)
@@ -581,7 +583,7 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 						v3 += (dv3 * (y1 - y0));
 					}
 #ifdef GPU_UNAI_USE_FLOATMATH
-#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
 					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
 #else
 					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
@@ -661,9 +663,9 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 				continue;
 
 			le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)];
-			int li=gpu_unai.ilace_mask;
-			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+			int li=gpu_unai.inn.ilace_mask;
+			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
 
 			for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
 					x3 += dx3, x4 += dx4,
@@ -693,8 +695,8 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 				}
 
 				// Set u,v coords for inner driver
-				gpu_unai.u = u4;
-				gpu_unai.v = v4;
+				gpu_unai.inn.u = u4;
+				gpu_unai.inn.v = v4;
 
 				if (xb > xmax) xb = xmax;
 				if ((xb - xa) > 0)
@@ -790,7 +792,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 #endif
 #endif
 		// Setup packed Gouraud increment for inner driver
-		gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+		gpu_unai.inn.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
 
 		for (s32 loop0 = 2; loop0; loop0--) {
 			if (loop0 == 2) {
@@ -920,7 +922,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 					}
 
 #ifdef GPU_UNAI_USE_FLOATMATH
-#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
 					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
 #else
 					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
@@ -1006,9 +1008,9 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 				continue;
 
 			le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)];
-			int li=gpu_unai.ilace_mask;
-			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+			int li=gpu_unai.inn.ilace_mask;
+			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
 
 			for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
 					x3 += dx3, x4 += dx4,
@@ -1042,7 +1044,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 				}
 
 				// Setup packed Gouraud color for inner driver
-				gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+				gpu_unai.inn.gCol = gpuPackGouraudCol(r4, g4, b4);
 
 				if (xb > xmax) xb = xmax;
 				if ((xb - xa) > 0)
@@ -1156,9 +1158,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 #endif
 #endif
 		// Set u,v increments and packed Gouraud increment for inner driver
-		gpu_unai.u_inc = du4;
-		gpu_unai.v_inc = dv4;
-		gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+		gpu_unai.inn.u_inc = du4;
+		gpu_unai.inn.v_inc = dv4;
+		gpu_unai.inn.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
 
 		for (s32 loop0 = 2; loop0; loop0--) {
 			if (loop0 == 2) {
@@ -1305,7 +1307,7 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 					}
 
 #ifdef GPU_UNAI_USE_FLOATMATH
-#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV_FOR_ONE
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
 					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
 #else
 					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
@@ -1401,9 +1403,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 				continue;
 
 			le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)];
-			int li=gpu_unai.ilace_mask;
-			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+			int li=gpu_unai.inn.ilace_mask;
+			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
 
 			for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
 					x3 += dx3, x4 += dx4,
@@ -1446,9 +1448,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 				}
 
 				// Set packed Gouraud color and u,v coords for inner driver
-				gpu_unai.u = u4;
-				gpu_unai.v = v4;
-				gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+				gpu_unai.inn.u = u4;
+				gpu_unai.inn.v = v4;
+				gpu_unai.inn.gCol = gpuPackGouraudCol(r4, g4, b4);
 
 				if (xb > xmax) xb = xmax;
 				if ((xb - xa) > 0)
diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h
index 2564e7f0..5c7b67ce 100644
--- a/plugins/gpu_unai/gpu_raster_sprite.h
+++ b/plugins/gpu_unai/gpu_raster_sprite.h
@@ -61,34 +61,19 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out)
 	*w_out = x1;
 	*h_out = y1 - y0;
 
-	gpu_unai.r5 = packet.U1[0] >> 3;
-	gpu_unai.g5 = packet.U1[1] >> 3;
-	gpu_unai.b5 = packet.U1[2] >> 3;
-
 	le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
-	const int li=gpu_unai.ilace_mask;
-	//const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-	//const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
-	unsigned int tmode = gpu_unai.TEXT_MODE >> 5;
-	u8* pTxt_base = (u8*)gpu_unai.TBA;
-
-	// Texture is accessed byte-wise, so adjust idx if 16bpp
-	if (tmode == 3) u0 <<= 1;
-
-	spriteDriverArg arg;
-	arg.CBA = gpu_unai.CBA;
-	arg.u0 = u0;
-	arg.v0 = v0;
-	arg.u0_mask = gpu_unai.TextureWindow[2];
-	arg.v0_mask = gpu_unai.TextureWindow[3];
-	arg.y0 = y0;
-	arg.y1 = y1;
-	arg.lines = y1 - y0;
-	arg.li = li;
-	gpuSpriteDriver(Pixel, x1, pTxt_base, &arg);
+
+	gpu_unai.inn.r5 = packet.U1[0] >> 3;
+	gpu_unai.inn.g5 = packet.U1[1] >> 3;
+	gpu_unai.inn.b5 = packet.U1[2] >> 3;
+	gpu_unai.inn.u = u0;
+	gpu_unai.inn.v = v0;
+	gpu_unai.inn.y0 = y0;
+	gpu_unai.inn.y1 = y1;
+	gpuSpriteDriver(Pixel, x1, (u8 *)gpu_unai.inn.TBA, gpu_unai.inn);
 }
 
-void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver, s32 *w_out, s32 *h_out)
+void gpuDrawT(PtrUnion packet, const PT gpuTileDriver, s32 *w_out, s32 *h_out)
 {
 	s32 x0, x1, y0, y1;
 
@@ -118,15 +103,10 @@ void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver, s32 *w_out, s32 *h_ou
 
 	const u16 Data = GPU_RGB16(le32_to_u32(packet.U4[0]));
 	le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
-	const int li=gpu_unai.ilace_mask;
-	const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-	const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
-
-	for (; y0<y1; ++y0) {
-		if (!(y0&li) && (y0&pi)!=pif)
-			gpuTileSpanDriver(Pixel,x1,Data);
-		Pixel += FRAME_WIDTH;
-	}
+
+	gpu_unai.inn.y0 = y0;
+	gpu_unai.inn.y1 = y1;
+	gpuTileDriver(Pixel, Data, x1, gpu_unai.inn);
 }
 
 #endif /* __GPU_UNAI_GPU_RASTER_SPRITE_H__ */
diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h
index 844a8fd4..6fe00bb9 100644
--- a/plugins/gpu_unai/gpu_unai.h
+++ b/plugins/gpu_unai/gpu_unai.h
@@ -196,6 +196,54 @@ static inline s32 GPU_DIV(s32 rs, s32 rt)
 // 'Unsafe' version of above that doesn't check for div-by-zero
 #define GPU_FAST_DIV(rs, rt) ((signed)(rs) / (signed)(rt))
 
+// warning: gpu_arm.S asm uses this struct, update the asm if you change this (leading hex numbers below look like byte offsets with 4-byte pointers - TODO confirm)
+struct gpu_unai_inner_t {
+	le16_t* TBA;              // 00 Ptr to current texture in VRAM
+	le16_t* CBA;              // 04 Ptr to current CLUT in VRAM
+
+	// 22.10 Fixed-pt texture coords, mask, scanline advance
+	// NOTE: U,V are no longer packed together into one u32, this proved to be
+	//  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
+	u32 u, v;                 // 08 not fractional for sprites
+	u32 u_msk, v_msk;         // 10 always 22.10
+	union {                   // poly and sprite views share the same two words (both annotated 18)
+	  struct {
+	    s32 u_inc, v_inc;     // 18 poly uv increment, 22.10
+	  };
+	  struct {
+	    s32 y0, y1;           // 18 sprite y range
+	  };
+	};
+
+	// Color for flat-shaded, texture-blended prims
+	u8  r5, g5, b5, pad5;     // 20 5-bit light for undithered prims
+	u8  r8, g8, b8, pad8;     // 24 8-bit light for dithered prims
+
+	// Color for Gouraud-shaded prims
+	// Fixed-pt 8.8 rgb triplet
+	// Packed fixed-pt 8.3:8.3:8.2 rgb triplet
+	//  layout:  ccccccccXXXXXXXX for c in [r, g, b]
+	//           ^ bit 16
+	gcol_t gCol;       // 28 Packed Gouraud color read by the span drivers
+	gcol_t gInc;       // 30 Increment along scanline for gCol
+
+	// Color for flat-shaded, untextured prims
+	u16 PixelData;     // 38 bgr555 color for untextured flat-shaded polys
+
+	u8 blit_mask;           // Determines what pixels to skip when rendering.
+	                        //  Only useful on low-resolution devices using
+	                        //  a simple pixel-dropping downscaler for PS1
+	                        //  high-res modes. See 'pixel_skip' option.
+
+	u8 ilace_mask;          // Determines what lines to skip when rendering.
+	                        //  Normally 0 when PS1 240 vertical res is in
+	                        //  use and ilace_force is 0. When running in
+	                        //  PS1 480 vertical res on a low-resolution
+	                        //  device (320x240), will usually be set to 1
+	                        //  so odd lines are not rendered. (Unless future
+	                        //  full-screen scaling option is in use ..TODO)
+};
+
 struct gpu_unai_t {
 	u32 GPU_GP1;
 	GPUPacket PacketBuffer;
@@ -260,51 +308,15 @@ struct gpu_unai_t {
 	s16 DrawingOffset[2];  // [0] : Drawing offset X (signed)
 	                       // [1] : Drawing offset Y (signed)
 
-	le16_t* TBA;              // Ptr to current texture in VRAM
-	le16_t* CBA;              // Ptr to current CLUT in VRAM
-
 	////////////////////////////////////////////////////////////////////////////
 	//  Inner Loop parameters
 
-	// 22.10 Fixed-pt texture coords, mask, scanline advance
-	// NOTE: U,V are no longer packed together into one u32, this proved to be
-	//  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
-	u32 u, v;
-	u32 u_msk, v_msk;
-	s32 u_inc, v_inc;
-
-	// Color for Gouraud-shaded prims
-	// Fixed-pt 8.8 rgb triplet
-	// Packed fixed-pt 8.3:8.3:8.2 rgb triplet
-	//  layout:  ccccccccXXXXXXXX for c in [r, g, b]
-	//           ^ bit 16
-	gcol_t gCol;
-	gcol_t gInc;       // Increment along scanline for gCol
-
-	// Color for flat-shaded, texture-blended prims
-	u8  r5, g5, b5;    // 5-bit light for undithered prims
-	u8  r8, g8, b8;    // 8-bit light for dithered prims
-
-	// Color for flat-shaded, untextured prims
-	u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+	__attribute__((aligned(32)))
+	gpu_unai_inner_t inn;
 
 	// End of inner Loop parameters
 	////////////////////////////////////////////////////////////////////////////
 
-
-	u8 blit_mask;           // Determines what pixels to skip when rendering.
-	                        //  Only useful on low-resolution devices using
-	                        //  a simple pixel-dropping downscaler for PS1
-	                        //  high-res modes. See 'pixel_skip' option.
-
-	u8 ilace_mask;          // Determines what lines to skip when rendering.
-	                        //  Normally 0 when PS1 240 vertical res is in
-	                        //  use and ilace_force is 0. When running in
-	                        //  PS1 480 vertical res on a low-resolution
-	                        //  device (320x240), will usually be set to 1
-	                        //  so odd lines are not rendered. (Unless future
-	                        //  full-screen scaling option is in use ..TODO)
-
 	bool prog_ilace_flag;   // Tracks successive frames for 'prog_ilace' option
 
 	u8 BLEND_MODE;
@@ -319,7 +331,7 @@ struct gpu_unai_t {
 	u32 DitherMatrix[64];   // Matrix of dither coefficients
 };
 
-static gpu_unai_t gpu_unai;
+static __attribute__((aligned(32))) gpu_unai_t gpu_unai;
 
 // Global config that frontend can alter.. Values are read in GPU_init().
 // TODO: if frontend menu modifies a setting, add a function that can notify
diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp
index c7169dd6..40c7fd95 100644
--- a/plugins/gpu_unai/gpulib_if.cpp
+++ b/plugins/gpu_unai/gpulib_if.cpp
@@ -161,7 +161,7 @@ static uint16_t *get_downscale_buffer(int *x, int *y, int *w, int *h, int *vram_
     lines = *h;
 
     // Ensure start at a non-skipped line
-    while (*y & gpu_unai.ilace_mask) ++*y;
+    while (*y & gpu_unai.inn.ilace_mask) ++*y;
   }
 
   unsigned int fb_offset_src = (*y * dstride + *x) & fb_mask;
@@ -243,8 +243,8 @@ int renderer_init(void)
   //senquack - new vars must be updated whenever texture window is changed:
   //           (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
   const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
-  gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
-  gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+  gpu_unai.inn.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+  gpu_unai.inn.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
 
   // Configuration options
   gpu_unai.config = gpu_unai_config_ext;
@@ -252,7 +252,7 @@ int renderer_init(void)
   // sprite-span functions, perhaps unnecessarily. No Abe Oddysey hack was
   // present in latest PCSX4ALL sources we were using.
   //gpu_unai.config.enableAbbeyHack = gpu_unai_config_ext.abe_hack;
-  gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+  gpu_unai.inn.ilace_mask = gpu_unai.config.ilace_force;
 
 #if defined(GPU_UNAI_USE_INT_DIV_MULTINV) || (!defined(GPU_UNAI_NO_OLD) && !defined(GPU_UNAI_USE_FLOATMATH))
   // s_invTable
@@ -285,13 +285,13 @@ void renderer_finish(void)
 
 void renderer_notify_res_change(void)
 {
-  gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+  gpu_unai.inn.ilace_mask = gpu_unai.config.ilace_force;
 
 #ifndef HAVE_PRE_ARMV7 /* XXX */
   if (gpu_unai.config.scale_hires)
 #endif
   {
-    gpu_unai.ilace_mask |= !!(gpu.status & PSX_GPU_STATUS_INTERLACE);
+    gpu_unai.inn.ilace_mask |= !!(gpu.status & PSX_GPU_STATUS_INTERLACE);
   }
 
   /*
@@ -340,8 +340,8 @@ static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word)
 
         // Inner loop vars must be updated whenever texture window is changed:
         const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
-        gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
-        gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+        gpu_unai.inn.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+        gpu_unai.inn.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
 
         gpuSetTexture(gpu_unai.GPU_GP1);
       }
@@ -375,6 +375,40 @@ static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word)
 #endif
 
 #include "../gpulib/gpu_timing.h"
+
+// Strip lower 3 bits of each color and determine if lighting should be used:
+static inline bool need_lighting(u32 rgb_raw) // rgb_raw: color word as stored in the packet (callers pass le32_raw())
+{
+  return (rgb_raw & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080); // false only when every channel is 0x80..0x87; constants are converted instead of the input
+}
+
+static inline void textured_sprite(int &cpu_cycles_sum, int &cpu_cycles) // draws the sprite packet held in gpu_unai.PacketBuffer and accounts it via gput_sum
+{
+  u32 PRIM = le32_to_u32(gpu_unai.PacketBuffer.U4[0]) >> 24; // top byte of word 0; NOTE(review): presumably read by the Blending*/Lighting macros - confirm
+  gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16); // upper half of word 2 is handed to gpuSetCLUT
+  u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); // index into gpuSpriteDrivers[]
+  s32 w = 0, h = 0; // filled in by gpuDrawS, then passed to gput_sprite
+
+  //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+  // This fixes Silent Hill running animation on loading screens:
+  // (On PSX, color values 0x00-0x7F darken the source texture's color,
+  //  0x81-FF lighten textures (ultimately clamped to 0x1F),
+  //  0x80 leaves source texture color unchanged, HOWEVER,
+  //   gpu_unai uses a simple lighting LUT whereby only the upper
+  //   5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
+  //   0x80.)
+  //
+  // NOTE: I've changed all textured sprite draw commands here and
+  //  elsewhere to use proper behavior, but left poly commands
+  //  alone, I don't want to slow rendering down too much. (TODO)
+  if (need_lighting(le32_raw(gpu_unai.PacketBuffer.U4[0])))
+    driver_idx |= Lighting;
+  PS driver = gpuSpriteDrivers[driver_idx];
+  PtrUnion packet = { .ptr = (void*)&gpu_unai.PacketBuffer }; // gpuDrawS reads the packet in place
+  gpuDrawS(packet, driver, &w, &h);
+  gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
+}
+
 extern const unsigned char cmd_lengths[256];
 
 int do_cmd_list(u32 *list_, int list_len,
@@ -468,8 +502,20 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x2D:
       case 0x2E:
       case 0x2F: {          // Textured 4-pt poly
-        gpuSetCLUT   (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
+        u32 simplified_count;
         gpuSetTexture(le32_to_u32(gpu_unai.PacketBuffer.U4[4]) >> 16);
+        if ((simplified_count = prim_try_simplify_quad_t(gpu_unai.PacketBuffer.U4,
+              gpu_unai.PacketBuffer.U4)))
+        {
+          for (i = 0;; ) {
+            textured_sprite(cpu_cycles_sum, cpu_cycles);
+            if (++i >= simplified_count)
+              break;
+            memcpy(&gpu_unai.PacketBuffer.U4[0], &gpu_unai.PacketBuffer.U4[i * 4], 16);
+          }
+          break;
+        }
+        gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
 
         u32 driver_idx =
           //(gpu_unai.blit_mask?1024:0) |
@@ -497,13 +543,22 @@ int do_cmd_list(u32 *list_, int list_len,
         // this is an untextured poly, so CF_LIGHT (texture blend)
         // shouldn't apply. Until the original array of template
         // instantiation ptrs is fixed, we're stuck with this. (TODO)
+        u8 gouraud = 129;
+        u32 xor_ = 0, rgb0 = le32_raw(gpu_unai.PacketBuffer.U4[0]);
+        for (i = 1; i < 3; i++)
+          xor_ |= rgb0 ^ le32_raw(gpu_unai.PacketBuffer.U4[i * 2]);
+        if ((xor_ & HTOLE32(0xf8f8f8)) == 0)
+          gouraud = 0;
         PP driver = gpuPolySpanDrivers[
           //(gpu_unai.blit_mask?1024:0) |
           Dithering |
           Blending_Mode |
-          gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB
+          gpu_unai.Masking | Blending | gouraud | gpu_unai.PixelMSB
         ];
-        gpuDrawPolyG(packet, driver, false);
+        if (gouraud)
+          gpuDrawPolyG(packet, driver, false);
+        else
+          gpuDrawPolyF(packet, driver, false, POLYTYPE_G);
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
       } break;
 
@@ -513,13 +568,28 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x37: {          // Gouraud-shaded, textured 3-pt poly
         gpuSetCLUT    (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
         gpuSetTexture (le32_to_u32(gpu_unai.PacketBuffer.U4[5]) >> 16);
+        u8 lighting = Lighting;
+        u8 gouraud = lighting ? (1<<7) : 0;
+        if (lighting) {
+          u32 xor_ = 0, rgb0 = le32_raw(gpu_unai.PacketBuffer.U4[0]);
+          for (i = 1; i < 3; i++)
+            xor_ |= rgb0 ^ le32_raw(gpu_unai.PacketBuffer.U4[i * 3]);
+          if ((xor_ & HTOLE32(0xf8f8f8)) == 0) {
+            gouraud = 0;
+            if (!need_lighting(rgb0))
+              lighting = 0;
+          }
+        }
         PP driver = gpuPolySpanDrivers[
           //(gpu_unai.blit_mask?1024:0) |
           Dithering |
           Blending_Mode | gpu_unai.TEXT_MODE |
-          gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB
+          gpu_unai.Masking | Blending | gouraud | lighting | gpu_unai.PixelMSB
         ];
-        gpuDrawPolyGT(packet, driver, false);
+        if (gouraud)
+          gpuDrawPolyGT(packet, driver, false); // is_quad = false
+        else
+          gpuDrawPolyFT(packet, driver, false, POLYTYPE_GT);
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
       } break;
 
@@ -528,13 +598,22 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x3A:
       case 0x3B: {          // Gouraud-shaded 4-pt poly
         // See notes regarding '129' for 0x30..0x33 further above -senquack
+        u8 gouraud = 129;
+        u32 xor_ = 0, rgb0 = le32_raw(gpu_unai.PacketBuffer.U4[0]);
+        for (i = 1; i < 4; i++)
+          xor_ |= rgb0 ^ le32_raw(gpu_unai.PacketBuffer.U4[i * 2]);
+        if ((xor_ & HTOLE32(0xf8f8f8)) == 0)
+          gouraud = 0;
         PP driver = gpuPolySpanDrivers[
           //(gpu_unai.blit_mask?1024:0) |
           Dithering |
           Blending_Mode |
-          gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB
+          gpu_unai.Masking | Blending | gouraud | gpu_unai.PixelMSB
         ];
-        gpuDrawPolyG(packet, driver, true); // is_quad = true
+        if (gouraud)
+          gpuDrawPolyG(packet, driver, true); // is_quad = true
+        else
+          gpuDrawPolyF(packet, driver, true, POLYTYPE_G);
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
       } break;
 
@@ -542,15 +621,42 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x3D:
       case 0x3E:
       case 0x3F: {          // Gouraud-shaded, textured 4-pt poly
-        gpuSetCLUT    (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
-        gpuSetTexture (le32_to_u32(gpu_unai.PacketBuffer.U4[5]) >> 16);
+        u32 simplified_count;
+        gpuSetTexture(le32_to_u32(gpu_unai.PacketBuffer.U4[5]) >> 16);
+        if ((simplified_count = prim_try_simplify_quad_gt(gpu_unai.PacketBuffer.U4,
+              gpu_unai.PacketBuffer.U4)))
+        {
+          for (i = 0;; ) {
+            textured_sprite(cpu_cycles_sum, cpu_cycles);
+            if (++i >= simplified_count)
+              break;
+            memcpy(&gpu_unai.PacketBuffer.U4[0], &gpu_unai.PacketBuffer.U4[i * 4], 16);
+          }
+          break;
+        }
+        gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
+        u8 lighting = Lighting;
+        u8 gouraud = lighting ? (1<<7) : 0;
+        if (lighting) {
+          u32 xor_ = 0, rgb0 = le32_raw(gpu_unai.PacketBuffer.U4[0]);
+          for (i = 1; i < 4; i++)
+            xor_ |= rgb0 ^ le32_raw(gpu_unai.PacketBuffer.U4[i * 3]);
+          if ((xor_ & HTOLE32(0xf8f8f8)) == 0) {
+            gouraud = 0;
+            if (!need_lighting(rgb0))
+              lighting = 0;
+          }
+        }
         PP driver = gpuPolySpanDrivers[
           //(gpu_unai.blit_mask?1024:0) |
           Dithering |
           Blending_Mode | gpu_unai.TEXT_MODE |
-          gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB
+          gpu_unai.Masking | Blending | gouraud | lighting | gpu_unai.PixelMSB
         ];
-        gpuDrawPolyGT(packet, driver, true); // is_quad = true
+        if (gouraud)
+          gpuDrawPolyGT(packet, driver, true); // is_quad = true
+        else
+          gpuDrawPolyFT(packet, driver, true, POLYTYPE_GT);
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
       } break;
 
@@ -642,7 +748,7 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x61:
       case 0x62:
       case 0x63: {          // Monochrome rectangle (variable size)
-        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
         s32 w = 0, h = 0;
         gpuDrawT(packet, driver, &w, &h);
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
@@ -651,38 +757,16 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x64:
       case 0x65:
       case 0x66:
-      case 0x67: {          // Textured rectangle (variable size)
-        gpuSetCLUT    (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
-        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
-        s32 w = 0, h = 0;
-
-        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
-        // This fixes Silent Hill running animation on loading screens:
-        // (On PSX, color values 0x00-0x7F darken the source texture's color,
-        //  0x81-FF lighten textures (ultimately clamped to 0x1F),
-        //  0x80 leaves source texture color unchanged, HOWEVER,
-        //   gpu_unai uses a simple lighting LUT whereby only the upper
-        //   5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
-        //   0x80.
-        // 
-        // NOTE: I've changed all textured sprite draw commands here and
-        //  elsewhere to use proper behavior, but left poly commands
-        //  alone, I don't want to slow rendering down too much. (TODO)
-        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
-        // Strip lower 3 bits of each color and determine if lighting should be used:
-        if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080))
-          driver_idx |= Lighting;
-        PS driver = gpuSpriteDrivers[driver_idx];
-        gpuDrawS(packet, driver, &w, &h);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
-      } break;
+      case 0x67:            // Textured rectangle (variable size)
+        textured_sprite(cpu_cycles_sum, cpu_cycles);
+        break;
 
       case 0x68:
       case 0x69:
       case 0x6A:
       case 0x6B: {          // Monochrome rectangle (1x1 dot)
         gpu_unai.PacketBuffer.U4[2] = u32_to_le32(0x00010001);
-        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
         s32 w = 0, h = 0;
         gpuDrawT(packet, driver, &w, &h);
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
@@ -693,7 +777,7 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x72:
       case 0x73: {          // Monochrome rectangle (8x8)
         gpu_unai.PacketBuffer.U4[2] = u32_to_le32(0x00080008);
-        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
         s32 w = 0, h = 0;
         gpuDrawT(packet, driver, &w, &h);
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
@@ -704,18 +788,7 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x76:
       case 0x77: {          // Textured rectangle (8x8)
         gpu_unai.PacketBuffer.U4[3] = u32_to_le32(0x00080008);
-        gpuSetCLUT    (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
-        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
-        s32 w = 0, h = 0;
-
-        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
-        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
-        // Strip lower 3 bits of each color and determine if lighting should be used:
-        if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080))
-          driver_idx |= Lighting;
-        PS driver = gpuSpriteDrivers[driver_idx];
-        gpuDrawS(packet, driver, &w, &h);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
+        textured_sprite(cpu_cycles_sum, cpu_cycles);
       } break;
 
       case 0x78:
@@ -723,7 +796,7 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x7A:
       case 0x7B: {          // Monochrome rectangle (16x16)
         gpu_unai.PacketBuffer.U4[2] = u32_to_le32(0x00100010);
-        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
         s32 w = 0, h = 0;
         gpuDrawT(packet, driver, &w, &h);
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
@@ -734,17 +807,7 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x7E:
       case 0x7F: {          // Textured rectangle (16x16)
         gpu_unai.PacketBuffer.U4[3] = u32_to_le32(0x00100010);
-        gpuSetCLUT    (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
-        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
-        s32 w = 0, h = 0;
-        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
-        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
-        // Strip lower 3 bits of each color and determine if lighting should be used:
-        if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080))
-          driver_idx |= Lighting;
-        PS driver = gpuSpriteDrivers[driver_idx];
-        gpuDrawS(packet, driver, &w, &h);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
+        textured_sprite(cpu_cycles_sum, cpu_cycles);
       } break;
 
 #ifdef TEST
diff --git a/plugins/gpulib/Makefile b/plugins/gpulib/Makefile
index cff61410..53aaa886 100644
--- a/plugins/gpulib/Makefile
+++ b/plugins/gpulib/Makefile
@@ -5,7 +5,7 @@ endif
 
 include ../../config.mak
 
-OBJS += gpu.o
+OBJS += gpu.o prim.o
 
 ifeq "$(ARCH)" "arm"
 OBJS += vout_pl.o
diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c
index df1c1c6c..88aa6704 100644
--- a/plugins/gpulib/gpu.c
+++ b/plugins/gpulib/gpu.c
@@ -17,23 +17,11 @@
 #include "gpu_timing.h"
 #include "../../libpcsxcore/gpu.h" // meh
 #include "../../frontend/plugin_lib.h"
+#include "../../include/compiler_features.h"
 
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #endif
-#ifdef __GNUC__
-# define unlikely(x) __builtin_expect((x), 0)
-# define preload __builtin_prefetch
-# ifndef __clang__
-#  define noinline __attribute__((noinline,noclone))
-# else
-#  define noinline __attribute__((noinline))
-# endif
-#else
-# define unlikely(x)
-# define preload(...)
-# define noinline
-#endif
 
 //#define log_io gpu_log
 #define log_io(...)
diff --git a/plugins/gpulib/gpu.h b/plugins/gpulib/gpu.h
index e654500d..570d8421 100644
--- a/plugins/gpulib/gpu.h
+++ b/plugins/gpulib/gpu.h
@@ -147,6 +147,9 @@ void vout_update(void);
 void vout_blank(void);
 void vout_set_config(const struct rearmed_cbs *config);
 
+int  prim_try_simplify_quad_t (void *simplified, const void *prim);
+int  prim_try_simplify_quad_gt(void *simplified, const void *prim);
+
 /* listing these here for correct linkage if rasterizer uses c++ */
 struct GPUFreeze;
 
diff --git a/plugins/gpulib/prim.c b/plugins/gpulib/prim.c
new file mode 100644
index 00000000..d6294641
--- /dev/null
+++ b/plugins/gpulib/prim.c
@@ -0,0 +1,249 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../../include/compiler_features.h"
+#include "gpu.h"
+
+// retain neon's ability to sample textures pixel-perfectly
+#ifdef GPU_NEON
+#define STRICT
+#endif
+
+struct vert_t         // one vertex: a position word followed by a u/v/clut word
+{
+  union {
+    struct {
+      int16_t x, y;   // the two signed 16-bit halves of xy
+    };
+    uint32_t xy;      // both coordinates as a single 32-bit word
+  };
+  union {
+    struct {
+      uint8_t u, v;   // bytes 0 and 1 of uvclut
+      int16_t clut;   // bytes 2-3 of uvclut
+    };
+    uint32_t uvclut;  // u, v and clut as a single 32-bit word
+  };
+};
+
+// gt ~ gouraud textured: a per-vertex color word followed by a vert_t
+struct vert_gt
+{
+  uint32_t rgb;       // v[0].rgb doubles as the rgb_c word handed to simplify_quad_t
+  struct vert_t t;    // position and u/v/clut words
+};
+
+struct quad_t         // layout read by prim_try_simplify_quad_t (2c-2f)
+{
+  uint32_t rgb_c;     // first packet word; passed through to simplify_quad_t
+  struct vert_t v[4]; // simplification requires v0/v1 and v2/v3 to share y, v0/v2 and v1/v3 to share x
+};
+
+struct quad_gt        // four vert_gt back to back; only the debug wrapper uses this type
+{
+  struct vert_gt v[4];
+};
+
+struct sprite         // output record written by simplify_quad_t (four 32-bit words)
+{
+  uint32_t rgb_c;     // input rgb_c masked with 0x03ffffff, then OR'ed with 0x64000000
+  union {
+    struct {
+      int16_t x, y;   // the two signed 16-bit halves of xy
+    };
+    uint32_t xy;      // copied whole from the chosen source vertex
+  };
+  union {
+    struct {
+      uint8_t u, v;   // bytes 0 and 1 of uvclut
+      int16_t clut;   // bytes 2-3 of uvclut
+    };
+    uint32_t uvclut;  // u, v and clut as a single 32-bit word
+  };
+  int16_t w, h;       // stored through HTOLE16()
+};
+
+// debug: change "#if 0" to "#if 1" to wrap both entry points with printf logging
+#if 0
+static void log_quad_t(const struct quad_t *q, int ret) // prints rgb_c, then x,y u,v per vertex, then ret
+{
+#if 1
+  printf("quad_t %08x", q->rgb_c);
+  int i;
+  for (i = 0; i < 4; i++)
+    printf(" | %3d,%3d %3d,%3d",
+        q->v[i].x, q->v[i].y, q->v[i].u, q->v[i].v);
+  printf(" -> %d\n", ret);
+#endif
+}
+
+static void log_quad_gt(const struct vert_gt *v, int ret) // as above, plus the low 24 bits of each rgb
+{
+#if 1
+  printf("quad_gt %02x", v[0].rgb >> 24);
+  int i;
+  for (i = 0; i < 4; i++)
+    printf(" | %3d,%3d %3d,%3d %06x",
+        v[i].t.x, v[i].t.y, v[i].t.u, v[i].t.v, v[i].rgb & 0xffffff);
+  printf(" -> %d\n", ret);
+#endif
+}
+
+int prim_try_simplify_quad_t_(void *simplified, const void *prim_); // name the real definition gets via the #define below
+int prim_try_simplify_quad_t(void *simplified, const void *prim_)
+{
+  struct quad_t prim = *(struct quad_t *)prim_; // snapshot first: the call may write over the input when both pointers alias
+  int ret = prim_try_simplify_quad_t_(simplified, prim_);
+  #define prim_try_simplify_quad_t prim_try_simplify_quad_t_
+  ///if (!ret)
+    log_quad_t(&prim, ret); // logged for every call while the condition above stays commented out
+  return ret;
+}
+
+int prim_try_simplify_quad_gt_(void *simplified, const void *prim_); // name the real definition gets via the #define below
+int prim_try_simplify_quad_gt(void *simplified, const void *prim_)
+{
+  struct quad_gt prim = *(struct quad_gt *)prim_; // snapshot first: the call may write over the input when both pointers alias
+  int ret = prim_try_simplify_quad_gt_(simplified, prim_);
+  #define prim_try_simplify_quad_gt prim_try_simplify_quad_gt_
+  ///if (!ret)
+    log_quad_gt(prim.v, ret); // logged for every call while the condition above stays commented out
+  return ret;
+}
+#endif // debug
+
+static noinline int simplify_quad_t(void *simplified, const struct vert_t *v, // v: vertex whose xy/u/v seed sprite 0
+  int xd, int ud, int yd, int vd, uint32_t rgb_c, uint16_t clut) // returns the number of sprites written: 1, 2 or 4
+{
+  struct sprite *s = simplified;
+  int ret = 1;
+  rgb_c &= HTOLE32(0x03ffffff); // keep the low 24 bits and the two lowest bits of the top byte
+  rgb_c |= HTOLE32(0x64000000); // top byte becomes 0x64..0x67
+  xd = abs(xd);
+  ud = abs(ud);
+  s[0].rgb_c = rgb_c;
+  s[0].xy = v->xy; // v is read before w/h are stored: do_cmd_list passes one buffer as both input and output
+  s[0].u = v->u;
+  s[0].v = v->v;
+  s[0].clut = clut;
+  s[0].w = HTOLE16(xd);
+  s[0].h = HTOLE16(yd);
+#ifndef STRICT
+  if (xd != ud) { // screen and texture widths differ: emit two sprites side by side
+    int mid = xd / 2;
+    s[0].w = HTOLE16(mid);
+    s[1].rgb_c = rgb_c;
+    s[1].x = HTOLE16(LE16TOH(s[0].x) + mid);
+    s[1].y = s[0].y;
+    s[1].u = s[0].u + mid + ud - xd; // second half's u is offset by the width mismatch (ud - xd)
+    s[1].v = s[0].v;
+    s[1].clut = clut;
+    s[1].w = HTOLE16(xd - mid);
+    s[1].h = s[0].h;
+    ret = 2;
+  }
+  if (yd != vd) { // heights differ: split every sprite produced so far into an upper and a lower one
+    int i, mid = yd / 2, y = LE16TOH(s[0].y);
+    memcpy(s + ret, s, sizeof(s[0]) * ret); // duplicate the existing row of 1 or 2 sprites
+    for (i = 0; i < ret; i++) {
+      s[i].h = HTOLE16(mid);
+      s[ret+i].y = HTOLE16(y + mid);
+      s[ret+i].h = HTOLE16(yd - mid);
+      s[ret+i].v = s[0].v + mid + vd - yd; // lower row's v is offset by the height mismatch (vd - yd)
+    }
+    ret *= 2;
+  }
+#endif
+  return ret;
+}
+
+// Extent check for an axis-aligned textured quad; returns the sprite count
+// from simplify_quad_t, or 0. Kept separate to reduce gcc spilling.
+static noinline int prim_try_simplify_quad_t2(void *simplified,
+  const struct vert_t *v, uint32_t rgb_c)
+{
+  int yd = LE16TOH(v[2].y) - LE16TOH(v[0].y);
+  int xd, ud, vd;
+  // v[2].y < v[0].y is not handled here; the caller keeps the polygon path
+  if (yd < 0)
+    return 0;
+  xd = LE16TOH(v[1].x) - LE16TOH(v[0].x);
+  // u/v are single bytes: there is no byte order to convert, read them as-is
+  ud = v[1].u - v[0].u;
+  vd = v[2].v - v[0].v;
+  // screen and texture extents must match (non-STRICT tolerates a difference of 1)
+#ifdef STRICT
+  if (xd != ud || yd != vd)
+#else
+  if (abs(xd - ud) > 1 || abs(yd - vd) > 1)
+#endif
+    return 0;
+  return simplify_quad_t(simplified, xd < 0 ? &v[1] : &v[0],
+           xd, ud, yd, vd, rgb_c, v[0].clut);
+}
+
+// Gouraud variant of prim_try_simplify_quad_t2: additionally requires the
+// four vertex colors to agree in the top 5 bits of each channel, unless bit 24
+// of the first color word is set. Returns the sprite count, or 0.
+static noinline int prim_try_simplify_quad_gt2(void *simplified,
+  const struct vert_gt *v)
+{
+  int yd = LE16TOH(v[2].t.y) - LE16TOH(v[0].t.y);
+  int xd, ud, vd;
+  // v[2].y < v[0].y is not handled here; the caller keeps the polygon path
+  if (yd < 0)
+    return 0;
+  xd = LE16TOH(v[1].t.x) - LE16TOH(v[0].t.x);
+  // u/v are single bytes: there is no byte order to convert, read them as-is
+  ud = v[1].t.u - v[0].t.u;
+  vd = v[2].t.v - v[0].t.v;
+#ifdef STRICT
+  if (xd != ud || yd != vd)
+#else
+  if (abs(xd - ud) > 1 || abs(yd - vd) > 1)
+#endif
+    return 0;
+  if (!(v[0].rgb & HTOLE32(1 << 24))) { // modulation/"lighting"
+    uint32_t i, xor = 0, rgb0 = v[0].rgb;
+    for (i = 1; i < 4; i++)
+      xor |= rgb0 ^ v[i].rgb;
+    if (xor & HTOLE32(0xf8f8f8))
+      return 0;
+  }
+  return simplify_quad_t(simplified, xd < 0 ? &v[1].t : &v[0].t,
+      xd, ud, yd, vd, v[0].rgb, v[0].t.clut);
+}
+
+// 2c-2f: textured quad. Returns 0 unless positions and u/v are axis-aligned;
+// in that case returns whatever prim_try_simplify_quad_t2 reports.
+int prim_try_simplify_quad_t(void *simplified, const void *prim_)
+{
+  const struct quad_t *quad = prim_;
+  const struct vert_t *vtx = quad->v;
+  int rows_level = vtx[0].y == vtx[1].y && vtx[2].y == vtx[3].y;
+  int cols_level = vtx[0].x == vtx[2].x && vtx[1].x == vtx[3].x;
+  if (!rows_level || !cols_level)
+    return 0;
+  if (vtx[0].v != vtx[1].v || vtx[2].v != vtx[3].v)
+    return 0;
+  if (vtx[0].u != vtx[2].u || vtx[1].u != vtx[3].u)
+    return 0;
+  return prim_try_simplify_quad_t2(simplified, vtx, quad->rgb_c);
+}
+
+// 3c-3f: gouraud textured quad. Returns 0 unless positions and u/v are
+// axis-aligned; in that case returns what prim_try_simplify_quad_gt2 reports.
+int prim_try_simplify_quad_gt(void *simplified, const void *prim)
+{
+  const struct vert_gt *vtx = prim;
+  int rows_level = vtx[0].t.y == vtx[1].t.y && vtx[2].t.y == vtx[3].t.y;
+  int cols_level = vtx[0].t.x == vtx[2].t.x && vtx[1].t.x == vtx[3].t.x;
+  if (!rows_level || !cols_level)
+    return 0;
+  if (vtx[0].t.v != vtx[1].t.v || vtx[2].t.v != vtx[3].t.v)
+    return 0;
+  if (vtx[0].t.u != vtx[2].t.u || vtx[1].t.u != vtx[3].t.u)
+    return 0;
+  return prim_try_simplify_quad_gt2(simplified, vtx);
+}
+
+// vim:shiftwidth=2:expandtab