diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..95af655 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.zip +out/aroma_installer +out/aroma.zip +out/out_folder.txt +assets/META-INF/com/google/android/update-binary diff --git a/Android.mk b/Android.mk index d86078d..45ac6d8 100644 --- a/Android.mk +++ b/Android.mk @@ -11,7 +11,7 @@ include $(CLEAR_VARS) ## Force Compiling Without ARM NEON ## -- Uncomment This Line -- ## - # AROMA_ARM_NEON := false + AROMA_ARM_NEON := false # ## @@ -157,7 +157,7 @@ include $(CLEAR_VARS) ## INCLUDED LIBRARIES LOCAL_STATIC_LIBRARIES := libm libc - + LOCAL_FORCE_STATIC_EXECUTABLE := true ifeq ($(MAKECMDGOALS),$(LOCAL_MODULE)) $(shell rm -rf $(PRODUCT_OUT)/obj/EXECUTABLES/$(LOCAL_MODULE)_intermediates) endif diff --git a/assets/META-INF/com/google/android/update-binary-installer b/assets/META-INF/com/google/android/update-binary-installer index 43a2d46..9eafafd 100644 Binary files a/assets/META-INF/com/google/android/update-binary-installer and b/assets/META-INF/com/google/android/update-binary-installer differ diff --git a/libs/zlib/x86/adler32.c b/libs/zlib/x86/adler32.c new file mode 100644 index 0000000..84b16f7 --- /dev/null +++ b/libs/zlib/x86/adler32.c @@ -0,0 +1,1026 @@ +/* + * adler32.c -- compute the Adler-32 checksum of a data stream + * x86 implementation + * Copyright (C) 1995-2007 Mark Adler + * Copyright (C) 2009-2011 Jan Seiffert + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#include "x86.h" + +#if GCC_VERSION_GE(203) +# define GCC_ATTR_ALIGNED(x) __attribute__((__aligned__(x))) +#else +# define VEC_NO_GO +#endif + +/* inline asm, so only on GCC (or compatible) */ +#if defined(__GNUC__) && !defined(VEC_NO_GO) +# define HAVE_ADLER32_VEC +# define MIN_WORK 64 + +/* ========================================================================= */ +local const struct { short d[24]; } vord GCC_ATTR_ALIGNED(16) = { + {1,1,1,1,1,1,1,1,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1} +}; + +/* ========================================================================= */ +local const struct { char d[16]; } vord_b GCC_ATTR_ALIGNED(16) = { + {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1} +}; + +/* ========================================================================= */ +local noinline const Bytef *adler32_jumped(buf, s1, s2, k) + const Bytef *buf; + unsigned int *s1; + unsigned int *s2; + unsigned int k; +{ + unsigned int t; + unsigned n = k % 16; + buf += n; + k = (k / 16) + 1; + + __asm__ __volatile__ ( +# ifdef __x86_64__ +# define CLOB "&" + "lea 1f(%%rip), %q4\n\t" + "lea (%q4,%q5,8), %q4\n\t" + "jmp *%q4\n\t" +# else +# ifndef __PIC__ +# define CLOB + "lea 1f(,%5,8), %4\n\t" +# else +# define CLOB + "lea 1f-3f(,%5,8), %4\n\t" + "call 9f\n" + "3:\n\t" +# endif + "jmp *%4\n\t" +# ifdef __PIC__ + ".p2align 1\n" + "9:\n\t" + "addl (%%esp), %4\n\t" + "ret\n\t" +# endif +# endif + ".p2align 1\n" + "2:\n\t" +# ifdef __i386 + ".byte 0x3e\n\t" +# endif + "add $0x10, %2\n\t" + ".p2align 1\n" + "1:\n\t" + /* 128 */ + "movzbl -16(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 120 */ + "movzbl -15(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 112 */ + "movzbl -14(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 104 */ + "movzbl -13(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 96 */ + "movzbl -12(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ 
+ /* 88 */ + "movzbl -11(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 80 */ + "movzbl -10(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 72 */ + "movzbl -9(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 64 */ + "movzbl -8(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 56 */ + "movzbl -7(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 48 */ + "movzbl -6(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 40 */ + "movzbl -5(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 32 */ + "movzbl -4(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 24 */ + "movzbl -3(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 16 */ + "movzbl -2(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 8 */ + "movzbl -1(%2), %4\n\t" /* 4 */ + "add %4, %0\n\t" /* 2 */ + "add %0, %1\n\t" /* 2 */ + /* 0 */ + "dec %3\n\t" + "jnz 2b" + : /* %0 */ "=R" (*s1), + /* %1 */ "=R" (*s2), + /* %2 */ "=abdSD" (buf), + /* %3 */ "=c" (k), + /* %4 */ "="CLOB"R" (t) + : /* %5 */ "r" (16 - n), + /* */ "0" (*s1), + /* */ "1" (*s2), + /* */ "2" (buf), + /* */ "3" (k) + : "cc", "memory" + ); + + return buf; +} + + + +#if 0 && (HAVE_BINUTILS-0) >= 222 + /* + * 2013 Intel will hopefully bring the Haswell CPUs, + * which hopefully will have AVX2, which brings integer + * ops to the full width AVX regs. + */ + "2:\n\t" + "mov $256, %1\n\t" + "cmp %1, %3\n\t" + "cmovb %3, %1\n\t" + "and $-32, %1\n\t" + "sub %1, %3\n\t" + "shr $5, %1\n\t" + "vpxor %%xmm6, %%xmm6\n\t" + ".p2align 4,,7\n" + ".p2align 3\n" + "1:\n\t" + "vmovdqa (%0), %%ymm0\n\t" + "prefetchnta 0x70(%0)\n\t" + "vpaddd %%ymm3, %%ymm7, %%ymm7\n\t" + "add $32, %0\n\t" + "dec %1\n\t" + "vpsadbw %%ymm4, %%ymm0, %%ymm1\n\t" + "vpmaddubsw %%ymm5, %%ymm0, %%ymm0\n\t" + "vpaddd %%ymm1, %%ymm3, %%ymm3\n\t" + "vpaddw %%ymm0, %%ymm6, %%ymm6\n\t" + "jnz 1b\n\t" + "vpunpckhwd %%ymm4, %%ymm6, %%xmm0\n\t" + "vpunpcklwd %%ymm4, %%ymm6, %%ymm6\n\t" + "vpaddd %%ymm0, %%ymm2, %%ymm2\n\t" + "vpaddd %%ymm6, %%ymm2, %%ymm2\n\t" + "cmp $32, %3\n\t" + "jg 2b\n\t" + avx2_chop + ... +#endif + +#if 0 + /* + * Will XOP processors have SSSE3/AVX?? + * And what is the unaligned load performance? + */ + "prefetchnta 0x70(%0)\n\t" + "lddqu (%0), %%xmm0\n\t" + "vpaddd %%xmm3, %%xmm5, %%xmm5\n\t" + "sub $16, %3\n\t" + "add $16, %0\n\t" + "cmp $15, %3\n\t" + "vphaddubd %%xmm0, %%xmm1\n\t" /* A */ + "vpmaddubsw %%xmm4, %%xmm0, %%xmm0\n\t"/* AVX! */ /* 1 */ + "vphadduwd %%xmm0, %%xmm0\n\t" /* 2 */ + "vpaddd %%xmm1, %%xmm3, %%xmm3\n\t" /* B: A+B => hadd+acc or vpmadcubd w. mul = 1 */ + "vpaddd %%xmm0, %%xmm2, %%xmm2\n\t" /* 3: 1+2+3 => vpmadcubd w. mul = 16,15,14... 
*/ + "jg 1b\n\t" + xop_chop + xop_chop + xop_chop + setup + "jg 1b\n\t" + "vphaddudq %%xmm2, %%xmm0\n\t" + "vphaddudq %%xmm3, %%xmm1\n\t" + "pshufd $0xE6, %%xmm0, %%xmm2\n\t" + "pshufd $0xE6, %%xmm1, %%xmm3\n\t" + "paddd %%xmm0, %%xmm2\n\t" + "paddd %%xmm1, %%xmm3\n\t" + "movd %%xmm2, %2\n\t" + "movd %%xmm3, %1\n\t" +#endif + +/* ========================================================================= */ +local uLong adler32_SSSE3(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + unsigned int s1 = adler & 0xffff; + unsigned int s2 = (adler >> 16) & 0xffff; + unsigned int k; + + k = ALIGN_DIFF(buf, 16); + len -= k; + if (k) + buf = adler32_jumped(buf, &s1, &s2, k); + + __asm__ __volatile__ ( + "mov %6, %3\n\t" /* get max. byte count VNMAX till v1_round_sum overflows */ + "cmp %3, %4\n\t" + "cmovb %4, %3\n\t" /* k = len >= VNMAX ? k : len */ + "sub %3, %4\n\t" /* len -= k */ + "cmp $16, %3\n\t" + "jb 8f\n\t" /* if(k < 16) goto OUT */ +#if defined(__ELF__) && !defined(__clang__) + ".subsection 2\n\t" +#else + "jmp 7f\n\t" +#endif + ".p2align 2\n" + /* + * reduction function to bring a vector sum within the range of BASE + * This does no full reduction! When the sum is large, a number > BASE + * is the result. To do a full reduction call multiple times. + */ + "sse2_chop:\n\t" + "movdqa %%xmm0, %%xmm1\n\t" /* y = x */ + "pslld $16, %%xmm1\n\t" /* y <<= 16 */ + "psrld $16, %%xmm0\n\t" /* x >>= 16 */ + "psrld $16, %%xmm1\n\t" /* y >>= 16 */ + "psubd %%xmm0, %%xmm1\n\t" /* y -= x */ + "pslld $4, %%xmm0\n\t" /* x <<= 4 */ + "paddd %%xmm1, %%xmm0\n\t" /* x += y */ + "ret\n\t" +#if defined(__ELF__) && !defined(__clang__) + ".previous\n\t" +#else + "7:\n\t" +#endif + "movdqa %5, %%xmm5\n\t" /* get vord_b */ + "prefetchnta 0x70(%0)\n\t" + "movd %2, %%xmm2\n\t" /* init vector sum vs2 with s2 */ + "movd %1, %%xmm3\n\t" /* init vector sum vs1 with s1 */ + "pxor %%xmm4, %%xmm4\n" /* zero */ + "3:\n\t" + "pxor %%xmm7, %%xmm7\n\t" /* zero vs1_round_sum */ + ".p2align 3,,3\n\t" + ".p2align 2\n" + "2:\n\t" + "mov $128, %1\n\t" /* inner_k = 128 bytes till vs2_i overflows */ + "cmp %1, %3\n\t" + "cmovb %3, %1\n\t" /* inner_k = k >= inner_k ? 
inner_k : k */ + "and $-16, %1\n\t" /* inner_k = ROUND_TO(inner_k, 16) */ + "sub %1, %3\n\t" /* k -= inner_k */ + "shr $4, %1\n\t" /* inner_k /= 16 */ + "pxor %%xmm6, %%xmm6\n\t" /* zero vs2_i */ + ".p2align 4,,7\n" + ".p2align 3\n" + "1:\n\t" + "movdqa (%0), %%xmm0\n\t" /* fetch input data */ + "prefetchnta 0x70(%0)\n\t" + "paddd %%xmm3, %%xmm7\n\t" /* vs1_round_sum += vs1 */ + "add $16, %0\n\t" /* advance input data pointer */ + "dec %1\n\t" /* decrement inner_k */ + "movdqa %%xmm0, %%xmm1\n\t" /* make a copy of the input data */ +# if (HAVE_BINUTILS-0) >= 217 + "pmaddubsw %%xmm5, %%xmm0\n\t" /* multiply all input bytes by vord_b bytes, add adjacent results to words */ +# else + ".byte 0x66, 0x0f, 0x38, 0x04, 0xc5\n\t" /* pmaddubsw %%xmm5, %%xmm0 */ +# endif + "psadbw %%xmm4, %%xmm1\n\t" /* subtract zero from every byte, add 8 bytes to a sum */ + "paddw %%xmm0, %%xmm6\n\t" /* vs2_i += in * vord_b */ + "paddd %%xmm1, %%xmm3\n\t" /* vs1 += psadbw */ + "jnz 1b\n\t" /* repeat if inner_k != 0 */ + "movdqa %%xmm6, %%xmm0\n\t" /* copy vs2_i */ + "punpckhwd %%xmm4, %%xmm0\n\t" /* zero extend vs2_i upper words to dwords */ + "punpcklwd %%xmm4, %%xmm6\n\t" /* zero extend vs2_i lower words to dwords */ + "paddd %%xmm0, %%xmm2\n\t" /* vs2 += vs2_i.upper */ + "paddd %%xmm6, %%xmm2\n\t" /* vs2 += vs2_i.lower */ + "cmp $15, %3\n\t" + "jg 2b\n\t" /* if(k > 15) repeat */ + "movdqa %%xmm7, %%xmm0\n\t" /* move vs1_round_sum */ + "call sse2_chop\n\t" /* chop vs1_round_sum */ + "pslld $4, %%xmm0\n\t" /* vs1_round_sum *= 16 */ + "paddd %%xmm2, %%xmm0\n\t" /* vs2 += vs1_round_sum */ + "call sse2_chop\n\t" /* chop again */ + "movdqa %%xmm0, %%xmm2\n\t" /* move vs2 back in place */ + "movdqa %%xmm3, %%xmm0\n\t" /* move vs1 */ + "call sse2_chop\n\t" /* chop */ + "movdqa %%xmm0, %%xmm3\n\t" /* move vs1 back in place */ + "add %3, %4\n\t" /* len += k */ + "mov %6, %3\n\t" /* get max. byte count VNMAX till v1_round_sum overflows */ + "cmp %3, %4\n\t" + "cmovb %4, %3\n\t" /* k = len >= VNMAX ? 
k : len */ + "sub %3, %4\n\t" /* len -= k */ + "cmp $15, %3\n\t" + "jg 3b\n\t" /* if(k > 15) repeat */ + "pshufd $0xEE, %%xmm3, %%xmm1\n\t" /* collect vs1 & vs2 in lowest vector member */ + "pshufd $0xEE, %%xmm2, %%xmm0\n\t" + "paddd %%xmm3, %%xmm1\n\t" + "paddd %%xmm2, %%xmm0\n\t" + "pshufd $0xE5, %%xmm0, %%xmm2\n\t" + "paddd %%xmm0, %%xmm2\n\t" + "movd %%xmm1, %1\n\t" /* mov vs1 to s1 */ + "movd %%xmm2, %2\n" /* mov vs2 to s2 */ + "8:" + : /* %0 */ "=r" (buf), + /* %1 */ "=r" (s1), + /* %2 */ "=r" (s2), + /* %3 */ "=r" (k), + /* %4 */ "=r" (len) + : /* %5 */ "m" (vord_b), + /* + * somewhere between 5 & 6, psadbw 64 bit sums ruin the party + * spreading the sums with palignr only brings it to 7 (?), + * while introducing an op into the main loop (2800 ms -> 3200 ms) + */ + /* %6 */ "i" (5*NMAX), + /* */ "0" (buf), + /* */ "1" (s1), + /* */ "2" (s2), + /* */ "4" (len) + : "cc", "memory" +# ifdef __SSE__ + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +# endif + ); + + if (unlikely(k)) + buf = adler32_jumped(buf, &s1, &s2, k); + MOD28(s1); + MOD28(s2); + return (s2 << 16) | s1; +} + +/* ========================================================================= */ +local uLong adler32_SSE2(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + unsigned int s1 = adler & 0xffff; + unsigned int s2 = (adler >> 16) & 0xffff; + unsigned int k; + + k = ALIGN_DIFF(buf, 16); + len -= k; + if (k) + buf = adler32_jumped(buf, &s1, &s2, k); + + __asm__ __volatile__ ( + "mov %6, %3\n\t" + "cmp %3, %4\n\t" + "cmovb %4, %3\n\t" + "sub %3, %4\n\t" + "cmp $16, %3\n\t" + "jb 8f\n\t" + "prefetchnta 0x70(%0)\n\t" + "movd %1, %%xmm4\n\t" + "movd %2, %%xmm3\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm5, %%xmm5\n\t" + ".p2align 2\n" + "3:\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm7\n\t" + "mov $2048, %1\n\t" /* get byte count till vs2_{l|h}_word overflows */ + "cmp %1, %3\n\t" + "cmovb %3, %1\n" + "and $-16, %1\n\t" + "sub %1, %3\n\t" + "shr $4, %1\n\t" + ".p2align 4,,7\n" + ".p2align 3\n" + "1:\n\t" + "prefetchnta 0x70(%0)\n\t" + "movdqa (%0), %%xmm0\n\t" /* fetch input data */ + "paddd %%xmm4, %%xmm5\n\t" /* vs1_round_sum += vs1 */ + "add $16, %0\n\t" + "dec %1\n\t" + "movdqa %%xmm0, %%xmm1\n\t" /* copy input data */ + "psadbw %%xmm2, %%xmm0\n\t" /* add all bytes horiz. 
*/ + "paddd %%xmm0, %%xmm4\n\t" /* add that to vs1 */ + "movdqa %%xmm1, %%xmm0\n\t" /* copy input data */ + "punpckhbw %%xmm2, %%xmm1\n\t" /* zero extent input upper bytes to words */ + "punpcklbw %%xmm2, %%xmm0\n\t" /* zero extent input lower bytes to words */ + "paddw %%xmm1, %%xmm7\n\t" /* vs2_h_words += in_high_words */ + "paddw %%xmm0, %%xmm6\n\t" /* vs2_l_words += in_low_words */ + "jnz 1b\n\t" + "cmp $15, %3\n\t" + "pmaddwd 32+%5, %%xmm7\n\t" /* multiply vs2_h_words with order, add adjecend results */ + "pmaddwd 16+%5, %%xmm6\n\t" /* multiply vs2_l_words with order, add adjecend results */ + "paddd %%xmm7, %%xmm3\n\t" /* add to vs2 */ + "paddd %%xmm6, %%xmm3\n\t" /* add to vs2 */ + "jg 3b\n\t" + "movdqa %%xmm5, %%xmm0\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "call sse2_chop\n\t" + "pslld $4, %%xmm0\n\t" + "paddd %%xmm3, %%xmm0\n\t" + "call sse2_chop\n\t" + "movdqa %%xmm0, %%xmm3\n\t" + "movdqa %%xmm4, %%xmm0\n\t" + "call sse2_chop\n\t" + "movdqa %%xmm0, %%xmm4\n\t" + "add %3, %4\n\t" + "mov %6, %3\n\t" + "cmp %3, %4\n\t" + "cmovb %4, %3\n" + "sub %3, %4\n\t" + "cmp $15, %3\n\t" + "jg 3b\n\t" + "pshufd $0xEE, %%xmm4, %%xmm1\n\t" + "pshufd $0xEE, %%xmm3, %%xmm0\n\t" + "paddd %%xmm4, %%xmm1\n\t" + "paddd %%xmm3, %%xmm0\n\t" + "pshufd $0xE5, %%xmm0, %%xmm3\n\t" + "paddd %%xmm0, %%xmm3\n\t" + "movd %%xmm1, %1\n\t" + "movd %%xmm3, %2\n" + "8:\n\t" + : /* %0 */ "=r" (buf), + /* %1 */ "=r" (s1), + /* %2 */ "=r" (s2), + /* %3 */ "=r" (k), + /* %4 */ "=r" (len) + : /* %5 */ "m" (vord), + /* %6 */ "i" (5*NMAX), + /* */ "0" (buf), + /* */ "1" (s1), + /* */ "2" (s2), + /* */ "3" (k), + /* */ "4" (len) + : "cc", "memory" +# ifdef __SSE__ + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +# endif + ); + + if (unlikely(k)) + buf = adler32_jumped(buf, &s1, &s2, k); + MOD28(s1); + MOD28(s2); + return (s2 << 16) | s1; +} + +# if 0 +/* ========================================================================= */ +/* + * The SSE2 version above is faster on my CPUs (Athlon64, Core2, + * P4 Xeon, K10 Sempron), but has instruction stalls only a + * Out-Of-Order-Execution CPU can solve. + * So this Version _may_ be better for the new old thing, Atom. 
+ */ +local noinline uLong adler32_SSE2_no_oooe(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + unsigned int s1 = adler & 0xffff; + unsigned int s2 = (adler >> 16) & 0xffff; + unsigned int k; + + k = ALIGN_DIFF(buf, 16); + len -= k; + if (k) + buf = adler32_jumped(buf, &s1, &s2, k); + + __asm__ __volatile__ ( + "mov %6, %3\n\t" + "cmp %3, %4\n\t" + "cmovb %4, %3\n\t" + "sub %3, %4\n\t" + "cmp $16, %3\n\t" + "jb 8f\n\t" + "movdqa 16+%5, %%xmm6\n\t" + "movdqa 32+%5, %%xmm5\n\t" + "prefetchnta 16(%0)\n\t" + "pxor %%xmm7, %%xmm7\n\t" + "movd %1, %%xmm4\n\t" + "movd %2, %%xmm3\n\t" + ".p2align 3,,3\n\t" + ".p2align 2\n" + "1:\n\t" + "prefetchnta 32(%0)\n\t" + "movdqa (%0), %%xmm1\n\t" + "sub $16, %3\n\t" + "movdqa %%xmm4, %%xmm2\n\t" + "add $16, %0\n\t" + "movdqa %%xmm1, %%xmm0\n\t" + "cmp $15, %3\n\t" + "pslld $4, %%xmm2\n\t" + "paddd %%xmm3, %%xmm2\n\t" + "psadbw %%xmm7, %%xmm0\n\t" + "paddd %%xmm0, %%xmm4\n\t" + "movdqa %%xmm1, %%xmm0\n\t" + "punpckhbw %%xmm7, %%xmm1\n\t" + "punpcklbw %%xmm7, %%xmm0\n\t" + "movdqa %%xmm1, %%xmm3\n\t" + "pmaddwd %%xmm6, %%xmm0\n\t" + "paddd %%xmm2, %%xmm0\n\t" + "pmaddwd %%xmm5, %%xmm3\n\t" + "paddd %%xmm0, %%xmm3\n\t" + "jg 1b\n\t" + "movdqa %%xmm3, %%xmm0\n\t" + "call sse2_chop\n\t" + "call sse2_chop\n\t" + "movdqa %%xmm0, %%xmm3\n\t" + "movdqa %%xmm4, %%xmm0\n\t" + "call sse2_chop\n\t" + "movdqa %%xmm0, %%xmm4\n\t" + "add %3, %4\n\t" + "mov %6, %3\n\t" + "cmp %3, %4\n\t" + "cmovb %4, %3\n\t" + "sub %3, %4\n\t" + "cmp $15, %3\n\t" + "jg 1b\n\t" + "pshufd $0xEE, %%xmm3, %%xmm0\n\t" + "pshufd $0xEE, %%xmm4, %%xmm1\n\t" + "paddd %%xmm3, %%xmm0\n\t" + "pshufd $0xE5, %%xmm0, %%xmm2\n\t" + "paddd %%xmm4, %%xmm1\n\t" + "movd %%xmm1, %1\n\t" + "paddd %%xmm0, %%xmm2\n\t" + "movd %%xmm2, %2\n" + "8:" + : /* %0 */ "=r" (buf), + /* %1 */ "=r" (s1), + /* %2 */ "=r" (s2), + /* %3 */ "=r" (k), + /* %4 */ "=r" (len) + : /* %5 */ "m" (vord), + /* %6 */ "i" (NMAX + NMAX/3), + /* */ "0" (buf), + /* */ "1" (s1), + /* */ "2" (s2), + /* */ "4" (len) + : "cc", "memory" +# ifdef __SSE__ + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +# endif + ); + + if (unlikely(k)) + buf = adler32_jumped(buf, &s1, &s2, k); + MOD28(s1); + MOD28(s2); + return (s2 << 16) | s1; +} +# endif + +# ifndef __x86_64__ +/* ========================================================================= */ +/* + * SSE version to help VIA-C3_2, P2 & P3 + */ +local uLong adler32_SSE(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + unsigned int s1 = adler & 0xffff; + unsigned int s2 = (adler >> 16) & 0xffff; + unsigned int k; + + k = ALIGN_DIFF(buf, 8); + len -= k; + if (k) + buf = adler32_jumped(buf, &s1, &s2, k); + + __asm__ __volatile__ ( + "mov %6, %3\n\t" + "cmp %3, %4\n\t" + "cmovb %4, %3\n\t" + "sub %3, %4\n\t" + "cmp $8, %3\n\t" + "jb 8f\n\t" + "movd %1, %%mm4\n\t" + "movd %2, %%mm3\n\t" + "pxor %%mm2, %%mm2\n\t" + "pxor %%mm5, %%mm5\n\t" +# ifdef __ELF__ + ".subsection 2\n\t" +# else + "jmp 7f\n\t" +# endif + ".p2align 2\n" + "mmx_chop:\n\t" + "movq %%mm0, %%mm1\n\t" + "pslld $16, %%mm1\n\t" + "psrld $16, %%mm0\n\t" + "psrld $16, %%mm1\n\t" + "psubd %%mm0, %%mm1\n\t" + "pslld $4, %%mm0\n\t" + "paddd %%mm1, %%mm0\n\t" + "ret\n\t" +# ifdef __ELF__ + ".previous\n\t" +# else + "7:\n\t" +# endif + ".p2align 2\n" + "3:\n\t" + "pxor %%mm6, %%mm6\n\t" + "pxor %%mm7, %%mm7\n\t" + "mov $1024, %1\n\t" + "cmp %1, %3\n\t" + "cmovb %3, %1\n" + "and $-8, %1\n\t" + "sub %1, %3\n\t" + "shr $3, %1\n\t" + ".p2align 4,,7\n" + ".p2align 3\n" + "1:\n\t" + "movq (%0), 
%%mm0\n\t" + "paddd %%mm4, %%mm5\n\t" + "add $8, %0\n\t" + "dec %1\n\t" + "movq %%mm0, %%mm1\n\t" + "psadbw %%mm2, %%mm0\n\t" + "paddd %%mm0, %%mm4\n\t" + "movq %%mm1, %%mm0\n\t" + "punpckhbw %%mm2, %%mm1\n\t" + "punpcklbw %%mm2, %%mm0\n\t" + "paddw %%mm1, %%mm7\n\t" + "paddw %%mm0, %%mm6\n\t" + "jnz 1b\n\t" + "cmp $7, %3\n\t" + "pmaddwd 40+%5, %%mm7\n\t" + "pmaddwd 32+%5, %%mm6\n\t" + "paddd %%mm7, %%mm3\n\t" + "paddd %%mm6, %%mm3\n\t" + "jg 3b\n\t" + "movq %%mm5, %%mm0\n\t" + "pxor %%mm5, %%mm5\n\t" + "call mmx_chop\n\t" + "pslld $3, %%mm0\n\t" + "paddd %%mm3, %%mm0\n\t" + "call mmx_chop\n\t" + "movq %%mm0, %%mm3\n\t" + "movq %%mm4, %%mm0\n\t" + "call mmx_chop\n\t" + "movq %%mm0, %%mm4\n\t" + "add %3, %4\n\t" + "mov %6, %3\n\t" + "cmp %3, %4\n\t" + "cmovb %4, %3\n" + "sub %3, %4\n\t" + "cmp $7, %3\n\t" + "jg 3b\n\t" + "movd %%mm4, %1\n\t" + "psrlq $32, %%mm4\n\t" + "movd %%mm3, %2\n\t" + "psrlq $32, %%mm3\n\t" + "movd %%mm4, %4\n\t" + "add %4, %1\n\t" + "movd %%mm3, %4\n\t" + "add %4, %2\n\t" + "emms\n" + "8:\n\t" + : /* %0 */ "=r" (buf), + /* %1 */ "=r" (s1), + /* %2 */ "=r" (s2), + /* %3 */ "=r" (k), + /* %4 */ "=r" (len) + : /* %5 */ "m" (vord), + /* %6 */ "i" ((5*NMAX)/2), + /* */ "0" (buf), + /* */ "1" (s1), + /* */ "2" (s2), + /* */ "3" (k), + /* */ "4" (len) + : "cc", "memory" +# ifdef __MMX__ + , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" +# endif + ); + + if (unlikely(k)) + buf = adler32_jumped(buf, &s1, &s2, k); + MOD28(s1); + MOD28(s2); + return (s2 << 16) | s1; +} + +/* ========================================================================= */ +/* + * Processors which only have MMX will prop. not like this + * code, they are so old, they are not Out-Of-Order + * (maybe except AMD K6, Cyrix, Winchip/VIA). + * I did my best to get at least 1 instruction between result -> use + */ +local uLong adler32_MMX(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + unsigned int s1 = adler & 0xffff; + unsigned int s2 = (adler >> 16) & 0xffff; + unsigned int k; + + k = ALIGN_DIFF(buf, 8); + len -= k; + if (k) + buf = adler32_jumped(buf, &s1, &s2, k); + + __asm__ __volatile__ ( + "mov %6, %3\n\t" + "cmp %3, %4\n\t" + "jae 6f\n\t" + "mov %4, %3\n" + "6:\n\t" + "sub %3, %4\n\t" + "cmp $8, %3\n\t" + "jb 8f\n\t" + "sub $8, %%esp\n\t" + "movd %1, %%mm4\n\t" + "movd %2, %%mm2\n\t" + "movq %5, %%mm3\n" + "5:\n\t" + "movq %%mm2, %%mm0\n\t" + "pxor %%mm2, %%mm2\n\t" + "pxor %%mm5, %%mm5\n\t" + ".p2align 2\n" + "3:\n\t" + "movq %%mm0, (%%esp)\n\t" + "pxor %%mm6, %%mm6\n\t" + "pxor %%mm7, %%mm7\n\t" + "mov $1024, %1\n\t" + "cmp %1, %3\n\t" + "jae 4f\n\t" + "mov %3, %1\n" + "4:\n\t" + "and $-8, %1\n\t" + "sub %1, %3\n\t" + "shr $3, %1\n\t" + ".p2align 4,,7\n\t" + ".p2align 3\n" + "1:\n\t" + "movq (%0), %%mm0\n\t" + "paddd %%mm4, %%mm5\n\t" + "add $8, %0\n\t" + "dec %1\n\t" + "movq %%mm0, %%mm1\n\t" + "punpcklbw %%mm2, %%mm0\n\t" + "punpckhbw %%mm2, %%mm1\n\t" + "paddw %%mm0, %%mm6\n\t" + "paddw %%mm1, %%mm0\n\t" + "paddw %%mm1, %%mm7\n\t" + "pmaddwd %%mm3, %%mm0\n\t" + "paddd %%mm0, %%mm4\n\t" + "jnz 1b\n\t" + "movq (%%esp), %%mm0\n\t" + "cmp $7, %3\n\t" + "pmaddwd 32+%5, %%mm6\n\t" + "pmaddwd 40+%5, %%mm7\n\t" + "paddd %%mm6, %%mm0\n\t" + "paddd %%mm7, %%mm0\n\t" + "jg 3b\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm5, %%mm0\n\t" + "call mmx_chop\n\t" + "pslld $3, %%mm0\n\t" + "paddd %%mm2, %%mm0\n\t" + "call mmx_chop\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm4, %%mm0\n\t" + "call mmx_chop\n\t" + "movq %%mm0, %%mm4\n\t" + "add %3, %4\n\t" + "mov %6, %3\n\t" + "cmp %3, 
%4\n\t" + "jae 2f\n\t" + "mov %4, %3\n" + "2:\n\t" + "sub %3, %4\n\t" + "cmp $7, %3\n\t" + "jg 5b\n\t" + "add $8, %%esp\n\t" + "movd %%mm4, %1\n\t" + "psrlq $32, %%mm4\n\t" + "movd %%mm2, %2\n\t" + "psrlq $32, %%mm2\n\t" + "movd %%mm4, %4\n\t" + "add %4, %1\n\t" + "movd %%mm2, %4\n\t" + "add %4, %2\n\t" + "emms\n" + "8:\n\t" + : /* %0 */ "=r" (buf), + /* %1 */ "=r" (s1), + /* %2 */ "=r" (s2), + /* %3 */ "=r" (k), + /* %4 */ "=r" (len) + : /* %5 */ "m" (vord), + /* %6 */ "i" (4*NMAX), + /* */ "0" (buf), + /* */ "1" (s1), + /* */ "2" (s2), + /* */ "3" (k), + /* */ "4" (len) + : "cc", "memory" +# ifdef __MMX__ + , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" +# endif + ); + + if (unlikely(k)) + buf = adler32_jumped(buf, &s1, &s2, k); + MOD28(s1); + MOD28(s2); + return (s2 << 16) | s1; +} +# endif + +/* ========================================================================= */ +local uLong adler32_x86(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + /* split Adler-32 into component sums */ + unsigned int s1 = adler & 0xffff; + unsigned int s2 = (adler >> 16) & 0xffff; + unsigned int n; + + do { + /* find maximum, len or NMAX */ + n = len < NMAX ? len : NMAX; + len -= n; + + /* do it */ + buf = adler32_jumped(buf, &s1, &s2, n); + /* modulo */ + MOD(s1); + MOD(s2); + } while (likely(len)); + + /* return recombined sums */ + return (s2 << 16) | s1; +} + +/* ========================================================================= */ +#define NO_ADLER32_GE16 +local noinline uLong adler32_ge16(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + /* split Adler-32 into component sums */ + unsigned int s1 = adler & 0xffff; + unsigned int s2 = (adler >> 16) & 0xffff; + + /* simply do it, we do not expect more then NMAX as len */ + adler32_jumped(buf, &s1, &s2, len); + /* actually we expect much less, MOD28 it */ + MOD28(s1); + MOD28(s2); + + /* return recombined sums */ + return (s2 << 16) | s1; +} + +/* ========================================================================= */ +/* + * Knot it all together with a runtime switch + */ +/* ========================================================================= */ +/* function enum */ +enum adler32_types +{ + T_ADLER32_RTSWITCH = 0, + T_ADLER32_X86, +# ifndef __x86_64__ + T_ADLER32_MMX, + T_ADLER32_SSE, +# endif + T_ADLER32_SSE2, + T_ADLER32_SSSE3, + T_ADLER32_MAX +}; + +/* ========================================================================= */ +/* Decision table */ +local const struct test_cpu_feature tfeat_adler32_vec[] = +{ + /* func flags features */ + {T_ADLER32_SSSE3, 0, {CFB(CFEATURE_CMOV), CFB(CFEATURE_SSSE3)}}, + {T_ADLER32_SSE2, 0, {CFB(CFEATURE_SSE2)|CFB(CFEATURE_CMOV), 0}}, +# ifndef __x86_64__ + {T_ADLER32_SSE, 0, {CFB(CFEATURE_SSE)|CFB(CFEATURE_CMOV), 0}}, + {T_ADLER32_MMX, 0, {CFB(CFEATURE_MMX), 0}}, +# endif + {T_ADLER32_X86, CFF_DEFAULT, { 0, 0}}, +}; + +/* ========================================================================= */ +/* Prototypes */ +local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len); + +/* ========================================================================= */ +/* Function pointer table */ +local uLong (*const adler32_ptr_tab[])(uLong adler, const Bytef *buf, uInt len) = +{ + adler32_vec_runtimesw, + adler32_x86, +# ifndef __x86_64__ + adler32_MMX, + adler32_SSE, +# endif + adler32_SSE2, + adler32_SSSE3, +}; + +/* ========================================================================= */ +# if _FORTIFY_SOURCE-0 > 0 +/* Runtime decide 
var */ +local enum adler32_types adler32_f_type = T_ADLER32_RTSWITCH; +# else +/* Runtime Function pointer */ +local uLong (*adler32_vec_ptr)(uLong adler, const Bytef *buf, uInt len) = adler32_vec_runtimesw; +# endif + +/* ========================================================================= */ +/* Constructor to init the decide var early */ +local GCC_ATTR_CONSTRUCTOR void adler32_vec_select(void) +{ + enum adler32_types lf_type = + _test_cpu_feature(tfeat_adler32_vec, sizeof (tfeat_adler32_vec)/sizeof (tfeat_adler32_vec[0])); +# if _FORTIFY_SOURCE-0 > 0 + adler32_f_type = lf_type; +# else + adler32_vec_ptr = adler32_ptr_tab[lf_type]; +# endif +} + +/* ========================================================================= */ +/* Jump function */ +local noinline uLong adler32_vec(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + /* + * Protect us from memory corruption. As long as the function pointer table + * resides in rodata, with a little bounding we can prevent arb. code + * execution (overwritten vtable pointer). We still may crash if the corruption + * is within bounds (or the cpudata gets corrupted too) and we jump into a + * function with unsupported instr., but this should mitigate the worst case + * scenario. + * But it's more expensive than a simple function pointer, so only when more + * security is wanted. + */ +# if _FORTIFY_SOURCE-0 > 0 + enum adler32_types lf_type = adler32_f_type; + /* + * If the compiler is smart it creates a cmp + sbb + and; cmov has a high + * latency and is not always available. + * Otherwise the compiler logic is advanced enough to see what's happening here, + * so maybe there is a reason why it changes this to a cmov... + * (or it simply does not see it can create a conditional -1/0 the cheap way) + * + * Maybe change it to an unlikely() cbranch? That still leaves the question + * of the misprediction probability, esp. with lots of different x86 + * microarchs and not always perfect CFLAGS (-march/-mtune) to arrange the + * code to the processor's liking. + */ + lf_type &= likely((unsigned)lf_type < (unsigned)T_ADLER32_MAX) ? 
-1 : 0; + return adler32_ptr_tab[lf_type](adler, buf, len); +# else + return adler32_vec_ptr(adler, buf, len); +# endif +} + +/* ========================================================================= */ +/* + * the runtime switcher is a little racy, but this is OK, + * it should normaly not run if the constructor works, and + * we are on x86, which isn't that picky about ordering + */ +local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len) +{ + adler32_vec_select(); + return adler32_vec(adler, buf, len); +} +#endif diff --git a/libs/zlib/x86/cpudet.c b/libs/zlib/x86/cpudet.c new file mode 100644 index 0000000..18130ba --- /dev/null +++ b/libs/zlib/x86/cpudet.c @@ -0,0 +1,154 @@ +/* cpudet.c -- runtime cpu detection, x86 part + * Copyright (C) 2009-2011 Jan Seiffert + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "x86.h" + +/* ========================================================================= */ +/* Internal data types */ +struct cpuid_regs +{ + unsigned long eax, ebx, ecx, edx; +}; + +local struct +{ + unsigned int max_basic; + unsigned int features[FEATURE_WORDS]; + int init_done; +} our_cpu; + +/* ========================================================================= */ +local inline unsigned long read_flags(void) +{ + unsigned long f; + __asm__ __volatile__ ( + "pushf\n\t" + "pop %0\n\t" + : "=r" (f) + ); + return f; +} + +/* ========================================================================= */ +local inline void write_flags(unsigned long f) +{ + __asm__ __volatile__ ( + "push %0\n\t" + "popf\n\t" + : : "ri" (f) : "cc" + ); +} + +/* ========================================================================= */ +local inline void cpuid(struct cpuid_regs *regs, unsigned long func) +{ + /* save ebx around cpuid call, PIC code needs it */ + __asm__ __volatile__ ( + "xchg %1, " PICREG "\n\t" + "cpuid\n\t" + "xchg %1, " PICREG "\n" + : /* %0 */ "=a" (regs->eax), + /* %1 */ "=r" (regs->ebx), + /* %2 */ "=c" (regs->ecx), + /* %4 */ "=d" (regs->edx) + : /* %5 */ "0" (func), + /* %6 */ "2" (regs->ecx) + : "cc" + ); +} + +/* ========================================================================= */ +local inline void cpuids(struct cpuid_regs *regs, unsigned long func) +{ + regs->ecx = 0; + cpuid(regs, func); +} + +/* ========================================================================= */ +local inline int toggle_eflags_test(const unsigned long mask) +{ + unsigned long f; + int result; + + f = read_flags(); + write_flags(f ^ mask); + result = !!((f ^ read_flags()) & mask); + /* + * restore the old flags, the test for i486 tests the alignment + * check bit, and left set will confuse the x86 software world. 
+ */ + write_flags(f); + return result; +} + +/* ========================================================================= */ +local inline int is_486(void) +{ + return toggle_eflags_test(1 << 18); +} + +/* ========================================================================= */ +local inline int has_cpuid(void) +{ + return toggle_eflags_test(1 << 21); +} + +/* ========================================================================= */ +local void identify_cpu(void) +{ + struct cpuid_regs a; + + if (our_cpu.init_done) + return; + + our_cpu.init_done = -1; + /* force a write out to memory */ + __asm__ __volatile__ ("" : : "m" (our_cpu.init_done)); + + if (!is_486()) + return; + + if (!has_cpuid()) + return; + + /* get the maximum basic leaf number */ + cpuids(&a, 0x00000000); + our_cpu.max_basic = (unsigned int)a.eax; + /* we could get the vendor string from ebx, edx, ecx */ + + /* get the first basic leaf, if it is avail. */ + if (our_cpu.max_basic >= 0x00000001) + cpuids(&a, 0x00000001); + else + a.eax = a.ebx = a.ecx = a.edx = 0; + + /* we could extract family, model, stepping from eax */ + + /* there is the first set of features */ + our_cpu.features[0] = a.edx; + our_cpu.features[1] = a.ecx; + + /* now we could test the extended features, but is not needed, for now */ +} + +/* ========================================================================= */ +int ZLIB_INTERNAL _test_cpu_feature (t, l) + const struct test_cpu_feature *t; + unsigned int l; +{ + unsigned int i, j, f; + identify_cpu(); + + for (i = 0; i < l; i++) { + if (t[i].flags & CFF_DEFAULT) + return t[i].f_type; + for (f = 0, j = 0; j < FEATURE_WORDS; j++) + f |= (our_cpu.features[j] & t[i].features[j]) ^ t[i].features[j]; + if (f) + continue; + return t[i].f_type; + } + return 1; /* default */ +} diff --git a/libs/zlib/x86/slhash.c b/libs/zlib/x86/slhash.c new file mode 100644 index 0000000..0450516 --- /dev/null +++ b/libs/zlib/x86/slhash.c @@ -0,0 +1,415 @@ +/* slhash.c -- slide the hash table during fill_window() + * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler + * Copyright (C) 2011 Jan Seiffert + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "x86.h" + +/* inline asm, so only on GCC (or compatible) */ +#if defined(__GNUC__) && !defined(VEC_NO_GO) +# define HAVE_SLHASH_VEC +# define HAVE_SLHASH_COMPLETE + +#define IGNORE_MMX + +local noinline void update_hoffset_x86(Posf *p, uInt wsize, unsigned n); +local noinline void slhash_x86(Posf *p, Posf *q, uInt wsize, unsigned n); +local noinline void slhash_SSE2(Posf *p, Posf *q, uInt wsize, unsigned n); + +/* NOTE: + * We do not precheck the length or wsize for small values because + * we assume a minimum len of 256 (for MEM_LEVEL 1) and a minimum wsize + * of 256 for windowBits 8 + */ + +/* ========================================================================= */ +/* This is totally bogus, because the Pos type is only 16 bit, and as soon as + * wsize > 65534, we can not hold the distances in a Pos. All this is a + * kind of complicated memset 0. + */ +local void update_hoffset_SSE4_1(Posf *p, uInt wsize, unsigned n) +{ + register unsigned m; + unsigned int i, j; + + i = ALIGN_DIFF(p, 8)/sizeof(Pos); + n -= i; + if (unlikely(i)) do { + m = *p; + *p++ = (Pos)(m >= wsize ? 
m-wsize : NIL); + } while (--i); + i = n / 4; + n %= 4; + asm ( + "pxor %%xmm6, %%xmm6\n\t" + "movd %k3, %%xmm7\n\t" + "pshufd $0, %%xmm7, %%xmm7\n\t" + "test $8, %0\n\t" + "jz 2f\n\t" + "movq (%0), %%xmm0\n\t" + "add $8, %0\n\t" + "dec %1\n\t" + "punpcklwd %%xmm6, %%xmm0\n\t" + "psubd %%xmm7, %%xmm0\n\t" + "packusdw %%xmm6, %%xmm0\n\t" + "movq %%xmm0, -8(%0)\n" + "2:\n\t" + "mov %1, %2\n\t" + "shr $1, %1\n\t" + "and $1, %2\n\t" + ".p2align 3\n" + "1:\n\t" + "movdqa (%0), %%xmm0\n\t" + "add $16, %0\n\t" + "movdqa %%xmm0, %%xmm1\n\t" + "punpcklwd %%xmm6, %%xmm0\n\t" + "punpckhwd %%xmm6, %%xmm1\n\t" + "psubd %%xmm7, %%xmm0\n\t" + "psubd %%xmm7, %%xmm1\n\t" + "packusdw %%xmm1, %%xmm0\n\t" + "movdqa %%xmm0, -16(%0)\n\t" + "dec %1\n\t" + "jnz 1b\n\t" + "test %2, %2\n\t" + "jz 3f\n\t" + "movq (%0), %%xmm0\n\t" + "add $8, %0\n\t" + "punpcklwd %%xmm6, %%xmm0\n\t" + "psubd %%xmm7, %%xmm0\n\t" + "packusdw %%xmm6, %%xmm0\n\t" + "movq %%xmm0, -8(%0)\n" + "3:" + : /* %0 */ "=r" (p), + /* %1 */ "=r" (i), + /* %2 */ "=r" (j) + : /* %3 */ "r" (wsize), + /* */ "0" (p), + /* */ "1" (i) +# ifdef __SSE2__ + : "xmm0", "xmm7" +# endif + ); + if (unlikely(n)) + update_hoffset_x86(p, wsize, n); +} + +/* ========================================================================= */ +local void slhash_SSE4_1(Posf *p, Posf *q, uInt wsize, unsigned n) +{ + if (likely(wsize <= (1<<16)-1)) { + slhash_SSE2(p, q, wsize, n); + return; + } + + update_hoffset_SSE4_1(p, wsize, n); +# ifndef FASTEST + /* If n is not on any hash chain, prev[n] is garbage but + * its value will never be used. + */ + update_hoffset_SSE4_1(q, wsize, wsize); +# endif +} + +/* ========================================================================= */ +local void update_hoffset_SSE2(Posf *p, uInt wsize, unsigned n) +{ + register unsigned m; + unsigned int i, j; + + i = ALIGN_DIFF(p, 8)/sizeof(Pos); + n -= i; + if (unlikely(i)) do { + m = *p; + *p++ = (Pos)(m >= wsize ? m-wsize : NIL); + } while (--i); + i = n / 4; + n %= 4; + asm ( + "movd %k3, %%xmm7\n\t" + "pshuflw $0, %%xmm7, %%xmm7\n\t" + "pshufd $0, %%xmm7, %%xmm7\n\t" + "test $8, %0\n\t" + "jz 2f\n\t" + "movq (%0), %%xmm0\n\t" + "add $8, %0\n\t" + "dec %1\n\t" + "psubusw %%xmm7, %%xmm0\n\t" + "movq %%xmm0, -8(%0)\n\t" + "2:\n\t" + "mov %1, %2\n\t" + "shr $1, %1\n\t" + "and $1, %2\n\t" + ".p2align 3\n" + "1:\n\t" + "movdqa (%0), %%xmm0\n\t" + "add $16, %0\n\t" + "psubusw %%xmm7, %%xmm0\n\t" + "movdqa %%xmm0, -16(%0)\n\t" + "dec %1\n\t" + "jnz 1b\n\t" + "test %2, %2\n\t" + "jz 3f\n\t" + "movq (%0), %%xmm0\n\t" + "add $8, %0\n\t" + "psubusw %%xmm7, %%xmm0\n\t" + "movq %%xmm0, -8(%0)\n\t" + "3:" + : /* %0 */ "=r" (p), + /* %1 */ "=r" (i), + /* %2 */ "=r" (j) + : /* %3 */ "r" (wsize), + /* */ "0" (p), + /* */ "1" (i) +# ifdef __SSE2__ + : "xmm0", "xmm7" +# endif + ); + if (unlikely(n)) + update_hoffset_x86(p, wsize, n); +} + +/* ========================================================================= */ +local noinline void slhash_SSE2(Posf *p, Posf *q, uInt wsize, unsigned n) +{ + if (unlikely(wsize > (1 << 16)-1)) { + slhash_x86(p, q, wsize, n); + return; + } + + update_hoffset_SSE2(p, wsize, n); +# ifndef FASTEST + /* If n is not on any hash chain, prev[n] is garbage but + * its value will never be used. 
+ */ + update_hoffset_SSE2(q, wsize, wsize); +# endif +} + +# ifndef __x86_64__ +# ifndef IGNORE_MMX +/* ========================================================================= */ +local void update_hoffset_MMX(Posf *p, uInt wsize, unsigned n) +{ + register unsigned m; + unsigned int i; + + i = ALIGN_DIFF(p, 8)/sizeof(Pos); + n -= i; + if (unlikely(i)) do { + m = *p; + *p++ = (Pos)(m >= wsize ? m-wsize : NIL); + } while (--i); + i = n / 4; + n %= 4; + asm ( + "movd %k2, %%mm7\n\t" + "pshufw $0, %%mm7, %%mm7\n\t" + ".p2align 2\n" + "1:\n\t" + "movq (%0), %%mm0\n\t" + "add $8, %0\n\t" + "psubusw %%mm7, %%mm0\n\t" + "movq %%mm0, -8(%0)\n\t" + "dec %1\n\t" + "jnz 1b" + : /* %0 */ "=r" (p), + /* %1 */ "=r" (i) + : /* %2 */ "r" (wsize), + /* */ "0" (p), + /* */ "1" (i) +# ifdef __MMX__ + : "mm0", "mm7" +# endif + ); + if (unlikely(n)) + update_hoffset_x86(p, wsize, n); +} + +/* ========================================================================= */ +local noinline void slhash_MMX(Posf *p, Posf *q, uInt wsize, unsigned n) +{ + if (unlikely(wsize > (1 << 16)-1)) { + slhash_x86(p, q, wsize, n); + return; + } + + update_hoffset_MMX(p, wsize, n); +# ifndef FASTEST + /* If n is not on any hash chain, prev[n] is garbage but + * its value will never be used. + */ + update_hoffset_MMX(q, wsize, wsize); +# endif + asm volatile ("emms"); +} +# endif +# endif +/* ========================================================================= */ +local noinline void update_hoffset_x86(Posf *p, uInt wsize, unsigned n) +{ + /* + * This code is cheaper than a cmov, measuring whole loops with + * rdtsc: + * This code: 593216 + * compiler: 1019864 + * (and 1000 runs show the same trend) + * Old CPUs without cmov will also love it, better than jumps. + * + * GCC does not manage to create it, x86 is a cc_mode target, + * and probably will stay one forever. + */ + do { + register unsigned m = *p; + unsigned t; + asm ( + "sub %2, %0\n\t" + "sbb $0, %1\n\t" + : "=r" (m), + "=r" (t) + : "r" (wsize), + "0" (m), + "1" (0) + ); + *p++ = (Pos)(m & ~t); + } while (--n); +} + +/* ========================================================================= */ +local noinline void slhash_x86(Posf *p, Posf *q, uInt wsize, unsigned n) +{ + update_hoffset_x86(p, wsize, n); +# ifndef FASTEST + /* If n is not on any hash chain, prev[n] is garbage but + * its value will never be used. 
+ */ + update_hoffset_x86(q, wsize, wsize); +# endif +} + +/* + * Knot it all together with a runtime switch + */ +/* ========================================================================= */ +/* function enum */ +enum slhash_types +{ + T_SLHASH_RTSWITCH = 0, + T_SLHASH_X86, +# ifndef __x86_64__ +# ifndef IGNORE_MMX + T_SLHASH_MMX, +# endif +# endif + T_SLHASH_SSE2, + T_SLHASH_SSE4_1, + T_SLHASH_MAX +}; + +/* ========================================================================= */ +/* Decision table */ +local const struct test_cpu_feature tfeat_slhash_vec[] = +{ + /* func flags features */ + {T_SLHASH_SSE4_1, 0, {0, CFB(CFEATURE_SSE4_1)}}, + {T_SLHASH_SSE2, 0, {CFB(CFEATURE_SSE2), 0}}, +# ifndef __x86_64__ +# ifndef IGNORE_MMX + {T_SLHASH_MMX, 0, {CFB(CFEATURE_MMX), 0}}, +# endif +# endif + {T_SLHASH_X86, CFF_DEFAULT, { 0, 0}}, +}; + +/* ========================================================================= */ +/* Prototypes */ +local void slhash_vec_runtimesw(Posf *p, Posf *q, uInt wsize, unsigned n); + +/* ========================================================================= */ +/* Function pointer table */ +local void (*const slhash_ptr_tab[])(Posf *p, Posf *q, uInt wsize, unsigned n) = +{ + slhash_vec_runtimesw, + slhash_x86, +# ifndef __x86_64__ +# ifndef IGNORE_MMX + slhash_MMX, +# endif +# endif + slhash_SSE2, + slhash_SSE4_1, +}; + +/* ========================================================================= */ +# if _FORTIFY_SOURCE-0 > 0 +/* Runtime decide var */ +local enum slhash_types slhash_f_type = T_SLHASH_RTSWITCH; +# else +/* Runtime Function pointer */ +local void (*slhash_vec_ptr)(Posf *p, Posf *q, uInt wsize, unsigned n) = slhash_vec_runtimesw; +# endif + +/* ========================================================================= */ +/* Constructor to init the decide var early */ +local GCC_ATTR_CONSTRUCTOR void slhash_vec_select(void) +{ + enum slhash_types lf_type = + _test_cpu_feature(tfeat_slhash_vec, sizeof (tfeat_slhash_vec)/sizeof (tfeat_slhash_vec[0])); +# if _FORTIFY_SOURCE-0 > 0 + slhash_f_type = lf_type; +# else + slhash_vec_ptr = slhash_ptr_tab[lf_type]; +# endif +} + +/* ========================================================================= */ +/* Jump function */ +void ZLIB_INTERNAL _sh_slide (p, q, wsize, n) + Posf *p; + Posf *q; + uInt wsize; + unsigned n; +{ + /* + * Protect us from memory corruption. As long as the function pointer table + * resides in rodata, with a little bounding we can prevent arb. code + * execution (overwriten vtable pointer). We still may crash if the corruption + * is within bounds (or the cpudata gets corrupted too) and we jump into an + * function with unsupported instr., but this should mitigate the worst case + * scenario. + * But it's more expensive than a simple function pointer, so only when more + * security is wanted. + */ +# if _FORTIFY_SOURCE-0 > 0 + enum slhash_types lf_type = slhash_f_type; + /* + * If the compiler is smart he creates a cmp + sbb + and, cmov have a high + * latency and are not always avail. + * Otherwise compiler logic is advanced enough to see what's happening here, + * so there maybe is a reason why he changes this to a cmov... + * (or he simply does not see he can create a conditional -1/0 the cheap way) + * + * Maybe change it to an unlikely() cbranch? Which still leaves the question + * what's the mispredition propability, esp. with lots of different x86 + * microarchs and not always perfect CFLAGS (-march/-mtune) to arrange the + * code to the processors liking. 
+ */ + lf_type &= likely((unsigned)lf_type < (unsigned)T_SLHASH_MAX) ? -1 : 0; + return slhash_ptr_tab[lf_type](p, q, wsize, n); +# else + return slhash_vec_ptr(p, q, wsize, n); +# endif +} + +/* ========================================================================= */ +/* + * the runtime switcher is a little racy, but this is OK, + * it should normaly not run if the constructor works, and + * we are on x86, which isn't that picky about ordering + */ +local void slhash_vec_runtimesw(Posf *p, Posf *q, uInt wsize, unsigned n) +{ + slhash_vec_select(); + return _sh_slide(p, q, wsize, n); +} +#endif diff --git a/libs/zlib/x86/x86.h b/libs/zlib/x86/x86.h new file mode 100644 index 0000000..95bb33a --- /dev/null +++ b/libs/zlib/x86/x86.h @@ -0,0 +1,45 @@ +/* x86.h -- x86 cpu magic + * Copyright (C) 2009-2011 Jan Seiffert + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef X86_H +#define X86_H + +#if GCC_VERSION_GE(207) +# define GCC_ATTR_CONSTRUCTOR __attribute__((__constructor__)) +#else +# define VEC_NO_GO +#endif + +#ifdef __x86_64__ +# define PICREG "%%rbx" +#else +# define PICREG "%%ebx" +#endif + +/* Flags */ +#define CFF_DEFAULT (1 << 0) +/* Processor features */ +#define CFEATURE_CMOV (15 + 0) +#define CFEATURE_MMX (23 + 0) +#define CFEATURE_SSE (25 + 0) +#define CFEATURE_SSE2 (26 + 0) +#define CFEATURE_SSSE3 ( 9 + 32) +#define CFEATURE_SSE4_1 (19 + 32) + +#define CFB(x) (1 << ((x)%32)) + +#define FEATURE_WORDS 2 + +/* ========================================================================= */ +/* data structure */ +struct test_cpu_feature +{ + int f_type; + int flags; + unsigned int features[FEATURE_WORDS]; +}; + +int ZLIB_INTERNAL _test_cpu_feature OF((const struct test_cpu_feature *t, unsigned int l)); +#endif
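Reviewer note (not part of the patch): as a reference while reading the SIMD paths above, here is a minimal scalar sketch of what they compute -- the Adler-32 running sums, the sse2_chop/mmx_chop partial reduction, and the branchless hash-slide step from update_hoffset_x86(). The names BASE, adler32_scalar, chop_partial and slide_one are local to this sketch and do not come from the patch; every vectorized variant is expected to produce the same result as adler32_scalar().

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define BASE 65521u /* largest prime smaller than 65536 */

/* Plain Adler-32; the psadbw/pmaddubsw/pmaddwd code above batches these
 * two additions over 8/16 bytes at a time and defers the modulo. */
static uint32_t adler32_scalar(uint32_t adler, const unsigned char *buf, size_t len)
{
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    size_t i;

    for (i = 0; i < len; i++) {
        s1 = (s1 + buf[i]) % BASE;
        s2 = (s2 + s1) % BASE;
    }
    return (s2 << 16) | s1;
}

/* Scalar equivalent of the sse2_chop/mmx_chop partial reduction:
 * 2^16 == 15 (mod 65521), so low + 15*high stays congruent to x while
 * shrinking it; it is not a full "% BASE", hence the repeated calls. */
static uint32_t chop_partial(uint32_t x)
{
    return (x & 0xffff) + 15u * (x >> 16);
}

/* Scalar equivalent of the sub/sbb trick in update_hoffset_x86():
 * pos >= wsize ? pos - wsize : 0, computed without a branch
 * (assumes wsize fits in 16 bits, as the fast slhash paths do). */
static uint16_t slide_one(uint16_t pos, uint32_t wsize)
{
    uint32_t m = pos;
    uint32_t borrow = (m < wsize) ? ~0u : 0u; /* the mask sbb produces */
    return (uint16_t)((m - wsize) & ~borrow);
}

int main(void)
{
    static const unsigned char msg[] = "Wikipedia";

    printf("%08x\n", adler32_scalar(1u, msg, sizeof msg - 1)); /* 11e60398 */
    printf("%u %u\n", chop_partial(0x12345), 0x12345 % BASE);  /* 9044 9044 */
    printf("%u\n", (unsigned)slide_one(0x8000, 0x7000));       /* 4096 */
    return 0;
}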