diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index a495a362..ffb33509 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -68,7 +68,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - run: sudo scripts/install_xorgxrdp_build_dependencies_with_apt.sh ${{ matrix.arch }} --allow-downgrades --allow-remove-essential --allow-change-held-packages
-      - run: git clone --depth 1 --branch=v0.10 https://github.com/neutrinolabs/xrdp.git ${{ github.workspace}}/xrdp
+      - run: git clone --depth 1 --branch=v0.10-h264 https://github.com/neutrinolabs/xrdp.git ${{ github.workspace}}/xrdp
       - run: ./bootstrap
       - run: ./configure ${{ matrix.CONF_FLAGS }}
       - run: make CFLAGS="$CFLAGS -O2 -Wall -Wwrite-strings -Werror"
diff --git a/module/amd64/Makefile.am b/module/amd64/Makefile.am
index cd2a0204..ed2d7c63 100644
--- a/module/amd64/Makefile.am
+++ b/module/amd64/Makefile.am
@@ -3,6 +3,8 @@ NAFLAGS += -DASM_ARCH_AMD64
 ASMSOURCES = \
   a8r8g8b8_to_a8b8g8r8_box_amd64_sse2.asm \
   a8r8g8b8_to_nv12_box_amd64_sse2.asm \
+  a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm \
+  a8r8g8b8_to_yuvalp_box_amd64_sse2.asm \
   cpuid_amd64.asm \
   i420_to_rgb32_amd64_sse2.asm \
   uyvy_to_rgb32_amd64_sse2.asm \
diff --git a/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm b/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm
new file mode 100644
index 00000000..c18e9d6a
--- /dev/null
+++ b/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm
@@ -0,0 +1,304 @@
+;
+;Copyright 2015 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;ARGB to NV12 709 full range
+;amd64 SSE2
+;
+; notes
+;   address s8 should be aligned on 16 bytes, will be slower if not
+;   width should be multiple of 8 and > 0
+;   height should be even and > 0
+
+%include "common.asm"
+
+PREPARE_RODATA
+    cd255  times 4 dd 255
+
+    cw255  times 8 dw 255
+    cw128  times 8 dw 128
+    cw54   times 8 dw 54
+    cw183  times 8 dw 183
+    cw18   times 8 dw 18
+    cw29   times 8 dw 29
+    cw99   times 8 dw 99
+    cw116  times 8 dw 116
+    cw12   times 8 dw 12
+    cw2    times 8 dw 2
+
+%define LS8            [rsp +   0] ; s8
+%define LSRC_STRIDE    [rsp +   8] ; src_stride
+%define LD8_Y          [rsp +  16] ; d8_y
+%define LDST_Y_STRIDE  [rsp +  24] ; dst_stride_y
+%define LD8_UV         [rsp +  32] ; d8_uv
+%define LDST_UV_STRIDE [rsp +  40] ; dst_stride_uv
+%define LU1            [rsp +  48] ; first line U, 8 bytes
+%define LV1            [rsp +  56] ; first line V, 8 bytes
+%define LU2            [rsp +  64] ; second line U, 8 bytes
+%define LV2            [rsp +  72] ; second line V, 8 bytes
+
+%define LWIDTH         [rsp + 104] ; width
+%define LHEIGHT        [rsp + 112] ; height
+
+;The first six integer or pointer arguments are passed in registers
+; RDI, RSI, RDX, RCX, R8, and R9; width and height are read from the stack
+;Converts 8x2 pixel groups to NV12: 16 Y bytes + 4 averaged U,V byte pairs
+;int
+;a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const uint8_t *s8, int src_stride,
+;                                      uint8_t *d8_y, int dst_stride_y,
+;                                      uint8_t *d8_uv, int dst_stride_uv,
+;                                      int width, int height);
+PROC a8r8g8b8_to_nv12_709fr_box_amd64_sse2
+    push rbx
+    push rbp
+    sub rsp, 80                ; local vars, 80 bytes
+
+    mov LS8, rdi               ; s8
+    mov LSRC_STRIDE, rsi       ; src_stride, NOTE(review): all 64 bits of the int args are stored and later added - confirm callers sign extend
+    mov LD8_Y, rdx             ; d8_y
+    mov LDST_Y_STRIDE, rcx     ; dst_stride_y
+    mov LD8_UV, r8             ; d8_uv
+    mov LDST_UV_STRIDE, r9     ; dst_stride_uv
+
+    pxor xmm7, xmm7            ; xmm7 = 0, high half source for packuswb
+
+    mov ebx, LHEIGHT           ; ebx = height
+    shr ebx, 1                 ; doing 2 lines at a time
+
+row_loop1:
+    mov rsi, LS8               ; s8
+    mov rdi, LD8_Y             ; d8_y
+    mov rdx, LD8_UV            ; d8_uv
+
+    mov ecx, LWIDTH            ; ecx = width
+    shr ecx, 3                 ; doing 8 pixels at a time
+
+loop1:
+    ; first line
+    movdqu xmm0, [rsi]         ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0          ; blue
+    pand xmm1, [lsym(cd255)]   ; blue
+    movdqa xmm2, xmm0          ; green
+    psrld xmm2, 8              ; green
+    pand xmm2, [lsym(cd255)]   ; green
+    movdqa xmm3, xmm0          ; red
+    psrld xmm3, 16             ; red
+    pand xmm3, [lsym(cd255)]   ; red
+
+    movdqu xmm0, [rsi + 16]    ; 4 pixels, 16 bytes
+    movdqa xmm4, xmm0          ; blue
+    pand xmm4, [lsym(cd255)]   ; blue
+    movdqa xmm5, xmm0          ; green
+    psrld xmm5, 8              ; green
+    pand xmm5, [lsym(cd255)]   ; green
+    movdqa xmm6, xmm0          ; red
+    psrld xmm6, 16             ; red
+    pand xmm6, [lsym(cd255)]   ; red
+
+    packssdw xmm1, xmm4        ; xmm1 = 8 blues
+    packssdw xmm2, xmm5        ; xmm2 = 8 greens
+    packssdw xmm3, xmm6        ; xmm3 = 8 reds
+
+    ; _Y = (( 54 * _R + 183 * _G +  18 * _B) >> 8);
+    movdqa xmm4, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm6, xmm3          ; red
+    pmullw xmm4, [lsym(cw18)]
+    pmullw xmm5, [lsym(cw183)]
+    pmullw xmm6, [lsym(cw54)]
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6
+    psrlw xmm4, 8              ; logical shift, the sum is unsigned
+    packuswb xmm4, xmm7
+    movq [rdi], xmm4           ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-29 * _R -  99 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm6, xmm3          ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw99)]
+    pmullw xmm6, [lsym(cw29)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8              ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LU1, xmm4             ; save for later
+
+    ; _V = ((128 * _R - 116 * _G -  12 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm4, xmm3          ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw116)]
+    pmullw xmm6, [lsym(cw12)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8              ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LV1, xmm4             ; save for later
+
+    ; go down to second line
+    add rsi, LSRC_STRIDE
+    add rdi, LDST_Y_STRIDE
+
+    ; second line
+    movdqu xmm0, [rsi]         ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0          ; blue
+    pand xmm1, [lsym(cd255)]   ; blue
+    movdqa xmm2, xmm0          ; green
+    psrld xmm2, 8              ; green
+    pand xmm2, [lsym(cd255)]   ; green
+    movdqa xmm3, xmm0          ; red
+    psrld xmm3, 16             ; red
+    pand xmm3, [lsym(cd255)]   ; red
+
+    movdqu xmm0, [rsi + 16]    ; 4 pixels, 16 bytes
+    movdqa xmm4, xmm0          ; blue
+    pand xmm4, [lsym(cd255)]   ; blue
+    movdqa xmm5, xmm0          ; green
+    psrld xmm5, 8              ; green
+    pand xmm5, [lsym(cd255)]   ; green
+    movdqa xmm6, xmm0          ; red
+    psrld xmm6, 16             ; red
+    pand xmm6, [lsym(cd255)]   ; red
+
+    packssdw xmm1, xmm4        ; xmm1 = 8 blues
+    packssdw xmm2, xmm5        ; xmm2 = 8 greens
+    packssdw xmm3, xmm6        ; xmm3 = 8 reds
+
+    ; _Y = (( 54 * _R + 183 * _G +  18 * _B) >> 8);
+    movdqa xmm4, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm6, xmm3          ; red
+    pmullw xmm4, [lsym(cw18)]
+    pmullw xmm5, [lsym(cw183)]
+    pmullw xmm6, [lsym(cw54)]
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6
+    psrlw xmm4, 8              ; logical shift, the sum is unsigned
+    packuswb xmm4, xmm7
+    movq [rdi], xmm4           ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-29 * _R -  99 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm6, xmm3          ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw99)]
+    pmullw xmm6, [lsym(cw29)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8              ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LU2, xmm4             ; save for later
+
+    ; _V = ((128 * _R - 116 * _G -  12 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm4, xmm3          ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw116)]
+    pmullw xmm6, [lsym(cw12)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8              ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LV2, xmm4             ; save for later
+
+    ; uv add and divide(average), SSE2 registers only so no emms is needed
+    movq xmm1, LU1             ; u from first line, upper 8 bytes zeroed
+    movdqa xmm3, xmm1
+    pand xmm1, [lsym(cw255)]   ; even numbered u bytes as words
+    psrlw xmm3, 8              ; odd numbered u bytes as words
+    pand xmm3, [lsym(cw255)]
+    paddw xmm1, xmm3           ; add
+    movq xmm2, LU2             ; u from second line
+    movdqa xmm3, xmm2
+    pand xmm2, [lsym(cw255)]
+    paddw xmm1, xmm2           ; add
+    psrlw xmm3, 8
+    pand xmm3, [lsym(cw255)]
+    paddw xmm1, xmm3           ; add
+    paddw xmm1, [lsym(cw2)]    ; add 2
+    psrlw xmm1, 2              ; div 4
+
+    movq xmm2, LV1             ; v from first line, upper 8 bytes zeroed
+    movdqa xmm4, xmm2
+    pand xmm2, [lsym(cw255)]   ; even numbered v bytes as words
+    psrlw xmm4, 8              ; odd numbered v bytes as words
+    pand xmm4, [lsym(cw255)]
+    paddw xmm2, xmm4           ; add
+    movq xmm3, LV2             ; v from second line
+    movdqa xmm4, xmm3
+    pand xmm3, [lsym(cw255)]
+    paddw xmm2, xmm3           ; add
+    psrlw xmm4, 8
+    pand xmm4, [lsym(cw255)]
+    paddw xmm2, xmm4           ; add
+    paddw xmm2, [lsym(cw2)]    ; add 2
+    psrlw xmm2, 2              ; div 4
+
+    packuswb xmm1, xmm1        ; low 4 bytes = 4 averaged u
+    packuswb xmm2, xmm2        ; low 4 bytes = 4 averaged v
+
+    punpcklbw xmm1, xmm2       ; uv
+    movq [rdx], xmm1           ; out 8 bytes uvuvuvuv
+
+    ; go up to first line
+    sub rsi, LSRC_STRIDE
+    sub rdi, LDST_Y_STRIDE
+
+    ; move right
+    lea rsi, [rsi + 32]
+    lea rdi, [rdi + 8]
+    lea rdx, [rdx + 8]
+
+    dec ecx
+    jnz loop1
+
+    ; update s8
+    mov rax, LS8               ; s8
+    add rax, LSRC_STRIDE       ; s8 += src_stride
+    add rax, LSRC_STRIDE       ; s8 += src_stride
+    mov LS8, rax
+
+    ; update d8_y
+    mov rax, LD8_Y             ; d8_y
+    add rax, LDST_Y_STRIDE     ; d8_y += dst_stride_y
+    add rax, LDST_Y_STRIDE     ; d8_y += dst_stride_y
+    mov LD8_Y, rax
+
+    ; update d8_uv
+    mov rax, LD8_UV            ; d8_uv
+    add rax, LDST_UV_STRIDE    ; d8_uv += dst_stride_uv
+    mov LD8_UV, rax
+
+    dec ebx
+    jnz row_loop1
+
+    mov rax, 0                 ; return value
+    add rsp, 80                ; local vars, 80 bytes
+    pop rbp
+    pop rbx
+    ret
+END_OF_FILE
diff --git a/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm b/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm
new file mode 100644
index 00000000..cfe9d6af
--- /dev/null
+++ b/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm
@@ -0,0 +1,178 @@
+;
+;Copyright 2024 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;ARGB to YUVALP
+;amd64 SSE2
+;
+; notes
+;   address s8 should be aligned on 16 bytes, will be slower if not
+;   width must be multiple of 8 and > 0
+;   height must be > 0
+
+%include "common.asm"
+
+PREPARE_RODATA
+    cd255  times 4 dd 255
+    cw128  times 8 dw 128
+    cw77   times 8 dw 77
+    cw150  times 8 dw 150
+    cw29   times 8 dw 29
+    cw43   times 8 dw 43
+    cw85   times 8 dw 85
+    cw107  times 8 dw 107
+    cw21   times 8 dw 21
+
+%define LS8            [rsp +   0] ; s8
+%define LSRC_STRIDE    [rsp +   8] ; src_stride
+%define LD8            [rsp +  16] ; d8
+%define LDST_STRIDE    [rsp +  24] ; dst_stride
+%define LWIDTH         [rsp +  32] ; width
+%define LHEIGHT        [rsp +  40] ; height
+
+;The first six integer or pointer arguments are passed in registers
+; RDI, RSI, RDX, RCX, R8, and R9
+;Splits each pixel into Y, U, V and A bytes stored 64 * 64 bytes apart
+;int
+;a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride,
+;                                  uint8_t *d8, int dst_stride,
+;                                  int width, int height);
+PROC a8r8g8b8_to_yuvalp_box_amd64_sse2
+    push rbx
+    push rbp
+    sub rsp, 48                 ; local vars, 48 bytes
+
+    mov LS8, rdi                ; s8
+    mov LSRC_STRIDE, rsi        ; src_stride
+    mov LD8, rdx                ; d8
+    mov LDST_STRIDE, rcx        ; dst_stride
+    mov LWIDTH, r8              ; width
+    mov LHEIGHT, r9             ; height
+
+    pxor xmm7, xmm7             ; xmm7 = 0, high half source for packuswb
+
+    mov ebx, LHEIGHT            ; ebx = height, one line per row_loop1 pass
+
+row_loop1:
+    mov rsi, LS8                ; s8
+    mov rdi, LD8                ; d8
+
+    mov ecx, LWIDTH             ; ecx = width
+    shr ecx, 3                  ; doing 8 pixels at a time, width < 8 gives 0 and the dec/jnz below then wraps
+
+loop1:
+    movdqu xmm0, [rsi]          ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0           ; blue
+    pand xmm1, [lsym(cd255)]    ; blue
+    movdqa xmm2, xmm0           ; green
+    psrld xmm2, 8               ; green
+    pand xmm2, [lsym(cd255)]    ; green
+    movdqa xmm3, xmm0           ; red
+    psrld xmm3, 16              ; red
+    pand xmm3, [lsym(cd255)]    ; red
+    movdqa xmm4, xmm0           ; alpha
+    psrld xmm4, 24              ; alpha
+    pand xmm4, [lsym(cd255)]    ; alpha
+
+    movdqu xmm0, [rsi + 16]     ; 4 pixels, 16 bytes
+    movdqa xmm5, xmm0           ; alpha
+    psrld xmm5, 24              ; alpha
+    pand xmm5, [lsym(cd255)]    ; alpha
+    packssdw xmm4, xmm5         ; xmm4 = 8 alphas
+    packuswb xmm4, xmm7         ; 8 alpha words to 8 bytes
+    movq [rdi + 3 * 64 * 64], xmm4  ; out 8 bytes aaaaaaaa
+    movdqa xmm4, xmm0           ; blue
+    pand xmm4, [lsym(cd255)]    ; blue
+    movdqa xmm5, xmm0           ; green
+    psrld xmm5, 8               ; green
+    pand xmm5, [lsym(cd255)]    ; green
+    movdqa xmm6, xmm0           ; red
+    psrld xmm6, 16              ; red
+    pand xmm6, [lsym(cd255)]    ; red
+
+    packssdw xmm1, xmm4         ; xmm1 = 8 blues
+    packssdw xmm2, xmm5         ; xmm2 = 8 greens
+    packssdw xmm3, xmm6         ; xmm3 = 8 reds
+
+    ; _Y = (77 * _R + 150 * _G + 29 * _B) >> 8;
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw29)]
+    pmullw xmm5, [lsym(cw150)]
+    pmullw xmm6, [lsym(cw77)]
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6
+    psrlw xmm4, 8               ; logical shift, the sum is unsigned
+    packuswb xmm4, xmm7
+    movq [rdi], xmm4            ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-43 * _R - 85 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw85)]
+    pmullw xmm6, [lsym(cw43)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8               ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq [rdi + 1 * 64 * 64], xmm4  ; out 8 bytes uuuuuuuu
+
+    ; _V = ((128 * _R - 107 * _G -  21 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm4, xmm3           ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw107)]
+    pmullw xmm6, [lsym(cw21)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8               ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq [rdi + 2 * 64 * 64], xmm4  ; out 8 bytes vvvvvvvv
+
+    ; move right
+    lea rsi, [rsi + 32]
+    lea rdi, [rdi + 8]
+
+    dec ecx
+    jnz loop1
+
+    ; update s8
+    mov rax, LS8                ; s8
+    add rax, LSRC_STRIDE        ; s8 += src_stride
+    mov LS8, rax
+
+    ; update d8
+    mov rax, LD8                ; d8
+    add rax, LDST_STRIDE        ; d8 += dst_stride
+    mov LD8, rax
+
+    dec ebx
+    jnz row_loop1
+
+    mov rax, 0                  ; return value
+    add rsp, 48                 ; local vars, 48 bytes
+    pop rbp
+    pop rbx
+    ret
+END_OF_FILE
diff --git a/module/amd64/funcs_amd64.h b/module/amd64/funcs_amd64.h
index ae38c53b..9d746fdc 100644
--- a/module/amd64/funcs_amd64.h
+++ b/module/amd64/funcs_amd64.h
@@ -43,6 +43,15 @@ a8r8g8b8_to_nv12_box_amd64_sse2(const uint8_t *s8, int src_stride,
                                 uint8_t *d8_y, int dst_stride_y,
                                 uint8_t *d8_uv, int dst_stride_uv,
                                 int width, int height);
+int
+a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const uint8_t *s8, int src_stride,
+                                      uint8_t *d8_y, int dst_stride_y,
+                                      uint8_t *d8_uv, int dst_stride_uv,
+                                      int width, int height);
+int
+a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride,
+                                  uint8_t *d8, int dst_stride,
+                                  int width, int height);
 
 #endif
 
diff --git a/module/rdp.h b/module/rdp.h
index 844db424..534934e3 100644
--- a/module/rdp.h
+++ b/module/rdp.h
@@ -282,6 +282,8 @@ struct _rdpRec
     CARD32 last_event_time_ms;
     CARD32 last_wheel_time_ms;
 
+    CARD32 msFrameInterval;
+
     int conNumber;
 
     struct _rdpCounts counts;
@@ -297,6 +299,8 @@ struct _rdpRec
 
     copy_box_proc a8r8g8b8_to_a8b8g8r8_box;
     copy_box_dst2_proc a8r8g8b8_to_nv12_box;
+    copy_box_dst2_proc a8r8g8b8_to_nv12_709fr_box;
+    copy_box_proc a8r8g8b8_to_yuvalp_box;
 
     /* multimon */
     struct monitor_info minfo[16]; /* client monitor data */
diff --git a/module/rdpCapture.c b/module/rdpCapture.c
index 334a4880..7591af47 100644
--- a/module/rdpCapture.c
+++ b/module/rdpCapture.c
@@ -124,24 +124,18 @@ rdpFillBox_yuvalp(int ax, int ay,
 /* 19595  38470   7471
   -11071 -21736  32807
    32756 -27429  -5327 */
-static int
-rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
-                              const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
-                              BoxPtr rects, int num_rects)
+int
+a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride,
+                       uint8_t *d8, int dst_stride,
+                       int width, int height)
 {
-    const uint8_t *s8;
-    uint8_t *d8;
     uint8_t *yptr;
     uint8_t *uptr;
     uint8_t *vptr;
     uint8_t *aptr;
     const uint32_t *s32;
-    int index;
     int jndex;
     int kndex;
-    int width;
-    int height;
     uint32_t pixel;
     uint8_t a;
     int r;
@@ -150,6 +144,51 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
     int y;
     int u;
     int v;
+
+    for (jndex = 0; jndex < height; jndex++)
+    {
+        s32 = (const uint32_t *) s8;
+        yptr = d8;
+        uptr = yptr + 64 * 64;
+        vptr = uptr + 64 * 64;
+        aptr = vptr + 64 * 64;
+        kndex = 0;
+        while (kndex < width)
+        {
+            pixel = *(s32++);
+            RGB_SPLIT(a, r, g, b, pixel);
+            y = (r *  19595 + g *  38470 + b *   7471) >> 16;
+            u = (r * -11071 + g * -21736 + b *  32807) >> 16;
+            v = (r *  32756 + g * -27429 + b *  -5327) >> 16;
+            u = u + 128;
+            v = v + 128;
+            y = RDPCLAMP(y, 0, UCHAR_MAX);
+            u = RDPCLAMP(u, 0, UCHAR_MAX);
+            v = RDPCLAMP(v, 0, UCHAR_MAX);
+            *(yptr++) = y;
+            *(uptr++) = u;
+            *(vptr++) = v;
+            *(aptr++) = a;
+            kndex++;
+        }
+        d8 += dst_stride;
+        s8 += src_stride;
+    }
+    return 0;
+}
+
+/******************************************************************************/
+static int
+rdpCopyBox_a8r8g8b8_to_yuvalp(rdpClientCon *clientCon, int ax, int ay,
+                              const uint8_t *src, int src_stride,
+                              uint8_t *dst, int dst_stride,
+                              BoxPtr rects, int num_rects)
+{
+    const uint8_t *s8;
+    uint8_t *d8;
+    int index;
+    int width;
+    int height;
     BoxPtr box;
 
     dst = dst + (ay << 8) * (dst_stride >> 8) + (ax << 8);
@@ -162,35 +201,9 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
         d8 += box->x1 - ax;
         width = box->x2 - box->x1;
         height = box->y2 - box->y1;
-        for (jndex = 0; jndex < height; jndex++)
-        {
-            s32 = (const uint32_t *) s8;
-            yptr = d8;
-            uptr = yptr + 64 * 64;
-            vptr = uptr + 64 * 64;
-            aptr = vptr + 64 * 64;
-            kndex = 0;
-            while (kndex < width)
-            {
-                pixel = *(s32++);
-                RGB_SPLIT(a, r, g, b, pixel);
-                y = (r *  19595 + g *  38470 + b *   7471) >> 16;
-                u = (r * -11071 + g * -21736 + b *  32807) >> 16;
-                v = (r *  32756 + g * -27429 + b *  -5327) >> 16;
-                u = u + 128;
-                v = v + 128;
-                y = RDPCLAMP(y, 0, UCHAR_MAX);
-                u = RDPCLAMP(u, 0, UCHAR_MAX);
-                v = RDPCLAMP(v, 0, UCHAR_MAX);
-                *(yptr++) = y;
-                *(uptr++) = u;
-                *(vptr++) = v;
-                *(aptr++) = a;
-                kndex++;
-            }
-            d8 += 64;
-            s8 += src_stride;
-        }
+        clientCon->dev->a8r8g8b8_to_yuvalp_box(s8, src_stride,
+                                               d8, 64,
+                                               width, height);
     }
     return 0;
 }
@@ -540,6 +553,103 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride,
     return 0;
 }
 
+/******************************************************************************/
+/* Portable C conversion of a8r8g8b8 pixels to NV12 with the "709fr"
+ * coefficients shown below (709 full range going by the function name).
+ * Every inner loop pass handles a 2x2 pixel group: four clamped Y bytes
+ * are written to two adjacent rows of d8_y and one U,V byte pair, the
+ * rounded average of the four clamped chroma samples, goes to d8_uv.
+ * Row jndex + 1 and the second pixel of each pair are always accessed,
+ * so callers are expected to pass an even width and height.
+ * The alpha byte of each source pixel is ignored.
+ * pixel is unsigned so the channel shifts never act on a negative
+ * value.  Always returns 0. */
+int
+a8r8g8b8_to_nv12_709fr_box(const uint8_t *s8, int src_stride,
+                           uint8_t *d8_y, int dst_stride_y,
+                           uint8_t *d8_uv, int dst_stride_uv,
+                           int width, int height)
+{
+    int index;
+    int jndex;
+    int R;
+    int G;
+    int B;
+    int Y;
+    int U;
+    int V;
+    int U_sum;
+    int V_sum;
+    uint32_t pixel;
+    const uint32_t *s32a;
+    const uint32_t *s32b;
+    uint8_t *d8ya;
+    uint8_t *d8yb;
+    uint8_t *d8uv;
+
+    for (jndex = 0; jndex < height; jndex += 2)
+    {
+        s32a = (const uint32_t *) (s8 + src_stride * jndex);
+        s32b = (const uint32_t *) (s8 + src_stride * (jndex + 1));
+        d8ya = d8_y + dst_stride_y * jndex;
+        d8yb = d8_y + dst_stride_y * (jndex + 1);
+        d8uv = d8_uv + dst_stride_uv * (jndex / 2);
+        for (index = 0; index < width; index += 2)
+        {
+            U_sum = 0;
+            V_sum = 0;
+            /* first row, left pixel */
+            pixel = *(s32a++);
+            R = (pixel >> 16) & 0xff;
+            G = (pixel >>  8) & 0xff;
+            B = (pixel >>  0) & 0xff;
+            Y =  ( 54 * R + 183 * G +  18 * B) >> 8;
+            U = ((-29 * R -  99 * G + 128 * B) >> 8) + 128;
+            V = ((128 * R - 116 * G -  12 * B) >> 8) + 128;
+            *(d8ya++) = RDPCLAMP(Y, 0, 255);
+            U_sum += RDPCLAMP(U, 0, 255);
+            V_sum += RDPCLAMP(V, 0, 255);
+            /* first row, right pixel */
+            pixel = *(s32a++);
+            R = (pixel >> 16) & 0xff;
+            G = (pixel >>  8) & 0xff;
+            B = (pixel >>  0) & 0xff;
+            Y =  ( 54 * R + 183 * G +  18 * B) >> 8;
+            U = ((-29 * R -  99 * G + 128 * B) >> 8) + 128;
+            V = ((128 * R - 116 * G -  12 * B) >> 8) + 128;
+            *(d8ya++) = RDPCLAMP(Y, 0, 255);
+            U_sum += RDPCLAMP(U, 0, 255);
+            V_sum += RDPCLAMP(V, 0, 255);
+            /* second row, left pixel */
+            pixel = *(s32b++);
+            R = (pixel >> 16) & 0xff;
+            G = (pixel >>  8) & 0xff;
+            B = (pixel >>  0) & 0xff;
+            Y =  ( 54 * R + 183 * G +  18 * B) >> 8;
+            U = ((-29 * R -  99 * G + 128 * B) >> 8) + 128;
+            V = ((128 * R - 116 * G -  12 * B) >> 8) + 128;
+            *(d8yb++) = RDPCLAMP(Y, 0, 255);
+            U_sum += RDPCLAMP(U, 0, 255);
+            V_sum += RDPCLAMP(V, 0, 255);
+            /* second row, right pixel */
+            pixel = *(s32b++);
+            R = (pixel >> 16) & 0xff;
+            G = (pixel >>  8) & 0xff;
+            B = (pixel >>  0) & 0xff;
+            Y =  ( 54 * R + 183 * G +  18 * B) >> 8;
+            U = ((-29 * R -  99 * G + 128 * B) >> 8) + 128;
+            V = ((128 * R - 116 * G -  12 * B) >> 8) + 128;
+            *(d8yb++) = RDPCLAMP(Y, 0, 255);
+            U_sum += RDPCLAMP(U, 0, 255);
+            V_sum += RDPCLAMP(V, 0, 255);
+            /* rounded mean of the four chroma samples, stored U then V */
+            *(d8uv++) = (U_sum + 2) / 4;
+            *(d8uv++) = (V_sum + 2) / 4;
+        }
+    }
+    return 0;
+}
+
 /******************************************************************************/
 /* copy rects with no error checking */
 static int
@@ -577,6 +687,44 @@ rdpCopyBox_a8r8g8b8_to_nv12(rdpClientCon *clientCon,
     return 0;
 }
 
+/******************************************************************************/
+/* Convert every rect in rects from a8r8g8b8 to NV12 by handing it to the
+ * device's a8r8g8b8_to_nv12_709fr_box hook.  srcx, srcy and dstx, dsty
+ * are subtracted from the box coordinates to find the start of each rect
+ * in src and in dst_y / dst_uv.  No error checking; always returns 0. */
+static int
+rdpCopyBox_a8r8g8b8_to_nv12_709fr(rdpClientCon *clientCon,
+                                  const uint8_t *src, int src_stride,
+                                  int srcx, int srcy,
+                                  uint8_t *dst_y, int dst_stride_y,
+                                  uint8_t *dst_uv, int dst_stride_uv,
+                                  int dstx, int dsty,
+                                  BoxPtr rects, int num_rects)
+{
+    BoxPtr box;
+    int index;
+    int rel_x;
+    int rel_y;
+    int width;
+    int height;
+
+    for (index = 0; index < num_rects; index++)
+    {
+        box = rects + index;
+        rel_x = box->x1 - dstx;
+        rel_y = box->y1 - dsty;
+        width = box->x2 - box->x1;
+        height = box->y2 - box->y1;
+        clientCon->dev->a8r8g8b8_to_nv12_709fr_box(
+            src + (box->y1 - srcy) * src_stride + (box->x1 - srcx) * 4,
+            src_stride,
+            dst_y + rel_y * dst_stride_y + rel_x * 1, dst_stride_y,
+            dst_uv + (rel_y / 2) * dst_stride_uv + rel_x * 1, dst_stride_uv,
+            width, height);
+    }
+    return 0;
+}
+
 /******************************************************************************/
 static Bool
 isShmStatusActive(enum shared_memory_status status) {
@@ -610,8 +758,8 @@ wyhash_rfx_tile(const uint8_t *src, int src_stride, int x, int y, uint64_t seed)
 
 /******************************************************************************/
 static Bool
-rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
-            int *num_out_rects, struct image_data *id)
+rdpCaptureSimple(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
+                 int *num_out_rects, struct image_data *id)
 {
     BoxPtr psrc_rects;
     BoxRec rect;
@@ -624,10 +772,10 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
     int dst_stride;
     int dst_format;
 
-    LLOGLN(10, ("rdpCapture0:"));
+    LLOGLN(10, ("rdpCaptureSimple:"));
 
     if (!isShmStatusActive(clientCon->shmemstatus)) {
-        LLOGLN(0, ("rdpCapture0: WARNING -- Shared memory is not configured."
+        LLOGLN(0, ("rdpCaptureSimple: WARNING -- Shared memory is not configured."
                    " Aborting capture!"));
         return FALSE;
     }
@@ -695,7 +843,7 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
     }
     else
     {
-        LLOGLN(0, ("rdpCapture0: unimplemented color conversion"));
+        LLOGLN(0, ("rdpCaptureSimple: unimplemented color conversion"));
     }
     return rv;
 }
@@ -703,8 +851,8 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
 /******************************************************************************/
 /* make out_rects always multiple of 16 width and height */
 static Bool
-rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
-            int *num_out_rects, struct image_data *id)
+rdpCaptureSufA16(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
+                 int *num_out_rects, struct image_data *id)
 {
     BoxPtr psrc_rects;
     BoxRec rect;
@@ -735,10 +883,10 @@ rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
     int dst_stride;
     int dst_format;
 
-    LLOGLN(10, ("rdpCapture1:"));
+    LLOGLN(10, ("rdpCaptureSufA16:"));
 
     if (!isShmStatusActive(clientCon->shmemstatus)) {
-        LLOGLN(0, ("rdpCapture1: WARNING -- Shared memory is not configured."
+        LLOGLN(0, ("rdpCaptureSufA16: WARNING -- Shared memory is not configured."
                " Aborting capture!"));
         return FALSE;
     }
@@ -844,15 +992,15 @@ rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
     }
     else
     {
-        LLOGLN(0, ("rdpCapture1: unimplemented color conversion"));
+        LLOGLN(0, ("rdpCaptureSufA16: unimplemented color conversion"));
     }
     return rv;
 }
 
 /******************************************************************************/
 static Bool
-rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
-            int *num_out_rects, struct image_data *id)
+rdpCaptureGfxPro(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
+                 int *num_out_rects, struct image_data *id)
 {
     int x;
     int y;
@@ -874,11 +1022,11 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
     int num_crcs;
     int mon_index;
 
-    LLOGLN(10, ("rdpCapture2:"));
+    LLOGLN(10, ("rdpCaptureGfxPro:"));
 
     if (!isShmStatusActive(clientCon->shmemstatus))
     {
-        LLOGLN(0, ("rdpCapture2: WARNING -- Shared memory is not configured"
+        LLOGLN(0, ("rdpCaptureGfxPro: WARNING -- Shared memory is not configured"
                    " for RFX. Aborting capture!"));
         return FALSE;
     }
@@ -904,7 +1052,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
     num_crcs = crc_stride * ((id->height + 63) / 64);
     if (num_crcs != clientCon->num_rfx_crcs_alloc[mon_index])
     {
-        LLOGLN(0, ("rdpCapture2: resize the crc list was %d now %d",
+        LLOGLN(0, ("rdpCaptureGfxPro: resize the crc list was %d now %d",
                clientCon->num_rfx_crcs_alloc[mon_index], num_crcs));
         /* resize the crc list */
         clientCon->num_rfx_crcs_alloc[mon_index] = num_crcs;
@@ -924,11 +1072,11 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
             rect.x2 = rect.x1 + XRDP_RFX_ALIGN;
             rect.y2 = rect.y1 + XRDP_RFX_ALIGN;
             rcode = rdpRegionContainsRect(in_reg, &rect);
-            LLOGLN(10, ("rdpCapture2: rcode %d", rcode));
+            LLOGLN(10, ("rdpCaptureGfxPro: rcode %d", rcode));
 
             if (rcode == rgnOUT)
             {
-                LLOGLN(10, ("rdpCapture2: rgnOUT"));
+                LLOGLN(10, ("rdpCaptureGfxPro: rgnOUT"));
                 rdpRegionInit(&tile_reg, &rect, 0);
                 rdpRegionSubtract(in_reg, in_reg, &tile_reg);
                 rdpRegionUninit(&tile_reg);
@@ -939,14 +1087,14 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
                 crc = WYHASH_SEED;
                 if (rcode == rgnPART)
                 {
-                    LLOGLN(10, ("rdpCapture2: rgnPART"));
+                    LLOGLN(10, ("rdpCaptureGfxPro: rgnPART"));
                     rdpFillBox_yuvalp(x, y, dst, dst_stride);
                     rdpRegionInit(&tile_reg, &rect, 0);
                     rdpRegionIntersect(&tile_reg, in_reg, &tile_reg);
                     rects = REGION_RECTS(&tile_reg);
                     num_rects = REGION_NUM_RECTS(&tile_reg);
                     crc = wyhash((const void*)rects, num_rects * sizeof(BoxRec), crc, _wyp);
-                    rdpCopyBox_a8r8g8b8_to_yuvalp(x, y,
+                    rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y,
                                                   src, src_stride,
                                                   dst, dst_stride,
                                                   rects, num_rects);
@@ -956,16 +1104,16 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
                 }
                 else /* rgnIN */
                 {
-                    LLOGLN(10, ("rdpCapture2: rgnIN"));
+                    LLOGLN(10, ("rdpCaptureGfxPro: rgnIN"));
                     crc = wyhash_rfx_tile(src, src_stride, x, y, crc);
                 }
                 crc_offset = (y / XRDP_RFX_ALIGN) * crc_stride
                              + (x / XRDP_RFX_ALIGN);
-                LLOGLN(10, ("rdpCapture2: crc 0x%" PRIx64 " 0x%" PRIx64,
+                LLOGLN(10, ("rdpCaptureGfxPro: crc 0x%" PRIx64 " 0x%" PRIx64,
                        crc, clientCon->rfx_crcs[mon_index][crc_offset]));
                 if (crc == clientCon->rfx_crcs[mon_index][crc_offset])
                 {
-                    LLOGLN(10, ("rdpCapture2: crc skip at x %d y %d", x, y));
+                    LLOGLN(10, ("rdpCaptureGfxPro: crc skip at x %d y %d", x, y));
                     rdpRegionInit(&tile_reg, &rect, 0);
                     rdpRegionSubtract(in_reg, in_reg, &tile_reg);
                     rdpRegionUninit(&tile_reg);
@@ -975,7 +1123,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
                     /* lazily only do this if hash wasn't identical */
                     if (rcode != rgnPART)
                     {
-                        rdpCopyBox_a8r8g8b8_to_yuvalp(x, y,
+                        rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y,
                                 src, src_stride,
                                 dst, dst_stride,
                                 &rect, 1);
@@ -1002,8 +1150,8 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
 /******************************************************************************/
 /* make out_rects always multiple of 2 width and height */
 static Bool
-rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
-            int *num_out_rects, struct image_data *id)
+rdpCaptureSufA2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
+                int *num_out_rects, struct image_data *id)
 {
     BoxPtr psrc_rects;
     BoxRec rect;
@@ -1017,11 +1165,11 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
     int dst_stride;
     int dst_format;
 
-    LLOGLN(10, ("rdpCapture3:"));
+    LLOGLN(10, ("rdpCaptureSufA2:"));
 
     if (!isShmStatusActive(clientCon->shmemstatus))
     {
-        LLOGLN(0, ("rdpCapture3: WARNING -- Shared memory is not configured."
+        LLOGLN(0, ("rdpCaptureSufA2: WARNING -- Shared memory is not configured."
                " Aborting capture!"));
         return FALSE;
     }
@@ -1081,7 +1229,100 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
     }
     else
     {
-        LLOGLN(0, ("rdpCapture3: unimplemented color conversion"));
+        LLOGLN(0, ("rdpCaptureSufA2: unimplemented color conversion"));
+    }
+
+    return rv;
+}
+
+/******************************************************************************/
+/* make out_rects always multiple of 2 width and height */
+static Bool
+rdpCaptureGfxA2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
+                int *num_out_rects, struct image_data *id)
+{
+    BoxPtr psrc_rects;
+    BoxRec rect;
+    int num_rects;
+    int index;
+    uint8_t *dst_uv;
+    Bool rv;
+    const uint8_t *src;
+    uint8_t *dst;
+    int src_stride;
+    int dst_stride;
+    int dst_format;
+
+    LLOGLN(10, ("rdpCaptureGfxA2:"));
+
+    if (!isShmStatusActive(clientCon->shmemstatus))
+    {
+        LLOGLN(0, ("rdpCaptureGfxA2: WARNING -- Shared memory is not configured."
+               " Aborting capture!"));
+        return FALSE;
+    }
+
+    rv = TRUE;
+
+    rdpRegionTranslate(in_reg, -id->left, -id->top);
+
+    num_rects = REGION_NUM_RECTS(in_reg);
+    psrc_rects = REGION_RECTS(in_reg);
+
+    if (num_rects < 1)
+    {
+        return FALSE;
+    }
+
+    *num_out_rects = num_rects;
+
+    *out_rects = g_new(BoxRec, num_rects * 4);
+    index = 0;
+    while (index < num_rects)
+    {
+        rect = psrc_rects[index];
+        LLOGLN(10, ("old x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.y1,
+               rect.x2, rect.y2));
+        rect.x1 -= rect.x1 & 1;
+        rect.y1 -= rect.y1 & 1;
+        rect.x2 += rect.x2 & 1;
+        rect.y2 += rect.y2 & 1;
+        if (rect.x2 > id->width)
+        {
+            rect.x2 = id->width & ~1;
+        }
+        if (rect.y2 > id->height)
+        {
+            rect.y2 = id->height & ~1;
+        }
+        LLOGLN(10, ("new x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.y1,
+               rect.x2, rect.y2));
+        (*out_rects)[index] = rect;
+        index++;
+    }
+
+    src = id->pixels;
+    dst = id->shmem_pixels;
+    dst_format = clientCon->rdp_format;
+    src_stride = id->lineBytes;
+    dst_stride = id->width;
+
+    src = src + src_stride * id->top + id->left * 4;
+
+    if (dst_format == XRDP_nv12_709fr)
+    {
+        dst_uv = dst;
+        dst_uv += id->width * id->height;
+        rdpCopyBox_a8r8g8b8_to_nv12_709fr(clientCon,
+                                          src, src_stride, 0, 0,
+                                          dst, dst_stride,
+                                          dst_uv, dst_stride,
+                                          0, 0,
+                                          *out_rects, num_rects);
+    }
+    else
+    {
+        LLOGLN(0, ("rdpCaptureGfxA2: unimplemented color conversion"));
     }
 
     return rv;
@@ -1153,7 +1394,7 @@ Bool
 rdpCapture(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
            int *num_out_rects, struct image_data *id)
 {
-    int mode;
+    enum xrdp_capture_code mode;
 
     LLOGLN(10, ("rdpCapture:"));
     mode = clientCon->client_info.capture_code;
@@ -1170,18 +1411,20 @@ rdpCapture(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
     }
     switch (mode)
     {
-        case 0:
-            return rdpCapture0(clientCon, in_reg, out_rects, num_out_rects, id);
-        case 1:
-            return rdpCapture1(clientCon, in_reg, out_rects, num_out_rects, id);
-        case 2:
-        case 4:
-            /* used for remotefx capture */
-            return rdpCapture2(clientCon, in_reg, out_rects, num_out_rects, id);
-        case 3:
-        case 5:
+        case CC_SIMPLE:
+            return rdpCaptureSimple(clientCon, in_reg, out_rects, num_out_rects, id);
+        case CC_SUF_A16:
+            return rdpCaptureSufA16(clientCon, in_reg, out_rects, num_out_rects, id);
+        case CC_SUF_RFX: /* surface command RFX */
+            /* FALLTHROUGH */
+        case CC_GFX_PRO: /* GFX progressive */
+            return rdpCaptureGfxPro(clientCon, in_reg, out_rects, num_out_rects, id);
+        case CC_SUF_A2: /* surface command h264 */
+            /* used for even align capture */
+            return rdpCaptureSufA2(clientCon, in_reg, out_rects, num_out_rects, id);
+        case CC_GFX_A2: /* GFX h264 */
             /* used for even align capture */
-            return rdpCapture3(clientCon, in_reg, out_rects, num_out_rects, id);
+            return rdpCaptureGfxA2(clientCon, in_reg, out_rects, num_out_rects, id);
         default:
             LLOGLN(0, ("rdpCapture: mode %d not implemented", mode));
             break;
diff --git a/module/rdpCapture.h b/module/rdpCapture.h
index 72a9336e..7e38508e 100644
--- a/module/rdpCapture.h
+++ b/module/rdpCapture.h
@@ -48,5 +48,14 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride,
                      uint8_t *d8_y, int dst_stride_y,
                      uint8_t *d8_uv, int dst_stride_uv,
                      int width, int height);
+extern _X_EXPORT int
+a8r8g8b8_to_nv12_709fr_box(const uint8_t *s8, int src_stride,
+                           uint8_t *d8_y, int dst_stride_y,
+                           uint8_t *d8_uv, int dst_stride_uv,
+                           int width, int height);
+extern _X_EXPORT int
+a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride,
+                       uint8_t *d8, int dst_stride,
+                       int width, int height);
 
 #endif
diff --git a/module/rdpClientCon.c b/module/rdpClientCon.c
index a3366fa5..b98db64b 100644
--- a/module/rdpClientCon.c
+++ b/module/rdpClientCon.c
@@ -776,47 +776,57 @@ rdpClientConResizeAllMemoryAreas(rdpPtr dev, rdpClientCon *clientCon)
     clientCon->rdp_height = height;
 
     /* Set the capture parameters */
-    if ((clientCon->client_info.capture_code == 2) || /* RFX */
-        (clientCon->client_info.capture_code == 4))
+    switch(clientCon->client_info.capture_code)
     {
-        LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got RFX capture"));
-        /* RFX capture needs fixed-size rectangles */
-        clientCon->cap_width = RDPALIGN(width, XRDP_RFX_ALIGN);
-        clientCon->cap_height = RDPALIGN(height, XRDP_RFX_ALIGN);
-        LLOGLN(0, ("  cap_width %d cap_height %d",
-               clientCon->cap_width, clientCon->cap_height));
+        case CC_SUF_RFX: /* RFX */
+        case CC_GFX_PRO:
+            LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got RFX capture"));
+            /* RFX capture needs fixed-size rectangles */
+            clientCon->cap_width = RDPALIGN(width, XRDP_RFX_ALIGN);
+            clientCon->cap_height = RDPALIGN(height, XRDP_RFX_ALIGN);
+            LLOGLN(0, ("  cap_width %d cap_height %d",
+                   clientCon->cap_width, clientCon->cap_height));
 
-        bytes = clientCon->cap_width * clientCon->cap_height *
-                clientCon->rdp_Bpp;
+            bytes = clientCon->cap_width * clientCon->cap_height *
+                    clientCon->rdp_Bpp;
 
-        clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width;
-        clientCon->cap_stride_bytes = clientCon->cap_width * 4;
-        shmemstatus = SHM_RFX_ACTIVE_PENDING;
-    }
-    else if ((clientCon->client_info.capture_code == 3) || /* H264 */
-             (clientCon->client_info.capture_code == 5))
-    {
-        LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got H264 capture"));
-        clientCon->cap_width = width;
-        clientCon->cap_height = height;
+            clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width;
+            clientCon->cap_stride_bytes = clientCon->cap_width * 4;
+            shmemstatus = SHM_RFX_ACTIVE_PENDING;
 
-        bytes = clientCon->cap_width * clientCon->cap_height * 2;
+            dev->msFrameInterval = clientCon->client_info.rfx_frame_interval;
+            break;
+        case CC_SUF_A2: /* H264 */
+        case CC_GFX_A2:
+            LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got H264 capture"));
+            clientCon->cap_width = width;
+            clientCon->cap_height = height;
 
-        clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width;
-        clientCon->cap_stride_bytes = clientCon->cap_width * 4;
-        shmemstatus = SHM_H264_ACTIVE_PENDING;
-    }
-    else
-    {
-        clientCon->cap_width = width;
-        clientCon->cap_height = height;
+            bytes = clientCon->cap_width * clientCon->cap_height * 2;
+
+            clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width;
+            clientCon->cap_stride_bytes = clientCon->cap_width * 4;
+            shmemstatus = SHM_H264_ACTIVE_PENDING;
 
-        bytes = width * height * clientCon->rdp_Bpp;
+            dev->msFrameInterval = clientCon->client_info.h264_frame_interval;
+            break;
+        default:
+            /* no alignment padding here, capture size is the screen size */
+            LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got normal capture"));
+            clientCon->cap_width = width;
+            clientCon->cap_height = height;
 
-        clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width;
-        clientCon->cap_stride_bytes = clientCon->cap_width * clientCon->rdp_Bpp;
-        shmemstatus = SHM_ACTIVE_PENDING;
+            bytes = width * height * clientCon->rdp_Bpp;
+
+            clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width;
+            clientCon->cap_stride_bytes = clientCon->cap_width * clientCon->rdp_Bpp;
+            shmemstatus = SHM_ACTIVE_PENDING;
+
+            dev->msFrameInterval = clientCon->client_info.normal_frame_interval;
+            break;
     }
+
+    LLOGLN(0, ("    msFrameInterval %ld", (long)dev->msFrameInterval));
     rdpClientConAllocateSharedMemory(clientCon, bytes);
 
     if (clientCon->client_info.capture_format != 0)
@@ -1011,12 +1021,12 @@ rdpSendMemoryAllocationComplete(rdpPtr dev, rdpClientCon *clientCon)
 
     switch (clientCon->client_info.capture_code)
     {
-        case 2:
-        case 4:
+        case CC_SUF_RFX:
+        case CC_GFX_PRO:
             alignment = XRDP_RFX_ALIGN;
             break;
-        case 3:
-        case 5:
+        case CC_SUF_A2:
+        case CC_GFX_A2:
             alignment = XRDP_H264_ALIGN;
             break;
         default:
@@ -2539,7 +2549,7 @@ rdpClientConScheduleDeferredUpdate(rdpPtr dev)
     {
         dev->sendUpdateScheduled = TRUE;
         dev->sendUpdateTimer =
-                TimerSet(dev->sendUpdateTimer, 0, 40,
+                TimerSet(dev->sendUpdateTimer, 0, dev->msFrameInterval,
                          rdpClientConDeferredUpdateCallback, dev);
     }
 }
@@ -2608,7 +2618,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon,
     int num_rects_d;
     int num_rects_c;
     struct stream *s;
-    int capture_code;
+    enum xrdp_capture_code capture_code;
     int start_frame_bytes;
     int wiretosurface1_bytes;
     int wiretosurface2_bytes;
@@ -2625,6 +2635,8 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon,
            id->flags, id->left, id->top, id->width, id->height));
 
     capture_code = clientCon->client_info.capture_code;
+    LLOGLN(10, ("rdpClientConSendPaintRectShmFd: capture_code %d",
+           capture_code));
 
     num_rects_d = REGION_NUM_RECTS(dirtyReg);
     num_rects_c = numCopyRects;
@@ -2636,7 +2648,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon,
 
     rdpClientConBeginUpdate(dev, clientCon);
 
-    if (capture_code < 4)
+    if (capture_code < CC_GFX_PRO)
     {
         /* non gfx */
         size = 2 + 2 + 2 + num_rects_d * 8 + 2 + num_rects_c * 8;
@@ -2656,15 +2668,15 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon,
         out_uint32_le(s, clientCon->rect_id);
         out_uint32_le(s, id->shmem_bytes);
         out_uint32_le(s, id->shmem_offset);
-		if (capture_code == 2) /* rfx */
-		{
+        if (capture_code == CC_SUF_RFX) /* rfx */
+        {
             out_uint16_le(s, id->left);
             out_uint16_le(s, id->top);
             out_uint16_le(s, id->width);
             out_uint16_le(s, id->height);
-		}
-		else
-		{
+        }
+        else
+        {
             out_uint16_le(s, 0);
             out_uint16_le(s, 0);
             out_uint16_le(s, clientCon->cap_width);
@@ -2673,7 +2685,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon,
         rdpClientConSendPending(clientCon->dev, clientCon);
         g_sck_send_fd_set(clientCon->sck, "int", 4, &(id->shmem_fd), 1);
     }
-    else if (capture_code == 4) /* gfx pro rfx */
+    else if (capture_code == CC_GFX_PRO) /* gfx pro rfx */
     {
         start_frame_bytes = 8 + 8;
         wiretosurface2_bytes = 8 + 13 +
@@ -2745,7 +2757,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon,
             out_uint32_le(s, 0);                /* shmem_bytes */
         }
     }
-    else if (capture_code == 5) /* gfx h264 */
+    else if (capture_code == CC_GFX_A2) /* gfx h264 */
     {
         start_frame_bytes = 8 + 8;
         wiretosurface1_bytes = 8 + 9 +
@@ -2979,7 +2991,6 @@ rdpDeferredUpdateCallback(OsTimerPtr timer, CARD32 now, pointer arg)
 
 
 /******************************************************************************/
-#define MIN_MS_BETWEEN_FRAMES 40
 #define MIN_MS_TO_WAIT_FOR_MORE_UPDATES 4
 #define UPDATE_RETRY_TIMEOUT 200 // After this number of retries, give up and perform the capture anyway. This prevents an infinite loop.
 static void
@@ -2998,7 +3009,7 @@ rdpScheduleDeferredUpdate(rdpClientCon *clientCon)
        for more changes before sending an update. Always waiting the longer
        delay would introduce unnecessarily much latency. */
     msToWait = MIN_MS_TO_WAIT_FOR_MORE_UPDATES;
-    minNextUpdateTime = clientCon->lastUpdateTime + MIN_MS_BETWEEN_FRAMES;
+    minNextUpdateTime = clientCon->lastUpdateTime + clientCon->dev->msFrameInterval;
     /* the first check is to gracefully handle the infrequent case of
        the time wrapping around */
     if(clientCon->lastUpdateTime < curTime &&
diff --git a/module/rdpClientCon.h b/module/rdpClientCon.h
index 5ff1de21..b4c443cf 100644
--- a/module/rdpClientCon.h
+++ b/module/rdpClientCon.h
@@ -120,6 +120,8 @@ struct _rdpClientCon
     int updateScheduled; /* boolean */
     int updateRetries;
 
+    CARD32 msFrameInterval;
+
     RegionPtr dirtyRegion;
 
     int num_rfx_crcs_alloc[16];
diff --git a/module/rdpSimd.c b/module/rdpSimd.c
index 49a3653e..59feb9da 100644
--- a/module/rdpSimd.c
+++ b/module/rdpSimd.c
@@ -62,6 +62,261 @@ int g_simd_use_accel = 1;
 #define LLOGLN(_level, _args) \
     do { if (_level < LOG_LEVEL) { ErrorF _args ; ErrorF("\n"); } } while (0)
 
+#if SIMD_USE_ACCEL
+
+#if defined(__x86_64__) || defined(__AMD64__) || defined (_M_AMD64)
+/******************************************************************************/
+/* nv12 box conversion, amd64: the SSE2 routine converts the columns that
+   form whole groups of 8 pixels, the C routine converts the remainder */
+static int
+a8r8g8b8_to_nv12_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride,
+                                     uint8_t *d8_y, int dst_stride_y,
+                                     uint8_t *d8_uv, int dst_stride_uv,
+                                     int width, int height)
+{
+    int simd_width;
+    int tail_width;
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0; /* nothing to convert */
+    }
+
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_nv12_box_amd64_sse2(s8, src_stride,
+                                             d8_y, dst_stride_y,
+                                             d8_uv, dst_stride_uv,
+                                             simd_width, height);
+    }
+    /* a failure of the SSE2 part is returned without running the C part */
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* 4 source bytes per pixel; the Y and UV pointers both advance
+           one byte per pixel column */
+        rv = a8r8g8b8_to_nv12_box(s8 + simd_width * 4, src_stride,
+                                  d8_y + simd_width, dst_stride_y,
+                                  d8_uv + simd_width, dst_stride_uv,
+                                  tail_width, height);
+    }
+    return rv;
+}
+
+/******************************************************************************/
+/* nv12 709 full range box conversion, amd64
+   the SSE2 routine converts the columns that form whole groups of 8
+   pixels, the C routine converts the remaining columns
+   returns 0 on success, else the first non zero code from a converter */
+static int
+a8r8g8b8_to_nv12_709fr_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride,
+                                           uint8_t *d8_y, int dst_stride_y,
+                                           uint8_t *d8_uv, int dst_stride_uv,
+                                           int width, int height)
+{
+    int simd_width;
+    int tail_width;
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0; /* nothing to convert */
+    }
+
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_nv12_709fr_box_amd64_sse2(s8, src_stride,
+                                                   d8_y, dst_stride_y,
+                                                   d8_uv, dst_stride_uv,
+                                                   simd_width, height);
+    }
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* only reached when the SSE2 part succeeded or was skipped;
+           4 source bytes per pixel, the Y and UV pointers both advance
+           one byte per pixel column */
+        rv = a8r8g8b8_to_nv12_709fr_box(s8 + simd_width * 4, src_stride,
+                                        d8_y + simd_width, dst_stride_y,
+                                        d8_uv + simd_width, dst_stride_uv,
+                                        tail_width, height);
+    }
+
+    return rv;
+}
+
+/*****************************************************************************/
+/* yuvalp box conversion, amd64: SSE2 for whole groups of 8 pixels, C
+   routine for the remaining columns; static like the nv12 wrappers, it
+   is installed through dev->a8r8g8b8_to_yuvalp_box in rdpSimdInit */
+static int
+a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride,
+                                       uint8_t *d8, int dst_stride,
+                                       int width, int height)
+{
+    int simd_width;
+    int tail_width;
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0; /* nothing to convert */
+    }
+
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_yuvalp_box_amd64_sse2(s8, src_stride,
+                                               d8, dst_stride,
+                                               simd_width, height);
+    }
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* 4 source bytes per pixel, d8 advances one byte per pixel */
+        rv = a8r8g8b8_to_yuvalp_box(s8 + simd_width * 4, src_stride,
+                                    d8 + simd_width, dst_stride,
+                                    tail_width, height);
+    }
+
+    return rv;
+}
+#endif
+
+#if defined(__x86__) || defined(_M_IX86) || defined(__i386__)
+/******************************************************************************/
+/* nv12 box conversion, x86: the SSE2 routine converts the columns that
+   form whole groups of 8 pixels, the C routine converts the remainder */
+static int
+a8r8g8b8_to_nv12_box_x86_sse2_wrap(const uint8_t *s8, int src_stride,
+                                   uint8_t *d8_y, int dst_stride_y,
+                                   uint8_t *d8_uv, int dst_stride_uv,
+                                   int width, int height)
+{
+    int simd_width;
+    int tail_width;
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0; /* nothing to convert */
+    }
+
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_nv12_box_x86_sse2(s8, src_stride,
+                                           d8_y, dst_stride_y,
+                                           d8_uv, dst_stride_uv,
+                                           simd_width, height);
+    }
+    /* a failure of the SSE2 part is returned without running the C part */
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* 4 source bytes per pixel; the Y and UV pointers both advance
+           one byte per pixel column */
+        rv = a8r8g8b8_to_nv12_box(s8 + simd_width * 4, src_stride,
+                                  d8_y + simd_width, dst_stride_y,
+                                  d8_uv + simd_width, dst_stride_uv,
+                                  tail_width, height);
+    }
+    return rv;
+}
+
+/******************************************************************************/
+/* nv12 709 full range box conversion, x86
+   the SSE2 routine converts the columns that form whole groups of 8
+   pixels, the C routine converts the remaining columns
+   returns 0 on success, else the first non zero code from a converter */
+static int
+a8r8g8b8_to_nv12_709fr_box_x86_sse2_wrap(const uint8_t *s8, int src_stride,
+                                         uint8_t *d8_y, int dst_stride_y,
+                                         uint8_t *d8_uv, int dst_stride_uv,
+                                         int width, int height)
+{
+    int simd_width;
+    int tail_width;
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0; /* nothing to convert */
+    }
+
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_nv12_709fr_box_x86_sse2(s8, src_stride,
+                                                 d8_y, dst_stride_y,
+                                                 d8_uv, dst_stride_uv,
+                                                 simd_width, height);
+    }
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* only reached when the SSE2 part succeeded or was skipped;
+           4 source bytes per pixel, the Y and UV pointers both advance
+           one byte per pixel column */
+        rv = a8r8g8b8_to_nv12_709fr_box(s8 + simd_width * 4, src_stride,
+                                        d8_y + simd_width, dst_stride_y,
+                                        d8_uv + simd_width, dst_stride_uv,
+                                        tail_width, height);
+    }
+    return rv;
+}
+
+/*****************************************************************************/
+/* yuvalp box conversion, x86: SSE2 for whole groups of 8 pixels, C
+   routine for the remaining columns; static like the nv12 wrappers, it
+   is installed through dev->a8r8g8b8_to_yuvalp_box in rdpSimdInit */
+static int
+a8r8g8b8_to_yuvalp_box_x86_sse2_wrap(const uint8_t *s8, int src_stride,
+                                     uint8_t *d8, int dst_stride,
+                                     int width, int height)
+{
+    int simd_width;
+    int tail_width;
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0; /* nothing to convert */
+    }
+
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_yuvalp_box_x86_sse2(s8, src_stride,
+                                             d8, dst_stride,
+                                             simd_width, height);
+    }
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* 4 source bytes per pixel, d8 advances one byte per pixel */
+        rv = a8r8g8b8_to_yuvalp_box(s8 + simd_width * 4, src_stride,
+                                    d8 + simd_width, dst_stride,
+                                    tail_width, height);
+    }
+
+    return rv;
+}
+#endif
+
+#endif
+
 /*****************************************************************************/
 Bool
 rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
@@ -77,6 +332,8 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
     dev->uyvy_to_rgb32 = UYVY_to_RGB32;
     dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box;
     dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box;
+    dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box;
+    dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box;
 #if SIMD_USE_ACCEL
     if (g_simd_use_accel)
     {
@@ -92,7 +349,9 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
             dev->yuy2_to_rgb32 = yuy2_to_rgb32_amd64_sse2;
             dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2;
             dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_amd64_sse2;
-            dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2;
+            dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2_wrap;
+            dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box_amd64_sse2_wrap;
+            dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap;
             LLOGLN(0, ("rdpSimdInit: sse2 amd64 yuv functions assigned"));
         }
 #elif defined(__x86__) || defined(_M_IX86) || defined(__i386__)
@@ -107,7 +366,9 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
             dev->yuy2_to_rgb32 = yuy2_to_rgb32_x86_sse2;
             dev->uyvy_to_rgb32 = uyvy_to_rgb32_x86_sse2;
             dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_x86_sse2;
-            dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2;
+            dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2_wrap;
+            dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box_x86_sse2_wrap;
+            dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_x86_sse2_wrap;
             LLOGLN(0, ("rdpSimdInit: sse2 x86 yuv functions assigned"));
         }
 #endif
diff --git a/module/x86/Makefile.am b/module/x86/Makefile.am
index ed106863..92acda61 100644
--- a/module/x86/Makefile.am
+++ b/module/x86/Makefile.am
@@ -3,6 +3,8 @@ NAFLAGS += -DASM_ARCH_I386
 ASMSOURCES = \
   a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm \
   a8r8g8b8_to_nv12_box_x86_sse2.asm \
+  a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm \
+  a8r8g8b8_to_yuvalp_box_x86_sse2.asm \
   cpuid_x86.asm \
   i420_to_rgb32_x86_sse2.asm \
   uyvy_to_rgb32_x86_sse2.asm \
diff --git a/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm b/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm
new file mode 100644
index 00000000..262f1af3
--- /dev/null
+++ b/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm
@@ -0,0 +1,300 @@
+;
+;Copyright 2015 Jay Sorg
+;Copyright 2017 mirabilos
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;ARGB to NV12 709 full range
+;x86 SSE2
+;
+; notes
+;   address s8 should be aligned on 16 bytes, will be slower if not
+;   width should be multiple of 8 and > 0
+;   height should be even and > 0
+
+%include "common.asm"
+
+PREPARE_RODATA
+    cd255  times 4 dd 255      ; dword mask: keeps the low byte of each 32-bit lane
+
+    cw255  times 8 dw 255      ; word mask: keeps the low byte of each 16-bit lane
+    cw128  times 8 dw 128      ; U blue / V red coefficient, also the +128 chroma bias
+    cw54   times 8 dw 54       ; Y red coefficient
+    cw183  times 8 dw 183      ; Y green coefficient
+    cw18   times 8 dw 18       ; Y blue coefficient
+    cw29   times 8 dw 29       ; U red coefficient (subtracted)
+    cw99   times 8 dw 99       ; U green coefficient (subtracted)
+    cw116  times 8 dw 116      ; V green coefficient (subtracted)
+    cw12   times 8 dw 12       ; V blue coefficient (subtracted)
+    cw2    times 8 dw 2        ; rounding term added before the chroma divide by 4
+
+%define LU1            [esp +  0] ; first line U, 8 bytes
+%define LV1            [esp +  8] ; first line V, 8 bytes
+%define LU2            [esp + 16] ; second line U, 8 bytes
+%define LV2            [esp + 24] ; second line V, 8 bytes
+
+%define LS8            [esp + 52] ; s8
+%define LSRC_STRIDE    [esp + 56] ; src_stride
+%define LD8_Y          [esp + 60] ; d8_y
+%define LDST_Y_STRIDE  [esp + 64] ; dst_stride_y
+%define LD8_UV         [esp + 68] ; d8_uv
+%define LDST_UV_STRIDE [esp + 72] ; dst_stride_uv
+%define LWIDTH         [esp + 76] ; width
+%define LHEIGHT        [esp + 80] ; height
+
+;int (always returns 0)
+;a8r8g8b8_to_nv12_709fr_box_x86_sse2(const uint8_t *s8, int src_stride,
+;                                    uint8_t *d8_y, int dst_stride_y,
+;                                    uint8_t *d8_uv, int dst_stride_uv,
+;                                    int width, int height);
+PROC a8r8g8b8_to_nv12_709fr_box_x86_sse2
+    push ebx
+    RETRIEVE_RODATA
+    push esi
+    push edi
+    push ebp
+    sub esp, 32                ; local vars, 32 bytes
+
+    pxor xmm7, xmm7            ; xmm7 = 0, zero operand for packuswb
+
+    mov ebp, LHEIGHT           ; ebp = height
+    shr ebp, 1                 ; doing 2 lines at a time
+
+row_loop1:
+    mov esi, LS8               ; s8
+    mov edi, LD8_Y             ; d8_y
+    mov edx, LD8_UV            ; d8_uv
+
+    mov ecx, LWIDTH            ; ecx = width
+    shr ecx, 3                 ; doing 8 pixels at a time
+
+loop1:
+    ; first line
+    movdqu xmm0, [esi]         ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0          ; blue
+    pand xmm1, [lsym(cd255)]   ; blue
+    movdqa xmm2, xmm0          ; green
+    psrld xmm2, 8              ; green
+    pand xmm2, [lsym(cd255)]   ; green
+    movdqa xmm3, xmm0          ; red
+    psrld xmm3, 16             ; red
+    pand xmm3, [lsym(cd255)]   ; red
+
+    movdqu xmm0, [esi + 16]    ; 4 pixels, 16 bytes
+    movdqa xmm4, xmm0          ; blue
+    pand xmm4, [lsym(cd255)]   ; blue
+    movdqa xmm5, xmm0          ; green
+    psrld xmm5, 8              ; green
+    pand xmm5, [lsym(cd255)]   ; green
+    movdqa xmm6, xmm0          ; red
+    psrld xmm6, 16             ; red
+    pand xmm6, [lsym(cd255)]   ; red
+
+    packssdw xmm1, xmm4        ; xmm1 = 8 blues
+    packssdw xmm2, xmm5        ; xmm2 = 8 greens
+    packssdw xmm3, xmm6        ; xmm3 = 8 reds
+
+    ; _Y = ( 54 * _R + 183 * _G +  18 * _B) >> 8;
+    movdqa xmm4, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm6, xmm3          ; red
+    pmullw xmm4, [lsym(cw18)]
+    pmullw xmm5, [lsym(cw183)]
+    pmullw xmm6, [lsym(cw54)]
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6
+    psrlw xmm4, 8              ; logical shift, the sum is unsigned
+    packuswb xmm4, xmm7        ; 8 words -> 8 bytes
+    movq [edi], xmm4           ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-29 * _R -  99 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm6, xmm3          ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw99)]
+    pmullw xmm6, [lsym(cw29)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8              ; arithmetic shift, the difference is signed
+    paddw xmm4, [lsym(cw128)]  ; + 128 bias
+    packuswb xmm4, xmm7        ; 8 words -> 8 bytes
+    movq LU1, xmm4             ; save for later
+
+    ; _V = ((128 * _R - 116 * _G -  12 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm4, xmm3          ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw116)]
+    pmullw xmm6, [lsym(cw12)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8              ; arithmetic shift, the difference is signed
+    paddw xmm4, [lsym(cw128)]  ; + 128 bias
+    packuswb xmm4, xmm7        ; 8 words -> 8 bytes
+    movq LV1, xmm4             ; save for later
+
+    ; go down to second line
+    add esi, LSRC_STRIDE
+    add edi, LDST_Y_STRIDE
+
+    ; second line
+    movdqu xmm0, [esi]         ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0          ; blue
+    pand xmm1, [lsym(cd255)]   ; blue
+    movdqa xmm2, xmm0          ; green
+    psrld xmm2, 8              ; green
+    pand xmm2, [lsym(cd255)]   ; green
+    movdqa xmm3, xmm0          ; red
+    psrld xmm3, 16             ; red
+    pand xmm3, [lsym(cd255)]   ; red
+
+    movdqu xmm0, [esi + 16]    ; 4 pixels, 16 bytes
+    movdqa xmm4, xmm0          ; blue
+    pand xmm4, [lsym(cd255)]   ; blue
+    movdqa xmm5, xmm0          ; green
+    psrld xmm5, 8              ; green
+    pand xmm5, [lsym(cd255)]   ; green
+    movdqa xmm6, xmm0          ; red
+    psrld xmm6, 16             ; red
+    pand xmm6, [lsym(cd255)]   ; red
+
+    packssdw xmm1, xmm4        ; xmm1 = 8 blues
+    packssdw xmm2, xmm5        ; xmm2 = 8 greens
+    packssdw xmm3, xmm6        ; xmm3 = 8 reds
+
+    ; _Y = ( 54 * _R + 183 * _G +  18 * _B) >> 8;
+    movdqa xmm4, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm6, xmm3          ; red
+    pmullw xmm4, [lsym(cw18)]
+    pmullw xmm5, [lsym(cw183)]
+    pmullw xmm6, [lsym(cw54)]
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6
+    psrlw xmm4, 8              ; logical shift, the sum is unsigned
+    packuswb xmm4, xmm7        ; 8 words -> 8 bytes
+    movq [edi], xmm4           ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-29 * _R -  99 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm6, xmm3          ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw99)]
+    pmullw xmm6, [lsym(cw29)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8              ; arithmetic shift, the difference is signed
+    paddw xmm4, [lsym(cw128)]  ; + 128 bias
+    packuswb xmm4, xmm7        ; 8 words -> 8 bytes
+    movq LU2, xmm4             ; save for later
+
+    ; _V = ((128 * _R - 116 * _G -  12 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1          ; blue
+    movdqa xmm5, xmm2          ; green
+    movdqa xmm4, xmm3          ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw116)]
+    pmullw xmm6, [lsym(cw12)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8              ; arithmetic shift, the difference is signed
+    paddw xmm4, [lsym(cw128)]  ; + 128 bias
+    packuswb xmm4, xmm7        ; 8 words -> 8 bytes
+    movq LV2, xmm4             ; save for later
+
+    ; uv add and divide(average), each 2x2 group of samples becomes one
+    movq mm1, LU1              ; u from first line
+    movq mm3, mm1
+    pand mm1, [lsym(cw255)]    ; even columns
+    psrlw mm3, 8               ; odd columns
+    pand mm3, [lsym(cw255)]
+    paddw mm1, mm3             ; add
+    movq mm2, LU2              ; u from second line
+    movq mm3, mm2
+    pand mm2, [lsym(cw255)]    ; even columns
+    paddw mm1, mm2             ; add
+    psrlw mm3, 8               ; odd columns
+    pand mm3, [lsym(cw255)]
+    paddw mm1, mm3             ; add
+    paddw mm1, [lsym(cw2)]     ; add 2
+    psrlw mm1, 2               ; div 4
+
+    movq mm2, LV1              ; v from first line
+    movq mm4, mm2
+    pand mm2, [lsym(cw255)]    ; even columns
+    psrlw mm4, 8               ; odd columns
+    pand mm4, [lsym(cw255)]
+    paddw mm2, mm4             ; add
+    movq mm3, LV2              ; v from second line
+    movq mm4, mm3
+    pand mm3, [lsym(cw255)]    ; even columns
+    paddw mm2, mm3             ; add
+    psrlw mm4, 8               ; odd columns
+    pand mm4, [lsym(cw255)]
+    paddw mm2, mm4             ; add
+    paddw mm2, [lsym(cw2)]     ; add 2
+    psrlw mm2, 2               ; div 4
+
+    packuswb mm1, mm1          ; 4 u words -> bytes
+    packuswb mm2, mm2          ; 4 v words -> bytes
+
+    punpcklbw mm1, mm2         ; uv
+    movq [edx], mm1            ; out 8 bytes uvuvuvuv
+
+    ; go up to first line
+    sub esi, LSRC_STRIDE
+    sub edi, LDST_Y_STRIDE
+
+    ; move right
+    lea esi, [esi + 32]
+    lea edi, [edi + 8]
+    lea edx, [edx + 8]
+
+    dec ecx
+    jnz loop1
+
+    ; update s8
+    mov eax, LS8               ; s8
+    add eax, LSRC_STRIDE       ; s8 += src_stride
+    add eax, LSRC_STRIDE       ; s8 += src_stride
+    mov LS8, eax
+
+    ; update d8_y
+    mov eax, LD8_Y             ; d8_y
+    add eax, LDST_Y_STRIDE     ; d8_y += dst_stride_y
+    add eax, LDST_Y_STRIDE     ; d8_y += dst_stride_y
+    mov LD8_Y, eax
+
+    ; update d8_uv
+    mov eax, LD8_UV            ; d8_uv
+    add eax, LDST_UV_STRIDE    ; d8_uv += dst_stride_uv
+    mov LD8_UV, eax
+
+    dec ebp
+    jnz row_loop1
+    emms                       ; mm regs were used above, reset x87 state for the caller
+    mov eax, 0                 ; return value
+    add esp, 32                ; local vars, 32 bytes
+    pop ebp
+    pop edi
+    pop esi
+    pop ebx
+    ret
+END_OF_FILE
diff --git a/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm b/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm
new file mode 100644
index 00000000..cec02043
--- /dev/null
+++ b/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm
@@ -0,0 +1,171 @@
+;
+;Copyright 2024 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;ARGB to YUVALP
+;x86 SSE2
+;
+; notes
+;   address s8 should be aligned on 16 bytes, will be slower if not
+;   width must be multiple of 8 and > 0
+;   height must be > 0
+
+%include "common.asm"
+
+PREPARE_RODATA
+    cd255  times 4 dd 255       ; dword mask: keeps the low byte of each 32-bit lane
+    cw128  times 8 dw 128       ; U blue / V red coefficient, also the +128 chroma bias
+    cw77   times 8 dw 77        ; Y red coefficient
+    cw150  times 8 dw 150       ; Y green coefficient
+    cw29   times 8 dw 29        ; Y blue coefficient
+    cw43   times 8 dw 43        ; U red coefficient (subtracted)
+    cw85   times 8 dw 85        ; U green coefficient (subtracted)
+    cw107  times 8 dw 107       ; V green coefficient (subtracted)
+    cw21   times 8 dw 21        ; V blue coefficient (subtracted)
+
+%define LS8            [esp + 20]   ; s8
+%define LSRC_STRIDE    [esp + 24]   ; src_stride
+%define LD8            [esp + 28]   ; d8
+%define LDST_STRIDE    [esp + 32]   ; dst_stride
+%define LWIDTH         [esp + 36]   ; width
+%define LHEIGHT        [esp + 40]   ; height
+
+;int (always returns 0)
+;a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride,
+;                                uint8_t *d8, int dst_stride,
+;                                int width, int height);
+PROC a8r8g8b8_to_yuvalp_box_x86_sse2
+    push ebx
+    RETRIEVE_RODATA
+    push esi
+    push edi
+    push ebp
+
+    pxor xmm7, xmm7             ; xmm7 = 0, zero operand for packuswb
+
+    mov ebp, LHEIGHT            ; ebp = height, one line per outer iteration
+
+row_loop1:
+    mov esi, LS8                ; s8
+    mov edi, LD8                ; d8
+
+    mov ecx, LWIDTH             ; ecx = width
+    shr ecx, 3                  ; doing 8 pixels at a time
+
+loop1:
+    movdqu xmm0, [esi]          ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0           ; blue
+    pand xmm1, [lsym(cd255)]    ; blue
+    movdqa xmm2, xmm0           ; green
+    psrld xmm2, 8               ; green
+    pand xmm2, [lsym(cd255)]    ; green
+    movdqa xmm3, xmm0           ; red
+    psrld xmm3, 16              ; red
+    pand xmm3, [lsym(cd255)]    ; red
+    movdqa xmm4, xmm0           ; alpha
+    psrld xmm4, 24              ; alpha
+    pand xmm4, [lsym(cd255)]    ; alpha (mask changes nothing after the 24 bit shift)
+
+    movdqu xmm0, [esi + 16]     ; 4 pixels, 16 bytes
+    movdqa xmm5, xmm0           ; alpha
+    psrld xmm5, 24              ; alpha
+    pand xmm5, [lsym(cd255)]    ; alpha (mask changes nothing after the 24 bit shift)
+    packssdw xmm4, xmm5         ; xmm4 = 8 alphas
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes
+    movq [edi + 3 * 64 * 64], xmm4  ; out 8 bytes aaaaaaaa, 4th plane (planes are 64 * 64 bytes apart)
+    movdqa xmm4, xmm0           ; blue (xmm4/xmm5 are free again once alpha is stored)
+    pand xmm4, [lsym(cd255)]    ; blue
+    movdqa xmm5, xmm0           ; green
+    psrld xmm5, 8               ; green
+    pand xmm5, [lsym(cd255)]    ; green
+    movdqa xmm6, xmm0           ; red
+    psrld xmm6, 16              ; red
+    pand xmm6, [lsym(cd255)]    ; red
+
+    packssdw xmm1, xmm4         ; xmm1 = 8 blues
+    packssdw xmm2, xmm5         ; xmm2 = 8 greens
+    packssdw xmm3, xmm6         ; xmm3 = 8 reds
+
+    ; _Y = (77 * _R + 150 * _G +  29 * _B) >> 8;
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw29)]
+    pmullw xmm5, [lsym(cw150)]
+    pmullw xmm6, [lsym(cw77)]
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6
+    psrlw xmm4, 8               ; logical shift, the sum is unsigned
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes
+    movq [edi], xmm4            ; out 8 bytes yyyyyyyy, 1st plane
+
+    ; _U = ((-43 * _R -  85 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw85)]
+    pmullw xmm6, [lsym(cw43)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8               ; arithmetic shift, the difference is signed
+    paddw xmm4, [lsym(cw128)]   ; + 128 bias
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes
+    movq [edi + 1 * 64 * 64], xmm4  ; out 8 bytes uuuuuuuu, 2nd plane
+
+    ; _V = ((128 * _R - 107 * _G -  21 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm4, xmm3           ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw107)]
+    pmullw xmm6, [lsym(cw21)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8               ; arithmetic shift, the difference is signed
+    paddw xmm4, [lsym(cw128)]   ; + 128 bias
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes
+    movq [edi + 2 * 64 * 64], xmm4  ; out 8 bytes vvvvvvvv, 3rd plane
+
+    ; move right: 8 source pixels (32 bytes), 8 bytes in every output plane
+    lea esi, [esi + 32]
+    lea edi, [edi + 8]
+
+    dec ecx
+    jnz loop1
+
+    ; update s8
+    mov eax, LS8                ; s8
+    add eax, LSRC_STRIDE        ; s8 += src_stride
+    mov LS8, eax
+
+    ; update d8
+    mov eax, LD8                ; d8
+    add eax, LDST_STRIDE        ; d8 += dst_stride
+    mov LD8, eax
+
+    dec ebp
+    jnz row_loop1
+
+    mov eax, 0                  ; return value
+    pop ebp
+    pop edi
+    pop esi
+    pop ebx
+    ret
+END_OF_FILE
diff --git a/module/x86/funcs_x86.h b/module/x86/funcs_x86.h
index c70cc8cf..a08834f8 100644
--- a/module/x86/funcs_x86.h
+++ b/module/x86/funcs_x86.h
@@ -43,6 +43,15 @@ a8r8g8b8_to_nv12_box_x86_sse2(const uint8_t *s8, int src_stride,
                               uint8_t *d8_y, int dst_stride_y,
                               uint8_t *d8_uv, int dst_stride_uv,
                               int width, int height);
+int
+a8r8g8b8_to_nv12_709fr_box_x86_sse2(const uint8_t *s8, int src_stride,
+                                    uint8_t *d8_y, int dst_stride_y,
+                                    uint8_t *d8_uv, int dst_stride_uv,
+                                    int width, int height);
+int
+a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride,
+                                uint8_t *d8, int dst_stride,
+                                int width, int height);
 
 #endif