diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a495a362..ffb33509 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -68,7 +68,7 @@ jobs: steps: - uses: actions/checkout@v3 - run: sudo scripts/install_xorgxrdp_build_dependencies_with_apt.sh ${{ matrix.arch }} --allow-downgrades --allow-remove-essential --allow-change-held-packages - - run: git clone --depth 1 --branch=v0.10 https://github.com/neutrinolabs/xrdp.git ${{ github.workspace}}/xrdp + - run: git clone --depth 1 --branch=v0.10-h264 https://github.com/neutrinolabs/xrdp.git ${{ github.workspace}}/xrdp - run: ./bootstrap - run: ./configure ${{ matrix.CONF_FLAGS }} - run: make CFLAGS="$CFLAGS -O2 -Wall -Wwrite-strings -Werror" diff --git a/module/amd64/Makefile.am b/module/amd64/Makefile.am index cd2a0204..ed2d7c63 100644 --- a/module/amd64/Makefile.am +++ b/module/amd64/Makefile.am @@ -3,6 +3,8 @@ NAFLAGS += -DASM_ARCH_AMD64 ASMSOURCES = \ a8r8g8b8_to_a8b8g8r8_box_amd64_sse2.asm \ a8r8g8b8_to_nv12_box_amd64_sse2.asm \ + a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm \ + a8r8g8b8_to_yuvalp_box_amd64_sse2.asm \ cpuid_amd64.asm \ i420_to_rgb32_amd64_sse2.asm \ uyvy_to_rgb32_amd64_sse2.asm \ diff --git a/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm b/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm new file mode 100644 index 00000000..c18e9d6a --- /dev/null +++ b/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm @@ -0,0 +1,304 @@ +; +;Copyright 2015 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;ARGB to NV12 709 full range +;amd64 SSE2 +; +; notes +; address s8 should be aligned on 16 bytes, will be slower if not +; width should be multiple of 8 and > 0 +; height should be even and > 0 + +%include "common.asm" + +PREPARE_RODATA + cd255 times 4 dd 255 + + cw255 times 8 dw 255 + cw128 times 8 dw 128 + cw54 times 8 dw 54 + cw183 times 8 dw 183 + cw18 times 8 dw 18 + cw29 times 8 dw 29 + cw99 times 8 dw 99 + cw116 times 8 dw 116 + cw12 times 8 dw 12 + cw2 times 8 dw 2 + +%define LS8 [rsp + 0] ; s8 +%define LSRC_STRIDE [rsp + 8] ; src_stride +%define LD8_Y [rsp + 16] ; d8_y +%define LDST_Y_STRIDE [rsp + 24] ; dst_stride_y +%define LD8_UV [rsp + 32] ; d8_uv +%define LDST_UV_STRIDE [rsp + 40] ; dst_stride_uv +%define LU1 [rsp + 48] ; first line U, 8 bytes +%define LV1 [rsp + 56] ; first line V, 8 bytes +%define LU2 [rsp + 64] ; second line U, 8 bytes +%define LV2 [rsp + 72] ; second line V, 8 bytes + +%define LWIDTH [rsp + 104] ; width +%define LHEIGHT [rsp + 112] ; height + +;The first six integer or pointer arguments are passed in registers +; RDI, RSI, RDX, RCX, R8, and R9 + +;int +;a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const char *s8, int src_stride, +; char *d8_y, int dst_stride_y, +; char *d8_uv, int dst_stride_uv, +; int width, int height); +PROC a8r8g8b8_to_nv12_709fr_box_amd64_sse2 + push rbx + push rbp + sub rsp, 80 ; local vars, 80 bytes + + mov LS8, rdi ; s8 + mov LSRC_STRIDE, rsi ; src_stride + mov LD8_Y, rdx ; d8_y + mov LDST_Y_STRIDE, rcx ; dst_stride_y + mov LD8_UV, r8 ; d8_uv + mov LDST_UV_STRIDE, r9 ; dst_stride_uv + + pxor xmm7, xmm7 + + mov ebx, LHEIGHT ; ebx = height + shr ebx, 1 ; doing 2 lines at a time + +row_loop1: + mov rsi, LS8 ; s8 + mov rdi, LD8_Y ; d8_y + mov rdx, LD8_UV ; d8_uv + + mov ecx, LWIDTH ; ecx = width + shr ecx, 3 ; doing 8 pixels at a time + +loop1: + ; first line + movdqu xmm0, [rsi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + + movdqu xmm0, [rsi + 16] ; 4 pixels, 16 bytes + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = (( 54 * _R + 183 * _G + 18 * _B) >> 8); + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw18)] + pmullw xmm5, [lsym(cw183)] + pmullw xmm6, [lsym(cw54)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [rdi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-29 * _R - 99 * _G + 128 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw99)] + pmullw xmm6, [lsym(cw29)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LU1, xmm4 ; save for later + + ; _V = ((128 * _R - 116 * _G - 12 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw116)] + pmullw xmm6, [lsym(cw12)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LV1, xmm4 ; save for later + + ; go down to second line + add rsi, LSRC_STRIDE + add rdi, LDST_Y_STRIDE + + ; second line + movdqu xmm0, [rsi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + + movdqu xmm0, [rsi + 16] ; 4 pixels, 16 bytes + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = (( 54 * _R + 183 * _G + 18 * _B) >> 8); + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw18)] + pmullw xmm5, [lsym(cw183)] + pmullw xmm6, [lsym(cw54)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [rdi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-29 * _R - 99 * _G + 128 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw99)] + pmullw xmm6, [lsym(cw29)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LU2, xmm4 ; save for later + + ; _V = ((128 * _R - 116 * _G - 12 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw116)] + pmullw xmm6, [lsym(cw12)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LV2, xmm4 ; save for later + + ; uv add and divide(average) + movq mm1, LU1 ; u from first line + movq mm3, mm1 + pand mm1, [lsym(cw255)] + psrlw mm3, 8 + pand mm3, [lsym(cw255)] + paddw mm1, mm3 ; add + movq mm2, LU2 ; u from second line + movq mm3, mm2 + pand mm2, [lsym(cw255)] + paddw mm1, mm2 ; add + psrlw mm3, 8 + pand mm3, [lsym(cw255)] + paddw mm1, mm3 ; add + paddw mm1, [lsym(cw2)] ; add 2 + psrlw mm1, 2 ; div 4 + + movq mm2, LV1 ; v from first line + movq mm4, mm2 + pand mm2, [lsym(cw255)] + psrlw mm4, 8 + pand mm4, [lsym(cw255)] + paddw mm2, mm4 ; add + movq mm3, LV2 ; v from second line + movq mm4, mm3 + pand mm3, [lsym(cw255)] + paddw mm2, mm3 ; add + psrlw mm4, 8 + pand mm4, [lsym(cw255)] + paddw mm2, mm4 ; add + paddw mm2, [lsym(cw2)] ; add 2 + psrlw mm2, 2 ; div 4 + + packuswb mm1, mm1 + packuswb mm2, mm2 + + punpcklbw mm1, mm2 ; uv + movq [rdx], mm1 ; out 8 bytes uvuvuvuv + + ; go up to first line + sub rsi, LSRC_STRIDE + sub rdi, LDST_Y_STRIDE + + ; move right + lea rsi, [rsi + 32] + lea rdi, [rdi + 8] + lea rdx, [rdx + 8] + + dec ecx + jnz loop1 + + ; update s8 + mov rax, LS8 ; s8 + add rax, LSRC_STRIDE ; s8 += src_stride + add rax, LSRC_STRIDE ; s8 += src_stride + mov LS8, rax + + ; update d8_y + mov rax, LD8_Y ; d8_y + add rax, LDST_Y_STRIDE ; d8_y += dst_stride_y + add rax, LDST_Y_STRIDE ; d8_y += dst_stride_y + mov LD8_Y, rax + + ; update d8_uv + mov rax, LD8_UV ; d8_uv + add rax, LDST_UV_STRIDE ; d8_uv += dst_stride_uv + mov LD8_UV, rax + + dec ebx + jnz row_loop1 + + mov rax, 0 ; return value + add rsp, 80 ; local vars, 80 bytes + pop rbp + pop rbx + ret +END_OF_FILE diff --git a/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm b/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm new file mode 100644 index 00000000..cfe9d6af --- /dev/null +++ b/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm @@ -0,0 +1,178 @@ +; +;Copyright 2024 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;ARGB to YUVALP +;amd64 SSE2 +; +; notes +; address s8 should be aligned on 16 bytes, will be slower if not +; width must be multiple of 8 and > 0 +; height must be > 0 + +%include "common.asm" + +PREPARE_RODATA + cd255 times 4 dd 255 + cw128 times 8 dw 128 + cw77 times 8 dw 77 + cw150 times 8 dw 150 + cw29 times 8 dw 29 + cw43 times 8 dw 43 + cw85 times 8 dw 85 + cw107 times 8 dw 107 + cw21 times 8 dw 21 + +%define LS8 [rsp + 0] ; s8 +%define LSRC_STRIDE [rsp + 8] ; src_stride +%define LD8 [rsp + 16] ; d8 +%define LDST_STRIDE [rsp + 24] ; dst_stride +%define LWIDTH [rsp + 32] ; width +%define LHEIGHT [rsp + 40] ; height + +;The first six integer or pointer arguments are passed in registers +; RDI, RSI, RDX, RCX, R8, and R9 + +;int +;a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride, +; uint8_t *d8, int dst_stride, +; int width, int height); +PROC a8r8g8b8_to_yuvalp_box_amd64_sse2 + push rbx + push rbp + sub rsp, 48 ; local vars, 48 bytes + + mov LS8, rdi ; s8 + mov LSRC_STRIDE, rsi ; src_stride + mov LD8, rdx ; d8 + mov LDST_STRIDE, rcx ; dst_stride + mov LWIDTH, r8 ; width + mov LHEIGHT, r9 ; height + + pxor xmm7, xmm7 + + mov ebx, LHEIGHT ; ebx = height + +row_loop1: + mov rsi, LS8 ; s8 + mov rdi, LD8 ; d8 + + mov ecx, LWIDTH ; ecx = width + shr ecx, 3 ; doing 8 pixels at a time + +loop1: + movdqu xmm0, [rsi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + movdqa xmm4, xmm0 ; alpha + psrld xmm4, 24 ; alpha + pand xmm4, [lsym(cd255)] ; alpha + + movdqu xmm0, [rsi + 16] ; 4 pixels, 16 bytes + movdqa xmm5, xmm0 ; alpha + psrld xmm5, 24 ; alpha + pand xmm5, [lsym(cd255)] ; alpha + packssdw xmm4, xmm5 ; xmm4 = 8 alphas + packuswb xmm4, xmm7 + movq [rdi + 3 * 64 * 64], xmm4 ; out 8 bytes aaaaaaaa + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = (77 * _R + 150 * _G + 29 * _B) >> 8; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw29)] + pmullw xmm5, [lsym(cw150)] + pmullw xmm6, [lsym(cw77)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [rdi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-43 * _R - 85 * _G + 128 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw85)] + pmullw xmm6, [lsym(cw43)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq [rdi + 1 * 64 * 64], xmm4 ; out 8 bytes uuuuuuuu + + ; _V = ((128 * _R - 107 * _G - 21 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw107)] + pmullw xmm6, [lsym(cw21)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq [rdi + 2 * 64 * 64], xmm4 ; out 8 bytes vvvvvvvv + + ; move right + lea rsi, [rsi + 32] + lea rdi, [rdi + 8] + + dec ecx + jnz loop1 + + ; update s8 + mov rax, LS8 ; s8 + add rax, LSRC_STRIDE ; s8 += src_stride + mov LS8, rax + + ; update d8 + mov rax, LD8 ; d8 + add rax, LDST_STRIDE ; d8 += dst_stride + mov LD8, rax + + dec ebx + jnz row_loop1 + + mov rax, 0 ; return value + add rsp, 48 ; local vars, 48 bytes + pop rbp + pop rbx + ret +END_OF_FILE diff --git a/module/amd64/funcs_amd64.h b/module/amd64/funcs_amd64.h index ae38c53b..9d746fdc 100644 --- a/module/amd64/funcs_amd64.h +++ b/module/amd64/funcs_amd64.h @@ -43,6 +43,15 @@ a8r8g8b8_to_nv12_box_amd64_sse2(const uint8_t *s8, int src_stride, uint8_t *d8_y, int dst_stride_y, uint8_t *d8_uv, int dst_stride_uv, int width, int height); +int +a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height); +int +a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height); #endif diff --git a/module/rdp.h b/module/rdp.h index 844db424..534934e3 100644 --- a/module/rdp.h +++ b/module/rdp.h @@ -282,6 +282,8 @@ struct _rdpRec CARD32 last_event_time_ms; CARD32 last_wheel_time_ms; + CARD32 msFrameInterval; + int conNumber; struct _rdpCounts counts; @@ -297,6 +299,8 @@ struct _rdpRec copy_box_proc a8r8g8b8_to_a8b8g8r8_box; copy_box_dst2_proc a8r8g8b8_to_nv12_box; + copy_box_dst2_proc a8r8g8b8_to_nv12_709fr_box; + copy_box_proc a8r8g8b8_to_yuvalp_box; /* multimon */ struct monitor_info minfo[16]; /* client monitor data */ diff --git a/module/rdpCapture.c b/module/rdpCapture.c index 334a4880..7591af47 100644 --- a/module/rdpCapture.c +++ b/module/rdpCapture.c @@ -124,24 +124,18 @@ rdpFillBox_yuvalp(int ax, int ay, /* 19595 38470 7471 -11071 -21736 32807 32756 -27429 -5327 */ -static int -rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay, - const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - BoxPtr rects, int num_rects) +int +a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height) { - const uint8_t *s8; - uint8_t *d8; uint8_t *yptr; uint8_t *uptr; uint8_t *vptr; uint8_t *aptr; const uint32_t *s32; - int index; int jndex; int kndex; - int width; - int height; uint32_t pixel; uint8_t a; int r; @@ -150,6 +144,51 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay, int y; int u; int v; + + for (jndex = 0; jndex < height; jndex++) + { + s32 = (const uint32_t *) s8; + yptr = d8; + uptr = yptr + 64 * 64; + vptr = uptr + 64 * 64; + aptr = vptr + 64 * 64; + kndex = 0; + while (kndex < width) + { + pixel = *(s32++); + RGB_SPLIT(a, r, g, b, pixel); + y = (r * 19595 + g * 38470 + b * 7471) >> 16; + u = (r * -11071 + g * -21736 + b * 32807) >> 16; + v = (r * 32756 + g * -27429 + b * -5327) >> 16; + u = u + 128; + v = v + 128; + y = RDPCLAMP(y, 0, UCHAR_MAX); + u = RDPCLAMP(u, 0, UCHAR_MAX); + v = RDPCLAMP(v, 0, UCHAR_MAX); + *(yptr++) = y; + *(uptr++) = u; + *(vptr++) = v; + *(aptr++) = a; + kndex++; + } + d8 += dst_stride; + s8 += src_stride; + } + return 0; +} + +/******************************************************************************/ +static int +rdpCopyBox_a8r8g8b8_to_yuvalp(rdpClientCon *clientCon, int ax, int ay, + const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + BoxPtr rects, int num_rects) +{ + const uint8_t *s8; + uint8_t *d8; + int index; + int width; + int height; BoxPtr box; dst = dst + (ay << 8) * (dst_stride >> 8) + (ax << 8); @@ -162,35 +201,9 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay, d8 += box->x1 - ax; width = box->x2 - box->x1; height = box->y2 - box->y1; - for (jndex = 0; jndex < height; jndex++) - { - s32 = (const uint32_t *) s8; - yptr = d8; - uptr = yptr + 64 * 64; - vptr = uptr + 64 * 64; - aptr = vptr + 64 * 64; - kndex = 0; - while (kndex < width) - { - pixel = *(s32++); - RGB_SPLIT(a, r, g, b, pixel); - y = (r * 19595 + g * 38470 + b * 7471) >> 16; - u = (r * -11071 + g * -21736 + b * 32807) >> 16; - v = (r * 32756 + g * -27429 + b * -5327) >> 16; - u = u + 128; - v = v + 128; - y = RDPCLAMP(y, 0, UCHAR_MAX); - u = RDPCLAMP(u, 0, UCHAR_MAX); - v = RDPCLAMP(v, 0, UCHAR_MAX); - *(yptr++) = y; - *(uptr++) = u; - *(vptr++) = v; - *(aptr++) = a; - kndex++; - } - d8 += 64; - s8 += src_stride; - } + clientCon->dev->a8r8g8b8_to_yuvalp_box(s8, src_stride, + d8, 64, + width, height); } return 0; } @@ -540,6 +553,103 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride, return 0; } +/******************************************************************************/ +int +a8r8g8b8_to_nv12_709fr_box(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height) +{ + int index; + int jndex; + int R; + int G; + int B; + int Y; + int U; + int V; + int U_sum; + int V_sum; + int pixel; + const uint32_t *s32a; + const uint32_t *s32b; + uint8_t *d8ya; + uint8_t *d8yb; + uint8_t *d8uv; + + for (jndex = 0; jndex < height; jndex += 2) + { + s32a = (const uint32_t *) (s8 + src_stride * jndex); + s32b = (const uint32_t *) (s8 + src_stride * (jndex + 1)); + d8ya = d8_y + dst_stride_y * jndex; + d8yb = d8_y + dst_stride_y * (jndex + 1); + d8uv = d8_uv + dst_stride_uv * (jndex / 2); + for (index = 0; index < width; index += 2) + { + U_sum = 0; + V_sum = 0; + + pixel = s32a[0]; + s32a++; + R = (pixel >> 16) & 0xff; + G = (pixel >> 8) & 0xff; + B = (pixel >> 0) & 0xff; + Y = ( 54 * R + 183 * G + 18 * B) >> 8; + U = ((-29 * R - 99 * G + 128 * B) >> 8) + 128; + V = ((128 * R - 116 * G - 12 * B) >> 8) + 128; + d8ya[0] = RDPCLAMP(Y, 0, 255); + d8ya++; + U_sum += RDPCLAMP(U, 0, 255); + V_sum += RDPCLAMP(V, 0, 255); + + pixel = s32a[0]; + s32a++; + R = (pixel >> 16) & 0xff; + G = (pixel >> 8) & 0xff; + B = (pixel >> 0) & 0xff; + Y = ( 54 * R + 183 * G + 18 * B) >> 8; + U = ((-29 * R - 99 * G + 128 * B) >> 8) + 128; + V = ((128 * R - 116 * G - 12 * B) >> 8) + 128; + d8ya[0] = RDPCLAMP(Y, 0, 255); + d8ya++; + U_sum += RDPCLAMP(U, 0, 255); + V_sum += RDPCLAMP(V, 0, 255); + + pixel = s32b[0]; + s32b++; + R = (pixel >> 16) & 0xff; + G = (pixel >> 8) & 0xff; + B = (pixel >> 0) & 0xff; + Y = ( 54 * R + 183 * G + 18 * B) >> 8; + U = ((-29 * R - 99 * G + 128 * B) >> 8) + 128; + V = ((128 * R - 116 * G - 12 * B) >> 8) + 128; + d8yb[0] = RDPCLAMP(Y, 0, 255); + d8yb++; + U_sum += RDPCLAMP(U, 0, 255); + V_sum += RDPCLAMP(V, 0, 255); + + pixel = s32b[0]; + s32b++; + R = (pixel >> 16) & 0xff; + G = (pixel >> 8) & 0xff; + B = (pixel >> 0) & 0xff; + Y = ( 54 * R + 183 * G + 18 * B) >> 8; + U = ((-29 * R - 99 * G + 128 * B) >> 8) + 128; + V = ((128 * R - 116 * G - 12 * B) >> 8) + 128; + d8yb[0] = RDPCLAMP(Y, 0, 255); + d8yb++; + U_sum += RDPCLAMP(U, 0, 255); + V_sum += RDPCLAMP(V, 0, 255); + + d8uv[0] = (U_sum + 2) / 4; + d8uv++; + d8uv[0] = (V_sum + 2) / 4; + d8uv++; + } + } + return 0; +} + /******************************************************************************/ /* copy rects with no error checking */ static int @@ -577,6 +687,44 @@ rdpCopyBox_a8r8g8b8_to_nv12(rdpClientCon *clientCon, return 0; } +/******************************************************************************/ +/* copy rects with no error checking */ +static int +rdpCopyBox_a8r8g8b8_to_nv12_709fr(rdpClientCon *clientCon, + const uint8_t *src, int src_stride, + int srcx, int srcy, + uint8_t *dst_y, int dst_stride_y, + uint8_t *dst_uv, int dst_stride_uv, + int dstx, int dsty, + BoxPtr rects, int num_rects) +{ + const uint8_t *s8; + uint8_t *d8_y; + uint8_t *d8_uv; + int index; + int width; + int height; + BoxPtr box; + + for (index = 0; index < num_rects; index++) + { + box = rects + index; + s8 = src + (box->y1 - srcy) * src_stride; + s8 += (box->x1 - srcx) * 4; + d8_y = dst_y + (box->y1 - dsty) * dst_stride_y; + d8_y += (box->x1 - dstx) * 1; + d8_uv = dst_uv + ((box->y1 - dsty) / 2) * dst_stride_uv; + d8_uv += (box->x1 - dstx) * 1; + width = box->x2 - box->x1; + height = box->y2 - box->y1; + clientCon->dev->a8r8g8b8_to_nv12_709fr_box(s8, src_stride, + d8_y, dst_stride_y, + d8_uv, dst_stride_uv, + width, height); + } + return 0; +} + /******************************************************************************/ static Bool isShmStatusActive(enum shared_memory_status status) { @@ -610,8 +758,8 @@ wyhash_rfx_tile(const uint8_t *src, int src_stride, int x, int y, uint64_t seed) /******************************************************************************/ static Bool -rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureSimple(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { BoxPtr psrc_rects; BoxRec rect; @@ -624,10 +772,10 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int dst_stride; int dst_format; - LLOGLN(10, ("rdpCapture0:")); + LLOGLN(10, ("rdpCaptureSimple:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture0: WARNING -- Shared memory is not configured." + LLOGLN(0, ("rdpCaptureSimple: WARNING -- Shared memory is not configured." " Aborting capture!")); return FALSE; } @@ -695,7 +843,7 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else { - LLOGLN(0, ("rdpCapture0: unimplemented color conversion")); + LLOGLN(0, ("rdpCaptureSimple: unimplemented color conversion")); } return rv; } @@ -703,8 +851,8 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, /******************************************************************************/ /* make out_rects always multiple of 16 width and height */ static Bool -rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureSufA16(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { BoxPtr psrc_rects; BoxRec rect; @@ -735,10 +883,10 @@ rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int dst_stride; int dst_format; - LLOGLN(10, ("rdpCapture1:")); + LLOGLN(10, ("rdpCaptureSufA16:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture1: WARNING -- Shared memory is not configured." + LLOGLN(0, ("rdpCaptureSufA16: WARNING -- Shared memory is not configured." " Aborting capture!")); return FALSE; } @@ -844,15 +992,15 @@ rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else { - LLOGLN(0, ("rdpCapture1: unimplemented color conversion")); + LLOGLN(0, ("rdpCaptureSufA16: unimplemented color conversion")); } return rv; } /******************************************************************************/ static Bool -rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureGfxPro(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { int x; int y; @@ -874,11 +1022,11 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int num_crcs; int mon_index; - LLOGLN(10, ("rdpCapture2:")); + LLOGLN(10, ("rdpCaptureGfxPro:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture2: WARNING -- Shared memory is not configured" + LLOGLN(0, ("rdpCaptureGfxPro: WARNING -- Shared memory is not configured" " for RFX. Aborting capture!")); return FALSE; } @@ -904,7 +1052,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, num_crcs = crc_stride * ((id->height + 63) / 64); if (num_crcs != clientCon->num_rfx_crcs_alloc[mon_index]) { - LLOGLN(0, ("rdpCapture2: resize the crc list was %d now %d", + LLOGLN(0, ("rdpCaptureGfxPro: resize the crc list was %d now %d", clientCon->num_rfx_crcs_alloc[mon_index], num_crcs)); /* resize the crc list */ clientCon->num_rfx_crcs_alloc[mon_index] = num_crcs; @@ -924,11 +1072,11 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, rect.x2 = rect.x1 + XRDP_RFX_ALIGN; rect.y2 = rect.y1 + XRDP_RFX_ALIGN; rcode = rdpRegionContainsRect(in_reg, &rect); - LLOGLN(10, ("rdpCapture2: rcode %d", rcode)); + LLOGLN(10, ("rdpCaptureGfxPro: rcode %d", rcode)); if (rcode == rgnOUT) { - LLOGLN(10, ("rdpCapture2: rgnOUT")); + LLOGLN(10, ("rdpCaptureGfxPro: rgnOUT")); rdpRegionInit(&tile_reg, &rect, 0); rdpRegionSubtract(in_reg, in_reg, &tile_reg); rdpRegionUninit(&tile_reg); @@ -939,14 +1087,14 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, crc = WYHASH_SEED; if (rcode == rgnPART) { - LLOGLN(10, ("rdpCapture2: rgnPART")); + LLOGLN(10, ("rdpCaptureGfxPro: rgnPART")); rdpFillBox_yuvalp(x, y, dst, dst_stride); rdpRegionInit(&tile_reg, &rect, 0); rdpRegionIntersect(&tile_reg, in_reg, &tile_reg); rects = REGION_RECTS(&tile_reg); num_rects = REGION_NUM_RECTS(&tile_reg); crc = wyhash((const void*)rects, num_rects * sizeof(BoxRec), crc, _wyp); - rdpCopyBox_a8r8g8b8_to_yuvalp(x, y, + rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y, src, src_stride, dst, dst_stride, rects, num_rects); @@ -956,16 +1104,16 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else /* rgnIN */ { - LLOGLN(10, ("rdpCapture2: rgnIN")); + LLOGLN(10, ("rdpCaptureGfxPro: rgnIN")); crc = wyhash_rfx_tile(src, src_stride, x, y, crc); } crc_offset = (y / XRDP_RFX_ALIGN) * crc_stride + (x / XRDP_RFX_ALIGN); - LLOGLN(10, ("rdpCapture2: crc 0x%" PRIx64 " 0x%" PRIx64, + LLOGLN(10, ("rdpCaptureGfxPro: crc 0x%" PRIx64 " 0x%" PRIx64, crc, clientCon->rfx_crcs[mon_index][crc_offset])); if (crc == clientCon->rfx_crcs[mon_index][crc_offset]) { - LLOGLN(10, ("rdpCapture2: crc skip at x %d y %d", x, y)); + LLOGLN(10, ("rdpCaptureGfxPro: crc skip at x %d y %d", x, y)); rdpRegionInit(&tile_reg, &rect, 0); rdpRegionSubtract(in_reg, in_reg, &tile_reg); rdpRegionUninit(&tile_reg); @@ -975,7 +1123,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, /* lazily only do this if hash wasn't identical */ if (rcode != rgnPART) { - rdpCopyBox_a8r8g8b8_to_yuvalp(x, y, + rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y, src, src_stride, dst, dst_stride, &rect, 1); @@ -1002,8 +1150,8 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, /******************************************************************************/ /* make out_rects always multiple of 2 width and height */ static Bool -rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureSufA2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { BoxPtr psrc_rects; BoxRec rect; @@ -1017,11 +1165,11 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int dst_stride; int dst_format; - LLOGLN(10, ("rdpCapture3:")); + LLOGLN(10, ("rdpCaptureSufA2:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture3: WARNING -- Shared memory is not configured." + LLOGLN(0, ("rdpCaptureSufA2: WARNING -- Shared memory is not configured." " Aborting capture!")); return FALSE; } @@ -1081,7 +1229,100 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else { - LLOGLN(0, ("rdpCapture3: unimplemented color conversion")); + LLOGLN(0, ("rdpCaptureSufA2: unimplemented color conversion")); + } + + return rv; +} + +/******************************************************************************/ +/* make out_rects always multiple of 2 width and height */ +static Bool +rdpCaptureGfxA2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) +{ + BoxPtr psrc_rects; + BoxRec rect; + int num_rects; + int index; + uint8_t *dst_uv; + Bool rv; + const uint8_t *src; + uint8_t *dst; + int src_stride; + int dst_stride; + int dst_format; + + LLOGLN(10, ("rdpCaptureGfxA2:")); + + if (!isShmStatusActive(clientCon->shmemstatus)) + { + LLOGLN(0, ("rdpCaptureGfxA2: WARNING -- Shared memory is not configured." + " Aborting capture!")); + return FALSE; + } + + rv = TRUE; + + rdpRegionTranslate(in_reg, -id->left, -id->top); + + num_rects = REGION_NUM_RECTS(in_reg); + psrc_rects = REGION_RECTS(in_reg); + + if (num_rects < 1) + { + return FALSE; + } + + *num_out_rects = num_rects; + + *out_rects = g_new(BoxRec, num_rects * 4); + index = 0; + while (index < num_rects) + { + rect = psrc_rects[index]; + LLOGLN(10, ("old x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.y1, + rect.x2, rect.y2)); + rect.x1 -= rect.x1 & 1; + rect.y1 -= rect.y1 & 1; + rect.x2 += rect.x2 & 1; + rect.y2 += rect.y2 & 1; + if (rect.x2 > id->width) + { + rect.x2 = id->width & ~1; + } + if (rect.y2 > id->height) + { + rect.y2 = id->height & ~1; + } + LLOGLN(10, ("new x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.y1, + rect.x2, rect.y2)); + (*out_rects)[index] = rect; + index++; + } + + src = id->pixels; + dst = id->shmem_pixels; + dst_format = clientCon->rdp_format; + src_stride = id->lineBytes; + dst_stride = id->width; + + src = src + src_stride * id->top + id->left * 4; + + if (dst_format == XRDP_nv12_709fr) + { + dst_uv = dst; + dst_uv += id->width * id->height; + rdpCopyBox_a8r8g8b8_to_nv12_709fr(clientCon, + src, src_stride, 0, 0, + dst, dst_stride, + dst_uv, dst_stride, + 0, 0, + *out_rects, num_rects); + } + else + { + LLOGLN(0, ("rdpCaptureGfxA2: unimplemented color conversion")); } return rv; @@ -1153,7 +1394,7 @@ Bool rdpCapture(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int *num_out_rects, struct image_data *id) { - int mode; + enum xrdp_capture_code mode; LLOGLN(10, ("rdpCapture:")); mode = clientCon->client_info.capture_code; @@ -1170,18 +1411,20 @@ rdpCapture(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } switch (mode) { - case 0: - return rdpCapture0(clientCon, in_reg, out_rects, num_out_rects, id); - case 1: - return rdpCapture1(clientCon, in_reg, out_rects, num_out_rects, id); - case 2: - case 4: - /* used for remotefx capture */ - return rdpCapture2(clientCon, in_reg, out_rects, num_out_rects, id); - case 3: - case 5: + case CC_SIMPLE: + return rdpCaptureSimple(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_SUF_A16: + return rdpCaptureSufA16(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_SUF_RFX: /* surface command RFX */ + /* FALLTHROUGH */ + case CC_GFX_PRO: /* GFX progressive */ + return rdpCaptureGfxPro(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_SUF_A2: /* surface command h264 */ + /* used for even align capture */ + return rdpCaptureSufA2(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_GFX_A2: /* GFX h264 */ /* used for even align capture */ - return rdpCapture3(clientCon, in_reg, out_rects, num_out_rects, id); + return rdpCaptureGfxA2(clientCon, in_reg, out_rects, num_out_rects, id); default: LLOGLN(0, ("rdpCapture: mode %d not implemented", mode)); break; diff --git a/module/rdpCapture.h b/module/rdpCapture.h index 72a9336e..7e38508e 100644 --- a/module/rdpCapture.h +++ b/module/rdpCapture.h @@ -48,5 +48,14 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride, uint8_t *d8_y, int dst_stride_y, uint8_t *d8_uv, int dst_stride_uv, int width, int height); +extern _X_EXPORT int +a8r8g8b8_to_nv12_709fr_box(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height); +extern _X_EXPORT int +a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height); #endif diff --git a/module/rdpClientCon.c b/module/rdpClientCon.c index a3366fa5..b98db64b 100644 --- a/module/rdpClientCon.c +++ b/module/rdpClientCon.c @@ -776,47 +776,57 @@ rdpClientConResizeAllMemoryAreas(rdpPtr dev, rdpClientCon *clientCon) clientCon->rdp_height = height; /* Set the capture parameters */ - if ((clientCon->client_info.capture_code == 2) || /* RFX */ - (clientCon->client_info.capture_code == 4)) + switch(clientCon->client_info.capture_code) { - LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got RFX capture")); - /* RFX capture needs fixed-size rectangles */ - clientCon->cap_width = RDPALIGN(width, XRDP_RFX_ALIGN); - clientCon->cap_height = RDPALIGN(height, XRDP_RFX_ALIGN); - LLOGLN(0, (" cap_width %d cap_height %d", - clientCon->cap_width, clientCon->cap_height)); + case CC_SUF_RFX: /* RFX */ + case CC_GFX_PRO: + LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got RFX capture")); + /* RFX capture needs fixed-size rectangles */ + clientCon->cap_width = RDPALIGN(width, XRDP_RFX_ALIGN); + clientCon->cap_height = RDPALIGN(height, XRDP_RFX_ALIGN); + LLOGLN(0, (" cap_width %d cap_height %d", + clientCon->cap_width, clientCon->cap_height)); - bytes = clientCon->cap_width * clientCon->cap_height * - clientCon->rdp_Bpp; + bytes = clientCon->cap_width * clientCon->cap_height * + clientCon->rdp_Bpp; - clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; - clientCon->cap_stride_bytes = clientCon->cap_width * 4; - shmemstatus = SHM_RFX_ACTIVE_PENDING; - } - else if ((clientCon->client_info.capture_code == 3) || /* H264 */ - (clientCon->client_info.capture_code == 5)) - { - LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got H264 capture")); - clientCon->cap_width = width; - clientCon->cap_height = height; + clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; + clientCon->cap_stride_bytes = clientCon->cap_width * 4; + shmemstatus = SHM_RFX_ACTIVE_PENDING; - bytes = clientCon->cap_width * clientCon->cap_height * 2; + dev->msFrameInterval = clientCon->client_info.rfx_frame_interval; + break; + case CC_SUF_A2: /* H264 */ + case CC_GFX_A2: + LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got H264 capture")); + clientCon->cap_width = width; + clientCon->cap_height = height; - clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; - clientCon->cap_stride_bytes = clientCon->cap_width * 4; - shmemstatus = SHM_H264_ACTIVE_PENDING; - } - else - { - clientCon->cap_width = width; - clientCon->cap_height = height; + bytes = clientCon->cap_width * clientCon->cap_height * 2; + + clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; + clientCon->cap_stride_bytes = clientCon->cap_width * 4; + shmemstatus = SHM_H264_ACTIVE_PENDING; - bytes = width * height * clientCon->rdp_Bpp; + dev->msFrameInterval = clientCon->client_info.h264_frame_interval; + break; + default: + LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got normal capture")); + clientCon->cap_width = width; + clientCon->cap_width = width; + clientCon->cap_height = height; - clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; - clientCon->cap_stride_bytes = clientCon->cap_width * clientCon->rdp_Bpp; - shmemstatus = SHM_ACTIVE_PENDING; + bytes = width * height * clientCon->rdp_Bpp; + + clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; + clientCon->cap_stride_bytes = clientCon->cap_width * clientCon->rdp_Bpp; + shmemstatus = SHM_ACTIVE_PENDING; + + dev->msFrameInterval = clientCon->client_info.normal_frame_interval; + break; } + + LLOGLN(0, (" msFrameInterval %ld", (long)dev->msFrameInterval)); rdpClientConAllocateSharedMemory(clientCon, bytes); if (clientCon->client_info.capture_format != 0) @@ -1011,12 +1021,12 @@ rdpSendMemoryAllocationComplete(rdpPtr dev, rdpClientCon *clientCon) switch (clientCon->client_info.capture_code) { - case 2: - case 4: + case CC_SUF_RFX: + case CC_GFX_PRO: alignment = XRDP_RFX_ALIGN; break; - case 3: - case 5: + case CC_SUF_A2: + case CC_GFX_A2: alignment = XRDP_H264_ALIGN; break; default: @@ -2539,7 +2549,7 @@ rdpClientConScheduleDeferredUpdate(rdpPtr dev) { dev->sendUpdateScheduled = TRUE; dev->sendUpdateTimer = - TimerSet(dev->sendUpdateTimer, 0, 40, + TimerSet(dev->sendUpdateTimer, 0, dev->msFrameInterval, rdpClientConDeferredUpdateCallback, dev); } } @@ -2608,7 +2618,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, int num_rects_d; int num_rects_c; struct stream *s; - int capture_code; + enum xrdp_capture_code capture_code; int start_frame_bytes; int wiretosurface1_bytes; int wiretosurface2_bytes; @@ -2625,6 +2635,8 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, id->flags, id->left, id->top, id->width, id->height)); capture_code = clientCon->client_info.capture_code; + LLOGLN(10, ("rdpClientConSendPaintRectShmFd: capture_code %d", + capture_code)); num_rects_d = REGION_NUM_RECTS(dirtyReg); num_rects_c = numCopyRects; @@ -2636,7 +2648,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, rdpClientConBeginUpdate(dev, clientCon); - if (capture_code < 4) + if (capture_code < CC_GFX_PRO) { /* non gfx */ size = 2 + 2 + 2 + num_rects_d * 8 + 2 + num_rects_c * 8; @@ -2656,15 +2668,15 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, out_uint32_le(s, clientCon->rect_id); out_uint32_le(s, id->shmem_bytes); out_uint32_le(s, id->shmem_offset); - if (capture_code == 2) /* rfx */ - { + if (capture_code == CC_SUF_RFX) /* rfx */ + { out_uint16_le(s, id->left); out_uint16_le(s, id->top); out_uint16_le(s, id->width); out_uint16_le(s, id->height); - } - else - { + } + else + { out_uint16_le(s, 0); out_uint16_le(s, 0); out_uint16_le(s, clientCon->cap_width); @@ -2673,7 +2685,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, rdpClientConSendPending(clientCon->dev, clientCon); g_sck_send_fd_set(clientCon->sck, "int", 4, &(id->shmem_fd), 1); } - else if (capture_code == 4) /* gfx pro rfx */ + else if (capture_code == CC_GFX_PRO) /* gfx pro rfx */ { start_frame_bytes = 8 + 8; wiretosurface2_bytes = 8 + 13 + @@ -2745,7 +2757,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, out_uint32_le(s, 0); /* shmem_bytes */ } } - else if (capture_code == 5) /* gfx h264 */ + else if (capture_code == CC_GFX_A2) /* gfx h264 */ { start_frame_bytes = 8 + 8; wiretosurface1_bytes = 8 + 9 + @@ -2979,7 +2991,6 @@ rdpDeferredUpdateCallback(OsTimerPtr timer, CARD32 now, pointer arg) /******************************************************************************/ -#define MIN_MS_BETWEEN_FRAMES 40 #define MIN_MS_TO_WAIT_FOR_MORE_UPDATES 4 #define UPDATE_RETRY_TIMEOUT 200 // After this number of retries, give up and perform the capture anyway. This prevents an infinite loop. static void @@ -2998,7 +3009,7 @@ rdpScheduleDeferredUpdate(rdpClientCon *clientCon) for more changes before sending an update. Always waiting the longer delay would introduce unnecessarily much latency. */ msToWait = MIN_MS_TO_WAIT_FOR_MORE_UPDATES; - minNextUpdateTime = clientCon->lastUpdateTime + MIN_MS_BETWEEN_FRAMES; + minNextUpdateTime = clientCon->lastUpdateTime + clientCon->dev->msFrameInterval; /* the first check is to gracefully handle the infrequent case of the time wrapping around */ if(clientCon->lastUpdateTime < curTime && diff --git a/module/rdpClientCon.h b/module/rdpClientCon.h index 5ff1de21..b4c443cf 100644 --- a/module/rdpClientCon.h +++ b/module/rdpClientCon.h @@ -120,6 +120,8 @@ struct _rdpClientCon int updateScheduled; /* boolean */ int updateRetries; + CARD32 msFrameInterval; + RegionPtr dirtyRegion; int num_rfx_crcs_alloc[16]; diff --git a/module/rdpSimd.c b/module/rdpSimd.c index 49a3653e..59feb9da 100644 --- a/module/rdpSimd.c +++ b/module/rdpSimd.c @@ -62,6 +62,261 @@ int g_simd_use_accel = 1; #define LLOGLN(_level, _args) \ do { if (_level < LOG_LEVEL) { ErrorF _args ; ErrorF("\n"); } } while (0) +#if SIMD_USE_ACCEL + +#if defined(__x86_64__) || defined(__AMD64__) || defined (_M_AMD64) +/******************************************************************************/ +static int +a8r8g8b8_to_nv12_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_nv12_box_amd64_sse2(s8, src_stride, + d8_y, dst_stride_y, + d8_uv, dst_stride_uv, + aligned_width, height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_nv12_box(s8 + aligned_width * 4, src_stride, + d8_y + aligned_width, dst_stride_y, + d8_uv + aligned_width, dst_stride_uv, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} + +/******************************************************************************/ +static int +a8r8g8b8_to_nv12_709fr_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_nv12_709fr_box_amd64_sse2(s8, src_stride, + d8_y, dst_stride_y, + d8_uv, dst_stride_uv, + aligned_width, + height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_nv12_709fr_box(s8 + aligned_width * 4, + src_stride, + d8_y + aligned_width, + dst_stride_y, + d8_uv + aligned_width, + dst_stride_uv, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} + +/*****************************************************************************/ +int +a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_yuvalp_box_amd64_sse2(s8, src_stride, + d8, dst_stride, + aligned_width, height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_yuvalp_box(s8 + aligned_width * 4, src_stride, + d8 + aligned_width, dst_stride, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} +#endif + +#if defined(__x86__) || defined(_M_IX86) || defined(__i386__) +/******************************************************************************/ +static int +a8r8g8b8_to_nv12_box_x86_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_nv12_box_x86_sse2(s8, src_stride, + d8_y, dst_stride_y, + d8_uv, dst_stride_uv, + aligned_width, height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_nv12_box(s8 + aligned_width * 4, src_stride, + d8_y + aligned_width, dst_stride_y, + d8_uv + aligned_width, dst_stride_uv, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} + +/******************************************************************************/ +static int +a8r8g8b8_to_nv12_709fr_box_x86_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_nv12_709fr_box_x86_sse2(s8, src_stride, + d8_y, dst_stride_y, + d8_uv, dst_stride_uv, + aligned_width, height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_nv12_709fr_box(s8 + aligned_width * 4, + src_stride, + d8_y + aligned_width, + dst_stride_y, + d8_uv + aligned_width, + dst_stride_uv, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} + +/*****************************************************************************/ +int +a8r8g8b8_to_yuvalp_box_x86_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_yuvalp_box_x86_sse2(s8, src_stride, + d8, dst_stride, + aligned_width, height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_yuvalp_box(s8 + aligned_width * 4, src_stride, + d8 + aligned_width, dst_stride, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} +#endif + +#endif + /*****************************************************************************/ Bool rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) @@ -77,6 +332,8 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->uyvy_to_rgb32 = UYVY_to_RGB32; dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box; dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box; + dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box; + dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box; #if SIMD_USE_ACCEL if (g_simd_use_accel) { @@ -92,7 +349,9 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->yuy2_to_rgb32 = yuy2_to_rgb32_amd64_sse2; dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2; dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_amd64_sse2; - dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2; + dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2_wrap; + dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box_amd64_sse2_wrap; + dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap; LLOGLN(0, ("rdpSimdInit: sse2 amd64 yuv functions assigned")); } #elif defined(__x86__) || defined(_M_IX86) || defined(__i386__) @@ -107,7 +366,9 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->yuy2_to_rgb32 = yuy2_to_rgb32_x86_sse2; dev->uyvy_to_rgb32 = uyvy_to_rgb32_x86_sse2; dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_x86_sse2; - dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2; + dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2_wrap; + dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box_x86_sse2_wrap; + dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_x86_sse2_wrap; LLOGLN(0, ("rdpSimdInit: sse2 x86 yuv functions assigned")); } #endif diff --git a/module/x86/Makefile.am b/module/x86/Makefile.am index ed106863..92acda61 100644 --- a/module/x86/Makefile.am +++ b/module/x86/Makefile.am @@ -3,6 +3,8 @@ NAFLAGS += -DASM_ARCH_I386 ASMSOURCES = \ a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm \ a8r8g8b8_to_nv12_box_x86_sse2.asm \ + a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm \ + a8r8g8b8_to_yuvalp_box_x86_sse2.asm \ cpuid_x86.asm \ i420_to_rgb32_x86_sse2.asm \ uyvy_to_rgb32_x86_sse2.asm \ diff --git a/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm b/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm new file mode 100644 index 00000000..262f1af3 --- /dev/null +++ b/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm @@ -0,0 +1,300 @@ +; +;Copyright 2015 Jay Sorg +;Copyright 2017 mirabilos +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;ARGB to NV12 709 full range +;x86 SSE2 +; +; notes +; address s8 should be aligned on 16 bytes, will be slower if not +; width should be multiple of 8 and > 0 +; height should be even and > 0 + +%include "common.asm" + +PREPARE_RODATA + cd255 times 4 dd 255 + + cw255 times 8 dw 255 + cw128 times 8 dw 128 + cw54 times 8 dw 54 + cw183 times 8 dw 183 + cw18 times 8 dw 18 + cw29 times 8 dw 29 + cw99 times 8 dw 99 + cw116 times 8 dw 116 + cw12 times 8 dw 12 + cw2 times 8 dw 2 + +%define LU1 [esp + 0] ; first line U, 8 bytes +%define LV1 [esp + 8] ; first line V, 8 bytes +%define LU2 [esp + 16] ; second line U, 8 bytes +%define LV2 [esp + 24] ; second line V, 8 bytes + +%define LS8 [esp + 52] ; s8 +%define LSRC_STRIDE [esp + 56] ; src_stride +%define LD8_Y [esp + 60] ; d8_y +%define LDST_Y_STRIDE [esp + 64] ; dst_stride_y +%define LD8_UV [esp + 68] ; d8_uv +%define LDST_UV_STRIDE [esp + 72] ; dst_stride_uv +%define LWIDTH [esp + 76] ; width +%define LHEIGHT [esp + 80] ; height + +;int +;a8r8g8b8_to_nv12_709fr_box_x86_sse2(const char *s8, int src_stride, +; char *d8_y, int dst_stride_y, +; char *d8_uv, int dst_stride_uv, +; int width, int height); +PROC a8r8g8b8_to_nv12_709fr_box_x86_sse2 + push ebx + RETRIEVE_RODATA + push esi + push edi + push ebp + sub esp, 32 ; local vars, 32 bytes + + pxor xmm7, xmm7 + + mov ebp, LHEIGHT ; ebp = height + shr ebp, 1 ; doing 2 lines at a time + +row_loop1: + mov esi, LS8 ; s8 + mov edi, LD8_Y ; d8_y + mov edx, LD8_UV ; d8_uv + + mov ecx, LWIDTH ; ecx = width + shr ecx, 3 ; doing 8 pixels at a time + +loop1: + ; first line + movdqu xmm0, [esi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + + movdqu xmm0, [esi + 16] ; 4 pixels, 16 bytes + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = (( 66 * _R + 129 * _G + 25 * _B) >> 8) + 16; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw18)] + pmullw xmm5, [lsym(cw183)] + pmullw xmm6, [lsym(cw54)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [edi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-38 * _R - 74 * _G + 112 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw99)] + pmullw xmm6, [lsym(cw29)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LU1, xmm4 ; save for later + + ; _V = ((112 * _R - 94 * _G - 18 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw116)] + pmullw xmm6, [lsym(cw12)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LV1, xmm4 ; save for later + + ; go down to second line + add esi, LSRC_STRIDE + add edi, LDST_Y_STRIDE + + ; second line + movdqu xmm0, [esi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + + movdqu xmm0, [esi + 16] ; 4 pixels, 16 bytes + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = (( 66 * _R + 129 * _G + 25 * _B) >> 8) + 16; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw18)] + pmullw xmm5, [lsym(cw183)] + pmullw xmm6, [lsym(cw54)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [edi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-38 * _R - 74 * _G + 112 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw99)] + pmullw xmm6, [lsym(cw29)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LU2, xmm4 ; save for later + + ; _V = ((112 * _R - 94 * _G - 18 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw116)] + pmullw xmm6, [lsym(cw12)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LV2, xmm4 ; save for later + + ; uv add and divide(average) + movq mm1, LU1 ; u from first line + movq mm3, mm1 + pand mm1, [lsym(cw255)] + psrlw mm3, 8 + pand mm3, [lsym(cw255)] + paddw mm1, mm3 ; add + movq mm2, LU2 ; u from second line + movq mm3, mm2 + pand mm2, [lsym(cw255)] + paddw mm1, mm2 ; add + psrlw mm3, 8 + pand mm3, [lsym(cw255)] + paddw mm1, mm3 ; add + paddw mm1, [lsym(cw2)] ; add 2 + psrlw mm1, 2 ; div 4 + + movq mm2, LV1 ; v from first line + movq mm4, mm2 + pand mm2, [lsym(cw255)] + psrlw mm4, 8 + pand mm4, [lsym(cw255)] + paddw mm2, mm4 ; add + movq mm3, LV2 ; v from second line + movq mm4, mm3 + pand mm3, [lsym(cw255)] + paddw mm2, mm3 ; add + psrlw mm4, 8 + pand mm4, [lsym(cw255)] + paddw mm2, mm4 ; add + paddw mm2, [lsym(cw2)] ; add 2 + psrlw mm2, 2 ; div 4 + + packuswb mm1, mm1 + packuswb mm2, mm2 + + punpcklbw mm1, mm2 ; uv + movq [edx], mm1 ; out 8 bytes uvuvuvuv + + ; go up to first line + sub esi, LSRC_STRIDE + sub edi, LDST_Y_STRIDE + + ; move right + lea esi, [esi + 32] + lea edi, [edi + 8] + lea edx, [edx + 8] + + dec ecx + jnz loop1 + + ; update s8 + mov eax, LS8 ; s8 + add eax, LSRC_STRIDE ; s8 += src_stride + add eax, LSRC_STRIDE ; s8 += src_stride + mov LS8, eax + + ; update d8_y + mov eax, LD8_Y ; d8_y + add eax, LDST_Y_STRIDE ; d8_y += dst_stride_y + add eax, LDST_Y_STRIDE ; d8_y += dst_stride_y + mov LD8_Y, eax + + ; update d8_uv + mov eax, LD8_UV ; d8_uv + add eax, LDST_UV_STRIDE ; d8_uv += dst_stride_uv + mov LD8_UV, eax + + dec ebp + jnz row_loop1 + + mov eax, 0 ; return value + add esp, 32 ; local vars, 32 bytes + pop ebp + pop edi + pop esi + pop ebx + ret +END_OF_FILE diff --git a/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm b/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm new file mode 100644 index 00000000..cec02043 --- /dev/null +++ b/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm @@ -0,0 +1,171 @@ +; +;Copyright 2024 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;ARGB to YUVALP +;x86 SSE2 +; +; notes +; address s8 should be aligned on 16 bytes, will be slower if not +; width must be multiple of 8 and > 0 +; height must be > 0 + +%include "common.asm" + +PREPARE_RODATA + cd255 times 4 dd 255 + cw128 times 8 dw 128 + cw77 times 8 dw 77 + cw150 times 8 dw 150 + cw29 times 8 dw 29 + cw43 times 8 dw 43 + cw85 times 8 dw 85 + cw107 times 8 dw 107 + cw21 times 8 dw 21 + +%define LS8 [esp + 20] ; s8 +%define LSRC_STRIDE [esp + 24] ; src_stride +%define LD8 [esp + 28] ; d8 +%define LDST_STRIDE [esp + 32] ; dst_stride +%define LWIDTH [esp + 36] ; width +%define LHEIGHT [esp + 40] ; height + +;int +;a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride, +; uint8_t *d8, int dst_stride, +; int width, int height); +PROC a8r8g8b8_to_yuvalp_box_x86_sse2 + push ebx + RETRIEVE_RODATA + push esi + push edi + push ebp + + pxor xmm7, xmm7 + + mov ebp, LHEIGHT ; ebp = height + +row_loop1: + mov esi, LS8 ; s8 + mov edi, LD8 ; d8 + + mov ecx, LWIDTH ; ecx = width + shr ecx, 3 ; doing 8 pixels at a time + +loop1: + movdqu xmm0, [esi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + movdqa xmm4, xmm0 ; alpha + psrld xmm4, 24 ; alpha + pand xmm4, [lsym(cd255)] ; alpha + + movdqu xmm0, [esi + 16] ; 4 pixels, 16 bytes + movdqa xmm5, xmm0 ; alpha + psrld xmm5, 24 ; alpha + pand xmm5, [lsym(cd255)] ; alpha + packssdw xmm4, xmm5 ; xmm4 = 8 alphas + packuswb xmm4, xmm7 + movq [edi + 3 * 64 * 64], xmm4 ; out 8 bytes aaaaaaaa + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = (77 * _R + 150 * _G + 29 * _B) >> 8; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw29)] + pmullw xmm5, [lsym(cw150)] + pmullw xmm6, [lsym(cw77)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [edi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-43 * _R - 85 * _G + 128 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw85)] + pmullw xmm6, [lsym(cw43)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq [edi + 1 * 64 * 64], xmm4 ; out 8 bytes uuuuuuuu + + ; _V = ((128 * _R - 107 * _G - 21 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw107)] + pmullw xmm6, [lsym(cw21)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq [edi + 2 * 64 * 64], xmm4 ; out 8 bytes vvvvvvvv + + ; move right + lea esi, [esi + 32] + lea edi, [edi + 8] + + dec ecx + jnz loop1 + + ; update s8 + mov eax, LS8 ; s8 + add eax, LSRC_STRIDE ; s8 += src_stride + mov LS8, eax + + ; update d8 + mov eax, LD8 ; d8 + add eax, LDST_STRIDE ; d8 += dst_stride + mov LD8, eax + + dec ebp + jnz row_loop1 + + mov eax, 0 ; return value + pop ebp + pop edi + pop esi + pop ebx + ret +END_OF_FILE diff --git a/module/x86/funcs_x86.h b/module/x86/funcs_x86.h index c70cc8cf..a08834f8 100644 --- a/module/x86/funcs_x86.h +++ b/module/x86/funcs_x86.h @@ -43,6 +43,15 @@ a8r8g8b8_to_nv12_box_x86_sse2(const uint8_t *s8, int src_stride, uint8_t *d8_y, int dst_stride_y, uint8_t *d8_uv, int dst_stride_uv, int width, int height); +int +a8r8g8b8_to_nv12_709fr_box_x86_sse2(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height); +int +a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height); #endif