Skip to content

Commit

Permalink
Port rounding of some x87 instructions from Box86 (#2242)
Browse files Browse the repository at this point in the history
* Port rounding of some x87 instructions from Box86

Ported from ptitSeb/box86#951. The original pull
request and this commit also contain some improvements on precision of
`F2XM1` and `FYL2XP1`.

* Run fpu_rounding test with dynarec only for ARM64

The rounding changes have been implemented in the dynarec only for ARM64.
  • Loading branch information
Hagb authored Jan 8, 2025
1 parent b99893d commit 653a67c
Show file tree
Hide file tree
Showing 21 changed files with 1,361 additions and 24 deletions.
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1497,6 +1497,17 @@ add_test(avx_intrinsics ${CMAKE_COMMAND} -D TEST_PROGRAM=${CMAKE_BINARY_DIR}/${B

set_tests_properties(avx_intrinsics PROPERTIES ENVIRONMENT "BOX64_DYNAREC_FASTNAN=0;BOX64_DYNAREC_FASTROUND=0;BOX64_AVX=2")

# fpu_rounding: hands runTest.cmake the box64 binary, the tests/test31 program,
# an output file (tmpfile31.txt) and the reference file tests/ref31.txt.
add_test(fpu_rounding ${CMAKE_COMMAND} -D TEST_PROGRAM=${CMAKE_BINARY_DIR}/${BOX64}
-D TEST_ARGS=${CMAKE_SOURCE_DIR}/tests/test31 -D TEST_OUTPUT=tmpfile31.txt
-D TEST_REFERENCE=${CMAKE_SOURCE_DIR}/tests/ref31.txt
-P ${CMAKE_SOURCE_DIR}/runTest.cmake )

# The x87 rounding support is implemented in the dynarec only for ARM64.
# There, run with BOX64_DYNAREC_FASTROUND=0 so the current rounding mode is
# used for computation (BOX64_DYNAREC_TEST=1 presumably cross-checks the
# dynarec against the interpreter - confirm).
# On every other configuration run with BOX64_DYNAREC=0 (presumably
# interpreter only - confirm).
if(ARM_DYNAREC)
set_tests_properties(fpu_rounding PROPERTIES ENVIRONMENT "BOX64_DYNAREC_FASTROUND=0;BOX64_DYNAREC_TEST=1")
else()
set_tests_properties(fpu_rounding PROPERTIES ENVIRONMENT "BOX64_DYNAREC=0")
endif()

else()

add_test(bootSyscall ${CMAKE_COMMAND} -D TEST_PROGRAM=${CMAKE_BINARY_DIR}/${BOX64}
Expand Down
4 changes: 2 additions & 2 deletions docs/USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -368,8 +368,8 @@ Generate x86 -NAN.
#### BOX64_DYNAREC_FASTROUND *

Generate precise x86 rounding.
* 0 : Generate float/double -> int rounding like on x86.
* 1 : Do not do anything special with edge case Rounding, to go as fast as possible (no INF/NAN/Overflow -> MIN_INT conversion). (default, faster)
* 0 : Generate float/double -> int rounding and use current rounding mode for float/double computation like on x86.
* 1 : Do not do anything special with edge-case rounding, to go as fast as possible (no INF/NAN/Overflow -> MIN_INT conversion, and non-default rounding modes are not honoured). (default, faster)
* 2 : Everything from 1 plus also fast round of double->float (not taking into account current rounding mode).

#### BOX64_DYNAREC_SAFEFLAGS *
Expand Down
4 changes: 2 additions & 2 deletions docs/box64.pod
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,8 @@ Enable/Disable generation of -NAN

Enable/Disable generation of precise x86 rounding

* 0 : Generate float/double -> int rounding like on x86
* 1 : Don't do anything special with edge case Rounding, to go as fast as possible (no INF/NAN/Overflow -> MIN_INT conversion) (default, faster)
* 0 : Generate float/double -> int rounding and use current rounding mode for float/double computation like on x86
* 1 : Don't do anything special with edge-case rounding, to go as fast as possible (no INF/NAN/Overflow -> MIN_INT conversion, and non-default rounding modes are not honoured) (default, faster)

=item B<BOX64_DYNAREC_SAFEFLAGS>=I<0|1|2>

Expand Down
2 changes: 1 addition & 1 deletion src/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -847,7 +847,7 @@ void LoadLogEnv()
box64_dynarec_fastround = p[0]-'0';
}
if(!box64_dynarec_fastround)
printf_log(LOG_INFO, "Dynarec will try to generate x86 precise IEEE->int rounding\n");
printf_log(LOG_INFO, "Dynarec will try to generate x86 precise IEEE->int rounding and set rounding mode for computation\n");
else if(box64_dynarec_fastround==2)
printf_log(LOG_INFO, "Dynarec will generate x86 very imprecise double->float rounding\n");
}
Expand Down
49 changes: 49 additions & 0 deletions src/dynarec/arm64/dynarec_arm64_d8.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin

uint8_t nextop = F8;
uint8_t ed;
uint8_t u8;
int64_t fixedaddress;
int unscaled;
int v1, v2;
Expand All @@ -51,11 +52,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
INST_NAME("FADD ST0, STx");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
if(ST_IS_F(0)) {
FADDS(v1, v1, v2);
} else {
FADDD(v1, v1, v2);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 0xC8:
case 0xC9:
Expand All @@ -68,11 +73,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
INST_NAME("FMUL ST0, STx");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
if(ST_IS_F(0)) {
FMULS(v1, v1, v2);
} else {
FMULD(v1, v1, v2);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 0xD0:
case 0xD1:
Expand Down Expand Up @@ -122,11 +131,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
INST_NAME("FSUB ST0, STx");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
if(ST_IS_F(0)) {
FSUBS(v1, v1, v2);
} else {
FSUBD(v1, v1, v2);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 0xE8:
case 0xE9:
Expand All @@ -139,11 +152,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
INST_NAME("FSUBR ST0, STx");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
if(ST_IS_F(0)) {
FSUBS(v1, v2, v1);
} else {
FSUBD(v1, v2, v1);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 0xF0:
case 0xF1:
Expand All @@ -156,11 +173,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
INST_NAME("FDIV ST0, STx");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
if(ST_IS_F(0)) {
FDIVS(v1, v1, v2);
} else {
FDIVD(v1, v1, v2);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 0xF8:
case 0xF9:
Expand All @@ -173,11 +194,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
INST_NAME("FDIVR ST0, STx");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
if(ST_IS_F(0)) {
FDIVS(v1, v2, v1);
} else {
FDIVD(v1, v2, v1);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
default:
DEFAULT;
Expand All @@ -190,25 +215,33 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
s0 = fpu_get_scratch(dyn, ninst);
addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
VLD32(s0, ed, fixedaddress);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x5, x4);
if(ST_IS_F(0)) {
FADDS(v1, v1, s0);
} else {
FCVT_D_S(s0, s0);
FADDD(v1, v1, s0);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 1:
INST_NAME("FMUL ST0, float[ED]");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
s0 = fpu_get_scratch(dyn, ninst);
addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
VLD32(s0, ed, fixedaddress);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x5, x4);
if(ST_IS_F(0)) {
FMULS(v1, v1, s0);
} else {
FCVT_D_S(s0, s0);
FMULD(v1, v1, s0);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 2:
INST_NAME("FCOM ST0, float[ED]");
Expand Down Expand Up @@ -245,51 +278,67 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
s0 = fpu_get_scratch(dyn, ninst);
addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
VLD32(s0, ed, fixedaddress);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x5, x4);
if(ST_IS_F(0)) {
FSUBS(v1, v1, s0);
} else {
FCVT_D_S(s0, s0);
FSUBD(v1, v1, s0);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 5:
INST_NAME("FSUBR ST0, float[ED]");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
s0 = fpu_get_scratch(dyn, ninst);
addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
VLD32(s0, ed, fixedaddress);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x5, x4);
if(ST_IS_F(0)) {
FSUBS(v1, s0, v1);
} else {
FCVT_D_S(s0, s0);
FSUBD(v1, s0, v1);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 6:
INST_NAME("FDIV ST0, float[ED]");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
s0 = fpu_get_scratch(dyn, ninst);
addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
VLD32(s0, ed, fixedaddress);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x5, x4);
if(ST_IS_F(0)) {
FDIVS(v1, v1, s0);
} else {
FCVT_D_S(s0, s0);
FDIVD(v1, v1, s0);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 7:
INST_NAME("FDIVR ST0, float[ED]");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
s0 = fpu_get_scratch(dyn, ninst);
addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
VLD32(s0, ed, fixedaddress);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x5, x4);
if(ST_IS_F(0)) {
FDIVS(v1, s0, v1);
} else {
FCVT_D_S(s0, s0);
FDIVD(v1, s0, v1);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
default:
DEFAULT;
Expand Down
48 changes: 42 additions & 6 deletions src/dynarec/arm64/dynarec_arm64_d9.c
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,12 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
MESSAGE(LOG_DUMP, "Need Optimization\n");
i1 = x87_stackcount(dyn, ninst, x1);
x87_forget(dyn, ninst, x1, x2, 0);
CALL(native_ftan, -1);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
CALL_(native_ftan, -1, box64_dynarec_fastround ? 0 : u8);
x87_unstackcount(dyn, ninst, x1, i1);
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
if(PK(0)==0xdd && PK(1)==0xd8) {
MESSAGE(LOG_DUMP, "Optimized next DD D8 fstp st0, st0, not emitting 1\n");
u8 = F8;
Expand All @@ -312,7 +316,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
i1 = x87_stackcount(dyn, ninst, x1);
x87_forget(dyn, ninst, x1, x2, 0);
x87_forget(dyn, ninst, x1, x2, 1);
CALL(native_fpatan, -1);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
CALL_(native_fpatan, -1, box64_dynarec_fastround ? 0 : u8);
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
x87_unstackcount(dyn, ninst, x1, i1);
X87_POP_OR_FAIL(dyn, ninst, x3);
break;
Expand Down Expand Up @@ -418,19 +426,27 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
case 0xFA:
INST_NAME("FSQRT");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
if(ST_IS_F(0)) {
FSQRTS(v1, v1);
} else {
FSQRTD(v1, v1);
}
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
break;
case 0xFB:
INST_NAME("FSINCOS");
MESSAGE(LOG_DUMP, "Need Optimization\n");
X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, 0);
i1 = x87_stackcount(dyn, ninst, x1);
x87_forget(dyn, ninst, x1, x2, 1);
CALL(native_fsincos, -1);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
CALL_(native_fsincos, -1, box64_dynarec_fastround ? 0 : u8);
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
x87_unstackcount(dyn, ninst, x1, i1);
break;
case 0xFC:
Expand All @@ -457,23 +473,35 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
i1 = x87_stackcount(dyn, ninst, x1);
x87_forget(dyn, ninst, x1, x2, 0);
x87_forget(dyn, ninst, x1, x2, 1);
CALL(native_fscale, -1);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
CALL_(native_fscale, -1, box64_dynarec_fastround ? 0 : u8);
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
x87_unstackcount(dyn, ninst, x1, i1);
break;
case 0xFE:
INST_NAME("FSIN");
MESSAGE(LOG_DUMP, "Need Optimization\n");
i1 = x87_stackcount(dyn, ninst, x1);
x87_forget(dyn, ninst, x1, x2, 0);
CALL(native_fsin, -1);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
CALL_(native_fsin, -1, box64_dynarec_fastround ? 0 : u8);
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
x87_unstackcount(dyn, ninst, x1, i1);
break;
case 0xFF:
INST_NAME("FCOS");
MESSAGE(LOG_DUMP, "Need Optimization\n");
i1 = x87_stackcount(dyn, ninst, x1);
x87_forget(dyn, ninst, x1, x2, 0);
CALL(native_fcos, -1);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
CALL_(native_fcos, -1, box64_dynarec_fastround ? 0 : u8);
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
x87_unstackcount(dyn, ninst, x1, i1);
break;
default:
Expand All @@ -497,7 +525,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
s0 = v1;
else {
s0 = fpu_get_scratch(dyn, ninst);
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x2, x4);
FCVT_S_D(s0, v1);
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
}
addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
VST32(s0, ed, fixedaddress);
Expand All @@ -507,7 +539,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_F);
addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
if(!ST_IS_F(0)) {
if(!box64_dynarec_fastround)
u8 = x87_setround(dyn, ninst, x1, x5, x4);
FCVT_S_D(v1, v1);
if(!box64_dynarec_fastround)
x87_restoreround(dyn, ninst, u8);
}
VST32(v1, ed, fixedaddress);
X87_POP_OR_FAIL(dyn, ninst, x3);
Expand Down
Loading

0 comments on commit 653a67c

Please sign in to comment.