-
Notifications
You must be signed in to change notification settings - Fork 1
/
prefix-func.h
113 lines (97 loc) · 3.94 KB
/
prefix-func.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// Implementation/Emulation of prefix operand mangling
// Described in a way that potentially resembles the hardware.
// Likely there's a 4:1 mux to choose operand (swizzle) followed by
// a gate to clear the MSB (for abs() operation).
// In parallel these three selection bits are used to select a constant
// and finally these two partial results are selected by a 2:1 mux.
// Final operation is a simple xor of the MSB to negate operand.
// Evidence of this is the variety of patterns that can be obtained:
// -|x| being possible means abs() goes before negation
// -/+ constant means that constants go before negation
// absolute bit being reused for constant offset means that constant
// and absolute logic likely work in parallel (same for swizzle).
// This is nice and useful, since things like -|x| can be calculated,
// as well as doubling the constant pool size (positive and negative).
// Some instructions are (ab)using the prefix logic to perform operations:
//
// - vscl: Seems to be a vmul + swizzle on rt input.
// - vabs: Probably hardcodes abs/neg bits (allows swizzle)
// - vneg: Like vabs but a different subset of bits
// - vn{sin,rcp,...} Likely abusing the neg bit.
// (since sin(-x) = -sin(x), vrexp2(x) = vexp2(-x), etc.
// likely the reason why no vncos, vrlog2, ...)
//
// Thus these operations only have a limited set of prefixes available.
/*
 * Constant pool selectable through the source prefix.
 *
 * Entries are raw 32-bit patterns of single-precision floats (the value each
 * one encodes is noted next to it). pfx_src() indexes this table with a 3-bit
 * selector built from a lane's two swizzle bits (low) and its abs bit (high).
 */
static const uint32_t prefix_cnt[8] = {
  /* abs bit = 0 */
  0x00000000u,  /* [0] 0.0   */
  0x3F800000u,  /* [1] 1.0   */
  0x40000000u,  /* [2] 2.0   */
  0x3F000000u,  /* [3] 0.5   */
  /* abs bit = 1 */
  0x40400000u,  /* [4] 3.0   */
  0x3EAAAAABu,  /* [5] 1/3   */
  0x3E800000u,  /* [6] 0.25  */
  0x3E2AAAABu,  /* [7] 1/6   */
};
/*
 * Apply a source-operand prefix to a vector of raw 32-bit float patterns.
 *
 * output: receives one word per lane (lanen words are written).
 * input:  operand words; indexed by the 2-bit swizzle selector (0..3),
 *         independently of lanen.
 * pfx:    prefix word. For lane i:
 *           bits [2i+1:2i] swizzle index (or low bits of the constant index)
 *           bit  8+i       abs          (or high bit of the constant index)
 *           bit  12+i      1 = use constant, 0 = use swizzled input
 *           bit  16+i      negate
 *         (this layout only stays non-overlapping for lanen <= 4)
 * lanen:  number of lanes to produce.
 *
 * NOTE: lanes are produced in order and each one may read any input[0..3],
 * so calling this with output aliasing input lets later lanes observe
 * already-rewritten words.
 */
static inline void pfx_src(uint32_t *output, uint32_t *input, uint32_t pfx, unsigned lanen) {
  unsigned lane = 0;

  while (lane < lanen) {
    // Per-lane control bits, pulled out of their respective fields.
    const unsigned sel      = (pfx >> (2 * lane)) & 3u;     // bits 7..0, two per lane
    const unsigned want_abs = ((pfx >> 8) >> lane) & 1u;    // bits 11..8
    const unsigned want_cst = ((pfx >> 12) >> lane) & 1u;   // bits 15..12
    const unsigned want_neg = ((pfx >> 16) >> lane) & 1u;   // bits 19..16

    uint32_t operand;
    if (!want_cst) {
      // Regular path: swizzle selects the input word, abs clears its sign bit.
      operand = input[sel];
      if (want_abs) {
        operand &= 0x7FFFFFFFu;
      }
    } else {
      // Constant path: swizzle and abs bits are reused as a 3-bit table index.
      operand = prefix_cnt[(want_abs << 2) | sel];
    }

    // Negation comes last, so it also applies to constants and to |x|.
    if (want_neg) {
      operand ^= 0x80000000u;
    }

    output[lane] = operand;
    lane++;
  }
}
// Destination prefix consists of 3 bits per lane.
// One bit determines whether the register will be written at all (mask).
// The other two are used for saturation operations: one bit enables saturation
// while the other chooses the range (0:1 vs -1:1).
// Some combinations are legal but do not make sense:
// - mask + saturation: don't care about the result since it's not written
// - saturation selection bit + no-saturation
//
// This suggests that saturation is performed before the masking (likely masking
// just disables register updates) and that the range is selected by one bit while
// the other selects whether to apply the operation or not.
/*
 * Apply a destination prefix (write mask + saturation) to a result vector.
 *
 * output: destination words; a lane is only written when it is not masked.
 * input:  result words as raw 32-bit float patterns; masked lanes are not read.
 * pfx:    prefix word. For lane i:
 *           bit 2i     saturation enable
 *           bit 2i+1   saturation range (0 -> [0, 1], 1 -> [-1, 1])
 *           bit 8+i    write mask (1 = leave output[i] untouched)
 * lanen:  number of lanes to process.
 *
 * Values for which neither comparison holds (in-range values, NaN) are
 * forwarded with their bit pattern unchanged.
 */
static inline void pfx_dst(uint32_t *output, uint32_t *input, uint32_t pfx, unsigned lanen) {
  for (unsigned lane = 0; lane < lanen; ++lane) {
    // Masked lane: nothing is read and nothing is written.
    if (((pfx >> 8) >> lane) & 1u) {
      continue;
    }

    // Two saturation control bits for this lane (enable, range).
    const unsigned satctl = (pfx >> (lane * 2)) & 3u;

    union {
      float f32;
      uint32_t u32;
    } res;
    res.u32 = input[lane];

    if (satctl & 1u) {
      const float lower = (satctl & 2u) ? -1.0f : 0.0f;
      if (res.f32 > 1.0f) {
        res.f32 = 1.0f;
      } else if (res.f32 <= lower) {
        // "<=" (not "<") so that -0.0 is rewritten as +0.0 in the 0:1 range.
        res.f32 = lower;
      }
    }

    output[lane] = res.u32;
  }
}