Skip to content

Commit

Permalink
n64: refactor devirtualize and memory ops, force JIT only with instru…
Browse files Browse the repository at this point in the history
…ction cache (#1635)

This change contains a large refactoring with no functional changes,
plus a small functional change that will help speed up the JIT.

The refactoring allows CPU::devirtualize to return all the information
regarding the memory access. This in turn allows all memory ops (which
contained lots of duplicated code) to be simplified to use devirtualize,
reducing code duplication.

The functional change is that we now use the JIT only when running code from
icache (which is 99.9999% of the time). Running code without icache is
extremely slow on real hardware and only happens in specific situations
(e.g. during boot when RDRAM is not initialized). By limiting the JIT to
run from icache, we open the door to implementing proper icache support in
the JIT and to finish removing the instruction epilogue with its slow
"instruction cache stepper" (which, in addition to being slow, is also
inaccurate and makes us fail cache test ROMs).
  • Loading branch information
rasky authored Sep 8, 2024
1 parent 18f45dd commit f43da41
Show file tree
Hide file tree
Showing 8 changed files with 209 additions and 263 deletions.
14 changes: 8 additions & 6 deletions ares/n64/cpu/cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,15 @@ auto CPU::instruction() -> void {
return;
}

if(Accuracy::CPU::Recompiler && recompiler.enabled) {
if (auto address = devirtualize(ipu.pc)) {
auto block = recompiler.block(ipu.pc, *address, GDB::server.hasBreakpoints());
block->execute(*this);
}
auto access = devirtualize<Read, Word>(ipu.pc);
if(!access) return;

if(Accuracy::CPU::Recompiler && recompiler.enabled && access.cache) {
if(vaddrAlignedError<Word>(access.vaddr, false)) return;
auto block = recompiler.block(ipu.pc, access.paddr, GDB::server.hasBreakpoints());
block->execute(*this);
} else {
auto data = fetch(ipu.pc);
auto data = fetch(access);
if (!data) return;
pipeline.begin();
instructionPrologue(ipu.pc, *data);
Expand Down
88 changes: 48 additions & 40 deletions ares/n64/cpu/cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,17 @@ struct CPU : Thread {
}
} pipeline{*this};

//describes the outcome of resolving a virtual address into a physical access.
//returned by devirtualize() and by the TLB load/store lookups, and accepted by
//the PhysAccess overloads of fetch/read/write.
//the members carry no default initializers, so every field must be populated
//by whoever constructs the value.
struct PhysAccess {
//direction of an access; the enumerators have u32 as underlying type
enum Direction : u32 { Read, Write };

//lets callers test the result directly (e.g. `if(!access) return;`);
//explicit, so it never converts silently to an integer type
explicit operator bool() const { return found; }

bool found; //this is a valid physical access
bool cache; //access must go through cache
u32 paddr; //physical address on 32-bit MIPS bus
u64 vaddr; //virtual address used on the CPU (64-bit)
};

//context.cpp
struct Context {
CPU& self;
Expand Down Expand Up @@ -101,29 +112,27 @@ struct CPU : Thread {
struct InstructionCache {
CPU& self;
struct Line;
auto line(u32 vaddr) -> Line& { return lines[vaddr >> 5 & 0x1ff]; }
auto line(u64 vaddr) -> Line& { return lines[vaddr >> 5 & 0x1ff]; }

//used by the recompiler to simulate instruction cache fetch timing
auto step(u32 vaddr, u32 address) -> void {
auto step(u64 vaddr, u32 paddr) -> void {
auto& line = this->line(vaddr);
if(!line.hit(address)) {
if(!line.hit(paddr)) {
self.step(48 * 2);
line.valid = 1;
line.tag = address & ~0x0000'0fff;
line.tag = paddr & ~0x0000'0fff;
} else {
self.step(1 * 2);
}
}

//used by the interpreter to fully emulate the instruction cache
auto fetch(u32 vaddr, u32 address, CPU& cpu) -> u32 {
auto fetch(u64 vaddr, u32 paddr, CPU& cpu) -> u32 {
auto& line = this->line(vaddr);
if(!line.hit(address)) {
line.fill(address, cpu);
} else {
cpu.step(1 * 2);
if(!line.hit(paddr)) {
line.fill(paddr, cpu);
}
return line.read(address);
return line.read(paddr);
}

auto power(bool reset) -> void {
Expand All @@ -138,11 +147,11 @@ struct CPU : Thread {

//16KB
struct Line {
auto hit(u32 address) const -> bool { return valid && tag == (address & ~0x0000'0fff); }
auto fill(u32 address, CPU& cpu) -> void {
auto hit(u32 paddr) const -> bool { return valid && tag == (paddr & ~0x0000'0fff); }
auto fill(u32 paddr, CPU& cpu) -> void {
cpu.step(48 * 2);
valid = 1;
tag = address & ~0x0000'0fff;
tag = paddr & ~0x0000'0fff;
cpu.busReadBurst<ICache>(tag | index, words);
}

Expand All @@ -151,7 +160,7 @@ struct CPU : Thread {
cpu.busWriteBurst<ICache>(tag | index, words);
}

auto read(u32 address) const -> u32 { return words[address >> 2 & 7]; }
auto read(u32 paddr) const -> u32 { return words[paddr >> 2 & 7]; }

bool valid;
u32 tag;
Expand All @@ -163,21 +172,21 @@ struct CPU : Thread {
//dcache.cpp
struct DataCache {
struct Line;
auto line(u32 vaddr) -> Line&;
template<u32 Size> auto read(u32 vaddr, u32 address) -> u64;
template<u32 Size> auto write(u32 vaddr, u32 address, u64 data) -> void;
auto line(u64 vaddr) -> Line&;
template<u32 Size> auto read(u64 vaddr, u32 paddr) -> u64;
template<u32 Size> auto write(u64 vaddr, u32 paddr, u64 data) -> void;
auto power(bool reset) -> void;

auto readDebug(u32 vaddr, u32 address) -> u8;
auto writeDebug(u32 vaddr, u32 address, u8 value) -> void;
auto readDebug(u64 vaddr, u32 paddr) -> u8;
auto writeDebug(u64 vaddr, u32 paddr, u8 value) -> void;

//8KB
struct Line {
auto hit(u32 address) const -> bool;
auto fill(u32 address) -> void;
auto hit(u32 paddr) const -> bool;
auto fill(u32 paddr) -> void;
auto writeBack() -> void;
template<u32 Size> auto read(u32 address) const -> u64;
template<u32 Size> auto write(u32 address, u64 data) -> void;
template<u32 Size> auto read(u32 paddr) const -> u64;
template<u32 Size> auto write(u32 paddr, u64 data) -> void;

bool valid;
u16 dirty;
Expand All @@ -199,14 +208,6 @@ struct CPU : Thread {
TLB(CPU& self) : self(self) {}
static constexpr u32 Entries = 32;

struct Match {
explicit operator bool() const { return found; }

bool found;
bool cache;
u32 address;
};

struct Entry {
//scc-tlb.cpp
auto synchronize() -> void;
Expand All @@ -228,12 +229,12 @@ struct CPU : Thread {
} entry[TLB::Entries];

//tlb.cpp
auto load(u64 vaddr, bool noExceptions = false) -> Match;
auto load(u64 vaddr, const Entry& entry, bool noExceptions = false) -> maybe<Match>;
auto load(u64 vaddr, bool noExceptions = false) -> PhysAccess;
auto load(u64 vaddr, const Entry& entry, bool noExceptions = false) -> maybe<PhysAccess>;

auto loadFast(u64 vaddr) -> Match;
auto store(u64 vaddr) -> Match;
auto store(u64 vaddr, const Entry& entry) -> maybe<Match>;
auto loadFast(u64 vaddr) -> PhysAccess;
auto store(u64 vaddr, bool noExceptions = false) -> PhysAccess;
auto store(u64 vaddr, const Entry& entry, bool noExceptions = false) -> maybe<PhysAccess>;

struct TlbCache { ;
static constexpr int entries = 4;
Expand Down Expand Up @@ -277,21 +278,28 @@ struct CPU : Thread {
auto userSegment64(u64 vaddr) const -> Context::Segment;

auto segment(u64 vaddr) -> Context::Segment;
auto devirtualize(u64 vaddr) -> maybe<u64>;
template<u32 Dir, u32 Size> auto devirtualize(u64 vaddr, bool raiseAlignedError = true, bool raiseExceptions = true) -> PhysAccess;
alwaysinline auto devirtualizeFast(u64 vaddr) -> u64;
auto devirtualizeDebug(u64 vaddr) -> u64;

auto fetch(u64 vaddr) -> maybe<u32>;
auto fetch(PhysAccess access) -> maybe<u32>;
template<u32 Size> auto busWrite(u32 address, u64 data) -> void;
template<u32 Size> auto busRead(u32 address) -> u64;
template<u32 Size> auto busWriteBurst(u32 address, u32 *data) -> void;
template<u32 Size> auto busReadBurst(u32 address, u32 *data) -> void;
template<u32 Size> auto read(u64 vaddr) -> maybe<u64>;
template<u32 Size> auto write(u64 vaddr, u64 data, bool alignedError=true) -> bool;
template<u32 Size> auto read(PhysAccess access) -> maybe<u64>;
template<u32 Size> auto write(PhysAccess access, u64 data) -> bool;
//convenience overload: resolves vaddr as a Size-wide read access (with
//devirtualize's default arguments) and hands the result to the PhysAccess
//overload of read, which produces the returned value.
template<u32 Size> auto read(u64 vaddr) -> maybe<u64> {
  auto access = devirtualize<Read, Size>(vaddr);
  return read<Size>(access);
}
//convenience overload: resolves vaddr as a Size-wide write access and hands
//the result, together with data, to the PhysAccess overload of write.
//alignedError is forwarded as devirtualize's raiseAlignedError argument.
template<u32 Size> auto write(u64 vaddr, u64 data, bool alignedError = true) -> bool {
  auto access = devirtualize<Write, Size>(vaddr, alignedError);
  return write<Size>(access, data);
}
template<u32 Size> auto vaddrAlignedError(u64 vaddr, bool write) -> bool;
auto addressException(u64 vaddr) -> void;

auto readDebug(u64 vaddr) -> u8;
template <u32 Size> auto writeDebug(u64 vaddr, u64 data) -> bool;

//serialization.cpp
auto serialize(serializer&) -> void;
Expand Down
70 changes: 35 additions & 35 deletions ares/n64/cpu/dcache.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
auto CPU::DataCache::Line::hit(u32 address) const -> bool {
return valid && tag == (address & ~0x0000'0fff);
auto CPU::DataCache::Line::hit(u32 paddr) const -> bool {
return valid && tag == (paddr & ~0x0000'0fff);
}

auto CPU::DataCache::Line::fill(u32 address) -> void {
auto CPU::DataCache::Line::fill(u32 paddr) -> void {
cpu.step(40 * 2);
valid = 1;
dirty = 0;
tag = address & ~0x0000'0fff;
tag = paddr & ~0x0000'0fff;
fillPc = cpu.ipu.pc;
cpu.busReadBurst<DCache>(tag | index, words);
}
Expand All @@ -17,75 +17,75 @@ auto CPU::DataCache::Line::writeBack() -> void {
cpu.busWriteBurst<DCache>(tag | index, words);
}

auto CPU::DataCache::line(u32 vaddr) -> Line& {
auto CPU::DataCache::line(u64 vaddr) -> Line& {
return lines[vaddr >> 4 & 0x1ff];
}

template<u32 Size>
auto CPU::DataCache::Line::read(u32 address) const -> u64 {
if constexpr(Size == Byte) { return bytes[address >> 0 & 15 ^ 3]; }
if constexpr(Size == Half) { return halfs[address >> 1 & 7 ^ 1]; }
if constexpr(Size == Word) { return words[address >> 2 & 3 ^ 0]; }
auto CPU::DataCache::Line::read(u32 paddr) const -> u64 {
if constexpr(Size == Byte) { return bytes[paddr >> 0 & 15 ^ 3]; }
if constexpr(Size == Half) { return halfs[paddr >> 1 & 7 ^ 1]; }
if constexpr(Size == Word) { return words[paddr >> 2 & 3 ^ 0]; }
if constexpr(Size == Dual) {
u64 upper = words[address >> 2 & 2 | 0];
u64 lower = words[address >> 2 & 2 | 1];
u64 upper = words[paddr >> 2 & 2 | 0];
u64 lower = words[paddr >> 2 & 2 | 1];
return upper << 32 | lower << 0;
}
}

template<u32 Size>
auto CPU::DataCache::Line::write(u32 address, u64 data) -> void {
if constexpr(Size == Byte) { bytes[address >> 0 & 15 ^ 3] = data; }
if constexpr(Size == Half) { halfs[address >> 1 & 7 ^ 1] = data; }
if constexpr(Size == Word) { words[address >> 2 & 3 ^ 0] = data; }
auto CPU::DataCache::Line::write(u32 paddr, u64 data) -> void {
if constexpr(Size == Byte) { bytes[paddr >> 0 & 15 ^ 3] = data; }
if constexpr(Size == Half) { halfs[paddr >> 1 & 7 ^ 1] = data; }
if constexpr(Size == Word) { words[paddr >> 2 & 3 ^ 0] = data; }
if constexpr(Size == Dual) {
words[address >> 2 & 2 | 0] = data >> 32;
words[address >> 2 & 2 | 1] = data >> 0;
words[paddr >> 2 & 2 | 0] = data >> 32;
words[paddr >> 2 & 2 | 1] = data >> 0;
}
dirty |= ((1 << Size) - 1) << (address & 0xF);
dirty |= ((1 << Size) - 1) << (paddr & 0xF);
dirtyPc = cpu.ipu.pc;
}

template<u32 Size>
auto CPU::DataCache::read(u32 vaddr, u32 address) -> u64 {
auto CPU::DataCache::read(u64 vaddr, u32 paddr) -> u64 {
auto& line = this->line(vaddr);
if(!line.hit(address)) {
if(!line.hit(paddr)) {
if(line.valid && line.dirty) line.writeBack();
line.fill(address);
line.fill(paddr);
} else {
cpu.step(1 * 2);
}
return line.read<Size>(address);
return line.read<Size>(paddr);
}

auto CPU::DataCache::readDebug(u32 vaddr, u32 address) -> u8 {
auto CPU::DataCache::readDebug(u64 vaddr, u32 paddr) -> u8 {
auto& line = this->line(vaddr);
if(!line.hit(address)) {
if(!line.hit(paddr)) {
Thread dummyThread{};
return bus.read<Byte>(address, dummyThread, "Ares Debugger");
return bus.read<Byte>(paddr, dummyThread, "Ares Debugger");
}
return line.read<Byte>(address);
return line.read<Byte>(paddr);
}

template<u32 Size>
auto CPU::DataCache::write(u32 vaddr, u32 address, u64 data) -> void {
auto CPU::DataCache::write(u64 vaddr, u32 paddr, u64 data) -> void {
auto& line = this->line(vaddr);
if(!line.hit(address)) {
if(!line.hit(paddr)) {
if(line.valid && line.dirty) line.writeBack();
line.fill(address);
line.fill(paddr);
} else {
cpu.step(1 * 2);
}
line.write<Size>(address, data);
line.write<Size>(paddr, data);
}

auto CPU::DataCache::writeDebug(u32 vaddr, u32 address, u8 data) -> void {
auto CPU::DataCache::writeDebug(u64 vaddr, u32 paddr, u8 data) -> void {
auto& line = this->line(vaddr);
if(!line.hit(address)) {
if(!line.hit(paddr)) {
Thread dummyThread{};
return bus.write<Byte>(address, data, dummyThread, "Ares Debugger");
return bus.write<Byte>(paddr, data, dummyThread, "Ares Debugger");
}
line.write<Byte>(address, data);
line.write<Byte>(paddr, data);
}

auto CPU::DataCache::power(bool reset) -> void {
Expand All @@ -100,4 +100,4 @@ auto CPU::DataCache::power(bool reset) -> void {
}

template
auto CPU::DataCache::Line::write<Byte>(u32 address, u64 data) -> void;
auto CPU::DataCache::Line::write<Byte>(u32 paddr, u64 data) -> void;
Loading

0 comments on commit f43da41

Please sign in to comment.