-
Notifications
You must be signed in to change notification settings - Fork 166
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
X64: Use Struct for ELF Ehdr and Phdr headers #1694
Conversation
In this PR, I am attempting to decouple the assembly instructions and elf headers. @certik please, could you review and share if we are heading in desired direction? |
Also, using the approach in this PR, how shall we support generating assembly text format for the Elf headers? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this looks good. Don't forget to remove the tmp
binary.
src/libasr/codegen/x86_assembler.cpp
Outdated
Elf64_Phdr get_program_header(X86Assembler &a) { | ||
Elf64_Phdr p; | ||
p.type = 1; | ||
p.flags = 4; | ||
p.offset = sizeof(Elf64_Phdr); | ||
p.vaddr = a.origin(); | ||
p.paddr = a.origin(); | ||
p.filesz = sizeof(Elf64_Phdr); | ||
p.memsz = sizeof(Elf64_Phdr); | ||
p.align = 0x1000; | ||
return p; | ||
} | ||
|
||
Elf64_Phdr get_text_segment(X86Assembler &a, Elf64_Phdr &p_program) { | ||
Elf64_Phdr p; | ||
p.type = 1; | ||
p.flags = 5; | ||
p.offset = p_program.offset + sizeof(p); | ||
p.vaddr = a.origin() + p.offset; | ||
p.paddr = a.origin() + p.offset; | ||
p.filesz = sizeof(Elf64_Phdr); | ||
p.memsz = sizeof(Elf64_Phdr); | ||
p.align = 0x1000; | ||
return p; | ||
} | ||
|
||
Elf64_Phdr get_data_segment(X86Assembler &a, Elf64_Phdr &p_text_seg) { | ||
Elf64_Phdr p; | ||
p.type = 1; | ||
p.flags = 6; | ||
p.offset = p_text_seg.offset + sizeof(p_text_seg); | ||
p.vaddr = a.origin() + p.offset; | ||
p.paddr = a.origin() + p.offset; | ||
p.filesz = sizeof(Elf64_Phdr); | ||
p.memsz = sizeof(Elf64_Phdr); | ||
p.align = 0x1000; | ||
return p; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These functions only use a.origin()
, so I would just pass origin
as an integer parameter, not the whole x86Assembler
.
src/libasr/codegen/x86_assembler.cpp
Outdated
e.type = 2; | ||
e.machine = 0x3e; | ||
e.version = 1; | ||
e.entry = a.get_defined_symbol("_start").value; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would pass a.get_defined_symbol("_start").value
as an argument, not a
.
out.write((const char*) m_code.p, m_code.size()); | ||
|
||
out.close(); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would keep this function outside of X86Assembler. Eventually once we add Arm64Assembler
, we will figure out how to generalize this function to also work for ELF ARM binaries.
Also I would rename it to something like create_elf64_x86_binary
and I would pass m_code
as an argument.
I would make the function return a string.
The caller will then save the string to a file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would make the function return a string.
I currently used a Vec<uint8_t>
. I would like to know if it is fine or if I shall update it.
43021ba
to
88aa79b
Compare
Example: (lp) lpython$ cat integration_tests/test_complex_01.py
from lpython import i32, i64, f32, f64, c32, c64
def test_real_imag():
x: c64
x = c64(2) + 3j
a: f64
b: f64
eps: f64
eps = 1e-12
a = x.real
b = x.imag
assert abs(a - 2.0) <= eps
assert abs(b - 3.0) <= eps
print(x)
def test_complex():
x: c64
x = complex(4.5, 6.7)
eps: f64
eps = 1e-12
assert abs(x.real - 4.5) <= eps
assert abs(x.imag - 6.7) <= eps
x = complex(-4, 2)
assert abs(x.real - (-4.0)) <= eps
assert abs(x.imag - 2.0) <= eps
x = complex(4, 7.89)
assert abs(x.real - 4.0) <= eps
assert abs(x.imag - 7.89) <= eps
x = complex(5.6, 0)
assert abs(x.real - 5.6) <= eps
assert abs(x.imag - 0.0) <= eps
a: f64
a = 534.6
x = complex(a, -a) # (f64, f64)
assert abs(x.real - 534.60000000000002274) <= eps
assert abs(x.imag - (-534.60000000000002274)) <= eps
a2: f32
a2 = -f32(423.5430806348152437)
a3: f32
a3 = f32(34.5)
x2: c32
x2 = c32(complex(a2, a3)) # (f32, f32)
assert f64(abs(x2.imag - f32(34.5))) <= eps
i1: i32
i1 = -5
i2: i64
i2 = -i64(6)
x = complex(a3, a) # (f32, f64)
x = complex(a, a3) # (f64, f32)
x = complex(i1, i2) # (i32, i64)
x = complex(i1, -i1) # (i32, i32)
x = complex(-i2, -i2) # (i64, i64)
x = complex(i2, -i1) # (i64, i32)
print(x)
def test_complex_unary_minus():
c: c32
c = c32(complex(3, 4.5))
_c: c32
_c = -c
assert abs(f64(_c.real) - (-3.0)) <= 1e-12
assert abs(f64(_c.imag) - (-4.5)) <= 1e-12
_c = c32(complex(5, -78))
_c = -_c
assert abs(f64(_c.real) - (-5.0)) <= 1e-12
assert abs(f64(_c.imag) - 78.0) <= 1e-12
c2: c64
c2 = complex(-4.5, -7.8)
c2 = -c2
assert abs(c2.real - 4.5) <= 1e-12
assert abs(c2.imag - 7.8) <= 1e-12
c2 = c64(3) + 4j
c2 = -c2
assert abs(c2.real - (-3.0)) <= 1e-12
assert abs(c2.imag - (-4.0)) <= 1e-12
print(c, _c, c2)
def test_complex_not():
c: c32
c = c32(complex(4, 5))
b: bool
b = not c
assert not b
c2: c64
c2 = complex(0, 0)
b = not c2
assert b
print(c,c2, b)
def check():
test_real_imag()
test_complex()
test_complex_unary_minus()
test_complex_not()
check()
(lp) lpython$ lpython integration_tests/test_complex_01.py --backend wasm -o tmp.out
(lp) lpython$ wasmtime tmp.out
(2.000000000,3.000000000)
(-6.000000000,5.000000000)
(3.000000000,4.50000000) (-5.000000000,78.000000000) (-3.000000000,-4.000000000)
(4.000000000,5.000000000) (0.000000000,0.000000000) 1
(lp) lpython$ lpython integration_tests/test_complex_01.py --backend wasm_x64 -o tmp2.out > tmp2.asm
(lp) lpython$ ./tmp2.out
(2.000000000,3.000000000)
(-6.000000000,5.000000000)
(3.000000000,4.50000000) (-5.000000000,78.000000000) (-3.000000000,-4.000000000)
(4.000000000,5.000000000) (0.000000000,0.000000000) 1
(wasm_asm) lpython$ nasm -fbin tmp2.asm && chmod +x tmp2
(wasm_asm) lpython$ ./tmp2
(2.000000000,3.000000000)
(-6.000000000,5.000000000)
(3.000000000,4.50000000) (-5.000000000,78.000000000) (-3.000000000,-4.000000000)
(4.000000000,5.000000000) (0.000000000,0.000000000) 1 |
88aa79b
to
e455564
Compare
This is ready. Please review and share feedback. |
|
||
for (auto b:a.get_machine_code()) { | ||
bin.push_back(al, b); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this copying the (potentially big) binary code byte by byte?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it is copying the binary code byte by byte. Another approach that we can do is to just return the ELF and Phdr headers in create_elf64_x86_binary()
(we can rename it as create_elf64_x86_header()
).
For example:
void X86Assembler::save_binary64(const std::string &filename) {
Vec<uint8_t> header = create_elf64_x86_header(m_al, *this);
{
std::ofstream out;
out.open(filename);
out.write((const char*) header.p, header.size());
out.write((const char*) m_code.p, m_code.size());
out.close();
}
#ifdef LFORTRAN_LINUX
std::string mode = "0755";
int mod = strtol(mode.c_str(), 0, 8);
if (chmod(filename.c_str(),mod) < 0) {
throw AssemblerError("chmod failed");
}
#endif
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I currently updated it to return the header
only.
I think it looks ok, but I am a bit worried about performance. Can you benchmark this compared to master? |
Please find the benchmark with respect to main. Benchmark used #1222 (comment) ( generate benchmark: python examples/expr2.py > bench.py main branch: (lp) lpython$ time lpython --backend=wasm_x64 bench.py -o bench_wasm_x64.x
real 0m12.138s
user 0m12.079s
sys 0m0.060s This PR: (lp) lpython$ time lpython --backend=wasm_x64 bench.py -o bench_wasm_x64.x
real 0m12.963s
user 0m12.771s
sys 0m0.084s Relative: (real time considered here)
|
89ea081
to
0630d23
Compare
The timing seems similar now (The timing on Main: (lp) lpython$ time lpython --backend=wasm_x64 bench.py -o bench_wasm_x64.x
real 0m11.905s
user 0m11.842s
sys 0m0.061s This PR: (lp) lpython$ time lpython --backend=wasm_x64 bench.py -o bench_wasm_x64.x
real 0m11.877s
user 0m11.802s
sys 0m0.065s |
This is ready. |
Did you benchmark LPython compiled in Release mode? |
Sorry, it is in debug mode. Do we need release mode? From #1222 (comment), it seems in release mode, the timing difference could be lesser. Thus, I thought that in debug mode, we could have enlarged timing difference which could act as a better or more stricter value. |
We should only do benchmarks in Release mode, as in Debug mode there are all kinds of asserts and checks that are not executed in Release mode, so they are not relevant. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this looks good now. Thanks for this refactoring!
Shall we merge before release benchmarking or after release benchmarking? |
Co-authored-by: Ondřej Čertík <[email protected]>
On this #!/usr/bin/env bash
set -e
set -x
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_LLVM=yes \
-DLPYTHON_BUILD_ALL=yes \
-DWITH_STACKTRACE=no \
-DWITH_RUNTIME_STACKTRACE=no \
-DWITH_LSP=no \
-DWITH_LFORTRAN_BINARY_MODFILES=no \
-DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH_LPYTHON;$CONDA_PREFIX" \
-DCMAKE_INSTALL_PREFIX=`pwd`/inst \
.
cmake --build . -j16 --target install I am receiving the following timing values on AMD Ryzen 5 2500U with Radeon Vega Mobile Gfx @1.600 GHz, Ubuntu 22.04.4 LTS: (lp) lpython$ time lpython bench.py --backend wasm_x86 -o tmp
real 0m3.393s
user 0m3.340s
sys 0m0.053s
(lp) lpython$ time lpython bench.py --backend x86 -o tmp
real 0m2.736s
user 0m2.683s
sys 0m0.054s
(lp) lpython$ time lpython bench.py --backend wasm_x64 -o tmp
real 0m3.457s
user 0m3.417s
sys 0m0.040s
(lp) lpython$ time lpython bench.py --backend llvm -o tmp
real 0m30.106s
user 0m29.968s
sys 0m0.129s
(lp) ubaid@ubaid-Lenovo-ideapad-330-15ARR:~/Desktop/Open-Source/lpython$ time python bench.py
249975005
real 0m0.499s
user 0m0.374s
sys 0m0.125s
(lp) ubaid@ubaid-Lenovo-ideapad-330-15ARR:~/Desktop/Open-Source/lpython$ These values seems to differ significantly from release mode values shared at #1222 (comment). Is the build script used to build in release mode as expected or am I missing something? |
Measure master and this PR in Release mode and let's see. I can do the same. If the benchmarks in Release mode look good, we can merge. |
Compiled on ubuntu 22.04, using ./build0.sh
cmake .
cmake --build . -j16 On main branch: (lp) lpython$ time lpython bench.py --backend wasm_x64 -o tmp.out
real 0m2.258s
user 0m2.218s
sys 0m0.041s On this PR branch: (lp) lpython$ time lpython bench.py --backend wasm_x64 -o tmp.out
real 0m1.588s
user 0m1.508s
sys 0m0.080s Currently benchmarked the |
Ok. I think it's good enough. Go ahead and merge this. |
Thank you for the approval. |
I just noticed I also had another branch locally with slightly varied approach Vec<uint8_t> create_elf64_x86_binary(Allocator &al, X86Assembler &a) {
const int E_IDX = 0;
const int PROGRAM_IDX = E_IDX + sizeof(Elf64_Ehdr);
const int TEXT_SEG_IDX = PROGRAM_IDX + sizeof(Elf64_Phdr);
const int DATA_SEG_IDX = TEXT_SEG_IDX + sizeof(Elf64_Phdr);
const int TOTAL_HEADER_SIZE = DATA_SEG_IDX + sizeof(Elf64_Phdr);
Vec<uint8_t> binary;
binary.resize(al, TOTAL_HEADER_SIZE);
Elf64_Ehdr* e = (Elf64_Ehdr*)(binary.p + E_IDX);
Elf64_Phdr* p_program = (Elf64_Phdr*)(binary.p + PROGRAM_IDX);
Elf64_Phdr* p_text_seg = (Elf64_Phdr*)(binary.p + TEXT_SEG_IDX);
Elf64_Phdr* p_data_seg = (Elf64_Phdr*)(binary.p + DATA_SEG_IDX);
set_header(a, e);
set_program_header(a, p_program);
set_text_segment(a, p_program, p_text_seg);
set_data_segment(a, p_text_seg, p_data_seg);
align_by_byte(al, binary, 0x1000);
const int PROGRAM_HEADER_SIZE = binary.size();
e->entry = a.get_defined_symbol("_start").value + PROGRAM_HEADER_SIZE;
p_program->filesz = PROGRAM_HEADER_SIZE;
p_program->memsz = p_program->filesz;
p_text_seg->offset = p_program->offset + p_program->filesz;
p_text_seg->vaddr = a.origin() + p_text_seg->offset + PROGRAM_HEADER_SIZE;
p_text_seg->paddr = p_text_seg->vaddr;
p_data_seg->offset = p_text_seg->offset + p_text_seg->filesz;
p_data_seg->vaddr = a.origin() + p_data_seg->offset + PROGRAM_HEADER_SIZE;
p_data_seg->paddr = p_data_seg->vaddr;
append_bytes(al, a.get_machine_code(), binary);
return binary;
} where void set_program_header(X86Assembler &a, Elf64_Phdr* p) {
p->type = 1;
p->flags = 4;
p->offset = 0;
p->vaddr = a.origin();
p->paddr = a.origin();
p->filesz = sizeof(Elf64_Ehdr) + 3 * sizeof(Elf64_Phdr);
p->memsz = p->filesz;
p->align = 0x1000;
} |
Sorry, I should have shared this approach #1694 (comment) early. I forgot about it while working on the new branch. |
The merged PR is fine with me. If you want to change it, go ahead and submit a PR. |
fixes #1539.
Also, towards #1485.
PS: This is a work in progress, hence the tests might not work.