diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index dc15aeadaa619..8c7ddd97bab20 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -344,6 +344,8 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, char Mode, raw_ostream &O) { unsigned Reg = MO.getReg(); + bool EmitPercent = true; + switch (Mode) { default: return true; // Unknown mode. case 'b': // Print QImode register @@ -358,6 +360,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, case 'k': // Print SImode register Reg = getX86SubSuperRegister(Reg, 32); break; + case 'V': + EmitPercent = false; + LLVM_FALLTHROUGH; case 'q': // Print 64-bit register names if 64-bit integer registers are available. // Otherwise, print 32-bit register names. @@ -365,7 +370,10 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, break; } - O << '%' << X86ATTInstPrinter::getRegisterName(Reg); + if (EmitPercent) + O << '%'; + + O << X86ATTInstPrinter::getRegisterName(Reg); return false; } @@ -438,6 +446,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case 'w': // Print HImode register case 'k': // Print SImode register case 'q': // Print DImode register + case 'V': // Print native register without '%' if (MO.isReg()) return printAsmMRegister(*this, MO, ExtraCode[0], O); printOperand(*this, MI, OpNo, O); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2c2294d6e032f..59a9832e17bba 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -26250,28 +26250,57 @@ static unsigned getOpcodeForRetpoline(unsigned RPOpc) { static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, unsigned Reg) { + if (Subtarget.useRetpolineExternalThunk()) { + // When using an external thunk for retpolines, we pick 
names that match the + // names GCC happens to use as well. This helps simplify the implementation + // of the thunks for kernels where they have no easy ability to create + // aliases and are doing non-trivial configuration of the thunk's body. For + // example, the Linux kernel will do boot-time hot patching of the thunk + // bodies and cannot easily export aliases of these to loaded modules. + // + // Note that at any point in the future, we may need to change the semantics + // of how we implement retpolines and at that time will likely change the + // name of the called thunk. Essentially, there is no hard guarantee that + // LLVM will generate calls to specific thunks, we merely make a best-effort + // attempt to help out kernels and other systems where duplicating the + // thunks is costly. + switch (Reg) { + case X86::EAX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_eax"; + case X86::ECX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_ecx"; + case X86::EDX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_edi"; + case X86::R11: + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__x86_indirect_thunk_r11"; + } + llvm_unreachable("unexpected reg for retpoline"); + } + + // When targeting an internal COMDAT thunk use an LLVM-specific name. switch (Reg) { - case 0: - assert(!Subtarget.is64Bit() && "R11 should always be available on x64"); - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_push" - : "__llvm_retpoline_push"; case X86::EAX: - return Subtarget.useRetpolineExternalThunk() - ? 
"__llvm_external_retpoline_eax" - : "__llvm_retpoline_eax"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_eax"; case X86::ECX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_ecx" - : "__llvm_retpoline_ecx"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_ecx"; case X86::EDX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_edx" - : "__llvm_retpoline_edx"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edi"; case X86::R11: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_r11" - : "__llvm_retpoline_r11"; + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__llvm_retpoline_r11"; } llvm_unreachable("unexpected reg for retpoline"); } @@ -26290,15 +26319,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, // just use R11, but we scan for uses anyway to ensure we don't generate // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't // already a register use operand to the call to hold the callee. If none - // are available, push the callee instead. This is less efficient, but is - // necessary for functions using 3 regparms. Such function calls are - // (currently) not eligible for tail call optimization, because there is no - // scratch register available to hold the address of the callee. + // are available, use EDI instead. EDI is chosen because EBX is the PIC base + // register and ESI is the base pointer to realigned stack frames with VLAs. 
SmallVector<unsigned, 3> AvailableRegs; if (Subtarget.is64Bit()) AvailableRegs.push_back(X86::R11); else - AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX}); + AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI}); // Zero out any registers that are already used. for (const auto &MO : MI.operands()) { @@ -26316,30 +26343,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, break; } } + if (!AvailableReg) + report_fatal_error("calling convention incompatible with retpoline, no " + "available registers"); const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); - if (AvailableReg == 0) { - // No register available. Use PUSH. This must not be a tailcall, and this - // must not be x64. - if (Subtarget.is64Bit()) - report_fatal_error( - "Cannot make an indirect call on x86-64 using both retpoline and a " - "calling convention that preservers r11"); - if (Opc != X86::CALLpcrel32) - report_fatal_error("Cannot make an indirect tail call on x86 using " - "retpoline without a preserved register"); - BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg); - MI.getOperand(0).ChangeToES(Symbol); - MI.setDesc(TII->get(Opc)); - } else { - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) - .addReg(CalleeVReg); - MI.getOperand(0).ChangeToES(Symbol); - MI.setDesc(TII->get(Opc)); - MachineInstrBuilder(*BB->getParent(), &MI) - .addReg(AvailableReg, RegState::Implicit | RegState::Kill); - } + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) + .addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + MachineInstrBuilder(*BB->getParent(), &MI) + .addReg(AvailableReg, RegState::Implicit | RegState::Kill); return BB; } diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp index 223fa57714980..d03826bbe992f 100644 --- a/lib/Target/X86/X86RetpolineThunks.cpp +++ b/lib/Target/X86/X86RetpolineThunks.cpp @@ -43,7 +43,7 @@ static const char R11ThunkName[] = 
"__llvm_retpoline_r11"; static const char EAXThunkName[] = "__llvm_retpoline_eax"; static const char ECXThunkName[] = "__llvm_retpoline_ecx"; static const char EDXThunkName[] = "__llvm_retpoline_edx"; -static const char PushThunkName[] = "__llvm_retpoline_push"; +static const char EDIThunkName[] = "__llvm_retpoline_edi"; namespace { class X86RetpolineThunks : public MachineFunctionPass { @@ -74,7 +74,6 @@ class X86RetpolineThunks : public MachineFunctionPass { void createThunkFunction(Module &M, StringRef Name); void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); - void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB); void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None); }; @@ -127,7 +126,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { createThunkFunction(M, R11ThunkName); else for (StringRef Name : - {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName}) + {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName}) createThunkFunction(M, Name); InsertedThunks = true; return true; @@ -151,9 +150,8 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { populateThunk(MF, X86::R11); } else { // For 32-bit targets we need to emit a collection of thunks for various - // possible scratch registers as well as a fallback that is used when - // there are no scratch registers and assumes the retpoline target has - // been pushed. + // possible scratch registers as well as a fallback that uses EDI, which is + // normally callee saved. // __llvm_retpoline_eax: // calll .Leax_call_target // .Leax_capture_spec: @@ -174,32 +172,18 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { // movl %edx, (%esp) // retl // - // This last one is a bit more special and so needs a little extra - // handling. 
- // __llvm_retpoline_push: - // calll .Lpush_call_target - // .Lpush_capture_spec: - // pause - // lfence - // jmp .Lpush_capture_spec - // .align 16 - // .Lpush_call_target: - // # Clear pause_loop return address. - // addl $4, %esp - // # Top of stack words are: Callee, RA. Exchange Callee and RA. - // pushl 4(%esp) # Push callee - // pushl 4(%esp) # Push RA - // popl 8(%esp) # Pop RA to final RA - // popl (%esp) # Pop callee to next top of stack - // retl # Ret to callee + // __llvm_retpoline_edi: + // ... # Same setup + // movl %edi, (%esp) + // retl if (MF.getName() == EAXThunkName) populateThunk(MF, X86::EAX); else if (MF.getName() == ECXThunkName) populateThunk(MF, X86::ECX); else if (MF.getName() == EDXThunkName) populateThunk(MF, X86::EDX); - else if (MF.getName() == PushThunkName) - populateThunk(MF); + else if (MF.getName() == EDIThunkName) + populateThunk(MF, X86::EDI); else llvm_unreachable("Invalid thunk name on x86-32!"); } @@ -240,31 +224,6 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB, .addReg(Reg); } -void X86RetpolineThunks::insert32BitPushReturnAddrClobber( - MachineBasicBlock &MBB) { - // The instruction sequence we use to replace the return address without - // a scratch register is somewhat complicated: - // # Clear capture_spec from return address. - // addl $4, %esp - // # Top of stack words are: Callee, RA. Exchange Callee and RA. 
- // pushl 4(%esp) # Push callee - // pushl 4(%esp) # Push RA - // popl 8(%esp) # Pop RA to final RA - // popl (%esp) # Pop callee to next top of stack - // retl # Ret to callee - BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP) - .addReg(X86::ESP) - .addImm(4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, - false, 4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, - false, 4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, - false, 8); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, - false, 0); -} - void X86RetpolineThunks::populateThunk(MachineFunction &MF, Optional<unsigned> Reg) { // Set MF properties. We never use vregs... @@ -301,11 +260,6 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF, CaptureSpec->addSuccessor(CaptureSpec); CallTarget->setAlignment(4); - if (Reg) { - insertRegReturnAddrClobber(*CallTarget, *Reg); - } else { - assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!"); - insert32BitPushReturnAddrClobber(*CallTarget); - } + insertRegReturnAddrClobber(*CallTarget, *Reg); BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); } diff --git a/test/CodeGen/X86/inline-asm-modifier-V.ll b/test/CodeGen/X86/inline-asm-modifier-V.ll new file mode 100644 index 0000000000000..5a7f3fdd25fd5 --- /dev/null +++ b/test/CodeGen/X86/inline-asm-modifier-V.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=i686-- -no-integrated-as | FileCheck -check-prefix=X86 %s +; RUN: llc < %s -mtriple=x86_64-- -no-integrated-as | FileCheck -check-prefix=X64 %s + +; If the target does not have 64-bit integer registers, emit 32-bit register +; names. 
+ +; X86: call __x86_indirect_thunk_e{{[abcd]}}x +; X64: call __x86_indirect_thunk_r + +define void @q_modifier(i32* %p) { +entry: + tail call void asm sideeffect "call __x86_indirect_thunk_${0:V}", "r,~{dirflag},~{fpsr},~{flags}"(i32* %p) + ret void +} diff --git a/test/CodeGen/X86/retpoline-external.ll b/test/CodeGen/X86/retpoline-external.ll index 66d32ba5d73d4..2f21bb2566de8 100644 --- a/test/CodeGen/X86/retpoline-external.ll +++ b/test/CodeGen/X86/retpoline-external.ll @@ -23,18 +23,18 @@ entry: ; X64: callq bar ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq %[[fp]], %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64: movl %[[x]], %edi ; X64: callq bar ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq %[[fp]], %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: icall_reg: ; X64FAST: callq bar -; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: callq __x86_indirect_thunk_r11 ; X64FAST: callq bar -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: icall_reg: ; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]] @@ -43,19 +43,19 @@ entry: ; X86: calll bar ; X86: movl %[[fp]], %eax ; X86: pushl %[[x]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: pushl %[[x]] ; X86: calll bar ; X86: movl %[[fp]], %eax ; X86: pushl %[[x]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86-NOT: # TAILCALL ; X86FAST-LABEL: icall_reg: ; X86FAST: calll bar -; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: calll __x86_indirect_thunk_eax ; X86FAST: calll bar -; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: calll __x86_indirect_thunk_eax @global_fp = external global void (i32)* @@ -72,28 +72,28 @@ define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 { ; X64-LABEL: icall_global_fp: ; X64-DAG: movl %edi, %[[x:[^ 
]*]] ; X64-DAG: movq global_fp(%rip), %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq global_fp(%rip), %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: icall_global_fp: ; X64FAST: movq global_fp(%rip), %r11 -; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: callq __x86_indirect_thunk_r11 ; X64FAST: movq global_fp(%rip), %r11 -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: icall_global_fp: ; X86: movl global_fp, %eax ; X86: pushl 4(%esp) -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: addl $4, %esp ; X86: movl global_fp, %eax -; X86: jmp __llvm_external_retpoline_eax # TAILCALL +; X86: jmp __x86_indirect_thunk_eax # TAILCALL ; X86FAST-LABEL: icall_global_fp: -; X86FAST: calll __llvm_external_retpoline_eax -; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL +; X86FAST: calll __x86_indirect_thunk_eax +; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL %struct.Foo = type { void (%struct.Foo*)** } @@ -114,14 +114,14 @@ define void @vcall(%struct.Foo* %obj) #0 { ; X64: movq (%[[obj]]), %[[vptr:[^ ]*]] ; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]] ; X64: movq %[[fp]], %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64-DAG: movq %[[obj]], %rdi ; X64-DAG: movq %[[fp]], %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: vcall: -; X64FAST: callq __llvm_external_retpoline_r11 -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: callq __x86_indirect_thunk_r11 +; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: vcall: ; X86: movl 8(%esp), %[[obj:[^ ]*]] @@ -129,14 +129,14 @@ define void @vcall(%struct.Foo* %obj) #0 { ; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]] ; 
X86: movl %[[fp]], %eax ; X86: pushl %[[obj]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: addl $4, %esp ; X86: movl %[[fp]], %eax -; X86: jmp __llvm_external_retpoline_eax # TAILCALL +; X86: jmp __x86_indirect_thunk_eax # TAILCALL ; X86FAST-LABEL: vcall: -; X86FAST: calll __llvm_external_retpoline_eax -; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL +; X86FAST: calll __x86_indirect_thunk_eax +; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL declare void @direct_callee() diff --git a/test/CodeGen/X86/retpoline-regparm.ll b/test/CodeGen/X86/retpoline-regparm.ll new file mode 100644 index 0000000000000..13b32740b2875 --- /dev/null +++ b/test/CodeGen/X86/retpoline-regparm.ll @@ -0,0 +1,42 @@ +; RUN: llc -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s + +; Test 32-bit retpoline when -mregparm=3 is used. This case is interesting +; because there are no available scratch registers. The Linux kernel builds +; with -mregparm=3, so we need to support it. TCO should fail because we need +; to restore EDI. + +define void @call_edi(void (i32, i32, i32)* %fp) #0 { +entry: + tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0) + ret void +} + +; CHECK-LABEL: call_edi: +; EDI is used, so it must be saved. 
+; CHECK: pushl %edi +; CHECK-DAG: xorl %eax, %eax +; CHECK-DAG: xorl %edx, %edx +; CHECK-DAG: xorl %ecx, %ecx +; CHECK-DAG: movl {{.*}}, %edi +; CHECK: calll __llvm_retpoline_edi +; CHECK: popl %edi +; CHECK: retl + +define void @edi_external(void (i32, i32, i32)* %fp) #1 { +entry: + tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0) + ret void +} + +; CHECK-LABEL: edi_external: +; CHECK: pushl %edi +; CHECK-DAG: xorl %eax, %eax +; CHECK-DAG: xorl %edx, %edx +; CHECK-DAG: xorl %ecx, %ecx +; CHECK-DAG: movl {{.*}}, %edi +; CHECK: calll __x86_indirect_thunk_edi +; CHECK: popl %edi +; CHECK: retl + +attributes #0 = { "target-features"="+retpoline" } +attributes #1 = { "target-features"="+retpoline-external-thunk" } diff --git a/test/CodeGen/X86/retpoline.ll b/test/CodeGen/X86/retpoline.ll index b0d4c85d45e8b..562386ee27b9e 100644 --- a/test/CodeGen/X86/retpoline.ll +++ b/test/CodeGen/X86/retpoline.ll @@ -336,10 +336,10 @@ latch: ; X86-NEXT: movl %edx, (%esp) ; X86-NEXT: retl ; -; X86-LABEL: .section .text.__llvm_retpoline_push,{{.*}},__llvm_retpoline_push,comdat -; X86-NEXT: .hidden __llvm_retpoline_push -; X86-NEXT: .weak __llvm_retpoline_push -; X86: __llvm_retpoline_push: +; X86-LABEL: .section .text.__llvm_retpoline_edi,{{.*}},__llvm_retpoline_edi,comdat +; X86-NEXT: .hidden __llvm_retpoline_edi +; X86-NEXT: .weak __llvm_retpoline_edi +; X86: __llvm_retpoline_edi: ; X86-NEXT: # {{.*}} # %entry ; X86-NEXT: calll [[CALL_TARGET:.*]] ; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken @@ -351,11 +351,7 @@ latch: ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: [[CALL_TARGET]]: # Block address taken ; X86-NEXT: # %entry -; X86-NEXT: addl $4, %esp -; X86-NEXT: pushl 4(%esp) -; X86-NEXT: pushl 4(%esp) -; X86-NEXT: popl 8(%esp) -; X86-NEXT: popl (%esp) +; X86-NEXT: movl %edi, (%esp) ; X86-NEXT: retl