[clang] [llvm] [X86][Windows] Return `fp128` on the stack (PR #204887)

Folkert de Vries via cfe-commits Sat, 27 Jun 2026 05:39:42 -0700

https://github.com/folkertdev updated 
https://github.com/llvm/llvm-project/pull/204887


>From 6306970fa9266b0d386a51a477e7aefe958363d7 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <[email protected]>
Date: Fri, 19 Jun 2026 20:05:36 +0200
Subject: [PATCH 1/8] [X86][Windows] Return `fp128` on the stack

This is in line with mingw64 GCC and follows the Win64 calling convention
(or at least follows it more closely than before).
---
 clang/lib/CodeGen/Targets/X86.cpp             |  14 +-
 clang/test/CodeGen/win-fp128.c                |   4 +-
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp   |  14 +
 .../test/CodeGen/X86/fp128-libcalls-strict.ll | 451 +++++++++++++-----
 llvm/test/CodeGen/X86/fp128-libcalls.ll       | 251 +++++++---
 llvm/test/CodeGen/X86/i128-fp128-abi.ll       | 132 +++--
 6 files changed, 624 insertions(+), 242 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index dbe4d656aabc5..77c912b021604 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3437,8 +3437,6 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
     case BuiltinType::Int128:
     case BuiltinType::UInt128:
     case BuiltinType::Float128:
-      // 128-bit float and integer types share the same ABI.
-
       // If it's a parameter type, the normal ABI rule is that arguments larger
       // than 8 bytes are passed indirectly. GCC follows it. We follow it too,
       // even though it isn't particularly efficient.
@@ -3449,10 +3447,14 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
 
       // Mingw64 GCC returns i128 in XMM0. Coerce to v2i64 to handle that.
       // Clang matches them for compatibility.
-      // NOTE: GCC actually returns f128 indirectly but will hopefully change.
-      // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115054#c8.
-      return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
-          llvm::Type::getInt64Ty(getVMContext()), 2));
+      if (BT->getKind() == BuiltinType::Int128 ||
+          BT->getKind() == BuiltinType::UInt128)
+        return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
+            llvm::Type::getInt64Ty(getVMContext()), 2));
+
+      // Mingw64 GCC returns f128 via sret. Clang matches that for
+      // compatibility.
+      break;
 
     default:
       break;
diff --git a/clang/test/CodeGen/win-fp128.c b/clang/test/CodeGen/win-fp128.c
index 58e203d4fc8ed..dc144f899fa4f 100644
--- a/clang/test/CodeGen/win-fp128.c
+++ b/clang/test/CodeGen/win-fp128.c
@@ -3,10 +3,10 @@
 // __float128 is unsupported on MSVC
 
 __float128 fp128_ret(void) { return 0; }
-// CHECK-GNU64: define dso_local <2 x i64>  @fp128_ret()
+// CHECK-GNU64: define dso_local fp128 @fp128_ret()
 
 __float128 fp128_args(__float128 a, __float128 b) { return a * b; }
-// CHECK-GNU64: define dso_local <2 x i64> @fp128_args(ptr noundef dead_on_return %0, ptr noundef dead_on_return %1)
+// CHECK-GNU64: define dso_local fp128 @fp128_args(ptr noundef dead_on_return %0, ptr noundef dead_on_return %1)
 
 void fp128_vararg(int a, ...) {
   // CHECK-GNU64-LABEL: define dso_local void @fp128_vararg
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 7c068115df481..bce581ad7a48b 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -670,6 +670,20 @@ bool X86TargetLowering::CanLowerReturn(
     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
     const Type *RetTy) const {
+  // Mingw64 GCC returns f128 via sret, which matches the documentation of the
+  // Windows x64 calling convention:
+  //
+  // https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170#return-values
+  //
+  // > Otherwise, the caller must allocate memory for the return value and pass
+  // > a pointer to it as the first argument.
+  //
+  // Return false, which will perform sret demotion.
+  if (Subtarget.isCallingConvWin64(CallConv) &&
+      llvm::any_of(
+          Outs, [](const ISD::OutputArg &Out) { return Out.VT == MVT::f128; }))
+    return false;
+
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_X86);
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index ad2d690fd7ed0..dfff88d30bcd4 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -79,15 +79,22 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; WIN-LABEL: add:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $80, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps (%r8), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __addtf3
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $80, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: add:
@@ -201,15 +208,22 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; WIN-LABEL: sub:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $80, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps (%r8), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __subtf3
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $80, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: sub:
@@ -323,15 +337,22 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; WIN-LABEL: mul:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $80, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps (%r8), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __multf3
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $80, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: mul:
@@ -445,15 +466,22 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; WIN-LABEL: div:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $80, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps (%r8), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __divtf3
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $80, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: div:
@@ -568,18 +596,25 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ;
 ; WIN-LABEL: fma:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $88, %rsp
-; WIN-NEXT:    movaps (%r8), %xmm0
-; WIN-NEXT:    movaps (%rcx), %xmm1
-; WIN-NEXT:    movaps (%rdx), %xmm2
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $96, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%r9), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps (%r8), %xmm2
 ; WIN-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r9
 ; WIN-NEXT:    callq fmal
-; WIN-NEXT:    addq $88, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $96, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: fma:
@@ -694,15 +729,22 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; WIN-LABEL: frem:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $80, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps (%r8), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq fmodl
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $80, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: frem:
@@ -797,12 +839,19 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: ceil:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq ceill
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: ceil:
@@ -887,12 +936,19 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: acos:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq acosl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: acos:
@@ -977,12 +1033,19 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: cos:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq cosl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: cos:
@@ -1067,12 +1130,19 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: cosh:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq coshl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: cosh:
@@ -1157,12 +1227,19 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: exp:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq expl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: exp:
@@ -1247,12 +1324,19 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: exp2:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq exp2l
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: exp2:
@@ -1337,12 +1421,19 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: floor:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq floorl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: floor:
@@ -1427,12 +1518,19 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: log:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq logl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: log:
@@ -1517,12 +1615,19 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: log10:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq log10l
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: log10:
@@ -1607,12 +1712,19 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: log2:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq log2l
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: log2:
@@ -1709,15 +1821,22 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; WIN-LABEL: maxnum:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $80, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps (%r8), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq fmaxl
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $80, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: maxnum:
@@ -1824,15 +1943,22 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; WIN-LABEL: minnum:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $80, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps (%r8), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq fminl
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $80, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: minnum:
@@ -1927,12 +2053,19 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: nearbyint:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq nearbyintl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: nearbyint:
@@ -2029,15 +2162,22 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; WIN-LABEL: pow:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $80, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps (%r8), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq powl
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $80, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: pow:
@@ -2143,12 +2283,19 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ;
 ; WIN-LABEL: powi:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq __powitf2
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: powi:
@@ -2237,12 +2384,19 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: rint:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq rintl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: rint:
@@ -2327,12 +2481,19 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: round:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq roundl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: round:
@@ -2417,12 +2578,19 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: roundeven:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq roundevenl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: roundeven:
@@ -2507,12 +2675,19 @@ define fp128 @asin(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: asin:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq asinl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: asin:
@@ -2597,12 +2772,19 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: sin:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq sinl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: sin:
@@ -2687,12 +2869,19 @@ define fp128 @sinh(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: sinh:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq sinhl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: sinh:
@@ -2777,12 +2966,19 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: sqrt:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq sqrtl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: sqrt:
@@ -2867,12 +3063,19 @@ define fp128 @atan(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: atan:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq atanl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: atan:
@@ -2969,15 +3172,22 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; WIN-LABEL: atan2:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $80, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps (%r8), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq atan2l
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $80, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: atan2:
@@ -3072,12 +3282,19 @@ define fp128 @tan(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: tan:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq tanl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: tan:
@@ -3162,12 +3379,19 @@ define fp128 @tanh(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: tanh:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq tanhl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: tanh:
@@ -3252,12 +3476,19 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ;
 ; WIN-LABEL: trunc:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq truncl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: trunc:
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll
index 4b0449fd7502e..c594b15ef1cbe 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll
@@ -78,16 +78,18 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; WIN-LABEL: Test128Add:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    subq $88, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps (%rdx), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __addtf3
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    addq $88, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Add:
@@ -207,16 +209,18 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128_1Add:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    subq $88, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps vf128(%rip), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __addtf3
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    addq $88, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128_1Add:
@@ -331,16 +335,18 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; WIN-LABEL: Test128Sub:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    subq $88, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps (%rdx), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __subtf3
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    addq $88, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Sub:
@@ -460,16 +466,18 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128_1Sub:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    subq $88, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps vf128(%rip), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __subtf3
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    addq $88, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128_1Sub:
@@ -584,16 +592,18 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; WIN-LABEL: Test128Mul:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    subq $88, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps (%rdx), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __multf3
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    addq $88, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Mul:
@@ -713,16 +723,18 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128_1Mul:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    subq $88, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps vf128(%rip), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __multf3
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    addq $88, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128_1Mul:
@@ -837,16 +849,18 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; WIN-LABEL: Test128Div:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    subq $88, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps (%rdx), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __divtf3
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    addq $88, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Div:
@@ -966,16 +980,18 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128_1Div:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    subq $88, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps vf128(%rip), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq __divtf3
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    addq $88, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128_1Div:
@@ -1082,16 +1098,18 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; WIN-LABEL: Test128Rem:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    subq $88, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps (%rdx), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq fmodl
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    addq $88, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Rem:
@@ -1201,16 +1219,18 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128_1Rem:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    subq $88, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps vf128(%rip), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq fmodl
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    addq $88, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128_1Rem:
@@ -1303,13 +1323,15 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128Sqrt:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    subq $72, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq sqrtl
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    addq $72, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Sqrt:
@@ -1390,13 +1412,15 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128Sin:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    subq $72, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq sinl
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    addq $72, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Sin:
@@ -1477,13 +1501,15 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128Cos:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    subq $72, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq cosl
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    addq $72, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Cos:
@@ -1564,13 +1590,15 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128Ceil:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    subq $72, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq ceill
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    addq $72, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Ceil:
@@ -1651,13 +1679,15 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128Floor:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    subq $72, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq floorl
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    addq $72, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Floor:
@@ -1738,13 +1768,15 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128Trunc:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    subq $72, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq truncl
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    addq $72, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Trunc:
@@ -1825,13 +1857,15 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128Nearbyint:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    subq $72, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq nearbyintl
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    addq $72, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Nearbyint:
@@ -1912,13 +1946,15 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128Rint:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    subq $72, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq rintl
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    addq $72, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Rint:
@@ -1999,13 +2035,15 @@ define dso_local void @Test128Round(fp128 %d1) nounwind {
 ;
 ; WIN-LABEL: Test128Round:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    subq $72, %rsp
 ; WIN-NEXT:    movaps (%rcx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq roundl
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; WIN-NEXT:    movaps %xmm0, vf128(%rip)
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    addq $72, %rsp
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Round:
@@ -2102,18 +2140,25 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind {
 ;
 ; WIN-LABEL: Test128FMA:
 ; WIN:       # %bb.0: # %entry
-; WIN-NEXT:    subq $88, %rsp
-; WIN-NEXT:    movaps (%r8), %xmm0
-; WIN-NEXT:    movaps (%rcx), %xmm1
-; WIN-NEXT:    movaps (%rdx), %xmm2
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $96, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%r9), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps (%r8), %xmm2
 ; WIN-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r9
 ; WIN-NEXT:    callq fmal
-; WIN-NEXT:    addq $88, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $96, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128FMA:
@@ -2211,12 +2256,19 @@ define fp128 @Test128Acos(fp128 %a) nounwind {
 ;
 ; WIN-LABEL: Test128Acos:
 ; WIN:       # %bb.0:
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq acosl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Acos:
@@ -2294,12 +2346,19 @@ define fp128 @Test128Asin(fp128 %a) nounwind {
 ;
 ; WIN-LABEL: Test128Asin:
 ; WIN:       # %bb.0:
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq asinl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Asin:
@@ -2377,12 +2436,19 @@ define fp128 @Test128Atan(fp128 %a) nounwind {
 ;
 ; WIN-LABEL: Test128Atan:
 ; WIN:       # %bb.0:
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq atanl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Atan:
@@ -2472,15 +2538,22 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind {
 ;
 ; WIN-LABEL: Test128Atan2:
 ; WIN:       # %bb.0:
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $80, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps (%r8), %xmm1
 ; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN-NEXT:    callq atan2l
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $80, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Atan2:
@@ -2568,12 +2641,19 @@ define fp128 @Test128Cosh(fp128 %a) nounwind {
 ;
 ; WIN-LABEL: Test128Cosh:
 ; WIN:       # %bb.0:
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq coshl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Cosh:
@@ -2651,12 +2731,19 @@ define fp128 @Test128Sinh(fp128 %a) nounwind {
 ;
 ; WIN-LABEL: Test128Sinh:
 ; WIN:       # %bb.0:
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq sinhl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Sinh:
@@ -2734,12 +2821,19 @@ define fp128 @Test128Tan(fp128 %a) nounwind {
 ;
 ; WIN-LABEL: Test128Tan:
 ; WIN:       # %bb.0:
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq tanl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Tan:
@@ -2817,12 +2911,19 @@ define fp128 @Test128Tanh(fp128 %a) nounwind {
 ;
 ; WIN-LABEL: Test128Tanh:
 ; WIN:       # %bb.0:
-; WIN-NEXT:    subq $56, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq tanhl
-; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Tanh:
@@ -2916,14 +3017,20 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind {
 ;
 ; WIN-LABEL: Test128Modf:
 ; WIN:       # %bb.0:
-; WIN-NEXT:    subq $72, %rsp
-; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    pushq %rsi
+; WIN-NEXT:    subq $64, %rsp
+; WIN-NEXT:    movq %rcx, %rsi
+; WIN-NEXT:    movaps (%rdx), %xmm0
 ; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq 16(%rcx), %r8
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN-NEXT:    callq modfl
-; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rsi)
+; WIN-NEXT:    movq %rsi, %rax
+; WIN-NEXT:    addq $64, %rsp
+; WIN-NEXT:    popq %rsi
 ; WIN-NEXT:    retq
 ;
 ; WIN-X86-LABEL: Test128Modf:
diff --git a/llvm/test/CodeGen/X86/i128-fp128-abi.ll b/llvm/test/CodeGen/X86/i128-fp128-abi.ll
index 2174d5056e6ce..9f385ee2faf4e 100644
--- a/llvm/test/CodeGen/X86/i128-fp128-abi.ll
+++ b/llvm/test/CodeGen/X86/i128-fp128-abi.ll
@@ -190,7 +190,9 @@ define PrimTy @return(ptr %p) nounwind {
 ;
 ; CHECK-MSVC64-F128-LABEL: return:
 ; CHECK-MSVC64-F128:       # %bb.0:
-; CHECK-MSVC64-F128-NEXT:    movaps (%rcx), %xmm0
+; CHECK-MSVC64-F128-NEXT:    movq %rcx, %rax
+; CHECK-MSVC64-F128-NEXT:    movaps (%rdx), %xmm0
+; CHECK-MSVC64-F128-NEXT:    movaps %xmm0, (%rcx)
 ; CHECK-MSVC64-F128-NEXT:    retq
 ;
 ; CHECK-MSVC64-I128-LABEL: return:
@@ -201,7 +203,9 @@ define PrimTy @return(ptr %p) nounwind {
 ;
 ; CHECK-MINGW-F128-LABEL: return:
 ; CHECK-MINGW-F128:       # %bb.0:
-; CHECK-MINGW-F128-NEXT:    movaps (%rcx), %xmm0
+; CHECK-MINGW-F128-NEXT:    movq %rcx, %rax
+; CHECK-MINGW-F128-NEXT:    movaps (%rdx), %xmm0
+; CHECK-MINGW-F128-NEXT:    movaps %xmm0, (%rcx)
 ; CHECK-MINGW-F128-NEXT:    retq
 ;
 ; CHECK-MINGW-I128-LABEL: return:
@@ -262,7 +266,9 @@ define PrimTy @first_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-MSVC64-F128-LABEL: first_arg:
 ; CHECK-MSVC64-F128:       # %bb.0:
-; CHECK-MSVC64-F128-NEXT:    movaps (%rcx), %xmm0
+; CHECK-MSVC64-F128-NEXT:    movq %rcx, %rax
+; CHECK-MSVC64-F128-NEXT:    movaps (%rdx), %xmm0
+; CHECK-MSVC64-F128-NEXT:    movaps %xmm0, (%rcx)
 ; CHECK-MSVC64-F128-NEXT:    retq
 ;
 ; CHECK-MSVC64-I128-LABEL: first_arg:
@@ -272,7 +278,9 @@ define PrimTy @first_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-MINGW-F128-LABEL: first_arg:
 ; CHECK-MINGW-F128:       # %bb.0:
-; CHECK-MINGW-F128-NEXT:    movaps (%rcx), %xmm0
+; CHECK-MINGW-F128-NEXT:    movq %rcx, %rax
+; CHECK-MINGW-F128-NEXT:    movaps (%rdx), %xmm0
+; CHECK-MINGW-F128-NEXT:    movaps %xmm0, (%rcx)
 ; CHECK-MINGW-F128-NEXT:    retq
 ;
 ; CHECK-MINGW-I128-LABEL: first_arg:
@@ -338,8 +346,10 @@ define PrimTy @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, PrimTy %x) nounw
 ;
 ; CHECK-MSVC64-F128-LABEL: leading_args:
 ; CHECK-MSVC64-F128:       # %bb.0:
-; CHECK-MSVC64-F128-NEXT:    movq 40(%rsp), %rax
-; CHECK-MSVC64-F128-NEXT:    movaps (%rax), %xmm0
+; CHECK-MSVC64-F128-NEXT:    movq %rcx, %rax
+; CHECK-MSVC64-F128-NEXT:    movq 48(%rsp), %rcx
+; CHECK-MSVC64-F128-NEXT:    movaps (%rcx), %xmm0
+; CHECK-MSVC64-F128-NEXT:    movaps %xmm0, (%rax)
 ; CHECK-MSVC64-F128-NEXT:    retq
 ;
 ; CHECK-MSVC64-I128-LABEL: leading_args:
@@ -350,8 +360,10 @@ define PrimTy @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, PrimTy %x) nounw
 ;
 ; CHECK-MINGW-F128-LABEL: leading_args:
 ; CHECK-MINGW-F128:       # %bb.0:
-; CHECK-MINGW-F128-NEXT:    movq 40(%rsp), %rax
-; CHECK-MINGW-F128-NEXT:    movaps (%rax), %xmm0
+; CHECK-MINGW-F128-NEXT:    movq %rcx, %rax
+; CHECK-MINGW-F128-NEXT:    movq 48(%rsp), %rcx
+; CHECK-MINGW-F128-NEXT:    movaps (%rcx), %xmm0
+; CHECK-MINGW-F128-NEXT:    movaps %xmm0, (%rax)
 ; CHECK-MINGW-F128-NEXT:    retq
 ;
 ; CHECK-MINGW-I128-LABEL: leading_args:
@@ -418,8 +430,10 @@ define PrimTy @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, Pr
 ;
 ; CHECK-MSVC64-F128-LABEL: many_leading_args:
 ; CHECK-MSVC64-F128:       # %bb.0:
-; CHECK-MSVC64-F128-NEXT:    movq 56(%rsp), %rax
-; CHECK-MSVC64-F128-NEXT:    movaps (%rax), %xmm0
+; CHECK-MSVC64-F128-NEXT:    movq %rcx, %rax
+; CHECK-MSVC64-F128-NEXT:    movq 64(%rsp), %rcx
+; CHECK-MSVC64-F128-NEXT:    movaps (%rcx), %xmm0
+; CHECK-MSVC64-F128-NEXT:    movaps %xmm0, (%rax)
 ; CHECK-MSVC64-F128-NEXT:    retq
 ;
 ; CHECK-MSVC64-I128-LABEL: many_leading_args:
@@ -430,8 +444,10 @@ define PrimTy @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, Pr
 ;
 ; CHECK-MINGW-F128-LABEL: many_leading_args:
 ; CHECK-MINGW-F128:       # %bb.0:
-; CHECK-MINGW-F128-NEXT:    movq 56(%rsp), %rax
-; CHECK-MINGW-F128-NEXT:    movaps (%rax), %xmm0
+; CHECK-MINGW-F128-NEXT:    movq %rcx, %rax
+; CHECK-MINGW-F128-NEXT:    movq 64(%rsp), %rcx
+; CHECK-MINGW-F128-NEXT:    movaps (%rcx), %xmm0
+; CHECK-MINGW-F128-NEXT:    movaps %xmm0, (%rax)
 ; CHECK-MINGW-F128-NEXT:    retq
 ;
 ; CHECK-MINGW-I128-LABEL: many_leading_args:
@@ -496,8 +512,10 @@ define PrimTy @trailing_arg(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy
 ;
 ; CHECK-MSVC64-F128-LABEL: trailing_arg:
 ; CHECK-MSVC64-F128:       # %bb.0:
-; CHECK-MSVC64-F128-NEXT:    movq 48(%rsp), %rax
-; CHECK-MSVC64-F128-NEXT:    movaps (%rax), %xmm0
+; CHECK-MSVC64-F128-NEXT:    movq %rcx, %rax
+; CHECK-MSVC64-F128-NEXT:    movq 56(%rsp), %rcx
+; CHECK-MSVC64-F128-NEXT:    movaps (%rcx), %xmm0
+; CHECK-MSVC64-F128-NEXT:    movaps %xmm0, (%rax)
 ; CHECK-MSVC64-F128-NEXT:    retq
 ;
 ; CHECK-MSVC64-I128-LABEL: trailing_arg:
@@ -508,8 +526,10 @@ define PrimTy @trailing_arg(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy
 ;
 ; CHECK-MINGW-F128-LABEL: trailing_arg:
 ; CHECK-MINGW-F128:       # %bb.0:
-; CHECK-MINGW-F128-NEXT:    movq 48(%rsp), %rax
-; CHECK-MINGW-F128-NEXT:    movaps (%rax), %xmm0
+; CHECK-MINGW-F128-NEXT:    movq %rcx, %rax
+; CHECK-MINGW-F128-NEXT:    movq 56(%rsp), %rcx
+; CHECK-MINGW-F128-NEXT:    movaps (%rcx), %xmm0
+; CHECK-MINGW-F128-NEXT:    movaps %xmm0, (%rax)
 ; CHECK-MINGW-F128-NEXT:    retq
 ;
 ; CHECK-MINGW-I128-LABEL: trailing_arg:
@@ -578,12 +598,13 @@ define void @call_first_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-MSVC64-F128-LABEL: call_first_arg:
 ; CHECK-MSVC64-F128:       # %bb.0:
-; CHECK-MSVC64-F128-NEXT:    subq $56, %rsp
+; CHECK-MSVC64-F128-NEXT:    subq $72, %rsp
 ; CHECK-MSVC64-F128-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-MSVC64-F128-NEXT:    movaps %xmm0, 32(%rsp)
-; CHECK-MSVC64-F128-NEXT:    leaq 32(%rsp), %rcx
+; CHECK-MSVC64-F128-NEXT:    leaq 48(%rsp), %rcx
+; CHECK-MSVC64-F128-NEXT:    leaq 32(%rsp), %rdx
 ; CHECK-MSVC64-F128-NEXT:    callq first_arg
-; CHECK-MSVC64-F128-NEXT:    addq $56, %rsp
+; CHECK-MSVC64-F128-NEXT:    addq $72, %rsp
 ; CHECK-MSVC64-F128-NEXT:    retq
 ;
 ; CHECK-MSVC64-I128-LABEL: call_first_arg:
@@ -595,12 +616,13 @@ define void @call_first_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-MINGW-F128-LABEL: call_first_arg:
 ; CHECK-MINGW-F128:       # %bb.0:
-; CHECK-MINGW-F128-NEXT:    subq $56, %rsp
+; CHECK-MINGW-F128-NEXT:    subq $72, %rsp
 ; CHECK-MINGW-F128-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-MINGW-F128-NEXT:    movaps %xmm0, 32(%rsp)
-; CHECK-MINGW-F128-NEXT:    leaq 32(%rsp), %rcx
+; CHECK-MINGW-F128-NEXT:    leaq 48(%rsp), %rcx
+; CHECK-MINGW-F128-NEXT:    leaq 32(%rsp), %rdx
 ; CHECK-MINGW-F128-NEXT:    callq first_arg
-; CHECK-MINGW-F128-NEXT:    addq $56, %rsp
+; CHECK-MINGW-F128-NEXT:    addq $72, %rsp
 ; CHECK-MINGW-F128-NEXT:    retq
 ;
 ; CHECK-MINGW-I128-LABEL: call_first_arg:
@@ -682,17 +704,18 @@ define void @call_leading_args(PrimTy %x) nounwind {
 ;
 ; CHECK-MSVC64-F128-LABEL: call_leading_args:
 ; CHECK-MSVC64-F128:       # %bb.0:
-; CHECK-MSVC64-F128-NEXT:    subq $72, %rsp
+; CHECK-MSVC64-F128-NEXT:    subq $88, %rsp
 ; CHECK-MSVC64-F128-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-MSVC64-F128-NEXT:    movaps %xmm0, 48(%rsp)
 ; CHECK-MSVC64-F128-NEXT:    leaq 48(%rsp), %rax
-; CHECK-MSVC64-F128-NEXT:    movq %rax, 32(%rsp)
-; CHECK-MSVC64-F128-NEXT:    xorl %ecx, %ecx
+; CHECK-MSVC64-F128-NEXT:    movq %rax, 40(%rsp)
+; CHECK-MSVC64-F128-NEXT:    movq $0, 32(%rsp)
+; CHECK-MSVC64-F128-NEXT:    leaq 64(%rsp), %rcx
 ; CHECK-MSVC64-F128-NEXT:    xorl %edx, %edx
 ; CHECK-MSVC64-F128-NEXT:    xorl %r8d, %r8d
 ; CHECK-MSVC64-F128-NEXT:    xorl %r9d, %r9d
 ; CHECK-MSVC64-F128-NEXT:    callq leading_args
-; CHECK-MSVC64-F128-NEXT:    addq $72, %rsp
+; CHECK-MSVC64-F128-NEXT:    addq $88, %rsp
 ; CHECK-MSVC64-F128-NEXT:    retq
 ;
 ; CHECK-MSVC64-I128-LABEL: call_leading_args:
@@ -710,17 +733,18 @@ define void @call_leading_args(PrimTy %x) nounwind {
 ;
 ; CHECK-MINGW-F128-LABEL: call_leading_args:
 ; CHECK-MINGW-F128:       # %bb.0:
-; CHECK-MINGW-F128-NEXT:    subq $72, %rsp
+; CHECK-MINGW-F128-NEXT:    subq $88, %rsp
 ; CHECK-MINGW-F128-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-MINGW-F128-NEXT:    movaps %xmm0, 48(%rsp)
 ; CHECK-MINGW-F128-NEXT:    leaq 48(%rsp), %rax
-; CHECK-MINGW-F128-NEXT:    movq %rax, 32(%rsp)
-; CHECK-MINGW-F128-NEXT:    xorl %ecx, %ecx
+; CHECK-MINGW-F128-NEXT:    movq %rax, 40(%rsp)
+; CHECK-MINGW-F128-NEXT:    movq $0, 32(%rsp)
+; CHECK-MINGW-F128-NEXT:    leaq 64(%rsp), %rcx
 ; CHECK-MINGW-F128-NEXT:    xorl %edx, %edx
 ; CHECK-MINGW-F128-NEXT:    xorl %r8d, %r8d
 ; CHECK-MINGW-F128-NEXT:    xorl %r9d, %r9d
 ; CHECK-MINGW-F128-NEXT:    callq leading_args
-; CHECK-MINGW-F128-NEXT:    addq $72, %rsp
+; CHECK-MINGW-F128-NEXT:    addq $88, %rsp
 ; CHECK-MINGW-F128-NEXT:    retq
 ;
 ; CHECK-MINGW-I128-LABEL: call_leading_args:
@@ -831,21 +855,22 @@ define void @call_many_leading_args(PrimTy %x) nounwind {
 ;
 ; CHECK-MSVC64-F128-LABEL: call_many_leading_args:
 ; CHECK-MSVC64-F128:       # %bb.0:
-; CHECK-MSVC64-F128-NEXT:    subq $88, %rsp
+; CHECK-MSVC64-F128-NEXT:    subq $120, %rsp
 ; CHECK-MSVC64-F128-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-MSVC64-F128-NEXT:    xorps %xmm1, %xmm1
-; CHECK-MSVC64-F128-NEXT:    movaps %xmm1, 64(%rsp)
-; CHECK-MSVC64-F128-NEXT:    movaps %xmm0, 48(%rsp)
-; CHECK-MSVC64-F128-NEXT:    leaq 48(%rsp), %rax
+; CHECK-MSVC64-F128-NEXT:    movaps %xmm1, 80(%rsp)
+; CHECK-MSVC64-F128-NEXT:    leaq 80(%rsp), %rax
 ; CHECK-MSVC64-F128-NEXT:    movq %rax, 40(%rsp)
+; CHECK-MSVC64-F128-NEXT:    movaps %xmm0, 64(%rsp)
 ; CHECK-MSVC64-F128-NEXT:    leaq 64(%rsp), %rax
-; CHECK-MSVC64-F128-NEXT:    movq %rax, 32(%rsp)
-; CHECK-MSVC64-F128-NEXT:    xorl %ecx, %ecx
+; CHECK-MSVC64-F128-NEXT:    movq %rax, 48(%rsp)
+; CHECK-MSVC64-F128-NEXT:    movq $0, 32(%rsp)
+; CHECK-MSVC64-F128-NEXT:    leaq 96(%rsp), %rcx
 ; CHECK-MSVC64-F128-NEXT:    xorl %edx, %edx
 ; CHECK-MSVC64-F128-NEXT:    xorl %r8d, %r8d
 ; CHECK-MSVC64-F128-NEXT:    xorl %r9d, %r9d
 ; CHECK-MSVC64-F128-NEXT:    callq many_leading_args
-; CHECK-MSVC64-F128-NEXT:    addq $88, %rsp
+; CHECK-MSVC64-F128-NEXT:    addq $120, %rsp
 ; CHECK-MSVC64-F128-NEXT:    retq
 ;
 ; CHECK-MSVC64-I128-LABEL: call_many_leading_args:
@@ -865,21 +890,22 @@ define void @call_many_leading_args(PrimTy %x) nounwind {
 ;
 ; CHECK-MINGW-F128-LABEL: call_many_leading_args:
 ; CHECK-MINGW-F128:       # %bb.0:
-; CHECK-MINGW-F128-NEXT:    subq $88, %rsp
+; CHECK-MINGW-F128-NEXT:    subq $120, %rsp
 ; CHECK-MINGW-F128-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-MINGW-F128-NEXT:    xorps %xmm1, %xmm1
-; CHECK-MINGW-F128-NEXT:    movaps %xmm1, 64(%rsp)
-; CHECK-MINGW-F128-NEXT:    movaps %xmm0, 48(%rsp)
-; CHECK-MINGW-F128-NEXT:    leaq 48(%rsp), %rax
+; CHECK-MINGW-F128-NEXT:    movaps %xmm1, 80(%rsp)
+; CHECK-MINGW-F128-NEXT:    leaq 80(%rsp), %rax
 ; CHECK-MINGW-F128-NEXT:    movq %rax, 40(%rsp)
+; CHECK-MINGW-F128-NEXT:    movaps %xmm0, 64(%rsp)
 ; CHECK-MINGW-F128-NEXT:    leaq 64(%rsp), %rax
-; CHECK-MINGW-F128-NEXT:    movq %rax, 32(%rsp)
-; CHECK-MINGW-F128-NEXT:    xorl %ecx, %ecx
+; CHECK-MINGW-F128-NEXT:    movq %rax, 48(%rsp)
+; CHECK-MINGW-F128-NEXT:    movq $0, 32(%rsp)
+; CHECK-MINGW-F128-NEXT:    leaq 96(%rsp), %rcx
 ; CHECK-MINGW-F128-NEXT:    xorl %edx, %edx
 ; CHECK-MINGW-F128-NEXT:    xorl %r8d, %r8d
 ; CHECK-MINGW-F128-NEXT:    xorl %r9d, %r9d
 ; CHECK-MINGW-F128-NEXT:    callq many_leading_args
-; CHECK-MINGW-F128-NEXT:    addq $88, %rsp
+; CHECK-MINGW-F128-NEXT:    addq $120, %rsp
 ; CHECK-MINGW-F128-NEXT:    retq
 ;
 ; CHECK-MINGW-I128-LABEL: call_many_leading_args:
@@ -993,17 +1019,18 @@ define void @call_trailing_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-MSVC64-F128-LABEL: call_trailing_arg:
 ; CHECK-MSVC64-F128:       # %bb.0:
-; CHECK-MSVC64-F128-NEXT:    subq $72, %rsp
+; CHECK-MSVC64-F128-NEXT:    subq $88, %rsp
 ; CHECK-MSVC64-F128-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-MSVC64-F128-NEXT:    movaps %xmm0, 48(%rsp)
 ; CHECK-MSVC64-F128-NEXT:    leaq 48(%rsp), %rax
-; CHECK-MSVC64-F128-NEXT:    movq %rax, 32(%rsp)
-; CHECK-MSVC64-F128-NEXT:    xorl %ecx, %ecx
+; CHECK-MSVC64-F128-NEXT:    movq %rax, 40(%rsp)
+; CHECK-MSVC64-F128-NEXT:    movq $0, 32(%rsp)
+; CHECK-MSVC64-F128-NEXT:    leaq 64(%rsp), %rcx
 ; CHECK-MSVC64-F128-NEXT:    xorl %edx, %edx
 ; CHECK-MSVC64-F128-NEXT:    xorl %r8d, %r8d
 ; CHECK-MSVC64-F128-NEXT:    xorl %r9d, %r9d
 ; CHECK-MSVC64-F128-NEXT:    callq trailing_arg
-; CHECK-MSVC64-F128-NEXT:    addq $72, %rsp
+; CHECK-MSVC64-F128-NEXT:    addq $88, %rsp
 ; CHECK-MSVC64-F128-NEXT:    retq
 ;
 ; CHECK-MSVC64-I128-LABEL: call_trailing_arg:
@@ -1021,17 +1048,18 @@ define void @call_trailing_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-MINGW-F128-LABEL: call_trailing_arg:
 ; CHECK-MINGW-F128:       # %bb.0:
-; CHECK-MINGW-F128-NEXT:    subq $72, %rsp
+; CHECK-MINGW-F128-NEXT:    subq $88, %rsp
 ; CHECK-MINGW-F128-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-MINGW-F128-NEXT:    movaps %xmm0, 48(%rsp)
 ; CHECK-MINGW-F128-NEXT:    leaq 48(%rsp), %rax
-; CHECK-MINGW-F128-NEXT:    movq %rax, 32(%rsp)
-; CHECK-MINGW-F128-NEXT:    xorl %ecx, %ecx
+; CHECK-MINGW-F128-NEXT:    movq %rax, 40(%rsp)
+; CHECK-MINGW-F128-NEXT:    movq $0, 32(%rsp)
+; CHECK-MINGW-F128-NEXT:    leaq 64(%rsp), %rcx
 ; CHECK-MINGW-F128-NEXT:    xorl %edx, %edx
 ; CHECK-MINGW-F128-NEXT:    xorl %r8d, %r8d
 ; CHECK-MINGW-F128-NEXT:    xorl %r9d, %r9d
 ; CHECK-MINGW-F128-NEXT:    callq trailing_arg
-; CHECK-MINGW-F128-NEXT:    addq $72, %rsp
+; CHECK-MINGW-F128-NEXT:    addq $88, %rsp
 ; CHECK-MINGW-F128-NEXT:    retq
 ;
 ; CHECK-MINGW-I128-LABEL: call_trailing_arg:

>From 70e8db9f599fa870339d87619dba4227ac2043d0 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <[email protected]>
Date: Sat, 20 Jun 2026 12:46:08 +0200
Subject: [PATCH 2/8] add more nuance to GCC compat comment

---
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp 
b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index bce581ad7a48b..ba177c0cc63a1 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -670,13 +670,24 @@ bool X86TargetLowering::CanLowerReturn(
     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
     const Type *RetTy) const {
-  // Mingw64 GCC returns f128 via sret, which matches the documentation of the
-  // Windows x64 calling convention:
+  // Mingw64 GCC returns f128 via sret, and LLVM matches it for compatibility.
+  //
+  //
+  // Using sret is a reasonable implementation of the Windows x64 calling
+  // convention:
   //
   // 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170#return-values
   //
   // > Otherwise, the caller must allocate memory for the return value and pass
-  // a pointer to it as the first argument.
+  // > a pointer to it as the first argument.
+  //
+  // Although it is not the only reasonable interpretation:
+  //
+  // > Nonscalar types including floats, doubles, and vector types such as
+  // > __m128, __m128i, __m128d are returned in XMM0.
+  //
+  // For now, we prefer compatibility with GCC. If official guidelines are ever
+  // published, this can be revisited.
   //
   // Return false, which will perform sret demotion.
   if (Subtarget.isCallingConvWin64(CallConv) &&

>From 6f909ce71777e3d887c553373b71c1c1b1728bc6 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <[email protected]>
Date: Tue, 23 Jun 2026 01:08:52 +0200
Subject: [PATCH 3/8] try to use tablegen

---
 llvm/lib/Target/X86/X86CallingConv.td         |  34 +++++-
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp   |  33 ++----
 .../CodeGen/X86/fp128-return-calling-conv.ll  | 102 ++++++++++++++++++
 3 files changed, 141 insertions(+), 28 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/fp128-return-calling-conv.ll

diff --git a/llvm/lib/Target/X86/X86CallingConv.td 
b/llvm/lib/Target/X86/X86CallingConv.td
index 2e03f8996969b..5c7f48166b631 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -390,8 +390,8 @@ def RetCC_X86_Win64_C : CallingConv<[
 
 // X86-64 vectorcall return-value convention.
 def RetCC_X86_64_Vectorcall : CallingConv<[
-  // Vectorcall calling convention always returns FP values in XMMs.
-  CCIfType<[f32, f64, f128], 
+  // See RetCC_X86_64 for details on f128.
+  CCIfType<[f32, f64],
     CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
 
   // Otherwise, everything is the same as Windows X86-64 C CC.
@@ -469,6 +469,36 @@ def RetCC_X86_32 : CallingConv<[
 
 // This is the root return-value convention for the X86-64 backend.
 def RetCC_X86_64 : CallingConv<[
+  // Mingw64 GCC returns f128 via sret, and LLVM matches it for compatibility.
+  //
+  // Using sret is a reasonable implementation of the Windows x64 calling
+  // convention:
+  //
+  // 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170#return-values
+  //
+  // > Otherwise, the caller must allocate memory for the return value and pass
+  // > a pointer to it as the first argument.
+  //
+  // Although it is not the only reasonable interpretation:
+  //
+  // > Nonscalar types including floats, doubles, and vector types such as
+  // > __m128, __m128i, __m128d are returned in XMM0.
+  //
+  // For now, we prefer compatibility with GCC. If official guidelines are ever
+  // published, this can be revisited.
+  //
+  // The alignment of 1 is so the frame's alignment is not bumped.
+  CCIfType<[f128], CCIfCC<"CallingConv::Win64", CCAssignToStack<16, 1>>>,
+  CCIfType<[f128], CCIfSubtarget<"isTargetWin64()",
+    CCIfCC<"CallingConv::C", CCAssignToStack<16, 1>>>>,
+  CCIfType<[f128], CCIfSubtarget<"isTargetWin64()",
+    CCIfCC<"CallingConv::X86_VectorCall", CCAssignToStack<16, 1>>>>,
+  // UEFI also uses the Win64 CC.
+  CCIfType<[f128], CCIfSubtarget<"isTargetUEFI64()",
+    CCIfCC<"CallingConv::C", CCAssignToStack<16, 1>>>>,
+  CCIfType<[f128], CCIfSubtarget<"isTargetUEFI64()",
+    CCIfCC<"CallingConv::X86_VectorCall", CCAssignToStack<16, 1>>>>,
+
   // HiPE uses RetCC_X86_64_HiPE
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
 
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp 
b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index ba177c0cc63a1..556524b8af503 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -670,34 +670,15 @@ bool X86TargetLowering::CanLowerReturn(
     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
     const Type *RetTy) const {
-  // Mingw64 GCC returns f128 via sret, and LLVM matches it for compatibility.
-  //
-  //
-  // Using sret is a reasonable implementation of the Windows x64 calling
-  // convention:
-  //
-  // 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170#return-values
-  //
-  // > Otherwise, the caller must allocate memory for the return value and pass
-  // > a pointer to it as the first argument.
-  //
-  // Although it is not the only reasonable interpretation:
-  //
-  // > Nonscalar types including floats, doubles, and vector types such as
-  // > __m128, __m128i, __m128d are returned in XMM0.
-  //
-  // For now, we prefer compatibility with GCC. If official guidelines are ever
-  // published, this can be revisited.
-  //
-  // Return false, which will perform sret demotion.
-  if (Subtarget.isCallingConvWin64(CallConv) &&
-      llvm::any_of(
-          Outs, [](const ISD::OutputArg &Out) { return Out.VT == MVT::f128; }))
-    return false;
-
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
-  return CCInfo.CheckReturn(Outs, RetCC_X86);
+  if (!CCInfo.CheckReturn(Outs, RetCC_X86))
+    return false;
+
+  // Demotion to sret when the value must be returned via memory. This is the
+  // case for fp128 on windows.
+  return llvm::none_of(RVLocs,
+                       [](const CCValAssign &VA) { return VA.isMemLoc(); });
 }
 
 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const 
{
diff --git a/llvm/test/CodeGen/X86/fp128-return-calling-conv.ll 
b/llvm/test/CodeGen/X86/fp128-return-calling-conv.ll
new file mode 100644
index 0000000000000..6aa12e0aa1f8c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp128-return-calling-conv.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu  | FileCheck %s 
--check-prefix=WIN
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s 
--check-prefix=LINUX
+
+; On Windows, for calling conventions that have GCC compatibibility 
requirements,
+; follow the GCC convention of passing via the stack (an sret). Elsewhere we 
use
+; the more efficient approach of returning via XMM0.
+;
+; NOTE: f128 arguments are passed indirectly on Windows, but in XMM registers 
elsewhere.
+
+define fp128 @ret_ccc(fp128 %a) {
+; WIN-LABEL: ret_ccc:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movq %rcx, %rax
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rcx)
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_ccc:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}
+
+define win64cc fp128 @ret_win64(fp128 %a) {
+; WIN-LABEL: ret_win64:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movq %rcx, %rax
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rcx)
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_win64:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    movq %rcx, %rax
+; LINUX-NEXT:    movaps (%rdx), %xmm0
+; LINUX-NEXT:    movaps %xmm0, (%rcx)
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}
+
+; vectorcallcc symbol mangling confuses the test generation script.
+; The \01 ensures re-running the script does not remove the checks.
+define x86_vectorcallcc fp128 @"\01ret_vectorcall"(fp128 %a) {
+; WIN-LABEL: ret_vectorcall:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movq %rcx, %rax
+; WIN-NEXT:    movaps %xmm1, (%rcx)
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_vectorcall:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}
+
+define swiftcc fp128 @ret_swift(fp128 %a) {
+; WIN-LABEL: ret_swift:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_swift:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}
+
+define tailcc fp128 @ret_tail(fp128 %a) {
+; WIN-LABEL: ret_tail:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    retq $40
+;
+; LINUX-LABEL: ret_tail:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq $8
+  ret fp128 %a
+}
+
+define preserve_mostcc fp128 @ret_preserve_most(fp128 %a) {
+; WIN-LABEL: ret_preserve_most:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_preserve_most:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}
+
+define x86_regcallcc fp128 @ret_regcall(fp128 %a) {
+; WIN-LABEL: ret_regcall:
+; WIN:       # %bb.0:
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_regcall:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}

>From 404e2115796977647ba274717ce0d6dc5f9621bb Mon Sep 17 00:00:00 2001
From: Folkert de Vries <[email protected]>
Date: Tue, 23 Jun 2026 01:20:38 +0200
Subject: [PATCH 4/8] add relnotes entry

---
 llvm/docs/ReleaseNotes.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index df4ced91e8f5e..ca870cafe1817 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -284,6 +284,11 @@ Makes programs 10x faster by doing Special New Thing.
   two-register push in Windows x64 V3 unwind info. The directive takes two
   register operands: ``.seh_push2regs %r12, %r13``.
 
+* The `fp128` type is now returned via sret instead of in XMM0 for some
+  calling conventions, to match GCC on Windows. The C, Win64 and vectorcall
+  calling conventions now use sret; other calling conventions do not need to
+  be compatible with GCC and still return via XMM0.
+
 ### Changes to the OCaml bindings
 
 ### Changes to the Python bindings

>From c51d0c8d95f0cc0fdff7b0a10f5b84fae94c1f9a Mon Sep 17 00:00:00 2001
From: Folkert de Vries <[email protected]>
Date: Sat, 27 Jun 2026 14:03:45 +0200
Subject: [PATCH 5/8] Revert "try to use tablegen"

This reverts commit 6f909ce71777e3d887c553373b71c1c1b1728bc6.
---
 llvm/lib/Target/X86/X86CallingConv.td         |  34 +-----
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp   |  33 ++++--
 .../CodeGen/X86/fp128-return-calling-conv.ll  | 102 ------------------
 3 files changed, 28 insertions(+), 141 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/fp128-return-calling-conv.ll

diff --git a/llvm/lib/Target/X86/X86CallingConv.td 
b/llvm/lib/Target/X86/X86CallingConv.td
index 5c7f48166b631..2e03f8996969b 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -390,8 +390,8 @@ def RetCC_X86_Win64_C : CallingConv<[
 
 // X86-64 vectorcall return-value convention.
 def RetCC_X86_64_Vectorcall : CallingConv<[
-  // See RetCC_X86_64 for details on f128.
-  CCIfType<[f32, f64],
+  // Vectorcall calling convention always returns FP values in XMMs.
+  CCIfType<[f32, f64, f128], 
     CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
 
   // Otherwise, everything is the same as Windows X86-64 C CC.
@@ -469,36 +469,6 @@ def RetCC_X86_32 : CallingConv<[
 
 // This is the root return-value convention for the X86-64 backend.
 def RetCC_X86_64 : CallingConv<[
-  // Mingw64 GCC returns f128 via sret, and LLVM matches it for compatibility.
-  //
-  // Using sret is a reasonable implementation of the Windows x64 calling
-  // convention:
-  //
-  // 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170#return-values
-  //
-  // > Otherwise, the caller must allocate memory for the return value and pass
-  // > a pointer to it as the first argument.
-  //
-  // Although it is not the only reasonable interpretation:
-  //
-  // > Nonscalar types including floats, doubles, and vector types such as
-  // > __m128, __m128i, __m128d are returned in XMM0.
-  //
-  // For now, we prefer compatibility with GCC. If official guidelines are ever
-  // published, this can be revisited.
-  //
-  // The alignment of 1 is so the frame's alignment is not bumped.
-  CCIfType<[f128], CCIfCC<"CallingConv::Win64", CCAssignToStack<16, 1>>>,
-  CCIfType<[f128], CCIfSubtarget<"isTargetWin64()",
-    CCIfCC<"CallingConv::C", CCAssignToStack<16, 1>>>>,
-  CCIfType<[f128], CCIfSubtarget<"isTargetWin64()",
-    CCIfCC<"CallingConv::X86_VectorCall", CCAssignToStack<16, 1>>>>,
-  // UEFI also uses the Win64 CC.
-  CCIfType<[f128], CCIfSubtarget<"isTargetUEFI64()",
-    CCIfCC<"CallingConv::C", CCAssignToStack<16, 1>>>>,
-  CCIfType<[f128], CCIfSubtarget<"isTargetUEFI64()",
-    CCIfCC<"CallingConv::X86_VectorCall", CCAssignToStack<16, 1>>>>,
-
   // HiPE uses RetCC_X86_64_HiPE
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
 
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp 
b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 556524b8af503..ba177c0cc63a1 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -670,15 +670,34 @@ bool X86TargetLowering::CanLowerReturn(
     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
     const Type *RetTy) const {
-  SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
-  if (!CCInfo.CheckReturn(Outs, RetCC_X86))
+  // Mingw64 GCC returns f128 via sret, and LLVM matches it for compatibility.
+  //
+  //
+  // Using sret is a reasonable implementation of the Windows x64 calling
+  // convention:
+  //
+  // 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170#return-values
+  //
+  // > Otherwise, the caller must allocate memory for the return value and pass
+  // > a pointer to it as the first argument.
+  //
+  // Although it is not the only reasonable interpretation:
+  //
+  // > Nonscalar types including floats, doubles, and vector types such as
+  // > __m128, __m128i, __m128d are returned in XMM0.
+  //
+  // For now, we prefer compatibility with GCC. If official guidelines are ever
+  // published, this can be revisited.
+  //
+  // Return false, which will perform sret demotion.
+  if (Subtarget.isCallingConvWin64(CallConv) &&
+      llvm::any_of(
+          Outs, [](const ISD::OutputArg &Out) { return Out.VT == MVT::f128; }))
     return false;
 
-  // Demotion to sret when the value must be returned via memory. This is the
-  // case for fp128 on windows.
-  return llvm::none_of(RVLocs,
-                       [](const CCValAssign &VA) { return VA.isMemLoc(); });
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_X86);
 }
 
 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const 
{
diff --git a/llvm/test/CodeGen/X86/fp128-return-calling-conv.ll 
b/llvm/test/CodeGen/X86/fp128-return-calling-conv.ll
deleted file mode 100644
index 6aa12e0aa1f8c..0000000000000
--- a/llvm/test/CodeGen/X86/fp128-return-calling-conv.ll
+++ /dev/null
@@ -1,102 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu  | FileCheck %s 
--check-prefix=WIN
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s 
--check-prefix=LINUX
-
-; On Windows, for calling conventions that have GCC compatibibility 
requirements,
-; follow the GCC convention of passing via the stack (an sret). Elsewhere we 
use
-; the more efficient approach of returning via XMM0.
-;
-; NOTE: f128 arguments are passed indirectly on Windows, but in XMM registers 
elsewhere.
-
-define fp128 @ret_ccc(fp128 %a) {
-; WIN-LABEL: ret_ccc:
-; WIN:       # %bb.0:
-; WIN-NEXT:    movq %rcx, %rax
-; WIN-NEXT:    movaps (%rdx), %xmm0
-; WIN-NEXT:    movaps %xmm0, (%rcx)
-; WIN-NEXT:    retq
-;
-; LINUX-LABEL: ret_ccc:
-; LINUX:       # %bb.0:
-; LINUX-NEXT:    retq
-  ret fp128 %a
-}
-
-define win64cc fp128 @ret_win64(fp128 %a) {
-; WIN-LABEL: ret_win64:
-; WIN:       # %bb.0:
-; WIN-NEXT:    movq %rcx, %rax
-; WIN-NEXT:    movaps (%rdx), %xmm0
-; WIN-NEXT:    movaps %xmm0, (%rcx)
-; WIN-NEXT:    retq
-;
-; LINUX-LABEL: ret_win64:
-; LINUX:       # %bb.0:
-; LINUX-NEXT:    movq %rcx, %rax
-; LINUX-NEXT:    movaps (%rdx), %xmm0
-; LINUX-NEXT:    movaps %xmm0, (%rcx)
-; LINUX-NEXT:    retq
-  ret fp128 %a
-}
-
-; vectorcallcc symbol mangling confuses the test generation script.
-; The \01 ensures re-running the script does not remove the checks.
-define x86_vectorcallcc fp128 @"\01ret_vectorcall"(fp128 %a) {
-; WIN-LABEL: ret_vectorcall:
-; WIN:       # %bb.0:
-; WIN-NEXT:    movq %rcx, %rax
-; WIN-NEXT:    movaps %xmm1, (%rcx)
-; WIN-NEXT:    retq
-;
-; LINUX-LABEL: ret_vectorcall:
-; LINUX:       # %bb.0:
-; LINUX-NEXT:    retq
-  ret fp128 %a
-}
-
-define swiftcc fp128 @ret_swift(fp128 %a) {
-; WIN-LABEL: ret_swift:
-; WIN:       # %bb.0:
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    retq
-;
-; LINUX-LABEL: ret_swift:
-; LINUX:       # %bb.0:
-; LINUX-NEXT:    retq
-  ret fp128 %a
-}
-
-define tailcc fp128 @ret_tail(fp128 %a) {
-; WIN-LABEL: ret_tail:
-; WIN:       # %bb.0:
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    retq $40
-;
-; LINUX-LABEL: ret_tail:
-; LINUX:       # %bb.0:
-; LINUX-NEXT:    retq $8
-  ret fp128 %a
-}
-
-define preserve_mostcc fp128 @ret_preserve_most(fp128 %a) {
-; WIN-LABEL: ret_preserve_most:
-; WIN:       # %bb.0:
-; WIN-NEXT:    movaps (%rcx), %xmm0
-; WIN-NEXT:    retq
-;
-; LINUX-LABEL: ret_preserve_most:
-; LINUX:       # %bb.0:
-; LINUX-NEXT:    retq
-  ret fp128 %a
-}
-
-define x86_regcallcc fp128 @ret_regcall(fp128 %a) {
-; WIN-LABEL: ret_regcall:
-; WIN:       # %bb.0:
-; WIN-NEXT:    retq
-;
-; LINUX-LABEL: ret_regcall:
-; LINUX:       # %bb.0:
-; LINUX-NEXT:    retq
-  ret fp128 %a
-}

>From 1d4dea5b00f2f673e86db943c37c125e605eef31 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <[email protected]>
Date: Sat, 27 Jun 2026 14:15:21 +0200
Subject: [PATCH 6/8] restrict what CCs now use sret for f128

---
 .../CodeGen/X86/fp128-return-calling-conv.ll  | 102 ++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/fp128-return-calling-conv.ll

diff --git a/llvm/test/CodeGen/X86/fp128-return-calling-conv.ll 
b/llvm/test/CodeGen/X86/fp128-return-calling-conv.ll
new file mode 100644
index 0000000000000..6aa12e0aa1f8c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp128-return-calling-conv.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu  | FileCheck %s 
--check-prefix=WIN
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s 
--check-prefix=LINUX
+
+; On Windows, for calling conventions that have GCC compatibility 
requirements,
+; follow the GCC convention of returning via the stack (an sret). Elsewhere we 
use
+; the more efficient approach of returning via XMM0.
+;
+; NOTE: f128 arguments are passed indirectly on Windows, but in XMM registers 
elsewhere.
+
+define fp128 @ret_ccc(fp128 %a) {
+; WIN-LABEL: ret_ccc:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movq %rcx, %rax
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rcx)
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_ccc:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}
+
+define win64cc fp128 @ret_win64(fp128 %a) {
+; WIN-LABEL: ret_win64:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movq %rcx, %rax
+; WIN-NEXT:    movaps (%rdx), %xmm0
+; WIN-NEXT:    movaps %xmm0, (%rcx)
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_win64:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    movq %rcx, %rax
+; LINUX-NEXT:    movaps (%rdx), %xmm0
+; LINUX-NEXT:    movaps %xmm0, (%rcx)
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}
+
+; vectorcallcc symbol mangling confuses the test generation script.
+; The \01 ensures re-running the script does not remove the checks.
+define x86_vectorcallcc fp128 @"\01ret_vectorcall"(fp128 %a) {
+; WIN-LABEL: ret_vectorcall:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movq %rcx, %rax
+; WIN-NEXT:    movaps %xmm1, (%rcx)
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_vectorcall:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}
+
+define swiftcc fp128 @ret_swift(fp128 %a) {
+; WIN-LABEL: ret_swift:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_swift:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}
+
+define tailcc fp128 @ret_tail(fp128 %a) {
+; WIN-LABEL: ret_tail:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    retq $40
+;
+; LINUX-LABEL: ret_tail:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq $8
+  ret fp128 %a
+}
+
+define preserve_mostcc fp128 @ret_preserve_most(fp128 %a) {
+; WIN-LABEL: ret_preserve_most:
+; WIN:       # %bb.0:
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_preserve_most:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}
+
+define x86_regcallcc fp128 @ret_regcall(fp128 %a) {
+; WIN-LABEL: ret_regcall:
+; WIN:       # %bb.0:
+; WIN-NEXT:    retq
+;
+; LINUX-LABEL: ret_regcall:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    retq
+  ret fp128 %a
+}

>From 4cb0cacdf6423d96489a5a8289e738cfe42dbb59 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <[email protected]>
Date: Sat, 27 Jun 2026 14:36:30 +0200
Subject: [PATCH 7/8] WIP

---
 llvm/lib/Target/X86/X86CallingConv.td       |  2 +-
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86CallingConv.td 
b/llvm/lib/Target/X86/X86CallingConv.td
index 2e03f8996969b..dfe4e9b922faf 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -391,7 +391,7 @@ def RetCC_X86_Win64_C : CallingConv<[
 // X86-64 vectorcall return-value convention.
 def RetCC_X86_64_Vectorcall : CallingConv<[
   // Vectorcall calling convention always returns FP values in XMMs.
-  CCIfType<[f32, f64, f128], 
+  CCIfType<[f32, f64],
     CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
 
   // Otherwise, everything is the same as Windows X86-64 C CC.
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp 
b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index ba177c0cc63a1..a65bcc666557f 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -690,7 +690,19 @@ bool X86TargetLowering::CanLowerReturn(
   // published, this can be revisited.
   //
   // Return false, which will perform sret demotion.
-  if (Subtarget.isCallingConvWin64(CallConv) &&
+  auto IsWin64F128StackCC = [this](CallingConv::ID CC) -> bool {
+    switch (CC) {
+    case CallingConv::Win64:
+      return true;
+    case CallingConv::C:
+    case CallingConv::X86_VectorCall:
+      return Subtarget.isTargetWin64() || Subtarget.isTargetUEFI64();
+    default:
+      return false;
+    }
+  };
+
+  if (IsWin64F128StackCC(CallConv) &&
       llvm::any_of(
           Outs, [](const ISD::OutputArg &Out) { return Out.VT == MVT::f128; }))
     return false;

>From 9055ee81c23b24b4efcfbce70c32f9d64dcd90f4 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <[email protected]>
Date: Sat, 27 Jun 2026 14:36:51 +0200
Subject: [PATCH 8/8] make clang explicitly use sret to return f128 on windows

---
 clang/lib/CodeGen/Targets/X86.cpp           | 8 ++++++--
 clang/test/CodeGen/win-fp128.c              | 4 ++--
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 3 ++-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp 
b/clang/lib/CodeGen/Targets/X86.cpp
index 77c912b021604..b49ee331c0152 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3452,8 +3452,12 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
         return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
             llvm::Type::getInt64Ty(getVMContext()), 2));
 
-      // Mingw64 GCC returns f128 via sret. Clang matches that for
-      // compatibility.
+      // Mingw64 GCC returns f128 via sret, and Clang matches that for
+      // compatibility. RegCall is excluded: it returns f128 in a vector
+      // register, matching the X86 backend's calling-convention lowering.
+      if (BT->getKind() == BuiltinType::Float128 && !IsRegCall)
+        return getNaturalAlignIndirect(Ty, 
getDataLayout().getAllocaAddrSpace(),
+                                       /*ByVal=*/false);
       break;
 
     default:
diff --git a/clang/test/CodeGen/win-fp128.c b/clang/test/CodeGen/win-fp128.c
index dc144f899fa4f..efc24e2ea0d63 100644
--- a/clang/test/CodeGen/win-fp128.c
+++ b/clang/test/CodeGen/win-fp128.c
@@ -3,10 +3,10 @@
 // __float128 is unsupported on MSVC
 
 __float128 fp128_ret(void) { return 0; }
-// CHECK-GNU64: define dso_local fp128 @fp128_ret()
+// CHECK-GNU64: define dso_local void @fp128_ret(ptr dead_on_unwind noalias 
writable sret(fp128) align 16 %agg.result)
 
 __float128 fp128_args(__float128 a, __float128 b) { return a * b; }
-// CHECK-GNU64: define dso_local fp128 @fp128_args(ptr noundef dead_on_return 
%0, ptr noundef dead_on_return %1)
+// CHECK-GNU64: define dso_local void @fp128_args(ptr dead_on_unwind noalias 
writable sret(fp128) align 16 %agg.result, ptr noundef dead_on_return %0, ptr 
noundef dead_on_return %1)
 
 void fp128_vararg(int a, ...) {
   // CHECK-GNU64-LABEL: define dso_local void @fp128_vararg
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp 
b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index a65bcc666557f..cc8f541406520 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -671,7 +671,8 @@ bool X86TargetLowering::CanLowerReturn(
     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
     const Type *RetTy) const {
   // Mingw64 GCC returns f128 via sret, and LLVM matches it for compatibility.
-  //
+  // This logic exists for libcalls; a frontend should explicitly use sret
+  // rather than rely on the sret demotion here.
   //
   // Using sret is a reasonable implementation of the Windows x64 calling
   // convention:

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [X86][Windows] Return `fp128` on the stack (PR #204887)

Reply via email to