https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95796

            Bug ID: 95796
           Summary: Inlining works between functions with the same target
                    attribute but not target_clones
           Product: gcc
           Version: 10.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: ipa
          Assignee: unassigned at gcc dot gnu.org
          Reporter: yyc1992 at gmail dot com
                CC: marxin at gcc dot gnu.org
  Target Milestone: ---

If two functions with the same target attribute call each other, GCC can
inline one into the other (although sometimes incorrectly; see PR95790). This
can be shown with the following code (all compiled with `g++ -O2 -S
-fno-exceptions -fno-asynchronous-unwind-tables`).

```
__attribute__ ((target ("default")))
static unsigned foo()
{
  return 1;
}

__attribute__ ((target ("avx")))
static unsigned foo() {
  return 1;
}

__attribute__ ((target ("default")))
unsigned bar()
{
    return foo();
}

__attribute__ ((target ("avx")))
unsigned bar()
{
    return foo();
}
```

which is compiled to

```
        .text
        .p2align 4
        .globl  _Z3barv
        .type   _Z3barv, @function
_Z3barv:
        movl    $1, %eax
        ret
        .size   _Z3barv, .-_Z3barv
        .p2align 4
        .globl  _Z3barv.avx
        .type   _Z3barv.avx, @function
_Z3barv.avx:
        movl    $1, %eax
        ret
        .size   _Z3barv.avx, .-_Z3barv.avx
```

OTOH, the equivalent code using `target_clones`

```
__attribute__ ((target_clones ("default,avx")))
static unsigned foo()
{
  return 1;
}

__attribute__ ((target_clones ("default,avx")))
unsigned bar()
{
    return foo();
}
```

compiles to

```
        .text
        .p2align 4
        .type   _ZL3foov.default.1, @function
_ZL3foov.default.1:
        movl    $1, %eax
        ret
        .size   _ZL3foov.default.1, .-_ZL3foov.default.1
        .p2align 4
        .type   _Z3barv.default.1, @function
_Z3barv.default.1:
        jmp     _ZL3foov.default.1
        .size   _Z3barv.default.1, .-_Z3barv.default.1
        .p2align 4
        .type   _ZL3foov.avx.0, @function
_ZL3foov.avx.0:
        movl    $1, %eax
        ret
        .size   _ZL3foov.avx.0, .-_ZL3foov.avx.0
        .p2align 4
        .type   _Z3barv.avx.0, @function
_Z3barv.avx.0:
        jmp     _ZL3foov.avx.0
        .size   _Z3barv.avx.0, .-_Z3barv.avx.0
        .section        .text._Z3barv.resolver,"axG",@progbits,_Z3barv.resolver,comdat
        .p2align 4
        .weak   _Z3barv.resolver
        .type   _Z3barv.resolver, @function
_Z3barv.resolver:
        subq    $8, %rsp
        call    __cpu_indicator_init@PLT
        movq    __cpu_model@GOTPCREL(%rip), %rax
        leaq    _Z3barv.avx.0(%rip), %rdx
        testb   $2, 13(%rax)
        leaq    _Z3barv.default.1(%rip), %rax
        cmovne  %rdx, %rax
        addq    $8, %rsp
        ret
        .size   _Z3barv.resolver, .-_Z3barv.resolver
        .globl  _Z3barv
        .type   _Z3barv, @gnu_indirect_function
        .set    _Z3barv,_Z3barv.resolver
        .text
        .p2align 4
        .type   _ZL3foov.resolver, @function
_ZL3foov.resolver:
        subq    $8, %rsp
        call    __cpu_indicator_init@PLT
        movq    __cpu_model@GOTPCREL(%rip), %rax
        leaq    _ZL3foov.avx.0(%rip), %rdx
        testb   $2, 13(%rax)
        leaq    _ZL3foov.default.1(%rip), %rax
        cmovne  %rdx, %rax
        addq    $8, %rsp
        ret
        .size   _ZL3foov.resolver, .-_ZL3foov.resolver
```

instead. This only eliminates the indirect call; it does not inline `foo` into
`bar`. (Note that the useless resolver for `foo` is PR95779.) I believe the two
versions should behave the same.

Ref PR95778 (PLT elimination)
Ref PR71990 (similar title, but a different issue: IIUC that one is about
inlining the dispatcher itself, not about calls that can already be statically
dispatched. It is also not specific to target_clones, as this one is.)

Reply via email to