https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119193

            Bug ID: 119193
           Summary: Suboptimal packing codegen
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

Example source:

#include <stdint.h>

/* Pack the low 8 bits of eight 64-bit values into one 64-bit word:
 * a lands in the least-significant byte, h in the most-significant.
 * h needs no explicit mask — the shift by 56 leaves only its low
 * 8 bits inside the 64-bit result.
 * (Line continuations restored: the original report's macro was
 * wrapped by the mailer and would not compile as pasted.) */
#define PACK_8_TO_64( a, b, c, d, e, f, g, h )\
    (((uint64_t)a&0xFF) | ((uint64_t)(b&0xFF)<<8) | ((uint64_t)(c&0xFF)<<16) | ((uint64_t)(d&0xFF)<<24)\
    | ((uint64_t)(e&0xFF)<<32) | ((uint64_t)(f&0xFF)<<40) | ((uint64_t)(g&0xFF)<<48) | ((uint64_t)h<<56))

/* Reproduction entry point for the missed-optimization report:
 * returns the eight arguments packed byte-wise into one uint64_t. */
uint64_t
pack (uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e, uint64_t f,
      uint64_t g, uint64_t h)
{
  return PACK_8_TO_64 (a, b, c, d, e, f, g, h);
}

GCC for aarch64 at -O2 generates:
pack:
        ubfiz   x5, x5, 40, 8
        ubfiz   x6, x6, 48, 8
        ubfiz   x4, x4, 32, 8
        ubfiz   x1, x1, 8, 8
        orr     x4, x4, x5
        orr     x7, x6, x7, lsl 56
        lsl     w3, w3, 24
        orr     x4, x4, x1
        ubfiz   x2, x2, 16, 8
        orr     x7, x7, x3
        and     x0, x0, 255
        orr     x4, x4, x2
        orr     x0, x7, x0
        orr     x0, x4, x0
        ret

but Clang does better by using the bitfield insert instructions:
pack:
        and     x8, x0, #0xff
        bfi     x8, x1, #8, #8
        bfi     x8, x2, #16, #8
        bfi     x8, x3, #24, #8
        bfi     x8, x4, #32, #8
        bfi     x8, x5, #40, #8
        bfi     x8, x6, #48, #8
        orr     x0, x8, x7, lsl #56
        ret

Reply via email to