https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101185
--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
Alloc order is just another kind of cost, which can be compensated for by increasing the cost of mask->integer and integer->mask moves. With the patch below, pr96814 wouldn't generate any mask instructions except for

	kmovd	%eax, %k1
	vpcmpeqd	%ymm1, %ymm1, %ymm1
	vmovdqu8	%ymm1, %ymm0{%k1}{z}

which is what we want.

modified   gcc/config/i386/i386.md
@@ -1335,7 +1335,7 @@ (define_insn "*cmp<mode>_ccz_1"
   [(set (reg FLAGS_REG)
 	(compare
 	  (match_operand:SWI1248_AVX512BWDQ_64 0
-	    "nonimmediate_operand" "<r>,?m<r>,$k")
+	    "nonimmediate_operand" "<r>,?m<r>,*k")
 	  (match_operand:SWI1248_AVX512BWDQ_64 1 "const0_operand")))]
   "TARGET_AVX512F && ix86_match_ccmode (insn, CCZmode)"
   "@
modified   gcc/config/i386/x86-tune-costs.h
@@ -2768,7 +2768,7 @@ struct processor_costs intel_cost = {
   {6, 6, 6, 6, 6},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
   4, 4,				/* SSE->integer and integer->SSE moves */
-  4, 4,				/* mask->integer and integer->mask moves */
+  6, 6,				/* mask->integer and integer->mask moves */
   {4, 4, 4},				/* cost of loading mask register
 					   in QImode, HImode, SImode.  */
   {6, 6, 6},				/* cost if storing mask register
@@ -2882,7 +2882,7 @@ struct processor_costs generic_cost = {
   {6, 6, 6, 10, 15},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
   6, 6,				/* SSE->integer and integer->SSE moves */
-  6, 6,				/* mask->integer and integer->mask moves */
+  8, 8,				/* mask->integer and integer->mask moves */
   {6, 6, 6},				/* cost of loading mask register
 					   in QImode, HImode, SImode.  */
   {6, 6, 6},				/* cost if storing mask register

So would the solution of increasing the cost of mask->integer and integer->mask moves by one more unit (or maybe more), as compensation for changing the alloc order, be acceptable to you? Or do you insist on reverting the x86_order_regs_for_local_alloc part?