https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91103
Jakub Jelinek <jakub at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |hjl.tools at gmail dot com, | |jakub at gcc dot gnu.org --- Comment #3 from Jakub Jelinek <jakub at gcc dot gnu.org> --- For the constant vector element extraction, it can be done say with: --- gcc/config/i386/sse.md.jj 2019-07-06 23:55:51.617641994 +0200 +++ gcc/config/i386/sse.md 2019-07-08 12:23:13.315509840 +0200 @@ -9351,7 +9351,7 @@ (define_insn "avx512f_sgetexp<mode><mask [(set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) -(define_insn "<mask_codefor><avx512>_align<mode><mask_name>" +(define_insn "<avx512>_align<mode><mask_name>" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (unspec:VI48_AVX512VL [(match_operand:VI48_AVX512VL 1 "register_operand" "v") (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") --- gcc/config/i386/i386-expand.c.jj 2019-07-04 00:18:37.067010375 +0200 +++ gcc/config/i386/i386-expand.c 2019-07-08 12:37:24.687562956 +0200 @@ -14827,6 +14827,14 @@ ix86_expand_vector_extract (bool mmx_ok, break; case E_V16SFmode: + if (elt > 12) + { + tmp = gen_reg_rtx (V16SImode); + vec = gen_lowpart (V16SImode, vec); + emit_insn (gen_avx512f_alignv16si (tmp, vec, vec, GEN_INT (elt))); + vec = gen_lowpart (V16SFmode, tmp); + elt = 0; + } tmp = gen_reg_rtx (V8SFmode); if (elt < 8) emit_insn (gen_vec_extract_lo_v16sf (tmp, vec)); @@ -14836,6 +14844,14 @@ ix86_expand_vector_extract (bool mmx_ok, return; case E_V8DFmode: + if (elt >= 6) + { + tmp = gen_reg_rtx (V8DImode); + vec = gen_lowpart (V8DImode, vec); + emit_insn (gen_avx512f_alignv8di (tmp, vec, vec, GEN_INT (elt))); + vec = gen_lowpart (V8DFmode, tmp); + elt = 0; + } tmp = gen_reg_rtx (V4DFmode); if (elt < 4) emit_insn (gen_vec_extract_lo_v8df (tmp, vec)); @@ -14845,6 +14861,13 @@ ix86_expand_vector_extract (bool mmx_ok, return; case E_V16SImode: + if (elt > 12) + { + tmp = gen_reg_rtx (V16SImode); + emit_insn 
(gen_avx512f_alignv16si (tmp, vec, vec, GEN_INT (elt))); + vec = tmp; + elt = 0; + } tmp = gen_reg_rtx (V8SImode); if (elt < 8) emit_insn (gen_vec_extract_lo_v16si (tmp, vec)); @@ -14854,6 +14877,13 @@ ix86_expand_vector_extract (bool mmx_ok, return; case E_V8DImode: + if (elt >= 6) + { + tmp = gen_reg_rtx (V8DImode); + emit_insn (gen_avx512f_alignv8di (tmp, vec, vec, GEN_INT (elt))); + vec = tmp; + elt = 0; + } tmp = gen_reg_rtx (V4DImode); if (elt < 4) emit_insn (gen_vec_extract_lo_v8di (tmp, vec)); The question is in which cases this is beneficial. From a pure -Os point of view, valignd/valignq is one instruction, and integer extractions need a vmovd afterwards; so for 64-bit extraction it might also be useful for double [3] and [5] (for long long it is two insns in both cases), and for 32-bit extraction it is likely also shorter for float [5], [6], [7], [9], [10], [11], [12], but not for int. But I admit I have no idea how fast each of these sequences is.