Hello! The attached patch teaches GCC how to merge an unaligned load (UNSPEC_MOVU) with the pcmpistr/pcmpestr instructions.
2012-06-18  Uros Bizjak  <ubiz...@gmail.com>

	PR target/53712
	* config/i386/sse.md (*sse4_2_pcmpestr_unaligned): New.
	(*sse4_2_pcmpistr_unaligned): New.

testsuite/ChangeLog:

2012-06-18  Uros Bizjak  <ubiz...@gmail.com>

	PR target/53712
	* gcc.target/i386/pr53712.c: New test.

Bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}, also with a follow-up lex.c patch that replaces the asm with builtins. Committed to mainline SVN.

Uros.
Index: testsuite/gcc.target/i386/pr53712.c =================================================================== --- testsuite/gcc.target/i386/pr53712.c (revision 0) +++ testsuite/gcc.target/i386/pr53712.c (working copy) @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse4.2" } */ + +typedef char v16qi __attribute__ ((__vector_size__ (16))); + +int test (const char *s1, const char *s2) +{ + v16qi s1chars = __builtin_ia32_loaddqu ((const char *) s2); + v16qi s2chars = __builtin_ia32_loaddqu ((const char *) s1); + return __builtin_ia32_pcmpistri128 (s1chars, s2chars, 0); +} + +/* { dg-final { scan-assembler-times "movdqu" 1 } } */ Index: config/i386/sse.md =================================================================== --- config/i386/sse.md (revision 188736) +++ config/i386/sse.md (working copy) @@ -9679,6 +9679,68 @@ (set_attr "memory" "none,load") (set_attr "mode" "TI")]) +(define_insn_and_split "*sse4_2_pcmpestr_unaligned" + [(set (match_operand:SI 0 "register_operand" "=c") + (unspec:SI + [(match_operand:V16QI 2 "reg_not_xmm0_operand" "x") + (match_operand:SI 3 "register_operand" "a") + (unspec:V16QI + [(match_operand:V16QI 4 "memory_operand" "m")] + UNSPEC_MOVU) + (match_operand:SI 5 "register_operand" "d") + (match_operand:SI 6 "const_0_to_255_operand" "n")] + UNSPEC_PCMPESTR)) + (set (match_operand:V16QI 1 "register_operand" "=Yz") + (unspec:V16QI + [(match_dup 2) + (match_dup 3) + (unspec:V16QI [(match_dup 4)] UNSPEC_MOVU) + (match_dup 5) + (match_dup 6)] + UNSPEC_PCMPESTR)) + (set (reg:CC FLAGS_REG) + (unspec:CC + [(match_dup 2) + (match_dup 3) + (unspec:V16QI [(match_dup 4)] UNSPEC_MOVU) + (match_dup 5) + (match_dup 6)] + UNSPEC_PCMPESTR))] + "TARGET_SSE4_2 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + int ecx = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[0])); + int xmm0 = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[1])); + int flags = !find_regno_note (curr_insn, REG_UNUSED, FLAGS_REG); + 
+ if (ecx) + emit_insn (gen_sse4_2_pcmpestri (operands[0], operands[2], + operands[3], operands[4], + operands[5], operands[6])); + if (xmm0) + emit_insn (gen_sse4_2_pcmpestrm (operands[1], operands[2], + operands[3], operands[4], + operands[5], operands[6])); + if (flags && !(ecx || xmm0)) + emit_insn (gen_sse4_2_pcmpestr_cconly (NULL, NULL, + operands[2], operands[3], + operands[4], operands[5], + operands[6])); + if (!(flags || ecx || xmm0)) + emit_note (NOTE_INSN_DELETED); + + DONE; +} + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "load") + (set_attr "mode" "TI")]) + (define_insn "sse4_2_pcmpestri" [(set (match_operand:SI 0 "register_operand" "=c,c") (unspec:SI @@ -9809,6 +9871,59 @@ (set_attr "memory" "none,load") (set_attr "mode" "TI")]) +(define_insn_and_split "*sse4_2_pcmpistr_unaligned" + [(set (match_operand:SI 0 "register_operand" "=c") + (unspec:SI + [(match_operand:V16QI 2 "reg_not_xmm0_operand" "x") + (unspec:V16QI + [(match_operand:V16QI 3 "memory_operand" "m")] + UNSPEC_MOVU) + (match_operand:SI 4 "const_0_to_255_operand" "n")] + UNSPEC_PCMPISTR)) + (set (match_operand:V16QI 1 "register_operand" "=Yz") + (unspec:V16QI + [(match_dup 2) + (unspec:V16QI [(match_dup 3)] UNSPEC_MOVU) + (match_dup 4)] + UNSPEC_PCMPISTR)) + (set (reg:CC FLAGS_REG) + (unspec:CC + [(match_dup 2) + (unspec:V16QI [(match_dup 3)] UNSPEC_MOVU) + (match_dup 4)] + UNSPEC_PCMPISTR))] + "TARGET_SSE4_2 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + int ecx = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[0])); + int xmm0 = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[1])); + int flags = !find_regno_note (curr_insn, REG_UNUSED, FLAGS_REG); + + if (ecx) + emit_insn (gen_sse4_2_pcmpistri (operands[0], operands[2], + operands[3], operands[4])); + if (xmm0) + emit_insn (gen_sse4_2_pcmpistrm (operands[1], operands[2], + operands[3], 
operands[4])); + if (flags && !(ecx || xmm0)) + emit_insn (gen_sse4_2_pcmpistr_cconly (NULL, NULL, + operands[2], operands[3], + operands[4])); + if (!(flags || ecx || xmm0)) + emit_note (NOTE_INSN_DELETED); + + DONE; +} + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "load") + (set_attr "mode" "TI")]) + (define_insn "sse4_2_pcmpistri" [(set (match_operand:SI 0 "register_operand" "=c,c") (unspec:SI