diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 3436820..0f145dd 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1783,18 +1783,26 @@ struct processor_costs atom_cost = {
   /* stringop_algs for memcpy.
      SSE loops work best on Atom, but fall back to the non-SSE unrolled
      loop variant if that fails.  */
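+  /* Each {max, alg} pair selects ALG for sizes up to MAX bytes; the
+     pairs are tried in order, and -1 stands for "any size".  */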
-  {{{libcall, {{4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment.  */
-    {libcall, {{4096, unrolled_loop}, {-1, libcall}}}},
-   {{libcall, {{2048, unrolled_loop}, {-1, libcall}}}, /* Unknown alignment.  */
-    {libcall, {{2048, unrolled_loop},
+  {{{libcall, {{64, loop}, {4096, sse_loop}, {4096, unrolled_loop},
+	       {-1, libcall}}},	/* Known alignment.  */
+    {libcall, {{64, loop}, {4096, sse_loop}, {4096, unrolled_loop},
+	       {-1, libcall}}}},
+   {{libcall, {{64, loop}, {2048, sse_loop}, {2048, unrolled_loop},
+	       {-1, libcall}}},	/* Unknown alignment.  */
+    {libcall, {{64, loop}, {2048, sse_loop}, {2048, unrolled_loop},
 	       {-1, libcall}}}}},
 
   /* stringop_algs for memset.  */
-  {{{libcall, {{4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment.  */
-    {libcall, {{4096, unrolled_loop}, {-1, libcall}}}},
-   {{libcall, {{1024, unrolled_loop},	 /* Unknown alignment.  */
+  {{{libcall, {{64, loop}, {4096, sse_loop}, {4096, unrolled_loop},
+	       {-1, libcall}}},	/* Known alignment.  */
+    {libcall, {{64, loop}, {4096, sse_loop}, {4096, unrolled_loop},
+	       {-1, libcall}}}},
+   {{libcall, {{64, loop}, {1024, sse_loop},	/* Unknown alignment.  */
+	       {1024, unrolled_loop},
 	       {-1, libcall}}},
-    {libcall, {{2048, unrolled_loop},
+    {libcall, {{64, loop}, {2048, sse_loop}, {2048, unrolled_loop},
 	       {-1, libcall}}}}},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -22327,6 +22327,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
   enum machine_mode move_mode;
   rtx loop_iter = NULL_RTX;
   int dst_offset, src_offset;
+  int remainder_size = 0;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22356,6 +22357,11 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
 		   || src_offset < 0
 		   || src_offset != dst_offset);
   alg = decide_alg (count, expected_size, false, &dynamic_check, align_unknown);
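+  /* When the alignment is unknown at compile time, no runtime alignment
+     prologue is emitted (see below), so an SSE loop would have to use
+     unaligned accesses; prefer the unrolled GPR loop instead.  */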
+  if (align_unknown && alg == sse_loop)
+    alg = unrolled_loop;
   desired_align = decide_alignment (align, alg, expected_size);
   if (align_unknown)
     desired_align = align;
@@ -22408,7 +22411,9 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
 	while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
 	       && unroll_factor < 4)
 	  unroll_factor *= 2;
-      size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
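+      /* Anything bigger than a single move is now finished by a
+	 non-unrolled loop before the byte epilogue runs.  */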
+      size_needed = GET_MODE_SIZE (move_mode);
       break;
     case rep_prefix_8_byte:
       size_needed = 8;
@@ -22469,12 +22472,6 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
 	}
       else
 	{
-	  /* SSE and unrolled algs re-use iteration counter in the epilogue.  */
-	  if (alg == sse_loop || alg == unrolled_loop)
-	    {
-	      loop_iter = gen_reg_rtx (counter_mode (count_exp));
-              emit_move_insn (loop_iter, const0_rtx);
-	    }
 	  label = gen_label_rtx ();
 	  emit_cmp_and_jump_insns (count_exp,
 				   GEN_INT (epilogue_size_needed),
@@ -22584,16 +22581,57 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
 				     count_exp, Pmode, 1, expected_size);
       break;
-    case sse_loop:
     case unrolled_loop:
-      /* In some cases we want to use the same iterator in several adjacent
-	 loops, so here we save loop iterator rtx and don't update addresses.  */
+      expand_set_or_movmem_via_loop_with_iter (dst, src, destreg, srcreg, NULL,
+					       count_exp, NULL_RTX, move_mode,
+					       unroll_factor, expected_size, true);
+      break;
+    case sse_loop:
+      /* We want to reuse the same iterator in the adjacent remainder loop
+	 below, so save its rtx here and do not update the addresses yet.  */
+      loop_iter = gen_reg_rtx (counter_mode (count_exp));
+      emit_move_insn (loop_iter, const0_rtx);
       loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, src, destreg,
 							   srcreg, NULL,
 							   count_exp, loop_iter,
 							   move_mode,
 							   unroll_factor,
 							   expected_size, false);
+
+      /* The addresses have not been updated yet, so update them now.
+	 Also, if the epilogue would be big, generate a non-unrolled loop
+	 for it.  This matters when the alignment is statically unknown:
+	 the epilogue would otherwise have to move the remaining bytes one
+	 by one, which is very slow.  */
+      remainder_size = GET_MODE_SIZE (move_mode) * unroll_factor;
+      if (count && desired_align <= align)
+	remainder_size = count % remainder_size;
+      /* We may not need the epilogue loop at all when the count is known
+	 and the alignment is not adjusted.  */
+      if (remainder_size > GET_MODE_SIZE (move_mode))
+	{
+	  /* Reduce the epilogue's size by emitting a non-unrolled loop.
+	     Without it the epilogue can get very big: with statically
+	     unknown alignment it would move byte by byte, which is slow.  */
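+	  /* E.g., with 16-byte SSE moves unrolled 4x, a 100-byte copy
+	     leaves 100 % 64 = 36 bytes: this loop copies 32 of them and
+	     the byte epilogue handles only the last 4.  */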
+	  loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, src, destreg,
+	      srcreg, NULL, count_exp,
+	      loop_iter, move_mode, 1,
+	      expected_size, false);
+	}
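+      /* Advance the source and destination pointers past everything the
+	 loops above have already copied.  */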
+      tmp = expand_simple_binop (Pmode, PLUS, destreg, loop_iter, destreg,
+			       true, OPTAB_LIB_WIDEN);
+      if (tmp != destreg)
+	emit_move_insn (destreg, tmp);
+
+      tmp = expand_simple_binop (Pmode, PLUS, srcreg, loop_iter, srcreg,
+			       true, OPTAB_LIB_WIDEN);
+      if (tmp != srcreg)
+	emit_move_insn (srcreg, tmp);
       break;
     case rep_prefix_8_byte:
       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
@@ -22644,43 +22679,6 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
       LABEL_NUSES (label) = 1;
     }
 
-  /* We haven't updated addresses, so we'll do it now.
-     Also, if the epilogue seems to be big, we'll generate a loop (not
-     unrolled) in it.  We'll do it only if alignment is unknown, because in
-     this case in epilogue we have to perform memmove by bytes, which is very
-     slow.  */
-  if (alg == sse_loop || alg == unrolled_loop)
-    {
-      rtx tmp;
-      int remainder_size = epilogue_size_needed;
-
-      /* We may not need the epilgoue loop at all when the count is known
-	 and alignment is not adjusted.  */
-      if (count && desired_align <= align)
-	remainder_size = count % epilogue_size_needed;
-      if (remainder_size > 31)
-	{
-	  /* Reduce epilogue's size by creating not-unrolled loop.  If we won't
-	     do this, we can have very big epilogue - when alignment is statically
-	     unknown we'll have the epilogue byte by byte which may be very slow.  */
-	  loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, src, destreg,
-	      srcreg, NULL, count_exp,
-	      loop_iter, move_mode, 1,
-	      expected_size, false);
-	  src = change_address (src, BLKmode, srcreg);
-	  dst = change_address (dst, BLKmode, destreg);
-	  epilogue_size_needed = GET_MODE_SIZE (move_mode);
-	}
-      tmp = expand_simple_binop (Pmode, PLUS, destreg, loop_iter, destreg,
-			       true, OPTAB_LIB_WIDEN);
-      if (tmp != destreg)
-	emit_move_insn (destreg, tmp);
-
-      tmp = expand_simple_binop (Pmode, PLUS, srcreg, loop_iter, srcreg,
-			       true, OPTAB_LIB_WIDEN);
-      if (tmp != srcreg)
-	emit_move_insn (srcreg, tmp);
-    }
   if (count_exp != const0_rtx && epilogue_size_needed > 1)
     expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
 			    epilogue_size_needed);
@@ -22855,6 +22853,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
   enum machine_mode move_mode;
   rtx loop_iter = NULL_RTX;
   bool early_jump = false;
+  int remainder_size = 0;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22923,7 +22922,9 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
 	while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
 	       && unroll_factor < 4)
 	  unroll_factor *= 2;
-      size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
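+      /* As for memcpy: leftover whole moves are finished by a
+	 non-unrolled loop, so the epilogue threshold is one move.  */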
+      size_needed = GET_MODE_SIZE (move_mode);
       break;
     case rep_prefix_8_byte:
       size_needed = 8;
@@ -22994,12 +22993,6 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
 	}
       else
 	{
-	  /* SSE and unrolled_lopo algs re-use iteration counter in the epilogue.  */
-	  if (alg == sse_loop || alg == unrolled_loop)
-	    {
-	      loop_iter = gen_reg_rtx (counter_mode (count_exp));
-              emit_move_insn (loop_iter, const0_rtx);
-	    }
 	  label = gen_label_rtx ();
 	  early_jump = true;
 	  emit_cmp_and_jump_insns (count_exp,
@@ -23105,20 +23098,40 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
 				     count_exp, Pmode, 1, expected_size);
       break;
     case unrolled_loop:
-      loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
-				     NULL, gpr_promoted_val, count_exp,
-				     loop_iter, move_mode, unroll_factor,
-				     expected_size, false);
-      break;
     case sse_loop:
-      vec_promoted_val =
-	promote_duplicated_reg_to_size (gpr_promoted_val,
+      loop_iter = gen_reg_rtx (counter_mode (count_exp));
+      emit_move_insn (loop_iter, const0_rtx);
+      gcc_assert (gpr_promoted_val);
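+      /* The SSE loop stores a full vector per iteration, so the promoted
+	 value must first be widened from word size to the vector size.  */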
+      if (alg == sse_loop)
+	vec_promoted_val =
+	  promote_duplicated_reg_to_size (gpr_promoted_val,
-					GET_MODE_SIZE (move_mode),
-					GET_MODE_SIZE (move_mode), align);
+					  GET_MODE_SIZE (move_mode),
+					  GET_MODE_SIZE (move_mode), align);
       loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
-				     NULL, vec_promoted_val, count_exp,
-				     loop_iter, move_mode, unroll_factor,
-				     expected_size, false);
+		    NULL,  (alg == sse_loop ? vec_promoted_val : gpr_promoted_val),
+		    count_exp, loop_iter, move_mode, unroll_factor,
+		    expected_size, false);
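+      /* Compute an upper bound on the bytes the main loop leaves unset.  */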
+      remainder_size = GET_MODE_SIZE (move_mode) * unroll_factor;
+      if (count && desired_align <= align)
+	remainder_size = count % remainder_size;
+      if (alg == sse_loop && remainder_size >= GET_MODE_SIZE (move_mode))
+	{
+	  /* Reduce the epilogue's size by emitting a non-unrolled loop.
+	     Without it the epilogue can get very big: with statically
+	     unknown alignment it would store byte by byte, which is slow.  */
+	  loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
+	      NULL, (alg == sse_loop ? vec_promoted_val : gpr_promoted_val),
+	      count_exp, loop_iter, move_mode, 1, expected_size, false);
+	}
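+      /* Advance the destination pointer past the bytes already set.  */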
+      loop_iter = ix86_zero_extend_to_Pmode (loop_iter);
+      tmp = expand_simple_binop (Pmode, PLUS, destreg, loop_iter, destreg,
+			       true, OPTAB_LIB_WIDEN);
+      if (tmp != destreg)
+	emit_move_insn (destreg, tmp);
       break;
     case rep_prefix_8_byte:
       gcc_assert (TARGET_64BIT);
@@ -23142,7 +23151,6 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
     dst = change_address (dst, BLKmode, destreg);
 
   /* Step 4: Epilogue to copy the remaining bytes.  */
-
   if (label)
     {
       /* When the main loop is done, COUNT_EXP might hold original count,
@@ -23167,32 +23175,6 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
         gpr_promoted_val = 0;
     }
  epilogue:
-  if (alg == unrolled_loop || alg == sse_loop)
-    {
-      rtx tmp;
-      int remainder_size = epilogue_size_needed;
-      if (count && desired_align <= align)
-	remainder_size = count % epilogue_size_needed;
-      /* We may not need the epilgoue loop at all when the count is known
-	 and alignment is not adjusted.  */
-      if (remainder_size > 31 
-	  && (alg == sse_loop ? vec_promoted_val : gpr_promoted_val))
-	{
-	  /* Reduce epilogue's size by creating not-unrolled loop.  If we won't
-	     do this, we can have very big epilogue - when alignment is statically
-	     unknown we'll have the epilogue byte by byte which may be very slow.  */
-	  loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
-	      NULL, (alg == sse_loop ? vec_promoted_val : gpr_promoted_val), count_exp,
-	      loop_iter, move_mode, 1,
-	      expected_size, false);
-	  dst = change_address (dst, BLKmode, destreg);
-	  epilogue_size_needed = GET_MODE_SIZE (move_mode);
-	}
-      tmp = expand_simple_binop (Pmode, PLUS, destreg, loop_iter, destreg,
-			       true, OPTAB_LIB_WIDEN);
-      if (tmp != destreg)
-	emit_move_insn (destreg, tmp);
-    }
   if (count_exp == const0_rtx || epilogue_size_needed <= 1)
     ;
   else if (!gpr_promoted_val)
