diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
new file mode 100644
index 0000000..3181c5f
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -0,0 +1,57 @@
+#ifdef __UINT32_TYPE__
+typedef __UINT32_TYPE__ uint32_t;
+#else /* Fall back to unsigned; main () skips the test if not 32 bits.  */
+typedef unsigned uint32_t;
+#endif
+
+struct bitfield { /* Four 7-bit fields: none spans a whole unsigned char.  */
+  unsigned char f0:7;
+  unsigned char f1:7;
+  unsigned char f2:7;
+  unsigned char f3:7;
+};
+
+struct ok { /* Four whole-byte fields, used to initialize the union.  */
+  unsigned char f0;
+  unsigned char f1;
+  unsigned char f2;
+  unsigned char f3;
+};
+
+union bf_or_uint32 { /* Same storage as whole bytes or as bit-fields.  */
+  struct ok inval;
+  struct bitfield bfval;
+};
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_le32 (union bf_or_uint32 in)
+{ /* Combine the 7-bit fields with f0 in the low byte of the result.  */
+  return in.bfval.f0 | (in.bfval.f1 << 8)
+	 | (in.bfval.f2 << 16) | (in.bfval.f3 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_be32 (union bf_or_uint32 in)
+{ /* Combine the 7-bit fields with f0 in the high byte of the result.  */
+  return in.bfval.f3 | (in.bfval.f2 << 8)
+	 | (in.bfval.f1 << 16) | (in.bfval.f0 << 24);
+}
+
+int
+main ()
+{
+  union bf_or_uint32 bfin;
+  uint32_t out;
+
+  if (sizeof (uint32_t) * __CHAR_BIT__ != 32)
+    return 0;
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  /* Layout is ABI-specific: only reject what a whole-byte read would give.  */
+  out = partial_read_le32 (bfin);
+  if (out == 0x89878583)
+    __builtin_abort ();
+  out = partial_read_be32 (bfin);
+  if (out == 0x83858789)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
new file mode 100644
index 0000000..a419c8d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap64 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+
+#include <stdint.h>
+
+struct uint64_st { /* Eight unsigned char fields, u0 declared first.  */
+  unsigned char u0, u1, u2, u3, u4, u5, u6, u7;
+};
+
+uint32_t read_aux (void *, uint32_t); /* Declared only; not defined here.  */
+
+uint64_t read_le64_1 (void)
+{
+  unsigned char data[8];
+  /* Little-endian assembly: data[0] supplies the least significant byte.  */
+  read_aux (data, 8);
+  return (uint64_t) data[0] | ((uint64_t) data[1] << 8)
+	 | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24)
+	 | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40)
+	 | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56);
+}
+
+uint64_t read_le64_2 (void)
+{
+  struct uint64_st data;
+  /* Little-endian assembly: field u0 supplies the least significant byte.  */
+  read_aux (&data, 8);
+  return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8)
+	 | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24)
+	 | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40)
+	 | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56);
+}
+
+uint64_t read_be64_1 (void)
+{
+  unsigned char data[8];
+  /* Big-endian assembly: data[0] supplies the most significant byte.  */
+  read_aux (data, 8);
+  return (uint64_t) data[7] | ((uint64_t) data[6] << 8)
+	 | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24)
+	 | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40)
+	 | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56);
+}
+
+uint64_t read_be64_2 (void)
+{
+  struct uint64_st data;
+  /* Big-endian assembly: field u0 supplies the most significant byte.  */
+  read_aux (&data, 8);
+  return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8)
+	 | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24)
+	 | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40)
+	 | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56);
+}
+
+/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
new file mode 100644
index 0000000..fe4d1ad
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap16 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint16_st { /* Two unsigned char fields, u0 declared first.  */
+  unsigned char u0, u1;
+};
+
+uint32_t read_aux (void *, uint32_t); /* Declared only; not defined here.  */
+
+uint32_t read_le16_1 (void)
+{
+  unsigned char data[2];
+  /* Little-endian assembly: data[0] supplies the least significant byte.  */
+  read_aux (data, 2);
+  return data[0] | (data[1] << 8);
+}
+
+uint32_t read_le16_2 (void)
+{
+  struct uint16_st data;
+  /* Little-endian assembly: field u0 supplies the least significant byte.  */
+  read_aux (&data, 2);
+  return data.u0 | (data.u1 << 8);
+}
+
+uint32_t read_be16_1 (void)
+{
+  unsigned char data[2];
+  /* Big-endian assembly: data[0] supplies the most significant byte.  */
+  read_aux (data, 2);
+  return data[1] | (data[0] << 8);
+}
+
+uint32_t read_be16_2 (void)
+{
+  struct uint16_st data;
+  /* Big-endian assembly: field u0 supplies the most significant byte.  */
+  read_aux (&data, 2);
+  return data.u1 | (data.u0 << 8);
+}
+
+/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
new file mode 100644
index 0000000..d3006c5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint32_st { /* Four unsigned char fields, u0 declared first.  */
+  unsigned char u0, u1, u2, u3;
+};
+
+uint32_t read_aux (void *, uint32_t); /* Declared only; not defined here.  */
+
+uint32_t read_le32_1 (void)
+{
+  unsigned char data[4];
+  /* Little-endian assembly: data[0] supplies the least significant byte.  */
+  read_aux (data, 4);
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+uint32_t read_le32_2 (void)
+{
+  struct uint32_st data;
+  /* Little-endian assembly: field u0 supplies the least significant byte.  */
+  read_aux (&data, 4);
+  return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24);
+}
+
+uint32_t read_be32_1 (void)
+{
+  unsigned char data[4];
+  /* Big-endian assembly: data[0] supplies the most significant byte.  */
+  read_aux (data, 4);
+  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+}
+
+uint32_t read_be32_2 (void)
+{
+  struct uint32_st data;
+  /* Big-endian assembly: field u0 supplies the most significant byte.  */
+  read_aux (&data, 4);
+  return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24);
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 9ff857c..576502a 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -1616,11 +1616,21 @@ make_pass_cse_sincos (gcc::context *ctxt)
 
    0    - byte has the value 0
    1..size - byte contains the content of the byte
-   number indexed with that value minus one  */
+   number indexed with that value minus one.
+
+   To detect permutations on memory sources (arrays and structures), a symbolic
+   number also records a base address (the array or structure the load is made
+   from), an offset from that base address and a range giving the number of
+   bytes of memory this symbolic number represents.  The range differs from
+   size because size reflects the size of the type of the current expression;
+   size can thus be bigger than range after a cast to a wider type.  */
 
 struct symbolic_number {
   unsigned HOST_WIDEST_INT n;
   int size;
+  tree base_addr;		/* Array or structure loaded from, or NULL_TREE.  */
+  tree offset;			/* Offset of the load from base_addr.  */
+  unsigned HOST_WIDE_INT range;	/* Number of bytes of memory covered.  */
 };
 
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
@@ -1733,6 +1743,51 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	 to initialize the symbolic number.  */
       if (!source_expr1)
 	{
+	  n->base_addr = n->offset = NULL_TREE;
+	  if (is_gimple_assign (rhs1_stmt))
+	    {
+	      unsigned bit_offset;
+	      tree var, elt_size, index, field, bit_offset_expr;
+
+	      var = gimple_assign_rhs1 (rhs1_stmt);
+	      switch (TREE_CODE (var))
+		{
+		/* leaf node is an array, memorize its base, element size and
+		   offset from base to compare to other array leaf node.  */
+		case ARRAY_RANGE_REF:
+		case ARRAY_REF:
+		  n->base_addr = TREE_OPERAND (var, 0);
+		  elt_size = array_ref_element_size (var);
+		  if (TREE_CODE (elt_size) != INTEGER_CST)
+		    return NULL_TREE;
+		  index = TREE_OPERAND (var, 1);
+		  if (TREE_THIS_VOLATILE (var) || TREE_THIS_VOLATILE (index))
+		    return NULL_TREE;
+		  n->offset = fold_build2 (MULT_EXPR, sizetype,
+					   index, elt_size);
+		  rhs1 = var;
+		  break;
+		/* Leaf node is a record field.  Reject volatile accesses as for
+		   arrays, then memorize its base and byte offset from it.  */
+		case COMPONENT_REF:
+		  n->base_addr = TREE_OPERAND (var, 0);
+		  field = TREE_OPERAND (var, 1);
+		  bit_offset = TREE_INT_CST_LOW (DECL_FIELD_BIT_OFFSET (field));
+		  if (TREE_THIS_VOLATILE (var) || bit_offset % BITS_PER_UNIT)
+		    return NULL_TREE;
+		  bit_offset /= BITS_PER_UNIT;
+		  n->offset = component_ref_field_offset (var);
+		  bit_offset_expr = build_int_cst (TREE_TYPE (n->offset),
+						   bit_offset);
+		  n->offset = fold_build2 (PLUS_EXPR, TREE_TYPE (n->offset),
+					  n->offset, bit_offset_expr);
+		  if (TYPE_PRECISION (TREE_TYPE (field)) != BITS_PER_UNIT)
+		    return NULL_TREE;
+		  rhs1 = var;
+		  break;
+		default:;
+		}
+	    }
 	  /* Set up the symbolic number N by setting each byte to a
 	     value between 1 and the byte size of rhs1.  The highest
 	     order byte is set to n->size and the lowest order
@@ -1741,6 +1796,7 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  if (n->size % BITS_PER_UNIT != 0)
 	    return NULL_TREE;
 	  n->size /= BITS_PER_UNIT;
+	  n->range = n->size;
 	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
 		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
 
@@ -1824,10 +1880,69 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
 	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
 
-	  if (source_expr1 != source_expr2
-	      || n1.size != n2.size)
+	  if (!source_expr2 || n1.size != n2.size) /* n2 valid only if found.  */
 	    return NULL_TREE;
 
+	  if (source_expr1 != source_expr2)
+	    {
+	      tree off_sub_expr, off_cmp_expr;
+	      unsigned HOST_WIDEST_INT inc, mask;
+	      unsigned i;
+	      HOST_WIDE_INT off_sub;
+	      struct symbolic_number *n_ptr;
+
+	      if (!n1.base_addr || !n2.base_addr
+		  || n1.base_addr != n2.base_addr)
+		return NULL_TREE;
+	      /* Order both loads by offset, if known at compile time.  */
+	      off_cmp_expr = fold_build2 (LT_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (TREE_CODE (off_cmp_expr) != INTEGER_CST)
+		return NULL_TREE;
+
+	      /* Swap n1 and n2 so that n1 is the load at the lower offset.  */
+	      if (TREE_INT_CST_LOW (off_cmp_expr))
+		{
+		  struct symbolic_number tmpn;
+
+		  tmpn = n2;
+		  n2 = n1;
+		  n1 = tmpn;
+		  source_expr1 = source_expr2;
+		}
+
+	      off_sub_expr = fold_build2 (MINUS_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (!cst_and_fits_in_hwi (off_sub_expr))
+		return NULL_TREE;
+
+	      off_sub = TREE_INT_CST_LOW (off_sub_expr);
+
+	      /* Check that the memory range covered fits in the biggest int.  */
+	      if (off_sub + n2.range > (int)sizeof (HOST_WIDEST_INT))
+		return NULL_TREE;
+	      n->range = n2.range + off_sub;
+
+	      /* Reinterpret byte marks in symbolic number holding the value of
+		 bigger weight according to target endianness.  */
+	      inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
+	      mask = 0xFF;
+	      if (BYTES_BIG_ENDIAN)
+		n_ptr = &n1;
+	      else
+		n_ptr = &n2;
+	      for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8,
+		   mask <<= 8)
+		{
+		  if (n_ptr->n & mask)
+		    n_ptr->n += inc;
+		}
+	    }
+	  else
+	    n->range = n1.range;
+
+	  n->base_addr = n1.base_addr;
+	  n->offset = n1.offset;
 	  n->size = n1.size;
 	  for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
 	    {
@@ -1853,14 +1968,15 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 }
 
 /* Check if STMT completes a bswap implementation consisting of ORs,
-   SHIFTs and ANDs.  Return the source tree expression on which the
-   byte swap is performed and NULL if no bswap was found.  */
+   SHIFTs and ANDs.  If so, set *MEMSRC to whether the source lies in
+   memory, set *SIZE to the size in bits of the value to byte swap and
+   return the source tree expression; return NULL_TREE otherwise.  */
 
 static tree
-find_bswap (gimple stmt)
+find_bswap (gimple stmt, bool *memsrc, int *size)
 {
 /* The number which the find_bswap result should match in order to
-   have a full byte swap.  The number is shifted to the left according
+   have a full byte swap.  The number is shifted to the right according
    to the size of the symbolic number before using it.  */
   unsigned HOST_WIDEST_INT cmp =
     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
@@ -1868,12 +1984,12 @@ find_bswap (gimple stmt)
 
   struct symbolic_number n;
   tree source_expr;
-  int limit;
+  int limit, rsize;
 
   /* The last parameter determines the depth search limit.  It usually
      correlates directly to the number of bytes to be touched.  We
      increase that number by three  here in order to also
-     cover signed -> unsigned converions of the src operand as can be seen
+     cover signed -> unsigned conversions of the src operand as can be seen
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
@@ -1885,18 +2001,34 @@ find_bswap (gimple stmt)
   /* Zero out the extra bits of N and CMP.  */
   if (n.size < (int)sizeof (HOST_WIDEST_INT))
     {
+      unsigned HOST_WIDEST_INT tmpn;	/* Wide enough to hold n.n.  */
       unsigned HOST_WIDEST_INT mask =
 	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
 
       n.n &= mask;
-      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
+      /* Find real size of result (highest non zero byte).  */
+      for (tmpn = n.n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
+      cmp >>= (sizeof (HOST_WIDEST_INT) - rsize) * BITS_PER_UNIT;
     }
+  else
+    rsize = n.size;
 
   /* A complete byte swap should make the symbolic number to start
      with the largest digit in the highest order byte.  */
   if (cmp != n.n)
     return NULL_TREE;
 
+  *memsrc = 0;
+
+  if (n.base_addr)
+    {
+      *memsrc = 1;
+      n.size = rsize;
+    }
+  else if (rsize != n.size)
+    return NULL_TREE;
+
+  *size = n.size * BITS_PER_UNIT;
   return source_expr;
 }
 
@@ -1961,21 +2093,26 @@ execute_optimize_bswap (void)
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
 	  gimple stmt = gsi_stmt (gsi);
-	  tree bswap_src, bswap_type;
+	  tree bswap_src, bswap_type, load_type;
 	  tree bswap_tmp;
 	  tree fndecl = NULL_TREE;
 	  int type_size;
+	  bool memsrc;
 	  gimple call;
 
 	  if (!is_gimple_assign (stmt)
 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
+	  bswap_src = find_bswap (stmt, &memsrc, &type_size);
+
+	  if (!bswap_src)
+	    continue;
 
 	  switch (type_size)
 	    {
 	    case 16:
+	      load_type = uint16_type_node;
 	      if (bswap16_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
@@ -1983,6 +2120,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 32:
+	      load_type = uint32_type_node;
 	      if (bswap32_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
@@ -1990,6 +2128,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 64:
+	      load_type = uint64_type_node;
 	      if (bswap64_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
@@ -2003,10 +2142,35 @@ execute_optimize_bswap (void)
 	  if (!fndecl)
 	    continue;
 
-	  bswap_src = find_bswap (stmt);
 
-	  if (!bswap_src)
-	    continue;
+	  /* The source lies in memory: load it into an SSA name first.  */
+	  if (memsrc)
+	    {
+	      tree addr_expr, addr_tmp, val_expr, val_tmp;
+	      tree load_ptr_type, load_offset_ptr;
+	      gimple addr_stmt, load_stmt;
+
+	      changed = true;
+
+	      /* Take the address of the source, typed as a pointer to the
+		 integer type whose size matches the load.  */
+	      load_ptr_type = build_pointer_type (load_type);
+	      addr_expr = build1 (ADDR_EXPR, load_ptr_type, bswap_src);
+	      addr_tmp = make_temp_ssa_name (load_ptr_type, NULL, "load_src");
+	      addr_stmt = gimple_build_assign_with_ops
+			 (NOP_EXPR, addr_tmp, addr_expr, NULL);
+	      gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
+	      /* Perform the load in a single access of LOAD_TYPE.
+		 NOTE(review): source alignment is not checked here; confirm.  */
+	      load_offset_ptr = build_int_cst (load_ptr_type, 0);
+	      val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	      val_expr = build2 (MEM_REF, load_type, addr_tmp, load_offset_ptr);
+	      load_stmt = gimple_build_assign_with_ops
+			 (MEM_REF, val_tmp, val_expr, NULL);
+
+	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
+	      bswap_src = val_tmp;	/* The bswap now reads the loaded value.  */
+	    }
 
 	  changed = true;
 	  if (type_size == 16)
