diff --git a/libgfortran/io/file_pos.c b/libgfortran/io/file_pos.c
index c8ecc3a..bf2250a 100644
--- a/libgfortran/io/file_pos.c
+++ b/libgfortran/io/file_pos.c
@@ -140,15 +140,21 @@ unformatted_backspace (st_parameter_filepos *fpp, gfc_unit *u)
 	}
       else
 	{
+	  uint32_t u32;
+	  uint64_t u64;
 	  switch (length)
 	    {
 	    case sizeof(GFC_INTEGER_4):
-	      reverse_memcpy (&m4, p, sizeof (m4));
+	      memcpy (&u32, p, sizeof (u32));
+	      u32 = __builtin_bswap32 (u32);
+	      m4 = *(GFC_INTEGER_4*)&u32;
 	      m = m4;
 	      break;
 
 	    case sizeof(GFC_INTEGER_8):
-	      reverse_memcpy (&m8, p, sizeof (m8));
+	      memcpy (&u64, p, sizeof (u64));
+	      u64 = __builtin_bswap64 (u64);
+	      m8 = *(GFC_INTEGER_8*)&u64;
 	      m = m8;
 	      break;
 
diff --git a/libgfortran/io/io.h b/libgfortran/io/io.h
index 43aeafd..f17de19 100644
--- a/libgfortran/io/io.h
+++ b/libgfortran/io/io.h
@@ -649,9 +649,6 @@ internal_proto(init_loop_spec);
 extern void next_record (st_parameter_dt *, int);
 internal_proto(next_record);
 
-extern void reverse_memcpy (void *, const void *, size_t);
-internal_proto (reverse_memcpy);
-
 extern void st_wait (st_parameter_wait *);
 export_proto(st_wait);
 
diff --git a/libgfortran/io/transfer.c b/libgfortran/io/transfer.c
index 6dda1df..eb77df8a 100644
--- a/libgfortran/io/transfer.c
+++ b/libgfortran/io/transfer.c
@@ -878,50 +878,91 @@ write_buf (st_parameter_dt *dtp, void *buf, size_t nbytes)
 }
 
 
-/* Master function for unformatted reads.  */
+/* Reverse memcpy - used for byte swapping.  */
 
 static void
-unformatted_read (st_parameter_dt *dtp, bt type,
-		  void *dest, int kind, size_t size, size_t nelems)
+reverse_memcpy (void *dest, const void *src, size_t n)
 {
-  if (likely (dtp->u.p.current_unit->flags.convert == GFC_CONVERT_NATIVE)
-      || kind == 1)
+  char *d, *s;
+  size_t i;
+
+  d = (char *) dest;
+  s = (char *) src + n - 1;
+
+  /* Write with ascending order - this is likely faster
+     on modern architectures because of write combining.  */
+  for (i=0; i<n; i++)
+      *(d++) = *(s--);
+}
+
+
+/* Utility function for byteswapping an array, using the bswap
+   builtins if possible. dest and src can overlap.  */
+
+static void
+bswap_array (void *dest, const void *src, size_t size, size_t nelems)
+{
+  char buffer[16];
+  const char *ps; 
+  char *pd;
+
+  switch (size)
     {
-      if (type == BT_CHARACTER)
-	size *= GFC_SIZE_OF_CHAR_KIND(kind);
-      read_block_direct (dtp, dest, size * nelems);
+    case 1:
+      break;
+    case 2:
+      for (size_t i = 0; i < nelems; i++)
+	((uint16_t*)dest)[i] = __builtin_bswap16 (((uint16_t*)src)[i]);
+      break;
+    case 4:
+      for (size_t i = 0; i < nelems; i++)
+	((uint32_t*)dest)[i] = __builtin_bswap32 (((uint32_t*)src)[i]);
+      break;
+    case 8:
+      for (size_t i = 0; i < nelems; i++)
+	((uint64_t*)dest)[i] = __builtin_bswap64 (((uint64_t*)src)[i]);
+      break;
+    default:
+      ps = src;
+      pd = dest;
+      for (size_t i = 0; i < nelems; i++)
+	{
+	  reverse_memcpy (buffer, ps, size);
+	  memcpy (pd, buffer, size);
+	  ps += size;
+	  pd += size;
+	}
     }
-  else
-    {
-      char buffer[16];
-      char *p;
-      size_t i;
+}
 
-      p = dest;
 
+/* Master function for unformatted reads.  */
+
+static void
+unformatted_read (st_parameter_dt *dtp, bt type,
+		  void *dest, int kind, size_t size, size_t nelems)
+{
+  if (type == BT_CHARACTER)
+    size *= GFC_SIZE_OF_CHAR_KIND(kind);
+  read_block_direct (dtp, dest, size * nelems);
+
+  if (unlikely (dtp->u.p.current_unit->flags.convert == GFC_CONVERT_SWAP)
+      && kind != 1)
+    {
       /* Handle wide chracters.  */
-      if (type == BT_CHARACTER && kind != 1)
-	{
-	  nelems *= size;
-	  size = kind;
-	}
+      if (type == BT_CHARACTER)
+  	{
+  	  nelems *= size;
+  	  size = kind;
+  	}
 
       /* Break up complex into its constituent reals.  */
-      if (type == BT_COMPLEX)
-	{
-	  nelems *= 2;
-	  size /= 2;
-	}
-      
-      /* By now, all complex variables have been split into their
-	 constituent reals.  */
-      
-      for (i = 0; i < nelems; i++)
-	{
- 	  read_block_direct (dtp, buffer, size);
- 	  reverse_memcpy (p, buffer, size);
- 	  p += size;
- 	}
+      else if (type == BT_COMPLEX)
+  	{
+  	  nelems *= 2;
+  	  size /= 2;
+  	}
+      bswap_array (dest, dest, size, nelems);
     }
 }
 
@@ -945,9 +986,10 @@ unformatted_write (st_parameter_dt *dtp, bt type,
     }
   else
     {
-      char buffer[16];
+#define BSWAP_BUFSZ 512
+      char buffer[BSWAP_BUFSZ];
       char *p;
-      size_t i;
+      size_t nrem;
 
       p = source;
 
@@ -968,12 +1010,21 @@ unformatted_write (st_parameter_dt *dtp, bt type,
       /* By now, all complex variables have been split into their
 	 constituent reals.  */
 
-      for (i = 0; i < nelems; i++)
+      nrem = nelems;
+      do
 	{
-	  reverse_memcpy(buffer, p, size);
- 	  p += size;
-	  write_buf (dtp, buffer, size);
+	  size_t nc;
+	  if (size * nrem > BSWAP_BUFSZ)
+	    nc = BSWAP_BUFSZ / size;
+	  else
+	    nc = nrem;
+
+	  bswap_array (buffer, p, size, nc);
+	  write_buf (dtp, buffer, size * nc);
+	  p += size * nc;
+	  nrem -= nc;
 	}
+      while (nrem > 0);
     }
 }
 
@@ -2153,15 +2204,22 @@ us_read (st_parameter_dt *dtp, int continued)
 	}
     }
   else
+    {
+      uint32_t u32;
+      uint64_t u64;
       switch (nr)
 	{
 	case sizeof(GFC_INTEGER_4):
-	  reverse_memcpy (&i4, &i, sizeof (i4));
+	  memcpy (&u32, &i, sizeof (u32));
+	  u32 = __builtin_bswap32 (u32);
+	  i4 = *(GFC_INTEGER_4*)&u32;
 	  i = i4;
 	  break;
 
 	case sizeof(GFC_INTEGER_8):
-	  reverse_memcpy (&i8, &i, sizeof (i8));
+	  memcpy (&u64, &i, sizeof (u64));
+	  u64 = __builtin_bswap64 (u64);
+	  i8 = *(GFC_INTEGER_8*)&u64;
 	  i = i8;
 	  break;
 
@@ -2169,6 +2227,7 @@ us_read (st_parameter_dt *dtp, int continued)
 	  runtime_error ("Illegal value for record marker");
 	  break;
 	}
+    }
 
   if (i >= 0)
     {
@@ -3036,7 +3095,6 @@ write_us_marker (st_parameter_dt *dtp, const gfc_offset buf)
   size_t len;
   GFC_INTEGER_4 buf4;
   GFC_INTEGER_8 buf8;
-  char p[sizeof (GFC_INTEGER_8)];
 
   if (compile_options.record_marker == 0)
     len = sizeof (GFC_INTEGER_4);
@@ -3065,18 +3123,20 @@ write_us_marker (st_parameter_dt *dtp, const gfc_offset buf)
     }
   else
     {
+      uint32_t u32;
+      uint64_t u64;
       switch (len)
 	{
 	case sizeof (GFC_INTEGER_4):
 	  buf4 = buf;
-	  reverse_memcpy (p, &buf4, sizeof (GFC_INTEGER_4));
-	  return swrite (dtp->u.p.current_unit->s, p, len);
+	  u32 = __builtin_bswap32 (*(uint32_t*)&buf4);
+	  return swrite (dtp->u.p.current_unit->s, &u32, len);
 	  break;
 
 	case sizeof (GFC_INTEGER_8):
 	  buf8 = buf;
-	  reverse_memcpy (p, &buf8, sizeof (GFC_INTEGER_8));
-	  return swrite (dtp->u.p.current_unit->s, p, len);
+	  u64 = __builtin_bswap64 (*(uint64_t*)&buf8);
+	  return swrite (dtp->u.p.current_unit->s, &u64, len);
 	  break;
 
 	default:
@@ -3713,22 +3773,6 @@ st_set_nml_var_dim (st_parameter_dt *dtp, GFC_INTEGER_4 n_dim,
   GFC_DIMENSION_SET(nml->dim[n],lbound,ubound,stride);
 }
 
-/* Reverse memcpy - used for byte swapping.  */
-
-void reverse_memcpy (void *dest, const void *src, size_t n)
-{
-  char *d, *s;
-  size_t i;
-
-  d = (char *) dest;
-  s = (char *) src + n - 1;
-
-  /* Write with ascending order - this is likely faster
-     on modern architectures because of write combining.  */
-  for (i=0; i<n; i++)
-      *(d++) = *(s--);
-}
-
 
 /* Once upon a time, a poor innocent Fortran program was reading a
    file, when suddenly it hit the end-of-file (EOF).  Unfortunately
