Hi,

gfortran has a few long-standing bugs wrt module handling. The more
fundamental, and also more difficult to fix, issue is that we re-read
and re-parse module files every time a USE statement is encountered,
instead of once per translation unit. See PR 25708. Another issue, PR
40958, is that module files can be quite big which exacerbates the PR
25708 issues.

The attached patch fixes PR 40958 by compressing the module files with
zlib and storing them in the gzip format (RFC 1952). I chose zlib
because it's a) ubiquitous and b) there's already a copy of zlib in
the GCC source tree, so this doesn't introduce any further build
dependencies. Since the mod files with the patch are in the gzip
format, one can use tools like zcat, zless, zgrep, zdiff etc. to
inspect the uncompressed contents easily (one can also use gunzip if
one first copies the module file to a temporary file with .gz
extension).

However, there's a couple of issues related to seeking in gzip files
(gzseek() instead of fseek() which is currently used). One is fixed by
the patch, the other is a potentially serious performance issue.

First, for a writable gzip file, seeking backwards is not allowed.
Currently when writing a module file, we first write a placeholder for
the MD5, then write the actual module content while updating the MD5
sum in memory as we go, and finally we seek back and write the final
MD5 value. However, the gzip file format contains a solution, 8 bytes
from the end of the file a CRC32 checksum of the (uncompressed)
content is stored. So the patch rips out the MD5 machinery, and
instead compares these CRC32 checksums to determine whether to replace
an existing module file or not (from the command line, one can check
the CRC32 with 'zcat -l -v filename'). As a result, the module version
number has been bumped as well.

The second issue that the patch doesn't address in any way, is that
while seeking on a gzip file in read mode is allowed, from zlib.h: "If
the file is opened for reading, this function is emulated but can be
extremely slow.". Unfortunately, when reading a module file we do seek
back and forth in it. Based on a brief inspection of the code, most if
not all of these seeks are for a very short distance (typically peek a
few bytes ahead in the stream, then seek back), and if the gzseek()
function is somewhat clever about seeking within the read buffer, this
might not be so slow after all. OTOH, if every gzseek() call means
restarting the inflation from the beginning of the file, the impact
could be quite bad.

The patch passes regression testing except for one failure,
module_md5_1.f90 which should be removed. Based on some quick testing,
the size of module files are reduced by a factor of 5 or thereabouts.
I haven't checked performance, in particular one would need to check
the second issue described above for some of those testcases
generating large module files. I think there was some single-file
version of cp2k somewhere that could be used for this, or are there
other appropriate tests somewhere that aren't too difficult to set up?

So at the moment, I'm not proposing this patch for inclusion, consider
it a RFC. Especially appropriate benchmark results and/or pointers to
easy-to-set-up testcases are appreciated.

In case the seeking in read mode is an issue, I suspect it wouldn't be
too hard to fix the parsing to not require it, but I think that would
push the patch more towards 4.8 material.

-- 
Janne Blomqvist
diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 17ebd58..d6152b3 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -29,6 +29,9 @@ along with GCC; see the file COPYING3.  If not see
    multiple header files.  Besides, Microsoft's winnt.h was 250k last
    time I looked, so by comparison this is perfectly reasonable.  */
 
+#include "config.h"
+#include "system.h"
+
 /* Declarations common to the front-end and library are put in
    libgfortran/libgfortran_frontend.h  */
 #include "libgfortran.h"
@@ -38,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "coretypes.h"
 #include "input.h"
 #include "splay-tree.h"
+#include <zlib.h>
 
 /* Major control parameters.  */
 
@@ -2345,7 +2349,8 @@ void gfc_add_include_path (const char *, bool, bool);
 void gfc_add_intrinsic_modules_path (const char *);
 void gfc_release_include_path (void);
 FILE *gfc_open_included_file (const char *, bool, bool);
-FILE *gfc_open_intrinsic_module (const char *);
+gzFile gfc_gzopen_included_file (const char *, bool, bool);
+gzFile gfc_open_intrinsic_module (const char *);
 
 int gfc_at_end (void);
 int gfc_at_eof (void);
diff --git a/gcc/fortran/module.c b/gcc/fortran/module.c
index 62f7598..9fa8c97 100644
--- a/gcc/fortran/module.c
+++ b/gcc/fortran/module.c
@@ -72,15 +72,15 @@ along with GCC; see the file COPYING3.  If not see
 #include "arith.h"
 #include "match.h"
 #include "parse.h" /* FIXME */
-#include "md5.h"
 #include "constructor.h"
 #include "cpp.h"
+#include <zlib.h>
 
 #define MODULE_EXTENSION ".mod"
 
 /* Don't put any single quote (') in MOD_VERSION, 
    if yout want it to be recognized.  */
-#define MOD_VERSION "7"
+#define MOD_VERSION "8"
 
 
 /* Structure that describes a position within a module file.  */
@@ -88,7 +88,7 @@ along with GCC; see the file COPYING3.  If not see
 typedef struct
 {
   int column, line;
-  fpos_t pos;
+  z_off_t pos;
 }
 module_locus;
 
@@ -182,10 +182,7 @@ pointer_info;
 /* Local variables */
 
 /* The FILE for the module we're reading or writing.  */
-static FILE *module_fp;
-
-/* MD5 context structure.  */
-static struct md5_ctx ctx;
+static gzFile module_fp;
 
 /* The name of the module we're reading (USE'ing) or writing.  */
 static char module_name[GFC_MAX_SYMBOL_LEN + 1];
@@ -929,7 +926,7 @@ static void bad_module (const char *) ATTRIBUTE_NORETURN;
 static void
 bad_module (const char *msgid)
 {
-  fclose (module_fp);
+  gzclose (module_fp);
 
   switch (iomode)
     {
@@ -956,7 +953,7 @@ set_module_locus (module_locus *m)
 {
   module_column = m->column;
   module_line = m->line;
-  fsetpos (module_fp, &m->pos);
+  gzseek (module_fp, m->pos, SEEK_SET);
 }
 
 
@@ -967,7 +964,7 @@ get_module_locus (module_locus *m)
 {
   m->column = module_column;
   m->line = module_line;
-  fgetpos (module_fp, &m->pos);
+  m->pos = gztell (module_fp);
 }
 
 
@@ -979,7 +976,7 @@ module_char (void)
 {
   int c;
 
-  c = getc (module_fp);
+  c = gzgetc (module_fp);
 
   if (c == EOF)
     bad_module ("Unexpected EOF");
@@ -1104,7 +1101,7 @@ parse_name (int c)
 
   *p = '\0';
 
-  fseek (module_fp, -1, SEEK_CUR);
+  gzseek (module_fp, -1, SEEK_CUR);
   module_column = m.column + len - 1;
 
   if (c == '\n')
@@ -1299,12 +1296,9 @@ find_enum (const mstring *m)
 static void
 write_char (char out)
 {
-  if (putc (out, module_fp) == EOF)
+  if (gzputc (module_fp, out) == EOF)
     gfc_fatal_error ("Error writing modules file: %s", xstrerror (errno));
 
-  /* Add this to our MD5.  */
-  md5_process_bytes (&out, sizeof (out), &ctx);
-  
   if (out != '\n')
     module_column++;
   else
@@ -5114,59 +5108,44 @@ write_module (void)
 }
 
 
-/* Read a MD5 sum from the header of a module file.  If the file cannot
+/* Read a CRC32 sum from the gzip trailer of a module file.  If the file cannot
    be opened, or we have any other error, we return -1.  */
 
 static int
-read_md5_from_module_file (const char * filename, unsigned char md5[16])
+read_crc32_from_module_file (const char * filename, uLong * crc)
 {
   FILE *file;
-  char buf[1024];
-  int n;
+  char buf[4];
+  unsigned int val;
 
-  /* Open the file.  */
-  if ((file = fopen (filename, "r")) == NULL)
+  /* Open the file in binary mode.  */
+  if ((file = fopen (filename, "rb")) == NULL)
     return -1;
 
-  /* Read the first line.  */
-  if (fgets (buf, sizeof (buf) - 1, file) == NULL)
+  /* The gzip crc32 value is found in the [END-8, END-4] bytes of the
+     file. See RFC 1952.  */
+  if (fseek (file, -8, SEEK_END) != 0)
     {
       fclose (file);
       return -1;
     }
-
-  /* The file also needs to be overwritten if the version number changed.  */
-  n = strlen ("GFORTRAN module version '" MOD_VERSION "' created");
-  if (strncmp (buf, "GFORTRAN module version '" MOD_VERSION "' created", n) != 0)
-    {
-      fclose (file);
-      return -1;
-    }
- 
-  /* Read a second line.  */
-  if (fgets (buf, sizeof (buf) - 1, file) == NULL)
+       
+  /* Read the crc32.  */
+  if (fread (buf, 1, 4, file) != 4)
     {
       fclose (file);
       return -1;
     }
-
-  /* Close the file.  */
   fclose (file);
 
-  /* If the header is not what we expect, or is too short, bail out.  */
-  if (strncmp (buf, "MD5:", 4) != 0 || strlen (buf) < 4 + 16)
-    return -1;
-
-  /* Now, we have a real MD5, read it into the array.  */
-  for (n = 0; n < 16; n++)
-    {
-      unsigned int x;
-
-      if (sscanf (&(buf[4+2*n]), "%02x", &x) != 1)
-       return -1;
+  val = (buf[0] & 0xFF) + ((buf[1] & 0xFF) << 8) + ((buf[2] & 0xFF) << 16) 
+    + ((buf[3] & 0xFF) << 24);
+  *crc = val;
+  
+  /* For debugging, the CRC value printed in hexadecimal should match
+     the CRC printed by "zcat -l -v filename".
 
-      md5[n] = x;
-    }
+     printf("CRC of file %s is %x\n", filename, val); */
 
   return 0;
 }
@@ -5181,8 +5160,7 @@ gfc_dump_module (const char *name, int dump_flag)
 {
   int n;
   char *filename, *filename_tmp;
-  fpos_t md5_pos;
-  unsigned char md5_new[16], md5_old[16];
+  uLong crc, crc_old;
 
   n = strlen (name) + strlen (MODULE_EXTENSION) + 1;
   if (gfc_option.module_dir != NULL)
@@ -5216,20 +5194,14 @@ gfc_dump_module (const char *name, int dump_flag)
     gfc_cpp_add_target (filename);
 
   /* Write the module to the temporary file.  */
-  module_fp = fopen (filename_tmp, "w");
+  module_fp = gzopen (filename_tmp, "w");
   if (module_fp == NULL)
     gfc_fatal_error ("Can't open module file '%s' for writing at %C: %s",
 		     filename_tmp, xstrerror (errno));
 
-  /* Write the header, including space reserved for the MD5 sum.  */
-  fprintf (module_fp, "GFORTRAN module version '%s' created from %s\n"
-	   "MD5:", MOD_VERSION, gfc_source_file);
-  fgetpos (module_fp, &md5_pos);
-  fputs ("00000000000000000000000000000000 -- "
-	"If you edit this, you'll get what you deserve.\n\n", module_fp);
-
-  /* Initialize the MD5 context that will be used for output.  */
-  md5_init_ctx (&ctx);
+  /* Write the header.  */
+  gzprintf (module_fp, "GFORTRAN module version '%s' created from %s\n", 
+	    MOD_VERSION, gfc_source_file);
 
   /* Write the module itself.  */
   iomode = IO_OUTPUT;
@@ -5244,24 +5216,16 @@ gfc_dump_module (const char *name, int dump_flag)
 
   write_char ('\n');
 
-  /* Write the MD5 sum to the header of the module file.  */
-  md5_finish_ctx (&ctx, md5_new);
-  fsetpos (module_fp, &md5_pos);
-  for (n = 0; n < 16; n++)
-    fprintf (module_fp, "%02x", md5_new[n]);
-
-  if (fclose (module_fp))
+  if (gzclose (module_fp))
     gfc_fatal_error ("Error writing module file '%s' for writing: %s",
 		     filename_tmp, xstrerror (errno));
 
-  /* Read the MD5 from the header of the old module file and compare.  */
-  if (read_md5_from_module_file (filename, md5_old) != 0
-      || memcmp (md5_old, md5_new, sizeof (md5_old)) != 0)
+  /* Read the CRC32 from the gzip trailers of the module files and compare.  */
+  if (read_crc32_from_module_file (filename_tmp, &crc) != 0
+      || read_crc32_from_module_file (filename, &crc_old) != 0
+      || crc_old != crc)
     {
       /* Module file have changed, replace the old one.  */
-      if (unlink (filename) && errno != ENOENT)
-	gfc_fatal_error ("Can't delete module file '%s': %s", filename,
-			 xstrerror (errno));
       if (rename (filename_tmp, filename))
 	gfc_fatal_error ("Can't rename module file '%s' to '%s': %s",
 			 filename_tmp, filename, xstrerror (errno));
@@ -5817,7 +5781,7 @@ gfc_use_module (void)
      specified that the module is intrinsic.  */
   module_fp = NULL;
   if (!specified_int)
-    module_fp = gfc_open_included_file (filename, true, true);
+    module_fp = gfc_gzopen_included_file (filename, true, true);
 
   /* Then, see if it's an intrinsic one, unless the USE statement
      specified that the module is non-intrinsic.  */
@@ -5865,10 +5829,10 @@ gfc_use_module (void)
   module_column = 1;
   start = 0;
 
-  /* Skip the first two lines of the module, after checking that this is
+  /* Skip the first line of the module, after checking that this is
      a gfortran module file.  */
   line = 0;
-  while (line < 2)
+  while (line < 1)
     {
       c = module_char ();
       if (c == EOF)
@@ -5917,7 +5881,7 @@ gfc_use_module (void)
   free_pi_tree (pi_root);
   pi_root = NULL;
 
-  fclose (module_fp);
+  gzclose (module_fp);
 
   use_stmt = gfc_get_use_list ();
   use_stmt->module_name = gfc_get_string (module_name);
diff --git a/gcc/fortran/scanner.c b/gcc/fortran/scanner.c
index 120d550..c5f5aa1 100644
--- a/gcc/fortran/scanner.c
+++ b/gcc/fortran/scanner.c
@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "debug.h"
 #include "flags.h"
 #include "cpp.h"
+#include <zlib.h>
 
 /* Structure for holding module and include file search path.  */
 typedef struct gfc_directorylist
@@ -418,6 +419,37 @@ open_included_file (const char *name, gfc_directorylist *list,
 }
 
 
+static gzFile
+gzopen_included_file (const char *name, gfc_directorylist *list,
+		      bool module, bool system)
+{
+  char *fullname;
+  gfc_directorylist *p;
+  gzFile f;
+
+  for (p = list; p; p = p->next)
+    {
+      if (module && !p->use_for_modules)
+	continue;
+
+      fullname = (char *) alloca(strlen (p->path) + strlen (name) + 1);
+      strcpy (fullname, p->path);
+      strcat (fullname, name);
+
+      f = gzopen (fullname, "r");
+      if (f != NULL)
+	{
+	  if (gfc_cpp_makedep ())
+	    gfc_cpp_add_dep (fullname, system);
+
+	  return f;
+	}
+    }
+
+  return NULL;
+}
+
+
 /* Opens file for reading, searching through the include directories
    given if necessary.  If the include_cwd argument is true, we try
    to open the file in the current directory first.  */
@@ -440,20 +472,38 @@ gfc_open_included_file (const char *name, bool include_cwd, bool module)
   return f;
 }
 
-FILE *
+gzFile 
+gfc_gzopen_included_file (const char *name, bool include_cwd, bool module)
+{
+  gzFile f = NULL;
+
+  if (IS_ABSOLUTE_PATH (name) || include_cwd)
+    {
+      f = gzopen (name, "r");
+      if (f && gfc_cpp_makedep ())
+	gfc_cpp_add_dep (name, false);
+    }
+
+  if (!f)
+    f = gzopen_included_file (name, include_dirs, module, false);
+
+  return f;
+}
+
+gzFile
 gfc_open_intrinsic_module (const char *name)
 {
-  FILE *f = NULL;
+  gzFile f = NULL;
 
   if (IS_ABSOLUTE_PATH (name))
     {
-      f = gfc_open_file (name);
+      f = gzopen (name, "r");
       if (f && gfc_cpp_makedep ())
 	gfc_cpp_add_dep (name, true);
     }
 
   if (!f)
-    f = open_included_file (name, intrinsic_modules_dirs, true, true);
+    f = gzopen_included_file (name, intrinsic_modules_dirs, true, true);
 
   return f;
 }

Reply via email to