Hi, gfortran has a few long-standing bugs wrt module handling. The more fundamental, and also more difficult to fix, issue is that we re-read and re-parse module files every time a USE statement is encountered, instead of once per translation unit. See PR 25708. Another issue, PR 40958, is that module files can be quite big which exacerbates the PR 25708 issues.
The attached patch fixes PR 40958 by compressing the module files with zlib and storing them in the gzip format (RFC 1952). I chose zlib because it's a) ubiquitous and b) there's already a copy of zlib in the GCC source tree, so this doesn't introduce any further build dependencies. Since the mod files with the patch are in the gzip format, one can use tools like zcat, zless, zgrep, zdiff etc. to inspect the uncompressed contents easily (one can also use gunzip if one first copies the module file to a temporary file with .gz extension). However, there's a couple of issues related to seeking in gzip files (gzseek() instead of fseek() which is currently used). One is fixed by the patch, the other is a potentially serious performance issue. First, for a writable gzip file, seeking backwards is not allowed. Currently when writing a module file, we first write a placeholder for the MD5, then write the actual module content while updating the MD5 sum in memory as we go, and finally we seek back and write the final MD5 value. However, the gzip file format contains a solution, 8 bytes from the end of the file a CRC32 checksum of the (uncompressed) content is stored. So the patch rips out the MD5 machinery, and instead compares these CRC32 checksums to determine whether to replace an existing module file or not (from the command line, one can check the CRC32 with 'zcat -l -v filename'). As a result, the module version number has been bumped as well. The second issue that the patch doesn't address in any way, is that while seeking on a gzip file in read mode is allowed, from zlib.h: "If the file is opened for reading, this function is emulated but can be extremely slow.". Unfortunately, when reading a module file we do seek back and forth in it. Based on a brief inspection of the code, most if not all of these seeks are for a very short distance (typically peek a few bytes ahead in the stream, then seek back), and if the gzseek() function is somewhat clever about seeking within the read buffer, this might not be so slow after all. OTOH, if every gzseek() call means restarting the inflation from the beginning of the file, the impact could be quite bad. The patch passes regression testing except for one failure, module_md5_1.f90 which should be removed. Based on some quick testing, the size of module files are reduced by a factor of 5 or thereabouts. I haven't checked performance, in particular one would need to check the second issue described above for some of those testcases generating large module files. I think there was some single-file version of cp2k somewhere that could be used for this, or are there other appropriate tests somewhere that aren't too difficult to set up? So at the moment, I'm not proposing this patch for inclusion, consider it a RFC. Especially appropriate benchmark results and/or pointers to easy-to-set-up testcases are appreciated. In case the seeking in read mode is an issue, I suspect it wouldn't be too hard to fix the parsing to not require it, but I think that would push the patch more towards 4.8 material. -- Janne Blomqvist
diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h index 17ebd58..d6152b3 100644 --- a/gcc/fortran/gfortran.h +++ b/gcc/fortran/gfortran.h @@ -29,6 +29,9 @@ along with GCC; see the file COPYING3. If not see multiple header files. Besides, Microsoft's winnt.h was 250k last time I looked, so by comparison this is perfectly reasonable. */ +#include "config.h" +#include "system.h" + /* Declarations common to the front-end and library are put in libgfortran/libgfortran_frontend.h */ #include "libgfortran.h" @@ -38,6 +41,7 @@ along with GCC; see the file COPYING3. If not see #include "coretypes.h" #include "input.h" #include "splay-tree.h" +#include <zlib.h> /* Major control parameters. */ @@ -2345,7 +2349,8 @@ void gfc_add_include_path (const char *, bool, bool); void gfc_add_intrinsic_modules_path (const char *); void gfc_release_include_path (void); FILE *gfc_open_included_file (const char *, bool, bool); -FILE *gfc_open_intrinsic_module (const char *); +gzFile gfc_gzopen_included_file (const char *, bool, bool); +gzFile gfc_open_intrinsic_module (const char *); int gfc_at_end (void); int gfc_at_eof (void); diff --git a/gcc/fortran/module.c b/gcc/fortran/module.c index 62f7598..9fa8c97 100644 --- a/gcc/fortran/module.c +++ b/gcc/fortran/module.c @@ -72,15 +72,15 @@ along with GCC; see the file COPYING3. If not see #include "arith.h" #include "match.h" #include "parse.h" /* FIXME */ -#include "md5.h" #include "constructor.h" #include "cpp.h" +#include <zlib.h> #define MODULE_EXTENSION ".mod" /* Don't put any single quote (') in MOD_VERSION, if yout want it to be recognized. */ -#define MOD_VERSION "7" +#define MOD_VERSION "8" /* Structure that describes a position within a module file. */ @@ -88,7 +88,7 @@ along with GCC; see the file COPYING3. If not see typedef struct { int column, line; - fpos_t pos; + z_off_t pos; } module_locus; @@ -182,10 +182,7 @@ pointer_info; /* Local variables */ /* The FILE for the module we're reading or writing. */ -static FILE *module_fp; - -/* MD5 context structure. */ -static struct md5_ctx ctx; +static gzFile module_fp; /* The name of the module we're reading (USE'ing) or writing. */ static char module_name[GFC_MAX_SYMBOL_LEN + 1]; @@ -929,7 +926,7 @@ static void bad_module (const char *) ATTRIBUTE_NORETURN; static void bad_module (const char *msgid) { - fclose (module_fp); + gzclose (module_fp); switch (iomode) { @@ -956,7 +953,7 @@ set_module_locus (module_locus *m) { module_column = m->column; module_line = m->line; - fsetpos (module_fp, &m->pos); + gzseek (module_fp, m->pos, SEEK_SET); } @@ -967,7 +964,7 @@ get_module_locus (module_locus *m) { m->column = module_column; m->line = module_line; - fgetpos (module_fp, &m->pos); + m->pos = gztell (module_fp); } @@ -979,7 +976,7 @@ module_char (void) { int c; - c = getc (module_fp); + c = gzgetc (module_fp); if (c == EOF) bad_module ("Unexpected EOF"); @@ -1104,7 +1101,7 @@ parse_name (int c) *p = '\0'; - fseek (module_fp, -1, SEEK_CUR); + gzseek (module_fp, -1, SEEK_CUR); module_column = m.column + len - 1; if (c == '\n') @@ -1299,12 +1296,9 @@ find_enum (const mstring *m) static void write_char (char out) { - if (putc (out, module_fp) == EOF) + if (gzputc (module_fp, out) == EOF) gfc_fatal_error ("Error writing modules file: %s", xstrerror (errno)); - /* Add this to our MD5. */ - md5_process_bytes (&out, sizeof (out), &ctx); - if (out != '\n') module_column++; else @@ -5114,59 +5108,44 @@ write_module (void) } -/* Read a MD5 sum from the header of a module file. If the file cannot +/* Read a CRC32 sum from the gzip trailer of a module file. If the file cannot be opened, or we have any other error, we return -1. */ static int -read_md5_from_module_file (const char * filename, unsigned char md5[16]) +read_crc32_from_module_file (const char * filename, uLong * crc) { FILE *file; - char buf[1024]; - int n; + char buf[4]; + unsigned int val; - /* Open the file. */ - if ((file = fopen (filename, "r")) == NULL) + /* Open the file in binary mode. */ + if ((file = fopen (filename, "rb")) == NULL) return -1; - /* Read the first line. */ - if (fgets (buf, sizeof (buf) - 1, file) == NULL) + /* The gzip crc32 value is found in the [END-8, END-4] bytes of the + file. See RFC 1952. */ + if (fseek (file, -8, SEEK_END) != 0) { fclose (file); return -1; } - - /* The file also needs to be overwritten if the version number changed. */ - n = strlen ("GFORTRAN module version '" MOD_VERSION "' created"); - if (strncmp (buf, "GFORTRAN module version '" MOD_VERSION "' created", n) != 0) - { - fclose (file); - return -1; - } - - /* Read a second line. */ - if (fgets (buf, sizeof (buf) - 1, file) == NULL) + + /* Read the crc32. */ + if (fread (buf, 1, 4, file) != 4) { fclose (file); return -1; } - - /* Close the file. */ fclose (file); - /* If the header is not what we expect, or is too short, bail out. */ - if (strncmp (buf, "MD5:", 4) != 0 || strlen (buf) < 4 + 16) - return -1; - - /* Now, we have a real MD5, read it into the array. */ - for (n = 0; n < 16; n++) - { - unsigned int x; - - if (sscanf (&(buf[4+2*n]), "%02x", &x) != 1) - return -1; + val = (buf[0] & 0xFF) + ((buf[1] & 0xFF) << 8) + ((buf[2] & 0xFF) << 16) + + ((buf[3] & 0xFF) << 24); + *crc = val; + + /* For debugging, the CRC value printed in hexadecimal should match + the CRC printed by "zcat -l -v filename". - md5[n] = x; - } + printf("CRC of file %s is %x\n", filename, val); */ return 0; } @@ -5181,8 +5160,7 @@ gfc_dump_module (const char *name, int dump_flag) { int n; char *filename, *filename_tmp; - fpos_t md5_pos; - unsigned char md5_new[16], md5_old[16]; + uLong crc, crc_old; n = strlen (name) + strlen (MODULE_EXTENSION) + 1; if (gfc_option.module_dir != NULL) @@ -5216,20 +5194,14 @@ gfc_dump_module (const char *name, int dump_flag) gfc_cpp_add_target (filename); /* Write the module to the temporary file. */ - module_fp = fopen (filename_tmp, "w"); + module_fp = gzopen (filename_tmp, "w"); if (module_fp == NULL) gfc_fatal_error ("Can't open module file '%s' for writing at %C: %s", filename_tmp, xstrerror (errno)); - /* Write the header, including space reserved for the MD5 sum. */ - fprintf (module_fp, "GFORTRAN module version '%s' created from %s\n" - "MD5:", MOD_VERSION, gfc_source_file); - fgetpos (module_fp, &md5_pos); - fputs ("00000000000000000000000000000000 -- " - "If you edit this, you'll get what you deserve.\n\n", module_fp); - - /* Initialize the MD5 context that will be used for output. */ - md5_init_ctx (&ctx); + /* Write the header. */ + gzprintf (module_fp, "GFORTRAN module version '%s' created from %s\n", + MOD_VERSION, gfc_source_file); /* Write the module itself. */ iomode = IO_OUTPUT; @@ -5244,24 +5216,16 @@ gfc_dump_module (const char *name, int dump_flag) write_char ('\n'); - /* Write the MD5 sum to the header of the module file. */ - md5_finish_ctx (&ctx, md5_new); - fsetpos (module_fp, &md5_pos); - for (n = 0; n < 16; n++) - fprintf (module_fp, "%02x", md5_new[n]); - - if (fclose (module_fp)) + if (gzclose (module_fp)) gfc_fatal_error ("Error writing module file '%s' for writing: %s", filename_tmp, xstrerror (errno)); - /* Read the MD5 from the header of the old module file and compare. */ - if (read_md5_from_module_file (filename, md5_old) != 0 - || memcmp (md5_old, md5_new, sizeof (md5_old)) != 0) + /* Read the CRC32 from the gzip trailers of the module files and compare. */ + if (read_crc32_from_module_file (filename_tmp, &crc) != 0 + || read_crc32_from_module_file (filename, &crc_old) != 0 + || crc_old != crc) { /* Module file have changed, replace the old one. */ - if (unlink (filename) && errno != ENOENT) - gfc_fatal_error ("Can't delete module file '%s': %s", filename, - xstrerror (errno)); if (rename (filename_tmp, filename)) gfc_fatal_error ("Can't rename module file '%s' to '%s': %s", filename_tmp, filename, xstrerror (errno)); @@ -5817,7 +5781,7 @@ gfc_use_module (void) specified that the module is intrinsic. */ module_fp = NULL; if (!specified_int) - module_fp = gfc_open_included_file (filename, true, true); + module_fp = gfc_gzopen_included_file (filename, true, true); /* Then, see if it's an intrinsic one, unless the USE statement specified that the module is non-intrinsic. */ @@ -5865,10 +5829,10 @@ gfc_use_module (void) module_column = 1; start = 0; - /* Skip the first two lines of the module, after checking that this is + /* Skip the first line of the module, after checking that this is a gfortran module file. */ line = 0; - while (line < 2) + while (line < 1) { c = module_char (); if (c == EOF) @@ -5917,7 +5881,7 @@ gfc_use_module (void) free_pi_tree (pi_root); pi_root = NULL; - fclose (module_fp); + gzclose (module_fp); use_stmt = gfc_get_use_list (); use_stmt->module_name = gfc_get_string (module_name); diff --git a/gcc/fortran/scanner.c b/gcc/fortran/scanner.c index 120d550..c5f5aa1 100644 --- a/gcc/fortran/scanner.c +++ b/gcc/fortran/scanner.c @@ -48,6 +48,7 @@ along with GCC; see the file COPYING3. If not see #include "debug.h" #include "flags.h" #include "cpp.h" +#include <zlib.h> /* Structure for holding module and include file search path. */ typedef struct gfc_directorylist @@ -418,6 +419,37 @@ open_included_file (const char *name, gfc_directorylist *list, } +static gzFile +gzopen_included_file (const char *name, gfc_directorylist *list, + bool module, bool system) +{ + char *fullname; + gfc_directorylist *p; + gzFile f; + + for (p = list; p; p = p->next) + { + if (module && !p->use_for_modules) + continue; + + fullname = (char *) alloca(strlen (p->path) + strlen (name) + 1); + strcpy (fullname, p->path); + strcat (fullname, name); + + f = gzopen (fullname, "r"); + if (f != NULL) + { + if (gfc_cpp_makedep ()) + gfc_cpp_add_dep (fullname, system); + + return f; + } + } + + return NULL; +} + + /* Opens file for reading, searching through the include directories given if necessary. If the include_cwd argument is true, we try to open the file in the current directory first. */ @@ -440,20 +472,38 @@ gfc_open_included_file (const char *name, bool include_cwd, bool module) return f; } -FILE * +gzFile +gfc_gzopen_included_file (const char *name, bool include_cwd, bool module) +{ + gzFile f = NULL; + + if (IS_ABSOLUTE_PATH (name) || include_cwd) + { + f = gzopen (name, "r"); + if (f && gfc_cpp_makedep ()) + gfc_cpp_add_dep (name, false); + } + + if (!f) + f = gzopen_included_file (name, include_dirs, module, false); + + return f; +} + +gzFile gfc_open_intrinsic_module (const char *name) { - FILE *f = NULL; + gzFile f = NULL; if (IS_ABSOLUTE_PATH (name)) { - f = gfc_open_file (name); + f = gzopen (name, "r"); if (f && gfc_cpp_makedep ()) gfc_cpp_add_dep (name, true); } if (!f) - f = open_included_file (name, intrinsic_modules_dirs, true, true); + f = gzopen_included_file (name, intrinsic_modules_dirs, true, true); return f; }