On Wed, Mar 17, 2010 at 9:42 AM, Seth Falcon <s...@userprimary.net> wrote: > On 3/17/10 7:16 AM, Alistair Gee wrote: >> Yes. I had noticed that R occasionally segfaults (especially when I >> run many concurrent R processes), so I used valgrind to log every use >> of R. In the valgrind logs, I tracked the problem to list_files(). >> >> I attached a patch to platform.c (for trunk). Unfortunately, I am >> having trouble building R from the subversion trunk--it is taking a >> very long time decompressing/installing the recommended packages--so I >> haven't been able to verify the fix yet. But my version of platform.c >> does compile, and it does simplify the code b/c count_files() is no >> longer needed. > > Hmm, I see that you "grow" the vector containing filenames by calling > lengthgets and doubling the length. I don't see where you cleanup > before returning -- seems likely you will end up returning a vector that > is too long. > > And there are some performance characteristics to consider in terms of > both run time and memory profile. Does making a single pass through the > files make up for the allocations/data copying that result from > lengthgets? Is it worth possibly requiring twice the memory for the > worst case? > > + seth > > > > > Sorry, I left out a call to shorten the vector (via final call to lengthgets()). See new patch.
BTW, I modeled this code after code I found in the RODBC package that handles rows returned from a database query. I don't know if this is a typical approach to reallocation. Maybe there is a better way of extending the vector, though wouldn't most of the memory usage be in the strings (of the filenames) rather than the STRSXP vector itself? Anyway, I'm offering this (untested) fix as it handles both directories that have grown and directories that have shrunk, so that the length of the vector is correct in both cases. --
--- platform.c~ 2010-03-17 09:57:26.268110062 -0400 +++ platform.c 2010-03-17 13:36:55.416853389 -0400 @@ -871,61 +871,10 @@ #include <tre/tre.h> -static void count_files(const char *dnp, int *count, - Rboolean allfiles, Rboolean recursive, - const regex_t *reg) -{ - DIR *dir; - struct dirent *de; - char p[PATH_MAX]; -#ifdef Windows - /* For consistency with other functions */ - struct _stati64 sb; -#else - struct stat sb; -#endif - - if (strlen(dnp) >= PATH_MAX) /* should not happen! */ - error(_("directory/folder path name too long")); - if ((dir = opendir(dnp)) == NULL) { - warning(_("list.files: '%s' is not a readable directory"), dnp); - } else { - while ((de = readdir(dir))) { - if (allfiles || !R_HiddenFile(de->d_name)) { - if (recursive) { -#ifdef Win32 - if (strlen(dnp) == 2 && dnp[1] == ':') - snprintf(p, PATH_MAX, "%s%s", dnp, de->d_name); - else - snprintf(p, PATH_MAX, "%s%s%s", dnp, R_FileSep, de->d_name); -#else - snprintf(p, PATH_MAX, "%s%s%s", dnp, R_FileSep, de->d_name); -#endif -#ifdef Windows - _stati64(p, &sb); -#else - stat(p, &sb); -#endif - if ((sb.st_mode & S_IFDIR) > 0) { - if (strcmp(de->d_name, ".") && strcmp(de->d_name, "..")) - count_files(p, count, allfiles, recursive, reg); - continue; - } - } - if (reg) { - if (tre_regexec(reg, de->d_name, 0, NULL, 0) == 0) (*count)++; - } else (*count)++; - } - } - closedir(dir); - } -} - static void list_files(const char *dnp, const char *stem, int *count, SEXP ans, Rboolean allfiles, Rboolean recursive, - const regex_t *reg) + int pattern, regex_t reg, int *countmax, PROTECT_INDEX *idx) { - int ans_len = length(ans); DIR *dir; struct dirent *de; char p[PATH_MAX], stem2[PATH_MAX]; @@ -971,20 +920,28 @@ } else strcpy(stem2, de->d_name); list_files(p, stem2, count, ans, allfiles, - recursive, reg); + recursive, pattern, reg, countmax, idx); } continue; } } - /* number of files could have changed since call to count_files */ - if (*count >= ans_len) break; - if (reg) { - if (tre_regexec(reg, de->d_name, 0, NULL, 0) == 0) + if (pattern) { + if (tre_regexec(®, de->d_name, 0, NULL, 0) == 0) { + if (*count == *countmax - 1) { + *countmax *= 2; + REPROTECT(ans = lengthgets(ans, *countmax), *idx); + } SET_STRING_ELT(ans, (*count)++, filename(stem, de->d_name)); - } else + } + } else { + if (*count == *countmax - 1) { + *countmax *= 2; + REPROTECT(ans = lengthgets(ans, *countmax), *idx); + } SET_STRING_ELT(ans, (*count)++, filename(stem, de->d_name)); + } } } closedir(dir); @@ -993,11 +950,13 @@ SEXP attribute_hidden do_listfiles(SEXP call, SEXP op, SEXP args, SEXP rho) { + PROTECT_INDEX ipx; SEXP d, p, ans; int allfiles, fullnames, count, pattern, recursive, igcase, flags; int i, ndir; const char *dnp; regex_t reg; + int countmax = 100; checkArity(op, args); d = CAR(args); args = CDR(args); @@ -1019,20 +978,15 @@ if (pattern && tre_regcomp(®, translateChar(STRING_ELT(p, 0)), flags)) error(_("invalid 'pattern' regular expression")); - count = 0; - for (i = 0; i < ndir ; i++) { - if (STRING_ELT(d, i) == NA_STRING) continue; - dnp = R_ExpandFileName(translateChar(STRING_ELT(d, i))); - count_files(dnp, &count, allfiles, recursive, pattern ? ® : NULL); - } - PROTECT(ans = allocVector(STRSXP, count)); + PROTECT_WITH_INDEX(ans = allocVector(STRSXP, count), &ipx); count = 0; for (i = 0; i < ndir ; i++) { if (STRING_ELT(d, i) == NA_STRING) continue; dnp = R_ExpandFileName(translateChar(STRING_ELT(d, i))); list_files(dnp, fullnames ? dnp : NULL, &count, ans, allfiles, - recursive, pattern ? ® : NULL); + recursive, pattern, reg, &countmax, &ipx); } + ans = lengthgets(ans, count); if (pattern) tre_regfree(®); ssort(STRING_PTR(ans), count);
______________________________________________ R-devel@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-devel