Generating a model matrix with very large numbers of columns overflows
the stack and/or runs very slowly, due to the implementation of
TrimRepeats().
This patch modifies it to use Rf_duplicated() to find the duplicates.
This makes the running time linear in the number of columns and
eliminates the recursive function calls.
Thanks
Index: src/library/stats/src/model.c
===================================================================
--- src/library/stats/src/model.c (revision 70230)
+++ src/library/stats/src/model.c (working copy)
@@ -1259,11 +1259,12 @@
static int TermZero(SEXP term)
{
- int i, val;
- val = 1;
- for (i = 0; i < nwords; i++)
- val = val && (INTEGER(term)[i] == 0);
- return val;
+ for (int i = 0; i < nwords; i++) {
+ if (INTEGER(term)[i] != 0) {
+ return 0;
+ }
+ }
+ return 1;
}
@@ -1271,11 +1272,12 @@
static int TermEqual(SEXP term1, SEXP term2)
{
- int i, val;
- val = 1;
- for (i = 0; i < nwords; i++)
- val = val && (INTEGER(term1)[i] == INTEGER(term2)[i]);
- return val;
+ for (int i = 0; i < nwords; i++) {
+ if (INTEGER(term1)[i] != INTEGER(term2)[i]) {
+ return 0;
+ }
+ }
+ return 1;
}
@@ -1303,18 +1305,37 @@
/* TrimRepeats removes duplicates of (bit string) terms
- in a model formula by repeated use of ``StripTerm''.
+ in a model formula.
Also drops zero terms. */
static SEXP TrimRepeats(SEXP list)
{
- if (list == R_NilValue)
- return R_NilValue;
- /* Highly recursive */
- R_CheckStack();
- if (TermZero(CAR(list)))
- return TrimRepeats(CDR(list));
- SETCDR(list, TrimRepeats(StripTerm(CAR(list), CDR(list))));
+ // Drop zero terms at the start of the list.
+ while (list != R_NilValue && TermZero(CAR(list))) {
+ list = CDR(list);
+ }
+ if (list == R_NilValue || CDR(list) == R_NilValue)
+ return list;
+
+ // Find out which terms are duplicates.
+ SEXP all_terms = PROTECT(Rf_PairToVectorList(list));
+ SEXP duplicate_sexp = PROTECT(Rf_duplicated(all_terms, FALSE));
+ int* is_duplicate = LOGICAL(duplicate_sexp);
+ int i = 0;
+
+ // Remove the zero terms and duplicates from the list.
+ for (SEXP current = list; CDR(current) != R_NilValue; i++) {
+ SEXP next = CDR(current);
+
+ if (is_duplicate[i + 1] || TermZero(CAR(next))) {
+ // Remove the node from the list.
+ SETCDR(current, CDR(next));
+ } else {
+ current = next;
+ }
+ }
+
+ UNPROTECT(2);
return list;
}
______________________________________________
[email protected] mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel