On 8 Jun 2011, at 10:29, Rainer Orth wrote:

> Darwin seems to be exceedingly unreliable under load: tests randomly
> time out and work again the next time.

If you mean random timeouts in DejaGnu, I suspect (but have not yet proved) that the culprit is insufficient stack allocation in some component of Tcl or Expect itself.

One used to find that non-parallel "make check" would be quite reliable - but parallel very unreliable.

I use the following local (not submitted, unofficial) patch to DejaGnu on Darwin9, which retries (once) any test that times out. This has been enough to eliminate the frustration for me...

YMMV, of course.

cheers
Iain


--- dejagnu-1.4.4/lib/remote.exp        2003-10-11 07:42:46.000000000 +0100
+++ dejagnu-1.4.4-installed/lib/remote.exp      2010-03-12 09:32:49.000000000 +0000
@@ -74,7 +74,7 @@
 # or output is redirected. If the program needs to be killed, /bin/sh and
 # the kill command will be invoked.
 #
-proc local_exec { commandline inp outp timeout } {
+proc one_local_exec { commandline inp outp timeout } {
     # Tcl's exec is a pile of crap. It does two very inappropriate things
     # firstly, it has no business returning an error if the program being
     # executed happens to write to stderr. Secondly, it appends its own
@@ -155,7 +155,7 @@
 
     set got_eof 0
     set output ""
-
+    set status 0
     # Wait for either $timeout seconds to elapse, or for the program to
     # exit.
     expect {
@@ -163,16 +163,19 @@
            append output $expect_out(buffer)
            if { [string length $output] < 512000 } {
                exp_continue -continue_timer
+           } else {
+             warning "one_local_exec: (output string overflow)"
            }
        }
        timeout {
-           warning "program timed out."
+#          warning "one_local_exec: program timed out."
+           set status -2
        }
        eof {
            set got_eof 1
        }
     }
-
+    
     # Uuuuuuugh. Now I'm getting really sick.
     # If we didn't get an EOF, we have to kill the poor defenseless program.
     # However, Tcl has no kill primitive, so we have to execute an external
@@ -207,12 +210,16 @@
            set res "wait failed"
        }
     }
+    
     if { $r2 != 0 || $res != "" || ! $got_eof } {
        verbose "close result is $res"
-       set status 1
+       if { $status != -2 } {
+           set status 1
+       }
     } else {
        set status 0
     }
+    
     verbose "output is $output"
     if { $outp == "" } {
         return [list $status $output]
@@ -221,6 +228,29 @@
     }
 }
 
+# Run COMMANDLINE through one_local_exec and retry it once if it timed out.
+# one_local_exec reports a timeout as status -2; any other result is returned
+# unchanged.  The retry doubles TIMEOUT unless it is already 150 or more.
+# Returns the result of the last one_local_exec call.
+proc local_exec { commandline inp outp timeout } {
+    set res [one_local_exec $commandline $inp $outp $timeout]
+    if { [lindex $res 0] != -2 } {
+       # No timeout on the first attempt.
+       return $res
+    }
+    verbose -log "RE-TRIED: (timeout = $timeout) : $commandline "
+    if { $timeout < 150 } {
+       # Brace the whole expression so it is substituted only once.
+       set timeout [expr {$timeout * 2}]
+    }
+    set res [one_local_exec $commandline $inp $outp $timeout]
+    if { [lindex $res 0] == -2 } {
+       warning "TIMED OUT: $commandline timed out after retry"
+       verbose -log "TIMED OUT: $commandline (timeout = $timeout)"
+    }
+    return $res
+}
+
 #
 # Execute the supplied program on HOSTNAME. There are four optional arguments
 # the first is a set of arguments to pass to PROGRAM, the second is an
@@ -1090,7 +1120,8 @@
            }
        }
        timeout {
-           warning "program timed out."
+           warning "standard_wait: program timed out"
+           set status -2 
        }
        eof {
            if [board_info $dest exists fileid_origid] {

Reply via email to