On 8 Jun 2011, at 10:29, Rainer Orth wrote:
Darwin seems to be exceedingly unreliable under load: tests randomly time out and work again the next time.
If you mean random timeouts in dejagnu - I suspect (but have not yet proved) that the culprit is insufficient stack allocation in some component of tcl or expect itself.
One used to find that a non-parallel "make check" would be quite reliable - but parallel runs were very unreliable.
I use the following local (not submitted, unofficial) patch to dejagnu on Darwin9 - which retries (once) any test that times out - this has been enough to eliminate the frustration for me...
YMMV, of course. cheers Iain
--- dejagnu-1.4.4/lib/remote.exp 2003-10-11 07:42:46.000000000 +0100 +++ dejagnu-1.4.4-installed/lib/remote.exp 2010-03-12 09:32:49.000000000 +0000 @@ -74,7 +74,7 @@ # or output is redirected. If the program needs to be killed, /bin/sh and # the kill command will be invoked. # -proc local_exec { commandline inp outp timeout } { +proc one_local_exec { commandline inp outp timeout } { # Tcl's exec is a pile of crap. It does two very inappropriate things # firstly, it has no business returning an error if the program being # executed happens to write to stderr. Secondly, it appends its own @@ -155,7 +155,7 @@ set got_eof 0 set output "" - + set status 0 # Wait for either $timeout seconds to elapse, or for the program to # exit. expect { @@ -163,16 +163,19 @@ append output $expect_out(buffer) if { [string length $output] < 512000 } { exp_continue -continue_timer + } else { + warning "one_local_exec: (output string overflow)" } } timeout { - warning "program timed out." +# warning "one_local_exec: program timed out." + set status -2 } eof { set got_eof 1 } } - + # Uuuuuuugh. Now I'm getting really sick. # If we didn't get an EOF, we have to kill the poor defenseless program. # However, Tcl has no kill primitive, so we have to execute an external @@ -207,12 +210,16 @@ set res "wait failed" } } + if { $r2 != 0 || $res != "" || ! $got_eof } { verbose "close result is $res" - set status 1 + if { $status != -2 } { + set status 1 + } } else { set status 0 } + verbose "output is $output" if { $outp == "" } { return [list $status $output] @@ -221,6 +228,29 @@ } } +proc local_exec { commandline inp outp timeout } { + set res [one_local_exec $commandline $inp $outp $timeout] + set stat [lindex $res 0] +# OK? + if { $stat != -2 } { + return $res + } +# nope... 
+# warning "local_exec $commandline timed out : retrying" + verbose -log "RE-TRIED: (timeout = $timeout) : $commandline " + if { $timeout < 150 } { + set timeout [expr {$timeout} * 2] + } + set res [one_local_exec $commandline $inp $outp $timeout] + set stat [lindex $res 0] + if { $stat == -2 } { + set msg "" + warning "TIMED OUT: $commandline timed out after retry" + verbose -log "TIMED OUT: $commandline (timeout = $timeout)" + } + return $res +} + # # Execute the supplied program on HOSTNAME. There are four optional arguments # the first is a set of arguments to pass to PROGRAM, the second is an @@ -1090,7 +1120,8 @@ } } timeout { - warning "program timed out." + warning "standard_wait: program timed out" + set status -2 } eof { if [board_info $dest exists fileid_origid] {