This is an automated email from the git hooks/post-receive script.
unknown user pushed a commit to branch master in repository dejagnu.
The following commit(s) were added to refs/heads/master by this push: new e7d7a3e * lib/remote.exp (close_wait_program): New procedure. (loca [...] e7d7a3e is described below
commit e7d7a3e0b0cda9194c192e979f4ecc8dcfb010b3 Author: Pedro Alves palves@redhat.com Date: Thu Jul 30 07:47:52 2015 +1000
* lib/remote.exp (close_wait_program): New procedure. (local_exec, standard_close): Use it.
The code that tries to make sure that a process dies in lib/remote.exp:remote_close can kill the wrong process due to PID-reuse races. The GDB buildbots show frequent misterious FAILs that turns out are caused by this. The problem is this bit here:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || k [...] ... catch "wait -i $shell_id"
When this procedure is called to close the GDB process, GDB exits promptly, but that whole cascade of kills carries on in the background, thus potentially killing the unfortunate process that manages to be spawned by one of the next tests and happens to reuse that $pid. [1]
So to fix this, kill that no-longer-needed pipeline as soon as "wait" returns. There are two places in the DejaGnu with a similar pipeline, so move that to a shared procedure.
[1] GDB'S testsuite spawns thousands of GDB instances and even more inferior processes, and of those inferiors, some spawn thousands of short lived threads in quick succession. Since threads and processes share the number space in Linux, all that causes PID recycling frequently. In addition, GDB's testsuite has a parallel test mode that runs several tests/DejaGnu instances at the same time, further widening the race window.
Signed-off-by: Ben Elliston bje@gnu.org --- ChangeLog | 5 ++++ lib/remote.exp | 86 ++++++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 68 insertions(+), 23 deletions(-)
diff --git a/ChangeLog b/ChangeLog index 1eebf9c..0bdaeb9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2015-07-30 Pedro Alves palves@redhat.com + + * lib/remote.exp (close_wait_program): New procedure. + (local_exec, standard_close): Use it. + 2015-05-23 Ben Elliston bje@gnu.org
* doc/dejagnu.xml: Fix broken URLs. diff --git a/lib/remote.exp b/lib/remote.exp index ed176ad..fb040bb 100644 --- a/lib/remote.exp +++ b/lib/remote.exp @@ -56,6 +56,61 @@ proc remote_raw_open { args } { return [eval call_remote raw open $args] }
+# Close a spawn ID, and wait for the process to die. If PID is not +# -1, then if the process doesn't exit gracefully promptly, we kill +# it. +# +proc close_wait_program { program_id pid {wres_varname ""} } { + if {$wres_varname != "" } { + upvar 1 $wres_varname wres + } + + set exec_pid -1 + + if { $pid > 0 } { + # Tcl has no kill primitive, so we have to execute an external + # command in order to kill the process. + verbose "doing kill, pid is $pid" + # Prepend "-" to generate the "process group ID" needed by + # kill. + set pgid "-$pid" + # Send SIGINT to give the program a better chance to interrupt + # whatever it might be doing and react to stdin closing. + # E.g., in case of GDB, this should get it back to the prompt. + exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid)" + + # If the program doesn't exit gracefully when stdin closes, + # we'll need to kill it. But only do this after 'wait'ing a + # bit, to avoid killing the wrong process in case of a + # PID-reuse race. The extra sleep at the end is there to give + # time to kill $exec_pid without having _that_ be subject to a + # PID reuse race. + set secs 5 + set sh_cmd "exec > /dev/null 2>&1" + append sh_cmd " && sleep $secs && (kill -15 $pgid || kill -15 $pid)" + append sh_cmd " && sleep $secs && (kill -9 $pgid || kill -9 $pid)" + append sh_cmd " && sleep $secs" + set exec_pid [exec sh -c "$sh_cmd" &] + } + verbose "pid is $pid" + + # This closes the program's stdin. This should cause well behaved + # interactive programs to exit. This will hang if the kill + # doesn't work. Nothin' to do, and it's not OK. + catch "close -i $program_id" + + # Reap it. + set res [catch "wait -i $program_id" wres] + if {$exec_pid != -1} { + # We reaped the process, so cancel the pending force-kills, as + # otherwise if the PID is reused for some other unrelated + # process, we'd kill the wrong process. + exec sh -c "exec > /dev/null 2>&1 && kill -9 $exec_pid" + } + + return $res +} + # Run the specified COMMANDLINE on the local machine, redirecting input # to file INP (if non-empty), redirecting output to file OUTP (if non-empty), # and waiting TIMEOUT seconds for the command to complete before killing @@ -174,18 +229,10 @@ proc local_exec { commandline inp outp timeout } {
# Uuuuuuugh. Now I'm getting really sick. # If we didn't get an EOF, we have to kill the poor defenseless program. - # However, Tcl has no kill primitive, so we have to execute an external - # command in order to execute the execution. (English. Gotta love it.) - if { ! $got_eof } { - verbose "killing $pid $pgid" - # This is very, very nasty. SH, instead of EXPECT, is used to - # run this in the background since, on older CYGWINs, a - # strange file I/O error occures. - exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 & [...] - } - # This will hang if the kill doesn't work. Nothin' to do, and it's not ok. - catch "close -i $spawn_id" - set r2 [catch "wait -i $spawn_id" wres] + if { $got_eof } { + set pid -1 + } + set r2 [close_wait_program $spawn_id $pid wres] if { $id > 0 } { set r2 [catch "close $id" res] } else { @@ -312,20 +359,13 @@ proc standard_close { host } { } } } - if { $pid > 0 } { - verbose "doing kill, pid is $pid" - # This is very, very nasty. SH, instead of EXPECT, is used - # to run this in the background since, on older CYGWINs, a - # strange file I/O error occures. - set pgid "-[join $pid { -}]" - exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep [...] - } - verbose "pid is $pid" - catch "close -i $shell_id" + + close_wait_program $shell_id $pid + if {[info exists oid]} { catch "close $oid" } - catch "wait -i $shell_id" + unset board_info(${host},fileid) verbose "Shell closed." }