Repositories / agent-snapshot.git
agent-snapshot.git
Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git
@@ -136,6 +136,7 @@ type event = type task = { mutable in_syscall : bool; + mutable suppress_attach_stop : bool; } external fork : unit -> int = "as_fork" @@ -148,6 +149,7 @@ external peek_word : pid -> int64 -> string = "as_peek_word" external wait_raw : pid -> bool -> wait_stop = "as_wait" external const_sigtrap_sysgood : unit -> int = "as_const_sigtrap_sysgood" external const_sigtrap : unit -> int = "as_const_sigtrap" +external const_sigstop : unit -> int = "as_const_sigstop" external const_event_fork : unit -> int = "as_const_event_fork" external const_event_vfork : unit -> int = "as_const_event_vfork" external const_event_clone : unit -> int = "as_const_event_clone" @@ -201,6 +203,22 @@ let try_resume ?(signal : int = 0) (pid : pid) : unit = let is_fork_event (event : int) : bool = event = const_event_fork () || event = const_event_vfork () || event = const_event_clone () +(** PTRACE_O_TRACEFORK/CLONE stops the new child with SIGSTOP before it can + run. Non-interactive parents usually never notice, but an interactive shell + may run waitpid(..., WUNTRACED) for job control as soon as we resume the + forking parent. If that attach stop is still pending, the shell reports the + foreground command as "Stopped" and returns 128 + SIGSTOP (147 on Linux). + + Drain only those attach-time SIGSTOPs while the parent remains ptrace-stopped. + Return the first non-attach stop so normal syscall/event decoding still sees + the child's real execution from the beginning. *) +let rec consume_attach_stops (pid : pid) : wait_stop option = + try_resume pid; + match wait_raw pid true with + | Stopped (stopped_pid, signal, event) when stopped_pid = pid && signal = const_sigstop () && event = 0 -> + consume_attach_stops pid + | stop -> Some stop + (** Convert raw wait stops into higher-level trace events while maintaining syscall entry/exit phase. *) let decode_stop (tasks : (pid, task) Hashtbl.t) (stop : wait_stop) : event list = match stop with @@ -211,12 +229,11 @@ let decode_stop (tasks : (pid, task) Hashtbl.t) (stop : wait_stop) : event list let child = geteventmsg pid in let parent_task = match Hashtbl.find_opt tasks pid with - | Some task -> { in_syscall = task.in_syscall } - | None -> { in_syscall = false } + | Some task -> { in_syscall = task.in_syscall; suppress_attach_stop = true } + | None -> { in_syscall = false; suppress_attach_stop = true } in Hashtbl.replace tasks child parent_task; try_setoptions child; - try_resume child; [ Fork { parent = pid; child } ] | Stopped (pid, _signal, event) when event = const_event_exec () -> [ Exec pid ] | Stopped (pid, _signal, event) when event = const_event_exit () -> [ Exit pid ] @@ -225,11 +242,12 @@ let decode_stop (tasks : (pid, task) Hashtbl.t) (stop : wait_stop) : event list match Hashtbl.find_opt tasks pid with | Some task -> task | None -> - let task = { in_syscall = false } in + let task = { in_syscall = false; suppress_attach_stop = false } in Hashtbl.replace tasks pid task; task in let regs = regs pid in + task.suppress_attach_stop <- false; if task.in_syscall then ( task.in_syscall <- false; [ Syscall_exit (pid, regs) ]) @@ -237,7 +255,16 @@ let decode_stop (tasks : (pid, task) Hashtbl.t) (stop : wait_stop) : event list task.in_syscall <- true; [ Syscall_enter (pid, regs) ])) | Stopped (pid, signal, _event) -> - [ Signal (pid, if signal = const_sigtrap () then 0 else signal) ] + let task = Hashtbl.find_opt tasks pid in + let signal = + if signal = const_sigtrap () then 0 + else + match task with + | Some task when signal = const_sigstop () && task.suppress_attach_stop -> + 0 + | _ -> signal + in + [ Signal (pid, signal) ] (** Run [command] under ptrace and invoke [on_event] for normalized trace events. *) let trace (command : string list) (on_event : event -> unit) : unit = @@ -254,24 +281,33 @@ let trace (command : string list) (on_event : event -> unit) : unit = | Stopped _ -> setoptions child; let tasks = Hashtbl.create 8 in - Hashtbl.add tasks child { in_syscall = false }; + Hashtbl.add tasks child { in_syscall = false; suppress_attach_stop = false }; + let pending_stops : wait_stop Queue.t = Queue.create () in try_resume child; + let stopped_pid = function + | Exited (pid, _) | Signaled (pid, _) | Stopped (pid, _, _) -> pid + in + let resume_signal = function + | [ Signal (_, signal) ] -> signal + | _ -> 0 + in + let rec handle_stop stop = + let pid = stopped_pid stop in + let events = decode_stop tasks stop in + List.iter on_event events; + List.iter + (function + | Fork { child; _ } -> Option.iter (fun stop -> Queue.add stop pending_stops) (consume_attach_stops child) + | _ -> ()) + events; + while not (Queue.is_empty pending_stops) do + handle_stop (Queue.take pending_stops) + done; + if Hashtbl.mem tasks pid then try_resume ~signal:(resume_signal events) pid + in while Hashtbl.length tasks > 0 do match wait_next () with | exception Unix.Unix_error (Unix.ECHILD, _, _) -> Hashtbl.clear tasks - | stop -> - let pid = - match stop with - | Exited (pid, _) | Signaled (pid, _) | Stopped (pid, _, _) -> pid - in - let events = decode_stop tasks stop in - List.iter on_event events; - if Hashtbl.mem tasks pid then - let signal = - match events with - | [ Signal (_, signal) ] -> signal - | _ -> 0 - in - try_resume ~signal pid + | stop -> handle_stop stop done | _ -> failwith "tracee did not stop at startup")
@@ -130,6 +130,11 @@ CAMLprim value as_const_sigtrap(value unit) { CAMLreturn(Val_int(SIGTRAP)); } +CAMLprim value as_const_sigstop(value unit) { + CAMLparam1(unit); + CAMLreturn(Val_int(SIGSTOP)); +} + CAMLprim value as_const_event_fork(value unit) { CAMLparam1(unit); CAMLreturn(Val_int(PTRACE_EVENT_FORK));
@@ -1,7 +1,11 @@ import json import os +import pty +import select import shutil +import signal import subprocess +import time from pathlib import Path import pandas as pd @@ -371,6 +375,56 @@ def test_traced_command_options_do_not_need_separator(tmp_path): assert Snapshot(out).manifest["command"] == [PYTHON, "-c", "print('direct command')"] +def test_interactive_shell_children_are_not_reported_as_stopped_jobs(tmp_path): + out = tmp_path / "snapshot" + pid, fd = pty.fork() + if pid == 0: + os.chdir(WORKTREE) + os.environ["HISTFILE"] = "/dev/null" + os.execv(str(BIN), [str(BIN), "--snapshot-dir", str(out), "bash", "-i"]) + + output = bytearray() + + def read_available(deadline: float) -> str: + while time.monotonic() < deadline: + ready, _, _ = select.select([fd], [], [], 0.05) + if fd not in ready: + continue + try: + chunk = os.read(fd, 4096) + except OSError: + break + if not chunk: + break + output.extend(chunk) + return output.decode(errors="replace") + + try: + read_available(time.monotonic() + 1.0) + os.write(fd, b"git diff >/tmp/agent-snapshot-pty-gitdiff.txt 2>&1; printf 'rc:%s\\n' \"$?\"; exit\n") + text = "" + deadline = time.monotonic() + 5.0 + while time.monotonic() < deadline: + text = read_available(time.monotonic() + 0.2) + if "rc:" in text: + break + assert "rc:0" in text + assert "Stopped" not in text + finally: + try: + os.close(fd) + except OSError: + pass + try: + os.kill(pid, signal.SIGTERM) + except ProcessLookupError: + pass + try: + os.waitpid(pid, 0) + except ChildProcessError: + pass + + def test_run_prints_stderr_summary(tmp_path): out = tmp_path / "snapshot"