Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Fix ptrace attach stops in interactive shells

Drain fork-child attach-time SIGSTOPs before resuming interactive parents so job-control shells do not report traced foreground commands as stopped with exit code 147.

Add a pty-backed regression for agent-snapshot bash -i running git diff.

Note: this is GPT-5.5 genius.
Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-04 05:30:34 -0400
Commit
766e018fb4dcacae4dac927a534df416aabf183c
src/ocaml/ptrace.ml
index 01ae194..4d40029 100644
--- a/src/ocaml/ptrace.ml
+++ b/src/ocaml/ptrace.ml
@@ -136,6 +136,7 @@ type event =
 
 type task = {
   mutable in_syscall : bool;
+  mutable suppress_attach_stop : bool;  (** set for fork children, cleared at the task's first syscall stop; while set, [decode_stop] reports a SIGSTOP stop as signal 0 *)
 }
 
 external fork : unit -> int = "as_fork"
@@ -148,6 +149,7 @@ external peek_word : pid -> int64 -> string = "as_peek_word"
 external wait_raw : pid -> bool -> wait_stop = "as_wait"
 external const_sigtrap_sysgood : unit -> int = "as_const_sigtrap_sysgood"
 external const_sigtrap : unit -> int = "as_const_sigtrap"
+external const_sigstop : unit -> int = "as_const_sigstop"
 external const_event_fork : unit -> int = "as_const_event_fork"
 external const_event_vfork : unit -> int = "as_const_event_vfork"
 external const_event_clone : unit -> int = "as_const_event_clone"
@@ -201,6 +203,22 @@ let try_resume ?(signal : int = 0) (pid : pid) : unit =
 let is_fork_event (event : int) : bool =
   event = const_event_fork () || event = const_event_vfork () || event = const_event_clone ()
 
+(** Resume a freshly forked tracee past the SIGSTOP(s) it receives at attach time.
+
+    With PTRACE_O_TRACEFORK/CLONE the new child is stopped with SIGSTOP before
+    it runs. If that stop is still pending when the forking parent is resumed,
+    an interactive shell waiting with WUNTRACED reports the foreground command
+    as "Stopped" (status 128 + SIGSTOP, 147 on Linux). Each plain SIGSTOP stop
+    of [pid] is swallowed here; the first other stop is returned unhandled. *)
+let rec consume_attach_stops (pid : pid) : wait_stop option =
+  let is_attach_stop = function
+    | Stopped (who, signal, event) -> who = pid && signal = const_sigstop () && event = 0
+    | Exited _ | Signaled _ -> false
+  in
+  try_resume pid;
+  let stop = wait_raw pid true in
+  if is_attach_stop stop then consume_attach_stops pid else Some stop
+
 (** Convert raw wait stops into higher-level trace events while maintaining syscall entry/exit phase. *)
 let decode_stop (tasks : (pid, task) Hashtbl.t) (stop : wait_stop) : event list =
   match stop with
@@ -211,12 +229,11 @@ let decode_stop (tasks : (pid, task) Hashtbl.t) (stop : wait_stop) : event list 
       let child = geteventmsg pid in
       let parent_task =
         match Hashtbl.find_opt tasks pid with
-        | Some task -> { in_syscall = task.in_syscall }
-        | None -> { in_syscall = false }
+        | Some task -> { in_syscall = task.in_syscall; suppress_attach_stop = true }
+        | None -> { in_syscall = false; suppress_attach_stop = true }
       in
       Hashtbl.replace tasks child parent_task;
       try_setoptions child;
-      try_resume child;
       [ Fork { parent = pid; child } ]
   | Stopped (pid, _signal, event) when event = const_event_exec () -> [ Exec pid ]
   | Stopped (pid, _signal, event) when event = const_event_exit () -> [ Exit pid ]
@@ -225,11 +242,12 @@ let decode_stop (tasks : (pid, task) Hashtbl.t) (stop : wait_stop) : event list 
         match Hashtbl.find_opt tasks pid with
         | Some task -> task
         | None ->
-            let task = { in_syscall = false } in
+            let task = { in_syscall = false; suppress_attach_stop = false } in
             Hashtbl.replace tasks pid task;
             task
       in
       let regs = regs pid in
+      task.suppress_attach_stop <- false;
       if task.in_syscall then (
         task.in_syscall <- false;
         [ Syscall_exit (pid, regs) ])
@@ -237,7 +255,16 @@ let decode_stop (tasks : (pid, task) Hashtbl.t) (stop : wait_stop) : event list 
         task.in_syscall <- true;
         [ Syscall_enter (pid, regs) ]))
   | Stopped (pid, signal, _event) ->
-      [ Signal (pid, if signal = const_sigtrap () then 0 else signal) ]
+      let task = Hashtbl.find_opt tasks pid in
+      let signal =
+        if signal = const_sigtrap () then 0
+        else
+          match task with
+          | Some task when signal = const_sigstop () && task.suppress_attach_stop ->
+              0
+          | _ -> signal
+      in
+      [ Signal (pid, signal) ]
 
 (** Run [command] under ptrace and invoke [on_event] for normalized trace events. *)
 let trace (command : string list) (on_event : event -> unit) : unit =
@@ -254,24 +281,33 @@ let trace (command : string list) (on_event : event -> unit) : unit =
         | Stopped _ ->
             setoptions child;
             let tasks = Hashtbl.create 8 in
-            Hashtbl.add tasks child { in_syscall = false };
+            Hashtbl.add tasks child { in_syscall = false; suppress_attach_stop = false };
+            let pending_stops : wait_stop Queue.t = Queue.create () in
             try_resume child;
+            let stopped_pid = function
+              | Exited (pid, _) | Signaled (pid, _) | Stopped (pid, _, _) -> pid
+            in
+            let resume_signal = function
+              | [ Signal (_, signal) ] -> signal
+              | _ -> 0
+            in
+            let rec handle_stop stop =
+              let pid = stopped_pid stop in
+              let events = decode_stop tasks stop in
+              List.iter on_event events;
+              List.iter
+                (function
+                  | Fork { child; _ } -> Option.iter (fun stop -> Queue.add stop pending_stops) (consume_attach_stops child)
+                  | _ -> ())
+                events;
+              while not (Queue.is_empty pending_stops) do
+                handle_stop (Queue.take pending_stops)
+              done;
+              if Hashtbl.mem tasks pid then try_resume ~signal:(resume_signal events) pid
+            in
             while Hashtbl.length tasks > 0 do
               match wait_next () with
               | exception Unix.Unix_error (Unix.ECHILD, _, _) -> Hashtbl.clear tasks
-              | stop ->
-                  let pid =
-                    match stop with
-                    | Exited (pid, _) | Signaled (pid, _) | Stopped (pid, _, _) -> pid
-                  in
-                  let events = decode_stop tasks stop in
-                  List.iter on_event events;
-                  if Hashtbl.mem tasks pid then
-                    let signal =
-                      match events with
-                      | [ Signal (_, signal) ] -> signal
-                      | _ -> 0
-                    in
-                    try_resume ~signal pid
+              | stop -> handle_stop stop
             done
         | _ -> failwith "tracee did not stop at startup")
src/ocaml/ptrace_stubs.c
index a7eccd5..9a149a0 100644
--- a/src/ocaml/ptrace_stubs.c
+++ b/src/ocaml/ptrace_stubs.c
@@ -130,6 +130,11 @@ CAMLprim value as_const_sigtrap(value unit) {
   CAMLreturn(Val_int(SIGTRAP));
 }
 
+/* OCaml stub: return this platform's SIGSTOP signal number as an OCaml int. */
+CAMLprim value as_const_sigstop(value unit) {
+  CAMLparam1(unit);
+  CAMLreturn(Val_int(SIGSTOP));
+}
+
 CAMLprim value as_const_event_fork(value unit) {
   CAMLparam1(unit);
   CAMLreturn(Val_int(PTRACE_EVENT_FORK));
tests/test_agent_snapshot.py
index c0e2938..0be4658 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -1,7 +1,11 @@
 import json
 import os
+import pty
+import select
 import shutil
+import signal
 import subprocess
+import time
 from pathlib import Path
 
 import pandas as pd
@@ -371,6 +375,56 @@ def test_traced_command_options_do_not_need_separator(tmp_path):
     assert Snapshot(out).manifest["command"] == [PYTHON, "-c", "print('direct command')"]
 
 
+def test_interactive_shell_children_are_not_reported_as_stopped_jobs(tmp_path):
+    out = tmp_path / "snapshot"
+    pid, fd = pty.fork()
+    if pid == 0:
+        os.chdir(WORKTREE)
+        os.environ["HISTFILE"] = "/dev/null"
+        os.execv(str(BIN), [str(BIN), "--snapshot-dir", str(out), "bash", "-i"])
+
+    output = bytearray()
+
+    def read_available(deadline: float) -> str:
+        while time.monotonic() < deadline:
+            ready, _, _ = select.select([fd], [], [], 0.05)
+            if fd not in ready:
+                continue
+            try:
+                chunk = os.read(fd, 4096)
+            except OSError:
+                break
+            if not chunk:
+                break
+            output.extend(chunk)
+        return output.decode(errors="replace")
+
+    try:
+        read_available(time.monotonic() + 1.0)
+        os.write(fd, b"git diff >/tmp/agent-snapshot-pty-gitdiff.txt 2>&1; printf 'rc:%s\\n' \"$?\"; exit\n")
+        text = ""
+        deadline = time.monotonic() + 5.0
+        while time.monotonic() < deadline:
+            text = read_available(time.monotonic() + 0.2)
+            if "rc:" in text:
+                break
+        assert "rc:0" in text
+        assert "Stopped" not in text
+    finally:
+        try:
+            os.close(fd)
+        except OSError:
+            pass
+        try:
+            os.kill(pid, signal.SIGTERM)
+        except ProcessLookupError:
+            pass
+        try:
+            os.waitpid(pid, 0)
+        except ChildProcessError:
+            pass
+
+
 def test_run_prints_stderr_summary(tmp_path):
     out = tmp_path / "snapshot"