Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Add documented ptrace interface

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-04 05:42:41 -0400
Commit
b09e7ddfbc000b38f12ed339021f54f13b3af08e
src/ocaml/ptrace.mli
new file mode 100644
index 0000000..56b3f8d
--- /dev/null
+++ b/src/ocaml/ptrace.mli
@@ -0,0 +1,198 @@
+(** Linux ptrace support for following a command and decoding filesystem-related
+    syscalls.
+
+    This module is intentionally small and Linux/x86_64-specific. It owns the raw
+    ptrace interaction, normalizes wait stops into trace events, and exposes just
+    enough syscall decoding helpers for the snapshot layer to classify filesystem
+    observations. *)
+
+(** Operating-system process identifier.
+
+    A plain [int] alias. It identifies the traced task in {!event} payloads and
+    is the first argument of {!read_string}. *)
+type pid = int
+
+(** Register state at a syscall stop.
+
+    The fields are decoded from the x86_64 Linux syscall ABI. The references
+    below are qualified with the type name because {!pending_syscall} also has
+    an [args] field.
+
+    - {!regs.syscall_nr} is the syscall number.
+    - {!regs.args} contains the six syscall arguments in order.
+    - {!regs.result} is meaningful on syscall-exit stops and contains the return
+      value, including negative errno values. Use {!syscall_ok} to test it. *)
+type regs = {
+  syscall_nr : int;
+  args : int64 array;
+  result : int64;
+}
+
+(** The special directory file descriptor used by *at() syscalls to mean "the
+    current working directory". This is Linux's [AT_FDCWD].
+
+    It is also the neutral default of {!pending_syscall.dirfd}, and the value
+    the [resolve] callback of {!decode_syscall_entry} receives when a relative
+    path must be resolved against the tracee's current working directory. *)
+val at_fdcwd : int
+
+(** Syscall numbers used by the snapshot tracer.
+
+    The values are for Linux on x86_64. They are exposed so callers can compare
+    {!regs.syscall_nr} and {!pending_syscall.nr} against them without
+    duplicating architecture constants. They are ordinary values, not
+    constructors, so they cannot appear as constant patterns: use equality or a
+    [when] guard instead.
+
+    [open_] carries a trailing underscore because [open] is an OCaml keyword. *)
+module Syscall : sig
+  val access : int
+  val close : int
+  val creat : int
+  val dup : int
+  val dup2 : int
+  val dup3 : int
+  val faccessat : int
+  val faccessat2 : int
+  val fchdir : int
+  val fcntl : int
+  val ftruncate : int
+  val getdents : int
+  val getdents64 : int
+  val lstat : int
+  val mkdir : int
+  val mkdirat : int
+  val newfstatat : int
+  val open_ : int
+  val openat : int
+  val openat2 : int
+  val readlink : int
+  val readlinkat : int
+  val rename : int
+  val renameat : int
+  val renameat2 : int
+  val rmdir : int
+  val stat : int
+  val truncate : int
+  val unlink : int
+  val unlinkat : int
+  val chdir : int
+end
+
+(** Mask selecting the access-mode bits from Linux [open] flags.
+
+    Apply it as [flags land o_accmode] and compare the result with {!o_rdonly},
+    {!o_wronly}, or {!o_rdwr}; the access mode is a small enumeration, not a
+    set of independent bits. *)
+val o_accmode : int
+
+(** Linux [O_RDONLY] access mode (compare after masking with {!o_accmode}). *)
+val o_rdonly : int
+
+(** Linux [O_WRONLY] access mode (compare after masking with {!o_accmode}). *)
+val o_wronly : int
+
+(** Linux [O_RDWR] access mode (compare after masking with {!o_accmode}). *)
+val o_rdwr : int
+
+(** Linux [O_CREAT]: create the file if it does not exist. *)
+val o_creat : int
+
+(** Linux [O_TRUNC]: truncate an existing file on open. *)
+val o_trunc : int
+
+(** Linux [O_APPEND]: position every write at end of file. *)
+val o_append : int
+
+(** Linux [O_DIRECTORY]: fail unless the path names a directory. *)
+val o_directory : int
+
+(** Linux [F_DUPFD] command for [fcntl], which duplicates a file descriptor. *)
+val f_dupfd : int
+
+(** Linux [F_DUPFD_CLOEXEC] command for [fcntl]: like [F_DUPFD], but the new
+    descriptor has the close-on-exec flag set. *)
+val f_dupfd_cloexec : int
+
+(** [syscall_ok result] is [true] when [result] is a non-negative Linux syscall
+    return value. Negative values represent [-errno].
+
+    Intended for {!regs.result} as observed at a [Syscall_exit] event; the
+    register is only meaningful at syscall-exit stops. *)
+val syscall_ok : int64 -> bool
+
+(** [is_write_open flags] is [true] when Linux [open]-style [flags] may create,
+    truncate, append to, or otherwise open a file for writing.
+
+    NOTE(review): this interface does not say whether this predicate and
+    {!is_read_open} are mutually exclusive; presumably an [O_RDWR] open
+    satisfies both - confirm against the implementation. *)
+val is_write_open : int -> bool
+
+(** [is_read_open flags] is [true] when Linux [open]-style [flags] open a file
+    for reading.
+
+    NOTE(review): presumably this holds for both the {!o_rdonly} and {!o_rdwr}
+    access modes selected by {!o_accmode} - confirm against the
+    implementation. *)
+val is_read_open : int -> bool
+
+(** Decoded syscall-entry state for path-oriented syscalls.
+
+    The tracer captures this at syscall entry so the snapshot layer can still
+    interpret the operation after the kernel has completed it. Fields that do
+    not apply to the syscall keep neutral defaults: empty paths, {!at_fdcwd},
+    [-1] file descriptors, or zero flags.
+
+    - {!pending_syscall.nr} and {!pending_syscall.args} are the syscall number
+      and raw arguments; {!decode_syscall_entry} populates them for every
+      syscall, known or not.
+    - {!pending_syscall.path_a} and {!pending_syscall.path_b} are decoded path
+      arguments, [""] when unused. NOTE(review): presumably [path_b] is only
+      set by two-path syscalls such as [rename] - confirm.
+    - {!pending_syscall.dirfd} is the directory file descriptor, {!at_fdcwd}
+      when unused.
+    - {!pending_syscall.fd} is a file descriptor argument, [-1] when unused.
+    - {!pending_syscall.flags} is a flags argument, [0] when unused.
+
+    Only the derived fields are declared [mutable]; [nr] and [args] are not. *)
+type pending_syscall = {
+  nr : int;
+  args : int64 array;
+  mutable path_a : string;
+  mutable path_b : string;
+  mutable dirfd : int;
+  mutable fd : int;
+  mutable flags : int;
+}
+
+(** [decode_syscall_entry ~resolve ~read_arg regs] decodes the syscall-entry
+    registers for filesystem operations known to the tracer.
+
+    Call this when {!trace} emits [Syscall_enter (pid, regs)]. The returned
+    {!pending_syscall} is the syscall-entry snapshot that a caller usually stores
+    in per-process state until the matching [Syscall_exit (pid, regs)] arrives.
+    This is necessary because pointer arguments, relative paths, and file
+    descriptor context should be captured before the tracee continues, while the
+    syscall result is only available at exit.
+
+    A typical use looks like:
+
+    {[
+      Ptrace.trace command (function
+        | Ptrace.Syscall_enter (pid, regs) ->
+            let read_arg i = Ptrace.read_string pid regs.Ptrace.args.(i) in
+            let resolve ~dirfd path = resolve_path_for_process pid ~dirfd path in
+            let pending = Ptrace.decode_syscall_entry ~resolve ~read_arg regs in
+            remember_pending_syscall pid pending
+        | Ptrace.Syscall_exit (pid, regs) ->
+            let pending = take_pending_syscall pid in
+            if Ptrace.syscall_ok regs.Ptrace.result then
+              handle_completed_syscall pending regs
+        | _ -> ())
+    ]}
+
+    The example drops the pending record when the syscall failed. Because a
+    tracee can exit or be interrupted mid-syscall (see {!event}), a
+    [Syscall_enter] is not guaranteed a matching [Syscall_exit]; whatever stores
+    the pending record should tolerate an entry that is never consumed.
+
+    [read_arg i] reads syscall argument [i] as a NUL-terminated string from the
+    tracee. For path syscalls this is normally implemented with {!read_string}:
+    [fun i -> read_string pid regs.args.(i)]. It is a callback because only the
+    caller knows which [pid] and register snapshot are currently being decoded.
+
+    [resolve ~dirfd path] converts a path argument into the caller's canonical
+    path representation. For absolute paths it can usually return [path]
+    unchanged. For relative paths it should resolve against either the tracee's
+    current working directory when [dirfd = {!at_fdcwd}], or the directory path
+    associated with [dirfd] for *at() syscalls such as [openat] and [renameat].
+    It is a callback because ptrace only exposes the raw integer file descriptor;
+    the higher-level caller must maintain cwd and fd-to-path state.
+
+    Unknown syscalls produce a record whose {!pending_syscall.nr} and
+    {!pending_syscall.args} are populated and whose derived fields keep their
+    defaults. *)
+val decode_syscall_entry :
+  resolve:(dirfd:int -> string -> string) -> read_arg:(int -> string) -> regs -> pending_syscall
+
+(** Trace events emitted by {!trace}.
+
+    Syscall events come in enter/exit pairs per process unless the tracee exits
+    or is interrupted mid-syscall. [Fork] covers fork, vfork, and clone events
+    that create a traceable child. [Process_exit] means the process has left the
+    traced task set.
+
+    Per constructor:
+
+    - [Syscall_enter (pid, regs)]: [pid] stopped at syscall entry; pass [regs]
+      to {!decode_syscall_entry}.
+    - [Syscall_exit (pid, regs)]: [pid] stopped at syscall exit; [regs.result]
+      holds the return value (see {!syscall_ok}).
+    - [Fork { parent; child }]: [parent] created the traceable task [child].
+    - [Exec pid], [Exit pid], [Signal (pid, n)]: NOTE(review): these are not
+      described by this interface. Presumably an exec stop, a pre-exit ptrace
+      stop (distinct from [Process_exit]), and a stop for signal number [n] -
+      confirm against the implementation and document the difference between
+      [Exit] and [Process_exit].
+    - [Process_exit pid]: [pid] has left the traced task set. *)
+type event =
+  | Syscall_enter of pid * regs
+  | Syscall_exit of pid * regs
+  | Fork of { parent : pid; child : pid }
+  | Exec of pid
+  | Exit of pid
+  | Signal of pid * int
+  | Process_exit of pid
+
+(** [read_string pid address] reads a NUL-terminated string from tracee memory.
+
+    A null pointer, unreadable memory, or the configured maximum length ends the
+    read and returns the bytes collected so far. The result may therefore be
+    empty or truncated, and the [string] return type gives callers no way to
+    tell a truncated read from a complete one.
+
+    NOTE(review): the maximum length is not exposed by this interface; say
+    where it is configured. *)
+val read_string : pid -> int64 -> string
+
+(** [trace command on_event] runs [command] under ptrace and calls [on_event] for
+    each normalized event until all traced processes exit.
+
+    [command] must contain the executable name followed by its arguments.
+
+    @raise Invalid_argument if [command] is empty.
+    @raise Unix.Unix_error on ptrace, wait, fork, or exec failures. *)
+val trace : string list -> (event -> unit) -> unit