Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Document Dune build and clean up OCaml style

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-03 03:42:42 -0400
Commit
d1eca7e7734a2d876ee4c2a5aa8d68c4dbe24ef3
README.md
index 8133c42..43fb199 100644
--- a/README.md
+++ b/README.md
@@ -48,8 +48,10 @@ behavior. It does not snapshot environment variables, process limits, network
 state, complete directory entry listings, or arbitrary non-filesystem resources.
 It is also currently focused on Linux x86_64 syscall decoding.
 
-Non-UTF-8 pathnames are a known limitation: the current JSON manifest stores
-paths as JSON strings, and the JSON library rejects invalid UTF-8.
+The manifest is always written as UTF-8 JSON. Existing valid UTF-8 strings are
+preserved. Non-UTF-8 path bytes are converted through Latin-1 before writing so
+the manifest remains readable JSON, but byte-exact path restoration for such
+names is not yet represented separately in the snapshot format.
 
 ## Snapshot Format
 
@@ -158,13 +160,21 @@ Deleted files are represented by an after-state tombstone:
 
 ## Usage
 
-Build with CMake:
+Build with Dune:
 
 ```bash
-cmake -S . -B build
-cmake --build build --parallel
+dune build src/ocaml/agent_snapshot.exe
 ```
 
+The executable is written to:
+
+```text
+_build/default/src/ocaml/agent_snapshot.exe
+```
+
+The project depends on OCaml, Dune, Yojson, Camomile, and the local
+`vendor/ocaml-git` checkout.
+
 Create the required ignore configuration before running snapshots:
 
 ```bash
@@ -175,19 +185,19 @@ printf '[]\n' > "${XDG_CONFIG_HOME:-$HOME/.config}/agent-snapshot/ignore.json"
 Run a command under Agent Snapshot:
 
 ```bash
-build/agent-snapshot --output snapshot-dir -- command arg1 arg2
+dune exec -- agent-snapshot --output snapshot-dir -- command arg1 arg2
 ```
 
 For example:
 
 ```bash
-build/agent-snapshot --output snapshot-python -- /usr/bin/python3 script.py
+dune exec -- agent-snapshot --output snapshot-python -- /usr/bin/python3 script.py
 ```
 
 Restore captured final-state blobs and tombstones in place:
 
 ```bash
-build/agent-snapshot restore snapshot-dir
+dune exec -- agent-snapshot restore snapshot-dir
 ```
 
 Restore only applies files that have blobs and tombstones. Clean Git-tracked
src/ocaml/agent_snapshot.ml
index 83477e9..fe72b98 100644
--- a/src/ocaml/agent_snapshot.ml
+++ b/src/ocaml/agent_snapshot.ml
@@ -1,11 +1,12 @@
 module Json = Yojson.Safe
 
-let utf8_string s =
+(** Yojson does not validate UTF-8 on output. Keep valid UTF-8 unchanged; map arbitrary bytes through Latin-1 so manifest files remain UTF-8 text. *)
+let utf8_string (s : string) : string =
   let module Enc = Camomile.CharEncoding in
   try Enc.recode_string ~in_enc:Enc.utf8 ~out_enc:Enc.utf8 s
   with Enc.Malformed_code -> Enc.recode_string ~in_enc:Enc.latin1 ~out_enc:Enc.utf8 s
 
-let jstr s = `String (utf8_string s)
+let jstr (s : string) : Json.t = `String (utf8_string s)
 
 let at_fdcwd = -100
 let o_accmode = 0o3
@@ -106,28 +107,28 @@ type repo_record = {
   mutable dirty : bool;
 }
 
-let empty_metadata () =
+let empty_metadata () : metadata =
   { exists = false; tombstone = false; regular = false; directory = false; mode = 0; size = 0L; mtime = 0; blob = None }
 
-let empty_git () =
+let empty_git () : git_info =
   { in_repo = false; tracked = false; dirty = false; ignored = false; root = ""; head = ""; relative_path = "" }
 
 let files : (string, file_record) Hashtbl.t = Hashtbl.create 128
 let repos : (string, repo_record) Hashtbl.t = Hashtbl.create 8
 let processes : (int, proc_state) Hashtbl.t = Hashtbl.create 8
-let ignored_paths = ref []
-let ignore_config_path = ref ""
-let snapshot_dir = ref ""
-let blob_dir = ref ""
-let tracer_uid = Unix.getuid ()
-let tracer_gid = Unix.getgid ()
+let ignored_paths : string list ref = ref []
+let ignore_config_path : string ref = ref ""
+let snapshot_dir : string ref = ref ""
+let blob_dir : string ref = ref ""
+let tracer_uid : int = Unix.getuid ()
+let tracer_gid : int = Unix.getgid ()
 
 let path_sep = '/'
 
-let split_path path =
+let split_path (path : string) : string list =
   path |> String.split_on_char path_sep |> List.filter (fun part -> part <> "" && part <> ".")
 
-let normalize_path path =
+let normalize_path (path : string) : string =
   let absolute = String.length path > 0 && path.[0] = path_sep in
   let parts =
     List.fold_left
@@ -139,65 +140,61 @@ let normalize_path path =
   let body = String.concat "/" parts in
   if absolute then if body = "" then "/" else "/" ^ body else if body = "" then "." else body
 
-let concat_path base path =
+let concat_path (base : string) (path : string) : string =
   if path = "" then base
   else if String.length path > 0 && path.[0] = '/' then normalize_path path
   else normalize_path (base ^ "/" ^ path)
 
-let dirname path =
+let dirname (path : string) : string =
   let path = normalize_path path in
   match String.rindex_opt path '/' with
   | None -> "."
   | Some 0 -> "/"
   | Some i -> String.sub path 0 i
 
-let basename path =
-  match String.rindex_opt path '/' with
-  | None -> path
-  | Some i -> String.sub path (i + 1) (String.length path - i - 1)
-
-let rec mkdir_p path =
+let rec mkdir_p (path : string) : unit =
   if path = "" || path = "/" || Sys.file_exists path then ()
   else (
     mkdir_p (dirname path);
     try Unix.mkdir path 0o777 with Unix.Unix_error (Unix.EEXIST, _, _) -> ())
 
-let is_absolute path = String.length path > 0 && path.[0] = '/'
+let is_absolute (path : string) : bool = String.length path > 0 && path.[0] = '/'
 
-let realpath_opt path = try Some (Unix.realpath path) with Unix.Unix_error _ -> None
+let realpath_opt (path : string) : string option = try Some (Unix.realpath path) with Unix.Unix_error _ -> None
 
-let best_effort_canonical path =
+(** Canonicalize existing paths, but still return a stable absolute lexical path for paths that do not exist yet. *)
+let best_effort_canonical (path : string) : string =
   match realpath_opt path with
   | Some path -> normalize_path path
   | None when is_absolute path -> normalize_path path
   | None -> concat_path (Sys.getcwd ()) path
 
-let path_is_at_or_under path root =
+let path_is_at_or_under (path : string) (root : string) : bool =
   path = root ||
   let root = if String.ends_with ~suffix:"/" root then root else root ^ "/" in
   String.starts_with ~prefix:root path
 
-let is_git_internal_path path = List.exists (( = ) ".git") (split_path path)
+let is_git_internal_path (path : string) : bool = List.exists (( = ) ".git") (split_path path)
 
-let is_ignored_path raw_path =
+let is_ignored_path (raw_path : string) : bool =
   if raw_path = "" then false
   else
     let path = best_effort_canonical raw_path in
     is_git_internal_path path || List.exists (fun ignored -> path_is_at_or_under path ignored) !ignored_paths
 
-let home_dir () =
+let home_dir () : string =
   match Sys.getenv_opt "HOME" with
   | Some home when home <> "" -> home
   | _ -> failwith "HOME is unavailable"
 
-let xdg_config_home_dir () =
+let xdg_config_home_dir () : string =
   match Sys.getenv_opt "XDG_CONFIG_HOME" with
   | Some path when path <> "" -> path
   | _ -> concat_path (home_dir ()) ".config"
 
-let xdg_ignore_config_path () = concat_path (xdg_config_home_dir ()) "agent-snapshot/ignore.json"
+let xdg_ignore_config_path () : string = concat_path (xdg_config_home_dir ()) "agent-snapshot/ignore.json"
 
-let expand_ignore_entry entry =
+let expand_ignore_entry (entry : string) : string =
   let home = "$HOME" in
   let xdg = "$XDG_CONFIG_HOME" in
   if entry = home then home_dir ()
@@ -206,7 +203,7 @@ let expand_ignore_entry entry =
   else if String.starts_with ~prefix:(xdg ^ "/") entry then concat_path (xdg_config_home_dir ()) (String.sub entry 17 (String.length entry - 17))
   else entry
 
-let load_ignore_config () =
+let load_ignore_config () : unit =
   ignore_config_path := best_effort_canonical (xdg_ignore_config_path ());
   let json =
     try Json.from_file !ignore_config_path
@@ -223,7 +220,8 @@ let load_ignore_config () =
              entries
   | _ -> failwith ("ignore config must be a JSON array: " ^ !ignore_config_path)
 
-let mode_of_kind = function
+let mode_of_kind (kind : Unix.file_kind) : int =
+  match kind with
   | Unix.S_REG -> 0o100000
   | Unix.S_DIR -> 0o040000
   | Unix.S_LNK -> 0o120000
@@ -232,7 +230,7 @@ let mode_of_kind = function
   | Unix.S_FIFO -> 0o010000
   | Unix.S_SOCK -> 0o140000
 
-let stat_metadata path =
+let stat_metadata (path : string) : metadata option =
   try
     let st = Unix.LargeFile.lstat path in
     Some
@@ -248,7 +246,8 @@ let stat_metadata path =
       }
   with Unix.Unix_error _ -> None
 
-let owned_by_other_and_not_writable path =
+(** Treat non-owned, non-writable paths as external system environment instead of snapshot payloads. *)
+let owned_by_other_and_not_writable (path : string) : bool =
   try
     let st = Unix.LargeFile.lstat path in
     st.st_uid <> tracer_uid
@@ -259,12 +258,13 @@ let owned_by_other_and_not_writable path =
     with Unix.Unix_error _ -> true
   with Unix.Unix_error _ -> false
 
-let writable_by_current_user meta =
+let writable_by_current_user (meta : metadata) : bool =
   if not meta.exists then true
   else if tracer_uid = 0 then true
   else meta.mode land 0o222 <> 0
 
-let existing_anchor path =
+(** Find the nearest existing path Git can use to discover a repository for a possibly deleted or not-yet-created file. *)
+let existing_anchor (path : string) : string option =
   let rec loop path =
     if path = "" || path = "." then None
     else if Sys.file_exists path then Some path
@@ -274,13 +274,14 @@ let existing_anchor path =
   in
   loop path
 
-let relative_path root path =
+let relative_path (root : string) (path : string) : string =
   let root = best_effort_canonical root in
   let path = normalize_path path in
   let prefix = if String.ends_with ~suffix:"/" root then root else root ^ "/" in
   if path = root then "" else if String.starts_with ~prefix path then String.sub path (String.length prefix) (String.length path - String.length prefix) else path
 
-let classify_git input_path =
+(** Classify a path against its containing Git repository, recording repo facts needed for compact manifests. *)
+let classify_git (input_path : string) : git_info =
   let info = empty_git () in
   let anchor = match existing_anchor input_path with Some p -> Some p | None -> existing_anchor (dirname input_path) in
   match anchor with
@@ -331,7 +332,8 @@ let classify_git input_path =
                 info)
       with Ocaml_git.Git_error _ -> info)
 
-let fnv1a_file_digest path =
+(** Stable content key for blobs. This is not intended as a cryptographic integrity hash. *)
+let fnv1a_file_digest (path : string) : string =
   let ic = open_in_bin path in
   Fun.protect
     ~finally:(fun () -> close_in_noerr ic)
@@ -346,7 +348,7 @@ let fnv1a_file_digest path =
        with End_of_file -> ());
       Printf.sprintf "%016Lx" !hash)
 
-let copy_file src dst =
+let copy_file (src : string) (dst : string) : unit =
   let ic = open_in_bin src in
   Fun.protect
     ~finally:(fun () -> close_in_noerr ic)
@@ -364,19 +366,20 @@ let copy_file src dst =
           in
           loop ()))
 
-let store_blob path =
+let store_blob (path : string) : string =
   let digest = fnv1a_file_digest path in
   let out = concat_path !blob_dir digest in
   if not (Sys.file_exists out) then copy_file path out;
   digest
 
-let should_capture_content path meta git =
+let should_capture_content (path : string) (meta : metadata) (git : git_info) : bool =
   if (not meta.exists) || not meta.regular then false
   else if owned_by_other_and_not_writable path then false
   else if git.in_repo && git.tracked && not git.dirty then false
   else writable_by_current_user meta
 
-let record_observation raw_path operation =
+(** Record the first observed before-state immediately, because creates/deletes can destroy that evidence by syscall exit. *)
+let record_observation (raw_path : string) (operation : string) : unit =
   if raw_path <> "" then
     let path = best_effort_canonical raw_path in
     if not (is_ignored_path path) then (
@@ -397,7 +400,8 @@ let record_observation raw_path operation =
         recd.before_git <- classify_git path;
         if should_capture_content path recd.before recd.before_git then recd.before.blob <- Some (store_blob path)))
 
-let finalize_records () =
+(** Capture after-state once the traced process tree has exited and filesystem writes have quiesced. *)
+let finalize_records () : unit =
   Hashtbl.iter
     (fun _ recd ->
       if not (is_ignored_path recd.path) then (
@@ -409,7 +413,7 @@ let finalize_records () =
           recd.after.blob <- Some (store_blob recd.path)))
     files
 
-let metadata_json meta =
+let metadata_json (meta : metadata) : Json.t =
   let base = [ ("exists", `Bool meta.exists) ] in
   let base = if meta.tombstone then ("tombstone", `Bool true) :: base else base in
   let base =
@@ -424,7 +428,7 @@ let metadata_json meta =
   let base = match meta.blob with Some blob -> ("blob", jstr blob) :: base | None -> base in
   `Assoc (List.rev base)
 
-let git_json git =
+let git_json (git : git_info) : Json.t =
   if not git.in_repo then `Assoc [ ("in_repo", `Bool false) ]
   else
     `Assoc
@@ -438,7 +442,7 @@ let git_json git =
         ("ignored", `Bool git.ignored);
       ]
 
-let write_manifest out command exit_status =
+let write_manifest (out : string) (command : string list) (exit_status : int) : unit =
   let repo_items =
     Hashtbl.fold (fun _ repo acc -> `Assoc [ ("root", jstr repo.root); ("head", jstr repo.head); ("dirty", `Bool repo.dirty) ] :: acc) repos []
     |> List.sort Stdlib.compare
@@ -475,7 +479,8 @@ let write_manifest out command exit_status =
   in
   Json.to_file ~std:true (concat_path out "manifest.json") manifest
 
-let resolve_path proc dirfd path =
+(** Resolve relative syscall paths against cwd or a directory fd as required by *at syscalls. *)
+let resolve_path (proc : proc_state) (dirfd : int) (path : string) : string =
   if is_absolute path then normalize_path path
   else
     let base =
@@ -483,29 +488,31 @@ let resolve_path proc dirfd path =
     in
     concat_path base path
 
-let is_write_open flags =
+let is_write_open (flags : int) : bool =
   let access = flags land o_accmode in
   access = o_wronly || access = o_rdwr || flags land (o_creat lor o_trunc lor o_append) <> 0
 
-let is_read_open flags =
+let is_read_open (flags : int) : bool =
   let access = flags land o_accmode in
   access = o_rdonly || access = o_rdwr
 
-let readlink_opt path = try Some (Unix.readlink path) with Unix.Unix_error _ -> None
+let readlink_opt (path : string) : string option = try Some (Unix.readlink path) with Unix.Unix_error _ -> None
 
-let refresh_proc_fd pid proc fd =
+(** Ask /proc where a successful fd points; this avoids reimplementing kernel path resolution. *)
+let refresh_proc_fd (pid : int) (proc : proc_state) (fd : int) : unit =
   match readlink_opt (Printf.sprintf "/proc/%d/fd/%d" pid fd) with
   | Some target when is_absolute target -> Hashtbl.replace proc.fds fd (best_effort_canonical target)
   | _ -> ()
 
-let refresh_proc_cwd pid proc =
+let refresh_proc_cwd (pid : int) (proc : proc_state) : unit =
   match readlink_opt (Printf.sprintf "/proc/%d/cwd" pid) with
   | Some target -> proc.cwd <- best_effort_canonical target
   | None -> ()
 
-let int_arg regs i = Int64.to_int regs.Ptrace.args.(i)
+let int_arg (regs : Ptrace.regs) (i : int) : int = Int64.to_int regs.Ptrace.args.(i)
 
-let handle_syscall_entry pid proc regs =
+(** Decode syscall entry arguments while tracee pointers and pre-mutation filesystem state are still available. *)
+let handle_syscall_entry (pid : int) (proc : proc_state) (regs : Ptrace.regs) : unit =
   let p = { nr = regs.Ptrace.syscall_nr; args = regs.args; path_a = ""; path_b = ""; dirfd = at_fdcwd; fd = -1; flags = 0 } in
   let tracee_string i = Ptrace.read_string pid regs.args.(i) in
   begin
@@ -555,9 +562,10 @@ let handle_syscall_entry pid proc regs =
   end;
   proc.pending <- Some p
 
-let syscall_ok result = Int64.compare result 0L >= 0
+let syscall_ok (result : int64) : bool = Int64.compare result 0L >= 0
 
-let handle_syscall_exit pid proc regs =
+(** Interpret syscall results, update fd/cwd state, and record observations that depend on success or return fd. *)
+let handle_syscall_exit (pid : int) (proc : proc_state) (regs : Ptrace.regs) : unit =
   match proc.pending with
   | None -> ()
   | Some p ->
@@ -600,12 +608,12 @@ let handle_syscall_exit pid proc regs =
         | _ -> ()
       end
 
-let clone_proc_state state =
+let clone_proc_state (state : proc_state) : proc_state =
   let fds = Hashtbl.create (Hashtbl.length state.fds) in
   Hashtbl.iter (fun fd path -> Hashtbl.add fds fd path) state.fds;
   { cwd = state.cwd; fds; pending = state.pending }
 
-let trace_command command =
+let trace_command (command : string list) : unit =
   Ptrace.trace command (function
     | Ptrace.Fork { parent; child } ->
         let state =
@@ -634,14 +642,15 @@ let trace_command command =
           refresh_proc_cwd pid state;
           Hashtbl.add processes pid state))
 
-let rec remove_all path =
+let rec remove_all (path : string) : unit =
   if Sys.file_exists path then
     if Sys.is_directory path then (
       Array.iter (fun name -> if name <> "." && name <> ".." then remove_all (concat_path path name)) (Sys.readdir path);
       Unix.rmdir path)
     else Unix.unlink path
 
-let restore_snapshot dir =
+(** Restore only blobbed final states and tombstones; Git-reconstructable files remain manifest references. *)
+let restore_snapshot (dir : string) : unit =
   let manifest = Json.from_file (concat_path dir "manifest.json") in
   let files =
     match manifest with
@@ -681,7 +690,7 @@ let restore_snapshot dir =
       | _ -> ())
     files
 
-let parse_snapshot_args args =
+let parse_snapshot_args (args : string list) : string * string list =
   let rec loop output = function
     | "--output" :: value :: rest -> loop (Some value) rest
     | "--" :: command -> (output, command)
@@ -691,7 +700,7 @@ let parse_snapshot_args args =
   | Some output, (_ :: _ as command) -> (output, command)
   | _ -> failwith "usage: agent-snapshot --output SNAPDIR -- command args..."
 
-let run_snapshot args =
+let run_snapshot (args : string list) : int =
   load_ignore_config ();
   let output, command = parse_snapshot_args args in
   snapshot_dir := output;
@@ -703,7 +712,7 @@ let run_snapshot args =
   write_manifest output command 0;
   0
 
-let main () =
+let main () : unit =
   try
     let args = Array.to_list Sys.argv |> List.tl in
     let rc =
src/ocaml/ptrace.ml
index 90ba7c8..86c8be4 100644
--- a/src/ocaml/ptrace.ml
+++ b/src/ocaml/ptrace.ml
@@ -40,11 +40,13 @@ external const_event_clone : unit -> int = "as_const_event_clone"
 external const_event_exec : unit -> int = "as_const_event_exec"
 external const_event_exit : unit -> int = "as_const_event_exit"
 
-let regs pid =
+(** Decode x86_64 syscall registers into the small record used by the OCaml tracer. *)
+let regs (pid : pid) : regs =
   let nr, a0, a1, a2, a3, a4, a5, result = getregs_raw pid in
   { syscall_nr = nr; args = [| a0; a1; a2; a3; a4; a5 |]; result }
 
-let read_string pid address =
+(** Read a NUL-terminated string from tracee memory, bounded so bad pointers cannot loop forever. *)
+let read_string (pid : pid) (address : int64) : string =
   if Int64.equal address 0L then ""
   else
     let max_len = 65536 in
@@ -69,22 +71,25 @@ let read_string pid address =
     in
     loop 0
 
-let wait_initial pid = wait_raw pid false
+let wait_initial (pid : pid) : wait_stop = wait_raw pid false
 
-let wait_next () = wait_raw (-1) true
+let wait_next () : wait_stop = wait_raw (-1) true
 
-let resume ?(signal = 0) pid = syscall pid signal
+let resume ?(signal : int = 0) (pid : pid) : unit = syscall pid signal
 
-let try_setoptions pid =
+(** A forked child can exit before options are applied; that race is not fatal to tracing. *)
+let try_setoptions (pid : pid) : unit =
   try setoptions pid with Unix.Unix_error (Unix.ESRCH, _, _) -> ()
 
-let try_resume ?(signal = 0) pid =
+let try_resume ?(signal : int = 0) (pid : pid) : unit =
   try resume ~signal pid with Unix.Unix_error (Unix.ESRCH, _, _) -> ()
 
-let is_fork_event event =
+let is_fork_event (event : int) : bool =
   event = const_event_fork () || event = const_event_vfork () || event = const_event_clone ()
 
-let decode_stop tasks = function
+(** Convert raw wait stops into higher-level trace events while maintaining syscall entry/exit phase. *)
+let decode_stop (tasks : (pid, task) Hashtbl.t) (stop : wait_stop) : event list =
+  match stop with
   | Exited (pid, _) | Signaled (pid, _) ->
       Hashtbl.remove tasks pid;
       [ Process_exit pid ]
@@ -120,7 +125,8 @@ let decode_stop tasks = function
   | Stopped (pid, signal, _event) ->
       [ Signal (pid, if signal = const_sigtrap () then 0 else signal) ]
 
-let trace command on_event =
+(** Run [command] under ptrace and invoke [on_event] for normalized trace events. *)
+let trace (command : string list) (on_event : event -> unit) : unit =
   match command with
   | [] -> invalid_arg "empty command"
   | argv0 :: _ ->