Repositories / agent-snapshot.git
agent-snapshot.git
Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git
@@ -48,8 +48,10 @@ behavior. It does not snapshot environment variables, process limits, network state, complete directory entry listings, or arbitrary non-filesystem resources. It is also currently focused on Linux x86_64 syscall decoding. -Non-UTF-8 pathnames are a known limitation: the current JSON manifest stores -paths as JSON strings, and the JSON library rejects invalid UTF-8. +The manifest is always written as UTF-8 JSON. Existing valid UTF-8 strings are +preserved. Non-UTF-8 path bytes are converted through Latin-1 before writing so +the manifest remains readable JSON, but byte-exact path restoration for such +names is not yet represented separately in the snapshot format. ## Snapshot Format @@ -158,13 +160,21 @@ Deleted files are represented by an after-state tombstone: ## Usage -Build with CMake: +Build with Dune: ```bash -cmake -S . -B build -cmake --build build --parallel +dune build src/ocaml/agent_snapshot.exe ``` +The executable is written to: + +```text +_build/default/src/ocaml/agent_snapshot.exe +``` + +The project depends on OCaml, Dune, Yojson, Camomile, and the local +`vendor/ocaml-git` checkout. + Create the required ignore configuration before running snapshots: ```bash @@ -175,19 +185,19 @@ printf '[]\n' > "${XDG_CONFIG_HOME:-$HOME/.config}/agent-snapshot/ignore.json" Run a command under Agent Snapshot: ```bash -build/agent-snapshot --output snapshot-dir -- command arg1 arg2 +dune exec -- agent-snapshot --output snapshot-dir -- command arg1 arg2 ``` For example: ```bash -build/agent-snapshot --output snapshot-python -- /usr/bin/python3 script.py +dune exec -- agent-snapshot --output snapshot-python -- /usr/bin/python3 script.py ``` Restore captured final-state blobs and tombstones in place: ```bash -build/agent-snapshot restore snapshot-dir +dune exec -- agent-snapshot restore snapshot-dir ``` Restore only applies files that have blobs and tombstones. Clean Git-tracked
@@ -1,11 +1,12 @@ module Json = Yojson.Safe -let utf8_string s = +(** Yojson does not validate UTF-8 on output. Keep valid UTF-8 unchanged; map arbitrary bytes through Latin-1 so manifest files remain UTF-8 text. *) +let utf8_string (s : string) : string = let module Enc = Camomile.CharEncoding in try Enc.recode_string ~in_enc:Enc.utf8 ~out_enc:Enc.utf8 s with Enc.Malformed_code -> Enc.recode_string ~in_enc:Enc.latin1 ~out_enc:Enc.utf8 s -let jstr s = `String (utf8_string s) +let jstr (s : string) : Json.t = `String (utf8_string s) let at_fdcwd = -100 let o_accmode = 0o3 @@ -106,28 +107,28 @@ type repo_record = { mutable dirty : bool; } -let empty_metadata () = +let empty_metadata () : metadata = { exists = false; tombstone = false; regular = false; directory = false; mode = 0; size = 0L; mtime = 0; blob = None } -let empty_git () = +let empty_git () : git_info = { in_repo = false; tracked = false; dirty = false; ignored = false; root = ""; head = ""; relative_path = "" } let files : (string, file_record) Hashtbl.t = Hashtbl.create 128 let repos : (string, repo_record) Hashtbl.t = Hashtbl.create 8 let processes : (int, proc_state) Hashtbl.t = Hashtbl.create 8 -let ignored_paths = ref [] -let ignore_config_path = ref "" -let snapshot_dir = ref "" -let blob_dir = ref "" -let tracer_uid = Unix.getuid () -let tracer_gid = Unix.getgid () +let ignored_paths : string list ref = ref [] +let ignore_config_path : string ref = ref "" +let snapshot_dir : string ref = ref "" +let blob_dir : string ref = ref "" +let tracer_uid : int = Unix.getuid () +let tracer_gid : int = Unix.getgid () let path_sep = '/' -let split_path path = +let split_path (path : string) : string list = path |> String.split_on_char path_sep |> List.filter (fun part -> part <> "" && part <> ".") -let normalize_path path = +let normalize_path (path : string) : string = let absolute = String.length path > 0 && path.[0] = path_sep in let parts = List.fold_left @@ -139,65 +140,61 @@ let normalize_path path = let body = String.concat "/" parts in if absolute then if body = "" then "/" else "/" ^ body else if body = "" then "." else body -let concat_path base path = +let concat_path (base : string) (path : string) : string = if path = "" then base else if String.length path > 0 && path.[0] = '/' then normalize_path path else normalize_path (base ^ "/" ^ path) -let dirname path = +let dirname (path : string) : string = let path = normalize_path path in match String.rindex_opt path '/' with | None -> "." | Some 0 -> "/" | Some i -> String.sub path 0 i -let basename path = - match String.rindex_opt path '/' with - | None -> path - | Some i -> String.sub path (i + 1) (String.length path - i - 1) - -let rec mkdir_p path = +let rec mkdir_p (path : string) : unit = if path = "" || path = "/" || Sys.file_exists path then () else ( mkdir_p (dirname path); try Unix.mkdir path 0o777 with Unix.Unix_error (Unix.EEXIST, _, _) -> ()) -let is_absolute path = String.length path > 0 && path.[0] = '/' +let is_absolute (path : string) : bool = String.length path > 0 && path.[0] = '/' -let realpath_opt path = try Some (Unix.realpath path) with Unix.Unix_error _ -> None +let realpath_opt (path : string) : string option = try Some (Unix.realpath path) with Unix.Unix_error _ -> None -let best_effort_canonical path = +(** Canonicalize existing paths, but still return a stable absolute lexical path for paths that do not exist yet. *) +let best_effort_canonical (path : string) : string = match realpath_opt path with | Some path -> normalize_path path | None when is_absolute path -> normalize_path path | None -> concat_path (Sys.getcwd ()) path -let path_is_at_or_under path root = +let path_is_at_or_under (path : string) (root : string) : bool = path = root || let root = if String.ends_with ~suffix:"/" root then root else root ^ "/" in String.starts_with ~prefix:root path -let is_git_internal_path path = List.exists (( = ) ".git") (split_path path) +let is_git_internal_path (path : string) : bool = List.exists (( = ) ".git") (split_path path) -let is_ignored_path raw_path = +let is_ignored_path (raw_path : string) : bool = if raw_path = "" then false else let path = best_effort_canonical raw_path in is_git_internal_path path || List.exists (fun ignored -> path_is_at_or_under path ignored) !ignored_paths -let home_dir () = +let home_dir () : string = match Sys.getenv_opt "HOME" with | Some home when home <> "" -> home | _ -> failwith "HOME is unavailable" -let xdg_config_home_dir () = +let xdg_config_home_dir () : string = match Sys.getenv_opt "XDG_CONFIG_HOME" with | Some path when path <> "" -> path | _ -> concat_path (home_dir ()) ".config" -let xdg_ignore_config_path () = concat_path (xdg_config_home_dir ()) "agent-snapshot/ignore.json" +let xdg_ignore_config_path () : string = concat_path (xdg_config_home_dir ()) "agent-snapshot/ignore.json" -let expand_ignore_entry entry = +let expand_ignore_entry (entry : string) : string = let home = "$HOME" in let xdg = "$XDG_CONFIG_HOME" in if entry = home then home_dir () @@ -206,7 +203,7 @@ let expand_ignore_entry entry = else if String.starts_with ~prefix:(xdg ^ "/") entry then concat_path (xdg_config_home_dir ()) (String.sub entry 17 (String.length entry - 17)) else entry -let load_ignore_config () = +let load_ignore_config () : unit = ignore_config_path := best_effort_canonical (xdg_ignore_config_path ()); let json = try Json.from_file !ignore_config_path @@ -223,7 +220,8 @@ let load_ignore_config () = entries | _ -> failwith ("ignore config must be a JSON array: " ^ !ignore_config_path) -let mode_of_kind = function +let mode_of_kind (kind : Unix.file_kind) : int = + match kind with | Unix.S_REG -> 0o100000 | Unix.S_DIR -> 0o040000 | Unix.S_LNK -> 0o120000 @@ -232,7 +230,7 @@ let mode_of_kind = function | Unix.S_FIFO -> 0o010000 | Unix.S_SOCK -> 0o140000 -let stat_metadata path = +let stat_metadata (path : string) : metadata option = try let st = Unix.LargeFile.lstat path in Some @@ -248,7 +246,8 @@ let stat_metadata path = } with Unix.Unix_error _ -> None -let owned_by_other_and_not_writable path = +(** Treat non-owned, non-writable paths as external system environment instead of snapshot payloads. *) +let owned_by_other_and_not_writable (path : string) : bool = try let st = Unix.LargeFile.lstat path in st.st_uid <> tracer_uid @@ -259,12 +258,13 @@ let owned_by_other_and_not_writable path = with Unix.Unix_error _ -> true with Unix.Unix_error _ -> false -let writable_by_current_user meta = +let writable_by_current_user (meta : metadata) : bool = if not meta.exists then true else if tracer_uid = 0 then true else meta.mode land 0o222 <> 0 -let existing_anchor path = +(** Find the nearest existing path Git can use to discover a repository for a possibly deleted or not-yet-created file. *) +let existing_anchor (path : string) : string option = let rec loop path = if path = "" || path = "." then None else if Sys.file_exists path then Some path @@ -274,13 +274,14 @@ let existing_anchor path = in loop path -let relative_path root path = +let relative_path (root : string) (path : string) : string = let root = best_effort_canonical root in let path = normalize_path path in let prefix = if String.ends_with ~suffix:"/" root then root else root ^ "/" in if path = root then "" else if String.starts_with ~prefix path then String.sub path (String.length prefix) (String.length path - String.length prefix) else path -let classify_git input_path = +(** Classify a path against its containing Git repository, recording repo facts needed for compact manifests. *) +let classify_git (input_path : string) : git_info = let info = empty_git () in let anchor = match existing_anchor input_path with Some p -> Some p | None -> existing_anchor (dirname input_path) in match anchor with @@ -331,7 +332,8 @@ let classify_git input_path = info) with Ocaml_git.Git_error _ -> info) -let fnv1a_file_digest path = +(** Stable content key for blobs. This is not intended as a cryptographic integrity hash. *) +let fnv1a_file_digest (path : string) : string = let ic = open_in_bin path in Fun.protect ~finally:(fun () -> close_in_noerr ic) @@ -346,7 +348,7 @@ let fnv1a_file_digest path = with End_of_file -> ()); Printf.sprintf "%016Lx" !hash) -let copy_file src dst = +let copy_file (src : string) (dst : string) : unit = let ic = open_in_bin src in Fun.protect ~finally:(fun () -> close_in_noerr ic) @@ -364,19 +366,20 @@ let copy_file src dst = in loop ())) -let store_blob path = +let store_blob (path : string) : string = let digest = fnv1a_file_digest path in let out = concat_path !blob_dir digest in if not (Sys.file_exists out) then copy_file path out; digest -let should_capture_content path meta git = +let should_capture_content (path : string) (meta : metadata) (git : git_info) : bool = if (not meta.exists) || not meta.regular then false else if owned_by_other_and_not_writable path then false else if git.in_repo && git.tracked && not git.dirty then false else writable_by_current_user meta -let record_observation raw_path operation = +(** Record the first observed before-state immediately, because creates/deletes can destroy that evidence by syscall exit. *) +let record_observation (raw_path : string) (operation : string) : unit = if raw_path <> "" then let path = best_effort_canonical raw_path in if not (is_ignored_path path) then ( @@ -397,7 +400,8 @@ let record_observation raw_path operation = recd.before_git <- classify_git path; if should_capture_content path recd.before recd.before_git then recd.before.blob <- Some (store_blob path))) -let finalize_records () = +(** Capture after-state once the traced process tree has exited and filesystem writes have quiesced. *) +let finalize_records () : unit = Hashtbl.iter (fun _ recd -> if not (is_ignored_path recd.path) then ( @@ -409,7 +413,7 @@ let finalize_records () = recd.after.blob <- Some (store_blob recd.path))) files -let metadata_json meta = +let metadata_json (meta : metadata) : Json.t = let base = [ ("exists", `Bool meta.exists) ] in let base = if meta.tombstone then ("tombstone", `Bool true) :: base else base in let base = @@ -424,7 +428,7 @@ let metadata_json meta = let base = match meta.blob with Some blob -> ("blob", jstr blob) :: base | None -> base in `Assoc (List.rev base) -let git_json git = +let git_json (git : git_info) : Json.t = if not git.in_repo then `Assoc [ ("in_repo", `Bool false) ] else `Assoc @@ -438,7 +442,7 @@ let git_json git = ("ignored", `Bool git.ignored); ] -let write_manifest out command exit_status = +let write_manifest (out : string) (command : string list) (exit_status : int) : unit = let repo_items = Hashtbl.fold (fun _ repo acc -> `Assoc [ ("root", jstr repo.root); ("head", jstr repo.head); ("dirty", `Bool repo.dirty) ] :: acc) repos [] |> List.sort Stdlib.compare @@ -475,7 +479,8 @@ let write_manifest out command exit_status = in Json.to_file ~std:true (concat_path out "manifest.json") manifest -let resolve_path proc dirfd path = +(** Resolve relative syscall paths against cwd or a directory fd as required by *at syscalls. *) +let resolve_path (proc : proc_state) (dirfd : int) (path : string) : string = if is_absolute path then normalize_path path else let base = @@ -483,29 +488,31 @@ let resolve_path proc dirfd path = in concat_path base path -let is_write_open flags = +let is_write_open (flags : int) : bool = let access = flags land o_accmode in access = o_wronly || access = o_rdwr || flags land (o_creat lor o_trunc lor o_append) <> 0 -let is_read_open flags = +let is_read_open (flags : int) : bool = let access = flags land o_accmode in access = o_rdonly || access = o_rdwr -let readlink_opt path = try Some (Unix.readlink path) with Unix.Unix_error _ -> None +let readlink_opt (path : string) : string option = try Some (Unix.readlink path) with Unix.Unix_error _ -> None -let refresh_proc_fd pid proc fd = +(** Ask /proc where a successful fd points; this avoids reimplementing kernel path resolution. *) +let refresh_proc_fd (pid : int) (proc : proc_state) (fd : int) : unit = match readlink_opt (Printf.sprintf "/proc/%d/fd/%d" pid fd) with | Some target when is_absolute target -> Hashtbl.replace proc.fds fd (best_effort_canonical target) | _ -> () -let refresh_proc_cwd pid proc = +let refresh_proc_cwd (pid : int) (proc : proc_state) : unit = match readlink_opt (Printf.sprintf "/proc/%d/cwd" pid) with | Some target -> proc.cwd <- best_effort_canonical target | None -> () -let int_arg regs i = Int64.to_int regs.Ptrace.args.(i) +let int_arg (regs : Ptrace.regs) (i : int) : int = Int64.to_int regs.Ptrace.args.(i) -let handle_syscall_entry pid proc regs = +(** Decode syscall entry arguments while tracee pointers and pre-mutation filesystem state are still available. *) +let handle_syscall_entry (pid : int) (proc : proc_state) (regs : Ptrace.regs) : unit = let p = { nr = regs.Ptrace.syscall_nr; args = regs.args; path_a = ""; path_b = ""; dirfd = at_fdcwd; fd = -1; flags = 0 } in let tracee_string i = Ptrace.read_string pid regs.args.(i) in begin @@ -555,9 +562,10 @@ let handle_syscall_entry pid proc regs = end; proc.pending <- Some p -let syscall_ok result = Int64.compare result 0L >= 0 +let syscall_ok (result : int64) : bool = Int64.compare result 0L >= 0 -let handle_syscall_exit pid proc regs = +(** Interpret syscall results, update fd/cwd state, and record observations that depend on success or return fd. *) +let handle_syscall_exit (pid : int) (proc : proc_state) (regs : Ptrace.regs) : unit = match proc.pending with | None -> () | Some p -> @@ -600,12 +608,12 @@ let handle_syscall_exit pid proc regs = | _ -> () end -let clone_proc_state state = +let clone_proc_state (state : proc_state) : proc_state = let fds = Hashtbl.create (Hashtbl.length state.fds) in Hashtbl.iter (fun fd path -> Hashtbl.add fds fd path) state.fds; { cwd = state.cwd; fds; pending = state.pending } -let trace_command command = +let trace_command (command : string list) : unit = Ptrace.trace command (function | Ptrace.Fork { parent; child } -> let state = @@ -634,14 +642,15 @@ let trace_command command = refresh_proc_cwd pid state; Hashtbl.add processes pid state)) -let rec remove_all path = +let rec remove_all (path : string) : unit = if Sys.file_exists path then if Sys.is_directory path then ( Array.iter (fun name -> if name <> "." && name <> ".." then remove_all (concat_path path name)) (Sys.readdir path); Unix.rmdir path) else Unix.unlink path -let restore_snapshot dir = +(** Restore only blobbed final states and tombstones; Git-reconstructable files remain manifest references. *) +let restore_snapshot (dir : string) : unit = let manifest = Json.from_file (concat_path dir "manifest.json") in let files = match manifest with @@ -681,7 +690,7 @@ let restore_snapshot dir = | _ -> ()) files -let parse_snapshot_args args = +let parse_snapshot_args (args : string list) : string * string list = let rec loop output = function | "--output" :: value :: rest -> loop (Some value) rest | "--" :: command -> (output, command) @@ -691,7 +700,7 @@ let parse_snapshot_args args = | Some output, (_ :: _ as command) -> (output, command) | _ -> failwith "usage: agent-snapshot --output SNAPDIR -- command args..." -let run_snapshot args = +let run_snapshot (args : string list) : int = load_ignore_config (); let output, command = parse_snapshot_args args in snapshot_dir := output; @@ -703,7 +712,7 @@ let run_snapshot args = write_manifest output command 0; 0 -let main () = +let main () : unit = try let args = Array.to_list Sys.argv |> List.tl in let rc =
@@ -40,11 +40,13 @@ external const_event_clone : unit -> int = "as_const_event_clone" external const_event_exec : unit -> int = "as_const_event_exec" external const_event_exit : unit -> int = "as_const_event_exit" -let regs pid = +(** Decode x86_64 syscall registers into the small record used by the OCaml tracer. *) +let regs (pid : pid) : regs = let nr, a0, a1, a2, a3, a4, a5, result = getregs_raw pid in { syscall_nr = nr; args = [| a0; a1; a2; a3; a4; a5 |]; result } -let read_string pid address = +(** Read a NUL-terminated string from tracee memory, bounded so bad pointers cannot loop forever. *) +let read_string (pid : pid) (address : int64) : string = if Int64.equal address 0L then "" else let max_len = 65536 in @@ -69,22 +71,25 @@ let read_string pid address = in loop 0 -let wait_initial pid = wait_raw pid false +let wait_initial (pid : pid) : wait_stop = wait_raw pid false -let wait_next () = wait_raw (-1) true +let wait_next () : wait_stop = wait_raw (-1) true -let resume ?(signal = 0) pid = syscall pid signal +let resume ?(signal : int = 0) (pid : pid) : unit = syscall pid signal -let try_setoptions pid = +(** A forked child can exit before options are applied; that race is not fatal to tracing. *) +let try_setoptions (pid : pid) : unit = try setoptions pid with Unix.Unix_error (Unix.ESRCH, _, _) -> () -let try_resume ?(signal = 0) pid = +let try_resume ?(signal : int = 0) (pid : pid) : unit = try resume ~signal pid with Unix.Unix_error (Unix.ESRCH, _, _) -> () -let is_fork_event event = +let is_fork_event (event : int) : bool = event = const_event_fork () || event = const_event_vfork () || event = const_event_clone () -let decode_stop tasks = function +(** Convert raw wait stops into higher-level trace events while maintaining syscall entry/exit phase. *) +let decode_stop (tasks : (pid, task) Hashtbl.t) (stop : wait_stop) : event list = + match stop with | Exited (pid, _) | Signaled (pid, _) -> Hashtbl.remove tasks pid; [ Process_exit pid ] @@ -120,7 +125,8 @@ let decode_stop tasks = function | Stopped (pid, signal, _event) -> [ Signal (pid, if signal = const_sigtrap () then 0 else signal) ] -let trace command on_event = +(** Run [command] under ptrace and invoke [on_event] for normalized trace events. *) +let trace (command : string list) (on_event : event -> unit) : unit = match command with | [] -> invalid_arg "empty command" | argv0 :: _ ->