Repositories / agent-snapshot.git

src/ocaml/agent_snapshot.ml

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch
34346 bytes · d8d5c4d4c58f
module Json = Yojson.Safe (** Yojson does not validate UTF-8 on output. Keep valid UTF-8 unchanged; map arbitrary bytes through Latin-1 so manifest files remain UTF-8 text. *) let utf8_string (s : string) : string = let module Enc = Camomile.CharEncoding in try Enc.recode_string ~in_enc:Enc.utf8 ~out_enc:Enc.utf8 s with Enc.Malformed_code -> Enc.recode_string ~in_enc:Enc.latin1 ~out_enc:Enc.utf8 s type metadata = { mutable exists : bool; mutable tombstone : bool; mutable regular : bool; mutable directory : bool; mutable mode : int; mutable size : int64; mutable mtime : int; mutable blob : string option; } type git_info = { mutable in_repo : bool; mutable tracked : bool; mutable dirty : bool; mutable ignored : bool; mutable root : string; mutable head : string; mutable relative_path : string; } type file_record = { path : string; operations : (string, unit) Hashtbl.t; mutable before : metadata; mutable after : metadata; mutable before_git : git_info; mutable after_git : git_info; mutable before_external_system : bool; mutable after_external_system : bool; mutable before_recorded : bool; } type repo_record = { root : string; mutable head : string; mutable dirty : bool; } (** JSON array of strings in the ignore config file. *) type ignore_file_entries = string list [@@deriving yojson] (** Wire encoding of [manifest.json] ([write_manifest]); restore reads subsets by hand. *) module Manifest_json = struct (** One row of ["git_repositories"]. *) type repo = { root : string; head : string; dirty : bool; } [@@deriving yojson { strict = true }] (** ["before"] / ["after"]: keys mirror [manifest_metadata_of_metadata] output exactly ([exists], optional [tombstone], and when [exists] then [type], [mode], [mtime]; optional [blob]). *) type metadata = { exists : bool; tombstone : bool [@default false]; type_ : string option [@key "type"] [@default None]; mode : int option [@default None]; size : int64 option [@default None]; mtime : int option [@default None]; blob : string option [@default None]; } [@@deriving yojson { strict = true }] (** When [in_repo] is false only [["in_repo"]] appears; when true, all repository fields appear. *) type git = { in_repo : bool; root : string option [@default None]; head : string option [@default None]; relative_path : string option [@default None]; tracked : bool option [@default None]; dirty : bool option [@default None]; ignored : bool option [@default None]; } [@@deriving yojson { strict = true }] (** One element of [["files"]]. *) type file_entry = { path : string; operations : string list; before : metadata; after : metadata; git : git; } [@@deriving yojson { strict = true }] (** Root object ([manifest.json]). *) type t = { command : string list; exit_status : int; start_cwd : string; uid : int; gid : int; git_repositories : repo list; files : file_entry list; } [@@deriving yojson { strict = true }] end let empty_metadata () : metadata = { exists = false; tombstone = false; regular = false; directory = false; mode = 0; size = 0L; mtime = 0; blob = None } let empty_git () : git_info = { in_repo = false; tracked = false; dirty = false; ignored = false; root = ""; head = ""; relative_path = "" } let files : (string, file_record) Hashtbl.t = Hashtbl.create 128 let repos : (string, repo_record) Hashtbl.t = Hashtbl.create 8 let ignored_paths : string list ref = ref [] let ignore_config_path : string ref = ref "" let snapshot_dir : string ref = ref "" let tracer_uid : int = Unix.getuid () let tracer_gid : int = Unix.getgid () exception Usage_error of string let split_path (path : string) : string list = if path = "" then [] else Fpath.segs (Fpath.normalize (Fpath.v path)) |> List.filter (fun seg -> seg <> "") let normalize_path (path : string) : string = if path = "" then "." else Fpath.to_string (Fpath.normalize (Fpath.v path)) let concat_path (base : string) (path : string) : string = if path = "" then base else Fpath.to_string (Fpath.normalize (Fpath.append (Fpath.v base) (Fpath.v path))) let dirname (path : string) : string = if path = "" then "." else Fpath.to_string (Fpath.parent (Fpath.normalize (Fpath.v path))) let mkdir_p (path : string) : unit = if path = "" || path = "/" || FileUtil.test FileUtil.Exists path then () else FileUtil.mkdir ~parent:true ~mode:(`Octal 0o777) path let is_absolute (path : string) : bool = path <> "" && Fpath.is_abs (Fpath.v path) let realpath_opt (path : string) : string option = try Some (Unix.realpath path) with Unix.Unix_error _ -> None (** Canonicalize existing paths, but still return a stable absolute lexical path for paths that do not exist yet. *) let best_effort_canonical (path : string) : string = match realpath_opt path with | Some path -> normalize_path path | None when is_absolute path -> normalize_path path | None -> concat_path (FileUtil.pwd ()) path let path_is_at_or_under (path : string) (root : string) : bool = path = root || let root = if String.ends_with ~suffix:"/" root then root else root ^ "/" in String.starts_with ~prefix:root path let is_git_internal_path (path : string) : bool = List.exists (( = ) ".git") (split_path path) let always_ignored_paths : string list = [ "/proc"; "/dev" ] let is_ignored_path (raw_path : string) : bool = if raw_path = "" then false else let path = best_effort_canonical raw_path in is_git_internal_path path || List.exists (fun ignored -> path_is_at_or_under path ignored) always_ignored_paths || List.exists (fun ignored -> path_is_at_or_under path ignored) !ignored_paths let home_dir () : string = match Sys.getenv_opt "HOME" with | Some home when home <> "" -> home | _ -> failwith "HOME is unavailable" let xdg_config_home_dir () : string = match Sys.getenv_opt "XDG_CONFIG_HOME" with | Some path when path <> "" -> path | _ -> concat_path (home_dir ()) ".config" let xdg_ignore_config_path () : string = concat_path (xdg_config_home_dir ()) "agent-snapshot/ignore.json" let expand_ignore_entry (entry : string) : string = let home = "$HOME" in let xdg = "$XDG_CONFIG_HOME" in if entry = home then home_dir () else if String.starts_with ~prefix:(home ^ "/") entry then concat_path (home_dir ()) (String.sub entry 6 (String.length entry - 6)) else if entry = xdg then xdg_config_home_dir () else if String.starts_with ~prefix:(xdg ^ "/") entry then concat_path (xdg_config_home_dir ()) (String.sub entry 17 (String.length entry - 17)) else entry (** Shipped default when [ignore.json] is missing; matches the documented example plus common system trees. *) let default_ignore_file_entries () : ignore_file_entries = [ "$HOME/.cache"; "$HOME/.claude"; "$HOME/.codex"; "$HOME/.cursor"; "$XDG_CONFIG_HOME/agent-snapshot/ignore.json"; "/tmp/scratch-output"; "/proc"; "/dev"; "/usr"; "/bin"; ] let load_ignore_config () : unit = ignore_config_path := best_effort_canonical (xdg_ignore_config_path ()); let json = try Json.from_file !ignore_config_path with Sys_error _ -> mkdir_p (dirname !ignore_config_path); let text = Json.pretty_to_string ~std:true (ignore_file_entries_to_yojson (default_ignore_file_entries ())) ^ "\n" in Printf.eprintf "Created default ignore file: %s\n%!" !ignore_config_path; let oc = open_out_bin !ignore_config_path in Fun.protect ~finally:(fun () -> close_out_noerr oc) (fun () -> output_string oc text); Json.from_file !ignore_config_path in match ignore_file_entries_of_yojson json with | Ok entries -> ignored_paths := !ignore_config_path :: List.map (fun entry -> best_effort_canonical (expand_ignore_entry entry)) entries | Error msg -> failwith ("ignore config: " ^ msg ^ " (" ^ !ignore_config_path ^ ")") (** Directory name for a default snapshot bundle next to [ignore.json]. *) let timestamped_snapshot_dir_next_to_config () : string = let tm = Unix.(localtime (gettimeofday ())) in let name = Printf.sprintf "%04d-%02d-%02dT%02d-%02d-%02d" (tm.tm_year + 1900) (tm.tm_mon + 1) tm.tm_mday tm.tm_hour tm.tm_min tm.tm_sec in concat_path (dirname !ignore_config_path) name let mode_of_kind (kind : Unix.file_kind) : int = match kind with | Unix.S_REG -> 0o100000 | Unix.S_DIR -> 0o040000 | Unix.S_LNK -> 0o120000 | Unix.S_CHR -> 0o020000 | Unix.S_BLK -> 0o060000 | Unix.S_FIFO -> 0o010000 | Unix.S_SOCK -> 0o140000 let kind_is_special (kind : Unix.file_kind) : bool = match kind with | Unix.S_CHR | Unix.S_BLK | Unix.S_FIFO | Unix.S_SOCK -> true | Unix.S_REG | Unix.S_DIR | Unix.S_LNK -> false let metadata_is_special (meta : metadata) : bool = if not meta.exists then false else match meta.mode land 0o170000 with | 0o020000 | 0o060000 | 0o010000 | 0o140000 -> true | _ -> false let path_is_special_file (path : string) : bool = try let st = Unix.LargeFile.lstat path in kind_is_special st.st_kind with Unix.Unix_error _ -> false let stat_metadata (path : string) : metadata option = try let st = Unix.LargeFile.lstat path in Some { exists = true; tombstone = false; regular = st.st_kind = Unix.S_REG; directory = st.st_kind = Unix.S_DIR; mode = mode_of_kind st.st_kind lor st.st_perm; size = st.st_size; mtime = int_of_float st.st_mtime; blob = None; } with Unix.Unix_error _ -> None let path_has_non_directory_prefix (path : string) : bool = let rec loop prefix = function | [] | [ _ ] -> false | seg :: rest -> let prefix = concat_path prefix seg in try let st = Unix.LargeFile.lstat prefix in if st.st_kind = Unix.S_DIR then loop prefix rest else true with Unix.Unix_error (Unix.ENOENT, _, _) -> false | Unix.Unix_error (Unix.ENOTDIR, _, _) -> true | Unix.Unix_error _ -> false in loop (if is_absolute path then "/" else ".") (split_path path) (** Treat non-owned, non-writable paths as external system environment instead of snapshot payloads. *) let owned_by_other_and_not_writable (path : string) : bool = try let st = Unix.LargeFile.lstat path in st.st_uid <> tracer_uid && try Unix.access path [ Unix.W_OK ]; false with Unix.Unix_error _ -> true with Unix.Unix_error _ -> false let writable_by_current_user (meta : metadata) : bool = if not meta.exists then true else if tracer_uid = 0 then true else meta.mode land 0o222 <> 0 let path_exists (path : string) : bool = match stat_metadata path with | Some _ -> true | None -> false (** Find the nearest existing path Git can use to discover a repository for a possibly deleted or not-yet-created file. *) let existing_anchor (path : string) : string option = let rec loop path = if path = "" || path = "." then None else if path_exists path then Some path else let parent = dirname path in if parent = path then None else loop parent in loop path let relative_path (root : string) (path : string) : string = let root = best_effort_canonical root in let path = normalize_path path in let prefix = if String.ends_with ~suffix:"/" root then root else root ^ "/" in if path = root then "" else if String.starts_with ~prefix path then String.sub path (String.length prefix) (String.length path - String.length prefix) else path (** Per-repository cache populated on first observation. Reused for every subsequent path classified under the same repo so that [Ocaml_git.status] runs at most once per repo. The repo handle stays open for the lifetime of the snapshot run so [index_contains] fallbacks for clean tracked files don't have to reopen the repo and index. *) type repo_cache = { workdir : string; head : string; status_by_relpath : (string, Ocaml_git.status_entry) Hashtbl.t; repo_handle : Ocaml_git.t; mutable index_handle : Ocaml_git.index option; } let repo_cache_table : (string, repo_cache) Hashtbl.t = Hashtbl.create 8 let close_repo_caches () : unit = Hashtbl.iter (fun _ cache -> Option.iter Ocaml_git.close_index cache.index_handle; cache.index_handle <- None; try Ocaml_git.close cache.repo_handle with Ocaml_git.Git_error _ -> ()) repo_cache_table; Hashtbl.clear repo_cache_table let load_repo_cache (discovered : string) : repo_cache = let repo = Ocaml_git.open_repo discovered in let workdir = match Ocaml_git.workdir repo with | Some w -> best_effort_canonical w | None -> "" in let head = try (Ocaml_git.head_commit repo).id with Ocaml_git.Git_error _ -> "" in let status = Ocaml_git.status repo in let by_rel = Hashtbl.create (max 16 (List.length status)) in List.iter (fun (entry : Ocaml_git.status_entry) -> Hashtbl.replace by_rel entry.path entry) status; { workdir; head; status_by_relpath = by_rel; repo_handle = repo; index_handle = None } let cache_index (cache : repo_cache) : Ocaml_git.index = match cache.index_handle with | Some i -> i | None -> let i = Ocaml_git.index cache.repo_handle in cache.index_handle <- Some i; i (** Classify a path against its containing Git repository, recording repo facts needed for compact manifests. *) let classify_git (input_path : string) : git_info = let info = empty_git () in let anchor = match existing_anchor input_path with Some p -> Some p | None -> existing_anchor (dirname input_path) in match anchor with | None -> info | Some anchor -> ( try let discovered = Ocaml_git.discover anchor in let cache = match Hashtbl.find_opt repo_cache_table discovered with | Some c -> c | None -> let c = load_repo_cache discovered in Hashtbl.add repo_cache_table discovered c; c in if cache.workdir = "" then info else let root = cache.workdir in let rel = relative_path root input_path in info.in_repo <- true; info.root <- root; info.relative_path <- rel; info.head <- cache.head; let status_entry = Hashtbl.find_opt cache.status_by_relpath rel in let ignored, dirty, tracked = match status_entry with | Some entry -> let ignored = List.exists (( = ) Ocaml_git.Ignored) entry.flags in let dirty = entry.flags <> [] && entry.flags <> [ Ocaml_git.Current ] in let tracked = if ignored then false else not (List.exists (( = ) Ocaml_git.Worktree_new) entry.flags) in ignored, dirty, tracked | None -> false, false, Ocaml_git.index_contains (cache_index cache) rel in info.ignored <- ignored; info.tracked <- tracked; info.dirty <- dirty; let rec_record = match Hashtbl.find_opt repos root with | Some rec_record -> rec_record | None -> let rec_record : repo_record = { root; head = info.head; dirty = false } in Hashtbl.add repos root rec_record; rec_record in rec_record.head <- info.head; rec_record.dirty <- rec_record.dirty || info.dirty; info with Ocaml_git.Git_error _ -> info) let copy_file (src : string) (dst : string) : unit = FileUtil.cp [ src ] dst let read_file_bin (path : string) : string option = let ic = open_in_bin path in Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () -> try Some (really_input_string ic (in_channel_length ic)) with Sys_error _ | Invalid_argument _ | End_of_file -> None) let write_file_bin (path : string) (contents : string) : unit = let oc = open_out_bin path in Fun.protect ~finally:(fun () -> close_out_noerr oc) (fun () -> output_string oc contents) module Binary = struct type t = string end type blob_store_entry = { key : string ; content : Binary.t } [@@deriving arrow] (** Row batches written as row groups so capture never holds all blob payloads in memory. *) let blob_batch_max = 64 let blob_batch_keys_rev : string list ref = ref [] let blob_batch_contents_rev : string list ref = ref [] let blob_batch_count : int ref = ref 0 let blob_dir_ready : bool ref = ref false let blob_row_group_writer : Arrow_c_api.Writer.Row_group_writer.t option ref = ref None let blob_row_groups_written : bool ref = ref false let reset_blob_writer () : unit = blob_batch_keys_rev := []; blob_batch_contents_rev := []; blob_batch_count := 0; blob_dir_ready := false; blob_row_group_writer := None; blob_row_groups_written := false let ensure_blob_dir () : unit = blob_dir_ready := true let active_blob_row_group_writer () : Arrow_c_api.Writer.Row_group_writer.t = match !blob_row_group_writer with | Some writer -> writer | None -> invalid_arg "blob row group writer is not active" let flush_blob_batch () : unit = if !blob_batch_count = 0 then () else ( let keys = List.rev !blob_batch_keys_rev |> Array.of_list in let contents = List.rev !blob_batch_contents_rev |> Array.of_list in blob_batch_keys_rev := []; blob_batch_contents_rev := []; blob_batch_count := 0; let writer = active_blob_row_group_writer () in Arrow_c_api.Writer.Row_group_writer.write_exn writer ~cols: [ Arrow_c_api.Writer.utf8 keys ~name:"key"; Arrow_c_api.Writer.binary contents ~name:"content"; ]; blob_row_groups_written := true) let close_blob_writer () : unit = flush_blob_batch (); blob_row_group_writer := None let with_blob_row_group_writer (f : unit -> 'a) : 'a = ensure_blob_dir (); let path = concat_path !snapshot_dir "blobs.parquet" in try Arrow_c_api.Writer.with_row_group_writer ~compression:Arrow_c_api.Compression.Snappy path ~f:(fun writer -> blob_row_group_writer := Some writer; Fun.protect ~finally:(fun () -> blob_row_group_writer := None) f) with | Invalid_argument msg when (not !blob_row_groups_written) && msg = "Writer.with_row_group_writer: no row groups were written" -> if FileUtil.test FileUtil.Exists path then FileUtil.rm ~force:FileUtil.Force [ path ]; () let blob_key (state : string) (path : string) : string = state ^ ":" ^ best_effort_canonical path let store_blob (state : string) (path : string) (meta : metadata) : string option = ensure_blob_dir (); let key = blob_key state path in match read_file_bin path with | None -> meta.size <- -1L; None | Some content -> blob_batch_keys_rev := key :: !blob_batch_keys_rev; blob_batch_contents_rev := content :: !blob_batch_contents_rev; incr blob_batch_count; if !blob_batch_count >= blob_batch_max then flush_blob_batch (); Some key (** Restore reads the blob store into a map. This runs once per restore, not during capture. *) let load_blob_store (dir : string) : (string, string) Hashtbl.t = let tbl = Hashtbl.create 128 in let path = concat_path dir "blobs.parquet" in if FileUtil.test FileUtil.Exists path then ( let table = Arrow_c_api.Parquet_reader.table path in let entries = arrow_blob_store_entry_of_table table in Array.iter (fun { key; content } -> Hashtbl.replace tbl key content) entries); tbl let should_capture_content (path : string) (meta : metadata) (git : git_info) : bool = if (not meta.exists) || not meta.regular then false else if owned_by_other_and_not_writable path then false else if git.in_repo && git.tracked && not git.dirty then false else writable_by_current_user meta (** Record the first observed before-state immediately, because creates/deletes can destroy that evidence by syscall exit. *) let record_observation (raw_path : string) (operation : string) : unit = if raw_path <> "" then let path = best_effort_canonical raw_path in if (not (is_ignored_path path)) && (not (path_has_non_directory_prefix path)) && (not (path_is_special_file path)) && not (owned_by_other_and_not_writable path) then ( let recd = match Hashtbl.find_opt files path with | Some recd -> recd | None -> let recd = { path; operations = Hashtbl.create 5; before = empty_metadata (); after = empty_metadata (); before_git = empty_git (); after_git = empty_git (); before_external_system = false; after_external_system = false; before_recorded = false; } in Hashtbl.add files path recd; recd in Hashtbl.replace recd.operations operation (); if not recd.before_recorded then ( recd.before_recorded <- true; recd.before <- Option.value (stat_metadata path) ~default:(empty_metadata ()); recd.before_external_system <- recd.before.exists && owned_by_other_and_not_writable path; recd.before_git <- classify_git path; if should_capture_content path recd.before recd.before_git then recd.before.blob <- store_blob "before" path recd.before)) (** Capture after-state once the traced process tree has exited and filesystem writes have quiesced. *) let finalize_records () : unit = (* Discard the trace-time repo cache so post-trace classifications see fresh status. This matters when the traced program commits or otherwise mutates a repository during the run. *) close_repo_caches (); Hashtbl.iter (fun _ recd -> if not (is_ignored_path recd.path) then ( recd.after <- Option.value (stat_metadata recd.path) ~default:(empty_metadata ()); if not recd.after.exists then recd.after.tombstone <- Hashtbl.mem recd.operations "delete"; recd.after_external_system <- recd.after.exists && owned_by_other_and_not_writable recd.path; recd.after_git <- classify_git recd.path; let written_regular = Hashtbl.mem recd.operations "write" && recd.after.exists && recd.after.regular in if (written_regular && not (owned_by_other_and_not_writable recd.path)) || should_capture_content recd.path recd.after recd.after_git then recd.after.blob <- store_blob "after" recd.path recd.after)) files let manifest_metadata_of_metadata (meta : metadata) : Manifest_json.metadata = { exists = meta.exists; tombstone = meta.tombstone; type_ = (if meta.exists then Some (utf8_string (if meta.directory then "directory" else if meta.regular then "file" else "other")) else None); mode = if meta.exists then Some meta.mode else None; size = None; mtime = if meta.exists then Some meta.mtime else None; blob = Option.map utf8_string meta.blob; } let manifest_git_of_git_info (git : git_info) : Manifest_json.git = if not git.in_repo then { in_repo = false; root = None; head = None; relative_path = None; tracked = None; dirty = None; ignored = None; } else { in_repo = true; root = Some (utf8_string git.root); head = Some (utf8_string git.head); relative_path = Some (utf8_string git.relative_path); tracked = Some git.tracked; dirty = Some git.dirty; ignored = Some git.ignored; } let manifest_repo_of_repo_record (repo : repo_record) : Manifest_json.repo = { root = utf8_string repo.root; head = utf8_string repo.head; dirty = repo.dirty } let operation_was_recorded (recd : file_record) (operation : string) : bool = Hashtbl.mem recd.operations operation let record_has_mutation (recd : file_record) : bool = operation_was_recorded recd "write" || operation_was_recorded recd "delete" let record_is_transient_mutation (recd : file_record) : bool = (not recd.before.exists) && (not recd.after.exists) && record_has_mutation recd let record_should_be_manifested (recd : file_record) : bool = (not (metadata_is_special recd.before)) && (not (metadata_is_special recd.after)) && (not recd.before_external_system) && (not recd.after_external_system) && not (record_is_transient_mutation recd) let write_manifest (out : string) (command : string list) (exit_status : int) : unit = let git_repositories = Hashtbl.fold (fun _ (repo : repo_record) acc -> manifest_repo_of_repo_record repo :: acc) repos [] |> List.sort Stdlib.compare in let manifest_files = Hashtbl.fold (fun _ recd acc -> if not (record_should_be_manifested recd) then acc else let operations = Hashtbl.fold (fun op () acc -> op :: acc) recd.operations [] |> List.sort String.compare |> List.map utf8_string in let git = if recd.after_git.in_repo then recd.after_git else recd.before_git in ({ path = utf8_string recd.path; operations; before = manifest_metadata_of_metadata recd.before; after = manifest_metadata_of_metadata recd.after; git = manifest_git_of_git_info git; } : Manifest_json.file_entry) :: acc) files [] |> List.sort Stdlib.compare in let manifest : Manifest_json.t = { command = List.map utf8_string command; exit_status; start_cwd = utf8_string (FileUtil.pwd ()); uid = tracer_uid; gid = tracer_gid; git_repositories; files = manifest_files; } in Json.to_file ~std:true (concat_path out "manifest.json") (Manifest_json.to_yojson manifest) let print_snapshot_summary () : unit = let updated_files = ref 0 in let uncommitted_read_files = ref 0 in Hashtbl.iter (fun _ recd -> if record_should_be_manifested recd && record_has_mutation recd && (Option.is_some recd.after.blob || recd.after.tombstone) then incr updated_files; if record_should_be_manifested recd && operation_was_recorded recd "read" && Option.is_some recd.before.blob then incr uncommitted_read_files) files; Printf.eprintf "Snapshot directory: %s\nWorked in %d repositories. Saved %d updated files. Saved %d read files in the snapshot that were not committed.\n%!" !snapshot_dir (Hashtbl.length repos) !updated_files !uncommitted_read_files let trace_command (command : string list) : unit = Ptrace_easy.trace command (function | Ptrace_easy.Pre_open { path; intent = { write = true; _ }; _ } -> record_observation path "write" | Ptrace_easy.Pre_open _ -> () | Ptrace_easy.Post_open { path; intent; ok = true; _ } -> if intent.read then record_observation path "read"; if intent.write then record_observation path "write"; if intent.directory_only then record_observation path "directory" | Ptrace_easy.Post_open { path; ok = false; _ } -> record_observation path "existence" | Ptrace_easy.Pre_unlink { path; _ } | Ptrace_easy.Post_unlink { path; _ } -> record_observation path "delete" | Ptrace_easy.Pre_rename { src; dst; _ } | Ptrace_easy.Post_rename { src; dst; _ } -> record_observation src "delete"; record_observation dst "write" | Ptrace_easy.Stat_like { path; _ } -> record_observation path "existence" | Ptrace_easy.Read_dir { path; _ } -> record_observation path "directory" | Ptrace_easy.Make_dir { path; _ } -> record_observation path "write" | Ptrace_easy.Truncate { path; _ } -> record_observation path "write" | Ptrace_easy.Fork _ | Ptrace_easy.Process_exit _ -> ()) let remove_all (path : string) : unit = if FileUtil.test FileUtil.Exists path then FileUtil.rm ~recurse:true ~force:FileUtil.Force [ path ] (** Restore only blobbed final states and tombstones; Git-reconstructable files remain manifest references. *) let restore_snapshot (dir : string) : unit = let manifest = Json.from_file (concat_path dir "manifest.json") in let files = match manifest with | `Assoc fields -> (match List.assoc_opt "files" fields with Some (`List files) -> files | _ -> []) | _ -> [] in let blobs_ht = load_blob_store dir in List.iter (fun file_json -> match file_json with | `Assoc item -> ( match (List.assoc_opt "path" item, List.assoc_opt "after" item : Json.t option * Json.t option) with | Some (`String path), Some (`Assoc after_fields) -> ( match Manifest_json.metadata_of_yojson (`Assoc after_fields) with | Error _ -> () | Ok after -> if not after.exists then ( if after.tombstone then try FileUtil.rm ~force:FileUtil.Force [ path ] with FileUtil.RmError _ | Unix.Unix_error (_, _, _) -> ()) else ( match after.blob with | None -> () | Some key -> mkdir_p (dirname path); let tmp = path ^ ".agent-snapshot.tmp" in write_file_bin tmp (Hashtbl.find blobs_ht key); Unix.rename tmp path; (match after.mode with | Some mode -> FileUtil.chmod (`Octal (mode land 0o7777)) [ path ] | None -> ()); (match after.mtime with | Some mtime -> FileUtil.touch ~time:(FileUtil.Touch_timestamp (float_of_int mtime)) path | None -> ()))) | _ -> ()) | _ -> ()) files let install_shutdown_sigint_handler () : unit = let count = ref 0 in Sys.set_signal Sys.sigint (Sys.Signal_handle (fun _ -> incr count; if !count >= 2 then ( Sys.set_signal Sys.sigint Sys.Signal_default; Unix.kill (Unix.getpid ()) Sys.sigint) else ( prerr_endline "Press Ctrl+C again to terminate the snapshot"; flush stderr))) let run_snapshot (output : string option) (command : string list) : int = load_ignore_config (); let output = match output with | Some dir -> dir | None -> timestamped_snapshot_dir_next_to_config () in snapshot_dir := output; reset_blob_writer (); remove_all output; mkdir_p output; Fun.protect ~finally:close_repo_caches (fun () -> with_blob_row_group_writer (fun () -> trace_command command; install_shutdown_sigint_handler (); finalize_records (); close_blob_writer ()); write_manifest output command 0; print_snapshot_summary ()); 0 open Cmdliner let output_arg = Arg.( value & opt (some string) None & info [ "o"; "output"; "snapshot-dir" ] ~docv:"SNAPDIR" ~doc: "Write the snapshot under this directory. When omitted, a new timestamped directory is created next to the \ ignore configuration file (the same directory as ignore.json).") let command_arg = Arg.( value & pos_all string [] & info [] ~docv:"ARG" ~doc:"Command to trace and its arguments.") let restore_snapdir = Arg.( required & pos 0 (some string) None & info [] ~docv:"SNAPDIR" ~doc:"Snapshot directory produced by a prior run.") let snapshot_term : unit Term.t = Term.( const (fun output cmd -> if cmd = [] then raise (Usage_error "usage: agent-snapshot [--snapshot-dir SNAPDIR] command args..."); ignore (run_snapshot output cmd : int)) $ output_arg $ command_arg) (* Restore is temporarily disabled while the default command form treats the first non-option argument as the traced program name. let restore_term : unit Term.t = Term.(const (fun dir -> restore_snapshot dir) $ restore_snapdir) *) let option_takes_value (arg : string) : bool = match arg with | "-o" | "--output" | "--snapshot-dir" -> true | _ -> false let is_help_option (arg : string) : bool = arg = "-h" || arg = "--help" || String.starts_with ~prefix:"--help=" arg let normalize_snapshot_argv (argv : string array) : string array = let len = Array.length argv in let rec loop i = if i >= len then None else let arg = argv.(i) in if arg = "--" || is_help_option arg then None else if option_takes_value arg then loop (i + 2) else if String.starts_with ~prefix:"--output=" arg || String.starts_with ~prefix:"--snapshot-dir=" arg then loop (i + 1) else if String.starts_with ~prefix:"-" arg then loop (i + 1) else Some i in match loop 1 with | None -> argv | Some command_index -> Array.init (len + 1) (fun i -> if i < command_index then argv.(i) else if i = command_index then "--" else argv.(i - 1)) let cmd_main : unit Cmd.t = let doc = "Filesystem snapshotter for traced commands" in let man = [ `S Manpage.s_description; `P "Traces a command with ptrace and records a manifest plus file blobs."; `S Manpage.s_examples; `P "$(tool) $(b,--snapshot-dir) /tmp/snap make all"; `P "$(tool) $(b,--output) /tmp/snap make all"; `P "$(tool) make all"; `S "COPYRIGHT"; `P "Copyright (c) 2026 Arjun Guha"; `S "LICENSE"; `P "BSD-3 licensed."; ] in let main_info = Cmd.info "agent-snapshot" ~doc ~man in (* let restore_info = Cmd.info "restore" ~docs:Manpage.s_commands ~doc:"Restore blobbed files and tombstones from a snapshot directory." in let restore_cmd = Cmd.v restore_info restore_term in Cmd.group main_info ~default:snapshot_term [ restore_cmd ] *) Cmd.v main_info snapshot_term let main () : unit = try let rc = Cmd.eval ~catch:false ~argv:(normalize_snapshot_argv Sys.argv) cmd_main in Ocaml_git.shutdown (); exit rc with | Usage_error msg -> Ocaml_git.shutdown (); Printf.eprintf "agent-snapshot: %s\n%!" msg; exit 1 | exn -> Ocaml_git.shutdown (); Printf.eprintf "agent-snapshot: %s\n%!" (Printexc.to_string exn); exit 1 let () = main ()