Repositories / agent-snapshot.git
src/ocaml/agent_snapshot.ml
Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git
module Json = Yojson.Safe
(** Yojson does not validate UTF-8 on output. Keep valid UTF-8 unchanged; map arbitrary bytes through Latin-1 so manifest files remain UTF-8 text. *)
let utf8_string (s : string) : string =
let module Enc = Camomile.CharEncoding in
try Enc.recode_string ~in_enc:Enc.utf8 ~out_enc:Enc.utf8 s
with Enc.Malformed_code -> Enc.recode_string ~in_enc:Enc.latin1 ~out_enc:Enc.utf8 s
type metadata = {
mutable exists : bool;
mutable tombstone : bool;
mutable regular : bool;
mutable directory : bool;
mutable mode : int;
mutable size : int64;
mutable mtime : int;
mutable blob : string option;
}
type git_info = {
mutable in_repo : bool;
mutable tracked : bool;
mutable dirty : bool;
mutable ignored : bool;
mutable root : string;
mutable head : string;
mutable relative_path : string;
}
type file_record = {
path : string;
operations : (string, unit) Hashtbl.t;
mutable before : metadata;
mutable after : metadata;
mutable before_git : git_info;
mutable after_git : git_info;
mutable before_external_system : bool;
mutable after_external_system : bool;
mutable before_recorded : bool;
}
type repo_record = {
root : string;
mutable head : string;
mutable dirty : bool;
}
(** JSON array of strings in the ignore config file. *)
type ignore_file_entries = string list [@@deriving yojson]
(** Wire encoding of [manifest.json] ([write_manifest]); restore reads subsets by hand. *)
module Manifest_json = struct
(** One row of ["git_repositories"]. *)
type repo = {
root : string;
head : string;
dirty : bool;
}
[@@deriving yojson { strict = true }]
(** ["before"] / ["after"]: keys mirror [manifest_metadata_of_metadata] output exactly
([exists], optional [tombstone], and when [exists] then [type], [mode], [mtime];
optional [blob]). *)
type metadata = {
exists : bool;
tombstone : bool [@default false];
type_ : string option [@key "type"] [@default None];
mode : int option [@default None];
size : int64 option [@default None];
mtime : int option [@default None];
blob : string option [@default None];
}
[@@deriving yojson { strict = true }]
(** When [in_repo] is false only [["in_repo"]] appears; when true, all repository fields appear. *)
type git = {
in_repo : bool;
root : string option [@default None];
head : string option [@default None];
relative_path : string option [@default None];
tracked : bool option [@default None];
dirty : bool option [@default None];
ignored : bool option [@default None];
}
[@@deriving yojson { strict = true }]
(** One element of [["files"]]. *)
type file_entry = {
path : string;
operations : string list;
before : metadata;
after : metadata;
git : git;
}
[@@deriving yojson { strict = true }]
(** Root object ([manifest.json]). *)
type t = {
command : string list;
exit_status : int;
start_cwd : string;
uid : int;
gid : int;
git_repositories : repo list;
files : file_entry list;
}
[@@deriving yojson { strict = true }]
end
let empty_metadata () : metadata =
{ exists = false; tombstone = false; regular = false; directory = false; mode = 0; size = 0L; mtime = 0; blob = None }
let empty_git () : git_info =
{ in_repo = false; tracked = false; dirty = false; ignored = false; root = ""; head = ""; relative_path = "" }
let files : (string, file_record) Hashtbl.t = Hashtbl.create 128
let repos : (string, repo_record) Hashtbl.t = Hashtbl.create 8
let ignored_paths : string list ref = ref []
let ignore_config_path : string ref = ref ""
let snapshot_dir : string ref = ref ""
let tracer_uid : int = Unix.getuid ()
let tracer_gid : int = Unix.getgid ()
exception Usage_error of string
let split_path (path : string) : string list =
if path = "" then []
else Fpath.segs (Fpath.normalize (Fpath.v path)) |> List.filter (fun seg -> seg <> "")
let normalize_path (path : string) : string =
if path = "" then "." else Fpath.to_string (Fpath.normalize (Fpath.v path))
let concat_path (base : string) (path : string) : string =
if path = "" then base
else Fpath.to_string (Fpath.normalize (Fpath.append (Fpath.v base) (Fpath.v path)))
let dirname (path : string) : string =
if path = "" then "."
else Fpath.to_string (Fpath.parent (Fpath.normalize (Fpath.v path)))
let mkdir_p (path : string) : unit =
if path = "" || path = "/" || FileUtil.test FileUtil.Exists path then ()
else FileUtil.mkdir ~parent:true ~mode:(`Octal 0o777) path
let is_absolute (path : string) : bool = path <> "" && Fpath.is_abs (Fpath.v path)
let realpath_opt (path : string) : string option = try Some (Unix.realpath path) with Unix.Unix_error _ -> None
(** Canonicalize existing paths, but still return a stable absolute lexical path for paths that do not exist yet. *)
let best_effort_canonical (path : string) : string =
match realpath_opt path with
| Some path -> normalize_path path
| None when is_absolute path -> normalize_path path
| None -> concat_path (FileUtil.pwd ()) path
let path_is_at_or_under (path : string) (root : string) : bool =
path = root ||
let root = if String.ends_with ~suffix:"/" root then root else root ^ "/" in
String.starts_with ~prefix:root path
let is_git_internal_path (path : string) : bool = List.exists (( = ) ".git") (split_path path)
let always_ignored_paths : string list = [ "/proc"; "/dev" ]
let is_ignored_path (raw_path : string) : bool =
if raw_path = "" then false
else
let path = best_effort_canonical raw_path in
is_git_internal_path path
|| List.exists (fun ignored -> path_is_at_or_under path ignored) always_ignored_paths
|| List.exists (fun ignored -> path_is_at_or_under path ignored) !ignored_paths
let home_dir () : string =
match Sys.getenv_opt "HOME" with
| Some home when home <> "" -> home
| _ -> failwith "HOME is unavailable"
let xdg_config_home_dir () : string =
match Sys.getenv_opt "XDG_CONFIG_HOME" with
| Some path when path <> "" -> path
| _ -> concat_path (home_dir ()) ".config"
let xdg_ignore_config_path () : string = concat_path (xdg_config_home_dir ()) "agent-snapshot/ignore.json"
let expand_ignore_entry (entry : string) : string =
let home = "$HOME" in
let xdg = "$XDG_CONFIG_HOME" in
if entry = home then home_dir ()
else if String.starts_with ~prefix:(home ^ "/") entry then concat_path (home_dir ()) (String.sub entry 6 (String.length entry - 6))
else if entry = xdg then xdg_config_home_dir ()
else if String.starts_with ~prefix:(xdg ^ "/") entry then concat_path (xdg_config_home_dir ()) (String.sub entry 17 (String.length entry - 17))
else entry
(** Shipped default when [ignore.json] is missing; matches the documented example plus common system trees. *)
let default_ignore_file_entries () : ignore_file_entries =
[
"$HOME/.cache";
"$HOME/.claude";
"$HOME/.codex";
"$HOME/.cursor";
"$XDG_CONFIG_HOME/agent-snapshot/ignore.json";
"/tmp/scratch-output";
"/proc";
"/dev";
"/usr";
"/bin";
]
let load_ignore_config () : unit =
ignore_config_path := best_effort_canonical (xdg_ignore_config_path ());
let json =
try Json.from_file !ignore_config_path
with Sys_error _ ->
mkdir_p (dirname !ignore_config_path);
let text = Json.pretty_to_string ~std:true (ignore_file_entries_to_yojson (default_ignore_file_entries ())) ^ "\n" in
Printf.eprintf "Created default ignore file: %s\n%!" !ignore_config_path;
let oc = open_out_bin !ignore_config_path in
Fun.protect
~finally:(fun () -> close_out_noerr oc)
(fun () -> output_string oc text);
Json.from_file !ignore_config_path
in
match ignore_file_entries_of_yojson json with
| Ok entries ->
ignored_paths :=
!ignore_config_path :: List.map (fun entry -> best_effort_canonical (expand_ignore_entry entry)) entries
| Error msg -> failwith ("ignore config: " ^ msg ^ " (" ^ !ignore_config_path ^ ")")
(** Directory name for a default snapshot bundle next to [ignore.json]. *)
let timestamped_snapshot_dir_next_to_config () : string =
let tm = Unix.(localtime (gettimeofday ())) in
let name =
Printf.sprintf "%04d-%02d-%02dT%02d-%02d-%02d"
(tm.tm_year + 1900) (tm.tm_mon + 1) tm.tm_mday tm.tm_hour tm.tm_min tm.tm_sec
in
concat_path (dirname !ignore_config_path) name
let mode_of_kind (kind : Unix.file_kind) : int =
match kind with
| Unix.S_REG -> 0o100000
| Unix.S_DIR -> 0o040000
| Unix.S_LNK -> 0o120000
| Unix.S_CHR -> 0o020000
| Unix.S_BLK -> 0o060000
| Unix.S_FIFO -> 0o010000
| Unix.S_SOCK -> 0o140000
let kind_is_special (kind : Unix.file_kind) : bool =
match kind with
| Unix.S_CHR | Unix.S_BLK | Unix.S_FIFO | Unix.S_SOCK -> true
| Unix.S_REG | Unix.S_DIR | Unix.S_LNK -> false
let metadata_is_special (meta : metadata) : bool =
if not meta.exists then false
else
match meta.mode land 0o170000 with
| 0o020000 | 0o060000 | 0o010000 | 0o140000 -> true
| _ -> false
let path_is_special_file (path : string) : bool =
try
let st = Unix.LargeFile.lstat path in
kind_is_special st.st_kind
with Unix.Unix_error _ -> false
let stat_metadata (path : string) : metadata option =
try
let st = Unix.LargeFile.lstat path in
Some
{
exists = true;
tombstone = false;
regular = st.st_kind = Unix.S_REG;
directory = st.st_kind = Unix.S_DIR;
mode = mode_of_kind st.st_kind lor st.st_perm;
size = st.st_size;
mtime = int_of_float st.st_mtime;
blob = None;
}
with Unix.Unix_error _ -> None
let path_has_non_directory_prefix (path : string) : bool =
let rec loop prefix = function
| [] | [ _ ] -> false
| seg :: rest ->
let prefix = concat_path prefix seg in
try
let st = Unix.LargeFile.lstat prefix in
if st.st_kind = Unix.S_DIR then loop prefix rest else true
with Unix.Unix_error (Unix.ENOENT, _, _) -> false
| Unix.Unix_error (Unix.ENOTDIR, _, _) -> true
| Unix.Unix_error _ -> false
in
loop (if is_absolute path then "/" else ".") (split_path path)
(** Treat non-owned, non-writable paths as external system environment instead of snapshot payloads. *)
let owned_by_other_and_not_writable (path : string) : bool =
try
let st = Unix.LargeFile.lstat path in
st.st_uid <> tracer_uid
&&
try
Unix.access path [ Unix.W_OK ];
false
with Unix.Unix_error _ -> true
with Unix.Unix_error _ -> false
let writable_by_current_user (meta : metadata) : bool =
if not meta.exists then true
else if tracer_uid = 0 then true
else meta.mode land 0o222 <> 0
let path_exists (path : string) : bool =
match stat_metadata path with
| Some _ -> true
| None -> false
(** Find the nearest existing path Git can use to discover a repository for a possibly deleted or not-yet-created file. *)
let existing_anchor (path : string) : string option =
let rec loop path =
if path = "" || path = "." then None
else if path_exists path then Some path
else
let parent = dirname path in
if parent = path then None else loop parent
in
loop path
let relative_path (root : string) (path : string) : string =
let root = best_effort_canonical root in
let path = normalize_path path in
let prefix = if String.ends_with ~suffix:"/" root then root else root ^ "/" in
if path = root then "" else if String.starts_with ~prefix path then String.sub path (String.length prefix) (String.length path - String.length prefix) else path
(** Per-repository cache populated on first observation. Reused for every
subsequent path classified under the same repo so that [Ocaml_git.status]
runs at most once per repo. The repo handle stays open for the lifetime of
the snapshot run so [index_contains] fallbacks for clean tracked files
don't have to reopen the repo and index. *)
type repo_cache = {
workdir : string;
head : string;
status_by_relpath : (string, Ocaml_git.status_entry) Hashtbl.t;
repo_handle : Ocaml_git.t;
mutable index_handle : Ocaml_git.index option;
}
let repo_cache_table : (string, repo_cache) Hashtbl.t = Hashtbl.create 8
let close_repo_caches () : unit =
Hashtbl.iter
(fun _ cache ->
Option.iter Ocaml_git.close_index cache.index_handle;
cache.index_handle <- None;
try Ocaml_git.close cache.repo_handle with Ocaml_git.Git_error _ -> ())
repo_cache_table;
Hashtbl.clear repo_cache_table
let load_repo_cache (discovered : string) : repo_cache =
let repo = Ocaml_git.open_repo discovered in
let workdir =
match Ocaml_git.workdir repo with
| Some w -> best_effort_canonical w
| None -> ""
in
let head = try (Ocaml_git.head_commit repo).id with Ocaml_git.Git_error _ -> "" in
let status = Ocaml_git.status repo in
let by_rel = Hashtbl.create (max 16 (List.length status)) in
List.iter (fun (entry : Ocaml_git.status_entry) -> Hashtbl.replace by_rel entry.path entry) status;
{ workdir; head; status_by_relpath = by_rel; repo_handle = repo; index_handle = None }
let cache_index (cache : repo_cache) : Ocaml_git.index =
match cache.index_handle with
| Some i -> i
| None ->
let i = Ocaml_git.index cache.repo_handle in
cache.index_handle <- Some i;
i
(** Classify a path against its containing Git repository, recording repo facts needed for compact manifests. *)
let classify_git (input_path : string) : git_info =
let info = empty_git () in
let anchor = match existing_anchor input_path with Some p -> Some p | None -> existing_anchor (dirname input_path) in
match anchor with
| None -> info
| Some anchor -> (
try
let discovered = Ocaml_git.discover anchor in
let cache =
match Hashtbl.find_opt repo_cache_table discovered with
| Some c -> c
| None ->
let c = load_repo_cache discovered in
Hashtbl.add repo_cache_table discovered c;
c
in
if cache.workdir = "" then info
else
let root = cache.workdir in
let rel = relative_path root input_path in
info.in_repo <- true;
info.root <- root;
info.relative_path <- rel;
info.head <- cache.head;
let status_entry = Hashtbl.find_opt cache.status_by_relpath rel in
let ignored, dirty, tracked =
match status_entry with
| Some entry ->
let ignored = List.exists (( = ) Ocaml_git.Ignored) entry.flags in
let dirty = entry.flags <> [] && entry.flags <> [ Ocaml_git.Current ] in
let tracked =
if ignored then false
else not (List.exists (( = ) Ocaml_git.Worktree_new) entry.flags)
in
ignored, dirty, tracked
| None ->
false, false, Ocaml_git.index_contains (cache_index cache) rel
in
info.ignored <- ignored;
info.tracked <- tracked;
info.dirty <- dirty;
let rec_record =
match Hashtbl.find_opt repos root with
| Some rec_record -> rec_record
| None ->
let rec_record : repo_record = { root; head = info.head; dirty = false } in
Hashtbl.add repos root rec_record;
rec_record
in
rec_record.head <- info.head;
rec_record.dirty <- rec_record.dirty || info.dirty;
info
with Ocaml_git.Git_error _ -> info)
let copy_file (src : string) (dst : string) : unit = FileUtil.cp [ src ] dst
let read_file_bin (path : string) : string option =
let ic = open_in_bin path in
Fun.protect
~finally:(fun () -> close_in_noerr ic)
(fun () ->
try Some (really_input_string ic (in_channel_length ic))
with Sys_error _ | Invalid_argument _ | End_of_file -> None)
let write_file_bin (path : string) (contents : string) : unit =
let oc = open_out_bin path in
Fun.protect
~finally:(fun () -> close_out_noerr oc)
(fun () -> output_string oc contents)
module Binary = struct
type t = string
end
type blob_store_entry =
{ key : string
; content : Binary.t
}
[@@deriving arrow]
(** Row batches written as row groups so capture never holds all
blob payloads in memory. *)
let blob_batch_max = 64
let blob_batch_keys_rev : string list ref = ref []
let blob_batch_contents_rev : string list ref = ref []
let blob_batch_count : int ref = ref 0
let blob_dir_ready : bool ref = ref false
let blob_row_group_writer : Arrow_c_api.Writer.Row_group_writer.t option ref = ref None
let blob_row_groups_written : bool ref = ref false
let reset_blob_writer () : unit =
blob_batch_keys_rev := [];
blob_batch_contents_rev := [];
blob_batch_count := 0;
blob_dir_ready := false;
blob_row_group_writer := None;
blob_row_groups_written := false
let ensure_blob_dir () : unit = blob_dir_ready := true
let active_blob_row_group_writer () : Arrow_c_api.Writer.Row_group_writer.t =
match !blob_row_group_writer with
| Some writer -> writer
| None -> invalid_arg "blob row group writer is not active"
let flush_blob_batch () : unit =
if !blob_batch_count = 0 then ()
else (
let keys = List.rev !blob_batch_keys_rev |> Array.of_list in
let contents = List.rev !blob_batch_contents_rev |> Array.of_list in
blob_batch_keys_rev := [];
blob_batch_contents_rev := [];
blob_batch_count := 0;
let writer = active_blob_row_group_writer () in
Arrow_c_api.Writer.Row_group_writer.write_exn
writer
~cols:
[
Arrow_c_api.Writer.utf8 keys ~name:"key";
Arrow_c_api.Writer.binary contents ~name:"content";
];
blob_row_groups_written := true)
let close_blob_writer () : unit =
flush_blob_batch ();
blob_row_group_writer := None
let with_blob_row_group_writer (f : unit -> 'a) : 'a =
ensure_blob_dir ();
let path = concat_path !snapshot_dir "blobs.parquet" in
try
Arrow_c_api.Writer.with_row_group_writer ~compression:Arrow_c_api.Compression.Snappy path ~f:(fun writer ->
blob_row_group_writer := Some writer;
Fun.protect ~finally:(fun () -> blob_row_group_writer := None) f)
with
| Invalid_argument msg when (not !blob_row_groups_written) && msg = "Writer.with_row_group_writer: no row groups were written" ->
if FileUtil.test FileUtil.Exists path then FileUtil.rm ~force:FileUtil.Force [ path ];
()
let blob_key (state : string) (path : string) : string = state ^ ":" ^ best_effort_canonical path
let store_blob (state : string) (path : string) (meta : metadata) : string option =
ensure_blob_dir ();
let key = blob_key state path in
match read_file_bin path with
| None ->
meta.size <- -1L;
None
| Some content ->
blob_batch_keys_rev := key :: !blob_batch_keys_rev;
blob_batch_contents_rev := content :: !blob_batch_contents_rev;
incr blob_batch_count;
if !blob_batch_count >= blob_batch_max then flush_blob_batch ();
Some key
(** Restore reads the blob store into a map. This runs once per restore, not during capture. *)
let load_blob_store (dir : string) : (string, string) Hashtbl.t =
let tbl = Hashtbl.create 128 in
let path = concat_path dir "blobs.parquet" in
if FileUtil.test FileUtil.Exists path then (
let table = Arrow_c_api.Parquet_reader.table path in
let entries = arrow_blob_store_entry_of_table table in
Array.iter (fun { key; content } -> Hashtbl.replace tbl key content) entries);
tbl
let should_capture_content (path : string) (meta : metadata) (git : git_info) : bool =
if (not meta.exists) || not meta.regular then false
else if owned_by_other_and_not_writable path then false
else if git.in_repo && git.tracked && not git.dirty then false
else writable_by_current_user meta
(** Record the first observed before-state immediately, because creates/deletes can destroy that evidence by syscall exit. *)
let record_observation (raw_path : string) (operation : string) : unit =
if raw_path <> "" then
let path = best_effort_canonical raw_path in
if
(not (is_ignored_path path))
&& (not (path_has_non_directory_prefix path))
&& (not (path_is_special_file path))
&& not (owned_by_other_and_not_writable path)
then (
let recd =
match Hashtbl.find_opt files path with
| Some recd -> recd
| None ->
let recd =
{
path;
operations = Hashtbl.create 5;
before = empty_metadata ();
after = empty_metadata ();
before_git = empty_git ();
after_git = empty_git ();
before_external_system = false;
after_external_system = false;
before_recorded = false;
}
in
Hashtbl.add files path recd;
recd
in
Hashtbl.replace recd.operations operation ();
if not recd.before_recorded then (
recd.before_recorded <- true;
recd.before <- Option.value (stat_metadata path) ~default:(empty_metadata ());
recd.before_external_system <- recd.before.exists && owned_by_other_and_not_writable path;
recd.before_git <- classify_git path;
if should_capture_content path recd.before recd.before_git then recd.before.blob <- store_blob "before" path recd.before))
(** Capture after-state once the traced process tree has exited and filesystem writes have quiesced. *)
let finalize_records () : unit =
(* Discard the trace-time repo cache so post-trace classifications see fresh
status. This matters when the traced program commits or otherwise mutates
a repository during the run. *)
close_repo_caches ();
Hashtbl.iter
(fun _ recd ->
if not (is_ignored_path recd.path) then (
recd.after <- Option.value (stat_metadata recd.path) ~default:(empty_metadata ());
if not recd.after.exists then recd.after.tombstone <- Hashtbl.mem recd.operations "delete";
recd.after_external_system <- recd.after.exists && owned_by_other_and_not_writable recd.path;
recd.after_git <- classify_git recd.path;
let written_regular = Hashtbl.mem recd.operations "write" && recd.after.exists && recd.after.regular in
if (written_regular && not (owned_by_other_and_not_writable recd.path)) || should_capture_content recd.path recd.after recd.after_git then
recd.after.blob <- store_blob "after" recd.path recd.after))
files
let manifest_metadata_of_metadata (meta : metadata) : Manifest_json.metadata =
{
exists = meta.exists;
tombstone = meta.tombstone;
type_ =
(if meta.exists then
Some (utf8_string (if meta.directory then "directory" else if meta.regular then "file" else "other"))
else None);
mode = if meta.exists then Some meta.mode else None;
size = None;
mtime = if meta.exists then Some meta.mtime else None;
blob = Option.map utf8_string meta.blob;
}
let manifest_git_of_git_info (git : git_info) : Manifest_json.git =
if not git.in_repo then
{
in_repo = false;
root = None;
head = None;
relative_path = None;
tracked = None;
dirty = None;
ignored = None;
}
else
{
in_repo = true;
root = Some (utf8_string git.root);
head = Some (utf8_string git.head);
relative_path = Some (utf8_string git.relative_path);
tracked = Some git.tracked;
dirty = Some git.dirty;
ignored = Some git.ignored;
}
let manifest_repo_of_repo_record (repo : repo_record) : Manifest_json.repo =
{ root = utf8_string repo.root; head = utf8_string repo.head; dirty = repo.dirty }
let operation_was_recorded (recd : file_record) (operation : string) : bool =
Hashtbl.mem recd.operations operation
let record_has_mutation (recd : file_record) : bool =
operation_was_recorded recd "write" || operation_was_recorded recd "delete"
let record_is_transient_mutation (recd : file_record) : bool =
(not recd.before.exists) && (not recd.after.exists) && record_has_mutation recd
let record_should_be_manifested (recd : file_record) : bool =
(not (metadata_is_special recd.before))
&& (not (metadata_is_special recd.after))
&& (not recd.before_external_system)
&& (not recd.after_external_system)
&& not (record_is_transient_mutation recd)
let write_manifest (out : string) (command : string list) (exit_status : int) : unit =
let git_repositories =
Hashtbl.fold (fun _ (repo : repo_record) acc -> manifest_repo_of_repo_record repo :: acc) repos []
|> List.sort Stdlib.compare
in
let manifest_files =
Hashtbl.fold
(fun _ recd acc ->
if not (record_should_be_manifested recd) then acc
else
let operations =
Hashtbl.fold (fun op () acc -> op :: acc) recd.operations [] |> List.sort String.compare |> List.map utf8_string
in
let git = if recd.after_git.in_repo then recd.after_git else recd.before_git in
({
path = utf8_string recd.path;
operations;
before = manifest_metadata_of_metadata recd.before;
after = manifest_metadata_of_metadata recd.after;
git = manifest_git_of_git_info git;
}
: Manifest_json.file_entry)
:: acc)
files []
|> List.sort Stdlib.compare
in
let manifest : Manifest_json.t =
{
command = List.map utf8_string command;
exit_status;
start_cwd = utf8_string (FileUtil.pwd ());
uid = tracer_uid;
gid = tracer_gid;
git_repositories;
files = manifest_files;
}
in
Json.to_file ~std:true (concat_path out "manifest.json") (Manifest_json.to_yojson manifest)
let print_snapshot_summary () : unit =
let updated_files = ref 0 in
let uncommitted_read_files = ref 0 in
Hashtbl.iter
(fun _ recd ->
if record_should_be_manifested recd && record_has_mutation recd && (Option.is_some recd.after.blob || recd.after.tombstone)
then incr updated_files;
if record_should_be_manifested recd && operation_was_recorded recd "read" && Option.is_some recd.before.blob then
incr uncommitted_read_files)
files;
Printf.eprintf
"Snapshot directory: %s\nWorked in %d repositories. Saved %d updated files. Saved %d read files in the snapshot that were not committed.\n%!"
!snapshot_dir
(Hashtbl.length repos) !updated_files !uncommitted_read_files
let trace_command (command : string list) : unit =
Ptrace_easy.trace command (function
| Ptrace_easy.Pre_open { path; intent = { write = true; _ }; _ } ->
record_observation path "write"
| Ptrace_easy.Pre_open _ -> ()
| Ptrace_easy.Post_open { path; intent; ok = true; _ } ->
if intent.read then record_observation path "read";
if intent.write then record_observation path "write";
if intent.directory_only then record_observation path "directory"
| Ptrace_easy.Post_open { path; ok = false; _ } ->
record_observation path "existence"
| Ptrace_easy.Pre_unlink { path; _ } | Ptrace_easy.Post_unlink { path; _ } ->
record_observation path "delete"
| Ptrace_easy.Pre_rename { src; dst; _ } | Ptrace_easy.Post_rename { src; dst; _ } ->
record_observation src "delete";
record_observation dst "write"
| Ptrace_easy.Stat_like { path; _ } -> record_observation path "existence"
| Ptrace_easy.Read_dir { path; _ } -> record_observation path "directory"
| Ptrace_easy.Make_dir { path; _ } -> record_observation path "write"
| Ptrace_easy.Truncate { path; _ } -> record_observation path "write"
| Ptrace_easy.Fork _ | Ptrace_easy.Process_exit _ -> ())
let remove_all (path : string) : unit =
if FileUtil.test FileUtil.Exists path then FileUtil.rm ~recurse:true ~force:FileUtil.Force [ path ]
(** Restore only blobbed final states and tombstones; Git-reconstructable files remain manifest references. *)
let restore_snapshot (dir : string) : unit =
let manifest = Json.from_file (concat_path dir "manifest.json") in
let files =
match manifest with
| `Assoc fields -> (match List.assoc_opt "files" fields with Some (`List files) -> files | _ -> [])
| _ -> []
in
let blobs_ht = load_blob_store dir in
List.iter
(fun file_json ->
match file_json with
| `Assoc item -> (
match (List.assoc_opt "path" item, List.assoc_opt "after" item : Json.t option * Json.t option) with
| Some (`String path), Some (`Assoc after_fields) -> (
match Manifest_json.metadata_of_yojson (`Assoc after_fields) with
| Error _ -> ()
| Ok after ->
if not after.exists then (
if after.tombstone then
try FileUtil.rm ~force:FileUtil.Force [ path ]
with FileUtil.RmError _ | Unix.Unix_error (_, _, _) -> ())
else (
match after.blob with
| None -> ()
| Some key ->
mkdir_p (dirname path);
let tmp = path ^ ".agent-snapshot.tmp" in
write_file_bin tmp (Hashtbl.find blobs_ht key);
Unix.rename tmp path;
(match after.mode with
| Some mode -> FileUtil.chmod (`Octal (mode land 0o7777)) [ path ]
| None -> ());
(match after.mtime with
| Some mtime -> FileUtil.touch ~time:(FileUtil.Touch_timestamp (float_of_int mtime)) path
| None -> ())))
| _ -> ())
| _ -> ())
files
let install_shutdown_sigint_handler () : unit =
let count = ref 0 in
Sys.set_signal Sys.sigint
(Sys.Signal_handle
(fun _ ->
incr count;
if !count >= 2 then (
Sys.set_signal Sys.sigint Sys.Signal_default;
Unix.kill (Unix.getpid ()) Sys.sigint)
else (
prerr_endline "Press Ctrl+C again to terminate the snapshot";
flush stderr)))
let run_snapshot (output : string option) (command : string list) : int =
load_ignore_config ();
let output =
match output with
| Some dir -> dir
| None -> timestamped_snapshot_dir_next_to_config ()
in
snapshot_dir := output;
reset_blob_writer ();
remove_all output;
mkdir_p output;
Fun.protect
~finally:close_repo_caches
(fun () ->
with_blob_row_group_writer (fun () ->
trace_command command;
install_shutdown_sigint_handler ();
finalize_records ();
close_blob_writer ());
write_manifest output command 0;
print_snapshot_summary ());
0
open Cmdliner
let output_arg =
Arg.(
value
& opt (some string) None
& info [ "o"; "output"; "snapshot-dir" ] ~docv:"SNAPDIR"
~doc:
"Write the snapshot under this directory. When omitted, a new timestamped directory is created next to the \
ignore configuration file (the same directory as ignore.json).")
let command_arg =
Arg.(
value
& pos_all string []
& info [] ~docv:"ARG" ~doc:"Command to trace and its arguments.")
let restore_snapdir =
Arg.(
required
& pos 0 (some string) None
& info [] ~docv:"SNAPDIR"
~doc:"Snapshot directory produced by a prior run.")
let snapshot_term : unit Term.t =
Term.(
const (fun output cmd ->
if cmd = [] then
raise (Usage_error "usage: agent-snapshot [--snapshot-dir SNAPDIR] command args...");
ignore (run_snapshot output cmd : int))
$ output_arg
$ command_arg)
(* Restore is temporarily disabled while the default command form treats the
first non-option argument as the traced program name.
let restore_term : unit Term.t =
Term.(const (fun dir -> restore_snapshot dir) $ restore_snapdir)
*)
let option_takes_value (arg : string) : bool =
match arg with
| "-o" | "--output" | "--snapshot-dir" -> true
| _ -> false
let is_help_option (arg : string) : bool =
arg = "-h" || arg = "--help" || String.starts_with ~prefix:"--help=" arg
let normalize_snapshot_argv (argv : string array) : string array =
let len = Array.length argv in
let rec loop i =
if i >= len then None
else
let arg = argv.(i) in
if arg = "--" || is_help_option arg then None
else if option_takes_value arg then loop (i + 2)
else if String.starts_with ~prefix:"--output=" arg || String.starts_with ~prefix:"--snapshot-dir=" arg then loop (i + 1)
else if String.starts_with ~prefix:"-" arg then loop (i + 1)
else Some i
in
match loop 1 with
| None -> argv
| Some command_index ->
Array.init (len + 1) (fun i ->
if i < command_index then argv.(i)
else if i = command_index then "--"
else argv.(i - 1))
let cmd_main : unit Cmd.t =
let doc = "Filesystem snapshotter for traced commands" in
let man =
[
`S Manpage.s_description;
`P "Traces a command with ptrace and records a manifest plus file blobs.";
`S Manpage.s_examples;
`P "$(tool) $(b,--snapshot-dir) /tmp/snap make all";
`P "$(tool) $(b,--output) /tmp/snap make all";
`P "$(tool) make all";
`S "COPYRIGHT";
`P "Copyright (c) 2026 Arjun Guha";
`S "LICENSE";
`P "BSD-3 licensed.";
]
in
let main_info = Cmd.info "agent-snapshot" ~doc ~man in
(*
let restore_info =
Cmd.info "restore" ~docs:Manpage.s_commands
~doc:"Restore blobbed files and tombstones from a snapshot directory."
in
let restore_cmd = Cmd.v restore_info restore_term in
Cmd.group main_info ~default:snapshot_term [ restore_cmd ]
*)
Cmd.v main_info snapshot_term
let main () : unit =
try
let rc = Cmd.eval ~catch:false ~argv:(normalize_snapshot_argv Sys.argv) cmd_main in
Ocaml_git.shutdown ();
exit rc
with
| Usage_error msg ->
Ocaml_git.shutdown ();
Printf.eprintf "agent-snapshot: %s\n%!" msg;
exit 1
| exn ->
Ocaml_git.shutdown ();
Printf.eprintf "agent-snapshot: %s\n%!" (Printexc.to_string exn);
exit 1
let () = main ()