Repositories / agent-snapshot.git
agent-snapshot.git
Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git
@@ -34,10 +34,10 @@ external system environment and are not copied. There are important exceptions: - If the traced program writes a regular file, Agent Snapshot saves its - after-state blob even if the file is clean and Git-tracked when the program - exits. +after-state blob even if the file is clean and Git-tracked when the program +exits. - Dirty Git-tracked files, untracked files, and Git-ignored files under a Git - repo are copied when their contents are needed. +repo are copied when their contents are needed. - Deleted files are represented with tombstones. - Any path inside a `.git` directory is ignored. - Paths explicitly listed in the ignore configuration are ignored. @@ -60,11 +60,7 @@ A snapshot is a directory bundle: ```text snapshot-dir/ manifest.json - blobs/ - before/ - <absolute-path-without-leading-slash> - after/ - <absolute-path-without-leading-slash> + blobs.feather ``` `manifest.json` contains: @@ -80,16 +76,15 @@ Each file record contains: - `path`: absolute path. - `operations`: observed capabilities such as `read`, `write`, `existence`, - `directory`, or `delete`. +`directory`, or `delete`. - `before`: state captured the first time the path was observed. - `after`: state captured after the traced process tree exited. - `git`: Git classification for the path when applicable. Metadata records include whether the path exists, file type, mode, size, mtime, -and optionally a `blob` key. Blob files live under `blobs/` and are addressed by -state-qualified absolute path keys such as `before:/repo/input.txt` or -`after:/repo/generated.txt`. On disk, those keys are stored under -`blobs/before/repo/input.txt` or `blobs/after/repo/generated.txt`. +and optionally a `blob` key. Blob payloads live in `blobs.feather`, an Arrow +file with `key` and binary `content` columns. Blob keys are state-qualified +absolute paths such as `before:/repo/input.txt` or `after:/repo/generated.txt`. Clean Git-tracked reads typically have no blob: @@ -175,8 +170,9 @@ The executable is written to: _build/default/src/ocaml/agent_snapshot.exe ``` -The project depends on OCaml, Dune, Yojson, Camomile, and the local -`vendor/ocaml-git` checkout. +The project depends on OCaml, Dune, Yojson, Camomile, and the `ocaml-git` +package. Install the latter with opam (for a local checkout next to this repo, +`opam install ../../homebox/ocaml-git` from the repository root). Create the required ignore configuration before running snapshots: @@ -251,3 +247,4 @@ Run the test suite with `uv`: ```bash uv run pytest ``` +
@@ -12,4 +12,5 @@ camomile ocaml-git fpath - fileutils)) + fileutils + arrow))
@@ -120,6 +120,7 @@ let empty_git () : git_info = let files : (string, file_record) Hashtbl.t = Hashtbl.create 128 let repos : (string, repo_record) Hashtbl.t = Hashtbl.create 8 let processes : (int, proc_state) Hashtbl.t = Hashtbl.create 8 +let blobs : (string, string) Hashtbl.t = Hashtbl.create 128 let ignored_paths : string list ref = ref [] let ignore_config_path : string ref = ref "" let snapshot_dir : string ref = ref "" @@ -316,26 +317,56 @@ let classify_git (input_path : string) : git_info = let copy_file (src : string) (dst : string) : unit = FileUtil.cp [ src ] dst -let trim_leading_slash (path : string) : string = - if String.starts_with ~prefix:"/" path then String.sub path 1 (String.length path - 1) else path +let read_file_bin (path : string) : string = + let ic = open_in_bin path in + Fun.protect + ~finally:(fun () -> close_in_noerr ic) + (fun () -> really_input_string ic (in_channel_length ic)) -let blob_key (state : string) (path : string) : string = state ^ ":" ^ best_effort_canonical path +let write_file_bin (path : string) (contents : string) : unit = + let oc = open_out_bin path in + Fun.protect + ~finally:(fun () -> close_out_noerr oc) + (fun () -> output_string oc contents) -let blob_path_for_key (dir : string) (key : string) : string = - match String.index_opt key ':' with - | Some index -> - let state = String.sub key 0 index in - let path = String.sub key (index + 1) (String.length key - index - 1) in - concat_path (concat_path (concat_path dir "blobs") state) (trim_leading_slash path) - | None -> concat_path (concat_path dir "blobs") key +let blob_key (state : string) (path : string) : string = state ^ ":" ^ best_effort_canonical path let store_blob (state : string) (path : string) : string = let key = blob_key state path in - let out = blob_path_for_key !snapshot_dir key in - mkdir_p (dirname out); - copy_file path out; + Hashtbl.replace blobs key (read_file_bin path); key +let blob_store_path (dir : string) : string = concat_path dir "blobs.feather" + +module Binary = struct + type t = string +end + +type blob_store_entry = + { key : string + ; content : Binary.t + } +[@@deriving arrow] + +let write_blob_store (dir : string) : unit = + if Hashtbl.length blobs > 0 then ( + let entries = Hashtbl.fold (fun key content acc -> (key, content) :: acc) blobs [] |> List.sort Stdlib.compare in + let entries = Array.of_list (List.map (fun (key, content) -> { key; content }) entries) in + let table = arrow_table_of_blob_store_entry entries in + Arrow_c_api.Table.write_feather ~compression:Arrow_c_api.Compression.Uncompressed table (blob_store_path dir)) + +let read_blob_store (dir : string) : (string, string) Hashtbl.t = + let table = Arrow_c_api.Wrapper.Feather_reader.table (blob_store_path dir) in + let entries = arrow_blob_store_entry_of_table table in + let blobs = Hashtbl.create (Array.length entries) in + Array.iter (fun { key; content } -> Hashtbl.replace blobs key content) entries; + blobs + +let read_blob (dir : string) (key : string) : string = + match Hashtbl.find_opt (read_blob_store dir) key with + | Some content -> content + | None -> failwith ("blob not found: " ^ key) + let should_capture_content (path : string) (meta : metadata) (git : git_info) : bool = if (not meta.exists) || not meta.regular then false else if owned_by_other_and_not_writable path then false @@ -608,7 +639,7 @@ let restore_snapshot (dir : string) : unit = | Some key -> mkdir_p (dirname path); let tmp = path ^ ".agent-snapshot.tmp" in - copy_file (blob_path_for_key dir key) tmp; + write_file_bin tmp (read_blob dir key); Unix.rename tmp path; (match after.mode with | Some mode -> FileUtil.chmod (`Octal (mode land 0o7777)) [ path ] @@ -620,37 +651,62 @@ let restore_snapshot (dir : string) : unit = | _ -> ()) files -let parse_snapshot_args (args : string list) : string * string list = - let rec loop output = function - | "--output" :: value :: rest -> loop (Some value) rest - | "--" :: command -> (output, command) - | _ -> failwith "usage: agent-snapshot --output SNAPDIR -- command args..." - in - match loop None args with - | Some output, (_ :: _ as command) -> (output, command) - | _ -> failwith "usage: agent-snapshot --output SNAPDIR -- command args..." - -let run_snapshot (args : string list) : int = +let run_snapshot (output : string) (command : string list) : int = load_ignore_config (); - let output, command = parse_snapshot_args args in snapshot_dir := output; remove_all output; - mkdir_p (concat_path output "blobs"); + mkdir_p output; trace_command command; finalize_records (); + write_blob_store output; write_manifest output command 0; 0 let main () : unit = try - let args = Array.to_list Sys.argv |> List.tl in + let argv = Sys.argv in let rc = - match args with - | [ "restore"; dir ] -> - restore_snapshot dir; - 0 - | "restore" :: _ -> failwith "usage: agent-snapshot restore SNAPDIR" - | _ -> run_snapshot args + if Array.length argv >= 2 && argv.(1) = "restore" then ( + if Array.length argv < 3 then failwith "usage: agent-snapshot restore SNAPDIR"; + if Array.length argv > 3 then failwith "usage: agent-snapshot restore SNAPDIR"; + restore_snapshot argv.(2); + 0) + else if Array.length argv >= 2 && argv.(1) = "blob" then ( + if Array.length argv <> 4 then failwith "usage: agent-snapshot blob SNAPDIR KEY"; + output_string stdout (read_blob argv.(2) argv.(3)); + flush stdout; + 0) + else + let output = ref "" in + let command_rev = ref [] in + let anon s = command_rev := s :: !command_rev in + let speclist = + [ + ( "--output", + Arg.Set_string output, + "SNAPDIR Write the snapshot under this directory (required)." ); + ] + in + let aligned = Arg.align speclist in + let usage = "agent-snapshot --output SNAPDIR -- command [args...]\nOptions:" in + let command = + let n = Array.length argv in + let rec ddash i = + if i >= n then None else if argv.(i) = "--" then Some i else ddash (i + 1) + in + match ddash 1 with + | Some i -> + if i + 1 >= n then failwith "usage: agent-snapshot --output SNAPDIR -- command args..."; + let opt_argv = Array.sub argv 0 i in + Arg.parse_argv ~current:(ref 0) opt_argv aligned anon usage; + Array.to_list (Array.sub argv (i + 1) (n - i - 1)) + | None -> + Arg.parse aligned anon usage; + List.rev !command_rev + in + if !output = "" || command = [] then + failwith "usage: agent-snapshot --output SNAPDIR -- command args..."; + run_snapshot !output command in Ocaml_git.shutdown (); exit rc
@@ -2,8 +2,8 @@ (name agent_snapshot) (public_name agent-snapshot) (preprocess - (pps ppx_deriving_yojson)) + (pps ppx_deriving_yojson arrow.ppx)) (foreign_stubs (language c) (names ptrace_stubs)) - (libraries unix yojson camomile ocaml-git fpath fileutils)) + (libraries unix yojson camomile ocaml-git fpath fileutils arrow.c_api arrow.ppx_runtime))
@@ -85,13 +85,20 @@ class Snapshot: return item raise AssertionError(f"{target} not present in snapshot") - def blob_path(self, key: str): + def blob_bytes(self, key: str): state, absolute_path = key.split(":", 1) + assert state in {"before", "after"} assert Path(absolute_path).is_absolute() - return self.path / "blobs" / state / absolute_path.removeprefix("/") + result = subprocess.run( + [str(BIN), "blob", str(self.path), key], + cwd=ROOT, + check=True, + capture_output=True, + ) + return result.stdout def blob_text(self, key: str): - return self.blob_path(key).read_text() + return self.blob_bytes(key).decode() def capture(tmp_path: Path, *command: str) -> Snapshot:
@@ -1 +0,0 @@ -../../../homebox/ocaml-git \ No newline at end of file