Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Checkpoint: Feather blob store, manifest updates, remove vendored ocaml-git

Co-authored-by: Cursor <cursoragent@cursor.com>
Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-03 11:40:28 -0400
Commit
720826bfd5d43b8a608da754dfbc25856de8e3c3
README.md
index df37bfd..28b4fb1 100644
--- a/README.md
+++ b/README.md
@@ -34,10 +34,10 @@ external system environment and are not copied.
 There are important exceptions:
 
 - If the traced program writes a regular file, Agent Snapshot saves its
-  after-state blob even if the file is clean and Git-tracked when the program
-  exits.
+after-state blob even if the file is clean and Git-tracked when the program
+exits.
 - Dirty Git-tracked files, untracked files, and Git-ignored files under a Git
-  repo are copied when their contents are needed.
+repo are copied when their contents are needed.
 - Deleted files are represented with tombstones.
 - Any path inside a `.git` directory is ignored.
 - Paths explicitly listed in the ignore configuration are ignored.
@@ -60,11 +60,7 @@ A snapshot is a directory bundle:
 ```text
 snapshot-dir/
   manifest.json
-  blobs/
-    before/
-      <absolute-path-without-leading-slash>
-    after/
-      <absolute-path-without-leading-slash>
+  blobs.feather
 ```
 
 `manifest.json` contains:
@@ -80,16 +76,15 @@ Each file record contains:
 
 - `path`: absolute path.
 - `operations`: observed capabilities such as `read`, `write`, `existence`,
-  `directory`, or `delete`.
+`directory`, or `delete`.
 - `before`: state captured the first time the path was observed.
 - `after`: state captured after the traced process tree exited.
 - `git`: Git classification for the path when applicable.
 
 Metadata records include whether the path exists, file type, mode, size, mtime,
-and optionally a `blob` key. Blob files live under `blobs/` and are addressed by
-state-qualified absolute path keys such as `before:/repo/input.txt` or
-`after:/repo/generated.txt`. On disk, those keys are stored under
-`blobs/before/repo/input.txt` or `blobs/after/repo/generated.txt`.
+and optionally a `blob` key. Blob payloads live in `blobs.feather`, an Arrow
+file with `key` and binary `content` columns. Blob keys are state-qualified
+absolute paths such as `before:/repo/input.txt` or `after:/repo/generated.txt`.
 
 Clean Git-tracked reads typically have no blob:
 
@@ -175,8 +170,9 @@ The executable is written to:
 _build/default/src/ocaml/agent_snapshot.exe
 ```
 
-The project depends on OCaml, Dune, Yojson, Camomile, and the local
-`vendor/ocaml-git` checkout.
+The project depends on OCaml, Dune, Yojson, Camomile, and the `ocaml-git`
+package. Install the latter with opam (for a local checkout next to this repo,
+`opam install ../../homebox/ocaml-git` from the repository root).
 
 Create the required ignore configuration before running snapshots:
 
@@ -251,3 +247,4 @@ Run the test suite with `uv`:
 ```bash
 uv run pytest
 ```
+
dune-project
index 2b963a0..f6d74c7 100644
--- a/dune-project
+++ b/dune-project
@@ -12,4 +12,5 @@
   camomile
   ocaml-git
   fpath
-  fileutils))
+  fileutils
+  arrow))
src/ocaml/agent_snapshot.ml
index bcb8a5c..337363c 100644
--- a/src/ocaml/agent_snapshot.ml
+++ b/src/ocaml/agent_snapshot.ml
@@ -120,6 +120,7 @@ let empty_git () : git_info =
 let files : (string, file_record) Hashtbl.t = Hashtbl.create 128
 let repos : (string, repo_record) Hashtbl.t = Hashtbl.create 8
 let processes : (int, proc_state) Hashtbl.t = Hashtbl.create 8
+let blobs : (string, string) Hashtbl.t = Hashtbl.create 128
 let ignored_paths : string list ref = ref []
 let ignore_config_path : string ref = ref ""
 let snapshot_dir : string ref = ref ""
@@ -316,26 +317,56 @@ let classify_git (input_path : string) : git_info =
 
 let copy_file (src : string) (dst : string) : unit = FileUtil.cp [ src ] dst
 
-let trim_leading_slash (path : string) : string =
-  if String.starts_with ~prefix:"/" path then String.sub path 1 (String.length path - 1) else path
+let read_file_bin (path : string) : string =
+  let ic = open_in_bin path in
+  Fun.protect
+    ~finally:(fun () -> close_in_noerr ic)
+    (fun () -> really_input_string ic (in_channel_length ic))
 
-let blob_key (state : string) (path : string) : string = state ^ ":" ^ best_effort_canonical path
+let write_file_bin (path : string) (contents : string) : unit =
+  let oc = open_out_bin path in
+  Fun.protect
+    ~finally:(fun () -> close_out_noerr oc)
+    (fun () -> output_string oc contents)
 
-let blob_path_for_key (dir : string) (key : string) : string =
-  match String.index_opt key ':' with
-  | Some index ->
-      let state = String.sub key 0 index in
-      let path = String.sub key (index + 1) (String.length key - index - 1) in
-      concat_path (concat_path (concat_path dir "blobs") state) (trim_leading_slash path)
-  | None -> concat_path (concat_path dir "blobs") key
+let blob_key (state : string) (path : string) : string = state ^ ":" ^ best_effort_canonical path
 
 let store_blob (state : string) (path : string) : string =
   let key = blob_key state path in
-  let out = blob_path_for_key !snapshot_dir key in
-  mkdir_p (dirname out);
-  copy_file path out;
+  Hashtbl.replace blobs key (read_file_bin path);
   key
 
+let blob_store_path (dir : string) : string = concat_path dir "blobs.feather"
+
+module Binary = struct
+  type t = string
+end
+
+type blob_store_entry =
+  { key : string
+  ; content : Binary.t
+  }
+[@@deriving arrow]
+
+let write_blob_store (dir : string) : unit =
+  if Hashtbl.length blobs > 0 then (
+    let entries = Hashtbl.fold (fun key content acc -> (key, content) :: acc) blobs [] |> List.sort Stdlib.compare in
+    let entries = Array.of_list (List.map (fun (key, content) -> { key; content }) entries) in
+    let table = arrow_table_of_blob_store_entry entries in
+    Arrow_c_api.Table.write_feather ~compression:Arrow_c_api.Compression.Uncompressed table (blob_store_path dir))
+
+let read_blob_store (dir : string) : (string, string) Hashtbl.t =
+  let table = Arrow_c_api.Wrapper.Feather_reader.table (blob_store_path dir) in
+  let entries = arrow_blob_store_entry_of_table table in
+  let blobs = Hashtbl.create (Array.length entries) in
+  Array.iter (fun { key; content } -> Hashtbl.replace blobs key content) entries;
+  blobs
+
+let read_blob (dir : string) (key : string) : string =
+  match Hashtbl.find_opt (read_blob_store dir) key with
+  | Some content -> content
+  | None -> failwith ("blob not found: " ^ key)
+
 let should_capture_content (path : string) (meta : metadata) (git : git_info) : bool =
   if (not meta.exists) || not meta.regular then false
   else if owned_by_other_and_not_writable path then false
@@ -608,7 +639,7 @@ let restore_snapshot (dir : string) : unit =
                     | Some key ->
                         mkdir_p (dirname path);
                         let tmp = path ^ ".agent-snapshot.tmp" in
-                        copy_file (blob_path_for_key dir key) tmp;
+                        write_file_bin tmp (read_blob dir key);
                         Unix.rename tmp path;
                         (match after.mode with
                         | Some mode -> FileUtil.chmod (`Octal (mode land 0o7777)) [ path ]
@@ -620,37 +651,62 @@ let restore_snapshot (dir : string) : unit =
       | _ -> ())
     files
 
-let parse_snapshot_args (args : string list) : string * string list =
-  let rec loop output = function
-    | "--output" :: value :: rest -> loop (Some value) rest
-    | "--" :: command -> (output, command)
-    | _ -> failwith "usage: agent-snapshot --output SNAPDIR -- command args..."
-  in
-  match loop None args with
-  | Some output, (_ :: _ as command) -> (output, command)
-  | _ -> failwith "usage: agent-snapshot --output SNAPDIR -- command args..."
-
-let run_snapshot (args : string list) : int =
+let run_snapshot (output : string) (command : string list) : int =
   load_ignore_config ();
-  let output, command = parse_snapshot_args args in
   snapshot_dir := output;
   remove_all output;
-  mkdir_p (concat_path output "blobs");
+  mkdir_p output;
   trace_command command;
   finalize_records ();
+  write_blob_store output;
   write_manifest output command 0;
   0
 
 let main () : unit =
   try
-    let args = Array.to_list Sys.argv |> List.tl in
+    let argv = Sys.argv in
     let rc =
-      match args with
-      | [ "restore"; dir ] ->
-          restore_snapshot dir;
-          0
-      | "restore" :: _ -> failwith "usage: agent-snapshot restore SNAPDIR"
-      | _ -> run_snapshot args
+      if Array.length argv >= 2 && argv.(1) = "restore" then (
+        if Array.length argv < 3 then failwith "usage: agent-snapshot restore SNAPDIR";
+        if Array.length argv > 3 then failwith "usage: agent-snapshot restore SNAPDIR";
+        restore_snapshot argv.(2);
+        0)
+      else if Array.length argv >= 2 && argv.(1) = "blob" then (
+        if Array.length argv <> 4 then failwith "usage: agent-snapshot blob SNAPDIR KEY";
+        output_string stdout (read_blob argv.(2) argv.(3));
+        flush stdout;
+        0)
+      else
+        let output = ref "" in
+        let command_rev = ref [] in
+        let anon s = command_rev := s :: !command_rev in
+        let speclist =
+          [
+            ( "--output",
+              Arg.Set_string output,
+              "SNAPDIR  Write the snapshot under this directory (required)." );
+          ]
+        in
+        let aligned = Arg.align speclist in
+        let usage = "agent-snapshot --output SNAPDIR -- command [args...]\nOptions:" in
+        let command =
+          let n = Array.length argv in
+          let rec ddash i =
+            if i >= n then None else if argv.(i) = "--" then Some i else ddash (i + 1)
+          in
+          match ddash 1 with
+          | Some i ->
+              if i + 1 >= n then failwith "usage: agent-snapshot --output SNAPDIR -- command args...";
+              let opt_argv = Array.sub argv 0 i in
+              Arg.parse_argv ~current:(ref 0) opt_argv aligned anon usage;
+              Array.to_list (Array.sub argv (i + 1) (n - i - 1))
+          | None ->
+              Arg.parse aligned anon usage;
+              List.rev !command_rev
+        in
+        if !output = "" || command = [] then
+          failwith "usage: agent-snapshot --output SNAPDIR -- command args...";
+        run_snapshot !output command
     in
     Ocaml_git.shutdown ();
     exit rc
src/ocaml/dune
index 0bf84bc..0ddbecc 100644
--- a/src/ocaml/dune
+++ b/src/ocaml/dune
@@ -2,8 +2,8 @@
  (name agent_snapshot)
  (public_name agent-snapshot)
  (preprocess
-  (pps ppx_deriving_yojson))
+  (pps ppx_deriving_yojson arrow.ppx))
  (foreign_stubs
   (language c)
   (names ptrace_stubs))
- (libraries unix yojson camomile ocaml-git fpath fileutils))
+ (libraries unix yojson camomile ocaml-git fpath fileutils arrow.c_api arrow.ppx_runtime))
tests/test_agent_snapshot.py
index 642b05b..f00e169 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -85,13 +85,20 @@ class Snapshot:
                 return item
         raise AssertionError(f"{target} not present in snapshot")
 
-    def blob_path(self, key: str):
+    def blob_bytes(self, key: str):
         state, absolute_path = key.split(":", 1)
+        assert state in {"before", "after"}
         assert Path(absolute_path).is_absolute()
-        return self.path / "blobs" / state / absolute_path.removeprefix("/")
+        result = subprocess.run(
+            [str(BIN), "blob", str(self.path), key],
+            cwd=ROOT,
+            check=True,
+            capture_output=True,
+        )
+        return result.stdout
 
     def blob_text(self, key: str):
-        return self.blob_path(key).read_text()
+        return self.blob_bytes(key).decode()
 
 
 def capture(tmp_path: Path, *command: str) -> Snapshot:
vendor/ocaml-git
deleted file mode 120000
index a9469f5..0000000
--- a/vendor/ocaml-git
+++ /dev/null
@@ -1 +0,0 @@
-../../../homebox/ocaml-git
\ No newline at end of file