Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Key blobs by absolute path

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-03 06:14:49 -0400
Commit
e25d501e6117c96595c7f13f2d0754f8cacf84d7
README.md
index 11003b5..df37bfd 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,10 @@ A snapshot is a directory bundle:
 snapshot-dir/
   manifest.json
   blobs/
-    <content-digest>
+    before/
+      <absolute-path-without-leading-slash>
+    after/
+      <absolute-path-without-leading-slash>
 ```
 
 `manifest.json` contains:
@@ -83,9 +86,10 @@ Each file record contains:
 - `git`: Git classification for the path when applicable.
 
 Metadata records include whether the path exists, file type, mode, size, mtime,
-and optionally a `blob` digest. Blob files live under `blobs/` and are addressed
-by digest. The digest is currently an internal content-addressing key, not a
-cryptographic integrity guarantee.
+and optionally a `blob` key. Blob files live under `blobs/` and are addressed by
+state-qualified absolute path keys such as `before:/repo/input.txt` or
+`after:/repo/generated.txt`. On disk, those keys are stored under
+`blobs/before/repo/input.txt` or `blobs/after/repo/generated.txt`.
 
 Clean Git-tracked reads typically have no blob:
 
@@ -134,7 +138,7 @@ Captured file contents appear as blob references:
     "mode": 33188,
     "size": 18,
     "mtime": 1770000001,
-    "blob": "0d88229adcb64ea7"
+    "blob": "after:/repo/generated.txt"
   }
 }
 ```
src/ocaml/agent_snapshot.ml
index e66c6eb..bcb8a5c 100644
--- a/src/ocaml/agent_snapshot.ml
+++ b/src/ocaml/agent_snapshot.ml
@@ -123,7 +123,6 @@ let processes : (int, proc_state) Hashtbl.t = Hashtbl.create 8
 let ignored_paths : string list ref = ref []
 let ignore_config_path : string ref = ref ""
 let snapshot_dir : string ref = ref ""
-let blob_dir : string ref = ref ""
 let tracer_uid : int = Unix.getuid ()
 let tracer_gid : int = Unix.getgid ()
 
@@ -315,29 +314,27 @@ let classify_git (input_path : string) : git_info =
                 info)
       with Ocaml_git.Git_error _ -> info)
 
-(** Stable content key for blobs. This is not intended as a cryptographic integrity hash. *)
-let fnv1a_file_digest (path : string) : string =
-  let ic = open_in_bin path in
-  Fun.protect
-    ~finally:(fun () -> close_in_noerr ic)
-    (fun () ->
-      let hash = ref 0xcbf29ce484222325L in
-      (try
-         while true do
-           let c = input_byte ic in
-           hash := Int64.logxor !hash (Int64.of_int c);
-           hash := Int64.mul !hash 0x100000001b3L
-         done
-       with End_of_file -> ());
-      Printf.sprintf "%016Lx" !hash)
-
 let copy_file (src : string) (dst : string) : unit = FileUtil.cp [ src ] dst
 
-let store_blob (path : string) : string =
-  let digest = fnv1a_file_digest path in
-  let out = concat_path !blob_dir digest in
-  if not (FileUtil.test FileUtil.Exists out) then copy_file path out;
-  digest
+let trim_leading_slash (path : string) : string =
+  if String.starts_with ~prefix:"/" path then String.sub path 1 (String.length path - 1) else path
+
+let blob_key (state : string) (path : string) : string = state ^ ":" ^ best_effort_canonical path
+
+let blob_path_for_key (dir : string) (key : string) : string =
+  match String.index_opt key ':' with
+  | Some index ->
+      let state = String.sub key 0 index in
+      let path = String.sub key (index + 1) (String.length key - index - 1) in
+      concat_path (concat_path (concat_path dir "blobs") state) (trim_leading_slash path)
+  | None -> concat_path (concat_path dir "blobs") key
+
+let store_blob (state : string) (path : string) : string =
+  let key = blob_key state path in
+  let out = blob_path_for_key !snapshot_dir key in
+  mkdir_p (dirname out);
+  copy_file path out;
+  key
 
 let should_capture_content (path : string) (meta : metadata) (git : git_info) : bool =
   if (not meta.exists) || not meta.regular then false
@@ -365,7 +362,7 @@ let record_observation (raw_path : string) (operation : string) : unit =
         recd.before_recorded <- true;
         recd.before <- Option.value (stat_metadata path) ~default:(empty_metadata ());
         recd.before_git <- classify_git path;
-        if should_capture_content path recd.before recd.before_git then recd.before.blob <- Some (store_blob path)))
+        if should_capture_content path recd.before recd.before_git then recd.before.blob <- Some (store_blob "before" path)))
 
 (** Capture after-state once the traced process tree has exited and filesystem writes have quiesced. *)
 let finalize_records () : unit =
@@ -377,7 +374,7 @@ let finalize_records () : unit =
         recd.after_git <- classify_git recd.path;
         let written_regular = Hashtbl.mem recd.operations "write" && recd.after.exists && recd.after.regular in
         if (written_regular && not (owned_by_other_and_not_writable recd.path)) || should_capture_content recd.path recd.after recd.after_git then
-          recd.after.blob <- Some (store_blob recd.path)))
+          recd.after.blob <- Some (store_blob "after" recd.path)))
     files
 
 let manifest_metadata_of_metadata (meta : metadata) : Manifest_json.metadata =
@@ -608,16 +605,11 @@ let restore_snapshot (dir : string) : unit =
                   else (
                     match after.blob with
                     | None -> ()
-                    | Some digest ->
+                    | Some key ->
                         mkdir_p (dirname path);
-                        let same =
-                          FileUtil.test (FileUtil.And (FileUtil.Exists, FileUtil.Not FileUtil.Is_dir)) path
-                          && fnv1a_file_digest path = digest
-                        in
-                        if not same then (
-                          let tmp = path ^ ".agent-snapshot.tmp" in
-                          copy_file (concat_path (concat_path dir "blobs") digest) tmp;
-                          Unix.rename tmp path);
+                        let tmp = path ^ ".agent-snapshot.tmp" in
+                        copy_file (blob_path_for_key dir key) tmp;
+                        Unix.rename tmp path;
                         (match after.mode with
                         | Some mode -> FileUtil.chmod (`Octal (mode land 0o7777)) [ path ]
                         | None -> ());
@@ -642,9 +634,8 @@ let run_snapshot (args : string list) : int =
   load_ignore_config ();
   let output, command = parse_snapshot_args args in
   snapshot_dir := output;
-  blob_dir := concat_path output "blobs";
   remove_all output;
-  mkdir_p !blob_dir;
+  mkdir_p (concat_path output "blobs");
   trace_command command;
   finalize_records ();
   write_manifest output command 0;
tests/test_agent_snapshot.py
index 65223fb..642b05b 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -85,8 +85,13 @@ class Snapshot:
                 return item
         raise AssertionError(f"{target} not present in snapshot")
 
-    def blob_text(self, digest: str):
-        return (self.path / "blobs" / digest).read_text()
+    def blob_path(self, key: str):
+        state, absolute_path = key.split(":", 1)
+        assert Path(absolute_path).is_absolute()
+        return self.path / "blobs" / state / absolute_path.removeprefix("/")
+
+    def blob_text(self, key: str):
+        return self.blob_path(key).read_text()
 
 
 def capture(tmp_path: Path, *command: str) -> Snapshot:
@@ -198,6 +203,7 @@ def test_written_clean_git_tracked_file_gets_after_blob(tmp_path):
     assert "write" in clean["operations"]
     assert clean["git"]["tracked"] is True
     assert clean["git"]["dirty"] is False
+    assert clean["after"]["blob"] == f"after:{WORKTREE / 'clean.txt'}"
     assert snap.blob_text(clean["after"]["blob"]) == (WORKTREE / "clean.txt").read_text()
 
 
@@ -227,6 +233,7 @@ def test_dirty_untracked_created_and_deleted_files_are_captured(tmp_path):
     dirty = snap.file(WORKTREE / "dirty.txt")
     assert dirty["git"]["tracked"] is True
     assert dirty["git"]["dirty"] is True
+    assert dirty["before"]["blob"] == f"before:{WORKTREE / 'dirty.txt'}"
     assert snap.blob_text(dirty["before"]["blob"]) == "dirty tracked fixture changed before run\n"
 
     untracked = snap.file(WORKTREE / "untracked_runtime.txt")