Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Omit transient created deleted files

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-03 19:26:57 -0400
Commit
cecddc81ce4f4a9c62a41cb3f0b45171c779dab5
src/ocaml/agent_snapshot.ml
index 5dcd500..0aca768 100644
--- a/src/ocaml/agent_snapshot.ml
+++ b/src/ocaml/agent_snapshot.ml
@@ -581,6 +581,20 @@ let manifest_git_of_git_info (git : git_info) : Manifest_json.git =
 let manifest_repo_of_repo_record (repo : repo_record) : Manifest_json.repo =
   { root = utf8_string repo.root; head = utf8_string repo.head; dirty = repo.dirty }
 
+let operation_was_recorded (recd : file_record) (operation : string) : bool =
+  Hashtbl.mem recd.operations operation
+
+let record_has_mutation (recd : file_record) : bool =
+  operation_was_recorded recd "write" || operation_was_recorded recd "delete"
+
+let record_is_transient_mutation (recd : file_record) : bool =
+  (not recd.before.exists) && (not recd.after.exists) && record_has_mutation recd
+
+let record_should_be_manifested (recd : file_record) : bool =
+  (not (metadata_is_special recd.before))
+  && (not (metadata_is_special recd.after))
+  && not (record_is_transient_mutation recd)
+
 let write_manifest (out : string) (command : string list) (exit_status : int) : unit =
   let git_repositories =
     Hashtbl.fold (fun _ (repo : repo_record) acc -> manifest_repo_of_repo_record repo :: acc) repos []
@@ -589,7 +603,7 @@ let write_manifest (out : string) (command : string list) (exit_status : int) : 
   let manifest_files =
     Hashtbl.fold
       (fun _ recd acc ->
-        if metadata_is_special recd.before || metadata_is_special recd.after then acc
+        if not (record_should_be_manifested recd) then acc
         else
           let operations =
             Hashtbl.fold (fun op () acc -> op :: acc) recd.operations [] |> List.sort String.compare |> List.map utf8_string
@@ -620,19 +634,15 @@ let write_manifest (out : string) (command : string list) (exit_status : int) : 
   in
   Json.to_file ~std:true (concat_path out "manifest.json") (Manifest_json.to_yojson manifest)
 
-let operation_was_recorded (recd : file_record) (operation : string) : bool =
-  Hashtbl.mem recd.operations operation
-
 let print_snapshot_summary () : unit =
   let updated_files = ref 0 in
   let uncommitted_read_files = ref 0 in
   Hashtbl.iter
     (fun _ recd ->
-      if
-        (operation_was_recorded recd "write" || operation_was_recorded recd "delete")
-        && (Option.is_some recd.after.blob || recd.after.tombstone)
+      if record_should_be_manifested recd && record_has_mutation recd && (Option.is_some recd.after.blob || recd.after.tombstone)
       then incr updated_files;
-      if operation_was_recorded recd "read" && Option.is_some recd.before.blob then incr uncommitted_read_files)
+      if record_should_be_manifested recd && operation_was_recorded recd "read" && Option.is_some recd.before.blob then
+        incr uncommitted_read_files)
     files;
   Printf.eprintf
     "Snapshot directory: %s\nWorked in %d repositories. Saved %d updated files. Saved %d read files in the snapshot that were not committed.\n%!"
test_programs/create_read_delete_transient.py
new file mode 100644
index 0000000..5f286cb
--- /dev/null
+++ b/test_programs/create_read_delete_transient.py
@@ -0,0 +1,9 @@
+import os
+from pathlib import Path
+
+testdata = Path(os.environ["AGENT_SNAPSHOT_TEST_REPO"])
+transient = testdata / "transient_runtime.txt"
+
+transient.write_text("temporary payload\n")
+assert transient.read_text() == "temporary payload\n"
+transient.unlink()
tests/test_agent_snapshot.py
index 14718cc..f4ead13 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -303,6 +303,18 @@ def test_dirty_untracked_created_and_deleted_files_are_captured(tmp_path):
     assert deleted["after"]["tombstone"] is True
 
 
+def test_created_then_deleted_file_is_not_manifested_or_blobbed(tmp_path):
+    transient = WORKTREE / "transient_runtime.txt"
+
+    snap = capture(tmp_path, PYTHON, "test_programs/create_read_delete_transient.py")
+    manifest_paths = {item["path"] for item in snap.manifest["files"]}
+    blob_keys = set(snap._blobs_frame()["key"])
+
+    assert not transient.exists()
+    assert str(transient.resolve()) not in manifest_paths
+    assert not any(str(transient.resolve()) in key for key in blob_keys)
+
+
 def test_fork_usr_and_directory_traversal(tmp_path):
     # ptrace must follow the process tree, not just the initial pid. The helper
     # forks and writes from the child; missing that write means fork/clone events