Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Exclude external nonwritable files

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-03 20:09:04 -0400
Commit
b62e797d8587fcd372c903f08e4ffe2a5097fab6
README.md
index a873fba..443b028 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ At a high level, the snapshot includes:
 The snapshot is intentionally compact. Clean Git-tracked files that are only read
 are represented by Git metadata instead of copied into the snapshot. Files owned
 by another user and not writable by the current user are treated as part of the
-external system environment and are not copied.
+external system environment and are not recorded.
 
 There are important exceptions:
 
src/ocaml/agent_snapshot.ml
index 0aca768..7fc33a7 100644
--- a/src/ocaml/agent_snapshot.ml
+++ b/src/ocaml/agent_snapshot.ml
@@ -34,6 +34,8 @@ type file_record = {
   mutable after : metadata;
   mutable before_git : git_info;
   mutable after_git : git_info;
+  mutable before_external_system : bool;
+  mutable after_external_system : bool;
   mutable before_recorded : bool;
 }
 
@@ -511,13 +513,28 @@ let should_capture_content (path : string) (meta : metadata) (git : git_info) : 
 let record_observation (raw_path : string) (operation : string) : unit =
   if raw_path <> "" then
     let path = best_effort_canonical raw_path in
-    if (not (is_ignored_path path)) && (not (path_has_non_directory_prefix path)) && not (path_is_special_file path) then (
+    if
+      (not (is_ignored_path path))
+      && (not (path_has_non_directory_prefix path))
+      && (not (path_is_special_file path))
+      && not (owned_by_other_and_not_writable path)
+    then (
       let recd =
         match Hashtbl.find_opt files path with
         | Some recd -> recd
         | None ->
             let recd =
-              { path; operations = Hashtbl.create 5; before = empty_metadata (); after = empty_metadata (); before_git = empty_git (); after_git = empty_git (); before_recorded = false }
+              {
+                path;
+                operations = Hashtbl.create 5;
+                before = empty_metadata ();
+                after = empty_metadata ();
+                before_git = empty_git ();
+                after_git = empty_git ();
+                before_external_system = false;
+                after_external_system = false;
+                before_recorded = false;
+              }
             in
             Hashtbl.add files path recd;
             recd
@@ -526,6 +543,7 @@ let record_observation (raw_path : string) (operation : string) : unit =
       if not recd.before_recorded then (
         recd.before_recorded <- true;
         recd.before <- Option.value (stat_metadata path) ~default:(empty_metadata ());
+        recd.before_external_system <- recd.before.exists && owned_by_other_and_not_writable path;
         recd.before_git <- classify_git path;
         if should_capture_content path recd.before recd.before_git then recd.before.blob <- store_blob "before" path recd.before))
 
@@ -536,6 +554,7 @@ let finalize_records () : unit =
       if not (is_ignored_path recd.path) then (
         recd.after <- Option.value (stat_metadata recd.path) ~default:(empty_metadata ());
         if not recd.after.exists then recd.after.tombstone <- Hashtbl.mem recd.operations "delete";
+        recd.after_external_system <- recd.after.exists && owned_by_other_and_not_writable recd.path;
         recd.after_git <- classify_git recd.path;
         let written_regular = Hashtbl.mem recd.operations "write" && recd.after.exists && recd.after.regular in
         if (written_regular && not (owned_by_other_and_not_writable recd.path)) || should_capture_content recd.path recd.after recd.after_git then
@@ -593,6 +612,8 @@ let record_is_transient_mutation (recd : file_record) : bool =
 let record_should_be_manifested (recd : file_record) : bool =
   (not (metadata_is_special recd.before))
   && (not (metadata_is_special recd.after))
+  && (not recd.before_external_system)
+  && (not recd.after_external_system)
   && not (record_is_transient_mutation recd)
 
 let write_manifest (out : string) (command : string list) (exit_status : int) : unit =
tests/test_agent_snapshot.py
index f4ead13..7f9bd23 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -319,18 +319,16 @@ def test_fork_usr_and_directory_traversal(tmp_path):
     # ptrace must follow the process tree, not just the initial pid. The helper
     # forks and writes from the child; missing that write means fork/clone events
     # are not being attached early enough. The same helper reads /usr/bin/env to
-    # assert that root-owned, non-writable system files are observed but not
-    # blobbed, and iterates testdata to verify directory traversal is recorded.
+    # assert that root-owned, non-writable system files are excluded, and iterates
+    # testdata to verify directory traversal is recorded.
     snap = capture(tmp_path, PYTHON, "test_programs/fork_and_usr.py")
 
     child = snap.file(WORKTREE / "child_output.txt")
     assert "write" in child["operations"]
     assert snap.blob_text(child["after"]["blob"]) == "child final\n"
 
-    usr_env = snap.file(Path("/usr/bin/env"))
-    assert "read" in usr_env["operations"]
-    assert usr_env["before"].get("blob") is None
-    assert usr_env["after"].get("blob") is None
+    manifest_paths = {item["path"] for item in snap.manifest["files"]}
+    assert str(Path("/usr/bin/env").resolve()) not in manifest_paths
 
     directory = snap.file(WORKTREE)
     assert "directory" in directory["operations"]