Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Ignore /proc, /dev, and special files

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-03 18:00:03 -0400
Commit
a0dcc74beebf89d4e5bd96943b98c528cafc5fb6
README.md
index b306e2f..a873fba 100644
--- a/README.md
+++ b/README.md
@@ -239,6 +239,8 @@ The file is a JSON list of file or directory paths:
   "$HOME/.cursor",
   "$XDG_CONFIG_HOME/agent-snapshot/ignore.json",
   "/tmp/scratch-output",
+  "/proc",
+  "/dev",
   "/usr",
   "/bin"
 ]
src/ocaml/agent_snapshot.ml
index fd7ffe8..44492db 100644
--- a/src/ocaml/agent_snapshot.ml
+++ b/src/ocaml/agent_snapshot.ml
@@ -165,11 +165,15 @@ let path_is_at_or_under (path : string) (root : string) : bool =
 
 let is_git_internal_path (path : string) : bool = List.exists (( = ) ".git") (split_path path)
 
+let always_ignored_paths : string list = [ "/proc"; "/dev" ]
+
 let is_ignored_path (raw_path : string) : bool =
   if raw_path = "" then false
   else
     let path = best_effort_canonical raw_path in
-    is_git_internal_path path || List.exists (fun ignored -> path_is_at_or_under path ignored) !ignored_paths
+    is_git_internal_path path
+    || List.exists (fun ignored -> path_is_at_or_under path ignored) always_ignored_paths
+    || List.exists (fun ignored -> path_is_at_or_under path ignored) !ignored_paths
 
 let home_dir () : string =
   match Sys.getenv_opt "HOME" with
@@ -201,6 +205,8 @@ let default_ignore_file_entries () : ignore_file_entries =
     "$HOME/.cursor";
     "$XDG_CONFIG_HOME/agent-snapshot/ignore.json";
     "/tmp/scratch-output";
+    "/proc";
+    "/dev";
     "/usr";
     "/bin";
   ]
@@ -244,6 +250,24 @@ let mode_of_kind (kind : Unix.file_kind) : int =
   | Unix.S_FIFO -> 0o010000
   | Unix.S_SOCK -> 0o140000
 
+let kind_is_special (kind : Unix.file_kind) : bool =
+  match kind with
+  | Unix.S_CHR | Unix.S_BLK | Unix.S_FIFO | Unix.S_SOCK -> true
+  | Unix.S_REG | Unix.S_DIR | Unix.S_LNK -> false
+
+let metadata_is_special (meta : metadata) : bool =
+  if not meta.exists then false
+  else
+    match meta.mode land 0o170000 with
+    | 0o020000 | 0o060000 | 0o010000 | 0o140000 -> true
+    | _ -> false
+
+let path_is_special_file (path : string) : bool =
+  try
+    let st = Unix.LargeFile.lstat path in
+    kind_is_special st.st_kind
+  with Unix.Unix_error _ -> false
+
 let stat_metadata (path : string) : metadata option =
   try
     let st = Unix.LargeFile.lstat path in
@@ -367,11 +391,13 @@ let classify_git (input_path : string) : git_info =
 
 let copy_file (src : string) (dst : string) : unit = FileUtil.cp [ src ] dst
 
-let read_file_bin (path : string) : string =
+let read_file_bin (path : string) : string option =
   let ic = open_in_bin path in
   Fun.protect
     ~finally:(fun () -> close_in_noerr ic)
-    (fun () -> really_input_string ic (in_channel_length ic))
+    (fun () ->
+      try Some (really_input_string ic (in_channel_length ic))
+      with Sys_error _ | Invalid_argument _ | End_of_file -> None)
 
 let write_file_bin (path : string) (contents : string) : unit =
   let oc = open_out_bin path in
@@ -451,15 +477,19 @@ let with_blob_row_group_writer (f : unit -> 'a) : 'a =
 
 let blob_key (state : string) (path : string) : string = state ^ ":" ^ best_effort_canonical path
 
-let store_blob (state : string) (path : string) : string =
+let store_blob (state : string) (path : string) (meta : metadata) : string option =
   ensure_blob_dir ();
   let key = blob_key state path in
-  let content = read_file_bin path in
-  blob_batch_keys_rev := key :: !blob_batch_keys_rev;
-  blob_batch_contents_rev := content :: !blob_batch_contents_rev;
-  incr blob_batch_count;
-  if !blob_batch_count >= blob_batch_max then flush_blob_batch ();
-  key
+  match read_file_bin path with
+  | None ->
+      meta.size <- -1L;
+      None
+  | Some content ->
+      blob_batch_keys_rev := key :: !blob_batch_keys_rev;
+      blob_batch_contents_rev := content :: !blob_batch_contents_rev;
+      incr blob_batch_count;
+      if !blob_batch_count >= blob_batch_max then flush_blob_batch ();
+      Some key
 
 (** Restore reads the blob store into a map. This runs once per restore, not during capture. *)
 let load_blob_store (dir : string) : (string, string) Hashtbl.t =
@@ -481,7 +511,7 @@ let should_capture_content (path : string) (meta : metadata) (git : git_info) : 
 let record_observation (raw_path : string) (operation : string) : unit =
   if raw_path <> "" then
     let path = best_effort_canonical raw_path in
-    if (not (is_ignored_path path)) && not (path_has_non_directory_prefix path) then (
+    if (not (is_ignored_path path)) && (not (path_has_non_directory_prefix path)) && not (path_is_special_file path) then (
       let recd =
         match Hashtbl.find_opt files path with
         | Some recd -> recd
@@ -497,7 +527,7 @@ let record_observation (raw_path : string) (operation : string) : unit =
         recd.before_recorded <- true;
         recd.before <- Option.value (stat_metadata path) ~default:(empty_metadata ());
         recd.before_git <- classify_git path;
-        if should_capture_content path recd.before recd.before_git then recd.before.blob <- Some (store_blob "before" path)))
+        if should_capture_content path recd.before recd.before_git then recd.before.blob <- store_blob "before" path recd.before))
 
 (** Capture after-state once the traced process tree has exited and filesystem writes have quiesced. *)
 let finalize_records () : unit =
@@ -509,7 +539,7 @@ let finalize_records () : unit =
         recd.after_git <- classify_git recd.path;
         let written_regular = Hashtbl.mem recd.operations "write" && recd.after.exists && recd.after.regular in
         if (written_regular && not (owned_by_other_and_not_writable recd.path)) || should_capture_content recd.path recd.after recd.after_git then
-          recd.after.blob <- Some (store_blob "after" recd.path)))
+          recd.after.blob <- store_blob "after" recd.path recd.after))
     files
 
 let manifest_metadata_of_metadata (meta : metadata) : Manifest_json.metadata =
@@ -559,19 +589,21 @@ let write_manifest (out : string) (command : string list) (exit_status : int) : 
   let manifest_files =
     Hashtbl.fold
       (fun _ recd acc ->
-        let operations =
-          Hashtbl.fold (fun op () acc -> op :: acc) recd.operations [] |> List.sort String.compare |> List.map utf8_string
-        in
-        let git = if recd.after_git.in_repo then recd.after_git else recd.before_git in
-        ({
-           path = utf8_string recd.path;
-           operations;
-           before = manifest_metadata_of_metadata recd.before;
-           after = manifest_metadata_of_metadata recd.after;
-           git = manifest_git_of_git_info git;
-         }
-          : Manifest_json.file_entry)
-        :: acc)
+        if metadata_is_special recd.before || metadata_is_special recd.after then acc
+        else
+          let operations =
+            Hashtbl.fold (fun op () acc -> op :: acc) recd.operations [] |> List.sort String.compare |> List.map utf8_string
+          in
+          let git = if recd.after_git.in_repo then recd.after_git else recd.before_git in
+          ({
+             path = utf8_string recd.path;
+             operations;
+             before = manifest_metadata_of_metadata recd.before;
+             after = manifest_metadata_of_metadata recd.after;
+             git = manifest_git_of_git_info git;
+           }
+            : Manifest_json.file_entry)
+          :: acc)
       files []
     |> List.sort Stdlib.compare
   in
test_programs/read_proc_dev_special.py
new file mode 100644
index 0000000..4f57d0b
--- /dev/null
+++ b/test_programs/read_proc_dev_special.py
@@ -0,0 +1,10 @@
+import os
+from pathlib import Path
+
+testdata = Path(os.environ["AGENT_SNAPSHOT_TEST_REPO"])
+
+Path("/proc/self/status").read_text()
+Path("/dev/null").read_bytes()
+
+fd = os.open(testdata / "runtime.fifo", os.O_RDONLY | os.O_NONBLOCK)
+os.close(fd)
tests/test_agent_snapshot.py
index 9e599c0..af92e7e 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -140,6 +140,8 @@ def test_missing_ignore_config_creates_defaults(tmp_path, ignore_config):
         "$HOME/.cursor",
         "$XDG_CONFIG_HOME/agent-snapshot/ignore.json",
         "/tmp/scratch-output",
+        "/proc",
+        "/dev",
         "/usr",
         "/bin",
     ]
@@ -226,6 +228,18 @@ def test_git_internal_directory_writes_are_ignored(tmp_path):
     assert str((WORKTREE / ".git" / "delete_me").resolve()) not in manifest_paths
 
 
+def test_proc_dev_and_special_files_are_ignored(tmp_path):
+    fifo = WORKTREE / "runtime.fifo"
+    os.mkfifo(fifo)
+
+    snap = capture(tmp_path, PYTHON, "test_programs/read_proc_dev_special.py")
+    manifest_paths = {item["path"] for item in snap.manifest["files"]}
+
+    assert not any(path.startswith("/proc/") for path in manifest_paths)
+    assert not any(path.startswith("/dev/") for path in manifest_paths)
+    assert str(fifo.resolve()) not in manifest_paths
+
+
 def test_non_directory_path_component_does_not_crash(tmp_path):
     snap = capture(tmp_path, PYTHON, "test_programs/read_non_directory_component.py")
     impossible_path = (WORKTREE / "not_directory" / "2851767" / "ns").resolve(strict=False)