Repositories / agent-snapshot.git
agent-snapshot.git
Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git
@@ -239,6 +239,8 @@ The file is a JSON list of file or directory paths: "$HOME/.cursor", "$XDG_CONFIG_HOME/agent-snapshot/ignore.json", "/tmp/scratch-output", + "/proc", + "/dev", "/usr", "/bin" ]
@@ -165,11 +165,15 @@ let path_is_at_or_under (path : string) (root : string) : bool = let is_git_internal_path (path : string) : bool = List.exists (( = ) ".git") (split_path path) +let always_ignored_paths : string list = [ "/proc"; "/dev" ] + let is_ignored_path (raw_path : string) : bool = if raw_path = "" then false else let path = best_effort_canonical raw_path in - is_git_internal_path path || List.exists (fun ignored -> path_is_at_or_under path ignored) !ignored_paths + is_git_internal_path path + || List.exists (fun ignored -> path_is_at_or_under path ignored) always_ignored_paths + || List.exists (fun ignored -> path_is_at_or_under path ignored) !ignored_paths let home_dir () : string = match Sys.getenv_opt "HOME" with @@ -201,6 +205,8 @@ let default_ignore_file_entries () : ignore_file_entries = "$HOME/.cursor"; "$XDG_CONFIG_HOME/agent-snapshot/ignore.json"; "/tmp/scratch-output"; + "/proc"; + "/dev"; "/usr"; "/bin"; ] @@ -244,6 +250,24 @@ let mode_of_kind (kind : Unix.file_kind) : int = | Unix.S_FIFO -> 0o010000 | Unix.S_SOCK -> 0o140000 +let kind_is_special (kind : Unix.file_kind) : bool = + match kind with + | Unix.S_CHR | Unix.S_BLK | Unix.S_FIFO | Unix.S_SOCK -> true + | Unix.S_REG | Unix.S_DIR | Unix.S_LNK -> false + +let metadata_is_special (meta : metadata) : bool = + if not meta.exists then false + else + match meta.mode land 0o170000 with + | 0o020000 | 0o060000 | 0o010000 | 0o140000 -> true + | _ -> false + +let path_is_special_file (path : string) : bool = + try + let st = Unix.LargeFile.lstat path in + kind_is_special st.st_kind + with Unix.Unix_error _ -> false + let stat_metadata (path : string) : metadata option = try let st = Unix.LargeFile.lstat path in @@ -367,11 +391,13 @@ let classify_git (input_path : string) : git_info = let copy_file (src : string) (dst : string) : unit = FileUtil.cp [ src ] dst -let read_file_bin (path : string) : string = +let read_file_bin (path : string) : string option = let ic = open_in_bin path in Fun.protect ~finally:(fun () -> close_in_noerr ic) - (fun () -> really_input_string ic (in_channel_length ic)) + (fun () -> + try Some (really_input_string ic (in_channel_length ic)) + with Sys_error _ | Invalid_argument _ | End_of_file -> None) let write_file_bin (path : string) (contents : string) : unit = let oc = open_out_bin path in @@ -451,15 +477,19 @@ let with_blob_row_group_writer (f : unit -> 'a) : 'a = let blob_key (state : string) (path : string) : string = state ^ ":" ^ best_effort_canonical path -let store_blob (state : string) (path : string) : string = +let store_blob (state : string) (path : string) (meta : metadata) : string option = ensure_blob_dir (); let key = blob_key state path in - let content = read_file_bin path in - blob_batch_keys_rev := key :: !blob_batch_keys_rev; - blob_batch_contents_rev := content :: !blob_batch_contents_rev; - incr blob_batch_count; - if !blob_batch_count >= blob_batch_max then flush_blob_batch (); - key + match read_file_bin path with + | None -> + meta.size <- -1L; + None + | Some content -> + blob_batch_keys_rev := key :: !blob_batch_keys_rev; + blob_batch_contents_rev := content :: !blob_batch_contents_rev; + incr blob_batch_count; + if !blob_batch_count >= blob_batch_max then flush_blob_batch (); + Some key (** Restore reads the blob store into a map. This runs once per restore, not during capture. *) let load_blob_store (dir : string) : (string, string) Hashtbl.t = @@ -481,7 +511,7 @@ let should_capture_content (path : string) (meta : metadata) (git : git_info) : let record_observation (raw_path : string) (operation : string) : unit = if raw_path <> "" then let path = best_effort_canonical raw_path in - if (not (is_ignored_path path)) && not (path_has_non_directory_prefix path) then ( + if (not (is_ignored_path path)) && (not (path_has_non_directory_prefix path)) && not (path_is_special_file path) then ( let recd = match Hashtbl.find_opt files path with | Some recd -> recd @@ -497,7 +527,7 @@ let record_observation (raw_path : string) (operation : string) : unit = recd.before_recorded <- true; recd.before <- Option.value (stat_metadata path) ~default:(empty_metadata ()); recd.before_git <- classify_git path; - if should_capture_content path recd.before recd.before_git then recd.before.blob <- Some (store_blob "before" path))) + if should_capture_content path recd.before recd.before_git then recd.before.blob <- store_blob "before" path recd.before)) (** Capture after-state once the traced process tree has exited and filesystem writes have quiesced. *) let finalize_records () : unit = @@ -509,7 +539,7 @@ let finalize_records () : unit = recd.after_git <- classify_git recd.path; let written_regular = Hashtbl.mem recd.operations "write" && recd.after.exists && recd.after.regular in if (written_regular && not (owned_by_other_and_not_writable recd.path)) || should_capture_content recd.path recd.after recd.after_git then - recd.after.blob <- Some (store_blob "after" recd.path))) + recd.after.blob <- store_blob "after" recd.path recd.after)) files let manifest_metadata_of_metadata (meta : metadata) : Manifest_json.metadata = @@ -559,19 +589,21 @@ let write_manifest (out : string) (command : string list) (exit_status : int) : let manifest_files = Hashtbl.fold (fun _ recd acc -> - let operations = - Hashtbl.fold (fun op () acc -> op :: acc) recd.operations [] |> List.sort String.compare |> List.map utf8_string - in - let git = if recd.after_git.in_repo then recd.after_git else recd.before_git in - ({ - path = utf8_string recd.path; - operations; - before = manifest_metadata_of_metadata recd.before; - after = manifest_metadata_of_metadata recd.after; - git = manifest_git_of_git_info git; - } - : Manifest_json.file_entry) - :: acc) + if metadata_is_special recd.before || metadata_is_special recd.after then acc + else + let operations = + Hashtbl.fold (fun op () acc -> op :: acc) recd.operations [] |> List.sort String.compare |> List.map utf8_string + in + let git = if recd.after_git.in_repo then recd.after_git else recd.before_git in + ({ + path = utf8_string recd.path; + operations; + before = manifest_metadata_of_metadata recd.before; + after = manifest_metadata_of_metadata recd.after; + git = manifest_git_of_git_info git; + } + : Manifest_json.file_entry) + :: acc) files [] |> List.sort Stdlib.compare in
@@ -0,0 +1,10 @@ +import os +from pathlib import Path + +testdata = Path(os.environ["AGENT_SNAPSHOT_TEST_REPO"]) + +Path("/proc/self/status").read_text() +Path("/dev/null").read_bytes() + +fd = os.open(testdata / "runtime.fifo", os.O_RDONLY | os.O_NONBLOCK) +os.close(fd)
@@ -140,6 +140,8 @@ def test_missing_ignore_config_creates_defaults(tmp_path, ignore_config): "$HOME/.cursor", "$XDG_CONFIG_HOME/agent-snapshot/ignore.json", "/tmp/scratch-output", + "/proc", + "/dev", "/usr", "/bin", ] @@ -226,6 +228,18 @@ def test_git_internal_directory_writes_are_ignored(tmp_path): assert str((WORKTREE / ".git" / "delete_me").resolve()) not in manifest_paths +def test_proc_dev_and_special_files_are_ignored(tmp_path): + fifo = WORKTREE / "runtime.fifo" + os.mkfifo(fifo) + + snap = capture(tmp_path, PYTHON, "test_programs/read_proc_dev_special.py") + manifest_paths = {item["path"] for item in snap.manifest["files"]} + + assert not any(path.startswith("/proc/") for path in manifest_paths) + assert not any(path.startswith("/dev/") for path in manifest_paths) + assert str(fifo.resolve()) not in manifest_paths + + def test_non_directory_path_component_does_not_crash(tmp_path): snap = capture(tmp_path, PYTHON, "test_programs/read_non_directory_component.py") impossible_path = (WORKTREE / "not_directory" / "2851767" / "ns").resolve(strict=False)