Repositories / agent-snapshot.git
agent-snapshot.git
Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git
@@ -61,7 +61,10 @@ A snapshot is a directory bundle: snapshot-dir/ manifest.json blobs/ - <content-digest> + before/ + <absolute-path-without-leading-slash> + after/ + <absolute-path-without-leading-slash> ``` `manifest.json` contains: @@ -83,9 +86,10 @@ Each file record contains: - `git`: Git classification for the path when applicable. Metadata records include whether the path exists, file type, mode, size, mtime, -and optionally a `blob` digest. Blob files live under `blobs/` and are addressed -by digest. The digest is currently an internal content-addressing key, not a -cryptographic integrity guarantee. +and optionally a `blob` key. Blob files live under `blobs/` and are addressed by +state-qualified absolute path keys such as `before:/repo/input.txt` or +`after:/repo/generated.txt`. On disk, those keys are stored under +`blobs/before/repo/input.txt` or `blobs/after/repo/generated.txt`. Clean Git-tracked reads typically have no blob: @@ -134,7 +138,7 @@ Captured file contents appear as blob references: "mode": 33188, "size": 18, "mtime": 1770000001, - "blob": "0d88229adcb64ea7" + "blob": "after:/repo/generated.txt" } } ```
@@ -123,7 +123,6 @@ let processes : (int, proc_state) Hashtbl.t = Hashtbl.create 8 let ignored_paths : string list ref = ref [] let ignore_config_path : string ref = ref "" let snapshot_dir : string ref = ref "" -let blob_dir : string ref = ref "" let tracer_uid : int = Unix.getuid () let tracer_gid : int = Unix.getgid () @@ -315,29 +314,27 @@ let classify_git (input_path : string) : git_info = info) with Ocaml_git.Git_error _ -> info) -(** Stable content key for blobs. This is not intended as a cryptographic integrity hash. *) -let fnv1a_file_digest (path : string) : string = - let ic = open_in_bin path in - Fun.protect - ~finally:(fun () -> close_in_noerr ic) - (fun () -> - let hash = ref 0xcbf29ce484222325L in - (try - while true do - let c = input_byte ic in - hash := Int64.logxor !hash (Int64.of_int c); - hash := Int64.mul !hash 0x100000001b3L - done - with End_of_file -> ()); - Printf.sprintf "%016Lx" !hash) - let copy_file (src : string) (dst : string) : unit = FileUtil.cp [ src ] dst -let store_blob (path : string) : string = - let digest = fnv1a_file_digest path in - let out = concat_path !blob_dir digest in - if not (FileUtil.test FileUtil.Exists out) then copy_file path out; - digest +let trim_leading_slash (path : string) : string = + if String.starts_with ~prefix:"/" path then String.sub path 1 (String.length path - 1) else path + +let blob_key (state : string) (path : string) : string = state ^ ":" ^ best_effort_canonical path + +let blob_path_for_key (dir : string) (key : string) : string = + match String.index_opt key ':' with + | Some index -> + let state = String.sub key 0 index in + let path = String.sub key (index + 1) (String.length key - index - 1) in + concat_path (concat_path (concat_path dir "blobs") state) (trim_leading_slash path) + | None -> concat_path (concat_path dir "blobs") key + +let store_blob (state : string) (path : string) : string = + let key = blob_key state path in + let out = blob_path_for_key !snapshot_dir key in + mkdir_p (dirname out); + copy_file path out; + key let should_capture_content (path : string) (meta : metadata) (git : git_info) : bool = if (not meta.exists) || not meta.regular then false @@ -365,7 +362,7 @@ let record_observation (raw_path : string) (operation : string) : unit = recd.before_recorded <- true; recd.before <- Option.value (stat_metadata path) ~default:(empty_metadata ()); recd.before_git <- classify_git path; - if should_capture_content path recd.before recd.before_git then recd.before.blob <- Some (store_blob path))) + if should_capture_content path recd.before recd.before_git then recd.before.blob <- Some (store_blob "before" path))) (** Capture after-state once the traced process tree has exited and filesystem writes have quiesced. *) let finalize_records () : unit = @@ -377,7 +374,7 @@ let finalize_records () : unit = recd.after_git <- classify_git recd.path; let written_regular = Hashtbl.mem recd.operations "write" && recd.after.exists && recd.after.regular in if (written_regular && not (owned_by_other_and_not_writable recd.path)) || should_capture_content recd.path recd.after recd.after_git then - recd.after.blob <- Some (store_blob recd.path))) + recd.after.blob <- Some (store_blob "after" recd.path))) files let manifest_metadata_of_metadata (meta : metadata) : Manifest_json.metadata = @@ -608,16 +605,11 @@ let restore_snapshot (dir : string) : unit = else ( match after.blob with | None -> () - | Some digest -> + | Some key -> mkdir_p (dirname path); - let same = - FileUtil.test (FileUtil.And (FileUtil.Exists, FileUtil.Not FileUtil.Is_dir)) path - && fnv1a_file_digest path = digest - in - if not same then ( - let tmp = path ^ ".agent-snapshot.tmp" in - copy_file (concat_path (concat_path dir "blobs") digest) tmp; - Unix.rename tmp path); + let tmp = path ^ ".agent-snapshot.tmp" in + copy_file (blob_path_for_key dir key) tmp; + Unix.rename tmp path; (match after.mode with | Some mode -> FileUtil.chmod (`Octal (mode land 0o7777)) [ path ] | None -> ()); @@ -642,9 +634,8 @@ let run_snapshot (args : string list) : int = load_ignore_config (); let output, command = parse_snapshot_args args in snapshot_dir := output; - blob_dir := concat_path output "blobs"; remove_all output; - mkdir_p !blob_dir; + mkdir_p (concat_path output "blobs"); trace_command command; finalize_records (); write_manifest output command 0;
@@ -85,8 +85,13 @@ class Snapshot: return item raise AssertionError(f"{target} not present in snapshot") - def blob_text(self, digest: str): - return (self.path / "blobs" / digest).read_text() + def blob_path(self, key: str): + state, absolute_path = key.split(":", 1) + assert Path(absolute_path).is_absolute() + return self.path / "blobs" / state / absolute_path.removeprefix("/") + + def blob_text(self, key: str): + return self.blob_path(key).read_text() def capture(tmp_path: Path, *command: str) -> Snapshot: @@ -198,6 +203,7 @@ def test_written_clean_git_tracked_file_gets_after_blob(tmp_path): assert "write" in clean["operations"] assert clean["git"]["tracked"] is True assert clean["git"]["dirty"] is False + assert clean["after"]["blob"] == f"after:{WORKTREE / 'clean.txt'}" assert snap.blob_text(clean["after"]["blob"]) == (WORKTREE / "clean.txt").read_text() @@ -227,6 +233,7 @@ def test_dirty_untracked_created_and_deleted_files_are_captured(tmp_path): dirty = snap.file(WORKTREE / "dirty.txt") assert dirty["git"]["tracked"] is True assert dirty["git"]["dirty"] is True + assert dirty["before"]["blob"] == f"before:{WORKTREE / 'dirty.txt'}" assert snap.blob_text(dirty["before"]["blob"]) == "dirty tracked fixture changed before run\n" untracked = snap.file(WORKTREE / "untracked_runtime.txt")