Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Document tracer design rationale

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-02 07:29:13 -0400
Commit
ffda7d444d3bca1b030aa5f9727d757f9e71680e
src/main.cpp
index fe3d2c0..1dc1d10 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -35,10 +35,24 @@ namespace fs = std::filesystem;
 
 namespace {
 
+// ptrace reports many kinds of stops through the same waitpid interface. These
+// options are what make the tracer useful for this project:
+//
+// - PTRACE_O_TRACESYSGOOD marks syscall stops as SIGTRAP|0x80, so we can tell
+//   "the process is entering/leaving a syscall" apart from ordinary SIGTRAPs.
+// - PTRACE_O_TRACE{FORK,VFORK,CLONE} asks the kernel to stop the forking tracee
+//   and auto-attach to the new child, so we can apply the same syscall tracing
+//   policy to the child before it runs far enough to hide file accesses from us.
+// - EXEC/EXIT are not deeply interpreted yet, but enabling them keeps the event
+//   stream explicit and leaves room for recording process lifecycle later.
 constexpr int kPtraceOptions = PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK |
                                PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE |
                                PTRACE_O_TRACEEXEC | PTRACE_O_TRACEEXIT;
 
+// The manifest stores before/after facts because reconstruction is about final
+// state, but deciding whether to blob a file often depends on what was present
+// before the traced program touched it. For example, a created file must have an
+// explicit "before did not exist" fact, while a deleted file needs a tombstone.
 struct Metadata {
   bool exists = false;
   bool tombstone = false;
@@ -50,6 +64,9 @@ struct Metadata {
   std::string blob;
 };
 
+// Git facts are the main compactness mechanism. A clean tracked path can be
+// reconstructed from repository root + HEAD + relative path, so storing its
+// content would duplicate information already represented by Git.
 struct GitInfo {
   bool in_repo = false;
   bool tracked = false;
@@ -60,6 +77,8 @@ struct GitInfo {
   std::string relative_path;
 };
 
+// A file is recorded once and accumulates operations. This keeps the manifest
+// compact when a program stats, opens, reads, and later deletes the same path.
 struct FileRecord {
   std::string path;
   std::set<std::string> operations;
@@ -70,6 +89,9 @@ struct FileRecord {
   bool before_recorded = false;
 };
 
+// ptrace gives us register state separately at syscall entry and syscall exit.
+// Path pointers are only meaningful at entry, while return values and new file
+// descriptors only exist at exit, so we cache the decoded entry-side state here.
 struct PendingSyscall {
   long nr = -1;
   std::array<unsigned long long, 6> args{};
@@ -80,6 +102,9 @@ struct PendingSyscall {
   int flags = 0;
 };
 
+// ptrace tracks tasks, not "programs". Each traced pid can have a different cwd
+// and fd table after chdir/open/dup/close, and relative path reconstruction is
+// only as good as this per-process state.
 struct ProcState {
   bool in_syscall = false;
   fs::path cwd;
@@ -126,6 +151,10 @@ fs::path best_effort_canonical(const fs::path& path) {
   return fs::absolute(path, ec).lexically_normal();
 }
 
+// Syscall arguments live in the tracee's address space. PTRACE_PEEKDATA reads a
+// machine word at a time from that process; for path arguments we walk forward
+// until the NUL terminator. This is intentionally bounded so a bad userspace
+// pointer or non-terminated buffer cannot make the tracer loop forever.
 std::string read_tracee_string(pid_t pid, unsigned long long address) {
   if (address == 0) return {};
   std::string out;
@@ -167,6 +196,11 @@ bool writable_by_current_user(const Metadata& meta) {
   return (mode & S_IWUSR) || (mode & S_IWGRP) || (mode & S_IWOTH);
 }
 
+// This is the "system environment" escape hatch. The goal is not to snapshot
+// /usr wholesale just because the dynamic loader, Python, or libc looked there.
+// access(W_OK) is used instead of checking raw mode bits because groups, ACLs,
+// and effective permission rules matter more than whether an owner write bit is
+// present for some other uid.
 bool owned_by_other_and_not_writable(const fs::path& path) {
   struct stat st {};
   if (lstat(path.c_str(), &st) != 0) return false;
@@ -175,6 +209,10 @@ bool owned_by_other_and_not_writable(const fs::path& path) {
   return true;
 }
 
+// Git can classify paths that do not exist at finalization time if we start from
+// the nearest existing parent. That matters for tombstones: after unlink, the
+// path itself is gone, but the repository root and relative path are still
+// recoverable from its parent directory.
 fs::path existing_anchor(fs::path path) {
   std::error_code ec;
   if (fs::exists(path, ec)) return path;
@@ -206,6 +244,9 @@ GitInfo classify_git(const fs::path& input_path) {
   if (anchor.empty()) anchor = existing_anchor(input_path.parent_path());
   if (anchor.empty()) return info;
 
+  // git_repository_open_ext walks upward from the anchor to find any containing
+  // repository. This is deliberate: the traced program might access files inside
+  // a nested repo, a dependency checkout, or a temp repo unrelated to this tool.
   git_repository* repo = nullptr;
   if (git_repository_open_ext(&repo, anchor.c_str(), 0, nullptr) != 0) return info;
 
@@ -223,6 +264,10 @@ GitInfo classify_git(const fs::path& input_path) {
   fs::path rel = fs::relative(input_path, info.root, ec);
   if (!ec) info.relative_path = rel.string();
 
+  // libgit2's status bits encode both index and worktree state. For compactness
+  // we only skip blobs for GIT_STATUS_CURRENT tracked files. New, ignored, or
+  // dirty tracked files are not reconstructable from HEAD alone, so they are
+  // treated as content that belongs in the snapshot.
   unsigned int status = 0;
   if (!info.relative_path.empty() &&
       git_status_file(&status, repo, info.relative_path.c_str()) == 0) {
@@ -243,6 +288,10 @@ GitInfo classify_git(const fs::path& input_path) {
   return info;
 }
 
+// This digest is a content-addressing key for blobs, not a security boundary.
+// FNV-1a is small and deterministic, which is enough for current tests and for
+// avoiding duplicate blob files. A production snapshot format should replace it
+// with SHA-256 or BLAKE3 before relying on it for collision resistance.
 std::string fnv1a_file_digest(const fs::path& path) {
   std::ifstream in(path, std::ios::binary);
   uint64_t hash = 1469598103934665603ULL;
@@ -266,6 +315,8 @@ std::string store_blob(const fs::path& path) {
 }
 
 bool should_capture_content(const fs::path& path, const Metadata& meta, const GitInfo& git) {
+  // Directories, devices, sockets, etc. are represented as metadata and
+  // observations. Only regular files get blobbed in v1.
   if (!meta.exists || !meta.regular) return false;
   if (owned_by_other_and_not_writable(path)) return false;
   if (git.in_repo && git.tracked && !git.dirty) return false;
@@ -308,6 +359,9 @@ void record_observation(const fs::path& raw_path, const std::string& operation) 
   rec.path = key;
   rec.operations.insert(operation);
   if (!rec.before_recorded) {
+    // Capture "before" on first observation, not at process exit. This is the
+    // only chance to distinguish "the program created this path" from "the path
+    // existed before and was later opened for write".
     rec.before_recorded = true;
     rec.before = stat_metadata(path).value_or(Metadata{});
     rec.before_git = classify_git(path);
@@ -319,6 +373,9 @@ void record_observation(const fs::path& raw_path, const std::string& operation) 
 
 void finalize_records() {
   for (auto& [_, rec] : files) {
+    // The after pass is intentionally outside ptrace. Once the traced process
+    // tree has exited, the filesystem has quiesced from our point of view, so
+    // final content can be copied without racing the writer we launched.
     fs::path path(rec.path);
     rec.after = stat_metadata(path).value_or(Metadata{});
     if (!rec.after.exists) rec.after.tombstone = rec.operations.count("delete") > 0;
@@ -330,6 +387,9 @@ void finalize_records() {
 }
 
 fs::path resolve_path(const ProcState& proc, int dirfd, const std::string& path) {
+  // The *at syscalls interpret relative paths against either cwd or a directory
+  // fd. That is why the tracer maintains fd->path mappings; without them,
+  // openat(dirfd, "file") would be impossible to place in the manifest.
   fs::path p(path);
   if (p.is_absolute()) return p;
   fs::path base = proc.cwd;
@@ -351,17 +411,25 @@ bool is_read_open(int flags) {
 }
 
 void refresh_proc_fd(pid_t pid, ProcState& proc, int fd) {
+  // /proc/<pid>/fd/N is the kernel's own view of where an fd points. Reading
+  // this symlink after a successful open is more reliable than trying to model
+  // every mount namespace or symlink resolution rule ourselves.
   std::string target = readlink_string("/proc/" + std::to_string(pid) + "/fd/" + std::to_string(fd));
   if (!target.empty() && target[0] == '/') proc.fds[fd] = best_effort_canonical(target);
 }
 
 void refresh_proc_cwd(pid_t pid, ProcState& proc) {
+  // chdir/fchdir affect future relative path arguments. /proc gives us the
+  // post-syscall cwd after the kernel has accepted the directory change.
   std::string target = readlink_string("/proc/" + std::to_string(pid) + "/cwd");
   if (!target.empty()) proc.cwd = best_effort_canonical(target);
 }
 
 void handle_syscall_entry(pid_t pid, ProcState& proc, const user_regs_struct& regs) {
   PendingSyscall p;
+  // Linux x86_64 syscall ABI: syscall number in orig_rax, arguments in
+  // rdi/rsi/rdx/r10/r8/r9, result in rax on exit. This file is explicitly v1
+  // x86_64-only; another architecture needs a different register decoder.
   p.nr = static_cast<long>(regs.orig_rax);
   p.args = {regs.rdi, regs.rsi, regs.rdx, regs.r10, regs.r8, regs.r9};
 
@@ -369,6 +437,8 @@ void handle_syscall_entry(pid_t pid, ProcState& proc, const user_regs_struct& re
     case SYS_open:
       p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string();
       p.flags = static_cast<int>(p.args[1]);
+      // For writes, record before-state at syscall entry. If O_CREAT succeeds,
+      // waiting until exit would make a newly-created file look preexisting.
       if (is_write_open(p.flags)) record_observation(p.path_a, "write");
       break;
     case SYS_openat:
@@ -403,6 +473,8 @@ void handle_syscall_entry(pid_t pid, ProcState& proc, const user_regs_struct& re
     case SYS_unlink:
     case SYS_rmdir:
       p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string();
+      // Deletions have the same timing issue as creations: after syscall exit
+      // the content may be gone, so the before snapshot must happen here.
       record_observation(p.path_a, "delete");
       break;
     case SYS_unlinkat:
@@ -419,6 +491,8 @@ void handle_syscall_entry(pid_t pid, ProcState& proc, const user_regs_struct& re
     case SYS_rename:
       p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string();
       p.path_b = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[1])).string();
+      // A rename is modeled as source deletion plus destination write. That is
+      // enough for reconstruction even though it loses the atomic-move history.
       record_observation(p.path_a, "delete");
       record_observation(p.path_b, "write");
       break;
@@ -453,6 +527,9 @@ void handle_syscall_entry(pid_t pid, ProcState& proc, const user_regs_struct& re
 
 void handle_syscall_exit(pid_t pid, ProcState& proc, long result) {
   const PendingSyscall& p = proc.pending;
+  // On Linux, syscall failures are returned as negative errno values in rax.
+  // We still record failed path probes as "existence" observations because the
+  // program learned something about that pathname.
   bool ok = result >= 0;
 
   switch (p.nr) {
@@ -485,6 +562,9 @@ void handle_syscall_exit(pid_t pid, ProcState& proc, long result) {
       break;
     case SYS_getdents:
     case SYS_getdents64:
+      // getdents returns directory entries for an already-open fd. The entries
+      // themselves are not decoded yet; v1 records the fact that the directory
+      // was traversed, which is the important observation boundary for now.
       if (ok && p.fd >= 0 && proc.fds.count(p.fd)) record_observation(proc.fds[p.fd], "directory");
       break;
     case SYS_unlink:
@@ -518,6 +598,9 @@ void handle_syscall_exit(pid_t pid, ProcState& proc, long result) {
       if (ok) proc.fds.erase(p.fd);
       break;
     case SYS_dup:
+      // Duplication makes multiple numeric fds refer to the same open file
+      // description. Mirroring that relationship preserves later ftruncate or
+      // fd-relative directory operations through the duplicate.
       if (ok && proc.fds.count(p.fd)) proc.fds[static_cast<int>(result)] = proc.fds[p.fd];
       break;
     case SYS_dup2:
@@ -540,6 +623,9 @@ void trace_command(const std::vector<std::string>& command) {
   pid_t child = fork();
   if (child < 0) throw std::runtime_error(errno_message("fork failed"));
   if (child == 0) {
+    // PTRACE_TRACEME makes the parent our tracer immediately. The explicit
+    // SIGSTOP is a synchronization point: the parent sets ptrace options before
+    // the child reaches execvp and starts making filesystem-related syscalls.
     if (ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) != 0) _exit(127);
     raise(SIGSTOP);
     std::vector<char*> argv;
@@ -556,9 +642,15 @@ void trace_command(const std::vector<std::string>& command) {
   }
   processes[child].cwd = fs::current_path();
   refresh_proc_cwd(child, processes[child]);
+  // PTRACE_SYSCALL resumes the child and asks the kernel to stop it twice per
+  // syscall: once before execution and once after, which is the basis for the
+  // PendingSyscall entry/exit split above.
   ptrace(PTRACE_SYSCALL, child, nullptr, nullptr);
 
   while (!processes.empty()) {
+    // __WALL is needed with ptrace so waitpid observes all traced tasks,
+    // including clone-created threads that would otherwise not behave like
+    // normal children from the wait API's point of view.
     pid_t pid = waitpid(-1, &status, __WALL);
     if (pid < 0) {
       if (errno == EINTR) continue;
@@ -585,6 +677,9 @@ void trace_command(const std::vector<std::string>& command) {
         event == PTRACE_EVENT_CLONE) {
       unsigned long new_pid = 0;
       ptrace(PTRACE_GETEVENTMSG, pid, nullptr, &new_pid);
+      // A newly forked process inherits cwd and fd table at fork time. Copying
+      // the parent's ProcState matches that until either side mutates state;
+      // CLONE_FILES/CLONE_FS threads share it instead, which is not modeled.
       processes[static_cast<pid_t>(new_pid)] = it->second;
       ptrace(PTRACE_SETOPTIONS, static_cast<pid_t>(new_pid), nullptr, kPtraceOptions);
       ptrace(PTRACE_SYSCALL, static_cast<pid_t>(new_pid), nullptr, nullptr);
@@ -606,6 +701,9 @@ void trace_command(const std::vector<std::string>& command) {
       }
       ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr);
     } else {
+      // Non-syscall stops are real signals or ptrace events. Plain SIGTRAP is
+      // consumed by the tracer; other signals are reinjected so tracing changes
+      // process behavior as little as practical.
       int deliver = (sig == SIGTRAP) ? 0 : sig;
       ptrace(PTRACE_SYSCALL, pid, nullptr, reinterpret_cast<void*>(static_cast<long>(deliver)));
     }
@@ -639,6 +737,9 @@ void write_manifest(const fs::path& out, const std::vector<std::string>& command
         {"operations", ops},
         {"before", metadata_json(rec.before)},
         {"after", metadata_json(rec.after)},
+        // Prefer after_git when available so files created during the run are
+        // classified in their final repository context. Fall back to before_git
+        // for deleted paths whose final filesystem anchor may no longer exist.
         {"git", git_json(rec.after_git.in_repo ? rec.after_git : rec.before_git)},
     });
   }
@@ -657,6 +758,8 @@ void restore_snapshot(const fs::path& dir) {
     const json& after = item.at("after");
     if (!after.value("exists", false)) {
       if (after.value("tombstone", false)) {
+        // A tombstone represents final non-existence. Missing is already the
+        // desired final state, so remove errors are intentionally non-fatal.
         std::error_code ec;
         fs::remove(path, ec);
       }
@@ -664,8 +767,13 @@ void restore_snapshot(const fs::path& dir) {
     }
     if (!after.contains("blob")) continue;
 
+    // Restore only blobbed files. Clean Git-tracked files and reconstructable
+    // system files are manifest references, not payloads owned by this bundle.
     fs::create_directories(path.parent_path());
     const std::string expected_blob = after.at("blob").get<std::string>();
+    // Avoid rewriting identical files. This matters for observed executables
+    // that may be mapped or busy while restore runs, and it also makes restore
+    // idempotent for normal captured files.
     if (!(fs::exists(path) && fs::is_regular_file(path) &&
           fnv1a_file_digest(path) == expected_blob)) {
       fs::path tmp = path;
@@ -709,6 +817,9 @@ int run_snapshot(const std::vector<std::string>& args) {
 
   snapshot_dir = output;
   blob_dir = snapshot_dir / "blobs";
+  // v1 treats --output as an owned bundle directory. Removing it up front avoids
+  // stale blobs or manifest entries from a previous run being mistaken for the
+  // current trace.
   fs::remove_all(snapshot_dir);
   fs::create_directories(blob_dir);
 
tests/test_agent_snapshot.py
index c8a32f8..fcdb58c 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -11,6 +11,11 @@ ROOT = Path(__file__).resolve().parents[1]
 BUILD = ROOT / "build" / "pytest"
 BIN = BUILD / "agent-snapshot"
 TESTDATA = ROOT / "testdata"
+# Use the system Python rather than uv's managed interpreter. The snapshotter
+# intentionally observes interpreter and loader activity too, and a uv-managed
+# Python in the user's home directory can be writable by the current user. That
+# would force blobs for the interpreter itself and make the tests about uv's
+# environment instead of Agent Snapshot's file classification rules.
 PYTHON = "/usr/bin/python3"
 
 
@@ -20,6 +25,9 @@ def run(cmd, **kwargs):
 
 @pytest.fixture(scope="session", autouse=True)
 def build_agent_snapshot():
+    # The tests exercise the real CLI binary instead of calling internal helper
+    # functions. That keeps the acceptance criteria aligned with ptrace behavior,
+    # process launch, CMake wiring, and manifest writing as users will run them.
     run(["cmake", "-S", ".", "-B", str(BUILD)])
     run(["cmake", "--build", str(BUILD), "--parallel"])
     assert BIN.exists()
@@ -27,6 +35,9 @@ def build_agent_snapshot():
 
 @pytest.fixture(autouse=True)
 def pristine_testdata():
+    # The snapshot policy depends on Git's clean/dirty/untracked distinctions.
+    # Each test is allowed to dirty or create files under testdata, then this
+    # fixture restores the committed baseline so later tests see known Git state.
     run(["git", "checkout", "--", "testdata"])
     run(["git", "clean", "-fd", "--", "testdata"])
     yield
@@ -35,6 +46,9 @@ def pristine_testdata():
 
 
 class Snapshot:
+    # Small manifest assertion helper. Tests should read like snapshot behavior,
+    # not like repeated JSON tree walking, because the important contract is
+    # "this path was observed and classified this way".
     def __init__(self, path: Path):
         self.path = path
         self.manifest = json.loads((path / "manifest.json").read_text())
@@ -51,12 +65,17 @@ class Snapshot:
 
 
 def capture(tmp_path: Path, *command: str) -> Snapshot:
+    # Every test gets a fresh bundle directory so stale blobs cannot mask capture
+    # bugs. The command is passed after -- to exercise the intended CLI parsing.
     out = tmp_path / "snapshot"
     run([str(BIN), "--output", str(out), "--", *command])
     return Snapshot(out)
 
 
 def test_clean_git_tracked_read_records_repo_without_blob(tmp_path):
+    # A clean tracked file is the primary compactness case. The program really
+    # reads testdata/clean.txt, but the snapshot should rely on Git repository
+    # root + HEAD + relative path instead of copying file contents into blobs.
     snap = capture(tmp_path, PYTHON, "test_programs/read_clean.py")
     clean = snap.file(TESTDATA / "clean.txt")
 
@@ -69,6 +88,11 @@ def test_clean_git_tracked_read_records_repo_without_blob(tmp_path):
 
 
 def test_dirty_untracked_created_and_deleted_files_are_captured(tmp_path):
+    # This test covers the cases where Git metadata is not enough:
+    # - dirty tracked files differ from HEAD, so their content must be blobbed
+    # - untracked files have no commit object to reconstruct from
+    # - created files need before=false and after content
+    # - deleted files need a tombstone so restore can reproduce non-existence
     (TESTDATA / "dirty.txt").write_text("dirty tracked fixture changed before run\n")
     (TESTDATA / "untracked_runtime.txt").write_text("untracked input\n")
     (TESTDATA / "deleted_by_program.txt").write_text("delete me\n")
@@ -96,6 +120,11 @@ def test_dirty_untracked_created_and_deleted_files_are_captured(tmp_path):
 
 
 def test_fork_usr_and_directory_traversal(tmp_path):
+    # ptrace must follow the process tree, not just the initial pid. The helper
+    # forks and writes from the child; missing that write means fork/clone events
+    # are not being attached early enough. The same helper reads /usr/bin/env to
+    # assert that root-owned, non-writable system files are observed but not
+    # blobbed, and iterates testdata to verify directory traversal is recorded.
     snap = capture(tmp_path, PYTHON, "test_programs/fork_and_usr.py")
 
     child = snap.file(TESTDATA / "child_output.txt")
@@ -112,6 +141,10 @@ def test_fork_usr_and_directory_traversal(tmp_path):
 
 
 def test_restore_applies_final_state(tmp_path):
+    # Restore is intentionally tested from a damaged filesystem state rather than
+    # immediately after capture. That proves the bundle contains enough payload
+    # to recreate final captured files and enough tombstone information to remove
+    # files that should not exist after the traced command.
     (TESTDATA / "dirty.txt").write_text("changed before capture\n")
     (TESTDATA / "untracked_runtime.txt").write_text("untracked input\n")
     (TESTDATA / "deleted_by_program.txt").write_text("delete me\n")