Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Initial Agent Snapshot project

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-02 07:03:34 -0400
Commit
53c46391483675518a0911413346522d16b3fac3
.gitignore
new file mode 100644
index 0000000..b5fc08a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+build/
+.pytest_cache/
+.venv/
+__pycache__/
+*.pyc
+snapshot-*/
CMakeLists.txt
new file mode 100644
index 0000000..89796e6
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Build script for the agent-snapshot command-line tool.
+cmake_minimum_required(VERSION 3.28)
+project(agent_snapshot VERSION 0.1.0 LANGUAGES CXX)
+
+include(FetchContent)
+
+# Require ISO C++20 and turn compiler-specific extensions off.
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+# libgit2 is located through pkg-config and exposed as PkgConfig::LIBGIT2.
+find_package(PkgConfig REQUIRED)
+pkg_check_modules(LIBGIT2 REQUIRED IMPORTED_TARGET libgit2)
+
+# nlohmann/json 3.11.3 is fetched at configure time; URL_HASH pins the archive.
+FetchContent_Declare(
+  nlohmann_json
+  URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
+  URL_HASH SHA256=d6c65aca6b1ed68e7a182f4757257b107ae403032760ed6ef121c9d55e81757d
+)
+FetchContent_MakeAvailable(nlohmann_json)
+
+# The whole tool is a single translation unit.
+add_executable(agent-snapshot
+  src/main.cpp
+)
+
+target_include_directories(agent-snapshot PRIVATE include)
+target_link_libraries(agent-snapshot PRIVATE PkgConfig::LIBGIT2 nlohmann_json::nlohmann_json)
+target_compile_options(agent-snapshot PRIVATE -Wall -Wextra -Wpedantic)
+
pyproject.toml
new file mode 100644
index 0000000..70f16df
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,14 @@
+[project]
+name = "agent-snapshot"
+version = "0.1.0"
+requires-python = ">=3.12"
+dependencies = []
+
+[dependency-groups]
+dev = [
+  "pytest>=8.0",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = "-ra"
src/main.cpp
new file mode 100644
index 0000000..1472a70
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,728 @@
+#include <nlohmann/json.hpp>
+#include <git2.h>
+
+#include <sys/ptrace.h>
+#include <sys/reg.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+
+#include <fcntl.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <array>
+#include <cerrno>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <ctime>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <optional>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+using json = nlohmann::json;
+namespace fs = std::filesystem;
+
+namespace {
+
+// Options passed to PTRACE_SETOPTIONS for every tracee: mark syscall stops
+// with SIGTRAP|0x80 (TRACESYSGOOD) and request event stops for fork, vfork,
+// clone, exec and exit.
+constexpr int kPtraceOptions = PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK |
+                               PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE |
+                               PTRACE_O_TRACEEXEC | PTRACE_O_TRACEEXIT;
+
+// lstat()-derived description of one path at one point in time.
+// A default-constructed value (exists == false) stands for "path not present".
+struct Metadata {
+  bool exists = false;     // lstat succeeded
+  bool tombstone = false;  // set by finalize_records: missing afterwards and a "delete" was observed
+  bool regular = false;    // S_ISREG
+  bool directory = false;  // S_ISDIR
+  mode_t mode = 0;         // full st_mode, file-type bits included
+  uintmax_t size = 0;      // st_size
+  std::time_t mtime = 0;   // st_mtim.tv_sec (whole seconds only)
+  std::string blob;        // name of the stored content blob; empty when no content was captured
+};
+
+// Git classification of a path as computed by classify_git().
+struct GitInfo {
+  bool in_repo = false;       // a repository with a working directory was found for the path
+  bool tracked = false;       // status is neither WT_NEW nor IGNORED
+  bool dirty = false;         // status differs from GIT_STATUS_CURRENT, or the status lookup failed
+  bool ignored = false;       // GIT_STATUS_IGNORED is set
+  std::string root;           // canonicalised working-directory root
+  std::string head;           // hex object id of HEAD; empty when it cannot be resolved
+  std::string relative_path;  // path relative to root; empty when it could not be computed
+};
+
+// Everything recorded about one observed path; stored in `files` keyed by path.
+struct FileRecord {
+  std::string path;                  // canonicalised absolute path (same as the map key)
+  std::set<std::string> operations;  // operation labels seen, e.g. "read", "write", "delete"
+  Metadata before;                   // state captured on the first observation
+  Metadata after;                    // state captured by finalize_records()
+  GitInfo before_git;                // git classification at first observation
+  GitInfo after_git;                 // git classification at finalisation
+  bool before_recorded = false;      // guards the one-time capture of `before`
+};
+
+// Arguments decoded at syscall entry and kept until the matching exit stop.
+struct PendingSyscall {
+  long nr = -1;                                 // syscall number (orig_rax)
+  std::array<unsigned long long, 6> args{};     // raw rdi, rsi, rdx, r10, r8, r9
+  std::string path_a;                           // first resolved path argument, if any
+  std::string path_b;                           // second resolved path argument (rename family)
+  int dirfd = AT_FDCWD;                         // directory fd for *at() calls
+  int fd = -1;                                  // fd argument for fd-based calls
+  int flags = 0;                                // open flags for the open family
+};
+
+// Per-tracee bookkeeping maintained by the trace loop.
+struct ProcState {
+  bool in_syscall = false;      // true between a syscall-entry stop and its exit stop
+  fs::path cwd;                 // last known working directory (read from /proc/<pid>/cwd)
+  std::map<int, fs::path> fds;  // fd -> path for fds whose /proc link is an absolute path
+  PendingSyscall pending;       // syscall decoded at the most recent entry stop
+};
+
+// Summary of one git repository touched during the run (written to the manifest).
+struct RepoRecord {
+  std::string root;    // canonicalised working-directory root
+  std::string head;    // HEAD object id as last seen by classify_git()
+  bool dirty = false;  // sticky: true once any classified path in the repo was dirty
+};
+
+// File-local tracer state shared by the helpers below.
+std::unordered_map<pid_t, ProcState> processes;  // live tracees by pid
+std::map<std::string, FileRecord> files;         // observed paths by canonical path
+std::map<std::string, RepoRecord> repos;         // touched repositories by root
+fs::path snapshot_dir;                           // output directory chosen via --output
+fs::path blob_dir;                               // snapshot_dir / "blobs"
+uid_t tracer_uid = 0;                            // set from getuid() in main()
+gid_t tracer_gid = 0;                            // set from getgid() in main()
+
+/// Formats "<prefix>: <strerror(errno)>" using the errno value current at the call.
+std::string errno_message(const std::string& prefix) {
+  const char* const reason = std::strerror(errno);
+  std::string message(prefix);
+  message.append(": ");
+  message.append(reason);
+  return message;
+}
+
+/// Returns the target of the symbolic link at `path`, or an empty string when
+/// readlink() fails. Targets longer than 4095 bytes are truncated.
+std::string readlink_string(const fs::path& path) {
+  constexpr size_t kCapacity = 4096;
+  std::vector<char> target(kCapacity);
+  const ssize_t length = readlink(path.c_str(), target.data(), kCapacity - 1);
+  if (length < 0) {
+    return {};
+  }
+  // readlink() does not NUL-terminate; do it before treating the buffer as a C string.
+  target[static_cast<size_t>(length)] = '\0';
+  return std::string(target.data());
+}
+
+/// Makes `path` absolute against `base` (when it is relative) and normalises it
+/// purely lexically; the filesystem is never consulted.
+fs::path lexical_abs(const fs::path& path, const fs::path& base) {
+  const fs::path joined = path.is_absolute() ? path : base / path;
+  return joined.lexically_normal();
+}
+
+/// Canonicalises `path` as far as possible without throwing.
+/// Preference order: weakly_canonical(), then the path itself if already
+/// absolute, then absolute(); every result is lexically normalised.
+fs::path best_effort_canonical(const fs::path& path) {
+  std::error_code ec;
+  const fs::path resolved = fs::weakly_canonical(path, ec);
+  const bool usable = !ec && !resolved.empty();
+  if (usable) {
+    return resolved.lexically_normal();
+  }
+  if (!path.is_absolute()) {
+    return fs::absolute(path, ec).lexically_normal();
+  }
+  return path.lexically_normal();
+}
+
+/// Copies a NUL-terminated string out of a stopped tracee's address space.
+///
+/// @param pid      tracee to read from (must be in a ptrace stop)
+/// @param address  tracee virtual address of the string; 0 yields ""
+/// @return the bytes up to (not including) the first NUL. If a PEEKDATA fails,
+///         or no NUL is found within 64 KiB, whatever was read so far is returned.
+std::string read_tracee_string(pid_t pid, unsigned long long address) {
+  if (address == 0) return {};
+  constexpr size_t kMaxBytes = 65536;
+  std::string out;
+  for (size_t offset = 0; offset < kMaxBytes; offset += sizeof(long)) {
+    // PEEKDATA returns the word itself, so errors are only visible through errno.
+    errno = 0;
+    const long word = ptrace(PTRACE_PEEKDATA, pid, address + offset, nullptr);
+    if (errno != 0) break;
+    // memcpy instead of a union: reading an inactive union member is undefined in C++.
+    char bytes[sizeof(long)];
+    std::memcpy(bytes, &word, sizeof(word));
+    for (char c : bytes) {
+      if (c == '\0') return out;
+      out.push_back(c);
+    }
+  }
+  return out;
+}
+
+/// Describes `path` via lstat() (symlinks are not followed).
+/// @return the populated Metadata, or std::nullopt when lstat() fails.
+std::optional<Metadata> stat_metadata(const fs::path& path) {
+  struct stat info {};
+  const int rc = lstat(path.c_str(), &info);
+  if (rc != 0) {
+    return std::nullopt;
+  }
+  Metadata result;
+  result.exists = true;
+  result.regular = S_ISREG(info.st_mode);
+  result.directory = S_ISDIR(info.st_mode);
+  result.mode = info.st_mode;
+  result.size = static_cast<uintmax_t>(info.st_size);
+  result.mtime = info.st_mtim.tv_sec;
+  return result;
+}
+
+/// Coarse writability heuristic used when deciding whether to capture content.
+///
+/// Returns true when the path does not exist, when the tracer runs as root, or
+/// when any write bit (user, group or other) is set in `meta.mode`.
+/// NOTE(review): ownership is not consulted, so a file writable only by some
+/// other user still counts as writable here — confirm this is intended.
+bool writable_by_current_user(const Metadata& meta) {
+  if (!meta.exists) return true;
+  if (tracer_uid == 0) return true;
+  constexpr mode_t kAnyWriteBit = S_IWUSR | S_IWGRP | S_IWOTH;
+  return (meta.mode & kAnyWriteBit) != 0;
+}
+
+/// True when `path` can be lstat()ed, belongs to a uid other than the tracer's,
+/// and has no write bit set at all. A failed lstat() yields false.
+bool owned_by_other_and_not_writable(const fs::path& path) {
+  struct stat info {};
+  if (lstat(path.c_str(), &info) != 0) {
+    return false;
+  }
+  const bool foreign_owner = info.st_uid != tracer_uid;
+  const bool any_write_bit = (info.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0;
+  return foreign_owner && !any_write_bit;
+}
+
+/// Walks from `path` towards the root and returns the first component chain
+/// that exists on disk, or an empty path when none does.
+fs::path existing_anchor(fs::path path) {
+  std::error_code ec;
+  for (;;) {
+    if (fs::exists(path, ec)) {
+      return path;
+    }
+    // Stop once there is no further parent to try.
+    if (path.empty() || path == path.root_path()) {
+      return {};
+    }
+    path = path.parent_path();
+  }
+}
+
+/// Renders a libgit2 object id as a hexadecimal string.
+std::string oid_to_string(const git_oid* oid) {
+  char hex[GIT_OID_HEXSZ + 1] = {};
+  git_oid_tostr(hex, sizeof(hex), oid);
+  return std::string(hex);
+}
+
+/// Returns the hex object id HEAD points at, or "" when HEAD cannot be
+/// resolved or has no direct target.
+std::string repo_head(git_repository* repo) {
+  git_reference* head_ref = nullptr;
+  if (git_repository_head(&head_ref, repo) != 0) {
+    return {};
+  }
+  std::string hex;
+  if (const git_oid* target = git_reference_target(head_ref)) {
+    hex = oid_to_string(target);
+  }
+  git_reference_free(head_ref);
+  return hex;
+}
+
+/// Classifies `input_path` with respect to git and updates the global `repos` map.
+///
+/// The repository is discovered from the nearest existing ancestor of the path.
+/// Paths outside any repository, or inside a bare repository, yield a
+/// default GitInfo (in_repo == false). When the status lookup fails for a
+/// non-empty relative path, the path is reported as untracked and dirty.
+GitInfo classify_git(const fs::path& input_path) {
+  GitInfo info;
+
+  fs::path start = existing_anchor(input_path);
+  if (start.empty()) {
+    start = existing_anchor(input_path.parent_path());
+  }
+  if (start.empty()) {
+    return info;
+  }
+
+  git_repository* repo = nullptr;
+  if (git_repository_open_ext(&repo, start.c_str(), 0, nullptr) != 0) {
+    return info;
+  }
+
+  const char* const workdir = git_repository_workdir(repo);
+  if (workdir == nullptr) {
+    // Bare repository: there is no working tree to classify against.
+    git_repository_free(repo);
+    return info;
+  }
+
+  info.in_repo = true;
+  info.root = best_effort_canonical(workdir).string();
+  info.head = repo_head(repo);
+
+  std::error_code ec;
+  const fs::path relative = fs::relative(input_path, info.root, ec);
+  if (!ec) {
+    info.relative_path = relative.string();
+  }
+
+  if (!info.relative_path.empty()) {
+    unsigned int flags = 0;
+    if (git_status_file(&flags, repo, info.relative_path.c_str()) == 0) {
+      const bool is_new = (flags & GIT_STATUS_WT_NEW) != 0;
+      const bool is_ignored = (flags & GIT_STATUS_IGNORED) != 0;
+      info.ignored = is_ignored;
+      info.tracked = !is_new && !is_ignored;
+      info.dirty = flags != GIT_STATUS_CURRENT;
+    } else {
+      info.tracked = false;
+      info.dirty = true;
+    }
+  }
+
+  // Fold this observation into the per-repository summary; dirtiness is sticky.
+  RepoRecord& summary = repos[info.root];
+  summary.root = info.root;
+  summary.head = info.head;
+  summary.dirty = summary.dirty || info.dirty;
+
+  git_repository_free(repo);
+  return info;
+}
+
+/// Computes the 64-bit FNV-1a hash of a file's bytes as 16 lowercase hex digits.
+///
+/// An unreadable file hashes like an empty one (the FNV offset basis).
+/// NOTE(review): FNV-1a is not collision resistant; two different files that
+/// collide would share one blob name in store_blob().
+std::string fnv1a_file_digest(const fs::path& path) {
+  constexpr uint64_t kOffsetBasis = 1469598103934665603ULL;
+  constexpr uint64_t kPrime = 1099511628211ULL;
+
+  std::ifstream in(path, std::ios::binary);
+  uint64_t hash = kOffsetBasis;
+  // Read in chunks rather than one stream call per byte; the digest is unchanged.
+  char chunk[1 << 16];
+  while (in) {
+    in.read(chunk, sizeof(chunk));
+    const std::streamsize got = in.gcount();
+    for (std::streamsize i = 0; i < got; ++i) {
+      hash ^= static_cast<unsigned char>(chunk[i]);
+      hash *= kPrime;
+    }
+  }
+
+  std::ostringstream out;
+  out << std::hex << std::setw(16) << std::setfill('0') << hash;
+  return out.str();
+}
+
+/// Copies `path` into the blob directory under its content digest (unless a
+/// blob with that name is already present) and returns the digest.
+std::string store_blob(const fs::path& path) {
+  const std::string digest = fnv1a_file_digest(path);
+  const fs::path destination = blob_dir / digest;
+  if (fs::exists(destination)) {
+    return digest;
+  }
+  fs::copy_file(path, destination, fs::copy_options::overwrite_existing);
+  return digest;
+}
+
+/// Decides whether a path's bytes should be stored as a blob.
+/// Content is skipped for anything that is not an existing regular file, for
+/// files owned by another user with no write bit, and for tracked files that
+/// are clean in git; otherwise the writability heuristic decides.
+bool should_capture_content(const fs::path& path, const Metadata& meta, const GitInfo& git) {
+  const bool regular_file = meta.exists && meta.regular;
+  if (!regular_file) return false;
+  if (owned_by_other_and_not_writable(path)) return false;
+  const bool recoverable_from_git = git.in_repo && git.tracked && !git.dirty;
+  if (recoverable_from_git) return false;
+  return writable_by_current_user(meta);
+}
+
+/// Serialises a Metadata value for the manifest. "tombstone" and "blob" are
+/// emitted only when set; type/mode/size/mtime only for existing paths.
+json metadata_json(const Metadata& meta) {
+  json result;
+  result["exists"] = meta.exists;
+  if (meta.tombstone) {
+    result["tombstone"] = true;
+  }
+  if (meta.exists) {
+    const char* type = "other";
+    if (meta.directory) {
+      type = "directory";
+    } else if (meta.regular) {
+      type = "file";
+    }
+    result["type"] = type;
+    result["mode"] = meta.mode;
+    result["size"] = meta.size;
+    result["mtime"] = meta.mtime;
+  }
+  if (!meta.blob.empty()) {
+    result["blob"] = meta.blob;
+  }
+  return result;
+}
+
+/// Serialises a GitInfo value for the manifest. Outside a repository only
+/// {"in_repo": false} is produced.
+json git_json(const GitInfo& git) {
+  json result;
+  result["in_repo"] = git.in_repo;
+  if (!git.in_repo) {
+    return result;
+  }
+  result["root"] = git.root;
+  result["head"] = git.head;
+  result["relative_path"] = git.relative_path;
+  result["tracked"] = git.tracked;
+  result["dirty"] = git.dirty;
+  result["ignored"] = git.ignored;
+  return result;
+}
+
+/// Notes that `operation` touched `raw_path`. The first observation of a path
+/// also captures its "before" metadata, git classification and, when
+/// should_capture_content() agrees, a content blob. Empty paths are ignored.
+void record_observation(const fs::path& raw_path, const std::string& operation) {
+  if (raw_path.empty()) return;
+
+  const fs::path resolved = best_effort_canonical(raw_path);
+  const std::string key = resolved.string();
+
+  FileRecord& entry = files[key];
+  entry.path = key;
+  entry.operations.insert(operation);
+
+  if (entry.before_recorded) return;
+  entry.before_recorded = true;
+  entry.before = stat_metadata(resolved).value_or(Metadata{});
+  entry.before_git = classify_git(resolved);
+  if (should_capture_content(resolved, entry.before, entry.before_git)) {
+    entry.before.blob = store_blob(resolved);
+  }
+}
+
+/// Captures the "after" state of every observed path once tracing has ended.
+/// A path that no longer exists becomes a tombstone only if a "delete"
+/// operation was recorded for it.
+void finalize_records() {
+  for (auto& item : files) {
+    FileRecord& entry = item.second;
+    const fs::path target(entry.path);
+
+    entry.after = stat_metadata(target).value_or(Metadata{});
+    if (!entry.after.exists) {
+      entry.after.tombstone = entry.operations.count("delete") > 0;
+    }
+
+    entry.after_git = classify_git(target);
+    if (should_capture_content(target, entry.after, entry.after_git)) {
+      entry.after.blob = store_blob(target);
+    }
+  }
+}
+
+/// Resolves a syscall path argument to an absolute path.
+/// Absolute paths are returned unchanged; relative ones are joined to the
+/// directory behind `dirfd` when that fd is tracked, else to the tracee's cwd,
+/// and then normalised lexically.
+fs::path resolve_path(const ProcState& proc, int dirfd, const std::string& path) {
+  const fs::path requested(path);
+  if (requested.is_absolute()) {
+    return requested;
+  }
+  fs::path base = proc.cwd;
+  if (dirfd != AT_FDCWD) {
+    const auto known = proc.fds.find(dirfd);
+    if (known != proc.fds.end()) {
+      base = known->second;
+    }
+  }
+  return lexical_abs(requested, base);
+}
+
+/// True when open(2) flags request write access, or include O_CREAT, O_TRUNC
+/// or O_APPEND (each of which can modify the filesystem).
+bool is_write_open(int flags) {
+  switch (flags & O_ACCMODE) {
+    case O_WRONLY:
+    case O_RDWR:
+      return true;
+    default:
+      return (flags & (O_CREAT | O_TRUNC | O_APPEND)) != 0;
+  }
+}
+
+/// True when the access mode in open(2) flags is O_RDONLY or O_RDWR.
+bool is_read_open(int flags) {
+  const int mode = flags & O_ACCMODE;
+  if (mode == O_RDONLY) return true;
+  return mode == O_RDWR;
+}
+
+/// Records the path behind `fd` of tracee `pid` by reading /proc/<pid>/fd/<fd>.
+/// Only absolute link targets are stored; anything else leaves the fd table untouched.
+void refresh_proc_fd(pid_t pid, ProcState& proc, int fd) {
+  const std::string link = "/proc/" + std::to_string(pid) + "/fd/" + std::to_string(fd);
+  const std::string target = readlink_string(link);
+  const bool absolute = !target.empty() && target.front() == '/';
+  if (absolute) {
+    proc.fds[fd] = best_effort_canonical(target);
+  }
+}
+
+/// Refreshes the tracked working directory of tracee `pid` from /proc/<pid>/cwd.
+/// The previous value is kept when the link cannot be read.
+void refresh_proc_cwd(pid_t pid, ProcState& proc) {
+  const std::string link = "/proc/" + std::to_string(pid) + "/cwd";
+  const std::string target = readlink_string(link);
+  if (target.empty()) {
+    return;
+  }
+  proc.cwd = best_effort_canonical(target);
+}
+
+/// Decodes the arguments of a syscall at its entry stop and stores them in
+/// `proc.pending` for handle_syscall_exit().
+///
+/// Two things happen beyond plain decoding:
+///  * openat2 passes its flags inside a `struct open_how` in tracee memory, so
+///    the flags word is read from there rather than taken from the register
+///    (which holds a pointer).
+///  * For calls that can destroy data (unlink, rmdir, rename, truncate,
+///    ftruncate, and opens with O_TRUNC) the affected paths are observed as
+///    "existence" right now, so that their "before" state is captured while
+///    the old content still exists.
+///
+/// @param pid   tracee currently stopped at syscall entry
+/// @param proc  tracker state for that tracee
+/// @param regs  register snapshot taken at the stop (x86-64 layout)
+void handle_syscall_entry(pid_t pid, ProcState& proc, const user_regs_struct& regs) {
+  PendingSyscall p;
+  p.nr = static_cast<long>(regs.orig_rax);
+  p.args = {regs.rdi, regs.rsi, regs.rdx, regs.r10, regs.r8, regs.r9};
+
+  // Reads the string at argument `index` and resolves it against `dirfd`.
+  const auto path_arg = [&](int dirfd, size_t index) {
+    return resolve_path(proc, dirfd, read_tracee_string(pid, p.args[index])).string();
+  };
+
+  switch (p.nr) {
+    case SYS_open:
+      p.path_a = path_arg(AT_FDCWD, 0);
+      p.flags = static_cast<int>(p.args[1]);
+      break;
+    case SYS_openat:
+      p.dirfd = static_cast<int>(p.args[0]);
+      p.path_a = path_arg(p.dirfd, 1);
+      p.flags = static_cast<int>(p.args[2]);
+      break;
+#ifdef SYS_openat2
+    case SYS_openat2: {
+      p.dirfd = static_cast<int>(p.args[0]);
+      p.path_a = path_arg(p.dirfd, 1);
+      // args[2] points at struct open_how; its first member is the 64-bit flags word.
+      errno = 0;
+      const long how_flags = ptrace(PTRACE_PEEKDATA, pid, p.args[2], nullptr);
+      p.flags = (errno == 0) ? static_cast<int>(how_flags) : 0;
+      break;
+    }
+#endif
+    case SYS_creat:
+      p.path_a = path_arg(AT_FDCWD, 0);
+      p.flags = O_CREAT | O_WRONLY | O_TRUNC;
+      break;
+    case SYS_stat:
+    case SYS_lstat:
+    case SYS_access:
+    case SYS_readlink:
+    case SYS_unlink:
+    case SYS_rmdir:
+    case SYS_mkdir:
+    case SYS_chdir:
+    case SYS_truncate:
+      p.path_a = path_arg(AT_FDCWD, 0);
+      break;
+    case SYS_newfstatat:
+    case SYS_faccessat:
+#ifdef SYS_faccessat2
+    case SYS_faccessat2:
+#endif
+    case SYS_readlinkat:
+    case SYS_unlinkat:
+    case SYS_mkdirat:
+      p.dirfd = static_cast<int>(p.args[0]);
+      p.path_a = path_arg(p.dirfd, 1);
+      break;
+    case SYS_rename:
+      p.path_a = path_arg(AT_FDCWD, 0);
+      p.path_b = path_arg(AT_FDCWD, 1);
+      break;
+    case SYS_renameat:
+#ifdef SYS_renameat2
+    case SYS_renameat2:
+#endif
+      p.path_a = path_arg(static_cast<int>(p.args[0]), 1);
+      p.path_b = path_arg(static_cast<int>(p.args[2]), 3);
+      break;
+    case SYS_getdents:
+    case SYS_getdents64:
+    case SYS_fchdir:
+    case SYS_ftruncate:
+    case SYS_close:
+    case SYS_dup:
+    case SYS_dup2:
+    case SYS_dup3:
+    case SYS_fcntl:
+      p.fd = static_cast<int>(p.args[0]);
+      break;
+    default:
+      break;
+  }
+
+  // Capture pre-call state for paths the kernel is about to modify or remove.
+  // record_observation() ignores empty paths and only snapshots a path once.
+  switch (p.nr) {
+    case SYS_open:
+    case SYS_openat:
+#ifdef SYS_openat2
+    case SYS_openat2:
+#endif
+    case SYS_creat:
+      if (p.flags & O_TRUNC) record_observation(p.path_a, "existence");
+      break;
+    case SYS_unlink:
+    case SYS_unlinkat:
+    case SYS_rmdir:
+    case SYS_truncate:
+      record_observation(p.path_a, "existence");
+      break;
+    case SYS_rename:
+    case SYS_renameat:
+#ifdef SYS_renameat2
+    case SYS_renameat2:
+#endif
+      record_observation(p.path_a, "existence");
+      record_observation(p.path_b, "existence");
+      break;
+    case SYS_ftruncate: {
+      const auto known = proc.fds.find(p.fd);
+      if (known != proc.fds.end()) record_observation(known->second, "existence");
+      break;
+    }
+    default:
+      break;
+  }
+
+  proc.pending = p;
+}
+
+/// Interprets the result of the syscall decoded at entry and records what it
+/// did to the filesystem and to the tracee's fd table / working directory.
+///
+/// Mutating calls (unlink, rmdir, rename, mkdir, truncate, ftruncate) are only
+/// recorded as "delete"/"write" when they succeeded; a failed call is recorded
+/// as an "existence" probe, matching how failed opens are treated. Duplicating
+/// an untracked fd clears any stale path previously associated with the new fd.
+///
+/// NOTE(review): record_observation() captures a path's "before" state on its
+/// first observation; when that first observation happens here, the state is
+/// the one after the syscall completed.
+///
+/// @param pid     tracee currently stopped at syscall exit
+/// @param proc    tracker state for that tracee
+/// @param result  raw return register value; negative means failure
+void handle_syscall_exit(pid_t pid, ProcState& proc, long result) {
+  const PendingSyscall& p = proc.pending;
+  const bool ok = result >= 0;
+
+  // Makes `to` refer to whatever `from` refers to, or forgets `to` when
+  // `from` is not a tracked path.
+  const auto alias_fd = [&proc](int from, int to) {
+    const auto source = proc.fds.find(from);
+    if (source == proc.fds.end()) {
+      proc.fds.erase(to);
+      return;
+    }
+    const fs::path target = source->second;  // copy first: the insert may touch the map
+    proc.fds[to] = target;
+  };
+
+  switch (p.nr) {
+    case SYS_open:
+    case SYS_openat:
+#ifdef SYS_openat2
+    case SYS_openat2:
+#endif
+    case SYS_creat:
+      if (ok) {
+        if (is_read_open(p.flags)) record_observation(p.path_a, "read");
+        if (is_write_open(p.flags)) record_observation(p.path_a, "write");
+        if (p.flags & O_DIRECTORY) record_observation(p.path_a, "directory");
+        refresh_proc_fd(pid, proc, static_cast<int>(result));
+      } else {
+        record_observation(p.path_a, "existence");
+      }
+      break;
+    case SYS_stat:
+    case SYS_lstat:
+    case SYS_newfstatat:
+    case SYS_access:
+    case SYS_faccessat:
+#ifdef SYS_faccessat2
+    case SYS_faccessat2:
+#endif
+    case SYS_readlink:
+    case SYS_readlinkat:
+      record_observation(p.path_a, "existence");
+      break;
+    case SYS_getdents:
+    case SYS_getdents64: {
+      const auto dir = proc.fds.find(p.fd);
+      if (ok && p.fd >= 0 && dir != proc.fds.end()) record_observation(dir->second, "directory");
+      break;
+    }
+    case SYS_unlink:
+    case SYS_unlinkat:
+    case SYS_rmdir:
+      record_observation(p.path_a, ok ? "delete" : "existence");
+      break;
+    case SYS_rename:
+    case SYS_renameat:
+#ifdef SYS_renameat2
+    case SYS_renameat2:
+#endif
+      record_observation(p.path_a, ok ? "delete" : "existence");
+      record_observation(p.path_b, ok ? "write" : "existence");
+      break;
+    case SYS_mkdir:
+    case SYS_mkdirat:
+    case SYS_truncate:
+      record_observation(p.path_a, ok ? "write" : "existence");
+      break;
+    case SYS_ftruncate: {
+      const auto file = proc.fds.find(p.fd);
+      if (ok && p.fd >= 0 && file != proc.fds.end()) record_observation(file->second, "write");
+      break;
+    }
+    case SYS_chdir:
+    case SYS_fchdir:
+      if (ok) refresh_proc_cwd(pid, proc);
+      break;
+    case SYS_close:
+      if (ok) proc.fds.erase(p.fd);
+      break;
+    case SYS_dup:
+      if (ok) alias_fd(p.fd, static_cast<int>(result));
+      break;
+    case SYS_dup2:
+    case SYS_dup3:
+      if (ok) alias_fd(p.fd, static_cast<int>(p.args[1]));
+      break;
+    case SYS_fcntl: {
+      const int command = static_cast<int>(p.args[1]);
+      if (ok && (command == F_DUPFD || command == F_DUPFD_CLOEXEC)) {
+        alias_fd(p.fd, static_cast<int>(result));
+      }
+      break;
+    }
+    default:
+      break;
+  }
+}
+
+/// Runs `command` under ptrace and feeds every syscall stop of the process
+/// tree into handle_syscall_entry()/handle_syscall_exit(). Returns once all
+/// tracees have exited.
+///
+/// Child-process handling:
+///  * A new child inherits the parent's cwd and fd table but starts outside
+///    any syscall: it reports no exit stop for the fork that created it, so
+///    copying the parent's in_syscall flag would invert its entry/exit pairing.
+///  * The child's first stop may be reported before the parent's fork event;
+///    such a pid is registered immediately instead of being resumed untracked.
+///  * The automatic SIGSTOP a new child receives is suppressed rather than
+///    delivered to it.
+///
+/// @throws std::runtime_error when fork, the initial wait, PTRACE_SETOPTIONS
+///         or the wait loop fails.
+void trace_command(const std::vector<std::string>& command) {
+  pid_t child = fork();
+  if (child < 0) throw std::runtime_error(errno_message("fork failed"));
+  if (child == 0) {
+    if (ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) != 0) _exit(127);
+    // Stop so the parent can set ptrace options before exec.
+    raise(SIGSTOP);
+    std::vector<char*> argv;
+    for (const auto& arg : command) argv.push_back(const_cast<char*>(arg.c_str()));
+    argv.push_back(nullptr);
+    execvp(argv[0], argv.data());
+    _exit(127);
+  }
+
+  int status = 0;
+  if (waitpid(child, &status, 0) < 0) throw std::runtime_error(errno_message("waitpid failed"));
+  if (!WIFSTOPPED(status)) {
+    throw std::runtime_error("tracee exited before it could be traced");
+  }
+  if (ptrace(PTRACE_SETOPTIONS, child, nullptr, kPtraceOptions) != 0) {
+    throw std::runtime_error(errno_message("ptrace SETOPTIONS failed"));
+  }
+  processes[child].cwd = fs::current_path();
+  refresh_proc_cwd(child, processes[child]);
+  ptrace(PTRACE_SYSCALL, child, nullptr, nullptr);
+
+  // Children announced by a fork event whose initial SIGSTOP has not been seen yet.
+  std::set<pid_t> awaiting_initial_stop;
+
+  while (!processes.empty()) {
+    pid_t pid = waitpid(-1, &status, __WALL);
+    if (pid < 0) {
+      if (errno == EINTR) continue;
+      if (errno == ECHILD) break;
+      throw std::runtime_error(errno_message("waitpid trace loop failed"));
+    }
+
+    const bool terminated = WIFEXITED(status) || WIFSIGNALED(status);
+    auto it = processes.find(pid);
+    if (it == processes.end()) {
+      if (terminated || !WIFSTOPPED(status)) continue;
+      // A new child's initial stop won the race against its parent's fork
+      // event: start tracking it now; the fork event fills in the fd table.
+      ProcState early;
+      refresh_proc_cwd(pid, early);
+      processes.emplace(pid, std::move(early));
+      ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr);
+      continue;
+    }
+
+    if (terminated) {
+      awaiting_initial_stop.erase(pid);
+      processes.erase(it);
+      continue;
+    }
+
+    if (!WIFSTOPPED(status)) continue;
+    int sig = WSTOPSIG(status);
+    unsigned event = static_cast<unsigned>(status >> 16);
+
+    if (event == PTRACE_EVENT_FORK || event == PTRACE_EVENT_VFORK ||
+        event == PTRACE_EVENT_CLONE) {
+      unsigned long new_pid = 0;
+      if (ptrace(PTRACE_GETEVENTMSG, pid, nullptr, &new_pid) == 0 && new_pid != 0) {
+        const pid_t child_pid = static_cast<pid_t>(new_pid);
+        const ProcState& parent = it->second;
+        auto known = processes.find(child_pid);
+        if (known == processes.end()) {
+          // Inherit cwd and fds only; in_syscall/pending start from defaults.
+          ProcState inherited;
+          inherited.cwd = parent.cwd;
+          inherited.fds = parent.fds;
+          processes.emplace(child_pid, std::move(inherited));
+          ptrace(PTRACE_SETOPTIONS, child_pid, nullptr, kPtraceOptions);
+          if (ptrace(PTRACE_SYSCALL, child_pid, nullptr, nullptr) != 0) {
+            // Not stopped yet: its initial SIGSTOP is still to come.
+            awaiting_initial_stop.insert(child_pid);
+          }
+        } else {
+          // Already running (registered above): add inherited fds without
+          // overwriting anything the child has opened since.
+          known->second.fds.insert(parent.fds.begin(), parent.fds.end());
+          if (known->second.cwd.empty()) known->second.cwd = parent.cwd;
+        }
+      }
+      ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr);
+      continue;
+    }
+
+    if (sig == (SIGTRAP | 0x80)) {
+      // Syscall stop (flagged by PTRACE_O_TRACESYSGOOD); entry and exit alternate.
+      user_regs_struct regs {};
+      if (ptrace(PTRACE_GETREGS, pid, nullptr, &regs) == 0) {
+        ProcState& proc = it->second;
+        if (!proc.in_syscall) {
+          handle_syscall_entry(pid, proc, regs);
+          proc.in_syscall = true;
+        } else {
+          handle_syscall_exit(pid, proc, static_cast<long>(regs.rax));
+          proc.in_syscall = false;
+        }
+      }
+      ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr);
+    } else {
+      // Plain SIGTRAP stops (exec/exit events) are swallowed; real signals are re-injected.
+      int deliver = (sig == SIGTRAP) ? 0 : sig;
+      if (sig == SIGSTOP && awaiting_initial_stop.erase(pid) > 0) deliver = 0;
+      ptrace(PTRACE_SYSCALL, pid, nullptr, reinterpret_cast<void*>(static_cast<long>(deliver)));
+    }
+  }
+}
+
+/// Writes `<out>/manifest.json` describing the traced command, the touched git
+/// repositories and every observed file.
+///
+/// For each file the "git" entry uses the classification taken after the run
+/// when that one is inside a repository, otherwise the one taken before.
+///
+/// @param out          snapshot directory
+/// @param command      the traced command line
+/// @param exit_status  value stored under "exit_status"
+/// @throws std::runtime_error when the manifest cannot be opened or written.
+void write_manifest(const fs::path& out, const std::vector<std::string>& command, int exit_status) {
+  json manifest;
+  manifest["format_version"] = 1;
+  manifest["command"] = command;
+  manifest["exit_status"] = exit_status;
+  manifest["start_cwd"] = fs::current_path().string();
+  manifest["uid"] = tracer_uid;
+  manifest["gid"] = tracer_gid;
+
+  manifest["git_repositories"] = json::array();
+  for (const auto& [_, repo] : repos) {
+    manifest["git_repositories"].push_back({
+        {"root", repo.root},
+        {"head", repo.head},
+        {"dirty", repo.dirty},
+    });
+  }
+
+  manifest["files"] = json::array();
+  for (const auto& [_, rec] : files) {
+    json ops = json::array();
+    for (const auto& op : rec.operations) ops.push_back(op);
+    manifest["files"].push_back({
+        {"path", rec.path},
+        {"operations", ops},
+        {"before", metadata_json(rec.before)},
+        {"after", metadata_json(rec.after)},
+        {"git", git_json(rec.after_git.in_repo ? rec.after_git : rec.before_git)},
+    });
+  }
+
+  const fs::path manifest_path = out / "manifest.json";
+  std::ofstream stream(manifest_path);
+  if (!stream) {
+    throw std::runtime_error("cannot open " + manifest_path.string() + " for writing");
+  }
+  stream << manifest.dump(2) << "\n";
+  stream.flush();
+  // A snapshot without a complete manifest is useless; fail loudly instead.
+  if (!stream) {
+    throw std::runtime_error("failed to write " + manifest_path.string());
+  }
+}
+
+/// Re-applies the final ("after") state recorded in `<dir>/manifest.json`.
+///
+/// Per file entry: a missing path flagged as tombstone is removed (errors
+/// ignored); an existing path with a blob is rewritten from the blob, then its
+/// permission bits and mtime are restored. Entries without a blob are skipped.
+///
+/// @throws std::runtime_error when the manifest cannot be opened; JSON and
+///         filesystem errors propagate as their own exception types.
+void restore_snapshot(const fs::path& dir) {
+  std::ifstream manifest_file(dir / "manifest.json");
+  if (!manifest_file) throw std::runtime_error("cannot open manifest");
+  const json manifest = json::parse(manifest_file);
+
+  for (const auto& entry : manifest.at("files")) {
+    const fs::path target = entry.at("path").get<std::string>();
+    const json& final_state = entry.at("after");
+
+    const bool existed = final_state.value("exists", false);
+    if (!existed) {
+      if (final_state.value("tombstone", false)) {
+        std::error_code ignored;
+        fs::remove(target, ignored);
+      }
+      continue;
+    }
+    if (!final_state.contains("blob")) continue;
+
+    const fs::path blob = dir / "blobs" / final_state.at("blob").get<std::string>();
+    fs::create_directories(target.parent_path());
+    fs::copy_file(blob, target, fs::copy_options::overwrite_existing);
+
+    if (final_state.contains("mode")) {
+      // Keep only the permission/setuid/setgid/sticky bits of the recorded st_mode.
+      const unsigned bits = final_state.at("mode").get<unsigned>() & 07777;
+      fs::permissions(target, static_cast<fs::perms>(bits), fs::perm_options::replace);
+    }
+    if (final_state.contains("mtime")) {
+      // std::filesystem has no portable wall-clock setter for Unix time in C++20.
+      struct timespec times[2] {};
+      times[0].tv_nsec = UTIME_OMIT;  // leave the access time alone
+      times[1].tv_sec = final_state.at("mtime").get<std::time_t>();
+      times[1].tv_nsec = 0;
+      utimensat(AT_FDCWD, target.c_str(), times, AT_SYMLINK_NOFOLLOW);
+    }
+  }
+}
+
+/// Implements `agent-snapshot --output SNAPDIR -- command args...`.
+///
+/// Parses the arguments, prepares a fresh snapshot directory, traces the
+/// command, and writes the manifest.
+///
+/// The output directory is wiped before use, so an existing path is only
+/// accepted when it looks like something this tool produced: a directory that
+/// is empty, contains manifest.json, or contains nothing but `blobs`. Anything
+/// else is refused rather than recursively deleted.
+///
+/// NOTE(review): the manifest's exit_status and the return value are always 0
+/// because trace_command() returns void; the traced command's real exit
+/// status is not propagated.
+///
+/// @throws std::runtime_error on bad usage or an unsafe output path.
+int run_snapshot(const std::vector<std::string>& args) {
+  fs::path output;
+  size_t split = args.size();
+  for (size_t i = 0; i < args.size(); ++i) {
+    if (args[i] == "--output" && i + 1 < args.size()) {
+      output = args[i + 1];
+      ++i;
+    } else if (args[i] == "--") {
+      split = i + 1;
+      break;
+    } else {
+      throw std::runtime_error("usage: agent-snapshot --output SNAPDIR -- command args...");
+    }
+  }
+  if (output.empty() || split >= args.size()) {
+    throw std::runtime_error("usage: agent-snapshot --output SNAPDIR -- command args...");
+  }
+
+  // True when `dir` is safe to delete: empty, a previous snapshot, or the
+  // leftovers (just `blobs`) of an interrupted run.
+  const auto replaceable = [](const fs::path& dir) {
+    std::error_code ec;
+    if (!fs::is_directory(fs::symlink_status(dir, ec))) return false;
+    if (fs::exists(dir / "manifest.json", ec)) return true;
+    fs::directory_iterator entry(dir, ec);
+    if (ec) return false;
+    for (; entry != fs::directory_iterator(); entry.increment(ec)) {
+      if (ec || entry->path().filename() != "blobs") return false;
+    }
+    return true;
+  };
+
+  std::error_code ec;
+  if (fs::exists(fs::symlink_status(output, ec)) && !replaceable(output)) {
+    throw std::runtime_error("refusing to overwrite " + output.string() +
+                             ": not an empty directory or a previous snapshot");
+  }
+
+  snapshot_dir = output;
+  blob_dir = snapshot_dir / "blobs";
+  fs::remove_all(snapshot_dir);
+  fs::create_directories(blob_dir);
+
+  std::vector<std::string> command(args.begin() + static_cast<long>(split), args.end());
+  trace_command(command);
+  finalize_records();
+  write_manifest(snapshot_dir, command, 0);
+  return 0;
+}
+
+}  // namespace
+
+/// Entry point. `agent-snapshot restore SNAPDIR` re-applies a snapshot; any
+/// other argument list is handled by run_snapshot(). Exceptions are reported
+/// on stderr and turn into exit code 1.
+int main(int argc, char** argv) {
+  tracer_uid = getuid();
+  tracer_gid = getgid();
+  git_libgit2_init();
+
+  int exit_code = 0;
+  try {
+    const std::vector<std::string> args(argv + 1, argv + argc);
+    const bool restoring = !args.empty() && args.front() == "restore";
+    if (restoring) {
+      if (args.size() != 2) throw std::runtime_error("usage: agent-snapshot restore SNAPDIR");
+      restore_snapshot(args[1]);
+    } else {
+      exit_code = run_snapshot(args);
+    }
+  } catch (const std::exception& e) {
+    git_libgit2_shutdown();
+    std::cerr << "agent-snapshot: " << e.what() << "\n";
+    return 1;
+  }
+
+  git_libgit2_shutdown();
+  return exit_code;
+}
test_programs/dirty_untracked_write.py
new file mode 100644
index 0000000..9600648
--- /dev/null
+++ b/test_programs/dirty_untracked_write.py
@@ -0,0 +1,9 @@
+# Test program: reads two fixture files, creates one file and deletes another,
+# all inside <repo>/testdata.
+from pathlib import Path
+
+# Repository root is one level above this script's directory.
+root = Path(__file__).resolve().parents[1]
+testdata = root / "testdata"
+
+# Reads; the results are discarded on purpose.
+(testdata / "dirty.txt").read_text()
+(testdata / "untracked_runtime.txt").read_text()
+# Create/overwrite one file, then remove another (which must already exist).
+(testdata / "created_by_program.txt").write_text("created final\n")
+(testdata / "deleted_by_program.txt").unlink()
test_programs/fork_and_usr.py
new file mode 100644
index 0000000..41c397b
--- /dev/null
+++ b/test_programs/fork_and_usr.py
@@ -0,0 +1,18 @@
+# Test program: writes a file from a forked child, reads a file under /usr,
+# and lists the testdata directory from the parent.
+import os
+from pathlib import Path
+
+# Repository root is one level above this script's directory.
+root = Path(__file__).resolve().parents[1]
+testdata = root / "testdata"
+
+pid = os.fork()
+if pid == 0:
+    # Child: write its output, then exit immediately via os._exit.
+    (testdata / "child_output.txt").write_text("child final\n")
+    os._exit(0)
+
+# Parent: read a few bytes of a file outside the repository.
+with open("/usr/bin/env", "rb") as handle:
+    handle.read(16)
+
+# Iterate the directory purely for the side effect of listing it.
+for _ in testdata.iterdir():
+    pass
+
+# Reap the child before exiting.
+os.waitpid(pid, 0)
test_programs/read_clean.py
new file mode 100644
index 0000000..2310ff0
--- /dev/null
+++ b/test_programs/read_clean.py
@@ -0,0 +1,4 @@
+# Test program: prints the contents of <repo>/testdata/clean.txt.
+from pathlib import Path
+
+# Repository root is one level above this script's directory.
+root = Path(__file__).resolve().parents[1]
+print((root / "testdata" / "clean.txt").read_text())
testdata/clean.txt
new file mode 100644
index 0000000..39cb81a
--- /dev/null
+++ b/testdata/clean.txt
@@ -0,0 +1,2 @@
+clean tracked fixture
+line two
testdata/dirty.txt
new file mode 100644
index 0000000..63ba718
--- /dev/null
+++ b/testdata/dirty.txt
@@ -0,0 +1 @@
+dirty tracked fixture original
testdata/nested/info.txt
new file mode 100644
index 0000000..d892bff
--- /dev/null
+++ b/testdata/nested/info.txt
@@ -0,0 +1 @@
+nested tracked fixture
tests/test_agent_snapshot.py
new file mode 100644
index 0000000..2c8e345
--- /dev/null
+++ b/tests/test_agent_snapshot.py
@@ -0,0 +1,126 @@
+import json
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+import pytest
+
+
+# Repository root (one level above this tests/ directory).
+ROOT = Path(__file__).resolve().parents[1]
+# Out-of-source CMake build directory used by the test session.
+BUILD = ROOT / "build" / "pytest"
+# Path of the agent-snapshot executable inside BUILD.
+BIN = BUILD / "agent-snapshot"
+# Fixture directory the test programs read and modify.
+TESTDATA = ROOT / "testdata"
+
+
+def run(cmd, **kwargs):
+    return subprocess.run(cmd, cwd=ROOT, text=True, check=True, **kwargs)
+
+
+@pytest.fixture(scope="session", autouse=True)
+def build_agent_snapshot():
+    run(["cmake", "-S", ".", "-B", str(BUILD)])
+    run(["cmake", "--build", str(BUILD), "--parallel"])
+    assert BIN.exists()
+
+
+@pytest.fixture(autouse=True)
+def pristine_testdata():
+    run(["git", "checkout", "--", "testdata"])
+    run(["git", "clean", "-fd", "--", "testdata"])
+    yield
+    run(["git", "checkout", "--", "testdata"])
+    run(["git", "clean", "-fd", "--", "testdata"])
+
+
+class Snapshot:
+    def __init__(self, path: Path):
+        self.path = path
+        self.manifest = json.loads((path / "manifest.json").read_text())
+
+    def file(self, path: Path):
+        target = str(path.resolve())
+        for item in self.manifest["files"]:
+            if item["path"] == target:
+                return item
+        raise AssertionError(f"{target} not present in snapshot")
+
+    def blob_text(self, digest: str):
+        return (self.path / "blobs" / digest).read_text()
+
+
+def capture(tmp_path: Path, *command: str) -> Snapshot:
+    out = tmp_path / "snapshot"
+    run([str(BIN), "--output", str(out), "--", *command])
+    return Snapshot(out)
+
+
+def test_clean_git_tracked_read_records_repo_without_blob(tmp_path):
+    snap = capture(tmp_path, "python3", "test_programs/read_clean.py")
+    clean = snap.file(TESTDATA / "clean.txt")
+
+    assert "read" in clean["operations"]
+    assert clean["git"]["tracked"] is True
+    assert clean["git"]["dirty"] is False
+    assert clean["before"].get("blob") is None
+    assert clean["after"].get("blob") is None
+    assert any(repo["root"] == str(ROOT.resolve()) for repo in snap.manifest["git_repositories"])
+
+
+def test_dirty_untracked_created_and_deleted_files_are_captured(tmp_path):
+    (TESTDATA / "dirty.txt").write_text("dirty tracked fixture changed before run\n")
+    (TESTDATA / "untracked_runtime.txt").write_text("untracked input\n")
+    (TESTDATA / "deleted_by_program.txt").write_text("delete me\n")
+
+    snap = capture(tmp_path, "python3", "test_programs/dirty_untracked_write.py")
+
+    dirty = snap.file(TESTDATA / "dirty.txt")
+    assert dirty["git"]["tracked"] is True
+    assert dirty["git"]["dirty"] is True
+    assert snap.blob_text(dirty["before"]["blob"]) == "dirty tracked fixture changed before run\n"
+
+    untracked = snap.file(TESTDATA / "untracked_runtime.txt")
+    assert untracked["git"]["tracked"] is False
+    assert snap.blob_text(untracked["before"]["blob"]) == "untracked input\n"
+
+    created = snap.file(TESTDATA / "created_by_program.txt")
+    assert "write" in created["operations"]
+    assert created["before"]["exists"] is False
+    assert snap.blob_text(created["after"]["blob"]) == "created final\n"
+
+    deleted = snap.file(TESTDATA / "deleted_by_program.txt")
+    assert "delete" in deleted["operations"]
+    assert deleted["after"]["exists"] is False
+    assert deleted["after"]["tombstone"] is True
+
+
+def test_fork_usr_and_directory_traversal(tmp_path):
+    snap = capture(tmp_path, "python3", "test_programs/fork_and_usr.py")
+
+    child = snap.file(TESTDATA / "child_output.txt")
+    assert "write" in child["operations"]
+    assert snap.blob_text(child["after"]["blob"]) == "child final\n"
+
+    usr_env = snap.file(Path("/usr/bin/env"))
+    assert "read" in usr_env["operations"]
+    assert usr_env["before"].get("blob") is None
+    assert usr_env["after"].get("blob") is None
+
+    directory = snap.file(TESTDATA)
+    assert "directory" in directory["operations"]
+
+
+def test_restore_applies_final_state(tmp_path):
+    (TESTDATA / "dirty.txt").write_text("changed before capture\n")
+    (TESTDATA / "deleted_by_program.txt").write_text("delete me\n")
+    snap = capture(tmp_path, "python3", "test_programs/dirty_untracked_write.py")
+
+    shutil.rmtree(TESTDATA)
+    TESTDATA.mkdir()
+    (TESTDATA / "created_by_program.txt").write_text("wrong\n")
+    (TESTDATA / "deleted_by_program.txt").write_text("should disappear\n")
+
+    run([str(BIN), "restore", str(snap.path)])
+
+    assert (TESTDATA / "created_by_program.txt").read_text() == "created final\n"
+    assert not (TESTDATA / "deleted_by_program.txt").exists()