Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Port snapshotter to OCaml

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-03 01:48:08 -0400
Commit
0c08809cad241ce9068cfb9a72c1f8444e017cf6
.gitignore
index b5fc08a..82751cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 build/
+_build/
 .pytest_cache/
 .venv/
 __pycache__/
CMakeLists.txt
deleted file mode 100644
index 89796e6..0000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-cmake_minimum_required(VERSION 3.28)
-project(agent_snapshot VERSION 0.1.0 LANGUAGES CXX)
-
-include(FetchContent)
-
-set(CMAKE_CXX_STANDARD 20)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_EXTENSIONS OFF)
-
-find_package(PkgConfig REQUIRED)
-pkg_check_modules(LIBGIT2 REQUIRED IMPORTED_TARGET libgit2)
-
-FetchContent_Declare(
-  nlohmann_json
-  URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
-  URL_HASH SHA256=d6c65aca6b1ed68e7a182f4757257b107ae403032760ed6ef121c9d55e81757d
-)
-FetchContent_MakeAvailable(nlohmann_json)
-
-add_executable(agent-snapshot
-  src/main.cpp
-)
-
-target_include_directories(agent-snapshot PRIVATE include)
-target_link_libraries(agent-snapshot PRIVATE PkgConfig::LIBGIT2 nlohmann_json::nlohmann_json)
-target_compile_options(agent-snapshot PRIVATE -Wall -Wextra -Wpedantic)
-
dune-project
new file mode 100644
index 0000000..010ffd8
--- /dev/null
+++ b/dune-project
@@ -0,0 +1,11 @@
+(lang dune 3.22)
+(name agent-snapshot)
+
+(package
+ (name agent-snapshot)
+ (synopsis "Filesystem snapshotter for traced commands")
+ (depends
+  (ocaml (>= 5.4))
+  dune
+  yojson
+  ocaml-git))
src/main.cpp
deleted file mode 100644
index f96aa29..0000000
--- a/src/main.cpp
+++ /dev/null
@@ -1,955 +0,0 @@
-#include <nlohmann/json.hpp>
-#include <git2.h>
-
-#include <sys/ptrace.h>
-#include <sys/reg.h>
-#include <sys/stat.h>
-#include <sys/syscall.h>
-#include <sys/types.h>
-#include <sys/uio.h>
-#include <sys/user.h>
-#include <sys/wait.h>
-
-#include <fcntl.h>
-#include <signal.h>
-#include <unistd.h>
-
-#include <cerrno>
-#include <chrono>
-#include <cstdlib>
-#include <cstring>
-#include <filesystem>
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <map>
-#include <optional>
-#include <set>
-#include <sstream>
-#include <stdexcept>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-using json = nlohmann::json;
-namespace fs = std::filesystem;
-
-namespace {
-
-// ptrace reports many kinds of stops through the same waitpid interface. These
-// options are what make the tracer useful for this project:
-//
-// - PTRACE_O_TRACESYSGOOD marks syscall stops as SIGTRAP|0x80, so we can tell
-//   "the process is entering/leaving a syscall" apart from ordinary SIGTRAPs.
-// - PTRACE_O_TRACE{FORK,VFORK,CLONE} asks the kernel to stop the parent when a
-//   new child is created and lets us attach the same syscall tracing policy to
-//   the child before it runs far enough to hide file accesses from us.
-// - EXEC/EXIT are not deeply interpreted yet, but enabling them keeps the event
-//   stream explicit and leaves room for recording process lifecycle later.
-constexpr int kPtraceOptions = PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK |
-                               PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE |
-                               PTRACE_O_TRACEEXEC | PTRACE_O_TRACEEXIT;
-
-// The manifest stores before/after facts because reconstruction is about final
-// state, but deciding whether to blob a file often depends on what was present
-// before the traced program touched it. For example, a created file must have an
-// explicit "before did not exist" fact, while a deleted file needs a tombstone.
-struct Metadata {
-  bool exists = false;
-  bool tombstone = false;
-  bool regular = false;
-  bool directory = false;
-  mode_t mode = 0;
-  uintmax_t size = 0;
-  std::time_t mtime = 0;
-  std::string blob;
-};
-
-// Git facts are the main compactness mechanism. A clean tracked path can be
-// reconstructed from repository root + HEAD + relative path, so storing its
-// content would duplicate information already represented by Git.
-struct GitInfo {
-  bool in_repo = false;
-  bool tracked = false;
-  bool dirty = false;
-  bool ignored = false;
-  std::string root;
-  std::string head;
-  std::string relative_path;
-};
-
-// A file is recorded once and accumulates capabilities. This keeps the manifest
-// compact when a program stats, opens, reads, and later deletes the same path.
-struct FileRecord {
-  std::string path;
-  std::set<std::string> operations;
-  Metadata before;
-  Metadata after;
-  GitInfo before_git;
-  GitInfo after_git;
-  bool before_recorded = false;
-};
-
-// ptrace gives us register state separately at syscall entry and syscall exit.
-// Path pointers are only meaningful at entry, while return values and new file
-// descriptors only exist at exit, so we cache the decoded entry-side state here.
-struct PendingSyscall {
-  long nr = -1;
-  std::array<unsigned long long, 6> args{};
-  std::string path_a;
-  std::string path_b;
-  int dirfd = AT_FDCWD;
-  int fd = -1;
-  int flags = 0;
-};
-
-// ptrace tracks tasks, not "programs". Each traced pid can have a different cwd
-// and fd table after chdir/open/dup/close, and relative path reconstruction is
-// only as good as this per-process state.
-struct ProcState {
-  bool in_syscall = false;
-  fs::path cwd;
-  std::map<int, fs::path> fds;
-  PendingSyscall pending;
-};
-
-struct RepoRecord {
-  std::string root;
-  std::string head;
-  bool dirty = false;
-};
-
-std::unordered_map<pid_t, ProcState> processes;
-std::map<std::string, FileRecord> files;
-std::map<std::string, RepoRecord> repos;
-std::vector<fs::path> ignored_paths;
-fs::path ignore_config_path;
-fs::path snapshot_dir;
-fs::path blob_dir;
-uid_t tracer_uid = 0;
-gid_t tracer_gid = 0;
-
-std::string errno_message(const std::string& prefix) {
-  return prefix + ": " + std::strerror(errno);
-}
-
-std::string readlink_string(const fs::path& path) {
-  std::vector<char> buffer(4096);
-  ssize_t n = readlink(path.c_str(), buffer.data(), buffer.size() - 1);
-  if (n < 0) return {};
-  buffer[static_cast<size_t>(n)] = '\0';
-  return std::string(buffer.data());
-}
-
-fs::path lexical_abs(const fs::path& path, const fs::path& base) {
-  if (path.is_absolute()) return path.lexically_normal();
-  return (base / path).lexically_normal();
-}
-
-fs::path best_effort_canonical(const fs::path& path) {
-  std::error_code ec;
-  fs::path canonical = fs::weakly_canonical(path, ec);
-  if (!ec && !canonical.empty()) return canonical.lexically_normal();
-  if (path.is_absolute()) return path.lexically_normal();
-  return fs::absolute(path, ec).lexically_normal();
-}
-
-fs::path xdg_ignore_config_path() {
-  const char* config_home = std::getenv("XDG_CONFIG_HOME");
-  if (config_home && config_home[0] != '\0') {
-    return fs::path(config_home) / "agent-snapshot" / "ignore.json";
-  }
-  const char* home = std::getenv("HOME");
-  if (!home || home[0] == '\0') {
-    throw std::runtime_error("XDG_CONFIG_HOME is unset and HOME is unavailable");
-  }
-  return fs::path(home) / ".config" / "agent-snapshot" / "ignore.json";
-}
-
-fs::path home_dir() {
-  const char* home = std::getenv("HOME");
-  if (!home || home[0] == '\0') {
-    throw std::runtime_error("HOME is unavailable");
-  }
-  return fs::path(home);
-}
-
-fs::path xdg_config_home_dir() {
-  const char* config_home = std::getenv("XDG_CONFIG_HOME");
-  if (config_home && config_home[0] != '\0') {
-    return fs::path(config_home);
-  }
-  return home_dir() / ".config";
-}
-
-fs::path expand_ignore_entry(const std::string& entry) {
-  constexpr std::string_view kHome = "$HOME";
-  constexpr std::string_view kXdgConfigHome = "$XDG_CONFIG_HOME";
-  if (entry == kHome) return home_dir();
-  if (entry.rfind(std::string(kHome) + "/", 0) == 0) {
-    return home_dir() / entry.substr(kHome.size() + 1);
-  }
-  if (entry == kXdgConfigHome) return xdg_config_home_dir();
-  if (entry.rfind(std::string(kXdgConfigHome) + "/", 0) == 0) {
-    return xdg_config_home_dir() / entry.substr(kXdgConfigHome.size() + 1);
-  }
-  return fs::path(entry);
-}
-
-bool path_is_at_or_under(const fs::path& path, const fs::path& root) {
-  auto pit = path.begin();
-  auto rit = root.begin();
-  for (; rit != root.end(); ++rit, ++pit) {
-    if (pit == path.end() || *pit != *rit) return false;
-  }
-  return true;
-}
-
-bool is_git_internal_path(const fs::path& path) {
-  for (const auto& part : path) {
-    if (part == ".git") return true;
-  }
-  return false;
-}
-
-bool is_ignored_path(const fs::path& raw_path) {
-  if (raw_path.empty()) return false;
-  fs::path path = best_effort_canonical(raw_path);
-  if (is_git_internal_path(path)) return true;
-  for (const auto& ignored : ignored_paths) {
-    if (path_is_at_or_under(path, ignored)) return true;
-  }
-  return false;
-}
-
-void load_ignore_config() {
-  ignore_config_path = best_effort_canonical(xdg_ignore_config_path());
-  std::ifstream stream(ignore_config_path);
-  if (!stream) {
-    throw std::runtime_error("ignore config does not exist: " + ignore_config_path.string());
-  }
-
-  json entries = json::parse(stream);
-  if (!entries.is_array()) {
-    throw std::runtime_error("ignore config must be a JSON array: " + ignore_config_path.string());
-  }
-
-  ignored_paths.clear();
-  ignored_paths.push_back(ignore_config_path);
-  for (const auto& entry : entries) {
-    if (!entry.is_string()) {
-      throw std::runtime_error("ignore config entries must be strings: " + ignore_config_path.string());
-    }
-    ignored_paths.push_back(best_effort_canonical(expand_ignore_entry(entry.get<std::string>())));
-  }
-}
-
-// Syscall arguments live in the tracee's address space. PTRACE_PEEKDATA reads a
-// machine word at a time from that process; for path arguments we walk forward
-// until the NUL terminator. This is intentionally bounded so a bad userspace
-// pointer or non-terminated buffer cannot make the tracer loop forever.
-std::string read_tracee_string(pid_t pid, unsigned long long address) {
-  if (address == 0) return {};
-  std::string out;
-  union {
-    long value;
-    char chars[sizeof(long)];
-  } data{};
-  for (size_t offset = 0; offset < 65536; offset += sizeof(long)) {
-    errno = 0;
-    data.value = ptrace(PTRACE_PEEKDATA, pid, address + offset, nullptr);
-    if (errno != 0) break;
-    for (char c : data.chars) {
-      if (c == '\0') return out;
-      out.push_back(c);
-    }
-  }
-  return out;
-}
-
-std::optional<Metadata> stat_metadata(const fs::path& path) {
-  struct stat st {};
-  if (lstat(path.c_str(), &st) != 0) return std::nullopt;
-  Metadata meta;
-  meta.exists = true;
-  meta.mode = st.st_mode;
-  meta.size = static_cast<uintmax_t>(st.st_size);
-  meta.mtime = st.st_mtim.tv_sec;
-  meta.regular = S_ISREG(st.st_mode);
-  meta.directory = S_ISDIR(st.st_mode);
-  return meta;
-}
-
-bool writable_by_current_user(const Metadata& meta) {
-  if (!meta.exists) return true;
-  const mode_t mode = meta.mode;
-  if (tracer_uid == 0) return true;
-  struct stat st {};
-  (void)st;
-  return (mode & S_IWUSR) || (mode & S_IWGRP) || (mode & S_IWOTH);
-}
-
-// This is the "system environment" escape hatch. The goal is not to snapshot
-// /usr wholesale just because the dynamic loader, Python, or libc looked there.
-// access(W_OK) is used instead of checking raw mode bits because groups, ACLs,
-// and effective permission rules matter more than whether an owner write bit is
-// present for some other uid.
-bool owned_by_other_and_not_writable(const fs::path& path) {
-  struct stat st {};
-  if (lstat(path.c_str(), &st) != 0) return false;
-  if (st.st_uid == tracer_uid) return false;
-  if (access(path.c_str(), W_OK) == 0) return false;
-  return true;
-}
-
-// Git can classify paths that do not exist at finalization time if we start from
-// the nearest existing parent. That matters for tombstones: after unlink, the
-// path itself is gone, but the repository root and relative path are still
-// recoverable from its parent directory.
-fs::path existing_anchor(fs::path path) {
-  std::error_code ec;
-  if (fs::exists(path, ec)) return path;
-  while (!path.empty() && path != path.root_path()) {
-    path = path.parent_path();
-    if (fs::exists(path, ec)) return path;
-  }
-  return {};
-}
-
-std::string oid_to_string(const git_oid* oid) {
-  char out[GIT_OID_HEXSZ + 1] = {};
-  git_oid_tostr(out, sizeof(out), oid);
-  return out;
-}
-
-std::string repo_head(git_repository* repo) {
-  git_reference* head = nullptr;
-  if (git_repository_head(&head, repo) != 0) return {};
-  const git_oid* oid = git_reference_target(head);
-  std::string result = oid ? oid_to_string(oid) : "";
-  git_reference_free(head);
-  return result;
-}
-
-GitInfo classify_git(const fs::path& input_path) {
-  GitInfo info;
-  fs::path anchor = existing_anchor(input_path);
-  if (anchor.empty()) anchor = existing_anchor(input_path.parent_path());
-  if (anchor.empty()) return info;
-
-  // git_repository_open_ext walks upward from the anchor to find any containing
-  // repository. This is deliberate: the traced program might access files inside
-  // a nested repo, a dependency checkout, or a temp repo unrelated to this tool.
-  git_repository* repo = nullptr;
-  if (git_repository_open_ext(&repo, anchor.c_str(), 0, nullptr) != 0) return info;
-
-  const char* workdir = git_repository_workdir(repo);
-  if (!workdir) {
-    git_repository_free(repo);
-    return info;
-  }
-
-  info.in_repo = true;
-  info.root = best_effort_canonical(workdir).string();
-  info.head = repo_head(repo);
-
-  std::error_code ec;
-  fs::path rel = fs::relative(input_path, info.root, ec);
-  if (!ec) info.relative_path = rel.string();
-
-  // libgit2's status bits encode both index and worktree state. For compactness
-  // we only skip blobs for GIT_STATUS_CURRENT tracked files. New, ignored, or
-  // dirty tracked files are not reconstructable from HEAD alone, so they are
-  // treated as content that belongs in the snapshot.
-  unsigned int status = 0;
-  if (!info.relative_path.empty() &&
-      git_status_file(&status, repo, info.relative_path.c_str()) == 0) {
-    info.ignored = status & GIT_STATUS_IGNORED;
-    info.tracked = !(status & GIT_STATUS_WT_NEW) && !(status & GIT_STATUS_IGNORED);
-    info.dirty = status != GIT_STATUS_CURRENT;
-  } else if (!info.relative_path.empty()) {
-    info.tracked = false;
-    info.dirty = true;
-  }
-
-  RepoRecord& rec = repos[info.root];
-  rec.root = info.root;
-  rec.head = info.head;
-  rec.dirty = rec.dirty || info.dirty;
-
-  git_repository_free(repo);
-  return info;
-}
-
-// This digest is a content-addressing key for blobs, not a security boundary.
-// FNV-1a is small and deterministic, which is enough for current tests and for
-// avoiding duplicate blob files. A production snapshot format should replace it
-// with SHA-256 or BLAKE3 before relying on it for collision resistance.
-std::string fnv1a_file_digest(const fs::path& path) {
-  std::ifstream in(path, std::ios::binary);
-  uint64_t hash = 1469598103934665603ULL;
-  char c;
-  while (in.get(c)) {
-    hash ^= static_cast<unsigned char>(c);
-    hash *= 1099511628211ULL;
-  }
-  std::ostringstream out;
-  out << std::hex << std::setw(16) << std::setfill('0') << hash;
-  return out.str();
-}
-
-std::string store_blob(const fs::path& path) {
-  std::string digest = fnv1a_file_digest(path);
-  fs::path out = blob_dir / digest;
-  if (!fs::exists(out)) {
-    fs::copy_file(path, out, fs::copy_options::overwrite_existing);
-  }
-  return digest;
-}
-
-bool should_capture_content(const fs::path& path, const Metadata& meta, const GitInfo& git) {
-  // Directories, devices, sockets, etc. are represented as metadata and
-  // observations. Only regular files get blobbed in v1.
-  if (!meta.exists || !meta.regular) return false;
-  if (owned_by_other_and_not_writable(path)) return false;
-  if (git.in_repo && git.tracked && !git.dirty) return false;
-  return writable_by_current_user(meta);
-}
-
-json metadata_json(const Metadata& meta) {
-  json j;
-  j["exists"] = meta.exists;
-  if (meta.tombstone) j["tombstone"] = true;
-  if (meta.exists) {
-    j["type"] = meta.directory ? "directory" : (meta.regular ? "file" : "other");
-    j["mode"] = meta.mode;
-    j["size"] = meta.size;
-    j["mtime"] = meta.mtime;
-  }
-  if (!meta.blob.empty()) j["blob"] = meta.blob;
-  return j;
-}
-
-json git_json(const GitInfo& git) {
-  json j;
-  j["in_repo"] = git.in_repo;
-  if (git.in_repo) {
-    j["root"] = git.root;
-    j["head"] = git.head;
-    j["relative_path"] = git.relative_path;
-    j["tracked"] = git.tracked;
-    j["dirty"] = git.dirty;
-    j["ignored"] = git.ignored;
-  }
-  return j;
-}
-
-void record_observation(const fs::path& raw_path, const std::string& operation) {
-  if (raw_path.empty()) return;
-  fs::path path = best_effort_canonical(raw_path);
-  if (is_ignored_path(path)) return;
-  std::string key = path.string();
-  FileRecord& rec = files[key];
-  rec.path = key;
-  rec.operations.insert(operation);
-  if (!rec.before_recorded) {
-    // Capture "before" on first observation, not at process exit. This is the
-    // only chance to distinguish "the program created this path" from "the path
-    // existed before and was later opened for write".
-    rec.before_recorded = true;
-    rec.before = stat_metadata(path).value_or(Metadata{});
-    rec.before_git = classify_git(path);
-    if (should_capture_content(path, rec.before, rec.before_git)) {
-      rec.before.blob = store_blob(path);
-    }
-  }
-}
-
-void finalize_records() {
-  for (auto& [_, rec] : files) {
-    // The after pass is intentionally outside ptrace. Once the traced process
-    // tree has exited, the filesystem has quiesced from our point of view, so
-    // final content can be copied without racing the writer we launched.
-    fs::path path(rec.path);
-    if (is_ignored_path(path)) continue;
-    rec.after = stat_metadata(path).value_or(Metadata{});
-    if (!rec.after.exists) rec.after.tombstone = rec.operations.count("delete") > 0;
-    rec.after_git = classify_git(path);
-    const bool written_regular_file =
-        rec.operations.count("write") > 0 && rec.after.exists && rec.after.regular;
-    if ((written_regular_file && !owned_by_other_and_not_writable(path)) ||
-        should_capture_content(path, rec.after, rec.after_git)) {
-      rec.after.blob = store_blob(path);
-    }
-  }
-}
-
-fs::path resolve_path(const ProcState& proc, int dirfd, const std::string& path) {
-  // The *at syscalls interpret relative paths against either cwd or a directory
-  // fd. That is why the tracer maintains fd->path mappings; without them,
-  // openat(dirfd, "file") would be impossible to place in the manifest.
-  fs::path p(path);
-  if (p.is_absolute()) return p;
-  fs::path base = proc.cwd;
-  if (dirfd != AT_FDCWD) {
-    auto it = proc.fds.find(dirfd);
-    if (it != proc.fds.end()) base = it->second;
-  }
-  return lexical_abs(p, base);
-}
-
-bool is_write_open(int flags) {
-  int access = flags & O_ACCMODE;
-  return access == O_WRONLY || access == O_RDWR || (flags & (O_CREAT | O_TRUNC | O_APPEND));
-}
-
-bool is_read_open(int flags) {
-  int access = flags & O_ACCMODE;
-  return access == O_RDONLY || access == O_RDWR;
-}
-
-void refresh_proc_fd(pid_t pid, ProcState& proc, int fd) {
-  // /proc/<pid>/fd/N is the kernel's own view of where an fd points. Reading
-  // this symlink after a successful open is more reliable than trying to model
-  // every mount namespace or symlink resolution rule ourselves.
-  std::string target = readlink_string("/proc/" + std::to_string(pid) + "/fd/" + std::to_string(fd));
-  if (!target.empty() && target[0] == '/') proc.fds[fd] = best_effort_canonical(target);
-}
-
-void refresh_proc_cwd(pid_t pid, ProcState& proc) {
-  // chdir/fchdir affect future relative path arguments. /proc gives us the
-  // post-syscall cwd after the kernel has accepted the directory change.
-  std::string target = readlink_string("/proc/" + std::to_string(pid) + "/cwd");
-  if (!target.empty()) proc.cwd = best_effort_canonical(target);
-}
-
-void handle_syscall_entry(pid_t pid, ProcState& proc, const user_regs_struct& regs) {
-  PendingSyscall p;
-  // Linux x86_64 syscall ABI: syscall number in orig_rax, arguments in
-  // rdi/rsi/rdx/r10/r8/r9, result in rax on exit. This file is explicitly v1
-  // x86_64-only; another architecture needs a different register decoder.
-  p.nr = static_cast<long>(regs.orig_rax);
-  p.args = {regs.rdi, regs.rsi, regs.rdx, regs.r10, regs.r8, regs.r9};
-
-  switch (p.nr) {
-    case SYS_open:
-      p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string();
-      p.flags = static_cast<int>(p.args[1]);
-      // For writes, record before-state at syscall entry. If O_CREAT succeeds,
-      // waiting until exit would make a newly-created file look preexisting.
-      if (is_write_open(p.flags)) record_observation(p.path_a, "write");
-      break;
-    case SYS_openat:
-#ifdef SYS_openat2
-    case SYS_openat2:
-#endif
-      p.dirfd = static_cast<int>(p.args[0]);
-      p.path_a = resolve_path(proc, p.dirfd, read_tracee_string(pid, p.args[1])).string();
-      p.flags = static_cast<int>(p.args[2]);
-      if (is_write_open(p.flags)) record_observation(p.path_a, "write");
-      break;
-    case SYS_creat:
-      p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string();
-      p.flags = O_CREAT | O_WRONLY | O_TRUNC;
-      record_observation(p.path_a, "write");
-      break;
-    case SYS_stat:
-    case SYS_lstat:
-    case SYS_access:
-    case SYS_readlink:
-      p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string();
-      break;
-    case SYS_newfstatat:
-    case SYS_faccessat:
-#ifdef SYS_faccessat2
-    case SYS_faccessat2:
-#endif
-    case SYS_readlinkat:
-      p.dirfd = static_cast<int>(p.args[0]);
-      p.path_a = resolve_path(proc, p.dirfd, read_tracee_string(pid, p.args[1])).string();
-      break;
-    case SYS_unlink:
-    case SYS_rmdir:
-      p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string();
-      // Deletions have the same timing issue as creations: after syscall exit
-      // the content may be gone, so the before snapshot must happen here.
-      record_observation(p.path_a, "delete");
-      break;
-    case SYS_unlinkat:
-    case SYS_mkdirat:
-      p.dirfd = static_cast<int>(p.args[0]);
-      p.path_a = resolve_path(proc, p.dirfd, read_tracee_string(pid, p.args[1])).string();
-      if (p.nr == SYS_unlinkat) record_observation(p.path_a, "delete");
-      break;
-    case SYS_mkdir:
-    case SYS_chdir:
-    case SYS_truncate:
-      p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string();
-      break;
-    case SYS_rename:
-      p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string();
-      p.path_b = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[1])).string();
-      // A rename is modeled as source deletion plus destination write. That is
-      // enough for reconstruction even though it loses the atomic-move history.
-      record_observation(p.path_a, "delete");
-      record_observation(p.path_b, "write");
-      break;
-    case SYS_renameat:
-#ifdef SYS_renameat2
-    case SYS_renameat2:
-#endif
-      p.path_a = resolve_path(proc, static_cast<int>(p.args[0]), read_tracee_string(pid, p.args[1])).string();
-      p.path_b = resolve_path(proc, static_cast<int>(p.args[2]), read_tracee_string(pid, p.args[3])).string();
-      record_observation(p.path_a, "delete");
-      record_observation(p.path_b, "write");
-      break;
-    case SYS_getdents:
-    case SYS_getdents64:
-    case SYS_fchdir:
-    case SYS_ftruncate:
-      p.fd = static_cast<int>(p.args[0]);
-      break;
-    case SYS_close:
-    case SYS_dup:
-    case SYS_dup2:
-    case SYS_dup3:
-    case SYS_fcntl:
-      p.fd = static_cast<int>(p.args[0]);
-      break;
-    default:
-      break;
-  }
-
-  proc.pending = p;
-}
-
-void handle_syscall_exit(pid_t pid, ProcState& proc, long result) {
-  const PendingSyscall& p = proc.pending;
-  // On Linux, syscall failures are returned as negative errno values in rax.
-  // We still record failed path probes as "existence" observations because the
-  // program learned something about that pathname.
-  bool ok = result >= 0;
-
-  switch (p.nr) {
-    case SYS_open:
-    case SYS_openat:
-#ifdef SYS_openat2
-    case SYS_openat2:
-#endif
-    case SYS_creat:
-      if (ok) {
-        if (is_read_open(p.flags)) record_observation(p.path_a, "read");
-        if (is_write_open(p.flags)) record_observation(p.path_a, "write");
-        if (p.flags & O_DIRECTORY) record_observation(p.path_a, "directory");
-        refresh_proc_fd(pid, proc, static_cast<int>(result));
-      } else {
-        record_observation(p.path_a, "existence");
-      }
-      break;
-    case SYS_stat:
-    case SYS_lstat:
-    case SYS_newfstatat:
-    case SYS_access:
-    case SYS_faccessat:
-#ifdef SYS_faccessat2
-    case SYS_faccessat2:
-#endif
-    case SYS_readlink:
-    case SYS_readlinkat:
-      record_observation(p.path_a, "existence");
-      break;
-    case SYS_getdents:
-    case SYS_getdents64:
-      // getdents returns directory entries for an already-open fd. The entries
-      // themselves are not decoded yet; v1 records the fact that the directory
-      // was traversed, which is the important observation boundary for now.
-      if (ok && p.fd >= 0 && proc.fds.count(p.fd)) record_observation(proc.fds[p.fd], "directory");
-      break;
-    case SYS_unlink:
-    case SYS_unlinkat:
-    case SYS_rmdir:
-      record_observation(p.path_a, "delete");
-      break;
-    case SYS_rename:
-    case SYS_renameat:
-#ifdef SYS_renameat2
-    case SYS_renameat2:
-#endif
-      record_observation(p.path_a, "delete");
-      record_observation(p.path_b, "write");
-      break;
-    case SYS_mkdir:
-    case SYS_mkdirat:
-    case SYS_truncate:
-      record_observation(p.path_a, "write");
-      break;
-    case SYS_ftruncate:
-      if (p.fd >= 0 && proc.fds.count(p.fd)) record_observation(proc.fds[p.fd], "write");
-      break;
-    case SYS_chdir:
-      if (ok) refresh_proc_cwd(pid, proc);
-      break;
-    case SYS_fchdir:
-      if (ok) refresh_proc_cwd(pid, proc);
-      break;
-    case SYS_close:
-      if (ok) proc.fds.erase(p.fd);
-      break;
-    case SYS_dup:
-      // Duplication makes multiple numeric fds refer to the same open file
-      // description. Mirroring that relationship preserves later ftruncate or
-      // fd-relative directory operations through the duplicate.
-      if (ok && proc.fds.count(p.fd)) proc.fds[static_cast<int>(result)] = proc.fds[p.fd];
-      break;
-    case SYS_dup2:
-    case SYS_dup3:
-      if (ok && proc.fds.count(p.fd)) proc.fds[static_cast<int>(p.args[1])] = proc.fds[p.fd];
-      break;
-    case SYS_fcntl:
-      if (ok && (static_cast<int>(p.args[1]) == F_DUPFD ||
-                 static_cast<int>(p.args[1]) == F_DUPFD_CLOEXEC) &&
-          proc.fds.count(p.fd)) {
-        proc.fds[static_cast<int>(result)] = proc.fds[p.fd];
-      }
-      break;
-    default:
-      break;
-  }
-}
-
-void trace_command(const std::vector<std::string>& command) {
-  pid_t child = fork();
-  if (child < 0) throw std::runtime_error(errno_message("fork failed"));
-  if (child == 0) {
-    // PTRACE_TRACEME makes the parent our tracer after exec. The explicit
-    // SIGSTOP is a synchronization point: the parent sets ptrace options before
-    // the child reaches execvp and starts making filesystem-related syscalls.
-    if (ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) != 0) _exit(127);
-    raise(SIGSTOP);
-    std::vector<char*> argv;
-    for (const auto& arg : command) argv.push_back(const_cast<char*>(arg.c_str()));
-    argv.push_back(nullptr);
-    execvp(argv[0], argv.data());
-    _exit(127);
-  }
-
-  int status = 0;
-  if (waitpid(child, &status, 0) < 0) throw std::runtime_error(errno_message("waitpid failed"));
-  if (ptrace(PTRACE_SETOPTIONS, child, nullptr, kPtraceOptions) != 0) {
-    throw std::runtime_error(errno_message("ptrace SETOPTIONS failed"));
-  }
-  processes[child].cwd = fs::current_path();
-  refresh_proc_cwd(child, processes[child]);
-  // PTRACE_SYSCALL resumes the child and asks the kernel to stop it twice per
-  // syscall: once before execution and once after, which is the basis for the
-  // PendingSyscall entry/exit split above.
-  ptrace(PTRACE_SYSCALL, child, nullptr, nullptr);
-
-  while (!processes.empty()) {
-    // __WALL is needed with ptrace so waitpid observes all traced tasks,
-    // including clone-created threads that would otherwise not behave like
-    // normal children from the wait API's point of view.
-    pid_t pid = waitpid(-1, &status, __WALL);
-    if (pid < 0) {
-      if (errno == EINTR) continue;
-      if (errno == ECHILD) break;
-      throw std::runtime_error(errno_message("waitpid trace loop failed"));
-    }
-
-    auto it = processes.find(pid);
-    if (it == processes.end()) {
-      ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr);
-      continue;
-    }
-
-    if (WIFEXITED(status) || WIFSIGNALED(status)) {
-      processes.erase(it);
-      continue;
-    }
-
-    if (!WIFSTOPPED(status)) continue;
-    int sig = WSTOPSIG(status);
-    unsigned event = static_cast<unsigned>(status >> 16);
-
-    if (event == PTRACE_EVENT_FORK || event == PTRACE_EVENT_VFORK ||
-        event == PTRACE_EVENT_CLONE) {
-      unsigned long new_pid = 0;
-      ptrace(PTRACE_GETEVENTMSG, pid, nullptr, &new_pid);
-      // A newly forked process inherits cwd and fd table at fork time. Copying
-      // the parent's ProcState matches that kernel behavior closely enough for
-      // path reconstruction until either process mutates its own state.
-      processes[static_cast<pid_t>(new_pid)] = it->second;
-      ptrace(PTRACE_SETOPTIONS, static_cast<pid_t>(new_pid), nullptr, kPtraceOptions);
-      ptrace(PTRACE_SYSCALL, static_cast<pid_t>(new_pid), nullptr, nullptr);
-      ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr);
-      continue;
-    }
-
-    if (sig == (SIGTRAP | 0x80)) {
-      user_regs_struct regs {};
-      if (ptrace(PTRACE_GETREGS, pid, nullptr, &regs) == 0) {
-        ProcState& proc = it->second;
-        if (!proc.in_syscall) {
-          handle_syscall_entry(pid, proc, regs);
-          proc.in_syscall = true;
-        } else {
-          handle_syscall_exit(pid, proc, static_cast<long>(regs.rax));
-          proc.in_syscall = false;
-        }
-      }
-      ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr);
-    } else {
-      // Non-syscall stops are real signals or ptrace events. Plain SIGTRAP is
-      // consumed by the tracer; other signals are reinjected so tracing changes
-      // process behavior as little as practical.
-      int deliver = (sig == SIGTRAP) ? 0 : sig;
-      ptrace(PTRACE_SYSCALL, pid, nullptr, reinterpret_cast<void*>(static_cast<long>(deliver)));
-    }
-  }
-}
-
-void write_manifest(const fs::path& out, const std::vector<std::string>& command, int exit_status) {
-  json manifest;
-  manifest["format_version"] = 1;
-  manifest["command"] = command;
-  manifest["exit_status"] = exit_status;
-  manifest["start_cwd"] = fs::current_path().string();
-  manifest["uid"] = tracer_uid;
-  manifest["gid"] = tracer_gid;
-
-  manifest["git_repositories"] = json::array();
-  for (const auto& [_, repo] : repos) {
-    manifest["git_repositories"].push_back({
-        {"root", repo.root},
-        {"head", repo.head},
-        {"dirty", repo.dirty},
-    });
-  }
-
-  manifest["files"] = json::array();
-  for (const auto& [_, rec] : files) {
-    json ops = json::array();
-    for (const auto& op : rec.operations) ops.push_back(op);
-    manifest["files"].push_back({
-        {"path", rec.path},
-        {"operations", ops},
-        {"before", metadata_json(rec.before)},
-        {"after", metadata_json(rec.after)},
-        // Prefer after_git when available so files created during the run are
-        // classified in their final repository context. Fall back to before_git
-        // for deleted paths whose final filesystem anchor may no longer exist.
-        {"git", git_json(rec.after_git.in_repo ? rec.after_git : rec.before_git)},
-    });
-  }
-
-  std::ofstream stream(out / "manifest.json");
-  stream << manifest.dump(2) << "\n";
-}
-
-void restore_snapshot(const fs::path& dir) {
-  std::ifstream in(dir / "manifest.json");
-  if (!in) throw std::runtime_error("cannot open manifest");
-  json manifest = json::parse(in);
-
-  for (const auto& item : manifest.at("files")) {
-    fs::path path = item.at("path").get<std::string>();
-    const json& after = item.at("after");
-    if (!after.value("exists", false)) {
-      if (after.value("tombstone", false)) {
-        // A tombstone represents final non-existence. Missing is already the
-        // desired final state, so remove errors are intentionally non-fatal.
-        std::error_code ec;
-        fs::remove(path, ec);
-      }
-      continue;
-    }
-    if (!after.contains("blob")) continue;
-
-    // Restore only blobbed files. Clean Git-tracked files and reconstructable
-    // system files are manifest references, not payloads owned by this bundle.
-    fs::create_directories(path.parent_path());
-    const std::string expected_blob = after.at("blob").get<std::string>();
-    // Avoid rewriting identical files. This matters for observed executables
-    // that may be mapped or busy while restore runs, and it also makes restore
-    // idempotent for normal captured files.
-    if (!(fs::exists(path) && fs::is_regular_file(path) &&
-          fnv1a_file_digest(path) == expected_blob)) {
-      fs::path tmp = path;
-      tmp += ".agent-snapshot.tmp";
-      fs::copy_file(dir / "blobs" / expected_blob, tmp,
-                    fs::copy_options::overwrite_existing);
-      fs::rename(tmp, path);
-    }
-    if (after.contains("mode")) {
-      fs::permissions(path, static_cast<fs::perms>(after.at("mode").get<unsigned>() & 07777),
-                      fs::perm_options::replace);
-    }
-    if (after.contains("mtime")) {
-      // std::filesystem has no portable wall-clock setter for Unix time in C++20.
-      struct timespec ts[2] {};
-      ts[0].tv_nsec = UTIME_OMIT;
-      ts[1].tv_sec = after.at("mtime").get<std::time_t>();
-      ts[1].tv_nsec = 0;
-      utimensat(AT_FDCWD, path.c_str(), ts, AT_SYMLINK_NOFOLLOW);
-    }
-  }
-}
-
-int run_snapshot(const std::vector<std::string>& args) {
-  load_ignore_config();
-
-  fs::path output;
-  size_t split = args.size();
-  for (size_t i = 0; i < args.size(); ++i) {
-    if (args[i] == "--output" && i + 1 < args.size()) {
-      output = args[i + 1];
-      ++i;
-    } else if (args[i] == "--") {
-      split = i + 1;
-      break;
-    } else {
-      throw std::runtime_error("usage: agent-snapshot --output SNAPDIR -- command args...");
-    }
-  }
-  if (output.empty() || split >= args.size()) {
-    throw std::runtime_error("usage: agent-snapshot --output SNAPDIR -- command args...");
-  }
-
-  snapshot_dir = output;
-  blob_dir = snapshot_dir / "blobs";
-  // v1 treats --output as an owned bundle directory. Removing it up front avoids
-  // stale blobs or manifest entries from a previous run being mistaken for the
-  // current trace.
-  fs::remove_all(snapshot_dir);
-  fs::create_directories(blob_dir);
-
-  std::vector<std::string> command(args.begin() + static_cast<long>(split), args.end());
-  trace_command(command);
-  finalize_records();
-  write_manifest(snapshot_dir, command, 0);
-  return 0;
-}
-
-}  // namespace
-
-int main(int argc, char** argv) {
-  tracer_uid = getuid();
-  tracer_gid = getgid();
-  git_libgit2_init();
-  try {
-    std::vector<std::string> args(argv + 1, argv + argc);
-    if (!args.empty() && args[0] == "restore") {
-      if (args.size() != 2) throw std::runtime_error("usage: agent-snapshot restore SNAPDIR");
-      restore_snapshot(args[1]);
-      git_libgit2_shutdown();
-      return 0;
-    }
-    int rc = run_snapshot(args);
-    git_libgit2_shutdown();
-    return rc;
-  } catch (const std::exception& e) {
-    git_libgit2_shutdown();
-    std::cerr << "agent-snapshot: " << e.what() << "\n";
-    return 1;
-  }
-}
src/ocaml/agent_snapshot.ml
new file mode 100644
index 0000000..7f50437
--- /dev/null
+++ b/src/ocaml/agent_snapshot.ml
@@ -0,0 +1,717 @@
+module Json = Yojson.Safe
+
+let at_fdcwd = -100
+let o_accmode = 0o3
+let o_rdonly = 0
+let o_wronly = 1
+let o_rdwr = 2
+let o_creat = 0o100
+let o_trunc = 0o1000
+let o_append = 0o2000
+let o_directory = 0o200000
+let f_dupfd = 0
+let f_dupfd_cloexec = 1030
+
+module Syscall = struct
+  let access = 21
+  let close = 3
+  let creat = 85
+  let dup = 32
+  let dup2 = 33
+  let dup3 = 292
+  let faccessat = 269
+  let faccessat2 = 439
+  let fchdir = 81
+  let fcntl = 72
+  let ftruncate = 77
+  let getdents = 78
+  let getdents64 = 217
+  let lstat = 6
+  let mkdir = 83
+  let mkdirat = 258
+  let newfstatat = 262
+  let open_ = 2
+  let openat = 257
+  let openat2 = 437
+  let readlink = 89
+  let readlinkat = 267
+  let rename = 82
+  let renameat = 264
+  let renameat2 = 316
+  let rmdir = 84
+  let stat = 4
+  let truncate = 76
+  let unlink = 87
+  let unlinkat = 263
+  let chdir = 80
+end
+
+type metadata = {
+  mutable exists : bool;
+  mutable tombstone : bool;
+  mutable regular : bool;
+  mutable directory : bool;
+  mutable mode : int;
+  mutable size : int64;
+  mutable mtime : int;
+  mutable blob : string option;
+}
+
+type git_info = {
+  mutable in_repo : bool;
+  mutable tracked : bool;
+  mutable dirty : bool;
+  mutable ignored : bool;
+  mutable root : string;
+  mutable head : string;
+  mutable relative_path : string;
+}
+
+type file_record = {
+  path : string;
+  operations : (string, unit) Hashtbl.t;
+  mutable before : metadata;
+  mutable after : metadata;
+  mutable before_git : git_info;
+  mutable after_git : git_info;
+  mutable before_recorded : bool;
+}
+
+type pending_syscall = {
+  nr : int;
+  args : int64 array;
+  mutable path_a : string;
+  mutable path_b : string;
+  mutable dirfd : int;
+  mutable fd : int;
+  mutable flags : int;
+}
+
+type proc_state = {
+  mutable cwd : string;
+  fds : (int, string) Hashtbl.t;
+  mutable pending : pending_syscall option;
+}
+
+type repo_record = {
+  root : string;
+  mutable head : string;
+  mutable dirty : bool;
+}
+
+let empty_metadata () =
+  { exists = false; tombstone = false; regular = false; directory = false; mode = 0; size = 0L; mtime = 0; blob = None }
+
+let empty_git () =
+  { in_repo = false; tracked = false; dirty = false; ignored = false; root = ""; head = ""; relative_path = "" }
+
+let files : (string, file_record) Hashtbl.t = Hashtbl.create 128
+let repos : (string, repo_record) Hashtbl.t = Hashtbl.create 8
+let processes : (int, proc_state) Hashtbl.t = Hashtbl.create 8
+let ignored_paths = ref []
+let ignore_config_path = ref ""
+let snapshot_dir = ref ""
+let blob_dir = ref ""
+let tracer_uid = Unix.getuid ()
+let tracer_gid = Unix.getgid ()
+
+let path_sep = '/'
+
+let split_path path =
+  path |> String.split_on_char path_sep |> List.filter (fun part -> part <> "" && part <> ".")
+
+let normalize_path path =
+  let absolute = String.length path > 0 && path.[0] = path_sep in
+  let parts =
+    List.fold_left
+      (fun acc part ->
+        if part = ".." then match acc with [] -> acc | _ :: rest -> rest else part :: acc)
+      [] (split_path path)
+    |> List.rev
+  in
+  let body = String.concat "/" parts in
+  if absolute then if body = "" then "/" else "/" ^ body else if body = "" then "." else body
+
+let concat_path base path =
+  if path = "" then base
+  else if String.length path > 0 && path.[0] = '/' then normalize_path path
+  else normalize_path (base ^ "/" ^ path)
+
+let dirname path =
+  let path = normalize_path path in
+  match String.rindex_opt path '/' with
+  | None -> "."
+  | Some 0 -> "/"
+  | Some i -> String.sub path 0 i
+
+let basename path =
+  match String.rindex_opt path '/' with
+  | None -> path
+  | Some i -> String.sub path (i + 1) (String.length path - i - 1)
+
+let rec mkdir_p path =
+  if path = "" || path = "/" || Sys.file_exists path then ()
+  else (
+    mkdir_p (dirname path);
+    try Unix.mkdir path 0o777 with Unix.Unix_error (Unix.EEXIST, _, _) -> ())
+
+let is_absolute path = String.length path > 0 && path.[0] = '/'
+
+let realpath_opt path = try Some (Unix.realpath path) with Unix.Unix_error _ -> None
+
+let best_effort_canonical path =
+  match realpath_opt path with
+  | Some path -> normalize_path path
+  | None when is_absolute path -> normalize_path path
+  | None -> concat_path (Sys.getcwd ()) path
+
+let path_is_at_or_under path root =
+  path = root ||
+  let root = if String.ends_with ~suffix:"/" root then root else root ^ "/" in
+  String.starts_with ~prefix:root path
+
+let is_git_internal_path path = List.exists (( = ) ".git") (split_path path)
+
+let is_ignored_path raw_path =
+  if raw_path = "" then false
+  else
+    let path = best_effort_canonical raw_path in
+    is_git_internal_path path || List.exists (fun ignored -> path_is_at_or_under path ignored) !ignored_paths
+
+let home_dir () =
+  match Sys.getenv_opt "HOME" with
+  | Some home when home <> "" -> home
+  | _ -> failwith "HOME is unavailable"
+
+let xdg_config_home_dir () =
+  match Sys.getenv_opt "XDG_CONFIG_HOME" with
+  | Some path when path <> "" -> path
+  | _ -> concat_path (home_dir ()) ".config"
+
+let xdg_ignore_config_path () = concat_path (xdg_config_home_dir ()) "agent-snapshot/ignore.json"
+
+let expand_ignore_entry entry =
+  let home = "$HOME" in
+  let xdg = "$XDG_CONFIG_HOME" in
+  if entry = home then home_dir ()
+  else if String.starts_with ~prefix:(home ^ "/") entry then concat_path (home_dir ()) (String.sub entry 6 (String.length entry - 6))
+  else if entry = xdg then xdg_config_home_dir ()
+  else if String.starts_with ~prefix:(xdg ^ "/") entry then concat_path (xdg_config_home_dir ()) (String.sub entry 17 (String.length entry - 17))
+  else entry
+
+let load_ignore_config () =
+  ignore_config_path := best_effort_canonical (xdg_ignore_config_path ());
+  let json =
+    try Json.from_file !ignore_config_path
+    with Sys_error _ -> failwith ("ignore config does not exist: " ^ !ignore_config_path)
+  in
+  match json with
+  | `List entries ->
+      ignored_paths :=
+        !ignore_config_path
+        :: List.map
+             (function
+               | `String entry -> best_effort_canonical (expand_ignore_entry entry)
+               | _ -> failwith ("ignore config entries must be strings: " ^ !ignore_config_path))
+             entries
+  | _ -> failwith ("ignore config must be a JSON array: " ^ !ignore_config_path)
+
+let mode_of_kind = function
+  | Unix.S_REG -> 0o100000
+  | Unix.S_DIR -> 0o040000
+  | Unix.S_LNK -> 0o120000
+  | Unix.S_CHR -> 0o020000
+  | Unix.S_BLK -> 0o060000
+  | Unix.S_FIFO -> 0o010000
+  | Unix.S_SOCK -> 0o140000
+
+let stat_metadata path =
+  try
+    let st = Unix.LargeFile.lstat path in
+    Some
+      {
+        exists = true;
+        tombstone = false;
+        regular = st.st_kind = Unix.S_REG;
+        directory = st.st_kind = Unix.S_DIR;
+        mode = mode_of_kind st.st_kind lor st.st_perm;
+        size = st.st_size;
+        mtime = int_of_float st.st_mtime;
+        blob = None;
+      }
+  with Unix.Unix_error _ -> None
+
+let owned_by_other_and_not_writable path =
+  try
+    let st = Unix.LargeFile.lstat path in
+    st.st_uid <> tracer_uid
+    &&
+    try
+      Unix.access path [ Unix.W_OK ];
+      false
+    with Unix.Unix_error _ -> true
+  with Unix.Unix_error _ -> false
+
+let writable_by_current_user meta =
+  if not meta.exists then true
+  else if tracer_uid = 0 then true
+  else meta.mode land 0o222 <> 0
+
+let existing_anchor path =
+  let rec loop path =
+    if path = "" || path = "." then None
+    else if Sys.file_exists path then Some path
+    else
+      let parent = dirname path in
+      if parent = path then None else loop parent
+  in
+  loop path
+
+let relative_path root path =
+  let root = best_effort_canonical root in
+  let path = normalize_path path in
+  let prefix = if String.ends_with ~suffix:"/" root then root else root ^ "/" in
+  if path = root then "" else if String.starts_with ~prefix path then String.sub path (String.length prefix) (String.length path - String.length prefix) else path
+
+let classify_git input_path =
+  let info = empty_git () in
+  let anchor = match existing_anchor input_path with Some p -> Some p | None -> existing_anchor (dirname input_path) in
+  match anchor with
+  | None -> info
+  | Some anchor -> (
+      try
+        let discovered = Ocaml_git.discover anchor in
+        Ocaml_git.with_repo discovered (fun repo ->
+            match Ocaml_git.workdir repo with
+            | None -> info
+            | Some workdir ->
+                let root = best_effort_canonical workdir in
+                let rel = relative_path root input_path in
+                info.in_repo <- true;
+                info.root <- root;
+                info.relative_path <- rel;
+                info.head <- (try (Ocaml_git.head_commit repo).id with Ocaml_git.Git_error _ -> "");
+                let ignored = ref false in
+                let dirty = ref false in
+                let status_hit = ref false in
+                List.iter
+                  (fun (entry : Ocaml_git.status_entry) ->
+                    if entry.path = rel then (
+                      status_hit := true;
+                      ignored := List.exists (( = ) Ocaml_git.Ignored) entry.flags;
+                      dirty := entry.flags <> [] && entry.flags <> [ Ocaml_git.Current ]))
+                  (Ocaml_git.status repo);
+                let tracked =
+                  if !ignored then false
+                  else if !status_hit then not (List.exists (( = ) Ocaml_git.Worktree_new) (List.find (fun (entry : Ocaml_git.status_entry) -> entry.path = rel) (Ocaml_git.status repo)).flags)
+                  else
+                    let index = Ocaml_git.index repo in
+                    Fun.protect ~finally:(fun () -> Ocaml_git.close_index index) (fun () -> Ocaml_git.index_contains index rel)
+                in
+                info.ignored <- !ignored;
+                info.tracked <- tracked;
+                info.dirty <- if !status_hit then !dirty else false;
+                let rec_record =
+                  match Hashtbl.find_opt repos root with
+                  | Some rec_record -> rec_record
+                  | None ->
+                      let rec_record = { root; head = info.head; dirty = false } in
+                      Hashtbl.add repos root rec_record;
+                      rec_record
+                in
+                rec_record.head <- info.head;
+                rec_record.dirty <- rec_record.dirty || info.dirty;
+                info)
+      with Ocaml_git.Git_error _ -> info)
+
+let fnv1a_file_digest path =
+  let ic = open_in_bin path in
+  Fun.protect
+    ~finally:(fun () -> close_in_noerr ic)
+    (fun () ->
+      let hash = ref 0xcbf29ce484222325L in
+      (try
+         while true do
+           let c = input_byte ic in
+           hash := Int64.logxor !hash (Int64.of_int c);
+           hash := Int64.mul !hash 0x100000001b3L
+         done
+       with End_of_file -> ());
+      Printf.sprintf "%016Lx" !hash)
+
+let copy_file src dst =
+  let ic = open_in_bin src in
+  Fun.protect
+    ~finally:(fun () -> close_in_noerr ic)
+    (fun () ->
+      let oc = open_out_bin dst in
+      Fun.protect
+        ~finally:(fun () -> close_out_noerr oc)
+        (fun () ->
+          let bytes = Bytes.create 65536 in
+          let rec loop () =
+            let n = input ic bytes 0 (Bytes.length bytes) in
+            if n > 0 then (
+              output oc bytes 0 n;
+              loop ())
+          in
+          loop ()))
+
+let store_blob path =
+  let digest = fnv1a_file_digest path in
+  let out = concat_path !blob_dir digest in
+  if not (Sys.file_exists out) then copy_file path out;
+  digest
+
+let should_capture_content path meta git =
+  if (not meta.exists) || not meta.regular then false
+  else if owned_by_other_and_not_writable path then false
+  else if git.in_repo && git.tracked && not git.dirty then false
+  else writable_by_current_user meta
+
+let record_observation raw_path operation =
+  if raw_path <> "" then
+    let path = best_effort_canonical raw_path in
+    if not (is_ignored_path path) then (
+      let recd =
+        match Hashtbl.find_opt files path with
+        | Some recd -> recd
+        | None ->
+            let recd =
+              { path; operations = Hashtbl.create 5; before = empty_metadata (); after = empty_metadata (); before_git = empty_git (); after_git = empty_git (); before_recorded = false }
+            in
+            Hashtbl.add files path recd;
+            recd
+      in
+      Hashtbl.replace recd.operations operation ();
+      if not recd.before_recorded then (
+        recd.before_recorded <- true;
+        recd.before <- Option.value (stat_metadata path) ~default:(empty_metadata ());
+        recd.before_git <- classify_git path;
+        if should_capture_content path recd.before recd.before_git then recd.before.blob <- Some (store_blob path)))
+
+let finalize_records () =
+  Hashtbl.iter
+    (fun _ recd ->
+      if not (is_ignored_path recd.path) then (
+        recd.after <- Option.value (stat_metadata recd.path) ~default:(empty_metadata ());
+        if not recd.after.exists then recd.after.tombstone <- Hashtbl.mem recd.operations "delete";
+        recd.after_git <- classify_git recd.path;
+        let written_regular = Hashtbl.mem recd.operations "write" && recd.after.exists && recd.after.regular in
+        if (written_regular && not (owned_by_other_and_not_writable recd.path)) || should_capture_content recd.path recd.after recd.after_git then
+          recd.after.blob <- Some (store_blob recd.path)))
+    files
+
+let metadata_json meta =
+  let base = [ ("exists", `Bool meta.exists) ] in
+  let base = if meta.tombstone then ("tombstone", `Bool true) :: base else base in
+  let base =
+    if meta.exists then
+      ("type", `String (if meta.directory then "directory" else if meta.regular then "file" else "other"))
+      :: ("mode", `Int meta.mode)
+      :: ("size", `Intlit (Int64.to_string meta.size))
+      :: ("mtime", `Int meta.mtime)
+      :: base
+    else base
+  in
+  let base = match meta.blob with Some blob -> ("blob", `String blob) :: base | None -> base in
+  `Assoc (List.rev base)
+
+let git_json git =
+  if not git.in_repo then `Assoc [ ("in_repo", `Bool false) ]
+  else
+    `Assoc
+      [
+        ("in_repo", `Bool true);
+        ("root", `String git.root);
+        ("head", `String git.head);
+        ("relative_path", `String git.relative_path);
+        ("tracked", `Bool git.tracked);
+        ("dirty", `Bool git.dirty);
+        ("ignored", `Bool git.ignored);
+      ]
+
+let write_manifest out command exit_status =
+  let repo_items =
+    Hashtbl.fold (fun _ repo acc -> `Assoc [ ("root", `String repo.root); ("head", `String repo.head); ("dirty", `Bool repo.dirty) ] :: acc) repos []
+    |> List.sort Stdlib.compare
+  in
+  let file_items =
+    Hashtbl.fold
+      (fun _ recd acc ->
+        let ops = Hashtbl.fold (fun op () acc -> op :: acc) recd.operations [] |> List.sort String.compare in
+        let git = if recd.after_git.in_repo then recd.after_git else recd.before_git in
+        `Assoc
+          [
+            ("path", `String recd.path);
+            ("operations", `List (List.map (fun op -> `String op) ops));
+            ("before", metadata_json recd.before);
+            ("after", metadata_json recd.after);
+            ("git", git_json git);
+          ]
+        :: acc)
+      files []
+    |> List.sort Stdlib.compare
+  in
+  let manifest =
+    `Assoc
+      [
+        ("format_version", `Int 1);
+        ("command", `List (List.map (fun arg -> `String arg) command));
+        ("exit_status", `Int exit_status);
+        ("start_cwd", `String (Sys.getcwd ()));
+        ("uid", `Int tracer_uid);
+        ("gid", `Int tracer_gid);
+        ("git_repositories", `List repo_items);
+        ("files", `List file_items);
+      ]
+  in
+  Json.to_file ~std:true (concat_path out "manifest.json") manifest
+
+let resolve_path proc dirfd path =
+  if is_absolute path then normalize_path path
+  else
+    let base =
+      if dirfd <> at_fdcwd then Option.value (Hashtbl.find_opt proc.fds dirfd) ~default:proc.cwd else proc.cwd
+    in
+    concat_path base path
+
+let is_write_open flags =
+  let access = flags land o_accmode in
+  access = o_wronly || access = o_rdwr || flags land (o_creat lor o_trunc lor o_append) <> 0
+
+let is_read_open flags =
+  let access = flags land o_accmode in
+  access = o_rdonly || access = o_rdwr
+
+let readlink_opt path = try Some (Unix.readlink path) with Unix.Unix_error _ -> None
+
+let refresh_proc_fd pid proc fd =
+  match readlink_opt (Printf.sprintf "/proc/%d/fd/%d" pid fd) with
+  | Some target when is_absolute target -> Hashtbl.replace proc.fds fd (best_effort_canonical target)
+  | _ -> ()
+
+let refresh_proc_cwd pid proc =
+  match readlink_opt (Printf.sprintf "/proc/%d/cwd" pid) with
+  | Some target -> proc.cwd <- best_effort_canonical target
+  | None -> ()
+
+let int_arg regs i = Int64.to_int regs.Ptrace.args.(i)
+
+let handle_syscall_entry pid proc regs =
+  let p = { nr = regs.Ptrace.syscall_nr; args = regs.args; path_a = ""; path_b = ""; dirfd = at_fdcwd; fd = -1; flags = 0 } in
+  let tracee_string i = Ptrace.read_string pid regs.args.(i) in
+  begin
+    match p.nr with
+    | nr when nr = Syscall.open_ ->
+        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0);
+        p.flags <- int_arg regs 1;
+        if is_write_open p.flags then record_observation p.path_a "write"
+    | nr when nr = Syscall.openat || nr = Syscall.openat2 ->
+        p.dirfd <- int_arg regs 0;
+        p.path_a <- resolve_path proc p.dirfd (tracee_string 1);
+        p.flags <- int_arg regs 2;
+        if is_write_open p.flags then record_observation p.path_a "write"
+    | nr when nr = Syscall.creat ->
+        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0);
+        p.flags <- o_creat lor o_wronly lor o_trunc;
+        record_observation p.path_a "write"
+    | nr when nr = Syscall.stat || nr = Syscall.lstat || nr = Syscall.access || nr = Syscall.readlink ->
+        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0)
+    | nr when nr = Syscall.newfstatat || nr = Syscall.faccessat || nr = Syscall.faccessat2 || nr = Syscall.readlinkat ->
+        p.dirfd <- int_arg regs 0;
+        p.path_a <- resolve_path proc p.dirfd (tracee_string 1)
+    | nr when nr = Syscall.unlink || nr = Syscall.rmdir ->
+        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0);
+        record_observation p.path_a "delete"
+    | nr when nr = Syscall.unlinkat || nr = Syscall.mkdirat ->
+        p.dirfd <- int_arg regs 0;
+        p.path_a <- resolve_path proc p.dirfd (tracee_string 1);
+        if p.nr = Syscall.unlinkat then record_observation p.path_a "delete"
+    | nr when nr = Syscall.mkdir || nr = Syscall.chdir || nr = Syscall.truncate ->
+        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0)
+    | nr when nr = Syscall.rename ->
+        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0);
+        p.path_b <- resolve_path proc at_fdcwd (tracee_string 1);
+        record_observation p.path_a "delete";
+        record_observation p.path_b "write"
+    | nr when nr = Syscall.renameat || nr = Syscall.renameat2 ->
+        p.path_a <- resolve_path proc (int_arg regs 0) (tracee_string 1);
+        p.path_b <- resolve_path proc (int_arg regs 2) (tracee_string 3);
+        record_observation p.path_a "delete";
+        record_observation p.path_b "write"
+    | nr when nr = Syscall.getdents || nr = Syscall.getdents64 || nr = Syscall.fchdir || nr = Syscall.ftruncate ->
+        p.fd <- int_arg regs 0
+    | nr when nr = Syscall.close || nr = Syscall.dup || nr = Syscall.dup2 || nr = Syscall.dup3 || nr = Syscall.fcntl ->
+        p.fd <- int_arg regs 0
+    | _ -> ()
+  end;
+  proc.pending <- Some p
+
+let syscall_ok result = Int64.compare result 0L >= 0
+
+let handle_syscall_exit pid proc regs =
+  match proc.pending with
+  | None -> ()
+  | Some p ->
+      let ok = syscall_ok regs.Ptrace.result in
+      begin
+        match p.nr with
+        | nr when nr = Syscall.open_ || nr = Syscall.openat || nr = Syscall.openat2 || nr = Syscall.creat ->
+            if ok then (
+              if is_read_open p.flags then record_observation p.path_a "read";
+              if is_write_open p.flags then record_observation p.path_a "write";
+              if p.flags land o_directory <> 0 then record_observation p.path_a "directory";
+              refresh_proc_fd pid proc (Int64.to_int regs.result))
+            else record_observation p.path_a "existence"
+        | nr
+          when nr = Syscall.stat || nr = Syscall.lstat || nr = Syscall.newfstatat || nr = Syscall.access || nr = Syscall.faccessat
+               || nr = Syscall.faccessat2 || nr = Syscall.readlink || nr = Syscall.readlinkat ->
+            record_observation p.path_a "existence"
+        | nr when nr = Syscall.getdents || nr = Syscall.getdents64 ->
+            if ok && p.fd >= 0 then Option.iter (fun path -> record_observation path "directory") (Hashtbl.find_opt proc.fds p.fd)
+        | nr when nr = Syscall.unlink || nr = Syscall.unlinkat || nr = Syscall.rmdir ->
+            record_observation p.path_a "delete"
+        | nr when nr = Syscall.rename || nr = Syscall.renameat || nr = Syscall.renameat2 ->
+            record_observation p.path_a "delete";
+            record_observation p.path_b "write"
+        | nr when nr = Syscall.mkdir || nr = Syscall.mkdirat || nr = Syscall.truncate ->
+            record_observation p.path_a "write"
+        | nr when nr = Syscall.ftruncate ->
+            if p.fd >= 0 then Option.iter (fun path -> record_observation path "write") (Hashtbl.find_opt proc.fds p.fd)
+        | nr when nr = Syscall.chdir || nr = Syscall.fchdir ->
+            if ok then refresh_proc_cwd pid proc
+        | nr when nr = Syscall.close ->
+            if ok then Hashtbl.remove proc.fds p.fd
+        | nr when nr = Syscall.dup ->
+            if ok then Option.iter (fun path -> Hashtbl.replace proc.fds (Int64.to_int regs.result) path) (Hashtbl.find_opt proc.fds p.fd)
+        | nr when nr = Syscall.dup2 || nr = Syscall.dup3 ->
+            if ok then Option.iter (fun path -> Hashtbl.replace proc.fds (Int64.to_int p.args.(1)) path) (Hashtbl.find_opt proc.fds p.fd)
+        | nr when nr = Syscall.fcntl ->
+            if ok && (Int64.to_int p.args.(1) = f_dupfd || Int64.to_int p.args.(1) = f_dupfd_cloexec) then
+              Option.iter (fun path -> Hashtbl.replace proc.fds (Int64.to_int regs.result) path) (Hashtbl.find_opt proc.fds p.fd)
+        | _ -> ()
+      end
+
+let clone_proc_state state =
+  let fds = Hashtbl.create (Hashtbl.length state.fds) in
+  Hashtbl.iter (fun fd path -> Hashtbl.add fds fd path) state.fds;
+  { cwd = state.cwd; fds; pending = state.pending }
+
+let trace_command command =
+  Ptrace.trace command (function
+    | Ptrace.Fork { parent; child } ->
+        let state =
+          match Hashtbl.find_opt processes parent with
+          | Some state -> clone_proc_state state
+          | None -> { cwd = Sys.getcwd (); fds = Hashtbl.create 8; pending = None }
+        in
+        Hashtbl.replace processes child state
+    | Ptrace.Process_exit pid -> Hashtbl.remove processes pid
+    | Ptrace.Syscall_enter (pid, regs) ->
+        let state =
+          match Hashtbl.find_opt processes pid with
+          | Some state -> state
+          | None ->
+              let state = { cwd = Sys.getcwd (); fds = Hashtbl.create 8; pending = None } in
+              refresh_proc_cwd pid state;
+              Hashtbl.add processes pid state;
+              state
+        in
+        handle_syscall_entry pid state regs
+    | Ptrace.Syscall_exit (pid, regs) ->
+        Option.iter (fun state -> handle_syscall_exit pid state regs) (Hashtbl.find_opt processes pid)
+    | Ptrace.Exec pid | Ptrace.Exit pid | Ptrace.Signal (pid, _) ->
+        if not (Hashtbl.mem processes pid) then (
+          let state = { cwd = Sys.getcwd (); fds = Hashtbl.create 8; pending = None } in
+          refresh_proc_cwd pid state;
+          Hashtbl.add processes pid state))
+
+let rec remove_all path =
+  if Sys.file_exists path then
+    if Sys.is_directory path then (
+      Array.iter (fun name -> if name <> "." && name <> ".." then remove_all (concat_path path name)) (Sys.readdir path);
+      Unix.rmdir path)
+    else Unix.unlink path
+
+let restore_snapshot dir =
+  let manifest = Json.from_file (concat_path dir "manifest.json") in
+  let files =
+    match manifest with
+    | `Assoc fields -> (match List.assoc_opt "files" fields with Some (`List files) -> files | _ -> [])
+    | _ -> []
+  in
+  List.iter
+    (fun file_json ->
+      match file_json with
+      | `Assoc item -> (
+          match (List.assoc_opt "path" item, List.assoc_opt "after" item : Json.t option * Json.t option) with
+          | Some (`String path), Some (`Assoc after) ->
+              let exists = match List.assoc_opt "exists" after with Some (`Bool b) -> b | _ -> false in
+              if not exists then (
+                let tombstone = match List.assoc_opt "tombstone" after with Some (`Bool b) -> b | _ -> false in
+                if tombstone then try Unix.unlink path with Unix.Unix_error _ -> ())
+              else (
+                match List.assoc_opt "blob" after with
+                | None -> ()
+                | Some (`String digest) ->
+                    mkdir_p (dirname path);
+                    let same =
+                      Sys.file_exists path && not (Sys.is_directory path) && fnv1a_file_digest path = digest
+                    in
+                    if not same then (
+                      let tmp = path ^ ".agent-snapshot.tmp" in
+                      copy_file (concat_path (concat_path dir "blobs") digest) tmp;
+                      Unix.rename tmp path);
+                    (match List.assoc_opt "mode" after with
+                    | Some (`Int mode) -> Unix.chmod path (mode land 0o7777)
+                    | _ -> ());
+                    (match List.assoc_opt "mtime" after with
+                    | Some (`Int mtime) -> Unix.utimes path (float_of_int mtime) (float_of_int mtime)
+                    | _ -> ())
+                | _ -> ())
+          | _ -> ())
+      | _ -> ())
+    files
+
+let parse_snapshot_args args =
+  let rec loop output = function
+    | "--output" :: value :: rest -> loop (Some value) rest
+    | "--" :: command -> (output, command)
+    | _ -> failwith "usage: agent-snapshot --output SNAPDIR -- command args..."
+  in
+  match loop None args with
+  | Some output, (_ :: _ as command) -> (output, command)
+  | _ -> failwith "usage: agent-snapshot --output SNAPDIR -- command args..."
+
+let run_snapshot args =
+  load_ignore_config ();
+  let output, command = parse_snapshot_args args in
+  snapshot_dir := output;
+  blob_dir := concat_path output "blobs";
+  remove_all output;
+  mkdir_p !blob_dir;
+  trace_command command;
+  finalize_records ();
+  write_manifest output command 0;
+  0
+
+let main () =
+  try
+    let args = Array.to_list Sys.argv |> List.tl in
+    let rc =
+      match args with
+      | [ "restore"; dir ] ->
+          restore_snapshot dir;
+          0
+      | "restore" :: _ -> failwith "usage: agent-snapshot restore SNAPDIR"
+      | _ -> run_snapshot args
+    in
+    Ocaml_git.shutdown ();
+    exit rc
+  with exn ->
+    Ocaml_git.shutdown ();
+    Printf.eprintf "agent-snapshot: %s\n%!" (Printexc.to_string exn);
+    exit 1
+
+let () = main ()
src/ocaml/dune
new file mode 100644
index 0000000..c356d7e
--- /dev/null
+++ b/src/ocaml/dune
@@ -0,0 +1,7 @@
+(executable
+ (name agent_snapshot)
+ (public_name agent-snapshot)
+ (foreign_stubs
+  (language c)
+  (names ptrace_stubs))
+ (libraries unix yojson ocaml-git))
src/ocaml/ptrace.ml
new file mode 100644
index 0000000..90ba7c8
--- /dev/null
+++ b/src/ocaml/ptrace.ml
@@ -0,0 +1,157 @@
+type pid = int
+
+type regs = {
+  syscall_nr : int;
+  args : int64 array;
+  result : int64;
+}
+
+type wait_stop =
+  | Exited of pid * int
+  | Signaled of pid * int
+  | Stopped of pid * int * int
+
+type event =
+  | Syscall_enter of pid * regs
+  | Syscall_exit of pid * regs
+  | Fork of { parent : pid; child : pid }
+  | Exec of pid
+  | Exit of pid
+  | Signal of pid * int
+  | Process_exit of pid
+
+type task = {
+  mutable in_syscall : bool;
+}
+
+external fork : unit -> int = "as_fork"
+external traceme : unit -> unit = "as_traceme"
+external setoptions : pid -> unit = "as_setoptions"
+external syscall : pid -> int -> unit = "as_syscall"
+external geteventmsg : pid -> int = "as_geteventmsg"
+external getregs_raw : pid -> int * int64 * int64 * int64 * int64 * int64 * int64 * int64 = "as_getregs"
+external peek_word : pid -> int64 -> string = "as_peek_word"
+external wait_raw : pid -> bool -> wait_stop = "as_wait"
+external const_sigtrap_sysgood : unit -> int = "as_const_sigtrap_sysgood"
+external const_sigtrap : unit -> int = "as_const_sigtrap"
+external const_event_fork : unit -> int = "as_const_event_fork"
+external const_event_vfork : unit -> int = "as_const_event_vfork"
+external const_event_clone : unit -> int = "as_const_event_clone"
+external const_event_exec : unit -> int = "as_const_event_exec"
+external const_event_exit : unit -> int = "as_const_event_exit"
+
+let regs pid =
+  let nr, a0, a1, a2, a3, a4, a5, result = getregs_raw pid in
+  { syscall_nr = nr; args = [| a0; a1; a2; a3; a4; a5 |]; result }
+
+let read_string pid address =
+  if Int64.equal address 0L then ""
+  else
+    let max_len = 65536 in
+    let word_size = Sys.word_size / 8 in
+    let buffer = Buffer.create 64 in
+    let rec loop offset =
+      if offset >= max_len then Buffer.contents buffer
+      else
+        match peek_word pid (Int64.add address (Int64.of_int offset)) with
+        | exception _ -> Buffer.contents buffer
+        | word ->
+            let rec scan i =
+              if i >= String.length word then loop (offset + word_size)
+              else
+                let c = word.[i] in
+                if Char.equal c '\000' then Buffer.contents buffer
+                else (
+                  Buffer.add_char buffer c;
+                  scan (i + 1))
+            in
+            scan 0
+    in
+    loop 0
+
+let wait_initial pid = wait_raw pid false
+
+let wait_next () = wait_raw (-1) true
+
+let resume ?(signal = 0) pid = syscall pid signal
+
+let try_setoptions pid =
+  try setoptions pid with Unix.Unix_error (Unix.ESRCH, _, _) -> ()
+
+let try_resume ?(signal = 0) pid =
+  try resume ~signal pid with Unix.Unix_error (Unix.ESRCH, _, _) -> ()
+
+let is_fork_event event =
+  event = const_event_fork () || event = const_event_vfork () || event = const_event_clone ()
+
+let decode_stop tasks = function
+  | Exited (pid, _) | Signaled (pid, _) ->
+      Hashtbl.remove tasks pid;
+      [ Process_exit pid ]
+  | Stopped (pid, signal, event) when is_fork_event event ->
+      let child = geteventmsg pid in
+      let parent_task =
+        match Hashtbl.find_opt tasks pid with
+        | Some task -> { in_syscall = task.in_syscall }
+        | None -> { in_syscall = false }
+      in
+      Hashtbl.replace tasks child parent_task;
+      try_setoptions child;
+      try_resume child;
+      [ Fork { parent = pid; child } ]
+  | Stopped (pid, _signal, event) when event = const_event_exec () -> [ Exec pid ]
+  | Stopped (pid, _signal, event) when event = const_event_exit () -> [ Exit pid ]
+  | Stopped (pid, signal, _event) when signal = const_sigtrap_sysgood () -> (
+      let task =
+        match Hashtbl.find_opt tasks pid with
+        | Some task -> task
+        | None ->
+            let task = { in_syscall = false } in
+            Hashtbl.replace tasks pid task;
+            task
+      in
+      let regs = regs pid in
+      if task.in_syscall then (
+        task.in_syscall <- false;
+        [ Syscall_exit (pid, regs) ])
+      else (
+        task.in_syscall <- true;
+        [ Syscall_enter (pid, regs) ]))
+  | Stopped (pid, signal, _event) ->
+      [ Signal (pid, if signal = const_sigtrap () then 0 else signal) ]
+
+let trace command on_event =
+  match command with
+  | [] -> invalid_arg "empty command"
+  | argv0 :: _ ->
+      let child = fork () in
+      if child = 0 then (
+        traceme ();
+        Unix.kill (Unix.getpid ()) Sys.sigstop;
+        Unix.execvp argv0 (Array.of_list command))
+      else (
+        match wait_initial child with
+        | Stopped _ ->
+            setoptions child;
+            let tasks = Hashtbl.create 8 in
+            Hashtbl.add tasks child { in_syscall = false };
+            try_resume child;
+            while Hashtbl.length tasks > 0 do
+              match wait_next () with
+              | exception Unix.Unix_error (Unix.ECHILD, _, _) -> Hashtbl.clear tasks
+              | stop ->
+                  let pid =
+                    match stop with
+                    | Exited (pid, _) | Signaled (pid, _) | Stopped (pid, _, _) -> pid
+                  in
+                  let events = decode_stop tasks stop in
+                  List.iter on_event events;
+                  if Hashtbl.mem tasks pid then
+                    let signal =
+                      match events with
+                      | [ Signal (_, signal) ] -> signal
+                      | _ -> 0
+                    in
+                    try_resume ~signal pid
+            done
+        | _ -> failwith "tracee did not stop at startup")
src/ocaml/ptrace_stubs.c
new file mode 100644
index 0000000..a7eccd5
--- /dev/null
+++ b/src/ocaml/ptrace_stubs.c
@@ -0,0 +1,156 @@
+#include <caml/alloc.h>
+#include <caml/fail.h>
+#include <caml/memory.h>
+#include <caml/unixsupport.h>
+#include <caml/mlvalues.h>
+
+#include <errno.h>
+#include <signal.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/ptrace.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+static void raise_unix_error(const char *call) {
+  uerror(call, Nothing);
+}
+
+CAMLprim value as_fork(value unit) {
+  CAMLparam1(unit);
+  pid_t pid = fork();
+  if (pid < 0) raise_unix_error("fork");
+  CAMLreturn(Val_int(pid));
+}
+
+CAMLprim value as_traceme(value unit) {
+  CAMLparam1(unit);
+  if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0) raise_unix_error("ptrace_traceme");
+  CAMLreturn(Val_unit);
+}
+
+CAMLprim value as_setoptions(value pid_v) {
+  CAMLparam1(pid_v);
+  long options = PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK |
+                 PTRACE_O_TRACECLONE | PTRACE_O_TRACEEXEC | PTRACE_O_TRACEEXIT;
+  if (ptrace(PTRACE_SETOPTIONS, Int_val(pid_v), NULL, (void *)options) != 0) {
+    raise_unix_error("ptrace_setoptions");
+  }
+  CAMLreturn(Val_unit);
+}
+
+CAMLprim value as_syscall(value pid_v, value signal_v) {
+  CAMLparam2(pid_v, signal_v);
+  if (ptrace(PTRACE_SYSCALL, Int_val(pid_v), NULL, (void *)(long)Int_val(signal_v)) != 0) {
+    raise_unix_error("ptrace_syscall");
+  }
+  CAMLreturn(Val_unit);
+}
+
+CAMLprim value as_geteventmsg(value pid_v) {
+  CAMLparam1(pid_v);
+  unsigned long msg = 0;
+  if (ptrace(PTRACE_GETEVENTMSG, Int_val(pid_v), NULL, &msg) != 0) {
+    raise_unix_error("ptrace_geteventmsg");
+  }
+  CAMLreturn(Val_int((int)msg));
+}
+
+CAMLprim value as_getregs(value pid_v) {
+  CAMLparam1(pid_v);
+  CAMLlocal1(tuple);
+  struct user_regs_struct regs;
+  if (ptrace(PTRACE_GETREGS, Int_val(pid_v), NULL, &regs) != 0) {
+    raise_unix_error("ptrace_getregs");
+  }
+  tuple = caml_alloc_tuple(8);
+  Store_field(tuple, 0, Val_int((int)regs.orig_rax));
+  Store_field(tuple, 1, caml_copy_int64((int64_t)regs.rdi));
+  Store_field(tuple, 2, caml_copy_int64((int64_t)regs.rsi));
+  Store_field(tuple, 3, caml_copy_int64((int64_t)regs.rdx));
+  Store_field(tuple, 4, caml_copy_int64((int64_t)regs.r10));
+  Store_field(tuple, 5, caml_copy_int64((int64_t)regs.r8));
+  Store_field(tuple, 6, caml_copy_int64((int64_t)regs.r9));
+  Store_field(tuple, 7, caml_copy_int64((int64_t)regs.rax));
+  CAMLreturn(tuple);
+}
+
+CAMLprim value as_peek_word(value pid_v, value addr_v) {
+  CAMLparam2(pid_v, addr_v);
+  CAMLlocal1(out);
+  union {
+    long value;
+    char bytes[sizeof(long)];
+  } data;
+  errno = 0;
+  data.value = ptrace(PTRACE_PEEKDATA, Int_val(pid_v), (void *)(uintptr_t)Int64_val(addr_v), NULL);
+  if (errno != 0) raise_unix_error("ptrace_peekdata");
+  out = caml_alloc_string(sizeof(long));
+  memcpy(Bytes_val(out), data.bytes, sizeof(long));
+  CAMLreturn(out);
+}
+
+CAMLprim value as_wait(value pid_v, value wall_v) {
+  CAMLparam2(pid_v, wall_v);
+  CAMLlocal1(result);
+  int status = 0;
+  int options = Bool_val(wall_v) ? __WALL : 0;
+  pid_t pid;
+  do {
+    pid = waitpid(Int_val(pid_v), &status, options);
+  } while (pid < 0 && errno == EINTR);
+  if (pid < 0) raise_unix_error("waitpid");
+  if (WIFEXITED(status)) {
+    result = caml_alloc(2, 0);
+    Store_field(result, 0, Val_int(pid));
+    Store_field(result, 1, Val_int(WEXITSTATUS(status)));
+  } else if (WIFSIGNALED(status)) {
+    result = caml_alloc(2, 1);
+    Store_field(result, 0, Val_int(pid));
+    Store_field(result, 1, Val_int(WTERMSIG(status)));
+  } else {
+    result = caml_alloc(3, 2);
+    Store_field(result, 0, Val_int(pid));
+    Store_field(result, 1, Val_int(WSTOPSIG(status)));
+    Store_field(result, 2, Val_int(status >> 16));
+  }
+  CAMLreturn(result);
+}
+
+CAMLprim value as_const_sigtrap_sysgood(value unit) {
+  CAMLparam1(unit);
+  CAMLreturn(Val_int(SIGTRAP | 0x80));
+}
+
+CAMLprim value as_const_sigtrap(value unit) {
+  CAMLparam1(unit);
+  CAMLreturn(Val_int(SIGTRAP));
+}
+
+CAMLprim value as_const_event_fork(value unit) {
+  CAMLparam1(unit);
+  CAMLreturn(Val_int(PTRACE_EVENT_FORK));
+}
+
+CAMLprim value as_const_event_vfork(value unit) {
+  CAMLparam1(unit);
+  CAMLreturn(Val_int(PTRACE_EVENT_VFORK));
+}
+
+CAMLprim value as_const_event_clone(value unit) {
+  CAMLparam1(unit);
+  CAMLreturn(Val_int(PTRACE_EVENT_CLONE));
+}
+
+CAMLprim value as_const_event_exec(value unit) {
+  CAMLparam1(unit);
+  CAMLreturn(Val_int(PTRACE_EVENT_EXEC));
+}
+
+CAMLprim value as_const_event_exit(value unit) {
+  CAMLparam1(unit);
+  CAMLreturn(Val_int(PTRACE_EVENT_EXIT));
+}
tests/test_agent_snapshot.py
index a4de765..efef94d 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -8,8 +8,7 @@ import pytest
 
 
 ROOT = Path(__file__).resolve().parents[1]
-BUILD = ROOT / "build" / "pytest"
-BIN = BUILD / "agent-snapshot"
+BIN = ROOT / "_build" / "default" / "src" / "ocaml" / "agent_snapshot.exe"
 TESTDATA = ROOT / "testdata"
 WORKTREE = TESTDATA / "runtime_repo"
 # Use the system Python rather than uv's managed interpreter. The snapshotter
@@ -28,9 +27,8 @@ def run(cmd, **kwargs):
 def build_agent_snapshot():
     # The tests exercise the real CLI binary instead of calling internal helper
     # functions. That keeps the acceptance criteria aligned with ptrace behavior,
-    # process launch, CMake wiring, and manifest writing as users will run them.
-    run(["cmake", "-S", ".", "-B", str(BUILD)])
-    run(["cmake", "--build", str(BUILD), "--parallel"])
+    # process launch, Dune wiring, and manifest writing as users will run them.
+    run(["bash", "-lc", ". /scratch/arjun/ocaml/env.sh && dune build src/ocaml/agent_snapshot.exe"])
     assert BIN.exists()
 
 
@@ -349,7 +347,7 @@ def test_text_peculiar_file_names_are_recorded_and_blobbed(tmp_path):
     assert snap.blob_text(newline["before"]["blob"]) == "newline payload\n"
 
 
-@pytest.mark.skip(reason="nlohmann/json rejects non-UTF-8 std::string values when dumping JSON")
+@pytest.mark.skip(reason="manifest paths are still not valid UTF-8 for non-UTF-8 filenames")
 def test_non_utf8_filename_exposes_json_string_limitation(tmp_path):
     bytes_path = os.path.join(os.fsencode(WORKTREE), b"non-utf8-\xff.txt")
     with open(bytes_path, "wb") as handle:
vendor/ocaml-git
new file mode 120000
index 0000000..a9469f5
--- /dev/null
+++ b/vendor/ocaml-git
@@ -0,0 +1 @@
+../../../homebox/ocaml-git
\ No newline at end of file