Repositories / agent-snapshot.git
agent-snapshot.git
Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git
@@ -1,4 +1,5 @@ build/ +_build/ .pytest_cache/ .venv/ __pycache__/
@@ -1,27 +0,0 @@ -cmake_minimum_required(VERSION 3.28) -project(agent_snapshot VERSION 0.1.0 LANGUAGES CXX) - -include(FetchContent) - -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_EXTENSIONS OFF) - -find_package(PkgConfig REQUIRED) -pkg_check_modules(LIBGIT2 REQUIRED IMPORTED_TARGET libgit2) - -FetchContent_Declare( - nlohmann_json - URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz - URL_HASH SHA256=d6c65aca6b1ed68e7a182f4757257b107ae403032760ed6ef121c9d55e81757d -) -FetchContent_MakeAvailable(nlohmann_json) - -add_executable(agent-snapshot - src/main.cpp -) - -target_include_directories(agent-snapshot PRIVATE include) -target_link_libraries(agent-snapshot PRIVATE PkgConfig::LIBGIT2 nlohmann_json::nlohmann_json) -target_compile_options(agent-snapshot PRIVATE -Wall -Wextra -Wpedantic) -
@@ -0,0 +1,11 @@ +(lang dune 3.22) +(name agent-snapshot) + +(package + (name agent-snapshot) + (synopsis "Filesystem snapshotter for traced commands") + (depends + (ocaml (>= 5.4)) + dune + yojson + ocaml-git))
@@ -1,955 +0,0 @@ -#include <nlohmann/json.hpp> -#include <git2.h> - -#include <sys/ptrace.h> -#include <sys/reg.h> -#include <sys/stat.h> -#include <sys/syscall.h> -#include <sys/types.h> -#include <sys/uio.h> -#include <sys/user.h> -#include <sys/wait.h> - -#include <fcntl.h> -#include <signal.h> -#include <unistd.h> - -#include <cerrno> -#include <chrono> -#include <cstdlib> -#include <cstring> -#include <filesystem> -#include <fstream> -#include <iomanip> -#include <iostream> -#include <map> -#include <optional> -#include <set> -#include <sstream> -#include <stdexcept> -#include <string> -#include <unordered_map> -#include <vector> - -using json = nlohmann::json; -namespace fs = std::filesystem; - -namespace { - -// ptrace reports many kinds of stops through the same waitpid interface. These -// options are what make the tracer useful for this project: -// -// - PTRACE_O_TRACESYSGOOD marks syscall stops as SIGTRAP|0x80, so we can tell -// "the process is entering/leaving a syscall" apart from ordinary SIGTRAPs. -// - PTRACE_O_TRACE{FORK,VFORK,CLONE} asks the kernel to stop the parent when a -// new child is created and lets us attach the same syscall tracing policy to -// the child before it runs far enough to hide file accesses from us. -// - EXEC/EXIT are not deeply interpreted yet, but enabling them keeps the event -// stream explicit and leaves room for recording process lifecycle later. -constexpr int kPtraceOptions = PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK | - PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE | - PTRACE_O_TRACEEXEC | PTRACE_O_TRACEEXIT; - -// The manifest stores before/after facts because reconstruction is about final -// state, but deciding whether to blob a file often depends on what was present -// before the traced program touched it. For example, a created file must have an -// explicit "before did not exist" fact, while a deleted file needs a tombstone. 
-struct Metadata { - bool exists = false; - bool tombstone = false; - bool regular = false; - bool directory = false; - mode_t mode = 0; - uintmax_t size = 0; - std::time_t mtime = 0; - std::string blob; -}; - -// Git facts are the main compactness mechanism. A clean tracked path can be -// reconstructed from repository root + HEAD + relative path, so storing its -// content would duplicate information already represented by Git. -struct GitInfo { - bool in_repo = false; - bool tracked = false; - bool dirty = false; - bool ignored = false; - std::string root; - std::string head; - std::string relative_path; -}; - -// A file is recorded once and accumulates capabilities. This keeps the manifest -// compact when a program stats, opens, reads, and later deletes the same path. -struct FileRecord { - std::string path; - std::set<std::string> operations; - Metadata before; - Metadata after; - GitInfo before_git; - GitInfo after_git; - bool before_recorded = false; -}; - -// ptrace gives us register state separately at syscall entry and syscall exit. -// Path pointers are only meaningful at entry, while return values and new file -// descriptors only exist at exit, so we cache the decoded entry-side state here. -struct PendingSyscall { - long nr = -1; - std::array<unsigned long long, 6> args{}; - std::string path_a; - std::string path_b; - int dirfd = AT_FDCWD; - int fd = -1; - int flags = 0; -}; - -// ptrace tracks tasks, not "programs". Each traced pid can have a different cwd -// and fd table after chdir/open/dup/close, and relative path reconstruction is -// only as good as this per-process state. 
-struct ProcState { - bool in_syscall = false; - fs::path cwd; - std::map<int, fs::path> fds; - PendingSyscall pending; -}; - -struct RepoRecord { - std::string root; - std::string head; - bool dirty = false; -}; - -std::unordered_map<pid_t, ProcState> processes; -std::map<std::string, FileRecord> files; -std::map<std::string, RepoRecord> repos; -std::vector<fs::path> ignored_paths; -fs::path ignore_config_path; -fs::path snapshot_dir; -fs::path blob_dir; -uid_t tracer_uid = 0; -gid_t tracer_gid = 0; - -std::string errno_message(const std::string& prefix) { - return prefix + ": " + std::strerror(errno); -} - -std::string readlink_string(const fs::path& path) { - std::vector<char> buffer(4096); - ssize_t n = readlink(path.c_str(), buffer.data(), buffer.size() - 1); - if (n < 0) return {}; - buffer[static_cast<size_t>(n)] = '\0'; - return std::string(buffer.data()); -} - -fs::path lexical_abs(const fs::path& path, const fs::path& base) { - if (path.is_absolute()) return path.lexically_normal(); - return (base / path).lexically_normal(); -} - -fs::path best_effort_canonical(const fs::path& path) { - std::error_code ec; - fs::path canonical = fs::weakly_canonical(path, ec); - if (!ec && !canonical.empty()) return canonical.lexically_normal(); - if (path.is_absolute()) return path.lexically_normal(); - return fs::absolute(path, ec).lexically_normal(); -} - -fs::path xdg_ignore_config_path() { - const char* config_home = std::getenv("XDG_CONFIG_HOME"); - if (config_home && config_home[0] != '\0') { - return fs::path(config_home) / "agent-snapshot" / "ignore.json"; - } - const char* home = std::getenv("HOME"); - if (!home || home[0] == '\0') { - throw std::runtime_error("XDG_CONFIG_HOME is unset and HOME is unavailable"); - } - return fs::path(home) / ".config" / "agent-snapshot" / "ignore.json"; -} - -fs::path home_dir() { - const char* home = std::getenv("HOME"); - if (!home || home[0] == '\0') { - throw std::runtime_error("HOME is unavailable"); - } - return 
fs::path(home); -} - -fs::path xdg_config_home_dir() { - const char* config_home = std::getenv("XDG_CONFIG_HOME"); - if (config_home && config_home[0] != '\0') { - return fs::path(config_home); - } - return home_dir() / ".config"; -} - -fs::path expand_ignore_entry(const std::string& entry) { - constexpr std::string_view kHome = "$HOME"; - constexpr std::string_view kXdgConfigHome = "$XDG_CONFIG_HOME"; - if (entry == kHome) return home_dir(); - if (entry.rfind(std::string(kHome) + "/", 0) == 0) { - return home_dir() / entry.substr(kHome.size() + 1); - } - if (entry == kXdgConfigHome) return xdg_config_home_dir(); - if (entry.rfind(std::string(kXdgConfigHome) + "/", 0) == 0) { - return xdg_config_home_dir() / entry.substr(kXdgConfigHome.size() + 1); - } - return fs::path(entry); -} - -bool path_is_at_or_under(const fs::path& path, const fs::path& root) { - auto pit = path.begin(); - auto rit = root.begin(); - for (; rit != root.end(); ++rit, ++pit) { - if (pit == path.end() || *pit != *rit) return false; - } - return true; -} - -bool is_git_internal_path(const fs::path& path) { - for (const auto& part : path) { - if (part == ".git") return true; - } - return false; -} - -bool is_ignored_path(const fs::path& raw_path) { - if (raw_path.empty()) return false; - fs::path path = best_effort_canonical(raw_path); - if (is_git_internal_path(path)) return true; - for (const auto& ignored : ignored_paths) { - if (path_is_at_or_under(path, ignored)) return true; - } - return false; -} - -void load_ignore_config() { - ignore_config_path = best_effort_canonical(xdg_ignore_config_path()); - std::ifstream stream(ignore_config_path); - if (!stream) { - throw std::runtime_error("ignore config does not exist: " + ignore_config_path.string()); - } - - json entries = json::parse(stream); - if (!entries.is_array()) { - throw std::runtime_error("ignore config must be a JSON array: " + ignore_config_path.string()); - } - - ignored_paths.clear(); - 
ignored_paths.push_back(ignore_config_path); - for (const auto& entry : entries) { - if (!entry.is_string()) { - throw std::runtime_error("ignore config entries must be strings: " + ignore_config_path.string()); - } - ignored_paths.push_back(best_effort_canonical(expand_ignore_entry(entry.get<std::string>()))); - } -} - -// Syscall arguments live in the tracee's address space. PTRACE_PEEKDATA reads a -// machine word at a time from that process; for path arguments we walk forward -// until the NUL terminator. This is intentionally bounded so a bad userspace -// pointer or non-terminated buffer cannot make the tracer loop forever. -std::string read_tracee_string(pid_t pid, unsigned long long address) { - if (address == 0) return {}; - std::string out; - union { - long value; - char chars[sizeof(long)]; - } data{}; - for (size_t offset = 0; offset < 65536; offset += sizeof(long)) { - errno = 0; - data.value = ptrace(PTRACE_PEEKDATA, pid, address + offset, nullptr); - if (errno != 0) break; - for (char c : data.chars) { - if (c == '\0') return out; - out.push_back(c); - } - } - return out; -} - -std::optional<Metadata> stat_metadata(const fs::path& path) { - struct stat st {}; - if (lstat(path.c_str(), &st) != 0) return std::nullopt; - Metadata meta; - meta.exists = true; - meta.mode = st.st_mode; - meta.size = static_cast<uintmax_t>(st.st_size); - meta.mtime = st.st_mtim.tv_sec; - meta.regular = S_ISREG(st.st_mode); - meta.directory = S_ISDIR(st.st_mode); - return meta; -} - -bool writable_by_current_user(const Metadata& meta) { - if (!meta.exists) return true; - const mode_t mode = meta.mode; - if (tracer_uid == 0) return true; - struct stat st {}; - (void)st; - return (mode & S_IWUSR) || (mode & S_IWGRP) || (mode & S_IWOTH); -} - -// This is the "system environment" escape hatch. The goal is not to snapshot -// /usr wholesale just because the dynamic loader, Python, or libc looked there. 
-// access(W_OK) is used instead of checking raw mode bits because groups, ACLs, -// and effective permission rules matter more than whether an owner write bit is -// present for some other uid. -bool owned_by_other_and_not_writable(const fs::path& path) { - struct stat st {}; - if (lstat(path.c_str(), &st) != 0) return false; - if (st.st_uid == tracer_uid) return false; - if (access(path.c_str(), W_OK) == 0) return false; - return true; -} - -// Git can classify paths that do not exist at finalization time if we start from -// the nearest existing parent. That matters for tombstones: after unlink, the -// path itself is gone, but the repository root and relative path are still -// recoverable from its parent directory. -fs::path existing_anchor(fs::path path) { - std::error_code ec; - if (fs::exists(path, ec)) return path; - while (!path.empty() && path != path.root_path()) { - path = path.parent_path(); - if (fs::exists(path, ec)) return path; - } - return {}; -} - -std::string oid_to_string(const git_oid* oid) { - char out[GIT_OID_HEXSZ + 1] = {}; - git_oid_tostr(out, sizeof(out), oid); - return out; -} - -std::string repo_head(git_repository* repo) { - git_reference* head = nullptr; - if (git_repository_head(&head, repo) != 0) return {}; - const git_oid* oid = git_reference_target(head); - std::string result = oid ? oid_to_string(oid) : ""; - git_reference_free(head); - return result; -} - -GitInfo classify_git(const fs::path& input_path) { - GitInfo info; - fs::path anchor = existing_anchor(input_path); - if (anchor.empty()) anchor = existing_anchor(input_path.parent_path()); - if (anchor.empty()) return info; - - // git_repository_open_ext walks upward from the anchor to find any containing - // repository. This is deliberate: the traced program might access files inside - // a nested repo, a dependency checkout, or a temp repo unrelated to this tool. 
- git_repository* repo = nullptr; - if (git_repository_open_ext(&repo, anchor.c_str(), 0, nullptr) != 0) return info; - - const char* workdir = git_repository_workdir(repo); - if (!workdir) { - git_repository_free(repo); - return info; - } - - info.in_repo = true; - info.root = best_effort_canonical(workdir).string(); - info.head = repo_head(repo); - - std::error_code ec; - fs::path rel = fs::relative(input_path, info.root, ec); - if (!ec) info.relative_path = rel.string(); - - // libgit2's status bits encode both index and worktree state. For compactness - // we only skip blobs for GIT_STATUS_CURRENT tracked files. New, ignored, or - // dirty tracked files are not reconstructable from HEAD alone, so they are - // treated as content that belongs in the snapshot. - unsigned int status = 0; - if (!info.relative_path.empty() && - git_status_file(&status, repo, info.relative_path.c_str()) == 0) { - info.ignored = status & GIT_STATUS_IGNORED; - info.tracked = !(status & GIT_STATUS_WT_NEW) && !(status & GIT_STATUS_IGNORED); - info.dirty = status != GIT_STATUS_CURRENT; - } else if (!info.relative_path.empty()) { - info.tracked = false; - info.dirty = true; - } - - RepoRecord& rec = repos[info.root]; - rec.root = info.root; - rec.head = info.head; - rec.dirty = rec.dirty || info.dirty; - - git_repository_free(repo); - return info; -} - -// This digest is a content-addressing key for blobs, not a security boundary. -// FNV-1a is small and deterministic, which is enough for current tests and for -// avoiding duplicate blob files. A production snapshot format should replace it -// with SHA-256 or BLAKE3 before relying on it for collision resistance. 
-std::string fnv1a_file_digest(const fs::path& path) { - std::ifstream in(path, std::ios::binary); - uint64_t hash = 1469598103934665603ULL; - char c; - while (in.get(c)) { - hash ^= static_cast<unsigned char>(c); - hash *= 1099511628211ULL; - } - std::ostringstream out; - out << std::hex << std::setw(16) << std::setfill('0') << hash; - return out.str(); -} - -std::string store_blob(const fs::path& path) { - std::string digest = fnv1a_file_digest(path); - fs::path out = blob_dir / digest; - if (!fs::exists(out)) { - fs::copy_file(path, out, fs::copy_options::overwrite_existing); - } - return digest; -} - -bool should_capture_content(const fs::path& path, const Metadata& meta, const GitInfo& git) { - // Directories, devices, sockets, etc. are represented as metadata and - // observations. Only regular files get blobbed in v1. - if (!meta.exists || !meta.regular) return false; - if (owned_by_other_and_not_writable(path)) return false; - if (git.in_repo && git.tracked && !git.dirty) return false; - return writable_by_current_user(meta); -} - -json metadata_json(const Metadata& meta) { - json j; - j["exists"] = meta.exists; - if (meta.tombstone) j["tombstone"] = true; - if (meta.exists) { - j["type"] = meta.directory ? "directory" : (meta.regular ? 
"file" : "other"); - j["mode"] = meta.mode; - j["size"] = meta.size; - j["mtime"] = meta.mtime; - } - if (!meta.blob.empty()) j["blob"] = meta.blob; - return j; -} - -json git_json(const GitInfo& git) { - json j; - j["in_repo"] = git.in_repo; - if (git.in_repo) { - j["root"] = git.root; - j["head"] = git.head; - j["relative_path"] = git.relative_path; - j["tracked"] = git.tracked; - j["dirty"] = git.dirty; - j["ignored"] = git.ignored; - } - return j; -} - -void record_observation(const fs::path& raw_path, const std::string& operation) { - if (raw_path.empty()) return; - fs::path path = best_effort_canonical(raw_path); - if (is_ignored_path(path)) return; - std::string key = path.string(); - FileRecord& rec = files[key]; - rec.path = key; - rec.operations.insert(operation); - if (!rec.before_recorded) { - // Capture "before" on first observation, not at process exit. This is the - // only chance to distinguish "the program created this path" from "the path - // existed before and was later opened for write". - rec.before_recorded = true; - rec.before = stat_metadata(path).value_or(Metadata{}); - rec.before_git = classify_git(path); - if (should_capture_content(path, rec.before, rec.before_git)) { - rec.before.blob = store_blob(path); - } - } -} - -void finalize_records() { - for (auto& [_, rec] : files) { - // The after pass is intentionally outside ptrace. Once the traced process - // tree has exited, the filesystem has quiesced from our point of view, so - // final content can be copied without racing the writer we launched. 
- fs::path path(rec.path); - if (is_ignored_path(path)) continue; - rec.after = stat_metadata(path).value_or(Metadata{}); - if (!rec.after.exists) rec.after.tombstone = rec.operations.count("delete") > 0; - rec.after_git = classify_git(path); - const bool written_regular_file = - rec.operations.count("write") > 0 && rec.after.exists && rec.after.regular; - if ((written_regular_file && !owned_by_other_and_not_writable(path)) || - should_capture_content(path, rec.after, rec.after_git)) { - rec.after.blob = store_blob(path); - } - } -} - -fs::path resolve_path(const ProcState& proc, int dirfd, const std::string& path) { - // The *at syscalls interpret relative paths against either cwd or a directory - // fd. That is why the tracer maintains fd->path mappings; without them, - // openat(dirfd, "file") would be impossible to place in the manifest. - fs::path p(path); - if (p.is_absolute()) return p; - fs::path base = proc.cwd; - if (dirfd != AT_FDCWD) { - auto it = proc.fds.find(dirfd); - if (it != proc.fds.end()) base = it->second; - } - return lexical_abs(p, base); -} - -bool is_write_open(int flags) { - int access = flags & O_ACCMODE; - return access == O_WRONLY || access == O_RDWR || (flags & (O_CREAT | O_TRUNC | O_APPEND)); -} - -bool is_read_open(int flags) { - int access = flags & O_ACCMODE; - return access == O_RDONLY || access == O_RDWR; -} - -void refresh_proc_fd(pid_t pid, ProcState& proc, int fd) { - // /proc/<pid>/fd/N is the kernel's own view of where an fd points. Reading - // this symlink after a successful open is more reliable than trying to model - // every mount namespace or symlink resolution rule ourselves. - std::string target = readlink_string("/proc/" + std::to_string(pid) + "/fd/" + std::to_string(fd)); - if (!target.empty() && target[0] == '/') proc.fds[fd] = best_effort_canonical(target); -} - -void refresh_proc_cwd(pid_t pid, ProcState& proc) { - // chdir/fchdir affect future relative path arguments. 
/proc gives us the - // post-syscall cwd after the kernel has accepted the directory change. - std::string target = readlink_string("/proc/" + std::to_string(pid) + "/cwd"); - if (!target.empty()) proc.cwd = best_effort_canonical(target); -} - -void handle_syscall_entry(pid_t pid, ProcState& proc, const user_regs_struct& regs) { - PendingSyscall p; - // Linux x86_64 syscall ABI: syscall number in orig_rax, arguments in - // rdi/rsi/rdx/r10/r8/r9, result in rax on exit. This file is explicitly v1 - // x86_64-only; another architecture needs a different register decoder. - p.nr = static_cast<long>(regs.orig_rax); - p.args = {regs.rdi, regs.rsi, regs.rdx, regs.r10, regs.r8, regs.r9}; - - switch (p.nr) { - case SYS_open: - p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); - p.flags = static_cast<int>(p.args[1]); - // For writes, record before-state at syscall entry. If O_CREAT succeeds, - // waiting until exit would make a newly-created file look preexisting. 
- if (is_write_open(p.flags)) record_observation(p.path_a, "write"); - break; - case SYS_openat: -#ifdef SYS_openat2 - case SYS_openat2: -#endif - p.dirfd = static_cast<int>(p.args[0]); - p.path_a = resolve_path(proc, p.dirfd, read_tracee_string(pid, p.args[1])).string(); - p.flags = static_cast<int>(p.args[2]); - if (is_write_open(p.flags)) record_observation(p.path_a, "write"); - break; - case SYS_creat: - p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); - p.flags = O_CREAT | O_WRONLY | O_TRUNC; - record_observation(p.path_a, "write"); - break; - case SYS_stat: - case SYS_lstat: - case SYS_access: - case SYS_readlink: - p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); - break; - case SYS_newfstatat: - case SYS_faccessat: -#ifdef SYS_faccessat2 - case SYS_faccessat2: -#endif - case SYS_readlinkat: - p.dirfd = static_cast<int>(p.args[0]); - p.path_a = resolve_path(proc, p.dirfd, read_tracee_string(pid, p.args[1])).string(); - break; - case SYS_unlink: - case SYS_rmdir: - p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); - // Deletions have the same timing issue as creations: after syscall exit - // the content may be gone, so the before snapshot must happen here. 
- record_observation(p.path_a, "delete"); - break; - case SYS_unlinkat: - case SYS_mkdirat: - p.dirfd = static_cast<int>(p.args[0]); - p.path_a = resolve_path(proc, p.dirfd, read_tracee_string(pid, p.args[1])).string(); - if (p.nr == SYS_unlinkat) record_observation(p.path_a, "delete"); - break; - case SYS_mkdir: - case SYS_chdir: - case SYS_truncate: - p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); - break; - case SYS_rename: - p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); - p.path_b = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[1])).string(); - // A rename is modeled as source deletion plus destination write. That is - // enough for reconstruction even though it loses the atomic-move history. - record_observation(p.path_a, "delete"); - record_observation(p.path_b, "write"); - break; - case SYS_renameat: -#ifdef SYS_renameat2 - case SYS_renameat2: -#endif - p.path_a = resolve_path(proc, static_cast<int>(p.args[0]), read_tracee_string(pid, p.args[1])).string(); - p.path_b = resolve_path(proc, static_cast<int>(p.args[2]), read_tracee_string(pid, p.args[3])).string(); - record_observation(p.path_a, "delete"); - record_observation(p.path_b, "write"); - break; - case SYS_getdents: - case SYS_getdents64: - case SYS_fchdir: - case SYS_ftruncate: - p.fd = static_cast<int>(p.args[0]); - break; - case SYS_close: - case SYS_dup: - case SYS_dup2: - case SYS_dup3: - case SYS_fcntl: - p.fd = static_cast<int>(p.args[0]); - break; - default: - break; - } - - proc.pending = p; -} - -void handle_syscall_exit(pid_t pid, ProcState& proc, long result) { - const PendingSyscall& p = proc.pending; - // On Linux, syscall failures are returned as negative errno values in rax. - // We still record failed path probes as "existence" observations because the - // program learned something about that pathname. 
- bool ok = result >= 0; - - switch (p.nr) { - case SYS_open: - case SYS_openat: -#ifdef SYS_openat2 - case SYS_openat2: -#endif - case SYS_creat: - if (ok) { - if (is_read_open(p.flags)) record_observation(p.path_a, "read"); - if (is_write_open(p.flags)) record_observation(p.path_a, "write"); - if (p.flags & O_DIRECTORY) record_observation(p.path_a, "directory"); - refresh_proc_fd(pid, proc, static_cast<int>(result)); - } else { - record_observation(p.path_a, "existence"); - } - break; - case SYS_stat: - case SYS_lstat: - case SYS_newfstatat: - case SYS_access: - case SYS_faccessat: -#ifdef SYS_faccessat2 - case SYS_faccessat2: -#endif - case SYS_readlink: - case SYS_readlinkat: - record_observation(p.path_a, "existence"); - break; - case SYS_getdents: - case SYS_getdents64: - // getdents returns directory entries for an already-open fd. The entries - // themselves are not decoded yet; v1 records the fact that the directory - // was traversed, which is the important observation boundary for now. - if (ok && p.fd >= 0 && proc.fds.count(p.fd)) record_observation(proc.fds[p.fd], "directory"); - break; - case SYS_unlink: - case SYS_unlinkat: - case SYS_rmdir: - record_observation(p.path_a, "delete"); - break; - case SYS_rename: - case SYS_renameat: -#ifdef SYS_renameat2 - case SYS_renameat2: -#endif - record_observation(p.path_a, "delete"); - record_observation(p.path_b, "write"); - break; - case SYS_mkdir: - case SYS_mkdirat: - case SYS_truncate: - record_observation(p.path_a, "write"); - break; - case SYS_ftruncate: - if (p.fd >= 0 && proc.fds.count(p.fd)) record_observation(proc.fds[p.fd], "write"); - break; - case SYS_chdir: - if (ok) refresh_proc_cwd(pid, proc); - break; - case SYS_fchdir: - if (ok) refresh_proc_cwd(pid, proc); - break; - case SYS_close: - if (ok) proc.fds.erase(p.fd); - break; - case SYS_dup: - // Duplication makes multiple numeric fds refer to the same open file - // description. 
Mirroring that relationship preserves later ftruncate or - // fd-relative directory operations through the duplicate. - if (ok && proc.fds.count(p.fd)) proc.fds[static_cast<int>(result)] = proc.fds[p.fd]; - break; - case SYS_dup2: - case SYS_dup3: - if (ok && proc.fds.count(p.fd)) proc.fds[static_cast<int>(p.args[1])] = proc.fds[p.fd]; - break; - case SYS_fcntl: - if (ok && (static_cast<int>(p.args[1]) == F_DUPFD || - static_cast<int>(p.args[1]) == F_DUPFD_CLOEXEC) && - proc.fds.count(p.fd)) { - proc.fds[static_cast<int>(result)] = proc.fds[p.fd]; - } - break; - default: - break; - } -} - -void trace_command(const std::vector<std::string>& command) { - pid_t child = fork(); - if (child < 0) throw std::runtime_error(errno_message("fork failed")); - if (child == 0) { - // PTRACE_TRACEME makes the parent our tracer after exec. The explicit - // SIGSTOP is a synchronization point: the parent sets ptrace options before - // the child reaches execvp and starts making filesystem-related syscalls. - if (ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) != 0) _exit(127); - raise(SIGSTOP); - std::vector<char*> argv; - for (const auto& arg : command) argv.push_back(const_cast<char*>(arg.c_str())); - argv.push_back(nullptr); - execvp(argv[0], argv.data()); - _exit(127); - } - - int status = 0; - if (waitpid(child, &status, 0) < 0) throw std::runtime_error(errno_message("waitpid failed")); - if (ptrace(PTRACE_SETOPTIONS, child, nullptr, kPtraceOptions) != 0) { - throw std::runtime_error(errno_message("ptrace SETOPTIONS failed")); - } - processes[child].cwd = fs::current_path(); - refresh_proc_cwd(child, processes[child]); - // PTRACE_SYSCALL resumes the child and asks the kernel to stop it twice per - // syscall: once before execution and once after, which is the basis for the - // PendingSyscall entry/exit split above. 
- ptrace(PTRACE_SYSCALL, child, nullptr, nullptr); - - while (!processes.empty()) { - // __WALL is needed with ptrace so waitpid observes all traced tasks, - // including clone-created threads that would otherwise not behave like - // normal children from the wait API's point of view. - pid_t pid = waitpid(-1, &status, __WALL); - if (pid < 0) { - if (errno == EINTR) continue; - if (errno == ECHILD) break; - throw std::runtime_error(errno_message("waitpid trace loop failed")); - } - - auto it = processes.find(pid); - if (it == processes.end()) { - ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr); - continue; - } - - if (WIFEXITED(status) || WIFSIGNALED(status)) { - processes.erase(it); - continue; - } - - if (!WIFSTOPPED(status)) continue; - int sig = WSTOPSIG(status); - unsigned event = static_cast<unsigned>(status >> 16); - - if (event == PTRACE_EVENT_FORK || event == PTRACE_EVENT_VFORK || - event == PTRACE_EVENT_CLONE) { - unsigned long new_pid = 0; - ptrace(PTRACE_GETEVENTMSG, pid, nullptr, &new_pid); - // A newly forked process inherits cwd and fd table at fork time. Copying - // the parent's ProcState matches that kernel behavior closely enough for - // path reconstruction until either process mutates its own state. - processes[static_cast<pid_t>(new_pid)] = it->second; - ptrace(PTRACE_SETOPTIONS, static_cast<pid_t>(new_pid), nullptr, kPtraceOptions); - ptrace(PTRACE_SYSCALL, static_cast<pid_t>(new_pid), nullptr, nullptr); - ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr); - continue; - } - - if (sig == (SIGTRAP | 0x80)) { - user_regs_struct regs {}; - if (ptrace(PTRACE_GETREGS, pid, nullptr, &regs) == 0) { - ProcState& proc = it->second; - if (!proc.in_syscall) { - handle_syscall_entry(pid, proc, regs); - proc.in_syscall = true; - } else { - handle_syscall_exit(pid, proc, static_cast<long>(regs.rax)); - proc.in_syscall = false; - } - } - ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr); - } else { - // Non-syscall stops are real signals or ptrace events. 
Plain SIGTRAP is - // consumed by the tracer; other signals are reinjected so tracing changes - // process behavior as little as practical. - int deliver = (sig == SIGTRAP) ? 0 : sig; - ptrace(PTRACE_SYSCALL, pid, nullptr, reinterpret_cast<void*>(static_cast<long>(deliver))); - } - } -} - -void write_manifest(const fs::path& out, const std::vector<std::string>& command, int exit_status) { - json manifest; - manifest["format_version"] = 1; - manifest["command"] = command; - manifest["exit_status"] = exit_status; - manifest["start_cwd"] = fs::current_path().string(); - manifest["uid"] = tracer_uid; - manifest["gid"] = tracer_gid; - - manifest["git_repositories"] = json::array(); - for (const auto& [_, repo] : repos) { - manifest["git_repositories"].push_back({ - {"root", repo.root}, - {"head", repo.head}, - {"dirty", repo.dirty}, - }); - } - - manifest["files"] = json::array(); - for (const auto& [_, rec] : files) { - json ops = json::array(); - for (const auto& op : rec.operations) ops.push_back(op); - manifest["files"].push_back({ - {"path", rec.path}, - {"operations", ops}, - {"before", metadata_json(rec.before)}, - {"after", metadata_json(rec.after)}, - // Prefer after_git when available so files created during the run are - // classified in their final repository context. Fall back to before_git - // for deleted paths whose final filesystem anchor may no longer exist. - {"git", git_json(rec.after_git.in_repo ? 
rec.after_git : rec.before_git)}, - }); - } - - std::ofstream stream(out / "manifest.json"); - stream << manifest.dump(2) << "\n"; -} - -void restore_snapshot(const fs::path& dir) { - std::ifstream in(dir / "manifest.json"); - if (!in) throw std::runtime_error("cannot open manifest"); - json manifest = json::parse(in); - - for (const auto& item : manifest.at("files")) { - fs::path path = item.at("path").get<std::string>(); - const json& after = item.at("after"); - if (!after.value("exists", false)) { - if (after.value("tombstone", false)) { - // A tombstone represents final non-existence. Missing is already the - // desired final state, so remove errors are intentionally non-fatal. - std::error_code ec; - fs::remove(path, ec); - } - continue; - } - if (!after.contains("blob")) continue; - - // Restore only blobbed files. Clean Git-tracked files and reconstructable - // system files are manifest references, not payloads owned by this bundle. - fs::create_directories(path.parent_path()); - const std::string expected_blob = after.at("blob").get<std::string>(); - // Avoid rewriting identical files. This matters for observed executables - // that may be mapped or busy while restore runs, and it also makes restore - // idempotent for normal captured files. - if (!(fs::exists(path) && fs::is_regular_file(path) && - fnv1a_file_digest(path) == expected_blob)) { - fs::path tmp = path; - tmp += ".agent-snapshot.tmp"; - fs::copy_file(dir / "blobs" / expected_blob, tmp, - fs::copy_options::overwrite_existing); - fs::rename(tmp, path); - } - if (after.contains("mode")) { - fs::permissions(path, static_cast<fs::perms>(after.at("mode").get<unsigned>() & 07777), - fs::perm_options::replace); - } - if (after.contains("mtime")) { - // std::filesystem has no portable wall-clock setter for Unix time in C++20. 
- struct timespec ts[2] {}; - ts[0].tv_nsec = UTIME_OMIT; - ts[1].tv_sec = after.at("mtime").get<std::time_t>(); - ts[1].tv_nsec = 0; - utimensat(AT_FDCWD, path.c_str(), ts, AT_SYMLINK_NOFOLLOW); - } - } -} - -int run_snapshot(const std::vector<std::string>& args) { - load_ignore_config(); - - fs::path output; - size_t split = args.size(); - for (size_t i = 0; i < args.size(); ++i) { - if (args[i] == "--output" && i + 1 < args.size()) { - output = args[i + 1]; - ++i; - } else if (args[i] == "--") { - split = i + 1; - break; - } else { - throw std::runtime_error("usage: agent-snapshot --output SNAPDIR -- command args..."); - } - } - if (output.empty() || split >= args.size()) { - throw std::runtime_error("usage: agent-snapshot --output SNAPDIR -- command args..."); - } - - snapshot_dir = output; - blob_dir = snapshot_dir / "blobs"; - // v1 treats --output as an owned bundle directory. Removing it up front avoids - // stale blobs or manifest entries from a previous run being mistaken for the - // current trace. - fs::remove_all(snapshot_dir); - fs::create_directories(blob_dir); - - std::vector<std::string> command(args.begin() + static_cast<long>(split), args.end()); - trace_command(command); - finalize_records(); - write_manifest(snapshot_dir, command, 0); - return 0; -} - -} // namespace - -int main(int argc, char** argv) { - tracer_uid = getuid(); - tracer_gid = getgid(); - git_libgit2_init(); - try { - std::vector<std::string> args(argv + 1, argv + argc); - if (!args.empty() && args[0] == "restore") { - if (args.size() != 2) throw std::runtime_error("usage: agent-snapshot restore SNAPDIR"); - restore_snapshot(args[1]); - git_libgit2_shutdown(); - return 0; - } - int rc = run_snapshot(args); - git_libgit2_shutdown(); - return rc; - } catch (const std::exception& e) { - git_libgit2_shutdown(); - std::cerr << "agent-snapshot: " << e.what() << "\n"; - return 1; - } -}
(* agent-snapshot: traces a command with ptrace, records every file it
   observes (before/after metadata, git classification, optional content
   blobs) and writes a manifest that [restore_snapshot] can replay. *)

module Json = Yojson.Safe

(* Linux x86-64 ABI constants used when decoding traced syscalls. *)
let at_fdcwd = -100
let o_accmode = 0o3
let o_rdonly = 0
let o_wronly = 1
let o_rdwr = 2
let o_creat = 0o100
let o_trunc = 0o1000
let o_append = 0o2000
let o_directory = 0o200000
let f_dupfd = 0
let f_dupfd_cloexec = 1030

(* x86-64 syscall numbers for the calls the tracer interprets. *)
module Syscall = struct
  let access = 21
  let close = 3
  let creat = 85
  let dup = 32
  let dup2 = 33
  let dup3 = 292
  let faccessat = 269
  let faccessat2 = 439
  let fchdir = 81
  let fcntl = 72
  let ftruncate = 77
  let getdents = 78
  let getdents64 = 217
  let lstat = 6
  let mkdir = 83
  let mkdirat = 258
  let newfstatat = 262
  let open_ = 2
  let openat = 257
  let openat2 = 437
  let readlink = 89
  let readlinkat = 267
  let rename = 82
  let renameat = 264
  let renameat2 = 316
  let rmdir = 84
  let stat = 4
  let truncate = 76
  let unlink = 87
  let unlinkat = 263
  let chdir = 80
end

(* Facts about one path at one point in time (before or after the trace). *)
type metadata = {
  mutable exists : bool;
  mutable tombstone : bool;
  mutable regular : bool;
  mutable directory : bool;
  mutable mode : int;
  mutable size : int64;
  mutable mtime : int;
  mutable blob : string option;
}

(* Git classification of one path. *)
type git_info = {
  mutable in_repo : bool;
  mutable tracked : bool;
  mutable dirty : bool;
  mutable ignored : bool;
  mutable root : string;
  mutable head : string;
  mutable relative_path : string;
}

(* Everything recorded about one observed path. *)
type file_record = {
  path : string;
  operations : (string, unit) Hashtbl.t;
  mutable before : metadata;
  mutable after : metadata;
  mutable before_git : git_info;
  mutable after_git : git_info;
  mutable before_recorded : bool;
}

(* Arguments captured at syscall entry and consumed at syscall exit. *)
type pending_syscall = {
  nr : int;
  args : int64 array;
  mutable path_a : string;
  mutable path_b : string;
  mutable dirfd : int;
  mutable fd : int;
  mutable flags : int;
}

(* Per-tracee view of its working directory and open descriptors. *)
type proc_state = {
  mutable cwd : string;
  fds : (int, string) Hashtbl.t;
  mutable pending : pending_syscall option;
}

type repo_record = {
  root : string;
  mutable head : string;
  mutable dirty : bool;
}

let empty_metadata () =
  { exists = false; tombstone = false; regular = false; directory = false; mode = 0; size = 0L; mtime = 0; blob = None }

let empty_git () =
  { in_repo = false; tracked = false; dirty = false; ignored = false; root = ""; head = ""; relative_path = "" }

let files : (string, file_record) Hashtbl.t = Hashtbl.create 128
let repos : (string, repo_record) Hashtbl.t = Hashtbl.create 8
let processes : (int, proc_state) Hashtbl.t = Hashtbl.create 8
let ignored_paths = ref []
let ignore_config_path = ref ""
let snapshot_dir = ref ""
let blob_dir = ref ""
let tracer_uid = Unix.getuid ()
let tracer_gid = Unix.getgid ()

let path_sep = '/'

(* Splits on '/', dropping empty and "." components. *)
let split_path path =
  path |> String.split_on_char path_sep |> List.filter (fun part -> part <> "" && part <> ".")

(* Purely textual normalization: collapses "//", "." and "..".
   For absolute paths ".." at the root is dropped ("/.." is "/").
   For relative paths leading ".." components are preserved; dropping them
   would silently turn "../out" into "out", i.e. a different directory. *)
let normalize_path path =
  let absolute = String.length path > 0 && path.[0] = path_sep in
  let parts =
    List.fold_left
      (fun acc part ->
        if part <> ".." then part :: acc
        else
          match acc with
          | [] -> if absolute then [] else [ ".." ]
          | ".." :: _ -> ".." :: acc (* only reachable for relative paths *)
          | _ :: rest -> rest)
      [] (split_path path)
    |> List.rev
  in
  let body = String.concat "/" parts in
  if absolute then (if body = "" then "/" else "/" ^ body)
  else if body = "" then "."
  else body

(* Joins [path] onto [base]; an absolute [path] replaces [base]. An empty
   [path] returns [base] unchanged (not normalized). *)
let concat_path base path =
  if path = "" then base
  else if String.length path > 0 && path.[0] = '/' then normalize_path path
  else normalize_path (base ^ "/" ^ path)

let dirname path =
  let path = normalize_path path in
  match String.rindex_opt path '/' with
  | None -> "."
  | Some 0 -> "/"
  | Some i -> String.sub path 0 i

let basename path =
  match String.rindex_opt path '/' with
  | None -> path
  | Some i -> String.sub path (i + 1) (String.length path - i - 1)

(* Creates [path] and any missing parents; an already existing path is fine. *)
let rec mkdir_p path =
  if path = "" || path = "/" || Sys.file_exists path then ()
  else (
    mkdir_p (dirname path);
    try Unix.mkdir path 0o777 with Unix.Unix_error (Unix.EEXIST, _, _) -> ())

let is_absolute path = String.length path > 0 && path.[0] = '/'

let realpath_opt path = try Some (Unix.realpath path) with Unix.Unix_error _ -> None

(* Resolves symlinks when the path exists; otherwise falls back to textual
   normalization (relative paths are anchored at the tracer's cwd). *)
let best_effort_canonical path =
  match realpath_opt path with
  | Some path -> normalize_path path
  | None when is_absolute path -> normalize_path path
  | None -> concat_path (Sys.getcwd ()) path

let path_is_at_or_under path root =
  path = root
  ||
  (let root = if String.ends_with ~suffix:"/" root then root else root ^ "/" in
   String.starts_with ~prefix:root path)

let is_git_internal_path path = List.exists (( = ) ".git") (split_path path)

let is_ignored_path raw_path =
  if raw_path = "" then false
  else
    let path = best_effort_canonical raw_path in
    is_git_internal_path path || List.exists (fun ignored -> path_is_at_or_under path ignored) !ignored_paths

let home_dir () =
  match Sys.getenv_opt "HOME" with
  | Some home when home <> "" -> home
  | _ -> failwith "HOME is unavailable"

let xdg_config_home_dir () =
  match Sys.getenv_opt "XDG_CONFIG_HOME" with
  | Some path when path <> "" -> path
  | _ -> concat_path (home_dir ()) ".config"

let xdg_ignore_config_path () = concat_path (xdg_config_home_dir ()) "agent-snapshot/ignore.json"

(* Expands a leading "$HOME" or "$XDG_CONFIG_HOME" in an ignore entry.
   Prefix lengths are derived from the variable names instead of being
   hard-coded, so the two cannot drift apart. *)
let expand_ignore_entry entry =
  let expand variable directory =
    if entry = variable then Some (directory ())
    else
      let prefix = variable ^ "/" in
      if String.starts_with ~prefix entry then
        let skip = String.length prefix in
        Some (concat_path (directory ()) (String.sub entry skip (String.length entry - skip)))
      else None
  in
  match expand "$HOME" home_dir with
  | Some expanded -> expanded
  | None -> (
      match expand "$XDG_CONFIG_HOME" xdg_config_home_dir with
      | Some expanded -> expanded
      | None -> entry)

(* Loads the ignore list; the config file itself is always ignored.
   Fails when the file is missing or is not a JSON array of strings. *)
let load_ignore_config () =
  ignore_config_path := best_effort_canonical (xdg_ignore_config_path ());
  let json =
    try Json.from_file !ignore_config_path
    with Sys_error _ -> failwith ("ignore config does not exist: " ^ !ignore_config_path)
  in
  match json with
  | `List entries ->
      ignored_paths :=
        !ignore_config_path
        :: List.map
             (function
               | `String entry -> best_effort_canonical (expand_ignore_entry entry)
               | _ -> failwith ("ignore config entries must be strings: " ^ !ignore_config_path))
             entries
  | _ -> failwith ("ignore config must be a JSON array: " ^ !ignore_config_path)

(* S_IFMT bits for a file kind, so [mode] matches st_mode as C reports it. *)
let mode_of_kind = function
  | Unix.S_REG -> 0o100000
  | Unix.S_DIR -> 0o040000
  | Unix.S_LNK -> 0o120000
  | Unix.S_CHR -> 0o020000
  | Unix.S_BLK -> 0o060000
  | Unix.S_FIFO -> 0o010000
  | Unix.S_SOCK -> 0o140000

(* lstat-based metadata; [None] when the path cannot be stat'ed. *)
let stat_metadata path =
  try
    let st = Unix.LargeFile.lstat path in
    Some
      {
        exists = true;
        tombstone = false;
        regular = st.st_kind = Unix.S_REG;
        directory = st.st_kind = Unix.S_DIR;
        mode = mode_of_kind st.st_kind lor st.st_perm;
        size = st.st_size;
        mtime = int_of_float st.st_mtime;
        blob = None;
      }
  with Unix.Unix_error _ -> None

let owned_by_other_and_not_writable path =
  try
    let st = Unix.LargeFile.lstat path in
    st.st_uid <> tracer_uid
    &&
    (try
       Unix.access path [ Unix.W_OK ];
       false
     with Unix.Unix_error _ -> true)
  with Unix.Unix_error _ -> false

let writable_by_current_user meta =
  if not meta.exists then true
  else if tracer_uid = 0 then true
  else meta.mode land 0o222 <> 0

(* Nearest existing ancestor of [path] (or [path] itself). *)
let existing_anchor path =
  let rec loop path =
    if path = "" || path = "." then None
    else if Sys.file_exists path then Some path
    else
      let parent = dirname path in
      if parent = path then None else loop parent
  in
  loop path

let relative_path root path =
  let root = best_effort_canonical root in
  let path = normalize_path path in
  let prefix = if String.ends_with ~suffix:"/" root then root else root ^ "/" in
  if path = root then ""
  else if String.starts_with ~prefix path then
    String.sub path (String.length prefix) (String.length path - String.length prefix)
  else path

(* Classifies [input_path] against the enclosing git repository, if any, and
   updates the per-repository summary in [repos]. Git errors yield whatever
   was filled in so far. The status list is queried once per call; the
   matching entry (first with an equal path) drives ignored/dirty/tracked. *)
let classify_git input_path =
  let info = empty_git () in
  let anchor =
    match existing_anchor input_path with
    | Some p -> Some p
    | None -> existing_anchor (dirname input_path)
  in
  match anchor with
  | None -> info
  | Some anchor -> (
      try
        let discovered = Ocaml_git.discover anchor in
        Ocaml_git.with_repo discovered (fun repo ->
            match Ocaml_git.workdir repo with
            | None -> info
            | Some workdir ->
                let root = best_effort_canonical workdir in
                let rel = relative_path root input_path in
                info.in_repo <- true;
                info.root <- root;
                info.relative_path <- rel;
                info.head <- (try (Ocaml_git.head_commit repo).id with Ocaml_git.Git_error _ -> "");
                let status_entry =
                  List.find_opt
                    (fun (entry : Ocaml_git.status_entry) -> entry.path = rel)
                    (Ocaml_git.status repo)
                in
                let ignored, dirty, tracked =
                  match status_entry with
                  | Some (entry : Ocaml_git.status_entry) ->
                      let has flag = List.exists (( = ) flag) entry.flags in
                      let ignored = has Ocaml_git.Ignored in
                      let dirty = entry.flags <> [] && entry.flags <> [ Ocaml_git.Current ] in
                      (ignored, dirty, (not ignored) && not (has Ocaml_git.Worktree_new))
                  | None ->
                      (* No status entry: consult the index to decide tracked-ness. *)
                      let index = Ocaml_git.index repo in
                      let tracked =
                        Fun.protect
                          ~finally:(fun () -> Ocaml_git.close_index index)
                          (fun () -> Ocaml_git.index_contains index rel)
                      in
                      (false, false, tracked)
                in
                info.ignored <- ignored;
                info.tracked <- tracked;
                info.dirty <- dirty;
                let rec_record =
                  match Hashtbl.find_opt repos root with
                  | Some rec_record -> rec_record
                  | None ->
                      let rec_record = { root; head = info.head; dirty = false } in
                      Hashtbl.add repos root rec_record;
                      rec_record
                in
                rec_record.head <- info.head;
                rec_record.dirty <- rec_record.dirty || info.dirty;
                info)
      with Ocaml_git.Git_error _ -> info)

(* 64-bit FNV-1a of the file contents, as 16 lowercase hex digits. *)
let fnv1a_file_digest path =
  let ic = open_in_bin path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () ->
      let hash = ref 0xcbf29ce484222325L in
      (try
         while true do
           let c = input_byte ic in
           hash := Int64.logxor !hash (Int64.of_int c);
           hash := Int64.mul !hash 0x100000001b3L
         done
       with End_of_file -> ());
      Printf.sprintf "%016Lx" !hash)

let copy_file src dst =
  let ic = open_in_bin src in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () ->
      let oc = open_out_bin dst in
      Fun.protect
        ~finally:(fun () -> close_out_noerr oc)
        (fun () ->
          let bytes = Bytes.create 65536 in
          let rec loop () =
            let n = input ic bytes 0 (Bytes.length bytes) in
            if n > 0 then (
              output oc bytes 0 n;
              loop ())
          in
          loop ()))

(* Copies [path] into the blob directory under its digest; returns the digest. *)
let store_blob path =
  let digest = fnv1a_file_digest path in
  let out = concat_path !blob_dir digest in
  if not (Sys.file_exists out) then copy_file path out;
  digest

let should_capture_content path meta git =
  if (not meta.exists) || not meta.regular then false
  else if owned_by_other_and_not_writable path then false
  else if git.in_repo && git.tracked && not git.dirty then false
  else writable_by_current_user meta

(* Records that [operation] touched [raw_path]. The first observation of a
   path also captures its "before" metadata, git state and (maybe) content. *)
let record_observation raw_path operation =
  if raw_path <> "" then
    let path = best_effort_canonical raw_path in
    if not (is_ignored_path path) then (
      let recd =
        match Hashtbl.find_opt files path with
        | Some recd -> recd
        | None ->
            let recd =
              {
                path;
                operations = Hashtbl.create 5;
                before = empty_metadata ();
                after = empty_metadata ();
                before_git = empty_git ();
                after_git = empty_git ();
                before_recorded = false;
              }
            in
            Hashtbl.add files path recd;
            recd
      in
      Hashtbl.replace recd.operations operation ();
      if not recd.before_recorded then (
        recd.before_recorded <- true;
        recd.before <- Option.value (stat_metadata path) ~default:(empty_metadata ());
        recd.before_git <- classify_git path;
        if should_capture_content path recd.before recd.before_git then
          recd.before.blob <- Some (store_blob path)))

(* Fills in the "after" side of every record once the trace has finished. *)
let finalize_records () =
  Hashtbl.iter
    (fun _ recd ->
      if not (is_ignored_path recd.path) then (
        recd.after <- Option.value (stat_metadata recd.path) ~default:(empty_metadata ());
        if not recd.after.exists then recd.after.tombstone <- Hashtbl.mem recd.operations "delete";
        recd.after_git <- classify_git recd.path;
        let written_regular = Hashtbl.mem recd.operations "write" && recd.after.exists && recd.after.regular in
        if
          (written_regular && not (owned_by_other_and_not_writable recd.path))
          || should_capture_content recd.path recd.after recd.after_git
        then recd.after.blob <- Some (store_blob recd.path)))
    files

let metadata_json meta =
  let base = [ ("exists", `Bool meta.exists) ] in
  let base = if meta.tombstone then ("tombstone", `Bool true) :: base else base in
  let base =
    if meta.exists then
      ("type", `String (if meta.directory then "directory" else if meta.regular then "file" else "other"))
      :: ("mode", `Int meta.mode)
      :: ("size", `Intlit (Int64.to_string meta.size))
      :: ("mtime", `Int meta.mtime)
      :: base
    else base
  in
  let base = match meta.blob with Some blob -> ("blob", `String blob) :: base | None -> base in
  `Assoc (List.rev base)

let git_json git =
  if not git.in_repo then `Assoc [ ("in_repo", `Bool false) ]
  else
    `Assoc
      [
        ("in_repo", `Bool true);
        ("root", `String git.root);
        ("head", `String git.head);
        ("relative_path", `String git.relative_path);
        ("tracked", `Bool git.tracked);
        ("dirty", `Bool git.dirty);
        ("ignored", `Bool git.ignored);
      ]

(* Writes <out>/manifest.json; entries are sorted so output is deterministic. *)
let write_manifest out command exit_status =
  let repo_items =
    Hashtbl.fold
      (fun _ repo acc ->
        `Assoc [ ("root", `String repo.root); ("head", `String repo.head); ("dirty", `Bool repo.dirty) ] :: acc)
      repos []
    |> List.sort Stdlib.compare
  in
  let file_items =
    Hashtbl.fold
      (fun _ recd acc ->
        let ops = Hashtbl.fold (fun op () acc -> op :: acc) recd.operations [] |> List.sort String.compare in
        let git = if recd.after_git.in_repo then recd.after_git else recd.before_git in
        `Assoc
          [
            ("path", `String recd.path);
            ("operations", `List (List.map (fun op -> `String op) ops));
            ("before", metadata_json recd.before);
            ("after", metadata_json recd.after);
            ("git", git_json git);
          ]
        :: acc)
      files []
    |> List.sort Stdlib.compare
  in
  let manifest =
    `Assoc
      [
        ("format_version", `Int 1);
        ("command", `List (List.map (fun arg -> `String arg) command));
        ("exit_status", `Int exit_status);
        ("start_cwd", `String (Sys.getcwd ()));
        ("uid", `Int tracer_uid);
        ("gid", `Int tracer_gid);
        ("git_repositories", `List repo_items);
        ("files", `List file_items);
      ]
  in
  Json.to_file ~std:true (concat_path out "manifest.json") manifest

(* Resolves a tracee path argument against its dirfd / cwd. An unknown dirfd
   falls back to the tracked cwd. *)
let resolve_path proc dirfd path =
  if is_absolute path then normalize_path path
  else
    let base =
      if dirfd <> at_fdcwd then Option.value (Hashtbl.find_opt proc.fds dirfd) ~default:proc.cwd else proc.cwd
    in
    concat_path base path

let is_write_open flags =
  let access = flags land o_accmode in
  access = o_wronly || access = o_rdwr || flags land (o_creat lor o_trunc lor o_append) <> 0

let is_read_open flags =
  let access = flags land o_accmode in
  access = o_rdonly || access = o_rdwr

let readlink_opt path = try Some (Unix.readlink path) with Unix.Unix_error _ -> None

let refresh_proc_fd pid proc fd =
  match readlink_opt (Printf.sprintf "/proc/%d/fd/%d" pid fd) with
  | Some target when is_absolute target -> Hashtbl.replace proc.fds fd (best_effort_canonical target)
  | _ -> ()

let refresh_proc_cwd pid proc =
  match readlink_opt (Printf.sprintf "/proc/%d/cwd" pid) with
  | Some target -> proc.cwd <- best_effort_canonical target
  | None -> ()

(* Interprets a 64-bit register as a C [int] argument. The kernel only looks
   at the low 32 bits, and callers commonly leave them zero-extended, so
   AT_FDCWD (-100) would otherwise show up as 4294967196. *)
let int_of_reg value = Int32.to_int (Int64.to_int32 value)

let int_arg regs i = int_of_reg regs.Ptrace.args.(i)

(* openat2 passes a pointer to [struct open_how]; its first 64-bit field is
   the flags word. Returns 0 (read-only) when the struct cannot be read. *)
let open_how_flags pid address =
  if Int64.equal address 0L then 0
  else
    match Ptrace.peek_word pid address with
    | word when String.length word >= 8 -> Int64.to_int (String.get_int64_le word 0)
    | _ -> 0
    | exception Unix.Unix_error _ -> 0

(* Decodes arguments at syscall entry. Destructive operations are recorded
   here so the "before" state is captured while the file still exists. *)
let handle_syscall_entry pid proc regs =
  let p =
    { nr = regs.Ptrace.syscall_nr; args = regs.args; path_a = ""; path_b = ""; dirfd = at_fdcwd; fd = -1; flags = 0 }
  in
  let tracee_string i = Ptrace.read_string pid regs.args.(i) in
  begin
    match p.nr with
    | nr when nr = Syscall.open_ ->
        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0);
        p.flags <- int_arg regs 1;
        if is_write_open p.flags then record_observation p.path_a "write"
    | nr when nr = Syscall.openat ->
        p.dirfd <- int_arg regs 0;
        p.path_a <- resolve_path proc p.dirfd (tracee_string 1);
        p.flags <- int_arg regs 2;
        if is_write_open p.flags then record_observation p.path_a "write"
    | nr when nr = Syscall.openat2 ->
        p.dirfd <- int_arg regs 0;
        p.path_a <- resolve_path proc p.dirfd (tracee_string 1);
        (* Third argument is a struct open_how pointer, not a flags word. *)
        p.flags <- open_how_flags pid regs.args.(2);
        if is_write_open p.flags then record_observation p.path_a "write"
    | nr when nr = Syscall.creat ->
        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0);
        p.flags <- o_creat lor o_wronly lor o_trunc;
        record_observation p.path_a "write"
    | nr when nr = Syscall.stat || nr = Syscall.lstat || nr = Syscall.access || nr = Syscall.readlink ->
        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0)
    | nr
      when nr = Syscall.newfstatat || nr = Syscall.faccessat || nr = Syscall.faccessat2 || nr = Syscall.readlinkat ->
        p.dirfd <- int_arg regs 0;
        p.path_a <- resolve_path proc p.dirfd (tracee_string 1)
    | nr when nr = Syscall.unlink || nr = Syscall.rmdir ->
        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0);
        record_observation p.path_a "delete"
    | nr when nr = Syscall.unlinkat || nr = Syscall.mkdirat ->
        p.dirfd <- int_arg regs 0;
        p.path_a <- resolve_path proc p.dirfd (tracee_string 1);
        if p.nr = Syscall.unlinkat then record_observation p.path_a "delete"
    | nr when nr = Syscall.mkdir || nr = Syscall.chdir || nr = Syscall.truncate ->
        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0)
    | nr when nr = Syscall.rename ->
        p.path_a <- resolve_path proc at_fdcwd (tracee_string 0);
        p.path_b <- resolve_path proc at_fdcwd (tracee_string 1);
        record_observation p.path_a "delete";
        record_observation p.path_b "write"
    | nr when nr = Syscall.renameat || nr = Syscall.renameat2 ->
        p.path_a <- resolve_path proc (int_arg regs 0) (tracee_string 1);
        p.path_b <- resolve_path proc (int_arg regs 2) (tracee_string 3);
        record_observation p.path_a "delete";
        record_observation p.path_b "write"
    | nr when nr = Syscall.getdents || nr = Syscall.getdents64 || nr = Syscall.fchdir || nr = Syscall.ftruncate ->
        p.fd <- int_arg regs 0
    | nr
      when nr = Syscall.close || nr = Syscall.dup || nr = Syscall.dup2 || nr = Syscall.dup3 || nr = Syscall.fcntl ->
        p.fd <- int_arg regs 0
    | _ -> ()
  end;
  proc.pending <- Some p

let syscall_ok result = Int64.compare result 0L >= 0

(* Completes the syscall captured at entry: records observations that depend
   on the result and keeps the fd table / cwd in sync. *)
let handle_syscall_exit pid proc regs =
  match proc.pending with
  | None -> ()
  | Some p ->
      let ok = syscall_ok regs.Ptrace.result in
      begin
        match p.nr with
        | nr when nr = Syscall.open_ || nr = Syscall.openat || nr = Syscall.openat2 || nr = Syscall.creat ->
            if ok then (
              if is_read_open p.flags then record_observation p.path_a "read";
              if is_write_open p.flags then record_observation p.path_a "write";
              if p.flags land o_directory <> 0 then record_observation p.path_a "directory";
              refresh_proc_fd pid proc (Int64.to_int regs.result))
            else record_observation p.path_a "existence"
        | nr
          when nr = Syscall.stat || nr = Syscall.lstat || nr = Syscall.newfstatat || nr = Syscall.access
               || nr = Syscall.faccessat || nr = Syscall.faccessat2 || nr = Syscall.readlink
               || nr = Syscall.readlinkat ->
            record_observation p.path_a "existence"
        | nr when nr = Syscall.getdents || nr = Syscall.getdents64 ->
            if ok && p.fd >= 0 then
              Option.iter (fun path -> record_observation path "directory") (Hashtbl.find_opt proc.fds p.fd)
        | nr when nr = Syscall.unlink || nr = Syscall.unlinkat || nr = Syscall.rmdir ->
            record_observation p.path_a "delete"
        | nr when nr = Syscall.rename || nr = Syscall.renameat || nr = Syscall.renameat2 ->
            record_observation p.path_a "delete";
            record_observation p.path_b "write"
        | nr when nr = Syscall.mkdir || nr = Syscall.mkdirat || nr = Syscall.truncate ->
            record_observation p.path_a "write"
        | nr when nr = Syscall.ftruncate ->
            if p.fd >= 0 then
              Option.iter (fun path -> record_observation path "write") (Hashtbl.find_opt proc.fds p.fd)
        | nr when nr = Syscall.chdir || nr = Syscall.fchdir -> if ok then refresh_proc_cwd pid proc
        | nr when nr = Syscall.close -> if ok then Hashtbl.remove proc.fds p.fd
        | nr when nr = Syscall.dup ->
            if ok then
              Option.iter
                (fun path -> Hashtbl.replace proc.fds (Int64.to_int regs.result) path)
                (Hashtbl.find_opt proc.fds p.fd)
        | nr when nr = Syscall.dup2 || nr = Syscall.dup3 ->
            if ok then
              Option.iter
                (fun path -> Hashtbl.replace proc.fds (int_of_reg p.args.(1)) path)
                (Hashtbl.find_opt proc.fds p.fd)
        | nr when nr = Syscall.fcntl ->
            let cmd = int_of_reg p.args.(1) in
            if ok && (cmd = f_dupfd || cmd = f_dupfd_cloexec) then
              Option.iter
                (fun path -> Hashtbl.replace proc.fds (Int64.to_int regs.result) path)
                (Hashtbl.find_opt proc.fds p.fd)
        | _ -> ()
      end

(* A forked child starts with a copy of the parent's cwd and fd table. *)
let clone_proc_state state =
  let fds = Hashtbl.create (Hashtbl.length state.fds) in
  Hashtbl.iter (fun fd path -> Hashtbl.add fds fd path) state.fds;
  { cwd = state.cwd; fds; pending = state.pending }

(* Runs [command] under the tracer and feeds every event into the recorder. *)
let trace_command command =
  Ptrace.trace command (function
    | Ptrace.Fork { parent; child } ->
        let state =
          match Hashtbl.find_opt processes parent with
          | Some state -> clone_proc_state state
          | None -> { cwd = Sys.getcwd (); fds = Hashtbl.create 8; pending = None }
        in
        Hashtbl.replace processes child state
    | Ptrace.Process_exit pid -> Hashtbl.remove processes pid
    | Ptrace.Syscall_enter (pid, regs) ->
        let state =
          match Hashtbl.find_opt processes pid with
          | Some state -> state
          | None ->
              let state = { cwd = Sys.getcwd (); fds = Hashtbl.create 8; pending = None } in
              refresh_proc_cwd pid state;
              Hashtbl.add processes pid state;
              state
        in
        handle_syscall_entry pid state regs
    | Ptrace.Syscall_exit (pid, regs) ->
        Option.iter (fun state -> handle_syscall_exit pid state regs) (Hashtbl.find_opt processes pid)
    | Ptrace.Exec pid | Ptrace.Exit pid | Ptrace.Signal (pid, _) ->
        if not (Hashtbl.mem processes pid) then (
          let state = { cwd = Sys.getcwd (); fds = Hashtbl.create 8; pending = None } in
          refresh_proc_cwd pid state;
          Hashtbl.add processes pid state))

(* Recursively deletes [path]. Uses lstat so symlinks are removed as links:
   a symlink to a directory is never descended into, and a dangling symlink
   is still deleted. A missing path is not an error. *)
let rec remove_all path =
  match Unix.lstat path with
  | exception Unix.Unix_error ((Unix.ENOENT | Unix.ENOTDIR), _, _) -> ()
  | { Unix.st_kind = Unix.S_DIR; _ } ->
      Array.iter
        (fun name -> if name <> "." && name <> ".." then remove_all (concat_path path name))
        (Sys.readdir path);
      Unix.rmdir path
  | _ -> Unix.unlink path

(* Replays the "after" state recorded in <dir>/manifest.json:
   tombstoned paths are removed, blobbed files are rewritten (only when their
   content differs) and their mode / mtime are reapplied. Entries without a
   blob are manifest references only and are left untouched. *)
let restore_snapshot dir =
  let manifest = Json.from_file (concat_path dir "manifest.json") in
  let files =
    match manifest with
    | `Assoc fields -> (match List.assoc_opt "files" fields with Some (`List files) -> files | _ -> [])
    | _ -> []
  in
  let bool_field name (fields : (string * Json.t) list) =
    match List.assoc_opt name fields with Some (`Bool b) -> b | _ -> false
  in
  (* A tombstone means "must not exist"; already-missing is fine. The path may
     have been a file or an (empty) directory removed with rmdir. *)
  let remove_best_effort path =
    try Unix.unlink path
    with Unix.Unix_error _ -> ( try Unix.rmdir path with Unix.Unix_error _ -> ())
  in
  List.iter
    (fun file_json ->
      match file_json with
      | `Assoc item -> (
          match (List.assoc_opt "path" item, List.assoc_opt "after" item : Json.t option * Json.t option) with
          | Some (`String path), Some (`Assoc after) ->
              if not (bool_field "exists" after) then (
                if bool_field "tombstone" after then remove_best_effort path)
              else (
                match List.assoc_opt "blob" after with
                | None -> ()
                | Some (`String digest) ->
                    mkdir_p (dirname path);
                    (* Skip identical files: keeps restore idempotent and avoids
                       rewriting files that may be busy. *)
                    let same =
                      Sys.file_exists path && (not (Sys.is_directory path)) && fnv1a_file_digest path = digest
                    in
                    if not same then (
                      let tmp = path ^ ".agent-snapshot.tmp" in
                      copy_file (concat_path (concat_path dir "blobs") digest) tmp;
                      Unix.rename tmp path);
                    (match List.assoc_opt "mode" after with
                    | Some (`Int mode) -> Unix.chmod path (mode land 0o7777)
                    | _ -> ());
                    (match List.assoc_opt "mtime" after with
                    | Some (`Int mtime) ->
                        (* Keep the current atime. Passing the mtime for both
                           would clobber atime and, for mtime = 0, hit the
                           utimes "0.0, 0.0 means now" special case. *)
                        let mtime = float_of_int mtime in
                        let atime = try (Unix.stat path).Unix.st_atime with Unix.Unix_error _ -> mtime in
                        Unix.utimes path atime mtime
                    | _ -> ())
                | _ -> ())
          | _ -> ())
      | _ -> ())
    files

(* Parses "--output SNAPDIR -- command args..."; fails with the usage text. *)
let parse_snapshot_args args =
  let rec loop output = function
    | "--output" :: value :: rest -> loop (Some value) rest
    | "--" :: command -> (output, command)
    | _ -> failwith "usage: agent-snapshot --output SNAPDIR -- command args..."
  in
  match loop None args with
  | Some output, (_ :: _ as command) -> (output, command)
  | _ -> failwith "usage: agent-snapshot --output SNAPDIR -- command args..."

(* Snapshot mode. The output directory is owned by the bundle and is wiped
   first so stale blobs or manifest entries cannot leak into this run. *)
let run_snapshot args =
  load_ignore_config ();
  let output, command = parse_snapshot_args args in
  snapshot_dir := output;
  blob_dir := concat_path output "blobs";
  remove_all output;
  mkdir_p !blob_dir;
  trace_command command;
  finalize_records ();
  write_manifest output command 0;
  0

let main () =
  try
    let args = Array.to_list Sys.argv |> List.tl in
    let rc =
      match args with
      | [ "restore"; dir ] ->
          restore_snapshot dir;
          0
      | "restore" :: _ -> failwith "usage: agent-snapshot restore SNAPDIR"
      | _ -> run_snapshot args
    in
    Ocaml_git.shutdown ();
    exit rc
  with exn ->
    Ocaml_git.shutdown ();
    Printf.eprintf "agent-snapshot: %s\n%!" (Printexc.to_string exn);
    exit 1

let () = main ()
@@ -0,0 +1,7 @@ +(executable + (name agent_snapshot) + (public_name agent-snapshot) + (foreign_stubs + (language c) + (names ptrace_stubs)) + (libraries unix yojson ocaml-git))
(* Thin OCaml layer over the ptrace C stubs: forks a tracee, follows its
   children, and turns raw wait statuses into typed events. *)

type pid = int

(* Register snapshot at a syscall stop: number, six arguments, return value. *)
type regs = {
  syscall_nr : int;
  args : int64 array;
  result : int64;
}

(* Constructor order is part of the C interface: as_wait allocates blocks
   with tags 0, 1 and 2 for these three cases. *)
type wait_stop =
  | Exited of pid * int
  | Signaled of pid * int
  | Stopped of pid * int * int

type event =
  | Syscall_enter of pid * regs
  | Syscall_exit of pid * regs
  | Fork of { parent : pid; child : pid }
  | Exec of pid
  | Exit of pid
  | Signal of pid * int
  | Process_exit of pid

(* Syscall stops alternate entry/exit; this flag remembers which is next. *)
type task = {
  mutable in_syscall : bool;
}

external fork : unit -> int = "as_fork"
external traceme : unit -> unit = "as_traceme"
external setoptions : pid -> unit = "as_setoptions"
external syscall : pid -> int -> unit = "as_syscall"
external geteventmsg : pid -> int = "as_geteventmsg"
external getregs_raw : pid -> int * int64 * int64 * int64 * int64 * int64 * int64 * int64 = "as_getregs"
external peek_word : pid -> int64 -> string = "as_peek_word"
external wait_raw : pid -> bool -> wait_stop = "as_wait"
external const_sigtrap_sysgood : unit -> int = "as_const_sigtrap_sysgood"
external const_sigtrap : unit -> int = "as_const_sigtrap"
external const_event_fork : unit -> int = "as_const_event_fork"
external const_event_vfork : unit -> int = "as_const_event_vfork"
external const_event_clone : unit -> int = "as_const_event_clone"
external const_event_exec : unit -> int = "as_const_event_exec"
external const_event_exit : unit -> int = "as_const_event_exit"

let regs pid =
  let nr, a0, a1, a2, a3, a4, a5, result = getregs_raw pid in
  { syscall_nr = nr; args = [| a0; a1; a2; a3; a4; a5 |]; result }

(* Reads a NUL-terminated string from the tracee, at most 65536 bytes.
   Returns "" for a NULL address and whatever was read so far when the
   tracee's memory becomes unreadable.

   Reads are word-aligned: the first word is fetched from the aligned address
   at or below [address] and the leading bytes are skipped. An aligned word
   never straddles a page boundary, so a string that ends in the last bytes
   of a mapped page is not truncated by a failing read into the next,
   unmapped page. *)
let read_string pid address =
  if Int64.equal address 0L then ""
  else
    let max_len = 65536 in
    let word_size = Sys.word_size / 8 in
    let buffer = Buffer.create 64 in
    let misalignment = Int64.to_int (Int64.logand address (Int64.of_int (word_size - 1))) in
    let first_word = Int64.sub address (Int64.of_int misalignment) in
    let rec loop word_address skip =
      match peek_word pid word_address with
      | exception Unix.Unix_error _ -> Buffer.contents buffer
      | word ->
          let rec scan i =
            if Buffer.length buffer >= max_len then Buffer.contents buffer
            else if i >= String.length word then loop (Int64.add word_address (Int64.of_int word_size)) 0
            else
              let c = word.[i] in
              if Char.equal c '\000' then Buffer.contents buffer
              else (
                Buffer.add_char buffer c;
                scan (i + 1))
          in
          scan skip
    in
    loop first_word misalignment

let wait_initial pid = wait_raw pid false

let wait_next () = wait_raw (-1) true

let resume ?(signal = 0) pid = syscall pid signal

(* ESRCH means the tracee is gone or not currently stopped; both are benign
   for these best-effort calls. *)
let try_setoptions pid =
  try setoptions pid with Unix.Unix_error (Unix.ESRCH, _, _) -> ()

let try_resume ?(signal = 0) pid =
  try resume ~signal pid with Unix.Unix_error (Unix.ESRCH, _, _) -> ()

let is_fork_event event =
  event = const_event_fork () || event = const_event_vfork () || event = const_event_clone ()

(* Translates one wait status into events and updates the task table.
   Fork-like events register (and try to resume) the new child here; resuming
   the reporting pid is left to the caller. *)
let decode_stop tasks = function
  | Exited (pid, _) | Signaled (pid, _) ->
      Hashtbl.remove tasks pid;
      [ Process_exit pid ]
  | Stopped (pid, _signal, event) when is_fork_event event ->
      let child = geteventmsg pid in
      let parent_task =
        match Hashtbl.find_opt tasks pid with
        | Some task -> { in_syscall = task.in_syscall }
        | None -> { in_syscall = false }
      in
      Hashtbl.replace tasks child parent_task;
      try_setoptions child;
      try_resume child;
      [ Fork { parent = pid; child } ]
  | Stopped (pid, _signal, event) when event = const_event_exec () -> [ Exec pid ]
  | Stopped (pid, _signal, event) when event = const_event_exit () -> [ Exit pid ]
  | Stopped (pid, signal, _event) when signal = const_sigtrap_sysgood () -> (
      let task =
        match Hashtbl.find_opt tasks pid with
        | Some task -> task
        | None ->
            let task = { in_syscall = false } in
            Hashtbl.replace tasks pid task;
            task
      in
      let regs = regs pid in
      if task.in_syscall then (
        task.in_syscall <- false;
        [ Syscall_exit (pid, regs) ])
      else (
        task.in_syscall <- true;
        [ Syscall_enter (pid, regs) ]))
  | Stopped (pid, signal, _event) ->
      (* A plain SIGTRAP is swallowed; other signals are forwarded on resume. *)
      [ Signal (pid, if signal = const_sigtrap () then 0 else signal) ]

(* Runs [command] under ptrace and calls [on_event] for every decoded event
   until no traced task remains.
   Raises [Invalid_argument] for an empty command and [Failure] when the
   tracee does not stop at startup. *)
let trace command on_event =
  match command with
  | [] -> invalid_arg "empty command"
  | argv0 :: _ ->
      let child = fork () in
      if child = 0 then (
        (* Child side. Any failure must terminate this process immediately:
           letting the exception propagate would run the tracer's own error
           handling and exit hooks in the forked copy. *)
        try
          traceme ();
          Unix.kill (Unix.getpid ()) Sys.sigstop;
          Unix.execvp argv0 (Array.of_list command)
        with exn ->
          Printf.eprintf "agent-snapshot: cannot execute %s: %s\n%!" argv0 (Printexc.to_string exn);
          Unix._exit 127)
      else (
        match wait_initial child with
        | Stopped _ ->
            setoptions child;
            let tasks = Hashtbl.create 8 in
            Hashtbl.add tasks child { in_syscall = false };
            try_resume child;
            while Hashtbl.length tasks > 0 do
              match wait_next () with
              | exception Unix.Unix_error (Unix.ECHILD, _, _) -> Hashtbl.clear tasks
              | stop ->
                  let pid =
                    match stop with
                    | Exited (pid, _) | Signaled (pid, _) | Stopped (pid, _, _) -> pid
                  in
                  let events = decode_stop tasks stop in
                  List.iter on_event events;
                  if Hashtbl.mem tasks pid then
                    let signal =
                      match events with
                      | [ Signal (_, signal) ] -> signal
                      | _ -> 0
                    in
                    try_resume ~signal pid
            done
        | _ -> failwith "tracee did not stop at startup")
@@ -0,0 +1,156 @@ +#include <caml/alloc.h> +#include <caml/fail.h> +#include <caml/memory.h> +#include <caml/unixsupport.h> +#include <caml/mlvalues.h> + +#include <errno.h> +#include <signal.h> +#include <stdint.h> +#include <string.h> +#include <sys/ptrace.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/user.h> +#include <sys/wait.h> +#include <unistd.h> + +static void raise_unix_error(const char *call) { + uerror(call, Nothing); +} + +CAMLprim value as_fork(value unit) { + CAMLparam1(unit); + pid_t pid = fork(); + if (pid < 0) raise_unix_error("fork"); + CAMLreturn(Val_int(pid)); +} + +CAMLprim value as_traceme(value unit) { + CAMLparam1(unit); + if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0) raise_unix_error("ptrace_traceme"); + CAMLreturn(Val_unit); +} + +CAMLprim value as_setoptions(value pid_v) { + CAMLparam1(pid_v); + long options = PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK | + PTRACE_O_TRACECLONE | PTRACE_O_TRACEEXEC | PTRACE_O_TRACEEXIT; + if (ptrace(PTRACE_SETOPTIONS, Int_val(pid_v), NULL, (void *)options) != 0) { + raise_unix_error("ptrace_setoptions"); + } + CAMLreturn(Val_unit); +} + +CAMLprim value as_syscall(value pid_v, value signal_v) { + CAMLparam2(pid_v, signal_v); + if (ptrace(PTRACE_SYSCALL, Int_val(pid_v), NULL, (void *)(long)Int_val(signal_v)) != 0) { + raise_unix_error("ptrace_syscall"); + } + CAMLreturn(Val_unit); +} + +CAMLprim value as_geteventmsg(value pid_v) { + CAMLparam1(pid_v); + unsigned long msg = 0; + if (ptrace(PTRACE_GETEVENTMSG, Int_val(pid_v), NULL, &msg) != 0) { + raise_unix_error("ptrace_geteventmsg"); + } + CAMLreturn(Val_int((int)msg)); +} + +CAMLprim value as_getregs(value pid_v) { + CAMLparam1(pid_v); + CAMLlocal1(tuple); + struct user_regs_struct regs; + if (ptrace(PTRACE_GETREGS, Int_val(pid_v), NULL, ®s) != 0) { + raise_unix_error("ptrace_getregs"); + } + tuple = caml_alloc_tuple(8); + Store_field(tuple, 0, Val_int((int)regs.orig_rax)); + Store_field(tuple, 1, 
caml_copy_int64((int64_t)regs.rdi)); + Store_field(tuple, 2, caml_copy_int64((int64_t)regs.rsi)); + Store_field(tuple, 3, caml_copy_int64((int64_t)regs.rdx)); + Store_field(tuple, 4, caml_copy_int64((int64_t)regs.r10)); + Store_field(tuple, 5, caml_copy_int64((int64_t)regs.r8)); + Store_field(tuple, 6, caml_copy_int64((int64_t)regs.r9)); + Store_field(tuple, 7, caml_copy_int64((int64_t)regs.rax)); + CAMLreturn(tuple); +} + +CAMLprim value as_peek_word(value pid_v, value addr_v) { + CAMLparam2(pid_v, addr_v); + CAMLlocal1(out); + union { + long value; + char bytes[sizeof(long)]; + } data; + errno = 0; + data.value = ptrace(PTRACE_PEEKDATA, Int_val(pid_v), (void *)(uintptr_t)Int64_val(addr_v), NULL); + if (errno != 0) raise_unix_error("ptrace_peekdata"); + out = caml_alloc_string(sizeof(long)); + memcpy(Bytes_val(out), data.bytes, sizeof(long)); + CAMLreturn(out); +} + +CAMLprim value as_wait(value pid_v, value wall_v) { + CAMLparam2(pid_v, wall_v); + CAMLlocal1(result); + int status = 0; + int options = Bool_val(wall_v) ? 
__WALL : 0; + pid_t pid; + do { + pid = waitpid(Int_val(pid_v), &status, options); + } while (pid < 0 && errno == EINTR); + if (pid < 0) raise_unix_error("waitpid"); + if (WIFEXITED(status)) { + result = caml_alloc(2, 0); + Store_field(result, 0, Val_int(pid)); + Store_field(result, 1, Val_int(WEXITSTATUS(status))); + } else if (WIFSIGNALED(status)) { + result = caml_alloc(2, 1); + Store_field(result, 0, Val_int(pid)); + Store_field(result, 1, Val_int(WTERMSIG(status))); + } else { + result = caml_alloc(3, 2); + Store_field(result, 0, Val_int(pid)); + Store_field(result, 1, Val_int(WSTOPSIG(status))); + Store_field(result, 2, Val_int(status >> 16)); + } + CAMLreturn(result); +} + +CAMLprim value as_const_sigtrap_sysgood(value unit) { + CAMLparam1(unit); + CAMLreturn(Val_int(SIGTRAP | 0x80)); +} + +CAMLprim value as_const_sigtrap(value unit) { + CAMLparam1(unit); + CAMLreturn(Val_int(SIGTRAP)); +} + +CAMLprim value as_const_event_fork(value unit) { + CAMLparam1(unit); + CAMLreturn(Val_int(PTRACE_EVENT_FORK)); +} + +CAMLprim value as_const_event_vfork(value unit) { + CAMLparam1(unit); + CAMLreturn(Val_int(PTRACE_EVENT_VFORK)); +} + +CAMLprim value as_const_event_clone(value unit) { + CAMLparam1(unit); + CAMLreturn(Val_int(PTRACE_EVENT_CLONE)); +} + +CAMLprim value as_const_event_exec(value unit) { + CAMLparam1(unit); + CAMLreturn(Val_int(PTRACE_EVENT_EXEC)); +} + +CAMLprim value as_const_event_exit(value unit) { + CAMLparam1(unit); + CAMLreturn(Val_int(PTRACE_EVENT_EXIT)); +}
@@ -8,8 +8,7 @@ import pytest ROOT = Path(__file__).resolve().parents[1] -BUILD = ROOT / "build" / "pytest" -BIN = BUILD / "agent-snapshot" +BIN = ROOT / "_build" / "default" / "src" / "ocaml" / "agent_snapshot.exe" TESTDATA = ROOT / "testdata" WORKTREE = TESTDATA / "runtime_repo" # Use the system Python rather than uv's managed interpreter. The snapshotter @@ -28,9 +27,8 @@ def run(cmd, **kwargs): def build_agent_snapshot(): # The tests exercise the real CLI binary instead of calling internal helper # functions. That keeps the acceptance criteria aligned with ptrace behavior, - # process launch, CMake wiring, and manifest writing as users will run them. - run(["cmake", "-S", ".", "-B", str(BUILD)]) - run(["cmake", "--build", str(BUILD), "--parallel"]) + # process launch, Dune wiring, and manifest writing as users will run them. + run(["bash", "-lc", ". /scratch/arjun/ocaml/env.sh && dune build src/ocaml/agent_snapshot.exe"]) assert BIN.exists() @@ -349,7 +347,7 @@ def test_text_peculiar_file_names_are_recorded_and_blobbed(tmp_path): assert snap.blob_text(newline["before"]["blob"]) == "newline payload\n" -@pytest.mark.skip(reason="nlohmann/json rejects non-UTF-8 std::string values when dumping JSON") +@pytest.mark.skip(reason="manifest paths are still not valid UTF-8 for non-UTF-8 filenames") def test_non_utf8_filename_exposes_json_string_limitation(tmp_path): bytes_path = os.path.join(os.fsencode(WORKTREE), b"non-utf8-\xff.txt") with open(bytes_path, "wb") as handle:
@@ -0,0 +1 @@ +../../../homebox/ocaml-git \ No newline at end of file