Repositories / agent-snapshot.git
agent-snapshot.git
Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git
@@ -0,0 +1,6 @@ +build/ +.pytest_cache/ +.venv/ +__pycache__/ +*.pyc +snapshot-*/
# Build script for the agent-snapshot executable (single C++ translation unit).
cmake_minimum_required(VERSION 3.28)
project(agent_snapshot VERSION 0.1.0 LANGUAGES CXX)

include(FetchContent)

# Strict C++20 with compiler extensions disabled.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# libgit2 is taken from the system through pkg-config and exposed as the
# imported target PkgConfig::LIBGIT2.
find_package(PkgConfig REQUIRED)
pkg_check_modules(LIBGIT2 REQUIRED IMPORTED_TARGET libgit2)

# nlohmann/json is downloaded at configure time; the archive is pinned by
# its SHA256 hash.
FetchContent_Declare(
  nlohmann_json
  URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
  URL_HASH SHA256=d6c65aca6b1ed68e7a182f4757257b107ae403032760ed6ef121c9d55e81757d
)
FetchContent_MakeAvailable(nlohmann_json)

add_executable(agent-snapshot
  src/main.cpp
)

target_include_directories(agent-snapshot PRIVATE include)
target_link_libraries(agent-snapshot PRIVATE PkgConfig::LIBGIT2 nlohmann_json::nlohmann_json)
target_compile_options(agent-snapshot PRIVATE -Wall -Wextra -Wpedantic)
@@ -0,0 +1,14 @@ +[project] +name = "agent-snapshot" +version = "0.1.0" +requires-python = ">=3.12" +dependencies = [] + +[dependency-groups] +dev = [ + "pytest>=8.0", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-ra"
@@ -0,0 +1,728 @@ +#include <nlohmann/json.hpp> +#include <git2.h> + +#include <sys/ptrace.h> +#include <sys/reg.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/user.h> +#include <sys/wait.h> + +#include <fcntl.h> +#include <signal.h> +#include <unistd.h> + +#include <cerrno> +#include <chrono> +#include <cstring> +#include <filesystem> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <map> +#include <optional> +#include <set> +#include <sstream> +#include <stdexcept> +#include <string> +#include <unordered_map> +#include <vector> + +using json = nlohmann::json; +namespace fs = std::filesystem; + +namespace { + +constexpr int kPtraceOptions = PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK | + PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE | + PTRACE_O_TRACEEXEC | PTRACE_O_TRACEEXIT; + +struct Metadata { + bool exists = false; + bool tombstone = false; + bool regular = false; + bool directory = false; + mode_t mode = 0; + uintmax_t size = 0; + std::time_t mtime = 0; + std::string blob; +}; + +struct GitInfo { + bool in_repo = false; + bool tracked = false; + bool dirty = false; + bool ignored = false; + std::string root; + std::string head; + std::string relative_path; +}; + +struct FileRecord { + std::string path; + std::set<std::string> operations; + Metadata before; + Metadata after; + GitInfo before_git; + GitInfo after_git; + bool before_recorded = false; +}; + +struct PendingSyscall { + long nr = -1; + std::array<unsigned long long, 6> args{}; + std::string path_a; + std::string path_b; + int dirfd = AT_FDCWD; + int fd = -1; + int flags = 0; +}; + +struct ProcState { + bool in_syscall = false; + fs::path cwd; + std::map<int, fs::path> fds; + PendingSyscall pending; +}; + +struct RepoRecord { + std::string root; + std::string head; + bool dirty = false; +}; + +std::unordered_map<pid_t, ProcState> processes; +std::map<std::string, FileRecord> files; +std::map<std::string, RepoRecord> 
repos; +fs::path snapshot_dir; +fs::path blob_dir; +uid_t tracer_uid = 0; +gid_t tracer_gid = 0; + +std::string errno_message(const std::string& prefix) { + return prefix + ": " + std::strerror(errno); +} + +std::string readlink_string(const fs::path& path) { + std::vector<char> buffer(4096); + ssize_t n = readlink(path.c_str(), buffer.data(), buffer.size() - 1); + if (n < 0) return {}; + buffer[static_cast<size_t>(n)] = '\0'; + return std::string(buffer.data()); +} + +fs::path lexical_abs(const fs::path& path, const fs::path& base) { + if (path.is_absolute()) return path.lexically_normal(); + return (base / path).lexically_normal(); +} + +fs::path best_effort_canonical(const fs::path& path) { + std::error_code ec; + fs::path canonical = fs::weakly_canonical(path, ec); + if (!ec && !canonical.empty()) return canonical.lexically_normal(); + if (path.is_absolute()) return path.lexically_normal(); + return fs::absolute(path, ec).lexically_normal(); +} + +std::string read_tracee_string(pid_t pid, unsigned long long address) { + if (address == 0) return {}; + std::string out; + union { + long value; + char chars[sizeof(long)]; + } data{}; + for (size_t offset = 0; offset < 65536; offset += sizeof(long)) { + errno = 0; + data.value = ptrace(PTRACE_PEEKDATA, pid, address + offset, nullptr); + if (errno != 0) break; + for (char c : data.chars) { + if (c == '\0') return out; + out.push_back(c); + } + } + return out; +} + +std::optional<Metadata> stat_metadata(const fs::path& path) { + struct stat st {}; + if (lstat(path.c_str(), &st) != 0) return std::nullopt; + Metadata meta; + meta.exists = true; + meta.mode = st.st_mode; + meta.size = static_cast<uintmax_t>(st.st_size); + meta.mtime = st.st_mtim.tv_sec; + meta.regular = S_ISREG(st.st_mode); + meta.directory = S_ISDIR(st.st_mode); + return meta; +} + +bool writable_by_current_user(const Metadata& meta) { + if (!meta.exists) return true; + const mode_t mode = meta.mode; + if (tracer_uid == 0) return true; + struct stat st 
{}; + (void)st; + return (mode & S_IWUSR) || (mode & S_IWGRP) || (mode & S_IWOTH); +} + +bool owned_by_other_and_not_writable(const fs::path& path) { + struct stat st {}; + if (lstat(path.c_str(), &st) != 0) return false; + if (st.st_uid == tracer_uid) return false; + if (st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) return false; + return true; +} + +fs::path existing_anchor(fs::path path) { + std::error_code ec; + if (fs::exists(path, ec)) return path; + while (!path.empty() && path != path.root_path()) { + path = path.parent_path(); + if (fs::exists(path, ec)) return path; + } + return {}; +} + +std::string oid_to_string(const git_oid* oid) { + char out[GIT_OID_HEXSZ + 1] = {}; + git_oid_tostr(out, sizeof(out), oid); + return out; +} + +std::string repo_head(git_repository* repo) { + git_reference* head = nullptr; + if (git_repository_head(&head, repo) != 0) return {}; + const git_oid* oid = git_reference_target(head); + std::string result = oid ? oid_to_string(oid) : ""; + git_reference_free(head); + return result; +} + +GitInfo classify_git(const fs::path& input_path) { + GitInfo info; + fs::path anchor = existing_anchor(input_path); + if (anchor.empty()) anchor = existing_anchor(input_path.parent_path()); + if (anchor.empty()) return info; + + git_repository* repo = nullptr; + if (git_repository_open_ext(&repo, anchor.c_str(), 0, nullptr) != 0) return info; + + const char* workdir = git_repository_workdir(repo); + if (!workdir) { + git_repository_free(repo); + return info; + } + + info.in_repo = true; + info.root = best_effort_canonical(workdir).string(); + info.head = repo_head(repo); + + std::error_code ec; + fs::path rel = fs::relative(input_path, info.root, ec); + if (!ec) info.relative_path = rel.string(); + + unsigned int status = 0; + if (!info.relative_path.empty() && + git_status_file(&status, repo, info.relative_path.c_str()) == 0) { + info.ignored = status & GIT_STATUS_IGNORED; + info.tracked = !(status & GIT_STATUS_WT_NEW) && !(status & 
GIT_STATUS_IGNORED); + info.dirty = status != GIT_STATUS_CURRENT; + } else if (!info.relative_path.empty()) { + info.tracked = false; + info.dirty = true; + } + + RepoRecord& rec = repos[info.root]; + rec.root = info.root; + rec.head = info.head; + rec.dirty = rec.dirty || info.dirty; + + git_repository_free(repo); + return info; +} + +std::string fnv1a_file_digest(const fs::path& path) { + std::ifstream in(path, std::ios::binary); + uint64_t hash = 1469598103934665603ULL; + char c; + while (in.get(c)) { + hash ^= static_cast<unsigned char>(c); + hash *= 1099511628211ULL; + } + std::ostringstream out; + out << std::hex << std::setw(16) << std::setfill('0') << hash; + return out.str(); +} + +std::string store_blob(const fs::path& path) { + std::string digest = fnv1a_file_digest(path); + fs::path out = blob_dir / digest; + if (!fs::exists(out)) { + fs::copy_file(path, out, fs::copy_options::overwrite_existing); + } + return digest; +} + +bool should_capture_content(const fs::path& path, const Metadata& meta, const GitInfo& git) { + if (!meta.exists || !meta.regular) return false; + if (owned_by_other_and_not_writable(path)) return false; + if (git.in_repo && git.tracked && !git.dirty) return false; + return writable_by_current_user(meta); +} + +json metadata_json(const Metadata& meta) { + json j; + j["exists"] = meta.exists; + if (meta.tombstone) j["tombstone"] = true; + if (meta.exists) { + j["type"] = meta.directory ? "directory" : (meta.regular ? 
"file" : "other"); + j["mode"] = meta.mode; + j["size"] = meta.size; + j["mtime"] = meta.mtime; + } + if (!meta.blob.empty()) j["blob"] = meta.blob; + return j; +} + +json git_json(const GitInfo& git) { + json j; + j["in_repo"] = git.in_repo; + if (git.in_repo) { + j["root"] = git.root; + j["head"] = git.head; + j["relative_path"] = git.relative_path; + j["tracked"] = git.tracked; + j["dirty"] = git.dirty; + j["ignored"] = git.ignored; + } + return j; +} + +void record_observation(const fs::path& raw_path, const std::string& operation) { + if (raw_path.empty()) return; + fs::path path = best_effort_canonical(raw_path); + std::string key = path.string(); + FileRecord& rec = files[key]; + rec.path = key; + rec.operations.insert(operation); + if (!rec.before_recorded) { + rec.before_recorded = true; + rec.before = stat_metadata(path).value_or(Metadata{}); + rec.before_git = classify_git(path); + if (should_capture_content(path, rec.before, rec.before_git)) { + rec.before.blob = store_blob(path); + } + } +} + +void finalize_records() { + for (auto& [_, rec] : files) { + fs::path path(rec.path); + rec.after = stat_metadata(path).value_or(Metadata{}); + if (!rec.after.exists) rec.after.tombstone = rec.operations.count("delete") > 0; + rec.after_git = classify_git(path); + if (should_capture_content(path, rec.after, rec.after_git)) { + rec.after.blob = store_blob(path); + } + } +} + +fs::path resolve_path(const ProcState& proc, int dirfd, const std::string& path) { + fs::path p(path); + if (p.is_absolute()) return p; + fs::path base = proc.cwd; + if (dirfd != AT_FDCWD) { + auto it = proc.fds.find(dirfd); + if (it != proc.fds.end()) base = it->second; + } + return lexical_abs(p, base); +} + +bool is_write_open(int flags) { + int access = flags & O_ACCMODE; + return access == O_WRONLY || access == O_RDWR || (flags & (O_CREAT | O_TRUNC | O_APPEND)); +} + +bool is_read_open(int flags) { + int access = flags & O_ACCMODE; + return access == O_RDONLY || access == O_RDWR; +} + 
+void refresh_proc_fd(pid_t pid, ProcState& proc, int fd) { + std::string target = readlink_string("/proc/" + std::to_string(pid) + "/fd/" + std::to_string(fd)); + if (!target.empty() && target[0] == '/') proc.fds[fd] = best_effort_canonical(target); +} + +void refresh_proc_cwd(pid_t pid, ProcState& proc) { + std::string target = readlink_string("/proc/" + std::to_string(pid) + "/cwd"); + if (!target.empty()) proc.cwd = best_effort_canonical(target); +} + +void handle_syscall_entry(pid_t pid, ProcState& proc, const user_regs_struct& regs) { + PendingSyscall p; + p.nr = static_cast<long>(regs.orig_rax); + p.args = {regs.rdi, regs.rsi, regs.rdx, regs.r10, regs.r8, regs.r9}; + + switch (p.nr) { + case SYS_open: + p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); + p.flags = static_cast<int>(p.args[1]); + break; + case SYS_openat: +#ifdef SYS_openat2 + case SYS_openat2: +#endif + p.dirfd = static_cast<int>(p.args[0]); + p.path_a = resolve_path(proc, p.dirfd, read_tracee_string(pid, p.args[1])).string(); + p.flags = static_cast<int>(p.args[2]); + break; + case SYS_creat: + p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); + p.flags = O_CREAT | O_WRONLY | O_TRUNC; + break; + case SYS_stat: + case SYS_lstat: + case SYS_access: + case SYS_readlink: + p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); + break; + case SYS_newfstatat: + case SYS_faccessat: +#ifdef SYS_faccessat2 + case SYS_faccessat2: +#endif + case SYS_readlinkat: + p.dirfd = static_cast<int>(p.args[0]); + p.path_a = resolve_path(proc, p.dirfd, read_tracee_string(pid, p.args[1])).string(); + break; + case SYS_unlink: + case SYS_rmdir: + p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); + break; + case SYS_unlinkat: + case SYS_mkdirat: + p.dirfd = static_cast<int>(p.args[0]); + p.path_a = resolve_path(proc, p.dirfd, read_tracee_string(pid, p.args[1])).string(); + 
break; + case SYS_mkdir: + case SYS_chdir: + case SYS_truncate: + p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); + break; + case SYS_rename: + p.path_a = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[0])).string(); + p.path_b = resolve_path(proc, AT_FDCWD, read_tracee_string(pid, p.args[1])).string(); + break; + case SYS_renameat: +#ifdef SYS_renameat2 + case SYS_renameat2: +#endif + p.path_a = resolve_path(proc, static_cast<int>(p.args[0]), read_tracee_string(pid, p.args[1])).string(); + p.path_b = resolve_path(proc, static_cast<int>(p.args[2]), read_tracee_string(pid, p.args[3])).string(); + break; + case SYS_getdents: + case SYS_getdents64: + case SYS_fchdir: + case SYS_ftruncate: + p.fd = static_cast<int>(p.args[0]); + break; + case SYS_close: + case SYS_dup: + case SYS_dup2: + case SYS_dup3: + case SYS_fcntl: + p.fd = static_cast<int>(p.args[0]); + break; + default: + break; + } + + proc.pending = p; +} + +void handle_syscall_exit(pid_t pid, ProcState& proc, long result) { + const PendingSyscall& p = proc.pending; + bool ok = result >= 0; + + switch (p.nr) { + case SYS_open: + case SYS_openat: +#ifdef SYS_openat2 + case SYS_openat2: +#endif + case SYS_creat: + if (ok) { + if (is_read_open(p.flags)) record_observation(p.path_a, "read"); + if (is_write_open(p.flags)) record_observation(p.path_a, "write"); + if (p.flags & O_DIRECTORY) record_observation(p.path_a, "directory"); + refresh_proc_fd(pid, proc, static_cast<int>(result)); + } else { + record_observation(p.path_a, "existence"); + } + break; + case SYS_stat: + case SYS_lstat: + case SYS_newfstatat: + case SYS_access: + case SYS_faccessat: +#ifdef SYS_faccessat2 + case SYS_faccessat2: +#endif + case SYS_readlink: + case SYS_readlinkat: + record_observation(p.path_a, "existence"); + break; + case SYS_getdents: + case SYS_getdents64: + if (ok && p.fd >= 0 && proc.fds.count(p.fd)) record_observation(proc.fds[p.fd], "directory"); + break; + case SYS_unlink: 
+ case SYS_unlinkat: + case SYS_rmdir: + record_observation(p.path_a, "delete"); + break; + case SYS_rename: + case SYS_renameat: +#ifdef SYS_renameat2 + case SYS_renameat2: +#endif + record_observation(p.path_a, "delete"); + record_observation(p.path_b, "write"); + break; + case SYS_mkdir: + case SYS_mkdirat: + case SYS_truncate: + record_observation(p.path_a, "write"); + break; + case SYS_ftruncate: + if (p.fd >= 0 && proc.fds.count(p.fd)) record_observation(proc.fds[p.fd], "write"); + break; + case SYS_chdir: + if (ok) refresh_proc_cwd(pid, proc); + break; + case SYS_fchdir: + if (ok) refresh_proc_cwd(pid, proc); + break; + case SYS_close: + if (ok) proc.fds.erase(p.fd); + break; + case SYS_dup: + if (ok && proc.fds.count(p.fd)) proc.fds[static_cast<int>(result)] = proc.fds[p.fd]; + break; + case SYS_dup2: + case SYS_dup3: + if (ok && proc.fds.count(p.fd)) proc.fds[static_cast<int>(p.args[1])] = proc.fds[p.fd]; + break; + case SYS_fcntl: + if (ok && (static_cast<int>(p.args[1]) == F_DUPFD || + static_cast<int>(p.args[1]) == F_DUPFD_CLOEXEC) && + proc.fds.count(p.fd)) { + proc.fds[static_cast<int>(result)] = proc.fds[p.fd]; + } + break; + default: + break; + } +} + +void trace_command(const std::vector<std::string>& command) { + pid_t child = fork(); + if (child < 0) throw std::runtime_error(errno_message("fork failed")); + if (child == 0) { + if (ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) != 0) _exit(127); + raise(SIGSTOP); + std::vector<char*> argv; + for (const auto& arg : command) argv.push_back(const_cast<char*>(arg.c_str())); + argv.push_back(nullptr); + execvp(argv[0], argv.data()); + _exit(127); + } + + int status = 0; + if (waitpid(child, &status, 0) < 0) throw std::runtime_error(errno_message("waitpid failed")); + if (ptrace(PTRACE_SETOPTIONS, child, nullptr, kPtraceOptions) != 0) { + throw std::runtime_error(errno_message("ptrace SETOPTIONS failed")); + } + processes[child].cwd = fs::current_path(); + refresh_proc_cwd(child, processes[child]); + 
ptrace(PTRACE_SYSCALL, child, nullptr, nullptr); + + while (!processes.empty()) { + pid_t pid = waitpid(-1, &status, __WALL); + if (pid < 0) { + if (errno == EINTR) continue; + if (errno == ECHILD) break; + throw std::runtime_error(errno_message("waitpid trace loop failed")); + } + + auto it = processes.find(pid); + if (it == processes.end()) { + ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr); + continue; + } + + if (WIFEXITED(status) || WIFSIGNALED(status)) { + processes.erase(it); + continue; + } + + if (!WIFSTOPPED(status)) continue; + int sig = WSTOPSIG(status); + unsigned event = static_cast<unsigned>(status >> 16); + + if (event == PTRACE_EVENT_FORK || event == PTRACE_EVENT_VFORK || + event == PTRACE_EVENT_CLONE) { + unsigned long new_pid = 0; + ptrace(PTRACE_GETEVENTMSG, pid, nullptr, &new_pid); + processes[static_cast<pid_t>(new_pid)] = it->second; + ptrace(PTRACE_SETOPTIONS, static_cast<pid_t>(new_pid), nullptr, kPtraceOptions); + ptrace(PTRACE_SYSCALL, static_cast<pid_t>(new_pid), nullptr, nullptr); + ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr); + continue; + } + + if (sig == (SIGTRAP | 0x80)) { + user_regs_struct regs {}; + if (ptrace(PTRACE_GETREGS, pid, nullptr, ®s) == 0) { + ProcState& proc = it->second; + if (!proc.in_syscall) { + handle_syscall_entry(pid, proc, regs); + proc.in_syscall = true; + } else { + handle_syscall_exit(pid, proc, static_cast<long>(regs.rax)); + proc.in_syscall = false; + } + } + ptrace(PTRACE_SYSCALL, pid, nullptr, nullptr); + } else { + int deliver = (sig == SIGTRAP) ? 
0 : sig; + ptrace(PTRACE_SYSCALL, pid, nullptr, reinterpret_cast<void*>(static_cast<long>(deliver))); + } + } +} + +void write_manifest(const fs::path& out, const std::vector<std::string>& command, int exit_status) { + json manifest; + manifest["format_version"] = 1; + manifest["command"] = command; + manifest["exit_status"] = exit_status; + manifest["start_cwd"] = fs::current_path().string(); + manifest["uid"] = tracer_uid; + manifest["gid"] = tracer_gid; + + manifest["git_repositories"] = json::array(); + for (const auto& [_, repo] : repos) { + manifest["git_repositories"].push_back({ + {"root", repo.root}, + {"head", repo.head}, + {"dirty", repo.dirty}, + }); + } + + manifest["files"] = json::array(); + for (const auto& [_, rec] : files) { + json ops = json::array(); + for (const auto& op : rec.operations) ops.push_back(op); + manifest["files"].push_back({ + {"path", rec.path}, + {"operations", ops}, + {"before", metadata_json(rec.before)}, + {"after", metadata_json(rec.after)}, + {"git", git_json(rec.after_git.in_repo ? 
rec.after_git : rec.before_git)}, + }); + } + + std::ofstream stream(out / "manifest.json"); + stream << manifest.dump(2) << "\n"; +} + +void restore_snapshot(const fs::path& dir) { + std::ifstream in(dir / "manifest.json"); + if (!in) throw std::runtime_error("cannot open manifest"); + json manifest = json::parse(in); + + for (const auto& item : manifest.at("files")) { + fs::path path = item.at("path").get<std::string>(); + const json& after = item.at("after"); + if (!after.value("exists", false)) { + if (after.value("tombstone", false)) { + std::error_code ec; + fs::remove(path, ec); + } + continue; + } + if (!after.contains("blob")) continue; + + fs::create_directories(path.parent_path()); + fs::copy_file(dir / "blobs" / after.at("blob").get<std::string>(), path, + fs::copy_options::overwrite_existing); + if (after.contains("mode")) { + fs::permissions(path, static_cast<fs::perms>(after.at("mode").get<unsigned>() & 07777), + fs::perm_options::replace); + } + if (after.contains("mtime")) { + // std::filesystem has no portable wall-clock setter for Unix time in C++20. 
+ struct timespec ts[2] {}; + ts[0].tv_nsec = UTIME_OMIT; + ts[1].tv_sec = after.at("mtime").get<std::time_t>(); + ts[1].tv_nsec = 0; + utimensat(AT_FDCWD, path.c_str(), ts, AT_SYMLINK_NOFOLLOW); + } + } +} + +int run_snapshot(const std::vector<std::string>& args) { + fs::path output; + size_t split = args.size(); + for (size_t i = 0; i < args.size(); ++i) { + if (args[i] == "--output" && i + 1 < args.size()) { + output = args[i + 1]; + ++i; + } else if (args[i] == "--") { + split = i + 1; + break; + } else { + throw std::runtime_error("usage: agent-snapshot --output SNAPDIR -- command args..."); + } + } + if (output.empty() || split >= args.size()) { + throw std::runtime_error("usage: agent-snapshot --output SNAPDIR -- command args..."); + } + + snapshot_dir = output; + blob_dir = snapshot_dir / "blobs"; + fs::remove_all(snapshot_dir); + fs::create_directories(blob_dir); + + std::vector<std::string> command(args.begin() + static_cast<long>(split), args.end()); + trace_command(command); + finalize_records(); + write_manifest(snapshot_dir, command, 0); + return 0; +} + +} // namespace + +int main(int argc, char** argv) { + tracer_uid = getuid(); + tracer_gid = getgid(); + git_libgit2_init(); + try { + std::vector<std::string> args(argv + 1, argv + argc); + if (!args.empty() && args[0] == "restore") { + if (args.size() != 2) throw std::runtime_error("usage: agent-snapshot restore SNAPDIR"); + restore_snapshot(args[1]); + git_libgit2_shutdown(); + return 0; + } + int rc = run_snapshot(args); + git_libgit2_shutdown(); + return rc; + } catch (const std::exception& e) { + git_libgit2_shutdown(); + std::cerr << "agent-snapshot: " << e.what() << "\n"; + return 1; + } +}
# Traced fixture program: touches files under <repo>/testdata in four ways.
# It reads two existing files, creates one file and deletes another. The exact
# file operations are the point of the script, so keep them as they are.
from pathlib import Path

# The repository root is one directory above the directory holding this script.
root = Path(__file__).resolve().parents[1]
testdata = root / "testdata"

# Pure reads; the contents are discarded.
(testdata / "dirty.txt").read_text()
(testdata / "untracked_runtime.txt").read_text()
# Creates (or overwrites) a file with known final content.
(testdata / "created_by_program.txt").write_text("created final\n")
# Deletes a file; raises FileNotFoundError when it is not there.
(testdata / "deleted_by_program.txt").unlink()
# Traced fixture program: writes a file from a forked child, reads a system
# binary and lists the testdata directory from the parent.
import os
from pathlib import Path

# The repository root is one directory above the directory holding this script.
root = Path(__file__).resolve().parents[1]
testdata = root / "testdata"

pid = os.fork()
if pid == 0:
    # Child process: write one file, then leave immediately via os._exit so
    # the rest of the script runs only in the parent.
    (testdata / "child_output.txt").write_text("child final\n")
    os._exit(0)

# Parent: read a few bytes of a file outside the repository.
with open("/usr/bin/env", "rb") as handle:
    handle.read(16)

# Parent: enumerate the directory; the entries themselves are not used.
for _ in testdata.iterdir():
    pass

# Reap the child before exiting.
os.waitpid(pid, 0)
# Traced fixture program: reads testdata/clean.txt and prints its content.
from pathlib import Path

# The repository root is one directory above the directory holding this script.
root = Path(__file__).resolve().parents[1]
print((root / "testdata" / "clean.txt").read_text())
@@ -0,0 +1,2 @@ +clean tracked fixture +line two
@@ -0,0 +1 @@ +dirty tracked fixture original
@@ -0,0 +1 @@ +nested tracked fixture
"""End-to-end tests: build agent-snapshot with CMake and run it on fixtures."""

import json
import os
import shutil
import subprocess
from pathlib import Path

import pytest


ROOT = Path(__file__).resolve().parents[1]
BUILD = ROOT / "build" / "pytest"
BIN = BUILD / "agent-snapshot"
TESTDATA = ROOT / "testdata"


def run(cmd, **kwargs):
    """Run `cmd` from the repository root and raise if it exits non-zero."""
    return subprocess.run(cmd, cwd=ROOT, text=True, check=True, **kwargs)


def _reset_testdata():
    """Put testdata back to its committed state and drop untracked files."""
    for git_args in (["checkout", "--", "testdata"], ["clean", "-fd", "--", "testdata"]):
        run(["git", *git_args])


@pytest.fixture(scope="session", autouse=True)
def build_agent_snapshot():
    """Configure and build the binary once per test session."""
    build_dir = str(BUILD)
    run(["cmake", "-S", ".", "-B", build_dir])
    run(["cmake", "--build", build_dir, "--parallel"])
    assert BIN.exists()


@pytest.fixture(autouse=True)
def pristine_testdata():
    """Reset the fixture directory before and after every test."""
    _reset_testdata()
    yield
    _reset_testdata()


class Snapshot:
    """A snapshot directory together with its parsed manifest."""

    def __init__(self, path: Path):
        self.path = path
        manifest_text = (path / "manifest.json").read_text()
        self.manifest = json.loads(manifest_text)

    def file(self, path: Path):
        """Return the manifest entry for `path`, failing if there is none."""
        target = str(path.resolve())
        entry = next(
            (candidate for candidate in self.manifest["files"] if candidate["path"] == target),
            None,
        )
        if entry is None:
            raise AssertionError(f"{target} not present in snapshot")
        return entry

    def blob_text(self, digest: str):
        """Return the text content of the blob stored under `digest`."""
        blob_path = self.path / "blobs" / digest
        return blob_path.read_text()


def capture(tmp_path: Path, *command: str) -> Snapshot:
    """Trace `command` with agent-snapshot and load the resulting snapshot."""
    out = tmp_path / "snapshot"
    argv = [str(BIN), "--output", str(out), "--"]
    argv.extend(command)
    run(argv)
    return Snapshot(out)


def test_clean_git_tracked_read_records_repo_without_blob(tmp_path):
    snap = capture(tmp_path, "python3", "test_programs/read_clean.py")
    entry = snap.file(TESTDATA / "clean.txt")
    git_state = entry["git"]

    assert "read" in entry["operations"]
    assert git_state["tracked"] is True
    assert git_state["dirty"] is False
    # Clean tracked content is recoverable from git, so no blob is stored.
    assert entry["before"].get("blob") is None
    assert entry["after"].get("blob") is None
    expected_root = str(ROOT.resolve())
    assert any(expected_root == repo["root"] for repo in snap.manifest["git_repositories"])


def test_dirty_untracked_created_and_deleted_files_are_captured(tmp_path):
    dirty_text = "dirty tracked fixture changed before run\n"
    untracked_text = "untracked input\n"
    (TESTDATA / "dirty.txt").write_text(dirty_text)
    (TESTDATA / "untracked_runtime.txt").write_text(untracked_text)
    (TESTDATA / "deleted_by_program.txt").write_text("delete me\n")

    snap = capture(tmp_path, "python3", "test_programs/dirty_untracked_write.py")

    dirty = snap.file(TESTDATA / "dirty.txt")
    assert dirty["git"]["tracked"] is True
    assert dirty["git"]["dirty"] is True
    assert snap.blob_text(dirty["before"]["blob"]) == dirty_text

    untracked = snap.file(TESTDATA / "untracked_runtime.txt")
    assert untracked["git"]["tracked"] is False
    assert snap.blob_text(untracked["before"]["blob"]) == untracked_text

    created = snap.file(TESTDATA / "created_by_program.txt")
    assert "write" in created["operations"]
    assert created["before"]["exists"] is False
    assert snap.blob_text(created["after"]["blob"]) == "created final\n"

    deleted = snap.file(TESTDATA / "deleted_by_program.txt")
    deleted_after = deleted["after"]
    assert "delete" in deleted["operations"]
    assert deleted_after["exists"] is False
    assert deleted_after["tombstone"] is True


def test_fork_usr_and_directory_traversal(tmp_path):
    snap = capture(tmp_path, "python3", "test_programs/fork_and_usr.py")

    # The write performed by the forked child is attributed to the snapshot.
    child = snap.file(TESTDATA / "child_output.txt")
    assert "write" in child["operations"]
    assert snap.blob_text(child["after"]["blob"]) == "child final\n"

    # The system binary is recorded as read, but its content is not stored.
    usr_env = snap.file(Path("/usr/bin/env"))
    assert "read" in usr_env["operations"]
    assert usr_env["before"].get("blob") is None
    assert usr_env["after"].get("blob") is None

    directory = snap.file(TESTDATA)
    assert "directory" in directory["operations"]


def test_restore_applies_final_state(tmp_path):
    created_path = TESTDATA / "created_by_program.txt"
    deleted_path = TESTDATA / "deleted_by_program.txt"

    (TESTDATA / "dirty.txt").write_text("changed before capture\n")
    deleted_path.write_text("delete me\n")
    snap = capture(tmp_path, "python3", "test_programs/dirty_untracked_write.py")

    # Replace the fixture directory with a state that contradicts the snapshot.
    shutil.rmtree(TESTDATA)
    TESTDATA.mkdir()
    created_path.write_text("wrong\n")
    deleted_path.write_text("should disappear\n")

    run([str(BIN), "restore", str(snap.path)])

    assert created_path.read_text() == "created final\n"
    assert not deleted_path.exists()