Repositories / agent-snapshot.git
agent-snapshot.git
Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git
@@ -16,6 +16,7 @@ #include <cerrno> #include <chrono> +#include <cstdlib> #include <cstring> #include <filesystem> #include <fstream> @@ -121,6 +122,8 @@ struct RepoRecord { std::unordered_map<pid_t, ProcState> processes; std::map<std::string, FileRecord> files; std::map<std::string, RepoRecord> repos; +std::vector<fs::path> ignored_paths; +fs::path ignore_config_path; fs::path snapshot_dir; fs::path blob_dir; uid_t tracer_uid = 0; @@ -151,6 +154,88 @@ fs::path best_effort_canonical(const fs::path& path) { return fs::absolute(path, ec).lexically_normal(); } +fs::path xdg_ignore_config_path() { + const char* config_home = std::getenv("XDG_CONFIG_HOME"); + if (config_home && config_home[0] != '\0') { + return fs::path(config_home) / "agent-snapshot" / "ignore.json"; + } + const char* home = std::getenv("HOME"); + if (!home || home[0] == '\0') { + throw std::runtime_error("XDG_CONFIG_HOME is unset and HOME is unavailable"); + } + return fs::path(home) / ".config" / "agent-snapshot" / "ignore.json"; +} + +fs::path home_dir() { + const char* home = std::getenv("HOME"); + if (!home || home[0] == '\0') { + throw std::runtime_error("HOME is unavailable"); + } + return fs::path(home); +} + +fs::path xdg_config_home_dir() { + const char* config_home = std::getenv("XDG_CONFIG_HOME"); + if (config_home && config_home[0] != '\0') { + return fs::path(config_home); + } + return home_dir() / ".config"; +} + +fs::path expand_ignore_entry(const std::string& entry) { + constexpr std::string_view kHome = "$HOME"; + constexpr std::string_view kXdgConfigHome = "$XDG_CONFIG_HOME"; + if (entry == kHome) return home_dir(); + if (entry.rfind(std::string(kHome) + "/", 0) == 0) { + return home_dir() / entry.substr(kHome.size() + 1); + } + if (entry == kXdgConfigHome) return xdg_config_home_dir(); + if (entry.rfind(std::string(kXdgConfigHome) + "/", 0) == 0) { + return xdg_config_home_dir() / entry.substr(kXdgConfigHome.size() + 1); + } + return fs::path(entry); +} + +bool 
path_is_at_or_under(const fs::path& path, const fs::path& root) { + auto pit = path.begin(); + auto rit = root.begin(); + for (; rit != root.end(); ++rit, ++pit) { + if (pit == path.end() || *pit != *rit) return false; + } + return true; +} + +bool is_ignored_path(const fs::path& raw_path) { + if (raw_path.empty()) return false; + fs::path path = best_effort_canonical(raw_path); + for (const auto& ignored : ignored_paths) { + if (path_is_at_or_under(path, ignored)) return true; + } + return false; +} + +void load_ignore_config() { + ignore_config_path = best_effort_canonical(xdg_ignore_config_path()); + std::ifstream stream(ignore_config_path); + if (!stream) { + throw std::runtime_error("ignore config does not exist: " + ignore_config_path.string()); + } + + json entries = json::parse(stream); + if (!entries.is_array()) { + throw std::runtime_error("ignore config must be a JSON array: " + ignore_config_path.string()); + } + + ignored_paths.clear(); + ignored_paths.push_back(ignore_config_path); + for (const auto& entry : entries) { + if (!entry.is_string()) { + throw std::runtime_error("ignore config entries must be strings: " + ignore_config_path.string()); + } + ignored_paths.push_back(best_effort_canonical(expand_ignore_entry(entry.get<std::string>()))); + } +} + // Syscall arguments live in the tracee's address space. PTRACE_PEEKDATA reads a // machine word at a time from that process; for path arguments we walk forward // until the NUL terminator. 
This is intentionally bounded so a bad userspace @@ -354,6 +439,7 @@ json git_json(const GitInfo& git) { void record_observation(const fs::path& raw_path, const std::string& operation) { if (raw_path.empty()) return; fs::path path = best_effort_canonical(raw_path); + if (is_ignored_path(path)) return; std::string key = path.string(); FileRecord& rec = files[key]; rec.path = key; @@ -377,6 +463,7 @@ void finalize_records() { // tree has exited, the filesystem has quiesced from our point of view, so // final content can be copied without racing the writer we launched. fs::path path(rec.path); + if (is_ignored_path(path)) continue; rec.after = stat_metadata(path).value_or(Metadata{}); if (!rec.after.exists) rec.after.tombstone = rec.operations.count("delete") > 0; rec.after_git = classify_git(path); @@ -798,6 +885,8 @@ void restore_snapshot(const fs::path& dir) { } int run_snapshot(const std::vector<std::string>& args) { + load_ignore_config(); + fs::path output; size_t split = args.size(); for (size_t i = 0; i < args.size(); ++i) {
"""Read a fixed set of files: one clean file, two ignore candidates, and the ignore config.

The script only performs reads and discards the contents; presumably the
point is for a tracer observing this process to see each access -- confirm
against the test that launches it.

Raises:
    KeyError: if XDG_CONFIG_HOME is unset or empty and HOME is not set.
    OSError: if any of the files read below cannot be opened.
"""
import os
from pathlib import Path

# The script lives one directory below the repository root.
root = Path(__file__).resolve().parents[1]
testdata = root / "testdata"

# Resolve the config directory the way the tracer's xdg_ignore_config_path()
# does: an unset *or empty* XDG_CONFIG_HOME falls back to $HOME/.config.
# HOME is looked up only on the fallback path, so a missing HOME is not an
# error when XDG_CONFIG_HOME is usable.
xdg_config_home = os.environ.get("XDG_CONFIG_HOME")
if xdg_config_home:
    config_home = Path(xdg_config_home)
else:
    config_home = Path(os.environ["HOME"]) / ".config"
ignore_config = config_home / "agent-snapshot" / "ignore.json"

# Contents are intentionally discarded; only the reads themselves matter.
(testdata / "clean.txt").read_text()
(testdata / "ignored_file.txt").read_text()
(testdata / "ignored_dir" / "nested.txt").read_text()
ignore_config.read_text()
@@ -47,6 +47,17 @@ def pristine_testdata(): run(["git", "clean", "-fd", "--", "testdata"]) +@pytest.fixture(autouse=True) +def ignore_config(tmp_path, monkeypatch): + config_home = tmp_path / "xdg-config" + config_path = config_home / "agent-snapshot" / "ignore.json" + config_path.parent.mkdir(parents=True) + config_path.write_text("[]\n") + monkeypatch.setenv("HOME", str(ROOT)) + monkeypatch.setenv("XDG_CONFIG_HOME", str(config_home)) + return config_path + + class Snapshot: # Small manifest assertion helper. Tests should read like snapshot behavior, # not like repeated JSON tree walking, because the important contract is @@ -74,6 +85,78 @@ def capture(tmp_path: Path, *command: str) -> Snapshot: return Snapshot(out) +def test_missing_ignore_config_aborts_at_startup(tmp_path, ignore_config): + ignore_config.unlink() + result = subprocess.run( + [str(BIN), "--output", str(tmp_path / "snapshot"), "--", PYTHON, "test_programs/read_clean.py"], + cwd=ROOT, + text=True, + capture_output=True, + ) + + assert result.returncode != 0 + assert "ignore" in result.stderr + + +def test_ignore_config_suppresses_files_directories_and_itself(tmp_path, ignore_config): + ignored_file = TESTDATA / "ignored_file.txt" + ignored_dir = TESTDATA / "ignored_dir" + ignored_file.write_text("ignored file payload\n") + ignored_dir.mkdir() + (ignored_dir / "nested.txt").write_text("ignored nested payload\n") + ignore_config.write_text(json.dumps([str(ignored_file), str(ignored_dir)]) + "\n") + + snap = capture(tmp_path, PYTHON, "test_programs/read_ignored_paths.py") + manifest_paths = {item["path"] for item in snap.manifest["files"]} + + assert str((TESTDATA / "clean.txt").resolve()) in manifest_paths + assert str(ignored_file.resolve()) not in manifest_paths + assert str((ignored_dir / "nested.txt").resolve()) not in manifest_paths + assert str(ignore_config.resolve()) not in manifest_paths + + +def test_ignore_config_expands_home_prefix(tmp_path, ignore_config): + ignored_file = TESTDATA / 
"ignored_file.txt" + ignored_dir = TESTDATA / "ignored_dir" + ignored_file.write_text("ignored file payload\n") + ignored_dir.mkdir() + (ignored_dir / "nested.txt").write_text("nested payload\n") + ignore_config.write_text(json.dumps(["$HOME/testdata/ignored_file.txt"]) + "\n") + + snap = capture(tmp_path, PYTHON, "test_programs/read_ignored_paths.py") + manifest_paths = {item["path"] for item in snap.manifest["files"]} + + assert str(ignored_file.resolve()) not in manifest_paths + + +def test_ignore_config_expands_xdg_config_home_fallback(tmp_path, ignore_config): + home = tmp_path / "home" + config_path = home / ".config" / "agent-snapshot" / "ignore.json" + ignored_file = TESTDATA / "ignored_file.txt" + ignored_dir = TESTDATA / "ignored_dir" + ignored_file.write_text("ignored file payload\n") + ignored_dir.mkdir() + (ignored_dir / "nested.txt").write_text("nested payload\n") + config_path.parent.mkdir(parents=True) + config_path.write_text(json.dumps(["$XDG_CONFIG_HOME/agent-snapshot/ignore.json"]) + "\n") + + env = os.environ.copy() + env.pop("XDG_CONFIG_HOME", None) + env["HOME"] = str(home) + out = tmp_path / "snapshot" + subprocess.run( + [str(BIN), "--output", str(out), "--", PYTHON, "test_programs/read_ignored_paths.py"], + cwd=ROOT, + text=True, + check=True, + env=env, + ) + manifest = json.loads((out / "manifest.json").read_text()) + manifest_paths = {item["path"] for item in manifest["files"]} + + assert str(config_path.resolve()) not in manifest_paths + + def test_clean_git_tracked_read_records_repo_without_blob(tmp_path): # A clean tracked file is the primary compactness case. The program really # reads testdata/clean.txt, but the snapshot should rely on Git repository