Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Add XDG ignore config support

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-02 10:21:51 -0400
Commit
5817cc0bcf06c642bbe7141ea26d72bcbdc45887
src/main.cpp
index 1dc1d10..2e67c31 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -16,6 +16,7 @@
 
 #include <cerrno>
 #include <chrono>
+#include <cstdlib>
 #include <cstring>
 #include <filesystem>
 #include <fstream>
@@ -121,6 +122,8 @@ struct RepoRecord {
 std::unordered_map<pid_t, ProcState> processes;
 std::map<std::string, FileRecord> files;
 std::map<std::string, RepoRecord> repos;
+std::vector<fs::path> ignored_paths;
+fs::path ignore_config_path;
 fs::path snapshot_dir;
 fs::path blob_dir;
 uid_t tracer_uid = 0;
@@ -151,6 +154,88 @@ fs::path best_effort_canonical(const fs::path& path) {
   return fs::absolute(path, ec).lexically_normal();
 }
 
+// Locates the ignore list: $XDG_CONFIG_HOME/agent-snapshot/ignore.json, or
+// $HOME/.config/agent-snapshot/ignore.json when XDG_CONFIG_HOME is unset or
+// empty. Throws std::runtime_error when neither variable provides a base
+// directory.
+fs::path xdg_ignore_config_path() {
+  const fs::path relative = fs::path("agent-snapshot") / "ignore.json";
+  if (const char* xdg = std::getenv("XDG_CONFIG_HOME"); xdg != nullptr && *xdg != '\0') {
+    return fs::path(xdg) / relative;
+  }
+  const char* home_env = std::getenv("HOME");
+  if (home_env == nullptr || *home_env == '\0') {
+    throw std::runtime_error("XDG_CONFIG_HOME is unset and HOME is unavailable");
+  }
+  return fs::path(home_env) / ".config" / relative;
+}
+
+// Returns $HOME as a path. Throws std::runtime_error when HOME is unset or
+// empty.
+fs::path home_dir() {
+  if (const char* value = std::getenv("HOME"); value != nullptr && *value != '\0') {
+    return fs::path(value);
+  }
+  throw std::runtime_error("HOME is unavailable");
+}
+
+// Returns $XDG_CONFIG_HOME when it is set and non-empty; otherwise falls back
+// to home_dir() / ".config" (so a missing HOME surfaces as home_dir()'s
+// exception).
+fs::path xdg_config_home_dir() {
+  const char* xdg = std::getenv("XDG_CONFIG_HOME");
+  const bool has_xdg = xdg != nullptr && *xdg != '\0';
+  return has_xdg ? fs::path(xdg) : home_dir() / ".config";
+}
+
+// Expands a leading "$HOME" or "$XDG_CONFIG_HOME" in an ignore-list entry.
+// Only an exact match or a "<variable>/" prefix is expanded; every other entry
+// is returned as written. Propagates the exceptions thrown by home_dir() /
+// xdg_config_home_dir() when the needed environment variable is unavailable.
+fs::path expand_ignore_entry(const std::string& entry) {
+  constexpr std::string_view kHome = "$HOME";
+  constexpr std::string_view kXdgConfigHome = "$XDG_CONFIG_HOME";
+
+  // True when `entry` is `var` immediately followed by '/'.
+  const auto has_prefix = [&entry](std::string_view var) {
+    return entry.size() > var.size() && entry.compare(0, var.size(), var) == 0 &&
+           entry[var.size()] == '/';
+  };
+  // The text after "<variable>/". relative_path() strips any further leading
+  // '/' so that "$HOME//x" stays under the base directory: operator/ would
+  // otherwise replace the base with the absolute right-hand side.
+  const auto remainder = [&entry](std::string_view var) {
+    return fs::path(entry.substr(var.size() + 1)).relative_path();
+  };
+
+  if (entry == kHome) return home_dir();
+  if (has_prefix(kHome)) return home_dir() / remainder(kHome);
+  if (entry == kXdgConfigHome) return xdg_config_home_dir();
+  if (has_prefix(kXdgConfigHome)) return xdg_config_home_dir() / remainder(kXdgConfigHome);
+  return fs::path(entry);
+}
+
+// Component-wise prefix test: true when `path` equals `root` or lies beneath
+// it. Comparing whole components keeps "/a/bc" from matching root "/a/b".
+// An empty `root` matches nothing; without that guard it would match every
+// path and silently suppress all observations.
+bool path_is_at_or_under(const fs::path& path, const fs::path& root) {
+  if (root.empty()) return false;
+  auto pit = path.begin();
+  for (auto rit = root.begin(); rit != root.end(); ++rit) {
+    // A trailing separator ("/a/b/") iterates as an empty final element; it
+    // carries no name, so it must not be compared against a child component.
+    if (rit->empty()) continue;
+    if (pit == path.end() || *pit != *rit) return false;
+    ++pit;
+  }
+  return true;
+}
+
+// Reports whether `raw_path`, once passed through best_effort_canonical(),
+// equals or lies under any entry of ignored_paths. An empty path is never
+// ignored.
+bool is_ignored_path(const fs::path& raw_path) {
+  if (!raw_path.empty()) {
+    const fs::path canonical = best_effort_canonical(raw_path);
+    for (const fs::path& root : ignored_paths) {
+      if (path_is_at_or_under(canonical, root)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Reads the ignore list from the XDG config location into ignored_paths.
+// The file must exist and hold a JSON array of strings; each string is
+// expanded with expand_ignore_entry() and canonicalized. The config file
+// itself is always the first ignored path. Throws std::runtime_error when the
+// file cannot be opened or has the wrong shape; json::parse reports malformed
+// JSON with its own exception.
+void load_ignore_config() {
+  ignore_config_path = best_effort_canonical(xdg_ignore_config_path());
+  std::ifstream stream(ignore_config_path);
+  if (!stream) {
+    throw std::runtime_error("ignore config does not exist: " + ignore_config_path.string());
+  }
+
+  const json entries = json::parse(stream);
+  if (!entries.is_array()) {
+    throw std::runtime_error("ignore config must be a JSON array: " + ignore_config_path.string());
+  }
+
+  // Build the list locally and publish it only after every entry validated,
+  // so a bad entry cannot leave ignored_paths half-populated.
+  std::vector<fs::path> loaded;
+  loaded.reserve(entries.size() + 1);
+  loaded.push_back(ignore_config_path);
+  for (const auto& entry : entries) {
+    if (!entry.is_string()) {
+      throw std::runtime_error("ignore config entries must be strings: " + ignore_config_path.string());
+    }
+    loaded.push_back(best_effort_canonical(expand_ignore_entry(entry.get<std::string>())));
+  }
+  ignored_paths.swap(loaded);
+}
+
 // Syscall arguments live in the tracee's address space. PTRACE_PEEKDATA reads a
 // machine word at a time from that process; for path arguments we walk forward
 // until the NUL terminator. This is intentionally bounded so a bad userspace
@@ -354,6 +439,7 @@ json git_json(const GitInfo& git) {
 void record_observation(const fs::path& raw_path, const std::string& operation) {
   if (raw_path.empty()) return;
   fs::path path = best_effort_canonical(raw_path);
+  if (is_ignored_path(path)) return;
   std::string key = path.string();
   FileRecord& rec = files[key];
   rec.path = key;
@@ -377,6 +463,7 @@ void finalize_records() {
     // tree has exited, the filesystem has quiesced from our point of view, so
     // final content can be copied without racing the writer we launched.
     fs::path path(rec.path);
+    if (is_ignored_path(path)) continue;
     rec.after = stat_metadata(path).value_or(Metadata{});
     if (!rec.after.exists) rec.after.tombstone = rec.operations.count("delete") > 0;
     rec.after_git = classify_git(path);
@@ -798,6 +885,8 @@ void restore_snapshot(const fs::path& dir) {
 }
 
 int run_snapshot(const std::vector<std::string>& args) {
+  load_ignore_config();
+
   fs::path output;
   size_t split = args.size();
   for (size_t i = 0; i < args.size(); ++i) {
test_programs/read_ignored_paths.py
new file mode 100644
index 0000000..db2f5d0
--- /dev/null
+++ b/test_programs/read_ignored_paths.py
@@ -0,0 +1,12 @@
+# Traced helper for the ignore-config tests: reads one ordinary file, one
+# ignored file, one file inside an ignored directory, and the ignore config
+# itself.
+import os
+from pathlib import Path
+
+root = Path(__file__).resolve().parents[1]
+testdata = root / "testdata"
+# Same lookup as src/main.cpp: an unset or empty XDG_CONFIG_HOME falls back to
+# $HOME/.config. HOME is consulted only when the fallback is needed, so a
+# missing HOME no longer raises KeyError while XDG_CONFIG_HOME is set.
+xdg_config_home = os.environ.get("XDG_CONFIG_HOME")
+config_home = Path(xdg_config_home) if xdg_config_home else Path(os.environ["HOME"]) / ".config"
+ignore_config = config_home / "agent-snapshot" / "ignore.json"
+
+(testdata / "clean.txt").read_text()
+(testdata / "ignored_file.txt").read_text()
+(testdata / "ignored_dir" / "nested.txt").read_text()
+ignore_config.read_text()
tests/test_agent_snapshot.py
index c4cc711..efb0ad4 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -47,6 +47,17 @@ def pristine_testdata():
     run(["git", "clean", "-fd", "--", "testdata"])
 
 
+@pytest.fixture(autouse=True)
+def ignore_config(tmp_path, monkeypatch):
+    # Autouse: every test gets a private, empty ignore list under tmp_path,
+    # with HOME and XDG_CONFIG_HOME pointed at known locations. Returns the
+    # path of the ignore.json file so a test can rewrite or delete it.
+    xdg_home = tmp_path / "xdg-config"
+    ignore_file = xdg_home / "agent-snapshot" / "ignore.json"
+    ignore_file.parent.mkdir(parents=True)
+    ignore_file.write_text("[]\n")
+    for name, value in (("HOME", ROOT), ("XDG_CONFIG_HOME", xdg_home)):
+        monkeypatch.setenv(name, str(value))
+    return ignore_file
+
+
 class Snapshot:
     # Small manifest assertion helper. Tests should read like snapshot behavior,
     # not like repeated JSON tree walking, because the important contract is
@@ -74,6 +85,78 @@ def capture(tmp_path: Path, *command: str) -> Snapshot:
     return Snapshot(out)
 
 
+def test_missing_ignore_config_aborts_at_startup(tmp_path, ignore_config):
+    # Without an ignore.json the tool must exit non-zero and mention the
+    # ignore config on stderr.
+    ignore_config.unlink()
+    command = [str(BIN), "--output", str(tmp_path / "snapshot"), "--", PYTHON, "test_programs/read_clean.py"]
+    result = subprocess.run(command, cwd=ROOT, text=True, capture_output=True)
+
+    assert result.returncode != 0
+    assert "ignore" in result.stderr
+
+
+def test_ignore_config_suppresses_files_directories_and_itself(tmp_path, ignore_config):
+    # An ignored file, everything under an ignored directory, and the ignore
+    # config itself must stay out of the manifest, while an unrelated file read
+    # by the same program is still recorded.
+    ignored_file = TESTDATA / "ignored_file.txt"
+    ignored_dir = TESTDATA / "ignored_dir"
+    nested = ignored_dir / "nested.txt"
+    ignored_file.write_text("ignored file payload\n")
+    ignored_dir.mkdir()
+    nested.write_text("ignored nested payload\n")
+    ignore_config.write_text(json.dumps([str(entry) for entry in (ignored_file, ignored_dir)]) + "\n")
+
+    snap = capture(tmp_path, PYTHON, "test_programs/read_ignored_paths.py")
+    recorded = {item["path"] for item in snap.manifest["files"]}
+
+    assert str((TESTDATA / "clean.txt").resolve()) in recorded
+    for hidden in (ignored_file, nested, ignore_config):
+        assert str(hidden.resolve()) not in recorded
+
+
+def test_ignore_config_expands_home_prefix(tmp_path, ignore_config):
+    # The autouse fixture sets HOME to the repository root, so the entry
+    # "$HOME/testdata/ignored_file.txt" names the file created below. The
+    # directory and nested file exist only because the traced program reads
+    # them too.
+    target = TESTDATA / "ignored_file.txt"
+    extra_dir = TESTDATA / "ignored_dir"
+    target.write_text("ignored file payload\n")
+    extra_dir.mkdir()
+    (extra_dir / "nested.txt").write_text("nested payload\n")
+    entries = ["$HOME/testdata/ignored_file.txt"]
+    ignore_config.write_text(json.dumps(entries) + "\n")
+
+    snap = capture(tmp_path, PYTHON, "test_programs/read_ignored_paths.py")
+    recorded = {item["path"] for item in snap.manifest["files"]}
+
+    assert str(target.resolve()) not in recorded
+
+
+def test_ignore_config_expands_xdg_config_home_fallback(tmp_path, ignore_config):
+    # With XDG_CONFIG_HOME removed from the environment, "$XDG_CONFIG_HOME/..."
+    # entries have to resolve against $HOME/.config. The config file is always
+    # ignored on its own, so asserting only on that path would pass even if the
+    # expansion were broken; a second file under the fallback directory is what
+    # actually exercises it, and clean.txt is the positive control showing that
+    # reads were recorded at all.
+    home = tmp_path / "home"
+    config_home = home / ".config"
+    config_path = config_home / "agent-snapshot" / "ignore.json"
+    marker = config_home / "other-tool" / "settings.txt"
+    clean = TESTDATA / "clean.txt"
+    config_path.parent.mkdir(parents=True)
+    marker.parent.mkdir(parents=True)
+    marker.write_text("ignored via $XDG_CONFIG_HOME\n")
+    config_path.write_text(
+        json.dumps(["$XDG_CONFIG_HOME/agent-snapshot/ignore.json", "$XDG_CONFIG_HOME/other-tool"]) + "\n"
+    )
+
+    env = os.environ.copy()
+    env.pop("XDG_CONFIG_HOME", None)
+    env["HOME"] = str(home)
+    out = tmp_path / "snapshot"
+    reader = "import sys\nfor name in sys.argv[1:]:\n    open(name).read()\n"
+    subprocess.run(
+        [str(BIN), "--output", str(out), "--", PYTHON, "-c", reader, str(clean), str(config_path), str(marker)],
+        cwd=ROOT,
+        text=True,
+        check=True,
+        env=env,
+    )
+    manifest = json.loads((out / "manifest.json").read_text())
+    manifest_paths = {item["path"] for item in manifest["files"]}
+
+    assert str(clean.resolve()) in manifest_paths
+    assert str(config_path.resolve()) not in manifest_paths
+    assert str(marker.resolve()) not in manifest_paths
+
+
 def test_clean_git_tracked_read_records_repo_without_blob(tmp_path):
     # A clean tracked file is the primary compactness case. The program really
     # reads testdata/clean.txt, but the snapshot should rely on Git repository