Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Preserve non-UTF-8 path bytes in manifest

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-02 07:40:05 -0400
Commit
ffb9868db7ba77dc06dc623566cbbb072a6f81db
src/main.cpp
index 1dc1d10..55beaff 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -130,6 +130,65 @@ std::string errno_message(const std::string& prefix) {
   return prefix + ": " + std::strerror(errno);
 }
 
+std::string bytes_hex(const std::string& input) {  // Returns every byte of input as two lowercase hex digits, concatenated in order.
+  std::ostringstream out;
+  for (unsigned char c : input) {  // iterate as unsigned char so bytes >= 0x80 stay in the range 0..255
+    out << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(c);  // cast to int so the stream prints a number, not a character; width 2 with zero fill pads small values
+  }
+  return out.str();
+}
+
+bool append_utf8_codepoint(std::string& out, const std::string& input, size_t& index) {  // Tries to decode one UTF-8 sequence at input[index]; on success appends its bytes to out, advances index past it and returns true; on failure returns false and leaves out and index unchanged.
+  const unsigned char first = static_cast<unsigned char>(input[index]);
+  size_t length = 0;  // total byte length of the sequence, set from the lead byte below
+  uint32_t codepoint = 0;  // decoded value, accumulated only to validate the sequence
+  if (first <= 0x7f) {  // ASCII: a single byte, copied as is
+    out.push_back(static_cast<char>(first));
+    ++index;
+    return true;
+  } else if ((first & 0xe0) == 0xc0) {  // 110xxxxx: lead byte of a 2-byte sequence
+    length = 2;
+    codepoint = first & 0x1f;
+  } else if ((first & 0xf0) == 0xe0) {  // 1110xxxx: lead byte of a 3-byte sequence
+    length = 3;
+    codepoint = first & 0x0f;
+  } else if ((first & 0xf8) == 0xf0) {  // 11110xxx: lead byte of a 4-byte sequence
+    length = 4;
+    codepoint = first & 0x07;
+  } else {  // stray continuation byte (10xxxxxx) or a byte in 0xf8..0xff: never a valid lead byte
+    return false;
+  }
+  if (index + length > input.size()) return false;  // sequence is cut off by the end of input
+  for (size_t offset = 1; offset < length; ++offset) {
+    const unsigned char next = static_cast<unsigned char>(input[index + offset]);
+    if ((next & 0xc0) != 0x80) return false;  // every continuation byte must have the form 10xxxxxx
+    codepoint = (codepoint << 6) | (next & 0x3f);  // fold in its 6 payload bits
+  }
+  if ((length == 2 && codepoint < 0x80) || (length == 3 && codepoint < 0x800) ||  // reject overlong encodings: the value would fit in a shorter sequence
+      (length == 4 && codepoint < 0x10000) ||
+      (codepoint >= 0xd800 && codepoint <= 0xdfff) || codepoint > 0x10ffff) {  // also reject UTF-16 surrogates and values above U+10FFFF
+    return false;
+  }
+  out.append(input, index, length);  // copy the original bytes unchanged rather than re-encoding
+  index += length;
+  return true;
+}
+
+std::string json_safe_path_text(const std::string& bytes) {  // Returns bytes with every well-formed UTF-8 sequence kept verbatim and every other byte replaced by the four characters \xNN (NN = lowercase hex of that byte).
+  std::string out;  // NOTE: a backslash already present in bytes is copied unescaped, so the text alone cannot distinguish it from an escape written here
+  for (size_t i = 0; i < bytes.size();) {  // no increment here: each iteration advances i itself
+    size_t before = i;
+    if (append_utf8_codepoint(out, bytes, i)) continue;  // valid sequence: already appended and i already advanced
+    const unsigned char bad = static_cast<unsigned char>(bytes[before]);  // the byte at which decoding failed
+    out += "\\x";  // appends a literal backslash followed by x
+    constexpr char kHex[] = "0123456789abcdef";
+    out.push_back(kHex[bad >> 4]);  // high nibble
+    out.push_back(kHex[bad & 0x0f]);  // low nibble
+    i = before + 1;  // skip only the offending byte, then retry decoding at the next one
+  }
+  return out;
+}
+
 std::string readlink_string(const fs::path& path) {
   std::vector<char> buffer(4096);
   ssize_t n = readlink(path.c_str(), buffer.data(), buffer.size() - 1);
@@ -343,7 +402,8 @@ json git_json(const GitInfo& git) {
   if (git.in_repo) {
     j["root"] = git.root;
     j["head"] = git.head;
-    j["relative_path"] = git.relative_path;
+    j["relative_path"] = json_safe_path_text(git.relative_path);
+    j["relative_path_bytes_hex"] = bytes_hex(git.relative_path);
     j["tracked"] = git.tracked;
     j["dirty"] = git.dirty;
     j["ignored"] = git.ignored;
@@ -733,7 +793,8 @@ void write_manifest(const fs::path& out, const std::vector<std::string>& command
     json ops = json::array();
     for (const auto& op : rec.operations) ops.push_back(op);
     manifest["files"].push_back({
-        {"path", rec.path},
+        {"path", json_safe_path_text(rec.path)},
+        {"path_bytes_hex", bytes_hex(rec.path)},
         {"operations", ops},
         {"before", metadata_json(rec.before)},
         {"after", metadata_json(rec.after)},
tests/test_agent_snapshot.py
index e6f5756..41e5667 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -60,8 +60,9 @@ class Snapshot:
         return self.file_by_manifest_path(target)
 
     def file_by_manifest_path(self, target: str):
+        target_bytes_hex = os.fsencode(target).hex()
         for item in self.manifest["files"]:
-            if item["path"] == target:
+            if item["path"] == target or item.get("path_bytes_hex") == target_bytes_hex:
                 return item
         raise AssertionError(f"{target} not present in snapshot")