Repositories / agent-snapshot.git

agent-snapshot.git

Clone (read-only): git clone http://git.guha-anderson.com/git/agent-snapshot.git

Branch

Use Camomile for UTF-8 manifest strings

Author
Arjun Guha <a.guha@northeastern.edu>
Date
2026-05-03 03:38:32 -0400
Commit
8020210ddf18df71db5e00818f49bfcf33d9c8ce
dune-project
index 010ffd8..fb4eb04 100644
--- a/dune-project
+++ b/dune-project
@@ -8,4 +8,5 @@
   (ocaml (>= 5.4))
   dune
   yojson
+  camomile
   ocaml-git))
src/ocaml/agent_snapshot.ml
index 7f50437..83477e9 100644
--- a/src/ocaml/agent_snapshot.ml
+++ b/src/ocaml/agent_snapshot.ml
@@ -1,5 +1,12 @@
 module Json = Yojson.Safe
 
+let utf8_string s =
+  let module Enc = Camomile.CharEncoding in
+  try Enc.recode_string ~in_enc:Enc.utf8 ~out_enc:Enc.utf8 s
+  with Enc.Malformed_code -> Enc.recode_string ~in_enc:Enc.latin1 ~out_enc:Enc.utf8 s
+
+let jstr s = `String (utf8_string s)
+
 let at_fdcwd = -100
 let o_accmode = 0o3
 let o_rdonly = 0
@@ -414,7 +421,7 @@ let metadata_json meta =
       :: base
     else base
   in
-  let base = match meta.blob with Some blob -> ("blob", `String blob) :: base | None -> base in
+  let base = match meta.blob with Some blob -> ("blob", jstr blob) :: base | None -> base in
   `Assoc (List.rev base)
 
 let git_json git =
@@ -423,9 +430,9 @@ let git_json git =
     `Assoc
       [
         ("in_repo", `Bool true);
-        ("root", `String git.root);
-        ("head", `String git.head);
-        ("relative_path", `String git.relative_path);
+        ("root", jstr git.root);
+        ("head", jstr git.head);
+        ("relative_path", jstr git.relative_path);
         ("tracked", `Bool git.tracked);
         ("dirty", `Bool git.dirty);
         ("ignored", `Bool git.ignored);
@@ -433,7 +440,7 @@ let git_json git =
 
 let write_manifest out command exit_status =
   let repo_items =
-    Hashtbl.fold (fun _ repo acc -> `Assoc [ ("root", `String repo.root); ("head", `String repo.head); ("dirty", `Bool repo.dirty) ] :: acc) repos []
+    Hashtbl.fold (fun _ repo acc -> `Assoc [ ("root", jstr repo.root); ("head", jstr repo.head); ("dirty", `Bool repo.dirty) ] :: acc) repos []
     |> List.sort Stdlib.compare
   in
   let file_items =
@@ -443,8 +450,8 @@ let write_manifest out command exit_status =
         let git = if recd.after_git.in_repo then recd.after_git else recd.before_git in
         `Assoc
           [
-            ("path", `String recd.path);
-            ("operations", `List (List.map (fun op -> `String op) ops));
+            ("path", jstr recd.path);
+            ("operations", `List (List.map jstr ops));
             ("before", metadata_json recd.before);
             ("after", metadata_json recd.after);
             ("git", git_json git);
@@ -457,9 +464,9 @@ let write_manifest out command exit_status =
     `Assoc
       [
         ("format_version", `Int 1);
-        ("command", `List (List.map (fun arg -> `String arg) command));
+        ("command", `List (List.map jstr command));
         ("exit_status", `Int exit_status);
-        ("start_cwd", `String (Sys.getcwd ()));
+        ("start_cwd", jstr (Sys.getcwd ()));
         ("uid", `Int tracer_uid);
         ("gid", `Int tracer_gid);
         ("git_repositories", `List repo_items);
src/ocaml/dune
index c356d7e..7254f07 100644
--- a/src/ocaml/dune
+++ b/src/ocaml/dune
@@ -4,4 +4,4 @@
  (foreign_stubs
   (language c)
   (names ptrace_stubs))
- (libraries unix yojson ocaml-git))
+ (libraries unix yojson camomile ocaml-git))
tests/test_agent_snapshot.py
index efef94d..65223fb 100644
--- a/tests/test_agent_snapshot.py
+++ b/tests/test_agent_snapshot.py
@@ -347,7 +347,6 @@ def test_text_peculiar_file_names_are_recorded_and_blobbed(tmp_path):
     assert snap.blob_text(newline["before"]["blob"]) == "newline payload\n"
 
 
-@pytest.mark.skip(reason="manifest paths are still not valid UTF-8 for non-UTF-8 filenames")
 def test_non_utf8_filename_exposes_json_string_limitation(tmp_path):
     bytes_path = os.path.join(os.fsencode(WORKTREE), b"non-utf8-\xff.txt")
     with open(bytes_path, "wb") as handle: