olmoocr_runner.git

Clone (read-only): git clone http://git.guha-anderson.com/git/olmoocr_runner.git

Switch to llama-server + GGUF backend (4x faster)

Replace vLLM with llama-server serving a Q4_K_M GGUF model.
- vLLM BF16: 1421s, 22.5 tok/s for 17-page PDF
- llama-server GGUF: 344s, 95.4 tok/s for same PDF
Uses Qwen2.5-VL mmproj for vision support.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Arjun Guha <arjun@guha.dev>
Date: 2026-04-09 12:53:04 -0400
Commit: 8d2e1c33d2b3ce21424ca1e085df1b1217304c40
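
The change replaces vLLM's in-process engine with a separate llama-server process that the olmOCR pipeline reaches over llama-server's OpenAI-compatible HTTP API. As a rough illustration only (not code from this commit), a single-page vision request against that server could look like the sketch below; the port and model name come from run_ocr.py further down, while the image path, prompt text, and token limit are placeholders.

    # Sketch: one vision request against a llama-server started with the GGUF
    # model and Qwen2.5-VL mmproj from this commit. Assumes the server is
    # already listening on port 30024 and that page-001.png is a rendered page.
    import base64
    import json
    import urllib.request

    PORT = 30024
    with open("page-001.png", "rb") as f:
        b64 = base64.b64encode(f.read()).decode()

    payload = {
        "model": "olmocr",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": "Transcribe this page to markdown."},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/png;base64,{b64}"}},
            ],
        }],
        "max_tokens": 4096,
    }

    req = urllib.request.Request(
        f"http://localhost:{PORT}/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.loads(resp.read())["choices"][0]["message"]["content"])
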
install.sh
index b16b7bf..a047743 100755
--- a/install.sh
+++ b/install.sh
@@ -2,20 +2,33 @@
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-MODEL_DIR="$HOME/models/olmOCR-2-7B-1025"
-MODEL_NAME="allenai/olmOCR-2-7B-1025"
+GGUF_DIR="$HOME/models/olmOCR-2-7B-1025-Q4_K_M-GGUF"
+GGUF_FILE="$GGUF_DIR/olmocr-2-7b-1025-fp8-q4_k_m.gguf"
+GGUF_URL="https://huggingface.co/sing0717/olmOCR-2-7B-1025-FP8-Q4_K_M-GGUF/resolve/main/olmocr-2-7b-1025-fp8-q4_k_m.gguf"
+MMPROJ_FILE="$GGUF_DIR/mmproj-f16.gguf"
+MMPROJ_URL="https://huggingface.co/lmstudio-community/Qwen2.5-VL-7B-Instruct-GGUF/resolve/main/mmproj-model-f16.gguf"
 
 cd "$SCRIPT_DIR"
 
 echo "==> Running uv sync ..."
 uv sync
 
-if [ -d "$MODEL_DIR" ] && [ -f "$MODEL_DIR/config.json" ]; then
-    echo "==> Model already downloaded at $MODEL_DIR"
+mkdir -p "$GGUF_DIR"
+
+if [ -f "$GGUF_FILE" ]; then
+    echo "==> GGUF model already downloaded at $GGUF_FILE"
+else
+    echo "==> Downloading GGUF model to $GGUF_FILE ..."
+    curl -L -o "$GGUF_FILE" "$GGUF_URL"
+    echo "==> Model downloaded."
+fi
+
+if [ -f "$MMPROJ_FILE" ]; then
+    echo "==> mmproj already downloaded at $MMPROJ_FILE"
 else
-    echo "==> Downloading model $MODEL_NAME to $MODEL_DIR ..."
-    uv run python -c "from huggingface_hub import snapshot_download; snapshot_download('$MODEL_NAME', local_dir='$MODEL_DIR')"
-    echo "==> Model downloaded to $MODEL_DIR"
+    echo "==> Downloading mmproj (vision encoder) to $MMPROJ_FILE ..."
+    curl -L -o "$MMPROJ_FILE" "$MMPROJ_URL"
+    echo "==> mmproj downloaded."
 fi
 
 echo "==> Done."
run_ocr.py
index 04461dc..f04b869 100644
--- a/run_ocr.py
+++ b/run_ocr.py
@@ -5,16 +5,33 @@ import glob
 import json
 import os
 import shutil
+import signal
 import subprocess
 import sys
 import tempfile
+import time
+import urllib.request
 from pathlib import Path
 
+LLAMA_SERVER = "/home/arjun/repos/others/llama.cpp/build/bin/llama-server"
+GGUF_MODEL = os.path.expanduser("~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/olmocr-2-7b-1025-fp8-q4_k_m.gguf")
+MMPROJ = os.path.expanduser("~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/mmproj-f16.gguf")
+PORT = 30024
 
-MODEL_HF = "allenai/olmOCR-2-7B-1025"
-MODEL_LOCAL = os.path.expanduser("~/models/olmOCR-2-7B-1025")
-MODEL = MODEL_LOCAL if os.path.isdir(MODEL_LOCAL) else MODEL_HF
-GPU_MEMORY_UTILIZATION = 0.5
+
+def wait_for_server(port, timeout=300):
+    """Wait for llama-server to be ready."""
+    url = f"http://localhost:{port}/health"
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            resp = urllib.request.urlopen(url, timeout=2)
+            if resp.status == 200:
+                return True
+        except Exception:
+            pass
+        time.sleep(1)
+    return False
 
 
 def main():
@@ -29,60 +46,89 @@ def main():
 
     output_md = pdf_path.with_suffix(".md")
 
-    with tempfile.TemporaryDirectory(prefix="olmocr_") as tmpdir:
-        workspace = os.path.join(tmpdir, "workspace")
-
-        cmd = [
-            sys.executable, "-m", "olmocr.pipeline",
-            workspace,
-            "--pdfs", str(pdf_path),
-            "--markdown",
-            "--model", MODEL,
-            "--gpu-memory-utilization", str(GPU_MEMORY_UTILIZATION),
-        ]
-
-        env = os.environ.copy()
-        # Use system ptxas (CUDA 13.0) instead of Triton's bundled one,
-        # which doesn't support the GB10's sm_121a architecture.
-        env["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"
-
-        print(f"Running olmOCR on {pdf_path.name} ...")
-        result = subprocess.run(cmd, env=env)
-        if result.returncode != 0:
-            print("olmOCR pipeline failed", file=sys.stderr)
-            sys.exit(result.returncode)
-
-        # Find the markdown output
-        md_dir = os.path.join(workspace, "markdown")
-        md_files = glob.glob(os.path.join(md_dir, "**", "*.md"), recursive=True)
-
-        if not md_files:
-            # Fall back to extracting from JSONL results
-            results_dir = os.path.join(workspace, "results")
-            jsonl_files = glob.glob(os.path.join(results_dir, "*.jsonl"))
-            texts = []
-            for jf in sorted(jsonl_files):
-                with open(jf) as f:
-                    for line in f:
-                        doc = json.loads(line)
-                        texts.append(doc.get("text", ""))
-            if texts:
-                output_md.write_text("\n\n".join(texts))
-                print(f"Output written to {output_md}")
-            else:
-                print("No output produced by olmOCR", file=sys.stderr)
-                sys.exit(1)
-        else:
-            # If there's a single markdown file, just copy it
-            if len(md_files) == 1:
-                shutil.copy2(md_files[0], output_md)
+    # Start llama-server
+    server_cmd = [
+        LLAMA_SERVER,
+        "-m", GGUF_MODEL,
+        "--mmproj", MMPROJ,
+        "--port", str(PORT),
+        "-c", "16384",
+        "-ngl", "999",
+        "--no-warmup",
+    ]
+
+    print(f"Starting llama-server on port {PORT} ...")
+    server_proc = subprocess.Popen(
+        server_cmd,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.PIPE,
+    )
+
+    try:
+        print("Waiting for llama-server to be ready ...")
+        if not wait_for_server(PORT):
+            print("llama-server failed to start", file=sys.stderr)
+            # Print stderr for debugging
+            server_proc.terminate()
+            stderr = server_proc.stderr.read().decode()
+            print(stderr[-2000:], file=sys.stderr)
+            sys.exit(1)
+        print("llama-server is ready.")
+
+        with tempfile.TemporaryDirectory(prefix="olmocr_") as tmpdir:
+            workspace = os.path.join(tmpdir, "workspace")
+
+            cmd = [
+                sys.executable, "-m", "olmocr.pipeline",
+                workspace,
+                "--pdfs", str(pdf_path),
+                "--markdown",
+                "--model", "olmocr",
+                "--server", f"http://localhost:{PORT}/v1",
+            ]
+
+            print(f"Running olmOCR on {pdf_path.name} ...")
+            result = subprocess.run(cmd)
+            if result.returncode != 0:
+                print("olmOCR pipeline failed", file=sys.stderr)
+                sys.exit(result.returncode)
+
+            # Find the markdown output
+            md_dir = os.path.join(workspace, "markdown")
+            md_files = glob.glob(os.path.join(md_dir, "**", "*.md"), recursive=True)
+
+            if not md_files:
+                # Fall back to extracting from JSONL results
+                results_dir = os.path.join(workspace, "results")
+                jsonl_files = glob.glob(os.path.join(results_dir, "*.jsonl"))
+                texts = []
+                for jf in sorted(jsonl_files):
+                    with open(jf) as f:
+                        for line in f:
+                            doc = json.loads(line)
+                            texts.append(doc.get("text", ""))
+                if texts:
+                    output_md.write_text("\n\n".join(texts))
+                    print(f"Output written to {output_md}")
+                else:
+                    print("No output produced by olmOCR", file=sys.stderr)
+                    sys.exit(1)
             else:
-                # Concatenate all markdown files
-                parts = []
-                for mf in sorted(md_files):
-                    parts.append(Path(mf).read_text())
-                output_md.write_text("\n\n".join(parts))
-            print(f"Output written to {output_md}")
+                if len(md_files) == 1:
+                    shutil.copy2(md_files[0], output_md)
+                else:
+                    parts = []
+                    for mf in sorted(md_files):
+                        parts.append(Path(mf).read_text())
+                    output_md.write_text("\n\n".join(parts))
+                print(f"Output written to {output_md}")
+    finally:
+        print("Shutting down llama-server ...")
+        server_proc.terminate()
+        try:
+            server_proc.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            server_proc.kill()
 
 
 if __name__ == "__main__":
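
run_ocr.py now owns the server lifecycle itself: start llama-server with Popen, poll /health until it answers 200 (wait_for_server), run the olmOCR pipeline against the local --server endpoint, and terminate the server in a finally block, escalating to kill after a 10 s grace period. The same pattern, wrapped into a reusable context manager, could look like this sketch (paths, port, and flags mirror the constants above; this is not code from the commit):

    # Sketch: the start / health-poll / terminate pattern as a context manager.
    import contextlib
    import subprocess
    import time
    import urllib.request

    @contextlib.contextmanager
    def llama_server(binary, model, mmproj, port=30024, timeout=300):
        proc = subprocess.Popen(
            [binary, "-m", model, "--mmproj", mmproj,
             "--port", str(port), "-c", "16384", "-ngl", "999", "--no-warmup"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        try:
            # Poll /health until the model is loaded or the timeout expires.
            deadline = time.time() + timeout
            while True:
                try:
                    if urllib.request.urlopen(
                            f"http://localhost:{port}/health", timeout=2).status == 200:
                        break
                except Exception:
                    pass
                if time.time() > deadline:
                    raise RuntimeError("llama-server did not become ready")
                time.sleep(1)
            yield f"http://localhost:{port}/v1"
        finally:
            proc.terminate()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                proc.kill()

A caller would wrap the pipeline run in "with llama_server(LLAMA_SERVER, GGUF_MODEL, MMPROJ) as base_url:" and pass base_url to --server, so the server is shut down even when the pipeline fails.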