olmoocr_runner.git
Clone (read-only): git clone http://git.guha-anderson.com/git/olmoocr_runner.git
@@ -2,20 +2,33 @@
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-MODEL_DIR="$HOME/models/olmOCR-2-7B-1025"
-MODEL_NAME="allenai/olmOCR-2-7B-1025"
+GGUF_DIR="$HOME/models/olmOCR-2-7B-1025-Q4_K_M-GGUF"
+GGUF_FILE="$GGUF_DIR/olmocr-2-7b-1025-fp8-q4_k_m.gguf"
+GGUF_URL="https://huggingface.co/sing0717/olmOCR-2-7B-1025-FP8-Q4_K_M-GGUF/resolve/main/olmocr-2-7b-1025-fp8-q4_k_m.gguf"
+MMPROJ_FILE="$GGUF_DIR/mmproj-f16.gguf"
+MMPROJ_URL="https://huggingface.co/lmstudio-community/Qwen2.5-VL-7B-Instruct-GGUF/resolve/main/mmproj-model-f16.gguf"
 
 cd "$SCRIPT_DIR"
 
 echo "==> Running uv sync ..."
 uv sync
 
-if [ -d "$MODEL_DIR" ] && [ -f "$MODEL_DIR/config.json" ]; then
-  echo "==> Model already downloaded at $MODEL_DIR"
+mkdir -p "$GGUF_DIR"
+
+if [ -f "$GGUF_FILE" ]; then
+  echo "==> GGUF model already downloaded at $GGUF_FILE"
+else
+  echo "==> Downloading GGUF model to $GGUF_FILE ..."
+  curl -L -o "$GGUF_FILE" "$GGUF_URL"
+  echo "==> Model downloaded."
+fi
+
+if [ -f "$MMPROJ_FILE" ]; then
+  echo "==> mmproj already downloaded at $MMPROJ_FILE"
 else
-  echo "==> Downloading model $MODEL_NAME to $MODEL_DIR ..."
-  uv run python -c "from huggingface_hub import snapshot_download; snapshot_download('$MODEL_NAME', local_dir='$MODEL_DIR')"
-  echo "==> Model downloaded to $MODEL_DIR"
+  echo "==> Downloading mmproj (vision encoder) to $MMPROJ_FILE ..."
+  curl -L -o "$MMPROJ_FILE" "$MMPROJ_URL"
+  echo "==> mmproj downloaded."
 fi
 
 echo "==> Done."
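Note that the setup script skips a download whenever the target file exists, so an interrupted curl leaves a truncated GGUF that passes the [ -f ... ] check on the next run (curl -L -C - -o ... would instead resume the partial file in place). A minimal integrity check, not part of the repo, could compare each local file's size against the Content-Length reported by Hugging Face; paths and URLs below are copied from the diff above:

# Hypothetical sanity check (not in the repo): a partially downloaded file
# still passes the setup script's existence test, so compare local sizes
# against the Content-Length that Hugging Face reports.
import os
import urllib.request

# Paths and URLs copied from the setup diff above.
FILES = {
    "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/olmocr-2-7b-1025-fp8-q4_k_m.gguf":
        "https://huggingface.co/sing0717/olmOCR-2-7B-1025-FP8-Q4_K_M-GGUF/resolve/main/olmocr-2-7b-1025-fp8-q4_k_m.gguf",
    "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/mmproj-f16.gguf":
        "https://huggingface.co/lmstudio-community/Qwen2.5-VL-7B-Instruct-GGUF/resolve/main/mmproj-model-f16.gguf",
}

for path, url in FILES.items():
    local = os.path.getsize(os.path.expanduser(path))
    # HEAD request; urllib follows the redirect from the /resolve/ URL to the CDN.
    req = urllib.request.Request(url, method="HEAD")
    with urllib.request.urlopen(req, timeout=30) as resp:
        remote = int(resp.headers["Content-Length"])
    print(f"{path}: {'OK' if local == remote else 'size mismatch, re-download'}")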
@@ -5,16 +5,33 @@
 import glob
 import json
 import os
 import shutil
+import signal
 import subprocess
 import sys
 import tempfile
+import time
+import urllib.request
 from pathlib import Path
 
+LLAMA_SERVER = "/home/arjun/repos/others/llama.cpp/build/bin/llama-server"
+GGUF_MODEL = os.path.expanduser("~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/olmocr-2-7b-1025-fp8-q4_k_m.gguf")
+MMPROJ = os.path.expanduser("~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/mmproj-f16.gguf")
+PORT = 30024
-MODEL_HF = "allenai/olmOCR-2-7B-1025"
-MODEL_LOCAL = os.path.expanduser("~/models/olmOCR-2-7B-1025")
-MODEL = MODEL_LOCAL if os.path.isdir(MODEL_LOCAL) else MODEL_HF
-GPU_MEMORY_UTILIZATION = 0.5
+
+
+def wait_for_server(port, timeout=300):
+    """Wait for llama-server to be ready."""
+    url = f"http://localhost:{port}/health"
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            resp = urllib.request.urlopen(url, timeout=2)
+            if resp.status == 200:
+                return True
+        except Exception:
+            pass
+        time.sleep(1)
+    return False
 
 
 def main():
@@ -29,60 +46,89 @@ def main():
     output_md = pdf_path.with_suffix(".md")
 
-    with tempfile.TemporaryDirectory(prefix="olmocr_") as tmpdir:
-        workspace = os.path.join(tmpdir, "workspace")
-
-        cmd = [
-            sys.executable, "-m", "olmocr.pipeline",
-            workspace,
-            "--pdfs", str(pdf_path),
-            "--markdown",
-            "--model", MODEL,
-            "--gpu-memory-utilization", str(GPU_MEMORY_UTILIZATION),
-        ]
-
-        env = os.environ.copy()
-        # Use system ptxas (CUDA 13.0) instead of Triton's bundled one,
-        # which doesn't support the GB10's sm_121a architecture.
-        env["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"
-
-        print(f"Running olmOCR on {pdf_path.name} ...")
-        result = subprocess.run(cmd, env=env)
-        if result.returncode != 0:
-            print("olmOCR pipeline failed", file=sys.stderr)
-            sys.exit(result.returncode)
-
-        # Find the markdown output
-        md_dir = os.path.join(workspace, "markdown")
-        md_files = glob.glob(os.path.join(md_dir, "**", "*.md"), recursive=True)
-
-        if not md_files:
-            # Fall back to extracting from JSONL results
-            results_dir = os.path.join(workspace, "results")
-            jsonl_files = glob.glob(os.path.join(results_dir, "*.jsonl"))
-            texts = []
-            for jf in sorted(jsonl_files):
-                with open(jf) as f:
-                    for line in f:
-                        doc = json.loads(line)
-                        texts.append(doc.get("text", ""))
-            if texts:
-                output_md.write_text("\n\n".join(texts))
-                print(f"Output written to {output_md}")
-            else:
-                print("No output produced by olmOCR", file=sys.stderr)
-                sys.exit(1)
-        else:
-            # If there's a single markdown file, just copy it
-            if len(md_files) == 1:
-                shutil.copy2(md_files[0], output_md)
+    # Start llama-server
+    server_cmd = [
+        LLAMA_SERVER,
+        "-m", GGUF_MODEL,
+        "--mmproj", MMPROJ,
+        "--port", str(PORT),
+        "-c", "16384",
+        "-ngl", "999",
+        "--no-warmup",
+    ]
+
+    print(f"Starting llama-server on port {PORT} ...")
+    server_proc = subprocess.Popen(
+        server_cmd,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.PIPE,
+    )
+
+    try:
+        print("Waiting for llama-server to be ready ...")
+        if not wait_for_server(PORT):
+            print("llama-server failed to start", file=sys.stderr)
+            # Print stderr for debugging
+            server_proc.terminate()
+            stderr = server_proc.stderr.read().decode()
+            print(stderr[-2000:], file=sys.stderr)
+            sys.exit(1)
+        print("llama-server is ready.")
+
+        with tempfile.TemporaryDirectory(prefix="olmocr_") as tmpdir:
+            workspace = os.path.join(tmpdir, "workspace")
+
+            cmd = [
+                sys.executable, "-m", "olmocr.pipeline",
+                workspace,
+                "--pdfs", str(pdf_path),
+                "--markdown",
+                "--model", "olmocr",
+                "--server", f"http://localhost:{PORT}/v1",
+            ]
+
+            print(f"Running olmOCR on {pdf_path.name} ...")
+            result = subprocess.run(cmd)
+            if result.returncode != 0:
+                print("olmOCR pipeline failed", file=sys.stderr)
+                sys.exit(result.returncode)
+
+            # Find the markdown output
+            md_dir = os.path.join(workspace, "markdown")
+            md_files = glob.glob(os.path.join(md_dir, "**", "*.md"), recursive=True)
+
+            if not md_files:
+                # Fall back to extracting from JSONL results
+                results_dir = os.path.join(workspace, "results")
+                jsonl_files = glob.glob(os.path.join(results_dir, "*.jsonl"))
+                texts = []
+                for jf in sorted(jsonl_files):
+                    with open(jf) as f:
+                        for line in f:
+                            doc = json.loads(line)
+                            texts.append(doc.get("text", ""))
+                if texts:
+                    output_md.write_text("\n\n".join(texts))
+                    print(f"Output written to {output_md}")
+                else:
+                    print("No output produced by olmOCR", file=sys.stderr)
+                    sys.exit(1)
             else:
-                # Concatenate all markdown files
-                parts = []
-                for mf in sorted(md_files):
-                    parts.append(Path(mf).read_text())
-                output_md.write_text("\n\n".join(parts))
-                print(f"Output written to {output_md}")
+                if len(md_files) == 1:
+                    shutil.copy2(md_files[0], output_md)
+                else:
+                    parts = []
+                    for mf in sorted(md_files):
+                        parts.append(Path(mf).read_text())
+                    output_md.write_text("\n\n".join(parts))
+                    print(f"Output written to {output_md}")
+    finally:
+        print("Shutting down llama-server ...")
+        server_proc.terminate()
+        try:
+            server_proc.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            server_proc.kill()
 
 
 if __name__ == "__main__":