"""Run olmOCR on a single PDF and produce a Markdown file."""
import argparse
import glob
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time
import urllib.request
from pathlib import Path
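# Paths are configurable via environment variables; the defaults assume the
# layout produced by install.sh.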
LLAMA_SERVER = os.environ.get("LLAMA_SERVER") or shutil.which("llama-server")
GGUF_MODEL = os.path.expanduser(
os.environ.get("OLMOCR_GGUF_MODEL", "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/olmocr-2-7b-1025-fp8-q4_k_m.gguf")
)
MMPROJ = os.path.expanduser(os.environ.get("OLMOCR_MMPROJ", "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/mmproj-f16.gguf"))
PORT = 30024  # local port for llama-server's OpenAI-compatible API
def wait_for_server(port, timeout=300):
"""Wait for llama-server to be ready."""
url = f"http://localhost:{port}/health"
start = time.time()
while time.time() - start < timeout:
try:
resp = urllib.request.urlopen(url, timeout=2)
if resp.status == 200:
return True
        except Exception:
            pass  # server not accepting connections yet; keep polling
time.sleep(1)
return False
def main():
parser = argparse.ArgumentParser(description="OCR a PDF to Markdown using olmOCR")
parser.add_argument("pdf", help="Path to the input PDF file")
args = parser.parse_args()
pdf_path = Path(args.pdf).resolve()
if not pdf_path.exists():
print(f"Error: {pdf_path} does not exist", file=sys.stderr)
sys.exit(1)
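    # Write the Markdown next to the input PDF, swapping only the extension.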
output_md = pdf_path.with_suffix(".md")
if not LLAMA_SERVER:
print("Error: llama-server was not found on PATH. Set LLAMA_SERVER to its full path.", file=sys.stderr)
sys.exit(1)
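    # Both the quantized weights and the multimodal projector are required.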
for model_file in (GGUF_MODEL, MMPROJ):
if not Path(model_file).exists():
print(f"Error: missing model file: {model_file}", file=sys.stderr)
print("Run ./install.sh to download the model files.", file=sys.stderr)
sys.exit(1)
    # Start llama-server with the olmOCR weights and vision projector.
    server_cmd = [
        LLAMA_SERVER,
        "-m", GGUF_MODEL,
        "--mmproj", MMPROJ,  # multimodal projector for image input
        "--port", str(PORT),
        "-c", "16384",  # context size; large enough for a full page of output
        "-ngl", "999",  # offload all layers to the GPU when one is available
        "--no-warmup",  # skip the warmup run so the server comes up faster
    ]
print(f"Starting llama-server on port {PORT} ...")
    # Send server logs to a file rather than a pipe: an unread stderr pipe
    # can fill up and block the server mid-run.
    log_path = os.path.join(tempfile.gettempdir(), f"llama_server_{PORT}.log")
    log_file = open(log_path, "wb")
    server_proc = subprocess.Popen(
        server_cmd,
        stdout=subprocess.DEVNULL,
        stderr=log_file,
    )
try:
print("Waiting for llama-server to be ready ...")
if not wait_for_server(PORT):
print("llama-server failed to start", file=sys.stderr)
            # Show the tail of the server log for debugging.
            server_proc.terminate()
            print(Path(log_path).read_text(errors="replace")[-2000:], file=sys.stderr)
sys.exit(1)
print("llama-server is ready.")
with tempfile.TemporaryDirectory(prefix="olmocr_") as tmpdir:
workspace = os.path.join(tmpdir, "workspace")
cmd = [
sys.executable, "-m", "olmocr.pipeline",
workspace,
"--pdfs", str(pdf_path),
"--markdown",
"--model", "olmocr",
"--server", f"http://localhost:{PORT}/v1",
]
print(f"Running olmOCR on {pdf_path.name} ...")
result = subprocess.run(cmd)
if result.returncode != 0:
print("olmOCR pipeline failed", file=sys.stderr)
sys.exit(result.returncode)
# Find the markdown output
md_dir = os.path.join(workspace, "markdown")
md_files = glob.glob(os.path.join(md_dir, "**", "*.md"), recursive=True)
            if not md_files:
                # Fall back to pulling text out of the JSONL result records
                # (one JSON document per line, text under the "text" key).
                results_dir = os.path.join(workspace, "results")
                jsonl_files = glob.glob(os.path.join(results_dir, "*.jsonl"))
                texts = []
                for jf in sorted(jsonl_files):
                    with open(jf) as f:
                        for line in f:
                            line = line.strip()
                            if not line:
                                continue  # skip blank lines; json.loads would choke
                            doc = json.loads(line)
                            text = doc.get("text") or ""
                            if text:
                                texts.append(text)
if texts:
output_md.write_text("\n\n".join(texts))
print(f"Output written to {output_md}")
else:
print("No output produced by olmOCR", file=sys.stderr)
sys.exit(1)
            else:
                if len(md_files) == 1:
                    shutil.copy2(md_files[0], output_md)
                else:
                    # More than one Markdown file; concatenate in sorted order.
                    parts = [Path(mf).read_text() for mf in sorted(md_files)]
                    output_md.write_text("\n\n".join(parts))
print(f"Output written to {output_md}")
    finally:
        print("Shutting down llama-server ...")
        server_proc.terminate()
        try:
            server_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            server_proc.kill()
            server_proc.wait()
        log_file.close()
if __name__ == "__main__":
main()