"""Run olmOCR on a single PDF and produce a Markdown file."""
import argparse
import glob
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time
import urllib.request
from pathlib import Path
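# Paths are configurable via environment variables; the defaults assume the
# layout produced by install.sh.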
LLAMA_SERVER = os.environ.get("LLAMA_SERVER") or shutil.which("llama-server")
GGUF_MODEL = os.path.expanduser(
os.environ.get("OLMOCR_GGUF_MODEL", "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/olmocr-2-7b-1025-fp8-q4_k_m.gguf")
)
MMPROJ = os.path.expanduser(os.environ.get("OLMOCR_MMPROJ", "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/mmproj-f16.gguf"))
PORT = 30024  # local port for llama-server's OpenAI-compatible API
def wait_for_server(port, timeout=300):
"""Wait for llama-server to be ready."""
url = f"http://localhost:{port}/health"
start = time.time()
while time.time() - start < timeout:
try:
resp = urllib.request.urlopen(url, timeout=2)
if resp.status == 200:
return True
        except Exception:
            pass  # server not accepting connections yet; keep polling
time.sleep(1)
return False
def main():
parser = argparse.ArgumentParser(description="OCR a PDF to Markdown using olmOCR")
parser.add_argument("pdf", help="Path to the input PDF file")
args = parser.parse_args()
pdf_path = Path(args.pdf).resolve()
if not pdf_path.exists():
print(f"Error: {pdf_path} does not exist", file=sys.stderr)
sys.exit(1)
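    # Write the Markdown next to the input PDF, swapping only the extension.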
output_md = pdf_path.with_suffix(".md")
if not LLAMA_SERVER:
print("Error: llama-server was not found on PATH. Set LLAMA_SERVER to its full path.", file=sys.stderr)
sys.exit(1)
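    # Both the quantized weights and the multimodal projector are required.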
for model_file in (GGUF_MODEL, MMPROJ):
if not Path(model_file).exists():
print(f"Error: missing model file: {model_file}", file=sys.stderr)
print("Run ./install.sh to download the model files.", file=sys.stderr)
sys.exit(1)
    # Start llama-server with the olmOCR weights and vision projector.
    server_cmd = [
        LLAMA_SERVER,
        "-m", GGUF_MODEL,
        "--mmproj", MMPROJ,  # multimodal projector for image input
        "--port", str(PORT),
        "-c", "16384",  # context size; large enough for a full page of output
        "-ngl", "999",  # offload all layers to the GPU when one is available
        "--no-warmup",  # skip the warmup run so the server comes up faster
    ]
print(f"Starting llama-server on port {PORT} ...")
    # Send server logs to a file rather than a pipe: an unread stderr pipe
    # can fill up and block the server mid-run.
    log_path = os.path.join(tempfile.gettempdir(), f"llama_server_{PORT}.log")
    log_file = open(log_path, "wb")
    server_proc = subprocess.Popen(
        server_cmd,
        stdout=subprocess.DEVNULL,
        stderr=log_file,
    )
try:
print("Waiting for llama-server to be ready ...")
if not wait_for_server(PORT):
print("llama-server failed to start", file=sys.stderr)
            # Show the tail of the server log for debugging.
            server_proc.terminate()
            print(Path(log_path).read_text(errors="replace")[-2000:], file=sys.stderr)
sys.exit(1)
print("llama-server is ready.")
with tempfile.TemporaryDirectory(prefix="olmocr_") as tmpdir:
workspace = os.path.join(tmpdir, "workspace")
cmd = [
sys.executable, "-m", "olmocr.pipeline",
workspace,
"--pdfs", str(pdf_path),
"--markdown",
"--model", "olmocr",
"--server", f"http://localhost:{PORT}/v1",
]
print(f"Running olmOCR on {pdf_path.name} ...")
result = subprocess.run(cmd)
if result.returncode != 0:
print("olmOCR pipeline failed", file=sys.stderr)
sys.exit(result.returncode)
# Find the markdown output
md_dir = os.path.join(workspace, "markdown")
md_files = glob.glob(os.path.join(md_dir, "**", "*.md"), recursive=True)
            if not md_files:
                # Fall back to pulling text out of the JSONL result records
                # (one JSON document per line, text under the "text" key).
                results_dir = os.path.join(workspace, "results")
                jsonl_files = glob.glob(os.path.join(results_dir, "*.jsonl"))
                texts = []
                for jf in sorted(jsonl_files):
                    with open(jf) as f:
                        for line in f:
                            line = line.strip()
                            if not line:
                                continue  # skip blank lines; json.loads would choke
                            doc = json.loads(line)
                            text = doc.get("text") or ""
                            if text:
                                texts.append(text)
if texts:
output_md.write_text("\n\n".join(texts))
print(f"Output written to {output_md}")
else:
print("No output produced by olmOCR", file=sys.stderr)
sys.exit(1)
            else:
                if len(md_files) == 1:
                    shutil.copy2(md_files[0], output_md)
                else:
                    # More than one Markdown file; concatenate in sorted order.
                    parts = [Path(mf).read_text() for mf in sorted(md_files)]
                    output_md.write_text("\n\n".join(parts))
print(f"Output written to {output_md}")
    finally:
        print("Shutting down llama-server ...")
        server_proc.terminate()
        try:
            server_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            server_proc.kill()
            server_proc.wait()
        log_file.close()
if __name__ == "__main__":
main()