Repositories / olmoocr_runner.git

run_ocr.py

Clone (read-only): git clone http://git.guha-anderson.com/git/olmoocr_runner.git

Branch
4954 bytes · a5c4777b6b0a
"""Run olmOCR on a single PDF and produce a Markdown file."""

import argparse
import glob
import http.client
import json
import os
import shutil
import signal
import subprocess
import sys
import tempfile
import time
import urllib.request
from pathlib import Path

# Path to the llama-server binary: explicit override first, then PATH lookup.
LLAMA_SERVER = os.environ.get("LLAMA_SERVER") or shutil.which("llama-server")
# Model weights and multimodal projector; both can be overridden via env vars.
GGUF_MODEL = os.path.expanduser(
    os.environ.get(
        "OLMOCR_GGUF_MODEL",
        "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/olmocr-2-7b-1025-fp8-q4_k_m.gguf",
    )
)
MMPROJ = os.path.expanduser(
    os.environ.get("OLMOCR_MMPROJ", "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/mmproj-f16.gguf")
)
# Local port llama-server listens on and the olmOCR pipeline connects to.
PORT = 30024


def wait_for_server(port, timeout=300, proc=None):
    """Wait for llama-server to be ready.

    Polls ``http://localhost:<port>/health`` about once per second until it
    answers with HTTP 200.

    Args:
        port: TCP port the server listens on.
        timeout: Maximum number of seconds to keep polling.
        proc: Optional process handle (anything with a ``poll()`` method, such
            as ``subprocess.Popen``). When given, polling stops as soon as the
            process has exited instead of waiting out the whole timeout.

    Returns:
        True once the health endpoint returns 200; False if the timeout
        elapses or the supplied process exits first.
    """
    url = f"http://localhost:{port}/health"
    # Monotonic clock: immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc is not None and proc.poll() is not None:
            # The server died; no amount of polling will bring it back.
            return False
        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                if resp.status == 200:
                    return True
        except (OSError, http.client.HTTPException):
            # Connection refused, timeouts and non-2xx answers (HTTPError is
            # an OSError subclass) all mean "not ready yet" -- keep polling.
            pass
        time.sleep(1)
    return False


def _read_log_tail(log_file, limit=2000):
    """Return roughly the last *limit* characters of the binary *log_file*."""
    log_file.seek(0, os.SEEK_END)
    size = log_file.tell()
    # A UTF-8 character is at most 4 bytes, so 4 * limit bytes is enough.
    log_file.seek(max(0, size - 4 * limit))
    return log_file.read().decode("utf-8", errors="replace")[-limit:]


def _stop_server(proc):
    """Terminate *proc*, escalating to kill if it ignores SIGTERM for 10 s."""
    if proc.poll() is not None:
        return
    proc.terminate()
    try:
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()  # reap the child so it does not linger as a zombie


def _write_output(workspace, output_md):
    """Gather the pipeline's results from *workspace* into *output_md*.

    Prefers the Markdown files under ``<workspace>/markdown``; when there are
    none, falls back to the ``text`` field of every record in
    ``<workspace>/results/*.jsonl``.

    Returns:
        True if *output_md* was written, False if no output was found.
    """
    md_files = sorted(
        glob.glob(os.path.join(workspace, "markdown", "**", "*.md"), recursive=True)
    )
    if len(md_files) == 1:
        shutil.copy2(md_files[0], output_md)
        return True
    if md_files:
        parts = [Path(mf).read_text(encoding="utf-8") for mf in md_files]
        output_md.write_text("\n\n".join(parts), encoding="utf-8")
        return True

    # Fall back to extracting from JSONL results
    texts = []
    for jf in sorted(glob.glob(os.path.join(workspace, "results", "*.jsonl"))):
        with open(jf, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines between records
                # "or" also maps an explicit null text to an empty string.
                texts.append(json.loads(line).get("text") or "")
    if not texts:
        return False
    output_md.write_text("\n\n".join(texts), encoding="utf-8")
    return True


def main():
    """Command-line entry point: OCR one PDF into a sibling ``.md`` file.

    Starts a local llama-server, runs ``olmocr.pipeline`` against it in a
    temporary workspace, writes the result next to the input PDF and always
    shuts the server down again. Exits with a non-zero status on any failure.
    """
    parser = argparse.ArgumentParser(description="OCR a PDF to Markdown using olmOCR")
    parser.add_argument("pdf", help="Path to the input PDF file")
    args = parser.parse_args()

    pdf_path = Path(args.pdf).resolve()
    if not pdf_path.exists():
        print(f"Error: {pdf_path} does not exist", file=sys.stderr)
        sys.exit(1)
    output_md = pdf_path.with_suffix(".md")

    if not LLAMA_SERVER:
        print(
            "Error: llama-server was not found on PATH. Set LLAMA_SERVER to its full path.",
            file=sys.stderr,
        )
        sys.exit(1)
    for model_file in (GGUF_MODEL, MMPROJ):
        if not Path(model_file).exists():
            print(f"Error: missing model file: {model_file}", file=sys.stderr)
            print("Run ./install.sh to download the model files.", file=sys.stderr)
            sys.exit(1)

    # Start llama-server
    server_cmd = [
        LLAMA_SERVER,
        "-m", GGUF_MODEL,
        "--mmproj", MMPROJ,
        "--port", str(PORT),
        "-c", "16384",
        "-ngl", "999",
        "--no-warmup",
    ]
    print(f"Starting llama-server on port {PORT} ...")
    # stderr goes to a temp file rather than a pipe: nothing drains a pipe
    # while the pipeline runs, so a full pipe buffer would block the server.
    with tempfile.TemporaryFile(prefix="llama_server_") as server_log:
        server_proc = subprocess.Popen(
            server_cmd,
            stdout=subprocess.DEVNULL,
            stderr=server_log,
        )
        try:
            print("Waiting for llama-server to be ready ...")
            if not wait_for_server(PORT, proc=server_proc):
                print("llama-server failed to start", file=sys.stderr)
                # Print stderr for debugging. Stop the server first so it is
                # no longer writing through the shared file offset.
                _stop_server(server_proc)
                print(_read_log_tail(server_log), file=sys.stderr)
                sys.exit(1)
            print("llama-server is ready.")

            with tempfile.TemporaryDirectory(prefix="olmocr_") as tmpdir:
                workspace = os.path.join(tmpdir, "workspace")
                cmd = [
                    sys.executable, "-m", "olmocr.pipeline",
                    workspace,
                    "--pdfs", str(pdf_path),
                    "--markdown",
                    "--model", "olmocr",
                    "--server", f"http://localhost:{PORT}/v1",
                ]
                print(f"Running olmOCR on {pdf_path.name} ...")
                result = subprocess.run(cmd)
                if result.returncode != 0:
                    print("olmOCR pipeline failed", file=sys.stderr)
                    sys.exit(result.returncode)

                # Find the markdown output
                if not _write_output(workspace, output_md):
                    print("No output produced by olmOCR", file=sys.stderr)
                    sys.exit(1)
                print(f"Output written to {output_md}")
        finally:
            print("Shutting down llama-server ...")
            _stop_server(server_proc)


if __name__ == "__main__":
    main()