Repositories / olmoocr_runner.git

tests/test_ocr_integration.py

Clone (read-only): git clone http://git.guha-anderson.com/git/olmoocr_runner.git

Branch
2405 bytes · e20153933c89
"""Integration test: render Markdown to PDF, OCR it, and check the result.

The test builds a small PDF containing a table and a display formula with
pandoc/xelatex, runs the repository's ``ocr.sh`` on it, and verifies that
the table cells and the formula are present in the produced Markdown.
"""

import os
import re
import shutil
import subprocess
from pathlib import Path

import pytest

REPO_ROOT = Path(__file__).resolve().parents[1]

# Model locations can be overridden through the environment; the defaults
# point into the user's home directory.
DEFAULT_MODEL = Path(
    os.path.expanduser(
        os.environ.get(
            "OLMOCR_GGUF_MODEL",
            "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/olmocr-2-7b-1025-fp8-q4_k_m.gguf",
        )
    )
)
DEFAULT_MMPROJ = Path(
    os.path.expanduser(
        os.environ.get(
            "OLMOCR_MMPROJ",
            "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/mmproj-f16.gguf",
        )
    )
)

# Markdown fixture rendered to PDF and then fed to the OCR pipeline.
_SOURCE_MARKDOWN = """# Table and Formula Check

| Input | Output |
| --- | --- |
| Alpha | One |
| Beta | Four |

The formula is:

$$
y = x^2
$$
"""


def _require_command(name: str) -> None:
    """Skip the current test when executable *name* is not on ``PATH``."""
    if shutil.which(name) is None:
        pytest.skip(f"{name} is required for this integration test")


def _normalized_math(text: str) -> str:
    """Return *text* reduced to a delimiter-free form for formula matching.

    The text is lower-cased; ``\\(``, ``\\)``, ``\\[``, ``\\]``, dollar
    signs, backticks, braces, backslashes and all whitespace are removed;
    and the superscript-two character is rewritten as ``^2``.
    """
    text = text.lower()
    text = re.sub(r"\\\(|\\\)|\\\[|\\\]|[$`{}\\\s]", "", text)
    return text.replace("²", "^2")


def _contains_markdown_table(text: str) -> bool:
    """Return whether *text* shows the Alpha/Beta table in a known form.

    Two renderings are accepted: a pipe-delimited row mentioning ``Alpha``
    or ``Beta`` (case-sensitive), or an HTML table whose lower-cased markup
    contains both ``<td>alpha</td>`` and ``<td>beta</td>``.
    """
    for line in text.splitlines():
        if "|" in line and ("Alpha" in line or "Beta" in line):
            return True

    lowered = text.lower()
    html_markers = ("<table", "</table>", "<td>alpha</td>", "<td>beta</td>")
    return all(marker in lowered for marker in html_markers)


@pytest.mark.integration
def test_ocr_preserves_table_and_formula(tmp_path: Path) -> None:
    """OCR a generated PDF and check that the table and formula survive.

    Skipped when pandoc, xelatex, llama-server or either model file is
    unavailable.
    """
    for command in ("pandoc", "xelatex", "llama-server"):
        _require_command(command)
    if not DEFAULT_MODEL.exists():
        pytest.skip(f"model file is missing: {DEFAULT_MODEL}")
    if not DEFAULT_MMPROJ.exists():
        pytest.skip(f"mmproj file is missing: {DEFAULT_MMPROJ}")

    # The fixture must NOT share a path with the expected OCR output.
    # Otherwise, if ocr.sh writes nothing, the test would read its own
    # input back and every assertion below would pass vacuously.
    source = tmp_path / "table_formula_source.md"
    pdf = tmp_path / "table_formula.pdf"
    # The test expects ocr.sh to leave its Markdown next to the PDF.
    output = tmp_path / "table_formula.md"

    source.write_text(_SOURCE_MARKDOWN, encoding="utf-8")

    subprocess.run(
        ["pandoc", str(source), "-o", str(pdf), "--pdf-engine=xelatex"],
        check=True,
        cwd=REPO_ROOT,
    )
    subprocess.run(
        [str(REPO_ROOT / "ocr.sh"), str(pdf)],
        check=True,
        cwd=REPO_ROOT,
        timeout=600,
    )

    assert output.exists(), f"ocr.sh did not produce {output}"
    markdown = output.read_text(encoding="utf-8")

    assert "Alpha" in markdown
    assert "Beta" in markdown
    assert "One" in markdown
    assert "Four" in markdown
    assert _contains_markdown_table(markdown)
    assert "y=x^2" in _normalized_math(markdown)