# Repositories / olmoocr_runner.git
# tests/test_ocr_integration.py
# Clone (read-only): git clone http://git.guha-anderson.com/git/olmoocr_runner.git
import os
import re
import shutil
import subprocess
from pathlib import Path
import pytest
def _path_from_env(variable: str, fallback: str) -> Path:
    """Return the path configured in environment *variable*, else *fallback*.

    A leading ``~`` in either value is expanded via ``os.path.expanduser``.
    """
    configured = os.environ.get(variable, fallback)
    return Path(os.path.expanduser(configured))


# Directory one level above the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]

# GGUF model file used by the integration test; override with OLMOCR_GGUF_MODEL.
DEFAULT_MODEL = _path_from_env(
    "OLMOCR_GGUF_MODEL",
    "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/olmocr-2-7b-1025-fp8-q4_k_m.gguf",
)

# Companion mmproj file; override with OLMOCR_MMPROJ.
DEFAULT_MMPROJ = _path_from_env(
    "OLMOCR_MMPROJ",
    "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/mmproj-f16.gguf",
)
def _require_command(name: str) -> None:
if shutil.which(name) is None:
pytest.skip(f"{name} is required for this integration test")
def _normalized_math(text: str) -> str:
text = text.lower()
text = re.sub(r"\\\(|\\\)|\\\[|\\\]|[$`{}\\\s]", "", text)
text = text.replace("²", "^2")
return text
def _contains_markdown_table(text: str) -> bool:
lines = text.splitlines()
if any("|" in line and ("Alpha" in line or "Beta" in line) for line in lines):
return True
lowered = text.lower()
return all(tag in lowered for tag in ("<table", "</table>", "<td>alpha</td>", "<td>beta</td>"))
@pytest.mark.integration
def test_ocr_preserves_table_and_formula(tmp_path: Path) -> None:
    """Check end to end that ``ocr.sh`` keeps a table and a formula intact.

    A small Markdown document is rendered to PDF with pandoc/xelatex, the PDF
    is passed to ``ocr.sh``, and the resulting Markdown is checked for the
    table cells, a table structure, and the formula ``y = x^2``.

    The test is skipped when pandoc, xelatex, llama-server, the model file or
    the mmproj file is unavailable.
    """
    for tool in ("pandoc", "xelatex", "llama-server"):
        _require_command(tool)
    if not DEFAULT_MODEL.exists():
        pytest.skip(f"model file is missing: {DEFAULT_MODEL}")
    if not DEFAULT_MMPROJ.exists():
        pytest.skip(f"mmproj file is missing: {DEFAULT_MMPROJ}")

    # The fixture must NOT share a path with the OCR output: it already
    # contains every string asserted below, so a run in which ocr.sh wrote
    # nothing would otherwise still pass.
    source = tmp_path / "table_formula_source.md"
    pdf = tmp_path / "table_formula.pdf"
    # Resolves to tmp_path / "table_formula.md", the path this test has always
    # read; presumably ocr.sh writes "<pdf stem>.md" beside the PDF.
    output = pdf.with_suffix(".md")

    # Blank lines keep the heading, table, paragraph and display math as
    # separate Markdown blocks.
    source_lines = [
        "# Table and Formula Check",
        "",
        "| Input | Output |",
        "| --- | --- |",
        "| Alpha | One |",
        "| Beta | Four |",
        "",
        "The formula is:",
        "",
        "$$",
        "y = x^2",
        "$$",
    ]
    source.write_text("\n".join(source_lines) + "\n", encoding="utf-8")

    subprocess.run(
        ["pandoc", str(source), "-o", str(pdf), "--pdf-engine=xelatex"],
        check=True,
        cwd=REPO_ROOT,
    )
    subprocess.run(
        [str(REPO_ROOT / "ocr.sh"), str(pdf)],
        check=True,
        cwd=REPO_ROOT,
        timeout=600,
    )

    assert output.is_file(), f"ocr.sh did not produce {output}"
    markdown = output.read_text(encoding="utf-8")

    for cell in ("Alpha", "Beta", "One", "Four"):
        assert cell in markdown, f"table cell {cell!r} missing from OCR output"
    assert _contains_markdown_table(markdown)
    assert "y=x^2" in _normalized_math(markdown)