Repositories / olmoocr_runner.git
tests/test_ocr_integration.py

Clone (read-only): git clone http://git.guha-anderson.com/git/olmoocr_runner.git
2405 bytes · e20153933c89
import os
import re
import shutil
import subprocess
from pathlib import Path

import pytest


# Directory one level above the folder that contains this file; used below as
# the working directory for subprocesses and to locate ocr.sh.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Model weights: the OLMOCR_GGUF_MODEL environment variable wins when set,
# otherwise fall back to a path under the user's home directory.
_model_location = os.getenv(
    "OLMOCR_GGUF_MODEL",
    "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/olmocr-2-7b-1025-fp8-q4_k_m.gguf",
)
DEFAULT_MODEL = Path(os.path.expanduser(_model_location))

# Multimodal projector file: same override scheme via OLMOCR_MMPROJ.
_mmproj_location = os.getenv(
    "OLMOCR_MMPROJ", "~/models/olmOCR-2-7B-1025-Q4_K_M-GGUF/mmproj-f16.gguf"
)
DEFAULT_MMPROJ = Path(os.path.expanduser(_mmproj_location))


def _require_command(name: str) -> None:
    if shutil.which(name) is None:
        pytest.skip(f"{name} is required for this integration test")


def _normalized_math(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\\\(|\\\)|\\\[|\\\]|[$`{}\\\s]", "", text)
    text = text.replace("²", "^2")
    return text


def _contains_markdown_table(text: str) -> bool:
    lines = text.splitlines()
    if any("|" in line and ("Alpha" in line or "Beta" in line) for line in lines):
        return True

    lowered = text.lower()
    return all(tag in lowered for tag in ("<table", "</table>", "<td>alpha</td>", "<td>beta</td>"))


@pytest.mark.integration
def test_ocr_preserves_table_and_formula(tmp_path: Path) -> None:
    """Check that OCR of a generated PDF keeps its table and formula.

    A small Markdown document is rendered to PDF with pandoc/xelatex, the
    repository's ``ocr.sh`` is run on that PDF, and the Markdown file expected
    next to the PDF is inspected for the table cells and the formula.

    The test is skipped when pandoc, xelatex, llama-server, the model file or
    the mmproj file is unavailable.
    """
    for command in ("pandoc", "xelatex", "llama-server"):
        _require_command(command)

    if not DEFAULT_MODEL.exists():
        pytest.skip(f"model file is missing: {DEFAULT_MODEL}")
    if not DEFAULT_MMPROJ.exists():
        pytest.skip(f"mmproj file is missing: {DEFAULT_MMPROJ}")

    # The hand-written source must NOT share a path with the OCR output:
    # it already contains every string asserted below, so if the two paths
    # coincided the test would pass even when ocr.sh wrote nothing at all.
    source = tmp_path / "table_formula_source.md"
    pdf = tmp_path / "table_formula.pdf"
    # Expected OCR result: same directory and stem as the PDF, ".md" suffix.
    output = pdf.with_suffix(".md")

    source.write_text(
        """# Table and Formula Check

| Input | Output |
| --- | --- |
| Alpha | One |
| Beta | Four |

The formula is:

$$
y = x^2
$$
""",
        encoding="utf-8",
    )

    subprocess.run(
        ["pandoc", str(source), "-o", str(pdf), "--pdf-engine=xelatex"],
        check=True,
        cwd=REPO_ROOT,
    )

    subprocess.run(
        [str(REPO_ROOT / "ocr.sh"), str(pdf)],
        check=True,
        cwd=REPO_ROOT,
        timeout=600,
    )

    # Fail with a clear message if the OCR step exited 0 without writing output.
    assert output.is_file(), f"ocr.sh did not produce {output}"

    markdown = output.read_text(encoding="utf-8")
    assert "Alpha" in markdown
    assert "Beta" in markdown
    assert "One" in markdown
    assert "Four" in markdown
    assert _contains_markdown_table(markdown)
    assert "y=x^2" in _normalized_math(markdown)