Coverage for domain / converters / pdf_converter.py: 100.00%
21 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-07 00:07 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-07 00:07 +0000
1"""PDF document converter."""
2import pytesseract
3from PIL import Image
4import io
5from typing import Optional, Callable, List, Any
6from domain.core.base_converter import BaseConverter
7from .reader_protocols import _PDFReader
8from .pdf_reader import PyMuPDFReader
class PDFConverter(BaseConverter):
    """Converter for PDF documents with OCR fallback for scanned pages.

    Pages are obtained through an injectable reader; when none is supplied
    a ``PyMuPDFReader`` is created.
    """

    def __init__(self, source_path, reader: Optional[_PDFReader] = None):
        """Initialise the converter.

        Args:
            source_path: Forwarded unchanged to ``BaseConverter.__init__``.
            reader: Object used to open the document. Any falsy value
                (including the default ``None``) is replaced by a new
                ``PyMuPDFReader``.
        """
        super().__init__(source_path)
        self._reader: _PDFReader = reader or PyMuPDFReader()

    def _load_items(self) -> List[Any]:
        """Load all pages from PDF document.

        Returns:
            One page object per index in ``range(len(doc))``, in order.
        """
        # NOTE(review): ``self.source_path`` is presumably set by
        # ``BaseConverter.__init__`` -- confirm against the base class.
        # NOTE(review): the opened document is never closed here; the
        # returned pages presumably remain tied to it -- confirm before
        # adding a close.
        doc = self._reader.open(self.source_path)
        return [doc.load_page(i) for i in range(len(doc))]

    def _extract_from_item(self, page: Any) -> str:
        """Extract text from a single PDF page with OCR fallback.

        The page's text layer is tried first. If it is empty after
        stripping surrounding whitespace, the page is rendered to an image
        and passed to Tesseract.

        Args:
            page: PyMuPDF page object

        Returns:
            Extracted text string with leading/trailing whitespace removed,
            whichever path produced it.
        """
        text = page.get_text("text").strip()

        # OCR fallback for scanned/empty pages
        if not text:
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            # Tesseract output typically ends with a newline and a
            # form-feed page separator; strip it so both paths return text
            # normalised the same way.
            text = pytesseract.image_to_string(img).strip()

        return text