Coverage for domain / converters / pdf_converter.py: 100.00%

21 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-07 00:07 +0000

1"""PDF document converter.""" 

2import pytesseract 

3from PIL import Image 

4import io 

5from typing import Optional, Callable, List, Any 

6from domain.core.base_converter import BaseConverter 

7from .reader_protocols import _PDFReader 

8from .pdf_reader import PyMuPDFReader 

9 

10 

class PDFConverter(BaseConverter):
    """Converter for PDF documents with OCR fallback for scanned pages.

    Pages are obtained through an injectable reader. A page that yields no
    embedded text is rendered to an image and passed to Tesseract through
    ``pytesseract``.
    """

    def __init__(self, source_path, reader: Optional[_PDFReader] = None):
        """Initialise the converter.

        Args:
            source_path: Location of the PDF; forwarded unchanged to
                ``BaseConverter.__init__``.
            reader: Object used to open the document. A new
                ``PyMuPDFReader`` is created when this is ``None``.
        """
        super().__init__(source_path)
        # Compare against None explicitly so that an injected reader which
        # happens to evaluate as falsy is not silently replaced.
        self._reader: _PDFReader = (
            reader if reader is not None else PyMuPDFReader()
        )
        # Populated by _load_items(); see the comment there.
        self._document: Optional[Any] = None

    def _load_items(self) -> List[Any]:
        """Load all pages from the PDF document.

        Returns:
            One page object per page of the opened document, in page order.
        """
        document = self._reader.open(self.source_path)
        # Keep the document referenced for as long as the converter lives.
        # PyMuPDF pages are only usable while their parent document is alive,
        # and some releases link page -> document with a weak reference only.
        # NOTE(review): harmless if the reader already retains the document.
        self._document = document
        return [document.load_page(index) for index in range(len(document))]

    def _extract_from_item(self, page: Any) -> str:
        """Extract text from a single PDF page with OCR fallback.

        Args:
            page: Page object exposing ``get_text`` and ``get_pixmap``
                (a PyMuPDF page when the default reader is used).

        Returns:
            The page's embedded text with surrounding whitespace removed.
            When the page has no embedded text, the OCR result for the
            rendered page, likewise stripped.
        """
        text = page.get_text("text").strip()
        if text:
            return text

        # OCR fallback for scanned/empty pages.
        pixmap = page.get_pixmap()
        # Open the rendered page inside a context manager so Pillow releases
        # the image as soon as Tesseract has consumed it.
        with Image.open(io.BytesIO(pixmap.tobytes())) as image:
            # Strip for parity with the embedded-text path; a blank scan then
            # yields "" rather than stray whitespace or a form feed.
            return pytesseract.image_to_string(image).strip()