Coverage for domain / core / base_converter.py: 100.00%
35 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-07 00:07 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-07 00:07 +0000
1"""Abstract base class for document converters."""
2import abc
3import logging
4from pathlib import Path
5from typing import Optional, Callable, List, Any
logger = logging.getLogger(__name__)


class BaseConverter(metaclass=abc.ABCMeta):
    """Abstract base class for all document converters.

    Provides a template for document extraction with common progress
    callback handling. Subclasses supply two hooks:

    * ``_load_items()`` - return the document's items (pages, chapters, ...)
    * ``_extract_from_item(item)`` - return the text of one item

    The public entry points (``extract_content`` and
    ``extract_content_per_item``) drive those hooks and report progress.
    """

    def __init__(self, source_path: Path):
        """Remember which document this converter operates on.

        Args:
            source_path: Location of the source document. It is only stored
                on the instance here; nothing is opened or validated.
        """
        self.source_path = source_path

    def extract_content(
        self,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """Extract the raw text of the whole source document.

        Template method that orchestrates the extraction process:
        1. Load items (pages, chapters, etc.) from the document
        2. Extract text from each item using ``_extract_from_item()``
        3. Join all extracted text
        4. Report progress through the callback, logging callback failures

        Args:
            progress_callback: Optional callable receiving
                ``(current_index, total_count)``; called after each item
                has been processed. ``current_index`` is 1-based.

        Returns:
            Text of all items joined with a blank line (``"\\n\\n"``)
            between them; an empty string when there are no items.
        """
        items = self._load_items()
        contents = self._extract_from_items(items, progress_callback)
        return "\n\n".join(contents)

    def extract_content_per_item(
        self,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[str]:
        """Extract content as a list of strings, one per item (page/chapter).

        Args:
            progress_callback: Optional callable receiving
                ``(current_index, total_count)``; called after each item
                has been processed. ``current_index`` is 1-based.

        Returns:
            List of strings, one per document item, in item order.
        """
        items = self._load_items()
        return self._extract_from_items(items, progress_callback)

    def _extract_from_items(
        self,
        items: List[Any],
        progress_callback: Optional[Callable[[int, int], None]],
    ) -> List[str]:
        """Extract text from every item, reporting progress after each one.

        Args:
            items: Document items to process; must support ``len()``.
            progress_callback: Optional callback for progress updates.

        Returns:
            List of extracted text strings, in the same order as ``items``.
        """
        contents = []
        total = len(items)

        # Kept as an explicit loop (not a comprehension) because each
        # iteration has a side effect: the progress notification.
        for position, item in enumerate(items, start=1):
            contents.append(self._extract_from_item(item))
            self._call_progress_callback(progress_callback, position, total)

        return contents

    @staticmethod
    def _call_progress_callback(
        progress_callback: Optional[Callable[[int, int], None]],
        current: int,
        total: int,
    ) -> None:
        """Invoke the progress callback without letting it break extraction.

        Any ``Exception`` raised by the callback is logged (with traceback)
        and suppressed, so a faulty progress reporter cannot abort the
        extraction itself.

        Args:
            progress_callback: Callback to invoke, or ``None`` for no-op.
            current: Number of items processed so far.
            total: Total number of items.
        """
        # Compare against None rather than testing truthiness: a callable
        # object may legitimately be falsy (e.g. it defines __bool__ or
        # __len__) and must still be notified.
        if progress_callback is None:
            return

        try:
            progress_callback(current, total)
        except Exception as e:
            # Deliberate best-effort: log but don't re-raise. logger.exception
            # logs at ERROR level and attaches the traceback so the failing
            # callback can actually be diagnosed.
            logger.exception("Progress callback error: %s", e)

    @abc.abstractmethod
    def _load_items(self) -> List[Any]:
        """Load items (pages, chapters, etc.) from the document.

        Returns:
            List of document items ready for processing.
        """
        raise NotImplementedError("Subclasses must implement _load_items()")

    @abc.abstractmethod
    def _extract_from_item(self, item: Any) -> str:
        """Extract text from a single item.

        Args:
            item: Single document item (page, chapter, etc.).

        Returns:
            Extracted text string.
        """
        raise NotImplementedError("Subclasses must implement _extract_from_item()")