Coverage for domain/core/base_converter.py: 100.00%

1"""Abstract base class for document converters."""

import abc
import logging
from pathlib import Path
from typing import Any, Callable, List, Optional

# Module-level logger named after this module; BaseConverter uses it to report
# progress-callback failures without aborting an extraction.
logger = logging.getLogger(__name__)

class BaseConverter(metaclass=abc.ABCMeta):
    """Abstract base class for all document converters.

    Provides a template for document extraction with common progress-callback
    handling. Subclasses supply two hooks:

    * ``_load_items()`` returns the document's items (pages, chapters, etc.).
    * ``_extract_from_item()`` turns one item into text.

    The public ``extract_content*`` methods drive those hooks and report
    progress after every item.
    """

    def __init__(self, source_path: Path):
        """Remember which document this converter works on.

        Args:
            source_path: Path of the source document. This base class only
                stores it; reading the file is left to subclasses.
        """
        self.source_path = source_path

    def extract_content(
        self,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """Extract the raw text of the whole document as a single string.

        Template method that orchestrates the extraction process:

        1. Load items (pages, chapters, etc.) via ``_load_items()``.
        2. Extract text from each item via ``_extract_from_item()``.
        3. Join all extracted text.

        Args:
            progress_callback: Optional callable receiving
                ``(current_index, total_count)``; called after each item is
                processed, with ``current_index`` starting at 1. Exceptions
                raised by the callback are logged, not propagated.

        Returns:
            The text of all items joined by double newlines; an empty string
            when the document has no items.
        """
        items = self._load_items()
        contents = self._extract_from_items(items, progress_callback)
        return "\n\n".join(contents)

    def extract_content_per_item(
        self,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[str]:
        """Extract content as a list of strings, one per item (page/chapter).

        Args:
            progress_callback: Optional callable receiving
                ``(current_index, total_count)``; called after each item is
                processed.

        Returns:
            List of strings, one per document item, in document order.
        """
        items = self._load_items()
        return self._extract_from_items(items, progress_callback)

    def _extract_from_items(
        self,
        items: List[Any],
        progress_callback: Optional[Callable[[int, int], None]],
    ) -> List[str]:
        """Extract text from every item, reporting progress after each one.

        Args:
            items: Document items to process; must support ``len()``.
            progress_callback: Optional callback for progress updates.

        Returns:
            List of extracted text strings, in the same order as ``items``.
        """
        contents: List[str] = []
        total = len(items)

        # Progress is 1-based, so the final call reports (total, total).
        for position, item in enumerate(items, start=1):
            contents.append(self._extract_from_item(item))
            self._call_progress_callback(progress_callback, position, total)

        return contents

    @staticmethod
    def _call_progress_callback(
        progress_callback: Optional[Callable[[int, int], None]],
        current: int,
        total: int,
    ) -> None:
        """Invoke the progress callback, shielding extraction from its errors.

        Args:
            progress_callback: Callback to invoke; ``None`` means no reporting.
            current: Number of items processed so far.
            total: Total number of items.
        """
        # Compare against None rather than truthiness: a callable object may
        # legitimately be falsy (e.g. an empty list-like recorder that defines
        # __call__) and must still be invoked.
        if progress_callback is None:
            return

        try:
            progress_callback(current, total)
        except Exception as e:
            # Log but don't re-raise, so a faulty callback cannot break
            # extraction. exc_info=True keeps the traceback for diagnosis.
            logger.error("Progress callback error: %s", e, exc_info=True)

    @abc.abstractmethod
    def _load_items(self) -> List[Any]:
        """Load items (pages, chapters, etc.) from the document.

        Returns:
            List of document items ready for processing.
        """
        raise NotImplementedError("Subclasses must implement _load_items()")

    @abc.abstractmethod
    def _extract_from_item(self, item: Any) -> str:
        """Extract text from a single item.

        Args:
            item: Single document item (page, chapter, etc.).

        Returns:
            Extracted text string.
        """
        raise NotImplementedError("Subclasses must implement _extract_from_item()")

Coverage for domain/core/base_converter.py: 100.00%

35 statements