Coverage for domain / core / base_converter.py: 100.00%

35 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-07 00:07 +0000

1"""Abstract base class for document converters.""" 

2import abc 

3import logging 

4from pathlib import Path 

5from typing import Optional, Callable, List, Any 

6 

logger = logging.getLogger(__name__)


class BaseConverter(metaclass=abc.ABCMeta):
    """Abstract base class for all document converters.

    Provides a template for document extraction with shared progress-callback
    handling. Concrete converters implement ``_load_items()`` and
    ``_extract_from_item()``; the public ``extract_*`` methods drive them.
    """

    def __init__(self, source_path: Path):
        """Remember the document location.

        Args:
            source_path: Path of the source document. It is only stored on the
                instance; nothing is opened or validated here.
        """
        self.source_path = source_path

    def extract_content(
        self,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """Extract the raw text of the whole source document.

        Template method that orchestrates the extraction process:
        1. Load items (pages, chapters, etc.) via ``_load_items()``.
        2. Extract text from each item via ``_extract_from_item()``.
        3. Join the extracted texts with a blank line between them.

        Args:
            progress_callback: Optional callable receiving
                ``(current_index, total_count)``; invoked after each item is
                processed, with ``current_index`` starting at 1.

        Returns:
            Text of all items concatenated, separated by double newlines.
        """
        items = self._load_items()
        contents = self._extract_from_items(items, progress_callback)
        return "\n\n".join(contents)

    def extract_content_per_item(
        self,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[str]:
        """Extract content as a list of strings, one per item (page/chapter).

        Args:
            progress_callback: Optional callable receiving
                ``(current_index, total_count)``; invoked after each item is
                processed, with ``current_index`` starting at 1.

        Returns:
            List of strings, one per document item, in item order.
        """
        items = self._load_items()
        return self._extract_from_items(items, progress_callback)

    def _extract_from_items(
        self,
        items: List[Any],
        progress_callback: Optional[Callable[[int, int], None]],
    ) -> List[str]:
        """Extract text from every item, reporting progress after each one.

        Args:
            items: Document items to process, in order.
            progress_callback: Optional callback for progress updates.

        Returns:
            List of extracted text strings, one per item.
        """
        contents: List[str] = []
        total = len(items)

        # Progress is reported 1-based: (1, total) after the first item.
        for position, item in enumerate(items, start=1):
            contents.append(self._extract_from_item(item))
            self._call_progress_callback(progress_callback, position, total)

        return contents

    @staticmethod
    def _call_progress_callback(
        progress_callback: Optional[Callable[[int, int], None]],
        current: int,
        total: int,
    ) -> None:
        """Invoke the progress callback without letting it break extraction.

        Any ``Exception`` raised by the callback is logged and suppressed.

        Args:
            progress_callback: Callback to invoke, or ``None`` to do nothing.
            current: Number of items processed so far.
            total: Total number of items.
        """
        # Compare against None explicitly: a callable object may be falsy
        # (e.g. it defines __len__ or __bool__) and must still be invoked.
        if progress_callback is None:
            return

        try:
            progress_callback(current, total)
        except Exception as exc:
            # Log but don't re-raise to prevent breaking extraction; the
            # traceback is attached so the faulty callback can be diagnosed.
            logger.error("Progress callback error: %s", exc, exc_info=True)

    @abc.abstractmethod
    def _load_items(self) -> List[Any]:
        """Load items (pages, chapters, etc.) from the document.

        Returns:
            List of document items ready for processing.
        """
        raise NotImplementedError("Subclasses must implement _load_items()")

    @abc.abstractmethod
    def _extract_from_item(self, item: Any) -> str:
        """Extract text from a single item.

        Args:
            item: Single document item (page, chapter, etc.).

        Returns:
            Extracted text string.
        """
        raise NotImplementedError("Subclasses must implement _extract_from_item()")