from collections import defaultdict
from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart

from ..subUtils import makeId


class TableMerger:
    """Groups table ContentParts by a configurable key and, when a size limit
    is set, splits oversized tables into row-aligned chunks."""

    def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
        """
        Merge table parts based on strategy.

        Strategy options:
        - groupBy: "parentId" (default), "documentId", "sheet", "none"
        - maxSize: maximum size (bytes) per merged part; 0 disables the limit
        - combineSheets: bool - whether to combine multiple sheets into one table

        Returns the input list unchanged when it is empty.
        """
        if not parts:
            return parts

        groupBy = strategy.get("groupBy", "parentId")
        maxSize = strategy.get("maxSize", 0)
        combineSheets = strategy.get("combineSheets", False)

        groups = self._groupParts(parts, groupBy, combineSheets)

        merged: List[ContentPart] = []
        for groupKey, groupParts in groups.items():
            if maxSize > 0:
                merged.extend(self._mergeWithSizeLimit(groupParts, maxSize, groupKey))
            else:
                merged.extend(self._mergeGroup(groupParts, groupKey))
        return merged

    def _groupParts(self, parts: List[ContentPart], groupBy: str,
                    combineSheets: bool) -> Dict[str, List[ContentPart]]:
        """Partition parts into groups keyed per the groupBy mode.

        Non-table parts pass through untouched, each isolated in its own
        group so they are never merged or chunked with anything else.
        """
        groups: Dict[str, List[ContentPart]] = defaultdict(list)
        for part in parts:
            if part.typeGroup != "table":
                groups[f"nontable_{part.id}"].append(part)
                continue

            if groupBy == "parentId":
                key = part.parentId or "root"
            elif groupBy == "documentId":
                key = part.metadata.get("documentId", "unknown")
            elif groupBy == "sheet" and not combineSheets:
                key = part.metadata.get("sheet", "unknown")
            else:
                # "none", or groupBy == "sheet" with combineSheets=True:
                # everything collapses into a single group.
                key = "all_tables"
            groups[key].append(part)
        return dict(groups)

    def _mergeGroup(self, parts: List[ContentPart], groupKey: str) -> List[ContentPart]:
        """Annotate a group's parts with group membership metadata.

        Tables are deliberately kept as separate parts; only metadata is
        added so downstream consumers can see which group each part
        belongs to and its position within it.
        """
        if not parts:
            return []
        if len(parts) == 1:
            # NOTE(review): singleton groups are returned without groupKey
            # metadata, unlike _mergeWithSizeLimit — confirm this asymmetry
            # is intentional before relying on groupKey being present.
            return parts

        for i, part in enumerate(parts):
            part.metadata["groupKey"] = groupKey
            part.metadata["groupIndex"] = i
            part.metadata["groupSize"] = len(parts)
        return parts

    def _mergeWithSizeLimit(self, parts: List[ContentPart], maxSize: int,
                            groupKey: str) -> List[ContentPart]:
        """Return parts of the group, chunking any table larger than maxSize.

        Tables are never merged across part boundaries; a part whose
        metadata["size"] exceeds maxSize is split row-wise instead.
        Every returned part (including chunks) carries the groupKey.
        """
        if not parts:
            return []

        merged: List[ContentPart] = []
        for part in parts:
            partSize = part.metadata.get("size", 0)
            if partSize <= maxSize:
                part.metadata["groupKey"] = groupKey
                merged.append(part)
            else:
                chunks = self._chunkTable(part, maxSize)
                # Consistency fix: tag chunks with the group key, matching
                # the non-chunked path above.
                for chunk in chunks:
                    chunk.metadata["groupKey"] = groupKey
                merged.extend(chunks)
        return merged

    def _chunkTable(self, part: ContentPart, maxSize: int) -> List[ContentPart]:
        """Split a large table into chunks on row boundaries.

        Rows are accumulated until adding the next row would exceed
        maxSize (UTF-8 bytes, counting one byte per newline separator).
        A single row larger than maxSize still becomes its own chunk, so
        chunks may exceed maxSize in that degenerate case.

        NOTE(review): the header row is NOT repeated in chunks after the
        first, so later chunks are not standalone CSV documents — confirm
        downstream consumers reassemble via metadata["originalPart"].
        """
        # str.split always yields at least one element, so an empty
        # part.data still produces exactly one (empty) chunk.
        lines = part.data.split('\n')

        chunks: List[ContentPart] = []
        currentChunk: List[str] = []
        currentSize = 0
        for line in lines:
            lineSize = len(line.encode('utf-8')) + 1  # +1 for newline
            if currentSize + lineSize > maxSize and currentChunk:
                # Current chunk is full: flush it and start a new one.
                chunks.append(self._buildChunk(part, currentChunk, len(chunks)))
                currentChunk = [line]
                currentSize = lineSize
            else:
                currentChunk.append(line)
                currentSize += lineSize

        if currentChunk:
            chunks.append(self._buildChunk(part, currentChunk, len(chunks)))
        return chunks

    def _buildChunk(self, part: ContentPart, chunkLines: List[str], index: int) -> ContentPart:
        """Construct one chunk ContentPart from accumulated rows of `part`."""
        chunkData = '\n'.join(chunkLines)
        return ContentPart(
            id=makeId(),
            parentId=part.parentId,
            label=f"{part.label}_chunk_{index}",
            typeGroup="table",
            mimeType=part.mimeType,
            data=chunkData,
            metadata={
                "size": len(chunkData.encode('utf-8')),
                "chunk": True,
                "originalPart": part.id,
                "chunkIndex": index,
            },
        )