gateway/modules/services/serviceExtraction/merging/mergerTable.py
from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart

from ..subUtils import makeId


class TableMerger:
    def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
        """
        Merge table parts based on strategy.

        Strategy options:
        - groupBy: "parentId" (default), "documentId", "sheet", "none"
        - maxSize: maximum size in bytes per merged part (0 means no limit)
        - combineSheets: bool - whether to combine multiple sheets into one table
        """
        if not parts:
            return parts

        groupBy = strategy.get("groupBy", "parentId")
        maxSize = strategy.get("maxSize", 0)
        combineSheets = strategy.get("combineSheets", False)

        # Group parts
        groups = self._groupParts(parts, groupBy, combineSheets)

        merged: List[ContentPart] = []
        for groupKey, groupParts in groups.items():
            if maxSize > 0:
                merged.extend(self._mergeWithSizeLimit(groupParts, maxSize, groupKey))
            else:
                merged.extend(self._mergeGroup(groupParts, groupKey))
        return merged

    def _groupParts(self, parts: List[ContentPart], groupBy: str, combineSheets: bool) -> Dict[str, List[ContentPart]]:
        groups: Dict[str, List[ContentPart]] = {}
        for part in parts:
            if part.typeGroup != "table":
                # Non-table parts go in their own group
                key = f"nontable_{part.id}"
                if key not in groups:
                    groups[key] = []
                groups[key].append(part)
                continue

            if groupBy == "parentId":
                key = part.parentId or "root"
            elif groupBy == "documentId":
                key = part.metadata.get("documentId", "unknown")
            elif groupBy == "sheet" and not combineSheets:
                key = part.metadata.get("sheet", "unknown")
            else:  # "none" or combineSheets=True
                key = "all_tables"

            if key not in groups:
                groups[key] = []
            groups[key].append(part)
        return groups

    def _mergeGroup(self, parts: List[ContentPart], groupKey: str) -> List[ContentPart]:
        if not parts:
            return []
        if len(parts) == 1:
            return parts

        # For tables, we typically keep them separate unless explicitly combining,
        # but we can add metadata about the group
        for i, part in enumerate(parts):
            part.metadata["groupKey"] = groupKey
            part.metadata["groupIndex"] = i
            part.metadata["groupSize"] = len(parts)
        return parts

    def _mergeWithSizeLimit(self, parts: List[ContentPart], maxSize: int, groupKey: str) -> List[ContentPart]:
        if not parts:
            return []

        # For tables, we typically don't merge across different tables.
        # Instead, we chunk individual large tables.
        merged: List[ContentPart] = []
        for part in parts:
            partSize = part.metadata.get("size", 0)
            if partSize <= maxSize:
                # Part fits within limit
                part.metadata["groupKey"] = groupKey
                merged.append(part)
            else:
                # Chunk the large table
                chunks = self._chunkTable(part, maxSize)
                merged.extend(chunks)
        return merged

    def _chunkTable(self, part: ContentPart, maxSize: int) -> List[ContentPart]:
        """Chunk a large table by rows while preserving CSV structure."""
        lines = part.data.split('\n')
        if not lines:
            return [part]

        chunks: List[ContentPart] = []
        currentChunk: List[str] = []
        currentSize = 0
        for line in lines:
            lineSize = len(line.encode('utf-8')) + 1  # +1 for newline
            if currentSize + lineSize > maxSize and currentChunk:
                # Flush current chunk
                chunkData = '\n'.join(currentChunk)
                chunks.append(ContentPart(
                    id=makeId(),
                    parentId=part.parentId,
                    label=f"{part.label}_chunk_{len(chunks)}",
                    typeGroup="table",
                    mimeType=part.mimeType,
                    data=chunkData,
                    metadata={
                        "size": len(chunkData.encode('utf-8')),
                        "chunk": True,
                        "originalPart": part.id,
                        "chunkIndex": len(chunks)
                    }
                ))
                currentChunk = [line]
                currentSize = lineSize
            else:
                currentChunk.append(line)
                currentSize += lineSize

        # Flush remaining chunk
        if currentChunk:
            chunkData = '\n'.join(currentChunk)
            chunks.append(ContentPart(
                id=makeId(),
                parentId=part.parentId,
                label=f"{part.label}_chunk_{len(chunks)}",
                typeGroup="table",
                mimeType=part.mimeType,
                data=chunkData,
                metadata={
                    "size": len(chunkData.encode('utf-8')),
                    "chunk": True,
                    "originalPart": part.id,
                    "chunkIndex": len(chunks)
                }
            ))
        return chunks
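
# Usage sketch (illustrative; the strategy values and the assumption that upstream
# extraction fills metadata["size"] / metadata["sheet"] come from how this class
# reads them, not from callers in this repo):
#
#     merger = TableMerger()
#     merged = merger.merge(parts, {"groupBy": "sheet", "maxSize": 64 * 1024})
#     # Tables larger than maxSize bytes come back as "<label>_chunk_N" parts whose
#     # metadata carries originalPart and chunkIndex for downstream reassembly.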