152 lines
5.5 KiB
Python
152 lines
5.5 KiB
Python
from typing import Any, Dict, List
|
|
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
|
|
from ..subUtils import makeId
|
|
|
|
|
|
class TableMerger:
    """Group table content parts and split oversized tables into row chunks.

    Tables are never concatenated with one another here: parts are grouped,
    annotated with group metadata and, when a size limit is configured,
    individually split on line boundaries.
    """

    def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
        """
        Merge table parts based on strategy.

        Strategy options:
        - groupBy: "parentId", "documentId", "sheet" or "none". Any other
          value is handled like "none" (a single "all_tables" group).
          NOTE(review): "parentId" is described as the default, but that
          default is not applied in this class - presumably MergeStrategy
          supplies it; confirm.
        - maxSize: maximum size per part; a falsy value disables chunking
        - tableMerge["combineSheets"]: bool - when true, groupBy="sheet"
          puts all tables in one group instead of one group per sheet

        Returns the parts ordered group by group (groups in order of first
        appearance), so the result may be ordered differently from the input.
        An empty input is returned as is.
        """
        if not parts:
            return parts

        groupBy = strategy.groupBy
        maxSize = strategy.maxSize or 0
        combineSheets = strategy.tableMerge.get("combineSheets", False) if strategy.tableMerge else False

        groups = self._groupParts(parts, groupBy, combineSheets)

        merged: List[ContentPart] = []
        for groupKey, groupParts in groups.items():
            if maxSize > 0:
                merged.extend(self._mergeWithSizeLimit(groupParts, maxSize, groupKey))
            else:
                merged.extend(self._mergeGroup(groupParts, groupKey))

        return merged

    def _groupParts(self, parts: List[ContentPart], groupBy: str, combineSheets: bool) -> Dict[str, List[ContentPart]]:
        """
        Bucket parts by group key, keeping first-seen order of keys.

        Non-table parts are keyed by "nontable_<part.id>" so they are not
        grouped with tables. Table parts are keyed according to groupBy;
        missing parentId maps to "root", missing documentId/sheet metadata
        maps to "unknown".
        """
        groups: Dict[str, List[ContentPart]] = {}

        for part in parts:
            if part.typeGroup != "table":
                # Non-table parts go in their own group
                key = f"nontable_{part.id}"
            elif groupBy == "parentId":
                key = part.parentId or "root"
            elif groupBy == "documentId":
                key = part.metadata.get("documentId", "unknown")
            elif groupBy == "sheet" and not combineSheets:
                key = part.metadata.get("sheet", "unknown")
            else:  # "none", an unrecognised groupBy, or "sheet" with combineSheets=True
                key = "all_tables"

            groups.setdefault(key, []).append(part)

        return groups

    def _mergeGroup(self, parts: List[ContentPart], groupKey: str) -> List[ContentPart]:
        """
        Annotate the parts of a multi-part group with group metadata.

        Groups of zero or one part are returned untouched (no metadata is
        written). For larger groups each part's metadata is updated in place
        with "groupKey", "groupIndex" and "groupSize"; the same part objects
        are returned.
        """
        if not parts:
            return []
        if len(parts) == 1:
            return parts

        # For tables, we typically keep them separate unless explicitly combining
        # But we can add metadata about the group
        groupSize = len(parts)
        for i, part in enumerate(parts):
            part.metadata["groupKey"] = groupKey
            part.metadata["groupIndex"] = i
            part.metadata["groupSize"] = groupSize

        return parts

    def _mergeWithSizeLimit(self, parts: List[ContentPart], maxSize: int, groupKey: str) -> List[ContentPart]:
        """
        Apply the size limit to each part of a group.

        A part whose metadata "size" (0 when absent) is within maxSize is
        kept and tagged with "groupKey". A larger table part is replaced by
        its chunks, which are tagged with the same "groupKey". Larger
        non-table parts are passed through unchanged apart from the tag:
        line-splitting them would relabel them as tables.
        """
        if not parts:
            return []

        # For tables, we typically don't merge across different tables
        # Instead, we chunk individual large tables
        merged: List[ContentPart] = []

        for part in parts:
            partSize = part.metadata.get("size", 0)

            if partSize <= maxSize or part.typeGroup != "table":
                # Part fits within limit, or is not a table and must not be chunked
                part.metadata["groupKey"] = groupKey
                merged.append(part)
                continue

            # Chunk the large table; tag chunks like any other part of the group
            chunks = self._chunkTable(part, maxSize)
            for chunk in chunks:
                chunk.metadata["groupKey"] = groupKey
            merged.extend(chunks)

        return merged

    def _chunkTable(self, part: ContentPart, maxSize: int) -> List[ContentPart]:
        """
        Chunk a large table on line boundaries.

        Lines are never split, so a single line larger than maxSize still
        produces one oversized chunk. The first line (e.g. a CSV header) is
        not repeated in later chunks. Each line is budgeted as its UTF-8
        byte length plus one for a newline, which slightly over-estimates
        the joined chunk size.

        Returns [part] unchanged when the part has no data.
        """
        # str.split always yields at least one element, so emptiness has to
        # be checked on the data itself.
        if not part.data:
            return [part]

        lines = part.data.split('\n')

        chunks: List[ContentPart] = []
        currentChunk: List[str] = []
        currentSize = 0

        for line in lines:
            lineSize = len(line.encode('utf-8')) + 1  # +1 for newline

            if currentSize + lineSize > maxSize and currentChunk:
                # Flush current chunk and start a new one with this line
                chunks.append(self._makeChunk(part, currentChunk, len(chunks)))
                currentChunk = [line]
                currentSize = lineSize
            else:
                currentChunk.append(line)
                currentSize += lineSize

        # Flush remaining chunk
        if currentChunk:
            chunks.append(self._makeChunk(part, currentChunk, len(chunks)))

        return chunks

    def _makeChunk(self, part: ContentPart, chunkLines: List[str], chunkIndex: int) -> ContentPart:
        """Build the ContentPart for one chunk of `part` from its lines."""
        chunkData = '\n'.join(chunkLines)
        return ContentPart(
            id=makeId(),
            parentId=part.parentId,
            label=f"{part.label}_chunk_{chunkIndex}",
            typeGroup="table",
            mimeType=part.mimeType,
            data=chunkData,
            metadata={
                "size": len(chunkData.encode('utf-8')),
                "chunk": True,
                "originalPart": part.id,
                "chunkIndex": chunkIndex,
            },
        )
|