91 lines
4.1 KiB
Python
91 lines
4.1 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
from typing import Any, Dict, List
|
|
import json
|
|
|
|
from modules.datamodels.datamodelExtraction import ContentPart
|
|
from ..subRegistry import Chunker
|
|
|
|
|
|
class StructureChunker(Chunker):
    """Chunker that splits structured (JSON) content into size-bounded pieces.

    Best-effort strategy: parse the part's data as JSON and bucket it along
    element boundaries (top-level list) or key boundaries (top-level dict) so
    every chunk remains valid JSON on its own.  If the data is not JSON, or a
    single value is too large to split structurally, fall back to a greedy
    line-based split of the raw text.
    """

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Split ``part.data`` into chunks of at most ``structureChunkSize`` bytes.

        Args:
            part: content part whose ``data`` attribute holds the raw text
                (``None`` is treated as empty).
            options: may contain ``structureChunkSize`` (int-coercible,
                default 40000) - the target maximum chunk size in UTF-8 bytes.

        Returns:
            List of chunk dicts with keys ``"data"`` (str), ``"size"``
            (UTF-8 byte length of ``data``) and ``"order"`` (0-based index).
            Empty input yields an empty list.  A single unsplittable value
            larger than the limit still becomes one oversized chunk.
        """
        maxBytes = int(options.get("structureChunkSize", 40000))
        data = part.data or ""
        # Fix: previously empty input produced one spurious zero-byte chunk
        # (str.split('\n') on "" yields [''] which is truthy).
        if not data:
            return []

        # Best-effort: try JSON list/object bucketing; else fall back to
        # line-based chunking.  Structuring this as early returns (rather
        # than emitting into a shared list inside the try) guarantees the
        # fallback never stacks line chunks on top of partial JSON chunks.
        try:
            obj = json.loads(data)
            if isinstance(obj, list):
                return self._chunkJsonList(obj, maxBytes)
            return self._chunkJsonValue(obj, maxBytes)
        except Exception:
            # Not JSON, or a single value too large to split structurally.
            return self._chunkLines(data, maxBytes)

    @staticmethod
    def _makeChunk(text: str, order: int) -> Dict[str, Any]:
        # Single place that defines the chunk dict shape.
        return {"data": text, "size": len(text.encode('utf-8')), "order": order}

    def _chunkJsonList(self, items: List[Any], maxBytes: int) -> List[Dict[str, Any]]:
        # Bucket consecutive list elements until the accumulated serialized
        # size would exceed maxBytes; each chunk serializes as a JSON list.
        chunks: List[Dict[str, Any]] = []
        bucket: list[Any] = []
        size = 0
        for item in items:
            text = json.dumps(item, ensure_ascii=False)
            s = len(text.encode('utf-8'))
            if size + s > maxBytes and bucket:
                chunks.append(self._makeChunk(json.dumps(bucket, ensure_ascii=False), len(chunks)))
                bucket = [item]
                size = s
            else:
                # A single item larger than maxBytes still becomes its own
                # (oversized) chunk - we never split inside a value.
                bucket.append(item)
                size += s
        if bucket:
            chunks.append(self._makeChunk(json.dumps(bucket, ensure_ascii=False), len(chunks)))
        return chunks

    def _chunkJsonValue(self, obj: Any, maxBytes: int) -> List[Dict[str, Any]]:
        # Non-list JSON value: emit whole if it fits; split dicts by keys;
        # raise for anything else so chunk() falls back to line chunking.
        text = json.dumps(obj, ensure_ascii=False)
        if len(text.encode('utf-8')) <= maxBytes:
            return [self._makeChunk(text, 0)]
        if not isinstance(obj, dict) or len(obj) <= 1:
            # Single large scalar/value or a one-key dict - cannot be split
            # along JSON structure; the caller's except handler takes over.
            raise ValueError("too large")
        return self._chunkJsonDict(obj, maxBytes)

    def _chunkJsonDict(self, obj: Dict[str, Any], maxBytes: int) -> List[Dict[str, Any]]:
        # Split a large dict into multiple dict chunks along key boundaries.
        # This preserves JSON structure better than line-based chunking.
        chunks: List[Dict[str, Any]] = []
        currentChunk: Dict[str, Any] = {}
        currentSize = 2  # "{}" overhead of the serialized chunk
        for key, value in obj.items():
            itemText = json.dumps({key: value}, ensure_ascii=False)
            # Fix: subtract the 2 brace bytes of the single-pair dump - they
            # are already counted once in the "{}" overhead above (the old
            # code double-counted them for every item).
            itemSize = len(itemText.encode('utf-8')) - 2
            if currentChunk:
                itemSize += 2  # ", " separator json.dumps inserts between pairs
            if currentSize + itemSize > maxBytes and currentChunk:
                # Current chunk is full - emit it and start a new one.
                chunks.append(self._makeChunk(json.dumps(currentChunk, ensure_ascii=False), len(chunks)))
                currentChunk = {key: value}
                currentSize = len(itemText.encode('utf-8'))
            else:
                currentChunk[key] = value
                currentSize += itemSize
        if currentChunk:
            chunks.append(self._makeChunk(json.dumps(currentChunk, ensure_ascii=False), len(chunks)))
        return chunks

    def _chunkLines(self, data: str, maxBytes: int) -> List[Dict[str, Any]]:
        # Greedy line accumulation; the +1 per line accounts for the '\n'
        # that rejoining will insert (trailing newline slightly overcounted,
        # keeping the original sizing behavior).
        chunks: List[Dict[str, Any]] = []
        current: List[str] = []
        size = 0
        for line in data.split('\n'):
            s = len(line.encode('utf-8')) + 1
            if size + s > maxBytes and current:
                chunks.append(self._makeChunk('\n'.join(current), len(chunks)))
                current = [line]
                size = s
            else:
                current.append(line)
                size += s
        if current:
            chunks.append(self._makeChunk('\n'.join(current), len(chunks)))
        return chunks
|
|
|
|
|