# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import json
import logging

from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker

logger = logging.getLogger(__name__)


class StructureChunker(Chunker):
    """Split JSON content into byte-bounded chunks along its structure.

    Arrays are split into sub-arrays and objects into key groups; input
    that cannot be parsed (or further split) as JSON degrades to a plain
    line-based split.  Each produced chunk is a dict with keys ``data``
    (serialized text), ``size`` (UTF-8 byte length of ``data``) and
    ``order`` (emission index within the output list).
    """

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Chunk ``part.data`` into pieces of at most ``structureChunkSize`` bytes.

        Args:
            part: content whose ``data`` attribute holds the raw text
                (``None`` is treated as empty).
            options: ``structureChunkSize`` (int, default 40000) caps the
                UTF-8 byte size of each chunk.  Best effort — a single
                unsplittable line may still exceed the cap.

        Returns:
            List of chunk records (``data`` / ``size`` / ``order``).
        """
        maxBytes = int(options.get("structureChunkSize", 40000))
        data = part.data or ""
        chunks: List[Dict[str, Any]] = []
        try:
            obj = json.loads(data)
            self._chunkValue(obj, maxBytes, chunks)
        except ValueError:
            # json.JSONDecodeError subclasses ValueError, so this single
            # clause covers both; non-JSON input falls back to lines.
            self._chunkByLines(data, maxBytes, chunks)
        return chunks

    def _chunkValue(self, obj: Any, maxBytes: int, chunks: List[Dict[str, Any]]) -> None:
        """Recursively chunk a JSON value (list or dict) into pieces <= maxBytes."""
        text = json.dumps(obj, ensure_ascii=False)
        if len(text.encode('utf-8')) <= maxBytes:
            self._emit(obj, chunks)
            return
        if isinstance(obj, list):
            self._chunkList(obj, maxBytes, chunks)
        elif isinstance(obj, dict):
            self._chunkDict(obj, maxBytes, chunks)
        else:
            # Oversized scalar (e.g. a very long string): no structure
            # left to exploit, split its serialized form by lines.
            self._chunkByLines(text, maxBytes, chunks)

    def _chunkList(self, items: list, maxBytes: int, chunks: List[Dict[str, Any]]) -> None:
        """Split a JSON array into sub-arrays that each fit within maxBytes."""
        bucket: list = []
        bucketSize = 2  # "[]" overhead
        for item in items:
            itemText = json.dumps(item, ensure_ascii=False)
            itemSize = len(itemText.encode('utf-8'))
            separator = 2 if bucket else 0  # ", " between array elements
            if bucketSize + itemSize + separator > maxBytes and bucket:
                self._emit(bucket, chunks)
                bucket = []
                bucketSize = 2
                separator = 0
            if itemSize + 2 > maxBytes:
                # Item alone exceeds the budget: flush what we have and
                # recurse into the item itself.
                if bucket:
                    self._emit(bucket, chunks)
                    bucket = []
                    bucketSize = 2
                self._chunkValue(item, maxBytes, chunks)
            else:
                bucket.append(item)
                bucketSize += itemSize + separator
        if bucket:
            self._emit(bucket, chunks)

    def _chunkDict(self, obj: dict, maxBytes: int, chunks: List[Dict[str, Any]]) -> None:
        """Split a JSON object by keys.

        If a single key's value exceeds maxBytes, recurse into it.
        """
        if not obj:
            # Fix: the previous code did next(iter({}.items())) here, which
            # raises StopIteration on an empty mapping (reachable when
            # maxBytes < 2).  An empty dict is emitted as-is.
            self._emit(obj, chunks)
            return
        if len(obj) == 1:
            key, value = next(iter(obj.items()))
            if isinstance(value, (list, dict)):
                self._chunkSingleKeyValue(key, value, maxBytes, chunks)
            else:
                text = json.dumps(obj, ensure_ascii=False)
                self._chunkByLines(text, maxBytes, chunks)
            return
        currentChunk: Dict[str, Any] = {}
        currentSize = 2  # "{}" overhead
        for key, value in obj.items():
            itemText = json.dumps({key: value}, ensure_ascii=False)
            itemSize = len(itemText.encode('utf-8'))
            separator = 2 if currentChunk else 0  # ", " between members
            if currentSize + itemSize + separator > maxBytes and currentChunk:
                self._emit(currentChunk, chunks)
                currentChunk = {}
                currentSize = 2
                separator = 0
            if itemSize + 2 > maxBytes:
                # A single key/value pair exceeds the budget on its own.
                if currentChunk:
                    self._emit(currentChunk, chunks)
                    currentChunk = {}
                    currentSize = 2
                if isinstance(value, (list, dict)):
                    self._chunkSingleKeyValue(key, value, maxBytes, chunks)
                else:
                    self._chunkByLines(itemText, maxBytes, chunks)
            else:
                currentChunk[key] = value
                currentSize += itemSize + separator
        if currentChunk:
            self._emit(currentChunk, chunks)

    def _chunkSingleKeyValue(self, key: str, value: Any, maxBytes: int,
                             chunks: List[Dict[str, Any]]) -> None:
        """Handle a single dict key whose value is too large.

        Wraps sub-chunks back in {key: subChunk}.
        """
        subChunks: List[Dict[str, Any]] = []
        self._chunkValue(value, maxBytes, subChunks)
        for sub in subChunks:
            try:
                subData = json.loads(sub["data"])
            except ValueError:
                # Fix: a sub-chunk produced by the line-based fallback is
                # not standalone JSON; the previous code crashed here with
                # JSONDecodeError.  Pass it through with a corrected order.
                chunks.append({"data": sub["data"],
                               "size": sub["size"],
                               "order": len(chunks)})
                continue
            wrapped = {key: subData}
            wrappedText = json.dumps(wrapped, ensure_ascii=False)
            wrappedSize = len(wrappedText.encode('utf-8'))
            if wrappedSize <= maxBytes:
                self._emit(wrapped, chunks)
            else:
                # The key wrapper pushed it back over the limit.
                self._chunkByLines(wrappedText, maxBytes, chunks)

    def _emit(self, bucket: Any, chunks: List[Dict[str, Any]]) -> None:
        """Serialize *bucket* and append it as a chunk record."""
        text = json.dumps(bucket, ensure_ascii=False)
        chunks.append({"data": text,
                       "size": len(text.encode('utf-8')),
                       "order": len(chunks)})

    def _chunkByLines(self, data: str, maxBytes: int, chunks: List[Dict[str, Any]]) -> None:
        """Line-based fallback for content that cannot be split structurally."""
        current: List[str] = []
        size = 0
        for line in data.split('\n'):
            s = len(line.encode('utf-8')) + 1  # +1 for the joining newline
            if size + s > maxBytes and current:
                text = '\n'.join(current)
                chunks.append({"data": text,
                               "size": len(text.encode('utf-8')),
                               "order": len(chunks)})
                current = [line]
                size = s
            else:
                # Note: an oversized single line is still accepted when
                # current is empty, so output chunks may exceed maxBytes.
                current.append(line)
                size += s
        if current:
            text = '\n'.join(current)
            chunks.append({"data": text,
                           "size": len(text.encode('utf-8')),
                           "order": len(chunks)})