# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

from typing import Any, Dict, List
import json

from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker


class StructureChunker(Chunker):
    """Chunker that splits structured (JSON) content while preserving structure.

    Best-effort strategy, bounded by the ``structureChunkSize`` byte budget:

    * JSON array  -> consecutive items are bucketed into sub-arrays.
    * JSON object -> emitted whole if it fits; otherwise split by top-level keys.
    * Anything else (invalid JSON, or a value that cannot be split
      structurally) -> plain newline-based chunking of the raw text.

    A single item/key/line larger than the budget still becomes one oversized
    chunk — content is never truncated.
    """

    @staticmethod
    def _append(chunks: List[Dict[str, Any]], text: str) -> None:
        # Shared chunk record shape: payload, UTF-8 byte size, emission order.
        chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})

    def _chunkList(self, items: List[Any], maxBytes: int, chunks: List[Dict[str, Any]]) -> None:
        # Greedily bucket consecutive array items until the budget is reached.
        # Byte accounting is item-payload only (separators/brackets ignored),
        # so buckets may slightly exceed maxBytes — acceptable best-effort.
        bucket: List[Any] = []
        size = 0
        for item in items:
            text = json.dumps(item, ensure_ascii=False)
            s = len(text.encode('utf-8'))
            if size + s > maxBytes and bucket:
                self._append(chunks, json.dumps(bucket, ensure_ascii=False))
                bucket = [item]
                size = s
            else:
                bucket.append(item)
                size += s
        if bucket:
            self._append(chunks, json.dumps(bucket, ensure_ascii=False))

    def _chunkObject(self, obj: Any, maxBytes: int, chunks: List[Dict[str, Any]]) -> None:
        # Emit a non-array JSON value; split multi-key dicts by top-level keys.
        # Raises ValueError when the value cannot be split structurally, which
        # the caller turns into line-based fallback chunking.
        text = json.dumps(obj, ensure_ascii=False)
        if len(text.encode('utf-8')) <= maxBytes:
            self._append(chunks, text)
            return
        if not (isinstance(obj, dict) and len(obj) > 1):
            # Single large scalar/string or one-key object — cannot split.
            raise ValueError("too large")
        currentChunk: Dict[str, Any] = {}
        currentSize = 2  # start with "{}" overhead
        for key, value in obj.items():
            itemText = json.dumps({key: value}, ensure_ascii=False)
            itemSize = len(itemText.encode('utf-8'))
            if currentChunk:
                itemSize += 2  # ", " separator between serialized items
            if currentSize + itemSize > maxBytes and currentChunk:
                self._append(chunks, json.dumps(currentChunk, ensure_ascii=False))
                currentChunk = {key: value}
                currentSize = len(itemText.encode('utf-8'))
            else:
                currentChunk[key] = value
                currentSize += itemSize
        if currentChunk:
            self._append(chunks, json.dumps(currentChunk, ensure_ascii=False))

    def _chunkLines(self, data: str, maxBytes: int, chunks: List[Dict[str, Any]]) -> None:
        # Fallback: greedily pack newline-separated lines up to the budget.
        current: List[str] = []
        size = 0
        for line in data.split('\n'):
            s = len(line.encode('utf-8')) + 1  # +1 for the joining newline
            if size + s > maxBytes and current:
                self._append(chunks, '\n'.join(current))
                current = [line]
                size = s
            else:
                current.append(line)
                size += s
        if current:
            self._append(chunks, '\n'.join(current))

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Split ``part.data`` into chunks of at most ``structureChunkSize`` bytes.

        Args:
            part: content part whose ``data`` attribute holds the raw text
                (``None`` is treated as empty).
            options: chunking options; ``structureChunkSize`` (default 40000)
                is the per-chunk UTF-8 byte budget.

        Returns:
            List of ``{"data": str, "size": int, "order": int}`` dicts, where
            ``size`` is the UTF-8 byte length of ``data`` and ``order`` is the
            0-based emission index.
        """
        maxBytes = int(options.get("structureChunkSize", 40000))
        data = part.data or ""
        chunks: List[Dict[str, Any]] = []
        try:
            obj = json.loads(data)
            if isinstance(obj, list):
                self._chunkList(obj, maxBytes, chunks)
            else:
                self._chunkObject(obj, maxBytes, chunks)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass, so this catches
            # both invalid JSON and the deliberate "too large" signal —
            # deliberately narrower than the former blanket `except Exception`,
            # which would also have hidden genuine programming errors.
            # Reset defensively so any structurally-emitted chunks are not
            # duplicated by the line-based fallback.
            chunks = []
            self._chunkLines(data, maxBytes, chunks)
        return chunks