# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

from typing import Any, Dict, List
import json

from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker


class StructureChunker(Chunker):
    """Split a content part into byte-bounded chunks, preferring JSON structure.

    If the payload parses as JSON, list elements are greedily packed into
    buckets that stay under the configured byte budget; a non-list JSON value
    is emitted whole when it fits.  Anything else (invalid JSON, non-string
    input, or a single JSON value larger than the budget) falls back to
    best-effort line-based chunking of the raw text.
    """

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Chunk *part* into dicts with ``data``, ``size`` (UTF-8 bytes), ``order``.

        Options:
            structureChunkSize: soft byte budget per chunk (default 40000).
                A single list item or line larger than the budget still
                becomes its own chunk — this method never raises on
                oversized input.
        """
        maxBytes = int(options.get("structureChunkSize", 40000))
        data = part.data or ""
        try:
            obj = json.loads(data)
        except (ValueError, TypeError):
            # Not valid JSON (JSONDecodeError is a ValueError) or not
            # str/bytes: fall back to line-based chunking.
            return self._lineChunks(data, maxBytes)

        if isinstance(obj, list):
            return self._listChunks(obj, maxBytes)

        text = json.dumps(obj, ensure_ascii=False)
        if len(text.encode('utf-8')) <= maxBytes:
            return [self._makeChunk(text, 0)]
        # A single JSON value too large to emit whole: fall back to lines
        # (explicit call replaces the old raise-to-reach-except control flow).
        return self._lineChunks(data, maxBytes)

    @staticmethod
    def _makeChunk(text: str, order: int) -> Dict[str, Any]:
        # Chunk record shape shared by every chunking strategy.
        return {"data": text, "size": len(text.encode('utf-8')), "order": order}

    def _listChunks(self, items: List[Any], maxBytes: int) -> list[Dict[str, Any]]:
        # Greedily pack serialized list elements into byte-bounded buckets;
        # each bucket is re-serialized as a JSON list for the chunk payload.
        chunks: list[Dict[str, Any]] = []
        bucket: list[Any] = []
        size = 0
        for item in items:
            s = len(json.dumps(item, ensure_ascii=False).encode('utf-8'))
            if size + s > maxBytes and bucket:
                chunks.append(self._makeChunk(json.dumps(bucket, ensure_ascii=False), len(chunks)))
                bucket = [item]
                size = s
            else:
                bucket.append(item)
                size += s
        if bucket:
            chunks.append(self._makeChunk(json.dumps(bucket, ensure_ascii=False), len(chunks)))
        return chunks

    def _lineChunks(self, data: str, maxBytes: int) -> list[Dict[str, Any]]:
        # Group raw lines into chunks; the +1 per line accounts for the
        # '\n' re-inserted by join (the size counter is a soft estimate —
        # the stored "size" is recomputed from the actual joined text).
        chunks: list[Dict[str, Any]] = []
        current: List[str] = []
        size = 0
        for line in data.split('\n'):
            s = len(line.encode('utf-8')) + 1
            if size + s > maxBytes and current:
                chunks.append(self._makeChunk('\n'.join(current), len(chunks)))
                current = [line]
                size = s
            else:
                current.append(line)
                size += s
        if current:
            chunks.append(self._makeChunk('\n'.join(current), len(chunks)))
        return chunks