91 lines
4.1 KiB
Python
91 lines
4.1 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
from typing import Any, Dict, List
|
|
import json
|
|
|
|
from modules.datamodels.datamodelExtraction import ContentPart
|
|
from ..subRegistry import Chunker
|
|
|
|
|
|
class StructureChunker(Chunker):
    """Chunker that splits structured (JSON) content into size-bounded pieces.

    Best-effort strategy: parse the part's data as JSON and bucket it along
    element boundaries (top-level list) or key boundaries (top-level dict) so
    every chunk remains valid JSON on its own.  If the data is not JSON, or a
    single value is too large to split structurally, fall back to a greedy
    line-based split of the raw text.
    """

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Split ``part.data`` into chunks of at most ``structureChunkSize`` bytes.

        Args:
            part: content part whose ``data`` attribute holds the raw text
                (``None`` is treated as empty).
            options: may contain ``structureChunkSize`` (int-coercible,
                default 40000) - the target maximum chunk size in UTF-8 bytes.

        Returns:
            List of chunk dicts with keys ``"data"`` (str), ``"size"``
            (UTF-8 byte length of ``data``) and ``"order"`` (0-based index).
            Empty input yields an empty list.  A single unsplittable value
            larger than the limit still becomes one oversized chunk.
        """
        maxBytes = int(options.get("structureChunkSize", 40000))
        data = part.data or ""
        # Fix: previously empty input produced one spurious zero-byte chunk
        # (str.split('\n') on "" yields [''] which is truthy).
        if not data:
            return []

        # Best-effort: try JSON list/object bucketing; else fall back to
        # line-based chunking.  Structuring this as early returns (rather
        # than emitting into a shared list inside the try) guarantees the
        # fallback never stacks line chunks on top of partial JSON chunks.
        try:
            obj = json.loads(data)
            if isinstance(obj, list):
                return self._chunkJsonList(obj, maxBytes)
            return self._chunkJsonValue(obj, maxBytes)
        except Exception:
            # Not JSON, or a single value too large to split structurally.
            return self._chunkLines(data, maxBytes)

    @staticmethod
    def _makeChunk(text: str, order: int) -> Dict[str, Any]:
        # Single place that defines the chunk dict shape.
        return {"data": text, "size": len(text.encode('utf-8')), "order": order}

    def _chunkJsonList(self, items: List[Any], maxBytes: int) -> List[Dict[str, Any]]:
        # Bucket consecutive list elements until the accumulated serialized
        # size would exceed maxBytes; each chunk serializes as a JSON list.
        chunks: List[Dict[str, Any]] = []
        bucket: list[Any] = []
        size = 0
        for item in items:
            text = json.dumps(item, ensure_ascii=False)
            s = len(text.encode('utf-8'))
            if size + s > maxBytes and bucket:
                chunks.append(self._makeChunk(json.dumps(bucket, ensure_ascii=False), len(chunks)))
                bucket = [item]
                size = s
            else:
                # A single item larger than maxBytes still becomes its own
                # (oversized) chunk - we never split inside a value.
                bucket.append(item)
                size += s
        if bucket:
            chunks.append(self._makeChunk(json.dumps(bucket, ensure_ascii=False), len(chunks)))
        return chunks

    def _chunkJsonValue(self, obj: Any, maxBytes: int) -> List[Dict[str, Any]]:
        # Non-list JSON value: emit whole if it fits; split dicts by keys;
        # raise for anything else so chunk() falls back to line chunking.
        text = json.dumps(obj, ensure_ascii=False)
        if len(text.encode('utf-8')) <= maxBytes:
            return [self._makeChunk(text, 0)]
        if not isinstance(obj, dict) or len(obj) <= 1:
            # Single large scalar/value or a one-key dict - cannot be split
            # along JSON structure; the caller's except handler takes over.
            raise ValueError("too large")
        return self._chunkJsonDict(obj, maxBytes)

    def _chunkJsonDict(self, obj: Dict[str, Any], maxBytes: int) -> List[Dict[str, Any]]:
        # Split a large dict into multiple dict chunks along key boundaries.
        # This preserves JSON structure better than line-based chunking.
        chunks: List[Dict[str, Any]] = []
        currentChunk: Dict[str, Any] = {}
        currentSize = 2  # "{}" overhead of the serialized chunk
        for key, value in obj.items():
            itemText = json.dumps({key: value}, ensure_ascii=False)
            # Fix: subtract the 2 brace bytes of the single-pair dump - they
            # are already counted once in the "{}" overhead above (the old
            # code double-counted them for every item).
            itemSize = len(itemText.encode('utf-8')) - 2
            if currentChunk:
                itemSize += 2  # ", " separator json.dumps inserts between pairs
            if currentSize + itemSize > maxBytes and currentChunk:
                # Current chunk is full - emit it and start a new one.
                chunks.append(self._makeChunk(json.dumps(currentChunk, ensure_ascii=False), len(chunks)))
                currentChunk = {key: value}
                currentSize = len(itemText.encode('utf-8'))
            else:
                currentChunk[key] = value
                currentSize += itemSize
        if currentChunk:
            chunks.append(self._makeChunk(json.dumps(currentChunk, ensure_ascii=False), len(chunks)))
        return chunks

    def _chunkLines(self, data: str, maxBytes: int) -> List[Dict[str, Any]]:
        # Greedy line accumulation; the +1 per line accounts for the '\n'
        # that rejoining will insert (trailing newline slightly overcounted,
        # keeping the original sizing behavior).
        chunks: List[Dict[str, Any]] = []
        current: List[str] = []
        size = 0
        for line in data.split('\n'):
            s = len(line.encode('utf-8')) + 1
            if size + s > maxBytes and current:
                chunks.append(self._makeChunk('\n'.join(current), len(chunks)))
                current = [line]
                size = s
            else:
                current.append(line)
                size += s
        if current:
            chunks.append(self._makeChunk('\n'.join(current), len(chunks)))
        return chunks
|
|
|
|
|