gateway/modules/services/serviceExtraction/chunking/chunkerStructure.py
2025-10-12 01:14:07 +02:00

59 lines
2.3 KiB
Python

from typing import Any, Dict, List
import json
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker
class StructureChunker(Chunker):
    """Chunker that splits structured (JSON) content into size-bounded pieces.

    JSON arrays are bucketed element-by-element into consecutive chunks;
    any other JSON value is emitted whole when it fits the limit. Content
    that is not valid JSON — or a non-array JSON value that is too large —
    falls back to greedy newline-based chunking.
    """

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Split ``part.data`` into chunks of roughly ``structureChunkSize`` bytes.

        Args:
            part: Content whose ``data`` attribute holds the raw text
                (``None`` is treated as the empty string).
            options: May contain ``structureChunkSize``, the target maximum
                chunk size in UTF-8 bytes (default 40000).

        Returns:
            List of dicts with keys ``data`` (chunk text), ``size`` (UTF-8
            byte length of the chunk) and ``order`` (0-based sequence index).
            A chunk may exceed the limit when a single array element or a
            single line is itself larger than the limit, and array-bucket
            sizing does not account for JSON separator overhead.
        """
        max_bytes = int(options.get("structureChunkSize", 40000))
        data = part.data or ""
        try:
            obj = json.loads(data)
        except (ValueError, TypeError):
            # Not parseable as JSON -> line-based fallback.
            return self._chunk_lines(data, max_bytes)

        chunks = self._chunk_json(obj, max_bytes)
        if chunks is None:
            # Non-array JSON value too large to emit whole -> line fallback.
            return self._chunk_lines(data, max_bytes)
        return chunks

    def _chunk_json(self, obj: Any, max_bytes: int) -> "List[Dict[str, Any]] | None":
        """Bucket a parsed JSON value into chunks.

        Arrays are partitioned greedily by the serialized byte size of their
        elements; any other value is serialized whole. Returns ``None`` when
        a non-array value exceeds ``max_bytes``, signalling the caller to
        fall back to line-based chunking.
        """
        chunks: List[Dict[str, Any]] = []
        if isinstance(obj, list):
            bucket: List[Any] = []
            bucket_size = 0
            for item in obj:
                item_size = len(json.dumps(item, ensure_ascii=False).encode('utf-8'))
                # Flush the current bucket only if it is non-empty, so an
                # oversized single element still becomes its own chunk.
                if bucket and bucket_size + item_size > max_bytes:
                    self._append_chunk(chunks, json.dumps(bucket, ensure_ascii=False))
                    bucket = [item]
                    bucket_size = item_size
                else:
                    bucket.append(item)
                    bucket_size += item_size
            if bucket:
                self._append_chunk(chunks, json.dumps(bucket, ensure_ascii=False))
            return chunks

        text = json.dumps(obj, ensure_ascii=False)
        if len(text.encode('utf-8')) > max_bytes:
            return None
        self._append_chunk(chunks, text)
        return chunks

    def _chunk_lines(self, data: str, max_bytes: int) -> List[Dict[str, Any]]:
        """Greedy newline-based chunking: pack whole lines until the byte
        budget is exceeded, then start a new chunk. A single line larger
        than ``max_bytes`` becomes an oversized chunk on its own.
        """
        chunks: List[Dict[str, Any]] = []
        current: List[str] = []
        current_size = 0
        for line in data.split('\n'):
            line_size = len(line.encode('utf-8')) + 1  # +1 for the joining newline
            if current and current_size + line_size > max_bytes:
                self._append_chunk(chunks, '\n'.join(current))
                current = [line]
                current_size = line_size
            else:
                current.append(line)
                current_size += line_size
        if current:
            self._append_chunk(chunks, '\n'.join(current))
        return chunks

    @staticmethod
    def _append_chunk(chunks: List[Dict[str, Any]], text: str) -> None:
        """Append a chunk record for ``text``, numbering it by position."""
        chunks.append({
            "data": text,
            "size": len(text.encode('utf-8')),
            "order": len(chunks),
        })