gateway/modules/serviceCenter/services/serviceExtraction/chunking/chunkerStructure.py
ValueOn AG c8b7517209 refactor: modules/services/ replaced by serviceCenter + serviceHub
serviceCenter = DI container (Resolver, Registry, Context) for service instantiation
serviceHub = consumer-facing aggregation (DB interfaces, runtime state, lazy service resolution via serviceCenter)

- modules/serviceHub/ created: ServiceHub, PublicService, getInterface()
- 22 consumer files migrated (routes, features, tests): imports switched from modules.services to serviceHub or serviceCenter
- resolver.py: legacy fallback to the old services/ removed
- modules/services/ deleted entirely (83 files, including the dead code mainAiChat.py)
- pre-extraction: progress callback propagated through the chunk pipeline (see the sketch below), operationType DATA_EXTRACT -> DATA_ANALYSE to use a cheaper model
2026-03-14 11:51:45 +01:00
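
The pre-extraction bullet above describes a progress callback threaded through the chunk pipeline. A minimal sketch of that pattern, with entirely hypothetical names (runChunkPipeline, onProgress) since this page shows only the chunker itself, not the pipeline:

from typing import Any, Callable, Dict, List, Optional

def runChunkPipeline(
    parts: List[Any],
    chunker: Any,
    options: Dict[str, Any],
    onProgress: Optional[Callable[[int, int], None]] = None,
) -> List[Dict[str, Any]]:
    # Hypothetical sketch: chunk each content part and report (done, total)
    # after every part so callers can surface pre-extraction progress.
    allChunks: List[Dict[str, Any]] = []
    for done, part in enumerate(parts, start=1):
        allChunks.extend(chunker.chunk(part, options))
        if onProgress is not None:
            onProgress(done, len(parts))
    return allChunks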


# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import json
import logging

from modules.datamodels.datamodelExtraction import ContentPart

from ..subRegistry import Chunker

logger = logging.getLogger(__name__)


class StructureChunker(Chunker):
    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Chunk a content part, preferring JSON structure over raw lines.

        Returns a list of {"data", "size", "order"} records whose data each
        stays within the structureChunkSize byte budget.
        """
        maxBytes = int(options.get("structureChunkSize", 40000))
        data = part.data or ""
        chunks: List[Dict[str, Any]] = []
        try:
            obj = json.loads(data)
            self._chunkValue(obj, maxBytes, chunks)
        except (json.JSONDecodeError, ValueError):
            # Not valid JSON -- fall back to plain line-based chunking.
            self._chunkByLines(data, maxBytes, chunks)
        return chunks
    def _chunkValue(self, obj: Any, maxBytes: int, chunks: List[Dict[str, Any]]):
        """Recursively chunk a JSON value (list or dict) into pieces <= maxBytes."""
        text = json.dumps(obj, ensure_ascii=False)
        if len(text.encode('utf-8')) <= maxBytes:
            self._emit(obj, chunks)
            return
        if isinstance(obj, list):
            self._chunkList(obj, maxBytes, chunks)
        elif isinstance(obj, dict):
            self._chunkDict(obj, maxBytes, chunks)
        else:
            self._chunkByLines(text, maxBytes, chunks)
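
    # Worked example of the dispatch above, with maxBytes=40000: a list that
    # serializes to ~100 KB goes to _chunkList and comes back as a few
    # sub-arrays of at most 40 KB each; a 100 KB dict goes to _chunkDict; a
    # single 100 KB string scalar has no structure left to split on and
    # takes the line-based fallback.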
    def _chunkList(self, items: list, maxBytes: int, chunks: List[Dict[str, Any]]):
        """Split a JSON array into sub-arrays that each fit within maxBytes."""
        bucket: list = []
        bucketSize = 2  # "[]" overhead
        for item in items:
            itemText = json.dumps(item, ensure_ascii=False)
            itemSize = len(itemText.encode('utf-8'))
            separator = 2 if bucket else 0  # ", "
            if bucketSize + itemSize + separator > maxBytes and bucket:
                self._emit(bucket, chunks)
                bucket = []
                bucketSize = 2
                separator = 0
            if itemSize + 2 > maxBytes:
                if bucket:
                    self._emit(bucket, chunks)
                    bucket = []
                    bucketSize = 2
                self._chunkValue(item, maxBytes, chunks)
            else:
                bucket.append(item)
                bucketSize += itemSize + separator
        if bucket:
            self._emit(bucket, chunks)
    def _chunkDict(self, obj: dict, maxBytes: int, chunks: List[Dict[str, Any]]):
        """Split a JSON object by keys. If a single key's value exceeds maxBytes, recurse into it."""
        if not obj:
            # Empty dict: nothing to split, emit as-is.
            self._emit(obj, chunks)
            return
        if len(obj) == 1:
            key, value = next(iter(obj.items()))
            if isinstance(value, (list, dict)):
                self._chunkSingleKeyValue(key, value, maxBytes, chunks)
            else:
                text = json.dumps(obj, ensure_ascii=False)
                self._chunkByLines(text, maxBytes, chunks)
            return
        currentChunk: Dict[str, Any] = {}
        currentSize = 2  # "{}" overhead
        for key, value in obj.items():
            itemText = json.dumps({key: value}, ensure_ascii=False)
            itemSize = len(itemText.encode('utf-8'))
            separator = 2 if currentChunk else 0
            if currentSize + itemSize + separator > maxBytes and currentChunk:
                self._emit(currentChunk, chunks)
                currentChunk = {}
                currentSize = 2
                separator = 0
            if itemSize + 2 > maxBytes:
                if currentChunk:
                    self._emit(currentChunk, chunks)
                    currentChunk = {}
                    currentSize = 2
                if isinstance(value, (list, dict)):
                    self._chunkSingleKeyValue(key, value, maxBytes, chunks)
                else:
                    self._chunkByLines(itemText, maxBytes, chunks)
            else:
                currentChunk[key] = value
                currentSize += itemSize + separator
        if currentChunk:
            self._emit(currentChunk, chunks)
    def _chunkSingleKeyValue(self, key: str, value: Any, maxBytes: int, chunks: List[Dict[str, Any]]):
        """Handle a single dict key whose value is too large. Wraps sub-chunks back in {key: subChunk}."""
        # Shrink the budget by the wrapper bytes ('{"key": ' plus '}') so the
        # re-wrapped chunks still fit within maxBytes.
        overhead = len(json.dumps({key: []}, ensure_ascii=False).encode('utf-8')) - 2
        subChunks: List[Dict[str, Any]] = []
        self._chunkValue(value, max(maxBytes - overhead, 1), subChunks)
        for sub in subChunks:
            try:
                subData = json.loads(sub["data"])
            except (json.JSONDecodeError, ValueError):
                # Sub-chunk came from the line-based fallback and is not
                # valid JSON on its own; pass it through unwrapped.
                self._chunkByLines(sub["data"], maxBytes, chunks)
                continue
            wrapped = {key: subData}
            wrappedText = json.dumps(wrapped, ensure_ascii=False)
            wrappedSize = len(wrappedText.encode('utf-8'))
            if wrappedSize <= maxBytes:
                self._emit(wrapped, chunks)
            else:
                self._chunkByLines(wrappedText, maxBytes, chunks)
    def _emit(self, bucket: Any, chunks: List[Dict[str, Any]]):
        """Serialize a bucket and append it as an ordered chunk record."""
        text = json.dumps(bucket, ensure_ascii=False)
        chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
    def _chunkByLines(self, data: str, maxBytes: int, chunks: List[Dict[str, Any]]):
        """Line-based fallback for content that cannot be split structurally.

        Note: a single line larger than maxBytes is emitted as one oversized
        chunk, since there is no smaller boundary to split on.
        """
        current: List[str] = []
        size = 0
        for line in data.split('\n'):
            s = len(line.encode('utf-8')) + 1  # +1 for the newline
            if size + s > maxBytes and current:
                text = '\n'.join(current)
                chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
                current = [line]
                size = s
            else:
                current.append(line)
                size += s
        if current:
            text = '\n'.join(current)
            chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
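

# Illustrative usage sketch, not part of the original module: it assumes the
# Chunker base class needs no constructor arguments and that chunk() only
# reads `.data` from the part, so a minimal stand-in object suffices here.
if __name__ == "__main__":
    class _DemoPart:
        def __init__(self, data: str):
            self.data = data

    payload = json.dumps({"rows": [{"id": i, "text": "x" * 200} for i in range(2000)]})
    result = StructureChunker().chunk(_DemoPart(payload), {"structureChunkSize": 40000})
    # Every chunk stays within the 40 KB budget and carries its order index.
    print(f"{len(result)} chunks, max {max(c['size'] for c in result)} bytes")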