serviceCenter = DI container (Resolver, Registry, Context) for service instantiation
serviceHub = consumer-facing aggregation (DB interfaces, runtime state, lazy service resolution via serviceCenter; see the sketch below)

- modules/serviceHub/ created: ServiceHub, PublicService, getInterface()
- 22 consumer files migrated (routes, features, tests): imports switched from modules.services to serviceHub or serviceCenter
- resolver.py: legacy fallback to the old services/ removed
- modules/services/ deleted entirely (83 files, incl. dead code mainAiChat.py)
- pre-extraction: progress callback propagated through the chunk pipeline; operationType DATA_EXTRACT -> DATA_ANALYSE to use a cheaper model
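A minimal sketch of the lazy-resolution pattern described above. The ServiceHub and getInterface() names come from this change; the resolver object and its resolve() method are assumptions, since the actual serviceCenter internals are not shown here:

# Hedged sketch -- resolver.resolve(name) is an assumed serviceCenter API,
# not confirmed by this commit.
from typing import Any, Dict


class ServiceHub:
    """Consumer-facing aggregation: resolves services lazily via serviceCenter."""

    def __init__(self, resolver: Any):
        self._resolver = resolver            # serviceCenter resolver (assumed interface)
        self._instances: Dict[str, Any] = {}  # cache of resolved service instances

    def getInterface(self, name: str) -> Any:
        # Resolve on first access only; later calls return the cached instance.
        if name not in self._instances:
            self._instances[name] = self._resolver.resolve(name)
        return self._instances[name]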
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

import json
import logging
from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker

logger = logging.getLogger(__name__)


class StructureChunker(Chunker):
    """Chunk structured (JSON) content along element boundaries, falling back
    to line-based splitting for content that does not parse as JSON."""

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> List[Dict[str, Any]]:
        maxBytes = int(options.get("structureChunkSize", 40000))
        data = part.data or ""
        chunks: List[Dict[str, Any]] = []

        try:
            obj = json.loads(data)
            self._chunkValue(obj, maxBytes, chunks)
        except (json.JSONDecodeError, ValueError):
            self._chunkByLines(data, maxBytes, chunks)

        return chunks

    def _chunkValue(self, obj: Any, maxBytes: int, chunks: List[Dict[str, Any]]):
        """Recursively chunk a JSON value (list or dict) into pieces <= maxBytes."""
        text = json.dumps(obj, ensure_ascii=False)
        if len(text.encode('utf-8')) <= maxBytes:
            self._emit(obj, chunks)
            return

        if isinstance(obj, list):
            self._chunkList(obj, maxBytes, chunks)
        elif isinstance(obj, dict):
            self._chunkDict(obj, maxBytes, chunks)
        else:
            self._chunkByLines(text, maxBytes, chunks)

    def _chunkList(self, items: list, maxBytes: int, chunks: List[Dict[str, Any]]):
        """Split a JSON array into sub-arrays that each fit within maxBytes."""
        bucket: list = []
        bucketSize = 2  # "[]" overhead

        for item in items:
            itemText = json.dumps(item, ensure_ascii=False)
            itemSize = len(itemText.encode('utf-8'))
            separator = 2 if bucket else 0  # ", "

            if bucketSize + itemSize + separator > maxBytes and bucket:
                self._emit(bucket, chunks)
                bucket = []
                bucketSize = 2
                separator = 0

            if itemSize + 2 > maxBytes:
                # A single item exceeds the limit on its own: flush the bucket
                # and recurse into the oversized item.
                if bucket:
                    self._emit(bucket, chunks)
                    bucket = []
                    bucketSize = 2
                self._chunkValue(item, maxBytes, chunks)
            else:
                bucket.append(item)
                bucketSize += itemSize + separator

        if bucket:
            self._emit(bucket, chunks)

    def _chunkDict(self, obj: dict, maxBytes: int, chunks: List[Dict[str, Any]]):
        """Split a JSON object by keys. If a single key's value exceeds maxBytes, recurse into it."""
        if not obj:
            # Nothing to chunk; also avoids StopIteration from next() below.
            return
        if len(obj) == 1:
            key, value = next(iter(obj.items()))
            if isinstance(value, (list, dict)):
                self._chunkSingleKeyValue(key, value, maxBytes, chunks)
            else:
                text = json.dumps(obj, ensure_ascii=False)
                self._chunkByLines(text, maxBytes, chunks)
            return

        currentChunk: Dict[str, Any] = {}
        currentSize = 2  # "{}" overhead

        for key, value in obj.items():
            itemText = json.dumps({key: value}, ensure_ascii=False)
            itemSize = len(itemText.encode('utf-8'))
            separator = 2 if currentChunk else 0

            if currentSize + itemSize + separator > maxBytes and currentChunk:
                self._emit(currentChunk, chunks)
                currentChunk = {}
                currentSize = 2
                separator = 0

            if itemSize + 2 > maxBytes:
                if currentChunk:
                    self._emit(currentChunk, chunks)
                    currentChunk = {}
                    currentSize = 2
                if isinstance(value, (list, dict)):
                    self._chunkSingleKeyValue(key, value, maxBytes, chunks)
                else:
                    self._chunkByLines(itemText, maxBytes, chunks)
            else:
                currentChunk[key] = value
                currentSize += itemSize + separator

        if currentChunk:
            self._emit(currentChunk, chunks)

    def _chunkSingleKeyValue(self, key: str, value: Any, maxBytes: int, chunks: List[Dict[str, Any]]):
        """Handle a single dict key whose value is too large. Wraps sub-chunks back in {key: subChunk}."""
        subChunks: List[Dict[str, Any]] = []
        self._chunkValue(value, maxBytes, subChunks)

        for sub in subChunks:
            subData = json.loads(sub["data"])
            wrapped = {key: subData}
            wrappedText = json.dumps(wrapped, ensure_ascii=False)
            wrappedSize = len(wrappedText.encode('utf-8'))
            if wrappedSize <= maxBytes:
                self._emit(wrapped, chunks)
            else:
                self._chunkByLines(wrappedText, maxBytes, chunks)

    def _emit(self, bucket: Any, chunks: List[Dict[str, Any]]):
        """Serialize a finished bucket and append it as an ordered chunk record."""
        text = json.dumps(bucket, ensure_ascii=False)
        chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})

    def _chunkByLines(self, data: str, maxBytes: int, chunks: List[Dict[str, Any]]):
        """Line-based fallback for content that cannot be split structurally."""
        current: List[str] = []
        size = 0
        for line in data.split('\n'):
            s = len(line.encode('utf-8')) + 1  # +1 for the newline
            if size + s > maxBytes and current:
                text = '\n'.join(current)
                chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
                current = [line]
                size = s
            else:
                current.append(line)
                size += s
        if current:
            text = '\n'.join(current)
            chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
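
For illustration, a hypothetical invocation of StructureChunker. The ContentPart(data=...) constructor call is an assumption inferred from the part.data access in chunk(); only the options key and the chunk record shape are confirmed by the code above:

# Hypothetical usage sketch -- ContentPart(data=...) is assumed, not confirmed.
import json

from modules.datamodels.datamodelExtraction import ContentPart

payload = {"rows": [{"id": i, "text": "x" * 100} for i in range(2000)]}
part = ContentPart(data=json.dumps(payload))

chunker = StructureChunker()
chunks = chunker.chunk(part, {"structureChunkSize": 40000})

# Each record has the shape emitted by _emit / _chunkByLines:
# {"data": <serialized text>, "size": <bytes>, "order": <index>}
for c in chunks:
    print(c["order"], c["size"])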