From f65fe4b3d04a8081f844ae8720c59595b29b7f6f Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Wed, 5 Nov 2025 01:32:54 +0100
Subject: [PATCH] Fixes for chunking and SharePoint wildcard handling
---
modules/interfaces/interfaceAiObjects.py | 36 +++++++++++++++---
.../serviceExtraction/chunking/chunkerText.py | 37 +++++++++++++++----
modules/workflows/methods/methodSharepoint.py | 9 +++++
3 files changed, 70 insertions(+), 12 deletions(-)
diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py
index 2562028c..18673987 100644
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@@ -286,10 +286,31 @@ class AiObjects:
raise
# For non-image parts, check if part fits in model context
+ # Calculate available space accounting for prompt, system message, and output reservation
partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
- modelContextBytes = model.contextLength * 4 # Convert tokens to bytes
- if partSize <= modelContextBytes:
+ # Use same calculation as _chunkContentPart to determine actual available space
+ modelContextTokens = model.contextLength
+ modelMaxOutputTokens = model.maxTokens
+
+ # Reserve tokens for prompt, system message, output, and message overhead
+ promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
+ systemMessageTokens = 10 # ~40 bytes = 10 tokens
+ outputTokens = modelMaxOutputTokens
+ messageOverheadTokens = 100
+ totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens
+
+ # Available tokens for content (with 80% safety margin)
+ availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
+ if availableContentTokens < 100:
+ availableContentTokens = max(100, int(modelContextTokens * 0.1))
+
+ # Convert to bytes (1 token ≈ 4 bytes)
+ availableContentBytes = availableContentTokens * 4
+
+ logger.debug(f"Size check for {model.name}: partSize={partSize} bytes, availableContentBytes={availableContentBytes} bytes (contextLength={modelContextTokens} tokens, reserved={totalReservedTokens:.0f} tokens)")
+
+ if partSize <= availableContentBytes:
# Part fits - call AI directly
response = await self._callWithModel(model, prompt, contentPart.data, options)
logger.info(f"✅ Content part processed successfully with model: {model.name}")
@@ -310,7 +331,9 @@ class AiObjects:
chunkResults = []
for idx, chunk in enumerate(chunks):
chunkNum = idx + 1
- logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
+ chunkData = chunk.get('data', '')
+ chunkSize = len(chunkData.encode('utf-8')) if chunkData else 0
+ logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}, chunk size: {chunkSize} bytes")
# Calculate and log progress
if progressCallback:
@@ -318,7 +341,7 @@ class AiObjects:
progressCallback(progress, f"Processing chunk {chunkNum}/{len(chunks)}")
try:
- chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
+ chunkResponse = await self._callWithModel(model, prompt, chunkData, options)
chunkResults.append(chunkResponse)
logger.info(f"✅ Chunk {chunkNum}/{len(chunks)} processed successfully")
@@ -572,7 +595,10 @@ class AiObjects:
)
# Log before calling model
- logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {len(context.encode('utf-8'))} bytes")
+ contextSize = len(context.encode('utf-8')) if context else 0
+ promptSize = len(modelPrompt.encode('utf-8')) if modelPrompt else 0
+ totalInputSize = contextSize + promptSize
+ logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {contextSize} bytes, prompt size: {promptSize} bytes, total input: {totalInputSize} bytes")
# Call the model with standardized interface
modelResponse = await model.functionCall(modelCall)
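
For reference, a minimal worked example of the size check introduced in the first hunk above, using assumed, illustrative values (contextLength=128000, maxTokens=4096, a 2 KB prompt); the numbers are not taken from the codebase:

    promptTokens = 2048 / 4                               # 512
    totalReservedTokens = 512 + 10 + 100 + 4096           # 4718
    availableContentTokens = int((128000 - 4718) * 0.8)   # 98625
    availableContentBytes = availableContentTokens * 4    # 394500 bytes (~385 KB)

Any content part larger than roughly 385 KB would therefore be routed to the chunking path instead of being sent in a single call.
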
diff --git a/modules/services/serviceExtraction/chunking/chunkerText.py b/modules/services/serviceExtraction/chunking/chunkerText.py
index 2c05eeaf..802858d1 100644
--- a/modules/services/serviceExtraction/chunking/chunkerText.py
+++ b/modules/services/serviceExtraction/chunking/chunkerText.py
@@ -13,21 +13,44 @@ class TextChunker(Chunker):
logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}")
logger.debug(f"TextChunker: using maxBytes: {maxBytes}")
chunks: List[Dict[str, Any]] = []
+
+ # Split by lines first (preferred method for text)
+ lines = part.data.split('\n')
current: List[str] = []
size = 0
- for line in part.data.split('\n'):
- lineSize = len(line.encode('utf-8')) + 1
+
+ for line in lines:
+ lineSize = len(line.encode('utf-8')) + 1 # +1 for newline character
if size + lineSize > maxBytes and current:
+ # Current chunk is full, save it and start new one
data = '\n'.join(current)
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
- current = [line]
- size = lineSize
- else:
- current.append(line)
- size += lineSize
+ current = []
+ size = 0
+
+ # If a single line is larger than maxBytes, split it at byte boundaries
+ if lineSize > maxBytes:
+ # Split the long line into chunks
+ lineBytes = line.encode('utf-8')
+ lineStart = 0
+ while lineStart < len(lineBytes):
+ chunkBytes = lineBytes[lineStart:lineStart + maxBytes]
+ chunkText = chunkBytes.decode('utf-8', errors='ignore')  # 'ignore' drops a multi-byte character cut at the slice edge
+ chunks.append({"data": chunkText, "size": len(chunkBytes), "order": len(chunks)})
+ lineStart += maxBytes
+ # Don't add this line to current, it's already chunked
+ continue
+
+ # Add line to current chunk
+ current.append(line)
+ size += lineSize
+
+ # Add remaining lines as final chunk
if current:
data = '\n'.join(current)
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
+
+ logger.debug(f"TextChunker: Created {len(chunks)} chunks, total input size: {len(part.data.encode('utf-8'))} bytes")
return chunks
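
Because the oversized-line branch above slices the raw UTF-8 bytes every maxBytes and decodes with errors='ignore', a multi-byte character that straddles a slice edge is silently dropped. A minimal sketch of a boundary-safe alternative, using only the standard library (splitUtf8 is an illustrative name, not an existing helper in the codebase):

    def splitUtf8(lineBytes: bytes, maxBytes: int):
        """Yield decodable slices of roughly maxBytes that never cut a multi-byte character."""
        start = 0
        while start < len(lineBytes):
            end = min(start + maxBytes, len(lineBytes))
            # UTF-8 continuation bytes look like 0b10xxxxxx; back off to a character start
            while end > start and end < len(lineBytes) and (lineBytes[end] & 0xC0) == 0x80:
                end -= 1
            if end == start:
                # maxBytes is smaller than one character: keep the whole character anyway
                end = start + 1
                while end < len(lineBytes) and (lineBytes[end] & 0xC0) == 0x80:
                    end += 1
            yield lineBytes[start:end]
            start = end

Used in place of the fixed-stride loop, each slice then decodes cleanly:

    for chunkBytes in splitUtf8(line.encode('utf-8'), maxBytes):
        chunks.append({"data": chunkBytes.decode('utf-8'), "size": len(chunkBytes), "order": len(chunks)})
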
diff --git a/modules/workflows/methods/methodSharepoint.py b/modules/workflows/methods/methodSharepoint.py
index 6dabdaf5..b96b87d8 100644
--- a/modules/workflows/methods/methodSharepoint.py
+++ b/modules/workflows/methods/methodSharepoint.py
@@ -945,6 +945,15 @@ class MethodSharepoint(MethodBase):
else:
# List all items in the drive root
endpoint = f"sites/{siteId}/drive/root/children"
+
+ # Make the API call to list items
+ listResult = await self._makeGraphApiCall(endpoint)
+ if "error" in listResult:
+ logger.warning(f"List failed for site {siteName}: {listResult['error']}")
+ continue
+ # Process list results for this site
+ items = listResult.get("value", [])
+ logger.info(f"Retrieved {len(items)} items from site {siteName}")
else:
# For files, use regular search API
# Clean the query: remove path-like syntax and invalid KQL syntax
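
The endpoint added above (sites/{siteId}/drive/root/children) returns a Microsoft Graph driveItem collection, so each entry in listResult["value"] carries fields such as "name" and "webUrl" plus either a "folder" or a "file" facet. A rough sketch of how the retrieved items might be matched against a wildcard pattern downstream; pattern and matchedItems are illustrative names, not taken from the patch:

    import fnmatch

    matchedItems = []
    for item in items:
        name = item.get("name", "")
        if "folder" in item:
            # folders carry a "folder" facet; skip them when only files are wanted
            continue
        if fnmatch.fnmatch(name.lower(), pattern.lower()):
            matchedItems.append(item)
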