From f65fe4b3d04a8081f844ae8720c59595b29b7f6f Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Wed, 5 Nov 2025 01:32:54 +0100
Subject: [PATCH] fixes for chunking and sharepoint wildcard

---
 modules/interfaces/interfaceAiObjects.py      | 36 +++++++++++++++---
 .../serviceExtraction/chunking/chunkerText.py | 37 +++++++++++++++----
 modules/workflows/methods/methodSharepoint.py |  9 +++++
 3 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py
index 2562028c..18673987 100644
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@@ -286,10 +286,31 @@ class AiObjects:
                     raise
 
             # For non-image parts, check if part fits in model context
+            # Calculate available space accounting for prompt, system message, and output reservation
             partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
-            modelContextBytes = model.contextLength * 4  # Convert tokens to bytes
-            if partSize <= modelContextBytes:
+            # Use same calculation as _chunkContentPart to determine actual available space
+            modelContextTokens = model.contextLength
+            modelMaxOutputTokens = model.maxTokens
+
+            # Reserve tokens for prompt, system message, output, and message overhead
+            promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
+            systemMessageTokens = 10  # ~40 bytes = 10 tokens
+            outputTokens = modelMaxOutputTokens
+            messageOverheadTokens = 100
+            totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens
+
+            # Available tokens for content (with 80% safety margin)
+            availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
+            if availableContentTokens < 100:
+                availableContentTokens = max(100, int(modelContextTokens * 0.1))
+
+            # Convert to bytes (1 token ≈ 4 bytes)
+            availableContentBytes = availableContentTokens * 4
+
+            logger.debug(f"Size check for {model.name}: partSize={partSize} bytes, availableContentBytes={availableContentBytes} bytes (contextLength={modelContextTokens} tokens, reserved={totalReservedTokens:.0f} tokens)")
+
+            if partSize <= availableContentBytes:
                 # Part fits - call AI directly
                 response = await self._callWithModel(model, prompt, contentPart.data, options)
                 logger.info(f"✅ Content part processed successfully with model: {model.name}")
 
@@ -310,7 +331,9 @@ class AiObjects:
             chunkResults = []
             for idx, chunk in enumerate(chunks):
                 chunkNum = idx + 1
-                logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
+                chunkData = chunk.get('data', '')
+                chunkSize = len(chunkData.encode('utf-8')) if chunkData else 0
+                logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}, chunk size: {chunkSize} bytes")
 
                 # Calculate and log progress
                 if progressCallback:
@@ -318,7 +341,7 @@ class AiObjects:
                     progressCallback(progress, f"Processing chunk {chunkNum}/{len(chunks)}")
 
                 try:
-                    chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
+                    chunkResponse = await self._callWithModel(model, prompt, chunkData, options)
                     chunkResults.append(chunkResponse)
                     logger.info(f"✅ Chunk {chunkNum}/{len(chunks)} processed successfully")
 
@@ -572,7 +595,10 @@ class AiObjects:
             )
 
             # Log before calling model
-            logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {len(context.encode('utf-8'))} bytes")
+            contextSize = len(context.encode('utf-8')) if context else 0
+            promptSize = len(modelPrompt.encode('utf-8')) if modelPrompt else 0
+            totalInputSize = contextSize + promptSize
+            logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {contextSize} bytes, prompt size: {promptSize} bytes, total input: {totalInputSize} bytes")
 
             # Call the model with standardized interface
             modelResponse = await model.functionCall(modelCall)
diff --git a/modules/services/serviceExtraction/chunking/chunkerText.py b/modules/services/serviceExtraction/chunking/chunkerText.py
index 2c05eeaf..802858d1 100644
--- a/modules/services/serviceExtraction/chunking/chunkerText.py
+++ b/modules/services/serviceExtraction/chunking/chunkerText.py
@@ -13,21 +13,44 @@ class TextChunker(Chunker):
         logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}")
         logger.debug(f"TextChunker: using maxBytes: {maxBytes}")
         chunks: List[Dict[str, Any]] = []
+
+        # Split by lines first (preferred method for text)
+        lines = part.data.split('\n')
         current: List[str] = []
         size = 0
-        for line in part.data.split('\n'):
-            lineSize = len(line.encode('utf-8')) + 1
+
+        for line in lines:
+            lineSize = len(line.encode('utf-8')) + 1  # +1 for newline character
             if size + lineSize > maxBytes and current:
+                # Current chunk is full, save it and start new one
                 data = '\n'.join(current)
                 chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
-                current = [line]
-                size = lineSize
-            else:
-                current.append(line)
-                size += lineSize
+                current = []
+                size = 0
+
+            # If a single line is larger than maxBytes, split it by character boundaries
+            if lineSize > maxBytes:
+                # Split the long line into chunks
+                lineBytes = line.encode('utf-8')
+                lineStart = 0
+                while lineStart < len(lineBytes):
+                    chunkBytes = lineBytes[lineStart:lineStart + maxBytes]
+                    chunkText = chunkBytes.decode('utf-8', errors='ignore')
+                    chunks.append({"data": chunkText, "size": len(chunkBytes), "order": len(chunks)})
+                    lineStart += maxBytes
+                # Don't add this line to current, it's already chunked
+                continue
+
+            # Add line to current chunk
+            current.append(line)
+            size += lineSize
+
+        # Add remaining lines as final chunk
         if current:
             data = '\n'.join(current)
             chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
+
+        logger.debug(f"TextChunker: Created {len(chunks)} chunks, total input size: {len(part.data.encode('utf-8'))} bytes")
         return chunks
 
 
diff --git a/modules/workflows/methods/methodSharepoint.py b/modules/workflows/methods/methodSharepoint.py
index 6dabdaf5..b96b87d8 100644
--- a/modules/workflows/methods/methodSharepoint.py
+++ b/modules/workflows/methods/methodSharepoint.py
@@ -945,6 +945,15 @@ class MethodSharepoint(MethodBase):
                 else:
                     # List all items in the drive root
                     endpoint = f"sites/{siteId}/drive/root/children"
+
+                # Make the API call to list items
+                listResult = await self._makeGraphApiCall(endpoint)
+                if "error" in listResult:
+                    logger.warning(f"List failed for site {siteName}: {listResult['error']}")
+                    continue
+                # Process list results for this site
+                items = listResult.get("value", [])
+                logger.info(f"Retrieved {len(items)} items from site {siteName}")
             else:
                 # For files, use regular search API
                 # Clean the query: remove path-like syntax and invalid KQL syntax
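
For reference, a minimal standalone sketch of the size-check arithmetic introduced in interfaceAiObjects.py above (not part of the patch). The 1 token ≈ 4 bytes ratio is the same heuristic the patch uses; the model limits and prompt size in the example are made-up numbers chosen only to illustrate the calculation.

def availableContentBytes(contextLength: int, maxTokens: int, prompt: str) -> int:
    # Mirror of the reservation logic: prompt + system message + per-message
    # overhead + the full output budget are subtracted from the context window.
    promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
    systemMessageTokens = 10      # ~40 bytes reserved for the system message
    messageOverheadTokens = 100   # formatting overhead per message
    totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + maxTokens

    # 80% safety margin, with a floor so tiny contexts still get some content budget
    availableContentTokens = int((contextLength - totalReservedTokens) * 0.8)
    if availableContentTokens < 100:
        availableContentTokens = max(100, int(contextLength * 0.1))
    return availableContentTokens * 4  # 1 token ≈ 4 bytes


# Example: 128000-token context, 4096 output tokens, 2000-byte prompt
# reserved = 500 + 10 + 100 + 4096 = 4706 tokens
# available = int((128000 - 4706) * 0.8) = 98635 tokens -> 394540 bytes
print(availableContentBytes(128000, 4096, "x" * 2000))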
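
Similarly, a small sketch of the long-line splitting path added to TextChunker (not part of the patch). splitLongLine is a hypothetical helper that mirrors the new while-loop; it is not a function from the repository.

from typing import List

def splitLongLine(line: str, maxBytes: int) -> List[str]:
    # Split one oversized line into pieces of at most maxBytes bytes.
    lineBytes = line.encode('utf-8')
    pieces: List[str] = []
    lineStart = 0
    while lineStart < len(lineBytes):
        chunkBytes = lineBytes[lineStart:lineStart + maxBytes]
        # errors='ignore' drops the bytes of a multi-byte character that
        # straddles a boundary, so each piece is valid UTF-8.
        pieces.append(chunkBytes.decode('utf-8', errors='ignore'))
        lineStart += maxBytes
    return pieces


# A 10-byte ASCII line with maxBytes=4 becomes ['abcd', 'efgh', 'ij']
print(splitLongLine("abcdefghij", 4))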