From f65fe4b3d04a8081f844ae8720c59595b29b7f6f Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Wed, 5 Nov 2025 01:32:54 +0100
Subject: [PATCH] fixes for chunking and sharepoint wildcard

---
 modules/interfaces/interfaceAiObjects.py      | 36 +++++++++++++++---
 .../serviceExtraction/chunking/chunkerText.py | 37 +++++++++++++++----
 modules/workflows/methods/methodSharepoint.py |  9 +++++
 3 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py
index 2562028c..18673987 100644
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@@ -286,10 +286,31 @@ class AiObjects:
                     raise
 
             # For non-image parts, check if part fits in model context
+            # Calculate available space accounting for prompt, system message, and output reservation
             partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
-            modelContextBytes = model.contextLength * 4  # Convert tokens to bytes
-            if partSize <= modelContextBytes:
+            # Use same calculation as _chunkContentPart to determine actual available space
+            modelContextTokens = model.contextLength
+            modelMaxOutputTokens = model.maxTokens
+
+            # Reserve tokens for prompt, system message, output, and message overhead
+            promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
+            systemMessageTokens = 10  # ~40 bytes = 10 tokens
+            outputTokens = modelMaxOutputTokens
+            messageOverheadTokens = 100
+            totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens
+
+            # Available tokens for content (with 80% safety margin)
+            availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
+            if availableContentTokens < 100:
+                availableContentTokens = max(100, int(modelContextTokens * 0.1))
+
+            # Convert to bytes (1 token ≈ 4 bytes)
+            availableContentBytes = availableContentTokens * 4
+
+            logger.debug(f"Size check for {model.name}: partSize={partSize} bytes, availableContentBytes={availableContentBytes} bytes (contextLength={modelContextTokens} tokens, reserved={totalReservedTokens:.0f} tokens)")
+
+            if partSize <= availableContentBytes:
                 # Part fits - call AI directly
                 response = await self._callWithModel(model, prompt, contentPart.data, options)
                 logger.info(f"✅ Content part processed successfully with model: {model.name}")
 
@@ -310,7 +331,9 @@ class AiObjects:
             chunkResults = []
             for idx, chunk in enumerate(chunks):
                 chunkNum = idx + 1
-                logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
+                chunkData = chunk.get('data', '')
+                chunkSize = len(chunkData.encode('utf-8')) if chunkData else 0
+                logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}, chunk size: {chunkSize} bytes")
 
                 # Calculate and log progress
                 if progressCallback:
@@ -318,7 +341,7 @@ class AiObjects:
                     progressCallback(progress, f"Processing chunk {chunkNum}/{len(chunks)}")
 
                 try:
-                    chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
+                    chunkResponse = await self._callWithModel(model, prompt, chunkData, options)
                     chunkResults.append(chunkResponse)
                     logger.info(f"✅ Chunk {chunkNum}/{len(chunks)} processed successfully")
 
@@ -572,7 +595,10 @@ class AiObjects:
             )
 
             # Log before calling model
-            logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {len(context.encode('utf-8'))} bytes")
+            contextSize = len(context.encode('utf-8')) if context else 0
+            promptSize = len(modelPrompt.encode('utf-8')) if modelPrompt else 0
+            totalInputSize = contextSize + promptSize
+            logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {contextSize} bytes, prompt size: {promptSize} bytes, total input: {totalInputSize} bytes")
 
             # Call the model with standardized interface
             modelResponse = await model.functionCall(modelCall)
diff --git a/modules/services/serviceExtraction/chunking/chunkerText.py b/modules/services/serviceExtraction/chunking/chunkerText.py
index 2c05eeaf..802858d1 100644
--- a/modules/services/serviceExtraction/chunking/chunkerText.py
+++ b/modules/services/serviceExtraction/chunking/chunkerText.py
@@ -13,21 +13,44 @@ class TextChunker(Chunker):
         logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}")
         logger.debug(f"TextChunker: using maxBytes: {maxBytes}")
         chunks: List[Dict[str, Any]] = []
+
+        # Split by lines first (preferred method for text)
+        lines = part.data.split('\n')
         current: List[str] = []
         size = 0
-        for line in part.data.split('\n'):
-            lineSize = len(line.encode('utf-8')) + 1
+
+        for line in lines:
+            lineSize = len(line.encode('utf-8')) + 1  # +1 for newline character
             if size + lineSize > maxBytes and current:
+                # Current chunk is full, save it and start new one
                 data = '\n'.join(current)
                 chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
-                current = [line]
-                size = lineSize
-            else:
-                current.append(line)
-                size += lineSize
+                current = []
+                size = 0
+
+            # If a single line is larger than maxBytes, split it by character boundaries
+            if lineSize > maxBytes:
+                # Split the long line into chunks
+                lineBytes = line.encode('utf-8')
+                lineStart = 0
+                while lineStart < len(lineBytes):
+                    chunkBytes = lineBytes[lineStart:lineStart + maxBytes]
+                    chunkText = chunkBytes.decode('utf-8', errors='ignore')
+                    chunks.append({"data": chunkText, "size": len(chunkBytes), "order": len(chunks)})
+                    lineStart += maxBytes
+                # Don't add this line to current, it's already chunked
+                continue
+
+            # Add line to current chunk
+            current.append(line)
+            size += lineSize
+
+        # Add remaining lines as final chunk
         if current:
             data = '\n'.join(current)
             chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
+
+        logger.debug(f"TextChunker: Created {len(chunks)} chunks, total input size: {len(part.data.encode('utf-8'))} bytes")
         return chunks
 
 
diff --git a/modules/workflows/methods/methodSharepoint.py b/modules/workflows/methods/methodSharepoint.py
index 6dabdaf5..b96b87d8 100644
--- a/modules/workflows/methods/methodSharepoint.py
+++ b/modules/workflows/methods/methodSharepoint.py
@@ -945,6 +945,15 @@ class MethodSharepoint(MethodBase):
                 else:
                     # List all items in the drive root
                     endpoint = f"sites/{siteId}/drive/root/children"
+
+                # Make the API call to list items
+                listResult = await self._makeGraphApiCall(endpoint)
+                if "error" in listResult:
+                    logger.warning(f"List failed for site {siteName}: {listResult['error']}")
+                    continue
+                # Process list results for this site
+                items = listResult.get("value", [])
+                logger.info(f"Retrieved {len(items)} items from site {siteName}")
             else:
                 # For files, use regular search API
                 # Clean the query: remove path-like syntax and invalid KQL syntax
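
For reference, a minimal standalone sketch of the size-check arithmetic introduced in interfaceAiObjects.py above (not part of the patch). The 1 token ≈ 4 bytes ratio is the same heuristic the patch uses; the model limits and prompt size in the example are made-up numbers chosen only to illustrate the calculation.

def availableContentBytes(contextLength: int, maxTokens: int, prompt: str) -> int:
    # Mirror of the reservation logic: prompt + system message + per-message
    # overhead + the full output budget are subtracted from the context window.
    promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
    systemMessageTokens = 10      # ~40 bytes reserved for the system message
    messageOverheadTokens = 100   # formatting overhead per message
    totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + maxTokens

    # 80% safety margin, with a floor so tiny contexts still get some content budget
    availableContentTokens = int((contextLength - totalReservedTokens) * 0.8)
    if availableContentTokens < 100:
        availableContentTokens = max(100, int(contextLength * 0.1))
    return availableContentTokens * 4  # 1 token ≈ 4 bytes


# Example: 128000-token context, 4096 output tokens, 2000-byte prompt
# reserved = 500 + 10 + 100 + 4096 = 4706 tokens
# available = int((128000 - 4706) * 0.8) = 98635 tokens -> 394540 bytes
print(availableContentBytes(128000, 4096, "x" * 2000))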
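
Similarly, a small sketch of the long-line splitting path added to TextChunker (not part of the patch). splitLongLine is a hypothetical helper that mirrors the new while-loop; it is not a function from the repository.

from typing import List

def splitLongLine(line: str, maxBytes: int) -> List[str]:
    # Split one oversized line into pieces of at most maxBytes bytes.
    lineBytes = line.encode('utf-8')
    pieces: List[str] = []
    lineStart = 0
    while lineStart < len(lineBytes):
        chunkBytes = lineBytes[lineStart:lineStart + maxBytes]
        # errors='ignore' drops the bytes of a multi-byte character that
        # straddles a boundary, so each piece is valid UTF-8.
        pieces.append(chunkBytes.decode('utf-8', errors='ignore'))
        lineStart += maxBytes
    return pieces


# A 10-byte ASCII line with maxBytes=4 becomes ['abcd', 'efgh', 'ij']
print(splitLongLine("abcdefghij", 4))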