fixes for chunking and sharepoint wildcard

2025-11-05 01:32:54 +01:00 · 2025-11-05 01:32:54 +01:00 · f65fe4b3d0
commit f65fe4b3d0
parent 236a85a99b
3 changed files with 70 additions and 12 deletions
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@ -286,10 +286,31 @@ class AiObjects:
                            raise
                
                # For non-image parts, check if part fits in model context
+                # Calculate available space accounting for prompt, system message, and output reservation
                partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
-                modelContextBytes = model.contextLength * 4  # Convert tokens to bytes
                
-                if partSize <= modelContextBytes:
+                # Use same calculation as _chunkContentPart to determine actual available space
+                modelContextTokens = model.contextLength
+                modelMaxOutputTokens = model.maxTokens
+                
+                # Reserve tokens for prompt, system message, output, and message overhead
+                promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
+                systemMessageTokens = 10  # ~40 bytes = 10 tokens
+                outputTokens = modelMaxOutputTokens
+                messageOverheadTokens = 100
+                totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens
+                
+                # Available tokens for content (use 80% of the remainder, i.e. keep a 20% safety margin)
+                availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
+                if availableContentTokens < 100:
+                    availableContentTokens = max(100, int(modelContextTokens * 0.1))
+                
+                # Convert to bytes (1 token ≈ 4 bytes)
+                availableContentBytes = availableContentTokens * 4
+                
+                logger.debug(f"Size check for {model.name}: partSize={partSize} bytes, availableContentBytes={availableContentBytes} bytes (contextLength={modelContextTokens} tokens, reserved={totalReservedTokens:.0f} tokens)")
+                
+                if partSize <= availableContentBytes:
                    # Part fits - call AI directly
                    response = await self._callWithModel(model, prompt, contentPart.data, options)
                    logger.info(f"✅ Content part processed successfully with model: {model.name}")
@ -310,7 +331,9 @@ class AiObjects:
                    chunkResults = []
                    for idx, chunk in enumerate(chunks):
                        chunkNum = idx + 1
-                        logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
+                        chunkData = chunk.get('data', '')
+                        chunkSize = len(chunkData.encode('utf-8')) if chunkData else 0
+                        logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}, chunk size: {chunkSize} bytes")
                        
                        # Calculate and log progress
                        if progressCallback:
@ -318,7 +341,7 @@ class AiObjects:
                            progressCallback(progress, f"Processing chunk {chunkNum}/{len(chunks)}")
                        
                        try:
-                            chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
+                            chunkResponse = await self._callWithModel(model, prompt, chunkData, options)
                            chunkResults.append(chunkResponse)
                            logger.info(f"✅ Chunk {chunkNum}/{len(chunks)} processed successfully")
                            
@ -572,7 +595,10 @@ class AiObjects:
            )
            
            # Log before calling model
-            logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {len(context.encode('utf-8'))} bytes")
+            contextSize = len(context.encode('utf-8')) if context else 0
+            promptSize = len(modelPrompt.encode('utf-8')) if modelPrompt else 0
+            totalInputSize = contextSize + promptSize
+            logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {contextSize} bytes, prompt size: {promptSize} bytes, total input: {totalInputSize} bytes")
            
            # Call the model with standardized interface
            modelResponse = await model.functionCall(modelCall)
--- a/modules/services/serviceExtraction/chunking/chunkerText.py
+++ b/modules/services/serviceExtraction/chunking/chunkerText.py
@ -13,21 +13,44 @@ class TextChunker(Chunker):
        logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}")
        logger.debug(f"TextChunker: using maxBytes: {maxBytes}")
        chunks: List[Dict[str, Any]] = []
+        
+        # Split by lines first (preferred method for text)
+        lines = part.data.split('\n')
        current: List[str] = []
        size = 0
-        for line in part.data.split('\n'):
-            lineSize = len(line.encode('utf-8')) + 1
+        
+        for line in lines:
+            lineSize = len(line.encode('utf-8')) + 1  # +1 for newline character
            if size + lineSize > maxBytes and current:
+                # Current chunk is full, save it and start new one
                data = '\n'.join(current)
                chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
-                current = [line]
-                size = lineSize
-            else:
-                current.append(line)
-                size += lineSize
+                current = []
+                size = 0
+            
+            # If a single line is larger than maxBytes, split it at fixed byte offsets (NOTE: errors='ignore' below drops any multi-byte UTF-8 character cut by a slice boundary)
+            if lineSize > maxBytes:
+                # Split the long line into chunks
+                lineBytes = line.encode('utf-8')
+                lineStart = 0
+                while lineStart < len(lineBytes):
+                    chunkBytes = lineBytes[lineStart:lineStart + maxBytes]
+                    chunkText = chunkBytes.decode('utf-8', errors='ignore')
+                    chunks.append({"data": chunkText, "size": len(chunkBytes), "order": len(chunks)})
+                    lineStart += maxBytes
+                # Don't add this line to current, it's already chunked
+                continue
+            
+            # Add line to current chunk
+            current.append(line)
+            size += lineSize
+        
+        # Add remaining lines as final chunk
        if current:
            data = '\n'.join(current)
            chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
+        
+        logger.debug(f"TextChunker: Created {len(chunks)} chunks, total input size: {len(part.data.encode('utf-8'))} bytes")
        return chunks


--- a/modules/workflows/methods/methodSharepoint.py
+++ b/modules/workflows/methods/methodSharepoint.py
@ -945,6 +945,15 @@ class MethodSharepoint(MethodBase):
                            else:
                                # List all items in the drive root
                                endpoint = f"sites/{siteId}/drive/root/children"
+                            
+                            # Make the API call to list items
+                            listResult = await self._makeGraphApiCall(endpoint)
+                            if "error" in listResult:
+                                logger.warning(f"List failed for site {siteName}: {listResult['error']}")
+                                continue
+                            # Process list results for this site
+                            items = listResult.get("value", [])
+                            logger.info(f"Retrieved {len(items)} items from site {siteName}")
                        else:
                            # For files, use regular search API
                            # Clean the query: remove path-like syntax and invalid KQL syntax