fixes for chunking and sharepoint wildcard

commit f65fe4b3d0 (parent 236a85a99b)
3 changed files with 70 additions and 12 deletions
@@ -286,10 +286,31 @@ class AiObjects:
                 raise

             # For non-image parts, check if part fits in model context
+            # Calculate available space accounting for prompt, system message, and output reservation
             partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
-            modelContextBytes = model.contextLength * 4  # Convert tokens to bytes
-            if partSize <= modelContextBytes:
+            # Use same calculation as _chunkContentPart to determine actual available space
+            modelContextTokens = model.contextLength
+            modelMaxOutputTokens = model.maxTokens
+
+            # Reserve tokens for prompt, system message, output, and message overhead
+            promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
+            systemMessageTokens = 10  # ~40 bytes = 10 tokens
+            outputTokens = modelMaxOutputTokens
+            messageOverheadTokens = 100
+            totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens
+
+            # Available tokens for content (with 80% safety margin)
+            availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
+            if availableContentTokens < 100:
+                availableContentTokens = max(100, int(modelContextTokens * 0.1))
+
+            # Convert to bytes (1 token ≈ 4 bytes)
+            availableContentBytes = availableContentTokens * 4
+
+            logger.debug(f"Size check for {model.name}: partSize={partSize} bytes, availableContentBytes={availableContentBytes} bytes (contextLength={modelContextTokens} tokens, reserved={totalReservedTokens:.0f} tokens)")
+
+            if partSize <= availableContentBytes:
                 # Part fits - call AI directly
                 response = await self._callWithModel(model, prompt, contentPart.data, options)
                 logger.info(f"✅ Content part processed successfully with model: {model.name}")
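The new size check mirrors the budget used by _chunkContentPart instead of the raw context length. As a standalone illustration, here is a minimal sketch of that calculation, assuming the same 4-bytes-per-token heuristic and the contextLength/maxTokens fields shown in the diff (the function name and example numbers are illustrative, not part of the commit):

def availableContentBytes(contextLength: int, maxOutputTokens: int, prompt: str) -> int:
    # Reserve tokens for prompt, system message, output, and message overhead,
    # then apply the 80% safety margin with a 100-token (or 10%-of-context) floor.
    promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
    systemMessageTokens = 10      # ~40 bytes = 10 tokens
    messageOverheadTokens = 100   # role markers, separators, etc.
    totalReserved = promptTokens + systemMessageTokens + messageOverheadTokens + maxOutputTokens
    availableTokens = int((contextLength - totalReserved) * 0.8)
    if availableTokens < 100:
        availableTokens = max(100, int(contextLength * 0.1))
    return availableTokens * 4    # 1 token ≈ 4 bytes

# e.g. a 128k-token context reserving 4096 output tokens:
# availableContentBytes(128000, 4096, "Summarize this document") == 396120

For comparison, the old check (model.contextLength * 4) would have allowed 512000 bytes in this example, ignoring everything the request itself consumes.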
@@ -310,7 +331,9 @@ class AiObjects:
             chunkResults = []
             for idx, chunk in enumerate(chunks):
                 chunkNum = idx + 1
-                logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
+                chunkData = chunk.get('data', '')
+                chunkSize = len(chunkData.encode('utf-8')) if chunkData else 0
+                logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}, chunk size: {chunkSize} bytes")

                 # Calculate and log progress
                 if progressCallback:
@@ -318,7 +341,7 @@ class AiObjects:
                     progressCallback(progress, f"Processing chunk {chunkNum}/{len(chunks)}")

                 try:
-                    chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
+                    chunkResponse = await self._callWithModel(model, prompt, chunkData, options)
                     chunkResults.append(chunkResponse)
                     logger.info(f"✅ Chunk {chunkNum}/{len(chunks)} processed successfully")
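Read together, the two chunk-loop hunks fetch the chunk payload once via chunk.get('data', ''), so a malformed chunk dict degrades to an empty string instead of raising KeyError at the _callWithModel call. A condensed, self-contained sketch of the loop shape (the callable wrapper and the 0-to-1 progress convention are assumptions, not the repo's exact signatures):

from typing import Any, Awaitable, Callable, Dict, List, Optional

async def processChunks(
    callModel: Callable[[str], Awaitable[str]],   # stands in for _callWithModel(model, prompt, data, options)
    chunks: List[Dict[str, Any]],
    progressCallback: Optional[Callable[[float, str], None]] = None,
) -> List[str]:
    results: List[str] = []
    for idx, chunk in enumerate(chunks):
        chunkNum = idx + 1
        chunkData = chunk.get('data', '')         # fetched once, reused for sizing and the call
        if progressCallback:
            progressCallback(chunkNum / len(chunks), f"Processing chunk {chunkNum}/{len(chunks)}")
        results.append(await callModel(chunkData))
    return results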
@@ -572,7 +595,10 @@ class AiObjects:
         )

         # Log before calling model
-        logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {len(context.encode('utf-8'))} bytes")
+        contextSize = len(context.encode('utf-8')) if context else 0
+        promptSize = len(modelPrompt.encode('utf-8')) if modelPrompt else 0
+        totalInputSize = contextSize + promptSize
+        logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {contextSize} bytes, prompt size: {promptSize} bytes, total input: {totalInputSize} bytes")

         # Call the model with standardized interface
         modelResponse = await model.functionCall(modelCall)
@@ -13,21 +13,44 @@ class TextChunker(Chunker):
         logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}")
         logger.debug(f"TextChunker: using maxBytes: {maxBytes}")
         chunks: List[Dict[str, Any]] = []

+        # Split by lines first (preferred method for text)
+        lines = part.data.split('\n')
         current: List[str] = []
         size = 0
-        for line in part.data.split('\n'):
-            lineSize = len(line.encode('utf-8')) + 1
+        for line in lines:
+            lineSize = len(line.encode('utf-8')) + 1  # +1 for newline character
             if size + lineSize > maxBytes and current:
+                # Current chunk is full, save it and start new one
                 data = '\n'.join(current)
                 chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
-                current = [line]
-                size = lineSize
-            else:
-                current.append(line)
-                size += lineSize
+                current = []
+                size = 0
+
+            # If a single line is larger than maxBytes, split it by character boundaries
+            if lineSize > maxBytes:
+                # Split the long line into chunks
+                lineBytes = line.encode('utf-8')
+                lineStart = 0
+                while lineStart < len(lineBytes):
+                    chunkBytes = lineBytes[lineStart:lineStart + maxBytes]
+                    chunkText = chunkBytes.decode('utf-8', errors='ignore')
+                    chunks.append({"data": chunkText, "size": len(chunkBytes), "order": len(chunks)})
+                    lineStart += maxBytes
+                # Don't add this line to current, it's already chunked
+                continue
+
+            # Add line to current chunk
+            current.append(line)
+            size += lineSize

+        # Add remaining lines as final chunk
         if current:
             data = '\n'.join(current)
             chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})

+        logger.debug(f"TextChunker: Created {len(chunks)} chunks, total input size: {len(part.data.encode('utf-8'))} bytes")
         return chunks
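The TextChunker change can be exercised on its own. Below is a self-contained version of the same line-first, byte-bounded algorithm with a small usage check; chunkText is an illustrative name, and note that decode('utf-8', errors='ignore') can silently drop bytes when a chunk boundary lands inside a multi-byte character, a trade-off this diff accepts:

from typing import Any, Dict, List

def chunkText(text: str, maxBytes: int) -> List[Dict[str, Any]]:
    chunks: List[Dict[str, Any]] = []
    current: List[str] = []
    size = 0
    for line in text.split('\n'):
        lineSize = len(line.encode('utf-8')) + 1  # +1 for the newline
        if size + lineSize > maxBytes and current:
            # Flush the full chunk before deciding what to do with this line
            data = '\n'.join(current)
            chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
            current, size = [], 0
        if lineSize > maxBytes:
            # Oversized line: split on raw byte boundaries and emit directly
            lineBytes = line.encode('utf-8')
            for start in range(0, len(lineBytes), maxBytes):
                piece = lineBytes[start:start + maxBytes]
                chunks.append({"data": piece.decode('utf-8', errors='ignore'), "size": len(piece), "order": len(chunks)})
            continue
        current.append(line)
        size += lineSize
    if current:
        data = '\n'.join(current)
        chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
    return chunks

# Every emitted chunk stays within the byte budget:
parts = chunkText("short\n" + "x" * 50 + "\nanother", maxBytes=20)
assert all(c["size"] <= 20 for c in parts)

Unlike the old code, which placed an oversized line into the current chunk and let it blow past maxBytes, the new path guarantees every emitted chunk respects the byte limit.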
@@ -945,6 +945,15 @@ class MethodSharepoint(MethodBase):
                 else:
                     # List all items in the drive root
                     endpoint = f"sites/{siteId}/drive/root/children"
+
+                # Make the API call to list items
+                listResult = await self._makeGraphApiCall(endpoint)
+                if "error" in listResult:
+                    logger.warning(f"List failed for site {siteName}: {listResult['error']}")
+                    continue
+                # Process list results for this site
+                items = listResult.get("value", [])
+                logger.info(f"Retrieved {len(items)} items from site {siteName}")
             else:
                 # For files, use regular search API
                 # Clean the query: remove path-like syntax and invalid KQL syntax
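The wildcard path's endpoint maps to Microsoft Graph's "list children of a driveItem" call (GET /sites/{site-id}/drive/root/children). As a standalone sketch of what _makeGraphApiCall presumably wraps, assuming a bearer token is already available (auth, throttling, and the repo's error-dict shape are out of scope here):

import httpx

GRAPH = "https://graph.microsoft.com/v1.0"

async def listDriveRootChildren(siteId: str, accessToken: str) -> list:
    # Collect all items in the site's default drive root,
    # following @odata.nextLink until the last page.
    items: list = []
    url = f"{GRAPH}/sites/{siteId}/drive/root/children"
    headers = {"Authorization": f"Bearer {accessToken}"}
    async with httpx.AsyncClient() as client:
        while url:
            resp = await client.get(url, headers=headers)
            resp.raise_for_status()
            payload = resp.json()
            items.extend(payload.get("value", []))
            url = payload.get("@odata.nextLink")  # absent on the last page
    return items

The listResult.get("value", []) in the diff matches Graph's response envelope, where the item collection always sits under "value".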