Fixes for chunking and SharePoint wildcard handling

This commit is contained in:
ValueOn AG 2025-11-05 01:32:54 +01:00
parent 236a85a99b
commit f65fe4b3d0
3 changed files with 70 additions and 12 deletions

View file

@ -286,10 +286,31 @@ class AiObjects:
raise
# For non-image parts, check if part fits in model context
# Calculate available space accounting for prompt, system message, and output reservation
partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
modelContextBytes = model.contextLength * 4 # Convert tokens to bytes
if partSize <= modelContextBytes:
# Use same calculation as _chunkContentPart to determine actual available space
modelContextTokens = model.contextLength
modelMaxOutputTokens = model.maxTokens
# Reserve tokens for prompt, system message, output, and message overhead
promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
systemMessageTokens = 10 # ~40 bytes = 10 tokens
outputTokens = modelMaxOutputTokens
messageOverheadTokens = 100
totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens
# Available tokens for content (with 80% safety margin)
availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
if availableContentTokens < 100:
availableContentTokens = max(100, int(modelContextTokens * 0.1))
# Convert to bytes (1 token ≈ 4 bytes)
availableContentBytes = availableContentTokens * 4
logger.debug(f"Size check for {model.name}: partSize={partSize} bytes, availableContentBytes={availableContentBytes} bytes (contextLength={modelContextTokens} tokens, reserved={totalReservedTokens:.0f} tokens)")
if partSize <= availableContentBytes:
# Part fits - call AI directly
response = await self._callWithModel(model, prompt, contentPart.data, options)
logger.info(f"✅ Content part processed successfully with model: {model.name}")
@ -310,7 +331,9 @@ class AiObjects:
chunkResults = []
for idx, chunk in enumerate(chunks):
chunkNum = idx + 1
logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
chunkData = chunk.get('data', '')
chunkSize = len(chunkData.encode('utf-8')) if chunkData else 0
logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}, chunk size: {chunkSize} bytes")
# Calculate and log progress
if progressCallback:
@ -318,7 +341,7 @@ class AiObjects:
progressCallback(progress, f"Processing chunk {chunkNum}/{len(chunks)}")
try:
chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
chunkResponse = await self._callWithModel(model, prompt, chunkData, options)
chunkResults.append(chunkResponse)
logger.info(f"✅ Chunk {chunkNum}/{len(chunks)} processed successfully")
@ -572,7 +595,10 @@ class AiObjects:
)
# Log before calling model
logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {len(context.encode('utf-8'))} bytes")
contextSize = len(context.encode('utf-8')) if context else 0
promptSize = len(modelPrompt.encode('utf-8')) if modelPrompt else 0
totalInputSize = contextSize + promptSize
logger.debug(f"Calling model {model.name} with {len(messages)} messages, context size: {contextSize} bytes, prompt size: {promptSize} bytes, total input: {totalInputSize} bytes")
# Call the model with standardized interface
modelResponse = await model.functionCall(modelCall)

View file

@ -13,21 +13,44 @@ class TextChunker(Chunker):
logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}")
logger.debug(f"TextChunker: using maxBytes: {maxBytes}")
chunks: List[Dict[str, Any]] = []
# Split by lines first (preferred method for text)
lines = part.data.split('\n')
current: List[str] = []
size = 0
for line in part.data.split('\n'):
lineSize = len(line.encode('utf-8')) + 1
for line in lines:
lineSize = len(line.encode('utf-8')) + 1 # +1 for newline character
if size + lineSize > maxBytes and current:
# Current chunk is full, save it and start new one
data = '\n'.join(current)
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
current = [line]
size = lineSize
else:
current.append(line)
size += lineSize
current = []
size = 0
# If a single line is larger than maxBytes, split it by character boundaries
if lineSize > maxBytes:
# Split the long line into chunks
lineBytes = line.encode('utf-8')
lineStart = 0
while lineStart < len(lineBytes):
chunkBytes = lineBytes[lineStart:lineStart + maxBytes]
chunkText = chunkBytes.decode('utf-8', errors='ignore')
chunks.append({"data": chunkText, "size": len(chunkBytes), "order": len(chunks)})
lineStart += maxBytes
# Don't add this line to current, it's already chunked
continue
# Add line to current chunk
current.append(line)
size += lineSize
# Add remaining lines as final chunk
if current:
data = '\n'.join(current)
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
logger.debug(f"TextChunker: Created {len(chunks)} chunks, total input size: {len(part.data.encode('utf-8'))} bytes")
return chunks

View file

@ -945,6 +945,15 @@ class MethodSharepoint(MethodBase):
else:
# List all items in the drive root
endpoint = f"sites/{siteId}/drive/root/children"
# Make the API call to list items
listResult = await self._makeGraphApiCall(endpoint)
if "error" in listResult:
logger.warning(f"List failed for site {siteName}: {listResult['error']}")
continue
# Process list results for this site
items = listResult.get("value", [])
logger.info(f"Retrieved {len(items)} items from site {siteName}")
else:
# For files, use regular search API
# Clean the query: remove path-like syntax and invalid KQL syntax