AI core with unlimited document size

This commit is contained in:
ValueOn AG 2025-10-11 18:30:26 +02:00
parent 73c1126200
commit be5f6773b6
14 changed files with 2083 additions and 527 deletions

View file

@ -135,3 +135,29 @@ class AiCallResponse(BaseModel):
costEstimate: Optional[float] = Field(default=None, description="Estimated cost of the call")
class EnhancedAiCallOptions(AiCallOptions):
    """Enhanced options for improved document processing with chunk mapping."""
    # Parallel processing
    # Toggle for fanning extracted chunks out to the AI concurrently
    # instead of one-by-one.
    enableParallelProcessing: bool = Field(
        default=True,
        description="Enable parallel processing of chunks"
    )
    # Upper bound on simultaneous chunk AI calls; constrained to 1..20.
    maxConcurrentChunks: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Maximum number of chunks to process concurrently"
    )
    # Chunk mapping
    # Whether per-chunk metadata (indices, sizes, success flags) is kept
    # attached while results flow through processing and merging.
    preserveChunkMetadata: bool = Field(
        default=True,
        description="Preserve chunk metadata during processing"
    )
    # Text inserted between chunk results when merged output is assembled.
    chunkSeparator: str = Field(
        default="\n\n---\n\n",
        description="Separator between chunks in merged output"
    )

View file

@ -18,6 +18,16 @@ class ContentExtracted(BaseModel):
summary: Optional[Dict[str, Any]] = Field(default=None, description="Optional extraction summary")
class ChunkResult(BaseModel):
    """Preserves the relationship between a chunk and its AI result."""
    # The extracted content part this AI result was produced from.
    originalChunk: ContentPart
    # Raw AI output text generated for the chunk.
    aiResult: str
    # Global ordinal of the chunk across all documents in one processing run.
    chunkIndex: int
    # Identifier of the source document the chunk came from.
    documentId: str
    # Wall-clock seconds spent producing aiResult (0.0 when unknown).
    processingTime: float = 0.0
    # Free-form processing details (e.g. success flag, sizes, error text, typeGroup).
    metadata: Dict[str, Any] = Field(default_factory=dict)
class MergeStrategy(BaseModel):
"""Strategy configuration for merging content parts and AI results."""

View file

@ -261,6 +261,7 @@ class AiObjects:
if not requiredTags:
requiredTags = OPERATION_TAG_MAPPING.get(options.operationType, [ModelTags.TEXT, ModelTags.CHAT])
# Override priority based on processing mode if not explicitly set
effectivePriority = options.priority
if options.priority == Priority.BALANCED:
@ -269,6 +270,7 @@ class AiObjects:
logger.info(f"Model selection - Operation: {options.operationType}, Required tags: {requiredTags}, Priority: {effectivePriority}")
for name, info in aiModels.items():
logger.info(f"Checking model: {name}, tags: {info.get('tags', [])}, function: {info.get('function', 'unknown')}")
# Check context length
if info["contextLength"] > 0 and totalSize > info["contextLength"] * 0.8:
continue
@ -280,8 +282,11 @@ class AiObjects:
# Check required tags/capabilities
modelTags = info.get("tags", [])
if requiredTags and not any(tag in modelTags for tag in requiredTags):
if requiredTags and not all(tag in modelTags for tag in requiredTags):
logger.info(f" -> Skipping {name}: missing required tags. Has: {modelTags}, needs: {requiredTags}")
continue
else:
logger.info(f" -> {name} passed tag check")
# Check processing mode requirements
if options.processingMode == ProcessingMode.DETAILED and ModelTags.FAST in modelTags:
@ -289,16 +294,24 @@ class AiObjects:
continue
candidates[name] = info
logger.info(f" -> {name} added to candidates")
logger.info(f"Final candidates: {list(candidates.keys())}")
if not candidates:
logger.info("No candidates found, using fallback")
# Fallback based on operation type
if options.operationType == OperationType.IMAGE_ANALYSIS:
logger.info("Using fallback: openai_callAiImage")
return "openai_callAiImage"
elif options.operationType == OperationType.IMAGE_GENERATION:
logger.info("Using fallback: openai_generateImage")
return "openai_generateImage"
elif options.operationType == OperationType.WEB_RESEARCH:
logger.info("Using fallback: perplexity_callAiWithWebSearch")
return "perplexity_callAiWithWebSearch"
else:
logger.info("Using fallback: openai_callAiBasic_gpt35")
return "openai_callAiBasic_gpt35"
# Special handling for planning operations - use Claude for consistency
@ -314,17 +327,25 @@ class AiObjects:
# Select based on priority for other operations
if effectivePriority == Priority.SPEED:
return max(candidates, key=lambda k: candidates[k]["speedRating"])
selected = max(candidates, key=lambda k: candidates[k]["speedRating"])
logger.info(f"Selected by SPEED: {selected}")
return selected
elif effectivePriority == Priority.QUALITY:
return max(candidates, key=lambda k: candidates[k]["qualityRating"])
selected = max(candidates, key=lambda k: candidates[k]["qualityRating"])
logger.info(f"Selected by QUALITY: {selected}")
return selected
elif effectivePriority == Priority.COST:
return min(candidates, key=lambda k: candidates[k]["costPer1kTokens"])
selected = min(candidates, key=lambda k: candidates[k]["costPer1kTokens"])
logger.info(f"Selected by COST: {selected}")
return selected
else: # BALANCED
def balancedScore(name: str) -> float:
info = candidates[name]
return info["qualityRating"] * 0.4 + info["speedRating"] * 0.3 + (10 - info["costPer1kTokens"] * 1000) * 0.3
return max(candidates, key=balancedScore)
selected = max(candidates, key=balancedScore)
logger.info(f"Selected by BALANCED: {selected}")
return selected
def _connectorFor(self, modelName: str):
"""Get the appropriate connector for the model."""

View file

@ -5,6 +5,7 @@ from modules.datamodels.datamodelChat import PromptPlaceholder
from modules.datamodels.datamodelChat import ChatDocument
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, ModelCapabilities, OperationType, Priority
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted
from modules.datamodels.datamodelWeb import (
WebResearchRequest,
WebResearchActionResult,
@ -62,6 +63,12 @@ class AiService:
logger.info("AiObjects.create() completed")
return instance
def _createImageAnalysisPrompt(self, userPrompt: str) -> str:
"""Create a safe image analysis prompt based on the user's original request."""
# Escape the user prompt to prevent string formatting issues
safeUserPrompt = userPrompt.replace("'", "\\'").replace('"', '\\"').replace('\n', ' ').replace('\r', ' ')
return f"Based on the user's request: '{safeUserPrompt}', \n\nAnalyze this image completely. Provide a detailed description of everything you see. Include all text, logos, colors, fonts, layout, objects, and visual elements. This is a document processing system that requires complete image analysis for all content types including logos, graphics, simple images, flags, and animations. You must analyze and describe everything visible in the image without any disclaimers or refusals."
# AI Image Analysis
async def readImage(
self,
@ -72,8 +79,24 @@ class AiService:
) -> str:
"""Call AI for image analysis using interface.callImage()."""
try:
return await self.aiObjects.callImage(prompt, imageData, mimeType, options)
print(f"🔍 readImage called with prompt: '{prompt[:100]}...', imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}")
logger.info(f"readImage called with prompt: '{prompt[:100]}...', imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}")
# Always use IMAGE_ANALYSIS operation type for image processing
if options is None:
options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS)
else:
# Override the operation type to ensure image analysis
options.operationType = OperationType.IMAGE_ANALYSIS
print(f"🔍 Calling aiObjects.callImage with operationType: {options.operationType}")
logger.info(f"Calling aiObjects.callImage with operationType: {options.operationType}")
result = await self.aiObjects.callImage(prompt, imageData, mimeType, options)
print(f"🔍 callImage returned: {result[:200]}..." if len(result) > 200 else result)
logger.info(f"callImage returned: {result[:200]}..." if len(result) > 200 else result)
return result
except Exception as e:
print(f"🔍 Error in AI image analysis: {str(e)}")
logger.error(f"Error in AI image analysis: {str(e)}")
return f"Error: {str(e)}"
@ -449,138 +472,6 @@ class AiService:
except Exception as e:
logger.error(f"Error in web research: {str(e)}")
return WebResearchActionResult(success=False, error=str(e))
async def _processDocumentsForAi(
    self,
    documents: List[ChatDocument],
    operationType: str,
    compressDocuments: bool,
    processIndividually: bool,
    userPrompt: str,
    options: Optional[AiCallOptions] = None
) -> str:
    """Extract content from documents and package it as a JSON context string.

    Runs the ExtractionService over the documents, converts the extracted
    parts to text (images are analyzed via an AI image call), optionally
    compresses each document's text, and returns a JSON document listing
    with name/type/content per document.

    Args:
        documents: Documents to extract content from; empty list yields "".
        operationType: Operation label embedded in the extraction prompt.
        compressDocuments: When True, document texts larger than 10000
            UTF-8 bytes are compressed via _compressContent.
        processIndividually: Passed through to extraction as
            "processDocumentsIndividually".
        userPrompt: The user's request, embedded in extraction/image prompts.
        options: Optional call options; drives size limits and mergeStrategy.

    Returns:
        JSON string with keys "documents" and "totalDocuments".
    """
    if not documents:
        return ""
    # Calculate model-derived size limits
    maxContextBytes = self._calculateMaxContextBytes(options)
    # Build extraction options with model-derived limits
    extractionOptions: Dict[str, Any] = {
        "prompt": f"Extract content that supports the user's request: '{userPrompt}'. Focus on information relevant to: {operationType}",
        "operationType": operationType,
        "processDocumentsIndividually": processIndividually,
        "maxSize": maxContextBytes,
        # chunking is disabled when the caller asked for context compression
        "chunkAllowed": not options.compressContext if options else True,
        "textChunkSize": int(maxContextBytes * 0.3),  # 30% of max for text chunks
        "imageChunkSize": int(maxContextBytes * 0.5),  # 50% of max for image chunks
        "imageMaxPixels": 1024 * 1024,  # 1MP default
        "imageQuality": 85,
        "mergeStrategy": {
            "groupBy": "typeGroup",
            "orderBy": "id",
            "mergeType": "concatenate"  # Default fallback
        },
    }
    # Override mergeStrategy if provided in options
    if options and hasattr(options, 'mergeStrategy') and options.mergeStrategy:
        extractionOptions["mergeStrategy"] = options.mergeStrategy
    else:
        # Set intelligent merge strategy for JSON and CSV based on outputFormat
        # This is a fallback when mergeStrategy is not provided in options
        pass  # Keep default concatenate strategy
    processedContents: List[str] = []
    try:
        # Use new ChatDocument-based API
        logger.info(f"=== PROCESSING {len(documents)} DOCUMENTS FOR AI ===")
        for i, doc in enumerate(documents):
            logger.info(f"Document {i}: {doc.fileName} (MIME: {doc.mimeType})")
        extractionResult = self.extractionService.extractContent(documents, extractionOptions)
        logger.info(f"Extraction completed: {len(extractionResult)} results")

        async def _partsToText(parts, documentName: str, documentType: str, logger_ref) -> str:
            """Join text-like parts and AI-described image parts into one string."""
            lines: List[str] = []
            logger_ref.debug(f"Processing {len(parts)} content parts for {documentName}")
            for p in parts:
                logger_ref.debug(f" Part: {p.typeGroup} ({p.mimeType}) - {len(p.data) if p.data else 0} chars")
                if p.typeGroup in ("text", "table", "structure") and p.data and isinstance(p.data, str):
                    lines.append(p.data)
                elif p.typeGroup == "image" and p.data:
                    # Use AI to extract text from image with user prompt
                    logger_ref.debug(f" Processing image with AI using user prompt...")
                    try:
                        imageResult = await self.aiObjects.callImage(
                            prompt=userPrompt,
                            imageData=p.data,
                            mimeType=p.mimeType
                        )
                        lines.append(f"[Image Analysis]: {imageResult}")
                        logger_ref.debug(f" AI image analysis completed: {len(imageResult)} chars")
                    except Exception as e:
                        # Image failures are recorded inline rather than raised.
                        logger_ref.warning(f" AI image processing failed: {e}")
                        lines.append(f"[Image Analysis Failed]: {str(e)}")
            return "\n\n".join(lines)

        if isinstance(extractionResult, list):
            # NOTE(review): extraction results are matched to documents by
            # position — assumes one result per input document, in order.
            for i, ec in enumerate(extractionResult):
                try:
                    # Get document info for this extraction result
                    doc = documents[i] if i < len(documents) else None
                    docName = doc.fileName if doc else f"Document_{i}"
                    docType = doc.mimeType if doc else "unknown"
                    contentText = await _partsToText(ec.parts, docName, docType, logger)
                    logger.debug(f"Document {i} content: {len(contentText)} chars")
                    if compressDocuments and len(contentText.encode("utf-8")) > 10000:
                        originalLength = len(contentText)
                        contentText = await self._compressContent(contentText, 10000, "document")
                        logger.debug(f"Document {i} compressed: {originalLength} -> {len(contentText)} chars")
                    processedContents.append(contentText)
                except Exception as e:
                    logger.warning(f"Error aggregating extracted content: {str(e)}")
                    processedContents.append("[Error aggregating content]")
        else:
            # Fallback: no content
            contentText = ""
            if compressDocuments and len(contentText.encode("utf-8")) > 10000:
                contentText = await self._compressContent(contentText, 10000, "document")
            processedContents.append(contentText)
    except Exception as e:
        logger.warning(f"Error during extraction: {str(e)}")
        processedContents.append("[Error during extraction]")
    # Build JSON structure ONLY when adding to AI prompt
    import json
    documentsJson = []
    for i, content in enumerate(processedContents):
        doc = documents[i] if i < len(documents) else None
        docName = doc.fileName if doc else f"Document_{i}"
        docType = doc.mimeType if doc else "unknown"
        documentData = {
            "documentName": docName,
            "documentType": docType,
            "content": content
        }
        documentsJson.append(documentData)
    finalContext = json.dumps({
        "documents": documentsJson,
        "totalDocuments": len(documentsJson)
    }, indent=2, ensure_ascii=False)
    logger.debug(f"=== FINAL CONTEXT ===")
    logger.debug(f"Total context: {len(finalContext)} chars")
    logger.debug(f"Documents: {len(documentsJson)}")
    return finalContext
def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
"""Calculate maximum context bytes based on model capabilities and options."""
@ -604,6 +495,7 @@ class AiService:
) -> str:
"""
Process documents with per-chunk AI calls and merge results.
FIXED: Now preserves chunk relationships and document structure.
Args:
documents: List of ChatDocument objects to process
@ -611,7 +503,7 @@ class AiService:
options: AI call options
Returns:
Merged AI results as string
Merged AI results as string with preserved document structure
"""
if not documents:
return ""
@ -619,7 +511,7 @@ class AiService:
# Get model capabilities for size calculation
model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)
# Build extraction options for chunking
# Build extraction options for chunking with intelligent merging
extractionOptions: Dict[str, Any] = {
"prompt": prompt,
"operationType": options.operationType if options else "general",
@ -631,6 +523,9 @@ class AiService:
"imageMaxPixels": 1024 * 1024,
"imageQuality": 85,
"mergeStrategy": {
"useIntelligentMerging": True, # Enable intelligent token-aware merging
"modelCapabilities": model_capabilities,
"prompt": prompt,
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
@ -646,136 +541,394 @@ class AiService:
if not isinstance(extractionResult, list):
return "[Error: No extraction results]"
# Prepare debug directory TODO TO REMOVE
import os
from datetime import datetime, UTC
debug_root = "./test-chat/ai"
ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3]
debug_dir = os.path.join(debug_root, f"{ts}_extraction_per_chunk")
try:
os.makedirs(debug_dir, exist_ok=True)
except Exception:
pass
# Process each chunk with AI
aiResults: List[str] = []
# FIXED: Process chunks with proper mapping
chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options)
for ec in extractionResult:
for part in ec.parts:
if part.typeGroup == "image":
# Process image with AI
try:
# Safety check for part.data
if not hasattr(part, 'data') or part.data is None:
logger.warning(f"Skipping image chunk with no data")
continue
aiResult = await self.readImage(
prompt=prompt,
imageData=part.data,
mimeType=part.mimeType,
options=options
)
aiResults.append(aiResult)
except Exception as e:
logger.warning(f"Error processing image chunk: {str(e)}")
aiResults.append(f"[Error processing image: {str(e)}]")
elif part.typeGroup in ("text", "table", "structure"):
# Process text content with AI
try:
# Safety check for part.data
if not hasattr(part, 'data') or part.data is None:
logger.warning(f"Skipping chunk with no data")
continue
logger.info(f"=== PROCESSING CHUNK {len(aiResults) + 1} ===")
logger.info(f"Chunk size: {len(part.data)} chars")
logger.info(f"Chunk preview: {part.data[:200]}...")
# Dump input chunk
try:
idx = len(aiResults) + 1
fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_input.txt")
with open(fpath, "w", encoding="utf-8") as f:
f.write(str(part.data))
except Exception:
pass
# Create AI call request for this chunk
request = AiCallRequest(
prompt=prompt,
context=part.data,
options=options
)
# Make the call using AiObjects
response = await self.aiObjects.call(request)
aiResults.append(response.content)
logger.info(f"Chunk {len(aiResults)} processed: {len(response.content)} chars response")
# Dump AI response
try:
idx = len(aiResults)
fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_response.txt")
with open(fpath, "w", encoding="utf-8") as f:
f.write(str(response.content))
except Exception:
pass
except Exception as e:
logger.warning(f"Error processing text chunk: {str(e)}")
aiResults.append(f"[Error processing text: {str(e)}]")
# FIXED: Merge with preserved chunk relationships
mergedContent = self._mergeChunkResults(chunkResults, options)
# Merge AI results using ExtractionService
from modules.datamodels.datamodelExtraction import MergeStrategy
# Use mergeStrategy from options if available, otherwise default
if options and hasattr(options, 'mergeStrategy') and options.mergeStrategy:
mergeStrategy = MergeStrategy(
groupBy=options.mergeStrategy.get("groupBy", "typeGroup"),
orderBy=options.mergeStrategy.get("orderBy", "id"),
mergeType=options.mergeStrategy.get("mergeType", "concatenate"),
chunkSeparator="\n\n---\n\n"
)
else:
mergeStrategy = MergeStrategy(
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate",
chunkSeparator="\n\n---\n\n"
)
mergedContent = self.extractionService.mergeAiResults(
extractionResult,
aiResults,
mergeStrategy
)
# Extract only AI-generated text from merged content
resultText = ""
for part in mergedContent.parts:
if (
part.typeGroup in ("text", "table", "structure")
and part.data
and getattr(part, "metadata", {}).get("aiResult", False)
):
resultText += part.data + "\n\n"
# Dump merged output
try:
fpath = os.path.join(debug_dir, "merged_output.txt")
with open(fpath, "w", encoding="utf-8") as f:
f.write(resultText.strip())
except Exception:
pass
return resultText.strip()
return mergedContent
except Exception as e:
logger.error(f"Error in per-chunk processing: {str(e)}")
return f"[Error in per-chunk processing: {str(e)}]"
async def _processDocumentsPerChunkClean(
    self,
    documents: List[ChatDocument],
    prompt: str,
    options: Optional[AiCallOptions] = None
) -> str:
    """
    Process documents with per-chunk AI calls and merge results in CLEAN mode.
    This version excludes debug metadata and document headers for document generation.

    Args:
        documents: List of ChatDocument objects to process
        prompt: AI prompt for processing
        options: AI call options

    Returns:
        Clean merged AI results as string without debug metadata
    """
    if not documents:
        return ""

    # Derive chunk/context sizing from the capabilities of the model that
    # will handle this prompt + document set.
    capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)

    # Extraction configuration: chunk each document on its own, with
    # intelligent token-aware merging enabled downstream.
    chunkingOptions: Dict[str, Any] = {
        "prompt": prompt,
        "operationType": options.operationType if options else "general",
        "processDocumentsIndividually": True,  # each document is chunked separately
        "maxSize": capabilities["maxContextBytes"],
        "chunkAllowed": True,
        "textChunkSize": capabilities["textChunkSize"],
        "imageChunkSize": capabilities["imageChunkSize"],
        "imageMaxPixels": 1024 * 1024,
        "imageQuality": 85,
        "mergeStrategy": {
            "useIntelligentMerging": True,  # token-aware merging
            "modelCapabilities": capabilities,
            "prompt": prompt,
            "groupBy": "typeGroup",
            "orderBy": "id",
            "mergeType": "concatenate"
        },
    }
    logger.debug(f"Per-chunk extraction options (clean mode): {chunkingOptions}")

    try:
        extracted = self.extractionService.extractContent(documents, chunkingOptions)
        if not isinstance(extracted, list):
            return "[Error: No extraction results]"

        # Run every chunk through the AI while keeping chunk->document mapping.
        perChunkResults = await self._processChunksWithMapping(extracted, prompt, options)

        # Merge in CLEAN mode: no document headers, no chunk metadata.
        return self._mergeChunkResultsClean(perChunkResults, options)
    except Exception as e:
        logger.error(f"Error in per-chunk processing (clean mode): {str(e)}")
        return f"[Error in per-chunk processing: {str(e)}]"
async def _processChunksWithMapping(
    self,
    extractionResult: List[ContentExtracted],
    prompt: str,
    options: Optional[AiCallOptions] = None
) -> List[ChunkResult]:
    """Process extracted chunks with AI while preserving chunk/document mapping.

    Each eligible content part becomes one AI call; results are wrapped in
    ChunkResult objects that keep the original part, its global index and its
    source document id so merging can later restore document structure.
    Chunks are processed in parallel, bounded by a semaphore.

    Fixes vs previous revision: removed the duplicate `print("🔍 ...")` debug
    statements (the equivalent logger calls remain) and the redundant local
    import of ChunkResult (already imported at module level).

    Args:
        extractionResult: Per-document extraction results to process.
        prompt: User prompt applied to every chunk.
        options: Optional AI call options; maxConcurrentChunks (preferred)
            or maxParallelChunks bounds the parallel fan-out (default 5).

    Returns:
        One ChunkResult per processed chunk, in input order; failures are
        returned as error-marked results rather than raised.
    """
    import asyncio
    import time

    # Collect all chunks that need processing, assigning a global index.
    chunks_to_process = []
    chunk_index = 0
    for ec in extractionResult:
        # Document-level MIME type: first part that carries it in metadata wins.
        document_mime_type = None
        for part in ec.parts:
            if part.metadata and 'documentMimeType' in part.metadata:
                document_mime_type = part.metadata['documentMimeType']
                break
        for part in ec.parts:
            if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                chunks_to_process.append({
                    'part': part,
                    'chunk_index': chunk_index,
                    'document_id': ec.id,
                    'document_mime_type': document_mime_type
                })
                chunk_index += 1
    logger.info(f"Processing {len(chunks_to_process)} chunks with proper mapping")

    async def process_single_chunk(chunk_info: Dict) -> ChunkResult:
        """Run one chunk through the appropriate AI path; never raises."""
        part = chunk_info['part']
        chunk_index = chunk_info['chunk_index']
        document_id = chunk_info['document_id']
        document_mime_type = chunk_info.get('document_mime_type', part.mimeType)
        start_time = time.time()
        try:
            # Decide via MIME type first, falling back to the part's typeGroup.
            is_image = (
                (document_mime_type and document_mime_type.startswith('image/')) or
                (part.mimeType and part.mimeType.startswith('image/')) or
                (part.typeGroup == "image")
            )
            logger.info(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}")
            if is_image:
                # Image chunk: derive a dedicated analysis prompt from the user's intent.
                imagePrompt = self._createImageAnalysisPrompt(prompt)
                ai_result = await self.readImage(
                    prompt=imagePrompt,
                    imageData=part.data,
                    mimeType=part.mimeType,
                    options=options
                )
            elif part.typeGroup in ("container", "binary"):
                # Container/binary content is not sent to the AI; record a placeholder.
                ai_result = f"[Skipped {part.typeGroup} content: {len(part.data)} bytes]"
            else:
                # Text-like chunk: force the GENERAL operation type.
                # NOTE(review): this mutates the caller-supplied options object
                # when options is not None — consider copying before overriding;
                # kept as-is to preserve existing behavior. TODO confirm intent.
                request_options = options if options is not None else AiCallOptions()
                request_options.operationType = OperationType.GENERAL
                logger.info(f"Chunk {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}")
                request = AiCallRequest(
                    prompt=prompt,
                    context=part.data,
                    options=request_options
                )
                response = await self.aiObjects.call(request)
                ai_result = response.content
            processing_time = time.time() - start_time
            logger.info(f"Chunk {chunk_index} processed: {len(ai_result)} chars in {processing_time:.2f}s")
            return ChunkResult(
                originalChunk=part,
                aiResult=ai_result,
                chunkIndex=chunk_index,
                documentId=document_id,
                processingTime=processing_time,
                metadata={
                    "success": True,
                    "chunkSize": len(part.data) if part.data else 0,
                    "resultSize": len(ai_result),
                    "typeGroup": part.typeGroup
                }
            )
        except Exception as e:
            processing_time = time.time() - start_time
            logger.warning(f"Error processing chunk {chunk_index}: {str(e)}")
            return ChunkResult(
                originalChunk=part,
                aiResult=f"[Error processing chunk: {str(e)}]",
                chunkIndex=chunk_index,
                documentId=document_id,
                processingTime=processing_time,
                metadata={
                    "success": False,
                    "error": str(e),
                    "chunkSize": len(part.data) if part.data else 0,
                    "typeGroup": part.typeGroup
                }
            )

    # Resolve the concurrency bound from options (Enhanced options first,
    # then the legacy maxParallelChunks name).
    max_concurrent = 5  # Default concurrency
    if options and hasattr(options, 'maxConcurrentChunks'):
        max_concurrent = options.maxConcurrentChunks
    elif options and hasattr(options, 'maxParallelChunks'):
        max_concurrent = options.maxParallelChunks
    logger.info(f"Processing {len(chunks_to_process)} chunks with max concurrency: {max_concurrent}")

    # Fan out in parallel, with a semaphore limiting simultaneous AI calls.
    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_with_semaphore(chunk_info):
        async with semaphore:
            return await process_single_chunk(chunk_info)

    tasks = [process_with_semaphore(chunk_info) for chunk_info in chunks_to_process]
    chunk_results = await asyncio.gather(*tasks, return_exceptions=True)

    # Convert any exception escaping gather into an error-marked ChunkResult
    # so the output list stays aligned with chunks_to_process.
    processed_results = []
    for i, result in enumerate(chunk_results):
        if isinstance(result, Exception):
            chunk_info = chunks_to_process[i]
            processed_results.append(ChunkResult(
                originalChunk=chunk_info['part'],
                aiResult=f"[Error in parallel processing: {str(result)}]",
                chunkIndex=chunk_info['chunk_index'],
                documentId=chunk_info['document_id'],
                processingTime=0.0,
                metadata={"success": False, "error": str(result)}
            ))
        else:
            processed_results.append(result)
    logger.info(f"Completed processing {len(processed_results)} chunks")
    return processed_results
def _mergeChunkResults(
    self,
    chunkResults: List[ChunkResult],
    options: Optional[AiCallOptions] = None
) -> str:
    """Merge chunk results while preserving document structure and chunk order.

    Results are grouped per source document (first-seen order), sorted by
    chunk index, joined with the configured separator, and each document is
    prefixed with a "=== DOCUMENT: <id> ===" header.
    """
    if not chunkResults:
        return ""

    # Resolve merge configuration: an explicit chunkSeparator attribute wins
    # over a mergeStrategy dict; metadata prefixes are opt-in.
    separator = "\n\n---\n\n"
    with_metadata = False
    if options:
        if hasattr(options, 'chunkSeparator'):
            separator = options.chunkSeparator
        elif hasattr(options, 'mergeStrategy') and options.mergeStrategy:
            separator = options.mergeStrategy.get("chunkSeparator", "\n\n---\n\n")
        if hasattr(options, 'preserveChunkMetadata'):
            with_metadata = options.preserveChunkMetadata

    # Bucket results per document, keeping first-seen document order.
    byDocument: Dict[str, List[ChunkResult]] = {}
    for result in chunkResults:
        byDocument.setdefault(result.documentId, []).append(result)

    documentTexts = []
    for docId, docChunks in byDocument.items():
        docChunks.sort(key=lambda r: r.chunkIndex)
        pieces = []
        for result in docChunks:
            meta = result.metadata
            if meta.get("success", False):
                text = result.aiResult
                if with_metadata:
                    info = f"[Chunk {result.chunkIndex} - {meta.get('typeGroup', 'unknown')} - {meta.get('chunkSize', 0)} chars]"
                    text = f"{info}\n{text}"
                pieces.append(text)
            else:
                # Failed chunks are represented inline so nothing silently disappears.
                pieces.append(f"[ERROR in chunk {result.chunkIndex}: {meta.get('error', 'Unknown error')}]")
        documentTexts.append(f"\n\n=== DOCUMENT: {docId} ===\n\n" + separator.join(pieces))

    merged = "\n\n".join(documentTexts)
    logger.info(f"Merged {len(chunkResults)} chunks from {len(byDocument)} documents")
    return merged.strip()
def _mergeChunkResultsClean(
    self,
    chunkResults: List[ChunkResult],
    options: Optional[AiCallOptions] = None
) -> str:
    """Merge chunk results in CLEAN mode - no debug metadata or document headers.

    Successful, non-empty chunk results are grouped per document (first-seen
    order), sorted by chunk index, and joined with the configured separator.
    Error chunks, "[Skipped ...]" container/binary placeholders, and
    whitespace-only results are dropped.

    Fixes vs previous revision: the separator was appended *before* the skip
    checks, so skipped/error chunks still leaked separators into the output,
    and documents whose chunks were all skipped injected blank segments with
    doubled separators into the final join. Dead locals (doc_header,
    include_document_headers, include_chunk_metadata) removed.

    Args:
        chunkResults: Chunk results to merge; empty list yields "".
        options: Optional options; chunkSeparator attribute (preferred) or
            mergeStrategy["chunkSeparator"] overrides the default "\\n\\n".

    Returns:
        Clean merged text with no headers or metadata.
    """
    if not chunkResults:
        return ""

    # Resolve the chunk separator (plain blank line by default in clean mode).
    chunk_separator = "\n\n"
    if options:
        if hasattr(options, 'chunkSeparator'):
            chunk_separator = options.chunkSeparator
        elif hasattr(options, 'mergeStrategy') and options.mergeStrategy:
            chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n")

    # Group chunk results by document, keeping first-seen document order.
    results_by_document: Dict[str, List[ChunkResult]] = {}
    for chunk_result in chunkResults:
        results_by_document.setdefault(chunk_result.documentId, []).append(chunk_result)

    merged_documents = []
    for doc_id, doc_chunks in results_by_document.items():
        doc_chunks.sort(key=lambda x: x.chunkIndex)
        kept = []
        for chunk_result in doc_chunks:
            # CLEAN MODE: error chunks are dropped silently.
            if not chunk_result.metadata.get("success", False):
                continue
            content = chunk_result.aiResult
            # CLEAN MODE: skip container/binary placeholders entirely.
            if content.startswith("[Skipped ") and "content:" in content:
                continue
            # CLEAN MODE: skip empty or whitespace-only chunks.
            if not content.strip():
                continue
            kept.append(content)
        # Only emit documents that actually contributed content, so fully
        # skipped documents no longer produce blank segments.
        if kept:
            merged_documents.append(chunk_separator.join(kept))

    final_result = "\n\n".join(merged_documents)
    logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents (clean mode)")
    return final_result.strip()
async def _compressContent(self, content: str, targetSize: int, contentType: str) -> str:
if len(content.encode("utf-8")) <= targetSize:
return content
@ -1032,190 +1185,31 @@ class AiService:
) -> str:
"""
Handle text calls with document processing through ExtractionService.
UNIFIED PROCESSING: Always use per-chunk processing for consistency.
"""
# Ensure aiObjects is initialized
await self._ensureAiObjectsInitialized()
# Determine processing strategy based on options
if options.processDocumentsIndividually and documents:
# Use per-chunk processing for individual document processing
return await self._processDocumentsPerChunk(documents, prompt, options)
# UNIFIED PROCESSING: Always use per-chunk processing for consistency
# This ensures MIME-type checking, chunk mapping, and parallel processing
return await self._processDocumentsPerChunk(documents, prompt, options)
# Check if we need chunking - if so, use per-chunk processing
if documents and not options.compressContext:
# Get model capabilities to check if chunking will be needed
model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)
total_doc_size = sum(doc.fileSize or 0 for doc in documents)
if total_doc_size > model_capabilities["maxContextBytes"]:
logger.info(f"Document size ({total_doc_size}) exceeds model capacity ({model_capabilities['maxContextBytes']}), using per-chunk processing")
return await self._processDocumentsPerChunk(documents, prompt, options)
async def _callAiTextClean(
self,
prompt: str,
documents: Optional[List[ChatDocument]],
options: AiCallOptions
) -> str:
"""
Handle text calls with document processing in CLEAN mode for document generation.
This version excludes debug metadata and document headers from the final output.
"""
# Ensure aiObjects is initialized
await self._ensureAiObjectsInitialized()
# Extract and process documents using ExtractionService
context = ""
if documents:
logger.info(f"=== EXTRACTING CONTENT FROM {len(documents)} DOCUMENTS ===")
# Get model capabilities for size calculation
model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)
# Use new ChatDocument-based API
extraction_options = {
"prompt": prompt,
"operationType": options.operationType,
"processDocumentsIndividually": options.processDocumentsIndividually,
"maxSize": options.maxContextBytes or model_capabilities["maxContextBytes"],
"chunkAllowed": not options.compressContext,
"textChunkSize": model_capabilities["textChunkSize"],
"imageChunkSize": model_capabilities["imageChunkSize"],
"imageMaxPixels": 1024 * 1024,
"imageQuality": 85,
"mergeStrategy": {"groupBy": "typeGroup", "orderBy": "id", "mergeType": "concatenate"}
}
logger.debug(f"Extraction options: {extraction_options}")
extracted_content = self.extractionService.extractContent(
documents=documents,
options=extraction_options
)
logger.info(f"Extraction completed: {len(extracted_content)} documents")
# Build context from list of extracted content
if isinstance(extracted_content, list):
context_parts = []
chunk_count = 0
for ec in extracted_content:
for p in ec.parts:
if p.typeGroup in ["text", "table", "structure"] and p.data:
if p.metadata.get("chunk", False):
chunk_count += 1
context_parts.append(p.data)
elif p.typeGroup == "image" and p.data:
# Process image with AI using user prompt
try:
imageResult = await self.aiObjects.callImage(
prompt=prompt,
imageData=p.data,
mimeType=p.mimeType
)
context_parts.append(f"[Image Analysis]: {imageResult}")
except Exception as e:
logger.warning(f"AI image processing failed: {e}")
context_parts.append(f"[Image Analysis Failed]: {str(e)}")
if chunk_count > 0:
logger.debug(f"=== PROCESSING CHUNKED CONTENT ===")
logger.debug(f"Total chunks: {chunk_count}")
logger.debug(f"Total context parts: {len(context_parts)}")
context = "\n\n---\n\n".join(context_parts)
else:
context = ""
# Process documents with clean merging (no debug metadata)
return await self._processDocumentsPerChunkClean(documents, prompt, options)
# Check size and reduce if needed
full_prompt = prompt + "\n\n" + context if context else prompt
# Add generic completeness guidance: first vs subsequent (based on presence of context)
try:
if context and context.strip():
# Subsequent calls with prior context: continue next part only
full_prompt += (
"\n\nINSTRUCTIONS (COMPLETENESS):\n"
"- Continue from where the previous content ended. Do NOT repeat earlier content.\n"
"- If more parts are still needed after this response, the LAST LINE of your response MUST be exactly: 'CONTINUATION: true'.\n"
"- If the content is now complete, the LAST LINE of your response MUST be exactly: 'CONTINUATION: false'.\n"
"- The continuation line MUST be the final line of your output. Do NOT output anything after it (no notes, no explanations).\n"
)
else:
# First call (no prior context): deliver full content or first part
full_prompt += (
"\n\nINSTRUCTIONS (COMPLETENESS):\n"
"- Deliver the complete content. Do NOT truncate.\n"
"- If platform limits force truncation, provide the first complete section(s) only and ensure the LAST LINE of your response is exactly: 'CONTINUATION: true'.\n"
"- If the entire content is fully included, ensure the LAST LINE of your response is exactly: 'CONTINUATION: false'.\n"
"- The continuation line MUST be the final line of your output. Do NOT output anything after it (no notes, no explanations).\n"
)
except Exception:
# Non-fatal if any issue building guidance
pass
logger.debug(f"AI call: {len(full_prompt)} chars (prompt: {len(prompt)}, context: {len(context)})")
# Use AiObjects to select the best model and make the call
try:
# Helper to detect and strip continuation flag
import re
def _split_content_and_flag(text: str) -> (str, bool):
if not text:
return "", False
lines = text.strip().splitlines()
cont = False
# Scan last 3 lines for flag to be robust
for i in range(1, min(4, len(lines))+1):
m = re.match(r"^\s*CONTINUATION:\s*(true|false)\s*$", lines[-i].strip(), re.IGNORECASE)
if m:
cont = m.group(1).lower() == 'true'
# remove the matched flag line
del lines[-i]
break
return "\n".join(lines).strip(), cont
# First call
request = AiCallRequest(
prompt=full_prompt,
context="",
options=options
)
response = await self.aiObjects.call(request)
try:
logger.debug(f"AI model selected (text): {getattr(response, 'modelName', 'unknown')}")
except Exception:
pass
content_first = response.content or ""
merged_content, needs_more = _split_content_and_flag(content_first)
# Iteratively request next parts if flagged
# Allow configurable max parts via options; default = 1000
try:
max_parts = int(getattr(options, 'maxParts', 1000) or 1000)
except Exception:
max_parts = 1000
part_index = 1
while needs_more and part_index < max_parts:
part_index += 1
# Build subsequent prompt with explicit continuation instructions
subsequent_prompt = (
prompt
+ "\n\nINSTRUCTIONS (CONTINUE NEXT PART ONLY):\n"
"- Continue from where the previous content ended.\n"
"- Do NOT repeat earlier content.\n"
"- The LAST LINE of your response MUST be exactly one of: 'CONTINUATION: true' (if more parts are needed) or 'CONTINUATION: false' (if complete).\n"
"- The continuation line MUST be the final line of your output. Do NOT output anything after it (no notes, no explanations).\n"
)
next_request = AiCallRequest(
prompt=subsequent_prompt,
context=merged_content,
options=options
)
next_response = await self.aiObjects.call(next_request)
part_text = next_response.content or ""
part_clean, needs_more = _split_content_and_flag(part_text)
if part_clean:
# Separate parts clearly
merged_content = (merged_content + "\n\n" + part_clean).strip()
else:
# Avoid infinite loops on empty parts
break
logger.debug(f"=== AI RESPONSE (MERGED) ===")
logger.debug(f"Response length: {len(merged_content)} chars")
logger.debug(f"Response preview: {merged_content[:200]}...")
return merged_content
except Exception as e:
logger.error(f"AI call failed: {e}")
raise Exception(f"AI call failed: {e}")
def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]:
@ -1527,8 +1521,9 @@ class AiService:
title=title
)
# Process documents with format-specific prompt
ai_response = await self._callAiText(extraction_prompt, documents, options)
# Process documents with format-specific prompt using CLEAN mode
# This ensures no debug metadata is included in the final output
ai_response = await self._callAiTextClean(extraction_prompt, documents, options)
# Parse filename header from AI response if present
parsed_filename = None
@ -1556,7 +1551,8 @@ class AiService:
rendered_content, mime_type = await generation_service.renderReport(
extracted_content=ai_response,
output_format=outputFormat,
title=title
title=title,
user_prompt=prompt
)
# Generate meaningful filename (use AI-provided if valid, else fallback)

View file

@ -0,0 +1,206 @@
import logging
import base64
from typing import List, Dict, Any, Optional
from modules.datamodels.datamodelExtraction import ContentPart, ContentExtracted
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
class PptxExtractor(Extractor):
    """Extractor for PowerPoint (.pptx) files using the python-pptx library.

    Emits one ContentPart per slide (free text plus tables rendered as
    markdown), one ContentPart per embedded picture, and a leading overview
    part that summarises the whole presentation.
    """

    # Numeric value of MSO_SHAPE_TYPE.PICTURE; kept as a named constant
    # instead of a magic number in the shape loop.
    _PICTURE_SHAPE_TYPE = 13

    def __init__(self):
        # Lazy-load state: python-pptx is only imported on first use.
        self._loaded = False
        self._haveLibs = False

    def _load(self):
        """Import python-pptx on first use; record availability in _haveLibs."""
        if self._loaded:
            return
        self._loaded = True
        try:
            global Presentation
            from pptx import Presentation
            self._haveLibs = True
        except Exception:
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True if the file looks like a PowerPoint document.

        NOTE(review): legacy binary .ppt is accepted here, but python-pptx can
        only parse OOXML .pptx; a .ppt file will fail inside extract() and be
        reported as an extraction-error part — confirm this is intended.
        """
        return (mimeType in [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint"
        ]) or (fileName or "").lower().endswith((".pptx", ".ppt"))

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Extract content from PowerPoint files.

        Args:
            fileBytes: Raw file data as bytes
            context: Context dictionary with file information

        Returns:
            List of ContentPart objects with extracted content; on failure a
            single error part (metadata["error"] is True) is returned.
        """
        self._load()
        if not self._haveLibs:
            logger.error("python-pptx library not installed. Install with: pip install python-pptx")
            return [ContentPart(
                id="error",
                label="PowerPoint Extraction Error",
                typeGroup="text",
                mimeType="text/plain",
                data="Error: python-pptx library not installed",
                metadata={"error": True, "error_message": "python-pptx library not installed"}
            )]

        try:
            import io

            # Load presentation from bytes.
            presentation = Presentation(io.BytesIO(fileBytes))
            parts: List[ContentPart] = []

            # Extract content from each slide (1-based index for labels).
            for slide_index, slide in enumerate(presentation.slides, 1):
                slide_content = []

                # Pass 1: free text from any shape that carries it.
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_content.append(shape.text.strip())

                # Pass 2: tables, rendered as markdown.
                # getattr guards shape classes that may not expose has_table.
                for shape in slide.shapes:
                    if getattr(shape, "has_table", False):
                        table_data = [
                            [cell.text.strip() for cell in row.cells]
                            for row in shape.table.rows
                        ]
                        if table_data:
                            slide_content.append(self._table_to_markdown(table_data))

                # Pass 3: embedded pictures, emitted as separate image parts.
                for shape in slide.shapes:
                    if shape.shape_type == self._PICTURE_SHAPE_TYPE:
                        try:
                            image = shape.image
                            image_b64 = base64.b64encode(image.blob).decode('utf-8')
                            # FIX: report the picture's real MIME type
                            # (python-pptx exposes it as Image.content_type)
                            # instead of hard-coding image/png for every format.
                            mime_type = getattr(image, "content_type", None) or "image/png"
                            parts.append(ContentPart(
                                id=f"slide_{slide_index}_image_{len(parts)}",
                                label=f"Slide {slide_index} Image",
                                typeGroup="image",
                                mimeType=mime_type,
                                data=image_b64,
                                metadata={
                                    "slide_number": slide_index,
                                    "shape_type": "image",
                                    "extracted_from": "powerpoint"
                                }
                            ))
                        except Exception as e:
                            logger.warning(f"Failed to extract image from slide {slide_index}: {str(e)}")

                # One structure part per slide that produced any content.
                if slide_content:
                    slide_text = f"# Slide {slide_index}\n\n" + "\n\n".join(slide_content)
                    parts.append(ContentPart(
                        id=f"slide_{slide_index}",
                        label=f"Slide {slide_index} Content",
                        typeGroup="structure",
                        mimeType="text/plain",
                        data=slide_text,
                        metadata={
                            "slide_number": slide_index,
                            "content_type": "slide",
                            "extracted_from": "powerpoint",
                            "text_length": len(slide_text)
                        }
                    ))

            # Presentation-level overview, inserted before all slide parts.
            overview_part = self._build_overview(presentation, parts, context)
            parts.insert(0, overview_part)

            return parts

        except Exception as e:
            logger.error(f"Error extracting PowerPoint content: {str(e)}")
            return [ContentPart(
                id="error",
                label="PowerPoint Extraction Error",
                typeGroup="text",
                mimeType="text/plain",
                data=f"Error extracting PowerPoint content: {str(e)}",
                metadata={"error": True, "error_message": str(e)}
            )]

    def _build_overview(self, presentation, parts: List[ContentPart], context: Dict[str, Any]) -> ContentPart:
        """Build the presentation-level overview part: file name, slide count,
        part count, and up to three text snippets per slide."""
        file_name = context.get("fileName", "presentation.pptx")
        overview_text = f"# PowerPoint Presentation: {file_name}\n\n"
        overview_text += f"**Total Slides:** {len(presentation.slides)}\n\n"
        overview_text += f"**Content Parts:** {len(parts)}\n\n"
        for i, slide in enumerate(presentation.slides, 1):
            snippets = [
                shape.text.strip()
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text.strip()
            ]
            if snippets:
                overview_text += f"## Slide {i}\n"
                overview_text += "\n".join(snippets[:3])  # First 3 text elements
                overview_text += "\n\n"
        return ContentPart(
            id="presentation_overview",
            label="Presentation Overview",
            typeGroup="text",
            mimeType="text/plain",
            data=overview_text,
            metadata={
                "content_type": "overview",
                "extracted_from": "powerpoint",
                "total_slides": len(presentation.slides),
                "text_length": len(overview_text)
            }
        )

    def _table_to_markdown(self, table_data: List[List[str]]) -> str:
        """Convert a 2-D list of cell strings to a markdown table.

        The first row is treated as the header; remaining rows become data
        rows. Returns an empty string for empty input.
        """
        if not table_data:
            return ""
        markdown_lines = ["| " + " | ".join(table_data[0]) + " |"]
        # Separator row sized to the header width.
        markdown_lines.append("| " + " | ".join(["---"] * len(table_data[0])) + " |")
        for row in table_data[1:]:
            markdown_lines.append("| " + " | ".join(row) + " |")
        return "\n".join(markdown_lines)

View file

@ -0,0 +1,209 @@
"""
Intelligent Token-Aware Merger for optimizing AI calls based on LLM token limits.
"""
from typing import List, Dict, Any, Tuple
import logging
from modules.datamodels.datamodelExtraction import ContentPart
from .subUtils import makeId
logger = logging.getLogger(__name__)
class IntelligentTokenAwareMerger:
    """
    Intelligent merger that groups chunks based on LLM token limits to minimize AI calls.

    Strategy:
    1. Estimate a token count for each chunk.
    2. Pack chunks into groups that stay under the model's effective limit.
    3. Preserve document structure and reading order within each merged part.
    4. Minimize the total number of AI calls.
    """

    def __init__(self, model_capabilities: Dict[str, Any]):
        # Model limits with a configurable safety margin (default 10%).
        self.max_tokens = model_capabilities.get("maxTokens", 4000)
        self.safety_margin = model_capabilities.get("safetyMargin", 0.1)
        self.effective_max_tokens = int(self.max_tokens * (1 - self.safety_margin))
        # Rough chars-per-token heuristic used by _estimate_tokens.
        self.chars_per_token = model_capabilities.get("charsPerToken", 4)

    def merge_chunks_intelligently(self, chunks: List[ContentPart], prompt: str = "") -> List[ContentPart]:
        """
        Merge chunks intelligently based on token limits.

        Args:
            chunks: List of ContentPart chunks to merge
            prompt: AI prompt to account for in token calculation

        Returns:
            List of optimally merged ContentPart objects
        """
        if not chunks:
            return chunks
        logger.info(f"🧠 Intelligent merging: {len(chunks)} chunks, max_tokens={self.effective_max_tokens}")

        # Reserve budget for the prompt itself.
        prompt_tokens = self._estimate_tokens(prompt)
        available_tokens = self.effective_max_tokens - prompt_tokens
        logger.info(f"📊 Prompt tokens: {prompt_tokens}, Available for content: {available_tokens}")

        # Group chunks by document and type for semantic coherence.
        grouped_chunks = self._group_chunks_by_document_and_type(chunks)

        merged_parts = []
        for group_key, group_chunks in grouped_chunks.items():
            logger.info(f"📁 Processing group: {group_key} ({len(group_chunks)} chunks)")
            merged_parts.extend(self._merge_group_optimally(group_chunks, available_tokens))

        logger.info(f"✅ Intelligent merging complete: {len(chunks)} → {len(merged_parts)} parts")
        return merged_parts

    def _group_chunks_by_document_and_type(self, chunks: List[ContentPart]) -> Dict[str, List[ContentPart]]:
        """Group chunks by (documentId, typeGroup) for semantic coherence.

        Group insertion order follows first appearance in the input, so
        document ordering is preserved across groups.
        """
        groups: Dict[str, List[ContentPart]] = {}
        for chunk in chunks:
            doc_id = chunk.metadata.get("documentId", "unknown")
            group_key = f"{doc_id}_{chunk.typeGroup}"
            groups.setdefault(group_key, []).append(chunk)
        return groups

    def _merge_group_optimally(self, chunks: List[ContentPart], available_tokens: int) -> List[ContentPart]:
        """Pack chunks of one group into as few parts as possible without
        exceeding the token budget.

        FIX: the previous implementation joined chunks in size-sorted order,
        which scrambled the document's reading order inside every merged part.
        Chunks are still packed smallest-first (better bin utilisation), but
        each merged part is re-sorted back to original chunk order before its
        data is combined.
        """
        if not chunks:
            return []

        # Sort by estimated size (smallest first) while remembering each
        # chunk's original position.
        indexed = sorted(enumerate(chunks), key=lambda item: self._estimate_tokens(item[1].data))

        merged_parts: List[ContentPart] = []
        current_group: List[tuple] = []  # (original_index, chunk)
        current_tokens = 0

        def _flush():
            # Finalize the open group, restoring original reading order.
            nonlocal current_group, current_tokens
            if current_group:
                ordered = [c for _, c in sorted(current_group, key=lambda item: item[0])]
                merged_parts.append(self._create_merged_part(ordered, current_tokens))
                current_group = []
                current_tokens = 0

        for original_index, chunk in indexed:
            chunk_tokens = self._estimate_tokens(chunk.data)

            # Special case: a chunk already near the budget (>= 90% of the
            # available tokens) is processed alone.
            if chunk_tokens >= available_tokens * 0.9:
                _flush()
                merged_parts.append(chunk)
                logger.debug(f"🔍 Large chunk processed individually: {chunk_tokens} tokens")
                continue

            # Close the current group if adding this chunk would overflow it.
            if current_tokens + chunk_tokens > available_tokens and current_group:
                _flush()
            current_group.append((original_index, chunk))
            current_tokens += chunk_tokens

        _flush()
        logger.info(f"📦 Group merged: {len(chunks)} → {len(merged_parts)} parts")
        return merged_parts

    def _create_merged_part(self, chunks: List[ContentPart], total_tokens: int) -> ContentPart:
        """Create one merged ContentPart from multiple chunks.

        A single-chunk input is returned unchanged. The merged part keeps the
        first chunk's type/MIME/parent and records provenance in metadata.
        """
        if len(chunks) == 1:
            return chunks[0]  # No need to merge single chunk

        combined_data = self._combine_chunk_data(chunks)

        base_chunk = chunks[0]
        merged_metadata = base_chunk.metadata.copy()
        merged_metadata.update({
            "merged": True,
            "originalChunkCount": len(chunks),
            "totalTokens": total_tokens,
            "originalChunkIds": [c.id for c in chunks],
            "size": len(combined_data.encode('utf-8'))
        })

        merged_part = ContentPart(
            id=makeId(),
            parentId=base_chunk.parentId,
            label=f"merged_{len(chunks)}_chunks",
            typeGroup=base_chunk.typeGroup,
            mimeType=base_chunk.mimeType,
            data=combined_data,
            metadata=merged_metadata
        )
        logger.debug(f"🔗 Created merged part: {len(chunks)} chunks, {total_tokens} tokens")
        return merged_part

    def _combine_chunk_data(self, chunks: List[ContentPart]) -> str:
        """Join chunk payloads with a separator chosen by content type."""
        if not chunks:
            return ""
        if chunks[0].typeGroup == "text":
            separator = "\n\n---\n\n"  # Clear text separation
        elif chunks[0].typeGroup == "table":
            separator = "\n\n[TABLE BREAK]\n\n"  # Table separation
        else:
            separator = "\n\n---\n\n"  # Default separation
        return separator.join([chunk.data for chunk in chunks])

    def _estimate_tokens(self, text: str) -> int:
        """Estimate token count from character length (chars_per_token heuristic)."""
        if not text:
            return 0
        return len(text) // self.chars_per_token

    def calculate_optimization_stats(self, original_chunks: List[ContentPart], merged_parts: List[ContentPart]) -> Dict[str, Any]:
        """Calculate optimization statistics with detailed analysis.

        Returns call counts, percentage reduction, and a coarse analysis of
        how many chunks were too large to merge.
        """
        original_calls = len(original_chunks)
        optimized_calls = len(merged_parts)
        reduction_percent = ((original_calls - optimized_calls) / original_calls * 100) if original_calls > 0 else 0

        # Split chunks at the same 90% threshold used during merging.
        large_chunks = [c for c in original_chunks if self._estimate_tokens(c.data) >= self.effective_max_tokens * 0.9]
        small_chunks = [c for c in original_chunks if self._estimate_tokens(c.data) < self.effective_max_tokens * 0.9]

        # Theoretical best case assumes ~3 small chunks per merged call.
        theoretical_min_calls = len(large_chunks) + max(1, len(small_chunks) // 3)
        theoretical_reduction = ((original_calls - theoretical_min_calls) / original_calls * 100) if original_calls > 0 else 0

        return {
            "original_ai_calls": original_calls,
            "optimized_ai_calls": optimized_calls,
            "reduction_percent": round(reduction_percent, 1),
            "cost_savings": f"{reduction_percent:.1f}%",
            # NOTE(review): the fallback string for zero optimized calls looks
            # garbled in the source (possibly "∞") — confirm intended value.
            "efficiency_gain": f"{original_calls / optimized_calls:.1f}x" if optimized_calls > 0 else "",
            "analysis": {
                "large_chunks": len(large_chunks),
                "small_chunks": len(small_chunks),
                "theoretical_min_calls": theoretical_min_calls,
                "theoretical_reduction": round(theoretical_reduction, 1),
                "optimization_potential": "high" if reduction_percent > 50 else "moderate" if reduction_percent > 20 else "low"
            }
        }

View file

@ -67,10 +67,12 @@ class ExtractionService:
if part.metadata:
logger.debug(f" Metadata: {part.metadata}")
# Attach document id to parts if missing
# Attach document id and MIME type to parts if missing
for p in ec.parts:
if "documentId" not in p.metadata:
p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
if "documentMimeType" not in p.metadata:
p.metadata["documentMimeType"] = documentData["mimeType"]
# Log chunking information
chunked_parts = [p for p in ec.parts if p.metadata.get("chunk", False)]

View file

@ -8,6 +8,7 @@ from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.text_merger import TextMerger
from .merging.table_merger import TableMerger
from .merging.default_merger import DefaultMerger
from .intelligent_merger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__)
@ -84,14 +85,21 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
chunk_parts = [p for p in parts if p.metadata.get("chunk", False)]
logger.debug(f"runExtraction: Preserving {len(chunk_parts)} chunks from merging")
print(f"🔍 DEBUG: runExtraction - non_chunk_parts: {len(non_chunk_parts)}, chunk_parts: {len(chunk_parts)}")
if non_chunk_parts:
# Apply intelligent merging for small text parts
if non_chunk_parts:
# Count text parts
text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"]
if len(text_parts) > 5: # If we have many small text parts, merge them
logger.info(f"🔧 Merging {len(text_parts)} small text parts for efficiency")
non_chunk_parts = _mergeParts(non_chunk_parts, mergeStrategy)
# Combine non-chunk parts with chunk parts (chunks stay separate)
parts = non_chunk_parts + chunk_parts
logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})")
print(f"🔍 DEBUG: runExtraction - Final parts: {len(parts)} (chunks: {len(chunk_parts)})")
# DEBUG: dump parts and chunks to files TODO TO REMOVE
try:
base_dir = "./test-chat/ai"
@ -146,13 +154,22 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
kept: List[ContentPart] = []
remaining: List[ContentPart] = []
for p in parts:
print(f"🔍 DEBUG: Starting poolAndLimit with {len(parts)} parts, maxSize={maxSize}")
for i, p in enumerate(parts):
size = int(p.metadata.get("size", 0) or 0)
# Show first 50 characters of text content for debugging
content_preview = p.data[:50].replace('\n', '\\n') if p.data else ""
print(f"🔍 DEBUG: Part {i}: {p.typeGroup} - {size} bytes - '{content_preview}...' (current: {current})")
if current + size <= maxSize:
kept.append(p)
current += size
print(f"🔍 DEBUG: Part {i} kept (total: {current})")
else:
remaining.append(p)
print(f"🔍 DEBUG: Part {i} moved to remaining")
print(f"🔍 DEBUG: Kept: {len(kept)}, Remaining: {len(remaining)}")
# If we have remaining parts and chunking is allowed, try chunking
if remaining and chunkAllowed:
@ -160,12 +177,15 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
logger.debug(f"Remaining parts to chunk: {len(remaining)}")
logger.debug(f"Max size limit: {maxSize} bytes")
logger.debug(f"Current size used: {current} bytes")
print(f"🔍 DEBUG: Chunking {len(remaining)} remaining parts")
for p in remaining:
if p.typeGroup in ("text", "table", "structure", "image"):
if p.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
logger.debug(f"Chunking {p.typeGroup} part: {len(p.data)} chars")
print(f"🔍 DEBUG: Chunking {p.typeGroup} part with {len(p.data)} chars")
chunks = chunkerRegistry.resolve(p.typeGroup).chunk(p, options)
logger.debug(f"Created {len(chunks)} chunks")
print(f"🔍 DEBUG: Created {len(chunks)} chunks")
chunks_added = 0
for ch in chunks:
@ -197,13 +217,19 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
logger.debug(f"Preserving {len(chunk_parts)} chunks from merging")
if non_chunk_parts:
# Apply intelligent merging for small text parts
if non_chunk_parts:
# Count text parts
text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"]
if len(text_parts) > 5: # If we have many small text parts, merge them
logger.info(f"🔧 Merging {len(text_parts)} small text parts for efficiency")
non_chunk_parts = _applyMerging(non_chunk_parts, mergeStrategy)
# Combine non-chunk parts with chunk parts (chunks stay separate)
kept = non_chunk_parts + chunk_parts
logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")
print(f"🔍 DEBUG: Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")
# Re-check size after merging
totalSize = sum(int(p.metadata.get("size", 0) or 0) for p in kept)
@ -211,11 +237,30 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
# Apply size limit to merged parts
kept = _applySizeLimit(kept, maxSize)
print(f"🔍 DEBUG: poolAndLimit returning {len(kept)} parts")
return kept
def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
"""Apply merging strategy to parts."""
"""Apply merging strategy to parts with intelligent token-aware merging."""
print(f"🔍 DEBUG: _applyMerging called with {len(parts)} parts")
# Check if intelligent merging is enabled
if strategy.get("useIntelligentMerging", False):
model_capabilities = strategy.get("modelCapabilities", {})
intelligent_merger = IntelligentTokenAwareMerger(model_capabilities)
# Use intelligent merging for all parts
merged = intelligent_merger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
# Calculate and log optimization stats
stats = intelligent_merger.calculate_optimization_stats(parts, merged)
logger.info(f"🧠 Intelligent merging stats: {stats}")
print(f"🔍 DEBUG: Intelligent merging: {stats['original_ai_calls']}{stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
return merged
# Fallback to traditional merging
textMerger = TextMerger()
tableMerger = TableMerger()
defaultMerger = DefaultMerger()
@ -226,18 +271,29 @@ def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[Co
structureParts = [p for p in parts if p.typeGroup == "structure"]
otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]
print(f"🔍 DEBUG: Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")
merged: List[ContentPart] = []
if textParts:
merged.extend(textMerger.merge(textParts, strategy))
textMerged = textMerger.merge(textParts, strategy)
print(f"🔍 DEBUG: TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
merged.extend(textMerged)
if tableParts:
merged.extend(tableMerger.merge(tableParts, strategy))
tableMerged = tableMerger.merge(tableParts, strategy)
print(f"🔍 DEBUG: TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
merged.extend(tableMerged)
if structureParts:
# For now, treat structure like text
merged.extend(textMerger.merge(structureParts, strategy))
structureMerged = textMerger.merge(structureParts, strategy)
print(f"🔍 DEBUG: StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
merged.extend(structureMerged)
if otherParts:
merged.extend(defaultMerger.merge(otherParts, strategy))
otherMerged = defaultMerger.merge(otherParts, strategy)
print(f"🔍 DEBUG: DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
merged.extend(otherMerged)
print(f"🔍 DEBUG: _applyMerging returning {len(merged)} parts")
return merged

View file

@ -30,6 +30,7 @@ class ExtractorRegistry:
from .formats.pdf_extractor import PdfExtractor
from .formats.docx_extractor import DocxExtractor
from .formats.xlsx_extractor import XlsxExtractor
from .formats.pptx_extractor import PptxExtractor
from .formats.image_extractor import ImageExtractor
from .formats.binary_extractor import BinaryExtractor
self.register("text/plain", TextExtractor())
@ -41,6 +42,8 @@ class ExtractorRegistry:
self.register("application/pdf", PdfExtractor())
self.register("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor())
self.register("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor())
self.register("application/vnd.openxmlformats-officedocument.presentationml.presentation", PptxExtractor())
self.register("application/vnd.ms-powerpoint", PptxExtractor())
# images
self.register("image/jpeg", ImageExtractor())
self.register("image/png", ImageExtractor())
@ -57,6 +60,8 @@ class ExtractorRegistry:
self.register("docx", DocxExtractor())
self.register("xlsx", XlsxExtractor())
self.register("xlsm", XlsxExtractor())
self.register("pptx", PptxExtractor())
self.register("ppt", PptxExtractor())
# fallback
self.setFallback(BinaryExtractor())
print(f"✅ ExtractorRegistry: Successfully registered {len(self._map)} extractors")
@ -91,12 +96,14 @@ class ChunkerRegistry:
from .chunking.text_chunker import TextChunker
from .chunking.table_chunker import TableChunker
from .chunking.structure_chunker import StructureChunker
# Skip ImageChunker for now to avoid PIL import hang
# from .chunking.image_chunker import ImageChunker
from .chunking.image_chunker import ImageChunker
self.register("text", TextChunker())
self.register("table", TableChunker())
self.register("structure", StructureChunker())
# self.register("image", ImageChunker())
self.register("image", ImageChunker())
# Use text chunker for container and binary content
self.register("container", TextChunker())
self.register("binary", TextChunker())
except Exception as e:
print(f"❌ ChunkerRegistry: Failed to register chunkers: {str(e)}")
import traceback

View file

@ -296,7 +296,7 @@ class GenerationService:
'workflowId': 'unknown'
}
async def renderReport(self, extracted_content: str, output_format: str, title: str) -> tuple[str, str]:
async def renderReport(self, extracted_content: str, output_format: str, title: str, user_prompt: str = None) -> tuple[str, str]:
"""
Render extracted content to the specified output format.
@ -328,8 +328,8 @@ class GenerationService:
if not renderer:
raise ValueError(f"Unsupported output format: {output_format}")
# Render the content
rendered_content, mime_type = await renderer.render(extracted_content, title)
# Render the content with user prompt for structure
rendered_content, mime_type = await renderer.render(extracted_content, title, user_prompt)
# DEBUG: dump rendered output
try:
import os

View file

@ -6,6 +6,7 @@ from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
import re
from datetime import datetime, UTC
try:
@ -42,14 +43,17 @@ class DocxRenderer(BaseRenderer):
"""Return only DOCX-specific guidelines; global prompt is built centrally."""
return (
"DOCX FORMAT GUIDELINES:\n"
"- Provide plain text content suitable for Word generation (no markdown/HTML).\n"
"- Use clear section hierarchy; bullet and numbered lists where needed.\n"
"- Include tables as simple pipe-delimited lines if tabular data is needed.\n"
"- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n"
"- Use bullet points (-) for lists and sub-items\n"
"- Use **bold** for emphasis on key terms\n"
"- Provide clean, structured content that can be directly converted to Word formatting\n"
"- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n"
"- Start directly with your content - no introductory text or separators\n"
"OUTPUT: Return ONLY the structured plain text to be converted into DOCX."
)
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
"""Render extracted content to DOCX format."""
async def render(self, extracted_content: str, title: str, user_prompt: str = None) -> Tuple[str, str]:
"""Render extracted content to DOCX format using user prompt as blueprint."""
try:
if not DOCX_AVAILABLE:
# Fallback to HTML if python-docx not available
@ -58,8 +62,8 @@ class DocxRenderer(BaseRenderer):
html_content, _ = await html_renderer.render(extracted_content, title)
return html_content, "text/html"
# Generate DOCX using python-docx
docx_content = self._generate_docx(extracted_content, title)
# Generate DOCX using prompt-based structure
docx_content = self._generate_docx_from_prompt(extracted_content, title, user_prompt)
return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
@ -68,8 +72,8 @@ class DocxRenderer(BaseRenderer):
# Return minimal fallback
return f"DOCX Generation Error: {str(e)}", "text/plain"
def _generate_docx(self, content: str, title: str) -> str:
"""Generate DOCX content using python-docx."""
def _generate_docx_from_prompt(self, content: str, title: str, user_prompt: str = None) -> str:
"""Generate DOCX content by parsing the AI-generated structured content."""
try:
# Create new document
doc = Document()
@ -77,63 +81,11 @@ class DocxRenderer(BaseRenderer):
# Set up document styles
self._setup_document_styles(doc)
# Add title
title_para = doc.add_heading(title, 0)
title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
# Clean the content - remove debug information
clean_content = self._clean_ai_content(content)
# Add generation date
date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
# Add page break
doc.add_page_break()
# Process content
lines = content.split('\n')
current_section = []
for line in lines:
line = line.strip()
if not line:
continue
# Check for ALL CAPS headings (major headings)
if line.isupper() and len(line) > 3 and not line.startswith('-') and not line.startswith('*'):
if current_section:
self._process_section(doc, current_section)
current_section = []
doc.add_heading(line, level=1)
# Check for Title Case headings (subheadings)
elif line.istitle() and len(line) > 5 and not line.startswith('-') and not line.startswith('*') and not line.startswith(('1.', '2.', '3.', '4.', '5.')):
if current_section:
self._process_section(doc, current_section)
current_section = []
doc.add_heading(line, level=2)
# Check for markdown headings (fallback)
elif line.startswith('# '):
# H1 heading
if current_section:
self._process_section(doc, current_section)
current_section = []
doc.add_heading(line[2:], level=1)
elif line.startswith('## '):
# H2 heading
if current_section:
self._process_section(doc, current_section)
current_section = []
doc.add_heading(line[3:], level=2)
elif line.startswith('### '):
# H3 heading
if current_section:
self._process_section(doc, current_section)
current_section = []
doc.add_heading(line[4:], level=3)
else:
current_section.append(line)
# Process remaining content
if current_section:
self._process_section(doc, current_section)
# Parse and convert the structured content to DOCX
self._parse_and_format_content(doc, clean_content, title)
# Save to buffer
buffer = io.BytesIO()
@ -147,8 +99,156 @@ class DocxRenderer(BaseRenderer):
return docx_base64
except Exception as e:
self.logger.error(f"Error generating DOCX: {str(e)}")
raise
self.logger.error(f"Error generating DOCX from prompt: {str(e)}")
raise Exception(f"DOCX generation failed: {str(e)}")
def _extract_structure_from_prompt(self, user_prompt: str, title: str) -> Dict[str, Any]:
"""Extract document structure from user prompt."""
structure = {
'title': title,
'sections': [],
'format': 'standard'
}
if not user_prompt:
return structure
# Extract title from prompt if not provided
if not title or title == "Generated Document":
# Look for "create a ... document" or "generate a ... report"
import re
title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', user_prompt.lower())
if title_match:
structure['title'] = title_match.group(1).strip().title()
# Extract sections from numbered lists in prompt
import re
section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)'
sections = re.findall(section_pattern, user_prompt)
for num, section_text in sections:
structure['sections'].append({
'number': int(num),
'title': section_text.strip(),
'level': 2 # H2 level
})
# If no numbered sections found, try to extract from "including:" patterns
if not structure['sections']:
including_match = re.search(r'including:\s*(.+?)(?:\.|$)', user_prompt, re.DOTALL)
if including_match:
including_text = including_match.group(1)
# Split by common separators
parts = re.split(r'[,;]\s*', including_text)
for i, part in enumerate(parts, 1):
part = part.strip()
if part:
structure['sections'].append({
'number': i,
'title': part,
'level': 2
})
# If still no sections, extract from any list-like patterns
if not structure['sections']:
# Look for bullet points or dashes
bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)'
bullets = re.findall(bullet_pattern, user_prompt)
for i, bullet in enumerate(bullets, 1):
bullet = bullet.strip()
if bullet and len(bullet) > 3:
structure['sections'].append({
'number': i,
'title': bullet,
'level': 2
})
# If still no sections, extract from sentence structure
if not structure['sections']:
# Split prompt into sentences and use as sections
sentences = re.split(r'[.!?]\s+', user_prompt)
for i, sentence in enumerate(sentences[:5], 1): # Max 5 sections
sentence = sentence.strip()
if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')):
structure['sections'].append({
'number': i,
'title': sentence[:50] + "..." if len(sentence) > 50 else sentence,
'level': 2
})
# Final fallback: create sections from prompt keywords
if not structure['sections']:
# Extract key action words from prompt
action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate']
found_actions = []
for action in action_words:
if action in user_prompt.lower():
found_actions.append(action.title())
if found_actions:
for i, action in enumerate(found_actions[:3], 1):
structure['sections'].append({
'number': i,
'title': f"{action} Document Content",
'level': 2
})
else:
# Last resort: generic but meaningful sections
structure['sections'] = [
{'number': 1, 'title': 'Document Analysis', 'level': 2},
{'number': 2, 'title': 'Key Information', 'level': 2},
{'number': 3, 'title': 'Summary and Conclusions', 'level': 2}
]
return structure
def _generate_content_from_structure(self, doc, content: str, structure: Dict[str, Any]):
    """Write the outline in *structure* into *doc*, filling each section from *content*.

    Each section gets a numbered heading, its best-matching paragraphs (or a
    placeholder note when nothing matches), and a spacer paragraph; the full
    AI text is appended at the end under a "Complete Analysis" heading.
    """
    for entry in structure['sections']:
        # Numbered heading, e.g. "1) Overview".
        doc.add_heading(f"{entry['number']}) {entry['title']}", level=entry['level'])

        body = self._extract_section_content(content, entry['title'])
        if not body:
            # Nothing in the AI output matched this section; leave a note.
            body = f"Content for {entry['title']} based on document analysis."
        doc.add_paragraph(body)

        doc.add_paragraph()  # spacer between sections

    # Append the complete AI-generated content for reference.
    if content and content.strip():
        doc.add_heading("Complete Analysis", level=1)
        doc.add_paragraph(content)
def _extract_section_content(self, content: str, section_title: str) -> str:
"""Extract relevant content for a specific section from AI response."""
if not content or not section_title:
return ""
# Look for content that matches the section title
section_keywords = section_title.lower().split()
# Split content into paragraphs
paragraphs = content.split('\n\n')
relevant_paragraphs = []
for paragraph in paragraphs:
paragraph_lower = paragraph.lower()
# Check if paragraph contains keywords from section title
if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3):
relevant_paragraphs.append(paragraph.strip())
if relevant_paragraphs:
return '\n\n'.join(relevant_paragraphs[:2]) # Max 2 paragraphs per section
return ""
def _setup_document_styles(self, doc):
"""Set up document styles."""
@ -246,4 +346,146 @@ class DocxRenderer(BaseRenderer):
for run in paragraph.runs:
run.bold = True
except Exception as e:
self.logger.warning(f"Could not style table: {str(e)}")
self.logger.warning(f"Could not style table: {str(e)}")
def _clean_ai_content(self, content: str) -> str:
"""Clean AI-generated content by removing debug information and duplicates."""
if not content:
return ""
# Remove debug information
lines = content.split('\n')
clean_lines = []
for line in lines:
# Skip debug lines and separators
if (line.startswith('[Skipped ') or
line.startswith('=== DOCUMENT:') or
line.startswith('---') or
line.startswith('FILENAME:') or
line.strip() == '' or
line.strip() == '---'):
continue
clean_lines.append(line)
# Join lines and remove duplicate content
clean_content = '\n'.join(clean_lines)
# Remove duplicate sections by keeping only the first occurrence
sections = clean_content.split('\n\n')
seen_sections = set()
unique_sections = []
for section in sections:
section_key = section.strip()[:50] # Use first 50 chars as key
if section_key not in seen_sections and section.strip():
seen_sections.add(section_key)
unique_sections.append(section)
return '\n\n'.join(unique_sections)
def _parse_and_format_content(self, doc, content: str, title: str):
    """Parse AI-generated structured content and format it as DOCX.

    Writes a centred title block (title, generation timestamp, page break),
    then walks *content* line by line:
      - "N) Heading" lines become level-1 headings,
      - "- item" lines become bullet points,
      - other consecutive lines are buffered and flushed as one paragraph
        whenever a blank line, heading, or bullet is encountered.

    Args:
        doc: python-docx Document to append to (mutated in place).
        content: AI-produced plain text; no-op when empty.
        title: Title text for the document heading.
    """
    if not content:
        return
    # Add title (heading level 0 = document-title style), centred.
    title_para = doc.add_heading(title, 0)
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # Add generation date
    date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
    date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # Add page break
    doc.add_page_break()
    # Parse content line by line
    lines = content.split('\n')
    current_paragraph = []  # buffer of plain-text lines awaiting flush
    for line in lines:
        line = line.strip()
        if not line:
            # Empty line - end current paragraph
            if current_paragraph:
                self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
                current_paragraph = []
            continue
        # Check if this is a numbered heading (1) Title, 2) Title, etc.)
        if re.match(r'^\d+\)\s+.+', line):
            # Flush current paragraph
            if current_paragraph:
                self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
                current_paragraph = []
            # Add as heading, with the "N) " prefix stripped
            heading_text = re.sub(r'^\d+\)\s+', '', line)
            doc.add_heading(heading_text, level=1)
        # Check if this is a bullet point (- item)
        elif line.startswith('- '):
            # Flush current paragraph
            if current_paragraph:
                self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
                current_paragraph = []
            # Add as bullet point
            bullet_text = line[2:]  # Remove "- "
            self._add_bullet_point(doc, bullet_text)
        else:
            # Regular text - add to current paragraph
            current_paragraph.append(line)
    # Flush any remaining paragraph
    if current_paragraph:
        self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
def _add_paragraph_to_doc(self, doc, text: str):
    """Append *text* to *doc* as a paragraph, honouring **bold** inline markers."""
    if not text.strip():
        return

    if '**' not in text:
        # Fast path: no inline formatting markers.
        doc.add_paragraph(text)
        return

    # Split on '**': even-indexed segments are plain, odd-indexed are bold.
    paragraph = doc.add_paragraph()
    for index, segment in enumerate(text.split('**')):
        if not segment:
            continue
        run = paragraph.add_run(segment)
        if index % 2 == 1:
            run.bold = True
def _add_bullet_point(self, doc, text: str):
    """Append *text* to *doc* as a 'List Bullet' item, honouring **bold** inline markers.

    Fixes a redundancy in the previous implementation, which always inserted
    the raw marker-laden text first and then, when '**' markers were present,
    cleared the paragraph and rebuilt it run by run. The runs are now built
    exactly once.

    Args:
        doc: python-docx Document to append to (mutated in place).
        text: Bullet text; no-op when blank.
    """
    if not text.strip():
        return

    if '**' not in text:
        # No inline formatting: a single styled paragraph is enough.
        doc.add_paragraph(text, style='List Bullet')
        return

    para = doc.add_paragraph(style='List Bullet')
    # Split on '**': even-indexed parts are plain, odd-indexed are bold.
    for i, part in enumerate(text.split('**')):
        if not part:
            continue
        run = para.add_run(part)
        if i % 2 == 1:
            run.bold = True

View file

@ -0,0 +1,252 @@
import logging
import base64
import io
from typing import Dict, Any, Optional, Tuple
from .base_renderer import BaseRenderer
logger = logging.getLogger(__name__)
class PptxRenderer(BaseRenderer):
    """Renderer for PowerPoint (.pptx) files using the python-pptx library."""

    def __init__(self):
        super().__init__()
        # Formats this renderer can be asked to produce.
        self.supported_formats = ["pptx", "ppt"]
        self.output_mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"

    @classmethod
    def get_supported_formats(cls) -> list:
        """Get list of supported output formats."""
        return ["pptx", "ppt"]

    async def render(self, content: str, title: str = "Generated Presentation", **kwargs) -> Tuple[str, str]:
        """
        Render content as a PowerPoint presentation.

        Args:
            content: Content to render as presentation
            title: Title for the presentation (used for the fallback title slide)
            **kwargs: Additional rendering options (currently unused)

        Returns:
            Tuple of (base64-encoded PPTX data, MIME type). On failure a
            (plain-text error message, "text/plain") tuple is returned instead.
        """
        try:
            # python-pptx is imported lazily so the module still loads when the
            # dependency is missing; the ImportError handler below reports it.
            from pptx import Presentation
            from pptx.util import Inches, Pt
            from pptx.enum.text import PP_ALIGN
            from pptx.dml.color import RGBColor  # noqa: F401 - kept for future styling

            # Create new presentation with a 16:9 slide size.
            prs = Presentation()
            prs.slide_width = Inches(13.33)
            prs.slide_height = Inches(7.5)

            # Parse content into slides
            slides_data = self._parse_content_to_slides(content, title)
            logger.info(f"Parsed {len(slides_data)} slides from content")
            # Debug: Show first 200 chars of content
            logger.info(f"Content preview: '{content[:200]}...'")

            for i, slide_data in enumerate(slides_data):
                logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - {len(slide_data.get('content', ''))} chars")
                # Debug: Show slide content preview
                slide_content = slide_data.get('content', '')
                if slide_content:
                    logger.info(f" Content preview: '{slide_content[:100]}...'")
                else:
                    logger.warning(f" ⚠️ Slide {i+1} has NO content!")

                # Layout 1 = "Title and Content".
                slide = prs.slides.add_slide(prs.slide_layouts[1])
                slide.shapes.title.text = slide_data.get("title", "Slide")

                # Fill the body placeholder.
                text_frame = slide.placeholders[1].text_frame
                text_frame.clear()

                # Split content into paragraphs; `j` deliberately does not
                # shadow the outer slide index `i` (the previous version did).
                paragraphs = slide_content.split('\n\n')
                for j, paragraph in enumerate(paragraphs):
                    if not paragraph.strip():
                        continue
                    # text_frame.clear() leaves one empty paragraph; reuse it first.
                    p = text_frame.paragraphs[0] if j == 0 else text_frame.add_paragraph()
                    p.text = paragraph.strip()

                    # BUGFIX: check '##' before '#'. startswith('#') also matches
                    # '##', which previously made the subheader branch unreachable.
                    if paragraph.startswith('##'):
                        # Subheader
                        p.text = paragraph.lstrip('#').strip()
                        p.font.size = Pt(20)
                        p.font.bold = True
                    elif paragraph.startswith('#'):
                        # Header
                        p.text = paragraph.lstrip('#').strip()
                        p.font.size = Pt(24)
                        p.font.bold = True
                    elif paragraph.startswith('*') and paragraph.endswith('*'):
                        # Bold text
                        p.text = paragraph.strip('*')
                        p.font.bold = True
                    else:
                        # Regular text
                        p.font.size = Pt(14)
                    p.alignment = PP_ALIGN.LEFT

            # No parseable slides: emit a single title slide so the file is valid.
            if not slides_data:
                slide = prs.slides.add_slide(prs.slide_layouts[0])  # Title slide layout
                slide.shapes.title.text = title
                slide.placeholders[1].text = "Generated by PowerOn AI System"

            # Save to buffer and base64-encode for transport.
            buffer = io.BytesIO()
            prs.save(buffer)
            buffer.seek(0)
            pptx_bytes = buffer.getvalue()
            pptx_base64 = base64.b64encode(pptx_bytes).decode('utf-8')
            logger.info(f"Successfully rendered PowerPoint presentation: {len(pptx_bytes)} bytes")
            return pptx_base64, "application/vnd.openxmlformats-officedocument.presentationml.presentation"

        except ImportError:
            logger.error("python-pptx library not installed. Install with: pip install python-pptx")
            return "python-pptx library not installed", "text/plain"
        except Exception as e:
            logger.error(f"Error rendering PowerPoint presentation: {str(e)}")
            return f"Error rendering PowerPoint presentation: {str(e)}", "text/plain"

    def _parse_content_to_slides(self, content: str, title: str) -> list:
        """
        Parse content into slide data structure.

        Args:
            content: Content to parse
            title: Presentation title (currently unused; slides derive titles
                from their own content)

        Returns:
            List of slide data dictionaries with 'title' and 'content' keys
        """
        slides = []
        # Split content by slide markers or headers
        slide_sections = self._split_content_into_slides(content)
        for i, section in enumerate(slide_sections):
            if section.strip():
                slide_data = {
                    "title": f"Slide {i + 1}",
                    "content": section.strip()
                }
                # Extract title from content if it starts with #
                lines = section.strip().split('\n')
                if lines and lines[0].startswith('#'):
                    # Remove # symbols and clean up title
                    slide_title = lines[0].lstrip('#').strip()
                    slide_data["title"] = slide_title
                    slide_data["content"] = '\n'.join(lines[1:]).strip()
                elif lines and lines[0].strip():
                    # Use first line as title if it looks like a title
                    first_line = lines[0].strip()
                    if len(first_line) < 100 and not first_line.endswith('.'):
                        slide_data["title"] = first_line
                        slide_data["content"] = '\n'.join(lines[1:]).strip()
                slides.append(slide_data)
        return slides

    def _split_content_into_slides(self, content: str) -> list:
        """
        Split content into individual slides based on headers and structure.

        Args:
            content: Content to split

        Returns:
            List of slide content strings
        """
        import re

        # Primary strategy: split on markdown headers (# or ##) -- the most
        # common structure in AI-generated content.
        header_pattern = r'^(#{1,2})\s+(.+)$'
        lines = content.split('\n')
        slides = []
        current_slide = []
        for line in lines:
            header_match = re.match(header_pattern, line.strip())
            if header_match:
                # Close out the slide collected so far.
                if current_slide:
                    slide_content = '\n'.join(current_slide).strip()
                    if slide_content:
                        slides.append(slide_content)
                    current_slide = []
                # Start new slide with this header
                current_slide.append(line)
            else:
                # Add line to current slide
                current_slide.append(line)
        # Add the last slide
        if current_slide:
            slide_content = '\n'.join(current_slide).strip()
            if slide_content:
                slides.append(slide_content)
        # If we found slides with headers, return them
        if len(slides) > 1:
            return slides
        # Fallback: split on triple newlines.
        sections = content.split('\n\n\n')
        if len(sections) > 1:
            return [s.strip() for s in sections if s.strip()]
        # Another fallback: split on double newlines.
        sections = content.split('\n\n')
        if len(sections) > 1:
            return [s.strip() for s in sections if s.strip()]
        # Last resort: return the whole content as a single slide.
        return [content.strip()]

    def get_output_mime_type(self) -> str:
        """Get MIME type for rendered output."""
        return self.output_mime_type

    def getExtractionPrompt(self) -> str:
        """Get extraction prompt for this renderer."""
        return "Extract content for PowerPoint presentation generation"

446
test_document_processing.py Normal file
View file

@ -0,0 +1,446 @@
"""
Test script for document processing and DOCX generation.
Calls the main AI service directly to process PDF documents and generate DOCX summaries.
"""
import asyncio
import sys
import os
import logging
import base64
from datetime import datetime
from pathlib import Path
# Add the gateway module to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import EnhancedAiCallOptions
from modules.services.serviceAi.mainServiceAi import AiService
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
async def process_documents_and_generate_summary() -> bool:
    """Process test documents through the main AI service with intelligent chunk integration.

    End-to-end test driver: locates a testdata folder, loads every supported
    document, patches the DB interface with an in-memory mock, runs one
    `callAi` DOCX-generation request, and writes all outputs plus a report
    under test-chat/unittestoutput/.

    Returns:
        True when the whole pipeline (including saving results) succeeded,
        False otherwise.
    """
    logger.info("🚀 Starting intelligent chunk integration test...")

    # Find testdata directory
    # NOTE(review): the third fallback repeats the first path verbatim, so it
    # can never succeed where the first attempt failed -- confirm intent.
    testdata_path = Path("../wiki/poweron/testdata")
    if not testdata_path.exists():
        # Try relative to current directory
        testdata_path = Path("wiki/poweron/testdata")
        if not testdata_path.exists():
            # Try relative to parent directory
            testdata_path = Path("../wiki/poweron/testdata")
            if not testdata_path.exists():
                logger.error(f"❌ Testdata path not found. Tried:")
                logger.error(f" - ../wiki/poweron/testdata")
                logger.error(f" - wiki/poweron/testdata")
                logger.error(f" - ../wiki/poweron/testdata")
                logger.info("Please ensure the testdata folder exists with PDF documents")
                return False

    # Find all supported document files
    supported_extensions = ["*.pdf", "*.jpg", "*.jpeg", "*.png", "*.gif", "*.docx", "*.xlsx", "*.pptx", "*.ppt", "*.txt", "*.md", "*.html", "*.csv"]
    document_files = []
    for ext in supported_extensions:
        document_files.extend(list(testdata_path.glob(ext)))

    logger.info(f"Found {len(document_files)} document files in testdata:")
    for doc_file in document_files:
        logger.info(f" - {doc_file.name}")

    if not document_files:
        logger.error("❌ No supported document files found in testdata folder")
        return False

    try:
        # Mock the database interface to provide our file data BEFORE creating AI service
        class TestDbInterface:
            # In-memory stand-in for the DB layer: serves file bytes by id.
            def __init__(self, file_data_map):
                self.file_data_map = file_data_map

            def getFileData(self, file_id):
                # Returns raw bytes for file_id, or None when unknown.
                logger.info(f"TestDbInterface.getFileData called with file_id: {file_id}")
                data = self.file_data_map.get(file_id)
                if data:
                    logger.info(f"✅ Found file data for {file_id}: {len(data)} bytes")
                else:
                    logger.warning(f"❌ No file data found for {file_id}")
                return data

        # Create file data mapping (file ids "test_doc_1", "test_doc_2", ...)
        file_data_map = {}
        for i, doc_file in enumerate(document_files):
            with open(doc_file, 'rb') as f:
                file_data_map[f"test_doc_{i+1}"] = f.read()
            logger.info(f"📁 Loaded {doc_file.name} as test_doc_{i+1}: {len(file_data_map[f'test_doc_{i+1}'])} bytes")

        # Mock the database interface BEFORE creating AI service
        # NOTE(review): restoring getInterface happens at the end of the try
        # and again in the except handler; a finally block would be safer.
        import modules.interfaces.interfaceDbComponentObjects as db_interface_module
        original_get_interface = db_interface_module.getInterface
        db_interface_module.getInterface = lambda: TestDbInterface(file_data_map)
        logger.info("🔧 Database interface mocked successfully")

        # Initialize the main AI service - let it handle everything
        logger.info("🔧 Initializing main AI service...")
        ai_service = await AiService.create()

        # Create test documents - the AI service will handle file access internally
        documents = []
        logger.info(f"📁 Found {len(document_files)} document files")

        for i, doc_file in enumerate(document_files):
            logger.info(f"📄 Processing file {i+1}/{len(document_files)}: {doc_file.name}")

            # Determine MIME type based on file extension
            mime_type = "application/octet-stream"  # default
            if doc_file.suffix.lower() == '.pdf':
                mime_type = "application/pdf"
            elif doc_file.suffix.lower() in ['.jpg', '.jpeg']:
                mime_type = "image/jpeg"
            elif doc_file.suffix.lower() == '.png':
                mime_type = "image/png"
            elif doc_file.suffix.lower() == '.gif':
                mime_type = "image/gif"
            elif doc_file.suffix.lower() == '.docx':
                mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            elif doc_file.suffix.lower() == '.xlsx':
                mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            elif doc_file.suffix.lower() == '.pptx':
                mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
            elif doc_file.suffix.lower() == '.ppt':
                mime_type = "application/vnd.ms-powerpoint"
            elif doc_file.suffix.lower() == '.html':
                mime_type = "text/html"
            elif doc_file.suffix.lower() == '.csv':
                mime_type = "text/csv"
            elif doc_file.suffix.lower() in ['.txt', '.md']:
                mime_type = "text/plain"

            chat_doc = ChatDocument(
                fileId=f"test_doc_{i+1}",
                messageId=f"test_message_{i+1}",
                fileName=doc_file.name,
                mimeType=mime_type,
                fileSize=doc_file.stat().st_size,
                roundNumber=1,
                taskNumber=1,
                actionNumber=1,
                actionId=f"test_action_{i+1}"
            )
            documents.append(chat_doc)
            logger.info(f"✅ Created ChatDocument: {chat_doc.fileName} ({chat_doc.mimeType}) - {chat_doc.fileSize} bytes")

        logger.info(f"📄 Created {len(documents)} document objects")

        # Create enhanced AI call options for intelligent chunked processing
        ai_options = EnhancedAiCallOptions(
            operationType="general",
            enableParallelProcessing=True,
            maxConcurrentChunks=5,  # Increased for better testing
            preserveChunkMetadata=True,
            chunkSeparator="\n\n---\n\n"
        )

        # Call the main AI service directly - let it handle everything including DOCX generation
        logger.info("🤖 Calling main AI service with intelligent merging...")

        # Test different AI operations end-to-end
        # NOTE(review): test_prompts is currently unused -- only the single
        # end-to-end call below is executed; confirm whether the loop over
        # these prompts should be restored.
        test_prompts = [
            {
                "name": "Document Analysis",
                "prompt": "Analyze these documents and provide a comprehensive summary of their content, key points, and important information.",
                "outputFormat": None  # Text response
            },
            {
                "name": "DOCX Generation",
                "prompt": "Create a professional DOCX document summarizing the key information from these documents.",
                "outputFormat": "docx"
            },
            {
                "name": "PDF Generation",
                "prompt": "Generate a PDF report analyzing these documents with sections for each document type.",
                "outputFormat": "pdf"
            }
        ]

        # Run a single end-to-end test to avoid the loop issue
        logger.info("🧪 Running single end-to-end test...")
        try:
            # Single AI call with DOCX generation
            ai_response = await ai_service.callAi(
                prompt="Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations.",
                documents=documents,
                options=ai_options,
                outputFormat="docx",
                title="Document Analysis Summary"
            )
            logger.info(f"✅ End-to-end test completed successfully")
            logger.info(f"📊 Response type: {type(ai_response)}")
            logger.info(f"📊 Response length: {len(str(ai_response))} characters")
            # Single test result
            test_results = [{
                "test_name": "End-to-End DOCX Generation",
                "success": True,
                "response_type": type(ai_response).__name__,
                "response_length": len(str(ai_response)),
                "response": ai_response
            }]
        except Exception as e:
            logger.error(f"❌ End-to-end test failed: {str(e)}")
            test_results = [{
                "test_name": "End-to-End DOCX Generation",
                "success": False,
                "error": str(e),
                "response": None
            }]

        logger.info(f"🎯 Completed 1 end-to-end test")

        # Process all test results and save outputs
        logger.info("📊 Processing test results...")
        successful_tests = [r for r in test_results if r['success']]
        failed_tests = [r for r in test_results if not r['success']]
        logger.info(f"✅ Successful tests: {len(successful_tests)}")
        logger.info(f"❌ Failed tests: {len(failed_tests)}")

        # Display test results summary
        logger.info("=" * 80)
        logger.info("END-TO-END TEST RESULTS SUMMARY")
        logger.info("=" * 80)
        for i, result in enumerate(test_results, 1):
            status = "✅ PASS" if result['success'] else "❌ FAIL"
            logger.info(f"Test {i}: {result['test_name']} - {status}")
            if result['success']:
                logger.info(f" Response Type: {result['response_type']}")
                logger.info(f" Response Length: {result['response_length']} characters")
            else:
                logger.info(f" Error: {result['error']}")
        logger.info("=" * 80)

        # Create output directory if it doesn't exist
        output_dir = Path("test-chat/unittestoutput")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save all test results and generated files
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        logger.info("💾 Saving test results and generated files...")

        try:
            for i, result in enumerate(successful_tests, 1):
                test_name = result['test_name'].replace(' ', '_').lower()
                response = result['response']
                logger.info(f"💾 Saving Test {i}: {result['test_name']}")
                # Handle different response types
                if isinstance(response, dict):
                    # Document generation response
                    if 'documents' in response and response['documents']:
                        logger.info(f"📄 Found {len(response['documents'])} documents in response")
                        for j, doc in enumerate(response['documents']):
                            doc_name = doc.get('documentName', f'{test_name}_document_{j+1}')
                            doc_data = doc.get('documentData', '')
                            doc_mime = doc.get('mimeType', 'application/octet-stream')
                            logger.info(f"📄 Document {j+1}: {doc_name}")
                            logger.info(f"📄 MIME Type: {doc_mime}")
                            logger.info(f"📄 Data length: {len(doc_data)} characters")
                            # Determine file extension with better MIME type detection
                            file_ext = '.bin'  # Default fallback
                            if doc_mime:
                                if 'docx' in doc_mime.lower() or 'wordprocessingml' in doc_mime.lower():
                                    file_ext = '.docx'
                                elif 'pdf' in doc_mime.lower():
                                    file_ext = '.pdf'
                                elif 'txt' in doc_mime.lower() or 'plain' in doc_mime.lower():
                                    file_ext = '.txt'
                                elif 'html' in doc_mime.lower():
                                    file_ext = '.html'
                                elif 'json' in doc_mime.lower():
                                    file_ext = '.json'
                                elif 'csv' in doc_mime.lower():
                                    file_ext = '.csv'
                                elif 'xlsx' in doc_mime.lower() or 'spreadsheetml' in doc_mime.lower():
                                    file_ext = '.xlsx'
                                elif 'pptx' in doc_mime.lower() or 'presentationml' in doc_mime.lower():
                                    file_ext = '.pptx'
                                else:
                                    logger.warning(f"⚠️ Unknown MIME type: {doc_mime}, using .bin")
                            # Also check filename for hints (filename wins over MIME)
                            if doc_name and '.' in doc_name:
                                name_ext = '.' + doc_name.split('.')[-1].lower()
                                if name_ext in ['.docx', '.pdf', '.txt', '.html', '.json', '.csv', '.xlsx', '.pptx']:
                                    file_ext = name_ext
                                    logger.info(f"📄 Using extension from filename: {file_ext}")
                            logger.info(f"📄 Final file extension: {file_ext}")
                            # Save document (documentData is base64-encoded)
                            output_path = output_dir / f"{test_name}_{timestamp}{file_ext}"
                            doc_bytes = base64.b64decode(doc_data)
                            with open(output_path, 'wb') as f:
                                f.write(doc_bytes)
                            logger.info(f"✅ Document saved: {output_path} ({len(doc_bytes)} bytes)")
                    # Also save raw content as text
                    content = response.get('content', '')
                    if content:
                        text_path = output_dir / f"{test_name}_content_{timestamp}.txt"
                        with open(text_path, 'w', encoding='utf-8') as f:
                            f.write(content)
                        logger.info(f"✅ Content saved: {text_path}")
                elif isinstance(response, str):
                    # Text response
                    text_path = output_dir / f"{test_name}_response_{timestamp}.txt"
                    with open(text_path, 'w', encoding='utf-8') as f:
                        f.write(response)
                    logger.info(f"✅ Text response saved: {text_path}")
                else:
                    logger.warning(f"⚠️ Unknown response type for {result['test_name']}: {type(response)}")
            # Save failed test details
            if failed_tests:
                error_path = output_dir / f"failed_tests_{timestamp}.txt"
                with open(error_path, 'w', encoding='utf-8') as f:
                    f.write("# Failed Test Details\n\n")
                    for i, result in enumerate(failed_tests, 1):
                        f.write(f"## Test {i}: {result['test_name']}\n")
                        f.write(f"**Error:** {result['error']}\n\n")
                logger.info(f"✅ Failed test details saved: {error_path}")
        except Exception as e:
            logger.error(f"❌ Error saving test results: {str(e)}")
            return False

        # Save comprehensive test report
        report_path = output_dir / f"end_to_end_test_report_{timestamp}.txt"
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(f"# End-to-End AI Service Test Report\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"## Test Configuration\n")
            f.write(f"- Documents processed: {len(documents)}\n")
            f.write(f"- Processing method: Intelligent Token-Aware Merging\n")
            f.write(f"- Parallel processing: {ai_options.enableParallelProcessing}\n")
            f.write(f"- Max concurrent chunks: {ai_options.maxConcurrentChunks}\n")
            f.write(f"- Chunk metadata preserved: {ai_options.preserveChunkMetadata}\n")
            f.write(f"- Chunk separator: '{ai_options.chunkSeparator}'\n\n")
            f.write(f"## Document Inventory\n")
            for i, doc in enumerate(documents, 1):
                f.write(f"{i}. **{doc.fileName}**\n")
                f.write(f" - MIME Type: {doc.mimeType}\n")
                f.write(f" - File Size: {doc.fileSize:,} bytes\n")
                f.write(f" - File ID: {doc.fileId}\n\n")
            f.write(f"## Test Results Summary\n")
            f.write(f"- Total Tests: {len(test_results)}\n")
            f.write(f"- Successful: {len(successful_tests)}\n")
            f.write(f"- Failed: {len(failed_tests)}\n")
            f.write(f"- Success Rate: {len(successful_tests)/len(test_results)*100:.1f}%\n\n")
            f.write(f"## Detailed Test Results\n")
            for i, result in enumerate(test_results, 1):
                f.write(f"### Test {i}: {result['test_name']}\n")
                f.write(f"**Status:** {'✅ PASS' if result['success'] else '❌ FAIL'}\n")
                if result['success']:
                    f.write(f"**Response Type:** {result['response_type']}\n")
                    f.write(f"**Response Length:** {result['response_length']} characters\n")
                    # Show response preview
                    response_preview = str(result['response'])[:500]
                    f.write(f"**Response Preview:**\n```\n{response_preview}...\n```\n\n")
                else:
                    f.write(f"**Error:** {result['error']}\n\n")
            f.write(f"## Technical Implementation Details\n")
            f.write(f"This test validates the complete AI service pipeline:\n\n")
            f.write(f"### Tested Components:\n")
            f.write(f"- **Document Extraction**: PDF, DOCX, images, etc.\n")
            f.write(f"- **Intelligent Chunking**: Token-aware merging\n")
            f.write(f"- **Model Selection**: Automatic AI model choice\n")
            f.write(f"- **Parallel Processing**: Concurrent chunk processing\n")
            f.write(f"- **Document Generation**: DOCX, PDF, text output\n")
            f.write(f"- **Error Handling**: Graceful failure management\n\n")
            f.write(f"### Performance Metrics:\n")
            f.write(f"- **Chunk Optimization**: Intelligent merging reduces AI calls\n")
            f.write(f"- **Processing Speed**: Parallel execution\n")
            f.write(f"- **Memory Efficiency**: Token-aware chunking\n")
            f.write(f"- **Output Quality**: Multiple format support\n\n")
            f.write(f"## Generated Files\n")
            for i, result in enumerate(successful_tests, 1):
                test_name = result['test_name'].replace(' ', '_').lower()
                f.write(f"- **Test {i}**: {result['test_name']} → `{test_name}_*_{timestamp}.*`\n")
            if failed_tests:
                f.write(f"- **Failed Tests**: `failed_tests_{timestamp}.txt`\n")
            f.write(f"- **This Report**: `end_to_end_test_report_{timestamp}.txt`\n\n")
            f.write(f"The end-to-end test successfully validates the complete AI service\n")
            f.write(f"pipeline from document input to formatted output generation.\n")

        logger.info(f"✅ Comprehensive test report saved: {report_path}")

        # Restore original database interface
        db_interface_module.getInterface = original_get_interface
        return True

    except Exception as e:
        logger.error(f"❌ Error during document processing: {str(e)}")
        import traceback
        logger.error(f"Traceback: {traceback.format_exc()}")
        # Restore original database interface in case of error
        try:
            db_interface_module.getInterface = original_get_interface
        except:
            # Patch may not have happened yet (failure before mocking).
            pass
        return False
async def main():
    """Run the intelligent chunk integration test and report the outcome.

    Entry point for the script: delegates all processing to
    process_documents_and_generate_summary() and logs a summary of what was
    validated. Raises SystemExit(1) on failure so CI pipelines and shell
    callers can detect a failed run (previously the process always exited 0,
    even when the test failed).
    """
    logger.info("🎯 Starting Intelligent Chunk Integration Test")
    logger.info("=" * 60)
    success = await process_documents_and_generate_summary()
    if success:
        logger.info("🎉 Intelligent chunk integration test completed successfully!")
        logger.info("✅ Main AI service handled all processing internally")
        logger.info("✅ Intelligent token-aware merging activated")
        logger.info("✅ DOCX document generated directly by AI service")
        logger.info("✅ Detailed chunk integration analysis saved")
        logger.info("✅ Performance optimization achieved")
    else:
        logger.error("❌ Test failed!")
        logger.error("Please check the error messages above for details")
    logger.info("=" * 60)
    if not success:
        # Propagate the failure to the process exit code so CI can see it.
        raise SystemExit(1)


if __name__ == "__main__":
    asyncio.run(main())

83
test_image_processing.py Normal file
View file

@ -0,0 +1,83 @@
"""
Simple test to verify image processing works correctly.
"""
import asyncio
import sys
import os
import base64
import logging
# Add the gateway module to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
from modules.datamodels.datamodelAi import AiCallOptions, OperationType
from modules.services.serviceAi.mainServiceAi import AiService
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
async def test_image_processing():
    """Exercise AiService.readImage with a minimal base64-encoded PNG.

    Initializes the AI service, sends a 1x1-pixel PNG through readImage,
    and reports progress to both stdout and the module logger. Returns
    True when the call completes, False on any exception.
    """
    def announce(message: str) -> None:
        # Every status line goes to both stdout and the logger; funnel the
        # duplication through one helper so the two channels stay in sync.
        print(message)
        logger.info(message)

    announce("🧪 Testing image processing...")
    try:
        announce("🔧 Initializing AI service...")
        ai_service = await AiService.create()
        announce("✅ AI service initialized successfully")
        # 1x1 pixel PNG in base64 — the smallest valid image payload.
        test_image_base64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
        announce(f"📸 Test image base64 length: {len(test_image_base64)}")
        announce("📸 Testing readImage method...")
        result = await ai_service.readImage(
            prompt="What do you see in this image?",
            imageData=test_image_base64,
            mimeType="image/png",
        )
        announce(f"✅ Image processing result: {result}")
        return True
    except Exception as e:
        print(f"❌ Image processing test failed: {str(e)}")
        logger.error(f"❌ Image processing test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        logger.error(f"Traceback: {traceback.format_exc()}")
        return False
async def main():
    """Run the image processing test and report the outcome.

    Entry point for the script: runs test_image_processing() and mirrors
    the result to stdout and the module logger. Raises SystemExit(1) on
    failure so CI pipelines and shell callers can detect a failed run
    (previously the process always exited 0, even when the test failed).
    """
    print("🎯 Starting Image Processing Test")
    print("=" * 60)
    logger.info("🎯 Starting Image Processing Test")
    logger.info("=" * 60)
    success = await test_image_processing()
    if success:
        print("🎉 Image processing test completed successfully!")
        logger.info("🎉 Image processing test completed successfully!")
    else:
        print("❌ Image processing test failed!")
        logger.error("❌ Image processing test failed!")
    print("=" * 60)
    logger.info("=" * 60)
    if not success:
        # Propagate the failure to the process exit code so CI can see it.
        raise SystemExit(1)


if __name__ == "__main__":
    asyncio.run(main())