diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py
index 38c5cb1e..d9c21ff1 100644
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@@ -596,7 +596,32 @@ class AiObjects:
         """Extract links from HTML/Markdown content."""
         try:
             import re
-            from urllib.parse import urljoin, urlparse
+            from urllib.parse import urljoin, urlparse, quote, urlunparse
+
+            def _cleanUrl(url: str) -> str:
+                """Clean and re-encode a URL, escaping spaces and invalid characters."""
+                # Remove quotes and extra spaces
+                url = url.strip().strip('"\'')
+
+                # If it's a relative URL, make it absolute first
+                if not url.startswith(('http://', 'https://')):
+                    url = urljoin(base_url, url)
+
+                # Parse and re-encode the URL; keep '%' in the safe set so
+                # already-escaped URLs are not double-encoded
+                parsed = urlparse(url)
+                encoded_path = quote(parsed.path, safe='/%')
+                encoded_query = quote(parsed.query, safe='=&%+') if parsed.query else parsed.query
+                url = urlunparse((
+                    parsed.scheme,
+                    parsed.netloc,
+                    encoded_path,
+                    parsed.params,
+                    encoded_query,
+                    parsed.fragment
+                ))
+
+                return url

             links = []

@@ -605,39 +630,45 @@
             # Extract HTML links from anchor tags
             html_link_pattern = r'<a[^>]+href=["\']([^"\']+)["\']'
             html_links = re.findall(html_link_pattern, content, re.IGNORECASE)

             for url in html_links:
-                if url and (url.startswith('http://') or url.startswith('https://')):
-                    links.append(url)
-                elif url and not url.startswith('#') and not url.startswith('javascript:'):
-                    # Convert relative URLs to absolute URLs
-                    absolute_url = urljoin(base_url, url)
-                    links.append(absolute_url)
+                if url and not url.startswith('#') and not url.startswith('javascript:'):
+                    try:
+                        cleaned_url = _cleanUrl(url)
+                        links.append(cleaned_url)
+                        logger.debug(f"Extracted HTML link: {url} -> {cleaned_url}")
+                    except Exception as e:
+                        logger.debug(f"Failed to clean HTML link {url}: {e}")

             # Extract markdown links: [text](url) format
             markdown_link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
             markdown_links = re.findall(markdown_link_pattern, content)

             for text, url in markdown_links:
-                if url and (url.startswith('http://') or url.startswith('https://')):
-                    absolute_url = urljoin(base_url, url)
-                    # Only keep URLs from the same domain
-                    if urlparse(absolute_url).netloc == urlparse(base_url).netloc:
-                        links.append(absolute_url)
-                elif url and not url.startswith('#'):
-                    absolute_url = urljoin(base_url, url)
-                    if urlparse(absolute_url).netloc == urlparse(base_url).netloc:
-                        links.append(absolute_url)
+                if url and not url.startswith('#'):
+                    try:
+                        cleaned_url = _cleanUrl(url)
+                        # Only keep URLs from the same domain
+                        if urlparse(cleaned_url).netloc == urlparse(base_url).netloc:
+                            links.append(cleaned_url)
+                            logger.debug(f"Extracted markdown link: {url} -> {cleaned_url}")
+                    except Exception as e:
+                        logger.debug(f"Failed to clean markdown link {url}: {e}")

             # Extract plain URLs in the text
             url_pattern = r'https?://[^\s\)]+'
             plain_urls = re.findall(url_pattern, content)

             for url in plain_urls:
-                clean_url = url.rstrip('.,;!?')
-                absolute_url = urljoin(base_url, clean_url)
-                if urlparse(absolute_url).netloc == urlparse(base_url).netloc:
-                    if absolute_url not in links:  # Avoid duplicates
-                        links.append(absolute_url)
+                try:
+                    clean_url = url.rstrip('.,;!?')
+                    cleaned_url = _cleanUrl(clean_url)
+                    if urlparse(cleaned_url).netloc == urlparse(base_url).netloc:
+                        if cleaned_url not in links:  # Avoid duplicates
+                            links.append(cleaned_url)
+                            logger.debug(f"Extracted plain URL: {url} -> {cleaned_url}")
+                except Exception as e:
+                    logger.debug(f"Failed to clean plain URL {url}: {e}")

+            logger.debug(f"Total links extracted and cleaned: {len(links)}")
             return links

         except
Exception as e: diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index e67c1958..00f67f8a 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -53,46 +53,8 @@ class AiService: logger.info("AiObjects.create() completed") return instance - # AI Text Generation - async def callAiText( - self, - prompt: str, - documents: Optional[List[ChatDocument]] = None, - processDocumentsIndividually: bool = False, - options: Optional[AiCallOptions] = None, - ) -> str: - """Call AI for text generation using interface.call().""" - try: - documentContent = "" - if documents: - documentContent = await self._processDocumentsForAi( - documents, - options.operationType if options else "general", - options.compressContext if options else True, - options.processDocumentsIndividually if options else processDocumentsIndividually, - prompt, - options - ) - - effectiveOptions = options or AiCallOptions() - # Compute maxContextBytes if not provided: conservative defaults per model tag could be added here - if options and options.maxContextBytes is None: - options.maxContextBytes = 16000 # bytes, conservative default if model limit unknown - - request = AiCallRequest( - prompt=prompt, - context=documentContent or None, - options=effectiveOptions, - ) - - response = await self.aiObjects.call(request) - return response.content - except Exception as e: - logger.error(f"Error in AI text generation: {str(e)}") - return f"Error: {str(e)}" - # AI Image Analysis - async def callAiImage( + async def readImage( self, prompt: str, imageData: Union[str, bytes], @@ -146,40 +108,40 @@ class AiService: # Use AI to generate optimized Tavily search query and selection strategy query_optimizer_prompt = f"""You are a search query optimizer and web page selector. -Given a user query, perform two tasks: + Given a user query, perform two tasks: -TASK 1 - GENERATE TAVILY SEARCH QUERY: -Analyze the user's intent and extract the core search terms. -- For entity-specific queries (companies, people, products): Use entity name + key identifiers -- For informational queries (how to, what is): Use core concept keywords -- For transactional queries (where to buy, find services): Use action + category -- Keep it 2-6 words maximum, keyword format only + TASK 1 - GENERATE TAVILY SEARCH QUERY: + Analyze the user's intent and extract the core search terms. 
+ - For entity-specific queries (companies, people, products): Use entity name + key identifiers + - For informational queries (how to, what is): Use core concept keywords + - For transactional queries (where to buy, find services): Use action + category + - Keep it 2-6 words maximum, keyword format only -TASK 2 - DEFINE URL SELECTION STRATEGY: -Determine what type of results the user needs: + TASK 2 - DEFINE URL SELECTION STRATEGY: + Determine what type of results the user needs: -A) SINGLE AUTHORITATIVE SOURCE - Use when: Looking for specific entity information (company profile, person bio, specific product) - Select: Official website, primary domain, or most authoritative single page - -B) MULTIPLE DIVERSE SOURCES - Use when: Comparing options, finding services, shopping, research across sources - Select: Multiple relevant URLs (5-15), prioritizing diversity and relevance - -C) SPECIFIC PAGE TYPE - Use when: Looking for particular content (documentation, pricing, contact, careers) - Select: Deep links to specific page types on relevant sites + A) SINGLE AUTHORITATIVE SOURCE + Use when: Looking for specific entity information (company profile, person bio, specific product) + Select: Official website, primary domain, or most authoritative single page + + B) MULTIPLE DIVERSE SOURCES + Use when: Comparing options, finding services, shopping, research across sources + Select: Multiple relevant URLs (5-15), prioritizing diversity and relevance + + C) SPECIFIC PAGE TYPE + Use when: Looking for particular content (documentation, pricing, contact, careers) + Select: Deep links to specific page types on relevant sites -Return your response in this exact JSON format: -{{ - "search_query": "your generated search query", - "selection_strategy": "single|multiple|specific_page", - "selection_criteria": "description of what URLs to prioritize", - "expected_url_patterns": ["pattern1", "pattern2"], - "estimated_result_count": number -}} + Return your response in this exact JSON format: + {{ + "search_query": "your generated search query", + "selection_strategy": "single|multiple|specific_page", + "selection_criteria": "description of what URLs to prioritize", + "expected_url_patterns": ["pattern1", "pattern2"], + "estimated_result_count": number + }} -USER QUERY: {request.search_query}""" + USER QUERY: {request.search_query}""" # Get AI response for query optimization ai_request = AiCallRequest( @@ -248,14 +210,13 @@ USER QUERY: {request.search_query}""" # Create a prompt for AI to identify main URLs based on user's intention ai_prompt = f""" -Select the most relevant URLs from these search results: + Select the most relevant URLs from these search results: -{chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])} + {chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])} -Return only the URLs that are most relevant for the user's query. -One URL per line. -""" - + Return only the URLs that are most relevant for the user's query. + One URL per line. + """ # Create AI call request ai_request = AiCallRequest( prompt=ai_prompt, @@ -397,7 +358,7 @@ One URL per line. 
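The query-optimizer prompt earlier in this file requires the model to return an exact JSON shape, but no guard against fenced or malformed replies is visible in this diff. A minimal defensive parse, sketched on the assumption that no such guard exists elsewhere in the service (the function name and fallback values are illustrative, not actual project code):

```python
import json
import re

def parse_optimizer_response(raw: str, fallback_query: str) -> dict:
    """Best-effort parse of the optimizer's JSON reply (sketch only)."""
    # Models often wrap JSON in ``` fences despite instructions; strip them first
    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip())
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the raw query with the most permissive strategy
        return {"search_query": fallback_query, "selection_strategy": "multiple"}
    data.setdefault("search_query", fallback_query)
    data.setdefault("selection_strategy", "multiple")
    return data
```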
"operationType": operationType, "processDocumentsIndividually": processIndividually, "maxSize": maxContextBytes, - "chunkAllowed": options.chunkAllowed if options else True, + "chunkAllowed": not options.compressContext if options else True, "textChunkSize": int(maxContextBytes * 0.3), # 30% of max for text chunks "imageChunkSize": int(maxContextBytes * 0.5), # 50% of max for image chunks "imageMaxPixels": 1024 * 1024, # 1MP default @@ -534,18 +495,18 @@ One URL per line. if not documents: return "" - # Calculate model-derived size limits - maxContextBytes = self._calculateMaxContextBytes(options) + # Get model capabilities for size calculation + model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) # Build extraction options for chunking extractionOptions: Dict[str, Any] = { "prompt": prompt, "operationType": options.operationType if options else "general", "processDocumentsIndividually": True, # Process each document separately - "maxSize": maxContextBytes, + "maxSize": model_capabilities["maxContextBytes"], "chunkAllowed": True, - "textChunkSize": int(maxContextBytes * 0.3), - "imageChunkSize": int(maxContextBytes * 0.5), + "textChunkSize": model_capabilities["textChunkSize"], + "imageChunkSize": model_capabilities["imageChunkSize"], "imageMaxPixels": 1024 * 1024, "imageQuality": 85, "mergeStrategy": { @@ -555,6 +516,8 @@ One URL per line. }, } + logger.debug(f"Per-chunk extraction options: {extractionOptions}") + try: # Extract content with chunking extractionResult = self.extractionService.extractContent(documents, extractionOptions) @@ -570,7 +533,7 @@ One URL per line. if part.typeGroup == "image": # Process image with AI try: - aiResult = await self.callAiImage( + aiResult = await self.readImage( prompt=prompt, imageData=part.data, mimeType=part.mimeType, @@ -584,23 +547,36 @@ One URL per line. elif part.typeGroup in ("text", "table", "structure"): # Process text content with AI try: - aiResult = await self.callAiText( + logger.info(f"=== PROCESSING CHUNK {len(aiResults) + 1} ===") + logger.info(f"Chunk size: {len(part.data)} chars") + logger.info(f"Chunk preview: {part.data[:200]}...") + + # Create AI call request for this chunk + request = AiCallRequest( prompt=prompt, - documents=None, + context=part.data, options=options ) - aiResults.append(aiResult) + + # Make the call using AiObjects + response = await self.aiObjects.call(request) + aiResults.append(response.content) + + logger.info(f"Chunk {len(aiResults)} processed: {len(response.content)} chars response") + except Exception as e: logger.warning(f"Error processing text chunk: {str(e)}") aiResults.append(f"[Error processing text: {str(e)}]") # Merge AI results using ExtractionService - mergeStrategy = { - "groupBy": "typeGroup", - "orderBy": "id", - "mergeType": "concatenate", - "chunkSeparator": "\n\n---\n\n" - } + from modules.datamodels.datamodelExtraction import MergeStrategy + + mergeStrategy = MergeStrategy( + groupBy="typeGroup", + orderBy="id", + mergeType="concatenate", + chunkSeparator="\n\n---\n\n" + ) mergedContent = self.extractionService.mergeAiResults( extractionResult, @@ -675,6 +651,9 @@ One URL per line. 
if call_type == "planning": return await self._callAiPlanning(prompt, placeholders, options) else: + # Set processDocumentsIndividually from the legacy parameter if not set in options + if options.processDocumentsIndividually is None and documents: + options.processDocumentsIndividually = False # Default to batch processing return await self._callAiText(prompt, documents, options) def _determineCallType(self, documents: Optional[List[ChatDocument]], operation_type: str) -> str: @@ -731,99 +710,182 @@ One URL per line. """ Handle text calls with document processing through ExtractionService. """ - # Get available models for text processing - models = self._getModelsForOperation("text", options) + # Determine processing strategy based on options + if options.processDocumentsIndividually and documents: + # Use per-chunk processing for individual document processing + return await self._processDocumentsPerChunk(documents, prompt, options) - for model in models: - try: - # Determine processing strategy based on options - if options.processDocumentsIndividually and documents: - # Use per-chunk processing for individual document processing - return await self._processDocumentsPerChunk(documents, prompt, options) + # Check if we need chunking - if so, use per-chunk processing + if documents and not options.compressContext: + # Get model capabilities to check if chunking will be needed + model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) + total_doc_size = sum(doc.fileSize or 0 for doc in documents) + + if total_doc_size > model_capabilities["maxContextBytes"]: + logger.info(f"Document size ({total_doc_size}) exceeds model capacity ({model_capabilities['maxContextBytes']}), using per-chunk processing") + return await self._processDocumentsPerChunk(documents, prompt, options) + + # Extract and process documents using ExtractionService + context = "" + if documents: + logger.info(f"=== EXTRACTING CONTENT FROM {len(documents)} DOCUMENTS ===") + + # Get model capabilities for size calculation + model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) + + # Use new ChatDocument-based API + extraction_options = { + "prompt": prompt, + "operationType": options.operationType, + "processDocumentsIndividually": options.processDocumentsIndividually, + "maxSize": options.maxContextBytes or model_capabilities["maxContextBytes"], + "chunkAllowed": not options.compressContext, + "textChunkSize": model_capabilities["textChunkSize"], + "imageChunkSize": model_capabilities["imageChunkSize"], + "imageMaxPixels": 1024 * 1024, + "imageQuality": 85, + "mergeStrategy": {"groupBy": "typeGroup", "orderBy": "id", "mergeType": "concatenate"} + } + + logger.debug(f"Extraction options: {extraction_options}") + + extracted_content = self.extractionService.extractContent( + documents=documents, + options=extraction_options + ) + + logger.info(f"Extraction completed: {len(extracted_content)} documents") + + # Build context from list of ExtractedContent + if isinstance(extracted_content, list): + context_parts = [] + chunk_count = 0 + for ec in extracted_content: + for p in ec.parts: + if p.typeGroup in ["text", "table", "structure"] and p.data: + if p.metadata.get("chunk", False): + chunk_count += 1 + context_parts.append(p.data) + elif p.typeGroup == "image" and p.data: + # Process image with AI using user prompt + try: + imageResult = await self.aiObjects.callImage( + prompt=prompt, + imageData=p.data, + mimeType=p.mimeType + ) + context_parts.append(f"[Image 
Analysis]: {imageResult}") + except Exception as e: + logger.warning(f"AI image processing failed: {e}") + context_parts.append(f"[Image Analysis Failed]: {str(e)}") - # Extract and process documents using ExtractionService + if chunk_count > 0: + logger.debug(f"=== PROCESSING CHUNKED CONTENT ===") + logger.debug(f"Total chunks: {chunk_count}") + logger.debug(f"Total context parts: {len(context_parts)}") + + context = "\n\n---\n\n".join(context_parts) + else: context = "" - if documents: - logger.info(f"=== EXTRACTING CONTENT FROM {len(documents)} DOCUMENTS ===") - - # Use new ChatDocument-based API - extracted_content = self.extractionService.extractContent( - documents=documents, - options={ - "prompt": prompt, - "operationType": options.operationType, - "processDocumentsIndividually": options.processDocumentsIndividually, - "maxSize": options.maxContextBytes or int(model.maxTokens * 0.9), - "chunkAllowed": not options.compressContext, - "textChunkSize": int((options.maxContextBytes or model.maxTokens * 4) * 0.3), - "imageChunkSize": int((options.maxContextBytes or model.maxTokens * 4) * 0.5), - "imageMaxPixels": 1024 * 1024, - "imageQuality": 85, - "mergeStrategy": {"groupBy": "typeGroup", "orderBy": "id", "mergeType": "concatenate"} - } - ) - - logger.info(f"Extraction completed: {len(extracted_content)} documents") - - # Build context from list of ExtractedContent - if isinstance(extracted_content, list): - context_parts = [] - for ec in extracted_content: - for p in ec.parts: - if p.typeGroup in ["text", "table", "structure"] and p.data: - context_parts.append(p.data) - elif p.typeGroup == "image" and p.data: - # Process image with AI using user prompt - try: - imageResult = await self.aiObjects.callImage( - prompt=prompt, - imageData=p.data, - mimeType=p.mimeType - ) - context_parts.append(f"[Image Analysis]: {imageResult}") - except Exception as e: - logger.warning(f"AI image processing failed: {e}") - context_parts.append(f"[Image Analysis Failed]: {str(e)}") - context = "\n\n---\n\n".join(context_parts) - else: - context = "" - - # Check size and reduce if needed - full_prompt = prompt + "\n\n" + context if context else prompt - logger.debug(f"AI call: {len(full_prompt)} chars (prompt: {len(prompt)}, context: {len(context)})") - - if self._exceedsTokenLimit(full_prompt, model, options.safetyMargin): - full_prompt = self._reduceTextPrompt(prompt, context, model, options) - logger.debug(f"Prompt reduced to {len(full_prompt)} chars") - - # Make AI call using the connector directly - result = await self._makeAiCall(full_prompt, model, options) - logger.debug(f"=== AI RESPONSE ===") - logger.debug(f"Response length: {len(result)} chars") - logger.debug(f"Response preview: {result[:200]}...") - return result - - except Exception as e: - logger.warning(f"Text model {model.name} failed: {e}") - continue - raise Exception("All text models failed - check model availability and capabilities") + # Check size and reduce if needed + full_prompt = prompt + "\n\n" + context if context else prompt + logger.debug(f"AI call: {len(full_prompt)} chars (prompt: {len(prompt)}, context: {len(context)})") + + # Use AiObjects to select the best model and make the call + try: + # Create AI call request + request = AiCallRequest( + prompt=full_prompt, + context="", # Context is already included in the prompt + options=options + ) + + # Make the call using AiObjects (which handles model selection) + response = await self.aiObjects.call(request) + logger.debug(f"=== AI RESPONSE ===") + logger.debug(f"Response 
length: {len(response.content)} chars")
+            logger.debug(f"Response preview: {response.content[:200]}...")
+            return response.content
+
+        except Exception as e:
+            logger.error(f"AI call failed: {e}")
+            raise Exception(f"AI call failed: {e}")

-    async def _makeAiCall(self, prompt: str, model: ModelCapabilities, options: AiCallOptions) -> str:
-        """Make actual AI call using the connector."""
-        if not self.aiObjects:
-            raise Exception("AI objects not initialized")
+
+    def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]:
+        """
+        Get model capabilities for content processing, including appropriate size limits for chunking.
+        """
+        # Estimate total content size
+        prompt_size = len(prompt.encode('utf-8'))
+        document_size = 0
+        if documents:
+            # Rough estimate of document content size
+            for doc in documents:
+                document_size += doc.fileSize or 0

-        # Create AI call request
-        request = AiCallRequest(
-            prompt=prompt,
-            context="",  # Context is already included in the prompt
-            options=options
-        )
+        total_size = prompt_size + document_size

-        # Make the call
-        response = await self.aiObjects.call(request)
-        return response.content
+        # Use AiObjects to select the best model for this content size
+        # We'll simulate the model selection by checking available models
+        from modules.interfaces.interfaceAiObjects import aiModels
+
+        # Find the best model for this content size and operation
+        best_model = None
+        best_context_length = 0
+
+        for model_name, model_info in aiModels.items():
+            context_length = model_info.get("contextLength", 0)
+
+            # Skip models with no context length or too small for content
+            if context_length == 0:
+                continue
+
+            # Check if model supports the operation type
+            capabilities = model_info.get("capabilities", [])
+            if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities:
+                continue
+            elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities:
+                continue
+            elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities:
+                continue
+            elif "text_generation" not in capabilities:
+                continue
+
+            # Prefer models that can handle the content without chunking, but allow chunking if needed.
+            # contextLength is in tokens while total_size is in bytes, so compare in bytes (~4 bytes/token)
+            if context_length * 4 >= total_size * 0.8:  # 80% of content size
+                if context_length > best_context_length:
+                    best_model = model_info
+                    best_context_length = context_length
+            elif best_model is None:  # Fallback to largest available model
+                if context_length > best_context_length:
+                    best_model = model_info
+                    best_context_length = context_length
+
+        # Fallback to a reasonable default if no model found
+        if best_model is None:
+            best_model = {
+                "contextLength": 128000,  # GPT-4o default
+                "llmName": "gpt-4o"
+            }
+
+        # Calculate appropriate sizes
+        # Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters)
+        context_length_bytes = int(best_model["contextLength"] * 4)
+        max_context_bytes = int(context_length_bytes * 0.9)  # 90% of context length
+        text_chunk_size = int(max_context_bytes * 0.7)  # 70% of max context for text chunks
+        image_chunk_size = int(max_context_bytes * 0.8)  # 80% of max context for image chunks
+
+        logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}")
+        logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes")
+        logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: 
{image_chunk_size} bytes") + + return { + "maxContextBytes": max_context_bytes, + "textChunkSize": text_chunk_size, + "imageChunkSize": image_chunk_size + } def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]: """ diff --git a/modules/services/serviceExtraction/chunking/text_chunker.py b/modules/services/serviceExtraction/chunking/text_chunker.py index 35c75168..b5f6d582 100644 --- a/modules/services/serviceExtraction/chunking/text_chunker.py +++ b/modules/services/serviceExtraction/chunking/text_chunker.py @@ -7,6 +7,10 @@ from ..subRegistry import Chunker class TextChunker(Chunker): def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]: maxBytes = int(options.get("textChunkSize", 40000)) + import logging + logger = logging.getLogger(__name__) + logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}") + logger.debug(f"TextChunker: using maxBytes: {maxBytes}") chunks: List[Dict[str, Any]] = [] current: List[str] = [] size = 0 diff --git a/modules/services/serviceExtraction/formats/binary_extractor.py b/modules/services/serviceExtraction/formats/binary_extractor.py index 1c201c36..e6667fda 100644 --- a/modules/services/serviceExtraction/formats/binary_extractor.py +++ b/modules/services/serviceExtraction/formats/binary_extractor.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List import base64 -from ..utils import makeId +from ..subUtils import makeId from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/csv_extractor.py b/modules/services/serviceExtraction/formats/csv_extractor.py index db3cf969..27233979 100644 --- a/modules/services/serviceExtraction/formats/csv_extractor.py +++ b/modules/services/serviceExtraction/formats/csv_extractor.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from ..subUtils import makeId from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/docx_extractor.py b/modules/services/serviceExtraction/formats/docx_extractor.py index 6cb75716..51384ffd 100644 --- a/modules/services/serviceExtraction/formats/docx_extractor.py +++ b/modules/services/serviceExtraction/formats/docx_extractor.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List import io -from ..utils import makeId +from ..subUtils import makeId from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/html_extractor.py b/modules/services/serviceExtraction/formats/html_extractor.py index 6c49c50c..09da02f4 100644 --- a/modules/services/serviceExtraction/formats/html_extractor.py +++ b/modules/services/serviceExtraction/formats/html_extractor.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List from bs4 import BeautifulSoup from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from ..subUtils import makeId from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/image_extractor.py b/modules/services/serviceExtraction/formats/image_extractor.py index 296eb50b..22327f50 100644 --- a/modules/services/serviceExtraction/formats/image_extractor.py +++ b/modules/services/serviceExtraction/formats/image_extractor.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List import base64 -from 
..utils import makeId +from ..subUtils import makeId from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/json_extractor.py b/modules/services/serviceExtraction/formats/json_extractor.py index 456eb08e..86eac791 100644 --- a/modules/services/serviceExtraction/formats/json_extractor.py +++ b/modules/services/serviceExtraction/formats/json_extractor.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List import json from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from ..subUtils import makeId from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/pdf_extractor.py b/modules/services/serviceExtraction/formats/pdf_extractor.py index 4d0d8058..59c88dc7 100644 --- a/modules/services/serviceExtraction/formats/pdf_extractor.py +++ b/modules/services/serviceExtraction/formats/pdf_extractor.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List import base64 import io -from ..utils import makeId +from ..subUtils import makeId from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/text_extractor.py b/modules/services/serviceExtraction/formats/text_extractor.py index 5099d04c..a6d92bc1 100644 --- a/modules/services/serviceExtraction/formats/text_extractor.py +++ b/modules/services/serviceExtraction/formats/text_extractor.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from ..subUtils import makeId from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/xlsx_extractor.py b/modules/services/serviceExtraction/formats/xlsx_extractor.py index 141af3db..ea6396a2 100644 --- a/modules/services/serviceExtraction/formats/xlsx_extractor.py +++ b/modules/services/serviceExtraction/formats/xlsx_extractor.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List import io from datetime import datetime -from ..utils import makeId +from ..subUtils import makeId from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/xml_extractor.py b/modules/services/serviceExtraction/formats/xml_extractor.py index 7067924b..5aabea35 100644 --- a/modules/services/serviceExtraction/formats/xml_extractor.py +++ b/modules/services/serviceExtraction/formats/xml_extractor.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List import xml.etree.ElementTree as ET from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from ..subUtils import makeId from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/mainServiceExtraction.py b/modules/services/serviceExtraction/mainServiceExtraction.py index 9e989376..1c63e12c 100644 --- a/modules/services/serviceExtraction/mainServiceExtraction.py +++ b/modules/services/serviceExtraction/mainServiceExtraction.py @@ -74,9 +74,13 @@ class ExtractionService: # Log chunking information chunked_parts = [p for p in ec.parts if p.metadata.get("chunk", False)] if chunked_parts: - logger.debug(f"Chunking: {len(chunked_parts)} parts were chunked") + logger.debug(f"=== CHUNKING RESULTS ===") + logger.debug(f"Total parts: {len(ec.parts)}") + logger.debug(f"Chunked parts: {len(chunked_parts)}") for chunk in chunked_parts: - logger.debug(f" Chunk: 
{chunk.label} - {len(chunk.data)} chars")
+                logger.debug(f"  Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
+            else:
+                logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")

             ec = applyAiIfRequested(ec, options)
             results.append(ec)
diff --git a/modules/services/serviceExtraction/merging/table_merger.py b/modules/services/serviceExtraction/merging/table_merger.py
index 04be404e..4f62358c 100644
--- a/modules/services/serviceExtraction/merging/table_merger.py
+++ b/modules/services/serviceExtraction/merging/table_merger.py
@@ -1,6 +1,6 @@
 from typing import Any, Dict, List
 from modules.datamodels.datamodelExtraction import ContentPart
-from ..utils import makeId
+from ..subUtils import makeId


 class TableMerger:
diff --git a/modules/services/serviceExtraction/merging/text_merger.py b/modules/services/serviceExtraction/merging/text_merger.py
index bb9e850d..38f7c6f0 100644
--- a/modules/services/serviceExtraction/merging/text_merger.py
+++ b/modules/services/serviceExtraction/merging/text_merger.py
@@ -1,6 +1,6 @@
 from typing import Any, Dict, List
 from modules.datamodels.datamodelExtraction import ContentPart
-from ..utils import makeId
+from ..subUtils import makeId


 class TextMerger:
diff --git a/modules/services/serviceExtraction/subPipeline.py b/modules/services/serviceExtraction/subPipeline.py
index cf12a246..65d8f1f2 100644
--- a/modules/services/serviceExtraction/subPipeline.py
+++ b/modules/services/serviceExtraction/subPipeline.py
@@ -1,7 +1,7 @@
 from typing import Any, Dict, List
 from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart
-from .utils import makeId
+from .subUtils import makeId
 from .subRegistry import ExtractorRegistry, ChunkerRegistry
 from .merging.text_merger import TextMerger
 from .merging.table_merger import TableMerger
@@ -67,10 +67,29 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
         return ExtractedContent(id=makeId(), parts=[part])

     parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType, "options": options})
-    # Optional merge step
+
+    # Apply chunking and size limiting
+    parts = poolAndLimit(parts, chunkerRegistry, options)
+
+    # Optional merge step - but preserve chunks
     mergeStrategy = options.get("mergeStrategy", {})
     if mergeStrategy:
-        parts = _mergeParts(parts, mergeStrategy)
+        import logging
+        logger = logging.getLogger(__name__)
+
+        # Don't merge chunks - they should stay separate for processing
+        non_chunk_parts = [p for p in parts if not p.metadata.get("chunk", False)]
+        chunk_parts = [p for p in parts if p.metadata.get("chunk", False)]
+
+        logger.debug(f"runExtraction: Preserving {len(chunk_parts)} chunks from merging")
+
+        if non_chunk_parts:
+            non_chunk_parts = _mergeParts(non_chunk_parts, mergeStrategy)
+
+        # Combine non-chunk parts with chunk parts (chunks stay separate)
+        parts = non_chunk_parts + chunk_parts
+
+        logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})")

     return ExtractedContent(id=makeId(), parts=parts)

@@ -100,32 +119,56 @@
     # If we have remaining parts and chunking is allowed, try chunking
+    import logging
+    # Define the logger before the branch: the merge path further down logs too,
+    # and would otherwise hit a NameError when no chunking was needed
+    logger = logging.getLogger(__name__)
+
     if remaining and chunkAllowed:
+        logger.debug(f"=== CHUNKING ACTIVATED ===")
+        logger.debug(f"Remaining parts to chunk: {len(remaining)}")
+        logger.debug(f"Max size limit: {maxSize} bytes")
+        logger.debug(f"Current size used: 
{current} bytes") + for p in remaining: if p.typeGroup in ("text", "table", "structure", "image"): + logger.debug(f"Chunking {p.typeGroup} part: {len(p.data)} chars") chunks = chunkerRegistry.resolve(p.typeGroup).chunk(p, options) + logger.debug(f"Created {len(chunks)} chunks") + + chunks_added = 0 for ch in chunks: chSize = int(ch.get("size", 0) or 0) - if current + chSize <= maxSize: - kept.append(ContentPart( - id=makeId(), - parentId=p.id, - label=f"chunk_{ch.get('order', 0)}", - typeGroup=p.typeGroup, - mimeType=p.mimeType, - data=ch.get("data", ""), - metadata={ - "size": chSize, - "chunk": True, - **ch.get("metadata", {}) - } - )) - current += chSize - else: - break + # Add all chunks - don't limit by maxSize since they'll be processed separately + kept.append(ContentPart( + id=makeId(), + parentId=p.id, + label=f"chunk_{ch.get('order', 0)}", + typeGroup=p.typeGroup, + mimeType=p.mimeType, + data=ch.get("data", ""), + metadata={ + "size": chSize, + "chunk": True, + **ch.get("metadata", {}) + } + )) + chunks_added += 1 + logger.debug(f"Added chunk {ch.get('order', 0)}: {chSize} bytes") + + logger.debug(f"Added {chunks_added} chunks from {p.typeGroup} part") - # Apply merging strategy if provided + # Apply merging strategy if provided, but preserve chunks if mergeStrategy: - kept = _applyMerging(kept, mergeStrategy) + # Don't merge chunks - they should stay separate for processing + non_chunk_parts = [p for p in kept if not p.metadata.get("chunk", False)] + chunk_parts = [p for p in kept if p.metadata.get("chunk", False)] + + logger.debug(f"Preserving {len(chunk_parts)} chunks from merging") + + if non_chunk_parts: + non_chunk_parts = _applyMerging(non_chunk_parts, mergeStrategy) + + # Combine non-chunk parts with chunk parts (chunks stay separate) + kept = non_chunk_parts + chunk_parts + + logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})") # Re-check size after merging totalSize = sum(int(p.metadata.get("size", 0) or 0) for p in kept) diff --git a/modules/services/serviceExtraction/utils/__init__.py b/modules/services/serviceExtraction/subUtils.py similarity index 96% rename from modules/services/serviceExtraction/utils/__init__.py rename to modules/services/serviceExtraction/subUtils.py index a16d3f59..efee532b 100644 --- a/modules/services/serviceExtraction/utils/__init__.py +++ b/modules/services/serviceExtraction/subUtils.py @@ -3,5 +3,3 @@ import uuid def makeId() -> str: return str(uuid.uuid4()) - - diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py index f18f071b..14bfe7fe 100644 --- a/modules/services/serviceGeneration/mainServiceGeneration.py +++ b/modules/services/serviceGeneration/mainServiceGeneration.py @@ -257,4 +257,82 @@ class GenerationService: 'totalActions': 0, 'workflowStatus': 'unknown', 'workflowId': 'unknown' - } \ No newline at end of file + } + + async def renderReport(self, extracted_content: str, output_format: str, title: str) -> tuple[str, str]: + """ + Render extracted content to the specified output format. 
+ + Args: + extracted_content: Content extracted by AI using format-specific prompt + output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx) + title: Report title + + Returns: + tuple: (rendered_content, mime_type) + """ + try: + # Get the appropriate renderer for the format + renderer = self._getFormatRenderer(output_format) + if not renderer: + raise ValueError(f"Unsupported output format: {output_format}") + + # Render the content + rendered_content, mime_type = await renderer.render(extracted_content, title) + + logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters") + return rendered_content, mime_type + + except Exception as e: + logger.error(f"Error rendering report to {output_format}: {str(e)}") + raise + + def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str: + """ + Get the format-specific extraction prompt for AI content extraction. + + Args: + output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx) + user_prompt: User's original prompt for report generation + title: Report title + + Returns: + str: Format-specific prompt for AI extraction + """ + try: + # Get the appropriate renderer for the format + renderer = self._getFormatRenderer(output_format) + if not renderer: + raise ValueError(f"Unsupported output format: {output_format}") + + # Get the format-specific extraction prompt + extraction_prompt = renderer.getExtractionPrompt(user_prompt, title) + + logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters") + return extraction_prompt + + except Exception as e: + logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}") + raise + + def _getFormatRenderer(self, output_format: str): + """Get the appropriate renderer for the specified format using auto-discovery.""" + try: + from .renderers.registry import get_renderer + renderer = get_renderer(output_format) + + if renderer: + return renderer + + # Fallback to text renderer if no specific renderer found + logger.warning(f"No renderer found for format {output_format}, falling back to text") + fallback_renderer = get_renderer('text') + if fallback_renderer: + return fallback_renderer + + logger.error("Even text renderer fallback failed") + return None + + except Exception as e: + logger.error(f"Error getting renderer for {output_format}: {str(e)}") + return None \ No newline at end of file diff --git a/modules/services/serviceGeneration/renderers/base_renderer.py b/modules/services/serviceGeneration/renderers/base_renderer.py new file mode 100644 index 00000000..dd91be09 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/base_renderer.py @@ -0,0 +1,86 @@ +""" +Base renderer class for all format renderers. +""" + +from abc import ABC, abstractmethod +from typing import Dict, Any, Tuple, List +import logging + +logger = logging.getLogger(__name__) + +class BaseRenderer(ABC): + """Base class for all format renderers.""" + + def __init__(self): + self.logger = logger + + @classmethod + def get_supported_formats(cls) -> List[str]: + """ + Return list of supported format names for this renderer. + Override this method in subclasses to specify supported formats. + """ + return [] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """ + Return list of format aliases for this renderer. + Override this method in subclasses to specify format aliases. 
+ """ + return [] + + @classmethod + def get_priority(cls) -> int: + """ + Return priority for this renderer (higher number = higher priority). + Used when multiple renderers support the same format. + """ + return 0 + + @abstractmethod + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """ + Get the format-specific extraction prompt for AI content extraction. + + Args: + user_prompt: User's original prompt for report generation + title: Report title + + Returns: + str: Format-specific prompt for AI extraction + """ + pass + + @abstractmethod + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """ + Render extracted content to the target format. + + Args: + extracted_content: Raw content extracted by AI using format-specific prompt + title: Report title + + Returns: + tuple: (rendered_content, mime_type) + """ + pass + + def _extract_sections(self, report_data: Dict[str, Any]) -> list: + """Extract sections from report data.""" + return report_data.get('sections', []) + + def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]: + """Extract metadata from report data.""" + return report_data.get('metadata', {}) + + def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str: + """Get title from report data or use fallback.""" + return report_data.get('title', fallback_title) + + def _format_timestamp(self, timestamp: str = None) -> str: + """Format timestamp for display.""" + if timestamp: + return timestamp + from datetime import datetime, UTC + return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC") diff --git a/modules/services/serviceGeneration/renderers/csv_renderer.py b/modules/services/serviceGeneration/renderers/csv_renderer.py new file mode 100644 index 00000000..8e2344ac --- /dev/null +++ b/modules/services/serviceGeneration/renderers/csv_renderer.py @@ -0,0 +1,90 @@ +""" +CSV renderer for report generation. 
+""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List +import csv +import io + +class CsvRenderer(BaseRenderer): + """Renders content to CSV format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported CSV formats.""" + return ['csv'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['spreadsheet', 'table'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for CSV renderer.""" + return 70 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Get CSV-specific extraction prompt.""" + return f""" +{user_prompt} + +Generate a comprehensive CSV report with the title: "{title}" + +CSV FORMAT REQUIREMENTS: +- Create structured data in CSV format +- Use proper CSV syntax with commas and quotes +- Include headers for all columns +- Structure data in rows and columns +- Include source document information +- Add metadata as additional rows + +CSV STRUCTURE: +- First row: Headers (Section, Type, Heading, Content, Source) +- Data rows: One per section/item +- Use quotes around content that contains commas +- Escape quotes properly +- Include metadata rows at the end + +FORMATTING RULES: +- Headers: Section, Type, Heading, Content, Source +- Content: Escape commas and quotes, limit length +- Source: Include document name and page if available +- Metadata: Add special rows for generation info + +OUTPUT POLICY: +- Return ONLY CSV data +- No markdown, no code blocks, no additional text +- Properly formatted CSV +- Include all necessary information +- Valid CSV that can be imported + +Generate the complete CSV report: +""" + + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """Render extracted content to CSV format.""" + try: + # The extracted content should already be CSV from the AI + # Just clean it up + csv_content = self._clean_csv_content(extracted_content, title) + + return csv_content, "text/csv" + + except Exception as e: + self.logger.error(f"Error rendering CSV: {str(e)}") + # Return minimal CSV fallback + return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv" + + def _clean_csv_content(self, content: str, title: str) -> str: + """Clean and validate CSV content from AI.""" + content = content.strip() + + # Remove markdown code blocks if present + if content.startswith("```") and content.endswith("```"): + lines = content.split('\n') + if len(lines) > 2: + content = '\n'.join(lines[1:-1]).strip() + + return content diff --git a/modules/services/serviceGeneration/renderers/docx_renderer.py b/modules/services/serviceGeneration/renderers/docx_renderer.py new file mode 100644 index 00000000..e2ea7f3f --- /dev/null +++ b/modules/services/serviceGeneration/renderers/docx_renderer.py @@ -0,0 +1,291 @@ +""" +DOCX renderer for report generation using python-docx. 
+""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List +import io +import base64 +from datetime import datetime, UTC + +try: + from docx import Document + from docx.shared import Inches, Pt + from docx.enum.text import WD_ALIGN_PARAGRAPH + from docx.enum.table import WD_TABLE_ALIGNMENT + from docx.oxml.shared import OxmlElement, qn + from docx.oxml.ns import nsdecls + from docx.oxml import parse_xml + DOCX_AVAILABLE = True +except ImportError: + DOCX_AVAILABLE = False + +class DocxRenderer(BaseRenderer): + """Renders content to DOCX format using python-docx.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported DOCX formats.""" + return ['docx', 'doc'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['word', 'document'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for DOCX renderer.""" + return 115 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Get DOCX-specific extraction prompt.""" + return f""" +{user_prompt} + +Generate a comprehensive DOCX report with the title: "{title}" + +DOCX FORMAT REQUIREMENTS: +- Create structured content suitable for Word documents +- Use clear headings and sections with proper hierarchy +- Include tables for structured data +- Use bullet points and numbered lists where appropriate +- Include source document information +- Structure content for professional presentation +- Use consistent formatting throughout + +DOCX STRUCTURE: +- Title page with report title and generation date +- Table of contents (if multiple sections) +- Executive summary +- Main content sections with clear headings +- Data tables and analysis +- Conclusions and recommendations +- Appendices with source information + +FORMATTING RULES: +- Use clear section headings (H1, H2, H3 style) +- Include consistent paragraph formatting +- Use tables with proper alignment and borders +- Use bullet points and numbered lists +- Add source citations and references +- Include generation metadata +- Use professional fonts and spacing + +OUTPUT POLICY: +- Return ONLY plain text content suitable for Word document generation +- NO markdown formatting (no **bold**, no # headings, no --- separators) +- NO HTML tags +- NO code blocks +- Use plain text with clear structure +- Use line breaks for separation +- Use indentation for lists +- Use ALL CAPS for major headings +- Use Title Case for subheadings +- Use bullet points with dashes (-) for lists +- Use numbers (1., 2., 3.) 
for numbered lists +- Professional document format +- Include all necessary information + +Generate the complete DOCX report content: +""" + + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """Render extracted content to DOCX format.""" + try: + if not DOCX_AVAILABLE: + # Fallback to HTML if python-docx not available + from .html_renderer import HtmlRenderer + html_renderer = HtmlRenderer() + html_content, _ = await html_renderer.render(extracted_content, title) + return html_content, "text/html" + + # Generate DOCX using python-docx + docx_content = self._generate_docx(extracted_content, title) + + return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + + except Exception as e: + self.logger.error(f"Error rendering DOCX: {str(e)}") + # Return minimal fallback + return f"DOCX Generation Error: {str(e)}", "text/plain" + + def _generate_docx(self, content: str, title: str) -> str: + """Generate DOCX content using python-docx.""" + try: + # Create new document + doc = Document() + + # Set up document styles + self._setup_document_styles(doc) + + # Add title + title_para = doc.add_heading(title, 0) + title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + # Add generation date + date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}") + date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + # Add page break + doc.add_page_break() + + # Process content + lines = content.split('\n') + current_section = [] + + for line in lines: + line = line.strip() + if not line: + continue + + # Check for ALL CAPS headings (major headings) + if line.isupper() and len(line) > 3 and not line.startswith('-') and not line.startswith('*'): + if current_section: + self._process_section(doc, current_section) + current_section = [] + doc.add_heading(line, level=1) + # Check for Title Case headings (subheadings) + elif line.istitle() and len(line) > 5 and not line.startswith('-') and not line.startswith('*') and not line.startswith(('1.', '2.', '3.', '4.', '5.')): + if current_section: + self._process_section(doc, current_section) + current_section = [] + doc.add_heading(line, level=2) + # Check for markdown headings (fallback) + elif line.startswith('# '): + # H1 heading + if current_section: + self._process_section(doc, current_section) + current_section = [] + doc.add_heading(line[2:], level=1) + elif line.startswith('## '): + # H2 heading + if current_section: + self._process_section(doc, current_section) + current_section = [] + doc.add_heading(line[3:], level=2) + elif line.startswith('### '): + # H3 heading + if current_section: + self._process_section(doc, current_section) + current_section = [] + doc.add_heading(line[4:], level=3) + else: + current_section.append(line) + + # Process remaining content + if current_section: + self._process_section(doc, current_section) + + # Save to buffer + buffer = io.BytesIO() + doc.save(buffer) + buffer.seek(0) + + # Convert to base64 + docx_bytes = buffer.getvalue() + docx_base64 = base64.b64encode(docx_bytes).decode('utf-8') + + return docx_base64 + + except Exception as e: + self.logger.error(f"Error generating DOCX: {str(e)}") + raise + + def _setup_document_styles(self, doc): + """Set up document styles.""" + try: + # Set default font + style = doc.styles['Normal'] + font = style.font + font.name = 'Calibri' + font.size = Pt(11) + + # Set heading styles + for i in range(1, 4): + heading_style = doc.styles[f'Heading {i}'] + heading_font = heading_style.font + heading_font.name = 'Calibri' 
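+                # Assumed intent: step heading sizes down 2pt per level, so
+                # Heading 1/2/3 render at 14pt/12pt/10pt in bold Calibri.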
+                heading_font.size = Pt(16 - i * 2)
+                heading_font.bold = True
+        except Exception as e:
+            self.logger.warning(f"Could not set up document styles: {str(e)}")
+
+    def _process_section(self, doc, lines: list):
+        """Process a section of content into DOCX elements."""
+        i = 0
+        while i < len(lines):
+            line = lines[i]
+            if not line.strip():
+                i += 1
+                continue
+
+            # Check for tables: render a contiguous run of '|' lines as one
+            # table, then continue with the rest of the section (an early
+            # return here would silently drop everything after the table)
+            if '|' in line:
+                table_lines = []
+                while i < len(lines) and '|' in lines[i]:
+                    table_lines.append(lines[i])
+                    i += 1
+                table_data = self._extract_table_data(table_lines)
+                if table_data:
+                    self._add_table(doc, table_data)
+                else:
+                    for table_line in table_lines:
+                        doc.add_paragraph(table_line)
+                continue
+
+            # Check for lists
+            if line.startswith('- ') or line.startswith('* '):
+                # This is a list item
+                doc.add_paragraph(line[2:], style='List Bullet')
+            elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
+                # This is a numbered list item
+                doc.add_paragraph(line[3:], style='List Number')
+            else:
+                # Regular paragraph
+                doc.add_paragraph(line)
+            i += 1
+
+    def _extract_table_data(self, lines: list) -> list:
+        """Extract table data from lines."""
+        table_data = []
+        in_table = False
+
+        for line in lines:
+            if '|' in line:
+                if not in_table:
+                    in_table = True
+                # Split by | and clean up
+                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
+                if cells:
+                    table_data.append(cells)
+            elif in_table and not line.strip():
+                # Empty line, might be end of table
+                break
+
+        return table_data if len(table_data) > 1 else []
+
+    def _add_table(self, doc, table_data: list):
+        """Add a table to the document."""
+        try:
+            if not table_data:
+                return
+
+            # Create table
+            table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
+            table.alignment = WD_TABLE_ALIGNMENT.CENTER
+
+            # Add data to table
+            for row_idx, row_data in enumerate(table_data):
+                for col_idx, cell_data in enumerate(row_data):
+                    if col_idx < len(table.rows[row_idx].cells):
+                        table.rows[row_idx].cells[col_idx].text = cell_data
+
+            # Style the table
+            self._style_table(table)
+
+        except Exception as e:
+            self.logger.warning(f"Could not add table: {str(e)}")
+
+    def _style_table(self, table):
+        """Apply styling to the table."""
+        try:
+            # Style header row
+            if len(table.rows) > 0:
+                header_cells = table.rows[0].cells
+                for cell in header_cells:
+                    for paragraph in cell.paragraphs:
+                        for run in paragraph.runs:
+                            run.bold = True
+        except Exception as e:
+            self.logger.warning(f"Could not style table: {str(e)}")
\ No newline at end of file
diff --git a/modules/services/serviceGeneration/renderers/excel_renderer.py b/modules/services/serviceGeneration/renderers/excel_renderer.py
new file mode 100644
index 00000000..e22a5b5f
--- /dev/null
+++ b/modules/services/serviceGeneration/renderers/excel_renderer.py
@@ -0,0 +1,264 @@
+"""
+Excel renderer for report generation using openpyxl. 
+""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List +import io +import base64 +from datetime import datetime, UTC + +try: + from openpyxl import Workbook + from openpyxl.styles import Font, PatternFill, Alignment, Border, Side + from openpyxl.utils import get_column_letter + from openpyxl.worksheet.table import Table, TableStyleInfo + OPENPYXL_AVAILABLE = True +except ImportError: + OPENPYXL_AVAILABLE = False + +class ExcelRenderer(BaseRenderer): + """Renders content to Excel format using openpyxl.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported Excel formats.""" + return ['xlsx', 'xls', 'excel'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['spreadsheet', 'workbook'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for Excel renderer.""" + return 110 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Get Excel-specific extraction prompt.""" + return f""" +{user_prompt} + +Generate a comprehensive Excel report with the title: "{title}" + +EXCEL FORMAT REQUIREMENTS: +- Create structured data suitable for Excel spreadsheets +- Use clear column headers and organized rows +- Include multiple sheets if needed (Summary, Data, Analysis, etc.) +- Use proper data types (text, numbers, dates) +- Include formulas where appropriate +- Structure data in tables with clear headers +- Include source document information +- Add metadata and generation information + +EXCEL STRUCTURE: +- Sheet 1: Summary/Overview with key metrics +- Sheet 2: Detailed data in tabular format +- Sheet 3: Analysis and insights +- Use proper column headers (A, B, C, etc.) +- Include data validation and formatting hints +- Add comments for complex data + +FORMATTING RULES: +- Headers: Use bold formatting, clear column names +- Data: Organize in rows and columns, consistent formatting +- Numbers: Use proper number formatting (currency, percentages, etc.) +- Dates: Use standard date format (YYYY-MM-DD) +- Text: Left-aligned, wrap long text +- Formulas: Use Excel formula syntax (=SUM, =AVERAGE, etc.) 
+- Colors: Use conditional formatting for highlights + +SHEET STRUCTURE: +Sheet 1 - Summary: +- Report Title +- Key Metrics (counts, totals, averages) +- Executive Summary +- Generation Date + +Sheet 2 - Data: +- Column A: Item/Category +- Column B: Value/Amount +- Column C: Percentage +- Column D: Source Document +- Column E: Notes/Comments + +Sheet 3 - Analysis: +- Trends and patterns +- Comparisons +- Recommendations +- Charts descriptions + +OUTPUT POLICY: +- Return ONLY Excel-compatible data +- No HTML, no markdown, no code blocks +- Structured data that can be imported to Excel +- Include sheet names and structure +- Professional spreadsheet format +- Include all necessary information + +Generate the complete Excel report data: +""" + + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """Render extracted content to Excel format.""" + try: + if not OPENPYXL_AVAILABLE: + # Fallback to CSV if openpyxl not available + from .csv_renderer import CsvRenderer + csv_renderer = CsvRenderer() + csv_content, _ = await csv_renderer.render(extracted_content, title) + return csv_content, "text/csv" + + # Generate Excel using openpyxl + excel_content = self._generate_excel(extracted_content, title) + + return excel_content, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + + except Exception as e: + self.logger.error(f"Error rendering Excel: {str(e)}") + # Return CSV fallback + return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv" + + def _generate_excel(self, content: str, title: str) -> str: + """Generate Excel content using openpyxl.""" + try: + # Create workbook + wb = Workbook() + + # Remove default sheet + wb.remove(wb.active) + + # Create sheets + summary_sheet = wb.create_sheet("Summary", 0) + data_sheet = wb.create_sheet("Data", 1) + analysis_sheet = wb.create_sheet("Analysis", 2) + + # Add content to sheets + self._populate_summary_sheet(summary_sheet, title) + self._populate_data_sheet(data_sheet, content) + self._populate_analysis_sheet(analysis_sheet, content) + + # Save to buffer + buffer = io.BytesIO() + wb.save(buffer) + buffer.seek(0) + + # Convert to base64 + excel_bytes = buffer.getvalue() + excel_base64 = base64.b64encode(excel_bytes).decode('utf-8') + + return excel_base64 + + except Exception as e: + self.logger.error(f"Error generating Excel: {str(e)}") + raise + + def _populate_summary_sheet(self, sheet, title: str): + """Populate the summary sheet.""" + try: + # Title + sheet['A1'] = title + sheet['A1'].font = Font(size=16, bold=True) + sheet['A1'].alignment = Alignment(horizontal='center') + + # Generation info + sheet['A3'] = "Generated:" + sheet['B3'] = self._format_timestamp() + sheet['A4'] = "Status:" + sheet['B4'] = "Generated Successfully" + + # Key metrics placeholder + sheet['A6'] = "Key Metrics:" + sheet['A6'].font = Font(bold=True) + sheet['A7'] = "Total Items:" + sheet['B7'] = "=COUNTA(Data!A:A)-1" # Count non-empty cells in Data sheet + + # Auto-adjust column widths + sheet.column_dimensions['A'].width = 20 + sheet.column_dimensions['B'].width = 30 + + except Exception as e: + self.logger.warning(f"Could not populate summary sheet: {str(e)}") + + def _populate_data_sheet(self, sheet, content: str): + """Populate the data sheet.""" + try: + # Headers + headers = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"] + for col, header in enumerate(headers, 1): + cell = sheet.cell(row=1, column=col, value=header) + cell.font = Font(bold=True) + cell.fill 
= PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid") + + # Process content + lines = content.split('\n') + row = 2 + + for line in lines: + line = line.strip() + if not line: + continue + + # Check for table data (lines with |) + if '|' in line: + cells = [cell.strip() for cell in line.split('|') if cell.strip()] + for col, cell_data in enumerate(cells[:5], 1): # Limit to 5 columns + sheet.cell(row=row, column=col, value=cell_data) + row += 1 + else: + # Regular content + sheet.cell(row=row, column=1, value=line) + row += 1 + + # Auto-adjust column widths + for col in range(1, 6): + sheet.column_dimensions[get_column_letter(col)].width = 20 + + except Exception as e: + self.logger.warning(f"Could not populate data sheet: {str(e)}") + + def _populate_analysis_sheet(self, sheet, content: str): + """Populate the analysis sheet.""" + try: + # Title + sheet['A1'] = "Analysis & Insights" + sheet['A1'].font = Font(size=14, bold=True) + + # Content analysis + lines = content.split('\n') + row = 3 + + sheet['A3'] = "Content Analysis:" + sheet['A3'].font = Font(bold=True) + row += 1 + + # Count different types of content + table_lines = sum(1 for line in lines if '|' in line) + list_lines = sum(1 for line in lines if line.startswith(('- ', '* '))) + text_lines = len(lines) - table_lines - list_lines + + sheet[f'A{row}'] = f"Total Lines: {len(lines)}" + row += 1 + sheet[f'A{row}'] = f"Table Rows: {table_lines}" + row += 1 + sheet[f'A{row}'] = f"List Items: {list_lines}" + row += 1 + sheet[f'A{row}'] = f"Text Lines: {text_lines}" + row += 2 + + # Recommendations + sheet[f'A{row}'] = "Recommendations:" + sheet[f'A{row}'].font = Font(bold=True) + row += 1 + sheet[f'A{row}'] = "1. Review data accuracy" + row += 1 + sheet[f'A{row}'] = "2. Consider additional analysis" + row += 1 + sheet[f'A{row}'] = "3. Update regularly" + + # Auto-adjust column width + sheet.column_dimensions['A'].width = 30 + + except Exception as e: + self.logger.warning(f"Could not populate analysis sheet: {str(e)}") diff --git a/modules/services/serviceGeneration/renderers/html_renderer.py b/modules/services/serviceGeneration/renderers/html_renderer.py new file mode 100644 index 00000000..5d3c886a --- /dev/null +++ b/modules/services/serviceGeneration/renderers/html_renderer.py @@ -0,0 +1,94 @@ +""" +HTML renderer for report generation. +""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List + +class HtmlRenderer(BaseRenderer): + """Renders content to HTML format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported HTML formats.""" + return ['html', 'htm'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['web', 'webpage'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for HTML renderer.""" + return 100 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Get HTML-specific extraction prompt.""" + return f""" +{user_prompt} + +Generate a comprehensive HTML report with the title: "{title}" + +HTML STRUCTURE REQUIREMENTS: +- Create a complete, self-contained HTML document +- Start with: +- Include: ,
(with and