diff --git a/config.ini b/config.ini index 780a9e08..ab0b6712 100644 --- a/config.ini +++ b/config.ini @@ -29,4 +29,9 @@ Web_Search_MIN_RESULTS = 1 # Web Crawl configuration Web_Crawl_TIMEOUT = 30 Web_Crawl_MAX_RETRIES = 3 -Web_Crawl_RETRY_DELAY = 2 \ No newline at end of file +Web_Crawl_RETRY_DELAY = 2 + +# Web Research configuration +Web_Research_MAX_DEPTH = 2 +Web_Research_MAX_LINKS_PER_DOMAIN = 4 +Web_Research_CRAWL_TIMEOUT_MINUTES = 10 \ No newline at end of file diff --git a/env_dev.env b/env_dev.env index 9ebbb93b..0c3fd25b 100644 --- a/env_dev.env +++ b/env_dev.env @@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000 # Perplexity AI configuration Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions -Connector_AiPerplexity_API_SECRET = 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 +Connector_AiPerplexity_API_SECRET = DEV_ENC:Z0FBQUFBQm82Mzk2Q1MwZ0dNcUVBcUtuRDJIcTZkMXVvYnpjM3JEMzJiT1NKSHljX282ZDIyZTJYc09VSTdVNXAtOWU2UXp5S193NTk5dHJsWlFjRjhWektFOG1DVGY4ZUhHTXMzS0RPN1lNcF9nSlVWbW5BZ1hkZDVTejl6bVZNRFVvX29xamJidWRFMmtjQmkyRUQ2RUh6UTN1aWNPSUJBPT0= Connector_AiPerplexity_MODEL_NAME = sonar Connector_AiPerplexity_TEMPERATURE = 0.2 Connector_AiPerplexity_MAX_TOKENS = 2000 # Agent Mail configuration Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c -Service_MSFT_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEQk4xYnpmbnItUEU3dHU4eHB5dzVYay1WT012RTRLUWJDTlBILVY5dC1FX3VMNjZmLThrbDRFNWFSNGprY3RRTlpYNGlubVBpNnY3MjNJcGtzVk9PMzRacl9LUlM2RU5vTVVZWHJvaUhWSHVfc1pNR0pfQmI5SEprOG5KdlB1QnQ= +Service_MSFT_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm83T29rV1pQelMtc1p1MXR4NTFpa19CTEhHQ0xfNmdPUmZqcWp5UHBMS0hYTGl4c1pPdmhTNTJVWUl5WnlnUUZhV0VTRzVCb0d5YjR1NnZPZk5CZ0dGazNGdUJVbjkxeVdrYlNiVjJUYzF2aVFtQnVxTHFqTTJqZlF0RTFGNmE1OGN1TEk= Service_MSFT_TENANT_ID = common # Google Service configuration @@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpETk5FWWM3Q0JKMzhI # Feature SyncDelta JIRA configuration 
Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEbm0yRUJ6VUJKbUwyRW5kMnRaNW4wM2YxMkJUTXVXZUdmdVRCaUZIVHU2TTV2RWZLRmUtZkcwZE4yRUNlNDQ0aUJWYjNfdVg5YjV5c2JwMHhoUUYxZWdkeS11bXR0eGxRLWRVaVU3cUVQZWJlNDRtY1lWUDdqeDVFSlpXS0VFX21WajlRS3lHQjc0bS11akkybWV3QUFlR2hNWUNYLUdiRjZuN2dQODdDSExXWG1Dd2ZGclI2aUhlSWhETVZuY3hYdnhkb2c2LU1JTFBvWFpTNmZtMkNVOTZTejJwbDI2eGE0OS1xUlIwQnlCSmFxRFNCeVJNVzlOMDhTR1VUamx4RDRyV3p6Tk9qVHBrWWdySUM3TVRaYjd3N0JHMFhpdzFhZTNDLTFkRVQ2RVE4U19COXRhRWtNc0NVOHRqUS1CRDFpZ19xQmtFLU9YSDU3TXBZQXpVcld3PT0= + +# Debug Configuration +APP_DEBUG_CHAT_WORKFLOW_ENABLED = True +APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat \ No newline at end of file diff --git a/env_int.env b/env_int.env index 4a0f3e39..c69f1f57 100644 --- a/env_int.env +++ b/env_int.env @@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000 # Perplexity AI configuration Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions -Connector_AiPerplexity_API_SECRET = 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 +Connector_AiPerplexity_API_SECRET = INT_ENC:Z0FBQUFBQm82Mzk2UWZJdUFhSW8yc3RKc0tKRXphd0xWMkZOVlFpSGZ4SGhFWnk0cTF5VjlKQVZjdS1QSWdkS0pUSWw4OFU5MjUxdTVQel9aeWVIZTZ5TXRuVmFkZG0zWEdTOGdHMHpsTzI0TGlWYURKU1Q0VVpKTlhxUk5FTmN6SUJScDZ3ZldIaUJZcWpaQVRiSEpyQm9tRTNDWk9KTnZBPT0= Connector_AiPerplexity_MODEL_NAME = sonar Connector_AiPerplexity_TEMPERATURE = 0.2 Connector_AiPerplexity_MAX_TOKENS = 2000 # Agent Mail configuration Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c -Service_MSFT_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm8xSVRjNzB2M3ZjaE1SVE9ON2FKam9yVURxcHl1Ym5VNVUtS0MyWUpNVXVlaWpWS2U3VVd3em9vQl9lcnVYay03bS04YjNBbDZZNTB4eUtjT3ppQjJjY3dOT0FNLW9LeDhIUU5iaTNqNURUWE5La3kzaHNGcU9yNVI0YjhWZTZRRFktcTk= +Service_MSFT_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm83T29rMDZvcV9qTG5xb1FzUkdqS1llbzRxSEJXbmpONFFtcUtfZXdtZjQybmJSMjBjMEpnRVhiOGRuczZvVFBFdVVTQV80SG9PSnRQTEpLdVViNm5wc2E5aGRLWjZ4TGF1QjVkNmdRSzBpNWNkYXVublFYclVEdEM5TVBBZWVVMW5RVWk= Service_MSFT_TENANT_ID = common # 
Google Service configuration @@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkNmVXZ1pWcHcydTF2 # Feature SyncDelta JIRA configuration Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkTUNsWm4wX0p6eXFDZmJ4dFdHNEs1MV9MUzdrb3RzeC1jVWVYZ0REWHRyZkFiaGZLcUQtTXFBZzZkNzRmQ0gxbEhGbUNlVVFfR1JEQTc0aldkZkgyWnBOcjdlUlZxR0tDTEdKRExULXAyUEtsVmNTMkRKU1BJNnFiM0hlMXo4YndMcHlRMExtZDQ3Zm9vNFhMcEZCcHpBPT0= + +# Debug Configuration +APP_DEBUG_CHAT_WORKFLOW_ENABLED = FALSE +APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat \ No newline at end of file diff --git a/env_prod.env b/env_prod.env index c1ba8086..3d1aa40a 100644 --- a/env_prod.env +++ b/env_prod.env @@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000 # Perplexity AI configuration Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions -Connector_AiPerplexity_API_SECRET = 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 +Connector_AiPerplexity_API_SECRET = PROD_ENC:Z0FBQUFBQm82Mzk2Q1FGRkJEUkI4LXlQbHYzT2RkdVJEcmM4WGdZTWpJTEhoeUF1NW5LUVpJdDBYN3k1WFN4a2FQSWJSQmd0U0xJbzZDTmFFN05FcXl0Z3V1OEpsZjYydV94TXVjVjVXRTRYSWdLMkd5XzZIbFV6emRCZHpuOUpQeThadE5xcDNDVGV1RHJrUEN0c1BBYXctZFNWcFRuVXhRPT0= Connector_AiPerplexity_MODEL_NAME = sonar Connector_AiPerplexity_TEMPERATURE = 0.2 Connector_AiPerplexity_MAX_TOKENS = 2000 # Agent Mail configuration Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c -Service_MSFT_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pVEhHdlZHU3FNMmhuRGVwaGc3YzIxSjlZNzBCQjlOV2pSYVNXb0t1ZnVwQzZsQzY4cHMtVlZtNF85OEVaV1BMTzdXMmpzaGZpaG1DalJ0bkNPMHA5ZUcwZjNDdGk1TFdxYTJSZnVrVmhhZ2VRUEZxbjJOOGFhWk9EYlY3dmRVTnI= +Service_MSFT_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQm83T29rSzdYLTRydXN5V3lQLXhmRjMyQ1FOaGpuek45QllaX1REN2s5aWNIUl81NGlrYlJTeFV0RlRZd0xPcm5uMDM4QlpibHJQbm5XZTlWeWxfcWNVdFpCUHI2amh0MVBnZ21IN2ptSkhWLTVfaHEwNmI5SEtiS05pQmt5eV8yMnhLMEc= Service_MSFT_TENANT_ID = common # Google Service configuration @@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET 
= PROD_ENC:Z0FBQUFBQm8xSU5pNjlJdmFMeERXUUQ # Feature SyncDelta JIRA configuration Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pTDhnTVNzRUhScU8wYnZsZk52bHFkSWxLc18xQmtCeC1HbnNwTzVBbXRNTmQzRjZYaGE2MVlCNGtnWDk1T2I5VXVKNHpKU1VRbXEyN2tRWUJnU2ltZE5qZ3lmNEF6Z1hMTTEwZkk2NUNBYjhmVTJEcWpRUW9HNEVpSGFWdjBWQXQ3eUtHUTFJS3U5QWpaeno0RFNhMUxnPT0= + +# Debug Configuration +APP_DEBUG_CHAT_WORKFLOW_ENABLED = FALSE +APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat \ No newline at end of file diff --git a/modules/connectors/connectorAiAnthropic.py b/modules/connectors/connectorAiAnthropic.py index 1bcfe289..e7eb07a2 100644 --- a/modules/connectors/connectorAiAnthropic.py +++ b/modules/connectors/connectorAiAnthropic.py @@ -1,5 +1,6 @@ import logging import httpx +import os from typing import Dict, Any, List, Union from fastapi import HTTPException from modules.shared.configuration import APP_CONFIG @@ -147,6 +148,11 @@ class AiAnthropic: # Direct content as string (in older API versions) content = anthropicResponse["content"] + # Debug logging for empty responses + if not content or content.strip() == "": + logger.warning(f"Anthropic API returned empty content. 
Full response: {anthropicResponse}") + content = "[Anthropic API returned empty response]" + # Return in OpenAI format return { "id": anthropicResponse.get("id", ""), @@ -182,14 +188,27 @@ class AiAnthropic: The analysis response as text """ try: + # Debug logging + logger.info(f"callAiImage called with imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}") + # Distinguish between file path and binary data if isinstance(imageData, str): - # It's a file path - import filehandling only when needed - from modules import agentserviceFilemanager as fileHandler - base64Data, autoMimeType = fileHandler.encodeFileToBase64(imageData) - mimeType = mimeType or autoMimeType + # Check if it's base64 encoded data or a file path + if len(imageData) > 100 and not os.path.exists(imageData): + # It's likely base64 encoded data + logger.info("Treating imageData as base64 encoded string") + base64Data = imageData + if not mimeType: + mimeType = "image/png" + else: + # It's a file path - import filehandling only when needed + logger.info(f"Treating imageData as file path: {imageData}") + from modules import agentserviceFilemanager as fileHandler + base64Data, autoMimeType = fileHandler.encodeFileToBase64(imageData) + mimeType = mimeType or autoMimeType else: # It's binary data + logger.info("Treating imageData as binary data") import base64 base64Data = base64.b64encode(imageData).decode('utf-8') # MIME type must be specified for binary data @@ -216,8 +235,16 @@ class AiAnthropic: # Use the existing callAiBasic function with the Vision model response = await self.callAiBasic(messages) - # Extract and return content - return response["choices"][0]["message"]["content"] + # Extract and return content with proper error handling + try: + content = response["choices"][0]["message"]["content"] + if content is None or content.strip() == "": + return "[AI returned empty response for image analysis]" + return content + except (KeyError, 
IndexError, TypeError) as e: + logger.error(f"Error extracting content from AI response: {str(e)}") + logger.error(f"Response structure: {response}") + return f"[Error extracting AI response: {str(e)}]" except Exception as e: logger.error(f"Error during image analysis: {str(e)}", exc_info=True) diff --git a/modules/connectors/connectorAiOpenai.py b/modules/connectors/connectorAiOpenai.py index 4a9f4888..692fe422 100644 --- a/modules/connectors/connectorAiOpenai.py +++ b/modules/connectors/connectorAiOpenai.py @@ -188,4 +188,83 @@ class AiOpenai: except Exception as e: logger.error(f"Error during image analysis: {str(e)}", exc_info=True) - return f"[Error during image analysis: {str(e)}]" \ No newline at end of file + return f"[Error during image analysis: {str(e)}]" + + async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid") -> Dict[str, Any]: + """ + Generate an image using DALL-E 3. + + Args: + prompt: The text prompt for image generation + size: Image size (1024x1024, 1792x1024, or 1024x1792) + quality: Image quality (standard or hd) + style: Image style (vivid or natural) + + Returns: + Dictionary with success status and image data + """ + try: + logger.debug(f"Starting image generation with prompt: '{prompt[:100]}...'") + + # DALL-E 3 API endpoint + dalle_url = "https://api.openai.com/v1/images/generations" + + payload = { + "model": "dall-e-3", + "prompt": prompt, + "size": size, + "quality": quality, + "style": style, + "n": 1, + "response_format": "b64_json" # Get base64 data directly instead of URLs + } + + # Create a separate client for DALL-E API calls + dalle_client = httpx.AsyncClient( + timeout=120.0, + headers={ + "Authorization": f"Bearer {self.apiKey}", + "Content-Type": "application/json" + } + ) + + response = await dalle_client.post( + dalle_url, + json=payload + ) + + await dalle_client.aclose() + + if response.status_code != 200: + logger.error(f"DALL-E API error: 
{response.status_code} - {response.text}") + return { + "success": False, + "error": f"DALL-E API error: {response.status_code} - {response.text}" + } + + responseJson = response.json() + + if "data" in responseJson and len(responseJson["data"]) > 0: + image_data = responseJson["data"][0]["b64_json"] + + logger.info(f"Successfully generated image: {len(image_data)} characters") + return { + "success": True, + "image_data": image_data, + "size": size, + "quality": quality, + "style": style + } + else: + logger.error("No image data in DALL-E response") + return { + "success": False, + "error": "No image data in DALL-E response" + } + + except Exception as e: + logger.error(f"Error during image generation: {str(e)}", exc_info=True) + return { + "success": False, + "error": f"Error during image generation: {str(e)}" + } \ No newline at end of file diff --git a/modules/connectors/connectorAiTavily.py b/modules/connectors/connectorAiTavily.py index f86c49b2..b7631ea3 100644 --- a/modules/connectors/connectorAiTavily.py +++ b/modules/connectors/connectorAiTavily.py @@ -271,6 +271,7 @@ class ConnectorWeb: include_domains: list[str] | None = None, exclude_domains: list[str] | None = None, language: str | None = None, + country: str | None = None, include_answer: bool | None = None, include_raw_content: bool | None = None, ) -> list[WebSearchResult]: @@ -290,17 +291,20 @@ class ConnectorWeb: kwargs["time_range"] = time_range if topic is not None: kwargs["topic"] = topic - if include_domains is not None: + if include_domains is not None and len(include_domains) > 0: kwargs["include_domains"] = include_domains if exclude_domains is not None: kwargs["exclude_domains"] = exclude_domains if language is not None: kwargs["language"] = language + if country is not None: + kwargs["country"] = country if include_answer is not None: kwargs["include_answer"] = include_answer if include_raw_content is not None: kwargs["include_raw_content"] = include_raw_content + 
logger.debug(f"Tavily.search kwargs: {kwargs}") response = await self.client.search(**kwargs) return [ diff --git a/modules/datamodels/datamodelAi.py b/modules/datamodels/datamodelAi.py index ad06f785..41c434da 100644 --- a/modules/datamodels/datamodelAi.py +++ b/modules/datamodels/datamodelAi.py @@ -135,3 +135,29 @@ class AiCallResponse(BaseModel): costEstimate: Optional[float] = Field(default=None, description="Estimated cost of the call") +class EnhancedAiCallOptions(AiCallOptions): + """Enhanced options for improved document processing with chunk mapping.""" + + # Parallel processing + enableParallelProcessing: bool = Field( + default=True, + description="Enable parallel processing of chunks" + ) + maxConcurrentChunks: int = Field( + default=5, + ge=1, + le=20, + description="Maximum number of chunks to process concurrently" + ) + + # Chunk mapping + preserveChunkMetadata: bool = Field( + default=True, + description="Preserve chunk metadata during processing" + ) + chunkSeparator: str = Field( + default="\n\n---\n\n", + description="Separator between chunks in merged output" + ) + + diff --git a/modules/datamodels/datamodelDocument.py b/modules/datamodels/datamodelDocument.py new file mode 100644 index 00000000..4c37c106 --- /dev/null +++ b/modules/datamodels/datamodelDocument.py @@ -0,0 +1,130 @@ +from typing import Any, Dict, List, Optional, Literal, Union +from pydantic import BaseModel, Field +from datetime import datetime + + +class DocumentMetadata(BaseModel): + """Metadata for the entire document.""" + title: str = Field(description="Document title") + author: Optional[str] = Field(default=None, description="Document author") + created_at: datetime = Field(default_factory=datetime.now, description="Creation timestamp") + source_documents: List[str] = Field(default_factory=list, description="Source document IDs") + extraction_method: str = Field(default="ai_extraction", description="Method used for extraction") + version: str = Field(default="1.0", 
description="Document version") + + +class TableData(BaseModel): + """Structured table data.""" + headers: List[str] = Field(description="Table column headers") + rows: List[List[str]] = Field(description="Table data rows") + caption: Optional[str] = Field(default=None, description="Table caption") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Table metadata") + + +class ListItem(BaseModel): + """Individual list item with optional sub-items.""" + text: str = Field(description="List item text") + subitems: Optional[List['ListItem']] = Field(default=None, description="Nested sub-items") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Item metadata") + + +class BulletList(BaseModel): + """Bulleted or numbered list.""" + items: List[ListItem] = Field(description="List items") + list_type: Literal["bullet", "numbered", "checklist"] = Field(default="bullet", description="List type") + metadata: Dict[str, Any] = Field(default_factory=dict, description="List metadata") + + +class Paragraph(BaseModel): + """Text paragraph with optional formatting.""" + text: str = Field(description="Paragraph text") + formatting: Optional[Dict[str, Any]] = Field(default=None, description="Text formatting (bold, italic, etc.)") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Paragraph metadata") + + +class Heading(BaseModel): + """Document heading.""" + text: str = Field(description="Heading text") + level: int = Field(ge=1, le=6, description="Heading level (1-6)") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Heading metadata") + + +class CodeBlock(BaseModel): + """Code block with syntax highlighting.""" + code: str = Field(description="Code content") + language: Optional[str] = Field(default=None, description="Programming language") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Code block metadata") + + +class Image(BaseModel): + """Image with metadata.""" + data: str = 
Field(description="Base64 encoded image data") + alt_text: Optional[str] = Field(default=None, description="Alternative text") + caption: Optional[str] = Field(default=None, description="Image caption") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Image metadata") + + +class DocumentSection(BaseModel): + """A section of the document containing one or more content elements.""" + id: str = Field(description="Unique section identifier") + title: Optional[str] = Field(default=None, description="Section title") + content_type: Literal["table", "list", "paragraph", "heading", "code", "image", "mixed"] = Field(description="Primary content type") + elements: List[Union[TableData, BulletList, Paragraph, Heading, CodeBlock, Image]] = Field(description="Content elements in this section") + order: int = Field(description="Section order in document") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Section metadata") + + +class StructuredDocument(BaseModel): + """Complete structured document in JSON format.""" + metadata: DocumentMetadata = Field(description="Document metadata") + sections: List[DocumentSection] = Field(description="Document sections") + summary: Optional[str] = Field(default=None, description="Document summary") + tags: List[str] = Field(default_factory=list, description="Document tags") + + def get_sections_by_type(self, content_type: str) -> List[DocumentSection]: + """Get all sections of a specific content type.""" + return [section for section in self.sections if section.content_type == content_type] + + def get_all_tables(self) -> List[TableData]: + """Get all table data from the document.""" + tables = [] + for section in self.sections: + for element in section.elements: + if isinstance(element, TableData): + tables.append(element) + return tables + + def get_all_lists(self) -> List[BulletList]: + """Get all lists from the document.""" + lists = [] + for section in self.sections: + for element in 
section.elements: + if isinstance(element, BulletList): + lists.append(element) + return lists + + +class JsonChunkResult(BaseModel): + """Result from processing a single chunk with JSON output.""" + chunk_id: str = Field(description="Chunk identifier") + document_section: DocumentSection = Field(description="Structured content from this chunk") + processing_time: float = Field(description="Processing time in seconds") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Chunk processing metadata") + + +class JsonMergeResult(BaseModel): + """Result from merging multiple JSON chunks.""" + merged_document: StructuredDocument = Field(description="Merged structured document") + merge_strategy: str = Field(description="Strategy used for merging") + chunks_processed: int = Field(description="Number of chunks processed") + merge_time: float = Field(description="Time taken to merge chunks") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Merge process metadata") + + +# Update forward references (compatible with Pydantic v1 and v2) +try: + # Pydantic v2 + ListItem.model_rebuild() +except AttributeError: + # Pydantic v1 + ListItem.update_forward_refs() diff --git a/modules/datamodels/datamodelExtraction.py b/modules/datamodels/datamodelExtraction.py index ff44aa19..cfce0275 100644 --- a/modules/datamodels/datamodelExtraction.py +++ b/modules/datamodels/datamodelExtraction.py @@ -18,6 +18,16 @@ class ContentExtracted(BaseModel): summary: Optional[Dict[str, Any]] = Field(default=None, description="Optional extraction summary") +class ChunkResult(BaseModel): + """Preserves the relationship between a chunk and its AI result.""" + originalChunk: ContentPart + aiResult: str + chunkIndex: int + documentId: str + processingTime: float = 0.0 + metadata: Dict[str, Any] = Field(default_factory=dict) + + class MergeStrategy(BaseModel): """Strategy configuration for merging content parts and AI results.""" diff --git 
a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py index 3e9c744d..7c3c5ce2 100644 --- a/modules/interfaces/interfaceAiObjects.py +++ b/modules/interfaces/interfaceAiObjects.py @@ -1,4 +1,5 @@ import logging +import asyncio from typing import Dict, Any, List, Union, Tuple, Optional from dataclasses import dataclass @@ -260,6 +261,7 @@ class AiObjects: if not requiredTags: requiredTags = OPERATION_TAG_MAPPING.get(options.operationType, [ModelTags.TEXT, ModelTags.CHAT]) + # Override priority based on processing mode if not explicitly set effectivePriority = options.priority if options.priority == Priority.BALANCED: @@ -268,6 +270,7 @@ class AiObjects: logger.info(f"Model selection - Operation: {options.operationType}, Required tags: {requiredTags}, Priority: {effectivePriority}") for name, info in aiModels.items(): + logger.info(f"Checking model: {name}, tags: {info.get('tags', [])}, function: {info.get('function', 'unknown')}") # Check context length if info["contextLength"] > 0 and totalSize > info["contextLength"] * 0.8: continue @@ -279,8 +282,11 @@ class AiObjects: # Check required tags/capabilities modelTags = info.get("tags", []) - if requiredTags and not any(tag in modelTags for tag in requiredTags): + if requiredTags and not all(tag in modelTags for tag in requiredTags): + logger.info(f" -> Skipping {name}: missing required tags. 
Has: {modelTags}, needs: {requiredTags}") continue + else: + logger.info(f" -> {name} passed tag check") # Check processing mode requirements if options.processingMode == ProcessingMode.DETAILED and ModelTags.FAST in modelTags: @@ -288,16 +294,24 @@ class AiObjects: continue candidates[name] = info + logger.info(f" -> {name} added to candidates") + + logger.info(f"Final candidates: {list(candidates.keys())}") if not candidates: + logger.info("No candidates found, using fallback") # Fallback based on operation type if options.operationType == OperationType.IMAGE_ANALYSIS: + logger.info("Using fallback: openai_callAiImage") return "openai_callAiImage" elif options.operationType == OperationType.IMAGE_GENERATION: + logger.info("Using fallback: openai_generateImage") return "openai_generateImage" elif options.operationType == OperationType.WEB_RESEARCH: + logger.info("Using fallback: perplexity_callAiWithWebSearch") return "perplexity_callAiWithWebSearch" else: + logger.info("Using fallback: openai_callAiBasic_gpt35") return "openai_callAiBasic_gpt35" # Special handling for planning operations - use Claude for consistency @@ -313,17 +327,60 @@ class AiObjects: # Select based on priority for other operations if effectivePriority == Priority.SPEED: - return max(candidates, key=lambda k: candidates[k]["speedRating"]) + selected = max(candidates, key=lambda k: candidates[k]["speedRating"]) + logger.info(f"Selected by SPEED: {selected}") + return selected elif effectivePriority == Priority.QUALITY: - return max(candidates, key=lambda k: candidates[k]["qualityRating"]) + selected = max(candidates, key=lambda k: candidates[k]["qualityRating"]) + logger.info(f"Selected by QUALITY: {selected}") + return selected elif effectivePriority == Priority.COST: - return min(candidates, key=lambda k: candidates[k]["costPer1kTokens"]) + selected = min(candidates, key=lambda k: candidates[k]["costPer1kTokens"]) + logger.info(f"Selected by COST: {selected}") + return selected else: # 
BALANCED def balancedScore(name: str) -> float: info = candidates[name] return info["qualityRating"] * 0.4 + info["speedRating"] * 0.3 + (10 - info["costPer1kTokens"] * 1000) * 0.3 - return max(candidates, key=balancedScore) + selected = max(candidates, key=balancedScore) + logger.info(f"Selected by BALANCED: {selected}") + return selected + + def _getFallbackModels(self, operationType: str) -> List[str]: + """Get ordered list of fallback models for a given operation type.""" + fallbackMappings = { + OperationType.GENERAL: [ + "openai_callAiBasic_gpt35", # Fast and reliable + "openai_callAiBasic", # High quality + "anthropic_callAiBasic", # Alternative high quality + "perplexity_callAiBasic" # Cost effective + ], + OperationType.IMAGE_ANALYSIS: [ + "openai_callAiImage", # Primary image analysis + "anthropic_callAiImage" # Alternative image analysis + ], + OperationType.IMAGE_GENERATION: [ + "openai_generateImage" # Only image generation model + ], + OperationType.WEB_RESEARCH: [ + "perplexity_callAiWithWebSearch", # Primary web research + "perplexity_callAiBasic", # Alternative with web search + "openai_callAiBasic" # Fallback to general model + ], + OperationType.GENERATE_PLAN: [ + "anthropic_callAiBasic", # Best for planning + "openai_callAiBasic", # High quality alternative + "openai_callAiBasic_gpt35" # Fast fallback + ], + OperationType.ANALYSE_CONTENT: [ + "anthropic_callAiBasic", # Best for analysis + "openai_callAiBasic", # High quality alternative + "openai_callAiBasic_gpt35" # Fast fallback + ] + } + + return fallbackMappings.get(operationType, fallbackMappings[OperationType.GENERAL]) def _connectorFor(self, modelName: str): """Get the appropriate connector for the model.""" @@ -340,7 +397,7 @@ class AiObjects: raise ValueError(f"Unknown connector type: {connectorType}") async def call(self, request: AiCallRequest) -> AiCallResponse: - """Call AI model for text generation.""" + """Call AI model for text generation with fallback mechanism.""" prompt = 
request.prompt context = request.context or "" options = request.options @@ -357,9 +414,6 @@ class AiObjects: if options.compressContext and len(context.encode("utf-8")) > 70000: context = maybeTruncate(context, 70000) - # Select model for text generation - modelName = self._selectModel(prompt, context, options) - # Derive generation parameters temperature = getattr(options, "temperature", None) if temperature is None: @@ -376,58 +430,112 @@ class AiObjects: messages.append({"role": "system", "content": f"Context from documents:\n{context}"}) messages.append({"role": "user", "content": prompt}) - connector = self._connectorFor(modelName) - functionName = aiModels[modelName]["function"] + # Get fallback models for this operation type + fallbackModels = self._getFallbackModels(options.operationType) - # Call the appropriate function - if functionName == "callAiBasic": - if aiModels[modelName]["connector"] == "openai": - content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens) - elif aiModels[modelName]["connector"] == "perplexity": - content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens) - else: - response = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens) - content = response["choices"][0]["message"]["content"] - elif functionName == "callAiWithWebSearch": - # Perplexity web search function - query = prompt - if context: - query = f"Context: {context}\n\nQuery: {prompt}" - content = await connector.callAiWithWebSearch(query) - elif functionName == "researchTopic": - # Perplexity research function - content = await connector.researchTopic(prompt) - elif functionName == "answerQuestion": - # Perplexity question answering function - content = await connector.answerQuestion(prompt, context) - elif functionName == "getCurrentNews": - # Perplexity news function - content = await connector.getCurrentNews(prompt) - else: - raise ValueError(f"Function {functionName} 
not supported for text generation") + # Try primary model first, then fallbacks + lastError = None + for attempt, modelName in enumerate(fallbackModels): + try: + logger.info(f"Attempting AI call with model: {modelName} (attempt {attempt + 1}/{len(fallbackModels)})") + + connector = self._connectorFor(modelName) + functionName = aiModels[modelName]["function"] + + # Call the appropriate function + if functionName == "callAiBasic": + if aiModels[modelName]["connector"] == "openai": + content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens) + elif aiModels[modelName]["connector"] == "perplexity": + content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens) + else: + response = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens) + content = response["choices"][0]["message"]["content"] + elif functionName == "callAiWithWebSearch": + # Perplexity web search function + query = prompt + if context: + query = f"Context: {context}\n\nQuery: {prompt}" + content = await connector.callAiWithWebSearch(query) + elif functionName == "researchTopic": + # Perplexity research function + content = await connector.researchTopic(prompt) + elif functionName == "answerQuestion": + # Perplexity question answering function + content = await connector.answerQuestion(prompt, context) + elif functionName == "getCurrentNews": + # Perplexity news function + content = await connector.getCurrentNews(prompt) + else: + raise ValueError(f"Function {functionName} not supported for text generation") - # Estimate cost/tokens - totalSize = len((prompt + context).encode("utf-8")) - cost = self._estimateCost(aiModels[modelName], totalSize) - usedTokens = int(totalSize / 4) + # Success! 
Estimate cost/tokens and return + totalSize = len((prompt + context).encode("utf-8")) + cost = self._estimateCost(aiModels[modelName], totalSize) + usedTokens = int(totalSize / 4) + + logger.info(f"βœ… AI call successful with model: {modelName}") + return AiCallResponse(content=content, modelName=modelName, usedTokens=usedTokens, costEstimate=cost) + + except Exception as e: + lastError = e + logger.warning(f"❌ AI call failed with model {modelName}: {str(e)}") + + # If this is not the last model, try the next one + if attempt < len(fallbackModels) - 1: + logger.info(f"πŸ”„ Trying next fallback model...") + continue + else: + # All models failed + logger.error(f"πŸ’₯ All {len(fallbackModels)} models failed for operation {options.operationType}") + break - return AiCallResponse(content=content, modelName=modelName, usedTokens=usedTokens, costEstimate=cost) + # All fallback attempts failed + errorMsg = f"All AI models failed for operation {options.operationType}. Last error: {str(lastError)}" + logger.error(errorMsg) + raise Exception(errorMsg) async def callImage(self, prompt: str, imageData: Union[str, bytes], mimeType: str = None, options: AiCallOptions = None) -> str: - """Call AI model for image analysis.""" + """Call AI model for image analysis with fallback mechanism.""" if options is None: options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS) - # Select model for image analysis - modelName = self._selectModel(prompt, "", options) + # Get fallback models for image analysis + fallbackModels = self._getFallbackModels(OperationType.IMAGE_ANALYSIS) - connector = self._connectorFor(modelName) - functionName = aiModels[modelName]["function"] - - if functionName == "callAiImage": - return await connector.callAiImage(prompt, imageData, mimeType) - else: - raise ValueError(f"Function {functionName} not supported for image analysis") + # Try primary model first, then fallbacks + lastError = None + for attempt, modelName in enumerate(fallbackModels): + try: 
+ logger.info(f"Attempting image analysis with model: {modelName} (attempt {attempt + 1}/{len(fallbackModels)})") + + connector = self._connectorFor(modelName) + functionName = aiModels[modelName]["function"] + + if functionName == "callAiImage": + content = await connector.callAiImage(prompt, imageData, mimeType) + logger.info(f"βœ… Image analysis successful with model: {modelName}") + return content + else: + raise ValueError(f"Function {functionName} not supported for image analysis") + + except Exception as e: + lastError = e + logger.warning(f"❌ Image analysis failed with model {modelName}: {str(e)}") + + # If this is not the last model, try the next one + if attempt < len(fallbackModels) - 1: + logger.info(f"πŸ”„ Trying next fallback model for image analysis...") + continue + else: + # All models failed + logger.error(f"πŸ’₯ All {len(fallbackModels)} models failed for image analysis") + break + + # All fallback attempts failed + errorMsg = f"All AI models failed for image analysis. 
Last error: {str(lastError)}" + logger.error(errorMsg) + raise Exception(errorMsg) async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", options: AiCallOptions = None) -> Dict[str, Any]: """Generate an image using AI.""" @@ -694,7 +802,22 @@ class AiObjects: logger.warning(f"Failed to extract links from content: {e}") return [] - async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10) -> Dict[str, str]: + def _normalizeUrl(self, url: str) -> str: + """Normalize URL to handle variations that should be considered duplicates.""" + if not url: + return url + + # Remove trailing slashes and fragments + url = url.rstrip('/') + if '#' in url: + url = url.split('#')[0] + + # Handle common URL variations + url = url.replace('http://', 'https://') # Normalize protocol + + return url + + async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]: """ Recursively crawl URLs up to specified depth. @@ -703,76 +826,100 @@ class AiObjects: max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.) 
extract_depth: Tavily extract depth setting max_per_domain: Maximum URLs per domain per level + global_processed_urls: Optional global set to track processed URLs across sessions Returns: Dictionary mapping URL -> content for all crawled pages """ logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}") - # URL index to track all processed URLs + # URL index to track all processed URLs (local + global) processed_urls = set() + if global_processed_urls is not None: + # Use global index if provided, otherwise create local one + processed_urls = global_processed_urls + logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs") + else: + logger.info("Using local URL index for this crawl session") + all_content = {} # Current level URLs to process current_level_urls = urls.copy() - for depth in range(1, max_depth + 1): - logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===") - logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}") - - # URLs found at this level (for next iteration) - next_level_urls = [] - - for url in current_level_urls: - if url in processed_urls: - logger.debug(f"URL {url} already processed, skipping") - continue + try: + for depth in range(1, max_depth + 1): + logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===") + logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}") - try: - logger.info(f"Processing URL at depth {depth}: {url}") + # URLs found at this level (for next iteration) + next_level_urls = [] + + for url in current_level_urls: + # Normalize URL for duplicate checking + normalized_url = self._normalizeUrl(url) + if normalized_url in processed_urls: + logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping") + continue - # Read page content - content = await self.readPage(url, extract_depth) - if content: - all_content[url] = content - processed_urls.add(url) - logger.info(f"βœ“ Successfully processed {url}: 
{len(content)} chars") + try: + logger.info(f"Processing URL at depth {depth}: {url}") + logger.debug(f"Total processed URLs so far: {len(processed_urls)}") - # Get URLs from this page for next level - page_urls = await self.getUrlsFromPage(url, extract_depth) - logger.info(f"Found {len(page_urls)} URLs on {url}") - - # Filter URLs and add to next level - filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain) - logger.info(f"Filtered to {len(filtered_urls)} valid URLs") - - # Add new URLs to next level (avoiding already processed ones) - new_urls_count = 0 - for new_url in filtered_urls: - if new_url not in processed_urls: - next_level_urls.append(new_url) - new_urls_count += 1 - - logger.info(f"Added {new_urls_count} new URLs to next level from {url}") - else: - logger.warning(f"βœ— No content extracted from {url}") - processed_urls.add(url) # Mark as processed to avoid retry - - except Exception as e: - logger.warning(f"βœ— Failed to process URL {url} at depth {depth}: {e}") - processed_urls.add(url) # Mark as processed to avoid retry + # Read page content + content = await self.readPage(url, extract_depth) + if content: + all_content[url] = content + processed_urls.add(normalized_url) + logger.info(f"βœ“ Successfully processed {url}: {len(content)} chars") + + # Get URLs from this page for next level + page_urls = await self.getUrlsFromPage(url, extract_depth) + logger.info(f"Found {len(page_urls)} URLs on {url}") + + # Filter URLs and add to next level + filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain) + logger.info(f"Filtered to {len(filtered_urls)} valid URLs") + + # Add new URLs to next level (avoiding already processed ones) + new_urls_count = 0 + for new_url in filtered_urls: + normalized_new_url = self._normalizeUrl(new_url) + if normalized_new_url not in processed_urls: + next_level_urls.append(new_url) + new_urls_count += 1 + else: + logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, 
skipping") + + logger.info(f"Added {new_urls_count} new URLs to next level from {url}") + else: + logger.warning(f"βœ— No content extracted from {url}") + processed_urls.add(normalized_url) # Mark as processed to avoid retry + + except Exception as e: + logger.warning(f"βœ— Failed to process URL {url} at depth {depth}: {e}") + processed_urls.add(normalized_url) # Mark as processed to avoid retry + + # Prepare for next iteration + current_level_urls = next_level_urls + logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level") + + # Stop if no more URLs to process + if not current_level_urls: + logger.info(f"No more URLs found at depth {depth}, stopping recursion") + break - # Prepare for next iteration - current_level_urls = next_level_urls - logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level") + logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled") + logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}") + logger.info(f"Unique URLs found: {len(all_content)}") + return all_content - # Stop if no more URLs to process - if not current_level_urls: - logger.info(f"No more URLs found at depth {depth}, stopping recursion") - break - - logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled") - return all_content + except asyncio.TimeoutError: + logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far") + return all_content + except Exception as e: + logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far") + return all_content async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> str: """Use Perplexity AI to provide the best answers for web-related queries.""" diff --git a/modules/interfaces/interfaceDbChatObjects.py b/modules/interfaces/interfaceDbChatObjects.py index 319c1703..ff18a9a9 
100644 --- a/modules/interfaces/interfaceDbChatObjects.py +++ b/modules/interfaces/interfaceDbChatObjects.py @@ -571,8 +571,10 @@ class ChatObjects: actionName=createdMessage.get("actionName") ) - # Debug: Store message and documents for debugging TODO REMOVE - self._storeDebugMessageAndDocuments(chat_message) + # Debug: Store message and documents for debugging - only if debug enabled + debug_enabled = APP_CONFIG.get("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if debug_enabled: + self._storeDebugMessageAndDocuments(chat_message) return chat_message @@ -1052,8 +1054,11 @@ class ChatObjects: def _storeDebugMessageAndDocuments(self, message: ChatMessage) -> None: """ - Store message and documents for debugging purposes in fileshare. - Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/documents + Store message and documents (metadata and file bytes) for debugging purposes. + Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/ + - message.json, message_text.txt + - document_###_metadata.json + - document_###_ (actual file bytes) Args: message: ChatMessage object to store @@ -1156,6 +1161,26 @@ class ChatObjects: json.dump(doc_meta, f, indent=2, ensure_ascii=False, default=str) logger.info(f"Debug: Stored document metadata for {doc.fileName}") + + # Also store the actual file bytes next to metadata for debugging + try: + # Lazy import to avoid circular deps at module load + from modules.interfaces import interfaceDbComponentObjects as comp + componentInterface = comp.getInterface(self.currentUser) + file_bytes = componentInterface.getFileData(doc.fileId) + if file_bytes: + # Build a safe filename preserving original name + safe_name = doc.fileName or f"document_{i+1:03d}" + # Avoid path traversal + safe_name = os.path.basename(safe_name) + doc_file_path = os.path.join(label_folder, f"document_{i+1:03d}_" + safe_name) + with open(doc_file_path, "wb") as df: + df.write(file_bytes) + 
logger.info(f"Debug: Stored document file bytes: {doc_file_path} ({len(file_bytes)} bytes)") + else: + logger.warning(f"Debug: No file bytes returned for fileId {doc.fileId}") + except Exception as e: + logger.error(f"Debug: Failed to store document file for {doc.fileName} (fileId {doc.fileId}): {e}") logger.info(f"Debug: Stored message and documents in {message_path}") diff --git a/modules/routes/routeDataPrompts.py b/modules/routes/routeDataPrompts.py index a9c8952f..b3ae6f47 100644 --- a/modules/routes/routeDataPrompts.py +++ b/modules/routes/routeDataPrompts.py @@ -95,8 +95,8 @@ async def update_prompt( detail=f"Prompt with ID {promptId} not found" ) - # Convert Prompt to dict for interface - update_data = promptData.dict() + # Convert Prompt to dict for interface, excluding the id field + update_data = promptData.dict(exclude={'id'}) # Update prompt updatedPrompt = managementInterface.updatePrompt(promptId, update_data) diff --git a/modules/routes/routeSecurityLocal.py b/modules/routes/routeSecurityLocal.py index 7a7bc0f2..f24aee78 100644 --- a/modules/routes/routeSecurityLocal.py +++ b/modules/routes/routeSecurityLocal.py @@ -14,7 +14,7 @@ from pydantic import BaseModel # Import auth modules from modules.security.auth import getCurrentUser, limiter, SECRET_KEY, ALGORITHM -from modules.security.jwtService import createAccessToken, createRefreshToken, setAccessTokenCookie, setRefreshTokenCookie +from modules.security.jwtService import createAccessToken, createRefreshToken, setAccessTokenCookie, setRefreshTokenCookie, clearAccessTokenCookie, clearRefreshTokenCookie from modules.interfaces.interfaceDbAppObjects import getInterface, getRootInterface from modules.datamodels.datamodelUam import User, UserInDB, AuthAuthority, UserPrivilege from modules.datamodels.datamodelSecurity import Token @@ -263,8 +263,7 @@ async def read_user_me( @limiter.limit("60/minute") async def refresh_token( request: Request, - response: Response, - currentUser: User = 
Depends(getCurrentUser) + response: Response ) -> Dict[str, Any]: """Refresh access token using refresh token from cookie""" try: @@ -283,12 +282,27 @@ async def refresh_token( except jwt.JWTError: raise HTTPException(status_code=401, detail="Invalid refresh token") + # Get user information from refresh token payload + user_id = payload.get("userId") + if not user_id: + raise HTTPException(status_code=401, detail="Invalid refresh token - missing user ID") + + # Get user from database using the user ID from refresh token + try: + app_interface = getRootInterface() + current_user = app_interface.getUser(user_id) + if not current_user: + raise HTTPException(status_code=401, detail="User not found") + except Exception as e: + logger.error(f"Failed to get user from database: {str(e)}") + raise HTTPException(status_code=500, detail="Failed to validate user") + # Create new token data token_data = { - "sub": currentUser.username, - "mandateId": str(currentUser.mandateId), - "userId": str(currentUser.id), - "authenticationAuthority": currentUser.authenticationAuthority + "sub": current_user.username, + "mandateId": str(current_user.mandateId), + "userId": str(current_user.id), + "authenticationAuthority": current_user.authenticationAuthority } # Create new access token + set cookie @@ -365,15 +379,18 @@ async def logout(request: Request, response: Response, currentUser: User = Depen # Don't fail if audit logging fails pass - # Clear httpOnly cookies - response.delete_cookie(key="auth_token", httponly=True, samesite="strict") - response.delete_cookie(key="refresh_token", httponly=True, samesite="strict") - - return JSONResponse({ + # Create the JSON response first + json_response = JSONResponse({ "message": "Successfully logged out - cookies cleared", "revokedTokens": revoked }) + # Clear httpOnly cookies on the response we're actually returning + clearAccessTokenCookie(json_response) + clearRefreshTokenCookie(json_response) + + return json_response + except Exception as e: 
logger.error(f"Error during logout: {str(e)}") raise HTTPException( diff --git a/modules/security/jwtService.py b/modules/security/jwtService.py index 5e09e63e..87e226c7 100644 --- a/modules/security/jwtService.py +++ b/modules/security/jwtService.py @@ -17,6 +17,11 @@ ALGORITHM = APP_CONFIG.get("Auth_ALGORITHM") ACCESS_TOKEN_EXPIRE_MINUTES = int(APP_CONFIG.get("APP_TOKEN_EXPIRY")) REFRESH_TOKEN_EXPIRE_DAYS = int(APP_CONFIG.get("APP_REFRESH_TOKEN_EXPIRY", "7")) +# Cookie security settings - use secure cookies based on whether API uses HTTPS +# Cookies must have secure=True on HTTPS sites, secure=False on HTTP sites +APP_API_URL = APP_CONFIG.get("APP_API_URL", "http://localhost:8000") +USE_SECURE_COOKIES = APP_API_URL.startswith("https://") if APP_API_URL else False + def createAccessToken(data: dict, expiresDelta: Optional[timedelta] = None) -> Tuple[str, "datetime"]: """Create a JWT access token and return (token, expiresAt).""" @@ -52,8 +57,9 @@ def setAccessTokenCookie(response: Response, token: str, expiresDelta: Optional[ key="auth_token", value=token, httponly=True, - secure=True, + secure=USE_SECURE_COOKIES, # Only secure in production (HTTPS) samesite="strict", + path="/", max_age=maxAge ) @@ -64,9 +70,46 @@ def setRefreshTokenCookie(response: Response, token: str) -> None: key="refresh_token", value=token, httponly=True, - secure=True, + secure=USE_SECURE_COOKIES, # Only secure in production (HTTPS) samesite="strict", + path="/", max_age=REFRESH_TOKEN_EXPIRE_DAYS * 24 * 60 * 60 ) +def clearAccessTokenCookie(response: Response) -> None: + """ + Clear access token cookie by setting it to expire immediately. + Uses both raw header manipulation and FastAPI's delete_cookie for maximum browser compatibility. 
+ """ + # Build secure flag based on environment + secure_flag = "; Secure" if USE_SECURE_COOKIES else "" + + # Primary method: Raw Set-Cookie header for guaranteed deletion + response.headers.append( + "Set-Cookie", + f"auth_token=deleted; Path=/; Max-Age=0; Expires=Thu, 01 Jan 1970 00:00:00 GMT; HttpOnly{secure_flag}; SameSite=Strict" + ) + + # Fallback: Also use FastAPI's built-in method + response.delete_cookie(key="auth_token", path="/") + + +def clearRefreshTokenCookie(response: Response) -> None: + """ + Clear refresh token cookie by setting it to expire immediately. + Uses both raw header manipulation and FastAPI's delete_cookie for maximum browser compatibility. + """ + # Build secure flag based on environment + secure_flag = "; Secure" if USE_SECURE_COOKIES else "" + + # Primary method: Raw Set-Cookie header for guaranteed deletion + response.headers.append( + "Set-Cookie", + f"refresh_token=deleted; Path=/; Max-Age=0; Expires=Thu, 01 Jan 1970 00:00:00 GMT; HttpOnly{secure_flag}; SameSite=Strict" + ) + + # Fallback: Also use FastAPI's built-in method + response.delete_cookie(key="refresh_token", path="/") + + diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index 16619a52..ed5e318a 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -5,6 +5,7 @@ from modules.datamodels.datamodelChat import PromptPlaceholder from modules.datamodels.datamodelChat import ChatDocument from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, ModelCapabilities, OperationType, Priority +from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted from modules.datamodels.datamodelWeb import ( WebResearchRequest, WebResearchActionResult, @@ -13,16 +14,30 @@ from modules.datamodels.datamodelWeb import ( WebSearchResultItem, ) from 
modules.interfaces.interfaceAiObjects import AiObjects +from modules.shared.configuration import APP_CONFIG +from modules.services.serviceAi.subCoreAi import SubCoreAi +from modules.services.serviceAi.subDocumentProcessing import SubDocumentProcessing +from modules.services.serviceAi.subWebResearch import SubWebResearch +from modules.services.serviceAi.subDocumentGeneration import SubDocumentGeneration +from modules.services.serviceAi.subUtilities import SubUtilities logger = logging.getLogger(__name__) - -# Model registry is now provided by interfaces via AiModels - - class AiService: - """Centralized AI service orchestrating documents, model selection, failover, and web operations. + """Lightweight AI service orchestrator that delegates to specialized sub-modules. + + Manager delegates to specialized sub-modules: + - SubCoreAi: Core AI operations (readImage, generateImage, callAi, planning, text calls) + - SubDocumentProcessing: Document chunking, processing, and merging logic + - SubWebResearch: Web research and crawling functionality + - SubDocumentGeneration: Single-file and multi-file document generation + - SubUtilities: Helper functions, text processing, and debugging utilities + + The main service acts as a coordinator: + 1. Manages lazy initialization of sub-modules + 2. Delegates operations to appropriate sub-modules + 3. 
Maintains the same public API for backward compatibility """ def __init__(self, serviceCenter=None) -> None: @@ -31,19 +46,64 @@ class AiService: Args: serviceCenter: Service center instance for accessing other services """ - self.serviceCenter = serviceCenter + self.services = serviceCenter # Only depend on interfaces self.aiObjects = None # Will be initialized in create() self._extractionService = None # Lazy initialization + self._coreAi = None # Lazy initialization + self._documentProcessor = None # Lazy initialization + self._webResearch = None # Lazy initialization + self._documentGenerator = None # Lazy initialization + self._utilities = None # Lazy initialization @property def extractionService(self): """Lazy initialization of extraction service.""" if self._extractionService is None: logger.info("Lazy initializing ExtractionService...") - self._extractionService = ExtractionService() + self._extractionService = ExtractionService(self.services) return self._extractionService + @property + def coreAi(self): + """Lazy initialization of core AI service.""" + if self._coreAi is None: + logger.info("Lazy initializing SubCoreAi...") + self._coreAi = SubCoreAi(self.services, self.aiObjects) + return self._coreAi + + @property + def documentProcessor(self): + """Lazy initialization of document processing service.""" + if self._documentProcessor is None: + logger.info("Lazy initializing SubDocumentProcessing...") + self._documentProcessor = SubDocumentProcessing(self.services, self.aiObjects) + return self._documentProcessor + + @property + def webResearchService(self): + """Lazy initialization of web research service.""" + if self._webResearch is None: + logger.info("Lazy initializing SubWebResearch...") + self._webResearch = SubWebResearch(self.services, self.aiObjects) + return self._webResearch + + @property + def documentGenerator(self): + """Lazy initialization of document generation service.""" + if self._documentGenerator is None: + logger.info("Lazy 
initializing SubDocumentGeneration...") + self._documentGenerator = SubDocumentGeneration(self.services, self.aiObjects, self.documentProcessor) + return self._documentGenerator + + @property + def utilities(self): + """Lazy initialization of utilities service.""" + if self._utilities is None: + logger.info("Lazy initializing SubUtilities...") + self._utilities = SubUtilities(self.services) + return self._utilities + async def _ensureAiObjectsInitialized(self): """Ensure aiObjects is initialized.""" if self.aiObjects is None: @@ -70,11 +130,8 @@ class AiService: options: Optional[AiCallOptions] = None, ) -> str: """Call AI for image analysis using interface.callImage().""" - try: - return await self.aiObjects.callImage(prompt, imageData, mimeType, options) - except Exception as e: - logger.error(f"Error in AI image analysis: {str(e)}") - return f"Error: {str(e)}" + await self._ensureAiObjectsInitialized() + return await self.coreAi.readImage(prompt, imageData, mimeType, options) # AI Image Generation async def generateImage( @@ -86,621 +143,16 @@ class AiService: options: Optional[AiCallOptions] = None, ) -> Dict[str, Any]: """Generate an image using AI using interface.generateImage().""" - try: - return await self.aiObjects.generateImage(prompt, size, quality, style, options) - except Exception as e: - logger.error(f"Error in AI image generation: {str(e)}") - return {"success": False, "error": str(e)} + await self._ensureAiObjectsInitialized() + return await self.coreAi.generateImage(prompt, size, quality, style, options) - # Web Research - Using interface functions + # Web Research async def webResearch(self, request: WebResearchRequest) -> WebResearchActionResult: """Perform web research using interface functions.""" - try: - logger.info(f"WEB RESEARCH STARTED") - logger.info(f"User Query: {request.user_prompt}") - logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}") - - # Step 1: Find relevant websites - either provided 
URLs or AI-determined main URLs - logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===") - - if request.urls: - # Use provided URLs as initial main URLs - websites = request.urls - logger.info(f"Using provided URLs ({len(websites)}):") - for i, url in enumerate(websites, 1): - logger.info(f" {i}. {url}") - else: - # Use AI to determine main URLs based on user's intention - logger.info(f"AI analyzing user intent: '{request.user_prompt}'") - - # Use AI to generate optimized Tavily search query and search parameters - query_optimizer_prompt = f"""You are a search query optimizer. - - USER QUERY: {request.user_prompt} - - Your task: Create a search query and parameters for the USER QUERY given. - - RULES: - 1. The search query MUST be related to the user query above - 2. Extract key terms from the user query - 3. Determine appropriate country/language based on the query context - 4. Keep search query short (2-6 words) - - Return ONLY this JSON format: - {{ - "user_prompt": "search query based on user query above", - "country": "country_code_or_null", - "language": "language_code_or_null", - "topic": "general|news|academic_or_null", - "time_range": "d|w|m|y_or_null", - "selection_strategy": "single|multiple|specific_page", - "selection_criteria": "what URLs to prioritize", - "expected_url_patterns": ["pattern1", "pattern2"], - "estimated_result_count": number - }}""" - - # Get AI response for query optimization - ai_request = AiCallRequest( - prompt=query_optimizer_prompt, - options=AiCallOptions() - ) - ai_response_obj = await self.aiObjects.call(ai_request) - ai_response = ai_response_obj.content - logger.debug(f"AI query optimizer response: {ai_response}") - - # Parse AI response to extract search query - import json - try: - # Clean the response by removing markdown code blocks - cleaned_response = ai_response.strip() - if cleaned_response.startswith('```json'): - cleaned_response = cleaned_response[7:] # Remove ```json - if cleaned_response.endswith('```'): - 
cleaned_response = cleaned_response[:-3] # Remove ``` - cleaned_response = cleaned_response.strip() - - query_data = json.loads(cleaned_response) - search_query = query_data.get("user_prompt", request.user_prompt) - ai_country = query_data.get("country") - ai_language = query_data.get("language") - ai_topic = query_data.get("topic") - ai_time_range = query_data.get("time_range") - selection_strategy = query_data.get("selection_strategy", "multiple") - selection_criteria = query_data.get("selection_criteria", "relevant URLs") - expected_patterns = query_data.get("expected_url_patterns", []) - estimated_count = query_data.get("estimated_result_count", request.max_results) - - logger.info(f"AI optimized search query: '{search_query}'") - logger.info(f"Selection strategy: {selection_strategy}") - logger.info(f"Selection criteria: {selection_criteria}") - logger.info(f"Expected URL patterns: {expected_patterns}") - logger.info(f"Estimated result count: {estimated_count}") - - except json.JSONDecodeError: - logger.warning("Failed to parse AI response as JSON, using original query") - search_query = request.user_prompt - ai_country = None - ai_language = None - ai_topic = None - ai_time_range = None - selection_strategy = "multiple" - - # Perform the web search with AI-determined parameters - search_kwargs = { - "query": search_query, - "max_results": request.max_results, - "search_depth": request.options.search_depth, - "auto_parameters": False # Use explicit parameters - } - - # Add parameters only if they have valid values - if ai_country and ai_country not in ['null', '', 'none', 'undefined']: - search_kwargs["country"] = ai_country - elif request.options.country and request.options.country not in ['null', '', 'none', 'undefined']: - search_kwargs["country"] = request.options.country - - if ai_language and ai_language not in ['null', '', 'none', 'undefined']: - search_kwargs["language"] = ai_language - elif request.options.language and request.options.language not in 
['null', '', 'none', 'undefined']: - search_kwargs["language"] = request.options.language - - if ai_topic and ai_topic in ['general', 'news', 'academic']: - search_kwargs["topic"] = ai_topic - elif request.options.topic and request.options.topic in ['general', 'news', 'academic']: - search_kwargs["topic"] = request.options.topic - - if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']: - search_kwargs["time_range"] = ai_time_range - elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']: - search_kwargs["time_range"] = request.options.time_range - - # Log the parameters being used - logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}") - - search_results = await self.aiObjects.search_websites(**search_kwargs) - - logger.debug(f"Web search returned {len(search_results)} results:") - for i, result in enumerate(search_results, 1): - logger.debug(f" {i}. {result.url} - {result.title}") - - # Deduplicate while preserving order - seen = set() - search_urls = [] - for r in search_results: - u = str(r.url) - if u not in seen: - seen.add(u) - search_urls.append(u) - - if not search_urls: - logger.error("No relevant websites found") - return WebResearchActionResult(success=False, error="No relevant websites found") - - # Now use AI to determine the main URLs based on user's intention - logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent") - - # Create a prompt for AI to identify main URLs based on user's intention - ai_prompt = f""" - Select the most relevant URLs from these search results: - - {chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])} - - Return only the URLs that are most relevant for the user's query. - One URL per line. 
- """ - # Create AI call request - ai_request = AiCallRequest( - prompt=ai_prompt, - options=AiCallOptions() - ) - ai_response_obj = await self.aiObjects.call(ai_request) - ai_response = ai_response_obj.content - logger.debug(f"AI response for main URL selection: {ai_response}") - - # Parse AI response to extract URLs - websites = [] - for line in ai_response.strip().split('\n'): - line = line.strip() - if line and ('http://' in line or 'https://' in line): - # Extract URL from the line - for word in line.split(): - if word.startswith('http://') or word.startswith('https://'): - websites.append(word.rstrip('.,;')) - break - - if not websites: - logger.warning("AI did not identify any main URLs, using first few search results") - websites = search_urls[:3] # Fallback to first 3 search results - - # Deduplicate while preserving order - seen = set() - unique_websites = [] - for url in websites: - if url not in seen: - seen.add(url) - unique_websites.append(url) - - websites = unique_websites - - logger.info(f"AI selected {len(websites)} main URLs (after deduplication):") - for i, url in enumerate(websites, 1): - logger.info(f" {i}. {url}") - - # Step 2: Smart website selection using AI interface - logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===") - logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'") - - selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt) - - logger.debug(f"AI Response: {aiResponse}") - logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:") - for i, url in enumerate(selectedWebsites, 1): - logger.debug(f" {i}. {url}") - - # Show which were filtered out - filtered_out = [url for url in websites if url not in selectedWebsites] - if filtered_out: - logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:") - for i, url in enumerate(filtered_out, 1): - logger.debug(f" {i}. 
{url}") - - # Step 3+4+5: Recursive crawling with configurable depth - logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {request.options.pages_search_depth}) ===") - logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...") - logger.info(f"Search depth: {request.options.pages_search_depth} levels") - logger.info(f"DEBUG: request.options.pages_search_depth = {request.options.pages_search_depth}") - - # Use recursive crawling with URL index to avoid duplicates - allContent = await self.aiObjects.crawlRecursively( - urls=selectedWebsites, - max_depth=request.options.pages_search_depth, - extract_depth=request.options.extract_depth, - max_per_domain=10 - ) - - if not allContent: - logger.error("Could not extract content from any websites") - return WebResearchActionResult(success=False, error="Could not extract content from any websites") - - logger.info(f"=== WEB RESEARCH COMPLETED ===") - logger.info(f"Successfully crawled {len(allContent)} URLs total") - logger.info(f"Crawl depth: {request.options.pages_search_depth} levels") - - # Create simple result with raw content - sources = [WebSearchResultItem(title=url, url=url) for url in selectedWebsites] - - # Get all additional links (all URLs except main ones) - additional_links = [url for url in allContent.keys() if url not in selectedWebsites] - - # Combine all content into a single result - combinedContent = "" - for url, content in allContent.items(): - combinedContent += f"\n\n=== {url} ===\n{content}\n" - - documentData = WebResearchDocumentData( - user_prompt=request.user_prompt, - websites_analyzed=len(allContent), - additional_links_found=len(additional_links), - analysis_result=combinedContent, # Raw content, no analysis - sources=sources, - additional_links=additional_links, - individual_content=allContent, # Individual URL -> content mapping - debug_info={ - "crawl_depth": request.options.pages_search_depth, - "total_urls_crawled": len(allContent), - "main_urls": 
len(selectedWebsites), - "additional_urls": len(additional_links) - } - ) - - document = WebResearchActionDocument( - documentName=f"web_research_{request.user_prompt[:50]}.json", - documentData=documentData, - mimeType="application/json" - ) - - return WebResearchActionResult( - success=True, - documents=[document], - resultLabel="web_research_results" - ) - - except Exception as e: - logger.error(f"Error in web research: {str(e)}") - return WebResearchActionResult(success=False, error=str(e)) - - async def _processDocumentsForAi( - self, - documents: List[ChatDocument], - operationType: str, - compressDocuments: bool, - processIndividually: bool, - userPrompt: str, - options: Optional[AiCallOptions] = None - ) -> str: - if not documents: - return "" - - # Calculate model-derived size limits - maxContextBytes = self._calculateMaxContextBytes(options) - - # Build extraction options with model-derived limits - extractionOptions: Dict[str, Any] = { - "prompt": f"Extract content that supports the user's request: '{userPrompt}'. 
Focus on information relevant to: {operationType}", - "operationType": operationType, - "processDocumentsIndividually": processIndividually, - "maxSize": maxContextBytes, - "chunkAllowed": not options.compressContext if options else True, - "textChunkSize": int(maxContextBytes * 0.3), # 30% of max for text chunks - "imageChunkSize": int(maxContextBytes * 0.5), # 50% of max for image chunks - "imageMaxPixels": 1024 * 1024, # 1MP default - "imageQuality": 85, - "mergeStrategy": { - "groupBy": "typeGroup", - "orderBy": "id", - "mergeType": "concatenate" - }, - } - - processedContents: List[str] = [] - - try: - # Use new ChatDocument-based API - logger.info(f"=== PROCESSING {len(documents)} DOCUMENTS FOR AI ===") - for i, doc in enumerate(documents): - logger.info(f"Document {i}: {doc.fileName} (MIME: {doc.mimeType})") - - extractionResult = self.extractionService.extractContent(documents, extractionOptions) - logger.info(f"Extraction completed: {len(extractionResult)} results") - - async def _partsToText(parts, documentName: str, documentType: str, logger_ref) -> str: - lines: List[str] = [] - logger_ref.debug(f"Processing {len(parts)} content parts for {documentName}") - - for p in parts: - logger_ref.debug(f" Part: {p.typeGroup} ({p.mimeType}) - {len(p.data) if p.data else 0} chars") - - if p.typeGroup in ("text", "table", "structure") and p.data and isinstance(p.data, str): - lines.append(p.data) - elif p.typeGroup == "image" and p.data: - # Use AI to extract text from image with user prompt - logger_ref.debug(f" Processing image with AI using user prompt...") - try: - imageResult = await self.aiObjects.callImage( - prompt=userPrompt, - imageData=p.data, - mimeType=p.mimeType - ) - lines.append(f"[Image Analysis]: {imageResult}") - logger_ref.debug(f" AI image analysis completed: {len(imageResult)} chars") - except Exception as e: - logger_ref.warning(f" AI image processing failed: {e}") - lines.append(f"[Image Analysis Failed]: {str(e)}") - return 
"\n\n".join(lines) - - if isinstance(extractionResult, list): - for i, ec in enumerate(extractionResult): - try: - # Get document info for this extraction result - doc = documents[i] if i < len(documents) else None - docName = doc.fileName if doc else f"Document_{i}" - docType = doc.mimeType if doc else "unknown" - - contentText = await _partsToText(ec.parts, docName, docType, logger) - logger.debug(f"Document {i} content: {len(contentText)} chars") - - if compressDocuments and len(contentText.encode("utf-8")) > 10000: - originalLength = len(contentText) - contentText = await self._compressContent(contentText, 10000, "document") - logger.debug(f"Document {i} compressed: {originalLength} -> {len(contentText)} chars") - - processedContents.append(contentText) - except Exception as e: - logger.warning(f"Error aggregating extracted content: {str(e)}") - processedContents.append("[Error aggregating content]") - else: - # Fallback: no content - contentText = "" - if compressDocuments and len(contentText.encode("utf-8")) > 10000: - contentText = await self._compressContent(contentText, 10000, "document") - processedContents.append(contentText) - except Exception as e: - logger.warning(f"Error during extraction: {str(e)}") - processedContents.append("[Error during extraction]") - - # Build JSON structure ONLY when adding to AI prompt - import json - documentsJson = [] - for i, content in enumerate(processedContents): - doc = documents[i] if i < len(documents) else None - docName = doc.fileName if doc else f"Document_{i}" - docType = doc.mimeType if doc else "unknown" - - documentData = { - "documentName": docName, - "documentType": docType, - "content": content - } - documentsJson.append(documentData) - - finalContext = json.dumps({ - "documents": documentsJson, - "totalDocuments": len(documentsJson) - }, indent=2, ensure_ascii=False) - - logger.debug(f"=== FINAL CONTEXT ===") - logger.debug(f"Total context: {len(finalContext)} chars") - logger.debug(f"Documents: 
{len(documentsJson)}") - return finalContext - - def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int: - """Calculate maximum context bytes based on model capabilities and options.""" - if options and options.maxContextBytes: - return options.maxContextBytes - - # Default model capabilities (this should be enhanced with actual model registry) - defaultMaxTokens = 4000 - safetyMargin = options.safetyMargin if options else 0.1 - - # Calculate bytes (4 chars per token estimation) - maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4) - - return maxContextBytes - - async def _processDocumentsPerChunk( - self, - documents: List[ChatDocument], - prompt: str, - options: Optional[AiCallOptions] = None - ) -> str: - """ - Process documents with per-chunk AI calls and merge results. - - Args: - documents: List of ChatDocument objects to process - prompt: AI prompt for processing - options: AI call options - - Returns: - Merged AI results as string - """ - if not documents: - return "" - - # Get model capabilities for size calculation - model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) - - # Build extraction options for chunking - extractionOptions: Dict[str, Any] = { - "prompt": prompt, - "operationType": options.operationType if options else "general", - "processDocumentsIndividually": True, # Process each document separately - "maxSize": model_capabilities["maxContextBytes"], - "chunkAllowed": True, - "textChunkSize": model_capabilities["textChunkSize"], - "imageChunkSize": model_capabilities["imageChunkSize"], - "imageMaxPixels": 1024 * 1024, - "imageQuality": 85, - "mergeStrategy": { - "groupBy": "typeGroup", - "orderBy": "id", - "mergeType": "concatenate" - }, - } - - logger.debug(f"Per-chunk extraction options: {extractionOptions}") - - try: - # Extract content with chunking - extractionResult = self.extractionService.extractContent(documents, extractionOptions) - - if not 
isinstance(extractionResult, list): - return "[Error: No extraction results]" - - # Prepare debug directory TODO TO REMOVE - import os - from datetime import datetime, UTC - debug_root = "./test-chat/ai" - ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3] - debug_dir = os.path.join(debug_root, f"{ts}_extraction_per_chunk") - try: - os.makedirs(debug_dir, exist_ok=True) - except Exception: - pass - - # Process each chunk with AI - aiResults: List[str] = [] - - for ec in extractionResult: - for part in ec.parts: - if part.typeGroup == "image": - # Process image with AI - try: - # Safety check for part.data - if not hasattr(part, 'data') or part.data is None: - logger.warning(f"Skipping image chunk with no data") - continue - - aiResult = await self.readImage( - prompt=prompt, - imageData=part.data, - mimeType=part.mimeType, - options=options - ) - aiResults.append(aiResult) - except Exception as e: - logger.warning(f"Error processing image chunk: {str(e)}") - aiResults.append(f"[Error processing image: {str(e)}]") - - elif part.typeGroup in ("text", "table", "structure"): - # Process text content with AI - try: - # Safety check for part.data - if not hasattr(part, 'data') or part.data is None: - logger.warning(f"Skipping chunk with no data") - continue - - logger.info(f"=== PROCESSING CHUNK {len(aiResults) + 1} ===") - logger.info(f"Chunk size: {len(part.data)} chars") - logger.info(f"Chunk preview: {part.data[:200]}...") - - # Dump input chunk - try: - idx = len(aiResults) + 1 - fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_input.txt") - with open(fpath, "w", encoding="utf-8") as f: - f.write(str(part.data)) - except Exception: - pass - - # Create AI call request for this chunk - request = AiCallRequest( - prompt=prompt, - context=part.data, - options=options - ) - - # Make the call using AiObjects - response = await self.aiObjects.call(request) - aiResults.append(response.content) - - logger.info(f"Chunk {len(aiResults)} processed: 
{len(response.content)} chars response") - # Dump AI response - try: - idx = len(aiResults) - fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_response.txt") - with open(fpath, "w", encoding="utf-8") as f: - f.write(str(response.content)) - except Exception: - pass - - except Exception as e: - logger.warning(f"Error processing text chunk: {str(e)}") - aiResults.append(f"[Error processing text: {str(e)}]") - - # Merge AI results using ExtractionService - from modules.datamodels.datamodelExtraction import MergeStrategy - - mergeStrategy = MergeStrategy( - groupBy="typeGroup", - orderBy="id", - mergeType="concatenate", - chunkSeparator="\n\n---\n\n" - ) - - mergedContent = self.extractionService.mergeAiResults( - extractionResult, - aiResults, - mergeStrategy - ) - - # Extract only AI-generated text from merged content - resultText = "" - for part in mergedContent.parts: - if ( - part.typeGroup in ("text", "table", "structure") - and part.data - and getattr(part, "metadata", {}).get("aiResult", False) - ): - resultText += part.data + "\n\n" - - # Dump merged output - try: - fpath = os.path.join(debug_dir, "merged_output.txt") - with open(fpath, "w", encoding="utf-8") as f: - f.write(resultText.strip()) - except Exception: - pass - - return resultText.strip() - - except Exception as e: - logger.error(f"Error in per-chunk processing: {str(e)}") - return f"[Error in per-chunk processing: {str(e)}]" - - async def _compressContent(self, content: str, targetSize: int, contentType: str) -> str: - if len(content.encode("utf-8")) <= targetSize: - return content - - try: - compressionPrompt = f""" - Komprimiere den folgenden {contentType} auf maximal {targetSize} Zeichen, - behalte aber alle wichtigen Informationen bei: - - {content} - - Gib nur den komprimierten Inhalt zurΓΌck, ohne zusΓ€tzliche ErklΓ€rungen. 
- """ - - # Service must not call connectors directly; use simple truncation fallback here - data = content.encode("utf-8") - return data[:targetSize].decode("utf-8", errors="ignore") + "... [truncated]" - except Exception as e: - logger.warning(f"AI compression failed, using truncation: {str(e)}") - return content[:targetSize] + "... [truncated]" - - # ===== DYNAMIC GENERIC AI CALLS IMPLEMENTATION ===== + await self._ensureAiObjectsInitialized() + return await self.webResearchService.webResearch(request) + # Master AI Call (process user prompt with optional unlimited count of input documents delivering one or many output documents, no size limitations) async def callAi( self, prompt: str, @@ -727,776 +179,13 @@ class AiService: Raises: Exception: If all available models fail """ - # Ensure aiObjects is initialized await self._ensureAiObjectsInitialized() - if options is None: - options = AiCallOptions() + # Get document processor and generator + documentProcessor = self.documentProcessor + documentGenerator = self.documentGenerator - # Normalize placeholders from List[PromptPlaceholder] - placeholders_dict: Dict[str, str] = {} - placeholders_meta: Dict[str, bool] = {} - if placeholders: - placeholders_dict = {p.label: p.content for p in placeholders} - placeholders_meta = {p.label: bool(getattr(p, 'summaryAllowed', False)) for p in placeholders} - - # Auto-determine call type based on documents and operation type - call_type = self._determineCallType(documents, options.operationType) - options.callType = call_type - - # Log the prompt being sent to AI for debugging (before routing) TODO TO REMOVE - try: - # Build the full prompt that will be sent to AI - if placeholders: - full_prompt = prompt - for p in placeholders: - placeholder = f"{{{{KEY:{p.label}}}}}" - full_prompt = full_prompt.replace(placeholder, p.content) - else: - full_prompt = prompt - - self._writeAiResponseDebug( - label='ai_prompt_debug', - content=full_prompt, - partIndex=1, - modelName=None, - 
continuation=False - ) - except Exception: - pass - - # Handle document generation with specific output format - if outputFormat: - result = await self._callAiWithDocumentGeneration(prompt, documents, options, outputFormat, title) - # Log AI response for debugging TODO TO REMOVE - try: - if isinstance(result, dict) and 'content' in result: - self._writeAiResponseDebug( - label='ai_document_generation', - content=result['content'], - partIndex=1, - modelName=None, # Document generation doesn't return model info - continuation=False - ) - except Exception: - pass - return result - - if call_type == "planning": - result = await self._callAiPlanning(prompt, placeholders_dict, placeholders_meta, options) - # Log AI response for debugging TODO TO REMOVE - try: - self._writeAiResponseDebug( - label='ai_planning', - content=result or "", - partIndex=1, - modelName=None, # Planning doesn't return model info - continuation=False - ) - except Exception: - pass - return result - else: - # Set processDocumentsIndividually from the legacy parameter if not set in options - if options.processDocumentsIndividually is None and documents: - options.processDocumentsIndividually = False # Default to batch processing - - # For text calls, we need to build the full prompt with placeholders here - # since _callAiText doesn't handle placeholders directly - if placeholders_dict: - full_prompt = self._buildPromptWithPlaceholders(prompt, placeholders_dict) - else: - full_prompt = prompt - - result = await self._callAiText(full_prompt, documents, options) - # Log AI response for debugging (additional logging for text calls) TODO TO REMOVE - try: - self._writeAiResponseDebug( - label='ai_text_main', - content=result or "", - partIndex=1, - modelName=None, # Text calls already log internally - continuation=False - ) - except Exception: - pass - return result - - def _determineCallType(self, documents: Optional[List[ChatDocument]], operation_type: str) -> str: - """ - Determine call type based on 
documents and operation type. - - Criteria: no documents AND operationType is "generate_plan" -> planning - All other cases -> text - """ - has_documents = documents is not None and len(documents) > 0 - is_planning_operation = operation_type == OperationType.GENERATE_PLAN - - if not has_documents and is_planning_operation: - return "planning" - else: - return "text" - - async def _callAiPlanning( - self, - prompt: str, - placeholders: Optional[Dict[str, str]], - placeholdersMeta: Optional[Dict[str, bool]], - options: AiCallOptions - ) -> str: - """ - Handle planning calls with placeholder system and selective summarization. - """ - # Ensure aiObjects is initialized - await self._ensureAiObjectsInitialized() - - # Build full prompt with placeholders; if too large, summarize summaryAllowed placeholders proportionally - effective_placeholders = placeholders or {} - full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders) - - if options.compressPrompt and placeholdersMeta: - # Determine model capacity - try: - caps = self._getModelCapabilitiesForContent(full_prompt, None, options) - max_bytes = caps.get("maxContextBytes", len(full_prompt.encode("utf-8"))) - except Exception: - max_bytes = len(full_prompt.encode("utf-8")) - - current_bytes = len(full_prompt.encode("utf-8")) - if current_bytes > max_bytes: - # Compute total bytes contributed by allowed placeholders (approximate by content length) - allowed_labels = [l for l, allow in placeholdersMeta.items() if allow] - allowed_sizes = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels} - total_allowed = sum(allowed_sizes.values()) - - overage = current_bytes - max_bytes - if total_allowed > 0 and overage > 0: - # Target total for allowed after reduction - target_allowed = max(total_allowed - overage, 0) - # Global ratio to apply across allowed placeholders - ratio = target_allowed / total_allowed if total_allowed > 0 else 1.0 - ratio = max(0.0, min(1.0, ratio)) 
- - reduced: Dict[str, str] = {} - for label, content in effective_placeholders.items(): - if label in allowed_labels and isinstance(content, str) and len(content) > 0: - old_len = len(content) - # Reduce by proportional ratio on characters (fallback if empty) - reduction_factor = ratio if old_len > 0 else 1.0 - reduced[label] = self._reduceText(content, reduction_factor) - else: - reduced[label] = content - - effective_placeholders = reduced - full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders) - - # If still slightly over, perform a second-pass fine adjustment with updated ratio - current_bytes = len(full_prompt.encode("utf-8")) - if current_bytes > max_bytes and total_allowed > 0: - overage2 = current_bytes - max_bytes - # Recompute allowed sizes after first reduction - allowed_sizes2 = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels} - total_allowed2 = sum(allowed_sizes2.values()) - if total_allowed2 > 0 and overage2 > 0: - target_allowed2 = max(total_allowed2 - overage2, 0) - ratio2 = target_allowed2 / total_allowed2 - ratio2 = max(0.0, min(1.0, ratio2)) - reduced2: Dict[str, str] = {} - for label, content in effective_placeholders.items(): - if label in allowed_labels and isinstance(content, str) and len(content) > 0: - old_len = len(content) - reduction_factor = ratio2 if old_len > 0 else 1.0 - reduced2[label] = self._reduceText(content, reduction_factor) - else: - reduced2[label] = content - effective_placeholders = reduced2 - full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders) - - - # Make AI call using AiObjects (let it handle model selection) - request = AiCallRequest( - prompt=full_prompt, - context="", # Context is already included in the prompt - options=options + return await self.coreAi.callAi( + prompt, documents, placeholders, options, outputFormat, title, + documentProcessor, documentGenerator ) - response = await self.aiObjects.call(request) - try: - 
logger.debug(f"AI model selected (planning): {getattr(response, 'modelName', 'unknown')}") - except Exception: - pass - return response.content - - async def _callAiText( - self, - prompt: str, - documents: Optional[List[ChatDocument]], - options: AiCallOptions - ) -> str: - """ - Handle text calls with document processing through ExtractionService. - """ - # Ensure aiObjects is initialized - await self._ensureAiObjectsInitialized() - - # Determine processing strategy based on options - if options.processDocumentsIndividually and documents: - # Use per-chunk processing for individual document processing - return await self._processDocumentsPerChunk(documents, prompt, options) - - # Check if we need chunking - if so, use per-chunk processing - if documents and not options.compressContext: - # Get model capabilities to check if chunking will be needed - model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) - total_doc_size = sum(doc.fileSize or 0 for doc in documents) - - if total_doc_size > model_capabilities["maxContextBytes"]: - logger.info(f"Document size ({total_doc_size}) exceeds model capacity ({model_capabilities['maxContextBytes']}), using per-chunk processing") - return await self._processDocumentsPerChunk(documents, prompt, options) - - # Extract and process documents using ExtractionService - context = "" - if documents: - logger.info(f"=== EXTRACTING CONTENT FROM {len(documents)} DOCUMENTS ===") - - # Get model capabilities for size calculation - model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) - - # Use new ChatDocument-based API - extraction_options = { - "prompt": prompt, - "operationType": options.operationType, - "processDocumentsIndividually": options.processDocumentsIndividually, - "maxSize": options.maxContextBytes or model_capabilities["maxContextBytes"], - "chunkAllowed": not options.compressContext, - "textChunkSize": model_capabilities["textChunkSize"], - "imageChunkSize": 
model_capabilities["imageChunkSize"], - "imageMaxPixels": 1024 * 1024, - "imageQuality": 85, - "mergeStrategy": {"groupBy": "typeGroup", "orderBy": "id", "mergeType": "concatenate"} - } - - logger.debug(f"Extraction options: {extraction_options}") - - extracted_content = self.extractionService.extractContent( - documents=documents, - options=extraction_options - ) - - logger.info(f"Extraction completed: {len(extracted_content)} documents") - - # Build context from list of extracted content - if isinstance(extracted_content, list): - context_parts = [] - chunk_count = 0 - for ec in extracted_content: - for p in ec.parts: - if p.typeGroup in ["text", "table", "structure"] and p.data: - if p.metadata.get("chunk", False): - chunk_count += 1 - context_parts.append(p.data) - elif p.typeGroup == "image" and p.data: - # Process image with AI using user prompt - try: - imageResult = await self.aiObjects.callImage( - prompt=prompt, - imageData=p.data, - mimeType=p.mimeType - ) - context_parts.append(f"[Image Analysis]: {imageResult}") - except Exception as e: - logger.warning(f"AI image processing failed: {e}") - context_parts.append(f"[Image Analysis Failed]: {str(e)}") - - if chunk_count > 0: - logger.debug(f"=== PROCESSING CHUNKED CONTENT ===") - logger.debug(f"Total chunks: {chunk_count}") - logger.debug(f"Total context parts: {len(context_parts)}") - - context = "\n\n---\n\n".join(context_parts) - else: - context = "" - - # Check size and reduce if needed - full_prompt = prompt + "\n\n" + context if context else prompt - - # Add generic completeness guidance: first vs subsequent (based on presence of context) - try: - if context and context.strip(): - # Subsequent calls with prior context: continue next part only - full_prompt += ( - "\n\nINSTRUCTIONS (COMPLETENESS):\n" - "- Continue from where the previous content ended. 
Do NOT repeat earlier content.\n" - "- If more parts are still needed after this response, the LAST LINE of your response MUST be exactly: 'CONTINUATION: true'.\n" - "- If the content is now complete, the LAST LINE of your response MUST be exactly: 'CONTINUATION: false'.\n" - "- The continuation line MUST be the final line of your output. Do NOT output anything after it (no notes, no explanations).\n" - ) - else: - # First call (no prior context): deliver full content or first part - full_prompt += ( - "\n\nINSTRUCTIONS (COMPLETENESS):\n" - "- Deliver the complete content. Do NOT truncate.\n" - "- If platform limits force truncation, provide the first complete section(s) only and ensure the LAST LINE of your response is exactly: 'CONTINUATION: true'.\n" - "- If the entire content is fully included, ensure the LAST LINE of your response is exactly: 'CONTINUATION: false'.\n" - "- The continuation line MUST be the final line of your output. Do NOT output anything after it (no notes, no explanations).\n" - ) - except Exception: - # Non-fatal if any issue building guidance - pass - logger.debug(f"AI call: {len(full_prompt)} chars (prompt: {len(prompt)}, context: {len(context)})") - - # Use AiObjects to select the best model and make the call - try: - # Helper to detect and strip continuation flag - import re - def _split_content_and_flag(text: str) -> (str, bool): - if not text: - return "", False - lines = text.strip().splitlines() - cont = False - # Scan last 3 lines for flag to be robust - for i in range(1, min(4, len(lines))+1): - m = re.match(r"^\s*CONTINUATION:\s*(true|false)\s*$", lines[-i].strip(), re.IGNORECASE) - if m: - cont = m.group(1).lower() == 'true' - # remove the matched flag line - del lines[-i] - break - return "\n".join(lines).strip(), cont - - # First call - request = AiCallRequest( - prompt=full_prompt, - context="", - options=options - ) - response = await self.aiObjects.call(request) - try: - logger.debug(f"AI model selected (text): 
{getattr(response, 'modelName', 'unknown')}") - except Exception: - pass - content_first = response.content or "" - merged_content, needs_more = _split_content_and_flag(content_first) - - # Iteratively request next parts if flagged - # Allow configurable max parts via options; default = 1000 - try: - max_parts = int(getattr(options, 'maxParts', 1000) or 1000) - except Exception: - max_parts = 1000 - part_index = 1 - while needs_more and part_index < max_parts: - part_index += 1 - # Build subsequent prompt with explicit continuation instructions - subsequent_prompt = ( - prompt - + "\n\nINSTRUCTIONS (CONTINUE NEXT PART ONLY):\n" - "- Continue from where the previous content ended.\n" - "- Do NOT repeat earlier content.\n" - "- The LAST LINE of your response MUST be exactly one of: 'CONTINUATION: true' (if more parts are needed) or 'CONTINUATION: false' (if complete).\n" - "- The continuation line MUST be the final line of your output. Do NOT output anything after it (no notes, no explanations).\n" - ) - next_request = AiCallRequest( - prompt=subsequent_prompt, - context=merged_content, - options=options - ) - next_response = await self.aiObjects.call(next_request) - part_text = next_response.content or "" - part_clean, needs_more = _split_content_and_flag(part_text) - if part_clean: - # Separate parts clearly - merged_content = (merged_content + "\n\n" + part_clean).strip() - else: - # Avoid infinite loops on empty parts - break - - logger.debug(f"=== AI RESPONSE (MERGED) ===") - logger.debug(f"Response length: {len(merged_content)} chars") - logger.debug(f"Response preview: {merged_content[:200]}...") - return merged_content - - except Exception as e: - logger.error(f"AI call failed: {e}") - raise Exception(f"AI call failed: {e}") - - - def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]: - """ - Get model capabilities for content processing, including appropriate size limits for 
chunking. - """ - # Estimate total content size - prompt_size = len(prompt.encode('utf-8')) - document_size = 0 - if documents: - # Rough estimate of document content size - for doc in documents: - document_size += doc.fileSize or 0 - - total_size = prompt_size + document_size - - # Use AiObjects to select the best model for this content size - # We'll simulate the model selection by checking available models - from modules.interfaces.interfaceAiObjects import aiModels - - # Find the best model for this content size and operation - best_model = None - best_context_length = 0 - - for model_name, model_info in aiModels.items(): - context_length = model_info.get("contextLength", 0) - - # Skip models with no context length or too small for content - if context_length == 0: - continue - - # Check if model supports the operation type - capabilities = model_info.get("capabilities", []) - if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities: - continue - elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities: - continue - elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities: - continue - elif "text_generation" not in capabilities: - continue - - # Prefer models that can handle the content without chunking, but allow chunking if needed - if context_length >= total_size * 0.8: # 80% of content size - if context_length > best_context_length: - best_model = model_info - best_context_length = context_length - elif best_model is None: # Fallback to largest available model - if context_length > best_context_length: - best_model = model_info - best_context_length = context_length - - # Fallback to a reasonable default if no model found - if best_model is None: - best_model = { - "contextLength": 128000, # GPT-4o default - "llmName": "gpt-4o" - } - - # Calculate appropriate sizes - # Convert tokens to bytes (rough estimate: 1 token β‰ˆ 4 characters) 
- context_length_bytes = int(best_model["contextLength"] * 4) - max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length - text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks - image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks - - logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}") - logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes") - logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes") - - return { - "maxContextBytes": max_context_bytes, - "textChunkSize": text_chunk_size, - "imageChunkSize": image_chunk_size - } - - def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]: - """ - Get models capable of handling the specific operation with capability filtering. - """ - # Use the actual AI objects model selection instead of hardcoded default - if hasattr(self, 'aiObjects') and self.aiObjects: - # Let AiObjects handle the model selection - return [] - else: - # Fallback to default model if AiObjects not available - default_model = ModelCapabilities( - name="default", - maxTokens=4000, - capabilities=["text", "reasoning"] if operation_type == "planning" else ["text"], - costPerToken=0.001, - processingTime=1.0, - isAvailable=True - ) - return [default_model] - - def _buildPromptWithPlaceholders(self, prompt: str, placeholders: Optional[Dict[str, str]]) -> str: - """ - Build full prompt by replacing placeholders with their content. - Uses the new {{KEY:placeholder}} format. 
- """ - if not placeholders: - return prompt - - full_prompt = prompt - for placeholder, content in placeholders.items(): - # Replace both old format {{placeholder}} and new format {{KEY:placeholder}} - full_prompt = full_prompt.replace(f"{{{{{placeholder}}}}}", content) - full_prompt = full_prompt.replace(f"{{{{KEY:{placeholder}}}}}", content) - - return full_prompt - - def _writeTraceLog(self, contextText: str, data: Any) -> None: - """Write raw data to the central trace log file without truncation.""" - try: - import os - import json - from datetime import datetime, UTC - # Only write if logger is in debug mode - if logger.level > logging.DEBUG: - return - # Get log directory from configuration via service center if possible - logDir = None - try: - if self.serviceCenter and hasattr(self.serviceCenter, 'utils'): - logDir = self.serviceCenter.utils.configGet("APP_LOGGING_LOG_DIR", "./") - except Exception: - pass - if not logDir: - logDir = "./" - if not os.path.isabs(logDir): - # Make it relative to gateway directory - gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - logDir = os.path.join(gatewayDir, logDir) - os.makedirs(logDir, exist_ok=True) - traceFile = os.path.join(logDir, "log_trace.log") - timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] - traceEntry = f"[{timestamp}] {contextText}\n" + ("=" * 80) + "\n" - if data is None: - traceEntry += "No data provided\n" - else: - # Prefer exact text; if dict/list, pretty print JSON - try: - if isinstance(data, (dict, list)): - traceEntry += f"JSON Data:\n{json.dumps(data, indent=2, ensure_ascii=False)}\n" - else: - text = str(data) - traceEntry += f"Text Data:\n{text}\n" - except Exception: - traceEntry += f"Data (fallback): {str(data)}\n" - traceEntry += ("=" * 80) + "\n\n" - with open(traceFile, "a", encoding="utf-8") as f: - f.write(traceEntry) - except Exception: - # Swallow to avoid recursive logging issues - pass - - def _writeAiResponseDebug(self, 
label: str, content: str, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None: - """Persist raw AI response parts for debugging under test-chat/ai.""" - try: - import os - from datetime import datetime, UTC - # Base dir: gateway/test-chat/ai (go up 4 levels from this file) - # .../gateway/modules/services/serviceAi/mainServiceAi.py -> up to gateway root - gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) - outDir = os.path.join(gatewayDir, 'test-chat', 'ai') - os.makedirs(outDir, exist_ok=True) - ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3] - suffix = [] - if partIndex is not None: - suffix.append(f"part{partIndex}") - if continuation is not None: - suffix.append(f"cont_{str(continuation).lower()}") - if modelName: - safeModel = ''.join(c if c.isalnum() or c in ('-', '_') else '-' for c in modelName) - suffix.append(safeModel) - suffixStr = ('_' + '_'.join(suffix)) if suffix else '' - fname = f"{ts}_{label}{suffixStr}.txt" - fpath = os.path.join(outDir, fname) - with open(fpath, 'w', encoding='utf-8') as f: - f.write(content or '') - except Exception: - # Do not raise; best-effort debug write - pass - - def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool: - """ - Check if text exceeds model token limit with safety margin. - """ - # Simple character-based estimation (4 chars per token) - estimated_tokens = len(text) // 4 - max_tokens = int(model.maxTokens * (1 - safety_margin)) - return estimated_tokens > max_tokens - - def _reducePlanningPrompt( - self, - full_prompt: str, - placeholders: Optional[Dict[str, str]], - model: ModelCapabilities, - options: AiCallOptions - ) -> str: - """ - Reduce planning prompt size by summarizing placeholders while preserving prompt structure. 
- """ - if not placeholders: - return self._reduceText(full_prompt, 0.7) - - # Reduce placeholders while preserving prompt - reduced_placeholders = {} - for placeholder, content in placeholders.items(): - if len(content) > 1000: # Only reduce long content - reduction_factor = 0.7 - reduced_content = self._reduceText(content, reduction_factor) - reduced_placeholders[placeholder] = reduced_content - else: - reduced_placeholders[placeholder] = content - - return self._buildPromptWithPlaceholders(full_prompt, reduced_placeholders) - - def _reduceTextPrompt( - self, - prompt: str, - context: str, - model: ModelCapabilities, - options: AiCallOptions - ) -> str: - """ - Reduce text prompt size using typeGroup-aware chunking and merging. - """ - max_size = int(model.maxTokens * (1 - options.safetyMargin)) - - if options.compressPrompt: - # Reduce both prompt and context - target_size = max_size - current_size = len(prompt) + len(context) - reduction_factor = (target_size * 0.7) / current_size - - if reduction_factor < 1.0: - prompt = self._reduceText(prompt, reduction_factor) - context = self._reduceText(context, reduction_factor) - else: - # Only reduce context, preserve prompt integrity - max_context_size = max_size - len(prompt) - if len(context) > max_context_size: - reduction_factor = max_context_size / len(context) - context = self._reduceText(context, reduction_factor) - - return prompt + "\n\n" + context if context else prompt - - def _extractTextFromContentParts(self, extracted_content) -> str: - """ - Extract text content from ExtractionService ContentPart objects. 
- """ - if not extracted_content or not hasattr(extracted_content, 'parts'): - return "" - - text_parts = [] - for part in extracted_content.parts: - if hasattr(part, 'typeGroup') and part.typeGroup in ['text', 'table', 'structure']: - if hasattr(part, 'data') and part.data: - text_parts.append(part.data) - - return "\n\n".join(text_parts) - - def _reduceText(self, text: str, reduction_factor: float) -> str: - """ - Reduce text size by the specified factor. - """ - if reduction_factor >= 1.0: - return text - - target_length = int(len(text) * reduction_factor) - return text[:target_length] + "... [reduced]" - - async def _callAiWithDocumentGeneration( - self, - prompt: str, - documents: Optional[List[ChatDocument]], - options: AiCallOptions, - outputFormat: str, - title: Optional[str] - ) -> Dict[str, Any]: - """ - Handle AI calls with document generation in specific output format. - - Args: - prompt: The main prompt for the AI call - documents: Optional list of documents to process - options: AI call configuration options - outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx) - title: Optional title for generated documents - - Returns: - Dict with generated documents and metadata - """ - try: - # Get format-specific extraction prompt from generation service - from modules.services.serviceGeneration.mainServiceGeneration import GenerationService - generation_service = GenerationService(self.serviceCenter) - - # Use default title if not provided - if not title: - title = "AI Generated Document" - - # Get format-specific extraction prompt - extraction_prompt = generation_service.getExtractionPrompt( - output_format=outputFormat, - user_prompt=prompt, - title=title - ) - - # Process documents with format-specific prompt - ai_response = await self._callAiText(extraction_prompt, documents, options) - - # Parse filename header from AI response if present - parsed_filename = None - try: - if ai_response: - first_newline = ai_response.find('\n') - 
header_line = ai_response if first_newline == -1 else ai_response[:first_newline] - if header_line.strip().lower().startswith('filename:'): - parsed = header_line.split(':', 1)[1].strip() - # basic sanitization - import re - parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed) - parsed = re.sub(r"-+", "-", parsed).strip('-') - if parsed: - parsed_filename = parsed - # remove header line from content for rendering - ai_response = ai_response[first_newline+1:].lstrip('\n') if first_newline != -1 else '' - except Exception: - parsed_filename = None - - if not ai_response or ai_response.strip() == "": - raise Exception("AI content generation failed") - - # Render the content to the specified format - rendered_content, mime_type = await generation_service.renderReport( - extracted_content=ai_response, - output_format=outputFormat, - title=title - ) - - # Generate meaningful filename (use AI-provided if valid, else fallback) - from datetime import datetime, UTC - timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") - if parsed_filename and parsed_filename.lower().endswith(f".{outputFormat.lower()}"): - filename = parsed_filename - else: - safe_title = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-') - filename = f"{safe_title or 'document'}-{timestamp}.{outputFormat}" - - # Return structured result with document information - return { - "success": True, - "content": ai_response, # Raw AI response - "rendered_content": rendered_content, # Formatted content - "mime_type": mime_type, - "filename": filename, - "format": outputFormat, - "title": title, - "documents": [{ - "documentName": filename, - "documentData": rendered_content, - "mimeType": mime_type - }] - } - - except Exception as e: - logger.error(f"Error in document generation: {str(e)}") - return { - "success": False, - "error": str(e), - "content": "", - "rendered_content": "", - "mime_type": "text/plain", - "filename": f"error_{outputFormat}", - "format": outputFormat, - "title": title 
or "Error", - "documents": [] - } - diff --git a/modules/services/serviceAi/subCoreAi.py b/modules/services/serviceAi/subCoreAi.py new file mode 100644 index 00000000..3f245334 --- /dev/null +++ b/modules/services/serviceAi/subCoreAi.py @@ -0,0 +1,596 @@ +import logging +from typing import Dict, Any, List, Optional, Tuple, Union +from modules.datamodels.datamodelChat import PromptPlaceholder, ChatDocument +from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, ModelCapabilities, OperationType, Priority +from modules.interfaces.interfaceAiObjects import AiObjects + +logger = logging.getLogger(__name__) + + +class SubCoreAi: + """Core AI operations including image analysis, text generation, and planning calls.""" + + def __init__(self, services, aiObjects): + """Initialize core AI operations. + + Args: + services: Service center instance for accessing other services + aiObjects: Initialized AiObjects instance + """ + self.services = services + self.aiObjects = aiObjects + + # AI Processing Call + async def callAi( + self, + prompt: str, + documents: Optional[List[ChatDocument]] = None, + placeholders: Optional[List[PromptPlaceholder]] = None, + options: Optional[AiCallOptions] = None, + outputFormat: Optional[str] = None, + title: Optional[str] = None, + documentProcessor=None, + documentGenerator=None + ) -> Union[str, Dict[str, Any]]: + """ + Unified AI call interface that automatically routes to appropriate handler. 
+ + Args: + prompt: The main prompt for the AI call + documents: Optional list of documents to process + placeholders: Optional list of placeholder replacements for planning calls + options: AI call configuration options + outputFormat: Optional output format (html, pdf, docx, txt, md, json, csv, xlsx) for document generation + title: Optional title for generated documents + documentProcessor: Document processing service instance + documentGenerator: Document generation service instance + + Returns: + AI response as string, or dict with documents if outputFormat is specified + + Raises: + Exception: If all available models fail + """ + if options is None: + options = AiCallOptions() + + # Normalize placeholders from List[PromptPlaceholder] + placeholders_dict: Dict[str, str] = {} + placeholders_meta: Dict[str, bool] = {} + if placeholders: + placeholders_dict = {p.label: p.content for p in placeholders} + placeholders_meta = {p.label: bool(getattr(p, 'summaryAllowed', False)) for p in placeholders} + + # Auto-determine call type based on documents and operation type + call_type = self._determineCallType(documents, options.operationType) + options.callType = call_type + + try: + # Build the full prompt that will be sent to AI + if placeholders: + full_prompt = prompt + for p in placeholders: + placeholder = f"{{{{KEY:{p.label}}}}}" + full_prompt = full_prompt.replace(placeholder, p.content) + else: + full_prompt = prompt + + self._writeAiResponseDebug( + label='ai_prompt_debug', + content=full_prompt, + partIndex=1, + modelName=None, + continuation=False + ) + except Exception: + pass + + # Handle document generation with specific output format + if outputFormat and documentGenerator: + result = await documentGenerator.callAiWithDocumentGeneration(prompt, documents, options, outputFormat, title) + # Log AI response for debugging + try: + if isinstance(result, dict) and 'content' in result: + self._writeAiResponseDebug( + label='ai_document_generation', + 
content=result['content'], + partIndex=1, + modelName=None, # Document generation doesn't return model info + continuation=False + ) + except Exception: + pass + return result + + if call_type == "planning": + result = await self._callAiPlanning(prompt, placeholders_dict, placeholders_meta, options) + # Log AI response for debugging + try: + self._writeAiResponseDebug( + label='ai_planning', + content=result or "", + partIndex=1, + modelName=None, # Planning doesn't return model info + continuation=False + ) + except Exception: + pass + return result + else: + # Set processDocumentsIndividually from the legacy parameter if not set in options + if options.processDocumentsIndividually is None and documents: + options.processDocumentsIndividually = False # Default to batch processing + + # For text calls, we need to build the full prompt with placeholders here + # since _callAiText doesn't handle placeholders directly + if placeholders_dict: + full_prompt = self._buildPromptWithPlaceholders(prompt, placeholders_dict) + else: + full_prompt = prompt + + if documentProcessor and documents: + result = await documentProcessor.callAiText(full_prompt, documents, options) + else: + # Fallback to direct AI call if no document processor available + request = AiCallRequest( + prompt=full_prompt, + context="", + options=options + ) + response = await self.aiObjects.call(request) + result = response.content + + # Log AI response for debugging (additional logging for text calls) + try: + self._writeAiResponseDebug( + label='ai_text_main', + content=result or "", + partIndex=1, + modelName=None, # Text calls already log internally + continuation=False + ) + except Exception: + pass + return result + + # AI Image Analysis + async def readImage( + self, + prompt: str, + imageData: Union[str, bytes], + mimeType: str = None, + options: Optional[AiCallOptions] = None, + ) -> str: + """Call AI for image analysis using interface.callImage().""" + try: + # Check if imageData is valid + if 
not imageData: + error_msg = "No image data provided" + self.services.utils.debugLogToFile(f"Error in AI image analysis: {error_msg}", "AI_SERVICE") + logger.error(f"Error in AI image analysis: {error_msg}") + return f"Error: {error_msg}" + + self.services.utils.debugLogToFile(f"readImage called with prompt, imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}", "AI_SERVICE") + logger.info(f"readImage called with prompt, imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}") + + # Always use IMAGE_ANALYSIS operation type for image processing + if options is None: + options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS) + else: + # Override the operation type to ensure image analysis + options.operationType = OperationType.IMAGE_ANALYSIS + + self.services.utils.debugLogToFile(f"Calling aiObjects.callImage with operationType: {options.operationType}", "AI_SERVICE") + logger.info(f"Calling aiObjects.callImage with operationType: {options.operationType}") + result = await self.aiObjects.callImage(prompt, imageData, mimeType, options) + + # Debug the result + self.services.utils.debugLogToFile(f"Raw AI result type: {type(result)}, value: {repr(result)}", "AI_SERVICE") + + # Check if result is valid + if not result or (isinstance(result, str) and not result.strip()): + error_msg = f"No response from AI image analysis (result: {repr(result)})" + self.services.utils.debugLogToFile(f"Error in AI image analysis: {error_msg}", "AI_SERVICE") + logger.error(f"Error in AI image analysis: {error_msg}") + return f"Error: {error_msg}" + + self.services.utils.debugLogToFile(f"callImage returned: {result[:200]}..." if len(result) > 200 else result, "AI_SERVICE") + logger.info(f"callImage returned: {result[:200]}..." 
if len(result) > 200 else result) + return result + except Exception as e: + self.services.utils.debugLogToFile(f"Error in AI image analysis: {str(e)}", "AI_SERVICE") + logger.error(f"Error in AI image analysis: {str(e)}") + return f"Error: {str(e)}" + + # AI Image Generation + async def generateImage( + self, + prompt: str, + size: str = "1024x1024", + quality: str = "standard", + style: str = "vivid", + options: Optional[AiCallOptions] = None, + ) -> Dict[str, Any]: + """Generate an image using AI using interface.generateImage().""" + try: + return await self.aiObjects.generateImage(prompt, size, quality, style, options) + except Exception as e: + logger.error(f"Error in AI image generation: {str(e)}") + return {"success": False, "error": str(e)} + + def _determineCallType(self, documents: Optional[List[ChatDocument]], operation_type: str) -> str: + """ + Determine call type based on documents and operation type. + + Criteria: no documents AND operationType is "generate_plan" -> planning + All other cases -> text + """ + has_documents = documents is not None and len(documents) > 0 + is_planning_operation = operation_type == OperationType.GENERATE_PLAN + + if not has_documents and is_planning_operation: + return "planning" + else: + return "text" + + async def _callAiPlanning( + self, + prompt: str, + placeholders: Optional[Dict[str, str]], + placeholdersMeta: Optional[Dict[str, bool]], + options: AiCallOptions + ) -> str: + """ + Handle planning calls with placeholder system and selective summarization. 
+ """ + # Build full prompt with placeholders; if too large, summarize summaryAllowed placeholders proportionally + effective_placeholders = placeholders or {} + full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders) + + if options.compressPrompt and placeholdersMeta: + # Determine model capacity + try: + caps = self._getModelCapabilitiesForContent(full_prompt, None, options) + max_bytes = caps.get("maxContextBytes", len(full_prompt.encode("utf-8"))) + except Exception: + max_bytes = len(full_prompt.encode("utf-8")) + + current_bytes = len(full_prompt.encode("utf-8")) + if current_bytes > max_bytes: + # Compute total bytes contributed by allowed placeholders (approximate by content length) + allowed_labels = [l for l, allow in placeholdersMeta.items() if allow] + allowed_sizes = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels} + total_allowed = sum(allowed_sizes.values()) + + overage = current_bytes - max_bytes + if total_allowed > 0 and overage > 0: + # Target total for allowed after reduction + target_allowed = max(total_allowed - overage, 0) + # Global ratio to apply across allowed placeholders + ratio = target_allowed / total_allowed if total_allowed > 0 else 1.0 + ratio = max(0.0, min(1.0, ratio)) + + reduced: Dict[str, str] = {} + for label, content in effective_placeholders.items(): + if label in allowed_labels and isinstance(content, str) and len(content) > 0: + old_len = len(content) + # Reduce by proportional ratio on characters (fallback if empty) + reduction_factor = ratio if old_len > 0 else 1.0 + reduced[label] = self._reduceText(content, reduction_factor) + else: + reduced[label] = content + + effective_placeholders = reduced + full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders) + + # If still slightly over, perform a second-pass fine adjustment with updated ratio + current_bytes = len(full_prompt.encode("utf-8")) + if current_bytes > max_bytes and 
total_allowed > 0: + overage2 = current_bytes - max_bytes + # Recompute allowed sizes after first reduction + allowed_sizes2 = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels} + total_allowed2 = sum(allowed_sizes2.values()) + if total_allowed2 > 0 and overage2 > 0: + target_allowed2 = max(total_allowed2 - overage2, 0) + ratio2 = target_allowed2 / total_allowed2 + ratio2 = max(0.0, min(1.0, ratio2)) + reduced2: Dict[str, str] = {} + for label, content in effective_placeholders.items(): + if label in allowed_labels and isinstance(content, str) and len(content) > 0: + old_len = len(content) + reduction_factor = ratio2 if old_len > 0 else 1.0 + reduced2[label] = self._reduceText(content, reduction_factor) + else: + reduced2[label] = content + effective_placeholders = reduced2 + full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders) + + + # Make AI call using AiObjects (let it handle model selection) + request = AiCallRequest( + prompt=full_prompt, + context="", # Context is already included in the prompt + options=options + ) + response = await self.aiObjects.call(request) + try: + logger.debug(f"AI model selected (planning): {getattr(response, 'modelName', 'unknown')}") + except Exception: + pass + return response.content + + async def _callAiDirect( + self, + prompt: str, + documents: Optional[List[ChatDocument]], + options: AiCallOptions, + documentProcessor=None + ) -> Dict[str, Any]: + """ + Call AI directly with prompt and documents for JSON output. + Used for multi-file generation - uses the existing generation pipeline. 
+ """ + # Use the existing generation pipeline that already works + # This ensures proper document processing and content extraction + logger.info(f"Using existing generation pipeline for {len(documents) if documents else 0} documents") + + if documentProcessor: + # Process documents with JSON merging using the existing pipeline + result = await documentProcessor.processDocumentsPerChunkJson(documents, prompt, options) + else: + # Fallback to simple AI call + request = AiCallRequest( + prompt=prompt, + context="", + options=options + ) + response = await self.aiObjects.call(request) + result = {"metadata": {"title": "AI Response"}, "sections": [{"id": "section_1", "content_type": "paragraph", "elements": [{"text": response.content}]}]} + + # Convert single-file result to multi-file format if needed + if "sections" in result and "documents" not in result: + logger.info("Converting single-file result to multi-file format") + # This is a single-file result, convert it to multi-file format + return { + "metadata": result.get("metadata", {"title": "Converted Document"}), + "documents": [{ + "id": "doc_1", + "title": result.get("metadata", {}).get("title", "Document"), + "filename": "document.txt", + "sections": result.get("sections", []) + }] + } + + return result + + def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]: + """ + Get model capabilities for content processing, including appropriate size limits for chunking. 
+ """ + # Estimate total content size + prompt_size = len(prompt.encode('utf-8')) + document_size = 0 + if documents: + # Rough estimate of document content size + for doc in documents: + document_size += doc.fileSize or 0 + + total_size = prompt_size + document_size + + # Use AiObjects to select the best model for this content size + # We'll simulate the model selection by checking available models + from modules.interfaces.interfaceAiObjects import aiModels + + # Find the best model for this content size and operation + best_model = None + best_context_length = 0 + + for model_name, model_info in aiModels.items(): + context_length = model_info.get("contextLength", 0) + + # Skip models with no context length or too small for content + if context_length == 0: + continue + + # Check if model supports the operation type + capabilities = model_info.get("capabilities", []) + if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities: + continue + elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities: + continue + elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities: + continue + elif "text_generation" not in capabilities: + continue + + # Prefer models that can handle the content without chunking, but allow chunking if needed + if context_length >= total_size * 0.8: # 80% of content size + if context_length > best_context_length: + best_model = model_info + best_context_length = context_length + elif best_model is None: # Fallback to largest available model + if context_length > best_context_length: + best_model = model_info + best_context_length = context_length + + # Fallback to a reasonable default if no model found + if best_model is None: + best_model = { + "contextLength": 128000, # GPT-4o default + "llmName": "gpt-4o" + } + + # Calculate appropriate sizes + # Convert tokens to bytes (rough estimate: 1 token β‰ˆ 4 characters) + 
context_length_bytes = int(best_model["contextLength"] * 4) + max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length + text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks + image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks + + logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}") + logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes") + logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes") + + return { + "maxContextBytes": max_context_bytes, + "textChunkSize": text_chunk_size, + "imageChunkSize": image_chunk_size + } + + def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]: + """ + Get models capable of handling the specific operation with capability filtering. + """ + # Use the actual AI objects model selection instead of hardcoded default + if hasattr(self, 'aiObjects') and self.aiObjects: + # Let AiObjects handle the model selection + return [] + else: + # Fallback to default model if AiObjects not available + default_model = ModelCapabilities( + name="default", + maxTokens=4000, + capabilities=["text", "reasoning"] if operation_type == "planning" else ["text"], + costPerToken=0.001, + processingTime=1.0, + isAvailable=True + ) + return [default_model] + + def _buildPromptWithPlaceholders(self, prompt: str, placeholders: Optional[Dict[str, str]]) -> str: + """ + Build full prompt by replacing placeholders with their content. + Uses the new {{KEY:placeholder}} format. 
+ """ + if not placeholders: + return prompt + + full_prompt = prompt + for placeholder, content in placeholders.items(): + # Replace both old format {{placeholder}} and new format {{KEY:placeholder}} + full_prompt = full_prompt.replace(f"{{{{{placeholder}}}}}", content) + full_prompt = full_prompt.replace(f"{{{{KEY:{placeholder}}}}}", content) + + return full_prompt + + def _writeAiResponseDebug(self, label: str, content: str, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None: + """Persist raw AI response parts for debugging under test-chat/ai - only if debug enabled.""" + try: + # Check if debug logging is enabled + debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if not debug_enabled: + return + + import os + from datetime import datetime, UTC + # Base dir: gateway/test-chat/ai (go up 4 levels from this file) + # .../gateway/modules/services/serviceAi/subCoreAi.py -> up to gateway root + gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + outDir = os.path.join(gatewayDir, 'test-chat', 'ai') + os.makedirs(outDir, exist_ok=True) + ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3] + suffix = [] + if partIndex is not None: + suffix.append(f"part{partIndex}") + if continuation is not None: + suffix.append(f"cont_{str(continuation).lower()}") + if modelName: + safeModel = ''.join(c if c.isalnum() or c in ('-', '_') else '-' for c in modelName) + suffix.append(safeModel) + suffixStr = ('_' + '_'.join(suffix)) if suffix else '' + fname = f"{ts}_{label}{suffixStr}.txt" + fpath = os.path.join(outDir, fname) + with open(fpath, 'w', encoding='utf-8') as f: + f.write(content or '') + except Exception: + # Do not raise; best-effort debug write + pass + + def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool: + """ + Check if text exceeds model token limit with safety margin. 
+ """ + # Simple character-based estimation (4 chars per token) + estimated_tokens = len(text) // 4 + max_tokens = int(model.maxTokens * (1 - safety_margin)) + return estimated_tokens > max_tokens + + def _reducePlanningPrompt( + self, + full_prompt: str, + placeholders: Optional[Dict[str, str]], + model: ModelCapabilities, + options: AiCallOptions + ) -> str: + """ + Reduce planning prompt size by summarizing placeholders while preserving prompt structure. + """ + if not placeholders: + return self._reduceText(full_prompt, 0.7) + + # Reduce placeholders while preserving prompt + reduced_placeholders = {} + for placeholder, content in placeholders.items(): + if len(content) > 1000: # Only reduce long content + reduction_factor = 0.7 + reduced_content = self._reduceText(content, reduction_factor) + reduced_placeholders[placeholder] = reduced_content + else: + reduced_placeholders[placeholder] = content + + return self._buildPromptWithPlaceholders(full_prompt, reduced_placeholders) + + def _reduceTextPrompt( + self, + prompt: str, + context: str, + model: ModelCapabilities, + options: AiCallOptions + ) -> str: + """ + Reduce text prompt size using typeGroup-aware chunking and merging. 
+ """ + max_size = int(model.maxTokens * (1 - options.safetyMargin)) + + if options.compressPrompt: + # Reduce both prompt and context + target_size = max_size + current_size = len(prompt) + len(context) + reduction_factor = (target_size * 0.7) / current_size + + if reduction_factor < 1.0: + prompt = self._reduceText(prompt, reduction_factor) + context = self._reduceText(context, reduction_factor) + else: + # Only reduce context, preserve prompt integrity + max_context_size = max_size - len(prompt) + if len(context) > max_context_size: + reduction_factor = max_context_size / len(context) + context = self._reduceText(context, reduction_factor) + + return prompt + "\n\n" + context if context else prompt + + def _extractTextFromContentParts(self, extracted_content) -> str: + """ + Extract text content from ExtractionService ContentPart objects. + """ + if not extracted_content or not hasattr(extracted_content, 'parts'): + return "" + + text_parts = [] + for part in extracted_content.parts: + if hasattr(part, 'typeGroup') and part.typeGroup in ['text', 'table', 'structure']: + if hasattr(part, 'data') and part.data: + text_parts.append(part.data) + + return "\n\n".join(text_parts) + + def _reduceText(self, text: str, reduction_factor: float) -> str: + """ + Reduce text size by the specified factor. + """ + if reduction_factor >= 1.0: + return text + + target_length = int(len(text) * reduction_factor) + return text[:target_length] + "... 
[reduced]" diff --git a/modules/services/serviceAi/subDocumentGeneration.py b/modules/services/serviceAi/subDocumentGeneration.py new file mode 100644 index 00000000..6d7ee4b7 --- /dev/null +++ b/modules/services/serviceAi/subDocumentGeneration.py @@ -0,0 +1,804 @@ +import logging +from typing import Dict, Any, List, Optional, Tuple, Union +from modules.datamodels.datamodelChat import ChatDocument +from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + +logger = logging.getLogger(__name__) + + +class SubDocumentGeneration: + """Document generation operations including single-file and multi-file generation.""" + + def __init__(self, services, aiObjects, documentProcessor): + """Initialize document generation service. + + Args: + services: Service center instance for accessing other services + aiObjects: Initialized AiObjects instance + documentProcessor: Document processing service instance + """ + self.services = services + self.aiObjects = aiObjects + self.documentProcessor = documentProcessor + + async def callAiWithDocumentGeneration( + self, + prompt: str, + documents: Optional[List[ChatDocument]], + options: AiCallOptions, + outputFormat: str, + title: Optional[str] + ) -> Dict[str, Any]: + """ + Handle AI calls with document generation in specific output format. + Now supports both single-file and multi-file generation. 
+ + Args: + prompt: The main prompt for the AI call + documents: Optional list of documents to process + options: AI call configuration options + outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx) + title: Optional title for generated documents + + Returns: + Dict with generated documents and metadata + """ + try: + # Use AI to analyze prompt intent + prompt_analysis = await self._analyzePromptIntent(prompt, self) + logger.info(f"Prompt analysis result: {prompt_analysis}") + + if prompt_analysis.get("is_multi_file", False): + return await self._callAiWithMultiFileGeneration( + prompt, documents, options, outputFormat, title, prompt_analysis + ) + else: + return await self._callAiWithSingleFileGeneration( + prompt, documents, options, outputFormat, title + ) + + except Exception as e: + logger.error(f"Error in document generation: {str(e)}") + return { + "success": False, + "error": str(e), + "content": "", + "rendered_content": "", + "mime_type": "text/plain", + "filename": f"error_{outputFormat}", + "format": outputFormat, + "title": title or "Error", + "documents": [] + } + + async def _callAiWithSingleFileGeneration( + self, + prompt: str, + documents: Optional[List[ChatDocument]], + options: AiCallOptions, + outputFormat: str, + title: Optional[str], + generationPrompt: Optional[str] = None + ) -> Dict[str, Any]: + """Handle single-file document generation (existing functionality).""" + try: + # Get format-specific extraction prompt from generation service + from modules.services.serviceGeneration.mainServiceGeneration import GenerationService + generation_service = GenerationService(self.services) + + # Use default title if not provided + if not title: + title = "AI Generated Document" + + # Get format-specific extraction prompt + extractionPrompt = await generation_service.getExtractionPrompt( + outputFormat=outputFormat, + userPrompt=prompt, + title=title, + aiService=self + ) + + # Process documents with format-specific prompt 
using JSON mode + # This ensures structured JSON output instead of text + aiResponseJson = await self._callAiJson(extractionPrompt, documents, options) + + # Validate JSON response + if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson: + raise Exception("AI response is not valid JSON document structure") + + # Emit raw extracted data as a chat message attachment before rendering + try: + await self._postRawDataChatMessage(aiResponseJson, label="raw_extraction_single") + except Exception: + logger.warning("Failed to emit raw extraction chat message (single-file)") + + # Generate filename from document metadata + parsedFilename = None + try: + if aiResponseJson.get("metadata", {}).get("title"): + title = aiResponseJson["metadata"]["title"] + # Clean title for filename + import re + parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", title) + parsed = re.sub(r"-+", "-", parsed).strip('-') + if parsed: + parsedFilename = f"{parsed}.{outputFormat}" + except Exception: + parsedFilename = None + + # Use AI generation to enhance the extracted JSON before rendering + enhancedContent = aiResponseJson # Default to original + if prompt: + try: + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + + # Get generation prompt + generationPrompt = await generation_service.getGenerationPrompt( + outputFormat=outputFormat, + userPrompt=prompt, + title=title, + aiService=self + ) + + # Prepare the AI call + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + # Create context with the extracted JSON content + import json + context = f"Extracted JSON content:\n{json.dumps(aiResponseJson, indent=2)}" + + request = AiCallRequest( + prompt=generationPrompt, + context=context, + options=request_options + ) + + # Call AI to enhance the content + response = await self.aiObjects.call(request) + + if response and response.content: + # Parse the AI response as JSON + try: + import re + result = 
response.content.strip() + + # Extract JSON from markdown if present + json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) + if json_match: + result = json_match.group(1).strip() + elif result.startswith('```json'): + result = re.sub(r'^```json\s*', '', result) + result = re.sub(r'\s*```$', '', result) + elif result.startswith('```'): + result = re.sub(r'^```\s*', '', result) + result = re.sub(r'\s*```$', '', result) + + # Try to parse JSON + enhancedContent = json.loads(result) + logger.info(f"AI enhanced JSON content successfully") + + except json.JSONDecodeError as e: + logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content") + enhancedContent = aiResponseJson + else: + logger.warning("AI generation returned empty response, using original content") + enhancedContent = aiResponseJson + + except Exception as e: + logger.warning(f"AI generation failed: {str(e)}, using original content") + enhancedContent = aiResponseJson + + # Render the enhanced JSON content + renderedContent, mimeType = await generation_service.renderReport( + extractedContent=enhancedContent, + outputFormat=outputFormat, + title=title, + userPrompt=prompt, + aiService=self + ) + + # Generate meaningful filename (use AI-provided if valid, else fallback) + from datetime import datetime, UTC + timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"): + filename = parsedFilename + else: + safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-') + filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}" + + # Return structured result with document information + return { + "success": True, + "content": aiResponseJson, # Structured JSON document + "rendered_content": renderedContent, # Formatted content + "mime_type": mimeType, + "filename": filename, + "format": outputFormat, + "title": title, + "documents": [{ + "documentName": 
filename, + "documentData": renderedContent, + "mimeType": mimeType + }], + "is_multi_file": False + } + + except Exception as e: + logger.error(f"Error in single-file document generation: {str(e)}") + raise + + async def _callAiWithMultiFileGeneration( + self, + prompt: str, + documents: Optional[List[ChatDocument]], + options: AiCallOptions, + outputFormat: str, + title: Optional[str], + prompt_analysis: Dict[str, Any] + ) -> Dict[str, Any]: + """Handle multi-file document generation using AI analysis.""" + try: + # Get multi-file extraction prompt based on AI analysis + from modules.services.serviceGeneration.mainServiceGeneration import GenerationService + generation_service = GenerationService(self.services) + + # Use default title if not provided + if not title: + title = "AI Generated Documents" + + # Get adaptive extraction prompt + extraction_prompt = await generation_service.getAdaptiveExtractionPrompt( + outputFormat=outputFormat, + userPrompt=prompt, + title=title, + promptAnalysis=prompt_analysis, + aiService=self + ) + + logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters") + logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...") + + # Process with adaptive JSON schema - use the existing pipeline but with adaptive prompt + logger.info(f"Using adaptive prompt with existing pipeline: {len(extraction_prompt)} chars") + logger.debug(f"Processing documents: {len(documents) if documents else 0} documents") + + # Use the existing pipeline but replace the prompt with our adaptive one + # This ensures proper document processing while using the multi-file prompt + ai_response = await self.documentProcessor.processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options) + + logger.info(f"AI response type: {type(ai_response)}") + logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}") + logger.debug(f"AI response preview: 
{str(ai_response)[:500]}...") + + # Validate response structure + if not self._validateResponseStructure(ai_response, prompt_analysis): + # Fallback to single-file if multi-file fails + logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}") + logger.warning(f"Prompt analysis: {prompt_analysis}") + logger.warning("Falling back to single-file generation") + return await self._callAiWithSingleFileGeneration( + prompt, documents, options, outputFormat, title + ) + + # Emit raw extracted data as a chat message attachment before transformation/rendering + try: + await self._postRawDataChatMessage(ai_response, label="raw_extraction_multi") + except Exception: + logger.warning("Failed to emit raw extraction chat message (multi-file)") + + # Process multiple documents + generated_documents = [] + for i, doc_data in enumerate(ai_response.get("documents", [])): + # Transform AI-generated sections to renderer-compatible format + transformed_sections = [] + for section in doc_data.get("sections", []): + # Convert AI format to renderer format + transformed_section = { + "id": section.get("id", f"section_{len(transformed_sections) + 1}"), + "content_type": section.get("content_type", "paragraph"), + "elements": section.get("elements", []), + "order": section.get("order", len(transformed_sections) + 1) + } + + # Extract text from elements for simple text-based sections + if section.get("content_type") in ["paragraph", "heading"]: + text_parts = [] + for element in section.get("elements", []): + if "text" in element: + text_parts.append(element["text"]) + # Add text to the first element or create a new one + if transformed_section["elements"]: + transformed_section["elements"][0]["text"] = "\n".join(text_parts) + else: + transformed_section["elements"] = [{"text": "\n".join(text_parts)}] + + transformed_sections.append(transformed_section) + + # 
Create complete document structure for rendering + complete_document = { + "metadata": { + "title": doc_data["title"], + "source_document": "multi_file_generation", + "document_id": doc_data.get("id", f"doc_{i+1}"), + "filename": doc_data.get("filename", f"document_{i+1}"), + "split_strategy": prompt_analysis.get("strategy", "custom") + }, + "sections": transformed_sections, + "summary": f"Generated document: {doc_data['title']}", + "tags": ["multi_file", "ai_generated"] + } + + # Use AI generation to enhance the extracted JSON before rendering + enhancedContent = complete_document # Default to original + if prompt: + try: + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + + # Get generation prompt + generationPrompt = await generation_service.getGenerationPrompt( + outputFormat=outputFormat, + userPrompt=prompt, + title=doc_data["title"], + aiService=self + ) + + # Prepare the AI call + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + # Create context with the extracted JSON content + import json + context = f"Extracted JSON content:\n{json.dumps(complete_document, indent=2)}" + + request = AiCallRequest( + prompt=generationPrompt, + context=context, + options=request_options + ) + + # Call AI to enhance the content + response = await self.aiObjects.call(request) + + if response and response.content: + # Parse the AI response as JSON + try: + import re + result = response.content.strip() + + # Extract JSON from markdown if present + json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) + if json_match: + result = json_match.group(1).strip() + elif result.startswith('```json'): + result = re.sub(r'^```json\s*', '', result) + result = re.sub(r'\s*```$', '', result) + elif result.startswith('```'): + result = re.sub(r'^```\s*', '', result) + result = re.sub(r'\s*```$', '', result) + + # Try to parse JSON + enhancedContent = json.loads(result) + logger.info(f"AI 
enhanced JSON content successfully") + + except json.JSONDecodeError as e: + logger.warning(f"AI generation returned invalid JSON: {str(e)}, attempting to repair...") + # Try to repair common JSON issues + try: + repaired_result = self._repairJson(result) + enhancedContent = json.loads(repaired_result) + logger.info(f"Successfully repaired JSON content") + except (json.JSONDecodeError, Exception) as repair_error: + logger.warning(f"JSON repair failed: {str(repair_error)}, trying AI repair...") + # Try AI-powered JSON repair as last resort + try: + ai_repaired = await self._repairJsonWithAI(result) + enhancedContent = json.loads(ai_repaired) + logger.info(f"AI successfully repaired JSON content") + except Exception as ai_repair_error: + logger.warning(f"AI JSON repair also failed: {str(ai_repair_error)}, using original content") + enhancedContent = complete_document + else: + logger.warning("AI generation returned empty response, using original content") + enhancedContent = complete_document + + except Exception as e: + logger.warning(f"AI generation failed: {str(e)}, using original content") + enhancedContent = complete_document + + # Render the enhanced JSON content + rendered_content, mime_type = await generation_service.renderReport( + extractedContent=enhancedContent, + outputFormat=outputFormat, + title=doc_data["title"], + userPrompt=prompt, + aiService=self + ) + + # Generate proper filename with correct extension + base_filename = doc_data.get("filename", f"document_{i+1}") + # Remove any existing extension and add the correct one + if '.' 
in base_filename: + base_filename = base_filename.rsplit('.', 1)[0] + + # Add proper extension based on output format + if outputFormat.lower() == "docx": + filename = f"{base_filename}.docx" + elif outputFormat.lower() == "pdf": + filename = f"{base_filename}.pdf" + elif outputFormat.lower() == "html": + filename = f"{base_filename}.html" + else: + filename = f"{base_filename}.{outputFormat}" + + generated_documents.append({ + "documentName": filename, + "documentData": rendered_content, + "mimeType": mime_type + }) + + # Save debug files for multi-file generation - only if debug enabled + debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if debug_enabled: + try: + import os + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + debug_dir = os.path.join(debug_root, f"multifile_output_{ts}") + os.makedirs(debug_dir, exist_ok=True) + + # Save metadata + with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f: + f.write(f"title: {title}\n") + f.write(f"format: {outputFormat}\n") + f.write(f"documents_count: {len(generated_documents)}\n") + f.write(f"split_strategy: {prompt_analysis.get('strategy', 'custom')}\n") + f.write(f"prompt_analysis: {prompt_analysis}\n") + + # Save each generated document + for i, doc in enumerate(generated_documents): + doc_filename = doc["documentName"] + doc_data = doc["documentData"] + doc_mime = doc["mimeType"] + + # Determine file extension + if outputFormat.lower() == "docx": + file_ext = ".docx" + elif outputFormat.lower() == "pdf": + file_ext = ".pdf" + elif outputFormat.lower() == "html": + file_ext = ".html" + else: + file_ext = f".{outputFormat}" + + # Save the rendered document + output_path = os.path.join(debug_dir, f"document_{i+1}_{doc_filename}") + + if file_ext in ['.md', '.txt', '.html', '.json', '.csv']: + # Text-based formats + with open(output_path, 'w', encoding='utf-8') as f: + 
f.write(doc_data) + else: + # Binary formats - decode from base64 if needed + try: + import base64 + doc_bytes = base64.b64decode(doc_data) + with open(output_path, 'wb') as f: + f.write(doc_bytes) + except Exception: + # If not base64, save as text + with open(output_path, 'w', encoding='utf-8') as f: + f.write(doc_data) + + logger.info(f"πŸ’Ύ Debug: Saved multi-file document {i+1}: {output_path}") + + logger.info(f"πŸ’Ύ Debug: Multi-file output saved to: {debug_dir}") + + except Exception as e: + logger.warning(f"Failed to save multi-file debug output: {e}") + + return { + "success": True, + "content": ai_response, + "rendered_content": None, # Not applicable for multi-file + "mime_type": None, # Not applicable for multi-file + "filename": None, # Not applicable for multi-file + "format": outputFormat, + "title": title, + "documents": generated_documents, + "is_multi_file": True, + "split_strategy": prompt_analysis.get("strategy", "custom") + } + + except Exception as e: + logger.error(f"Error in multi-file document generation: {str(e)}") + # Fallback to single-file + return await self._callAiWithSingleFileGeneration( + prompt, documents, options, outputFormat, title + ) + + async def _callAiJson( + self, + prompt: str, + documents: Optional[List[ChatDocument]], + options: AiCallOptions + ) -> Dict[str, Any]: + """ + Handle AI calls with document processing for JSON output. + Returns structured JSON document instead of text. + """ + # Process documents with JSON merging + return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options) + + async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]: + """Use AI to analyze user prompt and determine processing requirements.""" + if not ai_service: + return {"is_multi_file": False, "strategy": "single", "criteria": None} + + try: + analysis_prompt = f""" +Analyze this user request and determine if it requires multiple file output or single file output. 
+ +User request: "{prompt}" + +Respond with JSON only in this exact format: +{{ + "is_multi_file": true/false, + "strategy": "single|per_entity|by_section|by_criteria|custom", + "criteria": "description of how to split content", + "file_naming_pattern": "suggested pattern for filenames", + "reasoning": "brief explanation of the analysis" +}} + +Consider: +- Does the user want separate files for different entities (customers, products, etc.)? +- Does the user want to split content into multiple documents? +- What would be the most logical way to organize the content? +- What language is the request in? (analyze in the original language) + +Return only the JSON response. +""" + + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options) + response = await ai_service.aiObjects.call(request) + + if response and response.content: + import json + import re + + # Extract JSON from response + result = response.content.strip() + json_match = re.search(r'\{.*\}', result, re.DOTALL) + if json_match: + result = json_match.group(0) + + analysis = json.loads(result) + return analysis + else: + return {"is_multi_file": False, "strategy": "single", "criteria": None} + + except Exception as e: + logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file") + return {"is_multi_file": False, "strategy": "single", "criteria": None} + + def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool: + """Validate that AI response matches the expected structure.""" + try: + if not isinstance(response, dict): + logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}") + return False + + # Check for multi-file structure + if prompt_analysis.get("is_multi_file", False): + has_documents = 
"documents" in response + is_documents_list = isinstance(response.get("documents"), list) + logger.info(f"Multi-file validation: has_documents={has_documents}, is_documents_list={is_documents_list}") + if has_documents and is_documents_list: + logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found") + else: + logger.warning(f"Multi-file validation failed: documents key present={has_documents}, documents is list={is_documents_list}") + logger.warning(f"Available keys: {list(response.keys())}") + return has_documents and is_documents_list + else: + has_sections = "sections" in response + is_sections_list = isinstance(response.get("sections"), list) + logger.info(f"Single-file validation: has_sections={has_sections}, is_sections_list={is_sections_list}") + return has_sections and is_sections_list + except Exception as e: + logger.warning(f"Response validation failed with exception: {str(e)}") + return False + + async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None: + """ + Create a ChatMessage with the extracted raw JSON attached as a file so the user + has access to the data even if downstream processing fails. 
+ """ + try: + services = self.services + workflow = services.currentWorkflow + + # Serialize payload + import json as _json + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + content_text = _json.dumps(payload, ensure_ascii=False, indent=2) + content_bytes = content_text.encode('utf-8') + + # Store as file via component storage + file_name = f"{label}_{ts}.json" + file_item = services.interfaceDbComponent.createFile( + name=file_name, + mimeType="application/json", + content=content_bytes + ) + services.interfaceDbComponent.createFileData(file_item.id, content_bytes) + + # Lookup file info for ChatDocument + file_info = services.workflow.getFileInfo(file_item.id) + doc = ChatDocument( + messageId="", # set after message creation + fileId=file_item.id, + fileName=file_info.get("fileName", file_name) if file_info else file_name, + fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes), + mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json" + ) + + # Create message referencing the file + messageData = { + "workflowId": workflow.id, + "role": "assistant", + "message": "Raw extraction data saved", + "status": "data", + "sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1, + "publishedAt": services.utils.getUtcTimestamp(), + "documentsLabel": label, + "documents": [] + } + message = services.workflow.createMessage(messageData) + if not message: + return + + # Persist ChatDocument with messageId + doc.messageId = message.id + services.interfaceDbChat.createDocument(doc.to_dict()) + + # Update message to include document + try: + if not message.documents: + message.documents = [] + message.documents.append(doc) + services.workflow.updateMessage(message.id, {"documents": [d.to_dict() for d in message.documents]}) + except Exception: + pass + except Exception: + # Non-fatal; ignore if storage or chat creation fails + return + + def _repairJson(self, 
json_string: str) -> str: + """Repair common JSON syntax errors efficiently for large JSON.""" + try: + import re + import json + + # Remove any leading/trailing whitespace + json_string = json_string.strip() + + # For large JSON, skip substring extraction and go straight to targeted repairs + logger.info(f"Attempting JSON repair for {len(json_string)} characters...") + + # Try to parse first to see what specific error we get + try: + json.loads(json_string) + return json_string # Already valid + except json.JSONDecodeError as e: + error_msg = str(e) + logger.info(f"JSON error: {error_msg}") + + # Apply targeted fixes based on the specific error + if "Expecting ',' delimiter" in error_msg: + # Fix missing commas between array elements + json_string = re.sub(r'\]\s*\[', '], [', json_string) + json_string = re.sub(r'\}\s*\{', '}, {', json_string) + # Fix missing commas between object properties + json_string = re.sub(r'("\s*:\s*[^,}]+)\s*(")', r'\1, \2', json_string) + + if "Expecting value" in error_msg: + # Fix missing values (replace empty with null) + json_string = re.sub(r':\s*,', ': null,', json_string) + json_string = re.sub(r':\s*}', ': null}', json_string) + + if "Expecting property name" in error_msg: + # Fix unquoted property names + json_string = re.sub(r'(\w+):', r'"\1":', json_string) + + # Fix trailing commas before closing brackets/braces + json_string = re.sub(r',(\s*[}\]])', r'\1', json_string) + + # Fix missing closing brackets/braces (only if reasonable) + open_braces = json_string.count('{') + close_braces = json_string.count('}') + open_brackets = json_string.count('[') + close_brackets = json_string.count(']') + + # Only add missing brackets if the difference is small (avoid runaway) + if 0 < (open_braces - close_braces) <= 5: + missing_braces = open_braces - close_braces + json_string += '}' * missing_braces + + if 0 < (open_brackets - close_brackets) <= 5: + missing_brackets = open_brackets - close_brackets + json_string += ']' * 
missing_brackets + + # Try to parse again + try: + json.loads(json_string) + logger.info("JSON repair successful") + return json_string + except json.JSONDecodeError: + logger.warning("JSON repair failed - will try AI repair") + return json_string + + except Exception as e: + logger.warning(f"JSON repair failed: {str(e)}") + return json_string + + async def _repairJsonWithAI(self, malformed_json: str) -> str: + """Use AI to repair malformed JSON efficiently for large files.""" + try: + # Limit JSON size for AI processing (max 50KB to avoid token limits) + max_json_size = 50000 + json_to_repair = malformed_json + + if len(malformed_json) > max_json_size: + logger.warning(f"JSON too large ({len(malformed_json)} chars), truncating to {max_json_size} chars for AI repair") + # Try to find a good truncation point (end of a complete object/array) + truncate_at = max_json_size + for i in range(max_json_size, max(0, max_json_size - 1000), -1): + if malformed_json[i] in ['}', ']']: + truncate_at = i + 1 + break + json_to_repair = malformed_json[:truncate_at] + "..." + + repair_prompt = f""" +You are a JSON repair expert. Fix the following malformed JSON and return ONLY the corrected JSON, no explanations. 
+ +Malformed JSON: +{json_to_repair} + +Return only the valid JSON: +""" + + # Use AI to repair the JSON + repaired_json = await self.services.ai.callAi( + prompt=repair_prompt, + documents=None, + options={ + "process_type": "text", + "operation_type": "generate_content", + "priority": "speed", + "max_cost": 0.01 + } + ) + + # Clean up the response (remove any markdown formatting) + repaired_json = repaired_json.strip() + if repaired_json.startswith('```json'): + repaired_json = repaired_json[7:] + if repaired_json.endswith('```'): + repaired_json = repaired_json[:-3] + repaired_json = repaired_json.strip() + + # Validate the repaired JSON + import json + json.loads(repaired_json) + logger.info("AI JSON repair successful") + return repaired_json + + except Exception as e: + logger.warning(f"AI JSON repair failed: {str(e)}") + return malformed_json diff --git a/modules/services/serviceAi/subDocumentProcessing.py b/modules/services/serviceAi/subDocumentProcessing.py new file mode 100644 index 00000000..d85a5341 --- /dev/null +++ b/modules/services/serviceAi/subDocumentProcessing.py @@ -0,0 +1,1132 @@ +import logging +from typing import Dict, Any, List, Optional, Tuple, Union +from modules.datamodels.datamodelChat import ChatDocument +from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, ModelCapabilities, OperationType, Priority +from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted +from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService + +logger = logging.getLogger(__name__) + + +class SubDocumentProcessing: + """Document processing operations including chunking, processing, and merging.""" + + def __init__(self, services, aiObjects): + """Initialize document processing service. 
+ + Args: + services: Service center instance for accessing other services + aiObjects: Initialized AiObjects instance + """ + self.services = services + self.aiObjects = aiObjects + self._extractionService = None + + @property + def extractionService(self): + """Lazy initialization of extraction service.""" + if self._extractionService is None: + logger.info("Lazy initializing ExtractionService...") + self._extractionService = ExtractionService(self.services) + return self._extractionService + + def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int: + """Calculate maximum context bytes based on model capabilities and options.""" + if options and options.maxContextBytes: + return options.maxContextBytes + + # Default model capabilities (this should be enhanced with actual model registry) + defaultMaxTokens = 4000 + safetyMargin = options.safetyMargin if options else 0.1 + + # Calculate bytes (4 chars per token estimation) + maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4) + + return maxContextBytes + + async def processDocumentsPerChunk( + self, + documents: List[ChatDocument], + prompt: str, + options: Optional[AiCallOptions] = None + ) -> str: + """ + Process documents with per-chunk AI calls and merge results. + FIXED: Now preserves chunk relationships and document structure. 
+ + Args: + documents: List of ChatDocument objects to process + prompt: AI prompt for processing + options: AI call options + + Returns: + Merged AI results as string with preserved document structure + """ + if not documents: + return "" + + # Get model capabilities for size calculation + model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) + + # Build extraction options for chunking with intelligent merging + extractionOptions: Dict[str, Any] = { + "prompt": prompt, + "operationType": options.operationType if options else "general", + "processDocumentsIndividually": True, # Process each document separately + "maxSize": model_capabilities["maxContextBytes"], + "chunkAllowed": True, + "textChunkSize": model_capabilities["textChunkSize"], + "imageChunkSize": model_capabilities["imageChunkSize"], + "imageMaxPixels": 1024 * 1024, + "imageQuality": 85, + "mergeStrategy": { + "useIntelligentMerging": True, # Enable intelligent token-aware merging + "modelCapabilities": model_capabilities, + "prompt": prompt, + "groupBy": "typeGroup", + "orderBy": "id", + "mergeType": "concatenate" + }, + } + + logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") + + try: + # Extract content with chunking + extractionResult = self.extractionService.extractContent(documents, extractionOptions) + + if not isinstance(extractionResult, list): + return "[Error: No extraction results]" + + # FIXED: Process chunks with proper mapping + chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options) + + # FIXED: Merge with preserved chunk relationships + mergedContent = self._mergeChunkResults(chunkResults, options) + + # Save merged extraction content to debug file - only if debug enabled + try: + debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if debug_enabled: + import os + from 
datetime import datetime, UTC + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + os.makedirs(debug_root, exist_ok=True) + with open(os.path.join(debug_root, f"{ts}_extraction_merged.txt"), "w", encoding="utf-8") as f: + f.write(mergedContent or "") + except Exception: + pass + + return mergedContent + + except Exception as e: + logger.error(f"Error in per-chunk processing: {str(e)}") + return f"[Error in per-chunk processing: {str(e)}]" + + async def processDocumentsPerChunkJson( + self, + documents: List[ChatDocument], + prompt: str, + options: Optional[AiCallOptions] = None + ) -> Dict[str, Any]: + """ + Process documents with per-chunk AI calls and merge results in JSON mode. + Returns structured JSON document instead of text. + """ + if not documents: + return {"metadata": {"title": "Empty Document"}, "sections": []} + + # Get model capabilities for size calculation + model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) + + # Build extraction options for chunking with intelligent merging + extractionOptions: Dict[str, Any] = { + "prompt": prompt, + "operationType": options.operationType if options else "general", + "processDocumentsIndividually": True, # Process each document separately + "maxSize": model_capabilities["maxContextBytes"], + "chunkAllowed": True, + "textChunkSize": model_capabilities["textChunkSize"], + "imageChunkSize": model_capabilities["imageChunkSize"], + "imageMaxPixels": 1024 * 1024, + "imageQuality": 85, + "mergeStrategy": { + "useIntelligentMerging": True, # Enable intelligent token-aware merging + "modelCapabilities": model_capabilities, + "prompt": prompt, + "groupBy": "typeGroup", + "orderBy": "id", + "mergeType": "concatenate" + }, + } + + logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") + + try: + # Extract content with chunking + 
extractionResult = self.extractionService.extractContent(documents, extractionOptions) + + if not isinstance(extractionResult, list): + return {"metadata": {"title": "Error Document"}, "sections": []} + + # Process chunks with proper mapping + chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options, generate_json=True) + + # Merge with JSON mode + mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options) + + # Normalize merged JSON into a single canonical table (only if table content exists) + try: + from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService + normalizer = NormalizationService(self.services) + inventory = normalizer.discoverStructures(mergedJsonDocument) + + # Check if any table content was discovered + tableHeaders = inventory.get("tableHeaders", []) + if not tableHeaders: + logger.info("No table content found in merged JSON, skipping normalization and returning original structure") + else: + # Use workflow id as cache key + cacheKey = self.services.currentWorkflow.id + # Provide the extraction/merge prompt context when available to help mapping + mergePrompt = prompt + mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt) + canonical = normalizer.applyMapping(mergedJsonDocument, mapping) + report = normalizer.validateCanonical(canonical) + if report.get('success'): + mergedJsonDocument = canonical + else: + raise ValueError('Normalization produced zero rows') + except Exception as e: + # Log normalization failure but don't re-raise - continue with original merged JSON + logger.warning(f"Normalization failed (expected): {str(e)}") + logger.debug(f"Normalization error type: {type(e).__name__}") + # Continue with original merged JSON instead of re-raising + + # Save merged JSON extraction content to debug file - only if debug enabled + try: + debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if 
debug_enabled: + import os + import json as _json + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + os.makedirs(debug_root, exist_ok=True) + with open(os.path.join(debug_root, f"{ts}_extraction_merged.json"), "w", encoding="utf-8") as f: + f.write(_json.dumps(mergedJsonDocument, ensure_ascii=False, indent=2)) + except Exception: + pass + + return mergedJsonDocument + + except Exception as e: + logger.error(f"Error in per-chunk processing (JSON mode): {str(e)}") + logger.error(f"Exception type: {type(e).__name__}") + logger.error(f"Exception args: {e.args}") + import traceback + logger.error(f"Traceback: {traceback.format_exc()}") + return {"metadata": {"title": "Error Document"}, "sections": []} + + async def processDocumentsPerChunkJsonWithPrompt( + self, + documents: List[ChatDocument], + custom_prompt: str, + options: Optional[AiCallOptions] = None + ) -> Dict[str, Any]: + """ + Process documents with per-chunk AI calls and merge results in JSON mode. + Uses a custom prompt instead of the default extraction prompt. 
+ """ + if not documents: + return {"metadata": {"title": "Empty Document"}, "sections": []} + + # Get model capabilities for size calculation + model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options) + + # Build extraction options for chunking with intelligent merging + extractionOptions: Dict[str, Any] = { + "prompt": custom_prompt, # Use the custom prompt instead of default + "operationType": options.operationType if options else "general", + "processDocumentsIndividually": True, # Process each document separately + "maxSize": model_capabilities["maxContextBytes"], + "chunkAllowed": True, + "textChunkSize": model_capabilities["textChunkSize"], + "imageChunkSize": model_capabilities["imageChunkSize"], + "imageMaxPixels": 1024 * 1024, + "imageQuality": 85, + "mergeStrategy": { + "useIntelligentMerging": True, # Enable intelligent token-aware merging + "modelCapabilities": model_capabilities, + "prompt": custom_prompt, # Use the custom prompt + "groupBy": "typeGroup", + "orderBy": "id", + "mergeType": "concatenate" + }, + } + + logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") + + try: + # Extract content with chunking + extractionResult = self.extractionService.extractContent(documents, extractionOptions) + + if not isinstance(extractionResult, list): + return {"metadata": {"title": "Error Document"}, "sections": []} + + # Process chunks with proper mapping + logger.info(f"Processing {len(extractionResult)} chunks with custom prompt") + logger.debug(f"Custom prompt preview: {custom_prompt[:200]}...") + + # Debug: Show what content is being processed (before filtering) + for i, ec in enumerate(extractionResult): + if hasattr(ec, 'parts'): + for j, part in enumerate(ec.parts): + if not (hasattr(part, 'data') and part.data): + # Check if this is an empty container chunk (which is expected) + part_type = 
getattr(part, 'typeGroup', None) + part_mime = getattr(part, 'mimeType', '') + + is_empty_container = ( + part_type == "container" and + part_mime and + 'document' in part_mime.lower() + ) + + if not is_empty_container: + logger.warning(f"Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}'") + + chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True) + + # Debug: Show what chunks were actually processed (after filtering) + logger.info(f"After filtering: {len(chunkResults)} chunks will be processed") + + # Merge with JSON mode + mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options) + + # Debug: Show what the AI actually returned + logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}") + if 'documents' in mergedJsonDocument: + logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}") + elif 'sections' in mergedJsonDocument: + logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}") + + return mergedJsonDocument + + except Exception as e: + logger.error(f"Error in per-chunk JSON processing: {str(e)}") + return {"metadata": {"title": "Error Document"}, "sections": []} + + async def callAiText( + self, + prompt: str, + documents: Optional[List[ChatDocument]], + options: AiCallOptions + ) -> str: + """ + Handle text calls with document processing through ExtractionService. + UNIFIED PROCESSING: Always use per-chunk processing for consistency. 
+ """ + # UNIFIED PROCESSING: Always use per-chunk processing for consistency + # This ensures MIME-type checking, chunk mapping, and parallel processing + return await self.processDocumentsPerChunk(documents, prompt, options) + + async def _processChunksWithMapping( + self, + extractionResult: List[ContentExtracted], + prompt: str, + options: Optional[AiCallOptions] = None, + generate_json: bool = False + ) -> List[ChunkResult]: + """Process chunks with proper mapping to preserve relationships.""" + from modules.datamodels.datamodelExtraction import ChunkResult + import asyncio + import time + + # Collect all chunks that need processing with proper indexing + chunks_to_process = [] + chunk_index = 0 + + for ec in extractionResult: + # Get document MIME type from metadata + document_mime_type = None + for part in ec.parts: + if part.metadata and 'documentMimeType' in part.metadata: + document_mime_type = part.metadata['documentMimeType'] + break + + for part in ec.parts: + if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"): + # Skip empty container chunks (they're just metadata containers) + if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0): + logger.debug(f"Skipping empty container chunk: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}") + continue + + chunks_to_process.append({ + 'part': part, + 'chunk_index': chunk_index, + 'document_id': ec.id, + 'document_mime_type': document_mime_type + }) + chunk_index += 1 + + logger.info(f"Processing {len(chunks_to_process)} chunks with proper mapping") + + # Process chunks in parallel with proper mapping + async def process_single_chunk(chunk_info: Dict) -> ChunkResult: + part = chunk_info['part'] + chunk_index = chunk_info['chunk_index'] + document_id = chunk_info['document_id'] + document_mime_type = chunk_info.get('document_mime_type', part.mimeType) + + start_time = time.time() + + try: + # FIXED: Check MIME type first, then 
fallback to typeGroup + is_image = ( + (document_mime_type and document_mime_type.startswith('image/')) or + (part.mimeType and part.mimeType.startswith('image/')) or + (part.typeGroup == "image") + ) + + # Debug logging + self.services.utils.debugLogToFile(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}", "AI_SERVICE") + logger.info(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}") + + if is_image: + # Use the same extraction prompt for image analysis (contains table JSON format) + self.services.utils.debugLogToFile(f"Processing image chunk {chunk_index}: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE") + + # Check if image data is available + if not part.data: + error_msg = f"No image data available for chunk {chunk_index}" + logger.warning(error_msg) + ai_result = f"Error: {error_msg}" + else: + try: + # Import here to avoid circular imports + from modules.services.serviceAi.subCoreAi import SubCoreAi + core_ai = SubCoreAi(self.services, self.aiObjects) + + ai_result = await core_ai.readImage( + prompt=prompt, + imageData=part.data, + mimeType=part.mimeType, + options=options + ) + + self.services.utils.debugLogToFile(f"Image analysis result for chunk {chunk_index}: length={len(ai_result) if ai_result else 0}, preview={ai_result[:200] if ai_result else 'None'}...", "AI_SERVICE") + # Save image extraction response to debug file - only if debug enabled + debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if debug_enabled: + try: + import os + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + os.makedirs(debug_root, exist_ok=True) + with open(os.path.join(debug_root, f"{ts}_extraction_image_chunk_{chunk_index}.txt"), 
"w", encoding="utf-8") as f: + f.write(f"EXTRACTION IMAGE RESPONSE:\n{ai_result if ai_result else 'No response'}\n") + except Exception: + pass + + # Check if result is empty or None + if not ai_result or not ai_result.strip(): + logger.warning(f"Image chunk {chunk_index} returned empty response from AI") + ai_result = "No content detected in image" + + except Exception as e: + logger.error(f"Error processing image chunk {chunk_index}: {str(e)}") + ai_result = f"Error analyzing image: {str(e)}" + + # If generating JSON, clean image analysis result + if generate_json: + try: + import json + import re + + # Clean the response - remove markdown code blocks if present + cleaned_result = ai_result.strip() + + # Remove various markdown patterns + if cleaned_result.startswith('```json'): + cleaned_result = re.sub(r'^```json\s*', '', cleaned_result) + cleaned_result = re.sub(r'\s*```$', '', cleaned_result) + elif cleaned_result.startswith('```'): + cleaned_result = re.sub(r'^```\s*', '', cleaned_result) + cleaned_result = re.sub(r'\s*```$', '', cleaned_result) + + # Remove any leading/trailing text that's not JSON + # Look for the first { and last } to extract JSON + first_brace = cleaned_result.find('{') + last_brace = cleaned_result.rfind('}') + + if first_brace != -1 and last_brace != -1 and last_brace > first_brace: + cleaned_result = cleaned_result[first_brace:last_brace + 1] + + # Additional cleaning for common AI response issues + cleaned_result = cleaned_result.strip() + + # Validate JSON + json.loads(cleaned_result) + ai_result = cleaned_result # Use cleaned version + self.services.utils.debugLogToFile(f"Image chunk {chunk_index} JSON validation successful", "AI_SERVICE") + + except json.JSONDecodeError as e: + logger.warning(f"Image chunk {chunk_index} returned invalid JSON: {str(e)}") + logger.warning(f"Raw response was: '{ai_result[:500]}...'") + + # Create fallback JSON with the actual response content (not the error message) + # Use the original AI response 
content, not the error message + fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected" + + self.services.utils.debugLogToFile(f"IMAGE FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE") + + ai_result = json.dumps({ + "metadata": {"title": f"Image Analysis - Chunk {chunk_index}"}, + "sections": [{ + "id": f"image_section_{chunk_index}", + "content_type": "paragraph", + "elements": [{"text": fallback_content}] + }] + }) + self.services.utils.debugLogToFile(f"Created fallback JSON for image chunk {chunk_index} with actual content", "AI_SERVICE") + elif part.typeGroup in ("container", "binary"): + # Handle ALL container and binary content generically - let AI process any document type + self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: typeGroup={part.typeGroup}, mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE") + + # Skip empty container chunks (they're just metadata containers) + if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0): + self.services.utils.debugLogToFile(f"DEBUG: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE") + logger.info(f"Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}") + # Skip processing this chunk + pass + elif part.mimeType and part.data and len(part.data.strip()) > 0: + # Process any document container as text content + request_options = options if options is not None else AiCallOptions() + request_options.operationType = OperationType.GENERAL + self.services.utils.debugLogToFile(f"EXTRACTION CONTAINER CHUNK {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}", "AI_SERVICE") + logger.info(f"Chunk {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}") + + # Log extraction 
prompt and context + self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE") + self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE") + + # Strengthen prompt to forbid fabrication for text/container extraction + augmented_prompt = ( + f"{prompt}\n\n" + "CRITICAL RULES (NO FABRICATION):\n" + "- Use ONLY content present in the provided CONTEXT.\n" + "- Do NOT create, infer, or guess values not explicitly in the context.\n" + "- If a value is missing, leave the cell empty or omit the row.\n" + ) + request = AiCallRequest( + prompt=augmented_prompt, + context=part.data, + options=request_options + ) + response = await self.aiObjects.call(request) + ai_result = response.content + + # Log extraction response + self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE") + + # Save full extraction prompt and response to debug file - only if debug enabled + debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if debug_enabled: + try: + import os + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + os.makedirs(debug_root, exist_ok=True) + with open(os.path.join(debug_root, f"{ts}_extraction_container_chunk_{chunk_index}.txt"), "w", encoding="utf-8") as f: + f.write(f"EXTRACTION PROMPT:\n{prompt}\n\n") + f.write(f"EXTRACTION CONTEXT:\n{part.data if part.data else 'No context'}\n\n") + f.write(f"EXTRACTION RESPONSE:\n{ai_result if ai_result else 'No response'}\n") + except Exception: + pass + + # If generating JSON, validate the response + if generate_json: + try: + import json + import re + + # Clean the response - remove markdown code blocks if present + cleaned_result = ai_result.strip() + + # Remove various markdown patterns + if cleaned_result.startswith('```json'): + cleaned_result = 
re.sub(r'^```json\s*', '', cleaned_result) + cleaned_result = re.sub(r'\s*```$', '', cleaned_result) + elif cleaned_result.startswith('```'): + cleaned_result = re.sub(r'^```\s*', '', cleaned_result) + cleaned_result = re.sub(r'\s*```$', '', cleaned_result) + + # Remove any leading/trailing text that's not JSON + # Look for the first { and last } to extract JSON + first_brace = cleaned_result.find('{') + last_brace = cleaned_result.rfind('}') + + if first_brace != -1 and last_brace != -1 and last_brace > first_brace: + cleaned_result = cleaned_result[first_brace:last_brace + 1] + + # Additional cleaning for common AI response issues + cleaned_result = cleaned_result.strip() + + # Validate JSON + json.loads(cleaned_result) + ai_result = cleaned_result # Use cleaned version + + except json.JSONDecodeError as e: + logger.warning(f"Container chunk {chunk_index} ({part.mimeType}) returned invalid JSON: {str(e)}") + logger.warning(f"Raw response was: '{ai_result[:500]}...'") + + # Create fallback JSON with the actual response content (not the error message) + # Use the original AI response content, not the error message + fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected" + + self.services.utils.debugLogToFile(f"FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE") + + ai_result = json.dumps({ + "metadata": {"title": f"Document Analysis - Chunk {chunk_index}"}, + "sections": [{ + "id": f"analysis_section_{chunk_index}", + "content_type": "paragraph", + "elements": [{"text": fallback_content}] + }] + }) + self.services.utils.debugLogToFile(f"Created fallback JSON for container chunk {chunk_index} with actual content", "AI_SERVICE") + else: + # Skip empty or invalid container/binary content - don't create a result + self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE") + # Return None to 
indicate this chunk should be completely skipped + return None + else: + # Ensure options is not None and set correct operation type for text + request_options = options if options is not None else AiCallOptions() + # FIXED: Set operation type to general for text processing + request_options.operationType = OperationType.GENERAL + self.services.utils.debugLogToFile(f"EXTRACTION CHUNK {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}", "AI_SERVICE") + logger.info(f"Chunk {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}") + + # Log extraction context length + self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE") + + # Debug: Log the actual prompt being sent to AI + logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...") + logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...") + + # Strengthen prompt to forbid fabrication for text extraction + augmented_prompt_text = ( + f"{prompt}\n\n" + "CRITICAL RULES (NO FABRICATION):\n" + "- Use ONLY content present in the provided CONTEXT.\n" + "- Do NOT create, infer, or guess values not explicitly in the context.\n" + "- If a value is missing, leave the cell empty or omit the row.\n" + ) + request = AiCallRequest( + prompt=augmented_prompt_text, + context=part.data, + options=request_options + ) + response = await self.aiObjects.call(request) + + # Debug: Log what AI actually returned + logger.debug(f"AI RESPONSE PREVIEW: {response.content[:300] if response.content else 'None'}...") + ai_result = response.content + + # Log extraction response length + self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE") + + # Save extraction response to debug file (without verbose prompt) - only if debug enabled + debug_enabled = 
self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if debug_enabled: + try: + import os + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + os.makedirs(debug_root, exist_ok=True) + with open(os.path.join(debug_root, f"{ts}_extraction_chunk_{chunk_index}.txt"), "w", encoding="utf-8") as f: + f.write(f"EXTRACTION RESPONSE:\n{ai_result if ai_result else 'No response'}\n") + except Exception: + pass + + # If generating JSON, validate the response + if generate_json: + try: + import json + import re + + # Clean the response - remove markdown code blocks and extra formatting + cleaned_result = ai_result.strip() + + # Remove any markdown code block markers (```json, ```, etc.) + cleaned_result = re.sub(r'^```(?:json)?\s*', '', cleaned_result, flags=re.MULTILINE) + cleaned_result = re.sub(r'\s*```\s*$', '', cleaned_result, flags=re.MULTILINE) + + # Remove any remaining ``` markers anywhere in the text + cleaned_result = re.sub(r'```', '', cleaned_result) + + # Try to extract JSON from the response if it's embedded in other text + json_match = re.search(r'\{.*\}', cleaned_result, re.DOTALL) + if json_match: + cleaned_result = json_match.group(0) + + # Validate JSON + json.loads(cleaned_result) + ai_result = cleaned_result # Use cleaned version + + except json.JSONDecodeError as e: + logger.warning(f"Chunk {chunk_index} returned invalid JSON: {str(e)}") + # Create fallback JSON + ai_result = json.dumps({ + "metadata": {"title": "Error Section"}, + "sections": [{ + "id": f"error_section_{chunk_index}", + "content_type": "paragraph", + "elements": [{"text": f"Error parsing JSON: {str(e)}"}] + }] + }) + + processing_time = time.time() - start_time + + logger.info(f"Chunk {chunk_index} processed: {len(ai_result)} chars in {processing_time:.2f}s") + + return ChunkResult( + originalChunk=part, + aiResult=ai_result, + chunkIndex=chunk_index, + documentId=document_id, + 
processingTime=processing_time, + metadata={ + "success": True, + "chunkSize": len(part.data) if part.data else 0, + "resultSize": len(ai_result), + "typeGroup": part.typeGroup + } + ) + + except Exception as e: + processing_time = time.time() - start_time + logger.warning(f"Error processing chunk {chunk_index}: {str(e)}") + + return ChunkResult( + originalChunk=part, + aiResult=f"[Error processing chunk: {str(e)}]", + chunkIndex=chunk_index, + documentId=document_id, + processingTime=processing_time, + metadata={ + "success": False, + "error": str(e), + "chunkSize": len(part.data) if part.data else 0, + "typeGroup": part.typeGroup + } + ) + + # Process chunks with concurrency control + max_concurrent = 5 # Default concurrency + if options and hasattr(options, 'maxConcurrentChunks'): + max_concurrent = options.maxConcurrentChunks + elif options and hasattr(options, 'maxParallelChunks'): + max_concurrent = options.maxParallelChunks + + logger.info(f"Processing {len(chunks_to_process)} chunks with max concurrency: {max_concurrent}") + self.services.utils.debugLogToFile(f"DEBUG: Chunks to process: {len(chunks_to_process)}", "AI_SERVICE") + for i, chunk_info in enumerate(chunks_to_process): + self.services.utils.debugLogToFile(f"DEBUG: Chunk {i}: typeGroup={chunk_info['part'].typeGroup}, mimeType={chunk_info['part'].mimeType}, data_length={len(chunk_info['part'].data) if chunk_info['part'].data else 0}", "AI_SERVICE") + + # Create semaphore for concurrency control + semaphore = asyncio.Semaphore(max_concurrent) + + async def process_with_semaphore(chunk_info): + async with semaphore: + return await process_single_chunk(chunk_info) + + # Process all chunks in parallel with concurrency control + tasks = [process_with_semaphore(chunk_info) for chunk_info in chunks_to_process] + self.services.utils.debugLogToFile(f"DEBUG: Created {len(tasks)} tasks for parallel processing", "AI_SERVICE") + chunk_results = await asyncio.gather(*tasks, return_exceptions=True) + 
self.services.utils.debugLogToFile(f"DEBUG: Got {len(chunk_results)} results from parallel processing", "AI_SERVICE") + + # Handle any exceptions in the gather itself + processed_results = [] + for i, result in enumerate(chunk_results): + if isinstance(result, Exception): + # Create error ChunkResult + chunk_info = chunks_to_process[i] + processed_results.append(ChunkResult( + originalChunk=chunk_info['part'], + aiResult=f"[Error in parallel processing: {str(result)}]", + chunkIndex=chunk_info['chunk_index'], + documentId=chunk_info['document_id'], + processingTime=0.0, + metadata={"success": False, "error": str(result)} + )) + elif result is not None: + # Only add non-None results (skip empty containers) + processed_results.append(result) + + logger.info(f"Completed processing {len(processed_results)} chunks") + return processed_results + + def _mergeChunkResults( + self, + chunkResults: List[ChunkResult], + options: Optional[AiCallOptions] = None + ) -> str: + """Merge chunk results while preserving document structure and chunk order.""" + + if not chunkResults: + return "" + + # Get merging configuration from options + chunk_separator = "\n\n---\n\n" + include_document_headers = True + include_chunk_metadata = False + + if options: + if hasattr(options, 'chunkSeparator'): + chunk_separator = options.chunkSeparator + elif hasattr(options, 'mergeStrategy') and options.mergeStrategy: + chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n---\n\n") + + # Check for enhanced options + if hasattr(options, 'preserveChunkMetadata'): + include_chunk_metadata = options.preserveChunkMetadata + + # Group chunk results by document + results_by_document = {} + for chunk_result in chunkResults: + doc_id = chunk_result.documentId + if doc_id not in results_by_document: + results_by_document[doc_id] = [] + results_by_document[doc_id].append(chunk_result) + + # Sort chunks within each document by chunk index + for doc_id in results_by_document: + 
results_by_document[doc_id].sort(key=lambda x: x.chunkIndex) + + # Merge results for each document + merged_documents = [] + + for doc_id, doc_chunks in results_by_document.items(): + # Build document header if enabled + doc_header = "" + if include_document_headers: + doc_header = f"\n\n=== DOCUMENT: {doc_id} ===\n\n" + + # Merge chunks for this document + doc_content = "" + for i, chunk_result in enumerate(doc_chunks): + # Add chunk separator (except for first chunk) + if i > 0: + doc_content += chunk_separator + + # Add chunk content with optional metadata + chunk_metadata = chunk_result.metadata + if chunk_metadata.get("success", False): + chunk_content = chunk_result.aiResult + + # Add chunk metadata if enabled + if include_chunk_metadata: + chunk_info = f"[Chunk {chunk_result.chunkIndex} - {chunk_metadata.get('typeGroup', 'unknown')} - {chunk_metadata.get('chunkSize', 0)} chars]" + chunk_content = f"{chunk_info}\n{chunk_content}" + + doc_content += chunk_content + else: + # Handle error chunks + error_msg = f"[ERROR in chunk {chunk_result.chunkIndex}: {chunk_metadata.get('error', 'Unknown error')}]" + doc_content += error_msg + + merged_documents.append(doc_header + doc_content) + + # Join all documents + final_result = "\n\n".join(merged_documents) + + logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents") + return final_result.strip() + + def _mergeChunkResultsClean( + self, + chunkResults: List[ChunkResult], + options: Optional[AiCallOptions] = None + ) -> str: + """Merge chunk results in CLEAN mode - no debug metadata or document headers.""" + + if not chunkResults: + return "" + + # Get merging configuration from options + chunk_separator = "\n\n" + include_document_headers = False # CLEAN MODE: No document headers + include_chunk_metadata = False # CLEAN MODE: No chunk metadata + + if options: + if hasattr(options, 'chunkSeparator'): + chunk_separator = options.chunkSeparator + elif hasattr(options, 'mergeStrategy') 
and options.mergeStrategy: + chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n") + + # Group chunk results by document + results_by_document = {} + for chunk_result in chunkResults: + doc_id = chunk_result.documentId + if doc_id not in results_by_document: + results_by_document[doc_id] = [] + results_by_document[doc_id].append(chunk_result) + + # Sort chunks within each document by chunk index + for doc_id in results_by_document: + results_by_document[doc_id].sort(key=lambda x: x.chunkIndex) + + # Merge results for each document in CLEAN mode + merged_documents = [] + + for doc_id, doc_chunks in results_by_document.items(): + # CLEAN MODE: No document headers + doc_header = "" + + # Merge chunks for this document + doc_content = "" + for i, chunk_result in enumerate(doc_chunks): + # Add chunk separator (except for first chunk) + if i > 0: + doc_content += chunk_separator + + # Add chunk content without metadata + chunk_metadata = chunk_result.metadata + if chunk_metadata.get("success", False): + chunk_content = chunk_result.aiResult + + # CLEAN MODE: Skip container/binary chunks entirely + if chunk_content.startswith("[Skipped ") and "content:" in chunk_content: + continue # Skip container/binary chunks in clean mode + + # CLEAN MODE: Skip empty or whitespace-only chunks + if not chunk_content.strip(): + continue # Skip empty chunks in clean mode + + # CLEAN MODE: No chunk metadata + doc_content += chunk_content + else: + # Handle error chunks silently in clean mode + continue + + merged_documents.append(doc_header + doc_content) + + # Join all documents + final_result = "\n\n".join(merged_documents) + + return final_result.strip() + + def _mergeChunkResultsJson( + self, + chunkResults: List[ChunkResult], + options: Optional[AiCallOptions] = None + ) -> Dict[str, Any]: + """Merge chunk results in JSON mode - returns structured JSON document.""" + import json + + if not chunkResults: + return {"metadata": {"title": "Empty Document"}, "sections": 
[]} + + # Group chunk results by document + results_by_document = {} + for chunk_result in chunkResults: + doc_id = chunk_result.documentId + if doc_id not in results_by_document: + results_by_document[doc_id] = [] + results_by_document[doc_id].append(chunk_result) + + # Sort chunks within each document by chunk index + for doc_id in results_by_document: + results_by_document[doc_id].sort(key=lambda x: x.chunkIndex) + + # Merge JSON results for each document + all_documents = [] + all_sections = [] + document_titles = [] + combined_metadata = {"title": "Merged Document", "splitStrategy": "by_section"} + + for doc_id, doc_chunks in results_by_document.items(): + # Process each chunk's JSON result + for chunk_result in doc_chunks: + chunk_metadata = chunk_result.metadata + if chunk_metadata.get("success", False): + try: + # Parse JSON from AI result + chunk_json = json.loads(chunk_result.aiResult) + + # Check if this is a multi-file response (has "documents" key) + if isinstance(chunk_json, dict) and "documents" in chunk_json: + # This is a multi-file response - merge all documents + logger.debug(f"Processing multi-file response from chunk {chunk_result.chunkIndex} with {len(chunk_json['documents'])} documents") + + # Add all documents from this chunk + for doc in chunk_json["documents"]: + # Add chunk context to document + doc["metadata"] = doc.get("metadata", {}) + doc["metadata"]["source_chunk"] = chunk_result.chunkIndex + doc["metadata"]["source_document"] = doc_id + all_documents.append(doc) + + # Update combined metadata + if "metadata" in chunk_json: + combined_metadata.update(chunk_json["metadata"]) + + # Extract sections from single-file response (fallback) + elif isinstance(chunk_json, dict) and "sections" in chunk_json: + for section in chunk_json["sections"]: + # Add document context to section + section["metadata"] = section.get("metadata", {}) + section["metadata"]["source_document"] = doc_id + section["metadata"]["chunk_index"] = 
chunk_result.chunkIndex + all_sections.append(section) + + # Extract document title + if isinstance(chunk_json, dict) and "metadata" in chunk_json: + title = chunk_json["metadata"].get("title", "") + if title and title not in document_titles: + document_titles.append(title) + + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse JSON from chunk {chunk_result.chunkIndex}: {str(e)}") + # Create a fallback section for invalid JSON + fallback_section = { + "id": f"error_section_{chunk_result.chunkIndex}", + "title": "Error Section", + "content_type": "paragraph", + "elements": [{ + "text": f"Error parsing chunk {chunk_result.chunkIndex}: {str(e)}" + }], + "order": chunk_result.chunkIndex, + "metadata": { + "source_document": doc_id, + "chunk_index": chunk_result.chunkIndex, + "error": str(e) + } + } + all_sections.append(fallback_section) + else: + # Handle error chunks + error_section = { + "id": f"error_section_{chunk_result.chunkIndex}", + "title": "Error Section", + "content_type": "paragraph", + "elements": [{ + "text": f"Error in chunk {chunk_result.chunkIndex}: {chunk_metadata.get('error', 'Unknown error')}" + }], + "order": chunk_result.chunkIndex, + "metadata": { + "source_document": doc_id, + "chunk_index": chunk_result.chunkIndex, + "error": chunk_metadata.get('error', 'Unknown error') + } + } + all_sections.append(error_section) + + # Sort sections by order + all_sections.sort(key=lambda x: x.get("order", 0)) + + # If we have merged documents from multi-file responses, return them + if all_documents: + logger.info(f"Merged {len(all_documents)} documents from {len(chunkResults)} chunks") + return { + "metadata": combined_metadata, + "documents": all_documents + } + + # Otherwise, create merged document with sections (single-file fallback) + merged_document = { + "metadata": { + "title": document_titles[0] if document_titles else "Merged Document", + "source_documents": list(results_by_document.keys()), + "extraction_method": 
"ai_json_extraction", + "version": "1.0" + }, + "sections": all_sections, + "summary": f"Merged document from {len(results_by_document)} source documents", + "tags": ["merged", "ai_generated"] + } + + logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents (JSON mode)") + return merged_document + + def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]: + """ + Get model capabilities for content processing, including appropriate size limits for chunking. + """ + # Estimate total content size + prompt_size = len(prompt.encode('utf-8')) + document_size = 0 + if documents: + # Rough estimate of document content size + for doc in documents: + document_size += doc.fileSize or 0 + + total_size = prompt_size + document_size + + # Use AiObjects to select the best model for this content size + # We'll simulate the model selection by checking available models + from modules.interfaces.interfaceAiObjects import aiModels + + # Find the best model for this content size and operation + best_model = None + best_context_length = 0 + + for model_name, model_info in aiModels.items(): + context_length = model_info.get("contextLength", 0) + + # Skip models with no context length or too small for content + if context_length == 0: + continue + + # Check if model supports the operation type + capabilities = model_info.get("capabilities", []) + if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities: + continue + elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities: + continue + elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities: + continue + elif "text_generation" not in capabilities: + continue + + # Prefer models that can handle the content without chunking, but allow chunking if needed + if context_length >= total_size * 0.8: # 80% of 
content size + if context_length > best_context_length: + best_model = model_info + best_context_length = context_length + elif best_model is None: # Fallback to largest available model + if context_length > best_context_length: + best_model = model_info + best_context_length = context_length + + # Fallback to a reasonable default if no model found + if best_model is None: + best_model = { + "contextLength": 128000, # GPT-4o default + "llmName": "gpt-4o" + } + + # Calculate appropriate sizes + # Convert tokens to bytes (rough estimate: 1 token β‰ˆ 4 characters) + context_length_bytes = int(best_model["contextLength"] * 4) + max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length + text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks + image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks + + logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}") + logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes") + logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes") + + return { + "maxContextBytes": max_context_bytes, + "textChunkSize": text_chunk_size, + "imageChunkSize": image_chunk_size + } diff --git a/modules/services/serviceAi/subUtilities.py b/modules/services/serviceAi/subUtilities.py new file mode 100644 index 00000000..0f5bcc4d --- /dev/null +++ b/modules/services/serviceAi/subUtilities.py @@ -0,0 +1,316 @@ +import logging +from typing import Dict, Any, List, Optional, Tuple, Union +from modules.datamodels.datamodelAi import ModelCapabilities, AiCallOptions + +logger = logging.getLogger(__name__) + + +class SubUtilities: + """Utility functions for text processing, debugging, and helper operations.""" + + def __init__(self, services): + """Initialize utilities service. 
+ + Args: + services: Service center instance for accessing other services + """ + self.services = services + + def _writeTraceLog(self, contextText: str, data: Any) -> None: + """Write raw data to the central trace log file without truncation.""" + try: + import os + import json + from datetime import datetime, UTC + # Only write if logger is in debug mode + if logger.level > logging.DEBUG: + return + # Get log directory from configuration via service center if possible + logDir = None + try: + logDir = self.services.utils.configGet("APP_LOGGING_LOG_DIR", "./") + except Exception: + pass + if not logDir: + logDir = "./" + if not os.path.isabs(logDir): + # Make it relative to gateway directory + gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + logDir = os.path.join(gatewayDir, logDir) + os.makedirs(logDir, exist_ok=True) + traceFile = os.path.join(logDir, "log_trace.log") + timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + traceEntry = f"[{timestamp}] {contextText}\n" + ("=" * 80) + "\n" + if data is None: + traceEntry += "No data provided\n" + else: + # Prefer exact text; if dict/list, pretty print JSON + try: + if isinstance(data, (dict, list)): + traceEntry += f"JSON Data:\n{json.dumps(data, indent=2, ensure_ascii=False)}\n" + else: + text = str(data) + traceEntry += f"Text Data:\n{text}\n" + except Exception: + traceEntry += f"Data (fallback): {str(data)}\n" + traceEntry += ("=" * 80) + "\n\n" + with open(traceFile, "a", encoding="utf-8") as f: + f.write(traceEntry) + except Exception: + # Swallow to avoid recursive logging issues + pass + + def _writeAiResponseDebug(self, label: str, content: str, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None: + """Persist raw AI response parts for debugging under test-chat/ai - only if debug enabled.""" + try: + # Check if debug logging is enabled + debug_enabled = 
self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if not debug_enabled: + return + + import os + from datetime import datetime, UTC + # Base dir: gateway/test-chat/ai (go up 4 levels from this file) + # .../gateway/modules/services/serviceAi/subUtilities.py -> up to gateway root + gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + outDir = os.path.join(gatewayDir, 'test-chat', 'ai') + os.makedirs(outDir, exist_ok=True) + ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3] + suffix = [] + if partIndex is not None: + suffix.append(f"part{partIndex}") + if continuation is not None: + suffix.append(f"cont_{str(continuation).lower()}") + if modelName: + safeModel = ''.join(c if c.isalnum() or c in ('-', '_') else '-' for c in modelName) + suffix.append(safeModel) + suffixStr = ('_' + '_'.join(suffix)) if suffix else '' + fname = f"{ts}_{label}{suffixStr}.txt" + fpath = os.path.join(outDir, fname) + with open(fpath, 'w', encoding='utf-8') as f: + f.write(content or '') + except Exception: + # Do not raise; best-effort debug write + pass + + def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool: + """ + Check if text exceeds model token limit with safety margin. + """ + # Simple character-based estimation (4 chars per token) + estimated_tokens = len(text) // 4 + max_tokens = int(model.maxTokens * (1 - safety_margin)) + return estimated_tokens > max_tokens + + def _reduceText(self, text: str, reduction_factor: float) -> str: + """ + Reduce text size by the specified factor. + """ + if reduction_factor >= 1.0: + return text + + target_length = int(len(text) * reduction_factor) + return text[:target_length] + "... [reduced]" + + def _extractTextFromContentParts(self, extracted_content) -> str: + """ + Extract text content from ExtractionService ContentPart objects. 
+ """ + if not extracted_content or not hasattr(extracted_content, 'parts'): + return "" + + text_parts = [] + for part in extracted_content.parts: + if hasattr(part, 'typeGroup') and part.typeGroup in ['text', 'table', 'structure']: + if hasattr(part, 'data') and part.data: + text_parts.append(part.data) + + return "\n\n".join(text_parts) + + def _buildPromptWithPlaceholders(self, prompt: str, placeholders: Optional[Dict[str, str]]) -> str: + """ + Build full prompt by replacing placeholders with their content. + Uses the new {{KEY:placeholder}} format. + """ + if not placeholders: + return prompt + + full_prompt = prompt + for placeholder, content in placeholders.items(): + # Replace both old format {{placeholder}} and new format {{KEY:placeholder}} + full_prompt = full_prompt.replace(f"{{{{{placeholder}}}}}", content) + full_prompt = full_prompt.replace(f"{{{{KEY:{placeholder}}}}}", content) + + return full_prompt + + def _reducePlanningPrompt( + self, + full_prompt: str, + placeholders: Optional[Dict[str, str]], + model: ModelCapabilities, + options: AiCallOptions + ) -> str: + """ + Reduce planning prompt size by summarizing placeholders while preserving prompt structure. + """ + if not placeholders: + return self._reduceText(full_prompt, 0.7) + + # Reduce placeholders while preserving prompt + reduced_placeholders = {} + for placeholder, content in placeholders.items(): + if len(content) > 1000: # Only reduce long content + reduction_factor = 0.7 + reduced_content = self._reduceText(content, reduction_factor) + reduced_placeholders[placeholder] = reduced_content + else: + reduced_placeholders[placeholder] = content + + return self._buildPromptWithPlaceholders(full_prompt, reduced_placeholders) + + def _reduceTextPrompt( + self, + prompt: str, + context: str, + model: ModelCapabilities, + options: AiCallOptions + ) -> str: + """ + Reduce text prompt size using typeGroup-aware chunking and merging. 
+ """ + max_size = int(model.maxTokens * (1 - options.safetyMargin)) + + if options.compressPrompt: + # Reduce both prompt and context + target_size = max_size + current_size = len(prompt) + len(context) + reduction_factor = (target_size * 0.7) / current_size + + if reduction_factor < 1.0: + prompt = self._reduceText(prompt, reduction_factor) + context = self._reduceText(context, reduction_factor) + else: + # Only reduce context, preserve prompt integrity + max_context_size = max_size - len(prompt) + if len(context) > max_context_size: + reduction_factor = max_context_size / len(context) + context = self._reduceText(context, reduction_factor) + + return prompt + "\n\n" + context if context else prompt + + async def _compressContent(self, content: str, targetSize: int, contentType: str) -> str: + """Compress content to target size.""" + if len(content.encode("utf-8")) <= targetSize: + return content + + try: + compressionPrompt = f""" + Komprimiere den folgenden {contentType} auf maximal {targetSize} Zeichen, + behalte aber alle wichtigen Informationen bei: + + {content} + + Gib nur den komprimierten Inhalt zurΓΌck, ohne zusΓ€tzliche ErklΓ€rungen. + """ + + # Service must not call connectors directly; use simple truncation fallback here + data = content.encode("utf-8") + return data[:targetSize].decode("utf-8", errors="ignore") + "... [truncated]" + except Exception as e: + logger.warning(f"AI compression failed, using truncation: {str(e)}") + return content[:targetSize] + "... [truncated]" + + def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List], options: AiCallOptions) -> Dict[str, int]: + """ + Get model capabilities for content processing, including appropriate size limits for chunking. 
+ """ + # Estimate total content size + prompt_size = len(prompt.encode('utf-8')) + document_size = 0 + if documents: + # Rough estimate of document content size + for doc in documents: + document_size += getattr(doc, 'fileSize', 0) or 0 + + total_size = prompt_size + document_size + + # Use AiObjects to select the best model for this content size + # We'll simulate the model selection by checking available models + from modules.interfaces.interfaceAiObjects import aiModels + + # Find the best model for this content size and operation + best_model = None + best_context_length = 0 + + for model_name, model_info in aiModels.items(): + context_length = model_info.get("contextLength", 0) + + # Skip models with no context length or too small for content + if context_length == 0: + continue + + # Check if model supports the operation type + capabilities = model_info.get("capabilities", []) + from modules.datamodels.datamodelAi import OperationType + if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities: + continue + elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities: + continue + elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities: + continue + elif "text_generation" not in capabilities: + continue + + # Prefer models that can handle the content without chunking, but allow chunking if needed + if context_length >= total_size * 0.8: # 80% of content size + if context_length > best_context_length: + best_model = model_info + best_context_length = context_length + elif best_model is None: # Fallback to largest available model + if context_length > best_context_length: + best_model = model_info + best_context_length = context_length + + # Fallback to a reasonable default if no model found + if best_model is None: + best_model = { + "contextLength": 128000, # GPT-4o default + "llmName": "gpt-4o" + } + + # Calculate appropriate sizes + # 
Convert tokens to bytes (rough estimate: 1 token β‰ˆ 4 characters) + context_length_bytes = int(best_model["contextLength"] * 4) + max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length + text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks + image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks + + logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}") + logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes") + logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes") + + return { + "maxContextBytes": max_context_bytes, + "textChunkSize": text_chunk_size, + "imageChunkSize": image_chunk_size + } + + def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]: + """ + Get models capable of handling the specific operation with capability filtering. 
+ """ + # Use the actual AI objects model selection instead of hardcoded default + if hasattr(self, 'aiObjects') and self.aiObjects: + # Let AiObjects handle the model selection + return [] + else: + # Fallback to default model if AiObjects not available + default_model = ModelCapabilities( + name="default", + maxTokens=4000, + capabilities=["text", "reasoning"] if operation_type == "planning" else ["text"], + costPerToken=0.001, + processingTime=1.0, + isAvailable=True + ) + return [default_model] diff --git a/modules/services/serviceAi/subWebResearch.py b/modules/services/serviceAi/subWebResearch.py new file mode 100644 index 00000000..953324aa --- /dev/null +++ b/modules/services/serviceAi/subWebResearch.py @@ -0,0 +1,384 @@ +import logging +from typing import Dict, Any, List, Optional, Tuple, Union +from modules.datamodels.datamodelWeb import ( + WebResearchRequest, + WebResearchActionResult, + WebResearchDocumentData, + WebResearchActionDocument, + WebSearchResultItem, +) +from modules.interfaces.interfaceAiObjects import AiObjects +from modules.shared.configuration import APP_CONFIG + +logger = logging.getLogger(__name__) + + +class SubWebResearch: + """Web research operations including search, crawling, and analysis.""" + + def __init__(self, services, aiObjects): + """Initialize web research service. 
+ + Args: + services: Service center instance for accessing other services + aiObjects: Initialized AiObjects instance + """ + self.services = services + self.aiObjects = aiObjects + + async def webResearch(self, request: WebResearchRequest) -> WebResearchActionResult: + """Perform web research using interface functions.""" + try: + logger.info(f"WEB RESEARCH STARTED") + logger.info(f"User Query: {request.user_prompt}") + logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}") + + # Global URL index to track all processed URLs across the entire research session + global_processed_urls = set() + + # Step 1: Find relevant websites - either provided URLs or AI-determined main URLs + logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===") + + if request.urls: + # Use provided URLs as initial main URLs + websites = request.urls + logger.info(f"Using provided URLs ({len(websites)}):") + for i, url in enumerate(websites, 1): + logger.info(f" {i}. {url}") + else: + # Use AI to determine main URLs based on user's intention + logger.info(f"AI analyzing user intent: '{request.user_prompt}'") + + # Use AI to generate optimized Tavily search query and search parameters + query_optimizer_prompt = f"""You are a search query optimizer. + + USER QUERY: {request.user_prompt} + + Your task: Create a search query and parameters for the USER QUERY given. + + RULES: + 1. The search query MUST be related to the user query above + 2. Extract key terms from the user query + 3. Determine appropriate country/language based on the query context + 4. 
Keep search query short (2-6 words) + + Return ONLY this JSON format: + {{ + "user_prompt": "search query based on user query above", + "country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)", + "language": "language_code_or_null", + "topic": "general|news|academic_or_null", + "time_range": "d|w|m|y_or_null", + "selection_strategy": "single|multiple|specific_page", + "selection_criteria": "what URLs to prioritize", + "expected_url_patterns": ["pattern1", "pattern2"], + "estimated_result_count": number + }}""" + + # Get AI response for query optimization + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions + ai_request = AiCallRequest( + prompt=query_optimizer_prompt, + options=AiCallOptions() + ) + ai_response_obj = await self.aiObjects.call(ai_request) + ai_response = ai_response_obj.content + logger.debug(f"AI query optimizer response: {ai_response}") + + # Parse AI response to extract search query + import json + try: + # Clean the response by removing markdown code blocks + cleaned_response = ai_response.strip() + if cleaned_response.startswith('```json'): + cleaned_response = cleaned_response[7:] # Remove ```json + if cleaned_response.endswith('```'): + cleaned_response = cleaned_response[:-3] # Remove ``` + cleaned_response = cleaned_response.strip() + + query_data = json.loads(cleaned_response) + search_query = query_data.get("user_prompt", request.user_prompt) + ai_country = query_data.get("country") + ai_language = query_data.get("language") + ai_topic = query_data.get("topic") + ai_time_range = query_data.get("time_range") + selection_strategy = query_data.get("selection_strategy", "multiple") + selection_criteria = query_data.get("selection_criteria", "relevant URLs") + expected_patterns = query_data.get("expected_url_patterns", []) + estimated_count = query_data.get("estimated_result_count", request.max_results) + + logger.info(f"AI optimized search query: '{search_query}'") + 
logger.info(f"Selection strategy: {selection_strategy}") + logger.info(f"Selection criteria: {selection_criteria}") + logger.info(f"Expected URL patterns: {expected_patterns}") + logger.info(f"Estimated result count: {estimated_count}") + + except json.JSONDecodeError: + logger.warning("Failed to parse AI response as JSON, using original query") + search_query = request.user_prompt + ai_country = None + ai_language = None + ai_topic = None + ai_time_range = None + selection_strategy = "multiple" + + # Perform the web search with AI-determined parameters + search_kwargs = { + "query": search_query, + "max_results": request.max_results, + "search_depth": request.options.search_depth, + "auto_parameters": False # Use explicit parameters + } + + # Add parameters only if they have valid values + def _normalizeCountry(c: Optional[str]) -> Optional[str]: + if not c: + return None + s = str(c).strip() + if not s or s.lower() in ['null', 'none', 'undefined']: + return None + # Map common codes to full English names when easy to do without extra deps + mapping = { + 'ch': 'Switzerland', 'che': 'Switzerland', + 'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany', + 'at': 'Austria', 'aut': 'Austria', + 'us': 'United States', 'usa': 'United States', 'uni ted states': 'United States', + 'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom' + } + key = s.lower() + if key in mapping: + return mapping[key] + # If looks like full name, capitalize first letter only (Tavily accepts English names) + return s + + norm_ai_country = _normalizeCountry(ai_country) + norm_req_country = _normalizeCountry(request.options.country) + if norm_ai_country: + search_kwargs["country"] = norm_ai_country + elif norm_req_country: + search_kwargs["country"] = norm_req_country + + if ai_language and ai_language not in ['null', '', 'none', 'undefined']: + search_kwargs["language"] = ai_language + elif request.options.language and request.options.language not in ['null', '', 'none', 
'undefined']: + search_kwargs["language"] = request.options.language + + if ai_topic and ai_topic in ['general', 'news', 'academic']: + search_kwargs["topic"] = ai_topic + elif request.options.topic and request.options.topic in ['general', 'news', 'academic']: + search_kwargs["topic"] = request.options.topic + + if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']: + search_kwargs["time_range"] = ai_time_range + elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']: + search_kwargs["time_range"] = request.options.time_range + + # Constrain by expected domains if provided by AI + try: + include_domains = [] + for p in expected_patterns or []: + if not isinstance(p, str): + continue + # Extract bare domain from pattern or URL + import re + m = re.search(r"(?:https?://)?([^/\s]+)", p.strip()) + if m: + domain = m.group(1).lower() + # strip leading www. + if domain.startswith('www.'): + domain = domain[4:] + include_domains.append(domain) + # Deduplicate + if include_domains: + seen = set() + uniq = [] + for d in include_domains: + if d not in seen: + seen.add(d) + uniq.append(d) + search_kwargs["include_domains"] = uniq + except Exception: + pass + + # Log the parameters being used + logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}") + + search_results = await self.aiObjects.search_websites(**search_kwargs) + + logger.debug(f"Web search returned {len(search_results)} results:") + for i, result in enumerate(search_results, 1): + logger.debug(f" {i}. 
{result.url} - {result.title}") + + # Deduplicate while preserving order + seen = set() + search_urls = [] + for r in search_results: + u = str(r.url) + if u not in seen: + seen.add(u) + search_urls.append(u) + + logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results") + + if not search_urls: + logger.error("No relevant websites found") + return WebResearchActionResult(success=False, error="No relevant websites found") + + # Now use AI to determine the main URLs based on user's intention + logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent") + + # Create a prompt for AI to identify main URLs based on user's intention + ai_prompt = f""" + Select the most relevant URLs from these search results: + + {chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])} + + Return only the URLs that are most relevant for the user's query. + One URL per line. + """ + # Create AI call request + ai_request = AiCallRequest( + prompt=ai_prompt, + options=AiCallOptions() + ) + ai_response_obj = await self.aiObjects.call(ai_request) + ai_response = ai_response_obj.content + logger.debug(f"AI response for main URL selection: {ai_response}") + + # Parse AI response to extract URLs + websites = [] + for line in ai_response.strip().split('\n'): + line = line.strip() + if line and ('http://' in line or 'https://' in line): + # Extract URL from the line + for word in line.split(): + if word.startswith('http://') or word.startswith('https://'): + websites.append(word.rstrip('.,;')) + break + + if not websites: + logger.warning("AI did not identify any main URLs, using first few search results") + websites = search_urls[:3] # Fallback to first 3 search results + + # Deduplicate while preserving order + seen = set() + unique_websites = [] + for url in websites: + if url not in seen: + seen.add(url) + unique_websites.append(url) + + websites = unique_websites + 
logger.info(f"After AI selection deduplication: {len(websites)} unique URLs from {len(websites)} AI-selected URLs") + + logger.info(f"AI selected {len(websites)} main URLs (after deduplication):") + for i, url in enumerate(websites, 1): + logger.info(f" {i}. {url}") + + # Step 2: Smart website selection using AI interface + logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===") + logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'") + + selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt) + + logger.debug(f"AI Response: {aiResponse}") + logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:") + for i, url in enumerate(selectedWebsites, 1): + logger.debug(f" {i}. {url}") + + # Show which were filtered out + filtered_out = [url for url in websites if url not in selectedWebsites] + if filtered_out: + logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:") + for i, url in enumerate(filtered_out, 1): + logger.debug(f" {i}. 
{url}") + + # Step 3+4+5: Recursive crawling with configurable depth + # Get configuration parameters + max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2")) + max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4")) + crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10")) + crawl_timeout_seconds = crawl_timeout_minutes * 60 + + # Use the configured max_depth or the request's pages_search_depth, whichever is smaller + effective_depth = min(max_depth, request.options.pages_search_depth) + + logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===") + logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...") + logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})") + logger.info(f"Max links per domain: {max_links_per_domain}") + logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes") + + # Use recursive crawling with URL index to avoid duplicates + import asyncio + try: + allContent = await asyncio.wait_for( + self.aiObjects.crawlRecursively( + urls=selectedWebsites, + max_depth=effective_depth, + extract_depth=request.options.extract_depth, + max_per_domain=max_links_per_domain, + global_processed_urls=global_processed_urls + ), + timeout=crawl_timeout_seconds + ) + logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled") + except asyncio.TimeoutError: + logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results") + # crawlRecursively now handles timeouts gracefully and returns partial results + # Try to get the partial results that were collected + allContent = {} + + if not allContent: + logger.error("Could not extract content from any websites") + return WebResearchActionResult(success=False, error="Could not extract content from any websites") + + logger.info(f"=== WEB RESEARCH COMPLETED ===") + logger.info(f"Successfully crawled {len(allContent)} 
URLs total") + logger.info(f"Crawl depth: {effective_depth} levels") + + # Create simple result with raw content + sources = [WebSearchResultItem(title=url, url=url) for url in selectedWebsites] + + # Get all additional links (all URLs except main ones) + additional_links = [url for url in allContent.keys() if url not in selectedWebsites] + + # Combine all content into a single result + combinedContent = "" + for url, content in allContent.items(): + combinedContent += f"\n\n=== {url} ===\n{content}\n" + + documentData = WebResearchDocumentData( + user_prompt=request.user_prompt, + websites_analyzed=len(allContent), + additional_links_found=len(additional_links), + analysis_result=combinedContent, # Raw content, no analysis + sources=sources, + additional_links=additional_links, + individual_content=allContent, # Individual URL -> content mapping + debug_info={ + "crawl_depth": effective_depth, + "max_configured_depth": max_depth, + "max_links_per_domain": max_links_per_domain, + "crawl_timeout_minutes": crawl_timeout_minutes, + "total_urls_crawled": len(allContent), + "main_urls": len(selectedWebsites), + "additional_urls": len(additional_links) + } + ) + + document = WebResearchActionDocument( + documentName=f"web_research_{request.user_prompt[:50]}.json", + documentData=documentData, + mimeType="application/json" + ) + + return WebResearchActionResult( + success=True, + documents=[document], + resultLabel="web_research_results" + ) + + except Exception as e: + logger.error(f"Error in web research: {str(e)}") + return WebResearchActionResult(success=False, error=str(e)) diff --git a/modules/services/serviceExtraction/chunking/image_chunker.py b/modules/services/serviceExtraction/chunking/chunkerImage.py similarity index 100% rename from modules/services/serviceExtraction/chunking/image_chunker.py rename to modules/services/serviceExtraction/chunking/chunkerImage.py diff --git a/modules/services/serviceExtraction/chunking/structure_chunker.py 
b/modules/services/serviceExtraction/chunking/chunkerStructure.py similarity index 100% rename from modules/services/serviceExtraction/chunking/structure_chunker.py rename to modules/services/serviceExtraction/chunking/chunkerStructure.py diff --git a/modules/services/serviceExtraction/chunking/table_chunker.py b/modules/services/serviceExtraction/chunking/chunkerTable.py similarity index 100% rename from modules/services/serviceExtraction/chunking/table_chunker.py rename to modules/services/serviceExtraction/chunking/chunkerTable.py diff --git a/modules/services/serviceExtraction/chunking/text_chunker.py b/modules/services/serviceExtraction/chunking/chunkerText.py similarity index 100% rename from modules/services/serviceExtraction/chunking/text_chunker.py rename to modules/services/serviceExtraction/chunking/chunkerText.py diff --git a/modules/services/serviceExtraction/formats/__init__.py b/modules/services/serviceExtraction/extractors/__init__.py similarity index 100% rename from modules/services/serviceExtraction/formats/__init__.py rename to modules/services/serviceExtraction/extractors/__init__.py diff --git a/modules/services/serviceExtraction/formats/binary_extractor.py b/modules/services/serviceExtraction/extractors/extractorBinary.py similarity index 53% rename from modules/services/serviceExtraction/formats/binary_extractor.py rename to modules/services/serviceExtraction/extractors/extractorBinary.py index e6667fda..8a52986c 100644 --- a/modules/services/serviceExtraction/formats/binary_extractor.py +++ b/modules/services/serviceExtraction/extractors/extractorBinary.py @@ -7,8 +7,28 @@ from ..subRegistry import Extractor class BinaryExtractor(Extractor): + """ + Fallback extractor for unsupported file types. + + This extractor handles any file type that doesn't match other extractors. + It encodes the file as base64 and marks it as binary data. 
+ + Supported formats: + - All file types (fallback) + - MIME types: application/octet-stream (default) + - File extensions: All (fallback) + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return True + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions (all).""" + return [] # Accepts all extensions as fallback + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types (all).""" + return [] # Accepts all MIME types as fallback def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: mimeType = context.get("mimeType") or "application/octet-stream" diff --git a/modules/services/serviceExtraction/formats/csv_extractor.py b/modules/services/serviceExtraction/extractors/extractorCsv.py similarity index 65% rename from modules/services/serviceExtraction/formats/csv_extractor.py rename to modules/services/serviceExtraction/extractors/extractorCsv.py index 27233979..fb1c642e 100644 --- a/modules/services/serviceExtraction/formats/csv_extractor.py +++ b/modules/services/serviceExtraction/extractors/extractorCsv.py @@ -6,8 +6,25 @@ from ..subRegistry import Extractor class CsvExtractor(Extractor): + """ + Extractor for CSV files. 
+ + Supported formats: + - MIME types: text/csv + - File extensions: .csv + - Special handling: Treats as table data + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv") + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".csv"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["text/csv"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: fileName = context.get("fileName") diff --git a/modules/services/serviceExtraction/formats/docx_extractor.py b/modules/services/serviceExtraction/extractors/extractorDocx.py similarity index 82% rename from modules/services/serviceExtraction/formats/docx_extractor.py rename to modules/services/serviceExtraction/extractors/extractorDocx.py index 51384ffd..bce9f04b 100644 --- a/modules/services/serviceExtraction/formats/docx_extractor.py +++ b/modules/services/serviceExtraction/extractors/extractorDocx.py @@ -7,6 +7,16 @@ from ..subRegistry import Extractor class DocxExtractor(Extractor): + """ + Extractor for Microsoft Word documents. 
+ + Supported formats: + - MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document + - File extensions: .docx + - Special handling: Extracts paragraphs and tables (converts tables to CSV) + - Dependencies: python-docx + """ + def __init__(self): self._loaded = False self._haveLibs = False @@ -24,6 +34,14 @@ class DocxExtractor(Extractor): def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx") + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".docx"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: self._load() diff --git a/modules/services/serviceExtraction/formats/html_extractor.py b/modules/services/serviceExtraction/extractors/extractorHtml.py similarity index 65% rename from modules/services/serviceExtraction/formats/html_extractor.py rename to modules/services/serviceExtraction/extractors/extractorHtml.py index 09da02f4..730df49c 100644 --- a/modules/services/serviceExtraction/formats/html_extractor.py +++ b/modules/services/serviceExtraction/extractors/extractorHtml.py @@ -7,8 +7,26 @@ from ..subRegistry import Extractor class HtmlExtractor(Extractor): + """ + Extractor for HTML files. 
+ + Supported formats: + - MIME types: text/html + - File extensions: .html, .htm + - Special handling: Uses BeautifulSoup for parsing + - Dependencies: beautifulsoup4 + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm")) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".html", ".htm"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["text/html"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: mimeType = context.get("mimeType") or "text/html" diff --git a/modules/services/serviceExtraction/extractors/extractorImage.py b/modules/services/serviceExtraction/extractors/extractorImage.py new file mode 100644 index 00000000..578e0148 --- /dev/null +++ b/modules/services/serviceExtraction/extractors/extractorImage.py @@ -0,0 +1,75 @@ +from typing import Any, Dict, List +import base64 +import logging + +from ..subUtils import makeId +from modules.datamodels.datamodelExtraction import ContentPart +from ..subRegistry import Extractor + +logger = logging.getLogger(__name__) + + +class ImageExtractor(Extractor): + """ + Extractor for image files. 
+ + Supported formats: + - MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff + - File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff + - Special handling: GIF files are converted to PNG during extraction + """ + + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: + return ((mimeType or "").startswith("image/") or + (fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"))) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp", "image/tiff"] + + def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: + mimeType = context.get("mimeType") or "image/unknown" + fileName = context.get("fileName", "") + + # Convert GIF to PNG during extraction + if mimeType.lower() == "image/gif": + try: + from PIL import Image + import io + + # Open GIF and convert to PNG + with Image.open(io.BytesIO(fileBytes)) as img: + # Convert to RGB (removes animation) + if img.mode in ('RGBA', 'LA', 'P'): + img = img.convert('RGB') + + # Save as PNG in memory + png_buffer = io.BytesIO() + img.save(png_buffer, format='PNG') + png_data = png_buffer.getvalue() + + # Update mimeType and fileBytes + mimeType = "image/png" + fileBytes = png_data + + logger.info(f"GIF converted to PNG during extraction: {fileName}, original={len(fileBytes)} bytes, converted={len(png_data)} bytes") + + except Exception as e: + logger.warning(f"GIF conversion failed during extraction for {fileName}: {str(e)}, using original") + # Keep original GIF data if conversion fails + + return [ContentPart( + id=makeId(), + parentId=None, + label="image", + typeGroup="image", + mimeType=mimeType, + 
data=base64.b64encode(fileBytes).decode("utf-8"), + metadata={"size": len(fileBytes)} + )] + + diff --git a/modules/services/serviceExtraction/formats/json_extractor.py b/modules/services/serviceExtraction/extractors/extractorJson.py similarity index 66% rename from modules/services/serviceExtraction/formats/json_extractor.py rename to modules/services/serviceExtraction/extractors/extractorJson.py index 86eac791..04ab1c10 100644 --- a/modules/services/serviceExtraction/formats/json_extractor.py +++ b/modules/services/serviceExtraction/extractors/extractorJson.py @@ -7,8 +7,25 @@ from ..subRegistry import Extractor class JsonExtractor(Extractor): + """ + Extractor for JSON files. + + Supported formats: + - MIME types: application/json + - File extensions: .json + - Special handling: Validates JSON format, falls back to text if invalid + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "application/json" or (fileName or "").lower().endswith(".json") + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".json"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["application/json"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: mimeType = context.get("mimeType") or "application/json" diff --git a/modules/services/serviceExtraction/formats/pdf_extractor.py b/modules/services/serviceExtraction/extractors/extractorPdf.py similarity index 88% rename from modules/services/serviceExtraction/formats/pdf_extractor.py rename to modules/services/serviceExtraction/extractors/extractorPdf.py index 59c88dc7..4f0290ec 100644 --- a/modules/services/serviceExtraction/formats/pdf_extractor.py +++ b/modules/services/serviceExtraction/extractors/extractorPdf.py @@ -8,6 +8,16 @@ from ..subRegistry import Extractor class PdfExtractor(Extractor): + """ + Extractor for PDF files. 
+ + Supported formats: + - MIME types: application/pdf + - File extensions: .pdf + - Special handling: Extracts text per page and embedded images + - Dependencies: PyPDF2, PyMuPDF (fitz) + """ + def __init__(self): self._loaded = False self._haveLibs = False @@ -26,6 +36,14 @@ class PdfExtractor(Extractor): def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf") + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".pdf"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["application/pdf"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: self._load() diff --git a/modules/services/serviceExtraction/extractors/extractorPptx.py b/modules/services/serviceExtraction/extractors/extractorPptx.py new file mode 100644 index 00000000..1a5a7ff8 --- /dev/null +++ b/modules/services/serviceExtraction/extractors/extractorPptx.py @@ -0,0 +1,225 @@ +import logging +import base64 +from typing import List, Dict, Any, Optional +from modules.datamodels.datamodelExtraction import ContentPart, ContentExtracted +from ..subRegistry import Extractor + +logger = logging.getLogger(__name__) + + +class PptxExtractor(Extractor): + """ + Extractor for PowerPoint files. 
+ + Supported formats: + - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint + - File extensions: .pptx, .ppt + - Special handling: Extracts slide content, tables, and images + - Dependencies: python-pptx + """ + + def __init__(self): + self._loaded = False + self._haveLibs = False + + def _load(self): + if self._loaded: + return + self._loaded = True + try: + global Presentation + from pptx import Presentation + self._haveLibs = True + except Exception: + self._haveLibs = False + + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: + return (mimeType in [ + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.ms-powerpoint" + ]) or (fileName or "").lower().endswith((".pptx", ".ppt")) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".pptx", ".ppt"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return [ + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.ms-powerpoint" + ] + + def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: + """ + Extract content from PowerPoint files. + + Args: + fileBytes: Raw file data as bytes + context: Context dictionary with file information + + Returns: + List of ContentPart objects with extracted content + """ + self._load() + + if not self._haveLibs: + logger.error("python-pptx library not installed. 
Install with: pip install python-pptx") + return [ContentPart( + id="error", + label="PowerPoint Extraction Error", + typeGroup="text", + mimeType="text/plain", + data="Error: python-pptx library not installed", + metadata={"error": True, "error_message": "python-pptx library not installed"} + )] + + try: + import io + + # Load presentation from bytes + presentation = Presentation(io.BytesIO(fileBytes)) + + parts = [] + slide_index = 0 + + # Extract content from each slide + for slide in presentation.slides: + slide_index += 1 + slide_content = [] + + # Extract text from slide + for shape in slide.shapes: + if hasattr(shape, "text") and shape.text.strip(): + slide_content.append(shape.text.strip()) + + # Extract table data + for shape in slide.shapes: + if shape.has_table: + table = shape.table + table_data = [] + for row in table.rows: + row_data = [] + for cell in row.cells: + row_data.append(cell.text.strip()) + table_data.append(row_data) + + if table_data: + # Convert table to markdown format + table_md = self._table_to_markdown(table_data) + slide_content.append(table_md) + + # Extract images + for shape in slide.shapes: + if shape.shape_type == 13: # MSO_SHAPE_TYPE.PICTURE + try: + image = shape.image + image_bytes = image.blob + image_b64 = base64.b64encode(image_bytes).decode('utf-8') + + # Create image part + image_part = ContentPart( + id=f"slide_{slide_index}_image_{len(parts)}", + label=f"Slide {slide_index} Image", + typeGroup="image", + mimeType="image/png", # Default to PNG + data=image_b64, + metadata={ + "slide_number": slide_index, + "shape_type": "image", + "extracted_from": "powerpoint" + } + ) + parts.append(image_part) + except Exception as e: + logger.warning(f"Failed to extract image from slide {slide_index}: {str(e)}") + + # Create slide content part + if slide_content: + slide_text = f"# Slide {slide_index}\n\n" + "\n\n".join(slide_content) + + slide_part = ContentPart( + id=f"slide_{slide_index}", + label=f"Slide {slide_index} Content", 
+ typeGroup="structure", + mimeType="text/plain", + data=slide_text, + metadata={ + "slide_number": slide_index, + "content_type": "slide", + "extracted_from": "powerpoint", + "text_length": len(slide_text) + } + ) + parts.append(slide_part) + + # Create presentation overview + file_name = context.get("fileName", "presentation.pptx") + overview_text = f"# PowerPoint Presentation: {file_name}\n\n" + overview_text += f"**Total Slides:** {len(presentation.slides)}\n\n" + overview_text += f"**Content Parts:** {len(parts)}\n\n" + + # Add slide summaries + for i, slide in enumerate(presentation.slides, 1): + slide_text_parts = [] + for shape in slide.shapes: + if hasattr(shape, "text") and shape.text.strip(): + slide_text_parts.append(shape.text.strip()) + + if slide_text_parts: + overview_text += f"## Slide {i}\n" + overview_text += "\n".join(slide_text_parts[:3]) # First 3 text elements + overview_text += "\n\n" + + # Create overview part + overview_part = ContentPart( + id="presentation_overview", + label="Presentation Overview", + typeGroup="text", + mimeType="text/plain", + data=overview_text, + metadata={ + "content_type": "overview", + "extracted_from": "powerpoint", + "total_slides": len(presentation.slides), + "text_length": len(overview_text) + } + ) + parts.insert(0, overview_part) # Insert at beginning + + return parts + + except Exception as e: + logger.error(f"Error extracting PowerPoint content: {str(e)}") + return [ContentPart( + id="error", + label="PowerPoint Extraction Error", + typeGroup="text", + mimeType="text/plain", + data=f"Error extracting PowerPoint content: {str(e)}", + metadata={"error": True, "error_message": str(e)} + )] + + def _table_to_markdown(self, table_data: List[List[str]]) -> str: + """Convert table data to markdown format.""" + if not table_data: + return "" + + markdown_lines = [] + + # Header row + if table_data: + header = "| " + " | ".join(table_data[0]) + " |" + markdown_lines.append(header) + + # Separator row + separator = 
"| " + " | ".join(["---"] * len(table_data[0])) + " |" + markdown_lines.append(separator) + + # Data rows + for row in table_data[1:]: + data_row = "| " + " | ".join(row) + " |" + markdown_lines.append(data_row) + + return "\n".join(markdown_lines) + diff --git a/modules/services/serviceExtraction/extractors/extractorSql.py b/modules/services/serviceExtraction/extractors/extractorSql.py new file mode 100644 index 00000000..c751d7ca --- /dev/null +++ b/modules/services/serviceExtraction/extractors/extractorSql.py @@ -0,0 +1,56 @@ +from typing import Any, Dict, List + +from modules.datamodels.datamodelExtraction import ContentPart +from ..subUtils import makeId +from ..subRegistry import Extractor + + +class SqlExtractor(Extractor): + """ + Extractor for SQL files. + + Supported formats: + - MIME types: text/x-sql, application/sql + - File extensions: .sql, .ddl, .dml, .dcl, .tcl + - Special handling: Treats as structured text with SQL syntax + """ + + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: + return (mimeType in ("text/x-sql", "application/sql") or + (fileName or "").lower().endswith((".sql", ".ddl", ".dml", ".dcl", ".tcl"))) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".sql", ".ddl", ".dml", ".dcl", ".tcl"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["text/x-sql", "application/sql"] + + def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: + fileName = context.get("fileName") + mimeType = context.get("mimeType") or "text/x-sql" + data = fileBytes.decode("utf-8", errors="replace") + + # Add SQL-specific metadata + metadata = { + "size": len(fileBytes), + "file_type": "sql", + "line_count": len(data.splitlines()), + "has_select": "SELECT" in data.upper(), + "has_insert": "INSERT" in data.upper(), + "has_update": "UPDATE" in data.upper(), + "has_delete": "DELETE" in data.upper(), 
+ "has_create": "CREATE" in data.upper(), + "has_drop": "DROP" in data.upper() + } + + return [ContentPart( + id=makeId(), + parentId=None, + label="main", + typeGroup="structure", + mimeType=mimeType, + data=data, + metadata=metadata + )] diff --git a/modules/services/serviceExtraction/extractors/extractorText.py b/modules/services/serviceExtraction/extractors/extractorText.py new file mode 100644 index 00000000..3cd0ebdf --- /dev/null +++ b/modules/services/serviceExtraction/extractors/extractorText.py @@ -0,0 +1,103 @@ +from typing import Any, Dict, List + +from modules.datamodels.datamodelExtraction import ContentPart +from ..subUtils import makeId +from ..subRegistry import Extractor + + +class TextExtractor(Extractor): + """ + Extractor for plain text files and code files. + + Supported formats: + - MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc. + - File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z + """ + + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: + # Check MIME types + if mimeType and mimeType.startswith("text/"): + return True + + # Check file extensions + if fileName: + ext = fileName.lower() + return ext.endswith(( + # Basic text files + ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod", + # Programming languages + ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx", + ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh", + # Web technologies + ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte", + # Configuration files + ".config", ".ini", ".cfg", ".conf", 
".properties", ".yaml", ".yml", ".toml", ".json", ".xml", + # Scripts and automation + ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com", + # Data files + ".csv", ".tsv", ".tab", ".dat", ".data", + # Documentation + ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z", + # Other text formats + ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes", + ".env", ".env.local", ".env.development", ".env.production", ".env.test", + ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock" + )) + + return False + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [ + # Basic text files + ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod", + # Programming languages + ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx", + ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh", + # Web technologies + ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte", + # Configuration files + ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml", + # Scripts and automation + ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com", + # Data files + ".csv", ".tsv", ".tab", ".dat", ".data", + # Documentation + ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z", + # Other text formats + ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes", + ".env", ".env.local", ".env.development", ".env.production", ".env.test", + ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock" + ] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return [ + "text/plain", "text/markdown", "text/x-python", "text/x-java-source", + 
"text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript", + "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby", + "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin", + "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml", + "text/x-ini", "text/x-config", "text/x-properties", "text/x-log", + "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less", + "text/xml", "text/csv", "text/tab-separated-values", "text/rtf", + "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org", + "application/x-yaml", "application/x-toml", "application/x-ini", + "application/x-config", "application/x-properties", "application/x-log" + ] + + def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: + fileName = context.get("fileName") + mimeType = context.get("mimeType") or "text/plain" + data = fileBytes.decode("utf-8", errors="replace") + return [ContentPart( + id=makeId(), + parentId=None, + label="main", + typeGroup="text", + mimeType=mimeType, + data=data, + metadata={"size": len(fileBytes)} + )] + + diff --git a/modules/services/serviceExtraction/formats/xlsx_extractor.py b/modules/services/serviceExtraction/extractors/extractorXlsx.py similarity index 84% rename from modules/services/serviceExtraction/formats/xlsx_extractor.py rename to modules/services/serviceExtraction/extractors/extractorXlsx.py index ea6396a2..af346419 100644 --- a/modules/services/serviceExtraction/formats/xlsx_extractor.py +++ b/modules/services/serviceExtraction/extractors/extractorXlsx.py @@ -8,6 +8,16 @@ from ..subRegistry import Extractor class XlsxExtractor(Extractor): + """ + Extractor for Microsoft Excel spreadsheets. 
+ + Supported formats: + - MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet + - File extensions: .xlsx, .xlsm + - Special handling: Extracts all sheets as CSV data + - Dependencies: openpyxl + """ + def __init__(self): self._loaded = False self._haveLibs = False @@ -26,6 +36,14 @@ class XlsxExtractor(Extractor): def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm")) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".xlsx", ".xlsm"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: self._load() diff --git a/modules/services/serviceExtraction/formats/xml_extractor.py b/modules/services/serviceExtraction/extractors/extractorXml.py similarity index 66% rename from modules/services/serviceExtraction/formats/xml_extractor.py rename to modules/services/serviceExtraction/extractors/extractorXml.py index 5aabea35..c7d034ad 100644 --- a/modules/services/serviceExtraction/formats/xml_extractor.py +++ b/modules/services/serviceExtraction/extractors/extractorXml.py @@ -7,8 +7,25 @@ from ..subRegistry import Extractor class XmlExtractor(Extractor): + """ + Extractor for XML files. 
+ + Supported formats: + - MIME types: application/xml + - File extensions: .xml, .rss, .atom + - Special handling: Uses ElementTree for parsing + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "application/xml" or (fileName or "").lower().endswith((".xml", ".rss", ".atom")) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".xml", ".rss", ".atom"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["application/xml"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: mimeType = context.get("mimeType") or "application/xml" diff --git a/modules/services/serviceExtraction/formats/image_extractor.py b/modules/services/serviceExtraction/formats/image_extractor.py deleted file mode 100644 index 22327f50..00000000 --- a/modules/services/serviceExtraction/formats/image_extractor.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing import Any, Dict, List -import base64 - -from ..subUtils import makeId -from modules.datamodels.datamodelExtraction import ContentPart -from ..subRegistry import Extractor - - -class ImageExtractor(Extractor): - def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: - return (mimeType or "").startswith("image/") - - def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: - mimeType = context.get("mimeType") or "image/unknown" - return [ContentPart( - id=makeId(), - parentId=None, - label="image", - typeGroup="image", - mimeType=mimeType, - data=base64.b64encode(fileBytes).decode("utf-8"), - metadata={"size": len(fileBytes)} - )] - - diff --git a/modules/services/serviceExtraction/formats/text_extractor.py b/modules/services/serviceExtraction/formats/text_extractor.py deleted file mode 100644 index a6d92bc1..00000000 --- a/modules/services/serviceExtraction/formats/text_extractor.py +++ /dev/null @@ -1,26 +0,0 @@ 
-from typing import Any, Dict, List - -from modules.datamodels.datamodelExtraction import ContentPart -from ..subUtils import makeId -from ..subRegistry import Extractor - - -class TextExtractor(Extractor): - def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: - return mimeType in ("text/plain", "text/markdown") - - def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: - fileName = context.get("fileName") - mimeType = context.get("mimeType") or "text/plain" - data = fileBytes.decode("utf-8", errors="replace") - return [ContentPart( - id=makeId(), - parentId=None, - label="main", - typeGroup="text", - mimeType=mimeType, - data=data, - metadata={"size": len(fileBytes)} - )] - - diff --git a/modules/services/serviceExtraction/mainServiceExtraction.py b/modules/services/serviceExtraction/mainServiceExtraction.py index 6d313463..7608cc1b 100644 --- a/modules/services/serviceExtraction/mainServiceExtraction.py +++ b/modules/services/serviceExtraction/mainServiceExtraction.py @@ -67,10 +67,12 @@ class ExtractionService: if part.metadata: logger.debug(f" Metadata: {part.metadata}") - # Attach document id to parts if missing + # Attach document id and MIME type to parts if missing for p in ec.parts: if "documentId" not in p.metadata: p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4()) + if "documentMimeType" not in p.metadata: + p.metadata["documentMimeType"] = documentData["mimeType"] # Log chunking information chunked_parts = [p for p in ec.parts if p.metadata.get("chunk", False)] diff --git a/modules/services/serviceExtraction/merging/default_merger.py b/modules/services/serviceExtraction/merging/mergerDefault.py similarity index 100% rename from modules/services/serviceExtraction/merging/default_merger.py rename to modules/services/serviceExtraction/merging/mergerDefault.py diff --git a/modules/services/serviceExtraction/merging/table_merger.py b/modules/services/serviceExtraction/merging/mergerTable.py 
similarity index 100% rename from modules/services/serviceExtraction/merging/table_merger.py rename to modules/services/serviceExtraction/merging/mergerTable.py diff --git a/modules/services/serviceExtraction/merging/text_merger.py b/modules/services/serviceExtraction/merging/mergerText.py similarity index 100% rename from modules/services/serviceExtraction/merging/text_merger.py rename to modules/services/serviceExtraction/merging/mergerText.py diff --git a/modules/services/serviceExtraction/subMerger.py b/modules/services/serviceExtraction/subMerger.py new file mode 100644 index 00000000..da0bbfcd --- /dev/null +++ b/modules/services/serviceExtraction/subMerger.py @@ -0,0 +1,209 @@ +""" +Intelligent Token-Aware Merger for optimizing AI calls based on LLM token limits. +""" +from typing import List, Dict, Any, Tuple +import logging +from modules.datamodels.datamodelExtraction import ContentPart +from .subUtils import makeId + +logger = logging.getLogger(__name__) + + +class IntelligentTokenAwareMerger: + """ + Intelligent merger that groups chunks based on LLM token limits to minimize AI calls. + + Strategy: + 1. Calculate token count for each chunk + 2. Group chunks to maximize token usage without exceeding limits + 3. Preserve document structure and semantic boundaries + 4. Minimize total number of AI calls + """ + + def __init__(self, model_capabilities: Dict[str, Any]): + self.max_tokens = model_capabilities.get("maxTokens", 4000) + self.safety_margin = model_capabilities.get("safetyMargin", 0.1) + self.effective_max_tokens = int(self.max_tokens * (1 - self.safety_margin)) + self.chars_per_token = model_capabilities.get("charsPerToken", 4) # Rough estimation + + def merge_chunks_intelligently(self, chunks: List[ContentPart], prompt: str = "") -> List[ContentPart]: + """ + Merge chunks intelligently based on token limits. 
+ + Args: + chunks: List of ContentPart chunks to merge + prompt: AI prompt to account for in token calculation + + Returns: + List of optimally merged ContentPart objects + """ + if not chunks: + return chunks + + logger.info(f"🧠 Intelligent merging: {len(chunks)} chunks, max_tokens={self.effective_max_tokens}") + + # Calculate tokens for prompt + prompt_tokens = self._estimate_tokens(prompt) + available_tokens = self.effective_max_tokens - prompt_tokens + + logger.info(f"πŸ“Š Prompt tokens: {prompt_tokens}, Available for content: {available_tokens}") + + # Group chunks by document and type for semantic coherence + grouped_chunks = self._group_chunks_by_document_and_type(chunks) + + merged_parts = [] + + for group_key, group_chunks in grouped_chunks.items(): + logger.info(f"πŸ“ Processing group: {group_key} ({len(group_chunks)} chunks)") + + # Merge chunks within this group optimally + group_merged = self._merge_group_optimally(group_chunks, available_tokens) + merged_parts.extend(group_merged) + + logger.info(f"βœ… Intelligent merging complete: {len(chunks)} β†’ {len(merged_parts)} parts") + return merged_parts + + def _group_chunks_by_document_and_type(self, chunks: List[ContentPart]) -> Dict[str, List[ContentPart]]: + """Group chunks by document and type for semantic coherence.""" + groups = {} + + for chunk in chunks: + # Create group key: document_id + type_group + doc_id = chunk.metadata.get("documentId", "unknown") + type_group = chunk.typeGroup + group_key = f"{doc_id}_{type_group}" + + if group_key not in groups: + groups[group_key] = [] + groups[group_key].append(chunk) + + return groups + + def _merge_group_optimally(self, chunks: List[ContentPart], available_tokens: int) -> List[ContentPart]: + """Merge chunks within a group optimally to minimize AI calls.""" + if not chunks: + return [] + + # Sort chunks by size (smallest first for better packing) + sorted_chunks = sorted(chunks, key=lambda c: self._estimate_tokens(c.data)) + + merged_parts = [] + 
current_group = [] + current_tokens = 0 + + for chunk in sorted_chunks: + chunk_tokens = self._estimate_tokens(chunk.data) + + # Special case: If single chunk is already at max size, process it alone + if chunk_tokens >= available_tokens * 0.9: # 90% of available tokens + # Finalize current group if it exists + if current_group: + merged_part = self._create_merged_part(current_group, current_tokens) + merged_parts.append(merged_part) + current_group = [] + current_tokens = 0 + + # Process large chunk individually + merged_parts.append(chunk) + logger.debug(f"πŸ” Large chunk processed individually: {chunk_tokens} tokens") + continue + + # If adding this chunk would exceed limit, finalize current group + if current_tokens + chunk_tokens > available_tokens and current_group: + merged_part = self._create_merged_part(current_group, current_tokens) + merged_parts.append(merged_part) + current_group = [chunk] + current_tokens = chunk_tokens + else: + current_group.append(chunk) + current_tokens += chunk_tokens + + # Finalize remaining group + if current_group: + merged_part = self._create_merged_part(current_group, current_tokens) + merged_parts.append(merged_part) + + logger.info(f"πŸ“¦ Group merged: {len(chunks)} β†’ {len(merged_parts)} parts") + return merged_parts + + def _create_merged_part(self, chunks: List[ContentPart], total_tokens: int) -> ContentPart: + """Create a merged ContentPart from multiple chunks.""" + if len(chunks) == 1: + return chunks[0] # No need to merge single chunk + + # Combine data with semantic separators + combined_data = self._combine_chunk_data(chunks) + + # Use metadata from first chunk as base + base_chunk = chunks[0] + merged_metadata = base_chunk.metadata.copy() + merged_metadata.update({ + "merged": True, + "originalChunkCount": len(chunks), + "totalTokens": total_tokens, + "originalChunkIds": [c.id for c in chunks], + "size": len(combined_data.encode('utf-8')) + }) + + merged_part = ContentPart( + id=makeId(), + 
parentId=base_chunk.parentId, + label=f"merged_{len(chunks)}_chunks", + typeGroup=base_chunk.typeGroup, + mimeType=base_chunk.mimeType, + data=combined_data, + metadata=merged_metadata + ) + + logger.debug(f"πŸ”— Created merged part: {len(chunks)} chunks, {total_tokens} tokens") + return merged_part + + def _combine_chunk_data(self, chunks: List[ContentPart]) -> str: + """Combine chunk data with appropriate separators.""" + if not chunks: + return "" + + # Use different separators based on content type + if chunks[0].typeGroup == "text": + separator = "\n\n---\n\n" # Clear text separation + elif chunks[0].typeGroup == "table": + separator = "\n\n[TABLE BREAK]\n\n" # Table separation + else: + separator = "\n\n---\n\n" # Default separation + + return separator.join([chunk.data for chunk in chunks]) + + def _estimate_tokens(self, text: str) -> int: + """Estimate token count for text.""" + if not text: + return 0 + return len(text) // self.chars_per_token + + def calculate_optimization_stats(self, original_chunks: List[ContentPart], merged_parts: List[ContentPart]) -> Dict[str, Any]: + """Calculate optimization statistics with detailed analysis.""" + original_calls = len(original_chunks) + optimized_calls = len(merged_parts) + reduction_percent = ((original_calls - optimized_calls) / original_calls * 100) if original_calls > 0 else 0 + + # Analyze chunk sizes + large_chunks = [c for c in original_chunks if self._estimate_tokens(c.data) >= self.effective_max_tokens * 0.9] + small_chunks = [c for c in original_chunks if self._estimate_tokens(c.data) < self.effective_max_tokens * 0.9] + + # Calculate theoretical maximum optimization (if all small chunks could be merged) + theoretical_min_calls = len(large_chunks) + max(1, len(small_chunks) // 3) # Assume 3 small chunks per call + theoretical_reduction = ((original_calls - theoretical_min_calls) / original_calls * 100) if original_calls > 0 else 0 + + return { + "original_ai_calls": original_calls, + "optimized_ai_calls": 
optimized_calls, + "reduction_percent": round(reduction_percent, 1), + "cost_savings": f"{reduction_percent:.1f}%", + "efficiency_gain": f"{original_calls / optimized_calls:.1f}x" if optimized_calls > 0 else "∞", + "analysis": { + "large_chunks": len(large_chunks), + "small_chunks": len(small_chunks), + "theoretical_min_calls": theoretical_min_calls, + "theoretical_reduction": round(theoretical_reduction, 1), + "optimization_potential": "high" if reduction_percent > 50 else "moderate" if reduction_percent > 20 else "low" + } + } diff --git a/modules/services/serviceExtraction/subPipeline.py b/modules/services/serviceExtraction/subPipeline.py index fd7eb20c..9b18ea88 100644 --- a/modules/services/serviceExtraction/subPipeline.py +++ b/modules/services/serviceExtraction/subPipeline.py @@ -3,11 +3,13 @@ import logging import os from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart +from modules.shared.configuration import APP_CONFIG from .subUtils import makeId from .subRegistry import ExtractorRegistry, ChunkerRegistry -from .merging.text_merger import TextMerger -from .merging.table_merger import TableMerger -from .merging.default_merger import DefaultMerger +from .merging.mergerText import TextMerger +from .merging.mergerTable import TableMerger +from .merging.mergerDefault import DefaultMerger +from .subMerger import IntelligentTokenAwareMerger logger = logging.getLogger(__name__) @@ -84,46 +86,55 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker chunk_parts = [p for p in parts if p.metadata.get("chunk", False)] logger.debug(f"runExtraction: Preserving {len(chunk_parts)} chunks from merging") + logger.debug(f"runExtraction - non_chunk_parts: {len(non_chunk_parts)}, chunk_parts: {len(chunk_parts)}") - if non_chunk_parts: + # Apply intelligent merging for small text parts + if non_chunk_parts: + # Count text parts + text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"] + if len(text_parts) > 
5: # If we have many small text parts, merge them + logger.info(f"πŸ”§ Merging {len(text_parts)} small text parts for efficiency") non_chunk_parts = _mergeParts(non_chunk_parts, mergeStrategy) # Combine non-chunk parts with chunk parts (chunks stay separate) parts = non_chunk_parts + chunk_parts logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})") - # DEBUG: dump parts and chunks to files TODO TO REMOVE + logger.debug(f"runExtraction - Final parts: {len(parts)} (chunks: {len(chunk_parts)})") + # DEBUG: dump parts and chunks to files - only if debug enabled try: - base_dir = "./test-chat/ai" - os.makedirs(base_dir, exist_ok=True) - - # Generate timestamp for consistent naming - from datetime import datetime, UTC - ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3] - - # Write a summary file - summary_lines: List[str] = [f"fileName: {fileName}", f"mimeType: {mimeType}", f"totalParts: {len(parts)}"] - text_index = 0 - for idx, part in enumerate(parts): - is_texty = part.typeGroup in ("text", "table", "structure") - size = int(part.metadata.get("size", 0) or 0) - is_chunk = bool(part.metadata.get("chunk", False)) - summary_lines.append( - f"part[{idx}]: typeGroup={part.typeGroup}, label={part.label}, size={size}, chunk={is_chunk}" - ) - if is_texty and getattr(part, "data", None): - text_index += 1 - fname = f"{ts}_extract_{fileName}_part_{idx:03d}_{'chunk' if is_chunk else 'full'}_{text_index:03d}.txt" - fpath = os.path.join(base_dir, fname) - with open(fpath, "w", encoding="utf-8") as f: - f.write(f"# typeGroup: {part.typeGroup}\n# label: {part.label}\n# chunk: {is_chunk}\n# size: {size}\n\n") - f.write(str(part.data)) - - # Write summary file - summary_fname = f"{ts}_extract_{fileName}_summary.txt" - summary_fpath = os.path.join(base_dir, summary_fname) - with open(summary_fpath, "w", encoding="utf-8") as f: - f.write("\n".join(summary_lines)) + debug_enabled = APP_CONFIG.get("APP_DEBUG_CHAT_WORKFLOW_ENABLED", 
False) + if debug_enabled: + base_dir = "./test-chat/ai" + os.makedirs(base_dir, exist_ok=True) + + # Generate timestamp for consistent naming + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3] + + # Write a summary file + summary_lines: List[str] = [f"fileName: {fileName}", f"mimeType: {mimeType}", f"totalParts: {len(parts)}"] + text_index = 0 + for idx, part in enumerate(parts): + is_texty = part.typeGroup in ("text", "table", "structure") + size = int(part.metadata.get("size", 0) or 0) + is_chunk = bool(part.metadata.get("chunk", False)) + summary_lines.append( + f"part[{idx}]: typeGroup={part.typeGroup}, label={part.label}, size={size}, chunk={is_chunk}" + ) + if is_texty and getattr(part, "data", None): + text_index += 1 + fname = f"{ts}_extract_{fileName}_part_{idx:03d}_{'chunk' if is_chunk else 'full'}_{text_index:03d}.txt" + fpath = os.path.join(base_dir, fname) + with open(fpath, "w", encoding="utf-8") as f: + f.write(f"# typeGroup: {part.typeGroup}\n# label: {part.label}\n# chunk: {is_chunk}\n# size: {size}\n\n") + f.write(str(part.data)) + + # Write summary file + summary_fname = f"{ts}_extract_{fileName}_summary.txt" + summary_fpath = os.path.join(base_dir, summary_fname) + with open(summary_fpath, "w", encoding="utf-8") as f: + f.write("\n".join(summary_lines)) except Exception as _e: logger.debug(f"Debug dump skipped: {_e}") @@ -146,13 +157,22 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt kept: List[ContentPart] = [] remaining: List[ContentPart] = [] - for p in parts: + logger.debug(f"Starting poolAndLimit with {len(parts)} parts, maxSize={maxSize}") + + for i, p in enumerate(parts): size = int(p.metadata.get("size", 0) or 0) + # Show first 50 characters of text content for debugging + content_preview = p.data[:50].replace('\n', '\\n') if p.data else "" + logger.debug(f"Part {i}: {p.typeGroup} - {size} bytes - '{content_preview}...' 
(current: {current})") if current + size <= maxSize: kept.append(p) current += size + logger.debug(f"Part {i} kept (total: {current})") else: remaining.append(p) + logger.debug(f"Part {i} moved to remaining") + + logger.debug(f"Kept: {len(kept)}, Remaining: {len(remaining)}") # If we have remaining parts and chunking is allowed, try chunking if remaining and chunkAllowed: @@ -160,12 +180,15 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt logger.debug(f"Remaining parts to chunk: {len(remaining)}") logger.debug(f"Max size limit: {maxSize} bytes") logger.debug(f"Current size used: {current} bytes") + logger.debug(f"Chunking {len(remaining)} remaining parts") for p in remaining: - if p.typeGroup in ("text", "table", "structure", "image"): + if p.typeGroup in ("text", "table", "structure", "image", "container", "binary"): logger.debug(f"Chunking {p.typeGroup} part: {len(p.data)} chars") + logger.debug(f"Chunking {p.typeGroup} part with {len(p.data)} chars") chunks = chunkerRegistry.resolve(p.typeGroup).chunk(p, options) logger.debug(f"Created {len(chunks)} chunks") + logger.debug(f"Created {len(chunks)} chunks") chunks_added = 0 for ch in chunks: @@ -197,12 +220,18 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt logger.debug(f"Preserving {len(chunk_parts)} chunks from merging") - if non_chunk_parts: + # Apply intelligent merging for small text parts + if non_chunk_parts: + # Count text parts + text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"] + if len(text_parts) > 5: # If we have many small text parts, merge them + logger.info(f"πŸ”§ Merging {len(text_parts)} small text parts for efficiency") non_chunk_parts = _applyMerging(non_chunk_parts, mergeStrategy) # Combine non-chunk parts with chunk parts (chunks stay separate) kept = non_chunk_parts + chunk_parts + logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})") logger.debug(f"Final parts after 
merging: {len(kept)} (chunks: {len(chunk_parts)})") # Re-check size after merging @@ -211,11 +240,30 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt # Apply size limit to merged parts kept = _applySizeLimit(kept, maxSize) + logger.debug(f"poolAndLimit returning {len(kept)} parts") return kept def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]: - """Apply merging strategy to parts.""" + """Apply merging strategy to parts with intelligent token-aware merging.""" + logger.debug(f"_applyMerging called with {len(parts)} parts") + + # Check if intelligent merging is enabled + if strategy.get("useIntelligentMerging", False): + model_capabilities = strategy.get("modelCapabilities", {}) + subMerger = IntelligentTokenAwareMerger(model_capabilities) + + # Use intelligent merging for all parts + merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", "")) + + # Calculate and log optimization stats + stats = subMerger.calculate_optimization_stats(parts, merged) + logger.info(f"🧠 Intelligent merging stats: {stats}") + logger.debug(f"Intelligent merging: {stats['original_ai_calls']} β†’ {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)") + + return merged + + # Fallback to traditional merging textMerger = TextMerger() tableMerger = TableMerger() defaultMerger = DefaultMerger() @@ -226,18 +274,29 @@ def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[Co structureParts = [p for p in parts if p.typeGroup == "structure"] otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")] + logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}") + merged: List[ContentPart] = [] if textParts: - merged.extend(textMerger.merge(textParts, strategy)) + textMerged = textMerger.merge(textParts, strategy) + logger.debug(f"TextMerger merged 
{len(textParts)} parts into {len(textMerged)} parts") + merged.extend(textMerged) if tableParts: - merged.extend(tableMerger.merge(tableParts, strategy)) + tableMerged = tableMerger.merge(tableParts, strategy) + logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts") + merged.extend(tableMerged) if structureParts: # For now, treat structure like text - merged.extend(textMerger.merge(structureParts, strategy)) + structureMerged = textMerger.merge(structureParts, strategy) + logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts") + merged.extend(structureMerged) if otherParts: - merged.extend(defaultMerger.merge(otherParts, strategy)) + otherMerged = defaultMerger.merge(otherParts, strategy) + logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts") + merged.extend(otherMerged) + logger.debug(f"_applyMerging returning {len(merged)} parts") return merged diff --git a/modules/services/serviceExtraction/subRegistry.py b/modules/services/serviceExtraction/subRegistry.py index 07a978d4..eb2ece4d 100644 --- a/modules/services/serviceExtraction/subRegistry.py +++ b/modules/services/serviceExtraction/subRegistry.py @@ -1,14 +1,37 @@ from typing import Any, Dict, Optional +import logging from modules.datamodels.datamodelExtraction import ContentPart +logger = logging.getLogger(__name__) + class Extractor: + """ + Base class for all document extractors. 
+ + Each extractor should implement: + - detect(): Check if this extractor can handle the given file + - extract(): Extract content from the file + - getSupportedExtensions(): Return supported file extensions + - getSupportedMimeTypes(): Return supported MIME types + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: + """Check if this extractor can handle the given file.""" return False def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]: + """Extract content from the file bytes.""" raise NotImplementedError + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions (including dots).""" + return [] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return [] class Chunker: @@ -20,50 +43,85 @@ class ExtractorRegistry: def __init__(self): self._map: Dict[str, Extractor] = {} self._fallback: Optional[Extractor] = None - # Register built-ins + self._auto_discover_extractors() + + def _auto_discover_extractors(self): + """Auto-discover and register all extractors from the extractors directory.""" try: - from .formats.text_extractor import TextExtractor - from .formats.csv_extractor import CsvExtractor - from .formats.json_extractor import JsonExtractor - from .formats.xml_extractor import XmlExtractor - from .formats.html_extractor import HtmlExtractor - from .formats.pdf_extractor import PdfExtractor - from .formats.docx_extractor import DocxExtractor - from .formats.xlsx_extractor import XlsxExtractor - from .formats.image_extractor import ImageExtractor - from .formats.binary_extractor import BinaryExtractor - self.register("text/plain", TextExtractor()) - self.register("text/markdown", TextExtractor()) - self.register("text/csv", CsvExtractor()) - self.register("application/json", JsonExtractor()) - self.register("application/xml", XmlExtractor()) - self.register("text/html", HtmlExtractor()) - 
self.register("application/pdf", PdfExtractor()) - self.register("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor()) - self.register("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor()) - # images - self.register("image/jpeg", ImageExtractor()) - self.register("image/png", ImageExtractor()) - self.register("image/gif", ImageExtractor()) - # extension fallbacks - self.register("txt", TextExtractor()) - self.register("md", TextExtractor()) - self.register("csv", CsvExtractor()) - self.register("json", JsonExtractor()) - self.register("xml", XmlExtractor()) - self.register("html", HtmlExtractor()) - self.register("htm", HtmlExtractor()) - self.register("pdf", PdfExtractor()) - self.register("docx", DocxExtractor()) - self.register("xlsx", XlsxExtractor()) - self.register("xlsm", XlsxExtractor()) - # fallback - self.setFallback(BinaryExtractor()) - print(f"βœ… ExtractorRegistry: Successfully registered {len(self._map)} extractors") + import os + import importlib + from pathlib import Path + + # Get the extractors directory + current_dir = Path(__file__).parent + extractors_dir = current_dir / "extractors" + + if not extractors_dir.exists(): + logger.error(f"Extractors directory not found: {extractors_dir}") + return + + # Import all extractor modules + extractor_modules = [] + for file_path in extractors_dir.glob("extractor*.py"): + if file_path.name == "__init__.py": + continue + + module_name = file_path.stem + try: + # Import the module + module = importlib.import_module(f".{module_name}", package="modules.services.serviceExtraction.extractors") + + # Find all extractor classes in the module + for attr_name in dir(module): + attr = getattr(module, attr_name) + if (isinstance(attr, type) and + issubclass(attr, Extractor) and + attr != Extractor and + not attr_name.startswith('_')): + + # Create instance and auto-register + extractor_instance = attr() + 
self._auto_register_extractor(extractor_instance) + extractor_modules.append(attr_name) + + except Exception as e: + logger.warning(f"Failed to import {module_name}: {str(e)}") + continue + + # Set fallback extractor + try: + from .extractors.extractorBinary import BinaryExtractor + self.setFallback(BinaryExtractor()) + except Exception as e: + logger.warning(f"Failed to set fallback extractor: {str(e)}") + + logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}") + logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}") + except Exception as e: - print(f"❌ ExtractorRegistry: Failed to register extractors: {str(e)}") + logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}") import traceback traceback.print_exc() + + def _auto_register_extractor(self, extractor: Extractor): + """Auto-register an extractor based on its declared supported formats.""" + try: + # Register MIME types + mime_types = extractor.getSupportedMimeTypes() + for mime_type in mime_types: + self.register(mime_type, extractor) + logger.debug(f"Registered MIME type: {mime_type} β†’ {extractor.__class__.__name__}") + + # Register file extensions + extensions = extractor.getSupportedExtensions() + for ext in extensions: + # Remove leading dot for registry key + ext_key = ext.lstrip('.') + self.register(ext_key, extractor) + logger.debug(f"Registered extension: .{ext_key} β†’ {extractor.__class__.__name__}") + + except Exception as e: + logger.error(f"Failed to auto-register {extractor.__class__.__name__}: {str(e)}") def register(self, key: str, extractor: Extractor): self._map[key] = extractor @@ -80,6 +138,43 @@ class ExtractorRegistry: if ext in self._map: return self._map[ext] return self._fallback + + def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]: + """ + Get all supported formats from all registered extractors. 
+ + Returns: + Dictionary with format information: + { + "extensions": { + "extractor_name": [".ext1", ".ext2", ...] + }, + "mime_types": { + "extractor_name": ["mime/type1", "mime/type2", ...] + } + } + """ + formats = {"extensions": {}, "mime_types": {}} + + # Get formats from registered extractors + for key, extractor in self._map.items(): + if hasattr(extractor, 'getSupportedExtensions'): + extensions = extractor.getSupportedExtensions() + if extensions: + formats["extensions"][key] = extensions + + if hasattr(extractor, 'getSupportedMimeTypes'): + mime_types = extractor.getSupportedMimeTypes() + if mime_types: + formats["mime_types"][key] = mime_types + + # Add fallback extractor info + if self._fallback and hasattr(self._fallback, 'getSupportedExtensions'): + formats["extensions"]["fallback"] = self._fallback.getSupportedExtensions() + if self._fallback and hasattr(self._fallback, 'getSupportedMimeTypes'): + formats["mime_types"]["fallback"] = self._fallback.getSupportedMimeTypes() + + return formats class ChunkerRegistry: @@ -88,17 +183,19 @@ class ChunkerRegistry: self._noop = Chunker() # Register default chunkers try: - from .chunking.text_chunker import TextChunker - from .chunking.table_chunker import TableChunker - from .chunking.structure_chunker import StructureChunker - # Skip ImageChunker for now to avoid PIL import hang - # from .chunking.image_chunker import ImageChunker + from .chunking.chunkerText import TextChunker + from .chunking.chunkerTable import TableChunker + from .chunking.chunkerStructure import StructureChunker + from .chunking.chunkerImage import ImageChunker self.register("text", TextChunker()) self.register("table", TableChunker()) self.register("structure", StructureChunker()) - # self.register("image", ImageChunker()) + self.register("image", ImageChunker()) + # Use text chunker for container and binary content + self.register("container", TextChunker()) + self.register("binary", TextChunker()) except Exception as e: - print(f"❌ 
ChunkerRegistry: Failed to register chunkers: {str(e)}") + logger.error(f"ChunkerRegistry: Failed to register chunkers: {str(e)}") import traceback traceback.print_exc() diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py index 72301768..8ed6423b 100644 --- a/modules/services/serviceGeneration/mainServiceGeneration.py +++ b/modules/services/serviceGeneration/mainServiceGeneration.py @@ -1,6 +1,7 @@ import logging import uuid -from typing import Any, Dict, List, Optional +import json +from typing import Any, Dict, List, Optional, Union, Tuple from datetime import datetime, UTC import re from modules.shared.timezoneUtils import get_utc_timestamp @@ -18,7 +19,7 @@ logger = logging.getLogger(__name__) class GenerationService: def __init__(self, serviceCenter=None): # Directly use interfaces from the provided service center (no self.service calls) - self.serviceCenter = serviceCenter + self.services = serviceCenter self.interfaceDbComponent = getattr(serviceCenter, 'interfaceDbComponent', None) if serviceCenter else None self.interfaceDbChat = getattr(serviceCenter, 'interfaceDbChat', None) if serviceCenter else None self.workflow = getattr(serviceCenter, 'workflow', None) if serviceCenter else None @@ -296,101 +297,237 @@ class GenerationService: 'workflowId': 'unknown' } - async def renderReport(self, extracted_content: str, output_format: str, title: str) -> tuple[str, str]: + async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]: """ - Render extracted content to the specified output format. + Render extracted JSON content to the specified output format. 
Args: - extracted_content: Content extracted by AI using format-specific prompt - output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx) + extractedContent: Structured JSON document from AI extraction + outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx) title: Report title + userPrompt: User's original prompt for report generation + aiService: AI service instance for generation prompt creation Returns: tuple: (rendered_content, mime_type) """ try: - # DEBUG: dump renderer input to diagnose JSON+HTML mixtures TODO REMOVE + # Validate JSON input + if not isinstance(extractedContent, dict): + raise ValueError("extractedContent must be a JSON dictionary") + + if "sections" not in extractedContent: + raise ValueError("extractedContent must contain 'sections' field") + + # DEBUG: Log renderer input metadata only (no verbose JSON) - only if debug enabled try: - import os - ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") - debug_root = "./test-chat/ai" - debug_dir = os.path.join(debug_root, f"render_input_{ts}") - os.makedirs(debug_dir, exist_ok=True) - with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f: - f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n") - with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f: - f.write(extracted_content or "") + debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if debug_enabled: + import os + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + debug_dir = os.path.join(debug_root, f"render_input_{ts}") + os.makedirs(debug_dir, exist_ok=True) + with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f: + f.write(f"title: {title}\nformat: {outputFormat}\ncontent_type: {type(extractedContent).__name__}\n") + 
f.write(f"content_size: {len(str(extractedContent))} characters\n") + f.write(f"sections_count: {len(extractedContent.get('sections', []))}\n") except Exception: pass # Get the appropriate renderer for the format - renderer = self._getFormatRenderer(output_format) + renderer = self._getFormatRenderer(outputFormat) if not renderer: - raise ValueError(f"Unsupported output format: {output_format}") + raise ValueError(f"Unsupported output format: {outputFormat}") - # Render the content - rendered_content, mime_type = await renderer.render(extracted_content, title) + # Render the JSON content directly (AI generation handled by main service) + renderedContent, mimeType = await renderer.render(extractedContent, title, userPrompt, aiService) # DEBUG: dump rendered output try: import os with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f: - f.write(rendered_content or "") + f.write(renderedContent or "") except Exception: pass - logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters") - return rendered_content, mime_type + logger.info(f"Successfully rendered JSON report to {outputFormat} format: {len(renderedContent)} characters") + return renderedContent, mimeType except Exception as e: - logger.error(f"Error rendering report to {output_format}: {str(e)}") + logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}") raise - def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str: + async def getAdaptiveExtractionPrompt( + self, + outputFormat: str, + userPrompt: str, + title: str, + promptAnalysis: Dict[str, Any], + aiService=None + ) -> str: + """Get adaptive extraction prompt based on AI analysis.""" + from .subPromptBuilder import buildAdaptiveExtractionPrompt + return await buildAdaptiveExtractionPrompt( + outputFormat=outputFormat, + userPrompt=userPrompt, + title=title, + promptAnalysis=promptAnalysis, + aiService=aiService, + 
services=self.services + ) + + async def getGenerationPrompt( + self, + outputFormat: str, + userPrompt: str, + title: str, + aiService=None + ) -> str: + """Get generation prompt for enhancing extracted JSON content.""" + from .subPromptBuilder import buildGenerationPrompt + return await buildGenerationPrompt( + outputFormat=outputFormat, + userPrompt=userPrompt, + title=title, + aiService=aiService, + services=self.services + ) + + async def getGenericExtractionPrompt( + self, + outputFormat: str, + userPrompt: str, + title: str, + aiService=None + ) -> str: + """Get generic extraction prompt that works for both single and multi-file.""" + from .subPromptBuilder import buildGenericExtractionPrompt + return await buildGenericExtractionPrompt( + outputFormat=outputFormat, + userPrompt=userPrompt, + title=title, + aiService=aiService, + services=self.services + ) + + async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str: """ Get the format-specific extraction prompt for AI content extraction. 
Args: - output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx) - user_prompt: User's original prompt for report generation + outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx) + userPrompt: User's original prompt for report generation title: Report title + aiService: AI service instance for intent extraction Returns: str: Format-specific prompt for AI extraction """ try: # Get the appropriate renderer for the format - renderer = self._getFormatRenderer(output_format) + renderer = self._getFormatRenderer(outputFormat) if not renderer: - raise ValueError(f"Unsupported output format: {output_format}") + raise ValueError(f"Unsupported output format: {outputFormat}") # Build centralized prompt with generic rules + format-specific guidelines - from .prompt_builder import buildExtractionPrompt - extraction_prompt = buildExtractionPrompt( - output_format=output_format, + from .subPromptBuilder import buildExtractionPrompt + extractionPrompt = await buildExtractionPrompt( + outputFormat=outputFormat, renderer=renderer, - user_prompt=user_prompt, - title=title + userPrompt=userPrompt, + title=title, + aiService=aiService, + services=self.services ) - logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters") - return extraction_prompt + logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters") + return extractionPrompt except Exception as e: - logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}") + logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}") raise + async def renderAdaptiveReport( + self, + extractedContent: Dict[str, Any], + outputFormat: str, + title: str, + userPrompt: str = None, + aiService=None, + isMultiFile: bool = False + ) -> Union[Tuple[str, str], List[Dict[str, Any]]]: + """Render report adaptively based on content structure.""" + + if isMultiFile and "documents" in 
extractedContent: + return await self._renderMultiFileReport( + extractedContent, outputFormat, title, userPrompt, aiService + ) + else: + return await self._renderSingleFileReport( + extractedContent, outputFormat, title, userPrompt, aiService + ) + + async def _renderMultiFileReport( + self, + extractedContent: Dict[str, Any], + outputFormat: str, + title: str, + userPrompt: str = None, + aiService=None + ) -> List[Dict[str, Any]]: + """Render multiple documents from extracted content.""" + + generated_documents = [] + + for doc_data in extractedContent.get("documents", []): + # Use existing single-file renderer for each document + renderer = self._getFormatRenderer(outputFormat) + if not renderer: + continue + + # Render individual document + rendered_content, mime_type = await renderer.render( + extractedContent={"sections": doc_data["sections"]}, + title=doc_data["title"], + userPrompt=userPrompt, + aiService=aiService + ) + + generated_documents.append({ + "filename": doc_data["filename"], + "content": rendered_content, + "mime_type": mime_type, + "title": doc_data["title"] + }) + + return generated_documents + + async def _renderSingleFileReport( + self, + extractedContent: Dict[str, Any], + outputFormat: str, + title: str, + userPrompt: str = None, + aiService=None + ) -> Tuple[str, str]: + """Render single file report (existing functionality).""" + # Use existing renderReport method + return await self.renderReport( + extractedContent, outputFormat, title, userPrompt, aiService + ) + def _getFormatRenderer(self, output_format: str): """Get the appropriate renderer for the specified format using auto-discovery.""" try: from .renderers.registry import get_renderer - renderer = get_renderer(output_format) + renderer = get_renderer(output_format, services=self.services) if renderer: return renderer # Fallback to text renderer if no specific renderer found logger.warning(f"No renderer found for format {output_format}, falling back to text") - fallback_renderer 
= get_renderer('text') + fallback_renderer = get_renderer('text', services=self.services) if fallback_renderer: return fallback_renderer diff --git a/modules/services/serviceGeneration/prompt_builder.py b/modules/services/serviceGeneration/prompt_builder.py deleted file mode 100644 index 208c4c18..00000000 --- a/modules/services/serviceGeneration/prompt_builder.py +++ /dev/null @@ -1,72 +0,0 @@ -""" -Centralized prompt builder for document generation across formats. - -Builds a robust prompt that: -- Accepts any user intent (no fixed structure assumptions) -- Injects format-specific guidelines from the selected renderer -- Adds a common policy section to always use real data from source docs -- Requires the AI to output a filename header that we can parse and use -""" - -from typing import Protocol - - -class _RendererLike(Protocol): - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: # returns only format-specific guidelines - ... - - -def buildExtractionPrompt( - output_format: str, - renderer: _RendererLike, - user_prompt: str, - title: str -) -> str: - """ - Build the final extraction prompt by combining: - - The raw user prompt (verbatim) - - Generic cross-format instructions (filename header + real-data policy) - - Format-specific guidelines snippet provided by the renderer - - The AI must place a single filename header at the very top: - FILENAME: - followed by a blank line and then ONLY the document content according to the target format. - """ - - format_guidelines = renderer.getExtractionPrompt(user_prompt, title) - - # Generic block appears once for every format - generic_intro = f""" -{user_prompt} - -You are generating a document in {output_format.upper()} format for the title: "{title}". - -Rules: -- The user's intent fully defines the structure. Do not assume a fixed template or headings. -- Use only factual information extracted from the supplied source documents. 
-- Do not invent, hallucinate, or include placeholders (e.g., "lorem ipsum", "TBD"). -- The output must strictly follow the target format and be ready for saving without extra wrapping. -- At the VERY TOP output exactly one line with the filename header: - FILENAME: - - The base name should be short, descriptive, and kebab-case or snake-case without spaces. - - Include the correct extension for the requested format (e.g., .html, .pdf, .docx, .md, .txt, .json, .csv, .xlsx). - - Avoid special characters beyond [a-zA-Z0-9-_]. - - After this header, insert a single blank line and then provide ONLY the document content. - -Common policy: -- Use the actual data from the source documents to create the content. -- Do not generate placeholder text or templates. -- Extract and use the real data provided in the source documents to create meaningful content. -""".strip() - - # Final assembly - final_prompt = ( - generic_intro - + "\n\nFORMAT-SPECIFIC GUIDELINES:\n" - + format_guidelines.strip() - + "\n\nGenerate the complete document content now based on the source documents below:" - ) - - return final_prompt - - diff --git a/modules/services/serviceGeneration/renderers/base_renderer.py b/modules/services/serviceGeneration/renderers/base_renderer.py deleted file mode 100644 index dd91be09..00000000 --- a/modules/services/serviceGeneration/renderers/base_renderer.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Base renderer class for all format renderers. -""" - -from abc import ABC, abstractmethod -from typing import Dict, Any, Tuple, List -import logging - -logger = logging.getLogger(__name__) - -class BaseRenderer(ABC): - """Base class for all format renderers.""" - - def __init__(self): - self.logger = logger - - @classmethod - def get_supported_formats(cls) -> List[str]: - """ - Return list of supported format names for this renderer. - Override this method in subclasses to specify supported formats. 
- """ - return [] - - @classmethod - def get_format_aliases(cls) -> List[str]: - """ - Return list of format aliases for this renderer. - Override this method in subclasses to specify format aliases. - """ - return [] - - @classmethod - def get_priority(cls) -> int: - """ - Return priority for this renderer (higher number = higher priority). - Used when multiple renderers support the same format. - """ - return 0 - - @abstractmethod - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: - """ - Get the format-specific extraction prompt for AI content extraction. - - Args: - user_prompt: User's original prompt for report generation - title: Report title - - Returns: - str: Format-specific prompt for AI extraction - """ - pass - - @abstractmethod - async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: - """ - Render extracted content to the target format. - - Args: - extracted_content: Raw content extracted by AI using format-specific prompt - title: Report title - - Returns: - tuple: (rendered_content, mime_type) - """ - pass - - def _extract_sections(self, report_data: Dict[str, Any]) -> list: - """Extract sections from report data.""" - return report_data.get('sections', []) - - def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]: - """Extract metadata from report data.""" - return report_data.get('metadata', {}) - - def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str: - """Get title from report data or use fallback.""" - return report_data.get('title', fallback_title) - - def _format_timestamp(self, timestamp: str = None) -> str: - """Format timestamp for display.""" - if timestamp: - return timestamp - from datetime import datetime, UTC - return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC") diff --git a/modules/services/serviceGeneration/renderers/csv_renderer.py b/modules/services/serviceGeneration/renderers/csv_renderer.py deleted file mode 100644 index 
9ef6882c..00000000 --- a/modules/services/serviceGeneration/renderers/csv_renderer.py +++ /dev/null @@ -1,64 +0,0 @@ -""" -CSV renderer for report generation. -""" - -from .base_renderer import BaseRenderer -from typing import Dict, Any, Tuple, List -import csv -import io - -class CsvRenderer(BaseRenderer): - """Renders content to CSV format with format-specific extraction.""" - - @classmethod - def get_supported_formats(cls) -> List[str]: - """Return supported CSV formats.""" - return ['csv'] - - @classmethod - def get_format_aliases(cls) -> List[str]: - """Return format aliases.""" - return ['spreadsheet', 'table'] - - @classmethod - def get_priority(cls) -> int: - """Return priority for CSV renderer.""" - return 70 - - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: - """Return only CSV-specific guidelines; global prompt is built centrally.""" - return ( - "CSV FORMAT GUIDELINES:\n" - "- Emit ONLY CSV text without fences or commentary.\n" - "- Include a single header row with clear column names.\n" - "- Quote fields containing commas, quotes, or newlines; escape quotes by doubling them.\n" - "- Use rows to represent items/records derived from sources.\n" - "- Keep cells concise; include units in headers when useful.\n" - "OUTPUT: Return ONLY valid CSV content that can be imported." 
- ) - - async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: - """Render extracted content to CSV format.""" - try: - # The extracted content should already be CSV from the AI - # Just clean it up - csv_content = self._clean_csv_content(extracted_content, title) - - return csv_content, "text/csv" - - except Exception as e: - self.logger.error(f"Error rendering CSV: {str(e)}") - # Return minimal CSV fallback - return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv" - - def _clean_csv_content(self, content: str, title: str) -> str: - """Clean and validate CSV content from AI.""" - content = content.strip() - - # Remove markdown code blocks if present - if content.startswith("```") and content.endswith("```"): - lines = content.split('\n') - if len(lines) > 2: - content = '\n'.join(lines[1:-1]).strip() - - return content diff --git a/modules/services/serviceGeneration/renderers/docx_renderer.py b/modules/services/serviceGeneration/renderers/docx_renderer.py deleted file mode 100644 index 134f00cd..00000000 --- a/modules/services/serviceGeneration/renderers/docx_renderer.py +++ /dev/null @@ -1,249 +0,0 @@ -""" -DOCX renderer for report generation using python-docx. 
-""" - -from .base_renderer import BaseRenderer -from typing import Dict, Any, Tuple, List -import io -import base64 -from datetime import datetime, UTC - -try: - from docx import Document - from docx.shared import Inches, Pt - from docx.enum.text import WD_ALIGN_PARAGRAPH - from docx.enum.table import WD_TABLE_ALIGNMENT - from docx.oxml.shared import OxmlElement, qn - from docx.oxml.ns import nsdecls - from docx.oxml import parse_xml - DOCX_AVAILABLE = True -except ImportError: - DOCX_AVAILABLE = False - -class DocxRenderer(BaseRenderer): - """Renders content to DOCX format using python-docx.""" - - @classmethod - def get_supported_formats(cls) -> List[str]: - """Return supported DOCX formats.""" - return ['docx', 'doc'] - - @classmethod - def get_format_aliases(cls) -> List[str]: - """Return format aliases.""" - return ['word', 'document'] - - @classmethod - def get_priority(cls) -> int: - """Return priority for DOCX renderer.""" - return 115 - - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: - """Return only DOCX-specific guidelines; global prompt is built centrally.""" - return ( - "DOCX FORMAT GUIDELINES:\n" - "- Provide plain text content suitable for Word generation (no markdown/HTML).\n" - "- Use clear section hierarchy; bullet and numbered lists where needed.\n" - "- Include tables as simple pipe-delimited lines if tabular data is needed.\n" - "OUTPUT: Return ONLY the structured plain text to be converted into DOCX." 
- ) - - async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: - """Render extracted content to DOCX format.""" - try: - if not DOCX_AVAILABLE: - # Fallback to HTML if python-docx not available - from .html_renderer import HtmlRenderer - html_renderer = HtmlRenderer() - html_content, _ = await html_renderer.render(extracted_content, title) - return html_content, "text/html" - - # Generate DOCX using python-docx - docx_content = self._generate_docx(extracted_content, title) - - return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - - except Exception as e: - self.logger.error(f"Error rendering DOCX: {str(e)}") - # Return minimal fallback - return f"DOCX Generation Error: {str(e)}", "text/plain" - - def _generate_docx(self, content: str, title: str) -> str: - """Generate DOCX content using python-docx.""" - try: - # Create new document - doc = Document() - - # Set up document styles - self._setup_document_styles(doc) - - # Add title - title_para = doc.add_heading(title, 0) - title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER - - # Add generation date - date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}") - date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER - - # Add page break - doc.add_page_break() - - # Process content - lines = content.split('\n') - current_section = [] - - for line in lines: - line = line.strip() - if not line: - continue - - # Check for ALL CAPS headings (major headings) - if line.isupper() and len(line) > 3 and not line.startswith('-') and not line.startswith('*'): - if current_section: - self._process_section(doc, current_section) - current_section = [] - doc.add_heading(line, level=1) - # Check for Title Case headings (subheadings) - elif line.istitle() and len(line) > 5 and not line.startswith('-') and not line.startswith('*') and not line.startswith(('1.', '2.', '3.', '4.', '5.')): - if current_section: - self._process_section(doc, current_section) - 
current_section = [] - doc.add_heading(line, level=2) - # Check for markdown headings (fallback) - elif line.startswith('# '): - # H1 heading - if current_section: - self._process_section(doc, current_section) - current_section = [] - doc.add_heading(line[2:], level=1) - elif line.startswith('## '): - # H2 heading - if current_section: - self._process_section(doc, current_section) - current_section = [] - doc.add_heading(line[3:], level=2) - elif line.startswith('### '): - # H3 heading - if current_section: - self._process_section(doc, current_section) - current_section = [] - doc.add_heading(line[4:], level=3) - else: - current_section.append(line) - - # Process remaining content - if current_section: - self._process_section(doc, current_section) - - # Save to buffer - buffer = io.BytesIO() - doc.save(buffer) - buffer.seek(0) - - # Convert to base64 - docx_bytes = buffer.getvalue() - docx_base64 = base64.b64encode(docx_bytes).decode('utf-8') - - return docx_base64 - - except Exception as e: - self.logger.error(f"Error generating DOCX: {str(e)}") - raise - - def _setup_document_styles(self, doc): - """Set up document styles.""" - try: - # Set default font - style = doc.styles['Normal'] - font = style.font - font.name = 'Calibri' - font.size = Pt(11) - - # Set heading styles - for i in range(1, 4): - heading_style = doc.styles[f'Heading {i}'] - heading_font = heading_style.font - heading_font.name = 'Calibri' - heading_font.size = Pt(16 - i * 2) - heading_font.bold = True - except Exception as e: - self.logger.warning(f"Could not set up document styles: {str(e)}") - - def _process_section(self, doc, lines: list): - """Process a section of content into DOCX elements.""" - for line in lines: - if not line.strip(): - continue - - # Check for tables (lines with |) - if '|' in line and not line.startswith('|'): - # This might be part of a table, process as table - table_data = self._extract_table_data(lines) - if table_data: - self._add_table(doc, table_data) - return - 
- # Check for lists - if line.startswith('- ') or line.startswith('* '): - # This is a list item - doc.add_paragraph(line[2:], style='List Bullet') - elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')): - # This is a numbered list item - doc.add_paragraph(line[3:], style='List Number') - else: - # Regular paragraph - doc.add_paragraph(line) - - def _extract_table_data(self, lines: list) -> list: - """Extract table data from lines.""" - table_data = [] - in_table = False - - for line in lines: - if '|' in line: - if not in_table: - in_table = True - # Split by | and clean up - cells = [cell.strip() for cell in line.split('|') if cell.strip()] - if cells: - table_data.append(cells) - elif in_table and not line.strip(): - # Empty line, might be end of table - break - - return table_data if len(table_data) > 1 else [] - - def _add_table(self, doc, table_data: list): - """Add a table to the document.""" - try: - if not table_data: - return - - # Create table - table = doc.add_table(rows=len(table_data), cols=len(table_data[0])) - table.alignment = WD_TABLE_ALIGNMENT.CENTER - - # Add data to table - for row_idx, row_data in enumerate(table_data): - for col_idx, cell_data in enumerate(row_data): - if col_idx < len(table.rows[row_idx].cells): - table.rows[row_idx].cells[col_idx].text = cell_data - - # Style the table - self._style_table(table) - - except Exception as e: - self.logger.warning(f"Could not add table: {str(e)}") - - def _style_table(self, table): - """Apply styling to the table.""" - try: - # Style header row - if len(table.rows) > 0: - header_cells = table.rows[0].cells - for cell in header_cells: - for paragraph in cell.paragraphs: - for run in paragraph.runs: - run.bold = True - except Exception as e: - self.logger.warning(f"Could not style table: {str(e)}") \ No newline at end of file diff --git a/modules/services/serviceGeneration/renderers/excel_renderer.py b/modules/services/serviceGeneration/renderers/excel_renderer.py deleted file mode 100644 
index 1472201b..00000000 --- a/modules/services/serviceGeneration/renderers/excel_renderer.py +++ /dev/null @@ -1,210 +0,0 @@ -""" -Excel renderer for report generation using openpyxl. -""" - -from .base_renderer import BaseRenderer -from typing import Dict, Any, Tuple, List -import io -import base64 -from datetime import datetime, UTC - -try: - from openpyxl import Workbook - from openpyxl.styles import Font, PatternFill, Alignment, Border, Side - from openpyxl.utils import get_column_letter - from openpyxl.worksheet.table import Table, TableStyleInfo - OPENPYXL_AVAILABLE = True -except ImportError: - OPENPYXL_AVAILABLE = False - -class ExcelRenderer(BaseRenderer): - """Renders content to Excel format using openpyxl.""" - - @classmethod - def get_supported_formats(cls) -> List[str]: - """Return supported Excel formats.""" - return ['xlsx', 'xls', 'excel'] - - @classmethod - def get_format_aliases(cls) -> List[str]: - """Return format aliases.""" - return ['spreadsheet', 'workbook'] - - @classmethod - def get_priority(cls) -> int: - """Return priority for Excel renderer.""" - return 110 - - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: - """Return only Excel-specific guidelines; global prompt is built centrally.""" - return ( - "EXCEL FORMAT GUIDELINES:\n" - "- Output one or more pipe-delimited tables with a single header row.\n" - "- Let user intent define columns; use clear names and ISO dates.\n" - "- Separate multiple tables by a single blank line.\n" - "- No markdown/HTML/code fences; tables only unless user explicitly asks for notes.\n" - "OUTPUT: Return ONLY pipe-delimited tables suitable for import." 
- ) - - async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: - """Render extracted content to Excel format.""" - try: - if not OPENPYXL_AVAILABLE: - # Fallback to CSV if openpyxl not available - from .csv_renderer import CsvRenderer - csv_renderer = CsvRenderer() - csv_content, _ = await csv_renderer.render(extracted_content, title) - return csv_content, "text/csv" - - # Generate Excel using openpyxl - excel_content = self._generate_excel(extracted_content, title) - - return excel_content, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - - except Exception as e: - self.logger.error(f"Error rendering Excel: {str(e)}") - # Return CSV fallback - return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv" - - def _generate_excel(self, content: str, title: str) -> str: - """Generate Excel content using openpyxl.""" - try: - # Create workbook - wb = Workbook() - - # Remove default sheet - wb.remove(wb.active) - - # Create sheets - summary_sheet = wb.create_sheet("Summary", 0) - data_sheet = wb.create_sheet("Data", 1) - analysis_sheet = wb.create_sheet("Analysis", 2) - - # Add content to sheets - self._populate_summary_sheet(summary_sheet, title) - self._populate_data_sheet(data_sheet, content) - self._populate_analysis_sheet(analysis_sheet, content) - - # Save to buffer - buffer = io.BytesIO() - wb.save(buffer) - buffer.seek(0) - - # Convert to base64 - excel_bytes = buffer.getvalue() - excel_base64 = base64.b64encode(excel_bytes).decode('utf-8') - - return excel_base64 - - except Exception as e: - self.logger.error(f"Error generating Excel: {str(e)}") - raise - - def _populate_summary_sheet(self, sheet, title: str): - """Populate the summary sheet.""" - try: - # Title - sheet['A1'] = title - sheet['A1'].font = Font(size=16, bold=True) - sheet['A1'].alignment = Alignment(horizontal='center') - - # Generation info - sheet['A3'] = "Generated:" - sheet['B3'] = self._format_timestamp() - sheet['A4'] = 
"Status:" - sheet['B4'] = "Generated Successfully" - - # Key metrics placeholder - sheet['A6'] = "Key Metrics:" - sheet['A6'].font = Font(bold=True) - sheet['A7'] = "Total Items:" - sheet['B7'] = "=COUNTA(Data!A:A)-1" # Count non-empty cells in Data sheet - - # Auto-adjust column widths - sheet.column_dimensions['A'].width = 20 - sheet.column_dimensions['B'].width = 30 - - except Exception as e: - self.logger.warning(f"Could not populate summary sheet: {str(e)}") - - def _populate_data_sheet(self, sheet, content: str): - """Populate the data sheet.""" - try: - # Headers - headers = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"] - for col, header in enumerate(headers, 1): - cell = sheet.cell(row=1, column=col, value=header) - cell.font = Font(bold=True) - cell.fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid") - - # Process content - lines = content.split('\n') - row = 2 - - for line in lines: - line = line.strip() - if not line: - continue - - # Check for table data (lines with |) - if '|' in line: - cells = [cell.strip() for cell in line.split('|') if cell.strip()] - for col, cell_data in enumerate(cells[:5], 1): # Limit to 5 columns - sheet.cell(row=row, column=col, value=cell_data) - row += 1 - else: - # Regular content - sheet.cell(row=row, column=1, value=line) - row += 1 - - # Auto-adjust column widths - for col in range(1, 6): - sheet.column_dimensions[get_column_letter(col)].width = 20 - - except Exception as e: - self.logger.warning(f"Could not populate data sheet: {str(e)}") - - def _populate_analysis_sheet(self, sheet, content: str): - """Populate the analysis sheet.""" - try: - # Title - sheet['A1'] = "Analysis & Insights" - sheet['A1'].font = Font(size=14, bold=True) - - # Content analysis - lines = content.split('\n') - row = 3 - - sheet['A3'] = "Content Analysis:" - sheet['A3'].font = Font(bold=True) - row += 1 - - # Count different types of content - table_lines = sum(1 for line in 
lines if '|' in line) - list_lines = sum(1 for line in lines if line.startswith(('- ', '* '))) - text_lines = len(lines) - table_lines - list_lines - - sheet[f'A{row}'] = f"Total Lines: {len(lines)}" - row += 1 - sheet[f'A{row}'] = f"Table Rows: {table_lines}" - row += 1 - sheet[f'A{row}'] = f"List Items: {list_lines}" - row += 1 - sheet[f'A{row}'] = f"Text Lines: {text_lines}" - row += 2 - - # Recommendations - sheet[f'A{row}'] = "Recommendations:" - sheet[f'A{row}'].font = Font(bold=True) - row += 1 - sheet[f'A{row}'] = "1. Review data accuracy" - row += 1 - sheet[f'A{row}'] = "2. Consider additional analysis" - row += 1 - sheet[f'A{row}'] = "3. Update regularly" - - # Auto-adjust column width - sheet.column_dimensions['A'].width = 30 - - except Exception as e: - self.logger.warning(f"Could not populate analysis sheet: {str(e)}") diff --git a/modules/services/serviceGeneration/renderers/html_renderer.py b/modules/services/serviceGeneration/renderers/html_renderer.py deleted file mode 100644 index c2b7e586..00000000 --- a/modules/services/serviceGeneration/renderers/html_renderer.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -HTML renderer for report generation. 
-""" - -from .base_renderer import BaseRenderer -from typing import Dict, Any, Tuple, List - -class HtmlRenderer(BaseRenderer): - """Renders content to HTML format with format-specific extraction.""" - - @classmethod - def get_supported_formats(cls) -> List[str]: - """Return supported HTML formats.""" - return ['html', 'htm'] - - @classmethod - def get_format_aliases(cls) -> List[str]: - """Return format aliases.""" - return ['web', 'webpage'] - - @classmethod - def get_priority(cls) -> int: - """Return priority for HTML renderer.""" - return 100 - - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: - """Return only HTML-specific guidelines; global prompt is built centrally.""" - return ( - "HTML FORMAT GUIDELINES:\n" - "- Output a complete HTML5 document starting with .\n" - "- Include , with and , and <body>.\n" - "- Use semantic elements: <header>, <main>, <section>, <article>, <footer>.\n" - "- Provide professional CSS in a <style> block; responsive, clean typography.\n" - "- Use h1/h2/h3 for headings; tables and lists for structure.\n" - "OUTPUT: Return ONLY valid HTML (no markdown, no code fences)." - ) - - async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: - """Render extracted content to HTML format.""" - try: - # The extracted content should already be HTML from the AI - # Just clean it up and ensure it's valid - html_content = self._clean_html_content(extracted_content, title) - - return html_content, "text/html" - - except Exception as e: - self.logger.error(f"Error rendering HTML: {str(e)}") - # Return minimal HTML fallback - return f"<html><head><title>{title}

{title}

Error rendering report: {str(e)}

", "text/html" - - def _clean_html_content(self, content: str, title: str) -> str: - """Clean and validate HTML content from AI.""" - content = content.strip() - - # Remove markdown code blocks if present - if content.startswith("```") and content.endswith("```"): - lines = content.split('\n') - if len(lines) > 2: - content = '\n'.join(lines[1:-1]).strip() - - # Ensure it starts with DOCTYPE - if not content.startswith('\n' + content - else: - content = f'\n\n{title}\n\n{content}\n\n' - - return content diff --git a/modules/services/serviceGeneration/renderers/json_renderer.py b/modules/services/serviceGeneration/renderers/json_renderer.py deleted file mode 100644 index 845d33c2..00000000 --- a/modules/services/serviceGeneration/renderers/json_renderer.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -JSON renderer for report generation. -""" - -from .base_renderer import BaseRenderer -from typing import Dict, Any, Tuple, List -import json - -class JsonRenderer(BaseRenderer): - """Renders content to JSON format with format-specific extraction.""" - - @classmethod - def get_supported_formats(cls) -> List[str]: - """Return supported JSON formats.""" - return ['json'] - - @classmethod - def get_format_aliases(cls) -> List[str]: - """Return format aliases.""" - return ['data'] - - @classmethod - def get_priority(cls) -> int: - """Return priority for JSON renderer.""" - return 80 - - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: - """Return only JSON-specific guidelines; global prompt is built centrally.""" - return ( - "JSON FORMAT GUIDELINES:\n" - "- Output ONLY a single valid JSON object (no fences, no pre/post text).\n" - "- Choose a structure that best fits the user's intent; include a top-level title and data.\n" - "- Prefer arrays/objects that map cleanly to the extracted facts.\n" - "- Include minimal metadata only if useful (e.g., generatedAt, sources).\n" - "OUTPUT: Return ONLY valid, parseable JSON." 
- ) - - async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: - """Render extracted content to JSON format.""" - try: - # The extracted content should already be JSON from the AI - # Just validate and format it - json_content = self._clean_json_content(extracted_content, title) - - return json_content, "application/json" - - except Exception as e: - self.logger.error(f"Error rendering JSON: {str(e)}") - # Return minimal JSON fallback - fallback_data = { - "title": title, - "sections": [{"type": "text", "content": f"Error rendering report: {str(e)}"}], - "metadata": {"error": str(e)} - } - return json.dumps(fallback_data, indent=2), "application/json" - - def _clean_json_content(self, content: str, title: str) -> str: - """Clean and validate JSON content from AI.""" - content = content.strip() - - # Remove markdown code blocks if present - if content.startswith("```") and content.endswith("```"): - lines = content.split('\n') - if len(lines) > 2: - content = '\n'.join(lines[1:-1]).strip() - - # Validate JSON - try: - parsed = json.loads(content) - # Re-format with proper indentation - return json.dumps(parsed, indent=2, ensure_ascii=False) - except json.JSONDecodeError: - # If not valid JSON, return as-is - return content diff --git a/modules/services/serviceGeneration/renderers/markdown_renderer.py b/modules/services/serviceGeneration/renderers/markdown_renderer.py deleted file mode 100644 index 8b9b4293..00000000 --- a/modules/services/serviceGeneration/renderers/markdown_renderer.py +++ /dev/null @@ -1,65 +0,0 @@ -""" -Markdown renderer for report generation. 
-""" - -from .base_renderer import BaseRenderer -from typing import Dict, Any, Tuple, List - -class MarkdownRenderer(BaseRenderer): - """Renders content to Markdown format with format-specific extraction.""" - - @classmethod - def get_supported_formats(cls) -> List[str]: - """Return supported Markdown formats.""" - return ['md', 'markdown'] - - @classmethod - def get_format_aliases(cls) -> List[str]: - """Return format aliases.""" - return ['mdown', 'mkd'] - - @classmethod - def get_priority(cls) -> int: - """Return priority for markdown renderer.""" - return 95 - - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: - """Return only Markdown-specific guidelines; global prompt is built centrally.""" - return ( - "MARKDOWN FORMAT GUIDELINES:\n" - "- Use proper Markdown syntax only (no HTML wrappers).\n" - "- # for main title, ## for sections, ### for subsections.\n" - "- Tables with | separators and a header row.\n" - "- Bullet lists with - or *.\n" - "- Emphasis with **bold** and *italic*.\n" - "- Code blocks with ```language.\n" - "- Horizontal rules (---) to separate major sections when helpful.\n" - "- Include links [text](url) and images ![alt](url) when referenced by sources.\n" - "OUTPUT: Return ONLY raw Markdown content without code fences." 
- ) - - async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: - """Render extracted content to Markdown format.""" - try: - # The extracted content should already be Markdown from the AI - # Just clean it up - markdown_content = self._clean_markdown_content(extracted_content, title) - - return markdown_content, "text/markdown" - - except Exception as e: - self.logger.error(f"Error rendering markdown: {str(e)}") - # Return minimal markdown fallback - return f"# {title}\n\nError rendering report: {str(e)}", "text/markdown" - - def _clean_markdown_content(self, content: str, title: str) -> str: - """Clean and validate Markdown content from AI.""" - content = content.strip() - - # Remove markdown code blocks if present - if content.startswith("```") and content.endswith("```"): - lines = content.split('\n') - if len(lines) > 2: - content = '\n'.join(lines[1:-1]).strip() - - return content diff --git a/modules/services/serviceGeneration/renderers/pdf_renderer.py b/modules/services/serviceGeneration/renderers/pdf_renderer.py deleted file mode 100644 index 6a8409a1..00000000 --- a/modules/services/serviceGeneration/renderers/pdf_renderer.py +++ /dev/null @@ -1,225 +0,0 @@ -""" -PDF renderer for report generation using reportlab. 
-""" - -from .base_renderer import BaseRenderer -from typing import Dict, Any, Tuple, List -import io -import base64 -from datetime import datetime, UTC - -try: - from reportlab.lib.pagesizes import letter, A4 - from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak - from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle - from reportlab.lib.units import inch - from reportlab.lib import colors - from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY - REPORTLAB_AVAILABLE = True -except ImportError: - REPORTLAB_AVAILABLE = False - -class PdfRenderer(BaseRenderer): - """Renders content to PDF format using reportlab.""" - - @classmethod - def get_supported_formats(cls) -> List[str]: - """Return supported PDF formats.""" - return ['pdf'] - - @classmethod - def get_format_aliases(cls) -> List[str]: - """Return format aliases.""" - return ['document', 'print'] - - @classmethod - def get_priority(cls) -> int: - """Return priority for PDF renderer.""" - return 120 - - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: - """Return only PDF-specific guidelines; global prompt is built centrally.""" - return ( - "PDF FORMAT GUIDELINES:\n" - "- Provide structured content suitable for pagination and headings (H1/H2/H3-like).\n" - "- Use bullet lists and tables where useful; separate major sections clearly.\n" - "- Avoid markdown/HTML; produce clean, plain content that can be laid out as PDF.\n" - "OUTPUT: Return ONLY the PDF-ready textual content (no fences)." 
- ) - - async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: - """Render extracted content to PDF format.""" - try: - if not REPORTLAB_AVAILABLE: - # Fallback to HTML if reportlab not available - from .html_renderer import HtmlRenderer - html_renderer = HtmlRenderer() - html_content, _ = await html_renderer.render(extracted_content, title) - return html_content, "text/html" - - # Generate PDF using reportlab - pdf_content = self._generate_pdf(extracted_content, title) - - return pdf_content, "application/pdf" - - except Exception as e: - self.logger.error(f"Error rendering PDF: {str(e)}") - # Return minimal fallback - return f"PDF Generation Error: {str(e)}", "text/plain" - - def _generate_pdf(self, content: str, title: str) -> str: - """Generate PDF content using reportlab.""" - try: - # Create a buffer to hold the PDF - buffer = io.BytesIO() - - # Create PDF document - doc = SimpleDocTemplate( - buffer, - pagesize=A4, - rightMargin=72, - leftMargin=72, - topMargin=72, - bottomMargin=18 - ) - - # Get styles - styles = getSampleStyleSheet() - - # Create custom styles - title_style = ParagraphStyle( - 'CustomTitle', - parent=styles['Heading1'], - fontSize=24, - spaceAfter=30, - alignment=TA_CENTER, - textColor=colors.darkblue - ) - - heading_style = ParagraphStyle( - 'CustomHeading', - parent=styles['Heading2'], - fontSize=16, - spaceAfter=12, - spaceBefore=12, - textColor=colors.darkblue - ) - - # Build PDF content - story = [] - - # Title page - story.append(Paragraph(title, title_style)) - story.append(Spacer(1, 20)) - story.append(Paragraph(f"Generated: {self._format_timestamp()}", styles['Normal'])) - story.append(PageBreak()) - - # Process content - lines = content.split('\n') - current_section = [] - - for line in lines: - line = line.strip() - if not line: - continue - - # Check for headings - if line.startswith('# '): - # H1 heading - if current_section: - story.extend(self._process_section(current_section, styles)) - 
current_section = [] - story.append(Paragraph(line[2:], title_style)) - story.append(Spacer(1, 12)) - elif line.startswith('## '): - # H2 heading - if current_section: - story.extend(self._process_section(current_section, styles)) - current_section = [] - story.append(Paragraph(line[3:], heading_style)) - story.append(Spacer(1, 8)) - elif line.startswith('### '): - # H3 heading - if current_section: - story.extend(self._process_section(current_section, styles)) - current_section = [] - story.append(Paragraph(line[4:], styles['Heading3'])) - story.append(Spacer(1, 6)) - else: - current_section.append(line) - - # Process remaining content - if current_section: - story.extend(self._process_section(current_section, styles)) - - # Build PDF - doc.build(story) - - # Get PDF content as base64 - buffer.seek(0) - pdf_bytes = buffer.getvalue() - pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') - - return pdf_base64 - - except Exception as e: - self.logger.error(f"Error generating PDF: {str(e)}") - raise - - def _process_section(self, lines: list, styles) -> list: - """Process a section of content into PDF elements.""" - elements = [] - - for line in lines: - if not line.strip(): - continue - - # Check for tables (lines with |) - if '|' in line and not line.startswith('|'): - # This might be part of a table, process as table - table_data = self._extract_table_data(lines) - if table_data: - table = Table(table_data) - table.setStyle(TableStyle([ - ('BACKGROUND', (0, 0), (-1, 0), colors.grey), - ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), - ('ALIGN', (0, 0), (-1, -1), 'CENTER'), - ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), - ('FONTSIZE', (0, 0), (-1, 0), 14), - ('BOTTOMPADDING', (0, 0), (-1, 0), 12), - ('BACKGROUND', (0, 1), (-1, -1), colors.beige), - ('GRID', (0, 0), (-1, -1), 1, colors.black) - ])) - elements.append(table) - elements.append(Spacer(1, 12)) - return elements - - # Check for lists - if line.startswith('- ') or line.startswith('* '): - # This 
is a list item - elements.append(Paragraph(f"β€’ {line[2:]}", styles['Normal'])) - else: - # Regular paragraph - elements.append(Paragraph(line, styles['Normal'])) - - elements.append(Spacer(1, 6)) - return elements - - def _extract_table_data(self, lines: list) -> list: - """Extract table data from lines.""" - table_data = [] - in_table = False - - for line in lines: - if '|' in line: - if not in_table: - in_table = True - # Split by | and clean up - cells = [cell.strip() for cell in line.split('|') if cell.strip()] - if cells: - table_data.append(cells) - elif in_table and not line.strip(): - # Empty line, might be end of table - break - - return table_data if len(table_data) > 1 else [] \ No newline at end of file diff --git a/modules/services/serviceGeneration/renderers/registry.py b/modules/services/serviceGeneration/renderers/registry.py index 5c498081..909cfb2c 100644 --- a/modules/services/serviceGeneration/renderers/registry.py +++ b/modules/services/serviceGeneration/renderers/registry.py @@ -6,7 +6,7 @@ import logging import importlib import pkgutil from typing import Dict, Type, List, Optional -from .base_renderer import BaseRenderer +from .rendererBaseTemplate import BaseRenderer logger = logging.getLogger(__name__) @@ -37,7 +37,7 @@ class RendererRegistry: # Scan all Python files in the renderers directory for file_path in renderers_dir.glob("*.py"): - if file_path.name in ['registry.py', 'base_renderer.py', '__init__.py']: + if file_path.name in ['registry.py', 'rendererBaseTemplate.py', '__init__.py']: continue # Extract module name from filename @@ -92,7 +92,7 @@ class RendererRegistry: except Exception as e: logger.error(f"Error registering renderer {renderer_class.__name__}: {str(e)}") - def get_renderer(self, output_format: str) -> Optional[BaseRenderer]: + def get_renderer(self, output_format: str, services=None) -> Optional[BaseRenderer]: """Get a renderer instance for the specified format.""" if not self._discovered: self.discover_renderers() 
@@ -109,7 +109,7 @@ class RendererRegistry: if renderer_class: try: - return renderer_class() + return renderer_class(services=services) except Exception as e: logger.error(f"Error creating renderer instance for {format_name}: {str(e)}") return None @@ -144,9 +144,9 @@ class RendererRegistry: # Global registry instance _registry = RendererRegistry() -def get_renderer(output_format: str) -> Optional[BaseRenderer]: +def get_renderer(output_format: str, services=None) -> Optional[BaseRenderer]: """Get a renderer instance for the specified format.""" - return _registry.get_renderer(output_format) + return _registry.get_renderer(output_format, services) def get_supported_formats() -> List[str]: """Get list of all supported formats.""" diff --git a/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py b/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py new file mode 100644 index 00000000..150a903b --- /dev/null +++ b/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py @@ -0,0 +1,459 @@ +""" +Base renderer class for all format renderers. +""" + +from abc import ABC, abstractmethod +from typing import Dict, Any, Tuple, List +import logging +import json + +logger = logging.getLogger(__name__) + +class BaseRenderer(ABC): + """Base class for all format renderers.""" + + def __init__(self, services=None): + self.logger = logger + self.services = services # Add services attribute + + @classmethod + def get_supported_formats(cls) -> List[str]: + """ + Return list of supported format names for this renderer. + Override this method in subclasses to specify supported formats. + """ + return [] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """ + Return list of format aliases for this renderer. + Override this method in subclasses to specify format aliases. + """ + return [] + + @classmethod + def get_priority(cls) -> int: + """ + Return priority for this renderer (higher number = higher priority). 
+ Used when multiple renderers support the same format. + """ + return 0 + + @abstractmethod + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """ + Render extracted JSON content to the target format. + + Args: + extracted_content: Structured JSON content with sections and metadata + title: Report title + user_prompt: Original user prompt for context + ai_service: AI service instance for additional processing + + Returns: + tuple: (rendered_content, mime_type) + """ + pass + + def _extract_sections(self, report_data: Dict[str, Any]) -> List[Dict[str, Any]]: + """Extract sections from report data.""" + return report_data.get('sections', []) + + def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]: + """Extract metadata from report data.""" + return report_data.get('metadata', {}) + + def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str: + """Get title from report data or use fallback.""" + metadata = report_data.get('metadata', {}) + return metadata.get('title', fallback_title) + + def _validate_json_structure(self, json_content: Dict[str, Any]) -> bool: + """Validate that JSON content has the expected structure.""" + if not isinstance(json_content, dict): + return False + + if "sections" not in json_content: + return False + + sections = json_content.get("sections", []) + if not isinstance(sections, list): + return False + + # Validate each section has content_type and elements + for section in sections: + if not isinstance(section, dict): + return False + if "content_type" not in section or "elements" not in section: + return False + + return True + + def _get_section_type(self, section: Dict[str, Any]) -> str: + """Get the type of a section.""" + return section.get("content_type", "paragraph") + + def _get_section_data(self, section: Dict[str, Any]) -> List[Dict[str, Any]]: + """Get the elements of a section.""" + return 
section.get("elements", []) + + def _get_section_id(self, section: Dict[str, Any]) -> str: + """Get the ID of a section (if available).""" + return section.get("id", "unknown") + + def _extract_table_data(self, section_data: Dict[str, Any]) -> Tuple[List[str], List[List[str]]]: + """Extract table headers and rows from section data.""" + headers = section_data.get("headers", []) + rows = section_data.get("rows", []) + return headers, rows + + def _extract_bullet_list_items(self, section_data: Dict[str, Any]) -> List[str]: + """Extract bullet list items from section data.""" + items = section_data.get("items", []) + result = [] + for item in items: + if isinstance(item, str): + result.append(item) + elif isinstance(item, dict) and "text" in item: + result.append(item["text"]) + return result + + def _extract_heading_data(self, section_data: Dict[str, Any]) -> Tuple[int, str]: + """Extract heading level and text from section data.""" + level = section_data.get("level", 1) + text = section_data.get("text", "") + return level, text + + def _extract_paragraph_text(self, section_data: Dict[str, Any]) -> str: + """Extract paragraph text from section data.""" + return section_data.get("text", "") + + def _extract_code_block_data(self, section_data: Dict[str, Any]) -> Tuple[str, str]: + """Extract code and language from section data.""" + code = section_data.get("code", "") + language = section_data.get("language", "") + return code, language + + def _extract_image_data(self, section_data: Dict[str, Any]) -> Tuple[str, str]: + """Extract base64 data and alt text from section data.""" + base64_data = section_data.get("base64Data", "") + alt_text = section_data.get("altText", "Image") + return base64_data, alt_text + + def _render_image_section(self, section: Dict[str, Any], styles: Dict[str, Any] = None) -> Any: + """ + Render an image section. This is a base implementation that should be overridden + by format-specific renderers. 
+ + Args: + section: Image section data + styles: Optional styling information + + Returns: + Format-specific image representation + """ + section_data = self._get_section_data(section) + base64_data, alt_text = self._extract_image_data(section_data) + + # Base implementation returns a simple dict + # Format-specific renderers should override this method + return { + "content_type": "image", + "base64Data": base64_data, + "altText": alt_text, + "width": section_data.get("width", None), + "height": section_data.get("height", None), + "caption": section_data.get("caption", "") + } + + def _validate_image_data(self, base64_data: str, alt_text: str) -> bool: + """Validate image data.""" + if not base64_data: + self.logger.warning("Image section has no base64 data") + return False + + if not alt_text: + self.logger.warning("Image section has no alt text") + return False + + # Basic base64 validation + try: + import base64 + base64.b64decode(base64_data, validate=True) + return True + except Exception as e: + self.logger.warning(f"Invalid base64 image data: {str(e)}") + return False + + def _get_image_dimensions(self, base64_data: str) -> Tuple[int, int]: + """ + Get image dimensions from base64 data. + This is a helper method that format-specific renderers can use. + """ + try: + import base64 + from PIL import Image + import io + + # Decode base64 data + image_data = base64.b64decode(base64_data) + image = Image.open(io.BytesIO(image_data)) + + return image.size # Returns (width, height) + + except Exception as e: + self.logger.warning(f"Could not determine image dimensions: {str(e)}") + return (0, 0) + + def _resize_image_if_needed(self, base64_data: str, max_width: int = 800, max_height: int = 600) -> str: + """ + Resize image if it exceeds maximum dimensions. + Returns the resized image as base64 string. 
+ """ + try: + import base64 + from PIL import Image + import io + + # Decode base64 data + image_data = base64.b64decode(base64_data) + image = Image.open(io.BytesIO(image_data)) + + # Check if resizing is needed + width, height = image.size + if width <= max_width and height <= max_height: + return base64_data # No resizing needed + + # Calculate new dimensions maintaining aspect ratio + ratio = min(max_width / width, max_height / height) + new_width = int(width * ratio) + new_height = int(height * ratio) + + # Resize image + resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS) + + # Convert back to base64 + buffer = io.BytesIO() + resized_image.save(buffer, format=image.format or 'PNG') + resized_data = buffer.getvalue() + + return base64.b64encode(resized_data).decode('utf-8') + + except Exception as e: + self.logger.warning(f"Could not resize image: {str(e)}") + return base64_data # Return original if resize fails + + def _get_supported_section_types(self) -> List[str]: + """Return list of supported section types.""" + return ["table", "bullet_list", "heading", "paragraph", "code_block", "image"] + + def _is_valid_section_type(self, section_type: str) -> bool: + """Check if a section type is valid.""" + return section_type in self._get_supported_section_types() + + def _process_section_by_type(self, section: Dict[str, Any]) -> Dict[str, Any]: + """Process a section and return structured data based on its type.""" + section_type = self._get_section_type(section) + section_data = self._get_section_data(section) + + if section_type == "table": + headers, rows = self._extract_table_data(section_data) + return {"content_type": "table", "headers": headers, "rows": rows} + elif section_type == "bullet_list": + items = self._extract_bullet_list_items(section_data) + return {"content_type": "bullet_list", "items": items} + elif section_type == "heading": + level, text = self._extract_heading_data(section_data) + return {"content_type": 
"heading", "level": level, "text": text} + elif section_type == "paragraph": + text = self._extract_paragraph_text(section_data) + return {"content_type": "paragraph", "text": text} + elif section_type == "code_block": + code, language = self._extract_code_block_data(section_data) + return {"content_type": "code_block", "code": code, "language": language} + elif section_type == "image": + base64_data, alt_text = self._extract_image_data(section_data) + # Validate image data + if self._validate_image_data(base64_data, alt_text): + return { + "content_type": "image", + "base64Data": base64_data, + "altText": alt_text, + "width": section_data.get("width"), + "height": section_data.get("height"), + "caption": section_data.get("caption", "") + } + else: + # Return placeholder if image data is invalid + return {"content_type": "paragraph", "text": f"[Image: {alt_text}]"} + else: + # Fallback to paragraph + text = self._extract_paragraph_text(section_data) + return {"content_type": "paragraph", "text": text} + + def _format_timestamp(self, timestamp: str = None) -> str: + """Format timestamp for display.""" + if timestamp: + return timestamp + from datetime import datetime, UTC + return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC") + + # ===== GENERIC AI STYLING HELPERS ===== + + async def _get_ai_styles(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]: + """ + Generic AI styling method that can be used by all renderers. 
+ + Args: + ai_service: AI service instance + style_template: Format-specific style template + default_styles: Default styles to fall back to + + Returns: + Dict with styling definitions + """ + # DEBUG: Show which renderer is calling this method + + if not ai_service: + return default_styles + + try: + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + request = AiCallRequest(prompt=style_template, context="", options=request_options) + + # DEBUG: Show the actual prompt being sent to AI + self.logger.debug(f"AI Style Template Prompt:") + self.logger.debug(f"{style_template}") + + response = await ai_service.aiObjects.call(request) + + import json + import re + + # Clean and parse JSON + result = response.content.strip() if response and response.content else "" + + # Check if result is empty + if not result: + self.logger.warning("AI styling returned empty response, using defaults") + return default_styles + + # Extract JSON from markdown if present + json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) + if json_match: + result = json_match.group(1).strip() + elif result.startswith('```json'): + result = re.sub(r'^```json\s*', '', result) + result = re.sub(r'\s*```$', '', result) + elif result.startswith('```'): + result = re.sub(r'^```\s*', '', result) + result = re.sub(r'\s*```$', '', result) + + # Try to parse JSON + try: + styles = json.loads(result) + except json.JSONDecodeError as json_error: + self.logger.warning(f"AI styling returned invalid JSON: {json_error}") + + # Use print instead of logger to avoid truncation + self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "RENDERER") + self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "RENDERER") + + self.logger.warning(f"Raw content that failed to parse: {result}") + + # Try to fix incomplete JSON 
by adding missing closing braces + open_braces = result.count('{') + close_braces = result.count('}') + + if open_braces > close_braces: + # JSON is incomplete, add missing closing braces + missing_braces = open_braces - close_braces + result = result + '}' * missing_braces + self.logger.info(f"Added {missing_braces} missing closing brace(s)") + self.logger.debug(f"Fixed JSON: {result}") + + # Try parsing the fixed JSON + try: + styles = json.loads(result) + self.logger.info("Successfully fixed incomplete JSON") + except json.JSONDecodeError as fix_error: + self.logger.warning(f"Fixed JSON still invalid: {fix_error}") + self.logger.warning(f"Fixed JSON content: {result}") + # Try to extract just the JSON part if it's embedded in text + json_start = result.find('{') + json_end = result.rfind('}') + if json_start != -1 and json_end != -1 and json_end > json_start: + json_part = result[json_start:json_end+1] + try: + styles = json.loads(json_part) + self.logger.info("Successfully extracted JSON from explanatory text") + except json.JSONDecodeError: + self.logger.warning("Could not extract valid JSON from response, using defaults") + return default_styles + else: + return default_styles + else: + # Try to extract just the JSON part if it's embedded in text + json_start = result.find('{') + json_end = result.rfind('}') + if json_start != -1 and json_end != -1 and json_end > json_start: + json_part = result[json_start:json_end+1] + try: + styles = json.loads(json_part) + self.logger.info("Successfully extracted JSON from explanatory text") + except json.JSONDecodeError: + self.logger.warning("Could not extract valid JSON from response, using defaults") + return default_styles + else: + return default_styles + + # Convert colors to appropriate format + styles = self._convert_colors_format(styles) + + return styles + + except Exception as e: + self.logger.warning(f"AI styling failed: {str(e)}, using defaults") + return default_styles + + def _convert_colors_format(self, 
styles: Dict[str, Any]) -> Dict[str, Any]: + """ + Convert colors to appropriate format based on renderer type. + Override this method in subclasses for format-specific color handling. + """ + return styles + + def _create_ai_style_template(self, format_name: str, user_prompt: str, style_schema: Dict[str, Any]) -> str: + """ + Create a standardized AI style template for any format. + + Args: + format_name: Name of the format (e.g., "docx", "xlsx", "pptx") + user_prompt: User's original prompt + style_schema: Format-specific style schema + + Returns: + Formatted prompt string + """ + schema_json = json.dumps(style_schema, indent=4) + + # DEBUG: Show the schema being sent + + return f"""You are a professional document styling expert. Generate a complete JSON styling configuration for {format_name.upper()} documents. + +Use this schema as a template and customize the values for professional document styling: + +{schema_json} + +Requirements: +- Return ONLY the complete JSON object (no markdown, no explanations) +- Customize colors, fonts, and spacing for professional appearance +- Ensure all objects are properly closed with closing braces +- Make the styling modern and professional + +Return the complete JSON:""" \ No newline at end of file diff --git a/modules/services/serviceGeneration/renderers/rendererCsv.py b/modules/services/serviceGeneration/renderers/rendererCsv.py new file mode 100644 index 00000000..782e7d4a --- /dev/null +++ b/modules/services/serviceGeneration/renderers/rendererCsv.py @@ -0,0 +1,260 @@ +""" +CSV renderer for report generation. 
+""" + +from .rendererBaseTemplate import BaseRenderer +from typing import Dict, Any, Tuple, List +import csv +import io + +class RendererCsv(BaseRenderer): + """Renders content to CSV format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported CSV formats.""" + return ['csv'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['spreadsheet', 'table'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for CSV renderer.""" + return 70 + + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """Render extracted JSON content to CSV format.""" + try: + # Generate CSV directly from JSON (no styling needed for CSV) + csv_content = await self._generate_csv_from_json(extracted_content, title) + + return csv_content, "text/csv" + + except Exception as e: + self.logger.error(f"Error rendering CSV: {str(e)}") + # Return minimal CSV fallback + return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv" + + async def _generate_csv_from_json(self, json_content: Dict[str, Any], title: str) -> str: + """Generate CSV content from structured JSON document.""" + try: + # Validate JSON structure + if not isinstance(json_content, dict): + raise ValueError("JSON content must be a dictionary") + + if "sections" not in json_content: + raise ValueError("JSON content must contain 'sections' field") + + # Use title from JSON metadata if available, otherwise use provided title + document_title = json_content.get("metadata", {}).get("title", title) + + # Generate CSV content + csv_rows = [] + + # Add title row + if document_title: + csv_rows.append([document_title]) + csv_rows.append([]) # Empty row + + # Process each section in order + sections = json_content.get("sections", []) + for section in sections: + section_csv = 
self._render_json_section_to_csv(section) + if section_csv: + csv_rows.extend(section_csv) + csv_rows.append([]) # Empty row between sections + + # Convert to CSV string + csv_content = self._convert_rows_to_csv(csv_rows) + + return csv_content + + except Exception as e: + self.logger.error(f"Error generating CSV from JSON: {str(e)}") + raise Exception(f"CSV generation failed: {str(e)}") + + def _render_json_section_to_csv(self, section: Dict[str, Any]) -> List[List[str]]: + """Render a single JSON section to CSV rows.""" + try: + section_type = section.get("content_type", "paragraph") + elements = section.get("elements", []) + + csv_rows = [] + + # Add section title if available + section_title = section.get("title") + if section_title: + csv_rows.append([f"# {section_title}"]) + + # Process each element in the section + for element in elements: + if section_type == "table": + csv_rows.extend(self._render_json_table_to_csv(element)) + elif section_type == "list": + csv_rows.extend(self._render_json_list_to_csv(element)) + elif section_type == "heading": + csv_rows.extend(self._render_json_heading_to_csv(element)) + elif section_type == "paragraph": + csv_rows.extend(self._render_json_paragraph_to_csv(element)) + elif section_type == "code": + csv_rows.extend(self._render_json_code_to_csv(element)) + else: + # Fallback to paragraph for unknown types + csv_rows.extend(self._render_json_paragraph_to_csv(element)) + + return csv_rows + + except Exception as e: + self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}") + return [["[Error rendering section]"]] + + def _render_json_table_to_csv(self, table_data: Dict[str, Any]) -> List[List[str]]: + """Render a JSON table to CSV rows.""" + try: + headers = table_data.get("headers", []) + rows = table_data.get("rows", []) + + csv_rows = [] + + if headers: + csv_rows.append(headers) + + if rows: + csv_rows.extend(rows) + + return csv_rows + + except Exception as e: + 
self.logger.warning(f"Error rendering table: {str(e)}") + return [["[Error rendering table]"]] + + def _render_json_list_to_csv(self, list_data: Dict[str, Any]) -> List[List[str]]: + """Render a JSON list to CSV rows.""" + try: + items = list_data.get("items", []) + csv_rows = [] + + for item in items: + if isinstance(item, dict): + text = item.get("text", "") + subitems = item.get("subitems", []) + csv_rows.append([text]) + + # Add subitems as indented rows + for subitem in subitems: + if isinstance(subitem, dict): + csv_rows.append([f" - {subitem.get('text', '')}"]) + else: + csv_rows.append([f" - {subitem}"]) + else: + csv_rows.append([str(item)]) + + return csv_rows + + except Exception as e: + self.logger.warning(f"Error rendering list: {str(e)}") + return [["[Error rendering list]"]] + + def _render_json_heading_to_csv(self, heading_data: Dict[str, Any]) -> List[List[str]]: + """Render a JSON heading to CSV rows.""" + try: + text = heading_data.get("text", "") + level = heading_data.get("level", 1) + + if text: + # Use # symbols for heading levels + heading_text = f"{'#' * level} {text}" + return [[heading_text]] + + return [] + + except Exception as e: + self.logger.warning(f"Error rendering heading: {str(e)}") + return [["[Error rendering heading]"]] + + def _render_json_paragraph_to_csv(self, paragraph_data: Dict[str, Any]) -> List[List[str]]: + """Render a JSON paragraph to CSV rows.""" + try: + text = paragraph_data.get("text", "") + + if text: + # Split long paragraphs into multiple rows if needed + if len(text) > 100: + words = text.split() + rows = [] + current_row = [] + current_length = 0 + + for word in words: + if current_length + len(word) > 100 and current_row: + rows.append([" ".join(current_row)]) + current_row = [word] + current_length = len(word) + else: + current_row.append(word) + current_length += len(word) + 1 + + if current_row: + rows.append([" ".join(current_row)]) + + return rows + else: + return [[text]] + + return [] + + except 
Exception as e: + self.logger.warning(f"Error rendering paragraph: {str(e)}") + return [["[Error rendering paragraph]"]] + + def _render_json_code_to_csv(self, code_data: Dict[str, Any]) -> List[List[str]]: + """Render a JSON code block to CSV rows.""" + try: + code = code_data.get("code", "") + language = code_data.get("language", "") + + csv_rows = [] + + if language: + csv_rows.append([f"Code ({language}):"]) + + if code: + # Split code into lines + code_lines = code.split('\n') + for line in code_lines: + csv_rows.append([f" {line}"]) + + return csv_rows + + except Exception as e: + self.logger.warning(f"Error rendering code block: {str(e)}") + return [["[Error rendering code block]"]] + + def _convert_rows_to_csv(self, rows: List[List[str]]) -> str: + """Convert rows to CSV string.""" + import csv + import io + + output = io.StringIO() + writer = csv.writer(output) + + for row in rows: + if row: # Only write non-empty rows + writer.writerow(row) + + return output.getvalue() + + def _clean_csv_content(self, content: str, title: str) -> str: + """Clean and validate CSV content from AI.""" + content = content.strip() + + # Remove markdown code blocks if present + if content.startswith("```") and content.endswith("```"): + lines = content.split('\n') + if len(lines) > 2: + content = '\n'.join(lines[1:-1]).strip() + + return content diff --git a/modules/services/serviceGeneration/renderers/rendererDocx.py b/modules/services/serviceGeneration/renderers/rendererDocx.py new file mode 100644 index 00000000..d744b7e5 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/rendererDocx.py @@ -0,0 +1,958 @@ +""" +DOCX renderer for report generation using python-docx. 
+""" + +from .rendererBaseTemplate import BaseRenderer +from typing import Dict, Any, Tuple, List +import io +import base64 +import re +import os +from datetime import datetime, UTC + +try: + from docx import Document + from docx.shared import Inches, Pt, RGBColor + from docx.enum.text import WD_ALIGN_PARAGRAPH + from docx.enum.table import WD_TABLE_ALIGNMENT + from docx.oxml.shared import OxmlElement, qn + from docx.oxml.ns import nsdecls + from docx.oxml import parse_xml + DOCX_AVAILABLE = True +except ImportError: + DOCX_AVAILABLE = False + +class RendererDocx(BaseRenderer): + """Renders content to DOCX format using python-docx.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported DOCX formats.""" + return ['docx', 'doc'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['word', 'document'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for DOCX renderer.""" + return 115 + + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """Render extracted JSON content to DOCX format using AI-analyzed styling.""" + self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={user_prompt[:50] if user_prompt else 'None'}...", "DOCX_RENDERER") + try: + if not DOCX_AVAILABLE: + # Fallback to HTML if python-docx not available + from .rendererHtml import RendererHtml + html_renderer = RendererHtml() + html_content, _ = await html_renderer.render(extracted_content, title) + return html_content, "text/html" + + # Generate DOCX using AI-analyzed styling + docx_content = await self._generate_docx_from_json(extracted_content, title, user_prompt, ai_service) + + return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + + except Exception as e: + self.logger.error(f"Error rendering DOCX: {str(e)}") + # Return minimal fallback + return 
f"DOCX Generation Error: {str(e)}", "text/plain" + + async def _generate_docx_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str: + """Generate DOCX content from structured JSON document using AI-generated styling.""" + try: + # Create new document + doc = Document() + + # Get AI-generated styling definitions + self.logger.info(f"About to call AI styling with user_prompt: {user_prompt[:100] if user_prompt else 'None'}...") + styles = await self._get_docx_styles(user_prompt, ai_service) + + # Apply basic document setup + self._setup_basic_document_styles(doc) + + # Validate JSON structure + if not isinstance(json_content, dict): + raise ValueError("JSON content must be a dictionary") + + if "sections" not in json_content: + raise ValueError("JSON content must contain 'sections' field") + + # Use title from JSON metadata if available, otherwise use provided title + document_title = json_content.get("metadata", {}).get("title", title) + + # Add document title using analyzed styles + if document_title: + title_heading = doc.add_heading(document_title, level=1) + title_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER + + # Process each section in order + sections = json_content.get("sections", []) + for section in sections: + self._render_json_section(doc, section, styles) + + # Save to buffer + buffer = io.BytesIO() + doc.save(buffer) + buffer.seek(0) + + # Convert to base64 + docx_bytes = buffer.getvalue() + docx_base64 = base64.b64encode(docx_bytes).decode('utf-8') + + return docx_base64 + + except Exception as e: + self.logger.error(f"Error generating DOCX from JSON: {str(e)}") + raise Exception(f"DOCX generation failed: {str(e)}") + + async def _get_docx_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]: + """Get DOCX styling definitions using base template AI styling.""" + style_schema = { + "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"}, + "heading1": 
{"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"}, + "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"}, + "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"}, + "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"}, + "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"}, + "table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"}, + "bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20}, + "code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"} + } + + style_template = self._create_ai_style_template("docx", user_prompt, style_schema) + styles = await self._get_ai_styles(ai_service, style_template, self._get_default_styles()) + + # Validate and fix contrast issues + return self._validate_styles_contrast(styles) + + def _validate_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]: + """Validate and fix contrast issues in AI-generated styles.""" + try: + # Fix table header contrast + if "table_header" in styles: + header = styles["table_header"] + bg_color = header.get("background", "#FFFFFF") + text_color = header.get("text_color", "#000000") + + # If both are white or both are dark, fix it + if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": + header["background"] = "#4F4F4F" + header["text_color"] = "#FFFFFF" + elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": + header["background"] = "#4F4F4F" + header["text_color"] = "#FFFFFF" + + # Fix table cell contrast + if "table_cell" in styles: + cell = styles["table_cell"] + bg_color = cell.get("background", "#FFFFFF") + text_color = cell.get("text_color", "#000000") + + # If both are white or both are dark, fix it + if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": + cell["background"] = "#FFFFFF" 
+ cell["text_color"] = "#2F2F2F" + elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": + cell["background"] = "#FFFFFF" + cell["text_color"] = "#2F2F2F" + + return styles + + except Exception as e: + self.logger.warning(f"Style validation failed: {str(e)}") + return self._get_default_styles() + + def _get_default_styles(self) -> Dict[str, Any]: + """Default DOCX styles.""" + return { + "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"}, + "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"}, + "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"}, + "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"}, + "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"}, + "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"}, + "table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"}, + "bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20}, + "code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"} + } + + def _setup_basic_document_styles(self, doc: Document) -> None: + """Set up basic document styles.""" + try: + # Set default font + style = doc.styles['Normal'] + font = style.font + font.name = 'Calibri' + font.size = Pt(11) + except Exception as e: + self.logger.warning(f"Could not set up basic document styles: {str(e)}") + + + + + def _clear_template_content(self, doc: Document) -> None: + """Clear template content while preserving styles.""" + try: + # Remove all paragraphs except keep the styles + for paragraph in list(doc.paragraphs): + # Keep the paragraph but clear its content + paragraph.clear() + + # Remove all tables + for table in list(doc.tables): + table._element.getparent().remove(table._element) + + except Exception as e: + self.logger.warning(f"Could not 
clear template content: {str(e)}") + + def _render_json_section(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None: + """Render a single JSON section to DOCX using AI-generated styles.""" + try: + section_type = section.get("content_type", "paragraph") + elements = section.get("elements", []) + + # Process each element in the section + for element in elements: + if section_type == "table": + self._render_json_table(doc, element, styles) + elif section_type == "bullet_list": + self._render_json_bullet_list(doc, element, styles) + elif section_type == "heading": + self._render_json_heading(doc, element, styles) + elif section_type == "paragraph": + self._render_json_paragraph(doc, element, styles) + elif section_type == "code_block": + self._render_json_code_block(doc, element, styles) + elif section_type == "image": + self._render_json_image(doc, element, styles) + else: + # Fallback to paragraph for unknown types + self._render_json_paragraph(doc, element, styles) + + except Exception as e: + self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}") + # Add error paragraph as fallback + error_para = doc.add_paragraph(f"[Error rendering section: {str(e)}]") + + def _render_json_table(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None: + """Render a JSON table to DOCX using AI-generated styles.""" + try: + headers = table_data.get("headers", []) + rows = table_data.get("rows", []) + + if not headers or not rows: + return + + # Create table + table = doc.add_table(rows=len(rows) + 1, cols=len(headers)) + table.alignment = WD_TABLE_ALIGNMENT.CENTER + + # Apply table borders based on AI style + border_style = styles["table_border"]["style"] + if border_style == "horizontal_only": + self._apply_horizontal_borders_only(table) + elif border_style == "grid": + table.style = 'Table Grid' + # else: no borders + + # Add headers with AI-generated styling + header_row = table.rows[0] + 
header_style = styles["table_header"] + for i, header in enumerate(headers): + if i < len(header_row.cells): + cell = header_row.cells[i] + cell.text = str(header) + + # Apply background color + bg_color = header_style["background"].lstrip('#') + self._set_cell_background(cell, RGBColor(int(bg_color[0:2], 16), int(bg_color[2:4], 16), int(bg_color[4:6], 16))) + + # Apply text styling + for paragraph in cell.paragraphs: + paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER if header_style["align"] == "center" else WD_ALIGN_PARAGRAPH.LEFT + for run in paragraph.runs: + run.bold = header_style["bold"] + run.font.size = Pt(11) + text_color = header_style["text_color"].lstrip('#') + run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16)) + + # Add data rows with AI-generated styling + cell_style = styles["table_cell"] + for row_idx, row_data in enumerate(rows): + if row_idx + 1 < len(table.rows): + table_row = table.rows[row_idx + 1] + for col_idx, cell_data in enumerate(row_data): + if col_idx < len(table_row.cells): + cell = table_row.cells[col_idx] + cell.text = str(cell_data) + + # Apply text styling + for paragraph in cell.paragraphs: + paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT + for run in paragraph.runs: + run.font.size = Pt(10) + text_color = cell_style["text_color"].lstrip('#') + run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16)) + + except Exception as e: + self.logger.warning(f"Error rendering table: {str(e)}") + + def _apply_horizontal_borders_only(self, table) -> None: + """Apply only horizontal borders to the table (no vertical borders).""" + try: + from docx.oxml.shared import OxmlElement, qn + + # Get table properties + tbl_pr = table._element.find(qn('w:tblPr')) + if tbl_pr is None: + tbl_pr = OxmlElement('w:tblPr') + table._element.insert(0, tbl_pr) + + # Remove existing borders + existing_borders = tbl_pr.find(qn('w:tblBorders')) + if 
existing_borders is not None: + tbl_pr.remove(existing_borders) + + # Create new borders element + tbl_borders = OxmlElement('w:tblBorders') + + # Top border + top_border = OxmlElement('w:top') + top_border.set(qn('w:val'), 'single') + top_border.set(qn('w:sz'), '4') + top_border.set(qn('w:space'), '0') + top_border.set(qn('w:color'), '000000') + tbl_borders.append(top_border) + + # Bottom border + bottom_border = OxmlElement('w:bottom') + bottom_border.set(qn('w:val'), 'single') + bottom_border.set(qn('w:sz'), '4') + bottom_border.set(qn('w:space'), '0') + bottom_border.set(qn('w:color'), '000000') + tbl_borders.append(bottom_border) + + # Left border - none + left_border = OxmlElement('w:left') + left_border.set(qn('w:val'), 'none') + tbl_borders.append(left_border) + + # Right border - none + right_border = OxmlElement('w:right') + right_border.set(qn('w:val'), 'none') + tbl_borders.append(right_border) + + # Inside horizontal border + inside_h_border = OxmlElement('w:insideH') + inside_h_border.set(qn('w:val'), 'single') + inside_h_border.set(qn('w:sz'), '4') + inside_h_border.set(qn('w:space'), '0') + inside_h_border.set(qn('w:color'), '000000') + tbl_borders.append(inside_h_border) + + # Inside vertical border - none + inside_v_border = OxmlElement('w:insideV') + inside_v_border.set(qn('w:val'), 'none') + tbl_borders.append(inside_v_border) + + tbl_pr.append(tbl_borders) + + except Exception as e: + self.logger.warning(f"Could not apply horizontal borders: {str(e)}") + + def _set_cell_background(self, cell, color: RGBColor) -> None: + """Set the background color of a table cell.""" + try: + from docx.oxml.shared import OxmlElement, qn + + # Get cell properties + tc_pr = cell._element.find(qn('w:tcPr')) + if tc_pr is None: + tc_pr = OxmlElement('w:tcPr') + cell._element.insert(0, tc_pr) + + # Remove existing shading + existing_shading = tc_pr.find(qn('w:shd')) + if existing_shading is not None: + tc_pr.remove(existing_shading) + + # Create new shading element 
+ shading = OxmlElement('w:shd') + shading.set(qn('w:val'), 'clear') + shading.set(qn('w:color'), 'auto') + # Convert RGBColor to hex string by unpacking RGB components + red, green, blue = color + hex_color = f"{red:02x}{green:02x}{blue:02x}" + shading.set(qn('w:fill'), hex_color) + tc_pr.append(shading) + + except Exception as e: + self.logger.warning(f"Could not set cell background: {str(e)}") + + + def _render_json_bullet_list(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None: + """Render a JSON bullet list to DOCX using AI-generated styles.""" + try: + items = list_data.get("items", []) + bullet_style = styles["bullet_list"] + + for item in items: + if isinstance(item, str): + para = doc.add_paragraph(item, style='List Bullet') + elif isinstance(item, dict) and "text" in item: + para = doc.add_paragraph(item["text"], style='List Bullet') + + except Exception as e: + self.logger.warning(f"Error rendering bullet list: {str(e)}") + + def _render_json_heading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None: + """Render a JSON heading to DOCX using AI-generated styles.""" + try: + level = heading_data.get("level", 1) + text = heading_data.get("text", "") + + if text: + level = max(1, min(6, level)) + doc.add_heading(text, level=level) + + except Exception as e: + self.logger.warning(f"Error rendering heading: {str(e)}") + + def _render_json_paragraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None: + """Render a JSON paragraph to DOCX using AI-generated styles.""" + try: + text = paragraph_data.get("text", "") + + if text: + para = doc.add_paragraph(text) + + except Exception as e: + self.logger.warning(f"Error rendering paragraph: {str(e)}") + + def _render_json_code_block(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None: + """Render a JSON code block to DOCX using AI-generated styles.""" + try: + code = code_data.get("code", "") + 
language = code_data.get("language", "") + + if code: + if language: + lang_para = doc.add_paragraph(f"Code ({language}):") + lang_para.runs[0].bold = True + + code_para = doc.add_paragraph(code) + for run in code_para.runs: + run.font.name = 'Courier New' + run.font.size = Pt(10) + + except Exception as e: + self.logger.warning(f"Error rendering code block: {str(e)}") + + def _render_json_image(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None: + """Render a JSON image to DOCX.""" + try: + base64_data = image_data.get("base64Data", "") + alt_text = image_data.get("altText", "Image") + + if base64_data: + image_bytes = base64.b64decode(base64_data) + doc.add_picture(io.BytesIO(image_bytes), width=Inches(4)) + + if alt_text: + caption_para = doc.add_paragraph(f"Figure: {alt_text}") + caption_para.runs[0].italic = True + + except Exception as e: + self.logger.warning(f"Error rendering image: {str(e)}") + doc.add_paragraph(f"[Image: {image_data.get('altText', 'Image')}]") + + def _extract_structure_from_prompt(self, user_prompt: str, title: str) -> Dict[str, Any]: + """Extract document structure from user prompt.""" + structure = { + 'title': title, + 'sections': [], + 'format': 'standard' + } + + if not user_prompt: + return structure + + # Extract title from prompt if not provided + if not title or title == "Generated Document": + # Look for "create a ... document" or "generate a ... 
report" + import re + title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', user_prompt.lower()) + if title_match: + structure['title'] = title_match.group(1).strip().title() + + # Extract sections from numbered lists in prompt + import re + section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)' + sections = re.findall(section_pattern, user_prompt) + + for num, section_text in sections: + structure['sections'].append({ + 'number': int(num), + 'title': section_text.strip(), + 'level': 2 # H2 level + }) + + # If no numbered sections found, try to extract from "including:" patterns + if not structure['sections']: + including_match = re.search(r'including:\s*(.+?)(?:\.|$)', user_prompt, re.DOTALL) + if including_match: + including_text = including_match.group(1) + # Split by common separators + parts = re.split(r'[,;]\s*', including_text) + for i, part in enumerate(parts, 1): + part = part.strip() + if part: + structure['sections'].append({ + 'number': i, + 'title': part, + 'level': 2 + }) + + # If still no sections, extract from any list-like patterns + if not structure['sections']: + # Look for bullet points or dashes + bullet_pattern = r'[-β€’]\s*([^,\n]+?)(?:\s*[,:]|\s*$)' + bullets = re.findall(bullet_pattern, user_prompt) + for i, bullet in enumerate(bullets, 1): + bullet = bullet.strip() + if bullet and len(bullet) > 3: + structure['sections'].append({ + 'number': i, + 'title': bullet, + 'level': 2 + }) + + # If still no sections, extract from sentence structure + if not structure['sections']: + # Split prompt into sentences and use as sections + sentences = re.split(r'[.!?]\s+', user_prompt) + for i, sentence in enumerate(sentences[:5], 1): # Max 5 sections + sentence = sentence.strip() + if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')): + structure['sections'].append({ + 'number': i, + 'title': sentence[:50] + "..." 
if len(sentence) > 50 else sentence, + 'level': 2 + }) + + # Final fallback: create sections from prompt keywords + if not structure['sections']: + # Extract key action words from prompt + action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate'] + found_actions = [] + for action in action_words: + if action in user_prompt.lower(): + found_actions.append(action.title()) + + if found_actions: + for i, action in enumerate(found_actions[:3], 1): + structure['sections'].append({ + 'number': i, + 'title': f"{action} Document Content", + 'level': 2 + }) + else: + # Last resort: generic but meaningful sections + structure['sections'] = [ + {'number': 1, 'title': 'Document Analysis', 'level': 2}, + {'number': 2, 'title': 'Key Information', 'level': 2}, + {'number': 3, 'title': 'Summary and Conclusions', 'level': 2} + ] + + return structure + + def _generate_content_from_structure(self, doc, content: str, structure: Dict[str, Any]): + """Generate DOCX content based on extracted structure.""" + # Add sections based on prompt structure + for section in structure['sections']: + # Add section heading + doc.add_heading(f"{section['number']}) {section['title']}", level=section['level']) + + # Add AI-generated content for this section + # Try to extract relevant content for this section from the AI response + section_content = self._extract_section_content(content, section['title']) + + if section_content: + doc.add_paragraph(section_content) + else: + # If no specific content found, add a note + doc.add_paragraph(f"Content for {section['title']} based on document analysis.") + + # Add some spacing + doc.add_paragraph() + + # Add the complete AI-generated content as additional analysis + if content and content.strip(): + doc.add_heading("Complete Analysis", level=1) + doc.add_paragraph(content) + + def _extract_section_content(self, content: str, section_title: str) -> str: + """Extract relevant content for a specific section from AI 
response.""" + if not content or not section_title: + return "" + + # Look for content that matches the section title + section_keywords = section_title.lower().split() + + # Split content into paragraphs + paragraphs = content.split('\n\n') + + relevant_paragraphs = [] + for paragraph in paragraphs: + paragraph_lower = paragraph.lower() + # Check if paragraph contains keywords from section title + if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3): + relevant_paragraphs.append(paragraph.strip()) + + if relevant_paragraphs: + return '\n\n'.join(relevant_paragraphs[:2]) # Max 2 paragraphs per section + + return "" + + def _setup_document_styles(self, doc): + """Set up document styles.""" + try: + # Set default font + style = doc.styles['Normal'] + font = style.font + font.name = 'Calibri' + font.size = Pt(11) + + # Set heading styles + for i in range(1, 4): + heading_style = doc.styles[f'Heading {i}'] + heading_font = heading_style.font + heading_font.name = 'Calibri' + heading_font.size = Pt(16 - i * 2) + heading_font.bold = True + except Exception as e: + self.logger.warning(f"Could not set up document styles: {str(e)}") + + def _process_section(self, doc, lines: list): + """Process a section of content into DOCX elements.""" + for line in lines: + if not line.strip(): + continue + + # Check for tables (lines with |) + if '|' in line and not line.startswith('|'): + # This might be part of a table, process as table + table_data = self._extract_table_data(lines) + if table_data: + self._add_table(doc, table_data) + return + + # Check for lists + if line.startswith('- ') or line.startswith('* '): + # This is a list item + doc.add_paragraph(line[2:], style='List Bullet') + elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. 
')): + # This is a numbered list item + doc.add_paragraph(line[3:], style='List Number') + else: + # Regular paragraph + doc.add_paragraph(line) + + def _extract_table_data(self, lines: list) -> list: + """Extract table data from lines.""" + table_data = [] + in_table = False + + for line in lines: + if '|' in line: + if not in_table: + in_table = True + # Split by | and clean up + cells = [cell.strip() for cell in line.split('|') if cell.strip()] + if cells: + table_data.append(cells) + elif in_table and not line.strip(): + # Empty line, might be end of table + break + + return table_data if len(table_data) > 1 else [] + + def _add_table(self, doc, table_data: list): + """Add a table to the document.""" + try: + if not table_data: + return + + # Create table + table = doc.add_table(rows=len(table_data), cols=len(table_data[0])) + table.alignment = WD_TABLE_ALIGNMENT.CENTER + + # Add data to table + for row_idx, row_data in enumerate(table_data): + for col_idx, cell_data in enumerate(row_data): + if col_idx < len(table.rows[row_idx].cells): + table.rows[row_idx].cells[col_idx].text = cell_data + + # Style the table + self._style_table(table) + + except Exception as e: + self.logger.warning(f"Could not add table: {str(e)}") + + def _style_table(self, table): + """Apply styling to the table.""" + try: + # Style header row + if len(table.rows) > 0: + header_cells = table.rows[0].cells + for cell in header_cells: + for paragraph in cell.paragraphs: + for run in paragraph.runs: + run.bold = True + except Exception as e: + self.logger.warning(f"Could not style table: {str(e)}") + + def _process_table_row(self, doc, line: str): + """Process a table row and add it to the document.""" + if not line.strip(): + return + + # Split by pipe separator + parts = [part.strip() for part in line.split('|')] + + if len(parts) >= 2: + # This is a table row - create a table if it doesn't exist + if not hasattr(self, '_current_table') or self._current_table is None: + # Create new table 
+ self._current_table = doc.add_table(rows=1, cols=len(parts)) + self._current_table.style = 'Table Grid' + + # Add header row + for i, part in enumerate(parts): + if i < len(self._current_table.rows[0].cells): + cell = self._current_table.rows[0].cells[i] + cell.text = part + # Make header bold + for paragraph in cell.paragraphs: + for run in paragraph.runs: + run.bold = True + else: + # Add data row to existing table + row = self._current_table.add_row() + for i, part in enumerate(parts): + if i < len(row.cells): + row.cells[i].text = part + else: + # Not a table row, treat as regular text + doc.add_paragraph(line) + + def _clean_ai_content(self, content: str) -> str: + """Clean AI-generated content by removing debug information and duplicates.""" + if not content: + return "" + + # Remove debug information + lines = content.split('\n') + clean_lines = [] + + for line in lines: + # Skip debug lines and separators + if (line.startswith('[Skipped ') or + line.startswith('=== DOCUMENT:') or + line.startswith('---') or + line.startswith('FILENAME:') or + line.strip() == '' or + line.strip() == '---'): + continue + clean_lines.append(line) + + # Join lines and remove duplicate content + clean_content = '\n'.join(clean_lines) + + # Remove duplicate sections by keeping only the first occurrence + sections = clean_content.split('\n\n') + seen_sections = set() + unique_sections = [] + + for section in sections: + section_key = section.strip()[:50] # Use first 50 chars as key + if section_key not in seen_sections and section.strip(): + seen_sections.add(section_key) + unique_sections.append(section) + + return '\n\n'.join(unique_sections) + + def _process_tables(self, doc, content: str) -> str: + """ + Process tables in the content (both CSV and pipe-separated) and convert them to Word tables. + Returns the content with tables replaced by placeholders. 
+ """ + import csv + import io + + lines = content.split('\n') + processed_lines = [] + i = 0 + + while i < len(lines): + line = lines[i].strip() + + # Check if this line looks like a table (contains pipes or commas with multiple fields) + is_pipe_table = '|' in line and len(line.split('|')) >= 2 + is_csv_table = ',' in line and len(line.split(',')) >= 2 + + if is_pipe_table or is_csv_table: + # Collect consecutive table lines + table_lines = [] + j = i + + # Determine separator and collect lines + separator = '|' if is_pipe_table else ',' + while j < len(lines): + current_line = lines[j].strip() + if separator in current_line and len(current_line.split(separator)) >= 2: + table_lines.append(current_line) + j += 1 + else: + break + + if len(table_lines) >= 2: # At least header + 1 data row + # Create Word table + try: + if separator == '|': + # Process pipe-separated table + rows = [] + for table_line in table_lines: + # Split by pipe and clean up + cells = [cell.strip() for cell in table_line.split('|')] + rows.append(cells) + else: + # Process CSV table + csv_content = '\n'.join(table_lines) + csv_reader = csv.reader(io.StringIO(csv_content)) + rows = list(csv_reader) + + if rows and len(rows[0]) > 0: + # Create Word table + table = doc.add_table(rows=len(rows), cols=len(rows[0])) + table.style = 'Table Grid' + + # Populate table + for row_idx, row_data in enumerate(rows): + for col_idx, cell_data in enumerate(row_data): + if col_idx < len(table.rows[row_idx].cells): + table.rows[row_idx].cells[col_idx].text = cell_data.strip() + + # Make header row bold + if row_idx == 0: + for cell in table.rows[row_idx].cells: + for paragraph in cell.paragraphs: + for run in paragraph.runs: + run.bold = True + + # Add placeholder to mark where table was inserted + processed_lines.append(f"[TABLE_INSERTED_{len(processed_lines)}]") + + # Skip the table lines + i = j + continue + except Exception as e: + # If table parsing fails, treat as regular text + pass + + 
processed_lines.append(line) + i += 1 + + return '\n'.join(processed_lines) + + def _parse_and_format_content(self, doc, content: str, title: str): + """Parse AI-generated content in standardized format and apply proper DOCX formatting.""" + if not content: + return + + # Process tables and replace them with placeholders + content = self._process_tables(doc, content) + + # Parse content line by line in exact sequence + lines = content.split('\n') + + for line in lines: + line = line.strip() + if not line: + # Empty line - add paragraph break + doc.add_paragraph() + continue + + # Skip table placeholders (already processed) + if line.startswith('[TABLE_INSERTED_'): + continue + + # Check if this is a Markdown heading (# ## ###) + if line.startswith('#'): + level = len(line) - len(line.lstrip('#')) + heading_text = line.lstrip('# ').strip() + doc.add_heading(heading_text, level=min(level, 3)) + + # Check if this is a numbered heading (1) Title, 2) Title, etc.) + elif re.match(r'^\d+\)\s+.+', line): + heading_text = re.sub(r'^\d+\)\s+', '', line) + doc.add_heading(heading_text, level=1) + + # Check if this is a Markdown list item + elif line.startswith('- ') or re.match(r'^\d+\.\s+', line): + bullet_text = re.sub(r'^[-β€’]\s+|\d+\.\s+', '', line) + self._add_bullet_point(doc, bullet_text) + + # Check if this is a code block + elif line.startswith('```'): + if not line.endswith('```'): + # Start of code block - collect until end + code_lines = [line] + continue + else: + # End of code block + if 'code_lines' in locals(): + code_lines.append(line) + code_text = '\n'.join(code_lines) + para = doc.add_paragraph() + run = para.add_run(code_text) + run.font.name = 'Courier New' + del code_lines + + # Regular paragraph + else: + self._add_paragraph_to_doc(doc, line) + + def _add_paragraph_to_doc(self, doc, text: str): + """Add a paragraph to the document with proper formatting.""" + if not text.strip(): + return + + # Check for Markdown formatting (**bold**, *italic*) + para 
= doc.add_paragraph() + + # Split by bold markers + parts = text.split('**') + for i, part in enumerate(parts): + if i % 2 == 0: + # Regular text - check for italic + italic_parts = part.split('*') + for j, italic_part in enumerate(italic_parts): + if j % 2 == 0: + # Regular text + if italic_part: + para.add_run(italic_part) + else: + # Italic text + if italic_part: + run = para.add_run(italic_part) + run.italic = True + else: + # Bold text + if part: + run = para.add_run(part) + run.bold = True \ No newline at end of file diff --git a/modules/services/serviceGeneration/renderers/rendererHtml.py b/modules/services/serviceGeneration/renderers/rendererHtml.py new file mode 100644 index 00000000..1dedaf46 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/rendererHtml.py @@ -0,0 +1,424 @@ +""" +HTML renderer for report generation. +""" + +from .rendererBaseTemplate import BaseRenderer +from typing import Dict, Any, Tuple, List + +class RendererHtml(BaseRenderer): + """Renders content to HTML format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported HTML formats.""" + return ['html', 'htm'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['web', 'webpage'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for HTML renderer.""" + return 100 + + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """Render extracted JSON content to HTML format using AI-analyzed styling.""" + try: + # Generate HTML using AI-analyzed styling + html_content = await self._generate_html_from_json(extracted_content, title, user_prompt, ai_service) + + return html_content, "text/html" + + except Exception as e: + self.logger.error(f"Error rendering HTML: {str(e)}") + # Return minimal HTML fallback + return f"{title}

{title}

Error rendering report: {str(e)}

", "text/html" + + async def _generate_html_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str: + """Generate HTML content from structured JSON document using AI-generated styling.""" + try: + # Get AI-generated styling definitions + styles = await self._get_html_styles(user_prompt, ai_service) + + # Validate JSON structure + if not isinstance(json_content, dict): + raise ValueError("JSON content must be a dictionary") + + if "sections" not in json_content: + raise ValueError("JSON content must contain 'sections' field") + + # Use title from JSON metadata if available, otherwise use provided title + document_title = json_content.get("metadata", {}).get("title", title) + + # Build HTML document + html_parts = [] + + # HTML document structure + html_parts.append('') + html_parts.append('') + html_parts.append('') + html_parts.append('') + html_parts.append('') + html_parts.append(f'{document_title}') + html_parts.append('') + html_parts.append('') + html_parts.append('') + + # Document header + html_parts.append(f'

{document_title}

') + + # Main content + html_parts.append('
') + + # Process each section + sections = json_content.get("sections", []) + for section in sections: + section_html = self._render_json_section(section, styles) + if section_html: + html_parts.append(section_html) + + html_parts.append('
') + + # Footer + html_parts.append('
') + html_parts.append(f'

Generated: {self._format_timestamp()}

') + html_parts.append('
') + + html_parts.append('') + html_parts.append('') + + return '\n'.join(html_parts) + + except Exception as e: + self.logger.error(f"Error generating HTML from JSON: {str(e)}") + raise Exception(f"HTML generation failed: {str(e)}") + + async def _get_html_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]: + """Get HTML styling definitions using base template AI styling.""" + style_schema = { + "title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"}, + "heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"}, + "heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"}, + "paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"}, + "table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"}, + "table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"}, + "table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"}, + "bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"}, + "code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"}, + "image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"}, + "body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"} + } + + style_template = self._create_ai_style_template("html", user_prompt, style_schema) + styles = await 
self._get_ai_styles(ai_service, style_template, self._get_default_html_styles()) + + # Validate and fix contrast issues + return self._validate_html_styles_contrast(styles) + + def _validate_html_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]: + """Validate and fix contrast issues in AI-generated styles.""" + try: + # Fix table header contrast + if "table_header" in styles: + header = styles["table_header"] + bg_color = header.get("background", "#FFFFFF") + text_color = header.get("color", "#000000") + + # If both are white or both are dark, fix it + if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": + header["background"] = "#4F4F4F" + header["color"] = "#FFFFFF" + elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": + header["background"] = "#4F4F4F" + header["color"] = "#FFFFFF" + + # Fix table cell contrast + if "table_cell" in styles: + cell = styles["table_cell"] + bg_color = cell.get("background", "#FFFFFF") + text_color = cell.get("color", "#000000") + + # If both are white or both are dark, fix it + if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": + cell["background"] = "#FFFFFF" + cell["color"] = "#2F2F2F" + elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": + cell["background"] = "#FFFFFF" + cell["color"] = "#2F2F2F" + + return styles + + except Exception as e: + self.logger.warning(f"Style validation failed: {str(e)}") + return self._get_default_html_styles() + + + def _get_default_html_styles(self) -> Dict[str, Any]: + """Default HTML styles.""" + return { + "title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"}, + "heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"}, + "heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"}, + "paragraph": 
{"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"}, + "table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"}, + "table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"}, + "table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"}, + "bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"}, + "code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"}, + "image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"}, + "body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"} + } + + def _generate_css_styles(self, styles: Dict[str, Any]) -> str: + """Generate CSS from style definitions.""" + css_parts = [] + + # Body styles + body_style = styles.get("body", {}) + css_parts.append("body {") + for property_name, value in body_style.items(): + css_property = property_name.replace("_", "-") + css_parts.append(f" {css_property}: {value};") + css_parts.append("}") + + # Document title + title_style = styles.get("title", {}) + css_parts.append(".document-title {") + for property_name, value in title_style.items(): + css_property = property_name.replace("_", "-") + css_parts.append(f" {css_property}: {value};") + css_parts.append("}") + + # Headings + for heading_level in ["heading1", "heading2"]: + heading_style = styles.get(heading_level, {}) + css_class = f"h{heading_level[-1]}" + css_parts.append(f"{css_class} {{") + for property_name, value in heading_style.items(): + css_property = 
property_name.replace("_", "-") + css_parts.append(f" {css_property}: {value};") + css_parts.append("}") + + # Paragraphs + paragraph_style = styles.get("paragraph", {}) + css_parts.append("p {") + for property_name, value in paragraph_style.items(): + css_property = property_name.replace("_", "-") + css_parts.append(f" {css_property}: {value};") + css_parts.append("}") + + # Tables + table_style = styles.get("table", {}) + css_parts.append("table {") + for property_name, value in table_style.items(): + css_property = property_name.replace("_", "-") + css_parts.append(f" {css_property}: {value};") + css_parts.append("}") + + # Table headers + table_header_style = styles.get("table_header", {}) + css_parts.append("th {") + for property_name, value in table_header_style.items(): + css_property = property_name.replace("_", "-") + css_parts.append(f" {css_property}: {value};") + css_parts.append("}") + + # Table cells + table_cell_style = styles.get("table_cell", {}) + css_parts.append("td {") + for property_name, value in table_cell_style.items(): + css_property = property_name.replace("_", "-") + css_parts.append(f" {css_property}: {value};") + css_parts.append("}") + + # Lists + bullet_list_style = styles.get("bullet_list", {}) + css_parts.append("ul {") + for property_name, value in bullet_list_style.items(): + css_property = property_name.replace("_", "-") + css_parts.append(f" {css_property}: {value};") + css_parts.append("}") + + # Code blocks + code_block_style = styles.get("code_block", {}) + css_parts.append("pre {") + for property_name, value in code_block_style.items(): + css_property = property_name.replace("_", "-") + css_parts.append(f" {css_property}: {value};") + css_parts.append("}") + + # Images + image_style = styles.get("image", {}) + css_parts.append("img {") + for property_name, value in image_style.items(): + css_property = property_name.replace("_", "-") + css_parts.append(f" {css_property}: {value};") + css_parts.append("}") + + # Generated 
info + css_parts.append(".generated-info {") + css_parts.append(" font-size: 0.9em;") + css_parts.append(" color: #666;") + css_parts.append(" text-align: center;") + css_parts.append(" margin-top: 2em;") + css_parts.append(" padding-top: 1em;") + css_parts.append(" border-top: 1px solid #ddd;") + css_parts.append("}") + + return '\n'.join(css_parts) + + def _render_json_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str: + """Render a single JSON section to HTML using AI-generated styles.""" + try: + section_type = self._get_section_type(section) + section_data = self._get_section_data(section) + + if section_type == "table": + # Process the section data to extract table structure + processed_data = self._process_section_by_type(section) + return self._render_json_table(processed_data, styles) + elif section_type == "bullet_list": + # Process the section data to extract bullet list structure + processed_data = self._process_section_by_type(section) + return self._render_json_bullet_list(processed_data, styles) + elif section_type == "heading": + return self._render_json_heading(section_data, styles) + elif section_type == "paragraph": + return self._render_json_paragraph(section_data, styles) + elif section_type == "code_block": + # Process the section data to extract code block structure + processed_data = self._process_section_by_type(section) + return self._render_json_code_block(processed_data, styles) + elif section_type == "image": + # Process the section data to extract image structure + processed_data = self._process_section_by_type(section) + return self._render_json_image(processed_data, styles) + else: + # Fallback to paragraph for unknown types + return self._render_json_paragraph(section_data, styles) + + except Exception as e: + self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}") + return f'
[Error rendering section: {str(e)}]
' + + def _render_json_table(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> str: + """Render a JSON table to HTML using AI-generated styles.""" + try: + headers = table_data.get("headers", []) + rows = table_data.get("rows", []) + + if not headers or not rows: + return "" + + html_parts = [''] + + # Table header + html_parts.append('') + for header in headers: + html_parts.append(f'') + html_parts.append('') + + # Table body + html_parts.append('') + for row in rows: + html_parts.append('') + for cell_data in row: + html_parts.append(f'') + html_parts.append('') + html_parts.append('') + + html_parts.append('
{header}
{cell_data}
') + return '\n'.join(html_parts) + + except Exception as e: + self.logger.warning(f"Error rendering table: {str(e)}") + return "" + + def _render_json_bullet_list(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> str: + """Render a JSON bullet list to HTML using AI-generated styles.""" + try: + items = list_data.get("items", []) + + if not items: + return "" + + html_parts = ['
    '] + for item in items: + if isinstance(item, str): + html_parts.append(f'
  • {item}
  • ') + elif isinstance(item, dict) and "text" in item: + html_parts.append(f'
  • {item["text"]}
  • ') + html_parts.append('
') + + return '\n'.join(html_parts) + + except Exception as e: + self.logger.warning(f"Error rendering bullet list: {str(e)}") + return "" + + def _render_json_heading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> str: + """Render a JSON heading to HTML using AI-generated styles.""" + try: + level = heading_data.get("level", 1) + text = heading_data.get("text", "") + + if text: + level = max(1, min(6, level)) + return f'{text}' + + return "" + + except Exception as e: + self.logger.warning(f"Error rendering heading: {str(e)}") + return "" + + def _render_json_paragraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> str: + """Render a JSON paragraph to HTML using AI-generated styles.""" + try: + text = paragraph_data.get("text", "") + + if text: + return f'

{text}

' + + return "" + + except Exception as e: + self.logger.warning(f"Error rendering paragraph: {str(e)}") + return "" + + def _render_json_code_block(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> str: + """Render a JSON code block to HTML using AI-generated styles.""" + try: + code = code_data.get("code", "") + language = code_data.get("language", "") + + if code: + if language: + return f'
{code}
' + else: + return f'
{code}
' + + return "" + + except Exception as e: + self.logger.warning(f"Error rendering code block: {str(e)}") + return "" + + def _render_json_image(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> str: + """Render a JSON image to HTML.""" + try: + base64_data = image_data.get("base64Data", "") + alt_text = image_data.get("altText", "Image") + + if base64_data: + return f'{alt_text}' + + return "" + + except Exception as e: + self.logger.warning(f"Error rendering image: {str(e)}") + return f'
[Image: {image_data.get("altText", "Image")}]
' diff --git a/modules/services/serviceGeneration/renderers/rendererImage.py b/modules/services/serviceGeneration/renderers/rendererImage.py new file mode 100644 index 00000000..863a52e2 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/rendererImage.py @@ -0,0 +1,281 @@ +""" +Image renderer for report generation using AI image generation. +""" + +from .rendererBaseTemplate import BaseRenderer +from typing import Dict, Any, Tuple, List +import base64 +import logging + +logger = logging.getLogger(__name__) + +class RendererImage(BaseRenderer): + """Renders content to image format using AI image generation.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported image formats.""" + return ['png', 'jpg', 'jpeg', 'image'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['img', 'picture', 'photo', 'graphic'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for image renderer.""" + return 90 + + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """Render extracted JSON content to image format using AI image generation.""" + try: + # Generate AI image from content + image_content = await self._generate_ai_image(extracted_content, title, user_prompt, ai_service) + + return image_content, "image/png" + + except Exception as e: + self.logger.error(f"Error rendering image: {str(e)}") + # Re-raise the exception instead of using fallback + raise Exception(f"Image rendering failed: {str(e)}") + + async def _generate_ai_image(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str: + """Generate AI image from extracted content.""" + try: + if not ai_service: + raise ValueError("AI service is required for image generation") + + # Validate JSON structure + if not isinstance(extracted_content, dict): + raise ValueError("Extracted 
content must be a dictionary") + + if "sections" not in extracted_content: + raise ValueError("Extracted content must contain 'sections' field") + + # Use title from JSON metadata if available, otherwise use provided title + document_title = extracted_content.get("metadata", {}).get("title", title) + + # Create AI prompt for image generation + image_prompt = await self._create_image_generation_prompt(extracted_content, document_title, user_prompt, ai_service) + + # Generate image using AI + image_result = await ai_service.aiObjects.generateImage( + prompt=image_prompt, + size="1024x1024", + quality="standard", + style="vivid" + ) + + # Extract base64 image data from result + if image_result and image_result.get("success", False): + image_data = image_result.get("image_data", "") + if image_data: + return image_data + else: + raise ValueError("No image data returned from AI") + else: + error_msg = image_result.get("error", "Unknown error") if image_result else "No result" + raise ValueError(f"AI image generation failed: {error_msg}") + + except Exception as e: + self.logger.error(f"Error generating AI image: {str(e)}") + raise Exception(f"AI image generation failed: {str(e)}") + + async def _create_image_generation_prompt(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str: + """Create a detailed prompt for AI image generation based on the content.""" + try: + # Start with base prompt + prompt_parts = [] + + # Add user's original intent if available + if user_prompt: + prompt_parts.append(f"User Request: {user_prompt}") + + # Add document title + prompt_parts.append(f"Document Title: {title}") + + # Analyze content and create visual description + sections = extracted_content.get("sections", []) + content_description = self._analyze_content_for_visual_description(sections) + + if content_description: + prompt_parts.append(f"Content to Visualize: {content_description}") + + # Add style guidance + style_guidance = 
self._get_style_guidance_from_content(extracted_content, user_prompt) + if style_guidance: + prompt_parts.append(f"Visual Style: {style_guidance}") + + # Combine all parts + full_prompt = "Create a professional, informative image that visualizes the following content:\n\n" + "\n\n".join(prompt_parts) + + # Add technical requirements + full_prompt += "\n\nTechnical Requirements:" + full_prompt += "\n- High quality, professional appearance" + full_prompt += "\n- Clear, readable text if any text is included" + full_prompt += "\n- Appropriate colors and layout" + full_prompt += "\n- Suitable for business/professional use" + + # Truncate prompt if it exceeds DALL-E's 4000 character limit + if len(full_prompt) > 4000: + # Use AI to compress the prompt intelligently + compressed_prompt = await self._compress_prompt_with_ai(full_prompt, ai_service) + if compressed_prompt and len(compressed_prompt) <= 4000: + return compressed_prompt + + # Fallback to minimal prompt if AI compression fails or is still too long + minimal_prompt = f"Create a professional image representing: {title}" + if user_prompt: + minimal_prompt += f" - {user_prompt}" + + # If even the minimal prompt is too long, truncate it + if len(minimal_prompt) > 4000: + minimal_prompt = minimal_prompt[:3997] + "..." + + return minimal_prompt + + return full_prompt + + except Exception as e: + self.logger.warning(f"Error creating image prompt: {str(e)}") + # Fallback to simple prompt + return f"Create a professional image representing: {title}" + + async def _compress_prompt_with_ai(self, long_prompt: str, ai_service=None) -> str: + """Use AI to intelligently compress a long prompt while preserving key information.""" + try: + if not ai_service: + return None + + compression_prompt = f""" +You are an expert at creating concise, effective prompts for AI image generation. 
+ +The following prompt is too long for DALL-E (4000 character limit) and needs to be compressed to under 4000 characters while preserving the most important visual information. + +Original prompt ({len(long_prompt)} characters): +{long_prompt} + +Please create a compressed version that: +1. Keeps the most important visual elements and requirements +2. Maintains the core intent and style guidance +3. Preserves technical requirements +4. Stays under 4000 characters +5. Is optimized for DALL-E image generation + +Return only the compressed prompt, no explanations. +""" + + # Use AI to compress the prompt - call the AI service correctly + # The ai_service has an aiObjects attribute that contains the actual AI interface + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + + request = AiCallRequest( + prompt=compression_prompt, + options=AiCallOptions( + operationType=OperationType.GENERAL, + maxTokens=2000, + temperature=0.3 # Lower temperature for more consistent compression + ) + ) + + response = await ai_service.aiObjects.call(request) + compressed = response.content.strip() + + # Validate the compressed prompt + if compressed and len(compressed) <= 4000 and len(compressed) > 50: + self.logger.info(f"Successfully compressed prompt from {len(long_prompt)} to {len(compressed)} characters") + return compressed + else: + self.logger.warning(f"AI compression failed or produced invalid result: {len(compressed) if compressed else 0} chars") + return None + + except Exception as e: + self.logger.warning(f"Error compressing prompt with AI: {str(e)}") + return None + + def _analyze_content_for_visual_description(self, sections: List[Dict[str, Any]]) -> str: + """Analyze content sections and create a visual description for AI.""" + try: + descriptions = [] + + for section in sections: + section_type = self._get_section_type(section) + section_data = self._get_section_data(section) + + if section_type == "table": + headers = 
section_data.get("headers", []) + rows = section_data.get("rows", []) + if headers and rows: + descriptions.append(f"Data table with {len(headers)} columns and {len(rows)} rows: {', '.join(headers)}") + + elif section_type == "bullet_list": + items = section_data.get("items", []) + if items: + descriptions.append(f"List with {len(items)} items") + + elif section_type == "heading": + text = section_data.get("text", "") + level = section_data.get("level", 1) + if text: + descriptions.append(f"Heading {level}: {text}") + + elif section_type == "paragraph": + text = section_data.get("text", "") + if text and len(text) > 10: # Only include substantial paragraphs + # Truncate long text + truncated = text[:100] + "..." if len(text) > 100 else text + descriptions.append(f"Text content: {truncated}") + + elif section_type == "code_block": + code = section_data.get("code", "") + language = section_data.get("language", "") + if code: + descriptions.append(f"Code block ({language}): {code[:50]}...") + + return "; ".join(descriptions) if descriptions else "General document content" + + except Exception as e: + self.logger.warning(f"Error analyzing content: {str(e)}") + return "Document content" + + def _get_style_guidance_from_content(self, extracted_content: Dict[str, Any], user_prompt: str = None) -> str: + """Determine visual style guidance based on content and user prompt.""" + try: + style_elements = [] + + # Analyze user prompt for style hints + if user_prompt: + prompt_lower = user_prompt.lower() + + if any(word in prompt_lower for word in ["modern", "contemporary", "sleek"]): + style_elements.append("modern, clean design") + elif any(word in prompt_lower for word in ["classic", "traditional", "formal"]): + style_elements.append("classic, formal design") + elif any(word in prompt_lower for word in ["creative", "artistic", "colorful"]): + style_elements.append("creative, artistic design") + elif any(word in prompt_lower for word in ["corporate", "business", 
"professional"]): + style_elements.append("corporate, professional design") + + # Analyze content type for additional style hints + sections = extracted_content.get("sections", []) + has_tables = any(self._get_section_type(s) == "table" for s in sections) + has_lists = any(self._get_section_type(s) == "bullet_list" for s in sections) + has_code = any(self._get_section_type(s) == "code_block" for s in sections) + + if has_tables: + style_elements.append("data-focused layout") + if has_lists: + style_elements.append("organized, structured presentation") + if has_code: + style_elements.append("technical, developer-friendly") + + # Default style if no specific guidance + if not style_elements: + style_elements.append("professional, clean design") + + return ", ".join(style_elements) + + except Exception as e: + self.logger.warning(f"Error determining style guidance: {str(e)}") + return "professional design" diff --git a/modules/services/serviceGeneration/renderers/rendererJson.py b/modules/services/serviceGeneration/renderers/rendererJson.py new file mode 100644 index 00000000..2ff07ad6 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/rendererJson.py @@ -0,0 +1,79 @@ +""" +JSON renderer for report generation. 
+""" + +from .rendererBaseTemplate import BaseRenderer +from typing import Dict, Any, Tuple, List +import json + +class RendererJson(BaseRenderer): + """Renders content to JSON format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported JSON formats.""" + return ['json'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['data'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for JSON renderer.""" + return 80 + + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """Render extracted JSON content to JSON format.""" + try: + # The extracted content should already be JSON from the AI + # Just validate and format it + json_content = self._clean_json_content(extracted_content, title) + + return json_content, "application/json" + + except Exception as e: + self.logger.error(f"Error rendering JSON: {str(e)}") + # Return minimal JSON fallback + fallback_data = { + "title": title, + "sections": [{"content_type": "paragraph", "elements": [{"text": f"Error rendering report: {str(e)}"}]}], + "metadata": {"error": str(e)} + } + return json.dumps(fallback_data, indent=2), "application/json" + + def _clean_json_content(self, content: Dict[str, Any], title: str) -> str: + """Clean and validate JSON content from AI.""" + try: + # Validate JSON structure + if not isinstance(content, dict): + raise ValueError("Content must be a dictionary") + + # Ensure it has the expected structure + if "sections" not in content: + # Convert old format to new format + content = { + "sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}], + "metadata": {"title": title} + } + + # Ensure metadata exists + if "metadata" not in content: + content["metadata"] = {} + + # Set title in metadata if not present + if "title" not in content["metadata"]: + 
content["metadata"]["title"] = title + + # Re-format with proper indentation + return json.dumps(content, indent=2, ensure_ascii=False) + + except Exception as e: + self.logger.warning(f"Error cleaning JSON content: {str(e)}") + # Return minimal valid JSON + fallback_data = { + "sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}], + "metadata": {"title": title, "error": str(e)} + } + return json.dumps(fallback_data, indent=2, ensure_ascii=False) diff --git a/modules/services/serviceGeneration/renderers/rendererMarkdown.py b/modules/services/serviceGeneration/renderers/rendererMarkdown.py new file mode 100644 index 00000000..59806d4c --- /dev/null +++ b/modules/services/serviceGeneration/renderers/rendererMarkdown.py @@ -0,0 +1,221 @@ +""" +Markdown renderer for report generation. +""" + +from .rendererBaseTemplate import BaseRenderer +from typing import Dict, Any, Tuple, List + +class RendererMarkdown(BaseRenderer): + """Renders content to Markdown format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported Markdown formats.""" + return ['md', 'markdown'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['mdown', 'mkd'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for markdown renderer.""" + return 95 + + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """Render extracted JSON content to Markdown format.""" + try: + # Generate markdown from JSON structure + markdown_content = self._generate_markdown_from_json(extracted_content, title) + + return markdown_content, "text/markdown" + + except Exception as e: + self.logger.error(f"Error rendering markdown: {str(e)}") + # Return minimal markdown fallback + return f"# {title}\n\nError rendering report: {str(e)}", "text/markdown" + + def 
_generate_markdown_from_json(self, json_content: Dict[str, Any], title: str) -> str: + """Generate markdown content from structured JSON document.""" + try: + # Validate JSON structure + if not isinstance(json_content, dict): + raise ValueError("JSON content must be a dictionary") + + if "sections" not in json_content: + raise ValueError("JSON content must contain 'sections' field") + + # Use title from JSON metadata if available, otherwise use provided title + document_title = json_content.get("metadata", {}).get("title", title) + + # Build markdown content + markdown_parts = [] + + # Document title + markdown_parts.append(f"# {document_title}") + markdown_parts.append("") + + # Process each section + sections = json_content.get("sections", []) + for section in sections: + section_markdown = self._render_json_section(section) + if section_markdown: + markdown_parts.append(section_markdown) + markdown_parts.append("") # Add spacing between sections + + # Add generation info + markdown_parts.append("---") + markdown_parts.append(f"*Generated: {self._format_timestamp()}*") + + return '\n'.join(markdown_parts) + + except Exception as e: + self.logger.error(f"Error generating markdown from JSON: {str(e)}") + raise Exception(f"Markdown generation failed: {str(e)}") + + def _render_json_section(self, section: Dict[str, Any]) -> str: + """Render a single JSON section to markdown.""" + try: + section_type = self._get_section_type(section) + section_data = self._get_section_data(section) + + if section_type == "table": + # Process the section data to extract table structure + processed_data = self._process_section_by_type(section) + return self._render_json_table(processed_data) + elif section_type == "bullet_list": + # Process the section data to extract bullet list structure + processed_data = self._process_section_by_type(section) + return self._render_json_bullet_list(processed_data) + elif section_type == "heading": + return self._render_json_heading(section_data) + 
elif section_type == "paragraph": + return self._render_json_paragraph(section_data) + elif section_type == "code_block": + # Process the section data to extract code block structure + processed_data = self._process_section_by_type(section) + return self._render_json_code_block(processed_data) + elif section_type == "image": + # Process the section data to extract image structure + processed_data = self._process_section_by_type(section) + return self._render_json_image(processed_data) + else: + # Fallback to paragraph for unknown types + return self._render_json_paragraph(section_data) + + except Exception as e: + self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}") + return f"*[Error rendering section: {str(e)}]*" + + def _render_json_table(self, table_data: Dict[str, Any]) -> str: + """Render a JSON table to markdown.""" + try: + headers = table_data.get("headers", []) + rows = table_data.get("rows", []) + + if not headers or not rows: + return "" + + markdown_parts = [] + + # Create table header + header_line = " | ".join(str(header) for header in headers) + markdown_parts.append(header_line) + + # Add separator line + separator_line = " | ".join("---" for _ in headers) + markdown_parts.append(separator_line) + + # Add data rows + for row in rows: + row_line = " | ".join(str(cell_data) for cell_data in row) + markdown_parts.append(row_line) + + return '\n'.join(markdown_parts) + + except Exception as e: + self.logger.warning(f"Error rendering table: {str(e)}") + return "" + + def _render_json_bullet_list(self, list_data: Dict[str, Any]) -> str: + """Render a JSON bullet list to markdown.""" + try: + items = list_data.get("items", []) + + if not items: + return "" + + markdown_parts = [] + for item in items: + if isinstance(item, str): + markdown_parts.append(f"- {item}") + elif isinstance(item, dict) and "text" in item: + markdown_parts.append(f"- {item['text']}") + + return '\n'.join(markdown_parts) + + except Exception as 
e: + self.logger.warning(f"Error rendering bullet list: {str(e)}") + return "" + + def _render_json_heading(self, heading_data: Dict[str, Any]) -> str: + """Render a JSON heading to markdown.""" + try: + level = heading_data.get("level", 1) + text = heading_data.get("text", "") + + if text: + level = max(1, min(6, level)) + return f"{'#' * level} {text}" + + return "" + + except Exception as e: + self.logger.warning(f"Error rendering heading: {str(e)}") + return "" + + def _render_json_paragraph(self, paragraph_data: Dict[str, Any]) -> str: + """Render a JSON paragraph to markdown.""" + try: + text = paragraph_data.get("text", "") + return text if text else "" + + except Exception as e: + self.logger.warning(f"Error rendering paragraph: {str(e)}") + return "" + + def _render_json_code_block(self, code_data: Dict[str, Any]) -> str: + """Render a JSON code block to markdown.""" + try: + code = code_data.get("code", "") + language = code_data.get("language", "") + + if code: + if language: + return f"```{language}\n{code}\n```" + else: + return f"```\n{code}\n```" + + return "" + + except Exception as e: + self.logger.warning(f"Error rendering code block: {str(e)}") + return "" + + def _render_json_image(self, image_data: Dict[str, Any]) -> str: + """Render a JSON image to markdown.""" + try: + alt_text = image_data.get("altText", "Image") + base64_data = image_data.get("base64Data", "") + + if base64_data: + # For base64 images, we can't embed them directly in markdown + # So we'll use a placeholder with the alt text + return f"![{alt_text}](data:image/png;base64,{base64_data[:50]}...)" + else: + return f"![{alt_text}](image-placeholder)" + + except Exception as e: + self.logger.warning(f"Error rendering image: {str(e)}") + return f"![{image_data.get('altText', 'Image')}](image-error)" diff --git a/modules/services/serviceGeneration/renderers/rendererPdf.py b/modules/services/serviceGeneration/renderers/rendererPdf.py new file mode 100644 index 00000000..dc3195ae --- 
/dev/null +++ b/modules/services/serviceGeneration/renderers/rendererPdf.py @@ -0,0 +1,642 @@ +""" +PDF renderer for report generation using reportlab. +""" + +from .rendererBaseTemplate import BaseRenderer +from typing import Dict, Any, Tuple, List +import io +import base64 +from datetime import datetime, UTC + +try: + from reportlab.lib.pagesizes import letter, A4 + from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak + from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle + from reportlab.lib.units import inch + from reportlab.lib import colors + from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY + REPORTLAB_AVAILABLE = True +except ImportError: + REPORTLAB_AVAILABLE = False + +class RendererPdf(BaseRenderer): + """Renders content to PDF format using reportlab.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported PDF formats.""" + return ['pdf'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['document', 'print'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for PDF renderer.""" + return 120 + + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """Render extracted JSON content to PDF format using AI-analyzed styling.""" + try: + if not REPORTLAB_AVAILABLE: + # Fallback to HTML if reportlab not available + from .rendererHtml import RendererHtml + html_renderer = RendererHtml() + html_content, _ = await html_renderer.render(extracted_content, title, user_prompt, ai_service) + return html_content, "text/html" + + # Generate PDF using AI-analyzed styling + pdf_content = await self._generate_pdf_from_json(extracted_content, title, user_prompt, ai_service) + + return pdf_content, "application/pdf" + + except Exception as e: + self.logger.error(f"Error rendering PDF: {str(e)}") + # Return minimal 
fallback + return f"PDF Generation Error: {str(e)}", "text/plain" + + async def _generate_pdf_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str: + """Generate PDF content from structured JSON document using AI-generated styling.""" + try: + # Get AI-generated styling definitions + styles = await self._get_pdf_styles(user_prompt, ai_service) + + # Validate JSON structure + if not isinstance(json_content, dict): + raise ValueError("JSON content must be a dictionary") + + if "sections" not in json_content: + raise ValueError("JSON content must contain 'sections' field") + + # Use title from JSON metadata if available, otherwise use provided title + document_title = json_content.get("metadata", {}).get("title", title) + + # Make title shorter to prevent wrapping/overlapping + if len(document_title) > 40: + document_title = "PowerOn - Consent Agreement" + + # Create a buffer to hold the PDF + buffer = io.BytesIO() + + # Create PDF document + doc = SimpleDocTemplate( + buffer, + pagesize=A4, + rightMargin=72, + leftMargin=72, + topMargin=72, + bottomMargin=18 + ) + + # Build PDF content + story = [] + + # Title page + title_style = self._create_title_style(styles) + story.append(Paragraph(document_title, title_style)) + story.append(Spacer(1, 50)) # Increased spacing to prevent overlap + story.append(Paragraph(f"Generated: {self._format_timestamp()}", self._create_normal_style(styles))) + story.append(Spacer(1, 30)) # Add spacing before page break + story.append(PageBreak()) + + # Process each section + sections = json_content.get("sections", []) + self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER") + for i, section in enumerate(sections): + self.services.utils.debugLogToFile(f"PDF SECTION {i}: content_type={section.get('content_type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER") + section_elements = self._render_json_section(section, styles) 
+ self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER") + story.extend(section_elements) + + # Build PDF + doc.build(story) + + # Get PDF content as base64 + buffer.seek(0) + pdf_bytes = buffer.getvalue() + pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') + + return pdf_base64 + + except Exception as e: + self.logger.error(f"Error generating PDF from JSON: {str(e)}") + raise Exception(f"PDF generation failed: {str(e)}") + + async def _get_pdf_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]: + """Get PDF styling definitions using base template AI styling.""" + style_schema = { + "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30}, + "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12}, + "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8}, + "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2}, + "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center", "font_size": 12}, + "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10}, + "bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3}, + "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6} + } + + style_template = self._create_ai_style_template("pdf", user_prompt, style_schema) + + # Use base template method like DOCX does (this works!) 
+ styles = await self._get_ai_styles(ai_service, style_template, self._get_default_pdf_styles()) + + if styles is None: + return self._get_default_pdf_styles() + + # Convert colors to PDF format after getting styles + styles = self._convert_colors_format(styles) + + # Validate and fix contrast issues + return self._validate_pdf_styles_contrast(styles) + + async def _get_ai_styles_with_pdf_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]: + """Get AI styles with proper PDF color conversion.""" + if not ai_service: + return default_styles + + try: + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + request = AiCallRequest(prompt=style_template, context="", options=request_options) + + # Check if AI service is properly configured + if not hasattr(ai_service, 'aiObjects') or not ai_service.aiObjects: + self.logger.warning("AI service not properly configured, using defaults") + return default_styles + + response = await ai_service.aiObjects.call(request) + + # Check if response is valid + if not response: + self.logger.warning("AI service returned no response, using defaults") + return default_styles + + import json + import re + + # Clean and parse JSON + result = response.content.strip() if response and response.content else "" + + # Check if result is empty + if not result: + self.logger.warning("AI styling returned empty response, using defaults") + return default_styles + + # Log the raw response for debugging + self.logger.debug(f"AI styling raw response: {result[:200]}...") + + # Extract JSON from various formats + json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) + if json_match: + result = json_match.group(1).strip() + elif result.startswith('```json'): + result = re.sub(r'^```json\s*', '', result) + result = re.sub(r'\s*```$', '', result) + elif 
result.startswith('```'): + result = re.sub(r'^```\s*', '', result) + result = re.sub(r'\s*```$', '', result) + + # Try to extract JSON from explanatory text + json_patterns = [ + r'\{[^{}]*"title"[^{}]*\}', # Simple JSON object + r'\{.*?"title".*?\}', # JSON with title field + r'\{.*?"font_size".*?\}', # JSON with font_size field + ] + + for pattern in json_patterns: + json_match = re.search(pattern, result, re.DOTALL) + if json_match: + result = json_match.group(0) + break + + # Additional cleanup - remove any leading/trailing whitespace and newlines + result = result.strip() + + # Check if result is still empty after cleanup + if not result: + self.logger.warning("AI styling returned empty content after cleanup, using defaults") + return default_styles + + # Try to parse JSON + try: + styles = json.loads(result) + self.logger.debug(f"Successfully parsed AI styles: {list(styles.keys())}") + except json.JSONDecodeError as json_error: + self.logger.warning(f"AI styling returned invalid JSON: {json_error}") + + # Use print instead of logger to avoid truncation + self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "PDF_RENDERER") + self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "PDF_RENDERER") + + self.logger.warning(f"Raw content that failed to parse: {result}") + + # Try to fix incomplete JSON by adding missing closing braces + open_braces = result.count('{') + close_braces = result.count('}') + + if open_braces > close_braces: + # JSON is incomplete, add missing closing braces + missing_braces = open_braces - close_braces + result = result + '}' * missing_braces + self.logger.info(f"Added {missing_braces} missing closing brace(s)") + + # Try parsing the fixed JSON + try: + styles = json.loads(result) + self.logger.info("Successfully fixed incomplete JSON") + except json.JSONDecodeError as fix_error: + self.logger.warning(f"Fixed JSON still invalid: {fix_error}") + # Try to extract just the JSON 
part if it's embedded in text + json_start = result.find('{') + json_end = result.rfind('}') + if json_start != -1 and json_end != -1 and json_end > json_start: + json_part = result[json_start:json_end+1] + try: + styles = json.loads(json_part) + self.logger.info("Successfully extracted JSON from explanatory text") + except json.JSONDecodeError: + self.logger.warning("Could not extract valid JSON from response, using defaults") + return default_styles + else: + return default_styles + else: + # Try to extract just the JSON part if it's embedded in text + json_start = result.find('{') + json_end = result.rfind('}') + if json_start != -1 and json_end != -1 and json_end > json_start: + json_part = result[json_start:json_end+1] + try: + styles = json.loads(json_part) + self.logger.info("Successfully extracted JSON from explanatory text") + except json.JSONDecodeError: + self.logger.warning("Could not extract valid JSON from response, using defaults") + return default_styles + else: + return default_styles + + # Convert colors to PDF format (keep as hex strings, PDF renderer will convert them) + styles = self._convert_colors_format(styles) + + return styles + + except Exception as e: + self.logger.warning(f"AI styling failed: {str(e)}, using defaults") + return default_styles + + def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]: + """Convert colors to proper format for PDF compatibility.""" + try: + for style_name, style_config in styles.items(): + if isinstance(style_config, dict): + for prop, value in style_config.items(): + if isinstance(value, str) and value.startswith('#') and len(value) == 7: + # Convert #RRGGBB to #AARRGGBB (add FF alpha channel) for consistency + styles[style_name][prop] = f"FF{value[1:]}" + elif isinstance(value, str) and value.startswith('#') and len(value) == 9: + # Already aRGB format, keep as is + pass + return styles + except Exception as e: + self.logger.warning(f"Color conversion failed: {str(e)}") + return 
styles + + def _get_safe_color(self, color_value: str, default: str = "#000000") -> str: + """Get a safe hex color value for PDF.""" + if isinstance(color_value, str) and color_value.startswith('#'): + if len(color_value) == 7: + return f"FF{color_value[1:]}" + elif len(color_value) == 9: + return color_value + return default + + def _validate_pdf_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]: + """Validate and fix contrast issues in AI-generated styles.""" + try: + # Fix table header contrast + if "table_header" in styles: + header = styles["table_header"] + bg_color = header.get("background", "#FFFFFF") + text_color = header.get("text_color", "#000000") + + # If both are white or both are dark, fix it + if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": + header["background"] = "#4F4F4F" + header["text_color"] = "#FFFFFF" + elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": + header["background"] = "#4F4F4F" + header["text_color"] = "#FFFFFF" + + # Fix table cell contrast + if "table_cell" in styles: + cell = styles["table_cell"] + bg_color = cell.get("background", "#FFFFFF") + text_color = cell.get("text_color", "#000000") + + # If both are white or both are dark, fix it + if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": + cell["background"] = "#FFFFFF" + cell["text_color"] = "#2F2F2F" + elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": + cell["background"] = "#FFFFFF" + cell["text_color"] = "#2F2F2F" + + return styles + + except Exception as e: + self.logger.warning(f"Style validation failed: {str(e)}") + return self._get_default_pdf_styles() + + def _get_default_pdf_styles(self) -> Dict[str, Any]: + """Default PDF styles.""" + return { + "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30}, + "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12}, + 
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8}, + "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2}, + "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center", "font_size": 12}, + "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10}, + "bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3}, + "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6} + } + + def _create_title_style(self, styles: Dict[str, Any]) -> ParagraphStyle: + """Create title style from style definitions.""" + title_style_def = styles.get("title", {}) + + # DEBUG: Show what color and spacing is being used for title + title_color = title_style_def.get("color", "#1F4E79") + title_space_after = title_style_def.get("space_after", 30) + self.services.utils.debugLogToFile(f"PDF TITLE COLOR: {title_color} -> {self._hex_to_color(title_color)}", "PDF_RENDERER") + self.services.utils.debugLogToFile(f"PDF TITLE SPACE_AFTER: {title_space_after}", "PDF_RENDERER") + + return ParagraphStyle( + 'CustomTitle', + fontSize=title_style_def.get("font_size", 20), # Reduced from 24 to 20 + spaceAfter=title_style_def.get("space_after", 30), + alignment=self._get_alignment(title_style_def.get("align", "center")), + textColor=self._hex_to_color(title_color), + leading=title_style_def.get("font_size", 20) * 1.4, # Add line spacing for multi-line titles + spaceBefore=0 # Ensure no space before title + ) + + def _create_heading_style(self, styles: Dict[str, Any], level: int) -> ParagraphStyle: + """Create heading style from style definitions.""" + heading_key = f"heading{level}" + heading_style_def = styles.get(heading_key, styles.get("heading1", {})) + + return ParagraphStyle( + f'CustomHeading{level}', + 
fontSize=heading_style_def.get("font_size", 18 - level * 2), + spaceAfter=heading_style_def.get("space_after", 12), + spaceBefore=heading_style_def.get("space_before", 12), + alignment=self._get_alignment(heading_style_def.get("align", "left")), + textColor=self._hex_to_color(heading_style_def.get("color", "#2F2F2F")) + ) + + def _create_normal_style(self, styles: Dict[str, Any]) -> ParagraphStyle: + """Create normal paragraph style from style definitions.""" + paragraph_style_def = styles.get("paragraph", {}) + + return ParagraphStyle( + 'CustomNormal', + fontSize=paragraph_style_def.get("font_size", 11), + spaceAfter=paragraph_style_def.get("space_after", 6), + alignment=self._get_alignment(paragraph_style_def.get("align", "left")), + textColor=self._hex_to_color(paragraph_style_def.get("color", "#2F2F2F")), + leading=paragraph_style_def.get("line_height", 1.2) * paragraph_style_def.get("font_size", 11) + ) + + def _get_alignment(self, align: str) -> int: + """Convert alignment string to reportlab alignment constant.""" + if not align or not isinstance(align, str): + return TA_LEFT + + align_map = { + "center": TA_CENTER, + "left": TA_LEFT, + "justify": TA_JUSTIFY, + "right": TA_LEFT, # ReportLab doesn't have TA_RIGHT, use LEFT as fallback + "0": TA_LEFT, # Handle numeric strings + "1": TA_CENTER, + "2": TA_JUSTIFY + } + return align_map.get(align.lower().strip(), TA_LEFT) + + def _get_table_alignment(self, align: str) -> str: + """Convert alignment string to ReportLab table alignment string.""" + if not align or not isinstance(align, str): + return 'LEFT' + + align_map = { + "center": 'CENTER', + "left": 'LEFT', + "justify": 'LEFT', # Tables don't support justify, use LEFT + "right": 'RIGHT', + "0": 'LEFT', # Handle numeric strings + "1": 'CENTER', + "2": 'LEFT' # Tables don't support justify, use LEFT + } + return align_map.get(align.lower().strip(), 'LEFT') + + def _hex_to_color(self, hex_color: str) -> colors.Color: + """Convert hex color to reportlab 
color.""" + try: + hex_color = hex_color.lstrip('#') + + # Handle aRGB format (8 characters: FF + RGB) + if len(hex_color) == 8: + # Skip the alpha channel (first 2 characters) + hex_color = hex_color[2:] + + # Handle RGB format (6 characters) + if len(hex_color) == 6: + r = int(hex_color[0:2], 16) / 255.0 + g = int(hex_color[2:4], 16) / 255.0 + b = int(hex_color[4:6], 16) / 255.0 + return colors.Color(r, g, b) + + # Fallback for other formats + return colors.black + except: + return colors.black + + def _render_json_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: + """Render a single JSON section to PDF elements using AI-generated styles.""" + try: + section_type = self._get_section_type(section) + elements = self._get_section_data(section) + + # Process each element in the section + all_elements = [] + for element in elements: + if section_type == "table": + all_elements.extend(self._render_json_table(element, styles)) + elif section_type == "bullet_list": + all_elements.extend(self._render_json_bullet_list(element, styles)) + elif section_type == "heading": + all_elements.extend(self._render_json_heading(element, styles)) + elif section_type == "paragraph": + all_elements.extend(self._render_json_paragraph(element, styles)) + elif section_type == "code_block": + all_elements.extend(self._render_json_code_block(element, styles)) + elif section_type == "image": + all_elements.extend(self._render_json_image(element, styles)) + else: + # Fallback to paragraph for unknown types + all_elements.extend(self._render_json_paragraph(element, styles)) + + return all_elements + + except Exception as e: + self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}") + return [Paragraph(f"[Error rendering section: {str(e)}]", self._create_normal_style(styles))] + + def _render_json_table(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: + """Render a JSON table to PDF elements using AI-generated 
styles.""" + try: + headers = table_data.get("headers", []) + rows = table_data.get("rows", []) + + if not headers or not rows: + return [] + + # Prepare table data + table_data_list = [headers] + rows + + # Create table + table = Table(table_data_list) + + # Apply styling + table_header_style = styles.get("table_header", {}) + table_cell_style = styles.get("table_cell", {}) + + table_style = [ + ('BACKGROUND', (0, 0), (-1, 0), self._hex_to_color(table_header_style.get("background", "#4F4F4F"))), + ('TEXTCOLOR', (0, 0), (-1, 0), self._hex_to_color(table_header_style.get("text_color", "#FFFFFF"))), + ('ALIGN', (0, 0), (-1, -1), self._get_table_alignment(table_cell_style.get("align", "left"))), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold' if table_header_style.get("bold", True) else 'Helvetica'), + ('FONTSIZE', (0, 0), (-1, 0), table_header_style.get("font_size", 12)), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), self._hex_to_color(table_cell_style.get("background", "#FFFFFF"))), + ('FONTSIZE', (0, 1), (-1, -1), table_cell_style.get("font_size", 10)), + ('GRID', (0, 0), (-1, -1), 1, colors.black) + ] + + table.setStyle(TableStyle(table_style)) + + return [table, Spacer(1, 12)] + + except Exception as e: + self.logger.warning(f"Error rendering table: {str(e)}") + return [] + + def _render_json_bullet_list(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: + """Render a JSON bullet list to PDF elements using AI-generated styles.""" + try: + items = list_data.get("items", []) + bullet_style_def = styles.get("bullet_list", {}) + + elements = [] + for item in items: + if isinstance(item, str): + elements.append(Paragraph(f"β€’ {item}", self._create_normal_style(styles))) + elif isinstance(item, dict) and "text" in item: + elements.append(Paragraph(f"β€’ {item['text']}", self._create_normal_style(styles))) + + if elements: + elements.append(Spacer(1, bullet_style_def.get("space_after", 3))) + + return elements + + 
except Exception as e: + self.logger.warning(f"Error rendering bullet list: {str(e)}") + return [] + + def _render_json_heading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: + """Render a JSON heading to PDF elements using AI-generated styles.""" + try: + level = heading_data.get("level", 1) + text = heading_data.get("text", "") + + if text: + level = max(1, min(6, level)) + heading_style = self._create_heading_style(styles, level) + return [Paragraph(text, heading_style)] + + return [] + + except Exception as e: + self.logger.warning(f"Error rendering heading: {str(e)}") + return [] + + def _render_json_paragraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: + """Render a JSON paragraph to PDF elements using AI-generated styles.""" + try: + text = paragraph_data.get("text", "") + + if text: + return [Paragraph(text, self._create_normal_style(styles))] + + return [] + + except Exception as e: + self.logger.warning(f"Error rendering paragraph: {str(e)}") + return [] + + def _render_json_code_block(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: + """Render a JSON code block to PDF elements using AI-generated styles.""" + try: + code = code_data.get("code", "") + language = code_data.get("language", "") + code_style_def = styles.get("code_block", {}) + + if code: + elements = [] + + if language: + lang_style = ParagraphStyle( + 'CodeLanguage', + fontSize=code_style_def.get("font_size", 9), + textColor=self._hex_to_color(code_style_def.get("color", "#2F2F2F")), + fontName='Helvetica-Bold' + ) + elements.append(Paragraph(f"Code ({language}):", lang_style)) + + code_style = ParagraphStyle( + 'CodeBlock', + fontSize=code_style_def.get("font_size", 9), + textColor=self._hex_to_color(code_style_def.get("color", "#2F2F2F")), + fontName=code_style_def.get("font", "Courier"), + backColor=self._hex_to_color(code_style_def.get("background", "#F5F5F5")), + 
spaceAfter=code_style_def.get("space_after", 6) + ) + elements.append(Paragraph(code, code_style)) + + return elements + + return [] + + except Exception as e: + self.logger.warning(f"Error rendering code block: {str(e)}") + return [] + + def _render_json_image(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: + """Render a JSON image to PDF elements.""" + try: + base64_data = image_data.get("base64Data", "") + alt_text = image_data.get("altText", "Image") + + if base64_data: + # For now, just add a placeholder since reportlab image handling is complex + return [Paragraph(f"[Image: {alt_text}]", self._create_normal_style(styles))] + + return [] + + except Exception as e: + self.logger.warning(f"Error rendering image: {str(e)}") + return [Paragraph(f"[Image: {image_data.get('altText', 'Image')}]", self._create_normal_style(styles))] \ No newline at end of file diff --git a/modules/services/serviceGeneration/renderers/rendererPptx.py b/modules/services/serviceGeneration/renderers/rendererPptx.py new file mode 100644 index 00000000..26c707ca --- /dev/null +++ b/modules/services/serviceGeneration/renderers/rendererPptx.py @@ -0,0 +1,885 @@ +import logging +import base64 +import io +from typing import Dict, Any, Optional, Tuple, List +from .rendererBaseTemplate import BaseRenderer + +logger = logging.getLogger(__name__) + + +class RendererPptx(BaseRenderer): + """Renderer for PowerPoint (.pptx) files using python-pptx library.""" + + def __init__(self): + super().__init__() + self.supported_formats = ["pptx", "ppt"] + self.output_mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation" + + @classmethod + def get_supported_formats(cls) -> list: + """Get list of supported output formats.""" + return ["pptx", "ppt"] + + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """ + Render content as PowerPoint presentation from JSON data. 
+ + Args: + extracted_content: JSON content to render as presentation + title: Title for the presentation + user_prompt: User prompt for AI styling + ai_service: AI service for styling + **kwargs: Additional rendering options + + Returns: + Base64-encoded PowerPoint presentation as string + """ + try: + # Import python-pptx + from pptx import Presentation + from pptx.util import Inches, Pt + from pptx.enum.text import PP_ALIGN + from pptx.dml.color import RGBColor + import re + + # Get AI-generated styling definitions first + styles = await self._get_pptx_styles(user_prompt, ai_service) + + # Create new presentation + prs = Presentation() + + # Set slide size based on user intent (default to 16:9) + slide_size = styles.get("slide_size", "16:9") + if slide_size == "4:3": + prs.slide_width = Inches(10) + prs.slide_height = Inches(7.5) + else: # Default to 16:9 + prs.slide_width = Inches(13.33) + prs.slide_height = Inches(7.5) + + # Generate slides from JSON content + slides_data = await self._parse_json_to_slides(extracted_content, title, styles) + logger.info(f"Parsed {len(slides_data)} slides from JSON content") + + # Debug: Show first 200 chars of content + logger.info(f"JSON content preview: {str(extracted_content)[:200]}...") + + for i, slide_data in enumerate(slides_data): + logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - {len(slide_data.get('content', ''))} chars") + # Debug: Show slide content preview + slide_content = slide_data.get('content', '') + if slide_content: + logger.info(f" Content preview: '{slide_content[:100]}...'") + else: + logger.warning(f" ⚠️ Slide {i+1} has NO content!") + + # Create slide with appropriate layout based on content + slide_layout_index = self._get_slide_layout_index(slide_data, styles) + slide_layout = prs.slide_layouts[slide_layout_index] + slide = prs.slides.add_slide(slide_layout) + + # Set title with AI-generated styling + title_shape = slide.shapes.title + title_shape.text = slide_data.get("title", 
"Slide") + + # Apply title styling + title_style = styles.get("title", {}) + if title_shape.text_frame.paragraphs[0].font: + title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 44)) + title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True) + title_color = self._get_safe_color(title_style.get("color", (31, 78, 121))) + title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color) + + # Set content with AI-generated styling + content_shape = slide.placeholders[1] + content_text = slide_data.get("content", "") + + # Format content text with AI styles + text_frame = content_shape.text_frame + text_frame.clear() + + # Split content into paragraphs + paragraphs = content_text.split('\n\n') + + for i, paragraph in enumerate(paragraphs): + if paragraph.strip(): + if i == 0: + p = text_frame.paragraphs[0] + else: + p = text_frame.add_paragraph() + + p.text = paragraph.strip() + + # Apply AI-generated styling based on content type + if paragraph.startswith('#'): + # Header + p.text = paragraph.lstrip('#').strip() + heading_style = styles.get("heading", {}) + p.font.size = Pt(heading_style.get("font_size", 32)) + p.font.bold = heading_style.get("bold", True) + heading_color = self._get_safe_color(heading_style.get("color", (47, 47, 47))) + p.font.color.rgb = RGBColor(*heading_color) + elif paragraph.startswith('##'): + # Subheader + p.text = paragraph.lstrip('#').strip() + subheading_style = styles.get("subheading", {}) + p.font.size = Pt(subheading_style.get("font_size", 24)) + p.font.bold = subheading_style.get("bold", True) + subheading_color = self._get_safe_color(subheading_style.get("color", (79, 79, 79))) + p.font.color.rgb = RGBColor(*subheading_color) + elif paragraph.startswith('*') and paragraph.endswith('*'): + # Bold text + p.text = paragraph.strip('*') + paragraph_style = styles.get("paragraph", {}) + p.font.size = Pt(paragraph_style.get("font_size", 18)) + p.font.bold = True + paragraph_color = 
self._get_safe_color(paragraph_style.get("color", (47, 47, 47))) + p.font.color.rgb = RGBColor(*paragraph_color) + else: + # Regular text + paragraph_style = styles.get("paragraph", {}) + p.font.size = Pt(paragraph_style.get("font_size", 18)) + p.font.bold = paragraph_style.get("bold", False) + paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47))) + p.font.color.rgb = RGBColor(*paragraph_color) + + # Apply alignment + align = paragraph_style.get("align", "left") + if align == "center": + p.alignment = PP_ALIGN.CENTER + elif align == "right": + p.alignment = PP_ALIGN.RIGHT + else: + p.alignment = PP_ALIGN.LEFT + + # If no slides were created, create a default slide + if not slides_data: + slide_layout = prs.slide_layouts[0] # Title slide layout + slide = prs.slides.add_slide(slide_layout) + + title_shape = slide.shapes.title + title_shape.text = title + + # Apply title styling to default slide + title_style = styles.get("title", {}) + if title_shape.text_frame.paragraphs[0].font: + title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 48)) + title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True) + title_color = self._get_safe_color(title_style.get("color", (31, 78, 121))) + title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color) + + subtitle_shape = slide.placeholders[1] + subtitle_shape.text = "Generated by PowerOn AI System" + + # Apply subtitle styling + paragraph_style = styles.get("paragraph", {}) + if subtitle_shape.text_frame.paragraphs[0].font: + subtitle_shape.text_frame.paragraphs[0].font.size = Pt(paragraph_style.get("font_size", 20)) + subtitle_shape.text_frame.paragraphs[0].font.bold = paragraph_style.get("bold", False) + paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47))) + subtitle_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*paragraph_color) + + # Save to buffer + buffer = io.BytesIO() + prs.save(buffer) + 
buffer.seek(0) + + # Convert to base64 + pptx_bytes = buffer.getvalue() + pptx_base64 = base64.b64encode(pptx_bytes).decode('utf-8') + + logger.info(f"Successfully rendered PowerPoint presentation: {len(pptx_bytes)} bytes") + return pptx_base64, "application/vnd.openxmlformats-officedocument.presentationml.presentation" + + except ImportError: + logger.error("python-pptx library not installed. Install with: pip install python-pptx") + return "python-pptx library not installed", "text/plain" + except Exception as e: + logger.error(f"Error rendering PowerPoint presentation: {str(e)}") + return f"Error rendering PowerPoint presentation: {str(e)}", "text/plain" + + def _parse_content_to_slides(self, content: str, title: str) -> list: + """ + Parse content into slide data structure. + + Args: + content: Content to parse + title: Presentation title + + Returns: + List of slide data dictionaries + """ + slides = [] + + # Split content by slide markers or headers + slide_sections = self._split_content_into_slides(content) + + for i, section in enumerate(slide_sections): + if section.strip(): + slide_data = { + "title": f"Slide {i + 1}", + "content": section.strip() + } + + # Extract title from content if it starts with # + lines = section.strip().split('\n') + if lines and lines[0].startswith('#'): + # Remove # symbols and clean up title + slide_title = lines[0].lstrip('#').strip() + slide_data["title"] = slide_title + slide_data["content"] = '\n'.join(lines[1:]).strip() + elif lines and lines[0].strip(): + # Use first line as title if it looks like a title + first_line = lines[0].strip() + if len(first_line) < 100 and not first_line.endswith('.'): + slide_data["title"] = first_line + slide_data["content"] = '\n'.join(lines[1:]).strip() + + slides.append(slide_data) + + return slides + + def _split_content_into_slides(self, content: str) -> list: + """ + Split content into individual slides based on headers and structure. 
+ + Args: + content: Content to split + + Returns: + List of slide content strings + """ + import re + + # First, try to split by major headers (# or ##) + # This is the most common case for AI-generated content + header_pattern = r'^(#{1,2})\s+(.+)$' + lines = content.split('\n') + slides = [] + current_slide = [] + + for line in lines: + # Check if this line is a header + header_match = re.match(header_pattern, line.strip()) + if header_match: + # If we have content in current slide, save it + if current_slide: + slide_content = '\n'.join(current_slide).strip() + if slide_content: + slides.append(slide_content) + current_slide = [] + + # Start new slide with this header + current_slide.append(line) + else: + # Add line to current slide + current_slide.append(line) + + # Add the last slide + if current_slide: + slide_content = '\n'.join(current_slide).strip() + if slide_content: + slides.append(slide_content) + + # If we found slides with headers, return them + if len(slides) > 1: + return slides + + # Fallback: Split by double newlines + sections = content.split('\n\n\n') + if len(sections) > 1: + return [s.strip() for s in sections if s.strip()] + + # Another fallback: Split by double newlines + sections = content.split('\n\n') + if len(sections) > 1: + return [s.strip() for s in sections if s.strip()] + + # Last resort: return as single slide + return [content.strip()] + + + def get_output_mime_type(self) -> str: + """Get MIME type for rendered output.""" + return self.output_mime_type + + async def _get_pptx_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]: + """Get PowerPoint styling definitions using base template AI styling.""" + style_schema = { + "title": {"font_size": 52, "color": "#1B365D", "bold": True, "align": "center"}, + "heading": {"font_size": 36, "color": "#2C5F2D", "bold": True, "align": "left"}, + "subheading": {"font_size": 28, "color": "#4A90E2", "bold": True, "align": "left"}, + "paragraph": {"font_size": 20, "color": 
"#2F2F2F", "bold": False, "align": "left"}, + "bullet_list": {"font_size": 20, "color": "#2F2F2F", "indent": 20}, + "table_header": {"font_size": 18, "color": "#FFFFFF", "bold": True, "background": "#1B365D"}, + "table_cell": {"font_size": 16, "color": "#2F2F2F", "bold": False, "background": "#F8F9FA"}, + "slide_size": "16:9", + "content_per_slide": "concise", + "design_theme": "corporate", + "color_scheme": "professional", + "background_style": "clean", + "accent_colors": ["#1B365D", "#2C5F2D", "#4A90E2", "#6B7280"], + "professional_grade": True, + "executive_ready": True + } + + style_template = self._create_professional_pptx_template(user_prompt, style_schema) + # Use our own _get_ai_styles_with_pptx_colors method to ensure proper color conversion + styles = await self._get_ai_styles_with_pptx_colors(ai_service, style_template, self._get_default_pptx_styles()) + + # Validate PowerPoint-specific requirements + return self._validate_pptx_styles_readability(styles) + + def _create_professional_pptx_template(self, user_prompt: str, style_schema: Dict[str, Any]) -> str: + """Create a professional PowerPoint-specific AI style template for corporate-quality slides.""" + import json + schema_json = json.dumps(style_schema, indent=4) + + return f"""Customize the JSON below for professional PowerPoint slides. + +User Request: {user_prompt or "Create professional corporate slides"} + +Rules: +- Use professional colors (blues, grays, deep greens) +- Large, readable font sizes +- High contrast +- Sophisticated color palettes + +Return ONLY this JSON with your changes: + +{schema_json} + +JSON ONLY. 
NO OTHER TEXT.""" + + async def _get_ai_styles_with_pptx_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]: + """Get AI styles with proper PowerPoint color conversion.""" + if not ai_service: + return default_styles + + try: + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + request = AiCallRequest(prompt=style_template, context="", options=request_options) + + # Check if AI service is properly configured + if not hasattr(ai_service, 'aiObjects') or not ai_service.aiObjects: + self.logger.warning("AI service not properly configured, using defaults") + return default_styles + + response = await ai_service.aiObjects.call(request) + + # Check if response is valid + if not response: + self.logger.warning("AI service returned no response, using defaults") + return default_styles + + import json + import re + + # Clean and parse JSON + result = response.content.strip() if response and response.content else "" + + # Check if result is empty + if not result: + self.logger.warning("AI styling returned empty response, using defaults") + return default_styles + + # Log the raw response for debugging + self.logger.debug(f"AI styling raw response: {result[:200]}...") + + # Extract JSON from various formats + json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) + if json_match: + result = json_match.group(1).strip() + elif result.startswith('```json'): + result = re.sub(r'^```json\s*', '', result) + result = re.sub(r'\s*```$', '', result) + elif result.startswith('```'): + result = re.sub(r'^```\s*', '', result) + result = re.sub(r'\s*```$', '', result) + + # Try to extract JSON from explanatory text + json_patterns = [ + r'\{[^{}]*"title"[^{}]*\}', # Simple JSON object + r'\{.*?"title".*?\}', # JSON with title field + r'\{.*?"font_size".*?\}', # JSON with font_size field + ] + + for 
pattern in json_patterns: + json_match = re.search(pattern, result, re.DOTALL) + if json_match: + result = json_match.group(0) + break + + # Additional cleanup - remove any leading/trailing whitespace and newlines + result = result.strip() + + # Check if result is still empty after cleanup + if not result: + self.logger.warning("AI styling returned empty content after cleanup, using defaults") + return default_styles + + # Try to parse JSON + try: + styles = json.loads(result) + self.logger.debug(f"Successfully parsed AI styles: {list(styles.keys())}") + except json.JSONDecodeError as json_error: + self.logger.warning(f"AI styling returned invalid JSON: {json_error}") + self.logger.warning(f"Raw content that failed to parse: {result[:100]}...") + # Try to extract just the JSON part if it's embedded in text + json_start = result.find('{') + json_end = result.rfind('}') + if json_start != -1 and json_end != -1 and json_end > json_start: + json_part = result[json_start:json_end+1] + try: + styles = json.loads(json_part) + self.logger.info("Successfully extracted JSON from explanatory text") + self.logger.debug(f"Extracted AI styles: {list(styles.keys())}") + except json.JSONDecodeError: + self.logger.warning("Could not extract valid JSON from response, using defaults") + return default_styles + else: + return default_styles + + # Convert colors to PowerPoint RGB format + styles = self._convert_colors_format(styles) + + return styles + + except Exception as e: + self.logger.warning(f"AI styling failed: {str(e)}, using defaults") + return default_styles + + def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]: + """Convert hex colors to RGB format for PowerPoint compatibility.""" + try: + for style_name, style_config in styles.items(): + if isinstance(style_config, dict): + for prop, value in style_config.items(): + if isinstance(value, str) and value.startswith('#'): + # Convert hex to RGB tuple for PowerPoint + hex_color = value.lstrip('#') + if 
len(hex_color) == 6: + r = int(hex_color[0:2], 16) + g = int(hex_color[2:4], 16) + b = int(hex_color[4:6], 16) + styles[style_name][prop] = (r, g, b) + elif len(hex_color) == 8: # aRGB format + r = int(hex_color[2:4], 16) + g = int(hex_color[4:6], 16) + b = int(hex_color[6:8], 16) + styles[style_name][prop] = (r, g, b) + return styles + except Exception as e: + self.logger.warning(f"Color conversion failed: {str(e)}") + return styles + + def _get_safe_color(self, color_value, default=(0, 0, 0)) -> tuple: + """Get a safe RGB color tuple for PowerPoint.""" + if isinstance(color_value, tuple) and len(color_value) == 3: + return color_value + elif isinstance(color_value, str) and color_value.startswith('#'): + hex_color = color_value.lstrip('#') + if len(hex_color) == 6: + r = int(hex_color[0:2], 16) + g = int(hex_color[2:4], 16) + b = int(hex_color[4:6], 16) + return (r, g, b) + elif len(hex_color) == 8: # aRGB format + r = int(hex_color[2:4], 16) + g = int(hex_color[4:6], 16) + b = int(hex_color[6:8], 16) + return (r, g, b) + return default + + def _validate_pptx_styles_readability(self, styles: Dict[str, Any]) -> Dict[str, Any]: + """Validate and fix readability issues in AI-generated styles.""" + try: + # Ensure minimum font sizes for PowerPoint readability + min_font_sizes = { + "title": 36, + "heading": 24, + "subheading": 20, + "paragraph": 14, + "bullet_list": 14, + "table_header": 12, + "table_cell": 12 + } + + for style_name, min_size in min_font_sizes.items(): + if style_name in styles: + current_size = styles[style_name].get("font_size", 12) + if current_size < min_size: + styles[style_name]["font_size"] = min_size + + return styles + + except Exception as e: + logger.warning(f"Style validation failed: {str(e)}") + return self._get_default_pptx_styles() + + def _get_default_pptx_styles(self) -> Dict[str, Any]: + """Default PowerPoint styles with corporate professional color scheme.""" + return { + "title": {"font_size": 52, "color": (27, 54, 93), "bold": 
True, "align": "center"}, + "heading": {"font_size": 36, "color": (44, 95, 45), "bold": True, "align": "left"}, + "subheading": {"font_size": 28, "color": (74, 144, 226), "bold": True, "align": "left"}, + "paragraph": {"font_size": 20, "color": (47, 47, 47), "bold": False, "align": "left"}, + "bullet_list": {"font_size": 20, "color": (47, 47, 47), "indent": 20}, + "table_header": {"font_size": 18, "color": (255, 255, 255), "bold": True, "background": (27, 54, 93)}, + "table_cell": {"font_size": 16, "color": (47, 47, 47), "bold": False, "background": (248, 249, 250)}, + "slide_size": "16:9", + "content_per_slide": "concise", + "design_theme": "corporate", + "color_scheme": "professional", + "background_style": "clean", + "accent_colors": [(27, 54, 93), (44, 95, 45), (74, 144, 226), (107, 114, 128)], + "professional_grade": True, + "executive_ready": True + } + + async def _parse_json_to_slides(self, json_content: Dict[str, Any], title: str, styles: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Parse JSON content into slide data structure. 
+ + Args: + json_content: JSON content to parse + title: Presentation title + styles: AI-generated styles + + Returns: + List of slide data dictionaries + """ + slides = [] + + try: + # Validate JSON structure + if not isinstance(json_content, dict): + raise ValueError("JSON content must be a dictionary") + + if "sections" not in json_content: + raise ValueError("JSON content must contain 'sections' field") + + # Use title from JSON metadata if available, otherwise use provided title + document_title = json_content.get("metadata", {}).get("title", title) + + # Create title slide + slides.append({ + "title": document_title, + "content": "Generated by PowerOn AI System\n\n" + self._format_timestamp() + }) + + # Process sections into slides based on content and user intent + sections = json_content.get("sections", []) + slides.extend(self._create_slides_from_sections(sections, styles)) + + # If no content slides were created, create a default content slide + if len(slides) == 1: # Only title slide + slides.append({ + "title": "Content Overview", + "content": "No structured content found in the source documents.\n\nPlease check the source documents and try again." 
+ }) + + return slides + + except Exception as e: + logger.error(f"Error parsing JSON to slides: {str(e)}") + # Return minimal fallback slides + return [ + { + "title": title, + "content": "Error parsing content for presentation" + } + ] + + def _create_slide_from_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> Dict[str, Any]: + """Create a slide from a JSON section.""" + try: + # Get section title from data or use default + section_title = "Untitled Section" + if section.get("content_type") == "heading": + # Extract text from elements array + for element in section.get("elements", []): + if isinstance(element, dict) and "text" in element: + section_title = element.get("text", "Untitled Section") + break + elif section.get("title"): + section_title = section.get("title") + + content_type = section.get("content_type", "paragraph") + elements = section.get("elements", []) + + # Build slide content based on section type + content_parts = [] + + if content_type == "table": + content_parts.append(self._format_table_for_slide(elements)) + elif content_type == "list": + content_parts.append(self._format_list_for_slide(elements)) + elif content_type == "heading": + content_parts.append(self._format_heading_for_slide(elements)) + elif content_type == "paragraph": + content_parts.append(self._format_paragraph_for_slide(elements)) + elif content_type == "code": + content_parts.append(self._format_code_for_slide(elements)) + else: + content_parts.append(self._format_paragraph_for_slide(elements)) + + # Combine content parts + slide_content = "\n\n".join(filter(None, content_parts)) + + return { + "title": section_title, + "content": slide_content + } + + except Exception as e: + logger.warning(f"Error creating slide from section: {str(e)}") + return None + + def _format_table_for_slide(self, elements: List[Dict[str, Any]]) -> str: + """Format table data for slide presentation.""" + try: + # Extract table data from elements array + headers = [] + rows = [] + 
for element in elements: + if isinstance(element, dict) and "headers" in element and "rows" in element: + headers = element.get("headers", []) + rows = element.get("rows", []) + break + + if not headers: + return "" + + # Create table representation + table_lines = [] + + # Add headers + header_line = " | ".join(str(h) for h in headers) + table_lines.append(header_line) + + # Add separator + separator = "-" * len(header_line) + table_lines.append(separator) + + # Add data rows (limit based on content density) + max_rows = 5 # Default limit + for row in rows[:max_rows]: + row_line = " | ".join(str(cell) for cell in row) + table_lines.append(row_line) + + if len(rows) > max_rows: + table_lines.append(f"... and {len(rows) - max_rows} more rows") + + return "\n".join(table_lines) + + except Exception as e: + logger.warning(f"Error formatting table for slide: {str(e)}") + return "" + + def _format_list_for_slide(self, list_data: Dict[str, Any]) -> str: + """Format list data for slide presentation.""" + try: + items = list_data.get("items", []) + + if not items: + return "" + + # Create list representation + list_lines = [] + + for item in items: + if isinstance(item, dict): + text = item.get("text", "") + list_lines.append(f"β€’ {text}") + + # Add subitems (limit to 3 for readability) + subitems = item.get("subitems", [])[:3] + for subitem in subitems: + if isinstance(subitem, dict): + list_lines.append(f" - {subitem.get('text', '')}") + else: + list_lines.append(f" - {subitem}") + else: + list_lines.append(f"β€’ {str(item)}") + + return "\n".join(list_lines) + + except Exception as e: + logger.warning(f"Error formatting list for slide: {str(e)}") + return "" + + def _format_heading_for_slide(self, heading_data: Dict[str, Any]) -> str: + """Format heading data for slide presentation.""" + try: + text = heading_data.get("text", "") + level = heading_data.get("level", 1) + + if text: + return f"{'#' * level} {text}" + + return "" + + except Exception as e: + 
logger.warning(f"Error formatting heading for slide: {str(e)}") + return "" + + def _format_paragraph_for_slide(self, paragraph_data: Dict[str, Any]) -> str: + """Format paragraph data for slide presentation.""" + try: + text = paragraph_data.get("text", "") + + if text: + # Limit paragraph length based on content density + max_length = 200 # Default limit + if len(text) > max_length: + text = text[:max_length] + "..." + + return text + + return "" + + except Exception as e: + logger.warning(f"Error formatting paragraph for slide: {str(e)}") + return "" + + def _format_code_for_slide(self, code_data: Dict[str, Any]) -> str: + """Format code data for slide presentation.""" + try: + code = code_data.get("code", "") + language = code_data.get("language", "") + + if code: + # Limit code length based on content density + max_length = 100 # Default limit + if len(code) > max_length: + code = code[:max_length] + "..." + + if language: + return f"Code ({language}):\n{code}" + else: + return f"Code:\n{code}" + + return "" + + except Exception as e: + logger.warning(f"Error formatting code for slide: {str(e)}") + return "" + + def _get_slide_layout_index(self, slide_data: Dict[str, Any], styles: Dict[str, Any]) -> int: + """Determine the best professional slide layout based on content.""" + try: + content = slide_data.get("content", "") + title = slide_data.get("title", "") + + # Check if it's a title slide (first slide) + if not content or "Generated by PowerOn AI System" in content: + return 0 # Title slide layout + + # Professional layout selection based on content + if "|" in content and "-" in content: + # Has both tables and lists - use content with caption for professional look + return 2 + elif "|" in content: + # Has tables - use content layout for clean table presentation + return 1 + elif content.count("β€’") > 2: + # Has many bullet points - use content layout for better readability + return 1 + elif len(content) > 200: + # Long content - use content layout for 
better text flow + return 1 + elif title and len(title) > 20: + # Long title - use title and content layout + return 1 + else: + # Default to title and content layout for professional appearance + return 1 + + except Exception as e: + logger.warning(f"Error determining slide layout: {str(e)}") + return 1 # Default to title and content layout + + def _create_slides_from_sections(self, sections: List[Dict[str, Any]], styles: Dict[str, Any]) -> List[Dict[str, Any]]: + """Create slides from sections based on content density and user intent.""" + try: + slides = [] + content_per_slide = styles.get("content_per_slide", "concise") + + # Group sections by type and create slides + current_slide_content = [] + current_slide_title = "Content Overview" + + for section in sections: + section_type = section.get("content_type", "paragraph") + elements = section.get("elements", []) + + if section_type == "heading": + # If we have accumulated content, create a slide + if current_slide_content: + slides.append({ + "title": current_slide_title, + "content": "\n\n".join(current_slide_content) + }) + current_slide_content = [] + + # Start new slide with heading as title + for element in elements: + if isinstance(element, dict) and "text" in element: + current_slide_title = element.get("text", "Untitled Section") + break + else: + # Add content to current slide + formatted_content = self._format_section_content(section) + if formatted_content: + current_slide_content.append(formatted_content) + + # Add final slide if there's content + if current_slide_content: + slides.append({ + "title": current_slide_title, + "content": "\n\n".join(current_slide_content) + }) + + return slides + + except Exception as e: + logger.warning(f"Error creating slides from sections: {str(e)}") + return [] + + def _format_section_content(self, section: Dict[str, Any]) -> str: + """Format section content for slide presentation.""" + try: + content_type = section.get("content_type", "paragraph") + elements = 
section.get("elements", []) + + # Process each element in the section + content_parts = [] + for element in elements: + if content_type == "table": + content_parts.append(self._format_table_for_slide([element])) + elif content_type == "list": + content_parts.append(self._format_list_for_slide([element])) + elif content_type == "heading": + content_parts.append(self._format_heading_for_slide([element])) + elif content_type == "paragraph": + content_parts.append(self._format_paragraph_for_slide([element])) + elif content_type == "code": + content_parts.append(self._format_code_for_slide([element])) + else: + content_parts.append(self._format_paragraph_for_slide([element])) + + return "\n\n".join(filter(None, content_parts)) + + except Exception as e: + logger.warning(f"Error formatting section content: {str(e)}") + return "" + + def _format_timestamp(self) -> str: + """Format current timestamp for presentation generation.""" + from datetime import datetime, UTC + return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC") diff --git a/modules/services/serviceGeneration/renderers/rendererText.py b/modules/services/serviceGeneration/renderers/rendererText.py new file mode 100644 index 00000000..68ccfdbe --- /dev/null +++ b/modules/services/serviceGeneration/renderers/rendererText.py @@ -0,0 +1,256 @@ +""" +Text renderer for report generation. 
+""" + +from .rendererBaseTemplate import BaseRenderer +from typing import Dict, Any, Tuple, List + +class RendererText(BaseRenderer): + """Renders content to plain text format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported text formats (excluding formats with dedicated renderers).""" + return [ + 'txt', 'text', 'plain', + # Programming languages + 'js', 'javascript', 'ts', 'typescript', 'jsx', 'tsx', + 'py', 'python', 'java', 'cpp', 'c', 'h', 'hpp', + 'cs', 'csharp', 'php', 'rb', 'ruby', 'go', 'rs', 'rust', + 'swift', 'kt', 'kotlin', 'scala', 'r', 'm', 'objc', + 'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd', + # Web technologies (excluding html/htm which have dedicated renderer) + 'css', 'scss', 'sass', 'less', 'xml', 'yaml', 'yml', 'toml', 'ini', 'cfg', + # Data formats (excluding csv, md/markdown which have dedicated renderers) + 'tsv', 'log', 'rst', 'sql', 'dockerfile', 'dockerignore', 'gitignore', + # Configuration files + 'env', 'properties', 'conf', 'config', 'rc', + 'gitattributes', 'editorconfig', 'eslintrc', + # Documentation + 'readme', 'changelog', 'license', 'authors', + 'contributing', 'todo', 'notes', 'docs' + ] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return [ + 'ascii', 'utf8', 'utf-8', 'code', 'source', + 'script', 'program', 'file', 'document', + 'raw', 'unformatted', 'plaintext' + ] + + @classmethod + def get_priority(cls) -> int: + """Return priority for text renderer.""" + return 90 + + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """Render extracted JSON content to plain text format.""" + try: + # Generate text from JSON structure + text_content = self._generate_text_from_json(extracted_content, title) + + return text_content, "text/plain" + + except Exception as e: + self.logger.error(f"Error rendering text: {str(e)}") + # 
Return minimal text fallback + return f"{title}\n\nError rendering report: {str(e)}", "text/plain" + + def _generate_text_from_json(self, json_content: Dict[str, Any], title: str) -> str: + """Generate text content from structured JSON document.""" + try: + # Validate JSON structure + if not isinstance(json_content, dict): + raise ValueError("JSON content must be a dictionary") + + if "sections" not in json_content: + raise ValueError("JSON content must contain 'sections' field") + + # Use title from JSON metadata if available, otherwise use provided title + document_title = json_content.get("metadata", {}).get("title", title) + + # Build text content + text_parts = [] + + # Document title + text_parts.append(document_title) + text_parts.append("=" * len(document_title)) + text_parts.append("") + + # Process each section + sections = json_content.get("sections", []) + for section in sections: + section_text = self._render_json_section(section) + if section_text: + text_parts.append(section_text) + text_parts.append("") # Add spacing between sections + + # Add generation info + text_parts.append("") + text_parts.append(f"Generated: {self._format_timestamp()}") + + return '\n'.join(text_parts) + + except Exception as e: + self.logger.error(f"Error generating text from JSON: {str(e)}") + raise Exception(f"Text generation failed: {str(e)}") + + def _render_json_section(self, section: Dict[str, Any]) -> str: + """Render a single JSON section to text.""" + try: + section_type = self._get_section_type(section) + section_data = self._get_section_data(section) + + if section_type == "table": + # Process the section data to extract table structure + processed_data = self._process_section_by_type(section) + return self._render_json_table(processed_data) + elif section_type == "bullet_list": + # Process the section data to extract bullet list structure + processed_data = self._process_section_by_type(section) + return self._render_json_bullet_list(processed_data) + elif 
section_type == "heading": + # Render each heading element in the elements array + # section_data is already the elements array from _get_section_data + rendered_elements = [] + for element in section_data: + rendered_elements.append(self._render_json_heading(element)) + return "\n".join(rendered_elements) + elif section_type == "paragraph": + # Render each paragraph element in the elements array + # section_data is already the elements array from _get_section_data + rendered_elements = [] + for element in section_data: + rendered_elements.append(self._render_json_paragraph(element)) + return "\n".join(rendered_elements) + elif section_type == "code_block": + # Process the section data to extract code block structure + processed_data = self._process_section_by_type(section) + return self._render_json_code_block(processed_data) + elif section_type == "image": + # Process the section data to extract image structure + processed_data = self._process_section_by_type(section) + return self._render_json_image(processed_data) + else: + # Fallback to paragraph for unknown types - render each element + # section_data is already the elements array from _get_section_data + rendered_elements = [] + for element in section_data: + rendered_elements.append(self._render_json_paragraph(element)) + return "\n".join(rendered_elements) + + except Exception as e: + self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}") + return f"[Error rendering section: {str(e)}]" + + def _render_json_table(self, table_data: Dict[str, Any]) -> str: + """Render a JSON table to text.""" + try: + headers = table_data.get("headers", []) + rows = table_data.get("rows", []) + + if not headers or not rows: + return "" + + text_parts = [] + + # Create table header + header_line = " | ".join(str(header) for header in headers) + text_parts.append(header_line) + + # Add separator line + separator_line = " | ".join("-" * len(str(header)) for header in headers) + 
text_parts.append(separator_line) + + # Add data rows + for row in rows: + row_line = " | ".join(str(cell_data) for cell_data in row) + text_parts.append(row_line) + + return '\n'.join(text_parts) + + except Exception as e: + self.logger.warning(f"Error rendering table: {str(e)}") + return "" + + def _render_json_bullet_list(self, list_data: Dict[str, Any]) -> str: + """Render a JSON bullet list to text.""" + try: + items = list_data.get("items", []) + + if not items: + return "" + + text_parts = [] + for item in items: + if isinstance(item, str): + text_parts.append(f"- {item}") + elif isinstance(item, dict) and "text" in item: + text_parts.append(f"- {item['text']}") + + return '\n'.join(text_parts) + + except Exception as e: + self.logger.warning(f"Error rendering bullet list: {str(e)}") + return "" + + def _render_json_heading(self, heading_data: Dict[str, Any]) -> str: + """Render a JSON heading to text.""" + try: + level = heading_data.get("level", 1) + text = heading_data.get("text", "") + + if text: + level = max(1, min(6, level)) + if level == 1: + return f"{text}\n{'=' * len(text)}" + elif level == 2: + return f"{text}\n{'-' * len(text)}" + else: + return f"{'#' * level} {text}" + + return "" + + except Exception as e: + self.logger.warning(f"Error rendering heading: {str(e)}") + return "" + + def _render_json_paragraph(self, paragraph_data: Dict[str, Any]) -> str: + """Render a JSON paragraph to text.""" + try: + text = paragraph_data.get("text", "") + return text if text else "" + + except Exception as e: + self.logger.warning(f"Error rendering paragraph: {str(e)}") + return "" + + def _render_json_code_block(self, code_data: Dict[str, Any]) -> str: + """Render a JSON code block to text.""" + try: + code = code_data.get("code", "") + language = code_data.get("language", "") + + if code: + if language: + return f"Code ({language}):\n{code}" + else: + return code + + return "" + + except Exception as e: + self.logger.warning(f"Error rendering code block: 
{str(e)}") + return "" + + def _render_json_image(self, image_data: Dict[str, Any]) -> str: + """Render a JSON image to text.""" + try: + alt_text = image_data.get("altText", "Image") + return f"[Image: {alt_text}]" + + except Exception as e: + self.logger.warning(f"Error rendering image: {str(e)}") + return f"[Image: {image_data.get('altText', 'Image')}]" diff --git a/modules/services/serviceGeneration/renderers/rendererXlsx.py b/modules/services/serviceGeneration/renderers/rendererXlsx.py new file mode 100644 index 00000000..ddd6e9f3 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/rendererXlsx.py @@ -0,0 +1,791 @@ +""" +Excel renderer for report generation using openpyxl. +""" + +from .rendererBaseTemplate import BaseRenderer +from typing import Dict, Any, Tuple, List +import io +import base64 +from datetime import datetime, UTC + +try: + from openpyxl import Workbook + from openpyxl.styles import Font, PatternFill, Alignment, Border, Side + from openpyxl.utils import get_column_letter + from openpyxl.worksheet.table import Table, TableStyleInfo + OPENPYXL_AVAILABLE = True +except ImportError: + OPENPYXL_AVAILABLE = False + +class RendererXlsx(BaseRenderer): + """Renders content to Excel format using openpyxl.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported Excel formats.""" + return ['xlsx', 'xls', 'excel'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['spreadsheet', 'workbook'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for Excel renderer.""" + return 110 + + async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: + """Render extracted JSON content to Excel format using AI-analyzed styling.""" + try: + if not OPENPYXL_AVAILABLE: + # Fallback to CSV if openpyxl not available + from .rendererCsv import RendererCsv + csv_renderer = RendererCsv() + 
csv_content, _ = await csv_renderer.render(extracted_content, title, user_prompt, ai_service) + return csv_content, "text/csv" + + # Generate Excel using AI-analyzed styling + excel_content = await self._generate_excel_from_json(extracted_content, title, user_prompt, ai_service) + + return excel_content, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + + except Exception as e: + self.logger.error(f"Error rendering Excel: {str(e)}") + # Return CSV fallback + return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv" + + def _generate_excel(self, content: str, title: str) -> str: + """Generate Excel content using openpyxl.""" + try: + # Create workbook + wb = Workbook() + + # Remove default sheet + wb.remove(wb.active) + + # Create sheets + summary_sheet = wb.create_sheet("Summary", 0) + data_sheet = wb.create_sheet("Data", 1) + analysis_sheet = wb.create_sheet("Analysis", 2) + + # Add content to sheets + self._populate_summary_sheet(summary_sheet, title) + self._populate_data_sheet(data_sheet, content) + self._populate_analysis_sheet(analysis_sheet, content) + + # Save to buffer + buffer = io.BytesIO() + wb.save(buffer) + buffer.seek(0) + + # Convert to base64 + excel_bytes = buffer.getvalue() + excel_base64 = base64.b64encode(excel_bytes).decode('utf-8') + + return excel_base64 + + except Exception as e: + self.logger.error(f"Error generating Excel: {str(e)}") + raise + + def _populate_summary_sheet(self, sheet, title: str): + """Populate the summary sheet.""" + try: + # Title + sheet['A1'] = title + sheet['A1'].font = Font(size=16, bold=True) + sheet['A1'].alignment = Alignment(horizontal='center') + + # Generation info + sheet['A3'] = "Generated:" + sheet['B3'] = self._format_timestamp() + sheet['A4'] = "Status:" + sheet['B4'] = "Generated Successfully" + + # Key metrics placeholder + sheet['A6'] = "Key Metrics:" + sheet['A6'].font = Font(bold=True) + sheet['A7'] = "Total Items:" + sheet['B7'] = "=COUNTA(Data!A:A)-1" 
# Count non-empty cells in Data sheet + + # Auto-adjust column widths + sheet.column_dimensions['A'].width = 20 + sheet.column_dimensions['B'].width = 30 + + except Exception as e: + self.logger.warning(f"Could not populate summary sheet: {str(e)}") + + def _populate_data_sheet(self, sheet, content: str): + """Populate the data sheet.""" + try: + # Headers + headers = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"] + for col, header in enumerate(headers, 1): + cell = sheet.cell(row=1, column=col, value=header) + cell.font = Font(bold=True) + cell.fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid") + + # Process content + lines = content.split('\n') + row = 2 + + for line in lines: + line = line.strip() + if not line: + continue + + # Check for table data (lines with |) + if '|' in line: + cells = [cell.strip() for cell in line.split('|') if cell.strip()] + for col, cell_data in enumerate(cells[:5], 1): # Limit to 5 columns + sheet.cell(row=row, column=col, value=cell_data) + row += 1 + else: + # Regular content + sheet.cell(row=row, column=1, value=line) + row += 1 + + # Auto-adjust column widths + for col in range(1, 6): + sheet.column_dimensions[get_column_letter(col)].width = 20 + + except Exception as e: + self.logger.warning(f"Could not populate data sheet: {str(e)}") + + def _populate_analysis_sheet(self, sheet, content: str): + """Populate the analysis sheet.""" + try: + # Title + sheet['A1'] = "Analysis & Insights" + sheet['A1'].font = Font(size=14, bold=True) + + # Content analysis + lines = content.split('\n') + row = 3 + + sheet['A3'] = "Content Analysis:" + sheet['A3'].font = Font(bold=True) + row += 1 + + # Count different types of content + table_lines = sum(1 for line in lines if '|' in line) + list_lines = sum(1 for line in lines if line.startswith(('- ', '* '))) + text_lines = len(lines) - table_lines - list_lines + + sheet[f'A{row}'] = f"Total Lines: {len(lines)}" + row += 1 + 
    async def _generate_excel_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Generate a base64-encoded .xlsx workbook from a structured JSON document.

        Pipeline: fetch AI styling definitions, validate the JSON envelope,
        create worksheets from the content structure, populate them, then
        serialize the workbook and base64-encode the bytes.

        Args:
            json_content: Canonical document dict; must contain "sections" and
                may carry "metadata" (e.g. a "title").
            title: Fallback title when the JSON metadata has none.
            user_prompt: Original user request, forwarded to the styling AI.
            ai_service: Optional AI service used to generate styling.

        Returns:
            Base64-encoded bytes of the generated workbook.

        Raises:
            Exception: Any failure is re-raised wrapped as
                "Excel generation failed: ...".
        """
        try:
            # Trace the incoming payload shape for debugging.
            self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(json_content)}", "EXCEL_RENDERER")
            self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(json_content.keys()) if isinstance(json_content, dict) else 'Not a dict'}", "EXCEL_RENDERER")

            # Get AI-generated styling definitions.
            # NOTE(review): styling is fetched before structural validation, so
            # an invalid payload still costs an AI call — confirm intended.
            styles = await self._get_excel_styles(user_prompt, ai_service)

            # Validate the JSON envelope before building any sheets.
            if not isinstance(json_content, dict):
                raise ValueError("JSON content must be a dictionary")

            if "sections" not in json_content:
                raise ValueError("JSON content must contain 'sections' field")

            # Prefer the title from JSON metadata over the caller-supplied one.
            # NOTE(review): document_title is not referenced below — sheet
            # names come from _create_excel_sheets; confirm whether it should
            # be applied somewhere.
            document_title = json_content.get("metadata", {}).get("title", title)

            # Create workbook
            wb = Workbook()

            # Create sheets based on the content structure.
            sheets = self._create_excel_sheets(wb, json_content, styles)
            self.services.utils.debugLogToFile(f"EXCEL SHEETS CREATED: {list(sheets.keys()) if sheets else 'None'}", "EXCEL_RENDERER")

            # Populate sheets with content.
            self._populate_excel_sheets(sheets, json_content, styles)

            # Serialize the workbook into an in-memory buffer.
            buffer = io.BytesIO()
            wb.save(buffer)
            buffer.seek(0)

            # Base64-encode the bytes; encoding errors are logged then re-raised.
            excel_bytes = buffer.getvalue()
            self.services.utils.debugLogToFile(f"EXCEL BYTES LENGTH: {len(excel_bytes)}", "EXCEL_RENDERER")
            try:
                excel_base64 = base64.b64encode(excel_bytes).decode('utf-8')
                self.services.utils.debugLogToFile(f"EXCEL BASE64 LENGTH: {len(excel_base64)}", "EXCEL_RENDERER")
            except Exception as b64_error:
                self.services.utils.debugLogToFile(f"BASE64 ENCODING ERROR: {b64_error}", "EXCEL_RENDERER")
                raise

            return excel_base64

        except Exception as e:
            self.logger.error(f"Error generating Excel from JSON: {str(e)}")
            raise Exception(f"Excel generation failed: {str(e)}")
style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]: + """Get AI styles with proper Excel color conversion.""" + if not ai_service: + return default_styles + + try: + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + request = AiCallRequest(prompt=style_template, context="", options=request_options) + response = await ai_service.aiObjects.call(request) + + import json + import re + + # Clean and parse JSON + result = response.content.strip() if response and response.content else "" + + # Check if result is empty + if not result: + self.logger.warning("AI styling returned empty response, using defaults") + return default_styles + + # Extract JSON from markdown if present + json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) + if json_match: + result = json_match.group(1).strip() + self.services.utils.debugLogToFile(f"EXTRACTED JSON FROM MARKDOWN: {result[:100]}...", "EXCEL_RENDERER") + elif result.startswith('```json'): + result = re.sub(r'^```json\s*', '', result) + result = re.sub(r'\s*```$', '', result) + self.services.utils.debugLogToFile(f"CLEANED JSON FROM MARKDOWN: {result[:100]}...", "EXCEL_RENDERER") + elif result.startswith('```'): + result = re.sub(r'^```\s*', '', result) + result = re.sub(r'\s*```$', '', result) + self.services.utils.debugLogToFile(f"CLEANED JSON FROM GENERIC MARKDOWN: {result[:100]}...", "EXCEL_RENDERER") + + # Try to parse JSON + try: + styles = json.loads(result) + except json.JSONDecodeError as json_error: + self.logger.warning(f"AI styling returned invalid JSON: {json_error}, using defaults") + return default_styles + + # Convert colors to Excel aRGB format + styles = self._convert_colors_format(styles) + + return styles + + except Exception as e: + self.logger.warning(f"AI styling failed: {str(e)}, using defaults") + return default_styles + + def 
_get_safe_color(self, color_value: str, default: str = "FF000000") -> str: + """Get a safe aRGB color value for Excel (without # prefix).""" + if not isinstance(color_value, str): + return default + + # Remove # prefix if present + if color_value.startswith('#'): + color_value = color_value[1:] + + if len(color_value) == 6: + # Convert RRGGBB to AARRGGBB + return f"FF{color_value}" + elif len(color_value) == 8: + # Already aRGB format + return color_value + else: + # Unexpected format, return default + return default + + def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]: + """Convert hex colors to aRGB format for Excel compatibility.""" + try: + self.services.utils.debugLogToFile(f"CONVERTING COLORS IN STYLES: {styles}", "EXCEL_RENDERER") + for style_name, style_config in styles.items(): + if isinstance(style_config, dict): + for prop, value in style_config.items(): + if isinstance(value, str) and value.startswith('#') and len(value) == 7: + # Convert #RRGGBB to #AARRGGBB (add FF alpha channel) + styles[style_name][prop] = f"FF{value[1:]}" + elif isinstance(value, str) and value.startswith('#') and len(value) == 9: + pass # Already aRGB format + elif isinstance(value, str) and value.startswith('#'): + pass # Unexpected format, keep as is + return styles + except Exception as e: + return styles + + def _validate_excel_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]: + """Validate and fix contrast issues in AI-generated styles.""" + try: + # Fix table header contrast + if "table_header" in styles: + header = styles["table_header"] + bg_color = header.get("background", "#FFFFFF") + text_color = header.get("text_color", "#000000") + + # If both are white or both are dark, fix it + if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": + header["background"] = "#4F4F4F" + header["text_color"] = "#FFFFFF" + elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": + header["background"] = "#4F4F4F" + 
header["text_color"] = "#FFFFFF" + + # Fix table cell contrast + if "table_cell" in styles: + cell = styles["table_cell"] + bg_color = cell.get("background", "#FFFFFF") + text_color = cell.get("text_color", "#000000") + + # If both are white or both are dark, fix it + if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": + cell["background"] = "#FFFFFF" + cell["text_color"] = "#2F2F2F" + elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": + cell["background"] = "#FFFFFF" + cell["text_color"] = "#2F2F2F" + + return styles + + except Exception as e: + self.logger.warning(f"Style validation failed: {str(e)}") + return self._get_default_excel_styles() + + def _get_default_excel_styles(self) -> Dict[str, Any]: + """Default Excel styles with aRGB color format.""" + return { + "title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "center"}, + "heading": {"font_size": 14, "color": "#FF2F2F2F", "bold": True, "align": "left"}, + "table_header": {"background": "#FF4F4F4F", "text_color": "#FFFFFFFF", "bold": True, "align": "center"}, + "table_cell": {"background": "#FFFFFFFF", "text_color": "#FF2F2F2F", "bold": False, "align": "left"}, + "bullet_list": {"font_size": 11, "color": "#FF2F2F2F", "indent": 2}, + "paragraph": {"font_size": 11, "color": "#FF2F2F2F", "bold": False, "align": "left"}, + "code_block": {"font": "Courier New", "font_size": 10, "color": "#FF2F2F2F", "background": "#FFF5F5F5"} + } + + def _create_excel_sheets(self, wb: Workbook, json_content: Dict[str, Any], styles: Dict[str, Any]) -> Dict[str, Any]: + """Create Excel sheets based on content structure and user intent.""" + sheets = {} + + # Get sheet names from AI styles or generate based on content + sheet_names = styles.get("sheet_names", self._generate_sheet_names_from_content(json_content)) + self.services.utils.debugLogToFile(f"EXCEL SHEET NAMES: {sheet_names}", "EXCEL_RENDERER") + + # Create sheets + for i, sheet_name in enumerate(sheet_names): + if 
i == 0: + # Use the default sheet for the first sheet + sheet = wb.active + sheet.title = sheet_name + else: + # Create additional sheets + sheet = wb.create_sheet(sheet_name, i) + sheets[sheet_name.lower()] = sheet + + return sheets + + def _generate_sheet_names_from_content(self, json_content: Dict[str, Any]) -> List[str]: + """Generate sheet names based on actual content structure.""" + sections = json_content.get("sections", []) + + # If no sections, create a single sheet + if not sections: + return ["Content"] + + # Generate sheet names based on content structure + sheet_names = [] + + # Check if we have multiple table sections + table_sections = [s for s in sections if s.get("content_type") == "table"] + + if len(table_sections) > 1: + # Create separate sheets for each table + for i, section in enumerate(table_sections, 1): + section_title = section.get("title", f"Table {i}") + sheet_names.append(section_title[:31]) # Excel sheet name limit + else: + # Single table or mixed content - create main sheet + document_title = json_content.get("metadata", {}).get("title", "Document") + sheet_names.append(document_title[:31]) # Excel sheet name limit + + # Add additional sheets for other content types + content_types = set() + for section in sections: + content_type = section.get("content_type", "paragraph") + content_types.add(content_type) + + if "table" in content_types and len(table_sections) == 1: + sheet_names.append("Table Data") + if "list" in content_types: + sheet_names.append("Lists") + if "paragraph" in content_types or "heading" in content_types: + sheet_names.append("Text") + + # Limit to 4 sheets maximum + return sheet_names[:4] + + def _populate_excel_sheets(self, sheets: Dict[str, Any], json_content: Dict[str, Any], styles: Dict[str, Any]) -> None: + """Populate Excel sheets with content from JSON based on actual sheet names.""" + try: + # Get the actual sheet names that were created + sheet_names = list(sheets.keys()) + + if not sheet_names: + 
    def _populate_table_sheet(self, sheet, section: Dict[str, Any], styles: Dict[str, Any], sheet_title: str):
        """Populate a worksheet with a single table section.

        Layout: sheet title in A1, header row at row 3, data rows from row 4.
        All colors are run through _get_safe_color so openpyxl always receives
        valid aRGB strings.

        Args:
            sheet: Target worksheet.
            section: Section dict; table data is read from elements[0]
                ("headers" and "rows", per the canonical JSON format).
            styles: Style definitions ("title", "table_header", "table_cell").
            sheet_title: Text written into cell A1.
        """
        try:
            # Sheet title in A1, styled with the "title" color.
            sheet['A1'] = sheet_title
            sheet['A1'].font = Font(size=16, bold=True, color=self._get_safe_color(styles.get("title", {}).get("color", "FF1F4E79")))
            sheet['A1'].alignment = Alignment(horizontal="center")

            # Table data comes from the first element only (canonical JSON
            # format). NOTE(review): any additional elements are ignored —
            # confirm table sections never carry more than one element.
            elements = section.get("elements", [])
            if elements and isinstance(elements, list) and len(elements) > 0:
                table_data = elements[0]
                headers = table_data.get("headers", [])
                rows = table_data.get("rows", [])
            else:
                headers = []
                rows = []

            if not headers and not rows:
                # Nothing to render — leave a placeholder message.
                sheet['A3'] = "No table data available"
                return

            # Header row at row 3; bold/background only when the style asks.
            header_style = styles.get("table_header", {})
            for col, header in enumerate(headers, 1):
                cell = sheet.cell(row=3, column=col, value=header)
                if header_style.get("bold"):
                    cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000")))
                if header_style.get("background"):
                    cell.fill = PatternFill(start_color=self._get_safe_color(header_style["background"]), end_color=self._get_safe_color(header_style["background"]), fill_type="solid")

            # Data rows start at row 4 (directly under the header row).
            cell_style = styles.get("table_cell", {})
            for row_idx, row_data in enumerate(rows, 4):
                for col_idx, cell_value in enumerate(row_data, 1):
                    cell = sheet.cell(row=row_idx, column=col_idx, value=cell_value)
                    if cell_style.get("text_color"):
                        cell.font = Font(color=self._get_safe_color(cell_style["text_color"]))

            # Uniform width for every populated column.
            for col in range(1, len(headers) + 1):
                sheet.column_dimensions[get_column_letter(col)].width = 20

        except Exception as e:
            self.logger.warning(f"Could not populate table sheet: {str(e)}")
Font(bold=True) + + row = 7 + for key, value in metadata.items(): + if key != "title": + sheet[f'A{row}'] = f"{key.title()}:" + sheet[f'B{row}'] = str(value) + row += 1 + + # Content overview + sections = json_content.get("sections", []) + sheet[f'A{row + 1}'] = "Content Overview:" + sheet[f'A{row + 1}'].font = Font(bold=True) + + row += 2 + sheet[f'A{row}'] = f"Total Sections: {len(sections)}" + + # Count different content types + content_types = {} + for section in sections: + content_type = section.get("content_type", "unknown") + content_types[content_type] = content_types.get(content_type, 0) + 1 + + for content_type, count in content_types.items(): + row += 1 + sheet[f'A{row}'] = f"{content_type.title()} Sections: {count}" + + # Add all content to this sheet + row += 2 + for section in sections: + row = self._add_section_to_sheet(sheet, section, styles, row) + row += 1 # Empty row between sections + + # Auto-adjust column widths + sheet.column_dimensions['A'].width = 20 + sheet.column_dimensions['B'].width = 30 + + except Exception as e: + self.logger.warning(f"Could not populate main sheet: {str(e)}") + + def _populate_content_type_sheets(self, sheets: Dict[str, Any], json_content: Dict[str, Any], styles: Dict[str, Any], sheet_names: List[str]): + """Populate additional sheets based on content types.""" + try: + sections = json_content.get("sections", []) + + for sheet_name in sheet_names: + if sheet_name not in sheets: + continue + + sheet = sheets[sheet_name] + sheet_title = sheet_name.title() + sheet['A1'] = sheet_title + sheet['A1'].font = Font(size=16, bold=True) + + row = 3 + + # Filter sections by content type + if sheet_name == "tables": + filtered_sections = [s for s in sections if s.get("content_type") == "table"] + elif sheet_name == "lists": + filtered_sections = [s for s in sections if s.get("content_type") == "list"] + elif sheet_name == "text": + filtered_sections = [s for s in sections if s.get("content_type") in ["paragraph", "heading"]] + 
else: + filtered_sections = sections + + for section in filtered_sections: + row = self._add_section_to_sheet(sheet, section, styles, row) + row += 1 # Empty row between sections + + # Auto-adjust column widths + for col in range(1, 6): + sheet.column_dimensions[get_column_letter(col)].width = 20 + + except Exception as e: + self.logger.warning(f"Could not populate content type sheets: {str(e)}") + + def _add_section_to_sheet(self, sheet, section: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int: + """Add a section to a sheet and return the next row.""" + try: + # Add section title + section_title = section.get("title") + if section_title: + sheet[f'A{start_row}'] = f"# {section_title}" + sheet[f'A{start_row}'].font = Font(bold=True) + start_row += 1 + + # Process section based on type + section_type = section.get("content_type", "paragraph") + + # Handle all section types using elements array + elements = section.get("elements", []) + for element in elements: + if section_type == "table": + start_row = self._add_table_to_excel(sheet, element, styles, start_row) + elif section_type == "list": + start_row = self._add_list_to_excel(sheet, element, styles, start_row) + elif section_type == "paragraph": + start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row) + elif section_type == "heading": + start_row = self._add_heading_to_excel(sheet, element, styles, start_row) + else: + start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row) + + return start_row + + except Exception as e: + self.logger.warning(f"Could not add section to sheet: {str(e)}") + return start_row + 1 + + def _add_table_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int: + """Add a table element to Excel sheet.""" + try: + # In canonical JSON format, table elements have headers and rows directly + headers = element.get("headers", []) + rows = element.get("rows", []) + + if not headers and not rows: + return 
start_row + + # Add headers + header_style = styles.get("table_header", {}) + for col, header in enumerate(headers, 1): + cell = sheet.cell(row=start_row, column=col, value=header) + if header_style.get("bold"): + cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000"))) + if header_style.get("background"): + cell.fill = PatternFill(start_color=self._get_safe_color(header_style["background"]), end_color=self._get_safe_color(header_style["background"]), fill_type="solid") + + start_row += 1 + + # Add rows + cell_style = styles.get("table_cell", {}) + for row_data in rows: + for col, cell_value in enumerate(row_data, 1): + cell = sheet.cell(row=start_row, column=col, value=cell_value) + if cell_style.get("text_color"): + cell.font = Font(color=self._get_safe_color(cell_style["text_color"])) + start_row += 1 + + return start_row + + except Exception as e: + self.logger.warning(f"Could not add table to Excel: {str(e)}") + return start_row + 1 + + def _add_list_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int: + """Add a list element to Excel sheet.""" + try: + list_items = element.get("items", []) + + list_style = styles.get("bullet_list", {}) + for item in list_items: + sheet.cell(row=start_row, column=1, value=f"β€’ {item}") + if list_style.get("color"): + sheet.cell(row=start_row, column=1).font = Font(color=self._get_safe_color(list_style["color"])) + start_row += 1 + + return start_row + + except Exception as e: + self.logger.warning(f"Could not add list to Excel: {str(e)}") + return start_row + 1 + + def _add_paragraph_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int: + """Add a paragraph element to Excel sheet.""" + try: + text = element.get("text", "") + if text: + sheet.cell(row=start_row, column=1, value=text) + + paragraph_style = styles.get("paragraph", {}) + if paragraph_style.get("color"): + sheet.cell(row=start_row, 
column=1).font = Font(color=self._get_safe_color(paragraph_style["color"])) + + start_row += 1 + + return start_row + + except Exception as e: + self.logger.warning(f"Could not add paragraph to Excel: {str(e)}") + return start_row + 1 + + def _add_heading_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int: + """Add a heading element to Excel sheet.""" + try: + text = element.get("text", "") + level = element.get("level", 1) + + if text: + sheet.cell(row=start_row, column=1, value=text) + + heading_style = styles.get("heading", {}) + font_size = heading_style.get("font_size", 14) + if level > 1: + font_size = max(10, font_size - (level - 1) * 2) + + sheet.cell(row=start_row, column=1).font = Font( + size=font_size, + bold=True, + color=self._get_safe_color(heading_style.get("color", "FF000000")) + ) + + start_row += 1 + + return start_row + + except Exception as e: + self.logger.warning(f"Could not add heading to Excel: {str(e)}") + return start_row + 1 + + def _format_timestamp(self) -> str: + """Format current timestamp for document generation.""" + return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC") diff --git a/modules/services/serviceGeneration/renderers/text_renderer.py b/modules/services/serviceGeneration/renderers/text_renderer.py deleted file mode 100644 index 67e32069..00000000 --- a/modules/services/serviceGeneration/renderers/text_renderer.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Text renderer for report generation. 
-""" - -from .base_renderer import BaseRenderer -from typing import Dict, Any, Tuple, List - -class TextRenderer(BaseRenderer): - """Renders content to plain text format with format-specific extraction.""" - - @classmethod - def get_supported_formats(cls) -> List[str]: - """Return supported text formats (excluding formats with dedicated renderers).""" - return [ - 'txt', 'text', 'plain', - # Programming languages - 'js', 'javascript', 'ts', 'typescript', 'jsx', 'tsx', - 'py', 'python', 'java', 'cpp', 'c', 'h', 'hpp', - 'cs', 'csharp', 'php', 'rb', 'ruby', 'go', 'rs', 'rust', - 'swift', 'kt', 'kotlin', 'scala', 'r', 'm', 'objc', - 'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd', - # Web technologies (excluding html/htm which have dedicated renderer) - 'css', 'scss', 'sass', 'less', 'xml', 'yaml', 'yml', 'toml', 'ini', 'cfg', - # Data formats (excluding csv, md/markdown which have dedicated renderers) - 'tsv', 'log', 'rst', 'sql', 'dockerfile', 'dockerignore', 'gitignore', - # Configuration files - 'env', 'properties', 'conf', 'config', 'rc', - 'gitattributes', 'editorconfig', 'eslintrc', - # Documentation - 'readme', 'changelog', 'license', 'authors', - 'contributing', 'todo', 'notes', 'docs' - ] - - @classmethod - def get_format_aliases(cls) -> List[str]: - """Return format aliases.""" - return [ - 'ascii', 'utf8', 'utf-8', 'code', 'source', - 'script', 'program', 'file', 'document', - 'raw', 'unformatted', 'plaintext' - ] - - @classmethod - def get_priority(cls) -> int: - """Return priority for text renderer.""" - return 90 - - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: - """Return only plain-text guidelines; global prompt is built centrally.""" - return ( - "TEXT FORMAT GUIDELINES:\n" - "- Output ONLY plain text (no markdown or HTML).\n" - "- Use clear headings (you may underline with === or --- when helpful).\n" - "- Use simple bullet lists with '-' and tables with '|' when needed.\n" - "- Preserve indentation for code-like content if 
present.\n" - "OUTPUT: Return ONLY the raw text content." - ) - - async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: - """Render extracted content to plain text format.""" - try: - # The extracted content should already be formatted text from the AI - # Just clean it up - text_content = self._clean_text_content(extracted_content, title) - - return text_content, "text/plain" - - except Exception as e: - self.logger.error(f"Error rendering text: {str(e)}") - # Return minimal text fallback - return f"{title}\n\nError rendering report: {str(e)}", "text/plain" - - def _clean_text_content(self, content: str, title: str) -> str: - """Clean and validate text content from AI.""" - content = content.strip() - - # Remove markdown code blocks if present - if content.startswith("```") and content.endswith("```"): - lines = content.split('\n') - if len(lines) > 2: - content = '\n'.join(lines[1:-1]).strip() - - # Remove any remaining markdown formatting - content = content.replace('**', '').replace('*', '') - content = content.replace('__', '').replace('_', '') - - # Clean up any HTML-like tags that might have slipped through - import re - content = re.sub(r'<[^>]+>', '', content) - - # Ensure proper line endings - content = content.replace('\r\n', '\n').replace('\r', '\n') - - return content diff --git a/modules/services/serviceGeneration/subJsonSchema.py b/modules/services/serviceGeneration/subJsonSchema.py new file mode 100644 index 00000000..868a6ca4 --- /dev/null +++ b/modules/services/serviceGeneration/subJsonSchema.py @@ -0,0 +1,517 @@ +""" +JSON Schema definitions for AI-generated document structures. +This module provides schemas that guide AI to generate structured JSON output. 
+""" + +from typing import Dict, Any + + +def get_multi_document_subJsonSchema() -> Dict[str, Any]: + """Get the JSON schema for multi-document generation.""" + return { + "type": "object", + "required": ["metadata", "documents"], + "properties": { + "metadata": { + "type": "object", + "required": ["title", "splitStrategy"], + "properties": { + "title": {"type": "string", "description": "Document title"}, + "splitStrategy": { + "type": "string", + "enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"], + "description": "Strategy for splitting content into multiple files" + }, + "splitCriteria": { + "type": "object", + "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)" + }, + "fileNamingPattern": { + "type": "string", + "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')" + }, + "author": {"type": "string", "description": "Document author (optional)"}, + "source_documents": { + "type": "array", + "items": {"type": "string"}, + "description": "List of source document IDs" + }, + "extraction_method": { + "type": "string", + "default": "ai_extraction", + "description": "Method used for extraction" + } + } + }, + "documents": { + "type": "array", + "description": "Array of individual documents to generate", + "items": { + "type": "object", + "required": ["id", "title", "sections", "filename"], + "properties": { + "id": {"type": "string", "description": "Unique document identifier"}, + "title": {"type": "string", "description": "Document title"}, + "filename": {"type": "string", "description": "Generated filename"}, + "sections": { + "type": "array", + "description": "Document sections containing structured content", + "items": { + "type": "object", + "required": ["id", "content_type", "elements", "order"], + "properties": { + "id": {"type": "string", "description": "Unique section identifier"}, + "title": {"type": "string", "description": "Section title (optional)"}, + "content_type": { 
+ "type": "string", + "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"], + "description": "Primary content type of this section" + }, + "elements": { + "type": "array", + "description": "Content elements in this section", + "items": { + "oneOf": [ + {"$ref": "#/definitions/table"}, + {"$ref": "#/definitions/bullet_list"}, + {"$ref": "#/definitions/paragraph"}, + {"$ref": "#/definitions/heading"}, + {"$ref": "#/definitions/code_block"} + ] + } + }, + "order": {"type": "integer", "description": "Section order in document"}, + "metadata": { + "type": "object", + "description": "Additional section metadata" + } + } + } + }, + "metadata": { + "type": "object", + "description": "Document-specific metadata" + } + } + } + } + }, + "definitions": { + "table": { + "type": "object", + "required": ["headers", "rows"], + "properties": { + "headers": { + "type": "array", + "items": {"type": "string"}, + "description": "Table column headers" + }, + "rows": { + "type": "array", + "items": { + "type": "array", + "items": {"type": "string"} + }, + "description": "Table data rows" + }, + "caption": { + "type": "string", + "description": "Table caption (optional)" + } + } + }, + "bullet_list": { + "type": "object", + "required": ["items"], + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "required": ["text"], + "properties": { + "text": {"type": "string", "description": "List item text"}, + "subitems": { + "type": "array", + "items": {"$ref": "#/definitions/list_item"}, + "description": "Nested sub-items (optional)" + } + } + }, + "description": "List items" + }, + "list_type": { + "type": "string", + "enum": ["bullet", "numbered", "checklist"], + "default": "bullet", + "description": "Type of list" + } + } + }, + "list_item": { + "type": "object", + "required": ["text"], + "properties": { + "text": {"type": "string", "description": "List item text"}, + "subitems": { + "type": "array", + "items": {"$ref": 
"#/definitions/list_item"}, + "description": "Nested sub-items (optional)" + } + } + }, + "paragraph": { + "type": "object", + "required": ["text"], + "properties": { + "text": {"type": "string", "description": "Paragraph text"}, + "formatting": { + "type": "object", + "description": "Text formatting (bold, italic, etc.)" + } + } + }, + "heading": { + "type": "object", + "required": ["text", "level"], + "properties": { + "text": {"type": "string", "description": "Heading text"}, + "level": { + "type": "integer", + "minimum": 1, + "maximum": 6, + "description": "Heading level (1-6)" + } + } + }, + "code_block": { + "type": "object", + "required": ["code"], + "properties": { + "code": {"type": "string", "description": "Code content"}, + "language": {"type": "string", "description": "Programming language (optional)"} + } + } + } + } + +def get_document_subJsonSchema() -> Dict[str, Any]: + """Get the JSON schema for structured document generation (single document).""" + return { + "type": "object", + "required": ["metadata", "sections"], + "properties": { + "metadata": { + "type": "object", + "required": ["title"], + "properties": { + "title": {"type": "string", "description": "Document title"}, + "author": {"type": "string", "description": "Document author (optional)"}, + "source_documents": { + "type": "array", + "items": {"type": "string"}, + "description": "List of source document IDs" + }, + "extraction_method": { + "type": "string", + "default": "ai_extraction", + "description": "Method used for extraction" + } + } + }, + "sections": { + "type": "array", + "description": "Document sections containing structured content", + "items": { + "type": "object", + "required": ["id", "content_type", "elements", "order"], + "properties": { + "id": {"type": "string", "description": "Unique section identifier"}, + "title": {"type": "string", "description": "Section title (optional)"}, + "content_type": { + "type": "string", + "enum": ["table", "list", "paragraph", "heading", 
"code", "image", "mixed"], + "description": "Primary content type of this section" + }, + "elements": { + "type": "array", + "description": "Content elements in this section", + "items": { + "oneOf": [ + {"$ref": "#/definitions/table"}, + {"$ref": "#/definitions/bullet_list"}, + {"$ref": "#/definitions/paragraph"}, + {"$ref": "#/definitions/heading"}, + {"$ref": "#/definitions/code_block"} + ] + } + }, + "order": {"type": "integer", "description": "Section order in document"}, + "metadata": { + "type": "object", + "description": "Additional section metadata" + } + } + } + }, + "summary": { + "type": "string", + "description": "Document summary (optional)" + }, + "tags": { + "type": "array", + "items": {"type": "string"}, + "description": "Document tags for categorization" + } + }, + "definitions": { + "table": { + "type": "object", + "required": ["headers", "rows"], + "properties": { + "headers": { + "type": "array", + "items": {"type": "string"}, + "description": "Table column headers" + }, + "rows": { + "type": "array", + "items": { + "type": "array", + "items": {"type": "string"} + }, + "description": "Table data rows" + }, + "caption": { + "type": "string", + "description": "Table caption (optional)" + } + } + }, + "bullet_list": { + "type": "object", + "required": ["items"], + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "required": ["text"], + "properties": { + "text": {"type": "string", "description": "List item text"}, + "subitems": { + "type": "array", + "items": {"$ref": "#/definitions/list_item"}, + "description": "Nested sub-items (optional)" + } + } + }, + "description": "List items" + }, + "list_type": { + "type": "string", + "enum": ["bullet", "numbered", "checklist"], + "default": "bullet", + "description": "Type of list" + } + } + }, + "list_item": { + "type": "object", + "required": ["text"], + "properties": { + "text": {"type": "string", "description": "List item text"}, + "subitems": { + "type": "array", + 
"items": {"$ref": "#/definitions/list_item"}, + "description": "Nested sub-items (optional)" + } + } + }, + "paragraph": { + "type": "object", + "required": ["text"], + "properties": { + "text": {"type": "string", "description": "Paragraph text"}, + "formatting": { + "type": "object", + "description": "Text formatting (bold, italic, etc.)" + } + } + }, + "heading": { + "type": "object", + "required": ["text", "level"], + "properties": { + "text": {"type": "string", "description": "Heading text"}, + "level": { + "type": "integer", + "minimum": 1, + "maximum": 6, + "description": "Heading level (1-6)" + } + } + }, + "code_block": { + "type": "object", + "required": ["code"], + "properties": { + "code": {"type": "string", "description": "Code content"}, + "language": {"type": "string", "description": "Programming language (optional)"} + } + } + } + } + + +def get_extraction_prompt_template() -> str: + """Get the template for AI extraction prompts that request JSON output.""" + return """ +You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document. + +IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure. + +JSON Schema Requirements: +- Extract the actual data from the source documents +- If content is a table, extract it as a table with headers and rows +- If content is a list, extract it as a structured list with items +- If content is text, extract it as paragraphs or headings +- Preserve the original structure and data - do not summarize or interpret +- Use the exact JSON schema provided + +Content Types to Extract: +1. Tables: Extract all rows and columns with proper headers +2. Lists: Extract all items with proper nesting +3. Headings: Extract with appropriate levels +4. Paragraphs: Extract as structured text +5. 
Code: Extract code blocks with language identification + +Return only the JSON structure following the schema. Do not include any text before or after the JSON. +""" + + +def get_generation_prompt_template() -> str: + """Get the template for AI generation prompts that work with JSON input.""" + return """ +You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content. + +IMPORTANT: You must respond with valid JSON only, following the document schema. + +Generation Guidelines: +- Use the provided JSON structure as the foundation +- Enhance the content with proper formatting and organization +- Ensure logical flow and readability +- Maintain the original data integrity +- Add appropriate headings and sections +- Organize content in a logical sequence + +Content Enhancement: +- Tables: Ensure proper headers and data alignment +- Lists: Use appropriate list types (bullet, numbered, checklist) +- Headings: Use appropriate heading levels for hierarchy +- Paragraphs: Ensure proper text flow and formatting +- Code: Preserve code blocks with proper language identification + +Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON. 
+""" + + +def get_adaptive_json_schema(prompt_analysis: Dict[str, Any] = None) -> Dict[str, Any]: + """Automatically select appropriate schema based on prompt analysis.""" + if prompt_analysis and prompt_analysis.get("is_multi_file", False): + return get_multi_document_subJsonSchema() + else: + return get_document_subJsonSchema() + +def validate_json_document(json_data: Dict[str, Any]) -> bool: + """Validate that the JSON data follows the document schema.""" + try: + # Basic validation - check required fields + if not isinstance(json_data, dict): + return False + + # Check if it's multi-document or single-document structure + if "documents" in json_data: + # Multi-document structure + if "metadata" not in json_data: + return False + + metadata = json_data["metadata"] + if not isinstance(metadata, dict) or "title" not in metadata or "splitStrategy" not in metadata: + return False + + documents = json_data["documents"] + if not isinstance(documents, list): + return False + + # Validate each document + for doc in documents: + if not isinstance(doc, dict): + return False + + required_fields = ["id", "title", "sections", "filename"] + for field in required_fields: + if field not in doc: + return False + + # Validate sections in each document + sections = doc.get("sections", []) + if not isinstance(sections, list): + return False + + for section in sections: + if not isinstance(section, dict): + return False + + section_required = ["id", "content_type", "elements", "order"] + for field in section_required: + if field not in section: + return False + + # Validate content_type + valid_types = ["table", "list", "paragraph", "heading", "code", "image", "mixed"] + if section["content_type"] not in valid_types: + return False + + # Validate elements + if not isinstance(section["elements"], list): + return False + + elif "sections" in json_data: + # Single-document structure (existing validation) + if "metadata" not in json_data: + return False + + metadata = 
json_data["metadata"] + if not isinstance(metadata, dict) or "title" not in metadata: + return False + + sections = json_data["sections"] + if not isinstance(sections, list): + return False + + # Validate each section + for i, section in enumerate(sections): + if not isinstance(section, dict): + return False + + required_fields = ["id", "content_type", "elements", "order"] + for field in required_fields: + if field not in section: + return False + + # Validate content_type + valid_types = ["table", "list", "paragraph", "heading", "code", "image", "mixed"] + if section["content_type"] not in valid_types: + return False + + # Validate elements + if not isinstance(section["elements"], list): + return False + else: + return False + + return True + + except Exception: + return False diff --git a/modules/services/serviceGeneration/subPromptBuilder.py b/modules/services/serviceGeneration/subPromptBuilder.py new file mode 100644 index 00000000..cbcce375 --- /dev/null +++ b/modules/services/serviceGeneration/subPromptBuilder.py @@ -0,0 +1,738 @@ +""" +Prompt builder for AI document generation and extraction. +This module builds prompts for AI services to extract and generate documents. +""" + +import json +import logging +from typing import Dict, Any, Optional, List, TYPE_CHECKING +from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + +# Type hint for renderer parameter +if TYPE_CHECKING: + from .renderers.rendererBaseTemplate import BaseRenderer + _RendererLike = BaseRenderer +else: + _RendererLike = Any + +logger = logging.getLogger(__name__) + +async def buildAdaptiveExtractionPrompt( + outputFormat: str, + userPrompt: str, + title: str, + promptAnalysis: Dict[str, Any], + aiService=None, + services=None +) -> str: + """ + Build adaptive extraction prompt based on AI analysis. + Uses multi-file or single-file approach based on analysis. 
+ """ + + # Multi-file example data instead of schema + multi_file_example = { + "metadata": { + "title": "Multi-Document Example", + "splitStrategy": "by_section", + "source_documents": ["doc_001"], + "extraction_method": "ai_extraction" + }, + "documents": [ + { + "id": "doc_section_1", + "title": "Section 1 Title", + "filename": "section_1.xlsx", + "sections": [ + { + "id": "section_1", + "content_type": "heading", + "elements": [ + { + "level": 1, + "text": "1. SECTION TITLE" + } + ], + "order": 1 + }, + { + "id": "section_2", + "content_type": "paragraph", + "elements": [ + { + "text": "This is the actual content that should be extracted from the document." + } + ], + "order": 2 + }, + { + "id": "section_3", + "content_type": "table", + "elements": [ + { + "headers": ["Column 1", "Column 2"], + "rows": [["Value 1", "Value 2"]] + } + ], + "order": 3 + } + ] + } + ] + } + + # Single-file example data instead of schema + single_file_example = { + "metadata": { + "title": "Single Document Example", + "source_documents": ["doc_001"], + "extraction_method": "ai_extraction" + }, + "sections": [ + { + "id": "section_1", + "content_type": "heading", + "elements": [ + { + "level": 1, + "text": "1. SECTION TITLE" + } + ], + "order": 1 + }, + { + "id": "section_2", + "content_type": "paragraph", + "elements": [ + { + "text": "This is the actual content that should be extracted from the document." + } + ], + "order": 2 + }, + { + "id": "section_3", + "content_type": "table", + "elements": [ + { + "headers": ["Column 1", "Column 2"], + "rows": [["Value 1", "Value 2"]] + } + ], + "order": 3 + } + ] + } + + if promptAnalysis.get("is_multi_file", False): + # Multi-file prompt + adaptive_prompt = f""" +{userPrompt} + +You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. 
+ +TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file. + +REQUIREMENTS: +1. Analyze the document content provided in the context below +2. Identify distinct sections in the document (by headings, topics, or logical breaks) +3. Create one JSON document entry for each section found +4. Extract the real content from each section (headings, paragraphs, lists, etc.) +5. Generate appropriate filenames for each section + +CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array. + +OUTPUT FORMAT: Return only valid JSON in this exact structure: +{json.dumps(multi_file_example, indent=2)} + +IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have: +- "id": unique identifier +- "title": section title from the document +- "filename": appropriate filename for the section +- "sections": array of content sections + +DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level. + +INSTRUCTIONS: +- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document +- Use actual section titles, headings, and text from the document +- Create meaningful filenames based on section content +- Ensure each section contains the complete content for that part of the document +- Do not use generic placeholder text like "Section 1", "Section 2" +- Extract real headings, paragraphs, lists, and other content elements +- CRITICAL: Return JSON with "documents" array, not "sections" array + +CONTEXT (Document Content): + +Content Types to Extract: +1. Tables: Extract all rows and columns with proper headers +2. Lists: Extract all items with proper nesting +3. Headings: Extract with appropriate levels +4. Paragraphs: Extract as structured text +5. Code: Extract code blocks with language identification +6. 
Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements + +Image Analysis Requirements: +- If you cannot analyze an image for any reason, explain why in the JSON response +- Describe everything you see in the image +- Include all text content, tables, logos, graphics, layout, and visual elements +- If the image is too small, corrupted, or unclear, explain this +- Always provide feedback - never return empty responses + +Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. + +Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. +""".strip() + else: + # Single-file prompt - use example data instead of schema + adaptive_prompt = f""" +{userPrompt} + +You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. + +TASK: Extract the actual content from the document and organize it into structured sections. + +REQUIREMENTS: +1. Analyze the document content provided in the context below +2. Extract all content and organize it into logical sections +3. Create structured JSON with sections containing the extracted content +4. Preserve the original structure and data + +OUTPUT FORMAT: Return only valid JSON in this exact structure: +{json.dumps(single_file_example, indent=2)} + +INSTRUCTIONS: +- Replace example data with actual content from the document +- Use actual headings, paragraphs, and text from the document +- Ensure all content is properly structured +- Do not use generic placeholder text +- Extract real content from the documents + +CONTEXT (Document Content): + +Content Types to Extract: +1. Tables: Extract all rows and columns with proper headers +2. 
Lists: Extract all items with proper nesting +3. Headings: Extract with appropriate levels +4. Paragraphs: Extract as structured text +5. Code: Extract code blocks with language identification +6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements + +Image Analysis Requirements: +- If you cannot analyze an image for any reason, explain why in the JSON response +- Describe everything you see in the image +- Include all text content, tables, logos, graphics, layout, and visual elements +- If the image is too small, corrupted, or unclear, explain this +- Always provide feedback - never return empty responses + +Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. + +Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. +""".strip() + + return adaptive_prompt + +async def buildGenericExtractionPrompt( + outputFormat: str, + userPrompt: str, + title: str, + aiService=None, + services=None +) -> str: + """Build generic extraction prompt that works for both single and multi-file.""" + + # Use AI to determine the best approach + if aiService: + try: + analysis_prompt = f""" +Analyze this user request and determine the best JSON structure for document extraction. + +User request: "{userPrompt}" + +Respond with JSON only: +{{ + "requires_multi_file": true/false, + "recommended_schema": "single_document|multi_document", + "split_approach": "description of how to organize content", + "file_naming": "suggested naming pattern" +}} + +Consider the user's intent and the most logical way to organize the extracted content. 
+""" + + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options) + response = await aiService.aiObjects.call(request) + + if response and response.content: + import re + + result = response.content.strip() + json_match = re.search(r'\{.*\}', result, re.DOTALL) + if json_match: + result = json_match.group(0) + + analysis = json.loads(result) + + # Use analysis to build appropriate prompt + return await buildAdaptiveExtractionPrompt( + outputFormat, userPrompt, title, analysis, aiService, services + ) + except Exception as e: + services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER") + + # Fallback to single-file prompt + example_data = { + "metadata": { + "title": "Example Document", + "author": "AI Assistant", + "source_documents": ["document_001"], + "extraction_method": "ai_extraction" + }, + "sections": [ + { + "id": "section_001", + "content_type": "heading", + "elements": [ + { + "level": 1, + "text": "1. SECTION TITLE" + } + ], + "order": 1, + "metadata": {} + } + ], + "summary": "", + "tags": [] + } + + return f""" +{userPrompt} + +You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. + +TASK: Extract the actual content from the document and organize it into structured sections. + +REQUIREMENTS: +1. Analyze the document content provided in the context below +2. Extract all content and organize it into logical sections +3. Create structured JSON with sections containing the extracted content +4. 
Preserve the original structure and data + +OUTPUT FORMAT: Return only valid JSON in this exact structure: +{json.dumps(example_data, indent=2)} + +Requirements: +- Preserve all original data - do not summarize or interpret +- Use the exact JSON format shown above +- Maintain data integrity and structure + +Content Types to Extract: +1. Tables: Extract all rows and columns with proper headers +2. Lists: Extract all items with proper nesting +3. Headings: Extract with appropriate levels +4. Paragraphs: Extract as structured text +5. Code: Extract code blocks with language identification +6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements + +Image Analysis Requirements: +- If you cannot analyze an image for any reason, explain why in the JSON response +- Describe everything you see in the image +- Include all text content, tables, logos, graphics, layout, and visual elements +- If the image is too small, corrupted, or unclear, explain this +- Always provide feedback - never return empty responses + +Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. + +Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. + +DO NOT return a schema description - return actual extracted content in the JSON format shown above. 
+""" + +async def buildExtractionPrompt( + outputFormat: str, + renderer: _RendererLike, + userPrompt: str, + title: str, + aiService=None, + services=None +) -> str: + """ + Build the final extraction prompt by combining: + - Parsed extraction intent from user prompt (using AI) + - Generic cross-format instructions (filename header + real-data policy) + - Format-specific guidelines snippet provided by the renderer + + The AI must place a single filename header at the very top: + FILENAME: + followed by a blank line and then ONLY the document content according to the target format. + """ + + # Parse user prompt to separate extraction intent from generation format using AI + extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services) + + # Import JSON schema for structured output + from .subJsonSchema import get_document_subJsonSchema + jsonSchema = get_document_subJsonSchema() + + # Generic block for JSON extraction - use mixed example data showing different content types + example_data = { + "metadata": { + "title": "Example Document", + "author": "AI Assistant", + "source_documents": ["document_001"], + "extraction_method": "ai_extraction" + }, + "sections": [ + { + "id": "section_001", + "content_type": "heading", + "elements": [ + { + "level": 1, + "text": "1. INTRODUCTION" + } + ], + "order": 1, + "metadata": {} + }, + { + "id": "section_002", + "content_type": "paragraph", + "elements": [ + { + "text": "This is a sample paragraph with actual content that should be extracted from the document." 
+ } + ], + "order": 2, + "metadata": {} + }, + { + "id": "section_003", + "content_type": "table", + "elements": [ + { + "headers": ["Column 1", "Column 2", "Column 3"], + "rows": [ + ["Value 1", "Value 2", "Value 3"], + ["Value 4", "Value 5", "Value 6"] + ] + } + ], + "order": 3, + "metadata": {} + } + ], + "summary": "", + "tags": [] + } + + genericIntro = f""" +{extractionIntent} + +You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. + +TASK: Extract the actual content from the document and organize it into structured sections. + +REQUIREMENTS: +1. Analyze the document content provided in the context below +2. Extract all content and organize it into logical sections +3. Create structured JSON with sections containing the extracted content +4. Preserve the original structure and data + +OUTPUT FORMAT: Return only valid JSON in this exact structure: +{json.dumps(example_data, indent=2)} + +Requirements: +- Preserve all original data - do not summarize or interpret +- Use the exact JSON format shown above +- Maintain data integrity and structure + +Content Types to Extract: +1. Tables: Extract all rows and columns with proper headers +2. Lists: Extract all items with proper nesting +3. Headings: Extract with appropriate levels +4. Paragraphs: Extract as structured text +5. Code: Extract code blocks with language identification +6. 
Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements + +Image Analysis Requirements: +- If you cannot analyze an image for any reason, explain why in the JSON response +- Describe everything you see in the image +- Include all text content, tables, logos, graphics, layout, and visual elements +- If the image is too small, corrupted, or unclear, explain this +- Always provide feedback - never return empty responses + +Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. + +Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. + +DO NOT return a schema description - return actual extracted content in the JSON format shown above. +""" + + # Get format-specific guidelines from renderer + formatGuidelines = "" + try: + if hasattr(renderer, 'getExtractionGuidelines'): + formatGuidelines = renderer.getExtractionGuidelines() + except Exception: + pass + + # Combine all parts + finalPrompt = f"{genericIntro}\n\n{formatGuidelines}".strip() + + # Save extraction prompt to debug file - only if debug enabled + try: + debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if debug_enabled: + import os + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + os.makedirs(debug_root, exist_ok=True) + with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f: + f.write(finalPrompt) + except Exception: + pass + + return finalPrompt + + +async def buildGenerationPrompt( + outputFormat: str, + userPrompt: str, + title: str, + aiService=None, + services=None +) -> str: + """ + Use AI to build the generation prompt based on user intent and format requirements. 
+ Focus on what's important for the user and how to structure the content. + """ + if not aiService: + # Fallback if no AI service available + return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content." + + try: + # Protect userPrompt from injection + safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ') + + # Debug output + services.utils.debugLogToFile(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'", "PROMPT_BUILDER") + + # AI call to generate the appropriate generation prompt + generationPromptRequest = f""" +You are creating instructions for an AI to generate JSON content in the CANONICAL FORMAT that will be converted to a {outputFormat} document. + +User request: "{safeUserPrompt}" +Document title: "{title}" +Target format: {outputFormat} + +Write clear, detailed instructions that tell the AI how to generate JSON content using the CANONICAL JSON FORMAT. Focus on: + +1. What content is most important for the user +2. How to structure and organize the content using the canonical JSON format with 'sections' +3. Specific formatting requirements for the target format +4. Language requirements to preserve +5. How to ensure the JSON content meets the user's needs + +CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this exact structure: +{{ + "metadata": {{ + "title": "Document Title" + }}, + "sections": [ + {{ + "id": "section_1", + "content_type": "heading", + "elements": [ + {{ + "level": 1, + "text": "1. SECTION TITLE" + }} + ], + "order": 1 + }}, + {{ + "id": "section_2", + "content_type": "paragraph", + "elements": [ + {{ + "text": "This is the actual content that should be extracted from the document." 
+ }} + ], + "order": 2 + }}, + {{ + "id": "section_3", + "content_type": "table", + "elements": [ + {{ + "headers": ["Column 1", "Column 2", "Column 3"], + "rows": [ + ["Value 1", "Value 2", "Value 3"], + ["Value 4", "Value 5", "Value 6"] + ] + }} + ], + "order": 3 + }} + ] +}} + +The AI should NOT create format-specific structures like "sheets" or "columns" - only use the canonical format with "sections" and "elements". + +Write the instructions as plain text, not JSON. Start with "Generate JSON content that..." and provide clear, actionable instructions for creating structured JSON data in the canonical format. +""" + + # Call AI service to generate the prompt + services.utils.debugLogToFile("GENERATION PROMPT REQUEST: Calling AI for generation prompt...", "PROMPT_BUILDER") + + # Import and set proper options for AI call + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options) + response = await aiService.aiObjects.call(request) + result = response.content if response else "" + + # Replace the placeholder that the AI created with actual format rules + if result: + formatRules = _getFormatRules(outputFormat) + result = result.replace("PLACEHOLDER_FOR_FORMAT_RULES", formatRules) + + # Debug output + services.utils.debugLogToFile(f"GENERATION PROMPT: Generated successfully", "PROMPT_BUILDER") + + # Save full generation prompt and AI response to debug file - only if debug enabled + try: + debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if debug_enabled: + import os + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + os.makedirs(debug_root, exist_ok=True) + with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") 
as f: + f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n") + f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n") + f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n") + except Exception: + pass + + return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content." + + except Exception as e: + # Fallback on any error - preserve user prompt for language instructions + services.utils.debugLogToFile(f"DEBUG: AI generation prompt failed: {str(e)}", "PROMPT_BUILDER") + return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}" + + +def _getFormatRules(outputFormat: str) -> str: + """ + Get format-specific rules for the generation prompt. + """ + format_rules = { + "xlsx": """ +XLSX Format Rules: +- Create tables with clear headers and organized data +- Use appropriate column widths and formatting +- Include summary information if relevant +- Ensure data is properly structured for spreadsheet analysis +""", + "pdf": """ +PDF Format Rules: +- Create professional document layout +- Use appropriate headings and sections +- Include proper spacing and formatting +- Ensure content is well-organized and readable +""", + "docx": """ +DOCX Format Rules: +- Create professional document layout +- Use appropriate headings and sections +- Include proper spacing and formatting +- Ensure content is well-organized and readable +""", + "html": """ +HTML Format Rules: +- Create clean, semantic HTML structure +- Use appropriate tags for content organization +- Include proper styling classes +- Ensure content is accessible and well-formatted +""", + "json": """ +JSON Format Rules: +- Create well-structured JSON data +- Use appropriate nesting and organization +- Include metadata and context information +- Ensure data is properly formatted and valid +""", + "csv": """ 
+CSV Format Rules: +- Create clear, organized tabular data +- Use appropriate headers and data types +- Ensure proper CSV formatting +- Include all relevant data in structured format +""", + "txt": """ +TXT Format Rules: +- Create clean, readable text format +- Use appropriate spacing and organization +- Include clear headings and sections +- Ensure content is well-structured and easy to read +""" + } + + return format_rules.get(outputFormat.lower(), f""" +{outputFormat.upper()} Format Rules: +- Create well-structured content appropriate for {outputFormat} +- Use appropriate formatting and organization +- Ensure content is clear and professional +- Include all relevant information in proper format +""") + + +async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str: + """ + Parse user prompt to extract the core extraction intent. + """ + if not aiService: + return f"Extract content from the provided documents and create a {outputFormat} report." + + try: + analysis_prompt = f""" +Analyze this user request and extract the core extraction intent: + +User request: "{userPrompt}" +Target format: {outputFormat} + +Extract the main intent and requirements for document processing. Focus on: +1. What content needs to be extracted +2. How it should be organized +3. Any specific requirements or preferences + +Respond with a clear, concise statement of the extraction intent. +""" + + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options) + response = await aiService.aiObjects.call(request) + + if response and response.content: + return response.content.strip() + else: + return f"Extract content from the provided documents and create a {outputFormat} report." 
class NormalizationService:
    """
    Produces a single canonical table in merged JSON using an AI-provided header mapping
    and deterministic, in-code value normalization. No language heuristics in code.

    Pipeline: discoverStructures() inventories table headers/samples from the merged
    JSON -> requestHeaderMapping() asks the AI for a header mapping + normalization
    policy -> applyMapping() emits one canonical table section -> validateCanonical()
    produces a small success report. Debug artifacts are written only when
    APP_DEBUG_CHAT_WORKFLOW_ENABLED is set.
    """

    def __init__(self, services):
        # Service center; used for AI calls (services.ai) and config/debug (services.utils).
        self.services = services

    # Public API
    def discoverStructures(self, mergedJson: Dict[str, Any]) -> Dict[str, Any]:
        """
        Inventory all table headers and a few sample values from the merged JSON.

        Args:
            mergedJson: Merged document in the canonical sections/elements shape.

        Returns:
            Dict with "tableHeaders" (sorted unique header names) and
            "headerSamples" (up to 5 stringified sample values per header).
        """
        headers: Set[str] = set()
        samples: Dict[str, List[str]] = {}

        sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
        for section in sections:
            if not isinstance(section, dict):
                continue

            # Use only the fundamental agreed JSON structure: content_type/elements
            if section.get("content_type") != "table":
                continue

            # Extract table data from elements array (first matching element wins)
            hdrs = []
            rows = []
            for element in section.get("elements", []):
                if isinstance(element, dict) and "headers" in element and "rows" in element:
                    hdrs = element.get("headers") or []
                    rows = element.get("rows") or []
                    break

            if not hdrs or not rows:
                continue

            # Non-string headers are skipped from the inventory.
            for h in hdrs:
                if not isinstance(h, str):
                    continue
                headers.add(h)
            # collect small value samples by column index
            # (only the first 5 rows are sampled; surplus columns fall back to col_<i>)
            for row in rows[:5]:
                if not isinstance(row, list):
                    continue
                for i, value in enumerate(row):
                    headerName = hdrs[i] if i < len(hdrs) else f"col_{i}"
                    if headerName not in samples:
                        samples[headerName] = []
                    if len(samples[headerName]) < 5:
                        samples[headerName].append(str(value))

        return {
            "tableHeaders": sorted(list(headers)),
            "headerSamples": samples,
        }

    async def requestHeaderMapping(self, inventory: Dict[str, Any], cacheKey: str, canonicalSpec: Dict[str, Any] | None = None, mergePrompt: str | None = None) -> Dict[str, Any]:
        """
        Ask the AI for a discovered-header -> canonical-header mapping and a
        normalization policy, returned as a dict with keys "mapping",
        "normalizationPolicy" and "canonicalHeaders".

        Args:
            inventory: Output of discoverStructures().
            cacheKey: Currently unused — presumably reserved for response caching;
                TODO confirm intended use.
            canonicalSpec: Optional target schema; defaults to the discovered headers.
            mergePrompt: Optional user merge prompt passed to the AI as context.

        Returns:
            Mapping spec dict; empty mapping/policy on any AI or parse failure.
        """

        # Allow caller to specify any canonical schema. If none provided, default to discovered headers.
        if canonicalSpec is None:
            canonicalSpec = {
                "canonicalHeaders": inventory.get("tableHeaders", []),
                "constraints": {}
            }

        # Protect merge prompt context by wrapping in single quotes and escaping internal quotes
        protectedMerge = None
        if mergePrompt:
            try:
                protectedMerge = str(mergePrompt).replace("'", "\\'")
            except Exception:
                protectedMerge = str(mergePrompt)

        prompt = (
            "You are a mapping generator. Return ONLY JSON.\n\n"
            "Given discovered headers and sample values, map them to the canonical headers.\n"
            "Do not invent fields. Use null if no mapping. Provide normalization policy.\n\n"
            f"CANONICAL_SPEC:\n{json.dumps(canonicalSpec, ensure_ascii=False, indent=2)}\n\n"
            f"HEADERS_DISCOVERED:\n{json.dumps(inventory, ensure_ascii=False, indent=2)}\n\n"
            + (f"MERGE_PROMPT_CONTEXT (protected):\n'{protectedMerge}'\n\n" if protectedMerge is not None else "")
            + "REPLY JSON SHAPE:\n(Example)\n"
            "{\n \"mappings\": {\"\": \"|null\"},\n"
            " \"normalizationPolicy\": {\n \"TotalAmount\": {\"decimalSeparator\": \",\"|\".\"},\n"
            " \"Currency\": {\"stripSymbols\": true},\n"
            " \"Date\": {\"formats\": [\"DD.MM.YYYY\",\"YYYY-MM-DD\"]}\n }\n}\n"
        )

        response = await self.services.ai.callAi(prompt=prompt)
        if not response:
            return {"mapping": {}, "normalizationPolicy": {}}

        # Extract JSON from response more safely: take the outermost {...} span
        start_idx = response.find('{')
        end_idx = response.rfind('}')
        if start_idx == -1 or end_idx == -1 or start_idx >= end_idx:
            return {"mapping": {}, "normalizationPolicy": {}}

        js = response[start_idx:end_idx + 1]
        try:
            mapping = json.loads(js)
        except json.JSONDecodeError:
            return {"mapping": {}, "normalizationPolicy": {}}
        # Normalize key naming from AI: prefer single key "mapping"
        if "mapping" not in mapping and "mappings" in mapping and isinstance(mapping["mappings"], dict):
            mapping["mapping"] = mapping["mappings"]
            try:
                del mapping["mappings"]
            except Exception:
                pass
        # Ensure canonicalHeaders present in mapping for downstream use
        if "canonicalHeaders" not in mapping:
            mapping["canonicalHeaders"] = canonicalSpec.get("canonicalHeaders", [])

        # debug artifact
        self._writeDebugArtifact("mapping.json", mapping)
        return mapping

    def applyMapping(self, mergedJson: Dict[str, Any], mappingSpec: Dict[str, Any]) -> Dict[str, Any]:
        """
        Rewrite every table section of mergedJson into one canonical table section
        using the AI mapping spec, normalizing each cell via _normalizeValue.

        Args:
            mergedJson: Merged document in the canonical sections/elements shape.
            mappingSpec: Output of requestHeaderMapping() (may be None/empty).

        Returns:
            New canonical document with a single "canonical_table_1" table section.
        """
        mappings = (mappingSpec or {}).get("mapping", {})
        policy = (mappingSpec or {}).get("normalizationPolicy", {})

        # Prefer headers provided by mapping (generic across domains)
        canonicalHeaders = (mappingSpec or {}).get("canonicalHeaders") or []
        if not canonicalHeaders:
            # Fallback to union of mapped targets
            canonicalHeaders = sorted(list({t for t in mappings.values() if t}))

        rows: List[List[str]] = []
        sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
        for section in sections:
            # Use only the fundamental agreed JSON structure: content_type/elements
            if section.get("content_type") != "table":
                continue

            # Extract table data from elements array (first matching element wins)
            sourceHeaders = []
            sourceRows = []
            for element in section.get("elements", []):
                if isinstance(element, dict) and "headers" in element and "rows" in element:
                    sourceHeaders = element.get("headers") or []
                    sourceRows = element.get("rows") or []
                    break

            if not sourceHeaders or not sourceRows:
                continue

            # Build index map: canonical -> source index or None
            indexMap: Dict[str, int] = {}
            for ci, ch in enumerate(canonicalHeaders):
                srcIndex = None
                for si, sh in enumerate(sourceHeaders):
                    # Prefer explicit mapping target; fallback to identity when names match
                    target = mappings.get(sh)
                    if target is None and sh == ch:
                        target = ch
                    if target == ch:
                        srcIndex = si
                        break
                indexMap[ch] = srcIndex

            # Transform rows
            for r in sourceRows:
                canonicalRow: List[str] = []
                for ch in canonicalHeaders:
                    idx = indexMap.get(ch)
                    try:
                        value = r[idx] if (idx is not None and idx < len(r)) else ""
                    except (IndexError, KeyError) as e:
                        # Handle corrupted data gracefully
                        value = ""
                    canonicalRow.append(self._normalizeValue(ch, value, policy))
                # consider as row if at least one non-empty meaningful field
                if any(v.strip() for v in canonicalRow):
                    rows.append(canonicalRow)

        canonical = {
            "metadata": {
                "title": mergedJson.get("metadata", {}).get("title", "Merged Document"),
                "source_documents": mergedJson.get("metadata", {}).get("source_documents", [])
            },
            "sections": [
                {
                    "id": "canonical_table_1",
                    "content_type": "table",
                    "elements": [
                        {
                            "headers": canonicalHeaders,
                            "rows": rows
                        }
                    ],
                    "order": 1
                }
            ]
        }

        # debug artifact
        self._writeDebugArtifact("canonical_merged.json", canonical)
        return canonical

    def validateCanonical(self, canonicalJson: Dict[str, Any]) -> Dict[str, Any]:
        """
        Count rows across all table sections of the canonical document and report
        success when at least one row exists. Writes a debug artifact as a side effect.
        """
        rows = []
        try:
            sections = canonicalJson.get("sections", [])
            for s in sections:
                if s.get("content_type") == "table":
                    # Extract rows from elements array
                    for element in s.get("elements", []):
                        if isinstance(element, dict) and "rows" in element:
                            rows.extend(element.get("rows", []))
        except Exception:
            rows = []
        report = {
            "rowCount": len(rows),
            "success": len(rows) > 0
        }
        self._writeDebugArtifact("normalization_report.json", report)
        return report

    # Internal helpers
    def _normalizeValue(self, canonicalHeader: str, value: Any, policy: Dict[str, Any]) -> str:
        """
        Normalize one cell value as directed by the AI policy:
        numeric fields get decimal-separator conversion plus digit/sign filtering;
        configured text fields are stripped to uppercase letters. None becomes "".
        """
        if value is None:
            return ""
        text = str(value).strip()
        # Generic normalization guided by policy; avoid domain specifics
        if canonicalHeader in (policy.get("numericFields", []) or []):
            dec = ((policy.get(canonicalHeader) or {}).get("decimalSeparator")
                   or (policy.get("numeric") or {}).get("decimalSeparator")
                   or ".")
            if dec == ",":
                # assumes "." is a thousands separator when "," is the decimal mark — TODO confirm
                text = text.replace(".", "").replace(",", ".") if "," in text else text
            text = ''.join(ch for ch in text if ch.isdigit() or ch in ['.', '-', '+'])
        elif (policy.get("text") or {}).get("stripSymbols") and canonicalHeader in (policy.get("text", {}).get("applyTo", []) or []):
            text = ''.join(ch for ch in text if ch.isalpha())
            text = text.upper()
        return text

    def _writeDebugArtifact(self, fileName: str, obj: Any) -> None:
        """
        Best-effort dump of a debug artifact under ./test-chat/ai when
        APP_DEBUG_CHAT_WORKFLOW_ENABLED is set. Never raises.
        """
        try:
            debugEnabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if not debugEnabled:
                return
            root = "./test-chat/ai"
            os.makedirs(root, exist_ok=True)
            # Prefix timestamp for files that are frequently overwritten
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            if fileName in ("mapping.json", "canonical_merged.json"):
                outName = f"{ts}_{fileName}"
            else:
                outName = fileName
            path = os.path.join(root, outName)
            with open(path, "w", encoding="utf-8") as f:
                if isinstance(obj, (dict, list)):
                    f.write(json.dumps(obj, ensure_ascii=False, indent=2))
                else:
                    f.write(str(obj))
        except Exception:
            # Debug output must never break the normalization pipeline.
            pass
""" - self.serviceCenter = serviceCenter + self.services = serviceCenter self.access_token = None self.base_url = "https://graph.microsoft.com/v1.0" diff --git a/modules/services/serviceTicket/mainServiceTicket.py b/modules/services/serviceTicket/mainServiceTicket.py index c94ee1eb..3f1f982b 100644 --- a/modules/services/serviceTicket/mainServiceTicket.py +++ b/modules/services/serviceTicket/mainServiceTicket.py @@ -16,7 +16,7 @@ class TicketService: Args: serviceCenter: Service center instance for accessing other services """ - self.serviceCenter = serviceCenter + self.services = serviceCenter async def _createTicketInterfaceByType( self, diff --git a/modules/services/serviceUtils/mainServiceUtils.py b/modules/services/serviceUtils/mainServiceUtils.py index 90a18daa..8379382a 100644 --- a/modules/services/serviceUtils/mainServiceUtils.py +++ b/modules/services/serviceUtils/mainServiceUtils.py @@ -4,6 +4,7 @@ Provides centralized access to configuration, events, and other utilities. """ import logging +import os from typing import Any, Optional, Dict, Callable from modules.shared.configuration import APP_CONFIG from modules.shared.eventManagement import eventManager @@ -139,4 +140,43 @@ class UtilsService: return TokenManager().getFreshToken(connectionId) except Exception as e: logger.error(f"Error getting fresh token for connection {connectionId}: {str(e)}") - return None \ No newline at end of file + return None + + def debugLogToFile(self, message: str, context: str = "DEBUG"): + """ + Log debug message to file if debug logging is enabled. 
+ + Args: + message: Debug message to log + context: Context identifier for the debug message + """ + try: + # Check if debug logging is enabled + debug_enabled = self.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if not debug_enabled: + return + + # Get debug directory + debug_dir = self.configGet("APP_DEBUG_CHAT_WORKFLOW_DIR", "./test-chat") + if not os.path.isabs(debug_dir): + # If relative path, make it relative to the gateway directory + gateway_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + debug_dir = os.path.join(gateway_dir, debug_dir) + + # Ensure debug directory exists + os.makedirs(debug_dir, exist_ok=True) + + # Create debug file path + debug_file = os.path.join(debug_dir, "debug_workflow.log") + + # Format the debug entry + timestamp = self.getUtcTimestamp() + debug_entry = f"[{timestamp}] [{context}] {message}\n" + + # Write to debug file + with open(debug_file, "a", encoding="utf-8") as f: + f.write(debug_entry) + + except Exception as e: + # Don't log debug errors to avoid recursion + pass \ No newline at end of file diff --git a/modules/services/serviceWorkflow/mainServiceWorkflow.py b/modules/services/serviceWorkflow/mainServiceWorkflow.py index 180779b5..afc4e3b5 100644 --- a/modules/services/serviceWorkflow/mainServiceWorkflow.py +++ b/modules/services/serviceWorkflow/mainServiceWorkflow.py @@ -16,7 +16,7 @@ class WorkflowService: """Service class containing methods for document processing, chat operations, and workflow management""" def __init__(self, serviceCenter): - self.serviceCenter = serviceCenter + self.services = serviceCenter self.user = serviceCenter.user self.workflow = serviceCenter.workflow self.interfaceDbChat = serviceCenter.interfaceDbChat @@ -78,11 +78,15 @@ class WorkflowService: def getChatDocumentsFromDocumentList(self, documentList: List[str]) -> List[ChatDocument]: """Get ChatDocuments from a list of document references using all three formats.""" try: - # Get 
the current workflow from services (same pattern as setWorkflowContext) - workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow - if not workflow: - logger.error("No workflow available for document list resolution") - return [] + workflow = self.services.currentWorkflow + + # Reload workflow from database to ensure we have all messages + if hasattr(workflow, 'id'): + try: + workflow = self.getWorkflow(workflow.id) + logger.debug(f"Reloaded workflow {workflow.id} with {len(workflow.messages)} messages") + except Exception as e: + logger.warning(f"Could not reload workflow from database: {str(e)}") all_documents = [] for doc_ref in documentList: @@ -125,7 +129,9 @@ class WorkflowService: break if not message_found: - logger.warning(f"Message with ID {message_id} not found in workflow. Available message IDs: {[str(msg.id) for msg in workflow.messages]}") + available_ids = [str(msg.id) for msg in workflow.messages] + logger.error(f"Message with ID {message_id} not found in workflow. Available message IDs: {available_ids}") + raise ValueError(f"Document reference not found: docList:{message_id}:{label}") elif len(parts) >= 2: # Format: docList: