diff --git a/modules/routes/routeSecurityLocal.py b/modules/routes/routeSecurityLocal.py index fcc59d1b..f24aee78 100644 --- a/modules/routes/routeSecurityLocal.py +++ b/modules/routes/routeSecurityLocal.py @@ -263,8 +263,7 @@ async def read_user_me( @limiter.limit("60/minute") async def refresh_token( request: Request, - response: Response, - currentUser: User = Depends(getCurrentUser) + response: Response ) -> Dict[str, Any]: """Refresh access token using refresh token from cookie""" try: @@ -283,12 +282,27 @@ async def refresh_token( except jwt.JWTError: raise HTTPException(status_code=401, detail="Invalid refresh token") + # Get user information from refresh token payload + user_id = payload.get("userId") + if not user_id: + raise HTTPException(status_code=401, detail="Invalid refresh token - missing user ID") + + # Get user from database using the user ID from refresh token + try: + app_interface = getRootInterface() + current_user = app_interface.getUser(user_id) + if not current_user: + raise HTTPException(status_code=401, detail="User not found") + except Exception as e: + logger.error(f"Failed to get user from database: {str(e)}") + raise HTTPException(status_code=500, detail="Failed to validate user") + # Create new token data token_data = { - "sub": currentUser.username, - "mandateId": str(currentUser.mandateId), - "userId": str(currentUser.id), - "authenticationAuthority": currentUser.authenticationAuthority + "sub": current_user.username, + "mandateId": str(current_user.mandateId), + "userId": str(current_user.id), + "authenticationAuthority": current_user.authenticationAuthority } # Create new access token + set cookie diff --git a/modules/services/serviceAi/subCoreAi.py b/modules/services/serviceAi/subCoreAi.py index 52163432..3f245334 100644 --- a/modules/services/serviceAi/subCoreAi.py +++ b/modules/services/serviceAi/subCoreAi.py @@ -345,7 +345,7 @@ class SubCoreAi: options=options ) response = await self.aiObjects.call(request) - result = {"metadata": {"title": "AI Response"}, "sections": [{"id": "section_1", "type": "paragraph", "data": {"text": response.content}}]} + result = {"metadata": {"title": "AI Response"}, "sections": [{"id": "section_1", "content_type": "paragraph", "elements": [{"text": response.content}]}]} # Convert single-file result to multi-file format if needed if "sections" in result and "documents" not in result: diff --git a/modules/services/serviceAi/subDocumentGeneration.py b/modules/services/serviceAi/subDocumentGeneration.py index d9318f00..09fe524c 100644 --- a/modules/services/serviceAi/subDocumentGeneration.py +++ b/modules/services/serviceAi/subDocumentGeneration.py @@ -77,7 +77,8 @@ class SubDocumentGeneration: documents: Optional[List[ChatDocument]], options: AiCallOptions, outputFormat: str, - title: Optional[str] + title: Optional[str], + generationPrompt: Optional[str] = None ) -> Dict[str, Any]: """Handle single-file document generation (existing functionality).""" try: @@ -125,9 +126,72 @@ class SubDocumentGeneration: except Exception: parsedFilename = None - # Render the JSON content to the specified format + # Use AI generation to enhance the extracted JSON before rendering + enhancedContent = aiResponseJson # Default to original + if prompt: + try: + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + + # Get generation prompt + generationPrompt = await generation_service.getGenerationPrompt( + outputFormat=outputFormat, + userPrompt=prompt, + title=title, + aiService=self + ) + + # Prepare the AI call + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + # Create context with the extracted JSON content + import json + context = f"Extracted JSON content:\n{json.dumps(aiResponseJson, indent=2)}" + + request = AiCallRequest( + prompt=generationPrompt, + context=context, + options=request_options + ) + + # Call AI to enhance the content + response = await self.aiObjects.call(request) + + if response and response.content: + # Parse the AI response as JSON + try: + import re + result = response.content.strip() + + # Extract JSON from markdown if present + json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) + if json_match: + result = json_match.group(1).strip() + elif result.startswith('```json'): + result = re.sub(r'^```json\s*', '', result) + result = re.sub(r'\s*```$', '', result) + elif result.startswith('```'): + result = re.sub(r'^```\s*', '', result) + result = re.sub(r'\s*```$', '', result) + + # Try to parse JSON + enhancedContent = json.loads(result) + logger.info(f"AI enhanced JSON content successfully") + + except json.JSONDecodeError as e: + logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content") + enhancedContent = aiResponseJson + else: + logger.warning("AI generation returned empty response, using original content") + enhancedContent = aiResponseJson + + except Exception as e: + logger.warning(f"AI generation failed: {str(e)}, using original content") + enhancedContent = aiResponseJson + + # Render the enhanced JSON content renderedContent, mimeType = await generation_service.renderReport( - extractedContent=aiResponseJson, + extractedContent=enhancedContent, outputFormat=outputFormat, title=title, userPrompt=prompt, @@ -232,11 +296,8 @@ class SubDocumentGeneration: # Convert AI format to renderer format transformed_section = { "id": section.get("id", f"section_{len(transformed_sections) + 1}"), - "type": section.get("content_type", "paragraph"), - "data": { - "text": "", - "elements": section.get("elements", []) - }, + "content_type": section.get("content_type", "paragraph"), + "elements": section.get("elements", []), "order": section.get("order", len(transformed_sections) + 1) } @@ -246,7 +307,11 @@ class SubDocumentGeneration: for element in section.get("elements", []): if "text" in element: text_parts.append(element["text"]) - transformed_section["data"]["text"] = "\n".join(text_parts) + # Add text to the first element or create a new one + if transformed_section["elements"]: + transformed_section["elements"][0]["text"] = "\n".join(text_parts) + else: + transformed_section["elements"] = [{"text": "\n".join(text_parts)}] transformed_sections.append(transformed_section) @@ -264,8 +329,72 @@ class SubDocumentGeneration: "tags": ["multi_file", "ai_generated"] } + # Use AI generation to enhance the extracted JSON before rendering + enhancedContent = complete_document # Default to original + if prompt: + try: + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + + # Get generation prompt + generationPrompt = await generation_service.getGenerationPrompt( + outputFormat=outputFormat, + userPrompt=prompt, + title=doc_data["title"], + aiService=self + ) + + # Prepare the AI call + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + # Create context with the extracted JSON content + import json + context = f"Extracted JSON content:\n{json.dumps(complete_document, indent=2)}" + + request = AiCallRequest( + prompt=generationPrompt, + context=context, + options=request_options + ) + + # Call AI to enhance the content + response = await self.aiObjects.call(request) + + if response and response.content: + # Parse the AI response as JSON + try: + import re + result = response.content.strip() + + # Extract JSON from markdown if present + json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) + if json_match: + result = json_match.group(1).strip() + elif result.startswith('```json'): + result = re.sub(r'^```json\s*', '', result) + result = re.sub(r'\s*```$', '', result) + elif result.startswith('```'): + result = re.sub(r'^```\s*', '', result) + result = re.sub(r'\s*```$', '', result) + + # Try to parse JSON + enhancedContent = json.loads(result) + logger.info(f"AI enhanced JSON content successfully") + + except json.JSONDecodeError as e: + logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content") + enhancedContent = complete_document + else: + logger.warning("AI generation returned empty response, using original content") + enhancedContent = complete_document + + except Exception as e: + logger.warning(f"AI generation failed: {str(e)}, using original content") + enhancedContent = complete_document + + # Render the enhanced JSON content rendered_content, mime_type = await generation_service.renderReport( - extractedContent=complete_document, + extractedContent=enhancedContent, outputFormat=outputFormat, title=doc_data["title"], userPrompt=prompt, @@ -477,9 +606,7 @@ Return only the JSON response. """ try: services = self.services - workflow = getattr(services, 'currentWorkflow', None) - if not workflow: - return + workflow = services.currentWorkflow # Serialize payload import json as _json diff --git a/modules/services/serviceAi/subDocumentProcessing.py b/modules/services/serviceAi/subDocumentProcessing.py index de3a0f2f..a0dc5088 100644 --- a/modules/services/serviceAi/subDocumentProcessing.py +++ b/modules/services/serviceAi/subDocumentProcessing.py @@ -181,9 +181,8 @@ class SubDocumentProcessing: from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService normalizer = NormalizationService(self.services) inventory = normalizer.discoverStructures(mergedJsonDocument) - # Use workflow id if available as cache key, else default - cacheKey = getattr(self.services, 'currentWorkflow', None) - cacheKey = getattr(cacheKey, 'id', 'workflow_run') if cacheKey else 'workflow_run' + # Use workflow id as cache key + cacheKey = self.services.currentWorkflow.id # Provide the extraction/merge prompt context when available to help mapping mergePrompt = prompt mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt) @@ -476,8 +475,8 @@ class SubDocumentProcessing: "metadata": {"title": f"Image Analysis - Chunk {chunk_index}"}, "sections": [{ "id": f"image_section_{chunk_index}", - "type": "paragraph", - "data": {"text": fallback_content} + "content_type": "paragraph", + "elements": [{"text": fallback_content}] }] }) self.services.utils.debugLogToFile(f"Created fallback JSON for image chunk {chunk_index} with actual content", "AI_SERVICE") @@ -583,8 +582,8 @@ class SubDocumentProcessing: "metadata": {"title": f"Document Analysis - Chunk {chunk_index}"}, "sections": [{ "id": f"analysis_section_{chunk_index}", - "type": "paragraph", - "data": {"text": fallback_content} + "content_type": "paragraph", + "elements": [{"text": fallback_content}] }] }) self.services.utils.debugLogToFile(f"Created fallback JSON for container chunk {chunk_index} with actual content", "AI_SERVICE") @@ -676,8 +675,8 @@ class SubDocumentProcessing: "metadata": {"title": "Error Section"}, "sections": [{ "id": f"error_section_{chunk_index}", - "type": "paragraph", - "data": {"text": f"Error parsing JSON: {str(e)}"} + "content_type": "paragraph", + "elements": [{"text": f"Error parsing JSON: {str(e)}"}] }] }) diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py index 340cb8ce..8ed6423b 100644 --- a/modules/services/serviceGeneration/mainServiceGeneration.py +++ b/modules/services/serviceGeneration/mainServiceGeneration.py @@ -1,5 +1,6 @@ import logging import uuid +import json from typing import Any, Dict, List, Optional, Union, Tuple from datetime import datetime, UTC import re @@ -339,24 +340,8 @@ class GenerationService: if not renderer: raise ValueError(f"Unsupported output format: {outputFormat}") - # Generate AI-based generation prompt if AI service is available - generationPrompt = userPrompt # Default to user prompt - if aiService and userPrompt: - try: - from .subPromptBuilder import buildGenerationPrompt - generationPrompt = await buildGenerationPrompt( - outputFormat=outputFormat, - userPrompt=userPrompt, - title=title, - aiService=aiService, - services=self.services - ) - except Exception as e: - logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt") - generationPrompt = userPrompt - - # Render the JSON content with AI-generated prompt - renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt, aiService) + # Render the JSON content directly (AI generation handled by main service) + renderedContent, mimeType = await renderer.render(extractedContent, title, userPrompt, aiService) # DEBUG: dump rendered output try: import os @@ -391,6 +376,23 @@ class GenerationService: services=self.services ) + async def getGenerationPrompt( + self, + outputFormat: str, + userPrompt: str, + title: str, + aiService=None + ) -> str: + """Get generation prompt for enhancing extracted JSON content.""" + from .subPromptBuilder import buildGenerationPrompt + return await buildGenerationPrompt( + outputFormat=outputFormat, + userPrompt=userPrompt, + title=title, + aiService=aiService, + services=self.services + ) + async def getGenericExtractionPrompt( self, outputFormat: str, diff --git a/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py b/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py index b8158201..150a903b 100644 --- a/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py +++ b/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py @@ -81,11 +81,11 @@ class BaseRenderer(ABC): if not isinstance(sections, list): return False - # Validate each section has type and data + # Validate each section has content_type and elements for section in sections: if not isinstance(section, dict): return False - if "type" not in section or "data" not in section: + if "content_type" not in section or "elements" not in section: return False return True @@ -159,7 +159,7 @@ class BaseRenderer(ABC): # Base implementation returns a simple dict # Format-specific renderers should override this method return { - "type": "image", + "content_type": "image", "base64Data": base64_data, "altText": alt_text, "width": section_data.get("width", None), @@ -259,25 +259,25 @@ class BaseRenderer(ABC): if section_type == "table": headers, rows = self._extract_table_data(section_data) - return {"type": "table", "headers": headers, "rows": rows} + return {"content_type": "table", "headers": headers, "rows": rows} elif section_type == "bullet_list": items = self._extract_bullet_list_items(section_data) - return {"type": "bullet_list", "items": items} + return {"content_type": "bullet_list", "items": items} elif section_type == "heading": level, text = self._extract_heading_data(section_data) - return {"type": "heading", "level": level, "text": text} + return {"content_type": "heading", "level": level, "text": text} elif section_type == "paragraph": text = self._extract_paragraph_text(section_data) - return {"type": "paragraph", "text": text} + return {"content_type": "paragraph", "text": text} elif section_type == "code_block": code, language = self._extract_code_block_data(section_data) - return {"type": "code_block", "code": code, "language": language} + return {"content_type": "code_block", "code": code, "language": language} elif section_type == "image": base64_data, alt_text = self._extract_image_data(section_data) # Validate image data if self._validate_image_data(base64_data, alt_text): return { - "type": "image", + "content_type": "image", "base64Data": base64_data, "altText": alt_text, "width": section_data.get("width"), @@ -286,11 +286,11 @@ class BaseRenderer(ABC): } else: # Return placeholder if image data is invalid - return {"type": "paragraph", "text": f"[Image: {alt_text}]"} + return {"content_type": "paragraph", "text": f"[Image: {alt_text}]"} else: # Fallback to paragraph text = self._extract_paragraph_text(section_data) - return {"type": "paragraph", "text": text} + return {"content_type": "paragraph", "text": text} def _format_timestamp(self, timestamp: str = None) -> str: """Format timestamp for display.""" diff --git a/modules/services/serviceGeneration/renderers/rendererJson.py b/modules/services/serviceGeneration/renderers/rendererJson.py index 17555b6f..2ff07ad6 100644 --- a/modules/services/serviceGeneration/renderers/rendererJson.py +++ b/modules/services/serviceGeneration/renderers/rendererJson.py @@ -38,7 +38,7 @@ class RendererJson(BaseRenderer): # Return minimal JSON fallback fallback_data = { "title": title, - "sections": [{"type": "paragraph", "data": {"text": f"Error rendering report: {str(e)}"}}], + "sections": [{"content_type": "paragraph", "elements": [{"text": f"Error rendering report: {str(e)}"}]}], "metadata": {"error": str(e)} } return json.dumps(fallback_data, indent=2), "application/json" @@ -54,7 +54,7 @@ class RendererJson(BaseRenderer): if "sections" not in content: # Convert old format to new format content = { - "sections": [{"type": "paragraph", "data": {"text": str(content)}}], + "sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}], "metadata": {"title": title} } @@ -73,7 +73,7 @@ class RendererJson(BaseRenderer): self.logger.warning(f"Error cleaning JSON content: {str(e)}") # Return minimal valid JSON fallback_data = { - "sections": [{"type": "paragraph", "data": {"text": str(content)}}], + "sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}], "metadata": {"title": title, "error": str(e)} } return json.dumps(fallback_data, indent=2, ensure_ascii=False) diff --git a/modules/services/serviceGeneration/renderers/rendererXlsx.py b/modules/services/serviceGeneration/renderers/rendererXlsx.py index ed11dd92..ddd6e9f3 100644 --- a/modules/services/serviceGeneration/renderers/rendererXlsx.py +++ b/modules/services/serviceGeneration/renderers/rendererXlsx.py @@ -442,7 +442,7 @@ class RendererXlsx(BaseRenderer): sheet_names = [] # Check if we have multiple table sections - table_sections = [s for s in sections if s.get("type") == "table"] + table_sections = [s for s in sections if s.get("content_type") == "table"] if len(table_sections) > 1: # Create separate sheets for each table @@ -480,7 +480,7 @@ class RendererXlsx(BaseRenderer): return sections = json_content.get("sections", []) - table_sections = [s for s in sections if s.get("type") == "table"] + table_sections = [s for s in sections if s.get("content_type") == "table"] if len(table_sections) > 1: # Multiple tables - populate each sheet with its corresponding table @@ -509,10 +509,15 @@ class RendererXlsx(BaseRenderer): sheet['A1'].font = Font(size=16, bold=True, color=self._get_safe_color(styles.get("title", {}).get("color", "FF1F4E79"))) sheet['A1'].alignment = Alignment(horizontal="center") - # Get table data - table_data = section.get("data", {}) - headers = table_data.get("headers", []) - rows = table_data.get("rows", []) + # Get table data from elements (canonical JSON format) + elements = section.get("elements", []) + if elements and isinstance(elements, list) and len(elements) > 0: + table_data = elements[0] + headers = table_data.get("headers", []) + rows = table_data.get("rows", []) + else: + headers = [] + rows = [] if not headers and not rows: sheet['A3'] = "No table data available" @@ -683,9 +688,9 @@ class RendererXlsx(BaseRenderer): def _add_table_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int: """Add a table element to Excel sheet.""" try: - table_data = element.get("data", {}) - headers = table_data.get("headers", []) - rows = table_data.get("rows", []) + # In canonical JSON format, table elements have headers and rows directly + headers = element.get("headers", []) + rows = element.get("rows", []) if not headers and not rows: return start_row @@ -697,7 +702,7 @@ class RendererXlsx(BaseRenderer): if header_style.get("bold"): cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000"))) if header_style.get("background"): - cell.fill = PatternFill(start_color=header_style["background"], end_color=header_style["background"], fill_type="solid") + cell.fill = PatternFill(start_color=self._get_safe_color(header_style["background"]), end_color=self._get_safe_color(header_style["background"]), fill_type="solid") start_row += 1 diff --git a/modules/services/serviceGeneration/subPromptBuilder.py b/modules/services/serviceGeneration/subPromptBuilder.py index e0faa029..c0139f45 100644 --- a/modules/services/serviceGeneration/subPromptBuilder.py +++ b/modules/services/serviceGeneration/subPromptBuilder.py @@ -1,21 +1,21 @@ """ -Centralized prompt builder for document generation across formats. - -Builds a robust prompt that: -- Accepts any user intent (no fixed structure assumptions) -- Injects format-specific guidelines from the selected renderer -- Adds a common policy section to always use real data from source docs -- Requires the AI to output a filename header that we can parse and use +Prompt builder for AI document generation and extraction. +This module builds prompts for AI services to extract and generate documents. """ import json -from typing import Protocol, Dict, Any +import logging +from typing import Dict, Any, Optional, List, TYPE_CHECKING +from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType +# Type hint for renderer parameter +if TYPE_CHECKING: + from .renderers.rendererBaseTemplate import BaseRenderer + _RendererLike = BaseRenderer +else: + _RendererLike = Any -class _RendererLike(Protocol): - def getExtractionPrompt(self, user_prompt: str, title: str) -> str: # returns only format-specific guidelines - ... - +logger = logging.getLogger(__name__) async def buildAdaptiveExtractionPrompt( outputFormat: str, @@ -25,57 +25,65 @@ async def buildAdaptiveExtractionPrompt( aiService=None, services=None ) -> str: - """Build adaptive extraction prompt based on AI analysis.""" + """ + Build adaptive extraction prompt based on AI analysis. + Uses multi-file or single-file approach based on analysis. + """ - # Get appropriate JSON schema based on analysis - from .subJsonSchema import get_adaptive_json_schema - json_schema = get_adaptive_json_schema(promptAnalysis) + # Multi-file example data instead of schema + multi_file_example = { + "metadata": { + "title": "Multi-Document Example", + "splitStrategy": "by_section", + "source_documents": ["doc_001"], + "extraction_method": "ai_extraction" + }, + "documents": [ + { + "id": "doc_section_1", + "title": "Section 1 Title", + "filename": "section_1.xlsx", + "sections": [ + { + "id": "table_1", + "content_type": "table", + "elements": [ + { + "headers": ["Column 1", "Column 2"], + "rows": [["Value 1", "Value 2"]] + } + ], + "order": 1 + } + ] + } + ] + } + + # Single-file example data instead of schema + single_file_example = { + "metadata": { + "title": "Single Document Example", + "source_documents": ["doc_001"], + "extraction_method": "ai_extraction" + }, + "sections": [ + { + "id": "table_1", + "content_type": "table", + "elements": [ + { + "headers": ["Column 1", "Column 2"], + "rows": [["Value 1", "Value 2"]] + } + ], + "order": 1 + } + ] + } if promptAnalysis.get("is_multi_file", False): - schema_type = "multi-document" - else: - schema_type = "single-document" - - # Build adaptive prompt using AI analysis - match single-file style - if promptAnalysis.get("is_multi_file", False): - # Multi-file prompt - use simple example format like single-file - multi_file_example = { - "metadata": { - "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE", - "splitStrategy": "by_section" - }, - "documents": [ - { - "id": "doc_1", - "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE", - "filename": "REPLACE_WITH_ACTUAL_FILENAME", - "sections": [ - { - "id": "section_1", - "content_type": "heading", - "elements": [ - { - "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT", - "level": 1 - } - ], - "order": 1 - }, - { - "id": "section_2", - "content_type": "paragraph", - "elements": [ - { - "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT" - } - ], - "order": 2 - } - ] - } - ] - } - + # Multi-file prompt adaptive_prompt = f""" {userPrompt} @@ -134,16 +142,31 @@ Return only the JSON structure with actual data from the documents. Do not inclu Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. """.strip() else: - # Single-file prompt - use original style + # Single-file prompt - use example data instead of schema adaptive_prompt = f""" {userPrompt} -You are extracting structured content from documents and must respond with valid JSON only. +You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. -IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure. +TASK: Extract the actual content from the document and organize it into structured sections. -Extract the actual data from the source documents and structure it as JSON with this format: -{json.dumps(json_schema, indent=2)} +REQUIREMENTS: +1. Analyze the document content provided in the context below +2. Extract all content and organize it into logical sections +3. Create structured JSON with sections containing the extracted content +4. Preserve the original structure and data + +OUTPUT FORMAT: Return only valid JSON in this exact structure: +{json.dumps(single_file_example, indent=2)} + +INSTRUCTIONS: +- Replace example data with actual content from the document +- Use actual headings, paragraphs, and text from the document +- Ensure all content is properly structured +- Do not use generic placeholder text +- Extract real content from the documents + +CONTEXT (Document Content): Content Types to Extract: 1. Tables: Extract all rows and columns with proper headers @@ -220,22 +243,53 @@ Consider the user's intent and the most logical way to organize the extracted co services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER") # Fallback to single-file prompt - from .subJsonSchema import get_document_subJsonSchema - json_schema = get_document_subJsonSchema() + example_data = { + "metadata": { + "title": "Example Document", + "author": "AI Assistant", + "source_documents": ["document_001"], + "extraction_method": "ai_extraction" + }, + "sections": [ + { + "id": "section_001", + "content_type": "table", + "elements": [ + { + "headers": ["Column 1", "Column 2", "Column 3"], + "rows": [ + ["Value 1", "Value 2", "Value 3"], + ["Value 4", "Value 5", "Value 6"] + ] + } + ], + "order": 1, + "metadata": {} + } + ], + "summary": "", + "tags": [] + } return f""" {userPrompt} -You are extracting structured content from documents and must respond with valid JSON only. +You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. -CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting. +TASK: Extract the actual content from the document and organize it into structured sections. -Extract the actual data from the source documents and structure it as JSON with this format: -{json.dumps(json_schema, indent=2)} +REQUIREMENTS: +1. Analyze the document content provided in the context below +2. Extract all content and organize it into logical sections +3. Create structured JSON with sections containing the extracted content +4. Preserve the original structure and data + +OUTPUT FORMAT: Return only valid JSON in this exact structure: +{json.dumps(example_data, indent=2)} Requirements: - Preserve all original data - do not summarize or interpret -- Use the exact JSON schema provided +- Use the exact JSON format shown above - Maintain data integrity and structure Content Types to Extract: @@ -286,16 +340,55 @@ async def buildExtractionPrompt( from .subJsonSchema import get_document_subJsonSchema jsonSchema = get_document_subJsonSchema() - # Generic block for JSON extraction - use proper schema instead of hardcoded template + # Generic block for JSON extraction - use example data instead of schema + example_data = { + "metadata": { + "title": "Example Document", + "author": "AI Assistant", + "source_documents": ["document_001"], + "extraction_method": "ai_extraction" + }, + "sections": [ + { + "id": "section_001", + "content_type": "table", + "elements": [ + { + "headers": ["Column 1", "Column 2", "Column 3"], + "rows": [ + ["Value 1", "Value 2", "Value 3"], + ["Value 4", "Value 5", "Value 6"] + ] + } + ], + "order": 1, + "metadata": {} + } + ], + "summary": "", + "tags": [] + } + genericIntro = f""" {extractionIntent} -You are extracting structured content from documents and must respond with valid JSON only. +You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. -CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting. +TASK: Extract the actual content from the document and organize it into structured sections. -Extract the actual data from the source documents and structure it as JSON with this format: -{json.dumps(jsonSchema, indent=2)} +REQUIREMENTS: +1. Analyze the document content provided in the context below +2. Extract all content and organize it into logical sections +3. Create structured JSON with sections containing the extracted content +4. Preserve the original structure and data + +OUTPUT FORMAT: Return only valid JSON in this exact structure: +{json.dumps(example_data, indent=2)} + +Requirements: +- Preserve all original data - do not summarize or interpret +- Use the exact JSON format shown above +- Maintain data integrity and structure Content Types to Extract: 1. Tables: Extract all rows and columns with proper headers @@ -317,15 +410,20 @@ Return only the JSON structure with actual data from the documents. Do not inclu Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. DO NOT return a schema description - return actual extracted content in the JSON format shown above. -""".strip() +""" - # Final assembly - finalPrompt = genericIntro + # Get format-specific guidelines from renderer + formatGuidelines = "" + try: + if hasattr(renderer, 'getExtractionGuidelines'): + formatGuidelines = renderer.getExtractionGuidelines() + except Exception: + pass + + # Combine all parts + finalPrompt = f"{genericIntro}\n\n{formatGuidelines}".strip() - # Debug output - services.utils.debugLogToFile(f"EXTRACTION INTENT: Processed", "PROMPT_BUILDER") - - # Save full extraction prompt to debug file - only if debug enabled + # Save extraction prompt to debug file - only if debug enabled try: debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if debug_enabled: @@ -335,8 +433,7 @@ DO NOT return a schema description - return actual extracted content in the JSON debug_root = "./test-chat/ai" os.makedirs(debug_root, exist_ok=True) with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f: - f.write(f"EXTRACTION PROMPT:\n{finalPrompt}\n\n") - f.write(f"EXTRACTION INTENT:\n{extractionIntent}\n") + f.write(finalPrompt) except Exception: pass @@ -367,24 +464,46 @@ async def buildGenerationPrompt( # AI call to generate the appropriate generation prompt generationPromptRequest = f""" -Based on this user request, create a detailed generation prompt for creating a {outputFormat} document. +You are creating instructions for an AI to generate JSON content in the CANONICAL FORMAT that will be converted to a {outputFormat} document. User request: "{safeUserPrompt}" Document title: "{title}" -Output format: {outputFormat} +Target format: {outputFormat} -Create a generation prompt that: -1. Identifies what content is most important for the user -2. Specifies how to structure and organize the content -3. Includes any specific formatting or presentation requirements -4. Preserves any language requirements -5. Ensures the document meets the user's needs +Write clear, detailed instructions that tell the AI how to generate JSON content using the CANONICAL JSON FORMAT. Focus on: -IMPORTANT: Always generate content in STANDARDIZED JSON FORMAT. In your response, include the exact text "PLACEHOLDER_FOR_FORMAT_RULES" where specific format rules will be inserted afterwards automatically. +1. What content is most important for the user +2. How to structure and organize the content using the canonical JSON format with 'sections' +3. Specific formatting requirements for the target format +4. Language requirements to preserve +5. How to ensure the JSON content meets the user's needs -CRITICAL: You MUST start your response with exactly "Generate a {outputFormat} document that:" - do NOT use "docx" or any other format. Use the exact format specified: {outputFormat} +CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this exact structure: +{{ + "metadata": {{ + "title": "Document Title" + }}, + "sections": [ + {{ + "id": "section_1", + "content_type": "table", + "elements": [ + {{ + "headers": ["Column1", "Column2", "Column3"], + "rows": [ + ["Value1", "Value2", "Value3"], + ["Value4", "Value5", "Value6"] + ] + }} + ], + "order": 1 + }} + ] +}} -Return only the generation prompt, starting with "Generate a {outputFormat} document that..." +The AI should NOT create format-specific structures like "sheets" or "columns" - only use the canonical format with "sections" and "elements". + +Write the instructions as plain text, not JSON. Start with "Generate JSON content that..." and provide clear, actionable instructions for creating structured JSON data in the canonical format. """ # Call AI service to generate the prompt @@ -423,7 +542,7 @@ Return only the generation prompt, starting with "Generate a {outputFormat} docu except Exception: pass - return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}" + return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content." except Exception as e: # Fallback on any error - preserve user prompt for language instructions @@ -433,105 +552,104 @@ Return only the generation prompt, starting with "Generate a {outputFormat} docu def _getFormatRules(outputFormat: str) -> str: """ - Get format-specific rules for JSON-based generation. - Since we now use standardized JSON, all formats follow the same rules. + Get format-specific rules for the generation prompt. """ - return """ -- Generate content in standardized JSON format following the document schema -- Tables: Use JSON table format with headers and rows arrays -- Lists: Use JSON list format with items array -- Text: Use JSON paragraph format with text field -- Headings: Use JSON heading format with level field -- Structure: Follow the document JSON schema exactly -""".strip() + format_rules = { + "xlsx": """ +XLSX Format Rules: +- Create tables with clear headers and organized data +- Use appropriate column widths and formatting +- Include summary information if relevant +- Ensure data is properly structured for spreadsheet analysis +""", + "pdf": """ +PDF Format Rules: +- Create professional document layout +- Use appropriate headings and sections +- Include proper spacing and formatting +- Ensure content is well-organized and readable +""", + "docx": """ +DOCX Format Rules: +- Create professional document layout +- Use appropriate headings and sections +- Include proper spacing and formatting +- Ensure content is well-organized and readable +""", + "html": """ +HTML Format Rules: +- Create clean, semantic HTML structure +- Use appropriate tags for content organization +- Include proper styling classes +- Ensure content is accessible and well-formatted +""", + "json": """ +JSON Format Rules: +- Create well-structured JSON data +- Use appropriate nesting and organization +- Include metadata and context information +- Ensure data is properly formatted and valid +""", + "csv": """ +CSV Format Rules: +- Create clear, organized tabular data +- Use appropriate headers and data types +- Ensure proper CSV formatting +- Include all relevant data in structured format +""", + "txt": """ +TXT Format Rules: +- Create clean, readable text format +- Use appropriate spacing and organization +- Include clear headings and sections +- Ensure content is well-structured and easy to read +""" + } + + return format_rules.get(outputFormat.lower(), f""" +{outputFormat.upper()} Format Rules: +- Create well-structured content appropriate for {outputFormat} +- Use appropriate formatting and organization +- Ensure content is clear and professional +- Include all relevant information in proper format +""") async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str: """ - Use AI to extract a rich, structured extraction intent from the user prompt. - Include language, normalization, structure needs, headers, formats, row strategy, and multi-file guidance. + Parse user prompt to extract the core extraction intent. """ if not aiService: - # Fallback if no AI service available - return "Extract all relevant content from the document according to the user's requirements" + return f"Extract content from the provided documents and create a {outputFormat} report." try: - # Protect userPrompt from injection by escaping quotes and newlines - safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ') - - # Rich analysis to derive a complete extraction intent and structure guidance - extractionPrompt = f""" -Analyze the user's request and produce a RICH extraction intent. Return ONLY JSON. + analysis_prompt = f""" +Analyze this user request and extract the core extraction intent: -Goals: -- Detect language and normalize the request into a full, explicit instruction (no summary; preserve all constraints and details). -- Decide if structured data is required; if so, define the target structure precisely (headers, order, formats, row strategy). -- Identify if multi-file output is appropriate and how to split/files name. +User request: "{userPrompt}" +Target format: {outputFormat} -User request: "{safeUserPrompt}" +Extract the main intent and requirements for document processing. Focus on: +1. What content needs to be extracted +2. How it should be organized +3. Any specific requirements or preferences -Return JSON in this exact shape: -{{ - "detectedLanguage": "de|en|fr|it|...", - "normalizedRequest": "Full explicit instruction in detected language", - "requiresStructuredData": true|false, - "targetStructure": "table|list|mixed|unstructured", - "table": {{ - "headers": ["Header1", "Header2", "..."], - "headerOrderStrict": true|false, - "rowStrategy": "one_row_per_document|one_row_per_entity|one_row_per_vat_rate|custom", - "formats": {{ - "dateFormat": "DD.MM.YYYY|YYYY-MM-DD|...", - "amountDecimals": 2, - "currencyFormat": "code|symbol", - "idMasking": "none|last4|custom" - }} - }}, - "multiFile": true|false, - "fileSplitStrategy": "single|per_entity|by_section|by_criteria|custom", - "fileNamingPattern": "suggested pattern for filenames", - "constraints": ["List of critical constraints to enforce"], - "reasoning": "Brief justification (one sentence)" -}} - -Rules: -- Preserve user terminology and language in normalizedRequest. -- If the user listed columns/fields, copy them exactly into table.headers and set headerOrderStrict=true. -- If the user implies separate rows for rates/entities, set an appropriate rowStrategy (e.g., one_row_per_vat_rate). -- If no structure is required, set requiresStructuredData=false and targetStructure="unstructured". +Respond with a clear, concise statement of the extraction intent. """ - # Call AI service to extract intention - services.utils.debugLogToFile("DEBUG: Calling AI for extraction intent...", "PROMPT_BUILDER") - - # Import and set proper options for AI call from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType request_options = AiCallOptions() request_options.operationType = OperationType.GENERAL - request = AiCallRequest(prompt=extractionPrompt, context="", options=request_options) + request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options) response = await aiService.aiObjects.call(request) - result = response.content if response else "" - services.utils.debugLogToFile(f"DEBUG: Extraction intent processed", "PROMPT_BUILDER") - - # Try to extract and pretty print JSON - if result: - import re, json as _json - match = re.search(r'\{[\s\S]*\}', result) - if match: - try: - obj = _json.loads(match.group(0)) - return _json.dumps(obj, ensure_ascii=False, indent=2) - except Exception: - pass - - # Fallback to previous simple format - return f"Extract: {safeUserPrompt}" + if response and response.content: + return response.content.strip() + else: + return f"Extract content from the provided documents and create a {outputFormat} report." + except Exception as e: - # Fallback on any error - preserve user prompt for language instructions - services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER") - return f"Extract: {userPrompt}" - - + services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER") + return f"Extract content from the provided documents and create a {outputFormat} report." diff --git a/modules/services/serviceNormalization/mainServiceNormalization.py b/modules/services/serviceNormalization/mainServiceNormalization.py index deb93351..763feaef 100644 --- a/modules/services/serviceNormalization/mainServiceNormalization.py +++ b/modules/services/serviceNormalization/mainServiceNormalization.py @@ -28,13 +28,17 @@ class NormalizationService: continue # Extract table data from elements array + hdrs = [] + rows = [] for element in section.get("elements", []): if isinstance(element, dict) and "headers" in element and "rows" in element: hdrs = element.get("headers") or [] rows = element.get("rows") or [] break - else: + + if not hdrs or not rows: continue + for h in hdrs: if not isinstance(h, str): continue @@ -122,13 +126,14 @@ class NormalizationService: continue # Extract table data from elements array + sourceHeaders = [] + sourceRows = [] for element in section.get("elements", []): if isinstance(element, dict) and "headers" in element and "rows" in element: sourceHeaders = element.get("headers") or [] sourceRows = element.get("rows") or [] break - else: - continue + if not sourceHeaders or not sourceRows: continue diff --git a/modules/services/serviceWorkflow/mainServiceWorkflow.py b/modules/services/serviceWorkflow/mainServiceWorkflow.py index 628eb1d2..d5f71cfd 100644 --- a/modules/services/serviceWorkflow/mainServiceWorkflow.py +++ b/modules/services/serviceWorkflow/mainServiceWorkflow.py @@ -78,11 +78,15 @@ class WorkflowService: def getChatDocumentsFromDocumentList(self, documentList: List[str]) -> List[ChatDocument]: """Get ChatDocuments from a list of document references using all three formats.""" try: - # Get the current workflow from services (same pattern as setWorkflowContext) - workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow - if not workflow: - logger.error("No workflow available for document list resolution") - return [] + workflow = self.services.currentWorkflow + + # Reload workflow from database to ensure we have all messages + if hasattr(workflow, 'id'): + try: + workflow = self.getWorkflow(workflow.id) + logger.debug(f"Reloaded workflow {workflow.id} with {len(workflow.messages)} messages") + except Exception as e: + logger.warning(f"Could not reload workflow from database: {str(e)}") all_documents = [] for doc_ref in documentList: @@ -418,11 +422,7 @@ class WorkflowService: def setWorkflowContext(self, round_number: int = None, task_number: int = None, action_number: int = None): """Set current workflow context for document generation and routing""" try: - # Get the current workflow from services - workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow - if not workflow: - logger.error("No workflow available for context setting") - return + workflow = self.services.currentWorkflow # Prepare update data update_data = {} @@ -529,10 +529,7 @@ class WorkflowService: def getDocumentCount(self) -> str: """Get document count for task planning (matching old handlingTasks.py logic)""" try: - # Get the current workflow from services - workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow - if not workflow: - return "No documents available" + workflow = self.services.currentWorkflow # Count documents from all messages in the workflow (like old system) total_docs = 0 @@ -551,10 +548,7 @@ class WorkflowService: def getWorkflowHistoryContext(self) -> str: """Get workflow history context for task planning (matching old handlingTasks.py logic)""" try: - # Get the current workflow from services - workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow - if not workflow: - return "No previous round context available" + workflow = self.services.currentWorkflow # Check if there are any previous rounds by looking for "first" messages has_previous_rounds = False diff --git a/modules/workflows/processing/modes/modeReact.py b/modules/workflows/processing/modes/modeReact.py index f9bf5f50..606d1123 100644 --- a/modules/workflows/processing/modes/modeReact.py +++ b/modules/workflows/processing/modes/modeReact.py @@ -226,7 +226,7 @@ class ReactMode(BaseMode): # Get available documents from the current workflow try: - available_docs = self.services.workflow.getAvailableDocuments(context.workflow) + available_docs = self.services.workflow.getAvailableDocuments(self.services.currentWorkflow) if not available_docs or available_docs == "No documents available": logger.warning("No documents available for validation") return diff --git a/modules/workflows/processing/shared/placeholderFactory.py b/modules/workflows/processing/shared/placeholderFactory.py index 75e143f7..e45e560c 100644 --- a/modules/workflows/processing/shared/placeholderFactory.py +++ b/modules/workflows/processing/shared/placeholderFactory.py @@ -68,20 +68,12 @@ def extractWorkflowHistory(service: Any, context: Any) -> str: """Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}} Reverse-chronological, enriched with message summaries and document labels. """ - # Prefer explicit workflow on context; else fall back to services.workflow - workflow = None try: - if hasattr(context, 'workflow') and context.workflow: - workflow = context.workflow - elif hasattr(service, 'workflow') and service.workflow: - workflow = service.workflow - except Exception: - workflow = None - - if workflow: - history = getPreviousRoundContext(service, workflow) + history = getPreviousRoundContext(service, service.currentWorkflow) return history or "No previous workflow rounds available" - return "No previous workflow rounds available" + except Exception as e: + logger.error(f"Error getting workflow history: {str(e)}") + return "No previous workflow rounds available" def extractAvailableMethods(service: Any) -> str: """Extract available methods for action planning. Maps to {{KEY:AVAILABLE_METHODS}}""" @@ -390,7 +382,7 @@ def extractLatestRefinementFeedback(context: Any) -> str: def extractAvailableDocumentsSummary(service: Any, context: Any) -> str: """Summary of available documents (count only).""" try: - documents = service.workflow.getAvailableDocuments(context.workflow) + documents = service.workflow.getAvailableDocuments(service.currentWorkflow) if documents and documents != "No documents available": # Count only actual documents, not list labels doc_count = documents.count("docItem:") @@ -403,7 +395,7 @@ def extractAvailableDocumentsSummary(service: Any, context: Any) -> str: def extractAvailableDocumentsIndex(service: Any, context: Any) -> str: """Index of available documents with detailed references for parameter generation.""" try: - return service.workflow.getAvailableDocuments(context.workflow) + return service.workflow.getAvailableDocuments(service.currentWorkflow) except Exception as e: logger.error(f"Error getting document index: {str(e)}") return "No documents available"