diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index 90c43273..3282d54f 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -1515,68 +1515,70 @@ class AiService: title = "AI Generated Document" # Get format-specific extraction prompt - extraction_prompt = generation_service.getExtractionPrompt( - output_format=outputFormat, - user_prompt=prompt, - title=title + extractionPrompt = await generation_service.getExtractionPrompt( + outputFormat=outputFormat, + userPrompt=prompt, + title=title, + aiService=self ) # Process documents with format-specific prompt using CLEAN mode # This ensures no debug metadata is included in the final output - ai_response = await self._callAiTextClean(extraction_prompt, documents, options) + aiResponse = await self._callAiTextClean(extractionPrompt, documents, options) # Parse filename header from AI response if present - parsed_filename = None + parsedFilename = None try: - if ai_response: - first_newline = ai_response.find('\n') - header_line = ai_response if first_newline == -1 else ai_response[:first_newline] - if header_line.strip().lower().startswith('filename:'): - parsed = header_line.split(':', 1)[1].strip() + if aiResponse: + firstNewline = aiResponse.find('\n') + headerLine = aiResponse if firstNewline == -1 else aiResponse[:firstNewline] + if headerLine.strip().lower().startswith('filename:'): + parsed = headerLine.split(':', 1)[1].strip() # basic sanitization import re parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed) parsed = re.sub(r"-+", "-", parsed).strip('-') if parsed: - parsed_filename = parsed + parsedFilename = parsed # remove header line from content for rendering - ai_response = ai_response[first_newline+1:].lstrip('\n') if first_newline != -1 else '' + aiResponse = aiResponse[firstNewline+1:].lstrip('\n') if firstNewline != -1 else '' except Exception: - parsed_filename = None + parsedFilename = None - if not ai_response or ai_response.strip() == "": + if not aiResponse or aiResponse.strip() == "": raise Exception("AI content generation failed") # Render the content to the specified format - rendered_content, mime_type = await generation_service.renderReport( - extracted_content=ai_response, - output_format=outputFormat, + renderedContent, mimeType = await generation_service.renderReport( + extractedContent=aiResponse, + outputFormat=outputFormat, title=title, - user_prompt=prompt + userPrompt=prompt, + aiService=self ) # Generate meaningful filename (use AI-provided if valid, else fallback) from datetime import datetime, UTC timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") - if parsed_filename and parsed_filename.lower().endswith(f".{outputFormat.lower()}"): - filename = parsed_filename + if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"): + filename = parsedFilename else: - safe_title = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-') - filename = f"{safe_title or 'document'}-{timestamp}.{outputFormat}" + safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-') + filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}" # Return structured result with document information return { "success": True, - "content": ai_response, # Raw AI response - "rendered_content": rendered_content, # Formatted content - "mime_type": mime_type, + "content": aiResponse, # Raw AI response + "rendered_content": renderedContent, # Formatted content + "mime_type": mimeType, "filename": filename, "format": outputFormat, "title": title, "documents": [{ "documentName": filename, - "documentData": rendered_content, - "mimeType": mime_type + "documentData": renderedContent, + "mimeType": mimeType }] } diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py index 9bdf050d..ddc4cc4e 100644 --- a/modules/services/serviceGeneration/mainServiceGeneration.py +++ b/modules/services/serviceGeneration/mainServiceGeneration.py @@ -296,14 +296,16 @@ class GenerationService: 'workflowId': 'unknown' } - async def renderReport(self, extracted_content: str, output_format: str, title: str, user_prompt: str = None) -> tuple[str, str]: + async def renderReport(self, extractedContent: str, outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]: """ Render extracted content to the specified output format. Args: - extracted_content: Content extracted by AI using format-specific prompt - output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx) + extractedContent: Content extracted by AI using format-specific prompt + outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx) title: Report title + userPrompt: User's original prompt for report generation + aiService: AI service instance for generation prompt creation Returns: tuple: (rendered_content, mime_type) @@ -317,66 +319,83 @@ class GenerationService: debug_dir = os.path.join(debug_root, f"render_input_{ts}") os.makedirs(debug_dir, exist_ok=True) with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f: - f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n") + f.write(f"title: {title}\nformat: {outputFormat}\nlength: {len(extractedContent or '')}\nstarts_with_brace: {str(extractedContent.strip().startswith('{') if extractedContent else False)}\n") with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f: - f.write(extracted_content or "") + f.write(extractedContent or "") except Exception: pass # Get the appropriate renderer for the format - renderer = self._getFormatRenderer(output_format) + renderer = self._getFormatRenderer(outputFormat) if not renderer: - raise ValueError(f"Unsupported output format: {output_format}") + raise ValueError(f"Unsupported output format: {outputFormat}") - # Render the content with user prompt for structure - rendered_content, mime_type = await renderer.render(extracted_content, title, user_prompt) + # Generate AI-based generation prompt if AI service is available + generationPrompt = userPrompt # Default to user prompt + if aiService and userPrompt: + try: + from .prompt_builder import buildGenerationPrompt + generationPrompt = await buildGenerationPrompt( + outputFormat=outputFormat, + userPrompt=userPrompt, + title=title, + aiService=aiService + ) + except Exception as e: + logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt") + generationPrompt = userPrompt + + # Render the content with AI-generated prompt + renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt) # DEBUG: dump rendered output try: import os with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f: - f.write(rendered_content or "") + f.write(renderedContent or "") except Exception: pass - logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters") - return rendered_content, mime_type + logger.info(f"Successfully rendered report to {outputFormat} format: {len(renderedContent)} characters") + return renderedContent, mimeType except Exception as e: - logger.error(f"Error rendering report to {output_format}: {str(e)}") + logger.error(f"Error rendering report to {outputFormat}: {str(e)}") raise - def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str: + async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str: """ Get the format-specific extraction prompt for AI content extraction. Args: - output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx) - user_prompt: User's original prompt for report generation + outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx) + userPrompt: User's original prompt for report generation title: Report title + aiService: AI service instance for intent extraction Returns: str: Format-specific prompt for AI extraction """ try: # Get the appropriate renderer for the format - renderer = self._getFormatRenderer(output_format) + renderer = self._getFormatRenderer(outputFormat) if not renderer: - raise ValueError(f"Unsupported output format: {output_format}") + raise ValueError(f"Unsupported output format: {outputFormat}") # Build centralized prompt with generic rules + format-specific guidelines from .prompt_builder import buildExtractionPrompt - extraction_prompt = buildExtractionPrompt( - output_format=output_format, + extractionPrompt = await buildExtractionPrompt( + outputFormat=outputFormat, renderer=renderer, - user_prompt=user_prompt, - title=title + userPrompt=userPrompt, + title=title, + aiService=aiService ) - logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters") - return extraction_prompt + logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters") + return extractionPrompt except Exception as e: - logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}") + logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}") raise def _getFormatRenderer(self, output_format: str): diff --git a/modules/services/serviceGeneration/prompt_builder.py b/modules/services/serviceGeneration/prompt_builder.py index 89f6bfe9..1565e42a 100644 --- a/modules/services/serviceGeneration/prompt_builder.py +++ b/modules/services/serviceGeneration/prompt_builder.py @@ -16,15 +16,16 @@ class _RendererLike(Protocol): ... -def buildExtractionPrompt( - output_format: str, +async def buildExtractionPrompt( + outputFormat: str, renderer: _RendererLike, - user_prompt: str, - title: str + userPrompt: str, + title: str, + aiService=None ) -> str: """ Build the final extraction prompt by combining: - - The raw user prompt (verbatim) + - Parsed extraction intent from user prompt (using AI) - Generic cross-format instructions (filename header + real-data policy) - Format-specific guidelines snippet provided by the renderer @@ -33,13 +34,16 @@ def buildExtractionPrompt( followed by a blank line and then ONLY the document content according to the target format. """ - format_guidelines = renderer.getExtractionPrompt(user_prompt, title) + # Parse user prompt to separate extraction intent from generation format using AI + extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService) + + formatGuidelines = renderer.getExtractionPrompt(userPrompt, title) # Generic block appears once for every format - generic_intro = f""" -{user_prompt} + genericIntro = f""" +{extractionIntent} -You are generating a document in {output_format.upper()} format for the title: "{title}". +You are generating a document in {outputFormat.upper()} format for the title: "{title}". Rules: - The user's intent fully defines the structure. Do not assume a fixed template or headings. @@ -62,13 +66,99 @@ Common policy: """.strip() # Final assembly - final_prompt = ( - generic_intro + finalPrompt = ( + genericIntro + "\n\nFORMAT-SPECIFIC GUIDELINES:\n" - + format_guidelines.strip() + + formatGuidelines.strip() + "\n\nGenerate the complete document content now based on the source documents below:" ) - return final_prompt + return finalPrompt + + +async def buildGenerationPrompt( + outputFormat: str, + userPrompt: str, + title: str, + aiService=None +) -> str: + """ + Use AI to build the generation prompt based on user intent and format requirements. + Focus on what's important for the user and how to structure the content. + """ + if not aiService: + # Fallback if no AI service available + return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content." + + try: + # Protect userPrompt from injection + safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ') + + # AI call to generate the appropriate generation prompt + generationPromptRequest = f""" +Based on this user request, create a detailed generation prompt for creating a {outputFormat} document. + +User request: "{safeUserPrompt}" +Document title: "{title}" +Output format: {outputFormat} + +Create a generation prompt that: +1. Identifies what content is most important for the user +2. Specifies how to structure and organize the content. Support with your inputs fo rstructure to match best the user's intention. +3. Includes any specific formatting or presentation requirements +4. Ensures the document meets the user's needs + +Return only the generation prompt, starting with "Generate a {outputFormat} document that..." +""" + + # Call AI service to generate the prompt + result = await aiService.callAi( + prompt=generationPromptRequest, + documents=None, + options=None + ) + + return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content." + + except Exception: + # Fallback on any error + return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content." + + +async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None) -> str: + """ + Use AI to extract the core content intention from the user prompt. + Focus on WHAT the user wants to extract, not HOW to format it. + """ + if not aiService: + # Fallback if no AI service available + return "Extract all relevant content from the document according to the user's requirements" + + try: + # Protect userPrompt from injection by escaping quotes and newlines + safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ') + + # Simple AI call to extract the intention + extractionPrompt = f""" +Extract the core content intention from this user request. Focus on WHAT content they want. + +User request: "{safeUserPrompt}" + +Return only the content intention in a simple format like "Extract: [content description]" +Do not include formatting instructions, file types, or output methods. +""" + + # Call AI service to extract intention + result = await aiService.callAi( + prompt=extractionPrompt, + documents=None, + options=None + ) + + return result if result else "Extract all relevant content from the document according to the user's requirements" + + except Exception: + # Fallback on any error + return "Extract all relevant content from the document according to the user's requirements" diff --git a/modules/services/serviceGeneration/renderers/docx_renderer.py b/modules/services/serviceGeneration/renderers/docx_renderer.py index c4919d42..450a1c72 100644 --- a/modules/services/serviceGeneration/renderers/docx_renderer.py +++ b/modules/services/serviceGeneration/renderers/docx_renderer.py @@ -46,6 +46,7 @@ class DocxRenderer(BaseRenderer): "- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n" "- Use bullet points (-) for lists and sub-items\n" "- Use **bold** for emphasis on key terms\n" + "- Use pipe-separated format (Item | Status) for tables when appropriate\n" "- Provide clean, structured content that can be directly converted to Word formatting\n" "- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n" "- Start directly with your content - no introductory text or separators\n" @@ -348,6 +349,40 @@ class DocxRenderer(BaseRenderer): except Exception as e: self.logger.warning(f"Could not style table: {str(e)}") + def _process_table_row(self, doc, line: str): + """Process a table row and add it to the document.""" + if not line.strip(): + return + + # Split by pipe separator + parts = [part.strip() for part in line.split('|')] + + if len(parts) >= 2: + # This is a table row - create a table if it doesn't exist + if not hasattr(self, '_current_table') or self._current_table is None: + # Create new table + self._current_table = doc.add_table(rows=1, cols=len(parts)) + self._current_table.style = 'Table Grid' + + # Add header row + for i, part in enumerate(parts): + if i < len(self._current_table.rows[0].cells): + cell = self._current_table.rows[0].cells[i] + cell.text = part + # Make header bold + for paragraph in cell.paragraphs: + for run in paragraph.runs: + run.bold = True + else: + # Add data row to existing table + row = self._current_table.add_row() + for i, part in enumerate(parts): + if i < len(row.cells): + row.cells[i].text = part + else: + # Not a table row, treat as regular text + doc.add_paragraph(line) + def _clean_ai_content(self, content: str) -> str: """Clean AI-generated content by removing debug information and duplicates.""" if not content: @@ -435,6 +470,16 @@ class DocxRenderer(BaseRenderer): bullet_text = line[2:] # Remove "- " self._add_bullet_point(doc, bullet_text) + # Check if this is a table row (contains pipe separator) + elif '|' in line: + # Flush current paragraph + if current_paragraph: + self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph)) + current_paragraph = [] + + # This is a table row - collect table data + self._process_table_row(doc, line) + else: # Regular text - add to current paragraph current_paragraph.append(line) @@ -462,9 +507,40 @@ class DocxRenderer(BaseRenderer): if part: run = para.add_run(part) run.bold = True + + def _process_table_row(self, doc, line: str): + """Process a table row and add it to the document.""" + if not line.strip(): + return + + # Split by pipe separator + parts = [part.strip() for part in line.split('|')] + + if len(parts) >= 2: + # This is a table row - create a table if it doesn't exist + if not hasattr(self, '_current_table') or self._current_table is None: + # Create new table + self._current_table = doc.add_table(rows=1, cols=len(parts)) + self._current_table.style = 'Table Grid' + + # Add header row + for i, part in enumerate(parts): + if i < len(self._current_table.rows[0].cells): + cell = self._current_table.rows[0].cells[i] + cell.text = part + # Make header bold + for paragraph in cell.paragraphs: + for run in paragraph.runs: + run.bold = True + else: + # Add data row to existing table + row = self._current_table.add_row() + for i, part in enumerate(parts): + if i < len(row.cells): + row.cells[i].text = part else: - # Regular paragraph - doc.add_paragraph(text) + # Not a table row, treat as regular text + doc.add_paragraph(line) def _add_bullet_point(self, doc, text: str): """Add a bullet point to the document.""" @@ -488,4 +564,38 @@ class DocxRenderer(BaseRenderer): # Bold text if part: run = para.add_run(part) - run.bold = True \ No newline at end of file + run.bold = True + + def _process_table_row(self, doc, line: str): + """Process a table row and add it to the document.""" + if not line.strip(): + return + + # Split by pipe separator + parts = [part.strip() for part in line.split('|')] + + if len(parts) >= 2: + # This is a table row - create a table if it doesn't exist + if not hasattr(self, '_current_table') or self._current_table is None: + # Create new table + self._current_table = doc.add_table(rows=1, cols=len(parts)) + self._current_table.style = 'Table Grid' + + # Add header row + for i, part in enumerate(parts): + if i < len(self._current_table.rows[0].cells): + cell = self._current_table.rows[0].cells[i] + cell.text = part + # Make header bold + for paragraph in cell.paragraphs: + for run in paragraph.runs: + run.bold = True + else: + # Add data row to existing table + row = self._current_table.add_row() + for i, part in enumerate(parts): + if i < len(row.cells): + row.cells[i].text = part + else: + # Not a table row, treat as regular text + doc.add_paragraph(line) \ No newline at end of file diff --git a/test_document_processing.py b/test_document_processing.py index fe16967d..bafc05c0 100644 --- a/test_document_processing.py +++ b/test_document_processing.py @@ -170,10 +170,14 @@ async def process_documents_and_generate_summary(): # Run a single end-to-end test to avoid the loop issue logger.info("🧪 Running single end-to-end test...") + # userPrompt = "Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations." + + userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with a 'x' marker for all items, which are yellow highlighted." + try: # Single AI call with DOCX generation ai_response = await ai_service.callAi( - prompt="Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations.", + prompt=userPrompt, documents=documents, options=ai_options, outputFormat="docx",