From 1aecec9d6146613e42462a57f3e821d93f8a97be Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sat, 11 Oct 2025 18:51:23 +0200
Subject: [PATCH] full dynamic document extraction - processing - generation
---
modules/services/serviceAi/mainServiceAi.py | 58 ++++-----
.../mainServiceGeneration.py | 69 +++++++----
.../serviceGeneration/prompt_builder.py | 116 ++++++++++++++++--
.../renderers/docx_renderer.py | 116 +++++++++++++++++-
test_document_processing.py | 6 +-
5 files changed, 295 insertions(+), 70 deletions(-)
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index 90c43273..3282d54f 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -1515,68 +1515,70 @@ class AiService:
title = "AI Generated Document"
# Get format-specific extraction prompt
- extraction_prompt = generation_service.getExtractionPrompt(
- output_format=outputFormat,
- user_prompt=prompt,
- title=title
+ extractionPrompt = await generation_service.getExtractionPrompt(
+ outputFormat=outputFormat,
+ userPrompt=prompt,
+ title=title,
+ aiService=self
)
# Process documents with format-specific prompt using CLEAN mode
# This ensures no debug metadata is included in the final output
- ai_response = await self._callAiTextClean(extraction_prompt, documents, options)
+ aiResponse = await self._callAiTextClean(extractionPrompt, documents, options)
# Parse filename header from AI response if present
- parsed_filename = None
+ parsedFilename = None
try:
- if ai_response:
- first_newline = ai_response.find('\n')
- header_line = ai_response if first_newline == -1 else ai_response[:first_newline]
- if header_line.strip().lower().startswith('filename:'):
- parsed = header_line.split(':', 1)[1].strip()
+ if aiResponse:
+ firstNewline = aiResponse.find('\n')
+ headerLine = aiResponse if firstNewline == -1 else aiResponse[:firstNewline]
+ if headerLine.strip().lower().startswith('filename:'):
+ parsed = headerLine.split(':', 1)[1].strip()
# basic sanitization
import re
parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed)
parsed = re.sub(r"-+", "-", parsed).strip('-')
if parsed:
- parsed_filename = parsed
+ parsedFilename = parsed
# remove header line from content for rendering
- ai_response = ai_response[first_newline+1:].lstrip('\n') if first_newline != -1 else ''
+ aiResponse = aiResponse[firstNewline+1:].lstrip('\n') if firstNewline != -1 else ''
except Exception:
- parsed_filename = None
+ parsedFilename = None
- if not ai_response or ai_response.strip() == "":
+ if not aiResponse or aiResponse.strip() == "":
raise Exception("AI content generation failed")
# Render the content to the specified format
- rendered_content, mime_type = await generation_service.renderReport(
- extracted_content=ai_response,
- output_format=outputFormat,
+ renderedContent, mimeType = await generation_service.renderReport(
+ extractedContent=aiResponse,
+ outputFormat=outputFormat,
title=title,
- user_prompt=prompt
+ userPrompt=prompt,
+ aiService=self
)
# Generate meaningful filename (use AI-provided if valid, else fallback)
from datetime import datetime, UTC
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
- if parsed_filename and parsed_filename.lower().endswith(f".{outputFormat.lower()}"):
- filename = parsed_filename
+ if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
+ filename = parsedFilename
else:
- safe_title = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
- filename = f"{safe_title or 'document'}-{timestamp}.{outputFormat}"
+ safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
+ filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"
# Return structured result with document information
return {
"success": True,
- "content": ai_response, # Raw AI response
- "rendered_content": rendered_content, # Formatted content
- "mime_type": mime_type,
+ "content": aiResponse, # Raw AI response
+ "rendered_content": renderedContent, # Formatted content
+ "mime_type": mimeType,
"filename": filename,
"format": outputFormat,
"title": title,
"documents": [{
"documentName": filename,
- "documentData": rendered_content,
- "mimeType": mime_type
+ "documentData": renderedContent,
+ "mimeType": mimeType
}]
}
diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py
index 9bdf050d..ddc4cc4e 100644
--- a/modules/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/services/serviceGeneration/mainServiceGeneration.py
@@ -296,14 +296,16 @@ class GenerationService:
'workflowId': 'unknown'
}
- async def renderReport(self, extracted_content: str, output_format: str, title: str, user_prompt: str = None) -> tuple[str, str]:
+ async def renderReport(self, extractedContent: str, outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
"""
Render extracted content to the specified output format.
Args:
- extracted_content: Content extracted by AI using format-specific prompt
- output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+ extractedContent: Content extracted by AI using format-specific prompt
+ outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
title: Report title
+ userPrompt: User's original prompt for report generation
+ aiService: AI service instance for generation prompt creation
Returns:
tuple: (rendered_content, mime_type)
@@ -317,66 +319,83 @@ class GenerationService:
debug_dir = os.path.join(debug_root, f"render_input_{ts}")
os.makedirs(debug_dir, exist_ok=True)
with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
- f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
+ f.write(f"title: {title}\nformat: {outputFormat}\nlength: {len(extractedContent or '')}\nstarts_with_brace: {str(extractedContent.strip().startswith('{') if extractedContent else False)}\n")
with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
- f.write(extracted_content or "")
+ f.write(extractedContent or "")
except Exception:
pass
# Get the appropriate renderer for the format
- renderer = self._getFormatRenderer(output_format)
+ renderer = self._getFormatRenderer(outputFormat)
if not renderer:
- raise ValueError(f"Unsupported output format: {output_format}")
+ raise ValueError(f"Unsupported output format: {outputFormat}")
- # Render the content with user prompt for structure
- rendered_content, mime_type = await renderer.render(extracted_content, title, user_prompt)
+ # Generate AI-based generation prompt if AI service is available
+ generationPrompt = userPrompt # Default to user prompt
+ if aiService and userPrompt:
+ try:
+ from .prompt_builder import buildGenerationPrompt
+ generationPrompt = await buildGenerationPrompt(
+ outputFormat=outputFormat,
+ userPrompt=userPrompt,
+ title=title,
+ aiService=aiService
+ )
+ except Exception as e:
+ logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt")
+ generationPrompt = userPrompt
+
+ # Render the content with AI-generated prompt
+ renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt)
# DEBUG: dump rendered output
try:
import os
with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
- f.write(rendered_content or "")
+ f.write(renderedContent or "")
except Exception:
pass
- logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
- return rendered_content, mime_type
+ logger.info(f"Successfully rendered report to {outputFormat} format: {len(renderedContent)} characters")
+ return renderedContent, mimeType
except Exception as e:
- logger.error(f"Error rendering report to {output_format}: {str(e)}")
+ logger.error(f"Error rendering report to {outputFormat}: {str(e)}")
raise
- def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str:
+ async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
"""
Get the format-specific extraction prompt for AI content extraction.
Args:
- output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
- user_prompt: User's original prompt for report generation
+ outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+ userPrompt: User's original prompt for report generation
title: Report title
+ aiService: AI service instance for intent extraction
Returns:
str: Format-specific prompt for AI extraction
"""
try:
# Get the appropriate renderer for the format
- renderer = self._getFormatRenderer(output_format)
+ renderer = self._getFormatRenderer(outputFormat)
if not renderer:
- raise ValueError(f"Unsupported output format: {output_format}")
+ raise ValueError(f"Unsupported output format: {outputFormat}")
# Build centralized prompt with generic rules + format-specific guidelines
from .prompt_builder import buildExtractionPrompt
- extraction_prompt = buildExtractionPrompt(
- output_format=output_format,
+ extractionPrompt = await buildExtractionPrompt(
+ outputFormat=outputFormat,
renderer=renderer,
- user_prompt=user_prompt,
- title=title
+ userPrompt=userPrompt,
+ title=title,
+ aiService=aiService
)
- logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters")
- return extraction_prompt
+ logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
+ return extractionPrompt
except Exception as e:
- logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}")
+ logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
raise
def _getFormatRenderer(self, output_format: str):
diff --git a/modules/services/serviceGeneration/prompt_builder.py b/modules/services/serviceGeneration/prompt_builder.py
index 89f6bfe9..1565e42a 100644
--- a/modules/services/serviceGeneration/prompt_builder.py
+++ b/modules/services/serviceGeneration/prompt_builder.py
@@ -16,15 +16,16 @@ class _RendererLike(Protocol):
...
-def buildExtractionPrompt(
- output_format: str,
+async def buildExtractionPrompt(
+ outputFormat: str,
renderer: _RendererLike,
- user_prompt: str,
- title: str
+ userPrompt: str,
+ title: str,
+ aiService=None
) -> str:
"""
Build the final extraction prompt by combining:
- - The raw user prompt (verbatim)
+ - Parsed extraction intent from user prompt (using AI)
- Generic cross-format instructions (filename header + real-data policy)
- Format-specific guidelines snippet provided by the renderer
@@ -33,13 +34,16 @@ def buildExtractionPrompt(
followed by a blank line and then ONLY the document content according to the target format.
"""
- format_guidelines = renderer.getExtractionPrompt(user_prompt, title)
+ # Parse user prompt to separate extraction intent from generation format using AI
+ extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService)
+
+ formatGuidelines = renderer.getExtractionPrompt(userPrompt, title)
# Generic block appears once for every format
- generic_intro = f"""
-{user_prompt}
+ genericIntro = f"""
+{extractionIntent}
-You are generating a document in {output_format.upper()} format for the title: "{title}".
+You are generating a document in {outputFormat.upper()} format for the title: "{title}".
Rules:
- The user's intent fully defines the structure. Do not assume a fixed template or headings.
@@ -62,13 +66,99 @@ Common policy:
""".strip()
# Final assembly
- final_prompt = (
- generic_intro
+ finalPrompt = (
+ genericIntro
+ "\n\nFORMAT-SPECIFIC GUIDELINES:\n"
- + format_guidelines.strip()
+ + formatGuidelines.strip()
+ "\n\nGenerate the complete document content now based on the source documents below:"
)
- return final_prompt
+ return finalPrompt
+
+
+async def buildGenerationPrompt(
+ outputFormat: str,
+ userPrompt: str,
+ title: str,
+ aiService=None
+) -> str:
+ """
+ Use AI to build the generation prompt based on user intent and format requirements.
+ Focus on what's important for the user and how to structure the content.
+ """
+ if not aiService:
+ # Fallback if no AI service available
+ return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
+
+ try:
+ # Protect userPrompt from injection
+ safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
+
+ # AI call to generate the appropriate generation prompt
+ generationPromptRequest = f"""
+Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.
+
+User request: "{safeUserPrompt}"
+Document title: "{title}"
+Output format: {outputFormat}
+
+Create a generation prompt that:
+1. Identifies what content is most important for the user
+2. Specifies how to structure and organize the content. Support with your own input on structure to best match the user's intention.
+3. Includes any specific formatting or presentation requirements
+4. Ensures the document meets the user's needs
+
+Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
+"""
+
+ # Call AI service to generate the prompt
+ result = await aiService.callAi(
+ prompt=generationPromptRequest,
+ documents=None,
+ options=None
+ )
+
+ return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
+
+ except Exception:
+ # Fallback on any error
+ return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
+
+
+async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None) -> str:
+ """
+ Use AI to extract the core content intention from the user prompt.
+ Focus on WHAT the user wants to extract, not HOW to format it.
+ """
+ if not aiService:
+ # Fallback if no AI service available
+ return "Extract all relevant content from the document according to the user's requirements"
+
+ try:
+ # Protect userPrompt from injection by escaping quotes and newlines
+ safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
+
+ # Simple AI call to extract the intention
+ extractionPrompt = f"""
+Extract the core content intention from this user request. Focus on WHAT content they want.
+
+User request: "{safeUserPrompt}"
+
+Return only the content intention in a simple format like "Extract: [content description]"
+Do not include formatting instructions, file types, or output methods.
+"""
+
+ # Call AI service to extract intention
+ result = await aiService.callAi(
+ prompt=extractionPrompt,
+ documents=None,
+ options=None
+ )
+
+ return result if result else "Extract all relevant content from the document according to the user's requirements"
+
+ except Exception:
+ # Fallback on any error
+ return "Extract all relevant content from the document according to the user's requirements"
diff --git a/modules/services/serviceGeneration/renderers/docx_renderer.py b/modules/services/serviceGeneration/renderers/docx_renderer.py
index c4919d42..450a1c72 100644
--- a/modules/services/serviceGeneration/renderers/docx_renderer.py
+++ b/modules/services/serviceGeneration/renderers/docx_renderer.py
@@ -46,6 +46,7 @@ class DocxRenderer(BaseRenderer):
"- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n"
"- Use bullet points (-) for lists and sub-items\n"
"- Use **bold** for emphasis on key terms\n"
+ "- Use pipe-separated format (Item | Status) for tables when appropriate\n"
"- Provide clean, structured content that can be directly converted to Word formatting\n"
"- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n"
"- Start directly with your content - no introductory text or separators\n"
@@ -348,6 +349,40 @@ class DocxRenderer(BaseRenderer):
except Exception as e:
self.logger.warning(f"Could not style table: {str(e)}")
+ def _process_table_row(self, doc, line: str):
+ """Process a table row and add it to the document."""
+ if not line.strip():
+ return
+
+ # Split by pipe separator
+ parts = [part.strip() for part in line.split('|')]
+
+ if len(parts) >= 2:
+ # This is a table row - create a table if it doesn't exist
+ if not hasattr(self, '_current_table') or self._current_table is None:
+ # Create new table
+ self._current_table = doc.add_table(rows=1, cols=len(parts))
+ self._current_table.style = 'Table Grid'
+
+ # Add header row
+ for i, part in enumerate(parts):
+ if i < len(self._current_table.rows[0].cells):
+ cell = self._current_table.rows[0].cells[i]
+ cell.text = part
+ # Make header bold
+ for paragraph in cell.paragraphs:
+ for run in paragraph.runs:
+ run.bold = True
+ else:
+ # Add data row to existing table
+ row = self._current_table.add_row()
+ for i, part in enumerate(parts):
+ if i < len(row.cells):
+ row.cells[i].text = part
+ else:
+ # Not a table row, treat as regular text
+ doc.add_paragraph(line)
+
def _clean_ai_content(self, content: str) -> str:
"""Clean AI-generated content by removing debug information and duplicates."""
if not content:
@@ -435,6 +470,16 @@ class DocxRenderer(BaseRenderer):
bullet_text = line[2:] # Remove "- "
self._add_bullet_point(doc, bullet_text)
+ # Check if this is a table row (contains pipe separator)
+ elif '|' in line:
+ # Flush current paragraph
+ if current_paragraph:
+ self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
+ current_paragraph = []
+
+ # This is a table row - collect table data
+ self._process_table_row(doc, line)
+
else:
# Regular text - add to current paragraph
current_paragraph.append(line)
@@ -462,9 +507,40 @@ class DocxRenderer(BaseRenderer):
if part:
run = para.add_run(part)
run.bold = True
+
+ def _process_table_row(self, doc, line: str):
+ """Process a table row and add it to the document."""
+ if not line.strip():
+ return
+
+ # Split by pipe separator
+ parts = [part.strip() for part in line.split('|')]
+
+ if len(parts) >= 2:
+ # This is a table row - create a table if it doesn't exist
+ if not hasattr(self, '_current_table') or self._current_table is None:
+ # Create new table
+ self._current_table = doc.add_table(rows=1, cols=len(parts))
+ self._current_table.style = 'Table Grid'
+
+ # Add header row
+ for i, part in enumerate(parts):
+ if i < len(self._current_table.rows[0].cells):
+ cell = self._current_table.rows[0].cells[i]
+ cell.text = part
+ # Make header bold
+ for paragraph in cell.paragraphs:
+ for run in paragraph.runs:
+ run.bold = True
+ else:
+ # Add data row to existing table
+ row = self._current_table.add_row()
+ for i, part in enumerate(parts):
+ if i < len(row.cells):
+ row.cells[i].text = part
else:
- # Regular paragraph
- doc.add_paragraph(text)
+ # Not a table row, treat as regular text
+ doc.add_paragraph(line)
def _add_bullet_point(self, doc, text: str):
"""Add a bullet point to the document."""
@@ -488,4 +564,38 @@ class DocxRenderer(BaseRenderer):
# Bold text
if part:
run = para.add_run(part)
- run.bold = True
\ No newline at end of file
+ run.bold = True
+
+ def _process_table_row(self, doc, line: str):
+ """Process a table row and add it to the document."""
+ if not line.strip():
+ return
+
+ # Split by pipe separator
+ parts = [part.strip() for part in line.split('|')]
+
+ if len(parts) >= 2:
+ # This is a table row - create a table if it doesn't exist
+ if not hasattr(self, '_current_table') or self._current_table is None:
+ # Create new table
+ self._current_table = doc.add_table(rows=1, cols=len(parts))
+ self._current_table.style = 'Table Grid'
+
+ # Add header row
+ for i, part in enumerate(parts):
+ if i < len(self._current_table.rows[0].cells):
+ cell = self._current_table.rows[0].cells[i]
+ cell.text = part
+ # Make header bold
+ for paragraph in cell.paragraphs:
+ for run in paragraph.runs:
+ run.bold = True
+ else:
+ # Add data row to existing table
+ row = self._current_table.add_row()
+ for i, part in enumerate(parts):
+ if i < len(row.cells):
+ row.cells[i].text = part
+ else:
+ # Not a table row, treat as regular text
+ doc.add_paragraph(line)
\ No newline at end of file
diff --git a/test_document_processing.py b/test_document_processing.py
index fe16967d..bafc05c0 100644
--- a/test_document_processing.py
+++ b/test_document_processing.py
@@ -170,10 +170,14 @@ async def process_documents_and_generate_summary():
# Run a single end-to-end test to avoid the loop issue
logger.info("🧪 Running single end-to-end test...")
+ # userPrompt = "Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations."
+
+    userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with an 'x' marker for all items, which are yellow highlighted."
+
try:
# Single AI call with DOCX generation
ai_response = await ai_service.callAi(
- prompt="Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations.",
+ prompt=userPrompt,
documents=documents,
options=ai_options,
outputFormat="docx",