full dynamic document extraction - processing - generation

2025-10-11 18:51:23 +02:00 · 2025-10-11 18:51:23 +02:00 · 1aecec9d61
commit 1aecec9d61
parent be5f6773b6
5 changed files with 295 additions and 70 deletions
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@ -1515,68 +1515,70 @@ class AiService:
                title = "AI Generated Document"
            
            # Get format-specific extraction prompt
-            extraction_prompt = generation_service.getExtractionPrompt(
-                output_format=outputFormat,
-                user_prompt=prompt,
-                title=title
+            extractionPrompt = await generation_service.getExtractionPrompt(
+                outputFormat=outputFormat,
+                userPrompt=prompt,
+                title=title,
+                aiService=self
            )
            
            # Process documents with format-specific prompt using CLEAN mode
            # This ensures no debug metadata is included in the final output
-            ai_response = await self._callAiTextClean(extraction_prompt, documents, options)
+            aiResponse = await self._callAiTextClean(extractionPrompt, documents, options)

            # Parse filename header from AI response if present
-            parsed_filename = None
+            parsedFilename = None
            try:
-                if ai_response:
-                    first_newline = ai_response.find('\n')
-                    header_line = ai_response if first_newline == -1 else ai_response[:first_newline]
-                    if header_line.strip().lower().startswith('filename:'):
-                        parsed = header_line.split(':', 1)[1].strip()
+                if aiResponse:
+                    firstNewline = aiResponse.find('\n')
+                    headerLine = aiResponse if firstNewline == -1 else aiResponse[:firstNewline]
+                    if headerLine.strip().lower().startswith('filename:'):
+                        parsed = headerLine.split(':', 1)[1].strip()
                        # basic sanitization
                        import re
                        parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed)
                        parsed = re.sub(r"-+", "-", parsed).strip('-')
                        if parsed:
-                            parsed_filename = parsed
+                            parsedFilename = parsed
                            # remove header line from content for rendering
-                            ai_response = ai_response[first_newline+1:].lstrip('\n') if first_newline != -1 else ''
+                            aiResponse = aiResponse[firstNewline+1:].lstrip('\n') if firstNewline != -1 else ''
            except Exception:
-                parsed_filename = None
+                parsedFilename = None
            
-            if not ai_response or ai_response.strip() == "":
+            if not aiResponse or aiResponse.strip() == "":
                raise Exception("AI content generation failed")
            
            # Render the content to the specified format
-            rendered_content, mime_type = await generation_service.renderReport(
-                extracted_content=ai_response,
-                output_format=outputFormat,
+            renderedContent, mimeType = await generation_service.renderReport(
+                extractedContent=aiResponse,
+                outputFormat=outputFormat,
                title=title,
-                user_prompt=prompt
+                userPrompt=prompt,
+                aiService=self
            )
            
            # Generate meaningful filename (use AI-provided if valid, else fallback)
            from datetime import datetime, UTC
            timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
-            if parsed_filename and parsed_filename.lower().endswith(f".{outputFormat.lower()}"):
-                filename = parsed_filename
+            if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
+                filename = parsedFilename
            else:
-                safe_title = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
-                filename = f"{safe_title or 'document'}-{timestamp}.{outputFormat}"
+                safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
+                filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"
            
            # Return structured result with document information
            return {
                "success": True,
-                "content": ai_response,  # Raw AI response
-                "rendered_content": rendered_content,  # Formatted content
-                "mime_type": mime_type,
+                "content": aiResponse,  # Raw AI response
+                "rendered_content": renderedContent,  # Formatted content
+                "mime_type": mimeType,
                "filename": filename,
                "format": outputFormat,
                "title": title,
                "documents": [{
                    "documentName": filename,
-                    "documentData": rendered_content,
-                    "mimeType": mime_type
+                    "documentData": renderedContent,
+                    "mimeType": mimeType
                }]
            }
            
--- a/modules/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/services/serviceGeneration/mainServiceGeneration.py
@ -296,14 +296,16 @@ class GenerationService:
                'workflowId': 'unknown'
            }

-    async def renderReport(self, extracted_content: str, output_format: str, title: str, user_prompt: str = None) -> tuple[str, str]:
+    async def renderReport(self, extractedContent: str, outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
        """
        Render extracted content to the specified output format.
        
        Args:
-            extracted_content: Content extracted by AI using format-specific prompt
-            output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+            extractedContent: Content extracted by AI using format-specific prompt
+            outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
            title: Report title
+            userPrompt: User's original prompt for report generation
+            aiService: AI service instance for generation prompt creation
            
        Returns:
            tuple: (rendered_content, mime_type)
@ -317,66 +319,83 @@ class GenerationService:
                debug_dir = os.path.join(debug_root, f"render_input_{ts}")
                os.makedirs(debug_dir, exist_ok=True)
                with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
-                    f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
+                    f.write(f"title: {title}\nformat: {outputFormat}\nlength: {len(extractedContent or '')}\nstarts_with_brace: {str(extractedContent.strip().startswith('{') if extractedContent else False)}\n")
                with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
-                    f.write(extracted_content or "")
+                    f.write(extractedContent or "")
            except Exception:
                pass

            # Get the appropriate renderer for the format
-            renderer = self._getFormatRenderer(output_format)
+            renderer = self._getFormatRenderer(outputFormat)
            if not renderer:
-                raise ValueError(f"Unsupported output format: {output_format}")
+                raise ValueError(f"Unsupported output format: {outputFormat}")
            
-            # Render the content with user prompt for structure
-            rendered_content, mime_type = await renderer.render(extracted_content, title, user_prompt)
+            # Generate AI-based generation prompt if AI service is available
+            generationPrompt = userPrompt  # Default to user prompt
+            if aiService and userPrompt:
+                try:
+                    from .prompt_builder import buildGenerationPrompt
+                    generationPrompt = await buildGenerationPrompt(
+                        outputFormat=outputFormat,
+                        userPrompt=userPrompt,
+                        title=title,
+                        aiService=aiService
+                    )
+                except Exception as e:
+                    logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt")
+                    generationPrompt = userPrompt
+            
+            # Render the content with AI-generated prompt
+            renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt)
            # DEBUG: dump rendered output
            try:
                import os
                with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
-                    f.write(rendered_content or "")
+                    f.write(renderedContent or "")
            except Exception:
                pass
            
-            logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
-            return rendered_content, mime_type
+            logger.info(f"Successfully rendered report to {outputFormat} format: {len(renderedContent)} characters")
+            return renderedContent, mimeType
            
        except Exception as e:
-            logger.error(f"Error rendering report to {output_format}: {str(e)}")
+            logger.error(f"Error rendering report to {outputFormat}: {str(e)}")
            raise
    
-    def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str:
+    async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
        """
        Get the format-specific extraction prompt for AI content extraction.
        
        Args:
-            output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
-            user_prompt: User's original prompt for report generation
+            outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+            userPrompt: User's original prompt for report generation
            title: Report title
+            aiService: AI service instance for intent extraction
            
        Returns:
            str: Format-specific prompt for AI extraction
        """
        try:
            # Get the appropriate renderer for the format
-            renderer = self._getFormatRenderer(output_format)
+            renderer = self._getFormatRenderer(outputFormat)
            if not renderer:
-                raise ValueError(f"Unsupported output format: {output_format}")
+                raise ValueError(f"Unsupported output format: {outputFormat}")
            
            # Build centralized prompt with generic rules + format-specific guidelines
            from .prompt_builder import buildExtractionPrompt
-            extraction_prompt = buildExtractionPrompt(
-                output_format=output_format,
+            extractionPrompt = await buildExtractionPrompt(
+                outputFormat=outputFormat,
                renderer=renderer,
-                user_prompt=user_prompt,
-                title=title
+                userPrompt=userPrompt,
+                title=title,
+                aiService=aiService
            )
            
-            logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters")
-            return extraction_prompt
+            logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
+            return extractionPrompt
            
        except Exception as e:
-            logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}")
+            logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
            raise

    def _getFormatRenderer(self, output_format: str):
--- a/modules/services/serviceGeneration/prompt_builder.py
+++ b/modules/services/serviceGeneration/prompt_builder.py
@ -16,15 +16,16 @@ class _RendererLike(Protocol):
        ...


-def buildExtractionPrompt(
-    output_format: str,
+async def buildExtractionPrompt(
+    outputFormat: str,
    renderer: _RendererLike,
-    user_prompt: str,
-    title: str
+    userPrompt: str,
+    title: str,
+    aiService=None
 ) -> str:
    """
    Build the final extraction prompt by combining:
-    - The raw user prompt (verbatim)
+    - Parsed extraction intent from user prompt (using AI)
    - Generic cross-format instructions (filename header + real-data policy)
    - Format-specific guidelines snippet provided by the renderer

@ -33,13 +34,16 @@ def buildExtractionPrompt(
    followed by a blank line and then ONLY the document content according to the target format.
    """

-    format_guidelines = renderer.getExtractionPrompt(user_prompt, title)
+    # Parse user prompt to separate extraction intent from generation format using AI
+    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService)
+    
+    formatGuidelines = renderer.getExtractionPrompt(userPrompt, title)

    # Generic block appears once for every format
-    generic_intro = f"""
-{user_prompt}
+    genericIntro = f"""
+{extractionIntent}

-You are generating a document in {output_format.upper()} format for the title: "{title}".
+You are generating a document in {outputFormat.upper()} format for the title: "{title}".

 Rules:
 - The user's intent fully defines the structure. Do not assume a fixed template or headings.
@ -62,13 +66,99 @@ Common policy:
 """.strip()

    # Final assembly
-    final_prompt = (
-        generic_intro
+    finalPrompt = (
+        genericIntro
        + "\n\nFORMAT-SPECIFIC GUIDELINES:\n"
-        + format_guidelines.strip()
+        + formatGuidelines.strip()
        + "\n\nGenerate the complete document content now based on the source documents below:"
    )

-    return final_prompt
+    return finalPrompt
+
+
+async def buildGenerationPrompt(
+    outputFormat: str,
+    userPrompt: str,
+    title: str,
+    aiService=None
+) -> str:
+    """
+    Ask the AI service to turn the user's request into a generation prompt.
+
+    Args:
+        outputFormat: Target format name, inserted verbatim into the request.
+        userPrompt: User's original request; None or empty yields the fallback.
+        title: Document title, inserted verbatim into the request.
+        aiService: Object with an awaitable ``callAi(prompt, documents, options)``.
+
+    Returns:
+        The text returned by ``aiService.callAi``, or a generic fallback prompt when
+        no service/request is given, the call raises, or the answer is empty.
+    """
+    # Single fallback shared by every degraded path
+    fallbackPrompt = f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
+    if not aiService or not userPrompt:
+        return fallbackPrompt
+
+    # Flatten the request to one line and escape quotes so it stays inside the
+    # quoted slot below. Cosmetic only - this does NOT prevent prompt injection.
+    safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
+    generationPromptRequest = f"""
+Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.
+
+User request: "{safeUserPrompt}"
+Document title: "{title}"
+Output format: {outputFormat}
+
+Create a generation prompt that:
+1. Identifies what content is most important for the user
+2. Specifies how to structure and organize the content. Support with your inputs for structure to match best the user's intention.
+3. Includes any specific formatting or presentation requirements
+4. Ensures the document meets the user's needs
+
+Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
+"""
+    try:
+        result = await aiService.callAi(prompt=generationPromptRequest, documents=None, options=None)
+    except Exception:  # best effort: prompt refinement must never abort rendering
+        return fallbackPrompt
+    return result if result else fallbackPrompt
+
+
+async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None) -> str:
+    """
+    Reduce the user's request to WHAT should be extracted, dropping HOW to format it.
+
+    Args:
+        userPrompt: User's original request.
+        outputFormat: Target format; accepted for signature parity, not used here.
+        aiService: Object with an awaitable ``callAi(prompt, documents, options)``.
+
+    Returns:
+        The text returned by ``aiService.callAi``. When the service is missing,
+        fails or answers empty, the user's own request is returned instead; a
+        generic instruction is used only when there is no request at all.
+    """
+    # buildExtractionPrompt embeds this value where the verbatim user prompt used
+    # to be, so a canned sentence here would discard what the user asked for.
+    fallbackIntent = userPrompt or "Extract all relevant content from the document according to the user's requirements"
+    if not aiService or not userPrompt:
+        return fallbackIntent
+
+    # Cosmetic flattening/escaping only - this does NOT prevent prompt injection
+    safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
+    extractionPrompt = f"""
+Extract the core content intention from this user request. Focus on WHAT content they want.
+
+User request: "{safeUserPrompt}"
+
+Return only the content intention in a simple format like "Extract: [content description]"
+Do not include formatting instructions, file types, or output methods.
+"""
+    try:
+        result = await aiService.callAi(prompt=extractionPrompt, documents=None, options=None)
+    except Exception:  # best effort: fall back rather than abort prompt building
+        return fallbackIntent
+    return result if result else fallbackIntent


--- a/modules/services/serviceGeneration/renderers/docx_renderer.py
+++ b/modules/services/serviceGeneration/renderers/docx_renderer.py
@ -46,6 +46,7 @@ class DocxRenderer(BaseRenderer):
            "- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n"
            "- Use bullet points (-) for lists and sub-items\n"
            "- Use **bold** for emphasis on key terms\n"
+            "- Use pipe-separated format (Item | Status) for tables when appropriate\n"
            "- Provide clean, structured content that can be directly converted to Word formatting\n"
            "- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n"
            "- Start directly with your content - no introductory text or separators\n"
@ -348,6 +349,40 @@ class DocxRenderer(BaseRenderer):
        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")
    
+    def _process_table_row(self, doc, line: str):
+        """Add one pipe-separated line to ``doc`` as a table row.
+
+        Rows extend the current table only while it is still the last block of
+        ``doc``; otherwise a new table is started whose first row is bold.
+        Markdown outer pipes and ``---|---`` separator rows are tolerated.
+        A line without any pipe is added as a plain paragraph instead.
+        """
+        text = line.strip()
+        if '|' not in text:
+            if text:
+                doc.add_paragraph(line)  # not a table row, treat as regular text
+            return
+        if len(text) > 1 and text.startswith('|') and text.endswith('|'):
+            text = text[1:-1]  # markdown style "| a | b |": drop the outer pipes
+        parts = [part.strip() for part in text.split('|')]
+        if all(part.count('-') >= 3 and not part.strip('-:') for part in parts):
+            return  # markdown separator row ("---|---") carries no data
+        table = getattr(self, '_current_table', None)
+        tail = table._tbl.getnext() if table is not None else None
+        if (table is None or table._tbl.getparent() is not doc.element.body
+                or not (tail is None or tail.tag.endswith('}sectPr'))):
+            # No table yet, or it is no longer the last block of this document
+            table = self._current_table = doc.add_table(rows=1, cols=len(parts))
+            table.style = 'Table Grid'
+            cells, is_header = table.rows[0].cells, True
+        else:
+            cells, is_header = table.add_row().cells, False
+        for cell, part in zip(cells, parts):  # values beyond the table width are dropped
+            cell.text = part
+            for paragraph in (cell.paragraphs if is_header else ()):
+                for run in paragraph.runs:
+                    run.bold = True  # header row only
+    
    def _clean_ai_content(self, content: str) -> str:
        """Clean AI-generated content by removing debug information and duplicates."""
        if not content:
@ -435,6 +470,16 @@ class DocxRenderer(BaseRenderer):
                bullet_text = line[2:]  # Remove "- "
                self._add_bullet_point(doc, bullet_text)
                
+            # Check if this is a table row (contains pipe separator)
+            elif '|' in line:
+                # Flush current paragraph
+                if current_paragraph:
+                    self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
+                    current_paragraph = []
+                
+                # This is a table row - collect table data
+                self._process_table_row(doc, line)
+                
            else:
                # Regular text - add to current paragraph
                current_paragraph.append(line)
@ -462,9 +507,40 @@ class DocxRenderer(BaseRenderer):
                    if part:
                        run = para.add_run(part)
                        run.bold = True
+    
+    def _process_table_row(self, doc, line: str):
+        """Process a table row and add it to the document."""
+        if not line.strip():
+            return
+        
+        # Split by pipe separator
+        parts = [part.strip() for part in line.split('|')]
+        
+        if len(parts) >= 2:
+            # This is a table row - create a table if it doesn't exist
+            if not hasattr(self, '_current_table') or self._current_table is None:
+                # Create new table
+                self._current_table = doc.add_table(rows=1, cols=len(parts))
+                self._current_table.style = 'Table Grid'
+                
+                # Add header row
+                for i, part in enumerate(parts):
+                    if i < len(self._current_table.rows[0].cells):
+                        cell = self._current_table.rows[0].cells[i]
+                        cell.text = part
+                        # Make header bold
+                        for paragraph in cell.paragraphs:
+                            for run in paragraph.runs:
+                                run.bold = True
+            else:
+                # Add data row to existing table
+                row = self._current_table.add_row()
+                for i, part in enumerate(parts):
+                    if i < len(row.cells):
+                        row.cells[i].text = part
        else:
-            # Regular paragraph
-            doc.add_paragraph(text)
+            # Not a table row, treat as regular text
+            doc.add_paragraph(line)
    
    def _add_bullet_point(self, doc, text: str):
        """Add a bullet point to the document."""
@ -488,4 +564,38 @@ class DocxRenderer(BaseRenderer):
                    # Bold text
                    if part:
                        run = para.add_run(part)
-                        run.bold = True
+                        run.bold = True
+    
+    def _process_table_row(self, doc, line: str):
+        """Add one pipe-separated line to ``doc`` as a table row.
+
+        Rows extend the current table only while it is still the last block of
+        ``doc``; otherwise a new table is started whose first row is bold.
+        Markdown outer pipes and ``---|---`` separator rows are tolerated.
+        A line without any pipe is added as a plain paragraph instead.
+        """
+        text = line.strip()
+        if '|' not in text:
+            if text:
+                doc.add_paragraph(line)  # not a table row, treat as regular text
+            return
+        if len(text) > 1 and text.startswith('|') and text.endswith('|'):
+            text = text[1:-1]  # markdown style "| a | b |": drop the outer pipes
+        parts = [part.strip() for part in text.split('|')]
+        if all(part.count('-') >= 3 and not part.strip('-:') for part in parts):
+            return  # markdown separator row ("---|---") carries no data
+        table = getattr(self, '_current_table', None)
+        tail = table._tbl.getnext() if table is not None else None
+        if (table is None or table._tbl.getparent() is not doc.element.body
+                or not (tail is None or tail.tag.endswith('}sectPr'))):
+            # No table yet, or it is no longer the last block of this document
+            table = self._current_table = doc.add_table(rows=1, cols=len(parts))
+            table.style = 'Table Grid'
+            cells, is_header = table.rows[0].cells, True
+        else:
+            cells, is_header = table.add_row().cells, False
+        for cell, part in zip(cells, parts):  # values beyond the table width are dropped
+            cell.text = part
+            for paragraph in (cell.paragraphs if is_header else ()):
+                for run in paragraph.runs:
+                    run.bold = True  # header row only
--- a/test_document_processing.py
+++ b/test_document_processing.py
@ -170,10 +170,14 @@ async def process_documents_and_generate_summary():
        # Run a single end-to-end test to avoid the loop issue
        logger.info("🧪 Running single end-to-end test...")
        
+        # userPrompt = "Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations."
+
+        userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with a 'x' marker for all items, which are yellow highlighted."
+
        try:
            # Single AI call with DOCX generation
            ai_response = await ai_service.callAi(
-                prompt="Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations.",
+                prompt=userPrompt,
                documents=documents,
                options=ai_options,
                outputFormat="docx",