full dynamic document extraction - processing - generation

2025-10-11 18:51:23 +02:00 · 2025-10-11 18:51:23 +02:00 · 1aecec9d61
commit 1aecec9d61
parent be5f6773b6
5 changed files with 295 additions and 70 deletions
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@ -1515,68 +1515,70 @@ class AiService:
                title = "AI Generated Document"
            # Get format-specific extraction prompt
-            extraction_prompt = generation_service.getExtractionPrompt(
+            extractionPrompt = await generation_service.getExtractionPrompt(
-                output_format=outputFormat,
+                outputFormat=outputFormat,
-                user_prompt=prompt,
+                userPrompt=prompt,
-                title=title
+                title=title,
                aiService=self
            )
            # Process documents with format-specific prompt using CLEAN mode
            # This ensures no debug metadata is included in the final output
-            ai_response = await self._callAiTextClean(extraction_prompt, documents, options)
+            aiResponse = await self._callAiTextClean(extractionPrompt, documents, options)
            # Parse filename header from AI response if present
-            parsed_filename = None
+            parsedFilename = None
            try:
-                if ai_response:
+                if aiResponse:
-                    first_newline = ai_response.find('\n')
+                    firstNewline = aiResponse.find('\n')
-                    header_line = ai_response if first_newline == -1 else ai_response[:first_newline]
+                    headerLine = aiResponse if firstNewline == -1 else aiResponse[:firstNewline]
-                    if header_line.strip().lower().startswith('filename:'):
+                    if headerLine.strip().lower().startswith('filename:'):
-                        parsed = header_line.split(':', 1)[1].strip()
+                        parsed = headerLine.split(':', 1)[1].strip()
                        # basic sanitization
                        import re
                        parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed)
                        parsed = re.sub(r"-+", "-", parsed).strip('-')
                        if parsed:
-                            parsed_filename = parsed
+                            parsedFilename = parsed
                            # remove header line from content for rendering
-                            ai_response = ai_response[first_newline+1:].lstrip('\n') if first_newline != -1 else ''
+                            aiResponse = aiResponse[firstNewline+1:].lstrip('\n') if firstNewline != -1 else ''
            except Exception:
-                parsed_filename = None
+                parsedFilename = None
-            if not ai_response or ai_response.strip() == "":
+            if not aiResponse or aiResponse.strip() == "":
                raise Exception("AI content generation failed")
            # Render the content to the specified format
-            rendered_content, mime_type = await generation_service.renderReport(
+            renderedContent, mimeType = await generation_service.renderReport(
-                extracted_content=ai_response,
+                extractedContent=aiResponse,
-                output_format=outputFormat,
+                outputFormat=outputFormat,
                title=title,
-                user_prompt=prompt
+                userPrompt=prompt,
                aiService=self
            )
            # Generate meaningful filename (use AI-provided if valid, else fallback)
            from datetime import datetime, UTC
            timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
-            if parsed_filename and parsed_filename.lower().endswith(f".{outputFormat.lower()}"):
+            if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
-                filename = parsed_filename
+                filename = parsedFilename
            else:
-                safe_title = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
+                safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
-                filename = f"{safe_title or 'document'}-{timestamp}.{outputFormat}"
+                filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"
            # Return structured result with document information
            return {
                "success": True,
-                "content": ai_response,  # Raw AI response
+                "content": aiResponse,  # Raw AI response
-                "rendered_content": rendered_content,  # Formatted content
+                "rendered_content": renderedContent,  # Formatted content
-                "mime_type": mime_type,
+                "mime_type": mimeType,
                "filename": filename,
                "format": outputFormat,
                "title": title,
                "documents": [{
                    "documentName": filename,
-                    "documentData": rendered_content,
+                    "documentData": renderedContent,
-                    "mimeType": mime_type
+                    "mimeType": mimeType
                }]
            }
--- a/modules/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/services/serviceGeneration/mainServiceGeneration.py
@ -296,14 +296,16 @@ class GenerationService:
                'workflowId': 'unknown'
            }
-    async def renderReport(self, extracted_content: str, output_format: str, title: str, user_prompt: str = None) -> tuple[str, str]:
+    async def renderReport(self, extractedContent: str, outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
        """
        Render extracted content to the specified output format.
        Args:
-            extracted_content: Content extracted by AI using format-specific prompt
+            extractedContent: Content extracted by AI using format-specific prompt
-            output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+            outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
            title: Report title
            userPrompt: User's original prompt for report generation
            aiService: AI service instance for generation prompt creation
        Returns:
            tuple: (rendered_content, mime_type)
@ -317,66 +319,83 @@ class GenerationService:
                debug_dir = os.path.join(debug_root, f"render_input_{ts}")
                os.makedirs(debug_dir, exist_ok=True)
                with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
-                    f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
+                    f.write(f"title: {title}\nformat: {outputFormat}\nlength: {len(extractedContent or '')}\nstarts_with_brace: {str(extractedContent.strip().startswith('{') if extractedContent else False)}\n")
                with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
-                    f.write(extracted_content or "")
+                    f.write(extractedContent or "")
            except Exception:
                pass
            # Get the appropriate renderer for the format
-            renderer = self._getFormatRenderer(output_format)
+            renderer = self._getFormatRenderer(outputFormat)
            if not renderer:
-                raise ValueError(f"Unsupported output format: {output_format}")
+                raise ValueError(f"Unsupported output format: {outputFormat}")
-            # Render the content with user prompt for structure
+            # Generate AI-based generation prompt if AI service is available
-            rendered_content, mime_type = await renderer.render(extracted_content, title, user_prompt)
+            generationPrompt = userPrompt  # Default to user prompt
            if aiService and userPrompt:
                try:
                    from .prompt_builder import buildGenerationPrompt
                    generationPrompt = await buildGenerationPrompt(
                        outputFormat=outputFormat,
                        userPrompt=userPrompt,
                        title=title,
                        aiService=aiService
                    )
                except Exception as e:
                    logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt")
                    generationPrompt = userPrompt
            # Render the content with AI-generated prompt
            renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt)
            # DEBUG: dump rendered output
            try:
                import os
                with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
-                    f.write(rendered_content or "")
+                    f.write(renderedContent or "")
            except Exception:
                pass
-            logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
+            logger.info(f"Successfully rendered report to {outputFormat} format: {len(renderedContent)} characters")
-            return rendered_content, mime_type
+            return renderedContent, mimeType
        except Exception as e:
-            logger.error(f"Error rendering report to {output_format}: {str(e)}")
+            logger.error(f"Error rendering report to {outputFormat}: {str(e)}")
            raise
-    def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str:
+    async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
        """
        Get the format-specific extraction prompt for AI content extraction.
        Args:
-            output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+            outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
-            user_prompt: User's original prompt for report generation
+            userPrompt: User's original prompt for report generation
            title: Report title
            aiService: AI service instance for intent extraction
        Returns:
            str: Format-specific prompt for AI extraction
        """
        try:
            # Get the appropriate renderer for the format
-            renderer = self._getFormatRenderer(output_format)
+            renderer = self._getFormatRenderer(outputFormat)
            if not renderer:
-                raise ValueError(f"Unsupported output format: {output_format}")
+                raise ValueError(f"Unsupported output format: {outputFormat}")
            # Build centralized prompt with generic rules + format-specific guidelines
            from .prompt_builder import buildExtractionPrompt
-            extraction_prompt = buildExtractionPrompt(
+            extractionPrompt = await buildExtractionPrompt(
-                output_format=output_format,
+                outputFormat=outputFormat,
                renderer=renderer,
-                user_prompt=user_prompt,
+                userPrompt=userPrompt,
-                title=title
+                title=title,
                aiService=aiService
            )
-            logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters")
+            logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
-            return extraction_prompt
+            return extractionPrompt
        except Exception as e:
-            logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}")
+            logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
            raise
    def _getFormatRenderer(self, output_format: str):
--- a/modules/services/serviceGeneration/prompt_builder.py
+++ b/modules/services/serviceGeneration/prompt_builder.py
@ -16,15 +16,16 @@ class _RendererLike(Protocol):
        ...
-def buildExtractionPrompt(
+async def buildExtractionPrompt(
-    output_format: str,
+    outputFormat: str,
    renderer: _RendererLike,
-    user_prompt: str,
+    userPrompt: str,
-    title: str
+    title: str,
    aiService=None
 ) -> str:
    """
    Build the final extraction prompt by combining:
-    - The raw user prompt (verbatim)
+    - Parsed extraction intent from user prompt (using AI)
    - Generic cross-format instructions (filename header + real-data policy)
    - Format-specific guidelines snippet provided by the renderer
@ -33,13 +34,16 @@ def buildExtractionPrompt(
    followed by a blank line and then ONLY the document content according to the target format.
    """
-    format_guidelines = renderer.getExtractionPrompt(user_prompt, title)
+    # Parse user prompt to separate extraction intent from generation format using AI
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService)
    formatGuidelines = renderer.getExtractionPrompt(userPrompt, title)
    # Generic block appears once for every format
-    generic_intro = f"""
+    genericIntro = f"""
-{user_prompt}
+{extractionIntent}
-You are generating a document in {output_format.upper()} format for the title: "{title}".
+You are generating a document in {outputFormat.upper()} format for the title: "{title}".
 Rules:
 - The user's intent fully defines the structure. Do not assume a fixed template or headings.
@ -62,13 +66,99 @@ Common policy:
 """.strip()
    # Final assembly
-    final_prompt = (
+    finalPrompt = (
-        generic_intro
+        genericIntro
        + "\n\nFORMAT-SPECIFIC GUIDELINES:\n"
-        + format_guidelines.strip()
+        + formatGuidelines.strip()
        + "\n\nGenerate the complete document content now based on the source documents below:"
    )
-    return final_prompt
+    return finalPrompt
 async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None
 ) -> str:
    """
    Use AI to build the generation prompt based on user intent and format requirements.
    Focus on what's important for the user and how to structure the content.
    """
    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
    try:
        # Protect userPrompt from injection
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
 Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.
 User request: "{safeUserPrompt}"
 Document title: "{title}"
 Output format: {outputFormat}
 Create a generation prompt that:
 1. Identifies what content is most important for the user
 2. Specifies how to structure and organize the content. Support with your inputs fo rstructure to match best the user's intention.
 3. Includes any specific formatting or presentation requirements
 4. Ensures the document meets the user's needs
 Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
 """
        # Call AI service to generate the prompt
        result = await aiService.callAi(
            prompt=generationPromptRequest,
            documents=None,
            options=None
        )
        return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
    except Exception:
        # Fallback on any error
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
 async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None) -> str:
    """
    Use AI to extract the core content intention from the user prompt.
    Focus on WHAT the user wants to extract, not HOW to format it.
    """
    if not aiService:
        # Fallback if no AI service available
        return "Extract all relevant content from the document according to the user's requirements"
    try:
        # Protect userPrompt from injection by escaping quotes and newlines
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
        # Simple AI call to extract the intention
        extractionPrompt = f"""
 Extract the core content intention from this user request. Focus on WHAT content they want.
 User request: "{safeUserPrompt}"
 Return only the content intention in a simple format like "Extract: [content description]"
 Do not include formatting instructions, file types, or output methods.
 """
        # Call AI service to extract intention
        result = await aiService.callAi(
            prompt=extractionPrompt,
            documents=None,
            options=None
        )
        return result if result else "Extract all relevant content from the document according to the user's requirements"
    except Exception:
        # Fallback on any error
        return "Extract all relevant content from the document according to the user's requirements"
--- a/modules/services/serviceGeneration/renderers/docx_renderer.py
+++ b/modules/services/serviceGeneration/renderers/docx_renderer.py
@ -46,6 +46,7 @@ class DocxRenderer(BaseRenderer):
            "- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n"
            "- Use bullet points (-) for lists and sub-items\n"
            "- Use **bold** for emphasis on key terms\n"
            "- Use pipe-separated format (Item | Status) for tables when appropriate\n"
            "- Provide clean, structured content that can be directly converted to Word formatting\n"
            "- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n"
            "- Start directly with your content - no introductory text or separators\n"
@ -348,6 +349,40 @@ class DocxRenderer(BaseRenderer):
        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")
    def _process_table_row(self, doc, line: str):
        """Process a table row and add it to the document."""
        if not line.strip():
            return
        # Split by pipe separator
        parts = [part.strip() for part in line.split('|')]
        if len(parts) >= 2:
            # This is a table row - create a table if it doesn't exist
            if not hasattr(self, '_current_table') or self._current_table is None:
                # Create new table
                self._current_table = doc.add_table(rows=1, cols=len(parts))
                self._current_table.style = 'Table Grid'
                # Add header row
                for i, part in enumerate(parts):
                    if i < len(self._current_table.rows[0].cells):
                        cell = self._current_table.rows[0].cells[i]
                        cell.text = part
                        # Make header bold
                        for paragraph in cell.paragraphs:
                            for run in paragraph.runs:
                                run.bold = True
            else:
                # Add data row to existing table
                row = self._current_table.add_row()
                for i, part in enumerate(parts):
                    if i < len(row.cells):
                        row.cells[i].text = part
        else:
            # Not a table row, treat as regular text
            doc.add_paragraph(line)
    def _clean_ai_content(self, content: str) -> str:
        """Clean AI-generated content by removing debug information and duplicates."""
        if not content:
@ -435,6 +470,16 @@ class DocxRenderer(BaseRenderer):
                bullet_text = line[2:]  # Remove "- "
                self._add_bullet_point(doc, bullet_text)
            # Check if this is a table row (contains pipe separator)
            elif '|' in line:
                # Flush current paragraph
                if current_paragraph:
                    self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
                    current_paragraph = []
                # This is a table row - collect table data
                self._process_table_row(doc, line)
            else:
                # Regular text - add to current paragraph
                current_paragraph.append(line)
@ -462,9 +507,40 @@ class DocxRenderer(BaseRenderer):
                    if part:
                        run = para.add_run(part)
                        run.bold = True
    def _process_table_row(self, doc, line: str):
        """Process a table row and add it to the document."""
        if not line.strip():
            return
        # Split by pipe separator
        parts = [part.strip() for part in line.split('|')]
        if len(parts) >= 2:
            # This is a table row - create a table if it doesn't exist
            if not hasattr(self, '_current_table') or self._current_table is None:
                # Create new table
                self._current_table = doc.add_table(rows=1, cols=len(parts))
                self._current_table.style = 'Table Grid'
                # Add header row
                for i, part in enumerate(parts):
                    if i < len(self._current_table.rows[0].cells):
                        cell = self._current_table.rows[0].cells[i]
                        cell.text = part
                        # Make header bold
                        for paragraph in cell.paragraphs:
                            for run in paragraph.runs:
                                run.bold = True
            else:
                # Add data row to existing table
                row = self._current_table.add_row()
                for i, part in enumerate(parts):
                    if i < len(row.cells):
                        row.cells[i].text = part
        else:
-            # Regular paragraph
+            # Not a table row, treat as regular text
-            doc.add_paragraph(text)
+            doc.add_paragraph(line)
    def _add_bullet_point(self, doc, text: str):
        """Add a bullet point to the document."""
@ -488,4 +564,38 @@ class DocxRenderer(BaseRenderer):
                    # Bold text
                    if part:
                        run = para.add_run(part)
-                        run.bold = True
+                        run.bold = True
    def _process_table_row(self, doc, line: str):
        """Process a table row and add it to the document."""
        if not line.strip():
            return
        # Split by pipe separator
        parts = [part.strip() for part in line.split('|')]
        if len(parts) >= 2:
            # This is a table row - create a table if it doesn't exist
            if not hasattr(self, '_current_table') or self._current_table is None:
                # Create new table
                self._current_table = doc.add_table(rows=1, cols=len(parts))
                self._current_table.style = 'Table Grid'
                # Add header row
                for i, part in enumerate(parts):
                    if i < len(self._current_table.rows[0].cells):
                        cell = self._current_table.rows[0].cells[i]
                        cell.text = part
                        # Make header bold
                        for paragraph in cell.paragraphs:
                            for run in paragraph.runs:
                                run.bold = True
            else:
                # Add data row to existing table
                row = self._current_table.add_row()
                for i, part in enumerate(parts):
                    if i < len(row.cells):
                        row.cells[i].text = part
        else:
            # Not a table row, treat as regular text
            doc.add_paragraph(line)
--- a/test_document_processing.py
+++ b/test_document_processing.py
@ -170,10 +170,14 @@ async def process_documents_and_generate_summary():
        # Run a single end-to-end test to avoid the loop issue
        logger.info("🧪 Running single end-to-end test...")
        # userPrompt = "Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations."
        userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with a 'x' marker for all items, which are yellow highlighted."
        try:
            # Single AI call with DOCX generation
            ai_response = await ai_service.callAi(
-                prompt="Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations.",
+                prompt=userPrompt,
                documents=documents,
                options=ai_options,
                outputFormat="docx",