From 1aecec9d6146613e42462a57f3e821d93f8a97be Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sat, 11 Oct 2025 18:51:23 +0200
Subject: [PATCH] full dynamic document extraction - processing - generation
---
modules/services/serviceAi/mainServiceAi.py | 58 ++++-----
.../mainServiceGeneration.py | 69 +++++++----
.../serviceGeneration/prompt_builder.py | 116 ++++++++++++++++--
.../renderers/docx_renderer.py | 116 +++++++++++++++++-
test_document_processing.py | 6 +-
5 files changed, 295 insertions(+), 70 deletions(-)
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index 90c43273..3282d54f 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -1515,68 +1515,70 @@ class AiService:
title = "AI Generated Document"
# Get format-specific extraction prompt
- extraction_prompt = generation_service.getExtractionPrompt(
- output_format=outputFormat,
- user_prompt=prompt,
- title=title
+ extractionPrompt = await generation_service.getExtractionPrompt(
+ outputFormat=outputFormat,
+ userPrompt=prompt,
+ title=title,
+ aiService=self
)
# Process documents with format-specific prompt using CLEAN mode
# This ensures no debug metadata is included in the final output
- ai_response = await self._callAiTextClean(extraction_prompt, documents, options)
+ aiResponse = await self._callAiTextClean(extractionPrompt, documents, options)
# Parse filename header from AI response if present
- parsed_filename = None
+ parsedFilename = None
try:
- if ai_response:
- first_newline = ai_response.find('\n')
- header_line = ai_response if first_newline == -1 else ai_response[:first_newline]
- if header_line.strip().lower().startswith('filename:'):
- parsed = header_line.split(':', 1)[1].strip()
+ if aiResponse:
+ firstNewline = aiResponse.find('\n')
+ headerLine = aiResponse if firstNewline == -1 else aiResponse[:firstNewline]
+ if headerLine.strip().lower().startswith('filename:'):
+ parsed = headerLine.split(':', 1)[1].strip()
# basic sanitization
import re
parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed)
parsed = re.sub(r"-+", "-", parsed).strip('-')
if parsed:
- parsed_filename = parsed
+ parsedFilename = parsed
# remove header line from content for rendering
- ai_response = ai_response[first_newline+1:].lstrip('\n') if first_newline != -1 else ''
+ aiResponse = aiResponse[firstNewline+1:].lstrip('\n') if firstNewline != -1 else ''
except Exception:
- parsed_filename = None
+ parsedFilename = None
- if not ai_response or ai_response.strip() == "":
+ if not aiResponse or aiResponse.strip() == "":
raise Exception("AI content generation failed")
# Render the content to the specified format
- rendered_content, mime_type = await generation_service.renderReport(
- extracted_content=ai_response,
- output_format=outputFormat,
+ renderedContent, mimeType = await generation_service.renderReport(
+ extractedContent=aiResponse,
+ outputFormat=outputFormat,
title=title,
- user_prompt=prompt
+ userPrompt=prompt,
+ aiService=self
)
# Generate meaningful filename (use AI-provided if valid, else fallback)
from datetime import datetime, UTC
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
- if parsed_filename and parsed_filename.lower().endswith(f".{outputFormat.lower()}"):
- filename = parsed_filename
+ if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
+ filename = parsedFilename
else:
- safe_title = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
- filename = f"{safe_title or 'document'}-{timestamp}.{outputFormat}"
+ safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
+ filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"
# Return structured result with document information
return {
"success": True,
- "content": ai_response, # Raw AI response
- "rendered_content": rendered_content, # Formatted content
- "mime_type": mime_type,
+ "content": aiResponse, # Raw AI response
+ "rendered_content": renderedContent, # Formatted content
+ "mime_type": mimeType,
"filename": filename,
"format": outputFormat,
"title": title,
"documents": [{
"documentName": filename,
- "documentData": rendered_content,
- "mimeType": mime_type
+ "documentData": renderedContent,
+ "mimeType": mimeType
}]
}
diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py
index 9bdf050d..ddc4cc4e 100644
--- a/modules/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/services/serviceGeneration/mainServiceGeneration.py
@@ -296,14 +296,16 @@ class GenerationService:
'workflowId': 'unknown'
}
- async def renderReport(self, extracted_content: str, output_format: str, title: str, user_prompt: str = None) -> tuple[str, str]:
+ async def renderReport(self, extractedContent: str, outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
"""
Render extracted content to the specified output format.
Args:
- extracted_content: Content extracted by AI using format-specific prompt
- output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+ extractedContent: Content extracted by AI using format-specific prompt
+ outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
title: Report title
+ userPrompt: User's original prompt for report generation
+ aiService: AI service instance for generation prompt creation
Returns:
tuple: (rendered_content, mime_type)
@@ -317,66 +319,83 @@ class GenerationService:
debug_dir = os.path.join(debug_root, f"render_input_{ts}")
os.makedirs(debug_dir, exist_ok=True)
with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
- f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
+ f.write(f"title: {title}\nformat: {outputFormat}\nlength: {len(extractedContent or '')}\nstarts_with_brace: {str(extractedContent.strip().startswith('{') if extractedContent else False)}\n")
with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
- f.write(extracted_content or "")
+ f.write(extractedContent or "")
except Exception:
pass
# Get the appropriate renderer for the format
- renderer = self._getFormatRenderer(output_format)
+ renderer = self._getFormatRenderer(outputFormat)
if not renderer:
- raise ValueError(f"Unsupported output format: {output_format}")
+ raise ValueError(f"Unsupported output format: {outputFormat}")
- # Render the content with user prompt for structure
- rendered_content, mime_type = await renderer.render(extracted_content, title, user_prompt)
+ # Generate AI-based generation prompt if AI service is available
+ generationPrompt = userPrompt # Default to user prompt
+ if aiService and userPrompt:
+ try:
+ from .prompt_builder import buildGenerationPrompt
+ generationPrompt = await buildGenerationPrompt(
+ outputFormat=outputFormat,
+ userPrompt=userPrompt,
+ title=title,
+ aiService=aiService
+ )
+ except Exception as e:
+ logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt")
+ generationPrompt = userPrompt
+
+ # Render the content with AI-generated prompt
+ renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt)
# DEBUG: dump rendered output
try:
import os
with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
- f.write(rendered_content or "")
+ f.write(renderedContent or "")
except Exception:
pass
- logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
- return rendered_content, mime_type
+ logger.info(f"Successfully rendered report to {outputFormat} format: {len(renderedContent)} characters")
+ return renderedContent, mimeType
except Exception as e:
- logger.error(f"Error rendering report to {output_format}: {str(e)}")
+ logger.error(f"Error rendering report to {outputFormat}: {str(e)}")
raise
- def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str:
+ async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
"""
Get the format-specific extraction prompt for AI content extraction.
Args:
- output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
- user_prompt: User's original prompt for report generation
+ outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+ userPrompt: User's original prompt for report generation
title: Report title
+ aiService: AI service instance for intent extraction
Returns:
str: Format-specific prompt for AI extraction
"""
try:
# Get the appropriate renderer for the format
- renderer = self._getFormatRenderer(output_format)
+ renderer = self._getFormatRenderer(outputFormat)
if not renderer:
- raise ValueError(f"Unsupported output format: {output_format}")
+ raise ValueError(f"Unsupported output format: {outputFormat}")
# Build centralized prompt with generic rules + format-specific guidelines
from .prompt_builder import buildExtractionPrompt
- extraction_prompt = buildExtractionPrompt(
- output_format=output_format,
+ extractionPrompt = await buildExtractionPrompt(
+ outputFormat=outputFormat,
renderer=renderer,
- user_prompt=user_prompt,
- title=title
+ userPrompt=userPrompt,
+ title=title,
+ aiService=aiService
)
- logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters")
- return extraction_prompt
+ logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
+ return extractionPrompt
except Exception as e:
- logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}")
+ logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
raise
def _getFormatRenderer(self, output_format: str):
diff --git a/modules/services/serviceGeneration/prompt_builder.py b/modules/services/serviceGeneration/prompt_builder.py
index 89f6bfe9..1565e42a 100644
--- a/modules/services/serviceGeneration/prompt_builder.py
+++ b/modules/services/serviceGeneration/prompt_builder.py
@@ -16,15 +16,16 @@ class _RendererLike(Protocol):
...
-def buildExtractionPrompt(
- output_format: str,
+async def buildExtractionPrompt(
+ outputFormat: str,
renderer: _RendererLike,
- user_prompt: str,
- title: str
+ userPrompt: str,
+ title: str,
+ aiService=None
) -> str:
"""
Build the final extraction prompt by combining:
- - The raw user prompt (verbatim)
+ - Parsed extraction intent from user prompt (using AI)
- Generic cross-format instructions (filename header + real-data policy)
- Format-specific guidelines snippet provided by the renderer
@@ -33,13 +34,16 @@ def buildExtractionPrompt(
followed by a blank line and then ONLY the document content according to the target format.
"""
- format_guidelines = renderer.getExtractionPrompt(user_prompt, title)
+ # Parse user prompt to separate extraction intent from generation format using AI
+ extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService)
+
+ formatGuidelines = renderer.getExtractionPrompt(userPrompt, title)
# Generic block appears once for every format
- generic_intro = f"""
-{user_prompt}
+ genericIntro = f"""
+{extractionIntent}
-You are generating a document in {output_format.upper()} format for the title: "{title}".
+You are generating a document in {outputFormat.upper()} format for the title: "{title}".
Rules:
- The user's intent fully defines the structure. Do not assume a fixed template or headings.
@@ -62,13 +66,99 @@ Common policy:
""".strip()
# Final assembly
- final_prompt = (
- generic_intro
+ finalPrompt = (
+ genericIntro
+ "\n\nFORMAT-SPECIFIC GUIDELINES:\n"
- + format_guidelines.strip()
+ + formatGuidelines.strip()
+ "\n\nGenerate the complete document content now based on the source documents below:"
)
- return final_prompt
+ return finalPrompt
+
+
+async def buildGenerationPrompt(
+ outputFormat: str,
+ userPrompt: str,
+ title: str,
+ aiService=None
+) -> str:
+ """
+ Use AI to build the generation prompt based on user intent and format requirements.
+ Focus on what's important for the user and how to structure the content.
+ """
+ if not aiService:
+ # Fallback if no AI service available
+ return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
+
+ try:
+ # Protect userPrompt from injection
+ safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
+
+ # AI call to generate the appropriate generation prompt
+ generationPromptRequest = f"""
+Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.
+
+User request: "{safeUserPrompt}"
+Document title: "{title}"
+Output format: {outputFormat}
+
+Create a generation prompt that:
+1. Identifies what content is most important for the user
+2. Specifies how to structure and organize the content. Support with your own input on structure to best match the user's intention.
+3. Includes any specific formatting or presentation requirements
+4. Ensures the document meets the user's needs
+
+Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
+"""
+
+ # Call AI service to generate the prompt
+ result = await aiService.callAi(
+ prompt=generationPromptRequest,
+ documents=None,
+ options=None
+ )
+
+ return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
+
+ except Exception:
+ # Fallback on any error
+ return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
+
+
+async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None) -> str:
+ """
+ Use AI to extract the core content intention from the user prompt.
+ Focus on WHAT the user wants to extract, not HOW to format it.
+ """
+ if not aiService:
+ # Fallback if no AI service available
+ return "Extract all relevant content from the document according to the user's requirements"
+
+ try:
+ # Protect userPrompt from injection by escaping quotes and newlines
+ safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
+
+ # Simple AI call to extract the intention
+ extractionPrompt = f"""
+Extract the core content intention from this user request. Focus on WHAT content they want.
+
+User request: "{safeUserPrompt}"
+
+Return only the content intention in a simple format like "Extract: [content description]"
+Do not include formatting instructions, file types, or output methods.
+"""
+
+ # Call AI service to extract intention
+ result = await aiService.callAi(
+ prompt=extractionPrompt,
+ documents=None,
+ options=None
+ )
+
+ return result if result else "Extract all relevant content from the document according to the user's requirements"
+
+ except Exception:
+ # Fallback on any error
+ return "Extract all relevant content from the document according to the user's requirements"
diff --git a/modules/services/serviceGeneration/renderers/docx_renderer.py b/modules/services/serviceGeneration/renderers/docx_renderer.py
index c4919d42..450a1c72 100644
--- a/modules/services/serviceGeneration/renderers/docx_renderer.py
+++ b/modules/services/serviceGeneration/renderers/docx_renderer.py
@@ -46,6 +46,7 @@ class DocxRenderer(BaseRenderer):
"- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n"
"- Use bullet points (-) for lists and sub-items\n"
"- Use **bold** for emphasis on key terms\n"
+ "- Use pipe-separated format (Item | Status) for tables when appropriate\n"
"- Provide clean, structured content that can be directly converted to Word formatting\n"
"- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n"
"- Start directly with your content - no introductory text or separators\n"
@@ -348,6 +349,40 @@ class DocxRenderer(BaseRenderer):
except Exception as e:
self.logger.warning(f"Could not style table: {str(e)}")
+ def _process_table_row(self, doc, line: str):
+ """Process a table row and add it to the document."""
+ if not line.strip():
+ return
+
+ # Split by pipe separator
+ parts = [part.strip() for part in line.split('|')]
+
+ if len(parts) >= 2:
+ # This is a table row - create a table if it doesn't exist
+ if not hasattr(self, '_current_table') or self._current_table is None:
+ # Create new table
+ self._current_table = doc.add_table(rows=1, cols=len(parts))
+ self._current_table.style = 'Table Grid'
+
+ # Add header row
+ for i, part in enumerate(parts):
+ if i < len(self._current_table.rows[0].cells):
+ cell = self._current_table.rows[0].cells[i]
+ cell.text = part
+ # Make header bold
+ for paragraph in cell.paragraphs:
+ for run in paragraph.runs:
+ run.bold = True
+ else:
+ # Add data row to existing table
+ row = self._current_table.add_row()
+ for i, part in enumerate(parts):
+ if i < len(row.cells):
+ row.cells[i].text = part
+ else:
+ # Not a table row, treat as regular text
+ doc.add_paragraph(line)
+
def _clean_ai_content(self, content: str) -> str:
"""Clean AI-generated content by removing debug information and duplicates."""
if not content:
@@ -435,6 +470,16 @@ class DocxRenderer(BaseRenderer):
bullet_text = line[2:] # Remove "- "
self._add_bullet_point(doc, bullet_text)
+ # Check if this is a table row (contains pipe separator)
+ elif '|' in line:
+ # Flush current paragraph
+ if current_paragraph:
+ self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
+ current_paragraph = []
+
+ # This is a table row - collect table data
+ self._process_table_row(doc, line)
+
else:
# Regular text - add to current paragraph
current_paragraph.append(line)
@@ -462,9 +507,40 @@ class DocxRenderer(BaseRenderer):
if part:
run = para.add_run(part)
run.bold = True
+
+ def _process_table_row(self, doc, line: str):
+ """Process a table row and add it to the document."""
+ if not line.strip():
+ return
+
+ # Split by pipe separator
+ parts = [part.strip() for part in line.split('|')]
+
+ if len(parts) >= 2:
+ # This is a table row - create a table if it doesn't exist
+ if not hasattr(self, '_current_table') or self._current_table is None:
+ # Create new table
+ self._current_table = doc.add_table(rows=1, cols=len(parts))
+ self._current_table.style = 'Table Grid'
+
+ # Add header row
+ for i, part in enumerate(parts):
+ if i < len(self._current_table.rows[0].cells):
+ cell = self._current_table.rows[0].cells[i]
+ cell.text = part
+ # Make header bold
+ for paragraph in cell.paragraphs:
+ for run in paragraph.runs:
+ run.bold = True
+ else:
+ # Add data row to existing table
+ row = self._current_table.add_row()
+ for i, part in enumerate(parts):
+ if i < len(row.cells):
+ row.cells[i].text = part
else:
- # Regular paragraph
- doc.add_paragraph(text)
+ # Not a table row, treat as regular text
+ doc.add_paragraph(line)
def _add_bullet_point(self, doc, text: str):
"""Add a bullet point to the document."""
@@ -488,4 +564,38 @@ class DocxRenderer(BaseRenderer):
# Bold text
if part:
run = para.add_run(part)
- run.bold = True
\ No newline at end of file
+ run.bold = True
+
+ def _process_table_row(self, doc, line: str):
+ """Process a table row and add it to the document."""
+ if not line.strip():
+ return
+
+ # Split by pipe separator
+ parts = [part.strip() for part in line.split('|')]
+
+ if len(parts) >= 2:
+ # This is a table row - create a table if it doesn't exist
+ if not hasattr(self, '_current_table') or self._current_table is None:
+ # Create new table
+ self._current_table = doc.add_table(rows=1, cols=len(parts))
+ self._current_table.style = 'Table Grid'
+
+ # Add header row
+ for i, part in enumerate(parts):
+ if i < len(self._current_table.rows[0].cells):
+ cell = self._current_table.rows[0].cells[i]
+ cell.text = part
+ # Make header bold
+ for paragraph in cell.paragraphs:
+ for run in paragraph.runs:
+ run.bold = True
+ else:
+ # Add data row to existing table
+ row = self._current_table.add_row()
+ for i, part in enumerate(parts):
+ if i < len(row.cells):
+ row.cells[i].text = part
+ else:
+ # Not a table row, treat as regular text
+ doc.add_paragraph(line)
\ No newline at end of file
diff --git a/test_document_processing.py b/test_document_processing.py
index fe16967d..bafc05c0 100644
--- a/test_document_processing.py
+++ b/test_document_processing.py
@@ -170,10 +170,14 @@ async def process_documents_and_generate_summary():
# Run a single end-to-end test to avoid the loop issue
logger.info("🧪 Running single end-to-end test...")
+ # userPrompt = "Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations."
+
+    userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with an 'x' marker for all items, which are yellow highlighted."
+
try:
# Single AI call with DOCX generation
ai_response = await ai_service.callAi(
- prompt="Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations.",
+ prompt=userPrompt,
documents=documents,
options=ai_options,
outputFormat="docx",