full dynamic document extraction - processing - generation

This commit is contained in:
ValueOn AG 2025-10-11 18:51:23 +02:00
parent be5f6773b6
commit 1aecec9d61
5 changed files with 295 additions and 70 deletions

View file

@ -1515,68 +1515,70 @@ class AiService:
title = "AI Generated Document" title = "AI Generated Document"
# Get format-specific extraction prompt # Get format-specific extraction prompt
extraction_prompt = generation_service.getExtractionPrompt( extractionPrompt = await generation_service.getExtractionPrompt(
output_format=outputFormat, outputFormat=outputFormat,
user_prompt=prompt, userPrompt=prompt,
title=title title=title,
aiService=self
) )
# Process documents with format-specific prompt using CLEAN mode # Process documents with format-specific prompt using CLEAN mode
# This ensures no debug metadata is included in the final output # This ensures no debug metadata is included in the final output
ai_response = await self._callAiTextClean(extraction_prompt, documents, options) aiResponse = await self._callAiTextClean(extractionPrompt, documents, options)
# Parse filename header from AI response if present # Parse filename header from AI response if present
parsed_filename = None parsedFilename = None
try: try:
if ai_response: if aiResponse:
first_newline = ai_response.find('\n') firstNewline = aiResponse.find('\n')
header_line = ai_response if first_newline == -1 else ai_response[:first_newline] headerLine = aiResponse if firstNewline == -1 else aiResponse[:firstNewline]
if header_line.strip().lower().startswith('filename:'): if headerLine.strip().lower().startswith('filename:'):
parsed = header_line.split(':', 1)[1].strip() parsed = headerLine.split(':', 1)[1].strip()
# basic sanitization # basic sanitization
import re import re
parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed) parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed)
parsed = re.sub(r"-+", "-", parsed).strip('-') parsed = re.sub(r"-+", "-", parsed).strip('-')
if parsed: if parsed:
parsed_filename = parsed parsedFilename = parsed
# remove header line from content for rendering # remove header line from content for rendering
ai_response = ai_response[first_newline+1:].lstrip('\n') if first_newline != -1 else '' aiResponse = aiResponse[firstNewline+1:].lstrip('\n') if firstNewline != -1 else ''
except Exception: except Exception:
parsed_filename = None parsedFilename = None
if not ai_response or ai_response.strip() == "": if not aiResponse or aiResponse.strip() == "":
raise Exception("AI content generation failed") raise Exception("AI content generation failed")
# Render the content to the specified format # Render the content to the specified format
rendered_content, mime_type = await generation_service.renderReport( renderedContent, mimeType = await generation_service.renderReport(
extracted_content=ai_response, extractedContent=aiResponse,
output_format=outputFormat, outputFormat=outputFormat,
title=title, title=title,
user_prompt=prompt userPrompt=prompt,
aiService=self
) )
# Generate meaningful filename (use AI-provided if valid, else fallback) # Generate meaningful filename (use AI-provided if valid, else fallback)
from datetime import datetime, UTC from datetime import datetime, UTC
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
if parsed_filename and parsed_filename.lower().endswith(f".{outputFormat.lower()}"): if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
filename = parsed_filename filename = parsedFilename
else: else:
safe_title = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-') safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
filename = f"{safe_title or 'document'}-{timestamp}.{outputFormat}" filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"
# Return structured result with document information # Return structured result with document information
return { return {
"success": True, "success": True,
"content": ai_response, # Raw AI response "content": aiResponse, # Raw AI response
"rendered_content": rendered_content, # Formatted content "rendered_content": renderedContent, # Formatted content
"mime_type": mime_type, "mime_type": mimeType,
"filename": filename, "filename": filename,
"format": outputFormat, "format": outputFormat,
"title": title, "title": title,
"documents": [{ "documents": [{
"documentName": filename, "documentName": filename,
"documentData": rendered_content, "documentData": renderedContent,
"mimeType": mime_type "mimeType": mimeType
}] }]
} }

View file

@ -296,14 +296,16 @@ class GenerationService:
'workflowId': 'unknown' 'workflowId': 'unknown'
} }
async def renderReport(self, extracted_content: str, output_format: str, title: str, user_prompt: str = None) -> tuple[str, str]: async def renderReport(self, extractedContent: str, outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
""" """
Render extracted content to the specified output format. Render extracted content to the specified output format.
Args: Args:
extracted_content: Content extracted by AI using format-specific prompt extractedContent: Content extracted by AI using format-specific prompt
output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx) outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
title: Report title title: Report title
userPrompt: User's original prompt for report generation
aiService: AI service instance for generation prompt creation
Returns: Returns:
tuple: (rendered_content, mime_type) tuple: (rendered_content, mime_type)
@ -317,66 +319,83 @@ class GenerationService:
debug_dir = os.path.join(debug_root, f"render_input_{ts}") debug_dir = os.path.join(debug_root, f"render_input_{ts}")
os.makedirs(debug_dir, exist_ok=True) os.makedirs(debug_dir, exist_ok=True)
with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f: with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n") f.write(f"title: {title}\nformat: {outputFormat}\nlength: {len(extractedContent or '')}\nstarts_with_brace: {str(extractedContent.strip().startswith('{') if extractedContent else False)}\n")
with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f: with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
f.write(extracted_content or "") f.write(extractedContent or "")
except Exception: except Exception:
pass pass
# Get the appropriate renderer for the format # Get the appropriate renderer for the format
renderer = self._getFormatRenderer(output_format) renderer = self._getFormatRenderer(outputFormat)
if not renderer: if not renderer:
raise ValueError(f"Unsupported output format: {output_format}") raise ValueError(f"Unsupported output format: {outputFormat}")
# Render the content with user prompt for structure # Generate AI-based generation prompt if AI service is available
rendered_content, mime_type = await renderer.render(extracted_content, title, user_prompt) generationPrompt = userPrompt # Default to user prompt
if aiService and userPrompt:
try:
from .prompt_builder import buildGenerationPrompt
generationPrompt = await buildGenerationPrompt(
outputFormat=outputFormat,
userPrompt=userPrompt,
title=title,
aiService=aiService
)
except Exception as e:
logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt")
generationPrompt = userPrompt
# Render the content with AI-generated prompt
renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt)
# DEBUG: dump rendered output # DEBUG: dump rendered output
try: try:
import os import os
with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f: with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
f.write(rendered_content or "") f.write(renderedContent or "")
except Exception: except Exception:
pass pass
logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters") logger.info(f"Successfully rendered report to {outputFormat} format: {len(renderedContent)} characters")
return rendered_content, mime_type return renderedContent, mimeType
except Exception as e: except Exception as e:
logger.error(f"Error rendering report to {output_format}: {str(e)}") logger.error(f"Error rendering report to {outputFormat}: {str(e)}")
raise raise
def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str: async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
""" """
Get the format-specific extraction prompt for AI content extraction. Get the format-specific extraction prompt for AI content extraction.
Args: Args:
output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx) outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
user_prompt: User's original prompt for report generation userPrompt: User's original prompt for report generation
title: Report title title: Report title
aiService: AI service instance for intent extraction
Returns: Returns:
str: Format-specific prompt for AI extraction str: Format-specific prompt for AI extraction
""" """
try: try:
# Get the appropriate renderer for the format # Get the appropriate renderer for the format
renderer = self._getFormatRenderer(output_format) renderer = self._getFormatRenderer(outputFormat)
if not renderer: if not renderer:
raise ValueError(f"Unsupported output format: {output_format}") raise ValueError(f"Unsupported output format: {outputFormat}")
# Build centralized prompt with generic rules + format-specific guidelines # Build centralized prompt with generic rules + format-specific guidelines
from .prompt_builder import buildExtractionPrompt from .prompt_builder import buildExtractionPrompt
extraction_prompt = buildExtractionPrompt( extractionPrompt = await buildExtractionPrompt(
output_format=output_format, outputFormat=outputFormat,
renderer=renderer, renderer=renderer,
user_prompt=user_prompt, userPrompt=userPrompt,
title=title title=title,
aiService=aiService
) )
logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters") logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
return extraction_prompt return extractionPrompt
except Exception as e: except Exception as e:
logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}") logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
raise raise
def _getFormatRenderer(self, output_format: str): def _getFormatRenderer(self, output_format: str):

View file

@ -16,15 +16,16 @@ class _RendererLike(Protocol):
... ...
def buildExtractionPrompt( async def buildExtractionPrompt(
output_format: str, outputFormat: str,
renderer: _RendererLike, renderer: _RendererLike,
user_prompt: str, userPrompt: str,
title: str title: str,
aiService=None
) -> str: ) -> str:
""" """
Build the final extraction prompt by combining: Build the final extraction prompt by combining:
- The raw user prompt (verbatim) - Parsed extraction intent from user prompt (using AI)
- Generic cross-format instructions (filename header + real-data policy) - Generic cross-format instructions (filename header + real-data policy)
- Format-specific guidelines snippet provided by the renderer - Format-specific guidelines snippet provided by the renderer
@ -33,13 +34,16 @@ def buildExtractionPrompt(
followed by a blank line and then ONLY the document content according to the target format. followed by a blank line and then ONLY the document content according to the target format.
""" """
format_guidelines = renderer.getExtractionPrompt(user_prompt, title) # Parse user prompt to separate extraction intent from generation format using AI
extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService)
formatGuidelines = renderer.getExtractionPrompt(userPrompt, title)
# Generic block appears once for every format # Generic block appears once for every format
generic_intro = f""" genericIntro = f"""
{user_prompt} {extractionIntent}
You are generating a document in {output_format.upper()} format for the title: "{title}". You are generating a document in {outputFormat.upper()} format for the title: "{title}".
Rules: Rules:
- The user's intent fully defines the structure. Do not assume a fixed template or headings. - The user's intent fully defines the structure. Do not assume a fixed template or headings.
@ -62,13 +66,99 @@ Common policy:
""".strip() """.strip()
# Final assembly # Final assembly
final_prompt = ( finalPrompt = (
generic_intro genericIntro
+ "\n\nFORMAT-SPECIFIC GUIDELINES:\n" + "\n\nFORMAT-SPECIFIC GUIDELINES:\n"
+ format_guidelines.strip() + formatGuidelines.strip()
+ "\n\nGenerate the complete document content now based on the source documents below:" + "\n\nGenerate the complete document content now based on the source documents below:"
) )
return final_prompt return finalPrompt
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """
    Ask the AI service to write a generation prompt for the target document.

    The user's request is flattened to a single line, embedded in a
    meta-prompt and sent through ``aiService.callAi``. Whatever the service
    returns becomes the generation prompt.

    Args:
        outputFormat: Target format name; interpolated verbatim into the prompts.
        userPrompt: User's original request.
        title: Document title; interpolated verbatim into the prompts.
        aiService: Object exposing an awaitable
            ``callAi(prompt=..., documents=..., options=...)``.

    Returns:
        The AI-produced generation prompt, or a generic
        "Generate a comprehensive ..." prompt when there is no AI service,
        no user prompt, the call fails, or the reply is empty/blank.
    """
    import logging

    # Single source of truth for the fallback text (was repeated three times).
    fallbackPrompt = f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    # Nothing to ask the AI about: skip the call instead of relying on an
    # AttributeError from None.replace(...) to reach the fallback.
    if not aiService or not userPrompt:
        return fallbackPrompt

    try:
        # Flatten the request so it stays inside the quoted "User request" line.
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.
User request: "{safeUserPrompt}"
Document title: "{title}"
Output format: {outputFormat}
Create a generation prompt that:
1. Identifies what content is most important for the user
2. Specifies how to structure and organize the content. Support with your inputs fo rstructure to match best the user's intention.
3. Includes any specific formatting or presentation requirements
4. Ensures the document meets the user's needs
Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
"""
        result = await aiService.callAi(
            prompt=generationPromptRequest,
            documents=None,
            options=None
        )
    except Exception as e:
        # Best-effort by design, but leave a trace so failures are diagnosable.
        logging.getLogger(__name__).warning(
            f"Generation prompt creation failed, using fallback prompt: {str(e)}"
        )
        return fallbackPrompt

    # A blank reply is as useless as no reply: do not hand it to the renderer.
    if not result or (isinstance(result, str) and not result.strip()):
        return fallbackPrompt
    return result
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None) -> str:
"""
Use AI to extract the core content intention from the user prompt.
Focus on WHAT the user wants to extract, not HOW to format it.
"""
if not aiService:
# Fallback if no AI service available
return "Extract all relevant content from the document according to the user's requirements"
try:
# Protect userPrompt from injection by escaping quotes and newlines
safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
# Simple AI call to extract the intention
extractionPrompt = f"""
Extract the core content intention from this user request. Focus on WHAT content they want.
User request: "{safeUserPrompt}"
Return only the content intention in a simple format like "Extract: [content description]"
Do not include formatting instructions, file types, or output methods.
"""
# Call AI service to extract intention
result = await aiService.callAi(
prompt=extractionPrompt,
documents=None,
options=None
)
return result if result else "Extract all relevant content from the document according to the user's requirements"
except Exception:
# Fallback on any error
return "Extract all relevant content from the document according to the user's requirements"

View file

@ -46,6 +46,7 @@ class DocxRenderer(BaseRenderer):
"- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n" "- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n"
"- Use bullet points (-) for lists and sub-items\n" "- Use bullet points (-) for lists and sub-items\n"
"- Use **bold** for emphasis on key terms\n" "- Use **bold** for emphasis on key terms\n"
"- Use pipe-separated format (Item | Status) for tables when appropriate\n"
"- Provide clean, structured content that can be directly converted to Word formatting\n" "- Provide clean, structured content that can be directly converted to Word formatting\n"
"- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n" "- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n"
"- Start directly with your content - no introductory text or separators\n" "- Start directly with your content - no introductory text or separators\n"
@ -348,6 +349,40 @@ class DocxRenderer(BaseRenderer):
except Exception as e: except Exception as e:
self.logger.warning(f"Could not style table: {str(e)}") self.logger.warning(f"Could not style table: {str(e)}")
def _process_table_row(self, doc, line: str):
"""Process a table row and add it to the document."""
if not line.strip():
return
# Split by pipe separator
parts = [part.strip() for part in line.split('|')]
if len(parts) >= 2:
# This is a table row - create a table if it doesn't exist
if not hasattr(self, '_current_table') or self._current_table is None:
# Create new table
self._current_table = doc.add_table(rows=1, cols=len(parts))
self._current_table.style = 'Table Grid'
# Add header row
for i, part in enumerate(parts):
if i < len(self._current_table.rows[0].cells):
cell = self._current_table.rows[0].cells[i]
cell.text = part
# Make header bold
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
else:
# Add data row to existing table
row = self._current_table.add_row()
for i, part in enumerate(parts):
if i < len(row.cells):
row.cells[i].text = part
else:
# Not a table row, treat as regular text
doc.add_paragraph(line)
def _clean_ai_content(self, content: str) -> str: def _clean_ai_content(self, content: str) -> str:
"""Clean AI-generated content by removing debug information and duplicates.""" """Clean AI-generated content by removing debug information and duplicates."""
if not content: if not content:
@ -435,6 +470,16 @@ class DocxRenderer(BaseRenderer):
bullet_text = line[2:] # Remove "- " bullet_text = line[2:] # Remove "- "
self._add_bullet_point(doc, bullet_text) self._add_bullet_point(doc, bullet_text)
# Check if this is a table row (contains pipe separator)
elif '|' in line:
# Flush current paragraph
if current_paragraph:
self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
current_paragraph = []
# This is a table row - collect table data
self._process_table_row(doc, line)
else: else:
# Regular text - add to current paragraph # Regular text - add to current paragraph
current_paragraph.append(line) current_paragraph.append(line)
@ -462,9 +507,40 @@ class DocxRenderer(BaseRenderer):
if part: if part:
run = para.add_run(part) run = para.add_run(part)
run.bold = True run.bold = True
def _process_table_row(self, doc, line: str):
"""Process a table row and add it to the document."""
if not line.strip():
return
# Split by pipe separator
parts = [part.strip() for part in line.split('|')]
if len(parts) >= 2:
# This is a table row - create a table if it doesn't exist
if not hasattr(self, '_current_table') or self._current_table is None:
# Create new table
self._current_table = doc.add_table(rows=1, cols=len(parts))
self._current_table.style = 'Table Grid'
# Add header row
for i, part in enumerate(parts):
if i < len(self._current_table.rows[0].cells):
cell = self._current_table.rows[0].cells[i]
cell.text = part
# Make header bold
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
else:
# Add data row to existing table
row = self._current_table.add_row()
for i, part in enumerate(parts):
if i < len(row.cells):
row.cells[i].text = part
else: else:
# Regular paragraph # Not a table row, treat as regular text
doc.add_paragraph(text) doc.add_paragraph(line)
def _add_bullet_point(self, doc, text: str): def _add_bullet_point(self, doc, text: str):
"""Add a bullet point to the document.""" """Add a bullet point to the document."""
@ -488,4 +564,38 @@ class DocxRenderer(BaseRenderer):
# Bold text # Bold text
if part: if part:
run = para.add_run(part) run = para.add_run(part)
run.bold = True run.bold = True
def _process_table_row(self, doc, line: str):
"""Process a table row and add it to the document."""
if not line.strip():
return
# Split by pipe separator
parts = [part.strip() for part in line.split('|')]
if len(parts) >= 2:
# This is a table row - create a table if it doesn't exist
if not hasattr(self, '_current_table') or self._current_table is None:
# Create new table
self._current_table = doc.add_table(rows=1, cols=len(parts))
self._current_table.style = 'Table Grid'
# Add header row
for i, part in enumerate(parts):
if i < len(self._current_table.rows[0].cells):
cell = self._current_table.rows[0].cells[i]
cell.text = part
# Make header bold
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
else:
# Add data row to existing table
row = self._current_table.add_row()
for i, part in enumerate(parts):
if i < len(row.cells):
row.cells[i].text = part
else:
# Not a table row, treat as regular text
doc.add_paragraph(line)

View file

@ -170,10 +170,14 @@ async def process_documents_and_generate_summary():
# Run a single end-to-end test to avoid the loop issue # Run a single end-to-end test to avoid the loop issue
logger.info("🧪 Running single end-to-end test...") logger.info("🧪 Running single end-to-end test...")
# userPrompt = "Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations."
userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with a 'x' marker for all items, which are yellow highlighted."
try: try:
# Single AI call with DOCX generation # Single AI call with DOCX generation
ai_response = await ai_service.callAi( ai_response = await ai_service.callAi(
prompt="Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations.", prompt=userPrompt,
documents=documents, documents=documents,
options=ai_options, options=ai_options,
outputFormat="docx", outputFormat="docx",