full dynamic document extraction - processing - generation

This commit is contained in:
ValueOn AG 2025-10-11 18:51:23 +02:00
parent be5f6773b6
commit 1aecec9d61
5 changed files with 295 additions and 70 deletions

View file

@ -1515,68 +1515,70 @@ class AiService:
title = "AI Generated Document"
# Get format-specific extraction prompt
extraction_prompt = generation_service.getExtractionPrompt(
output_format=outputFormat,
user_prompt=prompt,
title=title
extractionPrompt = await generation_service.getExtractionPrompt(
outputFormat=outputFormat,
userPrompt=prompt,
title=title,
aiService=self
)
# Process documents with format-specific prompt using CLEAN mode
# This ensures no debug metadata is included in the final output
ai_response = await self._callAiTextClean(extraction_prompt, documents, options)
aiResponse = await self._callAiTextClean(extractionPrompt, documents, options)
# Parse filename header from AI response if present
parsed_filename = None
parsedFilename = None
try:
if ai_response:
first_newline = ai_response.find('\n')
header_line = ai_response if first_newline == -1 else ai_response[:first_newline]
if header_line.strip().lower().startswith('filename:'):
parsed = header_line.split(':', 1)[1].strip()
if aiResponse:
firstNewline = aiResponse.find('\n')
headerLine = aiResponse if firstNewline == -1 else aiResponse[:firstNewline]
if headerLine.strip().lower().startswith('filename:'):
parsed = headerLine.split(':', 1)[1].strip()
# basic sanitization
import re
parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed)
parsed = re.sub(r"-+", "-", parsed).strip('-')
if parsed:
parsed_filename = parsed
parsedFilename = parsed
# remove header line from content for rendering
ai_response = ai_response[first_newline+1:].lstrip('\n') if first_newline != -1 else ''
aiResponse = aiResponse[firstNewline+1:].lstrip('\n') if firstNewline != -1 else ''
except Exception:
parsed_filename = None
parsedFilename = None
if not ai_response or ai_response.strip() == "":
if not aiResponse or aiResponse.strip() == "":
raise Exception("AI content generation failed")
# Render the content to the specified format
rendered_content, mime_type = await generation_service.renderReport(
extracted_content=ai_response,
output_format=outputFormat,
renderedContent, mimeType = await generation_service.renderReport(
extractedContent=aiResponse,
outputFormat=outputFormat,
title=title,
user_prompt=prompt
userPrompt=prompt,
aiService=self
)
# Generate meaningful filename (use AI-provided if valid, else fallback)
from datetime import datetime, UTC
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
if parsed_filename and parsed_filename.lower().endswith(f".{outputFormat.lower()}"):
filename = parsed_filename
if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
filename = parsedFilename
else:
safe_title = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
filename = f"{safe_title or 'document'}-{timestamp}.{outputFormat}"
safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"
# Return structured result with document information
return {
"success": True,
"content": ai_response, # Raw AI response
"rendered_content": rendered_content, # Formatted content
"mime_type": mime_type,
"content": aiResponse, # Raw AI response
"rendered_content": renderedContent, # Formatted content
"mime_type": mimeType,
"filename": filename,
"format": outputFormat,
"title": title,
"documents": [{
"documentName": filename,
"documentData": rendered_content,
"mimeType": mime_type
"documentData": renderedContent,
"mimeType": mimeType
}]
}

View file

@ -296,14 +296,16 @@ class GenerationService:
'workflowId': 'unknown'
}
async def renderReport(self, extracted_content: str, output_format: str, title: str, user_prompt: str = None) -> tuple[str, str]:
async def renderReport(self, extractedContent: str, outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
"""
Render extracted content to the specified output format.
Args:
extracted_content: Content extracted by AI using format-specific prompt
output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
extractedContent: Content extracted by AI using format-specific prompt
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
title: Report title
userPrompt: User's original prompt for report generation
aiService: AI service instance for generation prompt creation
Returns:
tuple: (rendered_content, mime_type)
@ -317,66 +319,83 @@ class GenerationService:
debug_dir = os.path.join(debug_root, f"render_input_{ts}")
os.makedirs(debug_dir, exist_ok=True)
with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
f.write(f"title: {title}\nformat: {outputFormat}\nlength: {len(extractedContent or '')}\nstarts_with_brace: {str(extractedContent.strip().startswith('{') if extractedContent else False)}\n")
with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
f.write(extracted_content or "")
f.write(extractedContent or "")
except Exception:
pass
# Get the appropriate renderer for the format
renderer = self._getFormatRenderer(output_format)
renderer = self._getFormatRenderer(outputFormat)
if not renderer:
raise ValueError(f"Unsupported output format: {output_format}")
raise ValueError(f"Unsupported output format: {outputFormat}")
# Render the content with user prompt for structure
rendered_content, mime_type = await renderer.render(extracted_content, title, user_prompt)
# Generate AI-based generation prompt if AI service is available
generationPrompt = userPrompt # Default to user prompt
if aiService and userPrompt:
try:
from .prompt_builder import buildGenerationPrompt
generationPrompt = await buildGenerationPrompt(
outputFormat=outputFormat,
userPrompt=userPrompt,
title=title,
aiService=aiService
)
except Exception as e:
logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt")
generationPrompt = userPrompt
# Render the content with AI-generated prompt
renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt)
# DEBUG: dump rendered output
try:
import os
with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
f.write(rendered_content or "")
f.write(renderedContent or "")
except Exception:
pass
logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
return rendered_content, mime_type
logger.info(f"Successfully rendered report to {outputFormat} format: {len(renderedContent)} characters")
return renderedContent, mimeType
except Exception as e:
logger.error(f"Error rendering report to {output_format}: {str(e)}")
logger.error(f"Error rendering report to {outputFormat}: {str(e)}")
raise
def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str:
async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
"""
Get the format-specific extraction prompt for AI content extraction.
Args:
output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
user_prompt: User's original prompt for report generation
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
userPrompt: User's original prompt for report generation
title: Report title
aiService: AI service instance for intent extraction
Returns:
str: Format-specific prompt for AI extraction
"""
try:
# Get the appropriate renderer for the format
renderer = self._getFormatRenderer(output_format)
renderer = self._getFormatRenderer(outputFormat)
if not renderer:
raise ValueError(f"Unsupported output format: {output_format}")
raise ValueError(f"Unsupported output format: {outputFormat}")
# Build centralized prompt with generic rules + format-specific guidelines
from .prompt_builder import buildExtractionPrompt
extraction_prompt = buildExtractionPrompt(
output_format=output_format,
extractionPrompt = await buildExtractionPrompt(
outputFormat=outputFormat,
renderer=renderer,
user_prompt=user_prompt,
title=title
userPrompt=userPrompt,
title=title,
aiService=aiService
)
logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters")
return extraction_prompt
logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
return extractionPrompt
except Exception as e:
logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}")
logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
raise
def _getFormatRenderer(self, output_format: str):

View file

@ -16,15 +16,16 @@ class _RendererLike(Protocol):
...
def buildExtractionPrompt(
output_format: str,
async def buildExtractionPrompt(
outputFormat: str,
renderer: _RendererLike,
user_prompt: str,
title: str
userPrompt: str,
title: str,
aiService=None
) -> str:
"""
Build the final extraction prompt by combining:
- The raw user prompt (verbatim)
- Parsed extraction intent from user prompt (using AI)
- Generic cross-format instructions (filename header + real-data policy)
- Format-specific guidelines snippet provided by the renderer
@ -33,13 +34,16 @@ def buildExtractionPrompt(
followed by a blank line and then ONLY the document content according to the target format.
"""
format_guidelines = renderer.getExtractionPrompt(user_prompt, title)
# Parse user prompt to separate extraction intent from generation format using AI
extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService)
formatGuidelines = renderer.getExtractionPrompt(userPrompt, title)
# Generic block appears once for every format
generic_intro = f"""
{user_prompt}
genericIntro = f"""
{extractionIntent}
You are generating a document in {output_format.upper()} format for the title: "{title}".
You are generating a document in {outputFormat.upper()} format for the title: "{title}".
Rules:
- The user's intent fully defines the structure. Do not assume a fixed template or headings.
@ -62,13 +66,99 @@ Common policy:
""".strip()
# Final assembly
final_prompt = (
generic_intro
finalPrompt = (
genericIntro
+ "\n\nFORMAT-SPECIFIC GUIDELINES:\n"
+ format_guidelines.strip()
+ formatGuidelines.strip()
+ "\n\nGenerate the complete document content now based on the source documents below:"
)
return final_prompt
return finalPrompt
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """
    Use AI to build the generation prompt based on user intent and format requirements.

    The AI is asked to turn the user's request into a prompt describing what
    content matters and how it should be structured. A generic fallback prompt
    is returned whenever no usable AI answer is available.

    Args:
        outputFormat: Target format name, interpolated into the prompt text.
        userPrompt: User's original request.
        title: Document title.
        aiService: Object exposing an awaitable ``callAi(prompt=, documents=, options=)``;
            when falsy, no AI call is made.

    Returns:
        str: The AI-written generation prompt (stripped), or the fallback prompt
        when there is no AI service, no user prompt, the call fails, or the
        answer is blank / not a string.
    """
    import logging

    # Single definition of the fallback so every exit path returns the same text.
    fallbackPrompt = f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    if not aiService:
        # Fallback if no AI service available
        return fallbackPrompt
    if not isinstance(userPrompt, str) or not userPrompt.strip():
        # Nothing to derive a prompt from; skip the AI round-trip.
        return fallbackPrompt

    def _flatten(text: str) -> str:
        # Escape quotes and collapse line breaks so the value stays inside its
        # quoted slot in the request below.
        return text.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

    try:
        # Protect userPrompt and title from breaking out of the quoted fields
        safeUserPrompt = _flatten(userPrompt)
        safeTitle = _flatten(str(title or ""))

        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.
User request: "{safeUserPrompt}"
Document title: "{safeTitle}"
Output format: {outputFormat}
Create a generation prompt that:
1. Identifies what content is most important for the user
2. Specifies how to structure and organize the content. Support with your inputs for structure to match best the user's intention.
3. Includes any specific formatting or presentation requirements
4. Ensures the document meets the user's needs
Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
"""

        # Call AI service to generate the prompt
        result = await aiService.callAi(
            prompt=generationPromptRequest,
            documents=None,
            options=None
        )
    except Exception as e:
        # Best-effort: keep the fallback, but leave a trace of why it was used.
        logging.getLogger(__name__).warning(
            "Failed to build AI generation prompt, using fallback: %s", e
        )
        return fallbackPrompt

    # Only a non-blank string is a usable prompt (the declared return type is str).
    if isinstance(result, str) and result.strip():
        return result.strip()
    return fallbackPrompt
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None) -> str:
    """
    Use AI to extract the core content intention from the user prompt.
    Focus on WHAT the user wants to extract, not HOW to format it.

    Args:
        userPrompt: User's original request.
        outputFormat: Target format name; accepted for signature compatibility
            and not used when building the request.
        aiService: Object exposing an awaitable ``callAi(prompt=, documents=, options=)``;
            when falsy, no AI call is made.

    Returns:
        str: The AI-extracted intention (stripped), or a generic fallback
        sentence when there is no AI service, no user prompt, the call fails,
        or the answer is blank / not a string.
    """
    import logging

    fallbackIntent = "Extract all relevant content from the document according to the user's requirements"

    if not aiService:
        # Fallback if no AI service available
        return fallbackIntent
    if not isinstance(userPrompt, str) or not userPrompt.strip():
        # No request text to analyse; avoid a pointless AI call.
        return fallbackIntent

    try:
        # Protect userPrompt from injection by escaping quotes and newlines
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # Simple AI call to extract the intention
        extractionPrompt = f"""
Extract the core content intention from this user request. Focus on WHAT content they want.
User request: "{safeUserPrompt}"
Return only the content intention in a simple format like "Extract: [content description]"
Do not include formatting instructions, file types, or output methods.
"""

        # Call AI service to extract intention
        result = await aiService.callAi(
            prompt=extractionPrompt,
            documents=None,
            options=None
        )
    except Exception as e:
        # Best-effort: keep the fallback, but record why the AI path was abandoned.
        logging.getLogger(__name__).warning(
            "Failed to parse extraction intent, using fallback: %s", e
        )
        return fallbackIntent

    # A whitespace-only or non-string answer would leave the caller's prompt
    # without any intent text, so treat it like "no answer".
    if isinstance(result, str) and result.strip():
        return result.strip()
    return fallbackIntent

View file

@ -46,6 +46,7 @@ class DocxRenderer(BaseRenderer):
"- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n"
"- Use bullet points (-) for lists and sub-items\n"
"- Use **bold** for emphasis on key terms\n"
"- Use pipe-separated format (Item | Status) for tables when appropriate\n"
"- Provide clean, structured content that can be directly converted to Word formatting\n"
"- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n"
"- Start directly with your content - no introductory text or separators\n"
@ -348,6 +349,40 @@ class DocxRenderer(BaseRenderer):
except Exception as e:
self.logger.warning(f"Could not style table: {str(e)}")
def _process_table_row(self, doc, line: str):
    """Add one pipe-separated line to the document as a table row.

    The first row of a table is written as a bold header row in a new
    'Table Grid' table; following rows are appended to that table, which is
    remembered on ``self._current_table``. A non-blank line without any ``|``
    is added as a regular paragraph and ends the current table.

    Markdown-style input is tolerated: one pair of enclosing pipes
    (``| a | b |``) is dropped, and delimiter rows such as ``|---|:---:|`` are
    skipped instead of being written as data. Cells beyond the table's column
    count are ignored.

    NOTE(review): this method is defined more than once in DocxRenderer in
    this change; Python keeps only the last definition - consolidate them.

    Args:
        doc: Document-like object; only ``add_table(rows=, cols=)`` and
            ``add_paragraph(text)`` are called on it.
        line: Raw text line to process.
    """
    import re

    text = (line or "").strip()
    if not text:
        return

    if '|' not in text:
        # Not a table row, treat as regular text; any later row starts a new table.
        self._current_table = None
        doc.add_paragraph(line)
        return

    # Drop one pair of enclosing pipes so markdown rows do not yield empty edge columns.
    if len(text) > 1 and text.startswith('|') and text.endswith('|'):
        text = text[1:-1]
    parts = [part.strip() for part in text.split('|')]

    # Markdown header delimiter row carries no data.
    if all(re.fullmatch(r":?-{3,}:?", part) for part in parts):
        return

    table = getattr(self, '_current_table', None)
    # A table cached while rendering another document must not receive this
    # document's rows.
    if table is not None and getattr(self, '_current_table_doc', None) is not doc:
        table = None

    if table is None:
        # Create new table; its first row is the header.
        table = doc.add_table(rows=1, cols=len(parts))
        table.style = 'Table Grid'
        self._current_table = table
        self._current_table_doc = doc
        for cell, part in zip(table.rows[0].cells, parts):
            cell.text = part
            # Make header bold
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.bold = True
    else:
        # Add data row to existing table
        row = table.add_row()
        for cell, part in zip(row.cells, parts):
            cell.text = part
def _clean_ai_content(self, content: str) -> str:
"""Clean AI-generated content by removing debug information and duplicates."""
if not content:
@ -435,6 +470,16 @@ class DocxRenderer(BaseRenderer):
bullet_text = line[2:] # Remove "- "
self._add_bullet_point(doc, bullet_text)
# Check if this is a table row (contains pipe separator)
elif '|' in line:
# Flush current paragraph
if current_paragraph:
self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
current_paragraph = []
# This is a table row - collect table data
self._process_table_row(doc, line)
else:
# Regular text - add to current paragraph
current_paragraph.append(line)
@ -462,9 +507,40 @@ class DocxRenderer(BaseRenderer):
if part:
run = para.add_run(part)
run.bold = True
def _process_table_row(self, doc, line: str):
"""Process a table row and add it to the document."""
if not line.strip():
return
# Split by pipe separator
parts = [part.strip() for part in line.split('|')]
if len(parts) >= 2:
# This is a table row - create a table if it doesn't exist
if not hasattr(self, '_current_table') or self._current_table is None:
# Create new table
self._current_table = doc.add_table(rows=1, cols=len(parts))
self._current_table.style = 'Table Grid'
# Add header row
for i, part in enumerate(parts):
if i < len(self._current_table.rows[0].cells):
cell = self._current_table.rows[0].cells[i]
cell.text = part
# Make header bold
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
else:
# Add data row to existing table
row = self._current_table.add_row()
for i, part in enumerate(parts):
if i < len(row.cells):
row.cells[i].text = part
else:
# Regular paragraph
doc.add_paragraph(text)
# Not a table row, treat as regular text
doc.add_paragraph(line)
def _add_bullet_point(self, doc, text: str):
"""Add a bullet point to the document."""
@ -488,4 +564,38 @@ class DocxRenderer(BaseRenderer):
# Bold text
if part:
run = para.add_run(part)
run.bold = True
run.bold = True
def _process_table_row(self, doc, line: str):
    """Add one pipe-separated line to the document as a table row.

    The first row of a table is written as a bold header row in a new
    'Table Grid' table; following rows are appended to that table, which is
    remembered on ``self._current_table``. A non-blank line without any ``|``
    is added as a regular paragraph and ends the current table.

    Markdown-style input is tolerated: one pair of enclosing pipes
    (``| a | b |``) is dropped, and delimiter rows such as ``|---|:---:|`` are
    skipped instead of being written as data. Cells beyond the table's column
    count are ignored.

    NOTE(review): this method is defined more than once in DocxRenderer in
    this change; Python keeps only the last definition - consolidate them.

    Args:
        doc: Document-like object; only ``add_table(rows=, cols=)`` and
            ``add_paragraph(text)`` are called on it.
        line: Raw text line to process.
    """
    import re

    text = (line or "").strip()
    if not text:
        return

    if '|' not in text:
        # Not a table row, treat as regular text; any later row starts a new table.
        self._current_table = None
        doc.add_paragraph(line)
        return

    # Drop one pair of enclosing pipes so markdown rows do not yield empty edge columns.
    if len(text) > 1 and text.startswith('|') and text.endswith('|'):
        text = text[1:-1]
    parts = [part.strip() for part in text.split('|')]

    # Markdown header delimiter row carries no data.
    if all(re.fullmatch(r":?-{3,}:?", part) for part in parts):
        return

    table = getattr(self, '_current_table', None)
    # A table cached while rendering another document must not receive this
    # document's rows.
    if table is not None and getattr(self, '_current_table_doc', None) is not doc:
        table = None

    if table is None:
        # Create new table; its first row is the header.
        table = doc.add_table(rows=1, cols=len(parts))
        table.style = 'Table Grid'
        self._current_table = table
        self._current_table_doc = doc
        for cell, part in zip(table.rows[0].cells, parts):
            cell.text = part
            # Make header bold
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.bold = True
    else:
        # Add data row to existing table
        row = table.add_row()
        for cell, part in zip(row.cells, parts):
            cell.text = part

View file

@ -170,10 +170,14 @@ async def process_documents_and_generate_summary():
# Run a single end-to-end test to avoid the loop issue
logger.info("🧪 Running single end-to-end test...")
# userPrompt = "Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations."
userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with a 'x' marker for all items, which are yellow highlighted."
try:
# Single AI call with DOCX generation
ai_response = await ai_service.callAi(
prompt="Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations.",
prompt=userPrompt,
documents=documents,
options=ai_options,
outputFormat="docx",