gateway/modules/services/serviceGeneration/renderers/docx_renderer.py
2025-10-11 18:55:00 +02:00

633 lines
No EOL
26 KiB
Python

"""
DOCX renderer for report generation using python-docx.
"""
from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
import re
from datetime import datetime, UTC
try:
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.shared import OxmlElement, qn
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
DOCX_AVAILABLE = True
except ImportError:
DOCX_AVAILABLE = False
class DocxRenderer(BaseRenderer):
"""Renders content to DOCX format using python-docx."""
@classmethod
def get_supported_formats(cls) -> List[str]:
    """List the format identifiers handled by this renderer.

    Returns:
        A fresh list containing ``'docx'`` and ``'doc'``.
    """
    supported = ('docx', 'doc')
    return list(supported)
@classmethod
def get_format_aliases(cls) -> List[str]:
    """List alternative names under which this renderer can be requested.

    Returns:
        A fresh list containing ``'word'`` and ``'document'``.
    """
    aliases = ('word', 'document')
    return list(aliases)
@classmethod
def get_priority(cls) -> int:
    """Report the priority value of the DOCX renderer.

    Returns:
        The constant ``115``.
    """
    priority = 115
    return priority
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
    """Build the DOCX-specific formatting guidelines for the extraction prompt.

    Only the DOCX guidelines are produced here; neither ``user_prompt`` nor
    ``title`` influences the returned text.

    Args:
        user_prompt: Accepted for interface compatibility; not used.
        title: Accepted for interface compatibility; not used.

    Returns:
        The guideline lines joined with newline characters (no trailing newline).
    """
    guideline_lines = (
        "DOCX FORMAT GUIDELINES:",
        "- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.",
        "- Use bullet points (-) for lists and sub-items",
        "- Use **bold** for emphasis on key terms",
        "- Use pipe-separated format (Item | Status) for tables when appropriate",
        "- Provide clean, structured content that can be directly converted to Word formatting",
        "- Do NOT include debug information, separators (---), metadata, or FILENAME headers",
        "- Start directly with your content - no introductory text or separators",
        "OUTPUT: Return ONLY the structured plain text to be converted into DOCX.",
    )
    return "\n".join(guideline_lines)
async def render(self, extracted_content: str, title: str, user_prompt: str = None) -> Tuple[str, str]:
    """Turn extracted content into a DOCX payload plus its MIME type.

    When ``DOCX_AVAILABLE`` is false the work is delegated to ``HtmlRenderer``
    and the HTML result is returned instead. Any exception raised along the
    way is logged and converted into a plain-text error message rather than
    propagated.

    Args:
        extracted_content: Text to be rendered.
        title: Document title.
        user_prompt: Forwarded to ``_generate_docx_from_prompt``.

    Returns:
        A ``(content, mime_type)`` pair: the value produced by
        ``_generate_docx_from_prompt`` with the DOCX MIME type, the HTML
        fallback with ``"text/html"``, or an error text with ``"text/plain"``.
    """
    try:
        if DOCX_AVAILABLE:
            # Normal path: build the document from the extracted content.
            payload = self._generate_docx_from_prompt(extracted_content, title, user_prompt)
            return payload, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

        # python-docx could not be imported: hand over to the HTML renderer.
        from .html_renderer import HtmlRenderer
        fallback_renderer = HtmlRenderer()
        html_body, _unused_mime = await fallback_renderer.render(extracted_content, title)
        return html_body, "text/html"
    except Exception as e:
        self.logger.error(f"Error rendering DOCX: {str(e)}")
        # Minimal fallback so the caller still receives a (content, mime) pair.
        return f"DOCX Generation Error: {str(e)}", "text/plain"
def _generate_docx_from_prompt(self, content: str, title: str, user_prompt: str = None) -> str:
    """Build a DOCX document from AI-generated content and return it base64-encoded.

    The content is cleaned with ``_clean_ai_content`` and written into a new
    ``Document`` by ``_parse_and_format_content``; the saved bytes are then
    base64-encoded.

    Args:
        content: Raw structured text to convert.
        title: Title passed on to ``_parse_and_format_content``.
        user_prompt: Accepted for interface compatibility; not used here.

    Returns:
        The saved document bytes, base64-encoded as a ``str``.

    Raises:
        Exception: If any step fails. The original error is logged and
            attached as ``__cause__`` of the raised exception.
    """
    try:
        doc = Document()
        self._setup_document_styles(doc)

        # Strip debug markers before the text is interpreted as structure.
        clean_content = self._clean_ai_content(content)
        self._parse_and_format_content(doc, clean_content, title)

        # getvalue() returns the whole buffer regardless of the stream
        # position, so no seek is needed after saving.
        buffer = io.BytesIO()
        doc.save(buffer)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')
    except Exception as e:
        self.logger.error(f"Error generating DOCX from prompt: {str(e)}")
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise Exception(f"DOCX generation failed: {str(e)}") from e
def _extract_structure_from_prompt(self, user_prompt: str, title: str) -> Dict[str, Any]:
"""Extract document structure from user prompt."""
structure = {
'title': title,
'sections': [],
'format': 'standard'
}
if not user_prompt:
return structure
# Extract title from prompt if not provided
if not title or title == "Generated Document":
# Look for "create a ... document" or "generate a ... report"
import re
title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', user_prompt.lower())
if title_match:
structure['title'] = title_match.group(1).strip().title()
# Extract sections from numbered lists in prompt
import re
section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)'
sections = re.findall(section_pattern, user_prompt)
for num, section_text in sections:
structure['sections'].append({
'number': int(num),
'title': section_text.strip(),
'level': 2 # H2 level
})
# If no numbered sections found, try to extract from "including:" patterns
if not structure['sections']:
including_match = re.search(r'including:\s*(.+?)(?:\.|$)', user_prompt, re.DOTALL)
if including_match:
including_text = including_match.group(1)
# Split by common separators
parts = re.split(r'[,;]\s*', including_text)
for i, part in enumerate(parts, 1):
part = part.strip()
if part:
structure['sections'].append({
'number': i,
'title': part,
'level': 2
})
# If still no sections, extract from any list-like patterns
if not structure['sections']:
# Look for bullet points or dashes
bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)'
bullets = re.findall(bullet_pattern, user_prompt)
for i, bullet in enumerate(bullets, 1):
bullet = bullet.strip()
if bullet and len(bullet) > 3:
structure['sections'].append({
'number': i,
'title': bullet,
'level': 2
})
# If still no sections, extract from sentence structure
if not structure['sections']:
# Split prompt into sentences and use as sections
sentences = re.split(r'[.!?]\s+', user_prompt)
for i, sentence in enumerate(sentences[:5], 1): # Max 5 sections
sentence = sentence.strip()
if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')):
structure['sections'].append({
'number': i,
'title': sentence[:50] + "..." if len(sentence) > 50 else sentence,
'level': 2
})
# Final fallback: create sections from prompt keywords
if not structure['sections']:
# Extract key action words from prompt
action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate']
found_actions = []
for action in action_words:
if action in user_prompt.lower():
found_actions.append(action.title())
if found_actions:
for i, action in enumerate(found_actions[:3], 1):
structure['sections'].append({
'number': i,
'title': f"{action} Document Content",
'level': 2
})
else:
# Last resort: generic but meaningful sections
structure['sections'] = [
{'number': 1, 'title': 'Document Analysis', 'level': 2},
{'number': 2, 'title': 'Key Information', 'level': 2},
{'number': 3, 'title': 'Summary and Conclusions', 'level': 2}
]
return structure
def _generate_content_from_structure(self, doc, content: str, structure: Dict[str, Any]):
"""Generate DOCX content based on extracted structure."""
# Add sections based on prompt structure
for section in structure['sections']:
# Add section heading
doc.add_heading(f"{section['number']}) {section['title']}", level=section['level'])
# Add AI-generated content for this section
# Try to extract relevant content for this section from the AI response
section_content = self._extract_section_content(content, section['title'])
if section_content:
doc.add_paragraph(section_content)
else:
# If no specific content found, add a note
doc.add_paragraph(f"Content for {section['title']} based on document analysis.")
# Add some spacing
doc.add_paragraph()
# Add the complete AI-generated content as additional analysis
if content and content.strip():
doc.add_heading("Complete Analysis", level=1)
doc.add_paragraph(content)
def _extract_section_content(self, content: str, section_title: str) -> str:
"""Extract relevant content for a specific section from AI response."""
if not content or not section_title:
return ""
# Look for content that matches the section title
section_keywords = section_title.lower().split()
# Split content into paragraphs
paragraphs = content.split('\n\n')
relevant_paragraphs = []
for paragraph in paragraphs:
paragraph_lower = paragraph.lower()
# Check if paragraph contains keywords from section title
if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3):
relevant_paragraphs.append(paragraph.strip())
if relevant_paragraphs:
return '\n\n'.join(relevant_paragraphs[:2]) # Max 2 paragraphs per section
return ""
def _setup_document_styles(self, doc):
    """Apply the default body font and heading fonts to ``doc``.

    The 'Normal' style is set to Calibri 11 pt. 'Heading 1' to 'Heading 3'
    are set to bold Calibri at 14, 12 and 10 pt respectively. Failures are
    logged as warnings and otherwise ignored.

    Args:
        doc: Document whose ``styles`` mapping is modified in place.
    """
    try:
        body_font = doc.styles['Normal'].font
        body_font.name = 'Calibri'
        body_font.size = Pt(11)

        for level in (1, 2, 3):
            level_font = doc.styles[f'Heading {level}'].font
            level_font.name = 'Calibri'
            # Each heading level is 2 pt smaller than the one above it.
            level_font.size = Pt(16 - level * 2)
            level_font.bold = True
    except Exception as e:
        self.logger.warning(f"Could not set up document styles: {str(e)}")
def _process_section(self, doc, lines: list):
"""Process a section of content into DOCX elements."""
for line in lines:
if not line.strip():
continue
# Check for tables (lines with |)
if '|' in line and not line.startswith('|'):
# This might be part of a table, process as table
table_data = self._extract_table_data(lines)
if table_data:
self._add_table(doc, table_data)
return
# Check for lists
if line.startswith('- ') or line.startswith('* '):
# This is a list item
doc.add_paragraph(line[2:], style='List Bullet')
elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
# This is a numbered list item
doc.add_paragraph(line[3:], style='List Number')
else:
# Regular paragraph
doc.add_paragraph(line)
def _extract_table_data(self, lines: list) -> list:
"""Extract table data from lines."""
table_data = []
in_table = False
for line in lines:
if '|' in line:
if not in_table:
in_table = True
# Split by | and clean up
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
if cells:
table_data.append(cells)
elif in_table and not line.strip():
# Empty line, might be end of table
break
return table_data if len(table_data) > 1 else []
def _add_table(self, doc, table_data: list):
"""Add a table to the document."""
try:
if not table_data:
return
# Create table
table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
table.alignment = WD_TABLE_ALIGNMENT.CENTER
# Add data to table
for row_idx, row_data in enumerate(table_data):
for col_idx, cell_data in enumerate(row_data):
if col_idx < len(table.rows[row_idx].cells):
table.rows[row_idx].cells[col_idx].text = cell_data
# Style the table
self._style_table(table)
except Exception as e:
self.logger.warning(f"Could not add table: {str(e)}")
def _style_table(self, table):
"""Apply styling to the table."""
try:
# Style header row
if len(table.rows) > 0:
header_cells = table.rows[0].cells
for cell in header_cells:
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
except Exception as e:
self.logger.warning(f"Could not style table: {str(e)}")
def _process_table_row(self, doc, line: str):
"""Process a table row and add it to the document."""
if not line.strip():
return
# Split by pipe separator
parts = [part.strip() for part in line.split('|')]
if len(parts) >= 2:
# This is a table row - create a table if it doesn't exist
if not hasattr(self, '_current_table') or self._current_table is None:
# Create new table
self._current_table = doc.add_table(rows=1, cols=len(parts))
self._current_table.style = 'Table Grid'
# Add header row
for i, part in enumerate(parts):
if i < len(self._current_table.rows[0].cells):
cell = self._current_table.rows[0].cells[i]
cell.text = part
# Make header bold
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
else:
# Add data row to existing table
row = self._current_table.add_row()
for i, part in enumerate(parts):
if i < len(row.cells):
row.cells[i].text = part
else:
# Not a table row, treat as regular text
doc.add_paragraph(line)
def _clean_ai_content(self, content: str) -> str:
"""Clean AI-generated content by removing debug information and duplicates."""
if not content:
return ""
# Remove debug information
lines = content.split('\n')
clean_lines = []
for line in lines:
# Skip debug lines and separators
if (line.startswith('[Skipped ') or
line.startswith('=== DOCUMENT:') or
line.startswith('---') or
line.startswith('FILENAME:') or
line.strip() == '' or
line.strip() == '---'):
continue
clean_lines.append(line)
# Join lines and remove duplicate content
clean_content = '\n'.join(clean_lines)
# Remove duplicate sections by keeping only the first occurrence
sections = clean_content.split('\n\n')
seen_sections = set()
unique_sections = []
for section in sections:
section_key = section.strip()[:50] # Use first 50 chars as key
if section_key not in seen_sections and section.strip():
seen_sections.add(section_key)
unique_sections.append(section)
return '\n\n'.join(unique_sections)
def _parse_and_format_content(self, doc, content: str, title: str):
    """Write a title page and the structured content into ``doc``.

    A centred title heading, a centred "Generated: ..." line and a page break
    are added first. The content is then read line by line:

    * ``<digits>) text`` becomes a level-1 heading,
    * ``- text`` becomes a bullet point,
    * a line containing ``|`` is passed to ``_process_table_row``,
    * any other non-blank line is collected into the current paragraph,
    * a blank line ends the current paragraph.

    An open table is finalized as soon as a heading, bullet or text line
    follows it, so pipe rows that appear after such a line start a new table
    instead of being appended to the earlier one.

    Args:
        doc: Document object to write into.
        content: Cleaned structured text. Nothing is written when it is empty.
        title: Text of the title heading.
    """
    if not content:
        return

    title_para = doc.add_heading(title, 0)
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
    date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_page_break()

    pending = []

    def flush_pending():
        # Emit the lines collected so far as one paragraph.
        if pending:
            self._add_paragraph_to_doc(doc, '\n'.join(pending))
            pending.clear()

    for line in content.split('\n'):
        line = line.strip()

        if not line:
            flush_pending()
            continue

        heading = re.match(r'^\d+\)\s+(.+)', line)
        if heading:
            flush_pending()
            # A heading ends any table that is still open.
            self._finalize_current_table(doc)
            doc.add_heading(heading.group(1), level=1)
        elif line.startswith('- '):
            flush_pending()
            # A bullet ends any table that is still open.
            self._finalize_current_table(doc)
            self._add_bullet_point(doc, line[2:])
        elif '|' in line:
            flush_pending()
            self._process_table_row(doc, line)
        else:
            # Regular text ends any table that is still open.
            self._finalize_current_table(doc)
            pending.append(line)

    flush_pending()
    self._finalize_current_table(doc)
def _finalize_current_table(self, doc):
"""Finalize the current table if one exists."""
if hasattr(self, '_current_table') and self._current_table is not None:
# Apply final styling to the table
self._style_table(self._current_table)
# Clear the current table reference
self._current_table = None
def _add_paragraph_to_doc(self, doc, text: str):
"""Add a paragraph to the document with proper formatting."""
if not text.strip():
return
# Check for bold text (**text**)
if '**' in text:
para = doc.add_paragraph()
parts = text.split('**')
for i, part in enumerate(parts):
if i % 2 == 0:
# Regular text
if part:
para.add_run(part)
else:
# Bold text
if part:
run = para.add_run(part)
run.bold = True
def _process_table_row(self, doc, line: str):
"""Process a table row and add it to the document."""
if not line.strip():
return
# Clean the line - remove bullet point markers and bold markers
clean_line = line.strip()
if clean_line.startswith('- **'):
clean_line = clean_line[4:] # Remove "- **"
elif clean_line.startswith('- '):
clean_line = clean_line[2:] # Remove "- "
elif clean_line.startswith('**'):
clean_line = clean_line[2:] # Remove "**"
# Remove trailing ** if present
if clean_line.endswith('**'):
clean_line = clean_line[:-2]
# Split by pipe separator
parts = [part.strip() for part in clean_line.split('|')]
if len(parts) >= 2:
# This is a table row - create a table if it doesn't exist
if not hasattr(self, '_current_table') or self._current_table is None:
# Create new table
self._current_table = doc.add_table(rows=1, cols=len(parts))
self._current_table.style = 'Table Grid'
# Check if this looks like a header row (contains common header words)
is_header = any(word.lower() in clean_line.lower() for word in ['name', 'quantity', 'part', 'number', 'description', 'tag', 'item', 'status'])
# Add header row
for i, part in enumerate(parts):
if i < len(self._current_table.rows[0].cells):
cell = self._current_table.rows[0].cells[i]
cell.text = part
# Make header bold if it looks like a header
if is_header:
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
else:
# Add data row to existing table
row = self._current_table.add_row()
for i, part in enumerate(parts):
if i < len(row.cells):
row.cells[i].text = part
else:
# Not a table row, treat as regular text
doc.add_paragraph(line)
def _add_bullet_point(self, doc, text: str):
"""Add a bullet point to the document."""
if not text.strip():
return
# Create paragraph with bullet style
para = doc.add_paragraph(text, style='List Bullet')
# Check for bold text in bullet point
if '**' in text:
# Clear the paragraph and rebuild with formatting
para.clear()
parts = text.split('**')
for i, part in enumerate(parts):
if i % 2 == 0:
# Regular text
if part:
para.add_run(part)
else:
# Bold text
if part:
run = para.add_run(part)
run.bold = True
def _process_table_row(self, doc, line: str):
"""Process a table row and add it to the document."""
if not line.strip():
return
# Split by pipe separator
parts = [part.strip() for part in line.split('|')]
if len(parts) >= 2:
# This is a table row - create a table if it doesn't exist
if not hasattr(self, '_current_table') or self._current_table is None:
# Create new table
self._current_table = doc.add_table(rows=1, cols=len(parts))
self._current_table.style = 'Table Grid'
# Add header row
for i, part in enumerate(parts):
if i < len(self._current_table.rows[0].cells):
cell = self._current_table.rows[0].cells[i]
cell.text = part
# Make header bold
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
else:
# Add data row to existing table
row = self._current_table.add_row()
for i, part in enumerate(parts):
if i < len(row.cells):
row.cells[i].text = part
else:
# Not a table row, treat as regular text
doc.add_paragraph(line)