""" DOCX renderer for report generation using python-docx. """ from .base_renderer import BaseRenderer from typing import Dict, Any, Tuple, List import io import base64 import re from datetime import datetime, UTC try: from docx import Document from docx.shared import Inches, Pt from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.table import WD_TABLE_ALIGNMENT from docx.oxml.shared import OxmlElement, qn from docx.oxml.ns import nsdecls from docx.oxml import parse_xml DOCX_AVAILABLE = True except ImportError: DOCX_AVAILABLE = False class DocxRenderer(BaseRenderer): """Renders content to DOCX format using python-docx.""" @classmethod def get_supported_formats(cls) -> List[str]: """Return supported DOCX formats.""" return ['docx', 'doc'] @classmethod def get_format_aliases(cls) -> List[str]: """Return format aliases.""" return ['word', 'document'] @classmethod def get_priority(cls) -> int: """Return priority for DOCX renderer.""" return 115 def getExtractionPrompt(self, user_prompt: str, title: str) -> str: """Return only DOCX-specific guidelines; global prompt is built centrally.""" return ( "DOCX FORMAT GUIDELINES:\n" "- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n" "- Use bullet points (-) for lists and sub-items\n" "- Use **bold** for emphasis on key terms\n" "- Use pipe-separated format (Item | Status) for tables when appropriate\n" "- Provide clean, structured content that can be directly converted to Word formatting\n" "- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n" "- Start directly with your content - no introductory text or separators\n" "OUTPUT: Return ONLY the structured plain text to be converted into DOCX." ) async def render(self, extracted_content: str, title: str, user_prompt: str = None) -> Tuple[str, str]: """Render extracted content to DOCX format using user prompt as blueprint.""" try: if not DOCX_AVAILABLE: # Fallback to HTML if python-docx not available from .html_renderer import HtmlRenderer html_renderer = HtmlRenderer() html_content, _ = await html_renderer.render(extracted_content, title) return html_content, "text/html" # Generate DOCX using prompt-based structure docx_content = self._generate_docx_from_prompt(extracted_content, title, user_prompt) return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" except Exception as e: self.logger.error(f"Error rendering DOCX: {str(e)}") # Return minimal fallback return f"DOCX Generation Error: {str(e)}", "text/plain" def _generate_docx_from_prompt(self, content: str, title: str, user_prompt: str = None) -> str: """Generate DOCX content by parsing the AI-generated structured content.""" try: # Create new document doc = Document() # Set up document styles self._setup_document_styles(doc) # Clean the content - remove debug information clean_content = self._clean_ai_content(content) # Parse and convert the structured content to DOCX self._parse_and_format_content(doc, clean_content, title) # Save to buffer buffer = io.BytesIO() doc.save(buffer) buffer.seek(0) # Convert to base64 docx_bytes = buffer.getvalue() docx_base64 = base64.b64encode(docx_bytes).decode('utf-8') return docx_base64 except Exception as e: self.logger.error(f"Error generating DOCX from prompt: {str(e)}") raise Exception(f"DOCX generation failed: {str(e)}") def _extract_structure_from_prompt(self, user_prompt: str, title: str) -> Dict[str, Any]: """Extract document structure from user prompt.""" structure = { 'title': title, 'sections': [], 'format': 'standard' } if not user_prompt: return structure # Extract title from prompt if not provided if not title or title == "Generated Document": # Look for "create a ... document" or "generate a ... report" import re title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', user_prompt.lower()) if title_match: structure['title'] = title_match.group(1).strip().title() # Extract sections from numbered lists in prompt import re section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)' sections = re.findall(section_pattern, user_prompt) for num, section_text in sections: structure['sections'].append({ 'number': int(num), 'title': section_text.strip(), 'level': 2 # H2 level }) # If no numbered sections found, try to extract from "including:" patterns if not structure['sections']: including_match = re.search(r'including:\s*(.+?)(?:\.|$)', user_prompt, re.DOTALL) if including_match: including_text = including_match.group(1) # Split by common separators parts = re.split(r'[,;]\s*', including_text) for i, part in enumerate(parts, 1): part = part.strip() if part: structure['sections'].append({ 'number': i, 'title': part, 'level': 2 }) # If still no sections, extract from any list-like patterns if not structure['sections']: # Look for bullet points or dashes bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)' bullets = re.findall(bullet_pattern, user_prompt) for i, bullet in enumerate(bullets, 1): bullet = bullet.strip() if bullet and len(bullet) > 3: structure['sections'].append({ 'number': i, 'title': bullet, 'level': 2 }) # If still no sections, extract from sentence structure if not structure['sections']: # Split prompt into sentences and use as sections sentences = re.split(r'[.!?]\s+', user_prompt) for i, sentence in enumerate(sentences[:5], 1): # Max 5 sections sentence = sentence.strip() if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')): structure['sections'].append({ 'number': i, 'title': sentence[:50] + "..." if len(sentence) > 50 else sentence, 'level': 2 }) # Final fallback: create sections from prompt keywords if not structure['sections']: # Extract key action words from prompt action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate'] found_actions = [] for action in action_words: if action in user_prompt.lower(): found_actions.append(action.title()) if found_actions: for i, action in enumerate(found_actions[:3], 1): structure['sections'].append({ 'number': i, 'title': f"{action} Document Content", 'level': 2 }) else: # Last resort: generic but meaningful sections structure['sections'] = [ {'number': 1, 'title': 'Document Analysis', 'level': 2}, {'number': 2, 'title': 'Key Information', 'level': 2}, {'number': 3, 'title': 'Summary and Conclusions', 'level': 2} ] return structure def _generate_content_from_structure(self, doc, content: str, structure: Dict[str, Any]): """Generate DOCX content based on extracted structure.""" # Add sections based on prompt structure for section in structure['sections']: # Add section heading doc.add_heading(f"{section['number']}) {section['title']}", level=section['level']) # Add AI-generated content for this section # Try to extract relevant content for this section from the AI response section_content = self._extract_section_content(content, section['title']) if section_content: doc.add_paragraph(section_content) else: # If no specific content found, add a note doc.add_paragraph(f"Content for {section['title']} based on document analysis.") # Add some spacing doc.add_paragraph() # Add the complete AI-generated content as additional analysis if content and content.strip(): doc.add_heading("Complete Analysis", level=1) doc.add_paragraph(content) def _extract_section_content(self, content: str, section_title: str) -> str: """Extract relevant content for a specific section from AI response.""" if not content or not section_title: return "" # Look for content that matches the section title section_keywords = section_title.lower().split() # Split content into paragraphs paragraphs = content.split('\n\n') relevant_paragraphs = [] for paragraph in paragraphs: paragraph_lower = paragraph.lower() # Check if paragraph contains keywords from section title if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3): relevant_paragraphs.append(paragraph.strip()) if relevant_paragraphs: return '\n\n'.join(relevant_paragraphs[:2]) # Max 2 paragraphs per section return "" def _setup_document_styles(self, doc): """Set up document styles.""" try: # Set default font style = doc.styles['Normal'] font = style.font font.name = 'Calibri' font.size = Pt(11) # Set heading styles for i in range(1, 4): heading_style = doc.styles[f'Heading {i}'] heading_font = heading_style.font heading_font.name = 'Calibri' heading_font.size = Pt(16 - i * 2) heading_font.bold = True except Exception as e: self.logger.warning(f"Could not set up document styles: {str(e)}") def _process_section(self, doc, lines: list): """Process a section of content into DOCX elements.""" for line in lines: if not line.strip(): continue # Check for tables (lines with |) if '|' in line and not line.startswith('|'): # This might be part of a table, process as table table_data = self._extract_table_data(lines) if table_data: self._add_table(doc, table_data) return # Check for lists if line.startswith('- ') or line.startswith('* '): # This is a list item doc.add_paragraph(line[2:], style='List Bullet') elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')): # This is a numbered list item doc.add_paragraph(line[3:], style='List Number') else: # Regular paragraph doc.add_paragraph(line) def _extract_table_data(self, lines: list) -> list: """Extract table data from lines.""" table_data = [] in_table = False for line in lines: if '|' in line: if not in_table: in_table = True # Split by | and clean up cells = [cell.strip() for cell in line.split('|') if cell.strip()] if cells: table_data.append(cells) elif in_table and not line.strip(): # Empty line, might be end of table break return table_data if len(table_data) > 1 else [] def _add_table(self, doc, table_data: list): """Add a table to the document.""" try: if not table_data: return # Create table table = doc.add_table(rows=len(table_data), cols=len(table_data[0])) table.alignment = WD_TABLE_ALIGNMENT.CENTER # Add data to table for row_idx, row_data in enumerate(table_data): for col_idx, cell_data in enumerate(row_data): if col_idx < len(table.rows[row_idx].cells): table.rows[row_idx].cells[col_idx].text = cell_data # Style the table self._style_table(table) except Exception as e: self.logger.warning(f"Could not add table: {str(e)}") def _style_table(self, table): """Apply styling to the table.""" try: # Style header row if len(table.rows) > 0: header_cells = table.rows[0].cells for cell in header_cells: for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True except Exception as e: self.logger.warning(f"Could not style table: {str(e)}") def _process_table_row(self, doc, line: str): """Process a table row and add it to the document.""" if not line.strip(): return # Split by pipe separator parts = [part.strip() for part in line.split('|')] if len(parts) >= 2: # This is a table row - create a table if it doesn't exist if not hasattr(self, '_current_table') or self._current_table is None: # Create new table self._current_table = doc.add_table(rows=1, cols=len(parts)) self._current_table.style = 'Table Grid' # Add header row for i, part in enumerate(parts): if i < len(self._current_table.rows[0].cells): cell = self._current_table.rows[0].cells[i] cell.text = part # Make header bold for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True else: # Add data row to existing table row = self._current_table.add_row() for i, part in enumerate(parts): if i < len(row.cells): row.cells[i].text = part else: # Not a table row, treat as regular text doc.add_paragraph(line) def _clean_ai_content(self, content: str) -> str: """Clean AI-generated content by removing debug information and duplicates.""" if not content: return "" # Remove debug information lines = content.split('\n') clean_lines = [] for line in lines: # Skip debug lines and separators if (line.startswith('[Skipped ') or line.startswith('=== DOCUMENT:') or line.startswith('---') or line.startswith('FILENAME:') or line.strip() == '' or line.strip() == '---'): continue clean_lines.append(line) # Join lines and remove duplicate content clean_content = '\n'.join(clean_lines) # Remove duplicate sections by keeping only the first occurrence sections = clean_content.split('\n\n') seen_sections = set() unique_sections = [] for section in sections: section_key = section.strip()[:50] # Use first 50 chars as key if section_key not in seen_sections and section.strip(): seen_sections.add(section_key) unique_sections.append(section) return '\n\n'.join(unique_sections) def _parse_and_format_content(self, doc, content: str, title: str): """Parse AI-generated structured content and format it as DOCX.""" if not content: return # Add title title_para = doc.add_heading(title, 0) title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER # Add generation date date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}") date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER # Add page break doc.add_page_break() # Parse content line by line lines = content.split('\n') current_paragraph = [] for line in lines: line = line.strip() if not line: # Empty line - end current paragraph if current_paragraph: self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph)) current_paragraph = [] continue # Check if this is a numbered heading (1) Title, 2) Title, etc.) if re.match(r'^\d+\)\s+.+', line): # Flush current paragraph if current_paragraph: self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph)) current_paragraph = [] # Add as heading heading_text = re.sub(r'^\d+\)\s+', '', line) doc.add_heading(heading_text, level=1) # Check if this is a bullet point (- item) elif line.startswith('- '): # Flush current paragraph if current_paragraph: self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph)) current_paragraph = [] # Add as bullet point bullet_text = line[2:] # Remove "- " self._add_bullet_point(doc, bullet_text) # Check if this is a table row (contains pipe separator) elif '|' in line: # Flush current paragraph if current_paragraph: self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph)) current_paragraph = [] # This is a table row - collect table data self._process_table_row(doc, line) else: # Regular text - finalize any open table first if hasattr(self, '_current_table') and self._current_table is not None: self._finalize_current_table(doc) # Add to current paragraph current_paragraph.append(line) # Flush any remaining paragraph if current_paragraph: self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph)) # Finalize any open table self._finalize_current_table(doc) def _finalize_current_table(self, doc): """Finalize the current table if one exists.""" if hasattr(self, '_current_table') and self._current_table is not None: # Apply final styling to the table self._style_table(self._current_table) # Clear the current table reference self._current_table = None def _add_paragraph_to_doc(self, doc, text: str): """Add a paragraph to the document with proper formatting.""" if not text.strip(): return # Check for bold text (**text**) if '**' in text: para = doc.add_paragraph() parts = text.split('**') for i, part in enumerate(parts): if i % 2 == 0: # Regular text if part: para.add_run(part) else: # Bold text if part: run = para.add_run(part) run.bold = True def _process_table_row(self, doc, line: str): """Process a table row and add it to the document.""" if not line.strip(): return # Clean the line - remove bullet point markers and bold markers clean_line = line.strip() if clean_line.startswith('- **'): clean_line = clean_line[4:] # Remove "- **" elif clean_line.startswith('- '): clean_line = clean_line[2:] # Remove "- " elif clean_line.startswith('**'): clean_line = clean_line[2:] # Remove "**" # Remove trailing ** if present if clean_line.endswith('**'): clean_line = clean_line[:-2] # Split by pipe separator parts = [part.strip() for part in clean_line.split('|')] if len(parts) >= 2: # This is a table row - create a table if it doesn't exist if not hasattr(self, '_current_table') or self._current_table is None: # Create new table self._current_table = doc.add_table(rows=1, cols=len(parts)) self._current_table.style = 'Table Grid' # Check if this looks like a header row (contains common header words) is_header = any(word.lower() in clean_line.lower() for word in ['name', 'quantity', 'part', 'number', 'description', 'tag', 'item', 'status']) # Add header row for i, part in enumerate(parts): if i < len(self._current_table.rows[0].cells): cell = self._current_table.rows[0].cells[i] cell.text = part # Make header bold if it looks like a header if is_header: for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True else: # Add data row to existing table row = self._current_table.add_row() for i, part in enumerate(parts): if i < len(row.cells): row.cells[i].text = part else: # Not a table row, treat as regular text doc.add_paragraph(line) def _add_bullet_point(self, doc, text: str): """Add a bullet point to the document.""" if not text.strip(): return # Create paragraph with bullet style para = doc.add_paragraph(text, style='List Bullet') # Check for bold text in bullet point if '**' in text: # Clear the paragraph and rebuild with formatting para.clear() parts = text.split('**') for i, part in enumerate(parts): if i % 2 == 0: # Regular text if part: para.add_run(part) else: # Bold text if part: run = para.add_run(part) run.bold = True def _process_table_row(self, doc, line: str): """Process a table row and add it to the document.""" if not line.strip(): return # Split by pipe separator parts = [part.strip() for part in line.split('|')] if len(parts) >= 2: # This is a table row - create a table if it doesn't exist if not hasattr(self, '_current_table') or self._current_table is None: # Create new table self._current_table = doc.add_table(rows=1, cols=len(parts)) self._current_table.style = 'Table Grid' # Add header row for i, part in enumerate(parts): if i < len(self._current_table.rows[0].cells): cell = self._current_table.rows[0].cells[i] cell.text = part # Make header bold for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True else: # Add data row to existing table row = self._current_table.add_row() for i, part in enumerate(parts): if i < len(row.cells): row.cells[i].text = part else: # Not a table row, treat as regular text doc.add_paragraph(line)