# gateway/modules/services/serviceGeneration/renderers/docx_renderer.py

"""
DOCX renderer for report generation using python-docx.
"""

from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
import re
from datetime import datetime, UTC

try:
    from docx import Document
    from docx.shared import Inches, Pt
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.enum.table import WD_TABLE_ALIGNMENT
    from docx.oxml.shared import OxmlElement, qn
    from docx.oxml.ns import nsdecls
    from docx.oxml import parse_xml
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False

class DocxRenderer(BaseRenderer):
    """Renders content to DOCX format using python-docx."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported DOCX formats."""
        return ['docx', 'doc']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['word', 'document']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for DOCX renderer."""
        return 115

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only DOCX-specific guidelines; global prompt is built centrally."""
        return (
            "DOCX FORMAT GUIDELINES:\n"
            "- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n"
            "- Use bullet points (-) for lists and sub-items\n"
            "- Use **bold** for emphasis on key terms\n"
            "- Use pipe-separated format (Item | Status) for tables when appropriate\n"
            "- Provide clean, structured content that can be directly converted to Word formatting\n"
            "- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n"
            "- Start directly with your content - no introductory text or separators\n"
            "OUTPUT: Return ONLY the structured plain text to be converted into DOCX."
        )

    async def render(self, extracted_content: str, title: str, user_prompt: str = None) -> Tuple[str, str]:
        """Render the extracted content and return ``(payload, mime_type)``.

        With python-docx importable, the payload is whatever
        ``_generate_docx_from_prompt`` returns, paired with the DOCX MIME type.
        Without it, rendering is delegated to ``HtmlRenderer`` and the result
        is returned as ``text/html``. Any exception is logged and turned into
        a ``text/plain`` error message instead of propagating.
        """
        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        try:
            if DOCX_AVAILABLE:
                encoded_document = self._generate_docx_from_prompt(extracted_content, title, user_prompt)
                return encoded_document, docx_mime

            # python-docx is missing: hand the job to the HTML renderer instead.
            from .html_renderer import HtmlRenderer
            html_content, _ = await HtmlRenderer().render(extracted_content, title)
            return html_content, "text/html"

        except Exception as e:
            self.logger.error(f"Error rendering DOCX: {e!s}")
            # Minimal fallback so the caller always receives a (content, mime) pair.
            return f"DOCX Generation Error: {e!s}", "text/plain"

    def _generate_docx_from_prompt(self, content: str, title: str, user_prompt: str = None) -> str:
        """Build a DOCX document from the AI text and return it base64-encoded.

        Args:
            content: Raw structured text; it is cleaned with
                ``_clean_ai_content`` before being parsed.
            title: Document title passed on to ``_parse_and_format_content``.
            user_prompt: Accepted for interface compatibility; not used here.

        Returns:
            The saved document bytes, base64-encoded and decoded as UTF-8 text.

        Raises:
            Exception: Wraps any failure, with the original error attached as
                ``__cause__`` so the root traceback is not lost.
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            clean_content = self._clean_ai_content(content)
            self._parse_and_format_content(doc, clean_content, title)

            buffer = io.BytesIO()
            doc.save(buffer)

            # getvalue() returns the entire buffer regardless of the current
            # stream position, so no seek is needed before reading it.
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            self.logger.error(f"Error generating DOCX from prompt: {str(e)}")
            # Chain explicitly so callers and logs can see the underlying error.
            raise Exception(f"DOCX generation failed: {str(e)}") from e

    def _extract_structure_from_prompt(self, user_prompt: str, title: str) -> Dict[str, Any]:
        """Extract document structure from user prompt."""
        structure = {
            'title': title,
            'sections': [],
            'format': 'standard'
        }

        if not user_prompt:
            return structure

        # Extract title from prompt if not provided
        if not title or title == "Generated Document":
            # Look for "create a ... document" or "generate a ... report"
            import re
            title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', user_prompt.lower())
            if title_match:
                structure['title'] = title_match.group(1).strip().title()

        # Extract sections from numbered lists in prompt
        import re
        section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)'
        sections = re.findall(section_pattern, user_prompt)

        for num, section_text in sections:
            structure['sections'].append({
                'number': int(num),
                'title': section_text.strip(),
                'level': 2  # H2 level
            })

        # If no numbered sections found, try to extract from "including:" patterns
        if not structure['sections']:
            including_match = re.search(r'including:\s*(.+?)(?:\.|$)', user_prompt, re.DOTALL)
            if including_match:
                including_text = including_match.group(1)
                # Split by common separators
                parts = re.split(r'[,;]\s*', including_text)
                for i, part in enumerate(parts, 1):
                    part = part.strip()
                    if part:
                        structure['sections'].append({
                            'number': i,
                            'title': part,
                            'level': 2
                        })

        # If still no sections, extract from any list-like patterns
        if not structure['sections']:
            # Look for bullet points or dashes
            bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)'
            bullets = re.findall(bullet_pattern, user_prompt)
            for i, bullet in enumerate(bullets, 1):
                bullet = bullet.strip()
                if bullet and len(bullet) > 3:
                    structure['sections'].append({
                        'number': i,
                        'title': bullet,
                        'level': 2
                    })

        # If still no sections, extract from sentence structure
        if not structure['sections']:
            # Split prompt into sentences and use as sections
            sentences = re.split(r'[.!?]\s+', user_prompt)
            for i, sentence in enumerate(sentences[:5], 1):  # Max 5 sections
                sentence = sentence.strip()
                if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')):
                    structure['sections'].append({
                        'number': i,
                        'title': sentence[:50] + "..." if len(sentence) > 50 else sentence,
                        'level': 2
                    })

        # Final fallback: create sections from prompt keywords
        if not structure['sections']:
            # Extract key action words from prompt
            action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate']
            found_actions = []
            for action in action_words:
                if action in user_prompt.lower():
                    found_actions.append(action.title())

            if found_actions:
                for i, action in enumerate(found_actions[:3], 1):
                    structure['sections'].append({
                        'number': i,
                        'title': f"{action} Document Content",
                        'level': 2
                    })
            else:
                # Last resort: generic but meaningful sections
                structure['sections'] = [
                    {'number': 1, 'title': 'Document Analysis', 'level': 2},
                    {'number': 2, 'title': 'Key Information', 'level': 2},
                    {'number': 3, 'title': 'Summary and Conclusions', 'level': 2}
                ]

        return structure

    def _generate_content_from_structure(self, doc, content: str, structure: Dict[str, Any]):
        """Generate DOCX content based on extracted structure."""
        # Add sections based on prompt structure
        for section in structure['sections']:
            # Add section heading
            doc.add_heading(f"{section['number']}) {section['title']}", level=section['level'])

            # Add AI-generated content for this section
            # Try to extract relevant content for this section from the AI response
            section_content = self._extract_section_content(content, section['title'])

            if section_content:
                doc.add_paragraph(section_content)
            else:
                # If no specific content found, add a note
                doc.add_paragraph(f"Content for {section['title']} based on document analysis.")

            # Add some spacing
            doc.add_paragraph()

        # Add the complete AI-generated content as additional analysis
        if content and content.strip():
            doc.add_heading("Complete Analysis", level=1)
            doc.add_paragraph(content)

    def _extract_section_content(self, content: str, section_title: str) -> str:
        """Extract relevant content for a specific section from AI response."""
        if not content or not section_title:
            return ""

        # Look for content that matches the section title
        section_keywords = section_title.lower().split()

        # Split content into paragraphs
        paragraphs = content.split('\n\n')

        relevant_paragraphs = []
        for paragraph in paragraphs:
            paragraph_lower = paragraph.lower()
            # Check if paragraph contains keywords from section title
            if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3):
                relevant_paragraphs.append(paragraph.strip())

        if relevant_paragraphs:
            return '\n\n'.join(relevant_paragraphs[:2])  # Max 2 paragraphs per section

        return ""

    def _setup_document_styles(self, doc):
        """Apply the base font and heading fonts to the document's styles.

        'Normal' is set to Calibri 11pt; 'Heading 1' to 'Heading 3' are set to
        bold Calibri at 14, 12 and 10pt. Failures are logged as a warning and
        otherwise ignored.
        """
        try:
            body_font = doc.styles['Normal'].font
            body_font.name = 'Calibri'
            body_font.size = Pt(11)

            for level in (1, 2, 3):
                heading_font = doc.styles[f'Heading {level}'].font
                heading_font.name = 'Calibri'
                # Each heading level is 2pt smaller than the one above it.
                heading_font.size = Pt(16 - 2 * level)
                heading_font.bold = True
        except Exception as e:
            self.logger.warning(f"Could not set up document styles: {e!s}")

    def _process_section(self, doc, lines: list):
        """Process a section of content into DOCX elements."""
        for line in lines:
            if not line.strip():
                continue

            # Check for tables (lines with |)
            if '|' in line and not line.startswith('|'):
                # This might be part of a table, process as table
                table_data = self._extract_table_data(lines)
                if table_data:
                    self._add_table(doc, table_data)
                    return

            # Check for lists
            if line.startswith('- ') or line.startswith('* '):
                # This is a list item
                doc.add_paragraph(line[2:], style='List Bullet')
            elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
                # This is a numbered list item
                doc.add_paragraph(line[3:], style='List Number')
            else:
                # Regular paragraph
                doc.add_paragraph(line)

    def _extract_table_data(self, lines: list) -> list:
        """Extract table data from lines."""
        table_data = []
        in_table = False

        for line in lines:
            if '|' in line:
                if not in_table:
                    in_table = True
                # Split by | and clean up
                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                if cells:
                    table_data.append(cells)
            elif in_table and not line.strip():
                # Empty line, might be end of table
                break

        return table_data if len(table_data) > 1 else []

    def _add_table(self, doc, table_data: list):
        """Add a centered table holding ``table_data`` to the document.

        The table gets one row per entry of ``table_data`` and as many columns
        as the widest row, so rows wider than the first one keep all their
        cells. Shorter rows leave their remaining cells empty. The first row
        is then styled via ``_style_table``. Failures are logged as a warning
        and otherwise ignored.

        Args:
            doc: Target document; must provide ``add_table``.
            table_data: List of rows, each a list of cell strings.
        """
        try:
            if not table_data:
                return

            column_count = max(len(row) for row in table_data)
            table = doc.add_table(rows=len(table_data), cols=column_count)
            table.alignment = WD_TABLE_ALIGNMENT.CENTER

            # zip() stops at the shorter side, so ragged rows are safe.
            for row, row_data in zip(table.rows, table_data):
                for cell, cell_data in zip(row.cells, row_data):
                    cell.text = cell_data

            self._style_table(table)

        except Exception as e:
            self.logger.warning(f"Could not add table: {str(e)}")

    def _style_table(self, table):
        """Apply styling to the table."""
        try:
            # Style header row
            if len(table.rows) > 0:
                header_cells = table.rows[0].cells
                for cell in header_cells:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True
        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")

    def _process_table_row(self, doc, line: str):
        """Process a table row and add it to the document.

        Splits ``line`` on ``|``. The first row seen while no table is open
        creates a 'Table Grid' table and fills its first row (runs set bold);
        later rows are appended to the table held in ``self._current_table``.
        A line without any ``|`` is added as a plain paragraph.

        NOTE(review): this class body defines ``_process_table_row`` more than
        once. Python keeps only the last definition in the class body, so this
        copy is never the one that gets called. Consider removing the
        duplicates.
        """
        if not line.strip():
            return

        # Split by pipe separator
        parts = [part.strip() for part in line.split('|')]

        if len(parts) >= 2:
            # This is a table row - create a table if it doesn't exist
            if not hasattr(self, '_current_table') or self._current_table is None:
                # Create new table
                self._current_table = doc.add_table(rows=1, cols=len(parts))
                self._current_table.style = 'Table Grid'

                # Add header row
                for i, part in enumerate(parts):
                    # Guard against more parts than cells in the row
                    if i < len(self._current_table.rows[0].cells):
                        cell = self._current_table.rows[0].cells[i]
                        cell.text = part
                        # Make header bold
                        for paragraph in cell.paragraphs:
                            for run in paragraph.runs:
                                run.bold = True
            else:
                # Add data row to existing table
                row = self._current_table.add_row()
                for i, part in enumerate(parts):
                    # Cells beyond the table's column count are dropped
                    if i < len(row.cells):
                        row.cells[i].text = part
        else:
            # Not a table row, treat as regular text
            doc.add_paragraph(line)

    def _clean_ai_content(self, content: str) -> str:
        """Clean AI-generated content by removing debug information and duplicates."""
        if not content:
            return ""

        # Remove debug information
        lines = content.split('\n')
        clean_lines = []

        for line in lines:
            # Skip debug lines and separators
            if (line.startswith('[Skipped ') or
                line.startswith('=== DOCUMENT:') or
                line.startswith('---') or
                line.startswith('FILENAME:') or
                line.strip() == '' or
                line.strip() == '---'):
                continue
            clean_lines.append(line)

        # Join lines and remove duplicate content
        clean_content = '\n'.join(clean_lines)

        # Remove duplicate sections by keeping only the first occurrence
        sections = clean_content.split('\n\n')
        seen_sections = set()
        unique_sections = []

        for section in sections:
            section_key = section.strip()[:50]  # Use first 50 chars as key
            if section_key not in seen_sections and section.strip():
                seen_sections.add(section_key)
                unique_sections.append(section)

        return '\n\n'.join(unique_sections)

    def _parse_and_format_content(self, doc, content: str, title: str):
        """Parse the structured AI text line by line and write it into ``doc``.

        Adds a centered title, a centered "Generated: ..." line and a page
        break, then handles each stripped line as follows:

        - blank line: ends the paragraph being collected;
        - ``N) text``: level-1 heading with the ``N) `` prefix removed;
        - ``- text``: bullet point via ``_add_bullet_point``;
        - line containing ``|``: table row via ``_process_table_row``;
        - anything else: appended to the current paragraph.

        Any open table is finalized before a heading, a bullet or regular text
        is written, so a table that follows such an element starts fresh
        instead of being appended to the earlier table. Does nothing when
        ``content`` is empty.

        Args:
            doc: Target python-docx document.
            content: Cleaned structured text.
            title: Document title for the level-0 heading.
        """
        if not content:
            return

        # Start clean: a table left open by an earlier (possibly failed)
        # render on this instance must not receive this document's rows.
        self._current_table = None

        title_para = doc.add_heading(title, 0)
        title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # NOTE(review): _format_timestamp is not defined in this class;
        # presumably it is inherited from BaseRenderer - confirm.
        date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
        date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        doc.add_page_break()

        pending: List[str] = []

        def flush_paragraph() -> None:
            # Write the collected lines as one paragraph and reset the buffer.
            if pending:
                self._add_paragraph_to_doc(doc, '\n'.join(pending))
                pending.clear()

        for raw_line in content.split('\n'):
            line = raw_line.strip()
            if not line:
                flush_paragraph()
                continue

            if re.match(r'^\d+\)\s+.+', line):
                # Numbered heading: "1) Title".
                flush_paragraph()
                self._finalize_current_table(doc)
                doc.add_heading(re.sub(r'^\d+\)\s+', '', line), level=1)

            elif line.startswith('- '):
                flush_paragraph()
                self._finalize_current_table(doc)
                self._add_bullet_point(doc, line[2:])

            elif '|' in line:
                flush_paragraph()
                self._process_table_row(doc, line)

            else:
                self._finalize_current_table(doc)
                pending.append(line)

        flush_paragraph()
        self._finalize_current_table(doc)

    def _finalize_current_table(self, doc):
        """Finalize the current table if one exists."""
        if hasattr(self, '_current_table') and self._current_table is not None:
            # Apply final styling to the table
            self._style_table(self._current_table)
            # Clear the current table reference
            self._current_table = None

    def _add_paragraph_to_doc(self, doc, text: str):
        """Add a paragraph to the document with proper formatting."""
        if not text.strip():
            return

        # Check for bold text (**text**)
        if '**' in text:
            para = doc.add_paragraph()
            parts = text.split('**')
            for i, part in enumerate(parts):
                if i % 2 == 0:
                    # Regular text
                    if part:
                        para.add_run(part)
                else:
                    # Bold text
                    if part:
                        run = para.add_run(part)
                        run.bold = True

    def _process_table_row(self, doc, line: str):
        """Process a table row and add it to the document.

        Before splitting on ``|``, a leading ``- **``, ``- `` or ``**`` and a
        trailing ``**`` are removed from the line. The first row seen while no
        table is open creates a 'Table Grid' table; its cells are made bold
        only if the row contains one of a fixed set of header words. Later
        rows are appended to ``self._current_table``. A line without any ``|``
        is added as a plain paragraph (using the original, uncleaned line).

        NOTE(review): this class body defines ``_process_table_row`` more than
        once. Python keeps only the last definition in the class body, so this
        copy - the only one with marker cleanup and the header heuristic - is
        never the one that gets called. Confirm which variant is intended and
        remove the others.
        """
        if not line.strip():
            return

        # Clean the line - remove bullet point markers and bold markers
        clean_line = line.strip()
        if clean_line.startswith('- **'):
            clean_line = clean_line[4:]  # Remove "- **"
        elif clean_line.startswith('- '):
            clean_line = clean_line[2:]  # Remove "- "
        elif clean_line.startswith('**'):
            clean_line = clean_line[2:]  # Remove "**"

        # Remove trailing ** if present
        if clean_line.endswith('**'):
            clean_line = clean_line[:-2]

        # Split by pipe separator
        parts = [part.strip() for part in clean_line.split('|')]

        if len(parts) >= 2:
            # This is a table row - create a table if it doesn't exist
            if not hasattr(self, '_current_table') or self._current_table is None:
                # Create new table
                self._current_table = doc.add_table(rows=1, cols=len(parts))
                self._current_table.style = 'Table Grid'

                # Check if this looks like a header row (contains common header words)
                # Substring match on the whole line, case-insensitive
                is_header = any(word.lower() in clean_line.lower() for word in ['name', 'quantity', 'part', 'number', 'description', 'tag', 'item', 'status'])

                # Add header row
                for i, part in enumerate(parts):
                    if i < len(self._current_table.rows[0].cells):
                        cell = self._current_table.rows[0].cells[i]
                        cell.text = part
                        # Make header bold if it looks like a header
                        if is_header:
                            for paragraph in cell.paragraphs:
                                for run in paragraph.runs:
                                    run.bold = True
            else:
                # Add data row to existing table
                row = self._current_table.add_row()
                for i, part in enumerate(parts):
                    # Cells beyond the table's column count are dropped
                    if i < len(row.cells):
                        row.cells[i].text = part
        else:
            # Not a table row, treat as regular text
            doc.add_paragraph(line)

    def _add_bullet_point(self, doc, text: str):
        """Add a bullet point to the document."""
        if not text.strip():
            return

        # Create paragraph with bullet style
        para = doc.add_paragraph(text, style='List Bullet')

        # Check for bold text in bullet point
        if '**' in text:
            # Clear the paragraph and rebuild with formatting
            para.clear()
            parts = text.split('**')
            for i, part in enumerate(parts):
                if i % 2 == 0:
                    # Regular text
                    if part:
                        para.add_run(part)
                else:
                    # Bold text
                    if part:
                        run = para.add_run(part)
                        run.bold = True

    def _process_table_row(self, doc, line: str):
        """Process a table row and add it to the document."""
        if not line.strip():
            return

        # Split by pipe separator
        parts = [part.strip() for part in line.split('|')]

        if len(parts) >= 2:
            # This is a table row - create a table if it doesn't exist
            if not hasattr(self, '_current_table') or self._current_table is None:
                # Create new table
                self._current_table = doc.add_table(rows=1, cols=len(parts))
                self._current_table.style = 'Table Grid'

                # Add header row
                for i, part in enumerate(parts):
                    if i < len(self._current_table.rows[0].cells):
                        cell = self._current_table.rows[0].cells[i]
                        cell.text = part
                        # Make header bold
                        for paragraph in cell.paragraphs:
                            for run in paragraph.runs:
                                run.bold = True
            else:
                # Add data row to existing table
                row = self._current_table.add_row()
                for i, part in enumerate(parts):
                    if i < len(row.cells):
                        row.cells[i].text = part
        else:
            # Not a table row, treat as regular text
            doc.add_paragraph(line)