gateway/modules/services/serviceGeneration/renderers/pdf_renderer.py

"""
PDF renderer for report generation using reportlab.
"""

import base64
import io
from datetime import datetime, UTC
from typing import Dict, Any, Tuple, List
from xml.sax.saxutils import escape

from .base_renderer import BaseRenderer

try:
    from reportlab.lib.pagesizes import letter, A4
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.lib.units import inch
    from reportlab.lib import colors
    from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
    REPORTLAB_AVAILABLE = True
except ImportError:
    REPORTLAB_AVAILABLE = False

class PdfRenderer(BaseRenderer):
    """Renders content to PDF format using reportlab.

    The extracted content is treated as plain text with a light,
    markdown-like structure: ``# ``, ``## `` and ``### `` prefixes mark
    headings, ``- `` / ``* `` prefixes mark list items, and runs of
    consecutive lines containing ``|`` are laid out as tables.

    NOTE(review): ``self.logger`` and ``self._format_timestamp()`` are not
    defined in this class; they are presumably provided by ``BaseRenderer``.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported PDF formats."""
        return ['pdf']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['document', 'print']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for PDF renderer."""
        return 120

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only PDF-specific guidelines; global prompt is built centrally.

        ``user_prompt`` and ``title`` are accepted for interface
        compatibility but are not used to build the guidelines.
        """
        return (
            "PDF FORMAT GUIDELINES:\n"
            "- Provide structured content suitable for pagination and headings (H1/H2/H3-like).\n"
            "- Use bullet lists and tables where useful; separate major sections clearly.\n"
            "- Avoid markdown/HTML; produce clean, plain content that can be laid out as PDF.\n"
            "OUTPUT: Return ONLY the PDF-ready textual content (no fences)."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to PDF format.

        Returns:
            A ``(payload, mime_type)`` tuple, one of:

            * base64-encoded PDF and ``"application/pdf"`` on success;
            * the ``HtmlRenderer`` output and ``"text/html"`` when reportlab
              could not be imported;
            * an error message and ``"text/plain"`` when rendering raised.
        """
        try:
            if not REPORTLAB_AVAILABLE:
                # Fallback to HTML if reportlab not available
                from .html_renderer import HtmlRenderer
                html_renderer = HtmlRenderer()
                html_content, _ = await html_renderer.render(extracted_content, title)
                return html_content, "text/html"

            # Generate PDF using reportlab
            pdf_content = self._generate_pdf(extracted_content, title)

            return pdf_content, "application/pdf"

        except Exception as e:
            # Deliberate best-effort boundary: never propagate to the caller.
            self.logger.error(f"Error rendering PDF: {str(e)}")
            # Return minimal fallback
            return f"PDF Generation Error: {str(e)}", "text/plain"

    def _generate_pdf(self, content: str, title: str) -> str:
        """Generate PDF content using reportlab.

        Args:
            content: Plain-text body; see the class docstring for the
                recognised line prefixes.
            title: Text for the title page.

        Returns:
            The generated PDF, base64-encoded as an ASCII string.

        Raises:
            Exception: Any error raised while building the document is
                logged and re-raised unchanged.
        """
        try:
            # Create a buffer to hold the PDF
            buffer = io.BytesIO()

            # Create PDF document
            doc = SimpleDocTemplate(
                buffer,
                pagesize=A4,
                rightMargin=72,
                leftMargin=72,
                topMargin=72,
                bottomMargin=18
            )

            # Get styles
            styles = getSampleStyleSheet()

            # Create custom styles
            title_style = ParagraphStyle(
                'CustomTitle',
                parent=styles['Heading1'],
                fontSize=24,
                spaceAfter=30,
                alignment=TA_CENTER,
                textColor=colors.darkblue
            )

            heading_style = ParagraphStyle(
                'CustomHeading',
                parent=styles['Heading2'],
                fontSize=16,
                spaceAfter=12,
                spaceBefore=12,
                textColor=colors.darkblue
            )

            # (line prefix, paragraph style, gap in points after the heading)
            heading_rules = (
                ('# ', title_style, 12),
                ('## ', heading_style, 8),
                ('### ', styles['Heading3'], 6),
            )

            # Title page. Paragraph parses inline markup, so plain text must
            # be escaped or a stray '<' / '&' can abort the whole build.
            story = [
                Paragraph(escape(title or ""), title_style),
                Spacer(1, 20),
                Paragraph(f"Generated: {self._format_timestamp()}", styles['Normal']),
                PageBreak(),
            ]

            # Body lines collected since the last heading.
            current_section = []

            for raw_line in (content or "").split('\n'):
                line = raw_line.strip()
                if not line:
                    # Keep blank lines inside a section: they are what
                    # separates two adjacent tables from each other.
                    if current_section:
                        current_section.append('')
                    continue

                for prefix, style, gap in heading_rules:
                    if line.startswith(prefix):
                        # A heading closes the section collected so far.
                        if current_section:
                            story.extend(self._process_section(current_section, styles))
                            current_section = []
                        story.append(Paragraph(escape(line[len(prefix):]), style))
                        story.append(Spacer(1, gap))
                        break
                else:
                    current_section.append(line)

            # Process remaining content
            if current_section:
                story.extend(self._process_section(current_section, styles))

            # Build PDF
            doc.build(story)

            # Return the PDF as base64 text so it fits the str return type.
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            self.logger.error(f"Error generating PDF: {str(e)}")
            raise

    def _process_section(self, lines: list, styles) -> list:
        """Process a section of content into PDF elements.

        Consecutive lines containing ``|`` are grouped and rendered as one
        table; every other non-blank line becomes a paragraph or bullet
        item. Content following a table is processed normally rather than
        being dropped, and a blank line ends the current table.

        Args:
            lines: Body lines of one section, in document order.
            styles: Style sheet providing at least the ``'Normal'`` style.

        Returns:
            The flowables for the section, ending with a small spacer.
        """
        elements = []
        table_block = []  # consecutive candidate table lines

        for line in lines:
            if line.strip() and '|' in line:
                table_block.append(line)
                continue

            # Any non-table line (blank or not) terminates a pending table.
            if table_block:
                elements.extend(self._build_table_flowables(table_block, styles))
                table_block = []

            if line.strip():
                elements.append(self._text_flowable(line, styles))

        if table_block:
            elements.extend(self._build_table_flowables(table_block, styles))

        elements.append(Spacer(1, 6))
        return elements

    def _build_table_flowables(self, block: list, styles) -> list:
        """Render a run of ``|`` lines as a table, or as text if it is not one."""
        table_data = self._extract_table_data(block)
        if not table_data:
            # Fewer than two data rows: treat the lines as ordinary text.
            return [self._text_flowable(line, styles) for line in block]

        # Cell strings are passed to Table as-is; only Paragraph text is
        # escaped elsewhere in this class.
        table = Table(table_data)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 14),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))
        return [table, Spacer(1, 12)]

    def _text_flowable(self, line: str, styles):
        """Return a bullet or plain paragraph for one line of body text."""
        text = line.strip()
        if text.startswith(('- ', '* ')):
            # This is a list item
            return Paragraph(f"• {escape(text[2:])}", styles['Normal'])
        # Regular paragraph
        return Paragraph(escape(text), styles['Normal'])

    @staticmethod
    def _split_table_row(line: str) -> List[str]:
        """Split one ``|``-delimited line into stripped cells.

        A single leading and trailing pipe is treated as a border; empty
        cells in between are kept so columns stay aligned.
        """
        row = line.strip()
        if row.startswith('|'):
            row = row[1:]
        if row.endswith('|'):
            row = row[:-1]
        return [cell.strip() for cell in row.split('|')]

    @staticmethod
    def _is_separator_row(cells: List[str]) -> bool:
        """Return True for a markdown header separator such as ``--- | :---:``.

        At least two dashes per cell are required so that a lone ``-``
        placeholder value is still treated as data.
        """
        for cell in cells:
            core = cell.strip(':')
            if len(core) < 2 or core.strip('-'):
                return False
        return True

    def _extract_table_data(self, lines: list) -> list:
        """Extract table data from lines.

        Reads the first run of consecutive lines containing ``|``; the
        first line without a pipe after the run has started ends the
        table. Markdown separator rows and rows with no content are
        skipped, and short rows are padded so the result is rectangular.

        Args:
            lines: Candidate lines, in document order.

        Returns:
            A list of rows (each a list of cell strings), or an empty list
            when fewer than two data rows were found.
        """
        table_data = []
        in_table = False

        for line in lines:
            if '|' not in line:
                if in_table:
                    break
                continue

            in_table = True
            cells = self._split_table_row(line)
            if not any(cells) or self._is_separator_row(cells):
                continue
            table_data.append(cells)

        if len(table_data) < 2:
            return []

        # Pad short rows so the grid stays rectangular.
        width = max(len(row) for row in table_data)
        return [row + [''] * (width - len(row)) for row in table_data]