293 lines
No EOL
11 KiB
Python
293 lines
No EOL
11 KiB
Python
"""
|
|
DOCX renderer for report generation using python-docx.
|
|
"""
|
|
|
|
from .base_renderer import BaseRenderer
|
|
from typing import Dict, Any, Tuple, List
|
|
import io
|
|
import base64
|
|
from datetime import datetime, UTC
|
|
|
|
try:
|
|
from docx import Document
|
|
from docx.shared import Inches, Pt
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
from docx.enum.table import WD_TABLE_ALIGNMENT
|
|
from docx.oxml.shared import OxmlElement, qn
|
|
from docx.oxml.ns import nsdecls
|
|
from docx.oxml import parse_xml
|
|
DOCX_AVAILABLE = True
|
|
except ImportError:
|
|
DOCX_AVAILABLE = False
|
|
|
|
class DocxRenderer(BaseRenderer):
    """Renders content to DOCX format using python-docx.

    The renderer turns loosely structured plain text (optionally containing
    markdown-style headings, lists and pipe tables) into a Word document and
    returns it base64-encoded.

    NOTE(review): ``self.logger`` and ``self._format_timestamp()`` are not
    defined in this class; presumably they are provided by ``BaseRenderer``.
    """

    # Markdown defines heading levels 1-6; longer runs of '#' are treated as text.
    _MAX_MARKDOWN_HEADING_LEVEL = 6

    # Prefixes that mark a bullet list item.
    _BULLET_PREFIXES = ('- ', '* ')

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported DOCX formats."""
        return ['docx', 'doc']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['word', 'document']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for DOCX renderer."""
        return 115

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Get DOCX-specific extraction prompt.

        Args:
            user_prompt: The user's request; placed at the top of the prompt.
            title: Report title; interpolated into the generation instruction.

        Returns:
            The prompt text asking for plain-text content that this renderer
            can convert into a Word document.
        """
        return f"""
{user_prompt}

Generate a comprehensive DOCX report with the title: "{title}"

DOCX FORMAT REQUIREMENTS:
- Create structured content suitable for Word documents
- Use clear headings and sections with proper hierarchy
- Include tables for structured data
- Use bullet points and numbered lists where appropriate
- Include source document information
- Structure content for professional presentation
- Use consistent formatting throughout

DOCX STRUCTURE:
- Title page with report title and generation date
- Table of contents (if multiple sections)
- Executive summary
- Main content sections with clear headings
- Data tables and analysis
- Conclusions and recommendations
- Appendices with source information

FORMATTING RULES:
- Use clear section headings (H1, H2, H3 style)
- Include consistent paragraph formatting
- Use tables with proper alignment and borders
- Use bullet points and numbered lists
- Add source citations and references
- Include generation metadata
- Use professional fonts and spacing

OUTPUT POLICY:
- Return ONLY plain text content suitable for Word document generation
- NO markdown formatting (no **bold**, no # headings, no --- separators)
- NO HTML tags
- NO code blocks
- Use plain text with clear structure
- Use line breaks for separation
- Use indentation for lists
- Use ALL CAPS for major headings
- Use Title Case for subheadings
- Use bullet points with dashes (-) for lists
- Use numbers (1., 2., 3.) for numbered lists
- Professional document format
- Include all necessary information

CRITICAL: Use the actual data from the source documents to create the content. Do not generate placeholder text or templates. Extract and use the real data provided in the source documents to create meaningful content.

Generate the complete DOCX report content using the actual data from the source documents:
"""

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to DOCX format.

        Args:
            extracted_content: Plain-text report body to convert.
            title: Document title.

        Returns:
            A ``(content, mime_type)`` tuple:
            - base64-encoded DOCX bytes with the DOCX MIME type on success;
            - HTML with ``text/html`` when python-docx is not installed;
            - an error message with ``text/plain`` if rendering fails.
        """
        try:
            if not DOCX_AVAILABLE:
                # Fallback to HTML if python-docx not available
                from .html_renderer import HtmlRenderer
                html_renderer = HtmlRenderer()
                html_content, _ = await html_renderer.render(extracted_content, title)
                return html_content, "text/html"

            # Generate DOCX using python-docx
            docx_content = self._generate_docx(extracted_content, title)

            return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

        except Exception as e:
            self.logger.error(f"Error rendering DOCX: {str(e)}")
            # Return minimal fallback
            return f"DOCX Generation Error: {str(e)}", "text/plain"

    def _generate_docx(self, content: str, title: str) -> str:
        """Generate DOCX content using python-docx.

        The document starts with a centered title and generation date on
        their own page. The content is then scanned line by line: heading
        lines become Word headings, and the lines between two headings are
        handed to ``_process_section`` as one section.

        Args:
            content: Plain-text report body; ``None`` is treated as empty.
            title: Document title.

        Returns:
            The generated DOCX file, base64-encoded as a UTF-8 string.

        Raises:
            Exception: Any error from python-docx is logged and re-raised.
        """
        try:
            # Create new document
            doc = Document()

            # Set up document styles
            self._setup_document_styles(doc)

            # Add title
            title_para = doc.add_heading(title, 0)
            title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            # Add generation date
            date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
            date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            # Add page break
            doc.add_page_break()

            # Process content
            current_section: List[str] = []

            for raw_line in (content or '').splitlines():
                line = raw_line.strip()

                if not line:
                    # Keep a blank marker inside a running section so that
                    # _process_section can tell where a table ends.
                    if current_section:
                        current_section.append('')
                    continue

                heading = self._classify_heading(line)
                if heading is None:
                    current_section.append(line)
                    continue

                # A heading closes the section collected so far.
                if current_section:
                    self._process_section(doc, current_section)
                    current_section = []

                level, text = heading
                doc.add_heading(text, level=level)

            # Process remaining content
            if current_section:
                self._process_section(doc, current_section)

            # Save to buffer
            buffer = io.BytesIO()
            doc.save(buffer)

            # Convert to base64
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            self.logger.error(f"Error generating DOCX: {str(e)}")
            raise

    def _classify_heading(self, line: str) -> Tuple[int, str] | None:
        """Decide whether a stripped, non-empty line is a heading.

        Args:
            line: A single content line with surrounding whitespace removed.

        Returns:
            ``(level, text)`` if the line is a heading, otherwise ``None``.
        """
        # Markdown headings are checked first: "## Executive Summary" is also
        # Title Case, and would otherwise become a heading that still carries
        # the literal '#' characters.
        hash_count = len(line) - len(line.lstrip('#'))
        if (1 <= hash_count <= self._MAX_MARKDOWN_HEADING_LEVEL
                and line[hash_count:hash_count + 1] == ' '):
            text = line[hash_count:].strip()
            if text:
                return hash_count, text

        # Table rows ("Name | Age") and bullet items are frequently Title Case
        # or ALL CAPS but must stay in the section body.
        if '|' in line or line.startswith(('-', '*')):
            return None

        # ALL CAPS lines are major headings.
        if line.isupper() and len(line) > 3:
            return 1, line

        # Title Case lines are subheadings, unless they look like a numbered
        # list item ("7. Review The Data").
        if line.istitle() and len(line) > 5 and not self._starts_with_number_dot(line):
            return 2, line

        return None

    @staticmethod
    def _starts_with_number_dot(line: str) -> bool:
        """Return True if the line starts with digits followed by a dot ("12.")."""
        number, dot, _ = line.partition('.')
        return bool(dot) and number.isascii() and number.isdigit()

    def _setup_document_styles(self, doc):
        """Set up document styles.

        Sets the 'Normal' style to Calibri 11pt and the 'Heading 1' to
        'Heading 3' styles to bold Calibri at 14pt, 12pt and 10pt. Failures
        are logged as warnings and otherwise ignored, so styling problems
        never abort document generation.
        """
        try:
            # Set default font
            style = doc.styles['Normal']
            font = style.font
            font.name = 'Calibri'
            font.size = Pt(11)

            # Set heading styles
            for i in range(1, 4):
                heading_style = doc.styles[f'Heading {i}']
                heading_font = heading_style.font
                heading_font.name = 'Calibri'
                heading_font.size = Pt(16 - i * 2)
                heading_font.bold = True
        except Exception as e:
            self.logger.warning(f"Could not set up document styles: {str(e)}")

    def _process_section(self, doc, lines: list):
        """Process a section of content into DOCX elements.

        Each line becomes a bullet item, a numbered item or a paragraph.
        A run of consecutive lines containing '|' becomes one table, after
        which processing continues with the lines that follow it.

        Args:
            doc: The python-docx document being built.
            lines: Section lines; empty strings are skipped and only serve
                to separate adjacent tables.
        """
        total = len(lines)
        index = 0

        while index < total:
            line = lines[index].strip()

            if not line:
                index += 1
                continue

            # Check for tables (lines with |)
            if '|' in line:
                # Collect only the contiguous run of pipe lines so that text
                # before and after the table is still rendered.
                end = index
                while end < total and '|' in lines[end]:
                    end += 1

                table_data = self._extract_table_data(lines[index:end])
                if table_data:
                    self._add_table(doc, table_data)
                    index = end
                    continue
                # Fewer than two usable rows: not a table, render as text.

            number, separator, numbered_text = line.partition('. ')

            # Check for lists
            if line.startswith(self._BULLET_PREFIXES):
                # This is a list item
                doc.add_paragraph(line[2:].strip(), style='List Bullet')
            elif separator and number.isascii() and number.isdigit():
                # This is a numbered list item (any number, not only 1-5)
                doc.add_paragraph(numbered_text.strip(), style='List Number')
            else:
                # Regular paragraph
                doc.add_paragraph(line)

            index += 1

    def _extract_table_data(self, lines: list) -> list:
        """Extract table data from lines.

        Reads the first contiguous run of lines containing '|'. Lines before
        the run are ignored; the first line without '|' after the run has
        started ends the table.

        Args:
            lines: Candidate lines, e.g. ``["Name | Age", "Ann | 30"]``.

        Returns:
            A list of rows, each a list of stripped cell strings, or ``[]``
            when fewer than two data rows were found.
        """
        table_data = []
        in_table = False

        for line in lines:
            if '|' not in line:
                if in_table:
                    break
                continue

            in_table = True

            # Drop the optional outer pipes of "| a | b |" but keep empty
            # inner cells, so later cells stay in their own column.
            row_text = line.strip()
            if row_text.startswith('|'):
                row_text = row_text[1:]
            if row_text.endswith('|'):
                row_text = row_text[:-1]

            cells = [cell.strip() for cell in row_text.split('|')]

            if self._is_separator_row(cells):
                # Markdown header separator ("---|:---:"), not data.
                continue

            if any(cells):
                table_data.append(cells)

        return table_data if len(table_data) > 1 else []

    @staticmethod
    def _is_separator_row(cells: list) -> bool:
        """Return True if every cell is a markdown separator such as '---' or ':--:'.

        At least two dashes are required per cell, so a lone '-' used as a
        placeholder value is kept as data.
        """
        return bool(cells) and all(
            cell.count('-') >= 2 and not cell.strip('-:')
            for cell in cells
        )

    def _add_table(self, doc, table_data: list):
        """Add a table to the document.

        Args:
            doc: The python-docx document being built.
            table_data: Rows of cell strings; rows may differ in length, in
                which case shorter rows leave their trailing cells empty.

        Failures are logged as warnings and otherwise ignored.
        """
        try:
            if not table_data:
                return

            # Size the table for the widest row so no cell is cut off.
            column_count = max(len(row_data) for row_data in table_data)
            if column_count == 0:
                return

            # Create table
            table = doc.add_table(rows=len(table_data), cols=column_count)
            table.alignment = WD_TABLE_ALIGNMENT.CENTER

            # Add data to table
            for row, row_data in zip(table.rows, table_data):
                # Read row.cells once per row instead of once per cell.
                for cell, cell_data in zip(row.cells, row_data):
                    cell.text = str(cell_data)

            # Style the table
            self._style_table(table)

        except Exception as e:
            self.logger.warning(f"Could not add table: {str(e)}")

    def _style_table(self, table):
        """Apply styling to the table.

        Makes every run in the first (header) row bold. Failures are logged
        as warnings and otherwise ignored.
        """
        try:
            # Style header row
            if len(table.rows) > 0:
                for cell in table.rows[0].cells:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True
        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")