249 lines
No EOL
9.6 KiB
Python
249 lines
No EOL
9.6 KiB
Python
"""
DOCX renderer for report generation using python-docx.
"""
|
|
|
|
from .base_renderer import BaseRenderer
|
|
from typing import Dict, Any, Tuple, List
|
|
import io
|
|
import base64
|
|
from datetime import datetime, UTC
|
|
|
|
try:
|
|
from docx import Document
|
|
from docx.shared import Inches, Pt
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
from docx.enum.table import WD_TABLE_ALIGNMENT
|
|
from docx.oxml.shared import OxmlElement, qn
|
|
from docx.oxml.ns import nsdecls
|
|
from docx.oxml import parse_xml
|
|
DOCX_AVAILABLE = True
|
|
except ImportError:
|
|
DOCX_AVAILABLE = False
|
|
|
|
class DocxRenderer(BaseRenderer):
    """Renders content to DOCX format using python-docx.

    The extracted content is expected to be plain text.  Headings are
    detected heuristically (Markdown ``#`` prefixes, ALL CAPS lines and
    Title Case lines); everything between two headings is treated as a
    section made of paragraphs, bullet/numbered list items and
    pipe-delimited tables.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported DOCX formats."""
        return ['docx', 'doc']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['word', 'document']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for DOCX renderer."""
        return 115

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only DOCX-specific guidelines; global prompt is built centrally.

        ``user_prompt`` and ``title`` are accepted for interface
        compatibility but are not used here.
        """
        return (
            "DOCX FORMAT GUIDELINES:\n"
            "- Provide plain text content suitable for Word generation (no markdown/HTML).\n"
            "- Use clear section hierarchy; bullet and numbered lists where needed.\n"
            "- Include tables as simple pipe-delimited lines if tabular data is needed.\n"
            "OUTPUT: Return ONLY the structured plain text to be converted into DOCX."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to DOCX format.

        Returns:
            A ``(content, mime_type)`` tuple:

            * base64-encoded DOCX bytes with the WordprocessingML MIME type
              when python-docx is importable;
            * the HTML renderer's output with ``text/html`` when it is not;
            * a plain-text error message with ``text/plain`` if anything
              raises while rendering (best-effort fallback, never raises).
        """
        try:
            if not DOCX_AVAILABLE:
                # Fallback to HTML if python-docx is not available.
                from .html_renderer import HtmlRenderer

                html_renderer = HtmlRenderer()
                html_content, _ = await html_renderer.render(extracted_content, title)
                return html_content, "text/html"

            docx_content = self._generate_docx(extracted_content, title)
            return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

        except Exception as e:
            # Deliberate top-level boundary: callers always get a tuple back.
            self.logger.error(f"Error rendering DOCX: {str(e)}")
            return f"DOCX Generation Error: {str(e)}", "text/plain"

    def _generate_docx(self, content: str, title: str) -> str:
        """Build the document and return it as a base64-encoded string.

        The document starts with a centered title (heading level 0), a
        centered "Generated: ..." line and a page break, followed by the
        parsed content.

        Raises:
            Exception: any error from python-docx is logged and re-raised.
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            title_para = doc.add_heading(title, 0)
            title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            # NOTE(review): _format_timestamp is not defined in this class;
            # presumably inherited from BaseRenderer.
            date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
            date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            doc.add_page_break()

            current_section: List[str] = []
            for raw_line in (content or "").split('\n'):
                line = raw_line.strip()

                if not line:
                    # Keep blank lines inside a section as separators so two
                    # tables divided by an empty line are not merged.
                    if current_section:
                        current_section.append("")
                    continue

                heading = self._classify_heading(line)
                if heading is None:
                    current_section.append(line)
                    continue

                # A heading closes the section collected so far.
                if current_section:
                    self._process_section(doc, current_section)
                    current_section = []
                heading_text, heading_level = heading
                doc.add_heading(heading_text, level=heading_level)

            if current_section:
                self._process_section(doc, current_section)

            buffer = io.BytesIO()
            doc.save(buffer)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            self.logger.error(f"Error generating DOCX: {str(e)}")
            raise

    @staticmethod
    def _starts_with_number_dot(line: str) -> bool:
        """Return True if *line* begins with ASCII digits followed by a dot."""
        head, dot, _ = line.partition('.')
        return bool(dot) and head.isascii() and head.isdigit()

    @staticmethod
    def _split_numbered_item(line: str):
        """Return the text of a ``"<number>. text"`` list item, else ``None``."""
        head, sep, rest = line.partition('. ')
        if sep and head.isascii() and head.isdigit():
            return rest.lstrip()
        return None

    def _classify_heading(self, line: str) -> Tuple[str, int] | None:
        """Decide whether a stripped, non-empty line is a heading.

        Returns:
            ``(heading_text, level)`` for a heading, ``None`` otherwise.
        """
        # Markdown headings are checked first; otherwise "# INTRO" or
        # "## Getting Started" would match the caps/title heuristics below
        # and keep their '#' prefix in the heading text.
        hashes = len(line) - len(line.lstrip('#'))
        if 1 <= hashes <= 6 and line[hashes:hashes + 1] == ' ':
            text = line[hashes:].strip()
            if text:
                return text, hashes

        # List items and pipe-delimited table rows are never headings.  Table
        # header rows such as "Name | Age" are Title Case and would otherwise
        # be pulled out of their table.
        if line.startswith(('-', '*')) or '|' in line:
            return None

        # ALL CAPS lines are major headings.
        if line.isupper() and len(line) > 3:
            return line, 1

        # Title Case lines are subheadings, unless they look like numbered
        # list items ("6. Install The Package").
        if line.istitle() and len(line) > 5 and not self._starts_with_number_dot(line):
            return line, 2

        return None

    def _setup_document_styles(self, doc):
        """Set up document styles.

        Applies Calibri 11pt to the 'Normal' style and bold Calibri at
        14/12/10pt to 'Heading 1'..'Heading 3'.  Failures are logged as
        warnings and otherwise ignored so styling never blocks rendering.
        """
        try:
            normal_font = doc.styles['Normal'].font
            normal_font.name = 'Calibri'
            normal_font.size = Pt(11)

            for level in range(1, 4):
                heading_font = doc.styles[f'Heading {level}'].font
                heading_font.name = 'Calibri'
                heading_font.size = Pt(16 - level * 2)
                heading_font.bold = True
        except Exception as e:
            self.logger.warning(f"Could not set up document styles: {str(e)}")

    def _process_section(self, doc, lines: list):
        """Process a section of content into DOCX elements.

        Consecutive lines containing ``|`` are rendered as one table; the
        remaining lines become bullet items, numbered items or paragraphs.
        Content that follows a table is still rendered.
        """
        total = len(lines)
        index = 0
        while index < total:
            line = lines[index].strip()
            if not line:
                index += 1
                continue

            if '|' in line:
                # Collect the run of adjacent pipe-delimited lines.
                end = index
                while end < total and '|' in lines[end]:
                    end += 1
                table_data = self._extract_table_data(lines[index:end])
                if table_data:
                    self._add_table(doc, table_data)
                    index = end
                    continue
                # Not enough rows for a table: fall through and emit as text.

            self._add_text_line(doc, line)
            index += 1

    def _add_text_line(self, doc, line: str):
        """Add one non-table line as a list item or a regular paragraph."""
        if line.startswith(('- ', '* ')):
            doc.add_paragraph(line[2:], style='List Bullet')
            return

        numbered_text = self._split_numbered_item(line)
        if numbered_text is not None:
            doc.add_paragraph(numbered_text, style='List Number')
            return

        doc.add_paragraph(line)

    def _extract_table_data(self, lines: list) -> list:
        """Extract table data from lines.

        Scans for the first run of lines containing ``|`` and splits each
        into cells.  One optional leading and trailing border pipe is
        removed, empty inner cells are preserved so columns stay aligned,
        and Markdown separator rows (``---|:--:``) are skipped.

        Returns:
            A list of rows (each a list of cell strings), or ``[]`` when
            fewer than two data rows were found.
        """
        table_data = []
        in_table = False

        for line in lines:
            if '|' not in line:
                if in_table:
                    # First non-table line after the table started ends it.
                    break
                continue

            in_table = True
            row = line.strip()
            if row.startswith('|'):
                row = row[1:]
            if row.endswith('|'):
                row = row[:-1]

            cells = [cell.strip() for cell in row.split('|')]
            if not any(cells):
                continue

            is_separator = (
                all(not cell.strip('-:') for cell in cells)
                and any('-' in cell for cell in cells)
            )
            if is_separator:
                continue

            table_data.append(cells)

        return table_data if len(table_data) > 1 else []

    def _add_table(self, doc, table_data: list):
        """Add a table to the document.

        The table is centered and sized to the widest row, so no cell is
        dropped when a data row has more cells than the header.  Failures
        are logged as warnings and otherwise ignored.
        """
        try:
            if not table_data:
                return

            column_count = max(len(row) for row in table_data)
            if column_count == 0:
                return

            table = doc.add_table(rows=len(table_data), cols=column_count)
            table.alignment = WD_TABLE_ALIGNMENT.CENTER

            for row, row_data in zip(table.rows, table_data):
                # Read row.cells once per row instead of once per cell.
                for cell, cell_text in zip(row.cells, row_data):
                    cell.text = cell_text

            self._style_table(table)

        except Exception as e:
            self.logger.warning(f"Could not add table: {str(e)}")

    def _style_table(self, table):
        """Apply styling to the table (bold text in the first row)."""
        try:
            if len(table.rows) == 0:
                return

            for cell in table.rows[0].cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        run.bold = True
        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")