# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Markdown renderer for report generation.
"""

from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional


def _escapeTableCell(value: Any) -> str:
    """Return *value* as text that is safe inside a single Markdown table cell.

    ``None`` becomes an empty cell, line breaks become ``<br>`` (a table row
    must stay on one physical line) and ``|`` is backslash-escaped so it is
    not read as a column separator.
    """
    if value is None:
        return ""
    text = value if isinstance(value, str) else str(value)
    for lineBreak in ("\r\n", "\n", "\r"):
        text = text.replace(lineBreak, "<br>")
    return text.replace("|", "\\|")


def _formatTableRow(cells: Any) -> str:
    """Format one table row with leading and trailing pipes.

    A value that is not a list/tuple is treated as a single cell instead of
    being iterated (which would split a string into characters).
    """
    if not isinstance(cells, (list, tuple)):
        cells = [cells]
    return "| " + " | ".join(_escapeTableCell(cell) for cell in cells) + " |"


def _coerceHeadingLevel(level: Any) -> int:
    """Convert *level* to an int clamped to the Markdown range 1..6.

    Values that cannot be converted (``None``, non-numeric strings, NaN, ...)
    fall back to level 1 instead of raising.
    """
    try:
        numericLevel = int(level)
    except (TypeError, ValueError, OverflowError):
        numericLevel = 1
    return max(1, min(6, numericLevel))


def _codeFence(code: str) -> str:
    """Return a backtick fence longer than any backtick run inside *code*.

    The fence is at least three backticks; a longer fence is needed so that a
    backtick run inside the code cannot close the block early.
    """
    longestRun = 0
    currentRun = 0
    for char in code:
        currentRun = currentRun + 1 if char == "`" else 0
        if currentRun > longestRun:
            longestRun = currentRun
    return "`" * max(3, longestRun + 1)


class RendererMarkdown(BaseRenderer):
    """Renders content to Markdown format with format-specific extraction."""

    @classmethod
    def getSupportedFormats(cls) -> List[str]:
        """Return supported Markdown formats."""
        return ['md', 'markdown']

    @classmethod
    def getFormatAliases(cls) -> List[str]:
        """Return format aliases."""
        return ['mdown', 'mkd']

    @classmethod
    def getPriority(cls) -> int:
        """Return priority for markdown renderer."""
        return 95

    @classmethod
    def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
        """Return output style classification: Markdown documents are formatted documents."""
        return 'document'

    @classmethod
    def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
        """
        Return list of section content types that Markdown renderer accepts.

        Returns a fresh list copy of ``supportedSectionTypes``, i.e. every
        section type is accepted regardless of ``formatName``.
        """
        # Imported locally, as in the original implementation.
        from modules.datamodels.datamodelJson import supportedSectionTypes
        return list(supportedSectionTypes)

    async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: Optional[str] = None, aiService=None) -> List[RenderedDocument]:
        """Render extracted JSON content to Markdown format.

        Args:
            extractedContent: Structured document content; ``metadata`` and
                ``documents`` keys are read here.
            title: Document title, also used to derive a filename when the
                first document does not provide one.
            userPrompt: Not used by this renderer.
            aiService: Not used by this renderer.

        Returns:
            A single-element list with the rendered document. If rendering
            fails, the document contains a minimal error report instead of
            raising.
        """
        # Resolve metadata once, defensively, so the fallback path below can
        # never raise on a malformed (non-dict) extractedContent.
        rawMetadata = extractedContent.get("metadata", {}) if isinstance(extractedContent, dict) else {}
        metadata = rawMetadata if isinstance(rawMetadata, dict) else None
        documentType = metadata.get("documentType") if metadata is not None else None

        try:
            # Generate markdown from JSON structure
            markdownContent = self._generateMarkdownFromJson(extractedContent, title)

            # Prefer the filename of the first document, else derive from title
            filename = None
            documents = extractedContent.get("documents", [])
            if documents and isinstance(documents[0], dict):
                filename = documents[0].get("filename")
            if not filename:
                filename = self._determineFilename(title, "text/markdown")

            return [
                RenderedDocument(
                    documentData=markdownContent.encode('utf-8'),
                    mimeType="text/markdown",
                    filename=filename,
                    documentType=documentType,
                    metadata=metadata
                )
            ]
        except Exception as e:
            self.logger.error(f"Error rendering markdown: {str(e)}")
            # Return minimal markdown fallback
            fallbackContent = f"# {title}\n\nError rendering report: {str(e)}"
            return [
                RenderedDocument(
                    documentData=fallbackContent.encode('utf-8'),
                    mimeType="text/markdown",
                    filename=self._determineFilename(title, "text/markdown"),
                    documentType=documentType,
                    metadata=metadata
                )
            ]

    def _generateMarkdownFromJson(self, jsonContent: Dict[str, Any], title: str) -> str:
        """Generate markdown content from structured JSON document.

        Emits a level-1 title, each non-empty rendered section followed by a
        blank line, and a trailing generation timestamp.

        Raises:
            Exception: If validation or rendering fails; the original error is
                chained as ``__cause__``.
        """
        try:
            # Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
            if not self._validateJsonStructure(jsonContent):
                raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

            # Extract sections and metadata from standardized schema
            sections = self._extractSections(jsonContent)
            metadata = self._extractMetadata(jsonContent)

            # The title parameter is the primary source; metadata.title is the
            # fallback. `or` also covers a None/empty metadata title, which
            # would otherwise render as "# None" or "# ".
            documentTitle = title
            if not documentTitle:
                metadataTitle = metadata.get("title") if isinstance(metadata, dict) else None
                documentTitle = metadataTitle or "Generated Document"

            markdownParts = [f"# {documentTitle}", ""]

            # Process each section
            for section in sections:
                sectionMarkdown = self._renderJsonSection(section)
                if sectionMarkdown:
                    markdownParts.append(sectionMarkdown)
                    markdownParts.append("")  # Add spacing between sections

            # Add generation info
            markdownParts.append("---")
            markdownParts.append(f"*Generated: {self._formatTimestamp()}*")

            return '\n'.join(markdownParts)
        except Exception as e:
            self.logger.error(f"Error generating markdown from JSON: {str(e)}")
            raise Exception(f"Markdown generation failed: {str(e)}") from e

    def _renderJsonSection(self, section: Dict[str, Any]) -> str:
        """Render a single JSON section to markdown.

        If the section's element list contains ``reference`` or
        ``extracted_text`` elements, only those are rendered (joined by blank
        lines). Otherwise the section is dispatched on its type; unknown types
        are rendered as a paragraph.

        Returns:
            The section markdown, ``""`` when there is nothing to render, or
            an inline error marker if rendering raised.
        """
        try:
            sectionType = self._getSectionType(section)
            sectionData = self._getSectionData(section)

            if isinstance(sectionData, list):
                inlineParts = []
                for element in sectionData:
                    if not isinstance(element, dict):
                        continue
                    elementType = element.get("type", "")
                    if elementType == "reference":
                        # Document reference format
                        label = element.get("label", "Reference")
                        inlineParts.append(f"*[Reference: {label}]*")
                    elif elementType == "extracted_text":
                        # Extracted text format; empty content is skipped
                        content = element.get("content", "")
                        source = element.get("source", "")
                        if content:
                            sourceText = f" *(Source: {source})*" if source else ""
                            inlineParts.append(f"{content}{sourceText}")

                # If we processed reference/extracted_text elements, return them
                if inlineParts:
                    return '\n\n'.join(inlineParts)

            # NOTE: only the first element of a section is rendered; a
            # non-dict first element is replaced by an empty dict.
            firstElement = None
            if isinstance(sectionData, list) and sectionData:
                firstElement = sectionData[0] if isinstance(sectionData[0], dict) else {}

            elementRenderers = {
                "table": self._renderJsonTable,
                "bullet_list": self._renderJsonBulletList,
                "heading": self._renderJsonHeading,
                "code_block": self._renderJsonCodeBlock,
                "image": self._renderJsonImage,
            }
            renderer = elementRenderers.get(sectionType) if isinstance(sectionType, str) else None
            if renderer is not None:
                return renderer(firstElement) if firstElement is not None else ""

            # "paragraph" and unknown types: these additionally accept the
            # section data being a dict itself.
            if firstElement is not None:
                return self._renderJsonParagraph(firstElement)
            if isinstance(sectionData, dict):
                return self._renderJsonParagraph(sectionData)
            return ""
        except Exception as e:
            self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
            return f"*[Error rendering section: {str(e)}]*"

    def _renderJsonTable(self, tableData: Dict[str, Any]) -> str:
        """Render a JSON table to markdown.

        Reads ``content.headers`` and ``content.rows``. Every row is written
        with leading and trailing pipes so that a single-column table is not
        read as a heading followed by ``---``, and cell text is escaped.

        Returns:
            The table markdown, or ``""`` if headers or rows are missing or
            rendering fails.
        """
        try:
            # Extract from nested content structure: element.content.{headers, rows}
            content = tableData.get("content", {})
            if not isinstance(content, dict):
                return ""

            headers = content.get("headers", [])
            rows = content.get("rows", [])
            if not headers or not rows:
                return ""

            tableLines = [
                _formatTableRow(headers),
                "| " + " | ".join("---" for _ in headers) + " |",
            ]
            tableLines.extend(_formatTableRow(row) for row in rows)
            return '\n'.join(tableLines)
        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return ""

    def _renderJsonBulletList(self, listData: Dict[str, Any]) -> str:
        """Render a JSON bullet list to markdown.

        Reads ``content.items``. String items and dict items with a ``text``
        key are rendered as ``- ...`` lines; any other item is skipped.

        Returns:
            The list markdown, or ``""`` if there are no items or rendering
            fails.
        """
        try:
            # Extract from nested content structure: element.content.{items}
            content = listData.get("content", {})
            if not isinstance(content, dict):
                return ""

            items = content.get("items", [])
            if not items:
                return ""

            bulletLines = []
            for item in items:
                if isinstance(item, str):
                    bulletLines.append(f"- {item}")
                elif isinstance(item, dict) and "text" in item:
                    bulletLines.append(f"- {item['text']}")
            return '\n'.join(bulletLines)
        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return ""

    def _renderJsonHeading(self, headingData: Dict[str, Any]) -> str:
        """Render a JSON heading to markdown.

        Reads ``content.text`` and ``content.level``. The level is coerced to
        an int and clamped to 1..6; an unusable level falls back to 1 rather
        than dropping the heading.

        Returns:
            The heading line, or ``""`` if there is no text or rendering fails.
        """
        try:
            # Extract from nested content structure: element.content.{text, level}
            content = headingData.get("content", {})
            if not isinstance(content, dict):
                return ""

            text = content.get("text", "")
            if not text:
                return ""

            level = _coerceHeadingLevel(content.get("level", 1))
            return f"{'#' * level} {text}"
        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return ""

    def _renderJsonParagraph(self, paragraphData: Dict[str, Any]) -> str:
        """Render a JSON paragraph to markdown.

        ``content`` may be a dict with a ``text`` key or a plain string.

        Returns:
            The paragraph text as ``str`` (non-string text is converted so the
            caller can join it), or ``""`` if empty or rendering fails.
        """
        try:
            # Extract from nested content structure
            content = paragraphData.get("content", {})
            if isinstance(content, dict):
                text = content.get("text", "")
            elif isinstance(content, str):
                text = content
            else:
                text = ""

            if not text:
                return ""
            return text if isinstance(text, str) else str(text)
        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return ""

    def _renderJsonCodeBlock(self, codeData: Dict[str, Any]) -> str:
        """Render a JSON code block to markdown.

        Reads ``content.code`` and ``content.language``. The fence is made
        longer than any backtick run in the code so the block cannot be
        closed early by its own content.

        Returns:
            The fenced code block, or ``""`` if there is no code or rendering
            fails.
        """
        try:
            # Extract from nested content structure
            content = codeData.get("content", {})
            if not isinstance(content, dict):
                return ""

            code = content.get("code", "")
            if not code:
                return ""

            codeText = code if isinstance(code, str) else str(code)
            language = content.get("language", "")
            fence = _codeFence(codeText)
            infoString = language if language else ""
            return f"{fence}{infoString}\n{codeText}\n{fence}"
        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return ""

    def _renderJsonImage(self, imageData: Dict[str, Any]) -> str:
        """Render a JSON image to markdown.

        Reads ``content.altText`` and ``content.base64Data``. The image is not
        embedded: when base64 data is present only its first 50 characters are
        placed in the link target (followed by ``...``) as a placeholder.

        Returns:
            An image placeholder, ``""`` if ``content`` is not a dict, or an
            ``image-error`` placeholder if rendering fails.
        """
        # Resolved before/inside the try so the error path never has to touch
        # imageData again (it may be the very thing that raised).
        altText = "Image"
        try:
            # Extract from nested content structure: element.content.{base64Data, altText, caption}
            content = imageData.get("content", {})
            if not isinstance(content, dict):
                return ""

            altText = content.get("altText", "Image")
            base64Data = content.get("base64Data", "")

            if base64Data:
                # Placeholder only: the data URI is deliberately truncated
                return f"![{altText}](data:image/png;base64,{base64Data[:50]}...)"
            return f"![{altText}](image-placeholder)"
        except Exception as e:
            self.logger.warning(f"Error rendering image: {str(e)}")
            return f"![{altText}](image-error)"