# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Markdown renderer for report generation.
"""

import re
from typing import Dict, Any, List, Optional, Tuple

from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument


class RendererMarkdown(BaseRenderer):
    """Renders content to Markdown format with format-specific extraction.

    Helpers such as ``self.logger``, ``_determineFilename``, ``_formatTimestamp``,
    ``_validateJsonStructure``, ``_extractSections``, ``_extractMetadata``,
    ``_getSectionType``, ``_getSectionData`` and ``_getSectionId`` are not defined
    in this file; they are presumably inherited from ``BaseRenderer``.
    """

    # Maps a section content type to the name of the method that renders it.
    # Names (not bound methods) are stored so subclass overrides are honoured.
    # Any type not listed here falls back to the paragraph renderer.
    _SECTION_RENDERERS: Dict[str, str] = {
        "table": "_renderJsonTable",
        "bullet_list": "_renderJsonBulletList",
        "heading": "_renderJsonHeading",
        "paragraph": "_renderJsonParagraph",
        "code_block": "_renderJsonCodeBlock",
        "image": "_renderJsonImage",
    }
    _DEFAULT_SECTION_RENDERER = "_renderJsonParagraph"

    @classmethod
    def getSupportedFormats(cls) -> List[str]:
        """Return supported Markdown formats."""
        return ['md', 'markdown']

    @classmethod
    def getFormatAliases(cls) -> List[str]:
        """Return format aliases."""
        return ['mdown', 'mkd']

    @classmethod
    def getPriority(cls) -> int:
        """Return priority for markdown renderer."""
        return 95

    @classmethod
    def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
        """Return output style classification: Markdown documents are formatted documents."""
        return 'document'

    @classmethod
    def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
        """
        Return list of section content types that Markdown renderer accepts.
        Markdown renderer accepts all section types except images.
        """
        # Imported locally, as in the original (presumably to avoid an import cycle).
        from modules.datamodels.datamodelJson import supportedSectionTypes
        return [st for st in supportedSectionTypes if st != "image"]

    async def render(self, extractedContent: Dict[str, Any], title: str,
                     userPrompt: Optional[str] = None, aiService=None) -> List[RenderedDocument]:
        """Render extracted JSON content to Markdown format.

        Args:
            extractedContent: Structured document ({metadata, documents: [{sections}]}).
            title: Document title; also used to derive a filename when the first
                document does not carry one.
            userPrompt: Unused by this renderer.
            aiService: Unused by this renderer.

        Returns:
            A single-element list with the rendered Markdown document. If rendering
            fails, the document contains a minimal error report instead of raising.
        """
        try:
            # Generate markdown from JSON structure
            markdownContent = self._generateMarkdownFromJson(extractedContent, title)

            # Determine filename from document or title
            filename = None
            documents = extractedContent.get("documents", [])
            if isinstance(documents, list) and documents and isinstance(documents[0], dict):
                filename = documents[0].get("filename")
            if not filename:
                filename = self._determineFilename(title, "text/markdown")

            metadata, documentType = self._extractRenderMetadata(extractedContent)
            return [
                RenderedDocument(
                    documentData=markdownContent.encode('utf-8'),
                    mimeType="text/markdown",
                    filename=filename,
                    documentType=documentType,
                    metadata=metadata,
                )
            ]
        except Exception as e:
            self.logger.error(f"Error rendering markdown: {str(e)}")
            # Return minimal markdown fallback
            fallbackContent = f"# {title}\n\nError rendering report: {str(e)}"
            # The helper never raises on malformed input, so the fallback path
            # cannot fail while reading metadata.
            metadata, documentType = self._extractRenderMetadata(extractedContent)
            return [
                RenderedDocument(
                    documentData=fallbackContent.encode('utf-8'),
                    mimeType="text/markdown",
                    filename=self._determineFilename(title, "text/markdown"),
                    documentType=documentType,
                    metadata=metadata,
                )
            ]

    @staticmethod
    def _extractRenderMetadata(extractedContent: Any) -> Tuple[Optional[Dict[str, Any]], Any]:
        """Return ``(metadata, documentType)`` for the RenderedDocument.

        ``metadata`` is ``None`` when the stored value is not a dict, and ``{}``
        when the content is empty, missing, or not a dict itself.
        """
        metadata = extractedContent.get("metadata", {}) if isinstance(extractedContent, dict) else {}
        if not isinstance(metadata, dict):
            return None, None
        return metadata, metadata.get("documentType")

    def _generateMarkdownFromJson(self, jsonContent: Dict[str, Any], title: str) -> str:
        """Generate markdown content from structured JSON document.

        Raises:
            Exception: If validation or rendering fails; the original error is
                attached as ``__cause__``.
        """
        try:
            # Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
            if not self._validateJsonStructure(jsonContent):
                raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

            # Extract sections and metadata from standardized schema
            sections = self._extractSections(jsonContent)
            metadata = self._extractMetadata(jsonContent)

            # Use provided title (which comes from documents[].title) as primary source
            # Fallback to metadata.title only if title parameter is empty
            if title:
                documentTitle = title
            elif isinstance(metadata, dict):
                documentTitle = metadata.get("title", "Generated Document")
            else:
                documentTitle = "Generated Document"

            # Document title followed by a blank line
            markdownParts = [f"# {documentTitle}", ""]

            # Process each section; empty renderings are skipped entirely
            for section in sections:
                sectionMarkdown = self._renderJsonSection(section)
                if sectionMarkdown:
                    markdownParts.append(sectionMarkdown)
                    markdownParts.append("")  # Add spacing between sections

            # Add generation info
            markdownParts.append("---")
            markdownParts.append(f"*Generated: {self._formatTimestamp()}*")

            return '\n'.join(markdownParts)

        except Exception as e:
            self.logger.error(f"Error generating markdown from JSON: {str(e)}")
            raise Exception(f"Markdown generation failed: {str(e)}") from e

    def _renderJsonSection(self, section: Dict[str, Any]) -> str:
        """Render a single JSON section to markdown.

        Elements of type ``reference`` and ``extracted_text`` are rendered first.
        If the section contains any of them, only those are returned and other
        elements in the same section are not rendered. Otherwise the section is
        dispatched on its type, and only the FIRST element of the section is
        rendered.

        Returns:
            The markdown for the section, ``""`` when there is nothing to render,
            or an inline error marker if rendering raised.
        """
        try:
            sectionType = self._getSectionType(section)
            sectionData = self._getSectionData(section)

            if isinstance(sectionData, list):
                specialParts = self._renderSpecialElements(sectionData)
                if specialParts:
                    return '\n\n'.join(specialParts)

            rendererName = self._DEFAULT_SECTION_RENDERER
            if isinstance(sectionType, str):
                rendererName = self._SECTION_RENDERERS.get(sectionType, rendererName)
            renderer = getattr(self, rendererName)

            if isinstance(sectionData, list) and sectionData:
                # Work directly with elements like other renderers
                firstElement = sectionData[0] if isinstance(sectionData[0], dict) else {}
                return renderer(firstElement)

            # Only paragraph-style sections (including unknown types) also accept
            # the section data as a bare dict instead of a list of elements.
            if rendererName == self._DEFAULT_SECTION_RENDERER and isinstance(sectionData, dict):
                return self._renderJsonParagraph(sectionData)

            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
            return f"*[Error rendering section: {str(e)}]*"

    @staticmethod
    def _renderSpecialElements(elements: List[Any]) -> List[str]:
        """Render ``reference`` and ``extracted_text`` elements; ignore all others."""
        renderedParts: List[str] = []
        for element in elements:
            if not isinstance(element, dict):
                continue
            elementType = element.get("type", "")
            if elementType == "reference":
                # Document reference format: only the label is shown
                label = element.get("label", "Reference")
                renderedParts.append(f"*[Reference: {label}]*")
            elif elementType == "extracted_text":
                # Extracted text format, with optional source attribution
                content = element.get("content", "")
                source = element.get("source", "")
                if content:
                    sourceText = f" *(Source: {source})*" if source else ""
                    renderedParts.append(f"{content}{sourceText}")
        return renderedParts

    @staticmethod
    def _escapeTableCell(value: Any) -> str:
        """Return ``value`` as text that is safe inside one markdown table cell."""
        if value is None:
            return ""
        text = str(value).replace("\r\n", "\n").replace("\r", "\n")
        # An unescaped pipe would start a new cell; a raw newline would end the row.
        return text.replace("|", "\\|").replace("\n", "<br>").strip()

    def _renderJsonTable(self, tableData: Dict[str, Any]) -> str:
        """Render a JSON table to markdown.

        Reads ``tableData["content"]["headers"]`` and ``["rows"]``. Returns ``""``
        when either is missing or empty, or when rendering fails.
        """
        try:
            # Extract from nested content structure: element.content.{headers, rows}
            content = tableData.get("content", {})
            if not isinstance(content, dict):
                return ""

            headers = content.get("headers", [])
            rows = content.get("rows", [])
            if not headers or not rows:
                return ""
            if not isinstance(headers, (list, tuple)) or not isinstance(rows, (list, tuple)):
                return ""

            def formatRow(cells) -> str:
                # Leading and trailing pipes keep single-column tables from being
                # parsed as a setext heading ("text" followed by "---").
                return "| " + " | ".join(cells) + " |"

            markdownParts = [
                formatRow(self._escapeTableCell(header) for header in headers),
                formatRow("---" for _ in headers),
            ]
            for row in rows:
                # A scalar row is treated as a single cell instead of failing the table.
                rowCells = row if isinstance(row, (list, tuple)) else [row]
                markdownParts.append(formatRow(self._escapeTableCell(cell) for cell in rowCells))

            return '\n'.join(markdownParts)

        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return ""

    def _renderJsonBulletList(self, listData: Dict[str, Any]) -> str:
        """Render a JSON bullet list to markdown.

        Items may be strings or dicts with a ``"text"`` key; any other item is skipped.
        """
        try:
            # Extract from nested content structure: element.content.{items}
            content = listData.get("content", {})
            if not isinstance(content, dict):
                return ""

            items = content.get("items", [])
            if not items:
                return ""

            markdownParts = []
            for item in items:
                if isinstance(item, str):
                    markdownParts.append(f"- {item}")
                elif isinstance(item, dict) and "text" in item:
                    markdownParts.append(f"- {item['text']}")

            return '\n'.join(markdownParts)

        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return ""

    def _renderJsonHeading(self, headingData: Dict[str, Any]) -> str:
        """Render a JSON heading to markdown.

        The level is clamped to 1..6; a missing or non-numeric level is treated as 1.
        """
        try:
            # Extract from nested content structure: element.content.{text, level}
            content = headingData.get("content", {})
            if not isinstance(content, dict):
                return ""

            text = content.get("text", "")
            if not text:
                return ""

            try:
                level = int(content.get("level", 1))
            except (TypeError, ValueError):
                # Previously a level such as "2" or None dropped the heading entirely.
                level = 1
            level = max(1, min(6, level))
            return f"{'#' * level} {text}"

        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return ""

    def _renderJsonParagraph(self, paragraphData: Dict[str, Any]) -> str:
        """Render a JSON paragraph to markdown.

        Accepts ``content`` either as a dict with a ``"text"`` key or as a plain string.
        """
        try:
            # Extract from nested content structure
            content = paragraphData.get("content", {})
            if isinstance(content, dict):
                text = content.get("text", "")
            elif isinstance(content, str):
                text = content
            else:
                text = ""

            # Always return str so the caller's '\n'.join cannot fail on odd values.
            return str(text) if text else ""

        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return ""

    def _renderJsonCodeBlock(self, codeData: Dict[str, Any]) -> str:
        """Render a JSON code block to a fenced markdown code block.

        The fence is at least three backticks and always longer than the longest
        backtick run in the code, so the code cannot close the fence early.
        """
        try:
            # Extract from nested content structure
            content = codeData.get("content", {})
            if not isinstance(content, dict):
                return ""

            code = content.get("code", "")
            if not code:
                return ""

            codeText = str(code)
            language = content.get("language", "")
            infoString = str(language).strip() if language else ""

            longestRun = max((len(run) for run in re.findall(r"`+", codeText)), default=0)
            fence = "`" * max(3, longestRun + 1)
            return f"{fence}{infoString}\n{codeText}\n{fence}"

        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return ""

    def _renderJsonImage(self, imageData: Dict[str, Any]) -> str:
        """Render a JSON image to markdown.

        The base64 payload is deliberately truncated to 50 characters, so the
        result is a placeholder link rather than a displayable image.
        """
        altText = "Image"
        try:
            # Extract from nested content structure: element.content.{base64Data, altText, caption}
            content = imageData.get("content", {})
            if not isinstance(content, dict):
                return ""

            altText = content.get("altText", "Image")
            base64Data = content.get("base64Data", "")

            if base64Data:
                # For base64 images, we can't embed them directly in markdown
                # So we'll use a placeholder with the alt text
                return f"![{altText}](data:image/png;base64,{base64Data[:50]}...)"
            return f"![{altText}](image-placeholder)"

        except Exception as e:
            self.logger.warning(f"Error rendering image: {str(e)}")
            # Use the alt text captured above; re-reading imageData here could raise again.
            return f"![{altText}](image-error)"