472 lines
21 KiB
Python
472 lines
21 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Text renderer for report generation.
|
|
"""
|
|
|
|
import re
|
|
|
|
from .documentRendererBaseTemplate import BaseRenderer
|
|
from modules.datamodels.datamodelDocument import RenderedDocument
|
|
from typing import Dict, Any, List, Optional, Union
|
|
|
|
class RendererText(BaseRenderer):
|
|
"""Renders content to plain text format with format-specific extraction."""
|
|
|
|
@classmethod
|
|
def getSupportedFormats(cls) -> List[str]:
|
|
"""Return supported text formats (excluding formats with dedicated renderers)."""
|
|
return [
|
|
'txt', 'text', 'plain',
|
|
# Programming languages
|
|
'js', 'javascript', 'ts', 'typescript', 'jsx', 'tsx',
|
|
'py', 'python', 'java', 'cpp', 'c', 'h', 'hpp',
|
|
'cs', 'csharp', 'php', 'rb', 'ruby', 'go', 'rs', 'rust',
|
|
'swift', 'kt', 'kotlin', 'scala', 'r', 'm', 'objc',
|
|
'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd',
|
|
# Web technologies (excluding html/htm which have dedicated renderer)
|
|
'css', 'scss', 'sass', 'less', 'xml', 'yaml', 'yml', 'toml', 'ini', 'cfg',
|
|
# Data formats (excluding csv, md/markdown which have dedicated renderers)
|
|
'tsv', 'log', 'rst', 'sql', 'dockerfile', 'dockerignore', 'gitignore',
|
|
# Configuration files
|
|
'env', 'properties', 'conf', 'config', 'rc',
|
|
'gitattributes', 'editorconfig', 'eslintrc',
|
|
# Documentation
|
|
'readme', 'changelog', 'license', 'authors',
|
|
'contributing', 'todo', 'notes', 'docs'
|
|
]
|
|
|
|
@classmethod
|
|
def getFormatAliases(cls) -> List[str]:
|
|
"""Return format aliases."""
|
|
return [
|
|
'ascii', 'utf8', 'utf-8', 'code', 'source',
|
|
'script', 'program', 'file', 'document',
|
|
'raw', 'unformatted', 'plaintext'
|
|
]
|
|
|
|
@classmethod
|
|
def getPriority(cls) -> int:
|
|
"""Return priority for text renderer."""
|
|
return 90
|
|
|
|
@classmethod
|
|
def getOutputStyle(cls, formatName: str = None) -> str:
|
|
"""
|
|
Return output style classification based on format.
|
|
For txt/text/plain: 'document' (unstructured text)
|
|
For all other formats: 'code' (structured formats with rules/syntax)
|
|
|
|
Note: formatName parameter is provided by registry when calling this method.
|
|
"""
|
|
# Plain text formats are document style
|
|
if formatName and formatName.lower() in ['txt', 'text', 'plain']:
|
|
return 'document'
|
|
# All other formats handled by RendererText are code style
|
|
return 'code'
|
|
|
|
@classmethod
def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
    """
    List the section content types the text renderer accepts.

    Every entry of supportedSectionTypes is accepted except "image",
    because text formats cannot display images. formatName is not used
    by this implementation.
    """
    from modules.datamodels.datamodelJson import supportedSectionTypes

    acceptedTypes = []
    for sectionType in supportedSectionTypes:
        if sectionType != "image":
            acceptedTypes.append(sectionType)
    return acceptedTypes
|
|
|
|
async def render(
    self,
    extractedContent: Dict[str, Any],
    title: str,
    userPrompt: str = None,
    aiService=None,
    *,
    style: Dict[str, Any] = None,
) -> List[RenderedDocument]:
    """Render extracted JSON content to a single UTF-8 plain text document.

    Args:
        extractedContent: Structured document JSON; "documents" and
            "metadata" are read from it.
        title: Document title; also used to derive a filename when the
            first document entry does not provide one.
        userPrompt: Not used by this renderer.
        aiService: Not used by this renderer.
        style: Unified style from renderReport; plain text ignores it.

    Returns:
        A one-element list holding the rendered document. When rendering
        fails, the document contains the title and the error message
        instead of the report. Both variants start with a UTF-8 BOM.
    """
    _ = style  # unified style from renderReport; plain text ignores formatting hints
    utf8Bom = b'\xef\xbb\xbf'

    def buildDocument(textPayload, filename):
        # Shared by the success and the fallback path so both produce the
        # same encoding, BOM handling and metadata.
        metadata = extractedContent.get("metadata", {}) if extractedContent else {}
        documentType = metadata.get("documentType") if isinstance(metadata, dict) else None

        # The UTF-8 BOM helps editors/browsers recognize the encoding, so
        # non-ASCII text (e.g. umlauts as in "grössten") is not garbled.
        textBytes = textPayload.encode('utf-8')
        if not textBytes.startswith(utf8Bom):
            textBytes = utf8Bom + textBytes

        return RenderedDocument(
            documentData=textBytes,
            mimeType="text/plain",
            filename=filename,
            documentType=documentType,
            metadata=metadata if isinstance(metadata, dict) else None
        )

    try:
        # Generate text from JSON structure
        textContent = self._generateTextFromJson(extractedContent, title)

        # Prefer the filename of the first document entry, else derive one from the title
        documents = extractedContent.get("documents", [])
        filename = None
        if documents and isinstance(documents[0], dict):
            filename = documents[0].get("filename")
        if not filename:
            filename = self._determineFilename(title, "text/plain")

        return [buildDocument(textContent, filename)]

    except Exception as e:
        self.logger.error(f"Error rendering text: {str(e)}")
        # Return minimal text fallback (previously emitted without the BOM)
        fallbackContent = f"{title}\n\nError rendering report: {str(e)}"
        return [buildDocument(fallbackContent, self._determineFilename(title, "text/plain"))]
|
|
|
|
def _generateTextFromJson(self, jsonContent: Dict[str, Any], title: str) -> str:
    """Generate text content from a structured JSON document.

    The output is the underlined document title, the rendered sections
    (each followed by an empty line) and a trailing "Generated: ..." line.

    Args:
        jsonContent: Document JSON; it must pass _validateJsonStructure.
        title: Preferred document title. When empty, the metadata title is
            used, and "Generated Document" when that is empty as well.

    Raises:
        Exception: "Text generation failed: ..." for any failure, chained
            to the original error.
    """
    try:
        # Validate JSON structure
        if not self._validateJsonStructure(jsonContent):
            raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

        # Extract sections and metadata from standardized schema
        sections = self._extractSections(jsonContent)
        metadata = self._extractMetadata(jsonContent)

        # The title parameter (which comes from documents[].title) wins.
        # A metadata title that is missing, None or empty must not reach
        # len() below, so every fallback step is checked for emptiness.
        metadataTitle = metadata.get("title") if isinstance(metadata, dict) else None
        documentTitle = str(title or metadataTitle or "Generated Document")

        # Document title, underlined
        textParts = [documentTitle, "=" * len(documentTitle), ""]

        # Process each section; empty renderings are skipped entirely
        for section in sections:
            sectionText = self._renderJsonSection(section)
            if sectionText:
                textParts.append(sectionText)
                textParts.append("")  # Add spacing between sections

        # Add generation info
        textParts.append("")
        textParts.append(f"Generated: {self._formatTimestamp()}")

        return '\n'.join(textParts)

    except Exception as e:
        self.logger.error(f"Error generating text from JSON: {str(e)}")
        raise Exception(f"Text generation failed: {str(e)}") from e
|
|
|
|
def _renderJsonSection(self, section: Dict[str, Any]) -> str:
    """Render a single JSON section to text.

    Elements of type "reference" and "extracted_text" take precedence:
    when the section contains any of them, only those are rendered,
    joined by blank lines. Otherwise the section type selects the
    renderer. table, bullet_list, heading, code_block and image render
    the first element only; paragraph and unknown types render every
    element as a paragraph, one per line.

    Section data that is not a list is treated as an empty section.

    Returns:
        The rendered text, "" for an empty section, or an
        "[Error rendering section: ...]" marker when rendering raised.
    """
    try:
        sectionType = self._getSectionType(section)
        sectionData = self._getSectionData(section)
        elements = sectionData if isinstance(sectionData, list) else []

        # Reference / extracted_text content formats
        specialParts = []
        for element in elements:
            if not isinstance(element, dict):
                continue
            elementType = element.get("type", "")

            if elementType == "reference":
                # Document reference format: only the label is shown
                label = element.get("label", "Reference")
                specialParts.append(f"[Reference: {label}]")

            elif elementType == "extracted_text":
                # Extracted text format (str or raw bytes from ContentPart)
                content = element.get("content", "")
                if isinstance(content, (bytes, bytearray, memoryview)):
                    content = bytes(content).decode("utf-8", errors="replace")
                source = element.get("source", "")
                if content:
                    sourceText = f" (Source: {source})" if source else ""
                    specialParts.append(f"{content}{sourceText}")

        # If we processed reference/extracted_text elements, return them
        if specialParts:
            return '\n\n'.join(specialParts)

        # Section types that are rendered from their first element only
        firstElementRenderers = {
            "table": self._renderJsonTable,
            "bullet_list": self._renderJsonBulletList,
            "heading": self._renderJsonHeading,
            "code_block": self._renderJsonCodeBlock,
            "image": self._renderJsonImage,
        }
        # isinstance guard: an unhashable section type must fall through
        # to the paragraph fallback instead of failing the dict lookup
        renderElement = firstElementRenderers.get(sectionType) if isinstance(sectionType, str) else None
        if renderElement is not None:
            if not elements:
                return ""
            firstElement = elements[0] if isinstance(elements[0], dict) else {}
            return renderElement(firstElement)

        # "paragraph" and unknown types: render each element as a paragraph
        return "\n".join(self._renderJsonParagraph(element) for element in elements)

    except Exception as e:
        self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
        return f"[Error rendering section: {str(e)}]"
|
|
|
|
def _renderJsonTable(self, tableData: Dict[str, Any]) -> str:
|
|
"""Render a JSON table to text."""
|
|
try:
|
|
# Extract from nested content structure: element.content.{headers, rows}
|
|
content = tableData.get("content", {})
|
|
if not isinstance(content, dict):
|
|
return ""
|
|
headers = content.get("headers", [])
|
|
rows = content.get("rows", [])
|
|
|
|
if not headers or not rows:
|
|
return ""
|
|
|
|
textParts = []
|
|
|
|
# Create table header
|
|
headerLine = " | ".join(self._tableCellToPlainText(h) for h in headers)
|
|
textParts.append(headerLine)
|
|
|
|
# Add separator line
|
|
separatorLine = " | ".join("-" * len(self._tableCellToPlainText(h)) for h in headers)
|
|
textParts.append(separatorLine)
|
|
|
|
# Add data rows
|
|
for row in rows:
|
|
rowLine = " | ".join(self._tableCellToPlainText(cellData) for cellData in row)
|
|
textParts.append(rowLine)
|
|
|
|
return '\n'.join(textParts)
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering table: {str(e)}")
|
|
return ""
|
|
|
|
def _renderJsonBulletList(self, listData: Dict[str, Any]) -> str:
|
|
"""Render a JSON bullet list to text. Strips markdown from item text."""
|
|
try:
|
|
# Extract from nested content structure: element.content.{items}
|
|
content = listData.get("content", {})
|
|
if not isinstance(content, dict):
|
|
return ""
|
|
items = content.get("items", [])
|
|
|
|
if not items:
|
|
return ""
|
|
|
|
textParts = []
|
|
for item in items:
|
|
if isinstance(item, str):
|
|
textParts.append(f"- {self._stripMarkdownForPlainText(item)}")
|
|
elif isinstance(item, dict) and "text" in item:
|
|
textParts.append(f"- {self._stripMarkdownForPlainText(item['text'])}")
|
|
elif isinstance(item, list):
|
|
# markdownToDocumentJson: each item is List[InlineRun]
|
|
textParts.append(f"- {self._inlineRunsToPlainText(item)}")
|
|
|
|
return '\n'.join(textParts)
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering bullet list: {str(e)}")
|
|
return ""
|
|
|
|
def _renderJsonHeading(self, headingData: Dict[str, Any]) -> str:
|
|
"""Render a JSON heading to text. Strips markdown from heading text."""
|
|
try:
|
|
# Extract from nested content structure: element.content.{text, level}
|
|
content = headingData.get("content", {})
|
|
if isinstance(content, dict) and content:
|
|
text = self._stripMarkdownForPlainText(content.get("text", ""))
|
|
level = content.get("level", 1)
|
|
else:
|
|
# AI shorthand: {"type":"heading","text":"...","level":2}
|
|
text = self._stripMarkdownForPlainText(str(headingData.get("text", "") or ""))
|
|
level = headingData.get("level", 1)
|
|
if not text:
|
|
return ""
|
|
|
|
try:
|
|
level_i = int(level) if level is not None else 1
|
|
except (TypeError, ValueError):
|
|
level_i = 1
|
|
level_i = max(1, min(6, level_i))
|
|
if level_i == 1:
|
|
return f"{text}\n{'=' * len(text)}"
|
|
if level_i == 2:
|
|
return f"{text}\n{'-' * len(text)}"
|
|
return f"{'#' * level_i} {text}"
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering heading: {str(e)}")
|
|
return ""
|
|
|
|
def _stripMarkdownForPlainText(self, text: str) -> str:
|
|
"""Strip markdown formatting for plain text output (**bold** -> bold, *italic* -> italic)."""
|
|
if not text:
|
|
return ""
|
|
# **bold** and __bold__ -> plain
|
|
text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
|
|
text = re.sub(r'__(.+?)__', r'\1', text)
|
|
# *italic* and _italic_ -> plain
|
|
text = re.sub(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)', r'\1', text)
|
|
text = re.sub(r'(?<!_)_(?!_)([^_]+)(?<!_)_(?!_)', r'\1', text)
|
|
# `code` -> plain
|
|
text = re.sub(r'`([^`]+)`', r'\1', text)
|
|
return text.strip()
|
|
|
|
def _inlineRunsToPlainText(self, runs: Union[List[Any], Any]) -> str:
|
|
"""Flatten InlineRun dicts (from markdownToDocumentJson) to a single string."""
|
|
if runs is None:
|
|
return ""
|
|
if isinstance(runs, dict):
|
|
runs = [runs]
|
|
if not isinstance(runs, list):
|
|
return self._stripMarkdownForPlainText(str(runs))
|
|
parts: List[str] = []
|
|
for run in runs:
|
|
if not isinstance(run, dict):
|
|
parts.append(str(run))
|
|
continue
|
|
t = run.get("type") or "text"
|
|
val = run.get("value", "")
|
|
if t == "text":
|
|
parts.append(str(val))
|
|
elif t in ("bold", "italic", "code"):
|
|
parts.append(str(val))
|
|
elif t == "link":
|
|
parts.append(str(val))
|
|
elif t == "image":
|
|
parts.append(f"[{val}]")
|
|
else:
|
|
parts.append(str(val))
|
|
return "".join(parts)
|
|
|
|
def _tableCellToPlainText(self, cell: Any) -> str:
|
|
"""Table header/cell: plain str, legacy dict, or List[InlineRun]."""
|
|
if cell is None:
|
|
return ""
|
|
if isinstance(cell, str):
|
|
return self._stripMarkdownForPlainText(cell)
|
|
if isinstance(cell, list):
|
|
return self._inlineRunsToPlainText(cell)
|
|
if isinstance(cell, dict) and "text" in cell:
|
|
return self._stripMarkdownForPlainText(str(cell["text"]))
|
|
return self._stripMarkdownForPlainText(str(cell))
|
|
|
|
def _renderJsonParagraph(self, paragraphData: Dict[str, Any]) -> str:
|
|
"""Render a JSON paragraph to text. Strips markdown for plain text output."""
|
|
try:
|
|
# Models often return {"type":"paragraph","text":"..."} without nested "content"
|
|
top = paragraphData.get("text")
|
|
raw_content = paragraphData.get("content", {})
|
|
if isinstance(top, str) and top.strip():
|
|
if raw_content is None or raw_content == {}:
|
|
return self._stripMarkdownForPlainText(top)
|
|
if isinstance(raw_content, dict):
|
|
if not (raw_content.get("text") or raw_content.get("inlineRuns")):
|
|
return self._stripMarkdownForPlainText(top)
|
|
|
|
content = raw_content
|
|
if content is None:
|
|
content = {}
|
|
if isinstance(content, dict):
|
|
runs = self._inlineRunsFromContent(content)
|
|
if runs:
|
|
return self._stripMarkdownForPlainText(self._inlineRunsToPlainText(runs))
|
|
text = content.get("text", "")
|
|
elif isinstance(content, str):
|
|
text = content
|
|
else:
|
|
text = ""
|
|
return self._stripMarkdownForPlainText(text) if text else ""
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering paragraph: {str(e)}")
|
|
return ""
|
|
|
|
def _renderJsonCodeBlock(self, codeData: Dict[str, Any]) -> str:
|
|
"""Render a JSON code block to text."""
|
|
try:
|
|
# Extract from nested content structure: element.content.{code, language}
|
|
content = codeData.get("content", {})
|
|
if not isinstance(content, dict):
|
|
return ""
|
|
code = content.get("code", "")
|
|
language = content.get("language", "")
|
|
|
|
if code:
|
|
if language:
|
|
return f"Code ({language}):\n{code}"
|
|
else:
|
|
return code
|
|
|
|
return ""
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering code block: {str(e)}")
|
|
return ""
|
|
|
|
def _renderJsonImage(self, imageData: Dict[str, Any]) -> str:
|
|
"""Render a JSON image to text."""
|
|
try:
|
|
# Extract from nested content structure: element.content.{base64Data, altText, caption}
|
|
content = imageData.get("content", {})
|
|
if isinstance(content, dict):
|
|
altText = content.get("altText", "Image")
|
|
else:
|
|
altText = imageData.get("altText", "Image")
|
|
return f"[Image: {altText}]"
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering image: {str(e)}")
|
|
return f"[Image: Image]"
|