fixed handovers from generator to renderers
This commit is contained in:
parent
0280879968
commit
3e7c75335a
18 changed files with 2067 additions and 709 deletions
|
|
@ -354,10 +354,11 @@ class AiOpenai(BaseConnectorAi):
|
||||||
|
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
logger.error(f"DALL-E API error: {response.status_code} - {response.text}")
|
logger.error(f"DALL-E API error: {response.status_code} - {response.text}")
|
||||||
return {
|
return AiModelResponse(
|
||||||
"success": False,
|
content="",
|
||||||
"error": f"DALL-E API error: {response.status_code} - {response.text}"
|
success=False,
|
||||||
}
|
error=f"DALL-E API error: {response.status_code} - {response.text}"
|
||||||
|
)
|
||||||
|
|
||||||
responseJson = response.json()
|
responseJson = response.json()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,8 @@ class DocumentMetadata(BaseModel):
|
||||||
sourceDocuments: List[str] = Field(default_factory=list, description="Source document IDs")
|
sourceDocuments: List[str] = Field(default_factory=list, description="Source document IDs")
|
||||||
extractionMethod: str = Field(default="ai_extraction", description="Method used for extraction")
|
extractionMethod: str = Field(default="ai_extraction", description="Method used for extraction")
|
||||||
version: str = Field(default="1.0", description="Document version")
|
version: str = Field(default="1.0", description="Document version")
|
||||||
|
documentType: Optional[str] = Field(default=None, description="Type of document (e.g., 'report', 'invoice', 'analysis')")
|
||||||
|
styles: Optional[Dict[str, Any]] = Field(default=None, description="Document styling configuration")
|
||||||
|
|
||||||
|
|
||||||
class TableData(BaseModel):
|
class TableData(BaseModel):
|
||||||
|
|
@ -112,6 +114,8 @@ class RenderedDocument(BaseModel):
|
||||||
documentData: bytes = Field(description="Document content as bytes")
|
documentData: bytes = Field(description="Document content as bytes")
|
||||||
mimeType: str = Field(description="MIME type of the document (e.g., 'text/html', 'application/pdf')")
|
mimeType: str = Field(description="MIME type of the document (e.g., 'text/html', 'application/pdf')")
|
||||||
filename: str = Field(description="Filename for the document (e.g., 'report.html', 'image.png')")
|
filename: str = Field(description="Filename for the document (e.g., 'report.html', 'image.png')")
|
||||||
|
documentType: Optional[str] = Field(default=None, description="Type of document (e.g., 'report', 'invoice', 'analysis')")
|
||||||
|
metadata: Optional[Dict[str, Any]] = Field(default=None, description="Document metadata (title, author, etc.)")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
json_encoders = {
|
json_encoders = {
|
||||||
|
|
|
||||||
|
|
@ -52,7 +52,7 @@ class StructureFiller:
|
||||||
# Erstelle Operation-ID für Struktur-Abfüllen
|
# Erstelle Operation-ID für Struktur-Abfüllen
|
||||||
fillOperationId = f"{parentOperationId}_structure_filling"
|
fillOperationId = f"{parentOperationId}_structure_filling"
|
||||||
|
|
||||||
# Prüfe ob Struktur Chapters oder Sections hat
|
# Validate structure has chapters
|
||||||
hasChapters = False
|
hasChapters = False
|
||||||
for doc in structure.get("documents", []):
|
for doc in structure.get("documents", []):
|
||||||
if "chapters" in doc:
|
if "chapters" in doc:
|
||||||
|
|
@ -60,9 +60,9 @@ class StructureFiller:
|
||||||
break
|
break
|
||||||
|
|
||||||
if not hasChapters:
|
if not hasChapters:
|
||||||
# Fallback: Alte Struktur mit Sections direkt - verwende alte Logik
|
error_msg = "Structure must have chapters. Legacy section-based structure is not supported."
|
||||||
logger.warning("Structure has no chapters, using legacy section-based filling")
|
logger.error(error_msg)
|
||||||
return await self._fillStructureLegacy(structure, contentParts, userPrompt, fillOperationId)
|
raise ValueError(error_msg)
|
||||||
|
|
||||||
# Starte ChatLog mit Parent-Referenz
|
# Starte ChatLog mit Parent-Referenz
|
||||||
chapterCount = sum(len(doc.get("chapters", [])) for doc in structure.get("documents", []))
|
chapterCount = sum(len(doc.get("chapters", [])) for doc in structure.get("documents", []))
|
||||||
|
|
@ -214,10 +214,11 @@ class StructureFiller:
|
||||||
contentType = section.get("content_type", "paragraph")
|
contentType = section.get("content_type", "paragraph")
|
||||||
useAiCall = section.get("useAiCall", False)
|
useAiCall = section.get("useAiCall", False)
|
||||||
|
|
||||||
# WICHTIG: Wenn keine ContentParts vorhanden sind, kann kein AI-Call gemacht werden
|
# WICHTIG: Wenn keine ContentParts vorhanden sind UND kein generationHint, kann kein AI-Call gemacht werden
|
||||||
if len(contentPartIds) == 0:
|
# Aber: Wenn generationHint vorhanden ist, kann AI auch ohne ContentParts generieren (z.B. Executive Summary)
|
||||||
|
if len(contentPartIds) == 0 and not generationHint:
|
||||||
useAiCall = False
|
useAiCall = False
|
||||||
logger.debug(f"Section {sectionId}: No content parts, setting useAiCall=False")
|
logger.debug(f"Section {sectionId}: No content parts and no generation hint, setting useAiCall=False")
|
||||||
|
|
||||||
elements = []
|
elements = []
|
||||||
|
|
||||||
|
|
@ -259,11 +260,24 @@ class StructureFiller:
|
||||||
"label": part.metadata.get("usageHint", part.label)
|
"label": part.metadata.get("usageHint", part.label)
|
||||||
})
|
})
|
||||||
elif contentFormat == "object":
|
elif contentFormat == "object":
|
||||||
|
# Nested content structure for objects
|
||||||
|
if part.typeGroup == "image":
|
||||||
|
elements.append({
|
||||||
|
"type": "image",
|
||||||
|
"content": {
|
||||||
|
"base64Data": part.data,
|
||||||
|
"altText": part.metadata.get("usageHint", part.label),
|
||||||
|
"caption": part.metadata.get("caption", "")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
else:
|
||||||
elements.append({
|
elements.append({
|
||||||
"type": part.typeGroup,
|
"type": part.typeGroup,
|
||||||
"base64Data": part.data,
|
"content": {
|
||||||
|
"data": part.data,
|
||||||
"mimeType": part.mimeType,
|
"mimeType": part.mimeType,
|
||||||
"altText": part.metadata.get("usageHint", part.label)
|
"label": part.metadata.get("usageHint", part.label)
|
||||||
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
# Aggregiere extracted Parts mit AI
|
# Aggregiere extracted Parts mit AI
|
||||||
|
|
@ -300,11 +314,24 @@ class StructureFiller:
|
||||||
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt (aggregation)")
|
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt (aggregation)")
|
||||||
|
|
||||||
# Verwende callAi für ContentParts-Unterstützung (nicht callAiPlanning!)
|
# Verwende callAi für ContentParts-Unterstützung (nicht callAiPlanning!)
|
||||||
|
# Use IMAGE_GENERATE for image content type
|
||||||
|
operationType = OperationTypeEnum.IMAGE_GENERATE if contentType == "image" else OperationTypeEnum.DATA_ANALYSE
|
||||||
|
|
||||||
|
# For IMAGE_GENERATE, truncate prompt to 4000 chars (DALL-E limit)
|
||||||
|
if operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||||
|
maxPromptLength = 4000
|
||||||
|
if len(generationPrompt) > maxPromptLength:
|
||||||
|
logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
|
||||||
|
# Keep the beginning (task, metadata, generation hint) and truncate from end
|
||||||
|
generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0] # Truncate at last newline
|
||||||
|
|
||||||
|
# For IMAGE_GENERATE, don't pass contentParts - image generation uses prompt only, not content chunks
|
||||||
|
contentPartsForCall = [] if operationType == OperationTypeEnum.IMAGE_GENERATE else extractedParts
|
||||||
request = AiCallRequest(
|
request = AiCallRequest(
|
||||||
prompt=generationPrompt,
|
prompt=generationPrompt,
|
||||||
contentParts=extractedParts, # ALLE PARTS!
|
contentParts=contentPartsForCall, # Empty for IMAGE_GENERATE, all parts for others
|
||||||
options=AiCallOptions(
|
options=AiCallOptions(
|
||||||
operationType=OperationTypeEnum.DATA_ANALYSE,
|
operationType=operationType,
|
||||||
priority=PriorityEnum.BALANCED,
|
priority=PriorityEnum.BALANCED,
|
||||||
processingMode=ProcessingModeEnum.DETAILED
|
processingMode=ProcessingModeEnum.DETAILED
|
||||||
)
|
)
|
||||||
|
|
@ -318,7 +345,32 @@ class StructureFiller:
|
||||||
)
|
)
|
||||||
logger.debug(f"Logged section response: section_content_{sectionId}_response (aggregation)")
|
logger.debug(f"Logged section response: section_content_{sectionId}_response (aggregation)")
|
||||||
|
|
||||||
# Parse und füge zu elements hinzu
|
# Handle IMAGE_GENERATE differently - returns image data directly
|
||||||
|
if contentType == "image" and operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||||
|
import base64
|
||||||
|
# Convert image data to base64 string if needed
|
||||||
|
if isinstance(aiResponse.content, bytes):
|
||||||
|
base64Data = base64.b64encode(aiResponse.content).decode('utf-8')
|
||||||
|
elif isinstance(aiResponse.content, str):
|
||||||
|
# Already base64 string or data URI
|
||||||
|
if aiResponse.content.startswith("data:image/"):
|
||||||
|
# Extract base64 from data URI
|
||||||
|
base64Data = aiResponse.content.split(",", 1)[1]
|
||||||
|
else:
|
||||||
|
base64Data = aiResponse.content
|
||||||
|
else:
|
||||||
|
base64Data = ""
|
||||||
|
|
||||||
|
elements.append({
|
||||||
|
"type": "image",
|
||||||
|
"content": {
|
||||||
|
"base64Data": base64Data,
|
||||||
|
"altText": generationHint or "Generated image",
|
||||||
|
"caption": ""
|
||||||
|
}
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
# Parse JSON response for other content types
|
||||||
generatedElements = json.loads(
|
generatedElements = json.loads(
|
||||||
self.services.utils.jsonExtractString(aiResponse.content)
|
self.services.utils.jsonExtractString(aiResponse.content)
|
||||||
)
|
)
|
||||||
|
|
@ -342,6 +394,117 @@ class StructureFiller:
|
||||||
# NICHT raise - Section wird mit Fehlermeldung gerendert
|
# NICHT raise - Section wird mit Fehlermeldung gerendert
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
# Einzelverarbeitung: Jeder Part einzeln ODER Generation ohne ContentParts
|
||||||
|
# Handle case where no content parts but generationHint exists (e.g., Executive Summary)
|
||||||
|
if len(contentPartIds) == 0 and useAiCall and generationHint:
|
||||||
|
# Generate content from scratch using only generationHint
|
||||||
|
logger.debug(f"Processing section {sectionId}: No content parts, generating from generationHint only")
|
||||||
|
generationPrompt = self._buildSectionGenerationPrompt(
|
||||||
|
section=section,
|
||||||
|
contentParts=[], # NO PARTS
|
||||||
|
userPrompt=userPrompt,
|
||||||
|
generationHint=generationHint,
|
||||||
|
allSections=all_sections_list,
|
||||||
|
sectionIndex=sectionIndex,
|
||||||
|
isAggregation=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Erstelle Operation-ID für Section-Generierung
|
||||||
|
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
|
||||||
|
|
||||||
|
# Starte ChatLog mit Parent-Referenz
|
||||||
|
self.services.chat.progressLogStart(
|
||||||
|
sectionOperationId,
|
||||||
|
"Section Generation",
|
||||||
|
"Section",
|
||||||
|
f"Generating section {sectionId} from generationHint",
|
||||||
|
parentOperationId=fillOperationId
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Debug: Log Prompt
|
||||||
|
self.services.utils.writeDebugFile(
|
||||||
|
generationPrompt,
|
||||||
|
f"section_content_{sectionId}_prompt"
|
||||||
|
)
|
||||||
|
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt")
|
||||||
|
|
||||||
|
# Verwende callAi ohne ContentParts
|
||||||
|
operationType = OperationTypeEnum.IMAGE_GENERATE if contentType == "image" else OperationTypeEnum.DATA_ANALYSE
|
||||||
|
|
||||||
|
# For IMAGE_GENERATE, truncate prompt to 4000 chars (DALL-E limit)
|
||||||
|
if operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||||
|
maxPromptLength = 4000
|
||||||
|
if len(generationPrompt) > maxPromptLength:
|
||||||
|
logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
|
||||||
|
# Keep the beginning (task, metadata, generation hint) and truncate from end
|
||||||
|
generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0] # Truncate at last newline
|
||||||
|
|
||||||
|
request = AiCallRequest(
|
||||||
|
prompt=generationPrompt,
|
||||||
|
contentParts=[], # NO PARTS
|
||||||
|
options=AiCallOptions(
|
||||||
|
operationType=operationType,
|
||||||
|
priority=PriorityEnum.BALANCED,
|
||||||
|
processingMode=ProcessingModeEnum.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
aiResponse = await self.aiService.callAi(request)
|
||||||
|
|
||||||
|
# Debug: Log Response
|
||||||
|
self.services.utils.writeDebugFile(
|
||||||
|
aiResponse.content,
|
||||||
|
f"section_content_{sectionId}_response"
|
||||||
|
)
|
||||||
|
logger.debug(f"Logged section response: section_content_{sectionId}_response")
|
||||||
|
|
||||||
|
# Handle IMAGE_GENERATE differently - returns image data directly
|
||||||
|
if contentType == "image" and operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||||
|
import base64
|
||||||
|
# Convert image data to base64 string if needed
|
||||||
|
if isinstance(aiResponse.content, bytes):
|
||||||
|
base64Data = base64.b64encode(aiResponse.content).decode('utf-8')
|
||||||
|
elif isinstance(aiResponse.content, str):
|
||||||
|
# Already base64 string or data URI
|
||||||
|
if aiResponse.content.startswith("data:image/"):
|
||||||
|
# Extract base64 from data URI
|
||||||
|
base64Data = aiResponse.content.split(",", 1)[1]
|
||||||
|
else:
|
||||||
|
base64Data = aiResponse.content
|
||||||
|
else:
|
||||||
|
base64Data = ""
|
||||||
|
|
||||||
|
elements.append({
|
||||||
|
"type": "image",
|
||||||
|
"content": {
|
||||||
|
"base64Data": base64Data,
|
||||||
|
"altText": generationHint or "Generated image",
|
||||||
|
"caption": ""
|
||||||
|
}
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
# Parse JSON response for other content types
|
||||||
|
generatedElements = json.loads(
|
||||||
|
self.services.utils.jsonExtractString(aiResponse.content)
|
||||||
|
)
|
||||||
|
if isinstance(generatedElements, list):
|
||||||
|
elements.extend(generatedElements)
|
||||||
|
elif isinstance(generatedElements, dict) and "elements" in generatedElements:
|
||||||
|
elements.extend(generatedElements["elements"])
|
||||||
|
|
||||||
|
# ChatLog abschließen
|
||||||
|
self.services.chat.progressLogFinish(sectionOperationId, True)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# Fehlerhafte Section mit Fehlermeldung rendern (kein Abbruch!)
|
||||||
|
self.services.chat.progressLogFinish(sectionOperationId, False)
|
||||||
|
elements.append({
|
||||||
|
"type": "error",
|
||||||
|
"message": f"Error generating section {sectionId}: {str(e)}",
|
||||||
|
"sectionId": sectionId
|
||||||
|
})
|
||||||
|
logger.error(f"Error generating section {sectionId}: {str(e)}")
|
||||||
|
|
||||||
# Einzelverarbeitung: Jeder Part einzeln
|
# Einzelverarbeitung: Jeder Part einzeln
|
||||||
for partId in contentPartIds:
|
for partId in contentPartIds:
|
||||||
part = self._findContentPartById(partId, contentParts)
|
part = self._findContentPartById(partId, contentParts)
|
||||||
|
|
@ -359,12 +522,25 @@ class StructureFiller:
|
||||||
})
|
})
|
||||||
|
|
||||||
elif contentFormat == "object":
|
elif contentFormat == "object":
|
||||||
# Füge base64 Object hinzu
|
# Füge base64 Object hinzu (nested in content structure)
|
||||||
|
if part.typeGroup == "image":
|
||||||
elements.append({
|
elements.append({
|
||||||
"type": part.typeGroup, # "image", "binary", etc.
|
"type": "image",
|
||||||
|
"content": {
|
||||||
"base64Data": part.data,
|
"base64Data": part.data,
|
||||||
|
"altText": part.metadata.get("usageHint", part.label),
|
||||||
|
"caption": part.metadata.get("caption", "")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
# For other object types, use generic structure
|
||||||
|
elements.append({
|
||||||
|
"type": part.typeGroup,
|
||||||
|
"content": {
|
||||||
|
"data": part.data,
|
||||||
"mimeType": part.mimeType,
|
"mimeType": part.mimeType,
|
||||||
"altText": part.metadata.get("usageHint", part.label)
|
"label": part.metadata.get("usageHint", part.label)
|
||||||
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
elif contentFormat == "extracted":
|
elif contentFormat == "extracted":
|
||||||
|
|
@ -403,11 +579,24 @@ class StructureFiller:
|
||||||
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt")
|
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt")
|
||||||
|
|
||||||
# Verwende callAi für ContentParts-Unterstützung
|
# Verwende callAi für ContentParts-Unterstützung
|
||||||
|
# Use IMAGE_GENERATE for image content type
|
||||||
|
operationType = OperationTypeEnum.IMAGE_GENERATE if contentType == "image" else OperationTypeEnum.DATA_ANALYSE
|
||||||
|
|
||||||
|
# For IMAGE_GENERATE, truncate prompt to 4000 chars (DALL-E limit)
|
||||||
|
if operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||||
|
maxPromptLength = 4000
|
||||||
|
if len(generationPrompt) > maxPromptLength:
|
||||||
|
logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
|
||||||
|
# Keep the beginning (task, metadata, generation hint) and truncate from end
|
||||||
|
generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0] # Truncate at last newline
|
||||||
|
|
||||||
|
# For IMAGE_GENERATE, don't pass contentParts - image generation uses prompt only, not content chunks
|
||||||
|
contentPartsForCall = [] if operationType == OperationTypeEnum.IMAGE_GENERATE else [part]
|
||||||
request = AiCallRequest(
|
request = AiCallRequest(
|
||||||
prompt=generationPrompt,
|
prompt=generationPrompt,
|
||||||
contentParts=[part],
|
contentParts=contentPartsForCall,
|
||||||
options=AiCallOptions(
|
options=AiCallOptions(
|
||||||
operationType=OperationTypeEnum.DATA_ANALYSE,
|
operationType=operationType,
|
||||||
priority=PriorityEnum.BALANCED,
|
priority=PriorityEnum.BALANCED,
|
||||||
processingMode=ProcessingModeEnum.DETAILED
|
processingMode=ProcessingModeEnum.DETAILED
|
||||||
)
|
)
|
||||||
|
|
@ -421,7 +610,32 @@ class StructureFiller:
|
||||||
)
|
)
|
||||||
logger.debug(f"Logged section response: section_content_{sectionId}_response")
|
logger.debug(f"Logged section response: section_content_{sectionId}_response")
|
||||||
|
|
||||||
# Parse und füge zu elements hinzu
|
# Handle IMAGE_GENERATE differently - returns image data directly
|
||||||
|
if contentType == "image" and operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||||
|
import base64
|
||||||
|
# Convert image data to base64 string if needed
|
||||||
|
if isinstance(aiResponse.content, bytes):
|
||||||
|
base64Data = base64.b64encode(aiResponse.content).decode('utf-8')
|
||||||
|
elif isinstance(aiResponse.content, str):
|
||||||
|
# Already base64 string or data URI
|
||||||
|
if aiResponse.content.startswith("data:image/"):
|
||||||
|
# Extract base64 from data URI
|
||||||
|
base64Data = aiResponse.content.split(",", 1)[1]
|
||||||
|
else:
|
||||||
|
base64Data = aiResponse.content
|
||||||
|
else:
|
||||||
|
base64Data = ""
|
||||||
|
|
||||||
|
elements.append({
|
||||||
|
"type": "image",
|
||||||
|
"content": {
|
||||||
|
"base64Data": base64Data,
|
||||||
|
"altText": generationHint or "Generated image",
|
||||||
|
"caption": ""
|
||||||
|
}
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
# Parse JSON response for other content types
|
||||||
generatedElements = json.loads(
|
generatedElements = json.loads(
|
||||||
self.services.utils.jsonExtractString(aiResponse.content)
|
self.services.utils.jsonExtractString(aiResponse.content)
|
||||||
)
|
)
|
||||||
|
|
@ -502,16 +716,6 @@ class StructureFiller:
|
||||||
if partId in contentPartsMap:
|
if partId in contentPartsMap:
|
||||||
section["contentPartsMetadata"].append(contentPartsMap[partId])
|
section["contentPartsMetadata"].append(contentPartsMap[partId])
|
||||||
|
|
||||||
# Prüfe ob Sections direkt vorhanden sind (Legacy-Struktur)
|
|
||||||
elif "sections" in doc:
|
|
||||||
for section in doc.get("sections", []):
|
|
||||||
contentPartIds = section.get("contentPartIds", [])
|
|
||||||
if contentPartIds:
|
|
||||||
section["contentPartsMetadata"] = []
|
|
||||||
for partId in contentPartIds:
|
|
||||||
if partId in contentPartsMap:
|
|
||||||
section["contentPartsMetadata"].append(contentPartsMap[partId])
|
|
||||||
|
|
||||||
return structure
|
return structure
|
||||||
|
|
||||||
def _flattenChaptersToSections(
|
def _flattenChaptersToSections(
|
||||||
|
|
@ -542,8 +746,10 @@ class StructureFiller:
|
||||||
"content_type": "heading",
|
"content_type": "heading",
|
||||||
"elements": [{
|
"elements": [{
|
||||||
"type": "heading",
|
"type": "heading",
|
||||||
"content": chapter.get("title"),
|
"content": {
|
||||||
|
"text": chapter.get("title", ""),
|
||||||
"level": chapter.get("level", 1)
|
"level": chapter.get("level", 1)
|
||||||
|
}
|
||||||
}]
|
}]
|
||||||
}
|
}
|
||||||
flattened_doc["sections"].append(heading_section)
|
flattened_doc["sections"].append(heading_section)
|
||||||
|
|
@ -555,276 +761,6 @@ class StructureFiller:
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
async def _fillStructureLegacy(
|
|
||||||
self,
|
|
||||||
structure: Dict[str, Any],
|
|
||||||
contentParts: List[ContentPart],
|
|
||||||
userPrompt: str,
|
|
||||||
fillOperationId: str
|
|
||||||
) -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Legacy: Füllt Struktur mit Sections direkt (für Rückwärtskompatibilität).
|
|
||||||
"""
|
|
||||||
# Starte ChatLog
|
|
||||||
self.services.chat.progressLogStart(
|
|
||||||
fillOperationId,
|
|
||||||
"Structure Filling (Legacy)",
|
|
||||||
"Filling",
|
|
||||||
f"Filling {len(structure.get('documents', [{}])[0].get('sections', []))} sections",
|
|
||||||
parentOperationId=fillOperationId
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
filledStructure = copy.deepcopy(structure)
|
|
||||||
|
|
||||||
# Sammle alle Sections
|
|
||||||
sections_to_process = []
|
|
||||||
all_sections_list = []
|
|
||||||
for doc in filledStructure.get("documents", []):
|
|
||||||
doc_sections = doc.get("sections", [])
|
|
||||||
all_sections_list.extend(doc_sections)
|
|
||||||
for section in doc_sections:
|
|
||||||
sections_to_process.append((doc, section))
|
|
||||||
|
|
||||||
# Verarbeite Sections (bestehende Logik)
|
|
||||||
for sectionIndex, (doc, section) in enumerate(sections_to_process):
|
|
||||||
sectionId = section.get("id")
|
|
||||||
contentPartIds = section.get("contentPartIds", [])
|
|
||||||
contentFormats = section.get("contentFormats", {})
|
|
||||||
# Check both camelCase and snake_case for generationHint
|
|
||||||
generationHint = section.get("generationHint") or section.get("generation_hint")
|
|
||||||
contentType = section.get("content_type", "paragraph")
|
|
||||||
useAiCall = section.get("useAiCall", False)
|
|
||||||
|
|
||||||
# WICHTIG: Wenn keine ContentParts vorhanden sind, kann kein AI-Call gemacht werden
|
|
||||||
if len(contentPartIds) == 0:
|
|
||||||
useAiCall = False
|
|
||||||
logger.debug(f"Section {sectionId} (legacy): No content parts, setting useAiCall=False")
|
|
||||||
|
|
||||||
elements = []
|
|
||||||
|
|
||||||
# Prüfe ob Aggregation nötig ist
|
|
||||||
needsAggregation = self._needsAggregation(
|
|
||||||
contentType=contentType,
|
|
||||||
contentPartCount=len(contentPartIds)
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info(f"Processing section {sectionId} (legacy): contentType={contentType}, contentPartCount={len(contentPartIds)}, useAiCall={useAiCall}, needsAggregation={needsAggregation}, hasGenerationHint={bool(generationHint)}")
|
|
||||||
|
|
||||||
if needsAggregation and useAiCall and generationHint:
|
|
||||||
# Aggregation: Alle Parts zusammen verarbeiten
|
|
||||||
sectionParts = [
|
|
||||||
self._findContentPartById(pid, contentParts)
|
|
||||||
for pid in contentPartIds
|
|
||||||
]
|
|
||||||
sectionParts = [p for p in sectionParts if p is not None]
|
|
||||||
|
|
||||||
if sectionParts:
|
|
||||||
# Filtere nur extracted Parts für Aggregation
|
|
||||||
extractedParts = [
|
|
||||||
p for p in sectionParts
|
|
||||||
if contentFormats.get(p.id, p.metadata.get("contentFormat")) == "extracted"
|
|
||||||
]
|
|
||||||
nonExtractedParts = [
|
|
||||||
p for p in sectionParts
|
|
||||||
if contentFormats.get(p.id, p.metadata.get("contentFormat")) != "extracted"
|
|
||||||
]
|
|
||||||
|
|
||||||
# Verarbeite non-extracted Parts separat
|
|
||||||
for part in nonExtractedParts:
|
|
||||||
contentFormat = contentFormats.get(part.id, part.metadata.get("contentFormat"))
|
|
||||||
|
|
||||||
if contentFormat == "reference":
|
|
||||||
elements.append({
|
|
||||||
"type": "reference",
|
|
||||||
"documentReference": part.metadata.get("documentReference"),
|
|
||||||
"label": part.metadata.get("usageHint", part.label)
|
|
||||||
})
|
|
||||||
elif contentFormat == "object":
|
|
||||||
elements.append({
|
|
||||||
"type": part.typeGroup,
|
|
||||||
"base64Data": part.data,
|
|
||||||
"mimeType": part.mimeType,
|
|
||||||
"altText": part.metadata.get("usageHint", part.label)
|
|
||||||
})
|
|
||||||
|
|
||||||
# Aggregiere extracted Parts mit AI
|
|
||||||
if extractedParts:
|
|
||||||
generationPrompt = self._buildSectionGenerationPrompt(
|
|
||||||
section=section,
|
|
||||||
contentParts=extractedParts,
|
|
||||||
userPrompt=userPrompt,
|
|
||||||
generationHint=generationHint,
|
|
||||||
allSections=all_sections_list,
|
|
||||||
sectionIndex=sectionIndex,
|
|
||||||
isAggregation=True
|
|
||||||
)
|
|
||||||
|
|
||||||
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
|
|
||||||
|
|
||||||
self.services.chat.progressLogStart(
|
|
||||||
sectionOperationId,
|
|
||||||
"Section Generation (Aggregation)",
|
|
||||||
"Section",
|
|
||||||
f"Generating section {sectionId} with {len(extractedParts)} parts",
|
|
||||||
parentOperationId=fillOperationId
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.services.utils.writeDebugFile(
|
|
||||||
generationPrompt,
|
|
||||||
f"section_content_{sectionId}_prompt"
|
|
||||||
)
|
|
||||||
|
|
||||||
request = AiCallRequest(
|
|
||||||
prompt=generationPrompt,
|
|
||||||
contentParts=extractedParts,
|
|
||||||
options=AiCallOptions(
|
|
||||||
operationType=OperationTypeEnum.DATA_ANALYSE,
|
|
||||||
priority=PriorityEnum.BALANCED,
|
|
||||||
processingMode=ProcessingModeEnum.DETAILED
|
|
||||||
)
|
|
||||||
)
|
|
||||||
aiResponse = await self.aiService.callAi(request)
|
|
||||||
|
|
||||||
self.services.utils.writeDebugFile(
|
|
||||||
aiResponse.content,
|
|
||||||
f"section_content_{sectionId}_response"
|
|
||||||
)
|
|
||||||
|
|
||||||
generatedElements = json.loads(
|
|
||||||
self.services.utils.jsonExtractString(aiResponse.content)
|
|
||||||
)
|
|
||||||
if isinstance(generatedElements, list):
|
|
||||||
elements.extend(generatedElements)
|
|
||||||
elif isinstance(generatedElements, dict) and "elements" in generatedElements:
|
|
||||||
elements.extend(generatedElements["elements"])
|
|
||||||
|
|
||||||
self.services.chat.progressLogFinish(sectionOperationId, True)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.services.chat.progressLogFinish(sectionOperationId, False)
|
|
||||||
elements.append({
|
|
||||||
"type": "error",
|
|
||||||
"message": f"Error generating section {sectionId}: {str(e)}",
|
|
||||||
"sectionId": sectionId
|
|
||||||
})
|
|
||||||
logger.error(f"Error generating section {sectionId}: {str(e)}")
|
|
||||||
|
|
||||||
else:
|
|
||||||
# Einzelverarbeitung: Jeder Part einzeln
|
|
||||||
for partId in contentPartIds:
|
|
||||||
part = self._findContentPartById(partId, contentParts)
|
|
||||||
if not part:
|
|
||||||
continue
|
|
||||||
|
|
||||||
contentFormat = contentFormats.get(partId, part.metadata.get("contentFormat"))
|
|
||||||
|
|
||||||
if contentFormat == "reference":
|
|
||||||
elements.append({
|
|
||||||
"type": "reference",
|
|
||||||
"documentReference": part.metadata.get("documentReference"),
|
|
||||||
"label": part.metadata.get("usageHint", part.label)
|
|
||||||
})
|
|
||||||
|
|
||||||
elif contentFormat == "object":
|
|
||||||
elements.append({
|
|
||||||
"type": part.typeGroup,
|
|
||||||
"base64Data": part.data,
|
|
||||||
"mimeType": part.mimeType,
|
|
||||||
"altText": part.metadata.get("usageHint", part.label)
|
|
||||||
})
|
|
||||||
|
|
||||||
elif contentFormat == "extracted":
|
|
||||||
# WICHTIG: Prüfe sowohl useAiCall als auch generationHint
|
|
||||||
if useAiCall and generationHint:
|
|
||||||
# AI-Call mit einzelnen ContentPart
|
|
||||||
logger.debug(f"Processing section {sectionId}: Single extracted part with AI call (useAiCall={useAiCall}, generationHint={bool(generationHint)})")
|
|
||||||
generationPrompt = self._buildSectionGenerationPrompt(
|
|
||||||
section=section,
|
|
||||||
contentParts=[part],
|
|
||||||
userPrompt=userPrompt,
|
|
||||||
generationHint=generationHint,
|
|
||||||
allSections=all_sections_list,
|
|
||||||
sectionIndex=sectionIndex,
|
|
||||||
isAggregation=False
|
|
||||||
)
|
|
||||||
|
|
||||||
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
|
|
||||||
|
|
||||||
self.services.chat.progressLogStart(
|
|
||||||
sectionOperationId,
|
|
||||||
"Section Generation",
|
|
||||||
"Section",
|
|
||||||
f"Generating section {sectionId}",
|
|
||||||
parentOperationId=fillOperationId
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.services.utils.writeDebugFile(
|
|
||||||
generationPrompt,
|
|
||||||
f"section_content_{sectionId}_prompt"
|
|
||||||
)
|
|
||||||
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt")
|
|
||||||
|
|
||||||
request = AiCallRequest(
|
|
||||||
prompt=generationPrompt,
|
|
||||||
contentParts=[part],
|
|
||||||
options=AiCallOptions(
|
|
||||||
operationType=OperationTypeEnum.DATA_ANALYSE,
|
|
||||||
priority=PriorityEnum.BALANCED,
|
|
||||||
processingMode=ProcessingModeEnum.DETAILED
|
|
||||||
)
|
|
||||||
)
|
|
||||||
aiResponse = await self.aiService.callAi(request)
|
|
||||||
|
|
||||||
self.services.utils.writeDebugFile(
|
|
||||||
aiResponse.content,
|
|
||||||
f"section_content_{sectionId}_response"
|
|
||||||
)
|
|
||||||
logger.debug(f"Logged section response: section_content_{sectionId}_response")
|
|
||||||
|
|
||||||
generatedElements = json.loads(
|
|
||||||
self.services.utils.jsonExtractString(aiResponse.content)
|
|
||||||
)
|
|
||||||
if isinstance(generatedElements, list):
|
|
||||||
elements.extend(generatedElements)
|
|
||||||
elif isinstance(generatedElements, dict) and "elements" in generatedElements:
|
|
||||||
elements.extend(generatedElements["elements"])
|
|
||||||
|
|
||||||
self.services.chat.progressLogFinish(sectionOperationId, True)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.services.chat.progressLogFinish(sectionOperationId, False)
|
|
||||||
elements.append({
|
|
||||||
"type": "error",
|
|
||||||
"message": f"Error generating section {sectionId}: {str(e)}",
|
|
||||||
"sectionId": sectionId
|
|
||||||
})
|
|
||||||
logger.error(f"Error generating section {sectionId}: {str(e)}")
|
|
||||||
else:
|
|
||||||
# Füge extrahierten Text direkt hinzu (kein AI-Call)
|
|
||||||
logger.debug(f"Processing section {sectionId}: Single extracted part WITHOUT AI call (useAiCall={useAiCall}, generationHint={bool(generationHint)}) - adding extracted text directly")
|
|
||||||
elements.append({
|
|
||||||
"type": "extracted_text",
|
|
||||||
"content": part.data,
|
|
||||||
"source": part.metadata.get("documentId"),
|
|
||||||
"extractionPrompt": part.metadata.get("extractionPrompt")
|
|
||||||
})
|
|
||||||
|
|
||||||
section["elements"] = elements
|
|
||||||
|
|
||||||
# Füge ContentParts-Metadaten zur Struktur hinzu (für Validierung)
|
|
||||||
filledStructure = self._addContentPartsMetadata(filledStructure, contentParts)
|
|
||||||
|
|
||||||
self.services.chat.progressLogFinish(fillOperationId, True)
|
|
||||||
return filledStructure
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.services.chat.progressLogFinish(fillOperationId, False)
|
|
||||||
logger.error(f"Error in _fillStructureLegacy: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def _buildChapterSectionsStructurePrompt(
|
def _buildChapterSectionsStructurePrompt(
|
||||||
self,
|
self,
|
||||||
chapterId: str,
|
chapterId: str,
|
||||||
|
|
@ -899,6 +835,18 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
|
||||||
"""
|
"""
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
|
def _getContentStructureExample(self, contentType: str) -> str:
|
||||||
|
"""Get the JSON structure example for a specific content type."""
|
||||||
|
structures = {
|
||||||
|
"table": '{{"headers": ["Column1", "Column2"], "rows": [["Value1", "Value2"], ["Value3", "Value4"]]}}',
|
||||||
|
"bullet_list": '{{"items": ["Item 1", "Item 2", "Item 3"]}}',
|
||||||
|
"heading": '{{"text": "Section Title", "level": 2}}',
|
||||||
|
"paragraph": '{{"text": "This is paragraph text."}}',
|
||||||
|
"code_block": '{{"code": "function example() {{ return true; }}", "language": "javascript"}}',
|
||||||
|
"image": '{{"base64Data": "<base64_encoded_image_data>", "altText": "Description", "caption": "Optional caption"}}'
|
||||||
|
}
|
||||||
|
return structures.get(contentType, '{{"text": ""}}')
|
||||||
|
|
||||||
def _buildSectionGenerationPrompt(
|
def _buildSectionGenerationPrompt(
|
||||||
self,
|
self,
|
||||||
section: Dict[str, Any],
|
section: Dict[str, Any],
|
||||||
|
|
@ -998,6 +946,8 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
|
||||||
for next in nextSections:
|
for next in nextSections:
|
||||||
contextText += f"- {next['id']} ({next['content_type']}): {next['generation_hint']}\n"
|
contextText += f"- {next['id']} ({next['content_type']}): {next['generation_hint']}\n"
|
||||||
|
|
||||||
|
contentStructureExample = self._getContentStructureExample(contentType)
|
||||||
|
|
||||||
if isAggregation:
|
if isAggregation:
|
||||||
prompt = f"""# TASK: Generate Section Content (Aggregation)
|
prompt = f"""# TASK: Generate Section Content (Aggregation)
|
||||||
|
|
||||||
|
|
@ -1027,21 +977,17 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
|
||||||
|
|
||||||
## OUTPUT FORMAT
|
## OUTPUT FORMAT
|
||||||
Return a JSON object with this structure:
|
Return a JSON object with this structure:
|
||||||
```json
|
|
||||||
{{
|
{{
|
||||||
"elements": [
|
"elements": [
|
||||||
{{
|
{{
|
||||||
"type": "{contentType}",
|
"type": "{contentType}",
|
||||||
"headers": [...], // if table
|
"content": {contentStructureExample}
|
||||||
"rows": [...], // if table
|
|
||||||
"items": [...], // if bullet_list
|
|
||||||
"content": "..." // if paragraph
|
|
||||||
}}
|
}}
|
||||||
]
|
]
|
||||||
}}
|
}}
|
||||||
```
|
|
||||||
|
|
||||||
CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
|
CRITICAL: "content" MUST always be an object (never a string). Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
|
||||||
"""
|
"""
|
||||||
else:
|
else:
|
||||||
prompt = f"""# TASK: Generate Section Content
|
prompt = f"""# TASK: Generate Section Content
|
||||||
|
|
@ -1071,18 +1017,17 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
|
||||||
|
|
||||||
## OUTPUT FORMAT
|
## OUTPUT FORMAT
|
||||||
Return a JSON object with this structure:
|
Return a JSON object with this structure:
|
||||||
```json
|
|
||||||
{{
|
{{
|
||||||
"elements": [
|
"elements": [
|
||||||
{{
|
{{
|
||||||
"type": "{contentType}",
|
"type": "{contentType}",
|
||||||
"content": "..."
|
"content": {contentStructureExample}
|
||||||
}}
|
}}
|
||||||
]
|
]
|
||||||
}}
|
}}
|
||||||
```
|
|
||||||
|
|
||||||
CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
|
CRITICAL: "content" MUST always be an object (never a string). Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
|
||||||
"""
|
"""
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1129,8 +1129,9 @@ class ExtractionService:
|
||||||
logger.warning(f"⚠️ Content part ({contentTokens:.0f} tokens est.) exceeds available space ({availableContentBytes/TOKEN_SAFETY_FACTOR:.0f} tokens est.), chunking required")
|
logger.warning(f"⚠️ Content part ({contentTokens:.0f} tokens est.) exceeds available space ({availableContentBytes/TOKEN_SAFETY_FACTOR:.0f} tokens est.), chunking required")
|
||||||
|
|
||||||
# If either condition fails, chunk the content
|
# If either condition fails, chunk the content
|
||||||
if totalTokens > maxTotalTokens or partSize > availableContentBytes:
|
# CRITICAL: IMAGE_GENERATE operations should NOT use chunking - they generate images from prompts, not process content chunks
|
||||||
# Part too large or total exceeds limit - chunk it
|
if (totalTokens > maxTotalTokens or partSize > availableContentBytes) and options.operationType != OperationTypeEnum.IMAGE_GENERATE:
|
||||||
|
# Part too large or total exceeds limit - chunk it (but not for image generation)
|
||||||
chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
|
chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
|
||||||
if not chunks:
|
if not chunks:
|
||||||
raise ValueError(f"Failed to chunk content part for model {model.name}")
|
raise ValueError(f"Failed to chunk content part for model {model.name}")
|
||||||
|
|
|
||||||
|
|
@ -199,29 +199,40 @@ class BaseRenderer(ABC):
|
||||||
return "unknown"
|
return "unknown"
|
||||||
|
|
||||||
def _extractTableData(self, sectionData: Dict[str, Any]) -> Tuple[List[str], List[List[str]]]:
|
def _extractTableData(self, sectionData: Dict[str, Any]) -> Tuple[List[str], List[List[str]]]:
|
||||||
"""Extract table headers and rows from section data."""
|
"""Extract table headers and rows from section data. Expects nested content structure."""
|
||||||
# Normalize when elements array was passed in
|
# Normalize when elements array was passed in
|
||||||
if isinstance(sectionData, list):
|
if isinstance(sectionData, list):
|
||||||
if sectionData and isinstance(sectionData[0], dict):
|
if sectionData and isinstance(sectionData[0], dict):
|
||||||
sectionData = sectionData[0]
|
sectionData = sectionData[0]
|
||||||
else:
|
else:
|
||||||
# Empty list or invalid structure - return empty table
|
|
||||||
return [], []
|
return [], []
|
||||||
# Ensure sectionData is a dict before calling .get()
|
# Ensure sectionData is a dict
|
||||||
if not isinstance(sectionData, dict):
|
if not isinstance(sectionData, dict):
|
||||||
return [], []
|
return [], []
|
||||||
headers = sectionData.get("headers", [])
|
# Extract from nested content structure
|
||||||
rows = sectionData.get("rows", [])
|
content = sectionData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return [], []
|
||||||
|
headers = content.get("headers", [])
|
||||||
|
rows = content.get("rows", [])
|
||||||
return headers, rows
|
return headers, rows
|
||||||
|
|
||||||
def _extractBulletListItems(self, sectionData: Dict[str, Any]) -> List[str]:
|
def _extractBulletListItems(self, sectionData: Dict[str, Any]) -> List[str]:
|
||||||
"""Extract bullet list items from section data."""
|
"""Extract bullet list items from section data. Expects nested content structure."""
|
||||||
# Normalize when elements array or raw list was passed in
|
# Normalize when elements array was passed in
|
||||||
if isinstance(sectionData, list):
|
if isinstance(sectionData, list):
|
||||||
# Already a list of items (strings or dicts)
|
if sectionData and isinstance(sectionData[0], dict):
|
||||||
items = sectionData
|
sectionData = sectionData[0]
|
||||||
else:
|
else:
|
||||||
items = sectionData.get("items", [])
|
return []
|
||||||
|
# Ensure sectionData is a dict
|
||||||
|
if not isinstance(sectionData, dict):
|
||||||
|
return []
|
||||||
|
# Extract from nested content structure
|
||||||
|
content = sectionData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return []
|
||||||
|
items = content.get("items", [])
|
||||||
result = []
|
result = []
|
||||||
for item in items:
|
for item in items:
|
||||||
if isinstance(item, str):
|
if isinstance(item, str):
|
||||||
|
|
@ -231,64 +242,89 @@ class BaseRenderer(ABC):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _extractHeadingData(self, sectionData: Dict[str, Any]) -> Tuple[int, str]:
|
def _extractHeadingData(self, sectionData: Dict[str, Any]) -> Tuple[int, str]:
|
||||||
"""Extract heading level and text from section data."""
|
"""Extract heading level and text from section data. Expects nested content structure."""
|
||||||
# Normalize when elements array was passed in
|
# Normalize when elements array was passed in
|
||||||
if isinstance(sectionData, list):
|
if isinstance(sectionData, list):
|
||||||
if sectionData and isinstance(sectionData[0], dict):
|
if sectionData and isinstance(sectionData[0], dict):
|
||||||
sectionData = sectionData[0]
|
sectionData = sectionData[0]
|
||||||
else:
|
else:
|
||||||
# Empty list or invalid structure - return default
|
|
||||||
return 1, ""
|
return 1, ""
|
||||||
# Ensure sectionData is a dict before calling .get()
|
# Ensure sectionData is a dict
|
||||||
if not isinstance(sectionData, dict):
|
if not isinstance(sectionData, dict):
|
||||||
return 1, ""
|
return 1, ""
|
||||||
level = sectionData.get("level", 1)
|
# Extract from nested content structure
|
||||||
text = sectionData.get("text", "")
|
content = sectionData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return 1, ""
|
||||||
|
level = content.get("level", 1)
|
||||||
|
text = content.get("text", "")
|
||||||
return level, text
|
return level, text
|
||||||
|
|
||||||
def _extractParagraphText(self, sectionData: Dict[str, Any]) -> str:
|
def _extractParagraphText(self, sectionData: Dict[str, Any]) -> str:
|
||||||
"""Extract paragraph text from section data."""
|
"""Extract paragraph text from section data. Expects nested content structure."""
|
||||||
if isinstance(sectionData, list):
|
if isinstance(sectionData, list):
|
||||||
# Join multiple paragraph elements if provided as a list
|
# Join multiple paragraph elements if provided as a list
|
||||||
texts = []
|
texts = []
|
||||||
for el in sectionData:
|
for el in sectionData:
|
||||||
if isinstance(el, dict) and "text" in el:
|
if isinstance(el, dict):
|
||||||
texts.append(el["text"])
|
content = el.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
text = content
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
|
if text:
|
||||||
|
texts.append(text)
|
||||||
elif isinstance(el, str):
|
elif isinstance(el, str):
|
||||||
texts.append(el)
|
texts.append(el)
|
||||||
return "\n".join(texts)
|
return "\n".join(texts)
|
||||||
return sectionData.get("text", "")
|
# Extract from nested content structure
|
||||||
|
if not isinstance(sectionData, dict):
|
||||||
|
return ""
|
||||||
|
content = sectionData.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
return content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
return content
|
||||||
|
return ""
|
||||||
|
|
||||||
def _extractCodeBlockData(self, sectionData: Dict[str, Any]) -> Tuple[str, str]:
|
def _extractCodeBlockData(self, sectionData: Dict[str, Any]) -> Tuple[str, str]:
|
||||||
"""Extract code and language from section data."""
|
"""Extract code and language from section data. Expects nested content structure."""
|
||||||
# Normalize when elements array was passed in
|
# Normalize when elements array was passed in
|
||||||
if isinstance(sectionData, list):
|
if isinstance(sectionData, list):
|
||||||
if sectionData and isinstance(sectionData[0], dict):
|
if sectionData and isinstance(sectionData[0], dict):
|
||||||
sectionData = sectionData[0]
|
sectionData = sectionData[0]
|
||||||
else:
|
else:
|
||||||
# Empty list or invalid structure - return default
|
|
||||||
return "", ""
|
return "", ""
|
||||||
# Ensure sectionData is a dict before calling .get()
|
# Ensure sectionData is a dict
|
||||||
if not isinstance(sectionData, dict):
|
if not isinstance(sectionData, dict):
|
||||||
return "", ""
|
return "", ""
|
||||||
code = sectionData.get("code", "")
|
# Extract from nested content structure
|
||||||
language = sectionData.get("language", "")
|
content = sectionData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return "", ""
|
||||||
|
code = content.get("code", "")
|
||||||
|
language = content.get("language", "")
|
||||||
return code, language
|
return code, language
|
||||||
|
|
||||||
def _extractImageData(self, sectionData: Dict[str, Any]) -> Tuple[str, str]:
|
def _extractImageData(self, sectionData: Dict[str, Any]) -> Tuple[str, str]:
|
||||||
"""Extract base64 data and alt text from section data."""
|
"""Extract base64 data and alt text from section data. Expects nested content structure."""
|
||||||
# Normalize when elements array was passed in
|
# Normalize when elements array was passed in
|
||||||
if isinstance(sectionData, list):
|
if isinstance(sectionData, list):
|
||||||
if sectionData and isinstance(sectionData[0], dict):
|
if sectionData and isinstance(sectionData[0], dict):
|
||||||
sectionData = sectionData[0]
|
sectionData = sectionData[0]
|
||||||
else:
|
else:
|
||||||
# Empty list or invalid structure - return default
|
|
||||||
return "", "Image"
|
return "", "Image"
|
||||||
# Ensure sectionData is a dict before calling .get()
|
# Ensure sectionData is a dict
|
||||||
if not isinstance(sectionData, dict):
|
if not isinstance(sectionData, dict):
|
||||||
return "", "Image"
|
return "", "Image"
|
||||||
base64Data = sectionData.get("base64Data", "")
|
# Extract from nested content structure
|
||||||
altText = sectionData.get("altText", "Image")
|
content = sectionData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return "", "Image"
|
||||||
|
base64Data = content.get("base64Data", "")
|
||||||
|
altText = content.get("altText", "Image")
|
||||||
return base64Data, altText
|
return base64Data, altText
|
||||||
|
|
||||||
def _renderImageSection(self, section: Dict[str, Any], styles: Dict[str, Any] = None) -> Any:
|
def _renderImageSection(self, section: Dict[str, Any], styles: Dict[str, Any] = None) -> Any:
|
||||||
|
|
|
||||||
|
|
@ -41,11 +41,17 @@ class RendererCsv(BaseRenderer):
|
||||||
else:
|
else:
|
||||||
filename = self._determineFilename(title, "text/csv")
|
filename = self._determineFilename(title, "text/csv")
|
||||||
|
|
||||||
|
# Extract metadata for document type and other info
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=csvContent.encode('utf-8'),
|
documentData=csvContent.encode('utf-8'),
|
||||||
mimeType="text/csv",
|
mimeType="text/csv",
|
||||||
filename=filename
|
filename=filename,
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -130,8 +136,12 @@ class RendererCsv(BaseRenderer):
|
||||||
def _renderJsonTableToCsv(self, tableData: Dict[str, Any]) -> List[List[str]]:
|
def _renderJsonTableToCsv(self, tableData: Dict[str, Any]) -> List[List[str]]:
|
||||||
"""Render a JSON table to CSV rows."""
|
"""Render a JSON table to CSV rows."""
|
||||||
try:
|
try:
|
||||||
headers = tableData.get("headers", [])
|
# Extract from nested content structure
|
||||||
rows = tableData.get("rows", [])
|
content = tableData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return []
|
||||||
|
headers = content.get("headers", [])
|
||||||
|
rows = content.get("rows", [])
|
||||||
|
|
||||||
csvRows = []
|
csvRows = []
|
||||||
|
|
||||||
|
|
@ -150,7 +160,11 @@ class RendererCsv(BaseRenderer):
|
||||||
def _renderJsonListToCsv(self, listData: Dict[str, Any]) -> List[List[str]]:
|
def _renderJsonListToCsv(self, listData: Dict[str, Any]) -> List[List[str]]:
|
||||||
"""Render a JSON list to CSV rows."""
|
"""Render a JSON list to CSV rows."""
|
||||||
try:
|
try:
|
||||||
items = listData.get("items", [])
|
# Extract from nested content structure
|
||||||
|
content = listData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return []
|
||||||
|
items = content.get("items", [])
|
||||||
csvRows = []
|
csvRows = []
|
||||||
|
|
||||||
for item in items:
|
for item in items:
|
||||||
|
|
@ -177,8 +191,12 @@ class RendererCsv(BaseRenderer):
|
||||||
def _renderJsonHeadingToCsv(self, headingData: Dict[str, Any]) -> List[List[str]]:
|
def _renderJsonHeadingToCsv(self, headingData: Dict[str, Any]) -> List[List[str]]:
|
||||||
"""Render a JSON heading to CSV rows."""
|
"""Render a JSON heading to CSV rows."""
|
||||||
try:
|
try:
|
||||||
text = headingData.get("text", "")
|
# Extract from nested content structure
|
||||||
level = headingData.get("level", 1)
|
content = headingData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return []
|
||||||
|
text = content.get("text", "")
|
||||||
|
level = content.get("level", 1)
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
# Use # symbols for heading levels
|
# Use # symbols for heading levels
|
||||||
|
|
@ -194,7 +212,14 @@ class RendererCsv(BaseRenderer):
|
||||||
def _renderJsonParagraphToCsv(self, paragraphData: Dict[str, Any]) -> List[List[str]]:
|
def _renderJsonParagraphToCsv(self, paragraphData: Dict[str, Any]) -> List[List[str]]:
|
||||||
"""Render a JSON paragraph to CSV rows."""
|
"""Render a JSON paragraph to CSV rows."""
|
||||||
try:
|
try:
|
||||||
text = paragraphData.get("text", "")
|
# Extract from nested content structure
|
||||||
|
content = paragraphData.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
text = content
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
# Split long paragraphs into multiple rows if needed
|
# Split long paragraphs into multiple rows if needed
|
||||||
|
|
@ -229,8 +254,12 @@ class RendererCsv(BaseRenderer):
|
||||||
def _renderJsonCodeToCsv(self, codeData: Dict[str, Any]) -> List[List[str]]:
|
def _renderJsonCodeToCsv(self, codeData: Dict[str, Any]) -> List[List[str]]:
|
||||||
"""Render a JSON code block to CSV rows."""
|
"""Render a JSON code block to CSV rows."""
|
||||||
try:
|
try:
|
||||||
code = codeData.get("code", "")
|
# Extract from nested content structure
|
||||||
language = codeData.get("language", "")
|
content = codeData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return []
|
||||||
|
code = content.get("code", "")
|
||||||
|
language = content.get("language", "")
|
||||||
|
|
||||||
csvRows = []
|
csvRows = []
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,10 @@ class RendererDocx(BaseRenderer):
|
||||||
# Generate DOCX using AI-analyzed styling
|
# Generate DOCX using AI-analyzed styling
|
||||||
docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService)
|
docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService)
|
||||||
|
|
||||||
|
# Extract metadata for document type and other info
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
|
||||||
# Determine filename from document or title
|
# Determine filename from document or title
|
||||||
documents = extractedContent.get("documents", [])
|
documents = extractedContent.get("documents", [])
|
||||||
if documents and isinstance(documents[0], dict):
|
if documents and isinstance(documents[0], dict):
|
||||||
|
|
@ -74,7 +78,9 @@ class RendererDocx(BaseRenderer):
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=docx_bytes,
|
documentData=docx_bytes,
|
||||||
mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
filename=filename
|
filename=filename,
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -82,11 +88,15 @@ class RendererDocx(BaseRenderer):
|
||||||
self.logger.error(f"Error rendering DOCX: {str(e)}")
|
self.logger.error(f"Error rendering DOCX: {str(e)}")
|
||||||
# Return minimal fallback
|
# Return minimal fallback
|
||||||
fallbackContent = f"DOCX Generation Error: {str(e)}"
|
fallbackContent = f"DOCX Generation Error: {str(e)}"
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=fallbackContent.encode('utf-8'),
|
documentData=fallbackContent.encode('utf-8'),
|
||||||
mimeType="text/plain",
|
mimeType="text/plain",
|
||||||
filename=self._determineFilename(title, "text/plain")
|
filename=self._determineFilename(title, "text/plain"),
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -96,8 +106,8 @@ class RendererDocx(BaseRenderer):
|
||||||
# Create new document
|
# Create new document
|
||||||
doc = Document()
|
doc = Document()
|
||||||
|
|
||||||
# Get style set: default styles, enhanced with AI if style instructions present
|
# Get style set: use styles from metadata if available, otherwise enhance with AI
|
||||||
styleSet = await self._getStyleSet(userPrompt, aiService)
|
styleSet = await self._getStyleSet(json_content, userPrompt, aiService)
|
||||||
|
|
||||||
# Setup basic document styles and create all styles from style set
|
# Setup basic document styles and create all styles from style set
|
||||||
self._setupBasicDocumentStyles(doc)
|
self._setupBasicDocumentStyles(doc)
|
||||||
|
|
@ -137,12 +147,17 @@ class RendererDocx(BaseRenderer):
|
||||||
self.logger.error(f"Error generating DOCX from JSON: {str(e)}")
|
self.logger.error(f"Error generating DOCX from JSON: {str(e)}")
|
||||||
raise Exception(f"DOCX generation failed: {str(e)}")
|
raise Exception(f"DOCX generation failed: {str(e)}")
|
||||||
|
|
||||||
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||||
"""Get style set - default styles, enhanced with AI if userPrompt provided.
|
"""Get style set - use styles from document generation metadata if available,
|
||||||
|
otherwise enhance default styles with AI if userPrompt provided.
|
||||||
|
|
||||||
|
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
|
||||||
|
not be generated separately by renderers. Only fall back to AI if styles not provided.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
extractedContent: Document content with metadata (may contain styles)
|
||||||
userPrompt: User's prompt (AI will detect style instructions in any language)
|
userPrompt: User's prompt (AI will detect style instructions in any language)
|
||||||
aiService: AI service (used only if userPrompt provided)
|
aiService: AI service (used only if styles not in metadata and userPrompt provided)
|
||||||
templateName: Name of template style set (None = default)
|
templateName: Name of template style set (None = default)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -156,10 +171,18 @@ class RendererDocx(BaseRenderer):
|
||||||
else:
|
else:
|
||||||
defaultStyleSet = self._getDefaultStyleSet()
|
defaultStyleSet = self._getDefaultStyleSet()
|
||||||
|
|
||||||
# Enhance with AI if userPrompt provided (AI handles multilingual style detection)
|
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
|
||||||
|
if extractedContent:
|
||||||
|
metadata = extractedContent.get("metadata", {})
|
||||||
|
if isinstance(metadata, dict):
|
||||||
|
styles = metadata.get("styles")
|
||||||
|
if styles and isinstance(styles, dict):
|
||||||
|
self.logger.debug("Using styles from document generation metadata")
|
||||||
|
return self._validateStylesContrast(styles)
|
||||||
|
|
||||||
|
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
|
||||||
if userPrompt and aiService:
|
if userPrompt and aiService:
|
||||||
# AI will naturally detect style instructions in any language
|
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
|
||||||
self.logger.info(f"Enhancing styles with AI based on user prompt...")
|
|
||||||
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
||||||
return self._validateStylesContrast(enhancedStyleSet)
|
return self._validateStylesContrast(enhancedStyleSet)
|
||||||
else:
|
else:
|
||||||
|
|
@ -264,6 +287,10 @@ class RendererDocx(BaseRenderer):
|
||||||
section_type = section.get("content_type", "paragraph")
|
section_type = section.get("content_type", "paragraph")
|
||||||
elements = section.get("elements", [])
|
elements = section.get("elements", [])
|
||||||
|
|
||||||
|
# If no elements, skip this section (it has no content to render)
|
||||||
|
if not elements:
|
||||||
|
return
|
||||||
|
|
||||||
# Process each element in the section
|
# Process each element in the section
|
||||||
for element in elements:
|
for element in elements:
|
||||||
element_type = element.get("type", "")
|
element_type = element.get("type", "")
|
||||||
|
|
@ -286,7 +313,21 @@ class RendererDocx(BaseRenderer):
|
||||||
para.add_run(f" (Source: {source})").italic = True
|
para.add_run(f" (Source: {source})").italic = True
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Standard section types
|
# Check element type, not section type (elements can have different types than section)
|
||||||
|
if element_type == "table":
|
||||||
|
self._renderJsonTable(doc, element, styles)
|
||||||
|
elif element_type == "bullet_list":
|
||||||
|
self._renderJsonBulletList(doc, element, styles)
|
||||||
|
elif element_type == "heading":
|
||||||
|
self._renderJsonHeading(doc, element, styles)
|
||||||
|
elif element_type == "paragraph":
|
||||||
|
self._renderJsonParagraph(doc, element, styles)
|
||||||
|
elif element_type == "code_block":
|
||||||
|
self._renderJsonCodeBlock(doc, element, styles)
|
||||||
|
elif element_type == "image":
|
||||||
|
self._renderJsonImage(doc, element, styles)
|
||||||
|
else:
|
||||||
|
# Fallback: if element_type not set, use section_type
|
||||||
if section_type == "table":
|
if section_type == "table":
|
||||||
self._renderJsonTable(doc, element, styles)
|
self._renderJsonTable(doc, element, styles)
|
||||||
elif section_type == "bullet_list":
|
elif section_type == "bullet_list":
|
||||||
|
|
@ -311,8 +352,12 @@ class RendererDocx(BaseRenderer):
|
||||||
def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
"""Render a JSON table to DOCX using AI-generated styles."""
|
"""Render a JSON table to DOCX using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
headers = table_data.get("headers", [])
|
# Extract from nested content structure
|
||||||
rows = table_data.get("rows", [])
|
content = table_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return
|
||||||
|
headers = content.get("headers", [])
|
||||||
|
rows = content.get("rows", [])
|
||||||
|
|
||||||
if not headers or not rows:
|
if not headers or not rows:
|
||||||
return
|
return
|
||||||
|
|
@ -467,7 +512,11 @@ class RendererDocx(BaseRenderer):
|
||||||
def _renderJsonBulletList(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
def _renderJsonBulletList(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
"""Render a JSON bullet list to DOCX using AI-generated styles."""
|
"""Render a JSON bullet list to DOCX using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
items = list_data.get("items", [])
|
# Extract from nested content structure
|
||||||
|
content = list_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return
|
||||||
|
items = content.get("items", [])
|
||||||
bullet_style = styles["bullet_list"]
|
bullet_style = styles["bullet_list"]
|
||||||
|
|
||||||
for item in items:
|
for item in items:
|
||||||
|
|
@ -482,8 +531,12 @@ class RendererDocx(BaseRenderer):
|
||||||
def _renderJsonHeading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
def _renderJsonHeading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
"""Render a JSON heading to DOCX using AI-generated styles."""
|
"""Render a JSON heading to DOCX using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
level = heading_data.get("level", 1)
|
# Extract from nested content structure
|
||||||
text = heading_data.get("text", "")
|
content = heading_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return
|
||||||
|
text = content.get("text", "")
|
||||||
|
level = content.get("level", 1)
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
level = max(1, min(6, level))
|
level = max(1, min(6, level))
|
||||||
|
|
@ -495,7 +548,25 @@ class RendererDocx(BaseRenderer):
|
||||||
def _renderJsonParagraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
def _renderJsonParagraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
"""Render a JSON paragraph to DOCX using AI-generated styles."""
|
"""Render a JSON paragraph to DOCX using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
text = paragraph_data.get("text", "")
|
# Extract from nested content structure
|
||||||
|
content = paragraph_data.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
text = content
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
|
|
||||||
|
# CRITICAL: Prevent rendering base64 image data as text
|
||||||
|
# Base64 image data typically starts with /9j/ (JPEG) or iVBORw0KGgo (PNG)
|
||||||
|
if text and (text.startswith("/9j/") or text.startswith("iVBORw0KGgo") or
|
||||||
|
(len(text) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in text[:100]))):
|
||||||
|
# This looks like base64 data - don't render as text
|
||||||
|
self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(text)})")
|
||||||
|
para = doc.add_paragraph("[Error: Image data found in text content - image embedding may have failed]")
|
||||||
|
if para.runs:
|
||||||
|
para.runs[0].font.color.rgb = RGBColor(255, 0, 0) # Red color for error
|
||||||
|
return
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
para = doc.add_paragraph(text)
|
para = doc.add_paragraph(text)
|
||||||
|
|
@ -506,8 +577,12 @@ class RendererDocx(BaseRenderer):
|
||||||
def _renderJsonCodeBlock(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
def _renderJsonCodeBlock(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
"""Render a JSON code block to DOCX using AI-generated styles."""
|
"""Render a JSON code block to DOCX using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
code = code_data.get("code", "")
|
# Extract from nested content structure
|
||||||
language = code_data.get("language", "")
|
content = code_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return
|
||||||
|
code = content.get("code", "")
|
||||||
|
language = content.get("language", "")
|
||||||
|
|
||||||
if code:
|
if code:
|
||||||
if language:
|
if language:
|
||||||
|
|
@ -525,20 +600,33 @@ class RendererDocx(BaseRenderer):
|
||||||
def _renderJsonImage(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
def _renderJsonImage(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
"""Render a JSON image to DOCX."""
|
"""Render a JSON image to DOCX."""
|
||||||
try:
|
try:
|
||||||
base64_data = image_data.get("base64Data", "")
|
# Extract from nested content structure
|
||||||
alt_text = image_data.get("altText", "Image")
|
content = image_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return
|
||||||
|
base64_data = content.get("base64Data", "")
|
||||||
|
alt_text = content.get("altText", "Image")
|
||||||
|
|
||||||
if base64_data:
|
if base64_data:
|
||||||
|
try:
|
||||||
image_bytes = base64.b64decode(base64_data)
|
image_bytes = base64.b64decode(base64_data)
|
||||||
doc.add_picture(io.BytesIO(image_bytes), width=Inches(4))
|
doc.add_picture(io.BytesIO(image_bytes), width=Inches(4))
|
||||||
|
|
||||||
if alt_text:
|
if alt_text:
|
||||||
caption_para = doc.add_paragraph(f"Figure: {alt_text}")
|
caption_para = doc.add_paragraph(f"Figure: {alt_text}")
|
||||||
caption_para.runs[0].italic = True
|
caption_para.runs[0].italic = True
|
||||||
|
except Exception as embedError:
|
||||||
|
# Image decoding or embedding failed
|
||||||
|
raise Exception(f"Failed to decode or embed image: {str(embedError)}")
|
||||||
|
else:
|
||||||
|
raise Exception("No image data provided (base64Data is empty)")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f"Error rendering image: {str(e)}")
|
self.logger.error(f"Error embedding image in DOCX: {str(e)}")
|
||||||
doc.add_paragraph(f"[Image: {image_data.get('altText', 'Image')}]")
|
errorMsg = f"[Error: Could not embed image '{image_data.get('altText', 'Image')}'. {str(e)}]"
|
||||||
|
errorPara = doc.add_paragraph(errorMsg)
|
||||||
|
if errorPara.runs:
|
||||||
|
errorPara.runs[0].font.color.rgb = RGBColor(255, 0, 0) # Red color for error
|
||||||
|
|
||||||
def _extractStructureFromPrompt(self, userPrompt: str, title: str) -> Dict[str, Any]:
|
def _extractStructureFromPrompt(self, userPrompt: str, title: str) -> Dict[str, Any]:
|
||||||
"""Extract document structure from user prompt."""
|
"""Extract document structure from user prompt."""
|
||||||
|
|
|
||||||
|
|
@ -55,12 +55,18 @@ class RendererHtml(BaseRenderer):
|
||||||
else:
|
else:
|
||||||
htmlFilename = self._determineFilename(title, "text/html")
|
htmlFilename = self._determineFilename(title, "text/html")
|
||||||
|
|
||||||
|
# Extract metadata for document type and other info
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
|
||||||
# Start with HTML document
|
# Start with HTML document
|
||||||
resultDocuments = [
|
resultDocuments = [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=htmlContent.encode('utf-8'),
|
documentData=htmlContent.encode('utf-8'),
|
||||||
mimeType="text/html",
|
mimeType="text/html",
|
||||||
filename=htmlFilename
|
filename=htmlFilename,
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -90,8 +96,8 @@ class RendererHtml(BaseRenderer):
|
||||||
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
|
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
|
||||||
"""Generate HTML content from structured JSON document using AI-generated styling."""
|
"""Generate HTML content from structured JSON document using AI-generated styling."""
|
||||||
try:
|
try:
|
||||||
# Get style set: default styles, enhanced with AI if userPrompt provided
|
# Get style set: use styles from metadata if available, otherwise enhance with AI
|
||||||
styles = await self._getStyleSet(userPrompt, aiService)
|
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
|
||||||
|
|
||||||
# Validate JSON structure
|
# Validate JSON structure
|
||||||
if not self._validateJsonStructure(jsonContent):
|
if not self._validateJsonStructure(jsonContent):
|
||||||
|
|
@ -148,12 +154,17 @@ class RendererHtml(BaseRenderer):
|
||||||
self.logger.error(f"Error generating HTML from JSON: {str(e)}")
|
self.logger.error(f"Error generating HTML from JSON: {str(e)}")
|
||||||
raise Exception(f"HTML generation failed: {str(e)}")
|
raise Exception(f"HTML generation failed: {str(e)}")
|
||||||
|
|
||||||
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||||
"""Get style set - default styles, enhanced with AI if userPrompt provided.
|
"""Get style set - use styles from document generation metadata if available,
|
||||||
|
otherwise enhance default styles with AI if userPrompt provided.
|
||||||
|
|
||||||
|
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
|
||||||
|
not be generated separately by renderers. Only fall back to AI if styles not provided.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
extractedContent: Document content with metadata (may contain styles)
|
||||||
userPrompt: User's prompt (AI will detect style instructions in any language)
|
userPrompt: User's prompt (AI will detect style instructions in any language)
|
||||||
aiService: AI service (used only if userPrompt provided)
|
aiService: AI service (used only if styles not in metadata and userPrompt provided)
|
||||||
templateName: Name of template style set (None = default)
|
templateName: Name of template style set (None = default)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -162,10 +173,18 @@ class RendererHtml(BaseRenderer):
|
||||||
# Get default style set
|
# Get default style set
|
||||||
defaultStyleSet = self._getDefaultStyleSet()
|
defaultStyleSet = self._getDefaultStyleSet()
|
||||||
|
|
||||||
# Enhance with AI if userPrompt provided (AI handles multilingual style detection)
|
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
|
||||||
|
if extractedContent:
|
||||||
|
metadata = extractedContent.get("metadata", {})
|
||||||
|
if isinstance(metadata, dict):
|
||||||
|
styles = metadata.get("styles")
|
||||||
|
if styles and isinstance(styles, dict):
|
||||||
|
self.logger.debug("Using styles from document generation metadata")
|
||||||
|
return self._validateStylesContrast(styles)
|
||||||
|
|
||||||
|
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
|
||||||
if userPrompt and aiService:
|
if userPrompt and aiService:
|
||||||
# AI will naturally detect style instructions in any language
|
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
|
||||||
self.logger.info(f"Enhancing styles with AI based on user prompt...")
|
|
||||||
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
||||||
return self._validateStylesContrast(enhancedStyleSet)
|
return self._validateStylesContrast(enhancedStyleSet)
|
||||||
else:
|
else:
|
||||||
|
|
@ -446,8 +465,12 @@ class RendererHtml(BaseRenderer):
|
||||||
def _renderJsonTable(self, tableData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
def _renderJsonTable(self, tableData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON table to HTML using AI-generated styles."""
|
"""Render a JSON table to HTML using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
headers = tableData.get("headers", [])
|
# Extract from nested content structure
|
||||||
rows = tableData.get("rows", [])
|
content = tableData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
headers = content.get("headers", [])
|
||||||
|
rows = content.get("rows", [])
|
||||||
|
|
||||||
if not headers or not rows:
|
if not headers or not rows:
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -477,9 +500,13 @@ class RendererHtml(BaseRenderer):
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def _renderJsonBulletList(self, listData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
def _renderJsonBulletList(self, listData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON bullet list to HTML using AI-generated styles."""
|
"""Render a JSON bullet list to HTML using AI-generated styles. Expects nested content structure."""
|
||||||
try:
|
try:
|
||||||
items = listData.get("items", [])
|
# Extract from nested content structure
|
||||||
|
content = listData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
items = content.get("items", [])
|
||||||
|
|
||||||
if not items:
|
if not items:
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -513,8 +540,12 @@ class RendererHtml(BaseRenderer):
|
||||||
elif not isinstance(headingData, dict):
|
elif not isinstance(headingData, dict):
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
level = headingData.get("level", 1)
|
# Extract from nested content structure
|
||||||
text = headingData.get("text", "")
|
content = headingData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
text = content.get("text", "")
|
||||||
|
level = content.get("level", 1)
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
level = max(1, min(6, level))
|
level = max(1, min(6, level))
|
||||||
|
|
@ -531,11 +562,19 @@ class RendererHtml(BaseRenderer):
|
||||||
try:
|
try:
|
||||||
# Normalize inputs - paragraphData is typically a list of elements from _getSectionData
|
# Normalize inputs - paragraphData is typically a list of elements from _getSectionData
|
||||||
if isinstance(paragraphData, list):
|
if isinstance(paragraphData, list):
|
||||||
# Extract text from all paragraph elements
|
# Extract text from all paragraph elements (expects nested content structure)
|
||||||
texts = []
|
texts = []
|
||||||
for el in paragraphData:
|
for el in paragraphData:
|
||||||
if isinstance(el, dict) and "text" in el:
|
if isinstance(el, dict):
|
||||||
texts.append(el["text"])
|
content = el.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
text = content
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
|
if text:
|
||||||
|
texts.append(text)
|
||||||
elif isinstance(el, str):
|
elif isinstance(el, str):
|
||||||
texts.append(el)
|
texts.append(el)
|
||||||
if texts:
|
if texts:
|
||||||
|
|
@ -545,7 +584,15 @@ class RendererHtml(BaseRenderer):
|
||||||
elif isinstance(paragraphData, str):
|
elif isinstance(paragraphData, str):
|
||||||
return f'<p>{paragraphData}</p>'
|
return f'<p>{paragraphData}</p>'
|
||||||
elif isinstance(paragraphData, dict):
|
elif isinstance(paragraphData, dict):
|
||||||
text = paragraphData.get("text", "")
|
# Handle nested content structure: element.content vs element.text
|
||||||
|
# Extract from nested content structure
|
||||||
|
content = paragraphData.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
text = content
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
if text:
|
if text:
|
||||||
return f'<p>{text}</p>'
|
return f'<p>{text}</p>'
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -557,10 +604,14 @@ class RendererHtml(BaseRenderer):
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def _renderJsonCodeBlock(self, codeData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
def _renderJsonCodeBlock(self, codeData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON code block to HTML using AI-generated styles."""
|
"""Render a JSON code block to HTML using AI-generated styles. Expects nested content structure."""
|
||||||
try:
|
try:
|
||||||
code = codeData.get("code", "")
|
# Extract from nested content structure
|
||||||
language = codeData.get("language", "")
|
content = codeData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
code = content.get("code", "")
|
||||||
|
language = content.get("language", "")
|
||||||
|
|
||||||
if code:
|
if code:
|
||||||
if language:
|
if language:
|
||||||
|
|
@ -575,12 +626,16 @@ class RendererHtml(BaseRenderer):
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON image to HTML with placeholder for later replacement."""
|
"""Render a JSON image to HTML with placeholder for later replacement. Expects nested content structure."""
|
||||||
try:
|
try:
|
||||||
import html
|
import html
|
||||||
base64Data = imageData.get("base64Data", "")
|
# Extract from nested content structure
|
||||||
altText = imageData.get("altText", "Image")
|
content = imageData.get("content", {})
|
||||||
caption = imageData.get("caption", "")
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
base64Data = content.get("base64Data", "")
|
||||||
|
altText = content.get("altText", "Image")
|
||||||
|
caption = content.get("caption", "")
|
||||||
|
|
||||||
# Escape HTML in altText and caption to prevent injection
|
# Escape HTML in altText and caption to prevent injection
|
||||||
altTextEscaped = html.escape(str(altText))
|
altTextEscaped = html.escape(str(altText))
|
||||||
|
|
@ -600,8 +655,10 @@ class RendererHtml(BaseRenderer):
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f"Error rendering image: {str(e)}")
|
self.logger.error(f"Error embedding image in HTML: {str(e)}")
|
||||||
return f'<div class="error">[Image: {imageData.get("altText", "Image")}]</div>'
|
altText = imageData.get("altText", "Image")
|
||||||
|
errorMsg = html.escape(f"[Error: Could not embed image '{altText}'. {str(e)}]")
|
||||||
|
return f'<div class="error" style="color: red; padding: 10px; border: 1px solid red;">{errorMsg}</div>'
|
||||||
|
|
||||||
def _extractImages(self, jsonContent: Dict[str, Any]) -> List[Dict[str, Any]]:
|
def _extractImages(self, jsonContent: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -626,12 +683,24 @@ class RendererHtml(BaseRenderer):
|
||||||
if section.get("content_type") == "image":
|
if section.get("content_type") == "image":
|
||||||
elements = section.get("elements", [])
|
elements = section.get("elements", [])
|
||||||
for element in elements:
|
for element in elements:
|
||||||
|
# Extract from nested content structure
|
||||||
|
content = element.get("content", {})
|
||||||
|
base64Data = ""
|
||||||
|
|
||||||
|
if isinstance(content, dict):
|
||||||
|
base64Data = content.get("base64Data", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
# Content might be base64 string directly (shouldn't happen)
|
||||||
|
pass
|
||||||
|
|
||||||
|
# If base64Data not found in content, try direct element fields (fallback)
|
||||||
|
if not base64Data:
|
||||||
base64Data = element.get("base64Data", "")
|
base64Data = element.get("base64Data", "")
|
||||||
|
|
||||||
# If base64Data not found, try extracting from url data URI
|
# If base64Data still not found, try extracting from url data URI
|
||||||
if not base64Data:
|
if not base64Data:
|
||||||
url = element.get("url", "")
|
url = element.get("url", "") or (content.get("url", "") if isinstance(content, dict) else "")
|
||||||
if url.startswith("data:image/"):
|
if url and isinstance(url, str) and url.startswith("data:image/"):
|
||||||
# Extract base64 from data URI: data:image/png;base64,<base64>
|
# Extract base64 from data URI: data:image/png;base64,<base64>
|
||||||
import re
|
import re
|
||||||
match = re.match(r'data:image/[^;]+;base64,(.+)', url)
|
match = re.match(r'data:image/[^;]+;base64,(.+)', url)
|
||||||
|
|
@ -642,7 +711,8 @@ class RendererHtml(BaseRenderer):
|
||||||
sectionId = section.get("id", "unknown")
|
sectionId = section.get("id", "unknown")
|
||||||
|
|
||||||
# Bestimme MIME-Type und Extension
|
# Bestimme MIME-Type und Extension
|
||||||
mimeType = element.get("mimeType", "image/png")
|
mimeType = element.get("mimeType", "") or (content.get("mimeType", "") if isinstance(content, dict) else "")
|
||||||
|
if not mimeType or mimeType == "unknown":
|
||||||
if not mimeType or mimeType == "unknown":
|
if not mimeType or mimeType == "unknown":
|
||||||
# Versuche MIME-Type aus base64 zu erkennen
|
# Versuche MIME-Type aus base64 zu erkennen
|
||||||
if base64Data.startswith("/9j/"):
|
if base64Data.startswith("/9j/"):
|
||||||
|
|
|
||||||
|
|
@ -54,11 +54,17 @@ class RendererImage(BaseRenderer):
|
||||||
else:
|
else:
|
||||||
imageBytes = imageContent
|
imageBytes = imageContent
|
||||||
|
|
||||||
|
# Extract metadata for document type and other info
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=imageBytes,
|
documentData=imageBytes,
|
||||||
mimeType="image/png",
|
mimeType="image/png",
|
||||||
filename=filename
|
filename=filename,
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,11 +43,17 @@ class RendererJson(BaseRenderer):
|
||||||
else:
|
else:
|
||||||
filename = self._determineFilename(title, "application/json")
|
filename = self._determineFilename(title, "application/json")
|
||||||
|
|
||||||
|
# Extract metadata for document type and other info
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=jsonContent.encode('utf-8'),
|
documentData=jsonContent.encode('utf-8'),
|
||||||
mimeType="application/json",
|
mimeType="application/json",
|
||||||
filename=filename
|
filename=filename,
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -60,11 +66,15 @@ class RendererJson(BaseRenderer):
|
||||||
"metadata": {"error": str(e)}
|
"metadata": {"error": str(e)}
|
||||||
}
|
}
|
||||||
fallbackContent = json.dumps(fallbackData, indent=2)
|
fallbackContent = json.dumps(fallbackData, indent=2)
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=fallbackContent.encode('utf-8'),
|
documentData=fallbackContent.encode('utf-8'),
|
||||||
mimeType="application/json",
|
mimeType="application/json",
|
||||||
filename=self._determineFilename(title, "application/json")
|
filename=self._determineFilename(title, "application/json"),
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -41,11 +41,17 @@ class RendererMarkdown(BaseRenderer):
|
||||||
else:
|
else:
|
||||||
filename = self._determineFilename(title, "text/markdown")
|
filename = self._determineFilename(title, "text/markdown")
|
||||||
|
|
||||||
|
# Extract metadata for document type and other info
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=markdownContent.encode('utf-8'),
|
documentData=markdownContent.encode('utf-8'),
|
||||||
mimeType="text/markdown",
|
mimeType="text/markdown",
|
||||||
filename=filename
|
filename=filename,
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -53,11 +59,15 @@ class RendererMarkdown(BaseRenderer):
|
||||||
self.logger.error(f"Error rendering markdown: {str(e)}")
|
self.logger.error(f"Error rendering markdown: {str(e)}")
|
||||||
# Return minimal markdown fallback
|
# Return minimal markdown fallback
|
||||||
fallbackContent = f"# {title}\n\nError rendering report: {str(e)}"
|
fallbackContent = f"# {title}\n\nError rendering report: {str(e)}"
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=fallbackContent.encode('utf-8'),
|
documentData=fallbackContent.encode('utf-8'),
|
||||||
mimeType="text/markdown",
|
mimeType="text/markdown",
|
||||||
filename=self._determineFilename(title, "text/markdown")
|
filename=self._determineFilename(title, "text/markdown"),
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -164,8 +174,12 @@ class RendererMarkdown(BaseRenderer):
|
||||||
def _renderJsonTable(self, tableData: Dict[str, Any]) -> str:
|
def _renderJsonTable(self, tableData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON table to markdown."""
|
"""Render a JSON table to markdown."""
|
||||||
try:
|
try:
|
||||||
headers = tableData.get("headers", [])
|
# Extract from nested content structure
|
||||||
rows = tableData.get("rows", [])
|
content = tableData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
headers = content.get("headers", [])
|
||||||
|
rows = content.get("rows", [])
|
||||||
|
|
||||||
if not headers or not rows:
|
if not headers or not rows:
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -194,7 +208,11 @@ class RendererMarkdown(BaseRenderer):
|
||||||
def _renderJsonBulletList(self, listData: Dict[str, Any]) -> str:
|
def _renderJsonBulletList(self, listData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON bullet list to markdown."""
|
"""Render a JSON bullet list to markdown."""
|
||||||
try:
|
try:
|
||||||
items = listData.get("items", [])
|
# Extract from nested content structure
|
||||||
|
content = listData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
items = content.get("items", [])
|
||||||
|
|
||||||
if not items:
|
if not items:
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -215,8 +233,12 @@ class RendererMarkdown(BaseRenderer):
|
||||||
def _renderJsonHeading(self, headingData: Dict[str, Any]) -> str:
|
def _renderJsonHeading(self, headingData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON heading to markdown."""
|
"""Render a JSON heading to markdown."""
|
||||||
try:
|
try:
|
||||||
level = headingData.get("level", 1)
|
# Extract from nested content structure
|
||||||
text = headingData.get("text", "")
|
content = headingData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
text = content.get("text", "")
|
||||||
|
level = content.get("level", 1)
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
level = max(1, min(6, level))
|
level = max(1, min(6, level))
|
||||||
|
|
@ -231,7 +253,14 @@ class RendererMarkdown(BaseRenderer):
|
||||||
def _renderJsonParagraph(self, paragraphData: Dict[str, Any]) -> str:
|
def _renderJsonParagraph(self, paragraphData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON paragraph to markdown."""
|
"""Render a JSON paragraph to markdown."""
|
||||||
try:
|
try:
|
||||||
text = paragraphData.get("text", "")
|
# Extract from nested content structure
|
||||||
|
content = paragraphData.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
text = content
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
return text if text else ""
|
return text if text else ""
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -241,8 +270,12 @@ class RendererMarkdown(BaseRenderer):
|
||||||
def _renderJsonCodeBlock(self, codeData: Dict[str, Any]) -> str:
|
def _renderJsonCodeBlock(self, codeData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON code block to markdown."""
|
"""Render a JSON code block to markdown."""
|
||||||
try:
|
try:
|
||||||
code = codeData.get("code", "")
|
# Extract from nested content structure
|
||||||
language = codeData.get("language", "")
|
content = codeData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
code = content.get("code", "")
|
||||||
|
language = content.get("language", "")
|
||||||
|
|
||||||
if code:
|
if code:
|
||||||
if language:
|
if language:
|
||||||
|
|
@ -259,8 +292,12 @@ class RendererMarkdown(BaseRenderer):
|
||||||
def _renderJsonImage(self, imageData: Dict[str, Any]) -> str:
|
def _renderJsonImage(self, imageData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON image to markdown."""
|
"""Render a JSON image to markdown."""
|
||||||
try:
|
try:
|
||||||
altText = imageData.get("altText", "Image")
|
# Extract from nested content structure
|
||||||
base64Data = imageData.get("base64Data", "")
|
content = imageData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
altText = content.get("altText", "Image")
|
||||||
|
base64Data = content.get("base64Data", "")
|
||||||
|
|
||||||
if base64Data:
|
if base64Data:
|
||||||
# For base64 images, we can't embed them directly in markdown
|
# For base64 images, we can't embed them directly in markdown
|
||||||
|
|
|
||||||
|
|
@ -51,6 +51,10 @@ class RendererPdf(BaseRenderer):
|
||||||
# Generate PDF using AI-analyzed styling
|
# Generate PDF using AI-analyzed styling
|
||||||
pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService)
|
pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService)
|
||||||
|
|
||||||
|
# Extract metadata for document type and other info
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
|
||||||
# Determine filename from document or title
|
# Determine filename from document or title
|
||||||
documents = extractedContent.get("documents", [])
|
documents = extractedContent.get("documents", [])
|
||||||
if documents and isinstance(documents[0], dict):
|
if documents and isinstance(documents[0], dict):
|
||||||
|
|
@ -74,7 +78,9 @@ class RendererPdf(BaseRenderer):
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=pdf_bytes,
|
documentData=pdf_bytes,
|
||||||
mimeType="application/pdf",
|
mimeType="application/pdf",
|
||||||
filename=filename
|
filename=filename,
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -93,8 +99,8 @@ class RendererPdf(BaseRenderer):
|
||||||
async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
|
async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
|
||||||
"""Generate PDF content from structured JSON document using AI-generated styling."""
|
"""Generate PDF content from structured JSON document using AI-generated styling."""
|
||||||
try:
|
try:
|
||||||
# Get style set: default styles, enhanced with AI if userPrompt provided
|
# Get style set: use styles from metadata if available, otherwise enhance with AI
|
||||||
styles = await self._getStyleSet(userPrompt, aiService)
|
styles = await self._getStyleSet(json_content, userPrompt, aiService)
|
||||||
|
|
||||||
# Validate JSON structure
|
# Validate JSON structure
|
||||||
if not self._validateJsonStructure(json_content):
|
if not self._validateJsonStructure(json_content):
|
||||||
|
|
@ -157,12 +163,17 @@ class RendererPdf(BaseRenderer):
|
||||||
self.logger.error(f"Error generating PDF from JSON: {str(e)}")
|
self.logger.error(f"Error generating PDF from JSON: {str(e)}")
|
||||||
raise Exception(f"PDF generation failed: {str(e)}")
|
raise Exception(f"PDF generation failed: {str(e)}")
|
||||||
|
|
||||||
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||||
"""Get style set - default styles, enhanced with AI if userPrompt provided.
|
"""Get style set - use styles from document generation metadata if available,
|
||||||
|
otherwise enhance default styles with AI if userPrompt provided.
|
||||||
|
|
||||||
|
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
|
||||||
|
not be generated separately by renderers. Only fall back to AI if styles not provided.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
extractedContent: Document content with metadata (may contain styles)
|
||||||
userPrompt: User's prompt (AI will detect style instructions in any language)
|
userPrompt: User's prompt (AI will detect style instructions in any language)
|
||||||
aiService: AI service (used only if userPrompt provided)
|
aiService: AI service (used only if styles not in metadata and userPrompt provided)
|
||||||
templateName: Name of template style set (None = default)
|
templateName: Name of template style set (None = default)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -171,10 +182,19 @@ class RendererPdf(BaseRenderer):
|
||||||
# Get default style set
|
# Get default style set
|
||||||
defaultStyleSet = self._getDefaultStyleSet()
|
defaultStyleSet = self._getDefaultStyleSet()
|
||||||
|
|
||||||
# Enhance with AI if userPrompt provided (AI handles multilingual style detection)
|
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
|
||||||
|
if extractedContent:
|
||||||
|
metadata = extractedContent.get("metadata", {})
|
||||||
|
if isinstance(metadata, dict):
|
||||||
|
styles = metadata.get("styles")
|
||||||
|
if styles and isinstance(styles, dict):
|
||||||
|
self.logger.debug("Using styles from document generation metadata")
|
||||||
|
enhancedStyleSet = self._convertColorsFormat(styles)
|
||||||
|
return self._validateStylesContrast(enhancedStyleSet)
|
||||||
|
|
||||||
|
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
|
||||||
if userPrompt and aiService:
|
if userPrompt and aiService:
|
||||||
# AI will naturally detect style instructions in any language
|
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
|
||||||
self.logger.info(f"Enhancing styles with AI based on user prompt...")
|
|
||||||
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
||||||
# Convert colors to PDF format after getting styles
|
# Convert colors to PDF format after getting styles
|
||||||
enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet)
|
enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet)
|
||||||
|
|
@ -545,7 +565,21 @@ class RendererPdf(BaseRenderer):
|
||||||
all_elements.append(Spacer(1, 6))
|
all_elements.append(Spacer(1, 6))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Standard section types
|
# Check element type, not section type (elements can have different types than section)
|
||||||
|
if element_type == "table":
|
||||||
|
all_elements.extend(self._renderJsonTable(element, styles))
|
||||||
|
elif element_type == "bullet_list":
|
||||||
|
all_elements.extend(self._renderJsonBulletList(element, styles))
|
||||||
|
elif element_type == "heading":
|
||||||
|
all_elements.extend(self._renderJsonHeading(element, styles))
|
||||||
|
elif element_type == "paragraph":
|
||||||
|
all_elements.extend(self._renderJsonParagraph(element, styles))
|
||||||
|
elif element_type == "code_block":
|
||||||
|
all_elements.extend(self._renderJsonCodeBlock(element, styles))
|
||||||
|
elif element_type == "image":
|
||||||
|
all_elements.extend(self._renderJsonImage(element, styles))
|
||||||
|
else:
|
||||||
|
# Fallback: if element_type not set, use section_type as fallback
|
||||||
if section_type == "table":
|
if section_type == "table":
|
||||||
all_elements.extend(self._renderJsonTable(element, styles))
|
all_elements.extend(self._renderJsonTable(element, styles))
|
||||||
elif section_type == "bullet_list":
|
elif section_type == "bullet_list":
|
||||||
|
|
@ -559,7 +593,7 @@ class RendererPdf(BaseRenderer):
|
||||||
elif section_type == "image":
|
elif section_type == "image":
|
||||||
all_elements.extend(self._renderJsonImage(element, styles))
|
all_elements.extend(self._renderJsonImage(element, styles))
|
||||||
else:
|
else:
|
||||||
# Fallback to paragraph for unknown types
|
# Final fallback to paragraph for unknown types
|
||||||
all_elements.extend(self._renderJsonParagraph(element, styles))
|
all_elements.extend(self._renderJsonParagraph(element, styles))
|
||||||
|
|
||||||
return all_elements
|
return all_elements
|
||||||
|
|
@ -571,8 +605,13 @@ class RendererPdf(BaseRenderer):
|
||||||
def _renderJsonTable(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
def _renderJsonTable(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||||
"""Render a JSON table to PDF elements using AI-generated styles."""
|
"""Render a JSON table to PDF elements using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
headers = table_data.get("headers", [])
|
# Handle nested content structure: element.content.headers vs element.headers
|
||||||
rows = table_data.get("rows", [])
|
# Extract from nested content structure
|
||||||
|
content = table_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return []
|
||||||
|
headers = content.get("headers", [])
|
||||||
|
rows = content.get("rows", [])
|
||||||
|
|
||||||
if not headers or not rows:
|
if not headers or not rows:
|
||||||
return []
|
return []
|
||||||
|
|
@ -588,13 +627,13 @@ class RendererPdf(BaseRenderer):
|
||||||
table_cell_style = styles.get("table_cell", {})
|
table_cell_style = styles.get("table_cell", {})
|
||||||
|
|
||||||
table_style = [
|
table_style = [
|
||||||
('BACKGROUND', (0, 0), (-1, 0), self._hex_to_color(table_header_style.get("background", "#4F4F4F"))),
|
('BACKGROUND', (0, 0), (-1, 0), self._hexToColor(table_header_style.get("background", "#4F4F4F"))),
|
||||||
('TEXTCOLOR', (0, 0), (-1, 0), self._hex_to_color(table_header_style.get("text_color", "#FFFFFF"))),
|
('TEXTCOLOR', (0, 0), (-1, 0), self._hexToColor(table_header_style.get("text_color", "#FFFFFF"))),
|
||||||
('ALIGN', (0, 0), (-1, -1), self._getTableAlignment(table_cell_style.get("align", "left"))),
|
('ALIGN', (0, 0), (-1, -1), self._getTableAlignment(table_cell_style.get("align", "left"))),
|
||||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold' if table_header_style.get("bold", True) else 'Helvetica'),
|
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold' if table_header_style.get("bold", True) else 'Helvetica'),
|
||||||
('FONTSIZE', (0, 0), (-1, 0), table_header_style.get("font_size", 12)),
|
('FONTSIZE', (0, 0), (-1, 0), table_header_style.get("font_size", 12)),
|
||||||
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
|
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
|
||||||
('BACKGROUND', (0, 1), (-1, -1), self._hex_to_color(table_cell_style.get("background", "#FFFFFF"))),
|
('BACKGROUND', (0, 1), (-1, -1), self._hexToColor(table_cell_style.get("background", "#FFFFFF"))),
|
||||||
('FONTSIZE', (0, 1), (-1, -1), table_cell_style.get("font_size", 10)),
|
('FONTSIZE', (0, 1), (-1, -1), table_cell_style.get("font_size", 10)),
|
||||||
('GRID', (0, 0), (-1, -1), 1, colors.black)
|
('GRID', (0, 0), (-1, -1), 1, colors.black)
|
||||||
]
|
]
|
||||||
|
|
@ -610,7 +649,11 @@ class RendererPdf(BaseRenderer):
|
||||||
def _renderJsonBulletList(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
def _renderJsonBulletList(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||||
"""Render a JSON bullet list to PDF elements using AI-generated styles."""
|
"""Render a JSON bullet list to PDF elements using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
items = list_data.get("items", [])
|
# Extract from nested content structure
|
||||||
|
content = list_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return []
|
||||||
|
items = content.get("items", [])
|
||||||
bullet_style_def = styles.get("bullet_list", {})
|
bullet_style_def = styles.get("bullet_list", {})
|
||||||
|
|
||||||
elements = []
|
elements = []
|
||||||
|
|
@ -632,8 +675,12 @@ class RendererPdf(BaseRenderer):
|
||||||
def _renderJsonHeading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
def _renderJsonHeading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||||
"""Render a JSON heading to PDF elements using AI-generated styles."""
|
"""Render a JSON heading to PDF elements using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
level = heading_data.get("level", 1)
|
# Extract from nested content structure
|
||||||
text = heading_data.get("text", "")
|
content = heading_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return []
|
||||||
|
text = content.get("text", "")
|
||||||
|
level = content.get("level", 1)
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
level = max(1, min(6, level))
|
level = max(1, min(6, level))
|
||||||
|
|
@ -649,7 +696,14 @@ class RendererPdf(BaseRenderer):
|
||||||
def _renderJsonParagraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
def _renderJsonParagraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||||
"""Render a JSON paragraph to PDF elements using AI-generated styles."""
|
"""Render a JSON paragraph to PDF elements using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
text = paragraph_data.get("text", "")
|
# Extract from nested content structure
|
||||||
|
content = paragraph_data.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
text = content
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
return [Paragraph(text, self._createNormalStyle(styles))]
|
return [Paragraph(text, self._createNormalStyle(styles))]
|
||||||
|
|
@ -663,8 +717,12 @@ class RendererPdf(BaseRenderer):
|
||||||
def _renderJsonCodeBlock(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
def _renderJsonCodeBlock(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||||
"""Render a JSON code block to PDF elements using AI-generated styles."""
|
"""Render a JSON code block to PDF elements using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
code = code_data.get("code", "")
|
# Extract from nested content structure
|
||||||
language = code_data.get("language", "")
|
content = code_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return []
|
||||||
|
code = content.get("code", "")
|
||||||
|
language = content.get("language", "")
|
||||||
code_style_def = styles.get("code_block", {})
|
code_style_def = styles.get("code_block", {})
|
||||||
|
|
||||||
if code:
|
if code:
|
||||||
|
|
@ -700,14 +758,34 @@ class RendererPdf(BaseRenderer):
|
||||||
def _renderJsonImage(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
def _renderJsonImage(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||||
"""Render a JSON image to PDF elements using reportlab."""
|
"""Render a JSON image to PDF elements using reportlab."""
|
||||||
try:
|
try:
|
||||||
|
# Extract from nested content structure
|
||||||
|
content = image_data.get("content", {})
|
||||||
|
base64_data = ""
|
||||||
|
alt_text = "Image"
|
||||||
|
caption = ""
|
||||||
|
|
||||||
|
if isinstance(content, dict):
|
||||||
|
# Nested content structure
|
||||||
|
base64_data = content.get("base64Data", "")
|
||||||
|
alt_text = content.get("altText", "Image")
|
||||||
|
caption = content.get("caption", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
# Content might be base64 string directly (shouldn't happen, but handle it)
|
||||||
|
self.logger.warning("Image content is a string, not a dict. This should not happen.")
|
||||||
|
return [Paragraph(f"[Image: Invalid format]", self._createNormalStyle(styles))]
|
||||||
|
|
||||||
|
# If base64Data not found in content, try direct element fields (fallback)
|
||||||
|
if not base64_data:
|
||||||
base64_data = image_data.get("base64Data", "")
|
base64_data = image_data.get("base64Data", "")
|
||||||
|
if not alt_text or alt_text == "Image":
|
||||||
alt_text = image_data.get("altText", "Image")
|
alt_text = image_data.get("altText", "Image")
|
||||||
|
if not caption:
|
||||||
caption = image_data.get("caption", "")
|
caption = image_data.get("caption", "")
|
||||||
|
|
||||||
# If base64Data not found, try extracting from url data URI
|
# If base64Data still not found, try extracting from url data URI
|
||||||
if not base64_data:
|
if not base64_data:
|
||||||
url = image_data.get("url", "")
|
url = image_data.get("url", "") or (content.get("url", "") if isinstance(content, dict) else "")
|
||||||
if url.startswith("data:image/"):
|
if url and isinstance(url, str) and url.startswith("data:image/"):
|
||||||
# Extract base64 from data URI: data:image/png;base64,<base64>
|
# Extract base64 from data URI: data:image/png;base64,<base64>
|
||||||
import re
|
import re
|
||||||
match = re.match(r'data:image/[^;]+;base64,(.+)', url)
|
match = re.match(r'data:image/[^;]+;base64,(.+)', url)
|
||||||
|
|
@ -715,8 +793,18 @@ class RendererPdf(BaseRenderer):
|
||||||
base64_data = match.group(1)
|
base64_data = match.group(1)
|
||||||
|
|
||||||
if not base64_data:
|
if not base64_data:
|
||||||
|
self.logger.warning(f"No base64 data found for image. Alt text: {alt_text}")
|
||||||
return [Paragraph(f"[Image: {alt_text}]", self._createNormalStyle(styles))]
|
return [Paragraph(f"[Image: {alt_text}]", self._createNormalStyle(styles))]
|
||||||
|
|
||||||
|
# Validate that base64_data is actually base64 (not the entire element rendered as text)
|
||||||
|
if len(base64_data) > 10000: # Very long string might be entire element JSON
|
||||||
|
self.logger.warning(f"Base64 data seems too long ({len(base64_data)} chars), might be incorrectly extracted")
|
||||||
|
|
||||||
|
# Ensure base64_data is a string, not bytes or other type
|
||||||
|
if not isinstance(base64_data, str):
|
||||||
|
self.logger.warning(f"Base64 data is not a string: {type(base64_data)}")
|
||||||
|
return [Paragraph(f"[Image: {alt_text} - Invalid data type]", self._createNormalStyle(styles))]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from reportlab.platypus import Image as ReportLabImage
|
from reportlab.platypus import Image as ReportLabImage
|
||||||
from reportlab.lib.units import inch
|
from reportlab.lib.units import inch
|
||||||
|
|
@ -731,25 +819,61 @@ class RendererPdf(BaseRenderer):
|
||||||
# Try to get image dimensions from PIL
|
# Try to get image dimensions from PIL
|
||||||
try:
|
try:
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
pilImage = PILImage.open(imageStream)
|
from reportlab.lib.pagesizes import A4
|
||||||
imgWidth, imgHeight = pilImage.size
|
|
||||||
|
|
||||||
# Scale to fit page (max width 6 inches, maintain aspect ratio)
|
pilImage = PILImage.open(imageStream)
|
||||||
maxWidth = 6 * inch
|
originalWidth, originalHeight = pilImage.size
|
||||||
if imgWidth > maxWidth:
|
|
||||||
scale = maxWidth / imgWidth
|
# Calculate available page dimensions (A4 with margins: 72pt left/right, 72pt top, 18pt bottom)
|
||||||
imgWidth = maxWidth
|
pageWidth = A4[0] # 595.27 points
|
||||||
|
pageHeight = A4[1] # 841.89 points
|
||||||
|
leftMargin = 72
|
||||||
|
rightMargin = 72
|
||||||
|
topMargin = 72
|
||||||
|
bottomMargin = 18
|
||||||
|
|
||||||
|
# Use actual frame dimensions from SimpleDocTemplate
|
||||||
|
# Frame is smaller than page minus margins due to internal spacing
|
||||||
|
# From error message: frame is 439.27559055118115 x 739.8897637795277
|
||||||
|
# Use conservative values with safety margin
|
||||||
|
availableWidth = 430.0 # Slightly smaller than frame width for safety
|
||||||
|
availableHeight = 730.0 # Slightly smaller than frame height for safety
|
||||||
|
|
||||||
|
# Convert original image size from pixels to points (assuming 72 DPI)
|
||||||
|
# If image DPI is different, PIL will provide correct size
|
||||||
|
# For safety, use a conservative conversion
|
||||||
|
imgWidthPoints = originalWidth * (inch / 72) # Convert to inches, then to points
|
||||||
|
imgHeightPoints = originalHeight * (inch / 72)
|
||||||
|
|
||||||
|
# Scale to fit within available page dimensions while maintaining aspect ratio
|
||||||
|
widthScale = availableWidth / imgWidthPoints if imgWidthPoints > 0 else 1.0
|
||||||
|
heightScale = availableHeight / imgHeightPoints if imgHeightPoints > 0 else 1.0
|
||||||
|
|
||||||
|
# Use the smaller scale to ensure image fits both width and height
|
||||||
|
scale = min(widthScale, heightScale, 1.0) # Don't scale up, only down
|
||||||
|
|
||||||
|
imgWidth = imgWidthPoints * scale
|
||||||
|
imgHeight = imgHeightPoints * scale
|
||||||
|
|
||||||
|
# Additional safety check: ensure dimensions don't exceed available space
|
||||||
|
if imgWidth > availableWidth:
|
||||||
|
scale = availableWidth / imgWidth
|
||||||
|
imgWidth = availableWidth
|
||||||
imgHeight = imgHeight * scale
|
imgHeight = imgHeight * scale
|
||||||
else:
|
|
||||||
imgWidth = imgWidth * (inch / 72) # Convert pixels to inches (assuming 72 DPI)
|
if imgHeight > availableHeight:
|
||||||
imgHeight = imgHeight * (inch / 72)
|
scale = availableHeight / imgHeight
|
||||||
|
imgHeight = availableHeight
|
||||||
|
imgWidth = imgWidth * scale
|
||||||
|
|
||||||
# Reset stream for reportlab
|
# Reset stream for reportlab
|
||||||
imageStream.seek(0)
|
imageStream.seek(0)
|
||||||
except Exception:
|
except Exception as e:
|
||||||
# Fallback: use default size
|
# Fallback: use default size that fits page
|
||||||
imgWidth = 4 * inch
|
self.logger.warning(f"Error calculating image size: {str(e)}, using safe default")
|
||||||
imgHeight = 3 * inch
|
# Use 80% of available width as safe default
|
||||||
|
imgWidth = 4 * inch # ~288 points, safe for ~451pt available width
|
||||||
|
imgHeight = 3 * inch # ~216 points, safe for ~751pt available height
|
||||||
imageStream.seek(0)
|
imageStream.seek(0)
|
||||||
|
|
||||||
# Create reportlab Image
|
# Create reportlab Image
|
||||||
|
|
@ -773,10 +897,16 @@ class RendererPdf(BaseRenderer):
|
||||||
return elements
|
return elements
|
||||||
|
|
||||||
except Exception as imgError:
|
except Exception as imgError:
|
||||||
self.logger.warning(f"Error embedding image in PDF: {str(imgError)}")
|
self.logger.error(f"Error embedding image in PDF: {str(imgError)}")
|
||||||
# Fallback to placeholder
|
# Return error message instead of placeholder
|
||||||
return [Paragraph(f"[Image: {alt_text}]", self._createNormalStyle(styles))]
|
errorStyle = self._createNormalStyle(styles)
|
||||||
|
errorStyle.textColor = self._hexToColor("#FF0000") # Red color for error
|
||||||
|
errorMsg = f"[Error: Could not embed image '{alt_text}'. {str(imgError)}]"
|
||||||
|
return [Paragraph(errorMsg, errorStyle)]
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f"Error rendering image: {str(e)}")
|
self.logger.error(f"Error rendering image: {str(e)}")
|
||||||
return [Paragraph(f"[Image: {image_data.get('altText', 'Image')}]", self._createNormalStyle(styles))]
|
errorStyle = self._createNormalStyle(styles)
|
||||||
|
errorStyle.textColor = self._hexToColor("#FF0000") # Red color for error
|
||||||
|
errorMsg = f"[Error: Could not render image '{image_data.get('altText', 'Image')}'. {str(e)}]"
|
||||||
|
return [Paragraph(errorMsg, errorStyle)]
|
||||||
|
|
@ -48,8 +48,8 @@ class RendererPptx(BaseRenderer):
|
||||||
from pptx.dml.color import RGBColor
|
from pptx.dml.color import RGBColor
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Get style set: default styles, enhanced with AI if userPrompt provided
|
# Get style set: use styles from metadata if available, otherwise enhance with AI
|
||||||
styles = await self._getStyleSet(userPrompt, aiService)
|
styles = await self._getStyleSet(extractedContent, userPrompt, aiService)
|
||||||
|
|
||||||
# Create new presentation
|
# Create new presentation
|
||||||
prs = Presentation()
|
prs = Presentation()
|
||||||
|
|
@ -99,7 +99,7 @@ class RendererPptx(BaseRenderer):
|
||||||
if title_shape.text_frame.paragraphs[0].font:
|
if title_shape.text_frame.paragraphs[0].font:
|
||||||
title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 44))
|
title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 44))
|
||||||
title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True)
|
title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True)
|
||||||
title_color = self._get_safe_color(title_style.get("color", (31, 78, 121)))
|
title_color = self._getSafeColor(title_style.get("color", (31, 78, 121)))
|
||||||
title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)
|
title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)
|
||||||
|
|
||||||
# Handle images first (if present)
|
# Handle images first (if present)
|
||||||
|
|
@ -133,7 +133,7 @@ class RendererPptx(BaseRenderer):
|
||||||
heading_style = styles.get("heading", {})
|
heading_style = styles.get("heading", {})
|
||||||
p.font.size = Pt(heading_style.get("font_size", 32))
|
p.font.size = Pt(heading_style.get("font_size", 32))
|
||||||
p.font.bold = heading_style.get("bold", True)
|
p.font.bold = heading_style.get("bold", True)
|
||||||
heading_color = self._get_safe_color(heading_style.get("color", (47, 47, 47)))
|
heading_color = self._getSafeColor(heading_style.get("color", (47, 47, 47)))
|
||||||
p.font.color.rgb = RGBColor(*heading_color)
|
p.font.color.rgb = RGBColor(*heading_color)
|
||||||
elif paragraph.startswith('##'):
|
elif paragraph.startswith('##'):
|
||||||
# Subheader
|
# Subheader
|
||||||
|
|
@ -141,7 +141,7 @@ class RendererPptx(BaseRenderer):
|
||||||
subheading_style = styles.get("subheading", {})
|
subheading_style = styles.get("subheading", {})
|
||||||
p.font.size = Pt(subheading_style.get("font_size", 24))
|
p.font.size = Pt(subheading_style.get("font_size", 24))
|
||||||
p.font.bold = subheading_style.get("bold", True)
|
p.font.bold = subheading_style.get("bold", True)
|
||||||
subheading_color = self._get_safe_color(subheading_style.get("color", (79, 79, 79)))
|
subheading_color = self._getSafeColor(subheading_style.get("color", (79, 79, 79)))
|
||||||
p.font.color.rgb = RGBColor(*subheading_color)
|
p.font.color.rgb = RGBColor(*subheading_color)
|
||||||
elif paragraph.startswith('*') and paragraph.endswith('*'):
|
elif paragraph.startswith('*') and paragraph.endswith('*'):
|
||||||
# Bold text
|
# Bold text
|
||||||
|
|
@ -149,14 +149,14 @@ class RendererPptx(BaseRenderer):
|
||||||
paragraph_style = styles.get("paragraph", {})
|
paragraph_style = styles.get("paragraph", {})
|
||||||
p.font.size = Pt(paragraph_style.get("font_size", 18))
|
p.font.size = Pt(paragraph_style.get("font_size", 18))
|
||||||
p.font.bold = True
|
p.font.bold = True
|
||||||
paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
|
paragraph_color = self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))
|
||||||
p.font.color.rgb = RGBColor(*paragraph_color)
|
p.font.color.rgb = RGBColor(*paragraph_color)
|
||||||
else:
|
else:
|
||||||
# Regular text
|
# Regular text
|
||||||
paragraph_style = styles.get("paragraph", {})
|
paragraph_style = styles.get("paragraph", {})
|
||||||
p.font.size = Pt(paragraph_style.get("font_size", 18))
|
p.font.size = Pt(paragraph_style.get("font_size", 18))
|
||||||
p.font.bold = paragraph_style.get("bold", False)
|
p.font.bold = paragraph_style.get("bold", False)
|
||||||
paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
|
paragraph_color = self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))
|
||||||
p.font.color.rgb = RGBColor(*paragraph_color)
|
p.font.color.rgb = RGBColor(*paragraph_color)
|
||||||
|
|
||||||
# Apply alignment
|
# Apply alignment
|
||||||
|
|
@ -181,7 +181,7 @@ class RendererPptx(BaseRenderer):
|
||||||
if title_shape.text_frame.paragraphs[0].font:
|
if title_shape.text_frame.paragraphs[0].font:
|
||||||
title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 48))
|
title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 48))
|
||||||
title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True)
|
title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True)
|
||||||
title_color = self._get_safe_color(title_style.get("color", (31, 78, 121)))
|
title_color = self._getSafeColor(title_style.get("color", (31, 78, 121)))
|
||||||
title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)
|
title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)
|
||||||
|
|
||||||
subtitle_shape = slide.placeholders[1]
|
subtitle_shape = slide.placeholders[1]
|
||||||
|
|
@ -215,32 +215,46 @@ class RendererPptx(BaseRenderer):
|
||||||
else:
|
else:
|
||||||
filename = self._determineFilename(title, "application/vnd.openxmlformats-officedocument.presentationml.presentation")
|
filename = self._determineFilename(title, "application/vnd.openxmlformats-officedocument.presentationml.presentation")
|
||||||
|
|
||||||
|
# Extract metadata for document type and other info
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=pptx_bytes,
|
documentData=pptx_bytes,
|
||||||
mimeType="application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
mimeType="application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||||
filename=filename
|
filename=filename,
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.error("python-pptx library not installed. Install with: pip install python-pptx")
|
logger.error("python-pptx library not installed. Install with: pip install python-pptx")
|
||||||
fallbackContent = "python-pptx library not installed"
|
fallbackContent = "python-pptx library not installed"
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=fallbackContent.encode('utf-8'),
|
documentData=fallbackContent.encode('utf-8'),
|
||||||
mimeType="text/plain",
|
mimeType="text/plain",
|
||||||
filename=self._determineFilename(title, "text/plain")
|
filename=self._determineFilename(title, "text/plain"),
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error rendering PowerPoint presentation: {str(e)}")
|
logger.error(f"Error rendering PowerPoint presentation: {str(e)}")
|
||||||
fallbackContent = f"Error rendering PowerPoint presentation: {str(e)}"
|
fallbackContent = f"Error rendering PowerPoint presentation: {str(e)}"
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=fallbackContent.encode('utf-8'),
|
documentData=fallbackContent.encode('utf-8'),
|
||||||
mimeType="text/plain",
|
mimeType="text/plain",
|
||||||
filename=self._determineFilename(title, "text/plain")
|
filename=self._determineFilename(title, "text/plain"),
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -349,12 +363,17 @@ class RendererPptx(BaseRenderer):
|
||||||
"""Get MIME type for rendered output."""
|
"""Get MIME type for rendered output."""
|
||||||
return self.outputMimeType
|
return self.outputMimeType
|
||||||
|
|
||||||
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||||
"""Get style set - default styles, enhanced with AI if userPrompt provided.
|
"""Get style set - use styles from document generation metadata if available,
|
||||||
|
otherwise enhance default styles with AI if userPrompt provided.
|
||||||
|
|
||||||
|
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
|
||||||
|
not be generated separately by renderers. Only fall back to AI if styles not provided.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
extractedContent: Document content with metadata (may contain styles)
|
||||||
userPrompt: User's prompt (AI will detect style instructions in any language)
|
userPrompt: User's prompt (AI will detect style instructions in any language)
|
||||||
aiService: AI service (used only if userPrompt provided)
|
aiService: AI service (used only if styles not in metadata and userPrompt provided)
|
||||||
templateName: Name of template style set (None = default)
|
templateName: Name of template style set (None = default)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -363,10 +382,19 @@ class RendererPptx(BaseRenderer):
|
||||||
# Get default style set
|
# Get default style set
|
||||||
defaultStyleSet = self._getDefaultStyleSet()
|
defaultStyleSet = self._getDefaultStyleSet()
|
||||||
|
|
||||||
# Enhance with AI if userPrompt provided (AI handles multilingual style detection)
|
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
|
||||||
|
if extractedContent:
|
||||||
|
metadata = extractedContent.get("metadata", {})
|
||||||
|
if isinstance(metadata, dict):
|
||||||
|
styles = metadata.get("styles")
|
||||||
|
if styles and isinstance(styles, dict):
|
||||||
|
self.logger.debug("Using styles from document generation metadata")
|
||||||
|
enhancedStyleSet = self._convertColorsFormat(styles)
|
||||||
|
return self._validateStylesReadability(enhancedStyleSet)
|
||||||
|
|
||||||
|
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
|
||||||
if userPrompt and aiService:
|
if userPrompt and aiService:
|
||||||
# AI will naturally detect style instructions in any language
|
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
|
||||||
self.logger.info(f"Enhancing styles with AI based on user prompt...")
|
|
||||||
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
||||||
# Convert colors to PPTX format after getting styles
|
# Convert colors to PPTX format after getting styles
|
||||||
enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet)
|
enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet)
|
||||||
|
|
@ -690,14 +718,27 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
|
|
||||||
# Handle image sections specially
|
# Handle image sections specially
|
||||||
if content_type == "image":
|
if content_type == "image":
|
||||||
# Extract image data
|
# Extract image data from nested content structure
|
||||||
images = []
|
images = []
|
||||||
for element in elements:
|
for element in elements:
|
||||||
if element.get("base64Data"):
|
if isinstance(element, dict):
|
||||||
|
# Extract from nested content structure
|
||||||
|
content = element.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
base64Data = content.get("base64Data")
|
||||||
|
altText = content.get("altText", "Image")
|
||||||
|
caption = content.get("caption", "")
|
||||||
|
else:
|
||||||
|
# Fallback to direct element fields
|
||||||
|
base64Data = element.get("base64Data")
|
||||||
|
altText = element.get("altText", "Image")
|
||||||
|
caption = element.get("caption", "")
|
||||||
|
|
||||||
|
if base64Data:
|
||||||
images.append({
|
images.append({
|
||||||
"base64Data": element.get("base64Data"),
|
"base64Data": base64Data,
|
||||||
"altText": element.get("altText", "Image"),
|
"altText": altText,
|
||||||
"caption": element.get("caption")
|
"caption": caption
|
||||||
})
|
})
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -719,7 +760,7 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
elif content_type == "code":
|
elif content_type == "code":
|
||||||
content_parts.append(self._formatCodeForSlide(elements))
|
content_parts.append(self._formatCodeForSlide(elements))
|
||||||
else:
|
else:
|
||||||
content_parts.append(self._format_paragraph_for_slide(elements))
|
content_parts.append(self._formatParagraphForSlide(elements))
|
||||||
|
|
||||||
# Combine content parts
|
# Combine content parts
|
||||||
slide_content = "\n\n".join(filter(None, content_parts))
|
slide_content = "\n\n".join(filter(None, content_parts))
|
||||||
|
|
@ -734,17 +775,20 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
logger.warning(f"Error creating slide from section: {str(e)}")
|
logger.warning(f"Error creating slide from section: {str(e)}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _formatTableForSlide(self, elements: List[Dict[str, Any]]) -> str:
|
def _formatTableForSlide(self, element: Dict[str, Any]) -> str:
|
||||||
"""Format table data for slide presentation."""
|
"""Format table data for slide presentation."""
|
||||||
try:
|
try:
|
||||||
# Extract table data from elements array
|
# Extract table data from element - handle nested content structure
|
||||||
headers = []
|
if not isinstance(element, dict):
|
||||||
rows = []
|
return ""
|
||||||
for element in elements:
|
|
||||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
# Extract from nested content structure
|
||||||
headers = element.get("headers", [])
|
content = element.get("content", {})
|
||||||
rows = element.get("rows", [])
|
if not isinstance(content, dict):
|
||||||
break
|
return ""
|
||||||
|
|
||||||
|
headers = content.get("headers", [])
|
||||||
|
rows = content.get("rows", [])
|
||||||
|
|
||||||
if not headers:
|
if not headers:
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -778,7 +822,11 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
def _formatListForSlide(self, list_data: Dict[str, Any]) -> str:
|
def _formatListForSlide(self, list_data: Dict[str, Any]) -> str:
|
||||||
"""Format list data for slide presentation."""
|
"""Format list data for slide presentation."""
|
||||||
try:
|
try:
|
||||||
items = list_data.get("items", [])
|
# Extract from nested content structure
|
||||||
|
content = list_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
items = content.get("items", [])
|
||||||
|
|
||||||
if not items:
|
if not items:
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -810,8 +858,12 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
def _formatHeadingForSlide(self, heading_data: Dict[str, Any]) -> str:
|
def _formatHeadingForSlide(self, heading_data: Dict[str, Any]) -> str:
|
||||||
"""Format heading data for slide presentation."""
|
"""Format heading data for slide presentation."""
|
||||||
try:
|
try:
|
||||||
text = heading_data.get("text", "")
|
# Extract from nested content structure
|
||||||
level = heading_data.get("level", 1)
|
content = heading_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
text = content.get("text", "")
|
||||||
|
level = content.get("level", 1)
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
return f"{'#' * level} {text}"
|
return f"{'#' * level} {text}"
|
||||||
|
|
@ -825,7 +877,14 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
def _formatParagraphForSlide(self, paragraph_data: Dict[str, Any]) -> str:
|
def _formatParagraphForSlide(self, paragraph_data: Dict[str, Any]) -> str:
|
||||||
"""Format paragraph data for slide presentation."""
|
"""Format paragraph data for slide presentation."""
|
||||||
try:
|
try:
|
||||||
text = paragraph_data.get("text", "")
|
# Extract from nested content structure
|
||||||
|
content = paragraph_data.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
text = content
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
# Limit paragraph length based on content density
|
# Limit paragraph length based on content density
|
||||||
|
|
@ -844,8 +903,12 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
def _formatCodeForSlide(self, code_data: Dict[str, Any]) -> str:
|
def _formatCodeForSlide(self, code_data: Dict[str, Any]) -> str:
|
||||||
"""Format code data for slide presentation."""
|
"""Format code data for slide presentation."""
|
||||||
try:
|
try:
|
||||||
code = code_data.get("code", "")
|
# Extract from nested content structure
|
||||||
language = code_data.get("language", "")
|
content = code_data.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
code = content.get("code", "")
|
||||||
|
language = content.get("language", "")
|
||||||
|
|
||||||
if code:
|
if code:
|
||||||
# Limit code length based on content density
|
# Limit code length based on content density
|
||||||
|
|
@ -912,6 +975,10 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
section_type = section.get("content_type", "paragraph")
|
section_type = section.get("content_type", "paragraph")
|
||||||
elements = section.get("elements", [])
|
elements = section.get("elements", [])
|
||||||
|
|
||||||
|
# Skip sections with no elements (unless they're headings that should create new slides)
|
||||||
|
if not elements and section_type != "heading":
|
||||||
|
continue
|
||||||
|
|
||||||
if section_type == "heading":
|
if section_type == "heading":
|
||||||
# If we have accumulated content, create a slide
|
# If we have accumulated content, create a slide
|
||||||
if current_slide_content:
|
if current_slide_content:
|
||||||
|
|
@ -923,10 +990,26 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
current_slide_content = []
|
current_slide_content = []
|
||||||
|
|
||||||
# Start new slide with heading as title
|
# Start new slide with heading as title
|
||||||
|
heading_found = False
|
||||||
for element in elements:
|
for element in elements:
|
||||||
if isinstance(element, dict) and "text" in element:
|
if isinstance(element, dict):
|
||||||
current_slide_title = element.get("text", "Untitled Section")
|
# Extract from nested content structure
|
||||||
|
content = element.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
heading_text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
heading_text = content
|
||||||
|
else:
|
||||||
|
heading_text = ""
|
||||||
|
|
||||||
|
if heading_text:
|
||||||
|
current_slide_title = heading_text
|
||||||
|
heading_found = True
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# If no heading text found but this is a heading section, use section ID or default
|
||||||
|
if not heading_found:
|
||||||
|
current_slide_title = section.get("id", "Untitled Section")
|
||||||
elif section_type == "image":
|
elif section_type == "image":
|
||||||
# Create separate slide for image
|
# Create separate slide for image
|
||||||
if current_slide_content:
|
if current_slide_content:
|
||||||
|
|
@ -940,11 +1023,24 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
# Extract image data
|
# Extract image data
|
||||||
imageData = []
|
imageData = []
|
||||||
for element in elements:
|
for element in elements:
|
||||||
if element.get("base64Data"):
|
if isinstance(element, dict):
|
||||||
|
# Extract from nested content structure
|
||||||
|
content = element.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
base64Data = content.get("base64Data")
|
||||||
|
altText = content.get("altText", "Image")
|
||||||
|
caption = content.get("caption", "")
|
||||||
|
else:
|
||||||
|
# Fallback to direct element fields
|
||||||
|
base64Data = element.get("base64Data")
|
||||||
|
altText = element.get("altText", "Image")
|
||||||
|
caption = element.get("caption", "")
|
||||||
|
|
||||||
|
if base64Data:
|
||||||
imageData.append({
|
imageData.append({
|
||||||
"base64Data": element.get("base64Data"),
|
"base64Data": base64Data,
|
||||||
"altText": element.get("altText", "Image"),
|
"altText": altText,
|
||||||
"caption": element.get("caption")
|
"caption": caption
|
||||||
})
|
})
|
||||||
|
|
||||||
slides.append({
|
slides.append({
|
||||||
|
|
@ -986,17 +1082,17 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
content_parts = []
|
content_parts = []
|
||||||
for element in elements:
|
for element in elements:
|
||||||
if content_type == "table":
|
if content_type == "table":
|
||||||
content_parts.append(self._formatTableForSlide([element]))
|
content_parts.append(self._formatTableForSlide(element))
|
||||||
elif content_type == "list":
|
elif content_type == "bullet_list" or content_type == "list":
|
||||||
content_parts.append(self._formatListForSlide([element]))
|
content_parts.append(self._formatListForSlide(element))
|
||||||
elif content_type == "heading":
|
elif content_type == "heading":
|
||||||
content_parts.append(self._formatHeadingForSlide([element]))
|
content_parts.append(self._formatHeadingForSlide(element))
|
||||||
elif content_type == "paragraph":
|
elif content_type == "paragraph":
|
||||||
content_parts.append(self._formatParagraphForSlide([element]))
|
content_parts.append(self._formatParagraphForSlide(element))
|
||||||
elif content_type == "code":
|
elif content_type == "code_block" or content_type == "code":
|
||||||
content_parts.append(self._formatCodeForSlide([element]))
|
content_parts.append(self._formatCodeForSlide(element))
|
||||||
else:
|
else:
|
||||||
content_parts.append(self._format_paragraph_for_slide([element]))
|
content_parts.append(self._formatParagraphForSlide(element))
|
||||||
|
|
||||||
return "\n\n".join(filter(None, content_parts))
|
return "\n\n".join(filter(None, content_parts))
|
||||||
|
|
||||||
|
|
@ -1009,6 +1105,7 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
try:
|
try:
|
||||||
from pptx.util import Inches, Pt
|
from pptx.util import Inches, Pt
|
||||||
from pptx.enum.text import PP_ALIGN
|
from pptx.enum.text import PP_ALIGN
|
||||||
|
from pptx.dml.color import RGBColor
|
||||||
import base64
|
import base64
|
||||||
import io
|
import io
|
||||||
|
|
||||||
|
|
@ -1106,7 +1203,25 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
slide.shapes.add_picture(imageStream, left, top, width=imgWidth, height=imgHeight)
|
slide.shapes.add_picture(imageStream, left, top, width=imgWidth, height=imgHeight)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Error adding images to slide: {str(e)}")
|
logger.error(f"Error embedding images in PPTX slide: {str(e)}")
|
||||||
|
# Add error message text box to slide
|
||||||
|
try:
|
||||||
|
from pptx.util import Inches, Pt
|
||||||
|
from pptx.enum.text import PP_ALIGN
|
||||||
|
errorMsg = f"[Error: Could not embed image(s). {str(e)}]"
|
||||||
|
errorBox = slide.shapes.add_textbox(
|
||||||
|
Inches(0.5),
|
||||||
|
Inches(2),
|
||||||
|
slideWidth - Inches(1),
|
||||||
|
Inches(0.5)
|
||||||
|
)
|
||||||
|
errorFrame = errorBox.text_frame
|
||||||
|
errorFrame.text = errorMsg
|
||||||
|
errorFrame.paragraphs[0].font.size = Pt(12)
|
||||||
|
errorFrame.paragraphs[0].font.color.rgb = RGBColor(255, 0, 0) # Red color
|
||||||
|
errorFrame.paragraphs[0].alignment = PP_ALIGN.LEFT
|
||||||
|
except Exception as errorBoxError:
|
||||||
|
logger.error(f"Could not add error message to slide: {str(errorBoxError)}")
|
||||||
|
|
||||||
def _formatTimestamp(self) -> str:
|
def _formatTimestamp(self) -> str:
|
||||||
"""Format current timestamp for presentation generation."""
|
"""Format current timestamp for presentation generation."""
|
||||||
|
|
|
||||||
|
|
@ -63,11 +63,17 @@ class RendererText(BaseRenderer):
|
||||||
else:
|
else:
|
||||||
filename = self._determineFilename(title, "text/plain")
|
filename = self._determineFilename(title, "text/plain")
|
||||||
|
|
||||||
|
# Extract metadata for document type and other info
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=textContent.encode('utf-8'),
|
documentData=textContent.encode('utf-8'),
|
||||||
mimeType="text/plain",
|
mimeType="text/plain",
|
||||||
filename=filename
|
filename=filename,
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -75,11 +81,15 @@ class RendererText(BaseRenderer):
|
||||||
self.logger.error(f"Error rendering text: {str(e)}")
|
self.logger.error(f"Error rendering text: {str(e)}")
|
||||||
# Return minimal text fallback
|
# Return minimal text fallback
|
||||||
fallbackContent = f"{title}\n\nError rendering report: {str(e)}"
|
fallbackContent = f"{title}\n\nError rendering report: {str(e)}"
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
return [
|
return [
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=fallbackContent.encode('utf-8'),
|
documentData=fallbackContent.encode('utf-8'),
|
||||||
mimeType="text/plain",
|
mimeType="text/plain",
|
||||||
filename=self._determineFilename(title, "text/plain")
|
filename=self._determineFilename(title, "text/plain"),
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -201,8 +211,12 @@ class RendererText(BaseRenderer):
|
||||||
def _renderJsonTable(self, tableData: Dict[str, Any]) -> str:
|
def _renderJsonTable(self, tableData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON table to text."""
|
"""Render a JSON table to text."""
|
||||||
try:
|
try:
|
||||||
headers = tableData.get("headers", [])
|
# Extract from nested content structure
|
||||||
rows = tableData.get("rows", [])
|
content = tableData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
headers = content.get("headers", [])
|
||||||
|
rows = content.get("rows", [])
|
||||||
|
|
||||||
if not headers or not rows:
|
if not headers or not rows:
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -231,7 +245,11 @@ class RendererText(BaseRenderer):
|
||||||
def _renderJsonBulletList(self, listData: Dict[str, Any]) -> str:
|
def _renderJsonBulletList(self, listData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON bullet list to text."""
|
"""Render a JSON bullet list to text."""
|
||||||
try:
|
try:
|
||||||
items = listData.get("items", [])
|
# Extract from nested content structure
|
||||||
|
content = listData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
items = content.get("items", [])
|
||||||
|
|
||||||
if not items:
|
if not items:
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -252,8 +270,12 @@ class RendererText(BaseRenderer):
|
||||||
def _renderJsonHeading(self, headingData: Dict[str, Any]) -> str:
|
def _renderJsonHeading(self, headingData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON heading to text."""
|
"""Render a JSON heading to text."""
|
||||||
try:
|
try:
|
||||||
level = headingData.get("level", 1)
|
# Extract from nested content structure
|
||||||
text = headingData.get("text", "")
|
content = headingData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
text = content.get("text", "")
|
||||||
|
level = content.get("level", 1)
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
level = max(1, min(6, level))
|
level = max(1, min(6, level))
|
||||||
|
|
@ -273,7 +295,14 @@ class RendererText(BaseRenderer):
|
||||||
def _renderJsonParagraph(self, paragraphData: Dict[str, Any]) -> str:
|
def _renderJsonParagraph(self, paragraphData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON paragraph to text."""
|
"""Render a JSON paragraph to text."""
|
||||||
try:
|
try:
|
||||||
text = paragraphData.get("text", "")
|
# Extract from nested content structure
|
||||||
|
content = paragraphData.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
text = content
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
return text if text else ""
|
return text if text else ""
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -283,8 +312,12 @@ class RendererText(BaseRenderer):
|
||||||
def _renderJsonCodeBlock(self, codeData: Dict[str, Any]) -> str:
|
def _renderJsonCodeBlock(self, codeData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON code block to text."""
|
"""Render a JSON code block to text."""
|
||||||
try:
|
try:
|
||||||
code = codeData.get("code", "")
|
# Extract from nested content structure
|
||||||
language = codeData.get("language", "")
|
content = codeData.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return ""
|
||||||
|
code = content.get("code", "")
|
||||||
|
language = content.get("language", "")
|
||||||
|
|
||||||
if code:
|
if code:
|
||||||
if language:
|
if language:
|
||||||
|
|
@ -301,9 +334,14 @@ class RendererText(BaseRenderer):
|
||||||
def _renderJsonImage(self, imageData: Dict[str, Any]) -> str:
|
def _renderJsonImage(self, imageData: Dict[str, Any]) -> str:
|
||||||
"""Render a JSON image to text."""
|
"""Render a JSON image to text."""
|
||||||
try:
|
try:
|
||||||
|
# Extract from nested content structure
|
||||||
|
content = imageData.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
altText = content.get("altText", "Image")
|
||||||
|
else:
|
||||||
altText = imageData.get("altText", "Image")
|
altText = imageData.get("altText", "Image")
|
||||||
return f"[Image: {altText}]"
|
return f"[Image: {altText}]"
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f"Error rendering image: {str(e)}")
|
self.logger.warning(f"Error rendering image: {str(e)}")
|
||||||
return f"[Image: {imageData.get('altText', 'Image')}]"
|
return f"[Image: Image]"
|
||||||
|
|
|
||||||
|
|
@ -50,6 +50,10 @@ class RendererXlsx(BaseRenderer):
|
||||||
# Generate Excel using AI-analyzed styling
|
# Generate Excel using AI-analyzed styling
|
||||||
excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, aiService)
|
excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, aiService)
|
||||||
|
|
||||||
|
# Extract metadata for document type and other info
|
||||||
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
|
||||||
# Determine filename from document or title
|
# Determine filename from document or title
|
||||||
documents = extractedContent.get("documents", [])
|
documents = extractedContent.get("documents", [])
|
||||||
if documents and isinstance(documents[0], dict):
|
if documents and isinstance(documents[0], dict):
|
||||||
|
|
@ -72,14 +76,27 @@ class RendererXlsx(BaseRenderer):
|
||||||
RenderedDocument(
|
RenderedDocument(
|
||||||
documentData=excel_bytes,
|
documentData=excel_bytes,
|
||||||
mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
filename=filename
|
filename=filename,
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f"Error rendering Excel: {str(e)}")
|
self.logger.error(f"Error rendering Excel: {str(e)}")
|
||||||
# Return CSV fallback
|
# Return CSV fallback with metadata
|
||||||
return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv"
|
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||||
|
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||||
|
fallbackContent = f"Title,Content\n{title},Error rendering Excel report: {str(e)}"
|
||||||
|
return [
|
||||||
|
RenderedDocument(
|
||||||
|
documentData=fallbackContent.encode('utf-8'),
|
||||||
|
mimeType="text/csv",
|
||||||
|
filename=self._determineFilename(title, "text/csv"),
|
||||||
|
documentType=documentType,
|
||||||
|
metadata=metadata if isinstance(metadata, dict) else None
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
def _generateExcel(self, content: str, title: str) -> str:
|
def _generateExcel(self, content: str, title: str) -> str:
|
||||||
"""Generate Excel content using openpyxl."""
|
"""Generate Excel content using openpyxl."""
|
||||||
|
|
@ -231,8 +248,8 @@ class RendererXlsx(BaseRenderer):
|
||||||
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(jsonContent)}", "EXCEL_RENDERER")
|
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(jsonContent)}", "EXCEL_RENDERER")
|
||||||
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(jsonContent.keys()) if isinstance(jsonContent, dict) else 'Not a dict'}", "EXCEL_RENDERER")
|
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(jsonContent.keys()) if isinstance(jsonContent, dict) else 'Not a dict'}", "EXCEL_RENDERER")
|
||||||
|
|
||||||
# Get style set: default styles, enhanced with AI if userPrompt provided
|
# Get style set: use styles from metadata if available, otherwise enhance with AI
|
||||||
styles = await self._getStyleSet(userPrompt, aiService)
|
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
|
||||||
|
|
||||||
# Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
|
# Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
|
||||||
if not self._validateJsonStructure(jsonContent):
|
if not self._validateJsonStructure(jsonContent):
|
||||||
|
|
@ -275,12 +292,17 @@ class RendererXlsx(BaseRenderer):
|
||||||
self.logger.error(f"Error generating Excel from JSON: {str(e)}")
|
self.logger.error(f"Error generating Excel from JSON: {str(e)}")
|
||||||
raise Exception(f"Excel generation failed: {str(e)}")
|
raise Exception(f"Excel generation failed: {str(e)}")
|
||||||
|
|
||||||
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||||
"""Get style set - default styles, enhanced with AI if userPrompt provided.
|
"""Get style set - use styles from document generation metadata if available,
|
||||||
|
otherwise enhance default styles with AI if userPrompt provided.
|
||||||
|
|
||||||
|
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
|
||||||
|
not be generated separately by renderers. Only fall back to AI if styles not provided.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
extractedContent: Document content with metadata (may contain styles)
|
||||||
userPrompt: User's prompt (AI will detect style instructions in any language)
|
userPrompt: User's prompt (AI will detect style instructions in any language)
|
||||||
aiService: AI service (used only if userPrompt provided)
|
aiService: AI service (used only if styles not in metadata and userPrompt provided)
|
||||||
templateName: Name of template style set (None = default)
|
templateName: Name of template style set (None = default)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -289,10 +311,19 @@ class RendererXlsx(BaseRenderer):
|
||||||
# Get default style set
|
# Get default style set
|
||||||
defaultStyleSet = self._getDefaultStyleSet()
|
defaultStyleSet = self._getDefaultStyleSet()
|
||||||
|
|
||||||
# Enhance with AI if userPrompt provided (AI handles multilingual style detection)
|
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
|
||||||
|
if extractedContent:
|
||||||
|
metadata = extractedContent.get("metadata", {})
|
||||||
|
if isinstance(metadata, dict):
|
||||||
|
styles = metadata.get("styles")
|
||||||
|
if styles and isinstance(styles, dict):
|
||||||
|
self.logger.debug("Using styles from document generation metadata")
|
||||||
|
enhancedStyleSet = self._convertColorsFormat(styles)
|
||||||
|
return self._validateStylesContrast(enhancedStyleSet)
|
||||||
|
|
||||||
|
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
|
||||||
if userPrompt and aiService:
|
if userPrompt and aiService:
|
||||||
# AI will naturally detect style instructions in any language
|
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
|
||||||
self.logger.info(f"Enhancing styles with AI based on user prompt...")
|
|
||||||
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
||||||
# Convert colors to Excel format after getting styles
|
# Convert colors to Excel format after getting styles
|
||||||
enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet)
|
enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet)
|
||||||
|
|
@ -462,86 +493,119 @@ class RendererXlsx(BaseRenderer):
|
||||||
|
|
||||||
# Create sheets
|
# Create sheets
|
||||||
for i, sheetName in enumerate(sheetNames):
|
for i, sheetName in enumerate(sheetNames):
|
||||||
|
# Sanitize sheet name before creating
|
||||||
|
sanitized_name = self._sanitizeSheetName(sheetName)
|
||||||
if i == 0:
|
if i == 0:
|
||||||
# Use the default sheet for the first sheet
|
# Use the default sheet for the first sheet
|
||||||
sheet = wb.active
|
sheet = wb.active
|
||||||
sheet.title = sheetName
|
sheet.title = sanitized_name
|
||||||
else:
|
else:
|
||||||
# Create additional sheets
|
# Create additional sheets
|
||||||
sheet = wb.create_sheet(sheetName, i)
|
sheet = wb.create_sheet(sanitized_name, i)
|
||||||
sheets[sheetName.lower()] = sheet
|
# Use sanitized name as key (lowercase for lookup)
|
||||||
|
sheets[sanitized_name.lower()] = sheet
|
||||||
|
|
||||||
return sheets
|
return sheets
|
||||||
|
|
||||||
|
def _sanitizeSheetName(self, name: str) -> str:
|
||||||
|
"""Sanitize sheet name: remove invalid characters and ensure valid length."""
|
||||||
|
if not name:
|
||||||
|
return "Sheet"
|
||||||
|
# Remove invalid characters: [ ] : * ? / \
|
||||||
|
invalid_chars = ['[', ']', ':', '*', '?', '/', '\\']
|
||||||
|
sanitized = name
|
||||||
|
for char in invalid_chars:
|
||||||
|
sanitized = sanitized.replace(char, '')
|
||||||
|
# Remove leading/trailing spaces and apostrophes
|
||||||
|
sanitized = sanitized.strip().strip("'")
|
||||||
|
# Ensure not empty
|
||||||
|
if not sanitized:
|
||||||
|
sanitized = "Sheet"
|
||||||
|
# Excel sheet name limit is 31 characters
|
||||||
|
return sanitized[:31]
|
||||||
|
|
||||||
def _generateSheetNamesFromContent(self, jsonContent: Dict[str, Any]) -> List[str]:
|
def _generateSheetNamesFromContent(self, jsonContent: Dict[str, Any]) -> List[str]:
|
||||||
"""Generate sheet names based on actual content structure."""
|
"""Generate sheet names: each heading section creates a new tab."""
|
||||||
sections = self._extractSections(jsonContent)
|
sections = self._extractSections(jsonContent)
|
||||||
|
|
||||||
# If no sections, create a single sheet
|
# If no sections, create a single sheet
|
||||||
if not sections:
|
if not sections:
|
||||||
return ["Content"]
|
return ["Content"]
|
||||||
|
|
||||||
# Generate sheet names based on content structure
|
# Simple logic: each heading section creates a new tab
|
||||||
sheetNames = []
|
sheetNames = []
|
||||||
|
for section in sections:
|
||||||
# Check if we have multiple table sections
|
if section.get("content_type") == "heading":
|
||||||
tableSections = [s for s in sections if s.get("content_type") == "table"]
|
# Extract heading text from elements
|
||||||
|
|
||||||
if len(tableSections) > 1:
|
|
||||||
# Create separate sheets for each table
|
|
||||||
for i, section in enumerate(tableSections, 1):
|
|
||||||
# Try to get caption from table element first, then section title, then fallback
|
|
||||||
sectionTitle = None
|
|
||||||
elements = section.get("elements", [])
|
elements = section.get("elements", [])
|
||||||
if elements and isinstance(elements, list) and len(elements) > 0:
|
if elements and isinstance(elements, list) and len(elements) > 0:
|
||||||
tableElement = elements[0]
|
headingElement = elements[0]
|
||||||
sectionTitle = tableElement.get("caption")
|
content = headingElement.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
if not sectionTitle:
|
headingText = content.get("text", "")
|
||||||
sectionTitle = section.get("title")
|
elif isinstance(content, str):
|
||||||
|
headingText = content
|
||||||
if not sectionTitle:
|
|
||||||
sectionTitle = f"Table {i}"
|
|
||||||
|
|
||||||
sheetNames.append(sectionTitle[:31]) # Excel sheet name limit
|
|
||||||
else:
|
else:
|
||||||
# Single table or mixed content - create only main sheet
|
headingText = ""
|
||||||
|
|
||||||
|
if headingText:
|
||||||
|
sanitized_name = self._sanitizeSheetName(headingText)
|
||||||
|
# Ensure unique sheet names
|
||||||
|
if sanitized_name not in sheetNames:
|
||||||
|
sheetNames.append(sanitized_name)
|
||||||
|
else:
|
||||||
|
# Add number suffix for duplicates
|
||||||
|
counter = 1
|
||||||
|
base_name = sanitized_name[:28] # Leave room for " (1)"
|
||||||
|
while f"{base_name} ({counter})" in sheetNames:
|
||||||
|
counter += 1
|
||||||
|
sheetNames.append(f"{base_name} ({counter})"[:31])
|
||||||
|
|
||||||
|
# If no headings found, use document title
|
||||||
|
if not sheetNames:
|
||||||
documentTitle = jsonContent.get("metadata", {}).get("title", "Document")
|
documentTitle = jsonContent.get("metadata", {}).get("title", "Document")
|
||||||
sheetNames.append(documentTitle[:31]) # Excel sheet name limit
|
sheetNames.append(self._sanitizeSheetName(documentTitle))
|
||||||
|
|
||||||
return sheetNames
|
return sheetNames
|
||||||
|
|
||||||
def _populateExcelSheets(self, sheets: Dict[str, Any], jsonContent: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
def _populateExcelSheets(self, sheets: Dict[str, Any], jsonContent: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
"""Populate Excel sheets with content from JSON based on actual sheet names."""
|
"""Populate Excel sheets: each heading creates a new tab, all following content goes in that tab."""
|
||||||
try:
|
try:
|
||||||
# Get the actual sheet names that were created
|
# Get the actual sheet names that were created (keys are lowercase)
|
||||||
sheetNames = list(sheets.keys())
|
sheetNames = list(sheets.keys())
|
||||||
|
|
||||||
if not sheetNames:
|
if not sheetNames:
|
||||||
return
|
return
|
||||||
|
|
||||||
sections = self._extractSections(jsonContent)
|
sections = self._extractSections(jsonContent)
|
||||||
tableSections = [s for s in sections if s.get("content_type") == "table"]
|
|
||||||
|
|
||||||
if len(tableSections) > 1:
|
# Simple logic: iterate through sections, each heading creates a new tab
|
||||||
# Multiple tables - populate each sheet with its corresponding table
|
currentSheetIndex = 0
|
||||||
for i, section in enumerate(tableSections):
|
currentSheet = None
|
||||||
if i < len(sheetNames):
|
currentRow = 1
|
||||||
sheetName = sheetNames[i]
|
|
||||||
sheet = sheets[sheetName]
|
for section in sections:
|
||||||
# Use the caption from table element as sheet title, or fallback to sheet name
|
contentType = section.get("content_type", "paragraph")
|
||||||
sheetTitle = sheetName
|
|
||||||
elements = section.get("elements", [])
|
# Heading section: switch to next sheet
|
||||||
if elements and isinstance(elements, list) and len(elements) > 0:
|
if contentType == "heading":
|
||||||
tableElement = elements[0]
|
if currentSheetIndex < len(sheetNames):
|
||||||
caption = tableElement.get("caption")
|
sheetName = sheetNames[currentSheetIndex]
|
||||||
if caption:
|
currentSheet = sheets[sheetName] # sheets dict uses lowercase keys
|
||||||
sheetTitle = caption
|
currentSheetIndex += 1
|
||||||
self._populateTableSheet(sheet, section, styles, sheetTitle)
|
currentRow = 1 # Start at row 1 for new sheet
|
||||||
else:
|
else:
|
||||||
# Single table or mixed content - populate only main sheet
|
# More headings than sheets - use last sheet
|
||||||
firstSheetName = sheetNames[0]
|
if sheetNames:
|
||||||
self._populateMainSheet(sheets[firstSheetName], jsonContent, styles)
|
currentSheet = sheets[sheetNames[-1]]
|
||||||
|
|
||||||
|
# Render content in current sheet (or first sheet if no headings yet)
|
||||||
|
if currentSheet is None and sheetNames:
|
||||||
|
currentSheet = sheets[sheetNames[0]]
|
||||||
|
|
||||||
|
if currentSheet:
|
||||||
|
currentRow = self._addSectionToSheet(currentSheet, section, styles, currentRow)
|
||||||
|
currentRow += 1 # Add spacing between sections
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f"Could not populate Excel sheets: {str(e)}")
|
self.logger.warning(f"Could not populate Excel sheets: {str(e)}")
|
||||||
|
|
@ -558,9 +622,15 @@ class RendererXlsx(BaseRenderer):
|
||||||
# Get table data from elements (canonical JSON format)
|
# Get table data from elements (canonical JSON format)
|
||||||
elements = section.get("elements", [])
|
elements = section.get("elements", [])
|
||||||
if elements and isinstance(elements, list) and len(elements) > 0:
|
if elements and isinstance(elements, list) and len(elements) > 0:
|
||||||
table_data = elements[0]
|
table_element = elements[0]
|
||||||
headers = table_data.get("headers", [])
|
# Extract from nested content structure
|
||||||
rows = table_data.get("rows", [])
|
content = table_element.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
headers = []
|
||||||
|
rows = []
|
||||||
|
else:
|
||||||
|
headers = content.get("headers", [])
|
||||||
|
rows = content.get("rows", [])
|
||||||
else:
|
else:
|
||||||
headers = []
|
headers = []
|
||||||
rows = []
|
rows = []
|
||||||
|
|
@ -578,11 +648,28 @@ class RendererXlsx(BaseRenderer):
|
||||||
if header_style.get("background"):
|
if header_style.get("background"):
|
||||||
cell.fill = PatternFill(start_color=self._getSafeColor(header_style["background"]), end_color=self._getSafeColor(header_style["background"]), fill_type="solid")
|
cell.fill = PatternFill(start_color=self._getSafeColor(header_style["background"]), end_color=self._getSafeColor(header_style["background"]), fill_type="solid")
|
||||||
|
|
||||||
# Add rows
|
# Add rows - handle both array format and cells object format
|
||||||
cell_style = styles.get("table_cell", {})
|
cell_style = styles.get("table_cell", {})
|
||||||
for row_idx, row_data in enumerate(rows, 4):
|
for row_idx, row_data in enumerate(rows, 4):
|
||||||
for col_idx, cell_value in enumerate(row_data, 1):
|
# Handle different row formats
|
||||||
cell = sheet.cell(row=row_idx, column=col_idx, value=cell_value)
|
if isinstance(row_data, list):
|
||||||
|
# Array format: [value1, value2, ...]
|
||||||
|
cell_values = row_data
|
||||||
|
elif isinstance(row_data, dict) and "cells" in row_data:
|
||||||
|
# Cells object format: {"cells": [{"value": ...}, ...]}
|
||||||
|
cell_values = [cell_obj.get("value", "") for cell_obj in row_data.get("cells", [])]
|
||||||
|
else:
|
||||||
|
# Unknown format, skip
|
||||||
|
continue
|
||||||
|
|
||||||
|
for col_idx, cell_value in enumerate(cell_values, 1):
|
||||||
|
# Extract value if it's a dict with "value" key
|
||||||
|
if isinstance(cell_value, dict):
|
||||||
|
actual_value = cell_value.get("value", "")
|
||||||
|
else:
|
||||||
|
actual_value = cell_value
|
||||||
|
|
||||||
|
cell = sheet.cell(row=row_idx, column=col_idx, value=actual_value)
|
||||||
if cell_style.get("text_color"):
|
if cell_style.get("text_color"):
|
||||||
cell.font = Font(color=self._getSafeColor(cell_style["text_color"]))
|
cell.font = Font(color=self._getSafeColor(cell_style["text_color"]))
|
||||||
|
|
||||||
|
|
@ -714,6 +801,21 @@ class RendererXlsx(BaseRenderer):
|
||||||
# Handle all section types using elements array
|
# Handle all section types using elements array
|
||||||
elements = section.get("elements", [])
|
elements = section.get("elements", [])
|
||||||
for element in elements:
|
for element in elements:
|
||||||
|
# Check element type, not section type (elements can have different types than section)
|
||||||
|
element_type = element.get("type", "") if isinstance(element, dict) else ""
|
||||||
|
|
||||||
|
if element_type == "table":
|
||||||
|
startRow = self._addTableToExcel(sheet, element, styles, startRow)
|
||||||
|
elif element_type == "bullet_list" or element_type == "list":
|
||||||
|
startRow = self._addListToExcel(sheet, element, styles, startRow)
|
||||||
|
elif element_type == "paragraph":
|
||||||
|
startRow = self._addParagraphToExcel(sheet, element, styles, startRow)
|
||||||
|
elif element_type == "heading":
|
||||||
|
startRow = self._addHeadingToExcel(sheet, element, styles, startRow)
|
||||||
|
elif element_type == "image":
|
||||||
|
startRow = self._addImageToExcel(sheet, element, styles, startRow)
|
||||||
|
else:
|
||||||
|
# Fallback: if element_type not set, use section_type
|
||||||
if section_type == "table":
|
if section_type == "table":
|
||||||
startRow = self._addTableToExcel(sheet, element, styles, startRow)
|
startRow = self._addTableToExcel(sheet, element, styles, startRow)
|
||||||
elif section_type == "bullet_list" or section_type == "list":
|
elif section_type == "bullet_list" or section_type == "list":
|
||||||
|
|
@ -733,36 +835,114 @@ class RendererXlsx(BaseRenderer):
|
||||||
self.logger.warning(f"Could not add section to sheet: {str(e)}")
|
self.logger.warning(f"Could not add section to sheet: {str(e)}")
|
||||||
return startRow + 1
|
return startRow + 1
|
||||||
|
|
||||||
|
def _sanitizeCellValue(self, value: Any) -> str:
|
||||||
|
"""Sanitize cell value: remove markdown, convert to string, handle None."""
|
||||||
|
if value is None:
|
||||||
|
return ""
|
||||||
|
if isinstance(value, dict):
|
||||||
|
# Extract value from dict if present
|
||||||
|
return str(value.get("value", ""))
|
||||||
|
if isinstance(value, (int, float)):
|
||||||
|
return value # Keep numbers as-is
|
||||||
|
# Convert to string and remove markdown formatting
|
||||||
|
text = str(value)
|
||||||
|
# Remove markdown bold (**text**)
|
||||||
|
text = text.replace("**", "")
|
||||||
|
# Remove markdown italic (*text*)
|
||||||
|
text = text.replace("*", "")
|
||||||
|
# Remove other markdown
|
||||||
|
text = text.replace("__", "").replace("_", "")
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
def _addTableToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
def _addTableToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
||||||
"""Add a table element to Excel sheet."""
|
"""Add a table element to Excel sheet with proper formatting and borders."""
|
||||||
try:
|
try:
|
||||||
# In canonical JSON format, table elements have headers and rows directly
|
# Extract from nested content structure
|
||||||
headers = element.get("headers", [])
|
content = element.get("content", {})
|
||||||
rows = element.get("rows", [])
|
if not isinstance(content, dict):
|
||||||
|
return startRow
|
||||||
|
headers = content.get("headers", [])
|
||||||
|
rows = content.get("rows", [])
|
||||||
|
|
||||||
if not headers and not rows:
|
if not headers and not rows:
|
||||||
return startRow
|
return startRow
|
||||||
|
|
||||||
# Add headers
|
# Define border style
|
||||||
|
thin_border = Border(
|
||||||
|
left=Side(style='thin'),
|
||||||
|
right=Side(style='thin'),
|
||||||
|
top=Side(style='thin'),
|
||||||
|
bottom=Side(style='thin')
|
||||||
|
)
|
||||||
|
|
||||||
|
headerRow = startRow
|
||||||
header_style = styles.get("table_header", {})
|
header_style = styles.get("table_header", {})
|
||||||
|
|
||||||
|
# Add headers with formatting
|
||||||
for col, header in enumerate(headers, 1):
|
for col, header in enumerate(headers, 1):
|
||||||
cell = sheet.cell(row=startRow, column=col, value=header)
|
sanitized_header = self._sanitizeCellValue(header)
|
||||||
if header_style.get("bold"):
|
cell = sheet.cell(row=headerRow, column=col, value=sanitized_header)
|
||||||
cell.font = Font(bold=True, color=self._getSafeColor(header_style.get("text_color", "FF000000")))
|
|
||||||
|
# Font styling
|
||||||
|
cell.font = Font(
|
||||||
|
bold=header_style.get("bold", True),
|
||||||
|
color=self._getSafeColor(header_style.get("text_color", "FF000000"))
|
||||||
|
)
|
||||||
|
|
||||||
|
# Background color
|
||||||
if header_style.get("background"):
|
if header_style.get("background"):
|
||||||
cell.fill = PatternFill(start_color=self._getSafeColor(header_style["background"]), end_color=self._getSafeColor(header_style["background"]), fill_type="solid")
|
cell.fill = PatternFill(
|
||||||
|
start_color=self._getSafeColor(header_style["background"]),
|
||||||
|
end_color=self._getSafeColor(header_style["background"]),
|
||||||
|
fill_type="solid"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Alignment
|
||||||
|
cell.alignment = Alignment(
|
||||||
|
horizontal=header_style.get("align", "left"),
|
||||||
|
vertical="center"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Border
|
||||||
|
cell.border = thin_border
|
||||||
|
|
||||||
startRow += 1
|
startRow += 1
|
||||||
|
|
||||||
# Add rows
|
# Add rows with formatting
|
||||||
cell_style = styles.get("table_cell", {})
|
cell_style = styles.get("table_cell", {})
|
||||||
for row_data in rows:
|
for row_data in rows:
|
||||||
for col, cell_value in enumerate(row_data, 1):
|
# Handle different row formats
|
||||||
cell = sheet.cell(row=startRow, column=col, value=cell_value)
|
if isinstance(row_data, list):
|
||||||
|
cell_values = row_data
|
||||||
|
elif isinstance(row_data, dict) and "cells" in row_data:
|
||||||
|
cell_values = [cell_obj.get("value", "") for cell_obj in row_data.get("cells", [])]
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for col, cell_value in enumerate(cell_values, 1):
|
||||||
|
sanitized_value = self._sanitizeCellValue(cell_value)
|
||||||
|
cell = sheet.cell(row=startRow, column=col, value=sanitized_value)
|
||||||
|
|
||||||
|
# Font styling
|
||||||
if cell_style.get("text_color"):
|
if cell_style.get("text_color"):
|
||||||
cell.font = Font(color=self._getSafeColor(cell_style["text_color"]))
|
cell.font = Font(color=self._getSafeColor(cell_style["text_color"]))
|
||||||
|
|
||||||
|
# Alignment
|
||||||
|
cell.alignment = Alignment(
|
||||||
|
horizontal=cell_style.get("align", "left"),
|
||||||
|
vertical="center"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Border
|
||||||
|
cell.border = thin_border
|
||||||
|
|
||||||
startRow += 1
|
startRow += 1
|
||||||
|
|
||||||
|
# Auto-adjust column widths
|
||||||
|
for col in range(1, len(headers) + 1):
|
||||||
|
column_letter = get_column_letter(col)
|
||||||
|
sheet.column_dimensions[column_letter].width = 20
|
||||||
|
|
||||||
return startRow
|
return startRow
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -770,9 +950,13 @@ class RendererXlsx(BaseRenderer):
|
||||||
return startRow + 1
|
return startRow + 1
|
||||||
|
|
||||||
def _addListToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
def _addListToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
||||||
"""Add a list element to Excel sheet."""
|
"""Add a list element to Excel sheet. Expects nested content structure."""
|
||||||
try:
|
try:
|
||||||
list_items = element.get("items", [])
|
# Extract from nested content structure
|
||||||
|
content = element.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return startRow
|
||||||
|
list_items = content.get("items", [])
|
||||||
|
|
||||||
list_style = styles.get("bullet_list", {})
|
list_style = styles.get("bullet_list", {})
|
||||||
for item in list_items:
|
for item in list_items:
|
||||||
|
|
@ -788,9 +972,16 @@ class RendererXlsx(BaseRenderer):
|
||||||
return startRow + 1
|
return startRow + 1
|
||||||
|
|
||||||
def _addParagraphToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
def _addParagraphToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
||||||
"""Add a paragraph element to Excel sheet."""
|
"""Add a paragraph element to Excel sheet. Expects nested content structure."""
|
||||||
try:
|
try:
|
||||||
text = element.get("text", "")
|
# Extract from nested content structure
|
||||||
|
content = element.get("content", {})
|
||||||
|
if isinstance(content, dict):
|
||||||
|
text = content.get("text", "")
|
||||||
|
elif isinstance(content, str):
|
||||||
|
text = content
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
if text:
|
if text:
|
||||||
sheet.cell(row=startRow, column=1, value=text)
|
sheet.cell(row=startRow, column=1, value=text)
|
||||||
|
|
||||||
|
|
@ -807,10 +998,14 @@ class RendererXlsx(BaseRenderer):
|
||||||
return startRow + 1
|
return startRow + 1
|
||||||
|
|
||||||
def _addHeadingToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
def _addHeadingToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
||||||
"""Add a heading element to Excel sheet."""
|
"""Add a heading element to Excel sheet. Expects nested content structure."""
|
||||||
try:
|
try:
|
||||||
text = element.get("text", "")
|
# Extract from nested content structure
|
||||||
level = element.get("level", 1)
|
content = element.get("content", {})
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
return startRow
|
||||||
|
text = content.get("text", "")
|
||||||
|
level = content.get("level", 1)
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
sheet.cell(row=startRow, column=1, value=text)
|
sheet.cell(row=startRow, column=1, value=text)
|
||||||
|
|
@ -835,11 +1030,15 @@ class RendererXlsx(BaseRenderer):
|
||||||
return startRow + 1
|
return startRow + 1
|
||||||
|
|
||||||
def _addImageToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
def _addImageToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
||||||
"""Add an image element to Excel sheet using openpyxl."""
|
"""Add an image element to Excel sheet using openpyxl. Expects nested content structure."""
|
||||||
try:
|
try:
|
||||||
base64Data = element.get("base64Data", "")
|
# Extract from nested content structure
|
||||||
altText = element.get("altText", "Image")
|
content = element.get("content", {})
|
||||||
caption = element.get("caption", "")
|
if not isinstance(content, dict):
|
||||||
|
return startRow
|
||||||
|
base64Data = content.get("base64Data", "")
|
||||||
|
altText = content.get("altText", "Image")
|
||||||
|
caption = content.get("caption", "")
|
||||||
|
|
||||||
if not base64Data:
|
if not base64Data:
|
||||||
# No image data - add placeholder text
|
# No image data - add placeholder text
|
||||||
|
|
@ -891,16 +1090,23 @@ class RendererXlsx(BaseRenderer):
|
||||||
return startRow + 1
|
return startRow + 1
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
self.logger.warning("openpyxl.drawing.image not available, using placeholder")
|
self.logger.error("openpyxl.drawing.image not available, cannot embed image")
|
||||||
sheet.cell(row=startRow, column=1, value=f"[Image: {altText}]")
|
errorMsg = f"[Error: Image embedding not available. Image: {altText}]"
|
||||||
|
errorCell = sheet.cell(row=startRow, column=1, value=errorMsg)
|
||||||
|
errorCell.font = Font(color="FFFF0000", italic=True) # Red color
|
||||||
return startRow + 1
|
return startRow + 1
|
||||||
except Exception as imgError:
|
except Exception as imgError:
|
||||||
self.logger.warning(f"Error embedding image in Excel: {str(imgError)}")
|
self.logger.error(f"Error embedding image in Excel: {str(imgError)}")
|
||||||
sheet.cell(row=startRow, column=1, value=f"[Image: {altText}]")
|
errorMsg = f"[Error: Could not embed image '{altText}'. {str(imgError)}]"
|
||||||
|
errorCell = sheet.cell(row=startRow, column=1, value=errorMsg)
|
||||||
|
errorCell.font = Font(color="FFFF0000", italic=True) # Red color
|
||||||
return startRow + 1
|
return startRow + 1
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f"Could not add image to Excel: {str(e)}")
|
self.logger.error(f"Error adding image to Excel: {str(e)}")
|
||||||
|
errorMsg = f"[Error: Could not process image. {str(e)}]"
|
||||||
|
errorCell = sheet.cell(row=startRow, column=1, value=errorMsg)
|
||||||
|
errorCell.font = Font(color="FFFF0000", italic=True) # Red color
|
||||||
return startRow + 1
|
return startRow + 1
|
||||||
|
|
||||||
def _formatTimestamp(self) -> str:
|
def _formatTimestamp(self) -> str:
|
||||||
|
|
|
||||||
|
|
@ -213,10 +213,21 @@ class ContentValidator:
|
||||||
sourceJson = getattr(doc, 'sourceJson', None)
|
sourceJson = getattr(doc, 'sourceJson', None)
|
||||||
data = getattr(doc, 'documentData', None)
|
data = getattr(doc, 'documentData', None)
|
||||||
|
|
||||||
|
# WICHTIG: For rendered documents (HTML, PDF, DOCX, etc.), jsonStructure is METADATA about the structure,
|
||||||
|
# NOT the actual rendered content. The actual content is in documentData.
|
||||||
|
# Include both: jsonStructure for structure metadata, and contentPreview for actual content check
|
||||||
if sourceJson and isinstance(sourceJson, dict):
|
if sourceJson and isinstance(sourceJson, dict):
|
||||||
# Use source JSON for structure analysis (for rendered documents like xlsx/docx/pdf)
|
# Use source JSON for structure analysis (for rendered documents like xlsx/docx/pdf)
|
||||||
jsonSummary = self._summarizeJsonStructure(sourceJson)
|
jsonSummary = self._summarizeJsonStructure(sourceJson)
|
||||||
summary["jsonStructure"] = jsonSummary
|
summary["jsonStructure"] = jsonSummary
|
||||||
|
# Add note that this is metadata, not actual content
|
||||||
|
summary["note"] = "jsonStructure contains metadata about document structure. Actual rendered content is in documentData."
|
||||||
|
|
||||||
|
# For rendered documents, also check actual content
|
||||||
|
if data is not None:
|
||||||
|
contentPreview = self._getContentPreview(data, formatExt, mimeType)
|
||||||
|
if contentPreview:
|
||||||
|
summary["contentPreview"] = contentPreview
|
||||||
elif data is not None:
|
elif data is not None:
|
||||||
# Fallback: try to parse documentData as JSON (for non-rendered documents)
|
# Fallback: try to parse documentData as JSON (for non-rendered documents)
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
|
|
@ -227,6 +238,11 @@ class ContentValidator:
|
||||||
# Handle list of documents
|
# Handle list of documents
|
||||||
jsonSummary = self._summarizeJsonStructure(data[0])
|
jsonSummary = self._summarizeJsonStructure(data[0])
|
||||||
summary["jsonStructure"] = jsonSummary
|
summary["jsonStructure"] = jsonSummary
|
||||||
|
else:
|
||||||
|
# For non-JSON data (e.g., rendered HTML), get content preview
|
||||||
|
contentPreview = self._getContentPreview(data, formatExt, mimeType)
|
||||||
|
if contentPreview:
|
||||||
|
summary["contentPreview"] = contentPreview
|
||||||
|
|
||||||
summaries.append(summary)
|
summaries.append(summary)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -295,6 +311,73 @@ class ContentValidator:
|
||||||
bytes /= 1024.0
|
bytes /= 1024.0
|
||||||
return f"{bytes:.1f} TB"
|
return f"{bytes:.1f} TB"
|
||||||
|
|
||||||
|
def _getContentPreview(self, data: Any, formatExt: str, mimeType: str) -> Optional[Dict[str, Any]]:
|
||||||
|
"""Get structural validation info for rendered documents (generic, NO content preview for security/privacy)
|
||||||
|
|
||||||
|
Returns metadata about document structure to help validation distinguish between:
|
||||||
|
- Structure metadata (jsonStructure) - describes what should be rendered
|
||||||
|
- Actual rendered content (documentData) - the actual document file
|
||||||
|
|
||||||
|
Does NOT expose actual content, only structural indicators.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
if data is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
preview = {}
|
||||||
|
|
||||||
|
# Generic content type detection
|
||||||
|
if isinstance(data, bytes):
|
||||||
|
preview["dataType"] = "bytes"
|
||||||
|
preview["contentLength"] = len(data)
|
||||||
|
# Check if it's likely text-based (for text formats like HTML, TXT, etc.)
|
||||||
|
try:
|
||||||
|
# Try to decode as UTF-8 to check if it's text-based
|
||||||
|
decoded = data.decode('utf-8', errors='strict')
|
||||||
|
preview["isTextBased"] = True
|
||||||
|
preview["contentLength"] = len(decoded)
|
||||||
|
|
||||||
|
# For text-based formats, check if it looks like rendered content vs JSON metadata
|
||||||
|
# JSON metadata typically starts with { or [ and contains structure keywords
|
||||||
|
trimmed = decoded.strip()
|
||||||
|
looksLikeJson = (trimmed.startswith('{') or trimmed.startswith('[')) and \
|
||||||
|
('"sections"' in trimmed or '"contentPartIds"' in trimmed or '"generationHint"' in trimmed)
|
||||||
|
preview["looksLikeRenderedContent"] = not looksLikeJson
|
||||||
|
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# Not valid UTF-8, likely binary (PDF, DOCX, images, etc.)
|
||||||
|
preview["isTextBased"] = False
|
||||||
|
preview["isBinary"] = True
|
||||||
|
# Binary files with content are rendered (not metadata)
|
||||||
|
preview["looksLikeRenderedContent"] = True
|
||||||
|
|
||||||
|
elif isinstance(data, str):
|
||||||
|
preview["dataType"] = "string"
|
||||||
|
preview["isTextBased"] = True
|
||||||
|
preview["contentLength"] = len(data)
|
||||||
|
|
||||||
|
# Check if it looks like rendered content vs JSON metadata
|
||||||
|
trimmed = data.strip()
|
||||||
|
looksLikeJson = (trimmed.startswith('{') or trimmed.startswith('[')) and \
|
||||||
|
('"sections"' in trimmed or '"contentPartIds"' in trimmed or '"generationHint"' in trimmed)
|
||||||
|
preview["looksLikeRenderedContent"] = not looksLikeJson
|
||||||
|
|
||||||
|
elif isinstance(data, (dict, list)):
|
||||||
|
# If documentData is still a dict/list, it's likely structure metadata, not rendered content
|
||||||
|
preview["dataType"] = "json"
|
||||||
|
preview["isTextBased"] = True
|
||||||
|
preview["looksLikeRenderedContent"] = False
|
||||||
|
preview["note"] = "documentData is JSON structure, not rendered document file"
|
||||||
|
else:
|
||||||
|
preview["dataType"] = type(data).__name__
|
||||||
|
preview["contentLength"] = len(str(data)) if hasattr(data, '__len__') else 0
|
||||||
|
|
||||||
|
return preview if preview else None
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error getting content structure info: {str(e)}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool:
|
def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool:
|
||||||
"""
|
"""
|
||||||
|
|
@ -445,31 +528,23 @@ EXPECTED FORMATS: {expectedFormats if expectedFormats else ['any']}{actionContex
|
||||||
|
|
||||||
=== VALIDATION INSTRUCTIONS ===
|
=== VALIDATION INSTRUCTIONS ===
|
||||||
|
|
||||||
IMPORTANT: Different formats can represent the same data structure. Do not reject a format just because it differs from expected - check the structure summary for actual content.
|
CRITICAL: Validate ONLY metadata/structure. Documents may be binary (PDF, DOCX, images) or very large (200MB+). NEVER try to read or validate actual content values.
|
||||||
|
|
||||||
VALIDATION RULES:
|
VALIDATION RULES:
|
||||||
1. Use structure summary (sections, statistics, counts) as PRIMARY evidence for DATA-ORIENTED criteria. Trust structure over format claims.
|
1. METADATA ONLY: Use jsonStructure (sections, contentPartIds, content_type, statistics) and contentPreview (dataType, contentLength, looksLikeRenderedContent) for validation. These are METADATA indicators, NOT actual content.
|
||||||
2. Use ACTION HISTORY as PRIMARY evidence for PROCESS-ORIENTED criteria (e.g., "internet search performed", "sources cited"). Document metadata may only reflect the last action, not the entire workflow.
|
2. FORMAT VALIDATION: Check mimeType/format metadata only. Do NOT inspect content to determine format. Format mismatch = wrong_format gap.
|
||||||
3. For each criterion in criteriaMapping: evaluate ONLY that criterion. Do not mention other criteria.
|
3. CONTENT EXISTENCE: Use contentPreview.looksLikeRenderedContent=true to confirm content exists. Use jsonStructure.content_type to confirm data types exist (e.g., "image" section = image exists). Do NOT validate content quality, accuracy, or completeness of actual data values.
|
||||||
4. Priority: Data completeness > Format compatibility. Missing data is more critical than format mismatch.
|
4. STRUCTURE VALIDATION: Use jsonStructure.sections, statistics (counts, rowCount, columnCount) as evidence. Trust structure metadata over format claims.
|
||||||
5. Format understanding: Different formats can represent equivalent data structures. Focus on content, not format name.
|
5. PROCESS VALIDATION: Use ACTION HISTORY for process-oriented criteria (e.g., "search performed", "extraction done").
|
||||||
6. Multi-step workflow awareness: If ACTION HISTORY is present, consider the workflow as a whole. Document metadata (e.g., extraction_method) describes how data was EXTRACTED in the last step, not necessarily how it was OBTAINED in the workflow.
|
6. ONE CRITERION PER EVALUATION: Evaluate each criterion independently. Do not mention other criteria.
|
||||||
7. Data availability assessment: If delivered documents do not contain required data, clearly indicate this in findings. Re-reading the same documents might not help.
|
|
||||||
8. CRITICAL - Data vs Data Description: When criteria require specific data types (e.g., images, tables, charts, files), distinguish between:
|
|
||||||
- ACTUAL DATA: The actual data itself (binary data, structured data, embedded content)
|
|
||||||
- DATA DESCRIPTIONS: Text fields that describe or specify what data should be created (e.g., "image_description", "table_description", "chart_specification") - these are TEXT METADATA, NOT the actual data
|
|
||||||
- If only descriptions/specifications exist but no actual data, the criterion is NOT met. Descriptions are instructions for creating data, not the data itself.
|
|
||||||
- Check content types in sections/elements: if content_type matches the required data type (e.g., "image" for images, "table" for tables), actual data exists. If only text fields describing the data exist, the data is missing.
|
|
||||||
- Check document statistics: if counts for the required data type are 0, the data is missing even if descriptions exist.
|
|
||||||
|
|
||||||
VALIDATION STEPS:
|
VALIDATION STEPS:
|
||||||
- Check ACTION HISTORY first (if present) for PROCESS-ORIENTED criteria (e.g., "search performed", "sources used", "verification done")
|
- Check ACTION HISTORY for process-oriented criteria
|
||||||
- Check ACTION VALIDATION METADATA (if present) - this contains action-specific context for the LAST action only
|
- Check jsonStructure metadata (sections, content_type, statistics) for structure validation
|
||||||
- Check structure summary for quantities, counts, statistics (for DATA-ORIENTED criteria)
|
- Check contentPreview.looksLikeRenderedContent for content existence (not quality)
|
||||||
- Compare found values with required values from criteria
|
- Check mimeType/format for format validation
|
||||||
- If structure unavailable, use metadata only (format, filename, size)
|
- NEVER try to read actual content values (binary files, large files, data accuracy)
|
||||||
- Classify gaps: missing_data (less than required), incomplete_data (partial), wrong_structure (wrong organization), wrong_format (format mismatch but data present)
|
- Classify gaps: missing_data, incomplete_data, wrong_structure, wrong_format
|
||||||
- Assess if documents contain the required data: If structure shows documents lack the data, note this in findings - data must be generated or obtained elsewhere, not re-extracted from same documents
|
|
||||||
|
|
||||||
SCORING:
|
SCORING:
|
||||||
- Data complete + structure matches → qualityScore: 0.9-1.0
|
- Data complete + structure matches → qualityScore: 0.9-1.0
|
||||||
|
|
|
||||||
|
|
@ -379,8 +379,34 @@ def extractLearningsAndImprovements(context: Any) -> str:
|
||||||
return "No learnings available yet"
|
return "No learnings available yet"
|
||||||
|
|
||||||
def extractLatestRefinementFeedback(context: Any) -> str:
|
def extractLatestRefinementFeedback(context: Any) -> str:
|
||||||
"""Extract the latest refinement feedback. Maps to {{KEY:LATEST_REFINEMENT_FEEDBACK}}"""
|
"""Extract the latest refinement feedback. Maps to {{KEY:LATEST_REFINEMENT_FEEDBACK}}
|
||||||
|
|
||||||
|
CRITICAL: If ERROR level logs are found, refinement should stop processing.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
|
# First check for ERROR level logs in workflow
|
||||||
|
if hasattr(context, 'workflow') and context.workflow:
|
||||||
|
try:
|
||||||
|
import modules.interfaces.interfaceDbChatObjects as interfaceDbChatObjects
|
||||||
|
from modules.interfaces.interfaceDbAppObjects import getRootInterface
|
||||||
|
rootInterface = getRootInterface()
|
||||||
|
interfaceDbChat = interfaceDbChatObjects.getInterface(rootInterface.currentUser)
|
||||||
|
|
||||||
|
# Get workflow logs
|
||||||
|
chatData = interfaceDbChat.getUnifiedChatData(context.workflow.id, None)
|
||||||
|
logs = chatData.get("logs", [])
|
||||||
|
|
||||||
|
# Check for ERROR level logs
|
||||||
|
for log in logs:
|
||||||
|
if isinstance(log, dict):
|
||||||
|
log_level = log.get("level", "").upper()
|
||||||
|
log_message = str(log.get("message", ""))
|
||||||
|
if log_level == "ERROR" or "ERROR" in log_message.upper():
|
||||||
|
return f"CRITICAL: Processing stopped due to ERROR in logs: {log_message[:200]}"
|
||||||
|
except Exception as log_check_error:
|
||||||
|
# If we can't check logs, continue with normal feedback extraction
|
||||||
|
logger.warning(f"Could not check for ERROR logs: {str(log_check_error)}")
|
||||||
|
|
||||||
if not hasattr(context, 'previousReviewResult') or not context.previousReviewResult or not isinstance(context.previousReviewResult, list):
|
if not hasattr(context, 'previousReviewResult') or not context.previousReviewResult or not isinstance(context.previousReviewResult, list):
|
||||||
return "No previous refinement feedback available"
|
return "No previous refinement feedback available"
|
||||||
|
|
||||||
|
|
|
||||||
541
tests/functional/test10_document_generation_formats.py
Normal file
541
tests/functional/test10_document_generation_formats.py
Normal file
|
|
@ -0,0 +1,541 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright (c) 2025 Patrick Motsch
|
||||||
|
# All rights reserved.
|
||||||
|
"""
|
||||||
|
Document Generation Formats Test 10 - Tests document generation in DOCX, XLSX, PPTX, and PDF formats
|
||||||
|
Tests professional document formats with various content types including tables, images, and structured data.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import base64
|
||||||
|
from typing import Dict, Any, List, Optional
|
||||||
|
|
||||||
|
# Add the gateway to path (go up 2 levels from tests/functional/)
|
||||||
|
_gateway_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||||
|
if _gateway_path not in sys.path:
|
||||||
|
sys.path.insert(0, _gateway_path)
|
||||||
|
|
||||||
|
# Import the service initialization
|
||||||
|
from modules.services import getInterface as getServices
|
||||||
|
from modules.datamodels.datamodelChat import UserInputRequest, WorkflowModeEnum
|
||||||
|
from modules.datamodels.datamodelUam import User
|
||||||
|
from modules.features.workflow import chatStart
|
||||||
|
import modules.interfaces.interfaceDbChatObjects as interfaceDbChatObjects
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentGenerationFormatsTester10:
    """Functional test driver for document generation in DOCX/XLSX/PPTX/PDF.

    Runs one dynamic workflow per target format, waits for completion, and
    verifies the produced documents (extension, MIME type, size, metadata).
    """

    def __init__(self):
        """Bind the tester to the root user and a fresh service context."""
        # Root user has unrestricted access, keeping the test independent of
        # any permission configuration.
        from modules.interfaces.interfaceDbAppObjects import getRootInterface

        self.testUser = getRootInterface().currentUser

        # Service layer for the test user; no workflow exists yet.
        self.services = getServices(self.testUser, None)
        self.workflow = None
        self.testResults = {}
        self.generatedDocuments = {}
        # File ID of the uploaded sample PDF, reused by every format run.
        self.pdfFileId = None

    async def initialize(self):
        """Initialize the test environment."""
        from modules.shared.configuration import APP_CONFIG
        import logging

        # Debug file logging captures workflow internals for inspection.
        APP_CONFIG.set("APP_DEBUG_CHAT_WORKFLOW_ENABLED", True)
        # INFO level surfaces workflow progress on the console.
        logging.getLogger().setLevel(logging.INFO)

        print(f"Initialized test with user: {self.testUser.id}")
        print(f"Mandate ID: {self.testUser.mandateId}")
        print(f"Debug logging enabled: {APP_CONFIG.get('APP_DEBUG_CHAT_WORKFLOW_ENABLED', False)}")

        # The sample PDF is attached to every generation request.
        await self.uploadPdfFile()

    async def uploadPdfFile(self):
        """Upload the PDF file and store its file ID.

        On success ``self.pdfFileId`` holds the new file record's ID; on any
        failure the test proceeds without an attachment.
        """
        samplePath = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "..", "..", "..", "local", "temp", "B2025-02c.pdf")
        )

        if not os.path.exists(samplePath):
            print(f"⚠️ Warning: PDF file not found at {samplePath}")
            print(" Test will continue without PDF attachment")
            return

        try:
            with open(samplePath, "rb") as handle:
                payload = handle.read()

            # The component interface may be absent in stripped-down setups.
            componentApi = getattr(self.services, 'interfaceDbComponent', None)
            if not componentApi:
                print("⚠️ Warning: interfaceDbComponent not available in services")
                print(" Test will continue without PDF attachment")
                return

            created = componentApi.createFile(
                name="B2025-02c.pdf",
                mimeType="application/pdf",
                content=payload
            )
            # Raw bytes are persisted separately from the file record.
            componentApi.createFileData(created.id, payload)

            self.pdfFileId = created.id
            print(f"✅ Uploaded PDF file: {created.fileName} (ID: {self.pdfFileId}, Size: {len(payload)} bytes)")

        except Exception as e:
            import traceback
            print(f"⚠️ Warning: Failed to upload PDF file: {str(e)}")
            print(f" Traceback: {traceback.format_exc()}")
            print(" Test will continue without PDF attachment")

    def createTestPrompt(self, format: str) -> str:
        """Create a test prompt for document generation in the specified format.

        Unknown formats fall back to the DOCX prompt. Every prompt asks for a
        professional structure (title, sections, tables, images) and for
        content extracted from the attached sample PDF.
        """
        promptByFormat = {
            "docx": (
                "Create a professional Word document about 'Fuel Station Receipt Analysis' with:\n"
                "1) A main title\n"
                "2) An executive summary paragraph\n"
                "3) Extract and include the image from the attached PDF document (B2025-02c.pdf)\n"
                "4) A detailed analysis section with:\n"
                " - Bullet points of key findings\n"
                " - A table summarizing transaction details\n"
                "5) A conclusion section with recommendations\n\n"
                "Format as a professional DOCX document with proper headings and structure."
            ),
            "xlsx": (
                "Create an Excel spreadsheet analyzing the fuel station receipt from the attached PDF (B2025-02c.pdf).\n"
                "Include:\n"
                "1) A summary sheet with key metrics\n"
                "2) A detailed data sheet with:\n"
                " - Transaction details in rows\n"
                " - Columns for: Date, Item, Quantity, Price, Total\n"
                " - Proper formatting and headers\n"
                "3) A calculations sheet with:\n"
                " - VAT calculations\n"
                " - Net and gross totals\n\n"
                "Format as a professional XLSX spreadsheet with formulas and formatting."
            ),
            "pptx": (
                "Create a PowerPoint presentation about 'Fuel Station Receipt Analysis' with:\n"
                "1) Title slide with main title\n"
                "2) Overview slide explaining the receipt analysis\n"
                "3) Extract and include the image from the attached PDF document (B2025-02c.pdf)\n"
                "4) Analysis slides with:\n"
                " - Bullet points of key findings\n"
                " - Visual representation of data\n"
                "5) Conclusion slide with recommendations\n\n"
                "Format as a professional PPTX presentation with consistent styling."
            ),
            "pdf": (
                "Create a professional PDF document about 'Fuel Station Receipt Analysis' with:\n"
                "1) A main title\n"
                "2) An introduction paragraph explaining the receipt analysis\n"
                "3) Extract and include the image from the attached PDF document (B2025-02c.pdf)\n"
                "4) A section analyzing the receipt data with:\n"
                " - Bullet points of key findings\n"
                " - A table summarizing transaction details\n"
                "5) A conclusion paragraph with recommendations\n\n"
                "Format as a professional PDF document suitable for printing."
            )
        }

        return promptByFormat.get(format.lower(), promptByFormat["docx"])

    async def generateDocumentInFormat(self, format: str) -> Dict[str, Any]:
        """Run one dynamic workflow that generates a document in *format*.

        Returns a result dict with ``success`` plus, on success, the subset of
        produced documents whose filename matches the requested extension.
        """
        print("\n" + "="*80)
        print(f"GENERATING DOCUMENT IN {format.upper()} FORMAT")
        print("="*80)

        requestPrompt = self.createTestPrompt(format)
        print(f"Prompt: {requestPrompt[:200]}...")

        # Attach the uploaded sample PDF when available.
        attachmentIds = [self.pdfFileId] if self.pdfFileId else []
        if attachmentIds:
            print(f"Attaching PDF file (ID: {self.pdfFileId})")
        else:
            print("⚠️ No PDF file attached (file upload may have failed)")

        request = UserInputRequest(
            prompt=requestPrompt,
            listFileId=attachmentIds,
            userLanguage="en"
        )

        print(f"\nStarting workflow for {format.upper()} generation...")
        startedWorkflow = await chatStart(
            currentUser=self.testUser,
            userInput=request,
            workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC,
            workflowId=None
        )

        if not startedWorkflow:
            return {
                "success": False,
                "error": "Failed to start workflow"
            }

        self.workflow = startedWorkflow
        print(f"Workflow started: {startedWorkflow.id}")

        # Block until the workflow reaches a terminal state (no timeout).
        print("Waiting for workflow completion...")
        finished = await self.waitForWorkflowCompletion(timeout=None)

        if not finished:
            return {
                "success": False,
                "error": "Workflow did not complete",
                "workflowId": startedWorkflow.id,
                "status": startedWorkflow.status if startedWorkflow else "unknown"
            }

        analysis = self.analyzeWorkflowResults()

        # Keep only documents whose filename carries the requested extension.
        suffix = f".{format.lower()}"
        matching = [d for d in analysis.get("documents", []) if d.get("fileName", "").endswith(suffix)]

        return {
            "success": True,
            "format": format,
            "workflowId": startedWorkflow.id,
            "status": analysis.get("status"),
            "documentCount": len(matching),
            "documents": matching,
            "results": analysis
        }

    async def waitForWorkflowCompletion(self, timeout: Optional[int] = None, checkInterval: int = 2) -> bool:
        """Poll the stored workflow until it finishes.

        Returns True only for status "completed"; False on timeout, a missing
        workflow, or a terminal "stopped"/"failed" status. ``timeout=None``
        waits indefinitely.
        """
        if not self.workflow:
            return False

        began = time.time()
        previousStatus = None

        chatApi = interfaceDbChatObjects.getInterface(self.testUser)

        if timeout is None:
            print("Waiting indefinitely (no timeout)")

        while True:
            # Deadline applies only when a timeout was requested.
            if timeout is not None and time.time() - began > timeout:
                print(f"\n⏱️ Timeout after {timeout} seconds")
                return False

            try:
                snapshot = chatApi.getWorkflow(self.workflow.id)
                if not snapshot:
                    print("\n❌ Workflow not found")
                    return False

                status = snapshot.status
                elapsed = int(time.time() - began)

                # Report only status transitions to keep the output compact.
                if status != previousStatus:
                    print(f"Workflow status: {status} (elapsed: {elapsed}s)")
                    previousStatus = status

                if status in ["completed", "stopped", "failed"]:
                    self.workflow = snapshot
                    statusIcon = "✅" if status == "completed" else "❌"
                    print(f"\n{statusIcon} Workflow finished with status: {status} (elapsed: {elapsed}s)")
                    return status == "completed"

                await asyncio.sleep(checkInterval)

            except Exception as e:
                # Transient lookup errors: log and keep polling.
                print(f"\n⚠️ Error checking workflow status: {str(e)}")
                await asyncio.sleep(checkInterval)

    def analyzeWorkflowResults(self) -> Dict[str, Any]:
        """Collect counts, documents and logs for the finished workflow."""
        if not self.workflow:
            return {"error": "No workflow to analyze"}

        chatApi = interfaceDbChatObjects.getInterface(self.testUser)
        record = chatApi.getWorkflow(self.workflow.id)

        if not record:
            return {"error": "Workflow not found"}

        chatData = chatApi.getUnifiedChatData(record.id, None)

        messages = chatData.get("messages", [])
        userCount = sum(1 for m in messages if m.get("role") == "user")
        assistantCount = sum(1 for m in messages if m.get("role") == "assistant")

        documents = chatData.get("documents", [])
        logs = chatData.get("logs", [])

        summary = {
            "workflowId": record.id,
            "status": record.status,
            # workflowMode may be absent on older workflow records.
            "workflowMode": str(record.workflowMode) if hasattr(record, 'workflowMode') else None,
            "currentRound": record.currentRound,
            "totalTasks": record.totalTasks,
            "totalActions": record.totalActions,
            "messageCount": len(messages),
            "userMessageCount": userCount,
            "assistantMessageCount": assistantCount,
            "documentCount": len(documents),
            "logCount": len(logs),
            "documents": documents,
            "logs": logs
        }

        print(f"\nWorkflow Results:")
        print(f" Status: {summary['status']}")
        print(f" Tasks: {summary['totalTasks']}")
        print(f" Actions: {summary['totalActions']}")
        print(f" Messages: {summary['messageCount']}")
        print(f" Documents: {summary['documentCount']}")

        if documents:
            print(f"\nGenerated Documents:")
            for doc in documents:
                fileName = doc.get("fileName", "unknown")
                fileSize = doc.get("fileSize", 0)
                mimeType = doc.get("mimeType", "unknown")
                documentType = doc.get("documentType", "N/A")
                print(f" - {fileName} ({fileSize} bytes, {mimeType}, type: {documentType})")

        return summary

    def verifyDocumentFormat(self, document: Dict[str, Any], expectedFormat: str) -> Dict[str, Any]:
        """Verify that a document matches the expected format and contains expected metadata.

        ``isValid`` checks extension, MIME type and non-zero size;
        ``isComplete`` additionally requires a documentType and a metadata dict.
        """
        fileName = document.get("fileName", "")
        mimeType = document.get("mimeType", "")
        fileSize = document.get("fileSize", 0)
        documentType = document.get("documentType")
        metadata = document.get("metadata")

        # Canonical (MIME type, extension) per supported format.
        expectations = {
            "pdf": ("application/pdf", ".pdf"),
            "docx": ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", ".docx"),
            "xlsx": ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ".xlsx"),
            "pptx": ("application/vnd.openxmlformats-officedocument.presentationml.presentation", ".pptx")
        }

        wantedMime, wantedExt = expectations.get(expectedFormat.lower(), (None, None))

        hasCorrectExtension = wantedExt is not None and fileName.lower().endswith(wantedExt)
        hasCorrectMimeType = wantedMime is not None and mimeType.lower() == wantedMime.lower()
        hasValidSize = fileSize > 0
        hasDocumentType = documentType is not None
        hasMetadata = metadata is not None and isinstance(metadata, dict)

        return {
            "format": expectedFormat,
            "fileName": fileName,
            "mimeType": mimeType,
            "fileSize": fileSize,
            "documentType": documentType,
            "hasMetadata": hasMetadata,
            "hasCorrectExtension": hasCorrectExtension,
            "hasCorrectMimeType": hasCorrectMimeType,
            "hasValidSize": hasValidSize,
            "hasDocumentType": hasDocumentType,
            "isValid": hasCorrectExtension and hasValidSize and hasCorrectMimeType,
            "isComplete": hasCorrectExtension and hasValidSize and hasCorrectMimeType and hasDocumentType and hasMetadata
        }

    async def testAllFormats(self) -> Dict[str, Any]:
        """Test document generation in DOCX, XLSX, PPTX, and PDF formats."""
        print("\n" + "="*80)
        print("TESTING DOCUMENT GENERATION IN DOCX, XLSX, PPTX, AND PDF FORMATS")
        print("="*80)

        outcomes = {}

        for fmt in ["docx", "xlsx", "pptx", "pdf"]:
            try:
                print(f"\n{'='*80}")
                print(f"Testing {fmt.upper()} format...")
                print(f"{'='*80}")

                outcome = await self.generateDocumentInFormat(fmt)
                outcomes[fmt] = outcome

                if outcome.get("success"):
                    produced = outcome.get("documents", [])
                    if produced:
                        # Verify only the first matching document.
                        verification = self.verifyDocumentFormat(produced[0], fmt)
                        outcome["verification"] = verification

                        print(f"\n✅ {fmt.upper()} generation successful!")
                        print(f" Documents: {len(produced)}")
                        print(f" Verification: {'✅ PASS' if verification['isValid'] else '❌ FAIL'}")
                        print(f" Complete (with metadata): {'✅ YES' if verification['isComplete'] else '❌ NO'}")
                        if verification.get("fileName"):
                            print(f" File: {verification['fileName']}")
                            print(f" Size: {verification['fileSize']} bytes")
                            print(f" MIME: {verification['mimeType']}")
                            print(f" Document Type: {verification.get('documentType', 'N/A')}")
                            print(f" Has Metadata: {'✅' if verification.get('hasMetadata') else '❌'}")
                    else:
                        print(f"\n⚠️ {fmt.upper()} generation completed but no documents found")
                else:
                    print(f"\n❌ {fmt.upper()} generation failed: {outcome.get('error', 'Unknown error')}")

                # Brief pause so consecutive workflows do not overlap.
                await asyncio.sleep(2)

            except Exception as e:
                import traceback
                print(f"\n❌ Error testing {fmt.upper()}: {str(e)}")
                print(traceback.format_exc())
                outcomes[fmt] = {
                    "success": False,
                    "error": str(e),
                    "traceback": traceback.format_exc()
                }

        return outcomes

    async def runTest(self):
        """Run the complete test and return a summary result dict."""
        print("\n" + "="*80)
        print("DOCUMENT GENERATION FORMATS TEST 10 - DOCX, XLSX, PPTX, PDF")
        print("="*80)

        try:
            await self.initialize()

            formatResults = await self.testAllFormats()

            print("\n" + "="*80)
            print("TEST SUMMARY")
            print("="*80)

            print("\nFormat Tests:")
            passed = 0
            failed = 0
            # Documents that also carried documentType + metadata.
            withMetadata = 0

            for fmt, outcome in formatResults.items():
                if not outcome.get("success"):
                    failed += 1
                    print(f"❌ {fmt.upper():6s}: FAIL - {outcome.get('error', 'Unknown error')}")
                    continue

                passed += 1
                verification = outcome.get("verification", {})
                isValid = verification.get("isValid", False)
                isComplete = verification.get("isComplete", False)
                if isComplete:
                    withMetadata += 1
                statusIcon = "✅" if isValid else "⚠️"
                completeIcon = "✅" if isComplete else "❌"
                docCount = outcome.get("documentCount", 0)
                print(f"{statusIcon} {fmt.upper():6s}: {'PASS' if isValid else 'FAIL'} - {docCount} document(s) - Metadata: {completeIcon}")

            print(f"\nFormat Tests: {passed} passed, {failed} failed out of {len(formatResults)} formats")
            print(f"Complete Documents (with metadata): {withMetadata} out of {passed} successful generations")

            self.testResults = {
                "success": failed == 0,
                "formatTests": {
                    "successCount": passed,
                    "failCount": failed,
                    "completeCount": withMetadata,
                    "totalFormats": len(formatResults),
                    "results": formatResults
                },
                "totalSuccess": passed,
                "totalFail": failed
            }

            return self.testResults

        except Exception as e:
            import traceback
            print(f"\n❌ Test failed with error: {type(e).__name__}: {str(e)}")
            print(f"Traceback:\n{traceback.format_exc()}")
            self.testResults = {
                "success": False,
                "error": str(e),
                "traceback": traceback.format_exc()
            }
            return self.testResults
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Entry point: run document generation formats test 10 and dump results."""
    runner = DocumentGenerationFormatsTester10()
    summary = await runner.runTest()

    # Machine-readable summary for downstream log parsing.
    print("\n" + "=" * 80)
    print("FINAL RESULTS (JSON)")
    print("=" * 80)
    print(json.dumps(summary, indent=2, default=str))
|
||||||
|
|
||||||
|
|
||||||
|
# Allow direct execution of this test module as a script.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
|
|
||||||
Loading…
Reference in a new issue