fixed handovers from generator to renderers
This commit is contained in:
parent
0280879968
commit
3e7c75335a
18 changed files with 2067 additions and 709 deletions
|
|
@ -354,10 +354,11 @@ class AiOpenai(BaseConnectorAi):
|
|||
|
||||
if response.status_code != 200:
|
||||
logger.error(f"DALL-E API error: {response.status_code} - {response.text}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"DALL-E API error: {response.status_code} - {response.text}"
|
||||
}
|
||||
return AiModelResponse(
|
||||
content="",
|
||||
success=False,
|
||||
error=f"DALL-E API error: {response.status_code} - {response.text}"
|
||||
)
|
||||
|
||||
responseJson = response.json()
|
||||
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ class DocumentMetadata(BaseModel):
|
|||
sourceDocuments: List[str] = Field(default_factory=list, description="Source document IDs")
|
||||
extractionMethod: str = Field(default="ai_extraction", description="Method used for extraction")
|
||||
version: str = Field(default="1.0", description="Document version")
|
||||
documentType: Optional[str] = Field(default=None, description="Type of document (e.g., 'report', 'invoice', 'analysis')")
|
||||
styles: Optional[Dict[str, Any]] = Field(default=None, description="Document styling configuration")
|
||||
|
||||
|
||||
class TableData(BaseModel):
|
||||
|
|
@ -112,6 +114,8 @@ class RenderedDocument(BaseModel):
|
|||
documentData: bytes = Field(description="Document content as bytes")
|
||||
mimeType: str = Field(description="MIME type of the document (e.g., 'text/html', 'application/pdf')")
|
||||
filename: str = Field(description="Filename for the document (e.g., 'report.html', 'image.png')")
|
||||
documentType: Optional[str] = Field(default=None, description="Type of document (e.g., 'report', 'invoice', 'analysis')")
|
||||
metadata: Optional[Dict[str, Any]] = Field(default=None, description="Document metadata (title, author, etc.)")
|
||||
|
||||
class Config:
|
||||
json_encoders = {
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ class StructureFiller:
|
|||
# Erstelle Operation-ID für Struktur-Abfüllen
|
||||
fillOperationId = f"{parentOperationId}_structure_filling"
|
||||
|
||||
# Prüfe ob Struktur Chapters oder Sections hat
|
||||
# Validate structure has chapters
|
||||
hasChapters = False
|
||||
for doc in structure.get("documents", []):
|
||||
if "chapters" in doc:
|
||||
|
|
@ -60,9 +60,9 @@ class StructureFiller:
|
|||
break
|
||||
|
||||
if not hasChapters:
|
||||
# Fallback: Alte Struktur mit Sections direkt - verwende alte Logik
|
||||
logger.warning("Structure has no chapters, using legacy section-based filling")
|
||||
return await self._fillStructureLegacy(structure, contentParts, userPrompt, fillOperationId)
|
||||
error_msg = "Structure must have chapters. Legacy section-based structure is not supported."
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
# Starte ChatLog mit Parent-Referenz
|
||||
chapterCount = sum(len(doc.get("chapters", [])) for doc in structure.get("documents", []))
|
||||
|
|
@ -214,10 +214,11 @@ class StructureFiller:
|
|||
contentType = section.get("content_type", "paragraph")
|
||||
useAiCall = section.get("useAiCall", False)
|
||||
|
||||
# WICHTIG: Wenn keine ContentParts vorhanden sind, kann kein AI-Call gemacht werden
|
||||
if len(contentPartIds) == 0:
|
||||
# WICHTIG: Wenn keine ContentParts vorhanden sind UND kein generationHint, kann kein AI-Call gemacht werden
|
||||
# Aber: Wenn generationHint vorhanden ist, kann AI auch ohne ContentParts generieren (z.B. Executive Summary)
|
||||
if len(contentPartIds) == 0 and not generationHint:
|
||||
useAiCall = False
|
||||
logger.debug(f"Section {sectionId}: No content parts, setting useAiCall=False")
|
||||
logger.debug(f"Section {sectionId}: No content parts and no generation hint, setting useAiCall=False")
|
||||
|
||||
elements = []
|
||||
|
||||
|
|
@ -259,12 +260,25 @@ class StructureFiller:
|
|||
"label": part.metadata.get("usageHint", part.label)
|
||||
})
|
||||
elif contentFormat == "object":
|
||||
elements.append({
|
||||
"type": part.typeGroup,
|
||||
"base64Data": part.data,
|
||||
"mimeType": part.mimeType,
|
||||
"altText": part.metadata.get("usageHint", part.label)
|
||||
})
|
||||
# Nested content structure for objects
|
||||
if part.typeGroup == "image":
|
||||
elements.append({
|
||||
"type": "image",
|
||||
"content": {
|
||||
"base64Data": part.data,
|
||||
"altText": part.metadata.get("usageHint", part.label),
|
||||
"caption": part.metadata.get("caption", "")
|
||||
}
|
||||
})
|
||||
else:
|
||||
elements.append({
|
||||
"type": part.typeGroup,
|
||||
"content": {
|
||||
"data": part.data,
|
||||
"mimeType": part.mimeType,
|
||||
"label": part.metadata.get("usageHint", part.label)
|
||||
}
|
||||
})
|
||||
|
||||
# Aggregiere extracted Parts mit AI
|
||||
if extractedParts:
|
||||
|
|
@ -300,11 +314,24 @@ class StructureFiller:
|
|||
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt (aggregation)")
|
||||
|
||||
# Verwende callAi für ContentParts-Unterstützung (nicht callAiPlanning!)
|
||||
# Use IMAGE_GENERATE for image content type
|
||||
operationType = OperationTypeEnum.IMAGE_GENERATE if contentType == "image" else OperationTypeEnum.DATA_ANALYSE
|
||||
|
||||
# For IMAGE_GENERATE, truncate prompt to 4000 chars (DALL-E limit)
|
||||
if operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||
maxPromptLength = 4000
|
||||
if len(generationPrompt) > maxPromptLength:
|
||||
logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
|
||||
# Keep the beginning (task, metadata, generation hint) and truncate from end
|
||||
generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0] # Truncate at last newline
|
||||
|
||||
# For IMAGE_GENERATE, don't pass contentParts - image generation uses prompt only, not content chunks
|
||||
contentPartsForCall = [] if operationType == OperationTypeEnum.IMAGE_GENERATE else extractedParts
|
||||
request = AiCallRequest(
|
||||
prompt=generationPrompt,
|
||||
contentParts=extractedParts, # ALLE PARTS!
|
||||
contentParts=contentPartsForCall, # Empty for IMAGE_GENERATE, all parts for others
|
||||
options=AiCallOptions(
|
||||
operationType=OperationTypeEnum.DATA_ANALYSE,
|
||||
operationType=operationType,
|
||||
priority=PriorityEnum.BALANCED,
|
||||
processingMode=ProcessingModeEnum.DETAILED
|
||||
)
|
||||
|
|
@ -318,14 +345,39 @@ class StructureFiller:
|
|||
)
|
||||
logger.debug(f"Logged section response: section_content_{sectionId}_response (aggregation)")
|
||||
|
||||
# Parse und füge zu elements hinzu
|
||||
generatedElements = json.loads(
|
||||
self.services.utils.jsonExtractString(aiResponse.content)
|
||||
)
|
||||
if isinstance(generatedElements, list):
|
||||
elements.extend(generatedElements)
|
||||
elif isinstance(generatedElements, dict) and "elements" in generatedElements:
|
||||
elements.extend(generatedElements["elements"])
|
||||
# Handle IMAGE_GENERATE differently - returns image data directly
|
||||
if contentType == "image" and operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||
import base64
|
||||
# Convert image data to base64 string if needed
|
||||
if isinstance(aiResponse.content, bytes):
|
||||
base64Data = base64.b64encode(aiResponse.content).decode('utf-8')
|
||||
elif isinstance(aiResponse.content, str):
|
||||
# Already base64 string or data URI
|
||||
if aiResponse.content.startswith("data:image/"):
|
||||
# Extract base64 from data URI
|
||||
base64Data = aiResponse.content.split(",", 1)[1]
|
||||
else:
|
||||
base64Data = aiResponse.content
|
||||
else:
|
||||
base64Data = ""
|
||||
|
||||
elements.append({
|
||||
"type": "image",
|
||||
"content": {
|
||||
"base64Data": base64Data,
|
||||
"altText": generationHint or "Generated image",
|
||||
"caption": ""
|
||||
}
|
||||
})
|
||||
else:
|
||||
# Parse JSON response for other content types
|
||||
generatedElements = json.loads(
|
||||
self.services.utils.jsonExtractString(aiResponse.content)
|
||||
)
|
||||
if isinstance(generatedElements, list):
|
||||
elements.extend(generatedElements)
|
||||
elif isinstance(generatedElements, dict) and "elements" in generatedElements:
|
||||
elements.extend(generatedElements["elements"])
|
||||
|
||||
# ChatLog abschließen
|
||||
self.services.chat.progressLogFinish(sectionOperationId, True)
|
||||
|
|
@ -342,6 +394,117 @@ class StructureFiller:
|
|||
# NICHT raise - Section wird mit Fehlermeldung gerendert
|
||||
|
||||
else:
|
||||
# Einzelverarbeitung: Jeder Part einzeln ODER Generation ohne ContentParts
|
||||
# Handle case where no content parts but generationHint exists (e.g., Executive Summary)
|
||||
if len(contentPartIds) == 0 and useAiCall and generationHint:
|
||||
# Generate content from scratch using only generationHint
|
||||
logger.debug(f"Processing section {sectionId}: No content parts, generating from generationHint only")
|
||||
generationPrompt = self._buildSectionGenerationPrompt(
|
||||
section=section,
|
||||
contentParts=[], # NO PARTS
|
||||
userPrompt=userPrompt,
|
||||
generationHint=generationHint,
|
||||
allSections=all_sections_list,
|
||||
sectionIndex=sectionIndex,
|
||||
isAggregation=False
|
||||
)
|
||||
|
||||
# Erstelle Operation-ID für Section-Generierung
|
||||
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
|
||||
|
||||
# Starte ChatLog mit Parent-Referenz
|
||||
self.services.chat.progressLogStart(
|
||||
sectionOperationId,
|
||||
"Section Generation",
|
||||
"Section",
|
||||
f"Generating section {sectionId} from generationHint",
|
||||
parentOperationId=fillOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
# Debug: Log Prompt
|
||||
self.services.utils.writeDebugFile(
|
||||
generationPrompt,
|
||||
f"section_content_{sectionId}_prompt"
|
||||
)
|
||||
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt")
|
||||
|
||||
# Verwende callAi ohne ContentParts
|
||||
operationType = OperationTypeEnum.IMAGE_GENERATE if contentType == "image" else OperationTypeEnum.DATA_ANALYSE
|
||||
|
||||
# For IMAGE_GENERATE, truncate prompt to 4000 chars (DALL-E limit)
|
||||
if operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||
maxPromptLength = 4000
|
||||
if len(generationPrompt) > maxPromptLength:
|
||||
logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
|
||||
# Keep the beginning (task, metadata, generation hint) and truncate from end
|
||||
generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0] # Truncate at last newline
|
||||
|
||||
request = AiCallRequest(
|
||||
prompt=generationPrompt,
|
||||
contentParts=[], # NO PARTS
|
||||
options=AiCallOptions(
|
||||
operationType=operationType,
|
||||
priority=PriorityEnum.BALANCED,
|
||||
processingMode=ProcessingModeEnum.DETAILED
|
||||
)
|
||||
)
|
||||
aiResponse = await self.aiService.callAi(request)
|
||||
|
||||
# Debug: Log Response
|
||||
self.services.utils.writeDebugFile(
|
||||
aiResponse.content,
|
||||
f"section_content_{sectionId}_response"
|
||||
)
|
||||
logger.debug(f"Logged section response: section_content_{sectionId}_response")
|
||||
|
||||
# Handle IMAGE_GENERATE differently - returns image data directly
|
||||
if contentType == "image" and operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||
import base64
|
||||
# Convert image data to base64 string if needed
|
||||
if isinstance(aiResponse.content, bytes):
|
||||
base64Data = base64.b64encode(aiResponse.content).decode('utf-8')
|
||||
elif isinstance(aiResponse.content, str):
|
||||
# Already base64 string or data URI
|
||||
if aiResponse.content.startswith("data:image/"):
|
||||
# Extract base64 from data URI
|
||||
base64Data = aiResponse.content.split(",", 1)[1]
|
||||
else:
|
||||
base64Data = aiResponse.content
|
||||
else:
|
||||
base64Data = ""
|
||||
|
||||
elements.append({
|
||||
"type": "image",
|
||||
"content": {
|
||||
"base64Data": base64Data,
|
||||
"altText": generationHint or "Generated image",
|
||||
"caption": ""
|
||||
}
|
||||
})
|
||||
else:
|
||||
# Parse JSON response for other content types
|
||||
generatedElements = json.loads(
|
||||
self.services.utils.jsonExtractString(aiResponse.content)
|
||||
)
|
||||
if isinstance(generatedElements, list):
|
||||
elements.extend(generatedElements)
|
||||
elif isinstance(generatedElements, dict) and "elements" in generatedElements:
|
||||
elements.extend(generatedElements["elements"])
|
||||
|
||||
# ChatLog abschließen
|
||||
self.services.chat.progressLogFinish(sectionOperationId, True)
|
||||
|
||||
except Exception as e:
|
||||
# Fehlerhafte Section mit Fehlermeldung rendern (kein Abbruch!)
|
||||
self.services.chat.progressLogFinish(sectionOperationId, False)
|
||||
elements.append({
|
||||
"type": "error",
|
||||
"message": f"Error generating section {sectionId}: {str(e)}",
|
||||
"sectionId": sectionId
|
||||
})
|
||||
logger.error(f"Error generating section {sectionId}: {str(e)}")
|
||||
|
||||
# Einzelverarbeitung: Jeder Part einzeln
|
||||
for partId in contentPartIds:
|
||||
part = self._findContentPartById(partId, contentParts)
|
||||
|
|
@ -359,13 +522,26 @@ class StructureFiller:
|
|||
})
|
||||
|
||||
elif contentFormat == "object":
|
||||
# Füge base64 Object hinzu
|
||||
elements.append({
|
||||
"type": part.typeGroup, # "image", "binary", etc.
|
||||
"base64Data": part.data,
|
||||
"mimeType": part.mimeType,
|
||||
"altText": part.metadata.get("usageHint", part.label)
|
||||
})
|
||||
# Füge base64 Object hinzu (nested in content structure)
|
||||
if part.typeGroup == "image":
|
||||
elements.append({
|
||||
"type": "image",
|
||||
"content": {
|
||||
"base64Data": part.data,
|
||||
"altText": part.metadata.get("usageHint", part.label),
|
||||
"caption": part.metadata.get("caption", "")
|
||||
}
|
||||
})
|
||||
else:
|
||||
# For other object types, use generic structure
|
||||
elements.append({
|
||||
"type": part.typeGroup,
|
||||
"content": {
|
||||
"data": part.data,
|
||||
"mimeType": part.mimeType,
|
||||
"label": part.metadata.get("usageHint", part.label)
|
||||
}
|
||||
})
|
||||
|
||||
elif contentFormat == "extracted":
|
||||
# WICHTIG: Prüfe sowohl useAiCall als auch generationHint
|
||||
|
|
@ -403,11 +579,24 @@ class StructureFiller:
|
|||
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt")
|
||||
|
||||
# Verwende callAi für ContentParts-Unterstützung
|
||||
# Use IMAGE_GENERATE for image content type
|
||||
operationType = OperationTypeEnum.IMAGE_GENERATE if contentType == "image" else OperationTypeEnum.DATA_ANALYSE
|
||||
|
||||
# For IMAGE_GENERATE, truncate prompt to 4000 chars (DALL-E limit)
|
||||
if operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||
maxPromptLength = 4000
|
||||
if len(generationPrompt) > maxPromptLength:
|
||||
logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
|
||||
# Keep the beginning (task, metadata, generation hint) and truncate from end
|
||||
generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0] # Truncate at last newline
|
||||
|
||||
# For IMAGE_GENERATE, don't pass contentParts - image generation uses prompt only, not content chunks
|
||||
contentPartsForCall = [] if operationType == OperationTypeEnum.IMAGE_GENERATE else [part]
|
||||
request = AiCallRequest(
|
||||
prompt=generationPrompt,
|
||||
contentParts=[part],
|
||||
contentParts=contentPartsForCall,
|
||||
options=AiCallOptions(
|
||||
operationType=OperationTypeEnum.DATA_ANALYSE,
|
||||
operationType=operationType,
|
||||
priority=PriorityEnum.BALANCED,
|
||||
processingMode=ProcessingModeEnum.DETAILED
|
||||
)
|
||||
|
|
@ -421,14 +610,39 @@ class StructureFiller:
|
|||
)
|
||||
logger.debug(f"Logged section response: section_content_{sectionId}_response")
|
||||
|
||||
# Parse und füge zu elements hinzu
|
||||
generatedElements = json.loads(
|
||||
self.services.utils.jsonExtractString(aiResponse.content)
|
||||
)
|
||||
if isinstance(generatedElements, list):
|
||||
elements.extend(generatedElements)
|
||||
elif isinstance(generatedElements, dict) and "elements" in generatedElements:
|
||||
elements.extend(generatedElements["elements"])
|
||||
# Handle IMAGE_GENERATE differently - returns image data directly
|
||||
if contentType == "image" and operationType == OperationTypeEnum.IMAGE_GENERATE:
|
||||
import base64
|
||||
# Convert image data to base64 string if needed
|
||||
if isinstance(aiResponse.content, bytes):
|
||||
base64Data = base64.b64encode(aiResponse.content).decode('utf-8')
|
||||
elif isinstance(aiResponse.content, str):
|
||||
# Already base64 string or data URI
|
||||
if aiResponse.content.startswith("data:image/"):
|
||||
# Extract base64 from data URI
|
||||
base64Data = aiResponse.content.split(",", 1)[1]
|
||||
else:
|
||||
base64Data = aiResponse.content
|
||||
else:
|
||||
base64Data = ""
|
||||
|
||||
elements.append({
|
||||
"type": "image",
|
||||
"content": {
|
||||
"base64Data": base64Data,
|
||||
"altText": generationHint or "Generated image",
|
||||
"caption": ""
|
||||
}
|
||||
})
|
||||
else:
|
||||
# Parse JSON response for other content types
|
||||
generatedElements = json.loads(
|
||||
self.services.utils.jsonExtractString(aiResponse.content)
|
||||
)
|
||||
if isinstance(generatedElements, list):
|
||||
elements.extend(generatedElements)
|
||||
elif isinstance(generatedElements, dict) and "elements" in generatedElements:
|
||||
elements.extend(generatedElements["elements"])
|
||||
|
||||
# ChatLog abschließen
|
||||
self.services.chat.progressLogFinish(sectionOperationId, True)
|
||||
|
|
@ -502,16 +716,6 @@ class StructureFiller:
|
|||
if partId in contentPartsMap:
|
||||
section["contentPartsMetadata"].append(contentPartsMap[partId])
|
||||
|
||||
# Prüfe ob Sections direkt vorhanden sind (Legacy-Struktur)
|
||||
elif "sections" in doc:
|
||||
for section in doc.get("sections", []):
|
||||
contentPartIds = section.get("contentPartIds", [])
|
||||
if contentPartIds:
|
||||
section["contentPartsMetadata"] = []
|
||||
for partId in contentPartIds:
|
||||
if partId in contentPartsMap:
|
||||
section["contentPartsMetadata"].append(contentPartsMap[partId])
|
||||
|
||||
return structure
|
||||
|
||||
def _flattenChaptersToSections(
|
||||
|
|
@ -542,8 +746,10 @@ class StructureFiller:
|
|||
"content_type": "heading",
|
||||
"elements": [{
|
||||
"type": "heading",
|
||||
"content": chapter.get("title"),
|
||||
"level": chapter.get("level", 1)
|
||||
"content": {
|
||||
"text": chapter.get("title", ""),
|
||||
"level": chapter.get("level", 1)
|
||||
}
|
||||
}]
|
||||
}
|
||||
flattened_doc["sections"].append(heading_section)
|
||||
|
|
@ -555,276 +761,6 @@ class StructureFiller:
|
|||
|
||||
return result
|
||||
|
||||
async def _fillStructureLegacy(
|
||||
self,
|
||||
structure: Dict[str, Any],
|
||||
contentParts: List[ContentPart],
|
||||
userPrompt: str,
|
||||
fillOperationId: str
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Legacy: Füllt Struktur mit Sections direkt (für Rückwärtskompatibilität).
|
||||
"""
|
||||
# Starte ChatLog
|
||||
self.services.chat.progressLogStart(
|
||||
fillOperationId,
|
||||
"Structure Filling (Legacy)",
|
||||
"Filling",
|
||||
f"Filling {len(structure.get('documents', [{}])[0].get('sections', []))} sections",
|
||||
parentOperationId=fillOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
filledStructure = copy.deepcopy(structure)
|
||||
|
||||
# Sammle alle Sections
|
||||
sections_to_process = []
|
||||
all_sections_list = []
|
||||
for doc in filledStructure.get("documents", []):
|
||||
doc_sections = doc.get("sections", [])
|
||||
all_sections_list.extend(doc_sections)
|
||||
for section in doc_sections:
|
||||
sections_to_process.append((doc, section))
|
||||
|
||||
# Verarbeite Sections (bestehende Logik)
|
||||
for sectionIndex, (doc, section) in enumerate(sections_to_process):
|
||||
sectionId = section.get("id")
|
||||
contentPartIds = section.get("contentPartIds", [])
|
||||
contentFormats = section.get("contentFormats", {})
|
||||
# Check both camelCase and snake_case for generationHint
|
||||
generationHint = section.get("generationHint") or section.get("generation_hint")
|
||||
contentType = section.get("content_type", "paragraph")
|
||||
useAiCall = section.get("useAiCall", False)
|
||||
|
||||
# WICHTIG: Wenn keine ContentParts vorhanden sind, kann kein AI-Call gemacht werden
|
||||
if len(contentPartIds) == 0:
|
||||
useAiCall = False
|
||||
logger.debug(f"Section {sectionId} (legacy): No content parts, setting useAiCall=False")
|
||||
|
||||
elements = []
|
||||
|
||||
# Prüfe ob Aggregation nötig ist
|
||||
needsAggregation = self._needsAggregation(
|
||||
contentType=contentType,
|
||||
contentPartCount=len(contentPartIds)
|
||||
)
|
||||
|
||||
logger.info(f"Processing section {sectionId} (legacy): contentType={contentType}, contentPartCount={len(contentPartIds)}, useAiCall={useAiCall}, needsAggregation={needsAggregation}, hasGenerationHint={bool(generationHint)}")
|
||||
|
||||
if needsAggregation and useAiCall and generationHint:
|
||||
# Aggregation: Alle Parts zusammen verarbeiten
|
||||
sectionParts = [
|
||||
self._findContentPartById(pid, contentParts)
|
||||
for pid in contentPartIds
|
||||
]
|
||||
sectionParts = [p for p in sectionParts if p is not None]
|
||||
|
||||
if sectionParts:
|
||||
# Filtere nur extracted Parts für Aggregation
|
||||
extractedParts = [
|
||||
p for p in sectionParts
|
||||
if contentFormats.get(p.id, p.metadata.get("contentFormat")) == "extracted"
|
||||
]
|
||||
nonExtractedParts = [
|
||||
p for p in sectionParts
|
||||
if contentFormats.get(p.id, p.metadata.get("contentFormat")) != "extracted"
|
||||
]
|
||||
|
||||
# Verarbeite non-extracted Parts separat
|
||||
for part in nonExtractedParts:
|
||||
contentFormat = contentFormats.get(part.id, part.metadata.get("contentFormat"))
|
||||
|
||||
if contentFormat == "reference":
|
||||
elements.append({
|
||||
"type": "reference",
|
||||
"documentReference": part.metadata.get("documentReference"),
|
||||
"label": part.metadata.get("usageHint", part.label)
|
||||
})
|
||||
elif contentFormat == "object":
|
||||
elements.append({
|
||||
"type": part.typeGroup,
|
||||
"base64Data": part.data,
|
||||
"mimeType": part.mimeType,
|
||||
"altText": part.metadata.get("usageHint", part.label)
|
||||
})
|
||||
|
||||
# Aggregiere extracted Parts mit AI
|
||||
if extractedParts:
|
||||
generationPrompt = self._buildSectionGenerationPrompt(
|
||||
section=section,
|
||||
contentParts=extractedParts,
|
||||
userPrompt=userPrompt,
|
||||
generationHint=generationHint,
|
||||
allSections=all_sections_list,
|
||||
sectionIndex=sectionIndex,
|
||||
isAggregation=True
|
||||
)
|
||||
|
||||
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
|
||||
|
||||
self.services.chat.progressLogStart(
|
||||
sectionOperationId,
|
||||
"Section Generation (Aggregation)",
|
||||
"Section",
|
||||
f"Generating section {sectionId} with {len(extractedParts)} parts",
|
||||
parentOperationId=fillOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
self.services.utils.writeDebugFile(
|
||||
generationPrompt,
|
||||
f"section_content_{sectionId}_prompt"
|
||||
)
|
||||
|
||||
request = AiCallRequest(
|
||||
prompt=generationPrompt,
|
||||
contentParts=extractedParts,
|
||||
options=AiCallOptions(
|
||||
operationType=OperationTypeEnum.DATA_ANALYSE,
|
||||
priority=PriorityEnum.BALANCED,
|
||||
processingMode=ProcessingModeEnum.DETAILED
|
||||
)
|
||||
)
|
||||
aiResponse = await self.aiService.callAi(request)
|
||||
|
||||
self.services.utils.writeDebugFile(
|
||||
aiResponse.content,
|
||||
f"section_content_{sectionId}_response"
|
||||
)
|
||||
|
||||
generatedElements = json.loads(
|
||||
self.services.utils.jsonExtractString(aiResponse.content)
|
||||
)
|
||||
if isinstance(generatedElements, list):
|
||||
elements.extend(generatedElements)
|
||||
elif isinstance(generatedElements, dict) and "elements" in generatedElements:
|
||||
elements.extend(generatedElements["elements"])
|
||||
|
||||
self.services.chat.progressLogFinish(sectionOperationId, True)
|
||||
|
||||
except Exception as e:
|
||||
self.services.chat.progressLogFinish(sectionOperationId, False)
|
||||
elements.append({
|
||||
"type": "error",
|
||||
"message": f"Error generating section {sectionId}: {str(e)}",
|
||||
"sectionId": sectionId
|
||||
})
|
||||
logger.error(f"Error generating section {sectionId}: {str(e)}")
|
||||
|
||||
else:
|
||||
# Einzelverarbeitung: Jeder Part einzeln
|
||||
for partId in contentPartIds:
|
||||
part = self._findContentPartById(partId, contentParts)
|
||||
if not part:
|
||||
continue
|
||||
|
||||
contentFormat = contentFormats.get(partId, part.metadata.get("contentFormat"))
|
||||
|
||||
if contentFormat == "reference":
|
||||
elements.append({
|
||||
"type": "reference",
|
||||
"documentReference": part.metadata.get("documentReference"),
|
||||
"label": part.metadata.get("usageHint", part.label)
|
||||
})
|
||||
|
||||
elif contentFormat == "object":
|
||||
elements.append({
|
||||
"type": part.typeGroup,
|
||||
"base64Data": part.data,
|
||||
"mimeType": part.mimeType,
|
||||
"altText": part.metadata.get("usageHint", part.label)
|
||||
})
|
||||
|
||||
elif contentFormat == "extracted":
|
||||
# WICHTIG: Prüfe sowohl useAiCall als auch generationHint
|
||||
if useAiCall and generationHint:
|
||||
# AI-Call mit einzelnen ContentPart
|
||||
logger.debug(f"Processing section {sectionId}: Single extracted part with AI call (useAiCall={useAiCall}, generationHint={bool(generationHint)})")
|
||||
generationPrompt = self._buildSectionGenerationPrompt(
|
||||
section=section,
|
||||
contentParts=[part],
|
||||
userPrompt=userPrompt,
|
||||
generationHint=generationHint,
|
||||
allSections=all_sections_list,
|
||||
sectionIndex=sectionIndex,
|
||||
isAggregation=False
|
||||
)
|
||||
|
||||
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
|
||||
|
||||
self.services.chat.progressLogStart(
|
||||
sectionOperationId,
|
||||
"Section Generation",
|
||||
"Section",
|
||||
f"Generating section {sectionId}",
|
||||
parentOperationId=fillOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
self.services.utils.writeDebugFile(
|
||||
generationPrompt,
|
||||
f"section_content_{sectionId}_prompt"
|
||||
)
|
||||
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt")
|
||||
|
||||
request = AiCallRequest(
|
||||
prompt=generationPrompt,
|
||||
contentParts=[part],
|
||||
options=AiCallOptions(
|
||||
operationType=OperationTypeEnum.DATA_ANALYSE,
|
||||
priority=PriorityEnum.BALANCED,
|
||||
processingMode=ProcessingModeEnum.DETAILED
|
||||
)
|
||||
)
|
||||
aiResponse = await self.aiService.callAi(request)
|
||||
|
||||
self.services.utils.writeDebugFile(
|
||||
aiResponse.content,
|
||||
f"section_content_{sectionId}_response"
|
||||
)
|
||||
logger.debug(f"Logged section response: section_content_{sectionId}_response")
|
||||
|
||||
generatedElements = json.loads(
|
||||
self.services.utils.jsonExtractString(aiResponse.content)
|
||||
)
|
||||
if isinstance(generatedElements, list):
|
||||
elements.extend(generatedElements)
|
||||
elif isinstance(generatedElements, dict) and "elements" in generatedElements:
|
||||
elements.extend(generatedElements["elements"])
|
||||
|
||||
self.services.chat.progressLogFinish(sectionOperationId, True)
|
||||
|
||||
except Exception as e:
|
||||
self.services.chat.progressLogFinish(sectionOperationId, False)
|
||||
elements.append({
|
||||
"type": "error",
|
||||
"message": f"Error generating section {sectionId}: {str(e)}",
|
||||
"sectionId": sectionId
|
||||
})
|
||||
logger.error(f"Error generating section {sectionId}: {str(e)}")
|
||||
else:
|
||||
# Füge extrahierten Text direkt hinzu (kein AI-Call)
|
||||
logger.debug(f"Processing section {sectionId}: Single extracted part WITHOUT AI call (useAiCall={useAiCall}, generationHint={bool(generationHint)}) - adding extracted text directly")
|
||||
elements.append({
|
||||
"type": "extracted_text",
|
||||
"content": part.data,
|
||||
"source": part.metadata.get("documentId"),
|
||||
"extractionPrompt": part.metadata.get("extractionPrompt")
|
||||
})
|
||||
|
||||
section["elements"] = elements
|
||||
|
||||
# Füge ContentParts-Metadaten zur Struktur hinzu (für Validierung)
|
||||
filledStructure = self._addContentPartsMetadata(filledStructure, contentParts)
|
||||
|
||||
self.services.chat.progressLogFinish(fillOperationId, True)
|
||||
return filledStructure
|
||||
|
||||
except Exception as e:
|
||||
self.services.chat.progressLogFinish(fillOperationId, False)
|
||||
logger.error(f"Error in _fillStructureLegacy: {str(e)}")
|
||||
raise
|
||||
|
||||
def _buildChapterSectionsStructurePrompt(
|
||||
self,
|
||||
chapterId: str,
|
||||
|
|
@ -899,6 +835,18 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
|
|||
"""
|
||||
return prompt
|
||||
|
||||
def _getContentStructureExample(self, contentType: str) -> str:
|
||||
"""Get the JSON structure example for a specific content type."""
|
||||
structures = {
|
||||
"table": '{{"headers": ["Column1", "Column2"], "rows": [["Value1", "Value2"], ["Value3", "Value4"]]}}',
|
||||
"bullet_list": '{{"items": ["Item 1", "Item 2", "Item 3"]}}',
|
||||
"heading": '{{"text": "Section Title", "level": 2}}',
|
||||
"paragraph": '{{"text": "This is paragraph text."}}',
|
||||
"code_block": '{{"code": "function example() {{ return true; }}", "language": "javascript"}}',
|
||||
"image": '{{"base64Data": "<base64_encoded_image_data>", "altText": "Description", "caption": "Optional caption"}}'
|
||||
}
|
||||
return structures.get(contentType, '{{"text": ""}}')
|
||||
|
||||
def _buildSectionGenerationPrompt(
|
||||
self,
|
||||
section: Dict[str, Any],
|
||||
|
|
@ -998,6 +946,8 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
|
|||
for next in nextSections:
|
||||
contextText += f"- {next['id']} ({next['content_type']}): {next['generation_hint']}\n"
|
||||
|
||||
contentStructureExample = self._getContentStructureExample(contentType)
|
||||
|
||||
if isAggregation:
|
||||
prompt = f"""# TASK: Generate Section Content (Aggregation)
|
||||
|
||||
|
|
@ -1027,21 +977,17 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
|
|||
|
||||
## OUTPUT FORMAT
|
||||
Return a JSON object with this structure:
|
||||
```json
|
||||
|
||||
{{
|
||||
"elements": [
|
||||
{{
|
||||
"type": "{contentType}",
|
||||
"headers": [...], // if table
|
||||
"rows": [...], // if table
|
||||
"items": [...], // if bullet_list
|
||||
"content": "..." // if paragraph
|
||||
"content": {contentStructureExample}
|
||||
}}
|
||||
]
|
||||
}}
|
||||
```
|
||||
|
||||
CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
|
||||
CRITICAL: "content" MUST always be an object (never a string). Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
|
||||
"""
|
||||
else:
|
||||
prompt = f"""# TASK: Generate Section Content
|
||||
|
|
@ -1071,18 +1017,17 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
|
|||
|
||||
## OUTPUT FORMAT
|
||||
Return a JSON object with this structure:
|
||||
```json
|
||||
|
||||
{{
|
||||
"elements": [
|
||||
{{
|
||||
"type": "{contentType}",
|
||||
"content": "..."
|
||||
"content": {contentStructureExample}
|
||||
}}
|
||||
]
|
||||
}}
|
||||
```
|
||||
|
||||
CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
|
||||
CRITICAL: "content" MUST always be an object (never a string). Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
|
||||
"""
|
||||
return prompt
|
||||
|
||||
|
|
|
|||
|
|
@ -1129,8 +1129,9 @@ class ExtractionService:
|
|||
logger.warning(f"⚠️ Content part ({contentTokens:.0f} tokens est.) exceeds available space ({availableContentBytes/TOKEN_SAFETY_FACTOR:.0f} tokens est.), chunking required")
|
||||
|
||||
# If either condition fails, chunk the content
|
||||
if totalTokens > maxTotalTokens or partSize > availableContentBytes:
|
||||
# Part too large or total exceeds limit - chunk it
|
||||
# CRITICAL: IMAGE_GENERATE operations should NOT use chunking - they generate images from prompts, not process content chunks
|
||||
if (totalTokens > maxTotalTokens or partSize > availableContentBytes) and options.operationType != OperationTypeEnum.IMAGE_GENERATE:
|
||||
# Part too large or total exceeds limit - chunk it (but not for image generation)
|
||||
chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
|
||||
if not chunks:
|
||||
raise ValueError(f"Failed to chunk content part for model {model.name}")
|
||||
|
|
|
|||
|
|
@ -199,29 +199,40 @@ class BaseRenderer(ABC):
|
|||
return "unknown"
|
||||
|
||||
def _extractTableData(self, sectionData: Dict[str, Any]) -> Tuple[List[str], List[List[str]]]:
|
||||
"""Extract table headers and rows from section data."""
|
||||
"""Extract table headers and rows from section data. Expects nested content structure."""
|
||||
# Normalize when elements array was passed in
|
||||
if isinstance(sectionData, list):
|
||||
if sectionData and isinstance(sectionData[0], dict):
|
||||
sectionData = sectionData[0]
|
||||
else:
|
||||
# Empty list or invalid structure - return empty table
|
||||
return [], []
|
||||
# Ensure sectionData is a dict before calling .get()
|
||||
# Ensure sectionData is a dict
|
||||
if not isinstance(sectionData, dict):
|
||||
return [], []
|
||||
headers = sectionData.get("headers", [])
|
||||
rows = sectionData.get("rows", [])
|
||||
# Extract from nested content structure
|
||||
content = sectionData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return [], []
|
||||
headers = content.get("headers", [])
|
||||
rows = content.get("rows", [])
|
||||
return headers, rows
|
||||
|
||||
def _extractBulletListItems(self, sectionData: Dict[str, Any]) -> List[str]:
|
||||
"""Extract bullet list items from section data."""
|
||||
# Normalize when elements array or raw list was passed in
|
||||
"""Extract bullet list items from section data. Expects nested content structure."""
|
||||
# Normalize when elements array was passed in
|
||||
if isinstance(sectionData, list):
|
||||
# Already a list of items (strings or dicts)
|
||||
items = sectionData
|
||||
else:
|
||||
items = sectionData.get("items", [])
|
||||
if sectionData and isinstance(sectionData[0], dict):
|
||||
sectionData = sectionData[0]
|
||||
else:
|
||||
return []
|
||||
# Ensure sectionData is a dict
|
||||
if not isinstance(sectionData, dict):
|
||||
return []
|
||||
# Extract from nested content structure
|
||||
content = sectionData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return []
|
||||
items = content.get("items", [])
|
||||
result = []
|
||||
for item in items:
|
||||
if isinstance(item, str):
|
||||
|
|
@ -231,64 +242,89 @@ class BaseRenderer(ABC):
|
|||
return result
|
||||
|
||||
def _extractHeadingData(self, sectionData: Dict[str, Any]) -> Tuple[int, str]:
|
||||
"""Extract heading level and text from section data."""
|
||||
"""Extract heading level and text from section data. Expects nested content structure."""
|
||||
# Normalize when elements array was passed in
|
||||
if isinstance(sectionData, list):
|
||||
if sectionData and isinstance(sectionData[0], dict):
|
||||
sectionData = sectionData[0]
|
||||
else:
|
||||
# Empty list or invalid structure - return default
|
||||
return 1, ""
|
||||
# Ensure sectionData is a dict before calling .get()
|
||||
# Ensure sectionData is a dict
|
||||
if not isinstance(sectionData, dict):
|
||||
return 1, ""
|
||||
level = sectionData.get("level", 1)
|
||||
text = sectionData.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = sectionData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return 1, ""
|
||||
level = content.get("level", 1)
|
||||
text = content.get("text", "")
|
||||
return level, text
|
||||
|
||||
def _extractParagraphText(self, sectionData: Dict[str, Any]) -> str:
|
||||
"""Extract paragraph text from section data."""
|
||||
"""Extract paragraph text from section data. Expects nested content structure."""
|
||||
if isinstance(sectionData, list):
|
||||
# Join multiple paragraph elements if provided as a list
|
||||
texts = []
|
||||
for el in sectionData:
|
||||
if isinstance(el, dict) and "text" in el:
|
||||
texts.append(el["text"])
|
||||
if isinstance(el, dict):
|
||||
content = el.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
text = content
|
||||
else:
|
||||
text = ""
|
||||
if text:
|
||||
texts.append(text)
|
||||
elif isinstance(el, str):
|
||||
texts.append(el)
|
||||
return "\n".join(texts)
|
||||
return sectionData.get("text", "")
|
||||
# Extract from nested content structure
|
||||
if not isinstance(sectionData, dict):
|
||||
return ""
|
||||
content = sectionData.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
return content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
return content
|
||||
return ""
|
||||
|
||||
def _extractCodeBlockData(self, sectionData: Dict[str, Any]) -> Tuple[str, str]:
|
||||
"""Extract code and language from section data."""
|
||||
"""Extract code and language from section data. Expects nested content structure."""
|
||||
# Normalize when elements array was passed in
|
||||
if isinstance(sectionData, list):
|
||||
if sectionData and isinstance(sectionData[0], dict):
|
||||
sectionData = sectionData[0]
|
||||
else:
|
||||
# Empty list or invalid structure - return default
|
||||
return "", ""
|
||||
# Ensure sectionData is a dict before calling .get()
|
||||
# Ensure sectionData is a dict
|
||||
if not isinstance(sectionData, dict):
|
||||
return "", ""
|
||||
code = sectionData.get("code", "")
|
||||
language = sectionData.get("language", "")
|
||||
# Extract from nested content structure
|
||||
content = sectionData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return "", ""
|
||||
code = content.get("code", "")
|
||||
language = content.get("language", "")
|
||||
return code, language
|
||||
|
||||
def _extractImageData(self, sectionData: Dict[str, Any]) -> Tuple[str, str]:
|
||||
"""Extract base64 data and alt text from section data."""
|
||||
"""Extract base64 data and alt text from section data. Expects nested content structure."""
|
||||
# Normalize when elements array was passed in
|
||||
if isinstance(sectionData, list):
|
||||
if sectionData and isinstance(sectionData[0], dict):
|
||||
sectionData = sectionData[0]
|
||||
else:
|
||||
# Empty list or invalid structure - return default
|
||||
return "", "Image"
|
||||
# Ensure sectionData is a dict before calling .get()
|
||||
# Ensure sectionData is a dict
|
||||
if not isinstance(sectionData, dict):
|
||||
return "", "Image"
|
||||
base64Data = sectionData.get("base64Data", "")
|
||||
altText = sectionData.get("altText", "Image")
|
||||
# Extract from nested content structure
|
||||
content = sectionData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return "", "Image"
|
||||
base64Data = content.get("base64Data", "")
|
||||
altText = content.get("altText", "Image")
|
||||
return base64Data, altText
|
||||
|
||||
def _renderImageSection(self, section: Dict[str, Any], styles: Dict[str, Any] = None) -> Any:
|
||||
|
|
|
|||
|
|
@ -41,11 +41,17 @@ class RendererCsv(BaseRenderer):
|
|||
else:
|
||||
filename = self._determineFilename(title, "text/csv")
|
||||
|
||||
# Extract metadata for document type and other info
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=csvContent.encode('utf-8'),
|
||||
mimeType="text/csv",
|
||||
filename=filename
|
||||
filename=filename,
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -130,8 +136,12 @@ class RendererCsv(BaseRenderer):
|
|||
def _renderJsonTableToCsv(self, tableData: Dict[str, Any]) -> List[List[str]]:
|
||||
"""Render a JSON table to CSV rows."""
|
||||
try:
|
||||
headers = tableData.get("headers", [])
|
||||
rows = tableData.get("rows", [])
|
||||
# Extract from nested content structure
|
||||
content = tableData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return []
|
||||
headers = content.get("headers", [])
|
||||
rows = content.get("rows", [])
|
||||
|
||||
csvRows = []
|
||||
|
||||
|
|
@ -150,7 +160,11 @@ class RendererCsv(BaseRenderer):
|
|||
def _renderJsonListToCsv(self, listData: Dict[str, Any]) -> List[List[str]]:
|
||||
"""Render a JSON list to CSV rows."""
|
||||
try:
|
||||
items = listData.get("items", [])
|
||||
# Extract from nested content structure
|
||||
content = listData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return []
|
||||
items = content.get("items", [])
|
||||
csvRows = []
|
||||
|
||||
for item in items:
|
||||
|
|
@ -177,8 +191,12 @@ class RendererCsv(BaseRenderer):
|
|||
def _renderJsonHeadingToCsv(self, headingData: Dict[str, Any]) -> List[List[str]]:
|
||||
"""Render a JSON heading to CSV rows."""
|
||||
try:
|
||||
text = headingData.get("text", "")
|
||||
level = headingData.get("level", 1)
|
||||
# Extract from nested content structure
|
||||
content = headingData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return []
|
||||
text = content.get("text", "")
|
||||
level = content.get("level", 1)
|
||||
|
||||
if text:
|
||||
# Use # symbols for heading levels
|
||||
|
|
@ -194,7 +212,14 @@ class RendererCsv(BaseRenderer):
|
|||
def _renderJsonParagraphToCsv(self, paragraphData: Dict[str, Any]) -> List[List[str]]:
|
||||
"""Render a JSON paragraph to CSV rows."""
|
||||
try:
|
||||
text = paragraphData.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = paragraphData.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
text = content
|
||||
else:
|
||||
text = ""
|
||||
|
||||
if text:
|
||||
# Split long paragraphs into multiple rows if needed
|
||||
|
|
@ -229,8 +254,12 @@ class RendererCsv(BaseRenderer):
|
|||
def _renderJsonCodeToCsv(self, codeData: Dict[str, Any]) -> List[List[str]]:
|
||||
"""Render a JSON code block to CSV rows."""
|
||||
try:
|
||||
code = codeData.get("code", "")
|
||||
language = codeData.get("language", "")
|
||||
# Extract from nested content structure
|
||||
content = codeData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return []
|
||||
code = content.get("code", "")
|
||||
language = content.get("language", "")
|
||||
|
||||
csvRows = []
|
||||
|
||||
|
|
|
|||
|
|
@ -52,6 +52,10 @@ class RendererDocx(BaseRenderer):
|
|||
# Generate DOCX using AI-analyzed styling
|
||||
docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService)
|
||||
|
||||
# Extract metadata for document type and other info
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
|
||||
# Determine filename from document or title
|
||||
documents = extractedContent.get("documents", [])
|
||||
if documents and isinstance(documents[0], dict):
|
||||
|
|
@ -74,7 +78,9 @@ class RendererDocx(BaseRenderer):
|
|||
RenderedDocument(
|
||||
documentData=docx_bytes,
|
||||
mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
filename=filename
|
||||
filename=filename,
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -82,11 +88,15 @@ class RendererDocx(BaseRenderer):
|
|||
self.logger.error(f"Error rendering DOCX: {str(e)}")
|
||||
# Return minimal fallback
|
||||
fallbackContent = f"DOCX Generation Error: {str(e)}"
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=fallbackContent.encode('utf-8'),
|
||||
mimeType="text/plain",
|
||||
filename=self._determineFilename(title, "text/plain")
|
||||
filename=self._determineFilename(title, "text/plain"),
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -96,8 +106,8 @@ class RendererDocx(BaseRenderer):
|
|||
# Create new document
|
||||
doc = Document()
|
||||
|
||||
# Get style set: default styles, enhanced with AI if style instructions present
|
||||
styleSet = await self._getStyleSet(userPrompt, aiService)
|
||||
# Get style set: use styles from metadata if available, otherwise enhance with AI
|
||||
styleSet = await self._getStyleSet(json_content, userPrompt, aiService)
|
||||
|
||||
# Setup basic document styles and create all styles from style set
|
||||
self._setupBasicDocumentStyles(doc)
|
||||
|
|
@ -137,12 +147,17 @@ class RendererDocx(BaseRenderer):
|
|||
self.logger.error(f"Error generating DOCX from JSON: {str(e)}")
|
||||
raise Exception(f"DOCX generation failed: {str(e)}")
|
||||
|
||||
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||
"""Get style set - default styles, enhanced with AI if userPrompt provided.
|
||||
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||
"""Get style set - use styles from document generation metadata if available,
|
||||
otherwise enhance default styles with AI if userPrompt provided.
|
||||
|
||||
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
|
||||
not be generated separately by renderers. Only fall back to AI if styles not provided.
|
||||
|
||||
Args:
|
||||
extractedContent: Document content with metadata (may contain styles)
|
||||
userPrompt: User's prompt (AI will detect style instructions in any language)
|
||||
aiService: AI service (used only if userPrompt provided)
|
||||
aiService: AI service (used only if styles not in metadata and userPrompt provided)
|
||||
templateName: Name of template style set (None = default)
|
||||
|
||||
Returns:
|
||||
|
|
@ -156,10 +171,18 @@ class RendererDocx(BaseRenderer):
|
|||
else:
|
||||
defaultStyleSet = self._getDefaultStyleSet()
|
||||
|
||||
# Enhance with AI if userPrompt provided (AI handles multilingual style detection)
|
||||
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
|
||||
if extractedContent:
|
||||
metadata = extractedContent.get("metadata", {})
|
||||
if isinstance(metadata, dict):
|
||||
styles = metadata.get("styles")
|
||||
if styles and isinstance(styles, dict):
|
||||
self.logger.debug("Using styles from document generation metadata")
|
||||
return self._validateStylesContrast(styles)
|
||||
|
||||
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
|
||||
if userPrompt and aiService:
|
||||
# AI will naturally detect style instructions in any language
|
||||
self.logger.info(f"Enhancing styles with AI based on user prompt...")
|
||||
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
|
||||
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
||||
return self._validateStylesContrast(enhancedStyleSet)
|
||||
else:
|
||||
|
|
@ -264,6 +287,10 @@ class RendererDocx(BaseRenderer):
|
|||
section_type = section.get("content_type", "paragraph")
|
||||
elements = section.get("elements", [])
|
||||
|
||||
# If no elements, skip this section (it has no content to render)
|
||||
if not elements:
|
||||
return
|
||||
|
||||
# Process each element in the section
|
||||
for element in elements:
|
||||
element_type = element.get("type", "")
|
||||
|
|
@ -286,22 +313,36 @@ class RendererDocx(BaseRenderer):
|
|||
para.add_run(f" (Source: {source})").italic = True
|
||||
continue
|
||||
|
||||
# Standard section types
|
||||
if section_type == "table":
|
||||
# Check element type, not section type (elements can have different types than section)
|
||||
if element_type == "table":
|
||||
self._renderJsonTable(doc, element, styles)
|
||||
elif section_type == "bullet_list":
|
||||
elif element_type == "bullet_list":
|
||||
self._renderJsonBulletList(doc, element, styles)
|
||||
elif section_type == "heading":
|
||||
elif element_type == "heading":
|
||||
self._renderJsonHeading(doc, element, styles)
|
||||
elif section_type == "paragraph":
|
||||
elif element_type == "paragraph":
|
||||
self._renderJsonParagraph(doc, element, styles)
|
||||
elif section_type == "code_block":
|
||||
elif element_type == "code_block":
|
||||
self._renderJsonCodeBlock(doc, element, styles)
|
||||
elif section_type == "image":
|
||||
elif element_type == "image":
|
||||
self._renderJsonImage(doc, element, styles)
|
||||
else:
|
||||
# Fallback to paragraph for unknown types
|
||||
self._renderJsonParagraph(doc, element, styles)
|
||||
# Fallback: if element_type not set, use section_type
|
||||
if section_type == "table":
|
||||
self._renderJsonTable(doc, element, styles)
|
||||
elif section_type == "bullet_list":
|
||||
self._renderJsonBulletList(doc, element, styles)
|
||||
elif section_type == "heading":
|
||||
self._renderJsonHeading(doc, element, styles)
|
||||
elif section_type == "paragraph":
|
||||
self._renderJsonParagraph(doc, element, styles)
|
||||
elif section_type == "code_block":
|
||||
self._renderJsonCodeBlock(doc, element, styles)
|
||||
elif section_type == "image":
|
||||
self._renderJsonImage(doc, element, styles)
|
||||
else:
|
||||
# Fallback to paragraph for unknown types
|
||||
self._renderJsonParagraph(doc, element, styles)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
|
||||
|
|
@ -311,8 +352,12 @@ class RendererDocx(BaseRenderer):
|
|||
def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||
"""Render a JSON table to DOCX using AI-generated styles."""
|
||||
try:
|
||||
headers = table_data.get("headers", [])
|
||||
rows = table_data.get("rows", [])
|
||||
# Extract from nested content structure
|
||||
content = table_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return
|
||||
headers = content.get("headers", [])
|
||||
rows = content.get("rows", [])
|
||||
|
||||
if not headers or not rows:
|
||||
return
|
||||
|
|
@ -467,7 +512,11 @@ class RendererDocx(BaseRenderer):
|
|||
def _renderJsonBulletList(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||
"""Render a JSON bullet list to DOCX using AI-generated styles."""
|
||||
try:
|
||||
items = list_data.get("items", [])
|
||||
# Extract from nested content structure
|
||||
content = list_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return
|
||||
items = content.get("items", [])
|
||||
bullet_style = styles["bullet_list"]
|
||||
|
||||
for item in items:
|
||||
|
|
@ -482,8 +531,12 @@ class RendererDocx(BaseRenderer):
|
|||
def _renderJsonHeading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||
"""Render a JSON heading to DOCX using AI-generated styles."""
|
||||
try:
|
||||
level = heading_data.get("level", 1)
|
||||
text = heading_data.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = heading_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return
|
||||
text = content.get("text", "")
|
||||
level = content.get("level", 1)
|
||||
|
||||
if text:
|
||||
level = max(1, min(6, level))
|
||||
|
|
@ -495,7 +548,25 @@ class RendererDocx(BaseRenderer):
|
|||
def _renderJsonParagraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||
"""Render a JSON paragraph to DOCX using AI-generated styles."""
|
||||
try:
|
||||
text = paragraph_data.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = paragraph_data.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
text = content
|
||||
else:
|
||||
text = ""
|
||||
|
||||
# CRITICAL: Prevent rendering base64 image data as text
|
||||
# Base64 image data typically starts with /9j/ (JPEG) or iVBORw0KGgo (PNG)
|
||||
if text and (text.startswith("/9j/") or text.startswith("iVBORw0KGgo") or
|
||||
(len(text) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in text[:100]))):
|
||||
# This looks like base64 data - don't render as text
|
||||
self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(text)})")
|
||||
para = doc.add_paragraph("[Error: Image data found in text content - image embedding may have failed]")
|
||||
if para.runs:
|
||||
para.runs[0].font.color.rgb = RGBColor(255, 0, 0) # Red color for error
|
||||
return
|
||||
|
||||
if text:
|
||||
para = doc.add_paragraph(text)
|
||||
|
|
@ -506,8 +577,12 @@ class RendererDocx(BaseRenderer):
|
|||
def _renderJsonCodeBlock(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||
"""Render a JSON code block to DOCX using AI-generated styles."""
|
||||
try:
|
||||
code = code_data.get("code", "")
|
||||
language = code_data.get("language", "")
|
||||
# Extract from nested content structure
|
||||
content = code_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return
|
||||
code = content.get("code", "")
|
||||
language = content.get("language", "")
|
||||
|
||||
if code:
|
||||
if language:
|
||||
|
|
@ -525,20 +600,33 @@ class RendererDocx(BaseRenderer):
|
|||
def _renderJsonImage(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||
"""Render a JSON image to DOCX."""
|
||||
try:
|
||||
base64_data = image_data.get("base64Data", "")
|
||||
alt_text = image_data.get("altText", "Image")
|
||||
# Extract from nested content structure
|
||||
content = image_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return
|
||||
base64_data = content.get("base64Data", "")
|
||||
alt_text = content.get("altText", "Image")
|
||||
|
||||
if base64_data:
|
||||
image_bytes = base64.b64decode(base64_data)
|
||||
doc.add_picture(io.BytesIO(image_bytes), width=Inches(4))
|
||||
|
||||
if alt_text:
|
||||
caption_para = doc.add_paragraph(f"Figure: {alt_text}")
|
||||
caption_para.runs[0].italic = True
|
||||
try:
|
||||
image_bytes = base64.b64decode(base64_data)
|
||||
doc.add_picture(io.BytesIO(image_bytes), width=Inches(4))
|
||||
|
||||
if alt_text:
|
||||
caption_para = doc.add_paragraph(f"Figure: {alt_text}")
|
||||
caption_para.runs[0].italic = True
|
||||
except Exception as embedError:
|
||||
# Image decoding or embedding failed
|
||||
raise Exception(f"Failed to decode or embed image: {str(embedError)}")
|
||||
else:
|
||||
raise Exception("No image data provided (base64Data is empty)")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error rendering image: {str(e)}")
|
||||
doc.add_paragraph(f"[Image: {image_data.get('altText', 'Image')}]")
|
||||
self.logger.error(f"Error embedding image in DOCX: {str(e)}")
|
||||
errorMsg = f"[Error: Could not embed image '{image_data.get('altText', 'Image')}'. {str(e)}]"
|
||||
errorPara = doc.add_paragraph(errorMsg)
|
||||
if errorPara.runs:
|
||||
errorPara.runs[0].font.color.rgb = RGBColor(255, 0, 0) # Red color for error
|
||||
|
||||
def _extractStructureFromPrompt(self, userPrompt: str, title: str) -> Dict[str, Any]:
|
||||
"""Extract document structure from user prompt."""
|
||||
|
|
|
|||
|
|
@ -55,12 +55,18 @@ class RendererHtml(BaseRenderer):
|
|||
else:
|
||||
htmlFilename = self._determineFilename(title, "text/html")
|
||||
|
||||
# Extract metadata for document type and other info
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
|
||||
# Start with HTML document
|
||||
resultDocuments = [
|
||||
RenderedDocument(
|
||||
documentData=htmlContent.encode('utf-8'),
|
||||
mimeType="text/html",
|
||||
filename=htmlFilename
|
||||
filename=htmlFilename,
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -90,8 +96,8 @@ class RendererHtml(BaseRenderer):
|
|||
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
|
||||
"""Generate HTML content from structured JSON document using AI-generated styling."""
|
||||
try:
|
||||
# Get style set: default styles, enhanced with AI if userPrompt provided
|
||||
styles = await self._getStyleSet(userPrompt, aiService)
|
||||
# Get style set: use styles from metadata if available, otherwise enhance with AI
|
||||
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
|
||||
|
||||
# Validate JSON structure
|
||||
if not self._validateJsonStructure(jsonContent):
|
||||
|
|
@ -148,12 +154,17 @@ class RendererHtml(BaseRenderer):
|
|||
self.logger.error(f"Error generating HTML from JSON: {str(e)}")
|
||||
raise Exception(f"HTML generation failed: {str(e)}")
|
||||
|
||||
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||
"""Get style set - default styles, enhanced with AI if userPrompt provided.
|
||||
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||
"""Get style set - use styles from document generation metadata if available,
|
||||
otherwise enhance default styles with AI if userPrompt provided.
|
||||
|
||||
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
|
||||
not be generated separately by renderers. Only fall back to AI if styles not provided.
|
||||
|
||||
Args:
|
||||
extractedContent: Document content with metadata (may contain styles)
|
||||
userPrompt: User's prompt (AI will detect style instructions in any language)
|
||||
aiService: AI service (used only if userPrompt provided)
|
||||
aiService: AI service (used only if styles not in metadata and userPrompt provided)
|
||||
templateName: Name of template style set (None = default)
|
||||
|
||||
Returns:
|
||||
|
|
@ -162,10 +173,18 @@ class RendererHtml(BaseRenderer):
|
|||
# Get default style set
|
||||
defaultStyleSet = self._getDefaultStyleSet()
|
||||
|
||||
# Enhance with AI if userPrompt provided (AI handles multilingual style detection)
|
||||
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
|
||||
if extractedContent:
|
||||
metadata = extractedContent.get("metadata", {})
|
||||
if isinstance(metadata, dict):
|
||||
styles = metadata.get("styles")
|
||||
if styles and isinstance(styles, dict):
|
||||
self.logger.debug("Using styles from document generation metadata")
|
||||
return self._validateStylesContrast(styles)
|
||||
|
||||
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
|
||||
if userPrompt and aiService:
|
||||
# AI will naturally detect style instructions in any language
|
||||
self.logger.info(f"Enhancing styles with AI based on user prompt...")
|
||||
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
|
||||
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
||||
return self._validateStylesContrast(enhancedStyleSet)
|
||||
else:
|
||||
|
|
@ -446,8 +465,12 @@ class RendererHtml(BaseRenderer):
|
|||
def _renderJsonTable(self, tableData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
||||
"""Render a JSON table to HTML using AI-generated styles."""
|
||||
try:
|
||||
headers = tableData.get("headers", [])
|
||||
rows = tableData.get("rows", [])
|
||||
# Extract from nested content structure
|
||||
content = tableData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
headers = content.get("headers", [])
|
||||
rows = content.get("rows", [])
|
||||
|
||||
if not headers or not rows:
|
||||
return ""
|
||||
|
|
@ -477,9 +500,13 @@ class RendererHtml(BaseRenderer):
|
|||
return ""
|
||||
|
||||
def _renderJsonBulletList(self, listData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
||||
"""Render a JSON bullet list to HTML using AI-generated styles."""
|
||||
"""Render a JSON bullet list to HTML using AI-generated styles. Expects nested content structure."""
|
||||
try:
|
||||
items = listData.get("items", [])
|
||||
# Extract from nested content structure
|
||||
content = listData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
items = content.get("items", [])
|
||||
|
||||
if not items:
|
||||
return ""
|
||||
|
|
@ -513,8 +540,12 @@ class RendererHtml(BaseRenderer):
|
|||
elif not isinstance(headingData, dict):
|
||||
return ""
|
||||
|
||||
level = headingData.get("level", 1)
|
||||
text = headingData.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = headingData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
text = content.get("text", "")
|
||||
level = content.get("level", 1)
|
||||
|
||||
if text:
|
||||
level = max(1, min(6, level))
|
||||
|
|
@ -531,11 +562,19 @@ class RendererHtml(BaseRenderer):
|
|||
try:
|
||||
# Normalize inputs - paragraphData is typically a list of elements from _getSectionData
|
||||
if isinstance(paragraphData, list):
|
||||
# Extract text from all paragraph elements
|
||||
# Extract text from all paragraph elements (expects nested content structure)
|
||||
texts = []
|
||||
for el in paragraphData:
|
||||
if isinstance(el, dict) and "text" in el:
|
||||
texts.append(el["text"])
|
||||
if isinstance(el, dict):
|
||||
content = el.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
text = content
|
||||
else:
|
||||
text = ""
|
||||
if text:
|
||||
texts.append(text)
|
||||
elif isinstance(el, str):
|
||||
texts.append(el)
|
||||
if texts:
|
||||
|
|
@ -545,7 +584,15 @@ class RendererHtml(BaseRenderer):
|
|||
elif isinstance(paragraphData, str):
|
||||
return f'<p>{paragraphData}</p>'
|
||||
elif isinstance(paragraphData, dict):
|
||||
text = paragraphData.get("text", "")
|
||||
# Handle nested content structure: element.content vs element.text
|
||||
# Extract from nested content structure
|
||||
content = paragraphData.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
text = content
|
||||
else:
|
||||
text = ""
|
||||
if text:
|
||||
return f'<p>{text}</p>'
|
||||
return ""
|
||||
|
|
@ -557,10 +604,14 @@ class RendererHtml(BaseRenderer):
|
|||
return ""
|
||||
|
||||
def _renderJsonCodeBlock(self, codeData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
||||
"""Render a JSON code block to HTML using AI-generated styles."""
|
||||
"""Render a JSON code block to HTML using AI-generated styles. Expects nested content structure."""
|
||||
try:
|
||||
code = codeData.get("code", "")
|
||||
language = codeData.get("language", "")
|
||||
# Extract from nested content structure
|
||||
content = codeData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
code = content.get("code", "")
|
||||
language = content.get("language", "")
|
||||
|
||||
if code:
|
||||
if language:
|
||||
|
|
@ -575,12 +626,16 @@ class RendererHtml(BaseRenderer):
|
|||
return ""
|
||||
|
||||
def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
||||
"""Render a JSON image to HTML with placeholder for later replacement."""
|
||||
"""Render a JSON image to HTML with placeholder for later replacement. Expects nested content structure."""
|
||||
try:
|
||||
import html
|
||||
base64Data = imageData.get("base64Data", "")
|
||||
altText = imageData.get("altText", "Image")
|
||||
caption = imageData.get("caption", "")
|
||||
# Extract from nested content structure
|
||||
content = imageData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
base64Data = content.get("base64Data", "")
|
||||
altText = content.get("altText", "Image")
|
||||
caption = content.get("caption", "")
|
||||
|
||||
# Escape HTML in altText and caption to prevent injection
|
||||
altTextEscaped = html.escape(str(altText))
|
||||
|
|
@ -600,8 +655,10 @@ class RendererHtml(BaseRenderer):
|
|||
return ""
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error rendering image: {str(e)}")
|
||||
return f'<div class="error">[Image: {imageData.get("altText", "Image")}]</div>'
|
||||
self.logger.error(f"Error embedding image in HTML: {str(e)}")
|
||||
altText = imageData.get("altText", "Image")
|
||||
errorMsg = html.escape(f"[Error: Could not embed image '{altText}'. {str(e)}]")
|
||||
return f'<div class="error" style="color: red; padding: 10px; border: 1px solid red;">{errorMsg}</div>'
|
||||
|
||||
def _extractImages(self, jsonContent: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
|
|
@ -626,12 +683,24 @@ class RendererHtml(BaseRenderer):
|
|||
if section.get("content_type") == "image":
|
||||
elements = section.get("elements", [])
|
||||
for element in elements:
|
||||
base64Data = element.get("base64Data", "")
|
||||
# Extract from nested content structure
|
||||
content = element.get("content", {})
|
||||
base64Data = ""
|
||||
|
||||
# If base64Data not found, try extracting from url data URI
|
||||
if isinstance(content, dict):
|
||||
base64Data = content.get("base64Data", "")
|
||||
elif isinstance(content, str):
|
||||
# Content might be base64 string directly (shouldn't happen)
|
||||
pass
|
||||
|
||||
# If base64Data not found in content, try direct element fields (fallback)
|
||||
if not base64Data:
|
||||
url = element.get("url", "")
|
||||
if url.startswith("data:image/"):
|
||||
base64Data = element.get("base64Data", "")
|
||||
|
||||
# If base64Data still not found, try extracting from url data URI
|
||||
if not base64Data:
|
||||
url = element.get("url", "") or (content.get("url", "") if isinstance(content, dict) else "")
|
||||
if url and isinstance(url, str) and url.startswith("data:image/"):
|
||||
# Extract base64 from data URI: data:image/png;base64,<base64>
|
||||
import re
|
||||
match = re.match(r'data:image/[^;]+;base64,(.+)', url)
|
||||
|
|
@ -642,7 +711,8 @@ class RendererHtml(BaseRenderer):
|
|||
sectionId = section.get("id", "unknown")
|
||||
|
||||
# Bestimme MIME-Type und Extension
|
||||
mimeType = element.get("mimeType", "image/png")
|
||||
mimeType = element.get("mimeType", "") or (content.get("mimeType", "") if isinstance(content, dict) else "")
|
||||
if not mimeType or mimeType == "unknown":
|
||||
if not mimeType or mimeType == "unknown":
|
||||
# Versuche MIME-Type aus base64 zu erkennen
|
||||
if base64Data.startswith("/9j/"):
|
||||
|
|
|
|||
|
|
@ -54,11 +54,17 @@ class RendererImage(BaseRenderer):
|
|||
else:
|
||||
imageBytes = imageContent
|
||||
|
||||
# Extract metadata for document type and other info
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=imageBytes,
|
||||
mimeType="image/png",
|
||||
filename=filename
|
||||
filename=filename,
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -43,11 +43,17 @@ class RendererJson(BaseRenderer):
|
|||
else:
|
||||
filename = self._determineFilename(title, "application/json")
|
||||
|
||||
# Extract metadata for document type and other info
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=jsonContent.encode('utf-8'),
|
||||
mimeType="application/json",
|
||||
filename=filename
|
||||
filename=filename,
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -60,11 +66,15 @@ class RendererJson(BaseRenderer):
|
|||
"metadata": {"error": str(e)}
|
||||
}
|
||||
fallbackContent = json.dumps(fallbackData, indent=2)
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=fallbackContent.encode('utf-8'),
|
||||
mimeType="application/json",
|
||||
filename=self._determineFilename(title, "application/json")
|
||||
filename=self._determineFilename(title, "application/json"),
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -41,11 +41,17 @@ class RendererMarkdown(BaseRenderer):
|
|||
else:
|
||||
filename = self._determineFilename(title, "text/markdown")
|
||||
|
||||
# Extract metadata for document type and other info
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=markdownContent.encode('utf-8'),
|
||||
mimeType="text/markdown",
|
||||
filename=filename
|
||||
filename=filename,
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -53,11 +59,15 @@ class RendererMarkdown(BaseRenderer):
|
|||
self.logger.error(f"Error rendering markdown: {str(e)}")
|
||||
# Return minimal markdown fallback
|
||||
fallbackContent = f"# {title}\n\nError rendering report: {str(e)}"
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=fallbackContent.encode('utf-8'),
|
||||
mimeType="text/markdown",
|
||||
filename=self._determineFilename(title, "text/markdown")
|
||||
filename=self._determineFilename(title, "text/markdown"),
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -164,8 +174,12 @@ class RendererMarkdown(BaseRenderer):
|
|||
def _renderJsonTable(self, tableData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON table to markdown."""
|
||||
try:
|
||||
headers = tableData.get("headers", [])
|
||||
rows = tableData.get("rows", [])
|
||||
# Extract from nested content structure
|
||||
content = tableData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
headers = content.get("headers", [])
|
||||
rows = content.get("rows", [])
|
||||
|
||||
if not headers or not rows:
|
||||
return ""
|
||||
|
|
@ -194,7 +208,11 @@ class RendererMarkdown(BaseRenderer):
|
|||
def _renderJsonBulletList(self, listData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON bullet list to markdown."""
|
||||
try:
|
||||
items = listData.get("items", [])
|
||||
# Extract from nested content structure
|
||||
content = listData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
items = content.get("items", [])
|
||||
|
||||
if not items:
|
||||
return ""
|
||||
|
|
@ -215,8 +233,12 @@ class RendererMarkdown(BaseRenderer):
|
|||
def _renderJsonHeading(self, headingData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON heading to markdown."""
|
||||
try:
|
||||
level = headingData.get("level", 1)
|
||||
text = headingData.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = headingData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
text = content.get("text", "")
|
||||
level = content.get("level", 1)
|
||||
|
||||
if text:
|
||||
level = max(1, min(6, level))
|
||||
|
|
@ -231,7 +253,14 @@ class RendererMarkdown(BaseRenderer):
|
|||
def _renderJsonParagraph(self, paragraphData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON paragraph to markdown."""
|
||||
try:
|
||||
text = paragraphData.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = paragraphData.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
text = content
|
||||
else:
|
||||
text = ""
|
||||
return text if text else ""
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -241,8 +270,12 @@ class RendererMarkdown(BaseRenderer):
|
|||
def _renderJsonCodeBlock(self, codeData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON code block to markdown."""
|
||||
try:
|
||||
code = codeData.get("code", "")
|
||||
language = codeData.get("language", "")
|
||||
# Extract from nested content structure
|
||||
content = codeData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
code = content.get("code", "")
|
||||
language = content.get("language", "")
|
||||
|
||||
if code:
|
||||
if language:
|
||||
|
|
@ -259,8 +292,12 @@ class RendererMarkdown(BaseRenderer):
|
|||
def _renderJsonImage(self, imageData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON image to markdown."""
|
||||
try:
|
||||
altText = imageData.get("altText", "Image")
|
||||
base64Data = imageData.get("base64Data", "")
|
||||
# Extract from nested content structure
|
||||
content = imageData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
altText = content.get("altText", "Image")
|
||||
base64Data = content.get("base64Data", "")
|
||||
|
||||
if base64Data:
|
||||
# For base64 images, we can't embed them directly in markdown
|
||||
|
|
|
|||
|
|
@ -51,6 +51,10 @@ class RendererPdf(BaseRenderer):
|
|||
# Generate PDF using AI-analyzed styling
|
||||
pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService)
|
||||
|
||||
# Extract metadata for document type and other info
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
|
||||
# Determine filename from document or title
|
||||
documents = extractedContent.get("documents", [])
|
||||
if documents and isinstance(documents[0], dict):
|
||||
|
|
@ -74,7 +78,9 @@ class RendererPdf(BaseRenderer):
|
|||
RenderedDocument(
|
||||
documentData=pdf_bytes,
|
||||
mimeType="application/pdf",
|
||||
filename=filename
|
||||
filename=filename,
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -93,8 +99,8 @@ class RendererPdf(BaseRenderer):
|
|||
async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
|
||||
"""Generate PDF content from structured JSON document using AI-generated styling."""
|
||||
try:
|
||||
# Get style set: default styles, enhanced with AI if userPrompt provided
|
||||
styles = await self._getStyleSet(userPrompt, aiService)
|
||||
# Get style set: use styles from metadata if available, otherwise enhance with AI
|
||||
styles = await self._getStyleSet(json_content, userPrompt, aiService)
|
||||
|
||||
# Validate JSON structure
|
||||
if not self._validateJsonStructure(json_content):
|
||||
|
|
@ -157,12 +163,17 @@ class RendererPdf(BaseRenderer):
|
|||
self.logger.error(f"Error generating PDF from JSON: {str(e)}")
|
||||
raise Exception(f"PDF generation failed: {str(e)}")
|
||||
|
||||
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||
"""Get style set - default styles, enhanced with AI if userPrompt provided.
|
||||
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||
"""Get style set - use styles from document generation metadata if available,
|
||||
otherwise enhance default styles with AI if userPrompt provided.
|
||||
|
||||
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
|
||||
not be generated separately by renderers. Only fall back to AI if styles not provided.
|
||||
|
||||
Args:
|
||||
extractedContent: Document content with metadata (may contain styles)
|
||||
userPrompt: User's prompt (AI will detect style instructions in any language)
|
||||
aiService: AI service (used only if userPrompt provided)
|
||||
aiService: AI service (used only if styles not in metadata and userPrompt provided)
|
||||
templateName: Name of template style set (None = default)
|
||||
|
||||
Returns:
|
||||
|
|
@ -171,10 +182,19 @@ class RendererPdf(BaseRenderer):
|
|||
# Get default style set
|
||||
defaultStyleSet = self._getDefaultStyleSet()
|
||||
|
||||
# Enhance with AI if userPrompt provided (AI handles multilingual style detection)
|
||||
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
|
||||
if extractedContent:
|
||||
metadata = extractedContent.get("metadata", {})
|
||||
if isinstance(metadata, dict):
|
||||
styles = metadata.get("styles")
|
||||
if styles and isinstance(styles, dict):
|
||||
self.logger.debug("Using styles from document generation metadata")
|
||||
enhancedStyleSet = self._convertColorsFormat(styles)
|
||||
return self._validateStylesContrast(enhancedStyleSet)
|
||||
|
||||
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
|
||||
if userPrompt and aiService:
|
||||
# AI will naturally detect style instructions in any language
|
||||
self.logger.info(f"Enhancing styles with AI based on user prompt...")
|
||||
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
|
||||
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
||||
# Convert colors to PDF format after getting styles
|
||||
enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet)
|
||||
|
|
@ -545,22 +565,36 @@ class RendererPdf(BaseRenderer):
|
|||
all_elements.append(Spacer(1, 6))
|
||||
continue
|
||||
|
||||
# Standard section types
|
||||
if section_type == "table":
|
||||
# Check element type, not section type (elements can have different types than section)
|
||||
if element_type == "table":
|
||||
all_elements.extend(self._renderJsonTable(element, styles))
|
||||
elif section_type == "bullet_list":
|
||||
elif element_type == "bullet_list":
|
||||
all_elements.extend(self._renderJsonBulletList(element, styles))
|
||||
elif section_type == "heading":
|
||||
elif element_type == "heading":
|
||||
all_elements.extend(self._renderJsonHeading(element, styles))
|
||||
elif section_type == "paragraph":
|
||||
elif element_type == "paragraph":
|
||||
all_elements.extend(self._renderJsonParagraph(element, styles))
|
||||
elif section_type == "code_block":
|
||||
elif element_type == "code_block":
|
||||
all_elements.extend(self._renderJsonCodeBlock(element, styles))
|
||||
elif section_type == "image":
|
||||
elif element_type == "image":
|
||||
all_elements.extend(self._renderJsonImage(element, styles))
|
||||
else:
|
||||
# Fallback to paragraph for unknown types
|
||||
all_elements.extend(self._renderJsonParagraph(element, styles))
|
||||
# Fallback: if element_type not set, use section_type as fallback
|
||||
if section_type == "table":
|
||||
all_elements.extend(self._renderJsonTable(element, styles))
|
||||
elif section_type == "bullet_list":
|
||||
all_elements.extend(self._renderJsonBulletList(element, styles))
|
||||
elif section_type == "heading":
|
||||
all_elements.extend(self._renderJsonHeading(element, styles))
|
||||
elif section_type == "paragraph":
|
||||
all_elements.extend(self._renderJsonParagraph(element, styles))
|
||||
elif section_type == "code_block":
|
||||
all_elements.extend(self._renderJsonCodeBlock(element, styles))
|
||||
elif section_type == "image":
|
||||
all_elements.extend(self._renderJsonImage(element, styles))
|
||||
else:
|
||||
# Final fallback to paragraph for unknown types
|
||||
all_elements.extend(self._renderJsonParagraph(element, styles))
|
||||
|
||||
return all_elements
|
||||
|
||||
|
|
@ -571,8 +605,13 @@ class RendererPdf(BaseRenderer):
|
|||
def _renderJsonTable(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||
"""Render a JSON table to PDF elements using AI-generated styles."""
|
||||
try:
|
||||
headers = table_data.get("headers", [])
|
||||
rows = table_data.get("rows", [])
|
||||
# Handle nested content structure: element.content.headers vs element.headers
|
||||
# Extract from nested content structure
|
||||
content = table_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return []
|
||||
headers = content.get("headers", [])
|
||||
rows = content.get("rows", [])
|
||||
|
||||
if not headers or not rows:
|
||||
return []
|
||||
|
|
@ -588,13 +627,13 @@ class RendererPdf(BaseRenderer):
|
|||
table_cell_style = styles.get("table_cell", {})
|
||||
|
||||
table_style = [
|
||||
('BACKGROUND', (0, 0), (-1, 0), self._hex_to_color(table_header_style.get("background", "#4F4F4F"))),
|
||||
('TEXTCOLOR', (0, 0), (-1, 0), self._hex_to_color(table_header_style.get("text_color", "#FFFFFF"))),
|
||||
('BACKGROUND', (0, 0), (-1, 0), self._hexToColor(table_header_style.get("background", "#4F4F4F"))),
|
||||
('TEXTCOLOR', (0, 0), (-1, 0), self._hexToColor(table_header_style.get("text_color", "#FFFFFF"))),
|
||||
('ALIGN', (0, 0), (-1, -1), self._getTableAlignment(table_cell_style.get("align", "left"))),
|
||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold' if table_header_style.get("bold", True) else 'Helvetica'),
|
||||
('FONTSIZE', (0, 0), (-1, 0), table_header_style.get("font_size", 12)),
|
||||
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
|
||||
('BACKGROUND', (0, 1), (-1, -1), self._hex_to_color(table_cell_style.get("background", "#FFFFFF"))),
|
||||
('BACKGROUND', (0, 1), (-1, -1), self._hexToColor(table_cell_style.get("background", "#FFFFFF"))),
|
||||
('FONTSIZE', (0, 1), (-1, -1), table_cell_style.get("font_size", 10)),
|
||||
('GRID', (0, 0), (-1, -1), 1, colors.black)
|
||||
]
|
||||
|
|
@ -610,7 +649,11 @@ class RendererPdf(BaseRenderer):
|
|||
def _renderJsonBulletList(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||
"""Render a JSON bullet list to PDF elements using AI-generated styles."""
|
||||
try:
|
||||
items = list_data.get("items", [])
|
||||
# Extract from nested content structure
|
||||
content = list_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return []
|
||||
items = content.get("items", [])
|
||||
bullet_style_def = styles.get("bullet_list", {})
|
||||
|
||||
elements = []
|
||||
|
|
@ -632,8 +675,12 @@ class RendererPdf(BaseRenderer):
|
|||
def _renderJsonHeading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||
"""Render a JSON heading to PDF elements using AI-generated styles."""
|
||||
try:
|
||||
level = heading_data.get("level", 1)
|
||||
text = heading_data.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = heading_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return []
|
||||
text = content.get("text", "")
|
||||
level = content.get("level", 1)
|
||||
|
||||
if text:
|
||||
level = max(1, min(6, level))
|
||||
|
|
@ -649,7 +696,14 @@ class RendererPdf(BaseRenderer):
|
|||
def _renderJsonParagraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||
"""Render a JSON paragraph to PDF elements using AI-generated styles."""
|
||||
try:
|
||||
text = paragraph_data.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = paragraph_data.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
text = content
|
||||
else:
|
||||
text = ""
|
||||
|
||||
if text:
|
||||
return [Paragraph(text, self._createNormalStyle(styles))]
|
||||
|
|
@ -663,8 +717,12 @@ class RendererPdf(BaseRenderer):
|
|||
def _renderJsonCodeBlock(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||
"""Render a JSON code block to PDF elements using AI-generated styles."""
|
||||
try:
|
||||
code = code_data.get("code", "")
|
||||
language = code_data.get("language", "")
|
||||
# Extract from nested content structure
|
||||
content = code_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return []
|
||||
code = content.get("code", "")
|
||||
language = content.get("language", "")
|
||||
code_style_def = styles.get("code_block", {})
|
||||
|
||||
if code:
|
||||
|
|
@ -700,14 +758,34 @@ class RendererPdf(BaseRenderer):
|
|||
def _renderJsonImage(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
|
||||
"""Render a JSON image to PDF elements using reportlab."""
|
||||
try:
|
||||
base64_data = image_data.get("base64Data", "")
|
||||
alt_text = image_data.get("altText", "Image")
|
||||
caption = image_data.get("caption", "")
|
||||
# Extract from nested content structure
|
||||
content = image_data.get("content", {})
|
||||
base64_data = ""
|
||||
alt_text = "Image"
|
||||
caption = ""
|
||||
|
||||
# If base64Data not found, try extracting from url data URI
|
||||
if isinstance(content, dict):
|
||||
# Nested content structure
|
||||
base64_data = content.get("base64Data", "")
|
||||
alt_text = content.get("altText", "Image")
|
||||
caption = content.get("caption", "")
|
||||
elif isinstance(content, str):
|
||||
# Content might be base64 string directly (shouldn't happen, but handle it)
|
||||
self.logger.warning("Image content is a string, not a dict. This should not happen.")
|
||||
return [Paragraph(f"[Image: Invalid format]", self._createNormalStyle(styles))]
|
||||
|
||||
# If base64Data not found in content, try direct element fields (fallback)
|
||||
if not base64_data:
|
||||
url = image_data.get("url", "")
|
||||
if url.startswith("data:image/"):
|
||||
base64_data = image_data.get("base64Data", "")
|
||||
if not alt_text or alt_text == "Image":
|
||||
alt_text = image_data.get("altText", "Image")
|
||||
if not caption:
|
||||
caption = image_data.get("caption", "")
|
||||
|
||||
# If base64Data still not found, try extracting from url data URI
|
||||
if not base64_data:
|
||||
url = image_data.get("url", "") or (content.get("url", "") if isinstance(content, dict) else "")
|
||||
if url and isinstance(url, str) and url.startswith("data:image/"):
|
||||
# Extract base64 from data URI: data:image/png;base64,<base64>
|
||||
import re
|
||||
match = re.match(r'data:image/[^;]+;base64,(.+)', url)
|
||||
|
|
@ -715,8 +793,18 @@ class RendererPdf(BaseRenderer):
|
|||
base64_data = match.group(1)
|
||||
|
||||
if not base64_data:
|
||||
self.logger.warning(f"No base64 data found for image. Alt text: {alt_text}")
|
||||
return [Paragraph(f"[Image: {alt_text}]", self._createNormalStyle(styles))]
|
||||
|
||||
# Validate that base64_data is actually base64 (not the entire element rendered as text)
|
||||
if len(base64_data) > 10000: # Very long string might be entire element JSON
|
||||
self.logger.warning(f"Base64 data seems too long ({len(base64_data)} chars), might be incorrectly extracted")
|
||||
|
||||
# Ensure base64_data is a string, not bytes or other type
|
||||
if not isinstance(base64_data, str):
|
||||
self.logger.warning(f"Base64 data is not a string: {type(base64_data)}")
|
||||
return [Paragraph(f"[Image: {alt_text} - Invalid data type]", self._createNormalStyle(styles))]
|
||||
|
||||
try:
|
||||
from reportlab.platypus import Image as ReportLabImage
|
||||
from reportlab.lib.units import inch
|
||||
|
|
@ -731,25 +819,61 @@ class RendererPdf(BaseRenderer):
|
|||
# Try to get image dimensions from PIL
|
||||
try:
|
||||
from PIL import Image as PILImage
|
||||
pilImage = PILImage.open(imageStream)
|
||||
imgWidth, imgHeight = pilImage.size
|
||||
from reportlab.lib.pagesizes import A4
|
||||
|
||||
# Scale to fit page (max width 6 inches, maintain aspect ratio)
|
||||
maxWidth = 6 * inch
|
||||
if imgWidth > maxWidth:
|
||||
scale = maxWidth / imgWidth
|
||||
imgWidth = maxWidth
|
||||
pilImage = PILImage.open(imageStream)
|
||||
originalWidth, originalHeight = pilImage.size
|
||||
|
||||
# Calculate available page dimensions (A4 with margins: 72pt left/right, 72pt top, 18pt bottom)
|
||||
pageWidth = A4[0] # 595.27 points
|
||||
pageHeight = A4[1] # 841.89 points
|
||||
leftMargin = 72
|
||||
rightMargin = 72
|
||||
topMargin = 72
|
||||
bottomMargin = 18
|
||||
|
||||
# Use actual frame dimensions from SimpleDocTemplate
|
||||
# Frame is smaller than page minus margins due to internal spacing
|
||||
# From error message: frame is 439.27559055118115 x 739.8897637795277
|
||||
# Use conservative values with safety margin
|
||||
availableWidth = 430.0 # Slightly smaller than frame width for safety
|
||||
availableHeight = 730.0 # Slightly smaller than frame height for safety
|
||||
|
||||
# Convert original image size from pixels to points (assuming 72 DPI)
|
||||
# If image DPI is different, PIL will provide correct size
|
||||
# For safety, use a conservative conversion
|
||||
imgWidthPoints = originalWidth * (inch / 72) # Convert to inches, then to points
|
||||
imgHeightPoints = originalHeight * (inch / 72)
|
||||
|
||||
# Scale to fit within available page dimensions while maintaining aspect ratio
|
||||
widthScale = availableWidth / imgWidthPoints if imgWidthPoints > 0 else 1.0
|
||||
heightScale = availableHeight / imgHeightPoints if imgHeightPoints > 0 else 1.0
|
||||
|
||||
# Use the smaller scale to ensure image fits both width and height
|
||||
scale = min(widthScale, heightScale, 1.0) # Don't scale up, only down
|
||||
|
||||
imgWidth = imgWidthPoints * scale
|
||||
imgHeight = imgHeightPoints * scale
|
||||
|
||||
# Additional safety check: ensure dimensions don't exceed available space
|
||||
if imgWidth > availableWidth:
|
||||
scale = availableWidth / imgWidth
|
||||
imgWidth = availableWidth
|
||||
imgHeight = imgHeight * scale
|
||||
else:
|
||||
imgWidth = imgWidth * (inch / 72) # Convert pixels to inches (assuming 72 DPI)
|
||||
imgHeight = imgHeight * (inch / 72)
|
||||
|
||||
if imgHeight > availableHeight:
|
||||
scale = availableHeight / imgHeight
|
||||
imgHeight = availableHeight
|
||||
imgWidth = imgWidth * scale
|
||||
|
||||
# Reset stream for reportlab
|
||||
imageStream.seek(0)
|
||||
except Exception:
|
||||
# Fallback: use default size
|
||||
imgWidth = 4 * inch
|
||||
imgHeight = 3 * inch
|
||||
except Exception as e:
|
||||
# Fallback: use default size that fits page
|
||||
self.logger.warning(f"Error calculating image size: {str(e)}, using safe default")
|
||||
# Use 80% of available width as safe default
|
||||
imgWidth = 4 * inch # ~288 points, safe for ~451pt available width
|
||||
imgHeight = 3 * inch # ~216 points, safe for ~751pt available height
|
||||
imageStream.seek(0)
|
||||
|
||||
# Create reportlab Image
|
||||
|
|
@ -773,10 +897,16 @@ class RendererPdf(BaseRenderer):
|
|||
return elements
|
||||
|
||||
except Exception as imgError:
|
||||
self.logger.warning(f"Error embedding image in PDF: {str(imgError)}")
|
||||
# Fallback to placeholder
|
||||
return [Paragraph(f"[Image: {alt_text}]", self._createNormalStyle(styles))]
|
||||
self.logger.error(f"Error embedding image in PDF: {str(imgError)}")
|
||||
# Return error message instead of placeholder
|
||||
errorStyle = self._createNormalStyle(styles)
|
||||
errorStyle.textColor = self._hexToColor("#FF0000") # Red color for error
|
||||
errorMsg = f"[Error: Could not embed image '{alt_text}'. {str(imgError)}]"
|
||||
return [Paragraph(errorMsg, errorStyle)]
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error rendering image: {str(e)}")
|
||||
return [Paragraph(f"[Image: {image_data.get('altText', 'Image')}]", self._createNormalStyle(styles))]
|
||||
self.logger.error(f"Error rendering image: {str(e)}")
|
||||
errorStyle = self._createNormalStyle(styles)
|
||||
errorStyle.textColor = self._hexToColor("#FF0000") # Red color for error
|
||||
errorMsg = f"[Error: Could not render image '{image_data.get('altText', 'Image')}'. {str(e)}]"
|
||||
return [Paragraph(errorMsg, errorStyle)]
|
||||
|
|
@ -48,8 +48,8 @@ class RendererPptx(BaseRenderer):
|
|||
from pptx.dml.color import RGBColor
|
||||
import re
|
||||
|
||||
# Get style set: default styles, enhanced with AI if userPrompt provided
|
||||
styles = await self._getStyleSet(userPrompt, aiService)
|
||||
# Get style set: use styles from metadata if available, otherwise enhance with AI
|
||||
styles = await self._getStyleSet(extractedContent, userPrompt, aiService)
|
||||
|
||||
# Create new presentation
|
||||
prs = Presentation()
|
||||
|
|
@ -99,7 +99,7 @@ class RendererPptx(BaseRenderer):
|
|||
if title_shape.text_frame.paragraphs[0].font:
|
||||
title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 44))
|
||||
title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True)
|
||||
title_color = self._get_safe_color(title_style.get("color", (31, 78, 121)))
|
||||
title_color = self._getSafeColor(title_style.get("color", (31, 78, 121)))
|
||||
title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)
|
||||
|
||||
# Handle images first (if present)
|
||||
|
|
@ -133,7 +133,7 @@ class RendererPptx(BaseRenderer):
|
|||
heading_style = styles.get("heading", {})
|
||||
p.font.size = Pt(heading_style.get("font_size", 32))
|
||||
p.font.bold = heading_style.get("bold", True)
|
||||
heading_color = self._get_safe_color(heading_style.get("color", (47, 47, 47)))
|
||||
heading_color = self._getSafeColor(heading_style.get("color", (47, 47, 47)))
|
||||
p.font.color.rgb = RGBColor(*heading_color)
|
||||
elif paragraph.startswith('##'):
|
||||
# Subheader
|
||||
|
|
@ -141,7 +141,7 @@ class RendererPptx(BaseRenderer):
|
|||
subheading_style = styles.get("subheading", {})
|
||||
p.font.size = Pt(subheading_style.get("font_size", 24))
|
||||
p.font.bold = subheading_style.get("bold", True)
|
||||
subheading_color = self._get_safe_color(subheading_style.get("color", (79, 79, 79)))
|
||||
subheading_color = self._getSafeColor(subheading_style.get("color", (79, 79, 79)))
|
||||
p.font.color.rgb = RGBColor(*subheading_color)
|
||||
elif paragraph.startswith('*') and paragraph.endswith('*'):
|
||||
# Bold text
|
||||
|
|
@ -149,14 +149,14 @@ class RendererPptx(BaseRenderer):
|
|||
paragraph_style = styles.get("paragraph", {})
|
||||
p.font.size = Pt(paragraph_style.get("font_size", 18))
|
||||
p.font.bold = True
|
||||
paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
|
||||
paragraph_color = self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))
|
||||
p.font.color.rgb = RGBColor(*paragraph_color)
|
||||
else:
|
||||
# Regular text
|
||||
paragraph_style = styles.get("paragraph", {})
|
||||
p.font.size = Pt(paragraph_style.get("font_size", 18))
|
||||
p.font.bold = paragraph_style.get("bold", False)
|
||||
paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
|
||||
paragraph_color = self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))
|
||||
p.font.color.rgb = RGBColor(*paragraph_color)
|
||||
|
||||
# Apply alignment
|
||||
|
|
@ -181,7 +181,7 @@ class RendererPptx(BaseRenderer):
|
|||
if title_shape.text_frame.paragraphs[0].font:
|
||||
title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 48))
|
||||
title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True)
|
||||
title_color = self._get_safe_color(title_style.get("color", (31, 78, 121)))
|
||||
title_color = self._getSafeColor(title_style.get("color", (31, 78, 121)))
|
||||
title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)
|
||||
|
||||
subtitle_shape = slide.placeholders[1]
|
||||
|
|
@ -215,32 +215,46 @@ class RendererPptx(BaseRenderer):
|
|||
else:
|
||||
filename = self._determineFilename(title, "application/vnd.openxmlformats-officedocument.presentationml.presentation")
|
||||
|
||||
# Extract metadata for document type and other info
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=pptx_bytes,
|
||||
mimeType="application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
filename=filename
|
||||
filename=filename,
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
except ImportError:
|
||||
logger.error("python-pptx library not installed. Install with: pip install python-pptx")
|
||||
fallbackContent = "python-pptx library not installed"
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=fallbackContent.encode('utf-8'),
|
||||
mimeType="text/plain",
|
||||
filename=self._determineFilename(title, "text/plain")
|
||||
filename=self._determineFilename(title, "text/plain"),
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
except Exception as e:
|
||||
logger.error(f"Error rendering PowerPoint presentation: {str(e)}")
|
||||
fallbackContent = f"Error rendering PowerPoint presentation: {str(e)}"
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=fallbackContent.encode('utf-8'),
|
||||
mimeType="text/plain",
|
||||
filename=self._determineFilename(title, "text/plain")
|
||||
filename=self._determineFilename(title, "text/plain"),
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -349,12 +363,17 @@ class RendererPptx(BaseRenderer):
|
|||
"""Get MIME type for rendered output."""
|
||||
return self.outputMimeType
|
||||
|
||||
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||
"""Get style set - default styles, enhanced with AI if userPrompt provided.
|
||||
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||
"""Get style set - use styles from document generation metadata if available,
|
||||
otherwise enhance default styles with AI if userPrompt provided.
|
||||
|
||||
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
|
||||
not be generated separately by renderers. Only fall back to AI if styles not provided.
|
||||
|
||||
Args:
|
||||
extractedContent: Document content with metadata (may contain styles)
|
||||
userPrompt: User's prompt (AI will detect style instructions in any language)
|
||||
aiService: AI service (used only if userPrompt provided)
|
||||
aiService: AI service (used only if styles not in metadata and userPrompt provided)
|
||||
templateName: Name of template style set (None = default)
|
||||
|
||||
Returns:
|
||||
|
|
@ -363,10 +382,19 @@ class RendererPptx(BaseRenderer):
|
|||
# Get default style set
|
||||
defaultStyleSet = self._getDefaultStyleSet()
|
||||
|
||||
# Enhance with AI if userPrompt provided (AI handles multilingual style detection)
|
||||
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
|
||||
if extractedContent:
|
||||
metadata = extractedContent.get("metadata", {})
|
||||
if isinstance(metadata, dict):
|
||||
styles = metadata.get("styles")
|
||||
if styles and isinstance(styles, dict):
|
||||
self.logger.debug("Using styles from document generation metadata")
|
||||
enhancedStyleSet = self._convertColorsFormat(styles)
|
||||
return self._validateStylesReadability(enhancedStyleSet)
|
||||
|
||||
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
|
||||
if userPrompt and aiService:
|
||||
# AI will naturally detect style instructions in any language
|
||||
self.logger.info(f"Enhancing styles with AI based on user prompt...")
|
||||
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
|
||||
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
||||
# Convert colors to PPTX format after getting styles
|
||||
enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet)
|
||||
|
|
@ -690,15 +718,28 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
|
||||
# Handle image sections specially
|
||||
if content_type == "image":
|
||||
# Extract image data
|
||||
# Extract image data from nested content structure
|
||||
images = []
|
||||
for element in elements:
|
||||
if element.get("base64Data"):
|
||||
images.append({
|
||||
"base64Data": element.get("base64Data"),
|
||||
"altText": element.get("altText", "Image"),
|
||||
"caption": element.get("caption")
|
||||
})
|
||||
if isinstance(element, dict):
|
||||
# Extract from nested content structure
|
||||
content = element.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
base64Data = content.get("base64Data")
|
||||
altText = content.get("altText", "Image")
|
||||
caption = content.get("caption", "")
|
||||
else:
|
||||
# Fallback to direct element fields
|
||||
base64Data = element.get("base64Data")
|
||||
altText = element.get("altText", "Image")
|
||||
caption = element.get("caption", "")
|
||||
|
||||
if base64Data:
|
||||
images.append({
|
||||
"base64Data": base64Data,
|
||||
"altText": altText,
|
||||
"caption": caption
|
||||
})
|
||||
|
||||
return {
|
||||
"title": section_title or (elements[0].get("altText", "Image") if elements else "Image"),
|
||||
|
|
@ -719,7 +760,7 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
elif content_type == "code":
|
||||
content_parts.append(self._formatCodeForSlide(elements))
|
||||
else:
|
||||
content_parts.append(self._format_paragraph_for_slide(elements))
|
||||
content_parts.append(self._formatParagraphForSlide(elements))
|
||||
|
||||
# Combine content parts
|
||||
slide_content = "\n\n".join(filter(None, content_parts))
|
||||
|
|
@ -734,17 +775,20 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
logger.warning(f"Error creating slide from section: {str(e)}")
|
||||
return None
|
||||
|
||||
def _formatTableForSlide(self, elements: List[Dict[str, Any]]) -> str:
|
||||
def _formatTableForSlide(self, element: Dict[str, Any]) -> str:
|
||||
"""Format table data for slide presentation."""
|
||||
try:
|
||||
# Extract table data from elements array
|
||||
headers = []
|
||||
rows = []
|
||||
for element in elements:
|
||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||
headers = element.get("headers", [])
|
||||
rows = element.get("rows", [])
|
||||
break
|
||||
# Extract table data from element - handle nested content structure
|
||||
if not isinstance(element, dict):
|
||||
return ""
|
||||
|
||||
# Extract from nested content structure
|
||||
content = element.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
|
||||
headers = content.get("headers", [])
|
||||
rows = content.get("rows", [])
|
||||
|
||||
if not headers:
|
||||
return ""
|
||||
|
|
@ -778,7 +822,11 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
def _formatListForSlide(self, list_data: Dict[str, Any]) -> str:
|
||||
"""Format list data for slide presentation."""
|
||||
try:
|
||||
items = list_data.get("items", [])
|
||||
# Extract from nested content structure
|
||||
content = list_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
items = content.get("items", [])
|
||||
|
||||
if not items:
|
||||
return ""
|
||||
|
|
@ -810,8 +858,12 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
def _formatHeadingForSlide(self, heading_data: Dict[str, Any]) -> str:
|
||||
"""Format heading data for slide presentation."""
|
||||
try:
|
||||
text = heading_data.get("text", "")
|
||||
level = heading_data.get("level", 1)
|
||||
# Extract from nested content structure
|
||||
content = heading_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
text = content.get("text", "")
|
||||
level = content.get("level", 1)
|
||||
|
||||
if text:
|
||||
return f"{'#' * level} {text}"
|
||||
|
|
@ -825,7 +877,14 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
def _formatParagraphForSlide(self, paragraph_data: Dict[str, Any]) -> str:
|
||||
"""Format paragraph data for slide presentation."""
|
||||
try:
|
||||
text = paragraph_data.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = paragraph_data.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
text = content
|
||||
else:
|
||||
text = ""
|
||||
|
||||
if text:
|
||||
# Limit paragraph length based on content density
|
||||
|
|
@ -844,8 +903,12 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
def _formatCodeForSlide(self, code_data: Dict[str, Any]) -> str:
|
||||
"""Format code data for slide presentation."""
|
||||
try:
|
||||
code = code_data.get("code", "")
|
||||
language = code_data.get("language", "")
|
||||
# Extract from nested content structure
|
||||
content = code_data.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
code = content.get("code", "")
|
||||
language = content.get("language", "")
|
||||
|
||||
if code:
|
||||
# Limit code length based on content density
|
||||
|
|
@ -912,6 +975,10 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
section_type = section.get("content_type", "paragraph")
|
||||
elements = section.get("elements", [])
|
||||
|
||||
# Skip sections with no elements (unless they're headings that should create new slides)
|
||||
if not elements and section_type != "heading":
|
||||
continue
|
||||
|
||||
if section_type == "heading":
|
||||
# If we have accumulated content, create a slide
|
||||
if current_slide_content:
|
||||
|
|
@ -923,10 +990,26 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
current_slide_content = []
|
||||
|
||||
# Start new slide with heading as title
|
||||
heading_found = False
|
||||
for element in elements:
|
||||
if isinstance(element, dict) and "text" in element:
|
||||
current_slide_title = element.get("text", "Untitled Section")
|
||||
break
|
||||
if isinstance(element, dict):
|
||||
# Extract from nested content structure
|
||||
content = element.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
heading_text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
heading_text = content
|
||||
else:
|
||||
heading_text = ""
|
||||
|
||||
if heading_text:
|
||||
current_slide_title = heading_text
|
||||
heading_found = True
|
||||
break
|
||||
|
||||
# If no heading text found but this is a heading section, use section ID or default
|
||||
if not heading_found:
|
||||
current_slide_title = section.get("id", "Untitled Section")
|
||||
elif section_type == "image":
|
||||
# Create separate slide for image
|
||||
if current_slide_content:
|
||||
|
|
@ -940,12 +1023,25 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
# Extract image data
|
||||
imageData = []
|
||||
for element in elements:
|
||||
if element.get("base64Data"):
|
||||
imageData.append({
|
||||
"base64Data": element.get("base64Data"),
|
||||
"altText": element.get("altText", "Image"),
|
||||
"caption": element.get("caption")
|
||||
})
|
||||
if isinstance(element, dict):
|
||||
# Extract from nested content structure
|
||||
content = element.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
base64Data = content.get("base64Data")
|
||||
altText = content.get("altText", "Image")
|
||||
caption = content.get("caption", "")
|
||||
else:
|
||||
# Fallback to direct element fields
|
||||
base64Data = element.get("base64Data")
|
||||
altText = element.get("altText", "Image")
|
||||
caption = element.get("caption", "")
|
||||
|
||||
if base64Data:
|
||||
imageData.append({
|
||||
"base64Data": base64Data,
|
||||
"altText": altText,
|
||||
"caption": caption
|
||||
})
|
||||
|
||||
slides.append({
|
||||
"title": section.get("title") or (imageData[0].get("altText", "Image") if imageData else "Image"),
|
||||
|
|
@ -986,17 +1082,17 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
content_parts = []
|
||||
for element in elements:
|
||||
if content_type == "table":
|
||||
content_parts.append(self._formatTableForSlide([element]))
|
||||
elif content_type == "list":
|
||||
content_parts.append(self._formatListForSlide([element]))
|
||||
content_parts.append(self._formatTableForSlide(element))
|
||||
elif content_type == "bullet_list" or content_type == "list":
|
||||
content_parts.append(self._formatListForSlide(element))
|
||||
elif content_type == "heading":
|
||||
content_parts.append(self._formatHeadingForSlide([element]))
|
||||
content_parts.append(self._formatHeadingForSlide(element))
|
||||
elif content_type == "paragraph":
|
||||
content_parts.append(self._formatParagraphForSlide([element]))
|
||||
elif content_type == "code":
|
||||
content_parts.append(self._formatCodeForSlide([element]))
|
||||
content_parts.append(self._formatParagraphForSlide(element))
|
||||
elif content_type == "code_block" or content_type == "code":
|
||||
content_parts.append(self._formatCodeForSlide(element))
|
||||
else:
|
||||
content_parts.append(self._format_paragraph_for_slide([element]))
|
||||
content_parts.append(self._formatParagraphForSlide(element))
|
||||
|
||||
return "\n\n".join(filter(None, content_parts))
|
||||
|
||||
|
|
@ -1009,6 +1105,7 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
try:
|
||||
from pptx.util import Inches, Pt
|
||||
from pptx.enum.text import PP_ALIGN
|
||||
from pptx.dml.color import RGBColor
|
||||
import base64
|
||||
import io
|
||||
|
||||
|
|
@ -1106,7 +1203,25 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
slide.shapes.add_picture(imageStream, left, top, width=imgWidth, height=imgHeight)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error adding images to slide: {str(e)}")
|
||||
logger.error(f"Error embedding images in PPTX slide: {str(e)}")
|
||||
# Add error message text box to slide
|
||||
try:
|
||||
from pptx.util import Inches, Pt
|
||||
from pptx.enum.text import PP_ALIGN
|
||||
errorMsg = f"[Error: Could not embed image(s). {str(e)}]"
|
||||
errorBox = slide.shapes.add_textbox(
|
||||
Inches(0.5),
|
||||
Inches(2),
|
||||
slideWidth - Inches(1),
|
||||
Inches(0.5)
|
||||
)
|
||||
errorFrame = errorBox.text_frame
|
||||
errorFrame.text = errorMsg
|
||||
errorFrame.paragraphs[0].font.size = Pt(12)
|
||||
errorFrame.paragraphs[0].font.color.rgb = RGBColor(255, 0, 0) # Red color
|
||||
errorFrame.paragraphs[0].alignment = PP_ALIGN.LEFT
|
||||
except Exception as errorBoxError:
|
||||
logger.error(f"Could not add error message to slide: {str(errorBoxError)}")
|
||||
|
||||
def _formatTimestamp(self) -> str:
|
||||
"""Format current timestamp for presentation generation."""
|
||||
|
|
|
|||
|
|
@ -63,11 +63,17 @@ class RendererText(BaseRenderer):
|
|||
else:
|
||||
filename = self._determineFilename(title, "text/plain")
|
||||
|
||||
# Extract metadata for document type and other info
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=textContent.encode('utf-8'),
|
||||
mimeType="text/plain",
|
||||
filename=filename
|
||||
filename=filename,
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -75,11 +81,15 @@ class RendererText(BaseRenderer):
|
|||
self.logger.error(f"Error rendering text: {str(e)}")
|
||||
# Return minimal text fallback
|
||||
fallbackContent = f"{title}\n\nError rendering report: {str(e)}"
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=fallbackContent.encode('utf-8'),
|
||||
mimeType="text/plain",
|
||||
filename=self._determineFilename(title, "text/plain")
|
||||
filename=self._determineFilename(title, "text/plain"),
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
|
|
@ -201,8 +211,12 @@ class RendererText(BaseRenderer):
|
|||
def _renderJsonTable(self, tableData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON table to text."""
|
||||
try:
|
||||
headers = tableData.get("headers", [])
|
||||
rows = tableData.get("rows", [])
|
||||
# Extract from nested content structure
|
||||
content = tableData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
headers = content.get("headers", [])
|
||||
rows = content.get("rows", [])
|
||||
|
||||
if not headers or not rows:
|
||||
return ""
|
||||
|
|
@ -231,7 +245,11 @@ class RendererText(BaseRenderer):
|
|||
def _renderJsonBulletList(self, listData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON bullet list to text."""
|
||||
try:
|
||||
items = listData.get("items", [])
|
||||
# Extract from nested content structure
|
||||
content = listData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
items = content.get("items", [])
|
||||
|
||||
if not items:
|
||||
return ""
|
||||
|
|
@ -252,8 +270,12 @@ class RendererText(BaseRenderer):
|
|||
def _renderJsonHeading(self, headingData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON heading to text."""
|
||||
try:
|
||||
level = headingData.get("level", 1)
|
||||
text = headingData.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = headingData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
text = content.get("text", "")
|
||||
level = content.get("level", 1)
|
||||
|
||||
if text:
|
||||
level = max(1, min(6, level))
|
||||
|
|
@ -273,7 +295,14 @@ class RendererText(BaseRenderer):
|
|||
def _renderJsonParagraph(self, paragraphData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON paragraph to text."""
|
||||
try:
|
||||
text = paragraphData.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = paragraphData.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
text = content
|
||||
else:
|
||||
text = ""
|
||||
return text if text else ""
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -283,8 +312,12 @@ class RendererText(BaseRenderer):
|
|||
def _renderJsonCodeBlock(self, codeData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON code block to text."""
|
||||
try:
|
||||
code = codeData.get("code", "")
|
||||
language = codeData.get("language", "")
|
||||
# Extract from nested content structure
|
||||
content = codeData.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return ""
|
||||
code = content.get("code", "")
|
||||
language = content.get("language", "")
|
||||
|
||||
if code:
|
||||
if language:
|
||||
|
|
@ -301,9 +334,14 @@ class RendererText(BaseRenderer):
|
|||
def _renderJsonImage(self, imageData: Dict[str, Any]) -> str:
|
||||
"""Render a JSON image to text."""
|
||||
try:
|
||||
altText = imageData.get("altText", "Image")
|
||||
# Extract from nested content structure
|
||||
content = imageData.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
altText = content.get("altText", "Image")
|
||||
else:
|
||||
altText = imageData.get("altText", "Image")
|
||||
return f"[Image: {altText}]"
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error rendering image: {str(e)}")
|
||||
return f"[Image: {imageData.get('altText', 'Image')}]"
|
||||
return f"[Image: Image]"
|
||||
|
|
|
|||
|
|
@ -50,6 +50,10 @@ class RendererXlsx(BaseRenderer):
|
|||
# Generate Excel using AI-analyzed styling
|
||||
excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, aiService)
|
||||
|
||||
# Extract metadata for document type and other info
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
|
||||
# Determine filename from document or title
|
||||
documents = extractedContent.get("documents", [])
|
||||
if documents and isinstance(documents[0], dict):
|
||||
|
|
@ -72,14 +76,27 @@ class RendererXlsx(BaseRenderer):
|
|||
RenderedDocument(
|
||||
documentData=excel_bytes,
|
||||
mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
filename=filename
|
||||
filename=filename,
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error rendering Excel: {str(e)}")
|
||||
# Return CSV fallback
|
||||
return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv"
|
||||
# Return CSV fallback with metadata
|
||||
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
|
||||
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
|
||||
fallbackContent = f"Title,Content\n{title},Error rendering Excel report: {str(e)}"
|
||||
return [
|
||||
RenderedDocument(
|
||||
documentData=fallbackContent.encode('utf-8'),
|
||||
mimeType="text/csv",
|
||||
filename=self._determineFilename(title, "text/csv"),
|
||||
documentType=documentType,
|
||||
metadata=metadata if isinstance(metadata, dict) else None
|
||||
)
|
||||
]
|
||||
|
||||
def _generateExcel(self, content: str, title: str) -> str:
|
||||
"""Generate Excel content using openpyxl."""
|
||||
|
|
@ -231,8 +248,8 @@ class RendererXlsx(BaseRenderer):
|
|||
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(jsonContent)}", "EXCEL_RENDERER")
|
||||
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(jsonContent.keys()) if isinstance(jsonContent, dict) else 'Not a dict'}", "EXCEL_RENDERER")
|
||||
|
||||
# Get style set: default styles, enhanced with AI if userPrompt provided
|
||||
styles = await self._getStyleSet(userPrompt, aiService)
|
||||
# Get style set: use styles from metadata if available, otherwise enhance with AI
|
||||
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
|
||||
|
||||
# Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
|
||||
if not self._validateJsonStructure(jsonContent):
|
||||
|
|
@ -275,12 +292,17 @@ class RendererXlsx(BaseRenderer):
|
|||
self.logger.error(f"Error generating Excel from JSON: {str(e)}")
|
||||
raise Exception(f"Excel generation failed: {str(e)}")
|
||||
|
||||
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||
"""Get style set - default styles, enhanced with AI if userPrompt provided.
|
||||
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
|
||||
"""Get style set - use styles from document generation metadata if available,
|
||||
otherwise enhance default styles with AI if userPrompt provided.
|
||||
|
||||
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
|
||||
not be generated separately by renderers. Only fall back to AI if styles not provided.
|
||||
|
||||
Args:
|
||||
extractedContent: Document content with metadata (may contain styles)
|
||||
userPrompt: User's prompt (AI will detect style instructions in any language)
|
||||
aiService: AI service (used only if userPrompt provided)
|
||||
aiService: AI service (used only if styles not in metadata and userPrompt provided)
|
||||
templateName: Name of template style set (None = default)
|
||||
|
||||
Returns:
|
||||
|
|
@ -289,10 +311,19 @@ class RendererXlsx(BaseRenderer):
|
|||
# Get default style set
|
||||
defaultStyleSet = self._getDefaultStyleSet()
|
||||
|
||||
# Enhance with AI if userPrompt provided (AI handles multilingual style detection)
|
||||
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
|
||||
if extractedContent:
|
||||
metadata = extractedContent.get("metadata", {})
|
||||
if isinstance(metadata, dict):
|
||||
styles = metadata.get("styles")
|
||||
if styles and isinstance(styles, dict):
|
||||
self.logger.debug("Using styles from document generation metadata")
|
||||
enhancedStyleSet = self._convertColorsFormat(styles)
|
||||
return self._validateStylesContrast(enhancedStyleSet)
|
||||
|
||||
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
|
||||
if userPrompt and aiService:
|
||||
# AI will naturally detect style instructions in any language
|
||||
self.logger.info(f"Enhancing styles with AI based on user prompt...")
|
||||
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
|
||||
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
|
||||
# Convert colors to Excel format after getting styles
|
||||
enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet)
|
||||
|
|
@ -462,86 +493,119 @@ class RendererXlsx(BaseRenderer):
|
|||
|
||||
# Create sheets
|
||||
for i, sheetName in enumerate(sheetNames):
|
||||
# Sanitize sheet name before creating
|
||||
sanitized_name = self._sanitizeSheetName(sheetName)
|
||||
if i == 0:
|
||||
# Use the default sheet for the first sheet
|
||||
sheet = wb.active
|
||||
sheet.title = sheetName
|
||||
sheet.title = sanitized_name
|
||||
else:
|
||||
# Create additional sheets
|
||||
sheet = wb.create_sheet(sheetName, i)
|
||||
sheets[sheetName.lower()] = sheet
|
||||
sheet = wb.create_sheet(sanitized_name, i)
|
||||
# Use sanitized name as key (lowercase for lookup)
|
||||
sheets[sanitized_name.lower()] = sheet
|
||||
|
||||
return sheets
|
||||
|
||||
def _sanitizeSheetName(self, name: str) -> str:
|
||||
"""Sanitize sheet name: remove invalid characters and ensure valid length."""
|
||||
if not name:
|
||||
return "Sheet"
|
||||
# Remove invalid characters: [ ] : * ? / \
|
||||
invalid_chars = ['[', ']', ':', '*', '?', '/', '\\']
|
||||
sanitized = name
|
||||
for char in invalid_chars:
|
||||
sanitized = sanitized.replace(char, '')
|
||||
# Remove leading/trailing spaces and apostrophes
|
||||
sanitized = sanitized.strip().strip("'")
|
||||
# Ensure not empty
|
||||
if not sanitized:
|
||||
sanitized = "Sheet"
|
||||
# Excel sheet name limit is 31 characters
|
||||
return sanitized[:31]
|
||||
|
||||
def _generateSheetNamesFromContent(self, jsonContent: Dict[str, Any]) -> List[str]:
|
||||
"""Generate sheet names based on actual content structure."""
|
||||
"""Generate sheet names: each heading section creates a new tab."""
|
||||
sections = self._extractSections(jsonContent)
|
||||
|
||||
# If no sections, create a single sheet
|
||||
if not sections:
|
||||
return ["Content"]
|
||||
|
||||
# Generate sheet names based on content structure
|
||||
# Simple logic: each heading section creates a new tab
|
||||
sheetNames = []
|
||||
|
||||
# Check if we have multiple table sections
|
||||
tableSections = [s for s in sections if s.get("content_type") == "table"]
|
||||
|
||||
if len(tableSections) > 1:
|
||||
# Create separate sheets for each table
|
||||
for i, section in enumerate(tableSections, 1):
|
||||
# Try to get caption from table element first, then section title, then fallback
|
||||
sectionTitle = None
|
||||
for section in sections:
|
||||
if section.get("content_type") == "heading":
|
||||
# Extract heading text from elements
|
||||
elements = section.get("elements", [])
|
||||
if elements and isinstance(elements, list) and len(elements) > 0:
|
||||
tableElement = elements[0]
|
||||
sectionTitle = tableElement.get("caption")
|
||||
|
||||
if not sectionTitle:
|
||||
sectionTitle = section.get("title")
|
||||
|
||||
if not sectionTitle:
|
||||
sectionTitle = f"Table {i}"
|
||||
|
||||
sheetNames.append(sectionTitle[:31]) # Excel sheet name limit
|
||||
else:
|
||||
# Single table or mixed content - create only main sheet
|
||||
headingElement = elements[0]
|
||||
content = headingElement.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
headingText = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
headingText = content
|
||||
else:
|
||||
headingText = ""
|
||||
|
||||
if headingText:
|
||||
sanitized_name = self._sanitizeSheetName(headingText)
|
||||
# Ensure unique sheet names
|
||||
if sanitized_name not in sheetNames:
|
||||
sheetNames.append(sanitized_name)
|
||||
else:
|
||||
# Add number suffix for duplicates
|
||||
counter = 1
|
||||
base_name = sanitized_name[:28] # Leave room for " (1)"
|
||||
while f"{base_name} ({counter})" in sheetNames:
|
||||
counter += 1
|
||||
sheetNames.append(f"{base_name} ({counter})"[:31])
|
||||
|
||||
# If no headings found, use document title
|
||||
if not sheetNames:
|
||||
documentTitle = jsonContent.get("metadata", {}).get("title", "Document")
|
||||
sheetNames.append(documentTitle[:31]) # Excel sheet name limit
|
||||
sheetNames.append(self._sanitizeSheetName(documentTitle))
|
||||
|
||||
return sheetNames
|
||||
|
||||
def _populateExcelSheets(self, sheets: Dict[str, Any], jsonContent: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||
"""Populate Excel sheets with content from JSON based on actual sheet names."""
|
||||
"""Populate Excel sheets: each heading creates a new tab, all following content goes in that tab."""
|
||||
try:
|
||||
# Get the actual sheet names that were created
|
||||
# Get the actual sheet names that were created (keys are lowercase)
|
||||
sheetNames = list(sheets.keys())
|
||||
|
||||
if not sheetNames:
|
||||
return
|
||||
|
||||
sections = self._extractSections(jsonContent)
|
||||
tableSections = [s for s in sections if s.get("content_type") == "table"]
|
||||
|
||||
if len(tableSections) > 1:
|
||||
# Multiple tables - populate each sheet with its corresponding table
|
||||
for i, section in enumerate(tableSections):
|
||||
if i < len(sheetNames):
|
||||
sheetName = sheetNames[i]
|
||||
sheet = sheets[sheetName]
|
||||
# Use the caption from table element as sheet title, or fallback to sheet name
|
||||
sheetTitle = sheetName
|
||||
elements = section.get("elements", [])
|
||||
if elements and isinstance(elements, list) and len(elements) > 0:
|
||||
tableElement = elements[0]
|
||||
caption = tableElement.get("caption")
|
||||
if caption:
|
||||
sheetTitle = caption
|
||||
self._populateTableSheet(sheet, section, styles, sheetTitle)
|
||||
else:
|
||||
# Single table or mixed content - populate only main sheet
|
||||
firstSheetName = sheetNames[0]
|
||||
self._populateMainSheet(sheets[firstSheetName], jsonContent, styles)
|
||||
# Simple logic: iterate through sections, each heading creates a new tab
|
||||
currentSheetIndex = 0
|
||||
currentSheet = None
|
||||
currentRow = 1
|
||||
|
||||
for section in sections:
|
||||
contentType = section.get("content_type", "paragraph")
|
||||
|
||||
# Heading section: switch to next sheet
|
||||
if contentType == "heading":
|
||||
if currentSheetIndex < len(sheetNames):
|
||||
sheetName = sheetNames[currentSheetIndex]
|
||||
currentSheet = sheets[sheetName] # sheets dict uses lowercase keys
|
||||
currentSheetIndex += 1
|
||||
currentRow = 1 # Start at row 1 for new sheet
|
||||
else:
|
||||
# More headings than sheets - use last sheet
|
||||
if sheetNames:
|
||||
currentSheet = sheets[sheetNames[-1]]
|
||||
|
||||
# Render content in current sheet (or first sheet if no headings yet)
|
||||
if currentSheet is None and sheetNames:
|
||||
currentSheet = sheets[sheetNames[0]]
|
||||
|
||||
if currentSheet:
|
||||
currentRow = self._addSectionToSheet(currentSheet, section, styles, currentRow)
|
||||
currentRow += 1 # Add spacing between sections
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not populate Excel sheets: {str(e)}")
|
||||
|
|
@ -558,9 +622,15 @@ class RendererXlsx(BaseRenderer):
|
|||
# Get table data from elements (canonical JSON format)
|
||||
elements = section.get("elements", [])
|
||||
if elements and isinstance(elements, list) and len(elements) > 0:
|
||||
table_data = elements[0]
|
||||
headers = table_data.get("headers", [])
|
||||
rows = table_data.get("rows", [])
|
||||
table_element = elements[0]
|
||||
# Extract from nested content structure
|
||||
content = table_element.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
headers = []
|
||||
rows = []
|
||||
else:
|
||||
headers = content.get("headers", [])
|
||||
rows = content.get("rows", [])
|
||||
else:
|
||||
headers = []
|
||||
rows = []
|
||||
|
|
@ -578,11 +648,28 @@ class RendererXlsx(BaseRenderer):
|
|||
if header_style.get("background"):
|
||||
cell.fill = PatternFill(start_color=self._getSafeColor(header_style["background"]), end_color=self._getSafeColor(header_style["background"]), fill_type="solid")
|
||||
|
||||
# Add rows
|
||||
# Add rows - handle both array format and cells object format
|
||||
cell_style = styles.get("table_cell", {})
|
||||
for row_idx, row_data in enumerate(rows, 4):
|
||||
for col_idx, cell_value in enumerate(row_data, 1):
|
||||
cell = sheet.cell(row=row_idx, column=col_idx, value=cell_value)
|
||||
# Handle different row formats
|
||||
if isinstance(row_data, list):
|
||||
# Array format: [value1, value2, ...]
|
||||
cell_values = row_data
|
||||
elif isinstance(row_data, dict) and "cells" in row_data:
|
||||
# Cells object format: {"cells": [{"value": ...}, ...]}
|
||||
cell_values = [cell_obj.get("value", "") for cell_obj in row_data.get("cells", [])]
|
||||
else:
|
||||
# Unknown format, skip
|
||||
continue
|
||||
|
||||
for col_idx, cell_value in enumerate(cell_values, 1):
|
||||
# Extract value if it's a dict with "value" key
|
||||
if isinstance(cell_value, dict):
|
||||
actual_value = cell_value.get("value", "")
|
||||
else:
|
||||
actual_value = cell_value
|
||||
|
||||
cell = sheet.cell(row=row_idx, column=col_idx, value=actual_value)
|
||||
if cell_style.get("text_color"):
|
||||
cell.font = Font(color=self._getSafeColor(cell_style["text_color"]))
|
||||
|
||||
|
|
@ -714,18 +801,33 @@ class RendererXlsx(BaseRenderer):
|
|||
# Handle all section types using elements array
|
||||
elements = section.get("elements", [])
|
||||
for element in elements:
|
||||
if section_type == "table":
|
||||
# Check element type, not section type (elements can have different types than section)
|
||||
element_type = element.get("type", "") if isinstance(element, dict) else ""
|
||||
|
||||
if element_type == "table":
|
||||
startRow = self._addTableToExcel(sheet, element, styles, startRow)
|
||||
elif section_type == "bullet_list" or section_type == "list":
|
||||
elif element_type == "bullet_list" or element_type == "list":
|
||||
startRow = self._addListToExcel(sheet, element, styles, startRow)
|
||||
elif section_type == "paragraph":
|
||||
elif element_type == "paragraph":
|
||||
startRow = self._addParagraphToExcel(sheet, element, styles, startRow)
|
||||
elif section_type == "heading":
|
||||
elif element_type == "heading":
|
||||
startRow = self._addHeadingToExcel(sheet, element, styles, startRow)
|
||||
elif section_type == "image":
|
||||
elif element_type == "image":
|
||||
startRow = self._addImageToExcel(sheet, element, styles, startRow)
|
||||
else:
|
||||
startRow = self._addParagraphToExcel(sheet, element, styles, startRow)
|
||||
# Fallback: if element_type not set, use section_type
|
||||
if section_type == "table":
|
||||
startRow = self._addTableToExcel(sheet, element, styles, startRow)
|
||||
elif section_type == "bullet_list" or section_type == "list":
|
||||
startRow = self._addListToExcel(sheet, element, styles, startRow)
|
||||
elif section_type == "paragraph":
|
||||
startRow = self._addParagraphToExcel(sheet, element, styles, startRow)
|
||||
elif section_type == "heading":
|
||||
startRow = self._addHeadingToExcel(sheet, element, styles, startRow)
|
||||
elif section_type == "image":
|
||||
startRow = self._addImageToExcel(sheet, element, styles, startRow)
|
||||
else:
|
||||
startRow = self._addParagraphToExcel(sheet, element, styles, startRow)
|
||||
|
||||
return startRow
|
||||
|
||||
|
|
@ -733,36 +835,114 @@ class RendererXlsx(BaseRenderer):
|
|||
self.logger.warning(f"Could not add section to sheet: {str(e)}")
|
||||
return startRow + 1
|
||||
|
||||
def _sanitizeCellValue(self, value: Any) -> str:
|
||||
"""Sanitize cell value: remove markdown, convert to string, handle None."""
|
||||
if value is None:
|
||||
return ""
|
||||
if isinstance(value, dict):
|
||||
# Extract value from dict if present
|
||||
return str(value.get("value", ""))
|
||||
if isinstance(value, (int, float)):
|
||||
return value # Keep numbers as-is
|
||||
# Convert to string and remove markdown formatting
|
||||
text = str(value)
|
||||
# Remove markdown bold (**text**)
|
||||
text = text.replace("**", "")
|
||||
# Remove markdown italic (*text*)
|
||||
text = text.replace("*", "")
|
||||
# Remove other markdown
|
||||
text = text.replace("__", "").replace("_", "")
|
||||
return text.strip()
|
||||
|
||||
def _addTableToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
||||
"""Add a table element to Excel sheet."""
|
||||
"""Add a table element to Excel sheet with proper formatting and borders."""
|
||||
try:
|
||||
# In canonical JSON format, table elements have headers and rows directly
|
||||
headers = element.get("headers", [])
|
||||
rows = element.get("rows", [])
|
||||
# Extract from nested content structure
|
||||
content = element.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return startRow
|
||||
headers = content.get("headers", [])
|
||||
rows = content.get("rows", [])
|
||||
|
||||
if not headers and not rows:
|
||||
return startRow
|
||||
|
||||
# Add headers
|
||||
# Define border style
|
||||
thin_border = Border(
|
||||
left=Side(style='thin'),
|
||||
right=Side(style='thin'),
|
||||
top=Side(style='thin'),
|
||||
bottom=Side(style='thin')
|
||||
)
|
||||
|
||||
headerRow = startRow
|
||||
header_style = styles.get("table_header", {})
|
||||
|
||||
# Add headers with formatting
|
||||
for col, header in enumerate(headers, 1):
|
||||
cell = sheet.cell(row=startRow, column=col, value=header)
|
||||
if header_style.get("bold"):
|
||||
cell.font = Font(bold=True, color=self._getSafeColor(header_style.get("text_color", "FF000000")))
|
||||
sanitized_header = self._sanitizeCellValue(header)
|
||||
cell = sheet.cell(row=headerRow, column=col, value=sanitized_header)
|
||||
|
||||
# Font styling
|
||||
cell.font = Font(
|
||||
bold=header_style.get("bold", True),
|
||||
color=self._getSafeColor(header_style.get("text_color", "FF000000"))
|
||||
)
|
||||
|
||||
# Background color
|
||||
if header_style.get("background"):
|
||||
cell.fill = PatternFill(start_color=self._getSafeColor(header_style["background"]), end_color=self._getSafeColor(header_style["background"]), fill_type="solid")
|
||||
cell.fill = PatternFill(
|
||||
start_color=self._getSafeColor(header_style["background"]),
|
||||
end_color=self._getSafeColor(header_style["background"]),
|
||||
fill_type="solid"
|
||||
)
|
||||
|
||||
# Alignment
|
||||
cell.alignment = Alignment(
|
||||
horizontal=header_style.get("align", "left"),
|
||||
vertical="center"
|
||||
)
|
||||
|
||||
# Border
|
||||
cell.border = thin_border
|
||||
|
||||
startRow += 1
|
||||
|
||||
# Add rows
|
||||
# Add rows with formatting
|
||||
cell_style = styles.get("table_cell", {})
|
||||
for row_data in rows:
|
||||
for col, cell_value in enumerate(row_data, 1):
|
||||
cell = sheet.cell(row=startRow, column=col, value=cell_value)
|
||||
# Handle different row formats
|
||||
if isinstance(row_data, list):
|
||||
cell_values = row_data
|
||||
elif isinstance(row_data, dict) and "cells" in row_data:
|
||||
cell_values = [cell_obj.get("value", "") for cell_obj in row_data.get("cells", [])]
|
||||
else:
|
||||
continue
|
||||
|
||||
for col, cell_value in enumerate(cell_values, 1):
|
||||
sanitized_value = self._sanitizeCellValue(cell_value)
|
||||
cell = sheet.cell(row=startRow, column=col, value=sanitized_value)
|
||||
|
||||
# Font styling
|
||||
if cell_style.get("text_color"):
|
||||
cell.font = Font(color=self._getSafeColor(cell_style["text_color"]))
|
||||
|
||||
# Alignment
|
||||
cell.alignment = Alignment(
|
||||
horizontal=cell_style.get("align", "left"),
|
||||
vertical="center"
|
||||
)
|
||||
|
||||
# Border
|
||||
cell.border = thin_border
|
||||
|
||||
startRow += 1
|
||||
|
||||
# Auto-adjust column widths
|
||||
for col in range(1, len(headers) + 1):
|
||||
column_letter = get_column_letter(col)
|
||||
sheet.column_dimensions[column_letter].width = 20
|
||||
|
||||
return startRow
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -770,9 +950,13 @@ class RendererXlsx(BaseRenderer):
|
|||
return startRow + 1
|
||||
|
||||
def _addListToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
||||
"""Add a list element to Excel sheet."""
|
||||
"""Add a list element to Excel sheet. Expects nested content structure."""
|
||||
try:
|
||||
list_items = element.get("items", [])
|
||||
# Extract from nested content structure
|
||||
content = element.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return startRow
|
||||
list_items = content.get("items", [])
|
||||
|
||||
list_style = styles.get("bullet_list", {})
|
||||
for item in list_items:
|
||||
|
|
@ -788,9 +972,16 @@ class RendererXlsx(BaseRenderer):
|
|||
return startRow + 1
|
||||
|
||||
def _addParagraphToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
||||
"""Add a paragraph element to Excel sheet."""
|
||||
"""Add a paragraph element to Excel sheet. Expects nested content structure."""
|
||||
try:
|
||||
text = element.get("text", "")
|
||||
# Extract from nested content structure
|
||||
content = element.get("content", {})
|
||||
if isinstance(content, dict):
|
||||
text = content.get("text", "")
|
||||
elif isinstance(content, str):
|
||||
text = content
|
||||
else:
|
||||
text = ""
|
||||
if text:
|
||||
sheet.cell(row=startRow, column=1, value=text)
|
||||
|
||||
|
|
@ -807,10 +998,14 @@ class RendererXlsx(BaseRenderer):
|
|||
return startRow + 1
|
||||
|
||||
def _addHeadingToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
||||
"""Add a heading element to Excel sheet."""
|
||||
"""Add a heading element to Excel sheet. Expects nested content structure."""
|
||||
try:
|
||||
text = element.get("text", "")
|
||||
level = element.get("level", 1)
|
||||
# Extract from nested content structure
|
||||
content = element.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return startRow
|
||||
text = content.get("text", "")
|
||||
level = content.get("level", 1)
|
||||
|
||||
if text:
|
||||
sheet.cell(row=startRow, column=1, value=text)
|
||||
|
|
@ -835,11 +1030,15 @@ class RendererXlsx(BaseRenderer):
|
|||
return startRow + 1
|
||||
|
||||
def _addImageToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
|
||||
"""Add an image element to Excel sheet using openpyxl."""
|
||||
"""Add an image element to Excel sheet using openpyxl. Expects nested content structure."""
|
||||
try:
|
||||
base64Data = element.get("base64Data", "")
|
||||
altText = element.get("altText", "Image")
|
||||
caption = element.get("caption", "")
|
||||
# Extract from nested content structure
|
||||
content = element.get("content", {})
|
||||
if not isinstance(content, dict):
|
||||
return startRow
|
||||
base64Data = content.get("base64Data", "")
|
||||
altText = content.get("altText", "Image")
|
||||
caption = content.get("caption", "")
|
||||
|
||||
if not base64Data:
|
||||
# No image data - add placeholder text
|
||||
|
|
@ -891,16 +1090,23 @@ class RendererXlsx(BaseRenderer):
|
|||
return startRow + 1
|
||||
|
||||
except ImportError:
|
||||
self.logger.warning("openpyxl.drawing.image not available, using placeholder")
|
||||
sheet.cell(row=startRow, column=1, value=f"[Image: {altText}]")
|
||||
self.logger.error("openpyxl.drawing.image not available, cannot embed image")
|
||||
errorMsg = f"[Error: Image embedding not available. Image: {altText}]"
|
||||
errorCell = sheet.cell(row=startRow, column=1, value=errorMsg)
|
||||
errorCell.font = Font(color="FFFF0000", italic=True) # Red color
|
||||
return startRow + 1
|
||||
except Exception as imgError:
|
||||
self.logger.warning(f"Error embedding image in Excel: {str(imgError)}")
|
||||
sheet.cell(row=startRow, column=1, value=f"[Image: {altText}]")
|
||||
self.logger.error(f"Error embedding image in Excel: {str(imgError)}")
|
||||
errorMsg = f"[Error: Could not embed image '{altText}'. {str(imgError)}]"
|
||||
errorCell = sheet.cell(row=startRow, column=1, value=errorMsg)
|
||||
errorCell.font = Font(color="FFFF0000", italic=True) # Red color
|
||||
return startRow + 1
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not add image to Excel: {str(e)}")
|
||||
self.logger.error(f"Error adding image to Excel: {str(e)}")
|
||||
errorMsg = f"[Error: Could not process image. {str(e)}]"
|
||||
errorCell = sheet.cell(row=startRow, column=1, value=errorMsg)
|
||||
errorCell.font = Font(color="FFFF0000", italic=True) # Red color
|
||||
return startRow + 1
|
||||
|
||||
def _formatTimestamp(self) -> str:
|
||||
|
|
|
|||
|
|
@ -213,10 +213,21 @@ class ContentValidator:
|
|||
sourceJson = getattr(doc, 'sourceJson', None)
|
||||
data = getattr(doc, 'documentData', None)
|
||||
|
||||
# WICHTIG: For rendered documents (HTML, PDF, DOCX, etc.), jsonStructure is METADATA about the structure,
|
||||
# NOT the actual rendered content. The actual content is in documentData.
|
||||
# Include both: jsonStructure for structure metadata, and contentPreview for actual content check
|
||||
if sourceJson and isinstance(sourceJson, dict):
|
||||
# Use source JSON for structure analysis (for rendered documents like xlsx/docx/pdf)
|
||||
jsonSummary = self._summarizeJsonStructure(sourceJson)
|
||||
summary["jsonStructure"] = jsonSummary
|
||||
# Add note that this is metadata, not actual content
|
||||
summary["note"] = "jsonStructure contains metadata about document structure. Actual rendered content is in documentData."
|
||||
|
||||
# For rendered documents, also check actual content
|
||||
if data is not None:
|
||||
contentPreview = self._getContentPreview(data, formatExt, mimeType)
|
||||
if contentPreview:
|
||||
summary["contentPreview"] = contentPreview
|
||||
elif data is not None:
|
||||
# Fallback: try to parse documentData as JSON (for non-rendered documents)
|
||||
if isinstance(data, dict):
|
||||
|
|
@ -227,6 +238,11 @@ class ContentValidator:
|
|||
# Handle list of documents
|
||||
jsonSummary = self._summarizeJsonStructure(data[0])
|
||||
summary["jsonStructure"] = jsonSummary
|
||||
else:
|
||||
# For non-JSON data (e.g., rendered HTML), get content preview
|
||||
contentPreview = self._getContentPreview(data, formatExt, mimeType)
|
||||
if contentPreview:
|
||||
summary["contentPreview"] = contentPreview
|
||||
|
||||
summaries.append(summary)
|
||||
except Exception as e:
|
||||
|
|
@ -295,6 +311,73 @@ class ContentValidator:
|
|||
bytes /= 1024.0
|
||||
return f"{bytes:.1f} TB"
|
||||
|
||||
def _getContentPreview(self, data: Any, formatExt: str, mimeType: str) -> Optional[Dict[str, Any]]:
|
||||
"""Get structural validation info for rendered documents (generic, NO content preview for security/privacy)
|
||||
|
||||
Returns metadata about document structure to help validation distinguish between:
|
||||
- Structure metadata (jsonStructure) - describes what should be rendered
|
||||
- Actual rendered content (documentData) - the actual document file
|
||||
|
||||
Does NOT expose actual content, only structural indicators.
|
||||
"""
|
||||
try:
|
||||
if data is None:
|
||||
return None
|
||||
|
||||
preview = {}
|
||||
|
||||
# Generic content type detection
|
||||
if isinstance(data, bytes):
|
||||
preview["dataType"] = "bytes"
|
||||
preview["contentLength"] = len(data)
|
||||
# Check if it's likely text-based (for text formats like HTML, TXT, etc.)
|
||||
try:
|
||||
# Try to decode as UTF-8 to check if it's text-based
|
||||
decoded = data.decode('utf-8', errors='strict')
|
||||
preview["isTextBased"] = True
|
||||
preview["contentLength"] = len(decoded)
|
||||
|
||||
# For text-based formats, check if it looks like rendered content vs JSON metadata
|
||||
# JSON metadata typically starts with { or [ and contains structure keywords
|
||||
trimmed = decoded.strip()
|
||||
looksLikeJson = (trimmed.startswith('{') or trimmed.startswith('[')) and \
|
||||
('"sections"' in trimmed or '"contentPartIds"' in trimmed or '"generationHint"' in trimmed)
|
||||
preview["looksLikeRenderedContent"] = not looksLikeJson
|
||||
|
||||
except UnicodeDecodeError:
|
||||
# Not valid UTF-8, likely binary (PDF, DOCX, images, etc.)
|
||||
preview["isTextBased"] = False
|
||||
preview["isBinary"] = True
|
||||
# Binary files with content are rendered (not metadata)
|
||||
preview["looksLikeRenderedContent"] = True
|
||||
|
||||
elif isinstance(data, str):
|
||||
preview["dataType"] = "string"
|
||||
preview["isTextBased"] = True
|
||||
preview["contentLength"] = len(data)
|
||||
|
||||
# Check if it looks like rendered content vs JSON metadata
|
||||
trimmed = data.strip()
|
||||
looksLikeJson = (trimmed.startswith('{') or trimmed.startswith('[')) and \
|
||||
('"sections"' in trimmed or '"contentPartIds"' in trimmed or '"generationHint"' in trimmed)
|
||||
preview["looksLikeRenderedContent"] = not looksLikeJson
|
||||
|
||||
elif isinstance(data, (dict, list)):
|
||||
# If documentData is still a dict/list, it's likely structure metadata, not rendered content
|
||||
preview["dataType"] = "json"
|
||||
preview["isTextBased"] = True
|
||||
preview["looksLikeRenderedContent"] = False
|
||||
preview["note"] = "documentData is JSON structure, not rendered document file"
|
||||
else:
|
||||
preview["dataType"] = type(data).__name__
|
||||
preview["contentLength"] = len(str(data)) if hasattr(data, '__len__') else 0
|
||||
|
||||
return preview if preview else None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error getting content structure info: {str(e)}")
|
||||
return None
|
||||
|
||||
|
||||
def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool:
|
||||
"""
|
||||
|
|
@ -445,31 +528,23 @@ EXPECTED FORMATS: {expectedFormats if expectedFormats else ['any']}{actionContex
|
|||
|
||||
=== VALIDATION INSTRUCTIONS ===
|
||||
|
||||
IMPORTANT: Different formats can represent the same data structure. Do not reject a format just because it differs from expected - check the structure summary for actual content.
|
||||
CRITICAL: Validate ONLY metadata/structure. Documents may be binary (PDF, DOCX, images) or very large (200MB+). NEVER try to read or validate actual content values.
|
||||
|
||||
VALIDATION RULES:
|
||||
1. Use structure summary (sections, statistics, counts) as PRIMARY evidence for DATA-ORIENTED criteria. Trust structure over format claims.
|
||||
2. Use ACTION HISTORY as PRIMARY evidence for PROCESS-ORIENTED criteria (e.g., "internet search performed", "sources cited"). Document metadata may only reflect the last action, not the entire workflow.
|
||||
3. For each criterion in criteriaMapping: evaluate ONLY that criterion. Do not mention other criteria.
|
||||
4. Priority: Data completeness > Format compatibility. Missing data is more critical than format mismatch.
|
||||
5. Format understanding: Different formats can represent equivalent data structures. Focus on content, not format name.
|
||||
6. Multi-step workflow awareness: If ACTION HISTORY is present, consider the workflow as a whole. Document metadata (e.g., extraction_method) describes how data was EXTRACTED in the last step, not necessarily how it was OBTAINED in the workflow.
|
||||
7. Data availability assessment: If delivered documents do not contain required data, clearly indicate this in findings. Re-reading the same documents might not help.
|
||||
8. CRITICAL - Data vs Data Description: When criteria require specific data types (e.g., images, tables, charts, files), distinguish between:
|
||||
- ACTUAL DATA: The actual data itself (binary data, structured data, embedded content)
|
||||
- DATA DESCRIPTIONS: Text fields that describe or specify what data should be created (e.g., "image_description", "table_description", "chart_specification") - these are TEXT METADATA, NOT the actual data
|
||||
- If only descriptions/specifications exist but no actual data, the criterion is NOT met. Descriptions are instructions for creating data, not the data itself.
|
||||
- Check content types in sections/elements: if content_type matches the required data type (e.g., "image" for images, "table" for tables), actual data exists. If only text fields describing the data exist, the data is missing.
|
||||
- Check document statistics: if counts for the required data type are 0, the data is missing even if descriptions exist.
|
||||
1. METADATA ONLY: Use jsonStructure (sections, contentPartIds, content_type, statistics) and contentPreview (dataType, contentLength, looksLikeRenderedContent) for validation. These are METADATA indicators, NOT actual content.
|
||||
2. FORMAT VALIDATION: Check mimeType/format metadata only. Do NOT inspect content to determine format. Format mismatch = wrong_format gap.
|
||||
3. CONTENT EXISTENCE: Use contentPreview.looksLikeRenderedContent=true to confirm content exists. Use jsonStructure.content_type to confirm data types exist (e.g., "image" section = image exists). Do NOT validate content quality, accuracy, or completeness of actual data values.
|
||||
4. STRUCTURE VALIDATION: Use jsonStructure.sections, statistics (counts, rowCount, columnCount) as evidence. Trust structure metadata over format claims.
|
||||
5. PROCESS VALIDATION: Use ACTION HISTORY for process-oriented criteria (e.g., "search performed", "extraction done").
|
||||
6. ONE CRITERION PER EVALUATION: Evaluate each criterion independently. Do not mention other criteria.
|
||||
|
||||
VALIDATION STEPS:
|
||||
- Check ACTION HISTORY first (if present) for PROCESS-ORIENTED criteria (e.g., "search performed", "sources used", "verification done")
|
||||
- Check ACTION VALIDATION METADATA (if present) - this contains action-specific context for the LAST action only
|
||||
- Check structure summary for quantities, counts, statistics (for DATA-ORIENTED criteria)
|
||||
- Compare found values with required values from criteria
|
||||
- If structure unavailable, use metadata only (format, filename, size)
|
||||
- Classify gaps: missing_data (less than required), incomplete_data (partial), wrong_structure (wrong organization), wrong_format (format mismatch but data present)
|
||||
- Assess if documents contain the required data: If structure shows documents lack the data, note this in findings - data must be generated or obtained elsewhere, not re-extracted from same documents
|
||||
- Check ACTION HISTORY for process-oriented criteria
|
||||
- Check jsonStructure metadata (sections, content_type, statistics) for structure validation
|
||||
- Check contentPreview.looksLikeRenderedContent for content existence (not quality)
|
||||
- Check mimeType/format for format validation
|
||||
- NEVER try to read actual content values (binary files, large files, data accuracy)
|
||||
- Classify gaps: missing_data, incomplete_data, wrong_structure, wrong_format
|
||||
|
||||
SCORING:
|
||||
- Data complete + structure matches → qualityScore: 0.9-1.0
|
||||
|
|
|
|||
|
|
@ -379,8 +379,34 @@ def extractLearningsAndImprovements(context: Any) -> str:
|
|||
return "No learnings available yet"
|
||||
|
||||
def extractLatestRefinementFeedback(context: Any) -> str:
|
||||
"""Extract the latest refinement feedback. Maps to {{KEY:LATEST_REFINEMENT_FEEDBACK}}"""
|
||||
"""Extract the latest refinement feedback. Maps to {{KEY:LATEST_REFINEMENT_FEEDBACK}}
|
||||
|
||||
CRITICAL: If ERROR level logs are found, refinement should stop processing.
|
||||
"""
|
||||
try:
|
||||
# First check for ERROR level logs in workflow
|
||||
if hasattr(context, 'workflow') and context.workflow:
|
||||
try:
|
||||
import modules.interfaces.interfaceDbChatObjects as interfaceDbChatObjects
|
||||
from modules.interfaces.interfaceDbAppObjects import getRootInterface
|
||||
rootInterface = getRootInterface()
|
||||
interfaceDbChat = interfaceDbChatObjects.getInterface(rootInterface.currentUser)
|
||||
|
||||
# Get workflow logs
|
||||
chatData = interfaceDbChat.getUnifiedChatData(context.workflow.id, None)
|
||||
logs = chatData.get("logs", [])
|
||||
|
||||
# Check for ERROR level logs
|
||||
for log in logs:
|
||||
if isinstance(log, dict):
|
||||
log_level = log.get("level", "").upper()
|
||||
log_message = str(log.get("message", ""))
|
||||
if log_level == "ERROR" or "ERROR" in log_message.upper():
|
||||
return f"CRITICAL: Processing stopped due to ERROR in logs: {log_message[:200]}"
|
||||
except Exception as log_check_error:
|
||||
# If we can't check logs, continue with normal feedback extraction
|
||||
logger.warning(f"Could not check for ERROR logs: {str(log_check_error)}")
|
||||
|
||||
if not hasattr(context, 'previousReviewResult') or not context.previousReviewResult or not isinstance(context.previousReviewResult, list):
|
||||
return "No previous refinement feedback available"
|
||||
|
||||
|
|
|
|||
541
tests/functional/test10_document_generation_formats.py
Normal file
541
tests/functional/test10_document_generation_formats.py
Normal file
|
|
@ -0,0 +1,541 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Document Generation Formats Test 10 - Tests document generation in DOCX, XLSX, PPTX, and PDF formats
|
||||
Tests professional document formats with various content types including tables, images, and structured data.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import base64
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
# Add the gateway to path (go up 2 levels from tests/functional/)
|
||||
_gateway_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
if _gateway_path not in sys.path:
|
||||
sys.path.insert(0, _gateway_path)
|
||||
|
||||
# Import the service initialization
|
||||
from modules.services import getInterface as getServices
|
||||
from modules.datamodels.datamodelChat import UserInputRequest, WorkflowModeEnum
|
||||
from modules.datamodels.datamodelUam import User
|
||||
from modules.features.workflow import chatStart
|
||||
import modules.interfaces.interfaceDbChatObjects as interfaceDbChatObjects
|
||||
|
||||
|
||||
class DocumentGenerationFormatsTester10:
|
||||
def __init__(self):
|
||||
# Use root user for testing (has full access to everything)
|
||||
from modules.interfaces.interfaceDbAppObjects import getRootInterface
|
||||
rootInterface = getRootInterface()
|
||||
self.testUser = rootInterface.currentUser
|
||||
|
||||
# Initialize services using the existing system
|
||||
self.services = getServices(self.testUser, None) # Test user, no workflow
|
||||
self.workflow = None
|
||||
self.testResults = {}
|
||||
self.generatedDocuments = {}
|
||||
self.pdfFileId = None # Store PDF file ID for reuse
|
||||
|
||||
async def initialize(self):
|
||||
"""Initialize the test environment."""
|
||||
# Enable debug file logging for tests
|
||||
from modules.shared.configuration import APP_CONFIG
|
||||
APP_CONFIG.set("APP_DEBUG_CHAT_WORKFLOW_ENABLED", True)
|
||||
|
||||
# Set logging level to INFO to see workflow progress
|
||||
import logging
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
|
||||
print(f"Initialized test with user: {self.testUser.id}")
|
||||
print(f"Mandate ID: {self.testUser.mandateId}")
|
||||
print(f"Debug logging enabled: {APP_CONFIG.get('APP_DEBUG_CHAT_WORKFLOW_ENABLED', False)}")
|
||||
|
||||
# Upload PDF file for testing
|
||||
await self.uploadPdfFile()
|
||||
|
||||
async def uploadPdfFile(self):
    """Locate, read, and register the test PDF, remembering its file ID.

    Best-effort: on any failure (missing file, missing db component,
    upload error) a warning is printed and the test continues without
    an attachment — self.pdfFileId is only set on success.
    """
    relativeParts = ("..", "..", "..", "local", "temp", "B2025-02c.pdf")
    pdfPath = os.path.abspath(os.path.join(os.path.dirname(__file__), *relativeParts))

    if not os.path.exists(pdfPath):
        print(f"⚠️ Warning: PDF file not found at {pdfPath}")
        print("   Test will continue without PDF attachment")
        return

    try:
        # Read the PDF bytes up front so both create calls share them.
        with open(pdfPath, "rb") as fileHandle:
            pdfContent = fileHandle.read()

        # The db component is optional in some service configurations.
        dbComponent = getattr(self.services, 'interfaceDbComponent', None)
        if not dbComponent:
            print("⚠️ Warning: interfaceDbComponent not available in services")
            print("   Test will continue without PDF attachment")
            return

        fileItem = dbComponent.createFile(
            name="B2025-02c.pdf",
            mimeType="application/pdf",
            content=pdfContent
        )

        # Persist the raw bytes under the new file record.
        dbComponent.createFileData(fileItem.id, pdfContent)

        self.pdfFileId = fileItem.id
        print(f"✅ Uploaded PDF file: {fileItem.fileName} (ID: {self.pdfFileId}, Size: {len(pdfContent)} bytes)")

    except Exception as e:
        import traceback
        print(f"⚠️ Warning: Failed to upload PDF file: {str(e)}")
        print(f"   Traceback: {traceback.format_exc()}")
        print("   Test will continue without PDF attachment")
||||
def createTestPrompt(self, format: str) -> str:
    """Return the document-generation prompt for the requested format.

    Each prompt asks for a professional document structure (title,
    sections, tables, images) and extraction of content from the
    attached PDF (B2025-02c.pdf). Unknown formats fall back to the
    DOCX prompt; the lookup is case-insensitive.
    """
    docxPrompt = (
        "Create a professional Word document about 'Fuel Station Receipt Analysis' with:\n"
        "1) A main title\n"
        "2) An executive summary paragraph\n"
        "3) Extract and include the image from the attached PDF document (B2025-02c.pdf)\n"
        "4) A detailed analysis section with:\n"
        "   - Bullet points of key findings\n"
        "   - A table summarizing transaction details\n"
        "5) A conclusion section with recommendations\n\n"
        "Format as a professional DOCX document with proper headings and structure."
    )
    xlsxPrompt = (
        "Create an Excel spreadsheet analyzing the fuel station receipt from the attached PDF (B2025-02c.pdf).\n"
        "Include:\n"
        "1) A summary sheet with key metrics\n"
        "2) A detailed data sheet with:\n"
        "   - Transaction details in rows\n"
        "   - Columns for: Date, Item, Quantity, Price, Total\n"
        "   - Proper formatting and headers\n"
        "3) A calculations sheet with:\n"
        "   - VAT calculations\n"
        "   - Net and gross totals\n\n"
        "Format as a professional XLSX spreadsheet with formulas and formatting."
    )
    pptxPrompt = (
        "Create a PowerPoint presentation about 'Fuel Station Receipt Analysis' with:\n"
        "1) Title slide with main title\n"
        "2) Overview slide explaining the receipt analysis\n"
        "3) Extract and include the image from the attached PDF document (B2025-02c.pdf)\n"
        "4) Analysis slides with:\n"
        "   - Bullet points of key findings\n"
        "   - Visual representation of data\n"
        "5) Conclusion slide with recommendations\n\n"
        "Format as a professional PPTX presentation with consistent styling."
    )
    pdfPrompt = (
        "Create a professional PDF document about 'Fuel Station Receipt Analysis' with:\n"
        "1) A main title\n"
        "2) An introduction paragraph explaining the receipt analysis\n"
        "3) Extract and include the image from the attached PDF document (B2025-02c.pdf)\n"
        "4) A section analyzing the receipt data with:\n"
        "   - Bullet points of key findings\n"
        "   - A table summarizing transaction details\n"
        "5) A conclusion paragraph with recommendations\n\n"
        "Format as a professional PDF document suitable for printing."
    )

    promptByFormat = {
        "docx": docxPrompt,
        "xlsx": xlsxPrompt,
        "pptx": pptxPrompt,
        "pdf": pdfPrompt
    }
    return promptByFormat.get(format.lower(), docxPrompt)
|
||||
async def generateDocumentInFormat(self, format: str) -> Dict[str, Any]:
    """Generate a document in the specified format using the dynamic workflow.

    Starts a WORKFLOW_DYNAMIC chat with the format-specific prompt
    (attaching the uploaded PDF when available), waits indefinitely for
    completion, then returns a dict with the success flag, workflow id,
    status, and the generated documents whose filename extension matches
    *format*.

    Fix: the document filter previously compared extensions
    case-sensitively, so a file named e.g. "Report.PDF" was generated but
    not counted; the filename is now lowercased first, consistent with
    verifyDocumentFormat.
    """
    print("\n" + "="*80)
    print(f"GENERATING DOCUMENT IN {format.upper()} FORMAT")
    print("="*80)

    prompt = self.createTestPrompt(format)
    print(f"Prompt: {prompt[:200]}...")

    # Attach the PDF uploaded during initialization, if that succeeded.
    listFileId = []
    if self.pdfFileId:
        listFileId = [self.pdfFileId]
        print(f"Attaching PDF file (ID: {self.pdfFileId})")
    else:
        print("⚠️ No PDF file attached (file upload may have failed)")

    # Create user input request
    userInput = UserInputRequest(
        prompt=prompt,
        listFileId=listFileId,
        userLanguage="en"
    )

    # Start workflow
    print(f"\nStarting workflow for {format.upper()} generation...")
    workflow = await chatStart(
        currentUser=self.testUser,
        userInput=userInput,
        workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC,
        workflowId=None
    )

    if not workflow:
        return {
            "success": False,
            "error": "Failed to start workflow"
        }

    self.workflow = workflow
    print(f"Workflow started: {workflow.id}")

    # Wait for workflow completion (no timeout - wait indefinitely)
    print("Waiting for workflow completion...")
    completed = await self.waitForWorkflowCompletion(timeout=None)

    if not completed:
        return {
            "success": False,
            "error": "Workflow did not complete",
            "workflowId": workflow.id,
            "status": workflow.status if workflow else "unknown"
        }

    # Analyze results
    results = self.analyzeWorkflowResults()

    # Extract documents for this format. Match case-insensitively so
    # files like "Report.PDF" are still counted (mirrors
    # verifyDocumentFormat, which also lowercases the filename).
    documents = results.get("documents", [])
    expectedSuffix = f".{format.lower()}"
    formatDocuments = [
        d for d in documents
        if d.get("fileName", "").lower().endswith(expectedSuffix)
    ]

    return {
        "success": True,
        "format": format,
        "workflowId": workflow.id,
        "status": results.get("status"),
        "documentCount": len(formatDocuments),
        "documents": formatDocuments,
        "results": results
    }
|
||||
async def waitForWorkflowCompletion(self, timeout: Optional[int] = None, checkInterval: int = 2) -> bool:
    """Poll the workflow until it reaches a terminal status.

    Returns True only when the final status is "completed";
    "stopped"/"failed" (or a vanished workflow) return False. With
    timeout=None the poll never gives up; transient errors while
    polling are printed and retried after checkInterval seconds.
    """
    if not self.workflow:
        return False

    beganAt = time.time()
    previousStatus = None

    interfaceDbChat = interfaceDbChatObjects.getInterface(self.testUser)

    if timeout is None:
        print("Waiting indefinitely (no timeout)")

    while True:
        # Enforce the deadline only when one was given.
        if timeout is not None and time.time() - beganAt > timeout:
            print(f"\n⏱️ Timeout after {timeout} seconds")
            return False

        try:
            currentWorkflow = interfaceDbChat.getWorkflow(self.workflow.id)
            if not currentWorkflow:
                print("\n❌ Workflow not found")
                return False

            currentStatus = currentWorkflow.status
            elapsed = int(time.time() - beganAt)

            # Only log on status transitions to keep output readable.
            if currentStatus != previousStatus:
                print(f"Workflow status: {currentStatus} (elapsed: {elapsed}s)")
                previousStatus = currentStatus

            # Terminal states end the poll; only "completed" is success.
            if currentStatus in ("completed", "stopped", "failed"):
                self.workflow = currentWorkflow
                statusIcon = "✅" if currentStatus == "completed" else "❌"
                print(f"\n{statusIcon} Workflow finished with status: {currentStatus} (elapsed: {elapsed}s)")
                return currentStatus == "completed"

            await asyncio.sleep(checkInterval)

        except Exception as e:
            # Transient lookup errors: report and retry.
            print(f"\n⚠️ Error checking workflow status: {str(e)}")
            await asyncio.sleep(checkInterval)
|
||||
def analyzeWorkflowResults(self) -> Dict[str, Any]:
    """Summarize the finished workflow: status, counts, documents, logs.

    Re-reads the workflow and its unified chat data from the db
    interface and returns a flat summary dict; returns an {"error": ...}
    dict when there is no workflow to analyze or it cannot be found.
    """
    if not self.workflow:
        return {"error": "No workflow to analyze"}

    interfaceDbChat = interfaceDbChatObjects.getInterface(self.testUser)
    workflow = interfaceDbChat.getWorkflow(self.workflow.id)
    if not workflow:
        return {"error": "Workflow not found"}

    # Unified chat data bundles messages, documents, and logs together.
    chatData = interfaceDbChat.getUnifiedChatData(workflow.id, None)

    messages = chatData.get("messages", [])
    documents = chatData.get("documents", [])
    logs = chatData.get("logs", [])
    userMessages = [m for m in messages if m.get("role") == "user"]
    assistantMessages = [m for m in messages if m.get("role") == "assistant"]

    results = {
        "workflowId": workflow.id,
        "status": workflow.status,
        "workflowMode": str(workflow.workflowMode) if hasattr(workflow, 'workflowMode') else None,
        "currentRound": workflow.currentRound,
        "totalTasks": workflow.totalTasks,
        "totalActions": workflow.totalActions,
        "messageCount": len(messages),
        "userMessageCount": len(userMessages),
        "assistantMessageCount": len(assistantMessages),
        "documentCount": len(documents),
        "logCount": len(logs),
        "documents": documents,
        "logs": logs
    }

    print("\nWorkflow Results:")
    for label, key in (
        ("Status", "status"),
        ("Tasks", "totalTasks"),
        ("Actions", "totalActions"),
        ("Messages", "messageCount"),
        ("Documents", "documentCount"),
    ):
        print(f"  {label}: {results[key]}")

    # One line per generated document with its basic attributes.
    if documents:
        print("\nGenerated Documents:")
        for doc in documents:
            fileName = doc.get("fileName", "unknown")
            fileSize = doc.get("fileSize", 0)
            mimeType = doc.get("mimeType", "unknown")
            documentType = doc.get("documentType", "N/A")
            print(f"  - {fileName} ({fileSize} bytes, {mimeType}, type: {documentType})")

    return results
|
||||
def verifyDocumentFormat(self, document: Dict[str, Any], expectedFormat: str) -> Dict[str, Any]:
    """Check a generated document record against the expected format.

    Validates the file extension and MIME type (case-insensitively),
    a non-zero file size, and the presence of documentType and a dict
    metadata field. Returns every individual check plus two aggregates:
    "isValid" (extension + size + MIME) and "isComplete" (isValid plus
    documentType and metadata).
    """
    fileName = document.get("fileName", "")
    mimeType = document.get("mimeType", "")
    fileSize = document.get("fileSize", 0)
    documentType = document.get("documentType")
    metadata = document.get("metadata")

    # Per-format expectations: (accepted MIME types, accepted extensions).
    formatExpectations = {
        "pdf": (["application/pdf"], [".pdf"]),
        "docx": (["application/vnd.openxmlformats-officedocument.wordprocessingml.document"], [".docx"]),
        "xlsx": (["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"], [".xlsx"]),
        "pptx": (["application/vnd.openxmlformats-officedocument.presentationml.presentation"], [".pptx"])
    }
    expectedMimes, expectedExts = formatExpectations.get(expectedFormat.lower(), ([], []))

    # endswith with an empty tuple is False, matching "no known format".
    hasCorrectExtension = fileName.lower().endswith(tuple(expectedExts))
    hasCorrectMimeType = mimeType.lower() in [mime.lower() for mime in expectedMimes]
    hasValidSize = fileSize > 0
    hasDocumentType = documentType is not None
    hasMetadata = metadata is not None and isinstance(metadata, dict)

    isValid = hasCorrectExtension and hasValidSize and hasCorrectMimeType
    isComplete = isValid and hasDocumentType and hasMetadata

    return {
        "format": expectedFormat,
        "fileName": fileName,
        "mimeType": mimeType,
        "fileSize": fileSize,
        "documentType": documentType,
        "hasMetadata": hasMetadata,
        "hasCorrectExtension": hasCorrectExtension,
        "hasCorrectMimeType": hasCorrectMimeType,
        "hasValidSize": hasValidSize,
        "hasDocumentType": hasDocumentType,
        "isValid": isValid,
        "isComplete": isComplete
    }
|
||||
async def testAllFormats(self) -> Dict[str, Any]:
    """Generate and verify a document for each supported format.

    Runs docx, xlsx, pptx, and pdf in sequence (with a short delay
    between runs), attaches a verification result to each successful
    outcome, and returns the per-format result mapping. A failure in
    one format does not stop the remaining formats.
    """
    print("\n" + "="*80)
    print("TESTING DOCUMENT GENERATION IN DOCX, XLSX, PPTX, AND PDF FORMATS")
    print("="*80)

    results = {}

    for fmt in ("docx", "xlsx", "pptx", "pdf"):
        try:
            banner = "=" * 80
            print(f"\n{banner}")
            print(f"Testing {fmt.upper()} format...")
            print(f"{banner}")

            outcome = await self.generateDocumentInFormat(fmt)
            results[fmt] = outcome

            if not outcome.get("success"):
                error = outcome.get("error", "Unknown error")
                print(f"\n❌ {fmt.upper()} generation failed: {error}")
            else:
                documents = outcome.get("documents", [])
                if not documents:
                    print(f"\n⚠️ {fmt.upper()} generation completed but no documents found")
                else:
                    # Verify first document
                    verification = self.verifyDocumentFormat(documents[0], fmt)
                    outcome["verification"] = verification

                    print(f"\n✅ {fmt.upper()} generation successful!")
                    print(f"   Documents: {len(documents)}")
                    print(f"   Verification: {'✅ PASS' if verification['isValid'] else '❌ FAIL'}")
                    print(f"   Complete (with metadata): {'✅ YES' if verification['isComplete'] else '❌ NO'}")
                    if verification.get("fileName"):
                        print(f"   File: {verification['fileName']}")
                        print(f"   Size: {verification['fileSize']} bytes")
                        print(f"   MIME: {verification['mimeType']}")
                        print(f"   Document Type: {verification.get('documentType', 'N/A')}")
                        print(f"   Has Metadata: {'✅' if verification.get('hasMetadata') else '❌'}")

            # Small delay between tests
            await asyncio.sleep(2)

        except Exception as e:
            import traceback
            print(f"\n❌ Error testing {fmt.upper()}: {str(e)}")
            print(traceback.format_exc())
            results[fmt] = {
                "success": False,
                "error": str(e),
                "traceback": traceback.format_exc()
            }

    return results
|
||||
async def runTest(self):
    """Run the complete test: initialize, exercise every format, summarize.

    Stores the aggregated outcome in self.testResults and returns it.
    Any unexpected exception is captured into the result dict (with a
    traceback) rather than propagated.
    """
    print("\n" + "="*80)
    print("DOCUMENT GENERATION FORMATS TEST 10 - DOCX, XLSX, PPTX, PDF")
    print("="*80)

    try:
        # Initialize
        await self.initialize()

        # Test all formats
        formatResults = await self.testAllFormats()

        # Summary
        print("\n" + "="*80)
        print("TEST SUMMARY")
        print("="*80)

        print("\nFormat Tests:")
        successCount = 0
        failCount = 0
        completeCount = 0  # successful generations whose document carried metadata

        for fmt, outcome in formatResults.items():
            if not outcome.get("success"):
                failCount += 1
                error = outcome.get("error", "Unknown error")
                print(f"❌ {fmt.upper():6s}: FAIL - {error}")
                continue

            successCount += 1
            verification = outcome.get("verification", {})
            isValid = verification.get("isValid", False)
            isComplete = verification.get("isComplete", False)
            if isComplete:
                completeCount += 1
            statusIcon = "✅" if isValid else "⚠️"
            completeIcon = "✅" if isComplete else "❌"
            docCount = outcome.get("documentCount", 0)
            print(f"{statusIcon} {fmt.upper():6s}: {'PASS' if isValid else 'FAIL'} - {docCount} document(s) - Metadata: {completeIcon}")

        print(f"\nFormat Tests: {successCount} passed, {failCount} failed out of {len(formatResults)} formats")
        print(f"Complete Documents (with metadata): {completeCount} out of {successCount} successful generations")

        self.testResults = {
            "success": failCount == 0,
            "formatTests": {
                "successCount": successCount,
                "failCount": failCount,
                "completeCount": completeCount,
                "totalFormats": len(formatResults),
                "results": formatResults
            },
            "totalSuccess": successCount,
            "totalFail": failCount
        }
        return self.testResults

    except Exception as e:
        import traceback
        print(f"\n❌ Test failed with error: {type(e).__name__}: {str(e)}")
        print(f"Traceback:\n{traceback.format_exc()}")
        self.testResults = {
            "success": False,
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        return self.testResults
|
||||
|
||||
async def main():
    """Drive document generation formats test 10 and dump results as JSON."""
    tester = DocumentGenerationFormatsTester10()
    results = await tester.runTest()

    # Print final results as JSON for easy parsing
    divider = "=" * 80
    print("\n" + divider)
    print("FINAL RESULTS (JSON)")
    print(divider)
    print(json.dumps(results, indent=2, default=str))
|
||||
|
||||
# Script entry point: run the async test driver on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
|
||||
Loading…
Reference in a new issue