From a2315d6ace26832ad225db782d03904d6d5f91f5 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Thu, 25 Dec 2025 00:34:45 +0100
Subject: [PATCH] fixed vision for pre-extracted content

---
Review notes (this section is ignored by git am):
- The line structure and Python indentation of this patch were reconstructed
  from a whitespace-collapsed copy. The indentation depth of the context lines
  is a best guess; verify it against the target file or apply with
  `git apply --ignore-whitespace`. The post-image hash on the `index` line
  predates the changes below.
- Images with an "extract" intent are no longer dropped when Vision AI returns
  no text: _extractTextFromImage returns None instead of raising, so the
  caller now falls back to adding the image part whenever no text part was
  produced.
- _extractTextFromImage returns None (not "") for whitespace-only responses.
- Comments on added lines are in English; context lines keep the wording of
  the target file.
- The last hunk awaits a coroutine, so the enclosing function must be async.

 modules/services/serviceAi/mainServiceAi.py | 106 ++++++++++++++++++--
 1 file changed, 97 insertions(+), 9 deletions(-)

diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index 331a3289..74b90346 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -1135,7 +1135,8 @@ If no trackable items can be identified, return: {{"kpis": []}}
                             fileName=preExtracted["originalDocument"]["fileName"],
                             mimeType=preExtracted["originalDocument"]["mimeType"],
                             fileSize=preExtracted["originalDocument"].get("fileSize", doc.fileSize),
-                            fileId=doc.fileId  # Behalte fileId vom JSON
+                            fileId=doc.fileId,  # Keep the fileId of the JSON document
+                            messageId=getattr(doc, "messageId", None)  # Keep messageId if present
                         )
                         resolvedDocuments.append(originalDoc)
                     else:
@@ -1264,6 +1265,39 @@ If no trackable items can be identified, return: {{"kpis": []}}
             logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
             return None
 
+    async def _extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
+        """
+        Extract text from an image part using Vision AI.
+
+        Args:
+            imagePart: ContentPart with typeGroup="image"
+            extractionPrompt: Prompt for the text extraction; a generic prompt is used when empty
+
+        Returns:
+            The extracted text, or None on error or when nothing was extracted
+        """
+        try:
+            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
+
+            # Build the AI call request around the image part
+            request = AiCallRequest(
+                prompt=extractionPrompt or "Extract all text content from this image. Return only the extracted text, no additional formatting.",
+                context="",
+                options=AiCallOptions(operationType="extraction"),
+                contentParts=[imagePart]
+            )
+
+            # Let the AI service process the request
+            response = await self.services.ai.call(request)
+
+            if response and response.content:
+                return response.content.strip() or None
+
+            return None
+        except Exception as e:
+            logger.warning(f"Error extracting text from image {imagePart.id}: {str(e)}")
+            return None
+
     def _buildIntentAnalysisPrompt(
         self,
         userPrompt: str,
@@ -1420,12 +1454,22 @@ Return ONLY valid JSON following the structure above.
                         # Wenn Intent "render" für Images hat, erstelle auch object Part
                         if "render" in partIntent and part.typeGroup == "image" and part.data:
                             # Image-Part mit render Intent: Erstelle sowohl extracted als auch object Part
-                            # 1. Extracted Part (bereits vorhanden)
-                            part.metadata["intent"] = "extract"
-                            part.metadata["fromExtractContent"] = True
-                            part.metadata["skipExtraction"] = True
-                            part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
-                            allContentParts.append(part)
+                            # 1. Extracted part - depends on whether an "extract" intent is present
+                            if "extract" in partIntent:
+                                # Image has both the extract and the render intent
+                                # Extracted part: processed by Vision AI later (skipExtraction=False)
+                                part.metadata["intent"] = "extract"
+                                part.metadata["fromExtractContent"] = True
+                                part.metadata["skipExtraction"] = False  # IMPORTANT: still needs Vision AI processing
+                                part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
+                                allContentParts.append(part)
+                            else:
+                                # Render intent only - no text extraction needed
+                                part.metadata["intent"] = "render"
+                                part.metadata["fromExtractContent"] = True
+                                part.metadata["skipExtraction"] = True
+                                part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
+                                allContentParts.append(part)
 
                             # 2. Object Part für Rendering (base64 data ist bereits im extracted Part)
                             objectPart = ContentPart(
@@ -1444,10 +1488,54 @@ Return ONLY valid JSON following the structure above.
                                 }
                             )
                             allContentParts.append(objectPart)
+                        elif part.typeGroup == "image" and "extract" in partIntent:
+                            # Image with "extract" intent: Vision AI has to read the text from it
+                            textExtracted = False
+                            try:
+                                imagePrompt = intent.extractionPrompt if intent else "Extract all text content from this image"
+                                extractedText = await self._extractTextFromImage(part, imagePrompt)
+                                if extractedText:
+                                    # Create a new text part carrying the extracted text
+                                    textPart = ContentPart(
+                                        id=f"extracted_{part.id}",
+                                        label=f"Extracted text from {part.label or 'Image'}",
+                                        typeGroup="text",
+                                        mimeType="text/plain",
+                                        data=extractedText,
+                                        metadata={
+                                            "contentFormat": "extracted",
+                                            "documentId": document.id,
+                                            "intent": "extract",
+                                            "originalFileName": preExtracted["originalDocument"]["fileName"],
+                                            "relatedImagePartId": part.id,
+                                            "extractionPrompt": imagePrompt
+                                        }
+                                    )
+                                    allContentParts.append(textPart)
+                                    textExtracted = True
+                                    logger.info(f"✅ Extracted text from image {part.id} using Vision AI")
+
+                                    # If a render intent is present as well, also keep the image part
+                                    if "render" in partIntent:
+                                        part.metadata["intent"] = "render"
+                                        part.metadata["fromExtractContent"] = True
+                                        part.metadata["skipExtraction"] = True
+                                        part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
+                                        allContentParts.append(part)
+                            except Exception as e:
+                                logger.warning(f"Failed to extract text from image {part.id}: {str(e)}, adding image as-is")
+                            if not textExtracted:
+                                # Fallback: _extractTextFromImage returns None instead of raising, so an
+                                # empty result must keep the image part as well, otherwise it is lost
+                                part.metadata["intent"] = "extract"
+                                part.metadata["fromExtractContent"] = True
+                                part.metadata["skipExtraction"] = False
+                                part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
+                                allContentParts.append(part)
                         else:
-                            # Normales extracted Part
+                            # Regular extracted part (not an image, or no extract intent)
                             part.metadata["intent"] = partIntent[0] if partIntent else "extract"
                             part.metadata["fromExtractContent"] = True
-                            part.metadata["skipExtraction"] = True
+                            part.metadata["skipExtraction"] = True  # Already extracted
                             part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
                             allContentParts.append(part)