From 4f01a02b9faa3fbd670c263b26ac63707c506dc4 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Tue, 29 Jul 2025 18:36:34 +0200 Subject: [PATCH] validation not picky... --- app.py | 3 +- modules/chat/documents/documentExtraction.py | 80 +++++++++++--------- modules/chat/handling/handlingTasks.py | 19 ++++- modules/chat/handling/promptFactory.py | 52 ++++++++----- 4 files changed, 97 insertions(+), 57 deletions(-) diff --git a/app.py b/app.py index 81b1c9af..bfe82f8f 100644 --- a/app.py +++ b/app.py @@ -53,7 +53,8 @@ def initLogging(): 'response_closed.started', '_send_single_request', 'httpcore.http11', - 'httpx._client' + 'httpx._client', + 'HTTP Request' ] return not any(pattern in record.msg for pattern in http_debug_patterns) return True diff --git a/modules/chat/documents/documentExtraction.py b/modules/chat/documents/documentExtraction.py index 41588a62..0ad6188f 100644 --- a/modules/chat/documents/documentExtraction.py +++ b/modules/chat/documents/documentExtraction.py @@ -88,7 +88,7 @@ class DocumentExtraction: import PyPDF2 import fitz # PyMuPDF for more extensive PDF processing pdfExtractorLoaded = True - logger.debug("📄 PDF extraction libraries successfully loaded") + logger.debug("PDF extraction libraries successfully loaded") except ImportError as e: logger.warning(f"PDF extraction libraries could not be loaded: {e}") @@ -101,7 +101,7 @@ class DocumentExtraction: import docx # python-docx for Word documents import openpyxl # for Excel files officeExtractorLoaded = True - logger.debug("📄 Office extraction libraries successfully loaded") + logger.debug("Office extraction libraries successfully loaded") except ImportError as e: logger.warning(f"Office extraction libraries could not be loaded: {e}") @@ -113,7 +113,7 @@ class DocumentExtraction: global PIL, Image from PIL import Image imageProcessorLoaded = True - logger.debug("📄 Image processing libraries successfully loaded") + logger.debug("Image processing libraries successfully loaded") except ImportError as e: logger.warning(f"Image processing libraries could not be loaded: {e}") @@ -157,7 +157,7 @@ class DocumentExtraction: processedItems = await self._aiDataExtraction(contentItems, prompt) contentItems = processedItems except Exception as e: - logger.error(f"❌ Error processing content with AI: {str(e)}") + logger.error(f"Error processing content with AI: {str(e)}") return ExtractedContent( id=documentId if documentId else str(uuid.uuid4()), @@ -165,7 +165,7 @@ class DocumentExtraction: ) except Exception as e: - logger.error(f"❌ Error processing file data: {str(e)}") + logger.error(f"Error processing file data: {str(e)}") raise FileProcessingError(f"Failed to process file data: {str(e)}") @@ -187,7 +187,7 @@ class DocumentExtraction: ) )] except Exception as e: - logger.error(f"❌ Error processing text document: {str(e)}") + logger.error(f"Error processing text document: {str(e)}") raise FileProcessingError(f"Failed to process text document: {str(e)}") async def _processCsv(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]: @@ -206,7 +206,7 @@ class DocumentExtraction: ) )] except Exception as e: - logger.error(f"❌ Error processing CSV document: {str(e)}") + logger.error(f"Error processing CSV document: {str(e)}") raise FileProcessingError(f"Failed to process CSV document: {str(e)}") async def _processJson(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]: @@ -226,7 +226,7 @@ class DocumentExtraction: ) )] except Exception as e: - logger.error(f"❌ Error processing JSON document: {str(e)}") + logger.error(f"Error processing JSON document: {str(e)}") raise FileProcessingError(f"Failed to process JSON document: {str(e)}") async def _processXml(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]: @@ -245,7 +245,7 @@ class DocumentExtraction: ) )] except Exception as e: - logger.error(f"❌ Error processing XML document: {str(e)}") + logger.error(f"Error processing XML document: {str(e)}") raise FileProcessingError(f"Failed to process XML document: {str(e)}") async def _processHtml(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]: @@ -264,7 +264,7 @@ class DocumentExtraction: ) )] except Exception as e: - logger.error(f"❌ Error processing HTML document: {str(e)}") + logger.error(f"Error processing HTML document: {str(e)}") raise FileProcessingError(f"Failed to process HTML document: {str(e)}") async def _processSvg(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]: @@ -284,7 +284,7 @@ class DocumentExtraction: ) )] except Exception as e: - logger.error(f"❌ Error processing SVG document: {str(e)}") + logger.error(f"Error processing SVG document: {str(e)}") raise FileProcessingError(f"Failed to process SVG document: {str(e)}") async def _processImage(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]: @@ -315,7 +315,7 @@ class DocumentExtraction: metadata=metadata )] except Exception as e: - logger.error(f"❌ Error processing image document: {str(e)}") + logger.error(f"Error processing image document: {str(e)}") raise FileProcessingError(f"Failed to process image document: {str(e)}") async def _processPdf(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]: @@ -378,13 +378,13 @@ class DocumentExtraction: ) )) except Exception as imgE: - logger.warning(f"⚠️ Error extracting image {imgIndex} on page {pageNum + 1}: {str(imgE)}") + logger.warning(f"Error extracting image {imgIndex} on page {pageNum + 1}: {str(imgE)}") doc.close() return contentItems except Exception as e: - logger.error(f"❌ Error processing PDF document: {str(e)}") + logger.error(f"Error processing PDF document: {str(e)}") raise FileProcessingError(f"Failed to process PDF document: {str(e)}") async def _processDocx(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]: @@ -423,7 +423,7 @@ class DocumentExtraction: ) )] except Exception as e: - logger.error(f"❌ Error processing Word document: {str(e)}") + logger.error(f"Error processing Word document: {str(e)}") raise FileProcessingError(f"Failed to process Word document: {str(e)}") async def _processXlsx(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]: @@ -465,7 +465,7 @@ class DocumentExtraction: return contentItems except Exception as e: - logger.error(f"❌ Error processing Excel document: {str(e)}") + logger.error(f"Error processing Excel document: {str(e)}") raise FileProcessingError(f"Failed to process Excel document: {str(e)}") async def _processBinary(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]: @@ -482,7 +482,7 @@ class DocumentExtraction: ) )] except Exception as e: - logger.error(f"❌ Error processing binary document: {str(e)}") + logger.error(f"Error processing binary document: {str(e)}") raise FileProcessingError(f"Failed to process binary document: {str(e)}") async def _aiDataExtraction(self, contentItems: List[ContentItem], prompt: str) -> List[ContentItem]: @@ -502,7 +502,7 @@ class DocumentExtraction: try: # Get content type from metadata mimeType = item.metadata.mimeType if hasattr(item.metadata, 'mimeType') else "text/plain" - logger.debug(f"📄 Processing content item with MIME type: {mimeType}, label: {item.label}") + logger.debug(f"Processing content item with MIME type: {mimeType}, label: {item.label}") # Chunk content based on type if mimeType.startswith('text/'): @@ -527,12 +527,12 @@ class DocumentExtraction: for chunk in chunks: # Process with AI based on content type try: - logger.debug(f"🤖 AI processing chunk with MIME type: {mimeType}") + logger.debug(f"AI processing chunk with MIME type: {mimeType}") if mimeType.startswith('image/'): # For images, use image AI service with base64 data # chunk is already base64 encoded string from _processImage # Use the original prompt directly for images (no content embedding) - logger.debug(f"🤖 Calling image AI service for MIME type: {mimeType}") + logger.debug(f"Calling image AI service for MIME type: {mimeType}") processedContent = await self._serviceCenter.callAiImageBasic(prompt, chunk, mimeType) else: # For text content, use text AI service @@ -553,32 +553,40 @@ class DocumentExtraction: Return ONLY the extracted information in a clear, concise format. """ - logger.debug(f"🤖 Calling text AI service for MIME type: {mimeType}") + logger.debug(f"Calling text AI service for MIME type: {mimeType}") processedContent = await self._serviceCenter.callAiTextBasic(aiPrompt, contentToProcess) chunkResults.append(processedContent) except Exception as aiError: - logger.error(f"❌ AI processing failed for chunk: {str(aiError)}") - # Fallback to original content - chunkResults.append(chunk) + logger.error(f"AI processing failed for chunk: {str(aiError)}") + # For non-text content, don't fallback to binary data + if mimeType.startswith('image/') or mimeType.startswith('video/') or mimeType.startswith('audio/'): + logger.warning(f"Skipping binary content fallback for {mimeType}") + continue # Skip this chunk entirely + else: + # Only fallback to original content for text-based formats + chunkResults.append(chunk) # Combine chunk results combinedResult = "\n".join(chunkResults) - # Update content with AI processed data - processedItems.append(ContentItem( - label=item.label, - data=combinedResult, - metadata=ContentMetadata( - size=len(combinedResult.encode('utf-8')), - pages=1, - mimeType="text/plain", - base64Encoded=False - ) - )) + # Only add processed item if we have results + if combinedResult.strip(): + processedItems.append(ContentItem( + label=item.label, + data=combinedResult, + metadata=ContentMetadata( + size=len(combinedResult.encode('utf-8')), + pages=1, + mimeType="text/plain", + base64Encoded=False + ) + )) + else: + logger.warning(f"No processed content available for {item.label}, skipping item") except Exception as e: - logger.error(f"❌ Error processing content chunk: {str(e)}") + logger.error(f"Error processing content chunk: {str(e)}") # Add original content if processing fails processedItems.append(item) diff --git a/modules/chat/handling/handlingTasks.py b/modules/chat/handling/handlingTasks.py index c78ce995..d8ef85a6 100644 --- a/modules/chat/handling/handlingTasks.py +++ b/modules/chat/handling/handlingTasks.py @@ -251,7 +251,7 @@ class HandlingTasks: if not isinstance(unmet_criteria, list): unmet_criteria = [] - return ReviewResult( + review_result = ReviewResult( status=review.get('status', 'unknown'), reason=review.get('reason', 'No reason provided'), improvements=improvements, @@ -261,6 +261,23 @@ class HandlingTasks: unmet_criteria=unmet_criteria, confidence=review.get('confidence', 0.5) ) + + # Enhanced validation logging + logger.info(f"VALIDATION RESULT - Task: '{task_step.description}' - Status: {review_result.status.upper()}, Quality: {review_result.quality_score}/10") + if review_result.status == 'success': + logger.info(f"VALIDATION SUCCESS - Task completed successfully") + if review_result.met_criteria: + logger.info(f"Met criteria: {', '.join(review_result.met_criteria)}") + elif review_result.status == 'retry': + logger.warning(f"VALIDATION RETRY - Task requires retry: {review_result.improvements}") + if review_result.unmet_criteria: + logger.warning(f"Unmet criteria: {', '.join(review_result.unmet_criteria)}") + else: + logger.error(f"VALIDATION FAILED - Task failed: {review_result.reason}") + if review_result.missing_outputs: + logger.error(f"Missing outputs: {', '.join(review_result.missing_outputs)}") + + return review_result except Exception as e: logger.error(f"Error in reviewTaskCompletion: {str(e)}") return ReviewResult( diff --git a/modules/chat/handling/promptFactory.py b/modules/chat/handling/promptFactory.py index 81863074..ade811b7 100644 --- a/modules/chat/handling/promptFactory.py +++ b/modules/chat/handling/promptFactory.py @@ -369,7 +369,7 @@ async def createResultReviewPrompt(self, review_context) -> str: step_result_json = json.dumps(step_result_serializable, indent=2, ensure_ascii=False) expected_outputs_str = ', '.join(task_step.expected_outputs or []) success_criteria_str = ', '.join(task_step.success_criteria or []) - return f"""You are a result review AI that evaluates task step completion and decides on next actions. + return f"""You are a result review AI that evaluates task step completion with BASIC validation. TASK STEP: {task_step.description} EXPECTED OUTPUTS: {expected_outputs_str} @@ -377,29 +377,43 @@ SUCCESS CRITERIA: {success_criteria_str} STEP RESULT: {step_result_json} -INSTRUCTIONS: -1. Evaluate if the task step was completed successfully -2. Check if all expected outputs were produced -3. Verify if success criteria were met -4. Decide on next action: continue, retry, or fail -5. If retry, provide specific improvements needed +BASIC VALIDATION RULES: +1. SUCCESS if: Action completed AND (documents were produced OR meaningful text output exists) +2. RETRY if: Action failed due to technical issues that can be fixed +3. FAILED if: Action completely failed with no recoverable output -IMPORTANT NOTES: -- Actions can produce either text results OR documents (or both) -- Empty result_summary is acceptable if documents were produced (documents_count > 0) -- Focus on whether the action achieved its intended purpose, not just text output -- Document-based actions (like file extractions) often have empty text results but successful document outputs -- Check the 'success_indicator' field: 'documents' means success via document output, 'text_result' means success via text, 'none' means no output +VALIDATION PRINCIPLES: +- Be GENEROUS with success - if the action achieved its basic purpose, mark as success +- Focus on FUNCTIONALITY, not perfection +- Document outputs are PRIMARY indicators of success +- Text outputs are SECONDARY indicators +- Only retry for CLEAR technical issues, not minor imperfections +- Don't be picky about formatting or minor details + +EXAMPLES OF SUCCESS: +- Document extraction produced a file (even if imperfect) +- Text analysis provided meaningful insights +- Data processing completed with results + +EXAMPLES OF RETRY: +- Technical errors (API failures, timeouts) +- Missing required inputs +- Clear implementation bugs + +EXAMPLES OF FAILED: +- Complete system failures +- No output whatsoever +- Unrecoverable errors REQUIRED JSON STRUCTURE: {{ "status": "success|retry|failed", - "reason": "Explanation of the decision", - "improvements": "Specific improvements for retry (if status is retry)", + "reason": "Brief explanation", + "improvements": ["specific technical fixes only"], "quality_score": 1-10, - "missing_outputs": ["output1", "output2"], - "met_criteria": ["criteria1", "criteria2"], - "unmet_criteria": ["criteria3", "criteria4"] + "missing_outputs": [], + "met_criteria": ["basic functionality achieved"], + "unmet_criteria": [] }} -NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" \ No newline at end of file +NOTE: Respond with ONLY the JSON object. Be GENEROUS with success ratings.""" \ No newline at end of file