validation not picky...
parent 12fd5e0a41
commit 4f01a02b9f
4 changed files with 97 additions and 57 deletions
app.py
@@ -53,7 +53,8 @@ def initLogging():
                'response_closed.started',
                '_send_single_request',
                'httpcore.http11',
-                'httpx._client'
+                'httpx._client',
+                'HTTP Request'
            ]
            return not any(pattern in record.msg for pattern in http_debug_patterns)
        return True
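The pattern list above feeds a logging filter. As a hedged, self-contained sketch of how such a filter is typically wired up (the class name and the root-logger attachment are assumptions, not part of this commit):

import logging

class HttpDebugFilter(logging.Filter):
    """Drops log records matching known httpx/httpcore debug chatter."""

    HTTP_DEBUG_PATTERNS = [
        'response_closed.started',
        '_send_single_request',
        'httpcore.http11',
        'httpx._client',
        'HTTP Request',
    ]

    def filter(self, record: logging.LogRecord) -> bool:
        if isinstance(record.msg, str):
            # Returning False suppresses the record.
            return not any(p in record.msg for p in self.HTTP_DEBUG_PATTERNS)
        return True

logging.getLogger().addFilter(HttpDebugFilter())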
@@ -88,7 +88,7 @@ class DocumentExtraction:
            import PyPDF2
            import fitz  # PyMuPDF for more extensive PDF processing
            pdfExtractorLoaded = True
-            logger.debug("📄 PDF extraction libraries successfully loaded")
+            logger.debug("PDF extraction libraries successfully loaded")
        except ImportError as e:
            logger.warning(f"PDF extraction libraries could not be loaded: {e}")
@@ -101,7 +101,7 @@ class DocumentExtraction:
            import docx  # python-docx for Word documents
            import openpyxl  # for Excel files
            officeExtractorLoaded = True
-            logger.debug("📄 Office extraction libraries successfully loaded")
+            logger.debug("Office extraction libraries successfully loaded")
        except ImportError as e:
            logger.warning(f"Office extraction libraries could not be loaded: {e}")
@@ -113,7 +113,7 @@ class DocumentExtraction:
            global PIL, Image
            from PIL import Image
            imageProcessorLoaded = True
-            logger.debug("📄 Image processing libraries successfully loaded")
+            logger.debug("Image processing libraries successfully loaded")
        except ImportError as e:
            logger.warning(f"Image processing libraries could not be loaded: {e}")
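The three hunks above all follow the same optional-dependency pattern: import lazily, set a module-level flag, and warn instead of crashing when a backend is missing. A minimal sketch of that pattern, with names assumed from the context shown:

import logging

logger = logging.getLogger(__name__)
pdfExtractorLoaded = False

def loadPdfBackend() -> bool:
    """Lazily import the PDF backends and record their availability."""
    global pdfExtractorLoaded
    try:
        import PyPDF2  # noqa: F401
        import fitz    # noqa: F401 -- PyMuPDF
        pdfExtractorLoaded = True
        logger.debug("PDF extraction libraries successfully loaded")
    except ImportError as e:
        logger.warning(f"PDF extraction libraries could not be loaded: {e}")
    return pdfExtractorLoaded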
@@ -157,7 +157,7 @@ class DocumentExtraction:
                processedItems = await self._aiDataExtraction(contentItems, prompt)
                contentItems = processedItems
            except Exception as e:
-                logger.error(f"❌ Error processing content with AI: {str(e)}")
+                logger.error(f"Error processing content with AI: {str(e)}")

            return ExtractedContent(
                id=documentId if documentId else str(uuid.uuid4()),
@@ -165,7 +165,7 @@ class DocumentExtraction:
            )

        except Exception as e:
-            logger.error(f"❌ Error processing file data: {str(e)}")
+            logger.error(f"Error processing file data: {str(e)}")
            raise FileProcessingError(f"Failed to process file data: {str(e)}")

@@ -187,7 +187,7 @@ class DocumentExtraction:
                )
            )]
        except Exception as e:
-            logger.error(f"❌ Error processing text document: {str(e)}")
+            logger.error(f"Error processing text document: {str(e)}")
            raise FileProcessingError(f"Failed to process text document: {str(e)}")

    async def _processCsv(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]:
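Every _process* method touched in this diff shares the same error contract: log a plain (now emoji-free) message, then re-raise as FileProcessingError. Distilled as a standalone sketch — the decode step is illustrative, not the commit's exact body:

import logging

logger = logging.getLogger(__name__)

class FileProcessingError(Exception):
    """Raised when a document cannot be converted to content items."""

def processTextDocument(fileData: bytes) -> str:
    try:
        return fileData.decode('utf-8')
    except Exception as e:
        logger.error(f"Error processing text document: {str(e)}")
        raise FileProcessingError(f"Failed to process text document: {str(e)}") from e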
@@ -206,7 +206,7 @@ class DocumentExtraction:
                )
            )]
        except Exception as e:
-            logger.error(f"❌ Error processing CSV document: {str(e)}")
+            logger.error(f"Error processing CSV document: {str(e)}")
            raise FileProcessingError(f"Failed to process CSV document: {str(e)}")

    async def _processJson(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]:
@@ -226,7 +226,7 @@ class DocumentExtraction:
                )
            )]
        except Exception as e:
-            logger.error(f"❌ Error processing JSON document: {str(e)}")
+            logger.error(f"Error processing JSON document: {str(e)}")
            raise FileProcessingError(f"Failed to process JSON document: {str(e)}")

    async def _processXml(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]:
@@ -245,7 +245,7 @@ class DocumentExtraction:
                )
            )]
        except Exception as e:
-            logger.error(f"❌ Error processing XML document: {str(e)}")
+            logger.error(f"Error processing XML document: {str(e)}")
            raise FileProcessingError(f"Failed to process XML document: {str(e)}")

    async def _processHtml(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]:
@@ -264,7 +264,7 @@ class DocumentExtraction:
                )
            )]
        except Exception as e:
-            logger.error(f"❌ Error processing HTML document: {str(e)}")
+            logger.error(f"Error processing HTML document: {str(e)}")
            raise FileProcessingError(f"Failed to process HTML document: {str(e)}")

    async def _processSvg(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]:
@@ -284,7 +284,7 @@ class DocumentExtraction:
                )
            )]
        except Exception as e:
-            logger.error(f"❌ Error processing SVG document: {str(e)}")
+            logger.error(f"Error processing SVG document: {str(e)}")
            raise FileProcessingError(f"Failed to process SVG document: {str(e)}")

    async def _processImage(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]:
@@ -315,7 +315,7 @@ class DocumentExtraction:
                metadata=metadata
            )]
        except Exception as e:
-            logger.error(f"❌ Error processing image document: {str(e)}")
+            logger.error(f"Error processing image document: {str(e)}")
            raise FileProcessingError(f"Failed to process image document: {str(e)}")

    async def _processPdf(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]:
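For orientation, a plausible shape for the image path referenced here: validate the bytes with PIL, then pass them on base64-encoded, since the AI chunk handler later in this diff expects base64 image data. This is an assumption about the surrounding code, not a copy of it:

import base64
import io
from PIL import Image

def encodeImageForAi(fileData: bytes) -> str:
    """Validate the image bytes, then return them base64-encoded."""
    img = Image.open(io.BytesIO(fileData))
    img.verify()  # raises on corrupt image data
    return base64.b64encode(fileData).decode('ascii')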
@@ -378,13 +378,13 @@ class DocumentExtraction:
                            )
                        ))
                    except Exception as imgE:
-                        logger.warning(f"⚠️ Error extracting image {imgIndex} on page {pageNum + 1}: {str(imgE)}")
+                        logger.warning(f"Error extracting image {imgIndex} on page {pageNum + 1}: {str(imgE)}")

            doc.close()

            return contentItems
        except Exception as e:
-            logger.error(f"❌ Error processing PDF document: {str(e)}")
+            logger.error(f"Error processing PDF document: {str(e)}")
            raise FileProcessingError(f"Failed to process PDF document: {str(e)}")

    async def _processDocx(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]:
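The warning in this hunk sits inside a PyMuPDF image-extraction loop. A hedged reconstruction of that loop using the public fitz API — imgIndex and pageNum come from the log message, the rest is assumed:

import logging
import fitz  # PyMuPDF

logger = logging.getLogger(__name__)

def extractPdfImages(fileData: bytes) -> list:
    """Extract every embedded image from a PDF as raw bytes."""
    doc = fitz.open(stream=fileData, filetype="pdf")
    images = []
    for pageNum in range(len(doc)):
        for imgIndex, img in enumerate(doc.get_page_images(pageNum, full=True)):
            try:
                xref = img[0]
                images.append(doc.extract_image(xref)["image"])
            except Exception as imgE:
                logger.warning(f"Error extracting image {imgIndex} on page {pageNum + 1}: {str(imgE)}")
    doc.close()
    return images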
@ -423,7 +423,7 @@ class DocumentExtraction:
|
|||
)
|
||||
)]
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Error processing Word document: {str(e)}")
|
||||
logger.error(f"Error processing Word document: {str(e)}")
|
||||
raise FileProcessingError(f"Failed to process Word document: {str(e)}")
|
||||
|
||||
async def _processXlsx(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]:
|
||||
|
|
@ -465,7 +465,7 @@ class DocumentExtraction:
|
|||
|
||||
return contentItems
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Error processing Excel document: {str(e)}")
|
||||
logger.error(f"Error processing Excel document: {str(e)}")
|
||||
raise FileProcessingError(f"Failed to process Excel document: {str(e)}")
|
||||
|
||||
async def _processBinary(self, fileData: bytes, filename: str, mimeType: str) -> List[ContentItem]:
|
||||
|
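A hedged sketch of what an _processXlsx-style reader usually does with openpyxl: load the workbook from bytes and flatten each sheet to text. The exact output shape in app.py is not shown in this diff:

import io
import openpyxl

def readXlsxAsText(fileData: bytes) -> str:
    """Flatten every worksheet to tab-separated lines."""
    wb = openpyxl.load_workbook(io.BytesIO(fileData), read_only=True, data_only=True)
    lines = []
    for ws in wb.worksheets:
        lines.append(f"# Sheet: {ws.title}")
        for row in ws.iter_rows(values_only=True):
            lines.append("\t".join("" if v is None else str(v) for v in row))
    return "\n".join(lines)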
@@ -482,7 +482,7 @@ class DocumentExtraction:
                )
            )]
        except Exception as e:
-            logger.error(f"❌ Error processing binary document: {str(e)}")
+            logger.error(f"Error processing binary document: {str(e)}")
            raise FileProcessingError(f"Failed to process binary document: {str(e)}")

    async def _aiDataExtraction(self, contentItems: List[ContentItem], prompt: str) -> List[ContentItem]:
@@ -502,7 +502,7 @@ class DocumentExtraction:
            try:
                # Get content type from metadata
                mimeType = item.metadata.mimeType if hasattr(item.metadata, 'mimeType') else "text/plain"
-                logger.debug(f"📄 Processing content item with MIME type: {mimeType}, label: {item.label}")
+                logger.debug(f"Processing content item with MIME type: {mimeType}, label: {item.label}")

                # Chunk content based on type
                if mimeType.startswith('text/'):
@@ -527,12 +527,12 @@ class DocumentExtraction:
                for chunk in chunks:
                    # Process with AI based on content type
                    try:
-                        logger.debug(f"🤖 AI processing chunk with MIME type: {mimeType}")
+                        logger.debug(f"AI processing chunk with MIME type: {mimeType}")
                        if mimeType.startswith('image/'):
                            # For images, use image AI service with base64 data
                            # chunk is already base64 encoded string from _processImage
                            # Use the original prompt directly for images (no content embedding)
-                            logger.debug(f"🤖 Calling image AI service for MIME type: {mimeType}")
+                            logger.debug(f"Calling image AI service for MIME type: {mimeType}")
                            processedContent = await self._serviceCenter.callAiImageBasic(prompt, chunk, mimeType)
                        else:
                            # For text content, use text AI service
@@ -553,32 +553,40 @@ class DocumentExtraction:
                            Return ONLY the extracted information in a clear, concise format.
                            """

-                            logger.debug(f"🤖 Calling text AI service for MIME type: {mimeType}")
+                            logger.debug(f"Calling text AI service for MIME type: {mimeType}")
                            processedContent = await self._serviceCenter.callAiTextBasic(aiPrompt, contentToProcess)

                        chunkResults.append(processedContent)
                    except Exception as aiError:
-                        logger.error(f"❌ AI processing failed for chunk: {str(aiError)}")
-                        # Fallback to original content
-                        chunkResults.append(chunk)
+                        logger.error(f"AI processing failed for chunk: {str(aiError)}")
+                        # For non-text content, don't fallback to binary data
+                        if mimeType.startswith('image/') or mimeType.startswith('video/') or mimeType.startswith('audio/'):
+                            logger.warning(f"Skipping binary content fallback for {mimeType}")
+                            continue  # Skip this chunk entirely
+                        else:
+                            # Only fallback to original content for text-based formats
+                            chunkResults.append(chunk)

                # Combine chunk results
                combinedResult = "\n".join(chunkResults)

-                # Update content with AI processed data
-                processedItems.append(ContentItem(
-                    label=item.label,
-                    data=combinedResult,
-                    metadata=ContentMetadata(
-                        size=len(combinedResult.encode('utf-8')),
-                        pages=1,
-                        mimeType="text/plain",
-                        base64Encoded=False
-                    )
-                ))
+                # Only add processed item if we have results
+                if combinedResult.strip():
+                    processedItems.append(ContentItem(
+                        label=item.label,
+                        data=combinedResult,
+                        metadata=ContentMetadata(
+                            size=len(combinedResult.encode('utf-8')),
+                            pages=1,
+                            mimeType="text/plain",
+                            base64Encoded=False
+                        )
+                    ))
+                else:
+                    logger.warning(f"No processed content available for {item.label}, skipping item")

            except Exception as e:
-                logger.error(f"❌ Error processing content chunk: {str(e)}")
+                logger.error(f"Error processing content chunk: {str(e)}")
                # Add original content if processing fails
                processedItems.append(item)
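The behavioral core of this hunk, isolated: on AI failure, text chunks fall back to their original content, while image/video/audio chunks are dropped so raw base64 never leaks into the combined result. A standalone sketch under those assumptions (function names are illustrative):

def applyFallbackPolicy(chunks, mimeType, processChunk):
    """Process chunks, falling back to originals only for text-like content."""
    results = []
    for chunk in chunks:
        try:
            results.append(processChunk(chunk))
        except Exception:
            if mimeType.startswith(('image/', 'video/', 'audio/')):
                continue  # never emit binary payloads as "processed" text
            results.append(chunk)
    return results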
@@ -251,7 +251,7 @@ class HandlingTasks:
            if not isinstance(unmet_criteria, list):
                unmet_criteria = []

-            return ReviewResult(
+            review_result = ReviewResult(
                status=review.get('status', 'unknown'),
                reason=review.get('reason', 'No reason provided'),
                improvements=improvements,
@@ -261,6 +261,23 @@ class HandlingTasks:
                unmet_criteria=unmet_criteria,
                confidence=review.get('confidence', 0.5)
            )
+
+            # Enhanced validation logging
+            logger.info(f"VALIDATION RESULT - Task: '{task_step.description}' - Status: {review_result.status.upper()}, Quality: {review_result.quality_score}/10")
+            if review_result.status == 'success':
+                logger.info(f"VALIDATION SUCCESS - Task completed successfully")
+                if review_result.met_criteria:
+                    logger.info(f"Met criteria: {', '.join(review_result.met_criteria)}")
+            elif review_result.status == 'retry':
+                logger.warning(f"VALIDATION RETRY - Task requires retry: {review_result.improvements}")
+                if review_result.unmet_criteria:
+                    logger.warning(f"Unmet criteria: {', '.join(review_result.unmet_criteria)}")
+            else:
+                logger.error(f"VALIDATION FAILED - Task failed: {review_result.reason}")
+                if review_result.missing_outputs:
+                    logger.error(f"Missing outputs: {', '.join(review_result.missing_outputs)}")
+
+            return review_result
        except Exception as e:
            logger.error(f"Error in reviewTaskCompletion: {str(e)}")
            return ReviewResult(
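For reference, a plausible ReviewResult shape inferred from the fields this hunk reads and writes; the actual definition lives elsewhere in the codebase and may differ:

from dataclasses import dataclass, field
from typing import List

@dataclass
class ReviewResult:
    status: str                  # 'success' | 'retry' | 'failed'
    reason: str
    improvements: List[str] = field(default_factory=list)
    quality_score: int = 5
    missing_outputs: List[str] = field(default_factory=list)
    met_criteria: List[str] = field(default_factory=list)
    unmet_criteria: List[str] = field(default_factory=list)
    confidence: float = 0.5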
@@ -369,7 +369,7 @@ async def createResultReviewPrompt(self, review_context) -> str:
        step_result_json = json.dumps(step_result_serializable, indent=2, ensure_ascii=False)
        expected_outputs_str = ', '.join(task_step.expected_outputs or [])
        success_criteria_str = ', '.join(task_step.success_criteria or [])
-        return f"""You are a result review AI that evaluates task step completion and decides on next actions.
+        return f"""You are a result review AI that evaluates task step completion with BASIC validation.

TASK STEP: {task_step.description}
EXPECTED OUTPUTS: {expected_outputs_str}
@@ -377,29 +377,43 @@ SUCCESS CRITERIA: {success_criteria_str}

STEP RESULT: {step_result_json}

-INSTRUCTIONS:
-1. Evaluate if the task step was completed successfully
-2. Check if all expected outputs were produced
-3. Verify if success criteria were met
-4. Decide on next action: continue, retry, or fail
-5. If retry, provide specific improvements needed
+BASIC VALIDATION RULES:
+1. SUCCESS if: Action completed AND (documents were produced OR meaningful text output exists)
+2. RETRY if: Action failed due to technical issues that can be fixed
+3. FAILED if: Action completely failed with no recoverable output

-IMPORTANT NOTES:
-- Actions can produce either text results OR documents (or both)
-- Empty result_summary is acceptable if documents were produced (documents_count > 0)
-- Focus on whether the action achieved its intended purpose, not just text output
-- Document-based actions (like file extractions) often have empty text results but successful document outputs
-- Check the 'success_indicator' field: 'documents' means success via document output, 'text_result' means success via text, 'none' means no output
+VALIDATION PRINCIPLES:
+- Be GENEROUS with success - if the action achieved its basic purpose, mark as success
+- Focus on FUNCTIONALITY, not perfection
+- Document outputs are PRIMARY indicators of success
+- Text outputs are SECONDARY indicators
+- Only retry for CLEAR technical issues, not minor imperfections
+- Don't be picky about formatting or minor details
+
+EXAMPLES OF SUCCESS:
+- Document extraction produced a file (even if imperfect)
+- Text analysis provided meaningful insights
+- Data processing completed with results
+
+EXAMPLES OF RETRY:
+- Technical errors (API failures, timeouts)
+- Missing required inputs
+- Clear implementation bugs
+
+EXAMPLES OF FAILED:
+- Complete system failures
+- No output whatsoever
+- Unrecoverable errors

REQUIRED JSON STRUCTURE:
{{
    "status": "success|retry|failed",
-    "reason": "Explanation of the decision",
-    "improvements": "Specific improvements for retry (if status is retry)",
+    "reason": "Brief explanation",
+    "improvements": ["specific technical fixes only"],
    "quality_score": 1-10,
-    "missing_outputs": ["output1", "output2"],
-    "met_criteria": ["criteria1", "criteria2"],
-    "unmet_criteria": ["criteria3", "criteria4"]
+    "missing_outputs": [],
+    "met_criteria": ["basic functionality achieved"],
+    "unmet_criteria": []
}}

-NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
+NOTE: Respond with ONLY the JSON object. Be GENEROUS with success ratings."""
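Closing the loop, a hedged sketch of how the JSON reply to this prompt is consumed, mirroring the review.get(...) defaults visible in the HandlingTasks hunk above (the function name is illustrative):

import json

def parseReviewReply(raw: str) -> dict:
    """Parse the model's JSON reply, applying the caller's defaults."""
    try:
        review = json.loads(raw)
    except json.JSONDecodeError:
        review = {}
    return {
        'status': review.get('status', 'unknown'),
        'reason': review.get('reason', 'No reason provided'),
        'confidence': review.get('confidence', 0.5),
    }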