Commit 0bc71c99d5 (parent 0c357dc8a9): multi-document output implemented.
6 changed files with 1,448 additions and 50 deletions.
|
|
@ -649,6 +649,11 @@ class AiService:
|
|||
|
||||
for part in ec.parts:
|
||||
if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
|
||||
# Skip empty container chunks (they're just metadata containers)
|
||||
if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
|
||||
logger.debug(f"Skipping empty container chunk: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
|
||||
continue
|
||||
|
||||
chunks_to_process.append({
|
||||
'part': part,
|
||||
'chunk_index': chunk_index,
|
||||
|
|
@ -764,7 +769,14 @@ class AiService:
|
|||
elif part.typeGroup in ("container", "binary"):
|
||||
# Handle ALL container and binary content generically - let AI process any document type
|
||||
self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: typeGroup={part.typeGroup}, mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
|
||||
if part.mimeType and part.data and len(part.data.strip()) > 0:
|
||||
|
||||
# Skip empty container chunks (they're just metadata containers)
|
||||
if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
|
||||
self.services.utils.debugLogToFile(f"DEBUG: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
|
||||
logger.info(f"Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
|
||||
# Skip processing this chunk
|
||||
pass
|
||||
elif part.mimeType and part.data and len(part.data.strip()) > 0:
|
||||
# Process any document container as text content
|
||||
request_options = options if options is not None else AiCallOptions()
|
||||
request_options.operationType = OperationType.GENERAL
|
||||
|
|
@ -869,12 +881,19 @@ class AiService:
|
|||
# Log extraction context length
|
||||
self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
|
||||
|
||||
# Debug: Log the actual prompt being sent to AI
|
||||
logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
|
||||
logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")
|
||||
|
||||
request = AiCallRequest(
|
||||
prompt=prompt,
|
||||
context=part.data,
|
||||
options=request_options
|
||||
)
|
||||
response = await self.aiObjects.call(request)
|
||||
|
||||
# Debug: Log what AI actually returned
|
||||
logger.debug(f"AI RESPONSE PREVIEW: {response.content[:300] if response.content else 'None'}...")
|
||||
ai_result = response.content
|
||||
|
||||
# Log extraction response length
|
||||
|
|
@ -900,16 +919,20 @@ class AiService:
|
|||
import json
|
||||
import re
|
||||
|
||||
# Clean the response - remove markdown code blocks if present
|
||||
# Clean the response - remove markdown code blocks and extra formatting
|
||||
cleaned_result = ai_result.strip()
|
||||
if cleaned_result.startswith('```json'):
|
||||
# Remove ```json from start and ``` from end
|
||||
cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
|
||||
cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
|
||||
elif cleaned_result.startswith('```'):
|
||||
# Remove ``` from start and end
|
||||
cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
|
||||
cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
|
||||
|
||||
# Remove any markdown code block markers (```json, ```, etc.)
|
||||
cleaned_result = re.sub(r'^```(?:json)?\s*', '', cleaned_result, flags=re.MULTILINE)
|
||||
cleaned_result = re.sub(r'\s*```\s*$', '', cleaned_result, flags=re.MULTILINE)
|
||||
|
||||
# Remove any remaining ``` markers anywhere in the text
|
||||
cleaned_result = re.sub(r'```', '', cleaned_result)
|
||||
|
||||
# Try to extract JSON from the response if it's embedded in other text
|
||||
json_match = re.search(r'\{.*\}', cleaned_result, re.DOTALL)
|
||||
if json_match:
|
||||
cleaned_result = json_match.group(0)
|
||||
|
||||
# Validate JSON
|
||||
json.loads(cleaned_result)
|
||||
|
|
@ -1193,7 +1216,13 @@ class AiService:
|
|||
# Parse JSON from AI result
|
||||
chunk_json = json.loads(chunk_result.aiResult)
|
||||
|
||||
# Extract sections from this chunk
|
||||
# Check if this is a multi-file response (has "documents" key)
|
||||
if isinstance(chunk_json, dict) and "documents" in chunk_json:
|
||||
# This is a multi-file response - return it as-is
|
||||
logger.info("Detected multi-file response from AI - preserving structure")
|
||||
return chunk_json
|
||||
|
||||
# Extract sections from single-file response
|
||||
if isinstance(chunk_json, dict) and "sections" in chunk_json:
|
||||
for section in chunk_json["sections"]:
|
||||
# Add document context to section
|
||||
|
|
@ -1527,6 +1556,152 @@ class AiService:
|
|||
# This ensures MIME-type checking, chunk mapping, and parallel processing
|
||||
return await self._processDocumentsPerChunk(documents, prompt, options)
|
||||
|
||||
async def _callAiDirect(
    self,
    prompt: str,
    documents: Optional[List[ChatDocument]],
    options: AiCallOptions
) -> Dict[str, Any]:
    """
    Call AI directly with prompt and documents for JSON output.

    Used for multi-file generation; delegates to the existing per-chunk
    JSON generation pipeline and normalizes its result into the
    multi-file shape ({"metadata": ..., "documents": [...]}) when needed.
    """
    doc_count = len(documents) if documents else 0
    logger.info(f"Using existing generation pipeline for {doc_count} documents")

    # Delegate document processing and content extraction to the
    # already-working per-chunk JSON pipeline.
    result = await self._processDocumentsPerChunkJson(documents, prompt, options)

    # A single-file result carries "sections" but no "documents" key.
    # Anything else is passed through untouched.
    if "documents" in result or "sections" not in result:
        return result

    logger.info("Converting single-file result to multi-file format")
    # Wrap the single document so callers always see the multi-file shape.
    wrapped_document = {
        "id": "doc_1",
        "title": result.get("metadata", {}).get("title", "Document"),
        "filename": "document.txt",
        "sections": result.get("sections", []),
    }
    return {
        "metadata": result.get("metadata", {"title": "Converted Document"}),
        "documents": [wrapped_document],
    }
|
||||
|
||||
async def _processDocumentsPerChunkJsonWithPrompt(
    self,
    documents: List[ChatDocument],
    custom_prompt: str,
    options: Optional[AiCallOptions] = None
) -> Dict[str, Any]:
    """
    Process documents with per-chunk AI calls and merge results in JSON mode.
    Uses a custom prompt instead of the default extraction prompt.

    Args:
        documents: Source documents to extract, chunk, and send to the AI.
        custom_prompt: Prompt applied per chunk; injected both at the top
            level of the extraction options and inside the merge strategy.
        options: Optional AI call options; only operationType is read here,
            the rest is forwarded to the chunk processor and merger.

    Returns:
        The merged JSON document dict. On empty input, a non-list
        extraction result, or any exception, a stub dict with empty
        "sections" is returned instead of raising.
    """
    if not documents:
        return {"metadata": {"title": "Empty Document"}, "sections": []}

    # Get model capabilities for size calculation
    model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)

    # Build extraction options for chunking with intelligent merging
    extractionOptions: Dict[str, Any] = {
        "prompt": custom_prompt,  # Use the custom prompt instead of default
        "operationType": options.operationType if options else "general",
        "processDocumentsIndividually": True,  # Process each document separately
        "maxSize": model_capabilities["maxContextBytes"],
        "chunkAllowed": True,
        "textChunkSize": model_capabilities["textChunkSize"],
        "imageChunkSize": model_capabilities["imageChunkSize"],
        "imageMaxPixels": 1024 * 1024,
        "imageQuality": 85,
        "mergeStrategy": {
            "useIntelligentMerging": True,  # Enable intelligent token-aware merging
            "modelCapabilities": model_capabilities,
            "prompt": custom_prompt,  # Use the custom prompt
            "groupBy": "typeGroup",
            "orderBy": "id",
            "mergeType": "concatenate"
        },
    }

    logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")

    try:
        # Extract content with chunking
        # NOTE(review): extractContent is called without await here, so it
        # appears to be synchronous — confirm against the extraction service.
        extractionResult = self.extractionService.extractContent(documents, extractionOptions)

        if not isinstance(extractionResult, list):
            return {"metadata": {"title": "Error Document"}, "sections": []}

        # Process chunks with proper mapping
        logger.info(f"Processing {len(extractionResult)} chunks with custom prompt")
        logger.debug(f"Custom prompt preview: {custom_prompt[:200]}...")

        # Debug: Show what content is being processed (before filtering)
        for i, ec in enumerate(extractionResult):
            logger.debug(f"ContentExtracted {i}: id={ec.id}, parts={len(ec.parts) if hasattr(ec, 'parts') else 'no parts'}")

            # Check each part within the ContentExtracted
            if hasattr(ec, 'parts'):
                for j, part in enumerate(ec.parts):
                    if hasattr(part, 'data') and part.data:
                        logger.debug(f"  Part {j} content preview: {part.data[:200]}...")
                    else:
                        # Check what attributes the part actually has
                        part_attrs = [attr for attr in dir(part) if not attr.startswith('_')]
                        part_type = getattr(part, 'typeGroup', None)
                        part_mime = getattr(part, 'mimeType', '')
                        has_data = hasattr(part, 'data') and bool(part.data)

                        logger.debug(f"  Part {j} DEBUG: available_attrs={part_attrs}")
                        logger.debug(f"  Part {j} DEBUG: typeGroup='{part_type}', mimeType='{part_mime}', has_data={has_data}")

                        # Check if this is an empty container chunk (which is expected)
                        # Heuristic: container parts whose MIME type mentions
                        # "document" are treated as metadata-only wrappers.
                        is_empty_container = False
                        if part_type == "container" and part_mime and 'document' in part_mime.lower():
                            is_empty_container = True

                        if is_empty_container:
                            logger.debug(f"  Part {j} is empty container (will be filtered out) - mimeType={part_mime}")
                        else:
                            logger.warning(f"  Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}', attrs={part_attrs}")
            else:
                logger.warning(f"ContentExtracted {i} has no parts attribute")

        # Run the per-chunk AI calls; generate_json=True requests JSON output
        # per chunk so results can be merged structurally below.
        chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True)

        # Debug: Show what chunks were actually processed (after filtering)
        logger.info(f"After filtering: {len(chunkResults)} chunks will be processed")
        for i, chunk_result in enumerate(chunkResults):
            if chunk_result and chunk_result.metadata.get("success", False):
                logger.debug(f"Processed chunk {i}: {chunk_result.metadata.get('typeGroup', 'unknown')} - {len(chunk_result.aiResult)} chars")
            else:
                logger.debug(f"Processed chunk {i}: error or skipped")

        # Merge with JSON mode
        mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)

        # Debug: Show what the AI actually returned
        logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}")
        if 'sections' in mergedJsonDocument:
            logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}")
            if mergedJsonDocument['sections']:
                logger.debug(f"First section preview: {str(mergedJsonDocument['sections'][0])[:200]}...")
            else:
                logger.warning("AI returned empty sections array")
        if 'documents' in mergedJsonDocument:
            logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}")
        else:
            logger.warning("AI did not return 'documents' key - this is single-file format")

        return mergedJsonDocument

    except Exception as e:
        # Best-effort contract: callers always receive a dict, never an exception.
        logger.error(f"Error in per-chunk JSON processing: {str(e)}")
        return {"metadata": {"title": "Error Document"}, "sections": []}
|
||||
|
||||
async def _callAiJson(
|
||||
self,
|
||||
prompt: str,
|
||||
|
|
@ -1821,6 +1996,88 @@ class AiService:
|
|||
target_length = int(len(text) * reduction_factor)
|
||||
return text[:target_length] + "... [reduced]"
|
||||
|
||||
async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
|
||||
"""Use AI to analyze user prompt and determine processing requirements."""
|
||||
if not ai_service:
|
||||
return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
||||
|
||||
try:
|
||||
analysis_prompt = f"""
|
||||
Analyze this user request and determine if it requires multiple file output or single file output.
|
||||
|
||||
User request: "{prompt}"
|
||||
|
||||
Respond with JSON only in this exact format:
|
||||
{{
|
||||
"is_multi_file": true/false,
|
||||
"strategy": "single|per_entity|by_section|by_criteria|custom",
|
||||
"criteria": "description of how to split content",
|
||||
"file_naming_pattern": "suggested pattern for filenames",
|
||||
"reasoning": "brief explanation of the analysis"
|
||||
}}
|
||||
|
||||
Consider:
|
||||
- Does the user want separate files for different entities (customers, products, etc.)?
|
||||
- Does the user want to split content into multiple documents?
|
||||
- What would be the most logical way to organize the content?
|
||||
- What language is the request in? (analyze in the original language)
|
||||
|
||||
Return only the JSON response.
|
||||
"""
|
||||
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||
request_options = AiCallOptions()
|
||||
request_options.operationType = OperationType.GENERAL
|
||||
|
||||
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
|
||||
response = await ai_service.aiObjects.call(request)
|
||||
|
||||
if response and response.content:
|
||||
import json
|
||||
import re
|
||||
|
||||
# Extract JSON from response
|
||||
result = response.content.strip()
|
||||
json_match = re.search(r'\{.*\}', result, re.DOTALL)
|
||||
if json_match:
|
||||
result = json_match.group(0)
|
||||
|
||||
analysis = json.loads(result)
|
||||
return analysis
|
||||
else:
|
||||
return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
|
||||
return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
||||
|
||||
def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool:
    """Validate that AI response matches the expected structure.

    Multi-file responses must carry a "documents" list; single-file
    responses must carry a "sections" list. Never raises: any internal
    error is logged and reported as a failed validation.
    """
    try:
        # Guard clause: anything that is not a dict fails immediately.
        if not isinstance(response, dict):
            logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
            return False

        if prompt_analysis.get("is_multi_file", False):
            # Check for multi-file structure
            has_documents = "documents" in response
            is_documents_list = isinstance(response.get("documents"), list)
            logger.info(f"Multi-file validation: has_documents={has_documents}, is_documents_list={is_documents_list}")
            valid = has_documents and is_documents_list
            if valid:
                logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found")
            else:
                logger.warning(f"Multi-file validation failed: documents key present={has_documents}, documents is list={is_documents_list}")
                logger.warning(f"Available keys: {list(response.keys())}")
            return valid

        # Single-file structure: a list under "sections".
        has_sections = "sections" in response
        is_sections_list = isinstance(response.get("sections"), list)
        logger.info(f"Single-file validation: has_sections={has_sections}, is_sections_list={is_sections_list}")
        return has_sections and is_sections_list
    except Exception as e:
        logger.warning(f"Response validation failed with exception: {str(e)}")
        return False
|
||||
|
||||
async def _callAiWithDocumentGeneration(
|
||||
self,
|
||||
prompt: str,
|
||||
|
|
@ -1831,6 +2088,7 @@ class AiService:
|
|||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Handle AI calls with document generation in specific output format.
|
||||
Now supports both single-file and multi-file generation.
|
||||
|
||||
Args:
|
||||
prompt: The main prompt for the AI call
|
||||
|
|
@ -1842,6 +2100,43 @@ class AiService:
|
|||
Returns:
|
||||
Dict with generated documents and metadata
|
||||
"""
|
||||
try:
|
||||
# Use AI to analyze prompt intent
|
||||
prompt_analysis = await self._analyzePromptIntent(prompt, self)
|
||||
logger.info(f"Prompt analysis result: {prompt_analysis}")
|
||||
|
||||
if prompt_analysis.get("is_multi_file", False):
|
||||
return await self._callAiWithMultiFileGeneration(
|
||||
prompt, documents, options, outputFormat, title, prompt_analysis
|
||||
)
|
||||
else:
|
||||
return await self._callAiWithSingleFileGeneration(
|
||||
prompt, documents, options, outputFormat, title
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in document generation: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"content": "",
|
||||
"rendered_content": "",
|
||||
"mime_type": "text/plain",
|
||||
"filename": f"error_{outputFormat}",
|
||||
"format": outputFormat,
|
||||
"title": title or "Error",
|
||||
"documents": []
|
||||
}
|
||||
|
||||
async def _callAiWithSingleFileGeneration(
|
||||
self,
|
||||
prompt: str,
|
||||
documents: Optional[List[ChatDocument]],
|
||||
options: AiCallOptions,
|
||||
outputFormat: str,
|
||||
title: Optional[str]
|
||||
) -> Dict[str, Any]:
|
||||
"""Handle single-file document generation (existing functionality)."""
|
||||
try:
|
||||
# Get format-specific extraction prompt from generation service
|
||||
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
|
||||
|
|
@ -1912,20 +2207,216 @@ class AiService:
|
|||
"documentName": filename,
|
||||
"documentData": renderedContent,
|
||||
"mimeType": mimeType
|
||||
}]
|
||||
}],
|
||||
"is_multi_file": False
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in document generation: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"content": "",
|
||||
"rendered_content": "",
|
||||
"mime_type": "text/plain",
|
||||
"filename": f"error_{outputFormat}",
|
||||
"format": outputFormat,
|
||||
"title": title or "Error",
|
||||
"documents": []
|
||||
}
|
||||
logger.error(f"Error in single-file document generation: {str(e)}")
|
||||
raise
|
||||
|
||||
async def _callAiWithMultiFileGeneration(
    self,
    prompt: str,
    documents: Optional[List[ChatDocument]],
    options: AiCallOptions,
    outputFormat: str,
    title: Optional[str],
    prompt_analysis: Dict[str, Any]
) -> Dict[str, Any]:
    """Handle multi-file document generation using AI analysis.

    Pipeline: build an adaptive extraction prompt from prompt_analysis,
    run the per-chunk JSON pipeline with it, validate the multi-file
    structure, then render each returned document individually. Falls
    back to single-file generation on invalid structure or any exception.

    Args:
        prompt: The original user prompt.
        documents: Source documents to process (may be None/empty).
        options: AI call options forwarded to the chunk pipeline.
        outputFormat: Target format ("docx", "pdf", "html", or other).
        title: Overall title; defaults to "AI Generated Documents".
        prompt_analysis: Result of _analyzePromptIntent (split strategy etc.).

    Returns:
        Dict with "success", "documents" (list of rendered files),
        "is_multi_file"=True and "split_strategy" on success; on failure,
        whatever _callAiWithSingleFileGeneration returns.
    """
    try:
        # Get multi-file extraction prompt based on AI analysis
        from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
        generation_service = GenerationService(self.services)

        # Use default title if not provided
        if not title:
            title = "AI Generated Documents"

        # Get adaptive extraction prompt
        extraction_prompt = await generation_service.getAdaptiveExtractionPrompt(
            outputFormat=outputFormat,
            userPrompt=prompt,
            title=title,
            promptAnalysis=prompt_analysis,
            aiService=self
        )

        logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters")
        logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...")

        # Process with adaptive JSON schema - use the existing pipeline but with adaptive prompt
        logger.info(f"Using adaptive prompt with existing pipeline: {len(extraction_prompt)} chars")
        logger.debug(f"Processing documents: {len(documents) if documents else 0} documents")

        # Use the existing pipeline but replace the prompt with our adaptive one
        # This ensures proper document processing while using the multi-file prompt
        ai_response = await self._processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options)

        logger.info(f"AI response type: {type(ai_response)}")
        logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}")
        logger.debug(f"AI response preview: {str(ai_response)[:500]}...")

        # Validate response structure
        if not self._validateResponseStructure(ai_response, prompt_analysis):
            # Fallback to single-file if multi-file fails
            logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}")
            logger.warning(f"Prompt analysis: {prompt_analysis}")
            logger.warning("Falling back to single-file generation")
            return await self._callAiWithSingleFileGeneration(
                prompt, documents, options, outputFormat, title
            )

        # Process multiple documents
        generated_documents = []
        for i, doc_data in enumerate(ai_response.get("documents", [])):
            # Transform AI-generated sections to renderer-compatible format
            transformed_sections = []
            for section in doc_data.get("sections", []):
                # Convert AI format to renderer format
                # (AI emits "content_type"/"elements"; renderer expects
                # "type"/"data" with nested "text"/"elements").
                transformed_section = {
                    "id": section.get("id", f"section_{len(transformed_sections) + 1}"),
                    "type": section.get("content_type", "paragraph"),
                    "data": {
                        "text": "",
                        "elements": section.get("elements", [])
                    },
                    "order": section.get("order", len(transformed_sections) + 1)
                }

                # Extract text from elements for simple text-based sections
                if section.get("content_type") in ["paragraph", "heading"]:
                    text_parts = []
                    for element in section.get("elements", []):
                        if "text" in element:
                            text_parts.append(element["text"])
                    transformed_section["data"]["text"] = "\n".join(text_parts)

                transformed_sections.append(transformed_section)

            # Create complete document structure for rendering
            # NOTE(review): doc_data["title"] is a hard key access; a
            # document without "title" raises KeyError and drops the whole
            # batch into the single-file fallback — confirm intended.
            complete_document = {
                "metadata": {
                    "title": doc_data["title"],
                    "source_document": "multi_file_generation",
                    "document_id": doc_data.get("id", f"doc_{i+1}"),
                    "filename": doc_data.get("filename", f"document_{i+1}"),
                    "split_strategy": prompt_analysis.get("strategy", "custom")
                },
                "sections": transformed_sections,
                "summary": f"Generated document: {doc_data['title']}",
                "tags": ["multi_file", "ai_generated"]
            }

            rendered_content, mime_type = await generation_service.renderReport(
                extractedContent=complete_document,
                outputFormat=outputFormat,
                title=doc_data["title"],
                userPrompt=prompt,
                aiService=self
            )

            # Generate proper filename with correct extension
            base_filename = doc_data.get("filename", f"document_{i+1}")
            # Remove any existing extension and add the correct one
            if '.' in base_filename:
                base_filename = base_filename.rsplit('.', 1)[0]

            # Add proper extension based on output format
            if outputFormat.lower() == "docx":
                filename = f"{base_filename}.docx"
            elif outputFormat.lower() == "pdf":
                filename = f"{base_filename}.pdf"
            elif outputFormat.lower() == "html":
                filename = f"{base_filename}.html"
            else:
                filename = f"{base_filename}.{outputFormat}"

            generated_documents.append({
                "documentName": filename,
                "documentData": rendered_content,
                "mimeType": mime_type
            })

        # Save debug files for multi-file generation - only if debug enabled
        debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
        if debug_enabled:
            try:
                import os
                from datetime import datetime, UTC
                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                debug_root = "./test-chat/ai"
                debug_dir = os.path.join(debug_root, f"multifile_output_{ts}")
                os.makedirs(debug_dir, exist_ok=True)

                # Save metadata
                with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f:
                    f.write(f"title: {title}\n")
                    f.write(f"format: {outputFormat}\n")
                    f.write(f"documents_count: {len(generated_documents)}\n")
                    f.write(f"split_strategy: {prompt_analysis.get('strategy', 'custom')}\n")
                    f.write(f"prompt_analysis: {prompt_analysis}\n")

                # Save each generated document
                for i, doc in enumerate(generated_documents):
                    doc_filename = doc["documentName"]
                    doc_data = doc["documentData"]
                    doc_mime = doc["mimeType"]

                    # Determine file extension
                    if outputFormat.lower() == "docx":
                        file_ext = ".docx"
                    elif outputFormat.lower() == "pdf":
                        file_ext = ".pdf"
                    elif outputFormat.lower() == "html":
                        file_ext = ".html"
                    else:
                        file_ext = f".{outputFormat}"

                    # Save the rendered document
                    output_path = os.path.join(debug_dir, f"document_{i+1}_{doc_filename}")

                    if file_ext in ['.md', '.txt', '.html', '.json', '.csv']:
                        # Text-based formats
                        with open(output_path, 'w', encoding='utf-8') as f:
                            f.write(doc_data)
                    else:
                        # Binary formats - decode from base64 if needed
                        # (rendered binary content presumably arrives
                        # base64-encoded — TODO confirm against renderReport)
                        try:
                            import base64
                            doc_bytes = base64.b64decode(doc_data)
                            with open(output_path, 'wb') as f:
                                f.write(doc_bytes)
                        except Exception:
                            # If not base64, save as text
                            with open(output_path, 'w', encoding='utf-8') as f:
                                f.write(doc_data)

                    logger.info(f"💾 Debug: Saved multi-file document {i+1}: {output_path}")

                logger.info(f"💾 Debug: Multi-file output saved to: {debug_dir}")

            except Exception as e:
                # Debug output is best-effort; never fail generation over it.
                logger.warning(f"Failed to save multi-file debug output: {e}")

        return {
            "success": True,
            "content": ai_response,
            "rendered_content": None,  # Not applicable for multi-file
            "mime_type": None,  # Not applicable for multi-file
            "filename": None,  # Not applicable for multi-file
            "format": outputFormat,
            "title": title,
            "documents": generated_documents,
            "is_multi_file": True,
            "split_strategy": prompt_analysis.get("strategy", "custom")
        }

    except Exception as e:
        logger.error(f"Error in multi-file document generation: {str(e)}")
        # Fallback to single-file
        return await self._callAiWithSingleFileGeneration(
            prompt, documents, options, outputFormat, title
        )
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import logging
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Union, Tuple
|
||||
from datetime import datetime, UTC
|
||||
import re
|
||||
from modules.shared.timezoneUtils import get_utc_timestamp
|
||||
|
|
@ -372,6 +372,42 @@ class GenerationService:
|
|||
logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}")
|
||||
raise
|
||||
|
||||
async def getAdaptiveExtractionPrompt(
    self,
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None
) -> str:
    """Get adaptive extraction prompt based on AI analysis.

    Thin delegation to the prompt-builder submodule; this service only
    contributes its shared services registry.
    """
    from .subPromptBuilder import buildAdaptiveExtractionPrompt

    builder_kwargs = {
        "outputFormat": outputFormat,
        "userPrompt": userPrompt,
        "title": title,
        "promptAnalysis": promptAnalysis,
        "aiService": aiService,
        "services": self.services,
    }
    return await buildAdaptiveExtractionPrompt(**builder_kwargs)
|
||||
|
||||
async def getGenericExtractionPrompt(
    self,
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """Get generic extraction prompt that works for both single and multi-file.

    Thin delegation to the prompt-builder submodule; this service only
    contributes its shared services registry.
    """
    from .subPromptBuilder import buildGenericExtractionPrompt

    builder_kwargs = {
        "outputFormat": outputFormat,
        "userPrompt": userPrompt,
        "title": title,
        "aiService": aiService,
        "services": self.services,
    }
    return await buildGenericExtractionPrompt(**builder_kwargs)
|
||||
|
||||
async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
|
||||
"""
|
||||
Get the format-specific extraction prompt for AI content extraction.
|
||||
|
|
@ -409,6 +445,75 @@ class GenerationService:
|
|||
logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
|
||||
raise
|
||||
|
||||
async def renderAdaptiveReport(
    self,
    extractedContent: Dict[str, Any],
    outputFormat: str,
    title: str,
    userPrompt: Optional[str] = None,
    aiService=None,
    isMultiFile: bool = False
) -> Union[Tuple[str, str], List[Dict[str, Any]]]:
    """Render report adaptively based on content structure.

    Args:
        extractedContent: AI-extracted JSON document structure.
        outputFormat: Target format identifier (e.g. "docx", "pdf", "html").
        title: Report title.
        userPrompt: Original user prompt, forwarded to the renderer.
            (Annotation fixed: was ``str = None``, which is invalid per
            PEP 484; the file already imports Optional.)
        aiService: AI service handle forwarded to the renderer.
        isMultiFile: Request multi-file rendering; only honored when the
            content actually carries a "documents" key.

    Returns:
        A (content, mime_type) tuple for single-file output, or a list
        of per-document dicts for multi-file output.
    """
    # Multi-file rendering requires both the caller's intent and the
    # matching content shape; anything else takes the single-file path.
    if isMultiFile and "documents" in extractedContent:
        return await self._renderMultiFileReport(
            extractedContent, outputFormat, title, userPrompt, aiService
        )
    return await self._renderSingleFileReport(
        extractedContent, outputFormat, title, userPrompt, aiService
    )
|
||||
|
||||
async def _renderMultiFileReport(
    self,
    extractedContent: Dict[str, Any],
    outputFormat: str,
    title: str,
    userPrompt: Optional[str] = None,
    aiService=None
) -> List[Dict[str, Any]]:
    """Render multiple documents from extracted content.

    Args:
        extractedContent: Multi-file structure with a "documents" list;
            each entry may carry "sections", "title" and "filename".
        outputFormat: Target format used to resolve the renderer.
        title: Overall title, used as a fallback per-document title.
        userPrompt: Original user prompt forwarded to the renderer.
            (Annotation fixed: was ``str = None``, invalid per PEP 484.)
        aiService: AI service handle forwarded to the renderer.

    Returns:
        List of dicts with "filename", "content", "mime_type", "title";
        empty when no renderer exists for outputFormat.
    """
    generated_documents: List[Dict[str, Any]] = []

    # The renderer depends only on the format, not on the document, so
    # resolve it once instead of once per document. No renderer means
    # nothing can be rendered (same outcome as the old per-doc `continue`).
    renderer = self._getFormatRenderer(outputFormat)
    if not renderer:
        return generated_documents

    for index, doc_data in enumerate(extractedContent.get("documents", [])):
        # Robustness: use .get() with defaults instead of direct indexing,
        # so an AI response missing "title"/"filename"/"sections" does not
        # abort the whole batch with a KeyError.
        doc_title = doc_data.get("title", f"{title} {index + 1}")

        # Render individual document through the single-file renderer.
        rendered_content, mime_type = await renderer.render(
            extractedContent={"sections": doc_data.get("sections", [])},
            title=doc_title,
            userPrompt=userPrompt,
            aiService=aiService
        )

        generated_documents.append({
            "filename": doc_data.get("filename", f"document_{index + 1}"),
            "content": rendered_content,
            "mime_type": mime_type,
            "title": doc_title
        })

    return generated_documents
|
||||
|
||||
async def _renderSingleFileReport(
    self,
    extractedContent: Dict[str, Any],
    outputFormat: str,
    title: str,
    userPrompt: str = None,
    aiService=None
) -> Tuple[str, str]:
    """Render single file report (existing functionality).

    Thin wrapper: all real work happens in renderReport.
    """
    return await self.renderReport(
        extractedContent=extractedContent,
        outputFormat=outputFormat,
        title=title,
        userPrompt=userPrompt,
        aiService=aiService
    )
|
||||
|
||||
def _getFormatRenderer(self, output_format: str):
|
||||
"""Get the appropriate renderer for the specified format using auto-discovery."""
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -6,8 +6,197 @@ This module provides schemas that guide AI to generate structured JSON output.
|
|||
from typing import Dict, Any
|
||||
|
||||
|
||||
def get_multi_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for multi-document generation.

    The schema describes a root object with generation ``metadata`` (title,
    split strategy, naming pattern, ...) and a ``documents`` array, each entry
    of which is an independently renderable document made of ordered sections.
    Element shapes live under ``definitions`` and are referenced via ``$ref``.
    """

    def _list_item_schema() -> Dict[str, Any]:
        # Fresh dict per call so the inline occurrence and the "list_item"
        # definition do not alias each other.
        return {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "List item text"},
                "subitems": {
                    "type": "array",
                    "items": {"$ref": "#/definitions/list_item"},
                    "description": "Nested sub-items (optional)"
                }
            }
        }

    # --- root metadata: how the output is split and named -------------------
    metadata_schema = {
        "type": "object",
        "required": ["title", "splitStrategy"],
        "properties": {
            "title": {"type": "string", "description": "Document title"},
            "splitStrategy": {
                "type": "string",
                "enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"],
                "description": "Strategy for splitting content into multiple files"
            },
            "splitCriteria": {
                "type": "object",
                "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)"
            },
            "fileNamingPattern": {
                "type": "string",
                "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')"
            },
            "author": {"type": "string", "description": "Document author (optional)"},
            "source_documents": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of source document IDs"
            },
            "extraction_method": {
                "type": "string",
                "default": "ai_extraction",
                "description": "Method used for extraction"
            }
        }
    }

    # --- one section inside a document --------------------------------------
    section_schema = {
        "type": "object",
        "required": ["id", "content_type", "elements", "order"],
        "properties": {
            "id": {"type": "string", "description": "Unique section identifier"},
            "title": {"type": "string", "description": "Section title (optional)"},
            "content_type": {
                "type": "string",
                "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                "description": "Primary content type of this section"
            },
            "elements": {
                "type": "array",
                "description": "Content elements in this section",
                "items": {
                    "oneOf": [
                        {"$ref": "#/definitions/table"},
                        {"$ref": "#/definitions/bullet_list"},
                        {"$ref": "#/definitions/paragraph"},
                        {"$ref": "#/definitions/heading"},
                        {"$ref": "#/definitions/code_block"}
                    ]
                }
            },
            "order": {"type": "integer", "description": "Section order in document"},
            "metadata": {
                "type": "object",
                "description": "Additional section metadata"
            }
        }
    }

    # --- one output document -------------------------------------------------
    document_schema = {
        "type": "object",
        "required": ["id", "title", "sections", "filename"],
        "properties": {
            "id": {"type": "string", "description": "Unique document identifier"},
            "title": {"type": "string", "description": "Document title"},
            "filename": {"type": "string", "description": "Generated filename"},
            "sections": {
                "type": "array",
                "description": "Document sections containing structured content",
                "items": section_schema
            },
            "metadata": {
                "type": "object",
                "description": "Document-specific metadata"
            }
        }
    }

    # --- element shapes referenced by $ref above -----------------------------
    definitions = {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers"
                },
                "rows": {
                    "type": "array",
                    "items": {
                        "type": "array",
                        "items": {"type": "string"}
                    },
                    "description": "Table data rows"
                },
                "caption": {
                    "type": "string",
                    "description": "Table caption (optional)"
                }
            }
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    "items": _list_item_schema(),
                    "description": "List items"
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list"
                }
            }
        },
        "list_item": _list_item_schema(),
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)"
                }
            }
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)"
                }
            }
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {"type": "string", "description": "Programming language (optional)"}
            }
        }
    }

    return {
        "type": "object",
        "required": ["metadata", "documents"],
        "properties": {
            "metadata": metadata_schema,
            "documents": {
                "type": "array",
                "description": "Array of individual documents to generate",
                "items": document_schema
            }
        },
        "definitions": definitions
    }
|
||||
|
||||
def get_document_subJsonSchema() -> Dict[str, Any]:
|
||||
"""Get the JSON schema for structured document generation."""
|
||||
"""Get the JSON schema for structured document generation (single document)."""
|
||||
return {
|
||||
"type": "object",
|
||||
"required": ["metadata", "sections"],
|
||||
|
|
@ -227,6 +416,13 @@ Return only the enhanced JSON structure following the schema. Do not include any
|
|||
"""
|
||||
|
||||
|
||||
def get_adaptive_json_schema(prompt_analysis: Dict[str, Any] = None) -> Dict[str, Any]:
    """Automatically select appropriate schema based on prompt analysis.

    Returns the multi-document schema only when an analysis dict is supplied
    and flags ``is_multi_file``; otherwise the single-document schema.
    """
    wants_multi = bool(prompt_analysis) and prompt_analysis.get("is_multi_file", False)
    return get_multi_document_subJsonSchema() if wants_multi else get_document_subJsonSchema()
|
||||
|
||||
def validate_json_document(json_data: Dict[str, Any]) -> bool:
|
||||
"""Validate that the JSON data follows the document schema."""
|
||||
try:
|
||||
|
|
@ -234,35 +430,86 @@ def validate_json_document(json_data: Dict[str, Any]) -> bool:
|
|||
if not isinstance(json_data, dict):
|
||||
return False
|
||||
|
||||
if "metadata" not in json_data or "sections" not in json_data:
|
||||
return False
|
||||
|
||||
metadata = json_data["metadata"]
|
||||
if not isinstance(metadata, dict) or "title" not in metadata:
|
||||
return False
|
||||
|
||||
sections = json_data["sections"]
|
||||
if not isinstance(sections, list):
|
||||
return False
|
||||
|
||||
# Validate each section
|
||||
for i, section in enumerate(sections):
|
||||
if not isinstance(section, dict):
|
||||
# Check if it's multi-document or single-document structure
|
||||
if "documents" in json_data:
|
||||
# Multi-document structure
|
||||
if "metadata" not in json_data:
|
||||
return False
|
||||
|
||||
required_fields = ["id", "content_type", "elements", "order"]
|
||||
for field in required_fields:
|
||||
if field not in section:
|
||||
metadata = json_data["metadata"]
|
||||
if not isinstance(metadata, dict) or "title" not in metadata or "splitStrategy" not in metadata:
|
||||
return False
|
||||
|
||||
documents = json_data["documents"]
|
||||
if not isinstance(documents, list):
|
||||
return False
|
||||
|
||||
# Validate each document
|
||||
for doc in documents:
|
||||
if not isinstance(doc, dict):
|
||||
return False
|
||||
|
||||
required_fields = ["id", "title", "sections", "filename"]
|
||||
for field in required_fields:
|
||||
if field not in doc:
|
||||
return False
|
||||
|
||||
# Validate sections in each document
|
||||
sections = doc.get("sections", [])
|
||||
if not isinstance(sections, list):
|
||||
return False
|
||||
|
||||
for section in sections:
|
||||
if not isinstance(section, dict):
|
||||
return False
|
||||
|
||||
section_required = ["id", "content_type", "elements", "order"]
|
||||
for field in section_required:
|
||||
if field not in section:
|
||||
return False
|
||||
|
||||
# Validate content_type
|
||||
valid_types = ["table", "list", "paragraph", "heading", "code", "image", "mixed"]
|
||||
if section["content_type"] not in valid_types:
|
||||
return False
|
||||
|
||||
# Validate elements
|
||||
if not isinstance(section["elements"], list):
|
||||
return False
|
||||
|
||||
# Validate content_type
|
||||
valid_types = ["table", "list", "paragraph", "heading", "code", "image", "mixed"]
|
||||
if section["content_type"] not in valid_types:
|
||||
elif "sections" in json_data:
|
||||
# Single-document structure (existing validation)
|
||||
if "metadata" not in json_data:
|
||||
return False
|
||||
|
||||
# Validate elements
|
||||
if not isinstance(section["elements"], list):
|
||||
metadata = json_data["metadata"]
|
||||
if not isinstance(metadata, dict) or "title" not in metadata:
|
||||
return False
|
||||
|
||||
sections = json_data["sections"]
|
||||
if not isinstance(sections, list):
|
||||
return False
|
||||
|
||||
# Validate each section
|
||||
for i, section in enumerate(sections):
|
||||
if not isinstance(section, dict):
|
||||
return False
|
||||
|
||||
required_fields = ["id", "content_type", "elements", "order"]
|
||||
for field in required_fields:
|
||||
if field not in section:
|
||||
return False
|
||||
|
||||
# Validate content_type
|
||||
valid_types = ["table", "list", "paragraph", "heading", "code", "image", "mixed"]
|
||||
if section["content_type"] not in valid_types:
|
||||
return False
|
||||
|
||||
# Validate elements
|
||||
if not isinstance(section["elements"], list):
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
|
|
|||
|
|
@ -8,7 +8,8 @@ Builds a robust prompt that:
|
|||
- Requires the AI to output a filename header that we can parse and use
|
||||
"""
|
||||
|
||||
from typing import Protocol
|
||||
import json
|
||||
from typing import Protocol, Dict, Any
|
||||
|
||||
|
||||
class _RendererLike(Protocol):
|
||||
|
|
@ -16,6 +17,291 @@ class _RendererLike(Protocol):
|
|||
...
|
||||
|
||||
|
||||
async def buildAdaptiveExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None,
    services=None
) -> str:
    """Build adaptive extraction prompt based on AI analysis.

    When ``promptAnalysis["is_multi_file"]`` is truthy, the prompt instructs
    the model to return a root-level "documents" array (one entry per output
    file), illustrated with a worked example; otherwise the single-document
    JSON schema is embedded verbatim.

    Args:
        outputFormat: Target output format (unused here; kept for interface
            compatibility with the other prompt builders).
        userPrompt: Raw user request, prepended verbatim to the prompt.
        title: Report title (unused here; kept for interface compatibility).
        promptAnalysis: Intent-analysis dict; only "is_multi_file" is read.
        aiService: Unused; kept for interface compatibility.
        services: Unused; kept for interface compatibility.

    Returns:
        The fully assembled prompt string.
    """
    # Decide once (the original evaluated this twice and also assigned an
    # unused `schema_type` variable — both removed).
    is_multi_file = promptAnalysis.get("is_multi_file", False)

    if is_multi_file:
        # Heuristic: requests that mention mails/messages get an
        # email-flavoured example so the model mirrors a per-email split.
        is_json_email = any(keyword in userPrompt.lower() for keyword in ['email', 'mail', 'json', 'message', 'conversation'])

        if is_json_email:
            # Specialized example for JSON email data
            multi_file_example = {
                "metadata": {
                    "title": "Email Conversations",
                    "splitStrategy": "per_entity"
                },
                "documents": [
                    {
                        "id": "doc_1",
                        "title": "Email from SENDER to RECIPIENT",
                        "filename": "email_sender_to_recipient.txt",
                        "sections": [
                            {
                                "id": "section_1",
                                "content_type": "heading",
                                "elements": [
                                    {
                                        "text": "Email from SENDER to RECIPIENT",
                                        "level": 1
                                    }
                                ],
                                "order": 1
                            },
                            {
                                "id": "section_2",
                                "content_type": "paragraph",
                                "elements": [
                                    {
                                        "text": "FULL_EMAIL_CONTENT_HERE"
                                    }
                                ],
                                "order": 2
                            }
                        ]
                    }
                ]
            }
        else:
            # Generic multi-file example with explicit placeholders
            multi_file_example = {
                "metadata": {
                    "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
                    "splitStrategy": "by_section"
                },
                "documents": [
                    {
                        "id": "doc_1",
                        "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
                        "filename": "REPLACE_WITH_ACTUAL_FILENAME",
                        "sections": [
                            {
                                "id": "section_1",
                                "content_type": "heading",
                                "elements": [
                                    {
                                        "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
                                        "level": 1
                                    }
                                ],
                                "order": 1
                            },
                            {
                                "id": "section_2",
                                "content_type": "paragraph",
                                "elements": [
                                    {
                                        "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
                                    }
                                ],
                                "order": 2
                            }
                        ]
                    }
                ]
            }

        adaptive_prompt = f"""
{userPrompt}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one JSON document entry for each section found
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each section

CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(multi_file_example, indent=2)}

IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": section title from the document
- "filename": appropriate filename for the section
- "sections": array of content sections

DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.

INSTRUCTIONS:
- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on section content
- Ensure each section contains the complete content for that part of the document
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array

CONTEXT (Document Content):

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()
    else:
        # The JSON schema is only embedded in the single-document prompt, so
        # fetch it lazily here instead of unconditionally up front (the
        # original imported and computed it on both branches).
        from .subJsonSchema import get_adaptive_json_schema
        json_schema = get_adaptive_json_schema(promptAnalysis)

        # Single-file prompt - use original style
        adaptive_prompt = f"""
{userPrompt}

You are extracting structured content from documents and must respond with valid JSON only.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()

    return adaptive_prompt
|
||||
|
||||
async def buildGenericExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Build generic extraction prompt that works for both single and multi-file.

    If an AI service is available, a quick intent-analysis call decides
    whether the request needs multiple output files and the result is handed
    to ``buildAdaptiveExtractionPrompt``. On any failure — or when no AI
    service is supplied — this falls back to the single-document prompt.

    Args:
        outputFormat: Target output format, forwarded to the adaptive builder.
        userPrompt: Raw user request.
        title: Report title, forwarded to the adaptive builder.
        aiService: Optional AI service used for the intent analysis.
        services: Optional service locator, used only for debug logging.

    Returns:
        The assembled prompt string.
    """
    # Use AI to determine the best approach
    if aiService:
        try:
            analysis_prompt = f"""
Analyze this user request and determine the best JSON structure for document extraction.

User request: "{userPrompt}"

Respond with JSON only:
{{
"requires_multi_file": true/false,
"recommended_schema": "single_document|multi_document",
"split_approach": "description of how to organize content",
"file_naming": "suggested naming pattern"
}}

Consider the user's intent and the most logical way to organize the extracted content.
"""

            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await aiService.aiObjects.call(request)

            if response and response.content:
                import re

                result = response.content.strip()
                # The model sometimes wraps the JSON in prose; keep only the
                # outermost {...} span.
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)

                analysis = json.loads(result)

                # BUGFIX: the analysis prompt asks the model for
                # "requires_multi_file", but buildAdaptiveExtractionPrompt
                # keys off "is_multi_file". Without this mapping the
                # multi-file branch could never trigger from this path.
                if "is_multi_file" not in analysis:
                    analysis["is_multi_file"] = bool(analysis.get("requires_multi_file", False))

                # Use analysis to build appropriate prompt
                return await buildAdaptiveExtractionPrompt(
                    outputFormat, userPrompt, title, analysis, aiService, services
                )
        except Exception as e:
            # BUGFIX: `services` defaults to None; guard before logging so a
            # failed analysis degrades to the fallback instead of raising
            # AttributeError inside the handler.
            if services:
                services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")

    # Fallback to single-file prompt
    from .subJsonSchema import get_document_subJsonSchema
    json_schema = get_document_subJsonSchema()

    return f"""
{userPrompt}

You are extracting structured content from documents and must respond with valid JSON only.

CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON schema provided
- Maintain data integrity and structure

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
"""
|
||||
|
||||
async def buildExtractionPrompt(
|
||||
outputFormat: str,
|
||||
renderer: _RendererLike,
|
||||
|
|
@ -48,7 +334,7 @@ async def buildExtractionPrompt(
|
|||
|
||||
You are extracting structured content from documents and must respond with valid JSON only.
|
||||
|
||||
IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.
|
||||
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
|
||||
|
||||
Extract the actual data from the source documents and structure it as JSON with this format:
|
||||
{{
|
||||
|
|
@ -106,6 +392,10 @@ Image Analysis Requirements:
|
|||
- Always provide feedback - never return empty responses
|
||||
|
||||
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
|
||||
|
||||
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
||||
|
||||
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
|
||||
""".strip()
|
||||
|
||||
# Final assembly
|
||||
|
|
|
|||
|
|
@ -220,6 +220,8 @@ async def process_documents_and_generate_summary():
|
|||
|
||||
userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
|
||||
|
||||
# userPrompt = "Can you create one file for each section in the document"
|
||||
|
||||
# userPrompt = "Analyze these documents and create a fitting image for the content"
|
||||
|
||||
# userPrompt = "Extract the table from file and produce 2 lists in excel. one list with all entries, one list only with entries that are yellow highlighted."
|
||||
|
|
|
|||
263
test_multifile_processing.py
Normal file
263
test_multifile_processing.py
Normal file
|
|
@ -0,0 +1,263 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for multi-file processing implementation.
|
||||
This script tests the new multi-file functionality without breaking existing single-file processing.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, List
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
async def test_multi_file_detection():
    """Smoke-test AI-powered multi-file detection against a canned AI reply."""
    print("=== Testing Multi-File Detection ===")

    # Canned analysis reply the mock AI always returns.
    canned_reply = (
        '{"is_multi_file": true, "strategy": "per_entity", "criteria": "customer_id", '
        '"file_naming_pattern": "{customer_name}_data.docx", '
        '"reasoning": "User wants separate files for each customer"}'
    )

    class MockResponse:
        def __init__(self, content):
            self.content = content

    class MockAiService:
        async def call(self, request):
            return MockResponse(canned_reply)

    class MockAiObjects:
        def __init__(self):
            self.call = MockAiService().call

    try:
        from modules.services.serviceAi.mainServiceAi import AiService

        class MockUtils:
            def debugLogToFile(self, message, category):
                print(f"[{category}] {message}")

        class MockServiceCenter:
            def __init__(self):
                self.utils = MockUtils()

        # Wire the AI service to the mocked backend.
        ai_service = AiService(MockServiceCenter())
        ai_service.aiObjects = MockAiObjects()

        test_prompts = [
            "Create one file for each customer in the document",
            "Split the data into separate files by category",
            "Generate individual files for each product",
            "Create a single report with all data",
            "Erstelle eine Datei für jeden Kunden",  # German
            "Créer un fichier par section",  # French
        ]

        for prompt in test_prompts:
            print(f"\nTesting prompt: '{prompt}'")
            try:
                intent = await ai_service._analyzePromptIntent(prompt, ai_service)
                print(f" Analysis: {intent}")

                if intent.get("is_multi_file"):
                    print(f" ✓ Detected as multi-file with strategy: {intent.get('strategy')}")
                else:
                    print(f" ✓ Detected as single-file")
            except Exception as e:
                print(f" ✗ Error: {str(e)}")

        print("\n=== Multi-File Detection Test Complete ===")
        return True

    except ImportError as e:
        print(f"Import error: {e}")
        print("Make sure you're running from the gateway directory")
        return False
    except Exception as e:
        print(f"Error during testing: {e}")
        return False
|
||||
|
||||
async def test_json_schema_validation():
    """Exercise schema loading, adaptive selection, and document validation."""
    print("\n=== Testing JSON Schema Validation ===")

    try:
        from modules.services.serviceGeneration.subJsonSchema import (
            get_document_subJsonSchema,
            get_multi_document_subJsonSchema,
            get_adaptive_json_schema,
            validate_json_document
        )

        # Both schemas must load.
        schema_single = get_document_subJsonSchema()
        print(f"✓ Single document schema loaded: {len(schema_single)} properties")

        schema_multi = get_multi_document_subJsonSchema()
        print(f"✓ Multi-document schema loaded: {len(schema_multi)} properties")

        # Adaptive selection must route on the is_multi_file flag.
        picked_single = get_adaptive_json_schema({"is_multi_file": False})
        picked_multi = get_adaptive_json_schema({"is_multi_file": True})

        print(f"✓ Adaptive schema selection working")
        print(f" Single-file schema type: {picked_single.get('type', 'unknown')}")
        print(f" Multi-file schema type: {picked_multi.get('type', 'unknown')}")

        # Minimal valid samples for each structure.
        sample_single = {
            "metadata": {"title": "Test Document"},
            "sections": [
                {
                    "id": "section_1",
                    "content_type": "paragraph",
                    "elements": [{"text": "Test content"}],
                    "order": 1
                }
            ]
        }

        sample_multi = {
            "metadata": {
                "title": "Test Documents",
                "splitStrategy": "per_entity"
            },
            "documents": [
                {
                    "id": "doc_1",
                    "title": "Document 1",
                    "filename": "doc1.docx",
                    "sections": [
                        {
                            "id": "section_1",
                            "content_type": "paragraph",
                            "elements": [{"text": "Content 1"}],
                            "order": 1
                        }
                    ]
                }
            ]
        }

        ok_single = validate_json_document(sample_single)
        ok_multi = validate_json_document(sample_multi)

        print(f"✓ Single document validation: {'PASS' if ok_single else 'FAIL'}")
        print(f"✓ Multi-document validation: {'PASS' if ok_multi else 'FAIL'}")

        print("\n=== JSON Schema Validation Test Complete ===")
        return True

    except ImportError as e:
        print(f"Import error: {e}")
        return False
    except Exception as e:
        print(f"Error during schema testing: {e}")
        return False
|
||||
|
||||
async def test_prompt_builder():
    """Smoke-test adaptive and generic prompt building with stub services."""
    print("\n=== Testing Prompt Builder ===")

    try:
        from modules.services.serviceGeneration.subPromptBuilder import (
            buildAdaptiveExtractionPrompt,
            buildGenericExtractionPrompt
        )

        class MockUtils:
            def debugLogToFile(self, message, category):
                print(f"[{category}] {message}")

        class MockServices:
            def __init__(self):
                self.utils = MockUtils()

        stub_services = MockServices()

        # Adaptive path: pretend the analysis already flagged a multi-file job.
        multi_analysis = {
            "is_multi_file": True,
            "strategy": "per_entity",
            "criteria": "customer_id",
            "file_naming_pattern": "{customer_name}_data.docx"
        }

        adaptive_prompt = await buildAdaptiveExtractionPrompt(
            outputFormat="docx",
            userPrompt="Create one file for each customer",
            title="Customer Data",
            promptAnalysis=multi_analysis,
            aiService=None,
            services=stub_services
        )

        print(f"✓ Adaptive prompt generated: {len(adaptive_prompt)} characters")
        print(f" Contains multi-file instructions: {'documents' in adaptive_prompt}")

        # Generic path: no AI service, falls back to the single-file prompt.
        generic_prompt = await buildGenericExtractionPrompt(
            outputFormat="docx",
            userPrompt="Create a single report",
            title="Report",
            aiService=None,
            services=stub_services
        )

        print(f"✓ Generic prompt generated: {len(generic_prompt)} characters")
        print(f" Contains single-file instructions: {'sections' in generic_prompt}")

        print("\n=== Prompt Builder Test Complete ===")
        return True

    except ImportError as e:
        print(f"Import error: {e}")
        return False
    except Exception as e:
        print(f"Error during prompt builder testing: {e}")
        return False
|
||||
|
||||
async def main():
    """Run all multi-file processing smoke tests and print a summary."""
    print("Starting Multi-File Processing Tests...")
    print("=" * 50)

    tests = [
        test_multi_file_detection,
        test_json_schema_validation,
        test_prompt_builder,
    ]

    results = []
    for test_fn in tests:
        try:
            results.append(await test_fn())
        except Exception as e:
            # A crashing test counts as a failure but must not stop the run.
            print(f"Test failed with exception: {e}")
            results.append(False)

    passed = sum(results)
    print("\n" + "=" * 50)
    print("Test Results Summary:")
    print(f" Tests run: {len(tests)}")
    print(f" Passed: {passed}")
    print(f" Failed: {len(tests) - passed}")

    success = all(results)
    if success:
        print("\n🎉 All tests passed! Multi-file processing is ready.")
    else:
        print("\n⚠️ Some tests failed. Check the implementation.")

    return success


if __name__ == "__main__":
    asyncio.run(main())
|
||||
Loading…
Reference in a new issue