fixed chunk parsing

2025-10-14 00:30:47 +02:00 · 2025-10-14 00:30:47 +02:00 · ac755681b3
commit ac755681b3
parent 0bc71c99d5
3 changed files with 76 additions and 127 deletions
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -1204,8 +1204,10 @@ class AiService:
            results_by_document[doc_id].sort(key=lambda x: x.chunkIndex)
        
        # Merge JSON results for each document
+        all_documents = []
        all_sections = []
        document_titles = []
+        combined_metadata = {"title": "Merged Document", "splitStrategy": "by_section"}
        
        for doc_id, doc_chunks in results_by_document.items():
            # Process each chunk's JSON result
@@ -1218,12 +1220,23 @@ class AiService:
                        
                        # Check if this is a multi-file response (has "documents" key)
                        if isinstance(chunk_json, dict) and "documents" in chunk_json:
-                            # This is a multi-file response - return it as-is
-                            logger.info("Detected multi-file response from AI - preserving structure")
-                            return chunk_json
+                            # This is a multi-file response - merge all documents
+                            logger.debug(f"Processing multi-file response from chunk {chunk_result.chunkIndex} with {len(chunk_json['documents'])} documents")
+                            
+                            # Add all documents from this chunk
+                            for doc in chunk_json["documents"]:
+                                # Add chunk context to document
+                                doc["metadata"] = doc.get("metadata", {})
+                                doc["metadata"]["source_chunk"] = chunk_result.chunkIndex
+                                doc["metadata"]["source_document"] = doc_id
+                                all_documents.append(doc)
+                            
+                            # Update combined metadata
+                            if "metadata" in chunk_json:
+                                combined_metadata.update(chunk_json["metadata"])
                        
-                        # Extract sections from single-file response
-                        if isinstance(chunk_json, dict) and "sections" in chunk_json:
+                        # Extract sections from single-file response (fallback)
+                        elif isinstance(chunk_json, dict) and "sections" in chunk_json:
                            for section in chunk_json["sections"]:
                                # Add document context to section
                                section["metadata"] = section.get("metadata", {})
@@ -1276,7 +1289,15 @@ class AiService:
        # Sort sections by order
        all_sections.sort(key=lambda x: x.get("order", 0))
        
-        # Create merged document
+        # If we have merged documents from multi-file responses, return them
+        if all_documents:
+            logger.info(f"Merged {len(all_documents)} documents from {len(chunkResults)} chunks")
+            return {
+                "metadata": combined_metadata,
+                "documents": all_documents
+            }
+        
+        # Otherwise, create merged document with sections (single-file fallback)
        merged_document = {
            "metadata": {
                "title": document_titles[0] if document_titles else "Merged Document",
@@ -1641,60 +1662,36 @@ class AiService:
            
            # Debug: Show what content is being processed (before filtering)
            for i, ec in enumerate(extractionResult):
-                logger.debug(f"ContentExtracted {i}: id={ec.id}, parts={len(ec.parts) if hasattr(ec, 'parts') else 'no parts'}")
-                
-                # Check each part within the ContentExtracted
                if hasattr(ec, 'parts'):
                    for j, part in enumerate(ec.parts):
-                        if hasattr(part, 'data') and part.data:
-                            logger.debug(f"  Part {j} content preview: {part.data[:200]}...")
-                        else:
-                            # Check what attributes the part actually has
-                            part_attrs = [attr for attr in dir(part) if not attr.startswith('_')]
+                        if not (hasattr(part, 'data') and part.data):
+                            # Check if this is an empty container chunk (which is expected)
                            part_type = getattr(part, 'typeGroup', None)
                            part_mime = getattr(part, 'mimeType', '')
-                            has_data = hasattr(part, 'data') and bool(part.data)
                            
-                            logger.debug(f"  Part {j} DEBUG: available_attrs={part_attrs}")
-                            logger.debug(f"  Part {j} DEBUG: typeGroup='{part_type}', mimeType='{part_mime}', has_data={has_data}")
+                            is_empty_container = (
+                                part_type == "container" and 
+                                part_mime and 
+                                'document' in part_mime.lower()
+                            )
                            
-                            # Check if this is an empty container chunk (which is expected)
-                            is_empty_container = False
-                            if part_type == "container" and part_mime and 'document' in part_mime.lower():
-                                is_empty_container = True
-                            
-                            if is_empty_container:
-                                logger.debug(f"  Part {j} is empty container (will be filtered out) - mimeType={part_mime}")
-                            else:
-                                logger.warning(f"  Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}', attrs={part_attrs}")
-                else:
-                    logger.warning(f"ContentExtracted {i} has no parts attribute")
+                            if not is_empty_container:
+                                logger.warning(f"Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}'")
            
            chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True)
            
            # Debug: Show what chunks were actually processed (after filtering)
            logger.info(f"After filtering: {len(chunkResults)} chunks will be processed")
-            for i, chunk_result in enumerate(chunkResults):
-                if chunk_result and chunk_result.metadata.get("success", False):
-                    logger.debug(f"Processed chunk {i}: {chunk_result.metadata.get('typeGroup', 'unknown')} - {len(chunk_result.aiResult)} chars")
-                else:
-                    logger.debug(f"Processed chunk {i}: error or skipped")
            
            # Merge with JSON mode
            mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
            
            # Debug: Show what the AI actually returned
            logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}")
-            if 'sections' in mergedJsonDocument:
-                logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}")
-                if mergedJsonDocument['sections']:
-                    logger.debug(f"First section preview: {str(mergedJsonDocument['sections'][0])[:200]}...")
-                else:
-                    logger.warning("AI returned empty sections array")
            if 'documents' in mergedJsonDocument:
                logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}")
-            else:
-                logger.warning("AI did not return 'documents' key - this is single-file format")
+            elif 'sections' in mergedJsonDocument:
+                logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}")
            
            return mergedJsonDocument
            
--- a/modules/services/serviceGeneration/subPromptBuilder.py
+++ b/modules/services/serviceGeneration/subPromptBuilder.py
@@ -38,85 +38,43 @@ async def buildAdaptiveExtractionPrompt(
    
    # Build adaptive prompt using AI analysis - match single-file style
    if promptAnalysis.get("is_multi_file", False):
-        # Check if this is JSON email data
-        is_json_email = any(keyword in userPrompt.lower() for keyword in ['email', 'mail', 'json', 'message', 'conversation'])
-        
-        if is_json_email:
-            # Specialized prompt for JSON email data
-            multi_file_example = {
-                "metadata": {
-                    "title": "Email Conversations",
-                    "splitStrategy": "per_entity"
-                },
-                "documents": [
-                    {
-                        "id": "doc_1",
-                        "title": "Email from SENDER to RECIPIENT",
-                        "filename": "email_sender_to_recipient.txt",
-                        "sections": [
-                            {
-                                "id": "section_1",
-                                "content_type": "heading",
-                                "elements": [
-                                    {
-                                        "text": "Email from SENDER to RECIPIENT",
-                                        "level": 1
-                                    }
-                                ],
-                                "order": 1
-                            },
-                            {
-                                "id": "section_2",
-                                "content_type": "paragraph",
-                                "elements": [
-                                    {
-                                        "text": "FULL_EMAIL_CONTENT_HERE"
-                                    }
-                                ],
-                                "order": 2
-                            }
-                        ]
-                    }
-                ]
-            }
-        else:
-            # Generic multi-file prompt
-            multi_file_example = {
-                "metadata": {
-                    "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
-                    "splitStrategy": "by_section"
-                },
-                "documents": [
-                    {
-                        "id": "doc_1",
-                        "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
-                        "filename": "REPLACE_WITH_ACTUAL_FILENAME",
-                        "sections": [
-                            {
-                                "id": "section_1",
-                                "content_type": "heading",
-                                "elements": [
-                                    {
-                                        "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
-                                        "level": 1
-                                    }
-                                ],
-                                "order": 1
-                            },
-                            {
-                                "id": "section_2",
-                                "content_type": "paragraph",
-                                "elements": [
-                                    {
-                                        "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
-                                    }
-                                ],
-                                "order": 2
-                            }
-                        ]
-                    }
-                ]
-            }
+        # Multi-file prompt - use simple example format like single-file
+        multi_file_example = {
+            "metadata": {
+                "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
+                "splitStrategy": "by_section"
+            },
+            "documents": [
+                {
+                    "id": "doc_1",
+                    "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
+                    "filename": "REPLACE_WITH_ACTUAL_FILENAME",
+                    "sections": [
+                        {
+                            "id": "section_1",
+                            "content_type": "heading",
+                            "elements": [
+                                {
+                                    "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
+                                    "level": 1
+                                }
+                            ],
+                            "order": 1
+                        },
+                        {
+                            "id": "section_2",
+                            "content_type": "paragraph",
+                            "elements": [
+                                {
+                                    "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
+                                }
+                            ],
+                            "order": 2
+                        }
+                    ]
+                }
+            ]
+        }
        
        adaptive_prompt = f"""
 {userPrompt}
--- a/test_document_processing.py
+++ b/test_document_processing.py
@@ -20,15 +20,9 @@ from modules.services.serviceAi.mainServiceAi import AiService
 from modules.services.serviceGeneration.mainServiceGeneration import GenerationService

 # Set up logging
-logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

-# Set all module loggers to DEBUG level
-logging.getLogger('modules.services.serviceAi.mainServiceAi').setLevel(logging.DEBUG)
-logging.getLogger('modules.services.serviceGeneration.mainServiceGeneration').setLevel(logging.DEBUG)
-logging.getLogger('modules.services.serviceGeneration.subPromptBuilder').setLevel(logging.DEBUG)
-logging.getLogger('modules.services.serviceExtraction.mainServiceExtraction').setLevel(logging.DEBUG)
-

 async def process_documents_and_generate_summary():
    """Process documents using the main AI service with intelligent chunk integration."""