From ac755681b35dc6a62baff8df24679968641aa793 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Tue, 14 Oct 2025 00:30:47 +0200 Subject: [PATCH] fixed chunk parsing --- modules/services/serviceAi/mainServiceAi.py | 79 ++++++------ .../serviceGeneration/subPromptBuilder.py | 116 ++++++------------ test_document_processing.py | 8 +- 3 files changed, 76 insertions(+), 127 deletions(-) diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index 927f696e..8f2964e2 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -1204,8 +1204,10 @@ class AiService: results_by_document[doc_id].sort(key=lambda x: x.chunkIndex) # Merge JSON results for each document + all_documents = [] all_sections = [] document_titles = [] + combined_metadata = {"title": "Merged Document", "splitStrategy": "by_section"} for doc_id, doc_chunks in results_by_document.items(): # Process each chunk's JSON result @@ -1218,12 +1220,23 @@ class AiService: # Check if this is a multi-file response (has "documents" key) if isinstance(chunk_json, dict) and "documents" in chunk_json: - # This is a multi-file response - return it as-is - logger.info("Detected multi-file response from AI - preserving structure") - return chunk_json + # This is a multi-file response - merge all documents + logger.debug(f"Processing multi-file response from chunk {chunk_result.chunkIndex} with {len(chunk_json['documents'])} documents") + + # Add all documents from this chunk + for doc in chunk_json["documents"]: + # Add chunk context to document + doc["metadata"] = doc.get("metadata", {}) + doc["metadata"]["source_chunk"] = chunk_result.chunkIndex + doc["metadata"]["source_document"] = doc_id + all_documents.append(doc) + + # Update combined metadata + if "metadata" in chunk_json: + combined_metadata.update(chunk_json["metadata"]) - # Extract sections from single-file response - if isinstance(chunk_json, dict) and "sections" in chunk_json: + # Extract sections from single-file response (fallback) + elif isinstance(chunk_json, dict) and "sections" in chunk_json: for section in chunk_json["sections"]: # Add document context to section section["metadata"] = section.get("metadata", {}) @@ -1276,7 +1289,15 @@ class AiService: # Sort sections by order all_sections.sort(key=lambda x: x.get("order", 0)) - # Create merged document + # If we have merged documents from multi-file responses, return them + if all_documents: + logger.info(f"Merged {len(all_documents)} documents from {len(chunkResults)} chunks") + return { + "metadata": combined_metadata, + "documents": all_documents + } + + # Otherwise, create merged document with sections (single-file fallback) merged_document = { "metadata": { "title": document_titles[0] if document_titles else "Merged Document", @@ -1641,60 +1662,36 @@ class AiService: # Debug: Show what content is being processed (before filtering) for i, ec in enumerate(extractionResult): - logger.debug(f"ContentExtracted {i}: id={ec.id}, parts={len(ec.parts) if hasattr(ec, 'parts') else 'no parts'}") - - # Check each part within the ContentExtracted if hasattr(ec, 'parts'): for j, part in enumerate(ec.parts): - if hasattr(part, 'data') and part.data: - logger.debug(f" Part {j} content preview: {part.data[:200]}...") - else: - # Check what attributes the part actually has - part_attrs = [attr for attr in dir(part) if not attr.startswith('_')] + if not (hasattr(part, 'data') and part.data): + # Check if this is an empty container chunk (which is expected) part_type = getattr(part, 'typeGroup', None) part_mime = getattr(part, 'mimeType', '') - has_data = hasattr(part, 'data') and bool(part.data) - logger.debug(f" Part {j} DEBUG: available_attrs={part_attrs}") - logger.debug(f" Part {j} DEBUG: typeGroup='{part_type}', mimeType='{part_mime}', has_data={has_data}") + is_empty_container = ( + part_type == "container" and + part_mime and + 'document' in part_mime.lower() + ) - # Check if this is an empty container chunk (which is expected) - is_empty_container = False - if part_type == "container" and part_mime and 'document' in part_mime.lower(): - is_empty_container = True - - if is_empty_container: - logger.debug(f" Part {j} is empty container (will be filtered out) - mimeType={part_mime}") - else: - logger.warning(f" Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}', attrs={part_attrs}") - else: - logger.warning(f"ContentExtracted {i} has no parts attribute") + if not is_empty_container: + logger.warning(f"Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}'") chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True) # Debug: Show what chunks were actually processed (after filtering) logger.info(f"After filtering: {len(chunkResults)} chunks will be processed") - for i, chunk_result in enumerate(chunkResults): - if chunk_result and chunk_result.metadata.get("success", False): - logger.debug(f"Processed chunk {i}: {chunk_result.metadata.get('typeGroup', 'unknown')} - {len(chunk_result.aiResult)} chars") - else: - logger.debug(f"Processed chunk {i}: error or skipped") # Merge with JSON mode mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options) # Debug: Show what the AI actually returned logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}") - if 'sections' in mergedJsonDocument: - logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}") - if mergedJsonDocument['sections']: - logger.debug(f"First section preview: {str(mergedJsonDocument['sections'][0])[:200]}...") - else: - logger.warning("AI returned empty sections array") if 'documents' in mergedJsonDocument: logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}") - else: - logger.warning("AI did not return 'documents' key - this is single-file format") + elif 'sections' in mergedJsonDocument: + logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}") return mergedJsonDocument diff --git a/modules/services/serviceGeneration/subPromptBuilder.py b/modules/services/serviceGeneration/subPromptBuilder.py index 31ffb26e..7b7342bc 100644 --- a/modules/services/serviceGeneration/subPromptBuilder.py +++ b/modules/services/serviceGeneration/subPromptBuilder.py @@ -38,85 +38,43 @@ async def buildAdaptiveExtractionPrompt( # Build adaptive prompt using AI analysis - match single-file style if promptAnalysis.get("is_multi_file", False): - # Check if this is JSON email data - is_json_email = any(keyword in userPrompt.lower() for keyword in ['email', 'mail', 'json', 'message', 'conversation']) - - if is_json_email: - # Specialized prompt for JSON email data - multi_file_example = { - "metadata": { - "title": "Email Conversations", - "splitStrategy": "per_entity" - }, - "documents": [ - { - "id": "doc_1", - "title": "Email from SENDER to RECIPIENT", - "filename": "email_sender_to_recipient.txt", - "sections": [ - { - "id": "section_1", - "content_type": "heading", - "elements": [ - { - "text": "Email from SENDER to RECIPIENT", - "level": 1 - } - ], - "order": 1 - }, - { - "id": "section_2", - "content_type": "paragraph", - "elements": [ - { - "text": "FULL_EMAIL_CONTENT_HERE" - } - ], - "order": 2 - } - ] - } - ] - } - else: - # Generic multi-file prompt - multi_file_example = { - "metadata": { - "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE", - "splitStrategy": "by_section" - }, - "documents": [ - { - "id": "doc_1", - "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE", - "filename": "REPLACE_WITH_ACTUAL_FILENAME", - "sections": [ - { - "id": "section_1", - "content_type": "heading", - "elements": [ - { - "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT", - "level": 1 - } - ], - "order": 1 - }, - { - "id": "section_2", - "content_type": "paragraph", - "elements": [ - { - "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT" - } - ], - "order": 2 - } - ] - } - ] - } + # Multi-file prompt - use simple example format like single-file + multi_file_example = { + "metadata": { + "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE", + "splitStrategy": "by_section" + }, + "documents": [ + { + "id": "doc_1", + "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE", + "filename": "REPLACE_WITH_ACTUAL_FILENAME", + "sections": [ + { + "id": "section_1", + "content_type": "heading", + "elements": [ + { + "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT", + "level": 1 + } + ], + "order": 1 + }, + { + "id": "section_2", + "content_type": "paragraph", + "elements": [ + { + "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT" + } + ], + "order": 2 + } + ] + } + ] + } adaptive_prompt = f""" {userPrompt} diff --git a/test_document_processing.py b/test_document_processing.py index 49a42f72..7d6bb64f 100644 --- a/test_document_processing.py +++ b/test_document_processing.py @@ -20,15 +20,9 @@ from modules.services.serviceAi.mainServiceAi import AiService from modules.services.serviceGeneration.mainServiceGeneration import GenerationService # Set up logging -logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) -# Set all module loggers to DEBUG level -logging.getLogger('modules.services.serviceAi.mainServiceAi').setLevel(logging.DEBUG) -logging.getLogger('modules.services.serviceGeneration.mainServiceGeneration').setLevel(logging.DEBUG) -logging.getLogger('modules.services.serviceGeneration.subPromptBuilder').setLevel(logging.DEBUG) -logging.getLogger('modules.services.serviceExtraction.mainServiceExtraction').setLevel(logging.DEBUG) - async def process_documents_and_generate_summary(): """Process documents using the main AI service with intelligent chunk integration."""