fixed chunk parsing
parent 0bc71c99d5
commit ac755681b3
3 changed files with 76 additions and 127 deletions
@@ -1204,8 +1204,10 @@ class AiService:
             results_by_document[doc_id].sort(key=lambda x: x.chunkIndex)
 
         # Merge JSON results for each document
+        all_documents = []
         all_sections = []
         document_titles = []
+        combined_metadata = {"title": "Merged Document", "splitStrategy": "by_section"}
 
         for doc_id, doc_chunks in results_by_document.items():
             # Process each chunk's JSON result
@@ -1218,12 +1220,23 @@ class AiService:
 
                 # Check if this is a multi-file response (has "documents" key)
                 if isinstance(chunk_json, dict) and "documents" in chunk_json:
-                    # This is a multi-file response - return it as-is
-                    logger.info("Detected multi-file response from AI - preserving structure")
-                    return chunk_json
+                    # This is a multi-file response - merge all documents
+                    logger.debug(f"Processing multi-file response from chunk {chunk_result.chunkIndex} with {len(chunk_json['documents'])} documents")
+
+                    # Add all documents from this chunk
+                    for doc in chunk_json["documents"]:
+                        # Add chunk context to document
+                        doc["metadata"] = doc.get("metadata", {})
+                        doc["metadata"]["source_chunk"] = chunk_result.chunkIndex
+                        doc["metadata"]["source_document"] = doc_id
+                        all_documents.append(doc)
+
+                    # Update combined metadata
+                    if "metadata" in chunk_json:
+                        combined_metadata.update(chunk_json["metadata"])
 
-                # Extract sections from single-file response
-                if isinstance(chunk_json, dict) and "sections" in chunk_json:
+                # Extract sections from single-file response (fallback)
+                elif isinstance(chunk_json, dict) and "sections" in chunk_json:
                     for section in chunk_json["sections"]:
                         # Add document context to section
                         section["metadata"] = section.get("metadata", {})
@@ -1276,7 +1289,15 @@ class AiService:
         # Sort sections by order
         all_sections.sort(key=lambda x: x.get("order", 0))
 
-        # Create merged document
+        # If we have merged documents from multi-file responses, return them
+        if all_documents:
+            logger.info(f"Merged {len(all_documents)} documents from {len(chunkResults)} chunks")
+            return {
+                "metadata": combined_metadata,
+                "documents": all_documents
+            }
+
+        # Otherwise, create merged document with sections (single-file fallback)
         merged_document = {
             "metadata": {
                 "title": document_titles[0] if document_titles else "Merged Document",
@@ -1641,60 +1662,36 @@ class AiService:
 
         # Debug: Show what content is being processed (before filtering)
         for i, ec in enumerate(extractionResult):
             logger.debug(f"ContentExtracted {i}: id={ec.id}, parts={len(ec.parts) if hasattr(ec, 'parts') else 'no parts'}")
 
             # Check each part within the ContentExtracted
             if hasattr(ec, 'parts'):
                 for j, part in enumerate(ec.parts):
                     if hasattr(part, 'data') and part.data:
                         logger.debug(f"  Part {j} content preview: {part.data[:200]}...")
-                    else:
-                        # Check what attributes the part actually has
-                        part_attrs = [attr for attr in dir(part) if not attr.startswith('_')]
+                    if not (hasattr(part, 'data') and part.data):
+                        # Check if this is an empty container chunk (which is expected)
                         part_type = getattr(part, 'typeGroup', None)
                         part_mime = getattr(part, 'mimeType', '')
-                        has_data = hasattr(part, 'data') and bool(part.data)
-
-                        logger.debug(f"  Part {j} DEBUG: available_attrs={part_attrs}")
-                        logger.debug(f"  Part {j} DEBUG: typeGroup='{part_type}', mimeType='{part_mime}', has_data={has_data}")
-                        is_empty_container = (
-                            part_type == "container" and
-                            part_mime and
-                            'document' in part_mime.lower()
-                        )
-
-                        # Check if this is an empty container chunk (which is expected)
                         is_empty_container = False
                         if part_type == "container" and part_mime and 'document' in part_mime.lower():
                             is_empty_container = True
 
                         if is_empty_container:
                             logger.debug(f"  Part {j} is empty container (will be filtered out) - mimeType={part_mime}")
-                        else:
-                            logger.warning(f"  Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}', attrs={part_attrs}")
-            else:
-                logger.warning(f"ContentExtracted {i} has no parts attribute")
+                        if not is_empty_container:
+                            logger.warning(f"Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}'")
 
         chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True)
 
         # Debug: Show what chunks were actually processed (after filtering)
         logger.info(f"After filtering: {len(chunkResults)} chunks will be processed")
         for i, chunk_result in enumerate(chunkResults):
             if chunk_result and chunk_result.metadata.get("success", False):
                 logger.debug(f"Processed chunk {i}: {chunk_result.metadata.get('typeGroup', 'unknown')} - {len(chunk_result.aiResult)} chars")
             else:
                 logger.debug(f"Processed chunk {i}: error or skipped")
 
         # Merge with JSON mode
         mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
 
         # Debug: Show what the AI actually returned
         logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}")
-        if 'sections' in mergedJsonDocument:
-            logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}")
-            if mergedJsonDocument['sections']:
-                logger.debug(f"First section preview: {str(mergedJsonDocument['sections'][0])[:200]}...")
-            else:
-                logger.warning("AI returned empty sections array")
+        if 'documents' in mergedJsonDocument:
+            logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}")
+        elif 'sections' in mergedJsonDocument:
+            logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}")
+        else:
+            logger.warning("AI did not return 'documents' key - this is single-file format")
 
         return mergedJsonDocument
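The simplified debug path above hinges on one predicate: a part with no data is acceptable only when it is a container part with a document mimeType (an empty wrapper that the filter drops); any other data-less part warrants a warning. Extracted as a standalone function it would look roughly like this (attribute names taken from the diff; the function itself is illustrative):

def is_empty_container(part) -> bool:
    """True for data-less container parts (document wrappers), which are expected
    and safely filtered before chunk processing."""
    if getattr(part, "data", None):
        return False
    part_type = getattr(part, "typeGroup", None)
    part_mime = getattr(part, "mimeType", "") or ""
    return part_type == "container" and "document" in part_mime.lower()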
@@ -38,85 +38,43 @@ async def buildAdaptiveExtractionPrompt(
 
     # Build adaptive prompt using AI analysis - match single-file style
     if promptAnalysis.get("is_multi_file", False):
-        # Check if this is JSON email data
-        is_json_email = any(keyword in userPrompt.lower() for keyword in ['email', 'mail', 'json', 'message', 'conversation'])
-
-        if is_json_email:
-            # Specialized prompt for JSON email data
-            multi_file_example = {
-                "metadata": {
-                    "title": "Email Conversations",
-                    "splitStrategy": "per_entity"
-                },
-                "documents": [
-                    {
-                        "id": "doc_1",
-                        "title": "Email from SENDER to RECIPIENT",
-                        "filename": "email_sender_to_recipient.txt",
-                        "sections": [
-                            {
-                                "id": "section_1",
-                                "content_type": "heading",
-                                "elements": [
-                                    {
-                                        "text": "Email from SENDER to RECIPIENT",
-                                        "level": 1
-                                    }
-                                ],
-                                "order": 1
-                            },
-                            {
-                                "id": "section_2",
-                                "content_type": "paragraph",
-                                "elements": [
-                                    {
-                                        "text": "FULL_EMAIL_CONTENT_HERE"
-                                    }
-                                ],
-                                "order": 2
-                            }
-                        ]
-                    }
-                ]
-            }
-        else:
-            # Generic multi-file prompt
-            multi_file_example = {
-                "metadata": {
-                    "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
-                    "splitStrategy": "by_section"
-                },
-                "documents": [
-                    {
-                        "id": "doc_1",
-                        "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
-                        "filename": "REPLACE_WITH_ACTUAL_FILENAME",
-                        "sections": [
-                            {
-                                "id": "section_1",
-                                "content_type": "heading",
-                                "elements": [
-                                    {
-                                        "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
-                                        "level": 1
-                                    }
-                                ],
-                                "order": 1
-                            },
-                            {
-                                "id": "section_2",
-                                "content_type": "paragraph",
-                                "elements": [
-                                    {
-                                        "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
-                                    }
-                                ],
-                                "order": 2
-                            }
-                        ]
-                    }
-                ]
-            }
+        # Multi-file prompt - use simple example format like single-file
+        multi_file_example = {
+            "metadata": {
+                "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
+                "splitStrategy": "by_section"
+            },
+            "documents": [
+                {
+                    "id": "doc_1",
+                    "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
+                    "filename": "REPLACE_WITH_ACTUAL_FILENAME",
+                    "sections": [
+                        {
+                            "id": "section_1",
+                            "content_type": "heading",
+                            "elements": [
+                                {
+                                    "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
+                                    "level": 1
+                                }
+                            ],
+                            "order": 1
+                        },
+                        {
+                            "id": "section_2",
+                            "content_type": "paragraph",
+                            "elements": [
+                                {
+                                    "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
+                                }
+                            ],
+                            "order": 2
+                        }
+                    ]
+                }
+            ]
+        }
 
         adaptive_prompt = f"""
 {userPrompt}
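The f-string is truncated here; presumably the multi_file_example dict is serialized into the prompt body after the user's instructions. A hedged sketch of how that embedding could look (the surrounding wording is an assumption; the diff only shows the opening of the f-string and the {userPrompt} interpolation):

import json

def render_adaptive_prompt(user_prompt: str, multi_file_example: dict) -> str:
    """Append the JSON schema example to the user's prompt so the model can mirror it."""
    return (
        f"{user_prompt}\n\n"
        "Return JSON that matches this structure exactly, replacing placeholder values:\n"
        f"{json.dumps(multi_file_example, indent=2)}"
    )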
@@ -20,15 +20,9 @@ from modules.services.serviceAi.mainServiceAi import AiService
 from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
 
 # Set up logging
-logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
-# Set all module loggers to DEBUG level
-logging.getLogger('modules.services.serviceAi.mainServiceAi').setLevel(logging.DEBUG)
-logging.getLogger('modules.services.serviceGeneration.mainServiceGeneration').setLevel(logging.DEBUG)
-logging.getLogger('modules.services.serviceGeneration.subPromptBuilder').setLevel(logging.DEBUG)
-logging.getLogger('modules.services.serviceExtraction.mainServiceExtraction').setLevel(logging.DEBUG)
-
 
 async def process_documents_and_generate_summary():
     """Process documents using the main AI service with intelligent chunk integration."""