ai iteration loops running 21 of 22 test cases - fixing object tree extraction
This commit is contained in:
parent
24f152d0b9
commit
adbc29f069
7 changed files with 1071 additions and 127 deletions
|
|
@ -0,0 +1,72 @@
|
|||
Message 0 (user)
|
||||
Length: 2015 chars
|
||||
================================================================================
|
||||
User request: "Generate the first 1000 prime numbers."
|
||||
|
||||
Generate a NEW, COMPLETE JSON response. The template below shows ONLY the structure pattern - it is NOT existing content. Start from the beginning.
|
||||
|
||||
JSON structure template (reference only - shows the pattern):
|
||||
{
|
||||
"metadata": {
|
||||
"split_strategy": "single_document",
|
||||
"source_documents": [],
|
||||
"extraction_method": "ai_generation"
|
||||
},
|
||||
"documents": [
|
||||
{
|
||||
"id": "doc_1",
|
||||
"title": "Generated Document",
|
||||
"filename": "document.json",
|
||||
"sections": [
|
||||
{
|
||||
"id": "section_heading_example",
|
||||
"content_type": "heading",
|
||||
"elements": [
|
||||
{"level": 1, "text": "Heading Text"}
|
||||
],
|
||||
"order": 0
|
||||
},
|
||||
{
|
||||
"id": "section_paragraph_example",
|
||||
"content_type": "paragraph",
|
||||
"elements": [
|
||||
{"text": "Paragraph text content"}
|
||||
],
|
||||
"order": 0
|
||||
},
|
||||
{
|
||||
"id": "section_list_example",
|
||||
"content_type": "list",
|
||||
"elements": [
|
||||
{
|
||||
"items": [
|
||||
{"text": "Item 1"},
|
||||
{"text": "Item 2"}
|
||||
],
|
||||
"list_type": "numbered"
|
||||
}
|
||||
],
|
||||
"order": 0
|
||||
},
|
||||
{
|
||||
"id": "section_table_example",
|
||||
"content_type": "table",
|
||||
"elements": [
|
||||
{
|
||||
"headers": ["Column 1", "Column 2"],
|
||||
"rows": [
|
||||
["Row 1 Col 1", "Row 1 Col 2"],
|
||||
["Row 2 Col 1", "Row 2 Col 2"]
|
||||
],
|
||||
"caption": "Table caption"
|
||||
}
|
||||
],
|
||||
"order": 0
|
||||
},
|
||||
{
|
||||
"id": "section_code_example",
|
||||
"content_type": "code",
|
||||
"elements": [
|
||||
{
|
||||
"code... [truncated]
|
||||
================================================================================
|
||||
|
|
@ -0,0 +1,200 @@
|
|||
API Response
|
||||
Length: 6082 chars
|
||||
================================================================================
|
||||
{
|
||||
"metadata": {
|
||||
"split_strategy": "single_document",
|
||||
"source_documents": [],
|
||||
"extraction_method": "ai_generation"
|
||||
},
|
||||
"documents": [
|
||||
{
|
||||
"id": "doc_1",
|
||||
"title": "List of First 1000 Prime Numbers",
|
||||
"filename": "prime_numbers.json",
|
||||
"sections": [
|
||||
{
|
||||
"id": "section_prime_numbers",
|
||||
"content_type": "list",
|
||||
"elements": [
|
||||
{
|
||||
"items": [
|
||||
{"number": 2},
|
||||
{"number": 3},
|
||||
{"number": 5},
|
||||
{"number": 7},
|
||||
{"number": 11},
|
||||
{"number": 13},
|
||||
{"number": 17},
|
||||
{"number": 19},
|
||||
{"number": 23},
|
||||
{"number": 29},
|
||||
{"number": 31},
|
||||
{"number": 37},
|
||||
{"number": 41},
|
||||
{"number": 43},
|
||||
{"number": 47},
|
||||
{"number": 53},
|
||||
{"number": 59},
|
||||
{"number": 61},
|
||||
{"number": 67},
|
||||
{"number": 71},
|
||||
{"number": 73},
|
||||
{"number": 79},
|
||||
{"number": 83},
|
||||
{"number": 89},
|
||||
{"number": 97},
|
||||
{"number": 101},
|
||||
{"number": 103},
|
||||
{"number": 107},
|
||||
{"number": 109},
|
||||
{"number": 113},
|
||||
{"number": 127},
|
||||
{"number": 131},
|
||||
{"number": 137},
|
||||
{"number": 139},
|
||||
{"number": 149},
|
||||
{"number": 151},
|
||||
{"number": 157},
|
||||
{"number": 163},
|
||||
{"number": 167},
|
||||
{"number": 173},
|
||||
{"number": 179},
|
||||
{"number": 181},
|
||||
{"number": 191},
|
||||
{"number": 193},
|
||||
{"number": 197},
|
||||
{"number": 199},
|
||||
{"number": 211},
|
||||
{"number": 223},
|
||||
{"number": 227},
|
||||
{"number": 229},
|
||||
{"number": 233},
|
||||
{"number": 239},
|
||||
{"number": 241},
|
||||
{"number": 251},
|
||||
{"number": 257},
|
||||
{"number": 263},
|
||||
{"number": 269},
|
||||
{"number": 271},
|
||||
{"number": 277},
|
||||
{"number": 281},
|
||||
{"number": 283},
|
||||
{"number": 293},
|
||||
{"number": 307},
|
||||
{"number": 311},
|
||||
{"number": 313},
|
||||
{"number": 317},
|
||||
{"number": 331},
|
||||
{"number": 337},
|
||||
{"number": 347},
|
||||
{"number": 349},
|
||||
{"number": 353},
|
||||
{"number": 359},
|
||||
{"number": 367},
|
||||
{"number": 373},
|
||||
{"number": 379},
|
||||
{"number": 383},
|
||||
{"number": 389},
|
||||
{"number": 397},
|
||||
{"number": 401},
|
||||
{"number": 409},
|
||||
{"number": 419},
|
||||
{"number": 421},
|
||||
{"number": 431},
|
||||
{"number": 433},
|
||||
{"number": 439},
|
||||
{"number": 443},
|
||||
{"number": 449},
|
||||
{"number": 457},
|
||||
{"number": 461},
|
||||
{"number": 463},
|
||||
{"number": 467},
|
||||
{"number": 479},
|
||||
{"number": 487},
|
||||
{"number": 491},
|
||||
{"number": 499},
|
||||
{"number": 503},
|
||||
{"number": 509},
|
||||
{"number": 521},
|
||||
{"number": 523},
|
||||
{"number": 541},
|
||||
{"number": 547},
|
||||
{"number": 557},
|
||||
{"number": 563},
|
||||
{"number": 569},
|
||||
{"number": 571},
|
||||
{"number": 577},
|
||||
{"number": 587},
|
||||
{"number": 593},
|
||||
{"number": 599},
|
||||
{"number": 601},
|
||||
{"number": 607},
|
||||
{"number": 613},
|
||||
{"number": 617},
|
||||
{"number": 619},
|
||||
{"number": 631},
|
||||
{"number": 641},
|
||||
{"number": 643},
|
||||
{"number": 647},
|
||||
{"number": 653},
|
||||
{"number": 659},
|
||||
{"number": 661},
|
||||
{"number": 673},
|
||||
{"number": 677},
|
||||
{"number": 683},
|
||||
{"number": 691},
|
||||
{"number": 701},
|
||||
{"number": 709},
|
||||
{"number": 719},
|
||||
{"number": 727},
|
||||
{"number": 733},
|
||||
{"number": 739},
|
||||
{"number": 743},
|
||||
{"number": 751},
|
||||
{"number": 757},
|
||||
{"number": 761},
|
||||
{"number": 769},
|
||||
{"number": 773},
|
||||
{"number": 787},
|
||||
{"number": 797},
|
||||
{"number": 809},
|
||||
{"number": 811},
|
||||
{"number": 821},
|
||||
{"number": 823},
|
||||
{"number": 827},
|
||||
{"number": 829},
|
||||
{"number": 839},
|
||||
{"number": 853},
|
||||
{"number": 857},
|
||||
{"number": 859},
|
||||
{"number": 863},
|
||||
{"number": 877},
|
||||
{"number": 881},
|
||||
{"number": 883},
|
||||
{"number": 887},
|
||||
{"number": 907},
|
||||
{"number": 911},
|
||||
{"number": 919},
|
||||
{"number": 929},
|
||||
{"number": 937},
|
||||
{"number": 941},
|
||||
{"number": 947},
|
||||
{"number": 953},
|
||||
{"number": 967},
|
||||
{"number": 971},
|
||||
{"number": 977},
|
||||
{"number": 983},
|
||||
{"number": 991},
|
||||
{"number": 997}
|
||||
],
|
||||
"list_type": "numbered"
|
||||
}
|
||||
],
|
||||
"order": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
================================================================================
|
||||
|
|
@ -92,18 +92,6 @@ class AiObjects:
|
|||
|
||||
# Input bytes will be calculated inside _callWithModel
|
||||
|
||||
# Compress optionally (prompt/context) - simple truncation fallback kept here
|
||||
def _maybeTruncate(text: str, limit: int) -> str:
|
||||
data = text.encode("utf-8")
|
||||
if len(data) <= limit:
|
||||
return text
|
||||
return data[:limit].decode("utf-8", errors="ignore") + "... [truncated]"
|
||||
|
||||
if options.compressPrompt and len(prompt.encode("utf-8")) > 2000:
|
||||
prompt = _maybeTruncate(prompt, 2000)
|
||||
if options.compressContext and len(context.encode("utf-8")) > 70000:
|
||||
context = _maybeTruncate(context, 70000)
|
||||
|
||||
# Generation parameters are handled inside _callWithModel
|
||||
|
||||
# Get failover models for this operation type
|
||||
|
|
@ -129,7 +117,7 @@ class AiObjects:
|
|||
try:
|
||||
logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
|
||||
|
||||
# Call the model
|
||||
# Call the model directly - no truncation or compression here
|
||||
response = await self._callWithModel(model, prompt, context, options)
|
||||
|
||||
logger.info(f"✅ AI call successful with model: {model.name}")
|
||||
|
|
|
|||
|
|
@ -124,7 +124,9 @@ Respond with ONLY a JSON object in this exact format:
|
|||
self,
|
||||
prompt: str,
|
||||
options: AiCallOptions,
|
||||
debugPrefix: str = "ai_call"
|
||||
debugPrefix: str = "ai_call",
|
||||
promptBuilder: Optional[callable] = None,
|
||||
promptArgs: Optional[Dict[str, Any]] = None
|
||||
) -> str:
|
||||
"""
|
||||
Shared core function for AI calls with repair-based looping system.
|
||||
|
|
@ -141,6 +143,7 @@ Respond with ONLY a JSON object in this exact format:
|
|||
max_iterations = 50 # Prevent infinite loops
|
||||
iteration = 0
|
||||
allSections = [] # Accumulate all sections across iterations
|
||||
lastRawResponse = None # Store last raw JSON response for continuation
|
||||
|
||||
logger.debug(f"Starting AI call with repair-based looping (debug prefix: {debugPrefix})")
|
||||
|
||||
|
|
@ -149,14 +152,18 @@ Respond with ONLY a JSON object in this exact format:
|
|||
logger.debug(f"AI call iteration {iteration}/{max_iterations}")
|
||||
|
||||
# Build iteration prompt
|
||||
if len(allSections) > 0:
|
||||
# This is a continuation - build continuation context
|
||||
continuationContext = buildContinuationContext(allSections)
|
||||
logger.info(f"Continuation context: {continuationContext.get('section_count')} sections, next order: {continuationContext.get('next_order')}")
|
||||
if len(allSections) > 0 and promptBuilder and promptArgs:
|
||||
# This is a continuation - build continuation context with raw JSON and rebuild prompt
|
||||
continuationContext = buildContinuationContext(allSections, lastRawResponse)
|
||||
logger.info(f"Continuation context: {continuationContext.get('section_count')} sections")
|
||||
if lastRawResponse:
|
||||
logger.debug(f"Iteration {iteration}: Including previous response in continuation context ({len(lastRawResponse)} chars)")
|
||||
else:
|
||||
logger.warning(f"Iteration {iteration}: No previous response available for continuation!")
|
||||
|
||||
# If prompt contains a placeholder for continuation, inject the context
|
||||
# For now, we'll handle this at the calling code level
|
||||
iterationPrompt = prompt
|
||||
# Rebuild prompt with continuation context using the provided prompt builder
|
||||
iterationPrompt = await promptBuilder(**promptArgs, continuationContext=continuationContext)
|
||||
logger.debug(f"Rebuilt prompt with continuation context for iteration {iteration}")
|
||||
else:
|
||||
# First iteration - use original prompt
|
||||
iterationPrompt = prompt
|
||||
|
|
@ -179,6 +186,13 @@ Respond with ONLY a JSON object in this exact format:
|
|||
response = await self.aiObjects.call(request)
|
||||
result = response.content
|
||||
|
||||
# Debug: Check response immediately from API
|
||||
if iteration == 1 and result:
|
||||
first_chars = result[:200].replace('\n', '\\n').replace('\r', '\\r')
|
||||
logger.debug(f"Iteration 1: Raw API response starts with (first 200 chars): '{first_chars}'")
|
||||
if result.strip().startswith('},') or result.strip().startswith('],'):
|
||||
logger.error(f"Iteration 1: API returned fragment! Full start: '{result[:200]}'")
|
||||
|
||||
# Write raw AI response to debug file
|
||||
if iteration == 1:
|
||||
self.services.utils.writeDebugFile(result, f"{debugPrefix}_response")
|
||||
|
|
@ -196,10 +210,23 @@ Respond with ONLY a JSON object in this exact format:
|
|||
logger.warning(f"Iteration {iteration}: Empty response, stopping")
|
||||
break
|
||||
|
||||
# Store raw response for continuation (even if broken)
|
||||
lastRawResponse = result
|
||||
|
||||
# Check for complete_response flag in raw response (before parsing)
|
||||
import re
|
||||
if re.search(r'"complete_response"\s*:\s*true', result, re.IGNORECASE):
|
||||
logger.info(f"Iteration {iteration}: Detected complete_response flag in raw response")
|
||||
|
||||
# Extract sections from response (handles both valid and broken JSON)
|
||||
extractedSections, wasJsonComplete = self._extractSectionsFromResponse(result, iteration, debugPrefix)
|
||||
|
||||
if not extractedSections:
|
||||
# If we're in continuation mode and JSON was incomplete, don't stop - continue to allow retry
|
||||
if iteration > 1 and not wasJsonComplete:
|
||||
logger.warning(f"Iteration {iteration}: No sections extracted from continuation fragment, continuing for another attempt")
|
||||
continue
|
||||
# Otherwise, stop if no sections
|
||||
logger.warning(f"Iteration {iteration}: No sections extracted, stopping")
|
||||
break
|
||||
|
||||
|
|
@ -208,7 +235,7 @@ Respond with ONLY a JSON object in this exact format:
|
|||
logger.info(f"Iteration {iteration}: Extracted {len(extractedSections)} sections (total: {len(allSections)})")
|
||||
|
||||
# Check if we should continue (completion detection)
|
||||
if self._shouldContinueGeneration(allSections, iteration, wasJsonComplete):
|
||||
if self._shouldContinueGeneration(allSections, iteration, wasJsonComplete, result):
|
||||
logger.debug(f"Iteration {iteration}: Continuing generation")
|
||||
continue
|
||||
else:
|
||||
|
|
@ -241,6 +268,7 @@ Respond with ONLY a JSON object in this exact format:
|
|||
"""
|
||||
Extract sections from AI response, handling both valid and broken JSON.
|
||||
Uses repair mechanism for broken JSON.
|
||||
Checks for "complete_response": true flag to determine completion.
|
||||
Returns (sections, wasJsonComplete)
|
||||
"""
|
||||
# First, try to parse as valid JSON
|
||||
|
|
@ -248,14 +276,35 @@ Respond with ONLY a JSON object in this exact format:
|
|||
extracted = extractJsonString(result)
|
||||
parsed_result = json.loads(extracted)
|
||||
|
||||
# Check if AI marked response as complete
|
||||
isComplete = parsed_result.get("complete_response", False) == True
|
||||
if isComplete:
|
||||
logger.info(f"Iteration {iteration}: AI marked response as complete (complete_response: true)")
|
||||
|
||||
# Extract sections from parsed JSON
|
||||
sections = extractSectionsFromDocument(parsed_result)
|
||||
logger.debug(f"Iteration {iteration}: Valid JSON - extracted {len(sections)} sections")
|
||||
return sections, True # JSON was complete
|
||||
|
||||
# If AI marked as complete, always return as complete
|
||||
if isComplete:
|
||||
return sections, True
|
||||
|
||||
# If in continuation mode (iteration > 1), continuation responses are expected to be fragments
|
||||
# A fragment with 0 extractable sections means JSON is incomplete - need another iteration
|
||||
# Don't use repair mechanism - just mark as incomplete so loop continues
|
||||
if len(sections) == 0 and iteration > 1:
|
||||
logger.info(f"Iteration {iteration}: Continuation fragment with 0 extractable sections - JSON incomplete, continuing")
|
||||
return sections, False # Mark as incomplete so loop continues
|
||||
|
||||
# First iteration with 0 sections means empty response - stop
|
||||
if len(sections) == 0:
|
||||
return sections, True # Complete but empty
|
||||
|
||||
return sections, True # JSON was complete with sections
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
# Broken JSON - try repair mechanism
|
||||
logger.warning(f"Iteration {iteration}: Invalid JSON, attempting repair: {str(e)}")
|
||||
# Broken JSON - try repair mechanism (normal in iterative generation)
|
||||
logger.info(f"Iteration {iteration}: JSON incomplete/broken, attempting repair: {str(e)}")
|
||||
self.services.utils.writeDebugFile(result, f"{debugPrefix}_broken_json_iteration_{iteration}")
|
||||
|
||||
# Try to repair
|
||||
|
|
@ -279,16 +328,25 @@ Respond with ONLY a JSON object in this exact format:
|
|||
self,
|
||||
allSections: List[Dict[str, Any]],
|
||||
iteration: int,
|
||||
wasJsonComplete: bool
|
||||
wasJsonComplete: bool,
|
||||
rawResponse: str = None
|
||||
) -> bool:
|
||||
"""
|
||||
Determine if generation should continue based on JSON completeness.
|
||||
Determine if generation should continue based on JSON completeness and complete_response flag.
|
||||
Returns True if we should continue, False if done.
|
||||
"""
|
||||
if len(allSections) == 0:
|
||||
return True # No sections yet, continue
|
||||
|
||||
# Simple rule: if JSON was complete, we're done
|
||||
# Check for complete_response flag in raw response
|
||||
if rawResponse:
|
||||
import re
|
||||
# Look for complete_response: true pattern (allowing for whitespace variations)
|
||||
if re.search(r'"complete_response"\s*:\s*true', rawResponse, re.IGNORECASE):
|
||||
logger.info("AI marked response as complete (complete_response: true) - stopping generation")
|
||||
return False
|
||||
|
||||
# If JSON was complete (and no complete_response flag), we're done
|
||||
# If JSON was broken and repaired, continue to get more content
|
||||
if wasJsonComplete:
|
||||
logger.info("JSON was complete - stopping generation")
|
||||
|
|
@ -398,6 +456,15 @@ Respond with ONLY a JSON object in this exact format:
|
|||
else:
|
||||
logger.debug(f"Using provided options: operationType={options.operationType}, priority={options.priority}")
|
||||
|
||||
# CRITICAL: For document generation with JSON templates, NEVER compress the prompt
|
||||
# Compressing would truncate the template structure and confuse the AI
|
||||
if outputFormat: # Document generation with structured output
|
||||
if not options:
|
||||
options = AiCallOptions()
|
||||
options.compressPrompt = False # JSON templates must NOT be truncated
|
||||
options.compressContext = False # Context also should not be compressed
|
||||
logger.debug("Document generation detected - disabled prompt/context compression")
|
||||
|
||||
# Handle document generation with specific output format using unified approach
|
||||
if outputFormat:
|
||||
# Use unified generation method for all document generation
|
||||
|
|
@ -411,7 +478,22 @@ Respond with ONLY a JSON object in this exact format:
|
|||
from modules.services.serviceGeneration.subPromptBuilderGeneration import buildGenerationPrompt
|
||||
# First call without continuation context
|
||||
generation_prompt = await buildGenerationPrompt(outputFormat, prompt, title, extracted_content, None)
|
||||
generated_json = await self._callAiWithLooping(generation_prompt, options, "document_generation")
|
||||
|
||||
# Prepare prompt builder arguments for continuation
|
||||
promptArgs = {
|
||||
"outputFormat": outputFormat,
|
||||
"userPrompt": prompt,
|
||||
"title": title,
|
||||
"extracted_content": extracted_content
|
||||
}
|
||||
|
||||
generated_json = await self._callAiWithLooping(
|
||||
generation_prompt,
|
||||
options,
|
||||
"document_generation",
|
||||
buildGenerationPrompt,
|
||||
promptArgs
|
||||
)
|
||||
|
||||
# Parse the generated JSON (extract fenced/embedded JSON first)
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from typing import Dict, Any
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Centralized JSON structure template for document generation
|
||||
# Includes examples for all content types so AI knows the structure patterns
|
||||
TEMPLATE_JSON_DOCUMENT_GENERATION = """{
|
||||
"metadata": {
|
||||
"split_strategy": "single_document",
|
||||
|
|
@ -22,16 +23,60 @@ TEMPLATE_JSON_DOCUMENT_GENERATION = """{
|
|||
"filename": "document.json",
|
||||
"sections": [
|
||||
{
|
||||
"id": "section_1",
|
||||
"content_type": "heading|paragraph|table|list|code",
|
||||
"id": "section_heading_example",
|
||||
"content_type": "heading",
|
||||
"elements": [
|
||||
// heading: {"level": 1, "text": "..."}
|
||||
// paragraph: {"text": "..."}
|
||||
// table: {"headers": [...], "rows": [[...]], "caption": "..."}
|
||||
// list: {"items": [{"text": "...", "subitems": [...]}], "list_type": "bullet|numbered"}
|
||||
// code: {"code": "...", "language": "..."}
|
||||
{"level": 1, "text": "Heading Text"}
|
||||
],
|
||||
"order": 1
|
||||
"order": 0
|
||||
},
|
||||
{
|
||||
"id": "section_paragraph_example",
|
||||
"content_type": "paragraph",
|
||||
"elements": [
|
||||
{"text": "Paragraph text content"}
|
||||
],
|
||||
"order": 0
|
||||
},
|
||||
{
|
||||
"id": "section_list_example",
|
||||
"content_type": "list",
|
||||
"elements": [
|
||||
{
|
||||
"items": [
|
||||
{"text": "Item 1"},
|
||||
{"text": "Item 2"}
|
||||
],
|
||||
"list_type": "numbered"
|
||||
}
|
||||
],
|
||||
"order": 0
|
||||
},
|
||||
{
|
||||
"id": "section_table_example",
|
||||
"content_type": "table",
|
||||
"elements": [
|
||||
{
|
||||
"headers": ["Column 1", "Column 2"],
|
||||
"rows": [
|
||||
["Row 1 Col 1", "Row 1 Col 2"],
|
||||
["Row 2 Col 1", "Row 2 Col 2"]
|
||||
],
|
||||
"caption": "Table caption"
|
||||
}
|
||||
],
|
||||
"order": 0
|
||||
},
|
||||
{
|
||||
"id": "section_code_example",
|
||||
"content_type": "code",
|
||||
"elements": [
|
||||
{
|
||||
"code": "function example() { return true; }",
|
||||
"language": "javascript"
|
||||
}
|
||||
],
|
||||
"order": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -48,10 +93,10 @@ async def buildGenerationPrompt(
|
|||
) -> str:
|
||||
"""
|
||||
Build the unified generation prompt using a single JSON template.
|
||||
Simplified version without continuation logic in prompt.
|
||||
Generic solution that works for any user request.
|
||||
|
||||
Args:
|
||||
outputFormat: Target output format (html, pdf, docx, etc.)
|
||||
outputFormat: Target output format (html, pdf, docx, etc.) - not used in prompt
|
||||
userPrompt: User's original prompt for document generation
|
||||
title: Title for the document
|
||||
extracted_content: Optional extracted content from documents to prepend to prompt
|
||||
|
|
@ -64,63 +109,88 @@ async def buildGenerationPrompt(
|
|||
title_value = title if title else "Generated Document"
|
||||
json_template = TEMPLATE_JSON_DOCUMENT_GENERATION.replace("{{DOCUMENT_TITLE}}", title_value)
|
||||
|
||||
# Check if this is a continuation request
|
||||
if continuationContext and continuationContext.get("section_count", 0) > 0:
|
||||
# Continuation prompt - simple and focused
|
||||
section_count = continuationContext.get("section_count", 0)
|
||||
next_order = continuationContext.get("next_order", 1)
|
||||
last_content_sample = continuationContext.get("last_content_sample", "")
|
||||
# Build prompt based on whether this is a continuation or first call
|
||||
# Check if we have valid continuation context with actual JSON fragment
|
||||
has_continuation = (
|
||||
continuationContext
|
||||
and continuationContext.get("section_count", 0) > 0
|
||||
and continuationContext.get("last_raw_json", "")
|
||||
and continuationContext.get("last_raw_json", "").strip() != "{}"
|
||||
)
|
||||
|
||||
if has_continuation:
|
||||
# CONTINUATION PROMPT - user already received first part, continue from where it stopped
|
||||
last_raw_json = continuationContext.get("last_raw_json", "")
|
||||
last_item_object = continuationContext.get("last_item_object", "") # Full object like {"text": "value"}
|
||||
last_items_from_fragment = continuationContext.get("last_items_from_fragment", "")
|
||||
total_items_count = continuationContext.get("total_items_count", 0)
|
||||
|
||||
generation_prompt = f"""Continue generating structured JSON content.
|
||||
# Show the last few items to indicate where to continue (limit fragment size)
|
||||
# Extract just the ending portion of the JSON to show where it cut off
|
||||
fragment_snippet = ""
|
||||
if last_raw_json:
|
||||
# Show last 1500 chars or the whole thing if shorter - just enough to show the cut point
|
||||
fragment_snippet = last_raw_json[-1500:] if len(last_raw_json) > 1500 else last_raw_json
|
||||
# Add ellipsis if truncated
|
||||
if len(last_raw_json) > 1500:
|
||||
fragment_snippet = "..." + fragment_snippet
|
||||
|
||||
# Build clear continuation guidance
|
||||
continuation_guidance = []
|
||||
|
||||
if total_items_count > 0:
|
||||
continuation_guidance.append(f"You have already generated {total_items_count} items.")
|
||||
|
||||
# Show the last complete item object (full object format)
|
||||
if last_item_object:
|
||||
continuation_guidance.append(f"Last item in previous response: {last_item_object}. Continue with the NEXT item after this.")
|
||||
|
||||
continuation_text = "\n".join(continuation_guidance) if continuation_guidance else "Continue from where it stopped."
|
||||
|
||||
generation_prompt = f"""User request: "{userPrompt}"
|
||||
|
||||
ORIGINAL REQUEST: "{userPrompt}"
|
||||
TARGET FORMAT: {outputFormat}
|
||||
TITLE: "{title_value}"
|
||||
The user already received part of the response. Continue generating the remaining content.
|
||||
|
||||
CONTEXT - Already generated:
|
||||
- Total sections generated: {section_count}
|
||||
- Next section order: {next_order}
|
||||
- Last content: {last_content_sample}
|
||||
{continuation_text}
|
||||
|
||||
YOUR TASK:
|
||||
Continue where previous generation stopped.
|
||||
Generate the NEXT section(s) starting with section_{next_order}.
|
||||
Generate as much content as possible.
|
||||
Previous response ended here (JSON was cut off at this point):
|
||||
```json
|
||||
{fragment_snippet if fragment_snippet else "(No fragment available)"}
|
||||
```
|
||||
|
||||
RULES:
|
||||
- Follow the JSON template structure below exactly
|
||||
- Fill sections with ACTUAL data based on the user request
|
||||
- Use appropriate content_type for the data
|
||||
- Generate REAL content, not summaries or placeholders
|
||||
- Generate multiple sections if possible
|
||||
|
||||
Return raw JSON (no ```json blocks, no text before/after)
|
||||
|
||||
JSON Template
|
||||
JSON structure template:
|
||||
{json_template}
|
||||
|
||||
Instructions:
|
||||
- Return full JSON structure (metadata + documents + sections)
|
||||
- Continue from where it stopped - add NEW items only, do not repeat old items
|
||||
- Use the element structures shown in the template
|
||||
- Generate all remaining content needed to complete the user request
|
||||
- Fill with actual content (no comments, no "Add more..." text, no placeholders)
|
||||
- When fully complete, add "complete_response": true at root level
|
||||
- Return only valid JSON (no comments, no markdown blocks)
|
||||
|
||||
Continue generating:
|
||||
"""
|
||||
else:
|
||||
# First call - simple prompt without continuation complexity
|
||||
generation_prompt = f"""Generate structured JSON content for document creation.
|
||||
# FIRST CALL - initial generation
|
||||
generation_prompt = f"""User request: "{userPrompt}"
|
||||
|
||||
USER REQUEST: "{userPrompt}"
|
||||
TARGET FORMAT: {outputFormat}
|
||||
TITLE: "{title_value}"
|
||||
Generate a NEW, COMPLETE JSON response. The template below shows ONLY the structure pattern - it is NOT existing content. Start from the beginning.
|
||||
|
||||
INSTRUCTIONS:
|
||||
- Follow the JSON template structure below exactly
|
||||
- Emit only one JSON object in the response
|
||||
- Fill sections with ACTUAL data based on the user request
|
||||
- Use appropriate content_type for each section
|
||||
- Generate REAL content, not summaries or instructions
|
||||
- Structure content in sections with order 1, 2, 3...
|
||||
- Each section should be complete before next
|
||||
- Generate as much content as possible
|
||||
|
||||
Return raw JSON (no ```json blocks, no text before/after)
|
||||
|
||||
JSON Template
|
||||
JSON structure template (reference only - shows the pattern):
|
||||
{json_template}
|
||||
|
||||
Instructions:
|
||||
- Start your response with {{"metadata": ...}} - return COMPLETE JSON from the beginning
|
||||
- Do NOT continue from the template examples above - create your own sections
|
||||
- Generate content based on the user request
|
||||
- Use the element structures shown in the template (heading, paragraph, list, table, code)
|
||||
- Create your own section IDs (do not use the example IDs like "section_heading_example")
|
||||
- When fully complete, add "complete_response": true at root level
|
||||
- Return only valid JSON (no comments, no markdown blocks, no text before/after)
|
||||
|
||||
Generate your complete response starting from {{"metadata": ...}}:
|
||||
"""
|
||||
|
||||
# If we have extracted content, prepend it to the prompt
|
||||
|
|
|
|||
|
|
@ -138,42 +138,73 @@ def mergeRootLists(json_parts: List[Union[str, Dict, List]]) -> Dict[str, Any]:
|
|||
def repairBrokenJson(text: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Attempt to repair broken JSON using multiple strategies.
|
||||
Generic solution that works for any content type.
|
||||
Returns the best repair attempt or None if all fail.
|
||||
"""
|
||||
if not text:
|
||||
return None
|
||||
|
||||
# Strategy 1: Progressive parsing - try to find longest valid prefix
|
||||
# Strategy 1: Try to extract sections from the entire text first
|
||||
# This handles cases where the JSON structure is broken but content is intact
|
||||
extracted_sections = _extractSectionsRegex(text)
|
||||
if extracted_sections:
|
||||
logger.info(f"Extracted {len(extracted_sections)} sections using regex")
|
||||
return {
|
||||
"metadata": {
|
||||
"split_strategy": "single_document",
|
||||
"source_documents": [],
|
||||
"extraction_method": "ai_generation"
|
||||
},
|
||||
"documents": [{"sections": extracted_sections}]
|
||||
}
|
||||
|
||||
# Strategy 2: Progressive parsing - try to find longest valid prefix
|
||||
best_result = None
|
||||
best_valid_length = 0
|
||||
|
||||
for i in range(len(text), 0, -1):
|
||||
test_str = text[:i]
|
||||
closed_str = _closeJsonStructures(test_str)
|
||||
obj, err, _ = tryParseJson(closed_str)
|
||||
if err is None and isinstance(obj, dict):
|
||||
best_result = obj
|
||||
best_valid_length = i
|
||||
logger.debug(f"Progressive parsing success at length {i}")
|
||||
# Try different step sizes to find the best valid JSON
|
||||
for step_size in [100, 50, 10, 1]:
|
||||
for i in range(len(text), 0, -step_size):
|
||||
test_str = text[:i]
|
||||
closed_str = _closeJsonStructures(test_str)
|
||||
obj, err, _ = tryParseJson(closed_str)
|
||||
if err is None and isinstance(obj, dict):
|
||||
best_result = obj
|
||||
best_valid_length = i
|
||||
logger.debug(f"Progressive parsing success at length {i} (step: {step_size})")
|
||||
break
|
||||
if best_result:
|
||||
break
|
||||
|
||||
if best_result:
|
||||
logger.info(f"Repaired JSON using progressive parsing (valid length: {best_valid_length})")
|
||||
return best_result
|
||||
|
||||
# Check if we have sections in the result
|
||||
sections = extractSectionsFromDocument(best_result)
|
||||
if sections:
|
||||
logger.info(f"Progressive parsing found {len(sections)} sections")
|
||||
return best_result
|
||||
else:
|
||||
# No sections found in progressive parsing, try to extract from broken part
|
||||
logger.info("Progressive parsing found no sections, trying to extract from broken part")
|
||||
extracted_sections = _extractSectionsRegex(text[best_valid_length:])
|
||||
if extracted_sections:
|
||||
logger.info(f"Extracted {len(extracted_sections)} sections from broken part")
|
||||
# Merge with the valid part
|
||||
if "documents" not in best_result:
|
||||
best_result["documents"] = []
|
||||
if not best_result["documents"]:
|
||||
best_result["documents"] = [{"sections": []}]
|
||||
best_result["documents"][0]["sections"].extend(extracted_sections)
|
||||
return best_result
|
||||
|
||||
# Strategy 2: Structure closing - close incomplete structures
|
||||
# Strategy 3: Structure closing - close incomplete structures
|
||||
closed_str = _closeJsonStructures(text)
|
||||
obj, err, _ = tryParseJson(closed_str)
|
||||
if err is None and isinstance(obj, dict):
|
||||
logger.info("Repaired JSON using structure closing")
|
||||
return obj
|
||||
|
||||
# Strategy 3: Regex extraction (fallback for completely broken JSON)
|
||||
extracted = _extractSectionsRegex(text)
|
||||
if extracted:
|
||||
logger.info("Repaired JSON using regex extraction")
|
||||
return {"documents": [{"sections": extracted}]}
|
||||
|
||||
logger.warning("All repair strategies failed")
|
||||
return None
|
||||
|
||||
|
|
@ -204,7 +235,7 @@ def _closeJsonStructures(text: str) -> str:
|
|||
def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract sections from broken JSON using regex patterns.
|
||||
Fallback strategy when JSON is completely corrupted.
|
||||
Generic solution that works for any content type.
|
||||
"""
|
||||
import re
|
||||
|
||||
|
|
@ -218,10 +249,10 @@ def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]:
|
|||
content_type = match.group(2)
|
||||
order = int(match.group(3))
|
||||
|
||||
# Try to extract elements array
|
||||
# Try to extract elements array - look for the elements array after this section
|
||||
elements_match = re.search(
|
||||
r'"elements"\s*:\s*\[(.*?)\]',
|
||||
text[match.end():match.end()+500] # Look ahead for elements
|
||||
text[match.end():match.end()+5000] # Look ahead for elements (large range)
|
||||
)
|
||||
|
||||
elements = []
|
||||
|
|
@ -230,7 +261,9 @@ def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]:
|
|||
elements_str = '[' + elements_match.group(1) + ']'
|
||||
elements = json.loads(elements_str)
|
||||
except:
|
||||
pass
|
||||
# If JSON parsing fails, try to extract individual items manually
|
||||
elements_text = elements_match.group(1)
|
||||
elements = _extractElementsFromText(elements_text, content_type)
|
||||
|
||||
sections.append({
|
||||
"id": section_id,
|
||||
|
|
@ -239,6 +272,243 @@ def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]:
|
|||
"order": order
|
||||
})
|
||||
|
||||
# If no sections found with the main pattern, try to find any content patterns
|
||||
if not sections:
|
||||
sections = _extractGenericContent(text)
|
||||
|
||||
return sections
|
||||
|
||||
|
||||
def _extractElementsFromText(elements_text: str, content_type: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract elements from text when JSON parsing fails.
|
||||
Generic approach that works for any content type.
|
||||
Handles incomplete strings and corrupted data.
|
||||
Excludes the last incomplete item to prevent corrupted data.
|
||||
"""
|
||||
import re
|
||||
|
||||
elements = []
|
||||
|
||||
if content_type == "list":
|
||||
# Look for {"text": "..."} patterns, including incomplete ones
|
||||
text_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', elements_text)
|
||||
# Also look for incomplete patterns like {"text": "36
|
||||
incomplete_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', elements_text)
|
||||
|
||||
# Combine both complete and incomplete items
|
||||
all_items = text_items + incomplete_items
|
||||
# Remove duplicates and empty strings
|
||||
unique_items = list(dict.fromkeys([item for item in all_items if item.strip()]))
|
||||
|
||||
# Remove the last item if it appears to be incomplete/corrupted
|
||||
if unique_items:
|
||||
unique_items = _removeLastIncompleteItem(unique_items, elements_text)
|
||||
|
||||
elements = [{"text": item} for item in unique_items]
|
||||
|
||||
elif content_type == "paragraph":
|
||||
# Look for {"text": "..."} patterns, including incomplete ones
|
||||
text_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', elements_text)
|
||||
incomplete_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', elements_text)
|
||||
|
||||
all_items = text_items + incomplete_items
|
||||
unique_items = list(dict.fromkeys([item for item in all_items if item.strip()]))
|
||||
|
||||
# Remove the last item if it appears to be incomplete/corrupted
|
||||
if unique_items:
|
||||
unique_items = _removeLastIncompleteItem(unique_items, elements_text)
|
||||
|
||||
elements = [{"text": item} for item in unique_items]
|
||||
|
||||
elif content_type == "heading":
|
||||
# Look for {"level": X, "text": "..."} patterns, including incomplete ones
|
||||
heading_items = re.findall(r'\{"level"\s*:\s*(\d+)\s*,\s*"text"\s*:\s*"([^"]*)"\}', elements_text)
|
||||
incomplete_heading_items = re.findall(r'\{"level"\s*:\s*(\d+)\s*,\s*"text"\s*:\s*"([^"]*?)(?:\n|$)', elements_text)
|
||||
|
||||
all_items = heading_items + incomplete_heading_items
|
||||
unique_items = list(dict.fromkeys([(int(level), text) for level, text in all_items if text.strip()]))
|
||||
|
||||
# Remove the last item if it appears to be incomplete/corrupted
|
||||
if unique_items:
|
||||
unique_items = _removeLastIncompleteItem(unique_items, elements_text)
|
||||
|
||||
elements = [{"level": level, "text": text} for level, text in unique_items]
|
||||
|
||||
elif content_type == "table":
|
||||
# Look for table patterns
|
||||
table_items = re.findall(r'\{"headers"\s*:\s*\[(.*?)\]\s*,\s*"rows"\s*:\s*\[(.*?)\]\s*,\s*"caption"\s*:\s*"([^"]*)"\}', elements_text)
|
||||
for headers_str, rows_str, caption in table_items:
|
||||
# Extract headers
|
||||
headers = re.findall(r'"([^"]+)"', headers_str)
|
||||
# Extract rows (simplified)
|
||||
rows = []
|
||||
row_matches = re.findall(r'\[(.*?)\]', rows_str)
|
||||
for row_match in row_matches:
|
||||
row_items = re.findall(r'"([^"]+)"', row_match)
|
||||
rows.append(row_items)
|
||||
|
||||
elements.append({
|
||||
"headers": headers,
|
||||
"rows": rows,
|
||||
"caption": caption
|
||||
})
|
||||
|
||||
elif content_type == "code":
|
||||
# Look for {"code": "...", "language": "..."} patterns, including incomplete ones
|
||||
code_items = re.findall(r'\{"code"\s*:\s*"([^"]*)"\s*,\s*"language"\s*:\s*"([^"]*)"\}', elements_text)
|
||||
incomplete_code_items = re.findall(r'\{"code"\s*:\s*"([^"]*?)(?:\n|$)', elements_text)
|
||||
|
||||
all_items = code_items + [(code, "unknown") for code in incomplete_code_items]
|
||||
unique_items = list(dict.fromkeys([(code, lang) for code, lang in all_items if code.strip()]))
|
||||
|
||||
# Remove the last item if it appears to be incomplete/corrupted
|
||||
if unique_items:
|
||||
unique_items = _removeLastIncompleteItem(unique_items, elements_text)
|
||||
|
||||
elements = [{"code": code, "language": lang} for code, lang in unique_items]
|
||||
|
||||
else:
|
||||
# Generic fallback - look for any text content, including incomplete
|
||||
text_items = re.findall(r'"text"\s*:\s*"([^"]*)"', elements_text)
|
||||
incomplete_text_items = re.findall(r'"text"\s*:\s*"([^"]*?)(?:\n|$)', elements_text)
|
||||
|
||||
all_items = text_items + incomplete_text_items
|
||||
unique_items = list(dict.fromkeys([item for item in all_items if item.strip()]))
|
||||
|
||||
# Remove the last item if it appears to be incomplete/corrupted
|
||||
if unique_items:
|
||||
unique_items = _removeLastIncompleteItem(unique_items, elements_text)
|
||||
|
||||
elements = [{"text": item} for item in unique_items]
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
def _removeLastIncompleteItem(items: List[str], original_text: str) -> List[str]:
|
||||
"""
|
||||
Remove the last item if it appears to be incomplete/corrupted.
|
||||
This prevents corrupted data from being included in the final result.
|
||||
"""
|
||||
import re
|
||||
|
||||
if not items:
|
||||
return items
|
||||
|
||||
# Check if the original text ends with incomplete JSON patterns
|
||||
# Look for patterns that suggest the last item was cut off
|
||||
|
||||
# Pattern 1: Text ends with incomplete string like {"text": "36
|
||||
if re.search(r'\{"[^"]*"\s*:\s*"[^"]*$', original_text):
|
||||
logger.debug("Detected incomplete string at end - removing last item")
|
||||
return items[:-1]
|
||||
|
||||
# Pattern 2: Text ends with incomplete boolean like {"bool_flag": tr
|
||||
if re.search(r'\{"[^"]*"\s*:\s*(true|false|tr|fa)$', original_text):
|
||||
logger.debug("Detected incomplete boolean at end - removing last item")
|
||||
return items[:-1]
|
||||
|
||||
# Pattern 3: Text ends with incomplete number like {"number": 123
|
||||
if re.search(r'\{"[^"]*"\s*:\s*\d+$', original_text):
|
||||
logger.debug("Detected incomplete number at end - removing last item")
|
||||
return items[:-1]
|
||||
|
||||
# Pattern 4: Text ends with incomplete array like {"array": [1,2,3
|
||||
if re.search(r'\{"[^"]*"\s*:\s*\[[^\]]*$', original_text):
|
||||
logger.debug("Detected incomplete array at end - removing last item")
|
||||
return items[:-1]
|
||||
|
||||
# Pattern 5: Text ends with incomplete object like {"obj": {"key": "val
|
||||
if re.search(r'\{"[^"]*"\s*:\s*\{[^}]*$', original_text):
|
||||
logger.debug("Detected incomplete object at end - removing last item")
|
||||
return items[:-1]
|
||||
|
||||
# Pattern 6: Text ends with trailing comma (common sign of incomplete JSON)
|
||||
if original_text.rstrip().endswith(','):
|
||||
logger.debug("Detected trailing comma - removing last item")
|
||||
return items[:-1]
|
||||
|
||||
# If no incomplete patterns detected, return all items
|
||||
return items
|
||||
|
||||
|
||||
def _extractGenericContent(text: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract generic content when no specific section patterns are found.
|
||||
This handles cases where the JSON structure is completely broken.
|
||||
Handles incomplete strings and corrupted data.
|
||||
Excludes the last incomplete item to prevent corrupted data.
|
||||
"""
|
||||
import re
|
||||
|
||||
sections = []
|
||||
|
||||
# Look for any structured content patterns
|
||||
# Pattern 1: Look for list items {"text": "..."}, including incomplete ones
|
||||
list_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', text)
|
||||
incomplete_list_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', text)
|
||||
|
||||
all_list_items = list_items + incomplete_list_items
|
||||
unique_list_items = list(dict.fromkeys([item for item in all_list_items if item.strip()]))
|
||||
|
||||
# Remove the last item if it appears to be incomplete/corrupted
|
||||
if unique_list_items:
|
||||
unique_list_items = _removeLastIncompleteItem(unique_list_items, text)
|
||||
|
||||
if unique_list_items:
|
||||
elements = [{"text": item} for item in unique_list_items]
|
||||
sections.append({
|
||||
"id": "section_1",
|
||||
"content_type": "list",
|
||||
"elements": elements,
|
||||
"order": 1
|
||||
})
|
||||
|
||||
# Pattern 2: Look for paragraph text {"text": "..."}, including incomplete ones
|
||||
elif re.search(r'\{"text"\s*:\s*"[^"]*\}', text):
|
||||
# Extract all text elements, including incomplete ones
|
||||
text_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', text)
|
||||
incomplete_text_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', text)
|
||||
|
||||
all_text_items = text_items + incomplete_text_items
|
||||
unique_text_items = list(dict.fromkeys([item for item in all_text_items if item.strip()]))
|
||||
|
||||
# Remove the last item if it appears to be incomplete/corrupted
|
||||
if unique_text_items:
|
||||
unique_text_items = _removeLastIncompleteItem(unique_text_items, text)
|
||||
|
||||
if unique_text_items:
|
||||
elements = [{"text": item} for item in unique_text_items]
|
||||
sections.append({
|
||||
"id": "section_1",
|
||||
"content_type": "paragraph",
|
||||
"elements": elements,
|
||||
"order": 1
|
||||
})
|
||||
|
||||
# Pattern 3: Look for any quoted strings that might be content, including incomplete ones
|
||||
elif re.search(r'"([^"]{3,})"', text): # Strings longer than 3 chars (reduced threshold)
|
||||
# Extract longer quoted strings, including incomplete ones
|
||||
text_items = re.findall(r'"([^"]{3,})"', text)
|
||||
incomplete_text_items = re.findall(r'"([^"]{3,}?)(?:\n|$)', text)
|
||||
|
||||
all_text_items = text_items + incomplete_text_items
|
||||
# Filter out likely JSON keys
|
||||
content_items = [item for item in all_text_items if not item.startswith(('section_', 'doc_', 'metadata', 'split_strategy', 'source_documents', 'extraction_method', 'id', 'content_type', 'elements', 'order', 'title', 'filename'))]
|
||||
|
||||
# Remove the last item if it appears to be incomplete/corrupted
|
||||
if content_items:
|
||||
content_items = _removeLastIncompleteItem(content_items, text)
|
||||
|
||||
if content_items:
|
||||
elements = [{"text": item} for item in content_items[:10]] # Limit to first 10 items
|
||||
sections.append({
|
||||
"id": "section_1",
|
||||
"content_type": "paragraph",
|
||||
"elements": elements,
|
||||
"order": 1
|
||||
})
|
||||
|
||||
return sections
|
||||
|
||||
|
||||
|
|
@ -324,33 +594,295 @@ def extractContentSample(section: Dict[str, Any]) -> str:
|
|||
return "Content exists"
|
||||
|
||||
|
||||
def buildContinuationContext(allSections: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
def _buildDetailedContinuationInfo(section: Dict[str, Any], content_type: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Build context information from accumulated sections for continuation prompt.
|
||||
Returns dict with metadata about what was already generated.
|
||||
Build detailed continuation information for better AI guidance.
|
||||
Completely generic - works for any content type (list, paragraph, code, table, etc.)
|
||||
"""
|
||||
if not allSections:
|
||||
elements = section.get("elements", [])
|
||||
|
||||
if not elements:
|
||||
return {
|
||||
"section_count": 0,
|
||||
"next_order": 1,
|
||||
"last_content_sample": "No content yet"
|
||||
"type": "continue_general",
|
||||
"sample": extractContentSample(section),
|
||||
"last_item": "",
|
||||
"item_count": 0,
|
||||
"guidance": "Continue generating content in the same format and style."
|
||||
}
|
||||
|
||||
# Sort sections by order
|
||||
sorted_sections = sorted(allSections, key=lambda s: s.get("order", 0))
|
||||
# Count elements regardless of type
|
||||
element_count = len(elements)
|
||||
|
||||
last_section = sorted_sections[-1]
|
||||
last_order = last_section.get("order", 0)
|
||||
|
||||
# Get content sample from last section
|
||||
last_content_sample = extractContentSample(last_section)
|
||||
# Extract sample for context - completely generic
|
||||
sample = extractContentSample(section)
|
||||
|
||||
# Generic continuation guidance - applies to ANY content type
|
||||
# Tell AI to generate ALL REMAINING content to complete the user request
|
||||
return {
|
||||
"section_count": len(allSections),
|
||||
"last_section_id": last_section.get("id", ""),
|
||||
"last_order": last_order,
|
||||
"next_order": last_order + 1,
|
||||
"last_content_type": last_section.get("content_type", ""),
|
||||
"last_content_sample": last_content_sample
|
||||
"type": "continue_general",
|
||||
"sample": sample,
|
||||
"last_item": "",
|
||||
"item_count": element_count,
|
||||
"guidance": "Generate ALL remaining content to complete the user's request. Continue from where you left off and finish everything that was requested."
|
||||
}
|
||||
|
||||
|
||||
def _extractLastItemsFromFragment(fragment: str, max_items: int = 10) -> str:
|
||||
"""
|
||||
Extract the last few items from a JSON fragment for continuation context.
|
||||
Uses JSON structure (sections -> elements -> items) - fully generic.
|
||||
Works with broken/incomplete JSON by trying to parse and extract sections.
|
||||
"""
|
||||
if not fragment:
|
||||
return ""
|
||||
|
||||
# Strategy 1: Try to parse as JSON and extract from structure
|
||||
try:
|
||||
# Try to repair and parse the fragment
|
||||
parsed = repairBrokenJson(fragment)
|
||||
if parsed:
|
||||
# Extract sections from parsed JSON using structure
|
||||
sections = extractSectionsFromDocument(parsed)
|
||||
if sections:
|
||||
# Get the last section (likely where continuation should happen)
|
||||
sorted_sections = sorted(sections, key=lambda s: s.get("order", 0))
|
||||
last_section = sorted_sections[-1]
|
||||
elements = last_section.get("elements", [])
|
||||
|
||||
if elements and isinstance(elements, list):
|
||||
content_type = last_section.get("content_type", "").lower()
|
||||
|
||||
# For list content_type, extract from items array
|
||||
if content_type == "list" and len(elements) > 0:
|
||||
last_element = elements[-1]
|
||||
if isinstance(last_element, dict):
|
||||
# Check if it has an "items" array (list structure)
|
||||
if "items" in last_element and isinstance(last_element["items"], list):
|
||||
items_list = last_element["items"]
|
||||
if items_list:
|
||||
# Get last max_items from this items array
|
||||
last_items = items_list[-max_items:] if len(items_list) > max_items else items_list
|
||||
# Extract text from each item
|
||||
texts = []
|
||||
for item in last_items:
|
||||
if isinstance(item, dict) and "text" in item:
|
||||
texts.append(str(item["text"]))
|
||||
if texts:
|
||||
return ', '.join(texts)
|
||||
|
||||
# Or if elements themselves are items (alternative structure)
|
||||
elif "text" in last_element:
|
||||
# Get last max_items elements that have text
|
||||
elements_with_text = [e for e in elements if isinstance(e, dict) and "text" in e]
|
||||
if elements_with_text:
|
||||
last_elements = elements_with_text[-max_items:] if len(elements_with_text) > max_items else elements_with_text
|
||||
texts = [str(e.get("text", "")) for e in last_elements]
|
||||
if texts:
|
||||
return ', '.join(texts)
|
||||
|
||||
# For other content types, extract from elements
|
||||
elif len(elements) > 0:
|
||||
# Get last max_items elements that have text/code
|
||||
valid_elements = [e for e in elements if isinstance(e, dict) and ("text" in e or "code" in e)]
|
||||
if valid_elements:
|
||||
last_elements = valid_elements[-max_items:] if len(valid_elements) > max_items else valid_elements
|
||||
texts = []
|
||||
for elem in last_elements:
|
||||
if "text" in elem:
|
||||
texts.append(str(elem["text"]))
|
||||
elif "code" in elem:
|
||||
# For code, show snippet
|
||||
code = str(elem["code"])
|
||||
texts.append(code[:50] + "..." if len(code) > 50 else code)
|
||||
if texts:
|
||||
return ', '.join(texts)
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not extract items from fragment using JSON structure: {e}")
|
||||
|
||||
# Strategy 2: If parsing failed, try progressive parsing from the end
|
||||
# Look for the last complete JSON structures near the end
|
||||
try:
|
||||
# Try parsing different lengths from the end
|
||||
for length in [3000, 2000, 1000, 500]:
|
||||
if len(fragment) > length:
|
||||
end_portion = fragment[-length:]
|
||||
closed = _closeJsonStructures(end_portion)
|
||||
obj, err, _ = tryParseJson(closed)
|
||||
if err is None and isinstance(obj, dict):
|
||||
# Successfully parsed - extract sections
|
||||
sections = extractSectionsFromDocument(obj)
|
||||
if sections:
|
||||
# Same extraction logic as above
|
||||
sorted_sections = sorted(sections, key=lambda s: s.get("order", 0))
|
||||
if sorted_sections:
|
||||
last_section = sorted_sections[-1]
|
||||
elements = last_section.get("elements", [])
|
||||
if elements:
|
||||
# Extract texts using same logic as Strategy 1
|
||||
texts = []
|
||||
for elem in elements[-max_items:]:
|
||||
if isinstance(elem, dict):
|
||||
if "items" in elem and isinstance(elem["items"], list):
|
||||
# Get last item from items array
|
||||
if elem["items"]:
|
||||
last_item = elem["items"][-1]
|
||||
if isinstance(last_item, dict) and "text" in last_item:
|
||||
texts.append(str(last_item["text"]))
|
||||
elif "text" in elem:
|
||||
texts.append(str(elem["text"]))
|
||||
if texts:
|
||||
return ', '.join(texts[-max_items:])
|
||||
except Exception as e:
|
||||
logger.debug(f"Progressive parsing from end failed: {e}")
|
||||
|
||||
# Strategy 3: If all parsing fails, try simple extraction from raw fragment
|
||||
# Look for last complete {"text": "..."} pattern near the end
|
||||
try:
|
||||
# Look at last 2000 chars for the pattern
|
||||
end_portion = fragment[-2000:] if len(fragment) > 2000 else fragment
|
||||
# Find all {"text": "value"} patterns
|
||||
import re
|
||||
# Pattern to match {"text": "..."} with escaped quotes
|
||||
pattern = r'\{"text"\s*:\s*"([^"]+)"\}'
|
||||
matches = re.findall(pattern, end_portion)
|
||||
if matches:
|
||||
# Get last max_items
|
||||
last_matches = matches[-max_items:] if len(matches) > max_items else matches
|
||||
return ', '.join(last_matches)
|
||||
except Exception as e:
|
||||
logger.debug(f"Simple pattern extraction failed: {e}")
|
||||
|
||||
# Strategy 4: If all fails, return empty (will use last_item_from_sections)
|
||||
return ""
|
||||
|
||||
|
||||
def buildContinuationContext(allSections: List[Dict[str, Any]], lastRawResponse: Optional[str] = None) -> Dict[str, Any]:
    """
    Build context information from accumulated sections for continuation prompt.
    Extracts the last complete object and provides a clear continuation point.

    Args:
        allSections: List of sections already generated
        lastRawResponse: Raw JSON response from last iteration (can be broken/incomplete)

    Returns:
        Dict that always contains "section_count". When a usable raw response
        is given it also contains "last_raw_json", "last_item_object",
        "last_items_from_fragment" and "total_items_count". When no raw
        response is given it contains "last_item_object" and
        "total_items_count" derived from allSections. When the raw response
        is empty or just "{}" after stripping, only "section_count" is set
        and a warning is logged.
    """
    import json
    import re

    def by_order(section):
        return section.get("order", 0)

    def last_flat_object(raw: str) -> str:
        """Return the last complete {...} that contains no nested object, or ""."""
        found = ""
        open_objects = []  # [start index, contains a nested object]
        in_string = False
        escaped = False
        for pos, ch in enumerate(raw):
            if in_string:
                # Braces inside string values (e.g. code snippets) are data, not structure
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == "{":
                if open_objects:
                    open_objects[-1][1] = True
                open_objects.append([pos, False])
            elif ch == "}" and open_objects:
                start, has_nested = open_objects.pop()
                if not has_nested:
                    found = raw[start:pos + 1]
        if found:
            return found
        # Quoting too damaged for the string-aware scan (e.g. the text starts
        # in the middle of a string): fall back to a purely textual match
        flat = re.findall(r'\{[^{}]*\}', raw)
        return flat[-1] if flat else ""

    def count_list_items(raw: str) -> int:
        """Count the entries of the last "items" array when the last section is a list."""
        parsed = repairBrokenJson(raw)
        if not parsed:
            return 0
        sections = extractSectionsFromDocument(parsed)
        if not sections:
            return 0
        last_section = sorted(sections, key=by_order)[-1]
        elements = last_section.get("elements", [])
        if not (elements and isinstance(elements, list)):
            return 0
        if last_section.get("content_type") != "list":
            return 0
        last_element = elements[-1]
        if isinstance(last_element, dict) and isinstance(last_element.get("items"), list):
            # Only complete items (those successfully extracted) are counted
            return len(last_element["items"])
        return 0

    def tail_from_sections():
        """Return (last object as JSON string, item count) from allSections."""
        if not allSections:
            return "", 0
        last_section = sorted(allSections, key=by_order)[-1]
        elements = last_section.get("elements", [])
        if not (elements and isinstance(elements, list)):
            return "", 0
        last_element = elements[-1]
        if not isinstance(last_element, dict):
            return "", 0

        if isinstance(last_element.get("items"), list):
            # List structure: report the last entry of the items array
            items_list = last_element["items"]
            count = len(items_list)
            target = items_list[-1] if items_list else None
            if not isinstance(target, dict):
                return "", count
        else:
            # The element itself is the object (no items array)
            count = len(elements)
            target = last_element

        try:
            return json.dumps(target), count
        except (TypeError, ValueError, RecursionError) as exc:
            # Best effort: an unserialisable object just yields no sample
            logger.debug(f"Could not serialise last item object: {exc}")
            return "", count

    context = {
        "section_count": len(allSections),
    }

    if lastRawResponse:
        raw_json = stripCodeFences(lastRawResponse.strip())
        if raw_json and raw_json.strip() != "{}":
            # Taken directly from the raw response, before any merging/accumulation
            last_complete_object = last_flat_object(raw_json)

            total_items_count = 0
            try:
                total_items_count = count_list_items(raw_json)
            except Exception as e:
                logger.debug(f"Could not extract item count from raw response structure: {e}")

            # Also extract last items for display (fragment extraction)
            last_items_from_fragment = _extractLastItemsFromFragment(raw_json, max_items=10)

            context["last_raw_json"] = raw_json
            context["last_item_object"] = last_complete_object
            context["last_items_from_fragment"] = last_items_from_fragment
            context["total_items_count"] = total_items_count

            logger.debug(f"Included previous JSON response in continuation context ({len(raw_json)} chars, {total_items_count} items in response, last complete object: {last_complete_object})")
        else:
            logger.warning("lastRawResponse was empty or just '{}' - continuation may not work correctly")
    else:
        # No raw response - fall back to the accumulated sections
        last_item_object_from_sections, total_items_count = tail_from_sections()
        context["last_item_object"] = last_item_object_from_sections
        context["total_items_count"] = total_items_count
        logger.debug(f"No previous raw response available for continuation context (but have {total_items_count} items accumulated, last item object: {last_item_object_from_sections})")

    return context
|
||||
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ class MethodAiOperationsTester:
|
|||
"resultType": "json"
|
||||
},
|
||||
OperationTypeEnum.DATA_GENERATE: {
|
||||
"aiPrompt": "Generate the first 9000 prime numbers.",
|
||||
"aiPrompt": "Generate the first 4000 prime numbers.",
|
||||
"resultType": "txt"
|
||||
},
|
||||
OperationTypeEnum.DATA_EXTRACT: {
|
||||
|
|
|
|||
Loading…
Reference in a new issue